wxpath 0.4.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,13 +2,14 @@ import asyncio
  import contextlib
  import inspect
  from collections import deque
- from typing import Any, AsyncGenerator, Iterator
+ from typing import Any, AsyncGenerator, Iterator, Iterable

  from lxml.html import HtmlElement
  from tqdm import tqdm

  from wxpath import patches  # noqa: F401
  from wxpath.core import parser
+ from wxpath.core.exceptions import XPathEvaluationError
  from wxpath.core.models import (
      CrawlIntent,
      CrawlTask,
@@ -18,7 +19,7 @@ from wxpath.core.models import (
      ProcessIntent,
  )
  from wxpath.core.ops import get_operator
- from wxpath.core.parser import Binary, Segment, Segments
+ from wxpath.core.parser import Binary, Depth, Segment, Segments
  from wxpath.core.runtime.helpers import parse_html
  from wxpath.hooks.registry import FetchContext, get_hooks
  from wxpath.http.client.crawler import Crawler
@@ -145,6 +146,7 @@ class WXPathEngine(HookedEngineBase):
          respect_robots: bool = True,
          allowed_response_codes: set[int] = None,
          allow_redirects: bool = True,
+         yield_errors: bool = False,
      ):
          # NOTE: Will grow unbounded in large crawls. Consider a LRU cache, or bloom filter.
          self.seen_urls: set[str] = set()
@@ -157,19 +159,49 @@ class WXPathEngine(HookedEngineBase):
          self.allow_redirects = allow_redirects
          if allow_redirects:
              self.allowed_response_codes |= {301, 302, 303, 307, 308}
+         self.yield_errors = yield_errors
+
+     def _get_max_depth(self, bin_or_segs: Binary | Segments, max_depth: int) -> int:
+         """Get the maximum crawl depth for a given expression. Will find a Depth
+         argument at the beginning of the expression and return its value. Otherwise, returns the
+         max_depth value provided.
+         TODO: There has to be a better way to do this.
+         """
+         if isinstance(bin_or_segs, Binary):
+             if hasattr(bin_or_segs.left, 'func') == 'url':
+                 depth_arg = [arg for arg in bin_or_segs.left.args if isinstance(arg, Depth)][0]
+                 return int(depth_arg.value)
+             elif hasattr(bin_or_segs.right, 'func') == 'url':
+                 depth_arg = [arg for arg in bin_or_segs.right.args if isinstance(arg, Depth)][0]
+                 return int(depth_arg.value)
+         elif isinstance(bin_or_segs, Segments):
+             depth_arg = [arg for arg in bin_or_segs[0].args if isinstance(arg, Depth)]
+             if depth_arg:
+                 return int(depth_arg[0].value)
+         return max_depth

      async def run(
          self,
          expression: str,
          max_depth: int,
          progress: bool = False,
-         yield_errors: bool = False,
      ) -> AsyncGenerator[Any, None]:
          """Execute a wxpath expression concurrently and yield results.

          Builds and drives a BFS-like crawl pipeline that honors robots rules,
          throttling, and hook callbacks while walking the web graph.

+         NOTES ON max_depth:
+             If depth is provided in the expression, it will be used to limit the depth of the
+             crawl. If depth is provided in the expression and max_depth is provided as an argument
+             to `run`, the inline depth in the expression will take precedence.
+
+             Currently, max_depth control flow logic is detected and executed in the
+             engine. In the future, the operation handlers (ops.py) could be responsible for
+             detecting max_depth, and sending a terminal intent to the engine. It's also possible
+             that the depth terminals are relative to the current depth (i.e. `url(//xpath, depth=2)`
+             implies crawling only the next 2 levels). This is not yet supported.
+
          Args:
              expression: WXPath expression string to evaluate.
              max_depth: Maximum crawl depth to follow for url hops.
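To illustrate the precedence described in the NOTES ON max_depth docstring above, here is a minimal, hypothetical sketch. The URL and trailing segments are made up; the inline depth form follows the `url(..., depth=N)` shape quoted in the docstring, and `wxpath_async` is the module-level entry point that appears later in this diff:

    import asyncio

    from wxpath import wxpath_async

    async def main():
        # The expression pins its own crawl depth, so depth=2 wins even though
        # max_depth=5 is passed in (per the NOTES ON max_depth above).
        expr = "url('https://example.com', depth=2)///url(//a/@href)//h1/text()"
        async for value in wxpath_async(expr, max_depth=5):
            print(value)

    asyncio.run(main())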
@@ -179,7 +211,9 @@ class WXPathEngine(HookedEngineBase):
              Extracted values produced by the expression (HTML elements or
                  wxpath-specific value types).
          """
-         segments = parser.parse(expression)
+         bin_or_segs = parser.parse(expression)
+
+         max_depth = self._get_max_depth(bin_or_segs, max_depth)

          queue: asyncio.Queue[CrawlTask] = asyncio.Queue()
          inflight: dict[str, CrawlTask] = {}
@@ -223,7 +257,7 @@ class WXPathEngine(HookedEngineBase):
          seed_task = CrawlTask(
              elem=None,
              url=None,
-             segments=segments,
+             segments=bin_or_segs,
              depth=-1,
              backlink=None,
          )
@@ -235,7 +269,10 @@ class WXPathEngine(HookedEngineBase):
              queue=queue,
              pbar=pbar,
          ):
-             yield await self.post_extract_hooks(output)
+             if isinstance(output, dict) and output.get("__type__") == "error":
+                 yield output
+             else:
+                 yield await self.post_extract_hooks(output)

              # While looping asynchronous generators, you MUST make sure
              # to check terminal conditions before re-iteration.
@@ -250,7 +287,7 @@ class WXPathEngine(HookedEngineBase):
          if task is None:
              log.warning(f"Got unexpected response from {resp.request.url}")

-             if yield_errors:
+             if self.yield_errors:
                  yield {
                      "__type__": "error",
                      "url": resp.request.url,
@@ -266,7 +303,7 @@ class WXPathEngine(HookedEngineBase):
          if resp.error:
              log.warning(f"Got error from {resp.request.url}: {resp.error}")

-             if yield_errors:
+             if self.yield_errors:
                  yield {
                      "__type__": "error",
                      "url": resp.request.url,
@@ -283,7 +320,7 @@ class WXPathEngine(HookedEngineBase):
          if resp.status not in self.allowed_response_codes or not resp.body:
              log.warning(f"Got non-200 response from {resp.request.url}")

-             if yield_errors:
+             if self.yield_errors:
                  yield {
                      "__type__": "error",
                      "url": resp.request.url,
@@ -307,6 +344,7 @@ class WXPathEngine(HookedEngineBase):
              base_url=task.url,
              backlink=task.backlink,
              depth=task.depth,
+             response=resp
          )

          elem = await self.post_parse_hooks(elem, task)
@@ -380,7 +418,11 @@ class WXPathEngine(HookedEngineBase):

          binary_or_segment = bin_or_segs if isinstance(bin_or_segs, Binary) else bin_or_segs[0]
          operator = get_operator(binary_or_segment)
-         intents = operator(elem, bin_or_segs, depth)
+
+         if self.yield_errors:
+             intents = _safe_iterator(operator(elem, bin_or_segs, depth))
+         else:
+             intents = operator(elem, bin_or_segs, depth)

          if not intents:
              return
@@ -416,6 +458,28 @@ class WXPathEngine(HookedEngineBase):
                  mini_queue.append((elem, next_segments))


+ def _safe_iterator(iterable: Iterable[Any]) -> Iterator[Any]:
+     """Wrap an iterable in a try/except block and return an iterator that yields the result or the error."""
+     it = iter(iterable)
+     while True:
+         try:
+             yield next(it)
+         except StopIteration:
+             break
+         except XPathEvaluationError as e:
+             yield {
+                 "__type__": "error",
+                 "reason": "xpath_evaluation_error",
+                 "exception": str(e),
+             }
+         except Exception as e:
+             yield {
+                 "__type__": "error",
+                 "reason": "iterator_error",
+                 "exception": str(e),
+             }
+
+
  def wxpath_async(path_expr: str,
                   max_depth: int,
                   progress: bool = False,
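With `yield_errors` moved from `run()` onto the engine itself, error dicts are now delivered in-band alongside normal results. A consumer-side sketch, assuming the engine's import path and that its remaining constructor arguments have defaults (neither is shown in this diff):

    import asyncio

    from wxpath.core.runtime.engine import WXPathEngine  # import path assumed

    async def main():
        engine = WXPathEngine(yield_errors=True)
        async for out in engine.run("url('https://example.com')//h1/text()", max_depth=0):
            # With yield_errors=True, fetch and XPath failures arrive as dicts shaped
            # like {"__type__": "error", "reason": ..., "exception": ...} (fetch errors
            # also carry "url") and skip the post-extract hooks.
            if isinstance(out, dict) and out.get("__type__") == "error":
                print("error:", out.get("reason"), out.get("exception"))
            else:
                print("value:", out)

    asyncio.run(main())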
@@ -6,7 +6,7 @@ from wxpath.util.logging import get_logger
  log = get_logger(__name__)


- def parse_html(content, base_url=None, **elem_kv_pairs) -> html.HtmlElement:
+ def parse_html(content, base_url=None, response=None, **elem_kv_pairs) -> html.HtmlElement:
      elem = etree.HTML(content, parser=patches.html_parser_with_xpath3, base_url=base_url)
      if base_url:
          elem.getroottree().docinfo.URL = base_url  # make base-uri() work
@@ -14,12 +14,15 @@ def parse_html(content, base_url=None, **elem_kv_pairs) -> html.HtmlElement:
          elem.set("{http://www.w3.org/XML/1998/namespace}base", base_url)
          elem.base_url = base_url  # sets both attribute and doc-level URL

-     # NOTE: some pages may have multiple root elements, i.e.
+     if response:
+         elem.response = response
+         elem.getroottree().getroot().response = response
+     # NOTE: some pages may have multiple root elements, i.e.
      # len(elem.itersiblings()) > 0 AND elem.getparent() is None.
      # This breaks elementpath. If elem has siblings, recreate the
      # root element and only the root element.
      if len(list(elem.itersiblings())) > 0:
-         elem = detach_html_root(elem, base_url)
+         elem = detach_html_root(elem, base_url)

      for k, v in elem_kv_pairs.items():
          elem.set(k, str(v))
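A minimal sketch of what the new `response` parameter enables; the positional field order of `Response` is assumed from the `Crawler` call sites further down in this diff:

    from wxpath.core.runtime.helpers import parse_html
    from wxpath.http.client import Response

    resp = Response(None, 200, b"<html><body><h1>hi</h1></body></html>",
                    {"Content-Type": "text/html"})
    elem = parse_html(resp.body, base_url="https://example.com/", response=resp)

    # parse_html mirrors the response onto the parsed root, so code holding any
    # element of this tree can reach HTTP metadata (status, headers) again.
    assert elem.getroottree().getroot().response.status == 200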
@@ -5,5 +5,5 @@ from wxpath.http.client.response import Response
  __all__ = [
      "Crawler",
      "Request",
-     "Response"
+     "Response",
  ]
@@ -71,6 +71,7 @@ class Crawler:
          *,
          headers: dict | None = None,
          proxies: dict | None = None,
+         verify_ssl: bool | None = None,
          retry_policy: RetryPolicy | None = None,
          throttler: AbstractThrottler | None = None,
          auto_throttle_target_concurrency: float = None,
@@ -82,6 +83,9 @@ class Crawler:

          self.concurrency = concurrency if concurrency is not None else cfg.concurrency
          self.per_host = per_host if per_host is not None else cfg.per_host
+         self._verify_ssl = verify_ssl if verify_ssl is not None else getattr(
+             cfg, "verify_ssl", True
+         )

          timeout = timeout if timeout is not None else cfg.timeout
          self._timeout = aiohttp.ClientTimeout(total=timeout)
@@ -141,7 +145,11 @@ class Crawler:
          """Construct an `aiohttp.ClientSession` with tracing and pooling."""
          trace_config = build_trace_config(self._stats)
          # Need to build the connector as late as possible as it requires the loop
-         connector = aiohttp.TCPConnector(limit=self.concurrency*2, ttl_dns_cache=300)
+         connector = aiohttp.TCPConnector(
+             limit=self.concurrency * 2,
+             ttl_dns_cache=300,
+             ssl=self._verify_ssl,
+         )
          return get_async_session(
              headers=self._headers,
              timeout=self._timeout,
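A short sketch of the new `verify_ssl` knob, assuming the other `Crawler` arguments can be left at their defaults:

    from wxpath.http.client import Crawler

    # verify_ssl=False is forwarded to aiohttp.TCPConnector(ssl=False), meaning
    # certificate verification is skipped; useful for hosts with self-signed
    # certs, risky anywhere else. Leaving it as None falls back to
    # cfg.verify_ssl (default True).
    crawler = Crawler(verify_ssl=False)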
@@ -274,22 +282,26 @@ class Crawler:
              else:
                  log.info("[CACHE MISS]", extra={"req.url": req.url, "resp.url": resp.url})

+             _start = time.monotonic()
              body = await resp.read()

-             latency = time.monotonic() - start
+             end = time.monotonic()
+             latency = end - _start
              self.throttler.record_latency(host, latency)

              if self.retry_policy.should_retry(req, response=resp):
                  await self._retry(req)
                  return None

-             return Response(req, resp.status, body, dict(resp.headers))
+             return Response(req, resp.status, body, dict(resp.headers),
+                             request_start=_start, response_end=end)
          except asyncio.CancelledError:
              # Normal during shutdown / timeout propagation
              log.debug("cancelled error", extra={"url": req.url})
              raise
          except Exception as exc:
-             latency = time.monotonic() - start
+             end = time.monotonic()
+             latency = end - start
              self.throttler.record_latency(host, latency)

              if self.retry_policy.should_retry(req, exception=exc):
@@ -297,7 +309,7 @@ class Crawler:
                  return None

              log.error("request failed", extra={"url": req.url}, exc_info=exc)
-             return Response(req, 0, b"", error=exc)
+             return Response(req, 0, b"", error=exc, request_start=start, response_end=end)

      async def _retry(self, req: Request) -> None:
          """Reschedule a request according to the retry policy."""
@@ -1,4 +1,3 @@
- # wxpath/http/response.py
  from dataclasses import dataclass, field
  from typing import Optional

@@ -12,3 +11,10 @@ class Response:
      body: bytes
      headers: dict[str, str] | None = None
      error: Optional[Exception] = field(default=None, kw_only=True)
+
+     request_start: float | None = None
+     response_end: float | None = None
+
+     @property
+     def latency(self) -> float:
+         return self.response_end - self.request_start
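With the two timing fields added here, per-request latency can be read straight off a `Response`. A small sketch, with the leading positional fields assumed from the `Crawler` call sites above:

    from wxpath.http.client import Response

    resp = Response(None, 200, b"", {}, request_start=100.00, response_end=100.25)
    print(f"{resp.latency:.3f}s")  # 0.250s

    # Note: request_start and response_end default to None, so accessing .latency
    # on a Response built without timing information raises a TypeError.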
@@ -19,13 +19,13 @@ class RetryPolicy:

          if request.max_retries is not None and request.retries >= request.max_retries:
              return False
-
+
          if request.retries >= self.max_retries:
              return False

          if response is not None and response.status in self.retry_statuses:
              return True
-
+
          if exception is not None:
              return True

@@ -0,0 +1,85 @@
+
+ # pip install langchain langchain-ollama langchain-chroma chromadb
+ from langchain_chroma import Chroma
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_core.runnables import RunnablePassthrough
+ from langchain_ollama import ChatOllama, OllamaEmbeddings
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+ from wxpath.integrations.langchain.loader import WXPathLoader
+
+ # ------------------------------------------------------------------
+ # STEP 1: Load & Embed (Same as before)
+ # ------------------------------------------------------------------
+ print("🕷️ Crawling with wxpath...")
+ loader = WXPathLoader(
+     expression="""
+     url('https://docs.python.org/3/library/argparse.html',
+         follow=//a/@href[contains(., 'argparse')])
+     /map{
+         'text': string-join(//div[@role='main']//text()),
+         'source': string(base-uri(.))
+     }
+     """,
+     max_depth=1
+ )
+ docs = loader.load()
+
+ print("🔪 Splitting and Embedding...")
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+ splits = text_splitter.split_documents(docs)
+
+ vectorstore = Chroma.from_documents(
+     documents=splits,
+     # Must use model that support embeddings (`ollama pull nomic-embed-text`)
+     embedding=OllamaEmbeddings(model="nomic-embed-text"),
+     collection_name="wxpath"
+ )
+ retriever = vectorstore.as_retriever()
+
+ # ------------------------------------------------------------------
+ # STEP 2: Define Components
+ # ------------------------------------------------------------------
+
+ # A helper to join retrieved documents into a single string
+ def format_docs(docs):
+     return "\n\n".join(doc.page_content for doc in docs)
+
+ # The Prompt (Standard RAG template)
+ template = """You are an assistant for question-answering tasks.
+ Use the following pieces of retrieved context to answer the question.
+ If you don't know the answer, just say that you don't know.
+ Use three sentences maximum and keep the answer concise.
+
+ Context: {context}
+
+ Question: {question}
+
+ Answer:"""
+ prompt = ChatPromptTemplate.from_template(template)
+
+ # The Model
+ llm = ChatOllama(model="gemma3")
+
+ # ------------------------------------------------------------------
+ # STEP 3: Build the Chain with LCEL
+ # ------------------------------------------------------------------
+ # The pipe operator (|) passes output from one component to the next.
+ rag_chain = (
+     {"context": retriever | format_docs, "question": RunnablePassthrough()}
+     | prompt
+     | llm
+     | StrOutputParser()
+ )
+
+ # ------------------------------------------------------------------
+ # STEP 4: Invoke
+ # ------------------------------------------------------------------
+ query = "How do I add arguments in argparse?"
+ print(f"\n❓ Question: {query}")
+
+ # The chain returns a string directly because of StrOutputParser
+ response = rag_chain.invoke(query)
+
+ print(f"\n🤖 Ollama Answer:\n{response}")
@@ -0,0 +1,218 @@
+ """
+ Rolling Window RAG Example
+
+ This examples demonstrates how to use a rolling window of news articles as context.
+
+ More importantly, it demonstrates complex string cleanup, metadata extraction, and other
+ real-world challenges of building a RAG application.
+
+ This script assumes you have gemma3 installed and your machine is capable of running a 32k
+ token model.
+ """
+ import asyncio
+ import datetime
+ import threading
+ from collections import deque
+ from operator import itemgetter
+ from typing import List
+
+ from langchain_core.callbacks import CallbackManagerForRetrieverRun
+ from langchain_core.documents import Document
+ from langchain_core.retrievers import BaseRetriever
+
+ from wxpath import wxpath_async
+
+ # If you have the cache dependency installed, you can enable it:
+ # wxpath.settings.CACHE_SETTINGS.enabled = True
+
+ # ------------------------------------------------------------------
+ # 1. The Rolling Buffer (The "Context Window")
+ # ------------------------------------------------------------------
+ class RollingNewsBuffer(BaseRetriever):
+     capacity: int = 100
+
+     # Define as PrivateAttrs so Pydantic ignores them for validation
+     _buffer: deque
+     _seen_urls: set
+     _lock: threading.Lock
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         self._buffer = deque(maxlen=self.capacity)
+         self._seen_urls = set()
+         self._lock = threading.Lock()
+
+     def add_document(self, doc: Document):
+         """Thread-safe add with url cleanup on eviction."""
+         with self._lock:
+             # Check if we are about to evict an item (buffer full)
+             if len(self._buffer) == self._buffer.maxlen:
+                 # We must manually find what is being removed to clean up seen_urls
+                 # Note: deque[0] is the one about to be popped when appending
+                 oldest_doc = self._buffer[0]
+                 oldest_url = oldest_doc.metadata.get("url")
+                 if oldest_url in self._seen_urls:
+                     self._seen_urls.remove(oldest_url)
+
+             self._buffer.append(doc)
+             self._seen_urls.add(doc.metadata["url"])
+
+     def is_seen(self, url: str) -> bool:
+         """Thread-safe check."""
+         with self._lock:
+             return url in self._seen_urls
+
+     def _get_relevant_documents(
+         self, query: str, *, run_manager: CallbackManagerForRetrieverRun = None
+     ) -> List[Document]:
+         """
+         Thread-safe read.
+         """
+         with self._lock:
+             # Create a snapshot list while locked to prevent iteration crash
+             snapshot = list(self._buffer)
+
+         print(f"📰 Context Retrieval: Returning {len(snapshot)} docs for query: {query}")
+         return snapshot
+
+ # ------------------------------------------------------------------
+ # 2. The Background Crawler (The Producer)
+ # ------------------------------------------------------------------
+ async def continuous_crawl(buffer: RollingNewsBuffer):
+     """
+     Constantly crawls Newsweek and feeds the buffer.
+     """
+     print("🕷️ Crawler started...")
+
+     # Example Expression: deep crawl of newsweek
+     expression = """
+     url('https://www.newsweek.com/')
+     ///url(
+         //a/@href[starts-with(., '/') or starts-with(., './') or contains(., 'newsweek.com')]
+     )
+     /map{
+         'title': //h1/text()[1] ! string(.),
+         'text': string-join(//article//p/text()),
+         'url': string(base-uri(.)),
+         'pubDate': //meta[@name='article:modified_time']/@content[1] ! string(.)
+     }
+     """
+
+     # Infinite loop to restart crawl if it finishes, or run continuously
+     while True:
+         try:
+             # We use the async generator to stream results as they are found
+             async for item in wxpath_async(expression, max_depth=1):
+                 item = item._map
+                 url = item.get('url')
+                 # Check seen status safely before doing processing work
+                 if not url or buffer.is_seen(url):
+                     continue
+
+                 # Convert wxpath dict to LangChain Document
+                 text_content = item.get('text', '')
+                 # Basic cleaning (optional)
+                 if isinstance(text_content, list):
+                     text_content = " ".join(text_content)
+
+                 if not text_content:
+                     continue
+
+                 title = item.get('title')
+                 if not title:
+                     title = ''
+
+                 if isinstance(title, list):
+                     title = " ".join(title)
+
+                 pub_date = item.get('pubDate')
+                 if not pub_date:
+                     pub_date = str(datetime.date.today())
+
+                 text_content = ("Title: " + title +
+                                 "\nPublished: " + pub_date + "\n" +
+                                 text_content)
+
+                 doc = Document(
+                     page_content=text_content,
+                     metadata={"title": item.get('title'),
+                               "url": item.get('url'),
+                               "pubDate": item.get('pubDate')}
+                 )
+
+                 # PUSH TO BUFFER (Oldest gets evicted automatically if full)
+                 buffer.add_document(doc)
+                 print(f"📰 Added: {title[:30]}... (Buffer size: {len(buffer._buffer)})")
+                 print(f"\tArticle text: {doc.page_content[:100]}...")
+                 print()
+             # Rate limit slightly to be polite
+             await asyncio.sleep(60)
+
+         except Exception as e:
+             print(f"⚠️ Crawler error: {e}. Restarting in 10s...")
+             await asyncio.sleep(10)
+
+
+ def debug_print_prompt(prompt_value):
+     print("\n" + "="*40)
+     print("📢 FULL PROMPT SENT TO LLM:")
+     print("="*40)
+     print(prompt_value.to_string())  # This prints the exact text
+     print("="*40 + "\n")
+     return prompt_value
+
+ if __name__ == "__main__":
+     # Initialize the Rolling Buffer
+     retriever = RollingNewsBuffer(capacity=100)
+
+     # Start Crawler in a background thread so it doesn't block the Chat
+     def start_background_loop(loop):
+         asyncio.set_event_loop(loop)
+         loop.run_until_complete(continuous_crawl(retriever))
+
+     crawler_loop = asyncio.new_event_loop()
+     t = threading.Thread(target=start_background_loop, args=(crawler_loop,), daemon=True)
+     t.start()
+
+     import time
+
+     from langchain_core.prompts import ChatPromptTemplate
+     from langchain_ollama import ChatOllama
+
+     # Setup standard RAG chain
+     llm = ChatOllama(model="gemma3", num_ctx=32768)
+     prompt = ChatPromptTemplate.from_template(
+         "Answer based ONLY on the following news:\n\n{context}\n\nQuestion: {question}\n\n"
+         "DO NOT include generic Newsweek-administrative articles like 'Corrections', "
+         "'Company Info', 'Subscribe', Opinions', 'Press Releases', 'Editorials', etc. in your "
+         "analysis or answers. Answer the question using the non-Newsweek-related news provided. "
+         "You will be penalized for including old or undated news in your answer. If asked for "
+         "overviews or summaries, split news items into paragraphs and provide a summary of each "
+         "news item."
+     )
+
+     def format_docs(docs):
+         slice_of_news = "\n\n".join([d.page_content[:1000] for d in docs])  # Truncate for demo
+         print(f"📰 Latest news char length: {len(slice_of_news)}")
+         return slice_of_news
+
+     chain = (
+         {
+             # FIX: Use itemgetter so retriever gets a string, not a dict
+             "context": itemgetter("question") | retriever | format_docs,
+             "question": itemgetter("question")
+         }
+         | prompt
+         | debug_print_prompt
+         | llm
+     )
+
+     # Simulate querying constantly while buffer fills in background
+     print("⏳ Waiting for crawler to gather some data...")
+     time.sleep(10)
+
+     while True:
+         query = input("Press Enter to ask about current news (or Ctrl+C to quit)...")
+         print(f"\nQuery: {query}\nThinking... 🤔")
+         response = chain.invoke({"question": query})
+         print(response.content)