wxpath 0.4.1-py3-none-any.whl → 0.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wxpath/__init__.py +2 -0
- wxpath/cli.py +6 -0
- wxpath/core/exceptions.py +53 -0
- wxpath/core/models.py +1 -0
- wxpath/core/ops.py +100 -19
- wxpath/core/parser.py +94 -24
- wxpath/core/runtime/engine.py +74 -10
- wxpath/core/runtime/helpers.py +6 -3
- wxpath/http/client/__init__.py +1 -1
- wxpath/http/client/crawler.py +17 -5
- wxpath/http/client/response.py +7 -1
- wxpath/http/policy/retry.py +2 -2
- wxpath/integrations/__init__.py +0 -0
- wxpath/integrations/langchain/__init__.py +0 -0
- wxpath/integrations/langchain/examples/basic_rag.py +85 -0
- wxpath/integrations/langchain/examples/rolling_window_rag.py +218 -0
- wxpath/integrations/langchain/loader.py +60 -0
- wxpath/patches.py +215 -5
- wxpath/settings.py +3 -1
- wxpath/tui.py +1225 -0
- wxpath/tui_settings.py +151 -0
- wxpath/util/cleaners.py +31 -0
- wxpath/util/common_paths.py +22 -0
- wxpath/util/logging.py +3 -7
- {wxpath-0.4.1.dist-info → wxpath-0.5.1.dist-info}/METADATA +73 -9
- wxpath-0.5.1.dist-info/RECORD +45 -0
- {wxpath-0.4.1.dist-info → wxpath-0.5.1.dist-info}/WHEEL +1 -1
- {wxpath-0.4.1.dist-info → wxpath-0.5.1.dist-info}/entry_points.txt +1 -0
- wxpath-0.4.1.dist-info/RECORD +0 -35
- {wxpath-0.4.1.dist-info → wxpath-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {wxpath-0.4.1.dist-info → wxpath-0.5.1.dist-info}/top_level.txt +0 -0
wxpath/core/runtime/engine.py
CHANGED
@@ -2,13 +2,14 @@ import asyncio
 import contextlib
 import inspect
 from collections import deque
-from typing import Any, AsyncGenerator, Iterator
+from typing import Any, AsyncGenerator, Iterator, Iterable
 
 from lxml.html import HtmlElement
 from tqdm import tqdm
 
 from wxpath import patches  # noqa: F401
 from wxpath.core import parser
+from wxpath.core.exceptions import XPathEvaluationError
 from wxpath.core.models import (
     CrawlIntent,
     CrawlTask,
@@ -18,7 +19,7 @@ from wxpath.core.models import (
     ProcessIntent,
 )
 from wxpath.core.ops import get_operator
-from wxpath.core.parser import Binary, Segment, Segments
+from wxpath.core.parser import Binary, Depth, Segment, Segments
 from wxpath.core.runtime.helpers import parse_html
 from wxpath.hooks.registry import FetchContext, get_hooks
 from wxpath.http.client.crawler import Crawler
@@ -145,6 +146,7 @@ class WXPathEngine(HookedEngineBase):
         respect_robots: bool = True,
         allowed_response_codes: set[int] = None,
         allow_redirects: bool = True,
+        yield_errors: bool = False,
     ):
         # NOTE: Will grow unbounded in large crawls. Consider a LRU cache, or bloom filter.
         self.seen_urls: set[str] = set()
@@ -157,19 +159,49 @@ class WXPathEngine(HookedEngineBase):
         self.allow_redirects = allow_redirects
         if allow_redirects:
             self.allowed_response_codes |= {301, 302, 303, 307, 308}
+        self.yield_errors = yield_errors
+
+    def _get_max_depth(self, bin_or_segs: Binary | Segments, max_depth: int) -> int:
+        """Get the maximum crawl depth for a given expression. Will find a Depth
+        argument at the beginning of the expression and return its value. Otherwise, returns the
+        max_depth value provided.
+        TODO: There has to be a better way to do this.
+        """
+        if isinstance(bin_or_segs, Binary):
+            if hasattr(bin_or_segs.left, 'func') == 'url':
+                depth_arg = [arg for arg in bin_or_segs.left.args if isinstance(arg, Depth)][0]
+                return int(depth_arg.value)
+            elif hasattr(bin_or_segs.right, 'func') == 'url':
+                depth_arg = [arg for arg in bin_or_segs.right.args if isinstance(arg, Depth)][0]
+                return int(depth_arg.value)
+        elif isinstance(bin_or_segs, Segments):
+            depth_arg = [arg for arg in bin_or_segs[0].args if isinstance(arg, Depth)]
+            if depth_arg:
+                return int(depth_arg[0].value)
+        return max_depth
 
     async def run(
         self,
         expression: str,
         max_depth: int,
         progress: bool = False,
-        yield_errors: bool = False,
     ) -> AsyncGenerator[Any, None]:
         """Execute a wxpath expression concurrently and yield results.
 
         Builds and drives a BFS-like crawl pipeline that honors robots rules,
         throttling, and hook callbacks while walking the web graph.
 
+        NOTES ON max_depth:
+        If depth is provided in the expression, it will be used to limit the depth of the
+        crawl. If depth is provided in the expression and max_depth is provided as an argument
+        to `run`, the inline depth in the expression will take precedence.
+
+        Currently, max_depth control flow logic is detected and executed in the
+        engine. In the future, the operation handlers (ops.py) could be responsible for
+        detecting max_depth, and sending a terminal intent to the engine. It's also possible
+        that the depth terminals are relative to the current depth (i.e. `url(//xpath, depth=2)`
+        implies crawling only the next 2 levels). This is not yet supported.
+
         Args:
             expression: WXPath expression string to evaluate.
             max_depth: Maximum crawl depth to follow for url hops.
@@ -179,7 +211,9 @@ class WXPathEngine(HookedEngineBase):
             Extracted values produced by the expression (HTML elements or
             wxpath-specific value types).
         """
-
+        bin_or_segs = parser.parse(expression)
+
+        max_depth = self._get_max_depth(bin_or_segs, max_depth)
 
         queue: asyncio.Queue[CrawlTask] = asyncio.Queue()
         inflight: dict[str, CrawlTask] = {}
@@ -223,7 +257,7 @@ class WXPathEngine(HookedEngineBase):
         seed_task = CrawlTask(
             elem=None,
             url=None,
-            segments=
+            segments=bin_or_segs,
             depth=-1,
             backlink=None,
         )
@@ -235,7 +269,10 @@ class WXPathEngine(HookedEngineBase):
             queue=queue,
             pbar=pbar,
         ):
-
+            if isinstance(output, dict) and output.get("__type__") == "error":
+                yield output
+            else:
+                yield await self.post_extract_hooks(output)
 
             # While looping asynchronous generators, you MUST make sure
             # to check terminal conditions before re-iteration.
@@ -250,7 +287,7 @@ class WXPathEngine(HookedEngineBase):
             if task is None:
                 log.warning(f"Got unexpected response from {resp.request.url}")
 
-                if yield_errors:
+                if self.yield_errors:
                     yield {
                         "__type__": "error",
                         "url": resp.request.url,
@@ -266,7 +303,7 @@ class WXPathEngine(HookedEngineBase):
            if resp.error:
                log.warning(f"Got error from {resp.request.url}: {resp.error}")
 
-               if yield_errors:
+               if self.yield_errors:
                    yield {
                        "__type__": "error",
                        "url": resp.request.url,
@@ -283,7 +320,7 @@ class WXPathEngine(HookedEngineBase):
            if resp.status not in self.allowed_response_codes or not resp.body:
                log.warning(f"Got non-200 response from {resp.request.url}")
 
-               if yield_errors:
+               if self.yield_errors:
                    yield {
                        "__type__": "error",
                        "url": resp.request.url,
@@ -307,6 +344,7 @@ class WXPathEngine(HookedEngineBase):
                base_url=task.url,
                backlink=task.backlink,
                depth=task.depth,
+               response=resp
            )
 
            elem = await self.post_parse_hooks(elem, task)
@@ -380,7 +418,11 @@ class WXPathEngine(HookedEngineBase):
 
        binary_or_segment = bin_or_segs if isinstance(bin_or_segs, Binary) else bin_or_segs[0]
        operator = get_operator(binary_or_segment)
-
+
+       if self.yield_errors:
+           intents = _safe_iterator(operator(elem, bin_or_segs, depth))
+       else:
+           intents = operator(elem, bin_or_segs, depth)
 
        if not intents:
            return
@@ -416,6 +458,28 @@ class WXPathEngine(HookedEngineBase):
                    mini_queue.append((elem, next_segments))
 
 
+def _safe_iterator(iterable: Iterable[Any]) -> Iterator[Any]:
+    """Wrap an iterable in a try/except block and return an iterator that yields the result or the error."""
+    it = iter(iterable)
+    while True:
+        try:
+            yield next(it)
+        except StopIteration:
+            break
+        except XPathEvaluationError as e:
+            yield {
+                "__type__": "error",
+                "reason": "xpath_evaluation_error",
+                "exception": str(e),
+            }
+        except Exception as e:
+            yield {
+                "__type__": "error",
+                "reason": "iterator_error",
+                "exception": str(e),
+            }
+
+
 def wxpath_async(path_expr: str,
                  max_depth: int,
                  progress: bool = False,
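Taken together, these engine changes move yield_errors onto the constructor, surface failures as {"__type__": "error", ...} dicts, and let a depth declared inside the expression override the max_depth argument to run(). A minimal sketch of how a caller might use this, assuming an inline url(..., depth=N) form and that the engine's remaining constructor arguments can be left at their defaults:

import asyncio

from wxpath.core.runtime.engine import WXPathEngine

async def main():
    # yield_errors now lives on the engine itself rather than on run()
    # (assumption: all other constructor arguments default sensibly)
    engine = WXPathEngine(yield_errors=True)

    # hypothetical expression: the inline depth=1, if present, wins over max_depth=5
    expr = "url('https://example.com', depth=1)/map{'title': //title/text()[1] ! string(.)}"

    async for item in engine.run(expr, max_depth=5):
        if isinstance(item, dict) and item.get("__type__") == "error":
            print("error:", item.get("reason"), item.get("url"))
        else:
            print(item)

asyncio.run(main())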
wxpath/core/runtime/helpers.py
CHANGED
@@ -6,7 +6,7 @@ from wxpath.util.logging import get_logger
 log = get_logger(__name__)
 
 
-def parse_html(content, base_url=None, **elem_kv_pairs) -> html.HtmlElement:
+def parse_html(content, base_url=None, response=None, **elem_kv_pairs) -> html.HtmlElement:
     elem = etree.HTML(content, parser=patches.html_parser_with_xpath3, base_url=base_url)
     if base_url:
         elem.getroottree().docinfo.URL = base_url  # make base-uri() work
@@ -14,12 +14,15 @@ def parse_html(content, base_url=None, **elem_kv_pairs) -> html.HtmlElement:
         elem.set("{http://www.w3.org/XML/1998/namespace}base", base_url)
         elem.base_url = base_url  # sets both attribute and doc-level URL
 
-
+    if response:
+        elem.response = response
+        elem.getroottree().getroot().response = response
+    # NOTE: some pages may have multiple root elements, i.e.
     # len(elem.itersiblings()) > 0 AND elem.getparent() is None.
     # This breaks elementpath. If elem has siblings, recreate the
     # root element and only the root element.
     if len(list(elem.itersiblings())) > 0:
-        elem = detach_html_root(elem, base_url)
+        elem = detach_html_root(elem, base_url)
 
     for k, v in elem_kv_pairs.items():
         elem.set(k, str(v))
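Because parse_html now threads the HTTP response onto the parsed root, downstream hooks and operators can reach status codes or headers straight from an element. A small illustrative sketch, using a stand-in object in place of a real Response:

from wxpath.core.runtime.helpers import parse_html

class FakeResponse:  # stand-in for the real wxpath Response, just for illustration
    status = 200

elem = parse_html(
    "<html><body><p>hi</p></body></html>",
    base_url="https://example.com",
    response=FakeResponse(),
)
print(elem.response.status)                   # 200
print(elem.getroottree().getroot().response)  # same object, reachable from the tree root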
wxpath/http/client/__init__.py
CHANGED
wxpath/http/client/crawler.py
CHANGED
@@ -71,6 +71,7 @@ class Crawler:
         *,
         headers: dict | None = None,
         proxies: dict | None = None,
+        verify_ssl: bool | None = None,
         retry_policy: RetryPolicy | None = None,
         throttler: AbstractThrottler | None = None,
         auto_throttle_target_concurrency: float = None,
@@ -82,6 +83,9 @@ class Crawler:
 
         self.concurrency = concurrency if concurrency is not None else cfg.concurrency
         self.per_host = per_host if per_host is not None else cfg.per_host
+        self._verify_ssl = verify_ssl if verify_ssl is not None else getattr(
+            cfg, "verify_ssl", True
+        )
 
         timeout = timeout if timeout is not None else cfg.timeout
         self._timeout = aiohttp.ClientTimeout(total=timeout)
@@ -141,7 +145,11 @@
         """Construct an `aiohttp.ClientSession` with tracing and pooling."""
         trace_config = build_trace_config(self._stats)
         # Need to build the connector as late as possible as it requires the loop
-        connector = aiohttp.TCPConnector(
+        connector = aiohttp.TCPConnector(
+            limit=self.concurrency * 2,
+            ttl_dns_cache=300,
+            ssl=self._verify_ssl,
+        )
         return get_async_session(
             headers=self._headers,
             timeout=self._timeout,
@@ -274,22 +282,26 @@
            else:
                log.info("[CACHE MISS]", extra={"req.url": req.url, "resp.url": resp.url})
 
+                _start = time.monotonic()
                body = await resp.read()
 
-
+                end = time.monotonic()
+                latency = end - _start
                self.throttler.record_latency(host, latency)
 
                if self.retry_policy.should_retry(req, response=resp):
                    await self._retry(req)
                    return None
 
-                return Response(req, resp.status, body, dict(resp.headers)
+                return Response(req, resp.status, body, dict(resp.headers),
+                                request_start=_start, response_end=end)
        except asyncio.CancelledError:
            # Normal during shutdown / timeout propagation
            log.debug("cancelled error", extra={"url": req.url})
            raise
        except Exception as exc:
-
+            end = time.monotonic()
+            latency = end - start
            self.throttler.record_latency(host, latency)
 
            if self.retry_policy.should_retry(req, exception=exc):
@@ -297,7 +309,7 @@
                return None
 
            log.error("request failed", extra={"url": req.url}, exc_info=exc)
-            return Response(req, 0, b"", error=exc)
+            return Response(req, 0, b"", error=exc, request_start=start, response_end=end)
 
    async def _retry(self, req: Request) -> None:
        """Reschedule a request according to the retry policy."""
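The new verify_ssl flag flows into aiohttp.TCPConnector(ssl=...), falling back to a verify_ssl attribute on the config (defaulting to True). A minimal sketch of disabling certificate verification, assuming the Crawler's other constructor arguments can stay at their defaults:

from wxpath.http.client.crawler import Crawler

# Assumption: every other Crawler argument is optional and defaults from its config.
crawler = Crawler(verify_ssl=False)  # ends up as aiohttp.TCPConnector(ssl=False)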
wxpath/http/client/response.py
CHANGED
@@ -1,4 +1,3 @@
-# wxpath/http/response.py
 from dataclasses import dataclass, field
 from typing import Optional
 
@@ -12,3 +11,10 @@
     body: bytes
     headers: dict[str, str] | None = None
     error: Optional[Exception] = field(default=None, kw_only=True)
+
+    request_start: float | None = None
+    response_end: float | None = None
+
+    @property
+    def latency(self) -> float:
+        return self.response_end - self.request_start
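Response now carries monotonic timestamps captured around the body read, and latency is just their difference (reading it raises a TypeError while either timestamp is still None). A quick illustration with made-up values, following the positional order the crawler uses (request, status, body, headers):

from wxpath.http.client.response import Response

resp = Response(None, 200, b"<html/>", {"content-type": "text/html"},
                request_start=100.00, response_end=100.25)
print(resp.latency)  # 0.25 (seconds)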
wxpath/http/policy/retry.py
CHANGED
@@ -19,13 +19,13 @@ class RetryPolicy:
 
        if request.max_retries is not None and request.retries >= request.max_retries:
            return False
-
+
        if request.retries >= self.max_retries:
            return False
 
        if response is not None and response.status in self.retry_statuses:
            return True
-
+
        if exception is not None:
            return True
 
wxpath/integrations/__init__.py
File without changes
wxpath/integrations/langchain/__init__.py
File without changes
wxpath/integrations/langchain/examples/basic_rag.py
ADDED
@@ -0,0 +1,85 @@
+
+# pip install langchain langchain-ollama langchain-chroma chromadb
+from langchain_chroma import Chroma
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+from langchain_ollama import ChatOllama, OllamaEmbeddings
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+from wxpath.integrations.langchain.loader import WXPathLoader
+
+# ------------------------------------------------------------------
+# STEP 1: Load & Embed (Same as before)
+# ------------------------------------------------------------------
+print("🕷️ Crawling with wxpath...")
+loader = WXPathLoader(
+    expression="""
+        url('https://docs.python.org/3/library/argparse.html',
+            follow=//a/@href[contains(., 'argparse')])
+        /map{
+            'text': string-join(//div[@role='main']//text()),
+            'source': string(base-uri(.))
+        }
+    """,
+    max_depth=1
+)
+docs = loader.load()
+
+print("🔪 Splitting and Embedding...")
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+splits = text_splitter.split_documents(docs)
+
+vectorstore = Chroma.from_documents(
+    documents=splits,
+    # Must use model that support embeddings (`ollama pull nomic-embed-text`)
+    embedding=OllamaEmbeddings(model="nomic-embed-text"),
+    collection_name="wxpath"
+)
+retriever = vectorstore.as_retriever()
+
+# ------------------------------------------------------------------
+# STEP 2: Define Components
+# ------------------------------------------------------------------
+
+# A helper to join retrieved documents into a single string
+def format_docs(docs):
+    return "\n\n".join(doc.page_content for doc in docs)
+
+# The Prompt (Standard RAG template)
+template = """You are an assistant for question-answering tasks.
+Use the following pieces of retrieved context to answer the question.
+If you don't know the answer, just say that you don't know.
+Use three sentences maximum and keep the answer concise.
+
+Context: {context}
+
+Question: {question}
+
+Answer:"""
+prompt = ChatPromptTemplate.from_template(template)
+
+# The Model
+llm = ChatOllama(model="gemma3")
+
+# ------------------------------------------------------------------
+# STEP 3: Build the Chain with LCEL
+# ------------------------------------------------------------------
+# The pipe operator (|) passes output from one component to the next.
+rag_chain = (
+    {"context": retriever | format_docs, "question": RunnablePassthrough()}
+    | prompt
+    | llm
+    | StrOutputParser()
+)
+
+# ------------------------------------------------------------------
+# STEP 4: Invoke
+# ------------------------------------------------------------------
+query = "How do I add arguments in argparse?"
+print(f"\n❓ Question: {query}")
+
+# The chain returns a string directly because of StrOutputParser
+response = rag_chain.invoke(query)
+
+print(f"\n🤖 Ollama Answer:\n{response}")
wxpath/integrations/langchain/examples/rolling_window_rag.py
ADDED
@@ -0,0 +1,218 @@
+"""
+Rolling Window RAG Example
+
+This examples demonstrates how to use a rolling window of news articles as context.
+
+More importantly, it demonstrates complex string cleanup, metadata extraction, and other
+real-world challenges of building a RAG application.
+
+This script assumes you have gemma3 installed and your machine is capable of running a 32k
+token model.
+"""
+import asyncio
+import datetime
+import threading
+from collections import deque
+from operator import itemgetter
+from typing import List
+
+from langchain_core.callbacks import CallbackManagerForRetrieverRun
+from langchain_core.documents import Document
+from langchain_core.retrievers import BaseRetriever
+
+from wxpath import wxpath_async
+
+# If you have the cache dependency installed, you can enable it:
+# wxpath.settings.CACHE_SETTINGS.enabled = True
+
+# ------------------------------------------------------------------
+# 1. The Rolling Buffer (The "Context Window")
+# ------------------------------------------------------------------
+class RollingNewsBuffer(BaseRetriever):
+    capacity: int = 100
+
+    # Define as PrivateAttrs so Pydantic ignores them for validation
+    _buffer: deque
+    _seen_urls: set
+    _lock: threading.Lock
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self._buffer = deque(maxlen=self.capacity)
+        self._seen_urls = set()
+        self._lock = threading.Lock()
+
+    def add_document(self, doc: Document):
+        """Thread-safe add with url cleanup on eviction."""
+        with self._lock:
+            # Check if we are about to evict an item (buffer full)
+            if len(self._buffer) == self._buffer.maxlen:
+                # We must manually find what is being removed to clean up seen_urls
+                # Note: deque[0] is the one about to be popped when appending
+                oldest_doc = self._buffer[0]
+                oldest_url = oldest_doc.metadata.get("url")
+                if oldest_url in self._seen_urls:
+                    self._seen_urls.remove(oldest_url)
+
+            self._buffer.append(doc)
+            self._seen_urls.add(doc.metadata["url"])
+
+    def is_seen(self, url: str) -> bool:
+        """Thread-safe check."""
+        with self._lock:
+            return url in self._seen_urls
+
+    def _get_relevant_documents(
+        self, query: str, *, run_manager: CallbackManagerForRetrieverRun = None
+    ) -> List[Document]:
+        """
+        Thread-safe read.
+        """
+        with self._lock:
+            # Create a snapshot list while locked to prevent iteration crash
+            snapshot = list(self._buffer)
+
+        print(f"📰 Context Retrieval: Returning {len(snapshot)} docs for query: {query}")
+        return snapshot
+
+# ------------------------------------------------------------------
+# 2. The Background Crawler (The Producer)
+# ------------------------------------------------------------------
+async def continuous_crawl(buffer: RollingNewsBuffer):
+    """
+    Constantly crawls Newsweek and feeds the buffer.
+    """
+    print("🕷️ Crawler started...")
+
+    # Example Expression: deep crawl of newsweek
+    expression = """
+        url('https://www.newsweek.com/')
+        ///url(
+            //a/@href[starts-with(., '/') or starts-with(., './') or contains(., 'newsweek.com')]
+        )
+        /map{
+            'title': //h1/text()[1] ! string(.),
+            'text': string-join(//article//p/text()),
+            'url': string(base-uri(.)),
+            'pubDate': //meta[@name='article:modified_time']/@content[1] ! string(.)
+        }
+    """
+
+    # Infinite loop to restart crawl if it finishes, or run continuously
+    while True:
+        try:
+            # We use the async generator to stream results as they are found
+            async for item in wxpath_async(expression, max_depth=1):
+                item = item._map
+                url = item.get('url')
+                # Check seen status safely before doing processing work
+                if not url or buffer.is_seen(url):
+                    continue
+
+                # Convert wxpath dict to LangChain Document
+                text_content = item.get('text', '')
+                # Basic cleaning (optional)
+                if isinstance(text_content, list):
+                    text_content = " ".join(text_content)
+
+                if not text_content:
+                    continue
+
+                title = item.get('title')
+                if not title:
+                    title = ''
+
+                if isinstance(title, list):
+                    title = " ".join(title)
+
+                pub_date = item.get('pubDate')
+                if not pub_date:
+                    pub_date = str(datetime.date.today())
+
+                text_content = ("Title: " + title +
+                                "\nPublished: " + pub_date + "\n" +
+                                text_content)
+
+                doc = Document(
+                    page_content=text_content,
+                    metadata={"title": item.get('title'),
+                              "url": item.get('url'),
+                              "pubDate": item.get('pubDate')}
+                )
+
+                # PUSH TO BUFFER (Oldest gets evicted automatically if full)
+                buffer.add_document(doc)
+                print(f"📰 Added: {title[:30]}... (Buffer size: {len(buffer._buffer)})")
+                print(f"\tArticle text: {doc.page_content[:100]}...")
+                print()
+            # Rate limit slightly to be polite
+            await asyncio.sleep(60)
+
+        except Exception as e:
+            print(f"⚠️ Crawler error: {e}. Restarting in 10s...")
+            await asyncio.sleep(10)
+
+
+def debug_print_prompt(prompt_value):
+    print("\n" + "="*40)
+    print("📢 FULL PROMPT SENT TO LLM:")
+    print("="*40)
+    print(prompt_value.to_string())  # This prints the exact text
+    print("="*40 + "\n")
+    return prompt_value
+
+if __name__ == "__main__":
+    # Initialize the Rolling Buffer
+    retriever = RollingNewsBuffer(capacity=100)
+
+    # Start Crawler in a background thread so it doesn't block the Chat
+    def start_background_loop(loop):
+        asyncio.set_event_loop(loop)
+        loop.run_until_complete(continuous_crawl(retriever))
+
+    crawler_loop = asyncio.new_event_loop()
+    t = threading.Thread(target=start_background_loop, args=(crawler_loop,), daemon=True)
+    t.start()
+
+    import time
+
+    from langchain_core.prompts import ChatPromptTemplate
+    from langchain_ollama import ChatOllama
+
+    # Setup standard RAG chain
+    llm = ChatOllama(model="gemma3", num_ctx=32768)
+    prompt = ChatPromptTemplate.from_template(
+        "Answer based ONLY on the following news:\n\n{context}\n\nQuestion: {question}\n\n"
+        "DO NOT include generic Newsweek-administrative articles like 'Corrections', "
+        "'Company Info', 'Subscribe', Opinions', 'Press Releases', 'Editorials', etc. in your "
+        "analysis or answers. Answer the question using the non-Newsweek-related news provided. "
+        "You will be penalized for including old or undated news in your answer. If asked for "
+        "overviews or summaries, split news items into paragraphs and provide a summary of each "
+        "news item."
+    )
+
+    def format_docs(docs):
+        slice_of_news = "\n\n".join([d.page_content[:1000] for d in docs])  # Truncate for demo
+        print(f"📰 Latest news char length: {len(slice_of_news)}")
+        return slice_of_news
+
+    chain = (
+        {
+            # FIX: Use itemgetter so retriever gets a string, not a dict
+            "context": itemgetter("question") | retriever | format_docs,
+            "question": itemgetter("question")
+        }
+        | prompt
+        | debug_print_prompt
+        | llm
+    )
+
+    # Simulate querying constantly while buffer fills in background
+    print("⏳ Waiting for crawler to gather some data...")
+    time.sleep(10)
+
+    while True:
+        query = input("Press Enter to ask about current news (or Ctrl+C to quit)...")
+        print(f"\nQuery: {query}\nThinking... 🤔")
+        response = chain.invoke({"question": query})
+        print(response.content)