wxpath 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wxpath/__init__.py CHANGED
@@ -1,3 +1,4 @@
+ from . import settings
  from .core.runtime.engine import wxpath_async, wxpath_async_blocking, wxpath_async_blocking_iter
  from .util.logging import configure_logging

@@ -6,4 +7,5 @@ __all__ = [
  'wxpath_async_blocking',
  'wxpath_async_blocking_iter',
  'configure_logging',
+ 'settings',
  ]
wxpath/cli.py CHANGED
@@ -47,6 +47,11 @@ def main():
  help="Respect robots.txt",
  default=True
  )
+ arg_parser.add_argument(
+ "--insecure",
+ action="store_true",
+ help="Disable SSL certificate verification (use for sites with broken chains)",
+ )
  arg_parser.add_argument(
  "--cache",
  action="store_true",
@@ -112,6 +117,7 @@ def main():
  concurrency=args.concurrency,
  per_host=args.concurrency_per_host,
  respect_robots=args.respect_robots,
+ verify_ssl=not args.insecure,
  headers=custom_headers
  )
  engine = WXPathEngine(crawler=crawler)
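
The new --insecure flag is plumbed straight into the crawler's verify_ssl argument. A minimal programmatic equivalent, as a sketch only (the WXPathEngine import path is an assumption inferred from the package's __init__ imports; the Crawler import path appears in the engine module's imports below):

    from wxpath.core.runtime.engine import WXPathEngine  # assumed module path
    from wxpath.http.client.crawler import Crawler

    # Equivalent of running the CLI with --insecure: skip certificate
    # verification, leave all other crawler settings at their defaults.
    crawler = Crawler(verify_ssl=False)
    engine = WXPathEngine(crawler=crawler)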
wxpath/core/models.py CHANGED
@@ -61,6 +61,7 @@ class InfiniteCrawlIntent(ProcessIntent):

  @dataclass(slots=True)
  class ExtractIntent(ProcessIntent):
+ """TODO: May be redundant with ProcessIntent?"""
  pass


wxpath/core/ops.py CHANGED
@@ -19,6 +19,7 @@ from wxpath.core.parser import (
  Binary,
  Call,
  ContextItem,
+ Depth,
  Segment,
  Segments,
  String,
@@ -78,7 +79,10 @@ def get_operator(


  @register('url', (String,))
+ @register('url', (String, Depth))
  @register('url', (String, Xpath))
+ @register('url', (String, Depth, Xpath))
+ @register('url', (String, Xpath, Depth))
  def _handle_url_str_lit(curr_elem: html.HtmlElement,
  curr_segments: list[Url | Xpath],
  curr_depth: int, **kwargs) -> Iterable[Intent]:
@@ -87,9 +91,12 @@ _handle_url_str_lit(curr_elem: html.HtmlElement,

  next_segments = curr_segments[1:]

- if len(url_call.args) == 2:
+ # NOTE: Expects parser to produce UrlCrawl node in expressions
+ # that look like `url('...', follow=//a/@href)`
+ if isinstance(url_call, UrlCrawl):
+ xpath_arg = [arg for arg in url_call.args if isinstance(arg, Xpath)][0]
  _segments = [
- UrlCrawl('///url', [url_call.args[1], url_call.args[0].value])
+ UrlCrawl('///url', [xpath_arg, url_call.args[0].value])
  ] + next_segments

  yield CrawlIntent(url=url_call.args[0].value, next_segments=_segments)
@@ -112,16 +119,6 @@ def _handle_xpath(curr_elem: html.HtmlElement,
  raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
  base_url = getattr(curr_elem, 'base_url', None)
  log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
-
- _backlink_str = f"string('{curr_elem.get('backlink')}')"
- # We use the root tree's depth and not curr_depth because curr_depth accounts for a +1
- # increment after each url*() hop
- _depth_str = f"number({curr_elem.getroottree().getroot().get('depth')})"
- expr = expr.replace('wx:backlink()', _backlink_str)
- expr = expr.replace('wx:backlink(.)', _backlink_str)
- expr = expr.replace('wx:depth()', _depth_str)
- expr = expr.replace('wx:depth(.)', _depth_str)
-
  elems = curr_elem.xpath3(expr)

  next_segments = curr_segments[1:]
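
For orientation, the new @register signatures line up with the depth-aware call shapes introduced in parser.py. A rough mapping, with illustrative expressions taken from the parser docstring examples further down (the exact surface syntax is an assumption):

    # Registered argument-type tuples and the url() call shapes they are meant to cover.
    examples = {
        ("String",):                  "url('https://example.com')",
        ("String", "Depth"):          "url('https://example.com', depth=2)",
        ("String", "Xpath"):          "url('https://example.com', follow=//a/@href)",
        ("String", "Xpath", "Depth"): "url('https://example.com', follow=//a/@href, depth=2)",
        ("String", "Depth", "Xpath"): "url('https://example.com', depth=2, follow=//a/@href)",
    }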
wxpath/core/parser.py CHANGED
@@ -13,7 +13,8 @@ except ImportError:


  TOKEN_SPEC = [
- ("NUMBER", r"\d+(\.\d+)?"),
+ ("NUMBER", r"\d+\.\d+"),
+ ("INTEGER", r"\d+"),
  ("STRING", r"'([^'\\]|\\.)*'|\"([^\"\\]|\\.)*\""), # TODO: Rename to URL Literal
  ("WXPATH", r"/{0,3}\s*url"), # Must come before NAME to match 'url' as WXPATH
  # ("///URL", r"/{3}\s*url"),
@@ -22,6 +23,7 @@ TOKEN_SPEC = [
  ("URL", r"\s*url"), # Must come before NAME to match 'url' as WXPATH
  # ("NAME", r"[a-zA-Z_][a-zA-Z0-9_]*"),
  ("FOLLOW", r",?\s{,}follow="),
+ ("DEPTH", r",?\s{,}depth="),
  ("OP", r"\|\||<=|>=|!=|=|<|>|\+|-|\*|/|!"), # Added || for string concat
  ("LPAREN", r"\("),
  ("RPAREN", r"\)"),
@@ -63,6 +65,14 @@ def tokenize(src: str):
  class Number:
  value: float

+ @dataclass
+ class Integer:
+ value: int
+
+ @dataclass
+ class Depth(Integer):
+ pass
+
  @dataclass
  class String:
  value: str
@@ -273,6 +283,10 @@ class Parser:
  if tok.type == "NUMBER":
  self.advance()
  return Number(float(tok.value))
+
+ if tok.type == "INTEGER":
+ self.advance()
+ return Integer(int(tok.value))

  if tok.type == "STRING":
  self.advance()
@@ -358,18 +372,18 @@
  self.advance()

  return result
-

  def capture_url_arg_content(self) -> list[Call | Xpath | ContextItem]:
  """Capture content inside a url() call, handling nested wxpath expressions.

  Supports patterns like::

- url('...') -> [String]
- url('...' follow=//a/@href) -> [String, Xpath]
- url(//a/@href) -> [Xpath]
- url( url('..')//a/@href ) -> [Call, Xpath]
- url( url( url('..')//a )//b ) -> [Call, Xpath]
+ url('...') -> [String]
+ url('...' follow=//a/@href) -> [String, Xpath]
+ url('...' follow=//a/@href depth=2) -> [String, Xpath, Integer]
+ url(//a/@href depth=2) -> [Xpath, Integer]
+ url( url('..')//a/@href ) -> [Call, Xpath]
+ url( url( url('..')//a )//b ) -> [Call, Xpath]

  Returns:
  A list of parsed elements: Xpath nodes for xpath content and Call
@@ -380,7 +394,10 @@
  paren_balance = 1 # We're already inside the opening paren of url()
  brace_balance = 0 # Track braces for map constructors
  reached_follow_token = False
+ reached_depth_token = False
  follow_xpath = ""
+ depth_number = ""
+
  while paren_balance > 0 and self.token.type != "EOF":
  if self.token.type == "WXPATH":
  # Found nested wxpath: save any accumulated xpath content first
@@ -396,13 +413,22 @@

  elif self.token.type == "FOLLOW":
  reached_follow_token = True
+ reached_depth_token = False
+ self.advance()
+
+ elif self.token.type == "DEPTH":
+ reached_depth_token = True
+ reached_follow_token = False
  self.advance()

  elif self.token.type == "LPAREN":
  # Opening paren that's NOT part of a url() call
  # (it's part of an xpath function like contains(), starts-with(), etc.)
  paren_balance += 1
- current_xpath += self.token.value
+ if not reached_follow_token:
+ current_xpath += self.token.value
+ else:
+ follow_xpath += self.token.value
  self.advance()

  elif self.token.type == "RPAREN":
@@ -410,26 +436,37 @@
  if paren_balance == 0:
  # This is the closing paren of the outer url()
  break
- current_xpath += self.token.value
+ if not reached_follow_token:
+ current_xpath += self.token.value
+ else:
+ follow_xpath += self.token.value
  self.advance()

  elif self.token.type == "LBRACE":
  # Opening brace for map constructors
  brace_balance += 1
- current_xpath += self.token.value
+ if not reached_follow_token:
+ current_xpath += self.token.value
+ else:
+ follow_xpath += self.token.value
  self.advance()

  elif self.token.type == "RBRACE":
  brace_balance -= 1
- current_xpath += self.token.value
+ if not reached_follow_token:
+ current_xpath += self.token.value
+ else:
+ follow_xpath += self.token.value
  self.advance()

  else:
  # Accumulate all other tokens as xpath content
- if not reached_follow_token:
- current_xpath += self.token.value
- else:
+ if reached_follow_token:
  follow_xpath += self.token.value
+ elif reached_depth_token:
+ depth_number += self.token.value
+ else:
+ current_xpath += self.token.value

  self.advance()

@@ -447,6 +484,9 @@
  if follow_xpath.strip():
  elements.append(Xpath(follow_xpath.strip()))

+ if depth_number.strip():
+ elements.append(Depth(int(depth_number.strip())))
+
  return elements

  def parse_call(self, func_name: str) -> Call | Segments:
@@ -462,13 +502,16 @@
  self.advance()
  # Handle follow=...
  if self.token.type == "FOLLOW":
- self.advance()
  follow_arg = self.capture_url_arg_content()
  args.extend(follow_arg)
+ if self.token.type == "DEPTH":
+ depth_arg = self.capture_url_arg_content()
+ args.extend(depth_arg)
  elif self.token.type == "WXPATH":
  # Nested wxpath: url( url('...')//a/@href ) or url( /url(...) )
- # Use capture_url_arg_content to handle nested wxpath and xpath
- args = self.capture_url_arg_content()
+ # NOTE: We used to use capture_url_arg_content to handle nested wxpath and xpath
+ # args = self.capture_url_arg_content()
+ args = self.nud()
  else:
  # Simple xpath argument: url(//a/@href)
  # Could still contain nested wxpath, so use capture_url_arg_content
@@ -489,8 +532,18 @@

  return _specify_call_types(func_name, args)

-
  def _specify_call_types(func_name: str, args: list) -> Call | Segments:
+ """
+ Specify the type of a call based on the function name and arguments.
+ TODO: Provide example wxpath expressions for each call type.
+
+ Args:
+ func_name: The name of the function.
+ args: The arguments of the function.
+
+ Returns:
+ Call | Segments: The type of the call.
+ """
  if func_name == "url":
  if len(args) == 1:
  if isinstance(args[0], String):
@@ -500,17 +553,33 @@ def _specify_call_types(func_name: str, args: list) -> Call | Segments:
  else:
  raise ValueError(f"Unknown argument type: {type(args[0])}")
  elif len(args) == 2:
- if isinstance(args[0], String) and isinstance(args[1], Xpath):
+ arg0, arg1 = args
+ if isinstance(arg0, String) and isinstance(arg1, Xpath):
+ # Example: url('...', follow=//a/@href)
  return UrlCrawl(func_name, args)
- elif isinstance(args[0], UrlLiteral) and isinstance(args[1], Xpath):
+ elif isinstance(arg0, String) and isinstance(arg1, Integer):
+ # Example: url('...', depth=2)
+ return UrlLiteral(func_name, args)
+ elif isinstance(arg0, UrlLiteral) and isinstance(arg1, Xpath):
  args.append(UrlQuery('url', [ContextItem()]))
  return Segments(args)
- elif isinstance(args[0], (Segments, list)) and isinstance(args[1], Xpath):
- segs = args[0]
- segs.append(args[1])
+ elif isinstance(arg0, (Segments, list)) and isinstance(arg1, Xpath):
+ segs = arg0
+ segs.append(arg1)
  return Segments(segs)
  else:
  raise ValueError(f"Unknown arguments: {args}")
+ elif len(args) == 3:
+ arg0, arg1, arg2 = args
+ if (isinstance(arg0, String) and (
+ (isinstance(arg1, Xpath) and isinstance(arg2, Integer)) or
+ (isinstance(arg1, Integer) and isinstance(arg2, Xpath))
+ )):
+ # Example: url('...', follow=//a/@href, depth=2)
+ # Example: url('...', depth=2, follow=//a/@href)
+ return UrlCrawl(func_name, args)
+ else:
+ raise ValueError(f"Unknown arguments: {args}")
  else:
  raise ValueError(f"Unknown arguments: {args}")
  elif func_name == "/url" or func_name == "//url":
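
Taken together, the DEPTH token, the Depth node, and the _specify_call_types branches let an inline depth cap ride along in the parsed AST. A minimal sketch of what parsing the new syntax is expected to yield (the module-level parse entry point and import path are assumptions inferred from the engine's `parser.parse(expression)` call below):

    from wxpath.core import parser  # assumed import path

    ast = parser.parse("url('https://example.com', follow=//a/@href, depth=2)//h1/text()")
    # Per _specify_call_types, the url(...) call should come back as a UrlCrawl whose
    # args include a Depth(2) node; the engine later reads that node to cap the crawl.
    print(ast)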
@@ -18,7 +18,7 @@ from wxpath.core.models import (
  ProcessIntent,
  )
  from wxpath.core.ops import get_operator
- from wxpath.core.parser import Binary, Segment, Segments
+ from wxpath.core.parser import Binary, Depth, Segment, Segments
  from wxpath.core.runtime.helpers import parse_html
  from wxpath.hooks.registry import FetchContext, get_hooks
  from wxpath.http.client.crawler import Crawler
@@ -158,6 +158,25 @@ class WXPathEngine(HookedEngineBase):
  if allow_redirects:
  self.allowed_response_codes |= {301, 302, 303, 307, 308}

+ def _get_max_depth(self, bin_or_segs: Binary | Segments, max_depth: int) -> int:
+ """Get the maximum crawl depth for a given expression. Will find a Depth
+ argument at the beginning of the expression and return its value. Otherwise, returns the
+ max_depth value provided.
+ TODO: There has to be a better way to do this.
+ """
+ if isinstance(bin_or_segs, Binary):
+ if hasattr(bin_or_segs.left, 'func') == 'url':
+ depth_arg = [arg for arg in bin_or_segs.left.args if isinstance(arg, Depth)][0]
+ return int(depth_arg.value)
+ elif hasattr(bin_or_segs.right, 'func') == 'url':
+ depth_arg = [arg for arg in bin_or_segs.right.args if isinstance(arg, Depth)][0]
+ return int(depth_arg.value)
+ elif isinstance(bin_or_segs, Segments):
+ depth_arg = [arg for arg in bin_or_segs[0].args if isinstance(arg, Depth)]
+ if depth_arg:
+ return int(depth_arg[0].value)
+ return max_depth
+
  async def run(
  self,
  expression: str,
@@ -170,6 +189,17 @@
  Builds and drives a BFS-like crawl pipeline that honors robots rules,
  throttling, and hook callbacks while walking the web graph.

+ NOTES ON max_depth:
+ If depth is provided in the expression, it will be used to limit the depth of the
+ crawl. If depth is provided in the expression and max_depth is provided as an argument
+ to `run`, the inline depth in the expression will take precedence.
+
+ Currently, max_depth control flow logic is detected and executed in the
+ engine. In the future, the operation handlers (ops.py) could be responsible for
+ detecting max_depth, and sending a terminal intent to the engine. It's also possible
+ that the depth terminals are relative to the current depth (i.e. `url(//xpath, depth=2)`
+ implies crawling only the next 2 levels). This is not yet supported.
+
  Args:
  expression: WXPath expression string to evaluate.
  max_depth: Maximum crawl depth to follow for url hops.
@@ -179,7 +209,9 @@
  Extracted values produced by the expression (HTML elements or
  wxpath-specific value types).
  """
- segments = parser.parse(expression)
+ bin_or_segs = parser.parse(expression)
+
+ max_depth = self._get_max_depth(bin_or_segs, max_depth)

  queue: asyncio.Queue[CrawlTask] = asyncio.Queue()
  inflight: dict[str, CrawlTask] = {}
@@ -223,7 +255,7 @@
  seed_task = CrawlTask(
  elem=None,
  url=None,
- segments=segments,
+ segments=bin_or_segs,
  depth=-1,
  backlink=None,
  )
@@ -307,6 +339,7 @@
  base_url=task.url,
  backlink=task.backlink,
  depth=task.depth,
+ response=resp
  )

  elem = await self.post_parse_hooks(elem, task)
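
The net effect of _get_max_depth plus the run() changes is the precedence rule spelled out in the docstring: an inline depth= in the expression wins over the max_depth argument. A rough sketch under the same import-path assumptions as above (it exercises a private method purely for illustration):

    from wxpath.core import parser                        # assumed path
    from wxpath.core.runtime.engine import WXPathEngine   # assumed path
    from wxpath.http.client.crawler import Crawler

    engine = WXPathEngine(crawler=Crawler())
    bin_or_segs = parser.parse("url('https://example.com', follow=//a/@href, depth=2)")
    # The inline Depth node found in the first segment should override the value
    # passed in, so this is expected to print 2 rather than 10.
    print(engine._get_max_depth(bin_or_segs, max_depth=10))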
@@ -6,7 +6,7 @@ from wxpath.util.logging import get_logger
  log = get_logger(__name__)


- def parse_html(content, base_url=None, **elem_kv_pairs) -> html.HtmlElement:
+ def parse_html(content, base_url=None, response=None, **elem_kv_pairs) -> html.HtmlElement:
  elem = etree.HTML(content, parser=patches.html_parser_with_xpath3, base_url=base_url)
  if base_url:
  elem.getroottree().docinfo.URL = base_url # make base-uri() work
@@ -14,12 +14,15 @@ def parse_html(content, base_url=None, **elem_kv_pairs) -> html.HtmlElement:
  elem.set("{http://www.w3.org/XML/1998/namespace}base", base_url)
  elem.base_url = base_url # sets both attribute and doc-level URL

- # NOTE: some pages may have multiple root elements, i.e.
+ if response:
+ elem.response = response
+ elem.getroottree().getroot().response = response
+ # NOTE: some pages may have multiple root elements, i.e.
  # len(elem.itersiblings()) > 0 AND elem.getparent() is None.
  # This breaks elementpath. If elem has siblings, recreate the
  # root element and only the root element.
  if len(list(elem.itersiblings())) > 0:
- elem = detach_html_root(elem, base_url)
+ elem = detach_html_root(elem, base_url)

  for k, v in elem_kv_pairs.items():
  elem.set(k, str(v))
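
With the new response parameter, the parsed tree carries the originating Response object, so downstream hooks and extractors can reach HTTP metadata from any element. A small sketch (the Response construction is illustrative; positional field order follows the crawler's `Response(req, status, body, headers, ...)` call, and None stands in for the request):

    from wxpath.core.runtime.helpers import parse_html
    from wxpath.http.client.response import Response

    resp = Response(None, 200, b"<html><body><h1>Hi</h1></body></html>", {})
    elem = parse_html(resp.body, base_url="https://example.com", response=resp)
    # The response is attached both to the element and to the tree root.
    assert elem.getroottree().getroot().response is resp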
@@ -5,5 +5,5 @@ from wxpath.http.client.response import Response
  __all__ = [
  "Crawler",
  "Request",
- "Response"
+ "Response",
  ]
@@ -71,6 +71,7 @@ class Crawler:
  *,
  headers: dict | None = None,
  proxies: dict | None = None,
+ verify_ssl: bool | None = None,
  retry_policy: RetryPolicy | None = None,
  throttler: AbstractThrottler | None = None,
  auto_throttle_target_concurrency: float = None,
@@ -82,6 +83,9 @@ class Crawler:

  self.concurrency = concurrency if concurrency is not None else cfg.concurrency
  self.per_host = per_host if per_host is not None else cfg.per_host
+ self._verify_ssl = verify_ssl if verify_ssl is not None else getattr(
+ cfg, "verify_ssl", True
+ )

  timeout = timeout if timeout is not None else cfg.timeout
  self._timeout = aiohttp.ClientTimeout(total=timeout)
@@ -141,7 +145,11 @@ class Crawler:
  """Construct an `aiohttp.ClientSession` with tracing and pooling."""
  trace_config = build_trace_config(self._stats)
  # Need to build the connector as late as possible as it requires the loop
- connector = aiohttp.TCPConnector(limit=self.concurrency*2, ttl_dns_cache=300)
+ connector = aiohttp.TCPConnector(
+ limit=self.concurrency * 2,
+ ttl_dns_cache=300,
+ ssl=self._verify_ssl,
+ )
  return get_async_session(
  headers=self._headers,
  timeout=self._timeout,
@@ -274,22 +282,26 @@ class Crawler:
  else:
  log.info("[CACHE MISS]", extra={"req.url": req.url, "resp.url": resp.url})

+ _start = time.monotonic()
  body = await resp.read()

- latency = time.monotonic() - start
+ end = time.monotonic()
+ latency = end - _start
  self.throttler.record_latency(host, latency)

  if self.retry_policy.should_retry(req, response=resp):
  await self._retry(req)
  return None

- return Response(req, resp.status, body, dict(resp.headers))
+ return Response(req, resp.status, body, dict(resp.headers),
+ request_start=_start, response_end=end)
  except asyncio.CancelledError:
  # Normal during shutdown / timeout propagation
  log.debug("cancelled error", extra={"url": req.url})
  raise
  except Exception as exc:
- latency = time.monotonic() - start
+ end = time.monotonic()
+ latency = end - start
  self.throttler.record_latency(host, latency)

  if self.retry_policy.should_retry(req, exception=exc):
@@ -297,7 +309,7 @@ class Crawler:
  return None

  log.error("request failed", extra={"url": req.url}, exc_info=exc)
- return Response(req, 0, b"", error=exc)
+ return Response(req, 0, b"", error=exc, request_start=start, response_end=end)

  async def _retry(self, req: Request) -> None:
  """Reschedule a request according to the retry policy."""
@@ -1,4 +1,3 @@
- # wxpath/http/response.py
  from dataclasses import dataclass, field
  from typing import Optional

@@ -12,3 +11,10 @@ class Response:
  body: bytes
  headers: dict[str, str] | None = None
  error: Optional[Exception] = field(default=None, kw_only=True)
+
+ request_start: float | None = None
+ response_end: float | None = None
+
+ @property
+ def latency(self) -> float:
+ return self.response_end - self.request_start
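
The timing fields recorded by the crawler make latency a simple derived value: the span between request_start and response_end. For example (positional arguments follow the crawler's `Response(req, status, body, headers, ...)` call above, with None standing in for the request):

    from wxpath.http.client.response import Response

    resp = Response(None, 200, b"", {}, request_start=100.0, response_end=100.5)
    print(resp.latency)  # 0.5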
@@ -19,13 +19,13 @@ class RetryPolicy:

  if request.max_retries is not None and request.retries >= request.max_retries:
  return False
-
+
  if request.retries >= self.max_retries:
  return False

  if response is not None and response.status in self.retry_statuses:
  return True
-
+
  if exception is not None:
  return True

@@ -0,0 +1,85 @@
+
+ # pip install langchain langchain-ollama langchain-chroma chromadb
+ from langchain_chroma import Chroma
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_core.runnables import RunnablePassthrough
+ from langchain_ollama import ChatOllama, OllamaEmbeddings
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+ from wxpath.integrations.langchain.loader import WXPathLoader
+
+ # ------------------------------------------------------------------
+ # STEP 1: Load & Embed (Same as before)
+ # ------------------------------------------------------------------
+ print("🕷️ Crawling with wxpath...")
+ loader = WXPathLoader(
+ expression="""
+ url('https://docs.python.org/3/library/argparse.html',
+ follow=//a/@href[contains(., 'argparse')])
+ /map{
+ 'text': string-join(//div[@role='main']//text()),
+ 'source': string(base-uri(.))
+ }
+ """,
+ max_depth=1
+ )
+ docs = loader.load()
+
+ print("🔪 Splitting and Embedding...")
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+ splits = text_splitter.split_documents(docs)
+
+ vectorstore = Chroma.from_documents(
+ documents=splits,
+ # Must use model that support embeddings (`ollama pull nomic-embed-text`)
+ embedding=OllamaEmbeddings(model="nomic-embed-text"),
+ collection_name="wxpath"
+ )
+ retriever = vectorstore.as_retriever()
+
+ # ------------------------------------------------------------------
+ # STEP 2: Define Components
+ # ------------------------------------------------------------------
+
+ # A helper to join retrieved documents into a single string
+ def format_docs(docs):
+ return "\n\n".join(doc.page_content for doc in docs)
+
+ # The Prompt (Standard RAG template)
+ template = """You are an assistant for question-answering tasks.
+ Use the following pieces of retrieved context to answer the question.
+ If you don't know the answer, just say that you don't know.
+ Use three sentences maximum and keep the answer concise.
+
+ Context: {context}
+
+ Question: {question}
+
+ Answer:"""
+ prompt = ChatPromptTemplate.from_template(template)
+
+ # The Model
+ llm = ChatOllama(model="gemma3")
+
+ # ------------------------------------------------------------------
+ # STEP 3: Build the Chain with LCEL
+ # ------------------------------------------------------------------
+ # The pipe operator (|) passes output from one component to the next.
+ rag_chain = (
+ {"context": retriever | format_docs, "question": RunnablePassthrough()}
+ | prompt
+ | llm
+ | StrOutputParser()
+ )
+
+ # ------------------------------------------------------------------
+ # STEP 4: Invoke
+ # ------------------------------------------------------------------
+ query = "How do I add arguments in argparse?"
+ print(f"\n❓ Question: {query}")
+
+ # The chain returns a string directly because of StrOutputParser
+ response = rag_chain.invoke(query)
+
+ print(f"\n🤖 Ollama Answer:\n{response}")