wxpath-0.2.0-py3-none-any.whl → wxpath-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wxpath/cli.py CHANGED
@@ -2,44 +2,84 @@ import argparse
 import json
 import sys
 
-from wxpath.core.parser import parse_wxpath_expr
+from wxpath.core import parser as wxpath_parser
 from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter
 from wxpath.hooks import builtin, registry
+from wxpath.http.client.crawler import Crawler
 from wxpath.util.serialize import simplify
 
 
 def main():
     registry.register(builtin.SerializeXPathMapAndNodeHook)
-    parser = argparse.ArgumentParser(description="Run wxpath expression.")
-    parser.add_argument("expression", help="The wxpath expression")
-    parser.add_argument("--depth", type=int, default=1, help="Recursion depth")
+    arg_parser = argparse.ArgumentParser(description="Run wxpath expression.")
+    arg_parser.add_argument("expression", help="The wxpath expression")
+    arg_parser.add_argument("--depth", type=int, default=1, help="Recursion depth")
 
     # debug
-    parser.add_argument("--debug", action="store_true", help="Debug mode")
+    arg_parser.add_argument("--debug", action="store_true", help="Debug mode")
 
     # verbose
-    parser.add_argument("--verbose", action="store_true", help="Verbose mode")
+    arg_parser.add_argument("--verbose", action="store_true", help="Verbose mode")
 
-    parser.add_argument("--concurrency", type=int, default=16, help="Number of concurrent fetches")
-    parser.add_argument(
+    arg_parser.add_argument(
+        "--concurrency",
+        type=int,
+        default=16,
+        help="Number of concurrent fetches"
+    )
+    arg_parser.add_argument(
         "--concurrency-per-host",
         type=int,
         default=8,
         help="Number of concurrent fetches per host"
     )
+    arg_parser.add_argument(
+        "--header",
+        action="append",
+        dest="header_list",
+        default=[],
+        help="Add a custom header (e.g., 'Key:Value'). Can be used multiple times.",
+    )
+    arg_parser.add_argument(
+        "--respect-robots",
+        action="store_true",
+        help="Respect robots.txt",
+        default=True
+    )
 
-    args = parser.parse_args()
+    args = arg_parser.parse_args()
 
     if args.verbose:
-        print("wxpath expression:", args.expression)
-        print("parsed expression:", parse_wxpath_expr(args.expression))
+        segments = wxpath_parser.parse(args.expression)
+        print("parsed expression:\n\nSegments([")
+        for s in segments:
+            print(f"\t{s},")
+        print("])")
+        print()
 
     if args.debug:
         from wxpath import configure_logging
         configure_logging('DEBUG')
 
-    engine = WXPathEngine(
+    custom_headers = {}
+    if args.header_list:
+        for header_item in args.header_list:
+            try:
+                key, value = header_item.split(':', 1)
+                custom_headers[key.strip()] = value.strip()
+            except ValueError:
+                print(f"Warning: Invalid header format '{header_item}'. Use 'Key:Value'.")
+
+    if custom_headers and args.verbose:
+        print(f"Using custom headers: {custom_headers}")
+        print()
+
+    crawler = Crawler(
         concurrency=args.concurrency,
         per_host=args.concurrency_per_host,
+        respect_robots=args.respect_robots,
+        headers=custom_headers
     )
+    engine = WXPathEngine(crawler=crawler)
+
     try:
         for r in wxpath_async_blocking_iter(args.expression, args.depth, engine):
             clean = simplify(r)
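
The header-parsing loop above splits each --header value on the first ':' only,
so header values that themselves contain colons (for example,
--header 'Referer:https://example.com/start') survive intact. A standalone
restatement of that logic, with made-up header values:

    # Mirrors the split(':', 1) parsing in main(); inputs are illustrative.
    header_list = ["User-Agent:my-bot/0.1", "Referer:https://example.com/start"]
    custom_headers = {}
    for header_item in header_list:
        key, value = header_item.split(':', 1)   # split once; ':' kept in value
        custom_headers[key.strip()] = value.strip()
    print(custom_headers)
    # {'User-Agent': 'my-bot/0.1', 'Referer': 'https://example.com/start'}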
wxpath/core/ops.py CHANGED
@@ -1,7 +1,3 @@
-"""
-`ops` for "operations". This module contains side-effect-free functions (operators)
-for handling each segment of a wxpath expression.
-"""
 from typing import Callable, Iterable
 from urllib.parse import urljoin
 
@@ -19,16 +15,24 @@ from wxpath.core.models import (
     Intent,
     ProcessIntent,
 )
-from wxpath.core.parser import OPS, Segment, UrlInfAndXpathValue, XpathValue
+from wxpath.core.parser import (
+    Binary,
+    Call,
+    ContextItem,
+    Segment,
+    Segments,
+    String,
+    Url,
+    UrlCrawl,
+    Xpath,
+)
 from wxpath.util.logging import get_logger
 
 log = get_logger(__name__)
 
 
 class WxStr(str):
-    """
-    A string that has a base_url and depth associated with it. Purely for debugging.
-    """
+    """A string with associated base_url and depth metadata for debugging."""
     def __new__(cls, value, base_url=None, depth=-1):
        obj = super().__new__(cls, value)
        obj.base_url = base_url
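
WxStr, whose docstring this hunk tightens, is a plain str subclass that carries
provenance. Restated standalone (the obj.depth assignment and return fall in
the unshown lines between this hunk and the next; the values below are
illustrative):

    class WxStr(str):
        """A string with associated base_url and depth metadata for debugging."""
        def __new__(cls, value, base_url=None, depth=-1):
            obj = super().__new__(cls, value)
            obj.base_url = base_url
            obj.depth = depth
            return obj

        def __repr__(self):
            return f"WxStr({super().__repr__()}, base_url={self.base_url!r}, depth={self.depth})"

    s = WxStr("Example Domain", base_url="https://example.com", depth=1)
    assert s == "Example Domain"   # equality and hashing stay plain-str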
@@ -39,61 +43,120 @@ class WxStr(str):
         return f"WxStr({super().__repr__()}, base_url={self.base_url!r}, depth={self.depth})"
 
 
-HANDLERS: dict[str, Callable] = {}
+class RuntimeSetupError(Exception):
+    pass
+
+
+OPS_REGISTER: dict[str, Callable] = {}
+
+def register(func_name_or_type: str | type, args_types: tuple[type, ...] | None = None):
+    def _register(func: Callable) -> Callable:
+        global OPS_REGISTER
+        _key = (func_name_or_type, args_types) if args_types else func_name_or_type
+        if _key in OPS_REGISTER:
+            raise RuntimeSetupError(f"The operation handler for \"{_key}\" already registered")
+        OPS_REGISTER[_key] = func
+        return func
+    return _register
 
-def _op(name: OPS):
-    def reg(fn):
-        if name in HANDLERS:
-            raise ValueError(f"Duplicate operation: {name}")
-        HANDLERS[name] = fn
-        return fn
-    return reg
 
+def get_operator(
+    binary_or_segment: Binary | Segment
+) -> Callable[[html.HtmlElement, list[Url | Xpath], int], Iterable[Intent]]:
+    func_name_or_type = getattr(binary_or_segment, 'func', None) or binary_or_segment.__class__
 
-def get_operator(name: OPS) -> Callable[[html.HtmlElement, list[Segment], int], Iterable[Intent]]:
-    if name not in HANDLERS:
-        raise ValueError(f"Unknown operation: {name}")
-    return HANDLERS[name]
+    args_types = None
+    if isinstance(binary_or_segment, Binary):
+        args_types = (binary_or_segment.left.__class__, binary_or_segment.right.__class__)
+    elif isinstance(binary_or_segment, Call):
+        args_types = tuple(arg.__class__ for arg in binary_or_segment.args)
 
+    _key = (func_name_or_type, args_types) if args_types else func_name_or_type
+    if _key not in OPS_REGISTER:
+        raise ValueError(f"Unknown operation: {_key}")
+    return OPS_REGISTER[_key]
 
-@_op(OPS.URL_STR_LIT)
+
+@register('url', (String,))
+@register('url', (String, Xpath))
 def _handle_url_str_lit(curr_elem: html.HtmlElement,
-                        curr_segments: list[Segment],
+                        curr_segments: list[Url | Xpath],
                         curr_depth: int, **kwargs) -> Iterable[Intent]:
-    op, value = curr_segments[0]
-
-    log.debug("queueing", extra={"depth": curr_depth, "op": op, "url": value.target})
+    """Handle `url('<literal>')` segments and optional follow xpath."""
+    url_call = curr_segments[0]  # type: Url
 
     next_segments = curr_segments[1:]
 
-    if value.follow:
+    if len(url_call.args) == 2:
         _segments = [
-            (OPS.URL_INF_AND_XPATH, UrlInfAndXpathValue('', value.target, value.follow))
+            UrlCrawl('///url', [url_call.args[1], url_call.args[0].value])
         ] + next_segments
 
-        yield CrawlIntent(url=value.target, next_segments=_segments)
+        yield CrawlIntent(url=url_call.args[0].value, next_segments=_segments)
     else:
-        yield CrawlIntent(url=value.target, next_segments=next_segments)
+        yield CrawlIntent(url=url_call.args[0].value, next_segments=next_segments)
+
+
+# @register2('url', (Xpath,))
+@register(Xpath)
+def _handle_xpath(curr_elem: html.HtmlElement,
+                  curr_segments: Segments,
+                  curr_depth: int,
+                  **kwargs) -> Iterable[Intent]:
+    """Execute an xpath step and yield data or chained processing intents."""
+    xpath_node = curr_segments[0]  # type: Xpath
+
+    expr = xpath_node.value
+
+    if curr_elem is None:
+        raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
+    base_url = getattr(curr_elem, 'base_url', None)
+    log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
+
+    _backlink_str = f"string('{curr_elem.get('backlink')}')"
+    # We use the root tree's depth and not curr_depth because curr_depth accounts for a +1
+    # increment after each url*() hop
+    _depth_str = f"number({curr_elem.getroottree().getroot().get('depth')})"
+    expr = expr.replace('wx:backlink()', _backlink_str)
+    expr = expr.replace('wx:backlink(.)', _backlink_str)
+    expr = expr.replace('wx:depth()', _depth_str)
+    expr = expr.replace('wx:depth(.)', _depth_str)
+
+    elems = curr_elem.xpath3(expr)
+
+    next_segments = curr_segments[1:]
+    for elem in elems:
+        value_or_elem = WxStr(
+            elem, base_url=base_url,
+            depth=curr_depth
+        ) if isinstance(elem, str) else elem
+        if len(curr_segments) == 1:
+            yield DataIntent(value=value_or_elem)
+        else:
+            yield ProcessIntent(elem=value_or_elem, next_segments=next_segments)
 
 
-@_op(OPS.URL_EVAL)
+@register('//url', (ContextItem,))
+@register('//url', (Xpath,))
+@register('/url', (ContextItem,))
+@register('/url', (Xpath,))
+@register('url', (ContextItem,))
+@register('url', (Xpath,))
 def _handle_url_eval(curr_elem: html.HtmlElement | str,
-                     curr_segments: list[Segment],
+                     curr_segments: list[Url | Xpath],
                      curr_depth: int,
                      **kwargs) -> Iterable[Intent]:
-    op, value = curr_segments[0]
-
-    _path_exp = value.expr
-
-    if isinstance(curr_elem, str):
-        # TODO: IMO, ideally, wxpath grammar should not be checked/validated/enforced
-        # in ops.py. It should instead be validated in the parser.
-        if _path_exp not in {'.', 'self::node()'}:
-            raise ValueError("Only '.' or 'self::node()' is supported in url() segments "
-                             f"when prior xpath operation results in a string. Got: {_path_exp}")
+    """Resolve dynamic url() arguments and enqueue crawl intents.
+
+    Yields:
+        CrawlIntent
+    """
+    url_call = curr_segments[0]  # type: Url
 
+    if isinstance(url_call.args[0], ContextItem):
         urls = [urljoin(getattr(curr_elem, 'base_url', None) or '', curr_elem)]
     else:
+        _path_exp = url_call.args[0].value
         # TODO: If prior xpath operation is XPATH_FN_MAP_FRAG, then this will likely fail.
         # It should be handled in the parser.
         urls = get_absolute_links_from_elem_and_xpath(curr_elem, _path_exp)
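
The registry introduced above amounts to multiple dispatch on segment shape:
a handler keys on either a segment class or a (function name, argument-type
tuple) pair. A minimal self-contained sketch of the scheme, with toy stand-ins
for the parser's node classes:

    from typing import Callable

    OPS_REGISTER: dict[object, Callable] = {}

    def register(name_or_type, args_types=None):
        def _register(func):
            key = (name_or_type, args_types) if args_types else name_or_type
            if key in OPS_REGISTER:
                raise RuntimeError(f"handler for {key!r} already registered")
            OPS_REGISTER[key] = func
            return func
        return _register

    class String: ...   # stand-in for wxpath.core.parser.String
    class Xpath: ...    # stand-in for wxpath.core.parser.Xpath

    @register('url', (String,))
    def handle_url_literal(*args): ...

    @register(Xpath)
    def handle_xpath(*args): ...

    # Lookup mirrors get_operator(): a call node keys on (func, arg classes),
    # a bare segment keys on its own class.
    assert OPS_REGISTER[('url', (String,))] is handle_url_literal
    assert OPS_REGISTER[Xpath] is handle_xpath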
@@ -101,144 +164,115 @@ def _handle_url_eval(curr_elem: html.HtmlElement | str,
 
     next_segments = curr_segments[1:]
     for url in urls:
-        log.debug("queueing", extra={"depth": curr_depth, "op": op, "url": url})
+        # log.debug("queueing", extra={"depth": curr_depth, "op": op, "url": url})
         yield CrawlIntent(url=url, next_segments=next_segments)
 
 
-@_op(OPS.URL_INF)
+@register('///url', (Xpath,))
 def _handle_url_inf(curr_elem: html.HtmlElement,
-                    curr_segments: list[Segment],
+                    curr_segments: list[Url | Xpath],
                     curr_depth: int,
                     **kwargs) -> Iterable[CrawlIntent]:
+    """Handle the ``///url()`` segment of a wxpath expression.
+
+    This operation is also generated internally by the parser when a
+    ``///<xpath>/[/]url()`` segment is encountered.
+
+    Instead of fetching URLs directly, this operator XPaths the current
+    element for URLs and queues them for further processing via
+    ``_handle_url_inf_and_xpath``.
     """
-    Handles the ///url() segment of a wxpath expression. This operation is also
-    generated internally by the parser when a `///<xpath>/[/]url()` segment is
-    encountered by the parser.
-    This operation does not fetch URLs; instead, it XPaths the current element
-    for URLs, then queues them for further processing (see
-    _handle_url_inf_and_xpath).
-    """
-    op, value = curr_segments[0]
+    url_call = curr_segments[0]  # type: Url
 
-    _path_exp = value.expr
+    _path_exp = url_call.args[0].value
 
     urls = get_absolute_links_from_elem_and_xpath(curr_elem, _path_exp)
 
-    log.debug("found urls",
-              extra={"depth": curr_depth, "op": op, "url": getattr(curr_elem, 'base_url', None)})
-
     tail_segments = curr_segments[1:]
     for url in dict.fromkeys(urls):
         _segments = [
-            (OPS.URL_INF_AND_XPATH, UrlInfAndXpathValue('', url, _path_exp))
+            UrlCrawl('///url', [url_call.args[0], url])
         ] + tail_segments
 
-        log.debug("queueing", extra={"depth": curr_depth, "op": op, "url": url})
-
         yield CrawlIntent(url=url, next_segments=_segments)
 
 
-@_op(OPS.URL_INF_AND_XPATH)
+@register('///url', (Xpath, str))
 def _handle_url_inf_and_xpath(curr_elem: html.HtmlElement,
-                              curr_segments: list[Segment],
+                              curr_segments: list[Url | Xpath],
                               curr_depth: int, **kwargs) \
         -> Iterable[DataIntent | ProcessIntent | InfiniteCrawlIntent]:
+    """Handle infinite-crawl with an xpath extraction step.
+
+    This operation is generated internally by the parser; there is no explicit
+    wxpath expression that produces it directly.
+
+    Yields:
+        DataIntent: If the current element is not None and no next segments are provided.
+        ExtractIntent: If the current element is not None and next segments are provided.
+        InfiniteCrawlIntent: If the current element is not None and next segments are provided.
+
+    Raises:
+        ValueError: If the current element is None.
     """
-    This is an operation that is generated internally by the parser. There is
-    no explicit wxpath expression that generates this operation.
-    """
-    op, value = curr_segments[0]
+    url_call = curr_segments[0]
 
     try:
         if curr_elem is None:
             raise ValueError("Missing element when op is 'url_inf_and_xpath'.")
 
         next_segments = curr_segments[1:]
+
         if not next_segments:
             yield DataIntent(value=curr_elem)
         else:
             yield ExtractIntent(elem=curr_elem, next_segments=next_segments)
 
         # For url_inf, also re-enqueue for further infinite expansion
-        _segments = [(OPS.URL_INF, XpathValue('', value.expr))] + next_segments
+        _segments = [UrlCrawl('///url', url_call.args[:-1])] + next_segments
         crawl_intent = InfiniteCrawlIntent(elem=curr_elem, next_segments=_segments)
-        log.debug("queueing InfiniteCrawlIntent",
-                  extra={"depth": curr_depth, "op": op,
-                         "url": value.target, "crawl_intent": crawl_intent})
+
         yield crawl_intent
 
     except Exception:
-        log.exception("error fetching url",
-                      extra={"depth": curr_depth, "op": op, "url": value.target})
-
+        log.exception("error fetching url inf and xpath",
+                      extra={"depth": curr_depth, "url": url_call.args[1]})
 
-@_op(OPS.XPATH)
-def _handle_xpath(curr_elem: html.HtmlElement,
-                  curr_segments: list[Segment],
-                  curr_depth: int,
-                  **kwargs) -> Iterable[DataIntent | ProcessIntent]:
-    """
-    Handles the [/|//]<xpath> segment of a wxpath expression. This is a plain XPath expression.
-    Also handles wxpath-specific macro expansions like wx:backlink() or wx:depth().
-    """
-    _, value = curr_segments[0]
-    expr = value.expr
-    if curr_elem is None:
-        raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
-    base_url = getattr(curr_elem, 'base_url', None)
-    log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
-
-    _backlink_str = f"string('{curr_elem.get('backlink')}')"
-    # We use the root tree's depth and not curr_depth because curr_depth accounts for a +1
-    # increment after each url*() hop
-    _depth_str = f"number({curr_elem.getroottree().getroot().get('depth')})"
-    expr = expr.replace('wx:backlink()', _backlink_str)
-    expr = expr.replace('wx:backlink(.)', _backlink_str)
-    expr = expr.replace('wx:depth()', _depth_str)
-    expr = expr.replace('wx:depth(.)', _depth_str)
-
-    elems = curr_elem.xpath3(expr)
-
-    next_segments = curr_segments[1:]
-    for elem in elems:
-        value_or_elem = WxStr(
-            elem, base_url=base_url,
-            depth=curr_depth
-        ) if isinstance(elem, str) else elem
-        if len(curr_segments) == 1:
-            yield DataIntent(value=value_or_elem)
-        else:
-            yield ProcessIntent(elem=value_or_elem, next_segments=next_segments)
-
-
-@_op(OPS.XPATH_FN_MAP_FRAG)
-def _handle_xpath_fn_map_frag(curr_elem: html.HtmlElement | str,
-                              curr_segments: list[Segment],
+@register(Binary, (Xpath, Segments))
+def _handle_binary(curr_elem: html.HtmlElement | str,
+                   curr_segments: list[Url | Xpath] | Binary,
                   curr_depth: int,
                   **kwargs) -> Iterable[DataIntent | ProcessIntent]:
+    """Execute XPath expressions suffixed with the ``!`` (map) operator.
+
+    Yields:
+        ProcessIntent: Contains either a WxStr or an lxml or elementpath element.
     """
-    Handles the execution of XPath functions that were initially suffixed with a
-    '!' (map) operator.
-    """
-    _, value = curr_segments[0]
+    left = curr_segments.left
+    _ = curr_segments.op
+    right = curr_segments.right
+
+    if len(right) == 0:
+        # Binary operation on segments expects non-empty segments
+        raise ValueError("Binary operation on segments expects non-empty segments")
 
     base_url = getattr(curr_elem, 'base_url', None)
-    next_segments = curr_segments[1:]
+    next_segments = right
 
-    result = elementpath.select(
+    results = elementpath.select(
         curr_elem,
-        value.expr,
+        left.value,
         parser=XPath3Parser,
         item='' if curr_elem is None else None
     )
 
-    if isinstance(result, AnyAtomicType):
-        result = [result]
+    if isinstance(results, AnyAtomicType):
+        results = [results]
 
-    for r in result:
-        value_or_elem = WxStr(r, base_url=base_url, depth=curr_depth) if isinstance(r, str) else r
-        if len(curr_segments) == 1:
-            # XPATH_FN_MAP_FRAG is not a terminal operation
-            raise ValueError("XPATH_FN_MAP_FRAG is not a terminal operation")
+    for result in results:
+        if isinstance(result, str):
+            value_or_elem = WxStr(result, base_url=base_url, depth=curr_depth)
         else:
-            yield ProcessIntent(elem=value_or_elem, next_segments=next_segments)
+            value_or_elem = result
+
+        yield ProcessIntent(elem=value_or_elem, next_segments=next_segments)
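
For orientation, the map-operator handler above delegates evaluation to the
elementpath library the module already imports, then wraps string results in
WxStr. A hedged sketch of just the evaluation step, outside the engine (the
HTML snippet is invented; XPath 2.0+ allows a path to end in a function call):

    import elementpath
    from elementpath.xpath3 import XPath3Parser
    from lxml import html

    doc = html.fromstring("<ul><li>a</li><li>b</li></ul>")

    # Each li maps to its string value, much as a '!'-suffixed step would.
    results = elementpath.select(doc, "//li/string(.)", parser=XPath3Parser)
    print(results)   # ['a', 'b'] -- strings the handler would wrap in WxStr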