wxpath-0.1.0-py3-none-any.whl → wxpath-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wxpath/core/parser.py ADDED
@@ -0,0 +1,319 @@
+ """
+ This module contains two kinds of functions:
+
+ 1. functions for parsing wxpath expressions.
+ 2. functions for extracting information from wxpath expressions or subexpressions.
+
+ """
+ import re
+ from dataclasses import dataclass, fields
+ from typing import NamedTuple, Optional, TypeAlias
+
+ try:
+     from enum import StrEnum
+ except ImportError:
+     from enum import Enum
+
+     class StrEnum(str, Enum):
+         pass
+
+
+ @dataclass(frozen=True, slots=True)
+ class ValueBase:
+     _value: str
+
+
+ @dataclass(frozen=True, slots=True)
+ class UrlValue(ValueBase):
+     target: str
+     follow: str | None = None
+
+
+ @dataclass(frozen=True, slots=True)
+ class XpathValue(ValueBase):
+     expr: str
+
+
+ @dataclass(frozen=True, slots=True)
+ class UrlInfAndXpathValue(ValueBase):
+     target: str
+     expr: str
+
+
+ Value: TypeAlias = UrlValue | XpathValue | UrlInfAndXpathValue
+
+
+ class Segment(NamedTuple):
+     op: str
+     value: Value
+
+
+ class OPS(StrEnum):
+     URL_STR_LIT = "url_str_lit"
+     URL_EVAL = "url_eval"
+     URL_INF = "url_inf"
+     URL_INF_AND_XPATH = "url_inf_and_xpath"
+     XPATH = "xpath"
+     XPATH_FN_MAP_FRAG = "xpath_fn_map_frag"  # XPath function ending with map operator '!'
+     INF_XPATH = "inf_xpath"  # Experimental
+     OBJECT = "object"  # Deprecated
+     URL_FROM_ATTR = "url_from_attr"  # Deprecated
+     URL_OPR_AND_ARG = "url_opr_and_arg"  # Deprecated
+
+
+ def _scan_path_expr(path_expr: str) -> list[str]:
+     """
+     Given a wxpath expression, produce a list of all xpath and url() partitions.
+
+     :param path_expr: The wxpath expression to partition.
+     """
+     # remove newlines
+     path_expr = path_expr.replace('\n', '')
+     partitions = []  # type: list[str]
+     i = 0
+     n = len(path_expr)
+     while i < n:
+         # Detect ///url(, //url(, /url(, or url(
+         match = re.match(r'/{0,3}url\(', path_expr[i:])
+         if match:
+             seg_start = i
+             i += match.end()  # Move past the matched "url("
+             paren_depth = 1
+             while i < n and paren_depth > 0:
+                 if path_expr[i] == '(':
+                     paren_depth += 1
+                 elif path_expr[i] == ')':
+                     paren_depth -= 1
+                 i += 1
+             partitions.append(path_expr[seg_start:i])
+         else:
+             # Grab until the next /url(
+             next_url = re.search(r'/{0,3}url\(', path_expr[i:])
+             next_pos = next_url.start() + i if next_url else n
+             if i != next_pos:
+                 partitions.append(path_expr[i:next_pos])
+             i = next_pos
+
+     return partitions
+
+
+ def parse_wxpath_expr(path_expr):
+     partitions = _scan_path_expr(path_expr)
+
+     # Lex and parse
+     segments = []  # type: list[Segment]
+     for s in partitions:
+         s = s.strip()
+         if not s:
+             continue
+         if s.startswith('url("') or s.startswith("url('"):
+             segments.append(
+                 Segment(
+                     OPS.URL_STR_LIT,
+                     UrlValue(s, *parse_url_value(_extract_arg_from_url_op(s))),
+                 )
+             )
+         elif s.startswith('///url('):
+             segments.append(
+                 Segment(
+                     OPS.URL_INF,
+                     # XpathValue(extract_url_op_arg(s))
+                     XpathValue(_value=s, expr=_extract_arg_from_url_xpath_op(s))
+                 )
+             )
+         elif s.startswith('/url("') or s.startswith('//url("'):
+             raise ValueError("url() segment cannot have string literal "
+                              f"argument and preceding navigation slashes (/|//): {s}")
+         elif s.startswith("/url('") or s.startswith("//url('"):
+             raise ValueError("url() segment cannot have string literal "
+                              f"argument and preceding navigation slashes (/|//): {s}")
+         elif s.startswith('/url(') or s.startswith("//url("):
+             segments.append(Segment(OPS.URL_EVAL, XpathValue(s, _extract_arg_from_url_xpath_op(s))))
+         elif s.startswith('url('):
+             segments.append(Segment(OPS.URL_EVAL, XpathValue(s, _extract_arg_from_url_xpath_op(s))))
+         elif s.startswith('///'):
+             raise ValueError(f"xpath segment cannot have preceding triple slashes: {s}")
+             # segments.append(Segment(OPS.INF_XPATH, XpathValue(s, "//" + s[3:])))
+         elif s.endswith('!'):
+             segments.append(Segment(OPS.XPATH_FN_MAP_FRAG, XpathValue(s, s[:-1])))
+         else:
+             segments.append(Segment(OPS.XPATH, XpathValue(s, s)))
+
+     ## EXPERIMENTAL
+     ## Disabled for now
+     ## Collapses an inf_xpath segment and the succeeding url_eval segment into a single url_inf segment
+     # for i in range(len(segments) - 1, 0, -1):
+     #     if segments[i - 1][0] == OPS.INF_XPATH and segments[i][0] == OPS.URL_EVAL:
+     #         inf_xpath_value = segments[i - 1][1]
+     #         url_eval_value = segments[i][1]
+     #         url_eval_traversal_fragment = url_eval_value._value.split('url')[0]
+     #         segments[i - 1] = Segment(
+     #             OPS.URL_INF,
+     #             XpathValue(
+     #                 _value='',
+     #                 expr=(f'{inf_xpath_value.expr}'
+     #                       f'{url_eval_traversal_fragment}'
+     #                       f'{url_eval_value.expr}')
+     #             )
+     #         )
+     #         segments.pop(i)
+
+     #### RAISE ERRORS FROM INVALID SEGMENTS ####
+     # Raises if multiple ///url() are present
+     if len([op for op, val in segments if op == OPS.URL_INF]) > 1:
+         raise ValueError("Only one ///url() is allowed")
+
+     # Raises if multiple url() with string literals are present
+     if len([op for op, _ in segments if op == OPS.URL_STR_LIT]) > 1:
+         raise ValueError("Only one url() with string literal argument is allowed")
+
+     # Raises when expr starts with [//]url(<xpath>)
+     if segments and segments[0][0] == OPS.URL_EVAL:
+         raise ValueError("Path expr cannot start with [//]url(<xpath>)")
+
+     # Raises if expr ends with INF_XPATH
+     if segments and segments[-1][0] == OPS.INF_XPATH:
+         raise ValueError("Path expr cannot end with ///<xpath>")
+
+     # Raises if expr ends with XPATH_FN_MAP_FRAG
+     if segments and segments[-1][0] == OPS.XPATH_FN_MAP_FRAG:
+         raise ValueError("Path expr cannot end with !")
+     return segments
+
+
+ def parse_url_value(src: str) -> tuple[str, Optional[str]]:
+     """
+     Parse the contents of url(...).
+
+     Examples of src:
+         "'https://example.com'"
+         "//a/@href"
+         "'https://x', follow=//a/@href"
+     """
+
+     parts = _split_top_level_commas(src)
+
+     if not parts:
+         raise SyntaxError("url() requires at least one argument")
+
+     # ---- positional argument (target) ----
+     target_src = parts[0].strip()
+     if not target_src:
+         raise SyntaxError("url() target cannot be empty")
+
+     target = _parse_url_target(target_src)
+
+     follow = None
+
+     # ---- keyword arguments ----
+     for part in parts[1:]:
+         name, value = _split_kwarg(part)
+
+         if name == "follow":
+             if follow is not None:
+                 raise SyntaxError("duplicate follow= in url()")
+             follow = value.strip()
+         else:
+             raise SyntaxError(f"unknown url() argument: {name}")
+
+     return target, follow
+
+
+ def extract_url_op_arg(url_op_and_arg: str) -> str:
+     url_op_arg = _extract_arg_from_url_xpath_op(url_op_and_arg)
+     if url_op_arg.startswith('@'):
+         return ".//" + url_op_arg
+     elif url_op_arg.startswith('.'):
+         return url_op_arg
+     elif url_op_arg.startswith('//'):
+         return '.' + url_op_arg
+     elif not url_op_arg.startswith('.//'):
+         return './/' + url_op_arg
+     else:
+         return url_op_arg
+
+
+ def _extract_arg_from_url_xpath_op(url_subsegment):
+     match = re.search(r"url\((.+)\)", url_subsegment)
+     if not match:
+         raise ValueError(f"Invalid url() segment: {url_subsegment}")
+     return match.group(1).strip("'\"")  # Remove surrounding quotes if any
+
+
+ def _extract_arg_from_url_op(url_subsegment):
+     match = re.search(r"url\((.+)\)", url_subsegment)
+     if not match:
+         raise ValueError(f"Invalid url() segment: {url_subsegment}")
+     return match.group(1)  # Keep surrounding quotes; _parse_url_target() strips them later
+
+
+ def _split_top_level_commas(src: str) -> list[str]:
+     parts = []
+     buf = []
+     depth = 0
+     in_string = False
+     quote = None
+
+     for ch in src:
+         if in_string:
+             buf.append(ch)
+             if ch == quote:
+                 in_string = False
+             continue
+
+         if ch in ("'", '"'):
+             in_string = True
+             quote = ch
+             buf.append(ch)
+             continue
+
+         if ch in "([{":
+             depth += 1
+         elif ch in ")]}":
+             depth -= 1
+             if depth < 0:
+                 raise SyntaxError("unbalanced parentheses in url()")
+
+         if ch == "," and depth == 0:
+             parts.append("".join(buf).strip())
+             buf.clear()
+         else:
+             buf.append(ch)
+
+     if in_string or depth != 0:
+         raise SyntaxError("unbalanced expression in url()")
+
+     if buf:
+         parts.append("".join(buf).strip())
+
+     return parts
+
+
+ def _split_kwarg(src: str) -> tuple[str, str]:
+     if "=" not in src:
+         raise SyntaxError(f"expected keyword argument, got: {src}")
+
+     name, value = src.split("=", 1)
+     name = name.strip()
+     value = value.strip()
+
+     if not name or not value:
+         raise SyntaxError(f"invalid keyword argument: {src}")
+
+     return name, value
+
+
+ def _parse_url_target(src: str):
+     src = src.strip()
+     # string literal
+     if (src.startswith("'") and src.endswith("'")) or \
+        (src.startswith('"') and src.endswith('"')):
+         return src[1:-1]
+
+     return src
+
+
+ def _get_shallow_dict(instance: Value):
+     return {field.name: getattr(instance, field.name)
+             for field in fields(instance) if field.name not in {'_value'}}
+
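
As a quick illustration of the parsing rules above, here is a minimal sketch derived from this file's code (not from the package's own documentation); the attribute access mirrors the `Segment`, `UrlValue`, and `XpathValue` definitions, and the URL is a placeholder:

```python
from wxpath.core.parser import OPS, parse_wxpath_expr

# One url() seed with a string-literal target, followed by a plain XPath step.
segments = parse_wxpath_expr('url("https://example.com")//a/@href')

assert segments[0].op == OPS.URL_STR_LIT
assert segments[0].value.target == "https://example.com"
assert segments[0].value.follow is None
assert segments[1].op == OPS.XPATH
assert segments[1].value.expr == "//a/@href"
```
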
@@ -0,0 +1,5 @@
+ from wxpath.core.runtime.engine import WXPathEngine
+
+ __all__ = [
+     'WXPathEngine',
+ ]
@@ -0,0 +1,315 @@
+ import asyncio
+ import contextlib
+ import inspect
+ from collections import deque
+ from typing import Any, AsyncGenerator
+
+ from lxml.html import HtmlElement
+
+ from wxpath import patches  # noqa: F401
+ from wxpath.core.models import (
+     CrawlIntent,
+     CrawlTask,
+     DataIntent,
+     ExtractIntent,
+     InfiniteCrawlIntent,
+     ProcessIntent,
+ )
+ from wxpath.core.ops import get_operator
+ from wxpath.core.parser import Segment, parse_wxpath_expr
+ from wxpath.core.runtime.helpers import parse_html
+ from wxpath.hooks.registry import FetchContext, get_hooks
+ from wxpath.http.client.crawler import Crawler
+ from wxpath.http.client.request import Request
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+
+ class HookedEngineBase:
+     async def post_fetch_hooks(self, body, task):
+         for hook in get_hooks():
+             hook_method = getattr(hook, "post_fetch", lambda _, b: b)
+             if inspect.iscoroutinefunction(hook_method):
+                 body = await hook_method(
+                     FetchContext(task.url, task.backlink, task.depth, task.segments),
+                     body
+                 )
+             else:
+                 body = hook_method(
+                     FetchContext(task.url, task.backlink, task.depth, task.segments),
+                     body
+                 )
+             if not body:
+                 log.debug(f"hook {type(hook).__name__} dropped {task.url}")
+                 break
+         return body
+
+     async def post_parse_hooks(self, elem, task):
+         for hook in get_hooks():
+             hook_method = getattr(hook, "post_parse", lambda _, e: e)
+             if inspect.iscoroutinefunction(hook_method):
+                 elem = await hook_method(
+                     FetchContext(
+                         url=task.url,
+                         backlink=task.backlink,
+                         depth=task.depth,
+                         segments=task.segments
+                     ),
+                     elem,
+                 )
+             else:
+                 elem = hook_method(
+                     FetchContext(
+                         url=task.url,
+                         backlink=task.backlink,
+                         depth=task.depth,
+                         segments=task.segments
+                     ),
+                     elem,
+                 )
+             if elem is None:
+                 log.debug(f"hook {type(hook).__name__} dropped {task.url}")
+                 break
+         return elem
+
+     async def post_extract_hooks(self, value):
+         for hook in get_hooks():
+             hook_method = getattr(hook, "post_extract", lambda v: v)
+             if inspect.iscoroutinefunction(hook_method):
+                 value = await hook_method(value)
+             else:
+                 value = hook_method(value)
+             if value is None:
+                 log.debug(f"hook {type(hook).__name__} dropped value")
+                 break
+         return value
+
+
+ class WXPathEngine(HookedEngineBase):
+     """
+     Main class for executing wxpath expressions.
+
+     The engine builds a queue of CrawlTasks, which is crawled and processed
+     FIFO. The queue (and therefore the web graph) is traversed concurrently
+     and in BFS-ish order.
+
+     Args:
+         crawler: Crawler instance
+         concurrency: number of concurrent fetches at the Crawler (request engine) level
+         per_host: number of concurrent fetches per host
+     """
+     def __init__(
+         self,
+         crawler: Crawler | None = None,
+         concurrency: int = 16,
+         per_host: int = 8
+     ):
+         self.seen_urls: set[str] = set()
+         self.crawler = crawler or Crawler(concurrency=concurrency, per_host=per_host)
+
+     async def run(self, expression: str, max_depth: int):
+         segments = parse_wxpath_expr(expression)
+
+         queue: asyncio.Queue[CrawlTask] = asyncio.Queue()
+         inflight: dict[str, CrawlTask] = {}
+         pending_tasks = 0
+
+         def is_terminal():
+             return queue.empty() and pending_tasks <= 0
+
+         async with self.crawler as crawler:
+             async def submitter():
+                 nonlocal pending_tasks
+                 while True:
+                     task = await queue.get()
+
+                     if task is None:
+                         break
+
+                     if task.url in self.seen_urls or task.url in inflight:
+                         queue.task_done()
+                         continue
+
+                     # Mark URL as seen immediately
+                     self.seen_urls.add(task.url)
+                     inflight[task.url] = task
+
+                     pending_tasks += 1
+                     crawler.submit(Request(task.url, max_retries=0))
+                     queue.task_done()
+
+             submit_task = asyncio.create_task(submitter())
+
+             # Seed the pipeline with a dummy task
+             seed_task = CrawlTask(
+                 elem=None,
+                 url=None,
+                 segments=segments,
+                 depth=-1,
+                 backlink=None,
+             )
+             async for output in self._process_pipeline(
+                 task=seed_task,
+                 elem=None,
+                 depth=seed_task.depth,
+                 max_depth=max_depth,
+                 queue=queue,
+             ):
+                 yield await self.post_extract_hooks(output)
+
+             # When looping over asynchronous generators, terminal conditions
+             # MUST be checked before re-iterating.
+             async for resp in crawler:
+                 task = inflight.pop(resp.request.url, None)
+                 pending_tasks -= 1
+
+                 if task is None:
+                     log.warning(f"Got unexpected response from {resp.request.url}")
+                     if is_terminal():
+                         break
+                     continue
+
+                 if resp.error:
+                     log.warning(f"Got error from {resp.request.url}: {resp.error}")
+                     if is_terminal():
+                         break
+                     continue
+
+                 # NOTE: Consider allowing redirects
+                 if resp.status != 200 or not resp.body:
+                     log.warning(f"Got non-200 response from {resp.request.url}")
+                     if is_terminal():
+                         break
+                     continue
+
+                 body = await self.post_fetch_hooks(resp.body, task)
+                 if not body:
+                     if is_terminal():
+                         break
+                     continue
+
+                 elem = parse_html(
+                     body,
+                     base_url=task.url,
+                     backlink=task.backlink,
+                     depth=task.depth,
+                 )
+
+                 elem = await self.post_parse_hooks(elem, task)
+                 if elem is None:
+                     if is_terminal():
+                         break
+                     continue
+
+                 if task.segments:
+                     async for output in self._process_pipeline(
+                         task=task,
+                         elem=elem,
+                         depth=task.depth,
+                         max_depth=max_depth,
+                         queue=queue,
+                     ):
+                         yield await self.post_extract_hooks(output)
+                 else:
+                     yield await self.post_extract_hooks(elem)
+
+                 # Termination condition
+                 if is_terminal():
+                     break
+
+             submit_task.cancel()
+             with contextlib.suppress(asyncio.CancelledError):
+                 await submit_task
+
+     async def _process_pipeline(
+         self,
+         task: CrawlTask,
+         elem,
+         depth: int,
+         max_depth: int,
+         queue: asyncio.Queue[CrawlTask],
+     ):
+         mini_queue: deque[tuple[HtmlElement, list[Segment]]] = deque([(elem, task.segments)])
+
+         while mini_queue:
+             elem, segments = mini_queue.popleft()
+
+             op, _ = segments[0]
+             operator = get_operator(op)
+
+             intents = operator(elem, segments, depth)
+
+             if not intents:
+                 return
+
+             for intent in intents:
+                 if isinstance(intent, DataIntent):
+                     yield intent.value
+
+                 elif isinstance(intent, CrawlIntent):
+                     next_depth = task.depth + 1
+                     # if intent.url not in self.seen_urls and next_depth <= max_depth:
+                     if next_depth <= max_depth:
+                         # self.seen_urls.add(intent.url)
+                         queue.put_nowait(
+                             CrawlTask(
+                                 elem=None,
+                                 url=intent.url,
+                                 segments=intent.next_segments,
+                                 depth=next_depth,
+                                 backlink=task.url,
+                             )
+                         )
+
+                 elif isinstance(intent, (ExtractIntent, ProcessIntent, InfiniteCrawlIntent)):
+                     # immediately traverse the extraction
+                     elem = intent.elem
+                     next_segments = intent.next_segments
+                     mini_queue.append((elem, next_segments))
+
+
+ def wxpath_async(path_expr: str,
+                  max_depth: int,
+                  engine: WXPathEngine | None = None) -> AsyncGenerator[Any, None]:
+     if engine is None:
+         engine = WXPathEngine()
+     return engine.run(path_expr, max_depth)
+
+
+ ##### ASYNC IN SYNC #####
+ def wxpath_async_blocking_iter(path_expr, max_depth=1, engine: WXPathEngine | None = None):
+     """
+     Evaluate a wxpath expression using concurrent breadth-first traversal.
+
+     Args:
+         path_expr (str): A wxpath expression.
+         max_depth (int, optional): Maximum crawl depth. Must be at least the
+             number of `url*` segments minus one. Defaults to `1`.
+         engine (WXPathEngine, optional): Engine instance to use. A default
+             engine is created if omitted.
+
+     Yields:
+         lxml.html.HtmlElement | wxpath.models.WxStr | dict | Any: The same objects
+         produced by the sequential evaluator.
+
+     Warning:
+         Spins up its own event loop; therefore this function must **not** be
+         invoked from within an active asyncio event loop.
+     """
+     loop = asyncio.new_event_loop()
+     asyncio.set_event_loop(loop)
+     agen = wxpath_async(path_expr, max_depth=max_depth, engine=engine)
+
+     try:
+         while True:
+             try:
+                 yield loop.run_until_complete(agen.__anext__())
+             except StopAsyncIteration:
+                 break
+     finally:
+         loop.run_until_complete(loop.shutdown_asyncgens())
+         loop.close()
+
+
+ def wxpath_async_blocking(path_expr, max_depth=1, engine: WXPathEngine = None):
+     return list(wxpath_async_blocking_iter(path_expr, max_depth=max_depth, engine=engine))
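
For orientation, here is a minimal blocking usage sketch based on the functions defined above. The import path is an assumption inferred from the earlier `from wxpath.core.runtime.engine import WXPathEngine` hunk, and the expression and URL are placeholders:

```python
from wxpath.core.runtime.engine import WXPathEngine, wxpath_async_blocking_iter

# Placeholder expression: a url() seed plus an XPath step over each fetched page.
expr = 'url("https://example.com")//a/@href'

engine = WXPathEngine(concurrency=8, per_host=4)

# Spins up its own event loop, so it must be called from synchronous code.
for value in wxpath_async_blocking_iter(expr, max_depth=1, engine=engine):
    print(value)
```
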
@@ -0,0 +1,48 @@
+ import requests
+ from lxml import etree, html
+
+ from wxpath import patches
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+
+ def parse_html(content, base_url=None, **elem_kv_pairs) -> html.HtmlElement:
+     elem = etree.HTML(content, parser=patches.html_parser_with_xpath3, base_url=base_url)
+     if base_url:
+         elem.getroottree().docinfo.URL = base_url  # make base-uri() work
+         # Also set xml:base on the root element for XPath base-uri()
+         elem.set("{http://www.w3.org/XML/1998/namespace}base", base_url)
+         elem.base_url = base_url  # sets both attribute and doc-level URL
+
+     # NOTE: some pages may have multiple root elements, i.e.
+     # len(elem.itersiblings()) > 0 AND elem.getparent() is None.
+     # This breaks elementpath. If elem has siblings, recreate the
+     # root element and only the root element.
+     if len(list(elem.itersiblings())) > 0:
+         elem = detach_html_root(elem, base_url)
+
+     for k, v in elem_kv_pairs.items():
+         elem.set(k, str(v))
+     return elem
+
+
+ def detach_html_root(elem, base_url=None):
+     new_root = etree.HTML(
+         etree.tostring(elem, encoding="utf-8"),
+         parser=patches.html_parser_with_xpath3,
+         base_url=base_url
+     )
+
+     if base_url:
+         new_root.getroottree().docinfo.URL = base_url
+         new_root.set("{http://www.w3.org/XML/1998/namespace}base", base_url)
+         new_root.base_url = base_url
+
+     return new_root
+
+
+ def fetch_html(url):
+     response = requests.get(url, timeout=10)
+     response.raise_for_status()
+     return response.content
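
A short sketch of how parse_html is used, with the module path assumed from the engine's `from wxpath.core.runtime.helpers import parse_html`; extra keyword pairs are stamped onto the root element as string attributes, which is how the engine records backlink and depth:

```python
from wxpath.core.runtime.helpers import parse_html

body = b"<html><body><a href='/next'>next</a></body></html>"

# base_url feeds base-uri(); backlink/depth become attributes on the root element.
elem = parse_html(body, base_url="https://example.com/start",
                  backlink="https://example.com", depth=1)

print(elem.get("depth"))     # "1"
print(elem.get("backlink"))  # "https://example.com"
```
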
@@ -0,0 +1,9 @@
+ from wxpath.hooks.builtin import JSONLWriter as JSONLWriter
+ from wxpath.hooks.builtin import SerializeXPathMapAndNodeHook as SerializeXPathMapAndNodeHook
+ from wxpath.hooks.registry import register as register
+
+ __all__ = [
+     "JSONLWriter",
+     "SerializeXPathMapAndNodeHook",
+     "register",
+ ]
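
Finally, a sketch of a user-defined hook, pieced together from the hook call sites in the engine hunk above. The exact signature of register() is an assumption (a hook instance is passed), and only post_extract is implemented because the engine treats hook methods as optional:

```python
from wxpath.hooks import register


class DropEmptyStrings:
    # Hook methods are optional; the engine falls back to identity functions
    # for post_fetch / post_parse / post_extract that are not defined.
    def post_extract(self, value):
        if isinstance(value, str) and not value.strip():
            return None  # a None result skips remaining hooks and is logged as dropped
        return value


# Assumption: register() accepts a hook instance.
register(DropEmptyStrings())
```
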