wxpath 0.4.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,60 @@
+from typing import Iterator
+
+from elementpath.xpath_tokens import XPathMap
+from langchain_core.document_loaders import BaseLoader
+from langchain_core.documents import Document
+
+import wxpath
+
+
+class WXPathLoader(BaseLoader):
+    """A LangChain loader for wxpath queries.
+
+    For more complex examples, see the examples directory.
+    Best practice would be to subclass the loader and override the _prep_doc method.
+    For example:
+    ```python
+    class MyWXPathLoader(WXPathLoader):
+        def _prep_doc(self, item: (XPathMap | dict)) -> Document:
+            # Custom processing here
+            return super()._prep_doc(item)
+    ```
+    """
+
+    def __init__(self, expression: str, max_depth: int = 1):
+        self.expression = expression
+        self.max_depth = max_depth
+
+    def _prep_doc(self, item: (XPathMap | dict)) -> Document:
+
+        if isinstance(item, dict):
+            content = item.pop("text", str(item))  # Fallback if no "text" key
+        else:
+            content = item._map.pop("text", str(item._map))  # Fallback if no "text" key
+            item = item._map
+
+        return Document(
+            page_content=content,
+            metadata=item  # Remaining keys go here (url, title, etc.)
+        )
+
+    def lazy_load(self) -> Iterator[Document]:
+        """
+        Lazy load documents from the wxpath query.
+        Each item yielded by wxpath becomes a LangChain Document.
+        """
+        # wxpath_async_blocking_iter allows iteration in sync environments
+        results = wxpath.wxpath_async_blocking_iter(
+            self.expression,
+            max_depth=self.max_depth
+        )
+
+        for item in results:
+            yield self._prep_doc(item)
+
+    async def alazy_load(self):
+        async for item in wxpath.wxpath_async(
+            self.expression,
+            max_depth=self.max_depth
+        ):
+            yield self._prep_doc(item)
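The hunk above adds a small LangChain integration. A usage sketch follows; the import path and the wxpath expression are illustrative, since the new module's filename and the exact query syntax are not shown in this diff:

```python
from langchain_core.documents import Document

# Hypothetical import path for the module added above; adjust to wherever
# WXPathLoader actually lives in the installed package.
from wxpath.integrations.langchain import WXPathLoader

# Hypothetical wxpath expression; consult the wxpath docs for real query syntax.
loader = WXPathLoader(expression="url('https://example.com')//article", max_depth=2)

# lazy_load() drives wxpath's blocking iterator and yields one Document per result.
for doc in loader.lazy_load():
    assert isinstance(doc, Document)
    print(doc.metadata.get("url"), len(doc.page_content))
```

In async code, `alazy_load()` does the same via `wxpath.wxpath_async`, so the loader can be consumed with `async for` without blocking the event loop.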
wxpath/patches.py CHANGED
@@ -1,7 +1,17 @@
+import urllib.parse
+
 import elementpath
+from elementpath import XPathContext, XPathFunction
 from elementpath.xpath3 import XPath3Parser
 from lxml import etree, html
 
+from wxpath.http.client import Response as Response
+from wxpath.util.cleaners import main_text_extractor
+from wxpath.util.common_paths import XPATH_PATH_TO_EXTERNAL_LINKS, XPATH_PATH_TO_INTERNAL_LINKS
+from wxpath.util.logging import get_logger
+
+log = get_logger(__name__)
+
 
 def html_element_repr(self):
     return (f"HtmlElement(tag={self.tag}, "
@@ -13,14 +23,18 @@ html.HtmlElement.__repr__ = html_element_repr
 
 
 class XPath3Element(etree.ElementBase):
-    def xpath3(self, expr, **kwargs):
+    def __init__(self, tag, attrib=None, nsmap=None, **extra):
+        super().__init__(tag, attrib, nsmap, **extra)
+        self.response = None  # type: Response | None
+
+    def xpath3(self, expr, request=None, **kwargs):
         """
         Evaluate an XPath 3 expression using elementpath library,
         returning the results as a list.
         """
-        kwargs.setdefault("parser", XPath3Parser)
+        kwargs.setdefault("parser", WXPathParser)
         kwargs.setdefault(
-            "uri",
+            "uri",
             getattr(self.getroottree().docinfo, "URL", None) or self.get("base_url")
         )
         return elementpath.select(self, expr, **kwargs)
@@ -51,7 +65,8 @@ class XPath3Element(etree.ElementBase):
     @depth.setter
     def depth(self, value):
         self.set("depth", str(value))
-
+
+
 # Create and register custom parser that returns XPath3Element instances
 lookup = etree.ElementDefaultClassLookup(element=XPath3Element)
 parser = etree.HTMLParser()
@@ -60,4 +75,199 @@ parser.set_element_class_lookup(lookup)
 
 # Expose parser for use in parse_html
 html_parser_with_xpath3 = parser
-html.HtmlElement.xpath3 = XPath3Element.xpath3
+html.HtmlElement.xpath3 = XPath3Element.xpath3
+
+# --- WXPATH functions ---
+WX_NAMESPACE = "http://wxpath.dev/ns"
+
+class WXPathParser(XPath3Parser):
+    """Custom parser that includes wxpath-specific functions."""
+    pass
+
+# 2. Register the namespace mapping globally on the parser class
+WXPathParser.DEFAULT_NAMESPACES['wx'] = WX_NAMESPACE
+
+# 2. Helper to register functions easily
+def register_wxpath_function(name, nargs=None, **kwargs):
+    """Registers a function token on the custom parser."""
+
+    # Define the token on the class (this registers the symbol)
+    # Check if this is a prefixed function (e.g. 'wx:depth')
+    if ':' in name:
+        prefix, local_name = name.split(':', 1)
+        kwargs['prefix'] = prefix
+        # kwargs['namespace'] = WX_NAMESPACE
+        name = local_name
+
+    # Register the token symbol
+    # WXPathParser.function(name, nargs=nargs, **kwargs)
+    # Register the token symbol and capture the created class
+    token_class = WXPathParser.function(name, nargs=nargs, **kwargs)
+    # Return a decorator to define the 'evaluate' method
+    def decorator(func):
+        # @WXPathParser.method(name)
+        # def evaluate(self, context=None):
+        #     # 'self' is the Token instance.
+        #     # 'self.get_argument(context, index)' evaluates arguments.
+        #     return func(self, context)
+        # return evaluate
+        token_class.evaluate = func
+        return func
+    return decorator
+
+
+class XPathContextRequired(Exception):
+    message = ('XPathContext is required. This usually arises when you call '
+               'the function without a preceding axes expression ("/")')
+    def __init__(self, *args):
+        super().__init__(self.message, *args)
+
+
+def _get_root(context: XPathContext):
+    if context is None:
+        raise XPathContextRequired
+
+    if not hasattr(context.item, 'elem'):
+        return context.item.parent.elem.getroottree().getroot()
+    return context.item.elem.getroottree().getroot()
+
+
+@register_wxpath_function('wx:depth', nargs=0)
+def wx_depth(_: XPathFunction, context: XPathContext):
+    if context is None:
+        raise XPathContextRequired
+
+    root = _get_root(context)
+
+    depth = root.get('depth')
+    return int(depth) if depth is not None else 0
+
+
+@register_wxpath_function('wx:backlink', nargs=0)
+def wx_backlink(_: XPathFunction, context: XPathContext):
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+    return item.get('backlink') or ''
+
+
+@register_wxpath_function('wx:current-url', nargs=0)
+def wx_current_url(_: XPathFunction, context: XPathContext):
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+    return item.base_url
+
+
+@register_wxpath_function('wx:elapsed', nargs=0)
+@register_wxpath_function('wx:fetch-time', nargs=0)
+def wx_fetch_time(_: XPathFunction, context: XPathContext):
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+    resp = item.response  # type: Response
+    return resp.latency
+
+
+# @register_wxpath_function('wx:status-code', nargs=0)
+@register_wxpath_function('wx:status-code', nargs=0)
+def wx_status_code(_: XPathFunction, context: XPathContext) -> int:
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+
+    resp = item.response  # type: Response
+    return resp.status
+
+
+@register_wxpath_function('wx:elem', nargs=0)
+def wx_elem(_: XPathFunction, context: XPathContext):
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+    return item
+
+
+def _get_root_domain(base_url: str) -> str:
+    parsed_url = urllib.parse.urlparse(base_url)
+
+    netloc = parsed_url.netloc
+    parts = netloc.split('.')
+    root_domain = netloc
+
+    if len(parts) > 2:
+        # Heuristic: If the last part is 2 chars (uk, au) and 2nd to last is < 4 (co, com, org)
+        # It's likely a compound TLD like co.uk. This isn't perfect but better than [-2:].
+        if len(parts[-1]) == 2 and len(parts[-2]) <= 3:
+            root_domain = ".".join(parts[-3:])  # grab bbc.co.uk
+        else:
+            # grab books.toscrape.com -> toscrape.com
+            root_domain = ".".join(parts[-2:])
+
+    return root_domain
+
+
+@register_wxpath_function('wx:internal-links', nargs=0)
+def wx_internal_links(_: XPathFunction, context: XPathContext):
+    """
+    Returns a list of internal links.
+    Allows for false positives.
+    """
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+
+    root_domain = _get_root_domain(item.base_url)
+    _path = XPATH_PATH_TO_INTERNAL_LINKS.format(root_domain)
+    return item.xpath3(_path)
+
+
+@register_wxpath_function('wx:external-links', nargs=0)
+def wx_external_links(_: XPathFunction, context: XPathContext):
+    """
+    Returns a list of external links.
+    """
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+
+    root_domain = _get_root_domain(item.base_url)
+    _path = XPATH_PATH_TO_EXTERNAL_LINKS.format(root_domain)
+    return item.xpath3(_path)
+
+
+@register_wxpath_function('wx:main-article-text', nargs=0)
+def wx_main_article_text(_: XPathFunction, context: XPathContext):
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+
+    try:
+        return main_text_extractor(item)
+    except Exception:
+        log.exception('Failed to extract main article text')
+        return ''
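Most of the new wx:* functions resolve the current element from the XPath context and delegate to existing wxpath helpers; they become callable from xpath3() expressions because WXPathParser maps the wx prefix to WX_NAMESPACE and is now the default parser. The one piece of standalone logic is the root-domain heuristic behind wx:internal-links and wx:external-links. A quick sketch of how that heuristic behaves, calling the private helper directly (illustrative only, not part of the public API):

```python
from wxpath.patches import _get_root_domain

# A two-letter final label preceded by a short label is treated as a compound TLD,
# so three parts are kept.
print(_get_root_domain("https://www.bbc.co.uk/news"))        # bbc.co.uk
# Otherwise only the last two parts (the registrable domain) are kept.
print(_get_root_domain("https://books.toscrape.com/index"))  # toscrape.com
```

As the code's own comments note, the heuristic tolerates false positives, so link classification on unusual TLDs may be imperfect.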
wxpath/settings.py CHANGED
@@ -54,10 +54,12 @@ SETTINGS = {
     'concurrency': 16,
     'per_host': 8,
     'timeout': 15,
+    'verify_ssl': True,
     'headers': {
         "User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
                        "AppleWebKit/537.36 (KHTML, like Gecko) "
-                       "Chrome/142.0.0.0 Safari/537.36")},
+                       "Chrome/142.0.0.0 Safari/537.36")
+    },
     'proxies': None,
     'auto_throttle_target_concurrency': None,
     'auto_throttle_start_delay': 0.25,
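The settings change adds a 'verify_ssl' flag next to the existing HTTP client options (the rest of the hunk only re-wraps the headers dict). A minimal sketch of toggling it, assuming SETTINGS is the mutable module-level dict named in the hunk header; the exact nesting of these keys inside SETTINGS is not visible here, so treat the lookup as illustrative:

```python
from wxpath import settings

# Assumption: 'verify_ssl' is reachable at this level of SETTINGS; adjust the
# path if the HTTP options live under a nested section.
settings.SETTINGS['verify_ssl'] = False  # disable TLS certificate verification
```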