wxpath-0.4.1-py3-none-any.whl → wxpath-0.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wxpath/__init__.py +2 -0
- wxpath/cli.py +6 -0
- wxpath/core/exceptions.py +53 -0
- wxpath/core/models.py +1 -0
- wxpath/core/ops.py +100 -19
- wxpath/core/parser.py +94 -24
- wxpath/core/runtime/engine.py +74 -10
- wxpath/core/runtime/helpers.py +6 -3
- wxpath/http/client/__init__.py +1 -1
- wxpath/http/client/crawler.py +17 -5
- wxpath/http/client/response.py +7 -1
- wxpath/http/policy/retry.py +2 -2
- wxpath/integrations/__init__.py +0 -0
- wxpath/integrations/langchain/__init__.py +0 -0
- wxpath/integrations/langchain/examples/basic_rag.py +85 -0
- wxpath/integrations/langchain/examples/rolling_window_rag.py +218 -0
- wxpath/integrations/langchain/loader.py +60 -0
- wxpath/patches.py +215 -5
- wxpath/settings.py +3 -1
- wxpath/tui.py +1225 -0
- wxpath/tui_settings.py +151 -0
- wxpath/util/cleaners.py +31 -0
- wxpath/util/common_paths.py +22 -0
- wxpath/util/logging.py +3 -7
- {wxpath-0.4.1.dist-info → wxpath-0.5.1.dist-info}/METADATA +73 -9
- wxpath-0.5.1.dist-info/RECORD +45 -0
- {wxpath-0.4.1.dist-info → wxpath-0.5.1.dist-info}/WHEEL +1 -1
- {wxpath-0.4.1.dist-info → wxpath-0.5.1.dist-info}/entry_points.txt +1 -0
- wxpath-0.4.1.dist-info/RECORD +0 -35
- {wxpath-0.4.1.dist-info → wxpath-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {wxpath-0.4.1.dist-info → wxpath-0.5.1.dist-info}/top_level.txt +0 -0
wxpath/integrations/langchain/loader.py
ADDED
@@ -0,0 +1,60 @@
+from typing import Iterator
+
+from elementpath.xpath_tokens import XPathMap
+from langchain_core.document_loaders import BaseLoader
+from langchain_core.documents import Document
+
+import wxpath
+
+
+class WXPathLoader(BaseLoader):
+    """A LangChain loader for wxpath queries.
+
+    For more complex examples, see the examples directory.
+    Best practice would be to subclass the loader and override the _prep_doc method.
+    For example:
+    ```python
+    class MyWXPathLoader(WXPathLoader):
+        def _prep_doc(self, item: (XPathMap | dict)) -> Document:
+            # Custom processing here
+            return super()._prep_doc(item)
+    ```
+    """
+
+    def __init__(self, expression: str, max_depth: int = 1):
+        self.expression = expression
+        self.max_depth = max_depth
+
+    def _prep_doc(self, item: (XPathMap | dict)) -> Document:
+
+        if isinstance(item, dict):
+            content = item.pop("text", str(item)) # Fallback if no "text" key
+        else:
+            content = item._map.pop("text", str(item._map)) # Fallback if no "text" key
+            item = item._map
+
+        return Document(
+            page_content=content,
+            metadata=item # Remaining keys go here (url, title, etc.)
+        )
+
+    def lazy_load(self) -> Iterator[Document]:
+        """
+        Lazy load documents from the wxpath query.
+        Each item yielded by wxpath becomes a LangChain Document.
+        """
+        # wxpath_async_blocking_iter allows iteration in sync environments
+        results = wxpath.wxpath_async_blocking_iter(
+            self.expression,
+            max_depth=self.max_depth
+        )
+
+        for item in results:
+            yield self._prep_doc(item)
+
+    async def alazy_load(self):
+        async for item in wxpath.wxpath_async(
+            self.expression,
+            max_depth=self.max_depth
+        ):
+            yield self._prep_doc(item)
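
Note: the new loader follows LangChain's standard BaseLoader protocol, so it can be driven with lazy_load()/load() like any other loader. A minimal usage sketch (the expression string below is a placeholder, not a documented wxpath query):

```python
# Minimal usage sketch of the new WXPathLoader; the expression string is a
# placeholder -- substitute a real wxpath expression.
from wxpath.integrations.langchain.loader import WXPathLoader

loader = WXPathLoader(expression="<wxpath expression here>", max_depth=2)

# Each wxpath result map becomes a Document: its "text" key supplies
# page_content, and the remaining keys (url, title, ...) land in metadata.
for doc in loader.lazy_load():
    print(doc.metadata.get("url"), len(doc.page_content))
```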
wxpath/patches.py
CHANGED
@@ -1,7 +1,17 @@
+import urllib.parse
+
 import elementpath
+from elementpath import XPathContext, XPathFunction
 from elementpath.xpath3 import XPath3Parser
 from lxml import etree, html
 
+from wxpath.http.client import Response as Response
+from wxpath.util.cleaners import main_text_extractor
+from wxpath.util.common_paths import XPATH_PATH_TO_EXTERNAL_LINKS, XPATH_PATH_TO_INTERNAL_LINKS
+from wxpath.util.logging import get_logger
+
+log = get_logger(__name__)
+
 
 def html_element_repr(self):
     return (f"HtmlElement(tag={self.tag}, "
@@ -13,14 +23,18 @@ html.HtmlElement.__repr__ = html_element_repr
 
 
 class XPath3Element(etree.ElementBase):
-    def
+    def __init__(self, tag, attrib=None, nsmap=None, **extra):
+        super().__init__(tag, attrib, nsmap, **extra)
+        self.response = None # type: Response | None
+
+    def xpath3(self, expr, request=None, **kwargs):
         """
         Evaluate an XPath 3 expression using elementpath library,
         returning the results as a list.
         """
-        kwargs.setdefault("parser",
+        kwargs.setdefault("parser", WXPathParser)
         kwargs.setdefault(
-            "uri",
+            "uri",
             getattr(self.getroottree().docinfo, "URL", None) or self.get("base_url")
         )
         return elementpath.select(self, expr, **kwargs)
@@ -51,7 +65,8 @@ class XPath3Element(etree.ElementBase):
     @depth.setter
     def depth(self, value):
         self.set("depth", str(value))
-
+
+
 # Create and register custom parser that returns XPath3Element instances
 lookup = etree.ElementDefaultClassLookup(element=XPath3Element)
 parser = etree.HTMLParser()
@@ -60,4 +75,199 @@ parser.set_element_class_lookup(lookup)
 
 # Expose parser for use in parse_html
 html_parser_with_xpath3 = parser
-html.HtmlElement.xpath3 = XPath3Element.xpath3
+html.HtmlElement.xpath3 = XPath3Element.xpath3
+
+# --- WXPATH functions ---
+WX_NAMESPACE = "http://wxpath.dev/ns"
+
+class WXPathParser(XPath3Parser):
+    """Custom parser that includes wxpath-specific functions."""
+    pass
+
+# 2. Register the namespace mapping globally on the parser class
+WXPathParser.DEFAULT_NAMESPACES['wx'] = WX_NAMESPACE
+
+# 2. Helper to register functions easily
+def register_wxpath_function(name, nargs=None, **kwargs):
+    """Registers a function token on the custom parser."""
+
+    # Define the token on the class (this registers the symbol)
+    # Check if this is a prefixed function (e.g. 'wx:depth')
+    if ':' in name:
+        prefix, local_name = name.split(':', 1)
+        kwargs['prefix'] = prefix
+        # kwargs['namespace'] = WX_NAMESPACE
+        name = local_name
+
+    # Register the token symbol
+    # WXPathParser.function(name, nargs=nargs, **kwargs)
+    # Register the token symbol and capture the created class
+    token_class = WXPathParser.function(name, nargs=nargs, **kwargs)
+    # Return a decorator to define the 'evaluate' method
+    def decorator(func):
+        # @WXPathParser.method(name)
+        # def evaluate(self, context=None):
+        #     # 'self' is the Token instance.
+        #     # 'self.get_argument(context, index)' evaluates arguments.
+        #     return func(self, context)
+        # return evaluate
+        token_class.evaluate = func
+        return func
+    return decorator
+
+
+class XPathContextRequired(Exception):
+    message = ('XPathContext is required. This usually arises when you call '
+               'the function without a preceding axes expression ("/")')
+    def __init__(self, *args):
+        super().__init__(self.message, *args)
+
+
+def _get_root(context: XPathContext):
+    if context is None:
+        raise XPathContextRequired
+
+    if not hasattr(context.item, 'elem'):
+        return context.item.parent.elem.getroottree().getroot()
+    return context.item.elem.getroottree().getroot()
+
+
+@register_wxpath_function('wx:depth', nargs=0)
+def wx_depth(_: XPathFunction, context: XPathContext):
+    if context is None:
+        raise XPathContextRequired
+
+    root = _get_root(context)
+
+    depth = root.get('depth')
+    return int(depth) if depth is not None else 0
+
+
+@register_wxpath_function('wx:backlink', nargs=0)
+def wx_backlink(_: XPathFunction, context: XPathContext):
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+    return item.get('backlink') or ''
+
+
+@register_wxpath_function('wx:current-url', nargs=0)
+def wx_current_url(_: XPathFunction, context: XPathContext):
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+    return item.base_url
+
+
+@register_wxpath_function('wx:elapsed', nargs=0)
+@register_wxpath_function('wx:fetch-time', nargs=0)
+def wx_fetch_time(_: XPathFunction, context: XPathContext):
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+    resp = item.response # type: Response
+    return resp.latency
+
+
+# @register_wxpath_function('wx:status-code', nargs=0)
+@register_wxpath_function('wx:status-code', nargs=0)
+def wx_status_code(_: XPathFunction, context: XPathContext) -> int:
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+
+    resp = item.response # type: Response
+    return resp.status
+
+
+@register_wxpath_function('wx:elem', nargs=0)
+def wx_elem(_: XPathFunction, context: XPathContext):
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+    return item
+
+
+def _get_root_domain(base_url: str) -> str:
+    parsed_url = urllib.parse.urlparse(base_url)
+
+    netloc = parsed_url.netloc
+    parts = netloc.split('.')
+    root_domain = netloc
+
+    if len(parts) > 2:
+        # Heuristic: If the last part is 2 chars (uk, au) and 2nd to last is < 4 (co, com, org)
+        # It's likely a compound TLD like co.uk. This isn't perfect but better than [-2:].
+        if len(parts[-1]) == 2 and len(parts[-2]) <= 3:
+            root_domain = ".".join(parts[-3:]) # grab bbc.co.uk
+        else:
+            # grab books.toscrape.com -> toscrape.com
+            root_domain = ".".join(parts[-2:])
+
+    return root_domain
+
+
+@register_wxpath_function('wx:internal-links', nargs=0)
+def wx_internal_links(_: XPathFunction, context: XPathContext):
+    """
+    Returns a list of internal links.
+    Allows for false positives.
+    """
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+
+    root_domain = _get_root_domain(item.base_url)
+    _path = XPATH_PATH_TO_INTERNAL_LINKS.format(root_domain)
+    return item.xpath3(_path)
+
+
+@register_wxpath_function('wx:external-links', nargs=0)
+def wx_external_links(_: XPathFunction, context: XPathContext):
+    """
+    Returns a list of external links.
+    """
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+
+    root_domain = _get_root_domain(item.base_url)
+    _path = XPATH_PATH_TO_EXTERNAL_LINKS.format(root_domain)
+    return item.xpath3(_path)
+
+
+@register_wxpath_function('wx:main-article-text', nargs=0)
+def wx_main_article_text(_: XPathFunction, context: XPathContext):
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+
+    try:
+        return main_text_extractor(item)
+    except Exception:
+        log.exception('Failed to extract main article text')
+        return ''
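
Note: register_wxpath_function is written as a general-purpose decorator, so the same pattern used for the wx:* functions above can register further functions. A hedged sketch (the 'wx:word-count' name and its body are illustrative, not part of the package):

```python
# Illustrative only: 'wx:word-count' is an invented function name, shown to
# demonstrate the register_wxpath_function decorator introduced above.
from elementpath import XPathContext, XPathFunction

from wxpath.patches import XPathContextRequired, register_wxpath_function


@register_wxpath_function('wx:word-count', nargs=0)
def wx_word_count(_: XPathFunction, context: XPathContext):
    if context is None:
        raise XPathContextRequired

    item = context.item.elem
    if item is None:
        return 0
    # Rough whitespace-delimited word count over the element's text content.
    return len(" ".join(item.itertext()).split())
```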
wxpath/settings.py
CHANGED
@@ -54,10 +54,12 @@ SETTINGS = {
     'concurrency': 16,
     'per_host': 8,
     'timeout': 15,
+    'verify_ssl': True,
     'headers': {
         "User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
                        "AppleWebKit/537.36 (KHTML, like Gecko) "
-                       "Chrome/142.0.0.0 Safari/537.36")
+                       "Chrome/142.0.0.0 Safari/537.36")
+    },
     'proxies': None,
     'auto_throttle_target_concurrency': None,
     'auto_throttle_start_delay': 0.25,