wxpath 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,85 @@
1
+
2
+ # pip install langchain langchain-ollama langchain-chroma chromadb
3
+ from langchain_chroma import Chroma
4
+ from langchain_core.output_parsers import StrOutputParser
5
+ from langchain_core.prompts import ChatPromptTemplate
6
+ from langchain_core.runnables import RunnablePassthrough
7
+ from langchain_ollama import ChatOllama, OllamaEmbeddings
8
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
9
+
10
+ from wxpath.integrations.langchain.loader import WXPathLoader
11
+
12
+ # ------------------------------------------------------------------
13
+ # STEP 1: Load & Embed (Same as before)
14
+ # ------------------------------------------------------------------
15
+ print("🕷️ Crawling with wxpath...")
16
+ loader = WXPathLoader(
17
+ expression="""
18
+ url('https://docs.python.org/3/library/argparse.html',
19
+ follow=//a/@href[contains(., 'argparse')])
20
+ /map{
21
+ 'text': string-join(//div[@role='main']//text()),
22
+ 'source': string(base-uri(.))
23
+ }
24
+ """,
25
+ max_depth=1
26
+ )
27
+ docs = loader.load()
28
+
29
+ print("🔪 Splitting and Embedding...")
30
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
31
+ splits = text_splitter.split_documents(docs)
32
+
33
+ vectorstore = Chroma.from_documents(
34
+ documents=splits,
35
+ # Must use a model that supports embeddings (`ollama pull nomic-embed-text`)
36
+ embedding=OllamaEmbeddings(model="nomic-embed-text"),
37
+ collection_name="wxpath"
38
+ )
39
+ retriever = vectorstore.as_retriever()
40
+
41
+ # ------------------------------------------------------------------
42
+ # STEP 2: Define Components
43
+ # ------------------------------------------------------------------
44
+
45
+ # A helper to join retrieved documents into a single string
46
+ def format_docs(docs):
47
+ return "\n\n".join(doc.page_content for doc in docs)
48
+
49
+ # The Prompt (Standard RAG template)
50
+ template = """You are an assistant for question-answering tasks.
51
+ Use the following pieces of retrieved context to answer the question.
52
+ If you don't know the answer, just say that you don't know.
53
+ Use three sentences maximum and keep the answer concise.
54
+
55
+ Context: {context}
56
+
57
+ Question: {question}
58
+
59
+ Answer:"""
60
+ prompt = ChatPromptTemplate.from_template(template)
61
+
62
+ # The Model
63
+ llm = ChatOllama(model="gemma3")
64
+
65
+ # ------------------------------------------------------------------
66
+ # STEP 3: Build the Chain with LCEL
67
+ # ------------------------------------------------------------------
68
+ # The pipe operator (|) passes output from one component to the next.
69
+ rag_chain = (
70
+ {"context": retriever | format_docs, "question": RunnablePassthrough()}
71
+ | prompt
72
+ | llm
73
+ | StrOutputParser()
74
+ )
75
+
76
+ # ------------------------------------------------------------------
77
+ # STEP 4: Invoke
78
+ # ------------------------------------------------------------------
79
+ query = "How do I add arguments in argparse?"
80
+ print(f"\n❓ Question: {query}")
81
+
82
+ # The chain returns a string directly because of StrOutputParser
83
+ response = rag_chain.invoke(query)
84
+
85
+ print(f"\n🤖 Ollama Answer:\n{response}")
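The example above relies on the new `WXPathLoader` (added later in this diff) to turn each crawled page into a LangChain `Document`. As a rough sketch of the shape it yields, assuming the `map{...}` keys used above ('text' and 'source') and the `_prep_doc` behaviour shown in the loader file: the 'text' entry becomes `page_content` and every remaining key lands in `metadata`.

```python
# Illustrative only (not part of the diff): mirrors WXPathLoader._prep_doc,
# which pops the 'text' key into page_content and keeps the rest as metadata.
from langchain_core.documents import Document

item = {
    "text": "argparse -- Parser for command-line options, arguments and subcommands ...",
    "source": "https://docs.python.org/3/library/argparse.html",
}
doc = Document(page_content=item.pop("text"), metadata=item)
print(doc.metadata["source"])  # the page's base-uri, usable for citations
```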
@@ -0,0 +1,218 @@
1
+ """
2
+ Rolling Window RAG Example
3
+
4
+ This example demonstrates how to use a rolling window of news articles as context.
5
+
6
+ More importantly, it demonstrates complex string cleanup, metadata extraction, and other
7
+ real-world challenges of building a RAG application.
8
+
9
+ This script assumes you have gemma3 installed and that your machine is capable of running a
10
+ model with a 32k token context window.
11
+ """
12
+ import asyncio
13
+ import datetime
14
+ import threading
15
+ from collections import deque
16
+ from operator import itemgetter
17
+ from typing import List
18
+
19
+ from langchain_core.callbacks import CallbackManagerForRetrieverRun
20
+ from langchain_core.documents import Document
21
+ from langchain_core.retrievers import BaseRetriever
22
+
23
+ from wxpath import wxpath_async
24
+
25
+ # If you have the cache dependency installed, you can enable it:
26
+ # wxpath.settings.CACHE_SETTINGS.enabled = True
27
+
28
+ # ------------------------------------------------------------------
29
+ # 1. The Rolling Buffer (The "Context Window")
30
+ # ------------------------------------------------------------------
31
+ class RollingNewsBuffer(BaseRetriever):
32
+ capacity: int = 100
33
+
34
+ # Define as PrivateAttrs so Pydantic ignores them for validation
35
+ _buffer: deque
36
+ _seen_urls: set
37
+ _lock: threading.Lock
38
+
39
+ def __init__(self, **kwargs):
40
+ super().__init__(**kwargs)
41
+ self._buffer = deque(maxlen=self.capacity)
42
+ self._seen_urls = set()
43
+ self._lock = threading.Lock()
44
+
45
+ def add_document(self, doc: Document):
46
+ """Thread-safe add with url cleanup on eviction."""
47
+ with self._lock:
48
+ # Check if we are about to evict an item (buffer full)
49
+ if len(self._buffer) == self._buffer.maxlen:
50
+ # We must manually find what is being removed to clean up seen_urls
51
+ # Note: deque[0] is the one about to be popped when appending
52
+ oldest_doc = self._buffer[0]
53
+ oldest_url = oldest_doc.metadata.get("url")
54
+ if oldest_url in self._seen_urls:
55
+ self._seen_urls.remove(oldest_url)
56
+
57
+ self._buffer.append(doc)
58
+ self._seen_urls.add(doc.metadata["url"])
59
+
60
+ def is_seen(self, url: str) -> bool:
61
+ """Thread-safe check."""
62
+ with self._lock:
63
+ return url in self._seen_urls
64
+
65
+ def _get_relevant_documents(
66
+ self, query: str, *, run_manager: CallbackManagerForRetrieverRun = None
67
+ ) -> List[Document]:
68
+ """
69
+ Thread-safe read.
70
+ """
71
+ with self._lock:
72
+ # Create a snapshot list while locked to prevent iteration crash
73
+ snapshot = list(self._buffer)
74
+
75
+ print(f"📰 Context Retrieval: Returning {len(snapshot)} docs for query: {query}")
76
+ return snapshot
77
+
78
+ # ------------------------------------------------------------------
79
+ # 2. The Background Crawler (The Producer)
80
+ # ------------------------------------------------------------------
81
+ async def continuous_crawl(buffer: RollingNewsBuffer):
82
+ """
83
+ Constantly crawls Newsweek and feeds the buffer.
84
+ """
85
+ print("🕷️ Crawler started...")
86
+
87
+ # Example Expression: deep crawl of newsweek
88
+ expression = """
89
+ url('https://www.newsweek.com/')
90
+ ///url(
91
+ //a/@href[starts-with(., '/') or starts-with(., './') or contains(., 'newsweek.com')]
92
+ )
93
+ /map{
94
+ 'title': //h1/text()[1] ! string(.),
95
+ 'text': string-join(//article//p/text()),
96
+ 'url': string(base-uri(.)),
97
+ 'pubDate': //meta[@name='article:modified_time']/@content[1] ! string(.)
98
+ }
99
+ """
100
+
101
+ # Infinite loop to restart crawl if it finishes, or run continuously
102
+ while True:
103
+ try:
104
+ # We use the async generator to stream results as they are found
105
+ async for item in wxpath_async(expression, max_depth=1):
106
+ item = item._map
107
+ url = item.get('url')
108
+ # Check seen status safely before doing processing work
109
+ if not url or buffer.is_seen(url):
110
+ continue
111
+
112
+ # Convert wxpath dict to LangChain Document
113
+ text_content = item.get('text', '')
114
+ # Basic cleaning (optional)
115
+ if isinstance(text_content, list):
116
+ text_content = " ".join(text_content)
117
+
118
+ if not text_content:
119
+ continue
120
+
121
+ title = item.get('title')
122
+ if not title:
123
+ title = ''
124
+
125
+ if isinstance(title, list):
126
+ title = " ".join(title)
127
+
128
+ pub_date = item.get('pubDate')
129
+ if not pub_date:
130
+ pub_date = str(datetime.date.today())
131
+
132
+ text_content = ("Title: " + title +
133
+ "\nPublished: " + pub_date + "\n" +
134
+ text_content)
135
+
136
+ doc = Document(
137
+ page_content=text_content,
138
+ metadata={"title": item.get('title'),
139
+ "url": item.get('url'),
140
+ "pubDate": item.get('pubDate')}
141
+ )
142
+
143
+ # PUSH TO BUFFER (Oldest gets evicted automatically if full)
144
+ buffer.add_document(doc)
145
+ print(f"📰 Added: {title[:30]}... (Buffer size: {len(buffer._buffer)})")
146
+ print(f"\tArticle text: {doc.page_content[:100]}...")
147
+ print()
148
+ # Rate limit slightly to be polite
149
+ await asyncio.sleep(60)
150
+
151
+ except Exception as e:
152
+ print(f"⚠️ Crawler error: {e}. Restarting in 10s...")
153
+ await asyncio.sleep(10)
154
+
155
+
156
+ def debug_print_prompt(prompt_value):
157
+ print("\n" + "="*40)
158
+ print("📢 FULL PROMPT SENT TO LLM:")
159
+ print("="*40)
160
+ print(prompt_value.to_string()) # This prints the exact text
161
+ print("="*40 + "\n")
162
+ return prompt_value
163
+
164
+ if __name__ == "__main__":
165
+ # Initialize the Rolling Buffer
166
+ retriever = RollingNewsBuffer(capacity=100)
167
+
168
+ # Start Crawler in a background thread so it doesn't block the Chat
169
+ def start_background_loop(loop):
170
+ asyncio.set_event_loop(loop)
171
+ loop.run_until_complete(continuous_crawl(retriever))
172
+
173
+ crawler_loop = asyncio.new_event_loop()
174
+ t = threading.Thread(target=start_background_loop, args=(crawler_loop,), daemon=True)
175
+ t.start()
176
+
177
+ import time
178
+
179
+ from langchain_core.prompts import ChatPromptTemplate
180
+ from langchain_ollama import ChatOllama
181
+
182
+ # Setup standard RAG chain
183
+ llm = ChatOllama(model="gemma3", num_ctx=32768)
184
+ prompt = ChatPromptTemplate.from_template(
185
+ "Answer based ONLY on the following news:\n\n{context}\n\nQuestion: {question}\n\n"
186
+ "DO NOT include generic Newsweek-administrative articles like 'Corrections', "
187
+ "'Company Info', 'Subscribe', Opinions', 'Press Releases', 'Editorials', etc. in your "
188
+ "analysis or answers. Answer the question using the non-Newsweek-related news provided. "
189
+ "You will be penalized for including old or undated news in your answer. If asked for "
190
+ "overviews or summaries, split news items into paragraphs and provide a summary of each "
191
+ "news item."
192
+ )
193
+
194
+ def format_docs(docs):
195
+ slice_of_news = "\n\n".join([d.page_content[:1000] for d in docs]) # Truncate for demo
196
+ print(f"📰 Latest news char length: {len(slice_of_news)}")
197
+ return slice_of_news
198
+
199
+ chain = (
200
+ {
201
+ # FIX: Use itemgetter so retriever gets a string, not a dict
202
+ "context": itemgetter("question") | retriever | format_docs,
203
+ "question": itemgetter("question")
204
+ }
205
+ | prompt
206
+ | debug_print_prompt
207
+ | llm
208
+ )
209
+
210
+ # Simulate querying constantly while buffer fills in background
211
+ print("⏳ Waiting for crawler to gather some data...")
212
+ time.sleep(10)
213
+
214
+ while True:
215
+ query = input("Ask a question about current news (or Ctrl+C to quit): ")
216
+ print(f"\nQuery: {query}\nThinking... 🤔")
217
+ response = chain.invoke({"question": query})
218
+ print(response.content)
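A quick, illustrative sanity check of the eviction logic used by `RollingNewsBuffer` above: with a bounded `deque`, appending to a full buffer silently drops `deque[0]`, which is why `add_document` prunes the seen-URL set before appending.

```python
# Stand-alone sketch of the buffer's bookkeeping (names are illustrative).
from collections import deque

buf, seen = deque(maxlen=3), set()

def add(url: str) -> None:
    if len(buf) == buf.maxlen:   # about to evict the oldest entry
        seen.discard(buf[0])     # keep the dedupe set in sync with the buffer
    buf.append(url)
    seen.add(url)

for u in ("a", "b", "c", "d"):
    add(u)

print(list(buf))      # ['b', 'c', 'd']
print(sorted(seen))   # ['b', 'c', 'd'] -- 'a' was pruned when it was evicted
```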
@@ -0,0 +1,60 @@
1
+ from typing import Iterator
2
+
3
+ from elementpath.xpath_tokens import XPathMap
4
+ from langchain_core.document_loaders import BaseLoader
5
+ from langchain_core.documents import Document
6
+
7
+ import wxpath
8
+
9
+
10
+ class WXPathLoader(BaseLoader):
11
+ """A LangChain loader for wxpath queries.
12
+
13
+ For more complex examples, see the examples directory.
14
+ Best practice would be to subclass the loader and override the _prep_doc method.
15
+ For example:
16
+ ```python
17
+ class MyWXPathLoader(WXPathLoader):
18
+ def _prep_doc(self, item: (XPathMap | dict)) -> Document:
19
+ # Custom processing here
20
+ return super()._prep_doc(item)
21
+ ```
22
+ """
23
+
24
+ def __init__(self, expression: str, max_depth: int = 1):
25
+ self.expression = expression
26
+ self.max_depth = max_depth
27
+
28
+ def _prep_doc(self, item: (XPathMap | dict)) -> Document:
29
+
30
+ if isinstance(item, dict):
31
+ content = item.pop("text", str(item)) # Fallback if no "text" key
32
+ else:
33
+ content = item._map.pop("text", str(item._map)) # Fallback if no "text" key
34
+ item = item._map
35
+
36
+ return Document(
37
+ page_content=content,
38
+ metadata=item # Remaining keys go here (url, title, etc.)
39
+ )
40
+
41
+ def lazy_load(self) -> Iterator[Document]:
42
+ """
43
+ Lazy load documents from the wxpath query.
44
+ Each item yielded by wxpath becomes a LangChain Document.
45
+ """
46
+ # wxpath_async_blocking_iter allows iteration in sync environments
47
+ results = wxpath.wxpath_async_blocking_iter(
48
+ self.expression,
49
+ max_depth=self.max_depth
50
+ )
51
+
52
+ for item in results:
53
+ yield self._prep_doc(item)
54
+
55
+ async def alazy_load(self):
56
+ async for item in wxpath.wxpath_async(
57
+ self.expression,
58
+ max_depth=self.max_depth
59
+ ):
60
+ yield self._prep_doc(item)
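As the docstring suggests, downstream users are expected to subclass the loader and override `_prep_doc`. A minimal sketch of that pattern, assuming the import path used in the first example (`wxpath.integrations.langchain.loader`); the extra metadata field is purely illustrative:

```python
import datetime

from langchain_core.documents import Document
from wxpath.integrations.langchain.loader import WXPathLoader


class TimestampedWXPathLoader(WXPathLoader):
    """Illustrative subclass: trims whitespace and stamps each Document."""

    def _prep_doc(self, item) -> Document:
        doc = super()._prep_doc(item)
        doc.page_content = str(doc.page_content).strip()
        doc.metadata["loaded_at"] = datetime.datetime.now(datetime.timezone.utc).isoformat()
        return doc
```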
wxpath/patches.py CHANGED
@@ -1,7 +1,17 @@
1
+ import urllib.parse
2
+
1
3
  import elementpath
4
+ from elementpath import XPathContext, XPathFunction
2
5
  from elementpath.xpath3 import XPath3Parser
3
6
  from lxml import etree, html
4
7
 
8
+ from wxpath.http.client import Response as Response
9
+ from wxpath.util.cleaners import main_text_extractor
10
+ from wxpath.util.common_paths import XPATH_PATH_TO_EXTERNAL_LINKS, XPATH_PATH_TO_INTERNAL_LINKS
11
+ from wxpath.util.logging import get_logger
12
+
13
+ log = get_logger(__name__)
14
+
5
15
 
6
16
  def html_element_repr(self):
7
17
  return (f"HtmlElement(tag={self.tag}, "
@@ -13,14 +23,18 @@ html.HtmlElement.__repr__ = html_element_repr
13
23
 
14
24
 
15
25
  class XPath3Element(etree.ElementBase):
16
- def xpath3(self, expr, **kwargs):
26
+ def __init__(self, tag, attrib=None, nsmap=None, **extra):
27
+ super().__init__(tag, attrib, nsmap, **extra)
28
+ self.response = None # type: Response | None
29
+
30
+ def xpath3(self, expr, request=None, **kwargs):
17
31
  """
18
32
  Evaluate an XPath 3 expression using elementpath library,
19
33
  returning the results as a list.
20
34
  """
21
- kwargs.setdefault("parser", XPath3Parser)
35
+ kwargs.setdefault("parser", WXPathParser)
22
36
  kwargs.setdefault(
23
- "uri",
37
+ "uri",
24
38
  getattr(self.getroottree().docinfo, "URL", None) or self.get("base_url")
25
39
  )
26
40
  return elementpath.select(self, expr, **kwargs)
@@ -51,7 +65,8 @@ class XPath3Element(etree.ElementBase):
51
65
  @depth.setter
52
66
  def depth(self, value):
53
67
  self.set("depth", str(value))
54
-
68
+
69
+
55
70
  # Create and register custom parser that returns XPath3Element instances
56
71
  lookup = etree.ElementDefaultClassLookup(element=XPath3Element)
57
72
  parser = etree.HTMLParser()
@@ -60,4 +75,199 @@ parser.set_element_class_lookup(lookup)
60
75
 
61
76
  # Expose parser for use in parse_html
62
77
  html_parser_with_xpath3 = parser
63
- html.HtmlElement.xpath3 = XPath3Element.xpath3
78
+ html.HtmlElement.xpath3 = XPath3Element.xpath3
79
+
80
+ # --- WXPATH functions ---
81
+ WX_NAMESPACE = "http://wxpath.dev/ns"
82
+
83
+ class WXPathParser(XPath3Parser):
84
+ """Custom parser that includes wxpath-specific functions."""
85
+ pass
86
+
87
+ # 1. Register the namespace mapping globally on the parser class
88
+ WXPathParser.DEFAULT_NAMESPACES['wx'] = WX_NAMESPACE
89
+
90
+ # 2. Helper to register functions easily
91
+ def register_wxpath_function(name, nargs=None, **kwargs):
92
+ """Registers a function token on the custom parser."""
93
+
94
+ # Define the token on the class (this registers the symbol)
95
+ # Check if this is a prefixed function (e.g. 'wx:depth')
96
+ if ':' in name:
97
+ prefix, local_name = name.split(':', 1)
98
+ kwargs['prefix'] = prefix
99
+ # kwargs['namespace'] = WX_NAMESPACE
100
+ name = local_name
101
+
102
+ # Register the token symbol
103
+ # WXPathParser.function(name, nargs=nargs, **kwargs)
104
+ # Register the token symbol and capture the created class
105
+ token_class = WXPathParser.function(name, nargs=nargs, **kwargs)
106
+ # Return a decorator to define the 'evaluate' method
107
+ def decorator(func):
108
+ # @WXPathParser.method(name)
109
+ # def evaluate(self, context=None):
110
+ # # 'self' is the Token instance.
111
+ # # 'self.get_argument(context, index)' evaluates arguments.
112
+ # return func(self, context)
113
+ # return evaluate
114
+ token_class.evaluate = func
115
+ return func
116
+ return decorator
117
+
118
+
119
+ class XPathContextRequired(Exception):
120
+ message = ('XPathContext is required. This usually arises when you call '
121
+ 'the function without a preceding axis expression ("/")')
122
+ def __init__(self, *args):
123
+ super().__init__(self.message, *args)
124
+
125
+
126
+ def _get_root(context: XPathContext):
127
+ if context is None:
128
+ raise XPathContextRequired
129
+
130
+ if not hasattr(context.item, 'elem'):
131
+ return context.item.parent.elem.getroottree().getroot()
132
+ return context.item.elem.getroottree().getroot()
133
+
134
+
135
+ @register_wxpath_function('wx:depth', nargs=0)
136
+ def wx_depth(_: XPathFunction, context: XPathContext):
137
+ if context is None:
138
+ raise XPathContextRequired
139
+
140
+ root = _get_root(context)
141
+
142
+ depth = root.get('depth')
143
+ return int(depth) if depth is not None else 0
144
+
145
+
146
+ @register_wxpath_function('wx:backlink', nargs=0)
147
+ def wx_backlink(_: XPathFunction, context: XPathContext):
148
+ if context is None:
149
+ raise XPathContextRequired
150
+
151
+ item = context.item.elem
152
+ if item is None:
153
+ return ''
154
+ return item.get('backlink') or ''
155
+
156
+
157
+ @register_wxpath_function('wx:current-url', nargs=0)
158
+ def wx_current_url(_: XPathFunction, context: XPathContext):
159
+ if context is None:
160
+ raise XPathContextRequired
161
+
162
+ item = context.item.elem
163
+ if item is None:
164
+ return ''
165
+ return item.base_url
166
+
167
+
168
+ @register_wxpath_function('wx:elapsed', nargs=0)
169
+ @register_wxpath_function('wx:fetch-time', nargs=0)
170
+ def wx_fetch_time(_: XPathFunction, context: XPathContext):
171
+ if context is None:
172
+ raise XPathContextRequired
173
+
174
+ item = context.item.elem
175
+ if item is None:
176
+ return ''
177
+ resp = item.response # type: Response
178
+ return resp.latency
179
+
180
+
182
+ @register_wxpath_function('wx:status-code', nargs=0)
183
+ def wx_status_code(_: XPathFunction, context: XPathContext) -> int:
184
+ if context is None:
185
+ raise XPathContextRequired
186
+
187
+ item = context.item.elem
188
+ if item is None:
189
+ return ''
190
+
191
+ resp = item.response # type: Response
192
+ return resp.status
193
+
194
+
195
+ @register_wxpath_function('wx:elem', nargs=0)
196
+ def wx_elem(_: XPathFunction, context: XPathContext):
197
+ if context is None:
198
+ raise XPathContextRequired
199
+
200
+ item = context.item.elem
201
+ if item is None:
202
+ return ''
203
+ return item
204
+
205
+
206
+ def _get_root_domain(base_url: str) -> str:
207
+ parsed_url = urllib.parse.urlparse(base_url)
208
+
209
+ netloc = parsed_url.netloc
210
+ parts = netloc.split('.')
211
+ root_domain = netloc
212
+
213
+ if len(parts) > 2:
214
+ # Heuristic: If the last part is 2 chars (uk, au) and 2nd to last is < 4 (co, com, org)
215
+ # It's likely a compound TLD like co.uk. This isn't perfect but better than [-2:].
216
+ if len(parts[-1]) == 2 and len(parts[-2]) <= 3:
217
+ root_domain = ".".join(parts[-3:]) # grab bbc.co.uk
218
+ else:
219
+ # grab books.toscrape.com -> toscrape.com
220
+ root_domain = ".".join(parts[-2:])
221
+
222
+ return root_domain
223
+
224
+
225
+ @register_wxpath_function('wx:internal-links', nargs=0)
226
+ def wx_internal_links(_: XPathFunction, context: XPathContext):
227
+ """
228
+ Returns a list of internal links.
229
+ Allows for false positives.
230
+ """
231
+ if context is None:
232
+ raise XPathContextRequired
233
+
234
+ item = context.item.elem
235
+ if item is None:
236
+ return ''
237
+
238
+ root_domain = _get_root_domain(item.base_url)
239
+ _path = XPATH_PATH_TO_INTERNAL_LINKS.format(root_domain)
240
+ return item.xpath3(_path)
241
+
242
+
243
+ @register_wxpath_function('wx:external-links', nargs=0)
244
+ def wx_external_links(_: XPathFunction, context: XPathContext):
245
+ """
246
+ Returns a list of external links.
247
+ """
248
+ if context is None:
249
+ raise XPathContextRequired
250
+
251
+ item = context.item.elem
252
+ if item is None:
253
+ return ''
254
+
255
+ root_domain = _get_root_domain(item.base_url)
256
+ _path = XPATH_PATH_TO_EXTERNAL_LINKS.format(root_domain)
257
+ return item.xpath3(_path)
258
+
259
+
260
+ @register_wxpath_function('wx:main-article-text', nargs=0)
261
+ def wx_main_article_text(_: XPathFunction, context: XPathContext):
262
+ if context is None:
263
+ raise XPathContextRequired
264
+
265
+ item = context.item.elem
266
+ if item is None:
267
+ return ''
268
+
269
+ try:
270
+ return main_text_extractor(item)
271
+ except Exception:
272
+ log.exception('Failed to extract main article text')
273
+ return ''
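The functions registered above become available under the `wx:` prefix wherever `WXPathParser` is used. A hedged sketch of how they might be called from an expression (the target URL and map keys are illustrative, and the exact output depends on the crawled page):

```python
import wxpath

# Illustrative expression; wx:* calls need a context node, hence the path step.
expression = """
url('https://example.com/')
/map{
  'url': wx:current-url(),
  'status': wx:status-code(),
  'depth': wx:depth(),
  'text': wx:main-article-text()
}
"""

# wxpath_async_blocking_iter is the sync-friendly iterator used by WXPathLoader.
for item in wxpath.wxpath_async_blocking_iter(expression, max_depth=1):
    print(item)
```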
wxpath/settings.py CHANGED
@@ -54,10 +54,12 @@ SETTINGS = {
54
54
  'concurrency': 16,
55
55
  'per_host': 8,
56
56
  'timeout': 15,
57
+ 'verify_ssl': True,
57
58
  'headers': {
58
59
  "User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
59
60
  "AppleWebKit/537.36 (KHTML, like Gecko) "
60
- "Chrome/142.0.0.0 Safari/537.36")},
61
+ "Chrome/142.0.0.0 Safari/537.36")
62
+ },
61
63
  'proxies': None,
62
64
  'auto_throttle_target_concurrency': None,
63
65
  'auto_throttle_start_delay': 0.25,
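The new `verify_ssl` option sits alongside the existing crawler settings. A sketch of overriding it at runtime, assuming `SETTINGS` is read when a crawl starts and following the same `wxpath.settings` access pattern as the `CACHE_SETTINGS` comment in the rolling-window example:

```python
import wxpath

# Assumed usage: adjust crawler settings before the first crawl is issued.
wxpath.settings.SETTINGS["verify_ssl"] = False  # e.g. a staging host with a self-signed cert
wxpath.settings.SETTINGS["timeout"] = 30
```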