wxpath 0.4.0-py3-none-any.whl → 0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wxpath/__init__.py +2 -0
- wxpath/cli.py +6 -0
- wxpath/core/models.py +1 -0
- wxpath/core/ops.py +9 -12
- wxpath/core/parser.py +92 -23
- wxpath/core/runtime/engine.py +79 -8
- wxpath/core/runtime/helpers.py +6 -3
- wxpath/http/client/__init__.py +1 -1
- wxpath/http/client/crawler.py +19 -7
- wxpath/http/client/request.py +1 -1
- wxpath/http/client/response.py +7 -1
- wxpath/http/policy/retry.py +2 -2
- wxpath/integrations/__init__.py +0 -0
- wxpath/integrations/langchain/__init__.py +0 -0
- wxpath/integrations/langchain/examples/basic_rag.py +85 -0
- wxpath/integrations/langchain/examples/rolling_window_rag.py +218 -0
- wxpath/integrations/langchain/loader.py +60 -0
- wxpath/patches.py +215 -5
- wxpath/settings.py +3 -1
- wxpath/tui.py +1204 -0
- wxpath/tui_settings.py +151 -0
- wxpath/util/cleaners.py +31 -0
- wxpath/util/common_paths.py +22 -0
- wxpath/util/logging.py +3 -7
- {wxpath-0.4.0.dist-info → wxpath-0.5.0.dist-info}/METADATA +123 -19
- wxpath-0.5.0.dist-info/RECORD +44 -0
- {wxpath-0.4.0.dist-info → wxpath-0.5.0.dist-info}/WHEEL +1 -1
- {wxpath-0.4.0.dist-info → wxpath-0.5.0.dist-info}/entry_points.txt +1 -0
- wxpath-0.4.0.dist-info/RECORD +0 -35
- {wxpath-0.4.0.dist-info → wxpath-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {wxpath-0.4.0.dist-info → wxpath-0.5.0.dist-info}/top_level.txt +0 -0
wxpath/integrations/langchain/examples/basic_rag.py
ADDED
@@ -0,0 +1,85 @@
+
+# pip install langchain langchain-ollama langchain-chroma chromadb
+from langchain_chroma import Chroma
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+from langchain_ollama import ChatOllama, OllamaEmbeddings
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+from wxpath.integrations.langchain.loader import WXPathLoader
+
+# ------------------------------------------------------------------
+# STEP 1: Load & Embed (Same as before)
+# ------------------------------------------------------------------
+print("🕷️ Crawling with wxpath...")
+loader = WXPathLoader(
+    expression="""
+        url('https://docs.python.org/3/library/argparse.html',
+            follow=//a/@href[contains(., 'argparse')])
+        /map{
+            'text': string-join(//div[@role='main']//text()),
+            'source': string(base-uri(.))
+        }
+    """,
+    max_depth=1
+)
+docs = loader.load()
+
+print("🔪 Splitting and Embedding...")
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+splits = text_splitter.split_documents(docs)
+
+vectorstore = Chroma.from_documents(
+    documents=splits,
+    # Must use a model that supports embeddings (`ollama pull nomic-embed-text`)
+    embedding=OllamaEmbeddings(model="nomic-embed-text"),
+    collection_name="wxpath"
+)
+retriever = vectorstore.as_retriever()
+
+# ------------------------------------------------------------------
+# STEP 2: Define Components
+# ------------------------------------------------------------------
+
+# A helper to join retrieved documents into a single string
+def format_docs(docs):
+    return "\n\n".join(doc.page_content for doc in docs)
+
+# The Prompt (Standard RAG template)
+template = """You are an assistant for question-answering tasks.
+Use the following pieces of retrieved context to answer the question.
+If you don't know the answer, just say that you don't know.
+Use three sentences maximum and keep the answer concise.
+
+Context: {context}
+
+Question: {question}
+
+Answer:"""
+prompt = ChatPromptTemplate.from_template(template)
+
+# The Model
+llm = ChatOllama(model="gemma3")
+
+# ------------------------------------------------------------------
+# STEP 3: Build the Chain with LCEL
+# ------------------------------------------------------------------
+# The pipe operator (|) passes output from one component to the next.
+rag_chain = (
+    {"context": retriever | format_docs, "question": RunnablePassthrough()}
+    | prompt
+    | llm
+    | StrOutputParser()
+)
+
+# ------------------------------------------------------------------
+# STEP 4: Invoke
+# ------------------------------------------------------------------
+query = "How do I add arguments in argparse?"
+print(f"\n❓ Question: {query}")
+
+# The chain returns a string directly because of StrOutputParser
+response = rag_chain.invoke(query)
+
+print(f"\n🤖 Ollama Answer:\n{response}")
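Because the chain ends in StrOutputParser, `invoke` returns a plain string. A minimal variation, not part of the package but standard LCEL (every runnable chain exposes `.stream()`), that prints the answer token-by-token instead of blocking:

```python
# Hypothetical usage sketch: stream the answer as it is generated.
# With StrOutputParser at the end of the chain, chunks are text fragments.
for chunk in rag_chain.stream("When should I use argparse subparsers?"):
    print(chunk, end="", flush=True)
print()
```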
wxpath/integrations/langchain/examples/rolling_window_rag.py
ADDED
@@ -0,0 +1,218 @@
+"""
+Rolling Window RAG Example
+
+This example demonstrates how to use a rolling window of news articles as context.
+
+More importantly, it demonstrates complex string cleanup, metadata extraction, and other
+real-world challenges of building a RAG application.
+
+This script assumes you have gemma3 installed and your machine is capable of running a 32k
+token model.
+"""
+import asyncio
+import datetime
+import threading
+from collections import deque
+from operator import itemgetter
+from typing import List
+
+from langchain_core.callbacks import CallbackManagerForRetrieverRun
+from langchain_core.documents import Document
+from langchain_core.retrievers import BaseRetriever
+
+from wxpath import wxpath_async
+
+# If you have the cache dependency installed, you can enable it:
+# wxpath.settings.CACHE_SETTINGS.enabled = True
+
+# ------------------------------------------------------------------
+# 1. The Rolling Buffer (The "Context Window")
+# ------------------------------------------------------------------
+class RollingNewsBuffer(BaseRetriever):
+    capacity: int = 100
+
+    # Define as PrivateAttrs so Pydantic ignores them for validation
+    _buffer: deque
+    _seen_urls: set
+    _lock: threading.Lock
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self._buffer = deque(maxlen=self.capacity)
+        self._seen_urls = set()
+        self._lock = threading.Lock()
+
+    def add_document(self, doc: Document):
+        """Thread-safe add with url cleanup on eviction."""
+        with self._lock:
+            # Check if we are about to evict an item (buffer full)
+            if len(self._buffer) == self._buffer.maxlen:
+                # We must manually find what is being removed to clean up seen_urls
+                # Note: deque[0] is the one about to be popped when appending
+                oldest_doc = self._buffer[0]
+                oldest_url = oldest_doc.metadata.get("url")
+                if oldest_url in self._seen_urls:
+                    self._seen_urls.remove(oldest_url)
+
+            self._buffer.append(doc)
+            self._seen_urls.add(doc.metadata["url"])
+
+    def is_seen(self, url: str) -> bool:
+        """Thread-safe check."""
+        with self._lock:
+            return url in self._seen_urls
+
+    def _get_relevant_documents(
+        self, query: str, *, run_manager: CallbackManagerForRetrieverRun = None
+    ) -> List[Document]:
+        """
+        Thread-safe read.
+        """
+        with self._lock:
+            # Create a snapshot list while locked to prevent iteration crash
+            snapshot = list(self._buffer)
+
+        print(f"📰 Context Retrieval: Returning {len(snapshot)} docs for query: {query}")
+        return snapshot
+
+# ------------------------------------------------------------------
+# 2. The Background Crawler (The Producer)
+# ------------------------------------------------------------------
+async def continuous_crawl(buffer: RollingNewsBuffer):
+    """
+    Constantly crawls Newsweek and feeds the buffer.
+    """
+    print("🕷️ Crawler started...")
+
+    # Example Expression: deep crawl of newsweek
+    expression = """
+        url('https://www.newsweek.com/')
+        ///url(
+            //a/@href[starts-with(., '/') or starts-with(., './') or contains(., 'newsweek.com')]
+        )
+        /map{
+            'title': //h1/text()[1] ! string(.),
+            'text': string-join(//article//p/text()),
+            'url': string(base-uri(.)),
+            'pubDate': //meta[@name='article:modified_time']/@content[1] ! string(.)
+        }
+    """
+
+    # Infinite loop to restart crawl if it finishes, or run continuously
+    while True:
+        try:
+            # We use the async generator to stream results as they are found
+            async for item in wxpath_async(expression, max_depth=1):
+                item = item._map
+                url = item.get('url')
+                # Check seen status safely before doing processing work
+                if not url or buffer.is_seen(url):
+                    continue
+
+                # Convert wxpath dict to LangChain Document
+                text_content = item.get('text', '')
+                # Basic cleaning (optional)
+                if isinstance(text_content, list):
+                    text_content = " ".join(text_content)
+
+                if not text_content:
+                    continue
+
+                title = item.get('title')
+                if not title:
+                    title = ''
+
+                if isinstance(title, list):
+                    title = " ".join(title)
+
+                pub_date = item.get('pubDate')
+                if not pub_date:
+                    pub_date = str(datetime.date.today())
+
+                text_content = ("Title: " + title +
+                                "\nPublished: " + pub_date + "\n" +
+                                text_content)
+
+                doc = Document(
+                    page_content=text_content,
+                    metadata={"title": item.get('title'),
+                              "url": item.get('url'),
+                              "pubDate": item.get('pubDate')}
+                )
+
+                # PUSH TO BUFFER (Oldest gets evicted automatically if full)
+                buffer.add_document(doc)
+                print(f"📰 Added: {title[:30]}... (Buffer size: {len(buffer._buffer)})")
+                print(f"\tArticle text: {doc.page_content[:100]}...")
+                print()
+                # Rate limit slightly to be polite
+                await asyncio.sleep(60)
+
+        except Exception as e:
+            print(f"⚠️ Crawler error: {e}. Restarting in 10s...")
+            await asyncio.sleep(10)
+
+
+def debug_print_prompt(prompt_value):
+    print("\n" + "="*40)
+    print("📢 FULL PROMPT SENT TO LLM:")
+    print("="*40)
+    print(prompt_value.to_string())  # This prints the exact text
+    print("="*40 + "\n")
+    return prompt_value
+
+if __name__ == "__main__":
+    # Initialize the Rolling Buffer
+    retriever = RollingNewsBuffer(capacity=100)
+
+    # Start Crawler in a background thread so it doesn't block the Chat
+    def start_background_loop(loop):
+        asyncio.set_event_loop(loop)
+        loop.run_until_complete(continuous_crawl(retriever))
+
+    crawler_loop = asyncio.new_event_loop()
+    t = threading.Thread(target=start_background_loop, args=(crawler_loop,), daemon=True)
+    t.start()
+
+    import time
+
+    from langchain_core.prompts import ChatPromptTemplate
+    from langchain_ollama import ChatOllama
+
+    # Setup standard RAG chain
+    llm = ChatOllama(model="gemma3", num_ctx=32768)
+    prompt = ChatPromptTemplate.from_template(
+        "Answer based ONLY on the following news:\n\n{context}\n\nQuestion: {question}\n\n"
+        "DO NOT include generic Newsweek-administrative articles like 'Corrections', "
+        "'Company Info', 'Subscribe', 'Opinions', 'Press Releases', 'Editorials', etc. in your "
+        "analysis or answers. Answer the question using the non-Newsweek-related news provided. "
+        "You will be penalized for including old or undated news in your answer. If asked for "
+        "overviews or summaries, split news items into paragraphs and provide a summary of each "
+        "news item."
+    )
+
+    def format_docs(docs):
+        slice_of_news = "\n\n".join([d.page_content[:1000] for d in docs])  # Truncate for demo
+        print(f"📰 Latest news char length: {len(slice_of_news)}")
+        return slice_of_news
+
+    chain = (
+        {
+            # FIX: Use itemgetter so retriever gets a string, not a dict
+            "context": itemgetter("question") | retriever | format_docs,
+            "question": itemgetter("question")
+        }
+        | prompt
+        | debug_print_prompt
+        | llm
+    )
+
+    # Simulate querying constantly while buffer fills in background
+    print("⏳ Waiting for crawler to gather some data...")
+    time.sleep(10)
+
+    while True:
+        query = input("Press Enter to ask about current news (or Ctrl+C to quit)...")
+        print(f"\nQuery: {query}\nThinking... 🤔")
+        response = chain.invoke({"question": query})
+        print(response.content)
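The buffer's eviction logic leans on a stdlib guarantee: a bounded `collections.deque` silently drops its oldest entry when a new one is appended, which is why `add_document` inspects `buffer[0]` *before* appending. A minimal sketch of just that behavior (plain stdlib, independent of wxpath):

```python
from collections import deque

# A deque with maxlen evicts from the opposite end when full.
buf = deque(maxlen=3)
for n in range(5):
    buf.append(n)
print(list(buf))  # [2, 3, 4] -- 0 and 1 were evicted oldest-first
```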
wxpath/integrations/langchain/loader.py
ADDED
@@ -0,0 +1,60 @@
+from typing import Iterator
+
+from elementpath.xpath_tokens import XPathMap
+from langchain_core.document_loaders import BaseLoader
+from langchain_core.documents import Document
+
+import wxpath
+
+
+class WXPathLoader(BaseLoader):
+    """A LangChain loader for wxpath queries.
+
+    For more complex examples, see the examples directory.
+    Best practice would be to subclass the loader and override the _prep_doc method.
+    For example:
+    ```python
+    class MyWXPathLoader(WXPathLoader):
+        def _prep_doc(self, item: (XPathMap | dict)) -> Document:
+            # Custom processing here
+            return super()._prep_doc(item)
+    ```
+    """
+
+    def __init__(self, expression: str, max_depth: int = 1):
+        self.expression = expression
+        self.max_depth = max_depth
+
+    def _prep_doc(self, item: (XPathMap | dict)) -> Document:
+
+        if isinstance(item, dict):
+            content = item.pop("text", str(item))  # Fallback if no "text" key
+        else:
+            content = item._map.pop("text", str(item._map))  # Fallback if no "text" key
+            item = item._map
+
+        return Document(
+            page_content=content,
+            metadata=item  # Remaining keys go here (url, title, etc.)
+        )
+
+    def lazy_load(self) -> Iterator[Document]:
+        """
+        Lazy load documents from the wxpath query.
+        Each item yielded by wxpath becomes a LangChain Document.
+        """
+        # wxpath_async_blocking_iter allows iteration in sync environments
+        results = wxpath.wxpath_async_blocking_iter(
+            self.expression,
+            max_depth=self.max_depth
+        )
+
+        for item in results:
+            yield self._prep_doc(item)
+
+    async def alazy_load(self):
+        async for item in wxpath.wxpath_async(
+            self.expression,
+            max_depth=self.max_depth
+        ):
+            yield self._prep_doc(item)
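Tying the loader to the examples above, a minimal synchronous usage sketch (the URL and expression are illustrative placeholders; `lazy_load()` is defined above, and `load()` comes from LangChain's `BaseLoader`, which simply drains it):

```python
from wxpath.integrations.langchain.loader import WXPathLoader

# Illustrative expression: emit a 'text' key so _prep_doc uses it as
# page_content; every other key lands in Document.metadata.
loader = WXPathLoader(
    expression="""
        url('https://example.com/')
        /map{ 'text': string-join(//p/text()), 'url': string(base-uri(.)) }
    """,
    max_depth=1,
)
for doc in loader.lazy_load():
    print(doc.metadata.get("url"), len(doc.page_content))
```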
wxpath/patches.py
CHANGED
@@ -1,7 +1,17 @@
+import urllib.parse
+
 import elementpath
+from elementpath import XPathContext, XPathFunction
 from elementpath.xpath3 import XPath3Parser
 from lxml import etree, html
 
+from wxpath.http.client import Response as Response
+from wxpath.util.cleaners import main_text_extractor
+from wxpath.util.common_paths import XPATH_PATH_TO_EXTERNAL_LINKS, XPATH_PATH_TO_INTERNAL_LINKS
+from wxpath.util.logging import get_logger
+
+log = get_logger(__name__)
+
 
 def html_element_repr(self):
     return (f"HtmlElement(tag={self.tag}, "
@@ -13,14 +23,18 @@ html.HtmlElement.__repr__ = html_element_repr
 
 
 class XPath3Element(etree.ElementBase):
-    def
+    def __init__(self, tag, attrib=None, nsmap=None, **extra):
+        super().__init__(tag, attrib, nsmap, **extra)
+        self.response = None  # type: Response | None
+
+    def xpath3(self, expr, request=None, **kwargs):
         """
         Evaluate an XPath 3 expression using elementpath library,
         returning the results as a list.
         """
-        kwargs.setdefault("parser",
+        kwargs.setdefault("parser", WXPathParser)
         kwargs.setdefault(
-            "uri",
+            "uri",
             getattr(self.getroottree().docinfo, "URL", None) or self.get("base_url")
         )
         return elementpath.select(self, expr, **kwargs)
@@ -51,7 +65,8 @@ class XPath3Element(etree.ElementBase):
     @depth.setter
     def depth(self, value):
         self.set("depth", str(value))
-
+
+
 # Create and register custom parser that returns XPath3Element instances
 lookup = etree.ElementDefaultClassLookup(element=XPath3Element)
 parser = etree.HTMLParser()
@@ -60,4 +75,199 @@ parser.set_element_class_lookup(lookup)
 
 # Expose parser for use in parse_html
 html_parser_with_xpath3 = parser
-html.HtmlElement.xpath3 = XPath3Element.xpath3
+html.HtmlElement.xpath3 = XPath3Element.xpath3
+
+# --- WXPATH functions ---
+WX_NAMESPACE = "http://wxpath.dev/ns"
+
+class WXPathParser(XPath3Parser):
+    """Custom parser that includes wxpath-specific functions."""
+    pass
+
+# 1. Register the namespace mapping globally on the parser class
+WXPathParser.DEFAULT_NAMESPACES['wx'] = WX_NAMESPACE
+
+# 2. Helper to register functions easily
+def register_wxpath_function(name, nargs=None, **kwargs):
+    """Registers a function token on the custom parser."""
+
+    # Check if this is a prefixed function (e.g. 'wx:depth')
+    if ':' in name:
+        prefix, local_name = name.split(':', 1)
+        kwargs['prefix'] = prefix
+        # kwargs['namespace'] = WX_NAMESPACE
+        name = local_name
+
+    # Register the token symbol and capture the created class
+    token_class = WXPathParser.function(name, nargs=nargs, **kwargs)
+    # Return a decorator to define the 'evaluate' method
+    def decorator(func):
+        token_class.evaluate = func
+        return func
+    return decorator
+
+
+class XPathContextRequired(Exception):
+    message = ('XPathContext is required. This usually arises when you call '
+               'the function without a preceding axes expression ("/")')
+    def __init__(self, *args):
+        super().__init__(self.message, *args)
+
+
+def _get_root(context: XPathContext):
+    if context is None:
+        raise XPathContextRequired
+
+    if not hasattr(context.item, 'elem'):
+        return context.item.parent.elem.getroottree().getroot()
+    return context.item.elem.getroottree().getroot()
+
+
+@register_wxpath_function('wx:depth', nargs=0)
+def wx_depth(_: XPathFunction, context: XPathContext):
+    if context is None:
+        raise XPathContextRequired
+
+    root = _get_root(context)
+
+    depth = root.get('depth')
+    return int(depth) if depth is not None else 0
+
+
+@register_wxpath_function('wx:backlink', nargs=0)
+def wx_backlink(_: XPathFunction, context: XPathContext):
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+    return item.get('backlink') or ''
+
+
+@register_wxpath_function('wx:current-url', nargs=0)
+def wx_current_url(_: XPathFunction, context: XPathContext):
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+    return item.base_url
+
+
+@register_wxpath_function('wx:elapsed', nargs=0)
+@register_wxpath_function('wx:fetch-time', nargs=0)
+def wx_fetch_time(_: XPathFunction, context: XPathContext):
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+    resp = item.response  # type: Response
+    return resp.latency
+
+
+@register_wxpath_function('wx:status-code', nargs=0)
+def wx_status_code(_: XPathFunction, context: XPathContext) -> int:
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+
+    resp = item.response  # type: Response
+    return resp.status
+
+
+@register_wxpath_function('wx:elem', nargs=0)
+def wx_elem(_: XPathFunction, context: XPathContext):
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+    return item
+
+
+def _get_root_domain(base_url: str) -> str:
+    parsed_url = urllib.parse.urlparse(base_url)
+
+    netloc = parsed_url.netloc
+    parts = netloc.split('.')
+    root_domain = netloc
+
+    if len(parts) > 2:
+        # Heuristic: If the last part is 2 chars (uk, au) and 2nd to last is < 4 (co, com, org)
+        # it's likely a compound TLD like co.uk. This isn't perfect but better than [-2:].
+        if len(parts[-1]) == 2 and len(parts[-2]) <= 3:
+            root_domain = ".".join(parts[-3:])  # grab bbc.co.uk
+        else:
+            # grab books.toscrape.com -> toscrape.com
+            root_domain = ".".join(parts[-2:])
+
+    return root_domain
+
+
+@register_wxpath_function('wx:internal-links', nargs=0)
+def wx_internal_links(_: XPathFunction, context: XPathContext):
+    """
+    Returns a list of internal links.
+    Allows for false positives.
+    """
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+
+    root_domain = _get_root_domain(item.base_url)
+    _path = XPATH_PATH_TO_INTERNAL_LINKS.format(root_domain)
+    return item.xpath3(_path)
+
+
+@register_wxpath_function('wx:external-links', nargs=0)
+def wx_external_links(_: XPathFunction, context: XPathContext):
+    """
+    Returns a list of external links.
+    """
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+
+    root_domain = _get_root_domain(item.base_url)
+    _path = XPATH_PATH_TO_EXTERNAL_LINKS.format(root_domain)
+    return item.xpath3(_path)
+
+
+@register_wxpath_function('wx:main-article-text', nargs=0)
+def wx_main_article_text(_: XPathFunction, context: XPathContext):
+    if context is None:
+        raise XPathContextRequired
+
+    item = context.item.elem
+    if item is None:
+        return ''
+
+    try:
+        return main_text_extractor(item)
+    except Exception:
+        log.exception('Failed to extract main article text')
+        return ''
wxpath/settings.py
CHANGED
@@ -54,10 +54,12 @@ SETTINGS = {
     'concurrency': 16,
     'per_host': 8,
     'timeout': 15,
+    'verify_ssl': True,
     'headers': {
         "User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
                        "AppleWebKit/537.36 (KHTML, like Gecko) "
-                       "Chrome/142.0.0.0 Safari/537.36")
+                       "Chrome/142.0.0.0 Safari/537.36")
+    },
     'proxies': None,
     'auto_throttle_target_concurrency': None,
     'auto_throttle_start_delay': 0.25,
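The new `verify_ssl` flag sits alongside the existing keys in the module-level `SETTINGS` dict. Assuming the dict is consulted when the HTTP client is constructed (the diff does not show the consumer), an override would presumably look like:

```python
import wxpath.settings

# Assumption: mutating SETTINGS before the first crawl takes effect.
# verify_ssl=False disables TLS certificate checks -- only sensible
# against hosts you control.
wxpath.settings.SETTINGS['verify_ssl'] = False
wxpath.settings.SETTINGS['timeout'] = 30
```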