cat-stack 1.4.0__tar.gz → 1.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cat_stack-1.4.0 → cat_stack-1.5.0}/PKG-INFO +1 -2
- {cat_stack-1.4.0 → cat_stack-1.5.0}/pyproject.toml +0 -1
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/__about__.py +1 -1
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_batch.py +5 -2
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_providers.py +117 -30
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_utils.py +44 -5
- cat_stack-1.5.0/src/catstack/_web_fetch.py +265 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/calls/__init__.py +6 -4
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/calls/pdf_CoVe.py +88 -76
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/classify.py +7 -2
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/image_functions.py +29 -78
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/pdf_functions.py +4 -5
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/summarize.py +2 -2
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/text_functions.py +4 -5
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/text_functions_ensemble.py +177 -140
- cat_stack-1.4.0/src/catstack/_web_fetch.py +0 -194
- cat_stack-1.4.0/src/catstack/calls/all_calls.py +0 -622
- {cat_stack-1.4.0 → cat_stack-1.5.0}/.gitignore +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/LICENSE +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/README.md +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/cat_stack/__init__.py +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/__init__.py +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_category_analysis.py +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_chunked.py +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_embeddings.py +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_formatter.py +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_pilot_test.py +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_prompts.py +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_review_ui.py +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_tiebreaker.py +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_wrapper_helpers.py +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/calls/CoVe.py +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/calls/image_CoVe.py +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/calls/image_stepback.py +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/calls/pdf_stepback.py +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/calls/stepback.py +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/calls/top_n.py +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/explore.py +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/extract.py +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/images/circle.png +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/images/cube.png +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/images/diamond.png +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/images/overlapping_pentagons.png +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/images/rectangles.png +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/model_reference_list.py +0 -0
- {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/prompt_tune.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cat-stack
|
|
3
|
-
Version: 1.4.0
|
|
3
|
+
Version: 1.5.0
|
|
4
4
|
Summary: Domain-agnostic text, image, PDF, and DOCX classification engine powered by LLMs
|
|
5
5
|
Project-URL: Documentation, https://github.com/chrissoria/cat-stack#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/chrissoria/cat-stack/issues
|
|
@@ -20,7 +20,6 @@ Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
|
20
20
|
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
21
21
|
Requires-Python: >=3.8
|
|
22
22
|
Requires-Dist: pandas
|
|
23
|
-
Requires-Dist: regex
|
|
24
23
|
Requires-Dist: requests
|
|
25
24
|
Requires-Dist: tqdm
|
|
26
25
|
Provides-Extra: docx
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
4
|
-
__version__ = "1.4.0"
|
|
4
|
+
__version__ = "1.5.0"
|
|
5
5
|
__author__ = "Chris Soria"
|
|
6
6
|
__email__ = "chrissoria@berkeley.edu"
|
|
7
7
|
__title__ = "cat-stack"
|
|
@@ -818,13 +818,16 @@ def _run_one_sync_model(
|
|
|
818
818
|
multi_label=prompt_params.get("multi_label", True),
|
|
819
819
|
)
|
|
820
820
|
try:
|
|
821
|
-
raw = client.complete(
|
|
821
|
+
raw, err = client.complete(
|
|
822
822
|
messages=messages,
|
|
823
823
|
json_schema=json_schema,
|
|
824
824
|
creativity=creativity,
|
|
825
825
|
thinking_budget=thinking_budget if thinking_budget and thinking_budget > 0 else None,
|
|
826
826
|
)
|
|
827
|
-
|
|
827
|
+
if err:
|
|
828
|
+
item_results[idx] = (None, err)
|
|
829
|
+
else:
|
|
830
|
+
item_results[idx] = (extract_json(raw), None)
|
|
828
831
|
except Exception as e:
|
|
829
832
|
item_results[idx] = (None, str(e))
|
|
830
833
|
|
|
@@ -7,6 +7,7 @@ without requiring provider-specific SDKs.
|
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
9
|
import json
|
|
10
|
+
import threading
|
|
10
11
|
import time
|
|
11
12
|
import requests
|
|
12
13
|
|
|
@@ -68,54 +69,76 @@ _HF_ROUTER_ENDPOINTS = {
|
|
|
68
69
|
}
|
|
69
70
|
|
|
70
71
|
|
|
71
|
-
def _detect_huggingface_endpoint(api_key: str, model: str, skip: set = None) -> str:
    """
    Probe HuggingFace endpoints and return the base URL of one that accepts
    this model.

    The function runs in one of two modes, selected by `skip`:

    - Legacy (`skip` is None or empty): only the generic router and the
      "together" router are probed. If neither answers with HTTP 200 the
      generic base URL is returned anyway, so the caller's own request
      surfaces the real error.
    - Lazy-fallback (`skip` is a non-empty set): the generic router and every
      endpoint in `_HF_ROUTER_ENDPOINTS` are probed, except those listed in
      `skip`. If none answers with HTTP 200 the function returns None.

    A model name carrying a known router suffix short-circuits the probing:
    that router's base URL is returned directly unless it is in `skip`.

    Args:
        api_key: HuggingFace API key, sent as a Bearer token with each probe.
        model: Model name to test (may include a `:router` suffix).
        skip: Optional set of base URLs that must not be probed or returned.

    Returns:
        Base URL (without /chat/completions) of a usable endpoint, or None
        when `skip` is non-empty and no candidate responded with 200.
    """
    excluded = skip or set()
    lazy_fallback = bool(excluded)
    clean_model, router = _parse_hf_model_suffix(model)

    # An explicit, known router suffix wins outright unless it was excluded.
    if (
        router
        and router in _HF_ROUTER_ENDPOINTS
        and _HF_ROUTER_ENDPOINTS[router] not in excluded
    ):
        return _HF_ROUTER_ENDPOINTS[router]

    generic_base = PROVIDER_CONFIG["huggingface"]["endpoint"].replace("/chat/completions", "")

    if lazy_fallback:
        # Try everything we know about, generic router first.
        probe_order = [generic_base, *_HF_ROUTER_ENDPOINTS.values()]
    else:
        # Keep the historical two-probe behaviour for legacy callers.
        probe_order = [generic_base, _HF_ROUTER_ENDPOINTS["together"]]

    request_headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    probe_body = {
        "model": clean_model,
        "messages": [{"role": "user", "content": "hi"}],
        "max_tokens": 5,
    }

    for base in (b for b in probe_order if b not in excluded):
        try:
            reply = requests.post(
                f"{base}/chat/completions",
                headers=request_headers,
                json=probe_body,
                timeout=30,
            )
            if reply.status_code == 200:
                return base
        except Exception:
            # Best-effort probe: any failure just means "try the next one".
            continue

    # Legacy callers always get a URL back; lazy-fallback callers get None so
    # they can report the original error instead of retrying a bad endpoint.
    return None if lazy_fallback else generic_base
|
|
119
142
|
|
|
120
143
|
|
|
121
144
|
# =============================================================================
|
|
@@ -186,14 +209,24 @@ class UnifiedLLMClient:
|
|
|
186
209
|
def __init__(self, provider: str, api_key: str, model: str):
|
|
187
210
|
self.provider = provider.lower()
|
|
188
211
|
self.api_key = api_key
|
|
189
|
-
|
|
190
|
-
# Keep full model name with router suffix — the generic HF router
|
|
191
|
-
# uses the suffix (e.g. :novita, :together) for routing.
|
|
192
212
|
self.model = model
|
|
193
213
|
|
|
194
|
-
#
|
|
214
|
+
# Lazy HuggingFace router fallback — start with None and only
|
|
215
|
+
# populate when we either (a) have an explicit router suffix, or
|
|
216
|
+
# (b) the default endpoint returns a "wrong router" 400 on a real
|
|
217
|
+
# request. Avoids burning two probe POSTs (and leaking the API key
|
|
218
|
+
# to two endpoints) on every UnifiedLLMClient construction.
|
|
219
|
+
self._custom_endpoint = None
|
|
220
|
+
self._endpoint_lock = threading.Lock()
|
|
221
|
+
|
|
195
222
|
if self.provider == "huggingface":
|
|
196
|
-
|
|
223
|
+
clean_model, router = _parse_hf_model_suffix(model)
|
|
224
|
+
if router and router in _HF_ROUTER_ENDPOINTS:
|
|
225
|
+
# User was explicit about the router; honour it directly and
|
|
226
|
+
# strip the suffix from the model name (specific-router
|
|
227
|
+
# endpoints expect the clean name, not the suffix).
|
|
228
|
+
self._custom_endpoint = f"{_HF_ROUTER_ENDPOINTS[router]}/chat/completions"
|
|
229
|
+
self.model = clean_model
|
|
197
230
|
|
|
198
231
|
if self.provider not in PROVIDER_CONFIG:
|
|
199
232
|
raise ValueError(f"Unsupported provider: {provider}. "
|
|
@@ -201,6 +234,54 @@ class UnifiedLLMClient:
|
|
|
201
234
|
|
|
202
235
|
self.config = PROVIDER_CONFIG[self.provider]
|
|
203
236
|
|
|
237
|
+
def _is_hf_wrong_router_400(self, body: str) -> bool:
|
|
238
|
+
"""True if a 400 response body indicates the current HF router doesn't
|
|
239
|
+
carry this model (vs. truly nonexistent or a non-routing problem).
|
|
240
|
+
|
|
241
|
+
Trigger shapes (from a smoke test against the live HF API):
|
|
242
|
+
- Generic router: `{"error":{"code":"model_not_supported",...}}`
|
|
243
|
+
- Specific router: `{"error":"Model not supported by provider XYZ"}`
|
|
244
|
+
|
|
245
|
+
Intentionally NOT triggered by `model_not_found` (no router will help
|
|
246
|
+
a nonexistent model), 401/403 (auth), 5xx/429 (transient), or any
|
|
247
|
+
other 400 unrelated to router routing.
|
|
248
|
+
"""
|
|
249
|
+
if self.provider != "huggingface":
|
|
250
|
+
return False
|
|
251
|
+
return (
|
|
252
|
+
'"code":"model_not_supported"' in body
|
|
253
|
+
or "Model not supported by provider" in body
|
|
254
|
+
)
|
|
255
|
+
|
|
256
|
+
def _try_hf_router_fallback(self, failed_endpoint: str) -> bool:
|
|
257
|
+
"""Find an HF router that has this model. Cache it on self.
|
|
258
|
+
|
|
259
|
+
Called from `complete()` when an HF request returns a "wrong router"
|
|
260
|
+
400. Probes all five known specific routers plus the generic router,
|
|
261
|
+
skipping the one that just failed. Idempotent and thread-safe via
|
|
262
|
+
the per-instance endpoint lock — if two concurrent callers both hit
|
|
263
|
+
the fallback path, only one runs the probe.
|
|
264
|
+
|
|
265
|
+
Returns True if a working endpoint was found and cached (caller
|
|
266
|
+
should refresh and retry). Returns False if every alternative also
|
|
267
|
+
rejected the model (caller should surface the original error).
|
|
268
|
+
"""
|
|
269
|
+
failed_base = failed_endpoint.replace("/chat/completions", "")
|
|
270
|
+
with self._endpoint_lock:
|
|
271
|
+
# Did another thread already find a different working endpoint?
|
|
272
|
+
if self._custom_endpoint:
|
|
273
|
+
current_base = self._custom_endpoint.replace("/chat/completions", "")
|
|
274
|
+
if current_base != failed_base:
|
|
275
|
+
return True
|
|
276
|
+
|
|
277
|
+
new_base = _detect_huggingface_endpoint(
|
|
278
|
+
self.api_key, self.model, skip={failed_base}
|
|
279
|
+
)
|
|
280
|
+
if new_base:
|
|
281
|
+
self._custom_endpoint = f"{new_base}/chat/completions"
|
|
282
|
+
return True
|
|
283
|
+
return False
|
|
284
|
+
|
|
204
285
|
def _get_endpoint(self) -> str:
|
|
205
286
|
"""Get the API endpoint, substituting model if needed."""
|
|
206
287
|
# Use custom endpoint if set (e.g., for HuggingFace router suffixes)
|
|
@@ -555,11 +636,11 @@ class UnifiedLLMClient:
|
|
|
555
636
|
if self.provider == "claude-code":
|
|
556
637
|
return self._call_claude_cli(messages, max_retries=max_retries, initial_delay=initial_delay)
|
|
557
638
|
|
|
558
|
-
endpoint = self._get_endpoint()
|
|
559
639
|
headers = self._get_headers()
|
|
560
640
|
payload = self._build_payload(messages, json_schema, creativity, thinking_budget=thinking_budget, force_json=force_json)
|
|
561
641
|
|
|
562
642
|
for attempt in range(max_retries):
|
|
643
|
+
endpoint = self._get_endpoint()
|
|
563
644
|
try:
|
|
564
645
|
response = requests.post(
|
|
565
646
|
endpoint,
|
|
@@ -582,6 +663,12 @@ class UnifiedLLMClient:
|
|
|
582
663
|
self._warned_no_structured = True
|
|
583
664
|
payload.pop("response_format")
|
|
584
665
|
continue # Retry immediately without response_format
|
|
666
|
+
|
|
667
|
+
# HuggingFace: try other routers when the current one
|
|
668
|
+
# rejects the model with a "wrong router" 400.
|
|
669
|
+
if self._is_hf_wrong_router_400(response.text):
|
|
670
|
+
if self._try_hf_router_fallback(endpoint):
|
|
671
|
+
continue # retry with the newly-cached endpoint
|
|
585
672
|
if response.status_code == 404 or (response.status_code == 400 and "not found" in response.text.lower() and "model" in response.text.lower()):
|
|
586
673
|
return None, f"Model '{self.model}' not found for {self.provider}"
|
|
587
674
|
elif response.status_code in [401, 403]:
|
|
@@ -7,7 +7,6 @@ encoding, and other common operations used across the package.
|
|
|
7
7
|
|
|
8
8
|
import json
|
|
9
9
|
import re
|
|
10
|
-
import regex
|
|
11
10
|
|
|
12
11
|
__all__ = [
|
|
13
12
|
# JSON utilities
|
|
@@ -88,15 +87,55 @@ def build_json_schema(categories: list, include_additional_properties: bool = Tr
|
|
|
88
87
|
return schema
|
|
89
88
|
|
|
90
89
|
|
|
90
|
+
def _extract_balanced_json(text: str) -> str | None:
|
|
91
|
+
"""Return the first balanced-brace JSON object substring in text, or None.
|
|
92
|
+
|
|
93
|
+
String-aware: a `{` or `}` inside a JSON string (between unescaped double
|
|
94
|
+
quotes) doesn't change scan depth. Replaces the prior `regex.findall` with
|
|
95
|
+
a recursive `(?R)` pattern — same semantics for well-formed input, but
|
|
96
|
+
correct on inputs like `{"summary": "see Fig {3}"}` (the regex version
|
|
97
|
+
truncated at the first `}` inside the string).
|
|
98
|
+
"""
|
|
99
|
+
if text is None:
|
|
100
|
+
return None
|
|
101
|
+
|
|
102
|
+
depth = 0
|
|
103
|
+
start = None
|
|
104
|
+
in_string = False
|
|
105
|
+
escape = False
|
|
106
|
+
for i, ch in enumerate(text):
|
|
107
|
+
if escape:
|
|
108
|
+
escape = False
|
|
109
|
+
continue
|
|
110
|
+
if ch == '\\':
|
|
111
|
+
escape = True
|
|
112
|
+
continue
|
|
113
|
+
if ch == '"':
|
|
114
|
+
in_string = not in_string
|
|
115
|
+
continue
|
|
116
|
+
if in_string:
|
|
117
|
+
continue
|
|
118
|
+
if ch == '{':
|
|
119
|
+
if depth == 0:
|
|
120
|
+
start = i
|
|
121
|
+
depth += 1
|
|
122
|
+
elif ch == '}':
|
|
123
|
+
if depth == 0:
|
|
124
|
+
continue
|
|
125
|
+
depth -= 1
|
|
126
|
+
if depth == 0 and start is not None:
|
|
127
|
+
return text[start:i + 1]
|
|
128
|
+
return None
|
|
129
|
+
|
|
130
|
+
|
|
91
131
|
def extract_json(reply: str) -> str:
    """Pull the first JSON object out of a model reply and compact it.

    The object is located with `_extract_balanced_json`; square brackets,
    newlines and spaces are then removed from it. When the reply is None or
    contains no balanced object, the sentinel `{"1":"e"}` is returned.
    """
    error_marker = '{"1":"e"}'

    candidate = None if reply is None else _extract_balanced_json(reply)
    if candidate is None:
        return error_marker

    # Strip the characters one at a time, in the original order.
    for unwanted in ('[', ']', '\n', " "):
        candidate = candidate.replace(unwanted, '')
    return candidate
|
|
102
141
|
|
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Web content fetching utilities for URL input type.
|
|
3
|
+
|
|
4
|
+
Provides URL detection, HTML text extraction, and batch URL fetching
|
|
5
|
+
for use as a preprocessing step before text classification/extraction/summarization.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import html as html_lib
|
|
9
|
+
import ipaddress
|
|
10
|
+
import re
|
|
11
|
+
import socket
|
|
12
|
+
from urllib.parse import urlsplit
|
|
13
|
+
|
|
14
|
+
import requests
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"is_url",
|
|
18
|
+
"fetch_url_text",
|
|
19
|
+
"fetch_urls",
|
|
20
|
+
"detect_url_input",
|
|
21
|
+
"strip_html_tags",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
_DEFAULT_TIMEOUT = 30
|
|
25
|
+
|
|
26
|
+
_MAX_CONTENT_CHARS = 50000
|
|
27
|
+
|
|
28
|
+
# Hard cap on bytes pulled from the response before bailing — guards against
|
|
29
|
+
# OOM on a hostile or accidentally-huge URL. 5x slack over the char cap so
|
|
30
|
+
# HTML markup that gets stripped later still leaves real payload room.
|
|
31
|
+
_MAX_RESPONSE_BYTES = 5 * _MAX_CONTENT_CHARS
|
|
32
|
+
|
|
33
|
+
# Schemes fetch_url_text will follow. Anything else (file://, ftp://, data:,
|
|
34
|
+
# javascript:, ...) is rejected at validation time.
|
|
35
|
+
_ALLOWED_SCHEMES = frozenset({"http", "https"})
|
|
36
|
+
|
|
37
|
+
_USER_AGENT = (
|
|
38
|
+
"Mozilla/5.0 (compatible; CatStack/1.0; "
|
|
39
|
+
"+https://github.com/chrissoria/cat-stack)"
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def is_url(s) -> bool:
    """
    Report whether `s` is a structurally valid http(s) URL.

    Purely syntactic: nothing is resolved or fetched. The value must be a
    string; after trimming surrounding whitespace it must contain no CR, LF
    or NUL characters, must parse with `urlsplit`, must use a scheme listed
    in `_ALLOWED_SCHEMES`, and must have a non-empty netloc.
    """
    if not isinstance(s, str):
        return False

    candidate = s.strip()
    if "\r" in candidate or "\n" in candidate or "\x00" in candidate:
        return False

    try:
        pieces = urlsplit(candidate)
    except Exception:
        return False

    if pieces.scheme not in _ALLOWED_SCHEMES:
        return False
    return bool(pieces.netloc)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def detect_url_input(items) -> bool:
    """
    Decide whether the input looks like URL data.

    A plain string is tested directly with `is_url`. For any other iterable,
    the first entry that is neither None nor reported missing by `pd.isna`
    is converted to `str` and tested; later entries are not examined.
    Non-iterables and iterables with no usable entry yield False.
    """
    import pandas as pd

    if isinstance(items, str):
        return is_url(items)

    if not hasattr(items, "__iter__"):
        return False

    for entry in items:
        if entry is None:
            continue
        try:
            if pd.isna(entry):
                continue
        except (TypeError, ValueError):
            # pd.isna result was not usable as a single truth value (for
            # example an array-like entry); treat the entry as present.
            pass
        return is_url(str(entry))

    return False
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _validate_url_safe(url):
|
|
89
|
+
"""
|
|
90
|
+
Validate a URL for safe fetching: structure + SSRF host guard.
|
|
91
|
+
|
|
92
|
+
Returns (cleaned_url, error_message). error_message is None on success.
|
|
93
|
+
|
|
94
|
+
The SSRF guard resolves the hostname via socket.getaddrinfo and rejects
|
|
95
|
+
if ANY returned address is private, loopback, link-local, reserved,
|
|
96
|
+
multicast, or unspecified. Catches AWS metadata (169.254.169.254),
|
|
97
|
+
localhost (127.0.0.1, ::1), RFC1918, GCP metadata host, and similar
|
|
98
|
+
internal targets before any HTTP request goes out.
|
|
99
|
+
|
|
100
|
+
Does NOT defend against DNS rebinding (resolve-once-then-reconnect to
|
|
101
|
+
a different IP); that requires a custom HTTPAdapter and is out of
|
|
102
|
+
scope here.
|
|
103
|
+
"""
|
|
104
|
+
if not isinstance(url, str):
|
|
105
|
+
return "", "url must be a string"
|
|
106
|
+
url = url.strip()
|
|
107
|
+
if any(c in url for c in ("\r", "\n", "\x00")):
|
|
108
|
+
return "", "url contains control characters"
|
|
109
|
+
try:
|
|
110
|
+
parts = urlsplit(url)
|
|
111
|
+
except Exception as e:
|
|
112
|
+
return "", f"could not parse url: {e}"
|
|
113
|
+
if parts.scheme not in _ALLOWED_SCHEMES:
|
|
114
|
+
return "", f"scheme must be http or https; got {parts.scheme!r}"
|
|
115
|
+
if not parts.netloc:
|
|
116
|
+
return "", "url has empty netloc"
|
|
117
|
+
hostname = parts.hostname
|
|
118
|
+
if not hostname:
|
|
119
|
+
return "", "url has empty hostname"
|
|
120
|
+
|
|
121
|
+
try:
|
|
122
|
+
addrinfo = socket.getaddrinfo(hostname, None)
|
|
123
|
+
except socket.gaierror as e:
|
|
124
|
+
return "", f"could not resolve {hostname!r}: {e}"
|
|
125
|
+
|
|
126
|
+
for info in addrinfo:
|
|
127
|
+
ip_str = info[4][0]
|
|
128
|
+
try:
|
|
129
|
+
ip = ipaddress.ip_address(ip_str)
|
|
130
|
+
except ValueError:
|
|
131
|
+
return "", f"resolved address {ip_str!r} is not a valid IP"
|
|
132
|
+
if (
|
|
133
|
+
ip.is_private
|
|
134
|
+
or ip.is_loopback
|
|
135
|
+
or ip.is_link_local
|
|
136
|
+
or ip.is_reserved
|
|
137
|
+
or ip.is_multicast
|
|
138
|
+
or ip.is_unspecified
|
|
139
|
+
):
|
|
140
|
+
return "", (
|
|
141
|
+
f"{hostname!r} resolves to {ip_str} (private/internal); "
|
|
142
|
+
f"refusing to fetch as an SSRF guard"
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
return url, None
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def strip_html_tags(html: str) -> str:
    """
    Extract readable text from an HTML string.

    Steps, in order:
    1. HTML comments are replaced by a space, including comments that
       contain `>`.
    2. Non-content elements (script, style, nav, header, footer, aside,
       noscript, iframe, form, svg) are removed together with their
       contents.
    3. The void tags input, meta, link and img are removed.
    4. Every remaining tag is replaced by a space.
    5. Whitespace runs are collapsed and the result is trimmed.
    6. HTML entities are decoded.

    Tag names are matched as whole names: `<nav>` and `<nav class="x">` are
    treated as nav elements, but `<nav-bar>` or `<navx>` are not. Without
    that, an unrelated element whose name merely starts with "nav" would be
    taken as an opening tag and everything up to the next `</nav>` deleted.

    Args:
        html: HTML source. None or empty yields "".

    Returns:
        The extracted plain text.
    """
    if not html:
        return ""

    # Comments first, so a '>' inside one cannot end the "tag" early below.
    text = re.sub(r"<!--.*?-->", " ", html, flags=re.DOTALL)

    junk_tags = (
        "script", "style", "nav", "header", "footer", "aside",
        "noscript", "iframe", "form", "svg",
    )
    for tag in junk_tags:
        # The lookahead requires the tag name to end right here (whitespace,
        # '/' or '>'), so names that only share a prefix are left alone.
        text = re.sub(
            rf"<{tag}(?=[\s/>])[^>]*>.*?</{tag}\s*>",
            "",
            text,
            flags=re.DOTALL | re.IGNORECASE,
        )

    for tag in ("input", "meta", "link", "img"):
        text = re.sub(rf"<{tag}(?=[\s/>])[^>]*>", "", text, flags=re.IGNORECASE)

    text = re.sub(r"<[^>]+>", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    # Decode entities last so escaped markup such as &lt;b&gt; survives as
    # literal text instead of being stripped as a tag.
    return html_lib.unescape(text)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def fetch_url_text(url: str, timeout: int = _DEFAULT_TIMEOUT, max_redirects: int = 5):
    """
    Fetch a single URL and extract its text content.

    Every URL that is about to be requested is first passed through
    `_validate_url_safe`. That covers the original URL and each redirect
    target: automatic redirect following is disabled and redirects are
    followed here one hop at a time, so a public URL cannot bounce the
    request to a private/internal address unchecked. At most
    `max_redirects` redirects are followed.

    The response body is streamed and reading stops once more than
    `_MAX_RESPONSE_BYTES` have been received. The body is decoded with the
    response's reported encoding (UTF-8 if none or unknown). HTML, plain
    text and untyped responses are passed through `strip_html_tags`; other
    content types are returned as decoded. The final text is truncated to
    `_MAX_CONTENT_CHARS` characters with a trailing notice.

    TLS errors are reported, not bypassed.

    Args:
        url: URL to fetch.
        timeout: Timeout in seconds passed to `requests.get` for each hop.
        max_redirects: Maximum number of redirects to follow.

    Returns:
        `(text, None)` on success, `("", error_message)` on failure.
    """
    from urllib.parse import urljoin

    headers = {"User-Agent": _USER_AGENT}
    target = url
    try:
        # One initial request plus up to max_redirects follow-ups.
        for _hop in range(max_redirects + 1):
            cleaned_url, validation_error = _validate_url_safe(target)
            if validation_error:
                return "", f"Error fetching {url}: {validation_error}"

            with requests.get(
                cleaned_url,
                headers=headers,
                timeout=timeout,
                stream=True,
                allow_redirects=False,
            ) as response:
                if response.is_redirect:
                    # Resolve relative Location values against the URL just
                    # requested, then re-validate on the next iteration.
                    target = urljoin(cleaned_url, response.headers["location"])
                    continue

                response.raise_for_status()
                content_type = response.headers.get("Content-Type", "")
                encoding = response.encoding

                chunks = []
                bytes_read = 0
                for chunk in response.iter_content(chunk_size=8192):
                    if not chunk:
                        continue
                    chunks.append(chunk)
                    bytes_read += len(chunk)
                    if bytes_read > _MAX_RESPONSE_BYTES:
                        break
                raw = b"".join(chunks)

            encoding = encoding or "utf-8"
            try:
                body = raw.decode(encoding, errors="replace")
            except (LookupError, TypeError):
                # Unknown or unusable codec name reported by the server.
                body = raw.decode("utf-8", errors="replace")

            if (
                "text/html" in content_type
                or "text/plain" in content_type
                or not content_type
            ):
                text = strip_html_tags(body)
            else:
                text = body

            if len(text) > _MAX_CONTENT_CHARS:
                text = text[:_MAX_CONTENT_CHARS] + (
                    f"\n\n[Content truncated at {_MAX_CONTENT_CHARS} characters]"
                )

            return text, None

        return "", f"Error fetching {url}: too many redirects (more than {max_redirects})"

    except requests.exceptions.Timeout:
        return "", f"Timeout after {timeout}s fetching {url}"
    except requests.exceptions.SSLError as e:
        return "", f"SSL/TLS error fetching {url}: {e}"
    except requests.exceptions.HTTPError as e:
        return "", f"HTTP {e.response.status_code} fetching {url}"
    except Exception as e:
        return "", f"Error fetching {url}: {e}"
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def fetch_urls(urls, timeout: int = _DEFAULT_TIMEOUT):
    """
    Fetch text for each URL in `urls`, in order.

    Each entry is converted to `str` and stripped. Entries that fail
    `is_url` are not fetched and are reported with a "Not a valid URL"
    error; the rest are passed to `fetch_url_text` with the given timeout.

    Returns:
        A list of `(url_string, fetched_text, error)` tuples, one per input
        entry. `error` is None on success; `fetched_text` is "" on failure.
    """
    outcomes = []
    for raw_url in urls:
        candidate = str(raw_url).strip()
        if is_url(candidate):
            body, problem = fetch_url_text(candidate, timeout=timeout)
            outcomes.append((candidate, body, problem))
        else:
            outcomes.append((candidate, "", f"Not a valid URL: {candidate}"))
    return outcomes
|
|
@@ -2,15 +2,17 @@
|
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
4
4
|
|
|
5
|
-
from .
|
|
5
|
+
from .stepback import (
|
|
6
6
|
get_stepback_insight_openai,
|
|
7
7
|
get_stepback_insight_anthropic,
|
|
8
8
|
get_stepback_insight_google,
|
|
9
9
|
get_stepback_insight_mistral,
|
|
10
|
+
)
|
|
11
|
+
from .CoVe import (
|
|
10
12
|
chain_of_verification_openai,
|
|
11
|
-
chain_of_verification_google,
|
|
12
13
|
chain_of_verification_anthropic,
|
|
13
|
-
|
|
14
|
+
chain_of_verification_google,
|
|
15
|
+
chain_of_verification_mistral,
|
|
14
16
|
)
|
|
15
17
|
|
|
16
18
|
__all__ = [
|
|
@@ -22,4 +24,4 @@ __all__ = [
|
|
|
22
24
|
'chain_of_verification_anthropic',
|
|
23
25
|
'chain_of_verification_google',
|
|
24
26
|
'chain_of_verification_mistral',
|
|
25
|
-
]
|
|
27
|
+
]
|