cat-stack 1.4.0__tar.gz → 1.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {cat_stack-1.4.0 → cat_stack-1.5.0}/PKG-INFO +1 -2
  2. {cat_stack-1.4.0 → cat_stack-1.5.0}/pyproject.toml +0 -1
  3. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/__about__.py +1 -1
  4. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_batch.py +5 -2
  5. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_providers.py +117 -30
  6. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_utils.py +44 -5
  7. cat_stack-1.5.0/src/catstack/_web_fetch.py +265 -0
  8. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/calls/__init__.py +6 -4
  9. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/calls/pdf_CoVe.py +88 -76
  10. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/classify.py +7 -2
  11. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/image_functions.py +29 -78
  12. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/pdf_functions.py +4 -5
  13. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/summarize.py +2 -2
  14. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/text_functions.py +4 -5
  15. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/text_functions_ensemble.py +177 -140
  16. cat_stack-1.4.0/src/catstack/_web_fetch.py +0 -194
  17. cat_stack-1.4.0/src/catstack/calls/all_calls.py +0 -622
  18. {cat_stack-1.4.0 → cat_stack-1.5.0}/.gitignore +0 -0
  19. {cat_stack-1.4.0 → cat_stack-1.5.0}/LICENSE +0 -0
  20. {cat_stack-1.4.0 → cat_stack-1.5.0}/README.md +0 -0
  21. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/cat_stack/__init__.py +0 -0
  22. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/__init__.py +0 -0
  23. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_category_analysis.py +0 -0
  24. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_chunked.py +0 -0
  25. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_embeddings.py +0 -0
  26. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_formatter.py +0 -0
  27. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_pilot_test.py +0 -0
  28. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_prompts.py +0 -0
  29. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_review_ui.py +0 -0
  30. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_tiebreaker.py +0 -0
  31. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/_wrapper_helpers.py +0 -0
  32. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/calls/CoVe.py +0 -0
  33. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/calls/image_CoVe.py +0 -0
  34. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/calls/image_stepback.py +0 -0
  35. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/calls/pdf_stepback.py +0 -0
  36. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/calls/stepback.py +0 -0
  37. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/calls/top_n.py +0 -0
  38. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/explore.py +0 -0
  39. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/extract.py +0 -0
  40. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/images/circle.png +0 -0
  41. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/images/cube.png +0 -0
  42. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/images/diamond.png +0 -0
  43. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/images/overlapping_pentagons.png +0 -0
  44. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/images/rectangles.png +0 -0
  45. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/model_reference_list.py +0 -0
  46. {cat_stack-1.4.0 → cat_stack-1.5.0}/src/catstack/prompt_tune.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cat-stack
3
- Version: 1.4.0
3
+ Version: 1.5.0
4
4
  Summary: Domain-agnostic text, image, PDF, and DOCX classification engine powered by LLMs
5
5
  Project-URL: Documentation, https://github.com/chrissoria/cat-stack#readme
6
6
  Project-URL: Issues, https://github.com/chrissoria/cat-stack/issues
@@ -20,7 +20,6 @@ Classifier: Programming Language :: Python :: Implementation :: CPython
20
20
  Classifier: Programming Language :: Python :: Implementation :: PyPy
21
21
  Requires-Python: >=3.8
22
22
  Requires-Dist: pandas
23
- Requires-Dist: regex
24
23
  Requires-Dist: requests
25
24
  Requires-Dist: tqdm
26
25
  Provides-Extra: docx
@@ -28,7 +28,6 @@ dependencies = [
28
28
  "pandas",
29
29
  "tqdm",
30
30
  "requests",
31
- "regex",
32
31
  ]
33
32
 
34
33
  [project.optional-dependencies]
@@ -1,7 +1,7 @@
1
1
  # SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
2
2
  #
3
3
  # SPDX-License-Identifier: GPL-3.0-or-later
4
- __version__ = "1.4.0"
4
+ __version__ = "1.5.0"
5
5
  __author__ = "Chris Soria"
6
6
  __email__ = "chrissoria@berkeley.edu"
7
7
  __title__ = "cat-stack"
@@ -818,13 +818,16 @@ def _run_one_sync_model(
818
818
  multi_label=prompt_params.get("multi_label", True),
819
819
  )
820
820
  try:
821
- raw = client.complete(
821
+ raw, err = client.complete(
822
822
  messages=messages,
823
823
  json_schema=json_schema,
824
824
  creativity=creativity,
825
825
  thinking_budget=thinking_budget if thinking_budget and thinking_budget > 0 else None,
826
826
  )
827
- item_results[idx] = (extract_json(raw), None)
827
+ if err:
828
+ item_results[idx] = (None, err)
829
+ else:
830
+ item_results[idx] = (extract_json(raw), None)
828
831
  except Exception as e:
829
832
  item_results[idx] = (None, str(e))
830
833
 
@@ -7,6 +7,7 @@ without requiring provider-specific SDKs.
7
7
  """
8
8
 
9
9
  import json
10
+ import threading
10
11
  import time
11
12
  import requests
12
13
 
@@ -68,54 +69,76 @@ _HF_ROUTER_ENDPOINTS = {
68
69
  }
69
70
 
70
71
 
71
- def _detect_huggingface_endpoint(api_key: str, model: str) -> str:
72
+ def _detect_huggingface_endpoint(api_key: str, model: str, skip: set = None) -> str:
72
73
  """
73
- Test which HuggingFace endpoint works for this model.
74
-
75
- If the model name has a router suffix (e.g., ":novita"), route directly
76
- to that provider's endpoint. Otherwise tries generic router, then Together.
74
+ Probe HuggingFace endpoints to find one that supports this model.
75
+
76
+ Two call modes:
77
+ - Legacy (skip=None): probe generic + Together only. Falls back to
78
+ returning the generic base URL when nothing responds 200 — keeps
79
+ existing `image_functions` / `pdf_functions` callers behaving as
80
+ before so they can surface their own error from the eventual request.
81
+ - Lazy-fallback (skip=non-empty set): probe generic + all five known
82
+ router endpoints, skipping any in `skip`. Returns None when no
83
+ candidate responds 200 — caller (e.g., UnifiedLLMClient.complete)
84
+ should then surface the original error.
77
85
 
78
86
  Args:
79
- api_key: HuggingFace API key
80
- model: Model name to test (may include :router suffix)
87
+ api_key: HuggingFace API key.
88
+ model: Model name to test (may include `:router` suffix).
89
+ skip: optional set of base URLs to skip (typically the URL that
90
+ just failed at the call site).
81
91
 
82
92
  Returns:
83
- Base URL for the working endpoint (without /chat/completions)
93
+ Base URL (without /chat/completions) of a working endpoint, or
94
+ None when skip is non-empty and nothing worked.
84
95
  """
96
+ skip = skip or set()
85
97
  clean_model, router = _parse_hf_model_suffix(model)
86
98
 
87
- # If explicit router suffix, use that endpoint directly
99
+ # If explicit router suffix and the suffix endpoint is not skipped,
100
+ # route directly without probing.
88
101
  if router and router in _HF_ROUTER_ENDPOINTS:
89
- return _HF_ROUTER_ENDPOINTS[router]
102
+ candidate = _HF_ROUTER_ENDPOINTS[router]
103
+ if candidate not in skip:
104
+ return candidate
105
+
106
+ generic_base = PROVIDER_CONFIG["huggingface"]["endpoint"].replace("/chat/completions", "")
90
107
 
91
- # Otherwise auto-detect
92
- endpoints = [
93
- "https://router.huggingface.co/v1/chat/completions",
94
- "https://router.huggingface.co/together/v1/chat/completions",
95
- ]
108
+ if skip:
109
+ # Lazy-fallback mode: probe all known routers in priority order.
110
+ candidates_base = [generic_base] + list(_HF_ROUTER_ENDPOINTS.values())
111
+ else:
112
+ # Legacy mode: only generic + Together (preserves prior behavior
113
+ # and probe count for non-UnifiedLLMClient callers).
114
+ candidates_base = [generic_base, _HF_ROUTER_ENDPOINTS["together"]]
96
115
 
97
116
  headers = {
98
117
  "Content-Type": "application/json",
99
- "Authorization": f"Bearer {api_key}"
118
+ "Authorization": f"Bearer {api_key}",
100
119
  }
101
-
102
120
  payload = {
103
121
  "model": clean_model,
104
122
  "messages": [{"role": "user", "content": "hi"}],
105
- "max_tokens": 5
123
+ "max_tokens": 5,
106
124
  }
107
125
 
108
- for endpoint in endpoints:
126
+ for base in candidates_base:
127
+ if base in skip:
128
+ continue
109
129
  try:
110
- response = requests.post(endpoint, headers=headers, json=payload, timeout=30)
130
+ response = requests.post(f"{base}/chat/completions", headers=headers, json=payload, timeout=30)
111
131
  if response.status_code == 200:
112
- # Return the base URL (without /chat/completions)
113
- return endpoint.replace("/chat/completions", "")
132
+ return base
114
133
  except Exception:
115
134
  continue
116
135
 
117
- # Default to generic (will fail with informative error)
118
- return "https://router.huggingface.co/v1"
136
+ # Legacy callers expect a base URL even on failure (their HTTP call
137
+ # surfaces the real error). Lazy-fallback callers prefer None so they
138
+ # can surface the original error rather than retrying a known-bad URL.
139
+ if skip:
140
+ return None
141
+ return generic_base
119
142
 
120
143
 
121
144
  # =============================================================================
@@ -186,14 +209,24 @@ class UnifiedLLMClient:
186
209
  def __init__(self, provider: str, api_key: str, model: str):
187
210
  self.provider = provider.lower()
188
211
  self.api_key = api_key
189
-
190
- # Keep full model name with router suffix — the generic HF router
191
- # uses the suffix (e.g. :novita, :together) for routing.
192
212
  self.model = model
193
213
 
194
- # Auto-detect HuggingFace endpoint (but always use generic router)
214
+ # Lazy HuggingFace router fallback start with None and only
215
+ # populate when we either (a) have an explicit router suffix, or
216
+ # (b) the default endpoint returns a "wrong router" 400 on a real
217
+ # request. Avoids burning two probe POSTs (and leaking the API key
218
+ # to two endpoints) on every UnifiedLLMClient construction.
219
+ self._custom_endpoint = None
220
+ self._endpoint_lock = threading.Lock()
221
+
195
222
  if self.provider == "huggingface":
196
- _detect_huggingface_endpoint(api_key, model)
223
+ clean_model, router = _parse_hf_model_suffix(model)
224
+ if router and router in _HF_ROUTER_ENDPOINTS:
225
+ # User was explicit about the router; honour it directly and
226
+ # strip the suffix from the model name (specific-router
227
+ # endpoints expect the clean name, not the suffix).
228
+ self._custom_endpoint = f"{_HF_ROUTER_ENDPOINTS[router]}/chat/completions"
229
+ self.model = clean_model
197
230
 
198
231
  if self.provider not in PROVIDER_CONFIG:
199
232
  raise ValueError(f"Unsupported provider: {provider}. "
@@ -201,6 +234,54 @@ class UnifiedLLMClient:
201
234
 
202
235
  self.config = PROVIDER_CONFIG[self.provider]
203
236
 
237
+ def _is_hf_wrong_router_400(self, body: str) -> bool:
238
+ """True if a 400 response body indicates the current HF router doesn't
239
+ carry this model (vs. truly nonexistent or a non-routing problem).
240
+
241
+ Trigger shapes (from a smoke test against the live HF API):
242
+ - Generic router: `{"error":{"code":"model_not_supported",...}}`
243
+ - Specific router: `{"error":"Model not supported by provider XYZ"}`
244
+
245
+ Intentionally NOT triggered by `model_not_found` (no router will help
246
+ a nonexistent model), 401/403 (auth), 5xx/429 (transient), or any
247
+ other 400 unrelated to router routing.
248
+ """
249
+ if self.provider != "huggingface":
250
+ return False
251
+ return (
252
+ '"code":"model_not_supported"' in body
253
+ or "Model not supported by provider" in body
254
+ )
255
+
256
+ def _try_hf_router_fallback(self, failed_endpoint: str) -> bool:
257
+ """Find an HF router that has this model. Cache it on self.
258
+
259
+ Called from `complete()` when an HF request returns a "wrong router"
260
+ 400. Probes all five known specific routers plus the generic router,
261
+ skipping the one that just failed. Idempotent and thread-safe via
262
+ the per-instance endpoint lock — if two concurrent callers both hit
263
+ the fallback path, only one runs the probe.
264
+
265
+ Returns True if a working endpoint was found and cached (caller
266
+ should refresh and retry). Returns False if every alternative also
267
+ rejected the model (caller should surface the original error).
268
+ """
269
+ failed_base = failed_endpoint.replace("/chat/completions", "")
270
+ with self._endpoint_lock:
271
+ # Did another thread already find a different working endpoint?
272
+ if self._custom_endpoint:
273
+ current_base = self._custom_endpoint.replace("/chat/completions", "")
274
+ if current_base != failed_base:
275
+ return True
276
+
277
+ new_base = _detect_huggingface_endpoint(
278
+ self.api_key, self.model, skip={failed_base}
279
+ )
280
+ if new_base:
281
+ self._custom_endpoint = f"{new_base}/chat/completions"
282
+ return True
283
+ return False
284
+
204
285
  def _get_endpoint(self) -> str:
205
286
  """Get the API endpoint, substituting model if needed."""
206
287
  # Use custom endpoint if set (e.g., for HuggingFace router suffixes)
@@ -555,11 +636,11 @@ class UnifiedLLMClient:
555
636
  if self.provider == "claude-code":
556
637
  return self._call_claude_cli(messages, max_retries=max_retries, initial_delay=initial_delay)
557
638
 
558
- endpoint = self._get_endpoint()
559
639
  headers = self._get_headers()
560
640
  payload = self._build_payload(messages, json_schema, creativity, thinking_budget=thinking_budget, force_json=force_json)
561
641
 
562
642
  for attempt in range(max_retries):
643
+ endpoint = self._get_endpoint()
563
644
  try:
564
645
  response = requests.post(
565
646
  endpoint,
@@ -582,6 +663,12 @@ class UnifiedLLMClient:
582
663
  self._warned_no_structured = True
583
664
  payload.pop("response_format")
584
665
  continue # Retry immediately without response_format
666
+
667
+ # HuggingFace: try other routers when the current one
668
+ # rejects the model with a "wrong router" 400.
669
+ if self._is_hf_wrong_router_400(response.text):
670
+ if self._try_hf_router_fallback(endpoint):
671
+ continue # retry with the newly-cached endpoint
585
672
  if response.status_code == 404 or (response.status_code == 400 and "not found" in response.text.lower() and "model" in response.text.lower()):
586
673
  return None, f"Model '{self.model}' not found for {self.provider}"
587
674
  elif response.status_code in [401, 403]:
@@ -7,7 +7,6 @@ encoding, and other common operations used across the package.
7
7
 
8
8
  import json
9
9
  import re
10
- import regex
11
10
 
12
11
  __all__ = [
13
12
  # JSON utilities
@@ -88,15 +87,55 @@ def build_json_schema(categories: list, include_additional_properties: bool = Tr
88
87
  return schema
89
88
 
90
89
 
90
+ def _extract_balanced_json(text: str) -> str | None:
91
+ """Return the first balanced-brace JSON object substring in text, or None.
92
+
93
+ String-aware: a `{` or `}` inside a JSON string (between unescaped double
94
+ quotes) doesn't change scan depth. Replaces the prior `regex.findall` with
95
+ a recursive `(?R)` pattern — same semantics for well-formed input, but
96
+ correct on inputs like `{"summary": "see Fig {3}"}` (the regex version
97
+ truncated at the first `}` inside the string).
98
+ """
99
+ if text is None:
100
+ return None
101
+
102
+ depth = 0
103
+ start = None
104
+ in_string = False
105
+ escape = False
106
+ for i, ch in enumerate(text):
107
+ if escape:
108
+ escape = False
109
+ continue
110
+ if ch == '\\':
111
+ escape = True
112
+ continue
113
+ if ch == '"':
114
+ in_string = not in_string
115
+ continue
116
+ if in_string:
117
+ continue
118
+ if ch == '{':
119
+ if depth == 0:
120
+ start = i
121
+ depth += 1
122
+ elif ch == '}':
123
+ if depth == 0:
124
+ continue
125
+ depth -= 1
126
+ if depth == 0 and start is not None:
127
+ return text[start:i + 1]
128
+ return None
129
+
130
+
91
131
  def extract_json(reply: str) -> str:
92
132
  """Extract JSON from model reply."""
93
133
  if reply is None:
94
134
  return '{"1":"e"}'
95
135
 
96
- extracted = regex.findall(r'\{(?:[^{}]|(?R))*\}', reply, regex.DOTALL)
97
- if extracted:
98
- # Clean up the JSON string
99
- return extracted[0].replace('[', '').replace(']', '').replace('\n', '').replace(" ", '')
136
+ extracted = _extract_balanced_json(reply)
137
+ if extracted is not None:
138
+ return extracted.replace('[', '').replace(']', '').replace('\n', '').replace(" ", '')
100
139
  else:
101
140
  return '{"1":"e"}'
102
141
 
@@ -0,0 +1,265 @@
1
+ """
2
+ Web content fetching utilities for URL input type.
3
+
4
+ Provides URL detection, HTML text extraction, and batch URL fetching
5
+ for use as a preprocessing step before text classification/extraction/summarization.
6
+ """
7
+
8
+ import html as html_lib
9
+ import ipaddress
10
+ import re
11
+ import socket
12
+ from urllib.parse import urlsplit
13
+
14
+ import requests
15
+
16
+ __all__ = [
17
+ "is_url",
18
+ "fetch_url_text",
19
+ "fetch_urls",
20
+ "detect_url_input",
21
+ "strip_html_tags",
22
+ ]
23
+
24
+ _DEFAULT_TIMEOUT = 30
25
+
26
+ _MAX_CONTENT_CHARS = 50000
27
+
28
+ # Hard cap on bytes pulled from the response before bailing — guards against
29
+ # OOM on a hostile or accidentally-huge URL. 5x slack over the char cap so
30
+ # HTML markup that gets stripped later still leaves real payload room.
31
+ _MAX_RESPONSE_BYTES = 5 * _MAX_CONTENT_CHARS
32
+
33
+ # Schemes fetch_url_text will follow. Anything else (file://, ftp://, data:,
34
+ # javascript:, ...) is rejected at validation time.
35
+ _ALLOWED_SCHEMES = frozenset({"http", "https"})
36
+
37
+ _USER_AGENT = (
38
+ "Mozilla/5.0 (compatible; CatStack/1.0; "
39
+ "+https://github.com/chrissoria/cat-stack)"
40
+ )
41
+
42
+
43
+ def is_url(s) -> bool:
44
+ """
45
+ Check whether a string is a well-formed http(s) URL.
46
+
47
+ Structural check only — no DNS resolution, no network call. Rejects
48
+ strings with embedded control characters, non-http(s) schemes, and
49
+ missing netloc.
50
+ """
51
+ if not isinstance(s, str):
52
+ return False
53
+ s = s.strip()
54
+ if any(c in s for c in ("\r", "\n", "\x00")):
55
+ return False
56
+ try:
57
+ parts = urlsplit(s)
58
+ except Exception:
59
+ return False
60
+ return parts.scheme in _ALLOWED_SCHEMES and bool(parts.netloc)
61
+
62
+
63
+ def detect_url_input(items) -> bool:
64
+ """
65
+ Check whether input data is a collection of URLs.
66
+
67
+ Inspects the first non-null item in the iterable. Returns True if it
68
+ looks like a URL.
69
+ """
70
+ import pandas as pd
71
+
72
+ if isinstance(items, str):
73
+ return is_url(items)
74
+
75
+ if hasattr(items, "__iter__"):
76
+ for item in items:
77
+ if item is not None:
78
+ try:
79
+ if pd.isna(item):
80
+ continue
81
+ except (TypeError, ValueError):
82
+ pass
83
+ return is_url(str(item))
84
+
85
+ return False
86
+
87
+
88
+ def _validate_url_safe(url):
89
+ """
90
+ Validate a URL for safe fetching: structure + SSRF host guard.
91
+
92
+ Returns (cleaned_url, error_message). error_message is None on success.
93
+
94
+ The SSRF guard resolves the hostname via socket.getaddrinfo and rejects
95
+ if ANY returned address is private, loopback, link-local, reserved,
96
+ multicast, or unspecified. Catches AWS metadata (169.254.169.254),
97
+ localhost (127.0.0.1, ::1), RFC1918, GCP metadata host, and similar
98
+ internal targets before any HTTP request goes out.
99
+
100
+ Does NOT defend against DNS rebinding (resolve-once-then-reconnect to
101
+ a different IP); that requires a custom HTTPAdapter and is out of
102
+ scope here.
103
+ """
104
+ if not isinstance(url, str):
105
+ return "", "url must be a string"
106
+ url = url.strip()
107
+ if any(c in url for c in ("\r", "\n", "\x00")):
108
+ return "", "url contains control characters"
109
+ try:
110
+ parts = urlsplit(url)
111
+ except Exception as e:
112
+ return "", f"could not parse url: {e}"
113
+ if parts.scheme not in _ALLOWED_SCHEMES:
114
+ return "", f"scheme must be http or https; got {parts.scheme!r}"
115
+ if not parts.netloc:
116
+ return "", "url has empty netloc"
117
+ hostname = parts.hostname
118
+ if not hostname:
119
+ return "", "url has empty hostname"
120
+
121
+ try:
122
+ addrinfo = socket.getaddrinfo(hostname, None)
123
+ except socket.gaierror as e:
124
+ return "", f"could not resolve {hostname!r}: {e}"
125
+
126
+ for info in addrinfo:
127
+ ip_str = info[4][0]
128
+ try:
129
+ ip = ipaddress.ip_address(ip_str)
130
+ except ValueError:
131
+ return "", f"resolved address {ip_str!r} is not a valid IP"
132
+ if (
133
+ ip.is_private
134
+ or ip.is_loopback
135
+ or ip.is_link_local
136
+ or ip.is_reserved
137
+ or ip.is_multicast
138
+ or ip.is_unspecified
139
+ ):
140
+ return "", (
141
+ f"{hostname!r} resolves to {ip_str} (private/internal); "
142
+ f"refusing to fetch as an SSRF guard"
143
+ )
144
+
145
+ return url, None
146
+
147
+
148
+ def strip_html_tags(html: str) -> str:
149
+ """
150
+ Extract readable text from an HTML string.
151
+
152
+ Removes non-content elements (navigation, headers, footers, sidebars,
153
+ forms, scripts, styles), strips remaining tags, collapses whitespace,
154
+ and decodes HTML entities.
155
+ """
156
+ text = html
157
+
158
+ _JUNK_TAGS = (
159
+ "script", "style", "nav", "header", "footer", "aside",
160
+ "noscript", "iframe", "form", "svg",
161
+ )
162
+ for tag in _JUNK_TAGS:
163
+ text = re.sub(
164
+ rf"<{tag}[^>]*>.*?</{tag}>",
165
+ "",
166
+ text,
167
+ flags=re.DOTALL | re.IGNORECASE,
168
+ )
169
+
170
+ for tag in ("input", "meta", "link", "img"):
171
+ text = re.sub(rf"<{tag}[^>]*/?\s*>", "", text, flags=re.IGNORECASE)
172
+
173
+ text = re.sub(r"<[^>]+>", " ", text)
174
+ text = re.sub(r"\s+", " ", text).strip()
175
+ text = html_lib.unescape(text)
176
+ return text
177
+
178
+
179
+ def fetch_url_text(url: str, timeout: int = _DEFAULT_TIMEOUT):
180
+ """
181
+ Fetch a single URL and extract its text content.
182
+
183
+ Pre-flight: the URL's scheme and hostname are validated, and the
184
+ hostname is resolved; if it points at a private/internal IP, the
185
+ fetch is refused (SSRF guard). The response body is streamed and
186
+ capped to prevent OOM on very large pages. TLS errors are surfaced —
187
+ there is no silent verify=False fallback.
188
+
189
+ Returns (text, error). error is None on success.
190
+ """
191
+ cleaned_url, validation_error = _validate_url_safe(url)
192
+ if validation_error:
193
+ return "", f"Error fetching {url}: {validation_error}"
194
+
195
+ headers = {"User-Agent": _USER_AGENT}
196
+ try:
197
+ with requests.get(
198
+ cleaned_url,
199
+ headers=headers,
200
+ timeout=timeout,
201
+ stream=True,
202
+ ) as response:
203
+ response.raise_for_status()
204
+ content_type = response.headers.get("Content-Type", "")
205
+ encoding = response.encoding
206
+
207
+ chunks = []
208
+ bytes_read = 0
209
+ for chunk in response.iter_content(chunk_size=8192):
210
+ if not chunk:
211
+ continue
212
+ chunks.append(chunk)
213
+ bytes_read += len(chunk)
214
+ if bytes_read > _MAX_RESPONSE_BYTES:
215
+ break
216
+ raw = b"".join(chunks)
217
+
218
+ encoding = encoding or "utf-8"
219
+ try:
220
+ body = raw.decode(encoding, errors="replace")
221
+ except (LookupError, TypeError):
222
+ body = raw.decode("utf-8", errors="replace")
223
+
224
+ if (
225
+ "text/html" in content_type
226
+ or "text/plain" in content_type
227
+ or not content_type
228
+ ):
229
+ text = strip_html_tags(body)
230
+ else:
231
+ text = body
232
+
233
+ if len(text) > _MAX_CONTENT_CHARS:
234
+ text = text[:_MAX_CONTENT_CHARS] + (
235
+ f"\n\n[Content truncated at {_MAX_CONTENT_CHARS} characters]"
236
+ )
237
+
238
+ return text, None
239
+
240
+ except requests.exceptions.Timeout:
241
+ return "", f"Timeout after {timeout}s fetching {url}"
242
+ except requests.exceptions.SSLError as e:
243
+ return "", f"SSL/TLS error fetching {url}: {e}"
244
+ except requests.exceptions.HTTPError as e:
245
+ return "", f"HTTP {e.response.status_code} fetching {url}"
246
+ except Exception as e:
247
+ return "", f"Error fetching {url}: {e}"
248
+
249
+
250
+ def fetch_urls(urls, timeout: int = _DEFAULT_TIMEOUT):
251
+ """
252
+ Fetch content from a list of URLs.
253
+
254
+ Returns list of (original_url, fetched_text, error) tuples. On success
255
+ error is None; on failure fetched_text is "".
256
+ """
257
+ results = []
258
+ for url in urls:
259
+ url_str = str(url).strip()
260
+ if not is_url(url_str):
261
+ results.append((url_str, "", f"Not a valid URL: {url_str}"))
262
+ continue
263
+ text, error = fetch_url_text(url_str, timeout=timeout)
264
+ results.append((url_str, text, error))
265
+ return results
@@ -2,15 +2,17 @@
2
2
  #
3
3
  # SPDX-License-Identifier: GPL-3.0-or-later
4
4
 
5
- from .all_calls import (
5
+ from .stepback import (
6
6
  get_stepback_insight_openai,
7
7
  get_stepback_insight_anthropic,
8
8
  get_stepback_insight_google,
9
9
  get_stepback_insight_mistral,
10
+ )
11
+ from .CoVe import (
10
12
  chain_of_verification_openai,
11
- chain_of_verification_google,
12
13
  chain_of_verification_anthropic,
13
- chain_of_verification_mistral
14
+ chain_of_verification_google,
15
+ chain_of_verification_mistral,
14
16
  )
15
17
 
16
18
  __all__ = [
@@ -22,4 +24,4 @@ __all__ = [
22
24
  'chain_of_verification_anthropic',
23
25
  'chain_of_verification_google',
24
26
  'chain_of_verification_mistral',
25
- ]
27
+ ]