voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
extractor/llm_extract.py
ADDED
|
@@ -0,0 +1,372 @@
|
|
|
1
|
+
"""
|
|
2
|
+
extractor/llm_extract.py — LLM-assisted entity extraction.
|
|
3
|
+
|
|
4
|
+
Runs AFTER regex and NER — only on text chunks that already contain at least
|
|
5
|
+
one entity (to avoid wasting API calls on irrelevant content).
|
|
6
|
+
|
|
7
|
+
Accepts an *llm* object (any LangChain chat model) as a parameter — does not
|
|
8
|
+
instantiate LLMs internally.
|
|
9
|
+
|
|
10
|
+
Public interface
|
|
11
|
+
----------------
|
|
12
|
+
async extract_with_llm(text, llm, existing_entities, max_chunk_chars, page_hash, disable_cache) → dict[str, list[str]]
|
|
13
|
+
|
|
14
|
+
Configuration
|
|
15
|
+
-------------
|
|
16
|
+
- Set DISABLE_EXTRACTION_CACHE=true in .env to disable caching entirely
|
|
17
|
+
- Use --no-cache CLI flag to bypass cache for a specific run
|
|
18
|
+
- Cache TTL is 30 days
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import hashlib
|
|
24
|
+
import json
|
|
25
|
+
import logging
|
|
26
|
+
import os
|
|
27
|
+
from datetime import datetime, timezone, timedelta
|
|
28
|
+
from typing import Optional
|
|
29
|
+
|
|
30
|
+
from config import DISABLE_EXTRACTION_CACHE
|
|
31
|
+
|
|
32
|
+
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Lifetime of a cached extraction row, in days.
_CACHE_TTL_DAYS = 30
# Default upper bound on the number of characters sent to the LLM per chunk.
_DEFAULT_MAX_CHUNK_CHARS = 12000

# ---------------------------------------------------------------------------
# Prompt template
# ---------------------------------------------------------------------------

# Instruction text sent with every chunk.  ``{chunk}`` is the only format
# placeholder; the pieces are joined without any separator, so every piece
# carries its own trailing space or newlines.
_PROMPT_TEMPLATE = "".join(
    [
        # Task statement and the exact JSON keys the model must return.
        "You are a threat intelligence analyst. Extract structured entities from the ",
        "following dark web content. Return ONLY valid JSON with these keys: ",
        "crypto_wallets, threat_actor_handles, malware_names, dates, urls, ",
        "cve_identifiers, mitre_techniques, file_hashes_md5, file_hashes_sha1, file_hashes_sha256. ",
        "Each key maps to a list of strings. If none found, use empty list. ",
        "Do not include any text outside the JSON object.\n\n",
        # Hash rules: full-length hashes only, truncated ones are skipped.
        "CRITICAL: File hashes (MD5, SHA1, SHA256) must be extracted in their complete, ",
        "untruncated form. MD5 hashes are exactly 32 hex characters. ",
        "SHA1 hashes are exactly 40 hex characters. ",
        "SHA256 hashes are exactly 64 hex characters. ",
        "If a hash appears truncated in the source text (e.g. 'a3f8b2...'), ",
        "do NOT extract it — skip truncated hashes entirely.\n\n",
        # CVE identifier format.
        "CVE: Common Vulnerabilities and Exposures identifiers in format CVE-YYYY-NNNNN. ",
        "Extract the complete ID including year and number.\n\n",
        # MITRE ATT&CK technique identifier format.
        "MITRE_TECHNIQUE: MITRE ATT&CK technique identifiers in format TNNNN ",
        "or TNNNN.NNN (sub-techniques). These map to adversary tactics and are ",
        "critical for detection engineering.\n\n",
        # The page text itself goes last.
        "Content:\n{chunk}",
    ]
)

# Map LLM output keys → internal entity type constants.
# Insertion order matches the key order requested in the prompt.
_LLM_KEY_TO_TYPE: dict[str, str] = dict(
    crypto_wallets="BITCOIN_ADDRESS",
    threat_actor_handles="THREAT_ACTOR_HANDLE",
    malware_names="MALWARE_FAMILY",
    dates="DATE",
    urls="ONION_URL",
    cve_identifiers="CVE_NUMBER",
    mitre_techniques="MITRE_TECHNIQUE",
    file_hashes_md5="FILE_HASH_MD5",
    file_hashes_sha1="FILE_HASH_SHA1",
    file_hashes_sha256="FILE_HASH_SHA256",
)
|
|
75
|
+
|
|
76
|
+
# ---------------------------------------------------------------------------
|
|
77
|
+
# Cache layer
|
|
78
|
+
# ---------------------------------------------------------------------------
|
|
79
|
+
|
|
80
|
+
def _get_cache_disabled(flag: Optional[bool] = None) -> bool:
    """Report whether the extraction cache must be bypassed.

    An explicit ``flag=True`` (the per-run override) always disables the
    cache.  For any other value (``None`` or ``False``) the module-wide
    ``DISABLE_EXTRACTION_CACHE`` setting imported from ``config`` is returned
    as-is, so ``flag=False`` cannot re-enable a globally disabled cache.
    """
    return True if flag is True else DISABLE_EXTRACTION_CACHE
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _compute_page_hash(content: str) -> str:
    """Return the hex SHA-256 digest of *content*, used as the cache key.

    The text is encoded as UTF-8 with the ``surrogatepass`` error handler so
    that a string containing lone surrogate code points is hashed instead of
    raising ``UnicodeEncodeError``.  This matters because the caller computes
    the hash outside its own error handling.  For well-formed text the digest
    is identical to the one produced by a plain UTF-8 encode.

    Args:
        content: Page text to fingerprint.

    Returns:
        A 64-character lowercase hexadecimal digest.
    """
    return hashlib.sha256(content.encode("utf-8", "surrogatepass")).hexdigest()
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _load_from_cache(page_hash: str) -> Optional[dict[str, list[str]]]:
    """Return the cached extraction for *page_hash*, or ``None`` on a miss.

    A miss is reported when ``DATABASE_URL`` is unset, when no row exists for
    the hash, when the row's ``expires_at`` lies in the past, or when anything
    goes wrong during the lookup (the error is logged as a warning and
    swallowed).  On a hit the stored ``entities_json`` text is decoded with
    ``json.loads`` and returned.
    """
    if not os.getenv("DATABASE_URL"):
        return None

    lookup_sql = """
        SELECT entities_json, expires_at
        FROM page_extraction_cache
        WHERE page_hash = :page_hash
    """

    try:
        # Imported lazily so the module loads without the database stack.
        from sqlalchemy import text
        from db.session import get_session

        with get_session() as session:
            row = session.execute(text(lookup_sql), {"page_hash": page_hash}).fetchone()

            if row is None:
                return None

            payload, deadline = row
            # A timezone-naive timestamp is interpreted as UTC so that it can
            # be compared with the timezone-aware "now" below.
            if deadline.tzinfo is None:
                deadline = deadline.replace(tzinfo=timezone.utc)

            if deadline < datetime.now(timezone.utc):
                logger.debug("Cache expired for page_hash=%s", page_hash[:16])
                return None

            logger.info("Cache HIT for page_hash=%s", page_hash[:16])
            return json.loads(payload)

    except Exception as exc:
        # Best effort: a broken cache must never break extraction.
        logger.warning("Cache lookup failed: %s", exc)
        return None
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _save_to_cache(page_hash: str, entities: dict[str, list[str]]) -> None:
    """Upsert *entities* into ``page_extraction_cache`` under *page_hash*.

    Does nothing when ``DATABASE_URL`` is unset.  The row expires
    ``_CACHE_TTL_DAYS`` days from now; an existing row with the same hash has
    its payload and both timestamps overwritten.  Any failure is logged as a
    warning and swallowed, so the function never raises.
    """
    if not os.getenv("DATABASE_URL"):
        return

    upsert_sql = """
        INSERT INTO page_extraction_cache (page_hash, entities_json, extracted_at, expires_at)
        VALUES (:page_hash, :entities_json, :extracted_at, :expires_at)
        ON CONFLICT (page_hash) DO UPDATE SET
            entities_json = EXCLUDED.entities_json,
            extracted_at = EXCLUDED.extracted_at,
            expires_at = EXCLUDED.expires_at
    """

    try:
        # Imported lazily so the module loads without the database stack.
        from sqlalchemy import text
        from db.session import get_session

        payload = json.dumps(entities)
        valid_until = datetime.now(timezone.utc) + timedelta(days=_CACHE_TTL_DAYS)

        with get_session() as session:
            row_values = {
                "page_hash": page_hash,
                "entities_json": payload,
                "extracted_at": datetime.now(timezone.utc),
                "expires_at": valid_until,
            }
            session.execute(text(upsert_sql), row_values)
            session.commit()

        logger.info("Cache saved for page_hash=%s", page_hash[:16])

    except Exception as exc:
        # Best effort: failing to cache must not fail the extraction itself.
        logger.warning("Cache save failed: %s", exc)
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
# ---------------------------------------------------------------------------
|
|
168
|
+
# Public interface
|
|
169
|
+
# ---------------------------------------------------------------------------
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _clean_llm_values(raw_values: list) -> list[str]:
    """Turn one LLM-returned list into stripped, non-empty strings.

    Only strings and real numbers are accepted.  ``None``, booleans, nested
    lists and objects are dropped rather than stringified, so a stray JSON
    ``null`` can never surface as the entity ``"None"``.
    """
    cleaned: list[str] = []
    for value in raw_values:
        # bool is a subclass of int, so it has to be excluded explicitly.
        if isinstance(value, bool) or not isinstance(value, (str, int, float)):
            continue
        as_text = str(value).strip()
        if as_text:
            cleaned.append(as_text)
    return cleaned


async def extract_with_llm(
    text: str,
    llm,
    existing_entities: dict[str, list[str]],
    max_chunk_chars: int = _DEFAULT_MAX_CHUNK_CHARS,
    page_hash: Optional[str] = None,
    disable_cache: Optional[bool] = None,
) -> dict[str, list[str]]:
    """
    Augment *existing_entities* with entities found by the LLM.

    - If *llm* is None, returns *existing_entities* unchanged.
    - Only processes text when *existing_entities* has at least one value
      (to avoid API calls on irrelevant pages).
    - Splits text into overlapping chunks of *max_chunk_chars* with a 200-char
      overlap to avoid splitting entities at boundaries.
    - Merges and deduplicates results from every chunk into *existing_entities*.
      LLM values are stripped; blank and non-scalar values are discarded.
    - Uses content-hash caching to skip LLM calls for identical content.  The
      result is cached only when every chunk produced usable LLM output, so a
      transient LLM failure is retried on the next run instead of being
      remembered for the whole cache TTL.
    - Entity confidence increases with chunk occurrence count (each value is
      counted at most once per chunk).
    - Invalid JSON from the LLM is logged as a warning; that chunk contributes
      no results rather than raising.
    - Never raises: any unexpected error is logged and the entities gathered
      so far by regex/NER are returned.

    Args:
        text: Full page text.
        llm: Chat model exposing an async ``ainvoke(prompt)``; may be None.
        existing_entities: Entity type -> values already found by regex/NER.
        max_chunk_chars: Maximum characters per LLM request.
        page_hash: Cache key; computed from *text* when omitted.
        disable_cache: Pass True to bypass the cache for this call.

    Returns:
        Entity type -> deduplicated list of values.
    """
    if llm is None:
        return existing_entities

    # Skip expensive LLM calls if regex/NER found nothing at all
    if not any(existing_entities.values()):
        return existing_entities

    try:
        cache_enabled = not _get_cache_disabled(disable_cache)

        # Determine page hash for caching
        if page_hash is None:
            page_hash = _compute_page_hash(text)

        # Check cache first (unless disabled)
        if cache_enabled:
            cached = _load_from_cache(page_hash)
            if cached is not None:
                return _merge_existing_and_cached(existing_entities, cached)

        # Filter blocked entities before LLM to avoid processing noise
        # Only apply to NER types (regex types have precise patterns, skip blocklist)
        try:
            from extractor.normalizer import is_blocked_entity, _REGEX_TYPES
            filtered: dict[str, list[str]] = {}
            for entity_type, values in existing_entities.items():
                if entity_type in _REGEX_TYPES:
                    filtered[entity_type] = list(values)
                else:
                    kept = [v for v in values if not is_blocked_entity(entity_type, v)]
                    if kept:
                        filtered[entity_type] = kept
            if not filtered:
                # Still cache the empty result to avoid repeated LLM calls
                if cache_enabled:
                    _save_to_cache(page_hash, {})
                return existing_entities
            existing_entities = filtered
        except ImportError:
            # The normalizer is optional; without it nothing is filtered.
            pass

        chunks = _chunk_text(text, max_chunk_chars, overlap=200)

        # Track entity occurrences across chunks for confidence scoring
        entity_occurrences: dict[str, dict[str, int]] = {
            entity_type: {} for entity_type in _LLM_KEY_TO_TYPE.values()
        }

        result: dict[str, list[str]] = {k: list(v) for k, v in existing_entities.items()}
        # Every LLM-backed entity type is present in the result (possibly as
        # an empty list) and its regex/NER values are deduplicated up front.
        for entity_type in _LLM_KEY_TO_TYPE.values():
            result[entity_type] = _dedup(result.get(entity_type, []))

        failed_chunks = 0
        for chunk in chunks:
            chunk_result = await _extract_chunk(chunk, llm)
            # An empty or non-object reply means the call or the JSON parsing
            # failed for this chunk; it contributes nothing.
            if not isinstance(chunk_result, dict) or not chunk_result:
                failed_chunks += 1
                continue

            for llm_key, entity_type in _LLM_KEY_TO_TYPE.items():
                raw_values = chunk_result.get(llm_key, [])
                if not isinstance(raw_values, list):
                    continue

                cleaned = _clean_llm_values(raw_values)
                if not cleaned:
                    continue

                # Count each distinct value once per chunk so the tally is a
                # true "number of chunks containing it".
                counts = entity_occurrences[entity_type]
                for value in set(cleaned):
                    counts[value] = counts.get(value, 0) + 1

                result[entity_type] = _dedup(result[entity_type] + cleaned)

        # Store result in cache (even if empty), but never cache a run in
        # which some chunk failed: that would hide the page from the LLM
        # until the cache entry expires.
        if cache_enabled:
            if failed_chunks == 0:
                _save_to_cache(page_hash, result)
            else:
                logger.info(
                    "Skipping cache save for page_hash=%s: %d/%d chunks returned no usable LLM output",
                    page_hash[:16], failed_chunks, len(chunks),
                )

        # Add confidence info via logging (could be extended to return metadata)
        _log_confidence_stats(entity_occurrences, len(chunks))

        return result

    except Exception:
        logger.exception("extract_with_llm encountered an unexpected error")
        return existing_entities
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def _merge_existing_and_cached(
    existing: dict[str, list[str]],
    cached: dict[str, list[str]],
) -> dict[str, list[str]]:
    """
    Combine cached entities with freshly extracted ones.

    Starts from a shallow copy of *cached*.  For every entity type in
    *existing*, its values are placed first and followed by the cached
    values, with duplicates removed; types found only in *existing* are
    copied over as new lists.  Types found only in *cached* are kept as-is.
    """
    combined = dict(cached)
    for entity_type, values in existing.items():
        fresh = list(values)
        combined[entity_type] = (
            _dedup(fresh + combined[entity_type])
            if entity_type in combined
            else fresh
        )
    return combined
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def _log_confidence_stats(
    entity_occurrences: dict[str, dict[str, int]],
    total_chunks: int,
) -> None:
    """Emit a debug line for every entity value counted more than once.

    *entity_occurrences* maps entity type -> value -> occurrence count.  The
    reported confidence is ``count / total_chunks``; values seen only once
    are not logged, and only the first 20 characters of a value are shown.
    """
    repeated = (
        (entity_type, value, count)
        for entity_type, counts in entity_occurrences.items()
        for value, count in counts.items()
        if count > 1
    )
    for entity_type, value, count in repeated:
        ratio = count / total_chunks
        logger.debug(
            "Entity %s=%s found in %d/%d chunks (confidence=%.2f)",
            entity_type, value[:20], count, total_chunks, ratio
        )
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
# ---------------------------------------------------------------------------
|
|
311
|
+
# Internal helpers
|
|
312
|
+
# ---------------------------------------------------------------------------
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def _chunk_text(text: str, max_chars: int, overlap: int) -> list[str]:
    """
    Split *text* into chunks of at most *max_chars* with *overlap* char overlap.

    The last chunk may be shorter.  Text that already fits in one chunk is
    returned as a single-element list without copying.

    *overlap* is clamped to the range ``0 .. max_chars - 1`` so that every
    chunk starts strictly after the previous one.  Without the clamp an
    overlap greater than or equal to *max_chars* would never advance and the
    loop would not terminate; a negative overlap would skip text.

    Args:
        text: Text to split.
        max_chars: Maximum length of each chunk; must be at least 1 when
            *text* does not fit in a single chunk.
        overlap: Number of trailing characters repeated at the start of the
            next chunk.

    Returns:
        The chunks in document order (always at least one).

    Raises:
        ValueError: If *text* needs splitting and *max_chars* is below 1.
    """
    if len(text) <= max_chars:
        return [text]

    if max_chars < 1:
        raise ValueError(f"max_chars must be at least 1, got {max_chars}")

    overlap = max(0, min(overlap, max_chars - 1))
    step = max_chars - overlap  # always >= 1, so the loop makes progress

    chunks: list[str] = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + max_chars])
        # Stop as soon as a chunk reaches the end of the text; later start
        # offsets would only yield fragments already covered by this chunk.
        if start + max_chars >= len(text):
            break
    return chunks
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
async def _extract_chunk(chunk: str, llm) -> dict:
    """
    Send one chunk to the LLM and return the parsed JSON object.

    The reply text is taken from ``response.content`` when that attribute
    exists, otherwise from ``str(response)``.  A surrounding Markdown code
    fence (with or without a language tag, multi-line or single-line) is
    removed before parsing.

    Args:
        chunk: Text to analyse; substituted into the prompt template.
        llm: Object exposing an async ``ainvoke(prompt)`` method.

    Returns:
        The decoded JSON object.  An empty dict is returned when the LLM call
        fails, when the reply is not valid JSON, or when the reply is valid
        JSON but not an object (list, string, number, ``null``); callers use
        ``.get`` on the result, so anything other than a dict would break
        them.
    """
    try:
        prompt = _PROMPT_TEMPLATE.format(chunk=chunk)
        response = await llm.ainvoke(prompt)
        content = response.content if hasattr(response, "content") else str(response)
        content = content.strip()

        # Strip markdown code fences if the LLM wrapped output in them
        if content.startswith("```"):
            _fence_line, newline, remainder = content.partition("\n")
            # Multi-line: drop the whole opening line (it may carry a language
            # tag).  Single-line: drop just the three backticks.
            content = remainder if newline else content[3:]
            content = content.rsplit("```", 1)[0].strip()

        parsed = json.loads(content)

    except json.JSONDecodeError as exc:
        logger.warning("LLM returned invalid JSON for chunk (len=%d): %s", len(chunk), exc)
        return {}
    except Exception as exc:
        logger.warning("LLM chunk extraction failed: %s", exc)
        return {}

    if not isinstance(parsed, dict):
        logger.warning(
            "LLM returned JSON %s instead of an object for chunk (len=%d)",
            type(parsed).__name__, len(chunk),
        )
        return {}

    return parsed
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
def _dedup(values) -> list[str]:
    """Return the items of *values* without duplicates, keeping first-seen order."""
    # dict keys are unique and remember insertion order.
    return list(dict.fromkeys(values))
|