voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
extractor/pipeline.py
ADDED
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
"""
|
|
2
|
+
extractor/pipeline.py — Pipeline orchestrator for entity extraction.
|
|
3
|
+
|
|
4
|
+
Single entry point that the rest of the system calls. Runs:
|
|
5
|
+
1. Regex extraction (extractor/regex_patterns.py)
|
|
6
|
+
2. NER extraction (extractor/ner.py)
|
|
7
|
+
3. LLM extraction (extractor/llm_extract.py) — optional
|
|
8
|
+
4. Normalisation (extractor/normalizer.py)
|
|
9
|
+
5. DB persistence (extractor/normalizer.merge_with_db)
|
|
10
|
+
|
|
11
|
+
Public interface
|
|
12
|
+
----------------
|
|
13
|
+
async extract_entities_from_page(...) → ExtractionResult
|
|
14
|
+
async extract_entities_from_pages(...) → list[ExtractionResult]
|
|
15
|
+
|
|
16
|
+
ExtractionResult is a dataclass exported through extractor/__init__.py.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import asyncio
|
|
22
|
+
import logging
|
|
23
|
+
from dataclasses import dataclass, field
|
|
24
|
+
from typing import Optional
|
|
25
|
+
import uuid
|
|
26
|
+
|
|
27
|
+
from extractor.regex_patterns import extract_all as _regex_extract_all
|
|
28
|
+
from extractor.ner import extract_named_entities as _ner_extract
|
|
29
|
+
from extractor.llm_extract import extract_with_llm as _llm_extract
|
|
30
|
+
from extractor.normalizer import normalize_entities as _normalize, merge_with_db as _merge_db, NormalizedEntity, resolve_entity_type_conflicts as _resolve_conflicts
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
# Upper bound on how many entities of a given type survive per-type capping.
# Types that are not listed here have no per-type limit.
PER_TYPE_CAPS = dict(
    ORGANIZATION_NAME=50,
    PERSON_NAME=30,
    LOCATION=20,
    THREAT_ACTOR_HANDLE=80,
)

# Ranking tiers used as the secondary sort key when the global entity cap is
# applied: a lower tier number ranks first. Types missing from every tier fall
# back to the default handled in _type_priority().
_ENTITY_TYPE_PRIORITY = {
    1: frozenset((
        "CVE",
        "CVE_NUMBER",
        "IP_ADDRESS",
        "IPV6_ADDRESS",
        "FILE_HASH",
        "FILE_HASH_MD5",
        "FILE_HASH_SHA1",
        "FILE_HASH_SHA256",
        "FILE_HASH_SHA512",
        "ONION_URL",
        "DOMAIN",
        "DOMAIN_NAME",
    )),
    2: frozenset((
        "MALWARE_FAMILY",
        "RANSOMWARE_GROUP",
        "THREAT_ACTOR",
        "THREAT_ACTOR_HANDLE",
    )),
    3: frozenset((
        "BITCOIN_ADDRESS",
        "MONERO_ADDRESS",
        "ETHEREUM_ADDRESS",
        "WALLET",
    )),
    4: frozenset(("EMAIL_ADDRESS", "PGP_KEY_BLOCK")),
    5: frozenset(("ORGANIZATION_NAME", "PERSON_NAME")),
}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _type_priority(entity_type: str) -> int:
    """Return the ranking tier for *entity_type* (lower number ranks first).

    The tiers come from ``_ENTITY_TYPE_PRIORITY``; a type that appears in no
    tier gets the fallback value 99.
    """
    matching_tiers = (
        tier
        for tier, members in _ENTITY_TYPE_PRIORITY.items()
        if entity_type in members
    )
    # First tier that lists the type wins; 99 when none does.
    return next(matching_tiers, 99)
|
|
55
|
+
|
|
56
|
+
# ---------------------------------------------------------------------------
|
|
57
|
+
# Result dataclass
|
|
58
|
+
# ---------------------------------------------------------------------------
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
class ExtractionResult:
    """Outcome of running the extraction pipeline on one page."""

    # URL of the page the entities were extracted from.
    page_url: str
    # Number of normalized entities produced for the page.
    entity_count: int
    # Entity type -> number of normalized entities of that type.
    entities_by_type: dict[str, int] = field(default_factory=dict)
    # Identifiers returned by the DB merge step; empty when nothing was persisted.
    entity_ids: list = field(default_factory=list)
    # Non-fatal error messages collected while processing the page.
    errors: list[str] = field(default_factory=list)
    # Normalized entity objects attached to this page.
    entities: list = field(default_factory=list)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# ---------------------------------------------------------------------------
|
|
72
|
+
# Public interface
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
async def extract_entities_from_page(
    page_text: str,
    page_url: str,
    page_id: Optional[int] = None,
    investigation_id: Optional[uuid.UUID] = None,
    llm=None,
    run_llm_extraction: bool = False,
    disable_cache: Optional[bool] = None,
    persist: bool = True,
) -> ExtractionResult:
    """
    Run the full extraction pipeline for a single page.

    Each stage is wrapped in its own try/except so a failure in one stage
    never prevents later stages from running. Non-fatal errors are collected
    in ExtractionResult.errors as "<stage>: <message>" strings.

    Args:
        page_text: Text of the page to extract entities from.
        page_url: URL of the page; used in log messages and passed to the
            normalizer.
        page_id: Optional page identifier passed to the normalizer.
        investigation_id: Optional investigation identifier passed to the DB
            merge step.
        llm: LLM handle passed to the LLM extractor. The LLM stage only runs
            when this is not None and *run_llm_extraction* is true.
        run_llm_extraction: Enable the optional LLM stage.
        disable_cache: Forwarded unchanged to the LLM extractor.
        persist: Set to False to skip DB persistence (used when collecting
            entities for batch capping before write).

    Returns:
        An ExtractionResult. ``entities`` always holds the normalized
        entities; ``entity_ids`` is empty when *persist* is False or the DB
        merge failed.
    """
    errors: list[str] = []

    # -----------------------------------------------------------------------
    # Stage 1 — Regex
    # -----------------------------------------------------------------------
    try:
        regex_entities = _regex_extract_all(page_text)
    except Exception as exc:
        logger.error("Regex extraction failed for %s: %s", page_url, exc)
        errors.append(f"regex: {exc}")
        regex_entities = {}

    # -----------------------------------------------------------------------
    # Stage 2 — NER
    # -----------------------------------------------------------------------
    try:
        ner_entities = _ner_extract(page_text)
    except Exception as exc:
        logger.error("NER extraction failed for %s: %s", page_url, exc)
        errors.append(f"ner: {exc}")
        ner_entities = {}

    # Merge regex + NER. For a type reported by both, the NER values are
    # appended after the regex values and duplicates are dropped, so regex
    # hits keep their position at the front of the list.
    combined: dict[str, list[str]] = dict(regex_entities)
    for entity_type, values in ner_entities.items():
        if entity_type in combined:
            combined[entity_type] = _dedup(combined[entity_type] + values)
        else:
            combined[entity_type] = list(values)

    # -----------------------------------------------------------------------
    # Stage 3 — LLM (optional)
    # -----------------------------------------------------------------------
    if run_llm_extraction and llm is not None:
        try:
            import hashlib

            # Content hash handed to the LLM extractor; None for empty text.
            page_hash = hashlib.sha256(page_text.encode()).hexdigest() if page_text else None
            combined = await _llm_extract(
                page_text, llm, combined, page_hash=page_hash, disable_cache=disable_cache
            )
        except Exception as exc:
            # On failure `combined` keeps the regex + NER results.
            logger.error("LLM extraction failed for %s: %s", page_url, exc)
            errors.append(f"llm: {exc}")

    # -----------------------------------------------------------------------
    # Stage 4 — Normalise
    # -----------------------------------------------------------------------
    try:
        normalized = _normalize(combined, page_url, page_id, page_text=page_text)
    except Exception as exc:
        logger.error("Normalization failed for %s: %s", page_url, exc)
        errors.append(f"normalize: {exc}")
        normalized = []

    # Per-type tally of the normalized entities.
    entities_by_type: dict[str, int] = {}
    for entity in normalized:
        entities_by_type[entity.entity_type] = (
            entities_by_type.get(entity.entity_type, 0) + 1
        )

    # -----------------------------------------------------------------------
    # Stage 5 — DB persist (skipped when persist=False)
    # -----------------------------------------------------------------------
    entity_ids: list = []
    if persist:
        try:
            entity_ids = _merge_db(normalized, investigation_id)
        except Exception as exc:
            logger.error("DB persist failed for %s: %s", page_url, exc)
            errors.append(f"db: {exc}")
            entity_ids = []

    # Both paths return the normalized entities so that `entities` always
    # agrees with `entity_count`, whether or not the page was persisted.
    return ExtractionResult(
        page_url=page_url,
        entity_count=len(normalized),
        entities_by_type=entities_by_type,
        entity_ids=entity_ids,
        errors=errors,
        entities=normalized,
    )
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
async def extract_entities_from_pages(
    pages: list[dict],
    investigation_id: Optional[uuid.UUID] = None,
    llm=None,
    run_llm_extraction: bool = False,
    max_concurrent: int = 5,
    disable_cache: Optional[bool] = None,
    entity_cap: int = 400,
) -> list[ExtractionResult]:
    """
    Run extraction concurrently across a list of pages.

    Each page dict must have at least a "url" key. Content is read from
    "text", "content", or "cleaned_text" keys (first non-empty wins). An
    optional "page_id" key is forwarded to the per-page pipeline.

    A semaphore limits concurrency to *max_concurrent* simultaneous pages.
    One page failing never blocks others — failures are captured in each
    page's ExtractionResult.errors.

    Pages are extracted without persisting. The pooled entities then go
    through type-conflict resolution, the content-safety filter and the
    entity cap (default 400) ranked by:
    confidence (primary), entity type priority (secondary), occurrence count (tertiary).
    Only the surviving entities are written to the DB, in one batch.

    After the batch step every result's ``entities`` holds only the surviving
    entities for its URL, even if the DB write failed, so filtered-out values
    are never handed back to the caller. ``entity_count`` and
    ``entities_by_type`` are left as computed by the per-page extraction
    (before filtering and capping). A failed batch write is recorded as a
    "db: <message>" entry in the errors of every result that had entities to
    persist.
    """
    semaphore = asyncio.Semaphore(max_concurrent)

    async def _process(page: dict) -> ExtractionResult:
        async with semaphore:
            url = page.get("url", "")
            text = (
                page.get("text")
                or page.get("content")
                or page.get("cleaned_text")
                or ""
            )
            try:
                return await extract_entities_from_page(
                    page_text=text,
                    page_url=url,
                    page_id=page.get("page_id"),
                    investigation_id=investigation_id,
                    llm=llm,
                    run_llm_extraction=run_llm_extraction,
                    disable_cache=disable_cache,
                    persist=False,
                )
            except Exception as exc:
                logger.error("Page processing failed for %s: %s", url, exc)
                return ExtractionResult(
                    page_url=url,
                    entity_count=0,
                    entities_by_type={},
                    entity_ids=[],
                    errors=[str(exc)],
                )

    results = list(await asyncio.gather(*(_process(p) for p in pages)))

    all_normalized: list[NormalizedEntity] = [
        ent for result in results for ent in result.entities
    ]
    if not all_normalized:
        return results

    all_normalized = _resolve_conflicts(all_normalized)

    # -----------------------------------------------------------------------
    # Content safety: drop prohibited entity values before capping/storing.
    # The actual value is never logged — only type and count.
    # -----------------------------------------------------------------------
    from utils.content_safety import is_blocked_entity_value as _is_blocked_entity_value

    clean_entities: list[NormalizedEntity] = []
    blocked_entity_count = 0
    for _ent in all_normalized:
        if _is_blocked_entity_value(_ent.entity_type, _ent.value):
            blocked_entity_count += 1
            logger.debug(
                "Entity value blocked — prohibited content: type=%s",
                _ent.entity_type,
            )
        else:
            clean_entities.append(_ent)
    if blocked_entity_count > 0:
        logger.info(
            "Blocked %d entities for prohibited content",
            blocked_entity_count,
        )

    capped_entities, _ = apply_entity_cap(
        clean_entities, cap=entity_cap, investigation_id=investigation_id
    )

    # Group the survivors by source URL once instead of rescanning the whole
    # list for every result.
    entities_by_url: dict[str, list[NormalizedEntity]] = {}
    for ent in capped_entities:
        entities_by_url.setdefault(ent.source_url, []).append(ent)

    ids_by_url: dict[str, list] = {}
    persist_error: Optional[str] = None
    if capped_entities:
        try:
            entity_id_map = _merge_db(capped_entities, investigation_id)
            # NOTE(review): assumes merge_with_db returns one id per input
            # entity, in input order — confirm against extractor/normalizer.py.
            for ent, eid in zip(capped_entities, entity_id_map):
                ids_by_url.setdefault(ent.source_url, []).append(eid)
        except Exception as exc:
            logger.error("Batch entity persist failed: %s", exc)
            persist_error = f"db: {exc}"
            ids_by_url = {}

    for result in results:
        page_entities = entities_by_url.get(result.page_url, [])
        # Always replace the raw per-page list with the filtered/capped one so
        # blocked or dropped entities do not leak out through the result.
        result.entities = list(page_entities)
        result.entity_ids = list(ids_by_url.get(result.page_url, []))
        if persist_error is not None and page_entities:
            result.errors.append(persist_error)

    return results
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
# ---------------------------------------------------------------------------
|
|
300
|
+
# Entity cap logic
|
|
301
|
+
# ---------------------------------------------------------------------------
|
|
302
|
+
|
|
303
|
+
def _occurrence_count(entity: NormalizedEntity, all_entities: list[NormalizedEntity]) -> int:
    """Return how many items in *all_entities* share *entity*'s type and value."""
    return sum(
        1
        for candidate in all_entities
        if candidate.entity_type == entity.entity_type
        and candidate.value == entity.value
    )
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def _apply_per_type_caps(
    entities: list[NormalizedEntity],
    caps: dict = PER_TYPE_CAPS,
) -> list[NormalizedEntity]:
    """
    Apply per-type sub-caps before the global cap.

    This prevents high-volume low-specificity entity types (e.g., ORGANIZATION_NAME)
    from crowding out entity types that have no per-type cap.

    Entities are scanned in input order: for each type listed in *caps* the
    first ``caps[type]`` entities are kept and the rest dropped. Types that
    are not in *caps* are never dropped here.

    Args:
        entities: Entities to filter; the list itself is not modified.
        caps: Mapping of entity type to the maximum number kept.

    Returns:
        A new list with the kept entities in their original order.
    """
    kept: list[NormalizedEntity] = []
    kept_per_type: dict[str, int] = {}

    for entity in entities:
        etype = entity.entity_type
        limit = caps.get(etype)
        already_kept = kept_per_type.get(etype, 0)
        if limit is not None and already_kept >= limit:
            # Lazy %-style args: the message is only built when DEBUG is on.
            logger.debug("Per-type cap: %s capped at %s", etype, limit)
            continue
        kept.append(entity)
        kept_per_type[etype] = already_kept + 1

    return kept
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def apply_entity_cap(
    entities: list[NormalizedEntity],
    cap: int = 400,
    investigation_id: Optional[uuid.UUID] = None,
) -> tuple[list[NormalizedEntity], int]:
    """
    Apply quality-based entity filtering and hard cap.

    Steps:
    a) Remove any entity where confidence < 0.80
    b) Apply per-type sub-caps (see _apply_per_type_caps)
    c) Apply per-investigation hard cap of *cap* entities, ranked by:
    - confidence score (primary, descending)
    - entity type priority (secondary, ascending - lower number = higher priority)
    - occurrence count across pages (tertiary, descending)
    d) Log a warning when cap is applied

    Occurrence counts are taken over the confidence-filtered entities, before
    the per-type sub-caps are applied. The input list and its entities are
    not modified.

    Args:
        entities: Candidate entities.
        cap: Maximum number of entities to return.
        investigation_id: Only used in the warning logged when the cap applies.

    Returns: (capped_entities, original_count)
    """
    from collections import Counter

    original_count = len(entities)

    # Step a: confidence filter
    filtered = [e for e in entities if e.confidence >= 0.80]
    removed_confidence = original_count - len(filtered)
    if removed_confidence:
        logger.warning(
            "Entity confidence filter removed %d low-confidence entities",
            removed_confidence,
        )

    # Count occurrences per (type, value) in a single pass. Keeping the counts
    # in a local mapping avoids both the quadratic rescan and writing a
    # temporary attribute onto caller-owned entity objects.
    # NOTE(review): assumes entity values are hashable (strings) — confirm.
    occurrences = Counter((e.entity_type, e.value) for e in filtered)

    # Step b: per-type sub-caps
    filtered = _apply_per_type_caps(filtered)

    # Step c: sort and cap
    if len(filtered) > cap:
        ranked = sorted(
            filtered,
            key=lambda e: (
                -e.confidence,
                _type_priority(e.entity_type),
                -occurrences[(e.entity_type, e.value)],
            ),
        )
        filtered = ranked[:cap]
        logger.warning(
            "Entity cap applied: %d entities reduced to %d for investigation %s",
            original_count,
            len(filtered),
            investigation_id,
        )

    return filtered, original_count
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
# ---------------------------------------------------------------------------
|
|
390
|
+
# Internal helper
|
|
391
|
+
# ---------------------------------------------------------------------------
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
def _dedup(values) -> list[str]:
    """Return *values* with duplicates removed, keeping first-seen order."""
    # dict keys are unique and remember insertion order, so the first
    # occurrence of each value decides its position.
    return list(dict.fromkeys(values))
|