corp-extractor 0.5.0-py3-none-any.whl → 0.9.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
- corp_extractor-0.9.3.dist-info/RECORD +79 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +2030 -24
- statement_extractor/data/statement_taxonomy.json +6949 -1159
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +428 -0
- statement_extractor/database/importers/__init__.py +32 -0
- statement_extractor/database/importers/companies_house.py +559 -0
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +561 -0
- statement_extractor/database/importers/sec_edgar.py +392 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +1120 -0
- statement_extractor/database/importers/wikidata_dump.py +1951 -0
- statement_extractor/database/importers/wikidata_people.py +1130 -0
- statement_extractor/database/models.py +254 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +3034 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +171 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +19 -3
- statement_extractor/models/canonical.py +44 -1
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/labels.py +47 -18
- statement_extractor/models/qualifiers.py +51 -3
- statement_extractor/models/statement.py +39 -15
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/config.py +6 -11
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +90 -121
- statement_extractor/pipeline/registry.py +52 -46
- statement_extractor/plugins/__init__.py +20 -8
- statement_extractor/plugins/base.py +348 -78
- statement_extractor/plugins/extractors/gliner2.py +38 -28
- statement_extractor/plugins/labelers/taxonomy.py +18 -5
- statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +11 -0
- statement_extractor/plugins/qualifiers/companies_house.py +14 -3
- statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
- statement_extractor/plugins/qualifiers/gleif.py +14 -3
- statement_extractor/plugins/qualifiers/person.py +588 -14
- statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/t5_gemma.py +176 -75
- statement_extractor/plugins/taxonomy/embedding.py +193 -46
- statement_extractor/plugins/taxonomy/mnli.py +16 -4
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.5.0.dist-info/RECORD +0 -55
- statement_extractor/plugins/canonicalizers/__init__.py +0 -17
- statement_extractor/plugins/canonicalizers/base.py +0 -9
- statement_extractor/plugins/canonicalizers/location.py +0 -219
- statement_extractor/plugins/canonicalizers/organization.py +0 -230
- statement_extractor/plugins/canonicalizers/person.py +0 -242
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
statement_extractor/plugins/extractors/gliner2.py
@@ -1,9 +1,9 @@
 """
-GLiNER2Extractor - Stage 2 plugin that
+GLiNER2Extractor - Stage 2 plugin that extracts triples from sentences.
 
 Uses GLiNER2 for:
-1. Entity extraction:
-2. Relation extraction:
+1. Entity extraction: Identify subject/object entities with types
+2. Relation extraction: Extract predicates using predicate list
 3. Entity scoring: Score how entity-like subjects/objects are
 4. Classification: Run labeler classification schemas in single pass
 """
@@ -16,7 +16,7 @@ from typing import Optional
 from ..base import BaseExtractorPlugin, ClassificationSchema, PluginCapability
 from ...pipeline.context import PipelineContext
 from ...pipeline.registry import PluginRegistry
-from ...models import
+from ...models import SplitSentence, PipelineStatement, ExtractedEntity, EntityType
 
 logger = logging.getLogger(__name__)
 
@@ -110,11 +110,11 @@ GLINER_TYPE_MAP = {
 @PluginRegistry.extractor
 class GLiNER2Extractor(BaseExtractorPlugin):
     """
-    Extractor plugin that uses GLiNER2 for entity and relation
+    Extractor plugin that uses GLiNER2 for entity and relation extraction.
 
-    Processes
-    objects with typed entities.
-    labeler plugins in a single pass.
+    Processes split sentences from Stage 1 and produces PipelineStatement
+    objects with subject-predicate-object triples and typed entities.
+    Also runs classification schemas from labeler plugins in a single pass.
     """
 
     def __init__(
@@ -180,6 +180,16 @@ class GLiNER2Extractor(BaseExtractorPlugin):
     def description(self) -> str:
         return "GLiNER2 model for entity and relation extraction"
 
+    @property
+    def model_vram_gb(self) -> float:
+        """GLiNER2 model weights ~0.8GB."""
+        return 0.8
+
+    @property
+    def per_item_vram_gb(self) -> float:
+        """Each triple during batch processing ~0.1GB."""
+        return 0.1
+
     def _get_model(self):
         """Lazy-load the GLiNER2 model."""
         if self._model is None:
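The two new properties above expose a simple VRAM budget: a fixed cost for the model weights plus a per-item cost during batching. As a rough sketch of how a caller could use them to size batches — the `gpu_budget_gb` figure and the `plan_batch_size` helper are hypothetical, not part of the package:

```python
def plan_batch_size(model_vram_gb: float, per_item_vram_gb: float,
                    gpu_budget_gb: float = 8.0) -> int:
    """Hypothetical helper: fit model weights plus N in-flight items into a VRAM budget."""
    headroom = gpu_budget_gb - model_vram_gb
    if headroom <= 0:
        return 1  # not enough room for weights plus a batch; process one item at a time
    return max(1, int(headroom / per_item_vram_gb))

# With the figures reported by GLiNER2Extractor (0.8 GB weights, 0.1 GB per triple),
# an 8 GB budget leaves room for roughly 70 items per batch.
print(plan_batch_size(0.8, 0.1))
```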
@@ -199,36 +209,36 @@ class GLiNER2Extractor(BaseExtractorPlugin):
 
     def extract(
         self,
-
+        split_sentences: list[SplitSentence],
         context: PipelineContext,
     ) -> list[PipelineStatement]:
         """
-        Extract
+        Extract subject-predicate-object triples from split sentences using GLiNER2.
 
         Returns ALL matching relations from GLiNER2 (not just the best one).
         Also runs any classification schemas and stores results in context.
 
         Args:
-
+            split_sentences: Atomic sentences from Stage 1
             context: Pipeline context
 
         Returns:
-            List of PipelineStatement objects (may contain multiple per
+            List of PipelineStatement objects (may contain multiple per sentence)
         """
         predicate_categories = self._get_predicate_categories()
-        logger.info(f"GLiNER2Extractor processing {len(
+        logger.info(f"GLiNER2Extractor processing {len(split_sentences)} sentences")
         logger.info(f"Using {len(predicate_categories)} predicate categories")
 
         statements = []
         model = self._get_model()
         classified_texts: set[str] = set()
 
-        for
+        for sentence in split_sentences:
             try:
                 if model:
                     # Use relation extraction iterating through categories
                     # Returns ALL matches, not just the best one
-                    extracted_stmts = self._extract_with_relations(
+                    extracted_stmts = self._extract_with_relations(sentence, model, predicate_categories)
                 else:
                     # No model available - skip
                     logger.warning("No GLiNER2 model available - skipping extraction")
@@ -243,10 +253,10 @@ class GLiNER2Extractor(BaseExtractorPlugin):
                     classified_texts.add(stmt.source_text)
 
             except Exception as e:
-                logger.warning(f"Error extracting
-                # No fallback - skip this
+                logger.warning(f"Error extracting from sentence: {e}")
+                # No fallback - skip this sentence
 
-        logger.info(f"GLiNER2Extractor produced {len(statements)} statements from {len(
+        logger.info(f"GLiNER2Extractor produced {len(statements)} statements from {len(split_sentences)} sentences")
         return statements
 
     def _run_classifications(
@@ -306,7 +316,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
 
     def _extract_with_relations(
         self,
-
+        sentence: SplitSentence,
         model,
         predicate_categories: dict[str, dict[str, PredicateConfig]],
     ) -> list[PipelineStatement]:
@@ -318,14 +328,14 @@ class GLiNER2Extractor(BaseExtractorPlugin):
         Returns ALL matching relations, not just the best one.
 
         Args:
-
+            sentence: Split sentence from Stage 1
             model: GLiNER2 model instance
             predicate_categories: Dict of category -> predicates to use
 
         Returns:
             List of PipelineStatements for all relations found
         """
-        logger.debug(f"Attempting relation extraction for: '{
+        logger.debug(f"Attempting relation extraction for: '{sentence.text[:80]}...'")
 
         # Iterate through each category separately to stay under GLiNER2's ~25 label limit
         # Use schema API with entities + relations together for better extraction
@@ -345,7 +355,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
                 .entities(self._get_entity_types())
                 .relations(relations_dict)
             )
-            result = model.extract(
+            result = model.extract(sentence.text, schema, include_confidence=True)
 
             # Get relations from this category
             relation_data = result.get("relations", result.get("relation_extraction", {}))
@@ -369,7 +379,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
         logger.debug(f"  GLiNER2 found {total_found} total relations across all categories")
 
         if not all_relations:
-            logger.debug(f"No GLiNER2 relation match in: '{
+            logger.debug(f"No GLiNER2 relation match in: '{sentence.text[:60]}...'")
             return []
 
         # Filter by confidence threshold and sort descending
@@ -392,8 +402,8 @@ class GLiNER2Extractor(BaseExtractorPlugin):
             )
 
             # Get entity types
-            subj_type = self._infer_entity_type(head, model,
-            obj_type = self._infer_entity_type(tail, model,
+            subj_type = self._infer_entity_type(head, model, sentence.text)
+            obj_type = self._infer_entity_type(tail, model, sentence.text)
             logger.debug(f"  Entity types: {subj_type.value}, {obj_type.value}")
 
             stmt = PipelineStatement(
@@ -409,7 +419,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
                     type=obj_type,
                     confidence=confidence,
                 ),
-                source_text=
+                source_text=sentence.text,
                 confidence_score=confidence,
                 extraction_method="gliner_relation",
             )
@@ -419,7 +429,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
 
     def _extract_with_entities(
         self,
-
+        sentence: SplitSentence,
         model,
     ) -> Optional[PipelineStatement]:
         """
@@ -428,7 +438,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
         This method is called when predicates are disabled. Without GLiNER2 relation
         extraction, we cannot form valid statements.
         """
-        logger.debug(f"Entity extraction mode (no predicates) - skipping: '{
+        logger.debug(f"Entity extraction mode (no predicates) - skipping: '{sentence.text[:60]}...'")
         return None
 
     def _parse_relation(self, rel) -> tuple[str, str, float]:
statement_extractor/plugins/labelers/taxonomy.py
@@ -8,9 +8,19 @@ there are too many possible values for simple multi-choice classification.
 import json
 import logging
 from pathlib import Path
-from typing import Optional
+from typing import Optional, TypedDict
 
 from ..base import BaseLabelerPlugin, TaxonomySchema, PluginCapability
+
+
+class TaxonomyEntry(TypedDict):
+    """Structure for each taxonomy label entry."""
+    description: str
+    id: int
+    mnli_label: str
+    embedding_label: str
+
+
 from ...pipeline.context import PipelineContext
 from ...models import (
     PipelineStatement,
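The `TaxonomyEntry` TypedDict implies that statement_taxonomy.json maps category names to label names to entries with these four fields. A hypothetical entry, with invented category, label, and values, purely to illustrate the shape:

```python
# Invented example data matching the TaxonomyEntry shape; not taken from statement_taxonomy.json.
taxonomy: dict[str, dict[str, TaxonomyEntry]] = {
    "governance": {
        "board_appointment": {
            "description": "A person is appointed to a company's board",
            "id": 42,
            "mnli_label": "This statement describes a board appointment",
            "embedding_label": "appointment to a board of directors",
        }
    }
}
label_id = taxonomy["governance"]["board_appointment"]["id"]  # -> 42
```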
@@ -214,7 +224,7 @@ class TaxonomyLabeler(BaseLabelerPlugin):
         self._top_k_categories = top_k_categories
         self._min_confidence = min_confidence
 
-        self._taxonomy: Optional[dict[str, dict[str,
+        self._taxonomy: Optional[dict[str, dict[str, TaxonomyEntry]]] = None
         self._classifier: Optional[TaxonomyClassifier] = None
 
     @property
@@ -250,7 +260,7 @@ class TaxonomyLabeler(BaseLabelerPlugin):
             scope="statement",
         )
 
-    def _load_taxonomy(self) -> dict[str, dict[str,
+    def _load_taxonomy(self) -> dict[str, dict[str, TaxonomyEntry]]:
         """Load taxonomy from JSON file."""
         if self._taxonomy is not None:
             return self._taxonomy
@@ -358,12 +368,15 @@ class TaxonomyLabeler(BaseLabelerPlugin):
         taxonomy = self._load_taxonomy()
 
         if category and category in taxonomy:
-
+            entry = taxonomy[category].get(label)
+            if entry:
+                return entry.get("id")
 
         # Search all categories for flat classification
         for cat_labels in taxonomy.values():
             if label in cat_labels:
-
+                entry = cat_labels[label]
+                return entry.get("id")
 
         return None
 
statement_extractor/plugins/labelers/taxonomy_embedding.py
@@ -11,10 +11,19 @@ import json
 import logging
 import time
 from pathlib import Path
-from typing import Optional
+from typing import Optional, TypedDict
 
 import numpy as np
 
+
+class TaxonomyEntry(TypedDict):
+    """Structure for each taxonomy label entry."""
+    description: str
+    id: int
+    mnli_label: str
+    embedding_label: str
+
+
 from ..base import BaseLabelerPlugin, TaxonomySchema, PluginCapability
 from ...pipeline.context import PipelineContext
 from ...models import (
@@ -106,14 +115,14 @@ class EmbeddingClassifier:
 
     def precompute_label_embeddings(
         self,
-        taxonomy: dict[str, dict[str,
+        taxonomy: dict[str, dict[str, TaxonomyEntry]],
         categories: Optional[list[str]] = None,
     ) -> None:
         """
         Pre-compute embeddings for all label names.
 
         Args:
-            taxonomy: Taxonomy dict {category: {label:
+            taxonomy: Taxonomy dict {category: {label: TaxonomyEntry, ...}, ...}
             categories: Categories to include (default: all)
         """
         self._load_model()
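Pre-computing one embedding per label means classification can reduce to a similarity search at query time. A minimal sketch of that idea with plain numpy; the function and its inputs are assumptions for illustration, not the plugin's actual API:

```python
import numpy as np

def rank_labels(statement_vec: np.ndarray,
                label_vecs: dict[str, np.ndarray],
                top_k: int = 3) -> list[tuple[str, float]]:
    """Rank labels by cosine similarity against precomputed label embeddings."""
    def cosine(a: np.ndarray, b: np.ndarray) -> float:
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
    scores = [(label, cosine(statement_vec, vec)) for label, vec in label_vecs.items()]
    return sorted(scores, key=lambda pair: pair[1], reverse=True)[:top_k]
```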
@@ -314,7 +323,7 @@ class EmbeddingTaxonomyLabeler(BaseLabelerPlugin):
         self._top_k_categories = top_k_categories
         self._min_confidence = min_confidence
 
-        self._taxonomy: Optional[dict[str, dict[str,
+        self._taxonomy: Optional[dict[str, dict[str, TaxonomyEntry]]] = None
         self._classifier: Optional[EmbeddingClassifier] = None
         self._embeddings_computed = False
 
@@ -350,7 +359,7 @@ class EmbeddingTaxonomyLabeler(BaseLabelerPlugin):
             scope="statement",
         )
 
-    def _load_taxonomy(self) -> dict[str, dict[str,
+    def _load_taxonomy(self) -> dict[str, dict[str, TaxonomyEntry]]:
         """Load taxonomy from JSON file."""
         if self._taxonomy is not None:
             return self._taxonomy
@@ -456,7 +465,9 @@ class EmbeddingTaxonomyLabeler(BaseLabelerPlugin):
         taxonomy = self._load_taxonomy()
 
         if category in taxonomy:
-
+            entry = taxonomy[category].get(label)
+            if entry:
+                return entry.get("id")
 
         return None
 
statement_extractor/plugins/pdf/pypdf.py (new file)
@@ -0,0 +1,291 @@
+"""
+PDF parser plugin using PyMuPDF (fitz) with optional OCR fallback.
+
+Extracts text from PDFs page by page, with automatic detection of
+image-heavy PDFs that may require OCR.
+"""
+
+import io
+import logging
+import os
+import tempfile
+from typing import Any, Optional
+
+from ..base import BasePDFParserPlugin, PDFParseResult
+from ...pipeline.registry import PluginRegistry
+
+logger = logging.getLogger(__name__)
+
+
+@PluginRegistry.pdf_parser
+class PyPDFParserPlugin(BasePDFParserPlugin):
+    """
+    PDF parser using PyMuPDF (fitz) with optional OCR fallback.
+
+    Features:
+    - Fast text extraction using PyMuPDF
+    - Automatic detection of image-heavy PDFs
+    - Optional OCR fallback using Tesseract
+    - Metadata extraction (title, author, etc.)
+    """
+
+    def __init__(
+        self,
+        image_threshold: float = 0.5,
+        text_threshold: float = 0.4,
+        use_ocr_fallback: bool = True,
+    ):
+        """
+        Initialize the PDF parser.
+
+        Args:
+            image_threshold: Images per page threshold for OCR trigger
+            text_threshold: Text density threshold (chars/1000 per page)
+            use_ocr_fallback: Enable automatic OCR for image-heavy PDFs
+        """
+        self._image_threshold = image_threshold
+        self._text_threshold = text_threshold
+        self._use_ocr_fallback = use_ocr_fallback
+
+    @property
+    def name(self) -> str:
+        return "pypdf_parser"
+
+    @property
+    def priority(self) -> int:
+        return 100
+
+    @property
+    def description(self) -> str:
+        return "PDF parser using PyMuPDF with optional OCR fallback"
+
+    @property
+    def supports_ocr(self) -> bool:
+        return self._use_ocr_fallback
+
+    def parse(
+        self,
+        pdf_bytes: bytes,
+        max_pages: int = 500,
+        use_ocr: bool = False,
+    ) -> PDFParseResult:
+        """
+        Extract text from PDF bytes.
+
+        Args:
+            pdf_bytes: Raw PDF file content
+            max_pages: Maximum number of pages to process
+            use_ocr: Force OCR even for text-extractable PDFs
+
+        Returns:
+            PDFParseResult with extracted text for each page
+        """
+        try:
+            import fitz  # PyMuPDF
+        except ImportError:
+            return PDFParseResult(
+                pages=[],
+                page_count=0,
+                error="PyMuPDF (fitz) not installed. Install with: pip install PyMuPDF",
+            )
+
+        temp_path: Optional[str] = None
+
+        try:
+            # Write bytes to temp file for fitz
+            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
+                f.write(pdf_bytes)
+                temp_path = f.name
+
+            logger.info(f"Parsing PDF: {len(pdf_bytes)} bytes")
+
+            # Open the PDF
+            pdf_doc = fitz.open(temp_path)
+            total_pages = len(pdf_doc)
+            logger.info(f"PDF has {total_pages} pages")
+
+            # Check if we should use OCR
+            should_ocr = use_ocr or (
+                self._use_ocr_fallback and self._is_mostly_images(pdf_doc)
+            )
+
+            if should_ocr:
+                logger.info("PDF appears image-heavy, using OCR")
+                result = self._parse_with_ocr(pdf_doc, max_pages)
+            else:
+                logger.info("PDF has extractable text, using direct extraction")
+                result = self._parse_with_fitz(pdf_doc, max_pages)
+
+            pdf_doc.close()
+            return result
+
+        except Exception as e:
+            logger.exception(f"Error parsing PDF: {e}")
+            return PDFParseResult(
+                pages=[],
+                page_count=0,
+                error=f"Failed to parse PDF: {e}",
+            )
+        finally:
+            # Clean up temp file
+            if temp_path and os.path.exists(temp_path):
+                try:
+                    os.unlink(temp_path)
+                except Exception:
+                    pass
+
+    def _is_mostly_images(self, pdf_doc) -> bool:
+        """
+        Check if PDF is mostly images (may need OCR).
+
+        Args:
+            pdf_doc: PyMuPDF document object
+
+        Returns:
+            True if PDF appears to be image-heavy
+        """
+        total_pages = len(pdf_doc)
+        if total_pages == 0:
+            return False
+
+        # Count images in first few pages
+        sample_pages = min(3, total_pages)
+        image_count = 0
+        for i in range(sample_pages):
+            image_count += len(pdf_doc[i].get_images())
+
+        avg_images_per_page = image_count / sample_pages
+
+        # Check text density in sample pages
+        sample_text = ""
+        for i in range(sample_pages):
+            sample_text += pdf_doc[i].get_text()
+
+        text_density = len(sample_text) / 1000 / sample_pages
+
+        logger.debug(
+            f"PDF analysis: {avg_images_per_page:.1f} images/page, "
+            f"{text_density:.2f} text density"
+        )
+
+        # If text density is high, don't use OCR
+        if text_density > self._text_threshold:
+            return False
+
+        # If many images per page and low text, probably needs OCR
+        return avg_images_per_page > self._image_threshold
+
+    def _parse_with_fitz(self, pdf_doc, max_pages: int) -> PDFParseResult:
+        """
+        Extract text using PyMuPDF (fast, direct extraction).
+
+        Args:
+            pdf_doc: PyMuPDF document object
+            max_pages: Maximum pages to process
+
+        Returns:
+            PDFParseResult with extracted text
+        """
+        pages = []
+        total_pages = len(pdf_doc)
+
+        for i in range(min(total_pages, max_pages)):
+            page = pdf_doc[i]
+            text = page.get_text()
+            pages.append(text.strip())
+
+            if (i + 1) % 50 == 0:
+                logger.debug(f"Processed {i + 1}/{min(total_pages, max_pages)} pages")
+
+        # Extract metadata
+        metadata = self._extract_metadata(pdf_doc)
+
+        return PDFParseResult(
+            pages=pages,
+            page_count=total_pages,
+            metadata=metadata,
+        )
+
+    def _parse_with_ocr(self, pdf_doc, max_pages: int) -> PDFParseResult:
+        """
+        Extract text using OCR (Tesseract).
+
+        Args:
+            pdf_doc: PyMuPDF document object
+            max_pages: Maximum pages to process
+
+        Returns:
+            PDFParseResult with OCR-extracted text
+        """
+        try:
+            import pytesseract
+            from PIL import Image
+        except ImportError:
+            return PDFParseResult(
+                pages=[],
+                page_count=len(pdf_doc),
+                error="OCR dependencies not installed. Install with: pip install pytesseract Pillow",
+            )
+
+        pages = []
+        total_pages = len(pdf_doc)
+
+        for i in range(min(total_pages, max_pages)):
+            page = pdf_doc[i]
+
+            # Render page to image
+            pix = page.get_pixmap(dpi=150)  # 150 DPI is good balance
+            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
+
+            # Run OCR
+            text = pytesseract.image_to_string(img)
+            pages.append(text.strip())
+
+            if (i + 1) % 10 == 0:
+                logger.debug(f"OCR processed {i + 1}/{min(total_pages, max_pages)} pages")
+
+        # Extract metadata
+        metadata = self._extract_metadata(pdf_doc)
+
+        return PDFParseResult(
+            pages=pages,
+            page_count=total_pages,
+            metadata=metadata,
+        )
+
+    @staticmethod
+    def _extract_metadata(pdf_doc) -> dict[str, Any]:
+        """
+        Extract PDF metadata.
+
+        Args:
+            pdf_doc: PyMuPDF document object
+
+        Returns:
+            Dictionary of metadata fields
+        """
+        metadata = {}
+
+        try:
+            doc_metadata = pdf_doc.metadata
+            if doc_metadata:
+                # Map common PDF metadata fields
+                field_map = {
+                    "title": "title",
+                    "author": "author",
+                    "subject": "subject",
+                    "keywords": "keywords",
+                    "creator": "creator",
+                    "producer": "producer",
+                    "creationDate": "created",
+                    "modDate": "modified",
+                }
+
+                for pdf_key, our_key in field_map.items():
+                    value = doc_metadata.get(pdf_key)
+                    if value and isinstance(value, str) and value.strip():
+                        metadata[our_key] = value.strip()
+        except Exception as e:
+            logger.debug(f"Error extracting metadata: {e}")
+
+        return metadata
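Going by the constructor and `parse()` signature above (and assuming `PDFParseResult.error` defaults to None on success), using the new parser directly would look roughly like this; the file path is illustrative, and PyMuPDF (plus pytesseract/Pillow for OCR) must be installed:

```python
from pathlib import Path

from statement_extractor.plugins.pdf.pypdf import PyPDFParserPlugin

parser = PyPDFParserPlugin(use_ocr_fallback=True)
result = parser.parse(Path("annual_report.pdf").read_bytes(), max_pages=100)

if result.error:
    print(f"Parse failed: {result.error}")
else:
    print(f"Extracted {result.page_count} pages; first page begins: {result.pages[0][:80]!r}")
```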
statement_extractor/plugins/qualifiers/__init__.py
@@ -6,6 +6,15 @@ Adds qualifiers and identifiers to entities.
 
 from .base import BaseQualifierPlugin
 from .person import PersonQualifierPlugin
+
+# Import embedding qualifier (may fail if database module not available)
+try:
+    from .embedding_company import EmbeddingCompanyQualifier
+except ImportError:
+    EmbeddingCompanyQualifier = None  # type: ignore
+
+# DEPRECATED: These API-based qualifiers are deprecated in favor of EmbeddingCompanyQualifier
+# They are no longer auto-registered with the plugin registry.
 from .gleif import GLEIFQualifierPlugin
 from .companies_house import CompaniesHouseQualifierPlugin
 from .sec_edgar import SECEdgarQualifierPlugin
@@ -13,6 +22,8 @@ from .sec_edgar import SECEdgarQualifierPlugin
 __all__ = [
     "BaseQualifierPlugin",
     "PersonQualifierPlugin",
+    "EmbeddingCompanyQualifier",
+    # Deprecated - kept for backwards compatibility
     "GLEIFQualifierPlugin",
     "CompaniesHouseQualifierPlugin",
     "SECEdgarQualifierPlugin",