corp-extractor 0.5.0__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
  2. corp_extractor-0.9.3.dist-info/RECORD +79 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +2030 -24
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +428 -0
  9. statement_extractor/database/importers/__init__.py +32 -0
  10. statement_extractor/database/importers/companies_house.py +559 -0
  11. statement_extractor/database/importers/companies_house_officers.py +431 -0
  12. statement_extractor/database/importers/gleif.py +561 -0
  13. statement_extractor/database/importers/sec_edgar.py +392 -0
  14. statement_extractor/database/importers/sec_form4.py +512 -0
  15. statement_extractor/database/importers/wikidata.py +1120 -0
  16. statement_extractor/database/importers/wikidata_dump.py +1951 -0
  17. statement_extractor/database/importers/wikidata_people.py +1130 -0
  18. statement_extractor/database/models.py +254 -0
  19. statement_extractor/database/resolver.py +245 -0
  20. statement_extractor/database/store.py +3034 -0
  21. statement_extractor/document/__init__.py +62 -0
  22. statement_extractor/document/chunker.py +410 -0
  23. statement_extractor/document/context.py +171 -0
  24. statement_extractor/document/deduplicator.py +171 -0
  25. statement_extractor/document/html_extractor.py +246 -0
  26. statement_extractor/document/loader.py +303 -0
  27. statement_extractor/document/pipeline.py +388 -0
  28. statement_extractor/document/summarizer.py +195 -0
  29. statement_extractor/extractor.py +1 -1
  30. statement_extractor/models/__init__.py +19 -3
  31. statement_extractor/models/canonical.py +44 -1
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/labels.py +47 -18
  34. statement_extractor/models/qualifiers.py +51 -3
  35. statement_extractor/models/statement.py +39 -15
  36. statement_extractor/models.py +1 -1
  37. statement_extractor/pipeline/config.py +6 -11
  38. statement_extractor/pipeline/context.py +5 -5
  39. statement_extractor/pipeline/orchestrator.py +90 -121
  40. statement_extractor/pipeline/registry.py +52 -46
  41. statement_extractor/plugins/__init__.py +20 -8
  42. statement_extractor/plugins/base.py +348 -78
  43. statement_extractor/plugins/extractors/gliner2.py +38 -28
  44. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  45. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  46. statement_extractor/plugins/pdf/__init__.py +10 -0
  47. statement_extractor/plugins/pdf/pypdf.py +291 -0
  48. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  49. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  50. statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
  51. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  52. statement_extractor/plugins/qualifiers/person.py +588 -14
  53. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  54. statement_extractor/plugins/scrapers/__init__.py +10 -0
  55. statement_extractor/plugins/scrapers/http.py +236 -0
  56. statement_extractor/plugins/splitters/t5_gemma.py +176 -75
  57. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  58. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  59. statement_extractor/scoring.py +8 -8
  60. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  61. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  62. statement_extractor/plugins/canonicalizers/base.py +0 -9
  63. statement_extractor/plugins/canonicalizers/location.py +0 -219
  64. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  65. statement_extractor/plugins/canonicalizers/person.py +0 -242
  66. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
  67. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0

statement_extractor/plugins/extractors/gliner2.py:

@@ -1,9 +1,9 @@
 """
-GLiNER2Extractor - Stage 2 plugin that refines triples using GLiNER2.
+GLiNER2Extractor - Stage 2 plugin that extracts triples from sentences.
 
 Uses GLiNER2 for:
-1. Entity extraction: Refine subject/object boundaries
-2. Relation extraction: When predicate list is provided
+1. Entity extraction: Identify subject/object entities with types
+2. Relation extraction: Extract predicates using predicate list
 3. Entity scoring: Score how entity-like subjects/objects are
 4. Classification: Run labeler classification schemas in single pass
 """
@@ -16,7 +16,7 @@ from typing import Optional
 from ..base import BaseExtractorPlugin, ClassificationSchema, PluginCapability
 from ...pipeline.context import PipelineContext
 from ...pipeline.registry import PluginRegistry
-from ...models import RawTriple, PipelineStatement, ExtractedEntity, EntityType
+from ...models import SplitSentence, PipelineStatement, ExtractedEntity, EntityType
 
 logger = logging.getLogger(__name__)
 
@@ -110,11 +110,11 @@ GLINER_TYPE_MAP = {
 @PluginRegistry.extractor
 class GLiNER2Extractor(BaseExtractorPlugin):
     """
-    Extractor plugin that uses GLiNER2 for entity and relation refinement.
+    Extractor plugin that uses GLiNER2 for entity and relation extraction.
 
-    Processes raw triples from Stage 1 and produces PipelineStatement
-    objects with typed entities. Also runs classification schemas from
-    labeler plugins in a single pass.
+    Processes split sentences from Stage 1 and produces PipelineStatement
+    objects with subject-predicate-object triples and typed entities.
+    Also runs classification schemas from labeler plugins in a single pass.
     """
 
     def __init__(
@@ -180,6 +180,16 @@ class GLiNER2Extractor(BaseExtractorPlugin):
     def description(self) -> str:
         return "GLiNER2 model for entity and relation extraction"
 
+    @property
+    def model_vram_gb(self) -> float:
+        """GLiNER2 model weights ~0.8GB."""
+        return 0.8
+
+    @property
+    def per_item_vram_gb(self) -> float:
+        """Each triple during batch processing ~0.1GB."""
+        return 0.1
+
     def _get_model(self):
         """Lazy-load the GLiNER2 model."""
         if self._model is None:
@@ -199,36 +209,36 @@ class GLiNER2Extractor(BaseExtractorPlugin):
 
     def extract(
         self,
-        raw_triples: list[RawTriple],
+        split_sentences: list[SplitSentence],
         context: PipelineContext,
     ) -> list[PipelineStatement]:
         """
-        Extract statements from raw triples using GLiNER2.
+        Extract subject-predicate-object triples from split sentences using GLiNER2.
 
         Returns ALL matching relations from GLiNER2 (not just the best one).
         Also runs any classification schemas and stores results in context.
 
         Args:
-            raw_triples: Raw triples from Stage 1
+            split_sentences: Atomic sentences from Stage 1
             context: Pipeline context
 
         Returns:
-            List of PipelineStatement objects (may contain multiple per raw triple)
+            List of PipelineStatement objects (may contain multiple per sentence)
         """
         predicate_categories = self._get_predicate_categories()
-        logger.info(f"GLiNER2Extractor processing {len(raw_triples)} triples")
+        logger.info(f"GLiNER2Extractor processing {len(split_sentences)} sentences")
         logger.info(f"Using {len(predicate_categories)} predicate categories")
 
         statements = []
         model = self._get_model()
         classified_texts: set[str] = set()
 
-        for raw in raw_triples:
+        for sentence in split_sentences:
             try:
                 if model:
                     # Use relation extraction iterating through categories
                     # Returns ALL matches, not just the best one
-                    extracted_stmts = self._extract_with_relations(raw, model, predicate_categories)
+                    extracted_stmts = self._extract_with_relations(sentence, model, predicate_categories)
                 else:
                     # No model available - skip
                     logger.warning("No GLiNER2 model available - skipping extraction")
@@ -243,10 +253,10 @@ class GLiNER2Extractor(BaseExtractorPlugin):
                     classified_texts.add(stmt.source_text)
 
             except Exception as e:
-                logger.warning(f"Error extracting triple: {e}")
-                # No fallback - skip this triple
+                logger.warning(f"Error extracting from sentence: {e}")
+                # No fallback - skip this sentence
 
-        logger.info(f"GLiNER2Extractor produced {len(statements)} statements from {len(raw_triples)} raw triples")
+        logger.info(f"GLiNER2Extractor produced {len(statements)} statements from {len(split_sentences)} sentences")
         return statements
 
     def _run_classifications(
@@ -306,7 +316,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
 
     def _extract_with_relations(
         self,
-        raw: RawTriple,
+        sentence: SplitSentence,
         model,
         predicate_categories: dict[str, dict[str, PredicateConfig]],
     ) -> list[PipelineStatement]:
@@ -318,14 +328,14 @@ class GLiNER2Extractor(BaseExtractorPlugin):
         Returns ALL matching relations, not just the best one.
 
         Args:
-            raw: Raw triple from Stage 1
+            sentence: Split sentence from Stage 1
             model: GLiNER2 model instance
             predicate_categories: Dict of category -> predicates to use
 
         Returns:
             List of PipelineStatements for all relations found
         """
-        logger.debug(f"Attempting relation extraction for: '{raw.source_sentence[:80]}...'")
+        logger.debug(f"Attempting relation extraction for: '{sentence.text[:80]}...'")
 
         # Iterate through each category separately to stay under GLiNER2's ~25 label limit
         # Use schema API with entities + relations together for better extraction
@@ -345,7 +355,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
                 .entities(self._get_entity_types())
                 .relations(relations_dict)
             )
-            result = model.extract(raw.source_sentence, schema, include_confidence=True)
+            result = model.extract(sentence.text, schema, include_confidence=True)
 
             # Get relations from this category
             relation_data = result.get("relations", result.get("relation_extraction", {}))
@@ -369,7 +379,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
         logger.debug(f"  GLiNER2 found {total_found} total relations across all categories")
 
         if not all_relations:
-            logger.debug(f"No GLiNER2 relation match in: '{raw.source_sentence[:60]}...'")
+            logger.debug(f"No GLiNER2 relation match in: '{sentence.text[:60]}...'")
             return []
 
         # Filter by confidence threshold and sort descending
@@ -392,8 +402,8 @@ class GLiNER2Extractor(BaseExtractorPlugin):
             )
 
             # Get entity types
-            subj_type = self._infer_entity_type(head, model, raw.source_sentence)
-            obj_type = self._infer_entity_type(tail, model, raw.source_sentence)
+            subj_type = self._infer_entity_type(head, model, sentence.text)
+            obj_type = self._infer_entity_type(tail, model, sentence.text)
             logger.debug(f"  Entity types: {subj_type.value}, {obj_type.value}")
 
             stmt = PipelineStatement(
@@ -409,7 +419,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
                     type=obj_type,
                     confidence=confidence,
                 ),
-                source_text=raw.source_sentence,
+                source_text=sentence.text,
                 confidence_score=confidence,
                 extraction_method="gliner_relation",
             )
@@ -419,7 +429,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
 
     def _extract_with_entities(
         self,
-        raw: RawTriple,
+        sentence: SplitSentence,
         model,
     ) -> Optional[PipelineStatement]:
         """
@@ -428,7 +438,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
         This method is called when predicates are disabled. Without GLiNER2 relation
         extraction, we cannot form valid statements.
         """
-        logger.debug(f"Entity extraction mode (no predicates) - skipping: '{raw.source_sentence[:60]}...'")
+        logger.debug(f"Entity extraction mode (no predicates) - skipping: '{sentence.text[:60]}...'")
         return None
 
     def _parse_relation(self, rel) -> tuple[str, str, float]:
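
The net effect of this file's changes is that the extractor's public interface now takes Stage 1 split sentences instead of pre-formed raw triples. A minimal usage sketch of the new call shape follows; the SplitSentence and PipelineContext constructor arguments are assumptions and are not shown in this diff.

from statement_extractor.models import SplitSentence, PipelineStatement
from statement_extractor.pipeline.context import PipelineContext
from statement_extractor.plugins.extractors.gliner2 import GLiNER2Extractor

# Hypothetical setup: constructor arguments for these classes are not part of this diff.
extractor = GLiNER2Extractor()
sentences = [SplitSentence(text="Acme Corp acquired Widget Ltd in 2023.")]
context = PipelineContext()

statements: list[PipelineStatement] = extractor.extract(sentences, context)
for stmt in statements:
    # Each PipelineStatement carries a subject-predicate-object triple with typed
    # entities, the originating sentence text, and a confidence score.
    print(stmt.source_text, stmt.confidence_score)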

statement_extractor/plugins/labelers/taxonomy.py:

@@ -8,9 +8,19 @@ there are too many possible values for simple multi-choice classification.
 import json
 import logging
 from pathlib import Path
-from typing import Optional
+from typing import Optional, TypedDict
 
 from ..base import BaseLabelerPlugin, TaxonomySchema, PluginCapability
+
+
+class TaxonomyEntry(TypedDict):
+    """Structure for each taxonomy label entry."""
+    description: str
+    id: int
+    mnli_label: str
+    embedding_label: str
+
+
 from ...pipeline.context import PipelineContext
 from ...models import (
     PipelineStatement,
@@ -214,7 +224,7 @@ class TaxonomyLabeler(BaseLabelerPlugin):
         self._top_k_categories = top_k_categories
         self._min_confidence = min_confidence
 
-        self._taxonomy: Optional[dict[str, dict[str, int]]] = None
+        self._taxonomy: Optional[dict[str, dict[str, TaxonomyEntry]]] = None
         self._classifier: Optional[TaxonomyClassifier] = None
 
     @property
@@ -250,7 +260,7 @@ class TaxonomyLabeler(BaseLabelerPlugin):
             scope="statement",
         )
 
-    def _load_taxonomy(self) -> dict[str, dict[str, int]]:
+    def _load_taxonomy(self) -> dict[str, dict[str, TaxonomyEntry]]:
         """Load taxonomy from JSON file."""
         if self._taxonomy is not None:
             return self._taxonomy
@@ -358,12 +368,15 @@ class TaxonomyLabeler(BaseLabelerPlugin):
         taxonomy = self._load_taxonomy()
 
         if category and category in taxonomy:
-            return taxonomy[category].get(label)
+            entry = taxonomy[category].get(label)
+            if entry:
+                return entry.get("id")
 
         # Search all categories for flat classification
         for cat_labels in taxonomy.values():
             if label in cat_labels:
-                return cat_labels[label]
+                entry = cat_labels[label]
+                return entry.get("id")
 
         return None
 
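
The taxonomy value type changes here from a bare integer id to a structured entry, so label-to-id lookups now dereference the entry dict. An illustrative before/after of the expected shape follows; the label names and field values are made up for illustration, not copied from statement_taxonomy.json.

# 0.5.0 shape: label maps directly to an integer id.
old_taxonomy = {"corporate_actions": {"acquisition": 17}}

# 0.9.3 shape: label maps to a TaxonomyEntry-style dict.
new_taxonomy = {
    "corporate_actions": {
        "acquisition": {
            "description": "One company takes ownership of another",
            "id": 17,
            "mnli_label": "This statement describes an acquisition",
            "embedding_label": "company acquisition",
        },
    },
}

# Lookup mirrors the updated logic above: fetch the entry, then read its "id".
entry = new_taxonomy["corporate_actions"].get("acquisition")
label_id = entry.get("id") if entry else None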

statement_extractor/plugins/labelers/taxonomy_embedding.py:

@@ -11,10 +11,19 @@ import json
 import logging
 import time
 from pathlib import Path
-from typing import Optional
+from typing import Optional, TypedDict
 
 import numpy as np
 
+
+class TaxonomyEntry(TypedDict):
+    """Structure for each taxonomy label entry."""
+    description: str
+    id: int
+    mnli_label: str
+    embedding_label: str
+
+
 from ..base import BaseLabelerPlugin, TaxonomySchema, PluginCapability
 from ...pipeline.context import PipelineContext
 from ...models import (
@@ -106,14 +115,14 @@ class EmbeddingClassifier:
 
     def precompute_label_embeddings(
         self,
-        taxonomy: dict[str, dict[str, int]],
+        taxonomy: dict[str, dict[str, TaxonomyEntry]],
         categories: Optional[list[str]] = None,
     ) -> None:
         """
         Pre-compute embeddings for all label names.
 
         Args:
-            taxonomy: Taxonomy dict {category: {label: id, ...}, ...}
+            taxonomy: Taxonomy dict {category: {label: TaxonomyEntry, ...}, ...}
             categories: Categories to include (default: all)
         """
         self._load_model()
@@ -314,7 +323,7 @@ class EmbeddingTaxonomyLabeler(BaseLabelerPlugin):
         self._top_k_categories = top_k_categories
         self._min_confidence = min_confidence
 
-        self._taxonomy: Optional[dict[str, dict[str, int]]] = None
+        self._taxonomy: Optional[dict[str, dict[str, TaxonomyEntry]]] = None
         self._classifier: Optional[EmbeddingClassifier] = None
         self._embeddings_computed = False
 
@@ -350,7 +359,7 @@ class EmbeddingTaxonomyLabeler(BaseLabelerPlugin):
             scope="statement",
         )
 
-    def _load_taxonomy(self) -> dict[str, dict[str, int]]:
+    def _load_taxonomy(self) -> dict[str, dict[str, TaxonomyEntry]]:
         """Load taxonomy from JSON file."""
         if self._taxonomy is not None:
             return self._taxonomy
@@ -456,7 +465,9 @@ class EmbeddingTaxonomyLabeler(BaseLabelerPlugin):
         taxonomy = self._load_taxonomy()
 
         if category in taxonomy:
-            return taxonomy[category].get(label)
+            entry = taxonomy[category].get(label)
+            if entry:
+                return entry.get("id")
 
         return None
 
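
EmbeddingClassifier.precompute_label_embeddings now receives the same entry-per-label structure. A short sketch of the call follows, assuming EmbeddingClassifier can be constructed without arguments (its constructor is not shown in this diff) and using an illustrative taxonomy value.

classifier = EmbeddingClassifier()  # constructor arguments assumed, not shown in this diff
taxonomy = {
    "corporate_actions": {
        "acquisition": {
            "description": "One company takes ownership of another",  # illustrative values
            "id": 17,
            "mnli_label": "This statement describes an acquisition",
            "embedding_label": "company acquisition",
        },
    },
}
# Pre-computes embeddings for every label name in the selected categories,
# per the docstring above; categories defaults to all when omitted.
classifier.precompute_label_embeddings(taxonomy, categories=["corporate_actions"])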

statement_extractor/plugins/pdf/__init__.py (new file):

@@ -0,0 +1,10 @@
+"""
+PDF parser plugins for extracting text from PDF files.
+
+Built-in parsers:
+- pypdf_parser: Default PDF parser using PyMuPDF with optional OCR
+"""
+
+from .pypdf import PyPDFParserPlugin
+
+__all__ = ["PyPDFParserPlugin"]

statement_extractor/plugins/pdf/pypdf.py (new file):

@@ -0,0 +1,291 @@
+"""
+PDF parser plugin using PyMuPDF (fitz) with optional OCR fallback.
+
+Extracts text from PDFs page by page, with automatic detection of
+image-heavy PDFs that may require OCR.
+"""
+
+import io
+import logging
+import os
+import tempfile
+from typing import Any, Optional
+
+from ..base import BasePDFParserPlugin, PDFParseResult
+from ...pipeline.registry import PluginRegistry
+
+logger = logging.getLogger(__name__)
+
+
+@PluginRegistry.pdf_parser
+class PyPDFParserPlugin(BasePDFParserPlugin):
+    """
+    PDF parser using PyMuPDF (fitz) with optional OCR fallback.
+
+    Features:
+    - Fast text extraction using PyMuPDF
+    - Automatic detection of image-heavy PDFs
+    - Optional OCR fallback using Tesseract
+    - Metadata extraction (title, author, etc.)
+    """
+
+    def __init__(
+        self,
+        image_threshold: float = 0.5,
+        text_threshold: float = 0.4,
+        use_ocr_fallback: bool = True,
+    ):
+        """
+        Initialize the PDF parser.
+
+        Args:
+            image_threshold: Images per page threshold for OCR trigger
+            text_threshold: Text density threshold (chars/1000 per page)
+            use_ocr_fallback: Enable automatic OCR for image-heavy PDFs
+        """
+        self._image_threshold = image_threshold
+        self._text_threshold = text_threshold
+        self._use_ocr_fallback = use_ocr_fallback
+
+    @property
+    def name(self) -> str:
+        return "pypdf_parser"
+
+    @property
+    def priority(self) -> int:
+        return 100
+
+    @property
+    def description(self) -> str:
+        return "PDF parser using PyMuPDF with optional OCR fallback"
+
+    @property
+    def supports_ocr(self) -> bool:
+        return self._use_ocr_fallback
+
+    def parse(
+        self,
+        pdf_bytes: bytes,
+        max_pages: int = 500,
+        use_ocr: bool = False,
+    ) -> PDFParseResult:
+        """
+        Extract text from PDF bytes.
+
+        Args:
+            pdf_bytes: Raw PDF file content
+            max_pages: Maximum number of pages to process
+            use_ocr: Force OCR even for text-extractable PDFs
+
+        Returns:
+            PDFParseResult with extracted text for each page
+        """
+        try:
+            import fitz  # PyMuPDF
+        except ImportError:
+            return PDFParseResult(
+                pages=[],
+                page_count=0,
+                error="PyMuPDF (fitz) not installed. Install with: pip install PyMuPDF",
+            )
+
+        temp_path: Optional[str] = None
+
+        try:
+            # Write bytes to temp file for fitz
+            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
+                f.write(pdf_bytes)
+                temp_path = f.name
+
+            logger.info(f"Parsing PDF: {len(pdf_bytes)} bytes")
+
+            # Open the PDF
+            pdf_doc = fitz.open(temp_path)
+            total_pages = len(pdf_doc)
+            logger.info(f"PDF has {total_pages} pages")
+
+            # Check if we should use OCR
+            should_ocr = use_ocr or (
+                self._use_ocr_fallback and self._is_mostly_images(pdf_doc)
+            )
+
+            if should_ocr:
+                logger.info("PDF appears image-heavy, using OCR")
+                result = self._parse_with_ocr(pdf_doc, max_pages)
+            else:
+                logger.info("PDF has extractable text, using direct extraction")
+                result = self._parse_with_fitz(pdf_doc, max_pages)
+
+            pdf_doc.close()
+            return result
+
+        except Exception as e:
+            logger.exception(f"Error parsing PDF: {e}")
+            return PDFParseResult(
+                pages=[],
+                page_count=0,
+                error=f"Failed to parse PDF: {e}",
+            )
+        finally:
+            # Clean up temp file
+            if temp_path and os.path.exists(temp_path):
+                try:
+                    os.unlink(temp_path)
+                except Exception:
+                    pass
+
+    def _is_mostly_images(self, pdf_doc) -> bool:
+        """
+        Check if PDF is mostly images (may need OCR).
+
+        Args:
+            pdf_doc: PyMuPDF document object
+
+        Returns:
+            True if PDF appears to be image-heavy
+        """
+        total_pages = len(pdf_doc)
+        if total_pages == 0:
+            return False
+
+        # Count images in first few pages
+        sample_pages = min(3, total_pages)
+        image_count = 0
+        for i in range(sample_pages):
+            image_count += len(pdf_doc[i].get_images())
+
+        avg_images_per_page = image_count / sample_pages
+
+        # Check text density in sample pages
+        sample_text = ""
+        for i in range(sample_pages):
+            sample_text += pdf_doc[i].get_text()
+
+        text_density = len(sample_text) / 1000 / sample_pages
+
+        logger.debug(
+            f"PDF analysis: {avg_images_per_page:.1f} images/page, "
+            f"{text_density:.2f} text density"
+        )
+
+        # If text density is high, don't use OCR
+        if text_density > self._text_threshold:
+            return False
+
+        # If many images per page and low text, probably needs OCR
+        return avg_images_per_page > self._image_threshold
+
+    def _parse_with_fitz(self, pdf_doc, max_pages: int) -> PDFParseResult:
+        """
+        Extract text using PyMuPDF (fast, direct extraction).
+
+        Args:
+            pdf_doc: PyMuPDF document object
+            max_pages: Maximum pages to process
+
+        Returns:
+            PDFParseResult with extracted text
+        """
+        pages = []
+        total_pages = len(pdf_doc)
+
+        for i in range(min(total_pages, max_pages)):
+            page = pdf_doc[i]
+            text = page.get_text()
+            pages.append(text.strip())
+
+            if (i + 1) % 50 == 0:
+                logger.debug(f"Processed {i + 1}/{min(total_pages, max_pages)} pages")
+
+        # Extract metadata
+        metadata = self._extract_metadata(pdf_doc)
+
+        return PDFParseResult(
+            pages=pages,
+            page_count=total_pages,
+            metadata=metadata,
+        )
+
+    def _parse_with_ocr(self, pdf_doc, max_pages: int) -> PDFParseResult:
+        """
+        Extract text using OCR (Tesseract).
+
+        Args:
+            pdf_doc: PyMuPDF document object
+            max_pages: Maximum pages to process
+
+        Returns:
+            PDFParseResult with OCR-extracted text
+        """
+        try:
+            import pytesseract
+            from PIL import Image
+        except ImportError:
+            return PDFParseResult(
+                pages=[],
+                page_count=len(pdf_doc),
+                error="OCR dependencies not installed. Install with: pip install pytesseract Pillow",
+            )
+
+        pages = []
+        total_pages = len(pdf_doc)
+
+        for i in range(min(total_pages, max_pages)):
+            page = pdf_doc[i]
+
+            # Render page to image
+            pix = page.get_pixmap(dpi=150)  # 150 DPI is good balance
+            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
+
+            # Run OCR
+            text = pytesseract.image_to_string(img)
+            pages.append(text.strip())
+
+            if (i + 1) % 10 == 0:
+                logger.debug(f"OCR processed {i + 1}/{min(total_pages, max_pages)} pages")
+
+        # Extract metadata
+        metadata = self._extract_metadata(pdf_doc)
+
+        return PDFParseResult(
+            pages=pages,
+            page_count=total_pages,
+            metadata=metadata,
+        )
+
+    @staticmethod
+    def _extract_metadata(pdf_doc) -> dict[str, Any]:
+        """
+        Extract PDF metadata.
+
+        Args:
+            pdf_doc: PyMuPDF document object
+
+        Returns:
+            Dictionary of metadata fields
+        """
+        metadata = {}
+
+        try:
+            doc_metadata = pdf_doc.metadata
+            if doc_metadata:
+                # Map common PDF metadata fields
+                field_map = {
+                    "title": "title",
+                    "author": "author",
+                    "subject": "subject",
+                    "keywords": "keywords",
+                    "creator": "creator",
+                    "producer": "producer",
+                    "creationDate": "created",
+                    "modDate": "modified",
+                }
+
+                for pdf_key, our_key in field_map.items():
+                    value = doc_metadata.get(pdf_key)
+                    if value and isinstance(value, str) and value.strip():
+                        metadata[our_key] = value.strip()
+        except Exception as e:
+            logger.debug(f"Error extracting metadata: {e}")
+
+        return metadata
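
A short usage sketch for the new parser plugin; the parse() signature and the PDFParseResult fields (pages, page_count, error, metadata) are the ones used in the code above, while the input file name is only an example.

from pathlib import Path

from statement_extractor.plugins.pdf import PyPDFParserPlugin

parser = PyPDFParserPlugin(use_ocr_fallback=True)
pdf_bytes = Path("annual_report.pdf").read_bytes()  # hypothetical input file

result = parser.parse(pdf_bytes, max_pages=100)
if result.error:
    print(f"Parse failed: {result.error}")
else:
    # page_count is the total pages in the PDF; pages holds per-page text,
    # capped at max_pages.
    print(f"Extracted {len(result.pages)} of {result.page_count} pages")
    print(f"Title: {result.metadata.get('title', 'unknown')}")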

statement_extractor/plugins/qualifiers/__init__.py:

@@ -6,6 +6,15 @@ Adds qualifiers and identifiers to entities.
 
 from .base import BaseQualifierPlugin
 from .person import PersonQualifierPlugin
+
+# Import embedding qualifier (may fail if database module not available)
+try:
+    from .embedding_company import EmbeddingCompanyQualifier
+except ImportError:
+    EmbeddingCompanyQualifier = None  # type: ignore
+
+# DEPRECATED: These API-based qualifiers are deprecated in favor of EmbeddingCompanyQualifier
+# They are no longer auto-registered with the plugin registry.
 from .gleif import GLEIFQualifierPlugin
 from .companies_house import CompaniesHouseQualifierPlugin
 from .sec_edgar import SECEdgarQualifierPlugin
@@ -13,6 +22,8 @@ from .sec_edgar import SECEdgarQualifierPlugin
 __all__ = [
     "BaseQualifierPlugin",
     "PersonQualifierPlugin",
+    "EmbeddingCompanyQualifier",
+    # Deprecated - kept for backwards compatibility
     "GLEIFQualifierPlugin",
     "CompaniesHouseQualifierPlugin",
     "SECEdgarQualifierPlugin",