corp-extractor 0.5.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +1227 -10
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +520 -0
  9. statement_extractor/database/importers/__init__.py +24 -0
  10. statement_extractor/database/importers/companies_house.py +545 -0
  11. statement_extractor/database/importers/gleif.py +538 -0
  12. statement_extractor/database/importers/sec_edgar.py +375 -0
  13. statement_extractor/database/importers/wikidata.py +1012 -0
  14. statement_extractor/database/importers/wikidata_people.py +632 -0
  15. statement_extractor/database/models.py +230 -0
  16. statement_extractor/database/resolver.py +245 -0
  17. statement_extractor/database/store.py +1609 -0
  18. statement_extractor/document/__init__.py +62 -0
  19. statement_extractor/document/chunker.py +410 -0
  20. statement_extractor/document/context.py +171 -0
  21. statement_extractor/document/deduplicator.py +173 -0
  22. statement_extractor/document/html_extractor.py +246 -0
  23. statement_extractor/document/loader.py +303 -0
  24. statement_extractor/document/pipeline.py +388 -0
  25. statement_extractor/document/summarizer.py +195 -0
  26. statement_extractor/models/__init__.py +16 -1
  27. statement_extractor/models/canonical.py +44 -1
  28. statement_extractor/models/document.py +308 -0
  29. statement_extractor/models/labels.py +47 -18
  30. statement_extractor/models/qualifiers.py +51 -3
  31. statement_extractor/models/statement.py +26 -0
  32. statement_extractor/pipeline/config.py +6 -11
  33. statement_extractor/pipeline/orchestrator.py +80 -111
  34. statement_extractor/pipeline/registry.py +52 -46
  35. statement_extractor/plugins/__init__.py +20 -8
  36. statement_extractor/plugins/base.py +334 -64
  37. statement_extractor/plugins/extractors/gliner2.py +10 -0
  38. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  39. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  40. statement_extractor/plugins/pdf/__init__.py +10 -0
  41. statement_extractor/plugins/pdf/pypdf.py +291 -0
  42. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  43. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  44. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  45. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  46. statement_extractor/plugins/qualifiers/person.py +578 -14
  47. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  48. statement_extractor/plugins/scrapers/__init__.py +10 -0
  49. statement_extractor/plugins/scrapers/http.py +236 -0
  50. statement_extractor/plugins/splitters/t5_gemma.py +158 -53
  51. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  52. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  53. statement_extractor/scoring.py +8 -8
  54. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  55. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  56. statement_extractor/plugins/canonicalizers/base.py +0 -9
  57. statement_extractor/plugins/canonicalizers/location.py +0 -219
  58. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  59. statement_extractor/plugins/canonicalizers/person.py +0 -242
  60. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  61. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,291 @@
1
+ """
2
+ PDF parser plugin using PyMuPDF (fitz) with optional OCR fallback.
3
+
4
+ Extracts text from PDFs page by page, with automatic detection of
5
+ image-heavy PDFs that may require OCR.
6
+ """
7
+
8
+ import io
9
+ import logging
10
+ import os
11
+ import tempfile
12
+ from typing import Any, Optional
13
+
14
+ from ..base import BasePDFParserPlugin, PDFParseResult
15
+ from ...pipeline.registry import PluginRegistry
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
@PluginRegistry.pdf_parser
class PyPDFParserPlugin(BasePDFParserPlugin):
    """
    PDF parser using PyMuPDF (fitz) with optional OCR fallback.

    Features:
    - Fast text extraction using PyMuPDF
    - Automatic detection of image-heavy PDFs
    - Optional OCR fallback using Tesseract
    - Metadata extraction (title, author, etc.)
    """

    def __init__(
        self,
        image_threshold: float = 0.5,
        text_threshold: float = 0.4,
        use_ocr_fallback: bool = True,
    ):
        """
        Initialize the PDF parser.

        Args:
            image_threshold: Average images per sampled page above which a
                low-text PDF is treated as image-heavy (OCR trigger)
            text_threshold: Text density (characters / 1000 per sampled page)
                above which OCR is never triggered automatically
            use_ocr_fallback: Enable automatic OCR for image-heavy PDFs
        """
        self._image_threshold = image_threshold
        self._text_threshold = text_threshold
        self._use_ocr_fallback = use_ocr_fallback

    @property
    def name(self) -> str:
        return "pypdf_parser"

    @property
    def priority(self) -> int:
        return 100

    @property
    def description(self) -> str:
        return "PDF parser using PyMuPDF with optional OCR fallback"

    @property
    def supports_ocr(self) -> bool:
        return self._use_ocr_fallback

    def parse(
        self,
        pdf_bytes: bytes,
        max_pages: int = 500,
        use_ocr: bool = False,
    ) -> PDFParseResult:
        """
        Extract text from PDF bytes.

        The bytes are written to a temporary file, opened with PyMuPDF and
        parsed either by direct text extraction or by OCR. The document
        handle and the temporary file are always released, including when
        parsing fails part-way through.

        Args:
            pdf_bytes: Raw PDF file content
            max_pages: Maximum number of pages to process
            use_ocr: Force OCR even for text-extractable PDFs

        Returns:
            PDFParseResult with extracted text for each page. On failure the
            result has no pages, page_count=0 and ``error`` set; exceptions
            are logged and not propagated.
        """
        try:
            import fitz  # PyMuPDF
        except ImportError:
            return PDFParseResult(
                pages=[],
                page_count=0,
                error="PyMuPDF (fitz) not installed. Install with: pip install PyMuPDF",
            )

        temp_path: Optional[str] = None
        pdf_doc = None

        try:
            # Write bytes to temp file for fitz. Record the path *before*
            # writing so the file is still cleaned up if the write fails.
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
                temp_path = f.name
                f.write(pdf_bytes)

            logger.info(f"Parsing PDF: {len(pdf_bytes)} bytes")

            # Open the PDF
            pdf_doc = fitz.open(temp_path)
            total_pages = len(pdf_doc)
            logger.info(f"PDF has {total_pages} pages")

            # Forced OCR short-circuits the (comparatively costly) sampling
            # heuristic; otherwise OCR is used only when the fallback is
            # enabled and the document looks image-heavy.
            if use_ocr:
                logger.info("OCR explicitly requested, using OCR")
                return self._parse_with_ocr(pdf_doc, max_pages)

            if self._use_ocr_fallback and self._is_mostly_images(pdf_doc):
                logger.info("PDF appears image-heavy, using OCR")
                return self._parse_with_ocr(pdf_doc, max_pages)

            logger.info("PDF has extractable text, using direct extraction")
            return self._parse_with_fitz(pdf_doc, max_pages)

        except Exception as e:
            logger.exception(f"Error parsing PDF: {e}")
            return PDFParseResult(
                pages=[],
                page_count=0,
                error=f"Failed to parse PDF: {e}",
            )
        finally:
            # Close the document first: an open handle leaks resources and,
            # on Windows, prevents the temp file from being deleted.
            if pdf_doc is not None:
                try:
                    pdf_doc.close()
                except Exception as close_err:
                    logger.debug(f"Error closing PDF document: {close_err}")

            # Clean up temp file (best effort; never mask the parse result)
            if temp_path is not None:
                try:
                    os.unlink(temp_path)
                except FileNotFoundError:
                    pass
                except OSError as unlink_err:
                    logger.debug(
                        f"Could not remove temp file {temp_path}: {unlink_err}"
                    )

    def _is_mostly_images(self, pdf_doc) -> bool:
        """
        Check if PDF is mostly images (may need OCR).

        Samples up to the first three pages and computes the average number
        of embedded images per page and the text density
        (characters / 1000 per page).

        Args:
            pdf_doc: PyMuPDF document object

        Returns:
            True if PDF appears to be image-heavy: text density is not above
            the text threshold and the average image count per page exceeds
            the image threshold. Always False for an empty document.
        """
        total_pages = len(pdf_doc)
        if total_pages == 0:
            return False

        # Inspect only the first few pages to keep detection cheap
        sample_pages = min(3, total_pages)
        image_count = 0
        text_length = 0
        for i in range(sample_pages):
            page = pdf_doc[i]
            image_count += len(page.get_images())
            text_length += len(page.get_text())

        avg_images_per_page = image_count / sample_pages
        text_density = text_length / 1000 / sample_pages

        logger.debug(
            f"PDF analysis: {avg_images_per_page:.1f} images/page, "
            f"{text_density:.2f} text density"
        )

        # If text density is high, don't use OCR
        if text_density > self._text_threshold:
            return False

        # If many images per page and low text, probably needs OCR
        return avg_images_per_page > self._image_threshold

    def _parse_with_fitz(self, pdf_doc, max_pages: int) -> PDFParseResult:
        """
        Extract text using PyMuPDF (fast, direct extraction).

        Args:
            pdf_doc: PyMuPDF document object
            max_pages: Maximum pages to process

        Returns:
            PDFParseResult with the stripped text of each processed page.
            ``page_count`` is the total number of pages in the document,
            which may exceed ``len(pages)`` when ``max_pages`` truncates.
        """
        total_pages = len(pdf_doc)
        # Clamp so a negative max_pages simply yields no pages
        page_limit = max(0, min(total_pages, max_pages))

        pages = []
        for i in range(page_limit):
            pages.append(pdf_doc[i].get_text().strip())

            if (i + 1) % 50 == 0:
                logger.debug(f"Processed {i + 1}/{page_limit} pages")

        # Extract metadata
        metadata = self._extract_metadata(pdf_doc)

        return PDFParseResult(
            pages=pages,
            page_count=total_pages,
            metadata=metadata,
        )

    def _parse_with_ocr(self, pdf_doc, max_pages: int) -> PDFParseResult:
        """
        Extract text using OCR (Tesseract).

        Each page is rendered to a bitmap at 150 DPI and passed to
        pytesseract.

        Args:
            pdf_doc: PyMuPDF document object
            max_pages: Maximum pages to process

        Returns:
            PDFParseResult with OCR-extracted text. If pytesseract or Pillow
            cannot be imported, a result with no pages and ``error`` set is
            returned instead. ``page_count`` is the total number of pages in
            the document, regardless of ``max_pages``.
        """
        try:
            import pytesseract
            from PIL import Image
        except ImportError:
            return PDFParseResult(
                pages=[],
                page_count=len(pdf_doc),
                error="OCR dependencies not installed. Install with: pip install pytesseract Pillow",
            )

        total_pages = len(pdf_doc)
        # Clamp so a negative max_pages simply yields no pages
        page_limit = max(0, min(total_pages, max_pages))

        pages = []
        for i in range(page_limit):
            page = pdf_doc[i]

            # Render page to image
            pix = page.get_pixmap(dpi=150)  # 150 DPI is good balance
            # NOTE(review): "RGB" assumes the pixmap is 3-channel without
            # alpha (believed to be the get_pixmap default) - confirm.
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            # Run OCR
            text = pytesseract.image_to_string(img)
            pages.append(text.strip())

            if (i + 1) % 10 == 0:
                logger.debug(f"OCR processed {i + 1}/{page_limit} pages")

        # Extract metadata
        metadata = self._extract_metadata(pdf_doc)

        return PDFParseResult(
            pages=pages,
            page_count=total_pages,
            metadata=metadata,
        )

    @staticmethod
    def _extract_metadata(pdf_doc) -> dict[str, Any]:
        """
        Extract PDF metadata.

        Only non-empty string values are kept; they are stripped and stored
        under normalized key names (e.g. ``creationDate`` -> ``created``).

        Args:
            pdf_doc: PyMuPDF document object

        Returns:
            Dictionary of metadata fields. Empty if the document has no
            metadata or reading it fails (failures are logged at debug
            level, never raised).
        """
        metadata: dict[str, Any] = {}

        try:
            doc_metadata = pdf_doc.metadata
            if doc_metadata:
                # Map common PDF metadata fields
                field_map = {
                    "title": "title",
                    "author": "author",
                    "subject": "subject",
                    "keywords": "keywords",
                    "creator": "creator",
                    "producer": "producer",
                    "creationDate": "created",
                    "modDate": "modified",
                }

                for pdf_key, our_key in field_map.items():
                    value = doc_metadata.get(pdf_key)
                    if value and isinstance(value, str) and value.strip():
                        metadata[our_key] = value.strip()
        except Exception as e:
            logger.debug(f"Error extracting metadata: {e}")

        return metadata
@@ -6,6 +6,15 @@ Adds qualifiers and identifiers to entities.
6
6
 
7
7
  from .base import BaseQualifierPlugin
8
8
  from .person import PersonQualifierPlugin
9
+
10
+ # Import embedding qualifier (may fail if database module not available)
11
+ try:
12
+ from .embedding_company import EmbeddingCompanyQualifier
13
+ except ImportError:
14
+ EmbeddingCompanyQualifier = None # type: ignore
15
+
16
+ # DEPRECATED: These API-based qualifiers are deprecated in favor of EmbeddingCompanyQualifier
17
+ # They are no longer auto-registered with the plugin registry.
9
18
  from .gleif import GLEIFQualifierPlugin
10
19
  from .companies_house import CompaniesHouseQualifierPlugin
11
20
  from .sec_edgar import SECEdgarQualifierPlugin
@@ -13,6 +22,8 @@ from .sec_edgar import SECEdgarQualifierPlugin
13
22
  __all__ = [
14
23
  "BaseQualifierPlugin",
15
24
  "PersonQualifierPlugin",
25
+ "EmbeddingCompanyQualifier",
26
+ # Deprecated - kept for backwards compatibility
16
27
  "GLEIFQualifierPlugin",
17
28
  "CompaniesHouseQualifierPlugin",
18
29
  "SECEdgarQualifierPlugin",
@@ -1,6 +1,9 @@
1
1
  """
2
2
  CompaniesHouseQualifierPlugin - Qualifies UK ORG entities.
3
3
 
4
+ DEPRECATED: Use EmbeddingCompanyQualifier instead, which uses a local
5
+ embedding database with pre-loaded Companies House data for faster, offline matching.
6
+
4
7
  Uses the UK Companies House API to:
5
8
  - Look up company number by name
6
9
  - Retrieve company details, jurisdiction, officers
@@ -8,11 +11,11 @@ Uses the UK Companies House API to:
8
11
 
9
12
  import logging
10
13
  import os
14
+ import warnings
11
15
  from typing import Optional
12
16
 
13
17
  from ..base import BaseQualifierPlugin, PluginCapability
14
18
  from ...pipeline.context import PipelineContext
15
- from ...pipeline.registry import PluginRegistry
16
19
  from ...models import ExtractedEntity, EntityQualifiers, EntityType
17
20
 
18
21
  logger = logging.getLogger(__name__)
@@ -21,11 +24,12 @@ logger = logging.getLogger(__name__)
21
24
  CH_API_BASE = "https://api.company-information.service.gov.uk"
22
25
 
23
26
 
24
- @PluginRegistry.qualifier
27
+ # DEPRECATED: Not auto-registered. Use EmbeddingCompanyQualifier instead.
25
28
  class CompaniesHouseQualifierPlugin(BaseQualifierPlugin):
26
29
  """
27
- Qualifier plugin for UK ORG entities using Companies House API.
30
+ DEPRECATED: Use EmbeddingCompanyQualifier instead.
28
31
 
32
+ Qualifier plugin for UK ORG entities using Companies House API.
29
33
  Requires COMPANIES_HOUSE_API_KEY environment variable.
30
34
  """
31
35
 
@@ -38,11 +42,18 @@ class CompaniesHouseQualifierPlugin(BaseQualifierPlugin):
38
42
  """
39
43
  Initialize the Companies House qualifier.
40
44
 
45
+ DEPRECATED: Use EmbeddingCompanyQualifier instead.
46
+
41
47
  Args:
42
48
  api_key: Companies House API key (or use COMPANIES_HOUSE_API_KEY env var)
43
49
  timeout: API request timeout in seconds
44
50
  cache_results: Whether to cache API results
45
51
  """
52
+ warnings.warn(
53
+ "CompaniesHouseQualifierPlugin is deprecated. Use EmbeddingCompanyQualifier instead.",
54
+ DeprecationWarning,
55
+ stacklevel=2,
56
+ )
46
57
  self._api_key = api_key or os.environ.get("COMPANIES_HOUSE_API_KEY")
47
58
  self._timeout = timeout
48
59
  self._cache_results = cache_results