corp-extractor 0.5.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
- corp_extractor-0.9.0.dist-info/RECORD +76 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +1227 -10
- statement_extractor/data/statement_taxonomy.json +6949 -1159
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +520 -0
- statement_extractor/database/importers/__init__.py +24 -0
- statement_extractor/database/importers/companies_house.py +545 -0
- statement_extractor/database/importers/gleif.py +538 -0
- statement_extractor/database/importers/sec_edgar.py +375 -0
- statement_extractor/database/importers/wikidata.py +1012 -0
- statement_extractor/database/importers/wikidata_people.py +632 -0
- statement_extractor/database/models.py +230 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +1609 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +173 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/models/__init__.py +16 -1
- statement_extractor/models/canonical.py +44 -1
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/labels.py +47 -18
- statement_extractor/models/qualifiers.py +51 -3
- statement_extractor/models/statement.py +26 -0
- statement_extractor/pipeline/config.py +6 -11
- statement_extractor/pipeline/orchestrator.py +80 -111
- statement_extractor/pipeline/registry.py +52 -46
- statement_extractor/plugins/__init__.py +20 -8
- statement_extractor/plugins/base.py +334 -64
- statement_extractor/plugins/extractors/gliner2.py +10 -0
- statement_extractor/plugins/labelers/taxonomy.py +18 -5
- statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +11 -0
- statement_extractor/plugins/qualifiers/companies_house.py +14 -3
- statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
- statement_extractor/plugins/qualifiers/gleif.py +14 -3
- statement_extractor/plugins/qualifiers/person.py +578 -14
- statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/t5_gemma.py +158 -53
- statement_extractor/plugins/taxonomy/embedding.py +193 -46
- statement_extractor/plugins/taxonomy/mnli.py +16 -4
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.5.0.dist-info/RECORD +0 -55
- statement_extractor/plugins/canonicalizers/__init__.py +0 -17
- statement_extractor/plugins/canonicalizers/base.py +0 -9
- statement_extractor/plugins/canonicalizers/location.py +0 -219
- statement_extractor/plugins/canonicalizers/organization.py +0 -230
- statement_extractor/plugins/canonicalizers/person.py +0 -242
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PDF parser plugin using PyMuPDF (fitz) with optional OCR fallback.
|
|
3
|
+
|
|
4
|
+
Extracts text from PDFs page by page, with automatic detection of
|
|
5
|
+
image-heavy PDFs that may require OCR.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import io
|
|
9
|
+
import logging
|
|
10
|
+
import os
|
|
11
|
+
import tempfile
|
|
12
|
+
from typing import Any, Optional
|
|
13
|
+
|
|
14
|
+
from ..base import BasePDFParserPlugin, PDFParseResult
|
|
15
|
+
from ...pipeline.registry import PluginRegistry
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@PluginRegistry.pdf_parser
class PyPDFParserPlugin(BasePDFParserPlugin):
    """
    PDF parser using PyMuPDF (fitz) with optional OCR fallback.

    Features:
    - Fast text extraction using PyMuPDF
    - Automatic detection of image-heavy PDFs
    - Optional OCR fallback using Tesseract
    - Metadata extraction (title, author, etc.)
    """

    def __init__(
        self,
        image_threshold: float = 0.5,
        text_threshold: float = 0.4,
        use_ocr_fallback: bool = True,
    ):
        """
        Initialize the PDF parser.

        Args:
            image_threshold: Average images per sampled page above which a
                low-text PDF is treated as image-heavy (OCR trigger)
            text_threshold: Text density (characters / 1000, per sampled page)
                above which OCR is never triggered automatically
            use_ocr_fallback: Enable automatic OCR for image-heavy PDFs
        """
        self._image_threshold = image_threshold
        self._text_threshold = text_threshold
        self._use_ocr_fallback = use_ocr_fallback

    @property
    def name(self) -> str:
        return "pypdf_parser"

    @property
    def priority(self) -> int:
        return 100

    @property
    def description(self) -> str:
        return "PDF parser using PyMuPDF with optional OCR fallback"

    @property
    def supports_ocr(self) -> bool:
        return self._use_ocr_fallback

    def parse(
        self,
        pdf_bytes: bytes,
        max_pages: int = 500,
        use_ocr: bool = False,
    ) -> PDFParseResult:
        """
        Extract text from PDF bytes.

        The bytes are spooled to a temporary file, opened with PyMuPDF and
        parsed either directly or via OCR. The document handle and the
        temporary file are always released, including on failure.

        Args:
            pdf_bytes: Raw PDF file content
            max_pages: Maximum number of pages to process
            use_ocr: Force OCR even for text-extractable PDFs

        Returns:
            PDFParseResult with extracted text for each page. On any failure
            (missing dependency, unreadable PDF, ...) the result has no pages
            and its ``error`` field describes the problem; no exception is
            propagated to the caller.
        """
        try:
            import fitz  # PyMuPDF
        except ImportError:
            return PDFParseResult(
                pages=[],
                page_count=0,
                error="PyMuPDF (fitz) not installed. Install with: pip install PyMuPDF",
            )

        temp_path: Optional[str] = None
        pdf_doc = None

        try:
            # Write bytes to temp file for fitz
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
                # Record the path before writing so that a failed write still
                # gets cleaned up in the ``finally`` block below.
                temp_path = f.name
                f.write(pdf_bytes)

            logger.info(f"Parsing PDF: {len(pdf_bytes)} bytes")

            # Open the PDF
            pdf_doc = fitz.open(temp_path)
            total_pages = len(pdf_doc)
            logger.info(f"PDF has {total_pages} pages")

            # An explicit request wins; the image-heavy heuristic only runs
            # when OCR was not forced and the fallback is enabled.
            if use_ocr:
                logger.info("OCR explicitly requested, using OCR")
                return self._parse_with_ocr(pdf_doc, max_pages)

            if self._use_ocr_fallback and self._is_mostly_images(pdf_doc):
                logger.info("PDF appears image-heavy, using OCR")
                return self._parse_with_ocr(pdf_doc, max_pages)

            logger.info("PDF has extractable text, using direct extraction")
            return self._parse_with_fitz(pdf_doc, max_pages)

        except Exception as e:
            # Top-level boundary: callers expect an error result, not a raise.
            logger.exception(f"Error parsing PDF: {e}")
            return PDFParseResult(
                pages=[],
                page_count=0,
                error=f"Failed to parse PDF: {e}",
            )
        finally:
            # Close the document first: an open handle keeps the temp file
            # locked on some platforms, which would make the unlink fail.
            if pdf_doc is not None:
                try:
                    pdf_doc.close()
                except Exception:
                    logger.debug("Error closing PDF document", exc_info=True)

            # Clean up temp file (best effort)
            if temp_path is not None:
                try:
                    os.unlink(temp_path)
                except FileNotFoundError:
                    pass
                except OSError:
                    logger.debug(
                        "Could not remove temporary PDF file", exc_info=True
                    )

    def _is_mostly_images(self, pdf_doc) -> bool:
        """
        Check if PDF is mostly images (may need OCR).

        Samples up to the first three pages and compares their average image
        count and text density against the configured thresholds.

        Args:
            pdf_doc: PyMuPDF document object

        Returns:
            True if PDF appears to be image-heavy
        """
        total_pages = len(pdf_doc)
        if total_pages == 0:
            return False

        # Inspect only the first few pages to keep the check cheap
        sample_pages = min(3, total_pages)
        image_count = 0
        text_chars = 0
        for i in range(sample_pages):
            page = pdf_doc[i]
            image_count += len(page.get_images())
            text_chars += len(page.get_text())

        avg_images_per_page = image_count / sample_pages
        # Thousands of characters per sampled page
        text_density = text_chars / 1000 / sample_pages

        logger.debug(
            f"PDF analysis: {avg_images_per_page:.1f} images/page, "
            f"{text_density:.2f} text density"
        )

        # If text density is high, don't use OCR
        if text_density > self._text_threshold:
            return False

        # If many images per page and low text, probably needs OCR
        return avg_images_per_page > self._image_threshold

    def _parse_with_fitz(self, pdf_doc, max_pages: int) -> PDFParseResult:
        """
        Extract text using PyMuPDF (fast, direct extraction).

        Args:
            pdf_doc: PyMuPDF document object
            max_pages: Maximum pages to process

        Returns:
            PDFParseResult with extracted text; ``page_count`` is the total
            number of pages in the document, which may exceed ``len(pages)``
            when ``max_pages`` truncates processing.
        """
        pages = []
        total_pages = len(pdf_doc)
        pages_to_process = min(total_pages, max_pages)

        for i in range(pages_to_process):
            pages.append(pdf_doc[i].get_text().strip())

            if (i + 1) % 50 == 0:
                logger.debug(f"Processed {i + 1}/{pages_to_process} pages")

        # Extract metadata
        metadata = self._extract_metadata(pdf_doc)

        return PDFParseResult(
            pages=pages,
            page_count=total_pages,
            metadata=metadata,
        )

    def _parse_with_ocr(self, pdf_doc, max_pages: int) -> PDFParseResult:
        """
        Extract text using OCR (Tesseract).

        Args:
            pdf_doc: PyMuPDF document object
            max_pages: Maximum pages to process

        Returns:
            PDFParseResult with OCR-extracted text, or a result carrying an
            ``error`` message when the OCR dependencies are not installed.
        """
        try:
            import pytesseract
            from PIL import Image
        except ImportError:
            return PDFParseResult(
                pages=[],
                page_count=len(pdf_doc),
                error="OCR dependencies not installed. Install with: pip install pytesseract Pillow",
            )

        pages = []
        total_pages = len(pdf_doc)
        pages_to_process = min(total_pages, max_pages)

        for i in range(pages_to_process):
            page = pdf_doc[i]

            # Render page to image
            pix = page.get_pixmap(dpi=150)  # 150 DPI is good balance
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            # Run OCR
            text = pytesseract.image_to_string(img)
            pages.append(text.strip())

            if (i + 1) % 10 == 0:
                logger.debug(f"OCR processed {i + 1}/{pages_to_process} pages")

        # Extract metadata
        metadata = self._extract_metadata(pdf_doc)

        return PDFParseResult(
            pages=pages,
            page_count=total_pages,
            metadata=metadata,
        )

    @staticmethod
    def _extract_metadata(pdf_doc) -> dict[str, Any]:
        """
        Extract PDF metadata.

        Only non-empty string values are kept, stripped of surrounding
        whitespace. Errors while reading metadata are logged and result in
        whatever was collected so far (possibly an empty dict).

        Args:
            pdf_doc: PyMuPDF document object

        Returns:
            Dictionary of metadata fields
        """
        metadata: dict[str, Any] = {}

        try:
            doc_metadata = pdf_doc.metadata
            if doc_metadata:
                # Map common PDF metadata fields to our own key names
                field_map = {
                    "title": "title",
                    "author": "author",
                    "subject": "subject",
                    "keywords": "keywords",
                    "creator": "creator",
                    "producer": "producer",
                    "creationDate": "created",
                    "modDate": "modified",
                }

                for pdf_key, our_key in field_map.items():
                    value = doc_metadata.get(pdf_key)
                    if value and isinstance(value, str) and value.strip():
                        metadata[our_key] = value.strip()
        except Exception as e:
            logger.debug(f"Error extracting metadata: {e}")

        return metadata
|
|
@@ -6,6 +6,15 @@ Adds qualifiers and identifiers to entities.
|
|
|
6
6
|
|
|
7
7
|
from .base import BaseQualifierPlugin
|
|
8
8
|
from .person import PersonQualifierPlugin
|
|
9
|
+
|
|
10
|
+
# Import embedding qualifier (may fail if database module not available)
|
|
11
|
+
try:
|
|
12
|
+
from .embedding_company import EmbeddingCompanyQualifier
|
|
13
|
+
except ImportError:
|
|
14
|
+
EmbeddingCompanyQualifier = None # type: ignore
|
|
15
|
+
|
|
16
|
+
# DEPRECATED: These API-based qualifiers are deprecated in favor of EmbeddingCompanyQualifier
|
|
17
|
+
# They are no longer auto-registered with the plugin registry.
|
|
9
18
|
from .gleif import GLEIFQualifierPlugin
|
|
10
19
|
from .companies_house import CompaniesHouseQualifierPlugin
|
|
11
20
|
from .sec_edgar import SECEdgarQualifierPlugin
|
|
@@ -13,6 +22,8 @@ from .sec_edgar import SECEdgarQualifierPlugin
|
|
|
13
22
|
__all__ = [
|
|
14
23
|
"BaseQualifierPlugin",
|
|
15
24
|
"PersonQualifierPlugin",
|
|
25
|
+
"EmbeddingCompanyQualifier",
|
|
26
|
+
# Deprecated - kept for backwards compatibility
|
|
16
27
|
"GLEIFQualifierPlugin",
|
|
17
28
|
"CompaniesHouseQualifierPlugin",
|
|
18
29
|
"SECEdgarQualifierPlugin",
|
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
"""
|
|
2
2
|
CompaniesHouseQualifierPlugin - Qualifies UK ORG entities.
|
|
3
3
|
|
|
4
|
+
DEPRECATED: Use EmbeddingCompanyQualifier instead, which uses a local
|
|
5
|
+
embedding database with pre-loaded Companies House data for faster, offline matching.
|
|
6
|
+
|
|
4
7
|
Uses the UK Companies House API to:
|
|
5
8
|
- Look up company number by name
|
|
6
9
|
- Retrieve company details, jurisdiction, officers
|
|
@@ -8,11 +11,11 @@ Uses the UK Companies House API to:
|
|
|
8
11
|
|
|
9
12
|
import logging
|
|
10
13
|
import os
|
|
14
|
+
import warnings
|
|
11
15
|
from typing import Optional
|
|
12
16
|
|
|
13
17
|
from ..base import BaseQualifierPlugin, PluginCapability
|
|
14
18
|
from ...pipeline.context import PipelineContext
|
|
15
|
-
from ...pipeline.registry import PluginRegistry
|
|
16
19
|
from ...models import ExtractedEntity, EntityQualifiers, EntityType
|
|
17
20
|
|
|
18
21
|
logger = logging.getLogger(__name__)
|
|
@@ -21,11 +24,12 @@ logger = logging.getLogger(__name__)
|
|
|
21
24
|
CH_API_BASE = "https://api.company-information.service.gov.uk"
|
|
22
25
|
|
|
23
26
|
|
|
24
|
-
|
|
27
|
+
# DEPRECATED: Not auto-registered. Use EmbeddingCompanyQualifier instead.
|
|
25
28
|
class CompaniesHouseQualifierPlugin(BaseQualifierPlugin):
|
|
26
29
|
"""
|
|
27
|
-
|
|
30
|
+
DEPRECATED: Use EmbeddingCompanyQualifier instead.
|
|
28
31
|
|
|
32
|
+
Qualifier plugin for UK ORG entities using Companies House API.
|
|
29
33
|
Requires COMPANIES_HOUSE_API_KEY environment variable.
|
|
30
34
|
"""
|
|
31
35
|
|
|
@@ -38,11 +42,18 @@ class CompaniesHouseQualifierPlugin(BaseQualifierPlugin):
|
|
|
38
42
|
"""
|
|
39
43
|
Initialize the Companies House qualifier.
|
|
40
44
|
|
|
45
|
+
DEPRECATED: Use EmbeddingCompanyQualifier instead.
|
|
46
|
+
|
|
41
47
|
Args:
|
|
42
48
|
api_key: Companies House API key (or use COMPANIES_HOUSE_API_KEY env var)
|
|
43
49
|
timeout: API request timeout in seconds
|
|
44
50
|
cache_results: Whether to cache API results
|
|
45
51
|
"""
|
|
52
|
+
warnings.warn(
|
|
53
|
+
"CompaniesHouseQualifierPlugin is deprecated. Use EmbeddingCompanyQualifier instead.",
|
|
54
|
+
DeprecationWarning,
|
|
55
|
+
stacklevel=2,
|
|
56
|
+
)
|
|
46
57
|
self._api_key = api_key or os.environ.get("COMPANIES_HOUSE_API_KEY")
|
|
47
58
|
self._timeout = timeout
|
|
48
59
|
self._cache_results = cache_results
|