corp-extractor 0.4.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +10 -1
  4. statement_extractor/cli.py +1663 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +6972 -0
  7. statement_extractor/database/__init__.py +52 -0
  8. statement_extractor/database/embeddings.py +186 -0
  9. statement_extractor/database/hub.py +520 -0
  10. statement_extractor/database/importers/__init__.py +24 -0
  11. statement_extractor/database/importers/companies_house.py +545 -0
  12. statement_extractor/database/importers/gleif.py +538 -0
  13. statement_extractor/database/importers/sec_edgar.py +375 -0
  14. statement_extractor/database/importers/wikidata.py +1012 -0
  15. statement_extractor/database/importers/wikidata_people.py +632 -0
  16. statement_extractor/database/models.py +230 -0
  17. statement_extractor/database/resolver.py +245 -0
  18. statement_extractor/database/store.py +1609 -0
  19. statement_extractor/document/__init__.py +62 -0
  20. statement_extractor/document/chunker.py +410 -0
  21. statement_extractor/document/context.py +171 -0
  22. statement_extractor/document/deduplicator.py +173 -0
  23. statement_extractor/document/html_extractor.py +246 -0
  24. statement_extractor/document/loader.py +303 -0
  25. statement_extractor/document/pipeline.py +388 -0
  26. statement_extractor/document/summarizer.py +195 -0
  27. statement_extractor/extractor.py +1 -23
  28. statement_extractor/gliner_extraction.py +4 -74
  29. statement_extractor/llm.py +255 -0
  30. statement_extractor/models/__init__.py +89 -0
  31. statement_extractor/models/canonical.py +182 -0
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/entity.py +102 -0
  34. statement_extractor/models/labels.py +220 -0
  35. statement_extractor/models/qualifiers.py +139 -0
  36. statement_extractor/models/statement.py +101 -0
  37. statement_extractor/models.py +4 -1
  38. statement_extractor/pipeline/__init__.py +39 -0
  39. statement_extractor/pipeline/config.py +129 -0
  40. statement_extractor/pipeline/context.py +177 -0
  41. statement_extractor/pipeline/orchestrator.py +416 -0
  42. statement_extractor/pipeline/registry.py +303 -0
  43. statement_extractor/plugins/__init__.py +55 -0
  44. statement_extractor/plugins/base.py +716 -0
  45. statement_extractor/plugins/extractors/__init__.py +13 -0
  46. statement_extractor/plugins/extractors/base.py +9 -0
  47. statement_extractor/plugins/extractors/gliner2.py +546 -0
  48. statement_extractor/plugins/labelers/__init__.py +29 -0
  49. statement_extractor/plugins/labelers/base.py +9 -0
  50. statement_extractor/plugins/labelers/confidence.py +138 -0
  51. statement_extractor/plugins/labelers/relation_type.py +87 -0
  52. statement_extractor/plugins/labelers/sentiment.py +159 -0
  53. statement_extractor/plugins/labelers/taxonomy.py +386 -0
  54. statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
  55. statement_extractor/plugins/pdf/__init__.py +10 -0
  56. statement_extractor/plugins/pdf/pypdf.py +291 -0
  57. statement_extractor/plugins/qualifiers/__init__.py +30 -0
  58. statement_extractor/plugins/qualifiers/base.py +9 -0
  59. statement_extractor/plugins/qualifiers/companies_house.py +185 -0
  60. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  61. statement_extractor/plugins/qualifiers/gleif.py +197 -0
  62. statement_extractor/plugins/qualifiers/person.py +785 -0
  63. statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
  64. statement_extractor/plugins/scrapers/__init__.py +10 -0
  65. statement_extractor/plugins/scrapers/http.py +236 -0
  66. statement_extractor/plugins/splitters/__init__.py +13 -0
  67. statement_extractor/plugins/splitters/base.py +9 -0
  68. statement_extractor/plugins/splitters/t5_gemma.py +293 -0
  69. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  70. statement_extractor/plugins/taxonomy/embedding.py +484 -0
  71. statement_extractor/plugins/taxonomy/mnli.py +291 -0
  72. statement_extractor/scoring.py +8 -8
  73. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  74. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  75. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,291 @@
1
+ """
2
+ PDF parser plugin using PyMuPDF (fitz) with optional OCR fallback.
3
+
4
+ Extracts text from PDFs page by page, with automatic detection of
5
+ image-heavy PDFs that may require OCR.
6
+ """
7
+
8
+ import io
9
+ import logging
10
+ import os
11
+ import tempfile
12
+ from typing import Any, Optional
13
+
14
+ from ..base import BasePDFParserPlugin, PDFParseResult
15
+ from ...pipeline.registry import PluginRegistry
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
@PluginRegistry.pdf_parser
class PyPDFParserPlugin(BasePDFParserPlugin):
    """
    PDF parser using PyMuPDF (fitz) with optional OCR fallback.

    NOTE: despite the "PyPDF" name, this plugin is backed by PyMuPDF (fitz),
    not the pypdf library.

    Features:
    - Fast text extraction using PyMuPDF
    - Automatic detection of image-heavy PDFs
    - Optional OCR fallback using Tesseract
    - Metadata extraction (title, author, etc.)
    """

    def __init__(
        self,
        image_threshold: float = 0.5,
        text_threshold: float = 0.4,
        use_ocr_fallback: bool = True,
    ):
        """
        Initialize the PDF parser.

        Args:
            image_threshold: Images per page threshold for OCR trigger
            text_threshold: Text density threshold (chars/1000 per page)
            use_ocr_fallback: Enable automatic OCR for image-heavy PDFs
        """
        self._image_threshold = image_threshold
        self._text_threshold = text_threshold
        self._use_ocr_fallback = use_ocr_fallback

    @property
    def name(self) -> str:
        return "pypdf_parser"

    @property
    def priority(self) -> int:
        return 100

    @property
    def description(self) -> str:
        return "PDF parser using PyMuPDF with optional OCR fallback"

    @property
    def supports_ocr(self) -> bool:
        return self._use_ocr_fallback

    def parse(
        self,
        pdf_bytes: bytes,
        max_pages: int = 500,
        use_ocr: bool = False,
    ) -> PDFParseResult:
        """
        Extract text from PDF bytes.

        Args:
            pdf_bytes: Raw PDF file content
            max_pages: Maximum number of pages to process
            use_ocr: Force OCR even for text-extractable PDFs

        Returns:
            PDFParseResult with extracted text for each page; on failure,
            a PDFParseResult with an ``error`` message and no pages.
        """
        try:
            import fitz  # PyMuPDF
        except ImportError:
            return PDFParseResult(
                pages=[],
                page_count=0,
                error="PyMuPDF (fitz) not installed. Install with: pip install PyMuPDF",
            )

        try:
            logger.info(f"Parsing PDF: {len(pdf_bytes)} bytes")

            # Open directly from memory: fitz supports in-memory streams,
            # so no temp-file round trip is needed.
            pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            try:
                total_pages = len(pdf_doc)
                logger.info(f"PDF has {total_pages} pages")

                # Decide between direct extraction and OCR: caller can force
                # OCR, otherwise fall back only for image-heavy documents.
                should_ocr = use_ocr or (
                    self._use_ocr_fallback and self._is_mostly_images(pdf_doc)
                )

                if should_ocr:
                    logger.info("PDF appears image-heavy, using OCR")
                    return self._parse_with_ocr(pdf_doc, max_pages)

                logger.info("PDF has extractable text, using direct extraction")
                return self._parse_with_fitz(pdf_doc, max_pages)
            finally:
                # Always release the document, including on error paths
                # (the original leaked it when an exception was raised).
                pdf_doc.close()

        except Exception as e:
            logger.exception(f"Error parsing PDF: {e}")
            return PDFParseResult(
                pages=[],
                page_count=0,
                error=f"Failed to parse PDF: {e}",
            )

    def _is_mostly_images(self, pdf_doc) -> bool:
        """
        Check if PDF is mostly images (may need OCR).

        Samples up to the first 3 pages and compares image count and text
        density against the configured thresholds.

        Args:
            pdf_doc: PyMuPDF document object

        Returns:
            True if PDF appears to be image-heavy
        """
        total_pages = len(pdf_doc)
        if total_pages == 0:
            return False

        # Sample images and text from the first few pages in one pass.
        sample_pages = min(3, total_pages)
        image_count = 0
        sample_text = ""
        for i in range(sample_pages):
            page = pdf_doc[i]
            image_count += len(page.get_images())
            sample_text += page.get_text()

        avg_images_per_page = image_count / sample_pages
        # Text density is measured in thousands of characters per page.
        text_density = len(sample_text) / 1000 / sample_pages

        logger.debug(
            f"PDF analysis: {avg_images_per_page:.1f} images/page, "
            f"{text_density:.2f} text density"
        )

        # If text density is high, don't use OCR.
        if text_density > self._text_threshold:
            return False

        # If many images per page and low text, probably needs OCR.
        return avg_images_per_page > self._image_threshold

    def _parse_with_fitz(self, pdf_doc, max_pages: int) -> PDFParseResult:
        """
        Extract text using PyMuPDF (fast, direct extraction).

        Args:
            pdf_doc: PyMuPDF document object
            max_pages: Maximum pages to process

        Returns:
            PDFParseResult with extracted text. ``page_count`` reflects the
            full document even when extraction stops at max_pages.
        """
        pages = []
        total_pages = len(pdf_doc)
        page_limit = min(total_pages, max_pages)

        for i in range(page_limit):
            text = pdf_doc[i].get_text()
            pages.append(text.strip())

            if (i + 1) % 50 == 0:
                logger.debug(f"Processed {i + 1}/{page_limit} pages")

        return PDFParseResult(
            pages=pages,
            page_count=total_pages,
            metadata=self._extract_metadata(pdf_doc),
        )

    def _parse_with_ocr(self, pdf_doc, max_pages: int) -> PDFParseResult:
        """
        Extract text using OCR (Tesseract).

        Renders each page to a 150-DPI image and runs pytesseract on it.

        Args:
            pdf_doc: PyMuPDF document object
            max_pages: Maximum pages to process

        Returns:
            PDFParseResult with OCR-extracted text, or an error result if
            the OCR dependencies are not installed.
        """
        try:
            import pytesseract
            from PIL import Image
        except ImportError:
            return PDFParseResult(
                pages=[],
                page_count=len(pdf_doc),
                error="OCR dependencies not installed. Install with: pip install pytesseract Pillow",
            )

        pages = []
        total_pages = len(pdf_doc)
        page_limit = min(total_pages, max_pages)

        for i in range(page_limit):
            page = pdf_doc[i]

            # Render page to image (150 DPI is a good speed/quality balance).
            pix = page.get_pixmap(dpi=150)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            # Run OCR
            text = pytesseract.image_to_string(img)
            pages.append(text.strip())

            if (i + 1) % 10 == 0:
                logger.debug(f"OCR processed {i + 1}/{page_limit} pages")

        return PDFParseResult(
            pages=pages,
            page_count=total_pages,
            metadata=self._extract_metadata(pdf_doc),
        )

    @staticmethod
    def _extract_metadata(pdf_doc) -> dict[str, Any]:
        """
        Extract PDF metadata.

        Args:
            pdf_doc: PyMuPDF document object

        Returns:
            Dictionary of metadata fields (only non-empty string values),
            with PDF metadata keys mapped to our canonical names.
        """
        metadata = {}

        try:
            doc_metadata = pdf_doc.metadata
            if doc_metadata:
                # Map common PDF metadata fields
                field_map = {
                    "title": "title",
                    "author": "author",
                    "subject": "subject",
                    "keywords": "keywords",
                    "creator": "creator",
                    "producer": "producer",
                    "creationDate": "created",
                    "modDate": "modified",
                }

                for pdf_key, our_key in field_map.items():
                    value = doc_metadata.get(pdf_key)
                    if value and isinstance(value, str) and value.strip():
                        metadata[our_key] = value.strip()
        except Exception as e:
            # Metadata is best-effort: never fail the parse because of it.
            logger.debug(f"Error extracting metadata: {e}")

        return metadata
@@ -0,0 +1,30 @@
1
+ """
2
+ Qualifier plugins for Stage 3 (Qualification).
3
+
4
+ Adds qualifiers and identifiers to entities.
5
+ """
6
+
7
+ from .base import BaseQualifierPlugin
8
+ from .person import PersonQualifierPlugin
9
+
10
+ # Import embedding qualifier (may fail if database module not available)
11
+ try:
12
+ from .embedding_company import EmbeddingCompanyQualifier
13
+ except ImportError:
14
+ EmbeddingCompanyQualifier = None # type: ignore
15
+
16
+ # DEPRECATED: These API-based qualifiers are deprecated in favor of EmbeddingCompanyQualifier
17
+ # They are no longer auto-registered with the plugin registry.
18
+ from .gleif import GLEIFQualifierPlugin
19
+ from .companies_house import CompaniesHouseQualifierPlugin
20
+ from .sec_edgar import SECEdgarQualifierPlugin
21
+
22
+ __all__ = [
23
+ "BaseQualifierPlugin",
24
+ "PersonQualifierPlugin",
25
+ "EmbeddingCompanyQualifier",
26
+ # Deprecated - kept for backwards compatibility
27
+ "GLEIFQualifierPlugin",
28
+ "CompaniesHouseQualifierPlugin",
29
+ "SECEdgarQualifierPlugin",
30
+ ]
@@ -0,0 +1,9 @@
1
+ """
2
+ Base class for qualifier plugins.
3
+
4
+ Re-exports BaseQualifierPlugin from the main plugins module.
5
+ """
6
+
7
+ from ..base import BaseQualifierPlugin
8
+
9
+ __all__ = ["BaseQualifierPlugin"]
@@ -0,0 +1,185 @@
1
+ """
2
+ CompaniesHouseQualifierPlugin - Qualifies UK ORG entities.
3
+
4
+ DEPRECATED: Use EmbeddingCompanyQualifier instead, which uses a local
5
+ embedding database with pre-loaded Companies House data for faster, offline matching.
6
+
7
+ Uses the UK Companies House API to:
8
+ - Look up company number by name
9
+ - Retrieve company details, jurisdiction, officers
10
+ """
11
+
12
+ import logging
13
+ import os
14
+ import warnings
15
+ from typing import Optional
16
+
17
+ from ..base import BaseQualifierPlugin, PluginCapability
18
+ from ...pipeline.context import PipelineContext
19
+ from ...models import ExtractedEntity, EntityQualifiers, EntityType
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ # Companies House API base URL
24
+ CH_API_BASE = "https://api.company-information.service.gov.uk"
25
+
26
+
27
+ # DEPRECATED: Not auto-registered. Use EmbeddingCompanyQualifier instead.
28
+ class CompaniesHouseQualifierPlugin(BaseQualifierPlugin):
29
+ """
30
+ DEPRECATED: Use EmbeddingCompanyQualifier instead.
31
+
32
+ Qualifier plugin for UK ORG entities using Companies House API.
33
+ Requires COMPANIES_HOUSE_API_KEY environment variable.
34
+ """
35
+
36
+ def __init__(
37
+ self,
38
+ api_key: Optional[str] = None,
39
+ timeout: int = 10,
40
+ cache_results: bool = True,
41
+ ):
42
+ """
43
+ Initialize the Companies House qualifier.
44
+
45
+ DEPRECATED: Use EmbeddingCompanyQualifier instead.
46
+
47
+ Args:
48
+ api_key: Companies House API key (or use COMPANIES_HOUSE_API_KEY env var)
49
+ timeout: API request timeout in seconds
50
+ cache_results: Whether to cache API results
51
+ """
52
+ warnings.warn(
53
+ "CompaniesHouseQualifierPlugin is deprecated. Use EmbeddingCompanyQualifier instead.",
54
+ DeprecationWarning,
55
+ stacklevel=2,
56
+ )
57
+ self._api_key = api_key or os.environ.get("COMPANIES_HOUSE_API_KEY")
58
+ self._timeout = timeout
59
+ self._cache_results = cache_results
60
+ self._cache: dict[str, Optional[dict]] = {}
61
+
62
+ @property
63
+ def name(self) -> str:
64
+ return "companies_house_qualifier"
65
+
66
+ @property
67
+ def priority(self) -> int:
68
+ return 20 # Run after GLEIF
69
+
70
+ @property
71
+ def capabilities(self) -> PluginCapability:
72
+ return PluginCapability.EXTERNAL_API | PluginCapability.CACHING
73
+
74
+ @property
75
+ def description(self) -> str:
76
+ return "Looks up UK company data from Companies House API"
77
+
78
+ @property
79
+ def supported_entity_types(self) -> set[EntityType]:
80
+ return {EntityType.ORG}
81
+
82
+ @property
83
+ def supported_identifier_types(self) -> list[str]:
84
+ return ["ch_number"] # Can lookup by company number
85
+
86
+ @property
87
+ def provided_identifier_types(self) -> list[str]:
88
+ return ["ch_number"] # Provides company number
89
+
90
+ def qualify(
91
+ self,
92
+ entity: ExtractedEntity,
93
+ context: PipelineContext,
94
+ ) -> Optional[EntityQualifiers]:
95
+ """
96
+ Qualify an ORG entity with Companies House data.
97
+
98
+ Args:
99
+ entity: The ORG entity to qualify
100
+ context: Pipeline context
101
+
102
+ Returns:
103
+ EntityQualifiers with company number, or None if not found
104
+ """
105
+ if entity.type != EntityType.ORG:
106
+ return None
107
+
108
+ if not self._api_key:
109
+ logger.debug("Companies House API key not configured")
110
+ return None
111
+
112
+ # Check cache first
113
+ cache_key = entity.text.lower().strip()
114
+ if self._cache_results and cache_key in self._cache:
115
+ cached = self._cache[cache_key]
116
+ if cached is None:
117
+ return None
118
+ return self._data_to_qualifiers(cached)
119
+
120
+ # Search Companies House API
121
+ result = self._search_companies_house(entity.text)
122
+
123
+ # Cache result
124
+ if self._cache_results:
125
+ self._cache[cache_key] = result
126
+
127
+ if result:
128
+ return self._data_to_qualifiers(result)
129
+
130
+ return None
131
+
132
+ def _search_companies_house(self, org_name: str) -> Optional[dict]:
133
+ """Search Companies House API for organization."""
134
+ try:
135
+ import requests
136
+ from requests.auth import HTTPBasicAuth
137
+
138
+ url = f"{CH_API_BASE}/search/companies"
139
+ params = {"q": org_name, "items_per_page": 5}
140
+
141
+ response = requests.get(
142
+ url,
143
+ params=params,
144
+ auth=HTTPBasicAuth(self._api_key, ""),
145
+ timeout=self._timeout,
146
+ )
147
+ response.raise_for_status()
148
+ data = response.json()
149
+
150
+ items = data.get("items", [])
151
+ if items:
152
+ # Return first match
153
+ company = items[0]
154
+ return {
155
+ "ch_number": company.get("company_number", ""),
156
+ "title": company.get("title", ""),
157
+ "company_status": company.get("company_status", ""),
158
+ "company_type": company.get("company_type", ""),
159
+ "jurisdiction": "UK",
160
+ "country": "GB",
161
+ "address": company.get("address_snippet", ""),
162
+ }
163
+
164
+ except ImportError:
165
+ logger.warning("requests library not available for Companies House API")
166
+ except Exception as e:
167
+ logger.debug(f"Companies House API error: {e}")
168
+
169
+ return None
170
+
171
+ def _data_to_qualifiers(self, data: dict) -> EntityQualifiers:
172
+ """Convert Companies House data to EntityQualifiers."""
173
+ identifiers = {}
174
+ if data.get("ch_number"):
175
+ identifiers["ch_number"] = data["ch_number"]
176
+
177
+ return EntityQualifiers(
178
+ jurisdiction=data.get("jurisdiction"),
179
+ country=data.get("country"),
180
+ identifiers=identifiers,
181
+ )
182
+
183
+
184
+ # Allow importing without decorator for testing
185
+ CompaniesHouseQualifierPluginClass = CompaniesHouseQualifierPlugin