corp-extractor 0.5.0__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
  2. corp_extractor-0.9.3.dist-info/RECORD +79 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +2030 -24
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +428 -0
  9. statement_extractor/database/importers/__init__.py +32 -0
  10. statement_extractor/database/importers/companies_house.py +559 -0
  11. statement_extractor/database/importers/companies_house_officers.py +431 -0
  12. statement_extractor/database/importers/gleif.py +561 -0
  13. statement_extractor/database/importers/sec_edgar.py +392 -0
  14. statement_extractor/database/importers/sec_form4.py +512 -0
  15. statement_extractor/database/importers/wikidata.py +1120 -0
  16. statement_extractor/database/importers/wikidata_dump.py +1951 -0
  17. statement_extractor/database/importers/wikidata_people.py +1130 -0
  18. statement_extractor/database/models.py +254 -0
  19. statement_extractor/database/resolver.py +245 -0
  20. statement_extractor/database/store.py +3034 -0
  21. statement_extractor/document/__init__.py +62 -0
  22. statement_extractor/document/chunker.py +410 -0
  23. statement_extractor/document/context.py +171 -0
  24. statement_extractor/document/deduplicator.py +171 -0
  25. statement_extractor/document/html_extractor.py +246 -0
  26. statement_extractor/document/loader.py +303 -0
  27. statement_extractor/document/pipeline.py +388 -0
  28. statement_extractor/document/summarizer.py +195 -0
  29. statement_extractor/extractor.py +1 -1
  30. statement_extractor/models/__init__.py +19 -3
  31. statement_extractor/models/canonical.py +44 -1
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/labels.py +47 -18
  34. statement_extractor/models/qualifiers.py +51 -3
  35. statement_extractor/models/statement.py +39 -15
  36. statement_extractor/models.py +1 -1
  37. statement_extractor/pipeline/config.py +6 -11
  38. statement_extractor/pipeline/context.py +5 -5
  39. statement_extractor/pipeline/orchestrator.py +90 -121
  40. statement_extractor/pipeline/registry.py +52 -46
  41. statement_extractor/plugins/__init__.py +20 -8
  42. statement_extractor/plugins/base.py +348 -78
  43. statement_extractor/plugins/extractors/gliner2.py +38 -28
  44. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  45. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  46. statement_extractor/plugins/pdf/__init__.py +10 -0
  47. statement_extractor/plugins/pdf/pypdf.py +291 -0
  48. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  49. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  50. statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
  51. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  52. statement_extractor/plugins/qualifiers/person.py +588 -14
  53. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  54. statement_extractor/plugins/scrapers/__init__.py +10 -0
  55. statement_extractor/plugins/scrapers/http.py +236 -0
  56. statement_extractor/plugins/splitters/t5_gemma.py +176 -75
  57. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  58. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  59. statement_extractor/scoring.py +8 -8
  60. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  61. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  62. statement_extractor/plugins/canonicalizers/base.py +0 -9
  63. statement_extractor/plugins/canonicalizers/location.py +0 -219
  64. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  65. statement_extractor/plugins/canonicalizers/person.py +0 -242
  66. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
  67. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,254 @@
1
+ """
2
+ Pydantic models for organization/entity database records.
3
+ """
4
+
5
+ from enum import Enum
6
+ from typing import Any, Literal, Optional
7
+
8
+ from pydantic import BaseModel, Field
9
+
10
+
11
# Allowed values for the ``source`` field of organization records
# (used by CompanyRecord.source and CompanyMatch.source).
# NOTE(review): the resolver's SOURCE_PREFIX_MAP also has a "wikidata" key,
# which this Literal does not accept -- confirm whether that is intended.
SourceType = Literal["gleif", "sec_edgar", "companies_house", "wikipedia"]
12
+
13
+
14
class EntityType(str, Enum):
    """
    Kind of organization that a database record describes.

    Separates businesses from non-profits, government bodies and so on.
    Members are string-valued, so each one compares equal to its raw value
    (for example ``EntityType.FUND == "fund"``).
    """

    # ---- Business entities ----
    # General business/company
    BUSINESS = "business"
    # Investment funds, ETFs, mutual funds
    FUND = "fund"
    # Branch offices of companies
    BRANCH = "branch"

    # ---- Non-profit / civil society ----
    # Non-profit organizations
    NONPROFIT = "nonprofit"
    # Non-governmental organizations
    NGO = "ngo"
    # Charitable foundations
    FOUNDATION = "foundation"
    # Labor unions
    TRADE_UNION = "trade_union"

    # ---- Government / public sector ----
    # Government agencies
    GOVERNMENT = "government"
    # UN, WHO, IMF, etc.
    INTERNATIONAL_ORG = "international_org"
    # Political parties
    POLITICAL_PARTY = "political_party"

    # ---- Education / research ----
    # Schools, universities
    EDUCATIONAL = "educational"
    # Research institutes
    RESEARCH = "research"

    # ---- Other organization types ----
    # Religious organizations
    RELIGIOUS = "religious"
    # Sports clubs/teams
    SPORTS = "sports"
    # Media companies, studios
    MEDIA = "media"
    # Hospitals, healthcare orgs
    HEALTHCARE = "healthcare"

    # ---- Unknown / unclassified ----
    # Type not determined
    UNKNOWN = "unknown"
48
+
49
+
50
class PersonType(str, Enum):
    """
    Category assigned to a notable person in the person database.

    Members are string-valued, so each one compares equal to its raw value
    (for example ``PersonType.ATHLETE == "athlete"``).
    """

    # CEOs, board members, C-suite
    EXECUTIVE = "executive"
    # Elected officials (presidents, MPs, mayors)
    POLITICIAN = "politician"
    # Civil servants, diplomats, appointed officials
    GOVERNMENT = "government"
    # Military officers, armed forces personnel
    MILITARY = "military"
    # Judges, lawyers, legal professionals
    LEGAL = "legal"
    # Known for their profession (doctors, engineers, architects)
    PROFESSIONAL = "professional"
    # Professors, researchers
    ACADEMIC = "academic"
    # Traditional creatives (musicians, actors, painters, writers)
    ARTIST = "artist"
    # Internet/social media personalities (YouTubers, influencers, podcasters)
    MEDIA = "media"
    # Sports figures
    ATHLETE = "athlete"
    # Founders, business owners
    ENTREPRENEUR = "entrepreneur"
    # Reporters, news presenters, columnists
    JOURNALIST = "journalist"
    # Advocates, campaigners
    ACTIVIST = "activist"
    # Scientists, inventors
    SCIENTIST = "scientist"
    # Type not determined
    UNKNOWN = "unknown"
71
+
72
+
73
class CompanyRecord(BaseModel):
    """
    Organization entry stored in the embedding database.

    Records of this type are what gets stored and later searched by
    embedding similarity. The class keeps the name ``CompanyRecord`` for
    API compatibility even though it covers organizations in general.
    """

    name: str = Field(..., description="Organization name (used for embedding and display)")
    source: SourceType = Field(..., description="Data source")
    source_id: str = Field(..., description="Unique identifier from source (LEI, CIK, CH number)")
    region: str = Field(default="", description="Geographic region/country (e.g., 'UK', 'US', 'DE')")
    entity_type: EntityType = Field(default=EntityType.UNKNOWN, description="Organization type classification")
    from_date: Optional[str] = Field(default=None, description="Start date (ISO format YYYY-MM-DD)")
    to_date: Optional[str] = Field(default=None, description="End date (ISO format YYYY-MM-DD)")
    record: dict[str, Any] = Field(default_factory=dict, description="Original record from source")

    @property
    def canonical_id(self) -> str:
        """Return the identifier ``source:source_id`` for this record."""
        return "{}:{}".format(self.source, self.source_id)

    def model_dump_for_db(self) -> dict[str, Any]:
        """
        Return the record as a plain dict ready for database storage.

        ``entity_type`` is flattened to its string value and unset dates
        are written as empty strings rather than ``None``.
        """
        start = self.from_date if self.from_date else ""
        end = self.to_date if self.to_date else ""
        return dict(
            name=self.name,
            source=self.source,
            source_id=self.source_id,
            region=self.region,
            entity_type=self.entity_type.value,
            from_date=start,
            to_date=end,
            record=self.record,
        )
106
+
107
+
108
# Allowed values for the ``source`` field of person records
# (used by PersonRecord.source and PersonMatch.source).
PersonSourceType = Literal["wikidata", "sec_edgar", "companies_house"]
109
+
110
+
111
class PersonRecord(BaseModel):
    """
    Person entry stored in the embedding database.

    Records of this type are stored and searched by embedding similarity.
    Besides identity fields, a record can carry the role and organization
    the person is primarily known for.
    """

    name: str = Field(..., description="Display name (used for embedding and display)")
    source: PersonSourceType = Field(default="wikidata", description="Data source")
    source_id: str = Field(..., description="Unique identifier from source (Wikidata QID)")
    country: str = Field(default="", description="Country code or name (e.g., 'US', 'Germany')")
    person_type: PersonType = Field(default=PersonType.UNKNOWN, description="Person type classification")
    known_for_role: str = Field(default="", description="Primary role (e.g., 'CEO', 'President')")
    known_for_org: str = Field(default="", description="Primary org (e.g., 'Apple Inc', 'Tesla')")
    known_for_org_id: Optional[int] = Field(default=None, description="Foreign key to organizations table")
    from_date: Optional[str] = Field(default=None, description="Start date of role (ISO format YYYY-MM-DD)")
    to_date: Optional[str] = Field(default=None, description="End date of role (ISO format YYYY-MM-DD)")
    birth_date: Optional[str] = Field(default=None, description="Date of birth (ISO format YYYY-MM-DD)")
    death_date: Optional[str] = Field(default=None, description="Date of death (ISO format YYYY-MM-DD) - if set, person is historic")
    record: dict[str, Any] = Field(default_factory=dict, description="Original record from source")

    @property
    def canonical_id(self) -> str:
        """Return the identifier ``source:source_id`` for this record."""
        return "{}:{}".format(self.source, self.source_id)

    @property
    def is_historic(self) -> bool:
        """Tell whether a death date is recorded (neither ``None`` nor empty)."""
        return bool(self.death_date)

    def model_dump_for_db(self) -> dict[str, Any]:
        """
        Return the record as a plain dict ready for database storage.

        ``person_type`` is flattened to its string value and unset dates
        are written as empty strings. ``known_for_org_id`` is passed
        through unchanged, so it may be ``None``.
        """

        def _blank_if_unset(value: Optional[str]) -> str:
            return value if value else ""

        return dict(
            name=self.name,
            source=self.source,
            source_id=self.source_id,
            country=self.country,
            person_type=self.person_type.value,
            known_for_role=self.known_for_role,
            known_for_org=self.known_for_org,
            known_for_org_id=self.known_for_org_id,
            from_date=_blank_if_unset(self.from_date),
            to_date=_blank_if_unset(self.to_date),
            birth_date=_blank_if_unset(self.birth_date),
            death_date=_blank_if_unset(self.death_date),
            record=self.record,
        )

    def get_embedding_text(self) -> str:
        """
        Build the text that gets embedded for this person.

        The name always comes first; the role and the organization follow
        when they are non-empty, all joined by ``" | "``.
        """
        context = (self.known_for_role, self.known_for_org)
        return " | ".join([self.name, *(piece for piece in context if piece)])
168
+
169
+
170
class PersonMatch(BaseModel):
    """
    One candidate person found by an embedding search.

    This is what the person qualifier returns for each potential match.
    """

    query_name: str = Field(..., description="Name extracted from text (the search query)")
    record: PersonRecord = Field(..., description="The matched person record")
    source: PersonSourceType = Field(..., description="Data source of match")
    source_id: str = Field(..., description="Source identifier of match")
    canonical_id: str = Field(..., description="Canonical ID in format source:source_id")
    similarity_score: float = Field(..., description="Embedding similarity score (0-1)")
    llm_confirmed: bool = Field(default=False, description="Whether LLM confirmed this match")

    @property
    def name(self) -> str:
        """Name of the matched person, taken from the underlying record."""
        matched = self.record
        return matched.name

    @classmethod
    def from_record(
        cls,
        query_name: str,
        record: PersonRecord,
        similarity_score: float,
        llm_confirmed: bool = False,
    ) -> "PersonMatch":
        """
        Build a match from a person record.

        ``source``, ``source_id`` and ``canonical_id`` are copied from
        ``record``; the remaining fields come from the arguments.
        """
        values = {
            "query_name": query_name,
            "record": record,
            "source": record.source,
            "source_id": record.source_id,
            "canonical_id": record.canonical_id,
            "similarity_score": similarity_score,
            "llm_confirmed": llm_confirmed,
        }
        return cls(**values)
207
+
208
+
209
class CompanyMatch(BaseModel):
    """
    One candidate organization found by an embedding search.

    This is what the organization qualifier returns for each potential
    match. The class keeps the name ``CompanyMatch`` for API compatibility.
    """

    query_name: str = Field(..., description="Name extracted from text (the search query)")
    record: CompanyRecord = Field(..., description="The matched organization record")
    source: SourceType = Field(..., description="Data source of match")
    source_id: str = Field(..., description="Source identifier of match")
    canonical_id: str = Field(..., description="Canonical ID in format source:source_id")
    similarity_score: float = Field(..., description="Embedding similarity score (0-1)")
    llm_confirmed: bool = Field(default=False, description="Whether LLM confirmed this match")

    @property
    def name(self) -> str:
        """Name of the matched organization, taken from the underlying record."""
        matched = self.record
        return matched.name

    @classmethod
    def from_record(
        cls,
        query_name: str,
        record: CompanyRecord,
        similarity_score: float,
        llm_confirmed: bool = False,
    ) -> "CompanyMatch":
        """
        Build a match from an organization record.

        ``source``, ``source_id`` and ``canonical_id`` are copied from
        ``record``; the remaining fields come from the arguments.
        """
        values = {
            "query_name": query_name,
            "record": record,
            "source": record.source,
            "source_id": record.source_id,
            "canonical_id": record.canonical_id,
            "similarity_score": similarity_score,
            "llm_confirmed": llm_confirmed,
        }
        return cls(**values)
247
+
248
+
249
class DatabaseStats(BaseModel):
    """Summary figures describing the organization database.

    Every field has a default, so an instance can be created with no
    arguments: integers start at zero and ``by_source`` at an empty dict.
    """

    total_records: int = Field(default=0)
    # presumably record counts keyed by source name -- confirm against the producer
    by_source: dict[str, int] = Field(default_factory=dict)
    embedding_dimension: int = Field(default=0)
    database_size_bytes: int = Field(default=0)
@@ -0,0 +1,245 @@
1
+ """
2
+ Entity resolver utilities for database lookups.
3
+
4
+ Provides shared functionality for resolving entity names against
5
+ the organization and person databases.
6
+ """
7
+
8
+ import logging
9
+ from typing import Optional
10
+
11
+ from .models import CompanyRecord
12
+ from ..models import ResolvedOrganization
13
+
14
logger = logging.getLogger(__name__)

# Canonical-ID prefix for each known data source. Both "wikidata" and
# "wikipedia" map to the same "WIKIDATA" prefix.
SOURCE_PREFIX_MAP = dict(
    gleif="LEI",
    sec_edgar="SEC-CIK",
    companies_house="UK-CH",
    wikidata="WIKIDATA",
    wikipedia="WIKIDATA",
)


def get_source_prefix(source: str) -> str:
    """
    Return the canonical-ID prefix that belongs to a data source.

    Args:
        source: Data source name, e.g. ``"gleif"``.

    Returns:
        The prefix from ``SOURCE_PREFIX_MAP`` when the source is listed
        there, otherwise the source name converted to upper case.
    """
    # The fallback is computed up front, exactly as a ``dict.get`` default
    # argument would be.
    fallback = source.upper()
    return SOURCE_PREFIX_MAP.get(source, fallback)
29
+
30
+
31
class OrganizationResolver:
    """
    Resolves organization names against the organization database.

    Shared utility that can be used by both EmbeddingCompanyQualifier
    and PersonQualifierPlugin for resolving organization references.

    The database handle and the embedder are created lazily on first use.
    Results of :meth:`resolve` are memoised per instance, keyed by the
    lower-cased, stripped organization name.
    """

    def __init__(
        self,
        db_path: Optional[str] = None,
        top_k: int = 5,
        min_similarity: float = 0.7,
        auto_download_db: bool = True,
    ):
        """
        Initialize the organization resolver.

        Args:
            db_path: Path to database (auto-detects if None)
            top_k: Number of candidates to retrieve
            min_similarity: Minimum similarity threshold
            auto_download_db: Whether to auto-download database
        """
        self._db_path = db_path
        self._top_k = top_k
        self._min_similarity = min_similarity
        self._auto_download_db = auto_download_db

        # Lazy-loaded components
        self._database = None
        self._embedder = None
        # Maps normalised name -> resolved organization, or None for a
        # completed lookup that found nothing above the threshold.
        self._cache: dict[str, Optional[ResolvedOrganization]] = {}

    def _get_database(self):
        """
        Get or initialize the organization database.

        Returns:
            The database object, or None when no database path could be
            determined or loading raised. A failure is logged as a warning
            and is not remembered, so the next call tries again.
        """
        if self._database is not None:
            return self._database

        try:
            # Imported lazily so that constructing a resolver stays cheap.
            from .store import get_database
            from .hub import get_database_path

            db_path = self._db_path
            if db_path is None:
                db_path = get_database_path(auto_download=self._auto_download_db)

            if db_path is None:
                logger.warning("Organization database not available.")
                return None

            self._database = get_database(db_path=db_path)
            return self._database
        except Exception as e:
            logger.warning(f"Failed to load organization database: {e}")
            return None

    def _get_embedder(self):
        """
        Get or initialize the embedder.

        Returns:
            The embedder, or None when creating it raised (logged as a
            warning; the next call tries again).
        """
        if self._embedder is not None:
            return self._embedder

        try:
            from .embeddings import CompanyEmbedder
            self._embedder = CompanyEmbedder()
            return self._embedder
        except Exception as e:
            logger.warning(f"Failed to load embedder: {e}")
            return None

    def _search(self, database, embedder, org_name: str, top_k: int) -> list:
        """
        Embed ``org_name``, query the database and apply the threshold.

        Exceptions raised by the embedder or the database propagate to the
        caller, which decides how to report them.

        Returns:
            The ``(record, similarity)`` pairs returned by the database
            whose similarity is at least ``min_similarity``, in the order
            the database returned them.
        """
        query_embedding = embedder.embed(org_name)

        # Search with text pre-filtering
        results = database.search(
            query_embedding,
            top_k=top_k,
            query_text=org_name,
        )
        return [(r, s) for r, s in results if s >= self._min_similarity]

    def resolve(self, org_name: str, use_cache: bool = True) -> Optional[ResolvedOrganization]:
        """
        Resolve an organization name against the database.

        Args:
            org_name: Organization name to resolve
            use_cache: Whether to use cached results

        Returns:
            ResolvedOrganization if found, None otherwise. None is also
            returned for an empty or whitespace-only name, when the
            database or embedder is unavailable, and when the lookup
            raised (the error is logged at debug level).
        """
        # A whitespace-only name carries no information; treat it as empty
        # instead of embedding it and searching for it.
        if not org_name or not org_name.strip():
            return None

        # Check cache
        cache_key = org_name.lower().strip()
        if use_cache and cache_key in self._cache:
            return self._cache[cache_key]

        database = self._get_database()
        if database is None:
            return None

        embedder = self._get_embedder()
        if embedder is None:
            return None

        try:
            candidates = self._search(database, embedder, org_name, self._top_k)

            resolved = None
            if candidates:
                # Take the first surviving hit.
                # NOTE(review): assumes database.search returns results
                # ordered best-first -- confirm against the store.
                record, similarity = candidates[0]
                resolved = self._build_resolved_organization(record, similarity)
        except Exception as e:
            logger.debug(f"Failed to resolve organization '{org_name}': {e}")
            # Deliberately NOT cached: an error is not the same as "no
            # match", and caching it would make a transient failure stick
            # for the lifetime of this resolver.
            return None

        # Only completed lookups are cached, including the "no match" result.
        if use_cache:
            self._cache[cache_key] = resolved

        return resolved

    def resolve_with_candidates(
        self,
        org_name: str,
        top_k: Optional[int] = None,
    ) -> list[tuple[CompanyRecord, float]]:
        """
        Get organization candidates with similarity scores.

        Args:
            org_name: Organization name to search
            top_k: Number of candidates (uses instance default if None;
                a value of 0 also falls back to the instance default)

        Returns:
            List of (CompanyRecord, similarity) tuples at or above the
            similarity threshold. Empty for an empty or whitespace-only
            name, when the database or embedder is unavailable, or when
            the search raised (logged at debug level).
        """
        if not org_name or not org_name.strip():
            return []

        database = self._get_database()
        if database is None:
            return []

        embedder = self._get_embedder()
        if embedder is None:
            return []

        try:
            return self._search(database, embedder, org_name, top_k or self._top_k)
        except Exception as e:
            logger.debug(f"Failed to search for organization '{org_name}': {e}")
            return []

    def _build_resolved_organization(
        self,
        record: CompanyRecord,
        similarity: float,
    ) -> ResolvedOrganization:
        """
        Build ResolvedOrganization from a database record.

        The canonical ID uses the source's prefix from
        ``get_source_prefix`` followed by the source ID. The raw
        similarity is kept in ``match_details`` while ``match_confidence``
        is the same value clamped into the range 0.0 to 1.0.
        """
        source_prefix = get_source_prefix(record.source)

        return ResolvedOrganization(
            canonical_name=record.name,
            canonical_id=f"{source_prefix}:{record.source_id}",
            source=record.source,
            source_id=record.source_id,
            # An empty region string is passed on as None.
            region=record.region or None,
            match_confidence=min(max(similarity, 0.0), 1.0),
            match_details={"similarity": similarity},
        )
217
+
218
+
219
# Singleton instance for shared use
_default_resolver: Optional[OrganizationResolver] = None


def get_organization_resolver(
    db_path: Optional[str] = None,
    auto_download_db: bool = True,
) -> OrganizationResolver:
    """
    Get or create a shared OrganizationResolver instance.

    The resolver is created on the first call and reused afterwards, so
    the arguments only take effect on that first call. If a later call
    asks for a different explicit ``db_path``, the existing resolver is
    still returned and a warning is logged.

    Args:
        db_path: Path to database
        auto_download_db: Whether to auto-download database

    Returns:
        OrganizationResolver instance
    """
    global _default_resolver

    if _default_resolver is None:
        _default_resolver = OrganizationResolver(
            db_path=db_path,
            auto_download_db=auto_download_db,
        )
    elif db_path is not None and db_path != _default_resolver._db_path:
        # The shared instance is kept (other callers may hold it), but the
        # mismatch is surfaced instead of being silently ignored.
        logger.warning(
            f"Shared organization resolver already uses db_path="
            f"{_default_resolver._db_path!r}; ignoring requested db_path={db_path!r}"
        )

    return _default_resolver