corp-extractor 0.9.0__py3-none-any.whl → 0.9.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +72 -11
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +34 -27
- statement_extractor/cli.py +1317 -101
- statement_extractor/database/embeddings.py +45 -0
- statement_extractor/database/hub.py +86 -136
- statement_extractor/database/importers/__init__.py +10 -2
- statement_extractor/database/importers/companies_house.py +16 -2
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +23 -0
- statement_extractor/database/importers/import_utils.py +264 -0
- statement_extractor/database/importers/sec_edgar.py +17 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +151 -43
- statement_extractor/database/importers/wikidata_dump.py +2282 -0
- statement_extractor/database/importers/wikidata_people.py +867 -325
- statement_extractor/database/migrate_v2.py +852 -0
- statement_extractor/database/models.py +155 -7
- statement_extractor/database/schema_v2.py +409 -0
- statement_extractor/database/seed_data.py +359 -0
- statement_extractor/database/store.py +3449 -233
- statement_extractor/document/deduplicator.py +10 -12
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +3 -2
- statement_extractor/models/statement.py +15 -17
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +12 -12
- statement_extractor/plugins/base.py +17 -17
- statement_extractor/plugins/extractors/gliner2.py +28 -28
- statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
- statement_extractor/plugins/qualifiers/person.py +120 -53
- statement_extractor/plugins/splitters/t5_gemma.py +35 -39
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Shared utilities for v2 database importers.
|
|
3
|
+
|
|
4
|
+
Provides helper functions for resolving locations, roles, and QIDs
|
|
5
|
+
to their normalized FK references in the v2 schema.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from typing import TYPE_CHECKING, Optional
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from ..store import LocationsDatabase, RolesDatabase
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def parse_qid(qid_text: Optional[str]) -> Optional[int]:
    """
    Parse a QID string to integer.

    Surrounding whitespace and a single leading "Q"/"q" are removed; what
    remains must consist solely of ASCII digits. This is deliberately
    stricter than a bare ``int()`` call, which would also accept signs
    ("Q-5"), digit-group underscores ("Q1_000"), whitespace after the
    prefix ("Q 5") and non-ASCII digits.

    Args:
        qid_text: QID string like "Q12345" or just "12345"

    Returns:
        Integer QID or None if invalid
    """
    if not qid_text:
        return None

    digits = qid_text.strip()

    # Handle "Q12345" format (the prefix is accepted in either case)
    if digits.startswith(("Q", "q")):
        digits = digits[1:]

    # Reject anything int() would tolerate that is not a plain number.
    if not (digits.isascii() and digits.isdigit()):
        return None

    try:
        return int(digits)
    except ValueError:
        # int() can still refuse very long digit strings on interpreters
        # that enforce an integer string conversion length limit.
        return None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def format_qid(qid_int: Optional[int]) -> Optional[str]:
    """
    Render an integer QID in its "Q"-prefixed string form.

    Args:
        qid_int: Numeric QID (e.g., 12345), or None

    Returns:
        "Q" followed by the number (e.g., "Q12345"); None when given None
    """
    # Compare against None explicitly so that 0 still formats as "Q0".
    return None if qid_int is None else f"Q{qid_int}"
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def normalize_name(name: str) -> str:
    """
    Produce the lookup form of a name.

    Args:
        name: Raw name; a falsy value (empty string, None) is tolerated

    Returns:
        The name lowercased and then stripped of surrounding whitespace,
        or "" when the input is falsy
    """
    if name:
        lowered = name.lower()
        return lowered.strip()
    return ""
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def get_or_create_location(
    locations_db: "LocationsDatabase",
    name: str,
    location_type_id: int,
    source_id: int = 4,  # wikidata
    qid: Optional[int] = None,
    source_identifier: Optional[str] = None,
    parent_ids: Optional[list[int]] = None,
) -> int:
    """
    Fetch the ID of a location, creating the record if needed.

    This is a thin convenience wrapper: every argument is forwarded by
    keyword to ``locations_db.get_or_create`` and its result is returned
    unchanged.

    Args:
        locations_db: LocationsDatabase instance to delegate to
        name: Location name
        location_type_id: FK to location_types table
        source_id: FK to source_types table (defaults to 4, wikidata)
        qid: Optional Wikidata QID as integer
        source_identifier: Optional source-specific identifier
        parent_ids: Optional list of parent location IDs

    Returns:
        Location ID as returned by ``locations_db.get_or_create``
    """
    lookup_fields = {
        "name": name,
        "location_type_id": location_type_id,
        "source_id": source_id,
        "qid": qid,
        "source_identifier": source_identifier,
        "parent_ids": parent_ids,
    }
    return locations_db.get_or_create(**lookup_fields)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def get_or_create_role(
    roles_db: "RolesDatabase",
    name: str,
    source_id: int = 4,  # wikidata
    qid: Optional[int] = None,
    source_identifier: Optional[str] = None,
) -> int:
    """
    Fetch the ID of a role, creating the record if needed.

    This is a thin convenience wrapper: every argument is forwarded by
    keyword to ``roles_db.get_or_create`` and its result is returned
    unchanged.

    Args:
        roles_db: RolesDatabase instance to delegate to
        name: Role/title name
        source_id: FK to source_types table (defaults to 4, wikidata)
        qid: Optional Wikidata QID as integer
        source_identifier: Optional source-specific identifier

    Returns:
        Role ID as returned by ``roles_db.get_or_create``
    """
    role_id = roles_db.get_or_create(
        name=name,
        source_id=source_id,
        qid=qid,
        source_identifier=source_identifier,
    )
    return role_id
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def resolve_country_to_location_id(
    locations_db: "LocationsDatabase",
    country_text: str,
) -> Optional[int]:
    """
    Map a country name or code to a location ID.

    Args:
        locations_db: LocationsDatabase instance used for the lookup
        country_text: Country code (e.g., "US") or name (e.g., "United States")

    Returns:
        Whatever ``locations_db.resolve_region_text`` returns for the text,
        or None when country_text is empty/None (the database is not
        consulted in that case)
    """
    # Falsy input short-circuits before any database access.
    return locations_db.resolve_region_text(country_text) if country_text else None
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def get_source_id(source_name: str) -> int:
    """
    Look up the numeric source ID registered for a source name.

    Args:
        source_name: Source name (e.g., "gleif", "sec_edgar")

    Returns:
        The ID mapped to source_name in SOURCE_NAME_TO_ID, or 4 (wikidata)
        when the name has no entry
    """
    # The seed table is imported at call time rather than at module load.
    from ..seed_data import SOURCE_NAME_TO_ID as ids_by_name

    wikidata_source_id = 4  # fallback for names missing from the table
    return ids_by_name.get(source_name, wikidata_source_id)
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def get_source_name(source_id: int) -> str:
    """
    Look up the source name registered for a numeric source ID.

    Args:
        source_id: Source ID to translate

    Returns:
        The name mapped to source_id in SOURCE_ID_TO_NAME, or "wikidata"
        when the ID has no entry
    """
    # The seed table is imported at call time rather than at module load.
    from ..seed_data import SOURCE_ID_TO_NAME as names_by_id

    fallback_name = "wikidata"
    return names_by_id.get(source_id, fallback_name)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def get_entity_type_id(entity_type_name: str) -> int:
    """
    Look up the numeric ID registered for an entity type name.

    Args:
        entity_type_name: Entity type name (e.g., "business", "fund")

    Returns:
        The ID mapped to entity_type_name in ORG_TYPE_NAME_TO_ID, or 17
        (unknown) when the name has no entry
    """
    # The seed table is imported at call time rather than at module load.
    from ..seed_data import ORG_TYPE_NAME_TO_ID as ids_by_name

    unknown_type_id = 17  # fallback for names missing from the table
    return ids_by_name.get(entity_type_name, unknown_type_id)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def get_entity_type_name(entity_type_id: int) -> str:
    """
    Look up the entity type name registered for a numeric ID.

    Args:
        entity_type_id: Entity type ID to translate

    Returns:
        The name mapped to entity_type_id in ORG_TYPE_ID_TO_NAME, or
        "unknown" when the ID has no entry
    """
    # The seed table is imported at call time rather than at module load.
    from ..seed_data import ORG_TYPE_ID_TO_NAME as names_by_id

    fallback_name = "unknown"
    return names_by_id.get(entity_type_id, fallback_name)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def get_person_type_id(person_type_name: str) -> int:
    """
    Look up the numeric ID registered for a person type name.

    Args:
        person_type_name: Person type name (e.g., "executive", "politician")

    Returns:
        The ID mapped to person_type_name in PEOPLE_TYPE_NAME_TO_ID, or 15
        (unknown) when the name has no entry
    """
    # The seed table is imported at call time rather than at module load.
    from ..seed_data import PEOPLE_TYPE_NAME_TO_ID as ids_by_name

    unknown_type_id = 15  # fallback for names missing from the table
    return ids_by_name.get(person_type_name, unknown_type_id)
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def get_person_type_name(person_type_id: int) -> str:
    """
    Look up the person type name registered for a numeric ID.

    Args:
        person_type_id: Person type ID to translate

    Returns:
        The name mapped to person_type_id in PEOPLE_TYPE_ID_TO_NAME, or
        "unknown" when the ID has no entry
    """
    # The seed table is imported at call time rather than at module load.
    from ..seed_data import PEOPLE_TYPE_ID_TO_NAME as names_by_id

    fallback_name = "unknown"
    return names_by_id.get(person_type_id, fallback_name)
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def get_location_type_id(location_type_name: str) -> int:
    """
    Look up the numeric ID registered for a location type name.

    Args:
        location_type_name: Location type name (e.g., "country", "city")

    Returns:
        The ID mapped to location_type_name in LOCATION_TYPE_NAME_TO_ID,
        or 36 (other) when the name has no entry
    """
    # The seed table is imported at call time rather than at module load.
    from ..seed_data import LOCATION_TYPE_NAME_TO_ID as ids_by_name

    other_type_id = 36  # fallback for names missing from the table
    return ids_by_name.get(location_type_name, other_type_id)
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def get_location_type_id_from_qid(wikidata_qid: int) -> int:
    """
    Translate a Wikidata P31 (instance-of) QID into a location type ID.

    Args:
        wikidata_qid: Wikidata instance-of QID as integer (e.g., 515 for city)

    Returns:
        The ID mapped to wikidata_qid in LOCATION_TYPE_QID_TO_ID, or 36
        (other) when the QID has no entry
    """
    # The seed table is imported at call time rather than at module load.
    from ..seed_data import LOCATION_TYPE_QID_TO_ID as ids_by_qid

    other_type_id = 36  # fallback for QIDs missing from the table
    return ids_by_qid.get(wikidata_qid, other_type_id)
|
|
@@ -274,6 +274,20 @@ class SecEdgarImporter:
|
|
|
274
274
|
exchanges = data.get("exchanges", [])
|
|
275
275
|
exchange = exchanges[0] if exchanges else ""
|
|
276
276
|
|
|
277
|
+
# Get dates from filings history
|
|
278
|
+
# Use oldest filing date as from_date (when company started filing with SEC)
|
|
279
|
+
filings = data.get("filings", {})
|
|
280
|
+
recent_filings = filings.get("recent", {})
|
|
281
|
+
filing_dates = recent_filings.get("filingDate", [])
|
|
282
|
+
|
|
283
|
+
# Get the oldest filing date (last in the list, as they're typically newest-first)
|
|
284
|
+
from_date = None
|
|
285
|
+
if filing_dates:
|
|
286
|
+
# Filing dates are in YYYY-MM-DD format
|
|
287
|
+
oldest_date = filing_dates[-1] if filing_dates else None
|
|
288
|
+
if oldest_date and len(oldest_date) >= 10:
|
|
289
|
+
from_date = oldest_date[:10]
|
|
290
|
+
|
|
277
291
|
# Build record
|
|
278
292
|
record_data = {
|
|
279
293
|
"cik": cik,
|
|
@@ -292,6 +306,8 @@ class SecEdgarImporter:
|
|
|
292
306
|
"zip": business_addr.get("zipCode", ""),
|
|
293
307
|
},
|
|
294
308
|
}
|
|
309
|
+
if from_date:
|
|
310
|
+
record_data["first_filing_date"] = from_date
|
|
295
311
|
|
|
296
312
|
# Use stateOrCountry for region (2-letter US state or country code)
|
|
297
313
|
region = business_addr.get("stateOrCountry", "US")
|
|
@@ -302,6 +318,7 @@ class SecEdgarImporter:
|
|
|
302
318
|
source_id=cik,
|
|
303
319
|
region=region,
|
|
304
320
|
entity_type=record_entity_type,
|
|
321
|
+
from_date=from_date,
|
|
305
322
|
record=record_data,
|
|
306
323
|
)
|
|
307
324
|
|