corp-extractor 0.9.0__py3-none-any.whl → 0.9.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +72 -11
  2. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +34 -27
  3. statement_extractor/cli.py +1317 -101
  4. statement_extractor/database/embeddings.py +45 -0
  5. statement_extractor/database/hub.py +86 -136
  6. statement_extractor/database/importers/__init__.py +10 -2
  7. statement_extractor/database/importers/companies_house.py +16 -2
  8. statement_extractor/database/importers/companies_house_officers.py +431 -0
  9. statement_extractor/database/importers/gleif.py +23 -0
  10. statement_extractor/database/importers/import_utils.py +264 -0
  11. statement_extractor/database/importers/sec_edgar.py +17 -0
  12. statement_extractor/database/importers/sec_form4.py +512 -0
  13. statement_extractor/database/importers/wikidata.py +151 -43
  14. statement_extractor/database/importers/wikidata_dump.py +2282 -0
  15. statement_extractor/database/importers/wikidata_people.py +867 -325
  16. statement_extractor/database/migrate_v2.py +852 -0
  17. statement_extractor/database/models.py +155 -7
  18. statement_extractor/database/schema_v2.py +409 -0
  19. statement_extractor/database/seed_data.py +359 -0
  20. statement_extractor/database/store.py +3449 -233
  21. statement_extractor/document/deduplicator.py +10 -12
  22. statement_extractor/extractor.py +1 -1
  23. statement_extractor/models/__init__.py +3 -2
  24. statement_extractor/models/statement.py +15 -17
  25. statement_extractor/models.py +1 -1
  26. statement_extractor/pipeline/context.py +5 -5
  27. statement_extractor/pipeline/orchestrator.py +12 -12
  28. statement_extractor/plugins/base.py +17 -17
  29. statement_extractor/plugins/extractors/gliner2.py +28 -28
  30. statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
  31. statement_extractor/plugins/qualifiers/person.py +120 -53
  32. statement_extractor/plugins/splitters/t5_gemma.py +35 -39
  33. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
  34. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,264 @@
1
+ """
2
+ Shared utilities for v2 database importers.
3
+
4
+ Provides helper functions for resolving locations, roles, and QIDs
5
+ to their normalized FK references in the v2 schema.
6
+ """
7
+
8
+ import logging
9
+ from typing import TYPE_CHECKING, Optional
10
+
11
+ if TYPE_CHECKING:
12
+ from ..store import LocationsDatabase, RolesDatabase
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def parse_qid(qid_text: Optional[str]) -> Optional[int]:
    """
    Parse a QID string to integer.

    Accepts an optional leading "Q"/"q" followed by plain ASCII digits.
    Surrounding whitespace is ignored.

    Args:
        qid_text: QID string like "Q12345" or just "12345"

    Returns:
        Integer QID, or None if the text is empty or not a well-formed QID
    """
    if not qid_text:
        return None

    digits = qid_text.strip()

    # Drop a single leading "Q" / "q" marker if present.
    if digits[:1] in ("Q", "q"):
        digits = digits[1:]

    # int() on its own is too permissive for identifiers: it accepts signs
    # ("-5"), digit-group underscores ("1_000"), inner whitespace (" 12")
    # and non-ASCII digits. None of those are valid QIDs, so require plain
    # ASCII digits before converting.
    if not (digits.isascii() and digits.isdigit()):
        return None

    try:
        return int(digits)
    except ValueError:
        # Still possible for absurdly long digit strings on interpreters
        # that cap str -> int conversion length.
        return None
41
+
42
+
43
def format_qid(qid_int: Optional[int]) -> Optional[str]:
    """
    Render an integer QID in its "Q"-prefixed string form.

    Args:
        qid_int: Integer QID (e.g., 12345), or None

    Returns:
        "Q" followed by the integer (e.g., "Q12345"); None when given None
    """
    return None if qid_int is None else f"Q{qid_int}"
56
+
57
+
58
def normalize_name(name: str) -> str:
    """
    Produce the lookup form of a name.

    Args:
        name: Name to normalize

    Returns:
        The name lowercased with leading/trailing whitespace removed;
        an empty string when the input is falsy
    """
    if name:
        lowered = name.lower()
        return lowered.strip()
    return ""
71
+
72
+
73
def get_or_create_location(
    locations_db: "LocationsDatabase",
    name: str,
    location_type_id: int,
    source_id: int = 4,  # wikidata
    qid: Optional[int] = None,
    source_identifier: Optional[str] = None,
    parent_ids: Optional[list[int]] = None,
) -> int:
    """
    Look up a location record, creating it if needed.

    Thin convenience wrapper: every argument is forwarded by keyword to
    ``locations_db.get_or_create`` and its result is returned unchanged.

    Args:
        locations_db: LocationsDatabase instance
        name: Location name
        location_type_id: FK to location_types table
        source_id: FK to source_types table
        qid: Optional Wikidata QID as integer
        source_identifier: Optional source-specific identifier
        parent_ids: Optional list of parent location IDs

    Returns:
        Location ID
    """
    lookup_fields = {
        "name": name,
        "location_type_id": location_type_id,
        "source_id": source_id,
        "qid": qid,
        "source_identifier": source_identifier,
        "parent_ids": parent_ids,
    }
    return locations_db.get_or_create(**lookup_fields)
105
+
106
+
107
def get_or_create_role(
    roles_db: "RolesDatabase",
    name: str,
    source_id: int = 4,  # wikidata
    qid: Optional[int] = None,
    source_identifier: Optional[str] = None,
) -> int:
    """
    Look up a role record, creating it if needed.

    Thin convenience wrapper: every argument is forwarded by keyword to
    ``roles_db.get_or_create`` and its result is returned unchanged.

    Args:
        roles_db: RolesDatabase instance
        name: Role/title name
        source_id: FK to source_types table
        qid: Optional Wikidata QID as integer
        source_identifier: Optional source-specific identifier

    Returns:
        Role ID
    """
    lookup_fields = {
        "name": name,
        "source_id": source_id,
        "qid": qid,
        "source_identifier": source_identifier,
    }
    return roles_db.get_or_create(**lookup_fields)
133
+
134
+
135
def resolve_country_to_location_id(
    locations_db: "LocationsDatabase",
    country_text: str,
) -> Optional[int]:
    """
    Map a country name or code to a location ID.

    Delegates to ``locations_db.resolve_region_text``; empty input is
    short-circuited without touching the database object.

    Args:
        locations_db: LocationsDatabase instance
        country_text: Country code (e.g., "US") or name (e.g., "United States")

    Returns:
        Location ID or None if not found
    """
    if country_text:
        return locations_db.resolve_region_text(country_text)
    return None
153
+
154
+
155
def get_source_id(source_name: str) -> int:
    """
    Translate a source name into its source_id.

    Args:
        source_name: Source name (e.g., "gleif", "sec_edgar")

    Returns:
        Source ID from SOURCE_NAME_TO_ID; 4 (wikidata) when the name is unknown
    """
    # Imported lazily, at call time rather than module import time.
    from ..seed_data import SOURCE_NAME_TO_ID as name_to_id

    fallback_id = 4  # wikidata
    return name_to_id.get(source_name, fallback_id)
167
+
168
+
169
def get_source_name(source_id: int) -> str:
    """
    Translate a source_id into its source name.

    Args:
        source_id: Source ID

    Returns:
        Source name from SOURCE_ID_TO_NAME; "wikidata" when the ID is unknown
    """
    # Imported lazily, at call time rather than module import time.
    from ..seed_data import SOURCE_ID_TO_NAME as id_to_name

    resolved = id_to_name.get(source_id, "wikidata")
    return resolved
181
+
182
+
183
def get_entity_type_id(entity_type_name: str) -> int:
    """
    Translate an entity type name into its entity_type_id.

    Args:
        entity_type_name: Entity type name (e.g., "business", "fund")

    Returns:
        Entity type ID from ORG_TYPE_NAME_TO_ID; 17 (unknown) when the
        name is not recognised
    """
    # Imported lazily, at call time rather than module import time.
    from ..seed_data import ORG_TYPE_NAME_TO_ID as name_to_id

    fallback_id = 17  # unknown
    return name_to_id.get(entity_type_name, fallback_id)
195
+
196
+
197
def get_entity_type_name(entity_type_id: int) -> str:
    """
    Translate an entity_type_id into its entity type name.

    Args:
        entity_type_id: Entity type ID

    Returns:
        Entity type name from ORG_TYPE_ID_TO_NAME; "unknown" when the ID
        is not recognised
    """
    # Imported lazily, at call time rather than module import time.
    from ..seed_data import ORG_TYPE_ID_TO_NAME as id_to_name

    resolved = id_to_name.get(entity_type_id, "unknown")
    return resolved
209
+
210
+
211
def get_person_type_id(person_type_name: str) -> int:
    """
    Translate a person type name into its person_type_id.

    Args:
        person_type_name: Person type name (e.g., "executive", "politician")

    Returns:
        Person type ID from PEOPLE_TYPE_NAME_TO_ID; 15 (unknown) when the
        name is not recognised
    """
    # Imported lazily, at call time rather than module import time.
    from ..seed_data import PEOPLE_TYPE_NAME_TO_ID as name_to_id

    fallback_id = 15  # unknown
    return name_to_id.get(person_type_name, fallback_id)
223
+
224
+
225
def get_person_type_name(person_type_id: int) -> str:
    """
    Translate a person_type_id into its person type name.

    Args:
        person_type_id: Person type ID

    Returns:
        Person type name from PEOPLE_TYPE_ID_TO_NAME; "unknown" when the
        ID is not recognised
    """
    # Imported lazily, at call time rather than module import time.
    from ..seed_data import PEOPLE_TYPE_ID_TO_NAME as id_to_name

    resolved = id_to_name.get(person_type_id, "unknown")
    return resolved
237
+
238
+
239
def get_location_type_id(location_type_name: str) -> int:
    """
    Translate a location type name into its location_type_id.

    Args:
        location_type_name: Location type name (e.g., "country", "city")

    Returns:
        Location type ID from LOCATION_TYPE_NAME_TO_ID; 36 (other) when
        the name is not recognised
    """
    # Imported lazily, at call time rather than module import time.
    from ..seed_data import LOCATION_TYPE_NAME_TO_ID as name_to_id

    fallback_id = 36  # other
    return name_to_id.get(location_type_name, fallback_id)
251
+
252
+
253
def get_location_type_id_from_qid(wikidata_qid: int) -> int:
    """
    Translate a Wikidata P31 (instance-of) QID into a location_type_id.

    Args:
        wikidata_qid: Wikidata instance-of QID (e.g., 515 for city)

    Returns:
        Location type ID from LOCATION_TYPE_QID_TO_ID; 36 (other) when
        the QID is not mapped
    """
    # Imported lazily, at call time rather than module import time.
    from ..seed_data import LOCATION_TYPE_QID_TO_ID as qid_to_id

    fallback_id = 36  # other
    return qid_to_id.get(wikidata_qid, fallback_id)
@@ -274,6 +274,20 @@ class SecEdgarImporter:
274
274
  exchanges = data.get("exchanges", [])
275
275
  exchange = exchanges[0] if exchanges else ""
276
276
 
277
+ # Get dates from filings history
278
+ # Use oldest filing date as from_date (when company started filing with SEC)
279
+ filings = data.get("filings", {})
280
+ recent_filings = filings.get("recent", {})
281
+ filing_dates = recent_filings.get("filingDate", [])
282
+
283
+ # Get the oldest filing date (last in the list, as they're typically newest-first)
284
+ from_date = None
285
+ if filing_dates:
286
+ # Filing dates are in YYYY-MM-DD format
287
+ oldest_date = filing_dates[-1] if filing_dates else None
288
+ if oldest_date and len(oldest_date) >= 10:
289
+ from_date = oldest_date[:10]
290
+
277
291
  # Build record
278
292
  record_data = {
279
293
  "cik": cik,
@@ -292,6 +306,8 @@ class SecEdgarImporter:
292
306
  "zip": business_addr.get("zipCode", ""),
293
307
  },
294
308
  }
309
+ if from_date:
310
+ record_data["first_filing_date"] = from_date
295
311
 
296
312
  # Use stateOrCountry for region (2-letter US state or country code)
297
313
  region = business_addr.get("stateOrCountry", "US")
@@ -302,6 +318,7 @@ class SecEdgarImporter:
302
318
  source_id=cik,
303
319
  region=region,
304
320
  entity_type=record_entity_type,
321
+ from_date=from_date,
305
322
  record=record_data,
306
323
  )
307
324