corp-extractor 0.9.0__py3-none-any.whl → 0.9.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +72 -11
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +34 -27
- statement_extractor/cli.py +1317 -101
- statement_extractor/database/embeddings.py +45 -0
- statement_extractor/database/hub.py +86 -136
- statement_extractor/database/importers/__init__.py +10 -2
- statement_extractor/database/importers/companies_house.py +16 -2
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +23 -0
- statement_extractor/database/importers/import_utils.py +264 -0
- statement_extractor/database/importers/sec_edgar.py +17 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +151 -43
- statement_extractor/database/importers/wikidata_dump.py +2282 -0
- statement_extractor/database/importers/wikidata_people.py +867 -325
- statement_extractor/database/migrate_v2.py +852 -0
- statement_extractor/database/models.py +155 -7
- statement_extractor/database/schema_v2.py +409 -0
- statement_extractor/database/seed_data.py +359 -0
- statement_extractor/database/store.py +3449 -233
- statement_extractor/document/deduplicator.py +10 -12
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +3 -2
- statement_extractor/models/statement.py +15 -17
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +12 -12
- statement_extractor/plugins/base.py +17 -17
- statement_extractor/plugins/extractors/gliner2.py +28 -28
- statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
- statement_extractor/plugins/qualifiers/person.py +120 -53
- statement_extractor/plugins/splitters/t5_gemma.py +35 -39
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Seed data for enum lookup tables in the v2 normalized schema.
|
|
3
|
+
|
|
4
|
+
This module contains all enum values that are seeded into lookup tables
|
|
5
|
+
when creating a fresh database or migrating from v1.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
# =============================================================================
|
|
11
|
+
# SOURCE TYPES
|
|
12
|
+
# =============================================================================
|
|
13
|
+
|
|
14
|
+
# Canonical (id, name) rows seeded into the source_types lookup table.
SOURCE_TYPES: list[tuple[int, str]] = [
    (1, "gleif"),
    (2, "sec_edgar"),
    (3, "companies_house"),
    (4, "wikidata"),
]

# Mapping from source names to v2 source IDs. Built from SOURCE_TYPES so the
# table and the lookup cannot drift apart when a source is added or renumbered.
SOURCE_NAME_TO_ID: dict[str, int] = {name: id_ for id_, name in SOURCE_TYPES}
# Legacy name mapping (v1 used "wikipedia" for Wikidata sources)
SOURCE_NAME_TO_ID["wikipedia"] = SOURCE_NAME_TO_ID["wikidata"]

# Reverse lookup from v2 source ID to its canonical name. Only canonical names
# appear here; the legacy "wikipedia" alias is deliberately absent.
SOURCE_ID_TO_NAME: dict[int, str] = {id_: name for id_, name in SOURCE_TYPES}
|
|
37
|
+
|
|
38
|
+
# =============================================================================
|
|
39
|
+
# PEOPLE TYPES
|
|
40
|
+
# =============================================================================
|
|
41
|
+
|
|
42
|
+
# (id, name) rows seeded into the people_types lookup table.
PEOPLE_TYPES: list[tuple[int, str]] = [
    (1, "executive"),
    (2, "politician"),
    (3, "government"),
    (4, "military"),
    (5, "legal"),
    (6, "professional"),
    (7, "academic"),
    (8, "artist"),
    (9, "media"),
    (10, "athlete"),
    (11, "entrepreneur"),
    (12, "journalist"),
    (13, "activist"),
    (14, "scientist"),
    (15, "unknown"),
]

# Lookups in both directions, derived from the table above.
PEOPLE_TYPE_NAME_TO_ID: dict[str, int] = {
    type_name: type_id for type_id, type_name in PEOPLE_TYPES
}
PEOPLE_TYPE_ID_TO_NAME: dict[int, str] = dict(PEOPLE_TYPES)
|
|
62
|
+
|
|
63
|
+
# =============================================================================
|
|
64
|
+
# ORGANIZATION TYPES
|
|
65
|
+
# =============================================================================
|
|
66
|
+
|
|
67
|
+
# (id, name) rows seeded into the organization_types lookup table.
ORGANIZATION_TYPES: list[tuple[int, str]] = [
    (1, "business"),
    (2, "fund"),
    (3, "branch"),
    (4, "nonprofit"),
    (5, "ngo"),
    (6, "foundation"),
    (7, "trade_union"),
    (8, "government"),
    (9, "international_org"),
    (10, "political_party"),
    (11, "educational"),
    (12, "research"),
    (13, "religious"),
    (14, "sports"),
    (15, "media"),
    (16, "healthcare"),
    (17, "unknown"),
]

# Lookups in both directions, derived from the table above.
ORG_TYPE_NAME_TO_ID: dict[str, int] = {
    type_name: type_id for type_id, type_name in ORGANIZATION_TYPES
}
ORG_TYPE_ID_TO_NAME: dict[int, str] = dict(ORGANIZATION_TYPES)
|
|
89
|
+
|
|
90
|
+
# =============================================================================
|
|
91
|
+
# SIMPLIFIED LOCATION TYPES
|
|
92
|
+
# =============================================================================
|
|
93
|
+
|
|
94
|
+
# (id, name) rows seeded into the simplified_location_types lookup table.
SIMPLIFIED_LOCATION_TYPES: list[tuple[int, str]] = [
    (1, "continent"),
    (2, "country"),
    (3, "subdivision"),  # States, provinces, regions, counties, departments
    (4, "city"),  # Cities, towns, municipalities, communes
    (5, "district"),  # Districts, boroughs, neighborhoods
    (6, "historic"),  # Former countries, historic territories
    (7, "other"),  # Unclassified locations
]

# Lookups in both directions, derived from the table above.
SIMPLIFIED_LOCATION_TYPE_NAME_TO_ID: dict[str, int] = {
    type_name: type_id for type_id, type_name in SIMPLIFIED_LOCATION_TYPES
}
SIMPLIFIED_LOCATION_TYPE_ID_TO_NAME: dict[int, str] = dict(SIMPLIFIED_LOCATION_TYPES)
|
|
110
|
+
|
|
111
|
+
# =============================================================================
|
|
112
|
+
# DETAILED LOCATION TYPES WITH WIKIDATA QID MAPPINGS
|
|
113
|
+
# =============================================================================
|
|
114
|
+
|
|
115
|
+
# Format: (id, name, qid, simplified_id)
|
|
116
|
+
# qid is the Wikidata Q code as integer (e.g., Q515 -> 515)
|
|
117
|
+
LOCATION_TYPES: list[tuple[int, str, int | None, int]] = [
|
|
118
|
+
# Continents (simplified_id=1)
|
|
119
|
+
(1, "continent", 5107, 1),
|
|
120
|
+
|
|
121
|
+
# Countries/Sovereigns (simplified_id=2)
|
|
122
|
+
(2, "country", 6256, 2),
|
|
123
|
+
(3, "sovereign_state", 3624078, 2),
|
|
124
|
+
(4, "dependent_territory", 161243, 2),
|
|
125
|
+
|
|
126
|
+
# Subdivisions - US specific (simplified_id=3)
|
|
127
|
+
(5, "us_state", 35657, 3),
|
|
128
|
+
(6, "us_county", 47168, 3),
|
|
129
|
+
|
|
130
|
+
# Subdivisions - Other countries (simplified_id=3)
|
|
131
|
+
(7, "state_of_australia", 5852411, 3),
|
|
132
|
+
(8, "state_of_germany", 1221156, 3),
|
|
133
|
+
(9, "state_of_india", 131541, 3),
|
|
134
|
+
(10, "province", 34876, 3),
|
|
135
|
+
(11, "region", 82794, 3),
|
|
136
|
+
(12, "county", 28575, 3),
|
|
137
|
+
(13, "department_france", 6465, 3),
|
|
138
|
+
(14, "prefecture_japan", 50337, 3),
|
|
139
|
+
(15, "canton_switzerland", 23058, 3),
|
|
140
|
+
(16, "autonomous_community_spain", 10742, 3),
|
|
141
|
+
(17, "voivodeship_poland", 150093, 3),
|
|
142
|
+
(18, "oblast_russia", 835714, 3),
|
|
143
|
+
|
|
144
|
+
# Cities/Towns (simplified_id=4)
|
|
145
|
+
(19, "city", 515, 4),
|
|
146
|
+
(20, "big_city", 1549591, 4),
|
|
147
|
+
(21, "capital", 5119, 4),
|
|
148
|
+
(22, "town", 3957, 4),
|
|
149
|
+
(23, "municipality", 15284, 4),
|
|
150
|
+
(24, "commune_france", 484170, 4),
|
|
151
|
+
(25, "municipality_germany", 262166, 4),
|
|
152
|
+
(26, "municipality_japan", 1054813, 4),
|
|
153
|
+
(27, "village", 532, 4),
|
|
154
|
+
(28, "hamlet", 5084, 4),
|
|
155
|
+
|
|
156
|
+
# Districts (simplified_id=5)
|
|
157
|
+
(29, "district", 149621, 5),
|
|
158
|
+
(30, "borough", 5765681, 5),
|
|
159
|
+
(31, "neighborhood", 123705, 5),
|
|
160
|
+
(32, "ward", 12813115, 5),
|
|
161
|
+
|
|
162
|
+
# Historic (simplified_id=6)
|
|
163
|
+
(33, "former_country", 3024240, 6),
|
|
164
|
+
(34, "ancient_civilization", 28171280, 6),
|
|
165
|
+
(35, "historic_territory", 1620908, 6),
|
|
166
|
+
|
|
167
|
+
# Other/Unknown (simplified_id=7)
|
|
168
|
+
(36, "other", None, 7),
|
|
169
|
+
]
|
|
170
|
+
|
|
171
|
+
LOCATION_TYPE_NAME_TO_ID: dict[str, int] = {
|
|
172
|
+
name: id_ for id_, name, _, _ in LOCATION_TYPES
|
|
173
|
+
}
|
|
174
|
+
LOCATION_TYPE_ID_TO_NAME: dict[int, str] = {
|
|
175
|
+
id_: name for id_, name, _, _ in LOCATION_TYPES
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
# Mapping from Wikidata QID (P31 value) to location_type_id
|
|
179
|
+
LOCATION_TYPE_QID_TO_ID: dict[int, int] = {
|
|
180
|
+
qid: id_ for id_, name, qid, _ in LOCATION_TYPES if qid is not None
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
# Mapping from location_type_id to simplified_id
|
|
184
|
+
LOCATION_TYPE_TO_SIMPLIFIED: dict[int, int] = {
|
|
185
|
+
id_: simplified_id for id_, _, _, simplified_id in LOCATION_TYPES
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
# =============================================================================
|
|
190
|
+
# PYCOUNTRY INTEGRATION
|
|
191
|
+
# =============================================================================
|
|
192
|
+
|
|
193
|
+
def get_pycountry_countries() -> list[dict[str, Any]]:
    """
    Collect every pycountry country as a plain dict for seeding the locations table.

    pycountry is imported inside the function, so it is only required when
    this function is actually called.

    Returns:
        One dict per entry of ``pycountry.countries`` with keys: name,
        alpha_2, alpha_3, numeric. ``alpha_3`` and ``numeric`` are None when
        the record does not carry that attribute.
    """
    import pycountry

    return [
        {
            "name": entry.name,
            "alpha_2": entry.alpha_2,
            # Read defensively: fall back to None if the attribute is missing.
            "alpha_3": getattr(entry, "alpha_3", None),
            "numeric": getattr(entry, "numeric", None),
        }
        for entry in pycountry.countries
    ]
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
# =============================================================================
|
|
214
|
+
# SEED FUNCTIONS
|
|
215
|
+
# =============================================================================
|
|
216
|
+
|
|
217
|
+
def seed_source_types(conn) -> int:
    """
    Populate the source_types lookup table from SOURCE_TYPES.

    Rows are written with INSERT OR IGNORE, so rows that conflict with
    existing ones are skipped rather than raising. The connection is
    committed before returning.

    Args:
        conn: SQLite connection

    Returns:
        len(SOURCE_TYPES), i.e. the number of seed rows submitted. Rows
        skipped by OR IGNORE are still counted.
    """
    insert_sql = "INSERT OR IGNORE INTO source_types (id, name) VALUES (?, ?)"
    conn.executemany(insert_sql, SOURCE_TYPES)
    conn.commit()
    return len(SOURCE_TYPES)
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def seed_people_types(conn) -> int:
    """
    Populate the people_types lookup table from PEOPLE_TYPES.

    Rows are written with INSERT OR IGNORE, so rows that conflict with
    existing ones are skipped rather than raising. The connection is
    committed before returning.

    Args:
        conn: SQLite connection

    Returns:
        len(PEOPLE_TYPES), i.e. the number of seed rows submitted. Rows
        skipped by OR IGNORE are still counted.
    """
    insert_sql = "INSERT OR IGNORE INTO people_types (id, name) VALUES (?, ?)"
    conn.executemany(insert_sql, PEOPLE_TYPES)
    conn.commit()
    return len(PEOPLE_TYPES)
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def seed_organization_types(conn) -> int:
    """
    Populate the organization_types lookup table from ORGANIZATION_TYPES.

    Rows are written with INSERT OR IGNORE, so rows that conflict with
    existing ones are skipped rather than raising. The connection is
    committed before returning.

    Args:
        conn: SQLite connection

    Returns:
        len(ORGANIZATION_TYPES), i.e. the number of seed rows submitted. Rows
        skipped by OR IGNORE are still counted.
    """
    insert_sql = "INSERT OR IGNORE INTO organization_types (id, name) VALUES (?, ?)"
    conn.executemany(insert_sql, ORGANIZATION_TYPES)
    conn.commit()
    return len(ORGANIZATION_TYPES)
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def seed_simplified_location_types(conn) -> int:
    """
    Populate the simplified_location_types lookup table from
    SIMPLIFIED_LOCATION_TYPES.

    Rows are written with INSERT OR IGNORE, so rows that conflict with
    existing ones are skipped rather than raising. The connection is
    committed before returning.

    Args:
        conn: SQLite connection

    Returns:
        len(SIMPLIFIED_LOCATION_TYPES), i.e. the number of seed rows
        submitted. Rows skipped by OR IGNORE are still counted.
    """
    insert_sql = "INSERT OR IGNORE INTO simplified_location_types (id, name) VALUES (?, ?)"
    conn.executemany(insert_sql, SIMPLIFIED_LOCATION_TYPES)
    conn.commit()
    return len(SIMPLIFIED_LOCATION_TYPES)
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def seed_location_types(conn) -> int:
    """
    Populate the location_types lookup table, including Wikidata QID mappings.

    Each LOCATION_TYPES row supplies (id, name, qid, simplified_id). Rows are
    written with INSERT OR IGNORE, so rows that conflict with existing ones
    are skipped rather than raising. The connection is committed before
    returning.

    Args:
        conn: SQLite connection

    Returns:
        len(LOCATION_TYPES), i.e. the number of seed rows submitted. Rows
        skipped by OR IGNORE are still counted.
    """
    insert_sql = "INSERT OR IGNORE INTO location_types (id, name, qid, simplified_id) VALUES (?, ?, ?, ?)"
    conn.executemany(insert_sql, LOCATION_TYPES)
    conn.commit()
    return len(LOCATION_TYPES)
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def seed_all_enums(conn) -> dict[str, int]:
    """
    Run every enum seeder against the same connection.

    The seeders are called in the order listed below, and each one commits
    its own insert.

    Args:
        conn: SQLite connection

    Returns:
        Dict mapping table name to the count reported by that table's seeder
    """
    seeders = (
        ("source_types", seed_source_types),
        ("people_types", seed_people_types),
        ("organization_types", seed_organization_types),
        ("simplified_location_types", seed_simplified_location_types),
        ("location_types", seed_location_types),
    )
    return {table_name: seed(conn) for table_name, seed in seeders}
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def seed_pycountry_locations(conn, source_id: int = 4) -> int:
    """
    Insert one locations row per pycountry country.

    Every row is typed as the "country" location type and uses the country's
    alpha-2 code as its source_identifier. The normalized name is the
    lowercased country name. Rows are written with INSERT OR IGNORE and a
    single commit is issued after the loop.

    Args:
        conn: SQLite connection
        source_id: Source ID to use (default: 4 = wikidata, used for pycountry data)

    Returns:
        Number of countries processed. A row skipped by OR IGNORE is still
        counted.
    """
    import pycountry

    # Every seeded row shares the "country" location type.
    country_type_id = LOCATION_TYPE_NAME_TO_ID["country"]
    insert_sql = """
        INSERT OR IGNORE INTO locations
        (name, name_normalized, source_id, source_identifier, location_type_id)
        VALUES (?, ?, ?, ?, ?)
        """

    processed = 0
    for processed, entry in enumerate(pycountry.countries, start=1):
        country_name = entry.name
        row = (country_name, country_name.lower(), source_id, entry.alpha_2, country_type_id)
        conn.execute(insert_sql, row)

    conn.commit()
    return processed
|