corp-extractor 0.9.3__py3-none-any.whl → 0.9.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,359 @@
1
+ """
2
+ Seed data for enum lookup tables in the v2 normalized schema.
3
+
4
+ This module contains all enum values that are seeded into lookup tables
5
+ when creating a fresh database or migrating from v1.
6
+ """
7
+
8
+ from typing import Any
9
+
10
+ # =============================================================================
11
+ # SOURCE TYPES
12
+ # =============================================================================
13
+
14
# Canonical (id, name) rows for the source_types lookup table.
SOURCE_TYPES: list[tuple[int, str]] = [
    (1, "gleif"),
    (2, "sec_edgar"),
    (3, "companies_house"),
    (4, "wikidata"),
]

# Mapping from source names (including old v1 names) to v2 source IDs.
# Derived from SOURCE_TYPES so the lookup can never drift from the seed rows.
SOURCE_NAME_TO_ID: dict[str, int] = {
    **{name: id_ for id_, name in SOURCE_TYPES},
    # Legacy name mapping (v1 used "wikipedia" for Wikidata sources)
    "wikipedia": 4,
}

# Reverse lookup. Built from SOURCE_TYPES rather than by inverting
# SOURCE_NAME_TO_ID, so the legacy "wikipedia" alias cannot replace the
# canonical "wikidata" name for ID 4.
SOURCE_ID_TO_NAME: dict[int, str] = {id_: name for id_, name in SOURCE_TYPES}
37
+
38
+ # =============================================================================
39
+ # PEOPLE TYPES
40
+ # =============================================================================
41
+
42
# (id, name) rows for the people_types lookup table. The IDs are written
# explicitly because they are inserted verbatim by the seed step.
PEOPLE_TYPES: list[tuple[int, str]] = [
    (1, "executive"),
    (2, "politician"),
    (3, "government"),
    (4, "military"),
    (5, "legal"),
    (6, "professional"),
    (7, "academic"),
    (8, "artist"),
    (9, "media"),
    (10, "athlete"),
    (11, "entrepreneur"),
    (12, "journalist"),
    (13, "activist"),
    (14, "scientist"),
    (15, "unknown"),
]

# Forward (id -> name) lookup comes straight from the row pairs; the reverse
# (name -> id) lookup swaps each pair.
PEOPLE_TYPE_ID_TO_NAME: dict[int, str] = dict(PEOPLE_TYPES)
PEOPLE_TYPE_NAME_TO_ID: dict[str, int] = {
    label: type_id for type_id, label in PEOPLE_TYPES
}
62
+
63
+ # =============================================================================
64
+ # ORGANIZATION TYPES
65
+ # =============================================================================
66
+
67
# (id, name) rows for the organization_types lookup table. The IDs are written
# explicitly because they are inserted verbatim by the seed step.
ORGANIZATION_TYPES: list[tuple[int, str]] = [
    (1, "business"),
    (2, "fund"),
    (3, "branch"),
    (4, "nonprofit"),
    (5, "ngo"),
    (6, "foundation"),
    (7, "trade_union"),
    (8, "government"),
    (9, "international_org"),
    (10, "political_party"),
    (11, "educational"),
    (12, "research"),
    (13, "religious"),
    (14, "sports"),
    (15, "media"),
    (16, "healthcare"),
    (17, "unknown"),
]

# Forward (id -> name) lookup comes straight from the row pairs; the reverse
# (name -> id) lookup swaps each pair.
ORG_TYPE_ID_TO_NAME: dict[int, str] = dict(ORGANIZATION_TYPES)
ORG_TYPE_NAME_TO_ID: dict[str, int] = {
    label: type_id for type_id, label in ORGANIZATION_TYPES
}
89
+
90
+ # =============================================================================
91
+ # SIMPLIFIED LOCATION TYPES
92
+ # =============================================================================
93
+
94
# (id, name) rows for the simplified_location_types lookup table: the coarse
# buckets that each detailed location type is grouped under.
SIMPLIFIED_LOCATION_TYPES: list[tuple[int, str]] = [
    (1, "continent"),
    (2, "country"),
    (3, "subdivision"),  # states, provinces, regions, counties, departments
    (4, "city"),  # cities, towns, municipalities, communes
    (5, "district"),  # districts, boroughs, neighborhoods
    (6, "historic"),  # former countries, historic territories
    (7, "other"),  # unclassified locations
]

# Forward (id -> name) lookup comes straight from the row pairs; the reverse
# (name -> id) lookup swaps each pair.
SIMPLIFIED_LOCATION_TYPE_ID_TO_NAME: dict[int, str] = dict(SIMPLIFIED_LOCATION_TYPES)
SIMPLIFIED_LOCATION_TYPE_NAME_TO_ID: dict[str, int] = {
    label: bucket_id for bucket_id, label in SIMPLIFIED_LOCATION_TYPES
}
110
+
111
+ # =============================================================================
112
+ # DETAILED LOCATION TYPES WITH WIKIDATA QID MAPPINGS
113
+ # =============================================================================
114
+
115
+ # Format: (id, name, qid, simplified_id)
116
+ # qid is the Wikidata Q code as integer (e.g., Q515 -> 515)
117
+ LOCATION_TYPES: list[tuple[int, str, int | None, int]] = [
118
+ # Continents (simplified_id=1)
119
+ (1, "continent", 5107, 1),
120
+
121
+ # Countries/Sovereigns (simplified_id=2)
122
+ (2, "country", 6256, 2),
123
+ (3, "sovereign_state", 3624078, 2),
124
+ (4, "dependent_territory", 161243, 2),
125
+
126
+ # Subdivisions - US specific (simplified_id=3)
127
+ (5, "us_state", 35657, 3),
128
+ (6, "us_county", 47168, 3),
129
+
130
+ # Subdivisions - Other countries (simplified_id=3)
131
+ (7, "state_of_australia", 5852411, 3),
132
+ (8, "state_of_germany", 1221156, 3),
133
+ (9, "state_of_india", 131541, 3),
134
+ (10, "province", 34876, 3),
135
+ (11, "region", 82794, 3),
136
+ (12, "county", 28575, 3),
137
+ (13, "department_france", 6465, 3),
138
+ (14, "prefecture_japan", 50337, 3),
139
+ (15, "canton_switzerland", 23058, 3),
140
+ (16, "autonomous_community_spain", 10742, 3),
141
+ (17, "voivodeship_poland", 150093, 3),
142
+ (18, "oblast_russia", 835714, 3),
143
+
144
+ # Cities/Towns (simplified_id=4)
145
+ (19, "city", 515, 4),
146
+ (20, "big_city", 1549591, 4),
147
+ (21, "capital", 5119, 4),
148
+ (22, "town", 3957, 4),
149
+ (23, "municipality", 15284, 4),
150
+ (24, "commune_france", 484170, 4),
151
+ (25, "municipality_germany", 262166, 4),
152
+ (26, "municipality_japan", 1054813, 4),
153
+ (27, "village", 532, 4),
154
+ (28, "hamlet", 5084, 4),
155
+
156
+ # Districts (simplified_id=5)
157
+ (29, "district", 149621, 5),
158
+ (30, "borough", 5765681, 5),
159
+ (31, "neighborhood", 123705, 5),
160
+ (32, "ward", 12813115, 5),
161
+
162
+ # Historic (simplified_id=6)
163
+ (33, "former_country", 3024240, 6),
164
+ (34, "ancient_civilization", 28171280, 6),
165
+ (35, "historic_territory", 1620908, 6),
166
+
167
+ # Other/Unknown (simplified_id=7)
168
+ (36, "other", None, 7),
169
+ ]
170
+
171
+ LOCATION_TYPE_NAME_TO_ID: dict[str, int] = {
172
+ name: id_ for id_, name, _, _ in LOCATION_TYPES
173
+ }
174
+ LOCATION_TYPE_ID_TO_NAME: dict[int, str] = {
175
+ id_: name for id_, name, _, _ in LOCATION_TYPES
176
+ }
177
+
178
+ # Mapping from Wikidata QID (P31 value) to location_type_id
179
+ LOCATION_TYPE_QID_TO_ID: dict[int, int] = {
180
+ qid: id_ for id_, name, qid, _ in LOCATION_TYPES if qid is not None
181
+ }
182
+
183
+ # Mapping from location_type_id to simplified_id
184
+ LOCATION_TYPE_TO_SIMPLIFIED: dict[int, int] = {
185
+ id_: simplified_id for id_, _, _, simplified_id in LOCATION_TYPES
186
+ }
187
+
188
+
189
+ # =============================================================================
190
+ # PYCOUNTRY INTEGRATION
191
+ # =============================================================================
192
+
193
def get_pycountry_countries() -> list[dict[str, Any]]:
    """
    Collect every pycountry country as a plain dict for seeding locations.

    Returns:
        List of dicts with keys: name, alpha_2, alpha_3, numeric.
        alpha_3 and numeric are None when the pycountry record lacks them.
    """
    # Imported lazily so pycountry is only needed when this function runs.
    import pycountry

    return [
        {
            "name": entry.name,
            "alpha_2": entry.alpha_2,
            "alpha_3": getattr(entry, "alpha_3", None),
            "numeric": getattr(entry, "numeric", None),
        }
        for entry in pycountry.countries
    ]
211
+
212
+
213
+ # =============================================================================
214
+ # SEED FUNCTIONS
215
+ # =============================================================================
216
+
217
def seed_source_types(conn) -> int:
    """
    Seed source_types table.

    Uses INSERT OR IGNORE, so rows that conflict with existing ones are
    skipped and the call can be repeated safely.

    Args:
        conn: SQLite connection (its executemany() must return a cursor
            exposing rowcount, as sqlite3.Connection does)

    Returns:
        Number of rows actually inserted (0 if everything was already present)
    """
    cursor = conn.executemany(
        "INSERT OR IGNORE INTO source_types (id, name) VALUES (?, ?)",
        SOURCE_TYPES,
    )
    conn.commit()
    # rowcount sums the rows changed by the batch; ignored rows are not
    # counted. Clamp because the DB-API uses -1 for "unknown".
    return max(cursor.rowcount, 0)
233
+
234
+
235
def seed_people_types(conn) -> int:
    """
    Seed people_types table.

    Uses INSERT OR IGNORE, so rows that conflict with existing ones are
    skipped and the call can be repeated safely.

    Args:
        conn: SQLite connection (its executemany() must return a cursor
            exposing rowcount, as sqlite3.Connection does)

    Returns:
        Number of rows actually inserted (0 if everything was already present)
    """
    cursor = conn.executemany(
        "INSERT OR IGNORE INTO people_types (id, name) VALUES (?, ?)",
        PEOPLE_TYPES,
    )
    conn.commit()
    # rowcount sums the rows changed by the batch; ignored rows are not
    # counted. Clamp because the DB-API uses -1 for "unknown".
    return max(cursor.rowcount, 0)
251
+
252
+
253
def seed_organization_types(conn) -> int:
    """
    Seed organization_types table.

    Uses INSERT OR IGNORE, so rows that conflict with existing ones are
    skipped and the call can be repeated safely.

    Args:
        conn: SQLite connection (its executemany() must return a cursor
            exposing rowcount, as sqlite3.Connection does)

    Returns:
        Number of rows actually inserted (0 if everything was already present)
    """
    cursor = conn.executemany(
        "INSERT OR IGNORE INTO organization_types (id, name) VALUES (?, ?)",
        ORGANIZATION_TYPES,
    )
    conn.commit()
    # rowcount sums the rows changed by the batch; ignored rows are not
    # counted. Clamp because the DB-API uses -1 for "unknown".
    return max(cursor.rowcount, 0)
269
+
270
+
271
def seed_simplified_location_types(conn) -> int:
    """
    Seed simplified_location_types table.

    Uses INSERT OR IGNORE, so rows that conflict with existing ones are
    skipped and the call can be repeated safely.

    Args:
        conn: SQLite connection (its executemany() must return a cursor
            exposing rowcount, as sqlite3.Connection does)

    Returns:
        Number of rows actually inserted (0 if everything was already present)
    """
    cursor = conn.executemany(
        "INSERT OR IGNORE INTO simplified_location_types (id, name) VALUES (?, ?)",
        SIMPLIFIED_LOCATION_TYPES,
    )
    conn.commit()
    # rowcount sums the rows changed by the batch; ignored rows are not
    # counted. Clamp because the DB-API uses -1 for "unknown".
    return max(cursor.rowcount, 0)
287
+
288
+
289
def seed_location_types(conn) -> int:
    """
    Seed location_types table with Wikidata QID mappings.

    Uses INSERT OR IGNORE, so rows that conflict with existing ones are
    skipped and the call can be repeated safely.

    Args:
        conn: SQLite connection (its executemany() must return a cursor
            exposing rowcount, as sqlite3.Connection does)

    Returns:
        Number of rows actually inserted (0 if everything was already present)
    """
    cursor = conn.executemany(
        "INSERT OR IGNORE INTO location_types (id, name, qid, simplified_id) VALUES (?, ?, ?, ?)",
        LOCATION_TYPES,
    )
    conn.commit()
    # rowcount sums the rows changed by the batch; ignored rows are not
    # counted. Clamp because the DB-API uses -1 for "unknown".
    return max(cursor.rowcount, 0)
305
+
306
+
307
def seed_all_enums(conn) -> dict[str, int]:
    """
    Run every enum seeder against the given connection.

    Args:
        conn: SQLite connection

    Returns:
        Dict mapping each table name to the count reported by its seeder
    """
    # (table name, seeder) pairs; seeders run in exactly this order.
    seeders = (
        ("source_types", seed_source_types),
        ("people_types", seed_people_types),
        ("organization_types", seed_organization_types),
        ("simplified_location_types", seed_simplified_location_types),
        ("location_types", seed_location_types),
    )
    return {table: seed(conn) for table, seed in seeders}
324
+
325
+
326
def seed_pycountry_locations(conn, source_id: int = 4) -> int:
    """
    Seed locations table with pycountry countries.

    Each country is stored with its name, a lower-cased name, the given
    source ID, its alpha-2 code as the source identifier, and the "country"
    location type. Uses INSERT OR IGNORE, so rows that conflict with existing
    ones are skipped and the call can be repeated safely.

    Args:
        conn: SQLite connection (its executemany() must return a cursor
            exposing rowcount, as sqlite3.Connection does)
        source_id: Source ID to use (default: 4 = wikidata, used for pycountry data)

    Returns:
        Number of locations actually inserted (0 if all were already present)
    """
    # Imported lazily so pycountry is only needed when this function runs.
    import pycountry

    # Get country location_type_id
    country_type_id = LOCATION_TYPE_NAME_TO_ID["country"]

    rows = [
        (
            country.name,
            country.name.lower(),
            source_id,
            country.alpha_2,
            country_type_id,
        )
        for country in pycountry.countries
    ]

    cursor = conn.executemany(
        """
        INSERT OR IGNORE INTO locations
        (name, name_normalized, source_id, source_identifier, location_type_id)
        VALUES (?, ?, ?, ?, ?)
        """,
        rows,
    )
    conn.commit()
    # rowcount sums the rows changed by the batch; ignored rows are not
    # counted. Clamp because the DB-API uses -1 for "unknown".
    return max(cursor.rowcount, 0)