corp-extractor 0.9.0__py3-none-any.whl → 0.9.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +72 -11
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +34 -27
- statement_extractor/cli.py +1317 -101
- statement_extractor/database/embeddings.py +45 -0
- statement_extractor/database/hub.py +86 -136
- statement_extractor/database/importers/__init__.py +10 -2
- statement_extractor/database/importers/companies_house.py +16 -2
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +23 -0
- statement_extractor/database/importers/import_utils.py +264 -0
- statement_extractor/database/importers/sec_edgar.py +17 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +151 -43
- statement_extractor/database/importers/wikidata_dump.py +2282 -0
- statement_extractor/database/importers/wikidata_people.py +867 -325
- statement_extractor/database/migrate_v2.py +852 -0
- statement_extractor/database/models.py +155 -7
- statement_extractor/database/schema_v2.py +409 -0
- statement_extractor/database/seed_data.py +359 -0
- statement_extractor/database/store.py +3449 -233
- statement_extractor/document/deduplicator.py +10 -12
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +3 -2
- statement_extractor/models/statement.py +15 -17
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +12 -12
- statement_extractor/plugins/base.py +17 -17
- statement_extractor/plugins/extractors/gliner2.py +28 -28
- statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
- statement_extractor/plugins/qualifiers/person.py +120 -53
- statement_extractor/plugins/splitters/t5_gemma.py +35 -39
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,852 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Migration script from v1 to v2 normalized schema.
|
|
3
|
+
|
|
4
|
+
Transforms TEXT-based enum storage to INTEGER FK references,
|
|
5
|
+
adds roles and locations tables, and converts QIDs to integers.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
corp-extractor db migrate-v2 entities.db entities-v2.db
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import logging
|
|
13
|
+
import re
|
|
14
|
+
import sqlite3
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Optional
|
|
17
|
+
|
|
18
|
+
import pycountry
|
|
19
|
+
import sqlite_vec
|
|
20
|
+
|
|
21
|
+
from .schema_v2 import create_all_tables
|
|
22
|
+
from .seed_data import (
|
|
23
|
+
LOCATION_TYPE_NAME_TO_ID,
|
|
24
|
+
ORG_TYPE_NAME_TO_ID,
|
|
25
|
+
PEOPLE_TYPE_NAME_TO_ID,
|
|
26
|
+
SOURCE_NAME_TO_ID,
|
|
27
|
+
seed_all_enums,
|
|
28
|
+
seed_pycountry_locations,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def parse_qid(qid_text: Optional[str]) -> Optional[int]:
    """Convert a Wikidata QID string into its integer form.

    Accepts values like "Q12345", "q12345", or a bare "12345";
    surrounding whitespace is ignored.

    Args:
        qid_text: QID string (e.g. "Q12345" or "12345"), or None.

    Returns:
        The numeric part as an int, or None when the input is empty
        or not a valid QID.
    """
    if not qid_text:
        return None

    cleaned = qid_text.strip()

    # Drop at most one leading "Q"/"q" prefix; anything else must be digits.
    if cleaned[:1] in ("Q", "q"):
        cleaned = cleaned[1:]

    try:
        return int(cleaned)
    except ValueError:
        return None
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def normalize_name_for_lookup(name: str) -> str:
    """Lowercase and trim *name* so it can serve as a stable lookup key."""
    return name.lower().strip() if name else ""
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class DatabaseMigrator:
|
|
68
|
+
"""
|
|
69
|
+
Migrates v1 entity database to v2 normalized schema.
|
|
70
|
+
|
|
71
|
+
Handles:
|
|
72
|
+
- Creating v2 schema with enum tables
|
|
73
|
+
- Seeding enum lookup data
|
|
74
|
+
- Importing pycountry countries
|
|
75
|
+
- Migrating organizations with FK resolution
|
|
76
|
+
- Migrating people with FK resolution
|
|
77
|
+
- Converting QIDs from TEXT to INTEGER
|
|
78
|
+
- Preserving embeddings
|
|
79
|
+
"""
|
|
80
|
+
|
|
81
|
+
def __init__(
|
|
82
|
+
self,
|
|
83
|
+
source_path: str | Path,
|
|
84
|
+
target_path: str | Path,
|
|
85
|
+
embedding_dim: int = 768,
|
|
86
|
+
resume: bool = False,
|
|
87
|
+
):
|
|
88
|
+
"""
|
|
89
|
+
Initialize the migrator.
|
|
90
|
+
|
|
91
|
+
Args:
|
|
92
|
+
source_path: Path to v1 database
|
|
93
|
+
target_path: Path for v2 database (will be created)
|
|
94
|
+
embedding_dim: Embedding dimension (default 768)
|
|
95
|
+
resume: If True, resume from last completed step
|
|
96
|
+
"""
|
|
97
|
+
self.source_path = Path(source_path)
|
|
98
|
+
self.target_path = Path(target_path)
|
|
99
|
+
self.embedding_dim = embedding_dim
|
|
100
|
+
self.resume = resume
|
|
101
|
+
|
|
102
|
+
if not self.source_path.exists():
|
|
103
|
+
raise FileNotFoundError(f"Source database not found: {self.source_path}")
|
|
104
|
+
|
|
105
|
+
if self.target_path.exists() and not resume:
|
|
106
|
+
raise FileExistsError(f"Target database already exists: {self.target_path}. Use resume=True to continue.")
|
|
107
|
+
|
|
108
|
+
# Caches for FK lookups during migration
|
|
109
|
+
self._location_cache: dict[str, int] = {} # name_normalized -> location_id
|
|
110
|
+
self._role_cache: dict[str, int] = {} # name_normalized -> role_id
|
|
111
|
+
|
|
112
|
+
    def migrate(self, batch_size: int = 10000) -> dict[str, int]:
        """
        Run the full migration.

        Opens both databases, loads the sqlite-vec extension on each
        (required to read/write the embedding virtual tables), runs all
        migration steps, and always closes both connections.

        Args:
            batch_size: Number of records per batch commit

        Returns:
            Dict with migration statistics
        """
        if self.resume and self.target_path.exists():
            logger.info(f"Resuming migration from {self.source_path} to {self.target_path}")
        else:
            logger.info(f"Starting migration from {self.source_path} to {self.target_path}")

        # Open connections
        source_conn = sqlite3.connect(str(self.source_path))
        source_conn.row_factory = sqlite3.Row

        # Load sqlite-vec for source (needed to read embedding virtual tables)
        # Extension loading is re-disabled immediately after, as a precaution.
        source_conn.enable_load_extension(True)
        sqlite_vec.load(source_conn)
        source_conn.enable_load_extension(False)

        self.target_path.parent.mkdir(parents=True, exist_ok=True)
        target_conn = sqlite3.connect(str(self.target_path))
        target_conn.row_factory = sqlite3.Row

        # Load sqlite-vec for target
        target_conn.enable_load_extension(True)
        sqlite_vec.load(target_conn)
        target_conn.enable_load_extension(False)

        try:
            stats = self._run_migration(source_conn, target_conn, batch_size)
        finally:
            # Close both handles even if a step fails mid-way; resume=True
            # can pick up from the last completed step on the next run.
            source_conn.close()
            target_conn.close()

        logger.info(f"Migration complete: {stats}")
        return stats
|
|
153
|
+
|
|
154
|
+
    def _run_migration(
        self,
        source: sqlite3.Connection,
        target: sqlite3.Connection,
        batch_size: int,
    ) -> dict[str, int]:
        """Run all migration steps.

        Steps are ordered so that each later step only depends on data
        written by earlier ones; when resuming, completed steps are
        skipped but the in-memory ID mappings they would have produced
        are rebuilt from the target database.
        """
        stats: dict[str, int] = {}

        # Determine which step to start from
        start_step = 1
        if self.resume and self.target_path.exists():
            start_step = self._detect_completed_step(target)
            logger.info(f"Resuming from step {start_step}")

        # Step 1: Create v2 schema
        if start_step <= 1:
            logger.info("Step 1: Creating v2 schema...")
            create_all_tables(target, self.embedding_dim)
        else:
            logger.info("Step 1: Skipped (schema already exists)")

        # Step 2: Seed enum tables
        if start_step <= 2:
            logger.info("Step 2: Seeding enum tables...")
            enum_stats = seed_all_enums(target)
            stats.update({f"seed_{k}": v for k, v in enum_stats.items()})
        else:
            logger.info("Step 2: Skipped (enums already seeded)")

        # Step 3: Import pycountry countries into locations
        if start_step <= 3:
            logger.info("Step 3: Importing pycountry countries...")
            stats["locations_pycountry"] = seed_pycountry_locations(target)
        else:
            logger.info("Step 3: Skipped (pycountry already imported)")

        # Build location lookup cache from imported countries
        # (always needed: steps 5/6 resolve regions through this cache)
        self._build_location_cache(target)

        # Step 4: Migrate qid_labels
        if start_step <= 4:
            logger.info("Step 4: Migrating qid_labels...")
            stats["qid_labels"] = self._migrate_qid_labels(source, target)
        else:
            logger.info("Step 4: Skipped (qid_labels already migrated)")

        # Step 5: Migrate organizations
        if start_step <= 5:
            logger.info("Step 5: Migrating organizations...")
            stats["organizations"] = self._migrate_organizations(source, target, batch_size)
        else:
            logger.info("Step 5: Skipped (organizations already migrated)")
            # Rebuild ID mapping for embedding migration
            # (a fresh run stores the mapping inside _migrate_organizations)
            self._rebuild_org_id_mapping(source, target)

        # Step 6: Migrate people
        if start_step <= 6:
            logger.info("Step 6: Migrating people...")
            stats["people"] = self._migrate_people(source, target, batch_size)
        else:
            logger.info("Step 6: Skipped (people already migrated)")
            # Rebuild ID mapping for embedding migration
            self._rebuild_person_id_mapping(source, target)

        # Step 7: Migrate organization embeddings
        if start_step <= 7:
            logger.info("Step 7: Migrating organization embeddings...")
            stats["org_embeddings"] = self._migrate_org_embeddings(source, target, batch_size)
        else:
            logger.info("Step 7: Skipped (organization embeddings already migrated)")

        # Step 8: Migrate person embeddings
        if start_step <= 8:
            logger.info("Step 8: Migrating person embeddings...")
            stats["person_embeddings"] = self._migrate_person_embeddings(source, target, batch_size)
        else:
            logger.info("Step 8: Skipped (person embeddings already migrated)")

        # Vacuum to optimize
        logger.info("Step 9: Optimizing database...")
        target.execute("VACUUM")

        return stats
|
|
238
|
+
|
|
239
|
+
    def _detect_completed_step(self, target: sqlite3.Connection) -> int:
        """
        Detect the first incomplete migration step.

        Probes the target database from the latest artifacts backwards
        (embeddings -> entities -> qid_labels -> locations -> enums ->
        schema). Missing tables raise OperationalError, which is treated
        the same as "step not done yet". NOTE: a step that was
        interrupted mid-way still counts as done if it committed any
        rows — resume granularity is whole steps.

        Returns:
            Step number to resume from (1-9)
        """
        # Check if organization_embeddings has data
        try:
            cursor = target.execute("SELECT COUNT(*) FROM organization_embeddings")
            if cursor.fetchone()[0] > 0:
                # Check person embeddings
                cursor = target.execute("SELECT COUNT(*) FROM person_embeddings")
                if cursor.fetchone()[0] > 0:
                    return 9  # All done, just vacuum
                return 8  # Person embeddings pending
            # Org embeddings empty, check if organizations exist
            cursor = target.execute("SELECT COUNT(*) FROM organizations")
            if cursor.fetchone()[0] > 0:
                return 7  # Org embeddings pending
        except sqlite3.OperationalError:
            pass  # table missing: fall through to earlier-step probes

        # Check if organizations table has data
        try:
            cursor = target.execute("SELECT COUNT(*) FROM organizations")
            if cursor.fetchone()[0] > 0:
                # Check if people exist
                cursor = target.execute("SELECT COUNT(*) FROM people")
                if cursor.fetchone()[0] > 0:
                    return 7  # Ready for embeddings
                return 6  # People pending
        except sqlite3.OperationalError:
            pass

        # Check if qid_labels has data
        try:
            cursor = target.execute("SELECT COUNT(*) FROM qid_labels")
            if cursor.fetchone()[0] > 0:
                return 5  # Organizations pending
        except sqlite3.OperationalError:
            pass

        # Check if locations has data
        try:
            cursor = target.execute("SELECT COUNT(*) FROM locations")
            if cursor.fetchone()[0] > 0:
                return 4  # qid_labels pending
        except sqlite3.OperationalError:
            pass

        # Check if source_types has data
        try:
            cursor = target.execute("SELECT COUNT(*) FROM source_types")
            if cursor.fetchone()[0] > 0:
                return 3  # pycountry import pending
        except sqlite3.OperationalError:
            pass

        # Check if organizations table exists at all
        try:
            target.execute("SELECT 1 FROM organizations LIMIT 1")
            return 2  # Schema exists, enum seeding pending
        except sqlite3.OperationalError:
            pass

        return 1  # Start from beginning
|
|
306
|
+
|
|
307
|
+
def _rebuild_org_id_mapping(
|
|
308
|
+
self,
|
|
309
|
+
source: sqlite3.Connection,
|
|
310
|
+
target: sqlite3.Connection,
|
|
311
|
+
) -> None:
|
|
312
|
+
"""Rebuild organization ID mapping for embedding migration when resuming."""
|
|
313
|
+
logger.info("Rebuilding organization ID mapping...")
|
|
314
|
+
|
|
315
|
+
self._org_id_mapping = {}
|
|
316
|
+
|
|
317
|
+
# Get all source organizations with their IDs and source_ids
|
|
318
|
+
source_cursor = source.execute(
|
|
319
|
+
"SELECT id, source_id FROM organizations"
|
|
320
|
+
)
|
|
321
|
+
|
|
322
|
+
for row in source_cursor:
|
|
323
|
+
old_id = row["id"]
|
|
324
|
+
source_identifier = row["source_id"]
|
|
325
|
+
|
|
326
|
+
if source_identifier:
|
|
327
|
+
# Look up in target by source_identifier
|
|
328
|
+
target_cursor = target.execute(
|
|
329
|
+
"SELECT id FROM organizations WHERE source_identifier = ?",
|
|
330
|
+
(source_identifier,)
|
|
331
|
+
)
|
|
332
|
+
target_row = target_cursor.fetchone()
|
|
333
|
+
if target_row:
|
|
334
|
+
self._org_id_mapping[old_id] = target_row["id"]
|
|
335
|
+
|
|
336
|
+
logger.info(f"Rebuilt mapping for {len(self._org_id_mapping)} organizations")
|
|
337
|
+
|
|
338
|
+
def _rebuild_person_id_mapping(
|
|
339
|
+
self,
|
|
340
|
+
source: sqlite3.Connection,
|
|
341
|
+
target: sqlite3.Connection,
|
|
342
|
+
) -> None:
|
|
343
|
+
"""Rebuild person ID mapping for embedding migration when resuming."""
|
|
344
|
+
logger.info("Rebuilding person ID mapping...")
|
|
345
|
+
|
|
346
|
+
self._person_id_mapping = {}
|
|
347
|
+
|
|
348
|
+
# Get all source people with their IDs and source_ids
|
|
349
|
+
source_cursor = source.execute(
|
|
350
|
+
"SELECT id, source_id FROM people"
|
|
351
|
+
)
|
|
352
|
+
|
|
353
|
+
for row in source_cursor:
|
|
354
|
+
old_id = row["id"]
|
|
355
|
+
source_identifier = row["source_id"]
|
|
356
|
+
|
|
357
|
+
if source_identifier:
|
|
358
|
+
# Look up in target by source_identifier
|
|
359
|
+
target_cursor = target.execute(
|
|
360
|
+
"SELECT id FROM people WHERE source_identifier = ?",
|
|
361
|
+
(source_identifier,)
|
|
362
|
+
)
|
|
363
|
+
target_row = target_cursor.fetchone()
|
|
364
|
+
if target_row:
|
|
365
|
+
self._person_id_mapping[old_id] = target_row["id"]
|
|
366
|
+
|
|
367
|
+
logger.info(f"Rebuilt mapping for {len(self._person_id_mapping)} people")
|
|
368
|
+
|
|
369
|
+
def _build_location_cache(self, conn: sqlite3.Connection) -> None:
|
|
370
|
+
"""Build location lookup cache from existing locations."""
|
|
371
|
+
cursor = conn.execute("SELECT id, name_normalized, source_identifier FROM locations")
|
|
372
|
+
for row in cursor:
|
|
373
|
+
# Cache by normalized name
|
|
374
|
+
self._location_cache[row["name_normalized"]] = row["id"]
|
|
375
|
+
# Also cache by source_identifier (e.g., "US", "GB")
|
|
376
|
+
if row["source_identifier"]:
|
|
377
|
+
self._location_cache[row["source_identifier"].lower()] = row["id"]
|
|
378
|
+
|
|
379
|
+
def _resolve_region_to_location(
|
|
380
|
+
self,
|
|
381
|
+
conn: sqlite3.Connection,
|
|
382
|
+
region: str,
|
|
383
|
+
) -> Optional[int]:
|
|
384
|
+
"""
|
|
385
|
+
Resolve a region string to a location ID.
|
|
386
|
+
|
|
387
|
+
Args:
|
|
388
|
+
conn: Target database connection
|
|
389
|
+
region: Region string (country code, name, or QID)
|
|
390
|
+
|
|
391
|
+
Returns:
|
|
392
|
+
Location ID or None if not resolved
|
|
393
|
+
"""
|
|
394
|
+
if not region:
|
|
395
|
+
return None
|
|
396
|
+
|
|
397
|
+
# Check cache first
|
|
398
|
+
region_lower = region.lower().strip()
|
|
399
|
+
if region_lower in self._location_cache:
|
|
400
|
+
return self._location_cache[region_lower]
|
|
401
|
+
|
|
402
|
+
# Try to resolve via pycountry
|
|
403
|
+
location_id = self._resolve_via_pycountry(conn, region)
|
|
404
|
+
if location_id:
|
|
405
|
+
self._location_cache[region_lower] = location_id
|
|
406
|
+
return location_id
|
|
407
|
+
|
|
408
|
+
# If it's a QID, try to look up or create
|
|
409
|
+
if region.startswith("Q") and region[1:].isdigit():
|
|
410
|
+
# We can't resolve QIDs to locations without more data
|
|
411
|
+
# Return None for now
|
|
412
|
+
return None
|
|
413
|
+
|
|
414
|
+
return None
|
|
415
|
+
|
|
416
|
+
def _resolve_via_pycountry(
|
|
417
|
+
self,
|
|
418
|
+
conn: sqlite3.Connection,
|
|
419
|
+
region: str,
|
|
420
|
+
) -> Optional[int]:
|
|
421
|
+
"""Try to resolve region via pycountry."""
|
|
422
|
+
region_clean = region.strip()
|
|
423
|
+
if not region_clean:
|
|
424
|
+
return None
|
|
425
|
+
|
|
426
|
+
alpha_2 = None
|
|
427
|
+
|
|
428
|
+
# Try as 2-letter code
|
|
429
|
+
if len(region_clean) == 2:
|
|
430
|
+
country = pycountry.countries.get(alpha_2=region_clean.upper())
|
|
431
|
+
if country:
|
|
432
|
+
alpha_2 = country.alpha_2
|
|
433
|
+
|
|
434
|
+
# Try as 3-letter code
|
|
435
|
+
if not alpha_2 and len(region_clean) == 3:
|
|
436
|
+
country = pycountry.countries.get(alpha_3=region_clean.upper())
|
|
437
|
+
if country:
|
|
438
|
+
alpha_2 = country.alpha_2
|
|
439
|
+
|
|
440
|
+
# Try fuzzy search
|
|
441
|
+
if not alpha_2:
|
|
442
|
+
try:
|
|
443
|
+
matches = pycountry.countries.search_fuzzy(region_clean)
|
|
444
|
+
if matches:
|
|
445
|
+
alpha_2 = matches[0].alpha_2
|
|
446
|
+
except LookupError:
|
|
447
|
+
pass
|
|
448
|
+
|
|
449
|
+
# Look up by alpha_2 in cache
|
|
450
|
+
if alpha_2:
|
|
451
|
+
return self._location_cache.get(alpha_2.lower())
|
|
452
|
+
|
|
453
|
+
return None
|
|
454
|
+
|
|
455
|
+
def _get_or_create_role(
|
|
456
|
+
self,
|
|
457
|
+
conn: sqlite3.Connection,
|
|
458
|
+
role_name: str,
|
|
459
|
+
source_id: int = 4, # wikidata
|
|
460
|
+
) -> int:
|
|
461
|
+
"""
|
|
462
|
+
Get or create a role record.
|
|
463
|
+
|
|
464
|
+
Args:
|
|
465
|
+
conn: Target database connection
|
|
466
|
+
role_name: Role/title name
|
|
467
|
+
source_id: Source type ID (default: wikidata)
|
|
468
|
+
|
|
469
|
+
Returns:
|
|
470
|
+
Role ID
|
|
471
|
+
"""
|
|
472
|
+
if not role_name:
|
|
473
|
+
raise ValueError("Role name cannot be empty")
|
|
474
|
+
|
|
475
|
+
name_normalized = normalize_name_for_lookup(role_name)
|
|
476
|
+
|
|
477
|
+
# Check cache
|
|
478
|
+
if name_normalized in self._role_cache:
|
|
479
|
+
return self._role_cache[name_normalized]
|
|
480
|
+
|
|
481
|
+
# Check database
|
|
482
|
+
cursor = conn.execute(
|
|
483
|
+
"SELECT id FROM roles WHERE name_normalized = ? AND source_id = ?",
|
|
484
|
+
(name_normalized, source_id)
|
|
485
|
+
)
|
|
486
|
+
row = cursor.fetchone()
|
|
487
|
+
if row:
|
|
488
|
+
self._role_cache[name_normalized] = row["id"]
|
|
489
|
+
return row["id"]
|
|
490
|
+
|
|
491
|
+
# Create new role
|
|
492
|
+
cursor = conn.execute(
|
|
493
|
+
"""
|
|
494
|
+
INSERT INTO roles (name, name_normalized, source_id, record)
|
|
495
|
+
VALUES (?, ?, ?, '{}')
|
|
496
|
+
""",
|
|
497
|
+
(role_name, name_normalized, source_id)
|
|
498
|
+
)
|
|
499
|
+
role_id = cursor.lastrowid
|
|
500
|
+
assert role_id is not None
|
|
501
|
+
conn.commit()
|
|
502
|
+
|
|
503
|
+
self._role_cache[name_normalized] = role_id
|
|
504
|
+
return role_id
|
|
505
|
+
|
|
506
|
+
def _migrate_qid_labels(
|
|
507
|
+
self,
|
|
508
|
+
source: sqlite3.Connection,
|
|
509
|
+
target: sqlite3.Connection,
|
|
510
|
+
) -> int:
|
|
511
|
+
"""Migrate qid_labels table, converting TEXT QIDs to INTEGER."""
|
|
512
|
+
# Check if source has qid_labels table
|
|
513
|
+
cursor = source.execute(
|
|
514
|
+
"SELECT name FROM sqlite_master WHERE type='table' AND name='qid_labels'"
|
|
515
|
+
)
|
|
516
|
+
if not cursor.fetchone():
|
|
517
|
+
logger.info("No qid_labels table in source, skipping")
|
|
518
|
+
return 0
|
|
519
|
+
|
|
520
|
+
cursor = source.execute("SELECT qid, label FROM qid_labels")
|
|
521
|
+
count = 0
|
|
522
|
+
|
|
523
|
+
for row in cursor:
|
|
524
|
+
qid_int = parse_qid(row["qid"])
|
|
525
|
+
if qid_int is not None:
|
|
526
|
+
target.execute(
|
|
527
|
+
"INSERT OR IGNORE INTO qid_labels (qid, label) VALUES (?, ?)",
|
|
528
|
+
(qid_int, row["label"])
|
|
529
|
+
)
|
|
530
|
+
count += 1
|
|
531
|
+
|
|
532
|
+
target.commit()
|
|
533
|
+
logger.info(f"Migrated {count} QID labels")
|
|
534
|
+
return count
|
|
535
|
+
|
|
536
|
+
    def _migrate_organizations(
        self,
        source: sqlite3.Connection,
        target: sqlite3.Connection,
        batch_size: int,
    ) -> int:
        """Migrate organizations table with FK conversion.

        Converts the v1 TEXT `source`/`entity_type` columns to integer
        FKs, resolves `region` strings to location IDs, extracts integer
        QIDs from "Q..." source identifiers, and records an
        old-id -> new-id mapping for the embedding migration step.
        canon_id/canon_size are deliberately reset for fresh
        canonicalization in v2.
        """
        # Check source schema
        cursor = source.execute("PRAGMA table_info(organizations)")
        columns = {row["name"] for row in cursor}

        # v2 renamed `source` -> `source_id`; if that rename is present the
        # source database has already been migrated.
        if "source_id" in columns and "source" not in columns:
            logger.info("Source appears to already be v2 schema")
            return 0

        cursor = source.execute("""
            SELECT id, name, name_normalized, source, source_id, region,
                   entity_type, from_date, to_date, record, canon_id, canon_size
            FROM organizations
        """)

        count = 0
        id_mapping: dict[int, int] = {}  # old_id -> new_id

        for row in cursor:
            # Convert source to source_id FK
            source_name = row["source"]
            # Map "wikipedia" to "wikidata"
            if source_name == "wikipedia":
                source_name = "wikidata"
            source_type_id = SOURCE_NAME_TO_ID.get(source_name, 4)  # default to wikidata

            # Convert entity_type to entity_type_id FK
            entity_type_name = row["entity_type"] or "unknown"
            entity_type_id = ORG_TYPE_NAME_TO_ID.get(entity_type_name, 17)  # default to unknown

            # Resolve region to location_id
            region_id = self._resolve_region_to_location(target, row["region"] or "")

            # Extract QID from source_id if it's a Q code
            qid = None
            old_source_id = row["source_id"]
            if old_source_id and old_source_id.startswith("Q"):
                qid = parse_qid(old_source_id)

            # Insert into target (use OR IGNORE to handle region normalization duplicates)
            cursor2 = target.execute(
                """
                INSERT OR IGNORE INTO organizations
                (qid, name, name_normalized, source_id, source_identifier, region_id,
                 entity_type_id, from_date, to_date, record, canon_id, canon_size)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    qid,
                    row["name"],
                    row["name_normalized"],
                    source_type_id,
                    old_source_id,  # Keep original source_id as source_identifier
                    region_id,
                    entity_type_id,
                    row["from_date"] or None,
                    row["to_date"] or None,
                    row["record"],
                    None,  # Reset canon_id for fresh canonicalization
                    1,  # Reset canon_size
                )
            )

            # rowcount > 0 distinguishes a real insert from an OR IGNORE
            # no-op (lastrowid alone can be stale after an ignored insert).
            new_id = cursor2.lastrowid
            if new_id and cursor2.rowcount > 0:
                id_mapping[row["id"]] = new_id
                count += 1
            else:
                # Duplicate - look up the existing record's ID for embedding mapping
                existing = target.execute(
                    "SELECT id FROM organizations WHERE source_identifier = ? AND source_id = ?",
                    (old_source_id, source_type_id)
                ).fetchone()
                if existing:
                    id_mapping[row["id"]] = existing["id"]

            # Periodic commit; note count only advances on real inserts.
            if count % batch_size == 0:
                target.commit()
                logger.info(f"  Migrated {count} organizations...")

        target.commit()
        logger.info(f"Migrated {count} organizations")

        # Store ID mapping for embedding migration
        self._org_id_mapping = id_mapping
        return count
|
|
628
|
+
|
|
629
|
+
    def _migrate_people(
        self,
        source: sqlite3.Connection,
        target: sqlite3.Connection,
        batch_size: int,
    ) -> int:
        """Migrate people table with FK conversion.

        Mirrors _migrate_organizations: TEXT enums become integer FKs,
        `country` resolves to a location ID, `known_for_role` becomes a
        roles-table FK (created on demand), and `known_for_org_id` is
        re-keyed through the organization ID mapping built in step 5.
        """
        # Check if source has people table
        cursor = source.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='people'"
        )
        if not cursor.fetchone():
            logger.info("No people table in source, skipping")
            return 0

        # Check source schema
        cursor = source.execute("PRAGMA table_info(people)")
        columns = {row["name"] for row in cursor}

        if "source_id" in columns and "source" not in columns:
            logger.info("People table appears to already be v2 schema")
            return 0

        cursor = source.execute("""
            SELECT id, name, name_normalized, source, source_id, country,
                   person_type, known_for_role, known_for_org, known_for_org_id,
                   from_date, to_date, birth_date, death_date, record,
                   canon_id, canon_size
            FROM people
        """)

        count = 0
        id_mapping: dict[int, int] = {}  # old_id -> new_id

        for row in cursor:
            # Convert source to source_id FK
            source_name = row["source"] or "wikidata"
            source_type_id = SOURCE_NAME_TO_ID.get(source_name, 4)

            # Convert person_type to person_type_id FK
            person_type_name = row["person_type"] or "unknown"
            person_type_id = PEOPLE_TYPE_NAME_TO_ID.get(person_type_name, 15)

            # Resolve country to location_id
            country_id = self._resolve_region_to_location(target, row["country"] or "")

            # Get or create role_id for known_for_role
            role_id = None
            if row["known_for_role"]:
                role_id = self._get_or_create_role(target, row["known_for_role"], source_type_id)

            # Map known_for_org_id to new org ID
            # (mapping exists when step 5 ran or was rebuilt on resume)
            old_org_id = row["known_for_org_id"]
            new_org_id = None
            if old_org_id and hasattr(self, "_org_id_mapping"):
                new_org_id = self._org_id_mapping.get(old_org_id)

            # Extract QID from source_id if it's a Q code
            qid = None
            old_source_id = row["source_id"]
            if old_source_id and old_source_id.startswith("Q"):
                qid = parse_qid(old_source_id)

            # Insert into target (use OR IGNORE to handle duplicates from normalization)
            cursor2 = target.execute(
                """
                INSERT OR IGNORE INTO people
                (qid, name, name_normalized, source_id, source_identifier, country_id,
                 person_type_id, known_for_role_id, known_for_org, known_for_org_id,
                 from_date, to_date, birth_date, death_date, record, canon_id, canon_size)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    qid,
                    row["name"],
                    row["name_normalized"],
                    source_type_id,
                    old_source_id,
                    country_id,
                    person_type_id,
                    role_id,
                    row["known_for_org"] or "",
                    new_org_id,
                    row["from_date"] or None,
                    row["to_date"] or None,
                    row["birth_date"] or None,
                    row["death_date"] or None,
                    row["record"],
                    None,  # Reset canon_id
                    1,  # Reset canon_size
                )
            )

            new_id = cursor2.lastrowid
            if new_id and cursor2.rowcount > 0:
                id_mapping[row["id"]] = new_id
                count += 1
            else:
                # Duplicate - look up existing record for embedding mapping
                # (IS comparisons because role/org FKs may be NULL)
                existing = target.execute(
                    """SELECT id FROM people
                       WHERE source_identifier = ? AND source_id = ?
                       AND known_for_role_id IS ? AND known_for_org_id IS ?""",
                    (old_source_id, source_type_id, role_id, new_org_id)
                ).fetchone()
                if existing:
                    id_mapping[row["id"]] = existing["id"]

            if count % batch_size == 0:
                target.commit()
                logger.info(f"  Migrated {count} people...")

        target.commit()
        logger.info(f"Migrated {count} people")

        # Store ID mapping for embedding migration
        self._person_id_mapping = id_mapping
        return count
|
|
747
|
+
|
|
748
|
+
def _migrate_org_embeddings(
|
|
749
|
+
self,
|
|
750
|
+
source: sqlite3.Connection,
|
|
751
|
+
target: sqlite3.Connection,
|
|
752
|
+
batch_size: int,
|
|
753
|
+
) -> int:
|
|
754
|
+
"""Migrate organization embeddings using ID mapping."""
|
|
755
|
+
# Check if source has embeddings table
|
|
756
|
+
cursor = source.execute(
|
|
757
|
+
"SELECT name FROM sqlite_master WHERE type='table' AND name='organization_embeddings'"
|
|
758
|
+
)
|
|
759
|
+
if not cursor.fetchone():
|
|
760
|
+
logger.info("No organization_embeddings table in source, skipping")
|
|
761
|
+
return 0
|
|
762
|
+
|
|
763
|
+
if not hasattr(self, "_org_id_mapping"):
|
|
764
|
+
logger.warning("No org ID mapping available, skipping embedding migration")
|
|
765
|
+
return 0
|
|
766
|
+
|
|
767
|
+
cursor = source.execute("SELECT org_id, embedding FROM organization_embeddings")
|
|
768
|
+
count = 0
|
|
769
|
+
|
|
770
|
+
for row in cursor:
|
|
771
|
+
old_id = row["org_id"]
|
|
772
|
+
new_id = self._org_id_mapping.get(old_id)
|
|
773
|
+
|
|
774
|
+
if new_id is not None:
|
|
775
|
+
target.execute(
|
|
776
|
+
"INSERT OR REPLACE INTO organization_embeddings (org_id, embedding) VALUES (?, ?)",
|
|
777
|
+
(new_id, row["embedding"])
|
|
778
|
+
)
|
|
779
|
+
count += 1
|
|
780
|
+
|
|
781
|
+
if count % batch_size == 0:
|
|
782
|
+
target.commit()
|
|
783
|
+
logger.info(f" Migrated {count} organization embeddings...")
|
|
784
|
+
|
|
785
|
+
target.commit()
|
|
786
|
+
logger.info(f"Migrated {count} organization embeddings")
|
|
787
|
+
return count
|
|
788
|
+
|
|
789
|
+
def _migrate_person_embeddings(
|
|
790
|
+
self,
|
|
791
|
+
source: sqlite3.Connection,
|
|
792
|
+
target: sqlite3.Connection,
|
|
793
|
+
batch_size: int,
|
|
794
|
+
) -> int:
|
|
795
|
+
"""Migrate person embeddings using ID mapping."""
|
|
796
|
+
# Check if source has embeddings table
|
|
797
|
+
cursor = source.execute(
|
|
798
|
+
"SELECT name FROM sqlite_master WHERE type='table' AND name='person_embeddings'"
|
|
799
|
+
)
|
|
800
|
+
if not cursor.fetchone():
|
|
801
|
+
logger.info("No person_embeddings table in source, skipping")
|
|
802
|
+
return 0
|
|
803
|
+
|
|
804
|
+
if not hasattr(self, "_person_id_mapping"):
|
|
805
|
+
logger.warning("No person ID mapping available, skipping embedding migration")
|
|
806
|
+
return 0
|
|
807
|
+
|
|
808
|
+
cursor = source.execute("SELECT person_id, embedding FROM person_embeddings")
|
|
809
|
+
count = 0
|
|
810
|
+
|
|
811
|
+
for row in cursor:
|
|
812
|
+
old_id = row["person_id"]
|
|
813
|
+
new_id = self._person_id_mapping.get(old_id)
|
|
814
|
+
|
|
815
|
+
if new_id is not None:
|
|
816
|
+
target.execute(
|
|
817
|
+
"INSERT OR REPLACE INTO person_embeddings (person_id, embedding) VALUES (?, ?)",
|
|
818
|
+
(new_id, row["embedding"])
|
|
819
|
+
)
|
|
820
|
+
count += 1
|
|
821
|
+
|
|
822
|
+
if count % batch_size == 0:
|
|
823
|
+
target.commit()
|
|
824
|
+
logger.info(f" Migrated {count} person embeddings...")
|
|
825
|
+
|
|
826
|
+
target.commit()
|
|
827
|
+
logger.info(f"Migrated {count} person embeddings")
|
|
828
|
+
return count
|
|
829
|
+
|
|
830
|
+
|
|
831
|
+
def migrate_database(
    source_path: str | Path,
    target_path: str | Path,
    embedding_dim: int = 768,
    batch_size: int = 10000,
    resume: bool = False,
) -> dict[str, int]:
    """
    Migrate a v1 database to v2 normalized schema.

    Thin convenience wrapper around DatabaseMigrator.

    Args:
        source_path: Path to v1 database
        target_path: Path for v2 database (will be created)
        embedding_dim: Embedding dimension
        batch_size: Batch size for commits
        resume: If True, resume from last completed step

    Returns:
        Migration statistics
    """
    migrator = DatabaseMigrator(
        source_path,
        target_path,
        embedding_dim,
        resume=resume,
    )
    return migrator.migrate(batch_size)
|