corp-extractor 0.9.3__py3-none-any.whl → 0.9.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,852 @@
1
+ """
2
+ Migration script from v1 to v2 normalized schema.
3
+
4
+ Transforms TEXT-based enum storage to INTEGER FK references,
5
+ adds roles and locations tables, and converts QIDs to integers.
6
+
7
+ Usage:
8
+ corp-extractor db migrate-v2 entities.db entities-v2.db
9
+ """
10
+
11
+ import json
12
+ import logging
13
+ import re
14
+ import sqlite3
15
+ from pathlib import Path
16
+ from typing import Optional
17
+
18
+ import pycountry
19
+ import sqlite_vec
20
+
21
+ from .schema_v2 import create_all_tables
22
+ from .seed_data import (
23
+ LOCATION_TYPE_NAME_TO_ID,
24
+ ORG_TYPE_NAME_TO_ID,
25
+ PEOPLE_TYPE_NAME_TO_ID,
26
+ SOURCE_NAME_TO_ID,
27
+ seed_all_enums,
28
+ seed_pycountry_locations,
29
+ )
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
def parse_qid(qid_text: Optional[str]) -> Optional[int]:
    """
    Parse a QID string to an integer.

    Accepts values like "Q12345", "q12345", or "12345"; surrounding
    whitespace is ignored.

    Args:
        qid_text: QID string, possibly None or empty.

    Returns:
        Integer QID, or None when the input is missing or unparseable.
    """
    if not qid_text:
        return None

    candidate = qid_text.strip()

    # Drop a leading Q/q marker ("Q12345" -> "12345").
    if candidate[:1] in ("Q", "q"):
        candidate = candidate[1:]

    try:
        return int(candidate)
    except ValueError:
        return None
58
+
59
+
60
def normalize_name_for_lookup(name: str) -> str:
    """Lowercase and trim a name so lookups are case/whitespace insensitive."""
    return name.lower().strip() if name else ""
65
+
66
+
67
class DatabaseMigrator:
    """
    Migrates v1 entity database to v2 normalized schema.

    Handles:
    - Creating v2 schema with enum tables
    - Seeding enum lookup data
    - Importing pycountry countries
    - Migrating organizations with FK resolution
    - Migrating people with FK resolution
    - Converting QIDs from TEXT to INTEGER
    - Preserving embeddings
    """

    def __init__(
        self,
        source_path: str | Path,
        target_path: str | Path,
        embedding_dim: int = 768,
        resume: bool = False,
    ):
        """
        Initialize the migrator.

        Args:
            source_path: Path to v1 database
            target_path: Path for v2 database (will be created)
            embedding_dim: Embedding dimension (default 768)
            resume: If True, resume from last completed step

        Raises:
            FileNotFoundError: If the source database does not exist.
            FileExistsError: If the target exists and resume is False.
        """
        self.source_path = Path(source_path)
        self.target_path = Path(target_path)
        self.embedding_dim = embedding_dim
        self.resume = resume

        # Fail fast before any connection is opened.
        if not self.source_path.exists():
            raise FileNotFoundError(f"Source database not found: {self.source_path}")

        # Refuse to clobber an existing target unless the caller explicitly resumes.
        if self.target_path.exists() and not resume:
            raise FileExistsError(f"Target database already exists: {self.target_path}. Use resume=True to continue.")

        # Caches for FK lookups during migration
        self._location_cache: dict[str, int] = {}  # name_normalized -> location_id
        self._role_cache: dict[str, int] = {}  # name_normalized -> role_id
111
+
112
    def migrate(self, batch_size: int = 10000) -> dict[str, int]:
        """
        Run the full migration.

        Opens both databases, loads the sqlite-vec extension on each (the
        embedding tables are sqlite-vec virtual tables), and delegates to
        _run_migration.

        Args:
            batch_size: Number of records per batch commit

        Returns:
            Dict with migration statistics
        """
        if self.resume and self.target_path.exists():
            logger.info(f"Resuming migration from {self.source_path} to {self.target_path}")
        else:
            logger.info(f"Starting migration from {self.source_path} to {self.target_path}")

        # Open connections
        source_conn = sqlite3.connect(str(self.source_path))
        source_conn.row_factory = sqlite3.Row

        # Load sqlite-vec for source (needed to read embedding virtual tables).
        # Extension loading is re-disabled immediately after as a safety measure.
        source_conn.enable_load_extension(True)
        sqlite_vec.load(source_conn)
        source_conn.enable_load_extension(False)

        # Ensure the target directory exists before connecting (connect would fail otherwise).
        self.target_path.parent.mkdir(parents=True, exist_ok=True)
        target_conn = sqlite3.connect(str(self.target_path))
        target_conn.row_factory = sqlite3.Row

        # Load sqlite-vec for target
        target_conn.enable_load_extension(True)
        sqlite_vec.load(target_conn)
        target_conn.enable_load_extension(False)

        try:
            stats = self._run_migration(source_conn, target_conn, batch_size)
        finally:
            # Always release both connections, even if a step fails mid-migration.
            source_conn.close()
            target_conn.close()

        logger.info(f"Migration complete: {stats}")
        return stats
153
+
154
    def _run_migration(
        self,
        source: sqlite3.Connection,
        target: sqlite3.Connection,
        batch_size: int,
    ) -> dict[str, int]:
        """
        Run all migration steps in order.

        Steps 1-8 are skippable when resuming; step 9 (VACUUM) always runs.
        When a data-migration step is skipped, the corresponding old->new ID
        mapping is rebuilt so the embedding steps can still remap rows.

        Args:
            source: Open connection to the v1 database.
            target: Open connection to the v2 database.
            batch_size: Records per batch commit, passed through to sub-steps.

        Returns:
            Per-step record counts keyed by step name.
        """
        stats: dict[str, int] = {}

        # Determine which step to start from
        start_step = 1
        if self.resume and self.target_path.exists():
            start_step = self._detect_completed_step(target)
            logger.info(f"Resuming from step {start_step}")

        # Step 1: Create v2 schema
        if start_step <= 1:
            logger.info("Step 1: Creating v2 schema...")
            create_all_tables(target, self.embedding_dim)
        else:
            logger.info("Step 1: Skipped (schema already exists)")

        # Step 2: Seed enum tables
        if start_step <= 2:
            logger.info("Step 2: Seeding enum tables...")
            enum_stats = seed_all_enums(target)
            stats.update({f"seed_{k}": v for k, v in enum_stats.items()})
        else:
            logger.info("Step 2: Skipped (enums already seeded)")

        # Step 3: Import pycountry countries into locations
        if start_step <= 3:
            logger.info("Step 3: Importing pycountry countries...")
            stats["locations_pycountry"] = seed_pycountry_locations(target)
        else:
            logger.info("Step 3: Skipped (pycountry already imported)")

        # Build location lookup cache from imported countries.
        # Runs unconditionally: steps 5/6 need it regardless of resume point.
        self._build_location_cache(target)

        # Step 4: Migrate qid_labels
        if start_step <= 4:
            logger.info("Step 4: Migrating qid_labels...")
            stats["qid_labels"] = self._migrate_qid_labels(source, target)
        else:
            logger.info("Step 4: Skipped (qid_labels already migrated)")

        # Step 5: Migrate organizations
        if start_step <= 5:
            logger.info("Step 5: Migrating organizations...")
            stats["organizations"] = self._migrate_organizations(source, target, batch_size)
        else:
            logger.info("Step 5: Skipped (organizations already migrated)")
            # Rebuild ID mapping for embedding migration
            self._rebuild_org_id_mapping(source, target)

        # Step 6: Migrate people
        if start_step <= 6:
            logger.info("Step 6: Migrating people...")
            stats["people"] = self._migrate_people(source, target, batch_size)
        else:
            logger.info("Step 6: Skipped (people already migrated)")
            # Rebuild ID mapping for embedding migration
            self._rebuild_person_id_mapping(source, target)

        # Step 7: Migrate organization embeddings
        if start_step <= 7:
            logger.info("Step 7: Migrating organization embeddings...")
            stats["org_embeddings"] = self._migrate_org_embeddings(source, target, batch_size)
        else:
            logger.info("Step 7: Skipped (organization embeddings already migrated)")

        # Step 8: Migrate person embeddings
        if start_step <= 8:
            logger.info("Step 8: Migrating person embeddings...")
            stats["person_embeddings"] = self._migrate_person_embeddings(source, target, batch_size)
        else:
            logger.info("Step 8: Skipped (person embeddings already migrated)")

        # Vacuum to optimize
        logger.info("Step 9: Optimizing database...")
        target.execute("VACUUM")

        return stats
238
+
239
    def _detect_completed_step(self, target: sqlite3.Connection) -> int:
        """
        Detect the first incomplete migration step.

        Probes the target database from the latest artifacts (embeddings)
        back to the earliest (schema existence). Each probe is wrapped in
        try/except because the table may not exist yet on a fresh target —
        an OperationalError simply means "fall through to an earlier step".

        Returns:
            Step number to resume from (1-9)
        """
        # Check if organization_embeddings has data
        try:
            cursor = target.execute("SELECT COUNT(*) FROM organization_embeddings")
            if cursor.fetchone()[0] > 0:
                # Check person embeddings
                cursor = target.execute("SELECT COUNT(*) FROM person_embeddings")
                if cursor.fetchone()[0] > 0:
                    return 9  # All done, just vacuum
                return 8  # Person embeddings pending
            # Org embeddings empty, check if organizations exist
            cursor = target.execute("SELECT COUNT(*) FROM organizations")
            if cursor.fetchone()[0] > 0:
                return 7  # Org embeddings pending
        except sqlite3.OperationalError:
            pass

        # Check if organizations table has data
        try:
            cursor = target.execute("SELECT COUNT(*) FROM organizations")
            if cursor.fetchone()[0] > 0:
                # Check if people exist
                cursor = target.execute("SELECT COUNT(*) FROM people")
                if cursor.fetchone()[0] > 0:
                    return 7  # Ready for embeddings
                return 6  # People pending
        except sqlite3.OperationalError:
            pass

        # Check if qid_labels has data
        try:
            cursor = target.execute("SELECT COUNT(*) FROM qid_labels")
            if cursor.fetchone()[0] > 0:
                return 5  # Organizations pending
        except sqlite3.OperationalError:
            pass

        # Check if locations has data
        try:
            cursor = target.execute("SELECT COUNT(*) FROM locations")
            if cursor.fetchone()[0] > 0:
                return 4  # qid_labels pending
        except sqlite3.OperationalError:
            pass

        # Check if source_types has data
        try:
            cursor = target.execute("SELECT COUNT(*) FROM source_types")
            if cursor.fetchone()[0] > 0:
                return 3  # pycountry import pending
        except sqlite3.OperationalError:
            pass

        # Check if organizations table exists at all
        try:
            target.execute("SELECT 1 FROM organizations LIMIT 1")
            return 2  # Schema exists, enum seeding pending
        except sqlite3.OperationalError:
            pass

        return 1  # Start from beginning
306
+
307
+ def _rebuild_org_id_mapping(
308
+ self,
309
+ source: sqlite3.Connection,
310
+ target: sqlite3.Connection,
311
+ ) -> None:
312
+ """Rebuild organization ID mapping for embedding migration when resuming."""
313
+ logger.info("Rebuilding organization ID mapping...")
314
+
315
+ self._org_id_mapping = {}
316
+
317
+ # Get all source organizations with their IDs and source_ids
318
+ source_cursor = source.execute(
319
+ "SELECT id, source_id FROM organizations"
320
+ )
321
+
322
+ for row in source_cursor:
323
+ old_id = row["id"]
324
+ source_identifier = row["source_id"]
325
+
326
+ if source_identifier:
327
+ # Look up in target by source_identifier
328
+ target_cursor = target.execute(
329
+ "SELECT id FROM organizations WHERE source_identifier = ?",
330
+ (source_identifier,)
331
+ )
332
+ target_row = target_cursor.fetchone()
333
+ if target_row:
334
+ self._org_id_mapping[old_id] = target_row["id"]
335
+
336
+ logger.info(f"Rebuilt mapping for {len(self._org_id_mapping)} organizations")
337
+
338
+ def _rebuild_person_id_mapping(
339
+ self,
340
+ source: sqlite3.Connection,
341
+ target: sqlite3.Connection,
342
+ ) -> None:
343
+ """Rebuild person ID mapping for embedding migration when resuming."""
344
+ logger.info("Rebuilding person ID mapping...")
345
+
346
+ self._person_id_mapping = {}
347
+
348
+ # Get all source people with their IDs and source_ids
349
+ source_cursor = source.execute(
350
+ "SELECT id, source_id FROM people"
351
+ )
352
+
353
+ for row in source_cursor:
354
+ old_id = row["id"]
355
+ source_identifier = row["source_id"]
356
+
357
+ if source_identifier:
358
+ # Look up in target by source_identifier
359
+ target_cursor = target.execute(
360
+ "SELECT id FROM people WHERE source_identifier = ?",
361
+ (source_identifier,)
362
+ )
363
+ target_row = target_cursor.fetchone()
364
+ if target_row:
365
+ self._person_id_mapping[old_id] = target_row["id"]
366
+
367
+ logger.info(f"Rebuilt mapping for {len(self._person_id_mapping)} people")
368
+
369
+ def _build_location_cache(self, conn: sqlite3.Connection) -> None:
370
+ """Build location lookup cache from existing locations."""
371
+ cursor = conn.execute("SELECT id, name_normalized, source_identifier FROM locations")
372
+ for row in cursor:
373
+ # Cache by normalized name
374
+ self._location_cache[row["name_normalized"]] = row["id"]
375
+ # Also cache by source_identifier (e.g., "US", "GB")
376
+ if row["source_identifier"]:
377
+ self._location_cache[row["source_identifier"].lower()] = row["id"]
378
+
379
+ def _resolve_region_to_location(
380
+ self,
381
+ conn: sqlite3.Connection,
382
+ region: str,
383
+ ) -> Optional[int]:
384
+ """
385
+ Resolve a region string to a location ID.
386
+
387
+ Args:
388
+ conn: Target database connection
389
+ region: Region string (country code, name, or QID)
390
+
391
+ Returns:
392
+ Location ID or None if not resolved
393
+ """
394
+ if not region:
395
+ return None
396
+
397
+ # Check cache first
398
+ region_lower = region.lower().strip()
399
+ if region_lower in self._location_cache:
400
+ return self._location_cache[region_lower]
401
+
402
+ # Try to resolve via pycountry
403
+ location_id = self._resolve_via_pycountry(conn, region)
404
+ if location_id:
405
+ self._location_cache[region_lower] = location_id
406
+ return location_id
407
+
408
+ # If it's a QID, try to look up or create
409
+ if region.startswith("Q") and region[1:].isdigit():
410
+ # We can't resolve QIDs to locations without more data
411
+ # Return None for now
412
+ return None
413
+
414
+ return None
415
+
416
+ def _resolve_via_pycountry(
417
+ self,
418
+ conn: sqlite3.Connection,
419
+ region: str,
420
+ ) -> Optional[int]:
421
+ """Try to resolve region via pycountry."""
422
+ region_clean = region.strip()
423
+ if not region_clean:
424
+ return None
425
+
426
+ alpha_2 = None
427
+
428
+ # Try as 2-letter code
429
+ if len(region_clean) == 2:
430
+ country = pycountry.countries.get(alpha_2=region_clean.upper())
431
+ if country:
432
+ alpha_2 = country.alpha_2
433
+
434
+ # Try as 3-letter code
435
+ if not alpha_2 and len(region_clean) == 3:
436
+ country = pycountry.countries.get(alpha_3=region_clean.upper())
437
+ if country:
438
+ alpha_2 = country.alpha_2
439
+
440
+ # Try fuzzy search
441
+ if not alpha_2:
442
+ try:
443
+ matches = pycountry.countries.search_fuzzy(region_clean)
444
+ if matches:
445
+ alpha_2 = matches[0].alpha_2
446
+ except LookupError:
447
+ pass
448
+
449
+ # Look up by alpha_2 in cache
450
+ if alpha_2:
451
+ return self._location_cache.get(alpha_2.lower())
452
+
453
+ return None
454
+
455
+ def _get_or_create_role(
456
+ self,
457
+ conn: sqlite3.Connection,
458
+ role_name: str,
459
+ source_id: int = 4, # wikidata
460
+ ) -> int:
461
+ """
462
+ Get or create a role record.
463
+
464
+ Args:
465
+ conn: Target database connection
466
+ role_name: Role/title name
467
+ source_id: Source type ID (default: wikidata)
468
+
469
+ Returns:
470
+ Role ID
471
+ """
472
+ if not role_name:
473
+ raise ValueError("Role name cannot be empty")
474
+
475
+ name_normalized = normalize_name_for_lookup(role_name)
476
+
477
+ # Check cache
478
+ if name_normalized in self._role_cache:
479
+ return self._role_cache[name_normalized]
480
+
481
+ # Check database
482
+ cursor = conn.execute(
483
+ "SELECT id FROM roles WHERE name_normalized = ? AND source_id = ?",
484
+ (name_normalized, source_id)
485
+ )
486
+ row = cursor.fetchone()
487
+ if row:
488
+ self._role_cache[name_normalized] = row["id"]
489
+ return row["id"]
490
+
491
+ # Create new role
492
+ cursor = conn.execute(
493
+ """
494
+ INSERT INTO roles (name, name_normalized, source_id, record)
495
+ VALUES (?, ?, ?, '{}')
496
+ """,
497
+ (role_name, name_normalized, source_id)
498
+ )
499
+ role_id = cursor.lastrowid
500
+ assert role_id is not None
501
+ conn.commit()
502
+
503
+ self._role_cache[name_normalized] = role_id
504
+ return role_id
505
+
506
+ def _migrate_qid_labels(
507
+ self,
508
+ source: sqlite3.Connection,
509
+ target: sqlite3.Connection,
510
+ ) -> int:
511
+ """Migrate qid_labels table, converting TEXT QIDs to INTEGER."""
512
+ # Check if source has qid_labels table
513
+ cursor = source.execute(
514
+ "SELECT name FROM sqlite_master WHERE type='table' AND name='qid_labels'"
515
+ )
516
+ if not cursor.fetchone():
517
+ logger.info("No qid_labels table in source, skipping")
518
+ return 0
519
+
520
+ cursor = source.execute("SELECT qid, label FROM qid_labels")
521
+ count = 0
522
+
523
+ for row in cursor:
524
+ qid_int = parse_qid(row["qid"])
525
+ if qid_int is not None:
526
+ target.execute(
527
+ "INSERT OR IGNORE INTO qid_labels (qid, label) VALUES (?, ?)",
528
+ (qid_int, row["label"])
529
+ )
530
+ count += 1
531
+
532
+ target.commit()
533
+ logger.info(f"Migrated {count} QID labels")
534
+ return count
535
+
536
    def _migrate_organizations(
        self,
        source: sqlite3.Connection,
        target: sqlite3.Connection,
        batch_size: int,
    ) -> int:
        """
        Migrate the organizations table with FK conversion.

        Converts TEXT source/entity_type columns to integer FK IDs, resolves
        region strings to location IDs, extracts integer QIDs from Q-codes,
        and records an old->new ID mapping in self._org_id_mapping for the
        embedding migration step.

        Returns:
            Number of newly inserted organizations (duplicates not counted).
        """
        # Check source schema
        cursor = source.execute("PRAGMA table_info(organizations)")
        columns = {row["name"] for row in cursor}

        # v2 already has source_id and no source column -> nothing to do.
        if "source_id" in columns and "source" not in columns:
            logger.info("Source appears to already be v2 schema")
            return 0

        cursor = source.execute("""
            SELECT id, name, name_normalized, source, source_id, region,
                   entity_type, from_date, to_date, record, canon_id, canon_size
            FROM organizations
        """)

        count = 0
        id_mapping: dict[int, int] = {}  # old_id -> new_id

        for row in cursor:
            # Convert source to source_id FK
            source_name = row["source"]
            # Map "wikipedia" to "wikidata"
            if source_name == "wikipedia":
                source_name = "wikidata"
            source_type_id = SOURCE_NAME_TO_ID.get(source_name, 4)  # default to wikidata

            # Convert entity_type to entity_type_id FK
            entity_type_name = row["entity_type"] or "unknown"
            entity_type_id = ORG_TYPE_NAME_TO_ID.get(entity_type_name, 17)  # default to unknown

            # Resolve region to location_id
            region_id = self._resolve_region_to_location(target, row["region"] or "")

            # Extract QID from source_id if it's a Q code
            qid = None
            old_source_id = row["source_id"]
            if old_source_id and old_source_id.startswith("Q"):
                qid = parse_qid(old_source_id)

            # Insert into target (use OR IGNORE to handle region normalization duplicates)
            cursor2 = target.execute(
                """
                INSERT OR IGNORE INTO organizations
                (qid, name, name_normalized, source_id, source_identifier, region_id,
                 entity_type_id, from_date, to_date, record, canon_id, canon_size)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    qid,
                    row["name"],
                    row["name_normalized"],
                    source_type_id,
                    old_source_id,  # Keep original source_id as source_identifier
                    region_id,
                    entity_type_id,
                    row["from_date"] or None,
                    row["to_date"] or None,
                    row["record"],
                    None,  # Reset canon_id for fresh canonicalization
                    1,  # Reset canon_size
                )
            )

            # rowcount > 0 distinguishes a real insert from an ignored duplicate
            # (lastrowid alone can be stale after OR IGNORE).
            new_id = cursor2.lastrowid
            if new_id and cursor2.rowcount > 0:
                id_mapping[row["id"]] = new_id
                count += 1
            else:
                # Duplicate - look up the existing record's ID for embedding mapping
                existing = target.execute(
                    "SELECT id FROM organizations WHERE source_identifier = ? AND source_id = ?",
                    (old_source_id, source_type_id)
                ).fetchone()
                if existing:
                    id_mapping[row["id"]] = existing["id"]

            # NOTE(review): this also fires while count is still 0 (e.g. a run of
            # leading duplicates), causing extra no-op commits — harmless but noisy.
            if count % batch_size == 0:
                target.commit()
                logger.info(f"  Migrated {count} organizations...")

        target.commit()
        logger.info(f"Migrated {count} organizations")

        # Store ID mapping for embedding migration
        self._org_id_mapping = id_mapping
        return count
628
+
629
    def _migrate_people(
        self,
        source: sqlite3.Connection,
        target: sqlite3.Connection,
        batch_size: int,
    ) -> int:
        """
        Migrate the people table with FK conversion.

        Converts TEXT source/person_type columns to integer FK IDs, resolves
        country strings to location IDs, converts known_for_role to a roles
        FK (creating rows as needed), remaps known_for_org_id through the
        organization ID mapping, and records an old->new ID mapping in
        self._person_id_mapping for the embedding migration step.

        Returns:
            Number of newly inserted people (duplicates not counted).
        """
        # Check if source has people table
        cursor = source.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='people'"
        )
        if not cursor.fetchone():
            logger.info("No people table in source, skipping")
            return 0

        # Check source schema
        cursor = source.execute("PRAGMA table_info(people)")
        columns = {row["name"] for row in cursor}

        # v2 already has source_id and no source column -> nothing to do.
        if "source_id" in columns and "source" not in columns:
            logger.info("People table appears to already be v2 schema")
            return 0

        cursor = source.execute("""
            SELECT id, name, name_normalized, source, source_id, country,
                   person_type, known_for_role, known_for_org, known_for_org_id,
                   from_date, to_date, birth_date, death_date, record,
                   canon_id, canon_size
            FROM people
        """)

        count = 0
        id_mapping: dict[int, int] = {}  # old_id -> new_id

        for row in cursor:
            # Convert source to source_id FK
            source_name = row["source"] or "wikidata"
            source_type_id = SOURCE_NAME_TO_ID.get(source_name, 4)

            # Convert person_type to person_type_id FK
            person_type_name = row["person_type"] or "unknown"
            person_type_id = PEOPLE_TYPE_NAME_TO_ID.get(person_type_name, 15)

            # Resolve country to location_id
            country_id = self._resolve_region_to_location(target, row["country"] or "")

            # Get or create role_id for known_for_role
            role_id = None
            if row["known_for_role"]:
                role_id = self._get_or_create_role(target, row["known_for_role"], source_type_id)

            # Map known_for_org_id to new org ID (mapping only exists if
            # organizations were migrated or rebuilt earlier this run)
            old_org_id = row["known_for_org_id"]
            new_org_id = None
            if old_org_id and hasattr(self, "_org_id_mapping"):
                new_org_id = self._org_id_mapping.get(old_org_id)

            # Extract QID from source_id if it's a Q code
            qid = None
            old_source_id = row["source_id"]
            if old_source_id and old_source_id.startswith("Q"):
                qid = parse_qid(old_source_id)

            # Insert into target (use OR IGNORE to handle duplicates from normalization)
            cursor2 = target.execute(
                """
                INSERT OR IGNORE INTO people
                (qid, name, name_normalized, source_id, source_identifier, country_id,
                 person_type_id, known_for_role_id, known_for_org, known_for_org_id,
                 from_date, to_date, birth_date, death_date, record, canon_id, canon_size)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    qid,
                    row["name"],
                    row["name_normalized"],
                    source_type_id,
                    old_source_id,
                    country_id,
                    person_type_id,
                    role_id,
                    row["known_for_org"] or "",
                    new_org_id,
                    row["from_date"] or None,
                    row["to_date"] or None,
                    row["birth_date"] or None,
                    row["death_date"] or None,
                    row["record"],
                    None,  # Reset canon_id
                    1,  # Reset canon_size
                )
            )

            # rowcount > 0 distinguishes a real insert from an ignored duplicate.
            new_id = cursor2.lastrowid
            if new_id and cursor2.rowcount > 0:
                id_mapping[row["id"]] = new_id
                count += 1
            else:
                # Duplicate - look up existing record for embedding mapping.
                # IS comparisons match NULL role/org FKs as well.
                existing = target.execute(
                    """SELECT id FROM people
                       WHERE source_identifier = ? AND source_id = ?
                       AND known_for_role_id IS ? AND known_for_org_id IS ?""",
                    (old_source_id, source_type_id, role_id, new_org_id)
                ).fetchone()
                if existing:
                    id_mapping[row["id"]] = existing["id"]

            # NOTE(review): also fires while count is still 0, causing extra
            # no-op commits on runs of duplicates — harmless but noisy.
            if count % batch_size == 0:
                target.commit()
                logger.info(f"  Migrated {count} people...")

        target.commit()
        logger.info(f"Migrated {count} people")

        # Store ID mapping for embedding migration
        self._person_id_mapping = id_mapping
        return count
747
+
748
+ def _migrate_org_embeddings(
749
+ self,
750
+ source: sqlite3.Connection,
751
+ target: sqlite3.Connection,
752
+ batch_size: int,
753
+ ) -> int:
754
+ """Migrate organization embeddings using ID mapping."""
755
+ # Check if source has embeddings table
756
+ cursor = source.execute(
757
+ "SELECT name FROM sqlite_master WHERE type='table' AND name='organization_embeddings'"
758
+ )
759
+ if not cursor.fetchone():
760
+ logger.info("No organization_embeddings table in source, skipping")
761
+ return 0
762
+
763
+ if not hasattr(self, "_org_id_mapping"):
764
+ logger.warning("No org ID mapping available, skipping embedding migration")
765
+ return 0
766
+
767
+ cursor = source.execute("SELECT org_id, embedding FROM organization_embeddings")
768
+ count = 0
769
+
770
+ for row in cursor:
771
+ old_id = row["org_id"]
772
+ new_id = self._org_id_mapping.get(old_id)
773
+
774
+ if new_id is not None:
775
+ target.execute(
776
+ "INSERT OR REPLACE INTO organization_embeddings (org_id, embedding) VALUES (?, ?)",
777
+ (new_id, row["embedding"])
778
+ )
779
+ count += 1
780
+
781
+ if count % batch_size == 0:
782
+ target.commit()
783
+ logger.info(f" Migrated {count} organization embeddings...")
784
+
785
+ target.commit()
786
+ logger.info(f"Migrated {count} organization embeddings")
787
+ return count
788
+
789
+ def _migrate_person_embeddings(
790
+ self,
791
+ source: sqlite3.Connection,
792
+ target: sqlite3.Connection,
793
+ batch_size: int,
794
+ ) -> int:
795
+ """Migrate person embeddings using ID mapping."""
796
+ # Check if source has embeddings table
797
+ cursor = source.execute(
798
+ "SELECT name FROM sqlite_master WHERE type='table' AND name='person_embeddings'"
799
+ )
800
+ if not cursor.fetchone():
801
+ logger.info("No person_embeddings table in source, skipping")
802
+ return 0
803
+
804
+ if not hasattr(self, "_person_id_mapping"):
805
+ logger.warning("No person ID mapping available, skipping embedding migration")
806
+ return 0
807
+
808
+ cursor = source.execute("SELECT person_id, embedding FROM person_embeddings")
809
+ count = 0
810
+
811
+ for row in cursor:
812
+ old_id = row["person_id"]
813
+ new_id = self._person_id_mapping.get(old_id)
814
+
815
+ if new_id is not None:
816
+ target.execute(
817
+ "INSERT OR REPLACE INTO person_embeddings (person_id, embedding) VALUES (?, ?)",
818
+ (new_id, row["embedding"])
819
+ )
820
+ count += 1
821
+
822
+ if count % batch_size == 0:
823
+ target.commit()
824
+ logger.info(f" Migrated {count} person embeddings...")
825
+
826
+ target.commit()
827
+ logger.info(f"Migrated {count} person embeddings")
828
+ return count
829
+
830
+
831
def migrate_database(
    source_path: str | Path,
    target_path: str | Path,
    embedding_dim: int = 768,
    batch_size: int = 10000,
    resume: bool = False,
) -> dict[str, int]:
    """
    Migrate a v1 database to v2 normalized schema.

    Convenience wrapper that constructs a DatabaseMigrator and runs it.

    Args:
        source_path: Path to v1 database
        target_path: Path for v2 database (will be created)
        embedding_dim: Embedding dimension
        batch_size: Batch size for commits
        resume: If True, resume from last completed step

    Returns:
        Migration statistics
    """
    return DatabaseMigrator(
        source_path,
        target_path,
        embedding_dim,
        resume=resume,
    ).migrate(batch_size)