corp-extractor 0.9.3__py3-none-any.whl → 0.9.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +33 -3
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +16 -12
- statement_extractor/cli.py +472 -45
- statement_extractor/database/embeddings.py +45 -0
- statement_extractor/database/hub.py +51 -9
- statement_extractor/database/importers/import_utils.py +264 -0
- statement_extractor/database/importers/wikidata_dump.py +334 -3
- statement_extractor/database/importers/wikidata_people.py +44 -0
- statement_extractor/database/migrate_v2.py +852 -0
- statement_extractor/database/models.py +125 -1
- statement_extractor/database/schema_v2.py +409 -0
- statement_extractor/database/seed_data.py +359 -0
- statement_extractor/database/store.py +2113 -322
- statement_extractor/plugins/qualifiers/person.py +109 -52
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
|
@@ -32,10 +32,10 @@ from datetime import datetime
|
|
|
32
32
|
from pathlib import Path
|
|
33
33
|
from typing import Callable, Iterator, Optional
|
|
34
34
|
|
|
35
|
-
from ..models import CompanyRecord, EntityType, PersonRecord, PersonType
|
|
35
|
+
from ..models import CompanyRecord, EntityType, LocationRecord, PersonRecord, PersonType, SimplifiedLocationType
|
|
36
36
|
|
|
37
|
-
# Type alias for records that can be either people or orgs
|
|
38
|
-
ImportRecord = PersonRecord | CompanyRecord
|
|
37
|
+
# Type alias for records that can be either people or orgs or locations
|
|
38
|
+
ImportRecord = PersonRecord | CompanyRecord | LocationRecord
|
|
39
39
|
|
|
40
40
|
logger = logging.getLogger(__name__)
|
|
41
41
|
|
|
@@ -458,6 +458,111 @@ ORG_TYPE_TO_ENTITY_TYPE: dict[str, EntityType] = {
|
|
|
458
458
|
}
|
|
459
459
|
|
|
460
460
|
|
|
461
|
+
# =============================================================================
# LOCATION TYPE MAPPING (P31 - instance of)
# Maps P31 QID -> (location_type_name, simplified_type)
#
# Consumed by WikidataDumpImporter._get_location_type, which returns the entry
# for the first P31 value of an entity that appears as a key here.
# =============================================================================

LOCATION_TYPE_QIDS: dict[str, tuple[str, SimplifiedLocationType]] = {
    # ==========================================================================
    # IMPORTANT: The type names (first element of tuple) MUST match exactly
    # the names in database/seed_data.py LOCATION_TYPES. Any new types need
    # to be added there first, or use existing type names.
    #
    # NOTE(review): the trailing "# ..." labels describe what each QID is
    # believed to denote on Wikidata; they are not verified in this file --
    # confirm against Wikidata before relying on them.
    # ==========================================================================

    # Continents (maps to: continent)
    "Q5107": ("continent", SimplifiedLocationType.CONTINENT),

    # Countries / Sovereign states (maps to: country, sovereign_state, dependent_territory)
    "Q6256": ("country", SimplifiedLocationType.COUNTRY),
    "Q3624078": ("sovereign_state", SimplifiedLocationType.COUNTRY),
    "Q161243": ("dependent_territory", SimplifiedLocationType.COUNTRY),
    # Additional country-like types -> map to country
    "Q15634554": ("country", SimplifiedLocationType.COUNTRY),  # state with limited recognition
    "Q1763527": ("country", SimplifiedLocationType.COUNTRY),  # constituent country
    "Q46395": ("dependent_territory", SimplifiedLocationType.COUNTRY),  # british overseas territory

    # Subdivisions (states/provinces) - US
    "Q35657": ("us_state", SimplifiedLocationType.SUBDIVISION),
    "Q47168": ("us_county", SimplifiedLocationType.SUBDIVISION),

    # Subdivisions - Country-specific
    "Q5852411": ("state_of_australia", SimplifiedLocationType.SUBDIVISION),
    "Q1221156": ("state_of_germany", SimplifiedLocationType.SUBDIVISION),
    "Q131541": ("state_of_india", SimplifiedLocationType.SUBDIVISION),
    "Q6465": ("department_france", SimplifiedLocationType.SUBDIVISION),
    "Q50337": ("prefecture_japan", SimplifiedLocationType.SUBDIVISION),
    "Q23058": ("canton_switzerland", SimplifiedLocationType.SUBDIVISION),
    "Q10742": ("autonomous_community_spain", SimplifiedLocationType.SUBDIVISION),
    "Q150093": ("voivodeship_poland", SimplifiedLocationType.SUBDIVISION),
    "Q835714": ("oblast_russia", SimplifiedLocationType.SUBDIVISION),

    # Subdivisions - Generic (map to existing types)
    "Q34876": ("province", SimplifiedLocationType.SUBDIVISION),
    "Q82794": ("region", SimplifiedLocationType.SUBDIVISION),
    "Q28575": ("county", SimplifiedLocationType.SUBDIVISION),
    # Additional generic subdivision types -> map to region/province/county
    "Q10864048": ("region", SimplifiedLocationType.SUBDIVISION),  # first-level admin
    "Q11828004": ("county", SimplifiedLocationType.SUBDIVISION),  # second-level admin
    "Q12483": ("region", SimplifiedLocationType.SUBDIVISION),  # territory
    "Q515716": ("region", SimplifiedLocationType.SUBDIVISION),  # region of Italy
    "Q1132541": ("county", SimplifiedLocationType.SUBDIVISION),  # county of Sweden
    "Q1780990": ("region", SimplifiedLocationType.SUBDIVISION),  # council area Scotland
    "Q211690": ("county", SimplifiedLocationType.SUBDIVISION),  # ceremonial county England
    "Q180673": ("county", SimplifiedLocationType.SUBDIVISION),  # ceremonial county
    "Q1136601": ("county", SimplifiedLocationType.SUBDIVISION),  # metropolitan county
    "Q21451686": ("region", SimplifiedLocationType.SUBDIVISION),  # region of England
    "Q1006876": ("region", SimplifiedLocationType.SUBDIVISION),  # unitary authority Wales
    "Q179872": ("province", SimplifiedLocationType.SUBDIVISION),  # province of Canada
    "Q1352230": ("region", SimplifiedLocationType.SUBDIVISION),  # territory of Canada
    "Q13360155": ("province", SimplifiedLocationType.SUBDIVISION),  # province of China
    "Q842112": ("region", SimplifiedLocationType.SUBDIVISION),  # autonomous region China
    # The next two entries sit in the subdivision section but are deliberately
    # classified as CITY rather than SUBDIVISION.
    "Q1348006": ("municipality", SimplifiedLocationType.CITY),  # municipality of China (city-level)
    "Q11774097": ("city", SimplifiedLocationType.CITY),  # prefecture-level city

    # Cities/Towns/Municipalities (maps to: city, big_city, capital, town, municipality, village, hamlet)
    "Q515": ("city", SimplifiedLocationType.CITY),
    "Q1549591": ("big_city", SimplifiedLocationType.CITY),
    "Q5119": ("capital", SimplifiedLocationType.CITY),
    "Q3957": ("town", SimplifiedLocationType.CITY),
    "Q15284": ("municipality", SimplifiedLocationType.CITY),
    "Q532": ("village", SimplifiedLocationType.CITY),
    "Q5084": ("hamlet", SimplifiedLocationType.CITY),
    # Country-specific municipalities
    "Q484170": ("commune_france", SimplifiedLocationType.CITY),
    "Q262166": ("municipality_germany", SimplifiedLocationType.CITY),
    "Q1054813": ("municipality_japan", SimplifiedLocationType.CITY),
    # Additional city types -> map to city/town/village
    "Q7930989": ("city", SimplifiedLocationType.CITY),  # city of US
    "Q200250": ("big_city", SimplifiedLocationType.CITY),  # metropolis
    "Q2264924": ("big_city", SimplifiedLocationType.CITY),  # conurbation
    "Q174844": ("big_city", SimplifiedLocationType.CITY),  # megacity
    "Q22865": ("city", SimplifiedLocationType.CITY),  # independent city
    "Q5153359": ("municipality", SimplifiedLocationType.CITY),  # commune (generic)
    "Q4286337": ("village", SimplifiedLocationType.CITY),  # locality
    "Q486972": ("village", SimplifiedLocationType.CITY),  # human settlement
    "Q95993392": ("city", SimplifiedLocationType.CITY),  # city or town

    # Districts (maps to: district, borough, neighborhood, ward)
    "Q149621": ("district", SimplifiedLocationType.DISTRICT),
    "Q5765681": ("borough", SimplifiedLocationType.DISTRICT),
    "Q123705": ("neighborhood", SimplifiedLocationType.DISTRICT),
    "Q12813115": ("ward", SimplifiedLocationType.DISTRICT),
    # Additional district types -> map to district/borough
    "Q2198484": ("borough", SimplifiedLocationType.DISTRICT),  # borough of London
    "Q667509": ("district", SimplifiedLocationType.DISTRICT),  # arrondissement
    "Q2100709": ("district", SimplifiedLocationType.DISTRICT),  # city district

    # Historic (maps to: former_country, ancient_civilization, historic_territory)
    "Q3024240": ("former_country", SimplifiedLocationType.HISTORIC),
    "Q28171280": ("ancient_civilization", SimplifiedLocationType.HISTORIC),
    "Q1620908": ("historic_territory", SimplifiedLocationType.HISTORIC),
    # Additional historic types
    "Q19953632": ("historic_territory", SimplifiedLocationType.HISTORIC),  # historical region
    "Q1307214": ("historic_territory", SimplifiedLocationType.HISTORIC),  # historical admin region
}
|
|
564
|
+
|
|
565
|
+
|
|
461
566
|
# =============================================================================
|
|
462
567
|
# PROGRESS TRACKING
|
|
463
568
|
# =============================================================================
|
|
@@ -1250,6 +1355,27 @@ class WikidataDumpImporter:
|
|
|
1250
1355
|
return ORG_TYPE_TO_ENTITY_TYPE[qid]
|
|
1251
1356
|
return None
|
|
1252
1357
|
|
|
1358
|
+
def _get_location_type(self, entity: dict) -> Optional[tuple[str, SimplifiedLocationType]]:
|
|
1359
|
+
"""
|
|
1360
|
+
Check if entity has P31 (instance of) matching a location type.
|
|
1361
|
+
|
|
1362
|
+
Args:
|
|
1363
|
+
entity: Parsed Wikidata entity dictionary
|
|
1364
|
+
|
|
1365
|
+
Returns:
|
|
1366
|
+
Tuple of (location_type_name, SimplifiedLocationType) if entity is a location, None otherwise
|
|
1367
|
+
"""
|
|
1368
|
+
claims = entity.get("claims", {})
|
|
1369
|
+
for claim in claims.get("P31", []):
|
|
1370
|
+
mainsnak = claim.get("mainsnak", {})
|
|
1371
|
+
datavalue = mainsnak.get("datavalue", {})
|
|
1372
|
+
value = datavalue.get("value", {})
|
|
1373
|
+
if isinstance(value, dict):
|
|
1374
|
+
qid = value.get("id", "")
|
|
1375
|
+
if qid in LOCATION_TYPE_QIDS:
|
|
1376
|
+
return LOCATION_TYPE_QIDS[qid]
|
|
1377
|
+
return None
|
|
1378
|
+
|
|
1253
1379
|
def _get_claim_values(self, entity: dict, prop: str) -> list[str]:
|
|
1254
1380
|
"""
|
|
1255
1381
|
Get all QID values for a property (e.g., P39, P106).
|
|
@@ -1545,6 +1671,13 @@ class WikidataDumpImporter:
|
|
|
1545
1671
|
# Get best role/org/dates from positions
|
|
1546
1672
|
role_qid, _, org_qid, start_date, end_date, extra_context = self._get_best_role_org(positions)
|
|
1547
1673
|
|
|
1674
|
+
# Fallback: if no org from positions, check top-level P108 (employer)
|
|
1675
|
+
if not org_qid:
|
|
1676
|
+
employers = self._get_claim_values(entity, "P108")
|
|
1677
|
+
if employers:
|
|
1678
|
+
org_qid = employers[0]
|
|
1679
|
+
logger.debug(f"Using top-level P108 employer for {qid}: {org_qid}")
|
|
1680
|
+
|
|
1548
1681
|
# Get country (P27 - country of citizenship)
|
|
1549
1682
|
countries = self._get_claim_values(entity, "P27")
|
|
1550
1683
|
country_qid = countries[0] if countries else ""
|
|
@@ -1663,6 +1796,204 @@ class WikidataDumpImporter:
|
|
|
1663
1796
|
},
|
|
1664
1797
|
)
|
|
1665
1798
|
|
|
1799
|
+
def _process_location_entity(
|
|
1800
|
+
self,
|
|
1801
|
+
entity: dict,
|
|
1802
|
+
require_enwiki: bool = False,
|
|
1803
|
+
) -> Optional[LocationRecord]:
|
|
1804
|
+
"""
|
|
1805
|
+
Process a single entity, return LocationRecord if it's a location.
|
|
1806
|
+
|
|
1807
|
+
Args:
|
|
1808
|
+
entity: Parsed Wikidata entity dictionary
|
|
1809
|
+
require_enwiki: If True, only include locations with English Wikipedia articles
|
|
1810
|
+
|
|
1811
|
+
Returns:
|
|
1812
|
+
LocationRecord if entity qualifies, None otherwise
|
|
1813
|
+
"""
|
|
1814
|
+
# Must be an item (not property)
|
|
1815
|
+
if entity.get("type") != "item":
|
|
1816
|
+
return None
|
|
1817
|
+
|
|
1818
|
+
# Get location type from P31
|
|
1819
|
+
location_type_info = self._get_location_type(entity)
|
|
1820
|
+
if location_type_info is None:
|
|
1821
|
+
return None
|
|
1822
|
+
|
|
1823
|
+
location_type_name, simplified_type = location_type_info
|
|
1824
|
+
|
|
1825
|
+
# Optionally require English Wikipedia article
|
|
1826
|
+
if require_enwiki:
|
|
1827
|
+
sitelinks = entity.get("sitelinks", {})
|
|
1828
|
+
if "enwiki" not in sitelinks:
|
|
1829
|
+
return None
|
|
1830
|
+
|
|
1831
|
+
# Extract location data
|
|
1832
|
+
return self._extract_location_data(entity, location_type_name, simplified_type)
|
|
1833
|
+
|
|
1834
|
+
def _extract_location_data(
|
|
1835
|
+
self,
|
|
1836
|
+
entity: dict,
|
|
1837
|
+
location_type: str,
|
|
1838
|
+
simplified_type: SimplifiedLocationType,
|
|
1839
|
+
) -> Optional[LocationRecord]:
|
|
1840
|
+
"""
|
|
1841
|
+
Extract LocationRecord from entity dict.
|
|
1842
|
+
|
|
1843
|
+
Args:
|
|
1844
|
+
entity: Parsed Wikidata entity dictionary
|
|
1845
|
+
location_type: Detailed location type name
|
|
1846
|
+
simplified_type: Simplified location type enum
|
|
1847
|
+
|
|
1848
|
+
Returns:
|
|
1849
|
+
LocationRecord or None if essential data is missing
|
|
1850
|
+
"""
|
|
1851
|
+
qid = entity.get("id", "")
|
|
1852
|
+
labels = entity.get("labels", {})
|
|
1853
|
+
label = labels.get("en", {}).get("value", "")
|
|
1854
|
+
|
|
1855
|
+
if not label or not qid:
|
|
1856
|
+
return None
|
|
1857
|
+
|
|
1858
|
+
claims = entity.get("claims", {})
|
|
1859
|
+
|
|
1860
|
+
# Get parent locations from P131 (located in administrative territorial entity)
|
|
1861
|
+
# This gives us the full hierarchy (city -> state -> country)
|
|
1862
|
+
parent_qids = self._get_claim_values(entity, "P131")
|
|
1863
|
+
|
|
1864
|
+
# Get country from P17 as fallback/additional parent
|
|
1865
|
+
country_qids = self._get_claim_values(entity, "P17")
|
|
1866
|
+
|
|
1867
|
+
# Get coordinates from P625 (coordinate location)
|
|
1868
|
+
coordinates = self._get_coordinates(claims)
|
|
1869
|
+
|
|
1870
|
+
# Get description
|
|
1871
|
+
descriptions = entity.get("descriptions", {})
|
|
1872
|
+
description = descriptions.get("en", {}).get("value", "")
|
|
1873
|
+
|
|
1874
|
+
# Get inception date (P571) - when location was established
|
|
1875
|
+
inception = self._get_time_claim(claims, "P571")
|
|
1876
|
+
|
|
1877
|
+
# Get dissolution date (P576) - when location ceased to exist
|
|
1878
|
+
dissolution = self._get_time_claim(claims, "P576")
|
|
1879
|
+
|
|
1880
|
+
# Parse QID to integer
|
|
1881
|
+
qid_int = int(qid[1:]) if qid.startswith("Q") and qid[1:].isdigit() else None
|
|
1882
|
+
|
|
1883
|
+
# Build record with extra details
|
|
1884
|
+
record_data = {
|
|
1885
|
+
"wikidata_id": qid,
|
|
1886
|
+
"label": label,
|
|
1887
|
+
"description": description,
|
|
1888
|
+
"parent_qids": parent_qids,
|
|
1889
|
+
"country_qids": country_qids,
|
|
1890
|
+
}
|
|
1891
|
+
if coordinates:
|
|
1892
|
+
record_data["coordinates"] = coordinates
|
|
1893
|
+
|
|
1894
|
+
return LocationRecord(
|
|
1895
|
+
name=label,
|
|
1896
|
+
source="wikidata",
|
|
1897
|
+
source_id=qid,
|
|
1898
|
+
qid=qid_int,
|
|
1899
|
+
location_type=location_type,
|
|
1900
|
+
simplified_type=simplified_type,
|
|
1901
|
+
parent_ids=[], # Will be resolved later by looking up parent QIDs in the database
|
|
1902
|
+
from_date=inception,
|
|
1903
|
+
to_date=dissolution,
|
|
1904
|
+
record=record_data,
|
|
1905
|
+
)
|
|
1906
|
+
|
|
1907
|
+
def _get_coordinates(self, claims: dict) -> Optional[dict]:
|
|
1908
|
+
"""
|
|
1909
|
+
Get coordinates from P625 (coordinate location).
|
|
1910
|
+
|
|
1911
|
+
Args:
|
|
1912
|
+
claims: Claims dictionary
|
|
1913
|
+
|
|
1914
|
+
Returns:
|
|
1915
|
+
Dict with lat/lon or None
|
|
1916
|
+
"""
|
|
1917
|
+
for claim in claims.get("P625", []):
|
|
1918
|
+
mainsnak = claim.get("mainsnak", {})
|
|
1919
|
+
datavalue = mainsnak.get("datavalue", {})
|
|
1920
|
+
value = datavalue.get("value", {})
|
|
1921
|
+
if isinstance(value, dict):
|
|
1922
|
+
lat = value.get("latitude")
|
|
1923
|
+
lon = value.get("longitude")
|
|
1924
|
+
if lat is not None and lon is not None:
|
|
1925
|
+
return {"lat": lat, "lon": lon}
|
|
1926
|
+
return None
|
|
1927
|
+
|
|
1928
|
+
def import_locations(
    self,
    dump_path: Optional[Path] = None,
    limit: Optional[int] = None,
    require_enwiki: bool = False,
    skip_ids: Optional[set[str]] = None,
    start_index: int = 0,
    progress_callback: Optional[Callable[[int, str, int], None]] = None,
) -> Iterator[LocationRecord]:
    """
    Lazily walk the dump and yield a LocationRecord per qualifying location.

    An entity is yielded when:
    - its id is not in skip_ids
    - it has type "item"
    - one of its P31 (instance of) values matches a location type
    - optionally: it has an English Wikipedia article (enwiki sitelink)

    Args:
        dump_path: Path to dump file (uses self._dump_path if not provided)
        limit: Optional maximum number of records to return; a falsy value
            (None or 0) means no limit
        require_enwiki: If True, only include locations with English Wikipedia articles
        skip_ids: Optional set of source_ids (Q codes) to skip
        start_index: Entity index to start from (for resume support)
        progress_callback: Optional callback(entity_index, entity_id, records_yielded),
            invoked just before each record is yielded

    Yields:
        LocationRecord for each qualifying location
    """
    source_path = dump_path or self._dump_path
    yielded = 0
    already_present = 0
    latest_index = start_index

    logger.info("Starting location import from Wikidata dump...")
    if start_index > 0:
        logger.info(f"Resuming from entity index {start_index:,}")
    if not require_enwiki:
        logger.info("Importing ALL locations (no enwiki filter)")
    if skip_ids:
        logger.info(f"Skipping {len(skip_ids):,} existing Q codes")

    def remember_position(entity_index: int, entity_id: str) -> None:
        # iter_entities reports its position here; keep the latest index for callbacks
        nonlocal latest_index
        latest_index = entity_index

    entity_stream = self.iter_entities(
        source_path,
        start_index=start_index,
        progress_callback=remember_position,
    )
    for entity in entity_stream:
        if limit and yielded >= limit:
            break

        # Cheap membership test first, before any claim parsing
        entity_id = entity.get("id", "")
        if skip_ids and entity_id in skip_ids:
            already_present += 1
            continue

        record = self._process_location_entity(entity, require_enwiki=require_enwiki)
        if not record:
            continue

        yielded += 1
        if yielded % 10_000 == 0:
            logger.info(f"Yielded {yielded:,} location records (skipped {already_present:,})...")

        # Report the current dump position before handing the record out
        if progress_callback:
            progress_callback(latest_index, entity_id, yielded)

        yield record

    logger.info(f"Location import complete: {yielded:,} records (skipped {already_present:,})")
|
|
1996
|
+
|
|
1666
1997
|
def _get_string_claim(self, claims: dict, prop: str) -> str:
|
|
1667
1998
|
"""
|
|
1668
1999
|
Get first string value for a property.
|
|
@@ -1049,6 +1049,12 @@ class WikidataPeopleImporter:
|
|
|
1049
1049
|
best_result = result
|
|
1050
1050
|
|
|
1051
1051
|
if best_result:
|
|
1052
|
+
# If we have a role but no org, try P108 (employer) as fallback
|
|
1053
|
+
role_label, org_label, org_qid, from_date, to_date = best_result
|
|
1054
|
+
if role_label and not org_label:
|
|
1055
|
+
fallback_org, fallback_org_qid = self._get_employer(person_qid)
|
|
1056
|
+
if fallback_org:
|
|
1057
|
+
return role_label, fallback_org, fallback_org_qid, from_date, to_date
|
|
1052
1058
|
return best_result
|
|
1053
1059
|
|
|
1054
1060
|
return "", "", "", None, None
|
|
@@ -1057,6 +1063,44 @@ class WikidataPeopleImporter:
|
|
|
1057
1063
|
logger.debug(f"Failed to enrich role/org for {person_qid}: {e}")
|
|
1058
1064
|
return "", "", "", None, None
|
|
1059
1065
|
|
|
1066
|
+
def _get_employer(self, person_qid: str) -> tuple[str, str]:
|
|
1067
|
+
"""
|
|
1068
|
+
Query P108 (employer) as fallback for org.
|
|
1069
|
+
|
|
1070
|
+
Args:
|
|
1071
|
+
person_qid: Wikidata QID of the person
|
|
1072
|
+
|
|
1073
|
+
Returns:
|
|
1074
|
+
Tuple of (org_label, org_qid) or ("", "") if not found
|
|
1075
|
+
"""
|
|
1076
|
+
query = """
|
|
1077
|
+
SELECT ?org ?orgLabel WHERE {
|
|
1078
|
+
wd:%s wdt:P108 ?org .
|
|
1079
|
+
?org rdfs:label ?orgLabel FILTER(LANG(?orgLabel) = "en") .
|
|
1080
|
+
}
|
|
1081
|
+
LIMIT 1
|
|
1082
|
+
""" % person_qid
|
|
1083
|
+
|
|
1084
|
+
try:
|
|
1085
|
+
url = f"{WIKIDATA_SPARQL_URL}?query={urllib.parse.quote(query)}&format=json"
|
|
1086
|
+
req = urllib.request.Request(url, headers={"User-Agent": "corp-extractor/1.0"})
|
|
1087
|
+
|
|
1088
|
+
with urllib.request.urlopen(req, timeout=15) as response:
|
|
1089
|
+
data = json.loads(response.read().decode("utf-8"))
|
|
1090
|
+
|
|
1091
|
+
bindings = data.get("results", {}).get("bindings", [])
|
|
1092
|
+
if bindings:
|
|
1093
|
+
org_label = bindings[0].get("orgLabel", {}).get("value", "")
|
|
1094
|
+
org_uri = bindings[0].get("org", {}).get("value", "")
|
|
1095
|
+
org_qid = org_uri.split("/")[-1] if org_uri else ""
|
|
1096
|
+
if org_label and not org_label.startswith("Q"):
|
|
1097
|
+
return org_label, org_qid
|
|
1098
|
+
|
|
1099
|
+
except Exception as e:
|
|
1100
|
+
logger.debug(f"Failed to get employer for {person_qid}: {e}")
|
|
1101
|
+
|
|
1102
|
+
return "", ""
|
|
1103
|
+
|
|
1060
1104
|
def enrich_people_role_org_batch(
|
|
1061
1105
|
self,
|
|
1062
1106
|
people: list[PersonRecord],
|