corp-extractor 0.9.3__py3-none-any.whl → 0.9.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -32,10 +32,10 @@ from datetime import datetime
32
32
  from pathlib import Path
33
33
  from typing import Callable, Iterator, Optional
34
34
 
35
- from ..models import CompanyRecord, EntityType, PersonRecord, PersonType
35
+ from ..models import CompanyRecord, EntityType, LocationRecord, PersonRecord, PersonType, SimplifiedLocationType
36
36
 
37
- # Type alias for records that can be either people or orgs
38
- ImportRecord = PersonRecord | CompanyRecord
37
+ # Type alias for records that can be either people or orgs or locations
38
+ ImportRecord = PersonRecord | CompanyRecord | LocationRecord
39
39
 
40
40
  logger = logging.getLogger(__name__)
41
41
 
@@ -458,6 +458,111 @@ ORG_TYPE_TO_ENTITY_TYPE: dict[str, EntityType] = {
458
458
  }
459
459
 
460
460
 
461
# =============================================================================
# LOCATION TYPE MAPPING (P31 - instance of)
# Maps P31 QID -> (location_type_name, simplified_type)
#
# Keys are the QIDs that can appear as the target of an entity's P31 statement;
# values pair a detailed type name with its coarse SimplifiedLocationType bucket.
# Several QIDs may intentionally share the same (name, bucket) pair.
# =============================================================================

LOCATION_TYPE_QIDS: dict[str, tuple[str, SimplifiedLocationType]] = {
    # ==========================================================================
    # IMPORTANT: The type names (first element of tuple) MUST match exactly
    # the names in database/seed_data.py LOCATION_TYPES. Any new types need
    # to be added there first, or use existing type names.
    # ==========================================================================

    # Continents (maps to: continent)
    "Q5107": ("continent", SimplifiedLocationType.CONTINENT),

    # Countries / Sovereign states (maps to: country, sovereign_state, dependent_territory)
    "Q6256": ("country", SimplifiedLocationType.COUNTRY),
    "Q3624078": ("sovereign_state", SimplifiedLocationType.COUNTRY),
    "Q161243": ("dependent_territory", SimplifiedLocationType.COUNTRY),
    # Additional country-like types -> map to country / dependent_territory
    "Q15634554": ("country", SimplifiedLocationType.COUNTRY),  # state with limited recognition
    "Q1763527": ("country", SimplifiedLocationType.COUNTRY),  # constituent country
    "Q46395": ("dependent_territory", SimplifiedLocationType.COUNTRY),  # british overseas territory

    # Subdivisions (states/provinces) - US
    "Q35657": ("us_state", SimplifiedLocationType.SUBDIVISION),
    "Q47168": ("us_county", SimplifiedLocationType.SUBDIVISION),

    # Subdivisions - Country-specific
    "Q5852411": ("state_of_australia", SimplifiedLocationType.SUBDIVISION),
    "Q1221156": ("state_of_germany", SimplifiedLocationType.SUBDIVISION),
    "Q131541": ("state_of_india", SimplifiedLocationType.SUBDIVISION),
    "Q6465": ("department_france", SimplifiedLocationType.SUBDIVISION),
    "Q50337": ("prefecture_japan", SimplifiedLocationType.SUBDIVISION),
    "Q23058": ("canton_switzerland", SimplifiedLocationType.SUBDIVISION),
    "Q10742": ("autonomous_community_spain", SimplifiedLocationType.SUBDIVISION),
    "Q150093": ("voivodeship_poland", SimplifiedLocationType.SUBDIVISION),
    "Q835714": ("oblast_russia", SimplifiedLocationType.SUBDIVISION),

    # Subdivisions - Generic (map to existing types)
    "Q34876": ("province", SimplifiedLocationType.SUBDIVISION),
    "Q82794": ("region", SimplifiedLocationType.SUBDIVISION),
    "Q28575": ("county", SimplifiedLocationType.SUBDIVISION),
    # Additional generic subdivision types -> map to region/province/county.
    # NOTE: the last two entries of this group are city-level and are bucketed
    # as CITY (municipality / city), not SUBDIVISION.
    "Q10864048": ("region", SimplifiedLocationType.SUBDIVISION),  # first-level admin
    "Q11828004": ("county", SimplifiedLocationType.SUBDIVISION),  # second-level admin
    "Q12483": ("region", SimplifiedLocationType.SUBDIVISION),  # territory
    "Q515716": ("region", SimplifiedLocationType.SUBDIVISION),  # region of Italy
    "Q1132541": ("county", SimplifiedLocationType.SUBDIVISION),  # county of Sweden
    "Q1780990": ("region", SimplifiedLocationType.SUBDIVISION),  # council area Scotland
    "Q211690": ("county", SimplifiedLocationType.SUBDIVISION),  # ceremonial county England
    "Q180673": ("county", SimplifiedLocationType.SUBDIVISION),  # ceremonial county
    "Q1136601": ("county", SimplifiedLocationType.SUBDIVISION),  # metropolitan county
    "Q21451686": ("region", SimplifiedLocationType.SUBDIVISION),  # region of England
    "Q1006876": ("region", SimplifiedLocationType.SUBDIVISION),  # unitary authority Wales
    "Q179872": ("province", SimplifiedLocationType.SUBDIVISION),  # province of Canada
    "Q1352230": ("region", SimplifiedLocationType.SUBDIVISION),  # territory of Canada
    "Q13360155": ("province", SimplifiedLocationType.SUBDIVISION),  # province of China
    "Q842112": ("region", SimplifiedLocationType.SUBDIVISION),  # autonomous region China
    "Q1348006": ("municipality", SimplifiedLocationType.CITY),  # municipality of China (city-level)
    "Q11774097": ("city", SimplifiedLocationType.CITY),  # prefecture-level city

    # Cities/Towns/Municipalities (maps to: city, big_city, capital, town, municipality, village, hamlet)
    "Q515": ("city", SimplifiedLocationType.CITY),
    "Q1549591": ("big_city", SimplifiedLocationType.CITY),
    "Q5119": ("capital", SimplifiedLocationType.CITY),
    "Q3957": ("town", SimplifiedLocationType.CITY),
    "Q15284": ("municipality", SimplifiedLocationType.CITY),
    "Q532": ("village", SimplifiedLocationType.CITY),
    "Q5084": ("hamlet", SimplifiedLocationType.CITY),
    # Country-specific municipalities
    "Q484170": ("commune_france", SimplifiedLocationType.CITY),
    "Q262166": ("municipality_germany", SimplifiedLocationType.CITY),
    "Q1054813": ("municipality_japan", SimplifiedLocationType.CITY),
    # Additional city types -> map to city/big_city/municipality/village
    "Q7930989": ("city", SimplifiedLocationType.CITY),  # city of US
    "Q200250": ("big_city", SimplifiedLocationType.CITY),  # metropolis
    "Q2264924": ("big_city", SimplifiedLocationType.CITY),  # conurbation
    "Q174844": ("big_city", SimplifiedLocationType.CITY),  # megacity
    "Q22865": ("city", SimplifiedLocationType.CITY),  # independent city
    "Q5153359": ("municipality", SimplifiedLocationType.CITY),  # commune (generic)
    "Q4286337": ("village", SimplifiedLocationType.CITY),  # locality
    "Q486972": ("village", SimplifiedLocationType.CITY),  # human settlement
    "Q95993392": ("city", SimplifiedLocationType.CITY),  # city or town

    # Districts (maps to: district, borough, neighborhood, ward)
    "Q149621": ("district", SimplifiedLocationType.DISTRICT),
    "Q5765681": ("borough", SimplifiedLocationType.DISTRICT),
    "Q123705": ("neighborhood", SimplifiedLocationType.DISTRICT),
    "Q12813115": ("ward", SimplifiedLocationType.DISTRICT),
    # Additional district types -> map to district/borough
    "Q2198484": ("borough", SimplifiedLocationType.DISTRICT),  # borough of London
    "Q667509": ("district", SimplifiedLocationType.DISTRICT),  # arrondissement
    "Q2100709": ("district", SimplifiedLocationType.DISTRICT),  # city district

    # Historic (maps to: former_country, ancient_civilization, historic_territory)
    "Q3024240": ("former_country", SimplifiedLocationType.HISTORIC),
    "Q28171280": ("ancient_civilization", SimplifiedLocationType.HISTORIC),
    "Q1620908": ("historic_territory", SimplifiedLocationType.HISTORIC),
    # Additional historic types
    "Q19953632": ("historic_territory", SimplifiedLocationType.HISTORIC),  # historical region
    "Q1307214": ("historic_territory", SimplifiedLocationType.HISTORIC),  # historical admin region
}
564
+
565
+
461
566
  # =============================================================================
462
567
  # PROGRESS TRACKING
463
568
  # =============================================================================
@@ -1250,6 +1355,27 @@ class WikidataDumpImporter:
1250
1355
  return ORG_TYPE_TO_ENTITY_TYPE[qid]
1251
1356
  return None
1252
1357
 
1358
+ def _get_location_type(self, entity: dict) -> Optional[tuple[str, SimplifiedLocationType]]:
1359
+ """
1360
+ Check if entity has P31 (instance of) matching a location type.
1361
+
1362
+ Args:
1363
+ entity: Parsed Wikidata entity dictionary
1364
+
1365
+ Returns:
1366
+ Tuple of (location_type_name, SimplifiedLocationType) if entity is a location, None otherwise
1367
+ """
1368
+ claims = entity.get("claims", {})
1369
+ for claim in claims.get("P31", []):
1370
+ mainsnak = claim.get("mainsnak", {})
1371
+ datavalue = mainsnak.get("datavalue", {})
1372
+ value = datavalue.get("value", {})
1373
+ if isinstance(value, dict):
1374
+ qid = value.get("id", "")
1375
+ if qid in LOCATION_TYPE_QIDS:
1376
+ return LOCATION_TYPE_QIDS[qid]
1377
+ return None
1378
+
1253
1379
  def _get_claim_values(self, entity: dict, prop: str) -> list[str]:
1254
1380
  """
1255
1381
  Get all QID values for a property (e.g., P39, P106).
@@ -1545,6 +1671,13 @@ class WikidataDumpImporter:
1545
1671
  # Get best role/org/dates from positions
1546
1672
  role_qid, _, org_qid, start_date, end_date, extra_context = self._get_best_role_org(positions)
1547
1673
 
1674
+ # Fallback: if no org from positions, check top-level P108 (employer)
1675
+ if not org_qid:
1676
+ employers = self._get_claim_values(entity, "P108")
1677
+ if employers:
1678
+ org_qid = employers[0]
1679
+ logger.debug(f"Using top-level P108 employer for {qid}: {org_qid}")
1680
+
1548
1681
  # Get country (P27 - country of citizenship)
1549
1682
  countries = self._get_claim_values(entity, "P27")
1550
1683
  country_qid = countries[0] if countries else ""
@@ -1663,6 +1796,204 @@ class WikidataDumpImporter:
1663
1796
  },
1664
1797
  )
1665
1798
 
1799
+ def _process_location_entity(
1800
+ self,
1801
+ entity: dict,
1802
+ require_enwiki: bool = False,
1803
+ ) -> Optional[LocationRecord]:
1804
+ """
1805
+ Process a single entity, return LocationRecord if it's a location.
1806
+
1807
+ Args:
1808
+ entity: Parsed Wikidata entity dictionary
1809
+ require_enwiki: If True, only include locations with English Wikipedia articles
1810
+
1811
+ Returns:
1812
+ LocationRecord if entity qualifies, None otherwise
1813
+ """
1814
+ # Must be an item (not property)
1815
+ if entity.get("type") != "item":
1816
+ return None
1817
+
1818
+ # Get location type from P31
1819
+ location_type_info = self._get_location_type(entity)
1820
+ if location_type_info is None:
1821
+ return None
1822
+
1823
+ location_type_name, simplified_type = location_type_info
1824
+
1825
+ # Optionally require English Wikipedia article
1826
+ if require_enwiki:
1827
+ sitelinks = entity.get("sitelinks", {})
1828
+ if "enwiki" not in sitelinks:
1829
+ return None
1830
+
1831
+ # Extract location data
1832
+ return self._extract_location_data(entity, location_type_name, simplified_type)
1833
+
1834
+ def _extract_location_data(
1835
+ self,
1836
+ entity: dict,
1837
+ location_type: str,
1838
+ simplified_type: SimplifiedLocationType,
1839
+ ) -> Optional[LocationRecord]:
1840
+ """
1841
+ Extract LocationRecord from entity dict.
1842
+
1843
+ Args:
1844
+ entity: Parsed Wikidata entity dictionary
1845
+ location_type: Detailed location type name
1846
+ simplified_type: Simplified location type enum
1847
+
1848
+ Returns:
1849
+ LocationRecord or None if essential data is missing
1850
+ """
1851
+ qid = entity.get("id", "")
1852
+ labels = entity.get("labels", {})
1853
+ label = labels.get("en", {}).get("value", "")
1854
+
1855
+ if not label or not qid:
1856
+ return None
1857
+
1858
+ claims = entity.get("claims", {})
1859
+
1860
+ # Get parent locations from P131 (located in administrative territorial entity)
1861
+ # This gives us the full hierarchy (city -> state -> country)
1862
+ parent_qids = self._get_claim_values(entity, "P131")
1863
+
1864
+ # Get country from P17 as fallback/additional parent
1865
+ country_qids = self._get_claim_values(entity, "P17")
1866
+
1867
+ # Get coordinates from P625 (coordinate location)
1868
+ coordinates = self._get_coordinates(claims)
1869
+
1870
+ # Get description
1871
+ descriptions = entity.get("descriptions", {})
1872
+ description = descriptions.get("en", {}).get("value", "")
1873
+
1874
+ # Get inception date (P571) - when location was established
1875
+ inception = self._get_time_claim(claims, "P571")
1876
+
1877
+ # Get dissolution date (P576) - when location ceased to exist
1878
+ dissolution = self._get_time_claim(claims, "P576")
1879
+
1880
+ # Parse QID to integer
1881
+ qid_int = int(qid[1:]) if qid.startswith("Q") and qid[1:].isdigit() else None
1882
+
1883
+ # Build record with extra details
1884
+ record_data = {
1885
+ "wikidata_id": qid,
1886
+ "label": label,
1887
+ "description": description,
1888
+ "parent_qids": parent_qids,
1889
+ "country_qids": country_qids,
1890
+ }
1891
+ if coordinates:
1892
+ record_data["coordinates"] = coordinates
1893
+
1894
+ return LocationRecord(
1895
+ name=label,
1896
+ source="wikidata",
1897
+ source_id=qid,
1898
+ qid=qid_int,
1899
+ location_type=location_type,
1900
+ simplified_type=simplified_type,
1901
+ parent_ids=[], # Will be resolved later by looking up parent QIDs in the database
1902
+ from_date=inception,
1903
+ to_date=dissolution,
1904
+ record=record_data,
1905
+ )
1906
+
1907
+ def _get_coordinates(self, claims: dict) -> Optional[dict]:
1908
+ """
1909
+ Get coordinates from P625 (coordinate location).
1910
+
1911
+ Args:
1912
+ claims: Claims dictionary
1913
+
1914
+ Returns:
1915
+ Dict with lat/lon or None
1916
+ """
1917
+ for claim in claims.get("P625", []):
1918
+ mainsnak = claim.get("mainsnak", {})
1919
+ datavalue = mainsnak.get("datavalue", {})
1920
+ value = datavalue.get("value", {})
1921
+ if isinstance(value, dict):
1922
+ lat = value.get("latitude")
1923
+ lon = value.get("longitude")
1924
+ if lat is not None and lon is not None:
1925
+ return {"lat": lat, "lon": lon}
1926
+ return None
1927
+
1928
def import_locations(
    self,
    dump_path: Optional[Path] = None,
    limit: Optional[int] = None,
    require_enwiki: bool = False,
    skip_ids: Optional[set[str]] = None,
    start_index: int = 0,
    progress_callback: Optional[Callable[[int, str, int], None]] = None,
) -> Iterator[LocationRecord]:
    """
    Stream the dump and yield a LocationRecord for every location found.

    An entity is yielded when:
    - it is not listed in skip_ids
    - _process_location_entity() returns a record for it (item with a P31
      matching a location type, optionally with an enwiki sitelink)

    Args:
        dump_path: Path to dump file (uses self._dump_path if not provided)
        limit: Optional maximum number of records to return
        require_enwiki: If True, only include locations with English Wikipedia articles
        skip_ids: Optional set of source_ids (Q codes) to skip
        start_index: Entity index to start from (for resume support)
        progress_callback: Optional callback(entity_index, entity_id, records_yielded),
            invoked just before each record is yielded

    Yields:
        LocationRecord for each qualifying location
    """
    source_path = dump_path or self._dump_path
    count = 0
    skipped_existing = 0
    position = start_index

    logger.info("Starting location import from Wikidata dump...")
    if start_index > 0:
        logger.info(f"Resuming from entity index {start_index:,}")
    if not require_enwiki:
        logger.info("Importing ALL locations (no enwiki filter)")
    if skip_ids:
        logger.info(f"Skipping {len(skip_ids):,} existing Q codes")

    def remember_position(entity_index: int, entity_id: str) -> None:
        # iter_entities reports its position here; keep the latest index for
        # the caller-facing progress callback.
        nonlocal position
        position = entity_index

    entities = self.iter_entities(
        source_path,
        start_index=start_index,
        progress_callback=remember_position,
    )
    for entity in entities:
        if limit and count >= limit:
            break

        # Cheap ID-based skip before doing any real processing
        entity_id = entity.get("id", "")
        if skip_ids and entity_id in skip_ids:
            skipped_existing += 1
            continue

        record = self._process_location_entity(entity, require_enwiki=require_enwiki)
        if not record:
            continue

        count += 1
        if count % 10_000 == 0:
            logger.info(f"Yielded {count:,} location records (skipped {skipped_existing:,})...")

        # Report the current dump position together with the running total
        if progress_callback:
            progress_callback(position, entity_id, count)

        yield record

    logger.info(f"Location import complete: {count:,} records (skipped {skipped_existing:,})")
1996
+
1666
1997
  def _get_string_claim(self, claims: dict, prop: str) -> str:
1667
1998
  """
1668
1999
  Get first string value for a property.
@@ -1049,6 +1049,12 @@ class WikidataPeopleImporter:
1049
1049
  best_result = result
1050
1050
 
1051
1051
  if best_result:
1052
+ # If we have a role but no org, try P108 (employer) as fallback
1053
+ role_label, org_label, org_qid, from_date, to_date = best_result
1054
+ if role_label and not org_label:
1055
+ fallback_org, fallback_org_qid = self._get_employer(person_qid)
1056
+ if fallback_org:
1057
+ return role_label, fallback_org, fallback_org_qid, from_date, to_date
1052
1058
  return best_result
1053
1059
 
1054
1060
  return "", "", "", None, None
@@ -1057,6 +1063,44 @@ class WikidataPeopleImporter:
1057
1063
  logger.debug(f"Failed to enrich role/org for {person_qid}: {e}")
1058
1064
  return "", "", "", None, None
1059
1065
 
1066
+ def _get_employer(self, person_qid: str) -> tuple[str, str]:
1067
+ """
1068
+ Query P108 (employer) as fallback for org.
1069
+
1070
+ Args:
1071
+ person_qid: Wikidata QID of the person
1072
+
1073
+ Returns:
1074
+ Tuple of (org_label, org_qid) or ("", "") if not found
1075
+ """
1076
+ query = """
1077
+ SELECT ?org ?orgLabel WHERE {
1078
+ wd:%s wdt:P108 ?org .
1079
+ ?org rdfs:label ?orgLabel FILTER(LANG(?orgLabel) = "en") .
1080
+ }
1081
+ LIMIT 1
1082
+ """ % person_qid
1083
+
1084
+ try:
1085
+ url = f"{WIKIDATA_SPARQL_URL}?query={urllib.parse.quote(query)}&format=json"
1086
+ req = urllib.request.Request(url, headers={"User-Agent": "corp-extractor/1.0"})
1087
+
1088
+ with urllib.request.urlopen(req, timeout=15) as response:
1089
+ data = json.loads(response.read().decode("utf-8"))
1090
+
1091
+ bindings = data.get("results", {}).get("bindings", [])
1092
+ if bindings:
1093
+ org_label = bindings[0].get("orgLabel", {}).get("value", "")
1094
+ org_uri = bindings[0].get("org", {}).get("value", "")
1095
+ org_qid = org_uri.split("/")[-1] if org_uri else ""
1096
+ if org_label and not org_label.startswith("Q"):
1097
+ return org_label, org_qid
1098
+
1099
+ except Exception as e:
1100
+ logger.debug(f"Failed to get employer for {person_qid}: {e}")
1101
+
1102
+ return "", ""
1103
+
1060
1104
  def enrich_people_role_org_batch(
1061
1105
  self,
1062
1106
  people: list[PersonRecord],