corp-extractor 0.5.0__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
  2. corp_extractor-0.9.3.dist-info/RECORD +79 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +2030 -24
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +428 -0
  9. statement_extractor/database/importers/__init__.py +32 -0
  10. statement_extractor/database/importers/companies_house.py +559 -0
  11. statement_extractor/database/importers/companies_house_officers.py +431 -0
  12. statement_extractor/database/importers/gleif.py +561 -0
  13. statement_extractor/database/importers/sec_edgar.py +392 -0
  14. statement_extractor/database/importers/sec_form4.py +512 -0
  15. statement_extractor/database/importers/wikidata.py +1120 -0
  16. statement_extractor/database/importers/wikidata_dump.py +1951 -0
  17. statement_extractor/database/importers/wikidata_people.py +1130 -0
  18. statement_extractor/database/models.py +254 -0
  19. statement_extractor/database/resolver.py +245 -0
  20. statement_extractor/database/store.py +3034 -0
  21. statement_extractor/document/__init__.py +62 -0
  22. statement_extractor/document/chunker.py +410 -0
  23. statement_extractor/document/context.py +171 -0
  24. statement_extractor/document/deduplicator.py +171 -0
  25. statement_extractor/document/html_extractor.py +246 -0
  26. statement_extractor/document/loader.py +303 -0
  27. statement_extractor/document/pipeline.py +388 -0
  28. statement_extractor/document/summarizer.py +195 -0
  29. statement_extractor/extractor.py +1 -1
  30. statement_extractor/models/__init__.py +19 -3
  31. statement_extractor/models/canonical.py +44 -1
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/labels.py +47 -18
  34. statement_extractor/models/qualifiers.py +51 -3
  35. statement_extractor/models/statement.py +39 -15
  36. statement_extractor/models.py +1 -1
  37. statement_extractor/pipeline/config.py +6 -11
  38. statement_extractor/pipeline/context.py +5 -5
  39. statement_extractor/pipeline/orchestrator.py +90 -121
  40. statement_extractor/pipeline/registry.py +52 -46
  41. statement_extractor/plugins/__init__.py +20 -8
  42. statement_extractor/plugins/base.py +348 -78
  43. statement_extractor/plugins/extractors/gliner2.py +38 -28
  44. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  45. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  46. statement_extractor/plugins/pdf/__init__.py +10 -0
  47. statement_extractor/plugins/pdf/pypdf.py +291 -0
  48. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  49. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  50. statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
  51. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  52. statement_extractor/plugins/qualifiers/person.py +588 -14
  53. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  54. statement_extractor/plugins/scrapers/__init__.py +10 -0
  55. statement_extractor/plugins/scrapers/http.py +236 -0
  56. statement_extractor/plugins/splitters/t5_gemma.py +176 -75
  57. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  58. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  59. statement_extractor/scoring.py +8 -8
  60. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  61. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  62. statement_extractor/plugins/canonicalizers/base.py +0 -9
  63. statement_extractor/plugins/canonicalizers/location.py +0 -219
  64. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  65. statement_extractor/plugins/canonicalizers/person.py +0 -242
  66. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
  67. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
statement_extractor/database/importers/wikidata_people.py
@@ -0,0 +1,1130 @@
+ """
+ Wikidata importer for the person database.
+
+ Imports notable people data from Wikidata using SPARQL queries
+ into the embedding database for person name matching.
+
+ Uses a two-phase approach for reliability:
+ 1. Bulk fetch: Simple queries to get QID + name + country (fast, no timeouts)
+ 2. Enrich: Targeted per-person queries for role/org/dates (resumable)
+
+ Notable people are those with English Wikipedia articles, ensuring
+ a basic level of notability.
+
+ Query categories (organized by PersonType):
+ - executives: Business executives (CEOs, CFOs, etc.)
+ - politicians: Politicians and diplomats
+ - athletes: Sports figures
+ - artists: Actors, musicians, directors
+ - academics: Professors and researchers
+ - scientists: Scientists and inventors
+ - journalists: Media personalities
+ - entrepreneurs: Founders and business owners
+
+ Uses the public Wikidata Query Service endpoint.
+ """
+
+ import json
+ import logging
+ import time
+ import urllib.parse
+ import urllib.request
+ from typing import Any, Iterator, Optional
+
+ from ..models import CompanyRecord, EntityType, PersonRecord, PersonType
+
+ logger = logging.getLogger(__name__)
+
+ # Wikidata SPARQL endpoint
+ WIKIDATA_SPARQL_URL = "https://query.wikidata.org/sparql"
+
+ # =============================================================================
+ # BULK QUERIES - Simple, fast queries for initial import (no role/org/dates)
+ # Uses rdfs:label instead of SERVICE wikibase:label for better performance
+ # Each query targets a single role/occupation for speed
+ # =============================================================================
+
+ # Template for position-held queries (P39) - for executives, politicians
+ # Matches people who held a position that IS the role, or is an INSTANCE OF the role
+ # {role_qid} = single role QID, {seed} = unique seed, {limit} = batch limit
+ POSITION_QUERY_TEMPLATE = """
+ SELECT DISTINCT ?person ?personLabel ?countryLabel ?description WHERE {{
+   ?person wdt:P31 wd:Q5 .
+   ?person wdt:P39 ?position .
+   {{ ?position wdt:P31 wd:{role_qid} . }} UNION {{ VALUES ?position {{ wd:{role_qid} }} }}
+   ?person rdfs:label ?personLabel FILTER(LANG(?personLabel) = "en") .
+   OPTIONAL {{ ?person wdt:P27 ?country . ?country rdfs:label ?countryLabel FILTER(LANG(?countryLabel) = "en") . }}
+   OPTIONAL {{ ?person schema:description ?description FILTER(LANG(?description) = "en") }}
+   ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
+ }}
+ ORDER BY MD5(CONCAT(STR(?person), "{seed}"))
+ LIMIT {limit}
+ """
+
+ # Template for occupation queries (P106) - for athletes, artists, etc.
+ # {occupation_qid} = single occupation QID, {seed} = unique seed, {limit} = batch limit
+ OCCUPATION_QUERY_TEMPLATE = """
+ SELECT DISTINCT ?person ?personLabel ?countryLabel ?description WHERE {{
+   ?person wdt:P31 wd:Q5 .
+   ?person wdt:P106 wd:{occupation_qid} .
+   ?person rdfs:label ?personLabel FILTER(LANG(?personLabel) = "en") .
+   OPTIONAL {{ ?person wdt:P27 ?country . ?country rdfs:label ?countryLabel FILTER(LANG(?countryLabel) = "en") . }}
+   OPTIONAL {{ ?person schema:description ?description FILTER(LANG(?description) = "en") }}
+   ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
+ }}
+ ORDER BY MD5(CONCAT(STR(?person), "{seed}"))
+ LIMIT {limit}
+ """
+
+ # Template for founder queries (P112) - for entrepreneurs
+ # {seed} = unique seed, {limit} = batch limit
+ FOUNDER_QUERY_TEMPLATE = """
+ SELECT DISTINCT ?person ?personLabel ?countryLabel ?description WHERE {{
+   ?person wdt:P31 wd:Q5 .
+   ?org wdt:P112 ?person .
+   ?person rdfs:label ?personLabel FILTER(LANG(?personLabel) = "en") .
+   OPTIONAL {{ ?person wdt:P27 ?country . ?country rdfs:label ?countryLabel FILTER(LANG(?countryLabel) = "en") . }}
+   OPTIONAL {{ ?person schema:description ?description FILTER(LANG(?description) = "en") }}
+   ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
+ }}
+ ORDER BY MD5(CONCAT(STR(?person), "{seed}"))
+ LIMIT {limit}
+ """
+
+ # Role QIDs for executives (position held - P39)
+ EXECUTIVE_ROLES = [
+     "Q484876",    # CEO
+     "Q623279",    # CFO
+     "Q1502675",   # COO
+     "Q935019",    # CTO
+     "Q1057716",   # CIO
+     "Q2140589",   # CMO
+     "Q1115042",   # chairperson
+     "Q4720025",   # board of directors member
+     "Q60432825",  # chief human resources officer
+     "Q15967139",  # chief compliance officer
+     "Q15729310",  # chief risk officer
+     "Q47523568",  # chief legal officer
+     "Q258557",    # board chair
+     "Q114863313", # chief sustainability officer
+     "Q726114",    # company president
+     "Q1372944",   # managing director
+     "Q18918145",  # chief commercial officer
+     "Q1057569",   # chief strategy officer
+     "Q24058752",  # chief product officer
+     "Q3578048",   # vice president
+     "Q476675",    # business executive (generic)
+     "Q5441744",   # finance director
+     "Q4188234",   # general manager
+     "Q38844673",  # chief data officer
+     "Q97273203",  # chief digital officer
+     "Q60715311",  # chief growth officer
+     "Q3563879",   # treasurer
+     "Q3505845",   # corporate secretary
+ ]
+
+ # Role QIDs for politicians (position held - P39)
+ POLITICIAN_ROLES = [
+     "Q30461",     # president
+     "Q14212",     # prime minister
+     "Q83307",     # minister
+     "Q2285706",   # head of government
+     "Q4175034",   # legislator
+     "Q486839",    # member of parliament
+     "Q193391",    # member of national legislature
+     "Q212071",    # mayor
+     "Q382617",    # governor
+     "Q116",       # monarch
+     "Q484529",    # member of congress
+ ]
+
+ # Note: Politicians with generic position types (like "public office") may not be found
+ # because querying all public office holders times out. This includes some mayors
+ # whose positions are typed as "public office" rather than "mayor".
+
+ # Occupation QIDs for athletes (P106)
+ ATHLETE_OCCUPATIONS = [
+     "Q2066131",   # athlete
+     "Q937857",    # football player
+     "Q3665646",   # basketball player
+     "Q10871364",  # baseball player
+     "Q19204627",  # ice hockey player
+     "Q10843402",  # tennis player
+     "Q13381376",  # golfer
+     "Q11338576",  # boxer
+     "Q10873124",  # swimmer
+ ]
+
+ # Occupation QIDs for artists (P106)
+ ARTIST_OCCUPATIONS = [
+     "Q33999",     # actor
+     "Q177220",    # singer
+     "Q639669",    # musician
+     "Q2526255",   # film director
+     "Q36180",     # writer
+     "Q483501",    # artist
+     "Q488205",    # singer-songwriter
+     "Q753110",    # songwriter
+     "Q2405480",   # voice actor
+     "Q10800557",  # film actor
+ ]
+
+ # Occupation QIDs for academics (P106)
+ ACADEMIC_OCCUPATIONS = [
+     "Q121594",    # professor
+     "Q3400985",   # academic
+     "Q1622272",   # university professor
+ ]
+
+ # Occupation QIDs for scientists (P106)
+ SCIENTIST_OCCUPATIONS = [
+     "Q901",       # scientist
+     "Q1650915",   # researcher
+     "Q169470",    # physicist
+     "Q593644",    # chemist
+     "Q864503",    # biologist
+     "Q11063",     # astronomer
+ ]
+
+ # Occupation QIDs for journalists (P106)
+ JOURNALIST_OCCUPATIONS = [
+     "Q1930187",   # journalist
+     "Q13590141",  # news presenter
+     "Q947873",    # television presenter
+     "Q4263842",   # columnist
+ ]
+
+ # Occupation QIDs for activists (P106)
+ ACTIVIST_OCCUPATIONS = [
+     "Q15253558",  # activist
+     "Q11631410",  # human rights activist
+     "Q18939491",  # environmental activist
+ ]
+
+ # Mapping query type to role/occupation lists and query template type
+ # Each entry can have multiple query groups to combine different approaches
+ QUERY_TYPE_CONFIG: dict[str, list[dict]] = {
+     "executive": [
+         {"template": "position", "items": EXECUTIVE_ROLES},
+     ],
+     "politician": [
+         {"template": "position", "items": POLITICIAN_ROLES},
+     ],
+     "athlete": [
+         {"template": "occupation", "items": ATHLETE_OCCUPATIONS},
+     ],
+     "artist": [
+         {"template": "occupation", "items": ARTIST_OCCUPATIONS},
+     ],
+     "academic": [
+         {"template": "occupation", "items": ACADEMIC_OCCUPATIONS},
+     ],
+     "scientist": [
+         {"template": "occupation", "items": SCIENTIST_OCCUPATIONS},
+     ],
+     "journalist": [
+         {"template": "occupation", "items": JOURNALIST_OCCUPATIONS},
+     ],
+     "activist": [
+         {"template": "occupation", "items": ACTIVIST_OCCUPATIONS},
+     ],
+     "entrepreneur": [
+         {"template": "founder", "items": []},  # No items, uses special template
+     ],
+ }
+
+ # Mapping query type to PersonType
+ QUERY_TYPE_TO_PERSON_TYPE: dict[str, PersonType] = {
+     "executive": PersonType.EXECUTIVE,
+     "politician": PersonType.POLITICIAN,
+     "athlete": PersonType.ATHLETE,
+     "artist": PersonType.ARTIST,
+     "academic": PersonType.ACADEMIC,
+     "scientist": PersonType.SCIENTIST,
+     "journalist": PersonType.JOURNALIST,
+     "entrepreneur": PersonType.ENTREPRENEUR,
+     "activist": PersonType.ACTIVIST,
+ }
+
+
+ class WikidataPeopleImporter:
+     """
+     Importer for Wikidata person data.
+
+     Uses SPARQL queries against the public Wikidata Query Service
+     to fetch notable people including executives, politicians, athletes, etc.
+
+     Query types:
+     - executive: Business executives (CEOs, CFOs, etc.)
+     - politician: Politicians and diplomats
+     - athlete: Sports figures
+     - artist: Actors, musicians, directors, writers
+     - academic: Professors and researchers
+     - scientist: Scientists and inventors
+     - journalist: Media personalities
+     - entrepreneur: Company founders
+     - activist: Activists and advocates
+     """
+
+     def __init__(
+         self,
+         batch_size: int = 5000,
+         delay_seconds: float = 2.0,
+         timeout: int = 120,
+         max_retries: int = 3,
+         min_batch_size: int = 50,
+     ):
+         """
+         Initialize the Wikidata people importer.
+
+         Args:
+             batch_size: Number of records to fetch per SPARQL query (default 5000)
+             delay_seconds: Delay between requests to be polite to the endpoint
+             timeout: HTTP timeout in seconds (default 120)
+             max_retries: Maximum retries per batch on timeout (default 3)
+             min_batch_size: Minimum batch size before giving up (default 50)
+         """
+         self._batch_size = batch_size
+         self._delay = delay_seconds
+         self._timeout = timeout
+         self._max_retries = max_retries
+         self._min_batch_size = min_batch_size
+         # Track discovered organizations: org_qid -> org_label
+         self._discovered_orgs: dict[str, str] = {}
+
+     def import_from_sparql(
+         self,
+         limit: Optional[int] = None,
+         query_type: str = "executive",
+         import_all: bool = False,
+         convergence_threshold: int = 5,
+     ) -> Iterator[PersonRecord]:
+         """
+         Import person records from Wikidata via SPARQL (bulk fetch phase).
+
+         This performs the fast bulk import with minimal data (QID, name, country).
+         Use enrich_people_batch() afterwards to add role/org/dates.
+
+         Iterates through each role/occupation individually for faster queries,
+         using random sampling with convergence detection per role.
+
+         Args:
+             limit: Optional limit on total records
+             query_type: Which query to use (executive, politician, athlete, etc.)
+             import_all: If True, run all query types sequentially
+             convergence_threshold: Stop after this many consecutive batches with no new records per role
+
+         Yields:
+             PersonRecord for each person (without role/org - use enrich to add)
+         """
+         if import_all:
+             yield from self._import_all_types(limit)
+             return
+
+         if query_type not in QUERY_TYPE_CONFIG:
+             raise ValueError(f"Unknown query type: {query_type}. Use one of: {list(QUERY_TYPE_CONFIG.keys())}")
+
+         config_groups = QUERY_TYPE_CONFIG[query_type]
+         person_type = QUERY_TYPE_TO_PERSON_TYPE.get(query_type, PersonType.UNKNOWN)
+
+         logger.info(f"Starting Wikidata bulk import (query_type={query_type}, person_type={person_type.value})...")
+
+         total_count = 0
+         # Track seen QIDs to deduplicate across all roles
+         seen_qids: set[str] = set()
+
+         # Iterate through each config group (e.g., position queries + occupation queries)
+         for config in config_groups:
+             if limit and total_count >= limit:
+                 break
+
+             template_type = config["template"]
+             items = config["items"]
+
+             # For founder template, run a single query
+             if template_type == "founder":
+                 for record in self._import_single_template(
+                     template=FOUNDER_QUERY_TEMPLATE,
+                     template_params={},
+                     person_type=person_type,
+                     seen_qids=seen_qids,
+                     limit=(limit - total_count) if limit else None,
+                     convergence_threshold=convergence_threshold,
+                     role_name="founder",
+                 ):
+                     total_count += 1
+                     yield record
+                 continue
+
+             # Select the right template
+             if template_type == "position":
+                 template = POSITION_QUERY_TEMPLATE
+                 param_name = "role_qid"
+             else:  # occupation
+                 template = OCCUPATION_QUERY_TEMPLATE
+                 param_name = "occupation_qid"
+
+             # Iterate through each role/occupation in this group
+             for item_qid in items:
+                 if limit and total_count >= limit:
+                     break
+
+                 remaining = (limit - total_count) if limit else None
+                 role_count = 0
+
+                 for record in self._import_single_template(
+                     template=template,
+                     template_params={param_name: item_qid},
+                     person_type=person_type,
+                     seen_qids=seen_qids,
+                     limit=remaining,
+                     convergence_threshold=convergence_threshold,
+                     role_name=item_qid,
+                 ):
+                     role_count += 1
+                     total_count += 1
+                     yield record
+
+                 logger.info(f"Role {item_qid}: {role_count} new (total: {total_count})")
+
+         logger.info(f"Completed Wikidata bulk import: {total_count} records (use enrich to add role/org)")
+
+     def _import_single_template(
+         self,
+         template: str,
+         template_params: dict[str, str],
+         person_type: PersonType,
+         seen_qids: set[str],
+         limit: Optional[int],
+         convergence_threshold: int,
+         role_name: str,
+     ) -> Iterator[PersonRecord]:
+         """
+         Import from a single role/occupation using random sampling with convergence.
+
+         Args:
+             template: SPARQL query template
+             template_params: Parameters to format into template (role_qid or occupation_qid)
+             person_type: PersonType to assign to records
+             seen_qids: Set of already-seen QIDs (shared across roles)
+             limit: Optional limit on records from this role
+             convergence_threshold: Stop after this many consecutive empty batches
+             role_name: Name for logging
+
+         Yields:
+             PersonRecord for each new person found
+         """
+         batch_num = 0
+         total_count = 0
+         current_batch_size = self._batch_size
+         consecutive_empty_batches = 0
+
+         logger.info(f"Querying role {role_name}...")
+
+         while True:
+             if limit and total_count >= limit:
+                 break
+
+             batch_num += 1
+             batch_limit = min(current_batch_size, (limit - total_count) if limit else current_batch_size)
+
+             # Generate unique seed for this batch
+             batch_seed = f"{role_name}_{batch_num}_{int(time.time() * 1000)}"
+
+             # Build query
+             query = template.format(
+                 **template_params,
+                 seed=batch_seed,
+                 limit=batch_limit,
+             )
+
+             # Execute with retries
+             results = None
+             retries = 0
+             retry_batch_size = batch_limit
+
+             while retries <= self._max_retries:
+                 try:
+                     # Rebuild query with potentially smaller batch size
+                     if retry_batch_size != batch_limit:
+                         query = template.format(
+                             **template_params,
+                             seed=batch_seed,
+                             limit=retry_batch_size,
+                         )
+                     results = self._execute_sparql(query)
+                     if retry_batch_size < current_batch_size:
+                         current_batch_size = retry_batch_size
+                     break
+                 except Exception as e:
+                     is_timeout = "timeout" in str(e).lower() or "504" in str(e) or "503" in str(e)
+                     if is_timeout and retry_batch_size > self._min_batch_size:
+                         retries += 1
+                         retry_batch_size = max(retry_batch_size // 2, self._min_batch_size)
+                         wait_time = self._delay * (2 ** retries)
+                         logger.warning(
+                             f"Timeout on {role_name} batch #{batch_num}, retry {retries}/{self._max_retries} "
+                             f"with batch_size={retry_batch_size} after {wait_time:.1f}s wait"
+                         )
+                         time.sleep(wait_time)
+                     else:
+                         logger.error(f"SPARQL query failed on {role_name} batch #{batch_num}: {e}")
+                         break
+
+             if results is None:
+                 logger.warning(f"Giving up on {role_name} after {retries} retries")
+                 break
+
+             bindings = results.get("results", {}).get("bindings", [])
+
+             if not bindings:
+                 consecutive_empty_batches += 1
+                 if consecutive_empty_batches >= convergence_threshold:
+                     logger.debug(f"Role {role_name}: convergence after {batch_num} batches")
+                     break
+                 continue
+
+             batch_count = 0
+             for binding in bindings:
+                 if limit and total_count >= limit:
+                     break
+
+                 record, skip_reason = self._parse_bulk_binding(binding, person_type=person_type)
+                 if record is None:
+                     continue
+
+                 # Deduplicate
+                 if record.source_id in seen_qids:
+                     continue
+
+                 seen_qids.add(record.source_id)
+                 total_count += 1
+                 batch_count += 1
+                 yield record
+
+             # Check convergence
+             if batch_count == 0:
+                 consecutive_empty_batches += 1
+                 if consecutive_empty_batches >= convergence_threshold:
+                     logger.debug(f"Role {role_name}: convergence after {batch_num} batches")
+                     break
+             else:
+                 consecutive_empty_batches = 0
+
+             # Rate limit
+             if self._delay > 0:
+                 time.sleep(self._delay)
+
+     def _import_all_types(self, limit: Optional[int]) -> Iterator[PersonRecord]:
+         """Import from all query types sequentially, deduplicating across types."""
+         # Track seen QIDs across all types
+         seen_qids: set[str] = set()
+         total_count = 0
+
+         # Calculate per-type limits if a total limit is set
+         num_types = len(QUERY_TYPE_CONFIG)
+         per_type_limit = limit // num_types if limit else None
+
+         for query_type in QUERY_TYPE_CONFIG:
+             logger.info(f"=== Importing people: {query_type} ===")
+             type_count = 0
+             skipped_count = 0
+
+             for record in self.import_from_sparql(limit=per_type_limit, query_type=query_type):
+                 if record.source_id in seen_qids:
+                     skipped_count += 1
+                     continue
+
+                 seen_qids.add(record.source_id)
+                 total_count += 1
+                 type_count += 1
+                 yield record
+
+                 if limit and total_count >= limit:
+                     logger.info(f"Reached total limit of {limit} records")
+                     return
+
+             logger.info(
+                 f"Got {type_count} new from {query_type}, skipped {skipped_count} (total: {total_count})"
+             )
+
+         logger.info(f"Completed all query types: {total_count} total people records")
+
+     @staticmethod
+     def _parse_wikidata_date(date_str: str) -> Optional[str]:
+         """
+         Parse a Wikidata date string into ISO format (YYYY-MM-DD).
+
+         Wikidata returns dates like "2020-01-15T00:00:00Z" or just "2020".
+         Returns None if the date cannot be parsed.
+         """
+         if not date_str:
+             return None
+         # Handle ISO datetime format (e.g., "2020-01-15T00:00:00Z")
+         if "T" in date_str:
+             return date_str.split("T")[0]
+         # Handle year-only format (e.g., "2020")
+         if len(date_str) == 4 and date_str.isdigit():
+             return f"{date_str}-01-01"
+         # Return as-is if it looks like a date
+         if len(date_str) >= 4:
+             return date_str[:10]  # Take first 10 chars (YYYY-MM-DD)
+         return None
+
+     def _execute_sparql(self, query: str) -> dict[str, Any]:
+         """Execute a SPARQL query against Wikidata."""
+         params = urllib.parse.urlencode({
+             "query": query,
+             "format": "json",
+         })
+
+         url = f"{WIKIDATA_SPARQL_URL}?{params}"
+
+         req = urllib.request.Request(
+             url,
+             headers={
+                 "Accept": "application/sparql-results+json",
+                 "User-Agent": "corp-extractor/1.0 (person database builder)",
+             },
+         )
+
+         with urllib.request.urlopen(req, timeout=self._timeout) as response:
+             return json.loads(response.read().decode("utf-8"))
+
+     def _parse_bulk_binding(
+         self,
+         binding: dict[str, Any],
+         person_type: PersonType = PersonType.UNKNOWN,
+     ) -> tuple[Optional[PersonRecord], Optional[str]]:
+         """
+         Parse a bulk SPARQL result binding into a PersonRecord.
+
+         Bulk bindings only have: person, personLabel, countryLabel, description.
+         Role/org/dates are NOT included - use enrich methods to add them later.
+
+         Returns:
+             Tuple of (PersonRecord or None, skip_reason or None)
+         """
+         try:
+             # Get Wikidata entity ID
+             person_uri = binding.get("person", {}).get("value", "")
+             if not person_uri:
+                 return None, "missing person URI"
+
+             # Extract QID from URI (e.g., "http://www.wikidata.org/entity/Q312" -> "Q312")
+             wikidata_id = person_uri.split("/")[-1]
+             if not wikidata_id.startswith("Q"):
+                 return None, f"invalid Wikidata ID format: {wikidata_id}"
+
+             # Get label
+             label = binding.get("personLabel", {}).get("value", "")
+             if not label:
+                 return None, f"{wikidata_id}: no label"
+             if label == wikidata_id:
+                 return None, f"{wikidata_id}: no English label (label equals QID)"
+
+             # Get optional fields from bulk query
+             country = binding.get("countryLabel", {}).get("value", "")
+             description = binding.get("description", {}).get("value", "")
+
+             # Build minimal record data
+             record_data: dict[str, Any] = {
+                 "wikidata_id": wikidata_id,
+                 "label": label,
+             }
+             if country:
+                 record_data["country"] = country
+             if description:
+                 record_data["description"] = description
+
+             return PersonRecord(
+                 name=label.strip(),
+                 source="wikidata",
+                 source_id=wikidata_id,
+                 country=country or "",
+                 person_type=person_type,
+                 known_for_role="",  # To be enriched later
+                 known_for_org="",  # To be enriched later
+                 from_date=None,  # To be enriched later
+                 to_date=None,  # To be enriched later
+                 record=record_data,
+             ), None
+
+         except Exception as e:
+             return None, f"parse error: {e}"
+
+     def _parse_binding_with_reason(
+         self,
+         binding: dict[str, Any],
+         person_type: PersonType = PersonType.UNKNOWN,
+     ) -> tuple[Optional[PersonRecord], Optional[str]]:
+         """
+         Parse a SPARQL result binding into a PersonRecord.
+
+         Returns:
+             Tuple of (PersonRecord or None, skip_reason or None)
+         """
+         try:
+             # Get Wikidata entity ID
+             person_uri = binding.get("person", {}).get("value", "")
+             if not person_uri:
+                 return None, "missing person URI"
+
+             # Extract QID from URI (e.g., "http://www.wikidata.org/entity/Q312" -> "Q312")
+             wikidata_id = person_uri.split("/")[-1]
+             if not wikidata_id.startswith("Q"):
+                 return None, f"invalid Wikidata ID format: {wikidata_id}"
+
+             # Get label
+             label = binding.get("personLabel", {}).get("value", "")
+             if not label:
+                 return None, f"{wikidata_id}: no label"
+             if label == wikidata_id:
+                 return None, f"{wikidata_id}: no English label (label equals QID)"
+
+             # Get optional fields
+             country = binding.get("countryLabel", {}).get("value", "")
+             role = binding.get("roleLabel", {}).get("value", "")
+             org_label = binding.get("orgLabel", {}).get("value", "")
+             org_uri = binding.get("org", {}).get("value", "")
+             description = binding.get("description", {}).get("value", "")
+
+             # Extract org QID from URI (e.g., "http://www.wikidata.org/entity/Q715583" -> "Q715583")
+             org_qid = ""
+             if org_uri:
+                 org_qid = org_uri.split("/")[-1]
+                 if not org_qid.startswith("Q"):
+                     org_qid = ""
+
+             # Get dates (Wikidata returns ISO datetime, extract just the date part)
+             start_date_raw = binding.get("startDate", {}).get("value", "")
+             end_date_raw = binding.get("endDate", {}).get("value", "")
+             from_date = WikidataPeopleImporter._parse_wikidata_date(start_date_raw)
+             to_date = WikidataPeopleImporter._parse_wikidata_date(end_date_raw)
+
+             # Clear role/org labels that did not resolve (the label is still a bare QID)
+             if role and role.startswith("Q"):
+                 role = ""
+             if org_label and org_label.startswith("Q"):
+                 org_label = ""
+
+             # Track discovered organization if we have both QID and label
+             if org_qid and org_label:
+                 self._discovered_orgs[org_qid] = org_label
+
+             # Build record data
+             record_data: dict[str, Any] = {
+                 "wikidata_id": wikidata_id,
+                 "label": label,
+             }
+             if country:
+                 record_data["country"] = country
+             if role:
+                 record_data["role"] = role
+             if org_label:
+                 record_data["org"] = org_label
+             if org_qid:
+                 record_data["org_qid"] = org_qid
+             if description:
+                 record_data["description"] = description
+             if from_date:
+                 record_data["from_date"] = from_date
+             if to_date:
+                 record_data["to_date"] = to_date
+
+             return PersonRecord(
+                 name=label.strip(),
+                 source="wikidata",
+                 source_id=wikidata_id,
+                 country=country or "",
+                 person_type=person_type,
+                 known_for_role=role or "",
+                 known_for_org=org_label or "",
+                 from_date=from_date,
+                 to_date=to_date,
+                 record=record_data,
+             ), None
+
+         except Exception as e:
+             return None, f"parse error: {e}"
+
+     def _parse_binding(
+         self,
+         binding: dict[str, Any],
+         person_type: PersonType = PersonType.UNKNOWN,
+     ) -> Optional[PersonRecord]:
+         """Parse a SPARQL result binding into a PersonRecord (legacy wrapper)."""
+         record, _ = self._parse_binding_with_reason(binding, person_type)
+         return record
+
+     def search_person(self, name: str, limit: int = 10) -> list[PersonRecord]:
+         """
+         Search for a specific person by name.
+
+         Args:
+             name: Person name to search for
+             limit: Maximum results to return
+
+         Returns:
+             List of matching PersonRecords
+         """
+         # Use Wikidata search API for better name matching
+         search_url = "https://www.wikidata.org/w/api.php"
+         params = urllib.parse.urlencode({
+             "action": "wbsearchentities",
+             "search": name,
+             "language": "en",
+             "type": "item",
+             "limit": limit,
+             "format": "json",
+         })
+
+         req = urllib.request.Request(
+             f"{search_url}?{params}",
+             headers={"User-Agent": "corp-extractor/1.0"},
+         )
+
+         with urllib.request.urlopen(req, timeout=30) as response:
+             data = json.loads(response.read().decode("utf-8"))
+
+         results = []
+         for item in data.get("search", []):
+             qid = item.get("id")
+             label = item.get("label", "")
+             description = item.get("description", "")
+
+             # Check if it looks like a person
+             person_keywords = [
+                 "politician", "actor", "actress", "singer", "musician",
+                 "businessman", "businesswoman", "ceo", "executive", "director",
+                 "president", "founder", "professor", "scientist", "author",
+                 "writer", "journalist", "athlete", "player", "coach",
+             ]
+             description_lower = description.lower()
+             is_person = any(kw in description_lower for kw in person_keywords)
+             if not is_person:
+                 continue
+
+             # Try to infer person type from description
+             person_type = PersonType.UNKNOWN
+             if any(kw in description_lower for kw in ["ceo", "executive", "businessman", "businesswoman"]):
+                 person_type = PersonType.EXECUTIVE
+             elif any(kw in description_lower for kw in ["politician", "president", "senator", "minister"]):
+                 person_type = PersonType.POLITICIAN
+             elif any(kw in description_lower for kw in ["athlete", "player", "coach"]):
+                 person_type = PersonType.ATHLETE
+             elif any(kw in description_lower for kw in ["actor", "actress", "singer", "musician", "director"]):
+                 person_type = PersonType.ARTIST
+             elif any(kw in description_lower for kw in ["professor", "academic"]):
+                 person_type = PersonType.ACADEMIC
+             elif any(kw in description_lower for kw in ["scientist", "researcher"]):
+                 person_type = PersonType.SCIENTIST
+             elif any(kw in description_lower for kw in ["journalist", "reporter"]):
+                 person_type = PersonType.JOURNALIST
+             elif any(kw in description_lower for kw in ["founder", "entrepreneur"]):
+                 person_type = PersonType.ENTREPRENEUR
+
+             record = PersonRecord(
+                 name=label,
+                 source="wikidata",
+                 source_id=qid,
+                 country="",  # Not available from search API
+                 person_type=person_type,
+                 known_for_role="",
+                 known_for_org="",
+                 record={
+                     "wikidata_id": qid,
+                     "label": label,
+                     "description": description,
+                 },
+             )
+             results.append(record)
+
+         return results
+
+     def get_discovered_organizations(self) -> list[CompanyRecord]:
+         """
+         Get organizations discovered during the people import.
+
+         These are organizations associated with people (employers, positions, etc.)
+         that can be inserted into the organizations database if not already present.
+
+         Returns:
+             List of CompanyRecord objects for discovered organizations
+         """
+         records = []
+         for org_qid, org_label in self._discovered_orgs.items():
+             record = CompanyRecord(
+                 name=org_label,
+                 source="wikipedia",  # Use "wikipedia" as source per wikidata.py convention
+                 source_id=org_qid,
+                 region="",  # Not available from this context
+                 entity_type=EntityType.BUSINESS,  # Default to business for orgs linked to people
+                 record={
+                     "wikidata_id": org_qid,
+                     "label": org_label,
+                     "discovered_from": "people_import",
+                 },
+             )
+             records.append(record)
+         logger.info(f"Discovered {len(records)} organizations from people import")
+         return records
+
+     def clear_discovered_organizations(self) -> None:
+         """Clear the discovered organizations cache."""
+         self._discovered_orgs.clear()
+
+     def enrich_person_dates(self, person_qid: str, role: str = "", org: str = "") -> tuple[Optional[str], Optional[str]]:
+         """
+         Query Wikidata to get start/end dates for a person's position.
+
+         Args:
+             person_qid: Wikidata QID of the person (e.g., 'Q123')
+             role: Optional role label to match (e.g., 'chief executive officer')
+             org: Optional org label to match (e.g., 'Apple Inc')
+
+         Returns:
+             Tuple of (from_date, to_date) in ISO format, or (None, None) if not found
+         """
+         # Query for position dates for this specific person
+         # Uses rdfs:label instead of SERVICE wikibase:label for better performance
+         query = """
+         SELECT ?roleLabel ?orgLabel ?startDate ?endDate WHERE {
+           wd:%s p:P39 ?positionStatement .
+           ?positionStatement ps:P39 ?role .
+           ?role rdfs:label ?roleLabel FILTER(LANG(?roleLabel) = "en") .
+           OPTIONAL { ?positionStatement pq:P642 ?org . ?org rdfs:label ?orgLabel FILTER(LANG(?orgLabel) = "en") . }
+           OPTIONAL { ?positionStatement pq:P580 ?startDate }
+           OPTIONAL { ?positionStatement pq:P582 ?endDate }
+         }
+         LIMIT 50
+         """ % person_qid
+
+         try:
+             url = f"{WIKIDATA_SPARQL_URL}?query={urllib.parse.quote(query)}&format=json"
+             req = urllib.request.Request(url, headers={"User-Agent": "corp-extractor/1.0"})
+
+             with urllib.request.urlopen(req, timeout=30) as response:
+                 data = json.loads(response.read().decode("utf-8"))
+
+             # Find the best matching position
+             best_start = None
+             best_end = None
+
+             for binding in data.get("results", {}).get("bindings", []):
+                 role_label = binding.get("roleLabel", {}).get("value", "")
+                 org_label = binding.get("orgLabel", {}).get("value", "")
+                 start_raw = binding.get("startDate", {}).get("value", "")
+                 end_raw = binding.get("endDate", {}).get("value", "")
+
+                 # If role/org specified, try to match
+                 if role and role.lower() not in role_label.lower():
+                     continue
+                 if org and org.lower() not in org_label.lower():
+                     continue
+
+                 # Parse dates
+                 start_date = self._parse_wikidata_date(start_raw)
+                 end_date = self._parse_wikidata_date(end_raw)
+
+                 # Prefer entries with dates
+                 if start_date or end_date:
+                     best_start = start_date
+                     best_end = end_date
+                     break  # Found a match with dates
+
+             return best_start, best_end
+
+         except Exception as e:
+             logger.debug(f"Failed to enrich dates for {person_qid}: {e}")
+             return None, None
+
+     def enrich_people_batch(
+         self,
+         people: list[PersonRecord],
+         delay_seconds: float = 0.5,
+     ) -> int:
+         """
+         Enrich a batch of people with start/end dates.
+
+         Args:
+             people: List of PersonRecord objects to enrich
+             delay_seconds: Delay between requests
+
+         Returns:
+             Number of people enriched with dates
+         """
+         enriched_count = 0
+
+         for person in people:
+             if person.from_date or person.to_date:
+                 continue  # Already has dates
+
+             qid = person.source_id
+             role = person.known_for_role
+             org = person.known_for_org
+
+             from_date, to_date = self.enrich_person_dates(qid, role, org)
+
+             if from_date or to_date:
+                 person.from_date = from_date
+                 person.to_date = to_date
+                 enriched_count += 1
+                 logger.debug(f"Enriched {person.name}: {from_date} - {to_date}")
+
+             time.sleep(delay_seconds)
+
+         logger.info(f"Enriched {enriched_count}/{len(people)} people with dates")
+         return enriched_count
+
+     def enrich_person_role_org(
+         self, person_qid: str
+     ) -> tuple[str, str, str, Optional[str], Optional[str]]:
+         """
+         Query Wikidata to get role, org, and dates for a person.
+
+         Args:
+             person_qid: Wikidata QID of the person (e.g., 'Q123')
+
+         Returns:
+             Tuple of (role_label, org_label, org_qid, from_date, to_date)
+             Empty strings/None if not found
+         """
+         # Query for position held (P39) with org qualifier and dates
+         # Uses rdfs:label instead of SERVICE wikibase:label for better performance
+         query = """
+         SELECT ?roleLabel ?org ?orgLabel ?startDate ?endDate WHERE {
+           wd:%s p:P39 ?stmt .
+           ?stmt ps:P39 ?role .
+           ?role rdfs:label ?roleLabel FILTER(LANG(?roleLabel) = "en") .
+           OPTIONAL { ?stmt pq:P642 ?org . ?org rdfs:label ?orgLabel FILTER(LANG(?orgLabel) = "en") . }
+           OPTIONAL { ?stmt pq:P580 ?startDate . }
+           OPTIONAL { ?stmt pq:P582 ?endDate . }
+         }
+         LIMIT 5
+         """ % person_qid
+
+         try:
+             url = f"{WIKIDATA_SPARQL_URL}?query={urllib.parse.quote(query)}&format=json"
+             req = urllib.request.Request(url, headers={"User-Agent": "corp-extractor/1.0"})
+
+             with urllib.request.urlopen(req, timeout=30) as response:
+                 data = json.loads(response.read().decode("utf-8"))
+
+             bindings = data.get("results", {}).get("bindings", [])
+
+             # Find the best result (prefer one with org and dates)
+             best_result = None
+             for binding in bindings:
+                 role_label = binding.get("roleLabel", {}).get("value", "")
+                 org_label = binding.get("orgLabel", {}).get("value", "")
+                 org_uri = binding.get("org", {}).get("value", "")
+                 start_raw = binding.get("startDate", {}).get("value", "")
+                 end_raw = binding.get("endDate", {}).get("value", "")
+
+                 # Skip if role is just a QID (no label resolved)
+                 if role_label and role_label.startswith("Q"):
+                     continue
+                 if org_label and org_label.startswith("Q"):
+                     org_label = ""
+
+                 # Extract QID from URI
+                 org_qid = ""
+                 if org_uri:
+                     org_qid = org_uri.split("/")[-1]
+                     if not org_qid.startswith("Q"):
+                         org_qid = ""
+
+                 from_date = self._parse_wikidata_date(start_raw)
+                 to_date = self._parse_wikidata_date(end_raw)
+
+                 result = (role_label, org_label, org_qid, from_date, to_date)
+
+                 # Prefer results with org and dates
+                 if org_label and (from_date or to_date):
+                     return result
+                 elif org_label and best_result is None:
+                     best_result = result
+                 elif role_label and best_result is None:
+                     best_result = result
+
+             if best_result:
+                 return best_result
+
+             return "", "", "", None, None
+
+         except Exception as e:
+             logger.debug(f"Failed to enrich role/org for {person_qid}: {e}")
+             return "", "", "", None, None
+
+     def enrich_people_role_org_batch(
+         self,
+         people: list[PersonRecord],
+         delay_seconds: float = 0.1,
+         max_workers: int = 5,
+     ) -> int:
+         """
+         Enrich a batch of people with role/org/dates data using parallel queries.
+
+         Args:
+             people: List of PersonRecord objects to enrich
+             delay_seconds: Delay between requests (per worker)
+             max_workers: Number of parallel workers (default 5 for Wikidata rate limits)
+
+         Returns:
+             Number of people enriched with role/org
+         """
+         from concurrent.futures import ThreadPoolExecutor, as_completed
+
+         # Filter to people that need enrichment
+         to_enrich = [p for p in people if not p.known_for_role and not p.known_for_org]
+
+         if not to_enrich:
+             logger.info("No people need enrichment")
+             return 0
+
+         enriched_count = 0
+         total = len(to_enrich)
+
+         def enrich_one(person: PersonRecord) -> tuple[PersonRecord, bool]:
+             """Enrich a single person, returns (person, success)."""
+             try:
+                 role, org, org_qid, from_date, to_date = self.enrich_person_role_org(person.source_id)
+
+                 if role or org:
+                     person.known_for_role = role
+                     person.known_for_org = org
+                     if org_qid:
+                         person.record["org_qid"] = org_qid
+                     if from_date:
+                         person.from_date = from_date
+                     if to_date:
+                         person.to_date = to_date
+                     return person, True
+
+                 return person, False
+             except Exception as e:
+                 logger.debug(f"Failed to enrich {person.source_id}: {e}")
+                 return person, False
+
+         with ThreadPoolExecutor(max_workers=max_workers) as executor:
+             # Submit all tasks
+             futures = {executor.submit(enrich_one, person): person for person in to_enrich}
+
+             # Process results as they complete
+             completed = 0
+             for future in as_completed(futures):
+                 person, success = future.result()
+                 if success:
+                     enriched_count += 1
+                     logger.debug(f"Enriched {person.name}: {person.known_for_role} at {person.known_for_org}")
+
+                 completed += 1
+                 if completed % 100 == 0:
+                     logger.info(f"Enriched {completed}/{total} people ({enriched_count} with data)...")
+
+                 # Small delay to avoid rate limiting
+                 time.sleep(delay_seconds)
+
+         logger.info(f"Enriched {enriched_count}/{total} people with role/org/dates")
+         return enriched_count
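A minimal driver sketch for the two-phase flow implemented above (bulk fetch, then targeted enrichment). The import path follows the file list at the top of this diff; save_people is a hypothetical persistence hook, not part of the package:

    from statement_extractor.database.importers.wikidata_people import WikidataPeopleImporter

    importer = WikidataPeopleImporter(batch_size=5000, delay_seconds=2.0)

    # Phase 1: bulk fetch (QID + name + country only), deduplicated across roles.
    people = list(importer.import_from_sparql(query_type="executive", limit=1000))

    # Phase 2: targeted per-person queries add role/org/dates in parallel.
    importer.enrich_people_role_org_batch(people, delay_seconds=0.1, max_workers=5)

    # Organizations tracked while parsing full bindings (may be empty on the bulk path)
    # can seed the company database.
    orgs = importer.get_discovered_organizations()

    save_people(people)  # hypothetical: persist via the package's embedding store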