corp-extractor 0.4.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +10 -1
  4. statement_extractor/cli.py +1663 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +6972 -0
  7. statement_extractor/database/__init__.py +52 -0
  8. statement_extractor/database/embeddings.py +186 -0
  9. statement_extractor/database/hub.py +520 -0
  10. statement_extractor/database/importers/__init__.py +24 -0
  11. statement_extractor/database/importers/companies_house.py +545 -0
  12. statement_extractor/database/importers/gleif.py +538 -0
  13. statement_extractor/database/importers/sec_edgar.py +375 -0
  14. statement_extractor/database/importers/wikidata.py +1012 -0
  15. statement_extractor/database/importers/wikidata_people.py +632 -0
  16. statement_extractor/database/models.py +230 -0
  17. statement_extractor/database/resolver.py +245 -0
  18. statement_extractor/database/store.py +1609 -0
  19. statement_extractor/document/__init__.py +62 -0
  20. statement_extractor/document/chunker.py +410 -0
  21. statement_extractor/document/context.py +171 -0
  22. statement_extractor/document/deduplicator.py +173 -0
  23. statement_extractor/document/html_extractor.py +246 -0
  24. statement_extractor/document/loader.py +303 -0
  25. statement_extractor/document/pipeline.py +388 -0
  26. statement_extractor/document/summarizer.py +195 -0
  27. statement_extractor/extractor.py +1 -23
  28. statement_extractor/gliner_extraction.py +4 -74
  29. statement_extractor/llm.py +255 -0
  30. statement_extractor/models/__init__.py +89 -0
  31. statement_extractor/models/canonical.py +182 -0
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/entity.py +102 -0
  34. statement_extractor/models/labels.py +220 -0
  35. statement_extractor/models/qualifiers.py +139 -0
  36. statement_extractor/models/statement.py +101 -0
  37. statement_extractor/models.py +4 -1
  38. statement_extractor/pipeline/__init__.py +39 -0
  39. statement_extractor/pipeline/config.py +129 -0
  40. statement_extractor/pipeline/context.py +177 -0
  41. statement_extractor/pipeline/orchestrator.py +416 -0
  42. statement_extractor/pipeline/registry.py +303 -0
  43. statement_extractor/plugins/__init__.py +55 -0
  44. statement_extractor/plugins/base.py +716 -0
  45. statement_extractor/plugins/extractors/__init__.py +13 -0
  46. statement_extractor/plugins/extractors/base.py +9 -0
  47. statement_extractor/plugins/extractors/gliner2.py +546 -0
  48. statement_extractor/plugins/labelers/__init__.py +29 -0
  49. statement_extractor/plugins/labelers/base.py +9 -0
  50. statement_extractor/plugins/labelers/confidence.py +138 -0
  51. statement_extractor/plugins/labelers/relation_type.py +87 -0
  52. statement_extractor/plugins/labelers/sentiment.py +159 -0
  53. statement_extractor/plugins/labelers/taxonomy.py +386 -0
  54. statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
  55. statement_extractor/plugins/pdf/__init__.py +10 -0
  56. statement_extractor/plugins/pdf/pypdf.py +291 -0
  57. statement_extractor/plugins/qualifiers/__init__.py +30 -0
  58. statement_extractor/plugins/qualifiers/base.py +9 -0
  59. statement_extractor/plugins/qualifiers/companies_house.py +185 -0
  60. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  61. statement_extractor/plugins/qualifiers/gleif.py +197 -0
  62. statement_extractor/plugins/qualifiers/person.py +785 -0
  63. statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
  64. statement_extractor/plugins/scrapers/__init__.py +10 -0
  65. statement_extractor/plugins/scrapers/http.py +236 -0
  66. statement_extractor/plugins/splitters/__init__.py +13 -0
  67. statement_extractor/plugins/splitters/base.py +9 -0
  68. statement_extractor/plugins/splitters/t5_gemma.py +293 -0
  69. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  70. statement_extractor/plugins/taxonomy/embedding.py +484 -0
  71. statement_extractor/plugins/taxonomy/mnli.py +291 -0
  72. statement_extractor/scoring.py +8 -8
  73. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  74. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  75. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,632 @@
1
+ """
2
+ Wikidata importer for the person database.
3
+
4
+ Imports notable people data from Wikidata using SPARQL queries
5
+ into the embedding database for person name matching.
6
+
7
+ Notable people are those with English Wikipedia articles, ensuring
8
+ a basic level of notability.
9
+
10
+ Query categories (organized by PersonType):
11
+ - executives: Business executives (CEOs, CFOs, etc.)
12
+ - politicians: Politicians and diplomats
13
+ - athletes: Sports figures
14
+ - artists: Actors, musicians, directors
15
+ - academics: Professors and researchers
16
+ - scientists: Scientists and inventors
17
+ - journalists: Media personalities
18
+ - entrepreneurs: Founders and business owners
+ - activists: Activists and advocates
19
+
20
+ Uses the public Wikidata Query Service endpoint.
21
+ """
22
+
23
+ import json
24
+ import logging
25
+ import time
26
+ import urllib.parse
27
+ import urllib.request
28
+ from typing import Any, Iterator, Optional
29
+
30
+ from ..models import PersonRecord, PersonType
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
# Wikidata SPARQL endpoint
WIKIDATA_SPARQL_URL = "https://query.wikidata.org/sparql"

# Base query template for people with Wikipedia articles
# Gets person, their position/role, and organization
#
# Templating note: this literal mixes two placeholder styles. The doubled
# braces and the {filter_condition} field are str.format syntax, while
# LIMIT/OFFSET are %d placeholders, so it presumably needs a
# .format(filter_condition=...) pass followed by a "% (limit, offset)" pass.
# NOTE(review): none of the code visible in this module references this
# template; the per-type queries are written out in full instead. Confirm
# whether it is still needed.
PERSON_BASE_QUERY = """
SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {{
?person wdt:P31 wd:Q5 . # Instance of human

# Filter condition specific to query type
{filter_condition}

# Get country of citizenship
OPTIONAL {{ ?person wdt:P27 ?country. }}

# Get position held and associated organization
OPTIONAL {{
?person p:P39 ?positionStatement .
?positionStatement ps:P39 ?role .
OPTIONAL {{ ?positionStatement pq:P642 ?org }} # "of" qualifier
}}

# Fallback: direct employer
OPTIONAL {{ ?person wdt:P108 ?employer. BIND(?employer AS ?org) }}

# Get description
OPTIONAL {{ ?person schema:description ?description FILTER(LANG(?description) = "en") }}

# Must have English Wikipedia article (notability filter)
?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .

SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
}}
LIMIT %d
OFFSET %d
"""
70
+
71
# Query for business executives (CEOs, CFOs, board members, etc.)
# Selects humans with a P39 ("position held") statement whose value is one of
# the roles enumerated in the VALUES list below. The organization comes from
# the statement's P642 ("of") qualifier, with the person's P108 employer as a
# fallback. Only people with an English Wikipedia article are returned.
# The two %d placeholders are LIMIT then OFFSET, filled by %-formatting with
# a (limit, offset) tuple.
EXECUTIVE_QUERY = """
SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
?person wdt:P31 wd:Q5 . # Instance of human

# Has held executive position
?person p:P39 ?positionStatement .
?positionStatement ps:P39 ?role .

# Role is a type of corporate officer, board member, or executive
VALUES ?role {
# C-Suite
wd:Q484876 # CEO (Chief Executive Officer)
wd:Q623279 # CFO (Chief Financial Officer)
wd:Q1502675 # CTO (Chief Technology Officer)
wd:Q935019 # COO (Chief Operating Officer)
wd:Q1057716 # CMO (Chief Marketing Officer)
wd:Q2140589 # CIO (Chief Information Officer)
wd:Q1115042 # Chief Human Resources Officer
wd:Q4720025 # Chief Legal Officer / General Counsel
wd:Q60432825 # Chief Product Officer
wd:Q15967139 # Chief Strategy Officer
wd:Q15729310 # Chief Revenue Officer
wd:Q47523568 # Chief Digital Officer

# Board positions
wd:Q258557 # Chairman / Chairman of the Board
wd:Q114863313 # Vice Chairman
wd:Q726114 # President (business)
wd:Q1372944 # Vice President
wd:Q18918145 # Executive Vice President
wd:Q1057569 # Board of directors member
wd:Q24058752 # Non-executive director
wd:Q3578048 # Independent director

# Other executive roles
wd:Q476675 # Managing Director
wd:Q5441744 # Executive Director
wd:Q4188234 # General Manager
wd:Q38844673 # Group CEO
wd:Q97273203 # President and CEO
wd:Q60715311 # Chairman and CEO
wd:Q3563879 # Partner (business)
wd:Q3505845 # Senior Partner
}

OPTIONAL { ?positionStatement pq:P642 ?org } # "of" qualifier
OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
OPTIONAL { ?person wdt:P27 ?country. }
OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }

?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
LIMIT %d
OFFSET %d
"""
128
+
129
# Query for politicians
# Selects humans whose occupation (P106) is politician (Q82955) and who have
# an English Wikipedia article. Positions held (P39) and their "of" qualifier
# (P642) are optional, so a person without any recorded position still matches.
# The two %d placeholders are LIMIT then OFFSET, filled by %-formatting with
# a (limit, offset) tuple.
POLITICIAN_QUERY = """
SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
?person wdt:P31 wd:Q5 . # Instance of human

# Occupation is politician
?person wdt:P106 wd:Q82955 .

OPTIONAL {
?person p:P39 ?positionStatement .
?positionStatement ps:P39 ?role .
OPTIONAL { ?positionStatement pq:P642 ?org }
}
OPTIONAL { ?person wdt:P27 ?country. }
OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }

?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
LIMIT %d
OFFSET %d
"""
151
+
152
# Query for athletes
# Matches humans that either have the athlete occupation or any P54 sports
# team membership; the team, when present, is exposed as ?org.
# NOTE(review): this pattern never binds ?role, so no roleLabel value is
# expected in the results -- confirm that is intended.
# The two %d placeholders are LIMIT then OFFSET, filled by %-formatting with
# a (limit, offset) tuple.
ATHLETE_QUERY = """
SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
?person wdt:P31 wd:Q5 . # Instance of human

# Is an athlete (has sports team membership P54 or is athlete P106)
{ ?person wdt:P106 wd:Q2066131 . } # Athlete occupation
UNION
{ ?person wdt:P54 ?team . } # Member of sports team

OPTIONAL { ?person wdt:P54 ?team . BIND(?team AS ?org) }
OPTIONAL { ?person wdt:P27 ?country. }
OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }

?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
LIMIT %d
OFFSET %d
"""
172
+
173
# Query for artists (actors, singers, musicians, film directors, writers,
# and artists -- see the VALUES list below)
# Matches humans whose occupation (P106) is one of the listed values; the
# P108 employer, when present, is exposed as ?org.
# NOTE(review): this pattern never binds ?role, so no roleLabel value is
# expected in the results -- confirm that is intended.
# The two %d placeholders are LIMIT then OFFSET, filled by %-formatting with
# a (limit, offset) tuple.
ARTIST_QUERY = """
SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
?person wdt:P31 wd:Q5 . # Instance of human

# Has artist occupation
?person wdt:P106 ?occupation .
VALUES ?occupation {
wd:Q33999 # Actor
wd:Q177220 # Singer
wd:Q639669 # Musician
wd:Q2526255 # Film director
wd:Q36180 # Writer
wd:Q483501 # Artist
}

OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
OPTIONAL { ?person wdt:P27 ?country. }
OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }

?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
LIMIT %d
OFFSET %d
"""
199
+
200
# Query for academics (professors)
# Matches humans whose occupation (P106) is professor or academic; the P108
# employer, when present, is exposed as ?org.
# NOTE(review): this pattern never binds ?role, so no roleLabel value is
# expected in the results -- confirm that is intended.
# The two %d placeholders are LIMIT then OFFSET, filled by %-formatting with
# a (limit, offset) tuple.
ACADEMIC_QUERY = """
SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
?person wdt:P31 wd:Q5 . # Instance of human

# Is a professor or academic
{ ?person wdt:P106 wd:Q121594 . } # Professor
UNION
{ ?person wdt:P106 wd:Q3400985 . } # Academic

OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
OPTIONAL { ?person wdt:P27 ?country. }
OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }

?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
LIMIT %d
OFFSET %d
"""
220
+
221
# Query for scientists
# Matches humans whose occupation (P106) is scientist or researcher; the P108
# employer, when present, is exposed as ?org.
# NOTE(review): this pattern never binds ?role, so no roleLabel value is
# expected in the results -- confirm that is intended.
# The two %d placeholders are LIMIT then OFFSET, filled by %-formatting with
# a (limit, offset) tuple.
SCIENTIST_QUERY = """
SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
?person wdt:P31 wd:Q5 . # Instance of human

# Is a scientist or researcher
{ ?person wdt:P106 wd:Q901 . } # Scientist
UNION
{ ?person wdt:P106 wd:Q1650915 . } # Researcher

OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
OPTIONAL { ?person wdt:P27 ?country. }
OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }

?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
LIMIT %d
OFFSET %d
"""
241
+
242
# Query for journalists and media personalities
# Matches humans whose occupation (P106) is journalist or television
# presenter; the P108 employer, when present, is exposed as ?org.
# NOTE(review): this pattern never binds ?role, so no roleLabel value is
# expected in the results -- confirm that is intended.
# The two %d placeholders are LIMIT then OFFSET, filled by %-formatting with
# a (limit, offset) tuple.
JOURNALIST_QUERY = """
SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
?person wdt:P31 wd:Q5 . # Instance of human

# Is a journalist or presenter
{ ?person wdt:P106 wd:Q1930187 . } # Journalist
UNION
{ ?person wdt:P106 wd:Q13590141 . } # Television presenter

OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
OPTIONAL { ?person wdt:P27 ?country. }
OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }

?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
LIMIT %d
OFFSET %d
"""
262
+
263
# Query for entrepreneurs (founders)
# Matches humans that appear as the P112 value of some item; that item is
# bound directly to ?org, so there is no employer fallback here and every
# row carries an organization.
# NOTE(review): this pattern never binds ?role, so no roleLabel value is
# expected in the results -- confirm that is intended.
# The two %d placeholders are LIMIT then OFFSET, filled by %-formatting with
# a (limit, offset) tuple.
ENTREPRENEUR_QUERY = """
SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
?person wdt:P31 wd:Q5 . # Instance of human

# Founded a company (inverse of P112)
?org wdt:P112 ?person .

OPTIONAL { ?person wdt:P27 ?country. }
OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }

?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
LIMIT %d
OFFSET %d
"""
280
+
281
# Query for activists
# Matches humans whose occupation (P106) is activist (Q15253558); the P108
# employer, when present, is exposed as ?org.
# NOTE(review): this pattern never binds ?role, so no roleLabel value is
# expected in the results -- confirm that is intended.
# The two %d placeholders are LIMIT then OFFSET, filled by %-formatting with
# a (limit, offset) tuple.
ACTIVIST_QUERY = """
SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
?person wdt:P31 wd:Q5 . # Instance of human

# Is an activist
?person wdt:P106 wd:Q15253558 . # Activist

OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
OPTIONAL { ?person wdt:P27 ?country. }
OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }

?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
LIMIT %d
OFFSET %d
"""
299
+
300
# Every supported query type paired with its SPARQL template. Declared once,
# in the order the templates should be run, so the two lookup tables derived
# from it below always carry the same keys in the same order.
_QUERY_TYPE_TEMPLATES: tuple[tuple[str, str], ...] = (
    ("executive", EXECUTIVE_QUERY),
    ("politician", POLITICIAN_QUERY),
    ("athlete", ATHLETE_QUERY),
    ("artist", ARTIST_QUERY),
    ("academic", ACADEMIC_QUERY),
    ("scientist", SCIENTIST_QUERY),
    ("journalist", JOURNALIST_QUERY),
    ("entrepreneur", ENTREPRENEUR_QUERY),
    ("activist", ACTIVIST_QUERY),
)

# Mapping query type to PersonType; each key maps to the PersonType attribute
# with the same name in upper case (e.g. "executive" -> PersonType.EXECUTIVE).
QUERY_TYPE_TO_PERSON_TYPE: dict[str, PersonType] = {
    kind: getattr(PersonType, kind.upper()) for kind, _ in _QUERY_TYPE_TEMPLATES
}

# Mapping query type to SPARQL query template
QUERY_TYPES: dict[str, str] = dict(_QUERY_TYPE_TEMPLATES)
325
+
326
+
327
class WikidataPeopleImporter:
    """
    Importer for Wikidata person data.

    Uses SPARQL queries against the public Wikidata Query Service
    to fetch notable people including executives, politicians, athletes, etc.

    Query types:
    - executive: Business executives (CEOs, CFOs, etc.)
    - politician: Politicians and diplomats
    - athlete: Sports figures
    - artist: Actors, musicians, directors, writers
    - academic: Professors and researchers
    - scientist: Scientists and inventors
    - journalist: Media personalities
    - entrepreneur: Company founders
    - activist: Activists and advocates
    """

    # HTTP timeout (seconds) for the entity-search API used by search_person.
    _SEARCH_TIMEOUT_SECONDS = 30

    # Ordered (keywords, person type) rules applied to a lower-cased entity
    # description. The first rule with a matching keyword wins, so the order
    # is significant (e.g. "executive director" is an executive, not an artist).
    # Matching is by substring, not by whole word.
    _TYPE_KEYWORDS: tuple[tuple[tuple[str, ...], PersonType], ...] = (
        (("ceo", "executive", "businessman", "businesswoman"), PersonType.EXECUTIVE),
        (("politician", "president", "senator", "minister"), PersonType.POLITICIAN),
        (("athlete", "player", "coach"), PersonType.ATHLETE),
        (("actor", "actress", "singer", "musician", "director"), PersonType.ARTIST),
        (("professor", "academic"), PersonType.ACADEMIC),
        (("scientist", "researcher"), PersonType.SCIENTIST),
        (("journalist", "reporter"), PersonType.JOURNALIST),
        (("founder", "entrepreneur"), PersonType.ENTREPRENEUR),
        (("activist",), PersonType.ACTIVIST),
    )

    # Keywords that mark a description as a person without implying a type.
    _UNTYPED_PERSON_KEYWORDS: tuple[str, ...] = ("author", "writer")

    def __init__(self, batch_size: int = 1000, delay_seconds: float = 2.0, timeout: int = 120):
        """
        Initialize the Wikidata people importer.

        Args:
            batch_size: Number of records to fetch per SPARQL query (default 1000)
            delay_seconds: Delay between requests to be polite to the endpoint
            timeout: HTTP timeout in seconds (default 120)
        """
        self._batch_size = batch_size
        self._delay = delay_seconds
        self._timeout = timeout

    def import_from_sparql(
        self,
        limit: Optional[int] = None,
        query_type: str = "executive",
        import_all: bool = False,
    ) -> Iterator[PersonRecord]:
        """
        Import person records from Wikidata via SPARQL.

        Results are fetched page by page with LIMIT/OFFSET. A person can
        appear in several result rows (one per role/organization); only the
        first row per Wikidata ID is yielded. A failed page request is logged
        and ends the import early instead of raising.

        Args:
            limit: Optional limit on total records; None or 0 means no limit
            query_type: Which query to use (executive, politician, athlete, etc.)
            import_all: If True, run all query types sequentially

        Yields:
            PersonRecord for each person

        Raises:
            ValueError: If query_type is unknown. Because this is a generator,
                the error surfaces on the first iteration, not at call time.
        """
        if import_all:
            yield from self._import_all_types(limit)
            return

        if query_type not in QUERY_TYPES:
            raise ValueError(f"Unknown query type: {query_type}. Use one of: {list(QUERY_TYPES.keys())}")

        query_template = QUERY_TYPES[query_type]
        person_type = QUERY_TYPE_TO_PERSON_TYPE.get(query_type, PersonType.UNKNOWN)
        logger.info(
            "Starting Wikidata people import via SPARQL (query_type=%s, person_type=%s)...",
            query_type,
            person_type.value,
        )

        offset = 0
        total_count = 0
        seen_ids: set[str] = set()  # Track seen Wikidata IDs to avoid duplicates

        while not (limit and total_count >= limit):
            # Never ask for more rows than are still needed to reach the limit.
            batch_limit = self._batch_size
            if limit:
                batch_limit = min(batch_limit, limit - total_count)
            query = query_template % (batch_limit, offset)

            logger.info("Fetching Wikidata people batch at offset %s...", offset)

            try:
                results = self._execute_sparql(query)
            except Exception as e:
                # Deliberately best-effort: keep what was already yielded.
                logger.error("SPARQL query failed at offset %s: %s", offset, e)
                break

            bindings = results.get("results", {}).get("bindings", [])

            if not bindings:
                logger.info("No more results from Wikidata")
                break

            batch_count = 0
            for binding in bindings:
                if limit and total_count >= limit:
                    break

                record = self._parse_binding(binding, person_type=person_type)
                if record and record.source_id not in seen_ids:
                    seen_ids.add(record.source_id)
                    total_count += 1
                    batch_count += 1
                    yield record

            logger.info("Processed %s people from batch (total: %s)", batch_count, total_count)

            if len(bindings) < batch_limit:
                # Last batch
                break

            # Advance by the rows actually consumed. Using the configured
            # batch size here would skip rows whenever a smaller batch_limit
            # was requested for the final stretch before the limit.
            offset += len(bindings)

            # Be polite to the endpoint
            if self._delay > 0:
                time.sleep(self._delay)

        logger.info("Completed Wikidata people import: %s records", total_count)

    def _import_all_types(self, limit: Optional[int]) -> Iterator[PersonRecord]:
        """
        Import from all query types sequentially, deduplicating across types.

        Args:
            limit: Optional cap on the total number of records; it is split
                evenly across the query types. None or 0 means no limit.

        Yields:
            PersonRecord for each person not already yielded by an earlier type
        """
        seen_ids: set[str] = set()
        total_count = 0

        # Split the budget evenly, but never hand a type a limit of 0:
        # import_from_sparql treats 0 as "unlimited".
        per_type_limit = max(1, limit // len(QUERY_TYPES)) if limit else None

        for query_type in QUERY_TYPES:
            logger.info("=== Importing people: %s ===", query_type)
            type_count = 0

            for record in self.import_from_sparql(limit=per_type_limit, query_type=query_type):
                if record.source_id in seen_ids:
                    continue

                seen_ids.add(record.source_id)
                total_count += 1
                type_count += 1
                yield record

                if limit and total_count >= limit:
                    logger.info("Reached total limit of %s records", limit)
                    return

            logger.info("Got %s new records from %s (total: %s)", type_count, query_type, total_count)

        logger.info("Completed all query types: %s total people records", total_count)

    def _execute_sparql(self, query: str) -> dict[str, Any]:
        """
        Execute a SPARQL query against Wikidata.

        Args:
            query: Complete SPARQL query text

        Returns:
            The decoded JSON response body

        Raises:
            Whatever urllib or json raise on network, HTTP or decoding
            failures; nothing is caught here.
        """
        params = urllib.parse.urlencode({
            "query": query,
            "format": "json",
        })

        req = urllib.request.Request(
            f"{WIKIDATA_SPARQL_URL}?{params}",
            headers={
                "Accept": "application/sparql-results+json",
                "User-Agent": "corp-extractor/1.0 (person database builder)",
            },
        )

        with urllib.request.urlopen(req, timeout=self._timeout) as response:
            return json.loads(response.read().decode("utf-8"))

    @staticmethod
    def _looks_like_qid(value: str) -> bool:
        """Return True if value has the shape of a Wikidata item ID (Q + digits)."""
        digits = value[1:]
        return value.startswith("Q") and digits.isascii() and digits.isdigit()

    def _parse_binding(
        self,
        binding: dict[str, Any],
        person_type: PersonType = PersonType.UNKNOWN,
    ) -> Optional[PersonRecord]:
        """
        Parse a SPARQL result binding into a PersonRecord.

        Args:
            binding: One entry of the "bindings" list of a SPARQL JSON result
            person_type: Type to assign to the resulting record

        Returns:
            The record, or None when the binding has no usable Wikidata ID or
            English label, or when building the record fails.
        """
        def field(key: str) -> str:
            return binding.get(key, {}).get("value", "")

        try:
            # Extract QID from URI (e.g., "http://www.wikidata.org/entity/Q312" -> "Q312")
            wikidata_id = field("person").split("/")[-1]
            if not self._looks_like_qid(wikidata_id):
                return None

            # The label service echoes the QID when there is no English label.
            label = field("personLabel")
            name = label.strip()
            if not name or name == wikidata_id:
                return None

            country = field("countryLabel")
            description = field("description")

            # Drop role/org only when the "label" is a bare QID fallback.
            # A plain startswith("Q") test would also discard real names such
            # as "Queen" or "Qantas".
            role = field("roleLabel")
            if self._looks_like_qid(role):
                role = ""
            org = field("orgLabel")
            if self._looks_like_qid(org):
                org = ""

            # Build record data, leaving out the optional fields that are empty
            record_data: dict[str, Any] = {
                "wikidata_id": wikidata_id,
                "label": label,
            }
            optional_fields = {
                "country": country,
                "role": role,
                "org": org,
                "description": description,
            }
            record_data.update({key: value for key, value in optional_fields.items() if value})

            return PersonRecord(
                name=name,
                source="wikidata",
                source_id=wikidata_id,
                country=country,
                person_type=person_type,
                known_for_role=role,
                known_for_org=org,
                record=record_data,
            )

        except Exception as e:
            # Best-effort parsing: a malformed row must not abort the import.
            logger.debug("Failed to parse Wikidata binding: %s", e)
            return None

    def search_person(self, name: str, limit: int = 10) -> list[PersonRecord]:
        """
        Search for a specific person by name.

        Uses the Wikidata entity search API and keeps only hits whose English
        description contains a person-related keyword. The person type is
        inferred from the same keywords and stays UNKNOWN when only a generic
        one (e.g. "author") matches.

        Args:
            name: Person name to search for
            limit: Maximum number of search hits to request; fewer records
                may be returned after filtering

        Returns:
            List of matching PersonRecords
        """
        # Use Wikidata search API for better name matching
        search_url = "https://www.wikidata.org/w/api.php"
        params = urllib.parse.urlencode({
            "action": "wbsearchentities",
            "search": name,
            "language": "en",
            "type": "item",
            "limit": limit,
            "format": "json",
        })

        req = urllib.request.Request(
            f"{search_url}?{params}",
            headers={"User-Agent": "corp-extractor/1.0"},
        )

        with urllib.request.urlopen(req, timeout=self._SEARCH_TIMEOUT_SECONDS) as response:
            data = json.loads(response.read().decode("utf-8"))

        results: list[PersonRecord] = []
        for item in data.get("search", []):
            qid = item.get("id")
            if not qid:
                # A hit without an ID cannot be turned into a record.
                continue

            label = item.get("label", "")
            description = item.get("description", "")
            description_lower = description.lower()

            # Try to infer person type from description (first matching rule wins)
            person_type = next(
                (
                    candidate
                    for keywords, candidate in self._TYPE_KEYWORDS
                    if any(kw in description_lower for kw in keywords)
                ),
                None,
            )
            if person_type is None:
                # No typed keyword matched: keep the hit only if it still
                # looks like a person.
                if not any(kw in description_lower for kw in self._UNTYPED_PERSON_KEYWORDS):
                    continue
                person_type = PersonType.UNKNOWN

            results.append(
                PersonRecord(
                    name=label,
                    source="wikidata",
                    source_id=qid,
                    country="",  # Not available from search API
                    person_type=person_type,
                    known_for_role="",
                    known_for_org="",
                    record={
                        "wikidata_id": qid,
                        "label": label,
                        "description": description,
                    },
                )
            )

        return results