corp-extractor 0.5.0-py3-none-any.whl → 0.9.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
  2. corp_extractor-0.9.3.dist-info/RECORD +79 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +2030 -24
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +428 -0
  9. statement_extractor/database/importers/__init__.py +32 -0
  10. statement_extractor/database/importers/companies_house.py +559 -0
  11. statement_extractor/database/importers/companies_house_officers.py +431 -0
  12. statement_extractor/database/importers/gleif.py +561 -0
  13. statement_extractor/database/importers/sec_edgar.py +392 -0
  14. statement_extractor/database/importers/sec_form4.py +512 -0
  15. statement_extractor/database/importers/wikidata.py +1120 -0
  16. statement_extractor/database/importers/wikidata_dump.py +1951 -0
  17. statement_extractor/database/importers/wikidata_people.py +1130 -0
  18. statement_extractor/database/models.py +254 -0
  19. statement_extractor/database/resolver.py +245 -0
  20. statement_extractor/database/store.py +3034 -0
  21. statement_extractor/document/__init__.py +62 -0
  22. statement_extractor/document/chunker.py +410 -0
  23. statement_extractor/document/context.py +171 -0
  24. statement_extractor/document/deduplicator.py +171 -0
  25. statement_extractor/document/html_extractor.py +246 -0
  26. statement_extractor/document/loader.py +303 -0
  27. statement_extractor/document/pipeline.py +388 -0
  28. statement_extractor/document/summarizer.py +195 -0
  29. statement_extractor/extractor.py +1 -1
  30. statement_extractor/models/__init__.py +19 -3
  31. statement_extractor/models/canonical.py +44 -1
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/labels.py +47 -18
  34. statement_extractor/models/qualifiers.py +51 -3
  35. statement_extractor/models/statement.py +39 -15
  36. statement_extractor/models.py +1 -1
  37. statement_extractor/pipeline/config.py +6 -11
  38. statement_extractor/pipeline/context.py +5 -5
  39. statement_extractor/pipeline/orchestrator.py +90 -121
  40. statement_extractor/pipeline/registry.py +52 -46
  41. statement_extractor/plugins/__init__.py +20 -8
  42. statement_extractor/plugins/base.py +348 -78
  43. statement_extractor/plugins/extractors/gliner2.py +38 -28
  44. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  45. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  46. statement_extractor/plugins/pdf/__init__.py +10 -0
  47. statement_extractor/plugins/pdf/pypdf.py +291 -0
  48. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  49. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  50. statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
  51. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  52. statement_extractor/plugins/qualifiers/person.py +588 -14
  53. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  54. statement_extractor/plugins/scrapers/__init__.py +10 -0
  55. statement_extractor/plugins/scrapers/http.py +236 -0
  56. statement_extractor/plugins/splitters/t5_gemma.py +176 -75
  57. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  58. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  59. statement_extractor/scoring.py +8 -8
  60. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  61. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  62. statement_extractor/plugins/canonicalizers/base.py +0 -9
  63. statement_extractor/plugins/canonicalizers/location.py +0 -219
  64. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  65. statement_extractor/plugins/canonicalizers/person.py +0 -242
  66. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
  67. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
statement_extractor/database/importers/wikidata_dump.py
@@ -0,0 +1,1951 @@
1
+ """
2
+ Wikidata dump importer for people and organizations.
3
+
4
+ Uses the Wikidata JSON dump (~100GB compressed) to import:
5
+ 1. People: All humans (P31=Q5) with English Wikipedia articles
6
+ 2. Organizations: All organizations with English Wikipedia articles
7
+
8
+ This avoids SPARQL query timeouts that occur with large result sets.
9
+ The dump is processed line-by-line to minimize memory usage.
10
+
11
+ Dump format:
12
+ - File: `latest-all.json.bz2` (~100GB) or `.gz` (~150GB)
13
+ - Format: JSON array where each line is a separate entity (after first `[` line)
14
+ - Each line: `{"type":"item","id":"Q123","labels":{...},"claims":{...},"sitelinks":{...}},`
15
+ - Streaming: Read line-by-line, strip trailing comma, parse JSON
16
+
17
+ Resume support:
18
+ - Progress is tracked by entity index (count of entities processed)
19
+ - Progress can be saved to a JSON file and loaded on resume
20
+ - On resume, entities are skipped efficiently until reaching the saved position
21
+ """
22
+
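# A minimal, illustrative sketch (not part of the diffed file) of the line
# handling described in the docstring above: skip the array brackets, strip
# the trailing comma, then parse the line as one JSON entity.
import json

raw_line = '{"type":"item","id":"Q42","labels":{},"claims":{},"sitelinks":{}},\n'
line = raw_line.strip()
if line not in ("[", "]"):      # the opening/closing array brackets carry no entity
    entity = json.loads(line.rstrip(","))
    print(entity["id"])         # -> Q42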
23
+ import bz2
24
+ import gzip
25
+ import json
26
+ import logging
27
+ import shutil
28
+ import subprocess
29
+ import urllib.request
30
+ from dataclasses import dataclass, field
31
+ from datetime import datetime
32
+ from pathlib import Path
33
+ from typing import Callable, Iterator, Optional
34
+
35
+ from ..models import CompanyRecord, EntityType, PersonRecord, PersonType
36
+
37
+ # Type alias for records that can be either people or orgs
38
+ ImportRecord = PersonRecord | CompanyRecord
39
+
40
+ logger = logging.getLogger(__name__)
41
+
42
+ # Wikidata dump URLs - mirrors for faster downloads
43
+ # Primary is Wikimedia (slow), alternatives may be faster
44
+ DUMP_MIRRORS = [
45
+ # Wikimedia Foundation (official, often slow)
46
+ "https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2",
47
+ # Academic Torrents mirror (if available) - typically faster
48
+ # Note: Check https://academictorrents.com/browse?search=wikidata for current links
49
+ ]
50
+
51
+ # Default URL (can be overridden)
52
+ DUMP_URL = DUMP_MIRRORS[0]
53
+
54
+ # For even faster downloads, users can:
55
+ # 1. Use a torrent client with the Academic Torrents magnet link
56
+ # 2. Download from a regional Wikimedia mirror
57
+ # 3. Use aria2c with multiple connections: aria2c -x 16 -s 16 <url>
58
+
59
+ # =============================================================================
60
+ # POSITION TO PERSON TYPE MAPPING (P39 - position held)
61
+ # =============================================================================
62
+
63
+ # Executive positions (P39 values)
64
+ EXECUTIVE_POSITION_QIDS = {
65
+ "Q484876", # CEO
66
+ "Q623279", # CFO
67
+ "Q1502675", # COO
68
+ "Q935019", # CTO
69
+ "Q1057716", # CIO
70
+ "Q2140589", # CMO
71
+ "Q1115042", # chairperson
72
+ "Q4720025", # board of directors member
73
+ "Q60432825", # chief human resources officer
74
+ "Q15967139", # chief compliance officer
75
+ "Q15729310", # chief risk officer
76
+ "Q47523568", # chief legal officer
77
+ "Q258557", # board chair
78
+ "Q114863313", # chief sustainability officer
79
+ "Q726114", # company president
80
+ "Q1372944", # managing director
81
+ "Q18918145", # chief commercial officer
82
+ "Q1057569", # chief strategy officer
83
+ "Q24058752", # chief product officer
84
+ "Q3578048", # vice president
85
+ "Q476675", # business executive (generic)
86
+ "Q5441744", # finance director
87
+ "Q4188234", # general manager
88
+ "Q38844673", # chief data officer
89
+ "Q97273203", # chief digital officer
90
+ "Q60715311", # chief growth officer
91
+ "Q3563879", # treasurer
92
+ "Q3505845", # corporate secretary
93
+ }
94
+
95
+ # Politician positions (P39 values)
96
+ # Includes heads of state/government, legislators, and local officials
97
+ POLITICIAN_POSITION_QIDS = {
98
+ # Heads of state/government
99
+ "Q30461", # president
100
+ "Q14212", # prime minister
101
+ "Q83307", # minister
102
+ "Q2285706", # head of government
103
+ "Q48352", # head of state
104
+ "Q116", # monarch
105
+ "Q382617", # governor
106
+ "Q212071", # mayor
107
+ "Q1553195", # deputy prime minister
108
+ "Q1670573", # cabinet minister
109
+ "Q13218630", # secretary of state
110
+ "Q581682", # vice president
111
+
112
+ # Legislators - national
113
+ "Q4175034", # legislator
114
+ "Q486839", # member of parliament
115
+ "Q193391", # member of national legislature
116
+ "Q484529", # member of congress
117
+ "Q1711695", # senator
118
+ "Q18941264", # member of the House of Representatives (US)
119
+ "Q16707842", # member of the House of Commons (UK)
120
+ "Q18015642", # member of the House of Lords (UK)
121
+ "Q17295570", # member of the Bundestag (Germany)
122
+ "Q27169", # member of the European Parliament
123
+ "Q64366569", # member of Dáil Éireann (Ireland)
124
+ "Q19823090", # member of the Riksdag (Sweden)
125
+ "Q18229048", # member of Sejm (Poland)
126
+ "Q21032547", # member of the National Assembly (France)
127
+ "Q64511800", # member of the Knesset (Israel)
128
+ "Q50393121", # member of the State Duma (Russia)
129
+ "Q18558055", # member of the Diet (Japan)
130
+ "Q109862831", # member of Lok Sabha (India)
131
+ "Q63078776", # member of the Canadian House of Commons
132
+ "Q83767637", # member of the Australian House of Representatives
133
+
134
+ # Legislators - regional/local
135
+ "Q4382506", # member of state legislature
136
+ "Q17765219", # member of regional parliament
137
+ "Q1752514", # councillor (local government)
138
+ "Q18824436", # city councillor
139
+
140
+ # Other political offices
141
+ "Q294414", # public office (generic)
142
+ "Q889821", # ambassador
143
+ "Q15966511", # diplomat
144
+ "Q334344", # lord lieutenant
145
+ "Q16533", # judge (some are appointed politicians)
146
+ "Q3099732", # ombudsman
147
+ "Q1500443", # prefect
148
+ "Q611644", # envoy
149
+ "Q2824523", # political commissar
150
+ }
151
+
152
+ # =============================================================================
153
+ # OCCUPATION TO PERSON TYPE MAPPING (P106 - occupation)
154
+ # =============================================================================
155
+
156
+ OCCUPATION_TO_TYPE: dict[str, PersonType] = {
157
+ # Politicians (elected officials)
158
+ "Q82955": PersonType.POLITICIAN, # politician
159
+ "Q193391": PersonType.POLITICIAN, # member of parliament
160
+ "Q372436": PersonType.POLITICIAN, # statesperson
161
+
162
+ # Government (civil servants, diplomats, appointed officials)
163
+ "Q212238": PersonType.GOVERNMENT, # civil servant
164
+ "Q806798": PersonType.GOVERNMENT, # diplomat
165
+ "Q15627169": PersonType.GOVERNMENT, # trade unionist (often govt-adjacent)
166
+
167
+ # Military
168
+ "Q189290": PersonType.MILITARY, # military officer
169
+ "Q47064": PersonType.MILITARY, # military personnel
170
+ "Q4991371": PersonType.MILITARY, # soldier
171
+ "Q10669499": PersonType.MILITARY, # naval officer
172
+ "Q11974939": PersonType.MILITARY, # air force officer
173
+ "Q10974448": PersonType.MILITARY, # army officer
174
+
175
+ # Legal professionals
176
+ "Q16533": PersonType.LEGAL, # judge
177
+ "Q40348": PersonType.LEGAL, # lawyer
178
+ "Q185351": PersonType.LEGAL, # jurist
179
+ "Q3242871": PersonType.LEGAL, # prosecutor
180
+ "Q1792450": PersonType.LEGAL, # barrister
181
+ "Q3406182": PersonType.LEGAL, # solicitor
182
+
183
+ # Athletes
184
+ "Q2066131": PersonType.ATHLETE, # athlete
185
+ "Q937857": PersonType.ATHLETE, # football player
186
+ "Q3665646": PersonType.ATHLETE, # basketball player
187
+ "Q10871364": PersonType.ATHLETE, # baseball player
188
+ "Q19204627": PersonType.ATHLETE, # ice hockey player
189
+ "Q10843402": PersonType.ATHLETE, # tennis player
190
+ "Q13381376": PersonType.ATHLETE, # golfer
191
+ "Q11338576": PersonType.ATHLETE, # boxer
192
+ "Q10873124": PersonType.ATHLETE, # swimmer
193
+ "Q11303721": PersonType.ATHLETE, # racing driver
194
+ "Q10833314": PersonType.ATHLETE, # cricket player
195
+ "Q13141064": PersonType.ATHLETE, # rugby player
196
+
197
+ # Artists (traditional creative professions)
198
+ "Q33999": PersonType.ARTIST, # actor
199
+ "Q177220": PersonType.ARTIST, # singer
200
+ "Q639669": PersonType.ARTIST, # musician
201
+ "Q2526255": PersonType.ARTIST, # film director
202
+ "Q36180": PersonType.ARTIST, # writer
203
+ "Q483501": PersonType.ARTIST, # artist
204
+ "Q488205": PersonType.ARTIST, # singer-songwriter
205
+ "Q753110": PersonType.ARTIST, # songwriter
206
+ "Q2405480": PersonType.ARTIST, # voice actor
207
+ "Q10800557": PersonType.ARTIST, # film actor
208
+ "Q3455803": PersonType.ARTIST, # director
209
+ "Q28389": PersonType.ARTIST, # screenwriter
210
+ "Q6625963": PersonType.ARTIST, # comedian
211
+ "Q2259451": PersonType.ARTIST, # stand-up comedian
212
+ "Q2490358": PersonType.ARTIST, # choreographer
213
+ "Q2722764": PersonType.ARTIST, # DJ (disc jockey)
214
+ "Q183945": PersonType.ARTIST, # record producer
215
+ "Q3282637": PersonType.ARTIST, # film producer
216
+ "Q49757": PersonType.ARTIST, # poet
217
+ "Q28640": PersonType.ARTIST, # illustrator
218
+ "Q1028181": PersonType.ARTIST, # painter
219
+ "Q1281618": PersonType.ARTIST, # sculptor
220
+ "Q33231": PersonType.ARTIST, # photographer
221
+ "Q806349": PersonType.ARTIST, # band leader
222
+ "Q855091": PersonType.ARTIST, # rapper
223
+ "Q4351403": PersonType.ARTIST, # novelist
224
+ "Q158852": PersonType.ARTIST, # conductor (music)
225
+ "Q486748": PersonType.ARTIST, # pianist
226
+ "Q1415090": PersonType.ARTIST, # guitarist
227
+
228
+ # Media (internet/social media personalities)
229
+ "Q6168364": PersonType.MEDIA, # YouTuber
230
+ "Q15077007": PersonType.MEDIA, # podcaster
231
+ "Q17125263": PersonType.MEDIA, # social media influencer
232
+ "Q15981151": PersonType.MEDIA, # internet celebrity
233
+ "Q2059704": PersonType.MEDIA, # television personality
234
+ "Q4610556": PersonType.MEDIA, # model
235
+ "Q578109": PersonType.MEDIA, # television producer
236
+ "Q2516866": PersonType.MEDIA, # publisher
237
+ "Q93191800": PersonType.MEDIA, # content creator
238
+ "Q105756498": PersonType.MEDIA, # streamer (Twitch etc.)
239
+
240
+ # Professionals (known for their profession/work)
241
+ "Q39631": PersonType.PROFESSIONAL, # physician/doctor
242
+ "Q774306": PersonType.PROFESSIONAL, # surgeon
243
+ "Q1234713": PersonType.PROFESSIONAL, # dentist
244
+ "Q15924224": PersonType.PROFESSIONAL, # psychiatrist
245
+ "Q212980": PersonType.PROFESSIONAL, # psychologist
246
+ "Q81096": PersonType.PROFESSIONAL, # engineer
247
+ "Q42603": PersonType.PROFESSIONAL, # priest/clergy
248
+ "Q432386": PersonType.PROFESSIONAL, # architect
249
+ "Q3621491": PersonType.PROFESSIONAL, # nurse
250
+ "Q18805": PersonType.PROFESSIONAL, # pharmacist
251
+ "Q15895020": PersonType.PROFESSIONAL, # veterinarian
252
+ "Q131512": PersonType.PROFESSIONAL, # chef
253
+ "Q3499072": PersonType.PROFESSIONAL, # pilot
254
+ "Q15895449": PersonType.PROFESSIONAL, # accountant
255
+ "Q806750": PersonType.PROFESSIONAL, # consultant
256
+ "Q584301": PersonType.PROFESSIONAL, # economist (often professional)
257
+ "Q1371925": PersonType.PROFESSIONAL, # real estate agent
258
+ "Q266569": PersonType.PROFESSIONAL, # librarian
259
+ "Q5323050": PersonType.PROFESSIONAL, # electrical engineer
260
+ "Q13582652": PersonType.PROFESSIONAL, # civil engineer
261
+ "Q81965": PersonType.PROFESSIONAL, # software engineer
262
+ "Q5482740": PersonType.PROFESSIONAL, # data scientist
263
+
264
+ # Academics
265
+ "Q121594": PersonType.ACADEMIC, # professor
266
+ "Q3400985": PersonType.ACADEMIC, # academic
267
+ "Q1622272": PersonType.ACADEMIC, # university professor
268
+
269
+ # Scientists
270
+ "Q901": PersonType.SCIENTIST, # scientist
271
+ "Q1650915": PersonType.SCIENTIST, # researcher
272
+ "Q169470": PersonType.SCIENTIST, # physicist
273
+ "Q593644": PersonType.SCIENTIST, # chemist
274
+ "Q864503": PersonType.SCIENTIST, # biologist
275
+ "Q11063": PersonType.SCIENTIST, # astronomer
276
+
277
+ # Journalists
278
+ "Q1930187": PersonType.JOURNALIST, # journalist
279
+ "Q13590141": PersonType.JOURNALIST, # news presenter
280
+ "Q947873": PersonType.JOURNALIST, # television presenter
281
+ "Q4263842": PersonType.JOURNALIST, # columnist
282
+
283
+ # Activists
284
+ "Q15253558": PersonType.ACTIVIST, # activist
285
+ "Q11631410": PersonType.ACTIVIST, # human rights activist
286
+ "Q18939491": PersonType.ACTIVIST, # environmental activist
287
+
288
+ # Entrepreneurs/Executives via occupation
289
+ "Q131524": PersonType.ENTREPRENEUR, # entrepreneur
290
+ "Q43845": PersonType.ENTREPRENEUR, # businessperson
291
+ }
292
+
293
+ # =============================================================================
294
+ # ORGANIZATION TYPE MAPPING (P31 - instance of)
295
+ # =============================================================================
296
+
297
+ ORG_TYPE_TO_ENTITY_TYPE: dict[str, EntityType] = {
298
+ # Business - core types
299
+ "Q4830453": EntityType.BUSINESS, # business
300
+ "Q6881511": EntityType.BUSINESS, # enterprise
301
+ "Q783794": EntityType.BUSINESS, # company
302
+ "Q891723": EntityType.BUSINESS, # public company
303
+ "Q167037": EntityType.BUSINESS, # corporation
304
+ "Q658255": EntityType.BUSINESS, # subsidiary
305
+ "Q206652": EntityType.BUSINESS, # conglomerate
306
+ "Q22687": EntityType.BUSINESS, # bank
307
+ "Q1145276": EntityType.BUSINESS, # insurance company
308
+ "Q46970": EntityType.BUSINESS, # airline
309
+ "Q613142": EntityType.BUSINESS, # law firm
310
+ "Q507619": EntityType.BUSINESS, # pharmaceutical company
311
+ "Q2979960": EntityType.BUSINESS, # technology company
312
+ "Q1631111": EntityType.BUSINESS, # retailer
313
+ "Q187652": EntityType.BUSINESS, # manufacturer
314
+ # Business - additional types
315
+ "Q43229": EntityType.BUSINESS, # organization (generic)
316
+ "Q4671277": EntityType.BUSINESS, # academic institution (some are businesses)
317
+ "Q1664720": EntityType.BUSINESS, # institute
318
+ "Q15911314": EntityType.BUSINESS, # association
319
+ "Q15925165": EntityType.BUSINESS, # private company
320
+ "Q5225895": EntityType.BUSINESS, # credit union
321
+ "Q161726": EntityType.BUSINESS, # multinational corporation
322
+ "Q134161": EntityType.BUSINESS, # joint venture
323
+ "Q1589009": EntityType.BUSINESS, # privately held company
324
+ "Q270791": EntityType.BUSINESS, # state-owned enterprise
325
+ "Q1762059": EntityType.BUSINESS, # online service provider
326
+ "Q17127659": EntityType.BUSINESS, # energy company
327
+ "Q2695280": EntityType.BUSINESS, # construction company
328
+ "Q1624464": EntityType.BUSINESS, # telecommunications company
329
+ "Q1668024": EntityType.BUSINESS, # car manufacturer
330
+ "Q3914": EntityType.BUSINESS, # school (some are businesses)
331
+ "Q1030034": EntityType.BUSINESS, # management consulting firm
332
+ "Q1370614": EntityType.BUSINESS, # investment bank
333
+ "Q1785271": EntityType.BUSINESS, # advertising agency
334
+ "Q4686042": EntityType.BUSINESS, # automotive supplier
335
+ "Q431289": EntityType.BUSINESS, # brand
336
+ "Q622438": EntityType.BUSINESS, # supermarket chain
337
+ "Q6500733": EntityType.BUSINESS, # licensed retailer
338
+ "Q2659904": EntityType.BUSINESS, # government-owned corporation
339
+ "Q1065118": EntityType.BUSINESS, # bookmaker
340
+ "Q179179": EntityType.BUSINESS, # startup
341
+ "Q210167": EntityType.BUSINESS, # video game developer
342
+ "Q18388277": EntityType.BUSINESS, # video game publisher
343
+ "Q1762913": EntityType.BUSINESS, # film production company
344
+ "Q18558478": EntityType.BUSINESS, # money services business
345
+ "Q6463968": EntityType.BUSINESS, # asset management company
346
+ "Q2864737": EntityType.BUSINESS, # cooperative bank
347
+ "Q161380": EntityType.BUSINESS, # cooperative
348
+ "Q15850590": EntityType.BUSINESS, # real estate company
349
+ "Q1048835": EntityType.BUSINESS, # political organization
350
+ "Q1254933": EntityType.BUSINESS, # astronomical observatory (often research orgs)
351
+ "Q294414": EntityType.BUSINESS, # public office
352
+
353
+ # Funds
354
+ "Q45400320": EntityType.FUND, # investment fund
355
+ "Q476028": EntityType.FUND, # hedge fund
356
+ "Q380649": EntityType.FUND, # investment company
357
+ "Q1377053": EntityType.FUND, # mutual fund
358
+ "Q3312546": EntityType.FUND, # private equity firm
359
+ "Q751705": EntityType.FUND, # venture capital firm
360
+ "Q2296920": EntityType.FUND, # sovereign wealth fund
361
+ "Q2824951": EntityType.FUND, # exchange-traded fund
362
+ "Q1755098": EntityType.FUND, # pension fund
363
+
364
+ # Nonprofits
365
+ "Q163740": EntityType.NONPROFIT, # nonprofit organization
366
+ "Q79913": EntityType.NGO, # non-governmental organization
367
+ "Q157031": EntityType.FOUNDATION, # foundation
368
+ "Q48204": EntityType.NONPROFIT, # voluntary association
369
+ "Q988108": EntityType.NONPROFIT, # club
370
+ "Q476436": EntityType.NONPROFIT, # charitable organization
371
+ "Q3591957": EntityType.NONPROFIT, # cultural institution
372
+ "Q162633": EntityType.NONPROFIT, # academy
373
+ "Q270791": EntityType.NONPROFIT, # learned society
374
+ "Q484652": EntityType.NONPROFIT, # international organization
375
+
376
+ # Government
377
+ "Q327333": EntityType.GOVERNMENT, # government agency
378
+ "Q7278": EntityType.POLITICAL_PARTY, # political party
379
+ "Q178790": EntityType.TRADE_UNION, # trade union
380
+ "Q7188": EntityType.GOVERNMENT, # government
381
+ "Q2659904": EntityType.GOVERNMENT, # government-owned corporation
382
+ "Q35798": EntityType.GOVERNMENT, # executive branch
383
+ "Q35749": EntityType.GOVERNMENT, # legislature
384
+ "Q12076836": EntityType.GOVERNMENT, # law enforcement agency
385
+ "Q17362920": EntityType.GOVERNMENT, # public body
386
+ "Q1063239": EntityType.GOVERNMENT, # regulatory agency
387
+ "Q3624078": EntityType.GOVERNMENT, # sovereign state
388
+ "Q133442": EntityType.GOVERNMENT, # embassy
389
+ "Q174834": EntityType.GOVERNMENT, # authority (government)
390
+
391
+ # International organizations
392
+ "Q484652": EntityType.INTERNATIONAL_ORG, # international organization
393
+ "Q1335818": EntityType.INTERNATIONAL_ORG, # supranational organisation
394
+ "Q1616075": EntityType.INTERNATIONAL_ORG, # intergovernmental organization
395
+
396
+ # Education/Research
397
+ "Q2385804": EntityType.EDUCATIONAL, # educational institution
398
+ "Q3918": EntityType.EDUCATIONAL, # university
399
+ "Q31855": EntityType.RESEARCH, # research institute
400
+ "Q875538": EntityType.EDUCATIONAL, # public university
401
+ "Q23002039": EntityType.EDUCATIONAL, # private university
402
+ "Q38723": EntityType.EDUCATIONAL, # higher education institution
403
+ "Q1371037": EntityType.EDUCATIONAL, # secondary school
404
+ "Q9842": EntityType.EDUCATIONAL, # primary school
405
+ "Q189004": EntityType.EDUCATIONAL, # college
406
+ "Q1188663": EntityType.EDUCATIONAL, # community college
407
+ "Q1321960": EntityType.RESEARCH, # think tank
408
+ "Q31855": EntityType.RESEARCH, # research institute
409
+ "Q3354859": EntityType.RESEARCH, # observatory
410
+ "Q1298668": EntityType.RESEARCH, # research center
411
+
412
+ # Healthcare
413
+ "Q16917": EntityType.HEALTHCARE, # hospital
414
+ "Q1774898": EntityType.HEALTHCARE, # health care organization
415
+ "Q180958": EntityType.HEALTHCARE, # clinic
416
+ "Q4260475": EntityType.HEALTHCARE, # medical facility
417
+ "Q871964": EntityType.HEALTHCARE, # biotechnology company
418
+ "Q902104": EntityType.HEALTHCARE, # health insurance company
419
+
420
+ # Sports
421
+ "Q847017": EntityType.SPORTS, # sports club
422
+ "Q476068": EntityType.SPORTS, # sports team
423
+ "Q12973014": EntityType.SPORTS, # sports organization
424
+ "Q14350": EntityType.SPORTS, # association football club
425
+ "Q20639847": EntityType.SPORTS, # American football team
426
+ "Q13393265": EntityType.SPORTS, # basketball team
427
+ "Q13406463": EntityType.SPORTS, # baseball team
428
+ "Q1410877": EntityType.SPORTS, # ice hockey team
429
+ "Q18558301": EntityType.SPORTS, # rugby union club
430
+ "Q2093802": EntityType.SPORTS, # cricket team
431
+ "Q5137836": EntityType.SPORTS, # motorsport racing team
432
+
433
+ # Media
434
+ "Q18127": EntityType.MEDIA, # record label
435
+ "Q1366047": EntityType.MEDIA, # film studio
436
+ "Q1137109": EntityType.MEDIA, # video game company
437
+ "Q11032": EntityType.MEDIA, # newspaper
438
+ "Q1002697": EntityType.MEDIA, # periodical
439
+ "Q5398426": EntityType.MEDIA, # television series
440
+ "Q1110794": EntityType.MEDIA, # daily newspaper
441
+ "Q1616075": EntityType.MEDIA, # news agency
442
+ "Q14350": EntityType.MEDIA, # magazine
443
+ "Q15265344": EntityType.MEDIA, # broadcaster
444
+ "Q131436": EntityType.MEDIA, # radio station
445
+ "Q1616075": EntityType.MEDIA, # television station
446
+ "Q41298": EntityType.MEDIA, # magazine
447
+ "Q30022": EntityType.MEDIA, # television channel
448
+ "Q17232649": EntityType.MEDIA, # publishing company
449
+ "Q28803812": EntityType.MEDIA, # streaming service
450
+ "Q159334": EntityType.MEDIA, # entertainment company
451
+
452
+ # Religious
453
+ "Q9174": EntityType.RELIGIOUS, # religion
454
+ "Q1530022": EntityType.RELIGIOUS, # religious organization
455
+ "Q2994867": EntityType.RELIGIOUS, # religious community
456
+ "Q34651": EntityType.RELIGIOUS, # church (building as org)
457
+ "Q44613": EntityType.RELIGIOUS, # monastery
458
+ }
459
+
460
+
461
+ # =============================================================================
462
+ # PROGRESS TRACKING
463
+ # =============================================================================
464
+
465
+ DEFAULT_PROGRESS_PATH = Path.home() / ".cache" / "corp-extractor" / "wikidata-dump-progress.json"
466
+
467
+
468
+ @dataclass
469
+ class DumpProgress:
470
+ """
471
+ Tracks progress through the Wikidata dump file for resume support.
472
+
473
+ Progress is tracked by entity index (number of entities processed).
474
+ On resume, entities are skipped until reaching the saved position.
475
+ """
476
+ # Entity index - number of entities yielded from the dump
477
+ entity_index: int = 0
478
+
479
+ # Separate counters for people and orgs import
480
+ people_yielded: int = 0
481
+ orgs_yielded: int = 0
482
+
483
+ # Last entity ID processed (for verification)
484
+ last_entity_id: str = ""
485
+
486
+ # Timestamp of last update
487
+ last_updated: str = field(default_factory=lambda: datetime.now().isoformat())
488
+
489
+ # Dump file path (to detect if dump changed)
490
+ dump_path: str = ""
491
+
492
+ # Dump file size (to detect if dump changed)
493
+ dump_size: int = 0
494
+
495
+ def save(self, path: Optional[Path] = None) -> None:
496
+ """Save progress to JSON file."""
497
+ path = path or DEFAULT_PROGRESS_PATH
498
+ path.parent.mkdir(parents=True, exist_ok=True)
499
+ self.last_updated = datetime.now().isoformat()
500
+ with open(path, "w") as f:
501
+ json.dump({
502
+ "entity_index": self.entity_index,
503
+ "people_yielded": self.people_yielded,
504
+ "orgs_yielded": self.orgs_yielded,
505
+ "last_entity_id": self.last_entity_id,
506
+ "last_updated": self.last_updated,
507
+ "dump_path": self.dump_path,
508
+ "dump_size": self.dump_size,
509
+ }, f, indent=2)
510
+ logger.debug(f"Saved progress: entity_index={self.entity_index}, last_id={self.last_entity_id}")
511
+
512
+ @classmethod
513
+ def load(cls, path: Optional[Path] = None) -> Optional["DumpProgress"]:
514
+ """Load progress from JSON file, returns None if not found."""
515
+ path = path or DEFAULT_PROGRESS_PATH
516
+ if not path.exists():
517
+ return None
518
+ try:
519
+ with open(path) as f:
520
+ data = json.load(f)
521
+ return cls(
522
+ entity_index=data.get("entity_index", 0),
523
+ people_yielded=data.get("people_yielded", 0),
524
+ orgs_yielded=data.get("orgs_yielded", 0),
525
+ last_entity_id=data.get("last_entity_id", ""),
526
+ last_updated=data.get("last_updated", ""),
527
+ dump_path=data.get("dump_path", ""),
528
+ dump_size=data.get("dump_size", 0),
529
+ )
530
+ except (json.JSONDecodeError, KeyError, TypeError) as e:
531
+ logger.warning(f"Failed to load progress from {path}: {e}")
532
+ return None
533
+
534
+ @classmethod
535
+ def clear(cls, path: Optional[Path] = None) -> None:
536
+ """Delete the progress file."""
537
+ path = path or DEFAULT_PROGRESS_PATH
538
+ if path.exists():
539
+ path.unlink()
540
+ logger.info(f"Cleared progress file: {path}")
541
+
542
+ def matches_dump(self, dump_path: Path) -> bool:
543
+ """Check if this progress matches the given dump file."""
544
+ if str(dump_path) != self.dump_path:
545
+ return False
546
+ if dump_path.exists() and dump_path.stat().st_size != self.dump_size:
547
+ return False
548
+ return True
549
+
550
+
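# An illustrative resume sketch (not part of the diffed file): load saved
# progress, check that it refers to the same dump file, and start fresh
# otherwise. The dump path below is an assumption.
from pathlib import Path

dump_file = Path.home() / ".cache" / "corp-extractor" / "wikidata-latest-all.json.bz2"
progress = DumpProgress.load()
if progress is None or not progress.matches_dump(dump_file):
    progress = DumpProgress(
        dump_path=str(dump_file),
        dump_size=dump_file.stat().st_size if dump_file.exists() else 0,
    )
start_index = progress.entity_index   # pass this to the importer, call save() periodically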
551
+ class WikidataDumpImporter:
552
+ """
553
+ Stream Wikidata JSON dump to extract people and organization records.
554
+
555
+ This importer processes the Wikidata dump line-by-line to avoid memory issues
556
+ with the ~100GB compressed file. It filters for:
557
+ - Humans (P31=Q5) with English Wikipedia articles
558
+ - Organizations with English Wikipedia articles
559
+
560
+ The dump URL can be customized, and the importer supports both .bz2 and .gz
561
+ compression formats.
562
+ """
563
+
564
+ def __init__(self, dump_path: Optional[str] = None):
565
+ """
566
+ Initialize the dump importer.
567
+
568
+ Args:
569
+ dump_path: Optional path to a pre-downloaded dump file.
570
+ If not provided, will need to call download_dump() first.
571
+ """
572
+ self._dump_path = Path(dump_path) if dump_path else None
573
+ # Track discovered organizations from people import
574
+ self._discovered_orgs: dict[str, str] = {}
575
+ # Track QIDs that need label resolution (country, role)
576
+ self._unresolved_qids: set[str] = set()
577
+ # Label cache built during dump processing
578
+ self._label_cache: dict[str, str] = {}
579
+
580
+ def download_dump(
581
+ self,
582
+ target_dir: Optional[Path] = None,
583
+ force: bool = False,
584
+ progress_callback: Optional[Callable[[int, int], None]] = None,
585
+ use_aria2: bool = True,
586
+ aria2_connections: int = 16,
587
+ ) -> Path:
588
+ """
589
+ Download the latest Wikidata dump with progress indicator.
590
+
591
+ For fastest downloads, uses aria2c if available (16 parallel connections).
592
+ Falls back to urllib if aria2c is not installed.
593
+
594
+ Args:
595
+ target_dir: Directory to save the dump (default: ~/.cache/corp-extractor)
596
+ force: Force re-download even if file exists
597
+ progress_callback: Optional callback(downloaded_bytes, total_bytes) for progress
598
+ use_aria2: Try to use aria2c for faster downloads (default: True)
599
+ aria2_connections: Number of connections for aria2c (default: 16)
600
+
601
+ Returns:
602
+ Path to the downloaded dump file
603
+ """
604
+ if target_dir is None:
605
+ target_dir = Path.home() / ".cache" / "corp-extractor"
606
+
607
+ target_dir.mkdir(parents=True, exist_ok=True)
608
+ dump_path = target_dir / "wikidata-latest-all.json.bz2"
609
+
610
+ if dump_path.exists() and not force:
611
+ logger.info(f"Using cached dump at {dump_path}")
612
+ self._dump_path = dump_path
613
+ return dump_path
614
+
615
+ logger.info(f"Target: {dump_path}")
616
+
617
+ # Try aria2c first for much faster downloads
618
+ if use_aria2 and shutil.which("aria2c"):
619
+ logger.info("Using aria2c for fast parallel download...")
620
+ try:
621
+ self._download_with_aria2(dump_path, connections=aria2_connections)
622
+ self._dump_path = dump_path
623
+ return dump_path
624
+ except Exception as e:
625
+ logger.warning(f"aria2c download failed: {e}, falling back to urllib")
626
+
627
+ # Fallback to urllib
628
+ logger.info(f"Downloading Wikidata dump from {DUMP_URL}...")
629
+ logger.info("TIP: Install aria2c for 10-20x faster downloads: brew install aria2")
630
+ logger.info("This is a large file (~100GB) and will take significant time.")
631
+
632
+ # Stream download with progress
633
+ req = urllib.request.Request(
634
+ DUMP_URL,
635
+ headers={"User-Agent": "corp-extractor/1.0 (Wikidata dump importer)"}
636
+ )
637
+
638
+ with urllib.request.urlopen(req) as response:
639
+ total = int(response.headers.get("content-length", 0))
640
+ total_gb = total / (1024 ** 3) if total else 0
641
+
642
+ with open(dump_path, "wb") as f:
643
+ downloaded = 0
644
+ chunk_size = 8 * 1024 * 1024 # 8MB chunks
645
+ last_log_pct = 0
646
+
647
+ while True:
648
+ chunk = response.read(chunk_size)
649
+ if not chunk:
650
+ break
651
+ f.write(chunk)
652
+ downloaded += len(chunk)
653
+
654
+ # Call progress callback if provided
655
+ if progress_callback:
656
+ progress_callback(downloaded, total)
657
+ else:
658
+ # Default logging (every 1%)
659
+ if total:
660
+ pct = int((downloaded / total) * 100)
661
+ if pct > last_log_pct:
662
+ downloaded_gb = downloaded / (1024 ** 3)
663
+ logger.info(f"Downloaded {downloaded_gb:.1f}GB / {total_gb:.1f}GB ({pct}%)")
664
+ last_log_pct = pct
665
+ elif downloaded % (1024 ** 3) < chunk_size:
666
+ # Log every GB if total unknown
667
+ downloaded_gb = downloaded / (1024 ** 3)
668
+ logger.info(f"Downloaded {downloaded_gb:.1f}GB")
669
+
670
+ logger.info(f"Download complete: {dump_path}")
671
+ self._dump_path = dump_path
672
+ return dump_path
673
+
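# An illustrative progress_callback for download_dump() (not part of the
# diffed file), printing a percentage when the server reports a Content-Length.
def print_download_progress(downloaded: int, total: int) -> None:
    if total:
        print(f"\rdownloaded {downloaded / 1024**3:.1f} GB ({downloaded / total:.0%})", end="")

# importer = WikidataDumpImporter()
# importer.download_dump(progress_callback=print_download_progress)  # ~100GB download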
674
+ def _download_with_aria2(
675
+ self,
676
+ output_path: Path,
677
+ connections: int = 16,
678
+ ) -> None:
679
+ """
680
+ Download using aria2c with multiple parallel connections.
681
+
682
+ aria2c can achieve 10-20x faster downloads by using multiple
683
+ connections to the server.
684
+
685
+ Args:
686
+ output_path: Where to save the downloaded file
687
+ connections: Number of parallel connections (default: 16)
688
+ """
689
+ cmd = [
690
+ "aria2c",
691
+ "-x", str(connections), # Max connections per server
692
+ "-s", str(connections), # Split file into N parts
693
+ "-k", "10M", # Min split size
694
+ "--file-allocation=none", # Faster on SSDs
695
+ "-d", str(output_path.parent),
696
+ "-o", output_path.name,
697
+ "--console-log-level=notice",
698
+ "--summary-interval=10",
699
+ DUMP_URL,
700
+ ]
701
+
702
+ logger.info(f"Running: {' '.join(cmd)}")
703
+
704
+ # Run aria2c and stream output
705
+ process = subprocess.Popen(
706
+ cmd,
707
+ stdout=subprocess.PIPE,
708
+ stderr=subprocess.STDOUT,
709
+ text=True,
710
+ )
711
+
712
+ # Stream output to logger
713
+ if process.stdout:
714
+ for line in process.stdout:
715
+ line = line.strip()
716
+ if line:
717
+ logger.info(f"aria2c: {line}")
718
+
719
+ return_code = process.wait()
720
+ if return_code != 0:
721
+ raise RuntimeError(f"aria2c exited with code {return_code}")
722
+
723
+ def get_dump_path(self, target_dir: Optional[Path] = None) -> Path:
724
+ """
725
+ Get the path where the dump would be/is downloaded.
726
+
727
+ Args:
728
+ target_dir: Directory for the dump (default: ~/.cache/corp-extractor)
729
+
730
+ Returns:
731
+ Path to the dump file location
732
+ """
733
+ if target_dir is None:
734
+ target_dir = Path.home() / ".cache" / "corp-extractor"
735
+ return target_dir / "wikidata-latest-all.json.bz2"
736
+
737
+ def iter_entities(
738
+ self,
739
+ dump_path: Optional[Path] = None,
740
+ start_index: int = 0,
741
+ progress_callback: Optional[Callable[[int, str], None]] = None,
742
+ ) -> Iterator[dict]:
743
+ """
744
+ Stream entities from dump file, one at a time.
745
+
746
+ Handles the Wikidata JSON dump format where each line after the opening
747
+ bracket is a JSON object with a trailing comma (except the last).
748
+
749
+ Args:
750
+ dump_path: Path to dump file (uses self._dump_path if not provided)
751
+ start_index: Entity index to start yielding from (default 0). Entities
752
+ before this index are skipped but still cached for label lookups.
753
+ progress_callback: Optional callback(entity_index, entity_id) called for each
754
+ yielded entity. Useful for tracking progress.
755
+
756
+ Yields:
757
+ Parsed entity dictionaries
758
+ """
759
+ path = dump_path or self._dump_path
760
+ if path is None:
761
+ raise ValueError("No dump path provided. Call download_dump() first or pass dump_path.")
762
+
763
+ path = Path(path)
764
+
765
+ # Select opener based on extension
766
+ if path.suffix == ".bz2":
767
+ opener = bz2.open
768
+ elif path.suffix == ".gz":
769
+ opener = gzip.open
770
+ else:
771
+ # Assume uncompressed
772
+ opener = open
773
+
774
+ logger.info(f"Opening dump file: {path}")
775
+ logger.info(f"File size: {path.stat().st_size / (1024**3):.1f} GB")
776
+ if start_index > 0:
777
+ logger.info(f"Resuming from entity index {start_index:,} (skipping earlier entities)")
778
+ logger.info("Starting to read dump (bz2 decompression is slow, please wait)...")
779
+
780
+ with opener(path, "rt", encoding="utf-8") as f:
781
+ logger.info("Dump file opened successfully, reading lines...")
782
+ line_count = 0
783
+ entity_count = 0
784
+ skipped_count = 0
785
+ # Log more frequently at start, then reduce frequency
786
+ next_log_threshold = 10_000
787
+
788
+ for line in f:
789
+ line_count += 1
790
+
791
+ # Log first few lines to show we're making progress
792
+ if line_count <= 5:
793
+ logger.info(f"Read line {line_count} ({len(line)} chars)")
794
+ elif line_count == 100:
795
+ logger.info(f"Read {line_count} lines...")
796
+ elif line_count == 1000:
797
+ logger.info(f"Read {line_count} lines...")
798
+
799
+ line = line.strip()
800
+
801
+ # Skip array brackets
802
+ if line in ("[", "]"):
803
+ continue
804
+
805
+ # Strip trailing comma
806
+ if line.endswith(","):
807
+ line = line[:-1]
808
+
809
+ if not line:
810
+ continue
811
+
812
+ try:
813
+ entity = json.loads(line)
814
+ entity_id = entity.get("id", "")
815
+
816
+ # Always cache label for QID lookups (even when skipping)
817
+ self._cache_entity_label(entity)
818
+
819
+ # Check if we should skip this entity (resuming)
820
+ if entity_count < start_index:
821
+ entity_count += 1
822
+ skipped_count += 1
823
+ # Log skipping progress with adaptive frequency
824
+ if skipped_count >= next_log_threshold:
825
+ pct = 100 * skipped_count / start_index if start_index > 0 else 0
826
+ logger.info(
827
+ f"Skipping... {skipped_count:,}/{start_index:,} entities "
828
+ f"({pct:.1f}%), label cache: {len(self._label_cache):,}"
829
+ )
830
+ # Increase threshold: 10K -> 100K -> 1M
831
+ if next_log_threshold < 100_000:
832
+ next_log_threshold = 100_000
833
+ elif next_log_threshold < 1_000_000:
834
+ next_log_threshold = 1_000_000
835
+ else:
836
+ next_log_threshold += 1_000_000
837
+ continue
838
+
839
+ entity_count += 1
840
+
841
+ # Log progress with adaptive frequency
842
+ if entity_count >= next_log_threshold:
843
+ logger.info(
844
+ f"Processed {entity_count:,} entities, "
845
+ f"label cache: {len(self._label_cache):,}, "
846
+ f"unresolved QIDs: {len(self._unresolved_qids):,}"
847
+ )
848
+ # Increase threshold: 10K -> 100K -> 1M -> 2M -> 3M...
849
+ if next_log_threshold < 100_000:
850
+ next_log_threshold = 100_000
851
+ elif next_log_threshold < 1_000_000:
852
+ next_log_threshold = 1_000_000
853
+ else:
854
+ next_log_threshold += 1_000_000
855
+
856
+ # Call progress callback if provided
857
+ if progress_callback:
858
+ progress_callback(entity_count, entity_id)
859
+
860
+ yield entity
861
+
862
+ except json.JSONDecodeError as e:
863
+ logger.debug(f"Line {line_count}: JSON decode error: {e}")
864
+ continue
865
+
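# An illustrative peek at the stream (not part of the diffed file): print the
# IDs and English labels of the first few entities from a local dump. The
# dump path is an assumption.
import itertools

importer = WikidataDumpImporter(dump_path="/data/wikidata-latest-all.json.bz2")  # illustrative path
for entity in itertools.islice(importer.iter_entities(), 3):
    label = entity.get("labels", {}).get("en", {}).get("value", "")
    print(entity.get("id"), label)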
866
+ def import_people(
867
+ self,
868
+ dump_path: Optional[Path] = None,
869
+ limit: Optional[int] = None,
870
+ require_enwiki: bool = False,
871
+ skip_ids: Optional[set[str]] = None,
872
+ start_index: int = 0,
873
+ progress_callback: Optional[Callable[[int, str, int], None]] = None,
874
+ ) -> Iterator[PersonRecord]:
875
+ """
876
+ Stream through dump, yielding ALL people (humans with P31=Q5).
877
+
878
+ This method filters the dump for:
879
+ - Items with type "item" (not properties)
880
+ - Humans (P31 contains Q5)
881
+ - Optionally: Has English Wikipedia article (enwiki sitelink)
882
+
883
+ PersonType is derived from positions (P39) and occupations (P106).
884
+ Parliamentary context (electoral district, term, party) is extracted from P39 qualifiers.
885
+
886
+ Args:
887
+ dump_path: Path to dump file (uses self._dump_path if not provided)
888
+ limit: Optional maximum number of records to return
889
+ require_enwiki: If True, only include people with English Wikipedia articles
890
+ skip_ids: Optional set of source_ids (Q codes) to skip. Checked early before
891
+ full processing to avoid unnecessary QID resolution.
892
+ start_index: Entity index to start from (for resume support). Entities
893
+ before this index are skipped but labels are still cached.
894
+ progress_callback: Optional callback(entity_index, entity_id, records_yielded)
895
+ called for each yielded record. Useful for saving progress.
896
+
897
+ Yields:
898
+ PersonRecord for each qualifying person
899
+ """
900
+ path = dump_path or self._dump_path
901
+ count = 0
902
+ skipped = 0
903
+ current_entity_index = start_index
904
+
905
+ logger.info("Starting people import from Wikidata dump...")
906
+ if start_index > 0:
907
+ logger.info(f"Resuming from entity index {start_index:,}")
908
+ if not require_enwiki:
909
+ logger.info("Importing ALL humans (no enwiki filter)")
910
+ if skip_ids:
911
+ logger.info(f"Skipping {len(skip_ids):,} existing Q codes")
912
+
913
+ def track_entity(entity_index: int, entity_id: str) -> None:
914
+ nonlocal current_entity_index
915
+ current_entity_index = entity_index
916
+
917
+ for entity in self.iter_entities(path, start_index=start_index, progress_callback=track_entity):
918
+ if limit and count >= limit:
919
+ break
920
+
921
+ # Check skip_ids early, before full processing (avoids QID resolution)
922
+ entity_id = entity.get("id", "")
923
+ if skip_ids and entity_id in skip_ids:
924
+ skipped += 1
925
+ continue
926
+
927
+ record = self._process_person_entity(entity, require_enwiki=require_enwiki)
928
+ if record:
929
+ count += 1
930
+ if count % 10_000 == 0:
931
+ logger.info(f"Yielded {count:,} people records (skipped {skipped:,})...")
932
+
933
+ # Call progress callback with current position
934
+ if progress_callback:
935
+ progress_callback(current_entity_index, entity_id, count)
936
+
937
+ yield record
938
+
939
+ logger.info(f"People import complete: {count:,} records (skipped {skipped:,})")
940
+
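# An illustrative resume-aware people import (not part of the diffed file);
# the save interval and dump path are assumptions.
progress = DumpProgress.load() or DumpProgress()
importer = WikidataDumpImporter(dump_path="/data/wikidata-latest-all.json.bz2")  # illustrative path

def remember_position(entity_index: int, entity_id: str, records_yielded: int) -> None:
    progress.entity_index = entity_index
    progress.last_entity_id = entity_id
    progress.people_yielded = records_yielded
    if records_yielded % 50_000 == 0:   # save interval is an assumption
        progress.save()

for person in importer.import_people(start_index=progress.entity_index,
                                     progress_callback=remember_position):
    ...  # hand each PersonRecord to the database layer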
941
+ def import_organizations(
942
+ self,
943
+ dump_path: Optional[Path] = None,
944
+ limit: Optional[int] = None,
945
+ require_enwiki: bool = False,
946
+ skip_ids: Optional[set[str]] = None,
947
+ start_index: int = 0,
948
+ progress_callback: Optional[Callable[[int, str, int], None]] = None,
949
+ ) -> Iterator[CompanyRecord]:
950
+ """
951
+ Stream through dump, yielding organizations.
952
+
953
+ This method filters the dump for:
954
+ - Items with type "item"
955
+ - Has P31 (instance of) matching an organization type
956
+ - Optionally: Has English Wikipedia article (enwiki sitelink)
957
+
958
+ Args:
959
+ dump_path: Path to dump file (uses self._dump_path if not provided)
960
+ limit: Optional maximum number of records to return
961
+ require_enwiki: If True, only include orgs with English Wikipedia articles
962
+ skip_ids: Optional set of source_ids (Q codes) to skip. Checked early before
963
+ full processing to avoid unnecessary QID resolution.
964
+ start_index: Entity index to start from (for resume support). Entities
965
+ before this index are skipped but labels are still cached.
966
+ progress_callback: Optional callback(entity_index, entity_id, records_yielded)
967
+ called for each yielded record. Useful for saving progress.
968
+
969
+ Yields:
970
+ CompanyRecord for each qualifying organization
971
+ """
972
+ path = dump_path or self._dump_path
973
+ count = 0
974
+ skipped_existing = 0
975
+ skipped_no_type = 0
976
+ skipped_no_enwiki = 0
977
+ skipped_no_label = 0
978
+ current_entity_index = start_index
979
+
980
+ logger.info("Starting organization import from Wikidata dump...")
981
+ if start_index > 0:
982
+ logger.info(f"Resuming from entity index {start_index:,}")
983
+ if not require_enwiki:
984
+ logger.info("Importing ALL organizations (no enwiki filter)")
985
+ if skip_ids:
986
+ logger.info(f"Skipping {len(skip_ids):,} existing Q codes")
987
+
988
+ def track_entity(entity_index: int, entity_id: str) -> None:
989
+ nonlocal current_entity_index
990
+ current_entity_index = entity_index
991
+
992
+ for entity in self.iter_entities(path, start_index=start_index, progress_callback=track_entity):
993
+ if limit and count >= limit:
994
+ break
995
+
996
+ # Check skip_ids early, before full processing (avoids QID resolution)
997
+ entity_id = entity.get("id", "")
998
+ if skip_ids and entity_id in skip_ids:
999
+ skipped_existing += 1
1000
+ continue
1001
+
1002
+ record = self._process_org_entity(entity, require_enwiki=require_enwiki)
1003
+ if record:
1004
+ count += 1
1005
+ if count % 10_000 == 0:
1006
+ logger.info(f"Yielded {count:,} organization records (skipped {skipped_existing:,} existing)...")
1007
+
1008
+ # Call progress callback with current position
1009
+ if progress_callback:
1010
+ progress_callback(current_entity_index, entity_id, count)
1011
+
1012
+ yield record
1013
+ elif entity.get("type") == "item":
1014
+ # Track skip reasons for debugging
1015
+ if self._get_org_type(entity) is None:
1016
+ skipped_no_type += 1
1017
+ elif require_enwiki and "enwiki" not in entity.get("sitelinks", {}):
1018
+ skipped_no_enwiki += 1
1019
+ else:
1020
+ skipped_no_label += 1
1021
+
1022
+ # Log skip stats periodically
1023
+ total_skipped = skipped_no_type + skipped_no_enwiki + skipped_no_label
1024
+ if total_skipped > 0 and total_skipped % 1_000_000 == 0:
1025
+ logger.debug(
1026
+ f"Skip stats: no_matching_type={skipped_no_type:,}, "
1027
+ f"no_enwiki={skipped_no_enwiki:,}, no_label={skipped_no_label:,}"
1028
+ )
1029
+
1030
+ logger.info(f"Organization import complete: {count:,} records (skipped {skipped_existing:,} existing)")
1031
+ logger.info(
1032
+ f"Skipped: no_matching_type={skipped_no_type:,}, "
1033
+ f"no_enwiki={skipped_no_enwiki:,}, no_label={skipped_no_label:,}"
1034
+ )
1035
+
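# An illustrative bounded organization import (not part of the diffed file),
# restricted to entities with an English Wikipedia article.
importer = WikidataDumpImporter(dump_path="/data/wikidata-latest-all.json.bz2")  # illustrative path
for company in importer.import_organizations(limit=1_000, require_enwiki=True):
    print(company)  # CompanyRecord instances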
1036
+ def import_all(
1037
+ self,
1038
+ dump_path: Optional[Path] = None,
1039
+ people_limit: Optional[int] = None,
1040
+ orgs_limit: Optional[int] = None,
1041
+ import_people: bool = True,
1042
+ import_orgs: bool = True,
1043
+ require_enwiki: bool = False,
1044
+ skip_people_ids: Optional[set[str]] = None,
1045
+ skip_org_ids: Optional[set[str]] = None,
1046
+ start_index: int = 0,
1047
+ progress_callback: Optional[Callable[[int, str, int, int], None]] = None,
1048
+ ) -> Iterator[tuple[str, ImportRecord]]:
1049
+ """
1050
+ Import both people and organizations in a single pass through the dump.
1051
+
1052
+ This is more efficient than calling import_people() and import_organizations()
1053
+ separately, as it only reads the ~100GB dump file once.
1054
+
1055
+ Args:
1056
+ dump_path: Path to dump file (uses self._dump_path if not provided)
1057
+ people_limit: Optional maximum number of people records
1058
+ orgs_limit: Optional maximum number of org records
1059
+ import_people: Whether to import people (default: True)
1060
+ import_orgs: Whether to import organizations (default: True)
1061
+ require_enwiki: If True, only include entities with English Wikipedia articles
1062
+ skip_people_ids: Optional set of people source_ids (Q codes) to skip
1063
+ skip_org_ids: Optional set of org source_ids (Q codes) to skip
1064
+ start_index: Entity index to start from (for resume support)
1065
+ progress_callback: Optional callback(entity_index, entity_id, people_count, orgs_count)
1066
+ called periodically. Useful for saving progress.
1067
+
1068
+ Yields:
1069
+ Tuples of (record_type, record) where record_type is "person" or "org"
1070
+ """
1071
+ path = dump_path or self._dump_path
1072
+ people_count = 0
1073
+ orgs_count = 0
1074
+ people_skipped = 0
1075
+ orgs_skipped = 0
1076
+ current_entity_index = start_index
1077
+
1078
+ logger.info("Starting combined import from Wikidata dump...")
1079
+ if start_index > 0:
1080
+ logger.info(f"Resuming from entity index {start_index:,}")
1081
+ if import_people:
1082
+ logger.info(f"Importing people (limit: {people_limit or 'none'})")
1083
+ if skip_people_ids:
1084
+ logger.info(f" Skipping {len(skip_people_ids):,} existing people Q codes")
1085
+ if import_orgs:
1086
+ logger.info(f"Importing organizations (limit: {orgs_limit or 'none'})")
1087
+ if skip_org_ids:
1088
+ logger.info(f" Skipping {len(skip_org_ids):,} existing org Q codes")
1089
+
1090
+ # Check if we've hit both limits
1091
+ def limits_reached() -> bool:
1092
+ people_done = not import_people or (people_limit and people_count >= people_limit)
1093
+ orgs_done = not import_orgs or (orgs_limit and orgs_count >= orgs_limit)
1094
+ return bool(people_done and orgs_done)
1095
+
1096
+ def track_entity(entity_index: int, entity_id: str) -> None:
1097
+ nonlocal current_entity_index
1098
+ current_entity_index = entity_index
1099
+
1100
+ for entity in self.iter_entities(path, start_index=start_index, progress_callback=track_entity):
1101
+ if limits_reached():
1102
+ break
1103
+
1104
+ entity_id = entity.get("id", "")
1105
+
1106
+ # Try to process as person first (if importing people and not at limit)
1107
+ if import_people and (not people_limit or people_count < people_limit):
1108
+ # Check skip_ids early
1109
+ if skip_people_ids and entity_id in skip_people_ids:
1110
+ people_skipped += 1
1111
+ else:
1112
+ person_record = self._process_person_entity(entity, require_enwiki=require_enwiki)
1113
+ if person_record:
1114
+ people_count += 1
1115
+ if people_count % 10_000 == 0:
1116
+ logger.info(
1117
+ f"Progress: {people_count:,} people, {orgs_count:,} orgs "
1118
+ f"(entity {current_entity_index:,})"
1119
+ )
1120
+ if progress_callback:
1121
+ progress_callback(current_entity_index, entity_id, people_count, orgs_count)
1122
+ yield ("person", person_record)
1123
+ continue # Entity was a person, don't check for org
1124
+
1125
+ # Try to process as organization (if importing orgs and not at limit)
1126
+ if import_orgs and (not orgs_limit or orgs_count < orgs_limit):
1127
+ # Check skip_ids early
1128
+ if skip_org_ids and entity_id in skip_org_ids:
1129
+ orgs_skipped += 1
1130
+ else:
1131
+ org_record = self._process_org_entity(entity, require_enwiki=require_enwiki)
1132
+ if org_record:
1133
+ orgs_count += 1
1134
+ if orgs_count % 10_000 == 0:
1135
+ logger.info(
1136
+ f"Progress: {people_count:,} people, {orgs_count:,} orgs "
1137
+ f"(entity {current_entity_index:,})"
1138
+ )
1139
+ if progress_callback:
1140
+ progress_callback(current_entity_index, entity_id, people_count, orgs_count)
1141
+ yield ("org", org_record)
1142
+
1143
+ logger.info(
1144
+ f"Combined import complete: {people_count:,} people, {orgs_count:,} orgs "
1145
+ f"(skipped {people_skipped:,} people, {orgs_skipped:,} orgs)"
1146
+ )
1147
+
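# An illustrative single-pass combined import (not part of the diffed file),
# dispatching on the ("person" | "org") tag documented above.
people, orgs = [], []
importer = WikidataDumpImporter(dump_path="/data/wikidata-latest-all.json.bz2")  # illustrative path
for record_type, record in importer.import_all(people_limit=100, orgs_limit=100):
    (people if record_type == "person" else orgs).append(record)
print(f"collected {len(people)} people and {len(orgs)} organizations")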
1148
+ def _process_person_entity(
1149
+ self,
1150
+ entity: dict,
1151
+ require_enwiki: bool = False,
1152
+ ) -> Optional[PersonRecord]:
1153
+ """
1154
+ Process a single entity, return PersonRecord if it's a human.
1155
+
1156
+ Args:
1157
+ entity: Parsed Wikidata entity dictionary
1158
+ require_enwiki: If True, only include people with English Wikipedia articles
1159
+
1160
+ Returns:
1161
+ PersonRecord if entity qualifies, None otherwise
1162
+ """
1163
+ # Must be an item (not property)
1164
+ if entity.get("type") != "item":
1165
+ return None
1166
+
1167
+ # Must be human (P31 contains Q5)
1168
+ if not self._is_human(entity):
1169
+ return None
1170
+
1171
+ # Optionally require English Wikipedia article
1172
+ if require_enwiki:
1173
+ sitelinks = entity.get("sitelinks", {})
1174
+ if "enwiki" not in sitelinks:
1175
+ return None
1176
+
1177
+ # Extract person data
1178
+ return self._extract_person_data(entity)
1179
+
1180
+ def _process_org_entity(
1181
+ self,
1182
+ entity: dict,
1183
+ require_enwiki: bool = False,
1184
+ ) -> Optional[CompanyRecord]:
1185
+ """
1186
+ Process a single entity, return CompanyRecord if it's an organization.
1187
+
1188
+ Args:
1189
+ entity: Parsed Wikidata entity dictionary
1190
+ require_enwiki: If True, only include orgs with English Wikipedia articles
1191
+
1192
+ Returns:
1193
+ CompanyRecord if entity qualifies, None otherwise
1194
+ """
1195
+ # Must be an item (not property)
1196
+ if entity.get("type") != "item":
1197
+ return None
1198
+
1199
+ # Get organization type from P31
1200
+ entity_type = self._get_org_type(entity)
1201
+ if entity_type is None:
1202
+ return None
1203
+
1204
+ # Optionally require English Wikipedia article
1205
+ if require_enwiki:
1206
+ sitelinks = entity.get("sitelinks", {})
1207
+ if "enwiki" not in sitelinks:
1208
+ return None
1209
+
1210
+ # Extract organization data
1211
+ return self._extract_org_data(entity, entity_type)
1212
+
1213
+ def _is_human(self, entity: dict) -> bool:
1214
+ """
1215
+ Check if entity has P31 (instance of) = Q5 (human).
1216
+
1217
+ Args:
1218
+ entity: Parsed Wikidata entity dictionary
1219
+
1220
+ Returns:
1221
+ True if entity is a human
1222
+ """
1223
+ claims = entity.get("claims", {})
1224
+ for claim in claims.get("P31", []):
1225
+ mainsnak = claim.get("mainsnak", {})
1226
+ datavalue = mainsnak.get("datavalue", {})
1227
+ value = datavalue.get("value", {})
1228
+ if isinstance(value, dict) and value.get("id") == "Q5":
1229
+ return True
1230
+ return False
1231
+
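# An illustrative entity fragment (not part of the diffed file) showing the
# nesting that _is_human() walks: P31 ("instance of") pointing at Q5 ("human").
example_entity = {
    "type": "item",
    "id": "Q42",
    "claims": {
        "P31": [{"mainsnak": {"datavalue": {"value": {"id": "Q5"}}}}],
    },
}
# WikidataDumpImporter()._is_human(example_entity) evaluates to True.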
1232
+ def _get_org_type(self, entity: dict) -> Optional[EntityType]:
1233
+ """
1234
+ Check if entity has P31 (instance of) matching an organization type.
1235
+
1236
+ Args:
1237
+ entity: Parsed Wikidata entity dictionary
1238
+
1239
+ Returns:
1240
+ EntityType if entity is an organization, None otherwise
1241
+ """
1242
+ claims = entity.get("claims", {})
1243
+ for claim in claims.get("P31", []):
1244
+ mainsnak = claim.get("mainsnak", {})
1245
+ datavalue = mainsnak.get("datavalue", {})
1246
+ value = datavalue.get("value", {})
1247
+ if isinstance(value, dict):
1248
+ qid = value.get("id", "")
1249
+ if qid in ORG_TYPE_TO_ENTITY_TYPE:
1250
+ return ORG_TYPE_TO_ENTITY_TYPE[qid]
1251
+ return None
1252
+
1253
+ def _get_claim_values(self, entity: dict, prop: str) -> list[str]:
1254
+ """
1255
+ Get all QID values for a property (e.g., P39, P106).
1256
+
1257
+ Args:
1258
+ entity: Parsed Wikidata entity dictionary
1259
+ prop: Property ID (e.g., "P39", "P106")
1260
+
1261
+ Returns:
1262
+ List of QID strings
1263
+ """
1264
+ claims = entity.get("claims", {})
1265
+ values = []
1266
+ for claim in claims.get(prop, []):
1267
+ mainsnak = claim.get("mainsnak", {})
1268
+ datavalue = mainsnak.get("datavalue", {})
1269
+ value = datavalue.get("value", {})
1270
+ if isinstance(value, dict):
1271
+ qid = value.get("id")
1272
+ if qid:
1273
+ values.append(qid)
1274
+ return values
1275
+
+     def _get_qid_qualifier(self, qualifiers: dict, prop: str) -> Optional[str]:
+         """Extract first QID from a qualifier property."""
+         for qual in qualifiers.get(prop, []):
+             qual_datavalue = qual.get("datavalue", {})
+             qual_value = qual_datavalue.get("value", {})
+             if isinstance(qual_value, dict):
+                 return qual_value.get("id")
+         return None
+
+     def _get_time_qualifier(self, qualifiers: dict, prop: str) -> Optional[str]:
+         """Extract first time value from a qualifier property."""
+         for qual in qualifiers.get(prop, []):
+             qual_datavalue = qual.get("datavalue", {})
+             qual_value = qual_datavalue.get("value", {})
+             if isinstance(qual_value, dict):
+                 time_str = qual_value.get("time", "")
+                 return self._parse_time_value(time_str)
+         return None
+
+     def _get_positions_with_org(self, claims: dict) -> list[dict]:
+         """
+         Extract P39 positions with qualifiers for org, dates, and parliamentary context.
+
+         Qualifiers extracted per WikiProject Parliaments guidelines:
+         - P580 (start time) - when the position started
+         - P582 (end time) - when the position ended
+         - P108 (employer) - organization they work for
+         - P642 (of) - the organization (legacy/fallback)
+         - P768 (electoral district) - constituency for MPs
+         - P2937 (parliamentary term) - which term they served in
+         - P4100 (parliamentary group) - political party/faction
+         - P1001 (applies to jurisdiction) - jurisdiction they represent
+         - P2715 (elected in) - which election elected them
+
+         Args:
+             claims: Claims dictionary from entity
+
+         Returns:
+             List of position dictionaries with position metadata
+         """
+         positions = []
+         for claim in claims.get("P39", []):
+             mainsnak = claim.get("mainsnak", {})
+             datavalue = mainsnak.get("datavalue", {})
+             pos_value = datavalue.get("value", {})
+             pos_qid = pos_value.get("id") if isinstance(pos_value, dict) else None
+             if not pos_qid:
+                 continue
+
+             qualifiers = claim.get("qualifiers", {})
+
+             # Extract organization from multiple possible qualifiers
+             # Priority: P108 (employer) > P642 (of) > P1001 (jurisdiction)
+             org_qid = (
+                 self._get_qid_qualifier(qualifiers, "P108") or  # employer
+                 self._get_qid_qualifier(qualifiers, "P642") or  # of (legacy)
+                 self._get_qid_qualifier(qualifiers, "P1001")  # applies to jurisdiction
+             )
+
+             # Extract dates
+             start_date = self._get_time_qualifier(qualifiers, "P580")
+             end_date = self._get_time_qualifier(qualifiers, "P582")
+
+             # Extract parliamentary/political qualifiers
+             electoral_district = self._get_qid_qualifier(qualifiers, "P768")
+             parliamentary_term = self._get_qid_qualifier(qualifiers, "P2937")
+             parliamentary_group = self._get_qid_qualifier(qualifiers, "P4100")
+             elected_in = self._get_qid_qualifier(qualifiers, "P2715")
+
+             positions.append({
+                 "position_qid": pos_qid,
+                 "org_qid": org_qid,
+                 "start_date": start_date,
+                 "end_date": end_date,
+                 # Parliamentary context
+                 "electoral_district": electoral_district,
+                 "parliamentary_term": parliamentary_term,
+                 "parliamentary_group": parliamentary_group,
+                 "elected_in": elected_in,
+             })
+         return positions
+
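# --- Illustrative sketch (editorial note, not part of the package diff) ---
# A hypothetical P39 claim carrying the qualifiers read above, and the position
# dictionary it would yield. All QIDs here are placeholders.
_example_p39_claim = {
    "mainsnak": {"datavalue": {"value": {"id": "Q1000001"}}},         # position held
    "qualifiers": {
        "P580": [{"datavalue": {"value": {"time": "+2019-12-12T00:00:00Z"}}}],
        "P768": [{"datavalue": {"value": {"id": "Q2000002"}}}],       # electoral district
        "P4100": [{"datavalue": {"value": {"id": "Q3000003"}}}],      # parliamentary group
    },
}
# Expected entry (no P108/P642/P1001 qualifier, so org_qid stays None and the
# electoral district is used later by _get_org_or_context as the fallback):
_expected_position = {
    "position_qid": "Q1000001",
    "org_qid": None,
    "start_date": "2019-12-12",
    "end_date": None,
    "electoral_district": "Q2000002",
    "parliamentary_term": None,
    "parliamentary_group": "Q3000003",
    "elected_in": None,
}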
1358
+ def _parse_time_value(self, time_str: str) -> Optional[str]:
1359
+ """
1360
+ Parse Wikidata time value to ISO date string.
1361
+
1362
+ Args:
1363
+ time_str: Wikidata time format like "+2020-01-15T00:00:00Z"
1364
+
1365
+ Returns:
1366
+ ISO date string (YYYY-MM-DD) or None
1367
+ """
1368
+ if not time_str:
1369
+ return None
1370
+ # Remove leading + and extract date part
1371
+ time_str = time_str.lstrip("+")
1372
+ if "T" in time_str:
1373
+ return time_str.split("T")[0]
1374
+ return None
1375
+
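# --- Illustrative sketch (editorial note, not part of the package diff) ---
# The same transformation _parse_time_value applies, shown on one sample:
#   "+2020-01-15T00:00:00Z" -> "2020-01-15", and "" -> None.
_sample = "+2020-01-15T00:00:00Z".lstrip("+")
assert _sample.split("T")[0] == "2020-01-15"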
+     def _classify_person_type(
+         self,
+         positions: list[dict],
+         occupations: list[str],
+     ) -> PersonType:
+         """
+         Determine PersonType from P39 positions and P106 occupations.
+
+         Priority order:
+         1. Check positions (more specific)
+         2. Check occupations
+         3. Default to UNKNOWN
+
+         Args:
+             positions: List of position dictionaries from _get_positions_with_org
+             occupations: List of occupation QIDs from P106
+
+         Returns:
+             Classified PersonType
+         """
+         # Check positions first (more specific)
+         for pos in positions:
+             pos_qid = pos.get("position_qid", "")
+             if pos_qid in EXECUTIVE_POSITION_QIDS:
+                 return PersonType.EXECUTIVE
+             if pos_qid in POLITICIAN_POSITION_QIDS:
+                 return PersonType.POLITICIAN
+
+         # Then check occupations
+         for occ in occupations:
+             if occ in OCCUPATION_TO_TYPE:
+                 return OCCUPATION_TO_TYPE[occ]
+
+         # Default
+         return PersonType.UNKNOWN
+
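# --- Illustrative sketch (editorial note, not part of the package diff) ---
# Classification order: positions win over occupations. Assuming the position
# QID below is a member of POLITICIAN_POSITION_QIDS (defined elsewhere in this
# module), the occupation list is never consulted:
#
#   positions   = [{"position_qid": "Q486839"}]   # member of parliament (assumed to be in the set)
#   occupations = ["Q82955"]                       # politician occupation, used only as a fallback
#   _classify_person_type(positions, occupations)  -> PersonType.POLITICIAN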
+     def _get_org_or_context(self, pos: dict) -> str:
+         """Get org QID from position, falling back to electoral district or parliamentary group."""
+         return (
+             pos.get("org_qid") or
+             pos.get("electoral_district") or
+             pos.get("parliamentary_group") or
+             ""
+         )
+
+     def _get_best_role_org(
+         self,
+         positions: list[dict],
+     ) -> tuple[str, str, str, Optional[str], Optional[str], dict]:
+         """
+         Select best position for role/org display.
+
+         Priority:
+         1. Positions with org/context and dates
+         2. Positions with org/context
+         3. Positions with dates
+         4. Any position
+
+         Args:
+             positions: List of position dictionaries
+
+         Returns:
+             Tuple of (role_qid, org_label, org_qid, start_date, end_date, extra_context)
+             Note: In dump mode, we return QIDs since we don't have labels
+             extra_context contains parliamentary metadata
+         """
+         def has_context(pos: dict) -> bool:
+             return bool(
+                 pos.get("org_qid") or
+                 pos.get("electoral_district") or
+                 pos.get("parliamentary_group")
+             )
+
+         def get_extra_context(pos: dict) -> dict:
+             return {
+                 k: v for k, v in {
+                     "electoral_district": pos.get("electoral_district"),
+                     "parliamentary_term": pos.get("parliamentary_term"),
+                     "parliamentary_group": pos.get("parliamentary_group"),
+                     "elected_in": pos.get("elected_in"),
+                 }.items() if v
+             }
+
+         # Priority 1: Position with org/context and dates
+         for pos in positions:
+             if has_context(pos) and (pos.get("start_date") or pos.get("end_date")):
+                 return (
+                     pos["position_qid"],
+                     "",
+                     self._get_org_or_context(pos),
+                     pos.get("start_date"),
+                     pos.get("end_date"),
+                     get_extra_context(pos),
+                 )
+
+         # Priority 2: Position with org/context
+         for pos in positions:
+             if has_context(pos):
+                 return (
+                     pos["position_qid"],
+                     "",
+                     self._get_org_or_context(pos),
+                     pos.get("start_date"),
+                     pos.get("end_date"),
+                     get_extra_context(pos),
+                 )
+
+         # Priority 3: Position with dates
+         for pos in positions:
+             if pos.get("start_date") or pos.get("end_date"):
+                 return (
+                     pos["position_qid"],
+                     "",
+                     self._get_org_or_context(pos),
+                     pos.get("start_date"),
+                     pos.get("end_date"),
+                     get_extra_context(pos),
+                 )
+
+         # Priority 4: Any position
+         if positions:
+             pos = positions[0]
+             return (
+                 pos["position_qid"],
+                 "",
+                 self._get_org_or_context(pos),
+                 pos.get("start_date"),
+                 pos.get("end_date"),
+                 get_extra_context(pos),
+             )
+
+         return "", "", "", None, None, {}
+
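# --- Illustrative sketch (editorial note, not part of the package diff) ---
# Priority selection: given one bare position and one with an org and a start
# date, the second entry is returned even though it is not first in the list
# (all QIDs are placeholders):
#
#   positions = [
#       {"position_qid": "Q1000001", "org_qid": None, "start_date": None, "end_date": None,
#        "electoral_district": None, "parliamentary_term": None,
#        "parliamentary_group": None, "elected_in": None},
#       {"position_qid": "Q1000002", "org_qid": "Q4000004", "start_date": "2015-08-10",
#        "end_date": None, "electoral_district": None, "parliamentary_term": None,
#        "parliamentary_group": None, "elected_in": None},
#   ]
#   _get_best_role_org(positions) -> ("Q1000002", "", "Q4000004", "2015-08-10", None, {})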
+     def _extract_person_data(self, entity: dict) -> Optional[PersonRecord]:
+         """
+         Extract PersonRecord from entity dict.
+
+         Derives type/role/org from claims.
+
+         Args:
+             entity: Parsed Wikidata entity dictionary
+
+         Returns:
+             PersonRecord or None if essential data is missing
+         """
+         qid = entity.get("id", "")
+         labels = entity.get("labels", {})
+         # Try English label first, fall back to any available label
+         label = labels.get("en", {}).get("value", "")
+         if not label:
+             # Try to get any label
+             for lang_data in labels.values():
+                 if isinstance(lang_data, dict) and lang_data.get("value"):
+                     label = lang_data["value"]
+                     break
+
+         if not label or not qid:
+             return None
+
+         claims = entity.get("claims", {})
+
+         # Get positions (P39) with qualifiers for org
+         positions = self._get_positions_with_org(claims)
+         # Get occupations (P106)
+         occupations = self._get_claim_values(entity, "P106")
+
+         # Classify person type from positions + occupations
+         person_type = self._classify_person_type(positions, occupations)
+
+         # Get best role/org/dates from positions
+         role_qid, _, org_qid, start_date, end_date, extra_context = self._get_best_role_org(positions)
+
+         # Get country (P27 - country of citizenship)
+         countries = self._get_claim_values(entity, "P27")
+         country_qid = countries[0] if countries else ""
+
+         # Resolve QIDs to labels using the cache (or track for later resolution)
+         country_label = self._resolve_qid(country_qid) if country_qid else ""
+         role_label = self._resolve_qid(role_qid) if role_qid else ""
+         org_label = self._resolve_qid(org_qid) if org_qid else ""
+
+         # Get birth and death dates (P569, P570)
+         birth_date = self._get_time_claim(claims, "P569")
+         death_date = self._get_time_claim(claims, "P570")
+
+         # Get description
+         descriptions = entity.get("descriptions", {})
+         description = descriptions.get("en", {}).get("value", "")
+
+         # Track discovered organization
+         if org_qid:
+             self._discovered_orgs[org_qid] = org_label
+
+         # Build record with all position metadata
+         record_data = {
+             "wikidata_id": qid,
+             "label": label,
+             "description": description,
+             "positions": [p["position_qid"] for p in positions],
+             "occupations": occupations,
+             "org_qid": org_qid,
+             "country_qid": country_qid,
+             "role_qid": role_qid,
+             "birth_date": birth_date,
+             "death_date": death_date,
+         }
+         # Add parliamentary context if present
+         if extra_context:
+             record_data.update(extra_context)
+
+         return PersonRecord(
+             name=label,
+             source="wikidata",
+             source_id=qid,
+             country=country_label,
+             person_type=person_type,
+             known_for_role=role_label,
+             known_for_org=org_label,
+             from_date=start_date,
+             to_date=end_date,
+             birth_date=birth_date,
+             death_date=death_date,
+             record=record_data,
+         )
+
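# --- Illustrative sketch (editorial note, not part of the package diff) ---
# The minimum entity shape _extract_person_data needs, and the PersonRecord
# fields the pieces feed into. All QIDs and values are placeholders.
_example_person_entity = {
    "id": "Q5000005",
    "labels": {"en": {"value": "Jane Example"}},
    "descriptions": {"en": {"value": "example politician"}},
    "claims": {
        "P31": [{"mainsnak": {"datavalue": {"value": {"id": "Q5"}}}}],          # human
        "P39": [{"mainsnak": {"datavalue": {"value": {"id": "Q1000001"}}},      # position held
                 "qualifiers": {"P580": [{"datavalue": {"value": {"time": "+2019-12-12T00:00:00Z"}}}]}}],
        "P106": [{"mainsnak": {"datavalue": {"value": {"id": "Q82955"}}}}],     # occupation
        "P27": [{"mainsnak": {"datavalue": {"value": {"id": "Q30"}}}}],         # citizenship -> country
        "P569": [{"mainsnak": {"datavalue": {"value": {"time": "+1970-01-01T00:00:00Z"}}}}],  # birth date
    },
}
# Resulting record (assuming the occupation QID maps to POLITICIAN):
#   name="Jane Example", source="wikidata", source_id="Q5000005",
#   from_date="2019-12-12", birth_date="1970-01-01",
#   country=label resolved for Q30 via the label cache / SPARQL,
#   record["positions"]=["Q1000001"], record["occupations"]=["Q82955"].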
+     def _extract_org_data(
+         self,
+         entity: dict,
+         entity_type: EntityType,
+     ) -> Optional[CompanyRecord]:
+         """
+         Extract CompanyRecord from entity dict.
+
+         Args:
+             entity: Parsed Wikidata entity dictionary
+             entity_type: Determined EntityType
+
+         Returns:
+             CompanyRecord or None if essential data is missing
+         """
+         qid = entity.get("id", "")
+         labels = entity.get("labels", {})
+         label = labels.get("en", {}).get("value", "")
+
+         if not label or not qid:
+             return None
+
+         claims = entity.get("claims", {})
+
+         # Get country (P17 - country)
+         countries = self._get_claim_values(entity, "P17")
+         country_qid = countries[0] if countries else ""
+
+         # Resolve country QID to label
+         country_label = self._resolve_qid(country_qid) if country_qid else ""
+
+         # Get LEI (P1278)
+         lei = self._get_string_claim(claims, "P1278")
+
+         # Get ticker (P249)
+         ticker = self._get_string_claim(claims, "P249")
+
+         # Get description
+         descriptions = entity.get("descriptions", {})
+         description = descriptions.get("en", {}).get("value", "")
+
+         # Get inception date (P571)
+         inception = self._get_time_claim(claims, "P571")
+
+         # Get dissolution date (P576)
+         dissolution = self._get_time_claim(claims, "P576")
+
+         return CompanyRecord(
+             name=label,
+             source="wikipedia",  # Use "wikipedia" per existing convention
+             source_id=qid,
+             region=country_label,
+             entity_type=entity_type,
+             from_date=inception,
+             to_date=dissolution,
+             record={
+                 "wikidata_id": qid,
+                 "label": label,
+                 "description": description,
+                 "lei": lei,
+                 "ticker": ticker,
+                 "country_qid": country_qid,
+             },
+         )
+
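# --- Illustrative sketch (editorial note, not part of the package diff) ---
# Claims read by _extract_org_data for a hypothetical company entity, and the
# CompanyRecord fields they populate. Identifiers below are placeholders.
_example_org_entity = {
    "id": "Q6000006",
    "labels": {"en": {"value": "Example Corp"}},
    "descriptions": {"en": {"value": "example company"}},
    "claims": {
        "P17": [{"mainsnak": {"datavalue": {"value": {"id": "Q30"}}}}],              # country -> region
        "P1278": [{"mainsnak": {"datavalue": {"value": "549300EXAMPLELEI0000"}}}],   # LEI -> record["lei"]
        "P249": [{"mainsnak": {"datavalue": {"value": "EXMP"}}}],                    # ticker -> record["ticker"]
        "P571": [{"mainsnak": {"datavalue": {"value": {"time": "+1998-09-04T00:00:00Z"}}}}],  # inception -> from_date
    },
}
# Expected: name="Example Corp", source="wikipedia", from_date="1998-09-04",
# region=label resolved for Q30, record["lei"]/record["ticker"] as above.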
+     def _get_string_claim(self, claims: dict, prop: str) -> str:
+         """
+         Get first string value for a property.
+
+         Args:
+             claims: Claims dictionary
+             prop: Property ID
+
+         Returns:
+             String value or empty string
+         """
+         for claim in claims.get(prop, []):
+             mainsnak = claim.get("mainsnak", {})
+             datavalue = mainsnak.get("datavalue", {})
+             value = datavalue.get("value")
+             if isinstance(value, str):
+                 return value
+         return ""
+
+     def _get_time_claim(self, claims: dict, prop: str) -> Optional[str]:
+         """
+         Get first time value for a property as ISO date string.
+
+         Args:
+             claims: Claims dictionary
+             prop: Property ID
+
+         Returns:
+             ISO date string (YYYY-MM-DD) or None
+         """
+         for claim in claims.get(prop, []):
+             mainsnak = claim.get("mainsnak", {})
+             datavalue = mainsnak.get("datavalue", {})
+             value = datavalue.get("value", {})
+             if isinstance(value, dict):
+                 time_str = value.get("time", "")
+                 # Format: +2020-01-15T00:00:00Z
+                 if time_str:
+                     # Remove leading + and extract date part
+                     time_str = time_str.lstrip("+")
+                     if "T" in time_str:
+                         return time_str.split("T")[0]
+         return None
+
+     def get_discovered_organizations(self) -> list[CompanyRecord]:
+         """
+         Get organizations discovered during the people import.
+
+         These are organizations associated with people (from P39 qualifiers such as P108, P642, and P1001)
+         that can be inserted into the organizations database if not already present.
+
+         Note: In dump mode, we only have QIDs, not labels.
+
+         Returns:
+             List of CompanyRecord objects for discovered organizations
+         """
+         records = []
+         for org_qid in self._discovered_orgs:
+             record = CompanyRecord(
+                 name=org_qid,  # Only have QID, not label
+                 source="wikipedia",
+                 source_id=org_qid,
+                 region="",
+                 entity_type=EntityType.BUSINESS,  # Default
+                 record={
+                     "wikidata_id": org_qid,
+                     "discovered_from": "people_import",
+                     "needs_label_resolution": True,
+                 },
+             )
+             records.append(record)
+         logger.info(f"Discovered {len(records)} organizations from people import")
+         return records
+
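# --- Illustrative usage sketch (editorial note, not part of the package diff) ---
# Assuming an importer instance `imp` that has already processed a dump, the
# organizations referenced by people come back as QID-only stubs that a caller
# can upsert and label later:
#
#   for stub in imp.get_discovered_organizations():
#       print(stub.source_id, stub.record["needs_label_resolution"])   # e.g. "Q6000006", True
#   imp.clear_discovered_organizations()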
+     def clear_discovered_organizations(self) -> None:
+         """Clear the discovered organizations cache."""
+         self._discovered_orgs.clear()
+
+     def get_unresolved_qids(self) -> set[str]:
+         """Get QIDs that need label resolution."""
+         return self._unresolved_qids.copy()
+
+     def get_label_cache(self) -> dict[str, str]:
+         """Get the label cache built during import."""
+         return self._label_cache.copy()
+
+     def set_label_cache(self, labels: dict[str, str]) -> None:
+         """
+         Set initial label cache from existing data (e.g., from database).
+
+         Args:
+             labels: Mapping of QID -> label to seed the cache
+         """
+         self._label_cache.update(labels)
+         logger.info(f"Seeded label cache with {len(labels)} existing labels")
+
+     def get_new_labels_since(self, known_qids: set[str]) -> dict[str, str]:
+         """
+         Get labels in the cache whose QIDs are not in a known set.
+
+         Args:
+             known_qids: Set of QIDs that were already known
+
+         Returns:
+             Dict of new QID -> label mappings
+         """
+         return {qid: label for qid, label in self._label_cache.items() if qid not in known_qids}
+
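# --- Illustrative usage sketch (editorial note, not part of the package diff) ---
# A seed-then-diff workflow, assuming an importer instance `imp` and hypothetical
# database helpers: preload known labels, run the import, persist only the delta.
#
#   known = load_qid_labels_from_db()            # hypothetical: {"Q30": "United States", ...}
#   imp.set_label_cache(known)
#   ...run the dump import...
#   save_qid_labels_to_db(imp.get_new_labels_since(set(known)))   # hypothetical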
+     def _cache_entity_label(self, entity: dict) -> None:
+         """
+         Cache the English label for an entity during dump processing.
+
+         This builds up a lookup table as we iterate through the dump,
+         so we can resolve QID references (countries, roles) to labels.
+         """
+         qid = entity.get("id", "")
+         if not qid:
+             return
+
+         labels = entity.get("labels", {})
+         en_label = labels.get("en", {}).get("value", "")
+         if en_label:
+             self._label_cache[qid] = en_label
+
+     def _resolve_qid(self, qid: str) -> str:
+         """
+         Resolve a QID to a label, using cache or SPARQL lookup.
+
+         Returns the label if found/resolved, otherwise returns the QID.
+         """
+         if not qid or not qid.startswith("Q"):
+             return qid
+
+         if qid in self._label_cache:
+             label = self._label_cache[qid]
+             logger.debug(f"Resolved QID (cache): {qid} -> {label}")
+             return label
+
+         # Not in cache - resolve via SPARQL immediately
+         label = self._resolve_single_qid_sparql(qid)
+         if label:
+             logger.info(f"Resolved QID (SPARQL): {qid} -> {label}")
+             self._label_cache[qid] = label
+             return label
+
+         # Track unresolved
+         if qid not in self._unresolved_qids:
+             logger.debug(f"Unresolved QID: {qid}")
+             self._unresolved_qids.add(qid)
+         return qid
+
+     def _resolve_single_qid_sparql(self, qid: str) -> Optional[str]:
+         """
+         Resolve a single QID to a label via SPARQL.
+
+         Args:
+             qid: Wikidata QID (e.g., 'Q30')
+
+         Returns:
+             Label string or None if not found
+         """
+         import json
+         import urllib.parse
+         import urllib.request
+
+         query = f"""
+         SELECT ?label WHERE {{
+           wd:{qid} rdfs:label ?label FILTER(LANG(?label) = "en") .
+         }}
+         LIMIT 1
+         """
+
+         try:
+             params = urllib.parse.urlencode({
+                 "query": query,
+                 "format": "json",
+             })
+             url = f"https://query.wikidata.org/sparql?{params}"
+
+             req = urllib.request.Request(
+                 url,
+                 headers={
+                     "Accept": "application/sparql-results+json",
+                     "User-Agent": "corp-extractor/1.0 (QID resolver)",
+                 }
+             )
+
+             with urllib.request.urlopen(req, timeout=10) as response:
+                 data = json.loads(response.read().decode("utf-8"))
+
+             bindings = data.get("results", {}).get("bindings", [])
+             if bindings:
+                 return bindings[0].get("label", {}).get("value")
+
+         except Exception as e:
+             logger.debug(f"SPARQL lookup failed for {qid}: {e}")
+
+         return None
+
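# --- Illustrative sketch (editorial note, not part of the package diff) ---
# The query produced above for a single QID, here Q30:
#
#   SELECT ?label WHERE {
#     wd:Q30 rdfs:label ?label FILTER(LANG(?label) = "en") .
#   }
#   LIMIT 1
#
# A hit binds ?label once in the SPARQL JSON results, e.g.
#   {"label": {"type": "literal", "xml:lang": "en", "value": "United States"}}.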
+     def resolve_qids_via_sparql(
+         self,
+         qids: Optional[set[str]] = None,
+         batch_size: int = 50,
+         delay_seconds: float = 1.0,
+     ) -> dict[str, str]:
+         """
+         Resolve QIDs to labels via Wikidata SPARQL queries.
+
+         This is used after import to resolve any QIDs that weren't found
+         in the dump (e.g., if the import was limited or the dump was incomplete).
+
+         Args:
+             qids: Set of QIDs to resolve (defaults to unresolved_qids)
+             batch_size: Number of QIDs per SPARQL query (default 50)
+             delay_seconds: Delay between queries to avoid rate limiting
+
+         Returns:
+             Dict mapping QID -> label for resolved QIDs
+         """
+         import json
+         import time
+         import urllib.parse
+         import urllib.request
+
+         if qids is None:
+             qids = self._unresolved_qids
+
+         if not qids:
+             return {}
+
+         resolved: dict[str, str] = {}
+         qid_list = list(qids)
+
+         logger.info(f"Resolving {len(qid_list)} QIDs via SPARQL...")
+
+         for i in range(0, len(qid_list), batch_size):
+             batch = qid_list[i:i + batch_size]
+
+             # Build VALUES clause
+             values = " ".join(f"wd:{qid}" for qid in batch)
+             query = f"""
+             SELECT ?item ?itemLabel WHERE {{
+               VALUES ?item {{ {values} }}
+               ?item rdfs:label ?itemLabel FILTER(LANG(?itemLabel) = "en") .
+             }}
+             """
+
+             try:
+                 params = urllib.parse.urlencode({
+                     "query": query,
+                     "format": "json",
+                 })
+                 url = f"https://query.wikidata.org/sparql?{params}"
+
+                 req = urllib.request.Request(
+                     url,
+                     headers={
+                         "Accept": "application/sparql-results+json",
+                         "User-Agent": "corp-extractor/1.0 (QID resolver)",
+                     }
+                 )
+
+                 with urllib.request.urlopen(req, timeout=60) as response:
+                     data = json.loads(response.read().decode("utf-8"))
+
+                 for binding in data.get("results", {}).get("bindings", []):
+                     item_uri = binding.get("item", {}).get("value", "")
+                     label = binding.get("itemLabel", {}).get("value", "")
+                     if item_uri and label:
+                         qid = item_uri.split("/")[-1]
+                         resolved[qid] = label
+                         self._label_cache[qid] = label
+
+                 logger.debug(f"Resolved batch {i // batch_size + 1}: {len(batch)} QIDs")
+
+             except Exception as e:
+                 logger.warning(f"SPARQL batch failed: {e}")
+
+             if i + batch_size < len(qid_list):
+                 time.sleep(delay_seconds)
+
+         # Update unresolved set
+         self._unresolved_qids -= set(resolved.keys())
+
+         logger.info(f"Resolved {len(resolved)} QIDs, {len(self._unresolved_qids)} remaining unresolved")
+         return resolved
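# --- Illustrative usage sketch (editorial note, not part of the package diff) ---
# Post-import resolution, assuming an importer instance `imp` with leftover
# unresolved QIDs. Each batch issues a VALUES query of the shape built above:
#
#   SELECT ?item ?itemLabel WHERE {
#     VALUES ?item { wd:Q30 wd:Q145 }
#     ?item rdfs:label ?itemLabel FILTER(LANG(?itemLabel) = "en") .
#   }
#
#   resolved = imp.resolve_qids_via_sparql(batch_size=50, delay_seconds=1.0)
#   print(len(resolved), "resolved;", len(imp.get_unresolved_qids()), "still unresolved")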