corp-extractor 0.9.0-py3-none-any.whl → 0.9.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +72 -11
  2. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +34 -27
  3. statement_extractor/cli.py +1317 -101
  4. statement_extractor/database/embeddings.py +45 -0
  5. statement_extractor/database/hub.py +86 -136
  6. statement_extractor/database/importers/__init__.py +10 -2
  7. statement_extractor/database/importers/companies_house.py +16 -2
  8. statement_extractor/database/importers/companies_house_officers.py +431 -0
  9. statement_extractor/database/importers/gleif.py +23 -0
  10. statement_extractor/database/importers/import_utils.py +264 -0
  11. statement_extractor/database/importers/sec_edgar.py +17 -0
  12. statement_extractor/database/importers/sec_form4.py +512 -0
  13. statement_extractor/database/importers/wikidata.py +151 -43
  14. statement_extractor/database/importers/wikidata_dump.py +2282 -0
  15. statement_extractor/database/importers/wikidata_people.py +867 -325
  16. statement_extractor/database/migrate_v2.py +852 -0
  17. statement_extractor/database/models.py +155 -7
  18. statement_extractor/database/schema_v2.py +409 -0
  19. statement_extractor/database/seed_data.py +359 -0
  20. statement_extractor/database/store.py +3449 -233
  21. statement_extractor/document/deduplicator.py +10 -12
  22. statement_extractor/extractor.py +1 -1
  23. statement_extractor/models/__init__.py +3 -2
  24. statement_extractor/models/statement.py +15 -17
  25. statement_extractor/models.py +1 -1
  26. statement_extractor/pipeline/context.py +5 -5
  27. statement_extractor/pipeline/orchestrator.py +12 -12
  28. statement_extractor/plugins/base.py +17 -17
  29. statement_extractor/plugins/extractors/gliner2.py +28 -28
  30. statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
  31. statement_extractor/plugins/qualifiers/person.py +120 -53
  32. statement_extractor/plugins/splitters/t5_gemma.py +35 -39
  33. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
  34. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
statement_extractor/database/importers/wikidata_dump.py
@@ -0,0 +1,2282 @@
1
+ """
2
+ Wikidata dump importer for people and organizations.
3
+
4
+ Uses the Wikidata JSON dump (~100GB compressed) to import:
5
+ 1. People: all humans (P31=Q5), optionally restricted to those with English Wikipedia articles
6
+ 2. Organizations: all entities whose P31 (instance of) matches a known organization type, with the same optional filter
7
+
8
+ This avoids SPARQL query timeouts that occur with large result sets.
9
+ The dump is processed line-by-line to minimize memory usage.
10
+
11
+ Dump format:
12
+ - File: `latest-all.json.bz2` (~100GB) or `.gz` (~150GB)
13
+ - Format: JSON array where each line is a separate entity (after first `[` line)
14
+ - Each line: `{"type":"item","id":"Q123","labels":{...},"claims":{...},"sitelinks":{...}},`
15
+ - Streaming: Read line-by-line, strip trailing comma, parse JSON
16
+
17
+ Resume support:
18
+ - Progress is tracked by entity index (count of entities processed)
19
+ - Progress can be saved to a JSON file and loaded on resume
20
+ - On resume, entities are skipped efficiently until reaching the saved position
21
+ """
22
+
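The streaming recipe described above (read a line, strip the trailing comma, parse the JSON) can be reproduced outside the importer in a few lines. A minimal sketch, assuming a dump has already been downloaded to the default cache location; the path is illustrative and the labels structure shown is the standard Wikidata dump layout, not anything specific to this package:

import bz2
import json
from pathlib import Path

def peek_dump(path: Path, limit: int = 3) -> None:
    """Print the ID and English label of the first few entities in a dump."""
    with bz2.open(path, "rt", encoding="utf-8") as f:
        seen = 0
        for raw in f:
            line = raw.strip().rstrip(",")      # strip the trailing comma noted above
            if line in ("[", "]") or not line:  # skip the array brackets
                continue
            entity = json.loads(line)
            label = entity.get("labels", {}).get("en", {}).get("value", "")
            print(entity.get("id"), label)
            seen += 1
            if seen >= limit:
                break

peek_dump(Path.home() / ".cache" / "corp-extractor" / "wikidata-latest-all.json.bz2")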
23
+ import bz2
24
+ import gzip
25
+ import json
26
+ import logging
27
+ import shutil
28
+ import subprocess
29
+ import urllib.request
30
+ from dataclasses import dataclass, field
31
+ from datetime import datetime
32
+ from pathlib import Path
33
+ from typing import Callable, Iterator, Optional
34
+
35
+ from ..models import CompanyRecord, EntityType, LocationRecord, PersonRecord, PersonType, SimplifiedLocationType
36
+
37
+ # Type alias for records that may be people, organizations, or locations
38
+ ImportRecord = PersonRecord | CompanyRecord | LocationRecord
39
+
40
+ logger = logging.getLogger(__name__)
41
+
42
+ # Wikidata dump URLs - mirrors for faster downloads
43
+ # Primary is Wikimedia (slow), alternatives may be faster
44
+ DUMP_MIRRORS = [
45
+ # Wikimedia Foundation (official, often slow)
46
+ "https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2",
47
+ # Academic Torrents mirror (if available) - typically faster
48
+ # Note: Check https://academictorrents.com/browse?search=wikidata for current links
49
+ ]
50
+
51
+ # Default URL (can be overridden)
52
+ DUMP_URL = DUMP_MIRRORS[0]
53
+
54
+ # For even faster downloads, users can:
55
+ # 1. Use a torrent client with the Academic Torrents magnet link
56
+ # 2. Download from a regional Wikimedia mirror
57
+ # 3. Use aria2c with multiple connections: aria2c -x 16 -s 16 <url>
58
+
59
+ # =============================================================================
60
+ # POSITION TO PERSON TYPE MAPPING (P39 - position held)
61
+ # =============================================================================
62
+
63
+ # Executive positions (P39 values)
64
+ EXECUTIVE_POSITION_QIDS = {
65
+ "Q484876", # CEO
66
+ "Q623279", # CFO
67
+ "Q1502675", # COO
68
+ "Q935019", # CTO
69
+ "Q1057716", # CIO
70
+ "Q2140589", # CMO
71
+ "Q1115042", # chairperson
72
+ "Q4720025", # board of directors member
73
+ "Q60432825", # chief human resources officer
74
+ "Q15967139", # chief compliance officer
75
+ "Q15729310", # chief risk officer
76
+ "Q47523568", # chief legal officer
77
+ "Q258557", # board chair
78
+ "Q114863313", # chief sustainability officer
79
+ "Q726114", # company president
80
+ "Q1372944", # managing director
81
+ "Q18918145", # chief commercial officer
82
+ "Q1057569", # chief strategy officer
83
+ "Q24058752", # chief product officer
84
+ "Q3578048", # vice president
85
+ "Q476675", # business executive (generic)
86
+ "Q5441744", # finance director
87
+ "Q4188234", # general manager
88
+ "Q38844673", # chief data officer
89
+ "Q97273203", # chief digital officer
90
+ "Q60715311", # chief growth officer
91
+ "Q3563879", # treasurer
92
+ "Q3505845", # corporate secretary
93
+ }
94
+
95
+ # Politician positions (P39 values)
96
+ # Includes heads of state/government, legislators, and local officials
97
+ POLITICIAN_POSITION_QIDS = {
98
+ # Heads of state/government
99
+ "Q30461", # president
100
+ "Q14212", # prime minister
101
+ "Q83307", # minister
102
+ "Q2285706", # head of government
103
+ "Q48352", # head of state
104
+ "Q116", # monarch
105
+ "Q382617", # governor
106
+ "Q212071", # mayor
107
+ "Q1553195", # deputy prime minister
108
+ "Q1670573", # cabinet minister
109
+ "Q13218630", # secretary of state
110
+ "Q581682", # vice president
111
+
112
+ # Legislators - national
113
+ "Q4175034", # legislator
114
+ "Q486839", # member of parliament
115
+ "Q193391", # member of national legislature
116
+ "Q484529", # member of congress
117
+ "Q1711695", # senator
118
+ "Q18941264", # member of the House of Representatives (US)
119
+ "Q16707842", # member of the House of Commons (UK)
120
+ "Q18015642", # member of the House of Lords (UK)
121
+ "Q17295570", # member of the Bundestag (Germany)
122
+ "Q27169", # member of the European Parliament
123
+ "Q64366569", # member of Dáil Éireann (Ireland)
124
+ "Q19823090", # member of the Riksdag (Sweden)
125
+ "Q18229048", # member of Sejm (Poland)
126
+ "Q21032547", # member of the National Assembly (France)
127
+ "Q64511800", # member of the Knesset (Israel)
128
+ "Q50393121", # member of the State Duma (Russia)
129
+ "Q18558055", # member of the Diet (Japan)
130
+ "Q109862831", # member of Lok Sabha (India)
131
+ "Q63078776", # member of the Canadian House of Commons
132
+ "Q83767637", # member of the Australian House of Representatives
133
+
134
+ # Legislators - regional/local
135
+ "Q4382506", # member of state legislature
136
+ "Q17765219", # member of regional parliament
137
+ "Q1752514", # councillor (local government)
138
+ "Q18824436", # city councillor
139
+
140
+ # Other political offices
141
+ "Q294414", # public office (generic)
142
+ "Q889821", # ambassador
143
+ "Q15966511", # diplomat
144
+ "Q334344", # lord lieutenant
145
+ "Q16533", # judge (some are appointed politicians)
146
+ "Q3099732", # ombudsman
147
+ "Q1500443", # prefect
148
+ "Q611644", # envoy
149
+ "Q2824523", # political commissar
150
+ }
151
+
152
+ # =============================================================================
153
+ # OCCUPATION TO PERSON TYPE MAPPING (P106 - occupation)
154
+ # =============================================================================
155
+
156
+ OCCUPATION_TO_TYPE: dict[str, PersonType] = {
157
+ # Politicians (elected officials)
158
+ "Q82955": PersonType.POLITICIAN, # politician
159
+ "Q193391": PersonType.POLITICIAN, # member of parliament
160
+ "Q372436": PersonType.POLITICIAN, # statesperson
161
+
162
+ # Government (civil servants, diplomats, appointed officials)
163
+ "Q212238": PersonType.GOVERNMENT, # civil servant
164
+ "Q806798": PersonType.GOVERNMENT, # diplomat
165
+ "Q15627169": PersonType.GOVERNMENT, # trade unionist (often govt-adjacent)
166
+
167
+ # Military
168
+ "Q189290": PersonType.MILITARY, # military officer
169
+ "Q47064": PersonType.MILITARY, # military personnel
170
+ "Q4991371": PersonType.MILITARY, # soldier
171
+ "Q10669499": PersonType.MILITARY, # naval officer
172
+ "Q11974939": PersonType.MILITARY, # air force officer
173
+ "Q10974448": PersonType.MILITARY, # army officer
174
+
175
+ # Legal professionals
176
+ "Q16533": PersonType.LEGAL, # judge
177
+ "Q40348": PersonType.LEGAL, # lawyer
178
+ "Q185351": PersonType.LEGAL, # jurist
179
+ "Q3242871": PersonType.LEGAL, # prosecutor
180
+ "Q1792450": PersonType.LEGAL, # barrister
181
+ "Q3406182": PersonType.LEGAL, # solicitor
182
+
183
+ # Athletes
184
+ "Q2066131": PersonType.ATHLETE, # athlete
185
+ "Q937857": PersonType.ATHLETE, # football player
186
+ "Q3665646": PersonType.ATHLETE, # basketball player
187
+ "Q10871364": PersonType.ATHLETE, # baseball player
188
+ "Q19204627": PersonType.ATHLETE, # ice hockey player
189
+ "Q10843402": PersonType.ATHLETE, # tennis player
190
+ "Q13381376": PersonType.ATHLETE, # golfer
191
+ "Q11338576": PersonType.ATHLETE, # boxer
192
+ "Q10873124": PersonType.ATHLETE, # swimmer
193
+ "Q11303721": PersonType.ATHLETE, # racing driver
194
+ "Q10833314": PersonType.ATHLETE, # cricket player
195
+ "Q13141064": PersonType.ATHLETE, # rugby player
196
+
197
+ # Artists (traditional creative professions)
198
+ "Q33999": PersonType.ARTIST, # actor
199
+ "Q177220": PersonType.ARTIST, # singer
200
+ "Q639669": PersonType.ARTIST, # musician
201
+ "Q2526255": PersonType.ARTIST, # film director
202
+ "Q36180": PersonType.ARTIST, # writer
203
+ "Q483501": PersonType.ARTIST, # artist
204
+ "Q488205": PersonType.ARTIST, # singer-songwriter
205
+ "Q753110": PersonType.ARTIST, # songwriter
206
+ "Q2405480": PersonType.ARTIST, # voice actor
207
+ "Q10800557": PersonType.ARTIST, # film actor
208
+ "Q3455803": PersonType.ARTIST, # director
209
+ "Q28389": PersonType.ARTIST, # screenwriter
210
+ "Q6625963": PersonType.ARTIST, # comedian
211
+ "Q2259451": PersonType.ARTIST, # stand-up comedian
212
+ "Q2490358": PersonType.ARTIST, # choreographer
213
+ "Q2722764": PersonType.ARTIST, # DJ (disc jockey)
214
+ "Q183945": PersonType.ARTIST, # record producer
215
+ "Q3282637": PersonType.ARTIST, # film producer
216
+ "Q49757": PersonType.ARTIST, # poet
217
+ "Q28640": PersonType.ARTIST, # illustrator
218
+ "Q1028181": PersonType.ARTIST, # painter
219
+ "Q1281618": PersonType.ARTIST, # sculptor
220
+ "Q33231": PersonType.ARTIST, # photographer
221
+ "Q806349": PersonType.ARTIST, # band leader
222
+ "Q855091": PersonType.ARTIST, # rapper
223
+ "Q4351403": PersonType.ARTIST, # novelist
224
+ "Q158852": PersonType.ARTIST, # conductor (music)
225
+ "Q486748": PersonType.ARTIST, # pianist
226
+ "Q1415090": PersonType.ARTIST, # guitarist
227
+
228
+ # Media (internet/social media personalities)
229
+ "Q6168364": PersonType.MEDIA, # YouTuber
230
+ "Q15077007": PersonType.MEDIA, # podcaster
231
+ "Q17125263": PersonType.MEDIA, # social media influencer
232
+ "Q15981151": PersonType.MEDIA, # internet celebrity
233
+ "Q2059704": PersonType.MEDIA, # television personality
234
+ "Q4610556": PersonType.MEDIA, # model
235
+ "Q578109": PersonType.MEDIA, # television producer
236
+ "Q2516866": PersonType.MEDIA, # publisher
237
+ "Q93191800": PersonType.MEDIA, # content creator
238
+ "Q105756498": PersonType.MEDIA, # streamer (Twitch etc.)
239
+
240
+ # Professionals (known for their profession/work)
241
+ "Q39631": PersonType.PROFESSIONAL, # physician/doctor
242
+ "Q774306": PersonType.PROFESSIONAL, # surgeon
243
+ "Q1234713": PersonType.PROFESSIONAL, # dentist
244
+ "Q15924224": PersonType.PROFESSIONAL, # psychiatrist
245
+ "Q212980": PersonType.PROFESSIONAL, # psychologist
246
+ "Q81096": PersonType.PROFESSIONAL, # engineer
247
+ "Q42603": PersonType.PROFESSIONAL, # priest/clergy
248
+ "Q432386": PersonType.PROFESSIONAL, # architect
249
+ "Q3621491": PersonType.PROFESSIONAL, # nurse
250
+ "Q18805": PersonType.PROFESSIONAL, # pharmacist
251
+ "Q15895020": PersonType.PROFESSIONAL, # veterinarian
252
+ "Q131512": PersonType.PROFESSIONAL, # chef
253
+ "Q3499072": PersonType.PROFESSIONAL, # pilot
254
+ "Q15895449": PersonType.PROFESSIONAL, # accountant
255
+ "Q806750": PersonType.PROFESSIONAL, # consultant
256
+ "Q584301": PersonType.PROFESSIONAL, # economist (often professional)
257
+ "Q1371925": PersonType.PROFESSIONAL, # real estate agent
258
+ "Q266569": PersonType.PROFESSIONAL, # librarian
259
+ "Q5323050": PersonType.PROFESSIONAL, # electrical engineer
260
+ "Q13582652": PersonType.PROFESSIONAL, # civil engineer
261
+ "Q81965": PersonType.PROFESSIONAL, # software engineer
262
+ "Q5482740": PersonType.PROFESSIONAL, # data scientist
263
+
264
+ # Academics
265
+ "Q121594": PersonType.ACADEMIC, # professor
266
+ "Q3400985": PersonType.ACADEMIC, # academic
267
+ "Q1622272": PersonType.ACADEMIC, # university professor
268
+
269
+ # Scientists
270
+ "Q901": PersonType.SCIENTIST, # scientist
271
+ "Q1650915": PersonType.SCIENTIST, # researcher
272
+ "Q169470": PersonType.SCIENTIST, # physicist
273
+ "Q593644": PersonType.SCIENTIST, # chemist
274
+ "Q864503": PersonType.SCIENTIST, # biologist
275
+ "Q11063": PersonType.SCIENTIST, # astronomer
276
+
277
+ # Journalists
278
+ "Q1930187": PersonType.JOURNALIST, # journalist
279
+ "Q13590141": PersonType.JOURNALIST, # news presenter
280
+ "Q947873": PersonType.JOURNALIST, # television presenter
281
+ "Q4263842": PersonType.JOURNALIST, # columnist
282
+
283
+ # Activists
284
+ "Q15253558": PersonType.ACTIVIST, # activist
285
+ "Q11631410": PersonType.ACTIVIST, # human rights activist
286
+ "Q18939491": PersonType.ACTIVIST, # environmental activist
287
+
288
+ # Entrepreneurs/Executives via occupation
289
+ "Q131524": PersonType.ENTREPRENEUR, # entrepreneur
290
+ "Q43845": PersonType.ENTREPRENEUR, # businessperson
291
+ }
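A hedged sketch of how a table like this is typically consulted: walk a person's P106 (occupation) claims and return the first QID that has a mapping. The helper below is illustrative and standalone; the module's own resolution logic lives in the methods further down this file. Import paths assume the package layout shown in the file list above.

from typing import Optional

from statement_extractor.database.importers.wikidata_dump import OCCUPATION_TO_TYPE
from statement_extractor.database.models import PersonType

def person_type_from_occupations(entity: dict) -> Optional[PersonType]:
    """First-match lookup of P106 (occupation) claims against OCCUPATION_TO_TYPE."""
    for claim in entity.get("claims", {}).get("P106", []):
        value = claim.get("mainsnak", {}).get("datavalue", {}).get("value", {})
        if isinstance(value, dict) and value.get("id") in OCCUPATION_TO_TYPE:
            return OCCUPATION_TO_TYPE[value["id"]]
    return None

# An entity whose first mapped occupation is Q1930187 (journalist):
example = {"claims": {"P106": [{"mainsnak": {"datavalue": {"value": {"id": "Q1930187"}}}}]}}
assert person_type_from_occupations(example) == PersonType.JOURNALIST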
292
+
293
+ # =============================================================================
294
+ # ORGANIZATION TYPE MAPPING (P31 - instance of)
295
+ # =============================================================================
296
+
297
+ ORG_TYPE_TO_ENTITY_TYPE: dict[str, EntityType] = {
298
+ # Business - core types
299
+ "Q4830453": EntityType.BUSINESS, # business
300
+ "Q6881511": EntityType.BUSINESS, # enterprise
301
+ "Q783794": EntityType.BUSINESS, # company
302
+ "Q891723": EntityType.BUSINESS, # public company
303
+ "Q167037": EntityType.BUSINESS, # corporation
304
+ "Q658255": EntityType.BUSINESS, # subsidiary
305
+ "Q206652": EntityType.BUSINESS, # conglomerate
306
+ "Q22687": EntityType.BUSINESS, # bank
307
+ "Q1145276": EntityType.BUSINESS, # insurance company
308
+ "Q46970": EntityType.BUSINESS, # airline
309
+ "Q613142": EntityType.BUSINESS, # law firm
310
+ "Q507619": EntityType.BUSINESS, # pharmaceutical company
311
+ "Q2979960": EntityType.BUSINESS, # technology company
312
+ "Q1631111": EntityType.BUSINESS, # retailer
313
+ "Q187652": EntityType.BUSINESS, # manufacturer
314
+ # Business - additional types
315
+ "Q43229": EntityType.BUSINESS, # organization (generic)
316
+ "Q4671277": EntityType.BUSINESS, # academic institution (some are businesses)
317
+ "Q1664720": EntityType.BUSINESS, # institute
318
+ "Q15911314": EntityType.BUSINESS, # association
319
+ "Q15925165": EntityType.BUSINESS, # private company
320
+ "Q5225895": EntityType.BUSINESS, # credit union
321
+ "Q161726": EntityType.BUSINESS, # multinational corporation
322
+ "Q134161": EntityType.BUSINESS, # joint venture
323
+ "Q1589009": EntityType.BUSINESS, # privately held company
324
+ "Q270791": EntityType.BUSINESS, # state-owned enterprise
325
+ "Q1762059": EntityType.BUSINESS, # online service provider
326
+ "Q17127659": EntityType.BUSINESS, # energy company
327
+ "Q2695280": EntityType.BUSINESS, # construction company
328
+ "Q1624464": EntityType.BUSINESS, # telecommunications company
329
+ "Q1668024": EntityType.BUSINESS, # car manufacturer
330
+ "Q3914": EntityType.BUSINESS, # school (some are businesses)
331
+ "Q1030034": EntityType.BUSINESS, # management consulting firm
332
+ "Q1370614": EntityType.BUSINESS, # investment bank
333
+ "Q1785271": EntityType.BUSINESS, # advertising agency
334
+ "Q4686042": EntityType.BUSINESS, # automotive supplier
335
+ "Q431289": EntityType.BUSINESS, # brand
336
+ "Q622438": EntityType.BUSINESS, # supermarket chain
337
+ "Q6500733": EntityType.BUSINESS, # licensed retailer
339
+ "Q1065118": EntityType.BUSINESS, # bookmaker
340
+ "Q179179": EntityType.BUSINESS, # startup
341
+ "Q210167": EntityType.BUSINESS, # video game developer
342
+ "Q18388277": EntityType.BUSINESS, # video game publisher
343
+ "Q1762913": EntityType.BUSINESS, # film production company
344
+ "Q18558478": EntityType.BUSINESS, # money services business
345
+ "Q6463968": EntityType.BUSINESS, # asset management company
346
+ "Q2864737": EntityType.BUSINESS, # cooperative bank
347
+ "Q161380": EntityType.BUSINESS, # cooperative
348
+ "Q15850590": EntityType.BUSINESS, # real estate company
349
+ "Q1048835": EntityType.BUSINESS, # political organization
350
+ "Q1254933": EntityType.BUSINESS, # astronomical observatory (often research orgs)
351
+ "Q294414": EntityType.BUSINESS, # public office
352
+
353
+ # Funds
354
+ "Q45400320": EntityType.FUND, # investment fund
355
+ "Q476028": EntityType.FUND, # hedge fund
356
+ "Q380649": EntityType.FUND, # investment company
357
+ "Q1377053": EntityType.FUND, # mutual fund
358
+ "Q3312546": EntityType.FUND, # private equity firm
359
+ "Q751705": EntityType.FUND, # venture capital firm
360
+ "Q2296920": EntityType.FUND, # sovereign wealth fund
361
+ "Q2824951": EntityType.FUND, # exchange-traded fund
362
+ "Q1755098": EntityType.FUND, # pension fund
363
+
364
+ # Nonprofits
365
+ "Q163740": EntityType.NONPROFIT, # nonprofit organization
366
+ "Q79913": EntityType.NGO, # non-governmental organization
367
+ "Q157031": EntityType.FOUNDATION, # foundation
368
+ "Q48204": EntityType.NONPROFIT, # voluntary association
369
+ "Q988108": EntityType.NONPROFIT, # club
370
+ "Q476436": EntityType.NONPROFIT, # charitable organization
371
+ "Q3591957": EntityType.NONPROFIT, # cultural institution
372
+ "Q162633": EntityType.NONPROFIT, # academy
373
+ "Q270791": EntityType.NONPROFIT, # learned society
374
+ "Q484652": EntityType.NONPROFIT, # international organization
375
+
376
+ # Government
377
+ "Q327333": EntityType.GOVERNMENT, # government agency
378
+ "Q7278": EntityType.POLITICAL_PARTY, # political party
379
+ "Q178790": EntityType.TRADE_UNION, # trade union
380
+ "Q7188": EntityType.GOVERNMENT, # government
381
+ "Q2659904": EntityType.GOVERNMENT, # government-owned corporation
382
+ "Q35798": EntityType.GOVERNMENT, # executive branch
383
+ "Q35749": EntityType.GOVERNMENT, # legislature
384
+ "Q12076836": EntityType.GOVERNMENT, # law enforcement agency
385
+ "Q17362920": EntityType.GOVERNMENT, # public body
386
+ "Q1063239": EntityType.GOVERNMENT, # regulatory agency
387
+ "Q3624078": EntityType.GOVERNMENT, # sovereign state
388
+ "Q133442": EntityType.GOVERNMENT, # embassy
389
+ "Q174834": EntityType.GOVERNMENT, # authority (government)
390
+
391
+ # International organizations
392
+ "Q484652": EntityType.INTERNATIONAL_ORG, # international organization
393
+ "Q1335818": EntityType.INTERNATIONAL_ORG, # supranational organisation
394
+ "Q1616075": EntityType.INTERNATIONAL_ORG, # intergovernmental organization
395
+
396
+ # Education/Research
397
+ "Q2385804": EntityType.EDUCATIONAL, # educational institution
398
+ "Q3918": EntityType.EDUCATIONAL, # university
399
+ "Q31855": EntityType.RESEARCH, # research institute
400
+ "Q875538": EntityType.EDUCATIONAL, # public university
401
+ "Q23002039": EntityType.EDUCATIONAL, # private university
402
+ "Q38723": EntityType.EDUCATIONAL, # higher education institution
403
+ "Q1371037": EntityType.EDUCATIONAL, # secondary school
404
+ "Q9842": EntityType.EDUCATIONAL, # primary school
405
+ "Q189004": EntityType.EDUCATIONAL, # college
406
+ "Q1188663": EntityType.EDUCATIONAL, # community college
407
+ "Q1321960": EntityType.RESEARCH, # think tank
409
+ "Q3354859": EntityType.RESEARCH, # observatory
410
+ "Q1298668": EntityType.RESEARCH, # research center
411
+
412
+ # Healthcare
413
+ "Q16917": EntityType.HEALTHCARE, # hospital
414
+ "Q1774898": EntityType.HEALTHCARE, # health care organization
415
+ "Q180958": EntityType.HEALTHCARE, # clinic
416
+ "Q4260475": EntityType.HEALTHCARE, # medical facility
417
+ "Q871964": EntityType.HEALTHCARE, # biotechnology company
418
+ "Q902104": EntityType.HEALTHCARE, # health insurance company
419
+
420
+ # Sports
421
+ "Q847017": EntityType.SPORTS, # sports club
422
+ "Q476068": EntityType.SPORTS, # sports team
423
+ "Q12973014": EntityType.SPORTS, # sports organization
424
+ "Q14350": EntityType.SPORTS, # association football club
425
+ "Q20639847": EntityType.SPORTS, # American football team
426
+ "Q13393265": EntityType.SPORTS, # basketball team
427
+ "Q13406463": EntityType.SPORTS, # baseball team
428
+ "Q1410877": EntityType.SPORTS, # ice hockey team
429
+ "Q18558301": EntityType.SPORTS, # rugby union club
430
+ "Q2093802": EntityType.SPORTS, # cricket team
431
+ "Q5137836": EntityType.SPORTS, # motorsport racing team
432
+
433
+ # Media
434
+ "Q18127": EntityType.MEDIA, # record label
435
+ "Q1366047": EntityType.MEDIA, # film studio
436
+ "Q1137109": EntityType.MEDIA, # video game company
437
+ "Q11032": EntityType.MEDIA, # newspaper
438
+ "Q1002697": EntityType.MEDIA, # periodical
439
+ "Q5398426": EntityType.MEDIA, # television series
440
+ "Q1110794": EntityType.MEDIA, # daily newspaper
441
+ "Q1616075": EntityType.MEDIA, # news agency
442
+ "Q14350": EntityType.MEDIA, # magazine
443
+ "Q15265344": EntityType.MEDIA, # broadcaster
444
+ "Q131436": EntityType.MEDIA, # radio station
445
+ "Q1616075": EntityType.MEDIA, # television station
446
+ "Q41298": EntityType.MEDIA, # magazine
447
+ "Q30022": EntityType.MEDIA, # television channel
448
+ "Q17232649": EntityType.MEDIA, # publishing company
449
+ "Q28803812": EntityType.MEDIA, # streaming service
450
+ "Q159334": EntityType.MEDIA, # entertainment company
451
+
452
+ # Religious
453
+ "Q9174": EntityType.RELIGIOUS, # religion
454
+ "Q1530022": EntityType.RELIGIOUS, # religious organization
455
+ "Q2994867": EntityType.RELIGIOUS, # religious community
456
+ "Q34651": EntityType.RELIGIOUS, # church (building as org)
457
+ "Q44613": EntityType.RELIGIOUS, # monastery
458
+ }
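Note that several QIDs still appear more than once in this literal (for example "Q270791", "Q1616075", and "Q14350"); in a Python dict literal the last assignment silently wins, so the earlier mappings for those keys are dead. A small ast-based sketch that flags repeated string keys in a module's dict literals (the file path is illustrative):

import ast
from collections import Counter
from pathlib import Path

def find_duplicate_dict_keys(source_path: str) -> dict[int, list[str]]:
    """Map the line number of each dict literal to any string keys it repeats."""
    tree = ast.parse(Path(source_path).read_text(encoding="utf-8"))
    duplicates: dict[int, list[str]] = {}
    for node in ast.walk(tree):
        if isinstance(node, ast.Dict):
            keys = [k.value for k in node.keys
                    if isinstance(k, ast.Constant) and isinstance(k.value, str)]
            repeated = [k for k, n in Counter(keys).items() if n > 1]
            if repeated:
                duplicates[node.lineno] = repeated
    return duplicates

print(find_duplicate_dict_keys("statement_extractor/database/importers/wikidata_dump.py"))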
459
+
460
+
461
+ # =============================================================================
462
+ # LOCATION TYPE MAPPING (P31 - instance of)
463
+ # Maps P31 QID -> (location_type_name, simplified_type)
464
+ # =============================================================================
465
+
466
+ LOCATION_TYPE_QIDS: dict[str, tuple[str, SimplifiedLocationType]] = {
467
+ # ==========================================================================
468
+ # IMPORTANT: The type names (first element of tuple) MUST match exactly
469
+ # the names in database/seed_data.py LOCATION_TYPES. Any new types need
470
+ # to be added there first, or use existing type names.
471
+ # ==========================================================================
472
+
473
+ # Continents (maps to: continent)
474
+ "Q5107": ("continent", SimplifiedLocationType.CONTINENT),
475
+
476
+ # Countries / Sovereign states (maps to: country, sovereign_state, dependent_territory)
477
+ "Q6256": ("country", SimplifiedLocationType.COUNTRY),
478
+ "Q3624078": ("sovereign_state", SimplifiedLocationType.COUNTRY),
479
+ "Q161243": ("dependent_territory", SimplifiedLocationType.COUNTRY),
480
+ # Additional country-like types -> map to country
481
+ "Q15634554": ("country", SimplifiedLocationType.COUNTRY), # state with limited recognition
482
+ "Q1763527": ("country", SimplifiedLocationType.COUNTRY), # constituent country
483
+ "Q46395": ("dependent_territory", SimplifiedLocationType.COUNTRY), # british overseas territory
484
+
485
+ # Subdivisions (states/provinces) - US
486
+ "Q35657": ("us_state", SimplifiedLocationType.SUBDIVISION),
487
+ "Q47168": ("us_county", SimplifiedLocationType.SUBDIVISION),
488
+
489
+ # Subdivisions - Country-specific
490
+ "Q5852411": ("state_of_australia", SimplifiedLocationType.SUBDIVISION),
491
+ "Q1221156": ("state_of_germany", SimplifiedLocationType.SUBDIVISION),
492
+ "Q131541": ("state_of_india", SimplifiedLocationType.SUBDIVISION),
493
+ "Q6465": ("department_france", SimplifiedLocationType.SUBDIVISION),
494
+ "Q50337": ("prefecture_japan", SimplifiedLocationType.SUBDIVISION),
495
+ "Q23058": ("canton_switzerland", SimplifiedLocationType.SUBDIVISION),
496
+ "Q10742": ("autonomous_community_spain", SimplifiedLocationType.SUBDIVISION),
497
+ "Q150093": ("voivodeship_poland", SimplifiedLocationType.SUBDIVISION),
498
+ "Q835714": ("oblast_russia", SimplifiedLocationType.SUBDIVISION),
499
+
500
+ # Subdivisions - Generic (map to existing types)
501
+ "Q34876": ("province", SimplifiedLocationType.SUBDIVISION),
502
+ "Q82794": ("region", SimplifiedLocationType.SUBDIVISION),
503
+ "Q28575": ("county", SimplifiedLocationType.SUBDIVISION),
504
+ # Additional generic subdivision types -> map to region/province/county
505
+ "Q10864048": ("region", SimplifiedLocationType.SUBDIVISION), # first-level admin
506
+ "Q11828004": ("county", SimplifiedLocationType.SUBDIVISION), # second-level admin
507
+ "Q12483": ("region", SimplifiedLocationType.SUBDIVISION), # territory
508
+ "Q515716": ("region", SimplifiedLocationType.SUBDIVISION), # region of Italy
509
+ "Q1132541": ("county", SimplifiedLocationType.SUBDIVISION), # county of Sweden
510
+ "Q1780990": ("region", SimplifiedLocationType.SUBDIVISION), # council area Scotland
511
+ "Q211690": ("county", SimplifiedLocationType.SUBDIVISION), # ceremonial county England
512
+ "Q180673": ("county", SimplifiedLocationType.SUBDIVISION), # ceremonial county
513
+ "Q1136601": ("county", SimplifiedLocationType.SUBDIVISION), # metropolitan county
514
+ "Q21451686": ("region", SimplifiedLocationType.SUBDIVISION), # region of England
515
+ "Q1006876": ("region", SimplifiedLocationType.SUBDIVISION), # unitary authority Wales
516
+ "Q179872": ("province", SimplifiedLocationType.SUBDIVISION), # province of Canada
517
+ "Q1352230": ("region", SimplifiedLocationType.SUBDIVISION), # territory of Canada
518
+ "Q13360155": ("province", SimplifiedLocationType.SUBDIVISION), # province of China
519
+ "Q842112": ("region", SimplifiedLocationType.SUBDIVISION), # autonomous region China
520
+ "Q1348006": ("municipality", SimplifiedLocationType.CITY), # municipality of China (city-level)
521
+ "Q11774097": ("city", SimplifiedLocationType.CITY), # prefecture-level city
522
+
523
+ # Cities/Towns/Municipalities (maps to: city, big_city, capital, town, municipality, village, hamlet)
524
+ "Q515": ("city", SimplifiedLocationType.CITY),
525
+ "Q1549591": ("big_city", SimplifiedLocationType.CITY),
526
+ "Q5119": ("capital", SimplifiedLocationType.CITY),
527
+ "Q3957": ("town", SimplifiedLocationType.CITY),
528
+ "Q15284": ("municipality", SimplifiedLocationType.CITY),
529
+ "Q532": ("village", SimplifiedLocationType.CITY),
530
+ "Q5084": ("hamlet", SimplifiedLocationType.CITY),
531
+ # Country-specific municipalities
532
+ "Q484170": ("commune_france", SimplifiedLocationType.CITY),
533
+ "Q262166": ("municipality_germany", SimplifiedLocationType.CITY),
534
+ "Q1054813": ("municipality_japan", SimplifiedLocationType.CITY),
535
+ # Additional city types -> map to city/town/village
536
+ "Q7930989": ("city", SimplifiedLocationType.CITY), # city of US
537
+ "Q200250": ("big_city", SimplifiedLocationType.CITY), # metropolis
538
+ "Q2264924": ("big_city", SimplifiedLocationType.CITY), # conurbation
539
+ "Q174844": ("big_city", SimplifiedLocationType.CITY), # megacity
540
+ "Q22865": ("city", SimplifiedLocationType.CITY), # independent city
541
+ "Q5153359": ("municipality", SimplifiedLocationType.CITY), # commune (generic)
542
+ "Q4286337": ("village", SimplifiedLocationType.CITY), # locality
543
+ "Q486972": ("village", SimplifiedLocationType.CITY), # human settlement
544
+ "Q95993392": ("city", SimplifiedLocationType.CITY), # city or town
545
+
546
+ # Districts (maps to: district, borough, neighborhood, ward)
547
+ "Q149621": ("district", SimplifiedLocationType.DISTRICT),
548
+ "Q5765681": ("borough", SimplifiedLocationType.DISTRICT),
549
+ "Q123705": ("neighborhood", SimplifiedLocationType.DISTRICT),
550
+ "Q12813115": ("ward", SimplifiedLocationType.DISTRICT),
551
+ # Additional district types -> map to district/borough
552
+ "Q2198484": ("borough", SimplifiedLocationType.DISTRICT), # borough of London
553
+ "Q667509": ("district", SimplifiedLocationType.DISTRICT), # arrondissement
554
+ "Q2100709": ("district", SimplifiedLocationType.DISTRICT), # city district
555
+
556
+ # Historic (maps to: former_country, ancient_civilization, historic_territory)
557
+ "Q3024240": ("former_country", SimplifiedLocationType.HISTORIC),
558
+ "Q28171280": ("ancient_civilization", SimplifiedLocationType.HISTORIC),
559
+ "Q1620908": ("historic_territory", SimplifiedLocationType.HISTORIC),
560
+ # Additional historic types
561
+ "Q19953632": ("historic_territory", SimplifiedLocationType.HISTORIC), # historical region
562
+ "Q1307214": ("historic_territory", SimplifiedLocationType.HISTORIC), # historical admin region
563
+ }
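The comment at the top of this table states an invariant (every type name must exist in database/seed_data.py LOCATION_TYPES) that is easy to check mechanically. A minimal sketch, assuming seed_data.LOCATION_TYPES is an iterable whose entries are plain name strings, dicts with a "name" key, or objects with a .name attribute; its actual shape is not visible in this diff:

from statement_extractor.database import seed_data
from statement_extractor.database.importers.wikidata_dump import LOCATION_TYPE_QIDS

# Assumption: entries are name strings, dicts with a "name" key, or objects with .name
known_names = set()
for entry in seed_data.LOCATION_TYPES:
    if isinstance(entry, str):
        known_names.add(entry)
    elif isinstance(entry, dict):
        known_names.add(entry.get("name"))
    else:
        known_names.add(getattr(entry, "name", None))

unknown = {name for name, _simplified in LOCATION_TYPE_QIDS.values() if name not in known_names}
if unknown:
    raise SystemExit(f"LOCATION_TYPE_QIDS uses type names missing from seed_data: {sorted(unknown)}")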
564
+
565
+
566
+ # =============================================================================
567
+ # PROGRESS TRACKING
568
+ # =============================================================================
569
+
570
+ DEFAULT_PROGRESS_PATH = Path.home() / ".cache" / "corp-extractor" / "wikidata-dump-progress.json"
571
+
572
+
573
+ @dataclass
574
+ class DumpProgress:
575
+ """
576
+ Tracks progress through the Wikidata dump file for resume support.
577
+
578
+ Progress is tracked by entity index (number of entities processed).
579
+ On resume, entities are skipped until reaching the saved position.
580
+ """
581
+ # Entity index - number of entities yielded from the dump
582
+ entity_index: int = 0
583
+
584
+ # Separate counters for people and orgs import
585
+ people_yielded: int = 0
586
+ orgs_yielded: int = 0
587
+
588
+ # Last entity ID processed (for verification)
589
+ last_entity_id: str = ""
590
+
591
+ # Timestamp of last update
592
+ last_updated: str = field(default_factory=lambda: datetime.now().isoformat())
593
+
594
+ # Dump file path (to detect if dump changed)
595
+ dump_path: str = ""
596
+
597
+ # Dump file size (to detect if dump changed)
598
+ dump_size: int = 0
599
+
600
+ def save(self, path: Optional[Path] = None) -> None:
601
+ """Save progress to JSON file."""
602
+ path = path or DEFAULT_PROGRESS_PATH
603
+ path.parent.mkdir(parents=True, exist_ok=True)
604
+ self.last_updated = datetime.now().isoformat()
605
+ with open(path, "w") as f:
606
+ json.dump({
607
+ "entity_index": self.entity_index,
608
+ "people_yielded": self.people_yielded,
609
+ "orgs_yielded": self.orgs_yielded,
610
+ "last_entity_id": self.last_entity_id,
611
+ "last_updated": self.last_updated,
612
+ "dump_path": self.dump_path,
613
+ "dump_size": self.dump_size,
614
+ }, f, indent=2)
615
+ logger.debug(f"Saved progress: entity_index={self.entity_index}, last_id={self.last_entity_id}")
616
+
617
+ @classmethod
618
+ def load(cls, path: Optional[Path] = None) -> Optional["DumpProgress"]:
619
+ """Load progress from JSON file, returns None if not found."""
620
+ path = path or DEFAULT_PROGRESS_PATH
621
+ if not path.exists():
622
+ return None
623
+ try:
624
+ with open(path) as f:
625
+ data = json.load(f)
626
+ return cls(
627
+ entity_index=data.get("entity_index", 0),
628
+ people_yielded=data.get("people_yielded", 0),
629
+ orgs_yielded=data.get("orgs_yielded", 0),
630
+ last_entity_id=data.get("last_entity_id", ""),
631
+ last_updated=data.get("last_updated", ""),
632
+ dump_path=data.get("dump_path", ""),
633
+ dump_size=data.get("dump_size", 0),
634
+ )
635
+ except (json.JSONDecodeError, KeyError, TypeError) as e:
636
+ logger.warning(f"Failed to load progress from {path}: {e}")
637
+ return None
638
+
639
+ @classmethod
640
+ def clear(cls, path: Optional[Path] = None) -> None:
641
+ """Delete the progress file."""
642
+ path = path or DEFAULT_PROGRESS_PATH
643
+ if path.exists():
644
+ path.unlink()
645
+ logger.info(f"Cleared progress file: {path}")
646
+
647
+ def matches_dump(self, dump_path: Path) -> bool:
648
+ """Check if this progress matches the given dump file."""
649
+ if str(dump_path) != self.dump_path:
650
+ return False
651
+ if dump_path.exists() and dump_path.stat().st_size != self.dump_size:
652
+ return False
653
+ return True
654
+
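Taken together, DumpProgress and the importer defined below support a save-and-resume loop along these lines. A sketch, assuming the dump file has already been downloaded; the save cadence and the placeholder loop body are illustrative:

from statement_extractor.database.importers.wikidata_dump import DumpProgress, WikidataDumpImporter

importer = WikidataDumpImporter()
dump_path = importer.get_dump_path()   # ~/.cache/corp-extractor/wikidata-latest-all.json.bz2

progress = DumpProgress.load()
if progress is None or not progress.matches_dump(dump_path):
    progress = DumpProgress(dump_path=str(dump_path), dump_size=dump_path.stat().st_size)
start = progress.entity_index

def on_record(entity_index: int, entity_id: str, records_yielded: int) -> None:
    progress.entity_index = entity_index
    progress.last_entity_id = entity_id
    progress.people_yielded = records_yielded
    if records_yielded % 10_000 == 0:   # save cadence is illustrative
        progress.save()

for person in importer.import_people(dump_path, start_index=start, progress_callback=on_record):
    ...   # hand each PersonRecord to the database store

progress.save()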
655
+
656
+ class WikidataDumpImporter:
657
+ """
658
+ Stream Wikidata JSON dump to extract people and organization records.
659
+
660
+ This importer processes the Wikidata dump line-by-line to avoid memory issues
661
+ with the ~100GB compressed file. It filters for:
662
+ - Humans (P31=Q5), optionally restricted to those with English Wikipedia articles
663
+ - Organizations whose P31 (instance of) matches a known organization type, with the same optional filter
664
+
665
+ The dump URL can be customized, and the importer supports both .bz2 and .gz
666
+ compression formats.
667
+ """
668
+
669
+ def __init__(self, dump_path: Optional[str] = None):
670
+ """
671
+ Initialize the dump importer.
672
+
673
+ Args:
674
+ dump_path: Optional path to a pre-downloaded dump file.
675
+ If not provided, will need to call download_dump() first.
676
+ """
677
+ self._dump_path = Path(dump_path) if dump_path else None
678
+ # Track discovered organizations from people import
679
+ self._discovered_orgs: dict[str, str] = {}
680
+ # Track QIDs that need label resolution (country, role)
681
+ self._unresolved_qids: set[str] = set()
682
+ # Label cache built during dump processing
683
+ self._label_cache: dict[str, str] = {}
684
+
685
+ def download_dump(
686
+ self,
687
+ target_dir: Optional[Path] = None,
688
+ force: bool = False,
689
+ progress_callback: Optional[Callable[[int, int], None]] = None,
690
+ use_aria2: bool = True,
691
+ aria2_connections: int = 16,
692
+ ) -> Path:
693
+ """
694
+ Download the latest Wikidata dump with progress indicator.
695
+
696
+ For fastest downloads, uses aria2c if available (16 parallel connections).
697
+ Falls back to urllib if aria2c is not installed.
698
+
699
+ Args:
700
+ target_dir: Directory to save the dump (default: ~/.cache/corp-extractor)
701
+ force: Force re-download even if file exists
702
+ progress_callback: Optional callback(downloaded_bytes, total_bytes) for progress
703
+ use_aria2: Try to use aria2c for faster downloads (default: True)
704
+ aria2_connections: Number of connections for aria2c (default: 16)
705
+
706
+ Returns:
707
+ Path to the downloaded dump file
708
+ """
709
+ if target_dir is None:
710
+ target_dir = Path.home() / ".cache" / "corp-extractor"
711
+
712
+ target_dir.mkdir(parents=True, exist_ok=True)
713
+ dump_path = target_dir / "wikidata-latest-all.json.bz2"
714
+
715
+ if dump_path.exists() and not force:
716
+ logger.info(f"Using cached dump at {dump_path}")
717
+ self._dump_path = dump_path
718
+ return dump_path
719
+
720
+ logger.info(f"Target: {dump_path}")
721
+
722
+ # Try aria2c first for much faster downloads
723
+ if use_aria2 and shutil.which("aria2c"):
724
+ logger.info("Using aria2c for fast parallel download...")
725
+ try:
726
+ self._download_with_aria2(dump_path, connections=aria2_connections)
727
+ self._dump_path = dump_path
728
+ return dump_path
729
+ except Exception as e:
730
+ logger.warning(f"aria2c download failed: {e}, falling back to urllib")
731
+
732
+ # Fallback to urllib
733
+ logger.info(f"Downloading Wikidata dump from {DUMP_URL}...")
734
+ logger.info("TIP: Install aria2c for 10-20x faster downloads: brew install aria2")
735
+ logger.info("This is a large file (~100GB) and will take significant time.")
736
+
737
+ # Stream download with progress
738
+ req = urllib.request.Request(
739
+ DUMP_URL,
740
+ headers={"User-Agent": "corp-extractor/1.0 (Wikidata dump importer)"}
741
+ )
742
+
743
+ with urllib.request.urlopen(req) as response:
744
+ total = int(response.headers.get("content-length", 0))
745
+ total_gb = total / (1024 ** 3) if total else 0
746
+
747
+ with open(dump_path, "wb") as f:
748
+ downloaded = 0
749
+ chunk_size = 8 * 1024 * 1024 # 8MB chunks
750
+ last_log_pct = 0
751
+
752
+ while True:
753
+ chunk = response.read(chunk_size)
754
+ if not chunk:
755
+ break
756
+ f.write(chunk)
757
+ downloaded += len(chunk)
758
+
759
+ # Call progress callback if provided
760
+ if progress_callback:
761
+ progress_callback(downloaded, total)
762
+ else:
763
+ # Default logging (every 1%)
764
+ if total:
765
+ pct = int((downloaded / total) * 100)
766
+ if pct > last_log_pct:
767
+ downloaded_gb = downloaded / (1024 ** 3)
768
+ logger.info(f"Downloaded {downloaded_gb:.1f}GB / {total_gb:.1f}GB ({pct}%)")
769
+ last_log_pct = pct
770
+ elif downloaded % (1024 ** 3) < chunk_size:
771
+ # Log every GB if total unknown
772
+ downloaded_gb = downloaded / (1024 ** 3)
773
+ logger.info(f"Downloaded {downloaded_gb:.1f}GB")
774
+
775
+ logger.info(f"Download complete: {dump_path}")
776
+ self._dump_path = dump_path
777
+ return dump_path
778
+
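Because the callback above receives raw byte counts, a caller can render its own progress display. A short sketch, forcing the urllib path since the callback is only invoked on that fallback, as the code above shows:

from pathlib import Path

from statement_extractor.database.importers.wikidata_dump import WikidataDumpImporter

def show_progress(downloaded_bytes: int, total_bytes: int) -> None:
    if total_bytes:
        print(f"\r{downloaded_bytes / total_bytes:6.1%}", end="", flush=True)
    else:
        print(f"\r{downloaded_bytes / 1024**3:.1f} GB", end="", flush=True)

importer = WikidataDumpImporter()
dump = importer.download_dump(
    target_dir=Path.home() / ".cache" / "corp-extractor",  # the default location
    progress_callback=show_progress,
    use_aria2=False,   # aria2c downloads report progress via their own console output instead
)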
779
+ def _download_with_aria2(
780
+ self,
781
+ output_path: Path,
782
+ connections: int = 16,
783
+ ) -> None:
784
+ """
785
+ Download using aria2c with multiple parallel connections.
786
+
787
+ aria2c can achieve 10-20x faster downloads by using multiple
788
+ connections to the server.
789
+
790
+ Args:
791
+ output_path: Where to save the downloaded file
792
+ connections: Number of parallel connections (default: 16)
793
+ """
794
+ cmd = [
795
+ "aria2c",
796
+ "-x", str(connections), # Max connections per server
797
+ "-s", str(connections), # Split file into N parts
798
+ "-k", "10M", # Min split size
799
+ "--file-allocation=none", # Faster on SSDs
800
+ "-d", str(output_path.parent),
801
+ "-o", output_path.name,
802
+ "--console-log-level=notice",
803
+ "--summary-interval=10",
804
+ DUMP_URL,
805
+ ]
806
+
807
+ logger.info(f"Running: {' '.join(cmd)}")
808
+
809
+ # Run aria2c and stream output
810
+ process = subprocess.Popen(
811
+ cmd,
812
+ stdout=subprocess.PIPE,
813
+ stderr=subprocess.STDOUT,
814
+ text=True,
815
+ )
816
+
817
+ # Stream output to logger
818
+ if process.stdout:
819
+ for line in process.stdout:
820
+ line = line.strip()
821
+ if line:
822
+ logger.info(f"aria2c: {line}")
823
+
824
+ return_code = process.wait()
825
+ if return_code != 0:
826
+ raise RuntimeError(f"aria2c exited with code {return_code}")
827
+
828
+ def get_dump_path(self, target_dir: Optional[Path] = None) -> Path:
829
+ """
830
+ Get the path where the dump would be/is downloaded.
831
+
832
+ Args:
833
+ target_dir: Directory for the dump (default: ~/.cache/corp-extractor)
834
+
835
+ Returns:
836
+ Path to the dump file location
837
+ """
838
+ if target_dir is None:
839
+ target_dir = Path.home() / ".cache" / "corp-extractor"
840
+ return target_dir / "wikidata-latest-all.json.bz2"
841
+
842
+ def iter_entities(
843
+ self,
844
+ dump_path: Optional[Path] = None,
845
+ start_index: int = 0,
846
+ progress_callback: Optional[Callable[[int, str], None]] = None,
847
+ ) -> Iterator[dict]:
848
+ """
849
+ Stream entities from dump file, one at a time.
850
+
851
+ Handles the Wikidata JSON dump format where each line after the opening
852
+ bracket is a JSON object with a trailing comma (except the last).
853
+
854
+ Args:
855
+ dump_path: Path to dump file (uses self._dump_path if not provided)
856
+ start_index: Entity index to start yielding from (default 0). Entities
857
+ before this index are skipped but still cached for label lookups.
858
+ progress_callback: Optional callback(entity_index, entity_id) called for each
859
+ yielded entity. Useful for tracking progress.
860
+
861
+ Yields:
862
+ Parsed entity dictionaries
863
+ """
864
+ path = dump_path or self._dump_path
865
+ if path is None:
866
+ raise ValueError("No dump path provided. Call download_dump() first or pass dump_path.")
867
+
868
+ path = Path(path)
869
+
870
+ # Select opener based on extension
871
+ if path.suffix == ".bz2":
872
+ opener = bz2.open
873
+ elif path.suffix == ".gz":
874
+ opener = gzip.open
875
+ else:
876
+ # Assume uncompressed
877
+ opener = open
878
+
879
+ logger.info(f"Opening dump file: {path}")
880
+ logger.info(f"File size: {path.stat().st_size / (1024**3):.1f} GB")
881
+ if start_index > 0:
882
+ logger.info(f"Resuming from entity index {start_index:,} (skipping earlier entities)")
883
+ logger.info("Starting to read dump (bz2 decompression is slow, please wait)...")
884
+
885
+ with opener(path, "rt", encoding="utf-8") as f:
886
+ logger.info("Dump file opened successfully, reading lines...")
887
+ line_count = 0
888
+ entity_count = 0
889
+ skipped_count = 0
890
+ # Log more frequently at start, then reduce frequency
891
+ next_log_threshold = 10_000
892
+
893
+ for line in f:
894
+ line_count += 1
895
+
896
+ # Log first few lines to show we're making progress
897
+ if line_count <= 5:
898
+ logger.info(f"Read line {line_count} ({len(line)} chars)")
899
+ elif line_count == 100:
900
+ logger.info(f"Read {line_count} lines...")
901
+ elif line_count == 1000:
902
+ logger.info(f"Read {line_count} lines...")
903
+
904
+ line = line.strip()
905
+
906
+ # Skip array brackets
907
+ if line in ("[", "]"):
908
+ continue
909
+
910
+ # Strip trailing comma
911
+ if line.endswith(","):
912
+ line = line[:-1]
913
+
914
+ if not line:
915
+ continue
916
+
917
+ try:
918
+ entity = json.loads(line)
919
+ entity_id = entity.get("id", "")
920
+
921
+ # Always cache label for QID lookups (even when skipping)
922
+ self._cache_entity_label(entity)
923
+
924
+ # Check if we should skip this entity (resuming)
925
+ if entity_count < start_index:
926
+ entity_count += 1
927
+ skipped_count += 1
928
+ # Log skipping progress with adaptive frequency
929
+ if skipped_count >= next_log_threshold:
930
+ pct = 100 * skipped_count / start_index if start_index > 0 else 0
931
+ logger.info(
932
+ f"Skipping... {skipped_count:,}/{start_index:,} entities "
933
+ f"({pct:.1f}%), label cache: {len(self._label_cache):,}"
934
+ )
935
+ # Increase threshold: 10K -> 100K -> 1M
936
+ if next_log_threshold < 100_000:
937
+ next_log_threshold = 100_000
938
+ elif next_log_threshold < 1_000_000:
939
+ next_log_threshold = 1_000_000
940
+ else:
941
+ next_log_threshold += 1_000_000
942
+ continue
943
+
944
+ entity_count += 1
945
+
946
+ # Log progress with adaptive frequency
947
+ if entity_count >= next_log_threshold:
948
+ logger.info(
949
+ f"Processed {entity_count:,} entities, "
950
+ f"label cache: {len(self._label_cache):,}, "
951
+ f"unresolved QIDs: {len(self._unresolved_qids):,}"
952
+ )
953
+ # Increase threshold: 10K -> 100K -> 1M -> 2M -> 3M...
954
+ if next_log_threshold < 100_000:
955
+ next_log_threshold = 100_000
956
+ elif next_log_threshold < 1_000_000:
957
+ next_log_threshold = 1_000_000
958
+ else:
959
+ next_log_threshold += 1_000_000
960
+
961
+ # Call progress callback if provided
962
+ if progress_callback:
963
+ progress_callback(entity_count, entity_id)
964
+
965
+ yield entity
966
+
967
+ except json.JSONDecodeError as e:
968
+ logger.debug(f"Line {line_count}: JSON decode error: {e}")
969
+ continue
970
+
971
+ def import_people(
972
+ self,
973
+ dump_path: Optional[Path] = None,
974
+ limit: Optional[int] = None,
975
+ require_enwiki: bool = False,
976
+ skip_ids: Optional[set[str]] = None,
977
+ start_index: int = 0,
978
+ progress_callback: Optional[Callable[[int, str, int], None]] = None,
979
+ ) -> Iterator[PersonRecord]:
980
+ """
981
+ Stream through dump, yielding ALL people (humans with P31=Q5).
982
+
983
+ This method filters the dump for:
984
+ - Items with type "item" (not properties)
985
+ - Humans (P31 contains Q5)
986
+ - Optionally: Has English Wikipedia article (enwiki sitelink)
987
+
988
+ PersonType is derived from positions (P39) and occupations (P106).
989
+ Parliamentary context (electoral district, term, party) is extracted from P39 qualifiers.
990
+
991
+ Args:
992
+ dump_path: Path to dump file (uses self._dump_path if not provided)
993
+ limit: Optional maximum number of records to return
994
+ require_enwiki: If True, only include people with English Wikipedia articles
995
+ skip_ids: Optional set of source_ids (Q codes) to skip. Checked early before
996
+ full processing to avoid unnecessary QID resolution.
997
+ start_index: Entity index to start from (for resume support). Entities
998
+ before this index are skipped but labels are still cached.
999
+ progress_callback: Optional callback(entity_index, entity_id, records_yielded)
1000
+ called for each yielded record. Useful for saving progress.
1001
+
1002
+ Yields:
1003
+ PersonRecord for each qualifying person
1004
+ """
1005
+ path = dump_path or self._dump_path
1006
+ count = 0
1007
+ skipped = 0
1008
+ current_entity_index = start_index
1009
+
1010
+ logger.info("Starting people import from Wikidata dump...")
1011
+ if start_index > 0:
1012
+ logger.info(f"Resuming from entity index {start_index:,}")
1013
+ if not require_enwiki:
1014
+ logger.info("Importing ALL humans (no enwiki filter)")
1015
+ if skip_ids:
1016
+ logger.info(f"Skipping {len(skip_ids):,} existing Q codes")
1017
+
1018
+ def track_entity(entity_index: int, entity_id: str) -> None:
1019
+ nonlocal current_entity_index
1020
+ current_entity_index = entity_index
1021
+
1022
+ for entity in self.iter_entities(path, start_index=start_index, progress_callback=track_entity):
1023
+ if limit and count >= limit:
1024
+ break
1025
+
1026
+ # Check skip_ids early, before full processing (avoids QID resolution)
1027
+ entity_id = entity.get("id", "")
1028
+ if skip_ids and entity_id in skip_ids:
1029
+ skipped += 1
1030
+ continue
1031
+
1032
+ record = self._process_person_entity(entity, require_enwiki=require_enwiki)
1033
+ if record:
1034
+ count += 1
1035
+ if count % 10_000 == 0:
1036
+ logger.info(f"Yielded {count:,} people records (skipped {skipped:,})...")
1037
+
1038
+ # Call progress callback with current position
1039
+ if progress_callback:
1040
+ progress_callback(current_entity_index, entity_id, count)
1041
+
1042
+ yield record
1043
+
1044
+ logger.info(f"People import complete: {count:,} records (skipped {skipped:,})")
1045
+
1046
+ def import_organizations(
1047
+ self,
1048
+ dump_path: Optional[Path] = None,
1049
+ limit: Optional[int] = None,
1050
+ require_enwiki: bool = False,
1051
+ skip_ids: Optional[set[str]] = None,
1052
+ start_index: int = 0,
1053
+ progress_callback: Optional[Callable[[int, str, int], None]] = None,
1054
+ ) -> Iterator[CompanyRecord]:
1055
+ """
1056
+ Stream through dump, yielding organizations.
1057
+
1058
+ This method filters the dump for:
1059
+ - Items with type "item"
1060
+ - Has P31 (instance of) matching an organization type
1061
+ - Optionally: Has English Wikipedia article (enwiki sitelink)
1062
+
1063
+ Args:
1064
+ dump_path: Path to dump file (uses self._dump_path if not provided)
1065
+ limit: Optional maximum number of records to return
1066
+ require_enwiki: If True, only include orgs with English Wikipedia articles
1067
+ skip_ids: Optional set of source_ids (Q codes) to skip. Checked early before
1068
+ full processing to avoid unnecessary QID resolution.
1069
+ start_index: Entity index to start from (for resume support). Entities
1070
+ before this index are skipped but labels are still cached.
1071
+ progress_callback: Optional callback(entity_index, entity_id, records_yielded)
1072
+ called for each yielded record. Useful for saving progress.
1073
+
1074
+ Yields:
1075
+ CompanyRecord for each qualifying organization
1076
+ """
1077
+ path = dump_path or self._dump_path
1078
+ count = 0
1079
+ skipped_existing = 0
1080
+ skipped_no_type = 0
1081
+ skipped_no_enwiki = 0
1082
+ skipped_no_label = 0
1083
+ current_entity_index = start_index
1084
+
1085
+ logger.info("Starting organization import from Wikidata dump...")
1086
+ if start_index > 0:
1087
+ logger.info(f"Resuming from entity index {start_index:,}")
1088
+ if not require_enwiki:
1089
+ logger.info("Importing ALL organizations (no enwiki filter)")
1090
+ if skip_ids:
1091
+ logger.info(f"Skipping {len(skip_ids):,} existing Q codes")
1092
+
1093
+ def track_entity(entity_index: int, entity_id: str) -> None:
1094
+ nonlocal current_entity_index
1095
+ current_entity_index = entity_index
1096
+
1097
+ for entity in self.iter_entities(path, start_index=start_index, progress_callback=track_entity):
1098
+ if limit and count >= limit:
1099
+ break
1100
+
1101
+ # Check skip_ids early, before full processing (avoids QID resolution)
1102
+ entity_id = entity.get("id", "")
1103
+ if skip_ids and entity_id in skip_ids:
1104
+ skipped_existing += 1
1105
+ continue
1106
+
1107
+ record = self._process_org_entity(entity, require_enwiki=require_enwiki)
1108
+ if record:
1109
+ count += 1
1110
+ if count % 10_000 == 0:
1111
+ logger.info(f"Yielded {count:,} organization records (skipped {skipped_existing:,} existing)...")
1112
+
1113
+ # Call progress callback with current position
1114
+ if progress_callback:
1115
+ progress_callback(current_entity_index, entity_id, count)
1116
+
1117
+ yield record
1118
+ elif entity.get("type") == "item":
1119
+ # Track skip reasons for debugging
1120
+ if self._get_org_type(entity) is None:
1121
+ skipped_no_type += 1
1122
+ elif require_enwiki and "enwiki" not in entity.get("sitelinks", {}):
1123
+ skipped_no_enwiki += 1
1124
+ else:
1125
+ skipped_no_label += 1
1126
+
1127
+ # Log skip stats periodically
1128
+ total_skipped = skipped_no_type + skipped_no_enwiki + skipped_no_label
1129
+ if total_skipped > 0 and total_skipped % 1_000_000 == 0:
1130
+ logger.debug(
1131
+ f"Skip stats: no_matching_type={skipped_no_type:,}, "
1132
+ f"no_enwiki={skipped_no_enwiki:,}, no_label={skipped_no_label:,}"
1133
+ )
1134
+
1135
+ logger.info(f"Organization import complete: {count:,} records (skipped {skipped_existing:,} existing)")
1136
+ logger.info(
1137
+ f"Skipped: no_matching_type={skipped_no_type:,}, "
1138
+ f"no_enwiki={skipped_no_enwiki:,}, no_label={skipped_no_label:,}"
1139
+ )
1140
+
1141
+ def import_all(
1142
+ self,
1143
+ dump_path: Optional[Path] = None,
1144
+ people_limit: Optional[int] = None,
1145
+ orgs_limit: Optional[int] = None,
1146
+ import_people: bool = True,
1147
+ import_orgs: bool = True,
1148
+ require_enwiki: bool = False,
1149
+ skip_people_ids: Optional[set[str]] = None,
1150
+ skip_org_ids: Optional[set[str]] = None,
1151
+ start_index: int = 0,
1152
+ progress_callback: Optional[Callable[[int, str, int, int], None]] = None,
1153
+ ) -> Iterator[tuple[str, ImportRecord]]:
1154
+ """
1155
+ Import both people and organizations in a single pass through the dump.
1156
+
1157
+ This is more efficient than calling import_people() and import_organizations()
1158
+ separately, as it only reads the ~100GB dump file once.
1159
+
1160
+ Args:
1161
+ dump_path: Path to dump file (uses self._dump_path if not provided)
1162
+ people_limit: Optional maximum number of people records
1163
+ orgs_limit: Optional maximum number of org records
1164
+ import_people: Whether to import people (default: True)
1165
+ import_orgs: Whether to import organizations (default: True)
1166
+ require_enwiki: If True, only include entities with English Wikipedia articles
1167
+ skip_people_ids: Optional set of people source_ids (Q codes) to skip
1168
+ skip_org_ids: Optional set of org source_ids (Q codes) to skip
1169
+ start_index: Entity index to start from (for resume support)
1170
+ progress_callback: Optional callback(entity_index, entity_id, people_count, orgs_count)
1171
+ called periodically. Useful for saving progress.
1172
+
1173
+ Yields:
1174
+ Tuples of (record_type, record) where record_type is "person" or "org"
1175
+ """
1176
+ path = dump_path or self._dump_path
1177
+ people_count = 0
1178
+ orgs_count = 0
1179
+ people_skipped = 0
1180
+ orgs_skipped = 0
1181
+ current_entity_index = start_index
1182
+
1183
+ logger.info("Starting combined import from Wikidata dump...")
1184
+ if start_index > 0:
1185
+ logger.info(f"Resuming from entity index {start_index:,}")
1186
+ if import_people:
1187
+ logger.info(f"Importing people (limit: {people_limit or 'none'})")
1188
+ if skip_people_ids:
1189
+ logger.info(f" Skipping {len(skip_people_ids):,} existing people Q codes")
1190
+ if import_orgs:
1191
+ logger.info(f"Importing organizations (limit: {orgs_limit or 'none'})")
1192
+ if skip_org_ids:
1193
+ logger.info(f" Skipping {len(skip_org_ids):,} existing org Q codes")
1194
+
1195
+ # Check if we've hit both limits
1196
+ def limits_reached() -> bool:
1197
+ people_done = not import_people or (people_limit and people_count >= people_limit)
1198
+ orgs_done = not import_orgs or (orgs_limit and orgs_count >= orgs_limit)
1199
+ return bool(people_done and orgs_done)
1200
+
1201
+ def track_entity(entity_index: int, entity_id: str) -> None:
1202
+ nonlocal current_entity_index
1203
+ current_entity_index = entity_index
1204
+
1205
+ for entity in self.iter_entities(path, start_index=start_index, progress_callback=track_entity):
1206
+ if limits_reached():
1207
+ break
1208
+
1209
+ entity_id = entity.get("id", "")
1210
+
1211
+ # Try to process as person first (if importing people and not at limit)
1212
+ if import_people and (not people_limit or people_count < people_limit):
1213
+ # Check skip_ids early
1214
+ if skip_people_ids and entity_id in skip_people_ids:
1215
+ people_skipped += 1
1216
+ else:
1217
+ person_record = self._process_person_entity(entity, require_enwiki=require_enwiki)
1218
+ if person_record:
1219
+ people_count += 1
1220
+ if people_count % 10_000 == 0:
1221
+ logger.info(
1222
+ f"Progress: {people_count:,} people, {orgs_count:,} orgs "
1223
+ f"(entity {current_entity_index:,})"
1224
+ )
1225
+ if progress_callback:
1226
+ progress_callback(current_entity_index, entity_id, people_count, orgs_count)
1227
+ yield ("person", person_record)
1228
+ continue # Entity was a person, don't check for org
1229
+
1230
+ # Try to process as organization (if importing orgs and not at limit)
1231
+ if import_orgs and (not orgs_limit or orgs_count < orgs_limit):
1232
+ # Check skip_ids early
1233
+ if skip_org_ids and entity_id in skip_org_ids:
1234
+ orgs_skipped += 1
1235
+ else:
1236
+ org_record = self._process_org_entity(entity, require_enwiki=require_enwiki)
1237
+ if org_record:
1238
+ orgs_count += 1
1239
+ if orgs_count % 10_000 == 0:
1240
+ logger.info(
1241
+ f"Progress: {people_count:,} people, {orgs_count:,} orgs "
1242
+ f"(entity {current_entity_index:,})"
1243
+ )
1244
+ if progress_callback:
1245
+ progress_callback(current_entity_index, entity_id, people_count, orgs_count)
1246
+ yield ("org", org_record)
1247
+
1248
+ logger.info(
1249
+ f"Combined import complete: {people_count:,} people, {orgs_count:,} orgs "
1250
+ f"(skipped {people_skipped:,} people, {orgs_skipped:,} orgs)"
1251
+ )
1252
+
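The single-pass variant yields tagged tuples, so a consumer simply dispatches on the first element. A sketch (the dump path and the per-record handling are placeholders):

from statement_extractor.database.importers.wikidata_dump import WikidataDumpImporter

importer = WikidataDumpImporter(dump_path="/data/wikidata-latest-all.json.bz2")  # path is illustrative

people_seen = 0
orgs_seen = 0
for record_type, record in importer.import_all(require_enwiki=True):
    if record_type == "person":
        people_seen += 1   # e.g. upsert the PersonRecord into the store
    else:                  # record_type == "org"
        orgs_seen += 1     # e.g. upsert the CompanyRecord

print(f"{people_seen} people, {orgs_seen} organizations")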
1253
+ def _process_person_entity(
1254
+ self,
1255
+ entity: dict,
1256
+ require_enwiki: bool = False,
1257
+ ) -> Optional[PersonRecord]:
1258
+ """
1259
+ Process a single entity, return PersonRecord if it's a human.
1260
+
1261
+ Args:
1262
+ entity: Parsed Wikidata entity dictionary
1263
+ require_enwiki: If True, only include people with English Wikipedia articles
1264
+
1265
+ Returns:
1266
+ PersonRecord if entity qualifies, None otherwise
1267
+ """
1268
+ # Must be an item (not property)
1269
+ if entity.get("type") != "item":
1270
+ return None
1271
+
1272
+ # Must be human (P31 contains Q5)
1273
+ if not self._is_human(entity):
1274
+ return None
1275
+
1276
+ # Optionally require English Wikipedia article
1277
+ if require_enwiki:
1278
+ sitelinks = entity.get("sitelinks", {})
1279
+ if "enwiki" not in sitelinks:
1280
+ return None
1281
+
1282
+ # Extract person data
1283
+ return self._extract_person_data(entity)
1284
+
1285
+ def _process_org_entity(
1286
+ self,
1287
+ entity: dict,
1288
+ require_enwiki: bool = False,
1289
+ ) -> Optional[CompanyRecord]:
1290
+ """
1291
+ Process a single entity, return CompanyRecord if it's an organization.
1292
+
1293
+ Args:
1294
+ entity: Parsed Wikidata entity dictionary
1295
+ require_enwiki: If True, only include orgs with English Wikipedia articles
1296
+
1297
+ Returns:
1298
+ CompanyRecord if entity qualifies, None otherwise
1299
+ """
1300
+ # Must be an item (not property)
1301
+ if entity.get("type") != "item":
1302
+ return None
1303
+
1304
+ # Get organization type from P31
1305
+ entity_type = self._get_org_type(entity)
1306
+ if entity_type is None:
1307
+ return None
1308
+
1309
+ # Optionally require English Wikipedia article
1310
+ if require_enwiki:
1311
+ sitelinks = entity.get("sitelinks", {})
1312
+ if "enwiki" not in sitelinks:
1313
+ return None
1314
+
1315
+ # Extract organization data
1316
+ return self._extract_org_data(entity, entity_type)
1317
+
1318
+ def _is_human(self, entity: dict) -> bool:
1319
+ """
1320
+ Check if entity has P31 (instance of) = Q5 (human).
1321
+
1322
+ Args:
1323
+ entity: Parsed Wikidata entity dictionary
1324
+
1325
+ Returns:
1326
+ True if entity is a human
1327
+ """
1328
+ claims = entity.get("claims", {})
1329
+ for claim in claims.get("P31", []):
1330
+ mainsnak = claim.get("mainsnak", {})
1331
+ datavalue = mainsnak.get("datavalue", {})
1332
+ value = datavalue.get("value", {})
1333
+ if isinstance(value, dict) and value.get("id") == "Q5":
1334
+ return True
1335
+ return False
1336
+
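+ # Shape reference (values illustrative): the claim walk above expects
+ # claims -> P31 list -> mainsnak -> datavalue -> value -> id, e.g.
+ #
+ #   {"claims": {"P31": [{"mainsnak": {"datavalue": {"value": {"id": "Q5"}}}}]}}
+ #
+ # which makes _is_human() return True; _get_org_type() below walks the same path.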
1337
+ def _get_org_type(self, entity: dict) -> Optional[EntityType]:
1338
+ """
1339
+ Check if entity has P31 (instance of) matching an organization type.
1340
+
1341
+ Args:
1342
+ entity: Parsed Wikidata entity dictionary
1343
+
1344
+ Returns:
1345
+ EntityType if entity is an organization, None otherwise
1346
+ """
1347
+ claims = entity.get("claims", {})
1348
+ for claim in claims.get("P31", []):
1349
+ mainsnak = claim.get("mainsnak", {})
1350
+ datavalue = mainsnak.get("datavalue", {})
1351
+ value = datavalue.get("value", {})
1352
+ if isinstance(value, dict):
1353
+ qid = value.get("id", "")
1354
+ if qid in ORG_TYPE_TO_ENTITY_TYPE:
1355
+ return ORG_TYPE_TO_ENTITY_TYPE[qid]
1356
+ return None
1357
+
1358
+ def _get_location_type(self, entity: dict) -> Optional[tuple[str, SimplifiedLocationType]]:
1359
+ """
1360
+ Check if entity has P31 (instance of) matching a location type.
1361
+
1362
+ Args:
1363
+ entity: Parsed Wikidata entity dictionary
1364
+
1365
+ Returns:
1366
+ Tuple of (location_type_name, SimplifiedLocationType) if entity is a location, None otherwise
1367
+ """
1368
+ claims = entity.get("claims", {})
1369
+ for claim in claims.get("P31", []):
1370
+ mainsnak = claim.get("mainsnak", {})
1371
+ datavalue = mainsnak.get("datavalue", {})
1372
+ value = datavalue.get("value", {})
1373
+ if isinstance(value, dict):
1374
+ qid = value.get("id", "")
1375
+ if qid in LOCATION_TYPE_QIDS:
1376
+ return LOCATION_TYPE_QIDS[qid]
1377
+ return None
1378
+
1379
+ def _get_claim_values(self, entity: dict, prop: str) -> list[str]:
1380
+ """
1381
+ Get all QID values for a property (e.g., P39, P106).
1382
+
1383
+ Args:
1384
+ entity: Parsed Wikidata entity dictionary
1385
+ prop: Property ID (e.g., "P39", "P106")
1386
+
1387
+ Returns:
1388
+ List of QID strings
1389
+ """
1390
+ claims = entity.get("claims", {})
1391
+ values = []
1392
+ for claim in claims.get(prop, []):
1393
+ mainsnak = claim.get("mainsnak", {})
1394
+ datavalue = mainsnak.get("datavalue", {})
1395
+ value = datavalue.get("value", {})
1396
+ if isinstance(value, dict):
1397
+ qid = value.get("id")
1398
+ if qid:
1399
+ values.append(qid)
1400
+ return values
1401
+
1402
+ def _get_qid_qualifier(self, qualifiers: dict, prop: str) -> Optional[str]:
1403
+ """Extract first QID from a qualifier property."""
1404
+ for qual in qualifiers.get(prop, []):
1405
+ qual_datavalue = qual.get("datavalue", {})
1406
+ qual_value = qual_datavalue.get("value", {})
1407
+ if isinstance(qual_value, dict):
1408
+ return qual_value.get("id")
1409
+ return None
1410
+
1411
+ def _get_time_qualifier(self, qualifiers: dict, prop: str) -> Optional[str]:
1412
+ """Extract first time value from a qualifier property."""
1413
+ for qual in qualifiers.get(prop, []):
1414
+ qual_datavalue = qual.get("datavalue", {})
1415
+ qual_value = qual_datavalue.get("value", {})
1416
+ if isinstance(qual_value, dict):
1417
+ time_str = qual_value.get("time", "")
1418
+ return self._parse_time_value(time_str)
1419
+ return None
1420
+
1421
+ def _get_positions_with_org(self, claims: dict) -> list[dict]:
1422
+ """
1423
+ Extract P39 positions with qualifiers for org, dates, and parliamentary context.
1424
+
1425
+ Qualifiers extracted per WikiProject Parliaments guidelines:
1426
+ - P580 (start time) - when the position started
1427
+ - P582 (end time) - when the position ended
1428
+ - P108 (employer) - organization they work for
1429
+ - P642 (of) - the organization (legacy/fallback)
1430
+ - P768 (electoral district) - constituency for MPs
1431
+ - P2937 (parliamentary term) - which term they served in
1432
+ - P4100 (parliamentary group) - political party/faction
1433
+ - P1001 (applies to jurisdiction) - jurisdiction they represent
1434
+ - P2715 (elected in) - which election elected them
1435
+
1436
+ Args:
1437
+ claims: Claims dictionary from entity
1438
+
1439
+ Returns:
1440
+ List of position dictionaries with position metadata
1441
+ """
1442
+ positions = []
1443
+ for claim in claims.get("P39", []):
1444
+ mainsnak = claim.get("mainsnak", {})
1445
+ datavalue = mainsnak.get("datavalue", {})
1446
+ pos_value = datavalue.get("value", {})
1447
+ pos_qid = pos_value.get("id") if isinstance(pos_value, dict) else None
1448
+ if not pos_qid:
1449
+ continue
1450
+
1451
+ qualifiers = claim.get("qualifiers", {})
1452
+
1453
+ # Extract organization from multiple possible qualifiers
1454
+ # Priority: P108 (employer) > P642 (of) > P1001 (jurisdiction)
1455
+ org_qid = (
1456
+ self._get_qid_qualifier(qualifiers, "P108") or # employer
1457
+ self._get_qid_qualifier(qualifiers, "P642") or # of (legacy)
1458
+ self._get_qid_qualifier(qualifiers, "P1001") # applies to jurisdiction
1459
+ )
1460
+
1461
+ # Extract dates
1462
+ start_date = self._get_time_qualifier(qualifiers, "P580")
1463
+ end_date = self._get_time_qualifier(qualifiers, "P582")
1464
+
1465
+ # Extract parliamentary/political qualifiers
1466
+ electoral_district = self._get_qid_qualifier(qualifiers, "P768")
1467
+ parliamentary_term = self._get_qid_qualifier(qualifiers, "P2937")
1468
+ parliamentary_group = self._get_qid_qualifier(qualifiers, "P4100")
1469
+ elected_in = self._get_qid_qualifier(qualifiers, "P2715")
1470
+
1471
+ positions.append({
1472
+ "position_qid": pos_qid,
1473
+ "org_qid": org_qid,
1474
+ "start_date": start_date,
1475
+ "end_date": end_date,
1476
+ # Parliamentary context
1477
+ "electoral_district": electoral_district,
1478
+ "parliamentary_term": parliamentary_term,
1479
+ "parliamentary_group": parliamentary_group,
1480
+ "elected_in": elected_in,
1481
+ })
1482
+ return positions
1483
+
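+ # Illustrative reduction (QIDs are placeholders): a P39 claim whose qualifiers
+ # carry only P580 (start time) and P768 (electoral district) becomes
+ #
+ #   {"position_qid": "Q111", "org_qid": None, "start_date": "2019-12-12",
+ #    "end_date": None, "electoral_district": "Q222", "parliamentary_term": None,
+ #    "parliamentary_group": None, "elected_in": None}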
1484
+ def _parse_time_value(self, time_str: str) -> Optional[str]:
1485
+ """
1486
+ Parse Wikidata time value to ISO date string.
1487
+
1488
+ Args:
1489
+ time_str: Wikidata time format like "+2020-01-15T00:00:00Z"
1490
+
1491
+ Returns:
1492
+ ISO date string (YYYY-MM-DD) or None
1493
+ """
1494
+ if not time_str:
1495
+ return None
1496
+ # Remove leading + and extract date part
1497
+ time_str = time_str.lstrip("+")
1498
+ if "T" in time_str:
1499
+ return time_str.split("T")[0]
1500
+ return None
1501
+
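+ # Examples: _parse_time_value("+2020-01-15T00:00:00Z") -> "2020-01-15";
+ # an empty string, or a value with no "T" separator, yields None.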
1502
+ def _classify_person_type(
1503
+ self,
1504
+ positions: list[dict],
1505
+ occupations: list[str],
1506
+ ) -> PersonType:
1507
+ """
1508
+ Determine PersonType from P39 positions and P106 occupations.
1509
+
1510
+ Priority order:
1511
+ 1. Check positions (more specific)
1512
+ 2. Check occupations
1513
+ 3. Default to UNKNOWN
1514
+
1515
+ Args:
1516
+ positions: List of position dictionaries from _get_positions_with_org
1517
+ occupations: List of occupation QIDs from P106
1518
+
1519
+ Returns:
1520
+ Classified PersonType
1521
+ """
1522
+ # Check positions first (more specific)
1523
+ for pos in positions:
1524
+ pos_qid = pos.get("position_qid", "")
1525
+ if pos_qid in EXECUTIVE_POSITION_QIDS:
1526
+ return PersonType.EXECUTIVE
1527
+ if pos_qid in POLITICIAN_POSITION_QIDS:
1528
+ return PersonType.POLITICIAN
1529
+
1530
+ # Then check occupations
1531
+ for occ in occupations:
1532
+ if occ in OCCUPATION_TO_TYPE:
1533
+ return OCCUPATION_TO_TYPE[occ]
1534
+
1535
+ # Default
1536
+ return PersonType.UNKNOWN
1537
+
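+ # Illustrative (QIDs hypothetical): a position listed in EXECUTIVE_POSITION_QIDS
+ # classifies the person as EXECUTIVE even if a P106 occupation maps to another
+ # type; occupations are only consulted when no position QID matches either set.
+ #
+ #   _classify_person_type([{"position_qid": ceo_qid}], [journalist_qid])
+ #   -> PersonType.EXECUTIVE (assuming ceo_qid is in EXECUTIVE_POSITION_QIDS)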
1538
+ def _get_org_or_context(self, pos: dict) -> str:
1539
+ """Get org QID from position, falling back to electoral district or parliamentary group."""
1540
+ return (
1541
+ pos.get("org_qid") or
1542
+ pos.get("electoral_district") or
1543
+ pos.get("parliamentary_group") or
1544
+ ""
1545
+ )
1546
+
1547
+ def _get_best_role_org(
1548
+ self,
1549
+ positions: list[dict],
1550
+ ) -> tuple[str, str, str, Optional[str], Optional[str], dict]:
1551
+ """
1552
+ Select best position for role/org display.
1553
+
1554
+ Priority:
1555
+ 1. Positions with org/context and dates
1556
+ 2. Positions with org/context
1557
+ 3. Positions with dates
1558
+ 4. Any position
1559
+
1560
+ Args:
1561
+ positions: List of position dictionaries
1562
+
1563
+ Returns:
1564
+ Tuple of (role_qid, org_label, org_qid, start_date, end_date, extra_context)
1565
+ Note: In dump mode, we return QIDs since we don't have labels
1566
+ extra_context contains parliamentary metadata
1567
+ """
1568
+ def has_context(pos: dict) -> bool:
1569
+ return bool(
1570
+ pos.get("org_qid") or
1571
+ pos.get("electoral_district") or
1572
+ pos.get("parliamentary_group")
1573
+ )
1574
+
1575
+ def get_extra_context(pos: dict) -> dict:
1576
+ return {
1577
+ k: v for k, v in {
1578
+ "electoral_district": pos.get("electoral_district"),
1579
+ "parliamentary_term": pos.get("parliamentary_term"),
1580
+ "parliamentary_group": pos.get("parliamentary_group"),
1581
+ "elected_in": pos.get("elected_in"),
1582
+ }.items() if v
1583
+ }
1584
+
1585
+ # Priority 1: Position with org/context and dates
1586
+ for pos in positions:
1587
+ if has_context(pos) and (pos.get("start_date") or pos.get("end_date")):
1588
+ return (
1589
+ pos["position_qid"],
1590
+ "",
1591
+ self._get_org_or_context(pos),
1592
+ pos.get("start_date"),
1593
+ pos.get("end_date"),
1594
+ get_extra_context(pos),
1595
+ )
1596
+
1597
+ # Priority 2: Position with org/context
1598
+ for pos in positions:
1599
+ if has_context(pos):
1600
+ return (
1601
+ pos["position_qid"],
1602
+ "",
1603
+ self._get_org_or_context(pos),
1604
+ pos.get("start_date"),
1605
+ pos.get("end_date"),
1606
+ get_extra_context(pos),
1607
+ )
1608
+
1609
+ # Priority 3: Position with dates
1610
+ for pos in positions:
1611
+ if pos.get("start_date") or pos.get("end_date"):
1612
+ return (
1613
+ pos["position_qid"],
1614
+ "",
1615
+ self._get_org_or_context(pos),
1616
+ pos.get("start_date"),
1617
+ pos.get("end_date"),
1618
+ get_extra_context(pos),
1619
+ )
1620
+
1621
+ # Priority 4: Any position
1622
+ if positions:
1623
+ pos = positions[0]
1624
+ return (
1625
+ pos["position_qid"],
1626
+ "",
1627
+ self._get_org_or_context(pos),
1628
+ pos.get("start_date"),
1629
+ pos.get("end_date"),
1630
+ get_extra_context(pos),
1631
+ )
1632
+
1633
+ return "", "", "", None, None, {}
1634
+
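+ # Illustrative: an undated position that has an org_qid (priority 2) is chosen
+ # over a dated position with no org/context (priority 3); with no positions at
+ # all the method returns ("", "", "", None, None, {}).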
1635
+ def _extract_person_data(self, entity: dict) -> Optional[PersonRecord]:
1636
+ """
1637
+ Extract PersonRecord from entity dict.
1638
+
1639
+ Derives type/role/org from claims.
1640
+
1641
+ Args:
1642
+ entity: Parsed Wikidata entity dictionary
1643
+
1644
+ Returns:
1645
+ PersonRecord or None if essential data is missing
1646
+ """
1647
+ qid = entity.get("id", "")
1648
+ labels = entity.get("labels", {})
1649
+ # Try English label first, fall back to any available label
1650
+ label = labels.get("en", {}).get("value", "")
1651
+ if not label:
1652
+ # Try to get any label
1653
+ for lang_data in labels.values():
1654
+ if isinstance(lang_data, dict) and lang_data.get("value"):
1655
+ label = lang_data["value"]
1656
+ break
1657
+
1658
+ if not label or not qid:
1659
+ return None
1660
+
1661
+ claims = entity.get("claims", {})
1662
+
1663
+ # Get positions (P39) with qualifiers for org
1664
+ positions = self._get_positions_with_org(claims)
1665
+ # Get occupations (P106)
1666
+ occupations = self._get_claim_values(entity, "P106")
1667
+
1668
+ # Classify person type from positions + occupations
1669
+ person_type = self._classify_person_type(positions, occupations)
1670
+
1671
+ # Get best role/org/dates from positions
1672
+ role_qid, _, org_qid, start_date, end_date, extra_context = self._get_best_role_org(positions)
1673
+
1674
+ # Fallback: if no org from positions, check top-level P108 (employer)
1675
+ if not org_qid:
1676
+ employers = self._get_claim_values(entity, "P108")
1677
+ if employers:
1678
+ org_qid = employers[0]
1679
+ logger.debug(f"Using top-level P108 employer for {qid}: {org_qid}")
1680
+
1681
+ # Get country (P27 - country of citizenship)
1682
+ countries = self._get_claim_values(entity, "P27")
1683
+ country_qid = countries[0] if countries else ""
1684
+
1685
+ # Resolve QIDs to labels using the cache (or track for later resolution)
1686
+ country_label = self._resolve_qid(country_qid) if country_qid else ""
1687
+ role_label = self._resolve_qid(role_qid) if role_qid else ""
1688
+ org_label = self._resolve_qid(org_qid) if org_qid else ""
1689
+
1690
+ # Get birth and death dates (P569, P570)
1691
+ birth_date = self._get_time_claim(claims, "P569")
1692
+ death_date = self._get_time_claim(claims, "P570")
1693
+
1694
+ # Get description
1695
+ descriptions = entity.get("descriptions", {})
1696
+ description = descriptions.get("en", {}).get("value", "")
1697
+
1698
+ # Track discovered organization
1699
+ if org_qid:
1700
+ self._discovered_orgs[org_qid] = org_label
1701
+
1702
+ # Build record with all position metadata
1703
+ record_data = {
1704
+ "wikidata_id": qid,
1705
+ "label": label,
1706
+ "description": description,
1707
+ "positions": [p["position_qid"] for p in positions],
1708
+ "occupations": occupations,
1709
+ "org_qid": org_qid,
1710
+ "country_qid": country_qid,
1711
+ "role_qid": role_qid,
1712
+ "birth_date": birth_date,
1713
+ "death_date": death_date,
1714
+ }
1715
+ # Add parliamentary context if present
1716
+ if extra_context:
1717
+ record_data.update(extra_context)
1718
+
1719
+ return PersonRecord(
1720
+ name=label,
1721
+ source="wikidata",
1722
+ source_id=qid,
1723
+ country=country_label,
1724
+ person_type=person_type,
1725
+ known_for_role=role_label,
1726
+ known_for_org=org_label,
1727
+ from_date=start_date,
1728
+ to_date=end_date,
1729
+ birth_date=birth_date,
1730
+ death_date=death_date,
1731
+ record=record_data,
1732
+ )
1733
+
1734
+ def _extract_org_data(
1735
+ self,
1736
+ entity: dict,
1737
+ entity_type: EntityType,
1738
+ ) -> Optional[CompanyRecord]:
1739
+ """
1740
+ Extract CompanyRecord from entity dict.
1741
+
1742
+ Args:
1743
+ entity: Parsed Wikidata entity dictionary
1744
+ entity_type: Determined EntityType
1745
+
1746
+ Returns:
1747
+ CompanyRecord or None if essential data is missing
1748
+ """
1749
+ qid = entity.get("id", "")
1750
+ labels = entity.get("labels", {})
1751
+ label = labels.get("en", {}).get("value", "")
1752
+
1753
+ if not label or not qid:
1754
+ return None
1755
+
1756
+ claims = entity.get("claims", {})
1757
+
1758
+ # Get country (P17 - country)
1759
+ countries = self._get_claim_values(entity, "P17")
1760
+ country_qid = countries[0] if countries else ""
1761
+
1762
+ # Resolve country QID to label
1763
+ country_label = self._resolve_qid(country_qid) if country_qid else ""
1764
+
1765
+ # Get LEI (P1278)
1766
+ lei = self._get_string_claim(claims, "P1278")
1767
+
1768
+ # Get ticker (P249)
1769
+ ticker = self._get_string_claim(claims, "P249")
1770
+
1771
+ # Get description
1772
+ descriptions = entity.get("descriptions", {})
1773
+ description = descriptions.get("en", {}).get("value", "")
1774
+
1775
+ # Get inception date (P571)
1776
+ inception = self._get_time_claim(claims, "P571")
1777
+
1778
+ # Get dissolution date (P576)
1779
+ dissolution = self._get_time_claim(claims, "P576")
1780
+
1781
+ return CompanyRecord(
1782
+ name=label,
1783
+ source="wikipedia", # Use "wikipedia" per existing convention
1784
+ source_id=qid,
1785
+ region=country_label,
1786
+ entity_type=entity_type,
1787
+ from_date=inception,
1788
+ to_date=dissolution,
1789
+ record={
1790
+ "wikidata_id": qid,
1791
+ "label": label,
1792
+ "description": description,
1793
+ "lei": lei,
1794
+ "ticker": ticker,
1795
+ "country_qid": country_qid,
1796
+ },
1797
+ )
1798
+
1799
+ def _process_location_entity(
1800
+ self,
1801
+ entity: dict,
1802
+ require_enwiki: bool = False,
1803
+ ) -> Optional[LocationRecord]:
1804
+ """
1805
+ Process a single entity, return LocationRecord if it's a location.
1806
+
1807
+ Args:
1808
+ entity: Parsed Wikidata entity dictionary
1809
+ require_enwiki: If True, only include locations with English Wikipedia articles
1810
+
1811
+ Returns:
1812
+ LocationRecord if entity qualifies, None otherwise
1813
+ """
1814
+ # Must be an item (not property)
1815
+ if entity.get("type") != "item":
1816
+ return None
1817
+
1818
+ # Get location type from P31
1819
+ location_type_info = self._get_location_type(entity)
1820
+ if location_type_info is None:
1821
+ return None
1822
+
1823
+ location_type_name, simplified_type = location_type_info
1824
+
1825
+ # Optionally require English Wikipedia article
1826
+ if require_enwiki:
1827
+ sitelinks = entity.get("sitelinks", {})
1828
+ if "enwiki" not in sitelinks:
1829
+ return None
1830
+
1831
+ # Extract location data
1832
+ return self._extract_location_data(entity, location_type_name, simplified_type)
1833
+
1834
+ def _extract_location_data(
1835
+ self,
1836
+ entity: dict,
1837
+ location_type: str,
1838
+ simplified_type: SimplifiedLocationType,
1839
+ ) -> Optional[LocationRecord]:
1840
+ """
1841
+ Extract LocationRecord from entity dict.
1842
+
1843
+ Args:
1844
+ entity: Parsed Wikidata entity dictionary
1845
+ location_type: Detailed location type name
1846
+ simplified_type: Simplified location type enum
1847
+
1848
+ Returns:
1849
+ LocationRecord or None if essential data is missing
1850
+ """
1851
+ qid = entity.get("id", "")
1852
+ labels = entity.get("labels", {})
1853
+ label = labels.get("en", {}).get("value", "")
1854
+
1855
+ if not label or not qid:
1856
+ return None
1857
+
1858
+ claims = entity.get("claims", {})
1859
+
1860
+ # Get parent locations from P131 (located in administrative territorial entity)
1861
+ # P131 gives the immediate parent(s); the full chain (city -> state -> country) is built later by following parents
1862
+ parent_qids = self._get_claim_values(entity, "P131")
1863
+
1864
+ # Get country from P17 as fallback/additional parent
1865
+ country_qids = self._get_claim_values(entity, "P17")
1866
+
1867
+ # Get coordinates from P625 (coordinate location)
1868
+ coordinates = self._get_coordinates(claims)
1869
+
1870
+ # Get description
1871
+ descriptions = entity.get("descriptions", {})
1872
+ description = descriptions.get("en", {}).get("value", "")
1873
+
1874
+ # Get inception date (P571) - when location was established
1875
+ inception = self._get_time_claim(claims, "P571")
1876
+
1877
+ # Get dissolution date (P576) - when location ceased to exist
1878
+ dissolution = self._get_time_claim(claims, "P576")
1879
+
1880
+ # Parse QID to integer
1881
+ qid_int = int(qid[1:]) if qid.startswith("Q") and qid[1:].isdigit() else None
1882
+
1883
+ # Build record with extra details
1884
+ record_data = {
1885
+ "wikidata_id": qid,
1886
+ "label": label,
1887
+ "description": description,
1888
+ "parent_qids": parent_qids,
1889
+ "country_qids": country_qids,
1890
+ }
1891
+ if coordinates:
1892
+ record_data["coordinates"] = coordinates
1893
+
1894
+ return LocationRecord(
1895
+ name=label,
1896
+ source="wikidata",
1897
+ source_id=qid,
1898
+ qid=qid_int,
1899
+ location_type=location_type,
1900
+ simplified_type=simplified_type,
1901
+ parent_ids=[], # Will be resolved later by looking up parent QIDs in the database
1902
+ from_date=inception,
1903
+ to_date=dissolution,
1904
+ record=record_data,
1905
+ )
1906
+
1907
+ def _get_coordinates(self, claims: dict) -> Optional[dict]:
1908
+ """
1909
+ Get coordinates from P625 (coordinate location).
1910
+
1911
+ Args:
1912
+ claims: Claims dictionary
1913
+
1914
+ Returns:
1915
+ Dict with lat/lon or None
1916
+ """
1917
+ for claim in claims.get("P625", []):
1918
+ mainsnak = claim.get("mainsnak", {})
1919
+ datavalue = mainsnak.get("datavalue", {})
1920
+ value = datavalue.get("value", {})
1921
+ if isinstance(value, dict):
1922
+ lat = value.get("latitude")
1923
+ lon = value.get("longitude")
1924
+ if lat is not None and lon is not None:
1925
+ return {"lat": lat, "lon": lon}
1926
+ return None
1927
+
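+ # Shape reference (values illustrative): a P625 datavalue such as
+ #   {"value": {"latitude": 51.5074, "longitude": -0.1278, "precision": 0.0001}}
+ # is reduced to {"lat": 51.5074, "lon": -0.1278}; precision/globe/altitude are ignored.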
1928
+ def import_locations(
1929
+ self,
1930
+ dump_path: Optional[Path] = None,
1931
+ limit: Optional[int] = None,
1932
+ require_enwiki: bool = False,
1933
+ skip_ids: Optional[set[str]] = None,
1934
+ start_index: int = 0,
1935
+ progress_callback: Optional[Callable[[int, str, int], None]] = None,
1936
+ ) -> Iterator[LocationRecord]:
1937
+ """
1938
+ Stream through dump, yielding locations (geopolitical entities).
1939
+
1940
+ This method filters the dump for:
1941
+ - Items with type "item"
1942
+ - Has P31 (instance of) matching a location type
1943
+ - Optionally: Has English Wikipedia article (enwiki sitelink)
1944
+
1945
+ Args:
1946
+ dump_path: Path to dump file (uses self._dump_path if not provided)
1947
+ limit: Optional maximum number of records to return
1948
+ require_enwiki: If True, only include locations with English Wikipedia articles
1949
+ skip_ids: Optional set of source_ids (Q codes) to skip
1950
+ start_index: Entity index to start from (for resume support)
1951
+ progress_callback: Optional callback(entity_index, entity_id, records_yielded)
1952
+
1953
+ Yields:
1954
+ LocationRecord for each qualifying location
1955
+ """
1956
+ path = dump_path or self._dump_path
1957
+ count = 0
1958
+ skipped_existing = 0
1959
+ current_entity_index = start_index
1960
+
1961
+ logger.info("Starting location import from Wikidata dump...")
1962
+ if start_index > 0:
1963
+ logger.info(f"Resuming from entity index {start_index:,}")
1964
+ if not require_enwiki:
1965
+ logger.info("Importing ALL locations (no enwiki filter)")
1966
+ if skip_ids:
1967
+ logger.info(f"Skipping {len(skip_ids):,} existing Q codes")
1968
+
1969
+ def track_entity(entity_index: int, entity_id: str) -> None:
1970
+ nonlocal current_entity_index
1971
+ current_entity_index = entity_index
1972
+
1973
+ for entity in self.iter_entities(path, start_index=start_index, progress_callback=track_entity):
1974
+ if limit and count >= limit:
1975
+ break
1976
+
1977
+ # Check skip_ids early, before full processing
1978
+ entity_id = entity.get("id", "")
1979
+ if skip_ids and entity_id in skip_ids:
1980
+ skipped_existing += 1
1981
+ continue
1982
+
1983
+ record = self._process_location_entity(entity, require_enwiki=require_enwiki)
1984
+ if record:
1985
+ count += 1
1986
+ if count % 10_000 == 0:
1987
+ logger.info(f"Yielded {count:,} location records (skipped {skipped_existing:,})...")
1988
+
1989
+ # Call progress callback with current position
1990
+ if progress_callback:
1991
+ progress_callback(current_entity_index, entity_id, count)
1992
+
1993
+ yield record
1994
+
1995
+ logger.info(f"Location import complete: {count:,} records (skipped {skipped_existing:,})")
1996
+
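+ # Minimal usage sketch (comment only; `store.upsert_location`, `existing_qids`
+ # and the progress-file handling are assumptions for illustration):
+ #
+ #   def checkpoint(entity_index: int, entity_id: str, yielded: int) -> None:
+ #       Path("locations.progress.json").write_text(json.dumps({"index": entity_index}))
+ #
+ #   for rec in importer.import_locations(skip_ids=existing_qids, progress_callback=checkpoint):
+ #       store.upsert_location(rec)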
1997
+ def _get_string_claim(self, claims: dict, prop: str) -> str:
1998
+ """
1999
+ Get first string value for a property.
2000
+
2001
+ Args:
2002
+ claims: Claims dictionary
2003
+ prop: Property ID
2004
+
2005
+ Returns:
2006
+ String value or empty string
2007
+ """
2008
+ for claim in claims.get(prop, []):
2009
+ mainsnak = claim.get("mainsnak", {})
2010
+ datavalue = mainsnak.get("datavalue", {})
2011
+ value = datavalue.get("value")
2012
+ if isinstance(value, str):
2013
+ return value
2014
+ return ""
2015
+
2016
+ def _get_time_claim(self, claims: dict, prop: str) -> Optional[str]:
2017
+ """
2018
+ Get first time value for a property as ISO date string.
2019
+
2020
+ Args:
2021
+ claims: Claims dictionary
2022
+ prop: Property ID
2023
+
2024
+ Returns:
2025
+ ISO date string (YYYY-MM-DD) or None
2026
+ """
2027
+ for claim in claims.get(prop, []):
2028
+ mainsnak = claim.get("mainsnak", {})
2029
+ datavalue = mainsnak.get("datavalue", {})
2030
+ value = datavalue.get("value", {})
2031
+ if isinstance(value, dict):
2032
+ time_str = value.get("time", "")
2033
+ # Format: +2020-01-15T00:00:00Z
2034
+ if time_str:
2035
+ # Remove leading + and extract date part
2036
+ time_str = time_str.lstrip("+")
2037
+ if "T" in time_str:
2038
+ return time_str.split("T")[0]
2039
+ return None
2040
+
2041
+ def get_discovered_organizations(self) -> list[CompanyRecord]:
2042
+ """
2043
+ Get organizations discovered during the people import.
2044
+
2045
+ These are organizations associated with people (via P39 qualifiers such as P108, P642, or P1001, or a top-level P108 employer claim)
2046
+ that can be inserted into the organizations database if not already present.
2047
+
2048
+ Note: Labels are only available when the org QID was seen earlier in the dump or resolved via SPARQL; otherwise the QID itself is used as the name.
2049
+
2050
+ Returns:
2051
+ List of CompanyRecord objects for discovered organizations
2052
+ """
2053
+ records = []
2054
+ for org_qid, org_label in self._discovered_orgs.items():
2055
+ record = CompanyRecord(
2056
+ name=org_label or org_qid, # Prefer the cached label; fall back to the QID
2057
+ source="wikipedia",
2058
+ source_id=org_qid,
2059
+ region="",
2060
+ entity_type=EntityType.BUSINESS, # Default
2061
+ record={
2062
+ "wikidata_id": org_qid,
2063
+ "discovered_from": "people_import",
2064
+ "needs_label_resolution": True,
2065
+ },
2066
+ )
2067
+ records.append(record)
2068
+ logger.info(f"Discovered {len(records)} organizations from people import")
2069
+ return records
2070
+
2071
+ def clear_discovered_organizations(self) -> None:
2072
+ """Clear the discovered organizations cache."""
2073
+ self._discovered_orgs.clear()
2074
+
2075
+ def get_unresolved_qids(self) -> set[str]:
2076
+ """Get QIDs that need label resolution."""
2077
+ return self._unresolved_qids.copy()
2078
+
2079
+ def get_label_cache(self) -> dict[str, str]:
2080
+ """Get the label cache built during import."""
2081
+ return self._label_cache.copy()
2082
+
2083
+ def set_label_cache(self, labels: dict[str, str]) -> None:
2084
+ """
2085
+ Set initial label cache from existing data (e.g., from database).
2086
+
2087
+ Args:
2088
+ labels: Mapping of QID -> label to seed the cache
2089
+ """
2090
+ self._label_cache.update(labels)
2091
+ logger.info(f"Seeded label cache with {len(labels)} existing labels")
2092
+
2093
+ def get_new_labels_since(self, known_qids: set[str]) -> dict[str, str]:
2094
+ """
2095
+ Get labels that were added to cache since a known set.
2096
+
2097
+ Args:
2098
+ known_qids: Set of QIDs that were already known
2099
+
2100
+ Returns:
2101
+ Dict of new QID -> label mappings
2102
+ """
2103
+ return {qid: label for qid, label in self._label_cache.items() if qid not in known_qids}
2104
+
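+ # Typical cache round-trip (illustrative; load_labels_from_db/save_labels_to_db
+ # are assumed helpers): seed the cache, run an import, persist only new labels.
+ #
+ #   known = load_labels_from_db()              # {qid: label}
+ #   importer.set_label_cache(known)
+ #   # ... run import_combined() / import_locations() ...
+ #   save_labels_to_db(importer.get_new_labels_since(set(known)))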
2105
+ def _cache_entity_label(self, entity: dict) -> None:
2106
+ """
2107
+ Cache the English label for an entity during dump processing.
2108
+
2109
+ This builds up a lookup table as we iterate through the dump,
2110
+ so we can resolve QID references (countries, roles) to labels.
2111
+ """
2112
+ qid = entity.get("id", "")
2113
+ if not qid:
2114
+ return
2115
+
2116
+ labels = entity.get("labels", {})
2117
+ en_label = labels.get("en", {}).get("value", "")
2118
+ if en_label:
2119
+ self._label_cache[qid] = en_label
2120
+
2121
+ def _resolve_qid(self, qid: str) -> str:
2122
+ """
2123
+ Resolve a QID to a label, using cache or SPARQL lookup.
2124
+
2125
+ Returns the label if found/resolved, otherwise returns the QID.
2126
+ """
2127
+ if not qid or not qid.startswith("Q"):
2128
+ return qid
2129
+
2130
+ if qid in self._label_cache:
2131
+ label = self._label_cache[qid]
2132
+ logger.debug(f"Resolved QID (cache): {qid} -> {label}")
2133
+ return label
2134
+
2135
+ # Not in cache - resolve via SPARQL immediately
2136
+ label = self._resolve_single_qid_sparql(qid)
2137
+ if label:
2138
+ logger.info(f"Resolved QID (SPARQL): {qid} -> {label}")
2139
+ self._label_cache[qid] = label
2140
+ return label
2141
+
2142
+ # Track unresolved
2143
+ if qid not in self._unresolved_qids:
2144
+ logger.debug(f"Unresolved QID: {qid}")
2145
+ self._unresolved_qids.add(qid)
2146
+ return qid
2147
+
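+ # Example: _resolve_qid("Q30") returns "United States" once cached or resolved
+ # via SPARQL; if the lookup fails it returns "Q30" and records it in
+ # _unresolved_qids for the batch pass in resolve_qids_via_sparql().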
2148
+ def _resolve_single_qid_sparql(self, qid: str) -> Optional[str]:
2149
+ """
2150
+ Resolve a single QID to a label via SPARQL.
2151
+
2152
+ Args:
2153
+ qid: Wikidata QID (e.g., 'Q30')
2154
+
2155
+ Returns:
2156
+ Label string or None if not found
2157
+ """
2158
+ import json
2159
+ import urllib.parse
2160
+ import urllib.request
2161
+
2162
+ query = f"""
2163
+ SELECT ?label WHERE {{
2164
+ wd:{qid} rdfs:label ?label FILTER(LANG(?label) = "en") .
2165
+ }}
2166
+ LIMIT 1
2167
+ """
2168
+
2169
+ try:
2170
+ params = urllib.parse.urlencode({
2171
+ "query": query,
2172
+ "format": "json",
2173
+ })
2174
+ url = f"https://query.wikidata.org/sparql?{params}"
2175
+
2176
+ req = urllib.request.Request(
2177
+ url,
2178
+ headers={
2179
+ "Accept": "application/sparql-results+json",
2180
+ "User-Agent": "corp-extractor/1.0 (QID resolver)",
2181
+ }
2182
+ )
2183
+
2184
+ with urllib.request.urlopen(req, timeout=10) as response:
2185
+ data = json.loads(response.read().decode("utf-8"))
2186
+
2187
+ bindings = data.get("results", {}).get("bindings", [])
2188
+ if bindings:
2189
+ return bindings[0].get("label", {}).get("value")
2190
+
2191
+ except Exception as e:
2192
+ logger.debug(f"SPARQL lookup failed for {qid}: {e}")
2193
+
2194
+ return None
2195
+
2196
+ def resolve_qids_via_sparql(
2197
+ self,
2198
+ qids: Optional[set[str]] = None,
2199
+ batch_size: int = 50,
2200
+ delay_seconds: float = 1.0,
2201
+ ) -> dict[str, str]:
2202
+ """
2203
+ Resolve QIDs to labels via Wikidata SPARQL queries.
2204
+
2205
+ This is used after import to resolve any QIDs that weren't found
2206
+ in the dump (e.g., if import was limited or dump was incomplete).
2207
+
2208
+ Args:
2209
+ qids: Set of QIDs to resolve (defaults to unresolved_qids)
2210
+ batch_size: Number of QIDs per SPARQL query (default 50)
2211
+ delay_seconds: Delay between queries to avoid rate limiting
2212
+
2213
+ Returns:
2214
+ Dict mapping QID -> label for resolved QIDs
2215
+ """
2216
+ import json
2217
+ import time
2218
+ import urllib.parse
2219
+ import urllib.request
2220
+
2221
+ if qids is None:
2222
+ qids = self._unresolved_qids
2223
+
2224
+ if not qids:
2225
+ return {}
2226
+
2227
+ resolved: dict[str, str] = {}
2228
+ qid_list = list(qids)
2229
+
2230
+ logger.info(f"Resolving {len(qid_list)} QIDs via SPARQL...")
2231
+
2232
+ for i in range(0, len(qid_list), batch_size):
2233
+ batch = qid_list[i:i + batch_size]
2234
+
2235
+ # Build VALUES clause
2236
+ values = " ".join(f"wd:{qid}" for qid in batch)
2237
+ query = f"""
2238
+ SELECT ?item ?itemLabel WHERE {{
2239
+ VALUES ?item {{ {values} }}
2240
+ ?item rdfs:label ?itemLabel FILTER(LANG(?itemLabel) = "en") .
2241
+ }}
2242
+ """
2243
+
2244
+ try:
2245
+ params = urllib.parse.urlencode({
2246
+ "query": query,
2247
+ "format": "json",
2248
+ })
2249
+ url = f"https://query.wikidata.org/sparql?{params}"
2250
+
2251
+ req = urllib.request.Request(
2252
+ url,
2253
+ headers={
2254
+ "Accept": "application/sparql-results+json",
2255
+ "User-Agent": "corp-extractor/1.0 (QID resolver)",
2256
+ }
2257
+ )
2258
+
2259
+ with urllib.request.urlopen(req, timeout=60) as response:
2260
+ data = json.loads(response.read().decode("utf-8"))
2261
+
2262
+ for binding in data.get("results", {}).get("bindings", []):
2263
+ item_uri = binding.get("item", {}).get("value", "")
2264
+ label = binding.get("itemLabel", {}).get("value", "")
2265
+ if item_uri and label:
2266
+ qid = item_uri.split("/")[-1]
2267
+ resolved[qid] = label
2268
+ self._label_cache[qid] = label
2269
+
2270
+ logger.debug(f"Resolved batch {i // batch_size + 1}: {len(batch)} QIDs")
2271
+
2272
+ except Exception as e:
2273
+ logger.warning(f"SPARQL batch failed: {e}")
2274
+
2275
+ if i + batch_size < len(qid_list):
2276
+ time.sleep(delay_seconds)
2277
+
2278
+ # Update unresolved set
2279
+ self._unresolved_qids -= set(resolved.keys())
2280
+
2281
+ logger.info(f"Resolved {len(resolved)} QIDs, {len(self._unresolved_qids)} remaining unresolved")
2282
+ return resolved
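+ # Illustrative post-import pass (comment only; store.update_org_label is an
+ # assumed helper): resolve QIDs that never appeared in the dump, then apply
+ # the labels to records flagged with needs_label_resolution.
+ #
+ #   resolved = importer.resolve_qids_via_sparql(batch_size=50, delay_seconds=1.0)
+ #   for qid, label in resolved.items():
+ #       store.update_org_label(qid, label)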