corp-extractor 0.5.0__py3-none-any.whl → 0.9.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
- corp_extractor-0.9.3.dist-info/RECORD +79 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +2030 -24
- statement_extractor/data/statement_taxonomy.json +6949 -1159
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +428 -0
- statement_extractor/database/importers/__init__.py +32 -0
- statement_extractor/database/importers/companies_house.py +559 -0
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +561 -0
- statement_extractor/database/importers/sec_edgar.py +392 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +1120 -0
- statement_extractor/database/importers/wikidata_dump.py +1951 -0
- statement_extractor/database/importers/wikidata_people.py +1130 -0
- statement_extractor/database/models.py +254 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +3034 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +171 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +19 -3
- statement_extractor/models/canonical.py +44 -1
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/labels.py +47 -18
- statement_extractor/models/qualifiers.py +51 -3
- statement_extractor/models/statement.py +39 -15
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/config.py +6 -11
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +90 -121
- statement_extractor/pipeline/registry.py +52 -46
- statement_extractor/plugins/__init__.py +20 -8
- statement_extractor/plugins/base.py +348 -78
- statement_extractor/plugins/extractors/gliner2.py +38 -28
- statement_extractor/plugins/labelers/taxonomy.py +18 -5
- statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +11 -0
- statement_extractor/plugins/qualifiers/companies_house.py +14 -3
- statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
- statement_extractor/plugins/qualifiers/gleif.py +14 -3
- statement_extractor/plugins/qualifiers/person.py +588 -14
- statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/t5_gemma.py +176 -75
- statement_extractor/plugins/taxonomy/embedding.py +193 -46
- statement_extractor/plugins/taxonomy/mnli.py +16 -4
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.5.0.dist-info/RECORD +0 -55
- statement_extractor/plugins/canonicalizers/__init__.py +0 -17
- statement_extractor/plugins/canonicalizers/base.py +0 -9
- statement_extractor/plugins/canonicalizers/location.py +0 -219
- statement_extractor/plugins/canonicalizers/organization.py +0 -230
- statement_extractor/plugins/canonicalizers/person.py +0 -242
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
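The largest single addition is the new Wikidata dump importer (`statement_extractor/database/importers/wikidata_dump.py`, +1951 lines), whose diff follows. For orientation only, here is a minimal sketch of how its public API might be driven with resume support; it is based solely on the signatures visible in the diff below and is not itself part of the package.

```python
# Hedged usage sketch (not in the package): resume-aware combined import,
# using only names that appear in the added wikidata_dump.py module.
from statement_extractor.database.importers.wikidata_dump import (
    DumpProgress,
    WikidataDumpImporter,
)

importer = WikidataDumpImporter()
dump_path = importer.download_dump()  # prefers aria2c, falls back to urllib

# Reuse saved progress if it matches this dump, otherwise start fresh.
progress = DumpProgress.load() or DumpProgress()
if not progress.matches_dump(dump_path):
    progress = DumpProgress(dump_path=str(dump_path), dump_size=dump_path.stat().st_size)

def save_progress(entity_index: int, entity_id: str, people: int, orgs: int) -> None:
    # In practice you would throttle this (e.g. every N records).
    progress.entity_index = entity_index
    progress.last_entity_id = entity_id
    progress.people_yielded = people
    progress.orgs_yielded = orgs
    progress.save()

for record_type, record in importer.import_all(
    start_index=progress.entity_index,
    progress_callback=save_progress,
):
    ...  # hand each ("person" | "org", record) pair to the database store
```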
|
@@ -0,0 +1,1951 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Wikidata dump importer for people and organizations.
|
|
3
|
+
|
|
4
|
+
Uses the Wikidata JSON dump (~100GB compressed) to import:
|
|
5
|
+
1. People: All humans (P31=Q5) with English Wikipedia articles
|
|
6
|
+
2. Organizations: All organizations with English Wikipedia articles
|
|
7
|
+
|
|
8
|
+
This avoids SPARQL query timeouts that occur with large result sets.
|
|
9
|
+
The dump is processed line-by-line to minimize memory usage.
|
|
10
|
+
|
|
11
|
+
Dump format:
|
|
12
|
+
- File: `latest-all.json.bz2` (~100GB) or `.gz` (~150GB)
|
|
13
|
+
- Format: JSON array where each line is a separate entity (after first `[` line)
|
|
14
|
+
- Each line: `{"type":"item","id":"Q123","labels":{...},"claims":{...},"sitelinks":{...}},`
|
|
15
|
+
- Streaming: Read line-by-line, strip trailing comma, parse JSON
|
|
16
|
+
|
|
17
|
+
Resume support:
|
|
18
|
+
- Progress is tracked by entity index (count of entities processed)
|
|
19
|
+
- Progress can be saved to a JSON file and loaded on resume
|
|
20
|
+
- On resume, entities are skipped efficiently until reaching the saved position
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
import bz2
|
|
24
|
+
import gzip
|
|
25
|
+
import json
|
|
26
|
+
import logging
|
|
27
|
+
import shutil
|
|
28
|
+
import subprocess
|
|
29
|
+
import urllib.request
|
|
30
|
+
from dataclasses import dataclass, field
|
|
31
|
+
from datetime import datetime
|
|
32
|
+
from pathlib import Path
|
|
33
|
+
from typing import Callable, Iterator, Optional
|
|
34
|
+
|
|
35
|
+
from ..models import CompanyRecord, EntityType, PersonRecord, PersonType
|
|
36
|
+
|
|
37
|
+
# Type alias for records that can be either people or orgs
|
|
38
|
+
ImportRecord = PersonRecord | CompanyRecord
|
|
39
|
+
|
|
40
|
+
logger = logging.getLogger(__name__)
|
|
41
|
+
|
|
42
|
+
# Wikidata dump URLs - mirrors for faster downloads
|
|
43
|
+
# Primary is Wikimedia (slow), alternatives may be faster
|
|
44
|
+
DUMP_MIRRORS = [
|
|
45
|
+
# Wikimedia Foundation (official, often slow)
|
|
46
|
+
"https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2",
|
|
47
|
+
# Academic Torrents mirror (if available) - typically faster
|
|
48
|
+
# Note: Check https://academictorrents.com/browse?search=wikidata for current links
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
# Default URL (can be overridden)
|
|
52
|
+
DUMP_URL = DUMP_MIRRORS[0]
|
|
53
|
+
|
|
54
|
+
# For even faster downloads, users can:
|
|
55
|
+
# 1. Use a torrent client with the Academic Torrents magnet link
|
|
56
|
+
# 2. Download from a regional Wikimedia mirror
|
|
57
|
+
# 3. Use aria2c with multiple connections: aria2c -x 16 -s 16 <url>
|
|
58
|
+
|
|
59
|
+
# =============================================================================
|
|
60
|
+
# POSITION TO PERSON TYPE MAPPING (P39 - position held)
|
|
61
|
+
# =============================================================================
|
|
62
|
+
|
|
63
|
+
# Executive positions (P39 values)
|
|
64
|
+
EXECUTIVE_POSITION_QIDS = {
|
|
65
|
+
"Q484876", # CEO
|
|
66
|
+
"Q623279", # CFO
|
|
67
|
+
"Q1502675", # COO
|
|
68
|
+
"Q935019", # CTO
|
|
69
|
+
"Q1057716", # CIO
|
|
70
|
+
"Q2140589", # CMO
|
|
71
|
+
"Q1115042", # chairperson
|
|
72
|
+
"Q4720025", # board of directors member
|
|
73
|
+
"Q60432825", # chief human resources officer
|
|
74
|
+
"Q15967139", # chief compliance officer
|
|
75
|
+
"Q15729310", # chief risk officer
|
|
76
|
+
"Q47523568", # chief legal officer
|
|
77
|
+
"Q258557", # board chair
|
|
78
|
+
"Q114863313", # chief sustainability officer
|
|
79
|
+
"Q726114", # company president
|
|
80
|
+
"Q1372944", # managing director
|
|
81
|
+
"Q18918145", # chief commercial officer
|
|
82
|
+
"Q1057569", # chief strategy officer
|
|
83
|
+
"Q24058752", # chief product officer
|
|
84
|
+
"Q3578048", # vice president
|
|
85
|
+
"Q476675", # business executive (generic)
|
|
86
|
+
"Q5441744", # finance director
|
|
87
|
+
"Q4188234", # general manager
|
|
88
|
+
"Q38844673", # chief data officer
|
|
89
|
+
"Q97273203", # chief digital officer
|
|
90
|
+
"Q60715311", # chief growth officer
|
|
91
|
+
"Q3563879", # treasurer
|
|
92
|
+
"Q3505845", # corporate secretary
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
# Politician positions (P39 values)
|
|
96
|
+
# Includes heads of state/government, legislators, and local officials
|
|
97
|
+
POLITICIAN_POSITION_QIDS = {
|
|
98
|
+
# Heads of state/government
|
|
99
|
+
"Q30461", # president
|
|
100
|
+
"Q14212", # prime minister
|
|
101
|
+
"Q83307", # minister
|
|
102
|
+
"Q2285706", # head of government
|
|
103
|
+
"Q48352", # head of state
|
|
104
|
+
"Q116", # monarch
|
|
105
|
+
"Q382617", # governor
|
|
106
|
+
"Q212071", # mayor
|
|
107
|
+
"Q1553195", # deputy prime minister
|
|
108
|
+
"Q1670573", # cabinet minister
|
|
109
|
+
"Q13218630", # secretary of state
|
|
110
|
+
"Q581682", # vice president
|
|
111
|
+
|
|
112
|
+
# Legislators - national
|
|
113
|
+
"Q4175034", # legislator
|
|
114
|
+
"Q486839", # member of parliament
|
|
115
|
+
"Q193391", # member of national legislature
|
|
116
|
+
"Q484529", # member of congress
|
|
117
|
+
"Q1711695", # senator
|
|
118
|
+
"Q18941264", # member of the House of Representatives (US)
|
|
119
|
+
"Q16707842", # member of the House of Commons (UK)
|
|
120
|
+
"Q18015642", # member of the House of Lords (UK)
|
|
121
|
+
"Q17295570", # member of the Bundestag (Germany)
|
|
122
|
+
"Q27169", # member of the European Parliament
|
|
123
|
+
"Q64366569", # member of Dáil Éireann (Ireland)
|
|
124
|
+
"Q19823090", # member of the Riksdag (Sweden)
|
|
125
|
+
"Q18229048", # member of Sejm (Poland)
|
|
126
|
+
"Q21032547", # member of the National Assembly (France)
|
|
127
|
+
"Q64511800", # member of the Knesset (Israel)
|
|
128
|
+
"Q50393121", # member of the State Duma (Russia)
|
|
129
|
+
"Q18558055", # member of the Diet (Japan)
|
|
130
|
+
"Q109862831", # member of Lok Sabha (India)
|
|
131
|
+
"Q63078776", # member of the Canadian House of Commons
|
|
132
|
+
"Q83767637", # member of the Australian House of Representatives
|
|
133
|
+
|
|
134
|
+
# Legislators - regional/local
|
|
135
|
+
"Q4382506", # member of state legislature
|
|
136
|
+
"Q17765219", # member of regional parliament
|
|
137
|
+
"Q1752514", # councillor (local government)
|
|
138
|
+
"Q18824436", # city councillor
|
|
139
|
+
|
|
140
|
+
# Other political offices
|
|
141
|
+
"Q294414", # public office (generic)
|
|
142
|
+
"Q889821", # ambassador
|
|
143
|
+
"Q15966511", # diplomat
|
|
144
|
+
"Q334344", # lord lieutenant
|
|
145
|
+
"Q16533", # judge (some are appointed politicians)
|
|
146
|
+
"Q3099732", # ombudsman
|
|
147
|
+
"Q1500443", # prefect
|
|
148
|
+
"Q611644", # envoy
|
|
149
|
+
"Q2824523", # political commissar
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
# =============================================================================
|
|
153
|
+
# OCCUPATION TO PERSON TYPE MAPPING (P106 - occupation)
|
|
154
|
+
# =============================================================================
|
|
155
|
+
|
|
156
|
+
OCCUPATION_TO_TYPE: dict[str, PersonType] = {
|
|
157
|
+
# Politicians (elected officials)
|
|
158
|
+
"Q82955": PersonType.POLITICIAN, # politician
|
|
159
|
+
"Q193391": PersonType.POLITICIAN, # member of parliament
|
|
160
|
+
"Q372436": PersonType.POLITICIAN, # statesperson
|
|
161
|
+
|
|
162
|
+
# Government (civil servants, diplomats, appointed officials)
|
|
163
|
+
"Q212238": PersonType.GOVERNMENT, # civil servant
|
|
164
|
+
"Q806798": PersonType.GOVERNMENT, # diplomat
|
|
165
|
+
"Q15627169": PersonType.GOVERNMENT, # trade unionist (often govt-adjacent)
|
|
166
|
+
|
|
167
|
+
# Military
|
|
168
|
+
"Q189290": PersonType.MILITARY, # military officer
|
|
169
|
+
"Q47064": PersonType.MILITARY, # military personnel
|
|
170
|
+
"Q4991371": PersonType.MILITARY, # soldier
|
|
171
|
+
"Q10669499": PersonType.MILITARY, # naval officer
|
|
172
|
+
"Q11974939": PersonType.MILITARY, # air force officer
|
|
173
|
+
"Q10974448": PersonType.MILITARY, # army officer
|
|
174
|
+
|
|
175
|
+
# Legal professionals
|
|
176
|
+
"Q16533": PersonType.LEGAL, # judge
|
|
177
|
+
"Q40348": PersonType.LEGAL, # lawyer
|
|
178
|
+
"Q185351": PersonType.LEGAL, # jurist
|
|
179
|
+
"Q3242871": PersonType.LEGAL, # prosecutor
|
|
180
|
+
"Q1792450": PersonType.LEGAL, # barrister
|
|
181
|
+
"Q3406182": PersonType.LEGAL, # solicitor
|
|
182
|
+
|
|
183
|
+
# Athletes
|
|
184
|
+
"Q2066131": PersonType.ATHLETE, # athlete
|
|
185
|
+
"Q937857": PersonType.ATHLETE, # football player
|
|
186
|
+
"Q3665646": PersonType.ATHLETE, # basketball player
|
|
187
|
+
"Q10871364": PersonType.ATHLETE, # baseball player
|
|
188
|
+
"Q19204627": PersonType.ATHLETE, # ice hockey player
|
|
189
|
+
"Q10843402": PersonType.ATHLETE, # tennis player
|
|
190
|
+
"Q13381376": PersonType.ATHLETE, # golfer
|
|
191
|
+
"Q11338576": PersonType.ATHLETE, # boxer
|
|
192
|
+
"Q10873124": PersonType.ATHLETE, # swimmer
|
|
193
|
+
"Q11303721": PersonType.ATHLETE, # racing driver
|
|
194
|
+
"Q10833314": PersonType.ATHLETE, # cricket player
|
|
195
|
+
"Q13141064": PersonType.ATHLETE, # rugby player
|
|
196
|
+
|
|
197
|
+
# Artists (traditional creative professions)
|
|
198
|
+
"Q33999": PersonType.ARTIST, # actor
|
|
199
|
+
"Q177220": PersonType.ARTIST, # singer
|
|
200
|
+
"Q639669": PersonType.ARTIST, # musician
|
|
201
|
+
"Q2526255": PersonType.ARTIST, # film director
|
|
202
|
+
"Q36180": PersonType.ARTIST, # writer
|
|
203
|
+
"Q483501": PersonType.ARTIST, # artist
|
|
204
|
+
"Q488205": PersonType.ARTIST, # singer-songwriter
|
|
205
|
+
"Q753110": PersonType.ARTIST, # songwriter
|
|
206
|
+
"Q2405480": PersonType.ARTIST, # voice actor
|
|
207
|
+
"Q10800557": PersonType.ARTIST, # film actor
|
|
208
|
+
"Q3455803": PersonType.ARTIST, # director
|
|
209
|
+
"Q28389": PersonType.ARTIST, # screenwriter
|
|
210
|
+
"Q6625963": PersonType.ARTIST, # comedian
|
|
211
|
+
"Q2259451": PersonType.ARTIST, # stand-up comedian
|
|
212
|
+
"Q2490358": PersonType.ARTIST, # choreographer
|
|
213
|
+
"Q2722764": PersonType.ARTIST, # DJ (disc jockey)
|
|
214
|
+
"Q183945": PersonType.ARTIST, # record producer
|
|
215
|
+
"Q3282637": PersonType.ARTIST, # film producer
|
|
216
|
+
"Q49757": PersonType.ARTIST, # poet
|
|
217
|
+
"Q28640": PersonType.ARTIST, # illustrator
|
|
218
|
+
"Q1028181": PersonType.ARTIST, # painter
|
|
219
|
+
"Q1281618": PersonType.ARTIST, # sculptor
|
|
220
|
+
"Q33231": PersonType.ARTIST, # photographer
|
|
221
|
+
"Q806349": PersonType.ARTIST, # band leader
|
|
222
|
+
"Q855091": PersonType.ARTIST, # rapper
|
|
223
|
+
"Q4351403": PersonType.ARTIST, # novelist
|
|
224
|
+
"Q158852": PersonType.ARTIST, # conductor (music)
|
|
225
|
+
"Q486748": PersonType.ARTIST, # pianist
|
|
226
|
+
"Q1415090": PersonType.ARTIST, # guitarist
|
|
227
|
+
|
|
228
|
+
# Media (internet/social media personalities)
|
|
229
|
+
"Q6168364": PersonType.MEDIA, # YouTuber
|
|
230
|
+
"Q15077007": PersonType.MEDIA, # podcaster
|
|
231
|
+
"Q17125263": PersonType.MEDIA, # social media influencer
|
|
232
|
+
"Q15981151": PersonType.MEDIA, # internet celebrity
|
|
233
|
+
"Q2059704": PersonType.MEDIA, # television personality
|
|
234
|
+
"Q4610556": PersonType.MEDIA, # model
|
|
235
|
+
"Q578109": PersonType.MEDIA, # television producer
|
|
236
|
+
"Q2516866": PersonType.MEDIA, # publisher
|
|
237
|
+
"Q93191800": PersonType.MEDIA, # content creator
|
|
238
|
+
"Q105756498": PersonType.MEDIA, # streamer (Twitch etc.)
|
|
239
|
+
|
|
240
|
+
# Professionals (known for their profession/work)
|
|
241
|
+
"Q39631": PersonType.PROFESSIONAL, # physician/doctor
|
|
242
|
+
"Q774306": PersonType.PROFESSIONAL, # surgeon
|
|
243
|
+
"Q1234713": PersonType.PROFESSIONAL, # dentist
|
|
244
|
+
"Q15924224": PersonType.PROFESSIONAL, # psychiatrist
|
|
245
|
+
"Q212980": PersonType.PROFESSIONAL, # psychologist
|
|
246
|
+
"Q81096": PersonType.PROFESSIONAL, # engineer
|
|
247
|
+
"Q42603": PersonType.PROFESSIONAL, # priest/clergy
|
|
248
|
+
"Q432386": PersonType.PROFESSIONAL, # architect
|
|
249
|
+
"Q3621491": PersonType.PROFESSIONAL, # nurse
|
|
250
|
+
"Q18805": PersonType.PROFESSIONAL, # pharmacist
|
|
251
|
+
"Q15895020": PersonType.PROFESSIONAL, # veterinarian
|
|
252
|
+
"Q131512": PersonType.PROFESSIONAL, # chef
|
|
253
|
+
"Q3499072": PersonType.PROFESSIONAL, # pilot
|
|
254
|
+
"Q15895449": PersonType.PROFESSIONAL, # accountant
|
|
255
|
+
"Q806750": PersonType.PROFESSIONAL, # consultant
|
|
256
|
+
"Q584301": PersonType.PROFESSIONAL, # economist (often professional)
|
|
257
|
+
"Q1371925": PersonType.PROFESSIONAL, # real estate agent
|
|
258
|
+
"Q266569": PersonType.PROFESSIONAL, # librarian
|
|
259
|
+
"Q5323050": PersonType.PROFESSIONAL, # electrical engineer
|
|
260
|
+
"Q13582652": PersonType.PROFESSIONAL, # civil engineer
|
|
261
|
+
"Q81965": PersonType.PROFESSIONAL, # software engineer
|
|
262
|
+
"Q5482740": PersonType.PROFESSIONAL, # data scientist
|
|
263
|
+
|
|
264
|
+
# Academics
|
|
265
|
+
"Q121594": PersonType.ACADEMIC, # professor
|
|
266
|
+
"Q3400985": PersonType.ACADEMIC, # academic
|
|
267
|
+
"Q1622272": PersonType.ACADEMIC, # university professor
|
|
268
|
+
|
|
269
|
+
# Scientists
|
|
270
|
+
"Q901": PersonType.SCIENTIST, # scientist
|
|
271
|
+
"Q1650915": PersonType.SCIENTIST, # researcher
|
|
272
|
+
"Q169470": PersonType.SCIENTIST, # physicist
|
|
273
|
+
"Q593644": PersonType.SCIENTIST, # chemist
|
|
274
|
+
"Q864503": PersonType.SCIENTIST, # biologist
|
|
275
|
+
"Q11063": PersonType.SCIENTIST, # astronomer
|
|
276
|
+
|
|
277
|
+
# Journalists
|
|
278
|
+
"Q1930187": PersonType.JOURNALIST, # journalist
|
|
279
|
+
"Q13590141": PersonType.JOURNALIST, # news presenter
|
|
280
|
+
"Q947873": PersonType.JOURNALIST, # television presenter
|
|
281
|
+
"Q4263842": PersonType.JOURNALIST, # columnist
|
|
282
|
+
|
|
283
|
+
# Activists
|
|
284
|
+
"Q15253558": PersonType.ACTIVIST, # activist
|
|
285
|
+
"Q11631410": PersonType.ACTIVIST, # human rights activist
|
|
286
|
+
"Q18939491": PersonType.ACTIVIST, # environmental activist
|
|
287
|
+
|
|
288
|
+
# Entrepreneurs/Executives via occupation
|
|
289
|
+
"Q131524": PersonType.ENTREPRENEUR, # entrepreneur
|
|
290
|
+
"Q43845": PersonType.ENTREPRENEUR, # businessperson
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
# =============================================================================
|
|
294
|
+
# ORGANIZATION TYPE MAPPING (P31 - instance of)
|
|
295
|
+
# =============================================================================
|
|
296
|
+
|
|
297
|
+
ORG_TYPE_TO_ENTITY_TYPE: dict[str, EntityType] = {
|
|
298
|
+
# Business - core types
|
|
299
|
+
"Q4830453": EntityType.BUSINESS, # business
|
|
300
|
+
"Q6881511": EntityType.BUSINESS, # enterprise
|
|
301
|
+
"Q783794": EntityType.BUSINESS, # company
|
|
302
|
+
"Q891723": EntityType.BUSINESS, # public company
|
|
303
|
+
"Q167037": EntityType.BUSINESS, # corporation
|
|
304
|
+
"Q658255": EntityType.BUSINESS, # subsidiary
|
|
305
|
+
"Q206652": EntityType.BUSINESS, # conglomerate
|
|
306
|
+
"Q22687": EntityType.BUSINESS, # bank
|
|
307
|
+
"Q1145276": EntityType.BUSINESS, # insurance company
|
|
308
|
+
"Q46970": EntityType.BUSINESS, # airline
|
|
309
|
+
"Q613142": EntityType.BUSINESS, # law firm
|
|
310
|
+
"Q507619": EntityType.BUSINESS, # pharmaceutical company
|
|
311
|
+
"Q2979960": EntityType.BUSINESS, # technology company
|
|
312
|
+
"Q1631111": EntityType.BUSINESS, # retailer
|
|
313
|
+
"Q187652": EntityType.BUSINESS, # manufacturer
|
|
314
|
+
# Business - additional types
|
|
315
|
+
"Q43229": EntityType.BUSINESS, # organization (generic)
|
|
316
|
+
"Q4671277": EntityType.BUSINESS, # academic institution (some are businesses)
|
|
317
|
+
"Q1664720": EntityType.BUSINESS, # institute
|
|
318
|
+
"Q15911314": EntityType.BUSINESS, # association
|
|
319
|
+
"Q15925165": EntityType.BUSINESS, # private company
|
|
320
|
+
"Q5225895": EntityType.BUSINESS, # credit union
|
|
321
|
+
"Q161726": EntityType.BUSINESS, # multinational corporation
|
|
322
|
+
"Q134161": EntityType.BUSINESS, # joint venture
|
|
323
|
+
"Q1589009": EntityType.BUSINESS, # privately held company
|
|
324
|
+
"Q270791": EntityType.BUSINESS, # state-owned enterprise
|
|
325
|
+
"Q1762059": EntityType.BUSINESS, # online service provider
|
|
326
|
+
"Q17127659": EntityType.BUSINESS, # energy company
|
|
327
|
+
"Q2695280": EntityType.BUSINESS, # construction company
|
|
328
|
+
"Q1624464": EntityType.BUSINESS, # telecommunications company
|
|
329
|
+
"Q1668024": EntityType.BUSINESS, # car manufacturer
|
|
330
|
+
"Q3914": EntityType.BUSINESS, # school (some are businesses)
|
|
331
|
+
"Q1030034": EntityType.BUSINESS, # management consulting firm
|
|
332
|
+
"Q1370614": EntityType.BUSINESS, # investment bank
|
|
333
|
+
"Q1785271": EntityType.BUSINESS, # advertising agency
|
|
334
|
+
"Q4686042": EntityType.BUSINESS, # automotive supplier
|
|
335
|
+
"Q431289": EntityType.BUSINESS, # brand
|
|
336
|
+
"Q622438": EntityType.BUSINESS, # supermarket chain
|
|
337
|
+
"Q6500733": EntityType.BUSINESS, # licensed retailer
|
|
338
|
+
"Q2659904": EntityType.BUSINESS, # government-owned corporation
|
|
339
|
+
"Q1065118": EntityType.BUSINESS, # bookmaker
|
|
340
|
+
"Q179179": EntityType.BUSINESS, # startup
|
|
341
|
+
"Q210167": EntityType.BUSINESS, # video game developer
|
|
342
|
+
"Q18388277": EntityType.BUSINESS, # video game publisher
|
|
343
|
+
"Q1762913": EntityType.BUSINESS, # film production company
|
|
344
|
+
"Q18558478": EntityType.BUSINESS, # money services business
|
|
345
|
+
"Q6463968": EntityType.BUSINESS, # asset management company
|
|
346
|
+
"Q2864737": EntityType.BUSINESS, # cooperative bank
|
|
347
|
+
"Q161380": EntityType.BUSINESS, # cooperative
|
|
348
|
+
"Q15850590": EntityType.BUSINESS, # real estate company
|
|
349
|
+
"Q1048835": EntityType.BUSINESS, # political organization
|
|
350
|
+
"Q1254933": EntityType.BUSINESS, # astronomical observatory (often research orgs)
|
|
351
|
+
"Q294414": EntityType.BUSINESS, # public office
|
|
352
|
+
|
|
353
|
+
# Funds
|
|
354
|
+
"Q45400320": EntityType.FUND, # investment fund
|
|
355
|
+
"Q476028": EntityType.FUND, # hedge fund
|
|
356
|
+
"Q380649": EntityType.FUND, # investment company
|
|
357
|
+
"Q1377053": EntityType.FUND, # mutual fund
|
|
358
|
+
"Q3312546": EntityType.FUND, # private equity firm
|
|
359
|
+
"Q751705": EntityType.FUND, # venture capital firm
|
|
360
|
+
"Q2296920": EntityType.FUND, # sovereign wealth fund
|
|
361
|
+
"Q2824951": EntityType.FUND, # exchange-traded fund
|
|
362
|
+
"Q1755098": EntityType.FUND, # pension fund
|
|
363
|
+
|
|
364
|
+
# Nonprofits
|
|
365
|
+
"Q163740": EntityType.NONPROFIT, # nonprofit organization
|
|
366
|
+
"Q79913": EntityType.NGO, # non-governmental organization
|
|
367
|
+
"Q157031": EntityType.FOUNDATION, # foundation
|
|
368
|
+
"Q48204": EntityType.NONPROFIT, # voluntary association
|
|
369
|
+
"Q988108": EntityType.NONPROFIT, # club
|
|
370
|
+
"Q476436": EntityType.NONPROFIT, # charitable organization
|
|
371
|
+
"Q3591957": EntityType.NONPROFIT, # cultural institution
|
|
372
|
+
"Q162633": EntityType.NONPROFIT, # academy
|
|
373
|
+
"Q270791": EntityType.NONPROFIT, # learned society
|
|
374
|
+
"Q484652": EntityType.NONPROFIT, # international organization
|
|
375
|
+
|
|
376
|
+
# Government
|
|
377
|
+
"Q327333": EntityType.GOVERNMENT, # government agency
|
|
378
|
+
"Q7278": EntityType.POLITICAL_PARTY, # political party
|
|
379
|
+
"Q178790": EntityType.TRADE_UNION, # trade union
|
|
380
|
+
"Q7188": EntityType.GOVERNMENT, # government
|
|
381
|
+
"Q2659904": EntityType.GOVERNMENT, # government-owned corporation
|
|
382
|
+
"Q35798": EntityType.GOVERNMENT, # executive branch
|
|
383
|
+
"Q35749": EntityType.GOVERNMENT, # legislature
|
|
384
|
+
"Q12076836": EntityType.GOVERNMENT, # law enforcement agency
|
|
385
|
+
"Q17362920": EntityType.GOVERNMENT, # public body
|
|
386
|
+
"Q1063239": EntityType.GOVERNMENT, # regulatory agency
|
|
387
|
+
"Q3624078": EntityType.GOVERNMENT, # sovereign state
|
|
388
|
+
"Q133442": EntityType.GOVERNMENT, # embassy
|
|
389
|
+
"Q174834": EntityType.GOVERNMENT, # authority (government)
|
|
390
|
+
|
|
391
|
+
# International organizations
|
|
392
|
+
"Q484652": EntityType.INTERNATIONAL_ORG, # international organization
|
|
393
|
+
"Q1335818": EntityType.INTERNATIONAL_ORG, # supranational organisation
|
|
394
|
+
"Q1616075": EntityType.INTERNATIONAL_ORG, # intergovernmental organization
|
|
395
|
+
|
|
396
|
+
# Education/Research
|
|
397
|
+
"Q2385804": EntityType.EDUCATIONAL, # educational institution
|
|
398
|
+
"Q3918": EntityType.EDUCATIONAL, # university
|
|
399
|
+
"Q31855": EntityType.RESEARCH, # research institute
|
|
400
|
+
"Q875538": EntityType.EDUCATIONAL, # public university
|
|
401
|
+
"Q23002039": EntityType.EDUCATIONAL, # private university
|
|
402
|
+
"Q38723": EntityType.EDUCATIONAL, # higher education institution
|
|
403
|
+
"Q1371037": EntityType.EDUCATIONAL, # secondary school
|
|
404
|
+
"Q9842": EntityType.EDUCATIONAL, # primary school
|
|
405
|
+
"Q189004": EntityType.EDUCATIONAL, # college
|
|
406
|
+
"Q1188663": EntityType.EDUCATIONAL, # community college
|
|
407
|
+
"Q1321960": EntityType.RESEARCH, # think tank
|
|
408
|
+
"Q31855": EntityType.RESEARCH, # research institute
|
|
409
|
+
"Q3354859": EntityType.RESEARCH, # observatory
|
|
410
|
+
"Q1298668": EntityType.RESEARCH, # research center
|
|
411
|
+
|
|
412
|
+
# Healthcare
|
|
413
|
+
"Q16917": EntityType.HEALTHCARE, # hospital
|
|
414
|
+
"Q1774898": EntityType.HEALTHCARE, # health care organization
|
|
415
|
+
"Q180958": EntityType.HEALTHCARE, # clinic
|
|
416
|
+
"Q4260475": EntityType.HEALTHCARE, # medical facility
|
|
417
|
+
"Q871964": EntityType.HEALTHCARE, # biotechnology company
|
|
418
|
+
"Q902104": EntityType.HEALTHCARE, # health insurance company
|
|
419
|
+
|
|
420
|
+
# Sports
|
|
421
|
+
"Q847017": EntityType.SPORTS, # sports club
|
|
422
|
+
"Q476068": EntityType.SPORTS, # sports team
|
|
423
|
+
"Q12973014": EntityType.SPORTS, # sports organization
|
|
424
|
+
"Q14350": EntityType.SPORTS, # association football club
|
|
425
|
+
"Q20639847": EntityType.SPORTS, # American football team
|
|
426
|
+
"Q13393265": EntityType.SPORTS, # basketball team
|
|
427
|
+
"Q13406463": EntityType.SPORTS, # baseball team
|
|
428
|
+
"Q1410877": EntityType.SPORTS, # ice hockey team
|
|
429
|
+
"Q18558301": EntityType.SPORTS, # rugby union club
|
|
430
|
+
"Q2093802": EntityType.SPORTS, # cricket team
|
|
431
|
+
"Q5137836": EntityType.SPORTS, # motorsport racing team
|
|
432
|
+
|
|
433
|
+
# Media
|
|
434
|
+
"Q18127": EntityType.MEDIA, # record label
|
|
435
|
+
"Q1366047": EntityType.MEDIA, # film studio
|
|
436
|
+
"Q1137109": EntityType.MEDIA, # video game company
|
|
437
|
+
"Q11032": EntityType.MEDIA, # newspaper
|
|
438
|
+
"Q1002697": EntityType.MEDIA, # periodical
|
|
439
|
+
"Q5398426": EntityType.MEDIA, # television series
|
|
440
|
+
"Q1110794": EntityType.MEDIA, # daily newspaper
|
|
441
|
+
"Q1616075": EntityType.MEDIA, # news agency
|
|
442
|
+
"Q14350": EntityType.MEDIA, # magazine
|
|
443
|
+
"Q15265344": EntityType.MEDIA, # broadcaster
|
|
444
|
+
"Q131436": EntityType.MEDIA, # radio station
|
|
445
|
+
"Q1616075": EntityType.MEDIA, # television station
|
|
446
|
+
"Q41298": EntityType.MEDIA, # magazine
|
|
447
|
+
"Q30022": EntityType.MEDIA, # television channel
|
|
448
|
+
"Q17232649": EntityType.MEDIA, # publishing company
|
|
449
|
+
"Q28803812": EntityType.MEDIA, # streaming service
|
|
450
|
+
"Q159334": EntityType.MEDIA, # entertainment company
|
|
451
|
+
|
|
452
|
+
# Religious
|
|
453
|
+
"Q9174": EntityType.RELIGIOUS, # religion
|
|
454
|
+
"Q1530022": EntityType.RELIGIOUS, # religious organization
|
|
455
|
+
"Q2994867": EntityType.RELIGIOUS, # religious community
|
|
456
|
+
"Q34651": EntityType.RELIGIOUS, # church (building as org)
|
|
457
|
+
"Q44613": EntityType.RELIGIOUS, # monastery
|
|
458
|
+
}
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
# =============================================================================
|
|
462
|
+
# PROGRESS TRACKING
|
|
463
|
+
# =============================================================================
|
|
464
|
+
|
|
465
|
+
DEFAULT_PROGRESS_PATH = Path.home() / ".cache" / "corp-extractor" / "wikidata-dump-progress.json"
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
@dataclass
|
|
469
|
+
class DumpProgress:
|
|
470
|
+
"""
|
|
471
|
+
Tracks progress through the Wikidata dump file for resume support.
|
|
472
|
+
|
|
473
|
+
Progress is tracked by entity index (number of entities processed).
|
|
474
|
+
On resume, entities are skipped until reaching the saved position.
|
|
475
|
+
"""
|
|
476
|
+
# Entity index - number of entities yielded from the dump
|
|
477
|
+
entity_index: int = 0
|
|
478
|
+
|
|
479
|
+
# Separate counters for people and orgs import
|
|
480
|
+
people_yielded: int = 0
|
|
481
|
+
orgs_yielded: int = 0
|
|
482
|
+
|
|
483
|
+
# Last entity ID processed (for verification)
|
|
484
|
+
last_entity_id: str = ""
|
|
485
|
+
|
|
486
|
+
# Timestamp of last update
|
|
487
|
+
last_updated: str = field(default_factory=lambda: datetime.now().isoformat())
|
|
488
|
+
|
|
489
|
+
# Dump file path (to detect if dump changed)
|
|
490
|
+
dump_path: str = ""
|
|
491
|
+
|
|
492
|
+
# Dump file size (to detect if dump changed)
|
|
493
|
+
dump_size: int = 0
|
|
494
|
+
|
|
495
|
+
def save(self, path: Optional[Path] = None) -> None:
|
|
496
|
+
"""Save progress to JSON file."""
|
|
497
|
+
path = path or DEFAULT_PROGRESS_PATH
|
|
498
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
499
|
+
self.last_updated = datetime.now().isoformat()
|
|
500
|
+
with open(path, "w") as f:
|
|
501
|
+
json.dump({
|
|
502
|
+
"entity_index": self.entity_index,
|
|
503
|
+
"people_yielded": self.people_yielded,
|
|
504
|
+
"orgs_yielded": self.orgs_yielded,
|
|
505
|
+
"last_entity_id": self.last_entity_id,
|
|
506
|
+
"last_updated": self.last_updated,
|
|
507
|
+
"dump_path": self.dump_path,
|
|
508
|
+
"dump_size": self.dump_size,
|
|
509
|
+
}, f, indent=2)
|
|
510
|
+
logger.debug(f"Saved progress: entity_index={self.entity_index}, last_id={self.last_entity_id}")
|
|
511
|
+
|
|
512
|
+
@classmethod
|
|
513
|
+
def load(cls, path: Optional[Path] = None) -> Optional["DumpProgress"]:
|
|
514
|
+
"""Load progress from JSON file, returns None if not found."""
|
|
515
|
+
path = path or DEFAULT_PROGRESS_PATH
|
|
516
|
+
if not path.exists():
|
|
517
|
+
return None
|
|
518
|
+
try:
|
|
519
|
+
with open(path) as f:
|
|
520
|
+
data = json.load(f)
|
|
521
|
+
return cls(
|
|
522
|
+
entity_index=data.get("entity_index", 0),
|
|
523
|
+
people_yielded=data.get("people_yielded", 0),
|
|
524
|
+
orgs_yielded=data.get("orgs_yielded", 0),
|
|
525
|
+
last_entity_id=data.get("last_entity_id", ""),
|
|
526
|
+
last_updated=data.get("last_updated", ""),
|
|
527
|
+
dump_path=data.get("dump_path", ""),
|
|
528
|
+
dump_size=data.get("dump_size", 0),
|
|
529
|
+
)
|
|
530
|
+
except (json.JSONDecodeError, KeyError, TypeError) as e:
|
|
531
|
+
logger.warning(f"Failed to load progress from {path}: {e}")
|
|
532
|
+
return None
|
|
533
|
+
|
|
534
|
+
@classmethod
|
|
535
|
+
def clear(cls, path: Optional[Path] = None) -> None:
|
|
536
|
+
"""Delete the progress file."""
|
|
537
|
+
path = path or DEFAULT_PROGRESS_PATH
|
|
538
|
+
if path.exists():
|
|
539
|
+
path.unlink()
|
|
540
|
+
logger.info(f"Cleared progress file: {path}")
|
|
541
|
+
|
|
542
|
+
def matches_dump(self, dump_path: Path) -> bool:
|
|
543
|
+
"""Check if this progress matches the given dump file."""
|
|
544
|
+
if str(dump_path) != self.dump_path:
|
|
545
|
+
return False
|
|
546
|
+
if dump_path.exists() and dump_path.stat().st_size != self.dump_size:
|
|
547
|
+
return False
|
|
548
|
+
return True
|
|
549
|
+
|
|
550
|
+
|
|
551
|
+
class WikidataDumpImporter:
|
|
552
|
+
"""
|
|
553
|
+
Stream Wikidata JSON dump to extract people and organization records.
|
|
554
|
+
|
|
555
|
+
This importer processes the Wikidata dump line-by-line to avoid memory issues
|
|
556
|
+
with the ~100GB compressed file. It filters for:
|
|
557
|
+
- Humans (P31=Q5) with English Wikipedia articles
|
|
558
|
+
- Organizations with English Wikipedia articles
|
|
559
|
+
|
|
560
|
+
The dump URL can be customized, and the importer supports both .bz2 and .gz
|
|
561
|
+
compression formats.
|
|
562
|
+
"""
|
|
563
|
+
|
|
564
|
+
def __init__(self, dump_path: Optional[str] = None):
|
|
565
|
+
"""
|
|
566
|
+
Initialize the dump importer.
|
|
567
|
+
|
|
568
|
+
Args:
|
|
569
|
+
dump_path: Optional path to a pre-downloaded dump file.
|
|
570
|
+
If not provided, will need to call download_dump() first.
|
|
571
|
+
"""
|
|
572
|
+
self._dump_path = Path(dump_path) if dump_path else None
|
|
573
|
+
# Track discovered organizations from people import
|
|
574
|
+
self._discovered_orgs: dict[str, str] = {}
|
|
575
|
+
# Track QIDs that need label resolution (country, role)
|
|
576
|
+
self._unresolved_qids: set[str] = set()
|
|
577
|
+
# Label cache built during dump processing
|
|
578
|
+
self._label_cache: dict[str, str] = {}
|
|
579
|
+
|
|
580
|
+
def download_dump(
|
|
581
|
+
self,
|
|
582
|
+
target_dir: Optional[Path] = None,
|
|
583
|
+
force: bool = False,
|
|
584
|
+
progress_callback: Optional[Callable[[int, int], None]] = None,
|
|
585
|
+
use_aria2: bool = True,
|
|
586
|
+
aria2_connections: int = 16,
|
|
587
|
+
) -> Path:
|
|
588
|
+
"""
|
|
589
|
+
Download the latest Wikidata dump with progress indicator.
|
|
590
|
+
|
|
591
|
+
For fastest downloads, uses aria2c if available (16 parallel connections).
|
|
592
|
+
Falls back to urllib if aria2c is not installed.
|
|
593
|
+
|
|
594
|
+
Args:
|
|
595
|
+
target_dir: Directory to save the dump (default: ~/.cache/corp-extractor)
|
|
596
|
+
force: Force re-download even if file exists
|
|
597
|
+
progress_callback: Optional callback(downloaded_bytes, total_bytes) for progress
|
|
598
|
+
use_aria2: Try to use aria2c for faster downloads (default: True)
|
|
599
|
+
aria2_connections: Number of connections for aria2c (default: 16)
|
|
600
|
+
|
|
601
|
+
Returns:
|
|
602
|
+
Path to the downloaded dump file
|
|
603
|
+
"""
|
|
604
|
+
if target_dir is None:
|
|
605
|
+
target_dir = Path.home() / ".cache" / "corp-extractor"
|
|
606
|
+
|
|
607
|
+
target_dir.mkdir(parents=True, exist_ok=True)
|
|
608
|
+
dump_path = target_dir / "wikidata-latest-all.json.bz2"
|
|
609
|
+
|
|
610
|
+
if dump_path.exists() and not force:
|
|
611
|
+
logger.info(f"Using cached dump at {dump_path}")
|
|
612
|
+
self._dump_path = dump_path
|
|
613
|
+
return dump_path
|
|
614
|
+
|
|
615
|
+
logger.info(f"Target: {dump_path}")
|
|
616
|
+
|
|
617
|
+
# Try aria2c first for much faster downloads
|
|
618
|
+
if use_aria2 and shutil.which("aria2c"):
|
|
619
|
+
logger.info("Using aria2c for fast parallel download...")
|
|
620
|
+
try:
|
|
621
|
+
self._download_with_aria2(dump_path, connections=aria2_connections)
|
|
622
|
+
self._dump_path = dump_path
|
|
623
|
+
return dump_path
|
|
624
|
+
except Exception as e:
|
|
625
|
+
logger.warning(f"aria2c download failed: {e}, falling back to urllib")
|
|
626
|
+
|
|
627
|
+
# Fallback to urllib
|
|
628
|
+
logger.info(f"Downloading Wikidata dump from {DUMP_URL}...")
|
|
629
|
+
logger.info("TIP: Install aria2c for 10-20x faster downloads: brew install aria2")
|
|
630
|
+
logger.info("This is a large file (~100GB) and will take significant time.")
|
|
631
|
+
|
|
632
|
+
# Stream download with progress
|
|
633
|
+
req = urllib.request.Request(
|
|
634
|
+
DUMP_URL,
|
|
635
|
+
headers={"User-Agent": "corp-extractor/1.0 (Wikidata dump importer)"}
|
|
636
|
+
)
|
|
637
|
+
|
|
638
|
+
with urllib.request.urlopen(req) as response:
|
|
639
|
+
total = int(response.headers.get("content-length", 0))
|
|
640
|
+
total_gb = total / (1024 ** 3) if total else 0
|
|
641
|
+
|
|
642
|
+
with open(dump_path, "wb") as f:
|
|
643
|
+
downloaded = 0
|
|
644
|
+
chunk_size = 8 * 1024 * 1024 # 8MB chunks
|
|
645
|
+
last_log_pct = 0
|
|
646
|
+
|
|
647
|
+
while True:
|
|
648
|
+
chunk = response.read(chunk_size)
|
|
649
|
+
if not chunk:
|
|
650
|
+
break
|
|
651
|
+
f.write(chunk)
|
|
652
|
+
downloaded += len(chunk)
|
|
653
|
+
|
|
654
|
+
# Call progress callback if provided
|
|
655
|
+
if progress_callback:
|
|
656
|
+
progress_callback(downloaded, total)
|
|
657
|
+
else:
|
|
658
|
+
# Default logging (every 1%)
|
|
659
|
+
if total:
|
|
660
|
+
pct = int((downloaded / total) * 100)
|
|
661
|
+
if pct > last_log_pct:
|
|
662
|
+
downloaded_gb = downloaded / (1024 ** 3)
|
|
663
|
+
logger.info(f"Downloaded {downloaded_gb:.1f}GB / {total_gb:.1f}GB ({pct}%)")
|
|
664
|
+
last_log_pct = pct
|
|
665
|
+
elif downloaded % (1024 ** 3) < chunk_size:
|
|
666
|
+
# Log every GB if total unknown
|
|
667
|
+
downloaded_gb = downloaded / (1024 ** 3)
|
|
668
|
+
logger.info(f"Downloaded {downloaded_gb:.1f}GB")
|
|
669
|
+
|
|
670
|
+
logger.info(f"Download complete: {dump_path}")
|
|
671
|
+
self._dump_path = dump_path
|
|
672
|
+
return dump_path
|
|
673
|
+
|
|
674
|
+
def _download_with_aria2(
|
|
675
|
+
self,
|
|
676
|
+
output_path: Path,
|
|
677
|
+
connections: int = 16,
|
|
678
|
+
) -> None:
|
|
679
|
+
"""
|
|
680
|
+
Download using aria2c with multiple parallel connections.
|
|
681
|
+
|
|
682
|
+
aria2c can achieve 10-20x faster downloads by using multiple
|
|
683
|
+
connections to the server.
|
|
684
|
+
|
|
685
|
+
Args:
|
|
686
|
+
output_path: Where to save the downloaded file
|
|
687
|
+
connections: Number of parallel connections (default: 16)
|
|
688
|
+
"""
|
|
689
|
+
cmd = [
|
|
690
|
+
"aria2c",
|
|
691
|
+
"-x", str(connections), # Max connections per server
|
|
692
|
+
"-s", str(connections), # Split file into N parts
|
|
693
|
+
"-k", "10M", # Min split size
|
|
694
|
+
"--file-allocation=none", # Faster on SSDs
|
|
695
|
+
"-d", str(output_path.parent),
|
|
696
|
+
"-o", output_path.name,
|
|
697
|
+
"--console-log-level=notice",
|
|
698
|
+
"--summary-interval=10",
|
|
699
|
+
DUMP_URL,
|
|
700
|
+
]
|
|
701
|
+
|
|
702
|
+
logger.info(f"Running: {' '.join(cmd)}")
|
|
703
|
+
|
|
704
|
+
# Run aria2c and stream output
|
|
705
|
+
process = subprocess.Popen(
|
|
706
|
+
cmd,
|
|
707
|
+
stdout=subprocess.PIPE,
|
|
708
|
+
stderr=subprocess.STDOUT,
|
|
709
|
+
text=True,
|
|
710
|
+
)
|
|
711
|
+
|
|
712
|
+
# Stream output to logger
|
|
713
|
+
if process.stdout:
|
|
714
|
+
for line in process.stdout:
|
|
715
|
+
line = line.strip()
|
|
716
|
+
if line:
|
|
717
|
+
logger.info(f"aria2c: {line}")
|
|
718
|
+
|
|
719
|
+
return_code = process.wait()
|
|
720
|
+
if return_code != 0:
|
|
721
|
+
raise RuntimeError(f"aria2c exited with code {return_code}")
|
|
722
|
+
|
|
723
|
+
def get_dump_path(self, target_dir: Optional[Path] = None) -> Path:
|
|
724
|
+
"""
|
|
725
|
+
Get the path where the dump would be/is downloaded.
|
|
726
|
+
|
|
727
|
+
Args:
|
|
728
|
+
target_dir: Directory for the dump (default: ~/.cache/corp-extractor)
|
|
729
|
+
|
|
730
|
+
Returns:
|
|
731
|
+
Path to the dump file location
|
|
732
|
+
"""
|
|
733
|
+
if target_dir is None:
|
|
734
|
+
target_dir = Path.home() / ".cache" / "corp-extractor"
|
|
735
|
+
return target_dir / "wikidata-latest-all.json.bz2"
|
|
736
|
+
|
|
737
|
+
def iter_entities(
|
|
738
|
+
self,
|
|
739
|
+
dump_path: Optional[Path] = None,
|
|
740
|
+
start_index: int = 0,
|
|
741
|
+
progress_callback: Optional[Callable[[int, str], None]] = None,
|
|
742
|
+
) -> Iterator[dict]:
|
|
743
|
+
"""
|
|
744
|
+
Stream entities from dump file, one at a time.
|
|
745
|
+
|
|
746
|
+
Handles the Wikidata JSON dump format where each line after the opening
|
|
747
|
+
bracket is a JSON object with a trailing comma (except the last).
|
|
748
|
+
|
|
749
|
+
Args:
|
|
750
|
+
dump_path: Path to dump file (uses self._dump_path if not provided)
|
|
751
|
+
start_index: Entity index to start yielding from (default 0). Entities
|
|
752
|
+
before this index are skipped but still cached for label lookups.
|
|
753
|
+
progress_callback: Optional callback(entity_index, entity_id) called for each
|
|
754
|
+
yielded entity. Useful for tracking progress.
|
|
755
|
+
|
|
756
|
+
Yields:
|
|
757
|
+
Parsed entity dictionaries
|
|
758
|
+
"""
|
|
759
|
+
path = dump_path or self._dump_path
|
|
760
|
+
if path is None:
|
|
761
|
+
raise ValueError("No dump path provided. Call download_dump() first or pass dump_path.")
|
|
762
|
+
|
|
763
|
+
path = Path(path)
|
|
764
|
+
|
|
765
|
+
# Select opener based on extension
|
|
766
|
+
if path.suffix == ".bz2":
|
|
767
|
+
opener = bz2.open
|
|
768
|
+
elif path.suffix == ".gz":
|
|
769
|
+
opener = gzip.open
|
|
770
|
+
else:
|
|
771
|
+
# Assume uncompressed
|
|
772
|
+
opener = open
|
|
773
|
+
|
|
774
|
+
logger.info(f"Opening dump file: {path}")
|
|
775
|
+
logger.info(f"File size: {path.stat().st_size / (1024**3):.1f} GB")
|
|
776
|
+
if start_index > 0:
|
|
777
|
+
logger.info(f"Resuming from entity index {start_index:,} (skipping earlier entities)")
|
|
778
|
+
logger.info("Starting to read dump (bz2 decompression is slow, please wait)...")
|
|
779
|
+
|
|
780
|
+
with opener(path, "rt", encoding="utf-8") as f:
|
|
781
|
+
logger.info("Dump file opened successfully, reading lines...")
|
|
782
|
+
line_count = 0
|
|
783
|
+
entity_count = 0
|
|
784
|
+
skipped_count = 0
|
|
785
|
+
# Log more frequently at start, then reduce frequency
|
|
786
|
+
next_log_threshold = 10_000
|
|
787
|
+
|
|
788
|
+
for line in f:
|
|
789
|
+
line_count += 1
|
|
790
|
+
|
|
791
|
+
# Log first few lines to show we're making progress
|
|
792
|
+
if line_count <= 5:
|
|
793
|
+
logger.info(f"Read line {line_count} ({len(line)} chars)")
|
|
794
|
+
elif line_count == 100:
|
|
795
|
+
logger.info(f"Read {line_count} lines...")
|
|
796
|
+
elif line_count == 1000:
|
|
797
|
+
logger.info(f"Read {line_count} lines...")
|
|
798
|
+
|
|
799
|
+
line = line.strip()
|
|
800
|
+
|
|
801
|
+
# Skip array brackets
|
|
802
|
+
if line in ("[", "]"):
|
|
803
|
+
continue
|
|
804
|
+
|
|
805
|
+
# Strip trailing comma
|
|
806
|
+
if line.endswith(","):
|
|
807
|
+
line = line[:-1]
|
|
808
|
+
|
|
809
|
+
if not line:
|
|
810
|
+
continue
|
|
811
|
+
|
|
812
|
+
try:
|
|
813
|
+
entity = json.loads(line)
|
|
814
|
+
entity_id = entity.get("id", "")
|
|
815
|
+
|
|
816
|
+
# Always cache label for QID lookups (even when skipping)
|
|
817
|
+
self._cache_entity_label(entity)
|
|
818
|
+
|
|
819
|
+
# Check if we should skip this entity (resuming)
|
|
820
|
+
if entity_count < start_index:
|
|
821
|
+
entity_count += 1
|
|
822
|
+
skipped_count += 1
|
|
823
|
+
# Log skipping progress with adaptive frequency
|
|
824
|
+
if skipped_count >= next_log_threshold:
|
|
825
|
+
pct = 100 * skipped_count / start_index if start_index > 0 else 0
|
|
826
|
+
logger.info(
|
|
827
|
+
f"Skipping... {skipped_count:,}/{start_index:,} entities "
|
|
828
|
+
f"({pct:.1f}%), label cache: {len(self._label_cache):,}"
|
|
829
|
+
)
|
|
830
|
+
# Increase threshold: 10K -> 100K -> 1M
|
|
831
|
+
if next_log_threshold < 100_000:
|
|
832
|
+
next_log_threshold = 100_000
|
|
833
|
+
elif next_log_threshold < 1_000_000:
|
|
834
|
+
next_log_threshold = 1_000_000
|
|
835
|
+
else:
|
|
836
|
+
next_log_threshold += 1_000_000
|
|
837
|
+
continue
|
|
838
|
+
|
|
839
|
+
entity_count += 1
|
|
840
|
+
|
|
841
|
+
# Log progress with adaptive frequency
|
|
842
|
+
if entity_count >= next_log_threshold:
|
|
843
|
+
logger.info(
|
|
844
|
+
f"Processed {entity_count:,} entities, "
|
|
845
|
+
f"label cache: {len(self._label_cache):,}, "
|
|
846
|
+
f"unresolved QIDs: {len(self._unresolved_qids):,}"
|
|
847
|
+
)
|
|
848
|
+
# Increase threshold: 10K -> 100K -> 1M -> 2M -> 3M...
|
|
849
|
+
if next_log_threshold < 100_000:
|
|
850
|
+
next_log_threshold = 100_000
|
|
851
|
+
elif next_log_threshold < 1_000_000:
|
|
852
|
+
next_log_threshold = 1_000_000
|
|
853
|
+
else:
|
|
854
|
+
next_log_threshold += 1_000_000
|
|
855
|
+
|
|
856
|
+
# Call progress callback if provided
|
|
857
|
+
if progress_callback:
|
|
858
|
+
progress_callback(entity_count, entity_id)
|
|
859
|
+
|
|
860
|
+
yield entity
|
|
861
|
+
|
|
862
|
+
except json.JSONDecodeError as e:
|
|
863
|
+
logger.debug(f"Line {line_count}: JSON decode error: {e}")
|
|
864
|
+
continue
|
|
865
|
+
|
|
866
|
+
def import_people(
|
|
867
|
+
self,
|
|
868
|
+
dump_path: Optional[Path] = None,
|
|
869
|
+
limit: Optional[int] = None,
|
|
870
|
+
require_enwiki: bool = False,
|
|
871
|
+
skip_ids: Optional[set[str]] = None,
|
|
872
|
+
start_index: int = 0,
|
|
873
|
+
progress_callback: Optional[Callable[[int, str, int], None]] = None,
|
|
874
|
+
) -> Iterator[PersonRecord]:
|
|
875
|
+
"""
|
|
876
|
+
Stream through dump, yielding ALL people (humans with P31=Q5).
|
|
877
|
+
|
|
878
|
+
This method filters the dump for:
|
|
879
|
+
- Items with type "item" (not properties)
|
|
880
|
+
- Humans (P31 contains Q5)
|
|
881
|
+
- Optionally: Has English Wikipedia article (enwiki sitelink)
|
|
882
|
+
|
|
883
|
+
PersonType is derived from positions (P39) and occupations (P106).
|
|
884
|
+
Parliamentary context (electoral district, term, party) is extracted from P39 qualifiers.
|
|
885
|
+
|
|
886
|
+
Args:
|
|
887
|
+
dump_path: Path to dump file (uses self._dump_path if not provided)
|
|
888
|
+
limit: Optional maximum number of records to return
|
|
889
|
+
require_enwiki: If True, only include people with English Wikipedia articles
|
|
890
|
+
skip_ids: Optional set of source_ids (Q codes) to skip. Checked early before
|
|
891
|
+
full processing to avoid unnecessary QID resolution.
|
|
892
|
+
start_index: Entity index to start from (for resume support). Entities
|
|
893
|
+
before this index are skipped but labels are still cached.
|
|
894
|
+
progress_callback: Optional callback(entity_index, entity_id, records_yielded)
|
|
895
|
+
called for each yielded record. Useful for saving progress.
|
|
896
|
+
|
|
897
|
+
Yields:
|
|
898
|
+
PersonRecord for each qualifying person
|
|
899
|
+
"""
|
|
900
|
+
path = dump_path or self._dump_path
|
|
901
|
+
count = 0
|
|
902
|
+
skipped = 0
|
|
903
|
+
current_entity_index = start_index
|
|
904
|
+
|
|
905
|
+
logger.info("Starting people import from Wikidata dump...")
|
|
906
|
+
if start_index > 0:
|
|
907
|
+
logger.info(f"Resuming from entity index {start_index:,}")
|
|
908
|
+
if not require_enwiki:
|
|
909
|
+
logger.info("Importing ALL humans (no enwiki filter)")
|
|
910
|
+
if skip_ids:
|
|
911
|
+
logger.info(f"Skipping {len(skip_ids):,} existing Q codes")
|
|
912
|
+
|
|
913
|
+
def track_entity(entity_index: int, entity_id: str) -> None:
|
|
914
|
+
nonlocal current_entity_index
|
|
915
|
+
current_entity_index = entity_index
|
|
916
|
+
|
|
917
|
+
for entity in self.iter_entities(path, start_index=start_index, progress_callback=track_entity):
|
|
918
|
+
if limit and count >= limit:
|
|
919
|
+
break
|
|
920
|
+
|
|
921
|
+
# Check skip_ids early, before full processing (avoids QID resolution)
|
|
922
|
+
entity_id = entity.get("id", "")
|
|
923
|
+
if skip_ids and entity_id in skip_ids:
|
|
924
|
+
skipped += 1
|
|
925
|
+
continue
|
|
926
|
+
|
|
927
|
+
record = self._process_person_entity(entity, require_enwiki=require_enwiki)
|
|
928
|
+
if record:
|
|
929
|
+
count += 1
|
|
930
|
+
if count % 10_000 == 0:
|
|
931
|
+
logger.info(f"Yielded {count:,} people records (skipped {skipped:,})...")
|
|
932
|
+
|
|
933
|
+
# Call progress callback with current position
|
|
934
|
+
if progress_callback:
|
|
935
|
+
progress_callback(current_entity_index, entity_id, count)
|
|
936
|
+
|
|
937
|
+
yield record
|
|
938
|
+
|
|
939
|
+
logger.info(f"People import complete: {count:,} records (skipped {skipped:,})")
|
|
940
|
+
|
|
941
|
+
def import_organizations(
|
|
942
|
+
self,
|
|
943
|
+
dump_path: Optional[Path] = None,
|
|
944
|
+
limit: Optional[int] = None,
|
|
945
|
+
require_enwiki: bool = False,
|
|
946
|
+
skip_ids: Optional[set[str]] = None,
|
|
947
|
+
start_index: int = 0,
|
|
948
|
+
progress_callback: Optional[Callable[[int, str, int], None]] = None,
|
|
949
|
+
) -> Iterator[CompanyRecord]:
|
|
950
|
+
"""
|
|
951
|
+
Stream through dump, yielding organizations.
|
|
952
|
+
|
|
953
|
+
This method filters the dump for:
|
|
954
|
+
- Items with type "item"
|
|
955
|
+
- Has P31 (instance of) matching an organization type
|
|
956
|
+
- Optionally: Has English Wikipedia article (enwiki sitelink)
|
|
957
|
+
|
|
958
|
+
Args:
|
|
959
|
+
dump_path: Path to dump file (uses self._dump_path if not provided)
|
|
960
|
+
limit: Optional maximum number of records to return
|
|
961
|
+
require_enwiki: If True, only include orgs with English Wikipedia articles
|
|
962
|
+
skip_ids: Optional set of source_ids (Q codes) to skip. Checked early before
|
|
963
|
+
full processing to avoid unnecessary QID resolution.
|
|
964
|
+
start_index: Entity index to start from (for resume support). Entities
|
|
965
|
+
before this index are skipped but labels are still cached.
|
|
966
|
+
progress_callback: Optional callback(entity_index, entity_id, records_yielded)
|
|
967
|
+
called for each yielded record. Useful for saving progress.
|
|
968
|
+
|
|
969
|
+
Yields:
|
|
970
|
+
CompanyRecord for each qualifying organization
|
|
971
|
+
"""
|
|
972
|
+
path = dump_path or self._dump_path
|
|
973
|
+
count = 0
|
|
974
|
+
skipped_existing = 0
|
|
975
|
+
skipped_no_type = 0
|
|
976
|
+
skipped_no_enwiki = 0
|
|
977
|
+
skipped_no_label = 0
|
|
978
|
+
current_entity_index = start_index
|
|
979
|
+
|
|
980
|
+
logger.info("Starting organization import from Wikidata dump...")
|
|
981
|
+
if start_index > 0:
|
|
982
|
+
logger.info(f"Resuming from entity index {start_index:,}")
|
|
983
|
+
if not require_enwiki:
|
|
984
|
+
logger.info("Importing ALL organizations (no enwiki filter)")
|
|
985
|
+
if skip_ids:
|
|
986
|
+
logger.info(f"Skipping {len(skip_ids):,} existing Q codes")
|
|
987
|
+
|
|
988
|
+
def track_entity(entity_index: int, entity_id: str) -> None:
|
|
989
|
+
nonlocal current_entity_index
|
|
990
|
+
current_entity_index = entity_index
|
|
991
|
+
|
|
992
|
+
for entity in self.iter_entities(path, start_index=start_index, progress_callback=track_entity):
|
|
993
|
+
if limit and count >= limit:
|
|
994
|
+
break
|
|
995
|
+
|
|
996
|
+
# Check skip_ids early, before full processing (avoids QID resolution)
|
|
997
|
+
entity_id = entity.get("id", "")
|
|
998
|
+
if skip_ids and entity_id in skip_ids:
|
|
999
|
+
skipped_existing += 1
|
|
1000
|
+
continue
|
|
1001
|
+
|
|
1002
|
+
record = self._process_org_entity(entity, require_enwiki=require_enwiki)
|
|
1003
|
+
if record:
|
|
1004
|
+
count += 1
|
|
1005
|
+
if count % 10_000 == 0:
|
|
1006
|
+
logger.info(f"Yielded {count:,} organization records (skipped {skipped_existing:,} existing)...")
|
|
1007
|
+
|
|
1008
|
+
# Call progress callback with current position
|
|
1009
|
+
if progress_callback:
|
|
1010
|
+
progress_callback(current_entity_index, entity_id, count)
|
|
1011
|
+
|
|
1012
|
+
yield record
|
|
1013
|
+
elif entity.get("type") == "item":
|
|
1014
|
+
# Track skip reasons for debugging
|
|
1015
|
+
if self._get_org_type(entity) is None:
|
|
1016
|
+
skipped_no_type += 1
|
|
1017
|
+
elif require_enwiki and "enwiki" not in entity.get("sitelinks", {}):
|
|
1018
|
+
skipped_no_enwiki += 1
|
|
1019
|
+
else:
|
|
1020
|
+
skipped_no_label += 1
|
|
1021
|
+
|
|
1022
|
+
# Log skip stats periodically
|
|
1023
|
+
total_skipped = skipped_no_type + skipped_no_enwiki + skipped_no_label
|
|
1024
|
+
if total_skipped > 0 and total_skipped % 1_000_000 == 0:
|
|
1025
|
+
logger.debug(
|
|
1026
|
+
f"Skip stats: no_matching_type={skipped_no_type:,}, "
|
|
1027
|
+
f"no_enwiki={skipped_no_enwiki:,}, no_label={skipped_no_label:,}"
|
|
1028
|
+
)
|
|
1029
|
+
|
|
1030
|
+
logger.info(f"Organization import complete: {count:,} records (skipped {skipped_existing:,} existing)")
|
|
1031
|
+
logger.info(
|
|
1032
|
+
f"Skipped: no_matching_type={skipped_no_type:,}, "
|
|
1033
|
+
f"no_enwiki={skipped_no_enwiki:,}, no_label={skipped_no_label:,}"
|
|
1034
|
+
)
|
|
1035
|
+
|
|
1036
|
+
def import_all(
|
|
1037
|
+
self,
|
|
1038
|
+
dump_path: Optional[Path] = None,
|
|
1039
|
+
people_limit: Optional[int] = None,
|
|
1040
|
+
orgs_limit: Optional[int] = None,
|
|
1041
|
+
import_people: bool = True,
|
|
1042
|
+
import_orgs: bool = True,
|
|
1043
|
+
require_enwiki: bool = False,
|
|
1044
|
+
skip_people_ids: Optional[set[str]] = None,
|
|
1045
|
+
skip_org_ids: Optional[set[str]] = None,
|
|
1046
|
+
start_index: int = 0,
|
|
1047
|
+
progress_callback: Optional[Callable[[int, str, int, int], None]] = None,
|
|
1048
|
+
) -> Iterator[tuple[str, ImportRecord]]:
|
|
1049
|
+
"""
|
|
1050
|
+
Import both people and organizations in a single pass through the dump.
|
|
1051
|
+
|
|
1052
|
+
This is more efficient than calling import_people() and import_organizations()
|
|
1053
|
+
separately, as it only reads the ~100GB dump file once.
|
|
1054
|
+
|
|
1055
|
+
        Args:
            dump_path: Path to dump file (uses self._dump_path if not provided)
            people_limit: Optional maximum number of people records
            orgs_limit: Optional maximum number of org records
            import_people: Whether to import people (default: True)
            import_orgs: Whether to import organizations (default: True)
            require_enwiki: If True, only include entities with English Wikipedia articles
            skip_people_ids: Optional set of people source_ids (Q codes) to skip
            skip_org_ids: Optional set of org source_ids (Q codes) to skip
            start_index: Entity index to start from (for resume support)
            progress_callback: Optional callback(entity_index, entity_id, people_count, orgs_count)
                called periodically. Useful for saving progress.

        Yields:
            Tuples of (record_type, record) where record_type is "person" or "org"
        """
        path = dump_path or self._dump_path
        people_count = 0
        orgs_count = 0
        people_skipped = 0
        orgs_skipped = 0
        current_entity_index = start_index

        logger.info("Starting combined import from Wikidata dump...")
        if start_index > 0:
            logger.info(f"Resuming from entity index {start_index:,}")
        if import_people:
            logger.info(f"Importing people (limit: {people_limit or 'none'})")
            if skip_people_ids:
                logger.info(f"  Skipping {len(skip_people_ids):,} existing people Q codes")
        if import_orgs:
            logger.info(f"Importing organizations (limit: {orgs_limit or 'none'})")
            if skip_org_ids:
                logger.info(f"  Skipping {len(skip_org_ids):,} existing org Q codes")

        # Check if we've hit both limits
        def limits_reached() -> bool:
            people_done = not import_people or (people_limit and people_count >= people_limit)
            orgs_done = not import_orgs or (orgs_limit and orgs_count >= orgs_limit)
            return bool(people_done and orgs_done)

        def track_entity(entity_index: int, entity_id: str) -> None:
            nonlocal current_entity_index
            current_entity_index = entity_index

        for entity in self.iter_entities(path, start_index=start_index, progress_callback=track_entity):
            if limits_reached():
                break

            entity_id = entity.get("id", "")

            # Try to process as person first (if importing people and not at limit)
            if import_people and (not people_limit or people_count < people_limit):
                # Check skip_ids early
                if skip_people_ids and entity_id in skip_people_ids:
                    people_skipped += 1
                else:
                    person_record = self._process_person_entity(entity, require_enwiki=require_enwiki)
                    if person_record:
                        people_count += 1
                        if people_count % 10_000 == 0:
                            logger.info(
                                f"Progress: {people_count:,} people, {orgs_count:,} orgs "
                                f"(entity {current_entity_index:,})"
                            )
                        if progress_callback:
                            progress_callback(current_entity_index, entity_id, people_count, orgs_count)
                        yield ("person", person_record)
                        continue  # Entity was a person, don't check for org

            # Try to process as organization (if importing orgs and not at limit)
            if import_orgs and (not orgs_limit or orgs_count < orgs_limit):
                # Check skip_ids early
                if skip_org_ids and entity_id in skip_org_ids:
                    orgs_skipped += 1
                else:
                    org_record = self._process_org_entity(entity, require_enwiki=require_enwiki)
                    if org_record:
                        orgs_count += 1
                        if orgs_count % 10_000 == 0:
                            logger.info(
                                f"Progress: {people_count:,} people, {orgs_count:,} orgs "
                                f"(entity {current_entity_index:,})"
                            )
                        if progress_callback:
                            progress_callback(current_entity_index, entity_id, people_count, orgs_count)
                        yield ("org", org_record)

        logger.info(
            f"Combined import complete: {people_count:,} people, {orgs_count:,} orgs "
            f"(skipped {people_skipped:,} people, {orgs_skipped:,} orgs)"
        )

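For orientation, this is roughly how the combined generator above can be driven. The importer class name and the generator's own name are defined earlier in this file, outside this hunk, so WikidataDumpImporter and iter_people_and_orgs below are placeholder names for them; the ("person"/"org", record) tuple protocol and the resume/progress arguments are taken from the docstring above. A minimal sketch, not the package's documented API:

# Illustrative only: WikidataDumpImporter and iter_people_and_orgs are placeholder
# names for the class and generator defined earlier in this module.
importer = WikidataDumpImporter("latest-all.json.bz2")

people, orgs = [], []

def save_progress(entity_index, entity_id, people_count, orgs_count):
    # Persist entity_index somewhere so a later run can resume with start_index=...
    print(f"at entity {entity_index} ({entity_id}): {people_count} people, {orgs_count} orgs")

for record_type, record in importer.iter_people_and_orgs(
    people_limit=1000,
    orgs_limit=1000,
    require_enwiki=True,
    progress_callback=save_progress,
):
    (people if record_type == "person" else orgs).append(record)
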
    def _process_person_entity(
        self,
        entity: dict,
        require_enwiki: bool = False,
    ) -> Optional[PersonRecord]:
        """
        Process a single entity, return PersonRecord if it's a human.

        Args:
            entity: Parsed Wikidata entity dictionary
            require_enwiki: If True, only include people with English Wikipedia articles

        Returns:
            PersonRecord if entity qualifies, None otherwise
        """
        # Must be an item (not property)
        if entity.get("type") != "item":
            return None

        # Must be human (P31 contains Q5)
        if not self._is_human(entity):
            return None

        # Optionally require English Wikipedia article
        if require_enwiki:
            sitelinks = entity.get("sitelinks", {})
            if "enwiki" not in sitelinks:
                return None

        # Extract person data
        return self._extract_person_data(entity)

    def _process_org_entity(
        self,
        entity: dict,
        require_enwiki: bool = False,
    ) -> Optional[CompanyRecord]:
        """
        Process a single entity, return CompanyRecord if it's an organization.

        Args:
            entity: Parsed Wikidata entity dictionary
            require_enwiki: If True, only include orgs with English Wikipedia articles

        Returns:
            CompanyRecord if entity qualifies, None otherwise
        """
        # Must be an item (not property)
        if entity.get("type") != "item":
            return None

        # Get organization type from P31
        entity_type = self._get_org_type(entity)
        if entity_type is None:
            return None

        # Optionally require English Wikipedia article
        if require_enwiki:
            sitelinks = entity.get("sitelinks", {})
            if "enwiki" not in sitelinks:
                return None

        # Extract organization data
        return self._extract_org_data(entity, entity_type)

    def _is_human(self, entity: dict) -> bool:
        """
        Check if entity has P31 (instance of) = Q5 (human).

        Args:
            entity: Parsed Wikidata entity dictionary

        Returns:
            True if entity is a human
        """
        claims = entity.get("claims", {})
        for claim in claims.get("P31", []):
            mainsnak = claim.get("mainsnak", {})
            datavalue = mainsnak.get("datavalue", {})
            value = datavalue.get("value", {})
            if isinstance(value, dict) and value.get("id") == "Q5":
                return True
        return False

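The P31 check above (and _get_org_type below) walks the nested claim layout used by Wikidata's JSON dump. A trimmed-down entity that _is_human accepts looks like this, keeping only the keys the code actually reads:

entity = {
    "type": "item",
    "id": "Q42",
    "labels": {"en": {"value": "Douglas Adams"}},
    "claims": {
        "P31": [  # instance of
            {
                "mainsnak": {
                    "datavalue": {
                        "value": {"entity-type": "item", "id": "Q5"}  # Q5 = human
                    }
                }
            }
        ]
    },
    "sitelinks": {"enwiki": {"title": "Douglas Adams"}},
}
# _is_human(entity) walks claims -> "P31" -> mainsnak -> datavalue -> value -> id == "Q5"
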
    def _get_org_type(self, entity: dict) -> Optional[EntityType]:
        """
        Check if entity has P31 (instance of) matching an organization type.

        Args:
            entity: Parsed Wikidata entity dictionary

        Returns:
            EntityType if entity is an organization, None otherwise
        """
        claims = entity.get("claims", {})
        for claim in claims.get("P31", []):
            mainsnak = claim.get("mainsnak", {})
            datavalue = mainsnak.get("datavalue", {})
            value = datavalue.get("value", {})
            if isinstance(value, dict):
                qid = value.get("id", "")
                if qid in ORG_TYPE_TO_ENTITY_TYPE:
                    return ORG_TYPE_TO_ENTITY_TYPE[qid]
        return None

    def _get_claim_values(self, entity: dict, prop: str) -> list[str]:
        """
        Get all QID values for a property (e.g., P39, P106).

        Args:
            entity: Parsed Wikidata entity dictionary
            prop: Property ID (e.g., "P39", "P106")

        Returns:
            List of QID strings
        """
        claims = entity.get("claims", {})
        values = []
        for claim in claims.get(prop, []):
            mainsnak = claim.get("mainsnak", {})
            datavalue = mainsnak.get("datavalue", {})
            value = datavalue.get("value", {})
            if isinstance(value, dict):
                qid = value.get("id")
                if qid:
                    values.append(qid)
        return values

    def _get_qid_qualifier(self, qualifiers: dict, prop: str) -> Optional[str]:
        """Extract first QID from a qualifier property."""
        for qual in qualifiers.get(prop, []):
            qual_datavalue = qual.get("datavalue", {})
            qual_value = qual_datavalue.get("value", {})
            if isinstance(qual_value, dict):
                return qual_value.get("id")
        return None

    def _get_time_qualifier(self, qualifiers: dict, prop: str) -> Optional[str]:
        """Extract first time value from a qualifier property."""
        for qual in qualifiers.get(prop, []):
            qual_datavalue = qual.get("datavalue", {})
            qual_value = qual_datavalue.get("value", {})
            if isinstance(qual_value, dict):
                time_str = qual_value.get("time", "")
                return self._parse_time_value(time_str)
        return None

    def _get_positions_with_org(self, claims: dict) -> list[dict]:
        """
        Extract P39 positions with qualifiers for org, dates, and parliamentary context.

        Qualifiers extracted per WikiProject Parliaments guidelines:
        - P580 (start time) - when the position started
        - P582 (end time) - when the position ended
        - P108 (employer) - organization they work for
        - P642 (of) - the organization (legacy/fallback)
        - P768 (electoral district) - constituency for MPs
        - P2937 (parliamentary term) - which term they served in
        - P4100 (parliamentary group) - political party/faction
        - P1001 (applies to jurisdiction) - jurisdiction they represent
        - P2715 (elected in) - which election elected them

        Args:
            claims: Claims dictionary from entity

        Returns:
            List of position dictionaries with position metadata
        """
        positions = []
        for claim in claims.get("P39", []):
            mainsnak = claim.get("mainsnak", {})
            datavalue = mainsnak.get("datavalue", {})
            pos_value = datavalue.get("value", {})
            pos_qid = pos_value.get("id") if isinstance(pos_value, dict) else None
            if not pos_qid:
                continue

            qualifiers = claim.get("qualifiers", {})

            # Extract organization from multiple possible qualifiers
            # Priority: P108 (employer) > P642 (of) > P1001 (jurisdiction)
            org_qid = (
                self._get_qid_qualifier(qualifiers, "P108") or   # employer
                self._get_qid_qualifier(qualifiers, "P642") or   # of (legacy)
                self._get_qid_qualifier(qualifiers, "P1001")     # applies to jurisdiction
            )

            # Extract dates
            start_date = self._get_time_qualifier(qualifiers, "P580")
            end_date = self._get_time_qualifier(qualifiers, "P582")

            # Extract parliamentary/political qualifiers
            electoral_district = self._get_qid_qualifier(qualifiers, "P768")
            parliamentary_term = self._get_qid_qualifier(qualifiers, "P2937")
            parliamentary_group = self._get_qid_qualifier(qualifiers, "P4100")
            elected_in = self._get_qid_qualifier(qualifiers, "P2715")

            positions.append({
                "position_qid": pos_qid,
                "org_qid": org_qid,
                "start_date": start_date,
                "end_date": end_date,
                # Parliamentary context
                "electoral_district": electoral_district,
                "parliamentary_term": parliamentary_term,
                "parliamentary_group": parliamentary_group,
                "elected_in": elected_in,
            })
        return positions

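As a concrete illustration of the qualifier handling above, a single P39 claim for a parliamentarian might carry the dates and constituency as qualifiers and be reduced to one flat dictionary. The QIDs below are arbitrary placeholders, not assertions about real Wikidata items:

claim = {
    "mainsnak": {"datavalue": {"value": {"id": "Q16707842"}}},  # the position held
    "qualifiers": {
        "P580": [{"datavalue": {"value": {"time": "+2019-12-17T00:00:00Z"}}}],  # start time
        "P582": [{"datavalue": {"value": {"time": "+2024-05-30T00:00:00Z"}}}],  # end time
        "P768": [{"datavalue": {"value": {"id": "Q750994"}}}],                  # electoral district
        "P4100": [{"datavalue": {"value": {"id": "Q9626"}}}],                   # parliamentary group
    },
}

# _get_positions_with_org({"P39": [claim]}) would produce:
# [{
#     "position_qid": "Q16707842",
#     "org_qid": None,                  # no P108/P642/P1001 qualifier present
#     "start_date": "2019-12-17",
#     "end_date": "2024-05-30",
#     "electoral_district": "Q750994",
#     "parliamentary_term": None,
#     "parliamentary_group": "Q9626",
#     "elected_in": None,
# }]
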
    def _parse_time_value(self, time_str: str) -> Optional[str]:
        """
        Parse Wikidata time value to ISO date string.

        Args:
            time_str: Wikidata time format like "+2020-01-15T00:00:00Z"

        Returns:
            ISO date string (YYYY-MM-DD) or None
        """
        if not time_str:
            return None
        # Remove leading + and extract date part
        time_str = time_str.lstrip("+")
        if "T" in time_str:
            return time_str.split("T")[0]
        return None

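Note that the Wikidata precision field is not consulted here; whatever date part precedes the "T" is kept as-is, so year-precision values retain their zeroed month and day. A few examples of what the helper returns:

# _parse_time_value("+2020-01-15T00:00:00Z") -> "2020-01-15"
# _parse_time_value("+1952-00-00T00:00:00Z") -> "1952-00-00"  (year precision is not normalized)
# _parse_time_value("")                      -> None
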
    def _classify_person_type(
        self,
        positions: list[dict],
        occupations: list[str],
    ) -> PersonType:
        """
        Determine PersonType from P39 positions and P106 occupations.

        Priority order:
        1. Check positions (more specific)
        2. Check occupations
        3. Default to UNKNOWN

        Args:
            positions: List of position dictionaries from _get_positions_with_org
            occupations: List of occupation QIDs from P106

        Returns:
            Classified PersonType
        """
        # Check positions first (more specific)
        for pos in positions:
            pos_qid = pos.get("position_qid", "")
            if pos_qid in EXECUTIVE_POSITION_QIDS:
                return PersonType.EXECUTIVE
            if pos_qid in POLITICIAN_POSITION_QIDS:
                return PersonType.POLITICIAN

        # Then check occupations
        for occ in occupations:
            if occ in OCCUPATION_TO_TYPE:
                return OCCUPATION_TO_TYPE[occ]

        # Default
        return PersonType.UNKNOWN

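A small sketch of how the lookup tables drive this classification. The real contents of EXECUTIVE_POSITION_QIDS, POLITICIAN_POSITION_QIDS and OCCUPATION_TO_TYPE are defined at module level outside this hunk, so the memberships below are assumptions made purely for illustration:

# Illustration only -- assume EXECUTIVE_POSITION_QIDS contains "Q484876"
# (chief executive officer) and POLITICIAN_POSITION_QIDS contains "Q486839"
# (member of parliament).

positions = [{"position_qid": "Q486839"}]
occupations = ["Q82955"]  # politician (occupation); only consulted if no position matches

# importer._classify_person_type(positions, occupations) -> PersonType.POLITICIAN
# importer._classify_person_type([], [])                 -> PersonType.UNKNOWN
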
    def _get_org_or_context(self, pos: dict) -> str:
        """Get org QID from position, falling back to electoral district or parliamentary group."""
        return (
            pos.get("org_qid") or
            pos.get("electoral_district") or
            pos.get("parliamentary_group") or
            ""
        )

    def _get_best_role_org(
        self,
        positions: list[dict],
    ) -> tuple[str, str, str, Optional[str], Optional[str], dict]:
        """
        Select best position for role/org display.

        Priority:
        1. Positions with org/context and dates
        2. Positions with org/context
        3. Positions with dates
        4. Any position

        Args:
            positions: List of position dictionaries

        Returns:
            Tuple of (role_qid, org_label, org_qid, start_date, end_date, extra_context)
            Note: In dump mode, we return QIDs since we don't have labels.
            extra_context contains parliamentary metadata
        """
        def has_context(pos: dict) -> bool:
            return bool(
                pos.get("org_qid") or
                pos.get("electoral_district") or
                pos.get("parliamentary_group")
            )

        def get_extra_context(pos: dict) -> dict:
            return {
                k: v for k, v in {
                    "electoral_district": pos.get("electoral_district"),
                    "parliamentary_term": pos.get("parliamentary_term"),
                    "parliamentary_group": pos.get("parliamentary_group"),
                    "elected_in": pos.get("elected_in"),
                }.items() if v
            }

        # Priority 1: Position with org/context and dates
        for pos in positions:
            if has_context(pos) and (pos.get("start_date") or pos.get("end_date")):
                return (
                    pos["position_qid"],
                    "",
                    self._get_org_or_context(pos),
                    pos.get("start_date"),
                    pos.get("end_date"),
                    get_extra_context(pos),
                )

        # Priority 2: Position with org/context
        for pos in positions:
            if has_context(pos):
                return (
                    pos["position_qid"],
                    "",
                    self._get_org_or_context(pos),
                    pos.get("start_date"),
                    pos.get("end_date"),
                    get_extra_context(pos),
                )

        # Priority 3: Position with dates
        for pos in positions:
            if pos.get("start_date") or pos.get("end_date"):
                return (
                    pos["position_qid"],
                    "",
                    self._get_org_or_context(pos),
                    pos.get("start_date"),
                    pos.get("end_date"),
                    get_extra_context(pos),
                )

        # Priority 4: Any position
        if positions:
            pos = positions[0]
            return (
                pos["position_qid"],
                "",
                self._get_org_or_context(pos),
                pos.get("start_date"),
                pos.get("end_date"),
                get_extra_context(pos),
            )

        return "", "", "", None, None, {}

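For example, given one position with dates but no organization and another with both an employer and a start date, the priority-1 rule above selects the second one; the QIDs are illustrative placeholders:

positions = [
    {"position_qid": "Q30185", "org_qid": None, "start_date": "2010-01-01", "end_date": None,
     "electoral_district": None, "parliamentary_term": None, "parliamentary_group": None, "elected_in": None},
    {"position_qid": "Q484876", "org_qid": "Q95", "start_date": "2015-06-01", "end_date": None,
     "electoral_district": None, "parliamentary_term": None, "parliamentary_group": None, "elected_in": None},
]

# importer._get_best_role_org(positions) returns the second entry, the first one
# with both org/context and a date:
#   ("Q484876", "", "Q95", "2015-06-01", None, {})
# The empty string is the org_label slot, which stays blank in dump mode.
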
    def _extract_person_data(self, entity: dict) -> Optional[PersonRecord]:
        """
        Extract PersonRecord from entity dict.

        Derives type/role/org from claims.

        Args:
            entity: Parsed Wikidata entity dictionary

        Returns:
            PersonRecord or None if essential data is missing
        """
        qid = entity.get("id", "")
        labels = entity.get("labels", {})
        # Try English label first, fall back to any available label
        label = labels.get("en", {}).get("value", "")
        if not label:
            # Try to get any label
            for lang_data in labels.values():
                if isinstance(lang_data, dict) and lang_data.get("value"):
                    label = lang_data["value"]
                    break

        if not label or not qid:
            return None

        claims = entity.get("claims", {})

        # Get positions (P39) with qualifiers for org
        positions = self._get_positions_with_org(claims)
        # Get occupations (P106)
        occupations = self._get_claim_values(entity, "P106")

        # Classify person type from positions + occupations
        person_type = self._classify_person_type(positions, occupations)

        # Get best role/org/dates from positions
        role_qid, _, org_qid, start_date, end_date, extra_context = self._get_best_role_org(positions)

        # Get country (P27 - country of citizenship)
        countries = self._get_claim_values(entity, "P27")
        country_qid = countries[0] if countries else ""

        # Resolve QIDs to labels using the cache (or track for later resolution)
        country_label = self._resolve_qid(country_qid) if country_qid else ""
        role_label = self._resolve_qid(role_qid) if role_qid else ""
        org_label = self._resolve_qid(org_qid) if org_qid else ""

        # Get birth and death dates (P569, P570)
        birth_date = self._get_time_claim(claims, "P569")
        death_date = self._get_time_claim(claims, "P570")

        # Get description
        descriptions = entity.get("descriptions", {})
        description = descriptions.get("en", {}).get("value", "")

        # Track discovered organization
        if org_qid:
            self._discovered_orgs[org_qid] = org_label

        # Build record with all position metadata
        record_data = {
            "wikidata_id": qid,
            "label": label,
            "description": description,
            "positions": [p["position_qid"] for p in positions],
            "occupations": occupations,
            "org_qid": org_qid,
            "country_qid": country_qid,
            "role_qid": role_qid,
            "birth_date": birth_date,
            "death_date": death_date,
        }
        # Add parliamentary context if present
        if extra_context:
            record_data.update(extra_context)

        return PersonRecord(
            name=label,
            source="wikidata",
            source_id=qid,
            country=country_label,
            person_type=person_type,
            known_for_role=role_label,
            known_for_org=org_label,
            from_date=start_date,
            to_date=end_date,
            birth_date=birth_date,
            death_date=death_date,
            record=record_data,
        )

    def _extract_org_data(
        self,
        entity: dict,
        entity_type: EntityType,
    ) -> Optional[CompanyRecord]:
        """
        Extract CompanyRecord from entity dict.

        Args:
            entity: Parsed Wikidata entity dictionary
            entity_type: Determined EntityType

        Returns:
            CompanyRecord or None if essential data is missing
        """
        qid = entity.get("id", "")
        labels = entity.get("labels", {})
        label = labels.get("en", {}).get("value", "")

        if not label or not qid:
            return None

        claims = entity.get("claims", {})

        # Get country (P17 - country)
        countries = self._get_claim_values(entity, "P17")
        country_qid = countries[0] if countries else ""

        # Resolve country QID to label
        country_label = self._resolve_qid(country_qid) if country_qid else ""

        # Get LEI (P1278)
        lei = self._get_string_claim(claims, "P1278")

        # Get ticker (P249)
        ticker = self._get_string_claim(claims, "P249")

        # Get description
        descriptions = entity.get("descriptions", {})
        description = descriptions.get("en", {}).get("value", "")

        # Get inception date (P571)
        inception = self._get_time_claim(claims, "P571")

        # Get dissolution date (P576)
        dissolution = self._get_time_claim(claims, "P576")

        return CompanyRecord(
            name=label,
            source="wikipedia",  # Use "wikipedia" per existing convention
            source_id=qid,
            region=country_label,
            entity_type=entity_type,
            from_date=inception,
            to_date=dissolution,
            record={
                "wikidata_id": qid,
                "label": label,
                "description": description,
                "lei": lei,
                "ticker": ticker,
                "country_qid": country_qid,
            },
        )

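The property-to-field mapping used above, shown on a minimal entity. The claim values are illustrative (they echo publicly known facts about Apple Inc., but nothing here depends on them being exact), and the resolved region label assumes Q30 is in the label cache or reachable via SPARQL:

org_entity = {
    "type": "item",
    "id": "Q312",
    "labels": {"en": {"value": "Apple Inc."}},
    "descriptions": {"en": {"value": "American technology company"}},
    "claims": {
        "P17":   [{"mainsnak": {"datavalue": {"value": {"id": "Q30"}}}}],           # country -> region (label-resolved)
        "P1278": [{"mainsnak": {"datavalue": {"value": "HWUPKR0MPOU8FGXBT394"}}}],  # LEI -> record["lei"]
        "P249":  [{"mainsnak": {"datavalue": {"value": "AAPL"}}}],                  # ticker -> record["ticker"]
        "P571":  [{"mainsnak": {"datavalue": {"value": {"time": "+1976-04-01T00:00:00Z"}}}}],  # inception -> from_date
    },
}

# _extract_org_data(org_entity, EntityType.BUSINESS) -> CompanyRecord(
#     name="Apple Inc.", source="wikipedia", source_id="Q312",
#     region=<label resolved for Q30, or "Q30" if unresolved>,
#     entity_type=EntityType.BUSINESS, from_date="1976-04-01", to_date=None,
#     record={"wikidata_id": "Q312", "lei": "HWUPKR0MPOU8FGXBT394", "ticker": "AAPL", ...})
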
    def _get_string_claim(self, claims: dict, prop: str) -> str:
        """
        Get first string value for a property.

        Args:
            claims: Claims dictionary
            prop: Property ID

        Returns:
            String value or empty string
        """
        for claim in claims.get(prop, []):
            mainsnak = claim.get("mainsnak", {})
            datavalue = mainsnak.get("datavalue", {})
            value = datavalue.get("value")
            if isinstance(value, str):
                return value
        return ""

    def _get_time_claim(self, claims: dict, prop: str) -> Optional[str]:
        """
        Get first time value for a property as ISO date string.

        Args:
            claims: Claims dictionary
            prop: Property ID

        Returns:
            ISO date string (YYYY-MM-DD) or None
        """
        for claim in claims.get(prop, []):
            mainsnak = claim.get("mainsnak", {})
            datavalue = mainsnak.get("datavalue", {})
            value = datavalue.get("value", {})
            if isinstance(value, dict):
                time_str = value.get("time", "")
                # Format: +2020-01-15T00:00:00Z
                if time_str:
                    # Remove leading + and extract date part
                    time_str = time_str.lstrip("+")
                    if "T" in time_str:
                        return time_str.split("T")[0]
        return None

    def get_discovered_organizations(self) -> list[CompanyRecord]:
        """
        Get organizations discovered during the people import.

        These are organizations associated with people (from qualifiers on their
        P39 position claims) that can be inserted into the organizations database
        if not already present.

        Note: In dump mode, we only have QIDs, not labels.

        Returns:
            List of CompanyRecord objects for discovered organizations
        """
        records = []
        for org_qid in self._discovered_orgs:
            record = CompanyRecord(
                name=org_qid,  # Only have QID, not label
                source="wikipedia",
                source_id=org_qid,
                region="",
                entity_type=EntityType.BUSINESS,  # Default
                record={
                    "wikidata_id": org_qid,
                    "discovered_from": "people_import",
                    "needs_label_resolution": True,
                },
            )
            records.append(record)
        logger.info(f"Discovered {len(records)} organizations from people import")
        return records

    def clear_discovered_organizations(self) -> None:
        """Clear the discovered organizations cache."""
        self._discovered_orgs.clear()

    def get_unresolved_qids(self) -> set[str]:
        """Get QIDs that need label resolution."""
        return self._unresolved_qids.copy()

    def get_label_cache(self) -> dict[str, str]:
        """Get the label cache built during import."""
        return self._label_cache.copy()

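A sketch of the intended round trip for discovered organizations after a people-focused run. The persistence call is a placeholder, since how CompanyRecord objects are stored lives in other modules of this package:

# After iterating the people records:
stub_orgs = importer.get_discovered_organizations()

for org in stub_orgs:
    # org.name is still a bare QID and org.record["needs_label_resolution"] is True,
    # so persist it however the surrounding pipeline stores CompanyRecord objects.
    save_company_record(org)  # placeholder -- not an API defined in this file

importer.clear_discovered_organizations()
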
    def set_label_cache(self, labels: dict[str, str]) -> None:
        """
        Set initial label cache from existing data (e.g., from database).

        Args:
            labels: Mapping of QID -> label to seed the cache
        """
        self._label_cache.update(labels)
        logger.info(f"Seeded label cache with {len(labels)} existing labels")

    def get_new_labels_since(self, known_qids: set[str]) -> dict[str, str]:
        """
        Get labels that were added to cache since a known set.

        Args:
            known_qids: Set of QIDs that were already known

        Returns:
            Dict of new QID -> label mappings
        """
        return {qid: label for qid, label in self._label_cache.items() if qid not in known_qids}

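These two helpers are meant to be paired around an import so that labels already known to the database are not re-resolved, and newly seen labels can be written back afterwards. A minimal sketch, with the database side left as placeholders:

known_labels = load_labels_from_db()   # placeholder: dict of QID -> label already stored
importer.set_label_cache(known_labels)

# ... run the dump import ...

new_labels = importer.get_new_labels_since(set(known_labels))
save_labels_to_db(new_labels)          # placeholder: persist only the newly cached labels
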
    def _cache_entity_label(self, entity: dict) -> None:
        """
        Cache the English label for an entity during dump processing.

        This builds up a lookup table as we iterate through the dump,
        so we can resolve QID references (countries, roles) to labels.
        """
        qid = entity.get("id", "")
        if not qid:
            return

        labels = entity.get("labels", {})
        en_label = labels.get("en", {}).get("value", "")
        if en_label:
            self._label_cache[qid] = en_label

    def _resolve_qid(self, qid: str) -> str:
        """
        Resolve a QID to a label, using cache or SPARQL lookup.

        Returns the label if found/resolved, otherwise returns the QID.
        """
        if not qid or not qid.startswith("Q"):
            return qid

        if qid in self._label_cache:
            label = self._label_cache[qid]
            logger.debug(f"Resolved QID (cache): {qid} -> {label}")
            return label

        # Not in cache - resolve via SPARQL immediately
        label = self._resolve_single_qid_sparql(qid)
        if label:
            logger.info(f"Resolved QID (SPARQL): {qid} -> {label}")
            self._label_cache[qid] = label
            return label

        # Track unresolved
        if qid not in self._unresolved_qids:
            logger.debug(f"Unresolved QID: {qid}")
            self._unresolved_qids.add(qid)
        return qid

    def _resolve_single_qid_sparql(self, qid: str) -> Optional[str]:
        """
        Resolve a single QID to a label via SPARQL.

        Args:
            qid: Wikidata QID (e.g., 'Q30')

        Returns:
            Label string or None if not found
        """
        import json
        import urllib.parse
        import urllib.request

        query = f"""
        SELECT ?label WHERE {{
          wd:{qid} rdfs:label ?label FILTER(LANG(?label) = "en") .
        }}
        LIMIT 1
        """

        try:
            params = urllib.parse.urlencode({
                "query": query,
                "format": "json",
            })
            url = f"https://query.wikidata.org/sparql?{params}"

            req = urllib.request.Request(
                url,
                headers={
                    "Accept": "application/sparql-results+json",
                    "User-Agent": "corp-extractor/1.0 (QID resolver)",
                }
            )

            with urllib.request.urlopen(req, timeout=10) as response:
                data = json.loads(response.read().decode("utf-8"))

            bindings = data.get("results", {}).get("bindings", [])
            if bindings:
                return bindings[0].get("label", {}).get("value")

        except Exception as e:
            logger.debug(f"SPARQL lookup failed for {qid}: {e}")

        return None

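The parsing above assumes the standard SPARQL 1.1 JSON results layout: the code reads results -> bindings[0] -> label -> value. A successful single-QID lookup comes back roughly like this:

# Shape of the response body (abridged) for wd:Q30:
# {
#   "head": {"vars": ["label"]},
#   "results": {
#     "bindings": [
#       {"label": {"xml:lang": "en", "type": "literal", "value": "United States of America"}}
#     ]
#   }
# }
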
    def resolve_qids_via_sparql(
        self,
        qids: Optional[set[str]] = None,
        batch_size: int = 50,
        delay_seconds: float = 1.0,
    ) -> dict[str, str]:
        """
        Resolve QIDs to labels via Wikidata SPARQL queries.

        This is used after import to resolve any QIDs that weren't found
        in the dump (e.g., if import was limited or dump was incomplete).

        Args:
            qids: Set of QIDs to resolve (defaults to unresolved_qids)
            batch_size: Number of QIDs per SPARQL query (default 50)
            delay_seconds: Delay between queries to avoid rate limiting

        Returns:
            Dict mapping QID -> label for resolved QIDs
        """
        import json
        import time
        import urllib.parse
        import urllib.request

        if qids is None:
            qids = self._unresolved_qids

        if not qids:
            return {}

        resolved: dict[str, str] = {}
        qid_list = list(qids)

        logger.info(f"Resolving {len(qid_list)} QIDs via SPARQL...")

        for i in range(0, len(qid_list), batch_size):
            batch = qid_list[i:i + batch_size]

            # Build VALUES clause
            values = " ".join(f"wd:{qid}" for qid in batch)
            query = f"""
            SELECT ?item ?itemLabel WHERE {{
              VALUES ?item {{ {values} }}
              ?item rdfs:label ?itemLabel FILTER(LANG(?itemLabel) = "en") .
            }}
            """

            try:
                params = urllib.parse.urlencode({
                    "query": query,
                    "format": "json",
                })
                url = f"https://query.wikidata.org/sparql?{params}"

                req = urllib.request.Request(
                    url,
                    headers={
                        "Accept": "application/sparql-results+json",
                        "User-Agent": "corp-extractor/1.0 (QID resolver)",
                    }
                )

                with urllib.request.urlopen(req, timeout=60) as response:
                    data = json.loads(response.read().decode("utf-8"))

                for binding in data.get("results", {}).get("bindings", []):
                    item_uri = binding.get("item", {}).get("value", "")
                    label = binding.get("itemLabel", {}).get("value", "")
                    if item_uri and label:
                        qid = item_uri.split("/")[-1]
                        resolved[qid] = label
                        self._label_cache[qid] = label

                logger.debug(f"Resolved batch {i // batch_size + 1}: {len(batch)} QIDs")

            except Exception as e:
                logger.warning(f"SPARQL batch failed: {e}")

            if i + batch_size < len(qid_list):
                time.sleep(delay_seconds)

        # Update unresolved set
        self._unresolved_qids -= set(resolved.keys())

        logger.info(f"Resolved {len(resolved)} QIDs, {len(self._unresolved_qids)} remaining unresolved")
        return resolved
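
Taken together with _resolve_qid, the intended flow is: labels found while streaming the dump are served from the cache, any QID that misses is collected in _unresolved_qids, and one batched pass at the end fills in the stragglers. A short usage sketch:

# After the import loop has finished:
leftover = importer.get_unresolved_qids()
if leftover:
    resolved = importer.resolve_qids_via_sparql(batch_size=50, delay_seconds=1.0)
    # `resolved` maps QID -> English label and has already been merged into the
    # label cache, so importer.get_label_cache() now includes these entries too.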