corp-extractor 0.5.0__py3-none-any.whl → 0.9.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
- corp_extractor-0.9.3.dist-info/RECORD +79 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +2030 -24
- statement_extractor/data/statement_taxonomy.json +6949 -1159
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +428 -0
- statement_extractor/database/importers/__init__.py +32 -0
- statement_extractor/database/importers/companies_house.py +559 -0
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +561 -0
- statement_extractor/database/importers/sec_edgar.py +392 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +1120 -0
- statement_extractor/database/importers/wikidata_dump.py +1951 -0
- statement_extractor/database/importers/wikidata_people.py +1130 -0
- statement_extractor/database/models.py +254 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +3034 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +171 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +19 -3
- statement_extractor/models/canonical.py +44 -1
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/labels.py +47 -18
- statement_extractor/models/qualifiers.py +51 -3
- statement_extractor/models/statement.py +39 -15
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/config.py +6 -11
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +90 -121
- statement_extractor/pipeline/registry.py +52 -46
- statement_extractor/plugins/__init__.py +20 -8
- statement_extractor/plugins/base.py +348 -78
- statement_extractor/plugins/extractors/gliner2.py +38 -28
- statement_extractor/plugins/labelers/taxonomy.py +18 -5
- statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +11 -0
- statement_extractor/plugins/qualifiers/companies_house.py +14 -3
- statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
- statement_extractor/plugins/qualifiers/gleif.py +14 -3
- statement_extractor/plugins/qualifiers/person.py +588 -14
- statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/t5_gemma.py +176 -75
- statement_extractor/plugins/taxonomy/embedding.py +193 -46
- statement_extractor/plugins/taxonomy/mnli.py +16 -4
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.5.0.dist-info/RECORD +0 -55
- statement_extractor/plugins/canonicalizers/__init__.py +0 -17
- statement_extractor/plugins/canonicalizers/base.py +0 -9
- statement_extractor/plugins/canonicalizers/location.py +0 -219
- statement_extractor/plugins/canonicalizers/organization.py +0 -230
- statement_extractor/plugins/canonicalizers/person.py +0 -242
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
statement_extractor/database/importers/wikidata_people.py (new file)

@@ -0,0 +1,1130 @@
"""
Wikidata importer for the person database.

Imports notable people data from Wikidata using SPARQL queries
into the embedding database for person name matching.

Uses a two-phase approach for reliability:
1. Bulk fetch: Simple queries to get QID + name + country (fast, no timeouts)
2. Enrich: Targeted per-person queries for role/org/dates (resumable)

Notable people are those with English Wikipedia articles, ensuring
a basic level of notability.

Query categories (organized by PersonType):
- executives: Business executives (CEOs, CFOs, etc.)
- politicians: Politicians and diplomats
- athletes: Sports figures
- artists: Actors, musicians, directors
- academics: Professors and researchers
- scientists: Scientists and inventors
- journalists: Media personalities
- entrepreneurs: Founders and business owners

Uses the public Wikidata Query Service endpoint.
"""

import json
import logging
import time
import urllib.parse
import urllib.request
from typing import Any, Iterator, Optional

from ..models import CompanyRecord, EntityType, PersonRecord, PersonType

logger = logging.getLogger(__name__)

# Wikidata SPARQL endpoint
WIKIDATA_SPARQL_URL = "https://query.wikidata.org/sparql"

# =============================================================================
# BULK QUERIES - Simple, fast queries for initial import (no role/org/dates)
# Uses rdfs:label instead of SERVICE wikibase:label for better performance
# Each query targets a single role/occupation for speed
# =============================================================================

# Template for position-held queries (P39) - for executives, politicians
# Matches people who held a position that IS the role, or is an INSTANCE OF the role
# {role_qid} = single role QID, {seed} = unique seed, {limit} = batch limit
POSITION_QUERY_TEMPLATE = """
SELECT DISTINCT ?person ?personLabel ?countryLabel ?description WHERE {{
  ?person wdt:P31 wd:Q5 .
  ?person wdt:P39 ?position .
  {{ ?position wdt:P31 wd:{role_qid} . }} UNION {{ VALUES ?position {{ wd:{role_qid} }} }}
  ?person rdfs:label ?personLabel FILTER(LANG(?personLabel) = "en") .
  OPTIONAL {{ ?person wdt:P27 ?country . ?country rdfs:label ?countryLabel FILTER(LANG(?countryLabel) = "en") . }}
  OPTIONAL {{ ?person schema:description ?description FILTER(LANG(?description) = "en") }}
  ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
}}
ORDER BY MD5(CONCAT(STR(?person), "{seed}"))
LIMIT {limit}
"""

# Template for occupation queries (P106) - for athletes, artists, etc.
# {occupation_qid} = single occupation QID, {seed} = unique seed, {limit} = batch limit
OCCUPATION_QUERY_TEMPLATE = """
SELECT DISTINCT ?person ?personLabel ?countryLabel ?description WHERE {{
  ?person wdt:P31 wd:Q5 .
  ?person wdt:P106 wd:{occupation_qid} .
  ?person rdfs:label ?personLabel FILTER(LANG(?personLabel) = "en") .
  OPTIONAL {{ ?person wdt:P27 ?country . ?country rdfs:label ?countryLabel FILTER(LANG(?countryLabel) = "en") . }}
  OPTIONAL {{ ?person schema:description ?description FILTER(LANG(?description) = "en") }}
  ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
}}
ORDER BY MD5(CONCAT(STR(?person), "{seed}"))
LIMIT {limit}
"""

# Template for founder queries (P112) - for entrepreneurs
# {seed} = unique seed, {limit} = batch limit
FOUNDER_QUERY_TEMPLATE = """
SELECT DISTINCT ?person ?personLabel ?countryLabel ?description WHERE {{
  ?person wdt:P31 wd:Q5 .
  ?org wdt:P112 ?person .
  ?person rdfs:label ?personLabel FILTER(LANG(?personLabel) = "en") .
  OPTIONAL {{ ?person wdt:P27 ?country . ?country rdfs:label ?countryLabel FILTER(LANG(?countryLabel) = "en") . }}
  OPTIONAL {{ ?person schema:description ?description FILTER(LANG(?description) = "en") }}
  ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
}}
ORDER BY MD5(CONCAT(STR(?person), "{seed}"))
LIMIT {limit}
"""

# Role QIDs for executives (position held - P39)
EXECUTIVE_ROLES = [
    "Q484876",     # CEO
    "Q623279",     # CFO
    "Q1502675",    # COO
    "Q935019",     # CTO
    "Q1057716",    # CIO
    "Q2140589",    # CMO
    "Q1115042",    # chairperson
    "Q4720025",    # board of directors member
    "Q60432825",   # chief human resources officer
    "Q15967139",   # chief compliance officer
    "Q15729310",   # chief risk officer
    "Q47523568",   # chief legal officer
    "Q258557",     # board chair
    "Q114863313",  # chief sustainability officer
    "Q726114",     # company president
    "Q1372944",    # managing director
    "Q18918145",   # chief commercial officer
    "Q1057569",    # chief strategy officer
    "Q24058752",   # chief product officer
    "Q3578048",    # vice president
    "Q476675",     # business executive (generic)
    "Q5441744",    # finance director
    "Q4188234",    # general manager
    "Q38844673",   # chief data officer
    "Q97273203",   # chief digital officer
    "Q60715311",   # chief growth officer
    "Q3563879",    # treasurer
    "Q3505845",    # corporate secretary
]

# Role QIDs for politicians (position held - P39)
POLITICIAN_ROLES = [
    "Q30461",    # president
    "Q14212",    # prime minister
    "Q83307",    # minister
    "Q2285706",  # head of government
    "Q4175034",  # legislator
    "Q486839",   # member of parliament
    "Q193391",   # member of national legislature
    "Q212071",   # mayor
    "Q382617",   # governor
    "Q116",      # monarch
    "Q484529",   # member of congress
]

# Note: Politicians with generic position types (like "public office") may not be found
# because querying all public office holders times out. This includes some mayors
# whose positions are typed as "public office" rather than "mayor".

# Occupation QIDs for athletes (P106)
ATHLETE_OCCUPATIONS = [
    "Q2066131",   # athlete
    "Q937857",    # football player
    "Q3665646",   # basketball player
    "Q10871364",  # baseball player
    "Q19204627",  # ice hockey player
    "Q10843402",  # tennis player
    "Q13381376",  # golfer
    "Q11338576",  # boxer
    "Q10873124",  # swimmer
]

# Occupation QIDs for artists (P106)
ARTIST_OCCUPATIONS = [
    "Q33999",     # actor
    "Q177220",    # singer
    "Q639669",    # musician
    "Q2526255",   # film director
    "Q36180",     # writer
    "Q483501",    # artist
    "Q488205",    # singer-songwriter
    "Q753110",    # songwriter
    "Q2405480",   # voice actor
    "Q10800557",  # film actor
]

# Occupation QIDs for academics (P106)
ACADEMIC_OCCUPATIONS = [
    "Q121594",   # professor
    "Q3400985",  # academic
    "Q1622272",  # university professor
]

# Occupation QIDs for scientists (P106)
SCIENTIST_OCCUPATIONS = [
    "Q901",      # scientist
    "Q1650915",  # researcher
    "Q169470",   # physicist
    "Q593644",   # chemist
    "Q864503",   # biologist
    "Q11063",    # astronomer
]

# Occupation QIDs for journalists (P106)
JOURNALIST_OCCUPATIONS = [
    "Q1930187",   # journalist
    "Q13590141",  # news presenter
    "Q947873",    # television presenter
    "Q4263842",   # columnist
]

# Occupation QIDs for activists (P106)
ACTIVIST_OCCUPATIONS = [
    "Q15253558",  # activist
    "Q11631410",  # human rights activist
    "Q18939491",  # environmental activist
]

# Mapping query type to role/occupation lists and query template type
# Each entry can have multiple query groups to combine different approaches
QUERY_TYPE_CONFIG: dict[str, list[dict]] = {
    "executive": [
        {"template": "position", "items": EXECUTIVE_ROLES},
    ],
    "politician": [
        {"template": "position", "items": POLITICIAN_ROLES},
    ],
    "athlete": [
        {"template": "occupation", "items": ATHLETE_OCCUPATIONS},
    ],
    "artist": [
        {"template": "occupation", "items": ARTIST_OCCUPATIONS},
    ],
    "academic": [
        {"template": "occupation", "items": ACADEMIC_OCCUPATIONS},
    ],
    "scientist": [
        {"template": "occupation", "items": SCIENTIST_OCCUPATIONS},
    ],
    "journalist": [
        {"template": "occupation", "items": JOURNALIST_OCCUPATIONS},
    ],
    "activist": [
        {"template": "occupation", "items": ACTIVIST_OCCUPATIONS},
    ],
    "entrepreneur": [
        {"template": "founder", "items": []},  # No items, uses special template
    ],
}

# Mapping query type to PersonType
QUERY_TYPE_TO_PERSON_TYPE: dict[str, PersonType] = {
    "executive": PersonType.EXECUTIVE,
    "politician": PersonType.POLITICIAN,
    "athlete": PersonType.ATHLETE,
    "artist": PersonType.ARTIST,
    "academic": PersonType.ACADEMIC,
    "scientist": PersonType.SCIENTIST,
    "journalist": PersonType.JOURNALIST,
    "entrepreneur": PersonType.ENTREPRENEUR,
    "activist": PersonType.ACTIVIST,
}


class WikidataPeopleImporter:
    """
    Importer for Wikidata person data.

    Uses SPARQL queries against the public Wikidata Query Service
    to fetch notable people including executives, politicians, athletes, etc.

    Query types:
    - executive: Business executives (CEOs, CFOs, etc.)
    - politician: Politicians and diplomats
    - athlete: Sports figures
    - artist: Actors, musicians, directors, writers
    - academic: Professors and researchers
    - scientist: Scientists and inventors
    - journalist: Media personalities
    - entrepreneur: Company founders
    - activist: Activists and advocates
    """

    def __init__(
        self,
        batch_size: int = 5000,
        delay_seconds: float = 2.0,
        timeout: int = 120,
        max_retries: int = 3,
        min_batch_size: int = 50,
    ):
        """
        Initialize the Wikidata people importer.

        Args:
            batch_size: Number of records to fetch per SPARQL query (default 5000)
            delay_seconds: Delay between requests to be polite to the endpoint
            timeout: HTTP timeout in seconds (default 120)
            max_retries: Maximum retries per batch on timeout (default 3)
            min_batch_size: Minimum batch size before giving up (default 50)
        """
        self._batch_size = batch_size
        self._delay = delay_seconds
        self._timeout = timeout
        self._max_retries = max_retries
        self._min_batch_size = min_batch_size
        # Track discovered organizations: org_qid -> org_label
        self._discovered_orgs: dict[str, str] = {}

    def import_from_sparql(
        self,
        limit: Optional[int] = None,
        query_type: str = "executive",
        import_all: bool = False,
        convergence_threshold: int = 5,
    ) -> Iterator[PersonRecord]:
        """
        Import person records from Wikidata via SPARQL (bulk fetch phase).

        This performs the fast bulk import with minimal data (QID, name, country).
        Use enrich_people_batch() afterwards to add role/org/dates.

        Iterates through each role/occupation individually for faster queries,
        using random sampling with convergence detection per role.

        Args:
            limit: Optional limit on total records
            query_type: Which query to use (executive, politician, athlete, etc.)
            import_all: If True, run all query types sequentially
            convergence_threshold: Stop after this many consecutive batches with no new records per role

        Yields:
            PersonRecord for each person (without role/org - use enrich to add)
        """
        if import_all:
            yield from self._import_all_types(limit)
            return

        if query_type not in QUERY_TYPE_CONFIG:
            raise ValueError(f"Unknown query type: {query_type}. Use one of: {list(QUERY_TYPE_CONFIG.keys())}")

        config_groups = QUERY_TYPE_CONFIG[query_type]
        person_type = QUERY_TYPE_TO_PERSON_TYPE.get(query_type, PersonType.UNKNOWN)

        logger.info(f"Starting Wikidata bulk import (query_type={query_type}, person_type={person_type.value})...")

        total_count = 0
        # Track seen QIDs to deduplicate across all roles
        seen_qids: set[str] = set()

        # Iterate through each config group (e.g., position queries + occupation queries)
        for config in config_groups:
            if limit and total_count >= limit:
                break

            template_type = config["template"]
            items = config["items"]

            # For founder template, run a single query
            if template_type == "founder":
                for record in self._import_single_template(
                    template=FOUNDER_QUERY_TEMPLATE,
                    template_params={},
                    person_type=person_type,
                    seen_qids=seen_qids,
                    limit=(limit - total_count) if limit else None,
                    convergence_threshold=convergence_threshold,
                    role_name="founder",
                ):
                    total_count += 1
                    yield record
                continue

            # Select the right template
            if template_type == "position":
                template = POSITION_QUERY_TEMPLATE
                param_name = "role_qid"
            else:  # occupation
                template = OCCUPATION_QUERY_TEMPLATE
                param_name = "occupation_qid"

            # Iterate through each role/occupation in this group
            for item_qid in items:
                if limit and total_count >= limit:
                    break

                remaining = (limit - total_count) if limit else None
                role_count = 0

                for record in self._import_single_template(
                    template=template,
                    template_params={param_name: item_qid},
                    person_type=person_type,
                    seen_qids=seen_qids,
                    limit=remaining,
                    convergence_threshold=convergence_threshold,
                    role_name=item_qid,
                ):
                    role_count += 1
                    total_count += 1
                    yield record

                logger.info(f"Role {item_qid}: {role_count} new (total: {total_count})")

        logger.info(f"Completed Wikidata bulk import: {total_count} records (use enrich to add role/org)")

    def _import_single_template(
        self,
        template: str,
        template_params: dict[str, str],
        person_type: PersonType,
        seen_qids: set[str],
        limit: Optional[int],
        convergence_threshold: int,
        role_name: str,
    ) -> Iterator[PersonRecord]:
        """
        Import from a single role/occupation using random sampling with convergence.

        Args:
            template: SPARQL query template
            template_params: Parameters to format into template (role_qid or occupation_qid)
            person_type: PersonType to assign to records
            seen_qids: Set of already-seen QIDs (shared across roles)
            limit: Optional limit on records from this role
            convergence_threshold: Stop after this many consecutive empty batches
            role_name: Name for logging

        Yields:
            PersonRecord for each new person found
        """
        batch_num = 0
        total_count = 0
        current_batch_size = self._batch_size
        consecutive_empty_batches = 0

        logger.info(f"Querying role {role_name}...")

        while True:
            if limit and total_count >= limit:
                break

            batch_num += 1
            batch_limit = min(current_batch_size, (limit - total_count) if limit else current_batch_size)

            # Generate unique seed for this batch
            batch_seed = f"{role_name}_{batch_num}_{int(time.time() * 1000)}"

            # Build query
            query = template.format(
                **template_params,
                seed=batch_seed,
                limit=batch_limit,
            )

            # Execute with retries
            results = None
            retries = 0
            retry_batch_size = batch_limit

            while retries <= self._max_retries:
                try:
                    # Rebuild query with potentially smaller batch size
                    if retry_batch_size != batch_limit:
                        query = template.format(
                            **template_params,
                            seed=batch_seed,
                            limit=retry_batch_size,
                        )
                    results = self._execute_sparql(query)
                    if retry_batch_size < current_batch_size:
                        current_batch_size = retry_batch_size
                    break
                except Exception as e:
                    is_timeout = "timeout" in str(e).lower() or "504" in str(e) or "503" in str(e)
                    if is_timeout and retry_batch_size > self._min_batch_size:
                        retries += 1
                        retry_batch_size = max(retry_batch_size // 2, self._min_batch_size)
                        wait_time = self._delay * (2 ** retries)
                        logger.warning(
                            f"Timeout on {role_name} batch #{batch_num}, retry {retries}/{self._max_retries} "
                            f"with batch_size={retry_batch_size} after {wait_time:.1f}s wait"
                        )
                        time.sleep(wait_time)
                    else:
                        logger.error(f"SPARQL query failed on {role_name} batch #{batch_num}: {e}")
                        break

            if results is None:
                logger.warning(f"Giving up on {role_name} after {retries} retries")
                break

            bindings = results.get("results", {}).get("bindings", [])

            if not bindings:
                consecutive_empty_batches += 1
                if consecutive_empty_batches >= convergence_threshold:
                    logger.debug(f"Role {role_name}: convergence after {batch_num} batches")
                    break
                continue

            batch_count = 0
            for binding in bindings:
                if limit and total_count >= limit:
                    break

                record, skip_reason = self._parse_bulk_binding(binding, person_type=person_type)
                if record is None:
                    continue

                # Deduplicate
                if record.source_id in seen_qids:
                    continue

                seen_qids.add(record.source_id)
                total_count += 1
                batch_count += 1
                yield record

            # Check convergence
            if batch_count == 0:
                consecutive_empty_batches += 1
                if consecutive_empty_batches >= convergence_threshold:
                    logger.debug(f"Role {role_name}: convergence after {batch_num} batches")
                    break
            else:
                consecutive_empty_batches = 0

            # Rate limit
            if self._delay > 0:
                time.sleep(self._delay)

    def _import_all_types(self, limit: Optional[int]) -> Iterator[PersonRecord]:
        """Import from all query types sequentially, deduplicating across types."""
        # Track seen QIDs across all types
        seen_qids: set[str] = set()
        total_count = 0

        # Calculate per-type limits if a total limit is set
        num_types = len(QUERY_TYPE_CONFIG)
        per_type_limit = limit // num_types if limit else None

        for query_type in QUERY_TYPE_CONFIG:
            logger.info(f"=== Importing people: {query_type} ===")
            type_count = 0
            skipped_count = 0

            for record in self.import_from_sparql(limit=per_type_limit, query_type=query_type):
                if record.source_id in seen_qids:
                    skipped_count += 1
                    continue

                seen_qids.add(record.source_id)
                total_count += 1
                type_count += 1
                yield record

                if limit and total_count >= limit:
                    logger.info(f"Reached total limit of {limit} records")
                    return

            logger.info(
                f"Got {type_count} new from {query_type}, skipped {skipped_count} (total: {total_count})"
            )

        logger.info(f"Completed all query types: {total_count} total people records")

    @staticmethod
    def _parse_wikidata_date(date_str: str) -> Optional[str]:
        """
        Parse a Wikidata date string into ISO format (YYYY-MM-DD).

        Wikidata returns dates like "2020-01-15T00:00:00Z" or just "2020".
        Returns None if the date cannot be parsed.
        """
        if not date_str:
            return None
        # Handle ISO datetime format (e.g., "2020-01-15T00:00:00Z")
        if "T" in date_str:
            return date_str.split("T")[0]
        # Handle year-only format (e.g., "2020")
        if len(date_str) == 4 and date_str.isdigit():
            return f"{date_str}-01-01"
        # Return as-is if it looks like a date
        if len(date_str) >= 4:
            return date_str[:10]  # Take first 10 chars (YYYY-MM-DD)
        return None

    def _execute_sparql(self, query: str) -> dict[str, Any]:
        """Execute a SPARQL query against Wikidata."""
        params = urllib.parse.urlencode({
            "query": query,
            "format": "json",
        })

        url = f"{WIKIDATA_SPARQL_URL}?{params}"

        req = urllib.request.Request(
            url,
            headers={
                "Accept": "application/sparql-results+json",
                "User-Agent": "corp-extractor/1.0 (person database builder)",
            }
        )

        with urllib.request.urlopen(req, timeout=self._timeout) as response:
            return json.loads(response.read().decode("utf-8"))

    def _parse_bulk_binding(
        self,
        binding: dict[str, Any],
        person_type: PersonType = PersonType.UNKNOWN,
    ) -> tuple[Optional[PersonRecord], Optional[str]]:
        """
        Parse a bulk SPARQL result binding into a PersonRecord.

        Bulk bindings only have: person, personLabel, countryLabel, description.
        Role/org/dates are NOT included - use enrich methods to add them later.

        Returns:
            Tuple of (PersonRecord or None, skip_reason or None)
        """
        try:
            # Get Wikidata entity ID
            person_uri = binding.get("person", {}).get("value", "")
            if not person_uri:
                return None, "missing person URI"

            # Extract QID from URI (e.g., "http://www.wikidata.org/entity/Q312" -> "Q312")
            wikidata_id = person_uri.split("/")[-1]
            if not wikidata_id.startswith("Q"):
                return None, f"invalid Wikidata ID format: {wikidata_id}"

            # Get label
            label = binding.get("personLabel", {}).get("value", "")
            if not label:
                return None, f"{wikidata_id}: no label"
            if label == wikidata_id:
                return None, f"{wikidata_id}: no English label (label equals QID)"

            # Get optional fields from bulk query
            country = binding.get("countryLabel", {}).get("value", "")
            description = binding.get("description", {}).get("value", "")

            # Build minimal record data
            record_data: dict[str, Any] = {
                "wikidata_id": wikidata_id,
                "label": label,
            }
            if country:
                record_data["country"] = country
            if description:
                record_data["description"] = description

            return PersonRecord(
                name=label.strip(),
                source="wikidata",
                source_id=wikidata_id,
                country=country or "",
                person_type=person_type,
                known_for_role="",  # To be enriched later
                known_for_org="",   # To be enriched later
                from_date=None,     # To be enriched later
                to_date=None,       # To be enriched later
                record=record_data,
            ), None

        except Exception as e:
            return None, f"parse error: {e}"

    def _parse_binding_with_reason(
        self,
        binding: dict[str, Any],
        person_type: PersonType = PersonType.UNKNOWN,
    ) -> tuple[Optional[PersonRecord], Optional[str]]:
        """
        Parse a SPARQL result binding into a PersonRecord.

        Returns:
            Tuple of (PersonRecord or None, skip_reason or None)
        """
        try:
            # Get Wikidata entity ID
            person_uri = binding.get("person", {}).get("value", "")
            if not person_uri:
                return None, "missing person URI"

            # Extract QID from URI (e.g., "http://www.wikidata.org/entity/Q312" -> "Q312")
            wikidata_id = person_uri.split("/")[-1]
            if not wikidata_id.startswith("Q"):
                return None, f"invalid Wikidata ID format: {wikidata_id}"

            # Get label
            label = binding.get("personLabel", {}).get("value", "")
            if not label:
                return None, f"{wikidata_id}: no label"
            if label == wikidata_id:
                return None, f"{wikidata_id}: no English label (label equals QID)"

            # Get optional fields
            country = binding.get("countryLabel", {}).get("value", "")
            role = binding.get("roleLabel", {}).get("value", "")
            org_label = binding.get("orgLabel", {}).get("value", "")
            org_uri = binding.get("org", {}).get("value", "")
            description = binding.get("description", {}).get("value", "")

            # Extract org QID from URI (e.g., "http://www.wikidata.org/entity/Q715583" -> "Q715583")
            org_qid = ""
            if org_uri:
                org_qid = org_uri.split("/")[-1]
                if not org_qid.startswith("Q"):
                    org_qid = ""

            # Get dates (Wikidata returns ISO datetime, extract just the date part)
            start_date_raw = binding.get("startDate", {}).get("value", "")
            end_date_raw = binding.get("endDate", {}).get("value", "")
            from_date = WikidataPeopleImporter._parse_wikidata_date(start_date_raw)
            to_date = WikidataPeopleImporter._parse_wikidata_date(end_date_raw)

            # Clean up role and org label (remove QID if it's the same as the label)
            if role and role.startswith("Q"):
                role = ""
            if org_label and org_label.startswith("Q"):
                org_label = ""

            # Track discovered organization if we have both QID and label
            if org_qid and org_label:
                self._discovered_orgs[org_qid] = org_label

            # Build record data
            record_data: dict[str, Any] = {
                "wikidata_id": wikidata_id,
                "label": label,
            }
            if country:
                record_data["country"] = country
            if role:
                record_data["role"] = role
            if org_label:
                record_data["org"] = org_label
            if org_qid:
                record_data["org_qid"] = org_qid
            if description:
                record_data["description"] = description
            if from_date:
                record_data["from_date"] = from_date
            if to_date:
                record_data["to_date"] = to_date

            return PersonRecord(
                name=label.strip(),
                source="wikidata",
                source_id=wikidata_id,
                country=country or "",
                person_type=person_type,
                known_for_role=role or "",
                known_for_org=org_label or "",
                from_date=from_date,
                to_date=to_date,
                record=record_data,
            ), None

        except Exception as e:
            return None, f"parse error: {e}"

    def _parse_binding(
        self,
        binding: dict[str, Any],
        person_type: PersonType = PersonType.UNKNOWN,
    ) -> Optional[PersonRecord]:
        """Parse a SPARQL result binding into a PersonRecord (legacy wrapper)."""
        record, _ = self._parse_binding_with_reason(binding, person_type)
        return record

    def search_person(self, name: str, limit: int = 10) -> list[PersonRecord]:
        """
        Search for a specific person by name.

        Args:
            name: Person name to search for
            limit: Maximum results to return

        Returns:
            List of matching PersonRecords
        """
        # Use Wikidata search API for better name matching
        search_url = "https://www.wikidata.org/w/api.php"
        params = urllib.parse.urlencode({
            "action": "wbsearchentities",
            "search": name,
            "language": "en",
            "type": "item",
            "limit": limit,
            "format": "json",
        })

        req = urllib.request.Request(
            f"{search_url}?{params}",
            headers={"User-Agent": "corp-extractor/1.0"}
        )

        with urllib.request.urlopen(req, timeout=30) as response:
            data = json.loads(response.read().decode("utf-8"))

        results = []
        for item in data.get("search", []):
            qid = item.get("id")
            label = item.get("label", "")
            description = item.get("description", "")

            # Check if it looks like a person
            person_keywords = [
                "politician", "actor", "actress", "singer", "musician",
                "businessman", "businesswoman", "ceo", "executive", "director",
                "president", "founder", "professor", "scientist", "author",
                "writer", "journalist", "athlete", "player", "coach",
            ]
            description_lower = description.lower()
            is_person = any(kw in description_lower for kw in person_keywords)
            if not is_person:
                continue

            # Try to infer person type from description
            person_type = PersonType.UNKNOWN
            if any(kw in description_lower for kw in ["ceo", "executive", "businessman", "businesswoman"]):
                person_type = PersonType.EXECUTIVE
            elif any(kw in description_lower for kw in ["politician", "president", "senator", "minister"]):
                person_type = PersonType.POLITICIAN
            elif any(kw in description_lower for kw in ["athlete", "player", "coach"]):
                person_type = PersonType.ATHLETE
            elif any(kw in description_lower for kw in ["actor", "actress", "singer", "musician", "director"]):
                person_type = PersonType.ARTIST
            elif any(kw in description_lower for kw in ["professor", "academic"]):
                person_type = PersonType.ACADEMIC
            elif any(kw in description_lower for kw in ["scientist", "researcher"]):
                person_type = PersonType.SCIENTIST
            elif any(kw in description_lower for kw in ["journalist", "reporter"]):
                person_type = PersonType.JOURNALIST
            elif any(kw in description_lower for kw in ["founder", "entrepreneur"]):
                person_type = PersonType.ENTREPRENEUR

            record = PersonRecord(
                name=label,
                source="wikidata",
                source_id=qid,
                country="",  # Not available from search API
                person_type=person_type,
                known_for_role="",
                known_for_org="",
                record={
                    "wikidata_id": qid,
                    "label": label,
                    "description": description,
                },
            )
            results.append(record)

        return results

    def get_discovered_organizations(self) -> list[CompanyRecord]:
        """
        Get organizations discovered during the people import.

        These are organizations associated with people (employers, positions, etc.)
        that can be inserted into the organizations database if not already present.

        Returns:
            List of CompanyRecord objects for discovered organizations
        """
        records = []
        for org_qid, org_label in self._discovered_orgs.items():
            record = CompanyRecord(
                name=org_label,
                source="wikipedia",  # Use "wikipedia" as source per wikidata.py convention
                source_id=org_qid,
                region="",  # Not available from this context
                entity_type=EntityType.BUSINESS,  # Default to business for orgs linked to people
                record={
                    "wikidata_id": org_qid,
                    "label": org_label,
                    "discovered_from": "people_import",
                },
            )
            records.append(record)
        logger.info(f"Discovered {len(records)} organizations from people import")
        return records

    def clear_discovered_organizations(self) -> None:
        """Clear the discovered organizations cache."""
        self._discovered_orgs.clear()

    def enrich_person_dates(self, person_qid: str, role: str = "", org: str = "") -> tuple[Optional[str], Optional[str]]:
        """
        Query Wikidata to get start/end dates for a person's position.

        Args:
            person_qid: Wikidata QID of the person (e.g., 'Q123')
            role: Optional role label to match (e.g., 'chief executive officer')
            org: Optional org label to match (e.g., 'Apple Inc')

        Returns:
            Tuple of (from_date, to_date) in ISO format, or (None, None) if not found
        """
        # Query for position dates for this specific person
        # Uses rdfs:label instead of SERVICE wikibase:label for better performance
        query = """
        SELECT ?roleLabel ?orgLabel ?startDate ?endDate WHERE {
          wd:%s p:P39 ?positionStatement .
          ?positionStatement ps:P39 ?role .
          ?role rdfs:label ?roleLabel FILTER(LANG(?roleLabel) = "en") .
          OPTIONAL { ?positionStatement pq:P642 ?org . ?org rdfs:label ?orgLabel FILTER(LANG(?orgLabel) = "en") . }
          OPTIONAL { ?positionStatement pq:P580 ?startDate }
          OPTIONAL { ?positionStatement pq:P582 ?endDate }
        }
        LIMIT 50
        """ % person_qid

        try:
            url = f"{WIKIDATA_SPARQL_URL}?query={urllib.parse.quote(query)}&format=json"
            req = urllib.request.Request(url, headers={"User-Agent": "corp-extractor/1.0"})

            with urllib.request.urlopen(req, timeout=30) as response:
                data = json.loads(response.read().decode("utf-8"))

            # Find the best matching position
            best_start = None
            best_end = None

            for binding in data.get("results", {}).get("bindings", []):
                role_label = binding.get("roleLabel", {}).get("value", "")
                org_label = binding.get("orgLabel", {}).get("value", "")
                start_raw = binding.get("startDate", {}).get("value", "")
                end_raw = binding.get("endDate", {}).get("value", "")

                # If role/org specified, try to match
                if role and role.lower() not in role_label.lower():
                    continue
                if org and org.lower() not in org_label.lower():
                    continue

                # Parse dates
                start_date = self._parse_wikidata_date(start_raw)
                end_date = self._parse_wikidata_date(end_raw)

                # Prefer entries with dates
                if start_date or end_date:
                    best_start = start_date
                    best_end = end_date
                    break  # Found a match with dates

            return best_start, best_end

        except Exception as e:
            logger.debug(f"Failed to enrich dates for {person_qid}: {e}")
            return None, None

    def enrich_people_batch(
        self,
        people: list[PersonRecord],
        delay_seconds: float = 0.5,
    ) -> int:
        """
        Enrich a batch of people with start/end dates.

        Args:
            people: List of PersonRecord objects to enrich
            delay_seconds: Delay between requests

        Returns:
            Number of people enriched with dates
        """
        enriched_count = 0

        for person in people:
            if person.from_date or person.to_date:
                continue  # Already has dates

            qid = person.source_id
            role = person.known_for_role
            org = person.known_for_org

            from_date, to_date = self.enrich_person_dates(qid, role, org)

            if from_date or to_date:
                person.from_date = from_date
                person.to_date = to_date
                enriched_count += 1
                logger.debug(f"Enriched {person.name}: {from_date} - {to_date}")

            time.sleep(delay_seconds)

        logger.info(f"Enriched {enriched_count}/{len(people)} people with dates")
        return enriched_count

    def enrich_person_role_org(
        self, person_qid: str
    ) -> tuple[str, str, str, Optional[str], Optional[str]]:
        """
        Query Wikidata to get role, org, and dates for a person.

        Args:
            person_qid: Wikidata QID of the person (e.g., 'Q123')

        Returns:
            Tuple of (role_label, org_label, org_qid, from_date, to_date)
            Empty strings/None if not found
        """
        # Query for position held (P39) with org qualifier and dates
        # Uses rdfs:label instead of SERVICE wikibase:label for better performance
        query = """
        SELECT ?roleLabel ?org ?orgLabel ?startDate ?endDate WHERE {
          wd:%s p:P39 ?stmt .
          ?stmt ps:P39 ?role .
          ?role rdfs:label ?roleLabel FILTER(LANG(?roleLabel) = "en") .
          OPTIONAL { ?stmt pq:P642 ?org . ?org rdfs:label ?orgLabel FILTER(LANG(?orgLabel) = "en") . }
          OPTIONAL { ?stmt pq:P580 ?startDate . }
          OPTIONAL { ?stmt pq:P582 ?endDate . }
        }
        LIMIT 5
        """ % person_qid

        try:
            url = f"{WIKIDATA_SPARQL_URL}?query={urllib.parse.quote(query)}&format=json"
            req = urllib.request.Request(url, headers={"User-Agent": "corp-extractor/1.0"})

            with urllib.request.urlopen(req, timeout=30) as response:
                data = json.loads(response.read().decode("utf-8"))

            bindings = data.get("results", {}).get("bindings", [])

            # Find the best result (prefer one with org and dates)
            best_result = None
            for binding in bindings:
                role_label = binding.get("roleLabel", {}).get("value", "")
                org_label = binding.get("orgLabel", {}).get("value", "")
                org_uri = binding.get("org", {}).get("value", "")
                start_raw = binding.get("startDate", {}).get("value", "")
                end_raw = binding.get("endDate", {}).get("value", "")

                # Skip if role is just a QID (no label resolved)
                if role_label and role_label.startswith("Q"):
                    continue
                if org_label and org_label.startswith("Q"):
                    org_label = ""

                # Extract QID from URI
                org_qid = ""
                if org_uri:
                    org_qid = org_uri.split("/")[-1]
                    if not org_qid.startswith("Q"):
                        org_qid = ""

                from_date = self._parse_wikidata_date(start_raw)
                to_date = self._parse_wikidata_date(end_raw)

                result = (role_label, org_label, org_qid, from_date, to_date)

                # Prefer results with org and dates
                if org_label and (from_date or to_date):
                    return result
                elif org_label and best_result is None:
                    best_result = result
                elif role_label and best_result is None:
                    best_result = result

            if best_result:
                return best_result

            return "", "", "", None, None

        except Exception as e:
            logger.debug(f"Failed to enrich role/org for {person_qid}: {e}")
            return "", "", "", None, None

    def enrich_people_role_org_batch(
        self,
        people: list[PersonRecord],
        delay_seconds: float = 0.1,
        max_workers: int = 5,
    ) -> int:
        """
        Enrich a batch of people with role/org/dates data using parallel queries.

        Args:
            people: List of PersonRecord objects to enrich
            delay_seconds: Delay between requests (per worker)
            max_workers: Number of parallel workers (default 5 for Wikidata rate limits)

        Returns:
            Number of people enriched with role/org
        """
        from concurrent.futures import ThreadPoolExecutor, as_completed

        # Filter to people that need enrichment
        to_enrich = [p for p in people if not p.known_for_role and not p.known_for_org]

        if not to_enrich:
            logger.info("No people need enrichment")
            return 0

        enriched_count = 0
        total = len(to_enrich)

        def enrich_one(person: PersonRecord) -> tuple[PersonRecord, bool]:
            """Enrich a single person, returns (person, success)."""
            try:
                role, org, org_qid, from_date, to_date = self.enrich_person_role_org(person.source_id)

                if role or org:
                    person.known_for_role = role
                    person.known_for_org = org
                    if org_qid:
                        person.record["org_qid"] = org_qid
                    if from_date:
                        person.from_date = from_date
                    if to_date:
                        person.to_date = to_date
                    return person, True

                return person, False
            except Exception as e:
                logger.debug(f"Failed to enrich {person.source_id}: {e}")
                return person, False

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all tasks
            futures = {executor.submit(enrich_one, person): person for person in to_enrich}

            # Process results as they complete
            completed = 0
            for future in as_completed(futures):
                person, success = future.result()
                if success:
                    enriched_count += 1
                    logger.debug(f"Enriched {person.name}: {person.known_for_role} at {person.known_for_org}")

                completed += 1
                if completed % 100 == 0:
                    logger.info(f"Enriched {completed}/{total} people ({enriched_count} with data)...")

                # Small delay to avoid rate limiting
                time.sleep(delay_seconds)

        logger.info(f"Enriched {enriched_count}/{total} people with role/org/dates")
        return enriched_count