corp-extractor 0.9.0__py3-none-any.whl → 0.9.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +40 -9
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/RECORD +29 -26
- statement_extractor/cli.py +866 -77
- statement_extractor/database/hub.py +35 -127
- statement_extractor/database/importers/__init__.py +10 -2
- statement_extractor/database/importers/companies_house.py +16 -2
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +23 -0
- statement_extractor/database/importers/sec_edgar.py +17 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +151 -43
- statement_extractor/database/importers/wikidata_dump.py +1951 -0
- statement_extractor/database/importers/wikidata_people.py +823 -325
- statement_extractor/database/models.py +30 -6
- statement_extractor/database/store.py +1485 -60
- statement_extractor/document/deduplicator.py +10 -12
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +3 -2
- statement_extractor/models/statement.py +15 -17
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +12 -12
- statement_extractor/plugins/base.py +17 -17
- statement_extractor/plugins/extractors/gliner2.py +28 -28
- statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
- statement_extractor/plugins/qualifiers/person.py +11 -1
- statement_extractor/plugins/splitters/t5_gemma.py +35 -39
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
statement_extractor/database/importers/wikidata_people.py

```diff
@@ -4,6 +4,10 @@ Wikidata importer for the person database.
 Imports notable people data from Wikidata using SPARQL queries
 into the embedding database for person name matching.
 
+Uses a two-phase approach for reliability:
+1. Bulk fetch: Simple queries to get QID + name + country (fast, no timeouts)
+2. Enrich: Targeted per-person queries for role/org/dates (resumable)
+
 Notable people are those with English Wikipedia articles, ensuring
 a basic level of notability.
 
```
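The two-phase split described in this docstring shows up throughout the rest of the diff. As a rough sketch of how the new API fits together (class and method names are taken from the diff below; how the returned records are persisted is left to the caller):

```python
from statement_extractor.database.importers.wikidata_people import WikidataPeopleImporter

importer = WikidataPeopleImporter(batch_size=5000, delay_seconds=2.0)

# Phase 1: bulk fetch - fast queries returning only QID, name, and country
people = list(importer.import_from_sparql(query_type="executive", limit=1000))

# Phase 2: enrich - targeted per-person queries fill in role/org/dates
importer.enrich_people_role_org_batch(people, max_workers=5)
```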
```diff
@@ -27,275 +31,207 @@ import urllib.parse
 import urllib.request
 from typing import Any, Iterator, Optional
 
-from ..models import PersonRecord, PersonType
+from ..models import CompanyRecord, EntityType, PersonRecord, PersonType
 
 logger = logging.getLogger(__name__)
 
 # Wikidata SPARQL endpoint
 WIKIDATA_SPARQL_URL = "https://query.wikidata.org/sparql"
 
-#
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-  OPTIONAL {{ ?positionStatement pq:P642 ?org }}  # "of" qualifier
-  }}
-
-  # Fallback: direct employer
-  OPTIONAL {{ ?person wdt:P108 ?employer. BIND(?employer AS ?org) }}
-
-  # Get description
+# =============================================================================
+# BULK QUERIES - Simple, fast queries for initial import (no role/org/dates)
+# Uses rdfs:label instead of SERVICE wikibase:label for better performance
+# Each query targets a single role/occupation for speed
+# =============================================================================
+
+# Template for position-held queries (P39) - for executives, politicians
+# Matches people who held a position that IS the role, or is an INSTANCE OF the role
+# {role_qid} = single role QID, {seed} = unique seed, {limit} = batch limit
+POSITION_QUERY_TEMPLATE = """
+SELECT DISTINCT ?person ?personLabel ?countryLabel ?description WHERE {{
+  ?person wdt:P31 wd:Q5 .
+  ?person wdt:P39 ?position .
+  {{ ?position wdt:P31 wd:{role_qid} . }} UNION {{ VALUES ?position {{ wd:{role_qid} }} }}
+  ?person rdfs:label ?personLabel FILTER(LANG(?personLabel) = "en") .
+  OPTIONAL {{ ?person wdt:P27 ?country . ?country rdfs:label ?countryLabel FILTER(LANG(?countryLabel) = "en") . }}
   OPTIONAL {{ ?person schema:description ?description FILTER(LANG(?description) = "en") }}
-
-  # Must have English Wikipedia article (notability filter)
   ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
-
-  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
 }}
-
-
-"""
-
-# Query for business executives (CEOs, CFOs, board members, etc.) - P39 = executive positions
-EXECUTIVE_QUERY = """
-SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
-  ?person wdt:P31 wd:Q5 .  # Instance of human
-
-  # Has held executive position
-  ?person p:P39 ?positionStatement .
-  ?positionStatement ps:P39 ?role .
-
-  # Role is a type of corporate officer, board member, or executive
-  VALUES ?role {
-    # C-Suite
-    wd:Q484876     # CEO (Chief Executive Officer)
-    wd:Q623279     # CFO (Chief Financial Officer)
-    wd:Q1502675    # CTO (Chief Technology Officer)
-    wd:Q935019     # COO (Chief Operating Officer)
-    wd:Q1057716    # CMO (Chief Marketing Officer)
-    wd:Q2140589    # CIO (Chief Information Officer)
-    wd:Q1115042    # Chief Human Resources Officer
-    wd:Q4720025    # Chief Legal Officer / General Counsel
-    wd:Q60432825   # Chief Product Officer
-    wd:Q15967139   # Chief Strategy Officer
-    wd:Q15729310   # Chief Revenue Officer
-    wd:Q47523568   # Chief Digital Officer
-
-    # Board positions
-    wd:Q258557     # Chairman / Chairman of the Board
-    wd:Q114863313  # Vice Chairman
-    wd:Q726114     # President (business)
-    wd:Q1372944    # Vice President
-    wd:Q18918145   # Executive Vice President
-    wd:Q1057569    # Board of directors member
-    wd:Q24058752   # Non-executive director
-    wd:Q3578048    # Independent director
-
-    # Other executive roles
-    wd:Q476675     # Managing Director
-    wd:Q5441744    # Executive Director
-    wd:Q4188234    # General Manager
-    wd:Q38844673   # Group CEO
-    wd:Q97273203   # President and CEO
-    wd:Q60715311   # Chairman and CEO
-    wd:Q3563879    # Partner (business)
-    wd:Q3505845    # Senior Partner
-  }
-
-  OPTIONAL { ?positionStatement pq:P642 ?org }  # "of" qualifier
-  OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
-  OPTIONAL { ?person wdt:P27 ?country. }
-  OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
-
-  ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
-  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
-}
-LIMIT %d
-OFFSET %d
-"""
-
-# Query for politicians
-POLITICIAN_QUERY = """
-SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
-  ?person wdt:P31 wd:Q5 .  # Instance of human
-
-  # Occupation is politician
-  ?person wdt:P106 wd:Q82955 .
-
-  OPTIONAL {
-    ?person p:P39 ?positionStatement .
-    ?positionStatement ps:P39 ?role .
-    OPTIONAL { ?positionStatement pq:P642 ?org }
-  }
-  OPTIONAL { ?person wdt:P27 ?country. }
-  OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
-
-  ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
-  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
-}
-LIMIT %d
-OFFSET %d
-"""
-
-# Query for athletes
-ATHLETE_QUERY = """
-SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
-  ?person wdt:P31 wd:Q5 .  # Instance of human
-
-  # Is an athlete (has sports team membership P54 or is athlete P106)
-  { ?person wdt:P106 wd:Q2066131 . }  # Athlete occupation
-  UNION
-  { ?person wdt:P54 ?team . }  # Member of sports team
-
-  OPTIONAL { ?person wdt:P54 ?team . BIND(?team AS ?org) }
-  OPTIONAL { ?person wdt:P27 ?country. }
-  OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
-
-  ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
-  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
-}
-LIMIT %d
-OFFSET %d
-"""
-
-# Query for artists (actors, musicians, directors)
-ARTIST_QUERY = """
-SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
-  ?person wdt:P31 wd:Q5 .  # Instance of human
-
-  # Has artist occupation
-  ?person wdt:P106 ?occupation .
-  VALUES ?occupation {
-    wd:Q33999    # Actor
-    wd:Q177220   # Singer
-    wd:Q639669   # Musician
-    wd:Q2526255  # Film director
-    wd:Q36180    # Writer
-    wd:Q483501   # Artist
-  }
-
-  OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
-  OPTIONAL { ?person wdt:P27 ?country. }
-  OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
-
-  ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
-  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
-}
-LIMIT %d
-OFFSET %d
+ORDER BY MD5(CONCAT(STR(?person), "{seed}"))
+LIMIT {limit}
 """
 
-#
-
-
-
-
-
-
-
-  { ?person
-
-  OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
-  OPTIONAL { ?person wdt:P27 ?country. }
-  OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
-
-  ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
-  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
-}
-LIMIT %d
-OFFSET %d
-"""
-
-# Query for scientists
-SCIENTIST_QUERY = """
-SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
-  ?person wdt:P31 wd:Q5 .  # Instance of human
-
-  # Is a scientist or researcher
-  { ?person wdt:P106 wd:Q901 . }  # Scientist
-  UNION
-  { ?person wdt:P106 wd:Q1650915 . }  # Researcher
-
-  OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
-  OPTIONAL { ?person wdt:P27 ?country. }
-  OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
-
-  ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
-  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
-}
-LIMIT %d
-OFFSET %d
-"""
-
-# Query for journalists and media personalities
-JOURNALIST_QUERY = """
-SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
-  ?person wdt:P31 wd:Q5 .  # Instance of human
-
-  # Is a journalist or presenter
-  { ?person wdt:P106 wd:Q1930187 . }  # Journalist
-  UNION
-  { ?person wdt:P106 wd:Q13590141 . }  # Television presenter
-
-  OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
-  OPTIONAL { ?person wdt:P27 ?country. }
-  OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
-
+# Template for occupation queries (P106) - for athletes, artists, etc.
+# {occupation_qid} = single occupation QID, {seed} = unique seed, {limit} = batch limit
+OCCUPATION_QUERY_TEMPLATE = """
+SELECT DISTINCT ?person ?personLabel ?countryLabel ?description WHERE {{
+  ?person wdt:P31 wd:Q5 .
+  ?person wdt:P106 wd:{occupation_qid} .
+  ?person rdfs:label ?personLabel FILTER(LANG(?personLabel) = "en") .
+  OPTIONAL {{ ?person wdt:P27 ?country . ?country rdfs:label ?countryLabel FILTER(LANG(?countryLabel) = "en") . }}
+  OPTIONAL {{ ?person schema:description ?description FILTER(LANG(?description) = "en") }}
   ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
-
-}
-LIMIT %d
-OFFSET %d
+}}
+ORDER BY MD5(CONCAT(STR(?person), "{seed}"))
+LIMIT {limit}
 """
 
-#
-
-
-
-
-# Founded a company (inverse of P112)
+# Template for founder queries (P112) - for entrepreneurs
+# {seed} = unique seed, {limit} = batch limit
+FOUNDER_QUERY_TEMPLATE = """
+SELECT DISTINCT ?person ?personLabel ?countryLabel ?description WHERE {{
+  ?person wdt:P31 wd:Q5 .
   ?org wdt:P112 ?person .
-
-  OPTIONAL { ?person wdt:P27 ?country. }
-  OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
-
+  ?person rdfs:label ?personLabel FILTER(LANG(?personLabel) = "en") .
+  OPTIONAL {{ ?person wdt:P27 ?country . ?country rdfs:label ?countryLabel FILTER(LANG(?countryLabel) = "en") . }}
+  OPTIONAL {{ ?person schema:description ?description FILTER(LANG(?description) = "en") }}
   ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
-
-}
-LIMIT %d
-OFFSET %d
+}}
+ORDER BY MD5(CONCAT(STR(?person), "{seed}"))
+LIMIT {limit}
 """
 
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
+# Role QIDs for executives (position held - P39)
+EXECUTIVE_ROLES = [
+    "Q484876",     # CEO
+    "Q623279",     # CFO
+    "Q1502675",    # COO
+    "Q935019",     # CTO
+    "Q1057716",    # CIO
+    "Q2140589",    # CMO
+    "Q1115042",    # chairperson
+    "Q4720025",    # board of directors member
+    "Q60432825",   # chief human resources officer
+    "Q15967139",   # chief compliance officer
+    "Q15729310",   # chief risk officer
+    "Q47523568",   # chief legal officer
+    "Q258557",     # board chair
+    "Q114863313",  # chief sustainability officer
+    "Q726114",     # company president
+    "Q1372944",    # managing director
+    "Q18918145",   # chief commercial officer
+    "Q1057569",    # chief strategy officer
+    "Q24058752",   # chief product officer
+    "Q3578048",    # vice president
+    "Q476675",     # business executive (generic)
+    "Q5441744",    # finance director
+    "Q4188234",    # general manager
+    "Q38844673",   # chief data officer
+    "Q97273203",   # chief digital officer
+    "Q60715311",   # chief growth officer
+    "Q3563879",    # treasurer
+    "Q3505845",    # corporate secretary
+]
+
+# Role QIDs for politicians (position held - P39)
+POLITICIAN_ROLES = [
+    "Q30461",    # president
+    "Q14212",    # prime minister
+    "Q83307",    # minister
+    "Q2285706",  # head of government
+    "Q4175034",  # legislator
+    "Q486839",   # member of parliament
+    "Q193391",   # member of national legislature
+    "Q212071",   # mayor
+    "Q382617",   # governor
+    "Q116",      # monarch
+    "Q484529",   # member of congress
+]
+
+# Note: Politicians with generic position types (like "public office") may not be found
+# because querying all public office holders times out. This includes some mayors
+# whose positions are typed as "public office" rather than "mayor".
+
+# Occupation QIDs for athletes (P106)
+ATHLETE_OCCUPATIONS = [
+    "Q2066131",   # athlete
+    "Q937857",    # football player
+    "Q3665646",   # basketball player
+    "Q10871364",  # baseball player
+    "Q19204627",  # ice hockey player
+    "Q10843402",  # tennis player
+    "Q13381376",  # golfer
+    "Q11338576",  # boxer
+    "Q10873124",  # swimmer
+]
+
+# Occupation QIDs for artists (P106)
+ARTIST_OCCUPATIONS = [
+    "Q33999",     # actor
+    "Q177220",    # singer
+    "Q639669",    # musician
+    "Q2526255",   # film director
+    "Q36180",     # writer
+    "Q483501",    # artist
+    "Q488205",    # singer-songwriter
+    "Q753110",    # songwriter
+    "Q2405480",   # voice actor
+    "Q10800557",  # film actor
+]
+
+# Occupation QIDs for academics (P106)
+ACADEMIC_OCCUPATIONS = [
+    "Q121594",   # professor
+    "Q3400985",  # academic
+    "Q1622272",  # university professor
+]
+
+# Occupation QIDs for scientists (P106)
+SCIENTIST_OCCUPATIONS = [
+    "Q901",      # scientist
+    "Q1650915",  # researcher
+    "Q169470",   # physicist
+    "Q593644",   # chemist
+    "Q864503",   # biologist
+    "Q11063",    # astronomer
+]
+
+# Occupation QIDs for journalists (P106)
+JOURNALIST_OCCUPATIONS = [
+    "Q1930187",   # journalist
+    "Q13590141",  # news presenter
+    "Q947873",    # television presenter
+    "Q4263842",   # columnist
+]
+
+# Occupation QIDs for activists (P106)
+ACTIVIST_OCCUPATIONS = [
+    "Q15253558",  # activist
+    "Q11631410",  # human rights activist
+    "Q18939491",  # environmental activist
+]
+
+# Mapping query type to role/occupation lists and query template type
+# Each entry can have multiple query groups to combine different approaches
+QUERY_TYPE_CONFIG: dict[str, list[dict]] = {
+    "executive": [
+        {"template": "position", "items": EXECUTIVE_ROLES},
+    ],
+    "politician": [
+        {"template": "position", "items": POLITICIAN_ROLES},
+    ],
+    "athlete": [
+        {"template": "occupation", "items": ATHLETE_OCCUPATIONS},
+    ],
+    "artist": [
+        {"template": "occupation", "items": ARTIST_OCCUPATIONS},
+    ],
+    "academic": [
+        {"template": "occupation", "items": ACADEMIC_OCCUPATIONS},
+    ],
+    "scientist": [
+        {"template": "occupation", "items": SCIENTIST_OCCUPATIONS},
+    ],
+    "journalist": [
+        {"template": "occupation", "items": JOURNALIST_OCCUPATIONS},
+    ],
+    "activist": [
+        {"template": "occupation", "items": ACTIVIST_OCCUPATIONS},
+    ],
+    "entrepreneur": [
+        {"template": "founder", "items": []},  # No items, uses special template
+    ],
 }
-LIMIT %d
-OFFSET %d
-"""
 
 # Mapping query type to PersonType
 QUERY_TYPE_TO_PERSON_TYPE: dict[str, PersonType] = {
```
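The doubled braces in the new templates exist so that Python's `str.format` can substitute `{role_qid}`, `{seed}`, and `{limit}` while leaving SPARQL's own braces intact. A quick illustration (the QID is the first `EXECUTIVE_ROLES` entry above; the seed and limit are arbitrary):

```python
from statement_extractor.database.importers.wikidata_people import POSITION_QUERY_TEMPLATE

query = POSITION_QUERY_TEMPLATE.format(role_qid="Q484876", seed="demo_1", limit=500)

# After formatting, "{{" collapses to "{" and the placeholders are filled in,
# e.g. the UNION line becomes:
#   { ?position wdt:P31 wd:Q484876 . } UNION { VALUES ?position { wd:Q484876 } }
# and the query ends with:
#   ORDER BY MD5(CONCAT(STR(?person), "demo_1"))
#   LIMIT 500
print(query)
```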
```diff
@@ -310,19 +246,6 @@ QUERY_TYPE_TO_PERSON_TYPE: dict[str, PersonType] = {
     "activist": PersonType.ACTIVIST,
 }
 
-# Mapping query type to SPARQL query template
-QUERY_TYPES: dict[str, str] = {
-    "executive": EXECUTIVE_QUERY,
-    "politician": POLITICIAN_QUERY,
-    "athlete": ATHLETE_QUERY,
-    "artist": ARTIST_QUERY,
-    "academic": ACADEMIC_QUERY,
-    "scientist": SCIENTIST_QUERY,
-    "journalist": JOURNALIST_QUERY,
-    "entrepreneur": ENTREPRENEUR_QUERY,
-    "activist": ACTIVIST_QUERY,
-}
-
 
 class WikidataPeopleImporter:
     """
```
```diff
@@ -343,126 +266,311 @@ class WikidataPeopleImporter:
     - activist: Activists and advocates
     """
 
-    def __init__(
+    def __init__(
+        self,
+        batch_size: int = 5000,
+        delay_seconds: float = 2.0,
+        timeout: int = 120,
+        max_retries: int = 3,
+        min_batch_size: int = 50,
+    ):
         """
         Initialize the Wikidata people importer.
 
         Args:
-            batch_size: Number of records to fetch per SPARQL query (default
+            batch_size: Number of records to fetch per SPARQL query (default 5000)
             delay_seconds: Delay between requests to be polite to the endpoint
             timeout: HTTP timeout in seconds (default 120)
+            max_retries: Maximum retries per batch on timeout (default 3)
+            min_batch_size: Minimum batch size before giving up (default 50)
         """
         self._batch_size = batch_size
         self._delay = delay_seconds
         self._timeout = timeout
+        self._max_retries = max_retries
+        self._min_batch_size = min_batch_size
+        # Track discovered organizations: org_qid -> org_label
+        self._discovered_orgs: dict[str, str] = {}
 
     def import_from_sparql(
         self,
         limit: Optional[int] = None,
         query_type: str = "executive",
         import_all: bool = False,
+        convergence_threshold: int = 5,
     ) -> Iterator[PersonRecord]:
         """
-        Import person records from Wikidata via SPARQL.
+        Import person records from Wikidata via SPARQL (bulk fetch phase).
+
+        This performs the fast bulk import with minimal data (QID, name, country).
+        Use enrich_people_batch() afterwards to add role/org/dates.
+
+        Iterates through each role/occupation individually for faster queries,
+        using random sampling with convergence detection per role.
 
         Args:
             limit: Optional limit on total records
             query_type: Which query to use (executive, politician, athlete, etc.)
             import_all: If True, run all query types sequentially
+            convergence_threshold: Stop after this many consecutive batches with no new records per role
 
         Yields:
-            PersonRecord for each person
+            PersonRecord for each person (without role/org - use enrich to add)
         """
         if import_all:
             yield from self._import_all_types(limit)
             return
 
-        if query_type not in QUERY_TYPES:
-            raise ValueError(f"Unknown query type: {query_type}. Use one of: {list(QUERY_TYPES.keys())}")
+        if query_type not in QUERY_TYPE_CONFIG:
+            raise ValueError(f"Unknown query type: {query_type}. Use one of: {list(QUERY_TYPE_CONFIG.keys())}")
 
-
+        config_groups = QUERY_TYPE_CONFIG[query_type]
         person_type = QUERY_TYPE_TO_PERSON_TYPE.get(query_type, PersonType.UNKNOWN)
-        logger.info(f"Starting Wikidata people import via SPARQL (query_type={query_type}, person_type={person_type.value})...")
 
-
+        logger.info(f"Starting Wikidata bulk import (query_type={query_type}, person_type={person_type.value})...")
+
+        total_count = 0
+        # Track seen QIDs to deduplicate across all roles
+        seen_qids: set[str] = set()
+
+        # Iterate through each config group (e.g., position queries + occupation queries)
+        for config in config_groups:
+            if limit and total_count >= limit:
+                break
+
+            template_type = config["template"]
+            items = config["items"]
+
+            # For founder template, run a single query
+            if template_type == "founder":
+                for record in self._import_single_template(
+                    template=FOUNDER_QUERY_TEMPLATE,
+                    template_params={},
+                    person_type=person_type,
+                    seen_qids=seen_qids,
+                    limit=(limit - total_count) if limit else None,
+                    convergence_threshold=convergence_threshold,
+                    role_name="founder",
+                ):
+                    total_count += 1
+                    yield record
+                continue
+
+            # Select the right template
+            if template_type == "position":
+                template = POSITION_QUERY_TEMPLATE
+                param_name = "role_qid"
+            else:  # occupation
+                template = OCCUPATION_QUERY_TEMPLATE
+                param_name = "occupation_qid"
+
+            # Iterate through each role/occupation in this group
+            for item_qid in items:
+                if limit and total_count >= limit:
+                    break
+
+                remaining = (limit - total_count) if limit else None
+                role_count = 0
+
+                for record in self._import_single_template(
+                    template=template,
+                    template_params={param_name: item_qid},
+                    person_type=person_type,
+                    seen_qids=seen_qids,
+                    limit=remaining,
+                    convergence_threshold=convergence_threshold,
+                    role_name=item_qid,
+                ):
+                    role_count += 1
+                    total_count += 1
+                    yield record
+
+                logger.info(f"Role {item_qid}: {role_count} new (total: {total_count})")
+
+        logger.info(f"Completed Wikidata bulk import: {total_count} records (use enrich to add role/org)")
+
+    def _import_single_template(
+        self,
+        template: str,
+        template_params: dict[str, str],
+        person_type: PersonType,
+        seen_qids: set[str],
+        limit: Optional[int],
+        convergence_threshold: int,
+        role_name: str,
+    ) -> Iterator[PersonRecord]:
+        """
+        Import from a single role/occupation using random sampling with convergence.
+
+        Args:
+            template: SPARQL query template
+            template_params: Parameters to format into template (role_qid or occupation_qid)
+            person_type: PersonType to assign to records
+            seen_qids: Set of already-seen QIDs (shared across roles)
+            limit: Optional limit on records from this role
+            convergence_threshold: Stop after this many consecutive empty batches
+            role_name: Name for logging
+
+        Yields:
+            PersonRecord for each new person found
+        """
+        batch_num = 0
         total_count = 0
-
+        current_batch_size = self._batch_size
+        consecutive_empty_batches = 0
+
+        logger.info(f"Querying role {role_name}...")
 
         while True:
             if limit and total_count >= limit:
                 break
 
-
-
+            batch_num += 1
+            batch_limit = min(current_batch_size, (limit - total_count) if limit else current_batch_size)
 
-
+            # Generate unique seed for this batch
+            batch_seed = f"{role_name}_{batch_num}_{int(time.time() * 1000)}"
 
-
-
-
-
+            # Build query
+            query = template.format(
+                **template_params,
+                seed=batch_seed,
+                limit=batch_limit,
+            )
+
+            # Execute with retries
+            results = None
+            retries = 0
+            retry_batch_size = batch_limit
+
+            while retries <= self._max_retries:
+                try:
+                    # Rebuild query with potentially smaller batch size
+                    if retry_batch_size != batch_limit:
+                        query = template.format(
+                            **template_params,
+                            seed=batch_seed,
+                            limit=retry_batch_size,
+                        )
+                    results = self._execute_sparql(query)
+                    if retry_batch_size < current_batch_size:
+                        current_batch_size = retry_batch_size
+                    break
+                except Exception as e:
+                    is_timeout = "timeout" in str(e).lower() or "504" in str(e) or "503" in str(e)
+                    if is_timeout and retry_batch_size > self._min_batch_size:
+                        retries += 1
+                        retry_batch_size = max(retry_batch_size // 2, self._min_batch_size)
+                        wait_time = self._delay * (2 ** retries)
+                        logger.warning(
+                            f"Timeout on {role_name} batch #{batch_num}, retry {retries}/{self._max_retries} "
+                            f"with batch_size={retry_batch_size} after {wait_time:.1f}s wait"
+                        )
+                        time.sleep(wait_time)
+                    else:
+                        logger.error(f"SPARQL query failed on {role_name} batch #{batch_num}: {e}")
+                        break
+
+            if results is None:
+                logger.warning(f"Giving up on {role_name} after {retries} retries")
                 break
 
             bindings = results.get("results", {}).get("bindings", [])
 
             if not bindings:
-
-
+                consecutive_empty_batches += 1
+                if consecutive_empty_batches >= convergence_threshold:
+                    logger.debug(f"Role {role_name}: convergence after {batch_num} batches")
+                    break
+                continue
 
             batch_count = 0
             for binding in bindings:
                 if limit and total_count >= limit:
                     break
 
-                record = self.
-                if record
-
-                    total_count += 1
-                    batch_count += 1
-                    yield record
+                record, skip_reason = self._parse_bulk_binding(binding, person_type=person_type)
+                if record is None:
+                    continue
 
-
+                # Deduplicate
+                if record.source_id in seen_qids:
+                    continue
 
-
-
-
+                seen_qids.add(record.source_id)
+                total_count += 1
+                batch_count += 1
+                yield record
 
-
+            # Check convergence
+            if batch_count == 0:
+                consecutive_empty_batches += 1
+                if consecutive_empty_batches >= convergence_threshold:
+                    logger.debug(f"Role {role_name}: convergence after {batch_num} batches")
+                    break
+            else:
+                consecutive_empty_batches = 0
 
-            #
+            # Rate limit
             if self._delay > 0:
                 time.sleep(self._delay)
 
-        logger.info(f"Completed Wikidata people import: {total_count} records")
-
     def _import_all_types(self, limit: Optional[int]) -> Iterator[PersonRecord]:
         """Import from all query types sequentially, deduplicating across types."""
-
+        # Track seen QIDs across all types
+        seen_qids: set[str] = set()
         total_count = 0
 
         # Calculate per-type limits if a total limit is set
-        num_types = len(QUERY_TYPES)
+        num_types = len(QUERY_TYPE_CONFIG)
         per_type_limit = limit // num_types if limit else None
 
-        for query_type in QUERY_TYPES:
+        for query_type in QUERY_TYPE_CONFIG:
             logger.info(f"=== Importing people: {query_type} ===")
             type_count = 0
+            skipped_count = 0
 
             for record in self.import_from_sparql(limit=per_type_limit, query_type=query_type):
-                if record.source_id
-
-
-
-
+                if record.source_id in seen_qids:
+                    skipped_count += 1
+                    continue
+
+                seen_qids.add(record.source_id)
+                total_count += 1
+                type_count += 1
+                yield record
 
-
-
-
+                if limit and total_count >= limit:
+                    logger.info(f"Reached total limit of {limit} records")
+                    return
 
-            logger.info(
+            logger.info(
+                f"Got {type_count} new from {query_type}, skipped {skipped_count} (total: {total_count})"
+            )
 
         logger.info(f"Completed all query types: {total_count} total people records")
 
+    @staticmethod
+    def _parse_wikidata_date(date_str: str) -> Optional[str]:
+        """
+        Parse a Wikidata date string into ISO format (YYYY-MM-DD).
+
+        Wikidata returns dates like "2020-01-15T00:00:00Z" or just "2020".
+        Returns None if the date cannot be parsed.
+        """
+        if not date_str:
+            return None
+        # Handle ISO datetime format (e.g., "2020-01-15T00:00:00Z")
+        if "T" in date_str:
+            return date_str.split("T")[0]
+        # Handle year-only format (e.g., "2020")
+        if len(date_str) == 4 and date_str.isdigit():
+            return f"{date_str}-01-01"
+        # Return as-is if it looks like a date
+        if len(date_str) >= 4:
+            return date_str[:10]  # Take first 10 chars (YYYY-MM-DD)
+        return None
+
     def _execute_sparql(self, query: str) -> dict[str, Any]:
         """Execute a SPARQL query against Wikidata."""
         params = urllib.parse.urlencode({
```
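`_import_single_template` samples each role repeatedly with a fresh MD5 seed and stops once `convergence_threshold` consecutive batches yield nothing new; on endpoint timeouts it halves the batch size and backs off exponentially. A standalone sketch of just that retry policy (the helper name and the TimeoutError-based signaling are illustrative, not part of the package):

```python
import time

def fetch_with_shrinking_batches(run_query, batch_limit: int,
                                 min_batch_size: int = 50,
                                 max_retries: int = 3,
                                 base_delay: float = 2.0):
    """Illustrative mirror of the retry policy above: on timeout, halve the
    batch size (never below min_batch_size) and sleep base_delay * 2**retries
    before retrying; non-timeout errors simply propagate here."""
    retries = 0
    size = batch_limit
    while retries <= max_retries:
        try:
            return run_query(size), size  # also report the size that worked
        except TimeoutError:
            if size <= min_batch_size:
                break
            retries += 1
            size = max(size // 2, min_batch_size)
            time.sleep(base_delay * (2 ** retries))
    return None, size
```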
```diff
@@ -483,39 +591,126 @@
         with urllib.request.urlopen(req, timeout=self._timeout) as response:
             return json.loads(response.read().decode("utf-8"))
 
-    def
+    def _parse_bulk_binding(
         self,
         binding: dict[str, Any],
         person_type: PersonType = PersonType.UNKNOWN,
-    ) -> Optional[PersonRecord]:
-        """
+    ) -> tuple[Optional[PersonRecord], Optional[str]]:
+        """
+        Parse a bulk SPARQL result binding into a PersonRecord.
+
+        Bulk bindings only have: person, personLabel, countryLabel, description.
+        Role/org/dates are NOT included - use enrich methods to add them later.
+
+        Returns:
+            Tuple of (PersonRecord or None, skip_reason or None)
+        """
+        try:
+            # Get Wikidata entity ID
+            person_uri = binding.get("person", {}).get("value", "")
+            if not person_uri:
+                return None, "missing person URI"
+
+            # Extract QID from URI (e.g., "http://www.wikidata.org/entity/Q312" -> "Q312")
+            wikidata_id = person_uri.split("/")[-1]
+            if not wikidata_id.startswith("Q"):
+                return None, f"invalid Wikidata ID format: {wikidata_id}"
+
+            # Get label
+            label = binding.get("personLabel", {}).get("value", "")
+            if not label:
+                return None, f"{wikidata_id}: no label"
+            if label == wikidata_id:
+                return None, f"{wikidata_id}: no English label (label equals QID)"
+
+            # Get optional fields from bulk query
+            country = binding.get("countryLabel", {}).get("value", "")
+            description = binding.get("description", {}).get("value", "")
+
+            # Build minimal record data
+            record_data: dict[str, Any] = {
+                "wikidata_id": wikidata_id,
+                "label": label,
+            }
+            if country:
+                record_data["country"] = country
+            if description:
+                record_data["description"] = description
+
+            return PersonRecord(
+                name=label.strip(),
+                source="wikidata",
+                source_id=wikidata_id,
+                country=country or "",
+                person_type=person_type,
+                known_for_role="",  # To be enriched later
+                known_for_org="",  # To be enriched later
+                from_date=None,  # To be enriched later
+                to_date=None,  # To be enriched later
+                record=record_data,
+            ), None
+
+        except Exception as e:
+            return None, f"parse error: {e}"
+
+    def _parse_binding_with_reason(
+        self,
+        binding: dict[str, Any],
+        person_type: PersonType = PersonType.UNKNOWN,
+    ) -> tuple[Optional[PersonRecord], Optional[str]]:
+        """
+        Parse a SPARQL result binding into a PersonRecord.
+
+        Returns:
+            Tuple of (PersonRecord or None, skip_reason or None)
+        """
         try:
             # Get Wikidata entity ID
             person_uri = binding.get("person", {}).get("value", "")
             if not person_uri:
-                return None
+                return None, "missing person URI"
 
             # Extract QID from URI (e.g., "http://www.wikidata.org/entity/Q312" -> "Q312")
             wikidata_id = person_uri.split("/")[-1]
             if not wikidata_id.startswith("Q"):
-                return None
+                return None, f"invalid Wikidata ID format: {wikidata_id}"
 
             # Get label
             label = binding.get("personLabel", {}).get("value", "")
-            if not label:
-                return None
+            if not label:
+                return None, f"{wikidata_id}: no label"
+            if label == wikidata_id:
+                return None, f"{wikidata_id}: no English label (label equals QID)"
 
             # Get optional fields
             country = binding.get("countryLabel", {}).get("value", "")
             role = binding.get("roleLabel", {}).get("value", "")
-
+            org_label = binding.get("orgLabel", {}).get("value", "")
+            org_uri = binding.get("org", {}).get("value", "")
             description = binding.get("description", {}).get("value", "")
 
-            #
+            # Extract org QID from URI (e.g., "http://www.wikidata.org/entity/Q715583" -> "Q715583")
+            org_qid = ""
+            if org_uri:
+                org_qid = org_uri.split("/")[-1]
+                if not org_qid.startswith("Q"):
+                    org_qid = ""
+
+            # Get dates (Wikidata returns ISO datetime, extract just the date part)
+            start_date_raw = binding.get("startDate", {}).get("value", "")
+            end_date_raw = binding.get("endDate", {}).get("value", "")
+            from_date = WikidataPeopleImporter._parse_wikidata_date(start_date_raw)
+            to_date = WikidataPeopleImporter._parse_wikidata_date(end_date_raw)
+
+            # Clean up role and org label (remove QID if it's the same as the label)
             if role and role.startswith("Q"):
                 role = ""
-            if
-
+            if org_label and org_label.startswith("Q"):
+                org_label = ""
+
+            # Track discovered organization if we have both QID and label
+            if org_qid and org_label:
+                self._discovered_orgs[org_qid] = org_label
 
             # Build record data
             record_data: dict[str, Any] = {
```
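For reference, `_parse_bulk_binding` consumes one entry of the SPARQL JSON results. An illustrative binding (field values invented; Q312 is just the example QID used in the code comments, and the exact literal metadata keys follow the standard SPARQL JSON results format):

```python
# Illustrative shape of a single bulk-query binding from the Wikidata
# SPARQL endpoint's JSON results; the values below are made up.
binding = {
    "person": {"type": "uri", "value": "http://www.wikidata.org/entity/Q312"},
    "personLabel": {"xml:lang": "en", "type": "literal", "value": "Example Person"},
    "countryLabel": {"xml:lang": "en", "type": "literal", "value": "United States of America"},
    "description": {"xml:lang": "en", "type": "literal", "value": "business executive"},
}

# _parse_bulk_binding(binding, person_type=...) would return a PersonRecord
# with source_id="Q312", name="Example Person", and empty role/org fields;
# a missing or QID-valued personLabel yields (None, skip_reason) instead.
```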
```diff
@@ -526,10 +721,16 @@ class WikidataPeopleImporter:
                 record_data["country"] = country
             if role:
                 record_data["role"] = role
-            if
-                record_data["org"] =
+            if org_label:
+                record_data["org"] = org_label
+            if org_qid:
+                record_data["org_qid"] = org_qid
             if description:
                 record_data["description"] = description
+            if from_date:
+                record_data["from_date"] = from_date
+            if to_date:
+                record_data["to_date"] = to_date
 
             return PersonRecord(
                 name=label.strip(),
```
```diff
@@ -538,13 +739,23 @@
                 country=country or "",
                 person_type=person_type,
                 known_for_role=role or "",
-                known_for_org=
+                known_for_org=org_label or "",
+                from_date=from_date,
+                to_date=to_date,
                 record=record_data,
-            )
+            ), None
 
         except Exception as e:
-
-
+            return None, f"parse error: {e}"
+
+    def _parse_binding(
+        self,
+        binding: dict[str, Any],
+        person_type: PersonType = PersonType.UNKNOWN,
+    ) -> Optional[PersonRecord]:
+        """Parse a SPARQL result binding into a PersonRecord (legacy wrapper)."""
+        record, _ = self._parse_binding_with_reason(binding, person_type)
+        return record
 
     def search_person(self, name: str, limit: int = 10) -> list[PersonRecord]:
         """
```
```diff
@@ -630,3 +841,290 @@ class WikidataPeopleImporter:
             results.append(record)
 
         return results
+
+    def get_discovered_organizations(self) -> list[CompanyRecord]:
+        """
+        Get organizations discovered during the people import.
+
+        These are organizations associated with people (employers, positions, etc.)
+        that can be inserted into the organizations database if not already present.
+
+        Returns:
+            List of CompanyRecord objects for discovered organizations
+        """
+        records = []
+        for org_qid, org_label in self._discovered_orgs.items():
+            record = CompanyRecord(
+                name=org_label,
+                source="wikipedia",  # Use "wikipedia" as source per wikidata.py convention
+                source_id=org_qid,
+                region="",  # Not available from this context
+                entity_type=EntityType.BUSINESS,  # Default to business for orgs linked to people
+                record={
+                    "wikidata_id": org_qid,
+                    "label": org_label,
+                    "discovered_from": "people_import",
+                },
+            )
+            records.append(record)
+        logger.info(f"Discovered {len(records)} organizations from people import")
+        return records
+
+    def clear_discovered_organizations(self) -> None:
+        """Clear the discovered organizations cache."""
+        self._discovered_orgs.clear()
+
+    def enrich_person_dates(self, person_qid: str, role: str = "", org: str = "") -> tuple[Optional[str], Optional[str]]:
+        """
+        Query Wikidata to get start/end dates for a person's position.
+
+        Args:
+            person_qid: Wikidata QID of the person (e.g., 'Q123')
+            role: Optional role label to match (e.g., 'chief executive officer')
+            org: Optional org label to match (e.g., 'Apple Inc')
+
+        Returns:
+            Tuple of (from_date, to_date) in ISO format, or (None, None) if not found
+        """
+        # Query for position dates for this specific person
+        # Uses rdfs:label instead of SERVICE wikibase:label for better performance
+        query = """
+        SELECT ?roleLabel ?orgLabel ?startDate ?endDate WHERE {
+            wd:%s p:P39 ?positionStatement .
+            ?positionStatement ps:P39 ?role .
+            ?role rdfs:label ?roleLabel FILTER(LANG(?roleLabel) = "en") .
+            OPTIONAL { ?positionStatement pq:P642 ?org . ?org rdfs:label ?orgLabel FILTER(LANG(?orgLabel) = "en") . }
+            OPTIONAL { ?positionStatement pq:P580 ?startDate }
+            OPTIONAL { ?positionStatement pq:P582 ?endDate }
+        }
+        LIMIT 50
+        """ % person_qid
+
+        try:
+            url = f"{WIKIDATA_SPARQL_URL}?query={urllib.parse.quote(query)}&format=json"
+            req = urllib.request.Request(url, headers={"User-Agent": "corp-extractor/1.0"})
+
+            with urllib.request.urlopen(req, timeout=30) as response:
+                data = json.loads(response.read().decode("utf-8"))
+
+            # Find the best matching position
+            best_start = None
+            best_end = None
+
+            for binding in data.get("results", {}).get("bindings", []):
+                role_label = binding.get("roleLabel", {}).get("value", "")
+                org_label = binding.get("orgLabel", {}).get("value", "")
+                start_raw = binding.get("startDate", {}).get("value", "")
+                end_raw = binding.get("endDate", {}).get("value", "")
+
+                # If role/org specified, try to match
+                if role and role.lower() not in role_label.lower():
+                    continue
+                if org and org.lower() not in org_label.lower():
+                    continue
+
+                # Parse dates
+                start_date = self._parse_wikidata_date(start_raw)
+                end_date = self._parse_wikidata_date(end_raw)
+
+                # Prefer entries with dates
+                if start_date or end_date:
+                    best_start = start_date
+                    best_end = end_date
+                    break  # Found a match with dates
+
+            return best_start, best_end
+
+        except Exception as e:
+            logger.debug(f"Failed to enrich dates for {person_qid}: {e}")
+            return None, None
+
+    def enrich_people_batch(
+        self,
+        people: list[PersonRecord],
+        delay_seconds: float = 0.5,
+    ) -> int:
+        """
+        Enrich a batch of people with start/end dates.
+
+        Args:
+            people: List of PersonRecord objects to enrich
+            delay_seconds: Delay between requests
+
+        Returns:
+            Number of people enriched with dates
+        """
+        enriched_count = 0
+
+        for person in people:
+            if person.from_date or person.to_date:
+                continue  # Already has dates
+
+            qid = person.source_id
+            role = person.known_for_role
+            org = person.known_for_org
+
+            from_date, to_date = self.enrich_person_dates(qid, role, org)
+
+            if from_date or to_date:
+                person.from_date = from_date
+                person.to_date = to_date
+                enriched_count += 1
+                logger.debug(f"Enriched {person.name}: {from_date} - {to_date}")
+
+            time.sleep(delay_seconds)
+
+        logger.info(f"Enriched {enriched_count}/{len(people)} people with dates")
+        return enriched_count
+
+    def enrich_person_role_org(
+        self, person_qid: str
+    ) -> tuple[str, str, str, Optional[str], Optional[str]]:
+        """
+        Query Wikidata to get role, org, and dates for a person.
+
+        Args:
+            person_qid: Wikidata QID of the person (e.g., 'Q123')
+
+        Returns:
+            Tuple of (role_label, org_label, org_qid, from_date, to_date)
+            Empty strings/None if not found
+        """
+        # Query for position held (P39) with org qualifier and dates
+        # Uses rdfs:label instead of SERVICE wikibase:label for better performance
+        query = """
+        SELECT ?roleLabel ?org ?orgLabel ?startDate ?endDate WHERE {
+            wd:%s p:P39 ?stmt .
+            ?stmt ps:P39 ?role .
+            ?role rdfs:label ?roleLabel FILTER(LANG(?roleLabel) = "en") .
+            OPTIONAL { ?stmt pq:P642 ?org . ?org rdfs:label ?orgLabel FILTER(LANG(?orgLabel) = "en") . }
+            OPTIONAL { ?stmt pq:P580 ?startDate . }
+            OPTIONAL { ?stmt pq:P582 ?endDate . }
+        }
+        LIMIT 5
+        """ % person_qid
+
+        try:
+            url = f"{WIKIDATA_SPARQL_URL}?query={urllib.parse.quote(query)}&format=json"
+            req = urllib.request.Request(url, headers={"User-Agent": "corp-extractor/1.0"})
+
+            with urllib.request.urlopen(req, timeout=30) as response:
+                data = json.loads(response.read().decode("utf-8"))
+
+            bindings = data.get("results", {}).get("bindings", [])
+
+            # Find the best result (prefer one with org and dates)
+            best_result = None
+            for binding in bindings:
+                role_label = binding.get("roleLabel", {}).get("value", "")
+                org_label = binding.get("orgLabel", {}).get("value", "")
+                org_uri = binding.get("org", {}).get("value", "")
+                start_raw = binding.get("startDate", {}).get("value", "")
+                end_raw = binding.get("endDate", {}).get("value", "")
+
+                # Skip if role is just a QID (no label resolved)
+                if role_label and role_label.startswith("Q"):
+                    continue
+                if org_label and org_label.startswith("Q"):
+                    org_label = ""
+
+                # Extract QID from URI
+                org_qid = ""
+                if org_uri:
+                    org_qid = org_uri.split("/")[-1]
+                    if not org_qid.startswith("Q"):
+                        org_qid = ""
+
+                from_date = self._parse_wikidata_date(start_raw)
+                to_date = self._parse_wikidata_date(end_raw)
+
+                result = (role_label, org_label, org_qid, from_date, to_date)
+
+                # Prefer results with org and dates
+                if org_label and (from_date or to_date):
+                    return result
+                elif org_label and best_result is None:
+                    best_result = result
+                elif role_label and best_result is None:
+                    best_result = result
+
+            if best_result:
+                return best_result
+
+            return "", "", "", None, None
+
+        except Exception as e:
+            logger.debug(f"Failed to enrich role/org for {person_qid}: {e}")
+            return "", "", "", None, None
+
+    def enrich_people_role_org_batch(
+        self,
+        people: list[PersonRecord],
+        delay_seconds: float = 0.1,
+        max_workers: int = 5,
+    ) -> int:
+        """
+        Enrich a batch of people with role/org/dates data using parallel queries.
+
+        Args:
+            people: List of PersonRecord objects to enrich
+            delay_seconds: Delay between requests (per worker)
+            max_workers: Number of parallel workers (default 5 for Wikidata rate limits)
+
+        Returns:
+            Number of people enriched with role/org
+        """
+        from concurrent.futures import ThreadPoolExecutor, as_completed
+
+        # Filter to people that need enrichment
+        to_enrich = [p for p in people if not p.known_for_role and not p.known_for_org]
+
+        if not to_enrich:
+            logger.info("No people need enrichment")
+            return 0
+
+        enriched_count = 0
+        total = len(to_enrich)
+
+        def enrich_one(person: PersonRecord) -> tuple[PersonRecord, bool]:
+            """Enrich a single person, returns (person, success)."""
+            try:
+                role, org, org_qid, from_date, to_date = self.enrich_person_role_org(person.source_id)
+
+                if role or org:
+                    person.known_for_role = role
+                    person.known_for_org = org
+                    if org_qid:
+                        person.record["org_qid"] = org_qid
+                    if from_date:
+                        person.from_date = from_date
+                    if to_date:
+                        person.to_date = to_date
+                    return person, True
+
+                return person, False
+            except Exception as e:
+                logger.debug(f"Failed to enrich {person.source_id}: {e}")
+                return person, False
+
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            # Submit all tasks
+            futures = {executor.submit(enrich_one, person): person for person in to_enrich}
+
+            # Process results as they complete
+            completed = 0
+            for future in as_completed(futures):
+                person, success = future.result()
+                if success:
+                    enriched_count += 1
+                    logger.debug(f"Enriched {person.name}: {person.known_for_role} at {person.known_for_org}")
+
+                completed += 1
+                if completed % 100 == 0:
+                    logger.info(f"Enriched {completed}/{total} people ({enriched_count} with data)...")
+
+                # Small delay to avoid rate limiting
+                time.sleep(delay_seconds)
+
+        logger.info(f"Enriched {enriched_count}/{total} people with role/org/dates")
+        return enriched_count
```