corp-extractor 0.4.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
- corp_extractor-0.9.0.dist-info/RECORD +76 -0
- statement_extractor/__init__.py +10 -1
- statement_extractor/cli.py +1663 -17
- statement_extractor/data/default_predicates.json +368 -0
- statement_extractor/data/statement_taxonomy.json +6972 -0
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +520 -0
- statement_extractor/database/importers/__init__.py +24 -0
- statement_extractor/database/importers/companies_house.py +545 -0
- statement_extractor/database/importers/gleif.py +538 -0
- statement_extractor/database/importers/sec_edgar.py +375 -0
- statement_extractor/database/importers/wikidata.py +1012 -0
- statement_extractor/database/importers/wikidata_people.py +632 -0
- statement_extractor/database/models.py +230 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +1609 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +173 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/extractor.py +1 -23
- statement_extractor/gliner_extraction.py +4 -74
- statement_extractor/llm.py +255 -0
- statement_extractor/models/__init__.py +89 -0
- statement_extractor/models/canonical.py +182 -0
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/entity.py +102 -0
- statement_extractor/models/labels.py +220 -0
- statement_extractor/models/qualifiers.py +139 -0
- statement_extractor/models/statement.py +101 -0
- statement_extractor/models.py +4 -1
- statement_extractor/pipeline/__init__.py +39 -0
- statement_extractor/pipeline/config.py +129 -0
- statement_extractor/pipeline/context.py +177 -0
- statement_extractor/pipeline/orchestrator.py +416 -0
- statement_extractor/pipeline/registry.py +303 -0
- statement_extractor/plugins/__init__.py +55 -0
- statement_extractor/plugins/base.py +716 -0
- statement_extractor/plugins/extractors/__init__.py +13 -0
- statement_extractor/plugins/extractors/base.py +9 -0
- statement_extractor/plugins/extractors/gliner2.py +546 -0
- statement_extractor/plugins/labelers/__init__.py +29 -0
- statement_extractor/plugins/labelers/base.py +9 -0
- statement_extractor/plugins/labelers/confidence.py +138 -0
- statement_extractor/plugins/labelers/relation_type.py +87 -0
- statement_extractor/plugins/labelers/sentiment.py +159 -0
- statement_extractor/plugins/labelers/taxonomy.py +386 -0
- statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +30 -0
- statement_extractor/plugins/qualifiers/base.py +9 -0
- statement_extractor/plugins/qualifiers/companies_house.py +185 -0
- statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
- statement_extractor/plugins/qualifiers/gleif.py +197 -0
- statement_extractor/plugins/qualifiers/person.py +785 -0
- statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/__init__.py +13 -0
- statement_extractor/plugins/splitters/base.py +9 -0
- statement_extractor/plugins/splitters/t5_gemma.py +293 -0
- statement_extractor/plugins/taxonomy/__init__.py +13 -0
- statement_extractor/plugins/taxonomy/embedding.py +484 -0
- statement_extractor/plugins/taxonomy/mnli.py +291 -0
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.4.0.dist-info/RECORD +0 -12
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
--- /dev/null
+++ b/statement_extractor/database/importers/wikidata_people.py
@@ -0,0 +1,632 @@
+"""
+Wikidata importer for the person database.
+
+Imports notable people data from Wikidata using SPARQL queries
+into the embedding database for person name matching.
+
+Notable people are those with English Wikipedia articles, ensuring
+a basic level of notability.
+
+Query categories (organized by PersonType):
+- executives: Business executives (CEOs, CFOs, etc.)
+- politicians: Politicians and diplomats
+- athletes: Sports figures
+- artists: Actors, musicians, directors
+- academics: Professors and researchers
+- scientists: Scientists and inventors
+- journalists: Media personalities
+- entrepreneurs: Founders and business owners
+
+Uses the public Wikidata Query Service endpoint.
+"""
+
+import json
+import logging
+import time
+import urllib.parse
+import urllib.request
+from typing import Any, Iterator, Optional
+
+from ..models import PersonRecord, PersonType
+
+logger = logging.getLogger(__name__)
+
+# Wikidata SPARQL endpoint
+WIKIDATA_SPARQL_URL = "https://query.wikidata.org/sparql"
+
+# Base query template for people with Wikipedia articles
+# Gets person, their position/role, and organization
+PERSON_BASE_QUERY = """
+SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {{
+  ?person wdt:P31 wd:Q5 .  # Instance of human
+
+  # Filter condition specific to query type
+  {filter_condition}
+
+  # Get country of citizenship
+  OPTIONAL {{ ?person wdt:P27 ?country. }}
+
+  # Get position held and associated organization
+  OPTIONAL {{
+    ?person p:P39 ?positionStatement .
+    ?positionStatement ps:P39 ?role .
+    OPTIONAL {{ ?positionStatement pq:P642 ?org }}  # "of" qualifier
+  }}
+
+  # Fallback: direct employer
+  OPTIONAL {{ ?person wdt:P108 ?employer. BIND(?employer AS ?org) }}
+
+  # Get description
+  OPTIONAL {{ ?person schema:description ?description FILTER(LANG(?description) = "en") }}
+
+  # Must have English Wikipedia article (notability filter)
+  ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
+
+  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
+}}
+LIMIT %d
+OFFSET %d
+"""
+
+# Query for business executives (CEOs, CFOs, board members, etc.) - P39 = executive positions
+EXECUTIVE_QUERY = """
+SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
+  ?person wdt:P31 wd:Q5 .  # Instance of human
+
+  # Has held executive position
+  ?person p:P39 ?positionStatement .
+  ?positionStatement ps:P39 ?role .
+
+  # Role is a type of corporate officer, board member, or executive
+  VALUES ?role {
+    # C-Suite
+    wd:Q484876    # CEO (Chief Executive Officer)
+    wd:Q623279    # CFO (Chief Financial Officer)
+    wd:Q1502675   # CTO (Chief Technology Officer)
+    wd:Q935019    # COO (Chief Operating Officer)
+    wd:Q1057716   # CMO (Chief Marketing Officer)
+    wd:Q2140589   # CIO (Chief Information Officer)
+    wd:Q1115042   # Chief Human Resources Officer
+    wd:Q4720025   # Chief Legal Officer / General Counsel
+    wd:Q60432825  # Chief Product Officer
+    wd:Q15967139  # Chief Strategy Officer
+    wd:Q15729310  # Chief Revenue Officer
+    wd:Q47523568  # Chief Digital Officer
+
+    # Board positions
+    wd:Q258557    # Chairman / Chairman of the Board
+    wd:Q114863313 # Vice Chairman
+    wd:Q726114    # President (business)
+    wd:Q1372944   # Vice President
+    wd:Q18918145  # Executive Vice President
+    wd:Q1057569   # Board of directors member
+    wd:Q24058752  # Non-executive director
+    wd:Q3578048   # Independent director
+
+    # Other executive roles
+    wd:Q476675    # Managing Director
+    wd:Q5441744   # Executive Director
+    wd:Q4188234   # General Manager
+    wd:Q38844673  # Group CEO
+    wd:Q97273203  # President and CEO
+    wd:Q60715311  # Chairman and CEO
+    wd:Q3563879   # Partner (business)
+    wd:Q3505845   # Senior Partner
+  }
+
+  OPTIONAL { ?positionStatement pq:P642 ?org }  # "of" qualifier
+  OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
+  OPTIONAL { ?person wdt:P27 ?country. }
+  OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
+
+  ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
+  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
+}
+LIMIT %d
+OFFSET %d
+"""
+
+# Query for politicians
+POLITICIAN_QUERY = """
+SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
+  ?person wdt:P31 wd:Q5 .  # Instance of human
+
+  # Occupation is politician
+  ?person wdt:P106 wd:Q82955 .
+
+  OPTIONAL {
+    ?person p:P39 ?positionStatement .
+    ?positionStatement ps:P39 ?role .
+    OPTIONAL { ?positionStatement pq:P642 ?org }
+  }
+  OPTIONAL { ?person wdt:P27 ?country. }
+  OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
+
+  ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
+  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
+}
+LIMIT %d
+OFFSET %d
+"""
+
+# Query for athletes
+ATHLETE_QUERY = """
+SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
+  ?person wdt:P31 wd:Q5 .  # Instance of human
+
+  # Is an athlete (has sports team membership P54 or is athlete P106)
+  { ?person wdt:P106 wd:Q2066131 . }  # Athlete occupation
+  UNION
+  { ?person wdt:P54 ?team . }  # Member of sports team
+
+  OPTIONAL { ?person wdt:P54 ?team . BIND(?team AS ?org) }
+  OPTIONAL { ?person wdt:P27 ?country. }
+  OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
+
+  ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
+  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
+}
+LIMIT %d
+OFFSET %d
+"""
+
+# Query for artists (actors, musicians, directors)
+ARTIST_QUERY = """
+SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
+  ?person wdt:P31 wd:Q5 .  # Instance of human
+
+  # Has artist occupation
+  ?person wdt:P106 ?occupation .
+  VALUES ?occupation {
+    wd:Q33999    # Actor
+    wd:Q177220   # Singer
+    wd:Q639669   # Musician
+    wd:Q2526255  # Film director
+    wd:Q36180    # Writer
+    wd:Q483501   # Artist
+  }
+
+  OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
+  OPTIONAL { ?person wdt:P27 ?country. }
+  OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
+
+  ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
+  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
+}
+LIMIT %d
+OFFSET %d
+"""
+
+# Query for academics (professors)
+ACADEMIC_QUERY = """
+SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
+  ?person wdt:P31 wd:Q5 .  # Instance of human
+
+  # Is a professor or academic
+  { ?person wdt:P106 wd:Q121594 . }  # Professor
+  UNION
+  { ?person wdt:P106 wd:Q3400985 . }  # Academic
+
+  OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
+  OPTIONAL { ?person wdt:P27 ?country. }
+  OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
+
+  ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
+  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
+}
+LIMIT %d
+OFFSET %d
+"""
+
+# Query for scientists
+SCIENTIST_QUERY = """
+SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
+  ?person wdt:P31 wd:Q5 .  # Instance of human
+
+  # Is a scientist or researcher
+  { ?person wdt:P106 wd:Q901 . }  # Scientist
+  UNION
+  { ?person wdt:P106 wd:Q1650915 . }  # Researcher
+
+  OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
+  OPTIONAL { ?person wdt:P27 ?country. }
+  OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
+
+  ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
+  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
+}
+LIMIT %d
+OFFSET %d
+"""
+
+# Query for journalists and media personalities
+JOURNALIST_QUERY = """
+SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
+  ?person wdt:P31 wd:Q5 .  # Instance of human
+
+  # Is a journalist or presenter
+  { ?person wdt:P106 wd:Q1930187 . }  # Journalist
+  UNION
+  { ?person wdt:P106 wd:Q13590141 . }  # Television presenter
+
+  OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
+  OPTIONAL { ?person wdt:P27 ?country. }
+  OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
+
+  ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
+  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
+}
+LIMIT %d
+OFFSET %d
+"""
+
+# Query for entrepreneurs (founders)
+ENTREPRENEUR_QUERY = """
+SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
+  ?person wdt:P31 wd:Q5 .  # Instance of human
+
+  # Founded a company (inverse of P112)
+  ?org wdt:P112 ?person .
+
+  OPTIONAL { ?person wdt:P27 ?country. }
+  OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
+
+  ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
+  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
+}
+LIMIT %d
+OFFSET %d
+"""
+
+# Query for activists
+ACTIVIST_QUERY = """
+SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
+  ?person wdt:P31 wd:Q5 .  # Instance of human
+
+  # Is an activist
+  ?person wdt:P106 wd:Q15253558 .  # Activist
+
+  OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
+  OPTIONAL { ?person wdt:P27 ?country. }
+  OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
+
+  ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
+  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
+}
+LIMIT %d
+OFFSET %d
+"""
+
+# Mapping query type to PersonType
+QUERY_TYPE_TO_PERSON_TYPE: dict[str, PersonType] = {
+    "executive": PersonType.EXECUTIVE,
+    "politician": PersonType.POLITICIAN,
+    "athlete": PersonType.ATHLETE,
+    "artist": PersonType.ARTIST,
+    "academic": PersonType.ACADEMIC,
+    "scientist": PersonType.SCIENTIST,
+    "journalist": PersonType.JOURNALIST,
+    "entrepreneur": PersonType.ENTREPRENEUR,
+    "activist": PersonType.ACTIVIST,
+}
+
+# Mapping query type to SPARQL query template
+QUERY_TYPES: dict[str, str] = {
+    "executive": EXECUTIVE_QUERY,
+    "politician": POLITICIAN_QUERY,
+    "athlete": ATHLETE_QUERY,
+    "artist": ARTIST_QUERY,
+    "academic": ACADEMIC_QUERY,
+    "scientist": SCIENTIST_QUERY,
+    "journalist": JOURNALIST_QUERY,
+    "entrepreneur": ENTREPRENEUR_QUERY,
+    "activist": ACTIVIST_QUERY,
+}
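The two dicts key everything off the same query-type string. A short sketch of the lookup that `import_from_sparql` performs further down, shown here with example values:

```python
# Sketch: resolve a query-type name to its SPARQL template and PersonType,
# then render the first page of the query (LIMIT 50 OFFSET 0).
query_type = "executive"
template = QUERY_TYPES[query_type]
person_type = QUERY_TYPE_TO_PERSON_TYPE.get(query_type, PersonType.UNKNOWN)
first_page = template % (50, 0)
```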
+
+
+class WikidataPeopleImporter:
+    """
+    Importer for Wikidata person data.
+
+    Uses SPARQL queries against the public Wikidata Query Service
+    to fetch notable people including executives, politicians, athletes, etc.
+
+    Query types:
+    - executive: Business executives (CEOs, CFOs, etc.)
+    - politician: Politicians and diplomats
+    - athlete: Sports figures
+    - artist: Actors, musicians, directors, writers
+    - academic: Professors and researchers
+    - scientist: Scientists and inventors
+    - journalist: Media personalities
+    - entrepreneur: Company founders
+    - activist: Activists and advocates
+    """
+
+    def __init__(self, batch_size: int = 1000, delay_seconds: float = 2.0, timeout: int = 120):
+        """
+        Initialize the Wikidata people importer.
+
+        Args:
+            batch_size: Number of records to fetch per SPARQL query (default 1000)
+            delay_seconds: Delay between requests to be polite to the endpoint
+            timeout: HTTP timeout in seconds (default 120)
+        """
+        self._batch_size = batch_size
+        self._delay = delay_seconds
+        self._timeout = timeout
+
+    def import_from_sparql(
+        self,
+        limit: Optional[int] = None,
+        query_type: str = "executive",
+        import_all: bool = False,
+    ) -> Iterator[PersonRecord]:
+        """
+        Import person records from Wikidata via SPARQL.
+
+        Args:
+            limit: Optional limit on total records
+            query_type: Which query to use (executive, politician, athlete, etc.)
+            import_all: If True, run all query types sequentially
+
+        Yields:
+            PersonRecord for each person
+        """
+        if import_all:
+            yield from self._import_all_types(limit)
+            return
+
+        if query_type not in QUERY_TYPES:
+            raise ValueError(f"Unknown query type: {query_type}. Use one of: {list(QUERY_TYPES.keys())}")
+
+        query_template = QUERY_TYPES[query_type]
+        person_type = QUERY_TYPE_TO_PERSON_TYPE.get(query_type, PersonType.UNKNOWN)
+        logger.info(f"Starting Wikidata people import via SPARQL (query_type={query_type}, person_type={person_type.value})...")
+
+        offset = 0
+        total_count = 0
+        seen_ids = set()  # Track seen Wikidata IDs to avoid duplicates
+
+        while True:
+            if limit and total_count >= limit:
+                break
+
+            batch_limit = min(self._batch_size, (limit - total_count) if limit else self._batch_size)
+            query = query_template % (batch_limit, offset)
+
+            logger.info(f"Fetching Wikidata people batch at offset {offset}...")
+
+            try:
+                results = self._execute_sparql(query)
+            except Exception as e:
+                logger.error(f"SPARQL query failed at offset {offset}: {e}")
+                break
+
+            bindings = results.get("results", {}).get("bindings", [])
+
+            if not bindings:
+                logger.info("No more results from Wikidata")
+                break
+
+            batch_count = 0
+            for binding in bindings:
+                if limit and total_count >= limit:
+                    break
+
+                record = self._parse_binding(binding, person_type=person_type)
+                if record and record.source_id not in seen_ids:
+                    seen_ids.add(record.source_id)
+                    total_count += 1
+                    batch_count += 1
+                    yield record
+
+            logger.info(f"Processed {batch_count} people from batch (total: {total_count})")
+
+            if len(bindings) < batch_limit:
+                # Last batch
+                break
+
+            offset += self._batch_size
+
+            # Be polite to the endpoint
+            if self._delay > 0:
+                time.sleep(self._delay)
+
+        logger.info(f"Completed Wikidata people import: {total_count} records")
+
+    def _import_all_types(self, limit: Optional[int]) -> Iterator[PersonRecord]:
+        """Import from all query types sequentially, deduplicating across types."""
+        seen_ids: set[str] = set()
+        total_count = 0
+
+        # Calculate per-type limits if a total limit is set
+        num_types = len(QUERY_TYPES)
+        per_type_limit = limit // num_types if limit else None
+
+        for query_type in QUERY_TYPES:
+            logger.info(f"=== Importing people: {query_type} ===")
+            type_count = 0
+
+            for record in self.import_from_sparql(limit=per_type_limit, query_type=query_type):
+                if record.source_id not in seen_ids:
+                    seen_ids.add(record.source_id)
+                    total_count += 1
+                    type_count += 1
+                    yield record
+
+                if limit and total_count >= limit:
+                    logger.info(f"Reached total limit of {limit} records")
+                    return
+
+            logger.info(f"Got {type_count} new records from {query_type} (total: {total_count})")
+
+        logger.info(f"Completed all query types: {total_count} total people records")
+
+    def _execute_sparql(self, query: str) -> dict[str, Any]:
+        """Execute a SPARQL query against Wikidata."""
+        params = urllib.parse.urlencode({
+            "query": query,
+            "format": "json",
+        })
+
+        url = f"{WIKIDATA_SPARQL_URL}?{params}"
+
+        req = urllib.request.Request(
+            url,
+            headers={
+                "Accept": "application/sparql-results+json",
+                "User-Agent": "corp-extractor/1.0 (person database builder)",
+            }
+        )
+
+        with urllib.request.urlopen(req, timeout=self._timeout) as response:
+            return json.loads(response.read().decode("utf-8"))
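`_execute_sparql` issues a GET with the URL-encoded query and returns the standard SPARQL 1.1 JSON results document. The shape that `import_from_sparql` reads via `results["results"]["bindings"]` looks roughly like this (abridged, with illustrative values):

```python
# Abridged, illustrative shape of the JSON document _execute_sparql returns.
example_results = {
    "head": {"vars": ["person", "personLabel", "countryLabel", "roleLabel", "orgLabel", "description"]},
    "results": {
        "bindings": [
            {
                "person": {"type": "uri", "value": "http://www.wikidata.org/entity/Q312"},
                "personLabel": {"type": "literal", "value": "Example Person"},
                "roleLabel": {"type": "literal", "value": "chief executive officer"},
            }
        ]
    },
}
bindings = example_results.get("results", {}).get("bindings", [])
```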
+
+    def _parse_binding(
+        self,
+        binding: dict[str, Any],
+        person_type: PersonType = PersonType.UNKNOWN,
+    ) -> Optional[PersonRecord]:
+        """Parse a SPARQL result binding into a PersonRecord."""
+        try:
+            # Get Wikidata entity ID
+            person_uri = binding.get("person", {}).get("value", "")
+            if not person_uri:
+                return None
+
+            # Extract QID from URI (e.g., "http://www.wikidata.org/entity/Q312" -> "Q312")
+            wikidata_id = person_uri.split("/")[-1]
+            if not wikidata_id.startswith("Q"):
+                return None
+
+            # Get label
+            label = binding.get("personLabel", {}).get("value", "")
+            if not label or label == wikidata_id:  # Skip if no English label
+                return None
+
+            # Get optional fields
+            country = binding.get("countryLabel", {}).get("value", "")
+            role = binding.get("roleLabel", {}).get("value", "")
+            org = binding.get("orgLabel", {}).get("value", "")
+            description = binding.get("description", {}).get("value", "")
+
+            # Clean up role and org (remove QID if it's the same as the label)
+            if role and role.startswith("Q"):
+                role = ""
+            if org and org.startswith("Q"):
+                org = ""
+
+            # Build record data
+            record_data: dict[str, Any] = {
+                "wikidata_id": wikidata_id,
+                "label": label,
+            }
+            if country:
+                record_data["country"] = country
+            if role:
+                record_data["role"] = role
+            if org:
+                record_data["org"] = org
+            if description:
+                record_data["description"] = description
+
+            return PersonRecord(
+                name=label.strip(),
+                source="wikidata",
+                source_id=wikidata_id,
+                country=country or "",
+                person_type=person_type,
+                known_for_role=role or "",
+                known_for_org=org or "",
+                record=record_data,
+            )
+
+        except Exception as e:
+            logger.debug(f"Failed to parse Wikidata binding: {e}")
+            return None
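Tying the two together, a sketch of what `_parse_binding` produces for a single binding; all values are illustrative:

```python
# Sketch: one illustrative binding parsed into a PersonRecord.
importer = WikidataPeopleImporter()
binding = {
    "person": {"value": "http://www.wikidata.org/entity/Q312"},
    "personLabel": {"value": "Example Person"},
    "countryLabel": {"value": "United States of America"},
    "roleLabel": {"value": "chief executive officer"},
    "orgLabel": {"value": "Example Corp"},
    "description": {"value": "business executive"},
}
record = importer._parse_binding(binding, person_type=PersonType.EXECUTIVE)
# record.source == "wikidata", record.source_id == "Q312",
# record.known_for_role == "chief executive officer", record.known_for_org == "Example Corp"
```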
+
+    def search_person(self, name: str, limit: int = 10) -> list[PersonRecord]:
+        """
+        Search for a specific person by name.
+
+        Args:
+            name: Person name to search for
+            limit: Maximum results to return
+
+        Returns:
+            List of matching PersonRecords
+        """
+        # Use Wikidata search API for better name matching
+        search_url = "https://www.wikidata.org/w/api.php"
+        params = urllib.parse.urlencode({
+            "action": "wbsearchentities",
+            "search": name,
+            "language": "en",
+            "type": "item",
+            "limit": limit,
+            "format": "json",
+        })
+
+        req = urllib.request.Request(
+            f"{search_url}?{params}",
+            headers={"User-Agent": "corp-extractor/1.0"}
+        )
+
+        with urllib.request.urlopen(req, timeout=30) as response:
+            data = json.loads(response.read().decode("utf-8"))
+
+        results = []
+        for item in data.get("search", []):
+            qid = item.get("id")
+            label = item.get("label", "")
+            description = item.get("description", "")
+
+            # Check if it looks like a person
+            person_keywords = [
+                "politician", "actor", "actress", "singer", "musician",
+                "businessman", "businesswoman", "ceo", "executive", "director",
+                "president", "founder", "professor", "scientist", "author",
+                "writer", "journalist", "athlete", "player", "coach",
+            ]
+            description_lower = description.lower()
+            is_person = any(kw in description_lower for kw in person_keywords)
+            if not is_person:
+                continue
+
+            # Try to infer person type from description
+            person_type = PersonType.UNKNOWN
+            if any(kw in description_lower for kw in ["ceo", "executive", "businessman", "businesswoman"]):
+                person_type = PersonType.EXECUTIVE
+            elif any(kw in description_lower for kw in ["politician", "president", "senator", "minister"]):
+                person_type = PersonType.POLITICIAN
+            elif any(kw in description_lower for kw in ["athlete", "player", "coach"]):
+                person_type = PersonType.ATHLETE
+            elif any(kw in description_lower for kw in ["actor", "actress", "singer", "musician", "director"]):
+                person_type = PersonType.ARTIST
+            elif any(kw in description_lower for kw in ["professor", "academic"]):
+                person_type = PersonType.ACADEMIC
+            elif any(kw in description_lower for kw in ["scientist", "researcher"]):
+                person_type = PersonType.SCIENTIST
+            elif any(kw in description_lower for kw in ["journalist", "reporter"]):
+                person_type = PersonType.JOURNALIST
+            elif any(kw in description_lower for kw in ["founder", "entrepreneur"]):
+                person_type = PersonType.ENTREPRENEUR
+
+            record = PersonRecord(
+                name=label,
+                source="wikidata",
+                source_id=qid,
+                country="",  # Not available from search API
+                person_type=person_type,
+                known_for_role="",
+                known_for_org="",
+                record={
+                    "wikidata_id": qid,
+                    "label": label,
+                    "description": description,
+                },
+            )
+            results.append(record)
+
+        return results