corp-extractor 0.9.0__py3-none-any.whl → 0.9.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +72 -11
  2. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +34 -27
  3. statement_extractor/cli.py +1317 -101
  4. statement_extractor/database/embeddings.py +45 -0
  5. statement_extractor/database/hub.py +86 -136
  6. statement_extractor/database/importers/__init__.py +10 -2
  7. statement_extractor/database/importers/companies_house.py +16 -2
  8. statement_extractor/database/importers/companies_house_officers.py +431 -0
  9. statement_extractor/database/importers/gleif.py +23 -0
  10. statement_extractor/database/importers/import_utils.py +264 -0
  11. statement_extractor/database/importers/sec_edgar.py +17 -0
  12. statement_extractor/database/importers/sec_form4.py +512 -0
  13. statement_extractor/database/importers/wikidata.py +151 -43
  14. statement_extractor/database/importers/wikidata_dump.py +2282 -0
  15. statement_extractor/database/importers/wikidata_people.py +867 -325
  16. statement_extractor/database/migrate_v2.py +852 -0
  17. statement_extractor/database/models.py +155 -7
  18. statement_extractor/database/schema_v2.py +409 -0
  19. statement_extractor/database/seed_data.py +359 -0
  20. statement_extractor/database/store.py +3449 -233
  21. statement_extractor/document/deduplicator.py +10 -12
  22. statement_extractor/extractor.py +1 -1
  23. statement_extractor/models/__init__.py +3 -2
  24. statement_extractor/models/statement.py +15 -17
  25. statement_extractor/models.py +1 -1
  26. statement_extractor/pipeline/context.py +5 -5
  27. statement_extractor/pipeline/orchestrator.py +12 -12
  28. statement_extractor/plugins/base.py +17 -17
  29. statement_extractor/plugins/extractors/gliner2.py +28 -28
  30. statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
  31. statement_extractor/plugins/qualifiers/person.py +120 -53
  32. statement_extractor/plugins/splitters/t5_gemma.py +35 -39
  33. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
  34. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
@@ -4,6 +4,10 @@ Wikidata importer for the person database.
4
4
  Imports notable people data from Wikidata using SPARQL queries
5
5
  into the embedding database for person name matching.
6
6
 
7
+ Uses a two-phase approach for reliability:
8
+ 1. Bulk fetch: Simple queries to get QID + name + country (fast, no timeouts)
9
+ 2. Enrich: Targeted per-person queries for role/org/dates (resumable)
10
+
7
11
  Notable people are those with English Wikipedia articles, ensuring
8
12
  a basic level of notability.
9
13
 
@@ -27,275 +31,207 @@ import urllib.parse
27
31
  import urllib.request
28
32
  from typing import Any, Iterator, Optional
29
33
 
30
- from ..models import PersonRecord, PersonType
34
+ from ..models import CompanyRecord, EntityType, PersonRecord, PersonType
31
35
 
32
36
  logger = logging.getLogger(__name__)
33
37
 
34
38
  # Wikidata SPARQL endpoint
35
39
  WIKIDATA_SPARQL_URL = "https://query.wikidata.org/sparql"
36
40
 
37
- # Base query template for people with Wikipedia articles
38
- # Gets person, their position/role, and organization
39
- PERSON_BASE_QUERY = """
40
- SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {{
41
- ?person wdt:P31 wd:Q5 . # Instance of human
42
-
43
- # Filter condition specific to query type
44
- {filter_condition}
45
-
46
- # Get country of citizenship
47
- OPTIONAL {{ ?person wdt:P27 ?country. }}
48
-
49
- # Get position held and associated organization
50
- OPTIONAL {{
51
- ?person p:P39 ?positionStatement .
52
- ?positionStatement ps:P39 ?role .
53
- OPTIONAL {{ ?positionStatement pq:P642 ?org }} # "of" qualifier
54
- }}
55
-
56
- # Fallback: direct employer
57
- OPTIONAL {{ ?person wdt:P108 ?employer. BIND(?employer AS ?org) }}
58
-
59
- # Get description
41
+ # =============================================================================
42
+ # BULK QUERIES - Simple, fast queries for initial import (no role/org/dates)
43
+ # Uses rdfs:label instead of SERVICE wikibase:label for better performance
44
+ # Each query targets a single role/occupation for speed
45
+ # =============================================================================
46
+
47
+ # Template for position-held queries (P39) - for executives, politicians
48
+ # Matches people who held a position that IS the role, or is an INSTANCE OF the role
49
+ # {role_qid} = single role QID, {seed} = unique seed, {limit} = batch limit
50
+ POSITION_QUERY_TEMPLATE = """
51
+ SELECT DISTINCT ?person ?personLabel ?countryLabel ?description WHERE {{
52
+ ?person wdt:P31 wd:Q5 .
53
+ ?person wdt:P39 ?position .
54
+ {{ ?position wdt:P31 wd:{role_qid} . }} UNION {{ VALUES ?position {{ wd:{role_qid} }} }}
55
+ ?person rdfs:label ?personLabel FILTER(LANG(?personLabel) = "en") .
56
+ OPTIONAL {{ ?person wdt:P27 ?country . ?country rdfs:label ?countryLabel FILTER(LANG(?countryLabel) = "en") . }}
60
57
  OPTIONAL {{ ?person schema:description ?description FILTER(LANG(?description) = "en") }}
61
-
62
- # Must have English Wikipedia article (notability filter)
63
58
  ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
64
-
65
- SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
66
59
  }}
67
- LIMIT %d
68
- OFFSET %d
69
- """
70
-
71
- # Query for business executives (CEOs, CFOs, board members, etc.) - P39 = executive positions
72
- EXECUTIVE_QUERY = """
73
- SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
74
- ?person wdt:P31 wd:Q5 . # Instance of human
75
-
76
- # Has held executive position
77
- ?person p:P39 ?positionStatement .
78
- ?positionStatement ps:P39 ?role .
79
-
80
- # Role is a type of corporate officer, board member, or executive
81
- VALUES ?role {
82
- # C-Suite
83
- wd:Q484876 # CEO (Chief Executive Officer)
84
- wd:Q623279 # CFO (Chief Financial Officer)
85
- wd:Q1502675 # CTO (Chief Technology Officer)
86
- wd:Q935019 # COO (Chief Operating Officer)
87
- wd:Q1057716 # CMO (Chief Marketing Officer)
88
- wd:Q2140589 # CIO (Chief Information Officer)
89
- wd:Q1115042 # Chief Human Resources Officer
90
- wd:Q4720025 # Chief Legal Officer / General Counsel
91
- wd:Q60432825 # Chief Product Officer
92
- wd:Q15967139 # Chief Strategy Officer
93
- wd:Q15729310 # Chief Revenue Officer
94
- wd:Q47523568 # Chief Digital Officer
95
-
96
- # Board positions
97
- wd:Q258557 # Chairman / Chairman of the Board
98
- wd:Q114863313 # Vice Chairman
99
- wd:Q726114 # President (business)
100
- wd:Q1372944 # Vice President
101
- wd:Q18918145 # Executive Vice President
102
- wd:Q1057569 # Board of directors member
103
- wd:Q24058752 # Non-executive director
104
- wd:Q3578048 # Independent director
105
-
106
- # Other executive roles
107
- wd:Q476675 # Managing Director
108
- wd:Q5441744 # Executive Director
109
- wd:Q4188234 # General Manager
110
- wd:Q38844673 # Group CEO
111
- wd:Q97273203 # President and CEO
112
- wd:Q60715311 # Chairman and CEO
113
- wd:Q3563879 # Partner (business)
114
- wd:Q3505845 # Senior Partner
115
- }
116
-
117
- OPTIONAL { ?positionStatement pq:P642 ?org } # "of" qualifier
118
- OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
119
- OPTIONAL { ?person wdt:P27 ?country. }
120
- OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
121
-
122
- ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
123
- SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
124
- }
125
- LIMIT %d
126
- OFFSET %d
127
- """
128
-
129
- # Query for politicians
130
- POLITICIAN_QUERY = """
131
- SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
132
- ?person wdt:P31 wd:Q5 . # Instance of human
133
-
134
- # Occupation is politician
135
- ?person wdt:P106 wd:Q82955 .
136
-
137
- OPTIONAL {
138
- ?person p:P39 ?positionStatement .
139
- ?positionStatement ps:P39 ?role .
140
- OPTIONAL { ?positionStatement pq:P642 ?org }
141
- }
142
- OPTIONAL { ?person wdt:P27 ?country. }
143
- OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
144
-
145
- ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
146
- SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
147
- }
148
- LIMIT %d
149
- OFFSET %d
150
- """
151
-
152
- # Query for athletes
153
- ATHLETE_QUERY = """
154
- SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
155
- ?person wdt:P31 wd:Q5 . # Instance of human
156
-
157
- # Is an athlete (has sports team membership P54 or is athlete P106)
158
- { ?person wdt:P106 wd:Q2066131 . } # Athlete occupation
159
- UNION
160
- { ?person wdt:P54 ?team . } # Member of sports team
161
-
162
- OPTIONAL { ?person wdt:P54 ?team . BIND(?team AS ?org) }
163
- OPTIONAL { ?person wdt:P27 ?country. }
164
- OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
165
-
166
- ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
167
- SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
168
- }
169
- LIMIT %d
170
- OFFSET %d
171
- """
172
-
173
- # Query for artists (actors, musicians, directors)
174
- ARTIST_QUERY = """
175
- SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
176
- ?person wdt:P31 wd:Q5 . # Instance of human
177
-
178
- # Has artist occupation
179
- ?person wdt:P106 ?occupation .
180
- VALUES ?occupation {
181
- wd:Q33999 # Actor
182
- wd:Q177220 # Singer
183
- wd:Q639669 # Musician
184
- wd:Q2526255 # Film director
185
- wd:Q36180 # Writer
186
- wd:Q483501 # Artist
187
- }
188
-
189
- OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
190
- OPTIONAL { ?person wdt:P27 ?country. }
191
- OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
192
-
193
- ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
194
- SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
195
- }
196
- LIMIT %d
197
- OFFSET %d
198
- """
199
-
200
- # Query for academics (professors)
201
- ACADEMIC_QUERY = """
202
- SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
203
- ?person wdt:P31 wd:Q5 . # Instance of human
204
-
205
- # Is a professor or academic
206
- { ?person wdt:P106 wd:Q121594 . } # Professor
207
- UNION
208
- { ?person wdt:P106 wd:Q3400985 . } # Academic
209
-
210
- OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
211
- OPTIONAL { ?person wdt:P27 ?country. }
212
- OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
213
-
214
- ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
215
- SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
216
- }
217
- LIMIT %d
218
- OFFSET %d
60
+ ORDER BY MD5(CONCAT(STR(?person), "{seed}"))
61
+ LIMIT {limit}
219
62
  """
220
63
 
221
- # Query for scientists
222
- SCIENTIST_QUERY = """
223
- SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
224
- ?person wdt:P31 wd:Q5 . # Instance of human
225
-
226
- # Is a scientist or researcher
227
- { ?person wdt:P106 wd:Q901 . } # Scientist
228
- UNION
229
- { ?person wdt:P106 wd:Q1650915 . } # Researcher
230
-
231
- OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
232
- OPTIONAL { ?person wdt:P27 ?country. }
233
- OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
234
-
235
- ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
236
- SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
237
- }
238
- LIMIT %d
239
- OFFSET %d
240
- """
241
-
242
- # Query for journalists and media personalities
243
- JOURNALIST_QUERY = """
244
- SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
245
- ?person wdt:P31 wd:Q5 . # Instance of human
246
-
247
- # Is a journalist or presenter
248
- { ?person wdt:P106 wd:Q1930187 . } # Journalist
249
- UNION
250
- { ?person wdt:P106 wd:Q13590141 . } # Television presenter
251
-
252
- OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
253
- OPTIONAL { ?person wdt:P27 ?country. }
254
- OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
255
-
64
+ # Template for occupation queries (P106) - for athletes, artists, etc.
65
+ # {occupation_qid} = single occupation QID, {seed} = unique seed, {limit} = batch limit
66
+ OCCUPATION_QUERY_TEMPLATE = """
67
+ SELECT DISTINCT ?person ?personLabel ?countryLabel ?description WHERE {{
68
+ ?person wdt:P31 wd:Q5 .
69
+ ?person wdt:P106 wd:{occupation_qid} .
70
+ ?person rdfs:label ?personLabel FILTER(LANG(?personLabel) = "en") .
71
+ OPTIONAL {{ ?person wdt:P27 ?country . ?country rdfs:label ?countryLabel FILTER(LANG(?countryLabel) = "en") . }}
72
+ OPTIONAL {{ ?person schema:description ?description FILTER(LANG(?description) = "en") }}
256
73
  ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
257
- SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
258
- }
259
- LIMIT %d
260
- OFFSET %d
74
+ }}
75
+ ORDER BY MD5(CONCAT(STR(?person), "{seed}"))
76
+ LIMIT {limit}
261
77
  """
262
78
 
263
- # Query for entrepreneurs (founders)
264
- ENTREPRENEUR_QUERY = """
265
- SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
266
- ?person wdt:P31 wd:Q5 . # Instance of human
267
-
268
- # Founded a company (inverse of P112)
79
+ # Template for founder queries (P112) - for entrepreneurs
80
+ # {seed} = unique seed, {limit} = batch limit
81
+ FOUNDER_QUERY_TEMPLATE = """
82
+ SELECT DISTINCT ?person ?personLabel ?countryLabel ?description WHERE {{
83
+ ?person wdt:P31 wd:Q5 .
269
84
  ?org wdt:P112 ?person .
270
-
271
- OPTIONAL { ?person wdt:P27 ?country. }
272
- OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
273
-
85
+ ?person rdfs:label ?personLabel FILTER(LANG(?personLabel) = "en") .
86
+ OPTIONAL {{ ?person wdt:P27 ?country . ?country rdfs:label ?countryLabel FILTER(LANG(?countryLabel) = "en") . }}
87
+ OPTIONAL {{ ?person schema:description ?description FILTER(LANG(?description) = "en") }}
274
88
  ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
275
- SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
276
- }
277
- LIMIT %d
278
- OFFSET %d
89
+ }}
90
+ ORDER BY MD5(CONCAT(STR(?person), "{seed}"))
91
+ LIMIT {limit}
279
92
  """
280
93
 
281
- # Query for activists
282
- ACTIVIST_QUERY = """
283
- SELECT DISTINCT ?person ?personLabel ?countryLabel ?roleLabel ?orgLabel ?description WHERE {
284
- ?person wdt:P31 wd:Q5 . # Instance of human
285
-
286
- # Is an activist
287
- ?person wdt:P106 wd:Q15253558 . # Activist
288
-
289
- OPTIONAL { ?person wdt:P108 ?employer. BIND(?employer AS ?org) }
290
- OPTIONAL { ?person wdt:P27 ?country. }
291
- OPTIONAL { ?person schema:description ?description FILTER(LANG(?description) = "en") }
292
-
293
- ?article schema:about ?person ; schema:isPartOf <https://en.wikipedia.org/> .
294
- SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
94
+ # Role QIDs for executives (position held - P39)
95
+ EXECUTIVE_ROLES = [
96
+ "Q484876", # CEO
97
+ "Q623279", # CFO
98
+ "Q1502675", # COO
99
+ "Q935019", # CTO
100
+ "Q1057716", # CIO
101
+ "Q2140589", # CMO
102
+ "Q1115042", # chairperson
103
+ "Q4720025", # board of directors member
104
+ "Q60432825", # chief human resources officer
105
+ "Q15967139", # chief compliance officer
106
+ "Q15729310", # chief risk officer
107
+ "Q47523568", # chief legal officer
108
+ "Q258557", # board chair
109
+ "Q114863313", # chief sustainability officer
110
+ "Q726114", # company president
111
+ "Q1372944", # managing director
112
+ "Q18918145", # chief commercial officer
113
+ "Q1057569", # chief strategy officer
114
+ "Q24058752", # chief product officer
115
+ "Q3578048", # vice president
116
+ "Q476675", # business executive (generic)
117
+ "Q5441744", # finance director
118
+ "Q4188234", # general manager
119
+ "Q38844673", # chief data officer
120
+ "Q97273203", # chief digital officer
121
+ "Q60715311", # chief growth officer
122
+ "Q3563879", # treasurer
123
+ "Q3505845", # corporate secretary
124
+ ]
125
+
126
+ # Role QIDs for politicians (position held - P39)
127
+ POLITICIAN_ROLES = [
128
+ "Q30461", # president
129
+ "Q14212", # prime minister
130
+ "Q83307", # minister
131
+ "Q2285706", # head of government
132
+ "Q4175034", # legislator
133
+ "Q486839", # member of parliament
134
+ "Q193391", # member of national legislature
135
+ "Q212071", # mayor
136
+ "Q382617", # governor
137
+ "Q116", # monarch
138
+ "Q484529", # member of congress
139
+ ]
140
+
141
+ # Note: Politicians with generic position types (like "public office") may not be found
142
+ # because querying all public office holders times out. This includes some mayors
143
+ # whose positions are typed as "public office" rather than "mayor".
144
+
145
+ # Occupation QIDs for athletes (P106)
146
+ ATHLETE_OCCUPATIONS = [
147
+ "Q2066131", # athlete
148
+ "Q937857", # football player
149
+ "Q3665646", # basketball player
150
+ "Q10871364", # baseball player
151
+ "Q19204627", # ice hockey player
152
+ "Q10843402", # tennis player
153
+ "Q13381376", # golfer
154
+ "Q11338576", # boxer
155
+ "Q10873124", # swimmer
156
+ ]
157
+
158
+ # Occupation QIDs for artists (P106)
159
+ ARTIST_OCCUPATIONS = [
160
+ "Q33999", # actor
161
+ "Q177220", # singer
162
+ "Q639669", # musician
163
+ "Q2526255", # film director
164
+ "Q36180", # writer
165
+ "Q483501", # artist
166
+ "Q488205", # singer-songwriter
167
+ "Q753110", # songwriter
168
+ "Q2405480", # voice actor
169
+ "Q10800557", # film actor
170
+ ]
171
+
172
+ # Occupation QIDs for academics (P106)
173
+ ACADEMIC_OCCUPATIONS = [
174
+ "Q121594", # professor
175
+ "Q3400985", # academic
176
+ "Q1622272", # university professor
177
+ ]
178
+
179
+ # Occupation QIDs for scientists (P106)
180
+ SCIENTIST_OCCUPATIONS = [
181
+ "Q901", # scientist
182
+ "Q1650915", # researcher
183
+ "Q169470", # physicist
184
+ "Q593644", # chemist
185
+ "Q864503", # biologist
186
+ "Q11063", # astronomer
187
+ ]
188
+
189
+ # Occupation QIDs for journalists (P106)
190
+ JOURNALIST_OCCUPATIONS = [
191
+ "Q1930187", # journalist
192
+ "Q13590141", # news presenter
193
+ "Q947873", # television presenter
194
+ "Q4263842", # columnist
195
+ ]
196
+
197
+ # Occupation QIDs for activists (P106)
198
+ ACTIVIST_OCCUPATIONS = [
199
+ "Q15253558", # activist
200
+ "Q11631410", # human rights activist
201
+ "Q18939491", # environmental activist
202
+ ]
203
+
204
+ # Mapping query type to role/occupation lists and query template type
205
+ # Each entry can have multiple query groups to combine different approaches
206
+ QUERY_TYPE_CONFIG: dict[str, list[dict]] = {
207
+ "executive": [
208
+ {"template": "position", "items": EXECUTIVE_ROLES},
209
+ ],
210
+ "politician": [
211
+ {"template": "position", "items": POLITICIAN_ROLES},
212
+ ],
213
+ "athlete": [
214
+ {"template": "occupation", "items": ATHLETE_OCCUPATIONS},
215
+ ],
216
+ "artist": [
217
+ {"template": "occupation", "items": ARTIST_OCCUPATIONS},
218
+ ],
219
+ "academic": [
220
+ {"template": "occupation", "items": ACADEMIC_OCCUPATIONS},
221
+ ],
222
+ "scientist": [
223
+ {"template": "occupation", "items": SCIENTIST_OCCUPATIONS},
224
+ ],
225
+ "journalist": [
226
+ {"template": "occupation", "items": JOURNALIST_OCCUPATIONS},
227
+ ],
228
+ "activist": [
229
+ {"template": "occupation", "items": ACTIVIST_OCCUPATIONS},
230
+ ],
231
+ "entrepreneur": [
232
+ {"template": "founder", "items": []}, # No items, uses special template
233
+ ],
295
234
  }
296
- LIMIT %d
297
- OFFSET %d
298
- """
299
235
 
300
236
  # Mapping query type to PersonType
301
237
  QUERY_TYPE_TO_PERSON_TYPE: dict[str, PersonType] = {
@@ -310,19 +246,6 @@ QUERY_TYPE_TO_PERSON_TYPE: dict[str, PersonType] = {
310
246
  "activist": PersonType.ACTIVIST,
311
247
  }
312
248
 
313
- # Mapping query type to SPARQL query template
314
- QUERY_TYPES: dict[str, str] = {
315
- "executive": EXECUTIVE_QUERY,
316
- "politician": POLITICIAN_QUERY,
317
- "athlete": ATHLETE_QUERY,
318
- "artist": ARTIST_QUERY,
319
- "academic": ACADEMIC_QUERY,
320
- "scientist": SCIENTIST_QUERY,
321
- "journalist": JOURNALIST_QUERY,
322
- "entrepreneur": ENTREPRENEUR_QUERY,
323
- "activist": ACTIVIST_QUERY,
324
- }
325
-
326
249
 
327
250
  class WikidataPeopleImporter:
328
251
  """
@@ -343,126 +266,311 @@ class WikidataPeopleImporter:
343
266
  - activist: Activists and advocates
344
267
  """
345
268
 
346
- def __init__(self, batch_size: int = 1000, delay_seconds: float = 2.0, timeout: int = 120):
269
+ def __init__(
270
+ self,
271
+ batch_size: int = 5000,
272
+ delay_seconds: float = 2.0,
273
+ timeout: int = 120,
274
+ max_retries: int = 3,
275
+ min_batch_size: int = 50,
276
+ ):
347
277
  """
348
278
  Initialize the Wikidata people importer.
349
279
 
350
280
  Args:
351
- batch_size: Number of records to fetch per SPARQL query (default 1000)
281
+ batch_size: Number of records to fetch per SPARQL query (default 5000)
352
282
  delay_seconds: Delay between requests to be polite to the endpoint
353
283
  timeout: HTTP timeout in seconds (default 120)
284
+ max_retries: Maximum retries per batch on timeout (default 3)
285
+ min_batch_size: Minimum batch size before giving up (default 50)
354
286
  """
355
287
  self._batch_size = batch_size
356
288
  self._delay = delay_seconds
357
289
  self._timeout = timeout
290
+ self._max_retries = max_retries
291
+ self._min_batch_size = min_batch_size
292
+ # Track discovered organizations: org_qid -> org_label
293
+ self._discovered_orgs: dict[str, str] = {}
358
294
 
359
295
  def import_from_sparql(
360
296
  self,
361
297
  limit: Optional[int] = None,
362
298
  query_type: str = "executive",
363
299
  import_all: bool = False,
300
+ convergence_threshold: int = 5,
364
301
  ) -> Iterator[PersonRecord]:
365
302
  """
366
- Import person records from Wikidata via SPARQL.
303
+ Import person records from Wikidata via SPARQL (bulk fetch phase).
304
+
305
+ This performs the fast bulk import with minimal data (QID, name, country).
306
+ Use enrich_people_batch() afterwards to add role/org/dates.
307
+
308
+ Iterates through each role/occupation individually for faster queries,
309
+ using random sampling with convergence detection per role.
367
310
 
368
311
  Args:
369
312
  limit: Optional limit on total records
370
313
  query_type: Which query to use (executive, politician, athlete, etc.)
371
314
  import_all: If True, run all query types sequentially
315
+ convergence_threshold: Stop after this many consecutive batches with no new records per role
372
316
 
373
317
  Yields:
374
- PersonRecord for each person
318
+ PersonRecord for each person (without role/org - use enrich to add)
375
319
  """
376
320
  if import_all:
377
321
  yield from self._import_all_types(limit)
378
322
  return
379
323
 
380
- if query_type not in QUERY_TYPES:
381
- raise ValueError(f"Unknown query type: {query_type}. Use one of: {list(QUERY_TYPES.keys())}")
324
+ if query_type not in QUERY_TYPE_CONFIG:
325
+ raise ValueError(f"Unknown query type: {query_type}. Use one of: {list(QUERY_TYPE_CONFIG.keys())}")
382
326
 
383
- query_template = QUERY_TYPES[query_type]
327
+ config_groups = QUERY_TYPE_CONFIG[query_type]
384
328
  person_type = QUERY_TYPE_TO_PERSON_TYPE.get(query_type, PersonType.UNKNOWN)
385
- logger.info(f"Starting Wikidata people import via SPARQL (query_type={query_type}, person_type={person_type.value})...")
386
329
 
387
- offset = 0
330
+ logger.info(f"Starting Wikidata bulk import (query_type={query_type}, person_type={person_type.value})...")
331
+
388
332
  total_count = 0
389
- seen_ids = set() # Track seen Wikidata IDs to avoid duplicates
333
+ # Track seen QIDs to deduplicate across all roles
334
+ seen_qids: set[str] = set()
335
+
336
+ # Iterate through each config group (e.g., position queries + occupation queries)
337
+ for config in config_groups:
338
+ if limit and total_count >= limit:
339
+ break
340
+
341
+ template_type = config["template"]
342
+ items = config["items"]
343
+
344
+ # For founder template, run a single query
345
+ if template_type == "founder":
346
+ for record in self._import_single_template(
347
+ template=FOUNDER_QUERY_TEMPLATE,
348
+ template_params={},
349
+ person_type=person_type,
350
+ seen_qids=seen_qids,
351
+ limit=(limit - total_count) if limit else None,
352
+ convergence_threshold=convergence_threshold,
353
+ role_name="founder",
354
+ ):
355
+ total_count += 1
356
+ yield record
357
+ continue
358
+
359
+ # Select the right template
360
+ if template_type == "position":
361
+ template = POSITION_QUERY_TEMPLATE
362
+ param_name = "role_qid"
363
+ else: # occupation
364
+ template = OCCUPATION_QUERY_TEMPLATE
365
+ param_name = "occupation_qid"
366
+
367
+ # Iterate through each role/occupation in this group
368
+ for item_qid in items:
369
+ if limit and total_count >= limit:
370
+ break
371
+
372
+ remaining = (limit - total_count) if limit else None
373
+ role_count = 0
374
+
375
+ for record in self._import_single_template(
376
+ template=template,
377
+ template_params={param_name: item_qid},
378
+ person_type=person_type,
379
+ seen_qids=seen_qids,
380
+ limit=remaining,
381
+ convergence_threshold=convergence_threshold,
382
+ role_name=item_qid,
383
+ ):
384
+ role_count += 1
385
+ total_count += 1
386
+ yield record
387
+
388
+ logger.info(f"Role {item_qid}: {role_count} new (total: {total_count})")
389
+
390
+ logger.info(f"Completed Wikidata bulk import: {total_count} records (use enrich to add role/org)")
391
+
392
+ def _import_single_template(
393
+ self,
394
+ template: str,
395
+ template_params: dict[str, str],
396
+ person_type: PersonType,
397
+ seen_qids: set[str],
398
+ limit: Optional[int],
399
+ convergence_threshold: int,
400
+ role_name: str,
401
+ ) -> Iterator[PersonRecord]:
402
+ """
403
+ Import from a single role/occupation using random sampling with convergence.
404
+
405
+ Args:
406
+ template: SPARQL query template
407
+ template_params: Parameters to format into template (role_qid or occupation_qid)
408
+ person_type: PersonType to assign to records
409
+ seen_qids: Set of already-seen QIDs (shared across roles)
410
+ limit: Optional limit on records from this role
411
+ convergence_threshold: Stop after this many consecutive empty batches
412
+ role_name: Name for logging
413
+
414
+ Yields:
415
+ PersonRecord for each new person found
416
+ """
417
+ batch_num = 0
418
+ total_count = 0
419
+ current_batch_size = self._batch_size
420
+ consecutive_empty_batches = 0
421
+
422
+ logger.info(f"Querying role {role_name}...")
390
423
 
391
424
  while True:
392
425
  if limit and total_count >= limit:
393
426
  break
394
427
 
395
- batch_limit = min(self._batch_size, (limit - total_count) if limit else self._batch_size)
396
- query = query_template % (batch_limit, offset)
428
+ batch_num += 1
429
+ batch_limit = min(current_batch_size, (limit - total_count) if limit else current_batch_size)
397
430
 
398
- logger.info(f"Fetching Wikidata people batch at offset {offset}...")
431
+ # Generate unique seed for this batch
432
+ batch_seed = f"{role_name}_{batch_num}_{int(time.time() * 1000)}"
399
433
 
400
- try:
401
- results = self._execute_sparql(query)
402
- except Exception as e:
403
- logger.error(f"SPARQL query failed at offset {offset}: {e}")
434
+ # Build query
435
+ query = template.format(
436
+ **template_params,
437
+ seed=batch_seed,
438
+ limit=batch_limit,
439
+ )
440
+
441
+ # Execute with retries
442
+ results = None
443
+ retries = 0
444
+ retry_batch_size = batch_limit
445
+
446
+ while retries <= self._max_retries:
447
+ try:
448
+ # Rebuild query with potentially smaller batch size
449
+ if retry_batch_size != batch_limit:
450
+ query = template.format(
451
+ **template_params,
452
+ seed=batch_seed,
453
+ limit=retry_batch_size,
454
+ )
455
+ results = self._execute_sparql(query)
456
+ if retry_batch_size < current_batch_size:
457
+ current_batch_size = retry_batch_size
458
+ break
459
+ except Exception as e:
460
+ is_timeout = "timeout" in str(e).lower() or "504" in str(e) or "503" in str(e)
461
+ if is_timeout and retry_batch_size > self._min_batch_size:
462
+ retries += 1
463
+ retry_batch_size = max(retry_batch_size // 2, self._min_batch_size)
464
+ wait_time = self._delay * (2 ** retries)
465
+ logger.warning(
466
+ f"Timeout on {role_name} batch #{batch_num}, retry {retries}/{self._max_retries} "
467
+ f"with batch_size={retry_batch_size} after {wait_time:.1f}s wait"
468
+ )
469
+ time.sleep(wait_time)
470
+ else:
471
+ logger.error(f"SPARQL query failed on {role_name} batch #{batch_num}: {e}")
472
+ break
473
+
474
+ if results is None:
475
+ logger.warning(f"Giving up on {role_name} after {retries} retries")
404
476
  break
405
477
 
406
478
  bindings = results.get("results", {}).get("bindings", [])
407
479
 
408
480
  if not bindings:
409
- logger.info("No more results from Wikidata")
410
- break
481
+ consecutive_empty_batches += 1
482
+ if consecutive_empty_batches >= convergence_threshold:
483
+ logger.debug(f"Role {role_name}: convergence after {batch_num} batches")
484
+ break
485
+ continue
411
486
 
412
487
  batch_count = 0
413
488
  for binding in bindings:
414
489
  if limit and total_count >= limit:
415
490
  break
416
491
 
417
- record = self._parse_binding(binding, person_type=person_type)
418
- if record and record.source_id not in seen_ids:
419
- seen_ids.add(record.source_id)
420
- total_count += 1
421
- batch_count += 1
422
- yield record
492
+ record, skip_reason = self._parse_bulk_binding(binding, person_type=person_type)
493
+ if record is None:
494
+ continue
423
495
 
424
- logger.info(f"Processed {batch_count} people from batch (total: {total_count})")
496
+ # Deduplicate
497
+ if record.source_id in seen_qids:
498
+ continue
425
499
 
426
- if len(bindings) < batch_limit:
427
- # Last batch
428
- break
500
+ seen_qids.add(record.source_id)
501
+ total_count += 1
502
+ batch_count += 1
503
+ yield record
429
504
 
430
- offset += self._batch_size
505
+ # Check convergence
506
+ if batch_count == 0:
507
+ consecutive_empty_batches += 1
508
+ if consecutive_empty_batches >= convergence_threshold:
509
+ logger.debug(f"Role {role_name}: convergence after {batch_num} batches")
510
+ break
511
+ else:
512
+ consecutive_empty_batches = 0
431
513
 
432
- # Be polite to the endpoint
514
+ # Rate limit
433
515
  if self._delay > 0:
434
516
  time.sleep(self._delay)
435
517
 
436
- logger.info(f"Completed Wikidata people import: {total_count} records")
437
-
438
518
  def _import_all_types(self, limit: Optional[int]) -> Iterator[PersonRecord]:
439
519
  """Import from all query types sequentially, deduplicating across types."""
440
- seen_ids: set[str] = set()
520
+ # Track seen QIDs across all types
521
+ seen_qids: set[str] = set()
441
522
  total_count = 0
442
523
 
443
524
  # Calculate per-type limits if a total limit is set
444
- num_types = len(QUERY_TYPES)
525
+ num_types = len(QUERY_TYPE_CONFIG)
445
526
  per_type_limit = limit // num_types if limit else None
446
527
 
447
- for query_type in QUERY_TYPES:
528
+ for query_type in QUERY_TYPE_CONFIG:
448
529
  logger.info(f"=== Importing people: {query_type} ===")
449
530
  type_count = 0
531
+ skipped_count = 0
450
532
 
451
533
  for record in self.import_from_sparql(limit=per_type_limit, query_type=query_type):
452
- if record.source_id not in seen_ids:
453
- seen_ids.add(record.source_id)
454
- total_count += 1
455
- type_count += 1
456
- yield record
534
+ if record.source_id in seen_qids:
535
+ skipped_count += 1
536
+ continue
537
+
538
+ seen_qids.add(record.source_id)
539
+ total_count += 1
540
+ type_count += 1
541
+ yield record
457
542
 
458
- if limit and total_count >= limit:
459
- logger.info(f"Reached total limit of {limit} records")
460
- return
543
+ if limit and total_count >= limit:
544
+ logger.info(f"Reached total limit of {limit} records")
545
+ return
461
546
 
462
- logger.info(f"Got {type_count} new records from {query_type} (total: {total_count})")
547
+ logger.info(
548
+ f"Got {type_count} new from {query_type}, skipped {skipped_count} (total: {total_count})"
549
+ )
463
550
 
464
551
  logger.info(f"Completed all query types: {total_count} total people records")
465
552
 
553
+ @staticmethod
554
+ def _parse_wikidata_date(date_str: str) -> Optional[str]:
555
+ """
556
+ Parse a Wikidata date string into ISO format (YYYY-MM-DD).
557
+
558
+ Wikidata returns dates like "2020-01-15T00:00:00Z" or just "2020".
559
+ Returns None if the date cannot be parsed.
560
+ """
561
+ if not date_str:
562
+ return None
563
+ # Handle ISO datetime format (e.g., "2020-01-15T00:00:00Z")
564
+ if "T" in date_str:
565
+ return date_str.split("T")[0]
566
+ # Handle year-only format (e.g., "2020")
567
+ if len(date_str) == 4 and date_str.isdigit():
568
+ return f"{date_str}-01-01"
569
+ # Return as-is if it looks like a date
570
+ if len(date_str) >= 4:
571
+ return date_str[:10] # Take first 10 chars (YYYY-MM-DD)
572
+ return None
573
+
466
574
  def _execute_sparql(self, query: str) -> dict[str, Any]:
467
575
  """Execute a SPARQL query against Wikidata."""
468
576
  params = urllib.parse.urlencode({
@@ -483,39 +591,126 @@ class WikidataPeopleImporter:
483
591
  with urllib.request.urlopen(req, timeout=self._timeout) as response:
484
592
  return json.loads(response.read().decode("utf-8"))
485
593
 
486
- def _parse_binding(
594
+ def _parse_bulk_binding(
487
595
  self,
488
596
  binding: dict[str, Any],
489
597
  person_type: PersonType = PersonType.UNKNOWN,
490
- ) -> Optional[PersonRecord]:
491
- """Parse a SPARQL result binding into a PersonRecord."""
598
+ ) -> tuple[Optional[PersonRecord], Optional[str]]:
599
+ """
600
+ Parse a bulk SPARQL result binding into a PersonRecord.
601
+
602
+ Bulk bindings only have: person, personLabel, countryLabel, description.
603
+ Role/org/dates are NOT included - use enrich methods to add them later.
604
+
605
+ Returns:
606
+ Tuple of (PersonRecord or None, skip_reason or None)
607
+ """
492
608
  try:
493
609
  # Get Wikidata entity ID
494
610
  person_uri = binding.get("person", {}).get("value", "")
495
611
  if not person_uri:
496
- return None
612
+ return None, "missing person URI"
497
613
 
498
614
  # Extract QID from URI (e.g., "http://www.wikidata.org/entity/Q312" -> "Q312")
499
615
  wikidata_id = person_uri.split("/")[-1]
500
616
  if not wikidata_id.startswith("Q"):
501
- return None
617
+ return None, f"invalid Wikidata ID format: {wikidata_id}"
502
618
 
503
619
  # Get label
504
620
  label = binding.get("personLabel", {}).get("value", "")
505
- if not label or label == wikidata_id: # Skip if no English label
506
- return None
621
+ if not label:
622
+ return None, f"{wikidata_id}: no label"
623
+ if label == wikidata_id:
624
+ return None, f"{wikidata_id}: no English label (label equals QID)"
625
+
626
+ # Get optional fields from bulk query
627
+ country = binding.get("countryLabel", {}).get("value", "")
628
+ description = binding.get("description", {}).get("value", "")
629
+
630
+ # Build minimal record data
631
+ record_data: dict[str, Any] = {
632
+ "wikidata_id": wikidata_id,
633
+ "label": label,
634
+ }
635
+ if country:
636
+ record_data["country"] = country
637
+ if description:
638
+ record_data["description"] = description
639
+
640
+ return PersonRecord(
641
+ name=label.strip(),
642
+ source="wikidata",
643
+ source_id=wikidata_id,
644
+ country=country or "",
645
+ person_type=person_type,
646
+ known_for_role="", # To be enriched later
647
+ known_for_org="", # To be enriched later
648
+ from_date=None, # To be enriched later
649
+ to_date=None, # To be enriched later
650
+ record=record_data,
651
+ ), None
652
+
653
+ except Exception as e:
654
+ return None, f"parse error: {e}"
655
+
656
+ def _parse_binding_with_reason(
657
+ self,
658
+ binding: dict[str, Any],
659
+ person_type: PersonType = PersonType.UNKNOWN,
660
+ ) -> tuple[Optional[PersonRecord], Optional[str]]:
661
+ """
662
+ Parse a SPARQL result binding into a PersonRecord.
663
+
664
+ Returns:
665
+ Tuple of (PersonRecord or None, skip_reason or None)
666
+ """
667
+ try:
668
+ # Get Wikidata entity ID
669
+ person_uri = binding.get("person", {}).get("value", "")
670
+ if not person_uri:
671
+ return None, "missing person URI"
672
+
673
+ # Extract QID from URI (e.g., "http://www.wikidata.org/entity/Q312" -> "Q312")
674
+ wikidata_id = person_uri.split("/")[-1]
675
+ if not wikidata_id.startswith("Q"):
676
+ return None, f"invalid Wikidata ID format: {wikidata_id}"
677
+
678
+ # Get label
679
+ label = binding.get("personLabel", {}).get("value", "")
680
+ if not label:
681
+ return None, f"{wikidata_id}: no label"
682
+ if label == wikidata_id:
683
+ return None, f"{wikidata_id}: no English label (label equals QID)"
507
684
 
508
685
  # Get optional fields
509
686
  country = binding.get("countryLabel", {}).get("value", "")
510
687
  role = binding.get("roleLabel", {}).get("value", "")
511
- org = binding.get("orgLabel", {}).get("value", "")
688
+ org_label = binding.get("orgLabel", {}).get("value", "")
689
+ org_uri = binding.get("org", {}).get("value", "")
512
690
  description = binding.get("description", {}).get("value", "")
513
691
 
514
- # Clean up role and org (remove QID if it's the same as the label)
692
+ # Extract org QID from URI (e.g., "http://www.wikidata.org/entity/Q715583" -> "Q715583")
693
+ org_qid = ""
694
+ if org_uri:
695
+ org_qid = org_uri.split("/")[-1]
696
+ if not org_qid.startswith("Q"):
697
+ org_qid = ""
698
+
699
+ # Get dates (Wikidata returns ISO datetime, extract just the date part)
700
+ start_date_raw = binding.get("startDate", {}).get("value", "")
701
+ end_date_raw = binding.get("endDate", {}).get("value", "")
702
+ from_date = WikidataPeopleImporter._parse_wikidata_date(start_date_raw)
703
+ to_date = WikidataPeopleImporter._parse_wikidata_date(end_date_raw)
704
+
705
+ # Clean up role and org label (remove QID if it's the same as the label)
515
706
  if role and role.startswith("Q"):
516
707
  role = ""
517
- if org and org.startswith("Q"):
518
- org = ""
708
+ if org_label and org_label.startswith("Q"):
709
+ org_label = ""
710
+
711
+ # Track discovered organization if we have both QID and label
712
+ if org_qid and org_label:
713
+ self._discovered_orgs[org_qid] = org_label
519
714
 
520
715
  # Build record data
521
716
  record_data: dict[str, Any] = {
@@ -526,10 +721,16 @@ class WikidataPeopleImporter:
526
721
  record_data["country"] = country
527
722
  if role:
528
723
  record_data["role"] = role
529
- if org:
530
- record_data["org"] = org
724
+ if org_label:
725
+ record_data["org"] = org_label
726
+ if org_qid:
727
+ record_data["org_qid"] = org_qid
531
728
  if description:
532
729
  record_data["description"] = description
730
+ if from_date:
731
+ record_data["from_date"] = from_date
732
+ if to_date:
733
+ record_data["to_date"] = to_date
533
734
 
534
735
  return PersonRecord(
535
736
  name=label.strip(),
@@ -538,13 +739,23 @@ class WikidataPeopleImporter:
538
739
  country=country or "",
539
740
  person_type=person_type,
540
741
  known_for_role=role or "",
541
- known_for_org=org or "",
742
+ known_for_org=org_label or "",
743
+ from_date=from_date,
744
+ to_date=to_date,
542
745
  record=record_data,
543
- )
746
+ ), None
544
747
 
545
748
  except Exception as e:
546
- logger.debug(f"Failed to parse Wikidata binding: {e}")
547
- return None
749
+ return None, f"parse error: {e}"
750
+
751
+ def _parse_binding(
752
+ self,
753
+ binding: dict[str, Any],
754
+ person_type: PersonType = PersonType.UNKNOWN,
755
+ ) -> Optional[PersonRecord]:
756
+ """Parse a SPARQL result binding into a PersonRecord (legacy wrapper)."""
757
+ record, _ = self._parse_binding_with_reason(binding, person_type)
758
+ return record
548
759
 
549
760
  def search_person(self, name: str, limit: int = 10) -> list[PersonRecord]:
550
761
  """
@@ -630,3 +841,334 @@ class WikidataPeopleImporter:
630
841
  results.append(record)
631
842
 
632
843
  return results
844
+
845
+ def get_discovered_organizations(self) -> list[CompanyRecord]:
846
+ """
847
+ Get organizations discovered during the people import.
848
+
849
+ These are organizations associated with people (employers, positions, etc.)
850
+ that can be inserted into the organizations database if not already present.
851
+
852
+ Returns:
853
+ List of CompanyRecord objects for discovered organizations
854
+ """
855
+ records = []
856
+ for org_qid, org_label in self._discovered_orgs.items():
857
+ record = CompanyRecord(
858
+ name=org_label,
859
+ source="wikipedia", # Use "wikipedia" as source per wikidata.py convention
860
+ source_id=org_qid,
861
+ region="", # Not available from this context
862
+ entity_type=EntityType.BUSINESS, # Default to business for orgs linked to people
863
+ record={
864
+ "wikidata_id": org_qid,
865
+ "label": org_label,
866
+ "discovered_from": "people_import",
867
+ },
868
+ )
869
+ records.append(record)
870
+ logger.info(f"Discovered {len(records)} organizations from people import")
871
+ return records
872
+
873
+ def clear_discovered_organizations(self) -> None:
874
+ """Clear the discovered organizations cache."""
875
+ self._discovered_orgs.clear()
876
+
877
+ def enrich_person_dates(self, person_qid: str, role: str = "", org: str = "") -> tuple[Optional[str], Optional[str]]:
878
+ """
879
+ Query Wikidata to get start/end dates for a person's position.
880
+
881
+ Args:
882
+ person_qid: Wikidata QID of the person (e.g., 'Q123')
883
+ role: Optional role label to match (e.g., 'chief executive officer')
884
+ org: Optional org label to match (e.g., 'Apple Inc')
885
+
886
+ Returns:
887
+ Tuple of (from_date, to_date) in ISO format, or (None, None) if not found
888
+ """
889
+ # Query for position dates for this specific person
890
+ # Uses rdfs:label instead of SERVICE wikibase:label for better performance
891
+ query = """
892
+ SELECT ?roleLabel ?orgLabel ?startDate ?endDate WHERE {
893
+ wd:%s p:P39 ?positionStatement .
894
+ ?positionStatement ps:P39 ?role .
895
+ ?role rdfs:label ?roleLabel FILTER(LANG(?roleLabel) = "en") .
896
+ OPTIONAL { ?positionStatement pq:P642 ?org . ?org rdfs:label ?orgLabel FILTER(LANG(?orgLabel) = "en") . }
897
+ OPTIONAL { ?positionStatement pq:P580 ?startDate }
898
+ OPTIONAL { ?positionStatement pq:P582 ?endDate }
899
+ }
900
+ LIMIT 50
901
+ """ % person_qid
902
+
903
+ try:
904
+ url = f"{WIKIDATA_SPARQL_URL}?query={urllib.parse.quote(query)}&format=json"
905
+ req = urllib.request.Request(url, headers={"User-Agent": "corp-extractor/1.0"})
906
+
907
+ with urllib.request.urlopen(req, timeout=30) as response:
908
+ data = json.loads(response.read().decode("utf-8"))
909
+
910
+ # Find the best matching position
911
+ best_start = None
912
+ best_end = None
913
+
914
+ for binding in data.get("results", {}).get("bindings", []):
915
+ role_label = binding.get("roleLabel", {}).get("value", "")
916
+ org_label = binding.get("orgLabel", {}).get("value", "")
917
+ start_raw = binding.get("startDate", {}).get("value", "")
918
+ end_raw = binding.get("endDate", {}).get("value", "")
919
+
920
+ # If role/org specified, try to match
921
+ if role and role.lower() not in role_label.lower():
922
+ continue
923
+ if org and org.lower() not in org_label.lower():
924
+ continue
925
+
926
+ # Parse dates
927
+ start_date = self._parse_wikidata_date(start_raw)
928
+ end_date = self._parse_wikidata_date(end_raw)
929
+
930
+ # Prefer entries with dates
931
+ if start_date or end_date:
932
+ best_start = start_date
933
+ best_end = end_date
934
+ break # Found a match with dates
935
+
936
+ return best_start, best_end
937
+
938
+ except Exception as e:
939
+ logger.debug(f"Failed to enrich dates for {person_qid}: {e}")
940
+ return None, None
941
+
942
+ def enrich_people_batch(
943
+ self,
944
+ people: list[PersonRecord],
945
+ delay_seconds: float = 0.5,
946
+ ) -> int:
947
+ """
948
+ Enrich a batch of people with start/end dates.
949
+
950
+ Args:
951
+ people: List of PersonRecord objects to enrich
952
+ delay_seconds: Delay between requests
953
+
954
+ Returns:
955
+ Number of people enriched with dates
956
+ """
957
+ enriched_count = 0
958
+
959
+ for person in people:
960
+ if person.from_date or person.to_date:
961
+ continue # Already has dates
962
+
963
+ qid = person.source_id
964
+ role = person.known_for_role
965
+ org = person.known_for_org
966
+
967
+ from_date, to_date = self.enrich_person_dates(qid, role, org)
968
+
969
+ if from_date or to_date:
970
+ person.from_date = from_date
971
+ person.to_date = to_date
972
+ enriched_count += 1
973
+ logger.debug(f"Enriched {person.name}: {from_date} - {to_date}")
974
+
975
+ time.sleep(delay_seconds)
976
+
977
+ logger.info(f"Enriched {enriched_count}/{len(people)} people with dates")
978
+ return enriched_count
979
+
980
+ def enrich_person_role_org(
981
+ self, person_qid: str
982
+ ) -> tuple[str, str, str, Optional[str], Optional[str]]:
983
+ """
984
+ Query Wikidata to get role, org, and dates for a person.
985
+
986
+ Args:
987
+ person_qid: Wikidata QID of the person (e.g., 'Q123')
988
+
989
+ Returns:
990
+ Tuple of (role_label, org_label, org_qid, from_date, to_date)
991
+ Empty strings/None if not found
992
+ """
993
+ # Query for position held (P39) with org qualifier and dates
994
+ # Uses rdfs:label instead of SERVICE wikibase:label for better performance
995
+ query = """
996
+ SELECT ?roleLabel ?org ?orgLabel ?startDate ?endDate WHERE {
997
+ wd:%s p:P39 ?stmt .
998
+ ?stmt ps:P39 ?role .
999
+ ?role rdfs:label ?roleLabel FILTER(LANG(?roleLabel) = "en") .
1000
+ OPTIONAL { ?stmt pq:P642 ?org . ?org rdfs:label ?orgLabel FILTER(LANG(?orgLabel) = "en") . }
1001
+ OPTIONAL { ?stmt pq:P580 ?startDate . }
1002
+ OPTIONAL { ?stmt pq:P582 ?endDate . }
1003
+ }
1004
+ LIMIT 5
1005
+ """ % person_qid
1006
+
1007
+ try:
1008
+ url = f"{WIKIDATA_SPARQL_URL}?query={urllib.parse.quote(query)}&format=json"
1009
+ req = urllib.request.Request(url, headers={"User-Agent": "corp-extractor/1.0"})
1010
+
1011
+ with urllib.request.urlopen(req, timeout=30) as response:
1012
+ data = json.loads(response.read().decode("utf-8"))
1013
+
1014
+ bindings = data.get("results", {}).get("bindings", [])
1015
+
1016
+ # Find the best result (prefer one with org and dates)
1017
+ best_result = None
1018
+ for binding in bindings:
1019
+ role_label = binding.get("roleLabel", {}).get("value", "")
1020
+ org_label = binding.get("orgLabel", {}).get("value", "")
1021
+ org_uri = binding.get("org", {}).get("value", "")
1022
+ start_raw = binding.get("startDate", {}).get("value", "")
1023
+ end_raw = binding.get("endDate", {}).get("value", "")
1024
+
1025
+ # Skip if role is just a QID (no label resolved)
1026
+ if role_label and role_label.startswith("Q"):
1027
+ continue
1028
+ if org_label and org_label.startswith("Q"):
1029
+ org_label = ""
1030
+
1031
+ # Extract QID from URI
1032
+ org_qid = ""
1033
+ if org_uri:
1034
+ org_qid = org_uri.split("/")[-1]
1035
+ if not org_qid.startswith("Q"):
1036
+ org_qid = ""
1037
+
1038
+ from_date = self._parse_wikidata_date(start_raw)
1039
+ to_date = self._parse_wikidata_date(end_raw)
1040
+
1041
+ result = (role_label, org_label, org_qid, from_date, to_date)
1042
+
1043
+ # Prefer results with org and dates
1044
+ if org_label and (from_date or to_date):
1045
+ return result
1046
+ elif org_label and best_result is None:
1047
+ best_result = result
1048
+ elif role_label and best_result is None:
1049
+ best_result = result
1050
+
1051
+ if best_result:
1052
+ # If we have a role but no org, try P108 (employer) as fallback
1053
+ role_label, org_label, org_qid, from_date, to_date = best_result
1054
+ if role_label and not org_label:
1055
+ fallback_org, fallback_org_qid = self._get_employer(person_qid)
1056
+ if fallback_org:
1057
+ return role_label, fallback_org, fallback_org_qid, from_date, to_date
1058
+ return best_result
1059
+
1060
+ return "", "", "", None, None
1061
+
1062
+ except Exception as e:
1063
+ logger.debug(f"Failed to enrich role/org for {person_qid}: {e}")
1064
+ return "", "", "", None, None
1065
+
1066
+ def _get_employer(self, person_qid: str) -> tuple[str, str]:
1067
+ """
1068
+ Query P108 (employer) as fallback for org.
1069
+
1070
+ Args:
1071
+ person_qid: Wikidata QID of the person
1072
+
1073
+ Returns:
1074
+ Tuple of (org_label, org_qid) or ("", "") if not found
1075
+ """
1076
+ query = """
1077
+ SELECT ?org ?orgLabel WHERE {
1078
+ wd:%s wdt:P108 ?org .
1079
+ ?org rdfs:label ?orgLabel FILTER(LANG(?orgLabel) = "en") .
1080
+ }
1081
+ LIMIT 1
1082
+ """ % person_qid
1083
+
1084
+ try:
1085
+ url = f"{WIKIDATA_SPARQL_URL}?query={urllib.parse.quote(query)}&format=json"
1086
+ req = urllib.request.Request(url, headers={"User-Agent": "corp-extractor/1.0"})
1087
+
1088
+ with urllib.request.urlopen(req, timeout=15) as response:
1089
+ data = json.loads(response.read().decode("utf-8"))
1090
+
1091
+ bindings = data.get("results", {}).get("bindings", [])
1092
+ if bindings:
1093
+ org_label = bindings[0].get("orgLabel", {}).get("value", "")
1094
+ org_uri = bindings[0].get("org", {}).get("value", "")
1095
+ org_qid = org_uri.split("/")[-1] if org_uri else ""
1096
+ if org_label and not org_label.startswith("Q"):
1097
+ return org_label, org_qid
1098
+
1099
+ except Exception as e:
1100
+ logger.debug(f"Failed to get employer for {person_qid}: {e}")
1101
+
1102
+ return "", ""
1103
+
1104
+ def enrich_people_role_org_batch(
1105
+ self,
1106
+ people: list[PersonRecord],
1107
+ delay_seconds: float = 0.1,
1108
+ max_workers: int = 5,
1109
+ ) -> int:
1110
+ """
1111
+ Enrich a batch of people with role/org/dates data using parallel queries.
1112
+
1113
+ Args:
1114
+ people: List of PersonRecord objects to enrich
1115
+ delay_seconds: Delay between requests (per worker)
1116
+ max_workers: Number of parallel workers (default 5 for Wikidata rate limits)
1117
+
1118
+ Returns:
1119
+ Number of people enriched with role/org
1120
+ """
1121
+ from concurrent.futures import ThreadPoolExecutor, as_completed
1122
+
1123
+ # Filter to people that need enrichment
1124
+ to_enrich = [p for p in people if not p.known_for_role and not p.known_for_org]
1125
+
1126
+ if not to_enrich:
1127
+ logger.info("No people need enrichment")
1128
+ return 0
1129
+
1130
+ enriched_count = 0
1131
+ total = len(to_enrich)
1132
+
1133
+ def enrich_one(person: PersonRecord) -> tuple[PersonRecord, bool]:
1134
+ """Enrich a single person, returns (person, success)."""
1135
+ try:
1136
+ role, org, org_qid, from_date, to_date = self.enrich_person_role_org(person.source_id)
1137
+
1138
+ if role or org:
1139
+ person.known_for_role = role
1140
+ person.known_for_org = org
1141
+ if org_qid:
1142
+ person.record["org_qid"] = org_qid
1143
+ if from_date:
1144
+ person.from_date = from_date
1145
+ if to_date:
1146
+ person.to_date = to_date
1147
+ return person, True
1148
+
1149
+ return person, False
1150
+ except Exception as e:
1151
+ logger.debug(f"Failed to enrich {person.source_id}: {e}")
1152
+ return person, False
1153
+
1154
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
1155
+ # Submit all tasks
1156
+ futures = {executor.submit(enrich_one, person): person for person in to_enrich}
1157
+
1158
+ # Process results as they complete
1159
+ completed = 0
1160
+ for future in as_completed(futures):
1161
+ person, success = future.result()
1162
+ if success:
1163
+ enriched_count += 1
1164
+ logger.debug(f"Enriched {person.name}: {person.known_for_role} at {person.known_for_org}")
1165
+
1166
+ completed += 1
1167
+ if completed % 100 == 0:
1168
+ logger.info(f"Enriched {completed}/{total} people ({enriched_count} with data)...")
1169
+
1170
+ # Small delay to avoid rate limiting
1171
+ time.sleep(delay_seconds)
1172
+
1173
+ logger.info(f"Enriched {enriched_count}/{total} people with role/org/dates")
1174
+ return enriched_count