corp-extractor 0.5.0__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
  2. corp_extractor-0.9.3.dist-info/RECORD +79 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +2030 -24
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +428 -0
  9. statement_extractor/database/importers/__init__.py +32 -0
  10. statement_extractor/database/importers/companies_house.py +559 -0
  11. statement_extractor/database/importers/companies_house_officers.py +431 -0
  12. statement_extractor/database/importers/gleif.py +561 -0
  13. statement_extractor/database/importers/sec_edgar.py +392 -0
  14. statement_extractor/database/importers/sec_form4.py +512 -0
  15. statement_extractor/database/importers/wikidata.py +1120 -0
  16. statement_extractor/database/importers/wikidata_dump.py +1951 -0
  17. statement_extractor/database/importers/wikidata_people.py +1130 -0
  18. statement_extractor/database/models.py +254 -0
  19. statement_extractor/database/resolver.py +245 -0
  20. statement_extractor/database/store.py +3034 -0
  21. statement_extractor/document/__init__.py +62 -0
  22. statement_extractor/document/chunker.py +410 -0
  23. statement_extractor/document/context.py +171 -0
  24. statement_extractor/document/deduplicator.py +171 -0
  25. statement_extractor/document/html_extractor.py +246 -0
  26. statement_extractor/document/loader.py +303 -0
  27. statement_extractor/document/pipeline.py +388 -0
  28. statement_extractor/document/summarizer.py +195 -0
  29. statement_extractor/extractor.py +1 -1
  30. statement_extractor/models/__init__.py +19 -3
  31. statement_extractor/models/canonical.py +44 -1
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/labels.py +47 -18
  34. statement_extractor/models/qualifiers.py +51 -3
  35. statement_extractor/models/statement.py +39 -15
  36. statement_extractor/models.py +1 -1
  37. statement_extractor/pipeline/config.py +6 -11
  38. statement_extractor/pipeline/context.py +5 -5
  39. statement_extractor/pipeline/orchestrator.py +90 -121
  40. statement_extractor/pipeline/registry.py +52 -46
  41. statement_extractor/plugins/__init__.py +20 -8
  42. statement_extractor/plugins/base.py +348 -78
  43. statement_extractor/plugins/extractors/gliner2.py +38 -28
  44. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  45. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  46. statement_extractor/plugins/pdf/__init__.py +10 -0
  47. statement_extractor/plugins/pdf/pypdf.py +291 -0
  48. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  49. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  50. statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
  51. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  52. statement_extractor/plugins/qualifiers/person.py +588 -14
  53. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  54. statement_extractor/plugins/scrapers/__init__.py +10 -0
  55. statement_extractor/plugins/scrapers/http.py +236 -0
  56. statement_extractor/plugins/splitters/t5_gemma.py +176 -75
  57. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  58. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  59. statement_extractor/scoring.py +8 -8
  60. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  61. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  62. statement_extractor/plugins/canonicalizers/base.py +0 -9
  63. statement_extractor/plugins/canonicalizers/location.py +0 -219
  64. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  65. statement_extractor/plugins/canonicalizers/person.py +0 -242
  66. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
  67. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,1120 @@
1
+ """
2
+ Wikidata importer for the company/organization database.
3
+
4
+ Imports organization data from Wikidata using SPARQL queries
5
+ into the embedding database for entity name matching.
6
+
7
+ Supports 35+ entity types across 4 categories:
8
+
9
+ Organizations (highest priority):
10
+ - Organizations, nonprofits, NGOs, foundations
11
+ - Government agencies, international organizations
12
+ - Political parties, trade unions
13
+ - Educational institutions, universities, research institutes
14
+ - Hospitals, sports clubs
15
+
16
+ Companies:
17
+ - Companies with LEI codes or stock tickers
18
+ - Public companies, business enterprises, corporations
19
+ - Subsidiaries, conglomerates
20
+
21
+ Industry-specific:
22
+ - Banks, insurance companies, investment companies
23
+ - Airlines, retailers, manufacturers
24
+ - Pharma, tech companies, law firms
25
+ - Record labels, film studios, video game companies
26
+
27
+ Property-based (catches untyped entities):
28
+ - Entities with CEO, subsidiaries, owner, legal form
29
+ - Entities with employee count or revenue data
30
+
31
+ Uses the public Wikidata Query Service endpoint.
32
+ """
33
+
34
+ import json
35
+ import logging
36
+ import time
37
+ import urllib.parse
38
+ import urllib.request
39
+ from typing import Any, Iterator, Optional
40
+
41
+ from ..models import CompanyRecord, EntityType
42
+
43
+ logger = logging.getLogger(__name__)
44
+
45
+ # Wikidata SPARQL endpoint
46
+ WIKIDATA_SPARQL_URL = "https://query.wikidata.org/sparql"
47
+
48
+ # Simpler SPARQL query - directly query for companies with LEI codes (fastest, most reliable)
49
+ # Avoids property path wildcards (wdt:P279*), which time out on Wikidata
50
+ LEI_COMPANY_QUERY = """
51
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
52
+ ?company wdt:P1278 ?lei.
53
+ OPTIONAL { ?company wdt:P249 ?ticker. }
54
+ OPTIONAL { ?company wdt:P17 ?country. }
55
+ OPTIONAL { ?company wdt:P571 ?inception. }
56
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
57
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
58
+ }
59
+ LIMIT %d
60
+ OFFSET %d
61
+ """
62
+
63
+ # Query for companies listed on a stock exchange (P414); the ticker symbol (P249) is optional
64
+ TICKER_COMPANY_QUERY = """
65
+ SELECT ?company ?companyLabel ?ticker ?exchange ?exchangeLabel ?country ?countryLabel ?inception ?dissolution WHERE {
66
+ ?company wdt:P414 ?exchange.
67
+ OPTIONAL { ?company wdt:P249 ?ticker. }
68
+ OPTIONAL { ?company wdt:P17 ?country. }
69
+ OPTIONAL { ?company wdt:P571 ?inception. }
70
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
71
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
72
+ }
73
+ LIMIT %d
74
+ OFFSET %d
75
+ """
76
+
77
+ # Query for direct instances of public company (Q891723) - no subclass traversal
78
+ PUBLIC_COMPANY_QUERY = """
79
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
80
+ ?company wdt:P31 wd:Q891723.
81
+ OPTIONAL { ?company wdt:P1278 ?lei. }
82
+ OPTIONAL { ?company wdt:P249 ?ticker. }
83
+ OPTIONAL { ?company wdt:P17 ?country. }
84
+ OPTIONAL { ?company wdt:P571 ?inception. }
85
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
86
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
87
+ }
88
+ LIMIT %d
89
+ OFFSET %d
90
+ """
91
+
92
+ # Query for direct instances of business enterprise (Q4830453) - no subclass traversal
93
+ BUSINESS_QUERY = """
94
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
95
+ ?company wdt:P31 wd:Q4830453.
96
+ OPTIONAL { ?company wdt:P1278 ?lei. }
97
+ OPTIONAL { ?company wdt:P249 ?ticker. }
98
+ OPTIONAL { ?company wdt:P17 ?country. }
99
+ OPTIONAL { ?company wdt:P571 ?inception. }
100
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
101
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
102
+ }
103
+ LIMIT %d
104
+ OFFSET %d
105
+ """
106
+
107
+ # Query for direct instances of organization (Q43229) - includes NGOs, gov agencies, etc.
108
+ ORGANIZATION_QUERY = """
109
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
110
+ ?company wdt:P31 wd:Q43229.
111
+ OPTIONAL { ?company wdt:P1278 ?lei. }
112
+ OPTIONAL { ?company wdt:P249 ?ticker. }
113
+ OPTIONAL { ?company wdt:P17 ?country. }
114
+ OPTIONAL { ?company wdt:P571 ?inception. }
115
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
116
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
117
+ }
118
+ LIMIT %d
119
+ OFFSET %d
120
+ """
121
+
122
+ # Query for non-profit organizations (Q163740)
123
+ NONPROFIT_QUERY = """
124
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
125
+ ?company wdt:P31 wd:Q163740.
126
+ OPTIONAL { ?company wdt:P1278 ?lei. }
127
+ OPTIONAL { ?company wdt:P249 ?ticker. }
128
+ OPTIONAL { ?company wdt:P17 ?country. }
129
+ OPTIONAL { ?company wdt:P571 ?inception. }
130
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
131
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
132
+ }
133
+ LIMIT %d
134
+ OFFSET %d
135
+ """
136
+
137
+ # Query for government agencies (Q327333)
138
+ GOV_AGENCY_QUERY = """
139
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
140
+ ?company wdt:P31 wd:Q327333.
141
+ OPTIONAL { ?company wdt:P1278 ?lei. }
142
+ OPTIONAL { ?company wdt:P249 ?ticker. }
143
+ OPTIONAL { ?company wdt:P17 ?country. }
144
+ OPTIONAL { ?company wdt:P571 ?inception. }
145
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
146
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
147
+ }
148
+ LIMIT %d
149
+ OFFSET %d
150
+ """
151
+
152
+ # Query for enterprises (Q6881511) - broader than business enterprise
153
+ ENTERPRISE_QUERY = """
154
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
155
+ ?company wdt:P31 wd:Q6881511.
156
+ OPTIONAL { ?company wdt:P1278 ?lei. }
157
+ OPTIONAL { ?company wdt:P249 ?ticker. }
158
+ OPTIONAL { ?company wdt:P17 ?country. }
159
+ OPTIONAL { ?company wdt:P571 ?inception. }
160
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
161
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
162
+ }
163
+ LIMIT %d
164
+ OFFSET %d
165
+ """
166
+
167
+ # Query for corporations (Q167037)
168
+ CORPORATION_QUERY = """
169
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
170
+ ?company wdt:P31 wd:Q167037.
171
+ OPTIONAL { ?company wdt:P1278 ?lei. }
172
+ OPTIONAL { ?company wdt:P249 ?ticker. }
173
+ OPTIONAL { ?company wdt:P17 ?country. }
174
+ OPTIONAL { ?company wdt:P571 ?inception. }
175
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
176
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
177
+ }
178
+ LIMIT %d
179
+ OFFSET %d
180
+ """
181
+
182
+ # Query for subsidiaries (Q658255)
183
+ SUBSIDIARY_QUERY = """
184
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
185
+ ?company wdt:P31 wd:Q658255.
186
+ OPTIONAL { ?company wdt:P1278 ?lei. }
187
+ OPTIONAL { ?company wdt:P249 ?ticker. }
188
+ OPTIONAL { ?company wdt:P17 ?country. }
189
+ OPTIONAL { ?company wdt:P571 ?inception. }
190
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
191
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
192
+ }
193
+ LIMIT %d
194
+ OFFSET %d
195
+ """
196
+
197
+ # Query for banks (Q22687)
198
+ BANK_QUERY = """
199
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
200
+ ?company wdt:P31 wd:Q22687.
201
+ OPTIONAL { ?company wdt:P1278 ?lei. }
202
+ OPTIONAL { ?company wdt:P249 ?ticker. }
203
+ OPTIONAL { ?company wdt:P17 ?country. }
204
+ OPTIONAL { ?company wdt:P571 ?inception. }
205
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
206
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
207
+ }
208
+ LIMIT %d
209
+ OFFSET %d
210
+ """
211
+
212
+ # Query for insurance companies (Q6881511)
213
+ INSURANCE_QUERY = """
214
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
215
+ ?company wdt:P31 wd:Q1145276.
216
+ OPTIONAL { ?company wdt:P1278 ?lei. }
217
+ OPTIONAL { ?company wdt:P249 ?ticker. }
218
+ OPTIONAL { ?company wdt:P17 ?country. }
219
+ OPTIONAL { ?company wdt:P571 ?inception. }
220
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
221
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
222
+ }
223
+ LIMIT %d
224
+ OFFSET %d
225
+ """
226
+
227
+ # Query for airlines (Q46970)
228
+ AIRLINE_QUERY = """
229
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
230
+ ?company wdt:P31 wd:Q46970.
231
+ OPTIONAL { ?company wdt:P1278 ?lei. }
232
+ OPTIONAL { ?company wdt:P249 ?ticker. }
233
+ OPTIONAL { ?company wdt:P17 ?country. }
234
+ OPTIONAL { ?company wdt:P571 ?inception. }
235
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
236
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
237
+ }
238
+ LIMIT %d
239
+ OFFSET %d
240
+ """
241
+
242
+ # Query for law firms (Q613142)
243
+ LAW_FIRM_QUERY = """
244
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
245
+ ?company wdt:P31 wd:Q613142.
246
+ OPTIONAL { ?company wdt:P1278 ?lei. }
247
+ OPTIONAL { ?company wdt:P249 ?ticker. }
248
+ OPTIONAL { ?company wdt:P17 ?country. }
249
+ OPTIONAL { ?company wdt:P571 ?inception. }
250
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
251
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
252
+ }
253
+ LIMIT %d
254
+ OFFSET %d
255
+ """
256
+
257
+ # Query for educational institutions (Q2385804)
258
+ EDUCATIONAL_QUERY = """
259
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
260
+ ?company wdt:P31 wd:Q2385804.
261
+ OPTIONAL { ?company wdt:P1278 ?lei. }
262
+ OPTIONAL { ?company wdt:P249 ?ticker. }
263
+ OPTIONAL { ?company wdt:P17 ?country. }
264
+ OPTIONAL { ?company wdt:P571 ?inception. }
265
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
266
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
267
+ }
268
+ LIMIT %d
269
+ OFFSET %d
270
+ """
271
+
272
+ # Query for universities (Q3918)
273
+ UNIVERSITY_QUERY = """
274
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
275
+ ?company wdt:P31 wd:Q3918.
276
+ OPTIONAL { ?company wdt:P1278 ?lei. }
277
+ OPTIONAL { ?company wdt:P249 ?ticker. }
278
+ OPTIONAL { ?company wdt:P17 ?country. }
279
+ OPTIONAL { ?company wdt:P571 ?inception. }
280
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
281
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
282
+ }
283
+ LIMIT %d
284
+ OFFSET %d
285
+ """
286
+
287
+ # Query for research institutes (Q31855)
288
+ RESEARCH_INSTITUTE_QUERY = """
289
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
290
+ ?company wdt:P31 wd:Q31855.
291
+ OPTIONAL { ?company wdt:P1278 ?lei. }
292
+ OPTIONAL { ?company wdt:P249 ?ticker. }
293
+ OPTIONAL { ?company wdt:P17 ?country. }
294
+ OPTIONAL { ?company wdt:P571 ?inception. }
295
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
296
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
297
+ }
298
+ LIMIT %d
299
+ OFFSET %d
300
+ """
301
+
302
+ # Query for political parties (Q7278)
303
+ POLITICAL_PARTY_QUERY = """
304
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
305
+ ?company wdt:P31 wd:Q7278.
306
+ OPTIONAL { ?company wdt:P1278 ?lei. }
307
+ OPTIONAL { ?company wdt:P249 ?ticker. }
308
+ OPTIONAL { ?company wdt:P17 ?country. }
309
+ OPTIONAL { ?company wdt:P571 ?inception. }
310
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
311
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
312
+ }
313
+ LIMIT %d
314
+ OFFSET %d
315
+ """
316
+
317
+ # Query for trade unions (Q178790)
318
+ TRADE_UNION_QUERY = """
319
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
320
+ ?company wdt:P31 wd:Q178790.
321
+ OPTIONAL { ?company wdt:P1278 ?lei. }
322
+ OPTIONAL { ?company wdt:P249 ?ticker. }
323
+ OPTIONAL { ?company wdt:P17 ?country. }
324
+ OPTIONAL { ?company wdt:P571 ?inception. }
325
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
326
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
327
+ }
328
+ LIMIT %d
329
+ OFFSET %d
330
+ """
331
+
332
+ # Query for NGOs (Q79913)
333
+ NGO_QUERY = """
334
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
335
+ ?company wdt:P31 wd:Q79913.
336
+ OPTIONAL { ?company wdt:P1278 ?lei. }
337
+ OPTIONAL { ?company wdt:P249 ?ticker. }
338
+ OPTIONAL { ?company wdt:P17 ?country. }
339
+ OPTIONAL { ?company wdt:P571 ?inception. }
340
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
341
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
342
+ }
343
+ LIMIT %d
344
+ OFFSET %d
345
+ """
346
+
347
+ # Query for foundations (Q157031)
348
+ FOUNDATION_QUERY = """
349
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
350
+ ?company wdt:P31 wd:Q157031.
351
+ OPTIONAL { ?company wdt:P1278 ?lei. }
352
+ OPTIONAL { ?company wdt:P249 ?ticker. }
353
+ OPTIONAL { ?company wdt:P17 ?country. }
354
+ OPTIONAL { ?company wdt:P571 ?inception. }
355
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
356
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
357
+ }
358
+ LIMIT %d
359
+ OFFSET %d
360
+ """
361
+
362
+ # Query for international organizations (Q484652)
363
+ INTL_ORG_QUERY = """
364
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
365
+ ?company wdt:P31 wd:Q484652.
366
+ OPTIONAL { ?company wdt:P1278 ?lei. }
367
+ OPTIONAL { ?company wdt:P249 ?ticker. }
368
+ OPTIONAL { ?company wdt:P17 ?country. }
369
+ OPTIONAL { ?company wdt:P571 ?inception. }
370
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
371
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
372
+ }
373
+ LIMIT %d
374
+ OFFSET %d
375
+ """
376
+
377
+ # Query for sports teams/clubs (Q476028)
378
+ SPORTS_CLUB_QUERY = """
379
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
380
+ ?company wdt:P31 wd:Q476028.
381
+ OPTIONAL { ?company wdt:P1278 ?lei. }
382
+ OPTIONAL { ?company wdt:P249 ?ticker. }
383
+ OPTIONAL { ?company wdt:P17 ?country. }
384
+ OPTIONAL { ?company wdt:P571 ?inception. }
385
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
386
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
387
+ }
388
+ LIMIT %d
389
+ OFFSET %d
390
+ """
391
+
392
+ # Query for hospitals (Q16917)
393
+ HOSPITAL_QUERY = """
394
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
395
+ ?company wdt:P31 wd:Q16917.
396
+ OPTIONAL { ?company wdt:P1278 ?lei. }
397
+ OPTIONAL { ?company wdt:P249 ?ticker. }
398
+ OPTIONAL { ?company wdt:P17 ?country. }
399
+ OPTIONAL { ?company wdt:P571 ?inception. }
400
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
401
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
402
+ }
403
+ LIMIT %d
404
+ OFFSET %d
405
+ """
406
+
407
+ # Query for record labels (Q18127)
408
+ RECORD_LABEL_QUERY = """
409
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
410
+ ?company wdt:P31 wd:Q18127.
411
+ OPTIONAL { ?company wdt:P1278 ?lei. }
412
+ OPTIONAL { ?company wdt:P249 ?ticker. }
413
+ OPTIONAL { ?company wdt:P17 ?country. }
414
+ OPTIONAL { ?company wdt:P571 ?inception. }
415
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
416
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
417
+ }
418
+ LIMIT %d
419
+ OFFSET %d
420
+ """
421
+
422
+ # Query for film studios (Q1366047)
423
+ FILM_STUDIO_QUERY = """
424
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
425
+ ?company wdt:P31 wd:Q1366047.
426
+ OPTIONAL { ?company wdt:P1278 ?lei. }
427
+ OPTIONAL { ?company wdt:P249 ?ticker. }
428
+ OPTIONAL { ?company wdt:P17 ?country. }
429
+ OPTIONAL { ?company wdt:P571 ?inception. }
430
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
431
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
432
+ }
433
+ LIMIT %d
434
+ OFFSET %d
435
+ """
436
+
437
+ # Query for video game companies (Q1137109)
438
+ VIDEO_GAME_COMPANY_QUERY = """
439
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
440
+ ?company wdt:P31 wd:Q1137109.
441
+ OPTIONAL { ?company wdt:P1278 ?lei. }
442
+ OPTIONAL { ?company wdt:P249 ?ticker. }
443
+ OPTIONAL { ?company wdt:P17 ?country. }
444
+ OPTIONAL { ?company wdt:P571 ?inception. }
445
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
446
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
447
+ }
448
+ LIMIT %d
449
+ OFFSET %d
450
+ """
451
+
452
+ # Query for pharmaceutical companies (Q507619)
453
+ PHARMA_QUERY = """
454
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
455
+ ?company wdt:P31 wd:Q507619.
456
+ OPTIONAL { ?company wdt:P1278 ?lei. }
457
+ OPTIONAL { ?company wdt:P249 ?ticker. }
458
+ OPTIONAL { ?company wdt:P17 ?country. }
459
+ OPTIONAL { ?company wdt:P571 ?inception. }
460
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
461
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
462
+ }
463
+ LIMIT %d
464
+ OFFSET %d
465
+ """
466
+
467
+ # Query for tech companies (Q2979960)
468
+ TECH_COMPANY_QUERY = """
469
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
470
+ ?company wdt:P31 wd:Q2979960.
471
+ OPTIONAL { ?company wdt:P1278 ?lei. }
472
+ OPTIONAL { ?company wdt:P249 ?ticker. }
473
+ OPTIONAL { ?company wdt:P17 ?country. }
474
+ OPTIONAL { ?company wdt:P571 ?inception. }
475
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
476
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
477
+ }
478
+ LIMIT %d
479
+ OFFSET %d
480
+ """
481
+
482
+ # Query for retailers (Q1631111)
483
+ RETAILER_QUERY = """
484
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
485
+ ?company wdt:P31 wd:Q1631111.
486
+ OPTIONAL { ?company wdt:P1278 ?lei. }
487
+ OPTIONAL { ?company wdt:P249 ?ticker. }
488
+ OPTIONAL { ?company wdt:P17 ?country. }
489
+ OPTIONAL { ?company wdt:P571 ?inception. }
490
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
491
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
492
+ }
493
+ LIMIT %d
494
+ OFFSET %d
495
+ """
496
+
497
+ # Query for manufacturers (Q187652)
498
+ MANUFACTURER_QUERY = """
499
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
500
+ ?company wdt:P31 wd:Q187652.
501
+ OPTIONAL { ?company wdt:P1278 ?lei. }
502
+ OPTIONAL { ?company wdt:P249 ?ticker. }
503
+ OPTIONAL { ?company wdt:P17 ?country. }
504
+ OPTIONAL { ?company wdt:P571 ?inception. }
505
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
506
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
507
+ }
508
+ LIMIT %d
509
+ OFFSET %d
510
+ """
511
+
512
+ # Query for conglomerates (Q206652)
513
+ CONGLOMERATE_QUERY = """
514
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
515
+ ?company wdt:P31 wd:Q206652.
516
+ OPTIONAL { ?company wdt:P1278 ?lei. }
517
+ OPTIONAL { ?company wdt:P249 ?ticker. }
518
+ OPTIONAL { ?company wdt:P17 ?country. }
519
+ OPTIONAL { ?company wdt:P571 ?inception. }
520
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
521
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
522
+ }
523
+ LIMIT %d
524
+ OFFSET %d
525
+ """
526
+
527
+ # Query for investment companies (Q380649)
528
+ INVESTMENT_COMPANY_QUERY = """
529
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
530
+ ?company wdt:P31 wd:Q380649.
531
+ OPTIONAL { ?company wdt:P1278 ?lei. }
532
+ OPTIONAL { ?company wdt:P249 ?ticker. }
533
+ OPTIONAL { ?company wdt:P17 ?country. }
534
+ OPTIONAL { ?company wdt:P571 ?inception. }
535
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
536
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
537
+ }
538
+ LIMIT %d
539
+ OFFSET %d
540
+ """
541
+
542
+ # Property-based query: entities with a CEO (P169) - likely companies
543
+ HAS_CEO_QUERY = """
544
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
545
+ ?company wdt:P169 ?ceo.
546
+ OPTIONAL { ?company wdt:P1278 ?lei. }
547
+ OPTIONAL { ?company wdt:P249 ?ticker. }
548
+ OPTIONAL { ?company wdt:P17 ?country. }
549
+ OPTIONAL { ?company wdt:P571 ?inception. }
550
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
551
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
552
+ }
553
+ LIMIT %d
554
+ OFFSET %d
555
+ """
556
+
557
+ # Property-based query: entities with subsidiaries (P355) - parent companies
558
+ HAS_SUBSIDIARIES_QUERY = """
559
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
560
+ ?company wdt:P355 ?subsidiary.
561
+ OPTIONAL { ?company wdt:P1278 ?lei. }
562
+ OPTIONAL { ?company wdt:P249 ?ticker. }
563
+ OPTIONAL { ?company wdt:P17 ?country. }
564
+ OPTIONAL { ?company wdt:P571 ?inception. }
565
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
566
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
567
+ }
568
+ LIMIT %d
569
+ OFFSET %d
570
+ """
571
+
572
+ # Property-based query: entities owned by another entity (P127) - subsidiaries/companies
573
+ OWNED_BY_QUERY = """
574
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
575
+ ?company wdt:P127 ?owner.
576
+ OPTIONAL { ?company wdt:P1278 ?lei. }
577
+ OPTIONAL { ?company wdt:P249 ?ticker. }
578
+ OPTIONAL { ?company wdt:P17 ?country. }
579
+ OPTIONAL { ?company wdt:P571 ?inception. }
580
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
581
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
582
+ }
583
+ LIMIT %d
584
+ OFFSET %d
585
+ """
586
+
587
+ # Property-based query: entities with legal form (P1454) - structured companies
588
+ HAS_LEGAL_FORM_QUERY = """
589
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
590
+ ?company wdt:P1454 ?legalForm.
591
+ OPTIONAL { ?company wdt:P1278 ?lei. }
592
+ OPTIONAL { ?company wdt:P249 ?ticker. }
593
+ OPTIONAL { ?company wdt:P17 ?country. }
594
+ OPTIONAL { ?company wdt:P571 ?inception. }
595
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
596
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
597
+ }
598
+ LIMIT %d
599
+ OFFSET %d
600
+ """
601
+
602
+ # Property-based query: entities with employees count (P1128) - organizations
603
+ HAS_EMPLOYEES_QUERY = """
604
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
605
+ ?company wdt:P1128 ?employees.
606
+ OPTIONAL { ?company wdt:P1278 ?lei. }
607
+ OPTIONAL { ?company wdt:P249 ?ticker. }
608
+ OPTIONAL { ?company wdt:P17 ?country. }
609
+ OPTIONAL { ?company wdt:P571 ?inception. }
610
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
611
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
612
+ }
613
+ LIMIT %d
614
+ OFFSET %d
615
+ """
616
+
617
+ # Property-based query: entities with revenue (P2139) - companies
618
+ HAS_REVENUE_QUERY = """
619
+ SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
620
+ ?company wdt:P2139 ?revenue.
621
+ OPTIONAL { ?company wdt:P1278 ?lei. }
622
+ OPTIONAL { ?company wdt:P249 ?ticker. }
623
+ OPTIONAL { ?company wdt:P17 ?country. }
624
+ OPTIONAL { ?company wdt:P571 ?inception. }
625
+ OPTIONAL { ?company wdt:P576 ?dissolution. }
626
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
627
+ }
628
+ LIMIT %d
629
+ OFFSET %d
630
+ """
631
+
632
+
633
+ # Query types available for import - organized by category
634
+ # Organization types (highest priority - run first)
635
+ ORG_QUERY_TYPES = {
636
+ "organization": ORGANIZATION_QUERY,
637
+ "nonprofit": NONPROFIT_QUERY,
638
+ "ngo": NGO_QUERY,
639
+ "foundation": FOUNDATION_QUERY,
640
+ "government": GOV_AGENCY_QUERY,
641
+ "intl_org": INTL_ORG_QUERY,
642
+ "political_party": POLITICAL_PARTY_QUERY,
643
+ "trade_union": TRADE_UNION_QUERY,
644
+ "educational": EDUCATIONAL_QUERY,
645
+ "university": UNIVERSITY_QUERY,
646
+ "research_institute": RESEARCH_INSTITUTE_QUERY,
647
+ "hospital": HOSPITAL_QUERY,
648
+ "sports_club": SPORTS_CLUB_QUERY,
649
+ }
650
+
651
+ # Company types
652
+ COMPANY_QUERY_TYPES = {
653
+ "lei": LEI_COMPANY_QUERY,
654
+ "ticker": TICKER_COMPANY_QUERY,
655
+ "public": PUBLIC_COMPANY_QUERY,
656
+ "business": BUSINESS_QUERY,
657
+ "enterprise": ENTERPRISE_QUERY,
658
+ "corporation": CORPORATION_QUERY,
659
+ "subsidiary": SUBSIDIARY_QUERY,
660
+ "conglomerate": CONGLOMERATE_QUERY,
661
+ }
662
+
663
+ # Industry-specific company types
664
+ INDUSTRY_QUERY_TYPES = {
665
+ "bank": BANK_QUERY,
666
+ "insurance": INSURANCE_QUERY,
667
+ "airline": AIRLINE_QUERY,
668
+ "law_firm": LAW_FIRM_QUERY,
669
+ "pharma": PHARMA_QUERY,
670
+ "tech_company": TECH_COMPANY_QUERY,
671
+ "retailer": RETAILER_QUERY,
672
+ "manufacturer": MANUFACTURER_QUERY,
673
+ "investment_company": INVESTMENT_COMPANY_QUERY,
674
+ "record_label": RECORD_LABEL_QUERY,
675
+ "film_studio": FILM_STUDIO_QUERY,
676
+ "video_game_company": VIDEO_GAME_COMPANY_QUERY,
677
+ }
678
+
679
+ # Property-based queries (catches entities not typed correctly)
680
+ PROPERTY_QUERY_TYPES = {
681
+ "has_ceo": HAS_CEO_QUERY,
682
+ "has_subsidiaries": HAS_SUBSIDIARIES_QUERY,
683
+ "owned_by": OWNED_BY_QUERY,
684
+ "has_legal_form": HAS_LEGAL_FORM_QUERY,
685
+ "has_employees": HAS_EMPLOYEES_QUERY,
686
+ "has_revenue": HAS_REVENUE_QUERY,
687
+ }
688
+
689
+ # All query types combined
690
+ QUERY_TYPES = {
691
+ **ORG_QUERY_TYPES,
692
+ **COMPANY_QUERY_TYPES,
693
+ **INDUSTRY_QUERY_TYPES,
694
+ **PROPERTY_QUERY_TYPES,
695
+ }
696
+
697
+ # Mapping from query type to EntityType
698
+ QUERY_TYPE_TO_ENTITY_TYPE: dict[str, EntityType] = {
699
+ # Organizations
700
+ "organization": EntityType.NONPROFIT, # Generic org, default to nonprofit
701
+ "nonprofit": EntityType.NONPROFIT,
702
+ "ngo": EntityType.NGO,
703
+ "foundation": EntityType.FOUNDATION,
704
+ "government": EntityType.GOVERNMENT,
705
+ "intl_org": EntityType.INTERNATIONAL_ORG,
706
+ "political_party": EntityType.POLITICAL_PARTY,
707
+ "trade_union": EntityType.TRADE_UNION,
708
+ "educational": EntityType.EDUCATIONAL,
709
+ "university": EntityType.EDUCATIONAL,
710
+ "research_institute": EntityType.RESEARCH,
711
+ "hospital": EntityType.HEALTHCARE,
712
+ "sports_club": EntityType.SPORTS,
713
+
714
+ # Companies
715
+ "lei": EntityType.BUSINESS,
716
+ "ticker": EntityType.BUSINESS,
717
+ "public": EntityType.BUSINESS,
718
+ "business": EntityType.BUSINESS,
719
+ "enterprise": EntityType.BUSINESS,
720
+ "corporation": EntityType.BUSINESS,
721
+ "subsidiary": EntityType.BUSINESS,
722
+ "conglomerate": EntityType.BUSINESS,
723
+
724
+ # Industry-specific (all business)
725
+ "bank": EntityType.BUSINESS,
726
+ "insurance": EntityType.BUSINESS,
727
+ "airline": EntityType.BUSINESS,
728
+ "law_firm": EntityType.BUSINESS,
729
+ "pharma": EntityType.BUSINESS,
730
+ "tech_company": EntityType.BUSINESS,
731
+ "retailer": EntityType.BUSINESS,
732
+ "manufacturer": EntityType.BUSINESS,
733
+ "investment_company": EntityType.FUND,
734
+ "record_label": EntityType.MEDIA,
735
+ "film_studio": EntityType.MEDIA,
736
+ "video_game_company": EntityType.MEDIA,
737
+
738
+ # Property-based (assume business as they have CEO/revenue/etc)
739
+ "has_ceo": EntityType.BUSINESS,
740
+ "has_subsidiaries": EntityType.BUSINESS,
741
+ "owned_by": EntityType.BUSINESS,
742
+ "has_legal_form": EntityType.BUSINESS,
743
+ "has_employees": EntityType.UNKNOWN, # Could be any org type
744
+ "has_revenue": EntityType.BUSINESS,
745
+ }
746
+
747
+
748
class WikidataImporter:
    """
    Importer for Wikidata organization data.

    Uses SPARQL queries against the public Wikidata Query Service
    to fetch organizations including companies, nonprofits, government agencies, etc.

    Query categories (run in this order with import_all=True):

    Organizations:
    - organization: All organizations (Q43229)
    - nonprofit: Non-profit organizations (Q163740)
    - ngo: NGOs (Q79913)
    - foundation: Foundations (Q157031)
    - government: Government agencies (Q327333)
    - intl_org: International organizations (Q484652)
    - political_party: Political parties (Q7278)
    - trade_union: Trade unions (Q178790)
    - educational: Educational institutions (Q2385804)
    - university: Universities (Q3918)
    - research_institute: Research institutes (Q31855)
    - hospital: Hospitals (Q16917)
    - sports_club: Sports clubs (Q476028)

    Companies:
    - lei: Companies with LEI codes
    - ticker: Companies with stock exchange listings
    - public: Public companies (Q891723)
    - business: Business enterprises (Q4830453)
    - enterprise: Enterprises (Q6881511)
    - corporation: Corporations (Q167037)
    - subsidiary: Subsidiaries (Q658255)
    - conglomerate: Conglomerates (Q206652)

    Industry-specific:
    - bank: Banks (Q22687)
    - insurance: Insurance companies (Q1145276)
    - airline: Airlines (Q46970)
    - law_firm: Law firms (Q613142)
    - pharma: Pharmaceutical companies (Q507619)
    - tech_company: Tech companies (Q2979960)
    - retailer: Retailers (Q1631111)
    - manufacturer: Manufacturers (Q187652)
    - investment_company: Investment companies (Q380649)
    - record_label: Record labels (Q18127)
    - film_studio: Film studios (Q1366047)
    - video_game_company: Video game companies (Q1137109)

    Property-based (catches untyped entities):
    - has_ceo: Entities with CEO (P169)
    - has_subsidiaries: Entities with subsidiaries (P355)
    - owned_by: Entities owned by another (P127)
    - has_legal_form: Entities with legal form (P1454)
    - has_employees: Entities with employee count (P1128)
    - has_revenue: Entities with revenue (P2139)
    """

    def __init__(self, batch_size: int = 1000, delay_seconds: float = 2.0, timeout: int = 120):
        """
        Initialize the Wikidata importer.

        Args:
            batch_size: Number of records to fetch per SPARQL query (default 1000).
                Must be at least 1.
            delay_seconds: Delay between requests to be polite to the endpoint.
                Values <= 0 disable the delay.
            timeout: HTTP timeout in seconds (default 120)

        Raises:
            ValueError: If batch_size is smaller than 1.
        """
        # batch_size is substituted into the LIMIT slot of every query; zero or
        # a negative value would silently yield no data, so reject it up front.
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

        self._batch_size = batch_size
        self._delay = delay_seconds
        self._timeout = timeout

+ def import_from_sparql(
819
+ self,
820
+ limit: Optional[int] = None,
821
+ query_type: str = "lei",
822
+ import_all: bool = False,
823
+ ) -> Iterator[CompanyRecord]:
824
+ """
825
+ Import organization records from Wikidata via SPARQL.
826
+
827
+ Args:
828
+ limit: Optional limit on total records
829
+ query_type: Which query to use (see class docstring for full list).
830
+ Common options:
831
+ - "lei": Companies with LEI codes (default, fastest)
832
+ - "organization": All organizations (Q43229)
833
+ - "nonprofit": Non-profit organizations (Q163740)
834
+ - "government": Government agencies (Q327333)
835
+ - "has_ceo": Entities with CEO property (catches many companies)
836
+ import_all: If True, run all query types sequentially in priority order:
837
+ 1. Organization types (nonprofits, gov agencies, NGOs, etc.)
838
+ 2. Company types (public companies, business enterprises, etc.)
839
+ 3. Industry-specific types (banks, airlines, pharma, etc.)
840
+ 4. Property-based queries (catches entities not properly typed)
841
+
842
+ Yields:
843
+ CompanyRecord for each organization
844
+ """
845
+ if import_all:
846
+ yield from self._import_all_types(limit)
847
+ return
848
+
849
+ if query_type not in QUERY_TYPES:
850
+ raise ValueError(f"Unknown query type: {query_type}. Use one of: {list(QUERY_TYPES.keys())}")
851
+
852
+ query_template = QUERY_TYPES[query_type]
853
+ entity_type = QUERY_TYPE_TO_ENTITY_TYPE.get(query_type, EntityType.UNKNOWN)
854
+ logger.info(f"Starting Wikidata company import via SPARQL (query_type={query_type}, entity_type={entity_type.value})...")
855
+
856
+ offset = 0
857
+ total_count = 0
858
+ seen_ids = set() # Track seen Wikidata IDs to avoid duplicates
859
+
860
+ while True:
861
+ if limit and total_count >= limit:
862
+ break
863
+
864
+ batch_limit = min(self._batch_size, (limit - total_count) if limit else self._batch_size)
865
+ query = query_template % (batch_limit, offset)
866
+
867
+ logger.info(f"Fetching Wikidata batch at offset {offset}...")
868
+
869
+ try:
870
+ results = self._execute_sparql(query)
871
+ except Exception as e:
872
+ logger.error(f"SPARQL query failed at offset {offset}: {e}")
873
+ break
874
+
875
+ bindings = results.get("results", {}).get("bindings", [])
876
+
877
+ if not bindings:
878
+ logger.info("No more results from Wikidata")
879
+ break
880
+
881
+ batch_count = 0
882
+ for binding in bindings:
883
+ if limit and total_count >= limit:
884
+ break
885
+
886
+ record = self._parse_binding(binding, entity_type=entity_type)
887
+ if record and record.source_id not in seen_ids:
888
+ seen_ids.add(record.source_id)
889
+ total_count += 1
890
+ batch_count += 1
891
+ yield record
892
+
893
+ logger.info(f"Processed {batch_count} records from batch (total: {total_count})")
894
+
895
+ if len(bindings) < batch_limit:
896
+ # Last batch
897
+ break
898
+
899
+ offset += self._batch_size
900
+
901
+ # Be polite to the endpoint
902
+ if self._delay > 0:
903
+ time.sleep(self._delay)
904
+
905
+ logger.info(f"Completed Wikidata import: {total_count} records")
906
+
907
+ def _import_all_types(self, limit: Optional[int]) -> Iterator[CompanyRecord]:
908
+ """Import from all query types sequentially, deduplicating across types.
909
+
910
+ Query categories are run in priority order:
911
+ 1. Organization types (nonprofits, gov agencies, NGOs, etc.)
912
+ 2. Company types (public companies, business enterprises, etc.)
913
+ 3. Industry-specific types (banks, airlines, pharma, etc.)
914
+ 4. Property-based queries (catches entities not properly typed)
915
+ """
916
+ seen_ids: set[str] = set()
917
+ total_count = 0
918
+
919
+ # Calculate per-category limits if a total limit is set
920
+ num_categories = 4
921
+ per_category_limit = limit // num_categories if limit else None
922
+
923
+ # Run categories in priority order: organizations first
924
+ categories = [
925
+ ("Organizations", ORG_QUERY_TYPES, per_category_limit),
926
+ ("Companies", COMPANY_QUERY_TYPES, per_category_limit),
927
+ ("Industry-specific", INDUSTRY_QUERY_TYPES, per_category_limit),
928
+ ("Property-based", PROPERTY_QUERY_TYPES, per_category_limit),
929
+ ]
930
+
931
+ for category_name, query_types, category_limit in categories:
932
+ logger.info(f"=== Starting category: {category_name} ({len(query_types)} query types) ===")
933
+ category_count = 0
934
+ per_type_limit = category_limit // len(query_types) if category_limit else None
935
+
936
+ for query_type in query_types:
937
+ logger.info(f"Importing from query type: {query_type}")
938
+ type_count = 0
939
+
940
+ for record in self.import_from_sparql(limit=per_type_limit, query_type=query_type):
941
+ if record.source_id not in seen_ids:
942
+ seen_ids.add(record.source_id)
943
+ total_count += 1
944
+ type_count += 1
945
+ category_count += 1
946
+ yield record
947
+
948
+ if limit and total_count >= limit:
949
+ logger.info(f"Reached total limit of {limit} records")
950
+ return
951
+
952
+ logger.info(f"Got {type_count} new records from {query_type} (total: {total_count})")
953
+
954
+ logger.info(f"=== Completed {category_name}: {category_count} new records ===")
955
+
956
+ logger.info(f"Completed all query types: {total_count} total records")
957
+
958
+ @staticmethod
959
+ def _parse_wikidata_date(date_str: Optional[str]) -> Optional[str]:
960
+ """
961
+ Parse a Wikidata date string into ISO format (YYYY-MM-DD).
962
+
963
+ Wikidata returns dates like "2020-01-15T00:00:00Z" or just "2020".
964
+ Returns None if the date cannot be parsed.
965
+ """
966
+ if not date_str:
967
+ return None
968
+ # Handle ISO datetime format (e.g., "2020-01-15T00:00:00Z")
969
+ if "T" in date_str:
970
+ return date_str.split("T")[0]
971
+ # Handle year-only format (e.g., "2020")
972
+ if len(date_str) == 4 and date_str.isdigit():
973
+ return f"{date_str}-01-01"
974
+ # Return as-is if it looks like a date
975
+ if len(date_str) >= 4:
976
+ return date_str[:10] # Take first 10 chars (YYYY-MM-DD)
977
+ return None
978
+
979
+ def _execute_sparql(self, query: str) -> dict[str, Any]:
980
+ """Execute a SPARQL query against Wikidata."""
981
+ params = urllib.parse.urlencode({
982
+ "query": query,
983
+ "format": "json",
984
+ })
985
+
986
+ url = f"{WIKIDATA_SPARQL_URL}?{params}"
987
+
988
+ req = urllib.request.Request(
989
+ url,
990
+ headers={
991
+ "Accept": "application/sparql-results+json",
992
+ "User-Agent": "corp-extractor/1.0 (company database builder)",
993
+ }
994
+ )
995
+
996
+ with urllib.request.urlopen(req, timeout=self._timeout) as response:
997
+ return json.loads(response.read().decode("utf-8"))
998
+
999
+ def _parse_binding(
1000
+ self,
1001
+ binding: dict[str, Any],
1002
+ entity_type: EntityType = EntityType.UNKNOWN,
1003
+ ) -> Optional[CompanyRecord]:
1004
+ """Parse a SPARQL result binding into a CompanyRecord."""
1005
+ try:
1006
+ # Get Wikidata entity ID
1007
+ company_uri = binding.get("company", {}).get("value", "")
1008
+ if not company_uri:
1009
+ return None
1010
+
1011
+ # Extract QID from URI (e.g., "http://www.wikidata.org/entity/Q312" -> "Q312")
1012
+ wikidata_id = company_uri.split("/")[-1]
1013
+ if not wikidata_id.startswith("Q"):
1014
+ return None
1015
+
1016
+ # Get label
1017
+ label = binding.get("companyLabel", {}).get("value", "")
1018
+ if not label or label == wikidata_id: # Skip if no English label
1019
+ return None
1020
+
1021
+ # Get optional fields
1022
+ lei = binding.get("lei", {}).get("value")
1023
+ ticker = binding.get("ticker", {}).get("value")
1024
+ exchange_label = binding.get("exchangeLabel", {}).get("value")
1025
+ country_label = binding.get("countryLabel", {}).get("value")
1026
+ inception_raw = binding.get("inception", {}).get("value")
1027
+ dissolution_raw = binding.get("dissolution", {}).get("value")
1028
+
1029
+ # Parse dates (Wikidata returns ISO datetime, extract date part)
1030
+ from_date = WikidataImporter._parse_wikidata_date(inception_raw)
1031
+ to_date = WikidataImporter._parse_wikidata_date(dissolution_raw)
1032
+
1033
+ # Build record data
1034
+ record_data: dict[str, Any] = {
1035
+ "wikidata_id": wikidata_id,
1036
+ "label": label,
1037
+ }
1038
+ if lei:
1039
+ record_data["lei"] = lei
1040
+ if ticker:
1041
+ record_data["ticker"] = ticker
1042
+ if exchange_label:
1043
+ record_data["exchange"] = exchange_label
1044
+ if country_label:
1045
+ record_data["country"] = country_label
1046
+ if from_date:
1047
+ record_data["inception"] = from_date
1048
+ if to_date:
1049
+ record_data["dissolution"] = to_date
1050
+
1051
+ return CompanyRecord(
1052
+ name=label.strip(),
1053
+ source="wikipedia", # Use "wikipedia" as source per schema
1054
+ source_id=wikidata_id,
1055
+ region=country_label or "",
1056
+ entity_type=entity_type,
1057
+ from_date=from_date,
1058
+ to_date=to_date,
1059
+ record=record_data,
1060
+ )
1061
+
1062
+ except Exception as e:
1063
+ logger.debug(f"Failed to parse Wikidata binding: {e}")
1064
+ return None
1065
+
1066
+ def search_company(self, name: str, limit: int = 10) -> list[CompanyRecord]:
1067
+ """
1068
+ Search for a specific company by name.
1069
+
1070
+ Args:
1071
+ name: Company name to search for
1072
+ limit: Maximum results to return
1073
+
1074
+ Returns:
1075
+ List of matching CompanyRecords
1076
+ """
1077
+ # Use Wikidata search API for better name matching
1078
+ search_url = "https://www.wikidata.org/w/api.php"
1079
+ params = urllib.parse.urlencode({
1080
+ "action": "wbsearchentities",
1081
+ "search": name,
1082
+ "language": "en",
1083
+ "type": "item",
1084
+ "limit": limit,
1085
+ "format": "json",
1086
+ })
1087
+
1088
+ req = urllib.request.Request(
1089
+ f"{search_url}?{params}",
1090
+ headers={"User-Agent": "corp-extractor/1.0"}
1091
+ )
1092
+
1093
+ with urllib.request.urlopen(req, timeout=30) as response:
1094
+ data = json.loads(response.read().decode("utf-8"))
1095
+
1096
+ results = []
1097
+ for item in data.get("search", []):
1098
+ qid = item.get("id")
1099
+ label = item.get("label", "")
1100
+ description = item.get("description", "")
1101
+
1102
+ # Check if it looks like a company
1103
+ company_keywords = ["company", "corporation", "inc", "ltd", "enterprise", "business"]
1104
+ if not any(kw in description.lower() for kw in company_keywords):
1105
+ continue
1106
+
1107
+ record = CompanyRecord(
1108
+ name=label,
1109
+ source="wikipedia",
1110
+ source_id=qid,
1111
+ region="", # Not available from search API
1112
+ record={
1113
+ "wikidata_id": qid,
1114
+ "label": label,
1115
+ "description": description,
1116
+ },
1117
+ )
1118
+ results.append(record)
1119
+
1120
+ return results