corp-extractor 0.5.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +1227 -10
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +520 -0
  9. statement_extractor/database/importers/__init__.py +24 -0
  10. statement_extractor/database/importers/companies_house.py +545 -0
  11. statement_extractor/database/importers/gleif.py +538 -0
  12. statement_extractor/database/importers/sec_edgar.py +375 -0
  13. statement_extractor/database/importers/wikidata.py +1012 -0
  14. statement_extractor/database/importers/wikidata_people.py +632 -0
  15. statement_extractor/database/models.py +230 -0
  16. statement_extractor/database/resolver.py +245 -0
  17. statement_extractor/database/store.py +1609 -0
  18. statement_extractor/document/__init__.py +62 -0
  19. statement_extractor/document/chunker.py +410 -0
  20. statement_extractor/document/context.py +171 -0
  21. statement_extractor/document/deduplicator.py +173 -0
  22. statement_extractor/document/html_extractor.py +246 -0
  23. statement_extractor/document/loader.py +303 -0
  24. statement_extractor/document/pipeline.py +388 -0
  25. statement_extractor/document/summarizer.py +195 -0
  26. statement_extractor/models/__init__.py +16 -1
  27. statement_extractor/models/canonical.py +44 -1
  28. statement_extractor/models/document.py +308 -0
  29. statement_extractor/models/labels.py +47 -18
  30. statement_extractor/models/qualifiers.py +51 -3
  31. statement_extractor/models/statement.py +26 -0
  32. statement_extractor/pipeline/config.py +6 -11
  33. statement_extractor/pipeline/orchestrator.py +80 -111
  34. statement_extractor/pipeline/registry.py +52 -46
  35. statement_extractor/plugins/__init__.py +20 -8
  36. statement_extractor/plugins/base.py +334 -64
  37. statement_extractor/plugins/extractors/gliner2.py +10 -0
  38. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  39. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  40. statement_extractor/plugins/pdf/__init__.py +10 -0
  41. statement_extractor/plugins/pdf/pypdf.py +291 -0
  42. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  43. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  44. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  45. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  46. statement_extractor/plugins/qualifiers/person.py +578 -14
  47. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  48. statement_extractor/plugins/scrapers/__init__.py +10 -0
  49. statement_extractor/plugins/scrapers/http.py +236 -0
  50. statement_extractor/plugins/splitters/t5_gemma.py +158 -53
  51. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  52. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  53. statement_extractor/scoring.py +8 -8
  54. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  55. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  56. statement_extractor/plugins/canonicalizers/base.py +0 -9
  57. statement_extractor/plugins/canonicalizers/location.py +0 -219
  58. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  59. statement_extractor/plugins/canonicalizers/person.py +0 -242
  60. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  61. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,1012 @@
1
+ """
2
+ Wikidata importer for the company/organization database.
3
+
4
+ Imports organization data from Wikidata using SPARQL queries
5
+ into the embedding database for entity name matching.
6
+
7
+ Supports 35+ entity types across 4 categories:
8
+
9
+ Organizations (highest priority):
10
+ - Organizations, nonprofits, NGOs, foundations
11
+ - Government agencies, international organizations
12
+ - Political parties, trade unions
13
+ - Educational institutions, universities, research institutes
14
+ - Hospitals, sports clubs
15
+
16
+ Companies:
17
+ - Companies with LEI codes or stock tickers
18
+ - Public companies, business enterprises, corporations
19
+ - Subsidiaries, conglomerates
20
+
21
+ Industry-specific:
22
+ - Banks, insurance companies, investment companies
23
+ - Airlines, retailers, manufacturers
24
+ - Pharma, tech companies, law firms
25
+ - Record labels, film studios, video game companies
26
+
27
+ Property-based (catches untyped entities):
28
+ - Entities with CEO, subsidiaries, legal form
29
+ - Entities with employee count or revenue data
30
+
31
+ Uses the public Wikidata Query Service endpoint.
32
+ """
33
+
34
+ import json
35
+ import logging
36
+ import time
37
+ import urllib.parse
38
+ import urllib.request
39
+ from typing import Any, Iterator, Optional
40
+
41
+ from ..models import CompanyRecord, EntityType
42
+
43
+ logger = logging.getLogger(__name__)
44
+
45
+ # Wikidata SPARQL endpoint
46
+ WIKIDATA_SPARQL_URL = "https://query.wikidata.org/sparql"
47
+
48
# ---------------------------------------------------------------------------
# SPARQL query templates
#
# Every template below takes two %-style integer arguments, in this order:
# LIMIT and OFFSET, i.e. ``query % (batch_limit, offset)``.
#
# All queries match on a direct statement only. They deliberately avoid
# property path wildcards (wdt:P279*), which time out on Wikidata.
#
# NOTE(review): the Q/P identifiers are carried over unchanged from the
# previous hand-written constants; they have not been re-verified against
# Wikidata here.
# ---------------------------------------------------------------------------

# Simpler SPARQL query - directly query for companies with LEI codes
# (fastest, most reliable). The LEI (P1278) is the required pattern here,
# so unlike the generated queries below it is not wrapped in OPTIONAL.
LEI_COMPANY_QUERY = """
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel WHERE {
?company wdt:P1278 ?lei.
OPTIONAL { ?company wdt:P249 ?ticker. }
OPTIONAL { ?company wdt:P17 ?country. }
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
LIMIT %d
OFFSET %d
"""

# Query for companies with stock exchange listing (P414). This one selects
# the exchange instead of the LEI, so it is kept as a literal as well.
TICKER_COMPANY_QUERY = """
SELECT ?company ?companyLabel ?ticker ?exchange ?exchangeLabel ?country ?countryLabel WHERE {
?company wdt:P414 ?exchange.
OPTIONAL { ?company wdt:P249 ?ticker. }
OPTIONAL { ?company wdt:P17 ?country. }
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
LIMIT %d
OFFSET %d
"""

# Placeholder replaced by the single required triple pattern of a query.
# str.replace is used (rather than %-formatting or str.format) so that the
# SPARQL braces and the trailing %d placeholders stay untouched.
_PATTERN_SLOT = "@@PATTERN@@"

# Shared body of every remaining query: one required pattern, then optional
# LEI (P1278), ticker (P249) and country (P17), plus English labels.
_ENTITY_QUERY_TEMPLATE = """
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel WHERE {
@@PATTERN@@
OPTIONAL { ?company wdt:P1278 ?lei. }
OPTIONAL { ?company wdt:P249 ?ticker. }
OPTIONAL { ?company wdt:P17 ?country. }
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
LIMIT %d
OFFSET %d
"""


def _entity_query(pattern: str) -> str:
    """Return the shared query template with *pattern* as its required triple."""
    return _ENTITY_QUERY_TEMPLATE.replace(_PATTERN_SLOT, pattern)


def _instance_of_query(qid: str) -> str:
    """Return a query for direct instances (P31) of *qid* - no subclass traversal."""
    return _entity_query(f"?company wdt:P31 wd:{qid}.")


def _has_property_query(prop: str, var: str) -> str:
    """Return a query for entities that have any value for property *prop*.

    *var* is the SPARQL variable name (without ``?``) bound to that value.
    """
    return _entity_query(f"?company wdt:{prop} ?{var}.")


# --- Type-based queries: direct instances of one Wikidata class -------------

PUBLIC_COMPANY_QUERY = _instance_of_query("Q891723")        # public company
BUSINESS_QUERY = _instance_of_query("Q4830453")             # business enterprise
ORGANIZATION_QUERY = _instance_of_query("Q43229")           # organization (includes NGOs, gov agencies, etc.)
NONPROFIT_QUERY = _instance_of_query("Q163740")             # non-profit organizations
GOV_AGENCY_QUERY = _instance_of_query("Q327333")            # government agencies
ENTERPRISE_QUERY = _instance_of_query("Q6881511")           # enterprises - broader than business enterprise
CORPORATION_QUERY = _instance_of_query("Q167037")           # corporations
SUBSIDIARY_QUERY = _instance_of_query("Q658255")            # subsidiaries
BANK_QUERY = _instance_of_query("Q22687")                   # banks
INSURANCE_QUERY = _instance_of_query("Q1145276")            # insurance companies
AIRLINE_QUERY = _instance_of_query("Q46970")                # airlines
LAW_FIRM_QUERY = _instance_of_query("Q613142")              # law firms
EDUCATIONAL_QUERY = _instance_of_query("Q2385804")          # educational institutions
UNIVERSITY_QUERY = _instance_of_query("Q3918")              # universities
RESEARCH_INSTITUTE_QUERY = _instance_of_query("Q31855")     # research institutes
POLITICAL_PARTY_QUERY = _instance_of_query("Q7278")         # political parties
TRADE_UNION_QUERY = _instance_of_query("Q178790")           # trade unions
NGO_QUERY = _instance_of_query("Q79913")                    # NGOs
FOUNDATION_QUERY = _instance_of_query("Q157031")            # foundations
INTL_ORG_QUERY = _instance_of_query("Q484652")              # international organizations
SPORTS_CLUB_QUERY = _instance_of_query("Q476028")           # sports teams/clubs
HOSPITAL_QUERY = _instance_of_query("Q16917")               # hospitals
RECORD_LABEL_QUERY = _instance_of_query("Q18127")           # record labels
FILM_STUDIO_QUERY = _instance_of_query("Q1366047")          # film studios
VIDEO_GAME_COMPANY_QUERY = _instance_of_query("Q1137109")   # video game companies
PHARMA_QUERY = _instance_of_query("Q507619")                # pharmaceutical companies
TECH_COMPANY_QUERY = _instance_of_query("Q2979960")         # tech companies
RETAILER_QUERY = _instance_of_query("Q1631111")             # retailers
MANUFACTURER_QUERY = _instance_of_query("Q187652")          # manufacturers
CONGLOMERATE_QUERY = _instance_of_query("Q206652")          # conglomerates
INVESTMENT_COMPANY_QUERY = _instance_of_query("Q380649")    # investment companies

# --- Property-based queries: catch entities that are not typed correctly ----

HAS_CEO_QUERY = _has_property_query("P169", "ceo")                    # has a CEO - likely companies
HAS_SUBSIDIARIES_QUERY = _has_property_query("P355", "subsidiary")    # has subsidiaries - parent companies
OWNED_BY_QUERY = _has_property_query("P127", "owner")                 # owned by another entity
HAS_LEGAL_FORM_QUERY = _has_property_query("P1454", "legalForm")      # has legal form - structured companies
HAS_EMPLOYEES_QUERY = _has_property_query("P1128", "employees")       # has employee count - organizations
HAS_REVENUE_QUERY = _has_property_query("P2139", "revenue")           # has revenue - companies
553
+
554
+
555
# Registries of the query types that can be imported, grouped by category.
# Each maps a query-type name to its SPARQL template.

# Organization types (highest priority - run first)
ORG_QUERY_TYPES = dict(
    organization=ORGANIZATION_QUERY,
    nonprofit=NONPROFIT_QUERY,
    ngo=NGO_QUERY,
    foundation=FOUNDATION_QUERY,
    government=GOV_AGENCY_QUERY,
    intl_org=INTL_ORG_QUERY,
    political_party=POLITICAL_PARTY_QUERY,
    trade_union=TRADE_UNION_QUERY,
    educational=EDUCATIONAL_QUERY,
    university=UNIVERSITY_QUERY,
    research_institute=RESEARCH_INSTITUTE_QUERY,
    hospital=HOSPITAL_QUERY,
    sports_club=SPORTS_CLUB_QUERY,
)

# Company types
COMPANY_QUERY_TYPES = dict(
    lei=LEI_COMPANY_QUERY,
    ticker=TICKER_COMPANY_QUERY,
    public=PUBLIC_COMPANY_QUERY,
    business=BUSINESS_QUERY,
    enterprise=ENTERPRISE_QUERY,
    corporation=CORPORATION_QUERY,
    subsidiary=SUBSIDIARY_QUERY,
    conglomerate=CONGLOMERATE_QUERY,
)

# Industry-specific company types
INDUSTRY_QUERY_TYPES = dict(
    bank=BANK_QUERY,
    insurance=INSURANCE_QUERY,
    airline=AIRLINE_QUERY,
    law_firm=LAW_FIRM_QUERY,
    pharma=PHARMA_QUERY,
    tech_company=TECH_COMPANY_QUERY,
    retailer=RETAILER_QUERY,
    manufacturer=MANUFACTURER_QUERY,
    investment_company=INVESTMENT_COMPANY_QUERY,
    record_label=RECORD_LABEL_QUERY,
    film_studio=FILM_STUDIO_QUERY,
    video_game_company=VIDEO_GAME_COMPANY_QUERY,
)

# Property-based queries (catches entities not typed correctly)
PROPERTY_QUERY_TYPES = dict(
    has_ceo=HAS_CEO_QUERY,
    has_subsidiaries=HAS_SUBSIDIARIES_QUERY,
    owned_by=OWNED_BY_QUERY,
    has_legal_form=HAS_LEGAL_FORM_QUERY,
    has_employees=HAS_EMPLOYEES_QUERY,
    has_revenue=HAS_REVENUE_QUERY,
)

# All query types combined; key order follows the category order above.
QUERY_TYPES = (
    ORG_QUERY_TYPES
    | COMPANY_QUERY_TYPES
    | INDUSTRY_QUERY_TYPES
    | PROPERTY_QUERY_TYPES
)

# Mapping from query type to the EntityType assigned to its records
QUERY_TYPE_TO_ENTITY_TYPE: dict[str, EntityType] = dict(
    # Organizations
    organization=EntityType.NONPROFIT,  # Generic org, default to nonprofit
    nonprofit=EntityType.NONPROFIT,
    ngo=EntityType.NGO,
    foundation=EntityType.FOUNDATION,
    government=EntityType.GOVERNMENT,
    intl_org=EntityType.INTERNATIONAL_ORG,
    political_party=EntityType.POLITICAL_PARTY,
    trade_union=EntityType.TRADE_UNION,
    educational=EntityType.EDUCATIONAL,
    university=EntityType.EDUCATIONAL,
    research_institute=EntityType.RESEARCH,
    hospital=EntityType.HEALTHCARE,
    sports_club=EntityType.SPORTS,
    # Companies
    lei=EntityType.BUSINESS,
    ticker=EntityType.BUSINESS,
    public=EntityType.BUSINESS,
    business=EntityType.BUSINESS,
    enterprise=EntityType.BUSINESS,
    corporation=EntityType.BUSINESS,
    subsidiary=EntityType.BUSINESS,
    conglomerate=EntityType.BUSINESS,
    # Industry-specific (business unless a more specific type exists)
    bank=EntityType.BUSINESS,
    insurance=EntityType.BUSINESS,
    airline=EntityType.BUSINESS,
    law_firm=EntityType.BUSINESS,
    pharma=EntityType.BUSINESS,
    tech_company=EntityType.BUSINESS,
    retailer=EntityType.BUSINESS,
    manufacturer=EntityType.BUSINESS,
    investment_company=EntityType.FUND,
    record_label=EntityType.MEDIA,
    film_studio=EntityType.MEDIA,
    video_game_company=EntityType.MEDIA,
    # Property-based (assume business as they have CEO/revenue/etc)
    has_ceo=EntityType.BUSINESS,
    has_subsidiaries=EntityType.BUSINESS,
    owned_by=EntityType.BUSINESS,
    has_legal_form=EntityType.BUSINESS,
    has_employees=EntityType.UNKNOWN,  # Could be any org type
    has_revenue=EntityType.BUSINESS,
)
668
+
669
+
670
+ class WikidataImporter:
671
+ """
672
+ Importer for Wikidata organization data.
673
+
674
+ Uses SPARQL queries against the public Wikidata Query Service
675
+ to fetch organizations including companies, nonprofits, government agencies, etc.
676
+
677
+ Query categories (run in this order with import_all=True):
678
+
679
+ Organizations:
680
+ - organization: All organizations (Q43229)
681
+ - nonprofit: Non-profit organizations (Q163740)
682
+ - ngo: NGOs (Q79913)
683
+ - foundation: Foundations (Q157031)
684
+ - government: Government agencies (Q327333)
685
+ - intl_org: International organizations (Q484652)
686
+ - political_party: Political parties (Q7278)
687
+ - trade_union: Trade unions (Q178790)
688
+ - educational: Educational institutions (Q2385804)
689
+ - university: Universities (Q3918)
690
+ - research_institute: Research institutes (Q31855)
691
+ - hospital: Hospitals (Q16917)
692
+ - sports_club: Sports clubs (Q476028)
693
+
694
+ Companies:
695
+ - lei: Companies with LEI codes
696
+ - ticker: Companies with stock exchange listings
697
+ - public: Public companies (Q891723)
698
+ - business: Business enterprises (Q4830453)
699
+ - enterprise: Enterprises (Q6881511)
700
+ - corporation: Corporations (Q167037)
701
+ - subsidiary: Subsidiaries (Q658255)
702
+ - conglomerate: Conglomerates (Q206652)
703
+
704
+ Industry-specific:
705
+ - bank: Banks (Q22687)
706
+ - insurance: Insurance companies (Q1145276)
707
+ - airline: Airlines (Q46970)
708
+ - law_firm: Law firms (Q613142)
709
+ - pharma: Pharmaceutical companies (Q507619)
710
+ - tech_company: Tech companies (Q2979960)
711
+ - retailer: Retailers (Q1631111)
712
+ - manufacturer: Manufacturers (Q187652)
713
+ - investment_company: Investment companies (Q380649)
714
+ - record_label: Record labels (Q18127)
715
+ - film_studio: Film studios (Q1366047)
716
+ - video_game_company: Video game companies (Q1137109)
717
+
718
+ Property-based (catches untyped entities):
719
+ - has_ceo: Entities with CEO (P169)
720
+ - has_subsidiaries: Entities with subsidiaries (P355)
721
+ - owned_by: Entities owned by another (P127)
722
+ - has_legal_form: Entities with legal form (P1454)
723
+ - has_employees: Entities with employee count (P1128)
724
+ - has_revenue: Entities with revenue (P2139)
725
+ """
726
+
727
+ def __init__(self, batch_size: int = 1000, delay_seconds: float = 2.0, timeout: int = 120):
728
+ """
729
+ Initialize the Wikidata importer.
730
+
731
+ Args:
732
+ batch_size: Number of records to fetch per SPARQL query (default 1000)
733
+ delay_seconds: Delay between requests to be polite to the endpoint
734
+ timeout: HTTP timeout in seconds (default 120)
735
+ """
736
+ self._batch_size = batch_size
737
+ self._delay = delay_seconds
738
+ self._timeout = timeout
739
+
740
def import_from_sparql(
    self,
    limit: Optional[int] = None,
    query_type: str = "lei",
    import_all: bool = False,
) -> Iterator[CompanyRecord]:
    """
    Import organization records from Wikidata via SPARQL.

    Results are fetched page by page (LIMIT/OFFSET) and de-duplicated on
    ``record.source_id`` for the duration of this call. A failed page is
    logged and ends the import early; it is not re-raised.

    Args:
        limit: Optional limit on total records. A falsy value (``None`` or
            ``0``) means "no limit".
        query_type: Which query to use (see class docstring for full list).
            Common options:
            - "lei": Companies with LEI codes (default, fastest)
            - "organization": All organizations (Q43229)
            - "nonprofit": Non-profit organizations (Q163740)
            - "government": Government agencies (Q327333)
            - "has_ceo": Entities with CEO property (catches many companies)
        import_all: If True, ``query_type`` is ignored and the work is
            delegated to ``_import_all_types``, which runs all query types
            sequentially in priority order:
            1. Organization types (nonprofits, gov agencies, NGOs, etc.)
            2. Company types (public companies, business enterprises, etc.)
            3. Industry-specific types (banks, airlines, pharma, etc.)
            4. Property-based queries (catches entities not properly typed)

    Yields:
        CompanyRecord for each organization

    Raises:
        ValueError: If ``query_type`` is not a key of ``QUERY_TYPES``.
            Because this is a generator, the error surfaces on the first
            iteration rather than at call time.
    """
    if import_all:
        yield from self._import_all_types(limit)
        return

    if query_type not in QUERY_TYPES:
        raise ValueError(f"Unknown query type: {query_type}. Use one of: {list(QUERY_TYPES.keys())}")

    query_template = QUERY_TYPES[query_type]
    entity_type = QUERY_TYPE_TO_ENTITY_TYPE.get(query_type, EntityType.UNKNOWN)
    logger.info(
        "Starting Wikidata company import via SPARQL (query_type=%s, entity_type=%s)...",
        query_type,
        entity_type.value,
    )

    offset = 0
    total_count = 0
    seen_ids = set()  # Track seen Wikidata IDs to avoid duplicates

    # NOTE(review): the query templates have no ORDER BY, so page stability
    # across requests relies on the endpoint returning rows in a consistent
    # order - confirm this is acceptable for the import.
    while True:
        if limit and total_count >= limit:
            break

        # Never ask for more rows than are still needed to reach the limit.
        batch_limit = min(self._batch_size, (limit - total_count) if limit else self._batch_size)
        query = query_template % (batch_limit, offset)

        logger.info("Fetching Wikidata batch at offset %s...", offset)

        try:
            results = self._execute_sparql(query)
        except Exception as e:
            # Best effort: keep what was already yielded and stop paging.
            logger.error("SPARQL query failed at offset %s: %s", offset, e)
            break

        bindings = results.get("results", {}).get("bindings", [])

        if not bindings:
            logger.info("No more results from Wikidata")
            break

        batch_count = 0
        for binding in bindings:
            if limit and total_count >= limit:
                break

            record = self._parse_binding(binding, entity_type=entity_type)
            if record and record.source_id not in seen_ids:
                seen_ids.add(record.source_id)
                total_count += 1
                batch_count += 1
                yield record

        logger.info("Processed %s records from batch (total: %s)", batch_count, total_count)

        if len(bindings) < batch_limit:
            # Last batch
            break

        # Advance by the rows actually consumed, not by the configured batch
        # size: when the page was clamped by `limit` and duplicates kept the
        # limit from being reached, stepping a full batch would skip rows.
        offset += len(bindings)

        # Be polite to the endpoint
        if self._delay > 0:
            time.sleep(self._delay)

    logger.info("Completed Wikidata import: %s records", total_count)
828
+
829
def _import_all_types(self, limit: Optional[int]) -> Iterator[CompanyRecord]:
    """Import from all query types sequentially, deduplicating across types.

    Query categories are run in priority order:
    1. Organization types (nonprofits, gov agencies, NGOs, etc.)
    2. Company types (public companies, business enterprises, etc.)
    3. Industry-specific types (banks, airlines, pharma, etc.)
    4. Property-based queries (catches entities not properly typed)

    When ``limit`` is set it is split evenly across the four categories and
    then across the query types of each category. Each share is floored at
    one record, because ``import_from_sparql`` treats a limit of 0 as
    "no limit" and a zero share would let one query type use the whole budget.

    Args:
        limit: Optional cap on the total number of records yielded; a falsy
            value (``None`` or ``0``) means "no limit".

    Yields:
        CompanyRecord for each organization not already yielded by an
        earlier query type (compared by ``source_id``).
    """
    seen_ids: set[str] = set()
    total_count = 0

    # Calculate per-category limits if a total limit is set
    num_categories = 4
    per_category_limit = max(1, limit // num_categories) if limit else None

    # Run categories in priority order: organizations first
    categories = [
        ("Organizations", ORG_QUERY_TYPES, per_category_limit),
        ("Companies", COMPANY_QUERY_TYPES, per_category_limit),
        ("Industry-specific", INDUSTRY_QUERY_TYPES, per_category_limit),
        ("Property-based", PROPERTY_QUERY_TYPES, per_category_limit),
    ]

    for category_name, query_types, category_limit in categories:
        logger.info(f"=== Starting category: {category_name} ({len(query_types)} query types) ===")
        category_count = 0

        # Guard against an empty category (would divide by zero) and keep
        # the per-type share at >= 1 so it is never read as "unlimited".
        if category_limit and query_types:
            per_type_limit = max(1, category_limit // len(query_types))
        else:
            per_type_limit = None

        for query_type in query_types:
            logger.info(f"Importing from query type: {query_type}")
            type_count = 0

            for record in self.import_from_sparql(limit=per_type_limit, query_type=query_type):
                if record.source_id not in seen_ids:
                    seen_ids.add(record.source_id)
                    total_count += 1
                    type_count += 1
                    category_count += 1
                    yield record

                    # The overall cap is enforced here, independently of
                    # the per-type shares computed above.
                    if limit and total_count >= limit:
                        logger.info(f"Reached total limit of {limit} records")
                        return

            logger.info(f"Got {type_count} new records from {query_type} (total: {total_count})")

        logger.info(f"=== Completed {category_name}: {category_count} new records ===")

    logger.info(f"Completed all query types: {total_count} total records")
879
+
880
def _execute_sparql(self, query: str) -> dict[str, Any]:
    """Send ``query`` to the Wikidata SPARQL endpoint and return the decoded JSON.

    The query is sent URL-encoded in the query string of a request to
    ``WIKIDATA_SPARQL_URL``, using the timeout configured on this instance.
    Errors raised by ``urllib`` or by JSON decoding propagate to the caller.
    """
    query_string = urllib.parse.urlencode({"query": query, "format": "json"})
    request_headers = {
        "Accept": "application/sparql-results+json",
        "User-Agent": "corp-extractor/1.0 (company database builder)",
    }
    request = urllib.request.Request(
        f"{WIKIDATA_SPARQL_URL}?{query_string}",
        headers=request_headers,
    )

    # Read the whole body while the connection is open, then decode it.
    with urllib.request.urlopen(request, timeout=self._timeout) as response:
        raw_body = response.read()
    return json.loads(raw_body.decode("utf-8"))
899
+
900
def _parse_binding(
    self,
    binding: dict[str, Any],
    entity_type: EntityType = EntityType.UNKNOWN,
) -> Optional[CompanyRecord]:
    """Convert one SPARQL result binding into a CompanyRecord.

    Args:
        binding: A single row of the SPARQL JSON results; each variable is
            read as ``binding[var]["value"]``.
        entity_type: Entity type stamped on the resulting record.

    Returns:
        The parsed record, or ``None`` when the row has no entity URI, the
        URI does not end in a ``Q...`` identifier, the label is missing or
        equal to the identifier, or any error occurs while parsing.
    """
    # (SPARQL variable, key in the stored record) for the optional columns,
    # listed in the order the keys are added to the record payload.
    optional_columns = (
        ("lei", "lei"),
        ("ticker", "ticker"),
        ("exchangeLabel", "exchange"),
        ("countryLabel", "country"),
        ("inception", "inception"),
    )

    try:
        entity_uri = binding.get("company", {}).get("value", "")
        if not entity_uri:
            return None

        # Last path segment of the URI,
        # e.g. "http://www.wikidata.org/entity/Q312" -> "Q312"
        qid = entity_uri.rsplit("/", 1)[-1]
        if not qid.startswith("Q"):
            return None

        display_name = binding.get("companyLabel", {}).get("value", "")
        # A label identical to the QID is treated as "no English label".
        if not display_name or display_name == qid:
            return None

        extracted = {
            target_key: binding.get(variable, {}).get("value")
            for variable, target_key in optional_columns
        }

        # Only non-empty optional values are stored in the payload.
        payload = {"wikidata_id": qid, "label": display_name}
        for target_key, value in extracted.items():
            if value:
                payload[target_key] = value

        return CompanyRecord(
            name=display_name.strip(),
            source="wikipedia",  # Use "wikipedia" as source per schema
            source_id=qid,
            region=extracted["country"] or "",
            entity_type=entity_type,
            record=payload,
        )

    except Exception as e:
        logger.debug(f"Failed to parse Wikidata binding: {e}")
        return None
957
+
958
def search_company(self, name: str, limit: int = 10) -> list[CompanyRecord]:
    """
    Look up companies by name through the Wikidata entity search API.

    Search hits are kept only when their description contains one of a
    fixed set of company-related keywords (case-insensitive substring match).

    Args:
        name: Company name to search for
        limit: Maximum number of hits requested from the API; the returned
            list may be shorter because of the keyword filter

    Returns:
        List of matching CompanyRecords
    """
    # Use Wikidata search API for better name matching
    query_string = urllib.parse.urlencode({
        "action": "wbsearchentities",
        "search": name,
        "language": "en",
        "type": "item",
        "limit": limit,
        "format": "json",
    })
    request = urllib.request.Request(
        f"https://www.wikidata.org/w/api.php?{query_string}",
        headers={"User-Agent": "corp-extractor/1.0"},
    )

    with urllib.request.urlopen(request, timeout=30) as response:
        payload = json.loads(response.read().decode("utf-8"))

    # Keywords whose presence in a description marks the hit as a company.
    company_keywords = ("company", "corporation", "inc", "ltd", "enterprise", "business")

    matches = []
    for hit in payload.get("search", []):
        description = hit.get("description", "")
        lowered = description.lower()

        # Check if it looks like a company
        if not any(keyword in lowered for keyword in company_keywords):
            continue

        qid = hit.get("id")
        label = hit.get("label", "")
        matches.append(
            CompanyRecord(
                name=label,
                source="wikipedia",
                source_id=qid,
                region="",  # Not available from search API
                record={
                    "wikidata_id": qid,
                    "label": label,
                    "description": description,
                },
            )
        )

    return matches