corp-extractor 0.5.0__py3-none-any.whl → 0.9.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
- corp_extractor-0.9.3.dist-info/RECORD +79 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +2030 -24
- statement_extractor/data/statement_taxonomy.json +6949 -1159
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +428 -0
- statement_extractor/database/importers/__init__.py +32 -0
- statement_extractor/database/importers/companies_house.py +559 -0
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +561 -0
- statement_extractor/database/importers/sec_edgar.py +392 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +1120 -0
- statement_extractor/database/importers/wikidata_dump.py +1951 -0
- statement_extractor/database/importers/wikidata_people.py +1130 -0
- statement_extractor/database/models.py +254 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +3034 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +171 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +19 -3
- statement_extractor/models/canonical.py +44 -1
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/labels.py +47 -18
- statement_extractor/models/qualifiers.py +51 -3
- statement_extractor/models/statement.py +39 -15
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/config.py +6 -11
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +90 -121
- statement_extractor/pipeline/registry.py +52 -46
- statement_extractor/plugins/__init__.py +20 -8
- statement_extractor/plugins/base.py +348 -78
- statement_extractor/plugins/extractors/gliner2.py +38 -28
- statement_extractor/plugins/labelers/taxonomy.py +18 -5
- statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +11 -0
- statement_extractor/plugins/qualifiers/companies_house.py +14 -3
- statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
- statement_extractor/plugins/qualifiers/gleif.py +14 -3
- statement_extractor/plugins/qualifiers/person.py +588 -14
- statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/t5_gemma.py +176 -75
- statement_extractor/plugins/taxonomy/embedding.py +193 -46
- statement_extractor/plugins/taxonomy/mnli.py +16 -4
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.5.0.dist-info/RECORD +0 -55
- statement_extractor/plugins/canonicalizers/__init__.py +0 -17
- statement_extractor/plugins/canonicalizers/base.py +0 -9
- statement_extractor/plugins/canonicalizers/location.py +0 -219
- statement_extractor/plugins/canonicalizers/organization.py +0 -230
- statement_extractor/plugins/canonicalizers/person.py +0 -242
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,1120 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Wikidata importer for the company/organization database.
|
|
3
|
+
|
|
4
|
+
Imports organization data from Wikidata using SPARQL queries
|
|
5
|
+
into the embedding database for entity name matching.
|
|
6
|
+
|
|
7
|
+
Supports 35+ entity types across 4 categories:
|
|
8
|
+
|
|
9
|
+
Organizations (highest priority):
|
|
10
|
+
- Organizations, nonprofits, NGOs, foundations
|
|
11
|
+
- Government agencies, international organizations
|
|
12
|
+
- Political parties, trade unions
|
|
13
|
+
- Educational institutions, universities, research institutes
|
|
14
|
+
- Hospitals, sports clubs
|
|
15
|
+
|
|
16
|
+
Companies:
|
|
17
|
+
- Companies with LEI codes or stock tickers
|
|
18
|
+
- Public companies, business enterprises, corporations
|
|
19
|
+
- Subsidiaries, conglomerates
|
|
20
|
+
|
|
21
|
+
Industry-specific:
|
|
22
|
+
- Banks, insurance companies, investment companies
|
|
23
|
+
- Airlines, retailers, manufacturers
|
|
24
|
+
- Pharma, tech companies, law firms
|
|
25
|
+
- Record labels, film studios, video game companies
|
|
26
|
+
|
|
27
|
+
Property-based (catches untyped entities):
|
|
28
|
+
- Entities with CEO, subsidiaries, legal form
|
|
29
|
+
- Entities with employee count or revenue data
|
|
30
|
+
|
|
31
|
+
Uses the public Wikidata Query Service endpoint.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
import json
|
|
35
|
+
import logging
|
|
36
|
+
import time
|
|
37
|
+
import urllib.parse
|
|
38
|
+
import urllib.request
|
|
39
|
+
from typing import Any, Iterator, Optional
|
|
40
|
+
|
|
41
|
+
from ..models import CompanyRecord, EntityType
|
|
42
|
+
|
|
43
|
+
logger = logging.getLogger(__name__)
|
|
44
|
+
|
|
45
|
+
# Wikidata SPARQL endpoint
|
|
46
|
+
WIKIDATA_SPARQL_URL = "https://query.wikidata.org/sparql"
|
|
47
|
+
|
|
48
|
+
# Simpler SPARQL query - directly query for companies with LEI codes (fastest, most reliable)
|
|
49
|
+
# Avoids property path wildcards (wdt:P279*), which time out on Wikidata
|
|
50
|
+
LEI_COMPANY_QUERY = """
|
|
51
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
52
|
+
?company wdt:P1278 ?lei.
|
|
53
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
54
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
55
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
56
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
57
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
58
|
+
}
|
|
59
|
+
LIMIT %d
|
|
60
|
+
OFFSET %d
|
|
61
|
+
"""
|
|
62
|
+
|
|
63
|
+
# Query for companies with a stock exchange listing (P414); the ticker (P249) is optional
|
|
64
|
+
TICKER_COMPANY_QUERY = """
|
|
65
|
+
SELECT ?company ?companyLabel ?ticker ?exchange ?exchangeLabel ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
66
|
+
?company wdt:P414 ?exchange.
|
|
67
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
68
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
69
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
70
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
71
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
72
|
+
}
|
|
73
|
+
LIMIT %d
|
|
74
|
+
OFFSET %d
|
|
75
|
+
"""
|
|
76
|
+
|
|
77
|
+
# Query for direct instances of public company (Q891723) - no subclass traversal
|
|
78
|
+
PUBLIC_COMPANY_QUERY = """
|
|
79
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
80
|
+
?company wdt:P31 wd:Q891723.
|
|
81
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
82
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
83
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
84
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
85
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
86
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
87
|
+
}
|
|
88
|
+
LIMIT %d
|
|
89
|
+
OFFSET %d
|
|
90
|
+
"""
|
|
91
|
+
|
|
92
|
+
# Query for direct instances of business enterprise (Q4830453) - no subclass traversal
|
|
93
|
+
BUSINESS_QUERY = """
|
|
94
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
95
|
+
?company wdt:P31 wd:Q4830453.
|
|
96
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
97
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
98
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
99
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
100
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
101
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
102
|
+
}
|
|
103
|
+
LIMIT %d
|
|
104
|
+
OFFSET %d
|
|
105
|
+
"""
|
|
106
|
+
|
|
107
|
+
# Query for direct instances of organization (Q43229) - includes NGOs, gov agencies, etc.
|
|
108
|
+
ORGANIZATION_QUERY = """
|
|
109
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
110
|
+
?company wdt:P31 wd:Q43229.
|
|
111
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
112
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
113
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
114
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
115
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
116
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
117
|
+
}
|
|
118
|
+
LIMIT %d
|
|
119
|
+
OFFSET %d
|
|
120
|
+
"""
|
|
121
|
+
|
|
122
|
+
# Query for non-profit organizations (Q163740)
|
|
123
|
+
NONPROFIT_QUERY = """
|
|
124
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
125
|
+
?company wdt:P31 wd:Q163740.
|
|
126
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
127
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
128
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
129
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
130
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
131
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
132
|
+
}
|
|
133
|
+
LIMIT %d
|
|
134
|
+
OFFSET %d
|
|
135
|
+
"""
|
|
136
|
+
|
|
137
|
+
# Query for government agencies (Q327333)
|
|
138
|
+
GOV_AGENCY_QUERY = """
|
|
139
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
140
|
+
?company wdt:P31 wd:Q327333.
|
|
141
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
142
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
143
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
144
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
145
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
146
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
147
|
+
}
|
|
148
|
+
LIMIT %d
|
|
149
|
+
OFFSET %d
|
|
150
|
+
"""
|
|
151
|
+
|
|
152
|
+
# Query for enterprises (Q6881511) - broader than business enterprise
|
|
153
|
+
ENTERPRISE_QUERY = """
|
|
154
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
155
|
+
?company wdt:P31 wd:Q6881511.
|
|
156
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
157
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
158
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
159
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
160
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
161
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
162
|
+
}
|
|
163
|
+
LIMIT %d
|
|
164
|
+
OFFSET %d
|
|
165
|
+
"""
|
|
166
|
+
|
|
167
|
+
# Query for corporations (Q167037)
|
|
168
|
+
CORPORATION_QUERY = """
|
|
169
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
170
|
+
?company wdt:P31 wd:Q167037.
|
|
171
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
172
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
173
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
174
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
175
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
176
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
177
|
+
}
|
|
178
|
+
LIMIT %d
|
|
179
|
+
OFFSET %d
|
|
180
|
+
"""
|
|
181
|
+
|
|
182
|
+
# Query for subsidiaries (Q658255)
|
|
183
|
+
SUBSIDIARY_QUERY = """
|
|
184
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
185
|
+
?company wdt:P31 wd:Q658255.
|
|
186
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
187
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
188
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
189
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
190
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
191
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
192
|
+
}
|
|
193
|
+
LIMIT %d
|
|
194
|
+
OFFSET %d
|
|
195
|
+
"""
|
|
196
|
+
|
|
197
|
+
# Query for banks (Q22687)
|
|
198
|
+
BANK_QUERY = """
|
|
199
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
200
|
+
?company wdt:P31 wd:Q22687.
|
|
201
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
202
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
203
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
204
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
205
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
206
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
207
|
+
}
|
|
208
|
+
LIMIT %d
|
|
209
|
+
OFFSET %d
|
|
210
|
+
"""
|
|
211
|
+
|
|
212
|
+
# Query for insurance companies (Q1145276)
|
|
213
|
+
INSURANCE_QUERY = """
|
|
214
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
215
|
+
?company wdt:P31 wd:Q1145276.
|
|
216
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
217
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
218
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
219
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
220
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
221
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
222
|
+
}
|
|
223
|
+
LIMIT %d
|
|
224
|
+
OFFSET %d
|
|
225
|
+
"""
|
|
226
|
+
|
|
227
|
+
# Query for airlines (Q46970)
|
|
228
|
+
AIRLINE_QUERY = """
|
|
229
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
230
|
+
?company wdt:P31 wd:Q46970.
|
|
231
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
232
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
233
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
234
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
235
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
236
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
237
|
+
}
|
|
238
|
+
LIMIT %d
|
|
239
|
+
OFFSET %d
|
|
240
|
+
"""
|
|
241
|
+
|
|
242
|
+
# Query for law firms (Q613142)
|
|
243
|
+
LAW_FIRM_QUERY = """
|
|
244
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
245
|
+
?company wdt:P31 wd:Q613142.
|
|
246
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
247
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
248
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
249
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
250
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
251
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
252
|
+
}
|
|
253
|
+
LIMIT %d
|
|
254
|
+
OFFSET %d
|
|
255
|
+
"""
|
|
256
|
+
|
|
257
|
+
# Query for educational institutions (Q2385804)
|
|
258
|
+
EDUCATIONAL_QUERY = """
|
|
259
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
260
|
+
?company wdt:P31 wd:Q2385804.
|
|
261
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
262
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
263
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
264
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
265
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
266
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
267
|
+
}
|
|
268
|
+
LIMIT %d
|
|
269
|
+
OFFSET %d
|
|
270
|
+
"""
|
|
271
|
+
|
|
272
|
+
# Query for universities (Q3918)
|
|
273
|
+
UNIVERSITY_QUERY = """
|
|
274
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
275
|
+
?company wdt:P31 wd:Q3918.
|
|
276
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
277
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
278
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
279
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
280
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
281
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
282
|
+
}
|
|
283
|
+
LIMIT %d
|
|
284
|
+
OFFSET %d
|
|
285
|
+
"""
|
|
286
|
+
|
|
287
|
+
# Query for research institutes (Q31855)
|
|
288
|
+
RESEARCH_INSTITUTE_QUERY = """
|
|
289
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
290
|
+
?company wdt:P31 wd:Q31855.
|
|
291
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
292
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
293
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
294
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
295
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
296
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
297
|
+
}
|
|
298
|
+
LIMIT %d
|
|
299
|
+
OFFSET %d
|
|
300
|
+
"""
|
|
301
|
+
|
|
302
|
+
# Query for political parties (Q7278)
|
|
303
|
+
POLITICAL_PARTY_QUERY = """
|
|
304
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
305
|
+
?company wdt:P31 wd:Q7278.
|
|
306
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
307
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
308
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
309
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
310
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
311
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
312
|
+
}
|
|
313
|
+
LIMIT %d
|
|
314
|
+
OFFSET %d
|
|
315
|
+
"""
|
|
316
|
+
|
|
317
|
+
# Query for trade unions (Q178790)
|
|
318
|
+
TRADE_UNION_QUERY = """
|
|
319
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
320
|
+
?company wdt:P31 wd:Q178790.
|
|
321
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
322
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
323
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
324
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
325
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
326
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
327
|
+
}
|
|
328
|
+
LIMIT %d
|
|
329
|
+
OFFSET %d
|
|
330
|
+
"""
|
|
331
|
+
|
|
332
|
+
# Query for NGOs (Q79913)
|
|
333
|
+
NGO_QUERY = """
|
|
334
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
335
|
+
?company wdt:P31 wd:Q79913.
|
|
336
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
337
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
338
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
339
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
340
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
341
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
342
|
+
}
|
|
343
|
+
LIMIT %d
|
|
344
|
+
OFFSET %d
|
|
345
|
+
"""
|
|
346
|
+
|
|
347
|
+
# Query for foundations (Q157031)
|
|
348
|
+
FOUNDATION_QUERY = """
|
|
349
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
350
|
+
?company wdt:P31 wd:Q157031.
|
|
351
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
352
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
353
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
354
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
355
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
356
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
357
|
+
}
|
|
358
|
+
LIMIT %d
|
|
359
|
+
OFFSET %d
|
|
360
|
+
"""
|
|
361
|
+
|
|
362
|
+
# Query for international organizations (Q484652)
|
|
363
|
+
INTL_ORG_QUERY = """
|
|
364
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
365
|
+
?company wdt:P31 wd:Q484652.
|
|
366
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
367
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
368
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
369
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
370
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
371
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
372
|
+
}
|
|
373
|
+
LIMIT %d
|
|
374
|
+
OFFSET %d
|
|
375
|
+
"""
|
|
376
|
+
|
|
377
|
+
# Query for sports teams/clubs (Q476028)
|
|
378
|
+
SPORTS_CLUB_QUERY = """
|
|
379
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
380
|
+
?company wdt:P31 wd:Q476028.
|
|
381
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
382
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
383
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
384
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
385
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
386
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
387
|
+
}
|
|
388
|
+
LIMIT %d
|
|
389
|
+
OFFSET %d
|
|
390
|
+
"""
|
|
391
|
+
|
|
392
|
+
# Query for hospitals (Q16917)
|
|
393
|
+
HOSPITAL_QUERY = """
|
|
394
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
395
|
+
?company wdt:P31 wd:Q16917.
|
|
396
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
397
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
398
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
399
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
400
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
401
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
402
|
+
}
|
|
403
|
+
LIMIT %d
|
|
404
|
+
OFFSET %d
|
|
405
|
+
"""
|
|
406
|
+
|
|
407
|
+
# Query for record labels (Q18127)
|
|
408
|
+
RECORD_LABEL_QUERY = """
|
|
409
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
410
|
+
?company wdt:P31 wd:Q18127.
|
|
411
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
412
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
413
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
414
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
415
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
416
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
417
|
+
}
|
|
418
|
+
LIMIT %d
|
|
419
|
+
OFFSET %d
|
|
420
|
+
"""
|
|
421
|
+
|
|
422
|
+
# Query for film studios (Q1366047)
|
|
423
|
+
FILM_STUDIO_QUERY = """
|
|
424
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
425
|
+
?company wdt:P31 wd:Q1366047.
|
|
426
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
427
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
428
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
429
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
430
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
431
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
432
|
+
}
|
|
433
|
+
LIMIT %d
|
|
434
|
+
OFFSET %d
|
|
435
|
+
"""
|
|
436
|
+
|
|
437
|
+
# Query for video game companies (Q1137109)
|
|
438
|
+
VIDEO_GAME_COMPANY_QUERY = """
|
|
439
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
440
|
+
?company wdt:P31 wd:Q1137109.
|
|
441
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
442
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
443
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
444
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
445
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
446
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
447
|
+
}
|
|
448
|
+
LIMIT %d
|
|
449
|
+
OFFSET %d
|
|
450
|
+
"""
|
|
451
|
+
|
|
452
|
+
# Query for pharmaceutical companies (Q507619)
|
|
453
|
+
PHARMA_QUERY = """
|
|
454
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
455
|
+
?company wdt:P31 wd:Q507619.
|
|
456
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
457
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
458
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
459
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
460
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
461
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
462
|
+
}
|
|
463
|
+
LIMIT %d
|
|
464
|
+
OFFSET %d
|
|
465
|
+
"""
|
|
466
|
+
|
|
467
|
+
# Query for tech companies (Q2979960)
|
|
468
|
+
TECH_COMPANY_QUERY = """
|
|
469
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
470
|
+
?company wdt:P31 wd:Q2979960.
|
|
471
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
472
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
473
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
474
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
475
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
476
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
477
|
+
}
|
|
478
|
+
LIMIT %d
|
|
479
|
+
OFFSET %d
|
|
480
|
+
"""
|
|
481
|
+
|
|
482
|
+
# Query for retailers (Q1631111)
|
|
483
|
+
RETAILER_QUERY = """
|
|
484
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
485
|
+
?company wdt:P31 wd:Q1631111.
|
|
486
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
487
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
488
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
489
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
490
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
491
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
492
|
+
}
|
|
493
|
+
LIMIT %d
|
|
494
|
+
OFFSET %d
|
|
495
|
+
"""
|
|
496
|
+
|
|
497
|
+
# Query for manufacturers (Q187652)
|
|
498
|
+
MANUFACTURER_QUERY = """
|
|
499
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
500
|
+
?company wdt:P31 wd:Q187652.
|
|
501
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
502
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
503
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
504
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
505
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
506
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
507
|
+
}
|
|
508
|
+
LIMIT %d
|
|
509
|
+
OFFSET %d
|
|
510
|
+
"""
|
|
511
|
+
|
|
512
|
+
# Query for conglomerates (Q206652)
|
|
513
|
+
CONGLOMERATE_QUERY = """
|
|
514
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
515
|
+
?company wdt:P31 wd:Q206652.
|
|
516
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
517
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
518
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
519
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
520
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
521
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
522
|
+
}
|
|
523
|
+
LIMIT %d
|
|
524
|
+
OFFSET %d
|
|
525
|
+
"""
|
|
526
|
+
|
|
527
|
+
# Query for investment companies (Q380649)
|
|
528
|
+
INVESTMENT_COMPANY_QUERY = """
|
|
529
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
530
|
+
?company wdt:P31 wd:Q380649.
|
|
531
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
532
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
533
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
534
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
535
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
536
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
537
|
+
}
|
|
538
|
+
LIMIT %d
|
|
539
|
+
OFFSET %d
|
|
540
|
+
"""
|
|
541
|
+
|
|
542
|
+
# Property-based query: entities with a CEO (P169) - likely companies
|
|
543
|
+
HAS_CEO_QUERY = """
|
|
544
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
545
|
+
?company wdt:P169 ?ceo.
|
|
546
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
547
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
548
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
549
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
550
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
551
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
552
|
+
}
|
|
553
|
+
LIMIT %d
|
|
554
|
+
OFFSET %d
|
|
555
|
+
"""
|
|
556
|
+
|
|
557
|
+
# Property-based query: entities with subsidiaries (P355) - parent companies
|
|
558
|
+
HAS_SUBSIDIARIES_QUERY = """
|
|
559
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
560
|
+
?company wdt:P355 ?subsidiary.
|
|
561
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
562
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
563
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
564
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
565
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
566
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
567
|
+
}
|
|
568
|
+
LIMIT %d
|
|
569
|
+
OFFSET %d
|
|
570
|
+
"""
|
|
571
|
+
|
|
572
|
+
# Property-based query: entities owned by another entity (P127) - subsidiaries/companies
|
|
573
|
+
OWNED_BY_QUERY = """
|
|
574
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
575
|
+
?company wdt:P127 ?owner.
|
|
576
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
577
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
578
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
579
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
580
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
581
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
582
|
+
}
|
|
583
|
+
LIMIT %d
|
|
584
|
+
OFFSET %d
|
|
585
|
+
"""
|
|
586
|
+
|
|
587
|
+
# Property-based query: entities with legal form (P1454) - structured companies
|
|
588
|
+
HAS_LEGAL_FORM_QUERY = """
|
|
589
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
590
|
+
?company wdt:P1454 ?legalForm.
|
|
591
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
592
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
593
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
594
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
595
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
596
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
597
|
+
}
|
|
598
|
+
LIMIT %d
|
|
599
|
+
OFFSET %d
|
|
600
|
+
"""
|
|
601
|
+
|
|
602
|
+
# Property-based query: entities with an employee count (P1128) - organizations
|
|
603
|
+
HAS_EMPLOYEES_QUERY = """
|
|
604
|
+
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
|
|
605
|
+
?company wdt:P1128 ?employees.
|
|
606
|
+
OPTIONAL { ?company wdt:P1278 ?lei. }
|
|
607
|
+
OPTIONAL { ?company wdt:P249 ?ticker. }
|
|
608
|
+
OPTIONAL { ?company wdt:P17 ?country. }
|
|
609
|
+
OPTIONAL { ?company wdt:P571 ?inception. }
|
|
610
|
+
OPTIONAL { ?company wdt:P576 ?dissolution. }
|
|
611
|
+
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
|
|
612
|
+
}
|
|
613
|
+
LIMIT %d
|
|
614
|
+
OFFSET %d
|
|
615
|
+
"""
|
|
616
|
+
|
|
617
|
+
# Property-based query: entities with revenue (P2139) - companies
#
# Selects every ?company that has a wdt:P2139 statement and, where present, binds
# ?lei (P1278), ?ticker (P249), ?country (P17), ?inception (P571) and
# ?dissolution (P576). Labels come from the wikibase:label service in English.
#
# The two %d placeholders are, in order, LIMIT and OFFSET (filled with the
# `%` operator by the importer).
#
# NOTE(review): no ORDER BY is present, so paging order is not guaranteed by SPARQL.
HAS_REVENUE_QUERY = """
SELECT ?company ?companyLabel ?lei ?ticker ?country ?countryLabel ?inception ?dissolution WHERE {
?company wdt:P2139 ?revenue.
OPTIONAL { ?company wdt:P1278 ?lei. }
OPTIONAL { ?company wdt:P249 ?ticker. }
OPTIONAL { ?company wdt:P17 ?country. }
OPTIONAL { ?company wdt:P571 ?inception. }
OPTIONAL { ?company wdt:P576 ?dissolution. }
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
LIMIT %d
OFFSET %d
"""
|
|
631
|
+
|
|
632
|
+
|
|
633
|
+
# Query types available for import - organized by category.
#
# Each dict maps a query-type name (the value accepted as `query_type` by
# WikidataImporter.import_from_sparql) to the SPARQL template used for it.
# Insertion order matters: WikidataImporter._import_all_types iterates each dict
# in order, so the first entry of a category is queried first.

# Organization types (highest priority - run first)
ORG_QUERY_TYPES = {
    "organization": ORGANIZATION_QUERY,
    "nonprofit": NONPROFIT_QUERY,
    "ngo": NGO_QUERY,
    "foundation": FOUNDATION_QUERY,
    "government": GOV_AGENCY_QUERY,
    "intl_org": INTL_ORG_QUERY,
    "political_party": POLITICAL_PARTY_QUERY,
    "trade_union": TRADE_UNION_QUERY,
    "educational": EDUCATIONAL_QUERY,
    "university": UNIVERSITY_QUERY,
    "research_institute": RESEARCH_INSTITUTE_QUERY,
    "hospital": HOSPITAL_QUERY,
    "sports_club": SPORTS_CLUB_QUERY,
}

# Company types
COMPANY_QUERY_TYPES = {
    "lei": LEI_COMPANY_QUERY,
    "ticker": TICKER_COMPANY_QUERY,
    "public": PUBLIC_COMPANY_QUERY,
    "business": BUSINESS_QUERY,
    "enterprise": ENTERPRISE_QUERY,
    "corporation": CORPORATION_QUERY,
    "subsidiary": SUBSIDIARY_QUERY,
    "conglomerate": CONGLOMERATE_QUERY,
}

# Industry-specific company types
INDUSTRY_QUERY_TYPES = {
    "bank": BANK_QUERY,
    "insurance": INSURANCE_QUERY,
    "airline": AIRLINE_QUERY,
    "law_firm": LAW_FIRM_QUERY,
    "pharma": PHARMA_QUERY,
    "tech_company": TECH_COMPANY_QUERY,
    "retailer": RETAILER_QUERY,
    "manufacturer": MANUFACTURER_QUERY,
    "investment_company": INVESTMENT_COMPANY_QUERY,
    "record_label": RECORD_LABEL_QUERY,
    "film_studio": FILM_STUDIO_QUERY,
    "video_game_company": VIDEO_GAME_COMPANY_QUERY,
}

# Property-based queries (catches entities not typed correctly)
PROPERTY_QUERY_TYPES = {
    "has_ceo": HAS_CEO_QUERY,
    "has_subsidiaries": HAS_SUBSIDIARIES_QUERY,
    "owned_by": OWNED_BY_QUERY,
    "has_legal_form": HAS_LEGAL_FORM_QUERY,
    "has_employees": HAS_EMPLOYEES_QUERY,
    "has_revenue": HAS_REVENUE_QUERY,
}

# All query types combined.
# Built by unpacking the four category dicts in priority order; the key names are
# distinct across categories, so no entry overrides another.
QUERY_TYPES = {
    **ORG_QUERY_TYPES,
    **COMPANY_QUERY_TYPES,
    **INDUSTRY_QUERY_TYPES,
    **PROPERTY_QUERY_TYPES,
}
|
|
696
|
+
|
|
697
|
+
# Mapping from query type to EntityType.
#
# Every record produced by a given query type is tagged with the EntityType
# listed here. WikidataImporter.import_from_sparql looks the type up with
# `.get(query_type, EntityType.UNKNOWN)`, so a query type missing from this
# table falls back to EntityType.UNKNOWN rather than raising.
QUERY_TYPE_TO_ENTITY_TYPE: dict[str, EntityType] = {
    # Organizations
    "organization": EntityType.NONPROFIT,  # Generic org, default to nonprofit
    "nonprofit": EntityType.NONPROFIT,
    "ngo": EntityType.NGO,
    "foundation": EntityType.FOUNDATION,
    "government": EntityType.GOVERNMENT,
    "intl_org": EntityType.INTERNATIONAL_ORG,
    "political_party": EntityType.POLITICAL_PARTY,
    "trade_union": EntityType.TRADE_UNION,
    "educational": EntityType.EDUCATIONAL,
    "university": EntityType.EDUCATIONAL,
    "research_institute": EntityType.RESEARCH,
    "hospital": EntityType.HEALTHCARE,
    "sports_club": EntityType.SPORTS,

    # Companies
    "lei": EntityType.BUSINESS,
    "ticker": EntityType.BUSINESS,
    "public": EntityType.BUSINESS,
    "business": EntityType.BUSINESS,
    "enterprise": EntityType.BUSINESS,
    "corporation": EntityType.BUSINESS,
    "subsidiary": EntityType.BUSINESS,
    "conglomerate": EntityType.BUSINESS,

    # Industry-specific (business unless a more specific type is listed)
    "bank": EntityType.BUSINESS,
    "insurance": EntityType.BUSINESS,
    "airline": EntityType.BUSINESS,
    "law_firm": EntityType.BUSINESS,
    "pharma": EntityType.BUSINESS,
    "tech_company": EntityType.BUSINESS,
    "retailer": EntityType.BUSINESS,
    "manufacturer": EntityType.BUSINESS,
    "investment_company": EntityType.FUND,
    "record_label": EntityType.MEDIA,
    "film_studio": EntityType.MEDIA,
    "video_game_company": EntityType.MEDIA,

    # Property-based (assume business as they have CEO/revenue/etc)
    "has_ceo": EntityType.BUSINESS,
    "has_subsidiaries": EntityType.BUSINESS,
    "owned_by": EntityType.BUSINESS,
    "has_legal_form": EntityType.BUSINESS,
    "has_employees": EntityType.UNKNOWN,  # Could be any org type
    "has_revenue": EntityType.BUSINESS,
}
|
|
746
|
+
|
|
747
|
+
|
|
748
|
+
class WikidataImporter:
    """
    Importer for Wikidata organization data.

    Uses SPARQL queries against the public Wikidata Query Service
    to fetch organizations including companies, nonprofits, government agencies, etc.

    Query categories (run in this order with import_all=True):

    Organizations:
    - organization: All organizations (Q43229)
    - nonprofit: Non-profit organizations (Q163740)
    - ngo: NGOs (Q79913)
    - foundation: Foundations (Q157031)
    - government: Government agencies (Q327333)
    - intl_org: International organizations (Q484652)
    - political_party: Political parties (Q7278)
    - trade_union: Trade unions (Q178790)
    - educational: Educational institutions (Q2385804)
    - university: Universities (Q3918)
    - research_institute: Research institutes (Q31855)
    - hospital: Hospitals (Q16917)
    - sports_club: Sports clubs (Q476028)

    Companies:
    - lei: Companies with LEI codes
    - ticker: Companies with stock exchange listings
    - public: Public companies (Q891723)
    - business: Business enterprises (Q4830453)
    - enterprise: Enterprises (Q6881511)
    - corporation: Corporations (Q167037)
    - subsidiary: Subsidiaries (Q658255)
    - conglomerate: Conglomerates (Q206652)

    Industry-specific:
    - bank: Banks (Q22687)
    - insurance: Insurance companies (Q1145276)
    - airline: Airlines (Q46970)
    - law_firm: Law firms (Q613142)
    - pharma: Pharmaceutical companies (Q507619)
    - tech_company: Tech companies (Q2979960)
    - retailer: Retailers (Q1631111)
    - manufacturer: Manufacturers (Q187652)
    - investment_company: Investment companies (Q380649)
    - record_label: Record labels (Q18127)
    - film_studio: Film studios (Q1366047)
    - video_game_company: Video game companies (Q1137109)

    Property-based (catches untyped entities):
    - has_ceo: Entities with CEO (P169)
    - has_subsidiaries: Entities with subsidiaries (P355)
    - owned_by: Entities owned by another (P127)
    - has_legal_form: Entities with legal form (P1454)
    - has_employees: Entities with employee count (P1128)
    - has_revenue: Entities with revenue (P2139)
    """

    def __init__(self, batch_size: int = 1000, delay_seconds: float = 2.0, timeout: int = 120):
        """
        Initialize the Wikidata importer.

        Args:
            batch_size: Number of records to fetch per SPARQL query (default 1000)
            delay_seconds: Delay between requests to be polite to the endpoint
            timeout: HTTP timeout in seconds (default 120)
        """
        self._batch_size = batch_size
        self._delay = delay_seconds
        self._timeout = timeout

    def import_from_sparql(
        self,
        limit: Optional[int] = None,
        query_type: str = "lei",
        import_all: bool = False,
    ) -> Iterator[CompanyRecord]:
        """
        Import organization records from Wikidata via SPARQL.

        Results are fetched page by page (LIMIT/OFFSET) and de-duplicated by
        Wikidata ID within this call. A failed SPARQL request is logged and ends
        the import early instead of raising.

        Args:
            limit: Optional limit on total records. ``None`` or ``0`` means
                no limit.
            query_type: Which query to use (see class docstring for full list).
                Common options:
                - "lei": Companies with LEI codes (default, fastest)
                - "organization": All organizations (Q43229)
                - "nonprofit": Non-profit organizations (Q163740)
                - "government": Government agencies (Q327333)
                - "has_ceo": Entities with CEO property (catches many companies)
            import_all: If True, run all query types sequentially in priority order:
                1. Organization types (nonprofits, gov agencies, NGOs, etc.)
                2. Company types (public companies, business enterprises, etc.)
                3. Industry-specific types (banks, airlines, pharma, etc.)
                4. Property-based queries (catches entities not properly typed)
                ``query_type`` is ignored in that case.

        Yields:
            CompanyRecord for each organization

        Raises:
            ValueError: If ``query_type`` is not a key of QUERY_TYPES. Because
                this is a generator, the error surfaces when iteration starts.
        """
        if import_all:
            yield from self._import_all_types(limit)
            return

        if query_type not in QUERY_TYPES:
            raise ValueError(f"Unknown query type: {query_type}. Use one of: {list(QUERY_TYPES.keys())}")

        query_template = QUERY_TYPES[query_type]
        entity_type = QUERY_TYPE_TO_ENTITY_TYPE.get(query_type, EntityType.UNKNOWN)
        logger.info(f"Starting Wikidata company import via SPARQL (query_type={query_type}, entity_type={entity_type.value})...")

        offset = 0
        total_count = 0
        seen_ids: set[str] = set()  # Track seen Wikidata IDs to avoid duplicates

        while True:
            if limit and total_count >= limit:
                break

            # Never ask for more rows than are still needed to reach the limit.
            batch_limit = min(self._batch_size, limit - total_count) if limit else self._batch_size
            query = query_template % (batch_limit, offset)

            logger.info(f"Fetching Wikidata batch at offset {offset}...")

            try:
                results = self._execute_sparql(query)
            except Exception as e:
                # Best effort: keep what was already yielded and stop paging.
                logger.error(f"SPARQL query failed at offset {offset}: {e}")
                break

            bindings = results.get("results", {}).get("bindings", [])

            if not bindings:
                logger.info("No more results from Wikidata")
                break

            batch_count = 0
            for binding in bindings:
                if limit and total_count >= limit:
                    break

                record = self._parse_binding(binding, entity_type=entity_type)
                if record and record.source_id not in seen_ids:
                    seen_ids.add(record.source_id)
                    total_count += 1
                    batch_count += 1
                    yield record

            logger.info(f"Processed {batch_count} records from batch (total: {total_count})")

            if len(bindings) < batch_limit:
                # Last batch
                break

            # Advance by the rows actually returned. Using the configured batch
            # size here would skip rows whenever a smaller LIMIT was requested
            # and some of the returned rows were dropped (no label / duplicate).
            offset += len(bindings)

            # Be polite to the endpoint
            if self._delay > 0:
                time.sleep(self._delay)

        logger.info(f"Completed Wikidata import: {total_count} records")

    def _import_all_types(self, limit: Optional[int]) -> Iterator[CompanyRecord]:
        """Import from all query types sequentially, deduplicating across types.

        Query categories are run in priority order:
        1. Organization types (nonprofits, gov agencies, NGOs, etc.)
        2. Company types (public companies, business enterprises, etc.)
        3. Industry-specific types (banks, airlines, pharma, etc.)
        4. Property-based queries (catches entities not properly typed)

        Args:
            limit: Optional cap on the total number of records yielded across
                all query types. When set, it is split evenly across the four
                categories and then across the query types of each category.

        Yields:
            CompanyRecord objects whose ``source_id`` has not been yielded before.
        """
        seen_ids: set[str] = set()
        total_count = 0

        # Calculate per-category limits if a total limit is set.
        # Floor at 1: a share of 0 would be read as "no limit" downstream.
        num_categories = 4
        per_category_limit = max(1, limit // num_categories) if limit else None

        # Run categories in priority order: organizations first
        categories = [
            ("Organizations", ORG_QUERY_TYPES, per_category_limit),
            ("Companies", COMPANY_QUERY_TYPES, per_category_limit),
            ("Industry-specific", INDUSTRY_QUERY_TYPES, per_category_limit),
            ("Property-based", PROPERTY_QUERY_TYPES, per_category_limit),
        ]

        for category_name, query_types, category_limit in categories:
            logger.info(f"=== Starting category: {category_name} ({len(query_types)} query types) ===")
            category_count = 0
            # Same flooring as above so that no query type runs unbounded.
            per_type_limit = max(1, category_limit // len(query_types)) if category_limit else None

            for query_type in query_types:
                logger.info(f"Importing from query type: {query_type}")
                type_count = 0

                # Do not fetch more than is still missing from the total budget.
                type_limit = per_type_limit
                if limit and type_limit is not None:
                    type_limit = min(type_limit, limit - total_count)

                for record in self.import_from_sparql(limit=type_limit, query_type=query_type):
                    if record.source_id not in seen_ids:
                        seen_ids.add(record.source_id)
                        total_count += 1
                        type_count += 1
                        category_count += 1
                        yield record

                        if limit and total_count >= limit:
                            logger.info(f"Reached total limit of {limit} records")
                            return

                logger.info(f"Got {type_count} new records from {query_type} (total: {total_count})")

            logger.info(f"=== Completed {category_name}: {category_count} new records ===")

        logger.info(f"Completed all query types: {total_count} total records")

    @staticmethod
    def _parse_wikidata_date(date_str: Optional[str]) -> Optional[str]:
        """
        Parse a Wikidata date string into ISO format (YYYY-MM-DD).

        Wikidata returns dates like "2020-01-15T00:00:00Z" or just "2020".
        A four-digit year is expanded to January 1st of that year; a datetime
        is reduced to its date part; anything else is cut to its first
        10 characters.

        Returns None if the date cannot be parsed, i.e. when the result does not
        have the shape ``[+-]YYYY[-MM[-DD]]`` (this rejects non-date values such
        as placeholder identifiers).
        """
        if not date_str:
            return None

        # Handle year-only format (e.g., "2020")
        if len(date_str) == 4 and date_str.isdigit():
            return f"{date_str}-01-01"

        # Handle ISO datetime format (e.g., "2020-01-15T00:00:00Z"); otherwise
        # take the first 10 chars (YYYY-MM-DD).
        candidate = date_str.split("T")[0] if "T" in date_str else date_str[:10]

        # Shape check: optional sign, a year of at least four digits, then up to
        # two two-digit components separated by "-".
        unsigned = candidate[1:] if candidate[:1] in ("+", "-") else candidate
        year, *rest = unsigned.split("-")
        well_formed = (
            len(rest) <= 2
            and len(year) >= 4
            and year.isascii()
            and year.isdigit()
            and all(len(part) == 2 and part.isascii() and part.isdigit() for part in rest)
        )
        return candidate if well_formed else None

    def _execute_sparql(self, query: str) -> dict[str, Any]:
        """Execute a SPARQL query against Wikidata.

        Sends the query as a GET request to WIKIDATA_SPARQL_URL and returns the
        decoded JSON response. Errors raised by urllib or the JSON decoder are
        not caught here.
        """
        params = urllib.parse.urlencode({
            "query": query,
            "format": "json",
        })

        url = f"{WIKIDATA_SPARQL_URL}?{params}"

        req = urllib.request.Request(
            url,
            headers={
                "Accept": "application/sparql-results+json",
                "User-Agent": "corp-extractor/1.0 (company database builder)",
            }
        )

        with urllib.request.urlopen(req, timeout=self._timeout) as response:
            return json.loads(response.read().decode("utf-8"))

    def _parse_binding(
        self,
        binding: dict[str, Any],
        entity_type: EntityType = EntityType.UNKNOWN,
    ) -> Optional[CompanyRecord]:
        """Parse a SPARQL result binding into a CompanyRecord.

        Args:
            binding: One row of the SPARQL JSON result; each variable maps to a
                dict that carries its text under ``"value"``.
            entity_type: EntityType stored on the resulting record.

        Returns:
            The record, or None when the row has no entity URI, the URI does not
            end in a Q-identifier, there is no usable English label, or building
            the record fails.
        """
        def value_of(variable: str) -> Optional[str]:
            # Unbound variables are simply absent from the row.
            return binding.get(variable, {}).get("value")

        try:
            # Get Wikidata entity ID
            company_uri = value_of("company") or ""
            if not company_uri:
                return None

            # Extract QID from URI (e.g., "http://www.wikidata.org/entity/Q312" -> "Q312")
            wikidata_id = company_uri.split("/")[-1]
            if not wikidata_id.startswith("Q"):
                return None

            # Get label. Strip first so a whitespace-only label is rejected too.
            # The label service falls back to the QID when there is no English
            # label, hence the comparison with wikidata_id.
            label = (value_of("companyLabel") or "").strip()
            if not label or label == wikidata_id:  # Skip if no English label
                return None

            # Get optional fields
            country_label = value_of("countryLabel")

            # Parse dates (Wikidata returns ISO datetime, extract date part)
            from_date = WikidataImporter._parse_wikidata_date(value_of("inception"))
            to_date = WikidataImporter._parse_wikidata_date(value_of("dissolution"))

            # Build record data; optional entries are only stored when present.
            record_data: dict[str, Any] = {
                "wikidata_id": wikidata_id,
                "label": label,
            }
            optional_fields = {
                "lei": value_of("lei"),
                "ticker": value_of("ticker"),
                "exchange": value_of("exchangeLabel"),
                "country": country_label,
                "inception": from_date,
                "dissolution": to_date,
            }
            record_data.update({key: value for key, value in optional_fields.items() if value})

            return CompanyRecord(
                name=label,
                source="wikipedia",  # Use "wikipedia" as source per schema
                source_id=wikidata_id,
                region=country_label or "",
                entity_type=entity_type,
                from_date=from_date,
                to_date=to_date,
                record=record_data,
            )

        except Exception as e:
            # Best effort: a malformed row is skipped, not fatal for the import.
            logger.debug(f"Failed to parse Wikidata binding: {e}")
            return None

    def search_company(self, name: str, limit: int = 10) -> list[CompanyRecord]:
        """
        Search for a specific company by name.

        Queries the Wikidata ``wbsearchentities`` API and keeps the hits whose
        description contains one of a few company-related keywords. Network and
        decoding errors are not caught and propagate to the caller.

        Args:
            name: Company name to search for
            limit: Maximum results to request from the search API; the returned
                list can be shorter because non-company hits are filtered out

        Returns:
            List of matching CompanyRecords
        """
        # Use Wikidata search API for better name matching
        search_url = "https://www.wikidata.org/w/api.php"
        params = urllib.parse.urlencode({
            "action": "wbsearchentities",
            "search": name,
            "language": "en",
            "type": "item",
            "limit": limit,
            "format": "json",
        })

        req = urllib.request.Request(
            f"{search_url}?{params}",
            headers={"User-Agent": "corp-extractor/1.0"}
        )

        with urllib.request.urlopen(req, timeout=30) as response:
            data = json.loads(response.read().decode("utf-8"))

        # NOTE: plain substring match on the description, so short keywords such
        # as "inc" also match inside longer words.
        company_keywords = ("company", "corporation", "inc", "ltd", "enterprise", "business")

        results = []
        for item in data.get("search", []):
            qid = item.get("id")
            if not qid:
                # Without an entity ID there is nothing to use as source_id.
                continue

            label = item.get("label", "")
            description = item.get("description", "")

            # Check if it looks like a company
            lowered = description.lower()
            if not any(kw in lowered for kw in company_keywords):
                continue

            record = CompanyRecord(
                name=label,
                source="wikipedia",
                source_id=qid,
                region="",  # Not available from search API
                record={
                    "wikidata_id": qid,
                    "label": label,
                    "description": description,
                },
            )
            results.append(record)

        return results
|