corp-extractor 0.9.0-py3-none-any.whl → 0.9.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +72 -11
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +34 -27
- statement_extractor/cli.py +1317 -101
- statement_extractor/database/embeddings.py +45 -0
- statement_extractor/database/hub.py +86 -136
- statement_extractor/database/importers/__init__.py +10 -2
- statement_extractor/database/importers/companies_house.py +16 -2
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +23 -0
- statement_extractor/database/importers/import_utils.py +264 -0
- statement_extractor/database/importers/sec_edgar.py +17 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +151 -43
- statement_extractor/database/importers/wikidata_dump.py +2282 -0
- statement_extractor/database/importers/wikidata_people.py +867 -325
- statement_extractor/database/migrate_v2.py +852 -0
- statement_extractor/database/models.py +155 -7
- statement_extractor/database/schema_v2.py +409 -0
- statement_extractor/database/seed_data.py +359 -0
- statement_extractor/database/store.py +3449 -233
- statement_extractor/document/deduplicator.py +10 -12
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +3 -2
- statement_extractor/models/statement.py +15 -17
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +12 -12
- statement_extractor/plugins/base.py +17 -17
- statement_extractor/plugins/extractors/gliner2.py +28 -28
- statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
- statement_extractor/plugins/qualifiers/person.py +120 -53
- statement_extractor/plugins/splitters/t5_gemma.py +35 -39
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
@@ -121,6 +121,51 @@ class CompanyEmbedder:
         )
         return embeddings.astype(np.float32)
 
+    def quantize_to_int8(self, embedding: np.ndarray) -> np.ndarray:
+        """
+        Quantize L2-normalized float32 embedding to int8.
+
+        For normalized embeddings (values in [-1, 1]), this provides
+        75% storage reduction with ~92% recall at top-100.
+
+        Args:
+            embedding: L2-normalized float32 embedding vector
+
+        Returns:
+            int8 embedding vector
+        """
+        return np.clip(np.round(embedding * 127), -127, 127).astype(np.int8)
+
+    def embed_and_quantize(self, text: str) -> tuple[np.ndarray, np.ndarray]:
+        """
+        Embed text and return both float32 and int8 embeddings.
+
+        Args:
+            text: Text to embed
+
+        Returns:
+            Tuple of (float32_embedding, int8_embedding)
+        """
+        fp32 = self.embed(text)
+        return fp32, self.quantize_to_int8(fp32)
+
+    def embed_batch_and_quantize(
+        self, texts: list[str], batch_size: int = 32
+    ) -> tuple[np.ndarray, np.ndarray]:
+        """
+        Embed multiple texts and return both float32 and int8 embeddings.
+
+        Args:
+            texts: List of texts to embed
+            batch_size: Batch size for processing
+
+        Returns:
+            Tuple of (float32_embeddings, int8_embeddings) matrices
+        """
+        fp32 = self.embed_batch(texts, batch_size=batch_size)
+        int8 = np.array([self.quantize_to_int8(e) for e in fp32])
+        return fp32, int8
+
     def similarity(self, embedding1: np.ndarray, embedding2: np.ndarray) -> float:
         """
         Compute cosine similarity between two embeddings.
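The new `quantize_to_int8` method added to statement_extractor/database/embeddings.py is plain symmetric scaling by 127. A minimal standalone sketch of the round trip (numpy only, independent of `CompanyEmbedder`; the 384-dimension size is just an illustrative assumption) shows why cosine similarity, and hence recall, survives quantization of L2-normalized vectors:

```python
import numpy as np

def quantize_to_int8(embedding: np.ndarray) -> np.ndarray:
    # Same symmetric scheme as in the diff: [-1, 1] floats map to [-127, 127] ints.
    return np.clip(np.round(embedding * 127), -127, 127).astype(np.int8)

rng = np.random.default_rng(0)
vec = rng.normal(size=384).astype(np.float32)
vec /= np.linalg.norm(vec)               # L2-normalize, as the docstring requires

q = quantize_to_int8(vec)
restored = q.astype(np.float32) / 127.0  # dequantize for comparison

cos = float(vec @ restored / (np.linalg.norm(vec) * np.linalg.norm(restored)))
print(f"{vec.nbytes} bytes -> {q.nbytes} bytes, cosine after round trip: {cos:.4f}")
```

Storage drops 4x (float32 to int8), which matches the "75% storage reduction" claimed in the docstring.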
@@ -6,10 +6,8 @@ Provides functionality to:
 - Upload/publish database updates
 - Version management for database files
 - Create "lite" versions without full records for smaller downloads
-- Optional gzip compression for reduced file sizes
 """
 
-import gzip
 import logging
 import os
 import shutil
@@ -22,11 +20,9 @@ logger = logging.getLogger(__name__)
 
 # Default HuggingFace repo for entity database
 DEFAULT_REPO_ID = "Corp-o-Rate-Community/entity-references"
-DEFAULT_DB_FILENAME = "entities-lite.db"  # Lite is the default (smaller download)
-DEFAULT_DB_FULL_FILENAME = "entities.db"
-DEFAULT_DB_LITE_FILENAME = "entities-lite.db"
-DEFAULT_DB_COMPRESSED_FILENAME = "entities.db.gz"
-DEFAULT_DB_LITE_COMPRESSED_FILENAME = "entities-lite.db.gz"
+DEFAULT_DB_FILENAME = "entities-v2-lite.db"  # Lite is the default (smaller download)
+DEFAULT_DB_FULL_FILENAME = "entities-v2.db"
+DEFAULT_DB_LITE_FILENAME = "entities-v2-lite.db"
 
 # Local cache directory
 DEFAULT_CACHE_DIR = Path.home() / ".cache" / "corp-extractor"
@@ -59,7 +55,8 @@ def get_database_path(
     # Check common locations
     possible_paths = [
         cache_dir / filename,
-        cache_dir / "entities.db",
+        cache_dir / "entities-v2.db",
+        cache_dir / "entities.db",  # Legacy fallback
         Path.home() / ".cache" / "huggingface" / "hub" / f"datasets--{repo_id.replace('/', '--')}" / filename,
     ]
 
@@ -139,7 +136,7 @@ def upload_database(
         commit_message=commit_message,
     )
 
-    logger.info(
+    logger.info("Database uploaded successfully")
     return result
 
 
@@ -189,6 +186,33 @@ check_for_updates(
     return latest != current_version, latest
 
 
+def vacuum_database(db_path: str | Path) -> None:
+    """
+    VACUUM the database to reclaim space and optimize it.
+
+    Args:
+        db_path: Path to the database file
+    """
+    db_path = Path(db_path)
+    if not db_path.exists():
+        raise FileNotFoundError(f"Database not found: {db_path}")
+
+    original_size = db_path.stat().st_size
+    logger.info(f"Running VACUUM on {db_path} ({original_size / (1024*1024):.1f}MB)")
+
+    # Use isolation_level=None for autocommit (required for VACUUM)
+    conn = sqlite3.connect(str(db_path), isolation_level=None)
+    try:
+        conn.execute("VACUUM")
+    finally:
+        conn.close()
+
+    new_size = db_path.stat().st_size
+    reduction = (1 - new_size / original_size) * 100
+
+    logger.info(f"After VACUUM: {new_size / (1024*1024):.1f}MB (reduced {reduction:.1f}%)")
+
+
 def create_lite_database(
     source_db_path: str | Path,
     output_path: Optional[str | Path] = None,
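The new `vacuum_database` helper in hub.py runs a single `VACUUM` on an autocommit connection. A hypothetical scratch-database demo (assuming corp-extractor 0.9.4 is installed so `statement_extractor.database.hub` is importable):

```python
import logging
import sqlite3
import tempfile
from pathlib import Path

from statement_extractor.database.hub import vacuum_database

logging.basicConfig(level=logging.INFO)   # so the before/after sizes get printed

db_path = Path(tempfile.mkdtemp()) / "scratch.db"
conn = sqlite3.connect(db_path)
conn.execute("CREATE TABLE t (x TEXT)")
conn.executemany("INSERT INTO t VALUES (?)", [("x" * 1024,)] * 10_000)
conn.commit()
conn.execute("DELETE FROM t")             # deleting rows leaves free pages behind
conn.commit()
conn.close()

vacuum_database(db_path)                  # reclaims the freed pages and logs the size change
```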
@@ -196,8 +220,10 @@ def create_lite_database(
     """
     Create a lite version of the database without full records.
 
-    The lite version
-
+    The lite version:
+    - Strips the `record` column content (sets to empty {})
+    - Drops float32 embedding tables (keeps only scalar int8 embeddings)
+    - Significantly reduces file size (~75% reduction)
 
     Args:
         source_db_path: Path to the full database
@@ -206,6 +232,8 @@ def create_lite_database(
     Returns:
         Path to the lite database
     """
+    import sqlite_vec
+
     source_db_path = Path(source_db_path)
     if not source_db_path.exists():
         raise FileNotFoundError(f"Source database not found: {source_db_path}")
@@ -223,14 +251,51 @@
     # Connect and strip record contents
     # Use isolation_level=None for autocommit (required for VACUUM)
     conn = sqlite3.connect(str(output_path), isolation_level=None)
+
+    # Load sqlite-vec extension (required for vec0 virtual tables)
+    conn.enable_load_extension(True)
+    sqlite_vec.load(conn)
+    conn.enable_load_extension(False)
+
     try:
         # Update all records to have empty record JSON
         conn.execute("BEGIN")
         cursor = conn.execute("UPDATE organizations SET record = '{}'")
         updated = cursor.rowcount
-        logger.info(f"Stripped {updated} record fields")
+        logger.info(f"Stripped {updated} organization record fields")
+
+        # Also strip people records if table exists
+        cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='people'")
+        if cursor.fetchone():
+            cursor = conn.execute("UPDATE people SET record = '{}'")
+            logger.info(f"Stripped {cursor.rowcount} people record fields")
+
         conn.execute("COMMIT")
 
+        # Drop float32 embedding tables (keep only scalar int8 for 75% storage savings)
+        # Check if scalar tables exist before dropping float32 tables
+        cursor = conn.execute(
+            "SELECT name FROM sqlite_master WHERE type='table' AND name='organization_embeddings_scalar'"
+        )
+        has_org_scalar = cursor.fetchone() is not None
+
+        cursor = conn.execute(
+            "SELECT name FROM sqlite_master WHERE type='table' AND name='person_embeddings_scalar'"
+        )
+        has_person_scalar = cursor.fetchone() is not None
+
+        if has_org_scalar:
+            logger.info("Dropping float32 organization_embeddings table (keeping scalar)")
+            conn.execute("DROP TABLE IF EXISTS organization_embeddings")
+        else:
+            logger.warning("No scalar organization embeddings found, keeping float32 table")
+
+        if has_person_scalar:
+            logger.info("Dropping float32 person_embeddings table (keeping scalar)")
+            conn.execute("DROP TABLE IF EXISTS person_embeddings")
+        else:
+            logger.warning("No scalar person embeddings found, keeping float32 table")
+
         # Vacuum to reclaim space (must be outside transaction)
         conn.execute("VACUUM")
     finally:
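The drop logic above only removes a float32 embedding table when its int8 counterpart exists. A standalone illustration of that existence-check-then-drop pattern on an in-memory database (plain `sqlite3`, no sqlite-vec; the tables here are ordinary stand-ins for the vec0 virtual tables):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE organization_embeddings (id INTEGER, vec BLOB)")
conn.execute("CREATE TABLE organization_embeddings_scalar (id INTEGER, vec BLOB)")

cursor = conn.execute(
    "SELECT name FROM sqlite_master WHERE type='table' AND name='organization_embeddings_scalar'"
)
if cursor.fetchone() is not None:
    # Only drop the float32 table when the int8 fallback actually exists.
    conn.execute("DROP TABLE IF EXISTS organization_embeddings")

print([row[0] for row in conn.execute("SELECT name FROM sqlite_master WHERE type='table'")])
conn.close()
```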
@@ -248,98 +313,20 @@ def create_lite_database(
     return output_path
 
 
-def compress_database(
-    db_path: str | Path,
-    output_path: Optional[str | Path] = None,
-) -> Path:
-    """
-    Compress a database file using gzip.
-
-    Args:
-        db_path: Path to the database file
-        output_path: Output path for compressed file (default: adds .gz suffix)
-
-    Returns:
-        Path to the compressed file
-    """
-    db_path = Path(db_path)
-    if not db_path.exists():
-        raise FileNotFoundError(f"Database not found: {db_path}")
-
-    if output_path is None:
-        output_path = db_path.with_suffix(db_path.suffix + ".gz")
-    output_path = Path(output_path)
-
-    logger.info(f"Compressing {db_path} to {output_path}")
-
-    with open(db_path, "rb") as f_in:
-        with gzip.open(output_path, "wb", compresslevel=9) as f_out:
-            shutil.copyfileobj(f_in, f_out)
-
-    # Log compression results
-    original_size = db_path.stat().st_size
-    compressed_size = output_path.stat().st_size
-    ratio = (1 - compressed_size / original_size) * 100
-
-    logger.info(f"Original: {original_size / (1024*1024):.1f}MB")
-    logger.info(f"Compressed: {compressed_size / (1024*1024):.1f}MB")
-    logger.info(f"Compression ratio: {ratio:.1f}%")
-
-    return output_path
-
-
-def decompress_database(
-    compressed_path: str | Path,
-    output_path: Optional[str | Path] = None,
-) -> Path:
-    """
-    Decompress a gzipped database file.
-
-    Args:
-        compressed_path: Path to the .gz file
-        output_path: Output path (default: removes .gz suffix)
-
-    Returns:
-        Path to the decompressed file
-    """
-    compressed_path = Path(compressed_path)
-    if not compressed_path.exists():
-        raise FileNotFoundError(f"Compressed file not found: {compressed_path}")
-
-    if output_path is None:
-        if compressed_path.suffix == ".gz":
-            output_path = compressed_path.with_suffix("")
-        else:
-            output_path = compressed_path.with_stem(compressed_path.stem + "-decompressed")
-    output_path = Path(output_path)
-
-    logger.info(f"Decompressing {compressed_path} to {output_path}")
-
-    with gzip.open(compressed_path, "rb") as f_in:
-        with open(output_path, "wb") as f_out:
-            shutil.copyfileobj(f_in, f_out)
-
-    logger.info(f"Decompressed to {output_path}")
-    return output_path
-
-
 def upload_database_with_variants(
     db_path: str | Path,
     repo_id: str = DEFAULT_REPO_ID,
     commit_message: str = "Update entity database",
     token: Optional[str] = None,
     include_lite: bool = True,
-    include_compressed: bool = True,
     include_readme: bool = True,
 ) -> dict[str, str]:
     """
-    Upload entity database with optional lite
+    Upload entity database with optional lite variant.
 
-
-    - entities.db (full database)
-    - entities-lite.db (without record data, smaller)
-    - entities.db.gz (compressed full database)
-    - entities-lite.db.gz (compressed lite database)
+    First VACUUMs the database, then creates and uploads:
+    - entities-v2.db (full database with v2 normalized schema)
+    - entities-v2-lite.db (without record data, smaller)
     - README.md (dataset card from HUGGINGFACE_README.md)
 
     Args:
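Based on the updated signature and docstring above, a hypothetical publish call would look like the sketch below (the database path and token are placeholders; write access to the HuggingFace dataset repo is required):

```python
from statement_extractor.database.hub import upload_database_with_variants

results = upload_database_with_variants(
    "entities-v2.db",                      # local full database (illustrative path)
    commit_message="Update entity database",
    token="hf_xxx",                        # placeholder HuggingFace token
    include_lite=True,                     # also build and upload entities-v2-lite.db
    include_readme=True,
)
print(results)
```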
@@ -348,7 +335,6 @@ def upload_database_with_variants(
         commit_message: Git commit message
         token: HuggingFace API token
         include_lite: Whether to create and upload lite version
-        include_compressed: Whether to create and upload compressed versions
         include_readme: Whether to upload the README.md dataset card
 
     Returns:
@@ -383,6 +369,9 @@ def upload_database_with_variants(
     except Exception as e:
         logger.debug(f"Repo creation note: {e}")
 
+    # VACUUM the database first to optimize it
+    vacuum_database(db_path)
+
     results = {}
 
     # Create temp directory for variants
@@ -399,20 +388,6 @@
         create_lite_database(db_path, lite_path)
         files_to_upload.append((lite_path, DEFAULT_DB_LITE_FILENAME))
 
-        # Compressed versions
-        if include_compressed:
-            # Compress full database
-            compressed_path = temp_path / DEFAULT_DB_COMPRESSED_FILENAME
-            compress_database(db_path, compressed_path)
-            files_to_upload.append((compressed_path, DEFAULT_DB_COMPRESSED_FILENAME))
-
-            # Compress lite database
-            if include_lite:
-                lite_compressed_path = temp_path / DEFAULT_DB_LITE_COMPRESSED_FILENAME
-                lite_path = temp_path / DEFAULT_DB_LITE_FILENAME
-                compress_database(lite_path, lite_compressed_path)
-                files_to_upload.append((lite_compressed_path, DEFAULT_DB_LITE_COMPRESSED_FILENAME))
-
         # Copy all files to a staging directory for upload_folder
         staging_dir = temp_path / "staging"
         staging_dir.mkdir()
@@ -455,7 +430,6 @@ def download_database(
     revision: Optional[str] = None,
     cache_dir: Optional[Path] = None,
     force_download: bool = False,
-    prefer_compressed: bool = True,
 ) -> Path:
     """
     Download entity database from HuggingFace Hub.
@@ -466,10 +440,9 @@
         revision: Git revision (branch, tag, commit) or None for latest
         cache_dir: Local cache directory
         force_download: Force re-download even if cached
-        prefer_compressed: Try to download compressed version first
 
     Returns:
-        Path to the downloaded database file
+        Path to the downloaded database file
     """
     try:
         from huggingface_hub import hf_hub_download
@@ -482,34 +455,11 @@
     cache_dir = cache_dir or DEFAULT_CACHE_DIR
     cache_dir.mkdir(parents=True, exist_ok=True)
 
-    # Try compressed version first if preferred
-    download_filename = filename
-    if prefer_compressed and not filename.endswith(".gz"):
-        compressed_filename = filename + ".gz"
-        try:
-            logger.info(f"Trying compressed version: {compressed_filename}")
-            local_path = hf_hub_download(
-                repo_id=repo_id,
-                filename=compressed_filename,
-                revision=revision,
-                cache_dir=str(cache_dir),
-                force_download=force_download,
-                repo_type="dataset",
-            )
-            # Decompress to final location
-            final_path = cache_dir / filename
-            decompress_database(local_path, final_path)
-            logger.info(f"Database downloaded and decompressed to {final_path}")
-            return final_path
-        except Exception as e:
-            logger.debug(f"Compressed version not available: {e}")
-
-    # Download uncompressed version
     logger.info(f"Downloading entity database from {repo_id}...")
 
     local_path = hf_hub_download(
         repo_id=repo_id,
-        filename=
+        filename=filename,
         revision=revision,
         cache_dir=str(cache_dir),
         force_download=force_download,
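With the gzip fallback removed, `download_database` now fetches the requested file directly from the Hub. A hypothetical call, assuming `repo_id` and `filename` default to `DEFAULT_REPO_ID` and `DEFAULT_DB_FILENAME` defined earlier in hub.py (i.e. the `entities-v2-lite.db` variant):

```python
from statement_extractor.database.hub import download_database

# Downloads (or reuses a cached copy of) entities-v2-lite.db from the default dataset repo.
db_path = download_database(force_download=False)
print(f"entity database available at: {db_path}")
```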
@@ -4,21 +4,29 @@ Data importers for the entity database.
 Provides importers for various data sources:
 - GLEIF: Legal Entity Identifier data
 - SEC Edgar: US SEC company data
+- SEC Form 4: US SEC insider ownership data (officers/directors)
 - Companies House: UK company data
-- Wikidata: Wikipedia/Wikidata organization data
-- Wikidata People: Notable people from Wikipedia/Wikidata
+- Wikidata: Wikipedia/Wikidata organization data (SPARQL-based, may timeout)
+- Wikidata People: Notable people from Wikipedia/Wikidata (SPARQL-based, may timeout)
+- Wikidata Dump: Bulk import from Wikidata JSON dump (recommended for large imports)
 """
 
 from .gleif import GleifImporter
 from .sec_edgar import SecEdgarImporter
+from .sec_form4 import SecForm4Importer
 from .companies_house import CompaniesHouseImporter
+from .companies_house_officers import CompaniesHouseOfficersImporter
 from .wikidata import WikidataImporter
 from .wikidata_people import WikidataPeopleImporter
+from .wikidata_dump import WikidataDumpImporter
 
 __all__ = [
     "GleifImporter",
     "SecEdgarImporter",
+    "SecForm4Importer",
     "CompaniesHouseImporter",
+    "CompaniesHouseOfficersImporter",
     "WikidataImporter",
     "WikidataPeopleImporter",
+    "WikidataDumpImporter",
 ]
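The three new importers are re-exported from the package `__init__`, so they can be imported alongside the existing ones. A minimal import check (constructor arguments are not shown in this diff, so nothing is instantiated; assumes the importer modules' own dependencies are installed):

```python
from statement_extractor.database.importers import (
    CompaniesHouseOfficersImporter,
    SecForm4Importer,
    WikidataDumpImporter,
)

print([cls.__name__ for cls in (SecForm4Importer, CompaniesHouseOfficersImporter, WikidataDumpImporter)])
```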
@@ -342,13 +342,18 @@ class CompaniesHouseImporter:
         raw_company_type = item.get("company_type", "")
         entity_type = _get_entity_type_from_company_type(raw_company_type)
 
+        # Get dates
+        date_of_creation = item.get("date_of_creation")
+        date_of_cessation = item.get("date_of_cessation")  # For dissolved companies
+
         # Build record
         record_data = {
             "company_number": company_number,
             "title": title,
             "company_status": company_status,
             "company_type": raw_company_type,
-            "date_of_creation":
+            "date_of_creation": date_of_creation,
+            "date_of_cessation": date_of_cessation,
             "locality": locality,
             "region": region,
             "country": country,
@@ -360,6 +365,8 @@ class CompaniesHouseImporter:
             source_id=company_number,
             region=country,
             entity_type=entity_type,
+            from_date=date_of_creation,
+            to_date=date_of_cessation,
             record=record_data,
         )
 
@@ -397,12 +404,17 @@ class CompaniesHouseImporter:
         raw_company_type = row.get("CompanyCategory", "").strip()
         entity_type = _get_entity_type_from_company_type(raw_company_type)
 
+        # Get dates from CSV
+        date_of_creation = row.get("IncorporationDate", "").strip() or None
+        date_of_cessation = row.get("DissolutionDate", "").strip() or None
+
         record_data = {
             "company_number": company_number,
             "title": company_name,
             "company_status": company_status,
             "company_type": raw_company_type,
-            "date_of_creation":
+            "date_of_creation": date_of_creation,
+            "date_of_cessation": date_of_cessation,
             "country": row.get("CountryOfOrigin", "United Kingdom").strip(),
             "sic_code": row.get("SICCode.SicText_1", "").strip(),
         }
@@ -416,6 +428,8 @@ class CompaniesHouseImporter:
             source_id=company_number,
             region=region,
             entity_type=entity_type,
+            from_date=date_of_creation,
+            to_date=date_of_cessation,
             record=record_data,
         )
 
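Both Companies House code paths now carry incorporation and dissolution dates through to `from_date`/`to_date`, normalizing blank CSV columns to `None` via `... or None`. A tiny illustration with a fake bulk-CSV row (the values are made up):

```python
row = {"IncorporationDate": "1999-04-01", "DissolutionDate": "   "}

# Blank or whitespace-only columns collapse to None; real dates pass through unchanged.
date_of_creation = row.get("IncorporationDate", "").strip() or None
date_of_cessation = row.get("DissolutionDate", "").strip() or None

print(date_of_creation, date_of_cessation)  # -> 1999-04-01 None
```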