corp-extractor 0.9.0__py3-none-any.whl → 0.9.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +40 -9
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/RECORD +29 -26
- statement_extractor/cli.py +866 -77
- statement_extractor/database/hub.py +35 -127
- statement_extractor/database/importers/__init__.py +10 -2
- statement_extractor/database/importers/companies_house.py +16 -2
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +23 -0
- statement_extractor/database/importers/sec_edgar.py +17 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +151 -43
- statement_extractor/database/importers/wikidata_dump.py +1951 -0
- statement_extractor/database/importers/wikidata_people.py +823 -325
- statement_extractor/database/models.py +30 -6
- statement_extractor/database/store.py +1485 -60
- statement_extractor/document/deduplicator.py +10 -12
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +3 -2
- statement_extractor/models/statement.py +15 -17
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +12 -12
- statement_extractor/plugins/base.py +17 -17
- statement_extractor/plugins/extractors/gliner2.py +28 -28
- statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
- statement_extractor/plugins/qualifiers/person.py +11 -1
- statement_extractor/plugins/splitters/t5_gemma.py +35 -39
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
statement_extractor/database/hub.py

@@ -6,10 +6,8 @@ Provides functionality to:
 - Upload/publish database updates
 - Version management for database files
 - Create "lite" versions without full records for smaller downloads
-- Optional gzip compression for reduced file sizes
 """
 
-import gzip
 import logging
 import os
 import shutil
@@ -25,8 +23,6 @@ DEFAULT_REPO_ID = "Corp-o-Rate-Community/entity-references"
 DEFAULT_DB_FILENAME = "entities-lite.db"  # Lite is the default (smaller download)
 DEFAULT_DB_FULL_FILENAME = "entities.db"
 DEFAULT_DB_LITE_FILENAME = "entities-lite.db"
-DEFAULT_DB_COMPRESSED_FILENAME = "entities.db.gz"
-DEFAULT_DB_LITE_COMPRESSED_FILENAME = "entities-lite.db.gz"
 
 # Local cache directory
 DEFAULT_CACHE_DIR = Path.home() / ".cache" / "corp-extractor"
@@ -139,7 +135,7 @@ def upload_database(
         commit_message=commit_message,
     )
 
-    logger.info(
+    logger.info("Database uploaded successfully")
     return result
 
 
@@ -189,6 +185,33 @@ def check_for_updates(
     return latest != current_version, latest
 
 
+def vacuum_database(db_path: str | Path) -> None:
+    """
+    VACUUM the database to reclaim space and optimize it.
+
+    Args:
+        db_path: Path to the database file
+    """
+    db_path = Path(db_path)
+    if not db_path.exists():
+        raise FileNotFoundError(f"Database not found: {db_path}")
+
+    original_size = db_path.stat().st_size
+    logger.info(f"Running VACUUM on {db_path} ({original_size / (1024*1024):.1f}MB)")
+
+    # Use isolation_level=None for autocommit (required for VACUUM)
+    conn = sqlite3.connect(str(db_path), isolation_level=None)
+    try:
+        conn.execute("VACUUM")
+    finally:
+        conn.close()
+
+    new_size = db_path.stat().st_size
+    reduction = (1 - new_size / original_size) * 100
+
+    logger.info(f"After VACUUM: {new_size / (1024*1024):.1f}MB (reduced {reduction:.1f}%)")
+
+
 def create_lite_database(
     source_db_path: str | Path,
     output_path: Optional[str | Path] = None,
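The new vacuum_database helper replaces the gzip pre-compression path: the file is compacted in place before upload rather than shipped as a .gz variant. A minimal standalone sketch of the same idea, using only the stdlib sqlite3 module and a hypothetical local entities.db path:

import sqlite3
from pathlib import Path

db = Path("entities.db")  # hypothetical local database copy
before = db.stat().st_size

# VACUUM cannot run inside a transaction, so open the connection in
# autocommit mode (isolation_level=None), as the added helper does.
conn = sqlite3.connect(str(db), isolation_level=None)
try:
    conn.execute("VACUUM")
finally:
    conn.close()

after = db.stat().st_size
print(f"{before / (1024 * 1024):.1f}MB -> {after / (1024 * 1024):.1f}MB")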
@@ -248,98 +271,20 @@ def create_lite_database(
     return output_path
 
 
-def compress_database(
-    db_path: str | Path,
-    output_path: Optional[str | Path] = None,
-) -> Path:
-    """
-    Compress a database file using gzip.
-
-    Args:
-        db_path: Path to the database file
-        output_path: Output path for compressed file (default: adds .gz suffix)
-
-    Returns:
-        Path to the compressed file
-    """
-    db_path = Path(db_path)
-    if not db_path.exists():
-        raise FileNotFoundError(f"Database not found: {db_path}")
-
-    if output_path is None:
-        output_path = db_path.with_suffix(db_path.suffix + ".gz")
-    output_path = Path(output_path)
-
-    logger.info(f"Compressing {db_path} to {output_path}")
-
-    with open(db_path, "rb") as f_in:
-        with gzip.open(output_path, "wb", compresslevel=9) as f_out:
-            shutil.copyfileobj(f_in, f_out)
-
-    # Log compression results
-    original_size = db_path.stat().st_size
-    compressed_size = output_path.stat().st_size
-    ratio = (1 - compressed_size / original_size) * 100
-
-    logger.info(f"Original: {original_size / (1024*1024):.1f}MB")
-    logger.info(f"Compressed: {compressed_size / (1024*1024):.1f}MB")
-    logger.info(f"Compression ratio: {ratio:.1f}%")
-
-    return output_path
-
-
-def decompress_database(
-    compressed_path: str | Path,
-    output_path: Optional[str | Path] = None,
-) -> Path:
-    """
-    Decompress a gzipped database file.
-
-    Args:
-        compressed_path: Path to the .gz file
-        output_path: Output path (default: removes .gz suffix)
-
-    Returns:
-        Path to the decompressed file
-    """
-    compressed_path = Path(compressed_path)
-    if not compressed_path.exists():
-        raise FileNotFoundError(f"Compressed file not found: {compressed_path}")
-
-    if output_path is None:
-        if compressed_path.suffix == ".gz":
-            output_path = compressed_path.with_suffix("")
-        else:
-            output_path = compressed_path.with_stem(compressed_path.stem + "-decompressed")
-    output_path = Path(output_path)
-
-    logger.info(f"Decompressing {compressed_path} to {output_path}")
-
-    with gzip.open(compressed_path, "rb") as f_in:
-        with open(output_path, "wb") as f_out:
-            shutil.copyfileobj(f_in, f_out)
-
-    logger.info(f"Decompressed to {output_path}")
-    return output_path
-
-
 def upload_database_with_variants(
     db_path: str | Path,
     repo_id: str = DEFAULT_REPO_ID,
     commit_message: str = "Update entity database",
     token: Optional[str] = None,
     include_lite: bool = True,
-    include_compressed: bool = True,
     include_readme: bool = True,
 ) -> dict[str, str]:
     """
-    Upload entity database with optional lite
+    Upload entity database with optional lite variant.
 
-
+    First VACUUMs the database, then creates and uploads:
     - entities.db (full database)
     - entities-lite.db (without record data, smaller)
-    - entities.db.gz (compressed full database)
-    - entities-lite.db.gz (compressed lite database)
     - README.md (dataset card from HUGGINGFACE_README.md)
 
     Args:
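With the compressed variants gone, a publish call only has to decide whether to include the lite copy and the README. A sketch, not a documented invocation, of how the trimmed-down signature above might be driven (the db_path value is a placeholder and token is left at its optional default):

from statement_extractor.database import hub

# Sketch only: vacuums the local file, then uploads entities.db,
# entities-lite.db and README.md to the default dataset repo.
results = hub.upload_database_with_variants(
    db_path="entities.db",  # placeholder path
    commit_message="Update entity database",
    include_lite=True,
    include_readme=True,
)
print(results)  # dict[str, str] per the return annotation above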
@@ -348,7 +293,6 @@ def upload_database_with_variants(
         commit_message: Git commit message
         token: HuggingFace API token
         include_lite: Whether to create and upload lite version
-        include_compressed: Whether to create and upload compressed versions
         include_readme: Whether to upload the README.md dataset card
 
     Returns:
@@ -383,6 +327,9 @@ def upload_database_with_variants(
     except Exception as e:
         logger.debug(f"Repo creation note: {e}")
 
+    # VACUUM the database first to optimize it
+    vacuum_database(db_path)
+
     results = {}
 
     # Create temp directory for variants
@@ -399,20 +346,6 @@ def upload_database_with_variants(
         create_lite_database(db_path, lite_path)
         files_to_upload.append((lite_path, DEFAULT_DB_LITE_FILENAME))
 
-        # Compressed versions
-        if include_compressed:
-            # Compress full database
-            compressed_path = temp_path / DEFAULT_DB_COMPRESSED_FILENAME
-            compress_database(db_path, compressed_path)
-            files_to_upload.append((compressed_path, DEFAULT_DB_COMPRESSED_FILENAME))
-
-            # Compress lite database
-            if include_lite:
-                lite_compressed_path = temp_path / DEFAULT_DB_LITE_COMPRESSED_FILENAME
-                lite_path = temp_path / DEFAULT_DB_LITE_FILENAME
-                compress_database(lite_path, lite_compressed_path)
-                files_to_upload.append((lite_compressed_path, DEFAULT_DB_LITE_COMPRESSED_FILENAME))
-
         # Copy all files to a staging directory for upload_folder
         staging_dir = temp_path / "staging"
         staging_dir.mkdir()
@@ -455,7 +388,6 @@ def download_database(
     revision: Optional[str] = None,
     cache_dir: Optional[Path] = None,
     force_download: bool = False,
-    prefer_compressed: bool = True,
 ) -> Path:
     """
     Download entity database from HuggingFace Hub.
@@ -466,10 +398,9 @@ def download_database(
         revision: Git revision (branch, tag, commit) or None for latest
         cache_dir: Local cache directory
         force_download: Force re-download even if cached
-        prefer_compressed: Try to download compressed version first
 
     Returns:
-        Path to the downloaded database file
+        Path to the downloaded database file
     """
     try:
         from huggingface_hub import hf_hub_download
@@ -482,34 +413,11 @@ def download_database(
     cache_dir = cache_dir or DEFAULT_CACHE_DIR
     cache_dir.mkdir(parents=True, exist_ok=True)
 
-    # Try compressed version first if preferred
-    download_filename = filename
-    if prefer_compressed and not filename.endswith(".gz"):
-        compressed_filename = filename + ".gz"
-        try:
-            logger.info(f"Trying compressed version: {compressed_filename}")
-            local_path = hf_hub_download(
-                repo_id=repo_id,
-                filename=compressed_filename,
-                revision=revision,
-                cache_dir=str(cache_dir),
-                force_download=force_download,
-                repo_type="dataset",
-            )
-            # Decompress to final location
-            final_path = cache_dir / filename
-            decompress_database(local_path, final_path)
-            logger.info(f"Database downloaded and decompressed to {final_path}")
-            return final_path
-        except Exception as e:
-            logger.debug(f"Compressed version not available: {e}")
-
-    # Download uncompressed version
     logger.info(f"Downloading entity database from {repo_id}...")
 
     local_path = hf_hub_download(
         repo_id=repo_id,
-        filename=
+        filename=filename,
         revision=revision,
         cache_dir=str(cache_dir),
         force_download=force_download,
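With the .gz fallback removed, a download reduces to a single hf_hub_download call for the plain .db file. A minimal sketch using the defaults visible above (DEFAULT_REPO_ID and the lite filename), assuming huggingface_hub is installed:

from pathlib import Path
from huggingface_hub import hf_hub_download

cache_dir = Path.home() / ".cache" / "corp-extractor"
cache_dir.mkdir(parents=True, exist_ok=True)

# One plain download: no compressed-first attempt, no decompress step.
local_path = hf_hub_download(
    repo_id="Corp-o-Rate-Community/entity-references",
    filename="entities-lite.db",
    repo_type="dataset",
    cache_dir=str(cache_dir),
)
print(local_path)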
statement_extractor/database/importers/__init__.py

@@ -4,21 +4,29 @@ Data importers for the entity database.
 Provides importers for various data sources:
 - GLEIF: Legal Entity Identifier data
 - SEC Edgar: US SEC company data
+- SEC Form 4: US SEC insider ownership data (officers/directors)
 - Companies House: UK company data
-- Wikidata: Wikipedia/Wikidata organization data
-- Wikidata People: Notable people from Wikipedia/Wikidata
+- Wikidata: Wikipedia/Wikidata organization data (SPARQL-based, may timeout)
+- Wikidata People: Notable people from Wikipedia/Wikidata (SPARQL-based, may timeout)
+- Wikidata Dump: Bulk import from Wikidata JSON dump (recommended for large imports)
 """
 
 from .gleif import GleifImporter
 from .sec_edgar import SecEdgarImporter
+from .sec_form4 import SecForm4Importer
 from .companies_house import CompaniesHouseImporter
+from .companies_house_officers import CompaniesHouseOfficersImporter
 from .wikidata import WikidataImporter
 from .wikidata_people import WikidataPeopleImporter
+from .wikidata_dump import WikidataDumpImporter
 
 __all__ = [
     "GleifImporter",
     "SecEdgarImporter",
+    "SecForm4Importer",
     "CompaniesHouseImporter",
+    "CompaniesHouseOfficersImporter",
     "WikidataImporter",
     "WikidataPeopleImporter",
+    "WikidataDumpImporter",
 ]
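A quick smoke test of the expanded export surface: the three new importers should be re-exported from the package, with the names taken from the __all__ list above.

import statement_extractor.database.importers as importers

new_importers = {
    "SecForm4Importer",
    "CompaniesHouseOfficersImporter",
    "WikidataDumpImporter",
}
# All three new classes should appear in the 0.9.3 export list.
missing = new_importers - set(importers.__all__)
assert not missing, f"missing exports: {missing}"
print(sorted(importers.__all__))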
statement_extractor/database/importers/companies_house.py

@@ -342,13 +342,18 @@ class CompaniesHouseImporter:
         raw_company_type = item.get("company_type", "")
         entity_type = _get_entity_type_from_company_type(raw_company_type)
 
+        # Get dates
+        date_of_creation = item.get("date_of_creation")
+        date_of_cessation = item.get("date_of_cessation")  # For dissolved companies
+
         # Build record
         record_data = {
             "company_number": company_number,
             "title": title,
             "company_status": company_status,
             "company_type": raw_company_type,
-            "date_of_creation":
+            "date_of_creation": date_of_creation,
+            "date_of_cessation": date_of_cessation,
             "locality": locality,
             "region": region,
             "country": country,
@@ -360,6 +365,8 @@ class CompaniesHouseImporter:
             source_id=company_number,
             region=country,
             entity_type=entity_type,
+            from_date=date_of_creation,
+            to_date=date_of_cessation,
             record=record_data,
         )
 
@@ -397,12 +404,17 @@ class CompaniesHouseImporter:
         raw_company_type = row.get("CompanyCategory", "").strip()
         entity_type = _get_entity_type_from_company_type(raw_company_type)
 
+        # Get dates from CSV
+        date_of_creation = row.get("IncorporationDate", "").strip() or None
+        date_of_cessation = row.get("DissolutionDate", "").strip() or None
+
         record_data = {
             "company_number": company_number,
             "title": company_name,
             "company_status": company_status,
             "company_type": raw_company_type,
-            "date_of_creation":
+            "date_of_creation": date_of_creation,
+            "date_of_cessation": date_of_cessation,
             "country": row.get("CountryOfOrigin", "United Kingdom").strip(),
             "sic_code": row.get("SICCode.SicText_1", "").strip(),
         }
@@ -416,6 +428,8 @@ class CompaniesHouseImporter:
             source_id=company_number,
             region=region,
             entity_type=entity_type,
+            from_date=date_of_creation,
+            to_date=date_of_cessation,
             record=record_data,
         )
 
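The CSV path normalises empty cells to None before they reach the record and the from_date/to_date fields, so a blank DissolutionDate is stored as a missing date rather than an empty string. The pattern in isolation, with hypothetical rows shaped like csv.DictReader output:

# Hypothetical rows in the shape produced by csv.DictReader.
rows = [
    {"IncorporationDate": "1999-04-01", "DissolutionDate": ""},
    {"IncorporationDate": "  ", "DissolutionDate": "2021-07-15"},
]

for row in rows:
    # ".strip() or None" turns empty or whitespace-only cells into None,
    # matching the importer's date handling above.
    date_of_creation = row.get("IncorporationDate", "").strip() or None
    date_of_cessation = row.get("DissolutionDate", "").strip() or None
    print(date_of_creation, date_of_cessation)
# -> 1999-04-01 None
# -> None 2021-07-15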