corp-extractor 0.9.0__py3-none-any.whl → 0.9.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +72 -11
  2. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +34 -27
  3. statement_extractor/cli.py +1317 -101
  4. statement_extractor/database/embeddings.py +45 -0
  5. statement_extractor/database/hub.py +86 -136
  6. statement_extractor/database/importers/__init__.py +10 -2
  7. statement_extractor/database/importers/companies_house.py +16 -2
  8. statement_extractor/database/importers/companies_house_officers.py +431 -0
  9. statement_extractor/database/importers/gleif.py +23 -0
  10. statement_extractor/database/importers/import_utils.py +264 -0
  11. statement_extractor/database/importers/sec_edgar.py +17 -0
  12. statement_extractor/database/importers/sec_form4.py +512 -0
  13. statement_extractor/database/importers/wikidata.py +151 -43
  14. statement_extractor/database/importers/wikidata_dump.py +2282 -0
  15. statement_extractor/database/importers/wikidata_people.py +867 -325
  16. statement_extractor/database/migrate_v2.py +852 -0
  17. statement_extractor/database/models.py +155 -7
  18. statement_extractor/database/schema_v2.py +409 -0
  19. statement_extractor/database/seed_data.py +359 -0
  20. statement_extractor/database/store.py +3449 -233
  21. statement_extractor/document/deduplicator.py +10 -12
  22. statement_extractor/extractor.py +1 -1
  23. statement_extractor/models/__init__.py +3 -2
  24. statement_extractor/models/statement.py +15 -17
  25. statement_extractor/models.py +1 -1
  26. statement_extractor/pipeline/context.py +5 -5
  27. statement_extractor/pipeline/orchestrator.py +12 -12
  28. statement_extractor/plugins/base.py +17 -17
  29. statement_extractor/plugins/extractors/gliner2.py +28 -28
  30. statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
  31. statement_extractor/plugins/qualifiers/person.py +120 -53
  32. statement_extractor/plugins/splitters/t5_gemma.py +35 -39
  33. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
  34. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
statement_extractor/database/embeddings.py
@@ -121,6 +121,51 @@ class CompanyEmbedder:
         )
         return embeddings.astype(np.float32)
 
+    def quantize_to_int8(self, embedding: np.ndarray) -> np.ndarray:
+        """
+        Quantize L2-normalized float32 embedding to int8.
+
+        For normalized embeddings (values in [-1, 1]), this provides
+        75% storage reduction with ~92% recall at top-100.
+
+        Args:
+            embedding: L2-normalized float32 embedding vector
+
+        Returns:
+            int8 embedding vector
+        """
+        return np.clip(np.round(embedding * 127), -127, 127).astype(np.int8)
+
+    def embed_and_quantize(self, text: str) -> tuple[np.ndarray, np.ndarray]:
+        """
+        Embed text and return both float32 and int8 embeddings.
+
+        Args:
+            text: Text to embed
+
+        Returns:
+            Tuple of (float32_embedding, int8_embedding)
+        """
+        fp32 = self.embed(text)
+        return fp32, self.quantize_to_int8(fp32)
+
+    def embed_batch_and_quantize(
+        self, texts: list[str], batch_size: int = 32
+    ) -> tuple[np.ndarray, np.ndarray]:
+        """
+        Embed multiple texts and return both float32 and int8 embeddings.
+
+        Args:
+            texts: List of texts to embed
+            batch_size: Batch size for processing
+
+        Returns:
+            Tuple of (float32_embeddings, int8_embeddings) matrices
+        """
+        fp32 = self.embed_batch(texts, batch_size=batch_size)
+        int8 = np.array([self.quantize_to_int8(e) for e in fp32])
+        return fp32, int8
+
     def similarity(self, embedding1: np.ndarray, embedding2: np.ndarray) -> float:
         """
         Compute cosine similarity between two embeddings.
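Note: a minimal usage sketch of the new quantization helpers. The default CompanyEmbedder constructor and the dequantization step are assumptions for illustration; only embed_and_quantize, quantize_to_int8, and similarity are taken from the diff above.

    import numpy as np
    from statement_extractor.database.embeddings import CompanyEmbedder

    embedder = CompanyEmbedder()  # assumes the default constructor works without arguments
    fp32, int8 = embedder.embed_and_quantize("Acme Holdings plc")

    # int8 values are the normalized float32 values scaled by 127 and clipped,
    # so an approximate float vector can be recovered by dividing by 127.
    approx = int8.astype(np.float32) / 127.0
    print(embedder.similarity(fp32, approx))  # should be close to 1.0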
statement_extractor/database/hub.py
@@ -6,10 +6,8 @@ Provides functionality to:
 - Upload/publish database updates
 - Version management for database files
 - Create "lite" versions without full records for smaller downloads
-- Optional gzip compression for reduced file sizes
 """
 
-import gzip
 import logging
 import os
 import shutil
@@ -22,11 +20,9 @@ logger = logging.getLogger(__name__)
 
 # Default HuggingFace repo for entity database
 DEFAULT_REPO_ID = "Corp-o-Rate-Community/entity-references"
-DEFAULT_DB_FILENAME = "entities-lite.db"  # Lite is the default (smaller download)
-DEFAULT_DB_FULL_FILENAME = "entities.db"
-DEFAULT_DB_LITE_FILENAME = "entities-lite.db"
-DEFAULT_DB_COMPRESSED_FILENAME = "entities.db.gz"
-DEFAULT_DB_LITE_COMPRESSED_FILENAME = "entities-lite.db.gz"
+DEFAULT_DB_FILENAME = "entities-v2-lite.db"  # Lite is the default (smaller download)
+DEFAULT_DB_FULL_FILENAME = "entities-v2.db"
+DEFAULT_DB_LITE_FILENAME = "entities-v2-lite.db"
 
 # Local cache directory
 DEFAULT_CACHE_DIR = Path.home() / ".cache" / "corp-extractor"
@@ -59,7 +55,8 @@ def get_database_path(
     # Check common locations
     possible_paths = [
         cache_dir / filename,
-        cache_dir / "entities.db",
+        cache_dir / "entities-v2.db",
+        cache_dir / "entities.db",  # Legacy fallback
         Path.home() / ".cache" / "huggingface" / "hub" / f"datasets--{repo_id.replace('/', '--')}" / filename,
     ]
 
@@ -139,7 +136,7 @@ def upload_database(
         commit_message=commit_message,
     )
 
-    logger.info(f"Database uploaded successfully")
+    logger.info("Database uploaded successfully")
    return result
 
 
@@ -189,6 +186,33 @@ def check_for_updates(
     return latest != current_version, latest
 
 
+def vacuum_database(db_path: str | Path) -> None:
+    """
+    VACUUM the database to reclaim space and optimize it.
+
+    Args:
+        db_path: Path to the database file
+    """
+    db_path = Path(db_path)
+    if not db_path.exists():
+        raise FileNotFoundError(f"Database not found: {db_path}")
+
+    original_size = db_path.stat().st_size
+    logger.info(f"Running VACUUM on {db_path} ({original_size / (1024*1024):.1f}MB)")
+
+    # Use isolation_level=None for autocommit (required for VACUUM)
+    conn = sqlite3.connect(str(db_path), isolation_level=None)
+    try:
+        conn.execute("VACUUM")
+    finally:
+        conn.close()
+
+    new_size = db_path.stat().st_size
+    reduction = (1 - new_size / original_size) * 100
+
+    logger.info(f"After VACUUM: {new_size / (1024*1024):.1f}MB (reduced {reduction:.1f}%)")
+
+
 def create_lite_database(
     source_db_path: str | Path,
     output_path: Optional[str | Path] = None,
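Note: a minimal sketch of calling the new vacuum helper directly; the path below is a placeholder matching DEFAULT_CACHE_DIR and must point at an existing database file.

    from pathlib import Path
    from statement_extractor.database.hub import vacuum_database

    # VACUUM rewrites the SQLite file in place, reclaiming space freed by deletes/updates.
    db_path = Path.home() / ".cache" / "corp-extractor" / "entities-v2.db"
    vacuum_database(db_path)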
@@ -196,8 +220,10 @@ def create_lite_database(
     """
     Create a lite version of the database without full records.
 
-    The lite version strips the `record` column content (sets to empty {}),
-    significantly reducing file size while keeping embeddings and core fields.
+    The lite version:
+    - Strips the `record` column content (sets to empty {})
+    - Drops float32 embedding tables (keeps only scalar int8 embeddings)
+    - Significantly reduces file size (~75% reduction)
 
     Args:
         source_db_path: Path to the full database
@@ -206,6 +232,8 @@
     Returns:
         Path to the lite database
     """
+    import sqlite_vec
+
     source_db_path = Path(source_db_path)
     if not source_db_path.exists():
         raise FileNotFoundError(f"Source database not found: {source_db_path}")
@@ -223,14 +251,51 @@
     # Connect and strip record contents
     # Use isolation_level=None for autocommit (required for VACUUM)
     conn = sqlite3.connect(str(output_path), isolation_level=None)
+
+    # Load sqlite-vec extension (required for vec0 virtual tables)
+    conn.enable_load_extension(True)
+    sqlite_vec.load(conn)
+    conn.enable_load_extension(False)
+
     try:
         # Update all records to have empty record JSON
         conn.execute("BEGIN")
         cursor = conn.execute("UPDATE organizations SET record = '{}'")
         updated = cursor.rowcount
-        logger.info(f"Stripped {updated} record fields")
+        logger.info(f"Stripped {updated} organization record fields")
+
+        # Also strip people records if table exists
+        cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='people'")
+        if cursor.fetchone():
+            cursor = conn.execute("UPDATE people SET record = '{}'")
+            logger.info(f"Stripped {cursor.rowcount} people record fields")
+
         conn.execute("COMMIT")
 
+        # Drop float32 embedding tables (keep only scalar int8 for 75% storage savings)
+        # Check if scalar tables exist before dropping float32 tables
+        cursor = conn.execute(
+            "SELECT name FROM sqlite_master WHERE type='table' AND name='organization_embeddings_scalar'"
+        )
+        has_org_scalar = cursor.fetchone() is not None
+
+        cursor = conn.execute(
+            "SELECT name FROM sqlite_master WHERE type='table' AND name='person_embeddings_scalar'"
+        )
+        has_person_scalar = cursor.fetchone() is not None
+
+        if has_org_scalar:
+            logger.info("Dropping float32 organization_embeddings table (keeping scalar)")
+            conn.execute("DROP TABLE IF EXISTS organization_embeddings")
+        else:
+            logger.warning("No scalar organization embeddings found, keeping float32 table")
+
+        if has_person_scalar:
+            logger.info("Dropping float32 person_embeddings table (keeping scalar)")
+            conn.execute("DROP TABLE IF EXISTS person_embeddings")
+        else:
+            logger.warning("No scalar person embeddings found, keeping float32 table")
+
         # Vacuum to reclaim space (must be outside transaction)
         conn.execute("VACUUM")
     finally:
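Note: a minimal sketch of producing a lite variant from a local full database. Paths are placeholders; the sqlite-vec package must be installed, since the function loads it to handle the vec0 embedding tables.

    from pathlib import Path
    from statement_extractor.database.hub import create_lite_database

    full = Path("entities-v2.db")  # placeholder path to the full database
    lite = create_lite_database(full, Path("entities-v2-lite.db"))

    # Records are blanked and float32 embedding tables dropped, so the lite
    # file should come out substantially smaller than the full one.
    print(f"{full.stat().st_size / 1e6:.1f}MB -> {lite.stat().st_size / 1e6:.1f}MB")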
@@ -248,98 +313,20 @@
     return output_path
 
 
-def compress_database(
-    db_path: str | Path,
-    output_path: Optional[str | Path] = None,
-) -> Path:
-    """
-    Compress a database file using gzip.
-
-    Args:
-        db_path: Path to the database file
-        output_path: Output path for compressed file (default: adds .gz suffix)
-
-    Returns:
-        Path to the compressed file
-    """
-    db_path = Path(db_path)
-    if not db_path.exists():
-        raise FileNotFoundError(f"Database not found: {db_path}")
-
-    if output_path is None:
-        output_path = db_path.with_suffix(db_path.suffix + ".gz")
-    output_path = Path(output_path)
-
-    logger.info(f"Compressing {db_path} to {output_path}")
-
-    with open(db_path, "rb") as f_in:
-        with gzip.open(output_path, "wb", compresslevel=9) as f_out:
-            shutil.copyfileobj(f_in, f_out)
-
-    # Log compression results
-    original_size = db_path.stat().st_size
-    compressed_size = output_path.stat().st_size
-    ratio = (1 - compressed_size / original_size) * 100
-
-    logger.info(f"Original: {original_size / (1024*1024):.1f}MB")
-    logger.info(f"Compressed: {compressed_size / (1024*1024):.1f}MB")
-    logger.info(f"Compression ratio: {ratio:.1f}%")
-
-    return output_path
-
-
-def decompress_database(
-    compressed_path: str | Path,
-    output_path: Optional[str | Path] = None,
-) -> Path:
-    """
-    Decompress a gzipped database file.
-
-    Args:
-        compressed_path: Path to the .gz file
-        output_path: Output path (default: removes .gz suffix)
-
-    Returns:
-        Path to the decompressed file
-    """
-    compressed_path = Path(compressed_path)
-    if not compressed_path.exists():
-        raise FileNotFoundError(f"Compressed file not found: {compressed_path}")
-
-    if output_path is None:
-        if compressed_path.suffix == ".gz":
-            output_path = compressed_path.with_suffix("")
-        else:
-            output_path = compressed_path.with_stem(compressed_path.stem + "-decompressed")
-    output_path = Path(output_path)
-
-    logger.info(f"Decompressing {compressed_path} to {output_path}")
-
-    with gzip.open(compressed_path, "rb") as f_in:
-        with open(output_path, "wb") as f_out:
-            shutil.copyfileobj(f_in, f_out)
-
-    logger.info(f"Decompressed to {output_path}")
-    return output_path
-
-
 def upload_database_with_variants(
     db_path: str | Path,
     repo_id: str = DEFAULT_REPO_ID,
     commit_message: str = "Update entity database",
     token: Optional[str] = None,
     include_lite: bool = True,
-    include_compressed: bool = True,
     include_readme: bool = True,
 ) -> dict[str, str]:
     """
-    Upload entity database with optional lite and compressed variants.
+    Upload entity database with optional lite variant.
 
-    Creates and uploads:
-    - entities.db (full database)
-    - entities-lite.db (without record data, smaller)
-    - entities.db.gz (compressed full database)
-    - entities-lite.db.gz (compressed lite database)
+    First VACUUMs the database, then creates and uploads:
+    - entities-v2.db (full database with v2 normalized schema)
+    - entities-v2-lite.db (without record data, smaller)
     - README.md (dataset card from HUGGINGFACE_README.md)
 
     Args:
@@ -348,7 +335,6 @@ def upload_database_with_variants(
         commit_message: Git commit message
         token: HuggingFace API token
         include_lite: Whether to create and upload lite version
-        include_compressed: Whether to create and upload compressed versions
         include_readme: Whether to upload the README.md dataset card
 
     Returns:
@@ -383,6 +369,9 @@
     except Exception as e:
         logger.debug(f"Repo creation note: {e}")
 
+    # VACUUM the database first to optimize it
+    vacuum_database(db_path)
+
     results = {}
 
     # Create temp directory for variants
@@ -399,20 +388,6 @@
             create_lite_database(db_path, lite_path)
             files_to_upload.append((lite_path, DEFAULT_DB_LITE_FILENAME))
 
-        # Compressed versions
-        if include_compressed:
-            # Compress full database
-            compressed_path = temp_path / DEFAULT_DB_COMPRESSED_FILENAME
-            compress_database(db_path, compressed_path)
-            files_to_upload.append((compressed_path, DEFAULT_DB_COMPRESSED_FILENAME))
-
-            # Compress lite database
-            if include_lite:
-                lite_compressed_path = temp_path / DEFAULT_DB_LITE_COMPRESSED_FILENAME
-                lite_path = temp_path / DEFAULT_DB_LITE_FILENAME
-                compress_database(lite_path, lite_compressed_path)
-                files_to_upload.append((lite_compressed_path, DEFAULT_DB_LITE_COMPRESSED_FILENAME))
-
         # Copy all files to a staging directory for upload_folder
         staging_dir = temp_path / "staging"
         staging_dir.mkdir()
@@ -455,7 +430,6 @@ def download_database(
     revision: Optional[str] = None,
     cache_dir: Optional[Path] = None,
     force_download: bool = False,
-    prefer_compressed: bool = True,
 ) -> Path:
     """
     Download entity database from HuggingFace Hub.
@@ -466,10 +440,9 @@
         revision: Git revision (branch, tag, commit) or None for latest
         cache_dir: Local cache directory
         force_download: Force re-download even if cached
-        prefer_compressed: Try to download compressed version first
 
     Returns:
-        Path to the downloaded database file (decompressed if was .gz)
+        Path to the downloaded database file
     """
     try:
         from huggingface_hub import hf_hub_download
@@ -482,34 +455,11 @@
     cache_dir = cache_dir or DEFAULT_CACHE_DIR
     cache_dir.mkdir(parents=True, exist_ok=True)
 
-    # Try compressed version first if preferred
-    download_filename = filename
-    if prefer_compressed and not filename.endswith(".gz"):
-        compressed_filename = filename + ".gz"
-        try:
-            logger.info(f"Trying compressed version: {compressed_filename}")
-            local_path = hf_hub_download(
-                repo_id=repo_id,
-                filename=compressed_filename,
-                revision=revision,
-                cache_dir=str(cache_dir),
-                force_download=force_download,
-                repo_type="dataset",
-            )
-            # Decompress to final location
-            final_path = cache_dir / filename
-            decompress_database(local_path, final_path)
-            logger.info(f"Database downloaded and decompressed to {final_path}")
-            return final_path
-        except Exception as e:
-            logger.debug(f"Compressed version not available: {e}")
-
-    # Download uncompressed version
     logger.info(f"Downloading entity database from {repo_id}...")
 
     local_path = hf_hub_download(
         repo_id=repo_id,
-        filename=download_filename,
+        filename=filename,
         revision=revision,
         cache_dir=str(cache_dir),
         force_download=force_download,
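Note: a minimal sketch of fetching the new default database. It assumes repo_id and filename default to DEFAULT_REPO_ID and DEFAULT_DB_FILENAME (the leading parameters are not shown in this hunk), which the constants above suggest.

    from statement_extractor.database.hub import download_database

    # With no filename override this now resolves to entities-v2-lite.db,
    # downloaded from the Corp-o-Rate-Community/entity-references dataset repo.
    db_path = download_database()
    print(db_path)

    # The full database can still be requested explicitly:
    full_path = download_database(filename="entities-v2.db")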
statement_extractor/database/importers/__init__.py
@@ -4,21 +4,29 @@ Data importers for the entity database.
 Provides importers for various data sources:
 - GLEIF: Legal Entity Identifier data
 - SEC Edgar: US SEC company data
+- SEC Form 4: US SEC insider ownership data (officers/directors)
 - Companies House: UK company data
-- Wikidata: Wikipedia/Wikidata organization data
-- Wikidata People: Notable people from Wikipedia/Wikidata
+- Wikidata: Wikipedia/Wikidata organization data (SPARQL-based, may timeout)
+- Wikidata People: Notable people from Wikipedia/Wikidata (SPARQL-based, may timeout)
+- Wikidata Dump: Bulk import from Wikidata JSON dump (recommended for large imports)
 """
 
 from .gleif import GleifImporter
 from .sec_edgar import SecEdgarImporter
+from .sec_form4 import SecForm4Importer
 from .companies_house import CompaniesHouseImporter
+from .companies_house_officers import CompaniesHouseOfficersImporter
 from .wikidata import WikidataImporter
 from .wikidata_people import WikidataPeopleImporter
+from .wikidata_dump import WikidataDumpImporter
 
 __all__ = [
     "GleifImporter",
     "SecEdgarImporter",
+    "SecForm4Importer",
     "CompaniesHouseImporter",
+    "CompaniesHouseOfficersImporter",
     "WikidataImporter",
     "WikidataPeopleImporter",
+    "WikidataDumpImporter",
 ]
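Note: the three new importers are re-exported from the package, so callers can pull them in directly; constructor arguments are not shown in this diff and are therefore omitted here.

    from statement_extractor.database.importers import (
        CompaniesHouseOfficersImporter,
        SecForm4Importer,
        WikidataDumpImporter,
    )

    # All three are new in 0.9.4; the dump-based importer is the documented
    # choice for large Wikidata imports (no SPARQL timeouts).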
statement_extractor/database/importers/companies_house.py
@@ -342,13 +342,18 @@ class CompaniesHouseImporter:
         raw_company_type = item.get("company_type", "")
         entity_type = _get_entity_type_from_company_type(raw_company_type)
 
+        # Get dates
+        date_of_creation = item.get("date_of_creation")
+        date_of_cessation = item.get("date_of_cessation")  # For dissolved companies
+
         # Build record
         record_data = {
             "company_number": company_number,
             "title": title,
             "company_status": company_status,
             "company_type": raw_company_type,
-            "date_of_creation": item.get("date_of_creation"),
+            "date_of_creation": date_of_creation,
+            "date_of_cessation": date_of_cessation,
             "locality": locality,
             "region": region,
             "country": country,
@@ -360,6 +365,8 @@ class CompaniesHouseImporter:
             source_id=company_number,
             region=country,
             entity_type=entity_type,
+            from_date=date_of_creation,
+            to_date=date_of_cessation,
             record=record_data,
         )
 
@@ -397,12 +404,17 @@ class CompaniesHouseImporter:
         raw_company_type = row.get("CompanyCategory", "").strip()
         entity_type = _get_entity_type_from_company_type(raw_company_type)
 
+        # Get dates from CSV
+        date_of_creation = row.get("IncorporationDate", "").strip() or None
+        date_of_cessation = row.get("DissolutionDate", "").strip() or None
+
         record_data = {
             "company_number": company_number,
             "title": company_name,
             "company_status": company_status,
             "company_type": raw_company_type,
-            "date_of_creation": row.get("IncorporationDate", "").strip(),
+            "date_of_creation": date_of_creation,
+            "date_of_cessation": date_of_cessation,
             "country": row.get("CountryOfOrigin", "United Kingdom").strip(),
             "sic_code": row.get("SICCode.SicText_1", "").strip(),
         }
@@ -416,6 +428,8 @@ class CompaniesHouseImporter:
             source_id=company_number,
             region=region,
             entity_type=entity_type,
+            from_date=date_of_creation,
+            to_date=date_of_cessation,
             record=record_data,
         )
 
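Note: the CSV date handling above turns empty strings into None before the values are stored and passed as from_date/to_date; a tiny illustration with a made-up row.

    row = {"IncorporationDate": "1999-04-21", "DissolutionDate": ""}

    date_of_creation = row.get("IncorporationDate", "").strip() or None   # "1999-04-21"
    date_of_cessation = row.get("DissolutionDate", "").strip() or None    # None (still active)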