corp-extractor 0.5.0__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
  2. corp_extractor-0.9.3.dist-info/RECORD +79 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +2030 -24
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +428 -0
  9. statement_extractor/database/importers/__init__.py +32 -0
  10. statement_extractor/database/importers/companies_house.py +559 -0
  11. statement_extractor/database/importers/companies_house_officers.py +431 -0
  12. statement_extractor/database/importers/gleif.py +561 -0
  13. statement_extractor/database/importers/sec_edgar.py +392 -0
  14. statement_extractor/database/importers/sec_form4.py +512 -0
  15. statement_extractor/database/importers/wikidata.py +1120 -0
  16. statement_extractor/database/importers/wikidata_dump.py +1951 -0
  17. statement_extractor/database/importers/wikidata_people.py +1130 -0
  18. statement_extractor/database/models.py +254 -0
  19. statement_extractor/database/resolver.py +245 -0
  20. statement_extractor/database/store.py +3034 -0
  21. statement_extractor/document/__init__.py +62 -0
  22. statement_extractor/document/chunker.py +410 -0
  23. statement_extractor/document/context.py +171 -0
  24. statement_extractor/document/deduplicator.py +171 -0
  25. statement_extractor/document/html_extractor.py +246 -0
  26. statement_extractor/document/loader.py +303 -0
  27. statement_extractor/document/pipeline.py +388 -0
  28. statement_extractor/document/summarizer.py +195 -0
  29. statement_extractor/extractor.py +1 -1
  30. statement_extractor/models/__init__.py +19 -3
  31. statement_extractor/models/canonical.py +44 -1
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/labels.py +47 -18
  34. statement_extractor/models/qualifiers.py +51 -3
  35. statement_extractor/models/statement.py +39 -15
  36. statement_extractor/models.py +1 -1
  37. statement_extractor/pipeline/config.py +6 -11
  38. statement_extractor/pipeline/context.py +5 -5
  39. statement_extractor/pipeline/orchestrator.py +90 -121
  40. statement_extractor/pipeline/registry.py +52 -46
  41. statement_extractor/plugins/__init__.py +20 -8
  42. statement_extractor/plugins/base.py +348 -78
  43. statement_extractor/plugins/extractors/gliner2.py +38 -28
  44. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  45. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  46. statement_extractor/plugins/pdf/__init__.py +10 -0
  47. statement_extractor/plugins/pdf/pypdf.py +291 -0
  48. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  49. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  50. statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
  51. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  52. statement_extractor/plugins/qualifiers/person.py +588 -14
  53. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  54. statement_extractor/plugins/scrapers/__init__.py +10 -0
  55. statement_extractor/plugins/scrapers/http.py +236 -0
  56. statement_extractor/plugins/splitters/t5_gemma.py +176 -75
  57. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  58. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  59. statement_extractor/scoring.py +8 -8
  60. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  61. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  62. statement_extractor/plugins/canonicalizers/base.py +0 -9
  63. statement_extractor/plugins/canonicalizers/location.py +0 -219
  64. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  65. statement_extractor/plugins/canonicalizers/person.py +0 -242
  66. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
  67. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
statement_extractor/database/__init__.py
@@ -0,0 +1,52 @@
+ """
+ Entity/Organization database module for embedding-based entity qualification.
+
+ Provides:
+ - CompanyRecord: Pydantic model for organization records
+ - PersonRecord: Pydantic model for person records
+ - OrganizationDatabase: sqlite-vec database for org embedding search
+ - PersonDatabase: sqlite-vec database for person embedding search
+ - CompanyEmbedder: Embedding service using Gemma3
+ - Hub functions: Download/upload database from HuggingFace
+ """
+
+ from .models import CompanyRecord, CompanyMatch, DatabaseStats, PersonRecord, PersonMatch, PersonType
+ from .store import OrganizationDatabase, get_database, PersonDatabase, get_person_database
+ from .embeddings import CompanyEmbedder, get_embedder
+ from .hub import (
+     download_database,
+     get_database_path,
+     upload_database,
+     upload_database_with_variants,
+ )
+ from .resolver import OrganizationResolver, get_organization_resolver
+
+ # Backwards compatibility alias
+ CompanyDatabase = OrganizationDatabase
+
+ __all__ = [
+     # Organization models
+     "CompanyRecord",
+     "CompanyMatch",
+     "DatabaseStats",
+     "OrganizationDatabase",
+     "CompanyDatabase",  # Backwards compatibility alias
+     "get_database",
+     # Person models
+     "PersonRecord",
+     "PersonMatch",
+     "PersonType",
+     "PersonDatabase",
+     "get_person_database",
+     # Embedding
+     "CompanyEmbedder",
+     "get_embedder",
+     # Hub
+     "download_database",
+     "get_database_path",
+     "upload_database",
+     "upload_database_with_variants",
+     # Resolver
+     "OrganizationResolver",
+     "get_organization_resolver",
+ ]
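
Taken together, the new database package exposes a small public surface. A minimal orientation sketch (not from the package docs; it only uses names re-exported above whose signatures appear later in this diff, namely get_database_path() and get_embedder(); the query methods of OrganizationDatabase itself are defined in store.py and are not shown here):

    # Hedged sketch: relies only on signatures visible elsewhere in this diff.
    from statement_extractor.database import get_database_path, get_embedder

    db_path = get_database_path()   # cached entities-lite.db, downloaded on first use
    embedder = get_embedder()       # shared CompanyEmbedder singleton
    vec = embedder.embed("Acme Holdings Ltd")
    print(db_path, vec.shape)
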
statement_extractor/database/embeddings.py
@@ -0,0 +1,186 @@
+ """
+ Embedding service for company name matching.
+
+ Uses sentence-transformers with Gemma3 embedding model for high-quality
+ semantic similarity matching of company names.
+ """
+
+ import logging
+ from typing import Optional
+
+ import numpy as np
+
+ logger = logging.getLogger(__name__)
+
+
+ class CompanyEmbedder:
+     """
+     Embedding service for company names.
+
+     Uses Google's embedding models for high-quality semantic embeddings
+     suitable for company name matching.
+     """
+
+     # Default model - good balance of quality and speed
+     DEFAULT_MODEL = "google/embeddinggemma-300m"
+     # Alternative: smaller but faster
+     # DEFAULT_MODEL = "all-MiniLM-L6-v2"
+
+     def __init__(
+         self,
+         model_name: str = DEFAULT_MODEL,
+         device: Optional[str] = None,
+     ):
+         """
+         Initialize the embedder.
+
+         Args:
+             model_name: HuggingFace model ID for embeddings
+             device: Device to use (cuda, mps, cpu, or None for auto)
+         """
+         self._model_name = model_name
+         self._device = device
+         self._model = None
+         self._embedding_dim: Optional[int] = None
+
+     @property
+     def embedding_dim(self) -> int:
+         """Get the embedding dimension (loads model if needed)."""
+         if self._embedding_dim is None:
+             self._load_model()
+         return self._embedding_dim
+
+     def _load_model(self) -> None:
+         """Load the embedding model (lazy loading)."""
+         if self._model is not None:
+             return
+
+         try:
+             from sentence_transformers import SentenceTransformer
+             import torch
+
+             device = self._device
+             if device is None:
+                 if torch.cuda.is_available():
+                     device = "cuda"
+                 elif torch.backends.mps.is_available():
+                     device = "mps"
+                 else:
+                     device = "cpu"
+
+             logger.info(f"Loading embedding model '{self._model_name}' on {device}...")
+             self._model = SentenceTransformer(self._model_name, device=device)
+             self._embedding_dim = self._model.get_sentence_embedding_dimension()
+             logger.info(f"Embedding model loaded (dim={self._embedding_dim})")
+
+         except ImportError as e:
+             raise ImportError(
+                 "sentence-transformers is required for embeddings. "
+                 "Install with: pip install sentence-transformers"
+             ) from e
+
+     def embed(self, text: str) -> np.ndarray:
+         """
+         Embed a single text string.
+
+         Args:
+             text: Text to embed
+
+         Returns:
+             Normalized embedding vector as numpy array
+         """
+         self._load_model()
+
+         embedding = self._model.encode(
+             text,
+             convert_to_numpy=True,
+             show_progress_bar=False,
+             normalize_embeddings=True,
+         )
+         return embedding.astype(np.float32)
+
+     def embed_batch(self, texts: list[str], batch_size: int = 32) -> np.ndarray:
+         """
+         Embed multiple texts in batches.
+
+         Args:
+             texts: List of texts to embed
+             batch_size: Batch size for processing
+
+         Returns:
+             Array of normalized embeddings (N x dim)
+         """
+         self._load_model()
+
+         embeddings = self._model.encode(
+             texts,
+             convert_to_numpy=True,
+             show_progress_bar=len(texts) > 100,
+             batch_size=batch_size,
+             normalize_embeddings=True,
+         )
+         return embeddings.astype(np.float32)
+
+     def similarity(self, embedding1: np.ndarray, embedding2: np.ndarray) -> float:
+         """
+         Compute cosine similarity between two embeddings.
+
+         Args:
+             embedding1: First embedding (normalized)
+             embedding2: Second embedding (normalized)
+
+         Returns:
+             Cosine similarity score (0-1 for normalized vectors)
+         """
+         return float(np.dot(embedding1, embedding2))
+
+     def search_similar(
+         self,
+         query_embedding: np.ndarray,
+         candidate_embeddings: np.ndarray,
+         top_k: int = 20,
+     ) -> list[tuple[int, float]]:
+         """
+         Find most similar embeddings to query.
+
+         Args:
+             query_embedding: Query embedding vector
+             candidate_embeddings: Matrix of candidate embeddings (N x dim)
+             top_k: Number of results to return
+
+         Returns:
+             List of (index, similarity) tuples, sorted by similarity descending
+         """
+         # Compute similarities (dot product for normalized vectors)
+         similarities = np.dot(candidate_embeddings, query_embedding)
+
+         # Get top-k indices
+         if len(similarities) <= top_k:
+             indices = np.argsort(similarities)[::-1]
+         else:
+             indices = np.argpartition(similarities, -top_k)[-top_k:]
+             indices = indices[np.argsort(similarities[indices])[::-1]]
+
+         return [(int(idx), float(similarities[idx])) for idx in indices]
+
+
+ # Singleton instance for shared use
+ _default_embedder: Optional[CompanyEmbedder] = None
+
+
+ def get_embedder(model_name: str = CompanyEmbedder.DEFAULT_MODEL) -> CompanyEmbedder:
+     """
+     Get or create a shared embedder instance.
+
+     Args:
+         model_name: HuggingFace model ID
+
+     Returns:
+         CompanyEmbedder instance
+     """
+     global _default_embedder
+
+     if _default_embedder is None or _default_embedder._model_name != model_name:
+         _default_embedder = CompanyEmbedder(model_name=model_name)
+
+     return _default_embedder
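
The matching flow this module enables: embed the candidate names once, embed the query, then rank by dot product (cosine similarity, since vectors are normalized). A short sketch using only the API shown above (the company names are illustrative inputs, not package data, and it assumes sentence-transformers plus the embeddinggemma model are available locally):

    from statement_extractor.database.embeddings import get_embedder

    embedder = get_embedder()  # lazily loads google/embeddinggemma-300m on first call
    candidates = ["Acme Holdings Ltd", "Acme Holding Limited", "Apex Labs Inc"]
    cand_vecs = embedder.embed_batch(candidates)   # (3, dim) float32, L2-normalized
    query_vec = embedder.embed("ACME Holdings")    # (dim,) float32, L2-normalized

    # search_similar computes the same dot products and keeps the top-k indices
    for idx, score in embedder.search_similar(query_vec, cand_vecs, top_k=2):
        print(candidates[idx], round(score, 3))
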
statement_extractor/database/hub.py
@@ -0,0 +1,428 @@
+ """
+ HuggingFace Hub integration for entity/organization database distribution.
+
+ Provides functionality to:
+ - Download pre-built entity databases from HuggingFace Hub
+ - Upload/publish database updates
+ - Version management for database files
+ - Create "lite" versions without full records for smaller downloads
+ """
+
+ import logging
+ import os
+ import shutil
+ import sqlite3
+ import tempfile
+ from pathlib import Path
+ from typing import Optional
+
+ logger = logging.getLogger(__name__)
+
+ # Default HuggingFace repo for entity database
+ DEFAULT_REPO_ID = "Corp-o-Rate-Community/entity-references"
+ DEFAULT_DB_FILENAME = "entities-lite.db"  # Lite is the default (smaller download)
+ DEFAULT_DB_FULL_FILENAME = "entities.db"
+ DEFAULT_DB_LITE_FILENAME = "entities-lite.db"
+
+ # Local cache directory
+ DEFAULT_CACHE_DIR = Path.home() / ".cache" / "corp-extractor"
+
+
+ def get_database_path(
+     repo_id: str = DEFAULT_REPO_ID,
+     filename: str = DEFAULT_DB_FILENAME,
+     auto_download: bool = True,
+     full: bool = False,
+ ) -> Optional[Path]:
+     """
+     Get path to entity database, downloading if necessary.
+
+     Args:
+         repo_id: HuggingFace repo ID
+         filename: Database filename (overrides full flag if specified)
+         auto_download: Whether to download if not cached
+         full: If True, get the full database instead of lite
+
+     Returns:
+         Path to database file, or None if not available
+     """
+     # Override filename if full is requested and using default
+     if full and filename == DEFAULT_DB_FILENAME:
+         filename = DEFAULT_DB_FULL_FILENAME
+     # Check if database exists in cache
+     cache_dir = DEFAULT_CACHE_DIR
+
+     # Check common locations
+     possible_paths = [
+         cache_dir / filename,
+         cache_dir / "entities.db",
+         Path.home() / ".cache" / "huggingface" / "hub" / f"datasets--{repo_id.replace('/', '--')}" / filename,
+     ]
+
+     for path in possible_paths:
+         if path.exists():
+             logger.debug(f"Found cached database at {path}")
+             return path
+
+     # Try to download
+     if auto_download:
+         try:
+             return download_database(repo_id=repo_id, filename=filename)
+         except Exception as e:
+             logger.warning(f"Failed to download database: {e}")
+             return None
+
+     return None
+
+
+ def upload_database(
+     db_path: str | Path,
+     repo_id: str = DEFAULT_REPO_ID,
+     filename: str = DEFAULT_DB_FILENAME,
+     commit_message: str = "Update entity database",
+     token: Optional[str] = None,
+ ) -> str:
+     """
+     Upload entity database to HuggingFace Hub.
+
+     Args:
+         db_path: Local path to database file
+         repo_id: HuggingFace repo ID
+         filename: Target filename in repo
+         commit_message: Git commit message
+         token: HuggingFace API token (uses HF_TOKEN env var if not provided)
+
+     Returns:
+         URL of the uploaded file
+     """
+     try:
+         from huggingface_hub import HfApi, create_repo
+     except ImportError:
+         raise ImportError(
+             "huggingface_hub is required for database upload. "
+             "Install with: pip install huggingface_hub"
+         )
+
+     db_path = Path(db_path)
+     if not db_path.exists():
+         raise FileNotFoundError(f"Database file not found: {db_path}")
+
+     token = token or os.environ.get("HF_TOKEN")
+     if not token:
+         raise ValueError("HuggingFace token required. Set HF_TOKEN env var or pass token argument.")
+
+     api = HfApi(token=token)
+
+     # Create repo if it doesn't exist
+     try:
+         create_repo(
+             repo_id=repo_id,
+             repo_type="dataset",
+             exist_ok=True,
+             token=token,
+         )
+     except Exception as e:
+         logger.debug(f"Repo creation note: {e}")
+
+     # Upload file
+     logger.info(f"Uploading database to {repo_id}...")
+
+     result = api.upload_file(
+         path_or_fileobj=str(db_path),
+         path_in_repo=filename,
+         repo_id=repo_id,
+         repo_type="dataset",
+         commit_message=commit_message,
+     )
+
+     logger.info("Database uploaded successfully")
+     return result
+
+
+ def get_latest_version(repo_id: str = DEFAULT_REPO_ID) -> Optional[str]:
+     """
+     Get the latest version/commit of the database repo.
+
+     Args:
+         repo_id: HuggingFace repo ID
+
+     Returns:
+         Latest commit SHA or None if unavailable
+     """
+     try:
+         from huggingface_hub import HfApi
+
+         api = HfApi()
+         info = api.repo_info(repo_id=repo_id, repo_type="dataset")
+         return info.sha
+     except Exception as e:
+         logger.debug(f"Failed to get repo info: {e}")
+         return None
+
+
+ def check_for_updates(
+     repo_id: str = DEFAULT_REPO_ID,
+     current_version: Optional[str] = None,
+ ) -> tuple[bool, Optional[str]]:
+     """
+     Check if a newer version of the database is available.
+
+     Args:
+         repo_id: HuggingFace repo ID
+         current_version: Current cached version (commit SHA)
+
+     Returns:
+         Tuple of (update_available: bool, latest_version: str or None)
+     """
+     latest = get_latest_version(repo_id)
+
+     if latest is None:
+         return False, None
+
+     if current_version is None:
+         return True, latest
+
+     return latest != current_version, latest
+
+
+ def vacuum_database(db_path: str | Path) -> None:
+     """
+     VACUUM the database to reclaim space and optimize it.
+
+     Args:
+         db_path: Path to the database file
+     """
+     db_path = Path(db_path)
+     if not db_path.exists():
+         raise FileNotFoundError(f"Database not found: {db_path}")
+
+     original_size = db_path.stat().st_size
+     logger.info(f"Running VACUUM on {db_path} ({original_size / (1024*1024):.1f}MB)")
+
+     # Use isolation_level=None for autocommit (required for VACUUM)
+     conn = sqlite3.connect(str(db_path), isolation_level=None)
+     try:
+         conn.execute("VACUUM")
+     finally:
+         conn.close()
+
+     new_size = db_path.stat().st_size
+     reduction = (1 - new_size / original_size) * 100
+
+     logger.info(f"After VACUUM: {new_size / (1024*1024):.1f}MB (reduced {reduction:.1f}%)")
+
+
+ def create_lite_database(
+     source_db_path: str | Path,
+     output_path: Optional[str | Path] = None,
+ ) -> Path:
+     """
+     Create a lite version of the database without full records.
+
+     The lite version strips the `record` column content (sets to empty {}),
+     significantly reducing file size while keeping embeddings and core fields.
+
+     Args:
+         source_db_path: Path to the full database
+         output_path: Output path for lite database (default: adds -lite suffix)
+
+     Returns:
+         Path to the lite database
+     """
+     source_db_path = Path(source_db_path)
+     if not source_db_path.exists():
+         raise FileNotFoundError(f"Source database not found: {source_db_path}")
+
+     if output_path is None:
+         output_path = source_db_path.with_stem(source_db_path.stem + "-lite")
+     output_path = Path(output_path)
+
+     logger.info(f"Creating lite database from {source_db_path}")
+     logger.info(f"Output: {output_path}")
+
+     # Copy the database first
+     shutil.copy2(source_db_path, output_path)
+
+     # Connect and strip record contents
+     # Use isolation_level=None for autocommit (required for VACUUM)
+     conn = sqlite3.connect(str(output_path), isolation_level=None)
+     try:
+         # Update all records to have empty record JSON
+         conn.execute("BEGIN")
+         cursor = conn.execute("UPDATE organizations SET record = '{}'")
+         updated = cursor.rowcount
+         logger.info(f"Stripped {updated} record fields")
+         conn.execute("COMMIT")
+
+         # Vacuum to reclaim space (must be outside transaction)
+         conn.execute("VACUUM")
+     finally:
+         conn.close()
+
+     # Log size reduction
+     original_size = source_db_path.stat().st_size
+     lite_size = output_path.stat().st_size
+     reduction = (1 - lite_size / original_size) * 100
+
+     logger.info(f"Original size: {original_size / (1024*1024):.1f}MB")
+     logger.info(f"Lite size: {lite_size / (1024*1024):.1f}MB")
+     logger.info(f"Size reduction: {reduction:.1f}%")
+
+     return output_path
+
+
+ def upload_database_with_variants(
+     db_path: str | Path,
+     repo_id: str = DEFAULT_REPO_ID,
+     commit_message: str = "Update entity database",
+     token: Optional[str] = None,
+     include_lite: bool = True,
+     include_readme: bool = True,
+ ) -> dict[str, str]:
+     """
+     Upload entity database with optional lite variant.
+
+     First VACUUMs the database, then creates and uploads:
+     - entities.db (full database)
+     - entities-lite.db (without record data, smaller)
+     - README.md (dataset card from HUGGINGFACE_README.md)
+
+     Args:
+         db_path: Local path to full database file
+         repo_id: HuggingFace repo ID
+         commit_message: Git commit message
+         token: HuggingFace API token
+         include_lite: Whether to create and upload lite version
+         include_readme: Whether to upload the README.md dataset card
+
+     Returns:
+         Dict mapping filename to upload URL
+     """
+     try:
+         from huggingface_hub import HfApi, create_repo
+     except ImportError:
+         raise ImportError(
+             "huggingface_hub is required for database upload. "
+             "Install with: pip install huggingface_hub"
+         )
+
+     db_path = Path(db_path)
+     if not db_path.exists():
+         raise FileNotFoundError(f"Database file not found: {db_path}")
+
+     token = token or os.environ.get("HF_TOKEN")
+     if not token:
+         raise ValueError("HuggingFace token required. Set HF_TOKEN env var or pass token argument.")
+
+     api = HfApi(token=token)
+
+     # Create repo if it doesn't exist
+     try:
+         create_repo(
+             repo_id=repo_id,
+             repo_type="dataset",
+             exist_ok=True,
+             token=token,
+         )
+     except Exception as e:
+         logger.debug(f"Repo creation note: {e}")
+
+     # VACUUM the database first to optimize it
+     vacuum_database(db_path)
+
+     results = {}
+
+     # Create temp directory for variants
+     with tempfile.TemporaryDirectory() as temp_dir:
+         temp_path = Path(temp_dir)
+         files_to_upload = []
+
+         # Full database
+         files_to_upload.append((db_path, DEFAULT_DB_FULL_FILENAME))
+
+         # Lite version
+         if include_lite:
+             lite_path = temp_path / DEFAULT_DB_LITE_FILENAME
+             create_lite_database(db_path, lite_path)
+             files_to_upload.append((lite_path, DEFAULT_DB_LITE_FILENAME))
+
+         # Copy all files to a staging directory for upload_folder
+         staging_dir = temp_path / "staging"
+         staging_dir.mkdir()
+
+         for local_path, remote_filename in files_to_upload:
+             shutil.copy2(local_path, staging_dir / remote_filename)
+             logger.info(f"Staged {remote_filename}")
+
+         # Add README.md from HUGGINGFACE_README.md
+         if include_readme:
+             # Look for HUGGINGFACE_README.md in the package directory
+             package_dir = Path(__file__).parent.parent.parent.parent  # Go up to statement-extractor-lib
+             readme_source = package_dir / "HUGGINGFACE_README.md"
+             if readme_source.exists():
+                 shutil.copy2(readme_source, staging_dir / "README.md")
+                 files_to_upload.append((readme_source, "README.md"))
+                 logger.info("Staged README.md from HUGGINGFACE_README.md")
+             else:
+                 logger.warning(f"HUGGINGFACE_README.md not found at {readme_source}")
+
+         # Upload all files in a single commit to avoid LFS pointer issues
+         logger.info(f"Uploading {len(files_to_upload)} files to {repo_id}...")
+         api.upload_folder(
+             folder_path=str(staging_dir),
+             repo_id=repo_id,
+             repo_type="dataset",
+             commit_message=commit_message,
+         )
+
+         for _, remote_filename in files_to_upload:
+             results[remote_filename] = f"https://huggingface.co/datasets/{repo_id}/blob/main/{remote_filename}"
+             logger.info(f"Uploaded {remote_filename}")
+
+     return results
+
+
+ def download_database(
+     repo_id: str = DEFAULT_REPO_ID,
+     filename: str = DEFAULT_DB_FILENAME,
+     revision: Optional[str] = None,
+     cache_dir: Optional[Path] = None,
+     force_download: bool = False,
+ ) -> Path:
+     """
+     Download entity database from HuggingFace Hub.
+
+     Args:
+         repo_id: HuggingFace repo ID (e.g., "Corp-o-Rate-Community/entity-references")
+         filename: Database filename in the repo
+         revision: Git revision (branch, tag, commit) or None for latest
+         cache_dir: Local cache directory
+         force_download: Force re-download even if cached
+
+     Returns:
+         Path to the downloaded database file
+     """
+     try:
+         from huggingface_hub import hf_hub_download
+     except ImportError:
+         raise ImportError(
+             "huggingface_hub is required for database download. "
+             "Install with: pip install huggingface_hub"
+         )
+
+     cache_dir = cache_dir or DEFAULT_CACHE_DIR
+     cache_dir.mkdir(parents=True, exist_ok=True)
+
+     logger.info(f"Downloading entity database from {repo_id}...")
+
+     local_path = hf_hub_download(
+         repo_id=repo_id,
+         filename=filename,
+         revision=revision,
+         cache_dir=str(cache_dir),
+         force_download=force_download,
+         repo_type="dataset",
+     )
+
+     logger.info(f"Database downloaded to {local_path}")
+     return Path(local_path)
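
The consumer and maintainer flows defined above fit together roughly as follows; a hedged sketch using only the functions in this file (HF_TOKEN and the local entities.db path are placeholders you would supply, and the upload call is commented out because it needs write access to the dataset repo):

    from statement_extractor.database.hub import (
        check_for_updates,
        download_database,
        upload_database_with_variants,
    )

    # Consumer side: fetch the default entities-lite.db via hf_hub_download,
    # cached under ~/.cache/corp-extractor.
    db_path = download_database()
    updated, latest = check_for_updates(current_version=None)
    print(db_path, updated, latest)

    # Maintainer side: VACUUM the full database, derive entities-lite.db,
    # and upload both plus the dataset-card README in one commit.
    # urls = upload_database_with_variants("entities.db", token="hf_...")
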
statement_extractor/database/importers/__init__.py
@@ -0,0 +1,32 @@
+ """
+ Data importers for the entity database.
+
+ Provides importers for various data sources:
+ - GLEIF: Legal Entity Identifier data
+ - SEC Edgar: US SEC company data
+ - SEC Form 4: US SEC insider ownership data (officers/directors)
+ - Companies House: UK company data
+ - Wikidata: Wikipedia/Wikidata organization data (SPARQL-based, may timeout)
+ - Wikidata People: Notable people from Wikipedia/Wikidata (SPARQL-based, may timeout)
+ - Wikidata Dump: Bulk import from Wikidata JSON dump (recommended for large imports)
+ """
+
+ from .gleif import GleifImporter
+ from .sec_edgar import SecEdgarImporter
+ from .sec_form4 import SecForm4Importer
+ from .companies_house import CompaniesHouseImporter
+ from .companies_house_officers import CompaniesHouseOfficersImporter
+ from .wikidata import WikidataImporter
+ from .wikidata_people import WikidataPeopleImporter
+ from .wikidata_dump import WikidataDumpImporter
+
+ __all__ = [
+     "GleifImporter",
+     "SecEdgarImporter",
+     "SecForm4Importer",
+     "CompaniesHouseImporter",
+     "CompaniesHouseOfficersImporter",
+     "WikidataImporter",
+     "WikidataPeopleImporter",
+     "WikidataDumpImporter",
+ ]