corp-extractor 0.4.0-py3-none-any.whl → 0.9.0-py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in the public registry. It is provided for informational purposes only.
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
- corp_extractor-0.9.0.dist-info/RECORD +76 -0
- statement_extractor/__init__.py +10 -1
- statement_extractor/cli.py +1663 -17
- statement_extractor/data/default_predicates.json +368 -0
- statement_extractor/data/statement_taxonomy.json +6972 -0
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +520 -0
- statement_extractor/database/importers/__init__.py +24 -0
- statement_extractor/database/importers/companies_house.py +545 -0
- statement_extractor/database/importers/gleif.py +538 -0
- statement_extractor/database/importers/sec_edgar.py +375 -0
- statement_extractor/database/importers/wikidata.py +1012 -0
- statement_extractor/database/importers/wikidata_people.py +632 -0
- statement_extractor/database/models.py +230 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +1609 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +173 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/extractor.py +1 -23
- statement_extractor/gliner_extraction.py +4 -74
- statement_extractor/llm.py +255 -0
- statement_extractor/models/__init__.py +89 -0
- statement_extractor/models/canonical.py +182 -0
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/entity.py +102 -0
- statement_extractor/models/labels.py +220 -0
- statement_extractor/models/qualifiers.py +139 -0
- statement_extractor/models/statement.py +101 -0
- statement_extractor/models.py +4 -1
- statement_extractor/pipeline/__init__.py +39 -0
- statement_extractor/pipeline/config.py +129 -0
- statement_extractor/pipeline/context.py +177 -0
- statement_extractor/pipeline/orchestrator.py +416 -0
- statement_extractor/pipeline/registry.py +303 -0
- statement_extractor/plugins/__init__.py +55 -0
- statement_extractor/plugins/base.py +716 -0
- statement_extractor/plugins/extractors/__init__.py +13 -0
- statement_extractor/plugins/extractors/base.py +9 -0
- statement_extractor/plugins/extractors/gliner2.py +546 -0
- statement_extractor/plugins/labelers/__init__.py +29 -0
- statement_extractor/plugins/labelers/base.py +9 -0
- statement_extractor/plugins/labelers/confidence.py +138 -0
- statement_extractor/plugins/labelers/relation_type.py +87 -0
- statement_extractor/plugins/labelers/sentiment.py +159 -0
- statement_extractor/plugins/labelers/taxonomy.py +386 -0
- statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +30 -0
- statement_extractor/plugins/qualifiers/base.py +9 -0
- statement_extractor/plugins/qualifiers/companies_house.py +185 -0
- statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
- statement_extractor/plugins/qualifiers/gleif.py +197 -0
- statement_extractor/plugins/qualifiers/person.py +785 -0
- statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/__init__.py +13 -0
- statement_extractor/plugins/splitters/base.py +9 -0
- statement_extractor/plugins/splitters/t5_gemma.py +293 -0
- statement_extractor/plugins/taxonomy/__init__.py +13 -0
- statement_extractor/plugins/taxonomy/embedding.py +484 -0
- statement_extractor/plugins/taxonomy/mnli.py +291 -0
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.4.0.dist-info/RECORD +0 -12
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
statement_extractor/database/hub.py
@@ -0,0 +1,520 @@
+"""
+HuggingFace Hub integration for entity/organization database distribution.
+
+Provides functionality to:
+- Download pre-built entity databases from HuggingFace Hub
+- Upload/publish database updates
+- Version management for database files
+- Create "lite" versions without full records for smaller downloads
+- Optional gzip compression for reduced file sizes
+"""
+
+import gzip
+import logging
+import os
+import shutil
+import sqlite3
+import tempfile
+from pathlib import Path
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+# Default HuggingFace repo for entity database
+DEFAULT_REPO_ID = "Corp-o-Rate-Community/entity-references"
+DEFAULT_DB_FILENAME = "entities-lite.db"  # Lite is the default (smaller download)
+DEFAULT_DB_FULL_FILENAME = "entities.db"
+DEFAULT_DB_LITE_FILENAME = "entities-lite.db"
+DEFAULT_DB_COMPRESSED_FILENAME = "entities.db.gz"
+DEFAULT_DB_LITE_COMPRESSED_FILENAME = "entities-lite.db.gz"
+
+# Local cache directory
+DEFAULT_CACHE_DIR = Path.home() / ".cache" / "corp-extractor"
+
+
+def get_database_path(
+    repo_id: str = DEFAULT_REPO_ID,
+    filename: str = DEFAULT_DB_FILENAME,
+    auto_download: bool = True,
+    full: bool = False,
+) -> Optional[Path]:
+    """
+    Get path to entity database, downloading if necessary.
+
+    Args:
+        repo_id: HuggingFace repo ID
+        filename: Database filename (overrides full flag if specified)
+        auto_download: Whether to download if not cached
+        full: If True, get the full database instead of lite
+
+    Returns:
+        Path to database file, or None if not available
+    """
+    # Override filename if full is requested and using default
+    if full and filename == DEFAULT_DB_FILENAME:
+        filename = DEFAULT_DB_FULL_FILENAME
+    # Check if database exists in cache
+    cache_dir = DEFAULT_CACHE_DIR
+
+    # Check common locations
+    possible_paths = [
+        cache_dir / filename,
+        cache_dir / "entities.db",
+        Path.home() / ".cache" / "huggingface" / "hub" / f"datasets--{repo_id.replace('/', '--')}" / filename,
+    ]
+
+    for path in possible_paths:
+        if path.exists():
+            logger.debug(f"Found cached database at {path}")
+            return path
+
+    # Try to download
+    if auto_download:
+        try:
+            return download_database(repo_id=repo_id, filename=filename)
+        except Exception as e:
+            logger.warning(f"Failed to download database: {e}")
+            return None
+
+    return None
+
+
+def upload_database(
+    db_path: str | Path,
+    repo_id: str = DEFAULT_REPO_ID,
+    filename: str = DEFAULT_DB_FILENAME,
+    commit_message: str = "Update entity database",
+    token: Optional[str] = None,
+) -> str:
+    """
+    Upload entity database to HuggingFace Hub.
+
+    Args:
+        db_path: Local path to database file
+        repo_id: HuggingFace repo ID
+        filename: Target filename in repo
+        commit_message: Git commit message
+        token: HuggingFace API token (uses HF_TOKEN env var if not provided)
+
+    Returns:
+        URL of the uploaded file
+    """
+    try:
+        from huggingface_hub import HfApi, create_repo
+    except ImportError:
+        raise ImportError(
+            "huggingface_hub is required for database upload. "
+            "Install with: pip install huggingface_hub"
+        )
+
+    db_path = Path(db_path)
+    if not db_path.exists():
+        raise FileNotFoundError(f"Database file not found: {db_path}")
+
+    token = token or os.environ.get("HF_TOKEN")
+    if not token:
+        raise ValueError("HuggingFace token required. Set HF_TOKEN env var or pass token argument.")
+
+    api = HfApi(token=token)
+
+    # Create repo if it doesn't exist
+    try:
+        create_repo(
+            repo_id=repo_id,
+            repo_type="dataset",
+            exist_ok=True,
+            token=token,
+        )
+    except Exception as e:
+        logger.debug(f"Repo creation note: {e}")
+
+    # Upload file
+    logger.info(f"Uploading database to {repo_id}...")
+
+    result = api.upload_file(
+        path_or_fileobj=str(db_path),
+        path_in_repo=filename,
+        repo_id=repo_id,
+        repo_type="dataset",
+        commit_message=commit_message,
+    )
+
+    logger.info(f"Database uploaded successfully")
+    return result
+
+
+def get_latest_version(repo_id: str = DEFAULT_REPO_ID) -> Optional[str]:
+    """
+    Get the latest version/commit of the database repo.
+
+    Args:
+        repo_id: HuggingFace repo ID
+
+    Returns:
+        Latest commit SHA or None if unavailable
+    """
+    try:
+        from huggingface_hub import HfApi
+
+        api = HfApi()
+        info = api.repo_info(repo_id=repo_id, repo_type="dataset")
+        return info.sha
+    except Exception as e:
+        logger.debug(f"Failed to get repo info: {e}")
+        return None
+
+
+def check_for_updates(
+    repo_id: str = DEFAULT_REPO_ID,
+    current_version: Optional[str] = None,
+) -> tuple[bool, Optional[str]]:
+    """
+    Check if a newer version of the database is available.
+
+    Args:
+        repo_id: HuggingFace repo ID
+        current_version: Current cached version (commit SHA)
+
+    Returns:
+        Tuple of (update_available: bool, latest_version: str or None)
+    """
+    latest = get_latest_version(repo_id)
+
+    if latest is None:
+        return False, None
+
+    if current_version is None:
+        return True, latest
+
+    return latest != current_version, latest
+
+
+def create_lite_database(
+    source_db_path: str | Path,
+    output_path: Optional[str | Path] = None,
+) -> Path:
+    """
+    Create a lite version of the database without full records.
+
+    The lite version strips the `record` column content (sets to empty {}),
+    significantly reducing file size while keeping embeddings and core fields.
+
+    Args:
+        source_db_path: Path to the full database
+        output_path: Output path for lite database (default: adds -lite suffix)
+
+    Returns:
+        Path to the lite database
+    """
+    source_db_path = Path(source_db_path)
+    if not source_db_path.exists():
+        raise FileNotFoundError(f"Source database not found: {source_db_path}")
+
+    if output_path is None:
+        output_path = source_db_path.with_stem(source_db_path.stem + "-lite")
+    output_path = Path(output_path)
+
+    logger.info(f"Creating lite database from {source_db_path}")
+    logger.info(f"Output: {output_path}")
+
+    # Copy the database first
+    shutil.copy2(source_db_path, output_path)
+
+    # Connect and strip record contents
+    # Use isolation_level=None for autocommit (required for VACUUM)
+    conn = sqlite3.connect(str(output_path), isolation_level=None)
+    try:
+        # Update all records to have empty record JSON
+        conn.execute("BEGIN")
+        cursor = conn.execute("UPDATE organizations SET record = '{}'")
+        updated = cursor.rowcount
+        logger.info(f"Stripped {updated} record fields")
+        conn.execute("COMMIT")
+
+        # Vacuum to reclaim space (must be outside transaction)
+        conn.execute("VACUUM")
+    finally:
+        conn.close()
+
+    # Log size reduction
+    original_size = source_db_path.stat().st_size
+    lite_size = output_path.stat().st_size
+    reduction = (1 - lite_size / original_size) * 100
+
+    logger.info(f"Original size: {original_size / (1024*1024):.1f}MB")
+    logger.info(f"Lite size: {lite_size / (1024*1024):.1f}MB")
+    logger.info(f"Size reduction: {reduction:.1f}%")
+
+    return output_path
+
+
+def compress_database(
+    db_path: str | Path,
+    output_path: Optional[str | Path] = None,
+) -> Path:
+    """
+    Compress a database file using gzip.
+
+    Args:
+        db_path: Path to the database file
+        output_path: Output path for compressed file (default: adds .gz suffix)
+
+    Returns:
+        Path to the compressed file
+    """
+    db_path = Path(db_path)
+    if not db_path.exists():
+        raise FileNotFoundError(f"Database not found: {db_path}")
+
+    if output_path is None:
+        output_path = db_path.with_suffix(db_path.suffix + ".gz")
+    output_path = Path(output_path)
+
+    logger.info(f"Compressing {db_path} to {output_path}")
+
+    with open(db_path, "rb") as f_in:
+        with gzip.open(output_path, "wb", compresslevel=9) as f_out:
+            shutil.copyfileobj(f_in, f_out)
+
+    # Log compression results
+    original_size = db_path.stat().st_size
+    compressed_size = output_path.stat().st_size
+    ratio = (1 - compressed_size / original_size) * 100
+
+    logger.info(f"Original: {original_size / (1024*1024):.1f}MB")
+    logger.info(f"Compressed: {compressed_size / (1024*1024):.1f}MB")
+    logger.info(f"Compression ratio: {ratio:.1f}%")
+
+    return output_path
+
+
+def decompress_database(
+    compressed_path: str | Path,
+    output_path: Optional[str | Path] = None,
+) -> Path:
+    """
+    Decompress a gzipped database file.
+
+    Args:
+        compressed_path: Path to the .gz file
+        output_path: Output path (default: removes .gz suffix)
+
+    Returns:
+        Path to the decompressed file
+    """
+    compressed_path = Path(compressed_path)
+    if not compressed_path.exists():
+        raise FileNotFoundError(f"Compressed file not found: {compressed_path}")
+
+    if output_path is None:
+        if compressed_path.suffix == ".gz":
+            output_path = compressed_path.with_suffix("")
+        else:
+            output_path = compressed_path.with_stem(compressed_path.stem + "-decompressed")
+    output_path = Path(output_path)
+
+    logger.info(f"Decompressing {compressed_path} to {output_path}")
+
+    with gzip.open(compressed_path, "rb") as f_in:
+        with open(output_path, "wb") as f_out:
+            shutil.copyfileobj(f_in, f_out)
+
+    logger.info(f"Decompressed to {output_path}")
+    return output_path
+
+
+def upload_database_with_variants(
+    db_path: str | Path,
+    repo_id: str = DEFAULT_REPO_ID,
+    commit_message: str = "Update entity database",
+    token: Optional[str] = None,
+    include_lite: bool = True,
+    include_compressed: bool = True,
+    include_readme: bool = True,
+) -> dict[str, str]:
+    """
+    Upload entity database with optional lite and compressed variants.
+
+    Creates and uploads:
+    - entities.db (full database)
+    - entities-lite.db (without record data, smaller)
+    - entities.db.gz (compressed full database)
+    - entities-lite.db.gz (compressed lite database)
+    - README.md (dataset card from HUGGINGFACE_README.md)
+
+    Args:
+        db_path: Local path to full database file
+        repo_id: HuggingFace repo ID
+        commit_message: Git commit message
+        token: HuggingFace API token
+        include_lite: Whether to create and upload lite version
+        include_compressed: Whether to create and upload compressed versions
+        include_readme: Whether to upload the README.md dataset card
+
+    Returns:
+        Dict mapping filename to upload URL
+    """
+    try:
+        from huggingface_hub import HfApi, create_repo
+    except ImportError:
+        raise ImportError(
+            "huggingface_hub is required for database upload. "
+            "Install with: pip install huggingface_hub"
+        )
+
+    db_path = Path(db_path)
+    if not db_path.exists():
+        raise FileNotFoundError(f"Database file not found: {db_path}")
+
+    token = token or os.environ.get("HF_TOKEN")
+    if not token:
+        raise ValueError("HuggingFace token required. Set HF_TOKEN env var or pass token argument.")
+
+    api = HfApi(token=token)
+
+    # Create repo if it doesn't exist
+    try:
+        create_repo(
+            repo_id=repo_id,
+            repo_type="dataset",
+            exist_ok=True,
+            token=token,
+        )
+    except Exception as e:
+        logger.debug(f"Repo creation note: {e}")
+
+    results = {}
+
+    # Create temp directory for variants
+    with tempfile.TemporaryDirectory() as temp_dir:
+        temp_path = Path(temp_dir)
+        files_to_upload = []
+
+        # Full database
+        files_to_upload.append((db_path, DEFAULT_DB_FULL_FILENAME))
+
+        # Lite version
+        if include_lite:
+            lite_path = temp_path / DEFAULT_DB_LITE_FILENAME
+            create_lite_database(db_path, lite_path)
+            files_to_upload.append((lite_path, DEFAULT_DB_LITE_FILENAME))
+
+        # Compressed versions
+        if include_compressed:
+            # Compress full database
+            compressed_path = temp_path / DEFAULT_DB_COMPRESSED_FILENAME
+            compress_database(db_path, compressed_path)
+            files_to_upload.append((compressed_path, DEFAULT_DB_COMPRESSED_FILENAME))
+
+            # Compress lite database
+            if include_lite:
+                lite_compressed_path = temp_path / DEFAULT_DB_LITE_COMPRESSED_FILENAME
+                lite_path = temp_path / DEFAULT_DB_LITE_FILENAME
+                compress_database(lite_path, lite_compressed_path)
+                files_to_upload.append((lite_compressed_path, DEFAULT_DB_LITE_COMPRESSED_FILENAME))
+
+        # Copy all files to a staging directory for upload_folder
+        staging_dir = temp_path / "staging"
+        staging_dir.mkdir()
+
+        for local_path, remote_filename in files_to_upload:
+            shutil.copy2(local_path, staging_dir / remote_filename)
+            logger.info(f"Staged {remote_filename}")
+
+        # Add README.md from HUGGINGFACE_README.md
+        if include_readme:
+            # Look for HUGGINGFACE_README.md in the package directory
+            package_dir = Path(__file__).parent.parent.parent.parent  # Go up to statement-extractor-lib
+            readme_source = package_dir / "HUGGINGFACE_README.md"
+            if readme_source.exists():
+                shutil.copy2(readme_source, staging_dir / "README.md")
+                files_to_upload.append((readme_source, "README.md"))
+                logger.info("Staged README.md from HUGGINGFACE_README.md")
+            else:
+                logger.warning(f"HUGGINGFACE_README.md not found at {readme_source}")
+
+        # Upload all files in a single commit to avoid LFS pointer issues
+        logger.info(f"Uploading {len(files_to_upload)} files to {repo_id}...")
+        api.upload_folder(
+            folder_path=str(staging_dir),
+            repo_id=repo_id,
+            repo_type="dataset",
+            commit_message=commit_message,
+        )
+
+        for _, remote_filename in files_to_upload:
+            results[remote_filename] = f"https://huggingface.co/datasets/{repo_id}/blob/main/{remote_filename}"
+            logger.info(f"Uploaded {remote_filename}")
+
+    return results
+
+
+def download_database(
+    repo_id: str = DEFAULT_REPO_ID,
+    filename: str = DEFAULT_DB_FILENAME,
+    revision: Optional[str] = None,
+    cache_dir: Optional[Path] = None,
+    force_download: bool = False,
+    prefer_compressed: bool = True,
+) -> Path:
+    """
+    Download entity database from HuggingFace Hub.
+
+    Args:
+        repo_id: HuggingFace repo ID (e.g., "Corp-o-Rate-Community/entity-references")
+        filename: Database filename in the repo
+        revision: Git revision (branch, tag, commit) or None for latest
+        cache_dir: Local cache directory
+        force_download: Force re-download even if cached
+        prefer_compressed: Try to download compressed version first
+
+    Returns:
+        Path to the downloaded database file (decompressed if was .gz)
+    """
+    try:
+        from huggingface_hub import hf_hub_download
+    except ImportError:
+        raise ImportError(
+            "huggingface_hub is required for database download. "
+            "Install with: pip install huggingface_hub"
+        )
+
+    cache_dir = cache_dir or DEFAULT_CACHE_DIR
+    cache_dir.mkdir(parents=True, exist_ok=True)
+
+    # Try compressed version first if preferred
+    download_filename = filename
+    if prefer_compressed and not filename.endswith(".gz"):
+        compressed_filename = filename + ".gz"
+        try:
+            logger.info(f"Trying compressed version: {compressed_filename}")
+            local_path = hf_hub_download(
+                repo_id=repo_id,
+                filename=compressed_filename,
+                revision=revision,
+                cache_dir=str(cache_dir),
+                force_download=force_download,
+                repo_type="dataset",
+            )
+            # Decompress to final location
+            final_path = cache_dir / filename
+            decompress_database(local_path, final_path)
+            logger.info(f"Database downloaded and decompressed to {final_path}")
+            return final_path
+        except Exception as e:
+            logger.debug(f"Compressed version not available: {e}")
+
+    # Download uncompressed version
+    logger.info(f"Downloading entity database from {repo_id}...")
+
+    local_path = hf_hub_download(
+        repo_id=repo_id,
+        filename=download_filename,
+        revision=revision,
+        cache_dir=str(cache_dir),
+        force_download=force_download,
+        repo_type="dataset",
+    )
+
+    logger.info(f"Database downloaded to {local_path}")
+    return Path(local_path)
statement_extractor/database/importers/__init__.py
@@ -0,0 +1,24 @@
+"""
+Data importers for the entity database.
+
+Provides importers for various data sources:
+- GLEIF: Legal Entity Identifier data
+- SEC Edgar: US SEC company data
+- Companies House: UK company data
+- Wikidata: Wikipedia/Wikidata organization data
+- Wikidata People: Notable people from Wikipedia/Wikidata
+"""
+
+from .gleif import GleifImporter
+from .sec_edgar import SecEdgarImporter
+from .companies_house import CompaniesHouseImporter
+from .wikidata import WikidataImporter
+from .wikidata_people import WikidataPeopleImporter
+
+__all__ = [
+    "GleifImporter",
+    "SecEdgarImporter",
+    "CompaniesHouseImporter",
+    "WikidataImporter",
+    "WikidataPeopleImporter",
+]