corp-extractor 0.5.0-py3-none-any.whl → 0.9.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
- corp_extractor-0.9.3.dist-info/RECORD +79 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +2030 -24
- statement_extractor/data/statement_taxonomy.json +6949 -1159
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +428 -0
- statement_extractor/database/importers/__init__.py +32 -0
- statement_extractor/database/importers/companies_house.py +559 -0
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +561 -0
- statement_extractor/database/importers/sec_edgar.py +392 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +1120 -0
- statement_extractor/database/importers/wikidata_dump.py +1951 -0
- statement_extractor/database/importers/wikidata_people.py +1130 -0
- statement_extractor/database/models.py +254 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +3034 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +171 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +19 -3
- statement_extractor/models/canonical.py +44 -1
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/labels.py +47 -18
- statement_extractor/models/qualifiers.py +51 -3
- statement_extractor/models/statement.py +39 -15
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/config.py +6 -11
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +90 -121
- statement_extractor/pipeline/registry.py +52 -46
- statement_extractor/plugins/__init__.py +20 -8
- statement_extractor/plugins/base.py +348 -78
- statement_extractor/plugins/extractors/gliner2.py +38 -28
- statement_extractor/plugins/labelers/taxonomy.py +18 -5
- statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +11 -0
- statement_extractor/plugins/qualifiers/companies_house.py +14 -3
- statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
- statement_extractor/plugins/qualifiers/gleif.py +14 -3
- statement_extractor/plugins/qualifiers/person.py +588 -14
- statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/t5_gemma.py +176 -75
- statement_extractor/plugins/taxonomy/embedding.py +193 -46
- statement_extractor/plugins/taxonomy/mnli.py +16 -4
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.5.0.dist-info/RECORD +0 -55
- statement_extractor/plugins/canonicalizers/__init__.py +0 -17
- statement_extractor/plugins/canonicalizers/base.py +0 -9
- statement_extractor/plugins/canonicalizers/location.py +0 -219
- statement_extractor/plugins/canonicalizers/organization.py +0 -230
- statement_extractor/plugins/canonicalizers/person.py +0 -242
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
statement_extractor/database/__init__.py
@@ -0,0 +1,52 @@
+"""
+Entity/Organization database module for embedding-based entity qualification.
+
+Provides:
+- CompanyRecord: Pydantic model for organization records
+- PersonRecord: Pydantic model for person records
+- OrganizationDatabase: sqlite-vec database for org embedding search
+- PersonDatabase: sqlite-vec database for person embedding search
+- CompanyEmbedder: Embedding service using Gemma3
+- Hub functions: Download/upload database from HuggingFace
+"""
+
+from .models import CompanyRecord, CompanyMatch, DatabaseStats, PersonRecord, PersonMatch, PersonType
+from .store import OrganizationDatabase, get_database, PersonDatabase, get_person_database
+from .embeddings import CompanyEmbedder, get_embedder
+from .hub import (
+    download_database,
+    get_database_path,
+    upload_database,
+    upload_database_with_variants,
+)
+from .resolver import OrganizationResolver, get_organization_resolver
+
+# Backwards compatibility alias
+CompanyDatabase = OrganizationDatabase
+
+__all__ = [
+    # Organization models
+    "CompanyRecord",
+    "CompanyMatch",
+    "DatabaseStats",
+    "OrganizationDatabase",
+    "CompanyDatabase",  # Backwards compatibility alias
+    "get_database",
+    # Person models
+    "PersonRecord",
+    "PersonMatch",
+    "PersonType",
+    "PersonDatabase",
+    "get_person_database",
+    # Embedding
+    "CompanyEmbedder",
+    "get_embedder",
+    # Hub
+    "download_database",
+    "get_database_path",
+    "upload_database",
+    "upload_database_with_variants",
+    # Resolver
+    "OrganizationResolver",
+    "get_organization_resolver",
+]
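The new `database` package keeps the old `CompanyDatabase` name as a plain alias for `OrganizationDatabase`, so existing imports keep working. A minimal sketch of what the alias means for callers, assuming only that the 0.9.3 wheel is installed:

```python
# Sketch: the backwards-compatibility alias from statement_extractor/database/__init__.py.
from statement_extractor.database import CompanyDatabase, OrganizationDatabase

# CompanyDatabase is the same object, not a copy or subclass, so type checks
# and isinstance() against either name continue to agree.
assert CompanyDatabase is OrganizationDatabase
```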
statement_extractor/database/embeddings.py
@@ -0,0 +1,186 @@
+"""
+Embedding service for company name matching.
+
+Uses sentence-transformers with Gemma3 embedding model for high-quality
+semantic similarity matching of company names.
+"""
+
+import logging
+from typing import Optional
+
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+
+class CompanyEmbedder:
+    """
+    Embedding service for company names.
+
+    Uses Google's embedding models for high-quality semantic embeddings
+    suitable for company name matching.
+    """
+
+    # Default model - good balance of quality and speed
+    DEFAULT_MODEL = "google/embeddinggemma-300m"
+    # Alternative: smaller but faster
+    # DEFAULT_MODEL = "all-MiniLM-L6-v2"
+
+    def __init__(
+        self,
+        model_name: str = DEFAULT_MODEL,
+        device: Optional[str] = None,
+    ):
+        """
+        Initialize the embedder.
+
+        Args:
+            model_name: HuggingFace model ID for embeddings
+            device: Device to use (cuda, mps, cpu, or None for auto)
+        """
+        self._model_name = model_name
+        self._device = device
+        self._model = None
+        self._embedding_dim: Optional[int] = None
+
+    @property
+    def embedding_dim(self) -> int:
+        """Get the embedding dimension (loads model if needed)."""
+        if self._embedding_dim is None:
+            self._load_model()
+        return self._embedding_dim
+
+    def _load_model(self) -> None:
+        """Load the embedding model (lazy loading)."""
+        if self._model is not None:
+            return
+
+        try:
+            from sentence_transformers import SentenceTransformer
+            import torch
+
+            device = self._device
+            if device is None:
+                if torch.cuda.is_available():
+                    device = "cuda"
+                elif torch.backends.mps.is_available():
+                    device = "mps"
+                else:
+                    device = "cpu"
+
+            logger.info(f"Loading embedding model '{self._model_name}' on {device}...")
+            self._model = SentenceTransformer(self._model_name, device=device)
+            self._embedding_dim = self._model.get_sentence_embedding_dimension()
+            logger.info(f"Embedding model loaded (dim={self._embedding_dim})")
+
+        except ImportError as e:
+            raise ImportError(
+                "sentence-transformers is required for embeddings. "
+                "Install with: pip install sentence-transformers"
+            ) from e
+
+    def embed(self, text: str) -> np.ndarray:
+        """
+        Embed a single text string.
+
+        Args:
+            text: Text to embed
+
+        Returns:
+            Normalized embedding vector as numpy array
+        """
+        self._load_model()
+
+        embedding = self._model.encode(
+            text,
+            convert_to_numpy=True,
+            show_progress_bar=False,
+            normalize_embeddings=True,
+        )
+        return embedding.astype(np.float32)
+
+    def embed_batch(self, texts: list[str], batch_size: int = 32) -> np.ndarray:
+        """
+        Embed multiple texts in batches.
+
+        Args:
+            texts: List of texts to embed
+            batch_size: Batch size for processing
+
+        Returns:
+            Array of normalized embeddings (N x dim)
+        """
+        self._load_model()
+
+        embeddings = self._model.encode(
+            texts,
+            convert_to_numpy=True,
+            show_progress_bar=len(texts) > 100,
+            batch_size=batch_size,
+            normalize_embeddings=True,
+        )
+        return embeddings.astype(np.float32)
+
+    def similarity(self, embedding1: np.ndarray, embedding2: np.ndarray) -> float:
+        """
+        Compute cosine similarity between two embeddings.
+
+        Args:
+            embedding1: First embedding (normalized)
+            embedding2: Second embedding (normalized)
+
+        Returns:
+            Cosine similarity score (0-1 for normalized vectors)
+        """
+        return float(np.dot(embedding1, embedding2))
+
+    def search_similar(
+        self,
+        query_embedding: np.ndarray,
+        candidate_embeddings: np.ndarray,
+        top_k: int = 20,
+    ) -> list[tuple[int, float]]:
+        """
+        Find most similar embeddings to query.
+
+        Args:
+            query_embedding: Query embedding vector
+            candidate_embeddings: Matrix of candidate embeddings (N x dim)
+            top_k: Number of results to return
+
+        Returns:
+            List of (index, similarity) tuples, sorted by similarity descending
+        """
+        # Compute similarities (dot product for normalized vectors)
+        similarities = np.dot(candidate_embeddings, query_embedding)
+
+        # Get top-k indices
+        if len(similarities) <= top_k:
+            indices = np.argsort(similarities)[::-1]
+        else:
+            indices = np.argpartition(similarities, -top_k)[-top_k:]
+            indices = indices[np.argsort(similarities[indices])[::-1]]
+
+        return [(int(idx), float(similarities[idx])) for idx in indices]
+
+
+# Singleton instance for shared use
+_default_embedder: Optional[CompanyEmbedder] = None
+
+
+def get_embedder(model_name: str = CompanyEmbedder.DEFAULT_MODEL) -> CompanyEmbedder:
+    """
+    Get or create a shared embedder instance.
+
+    Args:
+        model_name: HuggingFace model ID
+
+    Returns:
+        CompanyEmbedder instance
+    """
+    global _default_embedder
+
+    if _default_embedder is None or _default_embedder._model_name != model_name:
+        _default_embedder = CompanyEmbedder(model_name=model_name)
+
+    return _default_embedder
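Going by the signatures above, the embedder is used roughly as follows; a minimal sketch, assuming `sentence-transformers` is installed and the default `google/embeddinggemma-300m` model can be downloaded (loading happens lazily on the first `embed` call):

```python
from statement_extractor.database import get_embedder

embedder = get_embedder()  # shared CompanyEmbedder singleton

# Pairwise cosine similarity between two normalized name embeddings.
query = embedder.embed("International Business Machines")
candidate = embedder.embed("IBM Corporation")
print(f"similarity: {embedder.similarity(query, candidate):.3f}")

# Batch embedding plus top-k search over a candidate matrix.
names = ["IBM Corporation", "Apple Inc.", "Siemens AG"]
matrix = embedder.embed_batch(names, batch_size=32)
for idx, score in embedder.search_similar(query, matrix, top_k=2):
    print(names[idx], f"{score:.3f}")
```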
statement_extractor/database/hub.py
@@ -0,0 +1,428 @@
+"""
+HuggingFace Hub integration for entity/organization database distribution.
+
+Provides functionality to:
+- Download pre-built entity databases from HuggingFace Hub
+- Upload/publish database updates
+- Version management for database files
+- Create "lite" versions without full records for smaller downloads
+"""
+
+import logging
+import os
+import shutil
+import sqlite3
+import tempfile
+from pathlib import Path
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+# Default HuggingFace repo for entity database
+DEFAULT_REPO_ID = "Corp-o-Rate-Community/entity-references"
+DEFAULT_DB_FILENAME = "entities-lite.db"  # Lite is the default (smaller download)
+DEFAULT_DB_FULL_FILENAME = "entities.db"
+DEFAULT_DB_LITE_FILENAME = "entities-lite.db"
+
+# Local cache directory
+DEFAULT_CACHE_DIR = Path.home() / ".cache" / "corp-extractor"
+
+
+def get_database_path(
+    repo_id: str = DEFAULT_REPO_ID,
+    filename: str = DEFAULT_DB_FILENAME,
+    auto_download: bool = True,
+    full: bool = False,
+) -> Optional[Path]:
+    """
+    Get path to entity database, downloading if necessary.
+
+    Args:
+        repo_id: HuggingFace repo ID
+        filename: Database filename (overrides full flag if specified)
+        auto_download: Whether to download if not cached
+        full: If True, get the full database instead of lite
+
+    Returns:
+        Path to database file, or None if not available
+    """
+    # Override filename if full is requested and using default
+    if full and filename == DEFAULT_DB_FILENAME:
+        filename = DEFAULT_DB_FULL_FILENAME
+    # Check if database exists in cache
+    cache_dir = DEFAULT_CACHE_DIR
+
+    # Check common locations
+    possible_paths = [
+        cache_dir / filename,
+        cache_dir / "entities.db",
+        Path.home() / ".cache" / "huggingface" / "hub" / f"datasets--{repo_id.replace('/', '--')}" / filename,
+    ]
+
+    for path in possible_paths:
+        if path.exists():
+            logger.debug(f"Found cached database at {path}")
+            return path
+
+    # Try to download
+    if auto_download:
+        try:
+            return download_database(repo_id=repo_id, filename=filename)
+        except Exception as e:
+            logger.warning(f"Failed to download database: {e}")
+            return None
+
+    return None
+
+
+def upload_database(
+    db_path: str | Path,
+    repo_id: str = DEFAULT_REPO_ID,
+    filename: str = DEFAULT_DB_FILENAME,
+    commit_message: str = "Update entity database",
+    token: Optional[str] = None,
+) -> str:
+    """
+    Upload entity database to HuggingFace Hub.
+
+    Args:
+        db_path: Local path to database file
+        repo_id: HuggingFace repo ID
+        filename: Target filename in repo
+        commit_message: Git commit message
+        token: HuggingFace API token (uses HF_TOKEN env var if not provided)
+
+    Returns:
+        URL of the uploaded file
+    """
+    try:
+        from huggingface_hub import HfApi, create_repo
+    except ImportError:
+        raise ImportError(
+            "huggingface_hub is required for database upload. "
+            "Install with: pip install huggingface_hub"
+        )
+
+    db_path = Path(db_path)
+    if not db_path.exists():
+        raise FileNotFoundError(f"Database file not found: {db_path}")
+
+    token = token or os.environ.get("HF_TOKEN")
+    if not token:
+        raise ValueError("HuggingFace token required. Set HF_TOKEN env var or pass token argument.")
+
+    api = HfApi(token=token)
+
+    # Create repo if it doesn't exist
+    try:
+        create_repo(
+            repo_id=repo_id,
+            repo_type="dataset",
+            exist_ok=True,
+            token=token,
+        )
+    except Exception as e:
+        logger.debug(f"Repo creation note: {e}")
+
+    # Upload file
+    logger.info(f"Uploading database to {repo_id}...")
+
+    result = api.upload_file(
+        path_or_fileobj=str(db_path),
+        path_in_repo=filename,
+        repo_id=repo_id,
+        repo_type="dataset",
+        commit_message=commit_message,
+    )
+
+    logger.info("Database uploaded successfully")
+    return result
+
+
+def get_latest_version(repo_id: str = DEFAULT_REPO_ID) -> Optional[str]:
+    """
+    Get the latest version/commit of the database repo.
+
+    Args:
+        repo_id: HuggingFace repo ID
+
+    Returns:
+        Latest commit SHA or None if unavailable
+    """
+    try:
+        from huggingface_hub import HfApi
+
+        api = HfApi()
+        info = api.repo_info(repo_id=repo_id, repo_type="dataset")
+        return info.sha
+    except Exception as e:
+        logger.debug(f"Failed to get repo info: {e}")
+        return None
+
+
+def check_for_updates(
+    repo_id: str = DEFAULT_REPO_ID,
+    current_version: Optional[str] = None,
+) -> tuple[bool, Optional[str]]:
+    """
+    Check if a newer version of the database is available.
+
+    Args:
+        repo_id: HuggingFace repo ID
+        current_version: Current cached version (commit SHA)
+
+    Returns:
+        Tuple of (update_available: bool, latest_version: str or None)
+    """
+    latest = get_latest_version(repo_id)
+
+    if latest is None:
+        return False, None
+
+    if current_version is None:
+        return True, latest
+
+    return latest != current_version, latest
+
+
+def vacuum_database(db_path: str | Path) -> None:
+    """
+    VACUUM the database to reclaim space and optimize it.
+
+    Args:
+        db_path: Path to the database file
+    """
+    db_path = Path(db_path)
+    if not db_path.exists():
+        raise FileNotFoundError(f"Database not found: {db_path}")
+
+    original_size = db_path.stat().st_size
+    logger.info(f"Running VACUUM on {db_path} ({original_size / (1024*1024):.1f}MB)")
+
+    # Use isolation_level=None for autocommit (required for VACUUM)
+    conn = sqlite3.connect(str(db_path), isolation_level=None)
+    try:
+        conn.execute("VACUUM")
+    finally:
+        conn.close()
+
+    new_size = db_path.stat().st_size
+    reduction = (1 - new_size / original_size) * 100
+
+    logger.info(f"After VACUUM: {new_size / (1024*1024):.1f}MB (reduced {reduction:.1f}%)")
+
+
+def create_lite_database(
+    source_db_path: str | Path,
+    output_path: Optional[str | Path] = None,
+) -> Path:
+    """
+    Create a lite version of the database without full records.
+
+    The lite version strips the `record` column content (sets to empty {}),
+    significantly reducing file size while keeping embeddings and core fields.
+
+    Args:
+        source_db_path: Path to the full database
+        output_path: Output path for lite database (default: adds -lite suffix)
+
+    Returns:
+        Path to the lite database
+    """
+    source_db_path = Path(source_db_path)
+    if not source_db_path.exists():
+        raise FileNotFoundError(f"Source database not found: {source_db_path}")
+
+    if output_path is None:
+        output_path = source_db_path.with_stem(source_db_path.stem + "-lite")
+    output_path = Path(output_path)
+
+    logger.info(f"Creating lite database from {source_db_path}")
+    logger.info(f"Output: {output_path}")
+
+    # Copy the database first
+    shutil.copy2(source_db_path, output_path)
+
+    # Connect and strip record contents
+    # Use isolation_level=None for autocommit (required for VACUUM)
+    conn = sqlite3.connect(str(output_path), isolation_level=None)
+    try:
+        # Update all records to have empty record JSON
+        conn.execute("BEGIN")
+        cursor = conn.execute("UPDATE organizations SET record = '{}'")
+        updated = cursor.rowcount
+        logger.info(f"Stripped {updated} record fields")
+        conn.execute("COMMIT")
+
+        # Vacuum to reclaim space (must be outside transaction)
+        conn.execute("VACUUM")
+    finally:
+        conn.close()
+
+    # Log size reduction
+    original_size = source_db_path.stat().st_size
+    lite_size = output_path.stat().st_size
+    reduction = (1 - lite_size / original_size) * 100
+
+    logger.info(f"Original size: {original_size / (1024*1024):.1f}MB")
+    logger.info(f"Lite size: {lite_size / (1024*1024):.1f}MB")
+    logger.info(f"Size reduction: {reduction:.1f}%")
+
+    return output_path
+
+
+def upload_database_with_variants(
+    db_path: str | Path,
+    repo_id: str = DEFAULT_REPO_ID,
+    commit_message: str = "Update entity database",
+    token: Optional[str] = None,
+    include_lite: bool = True,
+    include_readme: bool = True,
+) -> dict[str, str]:
+    """
+    Upload entity database with optional lite variant.
+
+    First VACUUMs the database, then creates and uploads:
+    - entities.db (full database)
+    - entities-lite.db (without record data, smaller)
+    - README.md (dataset card from HUGGINGFACE_README.md)
+
+    Args:
+        db_path: Local path to full database file
+        repo_id: HuggingFace repo ID
+        commit_message: Git commit message
+        token: HuggingFace API token
+        include_lite: Whether to create and upload lite version
+        include_readme: Whether to upload the README.md dataset card
+
+    Returns:
+        Dict mapping filename to upload URL
+    """
+    try:
+        from huggingface_hub import HfApi, create_repo
+    except ImportError:
+        raise ImportError(
+            "huggingface_hub is required for database upload. "
+            "Install with: pip install huggingface_hub"
+        )
+
+    db_path = Path(db_path)
+    if not db_path.exists():
+        raise FileNotFoundError(f"Database file not found: {db_path}")
+
+    token = token or os.environ.get("HF_TOKEN")
+    if not token:
+        raise ValueError("HuggingFace token required. Set HF_TOKEN env var or pass token argument.")
+
+    api = HfApi(token=token)
+
+    # Create repo if it doesn't exist
+    try:
+        create_repo(
+            repo_id=repo_id,
+            repo_type="dataset",
+            exist_ok=True,
+            token=token,
+        )
+    except Exception as e:
+        logger.debug(f"Repo creation note: {e}")
+
+    # VACUUM the database first to optimize it
+    vacuum_database(db_path)
+
+    results = {}
+
+    # Create temp directory for variants
+    with tempfile.TemporaryDirectory() as temp_dir:
+        temp_path = Path(temp_dir)
+        files_to_upload = []
+
+        # Full database
+        files_to_upload.append((db_path, DEFAULT_DB_FULL_FILENAME))
+
+        # Lite version
+        if include_lite:
+            lite_path = temp_path / DEFAULT_DB_LITE_FILENAME
+            create_lite_database(db_path, lite_path)
+            files_to_upload.append((lite_path, DEFAULT_DB_LITE_FILENAME))
+
+        # Copy all files to a staging directory for upload_folder
+        staging_dir = temp_path / "staging"
+        staging_dir.mkdir()
+
+        for local_path, remote_filename in files_to_upload:
+            shutil.copy2(local_path, staging_dir / remote_filename)
+            logger.info(f"Staged {remote_filename}")
+
+        # Add README.md from HUGGINGFACE_README.md
+        if include_readme:
+            # Look for HUGGINGFACE_README.md in the package directory
+            package_dir = Path(__file__).parent.parent.parent.parent  # Go up to statement-extractor-lib
+            readme_source = package_dir / "HUGGINGFACE_README.md"
+            if readme_source.exists():
+                shutil.copy2(readme_source, staging_dir / "README.md")
+                files_to_upload.append((readme_source, "README.md"))
+                logger.info("Staged README.md from HUGGINGFACE_README.md")
+            else:
+                logger.warning(f"HUGGINGFACE_README.md not found at {readme_source}")
+
+        # Upload all files in a single commit to avoid LFS pointer issues
+        logger.info(f"Uploading {len(files_to_upload)} files to {repo_id}...")
+        api.upload_folder(
+            folder_path=str(staging_dir),
+            repo_id=repo_id,
+            repo_type="dataset",
+            commit_message=commit_message,
+        )
+
+        for _, remote_filename in files_to_upload:
+            results[remote_filename] = f"https://huggingface.co/datasets/{repo_id}/blob/main/{remote_filename}"
+            logger.info(f"Uploaded {remote_filename}")
+
+    return results
+
+
+def download_database(
+    repo_id: str = DEFAULT_REPO_ID,
+    filename: str = DEFAULT_DB_FILENAME,
+    revision: Optional[str] = None,
+    cache_dir: Optional[Path] = None,
+    force_download: bool = False,
+) -> Path:
+    """
+    Download entity database from HuggingFace Hub.
+
+    Args:
+        repo_id: HuggingFace repo ID (e.g., "Corp-o-Rate-Community/entity-references")
+        filename: Database filename in the repo
+        revision: Git revision (branch, tag, commit) or None for latest
+        cache_dir: Local cache directory
+        force_download: Force re-download even if cached
+
+    Returns:
+        Path to the downloaded database file
+    """
+    try:
+        from huggingface_hub import hf_hub_download
+    except ImportError:
+        raise ImportError(
+            "huggingface_hub is required for database download. "
+            "Install with: pip install huggingface_hub"
+        )
+
+    cache_dir = cache_dir or DEFAULT_CACHE_DIR
+    cache_dir.mkdir(parents=True, exist_ok=True)
+
+    logger.info(f"Downloading entity database from {repo_id}...")
+
+    local_path = hf_hub_download(
+        repo_id=repo_id,
+        filename=filename,
+        revision=revision,
+        cache_dir=str(cache_dir),
+        force_download=force_download,
+        repo_type="dataset",
+    )
+
+    logger.info(f"Database downloaded to {local_path}")
+    return Path(local_path)
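On the consumer side, the hub helpers above resolve a local copy of the entity database (the lite variant by default) and can report whether a newer revision exists; a minimal sketch, assuming `huggingface_hub` is installed and the default `Corp-o-Rate-Community/entity-references` dataset is reachable:

```python
from statement_extractor.database.hub import check_for_updates, get_database_path

# Reuses a cached entities-lite.db if present, otherwise downloads it.
db_path = get_database_path(auto_download=True)
if db_path is None:
    print("entity database unavailable (not cached and download failed)")
else:
    print(f"using entity database at {db_path}")

# Compare a cached revision (None here) against the latest commit on the Hub.
update_available, latest = check_for_updates(current_version=None)
print(f"update available: {update_available}, latest revision: {latest}")
```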
statement_extractor/database/importers/__init__.py
@@ -0,0 +1,32 @@
+"""
+Data importers for the entity database.
+
+Provides importers for various data sources:
+- GLEIF: Legal Entity Identifier data
+- SEC Edgar: US SEC company data
+- SEC Form 4: US SEC insider ownership data (officers/directors)
+- Companies House: UK company data
+- Wikidata: Wikipedia/Wikidata organization data (SPARQL-based, may timeout)
+- Wikidata People: Notable people from Wikipedia/Wikidata (SPARQL-based, may timeout)
+- Wikidata Dump: Bulk import from Wikidata JSON dump (recommended for large imports)
+"""
+
+from .gleif import GleifImporter
+from .sec_edgar import SecEdgarImporter
+from .sec_form4 import SecForm4Importer
+from .companies_house import CompaniesHouseImporter
+from .companies_house_officers import CompaniesHouseOfficersImporter
+from .wikidata import WikidataImporter
+from .wikidata_people import WikidataPeopleImporter
+from .wikidata_dump import WikidataDumpImporter
+
+__all__ = [
+    "GleifImporter",
+    "SecEdgarImporter",
+    "SecForm4Importer",
+    "CompaniesHouseImporter",
+    "CompaniesHouseOfficersImporter",
+    "WikidataImporter",
+    "WikidataPeopleImporter",
+    "WikidataDumpImporter",
+]
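Only the import surface of the importers package is visible in this hunk; constructor and run signatures live in the individual modules listed above. The sketch below sticks to the grounded imports and leaves the actual invocation as a clearly hypothetical placeholder:

```python
# Grounded: these names are re-exported by statement_extractor/database/importers/__init__.py.
from statement_extractor.database.importers import (
    GleifImporter,
    SecEdgarImporter,
    WikidataDumpImporter,
)

# Hypothetical: how an importer is constructed and executed is defined in its
# own module (e.g. wikidata_dump.py for bulk imports) and is not shown here.
# importer = WikidataDumpImporter(...)
# importer.<run method>(...)
```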