corp-extractor 0.5.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
- corp_extractor-0.9.0.dist-info/RECORD +76 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +1227 -10
- statement_extractor/data/statement_taxonomy.json +6949 -1159
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +520 -0
- statement_extractor/database/importers/__init__.py +24 -0
- statement_extractor/database/importers/companies_house.py +545 -0
- statement_extractor/database/importers/gleif.py +538 -0
- statement_extractor/database/importers/sec_edgar.py +375 -0
- statement_extractor/database/importers/wikidata.py +1012 -0
- statement_extractor/database/importers/wikidata_people.py +632 -0
- statement_extractor/database/models.py +230 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +1609 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +173 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/models/__init__.py +16 -1
- statement_extractor/models/canonical.py +44 -1
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/labels.py +47 -18
- statement_extractor/models/qualifiers.py +51 -3
- statement_extractor/models/statement.py +26 -0
- statement_extractor/pipeline/config.py +6 -11
- statement_extractor/pipeline/orchestrator.py +80 -111
- statement_extractor/pipeline/registry.py +52 -46
- statement_extractor/plugins/__init__.py +20 -8
- statement_extractor/plugins/base.py +334 -64
- statement_extractor/plugins/extractors/gliner2.py +10 -0
- statement_extractor/plugins/labelers/taxonomy.py +18 -5
- statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +11 -0
- statement_extractor/plugins/qualifiers/companies_house.py +14 -3
- statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
- statement_extractor/plugins/qualifiers/gleif.py +14 -3
- statement_extractor/plugins/qualifiers/person.py +578 -14
- statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/t5_gemma.py +158 -53
- statement_extractor/plugins/taxonomy/embedding.py +193 -46
- statement_extractor/plugins/taxonomy/mnli.py +16 -4
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.5.0.dist-info/RECORD +0 -55
- statement_extractor/plugins/canonicalizers/__init__.py +0 -17
- statement_extractor/plugins/canonicalizers/base.py +0 -9
- statement_extractor/plugins/canonicalizers/location.py +0 -219
- statement_extractor/plugins/canonicalizers/organization.py +0 -230
- statement_extractor/plugins/canonicalizers/person.py +0 -242
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Entity/Organization database module for embedding-based entity qualification.
|
|
3
|
+
|
|
4
|
+
Provides:
|
|
5
|
+
- CompanyRecord: Pydantic model for organization records
|
|
6
|
+
- PersonRecord: Pydantic model for person records
|
|
7
|
+
- OrganizationDatabase: sqlite-vec database for org embedding search
|
|
8
|
+
- PersonDatabase: sqlite-vec database for person embedding search
|
|
9
|
+
- CompanyEmbedder: Embedding service using Gemma3
|
|
10
|
+
- Hub functions: Download/upload database from HuggingFace
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from .models import CompanyRecord, CompanyMatch, DatabaseStats, PersonRecord, PersonMatch, PersonType
|
|
14
|
+
from .store import OrganizationDatabase, get_database, PersonDatabase, get_person_database
|
|
15
|
+
from .embeddings import CompanyEmbedder, get_embedder
|
|
16
|
+
from .hub import (
|
|
17
|
+
download_database,
|
|
18
|
+
get_database_path,
|
|
19
|
+
upload_database,
|
|
20
|
+
upload_database_with_variants,
|
|
21
|
+
)
|
|
22
|
+
from .resolver import OrganizationResolver, get_organization_resolver
|
|
23
|
+
|
|
24
|
+
# Backwards compatibility alias
|
|
25
|
+
CompanyDatabase = OrganizationDatabase
|
|
26
|
+
|
|
27
|
+
__all__ = [
|
|
28
|
+
# Organization models
|
|
29
|
+
"CompanyRecord",
|
|
30
|
+
"CompanyMatch",
|
|
31
|
+
"DatabaseStats",
|
|
32
|
+
"OrganizationDatabase",
|
|
33
|
+
"CompanyDatabase", # Backwards compatibility alias
|
|
34
|
+
"get_database",
|
|
35
|
+
# Person models
|
|
36
|
+
"PersonRecord",
|
|
37
|
+
"PersonMatch",
|
|
38
|
+
"PersonType",
|
|
39
|
+
"PersonDatabase",
|
|
40
|
+
"get_person_database",
|
|
41
|
+
# Embedding
|
|
42
|
+
"CompanyEmbedder",
|
|
43
|
+
"get_embedder",
|
|
44
|
+
# Hub
|
|
45
|
+
"download_database",
|
|
46
|
+
"get_database_path",
|
|
47
|
+
"upload_database",
|
|
48
|
+
"upload_database_with_variants",
|
|
49
|
+
# Resolver
|
|
50
|
+
"OrganizationResolver",
|
|
51
|
+
"get_organization_resolver",
|
|
52
|
+
]
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Embedding service for company name matching.
|
|
3
|
+
|
|
4
|
+
Uses sentence-transformers with Gemma3 embedding model for high-quality
|
|
5
|
+
semantic similarity matching of company names.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class CompanyEmbedder:
    """
    Embedding service for company names.

    Uses Google's embedding models for high-quality semantic embeddings
    suitable for company name matching. The model is loaded lazily, so
    constructing an instance is cheap until the first embed call.
    """

    # Default model - good balance of quality and speed
    DEFAULT_MODEL = "google/embeddinggemma-300m"
    # Alternative: smaller but faster
    # DEFAULT_MODEL = "all-MiniLM-L6-v2"

    def __init__(
        self,
        model_name: str = DEFAULT_MODEL,
        device: Optional[str] = None,
    ):
        """
        Initialize the embedder.

        Args:
            model_name: HuggingFace model ID for embeddings
            device: Device to use (cuda, mps, cpu, or None for auto)
        """
        self._model_name = model_name
        self._device = device
        self._model = None  # populated lazily by _load_model()
        self._embedding_dim: Optional[int] = None

    @property
    def embedding_dim(self) -> int:
        """Get the embedding dimension (loads model if needed)."""
        if self._embedding_dim is None:
            self._load_model()
        return self._embedding_dim

    def _load_model(self) -> None:
        """Load the embedding model (lazy loading).

        Raises:
            ImportError: if sentence-transformers is not installed.
        """
        if self._model is not None:
            return

        try:
            from sentence_transformers import SentenceTransformer
            import torch

            device = self._device
            if device is None:
                # Auto-select the best available device. Older torch builds
                # may lack torch.backends.mps entirely, so probe with getattr
                # instead of attribute access (avoids AttributeError).
                mps_backend = getattr(torch.backends, "mps", None)
                if torch.cuda.is_available():
                    device = "cuda"
                elif mps_backend is not None and mps_backend.is_available():
                    device = "mps"
                else:
                    device = "cpu"

            logger.info(f"Loading embedding model '{self._model_name}' on {device}...")
            self._model = SentenceTransformer(self._model_name, device=device)
            self._embedding_dim = self._model.get_sentence_embedding_dimension()
            logger.info(f"Embedding model loaded (dim={self._embedding_dim})")

        except ImportError as e:
            raise ImportError(
                "sentence-transformers is required for embeddings. "
                "Install with: pip install sentence-transformers"
            ) from e

    def embed(self, text: str) -> np.ndarray:
        """
        Embed a single text string.

        Args:
            text: Text to embed

        Returns:
            Normalized embedding vector as numpy array (float32)
        """
        self._load_model()

        embedding = self._model.encode(
            text,
            convert_to_numpy=True,
            show_progress_bar=False,
            normalize_embeddings=True,
        )
        return embedding.astype(np.float32)

    def embed_batch(self, texts: list[str], batch_size: int = 32) -> np.ndarray:
        """
        Embed multiple texts in batches.

        Args:
            texts: List of texts to embed
            batch_size: Batch size for processing

        Returns:
            Array of normalized embeddings (N x dim, float32)
        """
        self._load_model()

        embeddings = self._model.encode(
            texts,
            convert_to_numpy=True,
            # Only show a progress bar for non-trivial workloads.
            show_progress_bar=len(texts) > 100,
            batch_size=batch_size,
            normalize_embeddings=True,
        )
        return embeddings.astype(np.float32)

    def similarity(self, embedding1: np.ndarray, embedding2: np.ndarray) -> float:
        """
        Compute cosine similarity between two embeddings.

        Args:
            embedding1: First embedding (assumed L2-normalized)
            embedding2: Second embedding (assumed L2-normalized)

        Returns:
            Cosine similarity score in [-1, 1] for normalized vectors
            (the dot product equals cosine similarity only when both
            inputs are normalized).
        """
        return float(np.dot(embedding1, embedding2))

    def search_similar(
        self,
        query_embedding: np.ndarray,
        candidate_embeddings: np.ndarray,
        top_k: int = 20,
    ) -> list[tuple[int, float]]:
        """
        Find most similar embeddings to query.

        Args:
            query_embedding: Query embedding vector (assumed normalized)
            candidate_embeddings: Matrix of candidate embeddings (N x dim)
            top_k: Number of results to return; non-positive yields []

        Returns:
            List of (index, similarity) tuples, sorted by similarity descending
        """
        if top_k <= 0:
            # Guard: with top_k == 0 the slice [-top_k:] below would be [0:]
            # and silently return ALL candidates instead of none.
            return []

        # Compute similarities (dot product for normalized vectors)
        similarities = np.dot(candidate_embeddings, query_embedding)

        # Get top-k indices
        if len(similarities) <= top_k:
            indices = np.argsort(similarities)[::-1]
        else:
            # argpartition is O(N) to select the winners; then sort only them.
            indices = np.argpartition(similarities, -top_k)[-top_k:]
            indices = indices[np.argsort(similarities[indices])[::-1]]

        return [(int(idx), float(similarities[idx])) for idx in indices]
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
# Module-level cache so callers share one loaded model per model name.
_default_embedder: Optional[CompanyEmbedder] = None


def get_embedder(model_name: str = CompanyEmbedder.DEFAULT_MODEL) -> CompanyEmbedder:
    """
    Get or create a shared embedder instance.

    The cached instance is reused while the requested model name matches;
    requesting a different model replaces the cached instance.

    Args:
        model_name: HuggingFace model ID

    Returns:
        CompanyEmbedder instance
    """
    global _default_embedder

    cached = _default_embedder
    if cached is not None and cached._model_name == model_name:
        return cached

    _default_embedder = CompanyEmbedder(model_name=model_name)
    return _default_embedder
|