corp-extractor 0.5.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +1227 -10
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +520 -0
  9. statement_extractor/database/importers/__init__.py +24 -0
  10. statement_extractor/database/importers/companies_house.py +545 -0
  11. statement_extractor/database/importers/gleif.py +538 -0
  12. statement_extractor/database/importers/sec_edgar.py +375 -0
  13. statement_extractor/database/importers/wikidata.py +1012 -0
  14. statement_extractor/database/importers/wikidata_people.py +632 -0
  15. statement_extractor/database/models.py +230 -0
  16. statement_extractor/database/resolver.py +245 -0
  17. statement_extractor/database/store.py +1609 -0
  18. statement_extractor/document/__init__.py +62 -0
  19. statement_extractor/document/chunker.py +410 -0
  20. statement_extractor/document/context.py +171 -0
  21. statement_extractor/document/deduplicator.py +173 -0
  22. statement_extractor/document/html_extractor.py +246 -0
  23. statement_extractor/document/loader.py +303 -0
  24. statement_extractor/document/pipeline.py +388 -0
  25. statement_extractor/document/summarizer.py +195 -0
  26. statement_extractor/models/__init__.py +16 -1
  27. statement_extractor/models/canonical.py +44 -1
  28. statement_extractor/models/document.py +308 -0
  29. statement_extractor/models/labels.py +47 -18
  30. statement_extractor/models/qualifiers.py +51 -3
  31. statement_extractor/models/statement.py +26 -0
  32. statement_extractor/pipeline/config.py +6 -11
  33. statement_extractor/pipeline/orchestrator.py +80 -111
  34. statement_extractor/pipeline/registry.py +52 -46
  35. statement_extractor/plugins/__init__.py +20 -8
  36. statement_extractor/plugins/base.py +334 -64
  37. statement_extractor/plugins/extractors/gliner2.py +10 -0
  38. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  39. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  40. statement_extractor/plugins/pdf/__init__.py +10 -0
  41. statement_extractor/plugins/pdf/pypdf.py +291 -0
  42. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  43. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  44. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  45. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  46. statement_extractor/plugins/qualifiers/person.py +578 -14
  47. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  48. statement_extractor/plugins/scrapers/__init__.py +10 -0
  49. statement_extractor/plugins/scrapers/http.py +236 -0
  50. statement_extractor/plugins/splitters/t5_gemma.py +158 -53
  51. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  52. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  53. statement_extractor/scoring.py +8 -8
  54. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  55. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  56. statement_extractor/plugins/canonicalizers/base.py +0 -9
  57. statement_extractor/plugins/canonicalizers/location.py +0 -219
  58. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  59. statement_extractor/plugins/canonicalizers/person.py +0 -242
  60. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  61. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,52 @@
1
+ """
2
+ Entity/Organization database module for embedding-based entity qualification.
3
+
4
+ Provides:
5
+ - CompanyRecord: Pydantic model for organization records
6
+ - PersonRecord: Pydantic model for person records
7
+ - OrganizationDatabase: sqlite-vec database for org embedding search
8
+ - PersonDatabase: sqlite-vec database for person embedding search
9
+ - CompanyEmbedder: Embedding service using Gemma3
10
+ - Hub functions: Download/upload database from HuggingFace
11
+ """
12
+
13
+ from .models import CompanyRecord, CompanyMatch, DatabaseStats, PersonRecord, PersonMatch, PersonType
14
+ from .store import OrganizationDatabase, get_database, PersonDatabase, get_person_database
15
+ from .embeddings import CompanyEmbedder, get_embedder
16
+ from .hub import (
17
+ download_database,
18
+ get_database_path,
19
+ upload_database,
20
+ upload_database_with_variants,
21
+ )
22
+ from .resolver import OrganizationResolver, get_organization_resolver
23
+
24
+ # Backwards compatibility alias
25
+ CompanyDatabase = OrganizationDatabase
26
+
27
+ __all__ = [
28
+ # Organization models
29
+ "CompanyRecord",
30
+ "CompanyMatch",
31
+ "DatabaseStats",
32
+ "OrganizationDatabase",
33
+ "CompanyDatabase", # Backwards compatibility alias
34
+ "get_database",
35
+ # Person models
36
+ "PersonRecord",
37
+ "PersonMatch",
38
+ "PersonType",
39
+ "PersonDatabase",
40
+ "get_person_database",
41
+ # Embedding
42
+ "CompanyEmbedder",
43
+ "get_embedder",
44
+ # Hub
45
+ "download_database",
46
+ "get_database_path",
47
+ "upload_database",
48
+ "upload_database_with_variants",
49
+ # Resolver
50
+ "OrganizationResolver",
51
+ "get_organization_resolver",
52
+ ]
@@ -0,0 +1,186 @@
1
+ """
2
+ Embedding service for company name matching.
3
+
4
+ Uses sentence-transformers with Gemma3 embedding model for high-quality
5
+ semantic similarity matching of company names.
6
+ """
7
+
8
+ import logging
9
+ from typing import Optional
10
+
11
+ import numpy as np
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class CompanyEmbedder:
    """
    Embedding service for company names.

    Wraps a sentence-transformers model (Google's EmbeddingGemma by default)
    to produce L2-normalized embeddings suitable for semantic similarity
    matching of company names.  The underlying model is loaded lazily on
    first use, so constructing an instance is cheap.
    """

    # Default model - good balance of quality and speed
    DEFAULT_MODEL = "google/embeddinggemma-300m"
    # Alternative: smaller but faster
    # DEFAULT_MODEL = "all-MiniLM-L6-v2"

    def __init__(
        self,
        model_name: str = DEFAULT_MODEL,
        device: Optional[str] = None,
    ):
        """
        Initialize the embedder.

        Args:
            model_name: HuggingFace model ID for embeddings
            device: Device to use (cuda, mps, cpu, or None for auto)
        """
        self._model_name = model_name
        self._device = device
        self._model = None  # loaded lazily by _load_model()
        self._embedding_dim: Optional[int] = None

    @property
    def embedding_dim(self) -> int:
        """Get the embedding dimension (loads model if needed)."""
        if self._embedding_dim is None:
            self._load_model()
        return self._embedding_dim

    def _load_model(self) -> None:
        """Load the embedding model (lazy loading).

        Raises:
            ImportError: if sentence-transformers is not installed.
        """
        if self._model is not None:
            return

        try:
            from sentence_transformers import SentenceTransformer
            import torch

            # Auto-detect the best available device unless one was given.
            device = self._device
            if device is None:
                if torch.cuda.is_available():
                    device = "cuda"
                elif torch.backends.mps.is_available():
                    device = "mps"
                else:
                    device = "cpu"

            # Lazy %-args keep formatting off the hot path when INFO is disabled.
            logger.info("Loading embedding model '%s' on %s...", self._model_name, device)
            self._model = SentenceTransformer(self._model_name, device=device)
            self._embedding_dim = self._model.get_sentence_embedding_dimension()
            logger.info("Embedding model loaded (dim=%s)", self._embedding_dim)

        except ImportError as e:
            raise ImportError(
                "sentence-transformers is required for embeddings. "
                "Install with: pip install sentence-transformers"
            ) from e

    def embed(self, text: str) -> np.ndarray:
        """
        Embed a single text string.

        Args:
            text: Text to embed

        Returns:
            Normalized embedding vector as numpy array (float32)
        """
        self._load_model()

        embedding = self._model.encode(
            text,
            convert_to_numpy=True,
            show_progress_bar=False,
            normalize_embeddings=True,
        )
        return embedding.astype(np.float32)

    def embed_batch(self, texts: list[str], batch_size: int = 32) -> np.ndarray:
        """
        Embed multiple texts in batches.

        Args:
            texts: List of texts to embed
            batch_size: Batch size for processing

        Returns:
            Array of normalized embeddings (N x dim), float32
        """
        self._load_model()

        embeddings = self._model.encode(
            texts,
            convert_to_numpy=True,
            # Only show a progress bar for large batches.
            show_progress_bar=len(texts) > 100,
            batch_size=batch_size,
            normalize_embeddings=True,
        )
        return embeddings.astype(np.float32)

    def similarity(self, embedding1: np.ndarray, embedding2: np.ndarray) -> float:
        """
        Compute cosine similarity between two embeddings.

        Args:
            embedding1: First embedding (normalized)
            embedding2: Second embedding (normalized)

        Returns:
            Cosine similarity in [-1, 1] for normalized vectors
            (typically non-negative for text embeddings)
        """
        # Dot product equals cosine similarity for L2-normalized vectors.
        return float(np.dot(embedding1, embedding2))

    def search_similar(
        self,
        query_embedding: np.ndarray,
        candidate_embeddings: np.ndarray,
        top_k: int = 20,
    ) -> list[tuple[int, float]]:
        """
        Find most similar embeddings to query.

        Args:
            query_embedding: Query embedding vector (normalized)
            candidate_embeddings: Matrix of candidate embeddings (N x dim)
            top_k: Number of results to return; non-positive returns no results

        Returns:
            List of (index, similarity) tuples, sorted by similarity descending
        """
        # Guard: without this, top_k=0 fell through to the `[-top_k:]` slice
        # below, and `[-0:]` selects the WHOLE array — returning every
        # candidate instead of none.
        if top_k <= 0:
            return []

        # Compute similarities (dot product for normalized vectors)
        similarities = np.dot(candidate_embeddings, query_embedding)

        # Get top-k indices
        if len(similarities) <= top_k:
            indices = np.argsort(similarities)[::-1]
        else:
            # argpartition is O(n); only the k selected entries are then sorted.
            indices = np.argpartition(similarities, -top_k)[-top_k:]
            indices = indices[np.argsort(similarities[indices])[::-1]]

        return [(int(idx), float(similarities[idx])) for idx in indices]
165
+
166
+
167
# Module-level cache holding the shared embedder instance.
_default_embedder: Optional[CompanyEmbedder] = None


def get_embedder(model_name: str = CompanyEmbedder.DEFAULT_MODEL) -> CompanyEmbedder:
    """
    Get or create a shared embedder instance.

    The cached instance is reused as long as the requested model name matches
    the one it was created with; requesting a different model replaces the
    cached instance.

    Args:
        model_name: HuggingFace model ID

    Returns:
        CompanyEmbedder instance
    """
    global _default_embedder

    cached = _default_embedder
    if cached is not None and cached._model_name == model_name:
        return cached

    _default_embedder = CompanyEmbedder(model_name=model_name)
    return _default_embedder