corp-extractor 0.9.3__py3-none-any.whl → 0.9.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +33 -3
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +16 -12
- statement_extractor/cli.py +472 -45
- statement_extractor/database/embeddings.py +45 -0
- statement_extractor/database/hub.py +51 -9
- statement_extractor/database/importers/import_utils.py +264 -0
- statement_extractor/database/importers/wikidata_dump.py +334 -3
- statement_extractor/database/importers/wikidata_people.py +44 -0
- statement_extractor/database/migrate_v2.py +852 -0
- statement_extractor/database/models.py +125 -1
- statement_extractor/database/schema_v2.py +409 -0
- statement_extractor/database/seed_data.py +359 -0
- statement_extractor/database/store.py +2113 -322
- statement_extractor/plugins/qualifiers/person.py +109 -52
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
|
@@ -121,6 +121,51 @@ class CompanyEmbedder:
|
|
|
121
121
|
)
|
|
122
122
|
return embeddings.astype(np.float32)
|
|
123
123
|
|
|
124
|
+
def quantize_to_int8(self, embedding: np.ndarray) -> np.ndarray:
    """
    Quantize L2-normalized float32 embedding to int8.

    Every component is scaled by 127, rounded, and clipped to [-127, 127]
    before the cast, so out-of-range values saturate instead of wrapping.
    The original note for this method states that, for normalized
    embeddings (values in [-1, 1]), this provides 75% storage reduction
    with ~92% recall at top-100.

    Args:
        embedding: L2-normalized float32 embedding (a vector or a matrix
            of vectors). Any array-like is accepted.

    Returns:
        int8 array with the same shape as the input
    """
    # Coerce first: multiplying a plain Python sequence by 127 would repeat
    # the sequence 127 times instead of scaling its values.
    values = np.asarray(embedding)
    return np.clip(np.round(values * 127), -127, 127).astype(np.int8)
|
|
138
|
+
|
|
139
|
+
def embed_and_quantize(self, text: str) -> tuple[np.ndarray, np.ndarray]:
    """
    Embed a single text and also produce its int8-quantized form.

    Args:
        text: Text to embed

    Returns:
        Tuple of (float32_embedding, int8_embedding), where the second
        item is the first one passed through ``quantize_to_int8``
    """
    full_precision = self.embed(text)
    quantized = self.quantize_to_int8(full_precision)
    return full_precision, quantized
|
|
151
|
+
|
|
152
|
+
def embed_batch_and_quantize(
    self, texts: list[str], batch_size: int = 32
) -> tuple[np.ndarray, np.ndarray]:
    """
    Embed multiple texts and return both float32 and int8 embeddings.

    Args:
        texts: List of texts to embed
        batch_size: Batch size for processing (forwarded to ``embed_batch``)

    Returns:
        Tuple of (float32_embeddings, int8_embeddings) matrices; the int8
        matrix has the same shape as the float32 one
    """
    fp32 = self.embed_batch(texts, batch_size=batch_size)
    # quantize_to_int8 is element-wise, so the whole matrix is converted in a
    # single call. Rebuilding it row by row with np.array([...]) produced a
    # float64 array of shape (0,) when the batch was empty.
    int8 = self.quantize_to_int8(fp32)
    return fp32, int8
|
|
168
|
+
|
|
124
169
|
def similarity(self, embedding1: np.ndarray, embedding2: np.ndarray) -> float:
|
|
125
170
|
"""
|
|
126
171
|
Compute cosine similarity between two embeddings.
|
|
@@ -20,9 +20,9 @@ logger = logging.getLogger(__name__)
|
|
|
20
20
|
|
|
21
21
|
# Default HuggingFace repo for entity database
|
|
22
22
|
DEFAULT_REPO_ID = "Corp-o-Rate-Community/entity-references"
|
|
23
|
-
DEFAULT_DB_FILENAME = "entities-lite.db" # Lite is the default (smaller download)
|
|
24
|
-
DEFAULT_DB_FULL_FILENAME = "entities.db"
|
|
25
|
-
DEFAULT_DB_LITE_FILENAME = "entities-lite.db"
|
|
23
|
+
DEFAULT_DB_FILENAME = "entities-v2-lite.db" # Lite is the default (smaller download)
|
|
24
|
+
DEFAULT_DB_FULL_FILENAME = "entities-v2.db"
|
|
25
|
+
DEFAULT_DB_LITE_FILENAME = "entities-v2-lite.db"
|
|
26
26
|
|
|
27
27
|
# Local cache directory
|
|
28
28
|
DEFAULT_CACHE_DIR = Path.home() / ".cache" / "corp-extractor"
|
|
@@ -55,7 +55,8 @@ def get_database_path(
|
|
|
55
55
|
# Check common locations
|
|
56
56
|
possible_paths = [
|
|
57
57
|
cache_dir / filename,
|
|
58
|
-
cache_dir / "entities.db",
|
|
58
|
+
cache_dir / "entities-v2.db",
|
|
59
|
+
cache_dir / "entities.db", # Legacy fallback
|
|
59
60
|
Path.home() / ".cache" / "huggingface" / "hub" / f"datasets--{repo_id.replace('/', '--')}" / filename,
|
|
60
61
|
]
|
|
61
62
|
|
|
@@ -219,8 +220,10 @@ def create_lite_database(
|
|
|
219
220
|
"""
|
|
220
221
|
Create a lite version of the database without full records.
|
|
221
222
|
|
|
222
|
-
The lite version
|
|
223
|
-
|
|
223
|
+
The lite version:
|
|
224
|
+
- Strips the `record` column content (sets to empty {})
|
|
225
|
+
- Drops float32 embedding tables (keeps only scalar int8 embeddings)
|
|
226
|
+
- Significantly reduces file size (~75% reduction)
|
|
224
227
|
|
|
225
228
|
Args:
|
|
226
229
|
source_db_path: Path to the full database
|
|
@@ -229,6 +232,8 @@ def create_lite_database(
|
|
|
229
232
|
Returns:
|
|
230
233
|
Path to the lite database
|
|
231
234
|
"""
|
|
235
|
+
import sqlite_vec
|
|
236
|
+
|
|
232
237
|
source_db_path = Path(source_db_path)
|
|
233
238
|
if not source_db_path.exists():
|
|
234
239
|
raise FileNotFoundError(f"Source database not found: {source_db_path}")
|
|
@@ -246,14 +251,51 @@ def create_lite_database(
|
|
|
246
251
|
# Connect and strip record contents
|
|
247
252
|
# Use isolation_level=None for autocommit (required for VACUUM)
|
|
248
253
|
conn = sqlite3.connect(str(output_path), isolation_level=None)
|
|
254
|
+
|
|
255
|
+
# Load sqlite-vec extension (required for vec0 virtual tables)
|
|
256
|
+
conn.enable_load_extension(True)
|
|
257
|
+
sqlite_vec.load(conn)
|
|
258
|
+
conn.enable_load_extension(False)
|
|
259
|
+
|
|
249
260
|
try:
|
|
250
261
|
# Update all records to have empty record JSON
|
|
251
262
|
conn.execute("BEGIN")
|
|
252
263
|
cursor = conn.execute("UPDATE organizations SET record = '{}'")
|
|
253
264
|
updated = cursor.rowcount
|
|
254
|
-
logger.info(f"Stripped {updated} record fields")
|
|
265
|
+
logger.info(f"Stripped {updated} organization record fields")
|
|
266
|
+
|
|
267
|
+
# Also strip people records if table exists
|
|
268
|
+
cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='people'")
|
|
269
|
+
if cursor.fetchone():
|
|
270
|
+
cursor = conn.execute("UPDATE people SET record = '{}'")
|
|
271
|
+
logger.info(f"Stripped {cursor.rowcount} people record fields")
|
|
272
|
+
|
|
255
273
|
conn.execute("COMMIT")
|
|
256
274
|
|
|
275
|
+
# Drop float32 embedding tables (keep only scalar int8 for 75% storage savings)
|
|
276
|
+
# Check if scalar tables exist before dropping float32 tables
|
|
277
|
+
cursor = conn.execute(
|
|
278
|
+
"SELECT name FROM sqlite_master WHERE type='table' AND name='organization_embeddings_scalar'"
|
|
279
|
+
)
|
|
280
|
+
has_org_scalar = cursor.fetchone() is not None
|
|
281
|
+
|
|
282
|
+
cursor = conn.execute(
|
|
283
|
+
"SELECT name FROM sqlite_master WHERE type='table' AND name='person_embeddings_scalar'"
|
|
284
|
+
)
|
|
285
|
+
has_person_scalar = cursor.fetchone() is not None
|
|
286
|
+
|
|
287
|
+
if has_org_scalar:
|
|
288
|
+
logger.info("Dropping float32 organization_embeddings table (keeping scalar)")
|
|
289
|
+
conn.execute("DROP TABLE IF EXISTS organization_embeddings")
|
|
290
|
+
else:
|
|
291
|
+
logger.warning("No scalar organization embeddings found, keeping float32 table")
|
|
292
|
+
|
|
293
|
+
if has_person_scalar:
|
|
294
|
+
logger.info("Dropping float32 person_embeddings table (keeping scalar)")
|
|
295
|
+
conn.execute("DROP TABLE IF EXISTS person_embeddings")
|
|
296
|
+
else:
|
|
297
|
+
logger.warning("No scalar person embeddings found, keeping float32 table")
|
|
298
|
+
|
|
257
299
|
# Vacuum to reclaim space (must be outside transaction)
|
|
258
300
|
conn.execute("VACUUM")
|
|
259
301
|
finally:
|
|
@@ -283,8 +325,8 @@ def upload_database_with_variants(
|
|
|
283
325
|
Upload entity database with optional lite variant.
|
|
284
326
|
|
|
285
327
|
First VACUUMs the database, then creates and uploads:
|
|
286
|
-
- entities.db (full database)
|
|
287
|
-
- entities-lite.db (without record data, smaller)
|
|
328
|
+
- entities-v2.db (full database with v2 normalized schema)
|
|
329
|
+
- entities-v2-lite.db (without record data, smaller)
|
|
288
330
|
- README.md (dataset card from HUGGINGFACE_README.md)
|
|
289
331
|
|
|
290
332
|
Args:
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Shared utilities for v2 database importers.
|
|
3
|
+
|
|
4
|
+
Provides helper functions for resolving locations, roles, and QIDs
|
|
5
|
+
to their normalized FK references in the v2 schema.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from typing import TYPE_CHECKING, Optional
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from ..store import LocationsDatabase, RolesDatabase
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def parse_qid(qid_text: Optional[str]) -> Optional[int]:
    """
    Parse a QID string to integer.

    Surrounding whitespace and a single leading "Q"/"q" are removed; what
    remains must consist solely of ASCII digits.

    Args:
        qid_text: QID string like "Q12345" or just "12345"

    Returns:
        Integer QID or None if invalid
    """
    if not qid_text:
        return None

    digits = qid_text.strip()

    # Handle "Q12345" format
    if digits.startswith(("Q", "q")):
        digits = digits[1:]

    # int() on its own is too lenient for identifiers: it accepts signs
    # ("-5"), digit-group underscores ("1_000"), inner whitespace and
    # non-ASCII digits, none of which are valid QIDs.
    if not (digits.isascii() and digits.isdigit()):
        return None
    return int(digits)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def format_qid(qid_int: Optional[int]) -> Optional[str]:
    """
    Render an integer QID in its "Q"-prefixed string form.

    Args:
        qid_int: Integer QID (e.g., 12345)

    Returns:
        String QID like "Q12345", or None when no QID was given
    """
    return None if qid_int is None else f"Q{qid_int}"
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def normalize_name(name: str) -> str:
    """
    Produce the lookup form of a name.

    Args:
        name: Name to normalize

    Returns:
        The name lowercased with surrounding whitespace removed, or an
        empty string when the input is empty/falsy
    """
    if name:
        lowered = name.lower()
        return lowered.strip()
    return ""
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def get_or_create_location(
    locations_db: "LocationsDatabase",
    name: str,
    location_type_id: int,
    source_id: int = 4,  # wikidata
    qid: Optional[int] = None,
    source_identifier: Optional[str] = None,
    parent_ids: Optional[list[int]] = None,
) -> int:
    """
    Look up a location record, creating it if needed.

    Thin wrapper that forwards every argument by keyword to
    ``locations_db.get_or_create`` and returns its result unchanged.

    Args:
        locations_db: LocationsDatabase instance
        name: Location name
        location_type_id: FK to location_types table
        source_id: FK to source_types table
        qid: Optional Wikidata QID as integer
        source_identifier: Optional source-specific identifier
        parent_ids: Optional list of parent location IDs

    Returns:
        Location ID
    """
    location_fields = {
        "name": name,
        "location_type_id": location_type_id,
        "source_id": source_id,
        "qid": qid,
        "source_identifier": source_identifier,
        "parent_ids": parent_ids,
    }
    return locations_db.get_or_create(**location_fields)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def get_or_create_role(
    roles_db: "RolesDatabase",
    name: str,
    source_id: int = 4,  # wikidata
    qid: Optional[int] = None,
    source_identifier: Optional[str] = None,
) -> int:
    """
    Look up a role record, creating it if needed.

    Thin wrapper that forwards every argument by keyword to
    ``roles_db.get_or_create`` and returns its result unchanged.

    Args:
        roles_db: RolesDatabase instance
        name: Role/title name
        source_id: FK to source_types table
        qid: Optional Wikidata QID as integer
        source_identifier: Optional source-specific identifier

    Returns:
        Role ID
    """
    role_fields = {
        "name": name,
        "source_id": source_id,
        "qid": qid,
        "source_identifier": source_identifier,
    }
    return roles_db.get_or_create(**role_fields)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def resolve_country_to_location_id(
    locations_db: "LocationsDatabase",
    country_text: str,
) -> Optional[int]:
    """
    Map a country name or code to a location ID.

    Empty input short-circuits to None; anything else is delegated to
    ``locations_db.resolve_region_text``.

    Args:
        locations_db: LocationsDatabase instance
        country_text: Country code (e.g., "US") or name (e.g., "United States")

    Returns:
        Location ID or None if not found
    """
    if country_text:
        return locations_db.resolve_region_text(country_text)
    return None
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def get_source_id(source_name: str) -> int:
    """
    Translate a source name into its source_id.

    Args:
        source_name: Source name (e.g., "gleif", "sec_edgar")

    Returns:
        Source ID (1-4); names missing from the mapping fall back to 4
    """
    # Imported lazily inside the function, as in the other lookup helpers.
    from ..seed_data import SOURCE_NAME_TO_ID as name_to_id

    fallback_id = 4  # default to wikidata
    return name_to_id.get(source_name, fallback_id)
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def get_source_name(source_id: int) -> str:
    """
    Translate a source_id into its source name.

    Args:
        source_id: Source ID (1-4)

    Returns:
        Source name; IDs missing from the mapping fall back to "wikidata"
    """
    # Imported lazily inside the function, as in the other lookup helpers.
    from ..seed_data import SOURCE_ID_TO_NAME as id_to_name

    fallback_name = "wikidata"
    return id_to_name.get(source_id, fallback_name)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def get_entity_type_id(entity_type_name: str) -> int:
    """
    Translate an entity type name into its entity_type_id.

    Args:
        entity_type_name: Entity type name (e.g., "business", "fund")

    Returns:
        Entity type ID (1-17); names missing from the mapping fall back to 17
    """
    # Imported lazily inside the function, as in the other lookup helpers.
    from ..seed_data import ORG_TYPE_NAME_TO_ID as name_to_id

    fallback_id = 17  # default to unknown
    return name_to_id.get(entity_type_name, fallback_id)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def get_entity_type_name(entity_type_id: int) -> str:
    """
    Translate an entity_type_id into its entity type name.

    Args:
        entity_type_id: Entity type ID (1-17)

    Returns:
        Entity type name; IDs missing from the mapping fall back to "unknown"
    """
    # Imported lazily inside the function, as in the other lookup helpers.
    from ..seed_data import ORG_TYPE_ID_TO_NAME as id_to_name

    fallback_name = "unknown"
    return id_to_name.get(entity_type_id, fallback_name)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def get_person_type_id(person_type_name: str) -> int:
    """
    Translate a person type name into its person_type_id.

    Args:
        person_type_name: Person type name (e.g., "executive", "politician")

    Returns:
        Person type ID (1-15); names missing from the mapping fall back to 15
    """
    # Imported lazily inside the function, as in the other lookup helpers.
    from ..seed_data import PEOPLE_TYPE_NAME_TO_ID as name_to_id

    fallback_id = 15  # default to unknown
    return name_to_id.get(person_type_name, fallback_id)
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def get_person_type_name(person_type_id: int) -> str:
    """
    Translate a person_type_id into its person type name.

    Args:
        person_type_id: Person type ID (1-15)

    Returns:
        Person type name; IDs missing from the mapping fall back to "unknown"
    """
    # Imported lazily inside the function, as in the other lookup helpers.
    from ..seed_data import PEOPLE_TYPE_ID_TO_NAME as id_to_name

    fallback_name = "unknown"
    return id_to_name.get(person_type_id, fallback_name)
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def get_location_type_id(location_type_name: str) -> int:
    """
    Translate a location type name into its location_type_id.

    Args:
        location_type_name: Location type name (e.g., "country", "city")

    Returns:
        Location type ID; names missing from the mapping fall back to 36
    """
    # Imported lazily inside the function, as in the other lookup helpers.
    from ..seed_data import LOCATION_TYPE_NAME_TO_ID as name_to_id

    fallback_id = 36  # default to other
    return name_to_id.get(location_type_name, fallback_id)
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def get_location_type_id_from_qid(wikidata_qid: int) -> int:
    """
    Translate a Wikidata P31 (instance-of) QID into a location_type_id.

    Args:
        wikidata_qid: Wikidata instance-of QID (e.g., 515 for city)

    Returns:
        Location type ID (defaults to 36 = other)
    """
    # Imported lazily inside the function, as in the other lookup helpers.
    from ..seed_data import LOCATION_TYPE_QID_TO_ID as qid_to_type_id

    fallback_id = 36  # default to other
    return qid_to_type_id.get(wikidata_qid, fallback_id)
|