corp-extractor 0.9.3__py3-none-any.whl → 0.9.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -121,6 +121,51 @@ class CompanyEmbedder:
121
121
  )
122
122
  return embeddings.astype(np.float32)
123
123
 
124
def quantize_to_int8(self, embedding: np.ndarray) -> np.ndarray:
    """
    Convert an L2-normalized float32 embedding into its int8 form.

    Every component is multiplied by 127, rounded to the nearest
    integer, limited to [-127, 127] and then cast to ``np.int8``.
    Storing one byte per component instead of four is where the
    75% storage reduction comes from.

    Args:
        embedding: L2-normalized float32 embedding vector

    Returns:
        int8 embedding vector
    """
    scaled = np.round(embedding * 127)
    # Keep the range symmetric: -128 is never produced.
    bounded = np.clip(scaled, -127, 127)
    return bounded.astype(np.int8)
138
+
139
def embed_and_quantize(self, text: str) -> tuple[np.ndarray, np.ndarray]:
    """
    Embed a single text and return it in both precisions.

    The text is embedded once with ``self.embed``; the int8 vector is
    derived from that result via ``self.quantize_to_int8``.

    Args:
        text: Text to embed

    Returns:
        Tuple of (float32_embedding, int8_embedding)
    """
    float_vector = self.embed(text)
    int8_vector = self.quantize_to_int8(float_vector)
    return float_vector, int8_vector
151
+
152
def embed_batch_and_quantize(
    self, texts: list[str], batch_size: int = 32
) -> tuple[np.ndarray, np.ndarray]:
    """
    Embed multiple texts and return both float32 and int8 embeddings.

    Args:
        texts: List of texts to embed
        batch_size: Batch size for processing

    Returns:
        Tuple of (float32_embeddings, int8_embeddings) matrices. The int8
        matrix has the same shape as the float32 one, including for an
        empty batch.
    """
    fp32 = self.embed_batch(texts, batch_size=batch_size)
    # Quantization is element-wise, so the whole matrix is converted in a
    # single call. Building the result row by row in a Python list made an
    # empty batch come back as a float64 array of shape (0,) rather than
    # an int8 matrix.
    int8 = self.quantize_to_int8(np.asarray(fp32))
    return fp32, int8
168
+
124
169
  def similarity(self, embedding1: np.ndarray, embedding2: np.ndarray) -> float:
125
170
  """
126
171
  Compute cosine similarity between two embeddings.
@@ -20,9 +20,9 @@ logger = logging.getLogger(__name__)
20
20
 
21
21
  # Default HuggingFace repo for entity database
22
22
  DEFAULT_REPO_ID = "Corp-o-Rate-Community/entity-references"
23
- DEFAULT_DB_FILENAME = "entities-lite.db" # Lite is the default (smaller download)
24
- DEFAULT_DB_FULL_FILENAME = "entities.db"
25
- DEFAULT_DB_LITE_FILENAME = "entities-lite.db"
23
+ DEFAULT_DB_FILENAME = "entities-v2-lite.db" # Lite is the default (smaller download)
24
+ DEFAULT_DB_FULL_FILENAME = "entities-v2.db"
25
+ DEFAULT_DB_LITE_FILENAME = "entities-v2-lite.db"
26
26
 
27
27
  # Local cache directory
28
28
  DEFAULT_CACHE_DIR = Path.home() / ".cache" / "corp-extractor"
@@ -55,7 +55,8 @@ def get_database_path(
55
55
  # Check common locations
56
56
  possible_paths = [
57
57
  cache_dir / filename,
58
- cache_dir / "entities.db",
58
+ cache_dir / "entities-v2.db",
59
+ cache_dir / "entities.db", # Legacy fallback
59
60
  Path.home() / ".cache" / "huggingface" / "hub" / f"datasets--{repo_id.replace('/', '--')}" / filename,
60
61
  ]
61
62
 
@@ -219,8 +220,10 @@ def create_lite_database(
219
220
  """
220
221
  Create a lite version of the database without full records.
221
222
 
222
- The lite version strips the `record` column content (sets to empty {}),
223
- significantly reducing file size while keeping embeddings and core fields.
223
+ The lite version:
224
+ - Strips the `record` column content (sets to empty {})
225
+ - Drops float32 embedding tables (keeps only scalar int8 embeddings)
226
+ - Significantly reduces file size (~75% reduction)
224
227
 
225
228
  Args:
226
229
  source_db_path: Path to the full database
@@ -229,6 +232,8 @@ def create_lite_database(
229
232
  Returns:
230
233
  Path to the lite database
231
234
  """
235
+ import sqlite_vec
236
+
232
237
  source_db_path = Path(source_db_path)
233
238
  if not source_db_path.exists():
234
239
  raise FileNotFoundError(f"Source database not found: {source_db_path}")
@@ -246,14 +251,51 @@ def create_lite_database(
246
251
  # Connect and strip record contents
247
252
  # Use isolation_level=None for autocommit (required for VACUUM)
248
253
  conn = sqlite3.connect(str(output_path), isolation_level=None)
254
+
255
+ # Load sqlite-vec extension (required for vec0 virtual tables)
256
+ conn.enable_load_extension(True)
257
+ sqlite_vec.load(conn)
258
+ conn.enable_load_extension(False)
259
+
249
260
  try:
250
261
  # Update all records to have empty record JSON
251
262
  conn.execute("BEGIN")
252
263
  cursor = conn.execute("UPDATE organizations SET record = '{}'")
253
264
  updated = cursor.rowcount
254
- logger.info(f"Stripped {updated} record fields")
265
+ logger.info(f"Stripped {updated} organization record fields")
266
+
267
+ # Also strip people records if table exists
268
+ cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='people'")
269
+ if cursor.fetchone():
270
+ cursor = conn.execute("UPDATE people SET record = '{}'")
271
+ logger.info(f"Stripped {cursor.rowcount} people record fields")
272
+
255
273
  conn.execute("COMMIT")
256
274
 
275
+ # Drop float32 embedding tables (keep only scalar int8 for 75% storage savings)
276
+ # Check if scalar tables exist before dropping float32 tables
277
+ cursor = conn.execute(
278
+ "SELECT name FROM sqlite_master WHERE type='table' AND name='organization_embeddings_scalar'"
279
+ )
280
+ has_org_scalar = cursor.fetchone() is not None
281
+
282
+ cursor = conn.execute(
283
+ "SELECT name FROM sqlite_master WHERE type='table' AND name='person_embeddings_scalar'"
284
+ )
285
+ has_person_scalar = cursor.fetchone() is not None
286
+
287
+ if has_org_scalar:
288
+ logger.info("Dropping float32 organization_embeddings table (keeping scalar)")
289
+ conn.execute("DROP TABLE IF EXISTS organization_embeddings")
290
+ else:
291
+ logger.warning("No scalar organization embeddings found, keeping float32 table")
292
+
293
+ if has_person_scalar:
294
+ logger.info("Dropping float32 person_embeddings table (keeping scalar)")
295
+ conn.execute("DROP TABLE IF EXISTS person_embeddings")
296
+ else:
297
+ logger.warning("No scalar person embeddings found, keeping float32 table")
298
+
257
299
  # Vacuum to reclaim space (must be outside transaction)
258
300
  conn.execute("VACUUM")
259
301
  finally:
@@ -283,8 +325,8 @@ def upload_database_with_variants(
283
325
  Upload entity database with optional lite variant.
284
326
 
285
327
  First VACUUMs the database, then creates and uploads:
286
- - entities.db (full database)
287
- - entities-lite.db (without record data, smaller)
328
+ - entities-v2.db (full database with v2 normalized schema)
329
+ - entities-v2-lite.db (without record data, smaller)
288
330
  - README.md (dataset card from HUGGINGFACE_README.md)
289
331
 
290
332
  Args:
@@ -0,0 +1,264 @@
1
+ """
2
+ Shared utilities for v2 database importers.
3
+
4
+ Provides helper functions for resolving locations, roles, and QIDs
5
+ to their normalized FK references in the v2 schema.
6
+ """
7
+
8
+ import logging
9
+ from typing import TYPE_CHECKING, Optional
10
+
11
+ if TYPE_CHECKING:
12
+ from ..store import LocationsDatabase, RolesDatabase
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def parse_qid(qid_text: Optional[str]) -> Optional[int]:
    """
    Parse a QID string to integer.

    Surrounding whitespace is ignored and a single leading "Q"/"q" is
    removed. What remains must consist solely of ASCII digits.

    Args:
        qid_text: QID string like "Q12345" or just "12345"

    Returns:
        Integer QID or None if invalid
    """
    if not qid_text:
        return None

    digits = qid_text.strip()

    # Handle "Q12345" format (case-insensitive prefix)
    if digits[:1] in ("Q", "q"):
        digits = digits[1:]

    # int() alone is too lenient: it also accepts signs, underscores,
    # inner whitespace and non-ASCII digits ("Q-5", "1_000", "Q 12"),
    # none of which is a valid QID. Require plain ASCII digits instead.
    if not (digits.isascii() and digits.isdigit()):
        return None

    return int(digits)
41
+
42
+
43
def format_qid(qid_int: Optional[int]) -> Optional[str]:
    """
    Render an integer QID in its "Q"-prefixed string form.

    Args:
        qid_int: Integer QID (e.g., 12345)

    Returns:
        String QID like "Q12345", or None when no QID was given
    """
    return None if qid_int is None else f"Q{qid_int}"
56
+
57
+
58
def normalize_name(name: str) -> str:
    """
    Produce the lookup form of a name.

    Args:
        name: Name to normalize

    Returns:
        The name lower-cased with surrounding whitespace removed;
        an empty string when the input is empty or None
    """
    if name:
        lowered = name.lower()
        return lowered.strip()
    return ""
71
+
72
+
73
def get_or_create_location(
    locations_db: "LocationsDatabase",
    name: str,
    location_type_id: int,
    source_id: int = 4,  # wikidata
    qid: Optional[int] = None,
    source_identifier: Optional[str] = None,
    parent_ids: Optional[list[int]] = None,
) -> int:
    """
    Look up a location record, creating it if needed.

    This is a thin pass-through: every argument is forwarded by keyword
    to ``locations_db.get_or_create`` and its result is returned as-is.

    Args:
        locations_db: LocationsDatabase instance
        name: Location name
        location_type_id: FK to location_types table
        source_id: FK to source_types table
        qid: Optional Wikidata QID as integer
        source_identifier: Optional source-specific identifier
        parent_ids: Optional list of parent location IDs

    Returns:
        Location ID
    """
    record_fields = {
        "name": name,
        "location_type_id": location_type_id,
        "source_id": source_id,
        "qid": qid,
        "source_identifier": source_identifier,
        "parent_ids": parent_ids,
    }
    return locations_db.get_or_create(**record_fields)
105
+
106
+
107
def get_or_create_role(
    roles_db: "RolesDatabase",
    name: str,
    source_id: int = 4,  # wikidata
    qid: Optional[int] = None,
    source_identifier: Optional[str] = None,
) -> int:
    """
    Look up a role record, creating it if needed.

    This is a thin pass-through: every argument is forwarded by keyword
    to ``roles_db.get_or_create`` and its result is returned as-is.

    Args:
        roles_db: RolesDatabase instance
        name: Role/title name
        source_id: FK to source_types table
        qid: Optional Wikidata QID as integer
        source_identifier: Optional source-specific identifier

    Returns:
        Role ID
    """
    record_fields = {
        "name": name,
        "source_id": source_id,
        "qid": qid,
        "source_identifier": source_identifier,
    }
    return roles_db.get_or_create(**record_fields)
133
+
134
+
135
def resolve_country_to_location_id(
    locations_db: "LocationsDatabase",
    country_text: str,
) -> Optional[int]:
    """
    Map a country name or code to a location ID.

    Non-empty text is handed to ``locations_db.resolve_region_text``;
    empty or missing text short-circuits to None without a lookup.

    Args:
        locations_db: LocationsDatabase instance
        country_text: Country code (e.g., "US") or name (e.g., "United States")

    Returns:
        Location ID or None if not found
    """
    if country_text:
        return locations_db.resolve_region_text(country_text)
    return None
153
+
154
+
155
def get_source_id(source_name: str) -> int:
    """
    Translate a source name into its source_id.

    Args:
        source_name: Source name (e.g., "gleif", "sec_edgar")

    Returns:
        The ID mapped in ``SOURCE_NAME_TO_ID``; 4 when the name is unknown
    """
    # Imported lazily, inside the function, exactly as the module does elsewhere.
    from ..seed_data import SOURCE_NAME_TO_ID

    fallback_id = 4  # wikidata
    return SOURCE_NAME_TO_ID.get(source_name, fallback_id)
167
+
168
+
169
def get_source_name(source_id: int) -> str:
    """
    Translate a source_id into its source name.

    Args:
        source_id: Source ID

    Returns:
        The name mapped in ``SOURCE_ID_TO_NAME``; "wikidata" when the ID
        is unknown
    """
    from ..seed_data import SOURCE_ID_TO_NAME

    fallback_name = "wikidata"
    return SOURCE_ID_TO_NAME.get(source_id, fallback_name)
181
+
182
+
183
def get_entity_type_id(entity_type_name: str) -> int:
    """
    Translate an entity type name into its entity_type_id.

    Args:
        entity_type_name: Entity type name (e.g., "business", "fund")

    Returns:
        The ID mapped in ``ORG_TYPE_NAME_TO_ID``; 17 when the name is unknown
    """
    from ..seed_data import ORG_TYPE_NAME_TO_ID

    fallback_id = 17  # unknown
    return ORG_TYPE_NAME_TO_ID.get(entity_type_name, fallback_id)
195
+
196
+
197
def get_entity_type_name(entity_type_id: int) -> str:
    """
    Translate an entity_type_id into its entity type name.

    Args:
        entity_type_id: Entity type ID

    Returns:
        The name mapped in ``ORG_TYPE_ID_TO_NAME``; "unknown" when the ID
        is not present
    """
    from ..seed_data import ORG_TYPE_ID_TO_NAME

    fallback_name = "unknown"
    return ORG_TYPE_ID_TO_NAME.get(entity_type_id, fallback_name)
209
+
210
+
211
def get_person_type_id(person_type_name: str) -> int:
    """
    Translate a person type name into its person_type_id.

    Args:
        person_type_name: Person type name (e.g., "executive", "politician")

    Returns:
        The ID mapped in ``PEOPLE_TYPE_NAME_TO_ID``; 15 when the name is
        unknown
    """
    from ..seed_data import PEOPLE_TYPE_NAME_TO_ID

    fallback_id = 15  # unknown
    return PEOPLE_TYPE_NAME_TO_ID.get(person_type_name, fallback_id)
223
+
224
+
225
def get_person_type_name(person_type_id: int) -> str:
    """
    Translate a person_type_id into its person type name.

    Args:
        person_type_id: Person type ID

    Returns:
        The name mapped in ``PEOPLE_TYPE_ID_TO_NAME``; "unknown" when the
        ID is not present
    """
    from ..seed_data import PEOPLE_TYPE_ID_TO_NAME

    fallback_name = "unknown"
    return PEOPLE_TYPE_ID_TO_NAME.get(person_type_id, fallback_name)
237
+
238
+
239
def get_location_type_id(location_type_name: str) -> int:
    """
    Translate a location type name into its location_type_id.

    Args:
        location_type_name: Location type name (e.g., "country", "city")

    Returns:
        The ID mapped in ``LOCATION_TYPE_NAME_TO_ID``; 36 when the name is
        unknown
    """
    from ..seed_data import LOCATION_TYPE_NAME_TO_ID

    fallback_id = 36  # other
    return LOCATION_TYPE_NAME_TO_ID.get(location_type_name, fallback_id)
251
+
252
+
253
def get_location_type_id_from_qid(wikidata_qid: int) -> int:
    """
    Translate a Wikidata P31 (instance-of) QID into a location_type_id.

    Args:
        wikidata_qid: Wikidata instance-of QID (e.g., 515 for city)

    Returns:
        The ID mapped in ``LOCATION_TYPE_QID_TO_ID``; 36 (other) when the
        QID is not present
    """
    from ..seed_data import LOCATION_TYPE_QID_TO_ID

    fallback_id = 36  # other
    return LOCATION_TYPE_QID_TO_ID.get(wikidata_qid, fallback_id)