corp-extractor 0.9.3__py3-none-any.whl → 0.9.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +33 -3
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +16 -12
- statement_extractor/cli.py +472 -45
- statement_extractor/database/embeddings.py +45 -0
- statement_extractor/database/hub.py +51 -9
- statement_extractor/database/importers/import_utils.py +264 -0
- statement_extractor/database/importers/wikidata_dump.py +334 -3
- statement_extractor/database/importers/wikidata_people.py +44 -0
- statement_extractor/database/migrate_v2.py +852 -0
- statement_extractor/database/models.py +125 -1
- statement_extractor/database/schema_v2.py +409 -0
- statement_extractor/database/seed_data.py +359 -0
- statement_extractor/database/store.py +2113 -322
- statement_extractor/plugins/qualifiers/person.py +109 -52
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
|
@@ -1,5 +1,11 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Pydantic models for organization/entity database records.
|
|
3
|
+
|
|
4
|
+
v2 Schema Changes:
|
|
5
|
+
- Added SimplifiedLocationType enum for location categorization
|
|
6
|
+
- Added SourceTypeEnum for normalized source references
|
|
7
|
+
- Added RoleRecord and LocationRecord models for new tables
|
|
8
|
+
- Models support both TEXT-based v1 and FK-based v2 schemas
|
|
3
9
|
"""
|
|
4
10
|
|
|
5
11
|
from enum import Enum
|
|
@@ -8,7 +14,36 @@ from typing import Any, Literal, Optional
|
|
|
8
14
|
from pydantic import BaseModel, Field
|
|
9
15
|
|
|
10
16
|
|
|
11
|
-
|
|
17
|
+
# Legacy source types for backward compatibility with v1 schema
|
|
18
|
+
SourceType = Literal["gleif", "sec_edgar", "companies_house", "wikipedia", "wikidata"]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SourceTypeEnum(str, Enum):
    """
    Closed set of data sources for the v2 normalized schema.

    Each member's value is the source name. Members are used as a foreign
    key reference into the ``source_types`` table; the comment above each
    member records the row id noted for it there.
    """

    # id=1: GLEIF LEI data
    GLEIF = "gleif"
    # id=2: SEC EDGAR filings
    SEC_EDGAR = "sec_edgar"
    # id=3: UK Companies House
    COMPANIES_HOUSE = "companies_house"
    # id=4: Wikidata/Wikipedia
    WIKIDATA = "wikidata"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class SimplifiedLocationType(str, Enum):
    """
    Coarse location categories used when querying.

    Detailed Wikidata location types are mapped onto these canonical
    buckets so searches can be filtered by kind (e.g., "find all cities").
    The comment above each member records the id noted for it.
    """

    # id=1: Continents (Q5107)
    CONTINENT = "continent"
    # id=2: Countries and sovereign states
    COUNTRY = "country"
    # id=3: States, provinces, regions, counties
    SUBDIVISION = "subdivision"
    # id=4: Cities, towns, municipalities, communes
    CITY = "city"
    # id=5: Districts, boroughs, neighborhoods
    DISTRICT = "district"
    # id=6: Former countries, historic territories
    HISTORIC = "historic"
    # id=7: Unclassified locations
    OTHER = "other"
|
|
12
47
|
|
|
13
48
|
|
|
14
49
|
class EntityType(str, Enum):
|
|
@@ -105,9 +140,98 @@ class CompanyRecord(BaseModel):
|
|
|
105
140
|
}
|
|
106
141
|
|
|
107
142
|
|
|
143
|
+
# Person sources (same as org sources but without GLEIF)
|
|
108
144
|
PersonSourceType = Literal["wikidata", "sec_edgar", "companies_house"]
|
|
109
145
|
|
|
110
146
|
|
|
147
|
+
# =============================================================================
|
|
148
|
+
# ROLE RECORD MODEL (v2)
|
|
149
|
+
# =============================================================================
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class RoleRecord(BaseModel):
    """
    A single role / job-title entry destined for the roles table.

    Roles normalize job titles across sources and enable role-based search.
    Equivalent titles (e.g., CEO, Chief Executive) can be grouped through
    canonicalization.
    """

    name: str = Field(
        ...,
        description="Role/title name (e.g., 'Chief Executive Officer')",
    )
    source: SourceType = Field(default="wikidata", description="Data source")
    source_id: Optional[str] = Field(
        default=None,
        description="Source identifier (e.g., Q484876 for CEO)",
    )
    qid: Optional[int] = Field(
        default=None,
        description="Wikidata QID as integer (e.g., 484876)",
    )
    record: dict[str, Any] = Field(
        default_factory=dict,
        description="Original record from source",
    )

    @property
    def canonical_id(self) -> str:
        """
        Build the canonical ID for this role.

        Returns:
            ``source:source_id`` when a source_id is set; otherwise the role
            name stands in for the identifier (``source:name``).
        """
        # An empty or missing source_id falls back to the role name.
        identifier = self.source_id or self.name
        return f"{self.source}:{identifier}"

    def model_dump_for_db(self) -> dict[str, Any]:
        """
        Flatten the record into a dict suitable for database storage.

        Returns:
            A dict with keys name, source, source_id, qid and record. A
            missing source_id is emitted as an empty string; ``record`` is
            passed through unchanged.
        """
        stored_source_id = self.source_id if self.source_id else ""
        return {
            "name": self.name,
            "source": self.source,
            "source_id": stored_source_id,
            "qid": self.qid,
            "record": self.record,
        }
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
# =============================================================================
|
|
184
|
+
# LOCATION RECORD MODEL (v2)
|
|
185
|
+
# =============================================================================
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
class LocationRecord(BaseModel):
    """
    A single place entry destined for the locations table.

    Holds geopolitical entities (countries, states, cities) together with
    their position in the location hierarchy and their type classification.
    """

    name: str = Field(
        ...,
        description="Location name (e.g., 'United States', 'California')",
    )
    source: SourceType = Field(default="wikidata", description="Data source")
    source_id: Optional[str] = Field(
        default=None,
        description="Source identifier (e.g., 'US', 'Q30')",
    )
    qid: Optional[int] = Field(
        default=None,
        description="Wikidata QID as integer (e.g., 30 for USA)",
    )
    location_type: str = Field(
        default="country",
        description="Detailed location type (e.g., 'us_state', 'city')",
    )
    simplified_type: SimplifiedLocationType = Field(
        default=SimplifiedLocationType.COUNTRY,
        description="Simplified type for filtering",
    )
    parent_ids: list[int] = Field(
        default_factory=list,
        description="Parent location IDs in hierarchy (e.g., [country_id, state_id])",
    )
    from_date: Optional[str] = Field(
        default=None,
        description="Start date (ISO format YYYY-MM-DD)",
    )
    to_date: Optional[str] = Field(
        default=None,
        description="End date (ISO format YYYY-MM-DD)",
    )
    record: dict[str, Any] = Field(
        default_factory=dict,
        description="Original record from source",
    )

    @property
    def canonical_id(self) -> str:
        """
        Build the canonical ID for this location.

        Returns:
            ``source:source_id`` when a source_id is set; otherwise the
            location name stands in for the identifier (``source:name``).
        """
        # An empty or missing source_id falls back to the location name.
        identifier = self.source_id or self.name
        return f"{self.source}:{identifier}"

    def model_dump_for_db(self) -> dict[str, Any]:
        """
        Flatten the record into a dict suitable for database storage.

        Returns:
            A dict with keys name, source, source_id, qid, location_type,
            parent_ids, from_date, to_date and record. ``parent_ids`` is
            serialized to a JSON string; missing source_id / dates are
            emitted as empty strings; ``record`` is passed through unchanged.
            ``simplified_type`` is not part of the output.
        """
        import json

        stored_source_id = self.source_id if self.source_id else ""
        encoded_parents = json.dumps(self.parent_ids)
        start = self.from_date if self.from_date else ""
        end = self.to_date if self.to_date else ""
        return {
            "name": self.name,
            "source": self.source,
            "source_id": stored_source_id,
            "qid": self.qid,
            "location_type": self.location_type,
            "parent_ids": encoded_parents,
            "from_date": start,
            "to_date": end,
            "record": self.record,
        }
|
|
233
|
+
|
|
234
|
+
|
|
111
235
|
class PersonRecord(BaseModel):
|
|
112
236
|
"""
|
|
113
237
|
A person record for the embedding database.
|
|
@@ -0,0 +1,409 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Database schema v2 with normalized foreign key references.
|
|
3
|
+
|
|
4
|
+
This module contains DDL statements for the normalized entity database schema
|
|
5
|
+
that replaces TEXT-based enum storage with INTEGER FK references to lookup tables.
|
|
6
|
+
|
|
7
|
+
Changes from v1:
|
|
8
|
+
- Enum tables: source_types, people_types, organization_types, location_types
|
|
9
|
+
- New tables: roles, locations, simplified_location_types
|
|
10
|
+
- organizations (v2 layout): source_id FK, entity_type_id FK, region_id FK (to locations)
|
|
11
|
+
- people (v2 layout): source_id FK, person_type_id FK, country_id FK (to locations), known_for_role_id FK (to roles)
|
|
12
|
+
- qid_labels: qid stored as INTEGER (Q prefix stripped)
|
|
13
|
+
- Human-readable views with JOINs
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
# =============================================================================
|
|
17
|
+
# ENUM LOOKUP TABLES
|
|
18
|
+
# =============================================================================
|
|
19
|
+
|
|
20
|
+
CREATE_SOURCE_TYPES = """
|
|
21
|
+
CREATE TABLE IF NOT EXISTS source_types (
|
|
22
|
+
id INTEGER PRIMARY KEY,
|
|
23
|
+
name TEXT NOT NULL UNIQUE
|
|
24
|
+
);
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
CREATE_PEOPLE_TYPES = """
|
|
28
|
+
CREATE TABLE IF NOT EXISTS people_types (
|
|
29
|
+
id INTEGER PRIMARY KEY,
|
|
30
|
+
name TEXT NOT NULL UNIQUE
|
|
31
|
+
);
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
CREATE_ORGANIZATION_TYPES = """
|
|
35
|
+
CREATE TABLE IF NOT EXISTS organization_types (
|
|
36
|
+
id INTEGER PRIMARY KEY,
|
|
37
|
+
name TEXT NOT NULL UNIQUE
|
|
38
|
+
);
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
CREATE_SIMPLIFIED_LOCATION_TYPES = """
|
|
42
|
+
CREATE TABLE IF NOT EXISTS simplified_location_types (
|
|
43
|
+
id INTEGER PRIMARY KEY,
|
|
44
|
+
name TEXT NOT NULL UNIQUE
|
|
45
|
+
);
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
CREATE_LOCATION_TYPES = """
|
|
49
|
+
CREATE TABLE IF NOT EXISTS location_types (
|
|
50
|
+
id INTEGER PRIMARY KEY,
|
|
51
|
+
name TEXT NOT NULL UNIQUE,
|
|
52
|
+
qid INTEGER,
|
|
53
|
+
simplified_id INTEGER NOT NULL,
|
|
54
|
+
FOREIGN KEY (simplified_id) REFERENCES simplified_location_types(id)
|
|
55
|
+
);
|
|
56
|
+
"""
|
|
57
|
+
|
|
58
|
+
# =============================================================================
|
|
59
|
+
# ROLES TABLE
|
|
60
|
+
# =============================================================================
|
|
61
|
+
|
|
62
|
+
CREATE_ROLES = """
|
|
63
|
+
CREATE TABLE IF NOT EXISTS roles (
|
|
64
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
65
|
+
qid INTEGER,
|
|
66
|
+
name TEXT NOT NULL,
|
|
67
|
+
name_normalized TEXT NOT NULL,
|
|
68
|
+
source_id INTEGER NOT NULL,
|
|
69
|
+
source_identifier TEXT,
|
|
70
|
+
record TEXT NOT NULL DEFAULT '{}',
|
|
71
|
+
canon_id INTEGER DEFAULT NULL,
|
|
72
|
+
canon_size INTEGER DEFAULT 1,
|
|
73
|
+
FOREIGN KEY (source_id) REFERENCES source_types(id),
|
|
74
|
+
UNIQUE(name_normalized, source_id)
|
|
75
|
+
);
|
|
76
|
+
"""
|
|
77
|
+
|
|
78
|
+
CREATE_ROLES_INDEXES = """
|
|
79
|
+
CREATE INDEX IF NOT EXISTS idx_roles_name ON roles(name);
|
|
80
|
+
CREATE INDEX IF NOT EXISTS idx_roles_name_normalized ON roles(name_normalized);
|
|
81
|
+
CREATE INDEX IF NOT EXISTS idx_roles_qid ON roles(qid);
|
|
82
|
+
CREATE INDEX IF NOT EXISTS idx_roles_source_id ON roles(source_id);
|
|
83
|
+
CREATE INDEX IF NOT EXISTS idx_roles_canon_id ON roles(canon_id);
|
|
84
|
+
"""
|
|
85
|
+
|
|
86
|
+
# =============================================================================
|
|
87
|
+
# LOCATIONS TABLE
|
|
88
|
+
# =============================================================================
|
|
89
|
+
|
|
90
|
+
CREATE_LOCATIONS = """
|
|
91
|
+
CREATE TABLE IF NOT EXISTS locations (
|
|
92
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
93
|
+
qid INTEGER,
|
|
94
|
+
name TEXT NOT NULL,
|
|
95
|
+
name_normalized TEXT NOT NULL,
|
|
96
|
+
source_id INTEGER NOT NULL,
|
|
97
|
+
source_identifier TEXT,
|
|
98
|
+
parent_ids TEXT,
|
|
99
|
+
location_type_id INTEGER NOT NULL,
|
|
100
|
+
record TEXT NOT NULL DEFAULT '{}',
|
|
101
|
+
from_date TEXT DEFAULT NULL,
|
|
102
|
+
to_date TEXT DEFAULT NULL,
|
|
103
|
+
canon_id INTEGER DEFAULT NULL,
|
|
104
|
+
canon_size INTEGER DEFAULT 1,
|
|
105
|
+
FOREIGN KEY (source_id) REFERENCES source_types(id),
|
|
106
|
+
FOREIGN KEY (location_type_id) REFERENCES location_types(id),
|
|
107
|
+
UNIQUE(source_identifier, source_id)
|
|
108
|
+
);
|
|
109
|
+
"""
|
|
110
|
+
|
|
111
|
+
CREATE_LOCATIONS_INDEXES = """
|
|
112
|
+
CREATE INDEX IF NOT EXISTS idx_locations_name ON locations(name);
|
|
113
|
+
CREATE INDEX IF NOT EXISTS idx_locations_name_normalized ON locations(name_normalized);
|
|
114
|
+
CREATE INDEX IF NOT EXISTS idx_locations_qid ON locations(qid);
|
|
115
|
+
CREATE INDEX IF NOT EXISTS idx_locations_source_id ON locations(source_id);
|
|
116
|
+
CREATE INDEX IF NOT EXISTS idx_locations_location_type_id ON locations(location_type_id);
|
|
117
|
+
CREATE INDEX IF NOT EXISTS idx_locations_canon_id ON locations(canon_id);
|
|
118
|
+
"""
|
|
119
|
+
|
|
120
|
+
# =============================================================================
|
|
121
|
+
# ORGANIZATIONS V2 TABLE
|
|
122
|
+
# =============================================================================
|
|
123
|
+
|
|
124
|
+
CREATE_ORGANIZATIONS_V2 = """
|
|
125
|
+
CREATE TABLE IF NOT EXISTS organizations (
|
|
126
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
127
|
+
qid INTEGER,
|
|
128
|
+
name TEXT NOT NULL,
|
|
129
|
+
name_normalized TEXT NOT NULL,
|
|
130
|
+
source_id INTEGER NOT NULL,
|
|
131
|
+
source_identifier TEXT NOT NULL,
|
|
132
|
+
region_id INTEGER,
|
|
133
|
+
entity_type_id INTEGER NOT NULL DEFAULT 17,
|
|
134
|
+
from_date TEXT DEFAULT NULL,
|
|
135
|
+
to_date TEXT DEFAULT NULL,
|
|
136
|
+
record TEXT NOT NULL DEFAULT '{}',
|
|
137
|
+
canon_id INTEGER DEFAULT NULL,
|
|
138
|
+
canon_size INTEGER DEFAULT 1,
|
|
139
|
+
FOREIGN KEY (source_id) REFERENCES source_types(id),
|
|
140
|
+
FOREIGN KEY (region_id) REFERENCES locations(id),
|
|
141
|
+
FOREIGN KEY (entity_type_id) REFERENCES organization_types(id),
|
|
142
|
+
UNIQUE(source_identifier, source_id)
|
|
143
|
+
);
|
|
144
|
+
"""
|
|
145
|
+
|
|
146
|
+
CREATE_ORGANIZATIONS_V2_INDEXES = """
|
|
147
|
+
CREATE INDEX IF NOT EXISTS idx_orgs_name ON organizations(name);
|
|
148
|
+
CREATE INDEX IF NOT EXISTS idx_orgs_name_normalized ON organizations(name_normalized);
|
|
149
|
+
CREATE INDEX IF NOT EXISTS idx_orgs_qid ON organizations(qid);
|
|
150
|
+
CREATE INDEX IF NOT EXISTS idx_orgs_source_id ON organizations(source_id);
|
|
151
|
+
CREATE INDEX IF NOT EXISTS idx_orgs_source_identifier ON organizations(source_identifier);
|
|
152
|
+
CREATE INDEX IF NOT EXISTS idx_orgs_region_id ON organizations(region_id);
|
|
153
|
+
CREATE INDEX IF NOT EXISTS idx_orgs_entity_type_id ON organizations(entity_type_id);
|
|
154
|
+
CREATE INDEX IF NOT EXISTS idx_orgs_canon_id ON organizations(canon_id);
|
|
155
|
+
CREATE UNIQUE INDEX IF NOT EXISTS idx_orgs_name_region_source ON organizations(name, region_id, source_id);
|
|
156
|
+
"""
|
|
157
|
+
|
|
158
|
+
# =============================================================================
|
|
159
|
+
# PEOPLE V2 TABLE
|
|
160
|
+
# =============================================================================
|
|
161
|
+
|
|
162
|
+
CREATE_PEOPLE_V2 = """
|
|
163
|
+
CREATE TABLE IF NOT EXISTS people (
|
|
164
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
165
|
+
qid INTEGER,
|
|
166
|
+
name TEXT NOT NULL,
|
|
167
|
+
name_normalized TEXT NOT NULL,
|
|
168
|
+
source_id INTEGER NOT NULL,
|
|
169
|
+
source_identifier TEXT NOT NULL,
|
|
170
|
+
country_id INTEGER,
|
|
171
|
+
person_type_id INTEGER NOT NULL DEFAULT 15,
|
|
172
|
+
known_for_role_id INTEGER,
|
|
173
|
+
known_for_org TEXT NOT NULL DEFAULT '',
|
|
174
|
+
known_for_org_id INTEGER,
|
|
175
|
+
from_date TEXT DEFAULT NULL,
|
|
176
|
+
to_date TEXT DEFAULT NULL,
|
|
177
|
+
birth_date TEXT DEFAULT NULL,
|
|
178
|
+
death_date TEXT DEFAULT NULL,
|
|
179
|
+
record TEXT NOT NULL DEFAULT '{}',
|
|
180
|
+
canon_id INTEGER DEFAULT NULL,
|
|
181
|
+
canon_size INTEGER DEFAULT 1,
|
|
182
|
+
FOREIGN KEY (source_id) REFERENCES source_types(id),
|
|
183
|
+
FOREIGN KEY (country_id) REFERENCES locations(id),
|
|
184
|
+
FOREIGN KEY (person_type_id) REFERENCES people_types(id),
|
|
185
|
+
FOREIGN KEY (known_for_role_id) REFERENCES roles(id),
|
|
186
|
+
FOREIGN KEY (known_for_org_id) REFERENCES organizations(id),
|
|
187
|
+
UNIQUE(source_identifier, source_id, known_for_role_id, known_for_org_id)
|
|
188
|
+
);
|
|
189
|
+
"""
|
|
190
|
+
|
|
191
|
+
CREATE_PEOPLE_V2_INDEXES = """
|
|
192
|
+
CREATE INDEX IF NOT EXISTS idx_people_name ON people(name);
|
|
193
|
+
CREATE INDEX IF NOT EXISTS idx_people_name_normalized ON people(name_normalized);
|
|
194
|
+
CREATE INDEX IF NOT EXISTS idx_people_qid ON people(qid);
|
|
195
|
+
CREATE INDEX IF NOT EXISTS idx_people_source_id ON people(source_id);
|
|
196
|
+
CREATE INDEX IF NOT EXISTS idx_people_source_identifier ON people(source_identifier);
|
|
197
|
+
CREATE INDEX IF NOT EXISTS idx_people_country_id ON people(country_id);
|
|
198
|
+
CREATE INDEX IF NOT EXISTS idx_people_person_type_id ON people(person_type_id);
|
|
199
|
+
CREATE INDEX IF NOT EXISTS idx_people_known_for_role_id ON people(known_for_role_id);
|
|
200
|
+
CREATE INDEX IF NOT EXISTS idx_people_known_for_org_id ON people(known_for_org_id);
|
|
201
|
+
CREATE INDEX IF NOT EXISTS idx_people_canon_id ON people(canon_id);
|
|
202
|
+
"""
|
|
203
|
+
|
|
204
|
+
# =============================================================================
|
|
205
|
+
# QID LABELS TABLE (V2 - INTEGER QID)
|
|
206
|
+
# =============================================================================
|
|
207
|
+
|
|
208
|
+
CREATE_QID_LABELS_V2 = """
|
|
209
|
+
CREATE TABLE IF NOT EXISTS qid_labels (
|
|
210
|
+
qid INTEGER PRIMARY KEY,
|
|
211
|
+
label TEXT NOT NULL
|
|
212
|
+
);
|
|
213
|
+
"""
|
|
214
|
+
|
|
215
|
+
# =============================================================================
|
|
216
|
+
# EMBEDDING VIRTUAL TABLES
|
|
217
|
+
# =============================================================================
|
|
218
|
+
|
|
219
|
+
def _vec0_table_ddl(table: str, key_column: str, element_type: str, embedding_dim: int) -> str:
    """Render the CREATE VIRTUAL TABLE statement shared by every embedding table."""
    # NOTE(review): vec0 is presumably provided by the sqlite-vec extension,
    # which would need to be loaded on the connection that executes this DDL.
    return f"""
CREATE VIRTUAL TABLE IF NOT EXISTS {table} USING vec0(
    {key_column} INTEGER PRIMARY KEY,
    embedding {element_type}[{embedding_dim}]
);
"""


def get_create_organization_embeddings(embedding_dim: int = 768) -> str:
    """
    Get DDL for the organization embeddings virtual table.

    Args:
        embedding_dim: Number of elements in each embedding vector.

    Returns:
        A CREATE VIRTUAL TABLE statement for ``organization_embeddings``,
        keyed by ``org_id`` with a ``float[embedding_dim]`` column.
    """
    return _vec0_table_ddl("organization_embeddings", "org_id", "float", embedding_dim)


def get_create_person_embeddings(embedding_dim: int = 768) -> str:
    """
    Get DDL for the person embeddings virtual table.

    Args:
        embedding_dim: Number of elements in each embedding vector.

    Returns:
        A CREATE VIRTUAL TABLE statement for ``person_embeddings``,
        keyed by ``person_id`` with a ``float[embedding_dim]`` column.
    """
    return _vec0_table_ddl("person_embeddings", "person_id", "float", embedding_dim)


def get_create_organization_embeddings_scalar(embedding_dim: int = 768) -> str:
    """
    Get DDL for the organization scalar (int8) embeddings virtual table.

    Args:
        embedding_dim: Number of elements in each embedding vector.

    Returns:
        A CREATE VIRTUAL TABLE statement for
        ``organization_embeddings_scalar``, keyed by ``org_id`` with an
        ``int8[embedding_dim]`` column.
    """
    return _vec0_table_ddl("organization_embeddings_scalar", "org_id", "int8", embedding_dim)


def get_create_person_embeddings_scalar(embedding_dim: int = 768) -> str:
    """
    Get DDL for the person scalar (int8) embeddings virtual table.

    Args:
        embedding_dim: Number of elements in each embedding vector.

    Returns:
        A CREATE VIRTUAL TABLE statement for ``person_embeddings_scalar``,
        keyed by ``person_id`` with an ``int8[embedding_dim]`` column.
    """
    return _vec0_table_ddl("person_embeddings_scalar", "person_id", "int8", embedding_dim)
|
|
257
|
+
|
|
258
|
+
# =============================================================================
|
|
259
|
+
# HUMAN-READABLE VIEWS
|
|
260
|
+
# =============================================================================
|
|
261
|
+
|
|
262
|
+
CREATE_ORGANIZATIONS_VIEW = """
|
|
263
|
+
CREATE VIEW IF NOT EXISTS organizations_view AS
|
|
264
|
+
SELECT
|
|
265
|
+
o.id,
|
|
266
|
+
o.qid,
|
|
267
|
+
o.name,
|
|
268
|
+
o.name_normalized,
|
|
269
|
+
s.name as source,
|
|
270
|
+
o.source_identifier,
|
|
271
|
+
l.name as region,
|
|
272
|
+
slt.name as region_type,
|
|
273
|
+
ot.name as entity_type,
|
|
274
|
+
o.from_date,
|
|
275
|
+
o.to_date,
|
|
276
|
+
o.canon_id,
|
|
277
|
+
o.canon_size
|
|
278
|
+
FROM organizations o
|
|
279
|
+
JOIN source_types s ON o.source_id = s.id
|
|
280
|
+
LEFT JOIN locations l ON o.region_id = l.id
|
|
281
|
+
LEFT JOIN location_types lt ON l.location_type_id = lt.id
|
|
282
|
+
LEFT JOIN simplified_location_types slt ON lt.simplified_id = slt.id
|
|
283
|
+
JOIN organization_types ot ON o.entity_type_id = ot.id;
|
|
284
|
+
"""
|
|
285
|
+
|
|
286
|
+
CREATE_PEOPLE_VIEW = """
|
|
287
|
+
CREATE VIEW IF NOT EXISTS people_view AS
|
|
288
|
+
SELECT
|
|
289
|
+
p.id,
|
|
290
|
+
p.qid,
|
|
291
|
+
p.name,
|
|
292
|
+
p.name_normalized,
|
|
293
|
+
s.name as source,
|
|
294
|
+
p.source_identifier,
|
|
295
|
+
l.name as country,
|
|
296
|
+
pt.name as person_type,
|
|
297
|
+
r.name as known_for_role,
|
|
298
|
+
p.known_for_org,
|
|
299
|
+
p.known_for_org_id,
|
|
300
|
+
p.from_date,
|
|
301
|
+
p.to_date,
|
|
302
|
+
p.birth_date,
|
|
303
|
+
p.death_date,
|
|
304
|
+
p.canon_id,
|
|
305
|
+
p.canon_size
|
|
306
|
+
FROM people p
|
|
307
|
+
JOIN source_types s ON p.source_id = s.id
|
|
308
|
+
LEFT JOIN locations l ON p.country_id = l.id
|
|
309
|
+
JOIN people_types pt ON p.person_type_id = pt.id
|
|
310
|
+
LEFT JOIN roles r ON p.known_for_role_id = r.id;
|
|
311
|
+
"""
|
|
312
|
+
|
|
313
|
+
CREATE_ROLES_VIEW = """
|
|
314
|
+
CREATE VIEW IF NOT EXISTS roles_view AS
|
|
315
|
+
SELECT
|
|
316
|
+
r.id,
|
|
317
|
+
r.qid,
|
|
318
|
+
r.name,
|
|
319
|
+
r.name_normalized,
|
|
320
|
+
s.name as source,
|
|
321
|
+
r.source_identifier,
|
|
322
|
+
r.canon_id,
|
|
323
|
+
r.canon_size
|
|
324
|
+
FROM roles r
|
|
325
|
+
JOIN source_types s ON r.source_id = s.id;
|
|
326
|
+
"""
|
|
327
|
+
|
|
328
|
+
CREATE_LOCATIONS_VIEW = """
|
|
329
|
+
CREATE VIEW IF NOT EXISTS locations_view AS
|
|
330
|
+
SELECT
|
|
331
|
+
l.id,
|
|
332
|
+
l.qid,
|
|
333
|
+
l.name,
|
|
334
|
+
l.name_normalized,
|
|
335
|
+
s.name as source,
|
|
336
|
+
l.source_identifier,
|
|
337
|
+
l.parent_ids,
|
|
338
|
+
lt.name as location_type,
|
|
339
|
+
slt.name as simplified_type,
|
|
340
|
+
l.from_date,
|
|
341
|
+
l.to_date,
|
|
342
|
+
l.canon_id,
|
|
343
|
+
l.canon_size
|
|
344
|
+
FROM locations l
|
|
345
|
+
JOIN source_types s ON l.source_id = s.id
|
|
346
|
+
JOIN location_types lt ON l.location_type_id = lt.id
|
|
347
|
+
JOIN simplified_location_types slt ON lt.simplified_id = slt.id;
|
|
348
|
+
"""
|
|
349
|
+
|
|
350
|
+
# =============================================================================
|
|
351
|
+
# ALL DDL STATEMENTS IN ORDER
|
|
352
|
+
# =============================================================================
|
|
353
|
+
|
|
354
|
+
ALL_DDL_STATEMENTS = [
|
|
355
|
+
# Enum tables first (no dependencies)
|
|
356
|
+
CREATE_SOURCE_TYPES,
|
|
357
|
+
CREATE_PEOPLE_TYPES,
|
|
358
|
+
CREATE_ORGANIZATION_TYPES,
|
|
359
|
+
CREATE_SIMPLIFIED_LOCATION_TYPES,
|
|
360
|
+
CREATE_LOCATION_TYPES,
|
|
361
|
+
# New entity tables
|
|
362
|
+
CREATE_ROLES,
|
|
363
|
+
CREATE_ROLES_INDEXES,
|
|
364
|
+
CREATE_LOCATIONS,
|
|
365
|
+
CREATE_LOCATIONS_INDEXES,
|
|
366
|
+
# Main entity tables
|
|
367
|
+
CREATE_ORGANIZATIONS_V2,
|
|
368
|
+
CREATE_ORGANIZATIONS_V2_INDEXES,
|
|
369
|
+
CREATE_PEOPLE_V2,
|
|
370
|
+
CREATE_PEOPLE_V2_INDEXES,
|
|
371
|
+
# Reference tables
|
|
372
|
+
CREATE_QID_LABELS_V2,
|
|
373
|
+
]
|
|
374
|
+
|
|
375
|
+
VIEW_DDL_STATEMENTS = [
|
|
376
|
+
CREATE_ORGANIZATIONS_VIEW,
|
|
377
|
+
CREATE_PEOPLE_VIEW,
|
|
378
|
+
CREATE_ROLES_VIEW,
|
|
379
|
+
CREATE_LOCATIONS_VIEW,
|
|
380
|
+
]
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
def create_all_tables(conn, embedding_dim: int = 768) -> None:
    """
    Build the complete v2 schema on a connection.

    Runs, in order: every statement in ALL_DDL_STATEMENTS, the four
    embedding virtual tables, and every view in VIEW_DDL_STATEMENTS, then
    commits.

    Args:
        conn: SQLite connection
        embedding_dim: Dimension for embedding vectors
    """
    # An entry may bundle several statements (e.g. a run of CREATE INDEX),
    # so each one is split on ";" and the non-empty pieces run one at a time.
    pieces = (
        fragment.strip()
        for script in ALL_DDL_STATEMENTS
        for fragment in script.strip().split(";")
    )
    for sql in pieces:
        if sql:
            conn.execute(sql)

    # float32 embedding tables first, then the scalar (int8) variants, which
    # exist for 75% storage reduction.
    embedding_builders = (
        get_create_organization_embeddings,
        get_create_person_embeddings,
        get_create_organization_embeddings_scalar,
        get_create_person_embeddings_scalar,
    )
    for build_ddl in embedding_builders:
        conn.execute(build_ddl(embedding_dim))

    # Views go last; each entry is executed whole, without splitting.
    for view_sql in VIEW_DDL_STATEMENTS:
        conn.execute(view_sql)

    conn.commit()
|