corp-extractor 0.9.0__py3-none-any.whl → 0.9.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +72 -11
  2. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +34 -27
  3. statement_extractor/cli.py +1317 -101
  4. statement_extractor/database/embeddings.py +45 -0
  5. statement_extractor/database/hub.py +86 -136
  6. statement_extractor/database/importers/__init__.py +10 -2
  7. statement_extractor/database/importers/companies_house.py +16 -2
  8. statement_extractor/database/importers/companies_house_officers.py +431 -0
  9. statement_extractor/database/importers/gleif.py +23 -0
  10. statement_extractor/database/importers/import_utils.py +264 -0
  11. statement_extractor/database/importers/sec_edgar.py +17 -0
  12. statement_extractor/database/importers/sec_form4.py +512 -0
  13. statement_extractor/database/importers/wikidata.py +151 -43
  14. statement_extractor/database/importers/wikidata_dump.py +2282 -0
  15. statement_extractor/database/importers/wikidata_people.py +867 -325
  16. statement_extractor/database/migrate_v2.py +852 -0
  17. statement_extractor/database/models.py +155 -7
  18. statement_extractor/database/schema_v2.py +409 -0
  19. statement_extractor/database/seed_data.py +359 -0
  20. statement_extractor/database/store.py +3449 -233
  21. statement_extractor/document/deduplicator.py +10 -12
  22. statement_extractor/extractor.py +1 -1
  23. statement_extractor/models/__init__.py +3 -2
  24. statement_extractor/models/statement.py +15 -17
  25. statement_extractor/models.py +1 -1
  26. statement_extractor/pipeline/context.py +5 -5
  27. statement_extractor/pipeline/orchestrator.py +12 -12
  28. statement_extractor/plugins/base.py +17 -17
  29. statement_extractor/plugins/extractors/gliner2.py +28 -28
  30. statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
  31. statement_extractor/plugins/qualifiers/person.py +120 -53
  32. statement_extractor/plugins/splitters/t5_gemma.py +35 -39
  33. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
  34. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
@@ -1,5 +1,11 @@
1
1
  """
2
2
  Pydantic models for organization/entity database records.
3
+
4
+ v2 Schema Changes:
5
+ - Added SimplifiedLocationType enum for location categorization
6
+ - Added SourceTypeEnum for normalized source references
7
+ - Added RoleRecord and LocationRecord models for new tables
8
+ - Models support both TEXT-based v1 and FK-based v2 schemas
3
9
  """
4
10
 
5
11
  from enum import Enum
@@ -8,7 +14,36 @@ from typing import Any, Literal, Optional
8
14
  from pydantic import BaseModel, Field
9
15
 
10
16
 
11
- SourceType = Literal["gleif", "sec_edgar", "companies_house", "wikipedia"]
17
+ # Legacy source types for backward compatibility with v1 schema
18
+ SourceType = Literal["gleif", "sec_edgar", "companies_house", "wikipedia", "wikidata"]
19
+
20
+
class SourceTypeEnum(str, Enum):
    """
    Data sources known to the v2 normalized schema.

    Each member's value is the source name as a plain string. Rows in other
    tables refer to a source through a foreign key into the ``source_types``
    table; the trailing comments record the integer id each member is paired
    with there.
    """

    # id=1: GLEIF LEI data
    GLEIF = "gleif"
    # id=2: SEC EDGAR filings
    SEC_EDGAR = "sec_edgar"
    # id=3: UK Companies House
    COMPANIES_HOUSE = "companies_house"
    # id=4: Wikidata/Wikipedia
    WIKIDATA = "wikidata"
31
+
32
+
class SimplifiedLocationType(str, Enum):
    """
    Coarse location categories used when filtering searches.

    Detailed Wikidata location types are mapped onto these canonical
    categories so that queries such as "find all cities" can be answered
    without knowing every detailed type. The trailing comments record the
    integer id each member is paired with.
    """

    # id=1: Continents (Q5107)
    CONTINENT = "continent"
    # id=2: Countries and sovereign states
    COUNTRY = "country"
    # id=3: States, provinces, regions, counties
    SUBDIVISION = "subdivision"
    # id=4: Cities, towns, municipalities, communes
    CITY = "city"
    # id=5: Districts, boroughs, neighborhoods
    DISTRICT = "district"
    # id=6: Former countries, historic territories
    HISTORIC = "historic"
    # id=7: Unclassified locations
    OTHER = "other"
12
47
 
13
48
 
14
49
  class EntityType(str, Enum):
@@ -54,12 +89,17 @@ class PersonType(str, Enum):
54
89
  Used for categorizing people in the person database.
55
90
  """
56
91
  EXECUTIVE = "executive" # CEOs, board members, C-suite
57
- POLITICIAN = "politician" # Elected officials, diplomats
92
+ POLITICIAN = "politician" # Elected officials (presidents, MPs, mayors)
93
+ GOVERNMENT = "government" # Civil servants, diplomats, appointed officials
94
+ MILITARY = "military" # Military officers, armed forces personnel
95
+ LEGAL = "legal" # Judges, lawyers, legal professionals
96
+ PROFESSIONAL = "professional" # Known for their profession (doctors, engineers, architects)
58
97
  ACADEMIC = "academic" # Professors, researchers
59
- ARTIST = "artist" # Musicians, actors, directors, writers
98
+ ARTIST = "artist" # Traditional creatives (musicians, actors, painters, writers)
99
+ MEDIA = "media" # Internet/social media personalities (YouTubers, influencers, podcasters)
60
100
  ATHLETE = "athlete" # Sports figures
61
101
  ENTREPRENEUR = "entrepreneur" # Founders, business owners
62
- JOURNALIST = "journalist" # Reporters, media personalities
102
+ JOURNALIST = "journalist" # Reporters, news presenters, columnists
63
103
  ACTIVIST = "activist" # Advocates, campaigners
64
104
  SCIENTIST = "scientist" # Scientists, inventors
65
105
  UNKNOWN = "unknown" # Type not determined
@@ -77,6 +117,8 @@ class CompanyRecord(BaseModel):
77
117
  source_id: str = Field(..., description="Unique identifier from source (LEI, CIK, CH number)")
78
118
  region: str = Field(default="", description="Geographic region/country (e.g., 'UK', 'US', 'DE')")
79
119
  entity_type: EntityType = Field(default=EntityType.UNKNOWN, description="Organization type classification")
120
+ from_date: Optional[str] = Field(default=None, description="Start date (ISO format YYYY-MM-DD)")
121
+ to_date: Optional[str] = Field(default=None, description="End date (ISO format YYYY-MM-DD)")
80
122
  record: dict[str, Any] = Field(default_factory=dict, description="Original record from source")
81
123
 
82
124
  @property
@@ -92,11 +134,102 @@ class CompanyRecord(BaseModel):
92
134
  "source_id": self.source_id,
93
135
  "region": self.region,
94
136
  "entity_type": self.entity_type.value,
137
+ "from_date": self.from_date or "",
138
+ "to_date": self.to_date or "",
95
139
  "record": self.record,
96
140
  }
97
141
 
98
142
 
99
- PersonSourceType = Literal["wikidata"]
143
+ # Person sources (same as org sources but without GLEIF)
144
+ PersonSourceType = Literal["wikidata", "sec_edgar", "companies_house"]
145
+
146
+
147
+ # =============================================================================
148
+ # ROLE RECORD MODEL (v2)
149
+ # =============================================================================
150
+
151
+
class RoleRecord(BaseModel):
    """
    One role / job-title entry destined for the roles table.

    Roles are stored as their own records so that job titles coming from
    different sources can be normalized, searched by role, and grouped when
    several titles are equivalent (e.g., CEO, Chief Executive).
    """

    name: str = Field(..., description="Role/title name (e.g., 'Chief Executive Officer')")
    source: SourceType = Field(default="wikidata", description="Data source")
    source_id: Optional[str] = Field(default=None, description="Source identifier (e.g., Q484876 for CEO)")
    qid: Optional[int] = Field(default=None, description="Wikidata QID as integer (e.g., 484876)")
    record: dict[str, Any] = Field(default_factory=dict, description="Original record from source")

    @property
    def canonical_id(self) -> str:
        """Return ``source:source_id``, using the role name when no source_id is set."""
        # An empty-string source_id is treated the same as a missing one.
        identifier = self.source_id if self.source_id else self.name
        return f"{self.source}:{identifier}"

    def model_dump_for_db(self) -> dict[str, Any]:
        """Return the fields as a plain dict shaped for database storage."""
        # A missing source_id is written as "" rather than None; qid is passed
        # through unchanged and may be None.
        return dict(
            name=self.name,
            source=self.source,
            source_id=self.source_id or "",
            qid=self.qid,
            record=self.record,
        )
181
+
182
+
183
+ # =============================================================================
184
+ # LOCATION RECORD MODEL (v2)
185
+ # =============================================================================
186
+
187
+
class LocationRecord(BaseModel):
    """
    One location / place entry destined for the locations table.

    Holds geopolitical entities (countries, states, cities) together with a
    type classification and the ids of their parent locations.
    """

    name: str = Field(..., description="Location name (e.g., 'United States', 'California')")
    source: SourceType = Field(default="wikidata", description="Data source")
    source_id: Optional[str] = Field(default=None, description="Source identifier (e.g., 'US', 'Q30')")
    qid: Optional[int] = Field(default=None, description="Wikidata QID as integer (e.g., 30 for USA)")
    location_type: str = Field(default="country", description="Detailed location type (e.g., 'us_state', 'city')")
    simplified_type: SimplifiedLocationType = Field(
        default=SimplifiedLocationType.COUNTRY,
        description="Simplified type for filtering",
    )
    parent_ids: list[int] = Field(
        default_factory=list,
        description="Parent location IDs in hierarchy (e.g., [country_id, state_id])",
    )
    from_date: Optional[str] = Field(default=None, description="Start date (ISO format YYYY-MM-DD)")
    to_date: Optional[str] = Field(default=None, description="End date (ISO format YYYY-MM-DD)")
    record: dict[str, Any] = Field(default_factory=dict, description="Original record from source")

    @property
    def canonical_id(self) -> str:
        """Return ``source:source_id``, using the location name when no source_id is set."""
        # An empty-string source_id is treated the same as a missing one.
        identifier = self.source_id if self.source_id else self.name
        return f"{self.source}:{identifier}"

    def model_dump_for_db(self) -> dict[str, Any]:
        """
        Return the fields as a plain dict shaped for database storage.

        ``parent_ids`` is serialized to a JSON string, and missing
        ``source_id`` / ``from_date`` / ``to_date`` values are written as "".
        ``simplified_type`` is not part of the returned dict.
        """
        import json

        encoded_parents = json.dumps(self.parent_ids)
        return dict(
            name=self.name,
            source=self.source,
            source_id=self.source_id or "",
            qid=self.qid,
            location_type=self.location_type,
            parent_ids=encoded_parents,
            from_date=self.from_date or "",
            to_date=self.to_date or "",
            record=self.record,
        )
100
233
 
101
234
 
102
235
  class PersonRecord(BaseModel):
@@ -111,8 +244,13 @@ class PersonRecord(BaseModel):
111
244
  source_id: str = Field(..., description="Unique identifier from source (Wikidata QID)")
112
245
  country: str = Field(default="", description="Country code or name (e.g., 'US', 'Germany')")
113
246
  person_type: PersonType = Field(default=PersonType.UNKNOWN, description="Person type classification")
114
- known_for_role: str = Field(default="", description="Primary role from Wikipedia (e.g., 'CEO', 'President')")
115
- known_for_org: str = Field(default="", description="Primary org from Wikipedia (e.g., 'Apple Inc', 'Tesla')")
247
+ known_for_role: str = Field(default="", description="Primary role (e.g., 'CEO', 'President')")
248
+ known_for_org: str = Field(default="", description="Primary org (e.g., 'Apple Inc', 'Tesla')")
249
+ known_for_org_id: Optional[int] = Field(default=None, description="Foreign key to organizations table")
250
+ from_date: Optional[str] = Field(default=None, description="Start date of role (ISO format YYYY-MM-DD)")
251
+ to_date: Optional[str] = Field(default=None, description="End date of role (ISO format YYYY-MM-DD)")
252
+ birth_date: Optional[str] = Field(default=None, description="Date of birth (ISO format YYYY-MM-DD)")
253
+ death_date: Optional[str] = Field(default=None, description="Date of death (ISO format YYYY-MM-DD) - if set, person is historic")
116
254
  record: dict[str, Any] = Field(default_factory=dict, description="Original record from source")
117
255
 
118
256
  @property
@@ -120,6 +258,11 @@ class PersonRecord(BaseModel):
120
258
  """Generate canonical ID in format source:source_id."""
121
259
  return f"{self.source}:{self.source_id}"
122
260
 
261
+ @property
262
+ def is_historic(self) -> bool:
263
+ """Return True if the person is deceased (has a death date)."""
264
+ return self.death_date is not None and self.death_date != ""
265
+
123
266
  def model_dump_for_db(self) -> dict[str, Any]:
124
267
  """Convert to dict suitable for database storage."""
125
268
  return {
@@ -130,6 +273,11 @@ class PersonRecord(BaseModel):
130
273
  "person_type": self.person_type.value,
131
274
  "known_for_role": self.known_for_role,
132
275
  "known_for_org": self.known_for_org,
276
+ "known_for_org_id": self.known_for_org_id, # Can be None
277
+ "from_date": self.from_date or "",
278
+ "to_date": self.to_date or "",
279
+ "birth_date": self.birth_date or "",
280
+ "death_date": self.death_date or "",
133
281
  "record": self.record,
134
282
  }
135
283
 
@@ -0,0 +1,409 @@
"""
Database schema v2 with normalized foreign key references.

This module contains DDL statements for the normalized entity database schema
that replaces TEXT-based enum storage with INTEGER FK references to lookup tables.

Changes from v1:
- Enum tables: source_types, people_types, organization_types, location_types
- New tables: roles, locations, simplified_location_types
- organizations (v2 layout): source_id FK, entity_type_id FK, region_id FK (to locations)
- people (v2 layout): source_id FK, person_type_id FK, country_id FK (to locations),
  known_for_role_id FK (to roles), known_for_org_id FK (to organizations)
- qid_labels: qid stored as INTEGER (Q prefix stripped)
- Human-readable views with JOINs

Note: the CREATE_*_V2 constants create tables named ``organizations`` and
``people``; the "V2" suffix appears only in the Python constant names.
"""
15
+
16
+ # =============================================================================
17
+ # ENUM LOOKUP TABLES
18
+ # =============================================================================
19
+
# Lookup table of data sources. The roles, locations, organizations and people
# tables reference it through their source_id column.
CREATE_SOURCE_TYPES = """
CREATE TABLE IF NOT EXISTS source_types (
    id INTEGER PRIMARY KEY,
    name TEXT NOT NULL UNIQUE
);
"""

# Lookup table of person categories; people.person_type_id references it.
CREATE_PEOPLE_TYPES = """
CREATE TABLE IF NOT EXISTS people_types (
    id INTEGER PRIMARY KEY,
    name TEXT NOT NULL UNIQUE
);
"""

# Lookup table of organization categories; organizations.entity_type_id
# references it.
CREATE_ORGANIZATION_TYPES = """
CREATE TABLE IF NOT EXISTS organization_types (
    id INTEGER PRIMARY KEY,
    name TEXT NOT NULL UNIQUE
);
"""

# Lookup table of coarse location categories; location_types.simplified_id
# references it.
CREATE_SIMPLIFIED_LOCATION_TYPES = """
CREATE TABLE IF NOT EXISTS simplified_location_types (
    id INTEGER PRIMARY KEY,
    name TEXT NOT NULL UNIQUE
);
"""

# Lookup table of detailed location types. Every row must point at exactly one
# simplified category (simplified_id is NOT NULL); qid is optional.
# NOTE(review): qid is presumably the Wikidata QID with the "Q" prefix
# stripped, as for qid_labels -- confirm against the seeding code.
CREATE_LOCATION_TYPES = """
CREATE TABLE IF NOT EXISTS location_types (
    id INTEGER PRIMARY KEY,
    name TEXT NOT NULL UNIQUE,
    qid INTEGER,
    simplified_id INTEGER NOT NULL,
    FOREIGN KEY (simplified_id) REFERENCES simplified_location_types(id)
);
"""
57
+
58
+ # =============================================================================
59
+ # ROLES TABLE
60
+ # =============================================================================
61
+
# Role / job-title rows. UNIQUE(name_normalized, source_id) permits at most one
# row per normalized name per source. record defaults to the text '{}'.
# NOTE(review): canon_id / canon_size look like canonicalization bookkeeping
# (pointer to a canonical row and group size) -- confirm against the code that
# writes them.
CREATE_ROLES = """
CREATE TABLE IF NOT EXISTS roles (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    qid INTEGER,
    name TEXT NOT NULL,
    name_normalized TEXT NOT NULL,
    source_id INTEGER NOT NULL,
    source_identifier TEXT,
    record TEXT NOT NULL DEFAULT '{}',
    canon_id INTEGER DEFAULT NULL,
    canon_size INTEGER DEFAULT 1,
    FOREIGN KEY (source_id) REFERENCES source_types(id),
    UNIQUE(name_normalized, source_id)
);
"""

# Secondary indexes for roles. The string holds several statements;
# create_all_tables splits it on ";" and executes them one at a time.
CREATE_ROLES_INDEXES = """
CREATE INDEX IF NOT EXISTS idx_roles_name ON roles(name);
CREATE INDEX IF NOT EXISTS idx_roles_name_normalized ON roles(name_normalized);
CREATE INDEX IF NOT EXISTS idx_roles_qid ON roles(qid);
CREATE INDEX IF NOT EXISTS idx_roles_source_id ON roles(source_id);
CREATE INDEX IF NOT EXISTS idx_roles_canon_id ON roles(canon_id);
"""
85
+
86
+ # =============================================================================
87
+ # LOCATIONS TABLE
88
+ # =============================================================================
89
+
# Location rows (countries, states, cities, ...). Each row must reference a
# source and a detailed location type; parent_ids is stored as TEXT.
# NOTE(review): parent_ids presumably holds a JSON-encoded list of location
# ids -- confirm against the code that writes this column.
# NOTE(review): source_identifier is nullable, and SQLite treats NULLs as
# distinct inside a UNIQUE constraint, so UNIQUE(source_identifier, source_id)
# does not de-duplicate rows whose source_identifier is NULL.
CREATE_LOCATIONS = """
CREATE TABLE IF NOT EXISTS locations (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    qid INTEGER,
    name TEXT NOT NULL,
    name_normalized TEXT NOT NULL,
    source_id INTEGER NOT NULL,
    source_identifier TEXT,
    parent_ids TEXT,
    location_type_id INTEGER NOT NULL,
    record TEXT NOT NULL DEFAULT '{}',
    from_date TEXT DEFAULT NULL,
    to_date TEXT DEFAULT NULL,
    canon_id INTEGER DEFAULT NULL,
    canon_size INTEGER DEFAULT 1,
    FOREIGN KEY (source_id) REFERENCES source_types(id),
    FOREIGN KEY (location_type_id) REFERENCES location_types(id),
    UNIQUE(source_identifier, source_id)
);
"""

# Secondary indexes for locations. The string holds several statements;
# create_all_tables splits it on ";" and executes them one at a time.
CREATE_LOCATIONS_INDEXES = """
CREATE INDEX IF NOT EXISTS idx_locations_name ON locations(name);
CREATE INDEX IF NOT EXISTS idx_locations_name_normalized ON locations(name_normalized);
CREATE INDEX IF NOT EXISTS idx_locations_qid ON locations(qid);
CREATE INDEX IF NOT EXISTS idx_locations_source_id ON locations(source_id);
CREATE INDEX IF NOT EXISTS idx_locations_location_type_id ON locations(location_type_id);
CREATE INDEX IF NOT EXISTS idx_locations_canon_id ON locations(canon_id);
"""
119
+
120
+ # =============================================================================
121
+ # ORGANIZATIONS V2 TABLE
122
+ # =============================================================================
123
+
# Organization rows in the v2 layout. Despite the constant's name, the table
# created is called "organizations". region_id optionally points at a
# locations row; (source_identifier, source_id) must be unique.
# NOTE(review): DEFAULT 17 for entity_type_id is a hard-coded lookup id,
# presumably the "unknown" row of organization_types -- confirm against the
# seed data, since the two must stay in sync.
CREATE_ORGANIZATIONS_V2 = """
CREATE TABLE IF NOT EXISTS organizations (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    qid INTEGER,
    name TEXT NOT NULL,
    name_normalized TEXT NOT NULL,
    source_id INTEGER NOT NULL,
    source_identifier TEXT NOT NULL,
    region_id INTEGER,
    entity_type_id INTEGER NOT NULL DEFAULT 17,
    from_date TEXT DEFAULT NULL,
    to_date TEXT DEFAULT NULL,
    record TEXT NOT NULL DEFAULT '{}',
    canon_id INTEGER DEFAULT NULL,
    canon_size INTEGER DEFAULT 1,
    FOREIGN KEY (source_id) REFERENCES source_types(id),
    FOREIGN KEY (region_id) REFERENCES locations(id),
    FOREIGN KEY (entity_type_id) REFERENCES organization_types(id),
    UNIQUE(source_identifier, source_id)
);
"""

# Secondary indexes for organizations. The string holds several statements;
# create_all_tables splits it on ";" and executes them one at a time.
# NOTE(review): the final UNIQUE index includes the nullable region_id; SQLite
# treats NULLs as distinct in unique indexes, so two rows with the same name
# and source but NULL region_id are both accepted.
CREATE_ORGANIZATIONS_V2_INDEXES = """
CREATE INDEX IF NOT EXISTS idx_orgs_name ON organizations(name);
CREATE INDEX IF NOT EXISTS idx_orgs_name_normalized ON organizations(name_normalized);
CREATE INDEX IF NOT EXISTS idx_orgs_qid ON organizations(qid);
CREATE INDEX IF NOT EXISTS idx_orgs_source_id ON organizations(source_id);
CREATE INDEX IF NOT EXISTS idx_orgs_source_identifier ON organizations(source_identifier);
CREATE INDEX IF NOT EXISTS idx_orgs_region_id ON organizations(region_id);
CREATE INDEX IF NOT EXISTS idx_orgs_entity_type_id ON organizations(entity_type_id);
CREATE INDEX IF NOT EXISTS idx_orgs_canon_id ON organizations(canon_id);
CREATE UNIQUE INDEX IF NOT EXISTS idx_orgs_name_region_source ON organizations(name, region_id, source_id);
"""
157
+
158
+ # =============================================================================
159
+ # PEOPLE V2 TABLE
160
+ # =============================================================================
161
+
# People rows in the v2 layout. Despite the constant's name, the table created
# is called "people". The uniqueness key includes the role and organization
# columns, so the same person from the same source can be stored once per
# (role, organization) combination. known_for_org is kept as free text
# alongside the optional known_for_org_id foreign key.
# NOTE(review): DEFAULT 15 for person_type_id is a hard-coded lookup id,
# presumably the "unknown" row of people_types -- confirm against the seed
# data, since the two must stay in sync.
# NOTE(review): known_for_role_id and known_for_org_id are nullable, and SQLite
# treats NULLs as distinct inside a UNIQUE constraint, so rows where either is
# NULL are never rejected as duplicates by this constraint.
CREATE_PEOPLE_V2 = """
CREATE TABLE IF NOT EXISTS people (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    qid INTEGER,
    name TEXT NOT NULL,
    name_normalized TEXT NOT NULL,
    source_id INTEGER NOT NULL,
    source_identifier TEXT NOT NULL,
    country_id INTEGER,
    person_type_id INTEGER NOT NULL DEFAULT 15,
    known_for_role_id INTEGER,
    known_for_org TEXT NOT NULL DEFAULT '',
    known_for_org_id INTEGER,
    from_date TEXT DEFAULT NULL,
    to_date TEXT DEFAULT NULL,
    birth_date TEXT DEFAULT NULL,
    death_date TEXT DEFAULT NULL,
    record TEXT NOT NULL DEFAULT '{}',
    canon_id INTEGER DEFAULT NULL,
    canon_size INTEGER DEFAULT 1,
    FOREIGN KEY (source_id) REFERENCES source_types(id),
    FOREIGN KEY (country_id) REFERENCES locations(id),
    FOREIGN KEY (person_type_id) REFERENCES people_types(id),
    FOREIGN KEY (known_for_role_id) REFERENCES roles(id),
    FOREIGN KEY (known_for_org_id) REFERENCES organizations(id),
    UNIQUE(source_identifier, source_id, known_for_role_id, known_for_org_id)
);
"""

# Secondary indexes for people. The string holds several statements;
# create_all_tables splits it on ";" and executes them one at a time.
CREATE_PEOPLE_V2_INDEXES = """
CREATE INDEX IF NOT EXISTS idx_people_name ON people(name);
CREATE INDEX IF NOT EXISTS idx_people_name_normalized ON people(name_normalized);
CREATE INDEX IF NOT EXISTS idx_people_qid ON people(qid);
CREATE INDEX IF NOT EXISTS idx_people_source_id ON people(source_id);
CREATE INDEX IF NOT EXISTS idx_people_source_identifier ON people(source_identifier);
CREATE INDEX IF NOT EXISTS idx_people_country_id ON people(country_id);
CREATE INDEX IF NOT EXISTS idx_people_person_type_id ON people(person_type_id);
CREATE INDEX IF NOT EXISTS idx_people_known_for_role_id ON people(known_for_role_id);
CREATE INDEX IF NOT EXISTS idx_people_known_for_org_id ON people(known_for_org_id);
CREATE INDEX IF NOT EXISTS idx_people_canon_id ON people(canon_id);
"""
203
+
204
+ # =============================================================================
205
+ # QID LABELS TABLE (V2 - INTEGER QID)
206
+ # =============================================================================
207
+
# Maps an integer qid to its label. The qid is the primary key, so each qid
# has exactly one label. Despite the constant's name, the table created is
# called "qid_labels".
CREATE_QID_LABELS_V2 = """
CREATE TABLE IF NOT EXISTS qid_labels (
    qid INTEGER PRIMARY KEY,
    label TEXT NOT NULL
);
"""
214
+
215
+ # =============================================================================
216
+ # EMBEDDING VIRTUAL TABLES
217
+ # =============================================================================
218
+
def _checked_embedding_dim(embedding_dim: int) -> int:
    """
    Return *embedding_dim* as a plain positive ``int``.

    The dimension is interpolated directly into DDL text, so it is validated
    first: anything that is not an integer (strings, floats, ...) raises
    ``TypeError`` and zero or negative values raise ``ValueError``.
    """
    import operator

    # operator.index accepts int and integer-like objects that implement
    # __index__, and rejects str/float.
    dim = operator.index(embedding_dim)
    if dim <= 0:
        raise ValueError(
            f"embedding_dim must be a positive integer, got {embedding_dim!r}"
        )
    return dim


def get_create_organization_embeddings(embedding_dim: int = 768) -> str:
    """
    Get DDL for the organization embeddings virtual table.

    Args:
        embedding_dim: Number of components in each float embedding vector.

    Returns:
        A ``CREATE VIRTUAL TABLE IF NOT EXISTS`` statement for
        ``organization_embeddings`` using the ``vec0`` module.

    Raises:
        TypeError: If ``embedding_dim`` is not an integer.
        ValueError: If ``embedding_dim`` is not positive.
    """
    dim = _checked_embedding_dim(embedding_dim)
    return f"""
CREATE VIRTUAL TABLE IF NOT EXISTS organization_embeddings USING vec0(
    org_id INTEGER PRIMARY KEY,
    embedding float[{dim}]
);
"""


def get_create_person_embeddings(embedding_dim: int = 768) -> str:
    """
    Get DDL for the person embeddings virtual table.

    Args:
        embedding_dim: Number of components in each float embedding vector.

    Returns:
        A ``CREATE VIRTUAL TABLE IF NOT EXISTS`` statement for
        ``person_embeddings`` using the ``vec0`` module.

    Raises:
        TypeError: If ``embedding_dim`` is not an integer.
        ValueError: If ``embedding_dim`` is not positive.
    """
    dim = _checked_embedding_dim(embedding_dim)
    return f"""
CREATE VIRTUAL TABLE IF NOT EXISTS person_embeddings USING vec0(
    person_id INTEGER PRIMARY KEY,
    embedding float[{dim}]
);
"""


def get_create_organization_embeddings_scalar(embedding_dim: int = 768) -> str:
    """
    Get DDL for the organization scalar (int8) embeddings virtual table.

    Args:
        embedding_dim: Number of components in each int8 embedding vector.

    Returns:
        A ``CREATE VIRTUAL TABLE IF NOT EXISTS`` statement for
        ``organization_embeddings_scalar`` using the ``vec0`` module.

    Raises:
        TypeError: If ``embedding_dim`` is not an integer.
        ValueError: If ``embedding_dim`` is not positive.
    """
    dim = _checked_embedding_dim(embedding_dim)
    return f"""
CREATE VIRTUAL TABLE IF NOT EXISTS organization_embeddings_scalar USING vec0(
    org_id INTEGER PRIMARY KEY,
    embedding int8[{dim}]
);
"""


def get_create_person_embeddings_scalar(embedding_dim: int = 768) -> str:
    """
    Get DDL for the person scalar (int8) embeddings virtual table.

    Args:
        embedding_dim: Number of components in each int8 embedding vector.

    Returns:
        A ``CREATE VIRTUAL TABLE IF NOT EXISTS`` statement for
        ``person_embeddings_scalar`` using the ``vec0`` module.

    Raises:
        TypeError: If ``embedding_dim`` is not an integer.
        ValueError: If ``embedding_dim`` is not positive.
    """
    dim = _checked_embedding_dim(embedding_dim)
    return f"""
CREATE VIRTUAL TABLE IF NOT EXISTS person_embeddings_scalar USING vec0(
    person_id INTEGER PRIMARY KEY,
    embedding int8[{dim}]
);
"""
257
+
258
+ # =============================================================================
259
+ # HUMAN-READABLE VIEWS
260
+ # =============================================================================
261
+
# Organizations with lookup ids resolved to names. source_types and
# organization_types are inner-joined, so an organization whose source_id or
# entity_type_id has no matching lookup row does not appear in the view. The
# location joins are LEFT JOINs, so region / region_type are NULL when no
# region is linked. The record column is not exposed.
CREATE_ORGANIZATIONS_VIEW = """
CREATE VIEW IF NOT EXISTS organizations_view AS
SELECT
    o.id,
    o.qid,
    o.name,
    o.name_normalized,
    s.name as source,
    o.source_identifier,
    l.name as region,
    slt.name as region_type,
    ot.name as entity_type,
    o.from_date,
    o.to_date,
    o.canon_id,
    o.canon_size
FROM organizations o
JOIN source_types s ON o.source_id = s.id
LEFT JOIN locations l ON o.region_id = l.id
LEFT JOIN location_types lt ON l.location_type_id = lt.id
LEFT JOIN simplified_location_types slt ON lt.simplified_id = slt.id
JOIN organization_types ot ON o.entity_type_id = ot.id;
"""

# People with lookup ids resolved to names. source_types and people_types are
# inner-joined; country and role are LEFT JOINs and therefore NULL when not
# linked. known_for_org_id is exposed as the raw id (no join to
# organizations). The record column is not exposed.
CREATE_PEOPLE_VIEW = """
CREATE VIEW IF NOT EXISTS people_view AS
SELECT
    p.id,
    p.qid,
    p.name,
    p.name_normalized,
    s.name as source,
    p.source_identifier,
    l.name as country,
    pt.name as person_type,
    r.name as known_for_role,
    p.known_for_org,
    p.known_for_org_id,
    p.from_date,
    p.to_date,
    p.birth_date,
    p.death_date,
    p.canon_id,
    p.canon_size
FROM people p
JOIN source_types s ON p.source_id = s.id
LEFT JOIN locations l ON p.country_id = l.id
JOIN people_types pt ON p.person_type_id = pt.id
LEFT JOIN roles r ON p.known_for_role_id = r.id;
"""

# Roles with the source id resolved to the source name. The record column is
# not exposed.
CREATE_ROLES_VIEW = """
CREATE VIEW IF NOT EXISTS roles_view AS
SELECT
    r.id,
    r.qid,
    r.name,
    r.name_normalized,
    s.name as source,
    r.source_identifier,
    r.canon_id,
    r.canon_size
FROM roles r
JOIN source_types s ON r.source_id = s.id;
"""

# Locations with source, detailed type and simplified type resolved to names.
# All three joins are inner joins. parent_ids is passed through as stored; the
# record column is not exposed.
CREATE_LOCATIONS_VIEW = """
CREATE VIEW IF NOT EXISTS locations_view AS
SELECT
    l.id,
    l.qid,
    l.name,
    l.name_normalized,
    s.name as source,
    l.source_identifier,
    l.parent_ids,
    lt.name as location_type,
    slt.name as simplified_type,
    l.from_date,
    l.to_date,
    l.canon_id,
    l.canon_size
FROM locations l
JOIN source_types s ON l.source_id = s.id
JOIN location_types lt ON l.location_type_id = lt.id
JOIN simplified_location_types slt ON lt.simplified_id = slt.id;
"""
349
+
350
+ # =============================================================================
351
+ # ALL DDL STATEMENTS IN ORDER
352
+ # =============================================================================
353
+
# Table and index DDL, listed so that every table appears after the tables its
# foreign keys reference. Entries may contain several ";"-separated
# statements. The embedding virtual tables are not listed here; they are
# created separately because their DDL depends on the embedding dimension.
ALL_DDL_STATEMENTS = [
    # Enum tables first (no dependencies)
    CREATE_SOURCE_TYPES,
    CREATE_PEOPLE_TYPES,
    CREATE_ORGANIZATION_TYPES,
    CREATE_SIMPLIFIED_LOCATION_TYPES,
    CREATE_LOCATION_TYPES,
    # New entity tables
    CREATE_ROLES,
    CREATE_ROLES_INDEXES,
    CREATE_LOCATIONS,
    CREATE_LOCATIONS_INDEXES,
    # Main entity tables
    CREATE_ORGANIZATIONS_V2,
    CREATE_ORGANIZATIONS_V2_INDEXES,
    CREATE_PEOPLE_V2,
    CREATE_PEOPLE_V2_INDEXES,
    # Reference tables
    CREATE_QID_LABELS_V2,
]

# View DDL; each entry is a single CREATE VIEW statement and is executed
# as-is (without splitting) by create_all_tables.
VIEW_DDL_STATEMENTS = [
    CREATE_ORGANIZATIONS_VIEW,
    CREATE_PEOPLE_VIEW,
    CREATE_ROLES_VIEW,
    CREATE_LOCATIONS_VIEW,
]
381
+
382
+
def create_all_tables(conn, embedding_dim: int = 768) -> None:
    """
    Create every v2 schema object on the given connection, then commit.

    Objects are created in this order: the tables and indexes listed in
    ``ALL_DDL_STATEMENTS``, the float embedding virtual tables, the int8
    ("scalar") embedding virtual tables, and finally the views listed in
    ``VIEW_DDL_STATEMENTS``.

    Args:
        conn: SQLite connection
        embedding_dim: Dimension for embedding vectors
    """
    # Entries may bundle several statements, so each one is split on ";" and
    # the non-empty pieces are executed one at a time.
    for script in ALL_DDL_STATEMENTS:
        pieces = (fragment.strip() for fragment in script.strip().split(";"))
        for sql in filter(None, pieces):
            conn.execute(sql)

    # Float32 tables first, then the int8 variants (which the original code
    # comment describes as giving a 75% storage reduction).
    embedding_ddl_builders = (
        get_create_organization_embeddings,
        get_create_person_embeddings,
        get_create_organization_embeddings_scalar,
        get_create_person_embeddings_scalar,
    )
    for build_ddl in embedding_ddl_builders:
        conn.execute(build_ddl(embedding_dim))

    # Views are single statements and are executed without splitting.
    for view_sql in VIEW_DDL_STATEMENTS:
        conn.execute(view_sql)

    conn.commit()