corp-extractor 0.9.0__py3-none-any.whl → 0.9.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +40 -9
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/RECORD +29 -26
- statement_extractor/cli.py +866 -77
- statement_extractor/database/hub.py +35 -127
- statement_extractor/database/importers/__init__.py +10 -2
- statement_extractor/database/importers/companies_house.py +16 -2
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +23 -0
- statement_extractor/database/importers/sec_edgar.py +17 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +151 -43
- statement_extractor/database/importers/wikidata_dump.py +1951 -0
- statement_extractor/database/importers/wikidata_people.py +823 -325
- statement_extractor/database/models.py +30 -6
- statement_extractor/database/store.py +1485 -60
- statement_extractor/document/deduplicator.py +10 -12
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +3 -2
- statement_extractor/models/statement.py +15 -17
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +12 -12
- statement_extractor/plugins/base.py +17 -17
- statement_extractor/plugins/extractors/gliner2.py +28 -28
- statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
- statement_extractor/plugins/qualifiers/person.py +11 -1
- statement_extractor/plugins/splitters/t5_gemma.py +35 -39
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
|
@@ -54,12 +54,17 @@ class PersonType(str, Enum):
|
|
|
54
54
|
Used for categorizing people in the person database.
|
|
55
55
|
"""
|
|
56
56
|
EXECUTIVE = "executive" # CEOs, board members, C-suite
|
|
57
|
-
POLITICIAN = "politician" # Elected officials,
|
|
57
|
+
POLITICIAN = "politician" # Elected officials (presidents, MPs, mayors)
|
|
58
|
+
GOVERNMENT = "government" # Civil servants, diplomats, appointed officials
|
|
59
|
+
MILITARY = "military" # Military officers, armed forces personnel
|
|
60
|
+
LEGAL = "legal" # Judges, lawyers, legal professionals
|
|
61
|
+
PROFESSIONAL = "professional" # Known for their profession (doctors, engineers, architects)
|
|
58
62
|
ACADEMIC = "academic" # Professors, researchers
|
|
59
|
-
ARTIST = "artist" #
|
|
63
|
+
ARTIST = "artist" # Traditional creatives (musicians, actors, painters, writers)
|
|
64
|
+
MEDIA = "media" # Internet/social media personalities (YouTubers, influencers, podcasters)
|
|
60
65
|
ATHLETE = "athlete" # Sports figures
|
|
61
66
|
ENTREPRENEUR = "entrepreneur" # Founders, business owners
|
|
62
|
-
JOURNALIST = "journalist" # Reporters,
|
|
67
|
+
JOURNALIST = "journalist" # Reporters, news presenters, columnists
|
|
63
68
|
ACTIVIST = "activist" # Advocates, campaigners
|
|
64
69
|
SCIENTIST = "scientist" # Scientists, inventors
|
|
65
70
|
UNKNOWN = "unknown" # Type not determined
|
|
@@ -77,6 +82,8 @@ class CompanyRecord(BaseModel):
|
|
|
77
82
|
source_id: str = Field(..., description="Unique identifier from source (LEI, CIK, CH number)")
|
|
78
83
|
region: str = Field(default="", description="Geographic region/country (e.g., 'UK', 'US', 'DE')")
|
|
79
84
|
entity_type: EntityType = Field(default=EntityType.UNKNOWN, description="Organization type classification")
|
|
85
|
+
from_date: Optional[str] = Field(default=None, description="Start date (ISO format YYYY-MM-DD)")
|
|
86
|
+
to_date: Optional[str] = Field(default=None, description="End date (ISO format YYYY-MM-DD)")
|
|
80
87
|
record: dict[str, Any] = Field(default_factory=dict, description="Original record from source")
|
|
81
88
|
|
|
82
89
|
@property
|
|
@@ -92,11 +99,13 @@ class CompanyRecord(BaseModel):
|
|
|
92
99
|
"source_id": self.source_id,
|
|
93
100
|
"region": self.region,
|
|
94
101
|
"entity_type": self.entity_type.value,
|
|
102
|
+
"from_date": self.from_date or "",
|
|
103
|
+
"to_date": self.to_date or "",
|
|
95
104
|
"record": self.record,
|
|
96
105
|
}
|
|
97
106
|
|
|
98
107
|
|
|
99
|
-
PersonSourceType = Literal["wikidata"]
|
|
108
|
+
PersonSourceType = Literal["wikidata", "sec_edgar", "companies_house"]
|
|
100
109
|
|
|
101
110
|
|
|
102
111
|
class PersonRecord(BaseModel):
|
|
@@ -111,8 +120,13 @@ class PersonRecord(BaseModel):
|
|
|
111
120
|
source_id: str = Field(..., description="Unique identifier from source (Wikidata QID)")
|
|
112
121
|
country: str = Field(default="", description="Country code or name (e.g., 'US', 'Germany')")
|
|
113
122
|
person_type: PersonType = Field(default=PersonType.UNKNOWN, description="Person type classification")
|
|
114
|
-
known_for_role: str = Field(default="", description="Primary role
|
|
115
|
-
known_for_org: str = Field(default="", description="Primary org
|
|
123
|
+
known_for_role: str = Field(default="", description="Primary role (e.g., 'CEO', 'President')")
|
|
124
|
+
known_for_org: str = Field(default="", description="Primary org (e.g., 'Apple Inc', 'Tesla')")
|
|
125
|
+
known_for_org_id: Optional[int] = Field(default=None, description="Foreign key to organizations table")
|
|
126
|
+
from_date: Optional[str] = Field(default=None, description="Start date of role (ISO format YYYY-MM-DD)")
|
|
127
|
+
to_date: Optional[str] = Field(default=None, description="End date of role (ISO format YYYY-MM-DD)")
|
|
128
|
+
birth_date: Optional[str] = Field(default=None, description="Date of birth (ISO format YYYY-MM-DD)")
|
|
129
|
+
death_date: Optional[str] = Field(default=None, description="Date of death (ISO format YYYY-MM-DD) - if set, person is historic")
|
|
116
130
|
record: dict[str, Any] = Field(default_factory=dict, description="Original record from source")
|
|
117
131
|
|
|
118
132
|
@property
|
|
@@ -120,6 +134,11 @@ class PersonRecord(BaseModel):
|
|
|
120
134
|
"""Generate canonical ID in format source:source_id."""
|
|
121
135
|
return f"{self.source}:{self.source_id}"
|
|
122
136
|
|
|
137
|
+
@property
|
|
138
|
+
def is_historic(self) -> bool:
|
|
139
|
+
"""Return True if the person is deceased (has a death date)."""
|
|
140
|
+
return self.death_date is not None and self.death_date != ""
|
|
141
|
+
|
|
123
142
|
def model_dump_for_db(self) -> dict[str, Any]:
|
|
124
143
|
"""Convert to dict suitable for database storage."""
|
|
125
144
|
return {
|
|
@@ -130,6 +149,11 @@ class PersonRecord(BaseModel):
|
|
|
130
149
|
"person_type": self.person_type.value,
|
|
131
150
|
"known_for_role": self.known_for_role,
|
|
132
151
|
"known_for_org": self.known_for_org,
|
|
152
|
+
"known_for_org_id": self.known_for_org_id, # Can be None
|
|
153
|
+
"from_date": self.from_date or "",
|
|
154
|
+
"to_date": self.to_date or "",
|
|
155
|
+
"birth_date": self.birth_date or "",
|
|
156
|
+
"death_date": self.death_date or "",
|
|
133
157
|
"record": self.record,
|
|
134
158
|
}
|
|
135
159
|
|