corp-extractor 0.9.0__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +40 -9
  2. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/RECORD +29 -26
  3. statement_extractor/cli.py +866 -77
  4. statement_extractor/database/hub.py +35 -127
  5. statement_extractor/database/importers/__init__.py +10 -2
  6. statement_extractor/database/importers/companies_house.py +16 -2
  7. statement_extractor/database/importers/companies_house_officers.py +431 -0
  8. statement_extractor/database/importers/gleif.py +23 -0
  9. statement_extractor/database/importers/sec_edgar.py +17 -0
  10. statement_extractor/database/importers/sec_form4.py +512 -0
  11. statement_extractor/database/importers/wikidata.py +151 -43
  12. statement_extractor/database/importers/wikidata_dump.py +1951 -0
  13. statement_extractor/database/importers/wikidata_people.py +823 -325
  14. statement_extractor/database/models.py +30 -6
  15. statement_extractor/database/store.py +1485 -60
  16. statement_extractor/document/deduplicator.py +10 -12
  17. statement_extractor/extractor.py +1 -1
  18. statement_extractor/models/__init__.py +3 -2
  19. statement_extractor/models/statement.py +15 -17
  20. statement_extractor/models.py +1 -1
  21. statement_extractor/pipeline/context.py +5 -5
  22. statement_extractor/pipeline/orchestrator.py +12 -12
  23. statement_extractor/plugins/base.py +17 -17
  24. statement_extractor/plugins/extractors/gliner2.py +28 -28
  25. statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
  26. statement_extractor/plugins/qualifiers/person.py +11 -1
  27. statement_extractor/plugins/splitters/t5_gemma.py +35 -39
  28. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
  29. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
@@ -54,12 +54,17 @@ class PersonType(str, Enum):
54
54
  Used for categorizing people in the person database.
55
55
  """
56
56
  EXECUTIVE = "executive" # CEOs, board members, C-suite
57
- POLITICIAN = "politician" # Elected officials, diplomats
57
+ POLITICIAN = "politician" # Elected officials (presidents, MPs, mayors)
58
+ GOVERNMENT = "government" # Civil servants, diplomats, appointed officials
59
+ MILITARY = "military" # Military officers, armed forces personnel
60
+ LEGAL = "legal" # Judges, lawyers, legal professionals
61
+ PROFESSIONAL = "professional" # Known for their profession (doctors, engineers, architects)
58
62
  ACADEMIC = "academic" # Professors, researchers
59
- ARTIST = "artist" # Musicians, actors, directors, writers
63
+ ARTIST = "artist" # Traditional creatives (musicians, actors, painters, writers)
64
+ MEDIA = "media" # Internet/social media personalities (YouTubers, influencers, podcasters)
60
65
  ATHLETE = "athlete" # Sports figures
61
66
  ENTREPRENEUR = "entrepreneur" # Founders, business owners
62
- JOURNALIST = "journalist" # Reporters, media personalities
67
+ JOURNALIST = "journalist" # Reporters, news presenters, columnists
63
68
  ACTIVIST = "activist" # Advocates, campaigners
64
69
  SCIENTIST = "scientist" # Scientists, inventors
65
70
  UNKNOWN = "unknown" # Type not determined
@@ -77,6 +82,8 @@ class CompanyRecord(BaseModel):
77
82
  source_id: str = Field(..., description="Unique identifier from source (LEI, CIK, CH number)")
78
83
  region: str = Field(default="", description="Geographic region/country (e.g., 'UK', 'US', 'DE')")
79
84
  entity_type: EntityType = Field(default=EntityType.UNKNOWN, description="Organization type classification")
85
+ from_date: Optional[str] = Field(default=None, description="Start date (ISO format YYYY-MM-DD)")
86
+ to_date: Optional[str] = Field(default=None, description="End date (ISO format YYYY-MM-DD)")
80
87
  record: dict[str, Any] = Field(default_factory=dict, description="Original record from source")
81
88
 
82
89
  @property
@@ -92,11 +99,13 @@ class CompanyRecord(BaseModel):
92
99
  "source_id": self.source_id,
93
100
  "region": self.region,
94
101
  "entity_type": self.entity_type.value,
102
+ "from_date": self.from_date or "",
103
+ "to_date": self.to_date or "",
95
104
  "record": self.record,
96
105
  }
97
106
 
98
107
 
99
- PersonSourceType = Literal["wikidata"]
108
+ PersonSourceType = Literal["wikidata", "sec_edgar", "companies_house"]
100
109
 
101
110
 
102
111
  class PersonRecord(BaseModel):
@@ -111,8 +120,13 @@ class PersonRecord(BaseModel):
111
120
  source_id: str = Field(..., description="Unique identifier from source (Wikidata QID)")
112
121
  country: str = Field(default="", description="Country code or name (e.g., 'US', 'Germany')")
113
122
  person_type: PersonType = Field(default=PersonType.UNKNOWN, description="Person type classification")
114
- known_for_role: str = Field(default="", description="Primary role from Wikipedia (e.g., 'CEO', 'President')")
115
- known_for_org: str = Field(default="", description="Primary org from Wikipedia (e.g., 'Apple Inc', 'Tesla')")
123
+ known_for_role: str = Field(default="", description="Primary role (e.g., 'CEO', 'President')")
124
+ known_for_org: str = Field(default="", description="Primary org (e.g., 'Apple Inc', 'Tesla')")
125
+ known_for_org_id: Optional[int] = Field(default=None, description="Foreign key to organizations table")
126
+ from_date: Optional[str] = Field(default=None, description="Start date of role (ISO format YYYY-MM-DD)")
127
+ to_date: Optional[str] = Field(default=None, description="End date of role (ISO format YYYY-MM-DD)")
128
+ birth_date: Optional[str] = Field(default=None, description="Date of birth (ISO format YYYY-MM-DD)")
129
+ death_date: Optional[str] = Field(default=None, description="Date of death (ISO format YYYY-MM-DD) - if set, person is historic")
116
130
  record: dict[str, Any] = Field(default_factory=dict, description="Original record from source")
117
131
 
118
132
  @property
@@ -120,6 +134,11 @@ class PersonRecord(BaseModel):
120
134
  """Generate canonical ID in format source:source_id."""
121
135
  return f"{self.source}:{self.source_id}"
122
136
 
137
+ @property
138
+ def is_historic(self) -> bool:
139
+ """Return True if the person is deceased (has a death date)."""
140
+ return self.death_date is not None and self.death_date != ""
141
+
123
142
  def model_dump_for_db(self) -> dict[str, Any]:
124
143
  """Convert to dict suitable for database storage."""
125
144
  return {
@@ -130,6 +149,11 @@ class PersonRecord(BaseModel):
130
149
  "person_type": self.person_type.value,
131
150
  "known_for_role": self.known_for_role,
132
151
  "known_for_org": self.known_for_org,
152
+ "known_for_org_id": self.known_for_org_id, # Can be None
153
+ "from_date": self.from_date or "",
154
+ "to_date": self.to_date or "",
155
+ "birth_date": self.birth_date or "",
156
+ "death_date": self.death_date or "",
133
157
  "record": self.record,
134
158
  }
135
159