corp-extractor 0.5.0__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
  2. corp_extractor-0.9.3.dist-info/RECORD +79 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +2030 -24
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +428 -0
  9. statement_extractor/database/importers/__init__.py +32 -0
  10. statement_extractor/database/importers/companies_house.py +559 -0
  11. statement_extractor/database/importers/companies_house_officers.py +431 -0
  12. statement_extractor/database/importers/gleif.py +561 -0
  13. statement_extractor/database/importers/sec_edgar.py +392 -0
  14. statement_extractor/database/importers/sec_form4.py +512 -0
  15. statement_extractor/database/importers/wikidata.py +1120 -0
  16. statement_extractor/database/importers/wikidata_dump.py +1951 -0
  17. statement_extractor/database/importers/wikidata_people.py +1130 -0
  18. statement_extractor/database/models.py +254 -0
  19. statement_extractor/database/resolver.py +245 -0
  20. statement_extractor/database/store.py +3034 -0
  21. statement_extractor/document/__init__.py +62 -0
  22. statement_extractor/document/chunker.py +410 -0
  23. statement_extractor/document/context.py +171 -0
  24. statement_extractor/document/deduplicator.py +171 -0
  25. statement_extractor/document/html_extractor.py +246 -0
  26. statement_extractor/document/loader.py +303 -0
  27. statement_extractor/document/pipeline.py +388 -0
  28. statement_extractor/document/summarizer.py +195 -0
  29. statement_extractor/extractor.py +1 -1
  30. statement_extractor/models/__init__.py +19 -3
  31. statement_extractor/models/canonical.py +44 -1
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/labels.py +47 -18
  34. statement_extractor/models/qualifiers.py +51 -3
  35. statement_extractor/models/statement.py +39 -15
  36. statement_extractor/models.py +1 -1
  37. statement_extractor/pipeline/config.py +6 -11
  38. statement_extractor/pipeline/context.py +5 -5
  39. statement_extractor/pipeline/orchestrator.py +90 -121
  40. statement_extractor/pipeline/registry.py +52 -46
  41. statement_extractor/plugins/__init__.py +20 -8
  42. statement_extractor/plugins/base.py +348 -78
  43. statement_extractor/plugins/extractors/gliner2.py +38 -28
  44. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  45. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  46. statement_extractor/plugins/pdf/__init__.py +10 -0
  47. statement_extractor/plugins/pdf/pypdf.py +291 -0
  48. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  49. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  50. statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
  51. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  52. statement_extractor/plugins/qualifiers/person.py +588 -14
  53. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  54. statement_extractor/plugins/scrapers/__init__.py +10 -0
  55. statement_extractor/plugins/scrapers/http.py +236 -0
  56. statement_extractor/plugins/splitters/t5_gemma.py +176 -75
  57. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  58. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  59. statement_extractor/scoring.py +8 -8
  60. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  61. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  62. statement_extractor/plugins/canonicalizers/base.py +0 -9
  63. statement_extractor/plugins/canonicalizers/location.py +0 -219
  64. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  65. statement_extractor/plugins/canonicalizers/person.py +0 -242
  66. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
  67. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,308 @@
+"""
+Document models for document-level processing.
+
+Document: A document with metadata, pages, and optional summary
+DocumentMetadata: Metadata about the document source
+DocumentPage: A single page within a document
+TextChunk: A chunk of text for processing with page tracking
+ChunkingConfig: Configuration for text chunking
+"""
+
+import uuid
+from typing import Any, Optional
+
+from pydantic import BaseModel, Field
+
+
+class DocumentMetadata(BaseModel):
+    """
+    Metadata about a document source.
+
+    Contains information about where the document came from and
+    who authored it, useful for generating citations.
+    """
+    url: Optional[str] = Field(None, description="URL source of the document")
+    title: Optional[str] = Field(None, description="Document title")
+    year: Optional[int] = Field(None, description="Publication year")
+    authors: list[str] = Field(default_factory=list, description="List of authors")
+    source_type: Optional[str] = Field(
+        None,
+        description="Type of source: 'pdf', 'webpage', 'text', etc."
+    )
+    custom: dict[str, Any] = Field(
+        default_factory=dict,
+        description="Custom metadata fields"
+    )
+
+    def format_citation(self, page_number: Optional[int] = None) -> str:
+        """
+        Format a citation string for this document.
+
+        Args:
+            page_number: Optional page number to include
+
+        Returns:
+            Citation string like "Title - Author, 2024, p. 5"
+        """
+        parts = []
+
+        if self.title:
+            parts.append(self.title)
+
+        if self.authors:
+            if len(self.authors) == 1:
+                parts.append(self.authors[0])
+            elif len(self.authors) == 2:
+                parts.append(f"{self.authors[0]} & {self.authors[1]}")
+            else:
+                parts.append(f"{self.authors[0]} et al.")
+
+        if self.year:
+            parts.append(str(self.year))
+
+        if page_number is not None:
+            parts.append(f"p. {page_number}")
+
+        return " - ".join(parts) if parts else ""
+
+
+class DocumentPage(BaseModel):
+    """
+    A single page within a document.
+
+    Tracks the page number and character offset for citation purposes.
+    """
+    page_number: int = Field(..., description="1-indexed page number")
+    text: str = Field(..., description="Text content of the page")
+    char_offset: int = Field(
+        ...,
+        description="Character offset of this page in the full document text"
+    )
+
+    @property
+    def char_end(self) -> int:
+        """Get the ending character offset of this page."""
+        return self.char_offset + len(self.text)
+
+
+class TextChunk(BaseModel):
+    """
+    A chunk of text for processing.
+
+    Contains the text along with position tracking for mapping
+    extracted statements back to their source pages.
+    """
+    chunk_index: int = Field(..., description="0-indexed chunk number")
+    text: str = Field(..., description="Chunk text content")
+    start_char: int = Field(..., description="Starting character offset in full document")
+    end_char: int = Field(..., description="Ending character offset in full document")
+    page_numbers: list[int] = Field(
+        default_factory=list,
+        description="Page numbers this chunk spans (1-indexed)"
+    )
+    token_count: int = Field(..., description="Number of tokens in this chunk")
+    overlap_chars: int = Field(
+        default=0,
+        description="Number of characters of overlap from previous chunk"
+    )
+    document_id: str = Field(..., description="ID of the source document")
+
+    @property
+    def primary_page(self) -> Optional[int]:
+        """Get the primary page number for this chunk (first page)."""
+        return self.page_numbers[0] if self.page_numbers else None
+
+
+class ChunkingConfig(BaseModel):
+    """
+    Configuration for document chunking.
+
+    Controls how documents are split into chunks for processing.
+    """
+    max_tokens: int = Field(
+        default=2000,
+        ge=100,
+        description="Maximum tokens per chunk (hard limit)"
+    )
+    target_tokens: int = Field(
+        default=1000,
+        ge=50,
+        description="Target tokens per chunk (soft limit, prefers to split here)"
+    )
+    overlap_tokens: int = Field(
+        default=100,
+        ge=0,
+        description="Tokens of overlap between consecutive chunks"
+    )
+    respect_page_boundaries: bool = Field(
+        default=True,
+        description="Try to split at page boundaries when possible"
+    )
+    respect_sentence_boundaries: bool = Field(
+        default=True,
+        description="Try to split at sentence boundaries when possible"
+    )
+
+
+class Document(BaseModel):
+    """
+    A document for processing through the extraction pipeline.
+
+    Contains the full text, optional page structure, metadata for citations,
+    and an optional summary for context.
+    """
+    document_id: str = Field(
+        default_factory=lambda: str(uuid.uuid4()),
+        description="Unique identifier for this document"
+    )
+    metadata: DocumentMetadata = Field(
+        default_factory=DocumentMetadata,
+        description="Document metadata for citations"
+    )
+    pages: list[DocumentPage] = Field(
+        default_factory=list,
+        description="List of pages (optional, for PDFs)"
+    )
+    full_text: str = Field(
+        default="",
+        description="Full text content of the document"
+    )
+    summary: Optional[str] = Field(
+        None,
+        description="Generated summary of the document"
+    )
+
+    @classmethod
+    def from_text(
+        cls,
+        text: str,
+        title: Optional[str] = None,
+        url: Optional[str] = None,
+        **metadata_kwargs,
+    ) -> "Document":
+        """
+        Create a document from plain text.
+
+        Args:
+            text: The document text
+            title: Optional document title
+            url: Optional source URL
+            **metadata_kwargs: Additional metadata fields
+
+        Returns:
+            Document instance
+        """
+        metadata = DocumentMetadata(
+            title=title,
+            url=url,
+            source_type="text",
+            **metadata_kwargs,
+        )
+        return cls(
+            metadata=metadata,
+            full_text=text,
+        )
+
+    @classmethod
+    def from_pages(
+        cls,
+        pages: list[str],
+        title: Optional[str] = None,
+        source_type: str = "pdf",
+        **metadata_kwargs,
+    ) -> "Document":
+        """
+        Create a document from a list of page texts.
+
+        Args:
+            pages: List of page text strings (0-indexed input, stored as 1-indexed)
+            title: Optional document title
+            source_type: Source type (default: "pdf")
+            **metadata_kwargs: Additional metadata fields
+
+        Returns:
+            Document instance
+        """
+        metadata = DocumentMetadata(
+            title=title,
+            source_type=source_type,
+            **metadata_kwargs,
+        )
+
+        # Build pages with character offsets
+        doc_pages = []
+        char_offset = 0
+
+        for i, page_text in enumerate(pages):
+            doc_pages.append(DocumentPage(
+                page_number=i + 1,  # 1-indexed
+                text=page_text,
+                char_offset=char_offset,
+            ))
+            char_offset += len(page_text)
+            if i < len(pages) - 1:
+                char_offset += 1  # Account for newline between pages
+
+        # Join pages with newlines for full text
+        full_text = "\n".join(pages)
+
+        return cls(
+            metadata=metadata,
+            pages=doc_pages,
+            full_text=full_text,
+        )
+
+    def get_page_at_char(self, char_offset: int) -> Optional[int]:
+        """
+        Get the page number containing a character offset.
+
+        Args:
+            char_offset: Character offset in full_text
+
+        Returns:
+            1-indexed page number, or None if no pages defined
+        """
+        if not self.pages:
+            return None
+
+        for page in self.pages:
+            if page.char_offset <= char_offset < page.char_end:
+                return page.page_number
+
+        # If past the last page, return last page
+        if char_offset >= self.pages[-1].char_end:
+            return self.pages[-1].page_number
+
+        return None
+
+    def get_pages_in_range(self, start_char: int, end_char: int) -> list[int]:
+        """
+        Get all page numbers that overlap with a character range.
+
+        Args:
+            start_char: Start character offset
+            end_char: End character offset
+
+        Returns:
+            List of 1-indexed page numbers
+        """
+        if not self.pages:
+            return []
+
+        page_numbers = []
+        for page in self.pages:
+            # Check if page overlaps with range
+            if page.char_offset < end_char and page.char_end > start_char:
+                page_numbers.append(page.page_number)
+
+        return page_numbers
+
+    @property
+    def page_count(self) -> int:
+        """Get the number of pages in the document."""
+        return len(self.pages)
+
+    @property
+    def char_count(self) -> int:
+        """Get the total character count."""
+        return len(self.full_text)
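
A minimal usage sketch of these new document models (evidently the new statement_extractor/models/document.py, given the +308 count in the file list). The import path and the chunker hand-off are assumptions; page offsets and the citation string follow directly from the code above, which joins pages with a single newline and joins citation parts with " - ".

# Hypothetical usage of the 0.9.x document models; import path assumed from the file list.
from statement_extractor.models.document import ChunkingConfig, Document

doc = Document.from_pages(
    ["Acme Corp appointed Jane Doe as CEO.", "Jane Doe previously led Beta Ltd."],
    title="Annual Report",
    year=2024,
    authors=["Acme Corp"],
)

print(doc.page_count)           # 2
print(doc.get_page_at_char(0))  # 1 (offset falls on page 1)
print(doc.metadata.format_citation(page_number=doc.get_page_at_char(40)))
# "Annual Report - Acme Corp - 2024 - p. 2"

# Presumably consumed by statement_extractor/document/chunker.py (see file list).
chunking = ChunkingConfig(target_tokens=500, overlap_tokens=50)
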
@@ -72,6 +72,19 @@ class LabeledStatement(BaseModel):
         default_factory=list,
         description="Taxonomy classifications from Stage 6"
     )
+    # Document tracking fields
+    document_id: Optional[str] = Field(
+        None,
+        description="ID of the source document (for document pipeline)"
+    )
+    page_number: Optional[int] = Field(
+        None,
+        description="Page number where this statement was extracted (1-indexed)"
+    )
+    citation: Optional[str] = Field(
+        None,
+        description="Formatted citation string (e.g., 'Title - Author, 2024, p. 5')"
+    )
 
     def get_label(self, label_type: str) -> Optional[StatementLabel]:
         """Get a label by type, or None if not found."""
@@ -102,28 +115,41 @@ class LabeledStatement(BaseModel):
         """Format as FQN triple."""
         return f"{self.subject_fqn} --[{self.statement.predicate}]--> {self.object_fqn}"
 
+    def _build_entity_dict(self, canonical: CanonicalEntity, entity_type: str) -> dict:
+        """Build entity dict for serialization."""
+        statement_entity = self.statement.subject if entity_type == "subject" else self.statement.object
+        fqn = self.subject_fqn if entity_type == "subject" else self.object_fqn
+
+        # Get canonical_id from identifiers or canonical_match
+        identifiers = canonical.qualified_entity.qualifiers.identifiers
+        canonical_id = identifiers.get("canonical_id")
+        if not canonical_id and canonical.canonical_match:
+            canonical_id = canonical.canonical_match.canonical_id
+
+        result = {
+            "text": statement_entity.text,
+            "type": statement_entity.type.value,
+            "fqn": fqn,
+            "canonical_id": canonical_id,
+        }
+
+        # Add name if available
+        if canonical.name:
+            result["name"] = canonical.name
+
+        # Add qualifiers if available
+        qualifiers_dict = canonical.qualifiers_dict
+        if qualifiers_dict:
+            result["qualifiers"] = qualifiers_dict
+
+        return result
+
     def as_dict(self) -> dict:
         """Convert to a simplified dictionary representation."""
         return {
-            "subject": {
-                "text": self.statement.subject.text,
-                "type": self.statement.subject.type.value,
-                "fqn": self.subject_fqn,
-                "canonical_id": (
-                    self.subject_canonical.canonical_match.canonical_id
-                    if self.subject_canonical.canonical_match else None
-                ),
-            },
+            "subject": self._build_entity_dict(self.subject_canonical, "subject"),
             "predicate": self.statement.predicate,
-            "object": {
-                "text": self.statement.object.text,
-                "type": self.statement.object.type.value,
-                "fqn": self.object_fqn,
-                "canonical_id": (
-                    self.object_canonical.canonical_match.canonical_id
-                    if self.object_canonical.canonical_match else None
-                ),
-            },
+            "object": self._build_entity_dict(self.object_canonical, "object"),
             "source_text": self.statement.source_text,
             "labels": {
                 label.label_type: label.label_value
@@ -137,6 +163,9 @@ class LabeledStatement(BaseModel):
                 }
                 for t in self.taxonomy_results
            ],
+            "document_id": self.document_id,
+            "page_number": self.page_number,
+            "citation": self.citation,
        }
 
     class Config:
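
For orientation, a sketch of the shape LabeledStatement.as_dict() now produces: the subject/object entries come from _build_entity_dict, and the three document-tracking keys are appended at the top level. All concrete values, the FQN format, and the type/label strings are hypothetical; "name" and "qualifiers" appear only when populated, and taxonomy entries are omitted here.

# Illustrative as_dict() output shape after this refactor (values hypothetical).
example_as_dict = {
    "subject": {
        "text": "Jane Doe",
        "type": "PERSON",                   # assumed EntityType value
        "fqn": "PERSON:Jane Doe",           # hypothetical FQN format
        "canonical_id": "wikidata:Q0000000",
        "name": "Jane Doe",                              # only when canonical.name is set
        "qualifiers": {"role": "CEO", "org": "Acme Corp"},  # only when non-empty
    },
    "predicate": "was appointed CEO of",
    "object": {"text": "Acme Corp", "type": "ORG", "fqn": "ORG:Acme Corp", "canonical_id": "LEI:549300XYZ"},
    "source_text": "Jane Doe was appointed CEO of Acme Corp.",
    "labels": {"label_type": "label_value"},
    # ...taxonomy entries omitted...
    "document_id": "doc-123",
    "page_number": 4,
    "citation": "Annual Report - Acme Corp - 2024 - p. 4",
}
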
@@ -3,15 +3,46 @@ Qualifier models for the extraction pipeline.
 
 EntityQualifiers: Semantic qualifiers and external identifiers
 QualifiedEntity: Entity with qualification information from Stage 3
+ResolvedRole: Canonical role information from database
+ResolvedOrganization: Canonical organization information from database
 """
 
-from typing import Optional
+from typing import Any, Optional
 
 from pydantic import BaseModel, Field
 
 from .entity import EntityType
 
 
+class ResolvedRole(BaseModel):
+    """
+    Resolved/canonical role information for a person.
+
+    Populated when matching a person against the database,
+    capturing the canonical role from Wikidata or other sources.
+    """
+    canonical_name: str = Field(..., description="Canonical role name (e.g., 'Chief Executive Officer')")
+    canonical_id: Optional[str] = Field(None, description="Full canonical ID (e.g., 'wikidata:Q484876')")
+    source: str = Field(..., description="Source of resolution (e.g., 'wikidata')")
+    source_id: Optional[str] = Field(None, description="ID in the source (e.g., 'Q484876' for Wikidata)")
+
+
+class ResolvedOrganization(BaseModel):
+    """
+    Resolved/canonical organization information.
+
+    Populated when resolving an organization mentioned in context
+    against the organization database (GLEIF, SEC, Companies House, Wikidata).
+    """
+    canonical_name: str = Field(..., description="Canonical organization name")
+    canonical_id: str = Field(..., description="Full canonical ID (e.g., 'LEI:549300XYZ', 'SEC-CIK:1234567')")
+    source: str = Field(..., description="Source of resolution (e.g., 'gleif', 'sec_edgar', 'wikidata')")
+    source_id: str = Field(..., description="ID in the source")
+    region: Optional[str] = Field(None, description="Organization's region/jurisdiction")
+    match_confidence: float = Field(default=1.0, description="Confidence in the match (0-1)")
+    match_details: Optional[dict[str, Any]] = Field(None, description="Additional match details")
+
+
 class EntityQualifiers(BaseModel):
     """
     Qualifiers that provide context and identifiers for an entity.
@@ -22,6 +53,9 @@ class EntityQualifiers(BaseModel):
    - CompaniesHouseQualifierPlugin: Adds UK company number
    - SECEdgarQualifierPlugin: Adds SEC CIK, ticker
    """
+    # Canonical name from database (for ORG entities)
+    legal_name: Optional[str] = Field(None, description="Canonical legal name from database")
+
     # Semantic qualifiers (for PERSON entities)
     org: Optional[str] = Field(None, description="Organization/employer name")
     role: Optional[str] = Field(None, description="Job title/position/role")
@@ -38,11 +72,22 @@
         description="External identifiers: lei, ch_number, sec_cik, ticker, wikidata_qid, etc."
     )
 
+    # Resolved canonical information (for PERSON entities)
+    resolved_role: Optional[ResolvedRole] = Field(
+        None,
+        description="Canonical role information from database lookup"
+    )
+    resolved_org: Optional[ResolvedOrganization] = Field(
+        None,
+        description="Canonical organization information from database lookup"
+    )
+
     def has_any_qualifier(self) -> bool:
         """Check if any qualifier or identifier is set."""
         return bool(
-            self.org or self.role or self.region or self.country or
-            self.city or self.jurisdiction or self.identifiers
+            self.legal_name or self.org or self.role or self.region or self.country or
+            self.city or self.jurisdiction or self.identifiers or
+            self.resolved_role or self.resolved_org
        )
 
     def merge_with(self, other: "EntityQualifiers") -> "EntityQualifiers":
@@ -53,6 +98,7 @@
         """
         merged_identifiers = {**self.identifiers, **other.identifiers}
         return EntityQualifiers(
+            legal_name=other.legal_name or self.legal_name,
             org=other.org or self.org,
             role=other.role or self.role,
             region=other.region or self.region,
@@ -60,6 +106,8 @@
             city=other.city or self.city,
             jurisdiction=other.jurisdiction or self.jurisdiction,
             identifiers=merged_identifiers,
+            resolved_role=other.resolved_role or self.resolved_role,
+            resolved_org=other.resolved_org or self.resolved_org,
         )
 
 
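A small sketch of how the extended merge_with behaves with the new fields: values from `other` take precedence, identifier dicts are combined, and the resolved_* objects fall back to whichever side has one. Import path is taken from the file list; the identifier values are hypothetical.

# Hypothetical merge of qualifiers produced by two different qualifier plugins.
from statement_extractor.models.qualifiers import EntityQualifiers, ResolvedRole

a = EntityQualifiers(
    org="Acme Corp",
    role="CEO",
    identifiers={"wikidata_qid": "Q0000001"},
)
b = EntityQualifiers(
    legal_name="Acme Corporation Ltd",
    identifiers={"lei": "549300EXAMPLE0000000"},
    resolved_role=ResolvedRole(canonical_name="Chief Executive Officer", source="wikidata"),
)

merged = a.merge_with(b)
assert merged.legal_name == "Acme Corporation Ltd"      # from b (other wins)
assert merged.role == "CEO"                             # kept from a (b has none)
assert set(merged.identifiers) == {"wikidata_qid", "lei"}  # dicts merged
assert merged.has_any_qualifier()
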
@@ -1,8 +1,8 @@
 """
 Statement models for the extraction pipeline.
 
-RawTriple: Output of Stage 1 (Splitting)
-PipelineStatement: Output of Stage 2 (Extraction) with refined entities
+SplitSentence: Output of Stage 1 (Splitting) - atomic sentences/statements
+PipelineStatement: Output of Stage 2 (Extraction) with subject-predicate-object triples
 """
 
 from typing import Optional
@@ -12,30 +12,41 @@ from pydantic import BaseModel, Field
 from .entity import ExtractedEntity
 
 
-class RawTriple(BaseModel):
+class SplitSentence(BaseModel):
     """
-    A raw triple from Stage 1 (Splitting).
+    An atomic sentence from Stage 1 (Splitting).
 
-    Contains the basic text components before entity refinement.
-    Generated by T5-Gemma or other splitting plugins.
+    Stage 1 splits text into atomic sentences that can each be converted
+    to subject-predicate-object triples in Stage 2. Generated by T5-Gemma
+    or other splitting plugins.
     """
-    subject_text: str = Field(..., description="Raw subject text")
-    predicate_text: str = Field(..., description="Raw predicate text")
-    object_text: str = Field(..., description="Raw object text")
-    source_sentence: str = Field(..., description="The source sentence this triple was extracted from")
+    text: str = Field(..., description="The atomic sentence text")
     confidence: float = Field(
         default=1.0,
         ge=0.0,
         le=1.0,
-        description="Extraction confidence from the splitter"
+        description="Confidence that this is a valid atomic statement"
+    )
+    # Document tracking fields
+    document_id: Optional[str] = Field(
+        None,
+        description="ID of the source document (for document pipeline)"
+    )
+    page_number: Optional[int] = Field(
+        None,
+        description="Page number where this sentence was extracted (1-indexed)"
+    )
+    chunk_index: Optional[int] = Field(
+        None,
+        description="Index of the chunk this sentence was extracted from (0-indexed)"
    )
 
     def __str__(self) -> str:
-        return f"{self.subject_text} --[{self.predicate_text}]--> {self.object_text}"
+        return self.text
 
-    def as_tuple(self) -> tuple[str, str, str]:
-        """Return as a simple (subject, predicate, object) tuple."""
-        return (self.subject_text, self.predicate_text, self.object_text)
+
+# Backwards compatibility alias
+RawTriple = SplitSentence
 
 
 class PipelineStatement(BaseModel):
@@ -63,6 +74,19 @@ class PipelineStatement(BaseModel):
         None,
         description="Method used to extract this statement (e.g., 'hybrid', 'gliner', 'model')"
     )
+    # Document tracking fields
+    document_id: Optional[str] = Field(
+        None,
+        description="ID of the source document (for document pipeline)"
+    )
+    page_number: Optional[int] = Field(
+        None,
+        description="Page number where this statement was extracted (1-indexed)"
+    )
+    chunk_index: Optional[int] = Field(
+        None,
+        description="Index of the chunk this statement was extracted from (0-indexed)"
+    )
 
     def __str__(self) -> str:
         return f"{self.subject.text} --[{self.predicate}]--> {self.object.text}"
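
A brief sketch of the renamed Stage 1 model and its compatibility alias, with hypothetical values; the import path comes from the file list, and RawTriple still resolves to SplitSentence, so older imports keep working but now carry the document-tracking fields.

# Hypothetical Stage 1 output under the new model name.
from statement_extractor.models.statement import RawTriple, SplitSentence

sentence = SplitSentence(
    text="Jane Doe was appointed CEO of Acme Corp in 2023.",
    confidence=0.92,
    document_id="doc-123",  # populated by the document pipeline
    page_number=4,
    chunk_index=0,
)

assert RawTriple is SplitSentence  # backwards-compatibility alias
print(str(sentence))               # prints the atomic sentence text
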
@@ -217,7 +217,7 @@ class ScoringConfig(BaseModel):
     quality_weight: float = Field(
         default=1.0,
         ge=0.0,
-        description="Weight for groundedness/quality scores in beam selection"
+        description="Weight for confidence scores in beam selection"
     )
     coverage_weight: float = Field(
         default=0.5,
@@ -16,10 +16,10 @@ class PipelineConfig(BaseModel):
     Controls which stages are enabled, which plugins to use,
     and stage-specific options.
     """
-    # Stage selection (1=Splitting, 2=Extraction, 3=Qualification, 4=Canonicalization, 5=Labeling, 6=Taxonomy)
+    # Stage selection (1=Splitting, 2=Extraction, 3=Qualification, 4=Labeling, 5=Taxonomy)
     enabled_stages: set[int] = Field(
-        default={1, 2, 3, 4, 5, 6},
-        description="Set of enabled stage numbers (1-6)"
+        default={1, 2, 3, 4, 5},
+        description="Set of enabled stage numbers (1-5)"
    )
 
     # Plugin selection
@@ -45,11 +45,7 @@
     )
     qualifier_options: dict[str, Any] = Field(
         default_factory=dict,
-        description="Options passed to qualifier plugins"
-    )
-    canonicalizer_options: dict[str, Any] = Field(
-        default_factory=dict,
-        description="Options passed to canonicalizer plugins"
+        description="Options passed to qualifier plugins (includes canonicalizers)"
     )
     labeler_options: dict[str, Any] = Field(
         default_factory=dict,
@@ -123,9 +119,8 @@ STAGE_NAMES = {
     1: "splitting",
     2: "extraction",
     3: "qualification",
-    4: "canonicalization",
-    5: "labeling",
-    6: "taxonomy",
+    4: "labeling",
+    5: "taxonomy",
 }
 
 
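With canonicalization folded into qualification, the stage numbering shifts down to five stages. A hedged configuration sketch under the new scheme, assuming the remaining plugin-selection fields keep their defaults and using a made-up option key:

# Hypothetical configuration under the new numbering
# (1=splitting, 2=extraction, 3=qualification, 4=labeling, 5=taxonomy).
from statement_extractor.pipeline.config import STAGE_NAMES, PipelineConfig

config = PipelineConfig(
    enabled_stages={1, 2, 3, 4},                 # skip taxonomy classification
    qualifier_options={"use_embeddings": True},  # hypothetical option key
)

print([STAGE_NAMES[s] for s in sorted(config.enabled_stages)])
# ['splitting', 'extraction', 'qualification', 'labeling']
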
@@ -2,7 +2,7 @@
 PipelineContext - Data container that flows through all pipeline stages.
 
 The context accumulates outputs from each stage:
-- Stage 1 (Splitting): raw_triples
+- Stage 1 (Splitting): split_sentences
 - Stage 2 (Extraction): statements
 - Stage 3 (Qualification): qualified_entities
 - Stage 4 (Canonicalization): canonical_entities
@@ -14,7 +14,7 @@ from typing import Any, Optional
 from pydantic import BaseModel, Field
 
 from ..models import (
-    RawTriple,
+    SplitSentence,
     PipelineStatement,
     QualifiedEntity,
     CanonicalEntity,
@@ -37,10 +37,10 @@ class PipelineContext(BaseModel):
         description="Metadata about the source (e.g., document ID, URL, timestamp)"
     )
 
-    # Stage 1 output: Raw triples from splitting
-    raw_triples: list[RawTriple] = Field(
+    # Stage 1 output: Split sentences
+    split_sentences: list[SplitSentence] = Field(
         default_factory=list,
-        description="Raw triples from Stage 1 (Splitting)"
+        description="Atomic sentences from Stage 1 (Splitting)"
     )
 
     # Stage 2 output: Statements with extracted entities
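
Downstream code that read Stage 1 output from the context needs the one-line rename; a minimal migration sketch, assuming only that `ctx` is a populated PipelineContext:

# Hypothetical consumer of Stage 1 output after the rename: read
# ctx.split_sentences (list[SplitSentence]) where ctx.raw_triples was used before.
def stage_one_texts(ctx) -> list[str]:
    # Before 0.9.x this would have been: [str(t) for t in ctx.raw_triples]
    return [s.text for s in ctx.split_sentences]
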