corp-extractor 0.5.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +1227 -10
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +520 -0
  9. statement_extractor/database/importers/__init__.py +24 -0
  10. statement_extractor/database/importers/companies_house.py +545 -0
  11. statement_extractor/database/importers/gleif.py +538 -0
  12. statement_extractor/database/importers/sec_edgar.py +375 -0
  13. statement_extractor/database/importers/wikidata.py +1012 -0
  14. statement_extractor/database/importers/wikidata_people.py +632 -0
  15. statement_extractor/database/models.py +230 -0
  16. statement_extractor/database/resolver.py +245 -0
  17. statement_extractor/database/store.py +1609 -0
  18. statement_extractor/document/__init__.py +62 -0
  19. statement_extractor/document/chunker.py +410 -0
  20. statement_extractor/document/context.py +171 -0
  21. statement_extractor/document/deduplicator.py +173 -0
  22. statement_extractor/document/html_extractor.py +246 -0
  23. statement_extractor/document/loader.py +303 -0
  24. statement_extractor/document/pipeline.py +388 -0
  25. statement_extractor/document/summarizer.py +195 -0
  26. statement_extractor/models/__init__.py +16 -1
  27. statement_extractor/models/canonical.py +44 -1
  28. statement_extractor/models/document.py +308 -0
  29. statement_extractor/models/labels.py +47 -18
  30. statement_extractor/models/qualifiers.py +51 -3
  31. statement_extractor/models/statement.py +26 -0
  32. statement_extractor/pipeline/config.py +6 -11
  33. statement_extractor/pipeline/orchestrator.py +80 -111
  34. statement_extractor/pipeline/registry.py +52 -46
  35. statement_extractor/plugins/__init__.py +20 -8
  36. statement_extractor/plugins/base.py +334 -64
  37. statement_extractor/plugins/extractors/gliner2.py +10 -0
  38. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  39. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  40. statement_extractor/plugins/pdf/__init__.py +10 -0
  41. statement_extractor/plugins/pdf/pypdf.py +291 -0
  42. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  43. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  44. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  45. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  46. statement_extractor/plugins/qualifiers/person.py +578 -14
  47. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  48. statement_extractor/plugins/scrapers/__init__.py +10 -0
  49. statement_extractor/plugins/scrapers/http.py +236 -0
  50. statement_extractor/plugins/splitters/t5_gemma.py +158 -53
  51. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  52. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  53. statement_extractor/scoring.py +8 -8
  54. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  55. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  56. statement_extractor/plugins/canonicalizers/base.py +0 -9
  57. statement_extractor/plugins/canonicalizers/location.py +0 -219
  58. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  59. statement_extractor/plugins/canonicalizers/person.py +0 -242
  60. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  61. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
statement_extractor/document/summarizer.py
@@ -0,0 +1,195 @@
+"""
+DocumentSummarizer - Generate document summaries using Gemma3.
+
+Creates concise summaries focused on entities, events, and relationships
+that are useful for providing context during extraction.
+"""
+
+import logging
+from typing import Optional
+
+from ..models.document import Document
+
+logger = logging.getLogger(__name__)
+
+
+class DocumentSummarizer:
+    """
+    Generates document summaries using the Gemma3 LLM.
+
+    Summaries focus on:
+    - Key entities mentioned
+    - Important events and actions
+    - Relationships between entities
+    """
+
+    MAX_INPUT_TOKENS = 10_000
+    DEFAULT_MAX_OUTPUT_TOKENS = 300
+
+    def __init__(
+        self,
+        max_input_tokens: int = MAX_INPUT_TOKENS,
+        max_output_tokens: int = DEFAULT_MAX_OUTPUT_TOKENS,
+    ):
+        """
+        Initialize the summarizer.
+
+        Args:
+            max_input_tokens: Maximum tokens of input to send to the LLM
+            max_output_tokens: Maximum tokens for the summary output
+        """
+        self._max_input_tokens = max_input_tokens
+        self._max_output_tokens = max_output_tokens
+        self._llm = None
+        self._tokenizer = None
+
+    @property
+    def llm(self):
+        """Lazy-load the LLM."""
+        if self._llm is None:
+            from ..llm import get_llm
+            logger.debug("Loading LLM for summarization")
+            self._llm = get_llm()
+        return self._llm
+
+    @property
+    def tokenizer(self):
+        """Lazy-load tokenizer for token counting."""
+        if self._tokenizer is None:
+            from transformers import AutoTokenizer
+            self._tokenizer = AutoTokenizer.from_pretrained(
+                "Corp-o-Rate-Community/statement-extractor",
+                trust_remote_code=True,
+            )
+        return self._tokenizer
+
+    def _count_tokens(self, text: str) -> int:
+        """Count tokens in text."""
+        return len(self.tokenizer.encode(text, add_special_tokens=False))
+
+    def _truncate_to_tokens(self, text: str, max_tokens: int) -> str:
+        """
+        Truncate text to a maximum number of tokens.
+
+        Tries to truncate at sentence boundaries when possible.
+        """
+        token_count = self._count_tokens(text)
+
+        if token_count <= max_tokens:
+            return text
+
+        # Estimate chars per token
+        chars_per_token = len(text) / token_count
+        target_chars = int(max_tokens * chars_per_token * 0.95)  # 5% buffer
+
+        # Truncate
+        truncated = text[:target_chars]
+
+        # Try to end at a sentence boundary
+        last_period = truncated.rfind(". ")
+        last_newline = truncated.rfind("\n")
+        split_pos = max(last_period, last_newline)
+
+        if split_pos > target_chars * 0.7:  # Don't lose too much text
+            truncated = truncated[:split_pos + 1]
+
+        logger.debug(f"Truncated text from {len(text)} to {len(truncated)} chars")
+        return truncated
+
+    def summarize(
+        self,
+        document: Document,
+        custom_prompt: Optional[str] = None,
+    ) -> str:
+        """
+        Generate a summary of the document.
+
+        Args:
+            document: Document to summarize
+            custom_prompt: Optional custom prompt (uses default if not provided)
+
+        Returns:
+            Summary string
+        """
+        if not document.full_text.strip():
+            logger.warning("Cannot summarize empty document")
+            return ""
+
+        logger.info(f"Generating summary for document {document.document_id}")
+
+        # Truncate text to max input tokens
+        text = self._truncate_to_tokens(document.full_text, self._max_input_tokens)
+
+        # Build prompt
+        if custom_prompt:
+            prompt = f"{custom_prompt}\n\n{text}"
+        else:
+            prompt = self._build_prompt(text, document)
+
+        # Generate summary
+        try:
+            summary = self.llm.generate(
+                prompt=prompt,
+                max_tokens=self._max_output_tokens,
+                stop=["\n\n\n", "---"],
+            )
+            summary = summary.strip()
+            logger.info(f"Generated summary ({len(summary)} chars):")
+            # Log summary with indentation for readability
+            for line in summary.split("\n"):
+                logger.info(f" {line}")
+            return summary
+
+        except Exception as e:
+            logger.error(f"Summary generation failed: {e}")
+            raise
+
+    def _build_prompt(self, text: str, document: Document) -> str:
+        """Build the summarization prompt."""
+        # Include document metadata context if available
+        context_parts = []
+        if document.metadata.title:
+            context_parts.append(f"Title: {document.metadata.title}")
+        if document.metadata.authors:
+            context_parts.append(f"Authors: {', '.join(document.metadata.authors)}")
+        if document.metadata.source_type:
+            context_parts.append(f"Source type: {document.metadata.source_type}")
+
+        context = "\n".join(context_parts) if context_parts else ""
+
+        prompt = f"""Summarize the following document, focusing on:
+1. Key entities (companies, people, locations) mentioned
+2. Important events, actions, and decisions
+3. Relationships between entities
+4. Main topics and themes
+
+Keep the summary concise (2-3 paragraphs) and factual.
+
+{context}
+
+Document text:
+{text}
+
+Summary:"""
+
+        return prompt
+
+    def summarize_text(
+        self,
+        text: str,
+        title: Optional[str] = None,
+    ) -> str:
+        """
+        Generate a summary from plain text.
+
+        Convenience method that creates a temporary Document.
+
+        Args:
+            text: Text to summarize
+            title: Optional document title for context
+
+        Returns:
+            Summary string
+        """
+        document = Document.from_text(text, title=title)
+        return self.summarize(document)
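For orientation, a minimal usage sketch of the new summarizer (a sketch only: the import paths follow the file layout in this diff, and it assumes the package's get_llm() helper can load the Gemma3 model locally):

    from statement_extractor.models import Document
    from statement_extractor.document.summarizer import DocumentSummarizer

    doc = Document.from_text(
        "Acme Corp announced that Jane Doe will join as CFO in March.",  # illustrative text
        title="Acme press release",
    )

    summarizer = DocumentSummarizer(max_input_tokens=10_000, max_output_tokens=300)
    summary = summarizer.summarize(doc)  # truncates input, builds the entity-focused prompt, calls the LLM
    # Or, for raw strings: summarizer.summarize_text("...", title="Acme press release")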
statement_extractor/models/__init__.py
@@ -44,9 +44,16 @@ else:
 # New pipeline models
 from .entity import ExtractedEntity
 from .statement import RawTriple, PipelineStatement
-from .qualifiers import EntityQualifiers, QualifiedEntity
+from .qualifiers import EntityQualifiers, QualifiedEntity, ResolvedRole, ResolvedOrganization
 from .canonical import CanonicalMatch, CanonicalEntity
 from .labels import StatementLabel, LabeledStatement, TaxonomyResult
+from .document import (
+    Document,
+    DocumentMetadata,
+    DocumentPage,
+    TextChunk,
+    ChunkingConfig,
+)
 
 __all__ = [
     # Re-exported from original models.py (backward compatibility)
@@ -66,9 +73,17 @@ __all__ = [
     "PipelineStatement",
     "EntityQualifiers",
     "QualifiedEntity",
+    "ResolvedRole",
+    "ResolvedOrganization",
     "CanonicalMatch",
     "CanonicalEntity",
     "StatementLabel",
     "LabeledStatement",
     "TaxonomyResult",
+    # Document models
+    "Document",
+    "DocumentMetadata",
+    "DocumentPage",
+    "TextChunk",
+    "ChunkingConfig",
 ]
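With these exports in place, the document models should be importable from the models package directly; a small sketch (names taken from the __all__ additions above):

    from statement_extractor.models import Document, DocumentMetadata, TextChunk, ChunkingConfig

    config = ChunkingConfig(max_tokens=2000, target_tokens=1000, overlap_tokens=100)
    doc = Document.from_text("Report text...", title="Annual report")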
statement_extractor/models/canonical.py
@@ -64,9 +64,52 @@ class CanonicalEntity(BaseModel):
     )
     fqn: str = Field(
         ...,
-        description="Fully qualified name, e.g., 'Tim Cook (CEO, Apple Inc)'"
+        description="Fully qualified name, e.g., 'AMAZON CORP INC (SEC-CIK,USA)'"
     )
 
+    @property
+    def name(self) -> Optional[str]:
+        """Get the canonical/legal name if available."""
+        # Prefer legal_name from qualifiers (set by embedding qualifier)
+        if self.qualified_entity.qualifiers.legal_name:
+            return self.qualified_entity.qualifiers.legal_name
+        # Fall back to canonical match name
+        if self.canonical_match and self.canonical_match.canonical_name:
+            return self.canonical_match.canonical_name
+        return None
+
+    @property
+    def qualifiers_dict(self) -> Optional[dict[str, str]]:
+        """
+        Get qualifiers as a dict for serialization.
+
+        Returns a dict with keys like: legal_name, region, source, source_id
+        Only returns non-None values. Returns None if no qualifiers are set.
+        """
+        qualifiers = self.qualified_entity.qualifiers
+        identifiers = qualifiers.identifiers
+        result = {}
+
+        # Add legal name
+        if qualifiers.legal_name:
+            result["legal_name"] = qualifiers.legal_name
+
+        # Add region (prefer region, fall back to jurisdiction/country)
+        if qualifiers.region:
+            result["region"] = qualifiers.region
+        elif qualifiers.jurisdiction:
+            result["region"] = qualifiers.jurisdiction
+        elif qualifiers.country:
+            result["region"] = qualifiers.country
+
+        # Add source and source_id from identifiers
+        if "source" in identifiers:
+            result["source"] = identifiers["source"]
+        if "source_id" in identifiers:
+            result["source_id"] = identifiers["source_id"]
+
+        return result if result else None
+
     @classmethod
     def from_qualified(
         cls,
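Downstream code can now read a display name and flat qualifiers straight off the entity; a hedged sketch (the `entity` variable and its provenance are assumed, field names come from the property bodies above):

    # Assuming `entity` is a CanonicalEntity produced by the qualification stage.
    name = entity.name                    # legal_name if set, else canonical_match.canonical_name, else None
    quals = entity.qualifiers_dict or {}  # subset of {"legal_name", "region", "source", "source_id"}
    region = quals.get("region")          # region, falling back to jurisdiction or country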
statement_extractor/models/document.py
@@ -0,0 +1,308 @@
+"""
+Document models for document-level processing.
+
+Document: A document with metadata, pages, and optional summary
+DocumentMetadata: Metadata about the document source
+DocumentPage: A single page within a document
+TextChunk: A chunk of text for processing with page tracking
+ChunkingConfig: Configuration for text chunking
+"""
+
+import uuid
+from typing import Any, Optional
+
+from pydantic import BaseModel, Field
+
+
+class DocumentMetadata(BaseModel):
+    """
+    Metadata about a document source.
+
+    Contains information about where the document came from and
+    who authored it, useful for generating citations.
+    """
+    url: Optional[str] = Field(None, description="URL source of the document")
+    title: Optional[str] = Field(None, description="Document title")
+    year: Optional[int] = Field(None, description="Publication year")
+    authors: list[str] = Field(default_factory=list, description="List of authors")
+    source_type: Optional[str] = Field(
+        None,
+        description="Type of source: 'pdf', 'webpage', 'text', etc."
+    )
+    custom: dict[str, Any] = Field(
+        default_factory=dict,
+        description="Custom metadata fields"
+    )
+
+    def format_citation(self, page_number: Optional[int] = None) -> str:
+        """
+        Format a citation string for this document.
+
+        Args:
+            page_number: Optional page number to include
+
+        Returns:
+            Citation string like "Title - Author, 2024, p. 5"
+        """
+        parts = []
+
+        if self.title:
+            parts.append(self.title)
+
+        if self.authors:
+            if len(self.authors) == 1:
+                parts.append(self.authors[0])
+            elif len(self.authors) == 2:
+                parts.append(f"{self.authors[0]} & {self.authors[1]}")
+            else:
+                parts.append(f"{self.authors[0]} et al.")
+
+        if self.year:
+            parts.append(str(self.year))
+
+        if page_number is not None:
+            parts.append(f"p. {page_number}")
+
+        return " - ".join(parts) if parts else ""
+
+
+class DocumentPage(BaseModel):
+    """
+    A single page within a document.
+
+    Tracks the page number and character offset for citation purposes.
+    """
+    page_number: int = Field(..., description="1-indexed page number")
+    text: str = Field(..., description="Text content of the page")
+    char_offset: int = Field(
+        ...,
+        description="Character offset of this page in the full document text"
+    )
+
+    @property
+    def char_end(self) -> int:
+        """Get the ending character offset of this page."""
+        return self.char_offset + len(self.text)
+
+
+class TextChunk(BaseModel):
+    """
+    A chunk of text for processing.
+
+    Contains the text along with position tracking for mapping
+    extracted statements back to their source pages.
+    """
+    chunk_index: int = Field(..., description="0-indexed chunk number")
+    text: str = Field(..., description="Chunk text content")
+    start_char: int = Field(..., description="Starting character offset in full document")
+    end_char: int = Field(..., description="Ending character offset in full document")
+    page_numbers: list[int] = Field(
+        default_factory=list,
+        description="Page numbers this chunk spans (1-indexed)"
+    )
+    token_count: int = Field(..., description="Number of tokens in this chunk")
+    overlap_chars: int = Field(
+        default=0,
+        description="Number of characters of overlap from previous chunk"
+    )
+    document_id: str = Field(..., description="ID of the source document")
+
+    @property
+    def primary_page(self) -> Optional[int]:
+        """Get the primary page number for this chunk (first page)."""
+        return self.page_numbers[0] if self.page_numbers else None
+
+
+class ChunkingConfig(BaseModel):
+    """
+    Configuration for document chunking.
+
+    Controls how documents are split into chunks for processing.
+    """
+    max_tokens: int = Field(
+        default=2000,
+        ge=100,
+        description="Maximum tokens per chunk (hard limit)"
+    )
+    target_tokens: int = Field(
+        default=1000,
+        ge=50,
+        description="Target tokens per chunk (soft limit, prefers to split here)"
+    )
+    overlap_tokens: int = Field(
+        default=100,
+        ge=0,
+        description="Tokens of overlap between consecutive chunks"
+    )
+    respect_page_boundaries: bool = Field(
+        default=True,
+        description="Try to split at page boundaries when possible"
+    )
+    respect_sentence_boundaries: bool = Field(
+        default=True,
+        description="Try to split at sentence boundaries when possible"
+    )
+
+
+class Document(BaseModel):
+    """
+    A document for processing through the extraction pipeline.
+
+    Contains the full text, optional page structure, metadata for citations,
+    and an optional summary for context.
+    """
+    document_id: str = Field(
+        default_factory=lambda: str(uuid.uuid4()),
+        description="Unique identifier for this document"
+    )
+    metadata: DocumentMetadata = Field(
+        default_factory=DocumentMetadata,
+        description="Document metadata for citations"
+    )
+    pages: list[DocumentPage] = Field(
+        default_factory=list,
+        description="List of pages (optional, for PDFs)"
+    )
+    full_text: str = Field(
+        default="",
+        description="Full text content of the document"
+    )
+    summary: Optional[str] = Field(
+        None,
+        description="Generated summary of the document"
+    )
+
+    @classmethod
+    def from_text(
+        cls,
+        text: str,
+        title: Optional[str] = None,
+        url: Optional[str] = None,
+        **metadata_kwargs,
+    ) -> "Document":
+        """
+        Create a document from plain text.
+
+        Args:
+            text: The document text
+            title: Optional document title
+            url: Optional source URL
+            **metadata_kwargs: Additional metadata fields
+
+        Returns:
+            Document instance
+        """
+        metadata = DocumentMetadata(
+            title=title,
+            url=url,
+            source_type="text",
+            **metadata_kwargs,
+        )
+        return cls(
+            metadata=metadata,
+            full_text=text,
+        )
+
+    @classmethod
+    def from_pages(
+        cls,
+        pages: list[str],
+        title: Optional[str] = None,
+        source_type: str = "pdf",
+        **metadata_kwargs,
+    ) -> "Document":
+        """
+        Create a document from a list of page texts.
+
+        Args:
+            pages: List of page text strings (0-indexed input, stored as 1-indexed)
+            title: Optional document title
+            source_type: Source type (default: "pdf")
+            **metadata_kwargs: Additional metadata fields
+
+        Returns:
+            Document instance
+        """
+        metadata = DocumentMetadata(
+            title=title,
+            source_type=source_type,
+            **metadata_kwargs,
+        )
+
+        # Build pages with character offsets
+        doc_pages = []
+        char_offset = 0
+
+        for i, page_text in enumerate(pages):
+            doc_pages.append(DocumentPage(
+                page_number=i + 1,  # 1-indexed
+                text=page_text,
+                char_offset=char_offset,
+            ))
+            char_offset += len(page_text)
+            if i < len(pages) - 1:
+                char_offset += 1  # Account for newline between pages
+
+        # Join pages with newlines for full text
+        full_text = "\n".join(pages)
+
+        return cls(
+            metadata=metadata,
+            pages=doc_pages,
+            full_text=full_text,
+        )
+
+    def get_page_at_char(self, char_offset: int) -> Optional[int]:
+        """
+        Get the page number containing a character offset.
+
+        Args:
+            char_offset: Character offset in full_text
+
+        Returns:
+            1-indexed page number, or None if no pages defined
+        """
+        if not self.pages:
+            return None
+
+        for page in self.pages:
+            if page.char_offset <= char_offset < page.char_end:
+                return page.page_number
+
+        # If past the last page, return last page
+        if char_offset >= self.pages[-1].char_end:
+            return self.pages[-1].page_number
+
+        return None
+
+    def get_pages_in_range(self, start_char: int, end_char: int) -> list[int]:
+        """
+        Get all page numbers that overlap with a character range.
+
+        Args:
+            start_char: Start character offset
+            end_char: End character offset
+
+        Returns:
+            List of 1-indexed page numbers
+        """
+        if not self.pages:
+            return []
+
+        page_numbers = []
+        for page in self.pages:
+            # Check if page overlaps with range
+            if page.char_offset < end_char and page.char_end > start_char:
+                page_numbers.append(page.page_number)
+
+        return page_numbers
+
+    @property
+    def page_count(self) -> int:
+        """Get the number of pages in the document."""
+        return len(self.pages)
+
+    @property
+    def char_count(self) -> int:
+        """Get the total character count."""
+        return len(self.full_text)
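A brief sketch of the page-tracking flow these models enable (values are illustrative; the chunker and pipeline wiring live in the other new files listed above):

    from statement_extractor.models import Document

    doc = Document.from_pages(
        ["Page one text about Acme Corp.", "Page two mentions Jane Doe."],
        title="Example filing",
        year=2024,                # extra fields pass through **metadata_kwargs to DocumentMetadata
        authors=["J. Smith"],
    )

    doc.page_count                      # 2
    doc.get_page_at_char(40)            # 2 -- offset 40 falls on the second page
    doc.metadata.format_citation(2)     # "Example filing - J. Smith - 2024 - p. 2"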
statement_extractor/models/labels.py
@@ -72,6 +72,19 @@ class LabeledStatement(BaseModel):
         default_factory=list,
         description="Taxonomy classifications from Stage 6"
     )
+    # Document tracking fields
+    document_id: Optional[str] = Field(
+        None,
+        description="ID of the source document (for document pipeline)"
+    )
+    page_number: Optional[int] = Field(
+        None,
+        description="Page number where this statement was extracted (1-indexed)"
+    )
+    citation: Optional[str] = Field(
+        None,
+        description="Formatted citation string (e.g., 'Title - Author, 2024, p. 5')"
+    )
 
     def get_label(self, label_type: str) -> Optional[StatementLabel]:
         """Get a label by type, or None if not found."""
@@ -102,28 +115,41 @@ class LabeledStatement(BaseModel):
         """Format as FQN triple."""
         return f"{self.subject_fqn} --[{self.statement.predicate}]--> {self.object_fqn}"
 
+    def _build_entity_dict(self, canonical: CanonicalEntity, entity_type: str) -> dict:
+        """Build entity dict for serialization."""
+        statement_entity = self.statement.subject if entity_type == "subject" else self.statement.object
+        fqn = self.subject_fqn if entity_type == "subject" else self.object_fqn
+
+        # Get canonical_id from identifiers or canonical_match
+        identifiers = canonical.qualified_entity.qualifiers.identifiers
+        canonical_id = identifiers.get("canonical_id")
+        if not canonical_id and canonical.canonical_match:
+            canonical_id = canonical.canonical_match.canonical_id
+
+        result = {
+            "text": statement_entity.text,
+            "type": statement_entity.type.value,
+            "fqn": fqn,
+            "canonical_id": canonical_id,
+        }
+
+        # Add name if available
+        if canonical.name:
+            result["name"] = canonical.name
+
+        # Add qualifiers if available
+        qualifiers_dict = canonical.qualifiers_dict
+        if qualifiers_dict:
+            result["qualifiers"] = qualifiers_dict
+
+        return result
+
     def as_dict(self) -> dict:
         """Convert to a simplified dictionary representation."""
         return {
-            "subject": {
-                "text": self.statement.subject.text,
-                "type": self.statement.subject.type.value,
-                "fqn": self.subject_fqn,
-                "canonical_id": (
-                    self.subject_canonical.canonical_match.canonical_id
-                    if self.subject_canonical.canonical_match else None
-                ),
-            },
+            "subject": self._build_entity_dict(self.subject_canonical, "subject"),
             "predicate": self.statement.predicate,
-            "object": {
-                "text": self.statement.object.text,
-                "type": self.statement.object.type.value,
-                "fqn": self.object_fqn,
-                "canonical_id": (
-                    self.object_canonical.canonical_match.canonical_id
-                    if self.object_canonical.canonical_match else None
-                ),
-            },
+            "object": self._build_entity_dict(self.object_canonical, "object"),
             "source_text": self.statement.source_text,
             "labels": {
                 label.label_type: label.label_value
@@ -137,6 +163,9 @@ class LabeledStatement(BaseModel):
                 }
                 for t in self.taxonomy_results
            ],
+            "document_id": self.document_id,
+            "page_number": self.page_number,
+            "citation": self.citation,
         }
 
     class Config:
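With the refactored serialization, as_dict() now nests name/qualifiers under each entity and appends the document tracking fields; an illustrative output shape (keys come from the code above, values are invented for clarity):

    {
        "subject": {
            "text": "Amazon",
            "type": "ORG",                            # enum value; actual values depend on the entity type enum
            "fqn": "AMAZON CORP INC (SEC-CIK,USA)",
            "canonical_id": "...",
            "name": "AMAZON CORP INC",                # present only when a legal/canonical name resolved
            "qualifiers": {"legal_name": "AMAZON CORP INC", "region": "USA", "source": "...", "source_id": "..."},
        },
        "predicate": "...",
        "object": {...},
        "source_text": "...",
        "labels": {...},
        "taxonomy": [...],
        "document_id": "3f2b...",
        "page_number": 5,
        "citation": "Example filing - J. Smith - 2024 - p. 5",
    }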