corp-extractor 0.4.0-py3-none-any.whl → 0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +10 -1
  4. statement_extractor/cli.py +1663 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +6972 -0
  7. statement_extractor/database/__init__.py +52 -0
  8. statement_extractor/database/embeddings.py +186 -0
  9. statement_extractor/database/hub.py +520 -0
  10. statement_extractor/database/importers/__init__.py +24 -0
  11. statement_extractor/database/importers/companies_house.py +545 -0
  12. statement_extractor/database/importers/gleif.py +538 -0
  13. statement_extractor/database/importers/sec_edgar.py +375 -0
  14. statement_extractor/database/importers/wikidata.py +1012 -0
  15. statement_extractor/database/importers/wikidata_people.py +632 -0
  16. statement_extractor/database/models.py +230 -0
  17. statement_extractor/database/resolver.py +245 -0
  18. statement_extractor/database/store.py +1609 -0
  19. statement_extractor/document/__init__.py +62 -0
  20. statement_extractor/document/chunker.py +410 -0
  21. statement_extractor/document/context.py +171 -0
  22. statement_extractor/document/deduplicator.py +173 -0
  23. statement_extractor/document/html_extractor.py +246 -0
  24. statement_extractor/document/loader.py +303 -0
  25. statement_extractor/document/pipeline.py +388 -0
  26. statement_extractor/document/summarizer.py +195 -0
  27. statement_extractor/extractor.py +1 -23
  28. statement_extractor/gliner_extraction.py +4 -74
  29. statement_extractor/llm.py +255 -0
  30. statement_extractor/models/__init__.py +89 -0
  31. statement_extractor/models/canonical.py +182 -0
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/entity.py +102 -0
  34. statement_extractor/models/labels.py +220 -0
  35. statement_extractor/models/qualifiers.py +139 -0
  36. statement_extractor/models/statement.py +101 -0
  37. statement_extractor/models.py +4 -1
  38. statement_extractor/pipeline/__init__.py +39 -0
  39. statement_extractor/pipeline/config.py +129 -0
  40. statement_extractor/pipeline/context.py +177 -0
  41. statement_extractor/pipeline/orchestrator.py +416 -0
  42. statement_extractor/pipeline/registry.py +303 -0
  43. statement_extractor/plugins/__init__.py +55 -0
  44. statement_extractor/plugins/base.py +716 -0
  45. statement_extractor/plugins/extractors/__init__.py +13 -0
  46. statement_extractor/plugins/extractors/base.py +9 -0
  47. statement_extractor/plugins/extractors/gliner2.py +546 -0
  48. statement_extractor/plugins/labelers/__init__.py +29 -0
  49. statement_extractor/plugins/labelers/base.py +9 -0
  50. statement_extractor/plugins/labelers/confidence.py +138 -0
  51. statement_extractor/plugins/labelers/relation_type.py +87 -0
  52. statement_extractor/plugins/labelers/sentiment.py +159 -0
  53. statement_extractor/plugins/labelers/taxonomy.py +386 -0
  54. statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
  55. statement_extractor/plugins/pdf/__init__.py +10 -0
  56. statement_extractor/plugins/pdf/pypdf.py +291 -0
  57. statement_extractor/plugins/qualifiers/__init__.py +30 -0
  58. statement_extractor/plugins/qualifiers/base.py +9 -0
  59. statement_extractor/plugins/qualifiers/companies_house.py +185 -0
  60. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  61. statement_extractor/plugins/qualifiers/gleif.py +197 -0
  62. statement_extractor/plugins/qualifiers/person.py +785 -0
  63. statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
  64. statement_extractor/plugins/scrapers/__init__.py +10 -0
  65. statement_extractor/plugins/scrapers/http.py +236 -0
  66. statement_extractor/plugins/splitters/__init__.py +13 -0
  67. statement_extractor/plugins/splitters/base.py +9 -0
  68. statement_extractor/plugins/splitters/t5_gemma.py +293 -0
  69. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  70. statement_extractor/plugins/taxonomy/embedding.py +484 -0
  71. statement_extractor/plugins/taxonomy/mnli.py +291 -0
  72. statement_extractor/scoring.py +8 -8
  73. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  74. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  75. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
statement_extractor/models/canonical.py
@@ -0,0 +1,182 @@
+"""
+Canonical models for the extraction pipeline.
+
+CanonicalMatch: Result of matching to a canonical form
+CanonicalEntity: Entity with canonical form from Stage 4
+"""
+
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+from .qualifiers import QualifiedEntity
+
+
+class CanonicalMatch(BaseModel):
+    """
+    Result of matching an entity to its canonical form in Stage 4.
+
+    Contains information about how the match was made and confidence level.
+    """
+    canonical_id: Optional[str] = Field(
+        None,
+        description="ID in canonical database (e.g., LEI, Wikidata QID)"
+    )
+    canonical_name: Optional[str] = Field(
+        None,
+        description="Canonical name/label"
+    )
+    match_method: str = Field(
+        ...,
+        description="How the match was made: 'identifier', 'name_exact', 'name_fuzzy', 'llm_verified'"
+    )
+    match_confidence: float = Field(
+        default=1.0,
+        ge=0.0,
+        le=1.0,
+        description="Confidence in the canonical match"
+    )
+    match_details: Optional[dict] = Field(
+        None,
+        description="Additional details about the match (e.g., fuzzy score, LLM reasoning)"
+    )
+
+    def is_high_confidence(self, threshold: float = 0.85) -> bool:
+        """Check if this is a high-confidence match."""
+        return self.match_confidence >= threshold
+
+
+class CanonicalEntity(BaseModel):
+    """
+    An entity with canonical form from Stage 4 (Canonicalization).
+
+    Contains the qualified entity plus its canonical match (if found)
+    and a fully qualified name (FQN) for display.
+    """
+    entity_ref: str = Field(..., description="Reference to the original ExtractedEntity")
+    qualified_entity: QualifiedEntity = Field(
+        ...,
+        description="The qualified entity from Stage 3"
+    )
+    canonical_match: Optional[CanonicalMatch] = Field(
+        None,
+        description="Canonical match if found"
+    )
+    fqn: str = Field(
+        ...,
+        description="Fully qualified name, e.g., 'AMAZON CORP INC (SEC-CIK,USA)'"
+    )
+
+    @property
+    def name(self) -> Optional[str]:
+        """Get the canonical/legal name if available."""
+        # Prefer legal_name from qualifiers (set by embedding qualifier)
+        if self.qualified_entity.qualifiers.legal_name:
+            return self.qualified_entity.qualifiers.legal_name
+        # Fall back to canonical match name
+        if self.canonical_match and self.canonical_match.canonical_name:
+            return self.canonical_match.canonical_name
+        return None
+
+    @property
+    def qualifiers_dict(self) -> Optional[dict[str, str]]:
+        """
+        Get qualifiers as a dict for serialization.
+
+        Returns a dict with keys like: legal_name, region, source, source_id
+        Only returns non-None values. Returns None if no qualifiers are set.
+        """
+        qualifiers = self.qualified_entity.qualifiers
+        identifiers = qualifiers.identifiers
+        result = {}
+
+        # Add legal name
+        if qualifiers.legal_name:
+            result["legal_name"] = qualifiers.legal_name
+
+        # Add region (prefer region, fall back to jurisdiction/country)
+        if qualifiers.region:
+            result["region"] = qualifiers.region
+        elif qualifiers.jurisdiction:
+            result["region"] = qualifiers.jurisdiction
+        elif qualifiers.country:
+            result["region"] = qualifiers.country
+
+        # Add source and source_id from identifiers
+        if "source" in identifiers:
+            result["source"] = identifiers["source"]
+        if "source_id" in identifiers:
+            result["source_id"] = identifiers["source_id"]
+
+        return result if result else None
+
+    @classmethod
+    def from_qualified(
+        cls,
+        qualified: QualifiedEntity,
+        canonical_match: Optional[CanonicalMatch] = None,
+        fqn: Optional[str] = None,
+    ) -> "CanonicalEntity":
+        """Create a CanonicalEntity from a QualifiedEntity."""
+        if fqn is None:
+            # Generate default FQN from qualifiers
+            fqn = cls._generate_fqn(qualified, canonical_match)
+
+        return cls(
+            entity_ref=qualified.entity_ref,
+            qualified_entity=qualified,
+            canonical_match=canonical_match,
+            fqn=fqn,
+        )
+
+    @staticmethod
+    def _generate_fqn(
+        qualified: QualifiedEntity,
+        canonical_match: Optional[CanonicalMatch] = None
+    ) -> str:
+        """
+        Generate a fully qualified name from qualifiers.
+
+        Examples:
+            - PERSON with role+org: "Tim Cook (CEO, Apple Inc)"
+            - ORG with canonical: "Apple Inc (AAPL)"
+            - PERSON with no qualifiers: "Tim Cook"
+        """
+        # Use canonical name if available, otherwise fall back to original text
+        if canonical_match and canonical_match.canonical_name:
+            base_name = canonical_match.canonical_name
+        else:
+            base_name = qualified.original_text
+
+        qualifiers = qualified.qualifiers
+        parts = []
+        seen = set()  # Track seen values to avoid duplicates
+
+        def add_part(value: str) -> None:
+            """Add a part if not already seen (case-insensitive)."""
+            if value and value.lower() not in seen:
+                parts.append(value)
+                seen.add(value.lower())
+
+        # Add role for PERSON entities
+        if qualifiers.role:
+            add_part(qualifiers.role)
+
+        # Add organization for PERSON entities
+        if qualifiers.org:
+            add_part(qualifiers.org)
+
+        # Add ticker for ORG entities
+        if "ticker" in qualifiers.identifiers:
+            add_part(qualifiers.identifiers["ticker"])
+
+        # Add jurisdiction if relevant
+        if qualifiers.jurisdiction and not qualifiers.org:
+            add_part(qualifiers.jurisdiction)
+
+        if parts:
+            return f"{base_name} ({', '.join(parts)})"
+        return base_name
+
+    class Config:
+        frozen = False  # Allow modification during pipeline stages
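For orientation, a minimal usage sketch of the new CanonicalMatch model (the identifier and score below are illustrative, not taken from the package; a full CanonicalEntity additionally needs a QualifiedEntity from Stage 3):

from statement_extractor.models.canonical import CanonicalMatch

# Hypothetical match against a GLEIF record; the LEI and confidence are made up.
match = CanonicalMatch(
    canonical_id="5493001KJTIIGC8Y1R12",
    canonical_name="Apple Inc.",
    match_method="identifier",
    match_confidence=0.97,
)
assert match.is_high_confidence()  # 0.97 >= the default 0.85 threshold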
statement_extractor/models/document.py
@@ -0,0 +1,308 @@
+"""
+Document models for document-level processing.
+
+Document: A document with metadata, pages, and optional summary
+DocumentMetadata: Metadata about the document source
+DocumentPage: A single page within a document
+TextChunk: A chunk of text for processing with page tracking
+ChunkingConfig: Configuration for text chunking
+"""
+
+import uuid
+from typing import Any, Optional
+
+from pydantic import BaseModel, Field
+
+
+class DocumentMetadata(BaseModel):
+    """
+    Metadata about a document source.
+
+    Contains information about where the document came from and
+    who authored it, useful for generating citations.
+    """
+    url: Optional[str] = Field(None, description="URL source of the document")
+    title: Optional[str] = Field(None, description="Document title")
+    year: Optional[int] = Field(None, description="Publication year")
+    authors: list[str] = Field(default_factory=list, description="List of authors")
+    source_type: Optional[str] = Field(
+        None,
+        description="Type of source: 'pdf', 'webpage', 'text', etc."
+    )
+    custom: dict[str, Any] = Field(
+        default_factory=dict,
+        description="Custom metadata fields"
+    )
+
+    def format_citation(self, page_number: Optional[int] = None) -> str:
+        """
+        Format a citation string for this document.
+
+        Args:
+            page_number: Optional page number to include
+
+        Returns:
+            Citation string like "Title - Author, 2024, p. 5"
+        """
+        parts = []
+
+        if self.title:
+            parts.append(self.title)
+
+        if self.authors:
+            if len(self.authors) == 1:
+                parts.append(self.authors[0])
+            elif len(self.authors) == 2:
+                parts.append(f"{self.authors[0]} & {self.authors[1]}")
+            else:
+                parts.append(f"{self.authors[0]} et al.")
+
+        if self.year:
+            parts.append(str(self.year))
+
+        if page_number is not None:
+            parts.append(f"p. {page_number}")
+
+        return " - ".join(parts) if parts else ""
+
+
+class DocumentPage(BaseModel):
+    """
+    A single page within a document.
+
+    Tracks the page number and character offset for citation purposes.
+    """
+    page_number: int = Field(..., description="1-indexed page number")
+    text: str = Field(..., description="Text content of the page")
+    char_offset: int = Field(
+        ...,
+        description="Character offset of this page in the full document text"
+    )
+
+    @property
+    def char_end(self) -> int:
+        """Get the ending character offset of this page."""
+        return self.char_offset + len(self.text)
+
+
+class TextChunk(BaseModel):
+    """
+    A chunk of text for processing.
+
+    Contains the text along with position tracking for mapping
+    extracted statements back to their source pages.
+    """
+    chunk_index: int = Field(..., description="0-indexed chunk number")
+    text: str = Field(..., description="Chunk text content")
+    start_char: int = Field(..., description="Starting character offset in full document")
+    end_char: int = Field(..., description="Ending character offset in full document")
+    page_numbers: list[int] = Field(
+        default_factory=list,
+        description="Page numbers this chunk spans (1-indexed)"
+    )
+    token_count: int = Field(..., description="Number of tokens in this chunk")
+    overlap_chars: int = Field(
+        default=0,
+        description="Number of characters of overlap from previous chunk"
+    )
+    document_id: str = Field(..., description="ID of the source document")
+
+    @property
+    def primary_page(self) -> Optional[int]:
+        """Get the primary page number for this chunk (first page)."""
+        return self.page_numbers[0] if self.page_numbers else None
+
+
+class ChunkingConfig(BaseModel):
+    """
+    Configuration for document chunking.
+
+    Controls how documents are split into chunks for processing.
+    """
+    max_tokens: int = Field(
+        default=2000,
+        ge=100,
+        description="Maximum tokens per chunk (hard limit)"
+    )
+    target_tokens: int = Field(
+        default=1000,
+        ge=50,
+        description="Target tokens per chunk (soft limit, prefers to split here)"
+    )
+    overlap_tokens: int = Field(
+        default=100,
+        ge=0,
+        description="Tokens of overlap between consecutive chunks"
+    )
+    respect_page_boundaries: bool = Field(
+        default=True,
+        description="Try to split at page boundaries when possible"
+    )
+    respect_sentence_boundaries: bool = Field(
+        default=True,
+        description="Try to split at sentence boundaries when possible"
+    )
+
+
+class Document(BaseModel):
+    """
+    A document for processing through the extraction pipeline.
+
+    Contains the full text, optional page structure, metadata for citations,
+    and an optional summary for context.
+    """
+    document_id: str = Field(
+        default_factory=lambda: str(uuid.uuid4()),
+        description="Unique identifier for this document"
+    )
+    metadata: DocumentMetadata = Field(
+        default_factory=DocumentMetadata,
+        description="Document metadata for citations"
+    )
+    pages: list[DocumentPage] = Field(
+        default_factory=list,
+        description="List of pages (optional, for PDFs)"
+    )
+    full_text: str = Field(
+        default="",
+        description="Full text content of the document"
+    )
+    summary: Optional[str] = Field(
+        None,
+        description="Generated summary of the document"
+    )
+
+    @classmethod
+    def from_text(
+        cls,
+        text: str,
+        title: Optional[str] = None,
+        url: Optional[str] = None,
+        **metadata_kwargs,
+    ) -> "Document":
+        """
+        Create a document from plain text.
+
+        Args:
+            text: The document text
+            title: Optional document title
+            url: Optional source URL
+            **metadata_kwargs: Additional metadata fields
+
+        Returns:
+            Document instance
+        """
+        metadata = DocumentMetadata(
+            title=title,
+            url=url,
+            source_type="text",
+            **metadata_kwargs,
+        )
+        return cls(
+            metadata=metadata,
+            full_text=text,
+        )
+
+    @classmethod
+    def from_pages(
+        cls,
+        pages: list[str],
+        title: Optional[str] = None,
+        source_type: str = "pdf",
+        **metadata_kwargs,
+    ) -> "Document":
+        """
+        Create a document from a list of page texts.
+
+        Args:
+            pages: List of page text strings (0-indexed input, stored as 1-indexed)
+            title: Optional document title
+            source_type: Source type (default: "pdf")
+            **metadata_kwargs: Additional metadata fields
+
+        Returns:
+            Document instance
+        """
+        metadata = DocumentMetadata(
+            title=title,
+            source_type=source_type,
+            **metadata_kwargs,
+        )
+
+        # Build pages with character offsets
+        doc_pages = []
+        char_offset = 0
+
+        for i, page_text in enumerate(pages):
+            doc_pages.append(DocumentPage(
+                page_number=i + 1,  # 1-indexed
+                text=page_text,
+                char_offset=char_offset,
+            ))
+            char_offset += len(page_text)
+            if i < len(pages) - 1:
+                char_offset += 1  # Account for newline between pages
+
+        # Join pages with newlines for full text
+        full_text = "\n".join(pages)
+
+        return cls(
+            metadata=metadata,
+            pages=doc_pages,
+            full_text=full_text,
+        )
+
+    def get_page_at_char(self, char_offset: int) -> Optional[int]:
+        """
+        Get the page number containing a character offset.
+
+        Args:
+            char_offset: Character offset in full_text
+
+        Returns:
+            1-indexed page number, or None if no pages defined
+        """
+        if not self.pages:
+            return None
+
+        for page in self.pages:
+            if page.char_offset <= char_offset < page.char_end:
+                return page.page_number
+
+        # If past the last page, return last page
+        if char_offset >= self.pages[-1].char_end:
+            return self.pages[-1].page_number
+
+        return None
+
+    def get_pages_in_range(self, start_char: int, end_char: int) -> list[int]:
+        """
+        Get all page numbers that overlap with a character range.
+
+        Args:
+            start_char: Start character offset
+            end_char: End character offset
+
+        Returns:
+            List of 1-indexed page numbers
+        """
+        if not self.pages:
+            return []
+
+        page_numbers = []
+        for page in self.pages:
+            # Check if page overlaps with range
+            if page.char_offset < end_char and page.char_end > start_char:
+                page_numbers.append(page.page_number)
+
+        return page_numbers
+
+    @property
+    def page_count(self) -> int:
+        """Get the number of pages in the document."""
+        return len(self.pages)
+
+    @property
+    def char_count(self) -> int:
+        """Get the total character count."""
+        return len(self.full_text)
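A short sketch of how the page-tracking helpers on Document behave (input strings and title are illustrative):

from statement_extractor.models.document import Document

# Two single-sentence pages; offsets follow from the page lengths plus
# the newline inserted between pages by from_pages().
doc = Document.from_pages(
    ["First page text.", "Second page text."],
    title="Annual Report",
)
print(doc.page_count)                               # 2
print(doc.get_page_at_char(0))                      # 1
print(doc.get_pages_in_range(0, doc.char_count))    # [1, 2]
print(doc.metadata.format_citation(page_number=2))  # "Annual Report - p. 2"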
statement_extractor/models/entity.py
@@ -0,0 +1,102 @@
+"""
+Entity models for the extraction pipeline.
+
+ExtractedEntity represents entities identified during extraction with
+confidence scores and span information.
+
+Note: EntityType is imported from the original models.py for consistency.
+"""
+
+from typing import Optional, TYPE_CHECKING
+import uuid
+
+from pydantic import BaseModel, Field
+
+# Import EntityType from parent module to avoid duplication
+# This will be populated by __init__.py which loads from old models.py
+if TYPE_CHECKING:
+    from enum import Enum
+
+    class EntityType(str, Enum):
+        """Supported entity types for subjects and objects."""
+        ORG = "ORG"
+        PERSON = "PERSON"
+        GPE = "GPE"
+        LOC = "LOC"
+        PRODUCT = "PRODUCT"
+        EVENT = "EVENT"
+        WORK_OF_ART = "WORK_OF_ART"
+        LAW = "LAW"
+        DATE = "DATE"
+        MONEY = "MONEY"
+        PERCENT = "PERCENT"
+        QUANTITY = "QUANTITY"
+        UNKNOWN = "UNKNOWN"
+else:
+    # At runtime, we need to import it from somewhere
+    # Try the old models.py location first
+    try:
+        import importlib.util
+        from pathlib import Path
+        _models_py_path = Path(__file__).parent.parent / "models.py"
+        _spec = importlib.util.spec_from_file_location("_old_models", _models_py_path)
+        _old_models = importlib.util.module_from_spec(_spec)
+        _spec.loader.exec_module(_old_models)
+        EntityType = _old_models.EntityType
+    except Exception:
+        # Fallback to defining it here
+        from enum import Enum
+
+        class EntityType(str, Enum):
+            """Supported entity types for subjects and objects."""
+            ORG = "ORG"
+            PERSON = "PERSON"
+            GPE = "GPE"
+            LOC = "LOC"
+            PRODUCT = "PRODUCT"
+            EVENT = "EVENT"
+            WORK_OF_ART = "WORK_OF_ART"
+            LAW = "LAW"
+            DATE = "DATE"
+            MONEY = "MONEY"
+            PERCENT = "PERCENT"
+            QUANTITY = "QUANTITY"
+            UNKNOWN = "UNKNOWN"
+
+
+class ExtractedEntity(BaseModel):
+    """
+    An entity extracted from text with type and confidence information.
+
+    Used in Stage 2 (Extraction) and flows through subsequent stages.
+    """
+    text: str = Field(..., description="The entity text as extracted")
+    type: EntityType = Field(default=EntityType.UNKNOWN, description="The entity type")
+    span: Optional[tuple[int, int]] = Field(
+        None,
+        description="Character offsets (start, end) in source text"
+    )
+    confidence: float = Field(
+        default=1.0,
+        ge=0.0,
+        le=1.0,
+        description="Confidence score for this entity extraction"
+    )
+    entity_ref: str = Field(
+        default_factory=lambda: str(uuid.uuid4()),
+        description="Unique reference ID for tracking this entity through the pipeline"
+    )
+
+    def __str__(self) -> str:
+        return f"{self.text} ({self.type.value})"
+
+    def __hash__(self) -> int:
+        return hash(self.entity_ref)
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, ExtractedEntity):
+            return False
+        return self.entity_ref == other.entity_ref
+
+    class Config:
+        frozen = False  # Allow modification during pipeline stages
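Finally, a sketch of ExtractedEntity's identity semantics (the entities below are illustrative): equality and hashing key on the per-extraction entity_ref, so two extractions of the same surface text remain distinct:

from statement_extractor.models.entity import EntityType, ExtractedEntity

a = ExtractedEntity(text="Apple Inc.", type=EntityType.ORG, span=(0, 10))
b = ExtractedEntity(text="Apple Inc.", type=EntityType.ORG, span=(0, 10))

print(str(a))  # "Apple Inc. (ORG)"
assert a != b  # identity is per-extraction (entity_ref), not by text
assert len({a, b}) == 2  # hashing likewise keys on entity_ref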