corp-extractor 0.4.0-py3-none-any.whl → 0.9.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
- corp_extractor-0.9.0.dist-info/RECORD +76 -0
- statement_extractor/__init__.py +10 -1
- statement_extractor/cli.py +1663 -17
- statement_extractor/data/default_predicates.json +368 -0
- statement_extractor/data/statement_taxonomy.json +6972 -0
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +520 -0
- statement_extractor/database/importers/__init__.py +24 -0
- statement_extractor/database/importers/companies_house.py +545 -0
- statement_extractor/database/importers/gleif.py +538 -0
- statement_extractor/database/importers/sec_edgar.py +375 -0
- statement_extractor/database/importers/wikidata.py +1012 -0
- statement_extractor/database/importers/wikidata_people.py +632 -0
- statement_extractor/database/models.py +230 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +1609 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +173 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/extractor.py +1 -23
- statement_extractor/gliner_extraction.py +4 -74
- statement_extractor/llm.py +255 -0
- statement_extractor/models/__init__.py +89 -0
- statement_extractor/models/canonical.py +182 -0
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/entity.py +102 -0
- statement_extractor/models/labels.py +220 -0
- statement_extractor/models/qualifiers.py +139 -0
- statement_extractor/models/statement.py +101 -0
- statement_extractor/models.py +4 -1
- statement_extractor/pipeline/__init__.py +39 -0
- statement_extractor/pipeline/config.py +129 -0
- statement_extractor/pipeline/context.py +177 -0
- statement_extractor/pipeline/orchestrator.py +416 -0
- statement_extractor/pipeline/registry.py +303 -0
- statement_extractor/plugins/__init__.py +55 -0
- statement_extractor/plugins/base.py +716 -0
- statement_extractor/plugins/extractors/__init__.py +13 -0
- statement_extractor/plugins/extractors/base.py +9 -0
- statement_extractor/plugins/extractors/gliner2.py +546 -0
- statement_extractor/plugins/labelers/__init__.py +29 -0
- statement_extractor/plugins/labelers/base.py +9 -0
- statement_extractor/plugins/labelers/confidence.py +138 -0
- statement_extractor/plugins/labelers/relation_type.py +87 -0
- statement_extractor/plugins/labelers/sentiment.py +159 -0
- statement_extractor/plugins/labelers/taxonomy.py +386 -0
- statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +30 -0
- statement_extractor/plugins/qualifiers/base.py +9 -0
- statement_extractor/plugins/qualifiers/companies_house.py +185 -0
- statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
- statement_extractor/plugins/qualifiers/gleif.py +197 -0
- statement_extractor/plugins/qualifiers/person.py +785 -0
- statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/__init__.py +13 -0
- statement_extractor/plugins/splitters/base.py +9 -0
- statement_extractor/plugins/splitters/t5_gemma.py +293 -0
- statement_extractor/plugins/taxonomy/__init__.py +13 -0
- statement_extractor/plugins/taxonomy/embedding.py +484 -0
- statement_extractor/plugins/taxonomy/mnli.py +291 -0
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.4.0.dist-info/RECORD +0 -12
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
statement_extractor/models/canonical.py (new file)
@@ -0,0 +1,182 @@
"""
Canonical models for the extraction pipeline.

CanonicalMatch: Result of matching to a canonical form
CanonicalEntity: Entity with canonical form from Stage 4
"""

from typing import Optional

from pydantic import BaseModel, Field

from .qualifiers import QualifiedEntity


class CanonicalMatch(BaseModel):
    """
    Result of matching an entity to its canonical form in Stage 4.

    Contains information about how the match was made and confidence level.
    """
    canonical_id: Optional[str] = Field(
        None,
        description="ID in canonical database (e.g., LEI, Wikidata QID)"
    )
    canonical_name: Optional[str] = Field(
        None,
        description="Canonical name/label"
    )
    match_method: str = Field(
        ...,
        description="How the match was made: 'identifier', 'name_exact', 'name_fuzzy', 'llm_verified'"
    )
    match_confidence: float = Field(
        default=1.0,
        ge=0.0,
        le=1.0,
        description="Confidence in the canonical match"
    )
    match_details: Optional[dict] = Field(
        None,
        description="Additional details about the match (e.g., fuzzy score, LLM reasoning)"
    )

    def is_high_confidence(self, threshold: float = 0.85) -> bool:
        """Check if this is a high-confidence match."""
        return self.match_confidence >= threshold


class CanonicalEntity(BaseModel):
    """
    An entity with canonical form from Stage 4 (Canonicalization).

    Contains the qualified entity plus its canonical match (if found)
    and a fully qualified name (FQN) for display.
    """
    entity_ref: str = Field(..., description="Reference to the original ExtractedEntity")
    qualified_entity: QualifiedEntity = Field(
        ...,
        description="The qualified entity from Stage 3"
    )
    canonical_match: Optional[CanonicalMatch] = Field(
        None,
        description="Canonical match if found"
    )
    fqn: str = Field(
        ...,
        description="Fully qualified name, e.g., 'AMAZON CORP INC (SEC-CIK,USA)'"
    )

    @property
    def name(self) -> Optional[str]:
        """Get the canonical/legal name if available."""
        # Prefer legal_name from qualifiers (set by embedding qualifier)
        if self.qualified_entity.qualifiers.legal_name:
            return self.qualified_entity.qualifiers.legal_name
        # Fall back to canonical match name
        if self.canonical_match and self.canonical_match.canonical_name:
            return self.canonical_match.canonical_name
        return None

    @property
    def qualifiers_dict(self) -> Optional[dict[str, str]]:
        """
        Get qualifiers as a dict for serialization.

        Returns a dict with keys like: legal_name, region, source, source_id
        Only returns non-None values. Returns None if no qualifiers are set.
        """
        qualifiers = self.qualified_entity.qualifiers
        identifiers = qualifiers.identifiers
        result = {}

        # Add legal name
        if qualifiers.legal_name:
            result["legal_name"] = qualifiers.legal_name

        # Add region (prefer region, fall back to jurisdiction/country)
        if qualifiers.region:
            result["region"] = qualifiers.region
        elif qualifiers.jurisdiction:
            result["region"] = qualifiers.jurisdiction
        elif qualifiers.country:
            result["region"] = qualifiers.country

        # Add source and source_id from identifiers
        if "source" in identifiers:
            result["source"] = identifiers["source"]
        if "source_id" in identifiers:
            result["source_id"] = identifiers["source_id"]

        return result if result else None

    @classmethod
    def from_qualified(
        cls,
        qualified: QualifiedEntity,
        canonical_match: Optional[CanonicalMatch] = None,
        fqn: Optional[str] = None,
    ) -> "CanonicalEntity":
        """Create a CanonicalEntity from a QualifiedEntity."""
        if fqn is None:
            # Generate default FQN from qualifiers
            fqn = cls._generate_fqn(qualified, canonical_match)

        return cls(
            entity_ref=qualified.entity_ref,
            qualified_entity=qualified,
            canonical_match=canonical_match,
            fqn=fqn,
        )

    @staticmethod
    def _generate_fqn(
        qualified: QualifiedEntity,
        canonical_match: Optional[CanonicalMatch] = None
    ) -> str:
        """
        Generate a fully qualified name from qualifiers.

        Examples:
        - PERSON with role+org: "Tim Cook (CEO, Apple Inc)"
        - ORG with canonical: "Apple Inc (AAPL)"
        - PERSON with no qualifiers: "Tim Cook"
        """
        # Use canonical name if available, otherwise fall back to original text
        if canonical_match and canonical_match.canonical_name:
            base_name = canonical_match.canonical_name
        else:
            base_name = qualified.original_text

        qualifiers = qualified.qualifiers
        parts = []
        seen = set()  # Track seen values to avoid duplicates

        def add_part(value: str) -> None:
            """Add a part if not already seen (case-insensitive)."""
            if value and value.lower() not in seen:
                parts.append(value)
                seen.add(value.lower())

        # Add role for PERSON entities
        if qualifiers.role:
            add_part(qualifiers.role)

        # Add organization for PERSON entities
        if qualifiers.org:
            add_part(qualifiers.org)

        # Add ticker for ORG entities
        if "ticker" in qualifiers.identifiers:
            add_part(qualifiers.identifiers["ticker"])

        # Add jurisdiction if relevant
        if qualifiers.jurisdiction and not qualifiers.org:
            add_part(qualifiers.jurisdiction)

        if parts:
            return f"{base_name} ({', '.join(parts)})"
        return base_name

    class Config:
        frozen = False  # Allow modification during pipeline stages
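For orientation only (not part of the packaged diff): a minimal sketch of how the new `CanonicalMatch` model might be used, with the import path inferred from the file list above and all values hypothetical.

```python
# Illustrative sketch only -- import path and values assumed, not from the diff.
from statement_extractor.models.canonical import CanonicalMatch

match = CanonicalMatch(
    canonical_id="Q312",             # e.g. a Wikidata QID
    canonical_name="Apple Inc.",
    match_method="name_exact",
    match_confidence=0.92,
)
print(match.is_high_confidence())    # True -- default threshold is 0.85
```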
statement_extractor/models/document.py (new file)
@@ -0,0 +1,308 @@
"""
Document models for document-level processing.

Document: A document with metadata, pages, and optional summary
DocumentMetadata: Metadata about the document source
DocumentPage: A single page within a document
TextChunk: A chunk of text for processing with page tracking
ChunkingConfig: Configuration for text chunking
"""

import uuid
from typing import Any, Optional

from pydantic import BaseModel, Field


class DocumentMetadata(BaseModel):
    """
    Metadata about a document source.

    Contains information about where the document came from and
    who authored it, useful for generating citations.
    """
    url: Optional[str] = Field(None, description="URL source of the document")
    title: Optional[str] = Field(None, description="Document title")
    year: Optional[int] = Field(None, description="Publication year")
    authors: list[str] = Field(default_factory=list, description="List of authors")
    source_type: Optional[str] = Field(
        None,
        description="Type of source: 'pdf', 'webpage', 'text', etc."
    )
    custom: dict[str, Any] = Field(
        default_factory=dict,
        description="Custom metadata fields"
    )

    def format_citation(self, page_number: Optional[int] = None) -> str:
        """
        Format a citation string for this document.

        Args:
            page_number: Optional page number to include

        Returns:
            Citation string like "Title - Author, 2024, p. 5"
        """
        parts = []

        if self.title:
            parts.append(self.title)

        if self.authors:
            if len(self.authors) == 1:
                parts.append(self.authors[0])
            elif len(self.authors) == 2:
                parts.append(f"{self.authors[0]} & {self.authors[1]}")
            else:
                parts.append(f"{self.authors[0]} et al.")

        if self.year:
            parts.append(str(self.year))

        if page_number is not None:
            parts.append(f"p. {page_number}")

        return " - ".join(parts) if parts else ""


class DocumentPage(BaseModel):
    """
    A single page within a document.

    Tracks the page number and character offset for citation purposes.
    """
    page_number: int = Field(..., description="1-indexed page number")
    text: str = Field(..., description="Text content of the page")
    char_offset: int = Field(
        ...,
        description="Character offset of this page in the full document text"
    )

    @property
    def char_end(self) -> int:
        """Get the ending character offset of this page."""
        return self.char_offset + len(self.text)


class TextChunk(BaseModel):
    """
    A chunk of text for processing.

    Contains the text along with position tracking for mapping
    extracted statements back to their source pages.
    """
    chunk_index: int = Field(..., description="0-indexed chunk number")
    text: str = Field(..., description="Chunk text content")
    start_char: int = Field(..., description="Starting character offset in full document")
    end_char: int = Field(..., description="Ending character offset in full document")
    page_numbers: list[int] = Field(
        default_factory=list,
        description="Page numbers this chunk spans (1-indexed)"
    )
    token_count: int = Field(..., description="Number of tokens in this chunk")
    overlap_chars: int = Field(
        default=0,
        description="Number of characters of overlap from previous chunk"
    )
    document_id: str = Field(..., description="ID of the source document")

    @property
    def primary_page(self) -> Optional[int]:
        """Get the primary page number for this chunk (first page)."""
        return self.page_numbers[0] if self.page_numbers else None


class ChunkingConfig(BaseModel):
    """
    Configuration for document chunking.

    Controls how documents are split into chunks for processing.
    """
    max_tokens: int = Field(
        default=2000,
        ge=100,
        description="Maximum tokens per chunk (hard limit)"
    )
    target_tokens: int = Field(
        default=1000,
        ge=50,
        description="Target tokens per chunk (soft limit, prefers to split here)"
    )
    overlap_tokens: int = Field(
        default=100,
        ge=0,
        description="Tokens of overlap between consecutive chunks"
    )
    respect_page_boundaries: bool = Field(
        default=True,
        description="Try to split at page boundaries when possible"
    )
    respect_sentence_boundaries: bool = Field(
        default=True,
        description="Try to split at sentence boundaries when possible"
    )


class Document(BaseModel):
    """
    A document for processing through the extraction pipeline.

    Contains the full text, optional page structure, metadata for citations,
    and an optional summary for context.
    """
    document_id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Unique identifier for this document"
    )
    metadata: DocumentMetadata = Field(
        default_factory=DocumentMetadata,
        description="Document metadata for citations"
    )
    pages: list[DocumentPage] = Field(
        default_factory=list,
        description="List of pages (optional, for PDFs)"
    )
    full_text: str = Field(
        default="",
        description="Full text content of the document"
    )
    summary: Optional[str] = Field(
        None,
        description="Generated summary of the document"
    )

    @classmethod
    def from_text(
        cls,
        text: str,
        title: Optional[str] = None,
        url: Optional[str] = None,
        **metadata_kwargs,
    ) -> "Document":
        """
        Create a document from plain text.

        Args:
            text: The document text
            title: Optional document title
            url: Optional source URL
            **metadata_kwargs: Additional metadata fields

        Returns:
            Document instance
        """
        metadata = DocumentMetadata(
            title=title,
            url=url,
            source_type="text",
            **metadata_kwargs,
        )
        return cls(
            metadata=metadata,
            full_text=text,
        )

    @classmethod
    def from_pages(
        cls,
        pages: list[str],
        title: Optional[str] = None,
        source_type: str = "pdf",
        **metadata_kwargs,
    ) -> "Document":
        """
        Create a document from a list of page texts.

        Args:
            pages: List of page text strings (0-indexed input, stored as 1-indexed)
            title: Optional document title
            source_type: Source type (default: "pdf")
            **metadata_kwargs: Additional metadata fields

        Returns:
            Document instance
        """
        metadata = DocumentMetadata(
            title=title,
            source_type=source_type,
            **metadata_kwargs,
        )

        # Build pages with character offsets
        doc_pages = []
        char_offset = 0

        for i, page_text in enumerate(pages):
            doc_pages.append(DocumentPage(
                page_number=i + 1,  # 1-indexed
                text=page_text,
                char_offset=char_offset,
            ))
            char_offset += len(page_text)
            if i < len(pages) - 1:
                char_offset += 1  # Account for newline between pages

        # Join pages with newlines for full text
        full_text = "\n".join(pages)

        return cls(
            metadata=metadata,
            pages=doc_pages,
            full_text=full_text,
        )

    def get_page_at_char(self, char_offset: int) -> Optional[int]:
        """
        Get the page number containing a character offset.

        Args:
            char_offset: Character offset in full_text

        Returns:
            1-indexed page number, or None if no pages defined
        """
        if not self.pages:
            return None

        for page in self.pages:
            if page.char_offset <= char_offset < page.char_end:
                return page.page_number

        # If past the last page, return last page
        if char_offset >= self.pages[-1].char_end:
            return self.pages[-1].page_number

        return None

    def get_pages_in_range(self, start_char: int, end_char: int) -> list[int]:
        """
        Get all page numbers that overlap with a character range.

        Args:
            start_char: Start character offset
            end_char: End character offset

        Returns:
            List of 1-indexed page numbers
        """
        if not self.pages:
            return []

        page_numbers = []
        for page in self.pages:
            # Check if page overlaps with range
            if page.char_offset < end_char and page.char_end > start_char:
                page_numbers.append(page.page_number)

        return page_numbers

    @property
    def page_count(self) -> int:
        """Get the number of pages in the document."""
        return len(self.pages)

    @property
    def char_count(self) -> int:
        """Get the total character count."""
        return len(self.full_text)
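Again as an illustrative sketch (not part of the diff): the page-tracking helpers defined above can be exercised roughly as follows; the import path is assumed from the file list, and the document contents are hypothetical.

```python
# Illustrative sketch only -- import path and values assumed, not from the diff.
from statement_extractor.models.document import Document

doc = Document.from_pages(
    ["First page text.", "Second page text."],
    title="Annual Report",
    year=2024,                    # forwarded to DocumentMetadata via **metadata_kwargs
    authors=["Example Corp"],
)
print(doc.page_count)             # 2
print(doc.get_page_at_char(20))   # 2 -- offset 20 falls on the second page
print(doc.get_pages_in_range(0, 25))   # [1, 2]
print(doc.metadata.format_citation(page_number=2))
# -> "Annual Report - Example Corp - 2024 - p. 2"
```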
statement_extractor/models/entity.py (new file)
@@ -0,0 +1,102 @@
"""
Entity models for the extraction pipeline.

ExtractedEntity represents entities identified during extraction with
confidence scores and span information.

Note: EntityType is imported from the original models.py for consistency.
"""

from typing import Optional, TYPE_CHECKING
import uuid

from pydantic import BaseModel, Field

# Import EntityType from parent module to avoid duplication
# This will be populated by __init__.py which loads from old models.py
if TYPE_CHECKING:
    from enum import Enum

    class EntityType(str, Enum):
        """Supported entity types for subjects and objects."""
        ORG = "ORG"
        PERSON = "PERSON"
        GPE = "GPE"
        LOC = "LOC"
        PRODUCT = "PRODUCT"
        EVENT = "EVENT"
        WORK_OF_ART = "WORK_OF_ART"
        LAW = "LAW"
        DATE = "DATE"
        MONEY = "MONEY"
        PERCENT = "PERCENT"
        QUANTITY = "QUANTITY"
        UNKNOWN = "UNKNOWN"
else:
    # At runtime, we need to import it from somewhere
    # Try the old models.py location first
    try:
        import importlib.util
        from pathlib import Path
        _models_py_path = Path(__file__).parent.parent / "models.py"
        _spec = importlib.util.spec_from_file_location("_old_models", _models_py_path)
        _old_models = importlib.util.module_from_spec(_spec)
        _spec.loader.exec_module(_old_models)
        EntityType = _old_models.EntityType
    except Exception:
        # Fallback to defining it here
        from enum import Enum

        class EntityType(str, Enum):
            """Supported entity types for subjects and objects."""
            ORG = "ORG"
            PERSON = "PERSON"
            GPE = "GPE"
            LOC = "LOC"
            PRODUCT = "PRODUCT"
            EVENT = "EVENT"
            WORK_OF_ART = "WORK_OF_ART"
            LAW = "LAW"
            DATE = "DATE"
            MONEY = "MONEY"
            PERCENT = "PERCENT"
            QUANTITY = "QUANTITY"
            UNKNOWN = "UNKNOWN"


class ExtractedEntity(BaseModel):
    """
    An entity extracted from text with type and confidence information.

    Used in Stage 2 (Extraction) and flows through subsequent stages.
    """
    text: str = Field(..., description="The entity text as extracted")
    type: EntityType = Field(default=EntityType.UNKNOWN, description="The entity type")
    span: Optional[tuple[int, int]] = Field(
        None,
        description="Character offsets (start, end) in source text"
    )
    confidence: float = Field(
        default=1.0,
        ge=0.0,
        le=1.0,
        description="Confidence score for this entity extraction"
    )
    entity_ref: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Unique reference ID for tracking this entity through the pipeline"
    )

    def __str__(self) -> str:
        return f"{self.text} ({self.type.value})"

    def __hash__(self) -> int:
        return hash(self.entity_ref)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ExtractedEntity):
            return False
        return self.entity_ref == other.entity_ref

    class Config:
        frozen = False  # Allow modification during pipeline stages
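For completeness, a minimal usage sketch of the `ExtractedEntity` model above (not part of the diff; import path inferred from the file list, values hypothetical).

```python
# Illustrative sketch only -- import path and values assumed, not from the diff.
from statement_extractor.models.entity import ExtractedEntity, EntityType

entity = ExtractedEntity(
    text="Acme Corp",
    type=EntityType.ORG,
    span=(10, 19),
    confidence=0.93,
)
print(entity)              # "Acme Corp (ORG)"

# entity_ref is a fresh UUID per instance, so equality and hashing track
# pipeline identity rather than surface text:
other = ExtractedEntity(text="Acme Corp", type=EntityType.ORG)
print(entity == other)     # False
```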