corp-extractor 0.5.0-py3-none-any.whl → 0.9.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
- corp_extractor-0.9.0.dist-info/RECORD +76 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +1227 -10
- statement_extractor/data/statement_taxonomy.json +6949 -1159
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +520 -0
- statement_extractor/database/importers/__init__.py +24 -0
- statement_extractor/database/importers/companies_house.py +545 -0
- statement_extractor/database/importers/gleif.py +538 -0
- statement_extractor/database/importers/sec_edgar.py +375 -0
- statement_extractor/database/importers/wikidata.py +1012 -0
- statement_extractor/database/importers/wikidata_people.py +632 -0
- statement_extractor/database/models.py +230 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +1609 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +173 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/models/__init__.py +16 -1
- statement_extractor/models/canonical.py +44 -1
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/labels.py +47 -18
- statement_extractor/models/qualifiers.py +51 -3
- statement_extractor/models/statement.py +26 -0
- statement_extractor/pipeline/config.py +6 -11
- statement_extractor/pipeline/orchestrator.py +80 -111
- statement_extractor/pipeline/registry.py +52 -46
- statement_extractor/plugins/__init__.py +20 -8
- statement_extractor/plugins/base.py +334 -64
- statement_extractor/plugins/extractors/gliner2.py +10 -0
- statement_extractor/plugins/labelers/taxonomy.py +18 -5
- statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +11 -0
- statement_extractor/plugins/qualifiers/companies_house.py +14 -3
- statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
- statement_extractor/plugins/qualifiers/gleif.py +14 -3
- statement_extractor/plugins/qualifiers/person.py +578 -14
- statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/t5_gemma.py +158 -53
- statement_extractor/plugins/taxonomy/embedding.py +193 -46
- statement_extractor/plugins/taxonomy/mnli.py +16 -4
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.5.0.dist-info/RECORD +0 -55
- statement_extractor/plugins/canonicalizers/__init__.py +0 -17
- statement_extractor/plugins/canonicalizers/base.py +0 -9
- statement_extractor/plugins/canonicalizers/location.py +0 -219
- statement_extractor/plugins/canonicalizers/organization.py +0 -230
- statement_extractor/plugins/canonicalizers/person.py +0 -242
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
statement_extractor/document/summarizer.py (new file)

@@ -0,0 +1,195 @@
+"""
+DocumentSummarizer - Generate document summaries using Gemma3.
+
+Creates concise summaries focused on entities, events, and relationships
+that are useful for providing context during extraction.
+"""
+
+import logging
+from typing import Optional
+
+from ..models.document import Document
+
+logger = logging.getLogger(__name__)
+
+
+class DocumentSummarizer:
+    """
+    Generates document summaries using the Gemma3 LLM.
+
+    Summaries focus on:
+    - Key entities mentioned
+    - Important events and actions
+    - Relationships between entities
+    """
+
+    MAX_INPUT_TOKENS = 10_000
+    DEFAULT_MAX_OUTPUT_TOKENS = 300
+
+    def __init__(
+        self,
+        max_input_tokens: int = MAX_INPUT_TOKENS,
+        max_output_tokens: int = DEFAULT_MAX_OUTPUT_TOKENS,
+    ):
+        """
+        Initialize the summarizer.
+
+        Args:
+            max_input_tokens: Maximum tokens of input to send to the LLM
+            max_output_tokens: Maximum tokens for the summary output
+        """
+        self._max_input_tokens = max_input_tokens
+        self._max_output_tokens = max_output_tokens
+        self._llm = None
+        self._tokenizer = None
+
+    @property
+    def llm(self):
+        """Lazy-load the LLM."""
+        if self._llm is None:
+            from ..llm import get_llm
+            logger.debug("Loading LLM for summarization")
+            self._llm = get_llm()
+        return self._llm
+
+    @property
+    def tokenizer(self):
+        """Lazy-load tokenizer for token counting."""
+        if self._tokenizer is None:
+            from transformers import AutoTokenizer
+            self._tokenizer = AutoTokenizer.from_pretrained(
+                "Corp-o-Rate-Community/statement-extractor",
+                trust_remote_code=True,
+            )
+        return self._tokenizer
+
+    def _count_tokens(self, text: str) -> int:
+        """Count tokens in text."""
+        return len(self.tokenizer.encode(text, add_special_tokens=False))
+
+    def _truncate_to_tokens(self, text: str, max_tokens: int) -> str:
+        """
+        Truncate text to a maximum number of tokens.
+
+        Tries to truncate at sentence boundaries when possible.
+        """
+        token_count = self._count_tokens(text)
+
+        if token_count <= max_tokens:
+            return text
+
+        # Estimate chars per token
+        chars_per_token = len(text) / token_count
+        target_chars = int(max_tokens * chars_per_token * 0.95)  # 5% buffer
+
+        # Truncate
+        truncated = text[:target_chars]
+
+        # Try to end at a sentence boundary
+        last_period = truncated.rfind(". ")
+        last_newline = truncated.rfind("\n")
+        split_pos = max(last_period, last_newline)
+
+        if split_pos > target_chars * 0.7:  # Don't lose too much text
+            truncated = truncated[:split_pos + 1]
+
+        logger.debug(f"Truncated text from {len(text)} to {len(truncated)} chars")
+        return truncated
+
+    def summarize(
+        self,
+        document: Document,
+        custom_prompt: Optional[str] = None,
+    ) -> str:
+        """
+        Generate a summary of the document.
+
+        Args:
+            document: Document to summarize
+            custom_prompt: Optional custom prompt (uses default if not provided)
+
+        Returns:
+            Summary string
+        """
+        if not document.full_text.strip():
+            logger.warning("Cannot summarize empty document")
+            return ""
+
+        logger.info(f"Generating summary for document {document.document_id}")
+
+        # Truncate text to max input tokens
+        text = self._truncate_to_tokens(document.full_text, self._max_input_tokens)
+
+        # Build prompt
+        if custom_prompt:
+            prompt = f"{custom_prompt}\n\n{text}"
+        else:
+            prompt = self._build_prompt(text, document)
+
+        # Generate summary
+        try:
+            summary = self.llm.generate(
+                prompt=prompt,
+                max_tokens=self._max_output_tokens,
+                stop=["\n\n\n", "---"],
+            )
+            summary = summary.strip()
+            logger.info(f"Generated summary ({len(summary)} chars):")
+            # Log summary with indentation for readability
+            for line in summary.split("\n"):
+                logger.info(f"  {line}")
+            return summary
+
+        except Exception as e:
+            logger.error(f"Summary generation failed: {e}")
+            raise
+
+    def _build_prompt(self, text: str, document: Document) -> str:
+        """Build the summarization prompt."""
+        # Include document metadata context if available
+        context_parts = []
+        if document.metadata.title:
+            context_parts.append(f"Title: {document.metadata.title}")
+        if document.metadata.authors:
+            context_parts.append(f"Authors: {', '.join(document.metadata.authors)}")
+        if document.metadata.source_type:
+            context_parts.append(f"Source type: {document.metadata.source_type}")
+
+        context = "\n".join(context_parts) if context_parts else ""
+
+        prompt = f"""Summarize the following document, focusing on:
+1. Key entities (companies, people, locations) mentioned
+2. Important events, actions, and decisions
+3. Relationships between entities
+4. Main topics and themes
+
+Keep the summary concise (2-3 paragraphs) and factual.
+
+{context}
+
+Document text:
+{text}
+
+Summary:"""
+
+        return prompt
+
+    def summarize_text(
+        self,
+        text: str,
+        title: Optional[str] = None,
+    ) -> str:
+        """
+        Generate a summary from plain text.
+
+        Convenience method that creates a temporary Document.
+
+        Args:
+            text: Text to summarize
+            title: Optional document title for context
+
+        Returns:
+            Summary string
+        """
+        document = Document.from_text(text, title=title)
+        return self.summarize(document)
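For orientation, a minimal usage sketch of the new summarizer. It assumes the package's default LLM backend (loaded via get_llm()) is configured and that DocumentSummarizer is re-exported from statement_extractor.document by the new document/__init__.py added in this release; the sample text and title are invented for illustration.

from statement_extractor.document import DocumentSummarizer

summarizer = DocumentSummarizer(max_output_tokens=200)

# Convenience path: wraps the text in a Document and summarizes it
summary = summarizer.summarize_text(
    "Acme Corp announced the acquisition of Widget Ltd in March 2024 ...",
    title="Acme Corp press release",
)
print(summary)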
statement_extractor/models/__init__.py

@@ -44,9 +44,16 @@ else:
     # New pipeline models
     from .entity import ExtractedEntity
     from .statement import RawTriple, PipelineStatement
-    from .qualifiers import EntityQualifiers, QualifiedEntity
+    from .qualifiers import EntityQualifiers, QualifiedEntity, ResolvedRole, ResolvedOrganization
     from .canonical import CanonicalMatch, CanonicalEntity
     from .labels import StatementLabel, LabeledStatement, TaxonomyResult
+    from .document import (
+        Document,
+        DocumentMetadata,
+        DocumentPage,
+        TextChunk,
+        ChunkingConfig,
+    )

 __all__ = [
     # Re-exported from original models.py (backward compatibility)
@@ -66,9 +73,17 @@ __all__ = [
     "PipelineStatement",
     "EntityQualifiers",
     "QualifiedEntity",
+    "ResolvedRole",
+    "ResolvedOrganization",
     "CanonicalMatch",
     "CanonicalEntity",
     "StatementLabel",
     "LabeledStatement",
     "TaxonomyResult",
+    # Document models
+    "Document",
+    "DocumentMetadata",
+    "DocumentPage",
+    "TextChunk",
+    "ChunkingConfig",
 ]
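For downstream code, the practical effect is that the document models (and the new ResolvedRole / ResolvedOrganization qualifier types) become importable directly from statement_extractor.models alongside the existing pipeline models; a small sketch using names from the updated __all__ above:

from statement_extractor.models import ChunkingConfig, Document

doc = Document.from_text("Example text", title="Example")
chunking = ChunkingConfig(target_tokens=500, overlap_tokens=50)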
statement_extractor/models/canonical.py

@@ -64,9 +64,52 @@ class CanonicalEntity(BaseModel):
     )
     fqn: str = Field(
         ...,
-        description="Fully qualified name, e.g., '
+        description="Fully qualified name, e.g., 'AMAZON CORP INC (SEC-CIK,USA)'"
     )

+    @property
+    def name(self) -> Optional[str]:
+        """Get the canonical/legal name if available."""
+        # Prefer legal_name from qualifiers (set by embedding qualifier)
+        if self.qualified_entity.qualifiers.legal_name:
+            return self.qualified_entity.qualifiers.legal_name
+        # Fall back to canonical match name
+        if self.canonical_match and self.canonical_match.canonical_name:
+            return self.canonical_match.canonical_name
+        return None
+
+    @property
+    def qualifiers_dict(self) -> Optional[dict[str, str]]:
+        """
+        Get qualifiers as a dict for serialization.
+
+        Returns a dict with keys like: legal_name, region, source, source_id
+        Only returns non-None values. Returns None if no qualifiers are set.
+        """
+        qualifiers = self.qualified_entity.qualifiers
+        identifiers = qualifiers.identifiers
+        result = {}
+
+        # Add legal name
+        if qualifiers.legal_name:
+            result["legal_name"] = qualifiers.legal_name
+
+        # Add region (prefer region, fall back to jurisdiction/country)
+        if qualifiers.region:
+            result["region"] = qualifiers.region
+        elif qualifiers.jurisdiction:
+            result["region"] = qualifiers.jurisdiction
+        elif qualifiers.country:
+            result["region"] = qualifiers.country
+
+        # Add source and source_id from identifiers
+        if "source" in identifiers:
+            result["source"] = identifiers["source"]
+        if "source_id" in identifiers:
+            result["source_id"] = identifiers["source_id"]
+
+        return result if result else None
+
     @classmethod
     def from_qualified(
         cls,
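The two new properties give serialization code a single place to pull a display name and a compact qualifier dict from. A hedged sketch of how they behave (the entity and its values are illustrative, not taken from the package):

# `entity` is a CanonicalEntity produced by the qualifier/canonicalization stages
entity.name             # legal_name from qualifiers if set, else canonical_match.canonical_name, else None
entity.qualifiers_dict  # e.g. {"legal_name": "...", "region": "...", "source": "...", "source_id": "..."}, or None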
statement_extractor/models/document.py (new file)

@@ -0,0 +1,308 @@
+"""
+Document models for document-level processing.
+
+Document: A document with metadata, pages, and optional summary
+DocumentMetadata: Metadata about the document source
+DocumentPage: A single page within a document
+TextChunk: A chunk of text for processing with page tracking
+ChunkingConfig: Configuration for text chunking
+"""
+
+import uuid
+from typing import Any, Optional
+
+from pydantic import BaseModel, Field
+
+
+class DocumentMetadata(BaseModel):
+    """
+    Metadata about a document source.
+
+    Contains information about where the document came from and
+    who authored it, useful for generating citations.
+    """
+    url: Optional[str] = Field(None, description="URL source of the document")
+    title: Optional[str] = Field(None, description="Document title")
+    year: Optional[int] = Field(None, description="Publication year")
+    authors: list[str] = Field(default_factory=list, description="List of authors")
+    source_type: Optional[str] = Field(
+        None,
+        description="Type of source: 'pdf', 'webpage', 'text', etc."
+    )
+    custom: dict[str, Any] = Field(
+        default_factory=dict,
+        description="Custom metadata fields"
+    )
+
+    def format_citation(self, page_number: Optional[int] = None) -> str:
+        """
+        Format a citation string for this document.
+
+        Args:
+            page_number: Optional page number to include
+
+        Returns:
+            Citation string like "Title - Author, 2024, p. 5"
+        """
+        parts = []
+
+        if self.title:
+            parts.append(self.title)
+
+        if self.authors:
+            if len(self.authors) == 1:
+                parts.append(self.authors[0])
+            elif len(self.authors) == 2:
+                parts.append(f"{self.authors[0]} & {self.authors[1]}")
+            else:
+                parts.append(f"{self.authors[0]} et al.")
+
+        if self.year:
+            parts.append(str(self.year))
+
+        if page_number is not None:
+            parts.append(f"p. {page_number}")
+
+        return " - ".join(parts) if parts else ""
+
+
+class DocumentPage(BaseModel):
+    """
+    A single page within a document.
+
+    Tracks the page number and character offset for citation purposes.
+    """
+    page_number: int = Field(..., description="1-indexed page number")
+    text: str = Field(..., description="Text content of the page")
+    char_offset: int = Field(
+        ...,
+        description="Character offset of this page in the full document text"
+    )
+
+    @property
+    def char_end(self) -> int:
+        """Get the ending character offset of this page."""
+        return self.char_offset + len(self.text)
+
+
+class TextChunk(BaseModel):
+    """
+    A chunk of text for processing.
+
+    Contains the text along with position tracking for mapping
+    extracted statements back to their source pages.
+    """
+    chunk_index: int = Field(..., description="0-indexed chunk number")
+    text: str = Field(..., description="Chunk text content")
+    start_char: int = Field(..., description="Starting character offset in full document")
+    end_char: int = Field(..., description="Ending character offset in full document")
+    page_numbers: list[int] = Field(
+        default_factory=list,
+        description="Page numbers this chunk spans (1-indexed)"
+    )
+    token_count: int = Field(..., description="Number of tokens in this chunk")
+    overlap_chars: int = Field(
+        default=0,
+        description="Number of characters of overlap from previous chunk"
+    )
+    document_id: str = Field(..., description="ID of the source document")
+
+    @property
+    def primary_page(self) -> Optional[int]:
+        """Get the primary page number for this chunk (first page)."""
+        return self.page_numbers[0] if self.page_numbers else None
+
+
+class ChunkingConfig(BaseModel):
+    """
+    Configuration for document chunking.
+
+    Controls how documents are split into chunks for processing.
+    """
+    max_tokens: int = Field(
+        default=2000,
+        ge=100,
+        description="Maximum tokens per chunk (hard limit)"
+    )
+    target_tokens: int = Field(
+        default=1000,
+        ge=50,
+        description="Target tokens per chunk (soft limit, prefers to split here)"
+    )
+    overlap_tokens: int = Field(
+        default=100,
+        ge=0,
+        description="Tokens of overlap between consecutive chunks"
+    )
+    respect_page_boundaries: bool = Field(
+        default=True,
+        description="Try to split at page boundaries when possible"
+    )
+    respect_sentence_boundaries: bool = Field(
+        default=True,
+        description="Try to split at sentence boundaries when possible"
+    )
+
+
+class Document(BaseModel):
+    """
+    A document for processing through the extraction pipeline.
+
+    Contains the full text, optional page structure, metadata for citations,
+    and an optional summary for context.
+    """
+    document_id: str = Field(
+        default_factory=lambda: str(uuid.uuid4()),
+        description="Unique identifier for this document"
+    )
+    metadata: DocumentMetadata = Field(
+        default_factory=DocumentMetadata,
+        description="Document metadata for citations"
+    )
+    pages: list[DocumentPage] = Field(
+        default_factory=list,
+        description="List of pages (optional, for PDFs)"
+    )
+    full_text: str = Field(
+        default="",
+        description="Full text content of the document"
+    )
+    summary: Optional[str] = Field(
+        None,
+        description="Generated summary of the document"
+    )
+
+    @classmethod
+    def from_text(
+        cls,
+        text: str,
+        title: Optional[str] = None,
+        url: Optional[str] = None,
+        **metadata_kwargs,
+    ) -> "Document":
+        """
+        Create a document from plain text.
+
+        Args:
+            text: The document text
+            title: Optional document title
+            url: Optional source URL
+            **metadata_kwargs: Additional metadata fields
+
+        Returns:
+            Document instance
+        """
+        metadata = DocumentMetadata(
+            title=title,
+            url=url,
+            source_type="text",
+            **metadata_kwargs,
+        )
+        return cls(
+            metadata=metadata,
+            full_text=text,
+        )
+
+    @classmethod
+    def from_pages(
+        cls,
+        pages: list[str],
+        title: Optional[str] = None,
+        source_type: str = "pdf",
+        **metadata_kwargs,
+    ) -> "Document":
+        """
+        Create a document from a list of page texts.
+
+        Args:
+            pages: List of page text strings (0-indexed input, stored as 1-indexed)
+            title: Optional document title
+            source_type: Source type (default: "pdf")
+            **metadata_kwargs: Additional metadata fields
+
+        Returns:
+            Document instance
+        """
+        metadata = DocumentMetadata(
+            title=title,
+            source_type=source_type,
+            **metadata_kwargs,
+        )
+
+        # Build pages with character offsets
+        doc_pages = []
+        char_offset = 0
+
+        for i, page_text in enumerate(pages):
+            doc_pages.append(DocumentPage(
+                page_number=i + 1,  # 1-indexed
+                text=page_text,
+                char_offset=char_offset,
+            ))
+            char_offset += len(page_text)
+            if i < len(pages) - 1:
+                char_offset += 1  # Account for newline between pages
+
+        # Join pages with newlines for full text
+        full_text = "\n".join(pages)
+
+        return cls(
+            metadata=metadata,
+            pages=doc_pages,
+            full_text=full_text,
+        )
+
+    def get_page_at_char(self, char_offset: int) -> Optional[int]:
+        """
+        Get the page number containing a character offset.
+
+        Args:
+            char_offset: Character offset in full_text
+
+        Returns:
+            1-indexed page number, or None if no pages defined
+        """
+        if not self.pages:
+            return None
+
+        for page in self.pages:
+            if page.char_offset <= char_offset < page.char_end:
+                return page.page_number
+
+        # If past the last page, return last page
+        if char_offset >= self.pages[-1].char_end:
+            return self.pages[-1].page_number
+
+        return None
+
+    def get_pages_in_range(self, start_char: int, end_char: int) -> list[int]:
+        """
+        Get all page numbers that overlap with a character range.
+
+        Args:
+            start_char: Start character offset
+            end_char: End character offset
+
+        Returns:
+            List of 1-indexed page numbers
+        """
+        if not self.pages:
+            return []
+
+        page_numbers = []
+        for page in self.pages:
+            # Check if page overlaps with range
+            if page.char_offset < end_char and page.char_end > start_char:
+                page_numbers.append(page.page_number)
+
+        return page_numbers
+
+    @property
+    def page_count(self) -> int:
+        """Get the number of pages in the document."""
+        return len(self.pages)
+
+    @property
+    def char_count(self) -> int:
+        """Get the total character count."""
+        return len(self.full_text)
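A short sketch of the page-tracking and citation helpers defined above (page texts and metadata are invented; offsets follow the newline-joining rule in from_pages):

from statement_extractor.models import Document

doc = Document.from_pages(
    ["First page text.", "Second page text."],
    title="Annual Report",
    year=2024,
    authors=["Acme Corp"],
)

doc.page_count                                # 2
doc.get_page_at_char(20)                      # 2  (offset 20 falls on page 2, which starts at char 17)
doc.get_pages_in_range(0, 25)                 # [1, 2]
doc.metadata.format_citation(page_number=2)   # "Annual Report - Acme Corp - 2024 - p. 2"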
statement_extractor/models/labels.py

@@ -72,6 +72,19 @@ class LabeledStatement(BaseModel):
         default_factory=list,
         description="Taxonomy classifications from Stage 6"
     )
+    # Document tracking fields
+    document_id: Optional[str] = Field(
+        None,
+        description="ID of the source document (for document pipeline)"
+    )
+    page_number: Optional[int] = Field(
+        None,
+        description="Page number where this statement was extracted (1-indexed)"
+    )
+    citation: Optional[str] = Field(
+        None,
+        description="Formatted citation string (e.g., 'Title - Author, 2024, p. 5')"
+    )

     def get_label(self, label_type: str) -> Optional[StatementLabel]:
         """Get a label by type, or None if not found."""
@@ -102,28 +115,41 @@ class LabeledStatement(BaseModel):
         """Format as FQN triple."""
         return f"{self.subject_fqn} --[{self.statement.predicate}]--> {self.object_fqn}"

+    def _build_entity_dict(self, canonical: CanonicalEntity, entity_type: str) -> dict:
+        """Build entity dict for serialization."""
+        statement_entity = self.statement.subject if entity_type == "subject" else self.statement.object
+        fqn = self.subject_fqn if entity_type == "subject" else self.object_fqn
+
+        # Get canonical_id from identifiers or canonical_match
+        identifiers = canonical.qualified_entity.qualifiers.identifiers
+        canonical_id = identifiers.get("canonical_id")
+        if not canonical_id and canonical.canonical_match:
+            canonical_id = canonical.canonical_match.canonical_id
+
+        result = {
+            "text": statement_entity.text,
+            "type": statement_entity.type.value,
+            "fqn": fqn,
+            "canonical_id": canonical_id,
+        }
+
+        # Add name if available
+        if canonical.name:
+            result["name"] = canonical.name
+
+        # Add qualifiers if available
+        qualifiers_dict = canonical.qualifiers_dict
+        if qualifiers_dict:
+            result["qualifiers"] = qualifiers_dict
+
+        return result
+
     def as_dict(self) -> dict:
         """Convert to a simplified dictionary representation."""
         return {
-            "subject": {
-                "text": self.statement.subject.text,
-                "type": self.statement.subject.type.value,
-                "fqn": self.subject_fqn,
-                "canonical_id": (
-                    self.subject_canonical.canonical_match.canonical_id
-                    if self.subject_canonical.canonical_match else None
-                ),
-            },
+            "subject": self._build_entity_dict(self.subject_canonical, "subject"),
             "predicate": self.statement.predicate,
-            "object": {
-                "text": self.statement.object.text,
-                "type": self.statement.object.type.value,
-                "fqn": self.object_fqn,
-                "canonical_id": (
-                    self.object_canonical.canonical_match.canonical_id
-                    if self.object_canonical.canonical_match else None
-                ),
-            },
+            "object": self._build_entity_dict(self.object_canonical, "object"),
             "source_text": self.statement.source_text,
             "labels": {
                 label.label_type: label.label_value
@@ -137,6 +163,9 @@ class LabeledStatement(BaseModel):
                 }
                 for t in self.taxonomy_results
            ],
+            "document_id": self.document_id,
+            "page_number": self.page_number,
+            "citation": self.citation,
        }

    class Config: