corp-extractor 0.4.0-py3-none-any.whl → 0.9.0-py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
- corp_extractor-0.9.0.dist-info/RECORD +76 -0
- statement_extractor/__init__.py +10 -1
- statement_extractor/cli.py +1663 -17
- statement_extractor/data/default_predicates.json +368 -0
- statement_extractor/data/statement_taxonomy.json +6972 -0
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +520 -0
- statement_extractor/database/importers/__init__.py +24 -0
- statement_extractor/database/importers/companies_house.py +545 -0
- statement_extractor/database/importers/gleif.py +538 -0
- statement_extractor/database/importers/sec_edgar.py +375 -0
- statement_extractor/database/importers/wikidata.py +1012 -0
- statement_extractor/database/importers/wikidata_people.py +632 -0
- statement_extractor/database/models.py +230 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +1609 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +173 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/extractor.py +1 -23
- statement_extractor/gliner_extraction.py +4 -74
- statement_extractor/llm.py +255 -0
- statement_extractor/models/__init__.py +89 -0
- statement_extractor/models/canonical.py +182 -0
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/entity.py +102 -0
- statement_extractor/models/labels.py +220 -0
- statement_extractor/models/qualifiers.py +139 -0
- statement_extractor/models/statement.py +101 -0
- statement_extractor/models.py +4 -1
- statement_extractor/pipeline/__init__.py +39 -0
- statement_extractor/pipeline/config.py +129 -0
- statement_extractor/pipeline/context.py +177 -0
- statement_extractor/pipeline/orchestrator.py +416 -0
- statement_extractor/pipeline/registry.py +303 -0
- statement_extractor/plugins/__init__.py +55 -0
- statement_extractor/plugins/base.py +716 -0
- statement_extractor/plugins/extractors/__init__.py +13 -0
- statement_extractor/plugins/extractors/base.py +9 -0
- statement_extractor/plugins/extractors/gliner2.py +546 -0
- statement_extractor/plugins/labelers/__init__.py +29 -0
- statement_extractor/plugins/labelers/base.py +9 -0
- statement_extractor/plugins/labelers/confidence.py +138 -0
- statement_extractor/plugins/labelers/relation_type.py +87 -0
- statement_extractor/plugins/labelers/sentiment.py +159 -0
- statement_extractor/plugins/labelers/taxonomy.py +386 -0
- statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +30 -0
- statement_extractor/plugins/qualifiers/base.py +9 -0
- statement_extractor/plugins/qualifiers/companies_house.py +185 -0
- statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
- statement_extractor/plugins/qualifiers/gleif.py +197 -0
- statement_extractor/plugins/qualifiers/person.py +785 -0
- statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/__init__.py +13 -0
- statement_extractor/plugins/splitters/base.py +9 -0
- statement_extractor/plugins/splitters/t5_gemma.py +293 -0
- statement_extractor/plugins/taxonomy/__init__.py +13 -0
- statement_extractor/plugins/taxonomy/embedding.py +484 -0
- statement_extractor/plugins/taxonomy/mnli.py +291 -0
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.4.0.dist-info/RECORD +0 -12
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0

statement_extractor/document/__init__.py
@@ -0,0 +1,62 @@
+"""
+Document processing module for statement extraction.
+
+Provides document-level features including:
+- Text chunking with page awareness
+- Statement deduplication across chunks
+- Document summarization
+- Citation generation
+
+Example:
+    >>> from statement_extractor.document import DocumentPipeline, Document
+    >>>
+    >>> pipeline = DocumentPipeline()
+    >>> document = Document.from_text("Your document text...", title="Report 2024")
+    >>> ctx = pipeline.process(document)
+    >>>
+    >>> for stmt in ctx.labeled_statements:
+    ...     print(f"{stmt.subject_fqn} -> {stmt.object_fqn}")
+    ...     print(f" Citation: {stmt.citation}")
+"""
+
+# Re-export document models for convenience
+from ..models.document import (
+    ChunkingConfig,
+    Document,
+    DocumentMetadata,
+    DocumentPage,
+    TextChunk,
+)
+from .chunker import DocumentChunker
+from .context import DocumentContext
+from .deduplicator import StatementDeduplicator
+from .html_extractor import extract_text_from_html, extract_article_content
+from .loader import URLLoader, URLLoaderConfig, load_url, load_url_sync
+from .pipeline import DocumentPipeline, DocumentPipelineConfig
+from .summarizer import DocumentSummarizer
+
+__all__ = [
+    # Pipeline
+    "DocumentPipeline",
+    "DocumentPipelineConfig",
+    # Context
+    "DocumentContext",
+    # Components
+    "DocumentChunker",
+    "StatementDeduplicator",
+    "DocumentSummarizer",
+    # URL loading
+    "URLLoader",
+    "URLLoaderConfig",
+    "load_url",
+    "load_url_sync",
+    # HTML extraction
+    "extract_text_from_html",
+    "extract_article_content",
+    # Models (re-exported)
+    "Document",
+    "DocumentMetadata",
+    "DocumentPage",
+    "TextChunk",
+    "ChunkingConfig",
+]
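
For orientation, the sketch below strings the re-exported names together into one flow. Document.from_text, DocumentPipeline.process, and the statement fields come from the module docstring above; the load_url_sync call shape (URL in, Document out) is an assumption based only on the exported name and is left commented out.

# Hedged sketch using only names re-exported by statement_extractor.document.
# The load_url_sync(url) -> Document shape is an assumption; only the name is
# visible in this diff. Everything else follows the module docstring above.
from statement_extractor.document import Document, DocumentPipeline, load_url_sync

pipeline = DocumentPipeline()

# Build a Document directly from text (signature shown in the docstring)...
document = Document.from_text("Acme Corp acquired Widget Ltd in 2024.", title="Example filing")

# ...or, hypothetically, load one from a URL via the re-exported helper:
# document = load_url_sync("https://example.com/press-release")

ctx = pipeline.process(document)
for stmt in ctx.labeled_statements:
    print(f"{stmt.subject_fqn} -> {stmt.object_fqn}")
    print(f"  Citation: {stmt.citation}")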
statement_extractor/document/chunker.py
@@ -0,0 +1,410 @@
+"""
+DocumentChunker - Token-aware text chunking for document processing.
+
+Splits documents into chunks suitable for the extraction pipeline while
+maintaining page and sentence boundary awareness.
+"""
+
+import logging
+import re
+from typing import Optional
+
+from ..models.document import ChunkingConfig, Document, TextChunk
+
+logger = logging.getLogger(__name__)
+
+
+class DocumentChunker:
+    """
+    Chunks documents into processable text segments.
+
+    Uses the T5-Gemma tokenizer for accurate token counting and supports:
+    - Page boundary awareness
+    - Sentence boundary splitting
+    - Configurable overlap between chunks
+    """
+
+    def __init__(self, config: Optional[ChunkingConfig] = None):
+        """
+        Initialize the chunker.
+
+        Args:
+            config: Chunking configuration (uses defaults if not provided)
+        """
+        self._config = config or ChunkingConfig()
+        self._tokenizer = None
+
+    @property
+    def tokenizer(self):
+        """Lazy-load the tokenizer from the T5-Gemma model."""
+        if self._tokenizer is None:
+            from transformers import AutoTokenizer
+            logger.debug("Loading T5-Gemma tokenizer for chunking")
+            self._tokenizer = AutoTokenizer.from_pretrained(
+                "Corp-o-Rate-Community/statement-extractor",
+                trust_remote_code=True,
+            )
+        return self._tokenizer
+
+    def count_tokens(self, text: str) -> int:
+        """
+        Count the number of tokens in a text string.
+
+        Args:
+            text: Text to count tokens for
+
+        Returns:
+            Number of tokens
+        """
+        if not text:
+            return 0
+        return len(self.tokenizer.encode(text, add_special_tokens=False))
+
+    def chunk_document(self, document: Document) -> list[TextChunk]:
+        """
+        Chunk a document into text segments.
+
+        Args:
+            document: Document to chunk
+
+        Returns:
+            List of TextChunk objects
+        """
+        if not document.full_text:
+            return []
+
+        logger.info(f"Chunking document {document.document_id}: {document.char_count} chars")
+
+        # If document has pages and we respect page boundaries, use page-aware chunking
+        if document.pages and self._config.respect_page_boundaries:
+            chunks = self._chunk_with_pages(document)
+        else:
+            chunks = self._chunk_text(
+                text=document.full_text,
+                document_id=document.document_id,
+                page_getter=document.get_pages_in_range if document.pages else None,
+            )
+
+        logger.info(f"Created {len(chunks)} chunks")
+        return chunks
+
+    def chunk_text(
+        self,
+        text: str,
+        document_id: str,
+    ) -> list[TextChunk]:
+        """
+        Chunk plain text (without page structure).
+
+        Args:
+            text: Text to chunk
+            document_id: Document ID to assign to chunks
+
+        Returns:
+            List of TextChunk objects
+        """
+        return self._chunk_text(text, document_id, page_getter=None)
+
+    def _chunk_with_pages(self, document: Document) -> list[TextChunk]:
+        """Chunk document respecting page boundaries."""
+        chunks = []
+        chunk_index = 0
+        current_text = ""
+        current_start = 0
+        current_pages = []
+
+        for page in document.pages:
+            page_tokens = self.count_tokens(page.text)
+
+            # Check if adding this page would exceed max_tokens
+            current_tokens = self.count_tokens(current_text)
+
+            if current_text and current_tokens + page_tokens > self._config.max_tokens:
+                # Flush current chunk
+                chunk = self._create_chunk(
+                    chunk_index=chunk_index,
+                    text=current_text,
+                    start_char=current_start,
+                    pages=current_pages,
+                    document_id=document.document_id,
+                    overlap_chars=0,
+                )
+                chunks.append(chunk)
+                chunk_index += 1
+
+                # Start new chunk with overlap from previous
+                overlap_text = self._get_overlap_text(current_text)
+                current_text = overlap_text + page.text
+                current_start = page.char_offset - len(overlap_text)
+                current_pages = [page.page_number]
+            else:
+                # Add page to current chunk
+                if current_text:
+                    current_text += "\n" + page.text
+                else:
+                    current_text = page.text
+                    current_start = page.char_offset
+                current_pages.append(page.page_number)
+
+            # If current chunk exceeds target, try to split at sentence boundary
+            current_tokens = self.count_tokens(current_text)
+            if current_tokens > self._config.target_tokens:
+                # Split within the page if it's too large
+                sub_chunks = self._split_large_text(
+                    text=current_text,
+                    start_char=current_start,
+                    pages=current_pages,
+                    chunk_index=chunk_index,
+                    document_id=document.document_id,
+                )
+                if len(sub_chunks) > 1:
+                    chunks.extend(sub_chunks[:-1])
+                    chunk_index += len(sub_chunks) - 1
+                    last_chunk = sub_chunks[-1]
+                    current_text = last_chunk.text
+                    current_start = last_chunk.start_char
+                    current_pages = last_chunk.page_numbers
+
+        # Flush remaining text
+        if current_text.strip():
+            chunk = self._create_chunk(
+                chunk_index=chunk_index,
+                text=current_text,
+                start_char=current_start,
+                pages=current_pages,
+                document_id=document.document_id,
+                overlap_chars=0,
+            )
+            chunks.append(chunk)
+
+        return chunks
+
+    def _chunk_text(
+        self,
+        text: str,
+        document_id: str,
+        page_getter: Optional[callable] = None,
+    ) -> list[TextChunk]:
+        """Chunk text without page structure."""
+        if not text.strip():
+            return []
+
+        chunks = []
+        chunk_index = 0
+        remaining_text = text
+        current_start = 0
+
+        while remaining_text:
+            # Find a good split point
+            chunk_text, chars_consumed = self._find_chunk_boundary(remaining_text)
+
+            if not chunk_text.strip():
+                break
+
+            # Get pages for this chunk if page_getter is available
+            end_char = current_start + len(chunk_text)
+            pages = page_getter(current_start, end_char) if page_getter else []
+
+            # Calculate overlap from previous chunk
+            overlap_chars = 0
+            if chunks:
+                prev_chunk = chunks[-1]
+                if current_start < prev_chunk.end_char:
+                    overlap_chars = prev_chunk.end_char - current_start
+
+            chunk = self._create_chunk(
+                chunk_index=chunk_index,
+                text=chunk_text,
+                start_char=current_start,
+                pages=pages,
+                document_id=document_id,
+                overlap_chars=overlap_chars,
+            )
+            chunks.append(chunk)
+            chunk_index += 1
+
+            # Move to next chunk with overlap
+            remaining_text = remaining_text[chars_consumed:]
+            current_start += chars_consumed
+
+            # Add overlap from the end of current chunk to start of next
+            if remaining_text:
+                overlap = self._get_overlap_text(chunk_text)
+                if overlap:
+                    remaining_text = overlap + remaining_text
+                    current_start -= len(overlap)
+
+        return chunks
+
+    def _find_chunk_boundary(self, text: str) -> tuple[str, int]:
+        """
+        Find a good boundary to split text at.
+
+        Returns:
+            Tuple of (chunk_text, chars_consumed)
+        """
+        total_tokens = self.count_tokens(text)
+
+        # If text fits in target, return it all
+        if total_tokens <= self._config.target_tokens:
+            return text, len(text)
+
+        # Binary search for the right split point
+        target_chars = self._estimate_chars_for_tokens(text, self._config.target_tokens)
+
+        if self._config.respect_sentence_boundaries:
+            # Find sentence boundary near target
+            split_pos = self._find_sentence_boundary(text, target_chars)
+        else:
+            split_pos = target_chars
+
+        # Ensure we don't exceed max tokens
+        chunk_text = text[:split_pos]
+        while self.count_tokens(chunk_text) > self._config.max_tokens and split_pos > 100:
+            split_pos = int(split_pos * 0.9)
+            if self._config.respect_sentence_boundaries:
+                split_pos = self._find_sentence_boundary(text, split_pos)
+            chunk_text = text[:split_pos]
+
+        return chunk_text, split_pos
+
+    def _estimate_chars_for_tokens(self, text: str, target_tokens: int) -> int:
+        """Estimate character count for a target token count."""
+        total_tokens = self.count_tokens(text)
+        if total_tokens == 0:
+            return len(text)
+
+        # Estimate chars per token ratio
+        chars_per_token = len(text) / total_tokens
+        return min(len(text), int(target_tokens * chars_per_token))
+
+    def _find_sentence_boundary(self, text: str, near_pos: int) -> int:
+        """Find a sentence boundary near the given position."""
+        # Look for sentence endings near the position
+        search_start = max(0, near_pos - 200)
+        search_end = min(len(text), near_pos + 200)
+        search_region = text[search_start:search_end]
+
+        # Find all sentence boundaries in the region
+        sentence_pattern = r'[.!?]+[\s"\')]*'
+        matches = list(re.finditer(sentence_pattern, search_region))
+
+        if not matches:
+            # No sentence boundary found, fall back to word boundary
+            return self._find_word_boundary(text, near_pos)
+
+        # Find the boundary closest to our target position
+        target_in_region = near_pos - search_start
+        best_match = min(matches, key=lambda m: abs(m.end() - target_in_region))
+        return search_start + best_match.end()
+
+    def _find_word_boundary(self, text: str, near_pos: int) -> int:
+        """Find a word boundary near the given position."""
+        # Look for whitespace near the position
+        search_start = max(0, near_pos - 50)
+        search_end = min(len(text), near_pos + 50)
+
+        # Prefer splitting at whitespace after the position
+        for i in range(near_pos, search_end):
+            if text[i].isspace():
+                return i + 1
+
+        # Fall back to whitespace before
+        for i in range(near_pos, search_start, -1):
+            if text[i].isspace():
+                return i + 1
+
+        # No good boundary found
+        return near_pos
+
+    def _get_overlap_text(self, text: str) -> str:
+        """Get overlap text from the end of a chunk."""
+        if self._config.overlap_tokens <= 0:
+            return ""
+
+        # Estimate characters for overlap tokens
+        target_chars = self._estimate_chars_for_tokens(
+            text[-1000:] if len(text) > 1000 else text,
+            self._config.overlap_tokens
+        )
+
+        # Get text from the end
+        overlap_text = text[-target_chars:] if target_chars < len(text) else text
+
+        # Try to start at a sentence or word boundary
+        sentence_match = re.search(r'[.!?]+[\s"\')]*', overlap_text)
+        if sentence_match:
+            overlap_text = overlap_text[sentence_match.end():]
+        else:
+            # Start at word boundary
+            word_match = re.search(r'\s+', overlap_text)
+            if word_match:
+                overlap_text = overlap_text[word_match.end():]
+
+        return overlap_text
+
+    def _split_large_text(
+        self,
+        text: str,
+        start_char: int,
+        pages: list[int],
+        chunk_index: int,
+        document_id: str,
+    ) -> list[TextChunk]:
+        """Split text that's too large into multiple chunks."""
+        chunks = []
+        remaining = text
+        current_start = start_char
+        current_index = chunk_index
+
+        while remaining:
+            chunk_text, chars_consumed = self._find_chunk_boundary(remaining)
+            if not chunk_text.strip():
+                break
+
+            chunk = self._create_chunk(
+                chunk_index=current_index,
+                text=chunk_text,
+                start_char=current_start,
+                pages=pages,  # All sub-chunks share the same pages
+                document_id=document_id,
+                overlap_chars=(
+                    0 if current_index == chunk_index
+                    else len(self._get_overlap_text(chunks[-1].text))
+                ),
+            )
+            chunks.append(chunk)
+            current_index += 1
+
+            remaining = remaining[chars_consumed:]
+            current_start += chars_consumed
+
+            # Add overlap
+            if remaining:
+                overlap = self._get_overlap_text(chunk_text)
+                if overlap:
+                    remaining = overlap + remaining
+                    current_start -= len(overlap)
+
+        return chunks
+
+    def _create_chunk(
+        self,
+        chunk_index: int,
+        text: str,
+        start_char: int,
+        pages: list[int],
+        document_id: str,
+        overlap_chars: int,
+    ) -> TextChunk:
+        """Create a TextChunk object."""
+        return TextChunk(
+            chunk_index=chunk_index,
+            text=text,
+            start_char=start_char,
+            end_char=start_char + len(text),
+            page_numbers=pages,
+            token_count=self.count_tokens(text),
+            overlap_chars=overlap_chars,
+            document_id=document_id,
+        )
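
As a usage illustration, the following sketch drives DocumentChunker directly with a custom ChunkingConfig. The config field names (target_tokens, max_tokens, overlap_tokens, respect_sentence_boundaries) and the TextChunk attributes printed below all appear in the code above; that ChunkingConfig accepts them as keyword arguments is an assumption, since the class lives in statement_extractor/models/document.py and is only imported here. The numeric values are illustrative, not the package defaults.

# Hedged sketch of standalone chunking, assuming ChunkingConfig keyword
# arguments matching the self._config.* attributes read in the chunker above.
from statement_extractor.document import ChunkingConfig, DocumentChunker

config = ChunkingConfig(
    target_tokens=512,                 # preferred chunk size (illustrative value)
    max_tokens=768,                    # hard ceiling enforced in _find_chunk_boundary
    overlap_tokens=64,                 # tail of one chunk carried into the next
    respect_sentence_boundaries=True,  # prefer splitting after ., ! or ?
)
chunker = DocumentChunker(config)

long_report_text = "Revenue grew 12% in 2024. Costs were stable. " * 400  # any long plain text

# The first token count lazily downloads the
# Corp-o-Rate-Community/statement-extractor tokenizer from Hugging Face.
chunks = chunker.chunk_text(long_report_text, document_id="doc-1")

for chunk in chunks:
    # Fields set by _create_chunk above: index, token count, char span, overlap
    print(chunk.chunk_index, chunk.token_count, chunk.start_char,
          chunk.end_char, chunk.overlap_chars)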
statement_extractor/document/context.py
@@ -0,0 +1,171 @@
+"""
+DocumentContext - Context object for document-level pipeline results.
+
+Holds all results from document processing including chunks, statements,
+and pipeline outputs.
+"""
+
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from ..models.document import Document, TextChunk
+from ..models.labels import LabeledStatement
+from ..models.statement import PipelineStatement, RawTriple
+from ..pipeline.context import PipelineContext
+
+
+class DocumentContext(BaseModel):
+    """
+    Context for document-level processing results.
+
+    Contains the source document, chunks, and aggregated pipeline results.
+    """
+    document: Document = Field(..., description="Source document")
+    chunks: list[TextChunk] = Field(
+        default_factory=list,
+        description="Text chunks created from the document"
+    )
+
+    # Aggregated pipeline results
+    raw_triples: list[RawTriple] = Field(
+        default_factory=list,
+        description="Raw triples from all chunks (Stage 1)"
+    )
+    statements: list[PipelineStatement] = Field(
+        default_factory=list,
+        description="Pipeline statements from all chunks (Stage 2)"
+    )
+    labeled_statements: list[LabeledStatement] = Field(
+        default_factory=list,
+        description="Final labeled statements (Stage 5)"
+    )
+
+    # Processing metadata
+    chunk_contexts: list[PipelineContext] = Field(
+        default_factory=list,
+        description="Individual pipeline contexts for each chunk"
+    )
+    stage_timings: dict[str, float] = Field(
+        default_factory=dict,
+        description="Total time spent in each stage across all chunks"
+    )
+    processing_errors: list[str] = Field(
+        default_factory=list,
+        description="Errors encountered during processing"
+    )
+    processing_warnings: list[str] = Field(
+        default_factory=list,
+        description="Warnings generated during processing"
+    )
+
+    # Deduplication stats
+    pre_dedup_count: int = Field(
+        default=0,
+        description="Number of statements before deduplication"
+    )
+    post_dedup_count: int = Field(
+        default=0,
+        description="Number of statements after deduplication"
+    )
+
+    class Config:
+        arbitrary_types_allowed = True  # Allow PipelineContext
+
+    @property
+    def statement_count(self) -> int:
+        """Get the total number of final statements."""
+        return len(self.labeled_statements)
+
+    @property
+    def chunk_count(self) -> int:
+        """Get the number of chunks."""
+        return len(self.chunks)
+
+    @property
+    def duplicates_removed(self) -> int:
+        """Get the number of duplicate statements removed."""
+        return self.pre_dedup_count - self.post_dedup_count
+
+    def add_error(self, error: str) -> None:
+        """Add a processing error."""
+        self.processing_errors.append(error)
+
+    def add_warning(self, warning: str) -> None:
+        """Add a processing warning."""
+        self.processing_warnings.append(warning)
+
+    def record_timing(self, stage: str, duration: float) -> None:
+        """
+        Record timing for a stage (accumulates across chunks).
+
+        Args:
+            stage: Stage name
+            duration: Duration in seconds
+        """
+        if stage in self.stage_timings:
+            self.stage_timings[stage] += duration
+        else:
+            self.stage_timings[stage] = duration
+
+    def merge_chunk_context(self, chunk_ctx: PipelineContext) -> None:
+        """
+        Merge results from a chunk's pipeline context.
+
+        Args:
+            chunk_ctx: Pipeline context from processing a chunk
+        """
+        self.chunk_contexts.append(chunk_ctx)
+
+        # Merge timings
+        for stage, duration in chunk_ctx.stage_timings.items():
+            self.record_timing(stage, duration)
+
+        # Merge errors and warnings
+        self.processing_errors.extend(chunk_ctx.processing_errors)
+        self.processing_warnings.extend(chunk_ctx.processing_warnings)
+
+    def get_statements_by_page(self, page_number: int) -> list[LabeledStatement]:
+        """
+        Get all statements from a specific page.
+
+        Args:
+            page_number: 1-indexed page number
+
+        Returns:
+            List of statements from that page
+        """
+        return [
+            stmt for stmt in self.labeled_statements
+            if stmt.page_number == page_number
+        ]
+
+    def get_statements_by_chunk(self, chunk_index: int) -> list[LabeledStatement]:
+        """
+        Get all statements from a specific chunk.
+
+        Args:
+            chunk_index: 0-indexed chunk index
+
+        Returns:
+            List of statements from that chunk
+        """
+        return [
+            stmt for stmt in self.labeled_statements
+            if stmt.statement.chunk_index == chunk_index
+        ]
+
+    def as_dict(self) -> dict[str, Any]:
+        """Convert to a dictionary representation."""
+        return {
+            "document_id": self.document.document_id,
+            "document_title": self.document.metadata.title,
+            "summary": self.document.summary,
+            "chunk_count": self.chunk_count,
+            "statement_count": self.statement_count,
+            "duplicates_removed": self.duplicates_removed,
+            "statements": [stmt.as_dict() for stmt in self.labeled_statements],
+            "timings": self.stage_timings,
+            "errors": self.processing_errors,
+            "warnings": self.processing_warnings,
+        }
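
To show how these aggregated results are meant to be consumed, here is a short sketch that reads a DocumentContext. It assumes, as the document/__init__.py docstring example implies, that DocumentPipeline.process() returns this context type; the properties and methods used (statement_count, chunk_count, duplicates_removed, get_statements_by_page, as_dict) are the ones defined above.

import json

from statement_extractor.document import Document, DocumentPipeline

# Assumption: pipeline.process() returns the DocumentContext defined above,
# as the package docstring example suggests.
pipeline = DocumentPipeline()
document = Document.from_text("Acme Corp acquired Widget Ltd in 2024.", title="Example")
ctx = pipeline.process(document)

print(f"{ctx.chunk_count} chunks, {ctx.statement_count} statements "
      f"({ctx.duplicates_removed} duplicates removed)")

# Per-page drill-down uses the 1-indexed page numbers carried by each statement.
for stmt in ctx.get_statements_by_page(1):
    print(stmt.subject_fqn, "->", stmt.object_fqn)

# as_dict() returns a plain-dict summary with timings, errors, and warnings;
# default=str guards against any non-JSON values inside the nested statement dicts.
print(json.dumps(ctx.as_dict(), indent=2, default=str))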