corp-extractor 0.5.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
Files changed (61)
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +1227 -10
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +520 -0
  9. statement_extractor/database/importers/__init__.py +24 -0
  10. statement_extractor/database/importers/companies_house.py +545 -0
  11. statement_extractor/database/importers/gleif.py +538 -0
  12. statement_extractor/database/importers/sec_edgar.py +375 -0
  13. statement_extractor/database/importers/wikidata.py +1012 -0
  14. statement_extractor/database/importers/wikidata_people.py +632 -0
  15. statement_extractor/database/models.py +230 -0
  16. statement_extractor/database/resolver.py +245 -0
  17. statement_extractor/database/store.py +1609 -0
  18. statement_extractor/document/__init__.py +62 -0
  19. statement_extractor/document/chunker.py +410 -0
  20. statement_extractor/document/context.py +171 -0
  21. statement_extractor/document/deduplicator.py +173 -0
  22. statement_extractor/document/html_extractor.py +246 -0
  23. statement_extractor/document/loader.py +303 -0
  24. statement_extractor/document/pipeline.py +388 -0
  25. statement_extractor/document/summarizer.py +195 -0
  26. statement_extractor/models/__init__.py +16 -1
  27. statement_extractor/models/canonical.py +44 -1
  28. statement_extractor/models/document.py +308 -0
  29. statement_extractor/models/labels.py +47 -18
  30. statement_extractor/models/qualifiers.py +51 -3
  31. statement_extractor/models/statement.py +26 -0
  32. statement_extractor/pipeline/config.py +6 -11
  33. statement_extractor/pipeline/orchestrator.py +80 -111
  34. statement_extractor/pipeline/registry.py +52 -46
  35. statement_extractor/plugins/__init__.py +20 -8
  36. statement_extractor/plugins/base.py +334 -64
  37. statement_extractor/plugins/extractors/gliner2.py +10 -0
  38. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  39. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  40. statement_extractor/plugins/pdf/__init__.py +10 -0
  41. statement_extractor/plugins/pdf/pypdf.py +291 -0
  42. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  43. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  44. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  45. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  46. statement_extractor/plugins/qualifiers/person.py +578 -14
  47. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  48. statement_extractor/plugins/scrapers/__init__.py +10 -0
  49. statement_extractor/plugins/scrapers/http.py +236 -0
  50. statement_extractor/plugins/splitters/t5_gemma.py +158 -53
  51. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  52. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  53. statement_extractor/scoring.py +8 -8
  54. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  55. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  56. statement_extractor/plugins/canonicalizers/base.py +0 -9
  57. statement_extractor/plugins/canonicalizers/location.py +0 -219
  58. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  59. statement_extractor/plugins/canonicalizers/person.py +0 -242
  60. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  61. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
statement_extractor/document/__init__.py
@@ -0,0 +1,62 @@
+ """
+ Document processing module for statement extraction.
+
+ Provides document-level features including:
+ - Text chunking with page awareness
+ - Statement deduplication across chunks
+ - Document summarization
+ - Citation generation
+
+ Example:
+     >>> from statement_extractor.document import DocumentPipeline, Document
+     >>>
+     >>> pipeline = DocumentPipeline()
+     >>> document = Document.from_text("Your document text...", title="Report 2024")
+     >>> ctx = pipeline.process(document)
+     >>>
+     >>> for stmt in ctx.labeled_statements:
+     ...     print(f"{stmt.subject_fqn} -> {stmt.object_fqn}")
+     ...     print(f" Citation: {stmt.citation}")
+ """
+
+ # Re-export document models for convenience
+ from ..models.document import (
+     ChunkingConfig,
+     Document,
+     DocumentMetadata,
+     DocumentPage,
+     TextChunk,
+ )
+ from .chunker import DocumentChunker
+ from .context import DocumentContext
+ from .deduplicator import StatementDeduplicator
+ from .html_extractor import extract_text_from_html, extract_article_content
+ from .loader import URLLoader, URLLoaderConfig, load_url, load_url_sync
+ from .pipeline import DocumentPipeline, DocumentPipelineConfig
+ from .summarizer import DocumentSummarizer
+
+ __all__ = [
+     # Pipeline
+     "DocumentPipeline",
+     "DocumentPipelineConfig",
+     # Context
+     "DocumentContext",
+     # Components
+     "DocumentChunker",
+     "StatementDeduplicator",
+     "DocumentSummarizer",
+     # URL loading
+     "URLLoader",
+     "URLLoaderConfig",
+     "load_url",
+     "load_url_sync",
+     # HTML extraction
+     "extract_text_from_html",
+     "extract_article_content",
+     # Models (re-exported)
+     "Document",
+     "DocumentMetadata",
+     "DocumentPage",
+     "TextChunk",
+     "ChunkingConfig",
+ ]
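
The new document package above also exports a URL-loading path (URLLoader, load_url, load_url_sync) alongside the Document.from_text route used in the module docstring. A minimal sketch of combining the two, assuming load_url_sync returns a Document that DocumentPipeline.process accepts; the loader itself is not shown in this hunk, so treat the call shape as an assumption:

# Sketch only: assumes load_url_sync returns a Document accepted by
# DocumentPipeline.process, mirroring the docstring example above.
from statement_extractor.document import DocumentPipeline, load_url_sync

pipeline = DocumentPipeline()
document = load_url_sync("https://example.com/annual-report")  # illustrative URL
ctx = pipeline.process(document)

print(f"{ctx.chunk_count} chunks -> {ctx.statement_count} statements "
      f"({ctx.duplicates_removed} duplicates removed)")
for stmt in ctx.labeled_statements:
    print(stmt.citation)
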
statement_extractor/document/chunker.py
@@ -0,0 +1,410 @@
+ """
+ DocumentChunker - Token-aware text chunking for document processing.
+
+ Splits documents into chunks suitable for the extraction pipeline while
+ maintaining page and sentence boundary awareness.
+ """
+
+ import logging
+ import re
+ from typing import Optional
+
+ from ..models.document import ChunkingConfig, Document, TextChunk
+
+ logger = logging.getLogger(__name__)
+
+
+ class DocumentChunker:
+     """
+     Chunks documents into processable text segments.
+
+     Uses the T5-Gemma tokenizer for accurate token counting and supports:
+     - Page boundary awareness
+     - Sentence boundary splitting
+     - Configurable overlap between chunks
+     """
+
+     def __init__(self, config: Optional[ChunkingConfig] = None):
+         """
+         Initialize the chunker.
+
+         Args:
+             config: Chunking configuration (uses defaults if not provided)
+         """
+         self._config = config or ChunkingConfig()
+         self._tokenizer = None
+
+     @property
+     def tokenizer(self):
+         """Lazy-load the tokenizer from the T5-Gemma model."""
+         if self._tokenizer is None:
+             from transformers import AutoTokenizer
+             logger.debug("Loading T5-Gemma tokenizer for chunking")
+             self._tokenizer = AutoTokenizer.from_pretrained(
+                 "Corp-o-Rate-Community/statement-extractor",
+                 trust_remote_code=True,
+             )
+         return self._tokenizer
+
+     def count_tokens(self, text: str) -> int:
+         """
+         Count the number of tokens in a text string.
+
+         Args:
+             text: Text to count tokens for
+
+         Returns:
+             Number of tokens
+         """
+         if not text:
+             return 0
+         return len(self.tokenizer.encode(text, add_special_tokens=False))
+
+     def chunk_document(self, document: Document) -> list[TextChunk]:
+         """
+         Chunk a document into text segments.
+
+         Args:
+             document: Document to chunk
+
+         Returns:
+             List of TextChunk objects
+         """
+         if not document.full_text:
+             return []
+
+         logger.info(f"Chunking document {document.document_id}: {document.char_count} chars")
+
+         # If document has pages and we respect page boundaries, use page-aware chunking
+         if document.pages and self._config.respect_page_boundaries:
+             chunks = self._chunk_with_pages(document)
+         else:
+             chunks = self._chunk_text(
+                 text=document.full_text,
+                 document_id=document.document_id,
+                 page_getter=document.get_pages_in_range if document.pages else None,
+             )
+
+         logger.info(f"Created {len(chunks)} chunks")
+         return chunks
+
+     def chunk_text(
+         self,
+         text: str,
+         document_id: str,
+     ) -> list[TextChunk]:
+         """
+         Chunk plain text (without page structure).
+
+         Args:
+             text: Text to chunk
+             document_id: Document ID to assign to chunks
+
+         Returns:
+             List of TextChunk objects
+         """
+         return self._chunk_text(text, document_id, page_getter=None)
+
+     def _chunk_with_pages(self, document: Document) -> list[TextChunk]:
+         """Chunk document respecting page boundaries."""
+         chunks = []
+         chunk_index = 0
+         current_text = ""
+         current_start = 0
+         current_pages = []
+
+         for page in document.pages:
+             page_tokens = self.count_tokens(page.text)
+
+             # Check if adding this page would exceed max_tokens
+             current_tokens = self.count_tokens(current_text)
+
+             if current_text and current_tokens + page_tokens > self._config.max_tokens:
+                 # Flush current chunk
+                 chunk = self._create_chunk(
+                     chunk_index=chunk_index,
+                     text=current_text,
+                     start_char=current_start,
+                     pages=current_pages,
+                     document_id=document.document_id,
+                     overlap_chars=0,
+                 )
+                 chunks.append(chunk)
+                 chunk_index += 1
+
+                 # Start new chunk with overlap from previous
+                 overlap_text = self._get_overlap_text(current_text)
+                 current_text = overlap_text + page.text
+                 current_start = page.char_offset - len(overlap_text)
+                 current_pages = [page.page_number]
+             else:
+                 # Add page to current chunk
+                 if current_text:
+                     current_text += "\n" + page.text
+                 else:
+                     current_text = page.text
+                     current_start = page.char_offset
+                 current_pages.append(page.page_number)
+
+             # If current chunk exceeds target, try to split at sentence boundary
+             current_tokens = self.count_tokens(current_text)
+             if current_tokens > self._config.target_tokens:
+                 # Split within the page if it's too large
+                 sub_chunks = self._split_large_text(
+                     text=current_text,
+                     start_char=current_start,
+                     pages=current_pages,
+                     chunk_index=chunk_index,
+                     document_id=document.document_id,
+                 )
+                 if len(sub_chunks) > 1:
+                     chunks.extend(sub_chunks[:-1])
+                     chunk_index += len(sub_chunks) - 1
+                     last_chunk = sub_chunks[-1]
+                     current_text = last_chunk.text
+                     current_start = last_chunk.start_char
+                     current_pages = last_chunk.page_numbers
+
+         # Flush remaining text
+         if current_text.strip():
+             chunk = self._create_chunk(
+                 chunk_index=chunk_index,
+                 text=current_text,
+                 start_char=current_start,
+                 pages=current_pages,
+                 document_id=document.document_id,
+                 overlap_chars=0,
+             )
+             chunks.append(chunk)
+
+         return chunks
+
+     def _chunk_text(
+         self,
+         text: str,
+         document_id: str,
+         page_getter: Optional[callable] = None,
+     ) -> list[TextChunk]:
+         """Chunk text without page structure."""
+         if not text.strip():
+             return []
+
+         chunks = []
+         chunk_index = 0
+         remaining_text = text
+         current_start = 0
+
+         while remaining_text:
+             # Find a good split point
+             chunk_text, chars_consumed = self._find_chunk_boundary(remaining_text)
+
+             if not chunk_text.strip():
+                 break
+
+             # Get pages for this chunk if page_getter is available
+             end_char = current_start + len(chunk_text)
+             pages = page_getter(current_start, end_char) if page_getter else []
+
+             # Calculate overlap from previous chunk
+             overlap_chars = 0
+             if chunks:
+                 prev_chunk = chunks[-1]
+                 if current_start < prev_chunk.end_char:
+                     overlap_chars = prev_chunk.end_char - current_start
+
+             chunk = self._create_chunk(
+                 chunk_index=chunk_index,
+                 text=chunk_text,
+                 start_char=current_start,
+                 pages=pages,
+                 document_id=document_id,
+                 overlap_chars=overlap_chars,
+             )
+             chunks.append(chunk)
+             chunk_index += 1
+
+             # Move to next chunk with overlap
+             remaining_text = remaining_text[chars_consumed:]
+             current_start += chars_consumed
+
+             # Add overlap from the end of current chunk to start of next
+             if remaining_text:
+                 overlap = self._get_overlap_text(chunk_text)
+                 if overlap:
+                     remaining_text = overlap + remaining_text
+                     current_start -= len(overlap)
+
+         return chunks
+
+     def _find_chunk_boundary(self, text: str) -> tuple[str, int]:
+         """
+         Find a good boundary to split text at.
+
+         Returns:
+             Tuple of (chunk_text, chars_consumed)
+         """
+         total_tokens = self.count_tokens(text)
+
+         # If text fits in target, return it all
+         if total_tokens <= self._config.target_tokens:
+             return text, len(text)
+
+         # Estimate a split point near the target token count
+         target_chars = self._estimate_chars_for_tokens(text, self._config.target_tokens)
+
+         if self._config.respect_sentence_boundaries:
+             # Find sentence boundary near target
+             split_pos = self._find_sentence_boundary(text, target_chars)
+         else:
+             split_pos = target_chars
+
+         # Ensure we don't exceed max tokens
+         chunk_text = text[:split_pos]
+         while self.count_tokens(chunk_text) > self._config.max_tokens and split_pos > 100:
+             split_pos = int(split_pos * 0.9)
+             if self._config.respect_sentence_boundaries:
+                 split_pos = self._find_sentence_boundary(text, split_pos)
+             chunk_text = text[:split_pos]
+
+         return chunk_text, split_pos
+
+     def _estimate_chars_for_tokens(self, text: str, target_tokens: int) -> int:
+         """Estimate character count for a target token count."""
+         total_tokens = self.count_tokens(text)
+         if total_tokens == 0:
+             return len(text)
+
+         # Estimate chars per token ratio
+         chars_per_token = len(text) / total_tokens
+         return min(len(text), int(target_tokens * chars_per_token))
+
+     def _find_sentence_boundary(self, text: str, near_pos: int) -> int:
+         """Find a sentence boundary near the given position."""
+         # Look for sentence endings near the position
+         search_start = max(0, near_pos - 200)
+         search_end = min(len(text), near_pos + 200)
+         search_region = text[search_start:search_end]
+
+         # Find all sentence boundaries in the region
+         sentence_pattern = r'[.!?]+[\s"\')]*'
+         matches = list(re.finditer(sentence_pattern, search_region))
+
+         if not matches:
+             # No sentence boundary found, fall back to word boundary
+             return self._find_word_boundary(text, near_pos)
+
+         # Find the boundary closest to our target position
+         target_in_region = near_pos - search_start
+         best_match = min(matches, key=lambda m: abs(m.end() - target_in_region))
+         return search_start + best_match.end()
+
+     def _find_word_boundary(self, text: str, near_pos: int) -> int:
+         """Find a word boundary near the given position."""
+         # Look for whitespace near the position
+         search_start = max(0, near_pos - 50)
+         search_end = min(len(text), near_pos + 50)
+
+         # Prefer splitting at whitespace after the position
+         for i in range(near_pos, search_end):
+             if text[i].isspace():
+                 return i + 1
+
+         # Fall back to whitespace before
+         for i in range(near_pos, search_start, -1):
+             if text[i].isspace():
+                 return i + 1
+
+         # No good boundary found
+         return near_pos
+
+     def _get_overlap_text(self, text: str) -> str:
+         """Get overlap text from the end of a chunk."""
+         if self._config.overlap_tokens <= 0:
+             return ""
+
+         # Estimate characters for overlap tokens
+         target_chars = self._estimate_chars_for_tokens(
+             text[-1000:] if len(text) > 1000 else text,
+             self._config.overlap_tokens
+         )
+
+         # Get text from the end
+         overlap_text = text[-target_chars:] if target_chars < len(text) else text
+
+         # Try to start at a sentence or word boundary
+         sentence_match = re.search(r'[.!?]+[\s"\')]*', overlap_text)
+         if sentence_match:
+             overlap_text = overlap_text[sentence_match.end():]
+         else:
+             # Start at word boundary
+             word_match = re.search(r'\s+', overlap_text)
+             if word_match:
+                 overlap_text = overlap_text[word_match.end():]
+
+         return overlap_text
+
+     def _split_large_text(
+         self,
+         text: str,
+         start_char: int,
+         pages: list[int],
+         chunk_index: int,
+         document_id: str,
+     ) -> list[TextChunk]:
+         """Split text that's too large into multiple chunks."""
+         chunks = []
+         remaining = text
+         current_start = start_char
+         current_index = chunk_index
+
+         while remaining:
+             chunk_text, chars_consumed = self._find_chunk_boundary(remaining)
+             if not chunk_text.strip():
+                 break
+
+             chunk = self._create_chunk(
+                 chunk_index=current_index,
+                 text=chunk_text,
+                 start_char=current_start,
+                 pages=pages,  # All sub-chunks share the same pages
+                 document_id=document_id,
+                 overlap_chars=(
+                     0 if current_index == chunk_index
+                     else len(self._get_overlap_text(chunks[-1].text))
+                 ),
+             )
+             chunks.append(chunk)
+             current_index += 1
+
+             remaining = remaining[chars_consumed:]
+             current_start += chars_consumed
+
+             # Add overlap
+             if remaining:
+                 overlap = self._get_overlap_text(chunk_text)
+                 if overlap:
+                     remaining = overlap + remaining
+                     current_start -= len(overlap)
+
+         return chunks
+
+     def _create_chunk(
+         self,
+         chunk_index: int,
+         text: str,
+         start_char: int,
+         pages: list[int],
+         document_id: str,
+         overlap_chars: int,
+     ) -> TextChunk:
+         """Create a TextChunk object."""
+         return TextChunk(
+             chunk_index=chunk_index,
+             text=text,
+             start_char=start_char,
+             end_char=start_char + len(text),
+             page_numbers=pages,
+             token_count=self.count_tokens(text),
+             overlap_chars=overlap_chars,
+             document_id=document_id,
+         )
statement_extractor/document/context.py
@@ -0,0 +1,171 @@
+ """
+ DocumentContext - Context object for document-level pipeline results.
+
+ Holds all results from document processing including chunks, statements,
+ and pipeline outputs.
+ """
+
+ from typing import Any
+
+ from pydantic import BaseModel, Field
+
+ from ..models.document import Document, TextChunk
+ from ..models.labels import LabeledStatement
+ from ..models.statement import PipelineStatement, RawTriple
+ from ..pipeline.context import PipelineContext
+
+
+ class DocumentContext(BaseModel):
+     """
+     Context for document-level processing results.
+
+     Contains the source document, chunks, and aggregated pipeline results.
+     """
+     document: Document = Field(..., description="Source document")
+     chunks: list[TextChunk] = Field(
+         default_factory=list,
+         description="Text chunks created from the document"
+     )
+
+     # Aggregated pipeline results
+     raw_triples: list[RawTriple] = Field(
+         default_factory=list,
+         description="Raw triples from all chunks (Stage 1)"
+     )
+     statements: list[PipelineStatement] = Field(
+         default_factory=list,
+         description="Pipeline statements from all chunks (Stage 2)"
+     )
+     labeled_statements: list[LabeledStatement] = Field(
+         default_factory=list,
+         description="Final labeled statements (Stage 5)"
+     )
+
+     # Processing metadata
+     chunk_contexts: list[PipelineContext] = Field(
+         default_factory=list,
+         description="Individual pipeline contexts for each chunk"
+     )
+     stage_timings: dict[str, float] = Field(
+         default_factory=dict,
+         description="Total time spent in each stage across all chunks"
+     )
+     processing_errors: list[str] = Field(
+         default_factory=list,
+         description="Errors encountered during processing"
+     )
+     processing_warnings: list[str] = Field(
+         default_factory=list,
+         description="Warnings generated during processing"
+     )
+
+     # Deduplication stats
+     pre_dedup_count: int = Field(
+         default=0,
+         description="Number of statements before deduplication"
+     )
+     post_dedup_count: int = Field(
+         default=0,
+         description="Number of statements after deduplication"
+     )
+
+     class Config:
+         arbitrary_types_allowed = True  # Allow PipelineContext
+
+     @property
+     def statement_count(self) -> int:
+         """Get the total number of final statements."""
+         return len(self.labeled_statements)
+
+     @property
+     def chunk_count(self) -> int:
+         """Get the number of chunks."""
+         return len(self.chunks)
+
+     @property
+     def duplicates_removed(self) -> int:
+         """Get the number of duplicate statements removed."""
+         return self.pre_dedup_count - self.post_dedup_count
+
+     def add_error(self, error: str) -> None:
+         """Add a processing error."""
+         self.processing_errors.append(error)
+
+     def add_warning(self, warning: str) -> None:
+         """Add a processing warning."""
+         self.processing_warnings.append(warning)
+
+     def record_timing(self, stage: str, duration: float) -> None:
+         """
+         Record timing for a stage (accumulates across chunks).
+
+         Args:
+             stage: Stage name
+             duration: Duration in seconds
+         """
+         if stage in self.stage_timings:
+             self.stage_timings[stage] += duration
+         else:
+             self.stage_timings[stage] = duration
+
+     def merge_chunk_context(self, chunk_ctx: PipelineContext) -> None:
+         """
+         Merge results from a chunk's pipeline context.
+
+         Args:
+             chunk_ctx: Pipeline context from processing a chunk
+         """
+         self.chunk_contexts.append(chunk_ctx)
+
+         # Merge timings
+         for stage, duration in chunk_ctx.stage_timings.items():
+             self.record_timing(stage, duration)
+
+         # Merge errors and warnings
+         self.processing_errors.extend(chunk_ctx.processing_errors)
+         self.processing_warnings.extend(chunk_ctx.processing_warnings)
+
+     def get_statements_by_page(self, page_number: int) -> list[LabeledStatement]:
+         """
+         Get all statements from a specific page.
+
+         Args:
+             page_number: 1-indexed page number
+
+         Returns:
+             List of statements from that page
+         """
+         return [
+             stmt for stmt in self.labeled_statements
+             if stmt.page_number == page_number
+         ]
+
+     def get_statements_by_chunk(self, chunk_index: int) -> list[LabeledStatement]:
+         """
+         Get all statements from a specific chunk.
+
+         Args:
+             chunk_index: 0-indexed chunk index
+
+         Returns:
+             List of statements from that chunk
+         """
+         return [
+             stmt for stmt in self.labeled_statements
+             if stmt.statement.chunk_index == chunk_index
+         ]
+
+     def as_dict(self) -> dict[str, Any]:
+         """Convert to a dictionary representation."""
+         return {
+             "document_id": self.document.document_id,
+             "document_title": self.document.metadata.title,
+             "summary": self.document.summary,
+             "chunk_count": self.chunk_count,
+             "statement_count": self.statement_count,
+             "duplicates_removed": self.duplicates_removed,
+             "statements": [stmt.as_dict() for stmt in self.labeled_statements],
+             "timings": self.stage_timings,
+             "errors": self.processing_errors,
+             "warnings": self.processing_warnings,
+         }
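
Finally, a short consumption sketch for the DocumentContext defined above, using only the accessors in this hunk. Serializing as_dict() with json assumes the nested statement dictionaries are JSON-serializable, which this diff does not guarantee, hence the default=str fallback.

import json

# ctx is a DocumentContext, e.g. the return value of DocumentPipeline.process()
def summarize_context(ctx) -> str:
    # Per-stage timings accumulated across all chunk contexts
    for stage, seconds in sorted(ctx.stage_timings.items()):
        print(f"{stage}: {seconds:.2f}s")

    # Statements attributed to page 1 (page numbers are 1-indexed)
    for stmt in ctx.get_statements_by_page(1):
        print(stmt.citation)

    if ctx.processing_errors:
        print(f"{len(ctx.processing_errors)} errors during processing")

    # Full dictionary form; default=str guards non-JSON-native values
    return json.dumps(ctx.as_dict(), indent=2, default=str)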