corp-extractor 0.4.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (75)
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +10 -1
  4. statement_extractor/cli.py +1663 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +6972 -0
  7. statement_extractor/database/__init__.py +52 -0
  8. statement_extractor/database/embeddings.py +186 -0
  9. statement_extractor/database/hub.py +520 -0
  10. statement_extractor/database/importers/__init__.py +24 -0
  11. statement_extractor/database/importers/companies_house.py +545 -0
  12. statement_extractor/database/importers/gleif.py +538 -0
  13. statement_extractor/database/importers/sec_edgar.py +375 -0
  14. statement_extractor/database/importers/wikidata.py +1012 -0
  15. statement_extractor/database/importers/wikidata_people.py +632 -0
  16. statement_extractor/database/models.py +230 -0
  17. statement_extractor/database/resolver.py +245 -0
  18. statement_extractor/database/store.py +1609 -0
  19. statement_extractor/document/__init__.py +62 -0
  20. statement_extractor/document/chunker.py +410 -0
  21. statement_extractor/document/context.py +171 -0
  22. statement_extractor/document/deduplicator.py +173 -0
  23. statement_extractor/document/html_extractor.py +246 -0
  24. statement_extractor/document/loader.py +303 -0
  25. statement_extractor/document/pipeline.py +388 -0
  26. statement_extractor/document/summarizer.py +195 -0
  27. statement_extractor/extractor.py +1 -23
  28. statement_extractor/gliner_extraction.py +4 -74
  29. statement_extractor/llm.py +255 -0
  30. statement_extractor/models/__init__.py +89 -0
  31. statement_extractor/models/canonical.py +182 -0
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/entity.py +102 -0
  34. statement_extractor/models/labels.py +220 -0
  35. statement_extractor/models/qualifiers.py +139 -0
  36. statement_extractor/models/statement.py +101 -0
  37. statement_extractor/models.py +4 -1
  38. statement_extractor/pipeline/__init__.py +39 -0
  39. statement_extractor/pipeline/config.py +129 -0
  40. statement_extractor/pipeline/context.py +177 -0
  41. statement_extractor/pipeline/orchestrator.py +416 -0
  42. statement_extractor/pipeline/registry.py +303 -0
  43. statement_extractor/plugins/__init__.py +55 -0
  44. statement_extractor/plugins/base.py +716 -0
  45. statement_extractor/plugins/extractors/__init__.py +13 -0
  46. statement_extractor/plugins/extractors/base.py +9 -0
  47. statement_extractor/plugins/extractors/gliner2.py +546 -0
  48. statement_extractor/plugins/labelers/__init__.py +29 -0
  49. statement_extractor/plugins/labelers/base.py +9 -0
  50. statement_extractor/plugins/labelers/confidence.py +138 -0
  51. statement_extractor/plugins/labelers/relation_type.py +87 -0
  52. statement_extractor/plugins/labelers/sentiment.py +159 -0
  53. statement_extractor/plugins/labelers/taxonomy.py +386 -0
  54. statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
  55. statement_extractor/plugins/pdf/__init__.py +10 -0
  56. statement_extractor/plugins/pdf/pypdf.py +291 -0
  57. statement_extractor/plugins/qualifiers/__init__.py +30 -0
  58. statement_extractor/plugins/qualifiers/base.py +9 -0
  59. statement_extractor/plugins/qualifiers/companies_house.py +185 -0
  60. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  61. statement_extractor/plugins/qualifiers/gleif.py +197 -0
  62. statement_extractor/plugins/qualifiers/person.py +785 -0
  63. statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
  64. statement_extractor/plugins/scrapers/__init__.py +10 -0
  65. statement_extractor/plugins/scrapers/http.py +236 -0
  66. statement_extractor/plugins/splitters/__init__.py +13 -0
  67. statement_extractor/plugins/splitters/base.py +9 -0
  68. statement_extractor/plugins/splitters/t5_gemma.py +293 -0
  69. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  70. statement_extractor/plugins/taxonomy/embedding.py +484 -0
  71. statement_extractor/plugins/taxonomy/mnli.py +291 -0
  72. statement_extractor/scoring.py +8 -8
  73. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  74. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  75. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
statement_extractor/document/loader.py (new file)
@@ -0,0 +1,303 @@
+ """
+ URL loader for fetching and parsing web content.
+
+ Orchestrates scraper and PDF parser plugins to load documents from URLs.
+ """
+
+ import asyncio
+ import logging
+ from typing import Optional
+
+ from pydantic import BaseModel, Field
+
+ from ..models.document import Document
+ from ..pipeline.registry import PluginRegistry
+ from ..plugins.base import (
+     BaseScraperPlugin,
+     BasePDFParserPlugin,
+     ContentType,
+     ScraperResult,
+ )
+ from .html_extractor import extract_text_from_html, extract_article_content
+
+ logger = logging.getLogger(__name__)
+
+
+ class URLLoaderConfig(BaseModel):
+     """Configuration for URL loading."""
+
+     timeout: float = Field(
+         default=30.0,
+         description="Request timeout in seconds"
+     )
+     use_ocr: bool = Field(
+         default=False,
+         description="Force OCR for PDF parsing"
+     )
+     max_pdf_pages: int = Field(
+         default=500,
+         description="Maximum pages to extract from PDFs"
+     )
+     scraper_plugin: Optional[str] = Field(
+         default=None,
+         description="Specific scraper plugin to use (None = auto-select)"
+     )
+     pdf_parser_plugin: Optional[str] = Field(
+         default=None,
+         description="Specific PDF parser plugin to use (None = auto-select)"
+     )
+     extract_metadata: bool = Field(
+         default=True,
+         description="Extract article metadata from HTML pages"
+     )
+
+
+ class URLLoader:
+     """
+     Loads documents from URLs using scraper and PDF parser plugins.
+
+     Orchestrates the content acquisition process:
+     1. Fetch content using a scraper plugin
+     2. Detect content type (HTML vs PDF)
+     3. Parse content using appropriate parser
+     4. Create a Document object
+
+     Example:
+         >>> loader = URLLoader()
+         >>> document = await loader.load("https://example.com/article")
+         >>> print(document.title)
+
+         >>> # Synchronous usage
+         >>> document = loader.load_sync("https://example.com/report.pdf")
+     """
+
+     def __init__(self, config: Optional[URLLoaderConfig] = None):
+         """
+         Initialize the URL loader.
+
+         Args:
+             config: Loader configuration
+         """
+         self.config = config or URLLoaderConfig()
+         self._scraper: Optional[BaseScraperPlugin] = None
+         self._pdf_parser: Optional[BasePDFParserPlugin] = None
+
+     def _get_scraper(self) -> BaseScraperPlugin:
+         """Get the scraper plugin to use."""
+         if self._scraper is not None:
+             return self._scraper
+
+         scrapers = PluginRegistry.get_scrapers()
+         if not scrapers:
+             raise RuntimeError(
+                 "No scraper plugins registered. "
+                 "Ensure plugins are loaded via 'from statement_extractor import plugins'"
+             )
+
+         if self.config.scraper_plugin:
+             for scraper in scrapers:
+                 if scraper.name == self.config.scraper_plugin:
+                     self._scraper = scraper
+                     return scraper
+             raise ValueError(f"Scraper plugin not found: {self.config.scraper_plugin}")
+
+         # Use first available (highest priority)
+         self._scraper = scrapers[0]
+         return self._scraper
+
+     def _get_pdf_parser(self) -> BasePDFParserPlugin:
+         """Get the PDF parser plugin to use."""
+         if self._pdf_parser is not None:
+             return self._pdf_parser
+
+         parsers = PluginRegistry.get_pdf_parsers()
+         if not parsers:
+             raise RuntimeError(
+                 "No PDF parser plugins registered. "
+                 "Ensure plugins are loaded via 'from statement_extractor import plugins'"
+             )
+
+         if self.config.pdf_parser_plugin:
+             for parser in parsers:
+                 if parser.name == self.config.pdf_parser_plugin:
+                     self._pdf_parser = parser
+                     return parser
+             raise ValueError(f"PDF parser plugin not found: {self.config.pdf_parser_plugin}")
+
+         # Use first available (highest priority)
+         self._pdf_parser = parsers[0]
+         return self._pdf_parser
+
+     async def load(self, url: str) -> Document:
+         """
+         Load a URL and return a Document.
+
+         Args:
+             url: URL to load
+
+         Returns:
+             Document with extracted content
+
+         Raises:
+             ValueError: If URL cannot be fetched or parsed
+         """
+         logger.info(f"Loading URL: {url}")
+
+         # 1. Fetch content
+         scraper = self._get_scraper()
+         result = await scraper.fetch(url, self.config.timeout)
+
+         if not result.ok:
+             raise ValueError(f"Failed to fetch {url}: {result.error}")
+
+         logger.debug(f"Fetched {len(result.content)} bytes, type: {result.content_type}")
+
+         # 2. Process based on content type
+         if result.content_type == ContentType.PDF:
+             return self._process_pdf(result)
+         elif result.content_type == ContentType.HTML:
+             return self._process_html(result)
+         else:
+             # Try to guess based on content
+             if result.content[:5] == b"%PDF-":
+                 return self._process_pdf(result)
+             # Default to HTML
+             return self._process_html(result)
+
+     def load_sync(self, url: str) -> Document:
+         """
+         Synchronous wrapper for load().
+
+         Args:
+             url: URL to load
+
+         Returns:
+             Document with extracted content
+         """
+         return asyncio.run(self.load(url))
+
+     def _process_pdf(self, result: ScraperResult) -> Document:
+         """
+         Convert PDF to Document with pages.
+
+         Args:
+             result: ScraperResult containing PDF bytes
+
+         Returns:
+             Document with PDF content
+         """
+         logger.info(f"Processing PDF from {result.final_url}")
+
+         parser = self._get_pdf_parser()
+         parse_result = parser.parse(
+             result.content,
+             max_pages=self.config.max_pdf_pages,
+             use_ocr=self.config.use_ocr,
+         )
+
+         if not parse_result.ok:
+             raise ValueError(f"Failed to parse PDF: {parse_result.error}")
+
+         logger.info(f"Extracted {len(parse_result.pages)} pages from PDF")
+
+         # Create Document from pages
+         kwargs = {
+             "pages": parse_result.pages,
+             "title": parse_result.metadata.get("title"),
+             "source_type": "pdf",
+             "url": result.final_url,
+         }
+         author = parse_result.metadata.get("author")
+         if author:
+             kwargs["authors"] = [author]
+
+         return Document.from_pages(**kwargs)
+
+     def _process_html(self, result: ScraperResult) -> Document:
+         """
+         Convert HTML to Document (single page).
+
+         Args:
+             result: ScraperResult containing HTML bytes
+
+         Returns:
+             Document with HTML content
+         """
+         logger.info(f"Processing HTML from {result.final_url}")
+
+         # Decode HTML
+         try:
+             html = result.content.decode("utf-8", errors="replace")
+         except Exception as e:
+             raise ValueError(f"Failed to decode HTML: {e}")
+
+         # Extract text and metadata
+         if self.config.extract_metadata:
+             text, metadata = extract_article_content(html)
+             title = metadata.get("title")
+             author = metadata.get("author")
+             # Log extracted metadata
+             logger.debug(f"Extracted metadata: {metadata}")
+         else:
+             text, title = extract_text_from_html(html)
+             author = None
+             metadata = {}
+
+         if not text or len(text.strip()) < 50:
+             raise ValueError("No meaningful content extracted from HTML")
+
+         logger.info(f"Extracted {len(text)} chars from HTML")
+         if title:
+             logger.info(f" Title: {title}")
+         if author:
+             logger.info(f" Author: {author}")
+         if metadata.get("published_date"):
+             logger.info(f" Published: {metadata.get('published_date')}")
+
+         # Create Document using from_pages since from_text forces source_type="text"
+         kwargs = {
+             "pages": [text],
+             "title": title,
+             "source_type": "webpage",
+             "url": result.final_url,
+         }
+         if author:
+             kwargs["authors"] = [author]
+
+         return Document.from_pages(**kwargs)
+
+
+ async def load_url(
+     url: str,
+     config: Optional[URLLoaderConfig] = None,
+ ) -> Document:
+     """
+     Convenience function to load a URL.
+
+     Args:
+         url: URL to load
+         config: Optional loader configuration
+
+     Returns:
+         Document with extracted content
+     """
+     loader = URLLoader(config)
+     return await loader.load(url)
+
+
+ def load_url_sync(
+     url: str,
+     config: Optional[URLLoaderConfig] = None,
+ ) -> Document:
+     """
+     Convenience function to load a URL synchronously.
+
+     Args:
+         url: URL to load
+         config: Optional loader configuration
+
+     Returns:
+         Document with extracted content
+     """
+     loader = URLLoader(config)
+     return loader.load_sync(url)
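
The new loader.py gives the package a URL-to-Document entry point. The sketch below is an illustration based only on the code in the hunk above (URLLoaderConfig, load_url_sync, and the plugin-registration import mentioned in the loader's error messages); it is not official usage documentation for corp-extractor, and the runtime behaviour of the registered plugins is an assumption.

# Minimal usage sketch. Assumptions: corp-extractor 0.9.0 is installed, and
# importing statement_extractor.plugins registers the scraper and PDF parser
# plugins, as the loader's own error messages suggest.
from statement_extractor import plugins  # noqa: F401  (registers plugins)
from statement_extractor.document.loader import URLLoaderConfig, load_url_sync

config = URLLoaderConfig(timeout=60.0, max_pdf_pages=100)
document = load_url_sync("https://example.com/annual-report.pdf", config)
print(document.title)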
statement_extractor/document/pipeline.py (new file)
@@ -0,0 +1,388 @@
+ """
+ DocumentPipeline - Orchestrates document-level extraction with chunking and citations.
+
+ Wraps ExtractionPipeline to provide document-level features:
+ - Text chunking with page awareness
+ - Batch processing through pipeline stages
+ - Statement deduplication across chunks
+ - Citation generation from document metadata
+ """
+
+ import logging
+ import time
+ from typing import Optional
+
+ from pydantic import BaseModel, Field
+
+ from ..models.document import ChunkingConfig, Document
+ from ..pipeline import ExtractionPipeline, PipelineConfig
+ from .chunker import DocumentChunker
+ from .context import DocumentContext
+ from .deduplicator import StatementDeduplicator
+ from .summarizer import DocumentSummarizer
+
+ logger = logging.getLogger(__name__)
+
+
+ class DocumentPipelineConfig(BaseModel):
+     """Configuration for document pipeline processing."""
+
+     chunking: ChunkingConfig = Field(
+         default_factory=ChunkingConfig,
+         description="Configuration for text chunking"
+     )
+     generate_summary: bool = Field(
+         default=True,
+         description="Whether to generate a document summary"
+     )
+     deduplicate_across_chunks: bool = Field(
+         default=True,
+         description="Whether to deduplicate statements across chunks"
+     )
+     batch_size: int = Field(
+         default=10,
+         ge=1,
+         description="Number of items to process in each batch"
+     )
+     pipeline_config: Optional[PipelineConfig] = Field(
+         default=None,
+         description="Configuration for the underlying extraction pipeline"
+     )
+
+
+ class DocumentPipeline:
+     """
+     Document-level extraction pipeline.
+
+     Processes documents through:
+     1. Summary generation (optional)
+     2. Chunking with page awareness
+     3. Batch extraction through all pipeline stages
+     4. Deduplication across chunks
+     5. Citation generation
+
+     Example:
+         >>> pipeline = DocumentPipeline()
+         >>> document = Document.from_text("Long document text...", title="Report")
+         >>> ctx = pipeline.process(document)
+         >>> for stmt in ctx.labeled_statements:
+         ...     print(f"{stmt}: {stmt.citation}")
+     """
+
+     def __init__(
+         self,
+         config: Optional[DocumentPipelineConfig] = None,
+     ):
+         """
+         Initialize the document pipeline.
+
+         Args:
+             config: Document pipeline configuration
+         """
+         self.config = config or DocumentPipelineConfig()
+
+         # Initialize components
+         self._chunker = DocumentChunker(self.config.chunking)
+         self._deduplicator = StatementDeduplicator()
+         self._summarizer = DocumentSummarizer() if self.config.generate_summary else None
+         self._pipeline = ExtractionPipeline(self.config.pipeline_config)
+
+     def process(self, document: Document) -> DocumentContext:
+         """
+         Process a document through the pipeline.
+
+         Args:
+             document: Document to process
+
+         Returns:
+             DocumentContext with all extraction results
+         """
+         logger.info(f"Starting document pipeline: {document.document_id}")
+         start_time = time.time()
+
+         ctx = DocumentContext(document=document)
+
+         try:
+             # Step 1: Generate summary (if enabled)
+             if self.config.generate_summary and self._summarizer:
+                 self._generate_summary(document, ctx)
+
+             # Step 2: Chunk the document
+             chunks = self._chunker.chunk_document(document)
+             ctx.chunks = chunks
+             logger.info(f"Created {len(chunks)} chunks")
+
+             if not chunks:
+                 logger.warning("No chunks created from document")
+                 return ctx
+
+             # Step 3: Process all chunks through Stage 1 (Splitting)
+             self._process_stage1(ctx)
+
+             # Step 4: Deduplicate raw triples
+             if self.config.deduplicate_across_chunks:
+                 self._deduplicate_triples(ctx)
+
+             # Step 5: Process through remaining stages (2-6)
+             self._process_remaining_stages(ctx)
+
+             # Step 6: Add citations to statements
+             self._add_citations(ctx)
+
+         except Exception as e:
+             logger.exception("Document pipeline failed")
+             ctx.add_error(f"Pipeline error: {str(e)}")
+             raise
+
+         total_time = time.time() - start_time
+         ctx.record_timing("total", total_time)
+
+         logger.info(
+             f"Document pipeline complete: {ctx.statement_count} statements, "
+             f"{ctx.duplicates_removed} duplicates removed, "
+             f"{total_time:.2f}s"
+         )
+
+         return ctx
+
+     def _generate_summary(self, document: Document, ctx: DocumentContext) -> None:
+         """Generate document summary."""
+         logger.info("Generating document summary")
+         start_time = time.time()
+
+         try:
+             summary = self._summarizer.summarize(document)
+             document.summary = summary
+             logger.info(f"Generated summary: {len(summary)} chars")
+         except Exception as e:
+             logger.error(f"Summary generation failed: {e}")
+             ctx.add_warning(f"Summary generation failed: {e}")
+
+         ctx.record_timing("summarization", time.time() - start_time)
+
+     def _process_stage1(self, ctx: DocumentContext) -> None:
+         """Process all chunks through Stage 1 (Splitting) using batch processing."""
+         from ..pipeline.registry import PluginRegistry
+         from ..plugins.base import PluginCapability
+
+         logger.info(f"Processing {len(ctx.chunks)} chunks through Stage 1 (batch mode)")
+         start_time = time.time()
+
+         # Get the splitter plugin
+         splitters = PluginRegistry.get_splitters()
+         if not splitters:
+             logger.warning("No splitter plugins registered")
+             return
+
+         # Use first enabled splitter
+         splitter = None
+         for s in splitters:
+             plugin_enabled = (
+                 self.config.pipeline_config is None or
+                 self.config.pipeline_config.is_plugin_enabled(s.name)
+             )
+             if plugin_enabled:
+                 splitter = s
+                 break
+
+         if not splitter:
+             logger.warning("No enabled splitter plugin found")
+             return
+
+         # Extract all chunk texts
+         chunk_texts = [chunk.text for chunk in ctx.chunks]
+
+         # Create a dummy context for the splitter
+         from ..pipeline.context import PipelineContext
+         dummy_ctx = PipelineContext(
+             source_text="",  # Not used for batch splitting
+             source_metadata=self.config.pipeline_config.model_dump() if self.config.pipeline_config else {},
+         )
+
+         all_triples = []
+
+         # Require batch processing capability
+         if PluginCapability.BATCH_PROCESSING not in splitter.capabilities:
+             raise RuntimeError(
+                 f"Splitter plugin '{splitter.name}' does not support batch processing. "
+                 "Document pipeline requires BATCH_PROCESSING capability for efficient GPU utilization."
+             )
+
+         logger.info(f"Using batch splitting with {splitter.name}")
+         batch_results = splitter.split_batch(chunk_texts, dummy_ctx)
+
+         # Annotate triples with document/chunk info
+         for chunk, triples in zip(ctx.chunks, batch_results):
+             for triple in triples:
+                 triple.document_id = ctx.document.document_id
+                 triple.chunk_index = chunk.chunk_index
+                 triple.page_number = chunk.primary_page
+                 all_triples.append(triple)
+
+         ctx.raw_triples = all_triples
+         ctx.pre_dedup_count = len(all_triples)
+
+         ctx.record_timing("stage1_batch", time.time() - start_time)
+         logger.info(f"Stage 1 produced {len(all_triples)} raw triples from {len(ctx.chunks)} chunks")
+
+     def _deduplicate_triples(self, ctx: DocumentContext) -> None:
+         """Deduplicate raw triples across chunks."""
+         logger.info(f"Deduplicating {len(ctx.raw_triples)} triples")
+         start_time = time.time()
+
+         original_count = len(ctx.raw_triples)
+         ctx.raw_triples = self._deduplicator.deduplicate_batch(ctx.raw_triples)
+         ctx.post_dedup_count = len(ctx.raw_triples)
+
+         removed = original_count - len(ctx.raw_triples)
+         ctx.record_timing("deduplication", time.time() - start_time)
+         logger.info(f"Removed {removed} duplicate triples")
+
+     def _process_remaining_stages(self, ctx: DocumentContext) -> None:
+         """Process through stages 2-6."""
+         logger.info(f"Processing {len(ctx.raw_triples)} triples through stages 2-6")
+         start_time = time.time()
+
+         # Create a pipeline config for stages 2-6
+         # Exclude enabled_stages from base config to avoid duplicate keyword argument
+         base_config = {}
+         if self.config.pipeline_config:
+             base_config = self.config.pipeline_config.model_dump(exclude={"enabled_stages"})
+         stages_config = PipelineConfig(
+             enabled_stages={2, 3, 4, 5, 6},
+             **base_config
+         )
+
+         # Create a combined context with all raw triples
+         from ..pipeline.context import PipelineContext
+
+         combined_ctx = PipelineContext(
+             source_text=ctx.document.full_text,
+             source_metadata={
+                 "document_id": ctx.document.document_id,
+                 "title": ctx.document.metadata.title,
+             },
+         )
+         combined_ctx.raw_triples = ctx.raw_triples
+
+         # Run stages 2-6
+         pipeline = ExtractionPipeline(stages_config)
+
+         # We need to manually run stages since we're providing pre-existing triples
+         # Stage 2: Extraction
+         if stages_config.is_stage_enabled(2):
+             combined_ctx = pipeline._run_extraction(combined_ctx)
+
+         # Propagate document info to statements
+         for stmt in combined_ctx.statements:
+             # Find the source triple to get document info
+             for triple in ctx.raw_triples:
+                 if triple.source_sentence in stmt.source_text:
+                     stmt.document_id = triple.document_id
+                     stmt.chunk_index = triple.chunk_index
+                     stmt.page_number = triple.page_number
+                     break
+
+         # Stage 3: Qualification
+         if stages_config.is_stage_enabled(3):
+             combined_ctx = pipeline._run_qualification(combined_ctx)
+
+         # Stage 4: Canonicalization
+         if stages_config.is_stage_enabled(4):
+             combined_ctx = pipeline._run_canonicalization(combined_ctx)
+
+         # Stage 5: Labeling
+         if stages_config.is_stage_enabled(5):
+             combined_ctx = pipeline._run_labeling(combined_ctx)
+
+         # Stage 6: Taxonomy
+         if stages_config.is_stage_enabled(6):
+             combined_ctx = pipeline._run_taxonomy(combined_ctx)
+
+         # Propagate document info to labeled statements
+         for labeled_stmt in combined_ctx.labeled_statements:
+             labeled_stmt.document_id = labeled_stmt.statement.document_id
+             labeled_stmt.page_number = labeled_stmt.statement.page_number
+
+         ctx.statements = combined_ctx.statements
+         ctx.labeled_statements = combined_ctx.labeled_statements
+
+         # Merge timings
+         for stage, duration in combined_ctx.stage_timings.items():
+             ctx.record_timing(stage, duration)
+
+         ctx.record_timing("stages_2_6_batch", time.time() - start_time)
+         logger.info(f"Stages 2-6 produced {len(ctx.labeled_statements)} labeled statements")
+
+     def _add_citations(self, ctx: DocumentContext) -> None:
+         """Add citations to all labeled statements."""
+         logger.info("Adding citations to statements")
+
+         for stmt in ctx.labeled_statements:
+             citation = ctx.document.metadata.format_citation(stmt.page_number)
+             stmt.citation = citation if citation else None
+
+     def process_text(
+         self,
+         text: str,
+         title: Optional[str] = None,
+         **metadata_kwargs,
+     ) -> DocumentContext:
+         """
+         Process plain text through the document pipeline.
+
+         Convenience method that creates a Document from text.
+
+         Args:
+             text: Text to process
+             title: Optional document title
+             **metadata_kwargs: Additional document metadata
+
+         Returns:
+             DocumentContext with extraction results
+         """
+         document = Document.from_text(text, title=title, **metadata_kwargs)
+         return self.process(document)
+
+     async def process_url(
+         self,
+         url: str,
+         loader_config: Optional["URLLoaderConfig"] = None,
+     ) -> DocumentContext:
+         """
+         Process a URL through the document pipeline.
+
+         Fetches the URL, extracts content (HTML or PDF), and processes it.
+
+         Args:
+             url: URL to process
+             loader_config: Optional loader configuration
+
+         Returns:
+             DocumentContext with extraction results
+         """
+         from .loader import URLLoader, URLLoaderConfig
+
+         loader = URLLoader(loader_config or URLLoaderConfig())
+         document = await loader.load(url)
+         return self.process(document)
+
+     def process_url_sync(
+         self,
+         url: str,
+         loader_config: Optional["URLLoaderConfig"] = None,
+     ) -> DocumentContext:
+         """
+         Process a URL through the document pipeline (synchronous).
+
+         Fetches the URL, extracts content (HTML or PDF), and processes it.
+
+         Args:
+             url: URL to process
+             loader_config: Optional loader configuration
+
+         Returns:
+             DocumentContext with extraction results
+         """
+         import asyncio
+         return asyncio.run(self.process_url(url, loader_config))
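
A matching sketch for the document pipeline added in this hunk, using only names visible above (DocumentPipeline, DocumentPipelineConfig, process_text, labeled_statements, citation). Treat it as an illustration of the new API surface under the assumption that the default plugins ship with the wheel; it is not documented usage from the package authors.

# Minimal usage sketch. Assumption: importing statement_extractor.plugins
# registers the default splitter/extractor plugins listed in this diff.
from statement_extractor import plugins  # noqa: F401  (registers plugins)
from statement_extractor.document.pipeline import DocumentPipeline, DocumentPipelineConfig

pipeline = DocumentPipeline(DocumentPipelineConfig(generate_summary=False))
ctx = pipeline.process_text(
    "Acme Corp acquired Example Ltd in 2023 for $1.2 billion.",
    title="Press release",
)
for stmt in ctx.labeled_statements:
    print(stmt, stmt.citation)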