corp-extractor 0.5.0-py3-none-any.whl → 0.9.0-py3-none-any.whl
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
- corp_extractor-0.9.0.dist-info/RECORD +76 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +1227 -10
- statement_extractor/data/statement_taxonomy.json +6949 -1159
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +520 -0
- statement_extractor/database/importers/__init__.py +24 -0
- statement_extractor/database/importers/companies_house.py +545 -0
- statement_extractor/database/importers/gleif.py +538 -0
- statement_extractor/database/importers/sec_edgar.py +375 -0
- statement_extractor/database/importers/wikidata.py +1012 -0
- statement_extractor/database/importers/wikidata_people.py +632 -0
- statement_extractor/database/models.py +230 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +1609 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +173 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/models/__init__.py +16 -1
- statement_extractor/models/canonical.py +44 -1
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/labels.py +47 -18
- statement_extractor/models/qualifiers.py +51 -3
- statement_extractor/models/statement.py +26 -0
- statement_extractor/pipeline/config.py +6 -11
- statement_extractor/pipeline/orchestrator.py +80 -111
- statement_extractor/pipeline/registry.py +52 -46
- statement_extractor/plugins/__init__.py +20 -8
- statement_extractor/plugins/base.py +334 -64
- statement_extractor/plugins/extractors/gliner2.py +10 -0
- statement_extractor/plugins/labelers/taxonomy.py +18 -5
- statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +11 -0
- statement_extractor/plugins/qualifiers/companies_house.py +14 -3
- statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
- statement_extractor/plugins/qualifiers/gleif.py +14 -3
- statement_extractor/plugins/qualifiers/person.py +578 -14
- statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/t5_gemma.py +158 -53
- statement_extractor/plugins/taxonomy/embedding.py +193 -46
- statement_extractor/plugins/taxonomy/mnli.py +16 -4
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.5.0.dist-info/RECORD +0 -55
- statement_extractor/plugins/canonicalizers/__init__.py +0 -17
- statement_extractor/plugins/canonicalizers/base.py +0 -9
- statement_extractor/plugins/canonicalizers/location.py +0 -219
- statement_extractor/plugins/canonicalizers/organization.py +0 -230
- statement_extractor/plugins/canonicalizers/person.py +0 -242
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
+++ statement_extractor/document/loader.py
@@ -0,0 +1,303 @@
+"""
+URL loader for fetching and parsing web content.
+
+Orchestrates scraper and PDF parser plugins to load documents from URLs.
+"""
+
+import asyncio
+import logging
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+from ..models.document import Document
+from ..pipeline.registry import PluginRegistry
+from ..plugins.base import (
+    BaseScraperPlugin,
+    BasePDFParserPlugin,
+    ContentType,
+    ScraperResult,
+)
+from .html_extractor import extract_text_from_html, extract_article_content
+
+logger = logging.getLogger(__name__)
+
+
+class URLLoaderConfig(BaseModel):
+    """Configuration for URL loading."""
+
+    timeout: float = Field(
+        default=30.0,
+        description="Request timeout in seconds"
+    )
+    use_ocr: bool = Field(
+        default=False,
+        description="Force OCR for PDF parsing"
+    )
+    max_pdf_pages: int = Field(
+        default=500,
+        description="Maximum pages to extract from PDFs"
+    )
+    scraper_plugin: Optional[str] = Field(
+        default=None,
+        description="Specific scraper plugin to use (None = auto-select)"
+    )
+    pdf_parser_plugin: Optional[str] = Field(
+        default=None,
+        description="Specific PDF parser plugin to use (None = auto-select)"
+    )
+    extract_metadata: bool = Field(
+        default=True,
+        description="Extract article metadata from HTML pages"
+    )
+
+
+class URLLoader:
+    """
+    Loads documents from URLs using scraper and PDF parser plugins.
+
+    Orchestrates the content acquisition process:
+    1. Fetch content using a scraper plugin
+    2. Detect content type (HTML vs PDF)
+    3. Parse content using appropriate parser
+    4. Create a Document object
+
+    Example:
+        >>> loader = URLLoader()
+        >>> document = await loader.load("https://example.com/article")
+        >>> print(document.title)
+
+        >>> # Synchronous usage
+        >>> document = loader.load_sync("https://example.com/report.pdf")
+    """
+
+    def __init__(self, config: Optional[URLLoaderConfig] = None):
+        """
+        Initialize the URL loader.
+
+        Args:
+            config: Loader configuration
+        """
+        self.config = config or URLLoaderConfig()
+        self._scraper: Optional[BaseScraperPlugin] = None
+        self._pdf_parser: Optional[BasePDFParserPlugin] = None
+
+    def _get_scraper(self) -> BaseScraperPlugin:
+        """Get the scraper plugin to use."""
+        if self._scraper is not None:
+            return self._scraper
+
+        scrapers = PluginRegistry.get_scrapers()
+        if not scrapers:
+            raise RuntimeError(
+                "No scraper plugins registered. "
+                "Ensure plugins are loaded via 'from statement_extractor import plugins'"
+            )
+
+        if self.config.scraper_plugin:
+            for scraper in scrapers:
+                if scraper.name == self.config.scraper_plugin:
+                    self._scraper = scraper
+                    return scraper
+            raise ValueError(f"Scraper plugin not found: {self.config.scraper_plugin}")
+
+        # Use first available (highest priority)
+        self._scraper = scrapers[0]
+        return self._scraper
+
+    def _get_pdf_parser(self) -> BasePDFParserPlugin:
+        """Get the PDF parser plugin to use."""
+        if self._pdf_parser is not None:
+            return self._pdf_parser
+
+        parsers = PluginRegistry.get_pdf_parsers()
+        if not parsers:
+            raise RuntimeError(
+                "No PDF parser plugins registered. "
+                "Ensure plugins are loaded via 'from statement_extractor import plugins'"
+            )
+
+        if self.config.pdf_parser_plugin:
+            for parser in parsers:
+                if parser.name == self.config.pdf_parser_plugin:
+                    self._pdf_parser = parser
+                    return parser
+            raise ValueError(f"PDF parser plugin not found: {self.config.pdf_parser_plugin}")
+
+        # Use first available (highest priority)
+        self._pdf_parser = parsers[0]
+        return self._pdf_parser
+
+    async def load(self, url: str) -> Document:
+        """
+        Load a URL and return a Document.
+
+        Args:
+            url: URL to load
+
+        Returns:
+            Document with extracted content
+
+        Raises:
+            ValueError: If URL cannot be fetched or parsed
+        """
+        logger.info(f"Loading URL: {url}")
+
+        # 1. Fetch content
+        scraper = self._get_scraper()
+        result = await scraper.fetch(url, self.config.timeout)
+
+        if not result.ok:
+            raise ValueError(f"Failed to fetch {url}: {result.error}")
+
+        logger.debug(f"Fetched {len(result.content)} bytes, type: {result.content_type}")
+
+        # 2. Process based on content type
+        if result.content_type == ContentType.PDF:
+            return self._process_pdf(result)
+        elif result.content_type == ContentType.HTML:
+            return self._process_html(result)
+        else:
+            # Try to guess based on content
+            if result.content[:5] == b"%PDF-":
+                return self._process_pdf(result)
+            # Default to HTML
+            return self._process_html(result)
+
+    def load_sync(self, url: str) -> Document:
+        """
+        Synchronous wrapper for load().
+
+        Args:
+            url: URL to load
+
+        Returns:
+            Document with extracted content
+        """
+        return asyncio.run(self.load(url))
+
+    def _process_pdf(self, result: ScraperResult) -> Document:
+        """
+        Convert PDF to Document with pages.
+
+        Args:
+            result: ScraperResult containing PDF bytes
+
+        Returns:
+            Document with PDF content
+        """
+        logger.info(f"Processing PDF from {result.final_url}")
+
+        parser = self._get_pdf_parser()
+        parse_result = parser.parse(
+            result.content,
+            max_pages=self.config.max_pdf_pages,
+            use_ocr=self.config.use_ocr,
+        )
+
+        if not parse_result.ok:
+            raise ValueError(f"Failed to parse PDF: {parse_result.error}")
+
+        logger.info(f"Extracted {len(parse_result.pages)} pages from PDF")
+
+        # Create Document from pages
+        kwargs = {
+            "pages": parse_result.pages,
+            "title": parse_result.metadata.get("title"),
+            "source_type": "pdf",
+            "url": result.final_url,
+        }
+        author = parse_result.metadata.get("author")
+        if author:
+            kwargs["authors"] = [author]
+
+        return Document.from_pages(**kwargs)
+
+    def _process_html(self, result: ScraperResult) -> Document:
+        """
+        Convert HTML to Document (single page).
+
+        Args:
+            result: ScraperResult containing HTML bytes
+
+        Returns:
+            Document with HTML content
+        """
+        logger.info(f"Processing HTML from {result.final_url}")
+
+        # Decode HTML
+        try:
+            html = result.content.decode("utf-8", errors="replace")
+        except Exception as e:
+            raise ValueError(f"Failed to decode HTML: {e}")
+
+        # Extract text and metadata
+        if self.config.extract_metadata:
+            text, metadata = extract_article_content(html)
+            title = metadata.get("title")
+            author = metadata.get("author")
+            # Log extracted metadata
+            logger.debug(f"Extracted metadata: {metadata}")
+        else:
+            text, title = extract_text_from_html(html)
+            author = None
+            metadata = {}
+
+        if not text or len(text.strip()) < 50:
+            raise ValueError("No meaningful content extracted from HTML")
+
+        logger.info(f"Extracted {len(text)} chars from HTML")
+        if title:
+            logger.info(f"  Title: {title}")
+        if author:
+            logger.info(f"  Author: {author}")
+        if metadata.get("published_date"):
+            logger.info(f"  Published: {metadata.get('published_date')}")
+
+        # Create Document using from_pages since from_text forces source_type="text"
+        kwargs = {
+            "pages": [text],
+            "title": title,
+            "source_type": "webpage",
+            "url": result.final_url,
+        }
+        if author:
+            kwargs["authors"] = [author]
+
+        return Document.from_pages(**kwargs)
+
+
+async def load_url(
+    url: str,
+    config: Optional[URLLoaderConfig] = None,
+) -> Document:
+    """
+    Convenience function to load a URL.
+
+    Args:
+        url: URL to load
+        config: Optional loader configuration
+
+    Returns:
+        Document with extracted content
+    """
+    loader = URLLoader(config)
+    return await loader.load(url)
+
+
+def load_url_sync(
+    url: str,
+    config: Optional[URLLoaderConfig] = None,
+) -> Document:
+    """
+    Convenience function to load a URL synchronously.
+
+    Args:
+        url: URL to load
+        config: Optional loader configuration
+
+    Returns:
+        Document with extracted content
+    """
+    loader = URLLoader(config)
+    return loader.load_sync(url)
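
A minimal usage sketch for the new loader module (not part of the diff itself): it assumes that importing the package's plugins subpackage registers at least one scraper and one PDF parser, as the RuntimeError messages above indicate, and that the returned Document exposes a title attribute as in the class docstring example.

    # Hypothetical usage; URLLoaderConfig fields are taken from the model above.
    from statement_extractor import plugins  # noqa: F401 - registers scraper/PDF parser plugins
    from statement_extractor.document.loader import URLLoaderConfig, load_url_sync

    config = URLLoaderConfig(timeout=15.0, max_pdf_pages=100, extract_metadata=True)
    document = load_url_sync("https://example.com/annual-report.pdf", config)
    print(document.title)
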
+++ statement_extractor/document/pipeline.py
@@ -0,0 +1,388 @@
+"""
+DocumentPipeline - Orchestrates document-level extraction with chunking and citations.
+
+Wraps ExtractionPipeline to provide document-level features:
+- Text chunking with page awareness
+- Batch processing through pipeline stages
+- Statement deduplication across chunks
+- Citation generation from document metadata
+"""
+
+import logging
+import time
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+from ..models.document import ChunkingConfig, Document
+from ..pipeline import ExtractionPipeline, PipelineConfig
+from .chunker import DocumentChunker
+from .context import DocumentContext
+from .deduplicator import StatementDeduplicator
+from .summarizer import DocumentSummarizer
+
+logger = logging.getLogger(__name__)
+
+
+class DocumentPipelineConfig(BaseModel):
+    """Configuration for document pipeline processing."""
+
+    chunking: ChunkingConfig = Field(
+        default_factory=ChunkingConfig,
+        description="Configuration for text chunking"
+    )
+    generate_summary: bool = Field(
+        default=True,
+        description="Whether to generate a document summary"
+    )
+    deduplicate_across_chunks: bool = Field(
+        default=True,
+        description="Whether to deduplicate statements across chunks"
+    )
+    batch_size: int = Field(
+        default=10,
+        ge=1,
+        description="Number of items to process in each batch"
+    )
+    pipeline_config: Optional[PipelineConfig] = Field(
+        default=None,
+        description="Configuration for the underlying extraction pipeline"
+    )
+
+
+class DocumentPipeline:
+    """
+    Document-level extraction pipeline.
+
+    Processes documents through:
+    1. Summary generation (optional)
+    2. Chunking with page awareness
+    3. Batch extraction through all pipeline stages
+    4. Deduplication across chunks
+    5. Citation generation
+
+    Example:
+        >>> pipeline = DocumentPipeline()
+        >>> document = Document.from_text("Long document text...", title="Report")
+        >>> ctx = pipeline.process(document)
+        >>> for stmt in ctx.labeled_statements:
+        ...     print(f"{stmt}: {stmt.citation}")
+    """
+
+    def __init__(
+        self,
+        config: Optional[DocumentPipelineConfig] = None,
+    ):
+        """
+        Initialize the document pipeline.
+
+        Args:
+            config: Document pipeline configuration
+        """
+        self.config = config or DocumentPipelineConfig()
+
+        # Initialize components
+        self._chunker = DocumentChunker(self.config.chunking)
+        self._deduplicator = StatementDeduplicator()
+        self._summarizer = DocumentSummarizer() if self.config.generate_summary else None
+        self._pipeline = ExtractionPipeline(self.config.pipeline_config)
+
+    def process(self, document: Document) -> DocumentContext:
+        """
+        Process a document through the pipeline.
+
+        Args:
+            document: Document to process
+
+        Returns:
+            DocumentContext with all extraction results
+        """
+        logger.info(f"Starting document pipeline: {document.document_id}")
+        start_time = time.time()
+
+        ctx = DocumentContext(document=document)
+
+        try:
+            # Step 1: Generate summary (if enabled)
+            if self.config.generate_summary and self._summarizer:
+                self._generate_summary(document, ctx)
+
+            # Step 2: Chunk the document
+            chunks = self._chunker.chunk_document(document)
+            ctx.chunks = chunks
+            logger.info(f"Created {len(chunks)} chunks")
+
+            if not chunks:
+                logger.warning("No chunks created from document")
+                return ctx
+
+            # Step 3: Process all chunks through Stage 1 (Splitting)
+            self._process_stage1(ctx)
+
+            # Step 4: Deduplicate raw triples
+            if self.config.deduplicate_across_chunks:
+                self._deduplicate_triples(ctx)
+
+            # Step 5: Process through remaining stages (2-6)
+            self._process_remaining_stages(ctx)
+
+            # Step 6: Add citations to statements
+            self._add_citations(ctx)
+
+        except Exception as e:
+            logger.exception("Document pipeline failed")
+            ctx.add_error(f"Pipeline error: {str(e)}")
+            raise
+
+        total_time = time.time() - start_time
+        ctx.record_timing("total", total_time)
+
+        logger.info(
+            f"Document pipeline complete: {ctx.statement_count} statements, "
+            f"{ctx.duplicates_removed} duplicates removed, "
+            f"{total_time:.2f}s"
+        )
+
+        return ctx
+
+    def _generate_summary(self, document: Document, ctx: DocumentContext) -> None:
+        """Generate document summary."""
+        logger.info("Generating document summary")
+        start_time = time.time()
+
+        try:
+            summary = self._summarizer.summarize(document)
+            document.summary = summary
+            logger.info(f"Generated summary: {len(summary)} chars")
+        except Exception as e:
+            logger.error(f"Summary generation failed: {e}")
+            ctx.add_warning(f"Summary generation failed: {e}")
+
+        ctx.record_timing("summarization", time.time() - start_time)
+
+    def _process_stage1(self, ctx: DocumentContext) -> None:
+        """Process all chunks through Stage 1 (Splitting) using batch processing."""
+        from ..pipeline.registry import PluginRegistry
+        from ..plugins.base import PluginCapability
+
+        logger.info(f"Processing {len(ctx.chunks)} chunks through Stage 1 (batch mode)")
+        start_time = time.time()
+
+        # Get the splitter plugin
+        splitters = PluginRegistry.get_splitters()
+        if not splitters:
+            logger.warning("No splitter plugins registered")
+            return
+
+        # Use first enabled splitter
+        splitter = None
+        for s in splitters:
+            plugin_enabled = (
+                self.config.pipeline_config is None or
+                self.config.pipeline_config.is_plugin_enabled(s.name)
+            )
+            if plugin_enabled:
+                splitter = s
+                break
+
+        if not splitter:
+            logger.warning("No enabled splitter plugin found")
+            return
+
+        # Extract all chunk texts
+        chunk_texts = [chunk.text for chunk in ctx.chunks]
+
+        # Create a dummy context for the splitter
+        from ..pipeline.context import PipelineContext
+        dummy_ctx = PipelineContext(
+            source_text="",  # Not used for batch splitting
+            source_metadata=self.config.pipeline_config.model_dump() if self.config.pipeline_config else {},
+        )
+
+        all_triples = []
+
+        # Require batch processing capability
+        if PluginCapability.BATCH_PROCESSING not in splitter.capabilities:
+            raise RuntimeError(
+                f"Splitter plugin '{splitter.name}' does not support batch processing. "
+                "Document pipeline requires BATCH_PROCESSING capability for efficient GPU utilization."
+            )
+
+        logger.info(f"Using batch splitting with {splitter.name}")
+        batch_results = splitter.split_batch(chunk_texts, dummy_ctx)
+
+        # Annotate triples with document/chunk info
+        for chunk, triples in zip(ctx.chunks, batch_results):
+            for triple in triples:
+                triple.document_id = ctx.document.document_id
+                triple.chunk_index = chunk.chunk_index
+                triple.page_number = chunk.primary_page
+                all_triples.append(triple)
+
+        ctx.raw_triples = all_triples
+        ctx.pre_dedup_count = len(all_triples)
+
+        ctx.record_timing("stage1_batch", time.time() - start_time)
+        logger.info(f"Stage 1 produced {len(all_triples)} raw triples from {len(ctx.chunks)} chunks")
+
+    def _deduplicate_triples(self, ctx: DocumentContext) -> None:
+        """Deduplicate raw triples across chunks."""
+        logger.info(f"Deduplicating {len(ctx.raw_triples)} triples")
+        start_time = time.time()
+
+        original_count = len(ctx.raw_triples)
+        ctx.raw_triples = self._deduplicator.deduplicate_batch(ctx.raw_triples)
+        ctx.post_dedup_count = len(ctx.raw_triples)
+
+        removed = original_count - len(ctx.raw_triples)
+        ctx.record_timing("deduplication", time.time() - start_time)
+        logger.info(f"Removed {removed} duplicate triples")
+
+    def _process_remaining_stages(self, ctx: DocumentContext) -> None:
+        """Process through stages 2-6."""
+        logger.info(f"Processing {len(ctx.raw_triples)} triples through stages 2-6")
+        start_time = time.time()
+
+        # Create a pipeline config for stages 2-6
+        # Exclude enabled_stages from base config to avoid duplicate keyword argument
+        base_config = {}
+        if self.config.pipeline_config:
+            base_config = self.config.pipeline_config.model_dump(exclude={"enabled_stages"})
+        stages_config = PipelineConfig(
+            enabled_stages={2, 3, 4, 5, 6},
+            **base_config
+        )
+
+        # Create a combined context with all raw triples
+        from ..pipeline.context import PipelineContext
+
+        combined_ctx = PipelineContext(
+            source_text=ctx.document.full_text,
+            source_metadata={
+                "document_id": ctx.document.document_id,
+                "title": ctx.document.metadata.title,
+            },
+        )
+        combined_ctx.raw_triples = ctx.raw_triples
+
+        # Run stages 2-6
+        pipeline = ExtractionPipeline(stages_config)
+
+        # We need to manually run stages since we're providing pre-existing triples
+        # Stage 2: Extraction
+        if stages_config.is_stage_enabled(2):
+            combined_ctx = pipeline._run_extraction(combined_ctx)
+
+            # Propagate document info to statements
+            for stmt in combined_ctx.statements:
+                # Find the source triple to get document info
+                for triple in ctx.raw_triples:
+                    if triple.source_sentence in stmt.source_text:
+                        stmt.document_id = triple.document_id
+                        stmt.chunk_index = triple.chunk_index
+                        stmt.page_number = triple.page_number
+                        break
+
+        # Stage 3: Qualification
+        if stages_config.is_stage_enabled(3):
+            combined_ctx = pipeline._run_qualification(combined_ctx)
+
+        # Stage 4: Canonicalization
+        if stages_config.is_stage_enabled(4):
+            combined_ctx = pipeline._run_canonicalization(combined_ctx)
+
+        # Stage 5: Labeling
+        if stages_config.is_stage_enabled(5):
+            combined_ctx = pipeline._run_labeling(combined_ctx)
+
+        # Stage 6: Taxonomy
+        if stages_config.is_stage_enabled(6):
+            combined_ctx = pipeline._run_taxonomy(combined_ctx)
+
+        # Propagate document info to labeled statements
+        for labeled_stmt in combined_ctx.labeled_statements:
+            labeled_stmt.document_id = labeled_stmt.statement.document_id
+            labeled_stmt.page_number = labeled_stmt.statement.page_number
+
+        ctx.statements = combined_ctx.statements
+        ctx.labeled_statements = combined_ctx.labeled_statements
+
+        # Merge timings
+        for stage, duration in combined_ctx.stage_timings.items():
+            ctx.record_timing(stage, duration)
+
+        ctx.record_timing("stages_2_6_batch", time.time() - start_time)
+        logger.info(f"Stages 2-6 produced {len(ctx.labeled_statements)} labeled statements")
+
+    def _add_citations(self, ctx: DocumentContext) -> None:
+        """Add citations to all labeled statements."""
+        logger.info("Adding citations to statements")
+
+        for stmt in ctx.labeled_statements:
+            citation = ctx.document.metadata.format_citation(stmt.page_number)
+            stmt.citation = citation if citation else None
+
+    def process_text(
+        self,
+        text: str,
+        title: Optional[str] = None,
+        **metadata_kwargs,
+    ) -> DocumentContext:
+        """
+        Process plain text through the document pipeline.
+
+        Convenience method that creates a Document from text.
+
+        Args:
+            text: Text to process
+            title: Optional document title
+            **metadata_kwargs: Additional document metadata
+
+        Returns:
+            DocumentContext with extraction results
+        """
+        document = Document.from_text(text, title=title, **metadata_kwargs)
+        return self.process(document)
+
+    async def process_url(
+        self,
+        url: str,
+        loader_config: Optional["URLLoaderConfig"] = None,
+    ) -> DocumentContext:
+        """
+        Process a URL through the document pipeline.
+
+        Fetches the URL, extracts content (HTML or PDF), and processes it.
+
+        Args:
+            url: URL to process
+            loader_config: Optional loader configuration
+
+        Returns:
+            DocumentContext with extraction results
+        """
+        from .loader import URLLoader, URLLoaderConfig
+
+        loader = URLLoader(loader_config or URLLoaderConfig())
+        document = await loader.load(url)
+        return self.process(document)
+
+    def process_url_sync(
+        self,
+        url: str,
+        loader_config: Optional["URLLoaderConfig"] = None,
+    ) -> DocumentContext:
+        """
+        Process a URL through the document pipeline (synchronous).
+
+        Fetches the URL, extracts content (HTML or PDF), and processes it.
+
+        Args:
+            url: URL to process
+            loader_config: Optional loader configuration
+
+        Returns:
+            DocumentContext with extraction results
+        """
+        import asyncio
+        return asyncio.run(self.process_url(url, loader_config))
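
A similar sketch for the document pipeline (again not part of the package itself): the DocumentContext fields used below (statement_count, duplicates_removed, labeled_statements, citation) are the ones pipeline.py reads and writes above, and plugin registration is assumed to happen by importing statement_extractor.plugins.

    # Hypothetical usage of DocumentPipeline.process_url_sync defined above.
    from statement_extractor import plugins  # noqa: F401 - registers splitter/extractor plugins
    from statement_extractor.document.pipeline import DocumentPipeline, DocumentPipelineConfig

    pipeline = DocumentPipeline(DocumentPipelineConfig(generate_summary=False))
    ctx = pipeline.process_url_sync("https://example.com/press-release")
    print(f"{ctx.statement_count} statements, {ctx.duplicates_removed} duplicates removed")
    for stmt in ctx.labeled_statements:
        print(stmt.citation)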