corp-extractor 0.5.0-py3-none-any.whl → 0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +1227 -10
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +520 -0
  9. statement_extractor/database/importers/__init__.py +24 -0
  10. statement_extractor/database/importers/companies_house.py +545 -0
  11. statement_extractor/database/importers/gleif.py +538 -0
  12. statement_extractor/database/importers/sec_edgar.py +375 -0
  13. statement_extractor/database/importers/wikidata.py +1012 -0
  14. statement_extractor/database/importers/wikidata_people.py +632 -0
  15. statement_extractor/database/models.py +230 -0
  16. statement_extractor/database/resolver.py +245 -0
  17. statement_extractor/database/store.py +1609 -0
  18. statement_extractor/document/__init__.py +62 -0
  19. statement_extractor/document/chunker.py +410 -0
  20. statement_extractor/document/context.py +171 -0
  21. statement_extractor/document/deduplicator.py +173 -0
  22. statement_extractor/document/html_extractor.py +246 -0
  23. statement_extractor/document/loader.py +303 -0
  24. statement_extractor/document/pipeline.py +388 -0
  25. statement_extractor/document/summarizer.py +195 -0
  26. statement_extractor/models/__init__.py +16 -1
  27. statement_extractor/models/canonical.py +44 -1
  28. statement_extractor/models/document.py +308 -0
  29. statement_extractor/models/labels.py +47 -18
  30. statement_extractor/models/qualifiers.py +51 -3
  31. statement_extractor/models/statement.py +26 -0
  32. statement_extractor/pipeline/config.py +6 -11
  33. statement_extractor/pipeline/orchestrator.py +80 -111
  34. statement_extractor/pipeline/registry.py +52 -46
  35. statement_extractor/plugins/__init__.py +20 -8
  36. statement_extractor/plugins/base.py +334 -64
  37. statement_extractor/plugins/extractors/gliner2.py +10 -0
  38. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  39. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  40. statement_extractor/plugins/pdf/__init__.py +10 -0
  41. statement_extractor/plugins/pdf/pypdf.py +291 -0
  42. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  43. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  44. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  45. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  46. statement_extractor/plugins/qualifiers/person.py +578 -14
  47. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  48. statement_extractor/plugins/scrapers/__init__.py +10 -0
  49. statement_extractor/plugins/scrapers/http.py +236 -0
  50. statement_extractor/plugins/splitters/t5_gemma.py +158 -53
  51. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  52. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  53. statement_extractor/scoring.py +8 -8
  54. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  55. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  56. statement_extractor/plugins/canonicalizers/base.py +0 -9
  57. statement_extractor/plugins/canonicalizers/location.py +0 -219
  58. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  59. statement_extractor/plugins/canonicalizers/person.py +0 -242
  60. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  61. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
@@ -1,17 +1,20 @@
 """
 SECEdgarQualifierPlugin - Qualifies US ORG entities with SEC data.
 
+DEPRECATED: Use EmbeddingCompanyQualifier instead, which uses a local
+embedding database with pre-loaded SEC Edgar data for faster, offline matching.
+
 Uses the SEC EDGAR API to:
 - Look up CIK (Central Index Key) by company name
 - Retrieve ticker symbol, exchange, filing history
 """
 
 import logging
+import warnings
 from typing import Optional
 
 from ..base import BaseQualifierPlugin, PluginCapability
 from ...pipeline.context import PipelineContext
-from ...pipeline.registry import PluginRegistry
 from ...models import ExtractedEntity, EntityQualifiers, EntityType
 
 logger = logging.getLogger(__name__)
@@ -21,11 +24,12 @@ SEC_COMPANY_SEARCH = "https://efts.sec.gov/LATEST/search-index"
 SEC_COMPANY_TICKERS = "https://www.sec.gov/files/company_tickers.json"
 
 
-@PluginRegistry.qualifier
+# DEPRECATED: Not auto-registered. Use EmbeddingCompanyQualifier instead.
 class SECEdgarQualifierPlugin(BaseQualifierPlugin):
     """
-    Qualifier plugin for US ORG entities using SEC EDGAR.
+    DEPRECATED: Use EmbeddingCompanyQualifier instead.
 
+    Qualifier plugin for US ORG entities using SEC EDGAR.
     Provides CIK and ticker symbol for publicly traded US companies.
     """
 
@@ -37,10 +41,17 @@ class SECEdgarQualifierPlugin(BaseQualifierPlugin):
         """
         Initialize the SEC EDGAR qualifier.
 
+        DEPRECATED: Use EmbeddingCompanyQualifier instead.
+
         Args:
            timeout: API request timeout in seconds
            cache_results: Whether to cache API results
         """
+        warnings.warn(
+            "SECEdgarQualifierPlugin is deprecated. Use EmbeddingCompanyQualifier instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         self._timeout = timeout
         self._cache_results = cache_results
         self._cache: dict[str, Optional[dict]] = {}
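Note (not part of the diff): with the @PluginRegistry.qualifier decorator removed, the plugin no longer registers itself, and its constructor now emits a DeprecationWarning. A minimal opt-in sketch for code that still relies on it, assuming the decorator also works as a plain function call and that the constructor keeps the timeout/cache_results arguments documented above:

# Opt-in sketch (not from the package). Assumes PluginRegistry.qualifier, used as a
# class decorator elsewhere, also accepts the class when called directly.
import warnings

from statement_extractor.pipeline.registry import PluginRegistry
from statement_extractor.plugins.qualifiers.sec_edgar import SECEdgarQualifierPlugin

PluginRegistry.qualifier(SECEdgarQualifierPlugin)  # manual registration (assumption)

with warnings.catch_warnings():
    # The constructor now emits a DeprecationWarning; silence it only when opting in deliberately.
    warnings.simplefilter("ignore", DeprecationWarning)
    qualifier = SECEdgarQualifierPlugin(timeout=10.0, cache_results=True)

EmbeddingCompanyQualifier remains the recommended path, since it matches against a local embedding database with pre-loaded SEC EDGAR data.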
@@ -0,0 +1,10 @@
+"""
+Scraper plugins for fetching content from URLs.
+
+Built-in scrapers:
+- http_scraper: Default HTTP scraper using httpx with retries
+"""
+
+from .http import HttpScraperPlugin
+
+__all__ = ["HttpScraperPlugin"]
@@ -0,0 +1,236 @@
+"""
+HTTP scraper plugin for fetching web content.
+
+Uses httpx for async HTTP requests with retries, timeouts, and CAPTCHA detection.
+"""
+
+import logging
+from typing import Optional
+
+from ..base import BaseScraperPlugin, ContentType, ScraperResult
+from ...pipeline.registry import PluginRegistry
+
+logger = logging.getLogger(__name__)
+
+
+@PluginRegistry.scraper
+class HttpScraperPlugin(BaseScraperPlugin):
+    """
+    Default HTTP scraper using httpx with retries and timeouts.
+
+    Features:
+    - Async HTTP requests with httpx
+    - Automatic redirect following
+    - Content type detection from headers and URL
+    - CAPTCHA page detection
+    - Configurable timeout and retries
+    """
+
+    def __init__(
+        self,
+        timeout: float = 30.0,
+        max_retries: int = 3,
+        user_agent: str = "Mozilla/5.0 (compatible; StatementExtractor/1.0; +https://github.com/corp-o-rate/statement-extractor)",
+        follow_redirects: bool = True,
+    ):
+        self._timeout = timeout
+        self._max_retries = max_retries
+        self._user_agent = user_agent
+        self._follow_redirects = follow_redirects
+
+    @property
+    def name(self) -> str:
+        return "http_scraper"
+
+    @property
+    def priority(self) -> int:
+        return 100  # Default scraper
+
+    @property
+    def description(self) -> str:
+        return "Default HTTP scraper using httpx with retries and CAPTCHA detection"
+
+    async def fetch(self, url: str, timeout: Optional[float] = None) -> ScraperResult:
+        """
+        Fetch content from a URL with retries and CAPTCHA detection.
+
+        Args:
+            url: The URL to fetch
+            timeout: Request timeout in seconds (uses instance default if None)
+
+        Returns:
+            ScraperResult with content, content type, and any errors
+        """
+        import httpx
+
+        timeout = timeout or self._timeout
+        last_error: Optional[str] = None
+
+        for attempt in range(self._max_retries):
+            try:
+                async with httpx.AsyncClient(
+                    timeout=timeout,
+                    follow_redirects=self._follow_redirects,
+                ) as client:
+                    logger.debug(f"Fetching URL: {url} (attempt {attempt + 1})")
+
+                    response = await client.get(
+                        url,
+                        headers={"User-Agent": self._user_agent},
+                    )
+
+                    content_type = self._detect_content_type(
+                        dict(response.headers), url
+                    )
+
+                    # Check for CAPTCHA if HTML
+                    error = None
+                    if content_type == ContentType.HTML:
+                        if self._is_captcha_page(response.content):
+                            error = "CAPTCHA or challenge page detected"
+                            logger.warning(f"CAPTCHA detected at {url}")
+
+                    return ScraperResult(
+                        url=url,
+                        final_url=str(response.url),
+                        content=response.content,
+                        content_type=content_type,
+                        headers=dict(response.headers),
+                        error=error,
+                    )
+
+            except httpx.TimeoutException as e:
+                last_error = f"Request timed out after {timeout}s"
+                logger.warning(f"Timeout fetching {url}: {e}")
+            except httpx.ConnectError as e:
+                last_error = f"Connection error: {e}"
+                logger.warning(f"Connection error fetching {url}: {e}")
+            except httpx.HTTPStatusError as e:
+                last_error = f"HTTP {e.response.status_code}: {e.response.reason_phrase}"
+                logger.warning(f"HTTP error fetching {url}: {e}")
+                # Don't retry on 4xx errors
+                if 400 <= e.response.status_code < 500:
+                    break
+            except Exception as e:
+                last_error = f"Unexpected error: {e}"
+                logger.exception(f"Error fetching {url}")
+
+        # All retries failed
+        return ScraperResult(
+            url=url,
+            final_url=url,
+            content=b"",
+            content_type=ContentType.UNKNOWN,
+            error=last_error or "Unknown error",
+        )
+
+    async def head(self, url: str, timeout: Optional[float] = None) -> ScraperResult:
+        """
+        Check content type without downloading the full body.
+
+        Args:
+            url: The URL to check
+            timeout: Request timeout in seconds
+
+        Returns:
+            ScraperResult with content_type populated (content is empty)
+        """
+        import httpx
+
+        timeout = timeout or self._timeout
+
+        try:
+            async with httpx.AsyncClient(
+                timeout=timeout,
+                follow_redirects=self._follow_redirects,
+            ) as client:
+                response = await client.head(
+                    url,
+                    headers={"User-Agent": self._user_agent},
+                )
+
+                content_type = self._detect_content_type(
+                    dict(response.headers), url
+                )
+
+                return ScraperResult(
+                    url=url,
+                    final_url=str(response.url),
+                    content=b"",
+                    content_type=content_type,
+                    headers=dict(response.headers),
+                )
+
+        except Exception as e:
+            logger.warning(f"HEAD request failed for {url}: {e}")
+            # Fall back to full fetch
+            return await self.fetch(url, timeout)
+
+    @staticmethod
+    def _detect_content_type(headers: dict[str, str], url: str) -> ContentType:
+        """
+        Detect content type from HTTP headers and URL.
+
+        Priority:
+        1. Content-Type header
+        2. URL file extension
+        """
+        content_type_header = headers.get("content-type", "").lower()
+
+        # Check Content-Type header
+        if "application/pdf" in content_type_header:
+            return ContentType.PDF
+        if any(mime in content_type_header for mime in [
+            "text/html",
+            "application/xhtml+xml",
+        ]):
+            return ContentType.HTML
+
+        # Check URL extension
+        url_lower = url.lower().split("?")[0]  # Remove query params
+        if url_lower.endswith(".pdf"):
+            return ContentType.PDF
+        if url_lower.endswith((".html", ".htm")):
+            return ContentType.HTML
+
+        # Default based on content-type
+        if content_type_header.startswith("text/"):
+            return ContentType.HTML
+        if content_type_header.startswith(("image/", "audio/", "video/")):
+            return ContentType.BINARY
+
+        return ContentType.UNKNOWN
+
+    @staticmethod
+    def _is_captcha_page(content: bytes) -> bool:
+        """
+        Detect CAPTCHA or challenge pages.
+
+        Checks for common CAPTCHA patterns in HTML content.
+        """
+        try:
+            html = content.decode("utf-8", errors="replace").lower()
+        except Exception:
+            return False
+
+        # Only check small pages (challenge pages are usually small)
+        if len(html) > 50000:
+            return False
+
+        # Common CAPTCHA/challenge indicators
+        captcha_patterns = [
+            "captcha",
+            "cloudflare",
+            "checking your browser",
+            "please verify you are a human",
+            "access denied",
+            "bot protection",
+            "ddos protection",
+            "just a moment",
+            "enable javascript",
+            "please enable cookies",
+            "verify you are human",
+            "security check",
+        ]
+
+        return any(pattern in html for pattern in captcha_patterns)
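For orientation (not part of the diff), a minimal usage sketch of the new scraper. It uses only the constructor arguments and the fetch/head methods shown above; the import path follows the new statement_extractor.plugins.scrapers package, and the URLs are placeholders:

import asyncio

from statement_extractor.plugins.scrapers import HttpScraperPlugin

async def main() -> None:
    scraper = HttpScraperPlugin(timeout=10.0, max_retries=2)

    # HEAD first to learn the content type without downloading the body
    probe = await scraper.head("https://example.com/annual-report.pdf")
    print(probe.content_type)

    # Full fetch with retries and CAPTCHA detection
    result = await scraper.fetch("https://example.com")
    if result.error:
        print(f"fetch failed: {result.error}")
    else:
        print(result.final_url, result.content_type, len(result.content))

asyncio.run(main())

head() is the cheaper probe: it inspects response headers and falls back to a full fetch if the HEAD request fails, while fetch() retries up to max_retries times and flags CAPTCHA/challenge pages via the error field.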
@@ -7,7 +7,6 @@ subject-predicate-object triples from text.
 
 import logging
 import re
-import xml.etree.ElementTree as ET
 from typing import Optional
 
 from ..base import BaseSplitterPlugin, PluginCapability
@@ -62,12 +61,22 @@ class T5GemmaSplitter(BaseSplitterPlugin):
 
     @property
     def capabilities(self) -> PluginCapability:
-        return PluginCapability.LLM_REQUIRED
+        return PluginCapability.LLM_REQUIRED | PluginCapability.BATCH_PROCESSING
 
     @property
     def description(self) -> str:
         return "T5-Gemma2 model for extracting triples using Diverse Beam Search"
 
+    @property
+    def model_vram_gb(self) -> float:
+        """T5-Gemma2 model weights ~2GB in bfloat16."""
+        return 2.0
+
+    @property
+    def per_item_vram_gb(self) -> float:
+        """Each text item during batch processing ~0.5GB for KV cache and activations."""
+        return 0.5
+
     def _get_extractor(self):
         """Lazy-load the StatementExtractor."""
         if self._extractor is None:
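The model_vram_gb and per_item_vram_gb properties presumably feed the base class's get_optimal_batch_size(), whose implementation lives in plugins/base.py and is not shown in this diff. Purely as an illustration of how two such numbers could be combined, a hypothetical sizing helper (estimate_batch_size and its bounds are invented here, not the package's API):

# Illustrative only: the real logic lives in plugins/base.py (not shown in this diff).
# Derives a batch size from free GPU memory, the fixed model footprint, and a
# per-item allowance for KV cache and activations.
import torch

def estimate_batch_size(model_vram_gb: float = 2.0,
                        per_item_vram_gb: float = 0.5,
                        min_batch: int = 1,
                        max_batch: int = 32) -> int:
    if not torch.cuda.is_available():
        return min_batch
    free_bytes, _total = torch.cuda.mem_get_info()
    free_gb = free_bytes / 1024**3
    usable_gb = max(free_gb - model_vram_gb, 0.0)
    return max(min_batch, min(max_batch, int(usable_gb // per_item_vram_gb)))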
@@ -126,62 +135,158 @@
         logger.info(f"T5GemmaSplitter produced {len(raw_triples)} raw triples")
         return raw_triples
 
+    def split_batch(
+        self,
+        texts: list[str],
+        context: PipelineContext,
+    ) -> list[list[RawTriple]]:
+        """
+        Split multiple texts into atomic triples using batch processing.
+
+        Processes all texts through the T5-Gemma2 model in batches
+        sized for optimal GPU utilization.
+
+        Args:
+            texts: List of input texts to split
+            context: Pipeline context
+
+        Returns:
+            List of RawTriple lists, one per input text
+        """
+        if not texts:
+            return []
+
+        batch_size = self.get_optimal_batch_size()
+        logger.info(f"T5GemmaSplitter batch processing {len(texts)} texts with batch_size={batch_size}")
+
+        # Get options from context
+        splitter_options = context.source_metadata.get("splitter_options", {})
+        num_beams = splitter_options.get("num_beams", self._num_beams)
+        diversity_penalty = splitter_options.get("diversity_penalty", self._diversity_penalty)
+        max_new_tokens = splitter_options.get("max_new_tokens", self._max_new_tokens)
+
+        # Create extraction options
+        from ...models import ExtractionOptions as LegacyExtractionOptions
+        options = LegacyExtractionOptions(
+            num_beams=num_beams,
+            diversity_penalty=diversity_penalty,
+            max_new_tokens=max_new_tokens,
+            use_gliner_extraction=False,
+            embedding_dedup=False,
+            deduplicate=False,
+        )
+
+        extractor = self._get_extractor()
+        all_results: list[list[RawTriple]] = []
+
+        # Process in batches
+        for i in range(0, len(texts), batch_size):
+            batch_texts = texts[i:i + batch_size]
+            logger.debug(f"Processing batch {i // batch_size + 1}: {len(batch_texts)} texts")
+
+            batch_results = self._process_batch(batch_texts, extractor, options)
+            all_results.extend(batch_results)
+
+        total_triples = sum(len(r) for r in all_results)
+        logger.info(f"T5GemmaSplitter batch produced {total_triples} total triples from {len(texts)} texts")
+        return all_results
+
+    def _process_batch(
+        self,
+        texts: list[str],
+        extractor,
+        options,
+    ) -> list[list[RawTriple]]:
+        """
+        Process a batch of texts through the model.
+
+        Uses the model's batch generation capability for efficient GPU utilization.
+        """
+        import torch
+
+        # Wrap texts in page tags
+        wrapped_texts = [f"<page>{t}</page>" if not t.startswith("<page>") else t for t in texts]
+
+        # Tokenize batch
+        tokenizer = extractor.tokenizer
+        model = extractor.model
+
+        inputs = tokenizer(
+            wrapped_texts,
+            return_tensors="pt",
+            max_length=4096,
+            truncation=True,
+            padding=True,
+        ).to(extractor.device)
+
+        # Create stopping criteria
+        from ...extractor import StopOnSequence
+        from transformers import StoppingCriteriaList
+
+        input_length = inputs["input_ids"].shape[1]
+        stop_criteria = StopOnSequence(
+            tokenizer=tokenizer,
+            stop_sequence="</statements>",
+            input_length=input_length,
+        )
+
+        # Generate for all texts in batch
+        with torch.no_grad():
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=options.max_new_tokens,
+                max_length=None,
+                num_beams=options.num_beams,
+                num_beam_groups=options.num_beams,
+                num_return_sequences=1,  # One sequence per input for batch
+                diversity_penalty=options.diversity_penalty,
+                do_sample=False,
+                top_p=None,
+                top_k=None,
+                trust_remote_code=True,
+                custom_generate="transformers-community/group-beam-search",
+                stopping_criteria=StoppingCriteriaList([stop_criteria]),
+            )
+
+        # Decode and parse each output
+        results: list[list[RawTriple]] = []
+        end_tag = "</statements>"
+
+        for output in outputs:
+            decoded = tokenizer.decode(output, skip_special_tokens=True)
+
+            # Truncate at </statements>
+            if end_tag in decoded:
+                end_pos = decoded.find(end_tag) + len(end_tag)
+                decoded = decoded[:end_pos]
+
+            triples = self._parse_xml_to_raw_triples(decoded)
+            results.append(triples)
+
+        return results
+
+    # Regex pattern to extract <text> content from <stmt> blocks
+    _STMT_TEXT_PATTERN = re.compile(r'<stmt>.*?<text>(.*?)</text>.*?</stmt>', re.DOTALL)
+
     def _parse_xml_to_raw_triples(self, xml_output: str) -> list[RawTriple]:
-        """Parse XML output into RawTriple objects."""
+        """Extract source sentences from <stmt><text>...</text></stmt> blocks."""
         raw_triples = []
 
-        try:
-            root = ET.fromstring(xml_output)
-        except ET.ParseError as e:
-            logger.warning(f"XML parse error: {e}")
-            # Try to repair
-            xml_output = self._repair_xml(xml_output)
-            try:
-                root = ET.fromstring(xml_output)
-            except ET.ParseError:
-                logger.error("XML repair failed")
-                return raw_triples
-
-        if root.tag != "statements":
-            logger.warning(f"Unexpected root tag: {root.tag}")
-            return raw_triples
-
-        for stmt_elem in root.findall("stmt"):
-            try:
-                subject_elem = stmt_elem.find("subject")
-                predicate_elem = stmt_elem.find("predicate")
-                object_elem = stmt_elem.find("object")
-                text_elem = stmt_elem.find("text")
-
-                subject_text = subject_elem.text.strip() if subject_elem is not None and subject_elem.text else ""
-                predicate_text = predicate_elem.text.strip() if predicate_elem is not None and predicate_elem.text else ""
-                object_text = object_elem.text.strip() if object_elem is not None and object_elem.text else ""
-                source_text = text_elem.text.strip() if text_elem is not None and text_elem.text else ""
-
-                if subject_text and object_text and source_text:
-                    raw_triples.append(RawTriple(
-                        subject_text=subject_text,
-                        predicate_text=predicate_text,
-                        object_text=object_text,
-                        source_sentence=source_text,
-                    ))
-                else:
-                    logger.debug(f"Skipping incomplete triple: s={subject_text}, p={predicate_text}, o={object_text}")
-
-            except Exception as e:
-                logger.warning(f"Error parsing stmt element: {e}")
-                continue
+        # Find all <text> content within <stmt> blocks
+        text_matches = self._STMT_TEXT_PATTERN.findall(xml_output)
+        logger.debug(f"Found {len(text_matches)} stmt text blocks via regex")
 
-        return raw_triples
+        for source_text in text_matches:
+            source_text = source_text.strip()
+            if source_text:
+                raw_triples.append(RawTriple(
+                    subject_text="",
+                    predicate_text="",
+                    object_text="",
+                    source_sentence=source_text,
+                ))
 
-    def _repair_xml(self, xml_string: str) -> str:
-        """Attempt to repair common XML syntax errors."""
-        # Use the repair function from extractor.py
-        from ...extractor import repair_xml
-        repaired, repairs = repair_xml(xml_string)
-        if repairs:
-            logger.debug(f"XML repairs: {', '.join(repairs)}")
-        return repaired
+        return raw_triples
 
 
 # Allow importing without decorator for testing
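The rewritten parser above no longer builds an XML tree; it keeps only the <text> sentence of each <stmt> block and leaves the subject/predicate/object fields empty. A standalone sketch of that behaviour, with the regex copied from the diff and a made-up model output:

import re

# Pattern copied from the diff above.
STMT_TEXT_PATTERN = re.compile(r'<stmt>.*?<text>(.*?)</text>.*?</stmt>', re.DOTALL)

# Invented sample output in the <statements>/<stmt> format the model emits.
sample_output = (
    "<statements>"
    "<stmt><subject>Acme Corp</subject><predicate>acquired</predicate>"
    "<object>Widget Ltd</object><text>Acme Corp acquired Widget Ltd in 2023.</text></stmt>"
    "<stmt><text>The deal was valued at $10m.</text></stmt>"
    "</statements>"
)

sentences = [m.strip() for m in STMT_TEXT_PATTERN.findall(sample_output) if m.strip()]
print(sentences)
# ['Acme Corp acquired Widget Ltd in 2023.', 'The deal was valued at $10m.']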