corp-extractor 0.2.5__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: corp-extractor
- Version: 0.2.5
+ Version: 0.3.0
  Summary: Extract structured statements from text using T5-Gemma 2 and Diverse Beam Search
  Project-URL: Homepage, https://github.com/corp-o-rate/statement-extractor
  Project-URL: Documentation, https://github.com/corp-o-rate/statement-extractor#readme
@@ -26,17 +26,15 @@ Requires-Python: >=3.10
  Requires-Dist: click>=8.0.0
  Requires-Dist: numpy>=1.24.0
  Requires-Dist: pydantic>=2.0.0
+ Requires-Dist: sentence-transformers>=2.2.0
+ Requires-Dist: spacy>=3.5.0
  Requires-Dist: torch>=2.0.0
- Requires-Dist: transformers>=5.0.0
- Provides-Extra: all
- Requires-Dist: sentence-transformers>=2.2.0; extra == 'all'
+ Requires-Dist: transformers>=5.0.0rc3
  Provides-Extra: dev
  Requires-Dist: mypy>=1.0.0; extra == 'dev'
  Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
  Requires-Dist: pytest>=7.0.0; extra == 'dev'
  Requires-Dist: ruff>=0.1.0; extra == 'dev'
- Provides-Extra: embeddings
- Requires-Dist: sentence-transformers>=2.2.0; extra == 'embeddings'
  Description-Content-Type: text/markdown

  # Corp Extractor
@@ -51,7 +49,11 @@ Extract structured subject-predicate-object statements from unstructured text us

  - **Structured Extraction**: Converts unstructured text into subject-predicate-object triples
  - **Entity Type Recognition**: Identifies 12 entity types (ORG, PERSON, GPE, LOC, PRODUCT, EVENT, etc.)
- - **Quality Scoring** *(v0.2.0)*: Each triple scored for groundedness (0-1) based on source text
+ - **Combined Quality Scoring** *(v0.3.0)*: Confidence combines semantic similarity (50%) + subject/object noun scores (25% each)
+ - **spaCy-First Predicates** *(v0.3.0)*: Always uses spaCy for predicate extraction (model predicates are unreliable)
+ - **Multi-Candidate Extraction** *(v0.3.0)*: Generates 3 candidates per statement (hybrid, spaCy-only, predicate-split)
+ - **Best Triple Selection** *(v0.3.0)*: Keeps only highest-scoring triple per source (use `--all-triples` to keep all)
+ - **Extraction Method Tracking** *(v0.3.0)*: Each statement includes `extraction_method` field (hybrid, spacy, split, model)
  - **Beam Merging** *(v0.2.0)*: Combines top beams for better coverage instead of picking one
  - **Embedding-based Dedup** *(v0.2.0)*: Uses semantic similarity to detect near-duplicate predicates
  - **Predicate Taxonomies** *(v0.2.0)*: Normalize predicates to canonical forms via embeddings
@@ -64,19 +66,22 @@ Extract structured subject-predicate-object statements from unstructured text us
  ## Installation

  ```bash
- # Recommended: include embedding support for smart deduplication
- pip install corp-extractor[embeddings]
-
- # Minimal installation (no embedding features)
  pip install corp-extractor
  ```

- **Note**: This package requires the development version of `transformers` from GitHub (for T5-Gemma2 support). This is handled automatically during installation.
+ The spaCy model for predicate inference is downloaded automatically on first use.
+
+ **Note**: This package requires `transformers>=5.0.0` for T5-Gemma2 model support.

  **For GPU support**, install PyTorch with CUDA first:
  ```bash
  pip install torch --index-url https://download.pytorch.org/whl/cu121
- pip install corp-extractor[embeddings]
+ pip install corp-extractor
+ ```
+
+ **For Apple Silicon (M1/M2/M3)**, MPS acceleration is automatically detected:
+ ```bash
+ pip install corp-extractor  # MPS used automatically
  ```

  ## Quick Start
@@ -105,13 +110,13 @@ For best results, install globally first:

  ```bash
  # Using uv (recommended)
- uv tool install corp-extractor[embeddings]
+ uv tool install "corp-extractor[embeddings]"

  # Using pipx
- pipx install corp-extractor[embeddings]
+ pipx install "corp-extractor[embeddings]"

  # Using pip
- pip install corp-extractor[embeddings]
+ pip install "corp-extractor[embeddings]"

  # Then use anywhere
  corp-extractor "Your text here"
@@ -125,7 +130,7 @@ Run directly without installing using [uv](https://docs.astral.sh/uv/):
  uvx corp-extractor "Apple announced a new iPhone."
  ```

- **Note**: uvx runs may be slower on first use as it installs transformers from git.
+ **Note**: First run downloads the model (~1.5GB) which may take a few minutes.

  ### Usage Examples

@@ -174,11 +179,13 @@ Options:
  --no-dedup                  Disable deduplication
  --no-embeddings             Disable embedding-based dedup (faster)
  --no-merge                  Disable beam merging
+ --no-spacy                  Disable spaCy extraction (use raw model output)
+ --all-triples               Keep all candidate triples (default: best per source)
  --dedup-threshold FLOAT     Deduplication threshold (default: 0.65)
  --min-confidence FLOAT      Min confidence filter (default: 0)
  --taxonomy PATH             Load predicate taxonomy from file
  --taxonomy-threshold FLOAT  Taxonomy matching threshold (default: 0.5)
- --device [auto|cuda|cpu]    Device to use (default: auto)
+ --device [auto|cuda|mps|cpu]  Device to use (default: auto)
  -v, --verbose               Show confidence scores and metadata
  -q, --quiet                 Suppress progress messages
  --version                   Show version
@@ -276,7 +283,91 @@ for stmt in fixed_statements:

  During deduplication, reversed duplicates (e.g., "A -> P -> B" and "B -> P -> A") are now detected and merged, with the correct orientation determined by source text similarity.

- ## Disable Embeddings (Faster, No Extra Dependencies)
+ ## New in v0.3.0: spaCy-First Extraction & Semantic Scoring
+
+ v0.3.0 introduces significant improvements to extraction quality:
+
+ ### spaCy-First Predicate Extraction
+
+ The T5-Gemma model is excellent at:
+ - **Triple isolation** - identifying that a relationship exists
+ - **Coreference resolution** - resolving pronouns to named entities
+
+ But unreliable at:
+ - **Predicate extraction** - often returns empty or wrong predicates
+
+ **Solution:** v0.3.0 always uses spaCy for predicate extraction. The model provides subject, object, entity types, and source text; spaCy provides the predicate.
+
+ ### Three Candidate Extraction Methods
+
+ For each statement, three candidates are generated and the best is selected:
+
+ | Method | Description |
+ |--------|-------------|
+ | `hybrid` | Model subject/object + spaCy predicate |
+ | `spacy` | All components from spaCy dependency parsing |
+ | `split` | Source text split around the predicate |
+
+ ```python
+ for stmt in result:
+     print(f"{stmt.subject.text} --[{stmt.predicate}]--> {stmt.object.text}")
+     print(f"  Method: {stmt.extraction_method}")  # hybrid, spacy, split, or model
+     print(f"  Confidence: {stmt.confidence_score:.2f}")
+ ```
+
+ ### Combined Quality Scoring
+
+ Confidence scores combine **semantic similarity** and **grammatical accuracy**:
+
+ | Component | Weight | Description |
+ |-----------|--------|-------------|
+ | Semantic similarity | 50% | Cosine similarity between source text and reassembled triple |
+ | Subject noun score | 25% | How noun-like the subject is |
+ | Object noun score | 25% | How noun-like the object is |
+
+ **Noun scoring:**
+ - Proper noun(s) only: 1.0
+ - Common noun(s) only: 0.8
+ - Contains noun + other words: 0.4-0.8 (based on ratio)
+ - No nouns: 0.2
+
+ This ensures extracted subjects and objects are grammatically valid entities, not fragments or verb phrases.
+
+ ### Extraction Method Tracking
+
+ Each statement now includes an `extraction_method` field:
+ - `hybrid` - Model subject/object + spaCy predicate
+ - `spacy` - All components from spaCy dependency parsing
+ - `split` - Subject/object from splitting source text around predicate
+ - `model` - All components from T5-Gemma model (only when `--no-spacy`)
+
+ ### Best Triple Selection
+
+ By default, only the **highest-scoring triple** is kept for each source sentence. This ensures clean output without redundant candidates.
+
+ To keep all candidate triples (for debugging or analysis):
+ ```python
+ options = ExtractionOptions(all_triples=True)
+ result = extract_statements(text, options)
+ ```
+
+ Or via CLI:
+ ```bash
+ corp-extractor "Your text" --all-triples --verbose
+ ```
+
+ **Disable spaCy extraction** to use only model output:
+ ```python
+ options = ExtractionOptions(use_spacy_extraction=False)
+ result = extract_statements(text, options)
+ ```
+
+ Or via CLI:
+ ```bash
+ corp-extractor "Your text" --no-spacy
+ ```
+
+ ## Disable Embeddings

  ```python
  options = ExtractionOptions(
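As an aside (not part of the package diff): the noun-scoring rubric described in the hunk above (proper nouns 1.0, common nouns 0.8, mixed 0.4-0.8 by ratio, no nouns 0.2) could be sketched roughly as below. The function, its `(text, pos)` token format, and the linear ratio mapping are illustrative assumptions, not the package's actual implementation.

```python
def noun_score(tokens: list[tuple[str, str]]) -> float:
    """Sketch of the noun-likeness rubric from the v0.3.0 notes, assuming
    tokens are (text, pos) pairs with spaCy-style coarse POS tags.
    Hypothetical helper; not taken from the package source."""
    pos = [p for _, p in tokens]
    if pos and all(p == "PROPN" for p in pos):
        return 1.0  # proper noun(s) only
    if pos and all(p in ("NOUN", "PROPN") for p in pos):
        return 0.8  # common noun(s) only (possibly mixed with proper nouns)
    ratio = sum(p in ("NOUN", "PROPN") for p in pos) / len(pos) if pos else 0.0
    if ratio > 0:
        return 0.4 + 0.4 * ratio  # noun + other words: scales within 0.4-0.8
    return 0.2  # no nouns at all

score = noun_score([("Apple", "PROPN")])  # → 1.0
```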
@@ -314,7 +405,7 @@ dict_output = extract_statements_as_dict(text)
  ```python
  from statement_extractor import StatementExtractor

- extractor = StatementExtractor(device="cuda")  # or "cpu"
+ extractor = StatementExtractor(device="cuda")  # or "mps" (Apple Silicon) or "cpu"

  texts = ["Text 1...", "Text 2...", "Text 3..."]
  for text in texts:
@@ -352,14 +443,16 @@ This library uses the T5-Gemma 2 statement extraction model with **Diverse Beam
  6. **Contextualized Matching** *(v0.2.2)*: Full statement context used for canonicalization and dedup
  7. **Entity Type Merging** *(v0.2.3)*: UNKNOWN types merged with specific types during dedup
  8. **Reversal Detection** *(v0.2.3)*: Subject-object reversals detected and corrected via embedding comparison
+ 9. **Hybrid spaCy** *(v0.2.12)*: spaCy candidates added to pool alongside model output for better coverage

  ## Requirements

  - Python 3.10+
  - PyTorch 2.0+
- - Transformers 4.35+
+ - Transformers 5.0+
  - Pydantic 2.0+
- - sentence-transformers 2.2+ *(optional, for embedding features)*
+ - sentence-transformers 2.2+
+ - spaCy 3.5+ (model downloaded automatically on first use)
  - ~2GB VRAM (GPU) or ~4GB RAM (CPU)

  ## Links
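As an aside (not part of the package diff): the `--device auto` default, combined with the new `cuda|mps|cpu` choices added throughout this diff, implies a resolution order of CUDA, then Apple MPS, then CPU. A minimal sketch of that policy, with availability passed in as flags so it runs standalone (the function and its signature are hypothetical, not the package's code):

```python
def pick_device(requested: str = "auto", cuda_available: bool = False,
                mps_available: bool = False) -> str:
    """Resolve a --device choice of auto|cuda|mps|cpu the way the README
    implies: prefer CUDA, then Apple MPS, else CPU. Illustrative sketch;
    the real package would query torch for availability."""
    if requested != "auto":
        return requested  # explicit choice always wins
    if cuda_available:
        return "cuda"
    if mps_available:
        return "mps"
    return "cpu"

device = pick_device(mps_available=True)  # → "mps" on Apple Silicon
```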
@@ -0,0 +1,12 @@
+ statement_extractor/__init__.py,sha256=KwZfWnTB9oevTLw0TrNlYFu67qIYO-34JqDtcpjOhZI,3013
+ statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
+ statement_extractor/cli.py,sha256=JMEXiT2xwmW1J8JmJliQh32AT-7bTAtAscPx1AGRfPg,9054
+ statement_extractor/extractor.py,sha256=vS8UCgE8uITt_28PwCh4WCqOjWLpfrJcN3fh1YPBcjA,39657
+ statement_extractor/models.py,sha256=FxLj2fIodX317XVIJLZ0GFNahm_VV07KzdoLSSjoVD4,11952
+ statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
+ statement_extractor/scoring.py,sha256=pdNgyLHmlk-npISzm4nycK9G4wM2nztg5KTG7piFACI,18135
+ statement_extractor/spacy_extraction.py,sha256=ACvIB-Ag7H7h_Gb0cdypIr8fnf3A-UjyJnqqjWD5Ccs,12320
+ corp_extractor-0.3.0.dist-info/METADATA,sha256=eu8b7R_FQxFyc_9FSocy078TTyB7BwvGX-YAS79hKgg,17042
+ corp_extractor-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ corp_extractor-0.3.0.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
+ corp_extractor-0.3.0.dist-info/RECORD,,
@@ -29,12 +29,13 @@ Example:
      >>> data = extract_statements_as_dict("Some text...")
  """

- __version__ = "0.2.5"
+ __version__ = "0.3.0"

  # Core models
  from .models import (
      Entity,
      EntityType,
+     ExtractionMethod,
      ExtractionOptions,
      ExtractionResult,
      Statement,
@@ -73,6 +74,7 @@ __all__ = [
      # Core models
      "Entity",
      "EntityType",
+     "ExtractionMethod",
      "ExtractionOptions",
      "ExtractionResult",
      "Statement",
@@ -7,11 +7,37 @@ Usage:
      cat input.txt | corp-extractor -
  """

+ import logging
  import sys
  from typing import Optional

  import click

+
+ def _configure_logging(verbose: bool) -> None:
+     """Configure logging for the extraction pipeline."""
+     level = logging.DEBUG if verbose else logging.WARNING
+
+     # Configure root logger for statement_extractor package
+     logging.basicConfig(
+         level=level,
+         format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+         datefmt="%H:%M:%S",
+         stream=sys.stderr,
+         force=True,
+     )
+
+     # Set level for all statement_extractor loggers
+     for logger_name in [
+         "statement_extractor",
+         "statement_extractor.extractor",
+         "statement_extractor.scoring",
+         "statement_extractor.predicate_comparer",
+         "statement_extractor.canonicalization",
+         "statement_extractor.spacy_extraction",
+     ]:
+         logging.getLogger(logger_name).setLevel(level)
+
  from . import __version__
  from .models import (
      ExtractionOptions,
@@ -40,6 +66,8 @@ from .models import (
40
66
  @click.option("--no-dedup", is_flag=True, help="Disable deduplication")
41
67
  @click.option("--no-embeddings", is_flag=True, help="Disable embedding-based deduplication (faster)")
42
68
  @click.option("--no-merge", is_flag=True, help="Disable beam merging (select single best beam)")
69
+ @click.option("--no-spacy", is_flag=True, help="Disable spaCy extraction (use raw model output)")
70
+ @click.option("--all-triples", is_flag=True, help="Keep all candidate triples instead of selecting best per source")
43
71
  @click.option("--dedup-threshold", type=float, default=0.65, help="Similarity threshold for deduplication (default: 0.65)")
44
72
  # Quality options
45
73
  @click.option("--min-confidence", type=float, default=0.0, help="Minimum confidence threshold 0-1 (default: 0)")
@@ -47,7 +75,7 @@ from .models import (
47
75
  @click.option("--taxonomy", type=click.Path(exists=True), help="Load predicate taxonomy from file (one per line)")
48
76
  @click.option("--taxonomy-threshold", type=float, default=0.5, help="Similarity threshold for taxonomy matching (default: 0.5)")
49
77
  # Device options
50
- @click.option("--device", type=click.Choice(["auto", "cuda", "cpu"]), default="auto", help="Device to use (default: auto)")
78
+ @click.option("--device", type=click.Choice(["auto", "cuda", "mps", "cpu"]), default="auto", help="Device to use (default: auto)")
51
79
  # Output options
52
80
  @click.option("-v", "--verbose", is_flag=True, help="Show verbose output with confidence scores")
53
81
  @click.option("-q", "--quiet", is_flag=True, help="Suppress progress messages")
@@ -64,6 +92,8 @@ def main(
      no_dedup: bool,
      no_embeddings: bool,
      no_merge: bool,
+     no_spacy: bool,
+     all_triples: bool,
      dedup_threshold: float,
      min_confidence: float,
      taxonomy: Optional[str],
@@ -91,6 +121,9 @@ def main(
          json     JSON with full metadata
          xml      Raw XML from model
      """
+     # Configure logging based on verbose flag
+     _configure_logging(verbose)
+
      # Determine output format
      if output_json:
          output = "json"
@@ -132,9 +165,12 @@ def main(
          deduplicate=not no_dedup,
          embedding_dedup=not no_embeddings,
          merge_beams=not no_merge,
+         use_spacy_extraction=not no_spacy,
+         all_triples=all_triples,
          predicate_taxonomy=predicate_taxonomy,
          predicate_config=predicate_config,
          scoring_config=scoring_config,
+         verbose=verbose,
      )

      # Import here to allow --help without loading torch
@@ -160,6 +196,7 @@ def main(
          result = extractor.extract(input_text, options)
          _print_table(result, verbose)
      except Exception as e:
+         logging.exception("Error extracting statements:")
          raise click.ClickException(f"Extraction failed: {e}")


@@ -195,6 +232,9 @@ def _print_table(result, verbose: bool):
          click.echo(f"  {stmt.object.text}{object_type}")

          if verbose:
+             # Always show extraction method
+             click.echo(f"  Method: {stmt.extraction_method.value}")
+
              if stmt.confidence_score is not None:
                  click.echo(f"  Confidence: {stmt.confidence_score:.2f}")