PyPI - extract-cli - Versions diffs - 0.1.0__tar.gz → 0.1.1__tar.gz - Mend

extract-cli 0.1.0 (tar.gz) → 0.1.1 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38)

{extract_cli-0.1.0 → extract_cli-0.1.1}/CHANGELOG.md RENAMED Viewed

@@ -6,6 +6,31 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
 (see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
 the output schema require a major version bump**; new optional fields are minor.
+## [0.1.1] - 2026-05-21
+Real-world hardening, driven by testing against a SEC EDGAR employment
+agreement and the Common Paper Mutual NDA (PDF/DOCX).
+### Added
+- **`numbered` clause-detection tier** for plain numbered headings
+  (`1. Termination`, `Section 3. Payment`, `Article IV. …`) — the dominant
+  format in foreign paper, missed by the H2/bold/ALL-CAPS tiers. A title-case
+  heuristic rejects numbered sentences and list items. The output schema's
+  clause `tier` enum gains `numbered` (a backward-compatible widening).
+### Fixed
+- **PDF reader** now extracts text only from inside `BT … ET` text objects, so
+  embedded fonts, digital-signature blobs, and metadata streams no longer leak
+  binary noise (a real signed PDF dropped from ~188 KB of garbage to ~8.7 KB of
+  clean text). Added a printable-ratio backstop.
+- **Effective date**: anchor on `(the "Effective Date")` and a bare
+  `as of <date>` cue; handle dates that wrap across a line break.
+- **Term length**: require a real number, dropping false positives such as
+  `…consecutive days`.
+- **Title**: skip SGML/XML wrapper lines (e.g. SEC EDGAR `<DOCUMENT>` headers).
+- Strip trailing punctuation from clause titles (`Other Benefits.` →
+  `Other Benefits`).
 ## [0.1.0] - 2026-05-21
 Initial release — the open-loop front door of the contract-ops CLI suite.
@@ -57,4 +82,5 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
   intentionally *not* governed by the output schema (the schema describes the
   full default output).
+[0.1.1]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.1
 [0.1.0]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.0

{extract_cli-0.1.0 → extract_cli-0.1.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: extract-cli
-Version: 0.1.0
+Version: 0.1.1
 Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.docx/.pdf) and emit structured JSON.
 Project-URL: Homepage, https://cli.drbaher.com/
 Project-URL: Repository, https://github.com/DrBaher/extract-cli

{extract_cli-0.1.0 → extract_cli-0.1.1}/docs/spec/extract-output.schema.json RENAMED Viewed

@@ -183,6 +183,7 @@
             "enum": [
               "h2",
               "bold-numbered",
+              "numbered",
               "all-caps",
               "explicit",
               "llm"

{extract_cli-0.1.0 → extract_cli-0.1.1}/extract_cli.py RENAMED Viewed

@@ -42,11 +42,11 @@ import urllib.request
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
-__version__ = "0.1.0"
+__version__ = "0.1.1"
 # Bumped independently of the package version when the *extraction logic*
 # changes in a way downstream consumers should notice. Embedded in `_meta`.
-EXTRACTOR_VERSION = "0.1.0"
+EXTRACTOR_VERSION = "0.1.1"
 # JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
 SCHEMA_VERSION = 1
@@ -214,6 +214,49 @@ def _qualifies_as_all_caps_heading(title: str) -> bool:
     return sum(1 for ch in title if "A" <= ch <= "Z") >= 4
+# Tier between bold-numbered and ALL-CAPS: plain numbered headings on their own
+# line -- "1. Termination", "5. Wage Compensation", "Section 3. Payment",
+# "Article IV. Confidentiality". These are the dominant real-world format in
+# foreign paper (and aren't caught by H2, **bold**, or ALL-CAPS). A title-case
+# heuristic distinguishes a heading from a numbered *sentence* or list item.
+_NUMBERED_HEADING_RE = re.compile(
+    r"^[ \t]*"
+    r"(?:(?:Article|Section|ARTICLE|SECTION)[ \t]+)?"
+    r"(?:" + _ROMAN_RE + r"|\d{1,2})\.?"
+    r"[ \t]+"
+    r"([A-Z][A-Za-z][^\n]{0,58})"
+    r"[ \t]*$",
+    re.MULTILINE,
+)
+# Lowercase words allowed inside an otherwise Title-Cased heading.
+_HEADING_STOPWORDS = {
+    "a", "an", "the", "and", "or", "of", "to", "for", "in", "on", "with",
+    "by", "at", "as", "per", "from", "into", "nor", "but",
+}
+def _qualifies_as_numbered_heading(title: str) -> bool:
+    """A numbered line qualifies as a heading only if its title looks like a
+    heading: 1-9 words, Title-Cased (every word starts uppercase or is a short
+    lowercase connector), no sentence-y lowercase verbs. A single word must be
+    >= 4 letters. Rejects 'The parties agree as follows' but accepts 'Wage
+    Compensation' and 'Term And Nature Of Employment'."""
+    t = title.strip().rstrip(".").strip()
+    words = t.split()
+    if not (1 <= len(words) <= 9):
+        return False
+    if len(words) == 1:
+        return sum(1 for ch in words[0] if ch.isalpha()) >= 4 and words[0][:1].isupper()
+    for w in words:
+        if w[:1].isupper() or not w[:1].isalpha():
+            continue  # capitalized word, or punctuation/number token
+        if w.lower() in _HEADING_STOPWORDS:
+            continue  # allowed connector
+        return False  # a lowercase content word => this is a sentence, not a heading
+    return True
 def detect_clauses(text: str) -> List[JSON]:
     """Run the three-tier cascade and return clauses with their detection tier.
@@ -227,6 +270,12 @@ def detect_clauses(text: str) -> List[JSON]:
     bold = list(_BOLD_HEADING_RE.finditer(text))
     if len(bold) >= 2:
         return _matches_to_clauses(text, bold, group=1, tier="bold-numbered")
+    numbered = [
+        m for m in _NUMBERED_HEADING_RE.finditer(text)
+        if _qualifies_as_numbered_heading(m.group(1))
+    ]
+    if len(numbered) >= 2:
+        return _matches_to_clauses(text, numbered, group=1, tier="numbered")
     caps = [
         m for m in _ALL_CAPS_HEADING_RE.finditer(text)
         if _qualifies_as_all_caps_heading(m.group(1))
@@ -266,8 +315,9 @@ def _matches_to_clauses(text: str, matches: List["re.Match[str]"], group: int,
 def _norm_clause_key(s: str) -> str:
-    """Normalize a clause title/alias for matching (number-stripped, lowercased)."""
-    return _strip_clause_number(s).strip().lower()
+    """Normalize a clause title/alias for matching (number-stripped, trailing
+    punctuation removed, lowercased)."""
+    return _strip_clause_number(s).strip().lower().rstrip(" .:;,")
 # ---------------------------------------------------------------------------
@@ -366,7 +416,7 @@ def _canonicalize_clause(detected_title: str) -> Tuple[Optional[str], bool]:
                 best, best_len = canonical, len(alias_key)
     if best is not None:
         return best, True
-    return _titlecase(detected_title), False
+    return _titlecase(detected_title.strip().rstrip(" .:;,")), False
 # ---------------------------------------------------------------------------
@@ -421,11 +471,17 @@ _DATE_PAT = (
 )
 _DATE_RE = re.compile(_DATE_PAT, re.IGNORECASE)
+# Highest-confidence: a date explicitly labeled "(the "Effective Date")".
+_EFFDATE_LABEL_RE = re.compile(
+    r"(" + _DATE_PAT + r")\s*\(\s*(?:the\s+)?[\"“]?\s*Effective\s+Date",
+    re.IGNORECASE,
+)
 _EFFECTIVE_RE = re.compile(
     r"(?:effective(?:\s+date)?(?:\s+(?:as\s+of|date|on))?|"
     r"dated(?:\s+as\s+of)?|"
     r"made(?:\s+and\s+entered\s+into)?(?:\s+as\s+of|\s+on)?|"
-    r"entered\s+into(?:\s+as\s+of|\s+on)?)"
+    r"entered\s+into(?:\s+as\s+of|\s+on)?|"
+    r"as\s+of)"
     r"[\s:,]+(?:the\s+)?(" + _DATE_PAT + r")",
     re.IGNORECASE,
 )
@@ -534,14 +590,18 @@ def _parse_date_to_iso(s: str) -> Optional[str]:
     return None
+def _date_field_from_str(raw: str, base_conf: float) -> JSON:
+    raw = re.sub(r"\s+", " ", raw.strip())
+    iso = _parse_date_to_iso(raw)
+    if iso is not None:
+        return _field(iso, base_conf)
+    return _field(raw, max(0.0, base_conf - 0.3))
 def _date_field(match: Optional["re.Match[str]"]) -> JSON:
     if match is None:
         return _none_field()
-    raw = match.group(1).strip()
-    iso = _parse_date_to_iso(raw)
-    if iso is not None:
-        return _field(iso, 0.85)
-    return _field(raw, 0.55)
+    return _date_field_from_str(match.group(1), 0.85)
 def _split_name_role(s: str) -> Tuple[str, Optional[str]]:
@@ -578,10 +638,12 @@ def extract_parties(text: str) -> List[JSON]:
 def extract_dates(text: str) -> JSON:
-    return {
-        "effective": _date_field(_EFFECTIVE_RE.search(text)),
-        "expiration": _date_field(_EXPIRE_RE.search(text)),
-    }
+    label = _EFFDATE_LABEL_RE.search(text)
+    if label is not None:
+        effective = _date_field_from_str(label.group(1), 0.9)
+    else:
+        effective = _date_field(_EFFECTIVE_RE.search(text))
+    return {"effective": effective, "expiration": _date_field(_EXPIRE_RE.search(text))}
 def extract_governing_law(text: str) -> JSON:
@@ -600,10 +662,10 @@ def extract_term(text: str) -> JSON:
     if m:
         num = _word_to_int(m.group(1))
         unit = m.group(2).lower().rstrip("s")
+        # Only emit when the captured token is a real number; otherwise the
+        # match was a coincidence ("...consecutive days") -> leave as not-found.
         if num is not None:
             length = _field(f"{num} {unit}{'s' if num != 1 else ''}", 0.7)
-        else:
-            length = _field(f"{m.group(1)} {m.group(2)}".strip(), 0.5)
     notice = _none_field()
     nm = _NOTICE_RE.search(text)
@@ -649,7 +711,8 @@ def extract_clauses(text: str) -> List[JSON]:
     for c in detect_clauses(text):
         canonical, mapped = _canonicalize_clause(c["title"])
         tier = c["tier"]
-        base = {"h2": 0.95, "bold-numbered": 0.85, "all-caps": 0.75, "explicit": 0.95}.get(tier, 0.7)
+        base = {"h2": 0.95, "bold-numbered": 0.85, "numbered": 0.8,
+                "all-caps": 0.75, "explicit": 0.95}.get(tier, 0.7)
         conf = round(base * (1.0 if mapped else 0.75), 2)
         out.append({
             "canonical_title": canonical,
@@ -669,10 +732,14 @@ def extract_title(text: str, path: Optional[Path], fmt: str) -> Optional[str]:
         return m.group(1).strip()
     for line in text.splitlines():
         ls = line.strip().lstrip("#").strip()
-        if ls:
-            if len(ls) <= 90:
-                return ls
-            break
+        if not ls:
+            continue
+        # Skip SGML/XML wrapper lines (e.g. SEC EDGAR "<DOCUMENT>", "<TYPE>...").
+        if ls.startswith("<"):
+            continue
+        if len(ls) <= 90:
+            return ls
+        break
     if path is not None:
         return _titlecase(path.stem.replace("_", " ").replace("-", " "))
     return None
@@ -834,9 +901,15 @@ def _pdf_unescape(s: str) -> str:
 def _pdf_text_from_content(content: bytes) -> str:
+    """Pull text strings from a PDF content stream, but ONLY from inside text
+    objects (`BT` ... `ET`). Real text lives there; embedded fonts, images,
+    digital-signature blobs and metadata streams have no BT/ET, so gating on it
+    keeps their binary bytes (which often contain stray `(...)` sequences) out
+    of the output -- essential for real signed/font-embedded PDFs."""
     s = content.decode("latin-1", "replace")
     lines: List[str] = []
     cur: List[str] = []
+    in_text = False
     def flush() -> None:
         if cur:
@@ -845,17 +918,34 @@ def _pdf_text_from_content(content: bytes) -> str:
     for m in _PDF_TOKEN_RE.finditer(s):
         tok = m.group(0)
-        if tok.startswith("("):
+        if tok == "BT":
+            flush()
+            in_text = True
+        elif tok == "ET":
+            flush()
+            in_text = False
+        elif not in_text:
+            continue
+        elif tok.startswith("("):
             cur.append(_pdf_unescape(tok[1:-1]))
         elif tok.startswith("["):
             for sm in re.finditer(r"\((?:\\.|[^\\()])*\)", tok):
                 cur.append(_pdf_unescape(sm.group(0)[1:-1]))
-        elif tok in ("Td", "TD", "T*", "'", '"', "BT", "ET"):
+        elif tok in ("Td", "TD", "T*", "'", '"'):
             flush()
     flush()
     return "\n".join(lines)
+def _mostly_printable(s: str) -> bool:
+    """True if `s` is overwhelmingly printable text (backstop against a
+    malformed stream slipping binary through the BT/ET gate)."""
+    if not s:
+        return False
+    printable = sum(1 for ch in s if ch in "\n\t" or 32 <= ord(ch) < 127 or ord(ch) > 160)
+    return printable / len(s) >= 0.85
 def _read_pdf_stdlib(raw: bytes) -> str:
     import zlib
@@ -873,9 +963,11 @@ def _read_pdf_stdlib(raw: bytes) -> str:
             content = zlib.decompress(body)
         except Exception:
             content = body
-        chunks.append(_pdf_text_from_content(content))
+        piece = _pdf_text_from_content(content)
+        if piece.strip() and _mostly_printable(piece):
+            chunks.append(piece)
         idx = e + len(b"endstream")
-    return "\n".join(c for c in chunks if c.strip())
+    return "\n".join(chunks)
 def load_source(path: Path, prefer_optional: bool = True) -> Tuple[bytes, str, str, List[str]]:
@@ -1293,7 +1385,7 @@ def output_schema() -> JSON:
                     "properties": {
                         "canonical_title": {"type": ["string", "null"]},
                         "detected_title": {"type": "string"},
-                        "tier": {"enum": ["h2", "bold-numbered", "all-caps", "explicit", "llm"]},
+                        "tier": {"enum": ["h2", "bold-numbered", "numbered", "all-caps", "explicit", "llm"]},
                         "span": {
                             "type": "object",
                             "required": ["start", "end"],

{extract_cli-0.1.0 → extract_cli-0.1.1}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "extract-cli"
-version = "0.1.0"
+version = "0.1.1"
 description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.docx/.pdf) and emit structured JSON."
 readme = "README.md"
 requires-python = ">=3.9"

{extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/employment_docx.docx.expected.json RENAMED Viewed

@@ -138,7 +138,7 @@
     "source": "deterministic"
   },
   "_meta": {
-    "extractor_version": "0.1.0",
+    "extractor_version": "0.1.1",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/lease_allcaps.txt.expected.json RENAMED Viewed

@@ -133,7 +133,7 @@
     "source": "deterministic"
   },
   "_meta": {
-    "extractor_version": "0.1.0",
+    "extractor_version": "0.1.1",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/license_pdf.pdf.expected.json RENAMED Viewed

@@ -133,7 +133,7 @@
     "source": "deterministic"
   },
   "_meta": {
-    "extractor_version": "0.1.0",
+    "extractor_version": "0.1.1",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/nda_h2.md.expected.json RENAMED Viewed

@@ -138,7 +138,7 @@
     "source": "none"
   },
   "_meta": {
-    "extractor_version": "0.1.0",
+    "extractor_version": "0.1.1",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/scanned.pdf.expected.json RENAMED Viewed

@@ -48,7 +48,7 @@
     "source": "none"
   },
   "_meta": {
-    "extractor_version": "0.1.0",
+    "extractor_version": "0.1.1",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.0 → extract_cli-0.1.1}/tests/fixtures/services_bold.txt.expected.json RENAMED Viewed

@@ -133,7 +133,7 @@
     "source": "deterministic"
   },
   "_meta": {
-    "extractor_version": "0.1.0",
+    "extractor_version": "0.1.1",
     "tiers_used": [
       "deterministic"
     ],

{extract_cli-0.1.0 → extract_cli-0.1.1}/tests/test_clause_map.py RENAMED Viewed

@@ -25,6 +25,50 @@ def test_tier3_all_caps() -> None:
     assert [c["tier"] for c in clauses] == ["all-caps", "all-caps"]
+def test_tier_numbered_plain_headings() -> None:
+    # Real-world dominant format: plain numbered, mixed-case, unbolded headings.
+    text = ("1. Term And Nature Of Employment\n\nbody about term\n\n"
+            "2. Wage Compensation\n\nbody about wages\n\n"
+            "5. Termination\n\nbody about termination")
+    clauses = ex.detect_clauses(text)
+    assert [c["tier"] for c in clauses] == ["numbered", "numbered", "numbered"]
+    assert clauses[0]["title"] == "Term And Nature Of Employment"
+    assert clauses[2]["title"] == "Termination"
+def test_numbered_heading_rejects_sentences() -> None:
+    # "1. The Company shall pay..." is a numbered sentence, not a heading.
+    assert ex._qualifies_as_numbered_heading("Wage Compensation")
+    assert ex._qualifies_as_numbered_heading("Term And Nature Of Employment")
+    assert ex._qualifies_as_numbered_heading("Termination")
+    assert not ex._qualifies_as_numbered_heading("The Company shall pay the Employee monthly")
+    assert not ex._qualifies_as_numbered_heading("Fee")  # single word < 4 letters
+    assert not ex._qualifies_as_numbered_heading(
+        "EMPLOYEE shall be compensated on the basis of an annual salary")
+def test_numbered_section_article_prefixes() -> None:
+    text = ("Section 1. Definitions\n\nx\n\nSection 2. Confidentiality\n\ny\n\n"
+            "Article IV. Governing Law\n\nz")
+    clauses = ex.detect_clauses(text)
+    assert all(c["tier"] == "numbered" for c in clauses)
+    assert clauses[0]["title"] == "Definitions"
+    assert clauses[2]["title"] == "Governing Law"
+def test_numbered_does_not_shadow_bold() -> None:
+    # Bold-numbered must win over plain-numbered when both could match.
+    text = "**1. Purpose**\n\nx\n\n**2. Scope**\n\ny"
+    assert all(c["tier"] == "bold-numbered" for c in ex.detect_clauses(text))
+def test_trailing_period_stripped_from_titles() -> None:
+    canon, mapped = ex._canonicalize_clause("Other Benefits.")
+    assert canon == "Other Benefits"
+    # And a mapped clause with a trailing period still maps.
+    assert ex._canonicalize_clause("Survival.") == ("Survival", True)
 def test_cascade_priority_h2_wins() -> None:
     # An H2 present means the bold/all-caps fallbacks must not fire.
     text = "## Real Heading\n\n**1. Not A Heading**\n\nALSO NOT A HEADING\n\nbody"

{extract_cli-0.1.0 → extract_cli-0.1.1}/tests/test_cli.py RENAMED Viewed

@@ -22,7 +22,7 @@ def test_version(capsys: pytest.CaptureFixture[str]) -> None:
     with pytest.raises(SystemExit) as exc:
         ex.main(["--version"])
     assert exc.value.code == 0
-    assert "extract-cli 0.1.0" in capsys.readouterr().out
+    assert f"extract-cli {ex.__version__}" in capsys.readouterr().out
 def test_demo_runs(capsys: pytest.CaptureFixture[str]) -> None:

{extract_cli-0.1.0 → extract_cli-0.1.1}/tests/test_deterministic.py RENAMED Viewed

@@ -39,6 +39,25 @@ def test_dates_iso_normalization() -> None:
         assert out["source"] == "deterministic"
+def test_dates_effective_date_label_and_as_of() -> None:
+    # The "(the "Effective Date")" anchor, with the date wrapping a newline.
+    text = 'between A and B as of August\n31, 2016 (the "Effective Date").'
+    assert ex.extract_dates(text)["effective"]["value"] == "2016-08-31"
+    # Bare "as of <date>" cue.
+    assert ex.extract_dates("dated as of June 1, 2023")["effective"]["value"] == "2023-06-01"
+def test_term_length_rejects_non_number() -> None:
+    # "...for consecutive days" must NOT be reported as a term length.
+    text = "the Employment Period shall run for consecutive days as scheduled"
+    assert ex.extract_term(text)["length"]["source"] == "none"
+def test_title_skips_sgml_wrapper() -> None:
+    text = "<DOCUMENT>\n<TYPE>EX-10\n<TEXT>\n\nEMPLOYMENT AGREEMENT\n\nbody"
+    assert ex.extract_title(text, None, "text") == "EMPLOYMENT AGREEMENT"
 def test_dates_missing() -> None:
     out = ex.extract_dates("no dates in here")
     assert out["effective"] == ex._none_field()

{extract_cli-0.1.0 → extract_cli-0.1.1}/tests/test_misc.py RENAMED Viewed

@@ -142,6 +142,19 @@ def test_pdf_unescape() -> None:
     assert ex._pdf_unescape(r"\101\102") == "AB"  # octal escapes
+def test_pdf_text_only_inside_bt_et() -> None:
+    # Strings outside BT/ET (font/signature/metadata stream bytes that happen to
+    # contain parentheses) must be ignored; only text objects yield text.
+    content = b"(garbage outside) /Font << >> BT (real text) Tj ET (more garbage)"
+    assert ex._pdf_text_from_content(content) == "real text"
+def test_pdf_mostly_printable_backstop() -> None:
+    assert ex._mostly_printable("Hello, world")
+    assert not ex._mostly_printable("\x00\x01\x02\x03\x04\x05\x06\x07")
+    assert not ex._mostly_printable("")
 def test_extract_json_object_from_noise() -> None:
     assert ex._extract_json_object('prefix {"a": 1} suffix') == {"a": 1}
     assert ex._extract_json_object("no json here") is None