extract-cli 0.1.1__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. {extract_cli-0.1.1 → extract_cli-0.1.2}/ARCHITECTURE.md +3 -0
  2. {extract_cli-0.1.1 → extract_cli-0.1.2}/CHANGELOG.md +34 -0
  3. {extract_cli-0.1.1 → extract_cli-0.1.2}/PKG-INFO +8 -7
  4. {extract_cli-0.1.1 → extract_cli-0.1.2}/README.md +6 -5
  5. {extract_cli-0.1.1 → extract_cli-0.1.2}/docs/spec/extract-output.schema.json +2 -1
  6. {extract_cli-0.1.1 → extract_cli-0.1.2}/extract_cli.py +143 -29
  7. {extract_cli-0.1.1 → extract_cli-0.1.2}/pyproject.toml +2 -2
  8. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/_make_goldens.py +2 -1
  9. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/conftest.py +1 -0
  10. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/employment_docx.docx.expected.json +1 -1
  11. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
  12. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
  13. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/nda_h2.md.expected.json +6 -1
  14. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/scanned.pdf.expected.json +1 -1
  15. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/services_bold.txt.expected.json +1 -1
  16. extract_cli-0.1.2/tests/fixtures/services_html.html +35 -0
  17. extract_cli-0.1.2/tests/fixtures/services_html.html.expected.json +157 -0
  18. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/test_deterministic.py +35 -2
  19. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/test_misc.py +26 -0
  20. {extract_cli-0.1.1 → extract_cli-0.1.2}/.gitignore +0 -0
  21. {extract_cli-0.1.1 → extract_cli-0.1.2}/CONTRIBUTING.md +0 -0
  22. {extract_cli-0.1.1 → extract_cli-0.1.2}/LICENSE +0 -0
  23. {extract_cli-0.1.1 → extract_cli-0.1.2}/Makefile +0 -0
  24. {extract_cli-0.1.1 → extract_cli-0.1.2}/config/llm.json.example +0 -0
  25. {extract_cli-0.1.1 → extract_cli-0.1.2}/docs/INTEROP.md +0 -0
  26. {extract_cli-0.1.1 → extract_cli-0.1.2}/scripts/release.py +0 -0
  27. {extract_cli-0.1.1 → extract_cli-0.1.2}/scripts/validate_against_spec.py +0 -0
  28. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/_fixtures_build.py +0 -0
  29. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/_schema_validator.py +0 -0
  30. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/employment_docx.docx +0 -0
  31. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/lease_allcaps.txt +0 -0
  32. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/license_pdf.pdf +0 -0
  33. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/nda_h2.md +0 -0
  34. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/scanned.pdf +0 -0
  35. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/services_bold.txt +0 -0
  36. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/test_clause_map.py +0 -0
  37. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/test_cli.py +0 -0
  38. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/test_llm.py +0 -0
  39. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/test_property.py +0 -0
  40. {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/test_schema_conformance.py +0 -0
@@ -8,11 +8,14 @@ map.
8
8
  ```
9
9
  load_source(path) extension/content sniff → reader
10
10
  ├─ .md/.txt → utf-8 decode
11
+ ├─ .html → stdlib html.parser reader (also auto-detected inside .txt)
11
12
  ├─ .docx → python-docx (if [docx]) else stdlib zipfile/XML reader
12
13
  └─ .pdf → pypdf (if [pdf]) else stdlib zlib + text-operator reader
13
14
 
14
15
  ▼ (raw_bytes, text, format, warnings)
15
16
  build_extraction(text, raw, fmt, src) the DETERMINISTIC tier (always on)
17
+ │ field extractors run on a whitespace-FLATTENED copy (so values that wrap
18
+ │ across a line are matched whole); clause detection keeps the original text
16
19
  ├─ extract_parties "between X and Y", with role parentheticals
17
20
  ├─ extract_dates effective / expiration, ISO-normalized
18
21
  ├─ extract_term length / auto_renew / notice_period_days
@@ -6,6 +6,39 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
6
6
  (see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
7
7
  the output schema require a major version bump**; new optional fields are minor.
8
8
 
9
+ ## [0.1.2] - 2026-05-21
10
+
11
+ More real-world hardening, driven by testing against five additional contracts
12
+ (SEC EDGAR consulting/MSA, lease, and Visteon services agreements; Common Paper
13
+ and Perigon Cloud Service Agreements).
14
+
15
+ ### Added
16
+ - **HTML input** (`.html`/`.htm`, and HTML auto-detected inside `.txt` such as
17
+ SEC EDGAR full submissions). Stdlib `html.parser`-based reader strips
18
+ script/style, frames block elements so heading detection still works, and
19
+ unescapes entities. `document.format` enum gains `html` (backward-compatible
20
+ widening). This turns the large class of HTML contracts (SEC exhibits, web
21
+ ToS) from garbage into structured output.
22
+
23
+ ### Fixed
24
+ - **Field extraction now runs on whitespace-flattened text**, so values that
25
+ wrap across a line break are matched whole — e.g. governing law
26
+ `the laws of the Province\nof Ontario` now yields `Province of Ontario`, and
27
+ line-wrapped party names/defined terms are captured.
28
+ - **Party extraction** (continues issue #2): names are trimmed of trailing
29
+ descriptors (`, a Delaware corporation`, `doing business as …`,
30
+ `having its offices at …`, `as of …`), and each party must begin with a
31
+ capital so an `and` *inside* a party's own description no longer splits the
32
+ parties (`…V6E 3S7 and doing business as …` → real parties recovered).
33
+
34
+ ### Known limitations (documented, not bugs)
35
+ - The stdlib PDF reader cannot decode PDFs that use embedded subset fonts with
36
+ hex-encoded glyph strings (common in professionally-typeset PDFs); these
37
+ degrade gracefully to a low-signal warning. Install the `[pdf]` extra (pypdf)
38
+ for them — verified to recover full text and clause structure.
39
+ - Two-line `ARTICLE N` / title headings (number on one line, title on the next)
40
+ are not yet detected.
41
+
9
42
  ## [0.1.1] - 2026-05-21
10
43
 
11
44
  Real-world hardening, driven by testing against a SEC EDGAR employment
@@ -82,5 +115,6 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
82
115
  intentionally *not* governed by the output schema (the schema describes the
83
116
  full default output).
84
117
 
118
+ [0.1.2]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.2
85
119
  [0.1.1]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.1
86
120
  [0.1.0]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-cli
3
- Version: 0.1.1
4
- Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.docx/.pdf) and emit structured JSON.
3
+ Version: 0.1.2
4
+ Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
5
5
  Project-URL: Homepage, https://cli.drbaher.com/
6
6
  Project-URL: Repository, https://github.com/DrBaher/extract-cli
7
7
  Project-URL: Suite interop, https://github.com/DrBaher/extract-cli/blob/main/docs/INTEROP.md
@@ -63,8 +63,8 @@ ingest (extract) → review → diff → convert → sign
63
63
 
64
64
  ## What it does
65
65
 
66
- Give it a contract in **`.md` / `.txt`** (native), **`.docx`**, or **`.pdf`**,
67
- and it returns structured JSON: the parties, dates, term, governing law, a
66
+ Give it a contract in **`.md` / `.txt` / `.html`** (native), **`.docx`**, or
67
+ **`.pdf`**, and it returns structured JSON: the parties, dates, term, governing law, a
68
68
  **clause map** normalized onto the suite's canonical clause vocabulary, a
69
69
  defined-term inventory, and a headline value. Every field carries a
70
70
  `confidence` and a `source` so downstream tools **verify, don't trust**.
@@ -75,14 +75,15 @@ daemon, no network in the default path.
75
75
  ## Install
76
76
 
77
77
  ```bash
78
- pip install extract-cli # core: .md/.txt + best-effort .docx/.pdf
78
+ pip install extract-cli # core: .md/.txt/.html + best-effort .docx/.pdf
79
79
  pip install "extract-cli[docx]" # higher-fidelity .docx (python-docx)
80
80
  pip install "extract-cli[pdf]" # higher-fidelity .pdf (pypdf)
81
81
  pip install "extract-cli[docx,pdf]" # both
82
82
  ```
83
83
 
84
- The core has **zero runtime dependencies** and is fully functional on `.md`/`.txt`
85
- with no extras. `.docx` and `.pdf` work out of the box via stdlib readers; the
84
+ The core has **zero runtime dependencies** and is fully functional on
85
+ `.md`/`.txt`/`.html` with no extras (HTML is also auto-detected when it hides
86
+ inside a `.txt`, e.g. SEC EDGAR filings). `.docx` and `.pdf` work out of the box via stdlib readers; the
86
87
  `[docx]`/`[pdf]` extras improve fidelity on complex documents (see
87
88
  [ARCHITECTURE.md](ARCHITECTURE.md)).
88
89
 
@@ -25,8 +25,8 @@ ingest (extract) → review → diff → convert → sign
25
25
 
26
26
  ## What it does
27
27
 
28
- Give it a contract in **`.md` / `.txt`** (native), **`.docx`**, or **`.pdf`**,
29
- and it returns structured JSON: the parties, dates, term, governing law, a
28
+ Give it a contract in **`.md` / `.txt` / `.html`** (native), **`.docx`**, or
29
+ **`.pdf`**, and it returns structured JSON: the parties, dates, term, governing law, a
30
30
  **clause map** normalized onto the suite's canonical clause vocabulary, a
31
31
  defined-term inventory, and a headline value. Every field carries a
32
32
  `confidence` and a `source` so downstream tools **verify, don't trust**.
@@ -37,14 +37,15 @@ daemon, no network in the default path.
37
37
  ## Install
38
38
 
39
39
  ```bash
40
- pip install extract-cli # core: .md/.txt + best-effort .docx/.pdf
40
+ pip install extract-cli # core: .md/.txt/.html + best-effort .docx/.pdf
41
41
  pip install "extract-cli[docx]" # higher-fidelity .docx (python-docx)
42
42
  pip install "extract-cli[pdf]" # higher-fidelity .pdf (pypdf)
43
43
  pip install "extract-cli[docx,pdf]" # both
44
44
  ```
45
45
 
46
- The core has **zero runtime dependencies** and is fully functional on `.md`/`.txt`
47
- with no extras. `.docx` and `.pdf` work out of the box via stdlib readers; the
46
+ The core has **zero runtime dependencies** and is fully functional on
47
+ `.md`/`.txt`/`.html` with no extras (HTML is also auto-detected when it hides
48
+ inside a `.txt`, e.g. SEC EDGAR filings). `.docx` and `.pdf` work out of the box via stdlib readers; the
48
49
  `[docx]`/`[pdf]` extras improve fidelity on complex documents (see
49
50
  [ARCHITECTURE.md](ARCHITECTURE.md)).
50
51
 
@@ -69,7 +69,8 @@
69
69
  "markdown",
70
70
  "text",
71
71
  "docx",
72
- "pdf"
72
+ "pdf",
73
+ "html"
73
74
  ]
74
75
  },
75
76
  "sha256": {
@@ -4,8 +4,8 @@
4
4
  The suite is a contract lifecycle (store -> draft -> review -> diff -> convert
5
5
  -> sign) that, until now, only handled documents it authored from its own
6
6
  templates. `extract-cli` is "passport control": it ingests ANY document --
7
- yours or a counterparty's foreign paper -- in .md/.txt (natively), .docx, or
8
- .pdf, and emits a structured JSON representation that the rest of the suite
7
+ yours or a counterparty's foreign paper -- in .md/.txt/.html (natively), .docx,
8
+ or .pdf, and emits a structured JSON representation that the rest of the suite
9
9
  (nda-review-cli, compare-cli, contract-vault) consumes.
10
10
 
11
11
  Two extraction tiers:
@@ -32,6 +32,7 @@ from __future__ import annotations
32
32
  import argparse
33
33
  import datetime as _dt
34
34
  import hashlib
35
+ import html.parser
35
36
  import importlib.util
36
37
  import json
37
38
  import os
@@ -42,11 +43,11 @@ import urllib.request
42
43
  from pathlib import Path
43
44
  from typing import Any, Dict, List, Optional, Tuple
44
45
 
45
- __version__ = "0.1.1"
46
+ __version__ = "0.1.2"
46
47
 
47
48
  # Bumped independently of the package version when the *extraction logic*
48
49
  # changes in a way downstream consumers should notice. Embedded in `_meta`.
49
- EXTRACTOR_VERSION = "0.1.1"
50
+ EXTRACTOR_VERSION = "0.1.2"
50
51
 
51
52
  # JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
52
53
  SCHEMA_VERSION = 1
@@ -492,10 +493,17 @@ _EXPIRE_RE = re.compile(
492
493
  re.IGNORECASE,
493
494
  )
494
495
 
496
+ # Each party must start with a capital letter (optionally "the X"), a quote, or
497
+ # a paren. This is case-sensitive on purpose (no global IGNORECASE -- only the
498
+ # keywords are): it lets the engine skip an "and" that sits INSIDE a party's own
499
+ # description ("...V6E 3S7 and doing business as ...", where the right side
500
+ # starts lowercase) and find the real "and" before the second named entity.
501
+ _PARTY_START = r"(?:(?:[Tt]he|its)\s+)?[A-Z\"“(]"
495
502
  _PARTY_BLOCK_RE = re.compile(
496
- r"\b(?:by\s+and\s+between|between)\s+(.{2,200}?)\s+\band\b\s+(.{2,200}?)"
497
- r"(?=[\.;\n]|\bwhereas\b|\beffective\b|\bdated\b|\bhaving\b|\bwith\s+offices\b|$)",
498
- re.IGNORECASE | re.DOTALL,
503
+ r"(?i:\b(?:by\s+and\s+between|between)\s+)"
504
+ r"(" + _PARTY_START + r"[^\n]{1,200}?)\s+and\s+"
505
+ r"(" + _PARTY_START + r"[^\n]{1,200}?)"
506
+ r"(?=[\.;\n]|(?i:\bwhereas\b|\beffective\b|\bdated\b|\bas\s+of\b|\bwitnesseth\b)|$)",
499
507
  )
500
508
  _ROLE_PAREN_RE = re.compile(
501
509
  r"\(\s*(?:the\s+)?[\"“]?([^\"”()]+?)[\"”]?\s*\)"
@@ -604,8 +612,40 @@ def _date_field(match: Optional["re.Match[str]"]) -> JSON:
604
612
  return _date_field_from_str(match.group(1), 0.85)
605
613
 
606
614
 
615
+ # Trailing descriptors that follow a party's actual name and should be dropped
616
+ # ("Acme Corp., a Delaware corporation", "... doing business as Foo", "... as of
617
+ # March 1", "... having its offices at ..."). Each is matched and everything from
618
+ # it onward is cut.
619
+ _PARTY_CUT_MARKERS: Tuple[str, ...] = (
620
+ r",\s+an?\s+\w", # ", a Delaware ..." / ", an Ohio ..."
621
+ r"\s+doing\s+business\s+as\b",
622
+ r"\s+d/?b/?a\b",
623
+ r"\s+f/?k/?a\b",
624
+ r"\s+a[n]?\s+\w+\s+(?:corporation|company|partnership|limited)\b",
625
+ r"\s+having\b",
626
+ r"\s+with\s+(?:its\s+)?(?:offices|principal|a\s)\b",
627
+ r"\s+with\s+offices\b",
628
+ r"\s+located\b",
629
+ r"\s+organized\b",
630
+ r"\s+incorporated\b",
631
+ r"\s+whose\b",
632
+ r"\s+(?:as\s+of|dated|effective)\b",
633
+ )
634
+
635
+
636
+ def _clean_party_name(s: str) -> str:
637
+ """Trim a captured party name down to the entity name, dropping trailing
638
+ descriptors ('a Delaware corporation', 'd/b/a ...', 'as of ...')."""
639
+ s = re.sub(r"\s+", " ", s).strip().strip(",").strip()
640
+ for pat in _PARTY_CUT_MARKERS:
641
+ m = re.search(pat, s, re.IGNORECASE)
642
+ if m:
643
+ s = s[: m.start()].strip().strip(",").strip()
644
+ return s.strip("\"“”").strip()
645
+
646
+
607
647
  def _split_name_role(s: str) -> Tuple[str, Optional[str]]:
608
- s = s.strip().strip(",").strip()
648
+ s = re.sub(r"\s+", " ", s).strip().strip(",").strip()
609
649
  role: Optional[str] = None
610
650
  m = _ROLE_PAREN_RE.search(s)
611
651
  if m:
@@ -614,9 +654,7 @@ def _split_name_role(s: str) -> Tuple[str, Optional[str]]:
614
654
  if len(candidate) <= 40 and candidate.lower() not in ("a", "an", "the"):
615
655
  role = candidate
616
656
  s = (s[: m.start()] + s[m.end():]).strip().rstrip(",").strip()
617
- s = s.strip("\"“”").strip()
618
- s = re.sub(r"\s+", " ", s)
619
- return s, role
657
+ return _clean_party_name(s), role
620
658
 
621
659
 
622
660
  def extract_parties(text: str) -> List[JSON]:
@@ -625,9 +663,6 @@ def extract_parties(text: str) -> List[JSON]:
625
663
  return []
626
664
  out: List[JSON] = []
627
665
  for raw in (m.group(1), m.group(2)):
628
- # Party names can wrap across lines ("...(the \"Disclosing\nParty\")");
629
- # collapse whitespace rather than truncating at the first newline.
630
- raw = re.sub(r"\s+", " ", raw).strip()
631
666
  name, role = _split_name_role(raw)
632
667
  if not name or len(name) < 2 or len(name) > 120:
633
668
  continue
@@ -750,21 +785,91 @@ def extract_title(text: str, path: Optional[Path], fmt: str) -> Optional[str]:
750
785
  # ---------------------------------------------------------------------------
751
786
 
752
787
 
788
+ def _looks_like_html(head: str) -> bool:
789
+ """Heuristic: does this text look like HTML? Catches HTML masquerading as
790
+ .txt (e.g. SEC EDGAR full submissions wrap HTML exhibits in a .txt)."""
791
+ low = head.lower()
792
+ if "<!doctype html" in low or "<html" in low or "<body" in low:
793
+ return True
794
+ return len(re.findall(r"</?(?:p|div|table|tr|td|span|br|h[1-6]|font|b|i)\b", low)) >= 6
795
+
796
+
753
797
  def _detect_format(path: Path, raw: bytes) -> str:
754
798
  ext = path.suffix.lower()
755
- if ext in (".md", ".markdown"):
756
- return "markdown"
757
- if ext == ".txt":
758
- return "text"
799
+ if ext in (".htm", ".html", ".xhtml"):
800
+ return "html"
759
801
  if ext == ".docx":
760
802
  return "docx"
761
803
  if ext == ".pdf":
762
804
  return "pdf"
763
805
  if raw[:4] == b"%PDF":
764
806
  return "pdf"
765
- if raw[:2] == b"PK":
807
+ if raw[:2] == b"PK" and ext not in (".md", ".markdown", ".txt"):
766
808
  return "docx"
767
- return "text"
809
+ base = "markdown" if ext in (".md", ".markdown") else "text"
810
+ # Content sniff: HTML hiding inside a .txt/.md (or extensionless) file.
811
+ if _looks_like_html(raw[:4096].decode("utf-8", "replace")):
812
+ return "html"
813
+ return base
814
+
815
+
816
+ class _HTMLTextExtractor(html.parser.HTMLParser):
817
+ """Stdlib HTML -> text: drops script/style, frames block elements with blank
818
+ lines (so clause-heading detection still works), and unescapes entities."""
819
+
820
+ _SKIP = {"script", "style", "head", "title", "meta", "link", "noscript"}
821
+ _BLOCK = {
822
+ "p", "div", "br", "tr", "li", "h1", "h2", "h3", "h4", "h5", "h6",
823
+ "section", "article", "table", "ul", "ol", "blockquote", "pre", "hr",
824
+ "thead", "tbody", "header", "footer", "main",
825
+ }
826
+
827
+ def __init__(self) -> None:
828
+ super().__init__(convert_charrefs=True)
829
+ self._parts: List[str] = []
830
+ self._skip = 0
831
+
832
+ def handle_starttag(self, tag: str, attrs: Any) -> None:
833
+ if tag in self._SKIP:
834
+ self._skip += 1
835
+ elif tag in self._BLOCK:
836
+ self._parts.append("\n")
837
+
838
+ def handle_endtag(self, tag: str) -> None:
839
+ if tag in self._SKIP and self._skip > 0:
840
+ self._skip -= 1
841
+ elif tag in self._BLOCK:
842
+ self._parts.append("\n")
843
+
844
+ def handle_data(self, data: str) -> None:
845
+ if self._skip == 0:
846
+ self._parts.append(data)
847
+
848
+ def get_text(self) -> str:
849
+ # Strip each line; collapse runs of blank lines to a single blank line
850
+ # (gives ALL-CAPS / numbered headings their blank-line frame).
851
+ lines = [re.sub(r"[ \t]+", " ", ln).strip() for ln in "".join(self._parts).split("\n")]
852
+ out: List[str] = []
853
+ blank = False
854
+ for ln in lines:
855
+ if ln:
856
+ out.append(ln)
857
+ blank = False
858
+ elif not blank:
859
+ out.append("")
860
+ blank = True
861
+ return "\n".join(out).strip()
862
+
863
+
864
def _read_html(raw_text: str) -> str:
    """Convert HTML markup to plain text.

    Args:
        raw_text: Decoded HTML source.

    Returns:
        The text produced by ``_HTMLTextExtractor``; if parsing raises, the
        input with every ``<...>`` tag replaced by a single space instead.
    """
    extractor = _HTMLTextExtractor()
    try:
        extractor.feed(raw_text)
        extractor.close()
    except Exception:
        # Never crash on malformed markup; fall back to a crude tag strip.
        return re.sub(r"<[^>]+>", " ", raw_text)
    else:
        return extractor.get_text()
768
873
 
769
874
 
770
875
  def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[str, List[str]]:
@@ -986,6 +1091,8 @@ def load_source(path: Path, prefer_optional: bool = True) -> Tuple[bytes, str, s
986
1091
  warnings: List[str] = []
987
1092
  if fmt in ("markdown", "text"):
988
1093
  text = raw.decode("utf-8", "replace")
1094
+ elif fmt == "html":
1095
+ text = _read_html(raw.decode("utf-8", "replace"))
989
1096
  elif fmt == "docx":
990
1097
  text, w = _read_docx(path, raw, prefer_optional)
991
1098
  warnings += w
@@ -1011,6 +1118,13 @@ def build_extraction(text: str, raw: bytes, fmt: str,
1011
1118
  source_path: Optional[str]) -> JSON:
1012
1119
  """Run the deterministic tier and assemble the output contract object."""
1013
1120
  sha = hashlib.sha256(raw).hexdigest()
1121
+ # Field extractors (parties, dates, governing law, term, value, defined
1122
+ # terms) run on a whitespace-flattened copy so values that wrap across a
1123
+ # line break in the source -- "...laws of the Province\nof Ontario", a party
1124
+ # name split mid-line -- are matched whole. Clause detection and the title
1125
+ # keep the original text, which depends on line structure.
1126
+ flat = re.sub(r"[ \t\r\f\v]*\n[ \t\r\f\v]*", " ", text)
1127
+ flat = re.sub(r"[ \t]+", " ", flat)
1014
1128
  return {
1015
1129
  "document": {
1016
1130
  "title": extract_title(text, Path(source_path) if source_path else None, fmt),
@@ -1018,13 +1132,13 @@ def build_extraction(text: str, raw: bytes, fmt: str,
1018
1132
  "sha256": sha,
1019
1133
  "source_path": source_path,
1020
1134
  },
1021
- "parties": extract_parties(text),
1022
- "dates": extract_dates(text),
1023
- "term": extract_term(text),
1024
- "governing_law": extract_governing_law(text),
1135
+ "parties": extract_parties(flat),
1136
+ "dates": extract_dates(flat),
1137
+ "term": extract_term(flat),
1138
+ "governing_law": extract_governing_law(flat),
1025
1139
  "clauses": extract_clauses(text),
1026
- "defined_terms": extract_defined_terms(text),
1027
- "value": extract_value(text),
1140
+ "defined_terms": extract_defined_terms(flat),
1141
+ "value": extract_value(flat),
1028
1142
  "_meta": {
1029
1143
  "extractor_version": EXTRACTOR_VERSION,
1030
1144
  "tiers_used": ["deterministic"],
@@ -1336,7 +1450,7 @@ def output_schema() -> JSON:
1336
1450
  "required": ["title", "format", "sha256", "source_path"],
1337
1451
  "properties": {
1338
1452
  "title": {"type": ["string", "null"]},
1339
- "format": {"enum": ["markdown", "text", "docx", "pdf"]},
1453
+ "format": {"enum": ["markdown", "text", "docx", "pdf", "html"]},
1340
1454
  "sha256": {"type": "string", "pattern": "^[0-9a-f]{64}$"},
1341
1455
  "source_path": {"type": ["string", "null"]},
1342
1456
  },
@@ -1687,7 +1801,7 @@ def _add_common_output_flags(p: argparse.ArgumentParser) -> None:
1687
1801
  def build_parser() -> argparse.ArgumentParser:
1688
1802
  parser = argparse.ArgumentParser(
1689
1803
  prog="extract",
1690
- description="Ingest any contract (.md/.txt/.docx/.pdf) and emit structured "
1804
+ description="Ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured "
1691
1805
  "JSON for the contract-ops CLI suite. See docs/INTEROP.md.",
1692
1806
  )
1693
1807
  parser.add_argument("-V", "--version", action="version",
@@ -1721,7 +1835,7 @@ def build_parser() -> argparse.ArgumentParser:
1721
1835
 
1722
1836
 
1723
1837
  def _build_extract_args(p: argparse.ArgumentParser) -> None:
1724
- p.add_argument("path", help="Path to the document (.md/.txt/.docx/.pdf).")
1838
+ p.add_argument("path", help="Path to the document (.md/.txt/.html/.docx/.pdf).")
1725
1839
  p.add_argument("--llm", action="store_true",
1726
1840
  help="Opt-in LLM enrichment of fuzzy fields (renewal, obligations). "
1727
1841
  "Off by default; the deterministic core is fully useful without it.")
@@ -4,8 +4,8 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "extract-cli"
7
- version = "0.1.1"
8
- description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.docx/.pdf) and emit structured JSON."
7
+ version = "0.1.2"
8
+ description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
11
11
  license = { text = "MIT" }
@@ -20,7 +20,8 @@ from tests._fixtures_build import ensure_binary_fixtures # noqa: E402
20
20
  FIXTURES = Path(__file__).resolve().parent / "fixtures"
21
21
 
22
22
  DOCS = ["nda_h2.md", "services_bold.txt", "lease_allcaps.txt",
23
- "employment_docx.docx", "license_pdf.pdf", "scanned.pdf"]
23
+ "employment_docx.docx", "license_pdf.pdf", "services_html.html",
24
+ "scanned.pdf"]
24
25
 
25
26
 
26
27
  def golden_for(name: str) -> dict:
@@ -26,6 +26,7 @@ CORPUS: Tuple[Tuple[str, str, str], ...] = (
26
26
  ("lease_allcaps.txt", "all-caps", "text"),
27
27
  ("employment_docx.docx", "bold-numbered", "docx"),
28
28
  ("license_pdf.pdf", "all-caps", "pdf"),
29
+ ("services_html.html", "numbered", "html"),
29
30
  )
30
31
 
31
32
 
@@ -138,7 +138,7 @@
138
138
  "source": "deterministic"
139
139
  },
140
140
  "_meta": {
141
- "extractor_version": "0.1.1",
141
+ "extractor_version": "0.1.2",
142
142
  "tiers_used": [
143
143
  "deterministic"
144
144
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.1",
136
+ "extractor_version": "0.1.2",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.1",
136
+ "extractor_version": "0.1.2",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -121,6 +121,11 @@
121
121
  "confidence": 0.6,
122
122
  "source": "deterministic"
123
123
  },
124
+ {
125
+ "term": "Disclosing Party",
126
+ "confidence": 0.6,
127
+ "source": "deterministic"
128
+ },
124
129
  {
125
130
  "term": "Receiving Party",
126
131
  "confidence": 0.6,
@@ -138,7 +143,7 @@
138
143
  "source": "none"
139
144
  },
140
145
  "_meta": {
141
- "extractor_version": "0.1.1",
146
+ "extractor_version": "0.1.2",
142
147
  "tiers_used": [
143
148
  "deterministic"
144
149
  ],
@@ -48,7 +48,7 @@
48
48
  "source": "none"
49
49
  },
50
50
  "_meta": {
51
- "extractor_version": "0.1.1",
51
+ "extractor_version": "0.1.2",
52
52
  "tiers_used": [
53
53
  "deterministic"
54
54
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.1",
136
+ "extractor_version": "0.1.2",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -0,0 +1,35 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <title>Exhibit 10.1</title>
5
+ <style>body { font-family: serif; } .hidden { display:none; }</style>
6
+ <script>var x = "(this should never appear in output)";</script>
7
+ </head>
8
+ <body>
9
+ <p align="center"><b>MASTER SERVICES AGREEMENT</b></p>
10
+
11
+ <p>This Master Services Agreement (the &ldquo;Agreement&rdquo;) is entered
12
+ into as of March 15, 2023 (the &quot;Effective Date&quot;), by and between
13
+ Initrode&nbsp;Systems,&nbsp;Inc., a Delaware corporation (&ldquo;Provider&rdquo;),
14
+ and Hooli&nbsp;LLC (&ldquo;Customer&rdquo;).</p>
15
+
16
+ <p>1. Services</p>
17
+ <p>Provider shall perform the services described in each Statement of Work.</p>
18
+
19
+ <p>2. Fees and Payment</p>
20
+ <p>Customer shall pay Provider the fees set forth in the applicable Statement
21
+ of Work, not to exceed $500,000 in the aggregate.</p>
22
+
23
+ <p>3. Term and Termination</p>
24
+ <p>The initial term of this Agreement is two (2) years. Either party may
25
+ terminate upon sixty (60) days&rsquo; written notice. This Agreement shall
26
+ automatically renew for successive one-year terms.</p>
27
+
28
+ <p>4. Confidentiality</p>
29
+ <p>Each party shall protect the other&rsquo;s &ldquo;Confidential
30
+ Information&rdquo; using reasonable care.</p>
31
+
32
+ <p>5. Governing Law</p>
33
+ <p>This Agreement shall be governed by the laws of the State of California.</p>
34
+ </body>
35
+ </html>
@@ -0,0 +1,157 @@
1
+ {
2
+ "document": {
3
+ "title": "MASTER SERVICES AGREEMENT",
4
+ "format": "html",
5
+ "sha256": "088b40f13135e6b5d8f8548b162d657f10725d348388c7c3a416d11d7fc65300",
6
+ "source_path": "services_html.html"
7
+ },
8
+ "parties": [
9
+ {
10
+ "name": "Initrode Systems, Inc.",
11
+ "confidence": 0.9,
12
+ "source": "deterministic",
13
+ "role": "Provider"
14
+ },
15
+ {
16
+ "name": "Hooli LLC",
17
+ "confidence": 0.9,
18
+ "source": "deterministic",
19
+ "role": "Customer"
20
+ }
21
+ ],
22
+ "dates": {
23
+ "effective": {
24
+ "value": "2023-03-15",
25
+ "confidence": 0.9,
26
+ "source": "deterministic"
27
+ },
28
+ "expiration": {
29
+ "value": null,
30
+ "confidence": 0.0,
31
+ "source": "none"
32
+ }
33
+ },
34
+ "term": {
35
+ "length": {
36
+ "value": "2 years",
37
+ "confidence": 0.7,
38
+ "source": "deterministic"
39
+ },
40
+ "auto_renew": {
41
+ "value": true,
42
+ "confidence": 0.65,
43
+ "source": "deterministic"
44
+ },
45
+ "notice_period_days": {
46
+ "value": 60,
47
+ "confidence": 0.7,
48
+ "source": "deterministic"
49
+ }
50
+ },
51
+ "governing_law": {
52
+ "value": "State of California",
53
+ "confidence": 0.85,
54
+ "source": "deterministic"
55
+ },
56
+ "clauses": [
57
+ {
58
+ "canonical_title": "Services",
59
+ "detected_title": "1. Services",
60
+ "tier": "numbered",
61
+ "span": {
62
+ "start": 242,
63
+ "end": 329
64
+ },
65
+ "confidence": 0.6,
66
+ "source": "deterministic",
67
+ "mapped": false
68
+ },
69
+ {
70
+ "canonical_title": "Payment",
71
+ "detected_title": "2. Fees and Payment",
72
+ "tier": "numbered",
73
+ "span": {
74
+ "start": 329,
75
+ "end": 476
76
+ },
77
+ "confidence": 0.8,
78
+ "source": "deterministic",
79
+ "mapped": true
80
+ },
81
+ {
82
+ "canonical_title": "Termination",
83
+ "detected_title": "3. Term and Termination",
84
+ "tier": "numbered",
85
+ "span": {
86
+ "start": 476,
87
+ "end": 692
88
+ },
89
+ "confidence": 0.8,
90
+ "source": "deterministic",
91
+ "mapped": true
92
+ },
93
+ {
94
+ "canonical_title": "Confidentiality",
95
+ "detected_title": "4. Confidentiality",
96
+ "tier": "numbered",
97
+ "span": {
98
+ "start": 692,
99
+ "end": 800
100
+ },
101
+ "confidence": 0.8,
102
+ "source": "deterministic",
103
+ "mapped": true
104
+ },
105
+ {
106
+ "canonical_title": "Governing Law",
107
+ "detected_title": "5. Governing Law",
108
+ "tier": "numbered",
109
+ "span": {
110
+ "start": 800,
111
+ "end": 890
112
+ },
113
+ "confidence": 0.8,
114
+ "source": "deterministic",
115
+ "mapped": true
116
+ }
117
+ ],
118
+ "defined_terms": [
119
+ {
120
+ "term": "Agreement",
121
+ "confidence": 0.6,
122
+ "source": "deterministic"
123
+ },
124
+ {
125
+ "term": "Effective Date",
126
+ "confidence": 0.6,
127
+ "source": "deterministic"
128
+ },
129
+ {
130
+ "term": "Provider",
131
+ "confidence": 0.6,
132
+ "source": "deterministic"
133
+ },
134
+ {
135
+ "term": "Customer",
136
+ "confidence": 0.6,
137
+ "source": "deterministic"
138
+ },
139
+ {
140
+ "term": "Confidential Information",
141
+ "confidence": 0.6,
142
+ "source": "deterministic"
143
+ }
144
+ ],
145
+ "value": {
146
+ "value": "$500,000",
147
+ "confidence": 0.6,
148
+ "source": "deterministic"
149
+ },
150
+ "_meta": {
151
+ "extractor_version": "0.1.2",
152
+ "tiers_used": [
153
+ "deterministic"
154
+ ],
155
+ "llm_used": false
156
+ }
157
+ }
@@ -12,8 +12,8 @@ def test_parties_between_simple() -> None:
12
12
  assert all(0.0 <= p["confidence"] <= 1.0 for p in parties)
13
13
 
14
14
 
15
- def test_parties_with_roles_and_linebreak() -> None:
16
- text = ('by and between Acme Corp. (the "Disclosing\nParty") and '
15
+ def test_parties_with_roles() -> None:
16
+ text = ('by and between Acme Corp. (the "Disclosing Party") and '
17
17
  'Beta LLC (the "Receiving Party"), dated March 1, 2024.')
18
18
  parties = ex.extract_parties(text)
19
19
  assert parties[0]["name"] == "Acme Corp."
@@ -22,6 +22,30 @@ def test_parties_with_roles_and_linebreak() -> None:
22
22
  assert parties[1]["role"] == "Receiving Party"
23
23
 
24
24
 
25
+ def test_parties_linebreak_handled_by_build() -> None:
26
+ # build_extraction flattens whitespace, so a party/role that wraps across a
27
+ # line is matched whole.
28
+ text = ('This Agreement is made by and between Acme Corp. (the "Disclosing\n'
29
+ 'Party") and Beta LLC (the "Receiving Party").')
30
+ r = ex.build_extraction(text, text.encode("utf-8"), "text", "x.txt")
31
+ assert [p["name"] for p in r["parties"]] == ["Acme Corp.", "Beta LLC"]
32
+ assert r["parties"][0]["role"] == "Disclosing Party"
33
+
34
+
35
+ def test_parties_skip_and_inside_description() -> None:
36
+ # An "and" inside a party's own description must not split the parties.
37
+ text = ("between Blade Ventures Inc., a Nevada corporation having offices at "
38
+ "1 Main St and doing business as Foo (\"Client\"), and KPMG LP")
39
+ parties = ex.extract_parties(text)
40
+ assert [p["name"] for p in parties] == ["Blade Ventures Inc.", "KPMG LP"]
41
+
42
+
43
+ def test_party_name_descriptors_trimmed() -> None:
44
+ assert ex._clean_party_name("Visteon Corporation, a Delaware corporation") == "Visteon Corporation"
45
+ assert ex._clean_party_name("Foo Inc. doing business as Bar") == "Foo Inc."
46
+ assert ex._clean_party_name("Baz LLC having its principal office at X") == "Baz LLC"
47
+
48
+
25
49
  def test_parties_none() -> None:
26
50
  assert ex.extract_parties("There are no parties named here.") == []
27
51
 
@@ -80,6 +104,15 @@ def test_governing_law_stops_before_trailing_clause() -> None:
80
104
  assert out["value"] == "State of Delaware"
81
105
 
82
106
 
107
+ def test_governing_law_linebreak_handled_by_build() -> None:
108
+ # A jurisdiction that wraps a line ("...the Province\nof Ontario") is
109
+ # matched whole because build_extraction flattens whitespace first.
110
+ text = ("This Agreement shall be governed by the laws of the Province\n"
111
+ "of Ontario and the federal laws of Canada.")
112
+ r = ex.build_extraction(text, text.encode("utf-8"), "text", "x.txt")
113
+ assert r["governing_law"]["value"] == "Province of Ontario"
114
+
115
+
83
116
  def test_governing_law_missing() -> None:
84
117
  assert ex.extract_governing_law("nothing about law")["source"] == "none"
85
118
 
@@ -142,6 +142,32 @@ def test_pdf_unescape() -> None:
142
142
  assert ex._pdf_unescape(r"\101\102") == "AB" # octal escapes
143
143
 
144
144
 
145
+ def test_html_extraction() -> None:
146
+ raw, text, fmt, _w = ex.load_source(FIXTURES / "services_html.html")
147
+ assert fmt == "html"
148
+ # script/style content is dropped; entities are unescaped.
149
+ assert "this should never appear" not in text
150
+ result = ex.build_extraction(text, raw, fmt, "services_html.html")
151
+ assert result["document"]["format"] == "html"
152
+ assert [p["name"] for p in result["parties"]] == ["Initrode Systems, Inc.", "Hooli LLC"]
153
+ assert result["governing_law"]["value"] == "State of California"
154
+ assert result["dates"]["effective"]["value"] == "2023-03-15"
155
+ canon = {c["canonical_title"] for c in result["clauses"]}
156
+ assert {"Payment", "Termination", "Confidentiality", "Governing Law"} <= canon
157
+
158
+
159
+ def test_html_detected_by_content_sniff(tmp_path: Any) -> None:
160
+ # HTML masquerading as .txt (e.g. a SEC EDGAR full submission) is sniffed.
161
+ p = tmp_path / "exhibit.txt"
162
+ p.write_text("<html><body><p>between A Co and B Co</p></body></html>")
163
+ _raw, _text, fmt, _w = ex.load_source(p)
164
+ assert fmt == "html"
165
+
166
+
167
+ def test_html_malformed_does_not_crash() -> None:
168
+ assert ex._read_html("<p>unclosed <b>bold <div>text") is not None
169
+
170
+
145
171
  def test_pdf_text_only_inside_bt_et() -> None:
146
172
  # Strings outside BT/ET (font/signature/metadata stream bytes that happen to
147
173
  # contain parentheses) must be ignored; only text objects yield text.
File without changes
File without changes
File without changes
File without changes
File without changes