extract-cli 0.1.1__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {extract_cli-0.1.1 → extract_cli-0.1.2}/ARCHITECTURE.md +3 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/CHANGELOG.md +34 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/PKG-INFO +8 -7
- {extract_cli-0.1.1 → extract_cli-0.1.2}/README.md +6 -5
- {extract_cli-0.1.1 → extract_cli-0.1.2}/docs/spec/extract-output.schema.json +2 -1
- {extract_cli-0.1.1 → extract_cli-0.1.2}/extract_cli.py +143 -29
- {extract_cli-0.1.1 → extract_cli-0.1.2}/pyproject.toml +2 -2
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/_make_goldens.py +2 -1
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/conftest.py +1 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/employment_docx.docx.expected.json +1 -1
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/nda_h2.md.expected.json +6 -1
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/scanned.pdf.expected.json +1 -1
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/services_bold.txt.expected.json +1 -1
- extract_cli-0.1.2/tests/fixtures/services_html.html +35 -0
- extract_cli-0.1.2/tests/fixtures/services_html.html.expected.json +157 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/test_deterministic.py +35 -2
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/test_misc.py +26 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/.gitignore +0 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/CONTRIBUTING.md +0 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/LICENSE +0 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/Makefile +0 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/config/llm.json.example +0 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/docs/INTEROP.md +0 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/scripts/release.py +0 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/scripts/validate_against_spec.py +0 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/_fixtures_build.py +0 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/_schema_validator.py +0 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/employment_docx.docx +0 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/lease_allcaps.txt +0 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/license_pdf.pdf +0 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/nda_h2.md +0 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/scanned.pdf +0 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/fixtures/services_bold.txt +0 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/test_clause_map.py +0 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/test_cli.py +0 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/test_llm.py +0 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/test_property.py +0 -0
- {extract_cli-0.1.1 → extract_cli-0.1.2}/tests/test_schema_conformance.py +0 -0
|
@@ -8,11 +8,14 @@ map.
|
|
|
8
8
|
```
|
|
9
9
|
load_source(path) extension/content sniff → reader
|
|
10
10
|
├─ .md/.txt → utf-8 decode
|
|
11
|
+
├─ .html → stdlib html.parser reader (also auto-detected inside .txt)
|
|
11
12
|
├─ .docx → python-docx (if [docx]) else stdlib zipfile/XML reader
|
|
12
13
|
└─ .pdf → pypdf (if [pdf]) else stdlib zlib + text-operator reader
|
|
13
14
|
│
|
|
14
15
|
▼ (raw_bytes, text, format, warnings)
|
|
15
16
|
build_extraction(text, raw, fmt, src) the DETERMINISTIC tier (always on)
|
|
17
|
+
│ field extractors run on a whitespace-FLATTENED copy (so values that wrap
|
|
18
|
+
│ across a line are matched whole); clause detection keeps the original text
|
|
16
19
|
├─ extract_parties "between X and Y", with role parentheticals
|
|
17
20
|
├─ extract_dates effective / expiration, ISO-normalized
|
|
18
21
|
├─ extract_term length / auto_renew / notice_period_days
|
|
@@ -6,6 +6,39 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
|
|
|
6
6
|
(see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
|
|
7
7
|
the output schema require a major version bump**; new optional fields are minor.
|
|
8
8
|
|
|
9
|
+
## [0.1.2] - 2026-05-21
|
|
10
|
+
|
|
11
|
+
More real-world hardening, driven by testing against five additional contracts
|
|
12
|
+
(SEC EDGAR consulting/MSA, lease, and Visteon services agreements; Common Paper
|
|
13
|
+
and Perigon Cloud Service Agreements).
|
|
14
|
+
|
|
15
|
+
### Added
|
|
16
|
+
- **HTML input** (`.html`/`.htm`, and HTML auto-detected inside `.txt` such as
|
|
17
|
+
SEC EDGAR full submissions). Stdlib `html.parser`-based reader strips
|
|
18
|
+
script/style, frames block elements so heading detection still works, and
|
|
19
|
+
unescapes entities. `document.format` enum gains `html` (backward-compatible
|
|
20
|
+
widening). This turns the large class of HTML contracts (SEC exhibits, web
|
|
21
|
+
ToS) from garbage into structured output.
|
|
22
|
+
|
|
23
|
+
### Fixed
|
|
24
|
+
- **Field extraction now runs on whitespace-flattened text**, so values that
|
|
25
|
+
wrap across a line break are matched whole — e.g. governing law
|
|
26
|
+
`the laws of the Province\nof Ontario` now yields `Province of Ontario`, and
|
|
27
|
+
line-wrapped party names/defined terms are captured.
|
|
28
|
+
- **Party extraction** (continues issue #2): names are trimmed of trailing
|
|
29
|
+
descriptors (`, a Delaware corporation`, `doing business as …`,
|
|
30
|
+
`having its offices at …`, `as of …`), and each party must begin with a
|
|
31
|
+
capital so an `and` *inside* a party's own description no longer splits the
|
|
32
|
+
parties (`…V6E 3S7 and doing business as …` → real parties recovered).
|
|
33
|
+
|
|
34
|
+
### Known limitations (documented, not bugs)
|
|
35
|
+
- The stdlib PDF reader cannot decode PDFs that use embedded subset fonts with
|
|
36
|
+
hex-encoded glyph strings (common in professionally-typeset PDFs); these
|
|
37
|
+
degrade gracefully to a low-signal warning. Install the `[pdf]` extra (pypdf)
|
|
38
|
+
for them — verified to recover full text and clause structure.
|
|
39
|
+
- Two-line `ARTICLE N` / title headings (number on one line, title on the next)
|
|
40
|
+
are not yet detected.
|
|
41
|
+
|
|
9
42
|
## [0.1.1] - 2026-05-21
|
|
10
43
|
|
|
11
44
|
Real-world hardening, driven by testing against a SEC EDGAR employment
|
|
@@ -82,5 +115,6 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
|
|
|
82
115
|
intentionally *not* governed by the output schema (the schema describes the
|
|
83
116
|
full default output).
|
|
84
117
|
|
|
118
|
+
[0.1.2]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.2
|
|
85
119
|
[0.1.1]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.1
|
|
86
120
|
[0.1.0]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.0
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: extract-cli
|
|
3
|
-
Version: 0.1.1
|
|
4
|
-
Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.docx/.pdf) and emit structured JSON.
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
|
|
5
5
|
Project-URL: Homepage, https://cli.drbaher.com/
|
|
6
6
|
Project-URL: Repository, https://github.com/DrBaher/extract-cli
|
|
7
7
|
Project-URL: Suite interop, https://github.com/DrBaher/extract-cli/blob/main/docs/INTEROP.md
|
|
@@ -63,8 +63,8 @@ ingest (extract) → review → diff → convert → sign
|
|
|
63
63
|
|
|
64
64
|
## What it does
|
|
65
65
|
|
|
66
|
-
Give it a contract in **`.md` / `.txt`** (native), **`.docx`**, or
|
|
67
|
-
**`.pdf`**, and it returns structured JSON: the parties, dates, term, governing law, a
|
|
66
|
+
Give it a contract in **`.md` / `.txt` / `.html`** (native), **`.docx`**, or
|
|
67
|
+
**`.pdf`**, and it returns structured JSON: the parties, dates, term, governing law, a
|
|
68
68
|
**clause map** normalized onto the suite's canonical clause vocabulary, a
|
|
69
69
|
defined-term inventory, and a headline value. Every field carries a
|
|
70
70
|
`confidence` and a `source` so downstream tools **verify, don't trust**.
|
|
@@ -75,14 +75,15 @@ daemon, no network in the default path.
|
|
|
75
75
|
## Install
|
|
76
76
|
|
|
77
77
|
```bash
|
|
78
|
-
pip install extract-cli # core: .md/.txt + best-effort .docx/.pdf
|
|
78
|
+
pip install extract-cli # core: .md/.txt/.html + best-effort .docx/.pdf
|
|
79
79
|
pip install "extract-cli[docx]" # higher-fidelity .docx (python-docx)
|
|
80
80
|
pip install "extract-cli[pdf]" # higher-fidelity .pdf (pypdf)
|
|
81
81
|
pip install "extract-cli[docx,pdf]" # both
|
|
82
82
|
```
|
|
83
83
|
|
|
84
|
-
The core has **zero runtime dependencies** and is fully functional on
|
|
85
|
-
with no extras
|
|
84
|
+
The core has **zero runtime dependencies** and is fully functional on
|
|
85
|
+
`.md`/`.txt`/`.html` with no extras (HTML is also auto-detected when it hides
|
|
86
|
+
inside a `.txt`, e.g. SEC EDGAR filings). `.docx` and `.pdf` work out of the box via stdlib readers; the
|
|
86
87
|
`[docx]`/`[pdf]` extras improve fidelity on complex documents (see
|
|
87
88
|
[ARCHITECTURE.md](ARCHITECTURE.md)).
|
|
88
89
|
|
|
@@ -25,8 +25,8 @@ ingest (extract) → review → diff → convert → sign
|
|
|
25
25
|
|
|
26
26
|
## What it does
|
|
27
27
|
|
|
28
|
-
Give it a contract in **`.md` / `.txt`** (native), **`.docx`**, or
|
|
29
|
-
**`.pdf`**, and it returns structured JSON: the parties, dates, term, governing law, a
|
|
28
|
+
Give it a contract in **`.md` / `.txt` / `.html`** (native), **`.docx`**, or
|
|
29
|
+
**`.pdf`**, and it returns structured JSON: the parties, dates, term, governing law, a
|
|
30
30
|
**clause map** normalized onto the suite's canonical clause vocabulary, a
|
|
31
31
|
defined-term inventory, and a headline value. Every field carries a
|
|
32
32
|
`confidence` and a `source` so downstream tools **verify, don't trust**.
|
|
@@ -37,14 +37,15 @@ daemon, no network in the default path.
|
|
|
37
37
|
## Install
|
|
38
38
|
|
|
39
39
|
```bash
|
|
40
|
-
pip install extract-cli # core: .md/.txt + best-effort .docx/.pdf
|
|
40
|
+
pip install extract-cli # core: .md/.txt/.html + best-effort .docx/.pdf
|
|
41
41
|
pip install "extract-cli[docx]" # higher-fidelity .docx (python-docx)
|
|
42
42
|
pip install "extract-cli[pdf]" # higher-fidelity .pdf (pypdf)
|
|
43
43
|
pip install "extract-cli[docx,pdf]" # both
|
|
44
44
|
```
|
|
45
45
|
|
|
46
|
-
The core has **zero runtime dependencies** and is fully functional on
|
|
47
|
-
with no extras
|
|
46
|
+
The core has **zero runtime dependencies** and is fully functional on
|
|
47
|
+
`.md`/`.txt`/`.html` with no extras (HTML is also auto-detected when it hides
|
|
48
|
+
inside a `.txt`, e.g. SEC EDGAR filings). `.docx` and `.pdf` work out of the box via stdlib readers; the
|
|
48
49
|
`[docx]`/`[pdf]` extras improve fidelity on complex documents (see
|
|
49
50
|
[ARCHITECTURE.md](ARCHITECTURE.md)).
|
|
50
51
|
|
|
@@ -4,8 +4,8 @@
|
|
|
4
4
|
The suite is a contract lifecycle (store -> draft -> review -> diff -> convert
|
|
5
5
|
-> sign) that, until now, only handled documents it authored from its own
|
|
6
6
|
templates. `extract-cli` is "passport control": it ingests ANY document --
|
|
7
|
-
yours or a counterparty's foreign paper -- in .md/.txt (natively), .docx,
|
|
8
|
-
.pdf, and emits a structured JSON representation that the rest of the suite
|
|
7
|
+
yours or a counterparty's foreign paper -- in .md/.txt/.html (natively), .docx,
|
|
8
|
+
or .pdf, and emits a structured JSON representation that the rest of the suite
|
|
9
9
|
(nda-review-cli, compare-cli, contract-vault) consumes.
|
|
10
10
|
|
|
11
11
|
Two extraction tiers:
|
|
@@ -32,6 +32,7 @@ from __future__ import annotations
|
|
|
32
32
|
import argparse
|
|
33
33
|
import datetime as _dt
|
|
34
34
|
import hashlib
|
|
35
|
+
import html.parser
|
|
35
36
|
import importlib.util
|
|
36
37
|
import json
|
|
37
38
|
import os
|
|
@@ -42,11 +43,11 @@ import urllib.request
|
|
|
42
43
|
from pathlib import Path
|
|
43
44
|
from typing import Any, Dict, List, Optional, Tuple
|
|
44
45
|
|
|
45
|
-
__version__ = "0.1.
|
|
46
|
+
__version__ = "0.1.2"
|
|
46
47
|
|
|
47
48
|
# Bumped independently of the package version when the *extraction logic*
|
|
48
49
|
# changes in a way downstream consumers should notice. Embedded in `_meta`.
|
|
49
|
-
EXTRACTOR_VERSION = "0.1.
|
|
50
|
+
EXTRACTOR_VERSION = "0.1.2"
|
|
50
51
|
|
|
51
52
|
# JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
|
|
52
53
|
SCHEMA_VERSION = 1
|
|
@@ -492,10 +493,17 @@ _EXPIRE_RE = re.compile(
|
|
|
492
493
|
re.IGNORECASE,
|
|
493
494
|
)
|
|
494
495
|
|
|
496
|
+
# Each party must start with a capital letter (optionally after "the"/"its"), a quote, or
|
|
497
|
+
# a paren. This is case-sensitive on purpose (no global IGNORECASE -- only the
|
|
498
|
+
# keywords are): it lets the engine skip an "and" that sits INSIDE a party's own
|
|
499
|
+
# description ("...V6E 3S7 and doing business as ...", where the right side
|
|
500
|
+
# starts lowercase) and find the real "and" before the second named entity.
|
|
501
|
+
_PARTY_START = r"(?:(?:[Tt]he|its)\s+)?[A-Z\"“(]"
|
|
495
502
|
_PARTY_BLOCK_RE = re.compile(
|
|
496
|
-
r"
|
|
497
|
-
r"(
|
|
498
|
-
|
|
503
|
+
r"(?i:\b(?:by\s+and\s+between|between)\s+)"
|
|
504
|
+
r"(" + _PARTY_START + r"[^\n]{1,200}?)\s+and\s+"
|
|
505
|
+
r"(" + _PARTY_START + r"[^\n]{1,200}?)"
|
|
506
|
+
r"(?=[\.;\n]|(?i:\bwhereas\b|\beffective\b|\bdated\b|\bas\s+of\b|\bwitnesseth\b)|$)",
|
|
499
507
|
)
|
|
500
508
|
_ROLE_PAREN_RE = re.compile(
|
|
501
509
|
r"\(\s*(?:the\s+)?[\"“]?([^\"”()]+?)[\"”]?\s*\)"
|
|
@@ -604,8 +612,40 @@ def _date_field(match: Optional["re.Match[str]"]) -> JSON:
|
|
|
604
612
|
return _date_field_from_str(match.group(1), 0.85)
|
|
605
613
|
|
|
606
614
|
|
|
615
|
+
# Trailing descriptors that follow a party's actual name and should be dropped
|
|
616
|
+
# ("Acme Corp., a Delaware corporation", "... doing business as Foo", "... as of
|
|
617
|
+
# March 1", "... having its offices at ..."). Each is matched and everything from
|
|
618
|
+
# it onward is cut.
|
|
619
|
+
_PARTY_CUT_MARKERS: Tuple[str, ...] = (
    r",\s+an?\s+\w",  # ", a Delaware ..." / ", an Ohio ..."
    r"\s+doing\s+business\s+as\b",
    r"\s+d/?b/?a\b",
    r"\s+f/?k/?a\b",
    r"\s+a[n]?\s+\w+\s+(?:corporation|company|partnership|limited)\b",
    r"\s+having\b",
    r"\s+with\s+(?:its\s+)?(?:offices|principal|a\s)\b",
    r"\s+with\s+offices\b",
    r"\s+located\b",
    r"\s+organized\b",
    # "Incorporated" is also a legitimate entity suffix ("Texas Instruments
    # Incorporated"), so it only counts as a descriptor when a continuation
    # follows ("incorporated in/under ...", "incorporated and existing ...").
    r"\s+incorporated\s+(?:in|under|pursuant|and)\b",
    r"\s+whose\b",
    r"\s+(?:as\s+of|dated|effective)\b",
)


def _clean_party_name(s: str) -> str:
    """Trim a captured party string down to the bare entity name.

    Whitespace runs are collapsed first, then each pattern in
    ``_PARTY_CUT_MARKERS`` is searched case-insensitively, in order; on a
    match, everything from the match start onward is dropped ('a Delaware
    corporation', 'd/b/a ...', 'as of ...'). Surrounding commas, whitespace
    and straight/curly double quotes are stripped from the result.

    Args:
        s: Raw party text as captured by the party regex.

    Returns:
        The cleaned name; may be empty if nothing but descriptors remained.
    """
    s = re.sub(r"\s+", " ", s).strip().strip(",").strip()
    for pat in _PARTY_CUT_MARKERS:
        m = re.search(pat, s, re.IGNORECASE)
        if m:
            # Re-strip after every cut so a dangling comma left by the cut
            # does not survive into the name.
            s = s[: m.start()].strip().strip(",").strip()
    return s.strip("\"“”").strip()
|
|
645
|
+
|
|
646
|
+
|
|
607
647
|
def _split_name_role(s: str) -> Tuple[str, Optional[str]]:
|
|
608
|
-
s = s.strip().strip(",").strip()
|
|
648
|
+
s = re.sub(r"\s+", " ", s).strip().strip(",").strip()
|
|
609
649
|
role: Optional[str] = None
|
|
610
650
|
m = _ROLE_PAREN_RE.search(s)
|
|
611
651
|
if m:
|
|
@@ -614,9 +654,7 @@ def _split_name_role(s: str) -> Tuple[str, Optional[str]]:
|
|
|
614
654
|
if len(candidate) <= 40 and candidate.lower() not in ("a", "an", "the"):
|
|
615
655
|
role = candidate
|
|
616
656
|
s = (s[: m.start()] + s[m.end():]).strip().rstrip(",").strip()
|
|
617
|
-
|
|
618
|
-
s = re.sub(r"\s+", " ", s)
|
|
619
|
-
return s, role
|
|
657
|
+
return _clean_party_name(s), role
|
|
620
658
|
|
|
621
659
|
|
|
622
660
|
def extract_parties(text: str) -> List[JSON]:
|
|
@@ -625,9 +663,6 @@ def extract_parties(text: str) -> List[JSON]:
|
|
|
625
663
|
return []
|
|
626
664
|
out: List[JSON] = []
|
|
627
665
|
for raw in (m.group(1), m.group(2)):
|
|
628
|
-
# Party names can wrap across lines ("...(the \"Disclosing\nParty\")");
|
|
629
|
-
# collapse whitespace rather than truncating at the first newline.
|
|
630
|
-
raw = re.sub(r"\s+", " ", raw).strip()
|
|
631
666
|
name, role = _split_name_role(raw)
|
|
632
667
|
if not name or len(name) < 2 or len(name) > 120:
|
|
633
668
|
continue
|
|
@@ -750,21 +785,91 @@ def extract_title(text: str, path: Optional[Path], fmt: str) -> Optional[str]:
|
|
|
750
785
|
# ---------------------------------------------------------------------------
|
|
751
786
|
|
|
752
787
|
|
|
788
|
+
def _looks_like_html(head: str) -> bool:
    """Heuristically decide whether ``head`` is HTML markup.

    Used to catch HTML masquerading as .txt (e.g. SEC EDGAR full submissions
    wrap HTML exhibits in a .txt). The check is case-insensitive: a document
    marker (doctype / <html / <body) is decisive on its own; otherwise at
    least six occurrences of common opening or closing tags are required.
    """
    lowered = head.lower()
    document_markers = ("<!doctype html", "<html", "<body")
    if any(marker in lowered for marker in document_markers):
        return True
    tag_hits = re.findall(
        r"</?(?:p|div|table|tr|td|span|br|h[1-6]|font|b|i)\b", lowered
    )
    return len(tag_hits) >= 6
|
|
795
|
+
|
|
796
|
+
|
|
753
797
|
def _detect_format(path: Path, raw: bytes) -> str:
    """Classify a document as html/docx/pdf/markdown/text.

    Order of precedence: an explicit HTML, .docx or .pdf extension wins; then
    magic bytes (``%PDF``; ``PK`` zip header unless the extension says the file
    is markdown/plain text); finally the first 4096 bytes are sniffed for HTML
    hiding inside a .txt/.md (or extensionless) file before falling back to
    the extension-based text flavour.
    """
    suffix = path.suffix.lower()
    markdown_exts = (".md", ".markdown")

    if suffix in (".htm", ".html", ".xhtml"):
        return "html"

    by_extension = {".docx": "docx", ".pdf": "pdf"}
    if suffix in by_extension:
        return by_extension[suffix]

    # Magic-byte checks for files whose extension does not settle the matter.
    if raw.startswith(b"%PDF"):
        return "pdf"
    if raw.startswith(b"PK") and suffix not in markdown_exts + (".txt",):
        return "docx"

    # Content sniff: HTML hiding inside a .txt/.md (or extensionless) file.
    leading_text = raw[:4096].decode("utf-8", "replace")
    if _looks_like_html(leading_text):
        return "html"
    return "markdown" if suffix in markdown_exts else "text"
|
|
814
|
+
|
|
815
|
+
|
|
816
|
+
class _HTMLTextExtractor(html.parser.HTMLParser):
    """Stdlib HTML -> text converter.

    Drops the contents of non-content elements (script/style/head/...),
    frames block elements with newlines (so clause-heading detection still
    works), separates table cells with a space, and unescapes entities via
    ``convert_charrefs=True``.
    """

    # Container elements whose text content is discarded. Void elements such
    # as <meta>/<link> are deliberately NOT listed: they have no end tag, so
    # counting them would leave the skip depth stuck above zero and silently
    # drop the rest of the document.
    _SKIP = {"script", "style", "head", "title", "noscript"}
    _BLOCK = {
        "p", "div", "br", "tr", "li", "h1", "h2", "h3", "h4", "h5", "h6",
        "section", "article", "table", "ul", "ol", "blockquote", "pre", "hr",
        "thead", "tbody", "header", "footer", "main",
    }
    # Table cells are inline for line-structure purposes but still need a
    # separator, otherwise adjacent cells fuse ("Name:" + "Acme").
    _CELL = {"td", "th"}

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self._parts: List[str] = []  # text fragments and newline/space separators
        self._skip = 0  # nesting depth inside _SKIP elements; 0 means "keep text"

    def handle_starttag(self, tag: str, attrs: Any) -> None:
        if tag == "body":
            # </head> (and other end tags) may be omitted in HTML; the start
            # of the body always ends any skipped region.
            self._skip = 0
        elif tag in self._SKIP:
            self._skip += 1
        elif tag in self._BLOCK:
            self._parts.append("\n")
        elif tag in self._CELL:
            self._parts.append(" ")

    def handle_startendtag(self, tag: str, attrs: Any) -> None:
        # Self-closing form (<br/>, <hr/>): emit ONE separator. The inherited
        # default calls handle_starttag + handle_endtag, which would produce a
        # blank line for <br/> where <br> produces a single line break.
        if tag in self._BLOCK:
            self._parts.append("\n")
        elif tag in self._CELL:
            self._parts.append(" ")

    def handle_endtag(self, tag: str) -> None:
        if tag in self._SKIP:
            if self._skip > 0:
                self._skip -= 1
        elif tag in self._BLOCK:
            self._parts.append("\n")
        elif tag in self._CELL:
            self._parts.append(" ")

    def handle_data(self, data: str) -> None:
        if self._skip == 0:
            self._parts.append(data)

    def get_text(self) -> str:
        """Return the collected text with per-line whitespace normalised."""
        # Strip each line; collapse runs of blank lines to a single blank line
        # (gives ALL-CAPS / numbered headings their blank-line frame).
        lines = [re.sub(r"[ \t]+", " ", ln).strip() for ln in "".join(self._parts).split("\n")]
        out: List[str] = []
        blank = False
        for ln in lines:
            if ln:
                out.append(ln)
                blank = False
            elif not blank:
                out.append("")
                blank = True
        return "\n".join(out).strip()
|
|
862
|
+
|
|
863
|
+
|
|
864
|
+
def _read_html(raw_text: str) -> str:
    """Convert HTML markup to plain text without ever raising on bad markup.

    The markup is run through ``_HTMLTextExtractor``; if feeding or closing
    the parser raises, the function falls back to replacing every
    ``<...>`` run with a space.
    """
    extractor = _HTMLTextExtractor()
    try:
        extractor.feed(raw_text)
        extractor.close()
    except Exception:
        # Never crash on malformed markup; fall back to a crude tag strip.
        return re.sub(r"<[^>]+>", " ", raw_text)
    else:
        return extractor.get_text()
|
|
768
873
|
|
|
769
874
|
|
|
770
875
|
def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[str, List[str]]:
|
|
@@ -986,6 +1091,8 @@ def load_source(path: Path, prefer_optional: bool = True) -> Tuple[bytes, str, s
|
|
|
986
1091
|
warnings: List[str] = []
|
|
987
1092
|
if fmt in ("markdown", "text"):
|
|
988
1093
|
text = raw.decode("utf-8", "replace")
|
|
1094
|
+
elif fmt == "html":
|
|
1095
|
+
text = _read_html(raw.decode("utf-8", "replace"))
|
|
989
1096
|
elif fmt == "docx":
|
|
990
1097
|
text, w = _read_docx(path, raw, prefer_optional)
|
|
991
1098
|
warnings += w
|
|
@@ -1011,6 +1118,13 @@ def build_extraction(text: str, raw: bytes, fmt: str,
|
|
|
1011
1118
|
source_path: Optional[str]) -> JSON:
|
|
1012
1119
|
"""Run the deterministic tier and assemble the output contract object."""
|
|
1013
1120
|
sha = hashlib.sha256(raw).hexdigest()
|
|
1121
|
+
# Field extractors (parties, dates, governing law, term, value, defined
|
|
1122
|
+
# terms) run on a whitespace-flattened copy so values that wrap across a
|
|
1123
|
+
# line break in the source -- "...laws of the Province\nof Ontario", a party
|
|
1124
|
+
# name split mid-line -- are matched whole. Clause detection and the title
|
|
1125
|
+
# keep the original text, which depends on line structure.
|
|
1126
|
+
flat = re.sub(r"[ \t\r\f\v]*\n[ \t\r\f\v]*", " ", text)
|
|
1127
|
+
flat = re.sub(r"[ \t]+", " ", flat)
|
|
1014
1128
|
return {
|
|
1015
1129
|
"document": {
|
|
1016
1130
|
"title": extract_title(text, Path(source_path) if source_path else None, fmt),
|
|
@@ -1018,13 +1132,13 @@ def build_extraction(text: str, raw: bytes, fmt: str,
|
|
|
1018
1132
|
"sha256": sha,
|
|
1019
1133
|
"source_path": source_path,
|
|
1020
1134
|
},
|
|
1021
|
-
"parties": extract_parties(
|
|
1022
|
-
"dates": extract_dates(
|
|
1023
|
-
"term": extract_term(
|
|
1024
|
-
"governing_law": extract_governing_law(
|
|
1135
|
+
"parties": extract_parties(flat),
|
|
1136
|
+
"dates": extract_dates(flat),
|
|
1137
|
+
"term": extract_term(flat),
|
|
1138
|
+
"governing_law": extract_governing_law(flat),
|
|
1025
1139
|
"clauses": extract_clauses(text),
|
|
1026
|
-
"defined_terms": extract_defined_terms(
|
|
1027
|
-
"value": extract_value(
|
|
1140
|
+
"defined_terms": extract_defined_terms(flat),
|
|
1141
|
+
"value": extract_value(flat),
|
|
1028
1142
|
"_meta": {
|
|
1029
1143
|
"extractor_version": EXTRACTOR_VERSION,
|
|
1030
1144
|
"tiers_used": ["deterministic"],
|
|
@@ -1336,7 +1450,7 @@ def output_schema() -> JSON:
|
|
|
1336
1450
|
"required": ["title", "format", "sha256", "source_path"],
|
|
1337
1451
|
"properties": {
|
|
1338
1452
|
"title": {"type": ["string", "null"]},
|
|
1339
|
-
"format": {"enum": ["markdown", "text", "docx", "pdf"]},
|
|
1453
|
+
"format": {"enum": ["markdown", "text", "docx", "pdf", "html"]},
|
|
1340
1454
|
"sha256": {"type": "string", "pattern": "^[0-9a-f]{64}$"},
|
|
1341
1455
|
"source_path": {"type": ["string", "null"]},
|
|
1342
1456
|
},
|
|
@@ -1687,7 +1801,7 @@ def _add_common_output_flags(p: argparse.ArgumentParser) -> None:
|
|
|
1687
1801
|
def build_parser() -> argparse.ArgumentParser:
|
|
1688
1802
|
parser = argparse.ArgumentParser(
|
|
1689
1803
|
prog="extract",
|
|
1690
|
-
description="Ingest any contract (.md/.txt/.docx/.pdf) and emit structured "
|
|
1804
|
+
description="Ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured "
|
|
1691
1805
|
"JSON for the contract-ops CLI suite. See docs/INTEROP.md.",
|
|
1692
1806
|
)
|
|
1693
1807
|
parser.add_argument("-V", "--version", action="version",
|
|
@@ -1721,7 +1835,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
1721
1835
|
|
|
1722
1836
|
|
|
1723
1837
|
def _build_extract_args(p: argparse.ArgumentParser) -> None:
|
|
1724
|
-
p.add_argument("path", help="Path to the document (.md/.txt/.docx/.pdf).")
|
|
1838
|
+
p.add_argument("path", help="Path to the document (.md/.txt/.html/.docx/.pdf).")
|
|
1725
1839
|
p.add_argument("--llm", action="store_true",
|
|
1726
1840
|
help="Opt-in LLM enrichment of fuzzy fields (renewal, obligations). "
|
|
1727
1841
|
"Off by default; the deterministic core is fully useful without it.")
|
|
@@ -4,8 +4,8 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "extract-cli"
|
|
7
|
-
version = "0.1.
|
|
8
|
-
description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.docx/.pdf) and emit structured JSON."
|
|
7
|
+
version = "0.1.2"
|
|
8
|
+
description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
11
11
|
license = { text = "MIT" }
|
|
@@ -20,7 +20,8 @@ from tests._fixtures_build import ensure_binary_fixtures # noqa: E402
|
|
|
20
20
|
FIXTURES = Path(__file__).resolve().parent / "fixtures"
|
|
21
21
|
|
|
22
22
|
DOCS = ["nda_h2.md", "services_bold.txt", "lease_allcaps.txt",
|
|
23
|
-
"employment_docx.docx", "license_pdf.pdf", "
|
|
23
|
+
"employment_docx.docx", "license_pdf.pdf", "services_html.html",
|
|
24
|
+
"scanned.pdf"]
|
|
24
25
|
|
|
25
26
|
|
|
26
27
|
def golden_for(name: str) -> dict:
|
|
@@ -121,6 +121,11 @@
|
|
|
121
121
|
"confidence": 0.6,
|
|
122
122
|
"source": "deterministic"
|
|
123
123
|
},
|
|
124
|
+
{
|
|
125
|
+
"term": "Disclosing Party",
|
|
126
|
+
"confidence": 0.6,
|
|
127
|
+
"source": "deterministic"
|
|
128
|
+
},
|
|
124
129
|
{
|
|
125
130
|
"term": "Receiving Party",
|
|
126
131
|
"confidence": 0.6,
|
|
@@ -138,7 +143,7 @@
|
|
|
138
143
|
"source": "none"
|
|
139
144
|
},
|
|
140
145
|
"_meta": {
|
|
141
|
-
"extractor_version": "0.1.
|
|
146
|
+
"extractor_version": "0.1.2",
|
|
142
147
|
"tiers_used": [
|
|
143
148
|
"deterministic"
|
|
144
149
|
],
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html>
|
|
3
|
+
<head>
|
|
4
|
+
<title>Exhibit 10.1</title>
|
|
5
|
+
<style>body { font-family: serif; } .hidden { display:none; }</style>
|
|
6
|
+
<script>var x = "(this should never appear in output)";</script>
|
|
7
|
+
</head>
|
|
8
|
+
<body>
|
|
9
|
+
<p align="center"><b>MASTER SERVICES AGREEMENT</b></p>
|
|
10
|
+
|
|
11
|
+
<p>This Master Services Agreement (the “Agreement”) is entered
|
|
12
|
+
into as of March 15, 2023 (the "Effective Date"), by and between
|
|
13
|
+
Initrode Systems, Inc., a Delaware corporation (“Provider”),
|
|
14
|
+
and Hooli LLC (“Customer”).</p>
|
|
15
|
+
|
|
16
|
+
<p>1. Services</p>
|
|
17
|
+
<p>Provider shall perform the services described in each Statement of Work.</p>
|
|
18
|
+
|
|
19
|
+
<p>2. Fees and Payment</p>
|
|
20
|
+
<p>Customer shall pay Provider the fees set forth in the applicable Statement
|
|
21
|
+
of Work, not to exceed $500,000 in the aggregate.</p>
|
|
22
|
+
|
|
23
|
+
<p>3. Term and Termination</p>
|
|
24
|
+
<p>The initial term of this Agreement is two (2) years. Either party may
|
|
25
|
+
terminate upon sixty (60) days’ written notice. This Agreement shall
|
|
26
|
+
automatically renew for successive one-year terms.</p>
|
|
27
|
+
|
|
28
|
+
<p>4. Confidentiality</p>
|
|
29
|
+
<p>Each party shall protect the other’s “Confidential
|
|
30
|
+
Information” using reasonable care.</p>
|
|
31
|
+
|
|
32
|
+
<p>5. Governing Law</p>
|
|
33
|
+
<p>This Agreement shall be governed by the laws of the State of California.</p>
|
|
34
|
+
</body>
|
|
35
|
+
</html>
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
{
|
|
2
|
+
"document": {
|
|
3
|
+
"title": "MASTER SERVICES AGREEMENT",
|
|
4
|
+
"format": "html",
|
|
5
|
+
"sha256": "088b40f13135e6b5d8f8548b162d657f10725d348388c7c3a416d11d7fc65300",
|
|
6
|
+
"source_path": "services_html.html"
|
|
7
|
+
},
|
|
8
|
+
"parties": [
|
|
9
|
+
{
|
|
10
|
+
"name": "Initrode Systems, Inc.",
|
|
11
|
+
"confidence": 0.9,
|
|
12
|
+
"source": "deterministic",
|
|
13
|
+
"role": "Provider"
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
"name": "Hooli LLC",
|
|
17
|
+
"confidence": 0.9,
|
|
18
|
+
"source": "deterministic",
|
|
19
|
+
"role": "Customer"
|
|
20
|
+
}
|
|
21
|
+
],
|
|
22
|
+
"dates": {
|
|
23
|
+
"effective": {
|
|
24
|
+
"value": "2023-03-15",
|
|
25
|
+
"confidence": 0.9,
|
|
26
|
+
"source": "deterministic"
|
|
27
|
+
},
|
|
28
|
+
"expiration": {
|
|
29
|
+
"value": null,
|
|
30
|
+
"confidence": 0.0,
|
|
31
|
+
"source": "none"
|
|
32
|
+
}
|
|
33
|
+
},
|
|
34
|
+
"term": {
|
|
35
|
+
"length": {
|
|
36
|
+
"value": "2 years",
|
|
37
|
+
"confidence": 0.7,
|
|
38
|
+
"source": "deterministic"
|
|
39
|
+
},
|
|
40
|
+
"auto_renew": {
|
|
41
|
+
"value": true,
|
|
42
|
+
"confidence": 0.65,
|
|
43
|
+
"source": "deterministic"
|
|
44
|
+
},
|
|
45
|
+
"notice_period_days": {
|
|
46
|
+
"value": 60,
|
|
47
|
+
"confidence": 0.7,
|
|
48
|
+
"source": "deterministic"
|
|
49
|
+
}
|
|
50
|
+
},
|
|
51
|
+
"governing_law": {
|
|
52
|
+
"value": "State of California",
|
|
53
|
+
"confidence": 0.85,
|
|
54
|
+
"source": "deterministic"
|
|
55
|
+
},
|
|
56
|
+
"clauses": [
|
|
57
|
+
{
|
|
58
|
+
"canonical_title": "Services",
|
|
59
|
+
"detected_title": "1. Services",
|
|
60
|
+
"tier": "numbered",
|
|
61
|
+
"span": {
|
|
62
|
+
"start": 242,
|
|
63
|
+
"end": 329
|
|
64
|
+
},
|
|
65
|
+
"confidence": 0.6,
|
|
66
|
+
"source": "deterministic",
|
|
67
|
+
"mapped": false
|
|
68
|
+
},
|
|
69
|
+
{
|
|
70
|
+
"canonical_title": "Payment",
|
|
71
|
+
"detected_title": "2. Fees and Payment",
|
|
72
|
+
"tier": "numbered",
|
|
73
|
+
"span": {
|
|
74
|
+
"start": 329,
|
|
75
|
+
"end": 476
|
|
76
|
+
},
|
|
77
|
+
"confidence": 0.8,
|
|
78
|
+
"source": "deterministic",
|
|
79
|
+
"mapped": true
|
|
80
|
+
},
|
|
81
|
+
{
|
|
82
|
+
"canonical_title": "Termination",
|
|
83
|
+
"detected_title": "3. Term and Termination",
|
|
84
|
+
"tier": "numbered",
|
|
85
|
+
"span": {
|
|
86
|
+
"start": 476,
|
|
87
|
+
"end": 692
|
|
88
|
+
},
|
|
89
|
+
"confidence": 0.8,
|
|
90
|
+
"source": "deterministic",
|
|
91
|
+
"mapped": true
|
|
92
|
+
},
|
|
93
|
+
{
|
|
94
|
+
"canonical_title": "Confidentiality",
|
|
95
|
+
"detected_title": "4. Confidentiality",
|
|
96
|
+
"tier": "numbered",
|
|
97
|
+
"span": {
|
|
98
|
+
"start": 692,
|
|
99
|
+
"end": 800
|
|
100
|
+
},
|
|
101
|
+
"confidence": 0.8,
|
|
102
|
+
"source": "deterministic",
|
|
103
|
+
"mapped": true
|
|
104
|
+
},
|
|
105
|
+
{
|
|
106
|
+
"canonical_title": "Governing Law",
|
|
107
|
+
"detected_title": "5. Governing Law",
|
|
108
|
+
"tier": "numbered",
|
|
109
|
+
"span": {
|
|
110
|
+
"start": 800,
|
|
111
|
+
"end": 890
|
|
112
|
+
},
|
|
113
|
+
"confidence": 0.8,
|
|
114
|
+
"source": "deterministic",
|
|
115
|
+
"mapped": true
|
|
116
|
+
}
|
|
117
|
+
],
|
|
118
|
+
"defined_terms": [
|
|
119
|
+
{
|
|
120
|
+
"term": "Agreement",
|
|
121
|
+
"confidence": 0.6,
|
|
122
|
+
"source": "deterministic"
|
|
123
|
+
},
|
|
124
|
+
{
|
|
125
|
+
"term": "Effective Date",
|
|
126
|
+
"confidence": 0.6,
|
|
127
|
+
"source": "deterministic"
|
|
128
|
+
},
|
|
129
|
+
{
|
|
130
|
+
"term": "Provider",
|
|
131
|
+
"confidence": 0.6,
|
|
132
|
+
"source": "deterministic"
|
|
133
|
+
},
|
|
134
|
+
{
|
|
135
|
+
"term": "Customer",
|
|
136
|
+
"confidence": 0.6,
|
|
137
|
+
"source": "deterministic"
|
|
138
|
+
},
|
|
139
|
+
{
|
|
140
|
+
"term": "Confidential Information",
|
|
141
|
+
"confidence": 0.6,
|
|
142
|
+
"source": "deterministic"
|
|
143
|
+
}
|
|
144
|
+
],
|
|
145
|
+
"value": {
|
|
146
|
+
"value": "$500,000",
|
|
147
|
+
"confidence": 0.6,
|
|
148
|
+
"source": "deterministic"
|
|
149
|
+
},
|
|
150
|
+
"_meta": {
|
|
151
|
+
"extractor_version": "0.1.2",
|
|
152
|
+
"tiers_used": [
|
|
153
|
+
"deterministic"
|
|
154
|
+
],
|
|
155
|
+
"llm_used": false
|
|
156
|
+
}
|
|
157
|
+
}
|
|
@@ -12,8 +12,8 @@ def test_parties_between_simple() -> None:
|
|
|
12
12
|
assert all(0.0 <= p["confidence"] <= 1.0 for p in parties)
|
|
13
13
|
|
|
14
14
|
|
|
15
|
-
def
|
|
16
|
-
text = ('by and between Acme Corp. (the "Disclosing
|
|
15
|
+
def test_parties_with_roles() -> None:
|
|
16
|
+
text = ('by and between Acme Corp. (the "Disclosing Party") and '
|
|
17
17
|
'Beta LLC (the "Receiving Party"), dated March 1, 2024.')
|
|
18
18
|
parties = ex.extract_parties(text)
|
|
19
19
|
assert parties[0]["name"] == "Acme Corp."
|
|
@@ -22,6 +22,30 @@ def test_parties_with_roles_and_linebreak() -> None:
|
|
|
22
22
|
assert parties[1]["role"] == "Receiving Party"
|
|
23
23
|
|
|
24
24
|
|
|
25
|
+
def test_parties_linebreak_handled_by_build() -> None:
|
|
26
|
+
# build_extraction flattens whitespace, so a party/role that wraps across a
|
|
27
|
+
# line is matched whole.
|
|
28
|
+
text = ('This Agreement is made by and between Acme Corp. (the "Disclosing\n'
|
|
29
|
+
'Party") and Beta LLC (the "Receiving Party").')
|
|
30
|
+
r = ex.build_extraction(text, text.encode("utf-8"), "text", "x.txt")
|
|
31
|
+
assert [p["name"] for p in r["parties"]] == ["Acme Corp.", "Beta LLC"]
|
|
32
|
+
assert r["parties"][0]["role"] == "Disclosing Party"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_parties_skip_and_inside_description() -> None:
|
|
36
|
+
# An "and" inside a party's own description must not split the parties.
|
|
37
|
+
text = ("between Blade Ventures Inc., a Nevada corporation having offices at "
|
|
38
|
+
"1 Main St and doing business as Foo (\"Client\"), and KPMG LP")
|
|
39
|
+
parties = ex.extract_parties(text)
|
|
40
|
+
assert [p["name"] for p in parties] == ["Blade Ventures Inc.", "KPMG LP"]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_party_name_descriptors_trimmed() -> None:
|
|
44
|
+
assert ex._clean_party_name("Visteon Corporation, a Delaware corporation") == "Visteon Corporation"
|
|
45
|
+
assert ex._clean_party_name("Foo Inc. doing business as Bar") == "Foo Inc."
|
|
46
|
+
assert ex._clean_party_name("Baz LLC having its principal office at X") == "Baz LLC"
|
|
47
|
+
|
|
48
|
+
|
|
25
49
|
def test_parties_none() -> None:
|
|
26
50
|
assert ex.extract_parties("There are no parties named here.") == []
|
|
27
51
|
|
|
@@ -80,6 +104,15 @@ def test_governing_law_stops_before_trailing_clause() -> None:
|
|
|
80
104
|
assert out["value"] == "State of Delaware"
|
|
81
105
|
|
|
82
106
|
|
|
107
|
+
def test_governing_law_linebreak_handled_by_build() -> None:
|
|
108
|
+
# A jurisdiction that wraps a line ("...the Province\nof Ontario") is
|
|
109
|
+
# matched whole because build_extraction flattens whitespace first.
|
|
110
|
+
text = ("This Agreement shall be governed by the laws of the Province\n"
|
|
111
|
+
"of Ontario and the federal laws of Canada.")
|
|
112
|
+
r = ex.build_extraction(text, text.encode("utf-8"), "text", "x.txt")
|
|
113
|
+
assert r["governing_law"]["value"] == "Province of Ontario"
|
|
114
|
+
|
|
115
|
+
|
|
83
116
|
def test_governing_law_missing() -> None:
|
|
84
117
|
assert ex.extract_governing_law("nothing about law")["source"] == "none"
|
|
85
118
|
|
|
@@ -142,6 +142,32 @@ def test_pdf_unescape() -> None:
|
|
|
142
142
|
assert ex._pdf_unescape(r"\101\102") == "AB" # octal escapes
|
|
143
143
|
|
|
144
144
|
|
|
145
|
+
def test_html_extraction() -> None:
|
|
146
|
+
raw, text, fmt, _w = ex.load_source(FIXTURES / "services_html.html")
|
|
147
|
+
assert fmt == "html"
|
|
148
|
+
# script/style content is dropped; entities are unescaped.
|
|
149
|
+
assert "this should never appear" not in text
|
|
150
|
+
result = ex.build_extraction(text, raw, fmt, "services_html.html")
|
|
151
|
+
assert result["document"]["format"] == "html"
|
|
152
|
+
assert [p["name"] for p in result["parties"]] == ["Initrode Systems, Inc.", "Hooli LLC"]
|
|
153
|
+
assert result["governing_law"]["value"] == "State of California"
|
|
154
|
+
assert result["dates"]["effective"]["value"] == "2023-03-15"
|
|
155
|
+
canon = {c["canonical_title"] for c in result["clauses"]}
|
|
156
|
+
assert {"Payment", "Termination", "Confidentiality", "Governing Law"} <= canon
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def test_html_detected_by_content_sniff(tmp_path: Any) -> None:
|
|
160
|
+
# HTML masquerading as .txt (e.g. a SEC EDGAR full submission) is sniffed.
|
|
161
|
+
p = tmp_path / "exhibit.txt"
|
|
162
|
+
p.write_text("<html><body><p>between A Co and B Co</p></body></html>")
|
|
163
|
+
_raw, _text, fmt, _w = ex.load_source(p)
|
|
164
|
+
assert fmt == "html"
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def test_html_malformed_does_not_crash() -> None:
|
|
168
|
+
assert ex._read_html("<p>unclosed <b>bold <div>text") is not None
|
|
169
|
+
|
|
170
|
+
|
|
145
171
|
def test_pdf_text_only_inside_bt_et() -> None:
|
|
146
172
|
# Strings outside BT/ET (font/signature/metadata stream bytes that happen to
|
|
147
173
|
# contain parentheses) must be ignored; only text objects yield text.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|