extract-cli 0.1.1__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. {extract_cli-0.1.1 → extract_cli-0.1.3}/ARCHITECTURE.md +3 -0
  2. {extract_cli-0.1.1 → extract_cli-0.1.3}/CHANGELOG.md +60 -0
  3. {extract_cli-0.1.1 → extract_cli-0.1.3}/PKG-INFO +8 -7
  4. {extract_cli-0.1.1 → extract_cli-0.1.3}/README.md +6 -5
  5. {extract_cli-0.1.1 → extract_cli-0.1.3}/docs/spec/extract-output.schema.json +2 -1
  6. {extract_cli-0.1.1 → extract_cli-0.1.3}/extract_cli.py +187 -30
  7. {extract_cli-0.1.1 → extract_cli-0.1.3}/pyproject.toml +2 -2
  8. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/_make_goldens.py +2 -1
  9. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/conftest.py +1 -0
  10. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/fixtures/employment_docx.docx.expected.json +1 -1
  11. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
  12. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
  13. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/fixtures/nda_h2.md.expected.json +6 -1
  14. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/fixtures/scanned.pdf.expected.json +1 -1
  15. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/fixtures/services_bold.txt.expected.json +1 -1
  16. extract_cli-0.1.3/tests/fixtures/services_html.html +35 -0
  17. extract_cli-0.1.3/tests/fixtures/services_html.html.expected.json +157 -0
  18. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/test_clause_map.py +25 -0
  19. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/test_deterministic.py +35 -2
  20. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/test_misc.py +26 -0
  21. {extract_cli-0.1.1 → extract_cli-0.1.3}/.gitignore +0 -0
  22. {extract_cli-0.1.1 → extract_cli-0.1.3}/CONTRIBUTING.md +0 -0
  23. {extract_cli-0.1.1 → extract_cli-0.1.3}/LICENSE +0 -0
  24. {extract_cli-0.1.1 → extract_cli-0.1.3}/Makefile +0 -0
  25. {extract_cli-0.1.1 → extract_cli-0.1.3}/config/llm.json.example +0 -0
  26. {extract_cli-0.1.1 → extract_cli-0.1.3}/docs/INTEROP.md +0 -0
  27. {extract_cli-0.1.1 → extract_cli-0.1.3}/scripts/release.py +0 -0
  28. {extract_cli-0.1.1 → extract_cli-0.1.3}/scripts/validate_against_spec.py +0 -0
  29. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/_fixtures_build.py +0 -0
  30. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/_schema_validator.py +0 -0
  31. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/fixtures/employment_docx.docx +0 -0
  32. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/fixtures/lease_allcaps.txt +0 -0
  33. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/fixtures/license_pdf.pdf +0 -0
  34. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/fixtures/nda_h2.md +0 -0
  35. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/fixtures/scanned.pdf +0 -0
  36. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/fixtures/services_bold.txt +0 -0
  37. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/test_cli.py +0 -0
  38. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/test_llm.py +0 -0
  39. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/test_property.py +0 -0
  40. {extract_cli-0.1.1 → extract_cli-0.1.3}/tests/test_schema_conformance.py +0 -0
@@ -8,11 +8,14 @@ map.
8
8
  ```
9
9
  load_source(path) extension/content sniff → reader
10
10
  ├─ .md/.txt → utf-8 decode
11
+ ├─ .html → stdlib html.parser reader (also auto-detected inside .txt)
11
12
  ├─ .docx → python-docx (if [docx]) else stdlib zipfile/XML reader
12
13
  └─ .pdf → pypdf (if [pdf]) else stdlib zlib + text-operator reader
13
14
 
14
15
  ▼ (raw_bytes, text, format, warnings)
15
16
  build_extraction(text, raw, fmt, src) the DETERMINISTIC tier (always on)
17
+ │ field extractors run on a whitespace-FLATTENED copy (so values that wrap
18
+ │ across a line are matched whole); clause detection keeps the original text
16
19
  ├─ extract_parties "between X and Y", with role parentheticals
17
20
  ├─ extract_dates effective / expiration, ISO-normalized
18
21
  ├─ extract_term length / auto_renew / notice_period_days
@@ -6,6 +6,64 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
6
6
  (see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
7
7
  the output schema require a major version bump**; new optional fields are minor.
8
8
 
9
+ ## [0.1.3] - 2026-05-21
10
+
11
+ Clause-map de-noising and party cleanup, driven by testing against 10 more
12
+ contracts (SEC EDGAR credit, loan, employment, lease, asset-purchase, and
13
+ consulting HTML exhibits; Apache PDFs).
14
+
15
+ ### Fixed
16
+ - **Clause map drops structural noise** common in dense real documents:
17
+ a heading whose title repeats 3+ times is treated as a running header/footer
18
+ (one lease's `Ks 112708-2` page code went from 44 "clauses" to 0), and
19
+ front/back-matter (`Table of Contents`, `Exhibit B`, `Schedule 2.1`) and
20
+ document codes/page numbers (4+ consecutive digits) are filtered out.
21
+ - **Party-name cleanup** extended: trailing `together with …`, `, as
22
+ administrative agent`, and a dangling unclosed parenthetical
23
+ (`(each of them being`) are trimmed.
24
+
25
+ ### Notes
26
+ - On dense documents the deterministic clause map can still surface a few
27
+ non-clause headings (e.g. address lines in a notices block); consumers
28
+ wanting only suite-vocabulary clauses should filter on `mapped == true`,
29
+ which isolates the real clauses (the noise is always `mapped == false`).
30
+ - Known best-effort edge cases on varied real paper: a bare role word as a
31
+ party name ("Landlord"), and a middle-initial period truncating a personal
32
+ name ("John C." → "John C"). Best-effort fields carry confidence/source.
33
+
34
+ ## [0.1.2] - 2026-05-21
35
+
36
+ More real-world hardening, driven by testing against five additional contracts
37
+ (SEC EDGAR consulting/MSA, lease, and Visteon services agreements; Common Paper
38
+ and Perigon Cloud Service Agreements).
39
+
40
+ ### Added
41
+ - **HTML input** (`.html`/`.htm`, and HTML auto-detected inside `.txt` such as
42
+ SEC EDGAR full submissions). Stdlib `html.parser`-based reader strips
43
+ script/style, frames block elements so heading detection still works, and
44
+ unescapes entities. `document.format` enum gains `html` (backward-compatible
45
+ widening). This turns the large class of HTML contracts (SEC exhibits, web
46
+ ToS) from garbage into structured output.
47
+
48
+ ### Fixed
49
+ - **Field extraction now runs on whitespace-flattened text**, so values that
50
+ wrap across a line break are matched whole — e.g. governing law
51
+ `the laws of the Province\nof Ontario` now yields `Province of Ontario`, and
52
+ line-wrapped party names/defined terms are captured.
53
+ - **Party extraction** (continues issue #2): names are trimmed of trailing
54
+ descriptors (`, a Delaware corporation`, `doing business as …`,
55
+ `having its offices at …`, `as of …`), and each party must begin with a
56
+ capital so an `and` *inside* a party's own description no longer splits the
57
+ parties (`…V6E 3S7 and doing business as …` → real parties recovered).
58
+
59
+ ### Known limitations (documented, not bugs)
60
+ - The stdlib PDF reader cannot decode PDFs that use embedded subset fonts with
61
+ hex-encoded glyph strings (common in professionally-typeset PDFs); these
62
+ degrade gracefully to a low-signal warning. Install the `[pdf]` extra (pypdf)
63
+ for them — verified to recover full text and clause structure.
64
+ - Two-line `ARTICLE N` / title headings (number on one line, title on the next)
65
+ are not yet detected.
66
+
9
67
  ## [0.1.1] - 2026-05-21
10
68
 
11
69
  Real-world hardening, driven by testing against a SEC EDGAR employment
@@ -82,5 +140,7 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
82
140
  intentionally *not* governed by the output schema (the schema describes the
83
141
  full default output).
84
142
 
143
+ [0.1.3]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.3
144
+ [0.1.2]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.2
85
145
  [0.1.1]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.1
86
146
  [0.1.0]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-cli
3
- Version: 0.1.1
4
- Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.docx/.pdf) and emit structured JSON.
3
+ Version: 0.1.3
4
+ Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
5
5
  Project-URL: Homepage, https://cli.drbaher.com/
6
6
  Project-URL: Repository, https://github.com/DrBaher/extract-cli
7
7
  Project-URL: Suite interop, https://github.com/DrBaher/extract-cli/blob/main/docs/INTEROP.md
@@ -63,8 +63,8 @@ ingest (extract) → review → diff → convert → sign
63
63
 
64
64
  ## What it does
65
65
 
66
- Give it a contract in **`.md` / `.txt`** (native), **`.docx`**, or **`.pdf`**,
67
- and it returns structured JSON: the parties, dates, term, governing law, a
66
+ Give it a contract in **`.md` / `.txt` / `.html`** (native), **`.docx`**, or
67
+ **`.pdf`**, and it returns structured JSON: the parties, dates, term, governing law, a
68
68
  **clause map** normalized onto the suite's canonical clause vocabulary, a
69
69
  defined-term inventory, and a headline value. Every field carries a
70
70
  `confidence` and a `source` so downstream tools **verify, don't trust**.
@@ -75,14 +75,15 @@ daemon, no network in the default path.
75
75
  ## Install
76
76
 
77
77
  ```bash
78
- pip install extract-cli # core: .md/.txt + best-effort .docx/.pdf
78
+ pip install extract-cli # core: .md/.txt/.html + best-effort .docx/.pdf
79
79
  pip install "extract-cli[docx]" # higher-fidelity .docx (python-docx)
80
80
  pip install "extract-cli[pdf]" # higher-fidelity .pdf (pypdf)
81
81
  pip install "extract-cli[docx,pdf]" # both
82
82
  ```
83
83
 
84
- The core has **zero runtime dependencies** and is fully functional on `.md`/`.txt`
85
- with no extras. `.docx` and `.pdf` work out of the box via stdlib readers; the
84
+ The core has **zero runtime dependencies** and is fully functional on
85
+ `.md`/`.txt`/`.html` with no extras (HTML is also auto-detected when it hides
86
+ inside a `.txt`, e.g. SEC EDGAR filings). `.docx` and `.pdf` work out of the box via stdlib readers; the
86
87
  `[docx]`/`[pdf]` extras improve fidelity on complex documents (see
87
88
  [ARCHITECTURE.md](ARCHITECTURE.md)).
88
89
 
@@ -25,8 +25,8 @@ ingest (extract) → review → diff → convert → sign
25
25
 
26
26
  ## What it does
27
27
 
28
- Give it a contract in **`.md` / `.txt`** (native), **`.docx`**, or **`.pdf`**,
29
- and it returns structured JSON: the parties, dates, term, governing law, a
28
+ Give it a contract in **`.md` / `.txt` / `.html`** (native), **`.docx`**, or
29
+ **`.pdf`**, and it returns structured JSON: the parties, dates, term, governing law, a
30
30
  **clause map** normalized onto the suite's canonical clause vocabulary, a
31
31
  defined-term inventory, and a headline value. Every field carries a
32
32
  `confidence` and a `source` so downstream tools **verify, don't trust**.
@@ -37,14 +37,15 @@ daemon, no network in the default path.
37
37
  ## Install
38
38
 
39
39
  ```bash
40
- pip install extract-cli # core: .md/.txt + best-effort .docx/.pdf
40
+ pip install extract-cli # core: .md/.txt/.html + best-effort .docx/.pdf
41
41
  pip install "extract-cli[docx]" # higher-fidelity .docx (python-docx)
42
42
  pip install "extract-cli[pdf]" # higher-fidelity .pdf (pypdf)
43
43
  pip install "extract-cli[docx,pdf]" # both
44
44
  ```
45
45
 
46
- The core has **zero runtime dependencies** and is fully functional on `.md`/`.txt`
47
- with no extras. `.docx` and `.pdf` work out of the box via stdlib readers; the
46
+ The core has **zero runtime dependencies** and is fully functional on
47
+ `.md`/`.txt`/`.html` with no extras (HTML is also auto-detected when it hides
48
+ inside a `.txt`, e.g. SEC EDGAR filings). `.docx` and `.pdf` work out of the box via stdlib readers; the
48
49
  `[docx]`/`[pdf]` extras improve fidelity on complex documents (see
49
50
  [ARCHITECTURE.md](ARCHITECTURE.md)).
50
51
 
@@ -69,7 +69,8 @@
69
69
  "markdown",
70
70
  "text",
71
71
  "docx",
72
- "pdf"
72
+ "pdf",
73
+ "html"
73
74
  ]
74
75
  },
75
76
  "sha256": {
@@ -4,8 +4,8 @@
4
4
  The suite is a contract lifecycle (store -> draft -> review -> diff -> convert
5
5
  -> sign) that, until now, only handled documents it authored from its own
6
6
  templates. `extract-cli` is "passport control": it ingests ANY document --
7
- yours or a counterparty's foreign paper -- in .md/.txt (natively), .docx, or
8
- .pdf, and emits a structured JSON representation that the rest of the suite
7
+ yours or a counterparty's foreign paper -- in .md/.txt/.html (natively), .docx,
8
+ or .pdf, and emits a structured JSON representation that the rest of the suite
9
9
  (nda-review-cli, compare-cli, contract-vault) consumes.
10
10
 
11
11
  Two extraction tiers:
@@ -32,6 +32,7 @@ from __future__ import annotations
32
32
  import argparse
33
33
  import datetime as _dt
34
34
  import hashlib
35
+ import html.parser
35
36
  import importlib.util
36
37
  import json
37
38
  import os
@@ -42,11 +43,11 @@ import urllib.request
42
43
  from pathlib import Path
43
44
  from typing import Any, Dict, List, Optional, Tuple
44
45
 
45
- __version__ = "0.1.1"
46
+ __version__ = "0.1.3"
46
47
 
47
48
  # Bumped independently of the package version when the *extraction logic*
48
49
  # changes in a way downstream consumers should notice. Embedded in `_meta`.
49
- EXTRACTOR_VERSION = "0.1.1"
50
+ EXTRACTOR_VERSION = "0.1.3"
50
51
 
51
52
  # JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
52
53
  SCHEMA_VERSION = 1
@@ -492,10 +493,17 @@ _EXPIRE_RE = re.compile(
492
493
  re.IGNORECASE,
493
494
  )
494
495
 
# Each party must start with a capital letter (optionally preceded by "the" /
# "The" / "its"), a double quote, or an opening paren. The party-start check is
# case-sensitive on purpose (there is no global IGNORECASE): it lets the engine
# skip an "and" that sits INSIDE a party's own description
# ("...V6E 3S7 and doing business as ...", where the right side starts
# lowercase) and find the real "and" before the second named entity.
_PARTY_START = r"(?:(?:[Tt]he|its)\s+)?[A-Z\"“(]"
# Keywords -- including the "and" separator between the two parties -- are
# matched case-insensitively through scoped (?i:...) groups, so an all-caps
# preamble ("BY AND BETWEEN ACME CORP AND FOO LLC") still yields both parties.
# Group 1 is the first party, group 2 the second; neither may span a newline.
_PARTY_BLOCK_RE = re.compile(
    r"(?i:\b(?:by\s+and\s+between|between)\s+)"
    r"(" + _PARTY_START + r"[^\n]{1,200}?)\s+(?i:and)\s+"
    r"(" + _PARTY_START + r"[^\n]{1,200}?)"
    r"(?=[\.;\n]|(?i:\bwhereas\b|\beffective\b|\bdated\b|\bas\s+of\b|\bwitnesseth\b)|$)",
)
500
508
  _ROLE_PAREN_RE = re.compile(
501
509
  r"\(\s*(?:the\s+)?[\"“]?([^\"”()]+?)[\"”]?\s*\)"
@@ -604,8 +612,47 @@ def _date_field(match: Optional["re.Match[str]"]) -> JSON:
604
612
  return _date_field_from_str(match.group(1), 0.85)
605
613
 
606
614
 
# Trailing descriptors that follow a party's actual name and should be dropped
# ("Acme Corp., a Delaware corporation", "... doing business as Foo", "... as of
# March 1", "... having its offices at ..."). Each is matched and everything from
# it onward is cut.
_PARTY_CUT_MARKERS: Tuple[str, ...] = (
    r",\s+an?\s+\w",  # ", a Delaware ..." / ", an Ohio ..."
    r"\s+doing\s+business\s+as\b",
    r"\s+d/?b/?a\b",
    r"\s+f/?k/?a\b",
    r"\s+a[n]?\s+\w+\s+(?:corporation|company|partnership|limited)\b",
    r"\s+having\b",
    r"\s+with\s+(?:its\s+)?(?:offices|principal|a\s)\b",
    r"\s+with\s+offices\b",
    r"\s+located\b",
    r"\s+organized\b",
    r"\s+incorporated\b",
    r"\s+whose\b",
    r"\s+together\b",
    r",\s+as\s+\w",  # ", as administrative agent"
    r"\s+(?:as\s+of|dated|effective)\b",
)


def _clean_party_name(s: str) -> str:
    """Trim a captured party name down to the entity name.

    Whitespace runs are collapsed to single spaces, then the text is cut at
    each descriptor in ``_PARTY_CUT_MARKERS`` that matches (case-insensitive),
    e.g. 'a Delaware corporation', 'd/b/a ...', 'together with ...',
    'as of ...'. Finally any dangling unclosed parenthetical
    ('(each of them ...') and surrounding double quotes are removed.

    Args:
        s: Raw captured party text; may contain line breaks and descriptors.

    Returns:
        The cleaned name; may be empty when nothing but descriptors remained.
    """
    s = re.sub(r"\s+", " ", s).strip().strip(",").strip()
    for pat in _PARTY_CUT_MARKERS:
        m = re.search(pat, s, re.IGNORECASE)
        if m:
            s = s[: m.start()].strip().strip(",").strip()
    # Drop a parenthetical that was opened but never closed (the close fell
    # outside the captured span), e.g. "Glenn Rufrano (each of them being".
    # Parens are balanced with a stack so that an earlier, properly closed pair
    # ("Acme (USA) Inc. (each of them being") does not hide the dangling one.
    unclosed: List[int] = []
    for pos, ch in enumerate(s):
        if ch == "(":
            unclosed.append(pos)
        elif ch == ")" and unclosed:
            unclosed.pop()
    if unclosed:
        # Cut at the earliest unmatched "(" -- everything after it is inside
        # an incomplete parenthetical.
        s = s[: unclosed[0]].strip().strip(",").strip()
    return s.strip("\"“”").strip()
652
+
653
+
607
654
  def _split_name_role(s: str) -> Tuple[str, Optional[str]]:
608
- s = s.strip().strip(",").strip()
655
+ s = re.sub(r"\s+", " ", s).strip().strip(",").strip()
609
656
  role: Optional[str] = None
610
657
  m = _ROLE_PAREN_RE.search(s)
611
658
  if m:
@@ -614,9 +661,7 @@ def _split_name_role(s: str) -> Tuple[str, Optional[str]]:
614
661
  if len(candidate) <= 40 and candidate.lower() not in ("a", "an", "the"):
615
662
  role = candidate
616
663
  s = (s[: m.start()] + s[m.end():]).strip().rstrip(",").strip()
617
- s = s.strip("\"“”").strip()
618
- s = re.sub(r"\s+", " ", s)
619
- return s, role
664
+ return _clean_party_name(s), role
620
665
 
621
666
 
622
667
  def extract_parties(text: str) -> List[JSON]:
@@ -625,9 +670,6 @@ def extract_parties(text: str) -> List[JSON]:
625
670
  return []
626
671
  out: List[JSON] = []
627
672
  for raw in (m.group(1), m.group(2)):
628
- # Party names can wrap across lines ("...(the \"Disclosing\nParty\")");
629
- # collapse whitespace rather than truncating at the first newline.
630
- raw = re.sub(r"\s+", " ", raw).strip()
631
673
  name, role = _split_name_role(raw)
632
674
  if not name or len(name) < 2 or len(name) > 120:
633
675
  continue
@@ -706,9 +748,45 @@ def extract_defined_terms(text: str) -> List[JSON]:
706
748
  return [{"term": t, "confidence": 0.6, "source": "deterministic"} for t in seen]
707
749
 
708
750
 
# Heading titles opening with one of these words are front/back-matter or
# exhibit/schedule/page references, which are almost never real clauses.
_NOISE_TITLE_PREFIX_RE = re.compile(
    r"^(?:table\s+of\s+contents|exhibit|schedule|annex|appendix|attachment|"
    r"signature\s+page|page)\b",
    re.IGNORECASE,
)


def _is_noise_clause_title(title: str) -> bool:
    """Report whether a detected heading is structural noise, not a clause.

    A title counts as noise when, after stripping surrounding whitespace, it
    either contains a run of 4+ consecutive digits (document codes / page
    numbers such as 'Ks 112708-2') or begins with a front/back-matter word
    ('Table of Contents', 'Exhibit B', 'Schedule 2.1', ...). The filters are
    intentionally narrow so real clauses are not dropped.
    """
    candidate = title.strip()
    has_digit_run = re.search(r"\d{4,}", candidate) is not None
    has_noise_prefix = _NOISE_TITLE_PREFIX_RE.match(candidate) is not None
    return has_digit_run or has_noise_prefix
772
+
773
+
709
774
  def extract_clauses(text: str) -> List[JSON]:
775
+ detected = detect_clauses(text)
776
+ # A heading whose title repeats 3+ times across the document is almost
777
+ # always a running header/footer (e.g. a page code), not that many distinct
778
+ # clauses -- drop every occurrence. (Counted on the normalized title.)
779
+ counts: Dict[str, int] = {}
780
+ for c in detected:
781
+ k = _norm_clause_key(c["title"])
782
+ counts[k] = counts.get(k, 0) + 1
783
+
710
784
  out: List[JSON] = []
711
- for c in detect_clauses(text):
785
+ for c in detected:
786
+ if counts[_norm_clause_key(c["title"])] >= 3:
787
+ continue
788
+ if _is_noise_clause_title(c["title"]):
789
+ continue
712
790
  canonical, mapped = _canonicalize_clause(c["title"])
713
791
  tier = c["tier"]
714
792
  base = {"h2": 0.95, "bold-numbered": 0.85, "numbered": 0.8,
@@ -750,21 +828,91 @@ def extract_title(text: str, path: Optional[Path], fmt: str) -> Optional[str]:
750
828
  # ---------------------------------------------------------------------------
751
829
 
752
830
 
def _looks_like_html(head: str) -> bool:
    """Guess whether *head* (the start of a document) is HTML markup.

    Used to catch HTML masquerading as .txt (e.g. SEC EDGAR full submissions
    wrap HTML exhibits in a .txt). The text is HTML when it contains a doctype,
    ``<html`` or ``<body`` marker, or at least six common opening/closing tags.
    Matching is done on a lowercased copy, so it is case-insensitive.
    """
    lowered = head.lower()
    if any(marker in lowered for marker in ("<!doctype html", "<html", "<body")):
        return True
    tag_hits = re.findall(
        r"</?(?:p|div|table|tr|td|span|br|h[1-6]|font|b|i)\b", lowered
    )
    return len(tag_hits) >= 6
838
+
839
+
def _detect_format(path: Path, raw: bytes) -> str:
    """Classify a source document as html / docx / pdf / markdown / text.

    The file extension is consulted first, then magic bytes, then a content
    sniff for HTML:

    * ``.htm`` / ``.html`` / ``.xhtml``  -> "html"
    * ``.docx``                          -> "docx"
    * ``.pdf`` or a ``%PDF`` header      -> "pdf"
    * a ``PK`` (zip) header              -> "docx", unless the extension says
      the file is markdown/plain text
    * otherwise "html" when the first 4096 bytes look like HTML markup
      (HTML hiding inside a .txt/.md or extensionless file), else "markdown"
      for ``.md`` / ``.markdown`` and "text" for everything else.
    """
    suffix = path.suffix.lower()
    markdown_suffixes = (".md", ".markdown")

    if suffix in (".htm", ".html", ".xhtml"):
        return "html"
    if suffix == ".docx":
        return "docx"
    if suffix == ".pdf" or raw[:4] == b"%PDF":
        return "pdf"
    if raw[:2] == b"PK" and suffix not in (".md", ".markdown", ".txt"):
        return "docx"

    # Content sniff on a lossy decode of the leading bytes only.
    leading_text = raw[:4096].decode("utf-8", "replace")
    if _looks_like_html(leading_text):
        return "html"
    return "markdown" if suffix in markdown_suffixes else "text"
857
+
858
+
class _HTMLTextExtractor(html.parser.HTMLParser):
    """Stdlib HTML -> text: drops script/style, frames block elements with blank
    lines (so clause-heading detection still works), and unescapes entities.

    Usage: ``feed()`` the markup, ``close()``, then call ``get_text()``.
    """

    # Container elements whose text content is never document text. Each start
    # tag bumps a depth counter that the matching end tag decrements.
    _SKIP = {"script", "style", "head", "title", "noscript"}
    # Void elements: they have no end tag and carry no text. They must NOT be
    # counted like _SKIP containers -- an unclosed <meta charset=...> would
    # leave the skip counter above zero and silently drop the whole body.
    _VOID = {"meta", "link"}
    _BLOCK = {
        "p", "div", "br", "tr", "li", "h1", "h2", "h3", "h4", "h5", "h6",
        "section", "article", "table", "ul", "ol", "blockquote", "pre", "hr",
        "thead", "tbody", "header", "footer", "main",
    }

    def __init__(self) -> None:
        # convert_charrefs=True makes the parser hand handle_data() text with
        # character references already decoded.
        super().__init__(convert_charrefs=True)
        self._parts: List[str] = []  # collected text chunks and "\n" frames
        self._skip = 0  # nesting depth inside _SKIP containers

    def handle_starttag(self, tag: str, attrs: Any) -> None:
        """Track skipped containers and emit a line break before block tags."""
        if tag in self._VOID:
            return
        if tag == "body":
            # </head> may legally be omitted; the body always starts visible
            # content, so clear any skip depth left open by the head section.
            self._skip = 0
            return
        if tag in self._SKIP:
            self._skip += 1
        elif tag in self._BLOCK:
            self._parts.append("\n")

    def handle_endtag(self, tag: str) -> None:
        """Leave a skipped container, or emit a line break after block tags."""
        if tag in self._VOID:
            return
        if tag in self._SKIP:
            # A stray end tag with no open container is ignored.
            if self._skip > 0:
                self._skip -= 1
        elif tag in self._BLOCK:
            self._parts.append("\n")

    def handle_data(self, data: str) -> None:
        """Keep text only when not inside a skipped container."""
        if self._skip == 0:
            self._parts.append(data)

    def get_text(self) -> str:
        """Return the collected text, whitespace-normalized.

        Each line is stripped and its runs of spaces/tabs collapsed; runs of
        blank lines are collapsed to a single blank line (which gives
        ALL-CAPS / numbered headings their blank-line frame); leading and
        trailing blank space is removed.
        """
        lines = [
            re.sub(r"[ \t]+", " ", ln).strip()
            for ln in "".join(self._parts).split("\n")
        ]
        out: List[str] = []
        blank = False
        for ln in lines:
            if ln:
                out.append(ln)
                blank = False
            elif not blank:
                out.append("")
                blank = True
        return "\n".join(out).strip()
905
+
906
+
def _read_html(raw_text: str) -> str:
    """Convert HTML markup to plain text for the extractors.

    The markup is run through ``_HTMLTextExtractor``. If the parser raises on
    malformed input, the function never propagates the error: it falls back to
    a crude regex tag strip and decodes character references, so the fallback
    text is entity-free like the normal path.

    Args:
        raw_text: The decoded HTML document.

    Returns:
        The extracted text.
    """
    parser = _HTMLTextExtractor()
    try:
        parser.feed(raw_text)
        parser.close()
    except Exception:
        # Never crash on malformed markup; fall back to a crude tag strip.
        # Entities are unescaped here because the parser (which normally does
        # it) did not finish.
        return html.unescape(re.sub(r"<[^>]+>", " ", raw_text))
    return parser.get_text()
768
916
 
769
917
 
770
918
  def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[str, List[str]]:
@@ -986,6 +1134,8 @@ def load_source(path: Path, prefer_optional: bool = True) -> Tuple[bytes, str, s
986
1134
  warnings: List[str] = []
987
1135
  if fmt in ("markdown", "text"):
988
1136
  text = raw.decode("utf-8", "replace")
1137
+ elif fmt == "html":
1138
+ text = _read_html(raw.decode("utf-8", "replace"))
989
1139
  elif fmt == "docx":
990
1140
  text, w = _read_docx(path, raw, prefer_optional)
991
1141
  warnings += w
@@ -1011,6 +1161,13 @@ def build_extraction(text: str, raw: bytes, fmt: str,
1011
1161
  source_path: Optional[str]) -> JSON:
1012
1162
  """Run the deterministic tier and assemble the output contract object."""
1013
1163
  sha = hashlib.sha256(raw).hexdigest()
1164
+ # Field extractors (parties, dates, governing law, term, value, defined
1165
+ # terms) run on a whitespace-flattened copy so values that wrap across a
1166
+ # line break in the source -- "...laws of the Province\nof Ontario", a party
1167
+ # name split mid-line -- are matched whole. Clause detection and the title
1168
+ # keep the original text, which depends on line structure.
1169
+ flat = re.sub(r"[ \t\r\f\v]*\n[ \t\r\f\v]*", " ", text)
1170
+ flat = re.sub(r"[ \t]+", " ", flat)
1014
1171
  return {
1015
1172
  "document": {
1016
1173
  "title": extract_title(text, Path(source_path) if source_path else None, fmt),
@@ -1018,13 +1175,13 @@ def build_extraction(text: str, raw: bytes, fmt: str,
1018
1175
  "sha256": sha,
1019
1176
  "source_path": source_path,
1020
1177
  },
1021
- "parties": extract_parties(text),
1022
- "dates": extract_dates(text),
1023
- "term": extract_term(text),
1024
- "governing_law": extract_governing_law(text),
1178
+ "parties": extract_parties(flat),
1179
+ "dates": extract_dates(flat),
1180
+ "term": extract_term(flat),
1181
+ "governing_law": extract_governing_law(flat),
1025
1182
  "clauses": extract_clauses(text),
1026
- "defined_terms": extract_defined_terms(text),
1027
- "value": extract_value(text),
1183
+ "defined_terms": extract_defined_terms(flat),
1184
+ "value": extract_value(flat),
1028
1185
  "_meta": {
1029
1186
  "extractor_version": EXTRACTOR_VERSION,
1030
1187
  "tiers_used": ["deterministic"],
@@ -1336,7 +1493,7 @@ def output_schema() -> JSON:
1336
1493
  "required": ["title", "format", "sha256", "source_path"],
1337
1494
  "properties": {
1338
1495
  "title": {"type": ["string", "null"]},
1339
- "format": {"enum": ["markdown", "text", "docx", "pdf"]},
1496
+ "format": {"enum": ["markdown", "text", "docx", "pdf", "html"]},
1340
1497
  "sha256": {"type": "string", "pattern": "^[0-9a-f]{64}$"},
1341
1498
  "source_path": {"type": ["string", "null"]},
1342
1499
  },
@@ -1687,7 +1844,7 @@ def _add_common_output_flags(p: argparse.ArgumentParser) -> None:
1687
1844
  def build_parser() -> argparse.ArgumentParser:
1688
1845
  parser = argparse.ArgumentParser(
1689
1846
  prog="extract",
1690
- description="Ingest any contract (.md/.txt/.docx/.pdf) and emit structured "
1847
+ description="Ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured "
1691
1848
  "JSON for the contract-ops CLI suite. See docs/INTEROP.md.",
1692
1849
  )
1693
1850
  parser.add_argument("-V", "--version", action="version",
@@ -1721,7 +1878,7 @@ def build_parser() -> argparse.ArgumentParser:
1721
1878
 
1722
1879
 
1723
1880
  def _build_extract_args(p: argparse.ArgumentParser) -> None:
1724
- p.add_argument("path", help="Path to the document (.md/.txt/.docx/.pdf).")
1881
+ p.add_argument("path", help="Path to the document (.md/.txt/.html/.docx/.pdf).")
1725
1882
  p.add_argument("--llm", action="store_true",
1726
1883
  help="Opt-in LLM enrichment of fuzzy fields (renewal, obligations). "
1727
1884
  "Off by default; the deterministic core is fully useful without it.")
@@ -4,8 +4,8 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "extract-cli"
7
- version = "0.1.1"
8
- description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.docx/.pdf) and emit structured JSON."
7
+ version = "0.1.3"
8
+ description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
11
11
  license = { text = "MIT" }
@@ -20,7 +20,8 @@ from tests._fixtures_build import ensure_binary_fixtures # noqa: E402
20
20
  FIXTURES = Path(__file__).resolve().parent / "fixtures"
21
21
 
22
22
  DOCS = ["nda_h2.md", "services_bold.txt", "lease_allcaps.txt",
23
- "employment_docx.docx", "license_pdf.pdf", "scanned.pdf"]
23
+ "employment_docx.docx", "license_pdf.pdf", "services_html.html",
24
+ "scanned.pdf"]
24
25
 
25
26
 
26
27
  def golden_for(name: str) -> dict:
@@ -26,6 +26,7 @@ CORPUS: Tuple[Tuple[str, str, str], ...] = (
26
26
  ("lease_allcaps.txt", "all-caps", "text"),
27
27
  ("employment_docx.docx", "bold-numbered", "docx"),
28
28
  ("license_pdf.pdf", "all-caps", "pdf"),
29
+ ("services_html.html", "numbered", "html"),
29
30
  )
30
31
 
31
32
 
@@ -138,7 +138,7 @@
138
138
  "source": "deterministic"
139
139
  },
140
140
  "_meta": {
141
- "extractor_version": "0.1.1",
141
+ "extractor_version": "0.1.3",
142
142
  "tiers_used": [
143
143
  "deterministic"
144
144
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.1",
136
+ "extractor_version": "0.1.3",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.1",
136
+ "extractor_version": "0.1.3",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -121,6 +121,11 @@
121
121
  "confidence": 0.6,
122
122
  "source": "deterministic"
123
123
  },
124
+ {
125
+ "term": "Disclosing Party",
126
+ "confidence": 0.6,
127
+ "source": "deterministic"
128
+ },
124
129
  {
125
130
  "term": "Receiving Party",
126
131
  "confidence": 0.6,
@@ -138,7 +143,7 @@
138
143
  "source": "none"
139
144
  },
140
145
  "_meta": {
141
- "extractor_version": "0.1.1",
146
+ "extractor_version": "0.1.3",
142
147
  "tiers_used": [
143
148
  "deterministic"
144
149
  ],
@@ -48,7 +48,7 @@
48
48
  "source": "none"
49
49
  },
50
50
  "_meta": {
51
- "extractor_version": "0.1.1",
51
+ "extractor_version": "0.1.3",
52
52
  "tiers_used": [
53
53
  "deterministic"
54
54
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.1",
136
+ "extractor_version": "0.1.3",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -0,0 +1,35 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <title>Exhibit 10.1</title>
5
+ <style>body { font-family: serif; } .hidden { display:none; }</style>
6
+ <script>var x = "(this should never appear in output)";</script>
7
+ </head>
8
+ <body>
9
+ <p align="center"><b>MASTER SERVICES AGREEMENT</b></p>
10
+
11
+ <p>This Master Services Agreement (the &ldquo;Agreement&rdquo;) is entered
12
+ into as of March 15, 2023 (the &quot;Effective Date&quot;), by and between
13
+ Initrode&nbsp;Systems,&nbsp;Inc., a Delaware corporation (&ldquo;Provider&rdquo;),
14
+ and Hooli&nbsp;LLC (&ldquo;Customer&rdquo;).</p>
15
+
16
+ <p>1. Services</p>
17
+ <p>Provider shall perform the services described in each Statement of Work.</p>
18
+
19
+ <p>2. Fees and Payment</p>
20
+ <p>Customer shall pay Provider the fees set forth in the applicable Statement
21
+ of Work, not to exceed $500,000 in the aggregate.</p>
22
+
23
+ <p>3. Term and Termination</p>
24
+ <p>The initial term of this Agreement is two (2) years. Either party may
25
+ terminate upon sixty (60) days&rsquo; written notice. This Agreement shall
26
+ automatically renew for successive one-year terms.</p>
27
+
28
+ <p>4. Confidentiality</p>
29
+ <p>Each party shall protect the other&rsquo;s &ldquo;Confidential
30
+ Information&rdquo; using reasonable care.</p>
31
+
32
+ <p>5. Governing Law</p>
33
+ <p>This Agreement shall be governed by the laws of the State of California.</p>
34
+ </body>
35
+ </html>
@@ -0,0 +1,157 @@
1
+ {
2
+ "document": {
3
+ "title": "MASTER SERVICES AGREEMENT",
4
+ "format": "html",
5
+ "sha256": "088b40f13135e6b5d8f8548b162d657f10725d348388c7c3a416d11d7fc65300",
6
+ "source_path": "services_html.html"
7
+ },
8
+ "parties": [
9
+ {
10
+ "name": "Initrode Systems, Inc.",
11
+ "confidence": 0.9,
12
+ "source": "deterministic",
13
+ "role": "Provider"
14
+ },
15
+ {
16
+ "name": "Hooli LLC",
17
+ "confidence": 0.9,
18
+ "source": "deterministic",
19
+ "role": "Customer"
20
+ }
21
+ ],
22
+ "dates": {
23
+ "effective": {
24
+ "value": "2023-03-15",
25
+ "confidence": 0.9,
26
+ "source": "deterministic"
27
+ },
28
+ "expiration": {
29
+ "value": null,
30
+ "confidence": 0.0,
31
+ "source": "none"
32
+ }
33
+ },
34
+ "term": {
35
+ "length": {
36
+ "value": "2 years",
37
+ "confidence": 0.7,
38
+ "source": "deterministic"
39
+ },
40
+ "auto_renew": {
41
+ "value": true,
42
+ "confidence": 0.65,
43
+ "source": "deterministic"
44
+ },
45
+ "notice_period_days": {
46
+ "value": 60,
47
+ "confidence": 0.7,
48
+ "source": "deterministic"
49
+ }
50
+ },
51
+ "governing_law": {
52
+ "value": "State of California",
53
+ "confidence": 0.85,
54
+ "source": "deterministic"
55
+ },
56
+ "clauses": [
57
+ {
58
+ "canonical_title": "Services",
59
+ "detected_title": "1. Services",
60
+ "tier": "numbered",
61
+ "span": {
62
+ "start": 242,
63
+ "end": 329
64
+ },
65
+ "confidence": 0.6,
66
+ "source": "deterministic",
67
+ "mapped": false
68
+ },
69
+ {
70
+ "canonical_title": "Payment",
71
+ "detected_title": "2. Fees and Payment",
72
+ "tier": "numbered",
73
+ "span": {
74
+ "start": 329,
75
+ "end": 476
76
+ },
77
+ "confidence": 0.8,
78
+ "source": "deterministic",
79
+ "mapped": true
80
+ },
81
+ {
82
+ "canonical_title": "Termination",
83
+ "detected_title": "3. Term and Termination",
84
+ "tier": "numbered",
85
+ "span": {
86
+ "start": 476,
87
+ "end": 692
88
+ },
89
+ "confidence": 0.8,
90
+ "source": "deterministic",
91
+ "mapped": true
92
+ },
93
+ {
94
+ "canonical_title": "Confidentiality",
95
+ "detected_title": "4. Confidentiality",
96
+ "tier": "numbered",
97
+ "span": {
98
+ "start": 692,
99
+ "end": 800
100
+ },
101
+ "confidence": 0.8,
102
+ "source": "deterministic",
103
+ "mapped": true
104
+ },
105
+ {
106
+ "canonical_title": "Governing Law",
107
+ "detected_title": "5. Governing Law",
108
+ "tier": "numbered",
109
+ "span": {
110
+ "start": 800,
111
+ "end": 890
112
+ },
113
+ "confidence": 0.8,
114
+ "source": "deterministic",
115
+ "mapped": true
116
+ }
117
+ ],
118
+ "defined_terms": [
119
+ {
120
+ "term": "Agreement",
121
+ "confidence": 0.6,
122
+ "source": "deterministic"
123
+ },
124
+ {
125
+ "term": "Effective Date",
126
+ "confidence": 0.6,
127
+ "source": "deterministic"
128
+ },
129
+ {
130
+ "term": "Provider",
131
+ "confidence": 0.6,
132
+ "source": "deterministic"
133
+ },
134
+ {
135
+ "term": "Customer",
136
+ "confidence": 0.6,
137
+ "source": "deterministic"
138
+ },
139
+ {
140
+ "term": "Confidential Information",
141
+ "confidence": 0.6,
142
+ "source": "deterministic"
143
+ }
144
+ ],
145
+ "value": {
146
+ "value": "$500,000",
147
+ "confidence": 0.6,
148
+ "source": "deterministic"
149
+ },
150
+ "_meta": {
151
+ "extractor_version": "0.1.3",
152
+ "tiers_used": [
153
+ "deterministic"
154
+ ],
155
+ "llm_used": false
156
+ }
157
+ }
@@ -69,6 +69,31 @@ def test_trailing_period_stripped_from_titles() -> None:
69
69
  assert ex._canonicalize_clause("Survival.") == ("Survival", True)
70
70
 
71
71
 
72
+ def test_repeated_heading_treated_as_boilerplate() -> None:
73
+ # A "heading" that repeats 3+ times is a running header/footer, not clauses.
74
+ body = "\n\n".join("## Ks 99-2\n\nfoo" for _ in range(4))
75
+ text = "## Confidentiality\n\nreal body\n\n" + body
76
+ clauses = ex.extract_clauses(text)
77
+ titles = [c["canonical_title"] for c in clauses]
78
+ assert "Confidentiality" in titles
79
+ assert not any("Ks" in (t or "") for t in titles)
80
+
81
+
82
+ def test_noise_clause_titles_filtered() -> None:
83
+ assert ex._is_noise_clause_title("Ks 112708-2") # 4+ digit code
84
+ assert ex._is_noise_clause_title("Table of Contents")
85
+ assert ex._is_noise_clause_title("Exhibit B")
86
+ assert ex._is_noise_clause_title("Schedule 2.1")
87
+ assert not ex._is_noise_clause_title("Confidentiality")
88
+ assert not ex._is_noise_clause_title("Term and Termination")
89
+
90
+
91
+ def test_party_cuts_together_as_agent_and_unclosed_paren() -> None:
92
+ assert ex._clean_party_name("Foo LLC, together with its affiliates") == "Foo LLC"
93
+ assert ex._clean_party_name("GE Capital Corporation, as administrative agent") == "GE Capital Corporation"
94
+ assert ex._clean_party_name("Glenn Rufrano (each of them being") == "Glenn Rufrano"
95
+
96
+
72
97
  def test_cascade_priority_h2_wins() -> None:
73
98
  # An H2 present means the bold/all-caps fallbacks must not fire.
74
99
  text = "## Real Heading\n\n**1. Not A Heading**\n\nALSO NOT A HEADING\n\nbody"
@@ -12,8 +12,8 @@ def test_parties_between_simple() -> None:
12
12
  assert all(0.0 <= p["confidence"] <= 1.0 for p in parties)
13
13
 
14
14
 
15
- def test_parties_with_roles_and_linebreak() -> None:
16
- text = ('by and between Acme Corp. (the "Disclosing\nParty") and '
15
+ def test_parties_with_roles() -> None:
16
+ text = ('by and between Acme Corp. (the "Disclosing Party") and '
17
17
  'Beta LLC (the "Receiving Party"), dated March 1, 2024.')
18
18
  parties = ex.extract_parties(text)
19
19
  assert parties[0]["name"] == "Acme Corp."
@@ -22,6 +22,30 @@ def test_parties_with_roles_and_linebreak() -> None:
22
22
  assert parties[1]["role"] == "Receiving Party"
23
23
 
24
24
 
25
+ def test_parties_linebreak_handled_by_build() -> None:
26
+ # build_extraction flattens whitespace, so a party/role that wraps across a
27
+ # line is matched whole.
28
+ text = ('This Agreement is made by and between Acme Corp. (the "Disclosing\n'
29
+ 'Party") and Beta LLC (the "Receiving Party").')
30
+ r = ex.build_extraction(text, text.encode("utf-8"), "text", "x.txt")
31
+ assert [p["name"] for p in r["parties"]] == ["Acme Corp.", "Beta LLC"]
32
+ assert r["parties"][0]["role"] == "Disclosing Party"
33
+
34
+
35
+ def test_parties_skip_and_inside_description() -> None:
36
+ # An "and" inside a party's own description must not split the parties.
37
+ text = ("between Blade Ventures Inc., a Nevada corporation having offices at "
38
+ "1 Main St and doing business as Foo (\"Client\"), and KPMG LP")
39
+ parties = ex.extract_parties(text)
40
+ assert [p["name"] for p in parties] == ["Blade Ventures Inc.", "KPMG LP"]
41
+
42
+
43
+ def test_party_name_descriptors_trimmed() -> None:
44
+ assert ex._clean_party_name("Visteon Corporation, a Delaware corporation") == "Visteon Corporation"
45
+ assert ex._clean_party_name("Foo Inc. doing business as Bar") == "Foo Inc."
46
+ assert ex._clean_party_name("Baz LLC having its principal office at X") == "Baz LLC"
47
+
48
+
25
49
  def test_parties_none() -> None:
26
50
  assert ex.extract_parties("There are no parties named here.") == []
27
51
 
@@ -80,6 +104,15 @@ def test_governing_law_stops_before_trailing_clause() -> None:
80
104
  assert out["value"] == "State of Delaware"
81
105
 
82
106
 
107
+ def test_governing_law_linebreak_handled_by_build() -> None:
108
+ # A jurisdiction that wraps a line ("...the Province\nof Ontario") is
109
+ # matched whole because build_extraction flattens whitespace first.
110
+ text = ("This Agreement shall be governed by the laws of the Province\n"
111
+ "of Ontario and the federal laws of Canada.")
112
+ r = ex.build_extraction(text, text.encode("utf-8"), "text", "x.txt")
113
+ assert r["governing_law"]["value"] == "Province of Ontario"
114
+
115
+
83
116
  def test_governing_law_missing() -> None:
84
117
  assert ex.extract_governing_law("nothing about law")["source"] == "none"
85
118
 
@@ -142,6 +142,32 @@ def test_pdf_unescape() -> None:
142
142
  assert ex._pdf_unescape(r"\101\102") == "AB" # octal escapes
143
143
 
144
144
 
145
+ def test_html_extraction() -> None:
146
+ raw, text, fmt, _w = ex.load_source(FIXTURES / "services_html.html")
147
+ assert fmt == "html"
148
+ # script/style content is dropped; entities are unescaped.
149
+ assert "this should never appear" not in text
150
+ result = ex.build_extraction(text, raw, fmt, "services_html.html")
151
+ assert result["document"]["format"] == "html"
152
+ assert [p["name"] for p in result["parties"]] == ["Initrode Systems, Inc.", "Hooli LLC"]
153
+ assert result["governing_law"]["value"] == "State of California"
154
+ assert result["dates"]["effective"]["value"] == "2023-03-15"
155
+ canon = {c["canonical_title"] for c in result["clauses"]}
156
+ assert {"Payment", "Termination", "Confidentiality", "Governing Law"} <= canon
157
+
158
+
159
+ def test_html_detected_by_content_sniff(tmp_path: Any) -> None:
160
+ # HTML masquerading as .txt (e.g. a SEC EDGAR full submission) is sniffed.
161
+ p = tmp_path / "exhibit.txt"
162
+ p.write_text("<html><body><p>between A Co and B Co</p></body></html>")
163
+ _raw, _text, fmt, _w = ex.load_source(p)
164
+ assert fmt == "html"
165
+
166
+
167
+ def test_html_malformed_does_not_crash() -> None:
168
+ assert ex._read_html("<p>unclosed <b>bold <div>text") is not None
169
+
170
+
145
171
  def test_pdf_text_only_inside_bt_et() -> None:
146
172
  # Strings outside BT/ET (font/signature/metadata stream bytes that happen to
147
173
  # contain parentheses) must be ignored; only text objects yield text.
File without changes
File without changes
File without changes
File without changes
File without changes