extract-cli 0.1.0__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. {extract_cli-0.1.0 → extract_cli-0.1.2}/ARCHITECTURE.md +3 -0
  2. {extract_cli-0.1.0 → extract_cli-0.1.2}/CHANGELOG.md +60 -0
  3. {extract_cli-0.1.0 → extract_cli-0.1.2}/PKG-INFO +8 -7
  4. {extract_cli-0.1.0 → extract_cli-0.1.2}/README.md +6 -5
  5. {extract_cli-0.1.0 → extract_cli-0.1.2}/docs/spec/extract-output.schema.json +3 -1
  6. {extract_cli-0.1.0 → extract_cli-0.1.2}/extract_cli.py +260 -54
  7. {extract_cli-0.1.0 → extract_cli-0.1.2}/pyproject.toml +2 -2
  8. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/_make_goldens.py +2 -1
  9. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/conftest.py +1 -0
  10. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/employment_docx.docx.expected.json +1 -1
  11. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
  12. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
  13. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/nda_h2.md.expected.json +6 -1
  14. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/scanned.pdf.expected.json +1 -1
  15. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/services_bold.txt.expected.json +1 -1
  16. extract_cli-0.1.2/tests/fixtures/services_html.html +35 -0
  17. extract_cli-0.1.2/tests/fixtures/services_html.html.expected.json +157 -0
  18. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/test_clause_map.py +44 -0
  19. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/test_cli.py +1 -1
  20. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/test_deterministic.py +54 -2
  21. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/test_misc.py +39 -0
  22. {extract_cli-0.1.0 → extract_cli-0.1.2}/.gitignore +0 -0
  23. {extract_cli-0.1.0 → extract_cli-0.1.2}/CONTRIBUTING.md +0 -0
  24. {extract_cli-0.1.0 → extract_cli-0.1.2}/LICENSE +0 -0
  25. {extract_cli-0.1.0 → extract_cli-0.1.2}/Makefile +0 -0
  26. {extract_cli-0.1.0 → extract_cli-0.1.2}/config/llm.json.example +0 -0
  27. {extract_cli-0.1.0 → extract_cli-0.1.2}/docs/INTEROP.md +0 -0
  28. {extract_cli-0.1.0 → extract_cli-0.1.2}/scripts/release.py +0 -0
  29. {extract_cli-0.1.0 → extract_cli-0.1.2}/scripts/validate_against_spec.py +0 -0
  30. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/_fixtures_build.py +0 -0
  31. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/_schema_validator.py +0 -0
  32. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/employment_docx.docx +0 -0
  33. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/lease_allcaps.txt +0 -0
  34. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/license_pdf.pdf +0 -0
  35. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/nda_h2.md +0 -0
  36. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/scanned.pdf +0 -0
  37. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/fixtures/services_bold.txt +0 -0
  38. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/test_llm.py +0 -0
  39. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/test_property.py +0 -0
  40. {extract_cli-0.1.0 → extract_cli-0.1.2}/tests/test_schema_conformance.py +0 -0
@@ -8,11 +8,14 @@ map.
8
8
  ```
9
9
  load_source(path) extension/content sniff → reader
10
10
  ├─ .md/.txt → utf-8 decode
11
+ ├─ .html → stdlib html.parser reader (also auto-detected inside .txt)
11
12
  ├─ .docx → python-docx (if [docx]) else stdlib zipfile/XML reader
12
13
  └─ .pdf → pypdf (if [pdf]) else stdlib zlib + text-operator reader
13
14
 
14
15
  ▼ (raw_bytes, text, format, warnings)
15
16
  build_extraction(text, raw, fmt, src) the DETERMINISTIC tier (always on)
17
+ │ field extractors run on a whitespace-FLATTENED copy (so values that wrap
18
+ │ across a line break are matched whole); clause detection keeps the original text
16
19
  ├─ extract_parties "between X and Y", with role parentheticals
17
20
  ├─ extract_dates effective / expiration, ISO-normalized
18
21
  ├─ extract_term length / auto_renew / notice_period_days
@@ -6,6 +6,64 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
6
6
  (see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
7
7
  the output schema require a major version bump**; new optional fields are minor.
8
8
 
9
+ ## [0.1.2] - 2026-05-21
10
+
11
+ More real-world hardening, driven by testing against five additional contracts
12
+ (SEC EDGAR consulting/MSA, lease, and Visteon services agreements; Common Paper
13
+ and Perigon Cloud Service Agreements).
14
+
15
+ ### Added
16
+ - **HTML input** (`.html`/`.htm`, and HTML auto-detected inside `.txt` such as
17
+ SEC EDGAR full submissions). Stdlib `html.parser`-based reader strips
18
+ script/style, frames block elements so heading detection still works, and
19
+ unescapes entities. `document.format` enum gains `html` (backward-compatible
20
+ widening). This turns the large class of HTML contracts (SEC exhibits, web
21
+ ToS) from garbage into structured output.
22
+
23
+ ### Fixed
24
+ - **Field extraction now runs on whitespace-flattened text**, so values that
25
+ wrap across a line break are matched whole — e.g. governing law
26
+ `the laws of the Province\nof Ontario` now yields `Province of Ontario`, and
27
+ line-wrapped party names/defined terms are captured.
28
+ - **Party extraction** (continues issue #2): names are trimmed of trailing
29
+ descriptors (`, a Delaware corporation`, `doing business as …`,
30
+ `having its offices at …`, `as of …`), and each party must begin with a
31
+ capital so an `and` *inside* a party's own description no longer splits the
32
+ parties (`…V6E 3S7 and doing business as …` → real parties recovered).
33
+
34
+ ### Known limitations (documented, not bugs)
35
+ - The stdlib PDF reader cannot decode PDFs that use embedded subset fonts with
36
+ hex-encoded glyph strings (common in professionally-typeset PDFs); these
37
+ degrade gracefully to a low-signal warning. Install the `[pdf]` extra (pypdf)
38
+ for them — verified to recover full text and clause structure.
39
+ - Two-line `ARTICLE N` / title headings (number on one line, title on the next)
40
+ are not yet detected.
41
+
42
+ ## [0.1.1] - 2026-05-21
43
+
44
+ Real-world hardening, driven by testing against a SEC EDGAR employment
45
+ agreement and the Common Paper Mutual NDA (PDF/DOCX).
46
+
47
+ ### Added
48
+ - **`numbered` clause-detection tier** for plain numbered headings
49
+ (`1. Termination`, `Section 3. Payment`, `Article IV. …`) — the dominant
50
+ format in foreign paper, missed by the H2/bold/ALL-CAPS tiers. A title-case
51
+ heuristic rejects numbered sentences and list items. The output schema's
52
+ clause `tier` enum gains `numbered` (a backward-compatible widening).
53
+
54
+ ### Fixed
55
+ - **PDF reader** now extracts text only from inside `BT … ET` text objects, so
56
+ embedded fonts, digital-signature blobs, and metadata streams no longer leak
57
+ binary noise (a real signed PDF dropped from ~188 KB of garbage to ~8.7 KB of
58
+ clean text). Added a printable-ratio backstop.
59
+ - **Effective date**: anchor on `(the "Effective Date")` and a bare
60
+ `as of <date>` cue; handle dates that wrap across a line break.
61
+ - **Term length**: require a real number, dropping false positives such as
62
+ `…consecutive days`.
63
+ - **Title**: skip SGML/XML wrapper lines (e.g. SEC EDGAR `<DOCUMENT>` headers).
64
+ - Strip trailing punctuation from clause titles (`Other Benefits.` →
65
+ `Other Benefits`).
66
+
9
67
  ## [0.1.0] - 2026-05-21
10
68
 
11
69
  Initial release — the open-loop front door of the contract-ops CLI suite.
@@ -57,4 +115,6 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
57
115
  intentionally *not* governed by the output schema (the schema describes the
58
116
  full default output).
59
117
 
118
+ [0.1.2]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.2
119
+ [0.1.1]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.1
60
120
  [0.1.0]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-cli
3
- Version: 0.1.0
4
- Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.docx/.pdf) and emit structured JSON.
3
+ Version: 0.1.2
4
+ Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
5
5
  Project-URL: Homepage, https://cli.drbaher.com/
6
6
  Project-URL: Repository, https://github.com/DrBaher/extract-cli
7
7
  Project-URL: Suite interop, https://github.com/DrBaher/extract-cli/blob/main/docs/INTEROP.md
@@ -63,8 +63,8 @@ ingest (extract) → review → diff → convert → sign
63
63
 
64
64
  ## What it does
65
65
 
66
- Give it a contract in **`.md` / `.txt`** (native), **`.docx`**, or **`.pdf`**,
67
- and it returns structured JSON: the parties, dates, term, governing law, a
66
+ Give it a contract in **`.md` / `.txt` / `.html`** (native), **`.docx`**, or
67
+ **`.pdf`**, and it returns structured JSON: the parties, dates, term, governing law, a
68
68
  **clause map** normalized onto the suite's canonical clause vocabulary, a
69
69
  defined-term inventory, and a headline value. Every field carries a
70
70
  `confidence` and a `source` so downstream tools **verify, don't trust**.
@@ -75,14 +75,15 @@ daemon, no network in the default path.
75
75
  ## Install
76
76
 
77
77
  ```bash
78
- pip install extract-cli # core: .md/.txt + best-effort .docx/.pdf
78
+ pip install extract-cli # core: .md/.txt/.html + best-effort .docx/.pdf
79
79
  pip install "extract-cli[docx]" # higher-fidelity .docx (python-docx)
80
80
  pip install "extract-cli[pdf]" # higher-fidelity .pdf (pypdf)
81
81
  pip install "extract-cli[docx,pdf]" # both
82
82
  ```
83
83
 
84
- The core has **zero runtime dependencies** and is fully functional on `.md`/`.txt`
85
- with no extras. `.docx` and `.pdf` work out of the box via stdlib readers; the
84
+ The core has **zero runtime dependencies** and is fully functional on
85
+ `.md`/`.txt`/`.html` with no extras (HTML is also auto-detected when it hides
86
+ inside a `.txt`, e.g. SEC EDGAR filings). `.docx` and `.pdf` work out of the box via stdlib readers; the
86
87
  `[docx]`/`[pdf]` extras improve fidelity on complex documents (see
87
88
  [ARCHITECTURE.md](ARCHITECTURE.md)).
88
89
 
@@ -25,8 +25,8 @@ ingest (extract) → review → diff → convert → sign
25
25
 
26
26
  ## What it does
27
27
 
28
- Give it a contract in **`.md` / `.txt`** (native), **`.docx`**, or **`.pdf`**,
29
- and it returns structured JSON: the parties, dates, term, governing law, a
28
+ Give it a contract in **`.md` / `.txt` / `.html`** (native), **`.docx`**, or
29
+ **`.pdf`**, and it returns structured JSON: the parties, dates, term, governing law, a
30
30
  **clause map** normalized onto the suite's canonical clause vocabulary, a
31
31
  defined-term inventory, and a headline value. Every field carries a
32
32
  `confidence` and a `source` so downstream tools **verify, don't trust**.
@@ -37,14 +37,15 @@ daemon, no network in the default path.
37
37
  ## Install
38
38
 
39
39
  ```bash
40
- pip install extract-cli # core: .md/.txt + best-effort .docx/.pdf
40
+ pip install extract-cli # core: .md/.txt/.html + best-effort .docx/.pdf
41
41
  pip install "extract-cli[docx]" # higher-fidelity .docx (python-docx)
42
42
  pip install "extract-cli[pdf]" # higher-fidelity .pdf (pypdf)
43
43
  pip install "extract-cli[docx,pdf]" # both
44
44
  ```
45
45
 
46
- The core has **zero runtime dependencies** and is fully functional on `.md`/`.txt`
47
- with no extras. `.docx` and `.pdf` work out of the box via stdlib readers; the
46
+ The core has **zero runtime dependencies** and is fully functional on
47
+ `.md`/`.txt`/`.html` with no extras (HTML is also auto-detected when it hides
48
+ inside a `.txt`, e.g. SEC EDGAR filings). `.docx` and `.pdf` work out of the box via stdlib readers; the
48
49
  `[docx]`/`[pdf]` extras improve fidelity on complex documents (see
49
50
  [ARCHITECTURE.md](ARCHITECTURE.md)).
50
51
 
@@ -69,7 +69,8 @@
69
69
  "markdown",
70
70
  "text",
71
71
  "docx",
72
- "pdf"
72
+ "pdf",
73
+ "html"
73
74
  ]
74
75
  },
75
76
  "sha256": {
@@ -183,6 +184,7 @@
183
184
  "enum": [
184
185
  "h2",
185
186
  "bold-numbered",
187
+ "numbered",
186
188
  "all-caps",
187
189
  "explicit",
188
190
  "llm"
@@ -4,8 +4,8 @@
4
4
  The suite is a contract lifecycle (store -> draft -> review -> diff -> convert
5
5
  -> sign) that, until now, only handled documents it authored from its own
6
6
  templates. `extract-cli` is "passport control": it ingests ANY document --
7
- yours or a counterparty's foreign paper -- in .md/.txt (natively), .docx, or
8
- .pdf, and emits a structured JSON representation that the rest of the suite
7
+ yours or a counterparty's foreign paper -- in .md/.txt/.html (natively), .docx,
8
+ or .pdf, and emits a structured JSON representation that the rest of the suite
9
9
  (nda-review-cli, compare-cli, contract-vault) consumes.
10
10
 
11
11
  Two extraction tiers:
@@ -32,6 +32,7 @@ from __future__ import annotations
32
32
  import argparse
33
33
  import datetime as _dt
34
34
  import hashlib
35
+ import html.parser
35
36
  import importlib.util
36
37
  import json
37
38
  import os
@@ -42,11 +43,11 @@ import urllib.request
42
43
  from pathlib import Path
43
44
  from typing import Any, Dict, List, Optional, Tuple
44
45
 
45
- __version__ = "0.1.0"
46
+ __version__ = "0.1.2"
46
47
 
47
48
  # Bumped independently of the package version when the *extraction logic*
48
49
  # changes in a way downstream consumers should notice. Embedded in `_meta`.
49
- EXTRACTOR_VERSION = "0.1.0"
50
+ EXTRACTOR_VERSION = "0.1.2"
50
51
 
51
52
  # JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
52
53
  SCHEMA_VERSION = 1
@@ -214,6 +215,49 @@ def _qualifies_as_all_caps_heading(title: str) -> bool:
214
215
  return sum(1 for ch in title if "A" <= ch <= "Z") >= 4
215
216
 
216
217
 
218
+ # Tier between bold-numbered and ALL-CAPS: plain numbered headings on their own
219
+ # line -- "1. Termination", "5. Wage Compensation", "Section 3. Payment",
220
+ # "Article IV. Confidentiality". These are the dominant real-world format in
221
+ # foreign paper (and aren't caught by H2, **bold**, or ALL-CAPS). A title-case
222
+ # heuristic distinguishes a heading from a numbered *sentence* or list item.
223
+ _NUMBERED_HEADING_RE = re.compile(
224
+ r"^[ \t]*"
225
+ r"(?:(?:Article|Section|ARTICLE|SECTION)[ \t]+)?"
226
+ r"(?:" + _ROMAN_RE + r"|\d{1,2})\.?"
227
+ r"[ \t]+"
228
+ r"([A-Z][A-Za-z][^\n]{0,58})"
229
+ r"[ \t]*$",
230
+ re.MULTILINE,
231
+ )
232
+
233
+ # Lowercase words allowed inside an otherwise Title-Cased heading.
234
+ _HEADING_STOPWORDS = {
235
+ "a", "an", "the", "and", "or", "of", "to", "for", "in", "on", "with",
236
+ "by", "at", "as", "per", "from", "into", "nor", "but",
237
+ }
238
+
239
+
240
+ def _qualifies_as_numbered_heading(title: str) -> bool:
241
+ """A numbered line qualifies as a heading only if its title looks like a
242
+ heading: 1-9 words, Title-Cased (every word starts uppercase or is a short
243
+ lowercase connector), no sentence-y lowercase verbs. A single word must be
244
+ >= 4 letters. Rejects 'The parties agree as follows' but accepts 'Wage
245
+ Compensation' and 'Term And Nature Of Employment'."""
246
+ t = title.strip().rstrip(".").strip()
247
+ words = t.split()
248
+ if not (1 <= len(words) <= 9):
249
+ return False
250
+ if len(words) == 1:
251
+ return sum(1 for ch in words[0] if ch.isalpha()) >= 4 and words[0][:1].isupper()
252
+ for w in words:
253
+ if w[:1].isupper() or not w[:1].isalpha():
254
+ continue # capitalized word, or punctuation/number token
255
+ if w.lower() in _HEADING_STOPWORDS:
256
+ continue # allowed connector
257
+ return False # a lowercase content word => this is a sentence, not a heading
258
+ return True
259
+
260
+
217
261
  def detect_clauses(text: str) -> List[JSON]:
218
262
  """Run the three-tier cascade and return clauses with their detection tier.
219
263
 
@@ -227,6 +271,12 @@ def detect_clauses(text: str) -> List[JSON]:
227
271
  bold = list(_BOLD_HEADING_RE.finditer(text))
228
272
  if len(bold) >= 2:
229
273
  return _matches_to_clauses(text, bold, group=1, tier="bold-numbered")
274
+ numbered = [
275
+ m for m in _NUMBERED_HEADING_RE.finditer(text)
276
+ if _qualifies_as_numbered_heading(m.group(1))
277
+ ]
278
+ if len(numbered) >= 2:
279
+ return _matches_to_clauses(text, numbered, group=1, tier="numbered")
230
280
  caps = [
231
281
  m for m in _ALL_CAPS_HEADING_RE.finditer(text)
232
282
  if _qualifies_as_all_caps_heading(m.group(1))
@@ -266,8 +316,9 @@ def _matches_to_clauses(text: str, matches: List["re.Match[str]"], group: int,
266
316
 
267
317
 
268
318
  def _norm_clause_key(s: str) -> str:
269
- """Normalize a clause title/alias for matching (number-stripped, lowercased)."""
270
- return _strip_clause_number(s).strip().lower()
319
+ """Normalize a clause title/alias for matching (number-stripped, trailing
320
+ punctuation removed, lowercased)."""
321
+ return _strip_clause_number(s).strip().lower().rstrip(" .:;,")
271
322
 
272
323
 
273
324
  # ---------------------------------------------------------------------------
@@ -366,7 +417,7 @@ def _canonicalize_clause(detected_title: str) -> Tuple[Optional[str], bool]:
366
417
  best, best_len = canonical, len(alias_key)
367
418
  if best is not None:
368
419
  return best, True
369
- return _titlecase(detected_title), False
420
+ return _titlecase(detected_title.strip().rstrip(" .:;,")), False
370
421
 
371
422
 
372
423
  # ---------------------------------------------------------------------------
@@ -421,11 +472,17 @@ _DATE_PAT = (
421
472
  )
422
473
  _DATE_RE = re.compile(_DATE_PAT, re.IGNORECASE)
423
474
 
475
+ # Highest-confidence: a date explicitly labeled "(the "Effective Date")".
476
+ _EFFDATE_LABEL_RE = re.compile(
477
+ r"(" + _DATE_PAT + r")\s*\(\s*(?:the\s+)?[\"“]?\s*Effective\s+Date",
478
+ re.IGNORECASE,
479
+ )
424
480
  _EFFECTIVE_RE = re.compile(
425
481
  r"(?:effective(?:\s+date)?(?:\s+(?:as\s+of|date|on))?|"
426
482
  r"dated(?:\s+as\s+of)?|"
427
483
  r"made(?:\s+and\s+entered\s+into)?(?:\s+as\s+of|\s+on)?|"
428
- r"entered\s+into(?:\s+as\s+of|\s+on)?)"
484
+ r"entered\s+into(?:\s+as\s+of|\s+on)?|"
485
+ r"as\s+of)"
429
486
  r"[\s:,]+(?:the\s+)?(" + _DATE_PAT + r")",
430
487
  re.IGNORECASE,
431
488
  )
@@ -436,10 +493,17 @@ _EXPIRE_RE = re.compile(
436
493
  re.IGNORECASE,
437
494
  )
438
495
 
496
+ # Each party must start with a capital letter (optionally after "the"/"The"/"its"), a quote, or
497
+ # a paren. This is case-sensitive on purpose (no global IGNORECASE -- only the
498
+ # keywords are): it lets the engine skip an "and" that sits INSIDE a party's own
499
+ # description ("...V6E 3S7 and doing business as ...", where the right side
500
+ # starts lowercase) and find the real "and" before the second named entity.
501
+ _PARTY_START = r"(?:(?:[Tt]he|its)\s+)?[A-Z\"“(]"
439
502
  _PARTY_BLOCK_RE = re.compile(
440
- r"\b(?:by\s+and\s+between|between)\s+(.{2,200}?)\s+\band\b\s+(.{2,200}?)"
441
- r"(?=[\.;\n]|\bwhereas\b|\beffective\b|\bdated\b|\bhaving\b|\bwith\s+offices\b|$)",
442
- re.IGNORECASE | re.DOTALL,
503
+ r"(?i:\b(?:by\s+and\s+between|between)\s+)"
504
+ r"(" + _PARTY_START + r"[^\n]{1,200}?)\s+and\s+"
505
+ r"(" + _PARTY_START + r"[^\n]{1,200}?)"
506
+ r"(?=[\.;\n]|(?i:\bwhereas\b|\beffective\b|\bdated\b|\bas\s+of\b|\bwitnesseth\b)|$)",
443
507
  )
444
508
  _ROLE_PAREN_RE = re.compile(
445
509
  r"\(\s*(?:the\s+)?[\"“]?([^\"”()]+?)[\"”]?\s*\)"
@@ -534,18 +598,54 @@ def _parse_date_to_iso(s: str) -> Optional[str]:
534
598
  return None
535
599
 
536
600
 
601
def _date_field_from_str(raw: str, base_conf: float) -> JSON:
    """Build a date field from a raw matched date string.

    Whitespace runs in ``raw`` (including a line break inside a wrapped date)
    are collapsed to single spaces before parsing.

    * If ``_parse_date_to_iso`` can normalize the string, the field carries the
      ISO value at ``base_conf``.
    * Otherwise the field carries the cleaned raw string at ``base_conf - 0.3``,
      floored at 0.0.

    The reduced confidence is rounded to two decimals: plain float subtraction
    leaks representation noise into the output (``0.9 - 0.3`` evaluates to
    ``0.6000000000000001``), and clause confidences elsewhere in this file are
    rounded the same way.
    """
    raw = re.sub(r"\s+", " ", raw.strip())
    iso = _parse_date_to_iso(raw)
    if iso is not None:
        return _field(iso, base_conf)
    fallback_conf = round(max(0.0, base_conf - 0.3), 2)
    return _field(raw, fallback_conf)
607
+
608
+
537
609
  def _date_field(match: Optional["re.Match[str]"]) -> JSON:
538
610
  if match is None:
539
611
  return _none_field()
540
- raw = match.group(1).strip()
541
- iso = _parse_date_to_iso(raw)
542
- if iso is not None:
543
- return _field(iso, 0.85)
544
- return _field(raw, 0.55)
612
+ return _date_field_from_str(match.group(1), 0.85)
613
+
614
+
615
+ # Trailing descriptors that follow a party's actual name and should be dropped
616
+ # ("Acme Corp., a Delaware corporation", "... doing business as Foo", "... as of
617
+ # March 1", "... having its offices at ..."). Each is matched and everything from
618
+ # it onward is cut.
619
+ _PARTY_CUT_MARKERS: Tuple[str, ...] = (
620
+ r",\s+an?\s+\w", # ", a Delaware ..." / ", an Ohio ..."
621
+ r"\s+doing\s+business\s+as\b",
622
+ r"\s+d/?b/?a\b",
623
+ r"\s+f/?k/?a\b",
624
+ r"\s+a[n]?\s+\w+\s+(?:corporation|company|partnership|limited)\b",
625
+ r"\s+having\b",
626
+ r"\s+with\s+(?:its\s+)?(?:offices|principal|a\s)\b",
627
+ r"\s+with\s+offices\b",
628
+ r"\s+located\b",
629
+ r"\s+organized\b",
630
+ r"\s+incorporated\b",
631
+ r"\s+whose\b",
632
+ r"\s+(?:as\s+of|dated|effective)\b",
633
+ )
634
+
635
+
636
def _clean_party_name(s: str) -> str:
    """Reduce a captured party string to just the entity name.

    Whitespace is collapsed first. Then each pattern in ``_PARTY_CUT_MARKERS``
    is tried in order (case-insensitively) against the progressively shortened
    name; wherever one matches, the name is cut at the start of the match.
    Stray commas, surrounding whitespace and enclosing straight/curly double
    quotes are removed from the result.
    """

    def _trim(text: str) -> str:
        return text.strip().strip(",").strip()

    name = _trim(re.sub(r"\s+", " ", s))
    for marker in _PARTY_CUT_MARKERS:
        hit = re.search(marker, name, flags=re.IGNORECASE)
        if hit is None:
            continue
        name = _trim(name[: hit.start()])
    return name.strip("\"“”").strip()
545
645
 
546
646
 
547
647
  def _split_name_role(s: str) -> Tuple[str, Optional[str]]:
548
- s = s.strip().strip(",").strip()
648
+ s = re.sub(r"\s+", " ", s).strip().strip(",").strip()
549
649
  role: Optional[str] = None
550
650
  m = _ROLE_PAREN_RE.search(s)
551
651
  if m:
@@ -554,9 +654,7 @@ def _split_name_role(s: str) -> Tuple[str, Optional[str]]:
554
654
  if len(candidate) <= 40 and candidate.lower() not in ("a", "an", "the"):
555
655
  role = candidate
556
656
  s = (s[: m.start()] + s[m.end():]).strip().rstrip(",").strip()
557
- s = s.strip("\"“”").strip()
558
- s = re.sub(r"\s+", " ", s)
559
- return s, role
657
+ return _clean_party_name(s), role
560
658
 
561
659
 
562
660
  def extract_parties(text: str) -> List[JSON]:
@@ -565,9 +663,6 @@ def extract_parties(text: str) -> List[JSON]:
565
663
  return []
566
664
  out: List[JSON] = []
567
665
  for raw in (m.group(1), m.group(2)):
568
- # Party names can wrap across lines ("...(the \"Disclosing\nParty\")");
569
- # collapse whitespace rather than truncating at the first newline.
570
- raw = re.sub(r"\s+", " ", raw).strip()
571
666
  name, role = _split_name_role(raw)
572
667
  if not name or len(name) < 2 or len(name) > 120:
573
668
  continue
@@ -578,10 +673,12 @@ def extract_parties(text: str) -> List[JSON]:
578
673
 
579
674
 
580
675
  def extract_dates(text: str) -> JSON:
581
- return {
582
- "effective": _date_field(_EFFECTIVE_RE.search(text)),
583
- "expiration": _date_field(_EXPIRE_RE.search(text)),
584
- }
676
+ label = _EFFDATE_LABEL_RE.search(text)
677
+ if label is not None:
678
+ effective = _date_field_from_str(label.group(1), 0.9)
679
+ else:
680
+ effective = _date_field(_EFFECTIVE_RE.search(text))
681
+ return {"effective": effective, "expiration": _date_field(_EXPIRE_RE.search(text))}
585
682
 
586
683
 
587
684
  def extract_governing_law(text: str) -> JSON:
@@ -600,10 +697,10 @@ def extract_term(text: str) -> JSON:
600
697
  if m:
601
698
  num = _word_to_int(m.group(1))
602
699
  unit = m.group(2).lower().rstrip("s")
700
+ # Only emit when the captured token is a real number; otherwise the
701
+ # match was a coincidence ("...consecutive days") -> leave as not-found.
603
702
  if num is not None:
604
703
  length = _field(f"{num} {unit}{'s' if num != 1 else ''}", 0.7)
605
- else:
606
- length = _field(f"{m.group(1)} {m.group(2)}".strip(), 0.5)
607
704
 
608
705
  notice = _none_field()
609
706
  nm = _NOTICE_RE.search(text)
@@ -649,7 +746,8 @@ def extract_clauses(text: str) -> List[JSON]:
649
746
  for c in detect_clauses(text):
650
747
  canonical, mapped = _canonicalize_clause(c["title"])
651
748
  tier = c["tier"]
652
- base = {"h2": 0.95, "bold-numbered": 0.85, "all-caps": 0.75, "explicit": 0.95}.get(tier, 0.7)
749
+ base = {"h2": 0.95, "bold-numbered": 0.85, "numbered": 0.8,
750
+ "all-caps": 0.75, "explicit": 0.95}.get(tier, 0.7)
653
751
  conf = round(base * (1.0 if mapped else 0.75), 2)
654
752
  out.append({
655
753
  "canonical_title": canonical,
@@ -669,10 +767,14 @@ def extract_title(text: str, path: Optional[Path], fmt: str) -> Optional[str]:
669
767
  return m.group(1).strip()
670
768
  for line in text.splitlines():
671
769
  ls = line.strip().lstrip("#").strip()
672
- if ls:
673
- if len(ls) <= 90:
674
- return ls
675
- break
770
+ if not ls:
771
+ continue
772
+ # Skip SGML/XML wrapper lines (e.g. SEC EDGAR "<DOCUMENT>", "<TYPE>...").
773
+ if ls.startswith("<"):
774
+ continue
775
+ if len(ls) <= 90:
776
+ return ls
777
+ break
676
778
  if path is not None:
677
779
  return _titlecase(path.stem.replace("_", " ").replace("-", " "))
678
780
  return None
@@ -683,21 +785,91 @@ def extract_title(text: str, path: Optional[Path], fmt: str) -> Optional[str]:
683
785
  # ---------------------------------------------------------------------------
684
786
 
685
787
 
788
+ def _looks_like_html(head: str) -> bool:
789
+ """Heuristic: does this text look like HTML? Catches HTML masquerading as
790
+ .txt (e.g. SEC EDGAR full submissions wrap HTML exhibits in a .txt)."""
791
+ low = head.lower()
792
+ if "<!doctype html" in low or "<html" in low or "<body" in low:
793
+ return True
794
+ return len(re.findall(r"</?(?:p|div|table|tr|td|span|br|h[1-6]|font|b|i)\b", low)) >= 6
795
+
796
+
686
797
  def _detect_format(path: Path, raw: bytes) -> str:
687
798
  ext = path.suffix.lower()
688
- if ext in (".md", ".markdown"):
689
- return "markdown"
690
- if ext == ".txt":
691
- return "text"
799
+ if ext in (".htm", ".html", ".xhtml"):
800
+ return "html"
692
801
  if ext == ".docx":
693
802
  return "docx"
694
803
  if ext == ".pdf":
695
804
  return "pdf"
696
805
  if raw[:4] == b"%PDF":
697
806
  return "pdf"
698
- if raw[:2] == b"PK":
807
+ if raw[:2] == b"PK" and ext not in (".md", ".markdown", ".txt"):
699
808
  return "docx"
700
- return "text"
809
+ base = "markdown" if ext in (".md", ".markdown") else "text"
810
+ # Content sniff: HTML hiding inside a .txt/.md (or extensionless) file.
811
+ if _looks_like_html(raw[:4096].decode("utf-8", "replace")):
812
+ return "html"
813
+ return base
814
+
815
+
816
+ class _HTMLTextExtractor(html.parser.HTMLParser):
817
+ """Stdlib HTML -> text: drops script/style, frames block elements with blank
818
+ lines (so clause-heading detection still works), and unescapes entities."""
819
+
820
+ _SKIP = {"script", "style", "head", "title", "meta", "link", "noscript"}
821
+ _BLOCK = {
822
+ "p", "div", "br", "tr", "li", "h1", "h2", "h3", "h4", "h5", "h6",
823
+ "section", "article", "table", "ul", "ol", "blockquote", "pre", "hr",
824
+ "thead", "tbody", "header", "footer", "main",
825
+ }
826
+
827
+ def __init__(self) -> None:
828
+ super().__init__(convert_charrefs=True)
829
+ self._parts: List[str] = []
830
+ self._skip = 0
831
+
832
+ def handle_starttag(self, tag: str, attrs: Any) -> None:
833
+ if tag in self._SKIP:
834
+ self._skip += 1
835
+ elif tag in self._BLOCK:
836
+ self._parts.append("\n")
837
+
838
+ def handle_endtag(self, tag: str) -> None:
839
+ if tag in self._SKIP and self._skip > 0:
840
+ self._skip -= 1
841
+ elif tag in self._BLOCK:
842
+ self._parts.append("\n")
843
+
844
+ def handle_data(self, data: str) -> None:
845
+ if self._skip == 0:
846
+ self._parts.append(data)
847
+
848
+ def get_text(self) -> str:
849
+ # Strip each line; collapse runs of blank lines to a single blank line
850
+ # (gives ALL-CAPS / numbered headings their blank-line frame).
851
+ lines = [re.sub(r"[ \t]+", " ", ln).strip() for ln in "".join(self._parts).split("\n")]
852
+ out: List[str] = []
853
+ blank = False
854
+ for ln in lines:
855
+ if ln:
856
+ out.append(ln)
857
+ blank = False
858
+ elif not blank:
859
+ out.append("")
860
+ blank = True
861
+ return "\n".join(out).strip()
862
+
863
+
864
def _read_html(raw_text: str) -> str:
    """Convert HTML markup to plain text using ``_HTMLTextExtractor``.

    Malformed markup must never crash extraction: if the parser raises while
    feeding or closing, the function falls back to a crude regex that replaces
    every ``<...>`` tag with a space and returns that instead.
    """
    extractor = _HTMLTextExtractor()
    try:
        extractor.feed(raw_text)
        extractor.close()
    except Exception:
        # Best-effort fallback: strip anything that looks like a tag.
        return re.sub(r"<[^>]+>", " ", raw_text)
    else:
        return extractor.get_text()
701
873
 
702
874
 
703
875
  def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[str, List[str]]:
@@ -834,9 +1006,15 @@ def _pdf_unescape(s: str) -> str:
834
1006
 
835
1007
 
836
1008
  def _pdf_text_from_content(content: bytes) -> str:
1009
+ """Pull text strings from a PDF content stream, but ONLY from inside text
1010
+ objects (`BT` ... `ET`). Real text lives there; embedded fonts, images,
1011
+ digital-signature blobs and metadata streams have no BT/ET, so gating on it
1012
+ keeps their binary bytes (which often contain stray `(...)` sequences) out
1013
+ of the output -- essential for real signed/font-embedded PDFs."""
837
1014
  s = content.decode("latin-1", "replace")
838
1015
  lines: List[str] = []
839
1016
  cur: List[str] = []
1017
+ in_text = False
840
1018
 
841
1019
  def flush() -> None:
842
1020
  if cur:
@@ -845,17 +1023,34 @@ def _pdf_text_from_content(content: bytes) -> str:
845
1023
 
846
1024
  for m in _PDF_TOKEN_RE.finditer(s):
847
1025
  tok = m.group(0)
848
- if tok.startswith("("):
1026
+ if tok == "BT":
1027
+ flush()
1028
+ in_text = True
1029
+ elif tok == "ET":
1030
+ flush()
1031
+ in_text = False
1032
+ elif not in_text:
1033
+ continue
1034
+ elif tok.startswith("("):
849
1035
  cur.append(_pdf_unescape(tok[1:-1]))
850
1036
  elif tok.startswith("["):
851
1037
  for sm in re.finditer(r"\((?:\\.|[^\\()])*\)", tok):
852
1038
  cur.append(_pdf_unescape(sm.group(0)[1:-1]))
853
- elif tok in ("Td", "TD", "T*", "'", '"', "BT", "ET"):
1039
+ elif tok in ("Td", "TD", "T*", "'", '"'):
854
1040
  flush()
855
1041
  flush()
856
1042
  return "\n".join(lines)
857
1043
 
858
1044
 
1045
+ def _mostly_printable(s: str) -> bool:
1046
+ """True if `s` is overwhelmingly printable text (backstop against a
1047
+ malformed stream slipping binary through the BT/ET gate)."""
1048
+ if not s:
1049
+ return False
1050
+ printable = sum(1 for ch in s if ch in "\n\t" or 32 <= ord(ch) < 127 or ord(ch) > 160)
1051
+ return printable / len(s) >= 0.85
1052
+
1053
+
859
1054
  def _read_pdf_stdlib(raw: bytes) -> str:
860
1055
  import zlib
861
1056
 
@@ -873,9 +1068,11 @@ def _read_pdf_stdlib(raw: bytes) -> str:
873
1068
  content = zlib.decompress(body)
874
1069
  except Exception:
875
1070
  content = body
876
- chunks.append(_pdf_text_from_content(content))
1071
+ piece = _pdf_text_from_content(content)
1072
+ if piece.strip() and _mostly_printable(piece):
1073
+ chunks.append(piece)
877
1074
  idx = e + len(b"endstream")
878
- return "\n".join(c for c in chunks if c.strip())
1075
+ return "\n".join(chunks)
879
1076
 
880
1077
 
881
1078
  def load_source(path: Path, prefer_optional: bool = True) -> Tuple[bytes, str, str, List[str]]:
@@ -894,6 +1091,8 @@ def load_source(path: Path, prefer_optional: bool = True) -> Tuple[bytes, str, s
894
1091
  warnings: List[str] = []
895
1092
  if fmt in ("markdown", "text"):
896
1093
  text = raw.decode("utf-8", "replace")
1094
+ elif fmt == "html":
1095
+ text = _read_html(raw.decode("utf-8", "replace"))
897
1096
  elif fmt == "docx":
898
1097
  text, w = _read_docx(path, raw, prefer_optional)
899
1098
  warnings += w
@@ -919,6 +1118,13 @@ def build_extraction(text: str, raw: bytes, fmt: str,
919
1118
  source_path: Optional[str]) -> JSON:
920
1119
  """Run the deterministic tier and assemble the output contract object."""
921
1120
  sha = hashlib.sha256(raw).hexdigest()
1121
+ # Field extractors (parties, dates, governing law, term, value, defined
1122
+ # terms) run on a whitespace-flattened copy so values that wrap across a
1123
+ # line break in the source -- "...laws of the Province\nof Ontario", a party
1124
+ # name split mid-line -- are matched whole. Clause detection and the title
1125
+ # keep the original text, which depends on line structure.
1126
+ flat = re.sub(r"[ \t\r\f\v]*\n[ \t\r\f\v]*", " ", text)
1127
+ flat = re.sub(r"[ \t]+", " ", flat)
922
1128
  return {
923
1129
  "document": {
924
1130
  "title": extract_title(text, Path(source_path) if source_path else None, fmt),
@@ -926,13 +1132,13 @@ def build_extraction(text: str, raw: bytes, fmt: str,
926
1132
  "sha256": sha,
927
1133
  "source_path": source_path,
928
1134
  },
929
- "parties": extract_parties(text),
930
- "dates": extract_dates(text),
931
- "term": extract_term(text),
932
- "governing_law": extract_governing_law(text),
1135
+ "parties": extract_parties(flat),
1136
+ "dates": extract_dates(flat),
1137
+ "term": extract_term(flat),
1138
+ "governing_law": extract_governing_law(flat),
933
1139
  "clauses": extract_clauses(text),
934
- "defined_terms": extract_defined_terms(text),
935
- "value": extract_value(text),
1140
+ "defined_terms": extract_defined_terms(flat),
1141
+ "value": extract_value(flat),
936
1142
  "_meta": {
937
1143
  "extractor_version": EXTRACTOR_VERSION,
938
1144
  "tiers_used": ["deterministic"],
@@ -1244,7 +1450,7 @@ def output_schema() -> JSON:
1244
1450
  "required": ["title", "format", "sha256", "source_path"],
1245
1451
  "properties": {
1246
1452
  "title": {"type": ["string", "null"]},
1247
- "format": {"enum": ["markdown", "text", "docx", "pdf"]},
1453
+ "format": {"enum": ["markdown", "text", "docx", "pdf", "html"]},
1248
1454
  "sha256": {"type": "string", "pattern": "^[0-9a-f]{64}$"},
1249
1455
  "source_path": {"type": ["string", "null"]},
1250
1456
  },
@@ -1293,7 +1499,7 @@ def output_schema() -> JSON:
1293
1499
  "properties": {
1294
1500
  "canonical_title": {"type": ["string", "null"]},
1295
1501
  "detected_title": {"type": "string"},
1296
- "tier": {"enum": ["h2", "bold-numbered", "all-caps", "explicit", "llm"]},
1502
+ "tier": {"enum": ["h2", "bold-numbered", "numbered", "all-caps", "explicit", "llm"]},
1297
1503
  "span": {
1298
1504
  "type": "object",
1299
1505
  "required": ["start", "end"],
@@ -1595,7 +1801,7 @@ def _add_common_output_flags(p: argparse.ArgumentParser) -> None:
1595
1801
  def build_parser() -> argparse.ArgumentParser:
1596
1802
  parser = argparse.ArgumentParser(
1597
1803
  prog="extract",
1598
- description="Ingest any contract (.md/.txt/.docx/.pdf) and emit structured "
1804
+ description="Ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured "
1599
1805
  "JSON for the contract-ops CLI suite. See docs/INTEROP.md.",
1600
1806
  )
1601
1807
  parser.add_argument("-V", "--version", action="version",
@@ -1629,7 +1835,7 @@ def build_parser() -> argparse.ArgumentParser:
1629
1835
 
1630
1836
 
1631
1837
  def _build_extract_args(p: argparse.ArgumentParser) -> None:
1632
- p.add_argument("path", help="Path to the document (.md/.txt/.docx/.pdf).")
1838
+ p.add_argument("path", help="Path to the document (.md/.txt/.html/.docx/.pdf).")
1633
1839
  p.add_argument("--llm", action="store_true",
1634
1840
  help="Opt-in LLM enrichment of fuzzy fields (renewal, obligations). "
1635
1841
  "Off by default; the deterministic core is fully useful without it.")
@@ -4,8 +4,8 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "extract-cli"
7
- version = "0.1.0"
8
- description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.docx/.pdf) and emit structured JSON."
7
+ version = "0.1.2"
8
+ description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
11
11
  license = { text = "MIT" }
@@ -20,7 +20,8 @@ from tests._fixtures_build import ensure_binary_fixtures # noqa: E402
20
20
  FIXTURES = Path(__file__).resolve().parent / "fixtures"
21
21
 
22
22
  DOCS = ["nda_h2.md", "services_bold.txt", "lease_allcaps.txt",
23
- "employment_docx.docx", "license_pdf.pdf", "scanned.pdf"]
23
+ "employment_docx.docx", "license_pdf.pdf", "services_html.html",
24
+ "scanned.pdf"]
24
25
 
25
26
 
26
27
  def golden_for(name: str) -> dict:
@@ -26,6 +26,7 @@ CORPUS: Tuple[Tuple[str, str, str], ...] = (
26
26
  ("lease_allcaps.txt", "all-caps", "text"),
27
27
  ("employment_docx.docx", "bold-numbered", "docx"),
28
28
  ("license_pdf.pdf", "all-caps", "pdf"),
29
+ ("services_html.html", "numbered", "html"),
29
30
  )
30
31
 
31
32
 
@@ -138,7 +138,7 @@
138
138
  "source": "deterministic"
139
139
  },
140
140
  "_meta": {
141
- "extractor_version": "0.1.0",
141
+ "extractor_version": "0.1.2",
142
142
  "tiers_used": [
143
143
  "deterministic"
144
144
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.0",
136
+ "extractor_version": "0.1.2",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.0",
136
+ "extractor_version": "0.1.2",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -121,6 +121,11 @@
121
121
  "confidence": 0.6,
122
122
  "source": "deterministic"
123
123
  },
124
+ {
125
+ "term": "Disclosing Party",
126
+ "confidence": 0.6,
127
+ "source": "deterministic"
128
+ },
124
129
  {
125
130
  "term": "Receiving Party",
126
131
  "confidence": 0.6,
@@ -138,7 +143,7 @@
138
143
  "source": "none"
139
144
  },
140
145
  "_meta": {
141
- "extractor_version": "0.1.0",
146
+ "extractor_version": "0.1.2",
142
147
  "tiers_used": [
143
148
  "deterministic"
144
149
  ],
@@ -48,7 +48,7 @@
48
48
  "source": "none"
49
49
  },
50
50
  "_meta": {
51
- "extractor_version": "0.1.0",
51
+ "extractor_version": "0.1.2",
52
52
  "tiers_used": [
53
53
  "deterministic"
54
54
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.0",
136
+ "extractor_version": "0.1.2",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -0,0 +1,35 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <title>Exhibit 10.1</title>
5
+ <style>body { font-family: serif; } .hidden { display:none; }</style>
6
+ <script>var x = "(this should never appear in output)";</script>
7
+ </head>
8
+ <body>
9
+ <p align="center"><b>MASTER SERVICES AGREEMENT</b></p>
10
+
11
+ <p>This Master Services Agreement (the &ldquo;Agreement&rdquo;) is entered
12
+ into as of March 15, 2023 (the &quot;Effective Date&quot;), by and between
13
+ Initrode&nbsp;Systems,&nbsp;Inc., a Delaware corporation (&ldquo;Provider&rdquo;),
14
+ and Hooli&nbsp;LLC (&ldquo;Customer&rdquo;).</p>
15
+
16
+ <p>1. Services</p>
17
+ <p>Provider shall perform the services described in each Statement of Work.</p>
18
+
19
+ <p>2. Fees and Payment</p>
20
+ <p>Customer shall pay Provider the fees set forth in the applicable Statement
21
+ of Work, not to exceed $500,000 in the aggregate.</p>
22
+
23
+ <p>3. Term and Termination</p>
24
+ <p>The initial term of this Agreement is two (2) years. Either party may
25
+ terminate upon sixty (60) days&rsquo; written notice. This Agreement shall
26
+ automatically renew for successive one-year terms.</p>
27
+
28
+ <p>4. Confidentiality</p>
29
+ <p>Each party shall protect the other&rsquo;s &ldquo;Confidential
30
+ Information&rdquo; using reasonable care.</p>
31
+
32
+ <p>5. Governing Law</p>
33
+ <p>This Agreement shall be governed by the laws of the State of California.</p>
34
+ </body>
35
+ </html>
@@ -0,0 +1,157 @@
1
+ {
2
+ "document": {
3
+ "title": "MASTER SERVICES AGREEMENT",
4
+ "format": "html",
5
+ "sha256": "088b40f13135e6b5d8f8548b162d657f10725d348388c7c3a416d11d7fc65300",
6
+ "source_path": "services_html.html"
7
+ },
8
+ "parties": [
9
+ {
10
+ "name": "Initrode Systems, Inc.",
11
+ "confidence": 0.9,
12
+ "source": "deterministic",
13
+ "role": "Provider"
14
+ },
15
+ {
16
+ "name": "Hooli LLC",
17
+ "confidence": 0.9,
18
+ "source": "deterministic",
19
+ "role": "Customer"
20
+ }
21
+ ],
22
+ "dates": {
23
+ "effective": {
24
+ "value": "2023-03-15",
25
+ "confidence": 0.9,
26
+ "source": "deterministic"
27
+ },
28
+ "expiration": {
29
+ "value": null,
30
+ "confidence": 0.0,
31
+ "source": "none"
32
+ }
33
+ },
34
+ "term": {
35
+ "length": {
36
+ "value": "2 years",
37
+ "confidence": 0.7,
38
+ "source": "deterministic"
39
+ },
40
+ "auto_renew": {
41
+ "value": true,
42
+ "confidence": 0.65,
43
+ "source": "deterministic"
44
+ },
45
+ "notice_period_days": {
46
+ "value": 60,
47
+ "confidence": 0.7,
48
+ "source": "deterministic"
49
+ }
50
+ },
51
+ "governing_law": {
52
+ "value": "State of California",
53
+ "confidence": 0.85,
54
+ "source": "deterministic"
55
+ },
56
+ "clauses": [
57
+ {
58
+ "canonical_title": "Services",
59
+ "detected_title": "1. Services",
60
+ "tier": "numbered",
61
+ "span": {
62
+ "start": 242,
63
+ "end": 329
64
+ },
65
+ "confidence": 0.6,
66
+ "source": "deterministic",
67
+ "mapped": false
68
+ },
69
+ {
70
+ "canonical_title": "Payment",
71
+ "detected_title": "2. Fees and Payment",
72
+ "tier": "numbered",
73
+ "span": {
74
+ "start": 329,
75
+ "end": 476
76
+ },
77
+ "confidence": 0.8,
78
+ "source": "deterministic",
79
+ "mapped": true
80
+ },
81
+ {
82
+ "canonical_title": "Termination",
83
+ "detected_title": "3. Term and Termination",
84
+ "tier": "numbered",
85
+ "span": {
86
+ "start": 476,
87
+ "end": 692
88
+ },
89
+ "confidence": 0.8,
90
+ "source": "deterministic",
91
+ "mapped": true
92
+ },
93
+ {
94
+ "canonical_title": "Confidentiality",
95
+ "detected_title": "4. Confidentiality",
96
+ "tier": "numbered",
97
+ "span": {
98
+ "start": 692,
99
+ "end": 800
100
+ },
101
+ "confidence": 0.8,
102
+ "source": "deterministic",
103
+ "mapped": true
104
+ },
105
+ {
106
+ "canonical_title": "Governing Law",
107
+ "detected_title": "5. Governing Law",
108
+ "tier": "numbered",
109
+ "span": {
110
+ "start": 800,
111
+ "end": 890
112
+ },
113
+ "confidence": 0.8,
114
+ "source": "deterministic",
115
+ "mapped": true
116
+ }
117
+ ],
118
+ "defined_terms": [
119
+ {
120
+ "term": "Agreement",
121
+ "confidence": 0.6,
122
+ "source": "deterministic"
123
+ },
124
+ {
125
+ "term": "Effective Date",
126
+ "confidence": 0.6,
127
+ "source": "deterministic"
128
+ },
129
+ {
130
+ "term": "Provider",
131
+ "confidence": 0.6,
132
+ "source": "deterministic"
133
+ },
134
+ {
135
+ "term": "Customer",
136
+ "confidence": 0.6,
137
+ "source": "deterministic"
138
+ },
139
+ {
140
+ "term": "Confidential Information",
141
+ "confidence": 0.6,
142
+ "source": "deterministic"
143
+ }
144
+ ],
145
+ "value": {
146
+ "value": "$500,000",
147
+ "confidence": 0.6,
148
+ "source": "deterministic"
149
+ },
150
+ "_meta": {
151
+ "extractor_version": "0.1.2",
152
+ "tiers_used": [
153
+ "deterministic"
154
+ ],
155
+ "llm_used": false
156
+ }
157
+ }
@@ -25,6 +25,50 @@ def test_tier3_all_caps() -> None:
25
25
  assert [c["tier"] for c in clauses] == ["all-caps", "all-caps"]
26
26
 
27
27
 
28
+ def test_tier_numbered_plain_headings() -> None:
29
+ # Real-world dominant format: plain numbered, mixed-case, unbolded headings.
30
+ text = ("1. Term And Nature Of Employment\n\nbody about term\n\n"
31
+ "2. Wage Compensation\n\nbody about wages\n\n"
32
+ "5. Termination\n\nbody about termination")
33
+ clauses = ex.detect_clauses(text)
34
+ assert [c["tier"] for c in clauses] == ["numbered", "numbered", "numbered"]
35
+ assert clauses[0]["title"] == "Term And Nature Of Employment"
36
+ assert clauses[2]["title"] == "Termination"
37
+
38
+
39
+ def test_numbered_heading_rejects_sentences() -> None:
40
+ # "1. The Company shall pay..." is a numbered sentence, not a heading.
41
+ assert ex._qualifies_as_numbered_heading("Wage Compensation")
42
+ assert ex._qualifies_as_numbered_heading("Term And Nature Of Employment")
43
+ assert ex._qualifies_as_numbered_heading("Termination")
44
+ assert not ex._qualifies_as_numbered_heading("The Company shall pay the Employee monthly")
45
+ assert not ex._qualifies_as_numbered_heading("Fee") # single word < 4 letters
46
+ assert not ex._qualifies_as_numbered_heading(
47
+ "EMPLOYEE shall be compensated on the basis of an annual salary")
48
+
49
+
50
+ def test_numbered_section_article_prefixes() -> None:
51
+ text = ("Section 1. Definitions\n\nx\n\nSection 2. Confidentiality\n\ny\n\n"
52
+ "Article IV. Governing Law\n\nz")
53
+ clauses = ex.detect_clauses(text)
54
+ assert all(c["tier"] == "numbered" for c in clauses)
55
+ assert clauses[0]["title"] == "Definitions"
56
+ assert clauses[2]["title"] == "Governing Law"
57
+
58
+
59
+ def test_numbered_does_not_shadow_bold() -> None:
60
+ # Bold-numbered must win over plain-numbered when both could match.
61
+ text = "**1. Purpose**\n\nx\n\n**2. Scope**\n\ny"
62
+ assert all(c["tier"] == "bold-numbered" for c in ex.detect_clauses(text))
63
+
64
+
65
+ def test_trailing_period_stripped_from_titles() -> None:
66
+ canon, mapped = ex._canonicalize_clause("Other Benefits.")
67
+ assert canon == "Other Benefits"
68
+ # And a mapped clause with a trailing period still maps.
69
+ assert ex._canonicalize_clause("Survival.") == ("Survival", True)
70
+
71
+
28
72
  def test_cascade_priority_h2_wins() -> None:
29
73
  # An H2 present means the bold/all-caps fallbacks must not fire.
30
74
  text = "## Real Heading\n\n**1. Not A Heading**\n\nALSO NOT A HEADING\n\nbody"
@@ -22,7 +22,7 @@ def test_version(capsys: pytest.CaptureFixture[str]) -> None:
22
22
  with pytest.raises(SystemExit) as exc:
23
23
  ex.main(["--version"])
24
24
  assert exc.value.code == 0
25
- assert "extract-cli 0.1.0" in capsys.readouterr().out
25
+ assert f"extract-cli {ex.__version__}" in capsys.readouterr().out
26
26
 
27
27
 
28
28
  def test_demo_runs(capsys: pytest.CaptureFixture[str]) -> None:
@@ -12,8 +12,8 @@ def test_parties_between_simple() -> None:
12
12
  assert all(0.0 <= p["confidence"] <= 1.0 for p in parties)
13
13
 
14
14
 
15
- def test_parties_with_roles_and_linebreak() -> None:
16
- text = ('by and between Acme Corp. (the "Disclosing\nParty") and '
15
+ def test_parties_with_roles() -> None:
16
+ text = ('by and between Acme Corp. (the "Disclosing Party") and '
17
17
  'Beta LLC (the "Receiving Party"), dated March 1, 2024.')
18
18
  parties = ex.extract_parties(text)
19
19
  assert parties[0]["name"] == "Acme Corp."
@@ -22,6 +22,30 @@ def test_parties_with_roles_and_linebreak() -> None:
22
22
  assert parties[1]["role"] == "Receiving Party"
23
23
 
24
24
 
25
+ def test_parties_linebreak_handled_by_build() -> None:
26
+ # build_extraction flattens whitespace, so a party/role that wraps across a
27
+ # line is matched whole.
28
+ text = ('This Agreement is made by and between Acme Corp. (the "Disclosing\n'
29
+ 'Party") and Beta LLC (the "Receiving Party").')
30
+ r = ex.build_extraction(text, text.encode("utf-8"), "text", "x.txt")
31
+ assert [p["name"] for p in r["parties"]] == ["Acme Corp.", "Beta LLC"]
32
+ assert r["parties"][0]["role"] == "Disclosing Party"
33
+
34
+
35
+ def test_parties_skip_and_inside_description() -> None:
36
+ # An "and" inside a party's own description must not split the parties.
37
+ text = ("between Blade Ventures Inc., a Nevada corporation having offices at "
38
+ "1 Main St and doing business as Foo (\"Client\"), and KPMG LP")
39
+ parties = ex.extract_parties(text)
40
+ assert [p["name"] for p in parties] == ["Blade Ventures Inc.", "KPMG LP"]
41
+
42
+
43
+ def test_party_name_descriptors_trimmed() -> None:
44
+ assert ex._clean_party_name("Visteon Corporation, a Delaware corporation") == "Visteon Corporation"
45
+ assert ex._clean_party_name("Foo Inc. doing business as Bar") == "Foo Inc."
46
+ assert ex._clean_party_name("Baz LLC having its principal office at X") == "Baz LLC"
47
+
48
+
25
49
  def test_parties_none() -> None:
26
50
  assert ex.extract_parties("There are no parties named here.") == []
27
51
 
@@ -39,6 +63,25 @@ def test_dates_iso_normalization() -> None:
39
63
  assert out["source"] == "deterministic"
40
64
 
41
65
 
66
+ def test_dates_effective_date_label_and_as_of() -> None:
67
+ # The "(the "Effective Date")" anchor, with the date wrapping a newline.
68
+ text = 'between A and B as of August\n31, 2016 (the "Effective Date").'
69
+ assert ex.extract_dates(text)["effective"]["value"] == "2016-08-31"
70
+ # Bare "as of <date>" cue.
71
+ assert ex.extract_dates("dated as of June 1, 2023")["effective"]["value"] == "2023-06-01"
72
+
73
+
74
+ def test_term_length_rejects_non_number() -> None:
75
+ # "...for consecutive days" must NOT be reported as a term length.
76
+ text = "the Employment Period shall run for consecutive days as scheduled"
77
+ assert ex.extract_term(text)["length"]["source"] == "none"
78
+
79
+
80
+ def test_title_skips_sgml_wrapper() -> None:
81
+ text = "<DOCUMENT>\n<TYPE>EX-10\n<TEXT>\n\nEMPLOYMENT AGREEMENT\n\nbody"
82
+ assert ex.extract_title(text, None, "text") == "EMPLOYMENT AGREEMENT"
83
+
84
+
42
85
  def test_dates_missing() -> None:
43
86
  out = ex.extract_dates("no dates in here")
44
87
  assert out["effective"] == ex._none_field()
@@ -61,6 +104,15 @@ def test_governing_law_stops_before_trailing_clause() -> None:
61
104
  assert out["value"] == "State of Delaware"
62
105
 
63
106
 
107
+ def test_governing_law_linebreak_handled_by_build() -> None:
108
+ # A jurisdiction that wraps a line ("...the Province\nof Ontario") is
109
+ # matched whole because build_extraction flattens whitespace first.
110
+ text = ("This Agreement shall be governed by the laws of the Province\n"
111
+ "of Ontario and the federal laws of Canada.")
112
+ r = ex.build_extraction(text, text.encode("utf-8"), "text", "x.txt")
113
+ assert r["governing_law"]["value"] == "Province of Ontario"
114
+
115
+
64
116
  def test_governing_law_missing() -> None:
65
117
  assert ex.extract_governing_law("nothing about law")["source"] == "none"
66
118
 
@@ -142,6 +142,45 @@ def test_pdf_unescape() -> None:
142
142
  assert ex._pdf_unescape(r"\101\102") == "AB" # octal escapes
143
143
 
144
144
 
145
+ def test_html_extraction() -> None:
146
+ raw, text, fmt, _w = ex.load_source(FIXTURES / "services_html.html")
147
+ assert fmt == "html"
148
+ # script/style content is dropped; entities are unescaped.
149
+ assert "this should never appear" not in text
150
+ result = ex.build_extraction(text, raw, fmt, "services_html.html")
151
+ assert result["document"]["format"] == "html"
152
+ assert [p["name"] for p in result["parties"]] == ["Initrode Systems, Inc.", "Hooli LLC"]
153
+ assert result["governing_law"]["value"] == "State of California"
154
+ assert result["dates"]["effective"]["value"] == "2023-03-15"
155
+ canon = {c["canonical_title"] for c in result["clauses"]}
156
+ assert {"Payment", "Termination", "Confidentiality", "Governing Law"} <= canon
157
+
158
+
159
+ def test_html_detected_by_content_sniff(tmp_path: Any) -> None:
160
+ # HTML masquerading as .txt (e.g. a SEC EDGAR full submission) is sniffed.
161
+ p = tmp_path / "exhibit.txt"
162
+ p.write_text("<html><body><p>between A Co and B Co</p></body></html>")
163
+ _raw, _text, fmt, _w = ex.load_source(p)
164
+ assert fmt == "html"
165
+
166
+
167
+ def test_html_malformed_does_not_crash() -> None:
168
+ assert ex._read_html("<p>unclosed <b>bold <div>text") is not None
169
+
170
+
171
+ def test_pdf_text_only_inside_bt_et() -> None:
172
+ # Strings outside BT/ET (font/signature/metadata stream bytes that happen to
173
+ # contain parentheses) must be ignored; only text objects yield text.
174
+ content = b"(garbage outside) /Font << >> BT (real text) Tj ET (more garbage)"
175
+ assert ex._pdf_text_from_content(content) == "real text"
176
+
177
+
178
+ def test_pdf_mostly_printable_backstop() -> None:
179
+ assert ex._mostly_printable("Hello, world")
180
+ assert not ex._mostly_printable("\x00\x01\x02\x03\x04\x05\x06\x07")
181
+ assert not ex._mostly_printable("")
182
+
183
+
145
184
  def test_extract_json_object_from_noise() -> None:
146
185
  assert ex._extract_json_object('prefix {"a": 1} suffix') == {"a": 1}
147
186
  assert ex._extract_json_object("no json here") is None
File without changes
File without changes
File without changes
File without changes
File without changes