extract-cli 0.1.9__tar.gz → 0.1.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {extract_cli-0.1.9 → extract_cli-0.1.11}/ARCHITECTURE.md +12 -5
- {extract_cli-0.1.9 → extract_cli-0.1.11}/CHANGELOG.md +52 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/Makefile +4 -1
- {extract_cli-0.1.9 → extract_cli-0.1.11}/PKG-INFO +6 -5
- {extract_cli-0.1.9 → extract_cli-0.1.11}/README.md +5 -4
- {extract_cli-0.1.9 → extract_cli-0.1.11}/extract_cli.py +137 -61
- {extract_cli-0.1.9 → extract_cli-0.1.11}/pyproject.toml +1 -1
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/employment_docx.docx.expected.json +2 -2
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/heading_docx.docx.expected.json +2 -2
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/nda_h2.md.expected.json +2 -2
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/numbered_docx.docx.expected.json +1 -1
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/scanned.pdf.expected.json +1 -1
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/services_bold.txt.expected.json +1 -1
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/services_html.html.expected.json +2 -2
- extract_cli-0.1.11/tests/test_coverage.py +254 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/test_deterministic.py +7 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/test_misc.py +40 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/test_schema_conformance.py +14 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/.gitignore +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/AGENTS.md +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/CONTRIBUTING.md +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/LICENSE +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/config/llm.json.example +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/docs/INTEROP.md +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/docs/spec/extract-output.schema.json +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/llms.txt +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/scripts/release.py +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/scripts/validate_against_spec.py +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/_fixtures_build.py +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/_make_goldens.py +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/_schema_validator.py +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/conftest.py +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/employment_docx.docx +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/heading_docx.docx +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/lease_allcaps.txt +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/license_pdf.pdf +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/nda_h2.md +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/numbered_docx.docx +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/scanned.pdf +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/services_bold.txt +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/fixtures/services_html.html +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/test_clause_map.py +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/test_cli.py +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/test_llm.py +0 -0
- {extract_cli-0.1.9 → extract_cli-0.1.11}/tests/test_property.py +0 -0
|
@@ -40,16 +40,23 @@ the "verify, not trust" contract downstream tools consume.
|
|
|
40
40
|
|
|
41
41
|
## The clause map
|
|
42
42
|
|
|
43
|
-
`detect_clauses(text)`
|
|
44
|
-
|
|
45
|
-
structure:
|
|
43
|
+
`detect_clauses(text)` extends template-vault-cli's clause cascade; the first
|
|
44
|
+
tier that fires wins so fallbacks never shadow real structure:
|
|
46
45
|
|
|
47
|
-
1. **`h2`** — `## Heading` (Markdown-native
|
|
46
|
+
1. **`h2`** — `## Heading` (Markdown-native; also what the DOCX reader emits for
|
|
47
|
+
Word heading styles / `w:numPr` paragraphs). Needs ≥ 1 match.
|
|
48
48
|
2. **`bold-numbered`** — `**1. Purpose**`, `**Section 4. Term**` (typical of
|
|
49
49
|
DOCX → text). Needs ≥ 2 matches.
|
|
50
|
-
3. **`all-caps`** — blank-line-framed `CONFIDENTIALITY` lines (typical of legal
|
|
50
|
+
3. **`numbered`** — plain `1. Term`, `Section 3. Payment`, and two-line
|
|
51
|
+
`ARTICLE N` + title (the dominant format in foreign paper), gated by a
|
|
52
|
+
title-case heuristic. Needs ≥ 2 matches.
|
|
53
|
+
4. **`all-caps`** — blank-line-framed `CONFIDENTIALITY` lines (typical of legal
|
|
51
54
|
PDFs), with the single-token-≥-4-letters rule. Needs ≥ 2 matches.
|
|
52
55
|
|
|
56
|
+
(Plus an opt-in **`llm`** clause-map fallback under `--llm` when none of the
|
|
57
|
+
above fire — see the LLM tier below.) After detection, running headers/footers
|
|
58
|
+
and front/back-matter are filtered (`_is_noise_clause_title` + repeat dedup).
|
|
59
|
+
|
|
53
60
|
`_strip_clause_number` removes leading numbering, including Roman numerals
|
|
54
61
|
1–39 (`_ROMAN_RE` lists longer alternatives first so the engine doesn't
|
|
55
62
|
short-circuit on a prefix — bare `V`/`X` match).
|
|
@@ -6,6 +6,56 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
|
|
|
6
6
|
(see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
|
|
7
7
|
the output schema require a major version bump**; new optional fields are minor.
|
|
8
8
|
|
|
9
|
+
## [0.1.11] - 2026-05-22
|
|
10
|
+
|
|
11
|
+
Polish pass.
|
|
12
|
+
|
|
13
|
+
### Fixed
|
|
14
|
+
- **Signature blocks no longer capture the next column's label.** A two-column
|
|
15
|
+
unsigned block (`By: By:` / `Name: Name:`) used to yield garbage
|
|
16
|
+
signatories like `{"name": "By:", "title": "Title:"}`; such captures (and
|
|
17
|
+
blank fill lines) are now rejected, so an unsigned template correctly returns
|
|
18
|
+
no signatories.
|
|
19
|
+
|
|
20
|
+
### Changed
|
|
21
|
+
- **`extract fields` and `--format table` now surface `jurisdiction`,
|
|
22
|
+
`amounts`, and `signatories`** — they were extracted and in the JSON but not
|
|
23
|
+
discoverable via the catalog or the human table view. A drift-guard test now
|
|
24
|
+
asserts `extract fields` can't diverge from the output schema.
|
|
25
|
+
- **Confidence values centralized into a documented scale** (named `CONF_*`
|
|
26
|
+
constants with a single descending ladder, replacing scattered magic numbers)
|
|
27
|
+
so downstream "verify, not trust" thresholds are principled. The only value
|
|
28
|
+
change: an affirmative auto-renewal is now `0.70` (was `0.65`), matching the
|
|
29
|
+
other best-effort term fields.
|
|
30
|
+
- Docs sweep: refreshed the clause-cascade description (h2 → bold-numbered →
|
|
31
|
+
numbered → all-caps, + the `--llm` fallback) across README/ARCHITECTURE and
|
|
32
|
+
the output-shape example. Line coverage held at 100% (CI-gated).
|
|
33
|
+
|
|
34
|
+
## [0.1.10] - 2026-05-22
|
|
35
|
+
|
|
36
|
+
### Fixed
|
|
37
|
+
- **The `[docx]` (python-docx) reader now honors Word heading styles**, matching
|
|
38
|
+
the stdlib reader. Previously the python-docx path concatenated paragraph text
|
|
39
|
+
and dropped `Heading1-9`/`Title` styles and `w:numPr` numbering, so installing
|
|
40
|
+
the `[docx]` extra produced an **empty clause map** on heading-styled Word
|
|
41
|
+
contracts (worse than the no-extra stdlib reader). Both readers now share one
|
|
42
|
+
emitter (`_emit_docx_paragraph`) that turns heading-styled / auto-numbered
|
|
43
|
+
paragraphs into `## headings`, so the two paths agree. New tests:
|
|
44
|
+
`test_emit_docx_paragraph` and `test_docx_readers_agree_on_clause_map` (the
|
|
45
|
+
latter asserts the python-docx and stdlib readers produce the same clause map).
|
|
46
|
+
No output-schema change.
|
|
47
|
+
|
|
48
|
+
### Tests / quality
|
|
49
|
+
- **Line coverage raised to 100%** (was 92%/94%). Added a targeted test battery
|
|
50
|
+
for the remaining reachable branches (color/`FORCE_COLOR`, `_warn` silent,
|
|
51
|
+
date/jurisdiction/title/clause edge returns, LLM request/parse/clause-map
|
|
52
|
+
branches, PDF `TJ`-array + stream/budget edges, HTML malformed fallback, DOCX
|
|
53
|
+
empty paragraph, `_is_low_signal` branches, CLI silent/help paths). Genuinely
|
|
54
|
+
unreachable defensive lines and `[docx]`/`[pdf]`-extra fidelity branches are
|
|
55
|
+
marked `# pragma: no cover`. `make coverage` now installs the extras and
|
|
56
|
+
enforces `--fail-under=100`; a CI `coverage` job gates it. No code-behavior or
|
|
57
|
+
schema change.
|
|
58
|
+
|
|
9
59
|
## [0.1.9] - 2026-05-22
|
|
10
60
|
|
|
11
61
|
### Security / robustness
|
|
@@ -271,6 +321,8 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
|
|
|
271
321
|
intentionally *not* governed by the output schema (the schema describes the
|
|
272
322
|
full default output).
|
|
273
323
|
|
|
324
|
+
[0.1.11]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.11
|
|
325
|
+
[0.1.10]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.10
|
|
274
326
|
[0.1.9]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.9
|
|
275
327
|
[0.1.8]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.8
|
|
276
328
|
[0.1.7]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.7
|
|
@@ -31,8 +31,11 @@ test-quick:
|
|
|
31
31
|
$(PYTHON) -m pytest -x -q -k "not property"
|
|
32
32
|
|
|
33
33
|
coverage:
|
|
34
|
+
# Install the [docx]/[pdf] extras so the fidelity-reader paths execute too;
|
|
35
|
+
# without them two extras-only branches stay uncovered (98% vs 100%).
|
|
36
|
+
$(PIP) install -q -e ".[dev,docx,pdf]"
|
|
34
37
|
$(PYTHON) -m coverage run --source=extract_cli -m pytest -q
|
|
35
|
-
$(PYTHON) -m coverage report -m
|
|
38
|
+
$(PYTHON) -m coverage report -m --fail-under=100
|
|
36
39
|
|
|
37
40
|
typecheck:
|
|
38
41
|
$(PYTHON) -m mypy --strict extract_cli.py
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: extract-cli
|
|
3
|
-
Version: 0.1.9
|
|
3
|
+
Version: 0.1.11
|
|
4
4
|
Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
|
|
5
5
|
Project-URL: Homepage, https://cli.drbaher.com/
|
|
6
6
|
Project-URL: Repository, https://github.com/DrBaher/extract-cli
|
|
@@ -180,16 +180,17 @@ Streams follow the suite convention: **stdout** is the machine payload (JSON),
|
|
|
180
180
|
"value": { "value": "$50,000", "confidence": 0.6, "source": "deterministic" },
|
|
181
181
|
"amounts": [ { "value": "$50,000", "confidence": 0.6, "source": "deterministic" } ],
|
|
182
182
|
"signatories": [ { "name": "Jane Doe", "title": "CEO", "confidence": 0.55, "source": "deterministic" } ],
|
|
183
|
-
"_meta": { "extractor_version": "0.1.
|
|
183
|
+
"_meta": { "extractor_version": "0.1.11", "tiers_used": ["deterministic"], "llm_used": false }
|
|
184
184
|
}
|
|
185
185
|
```
|
|
186
186
|
|
|
187
187
|
## The clause map (the differentiator)
|
|
188
188
|
|
|
189
189
|
A counterparty's "SECTION 7. NON-DISCLOSURE" and your template's
|
|
190
|
-
"## Confidentiality" are the same clause. `extract-cli`
|
|
191
|
-
template-vault-cli's **clause-detection cascade**
|
|
192
|
-
|
|
190
|
+
"## Confidentiality" are the same clause. `extract-cli` extends
|
|
191
|
+
template-vault-cli's **clause-detection cascade** — `## H2` headings →
|
|
192
|
+
bold-numbered `**1. …**` → plain numbered (`1. Term`, `Section 3. …`, two-line
|
|
193
|
+
`ARTICLE N`) → ALL-CAPS lines (and an opt-in `--llm` fallback) — plus a built-in
|
|
193
194
|
**canonical alias vocabulary** to normalize foreign clause titles onto the
|
|
194
195
|
names the rest of the suite already speaks. Clauses it can't map are kept with
|
|
195
196
|
`mapped: false` (and a `*` in the table view) so nothing is silently dropped.
|
|
@@ -142,16 +142,17 @@ Streams follow the suite convention: **stdout** is the machine payload (JSON),
|
|
|
142
142
|
"value": { "value": "$50,000", "confidence": 0.6, "source": "deterministic" },
|
|
143
143
|
"amounts": [ { "value": "$50,000", "confidence": 0.6, "source": "deterministic" } ],
|
|
144
144
|
"signatories": [ { "name": "Jane Doe", "title": "CEO", "confidence": 0.55, "source": "deterministic" } ],
|
|
145
|
-
"_meta": { "extractor_version": "0.1.
|
|
145
|
+
"_meta": { "extractor_version": "0.1.11", "tiers_used": ["deterministic"], "llm_used": false }
|
|
146
146
|
}
|
|
147
147
|
```
|
|
148
148
|
|
|
149
149
|
## The clause map (the differentiator)
|
|
150
150
|
|
|
151
151
|
A counterparty's "SECTION 7. NON-DISCLOSURE" and your template's
|
|
152
|
-
"## Confidentiality" are the same clause. `extract-cli`
|
|
153
|
-
template-vault-cli's **clause-detection cascade**
|
|
154
|
-
|
|
152
|
+
"## Confidentiality" are the same clause. `extract-cli` extends
|
|
153
|
+
template-vault-cli's **clause-detection cascade** — `## H2` headings →
|
|
154
|
+
bold-numbered `**1. …**` → plain numbered (`1. Term`, `Section 3. …`, two-line
|
|
155
|
+
`ARTICLE N`) → ALL-CAPS lines (and an opt-in `--llm` fallback) — plus a built-in
|
|
155
156
|
**canonical alias vocabulary** to normalize foreign clause titles onto the
|
|
156
157
|
names the rest of the suite already speaks. Clauses it can't map are kept with
|
|
157
158
|
`mapped: false` (and a `*` in the table view) so nothing is silently dropped.
|
|
@@ -43,11 +43,11 @@ import urllib.request
|
|
|
43
43
|
from pathlib import Path
|
|
44
44
|
from typing import Any, Dict, List, Optional, Tuple
|
|
45
45
|
|
|
46
|
-
__version__ = "0.1.9"
|
|
46
|
+
__version__ = "0.1.11"
|
|
47
47
|
|
|
48
48
|
# Bumped independently of the package version when the *extraction logic*
|
|
49
49
|
# changes in a way downstream consumers should notice. Embedded in `_meta`.
|
|
50
|
-
EXTRACTOR_VERSION = "0.1.
|
|
50
|
+
EXTRACTOR_VERSION = "0.1.11"
|
|
51
51
|
|
|
52
52
|
# JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
|
|
53
53
|
SCHEMA_VERSION = 1
|
|
@@ -503,6 +503,42 @@ def _none_field() -> JSON:
|
|
|
503
503
|
return {"value": None, "confidence": 0.0, "source": "none"}
|
|
504
504
|
|
|
505
505
|
|
|
506
|
+
# --- Confidence scale -------------------------------------------------------
|
|
507
|
+
# These confidences are "verify, not trust" hints in [0, 1] -- a ranking of
|
|
508
|
+
# *structural certainty*, not calibrated probabilities. Higher means the
|
|
509
|
+
# extraction rests on more unambiguous structure; lower means a looser heuristic
|
|
510
|
+
# or an LLM guess. Downstream tools threshold on them, so they are centralized
|
|
511
|
+
# here and ordered into a single descending ladder rather than scattered as
|
|
512
|
+
# magic numbers:
|
|
513
|
+
#
|
|
514
|
+
# .95 explicit Markdown H2 heading
|
|
515
|
+
# .90 strong unambiguous pattern (parties "between X and Y"; labeled date)
|
|
516
|
+
# .85 clear keyword/structure (governing law; ISO date; bold-numbered heading)
|
|
517
|
+
# .80 keyworded but looser (plain numbered/ARTICLE heading; jurisdiction code)
|
|
518
|
+
# .75 structural-only heading (ALL-CAPS)
|
|
519
|
+
# .70 best-effort regex on common phrasing (term length, notice, auto-renew)
|
|
520
|
+
# .60 weak heuristic / LLM-enriched scalar (value, amounts, defined terms)
|
|
521
|
+
# .55 loose match (signature block, LLM obligations, non-ISO raw date)
|
|
522
|
+
# .50 fuzzy (LLM clause-map fallback)
|
|
523
|
+
CONF_H2 = 0.95
|
|
524
|
+
CONF_PARTIES = 0.90
|
|
525
|
+
CONF_DATE_LABELED = 0.90
|
|
526
|
+
CONF_DATE_ISO = 0.85
|
|
527
|
+
CONF_GOVERNING_LAW = 0.85
|
|
528
|
+
CONF_BOLD_HEADING = 0.85
|
|
529
|
+
CONF_NUMBERED_HEADING = 0.80
|
|
530
|
+
CONF_JURISDICTION = 0.80
|
|
531
|
+
CONF_ALLCAPS_HEADING = 0.75
|
|
532
|
+
CONF_TERM = 0.70
|
|
533
|
+
CONF_WEAK = 0.60
|
|
534
|
+
CONF_LLM = 0.60
|
|
535
|
+
CONF_DATE_RAW = 0.55
|
|
536
|
+
CONF_LLM_LIST = 0.55
|
|
537
|
+
CONF_SIGNATORY = 0.55
|
|
538
|
+
CONF_LLM_CLAUSE = 0.50
|
|
539
|
+
CONF_UNMAPPED_FACTOR = 0.75 # multiplier applied to a clause that doesn't map to the vocabulary
|
|
540
|
+
|
|
541
|
+
|
|
506
542
|
def _titlecase(s: str) -> str:
|
|
507
543
|
s = s.strip()
|
|
508
544
|
if not s:
|
|
@@ -675,7 +711,7 @@ def _date_field_from_str(raw: str, base_conf: float) -> JSON:
|
|
|
675
711
|
def _date_field(match: Optional["re.Match[str]"]) -> JSON:
|
|
676
712
|
if match is None:
|
|
677
713
|
return _none_field()
|
|
678
|
-
return _date_field_from_str(match.group(1),
|
|
714
|
+
return _date_field_from_str(match.group(1), CONF_DATE_ISO)
|
|
679
715
|
|
|
680
716
|
|
|
681
717
|
# Trailing descriptors that follow a party's actual name and should be dropped
|
|
@@ -739,7 +775,7 @@ def extract_parties(text: str) -> List[JSON]:
|
|
|
739
775
|
name, role = _split_name_role(raw)
|
|
740
776
|
if not name or len(name) < 2 or len(name) > 120:
|
|
741
777
|
continue
|
|
742
|
-
entry: JSON = {"name": name, "confidence":
|
|
778
|
+
entry: JSON = {"name": name, "confidence": CONF_PARTIES, "source": "deterministic"}
|
|
743
779
|
entry["role"] = role
|
|
744
780
|
out.append(entry)
|
|
745
781
|
return out
|
|
@@ -748,7 +784,7 @@ def extract_parties(text: str) -> List[JSON]:
|
|
|
748
784
|
def extract_dates(text: str) -> JSON:
|
|
749
785
|
label = _EFFDATE_LABEL_RE.search(text)
|
|
750
786
|
if label is not None:
|
|
751
|
-
effective = _date_field_from_str(label.group(1),
|
|
787
|
+
effective = _date_field_from_str(label.group(1), CONF_DATE_LABELED)
|
|
752
788
|
else:
|
|
753
789
|
effective = _date_field(_EFFECTIVE_RE.search(text))
|
|
754
790
|
return {"effective": effective, "expiration": _date_field(_EXPIRE_RE.search(text))}
|
|
@@ -759,9 +795,9 @@ def extract_governing_law(text: str) -> JSON:
|
|
|
759
795
|
if not m:
|
|
760
796
|
return _none_field()
|
|
761
797
|
juris = re.sub(r"\s+", " ", m.group(1).strip().rstrip(".,")).strip()
|
|
762
|
-
if not juris:
|
|
798
|
+
if not juris: # pragma: no cover - the capture group requires a leading letter
|
|
763
799
|
return _none_field()
|
|
764
|
-
return _field(juris,
|
|
800
|
+
return _field(juris, CONF_GOVERNING_LAW)
|
|
765
801
|
|
|
766
802
|
|
|
767
803
|
def extract_term(text: str) -> JSON:
|
|
@@ -773,20 +809,20 @@ def extract_term(text: str) -> JSON:
|
|
|
773
809
|
# Only emit when the captured token is a real number; otherwise the
|
|
774
810
|
# match was a coincidence ("...consecutive days") -> leave as not-found.
|
|
775
811
|
if num is not None:
|
|
776
|
-
length = _field(f"{num} {unit}{'s' if num != 1 else ''}",
|
|
812
|
+
length = _field(f"{num} {unit}{'s' if num != 1 else ''}", CONF_TERM)
|
|
777
813
|
|
|
778
814
|
notice = _none_field()
|
|
779
815
|
nm = _NOTICE_RE.search(text)
|
|
780
816
|
if nm:
|
|
781
817
|
days = _word_to_int(nm.group(1))
|
|
782
818
|
if days is not None:
|
|
783
|
-
notice = _field(days,
|
|
819
|
+
notice = _field(days, CONF_TERM)
|
|
784
820
|
|
|
785
821
|
auto = _none_field()
|
|
786
822
|
if _AUTORENEW_NEG_RE.search(text):
|
|
787
|
-
auto = _field(False,
|
|
823
|
+
auto = _field(False, CONF_TERM)
|
|
788
824
|
elif _AUTORENEW_POS_RE.search(text):
|
|
789
|
-
auto = _field(True, 0.65)
|
|
825
|
+
auto = _field(True, CONF_TERM)
|
|
790
826
|
|
|
791
827
|
return {"length": length, "auto_renew": auto, "notice_period_days": notice}
|
|
792
828
|
|
|
@@ -795,7 +831,7 @@ def extract_value(text: str) -> JSON:
|
|
|
795
831
|
m = _MONEY_RE.search(text)
|
|
796
832
|
if not m:
|
|
797
833
|
return _none_field()
|
|
798
|
-
return _field(re.sub(r"\s+", " ", m.group(0).strip()),
|
|
834
|
+
return _field(re.sub(r"\s+", " ", m.group(0).strip()), CONF_WEAK)
|
|
799
835
|
|
|
800
836
|
|
|
801
837
|
def extract_amounts(text: str) -> List[JSON]:
|
|
@@ -807,7 +843,7 @@ def extract_amounts(text: str) -> List[JSON]:
|
|
|
807
843
|
seen.setdefault(amt, None)
|
|
808
844
|
if len(seen) >= 30:
|
|
809
845
|
break
|
|
810
|
-
return [{"value": a, "confidence":
|
|
846
|
+
return [{"value": a, "confidence": CONF_WEAK, "source": "deterministic"} for a in seen]
|
|
811
847
|
|
|
812
848
|
|
|
813
849
|
# Signature blocks: "By: <name>", "Name: <name>", "Printed Name: <name>".
|
|
@@ -820,20 +856,32 @@ _SIG_TITLE_RE = re.compile(
|
|
|
820
856
|
r"(?:^|\n)[ \t]*(?:Title|Its)[ \t]*:[ \t]*([^\n_{}\[\]]{2,60})",
|
|
821
857
|
re.IGNORECASE,
|
|
822
858
|
)
|
|
859
|
+
# A captured value is rejected when it's really the next column's label (common
|
|
860
|
+
# in two-column signature blocks: "By: By:") or a blank fill line.
|
|
861
|
+
_SIG_LABEL_RE = re.compile(r"(?:by|name|title|signature|its|date|signed|print)\b", re.IGNORECASE)
|
|
862
|
+
|
|
863
|
+
|
|
864
|
+
def _clean_sig_value(raw: str) -> Optional[str]:
|
|
865
|
+
v = re.sub(r"\s+", " ", raw).strip(" .,:")
|
|
866
|
+
if (len(v) < 2 or v.lower() == "the"
|
|
867
|
+
or not any(c.isalpha() for c in v)
|
|
868
|
+
or _SIG_LABEL_RE.match(v)):
|
|
869
|
+
return None
|
|
870
|
+
return v
|
|
823
871
|
|
|
824
872
|
|
|
825
873
|
def extract_signatories(text: str) -> List[JSON]:
|
|
826
874
|
"""Best-effort signature-block names (and titles, when adjacent). Skips
|
|
827
875
|
unfilled placeholders. Blank on a template; populated on executed paper."""
|
|
828
|
-
titles = [
|
|
876
|
+
titles = [_clean_sig_value(m.group(1)) for m in _SIG_TITLE_RE.finditer(text)]
|
|
829
877
|
out: List[JSON] = []
|
|
830
878
|
seen: Dict[str, None] = {}
|
|
831
879
|
for i, m in enumerate(_SIGNATORY_RE.finditer(text)):
|
|
832
|
-
name =
|
|
833
|
-
if
|
|
880
|
+
name = _clean_sig_value(m.group(1))
|
|
881
|
+
if name is None or name in seen:
|
|
834
882
|
continue
|
|
835
883
|
seen[name] = None
|
|
836
|
-
entry: JSON = {"name": name, "confidence":
|
|
884
|
+
entry: JSON = {"name": name, "confidence": CONF_SIGNATORY, "source": "deterministic"}
|
|
837
885
|
entry["title"] = titles[i] if i < len(titles) else None
|
|
838
886
|
out.append(entry)
|
|
839
887
|
if len(out) >= 12:
|
|
@@ -869,7 +917,7 @@ def extract_jurisdiction(governing_law: JSON) -> JSON:
|
|
|
869
917
|
if len(name) >= 5 and name in key:
|
|
870
918
|
code = c
|
|
871
919
|
break
|
|
872
|
-
return _field(code,
|
|
920
|
+
return _field(code, CONF_JURISDICTION, "deterministic") if code else _none_field()
|
|
873
921
|
|
|
874
922
|
|
|
875
923
|
def extract_defined_terms(text: str) -> List[JSON]:
|
|
@@ -880,12 +928,12 @@ def extract_defined_terms(text: str) -> List[JSON]:
|
|
|
880
928
|
# Reject sentence-like or lowercase-y captures.
|
|
881
929
|
if len(term) < 2 or len(term.split()) > 6:
|
|
882
930
|
continue
|
|
883
|
-
if not term[0].isupper():
|
|
931
|
+
if not term[0].isupper(): # pragma: no cover - the regexes require an uppercase lead
|
|
884
932
|
continue
|
|
885
933
|
seen.setdefault(term, None)
|
|
886
934
|
if len(seen) >= 50:
|
|
887
935
|
break
|
|
888
|
-
return [{"term": t, "confidence":
|
|
936
|
+
return [{"term": t, "confidence": CONF_WEAK, "source": "deterministic"} for t in seen]
|
|
889
937
|
|
|
890
938
|
|
|
891
939
|
# Detected-heading titles that are almost never real clauses: front/back-matter,
|
|
@@ -936,9 +984,9 @@ def extract_clauses(text: str) -> List[JSON]:
|
|
|
936
984
|
continue
|
|
937
985
|
canonical, mapped = _canonicalize_clause(c["title"])
|
|
938
986
|
tier = c["tier"]
|
|
939
|
-
base = {"h2":
|
|
940
|
-
"all-caps":
|
|
941
|
-
conf = round(base * (1.0 if mapped else
|
|
987
|
+
base = {"h2": CONF_H2, "bold-numbered": CONF_BOLD_HEADING, "numbered": CONF_NUMBERED_HEADING,
|
|
988
|
+
"all-caps": CONF_ALLCAPS_HEADING, "explicit": CONF_H2}.get(tier, CONF_TERM)
|
|
989
|
+
conf = round(base * (1.0 if mapped else CONF_UNMAPPED_FACTOR), 2)
|
|
942
990
|
out.append({
|
|
943
991
|
"canonical_title": canonical,
|
|
944
992
|
"detected_title": c["detected"],
|
|
@@ -1075,13 +1123,20 @@ def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[st
|
|
|
1075
1123
|
mod = importlib.import_module("docx")
|
|
1076
1124
|
document_cls = getattr(mod, "Document")
|
|
1077
1125
|
doc = document_cls(str(path))
|
|
1126
|
+
w = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
|
|
1078
1127
|
lines: List[str] = []
|
|
1079
1128
|
for para in doc.paragraphs:
|
|
1080
1129
|
line = (para.text or "").strip()
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1130
|
+
# Read the style + numbering off the underlying element so the
|
|
1131
|
+
# cascade sees clause headings (the same logic the stdlib reader
|
|
1132
|
+
# applies); python-docx alone exposes neither as a heading.
|
|
1133
|
+
ppr = para._p.find(w + "pPr")
|
|
1134
|
+
style = _docx_paragraph_style(ppr, w)
|
|
1135
|
+
numbered = bool(ppr is not None and ppr.find(w + "numPr") is not None)
|
|
1136
|
+
all_bold = bool(para.runs) and all(
|
|
1137
|
+
getattr(r, "bold", False) for r in para.runs if (r.text or "").strip())
|
|
1138
|
+
_emit_docx_paragraph(lines, line, style, numbered, all_bold)
|
|
1139
|
+
for table in getattr(doc, "tables", []): # pragma: no cover - [docx] fidelity
|
|
1085
1140
|
for row in table.rows:
|
|
1086
1141
|
for cell in row.cells:
|
|
1087
1142
|
ct = (cell.text or "").strip()
|
|
@@ -1130,6 +1185,30 @@ def _docx_heading_title(text: str) -> Optional[str]:
|
|
|
1130
1185
|
return title
|
|
1131
1186
|
|
|
1132
1187
|
|
|
1188
|
+
def _emit_docx_paragraph(out: List[str], line: str, style: Optional[str],
|
|
1189
|
+
numbered: bool, all_bold: bool) -> None:
|
|
1190
|
+
"""Append one .docx paragraph to `out` the way the clause cascade expects.
|
|
1191
|
+
|
|
1192
|
+
Heading-styled (Heading1-9/Title) or auto-numbered (`w:numPr`) paragraphs --
|
|
1193
|
+
whose visible number is auto-generated and absent from the text -- become a
|
|
1194
|
+
`## <title>` heading (with any run-in body split onto the next line) when the
|
|
1195
|
+
lead looks like a heading; a fully-bold paragraph becomes `**...**`; anything
|
|
1196
|
+
else stays plain. Shared by BOTH the python-docx and stdlib readers so the
|
|
1197
|
+
two paths agree on structure (the python-docx path used to flatten headings,
|
|
1198
|
+
losing the clause map on heading-styled Word docs)."""
|
|
1199
|
+
if not line:
|
|
1200
|
+
out.append("")
|
|
1201
|
+
return
|
|
1202
|
+
if _is_heading_style(style) or numbered:
|
|
1203
|
+
title = _docx_heading_title(line)
|
|
1204
|
+
if title is not None:
|
|
1205
|
+
out.append(f"## {title}")
|
|
1206
|
+
if len(title) < len(line):
|
|
1207
|
+
out.append(line[len(title):].lstrip(" .:\t"))
|
|
1208
|
+
return
|
|
1209
|
+
out.append(f"**{line}**" if all_bold else line)
|
|
1210
|
+
|
|
1211
|
+
|
|
1133
1212
|
def _read_docx_stdlib(raw: bytes) -> str:
|
|
1134
1213
|
import io
|
|
1135
1214
|
import zipfile
|
|
@@ -1153,39 +1232,23 @@ def _read_docx_stdlib(raw: bytes) -> str:
|
|
|
1153
1232
|
style = _docx_paragraph_style(ppr, w)
|
|
1154
1233
|
numbered = ppr is not None and ppr.find(w + "numPr") is not None
|
|
1155
1234
|
run_texts: List[str] = []
|
|
1156
|
-
any_text = False
|
|
1157
1235
|
all_bold = True
|
|
1158
1236
|
for r in p.iter(w + "r"):
|
|
1159
1237
|
rpr = r.find(w + "rPr")
|
|
1160
1238
|
bold = rpr is not None and rpr.find(w + "b") is not None
|
|
1161
1239
|
txt = "".join(t.text or "" for t in r.iter(w + "t"))
|
|
1162
1240
|
if txt:
|
|
1163
|
-
any_text = True
|
|
1164
1241
|
if not bold:
|
|
1165
1242
|
all_bold = False
|
|
1166
1243
|
run_texts.append(txt)
|
|
1167
1244
|
line = "".join(run_texts).strip()
|
|
1168
|
-
if not line:
|
|
1169
|
-
paras.append("")
|
|
1170
|
-
continue
|
|
1171
1245
|
# Clause structure in real Word contracts lives in heading STYLES
|
|
1172
1246
|
# (Heading1-9/Title) or auto-NUMBERED paragraphs (w:numPr) -- in both the
|
|
1173
|
-
# visible number is auto-generated and absent from the text.
|
|
1174
|
-
#
|
|
1175
|
-
#
|
|
1176
|
-
#
|
|
1177
|
-
|
|
1178
|
-
if _is_heading_style(style) or numbered:
|
|
1179
|
-
title = _docx_heading_title(line)
|
|
1180
|
-
if title is not None:
|
|
1181
|
-
paras.append(f"## {title}")
|
|
1182
|
-
if len(title) < len(line):
|
|
1183
|
-
paras.append(line[len(title):].lstrip(" .:\t"))
|
|
1184
|
-
continue
|
|
1185
|
-
# Not heading-like -> treat as ordinary body text.
|
|
1186
|
-
if any_text and all_bold:
|
|
1187
|
-
line = f"**{line}**"
|
|
1188
|
-
paras.append(line)
|
|
1247
|
+
# visible number is auto-generated and absent from the text. The shared
|
|
1248
|
+
# emitter turns those into `## headings` (run-in body split off), bolds
|
|
1249
|
+
# fully-bold lines, and keeps the rest plain. _docx_heading_title rejects
|
|
1250
|
+
# full-sentence body items, so this stays conservative.
|
|
1251
|
+
_emit_docx_paragraph(paras, line, style, numbered, all_bold)
|
|
1189
1252
|
return "\n\n".join(paras)
|
|
1190
1253
|
|
|
1191
1254
|
|
|
@@ -1209,7 +1272,7 @@ def _read_pdf(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[str
|
|
|
1209
1272
|
warnings.append(f"pypdf read failed ({e}); falling back to stdlib reader")
|
|
1210
1273
|
try:
|
|
1211
1274
|
text = _read_pdf_stdlib(raw)
|
|
1212
|
-
except Exception as e:
|
|
1275
|
+
except Exception as e: # pragma: no cover - defensive; stdlib reader is bomb-guarded
|
|
1213
1276
|
warnings.append(f"could not parse .pdf ({e}); treating as empty")
|
|
1214
1277
|
return "", warnings
|
|
1215
1278
|
return text, warnings
|
|
@@ -1342,7 +1405,7 @@ def load_source(path: Path, prefer_optional: bool = True) -> Tuple[bytes, str, s
|
|
|
1342
1405
|
raise ExtractError(f"path is a directory, not a file: {path}")
|
|
1343
1406
|
try:
|
|
1344
1407
|
size = path.stat().st_size
|
|
1345
|
-
except OSError:
|
|
1408
|
+
except OSError: # pragma: no cover - defensive; path.exists() already passed
|
|
1346
1409
|
size = 0
|
|
1347
1410
|
if size > MAX_INPUT_BYTES:
|
|
1348
1411
|
raise ExtractError(
|
|
@@ -1557,7 +1620,7 @@ def _llm_clause_map(raw: Any, text: str) -> List[JSON]:
|
|
|
1557
1620
|
"detected_title": title,
|
|
1558
1621
|
"tier": "llm",
|
|
1559
1622
|
"span": span,
|
|
1560
|
-
"confidence":
|
|
1623
|
+
"confidence": CONF_LLM_CLAUSE,
|
|
1561
1624
|
"source": "llm",
|
|
1562
1625
|
"mapped": mapped,
|
|
1563
1626
|
})
|
|
@@ -1592,18 +1655,18 @@ def llm_enrich(result: JSON, text: str, args_ns: argparse.Namespace) -> None:
|
|
|
1592
1655
|
enriched = False
|
|
1593
1656
|
rm = obj.get("renewal_mechanics")
|
|
1594
1657
|
if isinstance(rm, str) and rm.strip():
|
|
1595
|
-
result["term"]["renewal_mechanics"] = _field(rm.strip(),
|
|
1658
|
+
result["term"]["renewal_mechanics"] = _field(rm.strip(), CONF_LLM, "llm")
|
|
1596
1659
|
enriched = True
|
|
1597
1660
|
obligations = obj.get("obligations")
|
|
1598
1661
|
if isinstance(obligations, list) and obligations:
|
|
1599
1662
|
result["obligations"] = [
|
|
1600
|
-
{"text": str(o).strip(), "confidence":
|
|
1663
|
+
{"text": str(o).strip(), "confidence": CONF_LLM_LIST, "source": "llm"}
|
|
1601
1664
|
for o in obligations[:5] if str(o).strip()
|
|
1602
1665
|
]
|
|
1603
1666
|
enriched = True
|
|
1604
1667
|
gl = obj.get("governing_law")
|
|
1605
1668
|
if isinstance(gl, str) and gl.strip() and result["governing_law"]["source"] == "none":
|
|
1606
|
-
result["governing_law"] = _field(gl.strip(),
|
|
1669
|
+
result["governing_law"] = _field(gl.strip(), CONF_LLM, "llm")
|
|
1607
1670
|
enriched = True
|
|
1608
1671
|
if want_clauses:
|
|
1609
1672
|
cmap = _llm_clause_map(obj.get("clauses"), text)
|
|
@@ -1692,10 +1755,20 @@ def render_table(result: JSON, no_confidence: bool) -> str:
|
|
|
1692
1755
|
lines.append(f" renewal : {_fv(term['renewal_mechanics'])} {_dim('[llm]')}")
|
|
1693
1756
|
if "governing_law" in result:
|
|
1694
1757
|
lines.append(_bold("Governing law"))
|
|
1695
|
-
|
|
1758
|
+
juris = result.get("jurisdiction", {}).get("value")
|
|
1759
|
+
suffix = _dim(f" [{juris}]") if juris else ""
|
|
1760
|
+
lines.append(f" {_fv(result['governing_law'])}{suffix}")
|
|
1696
1761
|
if "value" in result:
|
|
1762
|
+
amts = result.get("amounts") or []
|
|
1763
|
+
extra = _dim(f" (+{len(amts) - 1} more)") if len(amts) > 1 else ""
|
|
1697
1764
|
lines.append(_bold("Value"))
|
|
1698
|
-
lines.append(f" {_fv(result['value'])}")
|
|
1765
|
+
lines.append(f" {_fv(result['value'])}{extra}")
|
|
1766
|
+
signatories = result.get("signatories")
|
|
1767
|
+
if signatories:
|
|
1768
|
+
lines.append(_bold(f"Signatories ({len(signatories)})"))
|
|
1769
|
+
for s in signatories[:6]:
|
|
1770
|
+
title = f" - {s['title']}" if s.get("title") else ""
|
|
1771
|
+
lines.append(f" {s['name']}{title}")
|
|
1699
1772
|
clauses = result.get("clauses")
|
|
1700
1773
|
if clauses is not None:
|
|
1701
1774
|
lines.append(_bold(f"Clause map ({len(clauses)})"))
|
|
@@ -1920,11 +1993,14 @@ FIELD_CATALOG: Tuple[Tuple[str, str, str], ...] = (
|
|
|
1920
1993
|
("term.length", "deterministic", "Term length, best-effort"),
|
|
1921
1994
|
("term.notice_period_days", "deterministic", "Notice period in days, best-effort"),
|
|
1922
1995
|
("term.auto_renew", "deterministic", "Auto-renewal flag, best-effort"),
|
|
1923
|
-
("governing_law", "deterministic", "Governing law
|
|
1996
|
+
("governing_law", "deterministic", "Governing law text ('governed by the laws of ...')"),
|
|
1997
|
+
("jurisdiction", "deterministic", "Governing law normalized to a code (e.g. US-DE)"),
|
|
1924
1998
|
("clauses", "deterministic", "Clause map normalized to the suite's canonical vocabulary "
|
|
1925
1999
|
"(LLM fallback under --llm when no headings are detected)"),
|
|
1926
2000
|
("defined_terms", "deterministic", "Defined-term inventory (quoted / parenthetical)"),
|
|
1927
2001
|
("value", "deterministic", "Headline monetary value"),
|
|
2002
|
+
("amounts", "deterministic", "All distinct monetary amounts"),
|
|
2003
|
+
("signatories", "deterministic", "Signature-block names/titles (By:/Name:/Title:)"),
|
|
1928
2004
|
("term.renewal_mechanics", "llm", "Renewal mechanics (fuzzy; --llm only)"),
|
|
1929
2005
|
("obligations", "llm", "Key obligation phrasing (fuzzy; --llm only)"),
|
|
1930
2006
|
)
|
|
@@ -2315,7 +2391,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
|
|
2315
2391
|
if hasattr(_stream, "reconfigure"):
|
|
2316
2392
|
try:
|
|
2317
2393
|
_stream.reconfigure(encoding="utf-8", errors="replace")
|
|
2318
|
-
except Exception:
|
|
2394
|
+
except Exception: # pragma: no cover - defensive
|
|
2319
2395
|
pass
|
|
2320
2396
|
|
|
2321
2397
|
argv = sys.argv[1:] if argv is None else argv
|
|
@@ -2358,7 +2434,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
|
|
2358
2434
|
if first in known:
|
|
2359
2435
|
parser = build_parser()
|
|
2360
2436
|
args = parser.parse_args(argv)
|
|
2361
|
-
if not getattr(args, "func", None):
|
|
2437
|
+
if not getattr(args, "func", None): # pragma: no cover - argparse always sets func
|
|
2362
2438
|
parser.print_help()
|
|
2363
2439
|
return 0
|
|
2364
2440
|
else:
|
|
@@ -2370,7 +2446,7 @@ def main(argv: Optional[List[str]] = None) -> int:
|
|
|
2370
2446
|
except BrokenPipeError: # e.g. `extract foo.md | head`
|
|
2371
2447
|
try:
|
|
2372
2448
|
sys.stdout.close()
|
|
2373
|
-
except Exception:
|
|
2449
|
+
except Exception: # pragma: no cover - defensive
|
|
2374
2450
|
pass
|
|
2375
2451
|
return 0
|
|
2376
2452
|
except KeyboardInterrupt: # pragma: no cover
|
|
@@ -2378,5 +2454,5 @@ def main(argv: Optional[List[str]] = None) -> int:
|
|
|
2378
2454
|
return 130
|
|
2379
2455
|
|
|
2380
2456
|
|
|
2381
|
-
if __name__ == "__main__":
|
|
2457
|
+
if __name__ == "__main__": # pragma: no cover
|
|
2382
2458
|
sys.exit(main())
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "extract-cli"
|
|
7
|
-
version = "0.1.
|
|
7
|
+
version = "0.1.11"
|
|
8
8
|
description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -39,7 +39,7 @@
|
|
|
39
39
|
},
|
|
40
40
|
"auto_renew": {
|
|
41
41
|
"value": true,
|
|
42
|
-
"confidence": 0.
|
|
42
|
+
"confidence": 0.7,
|
|
43
43
|
"source": "deterministic"
|
|
44
44
|
},
|
|
45
45
|
"notice_period_days": {
|
|
@@ -151,7 +151,7 @@
|
|
|
151
151
|
],
|
|
152
152
|
"signatories": [],
|
|
153
153
|
"_meta": {
|
|
154
|
-
"extractor_version": "0.1.
|
|
154
|
+
"extractor_version": "0.1.11",
|
|
155
155
|
"tiers_used": [
|
|
156
156
|
"deterministic"
|
|
157
157
|
],
|
|
@@ -39,7 +39,7 @@
|
|
|
39
39
|
},
|
|
40
40
|
"auto_renew": {
|
|
41
41
|
"value": true,
|
|
42
|
-
"confidence": 0.
|
|
42
|
+
"confidence": 0.7,
|
|
43
43
|
"source": "deterministic"
|
|
44
44
|
},
|
|
45
45
|
"notice_period_days": {
|
|
@@ -140,7 +140,7 @@
|
|
|
140
140
|
"amounts": [],
|
|
141
141
|
"signatories": [],
|
|
142
142
|
"_meta": {
|
|
143
|
-
"extractor_version": "0.1.
|
|
143
|
+
"extractor_version": "0.1.11",
|
|
144
144
|
"tiers_used": [
|
|
145
145
|
"deterministic"
|
|
146
146
|
],
|
|
@@ -39,7 +39,7 @@
|
|
|
39
39
|
},
|
|
40
40
|
"auto_renew": {
|
|
41
41
|
"value": true,
|
|
42
|
-
"confidence": 0.
|
|
42
|
+
"confidence": 0.7,
|
|
43
43
|
"source": "deterministic"
|
|
44
44
|
},
|
|
45
45
|
"notice_period_days": {
|
|
@@ -150,7 +150,7 @@
|
|
|
150
150
|
"amounts": [],
|
|
151
151
|
"signatories": [],
|
|
152
152
|
"_meta": {
|
|
153
|
-
"extractor_version": "0.1.
|
|
153
|
+
"extractor_version": "0.1.11",
|
|
154
154
|
"tiers_used": [
|
|
155
155
|
"deterministic"
|
|
156
156
|
],
|
|
@@ -39,7 +39,7 @@
|
|
|
39
39
|
},
|
|
40
40
|
"auto_renew": {
|
|
41
41
|
"value": true,
|
|
42
|
-
"confidence": 0.
|
|
42
|
+
"confidence": 0.7,
|
|
43
43
|
"source": "deterministic"
|
|
44
44
|
},
|
|
45
45
|
"notice_period_days": {
|
|
@@ -161,7 +161,7 @@
|
|
|
161
161
|
],
|
|
162
162
|
"signatories": [],
|
|
163
163
|
"_meta": {
|
|
164
|
-
"extractor_version": "0.1.
|
|
164
|
+
"extractor_version": "0.1.11",
|
|
165
165
|
"tiers_used": [
|
|
166
166
|
"deterministic"
|
|
167
167
|
],
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
"""Targeted tests that exercise the remaining reachable branches, to keep line
|
|
2
|
+
coverage at its practical maximum. (Genuinely-unreachable defensive lines and
|
|
3
|
+
[docx]/[pdf]-extra fidelity branches are marked `# pragma: no cover` in the
|
|
4
|
+
source.)"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import argparse
|
|
8
|
+
import io
|
|
9
|
+
import json
|
|
10
|
+
import sys as _sys
|
|
11
|
+
import zipfile
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
import pytest
|
|
15
|
+
|
|
16
|
+
import extract_cli as ex
|
|
17
|
+
from tests.conftest import FIXTURES
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _ns(**kw: object) -> argparse.Namespace:
|
|
21
|
+
base = {"silent": False, "why": False}
|
|
22
|
+
base.update(kw)
|
|
23
|
+
return argparse.Namespace(**base)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# --- color + warn -----------------------------------------------------------
|
|
27
|
+
|
|
28
|
+
def test_color_force_on_and_isatty_exception(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
29
|
+
monkeypatch.delenv("NO_COLOR", raising=False)
|
|
30
|
+
monkeypatch.setenv("FORCE_COLOR", "1")
|
|
31
|
+
assert ex._color_enabled() is True
|
|
32
|
+
assert ex._c("x", "32") == "\033[32mx\033[0m"
|
|
33
|
+
monkeypatch.delenv("FORCE_COLOR", raising=False)
|
|
34
|
+
|
|
35
|
+
class _Bad:
|
|
36
|
+
def isatty(self) -> bool:
|
|
37
|
+
raise ValueError("boom")
|
|
38
|
+
assert ex._color_enabled(_Bad()) is False
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_warn_silent_is_suppressed(capsys: pytest.CaptureFixture[str]) -> None:
|
|
42
|
+
ex._warn(_ns(silent=True), "hush")
|
|
43
|
+
assert capsys.readouterr().err == ""
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# --- small helpers ----------------------------------------------------------
|
|
47
|
+
|
|
48
|
+
def test_titlecase_edges() -> None:
|
|
49
|
+
assert ex._titlecase(" ") == ""
|
|
50
|
+
assert ex._titlecase("IP Rights") == "IP Rights" # acronym preserved in mixed case
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def test_word_to_int_digit_and_unknown() -> None:
|
|
54
|
+
assert ex._word_to_int("30") == 30
|
|
55
|
+
assert ex._word_to_int("zzz") is None
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def test_date_parse_none_and_unparseable_raw() -> None:
|
|
59
|
+
assert ex._parse_date_to_iso("not a date") is None
|
|
60
|
+
f = ex._date_field_from_str("13/13/2024", 0.85) # matches shape, invalid month
|
|
61
|
+
assert f["source"] == "deterministic" and f["confidence"] < 0.85
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def test_canonicalize_empty_key() -> None:
|
|
65
|
+
assert ex._canonicalize_clause(" ") == (None, False)
|
|
66
|
+
assert ex._canonicalize_clause("1.") == (None, False)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def test_governing_law_and_title_none() -> None:
|
|
70
|
+
assert ex.extract_governing_law("no law clause here")["source"] == "none"
|
|
71
|
+
assert ex.extract_title("", None, "text") is None
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def test_defined_terms_long_and_capped() -> None:
|
|
75
|
+
long_phrase = '"This Is A Very Long Quoted Heading Phrase Indeed"' # > 6 words
|
|
76
|
+
many = " ".join(f'"Term {i}"' for i in range(60))
|
|
77
|
+
terms = [t["term"] for t in ex.extract_defined_terms(long_phrase + " " + many)]
|
|
78
|
+
assert not any("Very Long" in t for t in terms)
|
|
79
|
+
assert len(terms) <= 50
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def test_noise_placeholder_midstring() -> None:
|
|
83
|
+
# Placeholder not at the start -> the mid-string regex branch.
|
|
84
|
+
assert ex._is_noise_clause_title("Fee [ # ]% Cap")
|
|
85
|
+
assert ex._is_noise_clause_title("{placeholder}")
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# --- format / readers -------------------------------------------------------
|
|
89
|
+
|
|
90
|
+
def test_detect_format_by_magic_bytes(tmp_path: Any) -> None:
|
|
91
|
+
p = tmp_path / "x.dat"
|
|
92
|
+
p.write_bytes(b"%PDF-1.4\nrest")
|
|
93
|
+
assert ex._detect_format(p, p.read_bytes()) == "pdf"
|
|
94
|
+
q = tmp_path / "y.dat"
|
|
95
|
+
q.write_bytes(b"PK\x03\x04rest")
|
|
96
|
+
assert ex._detect_format(q, q.read_bytes()) == "docx"
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def test_pdf_stream_without_endstream() -> None:
|
|
100
|
+
assert ex._read_pdf_stdlib(b"%PDF\nstream\n(text) Tj") == ""
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def test_pdf_decompression_budget_break(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
104
|
+
import zlib
|
|
105
|
+
monkeypatch.setattr(ex, "MAX_DECOMPRESSED_BYTES", 10)
|
|
106
|
+
blob = b"%PDF\nstream\n" + zlib.compress(b"(Hello World) Tj " * 10) + b"\nendstream"
|
|
107
|
+
assert ex._read_pdf_stdlib(blob) == "" # exceeds the tiny budget -> bail, no text
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def test_html_malformed_falls_back(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
111
|
+
def boom(self: object, data: object) -> None:
|
|
112
|
+
raise ValueError("bad markup")
|
|
113
|
+
monkeypatch.setattr(ex._HTMLTextExtractor, "feed", boom)
|
|
114
|
+
out = ex._read_html("<p>hello <b>world</b></p>")
|
|
115
|
+
assert "hello" in out and "<" not in out # crude tag-strip fallback
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def test_docx_empty_paragraph_stdlib() -> None:
|
|
119
|
+
w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
|
120
|
+
body = '<w:p/><w:p><w:r><w:t>Hello</w:t></w:r></w:p>'
|
|
121
|
+
doc = f'<?xml version="1.0"?><w:document xmlns:w="{w}"><w:body>{body}</w:body></w:document>'
|
|
122
|
+
buf = io.BytesIO()
|
|
123
|
+
with zipfile.ZipFile(buf, "w") as z:
|
|
124
|
+
z.writestr("[Content_Types].xml", "<Types/>")
|
|
125
|
+
z.writestr("word/document.xml", doc)
|
|
126
|
+
assert "Hello" in ex._read_docx_stdlib(buf.getvalue())
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
# --- clause detection edges -------------------------------------------------
|
|
130
|
+
|
|
131
|
+
def test_clause_heading_on_last_line() -> None:
|
|
132
|
+
clauses = ex.detect_clauses("## First\n\nbody text\n\n## Last") # no trailing newline
|
|
133
|
+
assert clauses[-1]["title"] == "Last"
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def test_two_line_article_skips_non_heading_next_line() -> None:
|
|
137
|
+
text = ("ARTICLE I\n\nThis whole next line is a long running sentence, not a heading at all.\n\n"
|
|
138
|
+
"ARTICLE II\n\nCONFIDENTIALITY\n\nbody\n\nARTICLE III\n\nGOVERNING LAW\n\nbody")
|
|
139
|
+
titles = [c["title"] for c in ex.detect_clauses(text)]
|
|
140
|
+
assert "CONFIDENTIALITY" in titles and "GOVERNING LAW" in titles
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def test_is_low_signal_each_branch() -> None:
|
|
144
|
+
def base() -> dict:
|
|
145
|
+
return {"parties": [], "clauses": [],
|
|
146
|
+
"dates": {"effective": ex._none_field(), "expiration": ex._none_field()},
|
|
147
|
+
"governing_law": ex._none_field(), "defined_terms": []}
|
|
148
|
+
r = base(); r["clauses"] = [{}]; assert ex._is_low_signal(r) is False
|
|
149
|
+
r = base(); r["dates"]["effective"] = ex._field("2024-01-01", 0.85); assert ex._is_low_signal(r) is False
|
|
150
|
+
r = base(); r["governing_law"] = ex._field("X", 0.8); assert ex._is_low_signal(r) is False
|
|
151
|
+
r = base(); r["defined_terms"] = [{"term": "X"}]; assert ex._is_low_signal(r) is False
|
|
152
|
+
assert ex._is_low_signal(base()) is True
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# --- LLM internals (mocked transport) ---------------------------------------
|
|
156
|
+
|
|
157
|
+
class _Resp:
|
|
158
|
+
def __init__(self, body: bytes) -> None:
|
|
159
|
+
self._b = body
|
|
160
|
+
|
|
161
|
+
def read(self) -> bytes:
|
|
162
|
+
return self._b
|
|
163
|
+
|
|
164
|
+
def __enter__(self) -> "_Resp":
|
|
165
|
+
return self
|
|
166
|
+
|
|
167
|
+
def __exit__(self, *a: object) -> bool:
|
|
168
|
+
return False
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def test_llm_request_openai_no_choices(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
172
|
+
monkeypatch.setattr(ex.urllib.request, "urlopen",
|
|
173
|
+
lambda req, timeout=30.0: _Resp(json.dumps({"choices": []}).encode()))
|
|
174
|
+
assert ex._llm_request({"provider": "openai", "api_key": "k"}, "p") is None
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def test_extract_json_object_invalid() -> None:
|
|
178
|
+
assert ex._extract_json_object("prefix {not valid json} suffix") is None
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def test_llm_clause_map_skips() -> None:
|
|
182
|
+
cm = ex._llm_clause_map(
|
|
183
|
+
[{"title": ""}, 123, {"title": "Recitals"}, {"title": "Confidentiality"},
|
|
184
|
+
{"title": "Confidentiality"}], "Confidentiality body")
|
|
185
|
+
assert [c["canonical_title"] for c in cm] == ["Confidentiality"]
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def test_load_llm_config_malformed(monkeypatch: pytest.MonkeyPatch, tmp_path: Any) -> None:
|
|
189
|
+
bad = tmp_path / "llm.json"
|
|
190
|
+
bad.write_text("{not json")
|
|
191
|
+
monkeypatch.setattr(ex, "LLM_CONFIG_PATHS", (bad,))
|
|
192
|
+
assert ex.load_llm_config() is None
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def test_llm_enrich_empty_and_unparseable(monkeypatch: pytest.MonkeyPatch,
|
|
196
|
+
capsys: pytest.CaptureFixture[str]) -> None:
|
|
197
|
+
monkeypatch.setattr(ex, "load_llm_config", lambda: {"provider": "anthropic", "api_key": "k"})
|
|
198
|
+
text = "x"
|
|
199
|
+
monkeypatch.setattr(ex, "_llm_request", lambda c, p, timeout=30.0: "")
|
|
200
|
+
ex.llm_enrich(ex.build_extraction(text, text.encode(), "text", "x.txt"), text, _ns())
|
|
201
|
+
assert "no content" in capsys.readouterr().err
|
|
202
|
+
monkeypatch.setattr(ex, "_llm_request", lambda c, p, timeout=30.0: "not json at all")
|
|
203
|
+
ex.llm_enrich(ex.build_extraction(text, text.encode(), "text", "x.txt"), text, _ns())
|
|
204
|
+
assert "could not parse" in capsys.readouterr().err
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
# --- rendering / CLI edges --------------------------------------------------
|
|
208
|
+
|
|
209
|
+
def test_render_table_unmapped_legend() -> None:
|
|
210
|
+
r = ex.build_extraction("## Zorblax Provisions\n\nbody", b"x", "markdown", "x.md")
|
|
211
|
+
assert "* = not mapped" in ex.render_table(r, no_confidence=False)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def test_render_table_jurisdiction_amounts_signatories() -> None:
|
|
215
|
+
r = ex.build_extraction("body", b"x", "markdown", "x.md")
|
|
216
|
+
r["jurisdiction"] = ex._field("US-DE", ex.CONF_JURISDICTION)
|
|
217
|
+
r["amounts"] = [{"value": "$1", "confidence": 0.6, "source": "deterministic"},
|
|
218
|
+
{"value": "$2", "confidence": 0.6, "source": "deterministic"}]
|
|
219
|
+
r["signatories"] = [{"name": "Jane Doe", "title": "CEO",
|
|
220
|
+
"confidence": ex.CONF_SIGNATORY, "source": "deterministic"}]
|
|
221
|
+
table = ex.render_table(r, no_confidence=False)
|
|
222
|
+
assert "US-DE" in table
|
|
223
|
+
assert "+1 more" in table
|
|
224
|
+
assert "Signatories (1)" in table and "Jane Doe - CEO" in table
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def test_cli_silent_table_suppresses_human_view(capsys: pytest.CaptureFixture[str]) -> None:
|
|
228
|
+
assert ex.main([str(FIXTURES / "nda_h2.md"), "--silent", "--format", "table"]) == 0
|
|
229
|
+
assert "Clause map" not in capsys.readouterr().out
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def test_main_no_args_prints_help(capsys: pytest.CaptureFixture[str]) -> None:
|
|
233
|
+
assert ex.main([]) == 0
|
|
234
|
+
assert "usage" in capsys.readouterr().out.lower()
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
# --- last reachable edges ---------------------------------------------------
|
|
238
|
+
|
|
239
|
+
def test_parties_skips_empty_capture() -> None:
|
|
240
|
+
# The second "party" is just a parenthetical role -> cleans to an empty
|
|
241
|
+
# name and is skipped; the first is kept.
|
|
242
|
+
parties = ex.extract_parties('between Acme Corp and ("Receiving Party")')
|
|
243
|
+
assert [p["name"] for p in parties] == ["Acme Corp"]
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def test_signatories_skips_dupes_short_and_reserved() -> None:
|
|
247
|
+
text = "By: Jane Doe\nName: Jane Doe\nName: a\nName: the\n"
|
|
248
|
+
s = ex.extract_signatories(text)
|
|
249
|
+
assert [x["name"] for x in s] == ["Jane Doe"]
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def test_pdf_text_tj_array_branch() -> None:
|
|
253
|
+
# A TJ array of strings inside a text object.
|
|
254
|
+
assert ex._pdf_text_from_content(b"BT [(Hello) (World)] TJ ET") == "HelloWorld"
|
|
@@ -165,6 +165,13 @@ def test_signatories() -> None:
|
|
|
165
165
|
assert ex.extract_signatories("Name: {party_1_signatory}\nBy: _____________") == []
|
|
166
166
|
|
|
167
167
|
|
|
168
|
+
def test_signatories_two_column_blank_block() -> None:
|
|
169
|
+
# An unsigned two-column block ("By: By:") must NOT capture the next
|
|
170
|
+
# column's label as a name.
|
|
171
|
+
text = "By: By:\nName: Name:\nTitle: Title:\n"
|
|
172
|
+
assert ex.extract_signatories(text) == []
|
|
173
|
+
|
|
174
|
+
|
|
168
175
|
def test_value_money() -> None:
|
|
169
176
|
assert ex.extract_value("a fee of $250,000 is due")["value"] == "$250,000"
|
|
170
177
|
assert ex.extract_value("budget is USD 1.5 million")["value"].startswith("USD")
|
|
@@ -152,6 +152,46 @@ def test_docx_heading_style_helpers() -> None:
|
|
|
152
152
|
# Run-in heading: title is the lead before the sentence body.
|
|
153
153
|
assert ex._docx_heading_title("Payment. Customer will pay the fees.") == "Payment"
|
|
154
154
|
assert ex._docx_heading_title("Governing Law") == "Governing Law"
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def test_emit_docx_paragraph() -> None:
|
|
158
|
+
"""The shared emitter both .docx readers use: heading styles / numbered
|
|
159
|
+
paragraphs become `## headings`, fully-bold lines become `**...**`."""
|
|
160
|
+
out: list[str] = []
|
|
161
|
+
ex._emit_docx_paragraph(out, "Confidentiality", "Heading2", False, False) # heading style
|
|
162
|
+
ex._emit_docx_paragraph(out, "Term", None, True, False) # auto-numbered
|
|
163
|
+
ex._emit_docx_paragraph(out, "Important Notice", None, False, True) # fully bold
|
|
164
|
+
ex._emit_docx_paragraph(out, "Just some body text.", None, False, False) # plain
|
|
165
|
+
ex._emit_docx_paragraph(out, "", None, False, False) # blank
|
|
166
|
+
ex._emit_docx_paragraph(out, "Payment. Customer will pay.", "Heading1", False, False) # run-in
|
|
167
|
+
assert out == [
|
|
168
|
+
"## Confidentiality",
|
|
169
|
+
"## Term",
|
|
170
|
+
"**Important Notice**",
|
|
171
|
+
"Just some body text.",
|
|
172
|
+
"",
|
|
173
|
+
"## Payment",
|
|
174
|
+
"Customer will pay.", # run-in body split onto its own line
|
|
175
|
+
]
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def test_docx_readers_agree_on_clause_map() -> None:
|
|
179
|
+
"""Regression: the python-docx reader must surface the same clause map as the
|
|
180
|
+
stdlib reader on a heading-styled .docx. The python-docx path used to flatten
|
|
181
|
+
heading styles and return an empty clause map. Skips without [docx]."""
|
|
182
|
+
pytest.importorskip("docx")
|
|
183
|
+
path = FIXTURES / "heading_docx.docx"
|
|
184
|
+
raw = path.read_bytes()
|
|
185
|
+
|
|
186
|
+
def clause_titles(prefer_optional: bool) -> list[str]:
|
|
187
|
+
_raw, text, fmt, _w = ex.load_source(path, prefer_optional=prefer_optional)
|
|
188
|
+
result = ex.build_extraction(text, raw, fmt, "h.docx")
|
|
189
|
+
return [c["canonical_title"] for c in result["clauses"]]
|
|
190
|
+
|
|
191
|
+
stdlib = clause_titles(False)
|
|
192
|
+
pydocx = clause_titles(True)
|
|
193
|
+
assert stdlib, "stdlib reader should detect the heading-styled clauses"
|
|
194
|
+
assert pydocx == stdlib, "python-docx path must agree with the stdlib reader"
|
|
155
195
|
# A full sentence carrying a heading style is rejected (not a clause title).
|
|
156
196
|
assert ex._docx_heading_title(
|
|
157
197
|
"Either party may terminate this Agreement upon material breach that "
|
|
@@ -56,6 +56,20 @@ def test_schema_command_emits_committed_spec() -> None:
|
|
|
56
56
|
assert json.loads(proc.stdout) == json.loads(SPEC_FILE.read_text(encoding="utf-8"))
|
|
57
57
|
|
|
58
58
|
|
|
59
|
+
def test_fields_catalog_covers_schema() -> None:
|
|
60
|
+
"""`extract fields` (FIELD_CATALOG) must not silently drift from the output
|
|
61
|
+
schema -- every top-level output field appears in the catalog."""
|
|
62
|
+
schema_top = set(SCHEMA["properties"]) - {"_meta"}
|
|
63
|
+
catalog_prefixes = {f.split(".")[0] for f, _tier, _desc in ex.FIELD_CATALOG}
|
|
64
|
+
assert schema_top - catalog_prefixes == set()
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def test_confidence_scale_is_a_descending_ladder() -> None:
|
|
68
|
+
assert ex.CONF_H2 >= ex.CONF_PARTIES >= ex.CONF_GOVERNING_LAW >= ex.CONF_NUMBERED_HEADING
|
|
69
|
+
assert ex.CONF_ALLCAPS_HEADING >= ex.CONF_TERM >= ex.CONF_WEAK >= ex.CONF_LLM_CLAUSE
|
|
70
|
+
assert 0.0 < ex.CONF_LLM_CLAUSE and ex.CONF_H2 <= 1.0
|
|
71
|
+
|
|
72
|
+
|
|
59
73
|
def test_schema_is_self_describing() -> None:
|
|
60
74
|
assert SCHEMA["$schema"] == "https://json-schema.org/draft/2020-12/schema"
|
|
61
75
|
assert "extract-cli" in SCHEMA["title"]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|