extract-cli 0.1.8__tar.gz → 0.1.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {extract_cli-0.1.8 → extract_cli-0.1.10}/CHANGELOG.md +54 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/Makefile +4 -1
- {extract_cli-0.1.8 → extract_cli-0.1.10}/PKG-INFO +5 -2
- {extract_cli-0.1.8 → extract_cli-0.1.10}/README.md +4 -1
- {extract_cli-0.1.8 → extract_cli-0.1.10}/docs/INTEROP.md +6 -4
- {extract_cli-0.1.8 → extract_cli-0.1.10}/docs/spec/extract-output.schema.json +58 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/extract_cli.py +207 -44
- {extract_cli-0.1.8 → extract_cli-0.1.10}/pyproject.toml +1 -1
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/employment_docx.docx.expected.json +14 -1
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/heading_docx.docx.expected.json +8 -1
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/lease_allcaps.txt.expected.json +14 -1
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/license_pdf.pdf.expected.json +14 -1
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/nda_h2.md.expected.json +8 -1
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/numbered_docx.docx.expected.json +8 -1
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/scanned.pdf.expected.json +8 -1
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/services_bold.txt.expected.json +14 -1
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/services_html.html.expected.json +17 -4
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/test_clause_map.py +15 -0
- extract_cli-0.1.10/tests/test_coverage.py +241 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/test_deterministic.py +25 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/test_misc.py +104 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/test_property.py +18 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/.gitignore +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/AGENTS.md +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/ARCHITECTURE.md +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/CONTRIBUTING.md +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/LICENSE +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/config/llm.json.example +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/llms.txt +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/scripts/release.py +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/scripts/validate_against_spec.py +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/_fixtures_build.py +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/_make_goldens.py +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/_schema_validator.py +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/conftest.py +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/employment_docx.docx +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/heading_docx.docx +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/lease_allcaps.txt +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/license_pdf.pdf +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/nda_h2.md +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/numbered_docx.docx +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/scanned.pdf +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/services_bold.txt +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/services_html.html +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/test_cli.py +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/test_llm.py +0 -0
- {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/test_schema_conformance.py +0 -0
|
@@ -6,6 +6,58 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
|
|
|
6
6
|
(see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
|
|
7
7
|
the output schema require a major version bump**; new optional fields are minor.
|
|
8
8
|
|
|
9
|
+
## [0.1.10] - 2026-05-22
|
|
10
|
+
|
|
11
|
+
### Fixed
|
|
12
|
+
- **The `[docx]` (python-docx) reader now honors Word heading styles**, matching
|
|
13
|
+
the stdlib reader. Previously the python-docx path concatenated paragraph text
|
|
14
|
+
and dropped `Heading1-9`/`Title` styles and `w:numPr` numbering, so installing
|
|
15
|
+
the `[docx]` extra produced an **empty clause map** on heading-styled Word
|
|
16
|
+
contracts (worse than the no-extra stdlib reader). Both readers now share one
|
|
17
|
+
emitter (`_emit_docx_paragraph`) that turns heading-styled / auto-numbered
|
|
18
|
+
paragraphs into `## headings`, so the two paths agree. New tests:
|
|
19
|
+
`test_emit_docx_paragraph` and `test_docx_readers_agree_on_clause_map` (the
|
|
20
|
+
latter asserts the python-docx and stdlib readers produce the same clause map).
|
|
21
|
+
No output-schema change.
|
|
22
|
+
|
|
23
|
+
### Tests / quality
|
|
24
|
+
- **Line coverage raised to 100%** (was 92%/94%). Added a targeted test battery
|
|
25
|
+
for the remaining reachable branches (color/`FORCE_COLOR`, `_warn` silent,
|
|
26
|
+
date/jurisdiction/title/clause edge returns, LLM request/parse/clause-map
|
|
27
|
+
branches, PDF `TJ`-array + stream/budget edges, HTML malformed fallback, DOCX
|
|
28
|
+
empty paragraph, `_is_low_signal` branches, CLI silent/help paths). Genuinely
|
|
29
|
+
unreachable defensive lines and `[docx]`/`[pdf]`-extra fidelity branches are
|
|
30
|
+
marked `# pragma: no cover`. `make coverage` now installs the extras and
|
|
31
|
+
enforces `--fail-under=100`; a CI `coverage` job gates it. No code-behavior or
|
|
32
|
+
schema change.
|
|
33
|
+
|
|
34
|
+
## [0.1.9] - 2026-05-22
|
|
35
|
+
|
|
36
|
+
### Security / robustness
|
|
37
|
+
- **Resource bounds for untrusted input.** A hard on-disk file cap
|
|
38
|
+
(`MAX_INPUT_BYTES`, 100 MB) and a decompressed-size cap
|
|
39
|
+
(`MAX_DECOMPRESSED_BYTES`, 200 MB) so a zip-bomb `.docx` or zlib-bomb `.pdf`
|
|
40
|
+
can't exhaust memory: the DOCX reader checks `word/document.xml`'s
|
|
41
|
+
uncompressed size before reading, and the PDF reader decompresses streams
|
|
42
|
+
with a bounded budget. Both degrade gracefully (warning, empty text), never
|
|
43
|
+
crash. (Verified fast/bounded on a 2 MB doc: ~0.6 s, ~10 MB peak.)
|
|
44
|
+
|
|
45
|
+
### Added (output schema — minor, backward-compatible additions)
|
|
46
|
+
- **`jurisdiction`** — governing law normalized to a stable code
|
|
47
|
+
(`State of Delaware` → `US-DE`, `Province of Ontario` → `CA-ON`, …).
|
|
48
|
+
- **`amounts[]`** — every distinct monetary amount (`value` remains the headline one).
|
|
49
|
+
- **`signatories[]`** — `{name, title}` from signature blocks (`By:` / `Name:` /
|
|
50
|
+
`Title:`); empty on unsigned templates.
|
|
51
|
+
|
|
52
|
+
### Changed
|
|
53
|
+
- **Clause vocabulary round 2** (from the corpus survey): canonical
|
|
54
|
+
`Suspension`, `Support`, `Service Levels` + `invoicing`→Payment,
|
|
55
|
+
customer-data/protection-by-* → Data Protection. Noise filter now also drops
|
|
56
|
+
recitals/preamble/signature sections, definition fragments (a title starting
|
|
57
|
+
with a quote), and unfilled placeholders (`[ # ]%`). Mapped clause coverage
|
|
58
|
+
across the 58-document corpus rose from 57% to **64%**, with no over-matching.
|
|
59
|
+
- Test coverage raised to **92%** (94% with the `[docx]`/`[pdf]` extras).
|
|
60
|
+
|
|
9
61
|
## [0.1.8] - 2026-05-22
|
|
10
62
|
|
|
11
63
|
Clause-detection breadth, driven by a 58-document real-corpus survey.
|
|
@@ -244,6 +296,8 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
|
|
|
244
296
|
intentionally *not* governed by the output schema (the schema describes the
|
|
245
297
|
full default output).
|
|
246
298
|
|
|
299
|
+
[0.1.10]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.10
|
|
300
|
+
[0.1.9]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.9
|
|
247
301
|
[0.1.8]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.8
|
|
248
302
|
[0.1.7]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.7
|
|
249
303
|
[0.1.6]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.6
|
|
@@ -31,8 +31,11 @@ test-quick:
|
|
|
31
31
|
$(PYTHON) -m pytest -x -q -k "not property"
|
|
32
32
|
|
|
33
33
|
coverage:
|
|
34
|
+
# Install the [docx]/[pdf] extras so the fidelity-reader paths execute too;
|
|
35
|
+
# without them two extras-only branches stay uncovered (98% vs 100%).
|
|
36
|
+
$(PIP) install -q -e ".[dev,docx,pdf]"
|
|
34
37
|
$(PYTHON) -m coverage run --source=extract_cli -m pytest -q
|
|
35
|
-
$(PYTHON) -m coverage report -m
|
|
38
|
+
$(PYTHON) -m coverage report -m --fail-under=100
|
|
36
39
|
|
|
37
40
|
typecheck:
|
|
38
41
|
$(PYTHON) -m mypy --strict extract_cli.py
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: extract-cli
|
|
3
|
-
Version: 0.1.8
|
|
3
|
+
Version: 0.1.10
|
|
4
4
|
Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
|
|
5
5
|
Project-URL: Homepage, https://cli.drbaher.com/
|
|
6
6
|
Project-URL: Repository, https://github.com/DrBaher/extract-cli
|
|
@@ -174,10 +174,13 @@ Streams follow the suite convention: **stdout** is the machine payload (JSON),
|
|
|
174
174
|
"dates": { "effective": { "value": "2024-03-01", "confidence": 0.85, "source": "deterministic" }, "expiration": { "value": null, "confidence": 0.0, "source": "none" } },
|
|
175
175
|
"term": { "length": { "value": "3 years", ... }, "auto_renew": { "value": true, ... }, "notice_period_days": { "value": 60, ... } },
|
|
176
176
|
"governing_law": { "value": "State of Delaware", "confidence": 0.85, "source": "deterministic" },
|
|
177
|
+
"jurisdiction": { "value": "US-DE", "confidence": 0.8, "source": "deterministic" },
|
|
177
178
|
"clauses": [ { "canonical_title": "Confidentiality", "detected_title": "## Confidentiality Obligations", "tier": "h2", "span": {"start": 0, "end": 120}, "confidence": 0.95, "source": "deterministic", "mapped": true } ],
|
|
178
179
|
"defined_terms": [ { "term": "Confidential Information", "confidence": 0.6, "source": "deterministic" } ],
|
|
179
180
|
"value": { "value": "$50,000", "confidence": 0.6, "source": "deterministic" },
|
|
180
|
-
"
|
|
181
|
+
"amounts": [ { "value": "$50,000", "confidence": 0.6, "source": "deterministic" } ],
|
|
182
|
+
"signatories": [ { "name": "Jane Doe", "title": "CEO", "confidence": 0.55, "source": "deterministic" } ],
|
|
183
|
+
"_meta": { "extractor_version": "0.1.9", "tiers_used": ["deterministic"], "llm_used": false }
|
|
181
184
|
}
|
|
182
185
|
```
|
|
183
186
|
|
|
@@ -136,10 +136,13 @@ Streams follow the suite convention: **stdout** is the machine payload (JSON),
|
|
|
136
136
|
"dates": { "effective": { "value": "2024-03-01", "confidence": 0.85, "source": "deterministic" }, "expiration": { "value": null, "confidence": 0.0, "source": "none" } },
|
|
137
137
|
"term": { "length": { "value": "3 years", ... }, "auto_renew": { "value": true, ... }, "notice_period_days": { "value": 60, ... } },
|
|
138
138
|
"governing_law": { "value": "State of Delaware", "confidence": 0.85, "source": "deterministic" },
|
|
139
|
+
"jurisdiction": { "value": "US-DE", "confidence": 0.8, "source": "deterministic" },
|
|
139
140
|
"clauses": [ { "canonical_title": "Confidentiality", "detected_title": "## Confidentiality Obligations", "tier": "h2", "span": {"start": 0, "end": 120}, "confidence": 0.95, "source": "deterministic", "mapped": true } ],
|
|
140
141
|
"defined_terms": [ { "term": "Confidential Information", "confidence": 0.6, "source": "deterministic" } ],
|
|
141
142
|
"value": { "value": "$50,000", "confidence": 0.6, "source": "deterministic" },
|
|
142
|
-
"
|
|
143
|
+
"amounts": [ { "value": "$50,000", "confidence": 0.6, "source": "deterministic" } ],
|
|
144
|
+
"signatories": [ { "name": "Jane Doe", "title": "CEO", "confidence": 0.55, "source": "deterministic" } ],
|
|
145
|
+
"_meta": { "extractor_version": "0.1.9", "tiers_used": ["deterministic"], "llm_used": false }
|
|
143
146
|
}
|
|
144
147
|
```
|
|
145
148
|
|
|
@@ -52,10 +52,12 @@ is a self-contained reference validator.
|
|
|
52
52
|
|
|
53
53
|
Top-level keys: `document` {title, format, sha256, source_path}, `parties[]`,
|
|
54
54
|
`dates` {effective, expiration}, `term` {length, auto_renew,
|
|
55
|
-
notice_period_days, *renewal_mechanics?*}, `governing_law`, `
|
|
56
|
-
|
|
57
|
-
`defined_terms[]`, `value`,
|
|
58
|
-
|
|
55
|
+
notice_period_days, *renewal_mechanics?*}, `governing_law`, `jurisdiction`
|
|
56
|
+
(normalized code, e.g. `US-DE`), `clauses[]` {canonical_title, detected_title,
|
|
57
|
+
tier, span, confidence, source, mapped}, `defined_terms[]`, `value`,
|
|
58
|
+
`amounts[]` (all monetary amounts), `signatories[]` {name, title}, *`obligations[]?`*,
|
|
59
|
+
and `_meta` {extractor_version, tiers_used, llm_used}. Formats: markdown, text,
|
|
60
|
+
html, docx, pdf. **Every extracted field carries a `confidence` (0–1) and
|
|
59
61
|
a `source` ∈ {deterministic, llm, none}.** Scalar fields use the envelope
|
|
60
62
|
`{value, confidence, source}`; "not found" is `{value: null, confidence: 0.0,
|
|
61
63
|
source: "none"}`. Italic fields are added only under `--llm`.
|
|
@@ -10,9 +10,12 @@
|
|
|
10
10
|
"dates",
|
|
11
11
|
"term",
|
|
12
12
|
"governing_law",
|
|
13
|
+
"jurisdiction",
|
|
13
14
|
"clauses",
|
|
14
15
|
"defined_terms",
|
|
15
16
|
"value",
|
|
17
|
+
"amounts",
|
|
18
|
+
"signatories",
|
|
16
19
|
"_meta"
|
|
17
20
|
],
|
|
18
21
|
"additionalProperties": false,
|
|
@@ -157,6 +160,9 @@
|
|
|
157
160
|
"governing_law": {
|
|
158
161
|
"$ref": "#/$defs/field"
|
|
159
162
|
},
|
|
163
|
+
"jurisdiction": {
|
|
164
|
+
"$ref": "#/$defs/field"
|
|
165
|
+
},
|
|
160
166
|
"clauses": {
|
|
161
167
|
"type": "array",
|
|
162
168
|
"items": {
|
|
@@ -247,6 +253,58 @@
|
|
|
247
253
|
"value": {
|
|
248
254
|
"$ref": "#/$defs/field"
|
|
249
255
|
},
|
|
256
|
+
"amounts": {
|
|
257
|
+
"type": "array",
|
|
258
|
+
"items": {
|
|
259
|
+
"type": "object",
|
|
260
|
+
"required": [
|
|
261
|
+
"value",
|
|
262
|
+
"confidence",
|
|
263
|
+
"source"
|
|
264
|
+
],
|
|
265
|
+
"properties": {
|
|
266
|
+
"value": {
|
|
267
|
+
"type": "string"
|
|
268
|
+
},
|
|
269
|
+
"confidence": {
|
|
270
|
+
"$ref": "#/$defs/confidence"
|
|
271
|
+
},
|
|
272
|
+
"source": {
|
|
273
|
+
"$ref": "#/$defs/source"
|
|
274
|
+
}
|
|
275
|
+
},
|
|
276
|
+
"additionalProperties": false
|
|
277
|
+
}
|
|
278
|
+
},
|
|
279
|
+
"signatories": {
|
|
280
|
+
"type": "array",
|
|
281
|
+
"items": {
|
|
282
|
+
"type": "object",
|
|
283
|
+
"required": [
|
|
284
|
+
"name",
|
|
285
|
+
"confidence",
|
|
286
|
+
"source"
|
|
287
|
+
],
|
|
288
|
+
"properties": {
|
|
289
|
+
"name": {
|
|
290
|
+
"type": "string"
|
|
291
|
+
},
|
|
292
|
+
"title": {
|
|
293
|
+
"type": [
|
|
294
|
+
"string",
|
|
295
|
+
"null"
|
|
296
|
+
]
|
|
297
|
+
},
|
|
298
|
+
"confidence": {
|
|
299
|
+
"$ref": "#/$defs/confidence"
|
|
300
|
+
},
|
|
301
|
+
"source": {
|
|
302
|
+
"$ref": "#/$defs/source"
|
|
303
|
+
}
|
|
304
|
+
},
|
|
305
|
+
"additionalProperties": false
|
|
306
|
+
}
|
|
307
|
+
},
|
|
250
308
|
"obligations": {
|
|
251
309
|
"type": "array",
|
|
252
310
|
"items": {
|