extract-cli 0.1.8__tar.gz → 0.1.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. {extract_cli-0.1.8 → extract_cli-0.1.10}/CHANGELOG.md +54 -0
  2. {extract_cli-0.1.8 → extract_cli-0.1.10}/Makefile +4 -1
  3. {extract_cli-0.1.8 → extract_cli-0.1.10}/PKG-INFO +5 -2
  4. {extract_cli-0.1.8 → extract_cli-0.1.10}/README.md +4 -1
  5. {extract_cli-0.1.8 → extract_cli-0.1.10}/docs/INTEROP.md +6 -4
  6. {extract_cli-0.1.8 → extract_cli-0.1.10}/docs/spec/extract-output.schema.json +58 -0
  7. {extract_cli-0.1.8 → extract_cli-0.1.10}/extract_cli.py +207 -44
  8. {extract_cli-0.1.8 → extract_cli-0.1.10}/pyproject.toml +1 -1
  9. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/employment_docx.docx.expected.json +14 -1
  10. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/heading_docx.docx.expected.json +8 -1
  11. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/lease_allcaps.txt.expected.json +14 -1
  12. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/license_pdf.pdf.expected.json +14 -1
  13. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/nda_h2.md.expected.json +8 -1
  14. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/numbered_docx.docx.expected.json +8 -1
  15. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/scanned.pdf.expected.json +8 -1
  16. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/services_bold.txt.expected.json +14 -1
  17. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/services_html.html.expected.json +17 -4
  18. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/test_clause_map.py +15 -0
  19. extract_cli-0.1.10/tests/test_coverage.py +241 -0
  20. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/test_deterministic.py +25 -0
  21. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/test_misc.py +104 -0
  22. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/test_property.py +18 -0
  23. {extract_cli-0.1.8 → extract_cli-0.1.10}/.gitignore +0 -0
  24. {extract_cli-0.1.8 → extract_cli-0.1.10}/AGENTS.md +0 -0
  25. {extract_cli-0.1.8 → extract_cli-0.1.10}/ARCHITECTURE.md +0 -0
  26. {extract_cli-0.1.8 → extract_cli-0.1.10}/CONTRIBUTING.md +0 -0
  27. {extract_cli-0.1.8 → extract_cli-0.1.10}/LICENSE +0 -0
  28. {extract_cli-0.1.8 → extract_cli-0.1.10}/config/llm.json.example +0 -0
  29. {extract_cli-0.1.8 → extract_cli-0.1.10}/llms.txt +0 -0
  30. {extract_cli-0.1.8 → extract_cli-0.1.10}/scripts/release.py +0 -0
  31. {extract_cli-0.1.8 → extract_cli-0.1.10}/scripts/validate_against_spec.py +0 -0
  32. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/_fixtures_build.py +0 -0
  33. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/_make_goldens.py +0 -0
  34. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/_schema_validator.py +0 -0
  35. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/conftest.py +0 -0
  36. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/employment_docx.docx +0 -0
  37. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/heading_docx.docx +0 -0
  38. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/lease_allcaps.txt +0 -0
  39. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/license_pdf.pdf +0 -0
  40. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/nda_h2.md +0 -0
  41. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/numbered_docx.docx +0 -0
  42. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/scanned.pdf +0 -0
  43. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/services_bold.txt +0 -0
  44. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/fixtures/services_html.html +0 -0
  45. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/test_cli.py +0 -0
  46. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/test_llm.py +0 -0
  47. {extract_cli-0.1.8 → extract_cli-0.1.10}/tests/test_schema_conformance.py +0 -0
@@ -6,6 +6,58 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
6
6
  (see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
7
7
  the output schema require a major version bump**; new optional fields are minor.
8
8
 
9
+ ## [0.1.10] - 2026-05-22
10
+
11
+ ### Fixed
12
+ - **The `[docx]` (python-docx) reader now honors Word heading styles**, matching
13
+ the stdlib reader. Previously the python-docx path concatenated paragraph text
14
+ and dropped `Heading1-9`/`Title` styles and `w:numPr` numbering, so installing
15
+ the `[docx]` extra produced an **empty clause map** on heading-styled Word
16
+ contracts (worse than the no-extra stdlib reader). Both readers now share one
17
+ emitter (`_emit_docx_paragraph`) that turns heading-styled / auto-numbered
18
+ paragraphs into `## headings`, so the two paths agree. New tests:
19
+ `test_emit_docx_paragraph` and `test_docx_readers_agree_on_clause_map` (the
20
+ latter asserts the python-docx and stdlib readers produce the same clause map).
21
+ No output-schema change.
22
+
23
+ ### Tests / quality
24
+ - **Line coverage raised to 100%** (was 92%, or 94% with the `[docx]`/`[pdf]` extras). Added a targeted test battery
25
+ for the remaining reachable branches (color/`FORCE_COLOR`, `_warn` silent,
26
+ date/jurisdiction/title/clause edge returns, LLM request/parse/clause-map
27
+ branches, PDF `TJ`-array + stream/budget edges, HTML malformed fallback, DOCX
28
+ empty paragraph, `_is_low_signal` branches, CLI silent/help paths). Genuinely
29
+ unreachable defensive lines and `[docx]`/`[pdf]`-extra fidelity branches are
30
+ marked `# pragma: no cover`. `make coverage` now installs the extras and
31
+ enforces `--fail-under=100`; a CI `coverage` job gates it. No code-behavior or
32
+ schema change.
33
+
34
+ ## [0.1.9] - 2026-05-22
35
+
36
+ ### Security / robustness
37
+ - **Resource bounds for untrusted input.** A hard on-disk file cap
38
+ (`MAX_INPUT_BYTES`, 100 MB) and a decompressed-size cap
39
+ (`MAX_DECOMPRESSED_BYTES`, 200 MB) so a zip-bomb `.docx` or zlib-bomb `.pdf`
40
+ can't exhaust memory: the DOCX reader checks `word/document.xml`'s
41
+ uncompressed size before reading, and the PDF reader decompresses streams
42
+ with a bounded budget. Both degrade gracefully (warning, empty text), never
43
+ crash. (Verified fast/bounded on a 2 MB doc: ~0.6 s, ~10 MB peak.)
44
+
45
+ ### Added (output schema — minor, backward-compatible additions)
46
+ - **`jurisdiction`** — governing law normalized to a stable code
47
+ (`State of Delaware` → `US-DE`, `Province of Ontario` → `CA-ON`, …).
48
+ - **`amounts[]`** — every distinct monetary amount (`value` remains the headline one).
49
+ - **`signatories[]`** — `{name, title}` from signature blocks (`By:` / `Name:` /
50
+ `Title:`); empty on unsigned templates.
51
+
52
+ ### Changed
53
+ - **Clause vocabulary round 2** (from the corpus survey): canonical
54
+ `Suspension`, `Support`, `Service Levels` + `invoicing`→Payment,
55
+ customer-data/protection-by-* → Data Protection. Noise filter now also drops
56
+ recitals/preamble/signature sections, definition fragments (a title starting
57
+ with a quote), and unfilled placeholders (`[ # ]%`). Mapped clause coverage
58
+ across the 58-document corpus rose from 57% to **64%**, with no over-matching.
59
+ - Test coverage raised to **92%** (94% with the `[docx]`/`[pdf]` extras).
60
+
9
61
  ## [0.1.8] - 2026-05-22
10
62
 
11
63
  Clause-detection breadth, driven by a 58-document real-corpus survey.
@@ -244,6 +296,8 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
244
296
  intentionally *not* governed by the output schema (the schema describes the
245
297
  full default output).
246
298
 
299
+ [0.1.10]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.10
300
+ [0.1.9]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.9
247
301
  [0.1.8]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.8
248
302
  [0.1.7]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.7
249
303
  [0.1.6]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.6
@@ -31,8 +31,11 @@ test-quick:
31
31
  $(PYTHON) -m pytest -x -q -k "not property"
32
32
 
33
33
  coverage:
34
+ # Install the [docx]/[pdf] extras so the fidelity-reader paths execute too;
35
+ # without them two extras-only branches stay uncovered (98% vs 100%).
36
+ $(PIP) install -q -e ".[dev,docx,pdf]"
34
37
  $(PYTHON) -m coverage run --source=extract_cli -m pytest -q
35
- $(PYTHON) -m coverage report -m
38
+ $(PYTHON) -m coverage report -m --fail-under=100
36
39
 
37
40
  typecheck:
38
41
  $(PYTHON) -m mypy --strict extract_cli.py
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-cli
3
- Version: 0.1.8
3
+ Version: 0.1.10
4
4
  Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
5
5
  Project-URL: Homepage, https://cli.drbaher.com/
6
6
  Project-URL: Repository, https://github.com/DrBaher/extract-cli
@@ -174,10 +174,13 @@ Streams follow the suite convention: **stdout** is the machine payload (JSON),
174
174
  "dates": { "effective": { "value": "2024-03-01", "confidence": 0.85, "source": "deterministic" }, "expiration": { "value": null, "confidence": 0.0, "source": "none" } },
175
175
  "term": { "length": { "value": "3 years", ... }, "auto_renew": { "value": true, ... }, "notice_period_days": { "value": 60, ... } },
176
176
  "governing_law": { "value": "State of Delaware", "confidence": 0.85, "source": "deterministic" },
177
+ "jurisdiction": { "value": "US-DE", "confidence": 0.8, "source": "deterministic" },
177
178
  "clauses": [ { "canonical_title": "Confidentiality", "detected_title": "## Confidentiality Obligations", "tier": "h2", "span": {"start": 0, "end": 120}, "confidence": 0.95, "source": "deterministic", "mapped": true } ],
178
179
  "defined_terms": [ { "term": "Confidential Information", "confidence": 0.6, "source": "deterministic" } ],
179
180
  "value": { "value": "$50,000", "confidence": 0.6, "source": "deterministic" },
180
- "_meta": { "extractor_version": "0.1.0", "tiers_used": ["deterministic"], "llm_used": false }
181
+ "amounts": [ { "value": "$50,000", "confidence": 0.6, "source": "deterministic" } ],
182
+ "signatories": [ { "name": "Jane Doe", "title": "CEO", "confidence": 0.55, "source": "deterministic" } ],
183
+ "_meta": { "extractor_version": "0.1.9", "tiers_used": ["deterministic"], "llm_used": false }
181
184
  }
182
185
  ```
183
186
 
@@ -136,10 +136,13 @@ Streams follow the suite convention: **stdout** is the machine payload (JSON),
136
136
  "dates": { "effective": { "value": "2024-03-01", "confidence": 0.85, "source": "deterministic" }, "expiration": { "value": null, "confidence": 0.0, "source": "none" } },
137
137
  "term": { "length": { "value": "3 years", ... }, "auto_renew": { "value": true, ... }, "notice_period_days": { "value": 60, ... } },
138
138
  "governing_law": { "value": "State of Delaware", "confidence": 0.85, "source": "deterministic" },
139
+ "jurisdiction": { "value": "US-DE", "confidence": 0.8, "source": "deterministic" },
139
140
  "clauses": [ { "canonical_title": "Confidentiality", "detected_title": "## Confidentiality Obligations", "tier": "h2", "span": {"start": 0, "end": 120}, "confidence": 0.95, "source": "deterministic", "mapped": true } ],
140
141
  "defined_terms": [ { "term": "Confidential Information", "confidence": 0.6, "source": "deterministic" } ],
141
142
  "value": { "value": "$50,000", "confidence": 0.6, "source": "deterministic" },
142
- "_meta": { "extractor_version": "0.1.0", "tiers_used": ["deterministic"], "llm_used": false }
143
+ "amounts": [ { "value": "$50,000", "confidence": 0.6, "source": "deterministic" } ],
144
+ "signatories": [ { "name": "Jane Doe", "title": "CEO", "confidence": 0.55, "source": "deterministic" } ],
145
+ "_meta": { "extractor_version": "0.1.9", "tiers_used": ["deterministic"], "llm_used": false }
143
146
  }
144
147
  ```
145
148
 
@@ -52,10 +52,12 @@ is a self-contained reference validator.
52
52
 
53
53
  Top-level keys: `document` {title, format, sha256, source_path}, `parties[]`,
54
54
  `dates` {effective, expiration}, `term` {length, auto_renew,
55
- notice_period_days, *renewal_mechanics?*}, `governing_law`, `clauses[]`
56
- {canonical_title, detected_title, tier, span, confidence, source, mapped},
57
- `defined_terms[]`, `value`, *`obligations[]?`*, and `_meta` {extractor_version,
58
- tiers_used, llm_used}. **Every extracted field carries a `confidence` (0–1) and
55
+ notice_period_days, *renewal_mechanics?*}, `governing_law`, `jurisdiction`
56
+ (normalized code, e.g. `US-DE`), `clauses[]` {canonical_title, detected_title,
57
+ tier, span, confidence, source, mapped}, `defined_terms[]`, `value`,
58
+ `amounts[]` (all monetary amounts), `signatories[]` {name, title}, *`obligations[]?`*,
59
+ and `_meta` {extractor_version, tiers_used, llm_used}. Formats: markdown, text,
60
+ html, docx, pdf. **Every extracted field carries a `confidence` (0–1) and
59
61
  a `source` ∈ {deterministic, llm, none}.** Scalar fields use the envelope
60
62
  `{value, confidence, source}`; "not found" is `{value: null, confidence: 0.0,
61
63
  source: "none"}`. Italic fields are added only under `--llm`.
@@ -10,9 +10,12 @@
10
10
  "dates",
11
11
  "term",
12
12
  "governing_law",
13
+ "jurisdiction",
13
14
  "clauses",
14
15
  "defined_terms",
15
16
  "value",
17
+ "amounts",
18
+ "signatories",
16
19
  "_meta"
17
20
  ],
18
21
  "additionalProperties": false,
@@ -157,6 +160,9 @@
157
160
  "governing_law": {
158
161
  "$ref": "#/$defs/field"
159
162
  },
163
+ "jurisdiction": {
164
+ "$ref": "#/$defs/field"
165
+ },
160
166
  "clauses": {
161
167
  "type": "array",
162
168
  "items": {
@@ -247,6 +253,58 @@
247
253
  "value": {
248
254
  "$ref": "#/$defs/field"
249
255
  },
256
+ "amounts": {
257
+ "type": "array",
258
+ "items": {
259
+ "type": "object",
260
+ "required": [
261
+ "value",
262
+ "confidence",
263
+ "source"
264
+ ],
265
+ "properties": {
266
+ "value": {
267
+ "type": "string"
268
+ },
269
+ "confidence": {
270
+ "$ref": "#/$defs/confidence"
271
+ },
272
+ "source": {
273
+ "$ref": "#/$defs/source"
274
+ }
275
+ },
276
+ "additionalProperties": false
277
+ }
278
+ },
279
+ "signatories": {
280
+ "type": "array",
281
+ "items": {
282
+ "type": "object",
283
+ "required": [
284
+ "name",
285
+ "confidence",
286
+ "source"
287
+ ],
288
+ "properties": {
289
+ "name": {
290
+ "type": "string"
291
+ },
292
+ "title": {
293
+ "type": [
294
+ "string",
295
+ "null"
296
+ ]
297
+ },
298
+ "confidence": {
299
+ "$ref": "#/$defs/confidence"
300
+ },
301
+ "source": {
302
+ "$ref": "#/$defs/source"
303
+ }
304
+ },
305
+ "additionalProperties": false
306
+ }
307
+ },
250
308
  "obligations": {
251
309
  "type": "array",
252
310
  "items": {