extract-cli 0.1.6__tar.gz → 0.1.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. extract_cli-0.1.8/AGENTS.md +87 -0
  2. {extract_cli-0.1.6 → extract_cli-0.1.8}/CHANGELOG.md +48 -0
  3. {extract_cli-0.1.6 → extract_cli-0.1.8}/PKG-INFO +28 -2
  4. {extract_cli-0.1.6 → extract_cli-0.1.8}/README.md +26 -0
  5. {extract_cli-0.1.6 → extract_cli-0.1.8}/docs/INTEROP.md +1 -0
  6. {extract_cli-0.1.6 → extract_cli-0.1.8}/extract_cli.py +187 -14
  7. extract_cli-0.1.8/llms.txt +79 -0
  8. {extract_cli-0.1.6 → extract_cli-0.1.8}/pyproject.toml +7 -2
  9. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/_fixtures_build.py +36 -2
  10. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/_make_goldens.py +2 -2
  11. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/conftest.py +1 -0
  12. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/employment_docx.docx.expected.json +1 -1
  13. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/heading_docx.docx.expected.json +1 -1
  14. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/lease_allcaps.txt.expected.json +1 -1
  15. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/license_pdf.pdf.expected.json +1 -1
  16. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/nda_h2.md.expected.json +1 -1
  17. extract_cli-0.1.8/tests/fixtures/numbered_docx.docx +0 -0
  18. extract_cli-0.1.8/tests/fixtures/numbered_docx.docx.expected.json +142 -0
  19. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/scanned.pdf.expected.json +1 -1
  20. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/services_bold.txt.expected.json +1 -1
  21. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/services_html.html.expected.json +1 -1
  22. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/test_clause_map.py +23 -0
  23. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/test_cli.py +63 -1
  24. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/test_misc.py +12 -0
  25. {extract_cli-0.1.6 → extract_cli-0.1.8}/.gitignore +0 -0
  26. {extract_cli-0.1.6 → extract_cli-0.1.8}/ARCHITECTURE.md +0 -0
  27. {extract_cli-0.1.6 → extract_cli-0.1.8}/CONTRIBUTING.md +0 -0
  28. {extract_cli-0.1.6 → extract_cli-0.1.8}/LICENSE +0 -0
  29. {extract_cli-0.1.6 → extract_cli-0.1.8}/Makefile +0 -0
  30. {extract_cli-0.1.6 → extract_cli-0.1.8}/config/llm.json.example +0 -0
  31. {extract_cli-0.1.6 → extract_cli-0.1.8}/docs/spec/extract-output.schema.json +0 -0
  32. {extract_cli-0.1.6 → extract_cli-0.1.8}/scripts/release.py +0 -0
  33. {extract_cli-0.1.6 → extract_cli-0.1.8}/scripts/validate_against_spec.py +0 -0
  34. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/_schema_validator.py +0 -0
  35. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/employment_docx.docx +0 -0
  36. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/heading_docx.docx +0 -0
  37. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/lease_allcaps.txt +0 -0
  38. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/license_pdf.pdf +0 -0
  39. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/nda_h2.md +0 -0
  40. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/scanned.pdf +0 -0
  41. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/services_bold.txt +0 -0
  42. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/fixtures/services_html.html +0 -0
  43. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/test_deterministic.py +0 -0
  44. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/test_llm.py +0 -0
  45. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/test_property.py +0 -0
  46. {extract_cli-0.1.6 → extract_cli-0.1.8}/tests/test_schema_conformance.py +0 -0
@@ -0,0 +1,87 @@
1
+ # Agents
2
+
3
+ Drive `extract-cli` from an LLM agent or non-interactive client. Same agent
4
+ contract as the rest of the contract-ops suite: a stable machine-readable
5
+ catalog, JSON on stdout, humans on stderr, and a small documented exit-code set.
6
+
7
+ `extract-cli` is the suite's **open-loop front door**: hand it any contract
8
+ (`.md` / `.txt` / `.html` / `.docx` / `.pdf`, yours or a counterparty's) and it
9
+ returns structured JSON the rest of the pipeline can consume. Every field
10
+ carries a `confidence` and a `source` — **verify, don't trust**.
11
+
12
+ ## Output contract
13
+
14
+ - **Success**: a single JSON object to **stdout**, exit `0`. This is the machine
15
+ payload; it's the default (no `--json` needed, though `--json` forces it).
16
+ - Every extracted scalar is the envelope `{value, confidence, source}`;
17
+ "not found" is the canonical `{value: null, confidence: 0.0, source: "none"}`.
18
+ Lists (`parties`, `clauses`, `defined_terms`) carry per-item
19
+ `confidence`/`source`. `source ∈ {deterministic, llm, none}`.
20
+ - `_meta` records `extractor_version`, `tiers_used`, and `llm_used`.
21
+ - The output shape is locked by a JSON Schema —
22
+ [`docs/spec/extract-output.schema.json`](docs/spec/extract-output.schema.json),
23
+ also printed by `extract schema`. Validate against it instead of trusting
24
+ field shapes by convention. (Note: the `--no-confidence` projection is a
25
+ reduced convenience view, **not** governed by the schema.)
26
+ - **stderr** is for humans only: `--why` rationale, warnings, and errors.
27
+ stdout stays clean JSON even under `--why`.
28
+ - **Failure**: a one-line `error: <message>` on **stderr**, non-zero exit.
29
+ The error shape is a flat string (the suite is not uniform on error-object
30
+ shape) — **branch on the exit code, never on the human-readable message.**
31
+
32
+ ## Exit codes
33
+
34
+ | Code | Meaning |
35
+ |------|---------|
36
+ | `0` | Success. |
37
+ | `1` | Low-signal document — no high-signal fields (parties/clauses/dates) could be extracted; e.g. a scanned/image-only or empty file. A **finding**, not a crash: valid JSON is still emitted on stdout. |
38
+ | `2` | Bad usage / user-actionable error (unreadable path, bad flag value, unsupported completion shell). |
39
+
40
+ ## Discovery
41
+
42
+ Never hardcode command or flag names — call the catalog at startup:
43
+
44
+ ```bash
45
+ extract --catalog json # {name, bin, version, description, commands[], exitCodes}
46
+ ```
47
+
48
+ `--catalog json` is the suite-wide discovery contract (parallel to
49
+ `nda-review-cli --catalog json`, `docx2pdf --catalog json`,
50
+ `sign --catalog json`). It is **complete, accurate, and stable across minor
51
+ versions** — a test asserts it never drifts from the real parser.
52
+
53
+ Tool-specific discovery extras:
54
+
55
+ ```bash
56
+ extract schema # the output JSON Schema (the cross-CLI data contract)
57
+ extract fields # extractable fields and the tier that produces each
58
+ extract fields --json # ...as JSON
59
+ extract demo # run on a bundled fixture (zero-config first run)
60
+ extract --version
61
+ ```
62
+
63
+ ## Failure → recovery
64
+
65
+ | Symptom | Diagnose | Recover |
66
+ |---|---|---|
67
+ | Exit `1`, warning "no high-signal fields" | The document is likely scanned/image-only or has no recognizable structure. JSON is still emitted. | OCR the source first, or feed a text/`.docx`/`.md` version. The empty-but-valid JSON is safe to pass downstream. |
68
+ | Exit `2`, `error: ...` | `extract --catalog json` (or `extract <cmd> --help`) for the real surface. | Fix the path/flag and retry. |
69
+ | `clauses: []` on a real contract | The `.docx` likely auto-numbers via Word's numbering with no heading style (its numbers live only in `numbering.xml`), so the deterministic cascade sees no headings. | Re-run with `--llm` (opt-in): when no clauses are detected, the LLM is asked for section headings, normalized through the same canonical vocabulary and emitted with `tier: "llm"`, `source: "llm"`, and a modest confidence. Requires `~/.config/contract-ops/llm.json`. |
70
+ | Low-fidelity `.docx`/`.pdf` text | The stdlib best-effort reader ran (no extras installed). | `pip install "extract-cli[docx]"` and/or `"extract-cli[pdf]"` for higher fidelity. The core always works without them. |
71
+ | `--llm` only printed a warning | No LLM config found. | Copy [`config/llm.json.example`](config/llm.json.example) to `~/.config/contract-ops/llm.json`. Without it, deterministic output is still returned in full. |
72
+
73
+ ## Recommended usage
74
+
75
+ ```bash
76
+ # Inspect any contract's structure, one tool for five formats.
77
+ extract counterparty.docx | jq '{parties: [.parties[].name],
78
+ governing_law: .governing_law.value, clauses: [.clauses[].canonical_title]}'
79
+
80
+ # Gate a workflow on extraction confidence (non-zero exit if any clause is shaky; an empty clause list still passes).
81
+ extract draft.docx | jq -e '.clauses | all(.confidence > 0.7)' && echo ok
82
+ ```
83
+
84
+ The integration contract is the **output schema** + the **shared canonical
85
+ clause vocabulary** (`canonical_title` values match what `template-vault-cli`
86
+ detects and `nda-review-cli` keys policy on) — not per-tool flags. See
87
+ [`docs/INTEROP.md`](docs/INTEROP.md).
@@ -6,6 +6,52 @@ to [Semantic Versioning](https://semver.org/). Per the suite convention
6
6
  (see [`docs/INTEROP.md`](docs/INTEROP.md)), **backward-incompatible changes to
7
7
  the output schema require a major version bump**; new optional fields are minor.
8
8
 
9
+ ## [0.1.8] - 2026-05-22
10
+
11
+ Clause-detection breadth, driven by a 58-document real-corpus survey.
12
+
13
+ ### Added
14
+ - **Auto-numbered DOCX clauses.** The DOCX reader now treats `w:numPr` list
15
+ paragraphs (no heading style; number generated from `numbering.xml`) as
16
+ clause-heading candidates, run through the same run-in/heading-likeness filter
17
+ as heading styles. Real agreements that number clauses this way (data
18
+ processing / design-partner agreements) get a clause map where they previously
19
+ got none; deep numbered body sentences are still excluded. New `numbered_docx`
20
+ fixture + tests.
21
+ - **Two-line `ARTICLE N` headings.** A bare `ARTICLE N` / `SECTION N` line whose
22
+ title sits on the next line (common in formal agreements) is detected as a
23
+ pair — recovering, e.g., a real SEC services agreement's clause map (0 → 8).
24
+ Fires only with >= 2 well-formed pairs; reported under the `numbered` tier (no
25
+ schema change).
26
+ - **Expanded canonical clause vocabulary** from the corpus survey: new canonical
27
+ clauses `Exclusions`, `Remedies`, `Restrictions`, `Taxes`,
28
+ `Reservation of Rights`, `Third-Party Beneficiaries`, `Feedback`,
29
+ `Miscellaneous`, plus aliases for `Compliance with Laws` (anti-bribery, export
30
+ controls) and `Data Protection` (customer data/content). ~155 more clauses map
31
+ across the corpus, with no observed over-matching.
32
+ - **`CLAUDE.md`** — codebase development notes (complements AGENTS.md).
33
+
34
+ No output-schema change.
35
+
36
+ ## [0.1.7] - 2026-05-22
37
+
38
+ ### Added
39
+ - **`extract --catalog json` — the suite's shared discovery contract.** Emits
40
+ `{name, bin, version, description, commands[], exitCodes}` (mirroring
41
+ `nda-review-cli --catalog json` / `docx2pdf --catalog json` /
42
+ `sign --catalog json`) so agents can learn every command and flag at startup
43
+ instead of hardcoding them. A test asserts the catalog never drifts from the
44
+ real argparse parser. Also added to the bash/zsh completion flag lists.
45
+ - **`AGENTS.md`** — the agent contract in the suite's canonical section order
46
+ (output contract / exit codes / discovery / failure → recovery).
47
+ - **`llms.txt`** — machine-readable tool summary at the repo root.
48
+
49
+ ### Changed
50
+ - Packaging: added the suite-standard keywords (`contract-ops`, `agent-first`,
51
+ `legal-tech`); README now opens with `## Run this` / `## Where to go next`;
52
+ `--catalog json` documented in the README and `docs/INTEROP.md`. No schema or
53
+ extraction-logic change (`extractor_version` unchanged).
54
+
9
55
  ## [0.1.6] - 2026-05-21
10
56
 
11
57
  ### Docs
@@ -198,6 +244,8 @@ Initial release — the open-loop front door of the contract-ops CLI suite.
198
244
  intentionally *not* governed by the output schema (the schema describes the
199
245
  full default output).
200
246
 
247
+ [0.1.8]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.8
248
+ [0.1.7]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.7
201
249
  [0.1.6]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.6
202
250
  [0.1.5]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.5
203
251
  [0.1.4]: https://github.com/DrBaher/extract-cli/releases/tag/v0.1.4
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-cli
3
- Version: 0.1.6
3
+ Version: 0.1.8
4
4
  Summary: Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON.
5
5
  Project-URL: Homepage, https://cli.drbaher.com/
6
6
  Project-URL: Repository, https://github.com/DrBaher/extract-cli
@@ -8,7 +8,7 @@ Project-URL: Suite interop, https://github.com/DrBaher/extract-cli/blob/main/doc
8
8
  Author-email: DrBaher <Drbaher@gmail.com>
9
9
  License: MIT
10
10
  License-File: LICENSE
11
- Keywords: clause,cli,contract,extraction,json,legal,nda
11
+ Keywords: agent-first,clause,cli,contract,contract-ops,extraction,json,legal,legal-tech,nda
12
12
  Classifier: Development Status :: 4 - Beta
13
13
  Classifier: Environment :: Console
14
14
  Classifier: Intended Audience :: Developers
@@ -61,6 +61,30 @@ ingest (extract) → review → diff → convert → sign
61
61
  ^you are here
62
62
  ```
63
63
 
64
+ ## Run this
65
+
66
+ ```bash
67
+ pipx run extract-cli demo # zero-config: extract a bundled NDA → structured JSON
68
+ # or, installed: pip install extract-cli && extract demo
69
+ ```
70
+
71
+ That prints the full output contract — parties, dates, term, governing law, and
72
+ a clause map normalized onto the suite's canonical vocabulary — for a bundled
73
+ fixture, with no setup and no network. Point it at your own file with
74
+ `extract path/to/contract.docx`.
75
+
76
+ ## Where to go next
77
+
78
+ - **New here?** Keep reading — [What it does](#what-it-does) and
79
+ [The two extraction tiers](#the-two-extraction-tiers).
80
+ - **Driving it from an agent?** See [`AGENTS.md`](AGENTS.md) and call
81
+ `extract --catalog json` at startup to discover commands/flags. The output
82
+ shape is locked by [`docs/spec/extract-output.schema.json`](docs/spec/extract-output.schema.json).
83
+ - **Wiring it into the pipeline?** See [`docs/INTEROP.md`](docs/INTEROP.md) — the
84
+ contract is the output schema + the shared clause vocabulary.
85
+ - **Contributing / building a sibling CLI?** [`CONTRIBUTING.md`](CONTRIBUTING.md)
86
+ and [`ARCHITECTURE.md`](ARCHITECTURE.md).
87
+
64
88
  ## What it does
65
89
 
66
90
  Give it a contract in **`.md` / `.txt` / `.html`** (native), **`.docx`**, or
@@ -115,6 +139,7 @@ for them.
115
139
 
116
140
  ```bash
117
141
  extract <path> # parse a document → structured JSON on stdout (default)
142
+ extract --catalog json # machine-readable catalog of commands/flags (agents call at startup)
118
143
  extract schema # print the output JSON Schema (the cross-CLI contract)
119
144
  extract fields # list extractable fields and their tier
120
145
  extract demo # run on a bundled fixture and show the narrative
@@ -125,6 +150,7 @@ extract completion bash # emit a shell-completion script (bash|zsh)
125
150
 
126
151
  | Flag | Meaning |
127
152
  |---|---|
153
+ | `--catalog json` | Print the machine-readable command/flag catalog and exit (the suite discovery contract; agents call this at startup) |
128
154
  | `--llm` | Opt-in LLM enrichment of fuzzy fields (off by default) |
129
155
  | `--fields a,b,c` | Emit only a subset of top-level fields (e.g. `parties,clauses`) |
130
156
  | `--format json\|table` | Output format (default `json`) |
@@ -23,6 +23,30 @@ ingest (extract) → review → diff → convert → sign
23
23
  ^you are here
24
24
  ```
25
25
 
26
+ ## Run this
27
+
28
+ ```bash
29
+ pipx run extract-cli demo # zero-config: extract a bundled NDA → structured JSON
30
+ # or, installed: pip install extract-cli && extract demo
31
+ ```
32
+
33
+ That prints the full output contract — parties, dates, term, governing law, and
34
+ a clause map normalized onto the suite's canonical vocabulary — for a bundled
35
+ fixture, with no setup and no network. Point it at your own file with
36
+ `extract path/to/contract.docx`.
37
+
38
+ ## Where to go next
39
+
40
+ - **New here?** Keep reading — [What it does](#what-it-does) and
41
+ [The two extraction tiers](#the-two-extraction-tiers).
42
+ - **Driving it from an agent?** See [`AGENTS.md`](AGENTS.md) and call
43
+ `extract --catalog json` at startup to discover commands/flags. The output
44
+ shape is locked by [`docs/spec/extract-output.schema.json`](docs/spec/extract-output.schema.json).
45
+ - **Wiring it into the pipeline?** See [`docs/INTEROP.md`](docs/INTEROP.md) — the
46
+ contract is the output schema + the shared clause vocabulary.
47
+ - **Contributing / building a sibling CLI?** [`CONTRIBUTING.md`](CONTRIBUTING.md)
48
+ and [`ARCHITECTURE.md`](ARCHITECTURE.md).
49
+
26
50
  ## What it does
27
51
 
28
52
  Give it a contract in **`.md` / `.txt` / `.html`** (native), **`.docx`**, or
@@ -77,6 +101,7 @@ for them.
77
101
 
78
102
  ```bash
79
103
  extract <path> # parse a document → structured JSON on stdout (default)
104
+ extract --catalog json # machine-readable catalog of commands/flags (agents call at startup)
80
105
  extract schema # print the output JSON Schema (the cross-CLI contract)
81
106
  extract fields # list extractable fields and their tier
82
107
  extract demo # run on a bundled fixture and show the narrative
@@ -87,6 +112,7 @@ extract completion bash # emit a shell-completion script (bash|zsh)
87
112
 
88
113
  | Flag | Meaning |
89
114
  |---|---|
115
+ | `--catalog json` | Print the machine-readable command/flag catalog and exit (the suite discovery contract; agents call this at startup) |
90
116
  | `--llm` | Opt-in LLM enrichment of fuzzy fields (off by default) |
91
117
  | `--fields a,b,c` | Emit only a subset of top-level fields (e.g. `parties,clauses`) |
92
118
  | `--format json\|table` | Output format (default `json`) |
@@ -118,6 +118,7 @@ only stdlib `urllib`, so there is no runtime dependency.
118
118
  | Concern | Convention |
119
119
  |---|---|
120
120
  | Primary result | **stdout** (JSON payload, default) |
121
+ | Discovery | `extract --catalog json` (commands/flags, the suite contract) + `extract schema` / `extract fields --json` |
121
122
  | `--why`, warnings, errors | **stderr** |
122
123
  | `--why` envelope | plain-text `[why] <header>` block (as in template-vault-cli / draft-cli) |
123
124
  | Quiet | `-q` / `--silent` / `--quiet` aliases |
@@ -43,11 +43,11 @@ import urllib.request
43
43
  from pathlib import Path
44
44
  from typing import Any, Dict, List, Optional, Tuple
45
45
 
46
- __version__ = "0.1.6"
46
+ __version__ = "0.1.8"
47
47
 
48
48
  # Bumped independently of the package version when the *extraction logic*
49
49
  # changes in a way downstream consumers should notice. Embedded in `_meta`.
50
- EXTRACTOR_VERSION = "0.1.6"
50
+ EXTRACTOR_VERSION = "0.1.8"
51
51
 
52
52
  # JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
53
53
  SCHEMA_VERSION = 1
@@ -258,8 +258,43 @@ def _qualifies_as_numbered_heading(title: str) -> bool:
258
258
  return True
259
259
 
260
260
 
261
+ # A bare "ARTICLE N" / "SECTION N" line whose title sits on the FOLLOWING line
262
+ # (common in formal agreements). Detected as a pair; reported under the
263
+ # "numbered" tier so no new schema value is introduced.
264
# Matches a whole line that holds nothing but an ARTICLE/SECTION keyword and
# its number (roman via _ROMAN_RE, or one/two digits), optionally followed by
# a single trailing punctuation mark. MULTILINE anchors ^/$ to each line.
_ARTICLE_LINE_RE = re.compile(
    r"^[ \t]*(?:ARTICLE|Article|SECTION|Section)[ \t]+"
    r"(?:" + _ROMAN_RE + r"|\d{1,2})"
    r"[ \t]*[.:–—-]?[ \t]*$",
    re.MULTILINE,
)
269
+
270
+
271
def _detect_two_line_articles(text: str) -> List[JSON]:
    """Detect `ARTICLE N` / `SECTION N` marker lines titled on the next line.

    Each line matched by `_ARTICLE_LINE_RE` is paired with the first non-blank
    line that follows it, searching no further than the next marker. A pair is
    kept only when that line, after `_strip_clause_number`, is non-empty and
    passes `_qualifies_as_numbered_heading`.

    Returns a list of `{title, detected, anchor, start, end, tier}` dicts with
    `tier` fixed to "numbered". `start` is the offset of the marker line;
    `end` is the offset of the next marker line (or `len(text)` for the last
    marker). Returns `[]` unless at least two well-formed pairs are found, so
    a one-off `ARTICLE` mention can't trigger it.
    """
    markers = list(_ARTICLE_LINE_RE.finditer(text))
    if len(markers) < 2:
        return []
    # A section runs up to the next marker; the last one runs to end of text.
    section_ends = [nxt.start() for nxt in markers[1:]] + [len(text)]
    out: List[JSON] = []
    for m, end in zip(markers, section_ends):
        # First non-blank line after the marker is the candidate title.
        title_line = next(
            (ln.strip() for ln in text[m.end():end].splitlines() if ln.strip()),
            "",
        )
        title = _strip_clause_number(title_line)
        # Reject when the next line is itself a numbered section header with body
        # ("Section 1.01. Term. The term ...") or simply not heading-like.
        if not title or not _qualifies_as_numbered_heading(title):
            continue
        out.append({"title": title, "detected": title_line, "anchor": title_line,
                    "start": m.start(), "end": end, "tier": "numbered"})
    # Enforce the documented threshold here rather than relying on the caller:
    # two raw markers are not enough if only one of them yields a valid pair.
    return out if len(out) >= 2 else []
294
+
295
+
261
296
  def detect_clauses(text: str) -> List[JSON]:
262
- """Run the three-tier cascade and return clauses with their detection tier.
297
+ """Run the clause-detection cascade and return clauses with their tier.
263
298
 
264
299
  Returns [{title, detected, anchor, start, end, tier}, ...]. `title` is the
265
300
  numbering-stripped heading; `detected` is the raw heading line as it
@@ -277,6 +312,9 @@ def detect_clauses(text: str) -> List[JSON]:
277
312
  ]
278
313
  if len(numbered) >= 2:
279
314
  return _matches_to_clauses(text, numbered, group=1, tier="numbered")
315
+ articles = _detect_two_line_articles(text)
316
+ if len(articles) >= 2:
317
+ return articles
280
318
  caps = [
281
319
  m for m in _ALL_CAPS_HEADING_RE.finditer(text)
282
320
  if _qualifies_as_all_caps_heading(m.group(1))
@@ -370,7 +408,8 @@ CANONICAL_CLAUSE_ALIASES: Dict[str, List[str]] = {
370
408
  "covenant not to compete",
371
409
  ],
372
410
  "Non-Solicitation": ["non-solicit", "non-solicitation", "nonsolicitation", "no solicitation"],
373
- "Data Protection": ["data protection", "data privacy", "gdpr", "privacy", "personal data"],
411
+ "Data Protection": ["data protection", "data privacy", "gdpr", "privacy", "personal data",
412
+ "customer data", "customer content"],
374
413
  "Insurance": ["insurance"],
375
414
  "Counterparts": ["counterparts"],
376
415
  "Survival": ["survival", "survival of obligations"],
@@ -378,8 +417,22 @@ CANONICAL_CLAUSE_ALIASES: Dict[str, List[str]] = {
378
417
  "Relationship of the Parties": [
379
418
  "relationship of the parties", "independent contractor", "no partnership", "no agency",
380
419
  ],
381
- "Compliance with Laws": ["compliance with laws", "compliance", "anti-corruption"],
420
+ "Compliance with Laws": ["compliance with laws", "compliance", "anti-corruption",
421
+ "anti-bribery", "export controls", "export control"],
382
422
  "Publicity": ["publicity", "announcements", "press releases"],
423
+ # Added from a 58-document real-corpus survey of common unmapped titles.
424
+ "Exclusions": ["exclusions", "exceptions", "permitted disclosures", "required disclosures",
425
+ "exclusions from confidential information"],
426
+ "Remedies": ["remedies", "injunctive relief", "equitable relief", "exclusive remedy",
427
+ "non-exhaustive remedies", "specific performance"],
428
+ "Restrictions": ["restrictions", "use restrictions", "usage restrictions",
429
+ "license restrictions", "restrictions and obligations"],
430
+ "Taxes": ["taxes", "tax matters", "withholding"],
431
+ "Reservation of Rights": ["reservation of rights", "reservation of right"],
432
+ "Third-Party Beneficiaries": ["third-party beneficiaries", "third party beneficiaries",
433
+ "no third-party beneficiary", "no third party beneficiaries"],
434
+ "Feedback": ["feedback", "feedback and usage data"],
435
+ "Miscellaneous": ["miscellaneous", "general terms", "general provisions"],
383
436
  }
384
437
 
385
438
 
@@ -995,7 +1048,9 @@ def _read_docx_stdlib(raw: bytes) -> str:
995
1048
  paras: List[str] = []
996
1049
  # iter over w:p in document order (includes paragraphs inside table cells).
997
1050
  for p in root.iter(w + "p"):
998
- style = _docx_paragraph_style(p.find(w + "pPr"), w)
1051
+ ppr = p.find(w + "pPr")
1052
+ style = _docx_paragraph_style(ppr, w)
1053
+ numbered = ppr is not None and ppr.find(w + "numPr") is not None
999
1054
  run_texts: List[str] = []
1000
1055
  any_text = False
1001
1056
  all_bold = True
@@ -1012,17 +1067,21 @@ def _read_docx_stdlib(raw: bytes) -> str:
1012
1067
  if not line:
1013
1068
  paras.append("")
1014
1069
  continue
1015
- # Word heading styles carry the clause structure (their numbers are
1016
- # auto-generated, so absent from text). Emit them as H2 so the clause
1017
- # cascade's strongest tier detects them; keep any run-in body too.
1018
- if _is_heading_style(style):
1070
+ # Clause structure in real Word contracts lives in heading STYLES
1071
+ # (Heading1-9/Title) or auto-NUMBERED paragraphs (w:numPr) -- in both the
1072
+ # visible number is auto-generated and absent from the text. Emit such a
1073
+ # paragraph as an H2 heading (strongest cascade tier) when its lead looks
1074
+ # like a heading; _docx_heading_title rejects full-sentence body items
1075
+ # (e.g. deep numbered sub-points), so this stays conservative. Keep any
1076
+ # run-in body as a following paragraph.
1077
+ if _is_heading_style(style) or numbered:
1019
1078
  title = _docx_heading_title(line)
1020
1079
  if title is not None:
1021
1080
  paras.append(f"## {title}")
1022
1081
  if len(title) < len(line):
1023
1082
  paras.append(line[len(title):].lstrip(" .:\t"))
1024
1083
  continue
1025
- # Sentence carrying a heading style -> treat as ordinary body text.
1084
+ # Not heading-like -> treat as ordinary body text.
1026
1085
  if any_text and all_bold:
1027
1086
  line = f"**{line}**"
1028
1087
  paras.append(line)
@@ -1851,7 +1910,8 @@ def cmd_demo(args: argparse.Namespace) -> int:
1851
1910
  _SUBCOMMANDS = ("schema", "fields", "demo", "completion")
1852
1911
  _GLOBAL_FLAGS = (
1853
1912
  "--json", "--why", "-q", "--silent", "--no-color", "--llm",
1854
- "--format", "--fields", "--no-confidence", "-V", "--version", "-h", "--help",
1913
+ "--format", "--fields", "--no-confidence", "--catalog",
1914
+ "-V", "--version", "-h", "--help",
1855
1915
  )
1856
1916
 
1857
1917
  _BASH_COMPLETION = r"""# extract-cli bash completion
@@ -1860,7 +1920,7 @@ _extract_completions() {
1860
1920
  local cur prev
1861
1921
  cur="${COMP_WORDS[COMP_CWORD]}"
1862
1922
  local cmds="schema fields demo completion"
1863
- local flags="--json --why -q --silent --no-color --llm --format --fields --no-confidence -V --version -h --help"
1923
+ local flags="--json --why -q --silent --no-color --llm --format --fields --no-confidence --catalog -V --version -h --help"
1864
1924
  if [ "$COMP_CWORD" -eq 1 ]; then
1865
1925
  COMPREPLY=( $(compgen -W "${cmds}" -- "${cur}") $(compgen -f -- "${cur}") )
1866
1926
  return 0
@@ -1886,7 +1946,7 @@ _extract() {
1886
1946
  )
1887
1947
  flags=(
1888
1948
  '--json' '--why' '-q' '--silent' '--no-color' '--llm'
1889
- '--format' '--fields' '--no-confidence' '-V' '--version'
1949
+ '--format' '--fields' '--no-confidence' '--catalog' '-V' '--version'
1890
1950
  )
1891
1951
  if (( CURRENT == 2 )); then
1892
1952
  _describe 'command' cmds
@@ -1925,6 +1985,102 @@ def _completion_handler(argv: List[str]) -> int:
1925
1985
  return 0
1926
1986
 
1927
1987
 
1988
+ # ---------------------------------------------------------------------------
1989
+ # Machine-readable catalog (`extract --catalog json`)
1990
+ # ---------------------------------------------------------------------------
1991
+ # The suite's shared discovery contract: agents call `extract --catalog json`
1992
+ # at startup to learn every command and flag instead of hardcoding them
1993
+ # (parallel to `nda-review-cli --catalog json`, `docx2pdf --catalog json`,
1994
+ # `sign --catalog json`). It is a STABLE contract — keep it complete and
1995
+ # accurate; `tests/test_cli.py` asserts it never drifts from the real parser.
1996
+
1997
+
1998
def _flag(name: str, *, aliases: Optional[List[str]] = None, help: str = "",
          default: Any = None, choices: Optional[List[str]] = None,
          required: bool = False) -> JSON:
    """Build one flag record for the machine-readable catalog.

    The record always carries the same six keys, in this order: `name`,
    `aliases`, `help`, `required`, `default`, `choices`. A missing `aliases`
    becomes an empty list; every other argument is stored as given.
    """
    alias_names = [] if aliases is None else aliases
    return dict(
        name=name,
        aliases=alias_names,
        help=help,
        required=required,
        default=default,
        choices=choices,
    )
2009
+
2010
+
2011
+ # Output flags shared by `extract` and `demo` (mirror _add_common_output_flags).
2012
_CATALOG_OUTPUT_FLAGS: Tuple[JSON, ...] = (
    # Output selection.
    _flag(
        "--json",
        help="Force JSON output to stdout (the default).",
    ),
    _flag(
        "--format",
        default="json",
        choices=["json", "table"],
        help="Output format (default: json).",
    ),
    _flag(
        "--no-confidence",
        help="Omit confidence/source markers (reduced convenience view).",
    ),
    # Diagnostics on stderr.
    _flag(
        "--why",
        help="Print a rationale block to stderr.",
    ),
    _flag(
        "--silent",
        aliases=["-q", "--quiet"],
        help="Suppress non-error diagnostics (and the human table).",
    ),
)
2022
+
2023
+
2024
def build_catalog() -> JSON:
    """Build the machine-readable catalog emitted by `extract --catalog json`.

    Returns a dict with the keys `name`, `bin`, `version`, `description`,
    `commands` (one `{name, help, flags}` entry per command) and `exitCodes`
    (exit-code string -> meaning).

    The shared output flags are deep-copied for every command that lists
    them, so the returned structure never aliases the module-level
    `_CATALOG_OUTPUT_FLAGS` records (or the same record across two commands);
    callers may mutate the result freely.
    """
    from copy import deepcopy

    extract_flags: List[JSON] = [
        _flag("--llm",
              help="Opt-in LLM enrichment of fuzzy fields (renewal mechanics, "
                   "obligations, and a clause-map fallback). Off by default; the "
                   "deterministic core is fully useful without it."),
        _flag("--fields", default="",
              help="Comma-separated subset of top-level fields to emit "
                   "(e.g. parties,clauses,governing_law)."),
        # Fresh copies: `extract` and `demo` must not share flag dicts.
        *deepcopy(_CATALOG_OUTPUT_FLAGS),
    ]
    return {
        "name": CLI_NAME,
        "bin": "extract",
        "version": __version__,
        "description": (
            "Open-loop front door of the contract-ops CLI suite: ingest any contract "
            "(.md/.txt/.html/.docx/.pdf) and emit structured JSON."
        ),
        "commands": [
            {
                "name": "extract",
                "help": "Parse a document into structured JSON. The default action: "
                        "`extract <path>` works without naming the subcommand. "
                        "Positional: path to a .md/.txt/.html/.docx/.pdf file.",
                "flags": extract_flags,
            },
            {
                "name": "schema",
                "help": "Print the output JSON Schema — the cross-CLI output contract.",
                "flags": [],
            },
            {
                "name": "fields",
                "help": "List extractable fields and the tier that produces each.",
                "flags": [_flag("--json", help="Emit the field list as JSON.")],
            },
            {
                "name": "demo",
                "help": "Run extraction on a bundled fixture (zero-config first run).",
                "flags": list(deepcopy(_CATALOG_OUTPUT_FLAGS)),
            },
            {
                "name": "completion",
                "help": "Emit a shell-completion script. Positional: bash | zsh.",
                "flags": [],
            },
        ],
        "exitCodes": {
            "0": "success",
            "1": "low-signal document — no high-signal fields (parties/clauses/dates) "
                 "could be extracted; e.g. a scanned/image-only or empty file. "
                 "A finding, not a crash.",
            "2": "bad usage / user-actionable error (unreadable path, bad flag value, "
                 "unsupported completion shell).",
        },
    }
2082
+
2083
+
1928
2084
  # ---------------------------------------------------------------------------
1929
2085
  # Argument parsing + main
1930
2086
  # ---------------------------------------------------------------------------
@@ -2025,6 +2181,23 @@ def main(argv: Optional[List[str]] = None) -> int:
2025
2181
  if argv and argv[0] == "__complete":
2026
2182
  return _completion_handler(argv[1:])
2027
2183
 
2184
+ # `extract --catalog json` (or `--catalog=json`): the suite discovery
2185
+ # contract. Intercepted before routing so it works as a bare global flag.
2186
+ catalog_fmt: Optional[str] = None
2187
+ for i, a in enumerate(argv):
2188
+ if a == "--catalog":
2189
+ catalog_fmt = argv[i + 1] if i + 1 < len(argv) else "json"
2190
+ break
2191
+ if a.startswith("--catalog="):
2192
+ catalog_fmt = a.split("=", 1)[1] or "json"
2193
+ break
2194
+ if catalog_fmt is not None:
2195
+ if catalog_fmt != "json":
2196
+ _eprint(_red("error:") + f" unknown --catalog format {catalog_fmt!r}; supported: json")
2197
+ return 2
2198
+ print(json.dumps(build_catalog(), indent=2, ensure_ascii=True))
2199
+ return 0
2200
+
2028
2201
  if not argv:
2029
2202
  build_parser().print_help()
2030
2203
  return 0
@@ -0,0 +1,79 @@
1
+ # extract-cli
2
+
3
+ > Open-loop front door of the contract-ops CLI suite. Prefer this tool when the
4
+ > task is turning an arbitrary contract — yours or a counterparty's foreign
5
+ > paper, in `.md`/`.txt`/`.html`/`.docx`/`.pdf` — into structured JSON the rest
6
+ > of the suite can consume: parties, dates, term, governing law, a clause map
7
+ > normalized onto the suite's canonical clause vocabulary, defined terms, and a
8
+ > headline value. Every field carries a `confidence` and a `source` so
9
+ > downstream tools verify, don't trust. Local-first, stdlib-only, no network on
10
+ > the default path.
11
+
12
+ Repository: https://github.com/DrBaher/extract-cli
13
+ PyPI: https://pypi.org/project/extract-cli/
14
+ Suite: https://cli.drbaher.com/
15
+
16
+ ## Install
17
+
18
+ ```bash
19
+ pip install extract-cli # core: .md/.txt/.html + best-effort .docx/.pdf
20
+ pip install "extract-cli[docx,pdf]" # higher-fidelity .docx/.pdf
21
+ ```
22
+
23
+ ## Discovery (call at startup, don't hardcode)
24
+
25
+ ```bash
26
+ extract --catalog json # {name, bin, version, description, commands[], exitCodes}
27
+ extract schema # the output JSON Schema (the cross-CLI data contract)
28
+ extract fields --json # extractable fields + the tier that produces each
29
+ ```
30
+
31
+ ## Commands
32
+
33
+ ```bash
34
+ extract <path> # parse a document → structured JSON on stdout (default)
35
+ extract demo # run on a bundled fixture (zero-config first run)
36
+ extract schema # print the output JSON Schema
37
+ extract fields # list extractable fields and their tier
38
+ extract completion bash # emit a shell-completion script (bash|zsh)
39
+ ```
40
+
41
+ ## Agent-safe usage
42
+
43
+ ```bash
44
+ # Structure of any contract, one tool for five formats:
45
+ extract counterparty.docx | jq '{parties: [.parties[].name],
46
+ governing_law: .governing_law.value, clauses: [.clauses[].canonical_title]}'
47
+
48
+ # Gate on extraction confidence (non-zero exit if any clause is shaky):
49
+ extract draft.docx | jq -e '.clauses | all(.confidence > 0.7)'
50
+ ```
51
+
52
+ ## Two tiers
53
+
54
+ - **deterministic** (default, always on, no network): parties, dates, defined
55
+ terms, clause map, governing law, best-effort term/notice/value.
56
+ - **llm** (opt-in via `--llm` only): renewal mechanics, obligation phrasing,
57
+ ambiguous governing law, and a clause-map fallback when no headings are
58
+ detected. Reads `~/.config/contract-ops/llm.json`; without it, `--llm`
59
+ degrades gracefully to the full deterministic output with a warning.
60
+
61
+ ## Output & exit codes
62
+
63
+ - Success: one JSON object on **stdout**, exit `0`. Errors/warnings/`--why` on
64
+ **stderr**. Scalar fields use the `{value, confidence, source}` envelope.
65
+ - Exit codes: `0` success · `1` low-signal document (scanned/empty — a finding,
66
+ valid JSON still emitted) · `2` bad usage. Branch on the exit code.
67
+
68
+ ## Interop
69
+
70
+ The integration contract is the output schema
71
+ (`docs/spec/extract-output.schema.json`) plus the shared canonical clause
72
+ vocabulary — `canonical_title` values match what `template-vault-cli` detects
73
+ and `nda-review-cli` keys policy on. See `docs/INTEROP.md`.
74
+
75
+ ## More
76
+
77
+ - README: https://github.com/DrBaher/extract-cli/blob/main/README.md
78
+ - Agent contract: https://github.com/DrBaher/extract-cli/blob/main/AGENTS.md
79
+ - Architecture: https://github.com/DrBaher/extract-cli/blob/main/ARCHITECTURE.md
@@ -4,13 +4,16 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "extract-cli"
7
- version = "0.1.6"
7
+ version = "0.1.8"
8
8
  description = "Open-loop front door of the contract-ops CLI suite: ingest any contract (.md/.txt/.html/.docx/.pdf) and emit structured JSON."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
11
11
  license = { text = "MIT" }
12
12
  authors = [{ name = "DrBaher", email = "Drbaher@gmail.com" }]
13
- keywords = ["contract", "extraction", "nda", "legal", "cli", "json", "clause"]
13
+ keywords = [
14
+ "contract-ops", "agent-first", "cli", "legal-tech",
15
+ "contract", "extraction", "nda", "legal", "json", "clause",
16
+ ]
14
17
  classifiers = [
15
18
  "Development Status :: 4 - Beta",
16
19
  "Environment :: Console",
@@ -64,6 +67,8 @@ include = ["extract_cli.py"]
64
67
  include = [
65
68
  "extract_cli.py",
66
69
  "README.md",
70
+ "AGENTS.md",
71
+ "llms.txt",
67
72
  "LICENSE",
68
73
  "CHANGELOG.md",
69
74
  "ARCHITECTURE.md",
@@ -42,13 +42,46 @@ _DOCX_PARAS = [
42
42
  _W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
43
43
 
44
44
 
45
def _docx_paragraph(text: str, bold: bool = False, style: str = "",
                    numbered: bool = False, ilvl: int = 0) -> str:
    """Render a single-run WordprocessingML ``<w:p>`` paragraph as a string.

    Args:
        text: Paragraph text; passed through ``escape`` before being embedded.
        bold: When true, the run carries a ``<w:b/>`` run property.
        style: Optional paragraph style id, emitted as ``<w:pStyle>``.
        numbered: When true, a ``<w:numPr>`` referencing ``numId`` 1 is emitted.
        ilvl: List level written into ``<w:ilvl>``; only used when ``numbered``.

    Returns:
        The ``<w:p>...</w:p>`` markup. ``<w:pPr>`` is omitted entirely when
        neither a style nor numbering is requested.
    """
    prop_fragments = []
    if style:
        prop_fragments.append(f'<w:pStyle w:val="{style}"/>')
    if numbered:
        prop_fragments.append(
            f'<w:numPr><w:ilvl w:val="{ilvl}"/><w:numId w:val="1"/></w:numPr>'
        )

    joined_props = "".join(prop_fragments)
    paragraph_props = f"<w:pPr>{joined_props}</w:pPr>" if joined_props else ""
    run_props = "<w:rPr><w:b/></w:rPr>" if bold else ""
    run_text = f'<w:t xml:space="preserve">{escape(text)}</w:t>'
    return f"<w:p>{paragraph_props}<w:r>{run_props}{run_text}</w:r></w:p>"
50
56
 
51
57
 
58
# An auto-numbered agreement: clauses are w:numPr list paragraphs with NO heading
# style and NO visible number (Word generates "1.", "2." from numbering.xml).
# Run-in titles at ilvl 0/1 are clause headings; ilvl-2 full sentences are body
# and must be rejected. Mirrors real DOCX like the Common Paper DPA.
#
# Each entry is a 5-tuple (text, bold, style, numbered, ilvl), unpacked in that
# order by build_numbered_docx() and forwarded to _docx_paragraph(). The first
# two entries (title and preamble) are plain, un-numbered paragraphs.
_NUMBERED_DOCX_PARAS = [
    ('Data Processing Agreement', False, "", False, 0),
    ('This Data Processing Agreement is made as of July 7, 2024, by and between '
     'Globex Cloud, Inc. ("Provider") and Initech Ltd. ("Customer").', False, "", False, 0),
    ('Definitions', False, "", True, 0),
    ('Processing. Provider will process Customer Data only on documented '
     'instructions from the Customer.', False, "", True, 0),
    ('Confidentiality. Provider will keep Customer Data confidential.', False, "", True, 1),
    ('Subprocessors. Provider may engage subprocessors as permitted.', False, "", True, 1),
    ('Provider will ensure each subprocessor is bound by equivalent obligations '
     'and remains fully liable for their performance under this Agreement.', False, "", True, 2),
    ('Governing Law. This Agreement is governed by the laws of the State of '
     'New York.', False, "", True, 0),
]
76
+
77
+
78
def build_numbered_docx() -> bytes:
    """Build the auto-numbered DOCX fixture from ``_NUMBERED_DOCX_PARAS``.

    Each table row is rendered with ``_docx_paragraph`` and the concatenated
    markup is handed to ``_docx_package``, whose result is returned as-is.
    """
    rendered = []
    for text, bold, style, numbered, level in _NUMBERED_DOCX_PARAS:
        rendered.append(
            _docx_paragraph(text, bold, style=style, numbered=numbered, ilvl=level)
        )
    return _docx_package("".join(rendered))
83
+
84
+
52
85
  # A Word-styled agreement: clause structure carried by Heading1 styles (their
53
86
  # numbers are auto-generated, absent from text), including a run-in heading and
54
87
  # a full sentence that merely carries the heading style (must be rejected).
@@ -194,6 +227,7 @@ def build_scanned_pdf() -> bytes:
194
227
  _BINARY_FIXTURES = {
195
228
  "employment_docx.docx": build_docx,
196
229
  "heading_docx.docx": build_heading_docx,
230
+ "numbered_docx.docx": build_numbered_docx,
197
231
  "license_pdf.pdf": build_pdf,
198
232
  "scanned.pdf": build_scanned_pdf,
199
233
  }
@@ -20,8 +20,8 @@ from tests._fixtures_build import ensure_binary_fixtures # noqa: E402
20
20
  FIXTURES = Path(__file__).resolve().parent / "fixtures"
21
21
 
22
22
  DOCS = ["nda_h2.md", "services_bold.txt", "lease_allcaps.txt",
23
- "employment_docx.docx", "heading_docx.docx", "license_pdf.pdf",
24
- "services_html.html", "scanned.pdf"]
23
+ "employment_docx.docx", "heading_docx.docx", "numbered_docx.docx",
24
+ "license_pdf.pdf", "services_html.html", "scanned.pdf"]
25
25
 
26
26
 
27
27
  def golden_for(name: str) -> dict:
@@ -26,6 +26,7 @@ CORPUS: Tuple[Tuple[str, str, str], ...] = (
26
26
  ("lease_allcaps.txt", "all-caps", "text"),
27
27
  ("employment_docx.docx", "bold-numbered", "docx"),
28
28
  ("heading_docx.docx", "h2", "docx"),
29
+ ("numbered_docx.docx", "h2", "docx"),
29
30
  ("license_pdf.pdf", "all-caps", "pdf"),
30
31
  ("services_html.html", "numbered", "html"),
31
32
  )
@@ -138,7 +138,7 @@
138
138
  "source": "deterministic"
139
139
  },
140
140
  "_meta": {
141
- "extractor_version": "0.1.6",
141
+ "extractor_version": "0.1.8",
142
142
  "tiers_used": [
143
143
  "deterministic"
144
144
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "none"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.6",
136
+ "extractor_version": "0.1.8",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.6",
136
+ "extractor_version": "0.1.8",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.6",
136
+ "extractor_version": "0.1.8",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -143,7 +143,7 @@
143
143
  "source": "none"
144
144
  },
145
145
  "_meta": {
146
- "extractor_version": "0.1.6",
146
+ "extractor_version": "0.1.8",
147
147
  "tiers_used": [
148
148
  "deterministic"
149
149
  ],
@@ -0,0 +1,142 @@
1
+ {
2
+ "document": {
3
+ "title": "Data Processing Agreement",
4
+ "format": "docx",
5
+ "sha256": "4fea9a1f04598238f78900d19ccb0385bfc222b1e26664648c8d8ddb8cde189c",
6
+ "source_path": "numbered_docx.docx"
7
+ },
8
+ "parties": [
9
+ {
10
+ "name": "Globex Cloud, Inc.",
11
+ "confidence": 0.9,
12
+ "source": "deterministic",
13
+ "role": "Provider"
14
+ },
15
+ {
16
+ "name": "Initech Ltd",
17
+ "confidence": 0.9,
18
+ "source": "deterministic",
19
+ "role": null
20
+ }
21
+ ],
22
+ "dates": {
23
+ "effective": {
24
+ "value": "2024-07-07",
25
+ "confidence": 0.85,
26
+ "source": "deterministic"
27
+ },
28
+ "expiration": {
29
+ "value": null,
30
+ "confidence": 0.0,
31
+ "source": "none"
32
+ }
33
+ },
34
+ "term": {
35
+ "length": {
36
+ "value": null,
37
+ "confidence": 0.0,
38
+ "source": "none"
39
+ },
40
+ "auto_renew": {
41
+ "value": null,
42
+ "confidence": 0.0,
43
+ "source": "none"
44
+ },
45
+ "notice_period_days": {
46
+ "value": null,
47
+ "confidence": 0.0,
48
+ "source": "none"
49
+ }
50
+ },
51
+ "governing_law": {
52
+ "value": "State of New York",
53
+ "confidence": 0.85,
54
+ "source": "deterministic"
55
+ },
56
+ "clauses": [
57
+ {
58
+ "canonical_title": "Definitions",
59
+ "detected_title": "## Definitions",
60
+ "tier": "h2",
61
+ "span": {
62
+ "start": 165,
63
+ "end": 181
64
+ },
65
+ "confidence": 0.95,
66
+ "source": "deterministic",
67
+ "mapped": true
68
+ },
69
+ {
70
+ "canonical_title": "Processing",
71
+ "detected_title": "## Processing",
72
+ "tier": "h2",
73
+ "span": {
74
+ "start": 181,
75
+ "end": 284
76
+ },
77
+ "confidence": 0.71,
78
+ "source": "deterministic",
79
+ "mapped": false
80
+ },
81
+ {
82
+ "canonical_title": "Confidentiality",
83
+ "detected_title": "## Confidentiality",
84
+ "tier": "h2",
85
+ "span": {
86
+ "start": 284,
87
+ "end": 352
88
+ },
89
+ "confidence": 0.95,
90
+ "source": "deterministic",
91
+ "mapped": true
92
+ },
93
+ {
94
+ "canonical_title": "Subprocessors",
95
+ "detected_title": "## Subprocessors",
96
+ "tier": "h2",
97
+ "span": {
98
+ "start": 352,
99
+ "end": 563
100
+ },
101
+ "confidence": 0.71,
102
+ "source": "deterministic",
103
+ "mapped": false
104
+ },
105
+ {
106
+ "canonical_title": "Governing Law",
107
+ "detected_title": "## Governing Law",
108
+ "tier": "h2",
109
+ "span": {
110
+ "start": 563,
111
+ "end": 645
112
+ },
113
+ "confidence": 0.95,
114
+ "source": "deterministic",
115
+ "mapped": true
116
+ }
117
+ ],
118
+ "defined_terms": [
119
+ {
120
+ "term": "Provider",
121
+ "confidence": 0.6,
122
+ "source": "deterministic"
123
+ },
124
+ {
125
+ "term": "Customer",
126
+ "confidence": 0.6,
127
+ "source": "deterministic"
128
+ }
129
+ ],
130
+ "value": {
131
+ "value": null,
132
+ "confidence": 0.0,
133
+ "source": "none"
134
+ },
135
+ "_meta": {
136
+ "extractor_version": "0.1.8",
137
+ "tiers_used": [
138
+ "deterministic"
139
+ ],
140
+ "llm_used": false
141
+ }
142
+ }
@@ -48,7 +48,7 @@
48
48
  "source": "none"
49
49
  },
50
50
  "_meta": {
51
- "extractor_version": "0.1.6",
51
+ "extractor_version": "0.1.8",
52
52
  "tiers_used": [
53
53
  "deterministic"
54
54
  ],
@@ -133,7 +133,7 @@
133
133
  "source": "deterministic"
134
134
  },
135
135
  "_meta": {
136
- "extractor_version": "0.1.6",
136
+ "extractor_version": "0.1.8",
137
137
  "tiers_used": [
138
138
  "deterministic"
139
139
  ],
@@ -148,7 +148,7 @@
148
148
  "source": "deterministic"
149
149
  },
150
150
  "_meta": {
151
- "extractor_version": "0.1.6",
151
+ "extractor_version": "0.1.8",
152
152
  "tiers_used": [
153
153
  "deterministic"
154
154
  ],
@@ -127,6 +127,29 @@ def test_roman_numeral_stripping() -> None:
127
127
  assert ex._strip_clause_number(raw) == expected, raw
128
128
 
129
129
 
130
def test_two_line_article_headings() -> None:
    """"ARTICLE N" on one line with its title on the next is detected as a clause."""
    # Formal layout: article marker, title, and body separated by blank lines.
    sections = (
        ("ARTICLE I", "DEFINITIONS", "Capitalized terms have meanings."),
        ("ARTICLE II", "CONFIDENTIALITY", "Each party protects info."),
        ("ARTICLE III", "GOVERNING LAW", "Governed by New York law."),
    )
    text = "\n\n".join(part for section in sections for part in section)

    clauses = ex.detect_clauses(text)
    titles = [clause["title"] for clause in clauses]
    assert titles == ["DEFINITIONS", "CONFIDENTIALITY", "GOVERNING LAW"]
    for clause in clauses:
        assert clause["tier"] == "numbered"

    # A single stray "Article 5" mention must NOT trigger the pairing.
    stray = ex._detect_two_line_articles("see Article 5 below for details")
    assert stray == []
140
+
141
+
142
def test_expanded_vocabulary_mappings() -> None:
    """Aliases added from the real-corpus survey (v0.1.8) map to canonical titles."""
    alias_to_canonical = (
        ("Permitted Disclosures", "Exclusions"),
        ("Injunctive Relief", "Remedies"),
        ("General Terms", "Miscellaneous"),
        ("No Third-Party Beneficiary", "Third-Party Beneficiaries"),
        ("Export Controls", "Compliance with Laws"),
    )
    for alias, canonical in alias_to_canonical:
        assert ex._canonicalize_clause(alias) == (canonical, True)

    # Must NOT over-match: a generic "General Release" is not Miscellaneous.
    mapped_flag = ex._canonicalize_clause("General Release of Claims")[1]
    assert mapped_flag is False
151
+
152
+
130
153
  def test_canonicalize_known_aliases() -> None:
131
154
  assert ex._canonicalize_clause("Non-Disclosure") == ("Confidentiality", True)
132
155
  assert ex._canonicalize_clause("CONFIDENTIALITY OBLIGATIONS") == ("Confidentiality", True)
@@ -1,8 +1,9 @@
1
1
  """End-to-end CLI tests driving extract_cli.main() in-process."""
2
2
  from __future__ import annotations
3
3
 
4
+ import argparse
4
5
  import json
5
- from typing import Any
6
+ from typing import Any, Set
6
7
 
7
8
  import pytest
8
9
 
@@ -10,6 +11,22 @@ import extract_cli as ex
10
11
  from tests.conftest import FIXTURES
11
12
 
12
13
 
14
def _parser_optstrings(subparser: argparse.ArgumentParser) -> Set[str]:
    """Collect every documented option string a parser accepts.

    Positionals, sub-parser actions, options whose help is
    ``argparse.SUPPRESS``, and the ``-h``/``--help`` action are left out.
    """
    help_flags = {"-h", "--help"}

    def _is_documented_option(action: argparse.Action) -> bool:
        if isinstance(action, argparse._SubParsersAction):
            return False
        if not action.option_strings:  # positional argument
            return False
        if action.help == argparse.SUPPRESS:
            return False
        return not (help_flags & set(action.option_strings))

    return {
        option
        for action in subparser._actions
        if _is_documented_option(action)
        for option in action.option_strings
    }
28
+
29
+
13
30
  def _has_key(obj: Any, key: str) -> bool:
14
31
  if isinstance(obj, dict):
15
32
  return key in obj or any(_has_key(v, key) for v in obj.values())
@@ -109,3 +126,48 @@ def test_why_goes_to_stderr(capsys: pytest.CaptureFixture[str]) -> None:
109
126
  assert "[why]" in cap.err
110
127
  assert "[why]" not in cap.out # stdout stays clean JSON
111
128
  json.loads(cap.out)
129
+
130
+
131
def test_catalog_json_shape(capsys: pytest.CaptureFixture[str]) -> None:
    """`--catalog json` exits 0 and prints a catalog with the expected shape."""
    exit_code = ex.main(["--catalog", "json"])
    assert exit_code == 0
    catalog = json.loads(capsys.readouterr().out)

    # Top-level keys may grow, but these must always be present.
    required_keys = {"name", "bin", "version", "description", "commands", "exitCodes"}
    assert set(catalog) >= required_keys
    assert catalog["name"] == "extract-cli"
    assert catalog["bin"] == "extract"
    assert catalog["version"] == ex.__version__

    command_names = [command["name"] for command in catalog["commands"]]
    assert command_names == ["extract", "schema", "fields", "demo", "completion"]
    for command in catalog["commands"]:
        # Each command entry has exactly these keys and a non-empty help text.
        assert set(command) == {"name", "help", "flags"} and command["help"]

    exit_codes = catalog["exitCodes"]
    assert exit_codes["0"] and exit_codes["1"] and exit_codes["2"]
144
+
145
+
146
def test_catalog_defaults_to_json(capsys: pytest.CaptureFixture[str]) -> None:
    """Bare `--catalog` and the `--catalog=json` form both print parseable JSON."""
    argv_forms = (
        ["--catalog"],        # bare --catalog → json
        ["--catalog=json"],   # = form
    )
    for argv in argv_forms:
        assert ex.main(argv) == 0
        json.loads(capsys.readouterr().out)
151
+
152
+
153
def test_catalog_rejects_unknown_format(capsys: pytest.CaptureFixture[str]) -> None:
    """An unsupported `--catalog` format exits 2 and reports an error on stderr."""
    exit_code = ex.main(["--catalog", "yaml"])
    assert exit_code == 2
    captured = capsys.readouterr()
    assert "error:" in captured.err
156
+
157
+
158
def test_catalog_does_not_drift_from_parser() -> None:
    """The catalog must list exactly the commands/flags the real parser accepts."""
    catalog = ex.build_catalog()
    parser = ex.build_parser()

    # The sub-parsers action holds the name -> subparser mapping in `.choices`.
    subparsers_action = next(
        action for action in parser._actions
        if isinstance(action, argparse._SubParsersAction)
    )
    real: dict[str, argparse.ArgumentParser] = dict(subparsers_action.choices)
    catalog_commands = {entry["name"]: entry for entry in catalog["commands"]}

    # No fictional or undocumented commands.
    assert set(catalog_commands) == set(real)

    for name, subparser in real.items():
        documented: Set[str] = set()
        for flag in catalog_commands[name]["flags"]:
            documented.add(flag["name"])
            documented.update(flag["aliases"])
        accepted = _parser_optstrings(subparser)
        assert documented == accepted, f"flag drift in `{name}`"
@@ -173,6 +173,18 @@ def test_docx_heading_styles_drive_clause_map() -> None:
173
173
  assert [p["name"] for p in result["parties"]] == ["Initech Software, Inc.", "Globex Corporation"]
174
174
 
175
175
 
176
def test_numbered_docx_clauses() -> None:
    """Clauses carried only by w:numPr list paragraphs still produce a clause map.

    The fixture has no heading styles and no visible numbers; the deep
    (ilvl-2) numbered body sentence must not surface as a clause title.
    """
    fixture_name = "numbered_docx.docx"
    raw, text, fmt, _w = ex.load_source(FIXTURES / fixture_name, prefer_optional=False)
    result = ex.build_extraction(text, raw, fmt, "numbered_docx.docx")

    canonical_titles = {clause["canonical_title"] for clause in result["clauses"]}
    assert {"Definitions", "Confidentiality", "Governing Law"}.issubset(canonical_titles)

    for clause in result["clauses"]:
        assert "remains fully liable" not in clause["detected_title"]

    party_names = [party["name"] for party in result["parties"]]
    assert party_names[0] == "Globex Cloud, Inc."
186
+
187
+
176
188
  def test_html_extraction() -> None:
177
189
  raw, text, fmt, _w = ex.load_source(FIXTURES / "services_html.html")
178
190
  assert fmt == "html"
File without changes
File without changes
File without changes
File without changes
File without changes