extract-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
extract_cli.py ADDED
@@ -0,0 +1,1710 @@
1
+ #!/usr/bin/env python3
2
+ """extract-cli -- the open-loop front door of the contract-ops CLI suite.
3
+
4
+ The suite is a contract lifecycle (store -> draft -> review -> diff -> convert
5
+ -> sign) that, until now, only handled documents it authored from its own
6
+ templates. `extract-cli` is "passport control": it ingests ANY document --
7
+ yours or a counterparty's foreign paper -- in .md/.txt (natively), .docx, or
8
+ .pdf, and emits a structured JSON representation that the rest of the suite
9
+ (nda-review-cli, compare-cli, contract-vault) consumes.
10
+
11
+ Two extraction tiers:
12
+ * DETERMINISTIC (default, always on): parties, dates, defined-term inventory,
13
+ the CLAUSE MAP, governing law, best-effort term/notice/value. Pure
14
+ regex/structure -- no network, no LLM.
15
+ * LLM (opt-in via --llm only): the fuzzy fields (renewal mechanics,
16
+ obligation phrasing, ambiguous governing law). Always skippable; the
17
+ deterministic core is fully useful without it.
18
+
19
+ Every extracted field carries a `confidence` and a `source` in
20
+ {deterministic, llm, none} -- downstream tools treat fields as "verify, not
21
+ trust".
22
+
23
+ Stdlib-only. Single file. The clause-detection cascade (H2 -> bold-numbered ->
24
+ ALL-CAPS) and the canonical-vocabulary alias normalization are ported from
25
+ template-vault-cli so a foreign document's clauses land on the suite's shared
26
+ clause vocabulary.
27
+
28
+ Part of the contract-ops CLI suite. See docs/INTEROP.md.
29
+ """
30
+ from __future__ import annotations
31
+
32
+ import argparse
33
+ import datetime as _dt
34
+ import hashlib
35
+ import importlib.util
36
+ import json
37
+ import os
38
+ import re
39
+ import sys
40
+ import urllib.error
41
+ import urllib.request
42
+ from pathlib import Path
43
+ from typing import Any, Dict, List, Optional, Tuple
44
+
45
+ __version__ = "0.1.0"
46
+
47
+ # Bumped independently of the package version when the *extraction logic*
48
+ # changes in a way downstream consumers should notice. Embedded in `_meta`.
49
+ EXTRACTOR_VERSION = "0.1.0"
50
+
51
+ # JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
52
+ SCHEMA_VERSION = 1
53
+
54
+ JSON = Dict[str, Any]
55
+
56
+ CLI_NAME = "extract-cli"
57
+
58
+ # ---------------------------------------------------------------------------
59
+ # Streams / color (convention-shared with the suite; see docs/INTEROP.md)
60
+ # ---------------------------------------------------------------------------
61
+
62
+
63
def _color_enabled(stream: Any = None) -> bool:
    """Report whether ANSI color should be used for *stream*.

    Precedence: a non-empty ``NO_COLOR`` (https://no-color.org/) disables
    color, then a non-empty ``FORCE_COLOR`` enables it, and otherwise the
    answer is whatever ``stream.isatty()`` says. ``stream`` defaults to
    ``sys.stdout``; any failure while probing the stream counts as "no color".
    """
    env = os.environ
    if env.get("NO_COLOR"):
        return False
    if env.get("FORCE_COLOR"):
        return True
    target = sys.stdout if stream is None else stream
    try:
        verdict = bool(target.isatty())
    except Exception:
        # Objects without a usable isatty() are treated as non-terminals.
        verdict = False
    return verdict
75
+
76
+
77
def _c(text: str, code: str) -> str:
    """Wrap *text* in the ANSI SGR sequence *code* (e.g. ``"32"``) followed by
    a reset, or return it unchanged when `_color_enabled()` says no."""
    return f"\033[{code}m{text}\033[0m" if _color_enabled() else text
81
+
82
+
83
def _green(s: str) -> str:
    """Color *s* with SGR code 32 (green) when color is enabled."""
    return _c(text=s, code="32")
85
+
86
+
87
def _yellow(s: str) -> str:
    """Color *s* with SGR code 33 (yellow) when color is enabled."""
    return _c(text=s, code="33")
89
+
90
+
91
def _red(s: str) -> str:
    """Color *s* with SGR code 31 (red) when color is enabled."""
    return _c(text=s, code="31")
93
+
94
+
95
def _bold(s: str) -> str:
    """Render *s* with SGR code 1 (bold) when color is enabled."""
    return _c(text=s, code="1")
97
+
98
+
99
def _dim(s: str) -> str:
    """Render *s* with SGR code 2 (dim) when color is enabled."""
    return _c(text=s, code="2")
101
+
102
+
103
def _eprint(*args: Any, **kwargs: Any) -> None:
    """`print()` to standard error; all arguments are forwarded as given."""
    err_stream = sys.stderr
    print(*args, file=err_stream, **kwargs)
105
+
106
+
107
def _why_print(args_ns: argparse.Namespace, header: str, *lines: str) -> None:
    """Write a `--why` explanation block to **stderr** (never stdout, so piped
    output stays clean).

    Does nothing unless ``args_ns.why`` is truthy. The block is a blank line,
    a ``[why] <header>`` line, then each entry of *lines* on its own line.
    """
    if getattr(args_ns, "why", False):
        _eprint(f"\n[why] {header}")
        for detail in lines:
            _eprint(f" {detail}")
116
+
117
+
118
def _warn(args_ns: Optional[argparse.Namespace], msg: str) -> None:
    """Print ``warning: <msg>`` to stderr unless ``args_ns.silent`` is truthy.

    ``args_ns`` may be ``None`` (no CLI namespace available), in which case the
    warning is always printed.

    The "warning:" label is colored yellow only when color is enabled for
    *stderr* -- the stream the text is actually written to. Deciding on stdout
    would leak escape codes into a redirected stderr log whenever stdout is a
    terminal, and drop color on a terminal stderr whenever stdout is piped.
    """
    if args_ns is not None and getattr(args_ns, "silent", False):
        return
    label = "warning:"
    if _color_enabled(sys.stderr):
        label = f"\033[33m{label}\033[0m"
    _eprint(f"{label} {msg}")
123
+
124
+
125
+ # ---------------------------------------------------------------------------
126
+ # Errors
127
+ # ---------------------------------------------------------------------------
128
+
129
+
130
class ExtractError(Exception):
    """Error the user can act on (e.g. a missing input file).

    Per the original note, main() prints it and exits non-zero.
    """

    pass
132
+
133
+
134
+ # ---------------------------------------------------------------------------
135
+ # Clause-detection cascade (ported from template-vault-cli `template_vault_cli.py`)
136
+ #
137
+ # Tier 1: H2 headings (`## Title`) -- Markdown-native templates.
138
+ # Tier 2: bold-numbered (`**1. Purpose**`) -- typical of DOCX -> text.
139
+ # Tier 3: ALL-CAPS standalone lines -- typical of legal PDFs.
140
+ # The fallback tiers only run when the prior tier finds nothing, so they can't
141
+ # shadow real structure. Foreign clauses are then normalized onto the suite's
142
+ # canonical vocabulary via the alias index below.
143
+ # ---------------------------------------------------------------------------
144
+
145
# Tier 1 -- Markdown H2 headings. Exactly two `#` followed by a space/tab, so
# H1 (`# `) and H3+ (`### `) lines do not match. Group 1 is the heading text
# with trailing spaces/tabs excluded. MULTILINE anchors `^`/`$` per line.
H2_RE = re.compile(r"^##[ \t]+(.+?)[ \t]*$", re.MULTILINE)

# Tier 2 -- a whole line wrapped in `**...**` that starts with a numbering
# token: **1. Purpose** / **Section 4. Term** / **(1) Scope**.
# Group 1 is the title text after the numbering token and its separator.
_BOLD_HEADING_RE = re.compile(
    r"^\*\*\s*"
    r"(?:"
    r"(?:Article|Section|Sec\.?|Art\.?|Clause|Part|§)\s+\S+\.?"  # word-prefixed
    r"|"
    r"\(\d+\)"  # (1)
    r"|"
    r"\d+(?:\.\d+)*"  # 1 / 1.2.3
    r")"
    r"[\.\):\s]+"
    r"([^\*\n]+?)"
    r"\s*\*\*\s*$",
    re.MULTILINE,
)

# Tier 3 -- ALL-CAPS standalone heading: blank-line framed on both sides (so
# inline shouts in prose don't qualify); doesn't start with `[` (so
# `[BRACKETED]` placeholders never match). Single-token lines need >= 4 ASCII
# letters (enforced in _qualifies_as_all_caps_heading).
# Group 1 is the heading line. The pattern is compiled without MULTILINE, so
# `^` only matches at the very start of the text, and the match consumes the
# blank line after the heading.
# NOTE(review): because that trailing blank line is consumed, a second
# ALL-CAPS line directly after it (e.g. "ARTICLE I" / blank / "DEFINITIONS")
# cannot start its own match -- confirm whether that is intended.
_ALL_CAPS_HEADING_RE = re.compile(
    r"(?:^|\n)\n([A-Z][A-Z0-9 \-/&,]{1,}[A-Z0-9])\s*\n\n",
)
171
+
172
# Roman numerals 1-39 -- covers virtually all legal-document section numbering.
# Longer alternatives come first within each group so the regex engine doesn't
# short-circuit on a prefix match (bare V / X must still match).
_ROMAN_RE = (
    r"(?:(?:XXX|XX|X)(?:IX|IV|VIII|VII|VI|V|III|II|I)?"
    r"|IX|IV|VIII|VII|VI|V|III|II|I)"
)

# Leading numbering tokens to strip from a clause title. Order matters: longer
# Article/Section forms come before bare numbers so they're consumed first.
#
# Every alternative that ends in a numeral is closed with `\b`. The pattern is
# case-insensitive, so without that boundary the numeral could be the first
# letter(s) of an ordinary word: "Article Inventory" lost "Article I",
# "Section Violations" lost "Section V", and "401k Plan" lost "401". The
# bracketed forms `(1)` / `[1]` end in punctuation and need no boundary.
_NUMBER_PREFIX_RE = re.compile(
    r"^\s*(?:"
    r"(?:Article|Section|Sec\.?|Art\.?|Clause|Part)\s+"
    r"(?:" + _ROMAN_RE + r"|\d+(?:\.\d+)*)\b"
    r"|"
    r"§\s*\d+(?:\.\d+)*\b"
    r"|"
    r"\(\d+\)"
    r"|"
    r"\[\d+\]"
    r"|"
    r"\d+(?:\.\d+)+\b"
    r"|"
    r"\d+\b"
    r")"
    r"[\.\)\]:\s]*",
    re.IGNORECASE,
)


def _strip_clause_number(s: str) -> str:
    """Remove one leading numbering token (`1.`, `1)`, `(1)`, `[1]`, `1.2.3`,
    `Article I.`, `Section 4.`, `§ 4.2`) plus its trailing separator, then
    strip surrounding whitespace.

    A title whose first word merely begins with numeral-looking characters
    ("Article Inventory", "401k Plan") is returned unchanged apart from the
    whitespace strip.
    """
    return _NUMBER_PREFIX_RE.sub("", s, count=1).strip()
206
+
207
+
208
def _qualifies_as_all_caps_heading(title: str) -> bool:
    """Second-stage filter for ALL-CAPS heading candidates.

    A title with two or more whitespace-separated tokens always qualifies. A
    single-token title qualifies only with at least four ASCII capitals
    ('TERM' passes, 'TER' does not).
    """
    if len(title.split()) > 1:
        return True
    capitals = [ch for ch in title if "A" <= ch <= "Z"]
    return len(capitals) >= 4
215
+
216
+
217
def detect_clauses(text: str) -> List[JSON]:
    """Locate clause headings in *text* with the three-tier cascade.

    Tiers are tried in order and the first one that fires supplies all
    clauses: H2 headings (one hit is enough), then bold-numbered headings,
    then ALL-CAPS standalone lines (both fallbacks need at least two hits, to
    avoid false positives). Returns the dicts built by `_matches_to_clauses`
    -- keys title, detected, anchor, start, end, tier -- or ``[]`` when no
    tier fires.
    """
    headings = list(H2_RE.finditer(text))
    if headings:
        return _matches_to_clauses(text, headings, group=1, tier="h2")

    headings = list(_BOLD_HEADING_RE.finditer(text))
    if len(headings) > 1:
        return _matches_to_clauses(text, headings, group=1, tier="bold-numbered")

    headings = []
    for candidate in _ALL_CAPS_HEADING_RE.finditer(text):
        if _qualifies_as_all_caps_heading(candidate.group(1)):
            headings.append(candidate)
    if len(headings) > 1:
        return _matches_to_clauses(text, headings, group=1, tier="all-caps")
    return []
237
+
238
+
239
def _matches_to_clauses(text: str, matches: List["re.Match[str]"], group: int,
                        tier: str) -> List[JSON]:
    """Convert heading matches into clause dicts.

    `group` is the capture group holding the heading title. For each match:
      * title    -- captured text, stripped, with its numbering token removed
      * anchor   -- the full source line containing the captured text
      * detected -- `anchor` with surrounding whitespace stripped
      * start    -- offset of the beginning of that line
      * end      -- `.start()` of the next match, or len(text) for the last
      * tier     -- the `tier` label, passed through
    """
    text_len = len(text)
    final_index = len(matches) - 1
    clauses: List[JSON] = []
    for position, hit in enumerate(matches):
        captured = hit.group(group)
        # Find the captured text inside the match span; the ALL-CAPS pattern
        # matches leading newlines before the heading itself.
        heading_at = text.rfind(captured, hit.start(), hit.end())
        bol = text.rfind("\n", 0, heading_at) + 1
        eol = text.find("\n", bol)
        if eol == -1:
            eol = text_len
        heading_line = text[bol:eol]
        stop = text_len if position == final_index else matches[position + 1].start()
        clauses.append({
            "title": _strip_clause_number(captured.strip()),
            "detected": heading_line.strip(),
            "anchor": heading_line,
            "start": bol,
            "end": stop,
            "tier": tier,
        })
    return clauses
266
+
267
+
268
def _norm_clause_key(s: str) -> str:
    """Build the lookup key for a clause title or alias: numbering token
    removed, whitespace stripped, lowercased."""
    unnumbered = _strip_clause_number(s)
    return unnumbered.strip().lower()
271
+
272
+
273
+ # ---------------------------------------------------------------------------
274
+ # Canonical clause vocabulary
275
+ #
276
+ # template-vault-cli stores `clause_aliases` per-template (canonical_title ->
277
+ # [alias, ...]). A FOREIGN document carries no such map, so extract-cli ships a
278
+ # built-in default vocabulary -- the suite's shared clause names -- and maps a
279
+ # document's detected clause titles onto it. This is the differentiator: it
280
+ # turns "whatever the counterparty called their sections" into the canonical
281
+ # vocabulary nda-review-cli / compare-cli already speak.
282
+ # ---------------------------------------------------------------------------
283
+
284
# Canonical suite clause title -> list of lowercase aliases for it.
# _build_alias_index() flattens this into a normalized alias -> canonical map;
# a normalized alias listed under two canonicals resolves to whichever entry
# comes later in this dict, so keep aliases unique across entries.
CANONICAL_CLAUSE_ALIASES: Dict[str, List[str]] = {
    "Definitions": ["definitions", "defined terms", "interpretation", "construction"],
    "Confidentiality": [
        "confidentiality", "non-disclosure", "nondisclosure", "confidential information",
        "confidentiality obligations", "secrecy", "protection of confidential information",
    ],
    "Term": ["term", "duration", "agreement term", "term of agreement"],
    "Termination": ["termination", "term and termination", "right to terminate", "termination for cause"],
    "Governing Law": [
        "governing law", "applicable law", "choice of law", "law and jurisdiction",
        "governing law and jurisdiction",
    ],
    "Dispute Resolution": ["dispute resolution", "arbitration", "disputes", "mediation"],
    "Indemnification": ["indemnification", "indemnity", "hold harmless", "indemnities"],
    "Limitation of Liability": [
        "limitation of liability", "liability", "limitation on liability", "liability cap",
        "exclusion of liability",
    ],
    "Intellectual Property": [
        "intellectual property", "ip rights", "ownership of ip", "proprietary rights",
        "intellectual property rights", "ownership",
    ],
    "Payment": ["payment", "fees", "compensation", "fees and payment", "consideration", "pricing"],
    "Warranties": [
        "warranties", "representations and warranties", "warranty", "reps and warranties",
        "representations",
    ],
    "Assignment": ["assignment", "assignability", "assignment and delegation"],
    "Notices": ["notices", "notice"],
    "Force Majeure": ["force majeure", "acts of god"],
    "Entire Agreement": ["entire agreement", "integration", "complete agreement"],
    "Severability": ["severability", "severance"],
    "Waiver": ["waiver", "no waiver"],
    "Non-Compete": [
        "non-compete", "noncompete", "noncompetition", "non-competition",
        "covenant not to compete",
    ],
    "Non-Solicitation": ["non-solicit", "non-solicitation", "nonsolicitation", "no solicitation"],
    "Data Protection": ["data protection", "data privacy", "gdpr", "privacy", "personal data"],
    "Insurance": ["insurance"],
    "Counterparts": ["counterparts"],
    "Survival": ["survival", "survival of obligations"],
    "Amendment": ["amendment", "amendments", "modification", "modifications", "changes"],
    "Relationship of the Parties": [
        "relationship of the parties", "independent contractor", "no partnership", "no agency",
    ],
    "Compliance with Laws": ["compliance with laws", "compliance", "anti-corruption"],
    "Publicity": ["publicity", "announcements", "press releases"],
}
333
+
334
+
335
def _build_alias_index() -> Dict[str, str]:
    """Flatten CANONICAL_CLAUSE_ALIASES into {normalized label: canonical}.

    Each canonical title is indexed under its own normalized form and under
    every normalized alias. Entries are written in dict order, so a label that
    normalizes to an already-present key overwrites the earlier mapping.
    """
    index: Dict[str, str] = {}
    for canonical, aliases in CANONICAL_CLAUSE_ALIASES.items():
        for label in (canonical, *aliases):
            index[_norm_clause_key(label)] = canonical
    return index


# Built once at import time and consulted by _canonicalize_clause().
_ALIAS_INDEX = _build_alias_index()
345
+
346
+
347
def _contains_phrase(haystack: str, needle: str) -> bool:
    """True when *needle* occurs in *haystack* on word boundaries (not as a
    fragment of a longer word)."""
    pattern = r"(?<!\w)" + re.escape(needle) + r"(?!\w)"
    return re.search(pattern, haystack) is not None


def _canonicalize_clause(detected_title: str) -> Tuple[Optional[str], bool]:
    """Map a detected clause title to a canonical suite title.

    Returns ``(canonical_title, mapped)``:
      * ``(None, False)`` when the title normalizes to an empty key;
      * on an exact alias/canonical hit, the canonical name with mapped=True;
      * otherwise the canonical of the longest alias (>= 5 chars) that appears
        in the title as a whole phrase, or that contains the title (>= 5
        chars) as a whole phrase -- so 'Confidentiality and Non-Disclosure'
        still maps;
      * failing all that, a Title-Cased copy of the title with mapped=False.

    Containment is checked on word boundaries and short titles are excluded
    from the reverse direction. Raw substring tests mapped "Price
    Determination" to Termination and "Service Reliability" to Limitation of
    Liability, and sent the title "IP" to "Relationship of the Parties"
    because "ip" occurs inside "relationship".
    """
    key = _norm_clause_key(detected_title)
    if not key:
        return None, False
    canon = _ALIAS_INDEX.get(key)
    if canon is not None:
        return canon, True

    best: Optional[str] = None
    best_len = 0
    # Only titles of a meaningful length may match *inside* an alias.
    key_can_be_contained = len(key) >= 5
    for alias_key, canonical in _ALIAS_INDEX.items():
        # Strictly-longer requirement keeps the first of equally long aliases.
        if len(alias_key) < 5 or len(alias_key) <= best_len:
            continue
        if _contains_phrase(key, alias_key) or (
            key_can_be_contained and _contains_phrase(alias_key, key)
        ):
            best, best_len = canonical, len(alias_key)
    if best is not None:
        return best, True
    return _titlecase(detected_title), False
370
+
371
+
372
+ # ---------------------------------------------------------------------------
373
+ # Confidence model + field envelope
374
+ # ---------------------------------------------------------------------------
375
+
376
+
377
def _field(value: Any, confidence: float, source: str = "deterministic") -> JSON:
    """Build the ``{value, confidence, source}`` envelope for an extracted field.

    A ``None`` value yields the 'not found' envelope (confidence 0.0, source
    "none") whatever the other arguments are; otherwise confidence is coerced
    to float and rounded to two decimals.
    """
    envelope: JSON = {"value": None, "confidence": 0.0, "source": "none"}
    if value is not None:
        envelope = {
            "value": value,
            "confidence": round(float(confidence), 2),
            "source": source,
        }
    return envelope
383
+
384
+
385
def _none_field() -> JSON:
    """Return a fresh 'not found' envelope: no value, zero confidence, source
    "none"."""
    return dict(value=None, confidence=0.0, source="none")
387
+
388
+
389
def _titlecase(s: str) -> str:
    """Title-case *s* word by word and collapse runs of whitespace.

    A fully upper-case input (e.g. a shouted PDF heading) is title-cased
    outright. In a mixed-case input, an all-caps word of up to four characters
    is kept as a deliberate acronym ("IP Rights"). Whitespace-only input
    returns "".
    """
    text = s.strip()
    if not text:
        return text
    shouted = text.isupper()

    def recase(word: str) -> str:
        keep_acronym = (not shouted) and word.isupper() and len(word) <= 4
        if keep_acronym:
            return word
        return word[:1].upper() + word[1:].lower()

    return " ".join(recase(word) for word in text.split())
404
+
405
+
406
+ # ---------------------------------------------------------------------------
407
+ # Deterministic extractors
408
+ # ---------------------------------------------------------------------------
409
+
410
# Alternation of English month names and abbreviations, spliced into the date
# patterns below (all of which are compiled case-insensitively).
_MONTHS = (
    "January|February|March|April|May|June|July|August|September|October|"
    "November|December|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec"
)
# One date in any of four shapes: ISO `YYYY-MM-DD`; slash-separated numeric
# `D/M/Y`-style (field order is not decided here); "Month D[th][,] YYYY"; and
# "D[th] [day of] Month[,] YYYY". Kept as a plain string so it can be embedded
# in the keyword-anchored patterns below.
_DATE_PAT = (
    r"(?:"
    r"\d{4}-\d{2}-\d{2}"
    r"|\d{1,2}/\d{1,2}/\d{2,4}"
    r"|(?:" + _MONTHS + r")\.?\s+\d{1,2}(?:st|nd|rd|th)?,?\s+\d{4}"
    r"|\d{1,2}(?:st|nd|rd|th)?\s+(?:day\s+of\s+)?(?:" + _MONTHS + r")\.?,?\s+\d{4}"
    r")"
)
_DATE_RE = re.compile(_DATE_PAT, re.IGNORECASE)

# An "effective / dated / made / entered into" keyword phrase followed by a
# date. Group 1 is the date text.
_EFFECTIVE_RE = re.compile(
    r"(?:effective(?:\s+date)?(?:\s+(?:as\s+of|date|on))?|"
    r"dated(?:\s+as\s+of)?|"
    r"made(?:\s+and\s+entered\s+into)?(?:\s+as\s+of|\s+on)?|"
    r"entered\s+into(?:\s+as\s+of|\s+on)?)"
    r"[\s:,]+(?:the\s+)?(" + _DATE_PAT + r")",
    re.IGNORECASE,
)
# An "expires / terminates on / ends on / until / through" keyword followed by
# a date. Group 1 is the date text.
_EXPIRE_RE = re.compile(
    r"(?:expir\w*|terminat\w*\s+on|end(?:s|ing)?\s+on|until|through|"
    r"remain\s+in\s+effect\s+until)"
    r"[\s:,]+(?:the\s+)?(" + _DATE_PAT + r")",
    re.IGNORECASE,
)
438
+
439
# "[by and] between <party 1> and <party 2>". Groups 1 and 2 are the raw party
# texts (2-200 chars each, lazily matched; DOTALL lets them span lines). The
# second party ends at sentence punctuation, a newline, one of the listed
# follow-on keywords, or end of text.
_PARTY_BLOCK_RE = re.compile(
    r"\b(?:by\s+and\s+between|between)\s+(.{2,200}?)\s+\band\b\s+(.{2,200}?)"
    r"(?=[\.;\n]|\bwhereas\b|\beffective\b|\bdated\b|\bhaving\b|\bwith\s+offices\b|$)",
    re.IGNORECASE | re.DOTALL,
)
# A parenthetical such as (the "Disclosing Party"). Group 1 is the text inside,
# without the optional leading "the" and optional straight/curly quotes.
_ROLE_PAREN_RE = re.compile(
    r"\(\s*(?:the\s+)?[\"“]?([^\"”()]+?)[\"”]?\s*\)"
)

# Keyword portion is case-insensitive via an inline (?i:...) group; the
# jurisdiction capture stays case-sensitive so a leading [A-Z] actually
# enforces a capitalized proper noun (a global re.IGNORECASE would defeat that
# and over-capture trailing lowercase clauses like ", without regard to ...").
# Group 1 is the jurisdiction, optionally with one ", <Capitalized part>".
# NOTE(review): the lookahead stops the lazy capture at " and", so "England
# and Wales" would be captured as "England" -- confirm that is acceptable.
_GOV_LAW_RE = re.compile(
    r"(?i:governed\s+by(?:\s+and\s+construed\s+in\s+accordance\s+with)?\s+"
    r"(?:the\s+)?laws?\s+of\s+(?:the\s+)?)"
    r"([A-Z][A-Za-z\.\- ]+?(?:,\s*[A-Z][A-Za-z\.\- ]+?)?)"
    r"(?=[\.,;\n)]|\s+and\b|\s+without\b|$)",
)
458
+
459
# Anchor on a term/period/duration keyword, then allow a short same-sentence
# gap before the "<number> <unit>" so phrasings like "the initial term of this
# Agreement is three (3) years" match as well as "for a period of two years".
# Group 1 is the count (digits or a single alphabetic word); group 2 is the
# unit. An optional "(N)" restatement between them is matched, not captured.
_TERM_LEN_RE = re.compile(
    r"(?:(?:initial\s+)?term|period|duration|"
    r"in\s+(?:full\s+)?(?:force\s+and\s+)?effect\s+for)"
    r"[^.\n]{0,40}?\b(\d+|[A-Za-z]+)(?:\s*\(\d+\))?\s+(years?|months?|weeks?|days?)\b",
    re.IGNORECASE,
)
# "<count> [(N)] day(s)['s] [prior] [written] notice". Group 1 is the count
# (digits or a single alphabetic word).
_NOTICE_RE = re.compile(
    r"(\d+|[A-Za-z]+)(?:\s*\(\d+\))?\s+days?[’'`]?s?\s+"
    r"(?:prior\s+)?(?:written\s+)?notice",
    re.IGNORECASE,
)
# Phrases indicating that the agreement renews automatically.
_AUTORENEW_POS_RE = re.compile(
    r"automatic(?:ally)?\s+renew|auto-?renew|renew(?:s|ed)?\s+automatically|"
    r"successive\s+(?:\d+|[A-Za-z]+)[\s-]+(?:year|month)|"
    r"shall\s+(?:automatically\s+)?renew\s+for",
    re.IGNORECASE,
)
# Strong negations only. Deliberately excludes a bare "non-renewal", which in
# practice appears in "...notice of non-renewal" -- the opt-OUT mechanism of a
# contract that DOES auto-renew, not a statement that it doesn't.
_AUTORENEW_NEG_RE = re.compile(
    r"(?:shall|will|does|may)\s+not\s+(?:automatically\s+)?renew|"
    r"no\s+automatic\s+renewal|"
    r"not\s+(?:be\s+)?renewed?\s+automatically|"
    r"shall\s+not\s+(?:be\s+)?(?:automatically\s+)?renewed?",
    re.IGNORECASE,
)
489
# A currency marker ($, US$, USD, EUR, €, £, GBP) followed by an amount and an
# optional magnitude suffix. The whole match is the value (no groups).
#
# The amount is either comma-grouped ("1,000,000") or a plain run of digits
# ("1000000"); with only `\d{1,3}` an ungrouped amount was cut after three
# digits ("$5000" -> "$500"). The suffix must end on a word boundary so the
# one-letter forms "m" / "k" cannot be taken from the start of the next word
# ("$100 monthly" used to match as "$100 m").
_MONEY_RE = re.compile(
    r"(?:\$|US\$|USD\s?|EUR\s?|€|£|GBP\s?)"
    r"\s?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?"
    r"(?:\s?(?:million|billion|thousand|bn|m|k)\b)?",
    re.IGNORECASE,
)
495
# A capitalized term inside straight or curly double quotes, e.g.
# "Confidential Information". Group 1 is the term (3-62 characters).
_DEFTERM_QUOTED_RE = re.compile(
    r"[\"“]([A-Z][A-Za-z0-9][A-Za-z0-9 \-'/&]{1,60})[\"”]"
)
# A capitalized term inside parentheses, optionally preceded by "the" and
# optionally quoted, e.g. (the "Agreement"). Group 1 is the term (3-42
# characters).
_DEFTERM_PAREN_RE = re.compile(
    r"\(\s*(?:the\s+)?[\"“]?([A-Z][A-Za-z0-9][A-Za-z0-9 \-'/&]{1,40})[\"”]?\s*\)"
)
501
+
502
# English number words understood by _word_to_int(). Covers 0-19, the tens up
# to ninety, and "hundred". Sixteen through nineteen were previously missing,
# so "eighteen (18) months" could not be parsed from its word form.
_WORD_NUMBERS: Dict[str, int] = {
    "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10, "eleven": 11,
    "twelve": 12, "thirteen": 13, "fourteen": 14, "fifteen": 15,
    "sixteen": 16, "seventeen": 17, "eighteen": 18, "nineteen": 19,
    "twenty": 20, "thirty": 30, "forty": 40, "fifty": 50, "sixty": 60,
    "seventy": 70, "eighty": 80, "ninety": 90, "hundred": 100,
}


def _word_to_int(token: str) -> Optional[int]:
    """Convert a digit string or an English number word to an int.

    Accepts decimal digits ("45"), any word in _WORD_NUMBERS ("thirty"), and a
    tens-plus-ones compound joined by a hyphen or whitespace ("twenty-four",
    "forty five"). Matching ignores case and surrounding whitespace. Returns
    ``None`` for anything else.
    """
    token = token.strip().lower()
    # isdecimal(), not isdigit(): the latter accepts characters such as "²"
    # that int() then rejects with ValueError.
    if token.isdecimal():
        return int(token)
    direct = _WORD_NUMBERS.get(token)
    if direct is not None:
        return direct
    parts = re.split(r"[\s-]+", token)
    if len(parts) == 2:
        tens = _WORD_NUMBERS.get(parts[0])
        ones = _WORD_NUMBERS.get(parts[1])
        if (
            tens is not None and ones is not None
            and 20 <= tens <= 90 and tens % 10 == 0
            and 1 <= ones <= 9
        ):
            return tens + ones
    return None
516
+
517
+
518
def _parse_date_to_iso(s: str) -> Optional[str]:
    """Best-effort normalization of a matched date string to ISO (YYYY-MM-DD).

    The text is cleaned first -- ordinal suffixes, "day of", commas, the period
    after a month abbreviation, and the four-letter "Sept" are normalized --
    then tried against a fixed list of strptime formats. Slash dates are tried
    month-first, then day-first. Returns ``None`` when no format parses.
    """
    cleaned = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", s.strip().rstrip("."), flags=re.IGNORECASE)
    cleaned = re.sub(r"\bday\s+of\s+", "", cleaned, flags=re.IGNORECASE)
    # The date patterns accept "Jan." and "Sept"/"Sept.", but %b only parses
    # the bare three-letter form, so such dates used to fall through unparsed.
    cleaned = re.sub(r"(?<=[A-Za-z])\.", "", cleaned)
    cleaned = re.sub(r"\bSept\b", "Sep", cleaned, flags=re.IGNORECASE)
    cleaned = cleaned.replace(",", " ")
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    fmts = (
        "%Y-%m-%d", "%B %d %Y", "%b %d %Y", "%d %B %Y", "%d %b %Y",
        "%m/%d/%Y", "%m/%d/%y", "%d/%m/%Y",
    )
    for f in fmts:
        try:
            return _dt.datetime.strptime(cleaned, f).date().isoformat()
        except ValueError:
            continue
    return None
535
+
536
+
537
def _date_field(match: Optional["re.Match[str]"]) -> JSON:
    """Turn a keyword-anchored date match into a field envelope.

    No match -> the 'not found' envelope. Group 1 (the date text) is reported
    as an ISO date at confidence 0.85 when it parses, otherwise as the raw
    matched text at 0.55.
    """
    if match is None:
        return _none_field()
    matched_text = match.group(1).strip()
    iso_date = _parse_date_to_iso(matched_text)
    if iso_date is None:
        return _field(matched_text, 0.55)
    return _field(iso_date, 0.85)
545
+
546
+
547
def _split_name_role(s: str) -> Tuple[str, Optional[str]]:
    """Split a raw party string into ``(name, role)``.

    The first parenthetical is taken as the role when it is at most 40
    characters and not just an article ("a", "an", "the"); it is then cut out
    of the name. The name is finally stripped of surrounding commas, quotes
    and whitespace, with internal whitespace collapsed. ``role`` is ``None``
    when no suitable parenthetical is found.
    """
    name = s.strip().strip(",").strip()
    role: Optional[str] = None
    paren = _ROLE_PAREN_RE.search(name)
    if paren is not None:
        label = paren.group(1).strip()
        # Only treat short, role-like parentheticals as roles.
        role_like = len(label) <= 40 and label.lower() not in ("a", "an", "the")
        if role_like:
            role = label
            name = name[: paren.start()] + name[paren.end():]
            name = name.strip().rstrip(",").strip()
    name = name.strip("\"“”").strip()
    return re.sub(r"\s+", " ", name), role
560
+
561
+
562
def extract_parties(text: str) -> List[JSON]:
    """Extract the two parties from the first "[by and] between X and Y" block.

    Returns up to two dicts with keys name, confidence (0.9), source
    ("deterministic") and role (``None`` when no role parenthetical was
    found). A side whose cleaned name is empty, shorter than 2 or longer than
    120 characters is dropped. Returns ``[]`` when no party block is found.
    """
    block_match = _PARTY_BLOCK_RE.search(text)
    if block_match is None:
        return []
    parties: List[JSON] = []
    for side in block_match.group(1, 2):
        # Party names can wrap across lines; collapse whitespace rather than
        # truncating at the first newline.
        flattened = re.sub(r"\s+", " ", side).strip()
        name, role = _split_name_role(flattened)
        if not name or not (2 <= len(name) <= 120):
            continue
        parties.append({
            "name": name,
            "confidence": 0.9,
            "source": "deterministic",
            "role": role,
        })
    return parties
578
+
579
+
580
def extract_dates(text: str) -> JSON:
    """Return ``{"effective": <field>, "expiration": <field>}``, each built
    from the first keyword-anchored date of its kind found in *text*."""
    effective_hit = _EFFECTIVE_RE.search(text)
    expiration_hit = _EXPIRE_RE.search(text)
    return {
        "effective": _date_field(effective_hit),
        "expiration": _date_field(expiration_hit),
    }
585
+
586
+
587
def extract_governing_law(text: str) -> JSON:
    """Return the governing-law jurisdiction as a field envelope.

    Uses the first `_GOV_LAW_RE` match; the captured jurisdiction is trimmed
    of trailing "." / "," and has its whitespace collapsed. Confidence is
    0.85. No match, or an empty jurisdiction after cleanup, gives the 'not
    found' envelope.
    """
    hit = _GOV_LAW_RE.search(text)
    if hit is None:
        return _none_field()
    jurisdiction = hit.group(1).strip().rstrip(".,")
    jurisdiction = re.sub(r"\s+", " ", jurisdiction).strip()
    return _field(jurisdiction, 0.85) if jurisdiction else _none_field()
595
+
596
+
597
def extract_term(text: str) -> JSON:
    """Best-effort term mechanics: length, auto-renewal, notice period.

    Returns ``{"length", "auto_renew", "notice_period_days"}``, each a field
    envelope:
      * length -- "<n> <unit>[s]" at 0.7 from the first `_TERM_LEN_RE` match
        whose count can be read as a number; failing that, the raw
        "<word> <unit>" text of the first match at 0.5.
      * notice_period_days -- int days at 0.7 from the first `_NOTICE_RE`
        match whose count can be read as a number.
      * auto_renew -- False (0.7) if a strong negation is present, else True
        (0.65) if a renewal phrase is present, else 'not found'.

    A parenthetical numeral restating the count ("twenty-four (24) months")
    takes precedence over the captured word, because the pattern captures only
    the last word ("four"). Matches whose count cannot be read are skipped, so
    one stray "business days' notice" no longer hides a later "thirty (30)
    days' notice".
    """

    def _count(m: "re.Match[str]") -> Optional[int]:
        # The optional "(N)" sits directly after group 1 inside the match.
        restated = re.match(r"\s*\((\d+)\)", m.string[m.end(1):m.end()])
        if restated:
            return int(restated.group(1))
        return _word_to_int(m.group(1))

    length = _none_field()
    first_unreadable: Optional["re.Match[str]"] = None
    for m in _TERM_LEN_RE.finditer(text):
        num = _count(m)
        if num is not None:
            unit = m.group(2).lower().rstrip("s")
            length = _field(f"{num} {unit}{'s' if num != 1 else ''}", 0.7)
            break
        if first_unreadable is None:
            first_unreadable = m
    else:
        # No readable count anywhere: report the first raw phrase, low confidence.
        if first_unreadable is not None:
            length = _field(
                f"{first_unreadable.group(1)} {first_unreadable.group(2)}".strip(), 0.5
            )

    notice = _none_field()
    for nm in _NOTICE_RE.finditer(text):
        days = _count(nm)
        if days is not None:
            notice = _field(days, 0.7)
            break

    auto = _none_field()
    if _AUTORENEW_NEG_RE.search(text):
        auto = _field(False, 0.7)
    elif _AUTORENEW_POS_RE.search(text):
        auto = _field(True, 0.65)

    return {"length": length, "auto_renew": auto, "notice_period_days": notice}
622
+
623
+
624
def extract_value(text: str) -> JSON:
    """Return the first monetary amount in *text* as a field envelope
    (whitespace collapsed, confidence 0.6), or 'not found' when none matches."""
    hit = _MONEY_RE.search(text)
    if hit is None:
        return _none_field()
    amount = re.sub(r"\s+", " ", hit.group(0).strip())
    return _field(amount, 0.6)
629
+
630
+
631
def extract_defined_terms(text: str, max_terms: int = 50) -> List[JSON]:
    """Inventory the defined terms in *text*.

    Collects capitalized terms that appear in double quotes or in a
    parenthetical, de-duplicated in first-seen order (quoted terms first).
    Captures of more than six words are rejected as sentence-like. Each entry
    is ``{"term", "confidence": 0.6, "source": "deterministic"}``.

    At most *max_terms* (default 50) entries are returned. The cap is checked
    across both patterns, so the parenthetical pass cannot add terms once the
    quoted pass has filled the quota.
    """
    seen: Dict[str, None] = {}
    for rx in (_DEFTERM_QUOTED_RE, _DEFTERM_PAREN_RE):
        if len(seen) >= max_terms:
            break
        for m in rx.finditer(text):
            term = re.sub(r"\s+", " ", m.group(1).strip())
            # Reject sentence-like or lowercase-y captures.
            if len(term) < 2 or len(term.split()) > 6:
                continue
            if not term[0].isupper():
                continue
            seen.setdefault(term, None)
            if len(seen) >= max_terms:
                break
    return [{"term": t, "confidence": 0.6, "source": "deterministic"} for t in seen]
645
+
646
+
647
def extract_clauses(text: str) -> List[JSON]:
    """Build the clause map: detected headings projected onto the canonical
    vocabulary.

    Each entry carries canonical_title, detected_title, tier, span
    ({start, end}), confidence, source ("deterministic") and mapped.
    Confidence is a per-tier base (0.95 h2 / 0.85 bold-numbered / 0.75
    all-caps / 0.95 explicit / 0.7 for any other tier), multiplied by 0.75
    when the title did not map onto a canonical name, rounded to two decimals.
    """
    tier_base = {"h2": 0.95, "bold-numbered": 0.85, "all-caps": 0.75, "explicit": 0.95}

    def describe(clause: JSON) -> JSON:
        canonical, mapped = _canonicalize_clause(clause["title"])
        tier = clause["tier"]
        unmapped_penalty = 1.0 if mapped else 0.75
        return {
            "canonical_title": canonical,
            "detected_title": clause["detected"],
            "tier": tier,
            "span": {"start": int(clause["start"]), "end": int(clause["end"])},
            "confidence": round(tier_base.get(tier, 0.7) * unmapped_penalty, 2),
            "source": "deterministic",
            "mapped": mapped,
        }

    return [describe(clause) for clause in detect_clauses(text)]
664
+
665
+
666
def extract_title(text: str, path: Optional[Path], fmt: str) -> Optional[str]:
    """Pick a document title.

    Preference order: the first Markdown H1 (`# Title`) anywhere in the text;
    else the first non-empty line with leading `#` removed, if it is at most
    90 characters; else a title-cased form of the file stem (underscores and
    hyphens turned into spaces); else ``None`` when no path is given.
    `fmt` is accepted but not consulted.
    """
    h1 = re.search(r"^#\s+(.+?)\s*$", text, re.MULTILINE)
    if h1 is not None:
        return h1.group(1).strip()

    stripped_lines = (raw_line.strip().lstrip("#").strip() for raw_line in text.splitlines())
    first_line = next((candidate for candidate in stripped_lines if candidate), None)
    # Only the first non-empty line is considered; a long one is not a title.
    if first_line is not None and len(first_line) <= 90:
        return first_line

    if path is None:
        return None
    return _titlecase(path.stem.replace("_", " ").replace("-", " "))
679
+
680
+
681
+ # ---------------------------------------------------------------------------
682
+ # Input readers
683
+ # ---------------------------------------------------------------------------
684
+
685
+
686
def _detect_format(path: Path, raw: bytes) -> str:
    """Classify an input as "markdown", "text", "docx" or "pdf".

    The file extension (case-insensitive) decides when it is a known one;
    otherwise the leading bytes are sniffed (`%PDF` -> pdf, `PK` zip signature
    -> docx), and anything else is treated as plain text.
    """
    by_extension = {
        ".md": "markdown",
        ".markdown": "markdown",
        ".txt": "text",
        ".docx": "docx",
        ".pdf": "pdf",
    }
    known = by_extension.get(path.suffix.lower())
    if known is not None:
        return known
    if raw[:4] == b"%PDF":
        return "pdf"
    if raw[:2] == b"PK":
        return "docx"
    return "text"
701
+
702
+
703
def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[str, List[str]]:
    """Extract text from a .docx; returns ``(text, warnings)``.

    When `prefer_optional` is true and a `docx` module (python-docx) is
    importable, it is used: every paragraph becomes one block (wrapped in
    `**...**` when all of its non-blank runs are bold), followed by each
    non-empty table cell. If that path raises, a warning is recorded and the
    stdlib zip/XML reader is used instead. If the stdlib reader also fails,
    the text is "" and a second warning is recorded.

    `prefer_optional=False` goes straight to the stdlib reader regardless of
    what's installed -- used to pin reproducible golden fixtures.
    """
    warnings: List[str] = []
    use_python_docx = prefer_optional and importlib.util.find_spec("docx") is not None
    if use_python_docx:
        try:
            document_cls = getattr(importlib.import_module("docx"), "Document")
            doc = document_cls(str(path))
            blocks: List[str] = []
            for para in doc.paragraphs:
                body = (para.text or "").strip()
                fully_bold = all(
                    getattr(r, "bold", False) for r in para.runs if (r.text or "").strip()
                )
                if body and para.runs and fully_bold:
                    body = f"**{body}**"
                blocks.append(body)
            for table in getattr(doc, "tables", []):
                for row in table.rows:
                    cell_texts = ((cell.text or "").strip() for cell in row.cells)
                    blocks.extend(ct for ct in cell_texts if ct)
            return "\n\n".join(blocks), warnings
        except Exception as e:  # pragma: no cover - fidelity path
            warnings.append(f"python-docx read failed ({e}); falling back to stdlib reader")
    try:
        stdlib_text = _read_docx_stdlib(raw)
    except Exception as e:
        warnings.append(f"could not parse .docx ({e}); treating as empty")
        return "", warnings
    return stdlib_text, warnings
736
+
737
+
738
def _read_docx_stdlib(raw: bytes) -> str:
    """Stdlib-only .docx reader: unzip `word/document.xml` and flatten it.

    Each `w:p` paragraph (including those inside table cells) becomes one
    block; blocks are joined with blank lines, and an empty paragraph
    contributes an empty block. A paragraph whose text-bearing runs are all
    bold is wrapped in `**...**` so the bold-numbered clause tier can see it.

    Bold is a toggle property: `<w:b/>` and `<w:b w:val="true"/>` turn it on,
    while `<w:b w:val="0|false|off"/>` explicitly turns it off. The mere
    presence of `w:b` is therefore not treated as bold, which used to produce
    spurious `**...**` headings.

    Raises whatever zipfile / ElementTree raise on malformed input (including
    KeyError when `word/document.xml` is absent).
    """
    import io
    import zipfile
    import xml.etree.ElementTree as ET

    w = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    with zipfile.ZipFile(io.BytesIO(raw)) as z:
        xml = z.read("word/document.xml")
    # NOTE(review): ElementTree is not hardened against maliciously
    # constructed XML; foreign documents are untrusted input.
    root = ET.fromstring(xml)

    def run_is_bold(run: Any) -> bool:
        rpr = run.find(w + "rPr")
        if rpr is None:
            return False
        b = rpr.find(w + "b")
        if b is None:
            return False
        val = (b.get(w + "val") or "true").strip().lower()
        return val not in ("0", "false", "off")

    paras: List[str] = []
    # iter over w:p in document order (includes paragraphs inside table cells).
    for p in root.iter(w + "p"):
        run_texts: List[str] = []
        any_text = False
        all_bold = True
        for r in p.iter(w + "r"):
            txt = "".join(t.text or "" for t in r.iter(w + "t"))
            if not txt:
                continue
            any_text = True
            if not run_is_bold(r):
                all_bold = False
            run_texts.append(txt)
        line = "".join(run_texts).strip()
        if not line:
            paras.append("")
            continue
        if any_text and all_bold:
            line = f"**{line}**"
        paras.append(line)
    return "\n\n".join(paras)
770
+
771
+
772
def _read_pdf(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[str, List[str]]:
    """Extract text from a .pdf; returns ``(text, warnings)``.

    When `prefer_optional` is true and `pypdf` is importable, each page's
    extracted text is joined with blank lines. If that path raises, a warning
    is recorded and the stdlib best-effort reader is used instead. If the
    stdlib reader also fails, the text is "" and a second warning is recorded.
    This function does not itself warn when the extracted text is empty.

    `prefer_optional=False` goes straight to the stdlib reader regardless of
    what's installed -- used to pin reproducible golden fixtures.
    """
    warnings: List[str] = []
    if prefer_optional and importlib.util.find_spec("pypdf") is not None:
        try:
            import io
            reader_cls = getattr(importlib.import_module("pypdf"), "PdfReader")
            reader = reader_cls(io.BytesIO(raw))
            page_texts = [page.extract_text() or "" for page in reader.pages]
            return "\n\n".join(page_texts), warnings
        except Exception as e:  # pragma: no cover - fidelity path
            warnings.append(f"pypdf read failed ({e}); falling back to stdlib reader")
    try:
        return _read_pdf_stdlib(raw), warnings
    except Exception as e:
        warnings.append(f"could not parse .pdf ({e}); treating as empty")
        return "", warnings
796
+
797
+
798
# Tokens the stdlib PDF reader cares about inside a content stream: a
# parenthesized string literal (backslash escapes allowed, no nested
# parentheses), a bracketed array, and the operators Tj / TJ / Td / TD / T* /
# BT / ET / ' / ". Everything else in the stream is skipped by finditer().
# NOTE(review): the operator alternatives are not delimited, so e.g. "ET"
# inside a longer token would also match -- acceptable for a best-effort
# reader, but confirm.
_PDF_TOKEN_RE = re.compile(
    r"\((?:\\.|[^\\()])*\)|\[(?:\\.|[^\]\\])*\]|Tj|TJ|Td|TD|T\*|BT|ET|'|\""
)
801
+
802
+
803
def _pdf_unescape(s: str) -> str:
    """Decode the backslash escapes of a PDF string literal body.

    ``\\(``, ``\\)`` and ``\\\\`` give the literal character; ``\\n``,
    ``\\r``, ``\\t`` give the control character; ``\\b`` and ``\\f`` are
    dropped; one to three octal digits give the character with that code
    (masked to 8 bits); a backslash before any other character yields that
    character. A lone trailing backslash is kept as is.
    """
    named = {"n": "\n", "r": "\r", "t": "\t", "b": "", "f": ""}

    def decode(escape: "re.Match[str]") -> str:
        octal, other = escape.group(1), escape.group(2)
        if octal is not None:
            return chr(int(octal, 8) & 0xFF)
        # Unlisted characters (including "(", ")" and "\\") map to themselves.
        return named.get(other, other)

    # DOTALL so a backslash followed by a newline is also consumed as a pair.
    return re.sub(r"\\(?:([0-7]{1,3})|(.))", decode, s, flags=re.DOTALL)
834
+
835
+
836
def _pdf_text_from_content(content: bytes) -> str:
    """Best-effort text extraction from one decoded PDF content stream.

    The stream is decoded as latin-1 and scanned with `_PDF_TOKEN_RE`. String
    operands (bare ``(...)`` literals or the literals inside a ``[...]``
    array) are collected; positioning operators (Td, TD, T*, BT, ET) end the
    current output line. Returns the lines joined with ``\\n``.

    The ``'`` and ``"`` operators mean "move to the next line, THEN show the
    string", so their operand starts a new line instead of being glued to the
    end of the previous one.
    """
    s = content.decode("latin-1", "replace")
    lines: List[str] = []
    cur: List[str] = []      # text already placed on the current line
    pending: List[str] = []  # string operands seen since the last operator

    def commit() -> None:
        # Move collected operands onto the current line, in order.
        cur.extend(pending)
        pending.clear()

    def flush() -> None:
        # Operands that never met a show operator are still kept (lenient).
        commit()
        if cur:
            lines.append("".join(cur))
            cur.clear()

    for m in _PDF_TOKEN_RE.finditer(s):
        tok = m.group(0)
        if tok.startswith("("):
            pending.append(_pdf_unescape(tok[1:-1]))
        elif tok.startswith("["):
            for sm in re.finditer(r"\((?:\\.|[^\\()])*\)", tok):
                pending.append(_pdf_unescape(sm.group(0)[1:-1]))
        elif tok in ("Tj", "TJ"):
            commit()
        elif tok in ("'", '"'):
            # Next-line-then-show: close the line built so far, then start the
            # new line with this operator's operand(s).
            shown = list(pending)
            pending.clear()
            flush()
            cur.extend(shown)
        elif tok in ("Td", "TD", "T*", "BT", "ET"):
            flush()
    flush()
    return "\n".join(lines)
857
+
858
+
859
+ def _read_pdf_stdlib(raw: bytes) -> str:
860
+ import zlib
861
+
862
+ chunks: List[str] = []
863
+ idx = 0
864
+ while True:
865
+ s = raw.find(b"stream", idx)
866
+ if s == -1:
867
+ break
868
+ e = raw.find(b"endstream", s)
869
+ if e == -1:
870
+ break
871
+ body = raw[s + len(b"stream"):e].lstrip(b"\r\n")
872
+ try:
873
+ content = zlib.decompress(body)
874
+ except Exception:
875
+ content = body
876
+ chunks.append(_pdf_text_from_content(content))
877
+ idx = e + len(b"endstream")
878
+ return "\n".join(c for c in chunks if c.strip())
879
+
880
+
881
def load_source(path: Path, prefer_optional: bool = True) -> Tuple[bytes, str, str, List[str]]:
    """Read a document from disk. Returns (raw_bytes, text, format, warnings).

    Parse trouble never raises -- it degrades to empty text with a warning.
    Problems with the path itself (missing, a directory, unreadable) raise
    `ExtractError`, so the CLI reports them as a clean error.

    `prefer_optional=False` forces the stdlib readers for .docx/.pdf so output
    is reproducible regardless of which extras are installed (used by the
    golden fixtures). The CLI default (True) uses the best reader available.
    """
    if not path.exists():
        raise ExtractError(f"no such file: {path}")
    if path.is_dir():
        raise ExtractError(f"path is a directory, not a file: {path}")
    try:
        raw = path.read_bytes()
    except OSError as e:
        # Permission/IO failures would otherwise surface as a raw traceback.
        raise ExtractError(f"could not read {path}: {e}") from e
    fmt = _detect_format(path, raw)
    warnings: List[str] = []
    if fmt == "docx":
        text, w = _read_docx(path, raw, prefer_optional)
        warnings += w
    elif fmt == "pdf":
        text, w = _read_pdf(path, raw, prefer_optional)
        warnings += w
    else:
        # markdown / text are native; any other label is decoded the same way.
        text = raw.decode("utf-8", "replace")
    if not text.strip():
        warnings.append(
            f"no extractable text from {fmt} input (scanned or image-only?); "
            "output will be sparse"
        )
    return raw, text, fmt, warnings
911
+
912
+
913
+ # ---------------------------------------------------------------------------
914
+ # Extraction orchestration
915
+ # ---------------------------------------------------------------------------
916
+
917
+
918
def build_extraction(text: str, raw: bytes, fmt: str,
                     source_path: Optional[str]) -> JSON:
    """Assemble the output contract object from the deterministic extractors.

    `raw` is only hashed (sha256); every field is derived from `text`. The
    returned dict always records the deterministic tier alone in `_meta`.
    """
    digest = hashlib.sha256(raw).hexdigest()
    origin = Path(source_path) if source_path else None
    result: JSON = {
        "document": {
            "title": extract_title(text, origin, fmt),
            "format": fmt,
            "sha256": digest,
            "source_path": source_path,
        },
    }
    # Key order here is the key order of the emitted JSON.
    extractors = (
        ("parties", extract_parties),
        ("dates", extract_dates),
        ("term", extract_term),
        ("governing_law", extract_governing_law),
        ("clauses", extract_clauses),
        ("defined_terms", extract_defined_terms),
        ("value", extract_value),
    )
    for key, extractor in extractors:
        result[key] = extractor(text)
    result["_meta"] = {
        "extractor_version": EXTRACTOR_VERSION,
        "tiers_used": ["deterministic"],
        "llm_used": False,
    }
    return result
942
+
943
+
944
+ def _is_low_signal(result: JSON) -> bool:
945
+ """True when the deterministic tier found essentially nothing extractable
946
+ (e.g. a scanned PDF). Used to set a non-zero exit code as a 'finding'."""
947
+ if result["parties"]:
948
+ return False
949
+ if result["clauses"]:
950
+ return False
951
+ if result["dates"]["effective"]["source"] != "none":
952
+ return False
953
+ if result["governing_law"]["source"] != "none":
954
+ return False
955
+ if result["defined_terms"]:
956
+ return False
957
+ return True
958
+
959
+
960
+ # ---------------------------------------------------------------------------
961
+ # LLM tier (opt-in only, never in a hot path)
962
+ # ---------------------------------------------------------------------------
963
+
964
# Lookup order for the suite-shared LLM config: per-user file first, then a
# file relative to the current working directory.
LLM_CONFIG_PATHS = (
    Path.home() / ".config" / "contract-ops" / "llm.json",
    Path("config") / "llm.json",
)


def load_llm_config() -> Optional[JSON]:
    """Suite-shared LLM config lookup: ~/.config/contract-ops/llm.json first,
    then a repo-local ./config/llm.json.

    Returns the first candidate that is a JSON object with a truthy
    ``api_key``; returns None when no candidate qualifies. Unreadable,
    non-UTF-8 or malformed files are skipped rather than raised.
    """
    for p in LLM_CONFIG_PATHS:
        try:
            if not p.is_file():
                continue
            data = json.loads(p.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers both json.JSONDecodeError and the
            # UnicodeDecodeError read_text() raises on invalid UTF-8.
            continue
        if isinstance(data, dict) and data.get("api_key"):
            return data
    return None
982
+
983
+
984
# Instruction preamble for the opt-in LLM tier. llm_enrich() appends the
# (truncated) contract text directly after the trailing "CONTRACT:" marker and
# reads back exactly the three keys requested here: renewal_mechanics,
# obligations and governing_law.
_LLM_PROMPT = (
    "You are a contract-extraction assistant. Given the contract text, return "
    "ONLY a compact JSON object with keys: renewal_mechanics (string or null), "
    "obligations (array of short strings, max 5), governing_law (string or "
    "null). Base answers strictly on the text. No prose, JSON only.\n\n"
    "CONTRACT:\n"
)
991
+
992
+
993
+ def _llm_request(cfg: JSON, prompt: str, timeout: float = 30.0) -> Optional[str]:
994
+ provider = str(cfg.get("provider", "anthropic")).lower()
995
+ model = cfg.get("model") or ("claude-sonnet-4-6" if provider == "anthropic" else "gpt-4o-mini")
996
+ api_key = cfg["api_key"]
997
+ if provider == "anthropic":
998
+ url = "https://api.anthropic.com/v1/messages"
999
+ payload = {
1000
+ "model": model,
1001
+ "max_tokens": 1024,
1002
+ "messages": [{"role": "user", "content": prompt}],
1003
+ }
1004
+ headers = {
1005
+ "content-type": "application/json",
1006
+ "x-api-key": api_key,
1007
+ "anthropic-version": "2023-06-01",
1008
+ }
1009
+ else:
1010
+ base = str(cfg.get("base_url") or "https://api.openai.com/v1").rstrip("/")
1011
+ url = f"{base}/chat/completions"
1012
+ payload = {
1013
+ "model": model,
1014
+ "messages": [{"role": "user", "content": prompt}],
1015
+ }
1016
+ headers = {
1017
+ "content-type": "application/json",
1018
+ "authorization": f"Bearer {api_key}",
1019
+ }
1020
+ req = urllib.request.Request(
1021
+ url, data=json.dumps(payload).encode("utf-8"), headers=headers, method="POST"
1022
+ )
1023
+ with urllib.request.urlopen(req, timeout=timeout) as resp: # nosec - opt-in
1024
+ body = json.loads(resp.read().decode("utf-8"))
1025
+ if provider == "anthropic":
1026
+ parts = body.get("content") or []
1027
+ return "".join(p.get("text", "") for p in parts if isinstance(p, dict))
1028
+ choices = body.get("choices") or []
1029
+ if choices:
1030
+ return str(choices[0].get("message", {}).get("content", ""))
1031
+ return None
1032
+
1033
+
1034
+ def _extract_json_object(s: str) -> Optional[JSON]:
1035
+ start = s.find("{")
1036
+ end = s.rfind("}")
1037
+ if start == -1 or end == -1 or end < start:
1038
+ return None
1039
+ try:
1040
+ obj = json.loads(s[start:end + 1])
1041
+ return obj if isinstance(obj, dict) else None
1042
+ except json.JSONDecodeError:
1043
+ return None
1044
+
1045
+
1046
def llm_enrich(result: JSON, text: str, args_ns: argparse.Namespace) -> None:
    """Opt-in enrichment of fuzzy fields. Mutates `result` in place.

    Sends the first 12000 characters of `text` to the configured LLM and, from
    its JSON reply, may set ``term.renewal_mechanics``, ``obligations`` (at
    most five non-blank strings) and -- only when the deterministic tier found
    none -- ``governing_law``. Any failure (no config, network error, bad
    JSON) degrades gracefully: a warning via `_warn` and the deterministic
    output is left untouched.
    """
    cfg = load_llm_config()
    if cfg is None:
        _warn(args_ns, "no LLM config found (~/.config/contract-ops/llm.json or "
                       "./config/llm.json); skipping --llm enrichment")
        return
    prompt = _LLM_PROMPT + text[:12000]
    try:
        raw = _llm_request(cfg, prompt)
    except (urllib.error.URLError, TimeoutError, OSError, ValueError) as e:
        _warn(args_ns, f"LLM request failed ({e}); keeping deterministic output only")
        return
    if not raw:
        _warn(args_ns, "LLM returned no content; keeping deterministic output only")
        return
    obj = _extract_json_object(raw)
    if obj is None:
        _warn(args_ns, "could not parse LLM JSON response; keeping deterministic output only")
        return

    enriched = False
    rm = obj.get("renewal_mechanics")
    if isinstance(rm, str) and rm.strip():
        result["term"]["renewal_mechanics"] = _field(rm.strip(), 0.6, "llm")
        enriched = True
    obligations = obj.get("obligations")
    if isinstance(obligations, list):
        # Keep real, non-blank strings only: a null or nested value must not
        # become an obligation like "None", and an all-blank list must not
        # count as enrichment.
        cleaned = [o.strip() for o in obligations if isinstance(o, str) and o.strip()]
        if cleaned:
            result["obligations"] = [
                {"text": o, "confidence": 0.55, "source": "llm"}
                for o in cleaned[:5]
            ]
            enriched = True
    gl = obj.get("governing_law")
    if isinstance(gl, str) and gl.strip() and result["governing_law"]["source"] == "none":
        result["governing_law"] = _field(gl.strip(), 0.6, "llm")
        enriched = True

    # llm_used records that the model was consulted; the tier list only grows
    # when a field was actually taken from its answer.
    result["_meta"]["llm_used"] = True
    if enriched and "llm" not in result["_meta"]["tiers_used"]:
        result["_meta"]["tiers_used"].append("llm")
1089
+
1090
+
1091
+ # ---------------------------------------------------------------------------
1092
+ # Output rendering
1093
+ # ---------------------------------------------------------------------------
1094
+
1095
+ TOP_LEVEL_FIELDS = (
1096
+ "document", "parties", "dates", "term", "governing_law",
1097
+ "clauses", "defined_terms", "value",
1098
+ )
1099
+
1100
+
1101
+ def _apply_field_subset(result: JSON, fields: List[str]) -> JSON:
1102
+ wanted = {f.strip() for f in fields if f.strip()}
1103
+ out: JSON = {k: v for k, v in result.items() if k in wanted}
1104
+ out["_meta"] = result["_meta"] # provenance always travels with the payload
1105
+ return out
1106
+
1107
+
1108
+ def _strip_confidence(obj: Any) -> Any:
1109
+ """Recursively drop confidence/source markers for the --no-confidence view.
1110
+ Collapses single-remaining-key dicts ({"value": x} -> x, {"term": t} -> t)."""
1111
+ if isinstance(obj, dict):
1112
+ d = {k: _strip_confidence(v) for k, v in obj.items()
1113
+ if k not in ("confidence", "source")}
1114
+ if len(d) == 1:
1115
+ return next(iter(d.values()))
1116
+ return d
1117
+ if isinstance(obj, list):
1118
+ return [_strip_confidence(v) for v in obj]
1119
+ return obj
1120
+
1121
+
1122
def render_json(result: JSON, no_confidence: bool) -> str:
    """Serialize `result` as 2-space-indented, ASCII-only JSON, first removing
    confidence/source markers when `no_confidence` is set."""
    if no_confidence:
        result = _strip_confidence(result)
    return json.dumps(result, sort_keys=False, ensure_ascii=True, indent=2)
1125
+
1126
+
1127
+ def _fv(field: JSON) -> str:
1128
+ v = field.get("value")
1129
+ if v is None:
1130
+ return _dim("(not found)")
1131
+ return str(v)
1132
+
1133
+
1134
def render_table(result: JSON, no_confidence: bool) -> str:
    """Render `result` as the human-readable report (one section per key).

    Sections are emitted only for keys present in `result`, so a --fields
    subset renders cleanly. `no_confidence` hides the per-item "[confidence]"
    suffixes. Returns the report as a single newline-joined string.
    """
    lines: List[str] = []
    emit = lines.append

    def confidence_tag(item: JSON) -> str:
        # Dimmed " [0.9]" suffix, or nothing in the reduced view.
        if no_confidence:
            return ""
        return _dim(f" [{item.get('confidence')}]")

    doc = result.get("document", {})
    if doc:
        lines.extend([
            _bold("Document"),
            f" title : {doc.get('title') or _dim('(none)')}",
            f" format : {doc.get('format')}",
            f" sha256 : {str(doc.get('sha256'))[:16]}...",
        ])

    parties = result.get("parties")
    if parties is not None:
        emit(_bold("Parties"))
        if not parties:
            emit(" " + _dim("(none detected)"))
        for party in parties:
            role = f" ({party['role']})" if party.get("role") else ""
            emit(f" - {party['name']}{role}{confidence_tag(party)}")

    dates = result.get("dates")
    if dates is not None:
        emit(_bold("Dates"))
        emit(f" effective : {_fv(dates['effective'])}")
        emit(f" expiration : {_fv(dates['expiration'])}")

    term = result.get("term")
    if term is not None:
        emit(_bold("Term"))
        emit(f" length : {_fv(term['length'])}")
        emit(f" auto_renew : {_fv(term['auto_renew'])}")
        emit(f" notice_days : {_fv(term['notice_period_days'])}")
        if "renewal_mechanics" in term:
            emit(f" renewal : {_fv(term['renewal_mechanics'])} {_dim('[llm]')}")

    for key, heading in (("governing_law", "Governing law"), ("value", "Value")):
        if key in result:
            emit(_bold(heading))
            emit(f" {_fv(result[key])}")

    clauses = result.get("clauses")
    if clauses is not None:
        emit(_bold(f"Clause map ({len(clauses)})"))
        if not clauses:
            emit(" " + _dim("(no clause structure detected)"))
        else:
            emit(" " + _dim("canonical tier detected"))
            unmapped_seen = False
            for clause in clauses:
                canon = (clause.get("canonical_title") or "")[:20].ljust(20)
                tier = str(clause.get("tier"))[:14].ljust(14)
                detected = clause.get("detected_title", "")
                if clause.get("mapped"):
                    flag = ""
                else:
                    flag = _yellow(" *")
                    unmapped_seen = True
                emit(f" {canon} {tier} {detected}{flag}{confidence_tag(clause)}")
            if unmapped_seen:
                emit(" " + _dim("* = not mapped to suite vocabulary"))

    terms = result.get("defined_terms")
    if terms is not None:
        emit(_bold(f"Defined terms ({len(terms)})"))
        if terms:
            # Only the first 20 terms are listed.
            emit(" " + ", ".join(t["term"] for t in terms[:20]))
        else:
            emit(" " + _dim("(none detected)"))

    meta = result.get("_meta", {})
    tiers = ",".join(meta.get("tiers_used", []))
    emit(_dim(
        f"tiers={tiers} "
        f"llm={meta.get('llm_used')} extractor={meta.get('extractor_version')}"
    ))
    return "\n".join(lines)
1200
+
1201
+
1202
+ # ---------------------------------------------------------------------------
1203
+ # Output JSON Schema (the cross-CLI contract; source of truth for docs/spec/)
1204
+ # ---------------------------------------------------------------------------
1205
+
1206
+
1207
def output_schema() -> JSON:
    """Build the JSON Schema (draft 2020-12) describing the default JSON output.

    The returned dict is what `extract schema` prints. Key order is
    significant because it is the order of the serialized document.
    """
    field_ref = {"$ref": "#/$defs/field"}

    def ref(name):
        # Fresh {"$ref": ...} pointing into $defs.
        return {"$ref": f"#/$defs/{name}"}

    def closed_object(required, properties):
        # Object schema that forbids keys beyond `properties`.
        return {
            "type": "object",
            "required": list(required),
            "properties": properties,
            "additionalProperties": False,
        }

    def tagged_array(key, optional=None):
        # Array of {key: string, [optional...], confidence, source} records.
        properties = {key: {"type": "string"}}
        properties.update(optional or {})
        properties["confidence"] = ref("confidence")
        properties["source"] = ref("source")
        return {
            "type": "array",
            "items": closed_object([key, "confidence", "source"], properties),
        }

    defs = {
        "source": {"enum": ["deterministic", "llm", "none"]},
        "confidence": {"type": "number", "minimum": 0, "maximum": 1},
        "field": closed_object(
            ["value", "confidence", "source"],
            {
                "value": {},
                "confidence": ref("confidence"),
                "source": ref("source"),
            },
        ),
    }

    document = closed_object(
        ["title", "format", "sha256", "source_path"],
        {
            "title": {"type": ["string", "null"]},
            "format": {"enum": ["markdown", "text", "docx", "pdf"]},
            "sha256": {"type": "string", "pattern": "^[0-9a-f]{64}$"},
            "source_path": {"type": ["string", "null"]},
        },
    )
    dates = closed_object(
        ["effective", "expiration"],
        {"effective": field_ref, "expiration": field_ref},
    )
    term = closed_object(
        ["length", "auto_renew", "notice_period_days"],
        {
            "length": field_ref,
            "auto_renew": field_ref,
            "notice_period_days": field_ref,
            "renewal_mechanics": field_ref,
        },
    )
    span = closed_object(
        ["start", "end"],
        {
            "start": {"type": "integer", "minimum": 0},
            "end": {"type": "integer", "minimum": 0},
        },
    )
    clauses = {
        "type": "array",
        "items": closed_object(
            [
                "canonical_title", "detected_title", "tier",
                "span", "confidence", "source", "mapped",
            ],
            {
                "canonical_title": {"type": ["string", "null"]},
                "detected_title": {"type": "string"},
                "tier": {"enum": ["h2", "bold-numbered", "all-caps", "explicit", "llm"]},
                "span": span,
                "confidence": ref("confidence"),
                "source": ref("source"),
                "mapped": {"type": "boolean"},
            },
        ),
    }
    meta = closed_object(
        ["extractor_version", "tiers_used", "llm_used"],
        {
            "extractor_version": {"type": "string"},
            "tiers_used": {"type": "array", "items": {"enum": ["deterministic", "llm"]}},
            "llm_used": {"type": "boolean"},
        },
    )

    properties = {
        "document": document,
        "parties": tagged_array("name", {"role": {"type": ["string", "null"]}}),
        "dates": dates,
        "term": term,
        "governing_law": field_ref,
        "clauses": clauses,
        "defined_terms": tagged_array("term"),
        "value": field_ref,
        "obligations": tagged_array("text"),
        "_meta": meta,
    }

    description = (
        "Structured payload emitted by `extract <path>` (default JSON output). "
        "The cross-CLI contract that nda-review-cli, compare-cli and "
        "contract-vault consume. Every extracted field carries a confidence "
        "and a source in {deterministic, llm, none}: downstream treats fields "
        "as 'verify, not trust'. Note: the --no-confidence view is a reduced "
        "convenience projection NOT governed by this schema."
    )
    return {
        "$schema": "https://json-schema.org/draft/2020-12/schema",
        "$id": "https://github.com/DrBaher/extract-cli/blob/main/docs/spec/extract-output.schema.json",
        "title": f"extract-cli output schema (v{SCHEMA_VERSION})",
        "description": description,
        "type": "object",
        "required": [
            "document", "parties", "dates", "term", "governing_law",
            "clauses", "defined_terms", "value", "_meta",
        ],
        "additionalProperties": False,
        "$defs": defs,
        "properties": properties,
    }
1352
+
1353
+
1354
+ # ---------------------------------------------------------------------------
1355
+ # Field catalog (for `extract fields`)
1356
+ # ---------------------------------------------------------------------------
1357
+
1358
# One row per extractable field, as (field path, producing tier, description).
# cmd_fields() prints these rows verbatim (table or JSON), in this order.
FIELD_CATALOG: Tuple[Tuple[str, str, str], ...] = (
    ("document.title", "deterministic", "Document title (first heading or filename)"),
    ("parties", "deterministic", "Contracting parties ('between X and Y')"),
    ("dates.effective", "deterministic", "Effective date (ISO-normalized when parseable)"),
    ("dates.expiration", "deterministic", "Expiration date"),
    ("term.length", "deterministic", "Term length, best-effort"),
    ("term.notice_period_days", "deterministic", "Notice period in days, best-effort"),
    ("term.auto_renew", "deterministic", "Auto-renewal flag, best-effort"),
    ("governing_law", "deterministic", "Governing law / jurisdiction"),
    ("clauses", "deterministic", "Clause map normalized to the suite's canonical vocabulary"),
    ("defined_terms", "deterministic", "Defined-term inventory (quoted / parenthetical)"),
    ("value", "deterministic", "Headline monetary value"),
    # The rows below are only populated when --llm enrichment runs.
    ("term.renewal_mechanics", "llm", "Renewal mechanics (fuzzy; --llm only)"),
    ("obligations", "llm", "Key obligation phrasing (fuzzy; --llm only)"),
)
1373
+
1374
+
1375
+ # ---------------------------------------------------------------------------
1376
+ # Bundled demo fixture (so `extract demo` works from an installed wheel)
1377
+ # ---------------------------------------------------------------------------
1378
+
1379
+ DEMO_DOCUMENT = """# Mutual Non-Disclosure Agreement
1380
+
1381
+ This Mutual Non-Disclosure Agreement (the "Agreement") is made and entered into
1382
+ as of March 1, 2024, by and between Acme Robotics, Inc. (the "Disclosing Party")
1383
+ and Beta Logistics LLC (the "Receiving Party").
1384
+
1385
+ ## Definitions
1386
+
1387
+ For purposes of this Agreement, "Confidential Information" means any non-public
1388
+ information disclosed by one party to the other.
1389
+
1390
+ ## Confidentiality Obligations
1391
+
1392
+ The Receiving Party shall protect the Confidential Information using no less than
1393
+ reasonable care and shall not disclose it to any third party.
1394
+
1395
+ ## Term
1396
+
1397
+ This Agreement shall remain in effect for a period of three (3) years from the
1398
+ Effective Date and shall automatically renew for successive one-year terms unless
1399
+ either party gives sixty (60) days' written notice of non-renewal.
1400
+
1401
+ ## Limitation of Liability
1402
+
1403
+ In no event shall either party's aggregate liability exceed $50,000.
1404
+
1405
+ ## Governing Law
1406
+
1407
+ This Agreement shall be governed by and construed in accordance with the laws of
1408
+ the State of Delaware, without regard to its conflict-of-laws principles.
1409
+ """
1410
+
1411
+
1412
+ # ---------------------------------------------------------------------------
1413
+ # Command handlers
1414
+ # ---------------------------------------------------------------------------
1415
+
1416
+
1417
def cmd_extract(args: argparse.Namespace) -> int:
    """Handle `extract <path>`: load, extract, optionally enrich, then print.

    Returns 1 when the full (un-subsetted) extraction is low-signal, else 0.
    With --fields the low-signal check is skipped and the exit code is 0.
    Source-loading problems surface as `ExtractError` from `load_source`.
    """
    path = Path(args.path)
    raw, text, fmt, warnings = load_source(path)
    for w in warnings:
        _warn(args, w)

    result = build_extraction(text, raw, fmt, str(args.path))

    if args.llm:
        llm_enrich(result, text, args)

    # Take the diagnostics from the full result: once --fields narrows the
    # payload, excluded keys would otherwise be reported as "0 found".
    n_parties = len(result.get("parties", []))
    n_clauses = len(result.get("clauses", []))
    subset = bool(args.fields)
    low_signal = False if subset else _is_low_signal(result)

    fmt_out = "json" if args.json else args.format
    if subset:
        result = _apply_field_subset(result, args.fields.split(","))

    _why_print(
        args, f"extracted {path.name}",
        f"format={fmt} parties={n_parties} "
        f"clauses={n_clauses}",
        f"tiers={','.join(result['_meta']['tiers_used'])} "
        f"llm_used={result['_meta']['llm_used']}",
        "fields_subset=on" if subset else f"low_signal={low_signal}",
    )

    if args.silent and fmt_out != "json":
        pass  # silent suppresses the human table; JSON is the machine payload
    elif fmt_out == "table":
        print(render_table(result, args.no_confidence))
    else:
        print(render_json(result, args.no_confidence))

    if low_signal:
        _warn(args, "document produced no high-signal fields (parties/clauses/dates); "
                    "it may be scanned, image-only, or unstructured")
        return 1
    return 0
1453
+
1454
+
1455
def cmd_schema(args: argparse.Namespace) -> int:
    """Handle `extract schema`: print the output JSON Schema and return 0."""
    schema_text = json.dumps(output_schema(), ensure_ascii=True, indent=2)
    print(schema_text)
    return 0
1458
+
1459
+
1460
def cmd_fields(args: argparse.Namespace) -> int:
    """Handle `extract fields`: list FIELD_CATALOG as a colored table, or as a
    JSON array of {field, tier, description} when --json is given."""
    if not args.json:
        print(_bold("Extractable fields") + _dim(" (tier = which extraction tier produces it)"))
        for name, tier, desc in FIELD_CATALOG:
            paint = _green if tier == "deterministic" else _yellow
            print(f" {name.ljust(26)} {paint(tier).ljust(22)} {_dim(desc)}")
        return 0
    rows = []
    for name, tier, desc in FIELD_CATALOG:
        rows.append({"field": name, "tier": tier, "description": desc})
    print(json.dumps(rows, indent=2, ensure_ascii=True))
    return 0
1473
+
1474
+
1475
def cmd_demo(args: argparse.Namespace) -> int:
    """Handle `extract demo`: run the deterministic tier on DEMO_DOCUMENT.

    Explanatory text goes to stderr unless -q/--silent is set. The payload is
    printed to stdout as JSON or a table; as with `extract <path>`, --silent
    suppresses the human table but never the JSON payload. Always returns 0.
    """
    raw = DEMO_DOCUMENT.encode("utf-8")
    result = build_extraction(DEMO_DOCUMENT, raw, "markdown", "(bundled demo fixture)")
    if not args.silent:
        _eprint(_bold("extract-cli demo") + " -- the suite's passport control")
        _eprint(_dim(
            " A foreign document comes in (here: a bundled NDA). The deterministic\n"
            " tier maps its clauses onto the suite's canonical vocabulary and pulls\n"
            " parties/dates/term/governing-law -- no LLM, no network. The JSON below\n"
            " is what nda-review-cli / compare-cli / contract-vault consume.\n"
        ))
    fmt_out = "json" if args.json else args.format
    if args.silent and fmt_out != "json":
        pass  # same contract as cmd_extract: -q hides the human table
    elif fmt_out == "table":
        print(render_table(result, args.no_confidence))
    else:
        print(render_json(result, args.no_confidence))
    if not args.silent:
        _eprint(_dim("\n Try: extract demo --format json | jq '.clauses[].canonical_title'"))
    return 0
1494
+
1495
+
1496
+ # ---------------------------------------------------------------------------
1497
+ # Shell completion
1498
+ # ---------------------------------------------------------------------------
1499
+
1500
# Subcommand names: printed by the hidden `__complete commands` handler and
# used by main() to decide whether argv goes through the full parser.
# "extract" is not listed here; main() adds it to the routing set separately.
_SUBCOMMANDS = ("schema", "fields", "demo", "completion")
# Flag names printed by the hidden `__complete flags` handler.
_GLOBAL_FLAGS = (
    "--json", "--why", "-q", "--silent", "--no-color", "--llm",
    "--format", "--fields", "--no-confidence", "-V", "--version", "-h", "--help",
)
1505
+
1506
+ _BASH_COMPLETION = r"""# extract-cli bash completion
1507
+ # eval "$(extract completion bash)"
1508
+ _extract_completions() {
1509
+ local cur prev
1510
+ cur="${COMP_WORDS[COMP_CWORD]}"
1511
+ local cmds="schema fields demo completion"
1512
+ local flags="--json --why -q --silent --no-color --llm --format --fields --no-confidence -V --version -h --help"
1513
+ if [ "$COMP_CWORD" -eq 1 ]; then
1514
+ COMPREPLY=( $(compgen -W "${cmds}" -- "${cur}") $(compgen -f -- "${cur}") )
1515
+ return 0
1516
+ fi
1517
+ if [[ "${cur}" == -* ]]; then
1518
+ COMPREPLY=( $(compgen -W "${flags}" -- "${cur}") )
1519
+ return 0
1520
+ fi
1521
+ COMPREPLY=( $(compgen -f -- "${cur}") )
1522
+ }
1523
+ complete -F _extract_completions extract
1524
+ """
1525
+
1526
+ _ZSH_COMPLETION = r"""# extract-cli zsh completion
1527
+ # eval "$(extract completion zsh)"
1528
+ _extract() {
1529
+ local -a cmds flags
1530
+ cmds=(
1531
+ 'schema:Print the output JSON Schema (the cross-CLI contract)'
1532
+ 'fields:List extractable fields and their tier'
1533
+ 'demo:Run extraction on a bundled fixture'
1534
+ 'completion:Emit a shell completion script'
1535
+ )
1536
+ flags=(
1537
+ '--json' '--why' '-q' '--silent' '--no-color' '--llm'
1538
+ '--format' '--fields' '--no-confidence' '-V' '--version'
1539
+ )
1540
+ if (( CURRENT == 2 )); then
1541
+ _describe 'command' cmds
1542
+ _files
1543
+ return
1544
+ fi
1545
+ _files
1546
+ compadd -- ${flags}
1547
+ }
1548
+ compdef _extract extract
1549
+ """
1550
+
1551
+
1552
def cmd_completion(args: argparse.Namespace) -> int:
    """Handle `extract completion <shell>`: write the matching completion
    script to stdout and return 0.

    Raises ExtractError for any shell other than bash or zsh.
    """
    requested = (args.shell or "").lower()
    if requested not in ("bash", "zsh"):
        raise ExtractError(f"unsupported shell: {args.shell!r}. Supported: bash, zsh.")
    script = _BASH_COMPLETION if requested == "bash" else _ZSH_COMPLETION
    sys.stdout.write(script)
    return 0
1561
+
1562
+
1563
+ def _completion_handler(argv: List[str]) -> int:
1564
+ """Hidden `__complete` handler invoked by the shell-completion scripts."""
1565
+ if not argv:
1566
+ return 0
1567
+ what = argv[0]
1568
+ if what == "commands":
1569
+ for c in _SUBCOMMANDS:
1570
+ print(c)
1571
+ elif what == "flags":
1572
+ for f in _GLOBAL_FLAGS:
1573
+ print(f)
1574
+ return 0
1575
+
1576
+
1577
+ # ---------------------------------------------------------------------------
1578
+ # Argument parsing + main
1579
+ # ---------------------------------------------------------------------------
1580
+
1581
+
1582
+ def _add_common_output_flags(p: argparse.ArgumentParser) -> None:
1583
+ p.add_argument("--json", action="store_true",
1584
+ help="Force JSON output to stdout (the default).")
1585
+ p.add_argument("--format", choices=("json", "table"), default="json",
1586
+ help="Output format (default: json).")
1587
+ p.add_argument("--no-confidence", action="store_true",
1588
+ help="Omit confidence/source markers (reduced convenience view).")
1589
+ p.add_argument("--why", action="store_true",
1590
+ help="Print a rationale block to stderr.")
1591
+ p.add_argument("-q", "--silent", "--quiet", dest="silent", action="store_true",
1592
+ help="Suppress non-error diagnostics (and the human table).")
1593
+
1594
+
1595
def build_parser() -> argparse.ArgumentParser:
    """Build the full parser used when argv starts with a known subcommand
    (schema, fields, demo, completion, extract) or with -V/-h."""
    root = argparse.ArgumentParser(
        prog="extract",
        description="Ingest any contract (.md/.txt/.docx/.pdf) and emit structured "
                    "JSON for the contract-ops CLI suite. See docs/INTEROP.md.",
    )
    root.add_argument("-V", "--version", action="version",
                      version=f"{CLI_NAME} {__version__}")
    root.add_argument("--no-color", action="store_true",
                      help="Disable ANSI color (also honors NO_COLOR / FORCE_COLOR).")

    commands = root.add_subparsers(dest="command")

    schema_cmd = commands.add_parser(
        "schema", help="Print the output JSON Schema (the contract).")
    schema_cmd.set_defaults(func=cmd_schema)

    fields_cmd = commands.add_parser(
        "fields", help="List extractable fields and their tier.")
    fields_cmd.add_argument("--json", action="store_true", help="Emit JSON.")
    fields_cmd.set_defaults(func=cmd_fields)

    demo_cmd = commands.add_parser("demo", help="Run extraction on a bundled fixture.")
    _add_common_output_flags(demo_cmd)
    # Accepted but hidden from --help.
    demo_cmd.add_argument("--llm", action="store_true", help=argparse.SUPPRESS)
    demo_cmd.add_argument("--fields", default="", help=argparse.SUPPRESS)
    demo_cmd.set_defaults(func=cmd_demo)

    completion_cmd = commands.add_parser(
        "completion", help="Emit a shell completion script (bash or zsh).")
    completion_cmd.add_argument("shell", choices=("bash", "zsh"))
    completion_cmd.set_defaults(func=cmd_completion)

    extract_cmd = commands.add_parser(
        "extract", help="Extract a document (explicit form of the default).")
    _build_extract_args(extract_cmd)

    return root
1629
+
1630
+
1631
def _build_extract_args(p: argparse.ArgumentParser) -> None:
    """Attach the extract action's arguments (path, --llm, --fields and the
    common output flags) to `p` and bind it to `cmd_extract`."""
    llm_help = (
        "Opt-in LLM enrichment of fuzzy fields (renewal, obligations). "
        "Off by default; the deterministic core is fully useful without it."
    )
    fields_help = (
        "Comma-separated subset of top-level fields to emit "
        "(e.g. parties,clauses,governing_law)."
    )
    p.add_argument("path", help="Path to the document (.md/.txt/.docx/.pdf).")
    p.add_argument("--llm", action="store_true", help=llm_help)
    p.add_argument("--fields", default="", help=fields_help)
    _add_common_output_flags(p)
    p.set_defaults(func=cmd_extract)
1641
+
1642
+
1643
def _build_default_extract_parser() -> argparse.ArgumentParser:
    """Return the parser for the bare `extract <path>` form (no subcommand):
    --no-color plus everything `_build_extract_args` registers."""
    default_parser = argparse.ArgumentParser(
        description="Extract a document into structured JSON (default action).",
        prog="extract",
    )
    default_parser.add_argument(
        "--no-color",
        action="store_true",
        help="Disable ANSI color (also honors NO_COLOR / FORCE_COLOR).",
    )
    _build_extract_args(default_parser)
    return default_parser
1653
+
1654
+
1655
def main(argv: Optional[List[str]] = None) -> int:
    """Run the CLI and return its process exit code.

    Args:
        argv: Argument list without the program name. When ``None``,
            ``sys.argv[1:]`` is used.

    Returns:
        ``0`` on success (including help output and a closed output pipe),
        ``2`` when an ``ExtractError`` is raised, ``130`` on Ctrl-C;
        otherwise whatever the selected command handler returns.
    """
    # A POSIX/C locale (common on macOS CI runners) leaves stdout/stderr in
    # ASCII mode, where any non-ASCII character raises UnicodeEncodeError.
    # Switch both streams to UTF-8 regardless of LANG/LC_ALL; best-effort.
    for stream in (sys.stdout, sys.stderr):
        reconfigure = getattr(stream, "reconfigure", None)
        if reconfigure is None:
            continue
        try:
            reconfigure(encoding="utf-8", errors="replace")
        except Exception:
            pass

    cli_args = sys.argv[1:] if argv is None else argv

    # --no-color is consumed here, ahead of argparse, so that it is accepted
    # in any position and on every command form.
    if "--no-color" in cli_args:
        os.environ["NO_COLOR"] = "1"
        cli_args = [token for token in cli_args if token != "--no-color"]

    # Nothing left to do: show the top-level help.
    if not cli_args:
        build_parser().print_help()
        return 0

    head = cli_args[0]

    # Hidden completion hook; deliberately not registered with argparse so it
    # never shows up in --help.
    if head == "__complete":
        return _completion_handler(cli_args[1:])

    # Known subcommands and -V/-h use the full parser. Any other first token
    # is taken to be the start of the default `extract <path>` invocation.
    full_parser_tokens = {*_SUBCOMMANDS, "extract", "-V", "--version", "-h", "--help"}
    use_full_parser = head in full_parser_tokens

    try:
        parser = build_parser() if use_full_parser else _build_default_extract_parser()
        ns = parser.parse_args(cli_args)
        if use_full_parser and not getattr(ns, "func", None):
            parser.print_help()
            return 0
        return ns.func(ns) or 0
    except ExtractError as e:
        _eprint(_red("error:") + f" {e}")
        return 2
    except BrokenPipeError:  # e.g. `extract foo.md | head`
        try:
            sys.stdout.close()
        except Exception:
            pass
        return 0
    except KeyboardInterrupt:  # pragma: no cover
        _eprint("interrupted")
        return 130
1707
+
1708
+
1709
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())