extract-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
extract_cli.py
ADDED
|
@@ -0,0 +1,1710 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""extract-cli -- the open-loop front door of the contract-ops CLI suite.
|
|
3
|
+
|
|
4
|
+
The suite is a contract lifecycle (store -> draft -> review -> diff -> convert
|
|
5
|
+
-> sign) that, until now, only handled documents it authored from its own
|
|
6
|
+
templates. `extract-cli` is "passport control": it ingests ANY document --
|
|
7
|
+
yours or a counterparty's foreign paper -- in .md/.txt (natively), .docx, or
|
|
8
|
+
.pdf, and emits a structured JSON representation that the rest of the suite
|
|
9
|
+
(nda-review-cli, compare-cli, contract-vault) consumes.
|
|
10
|
+
|
|
11
|
+
Two extraction tiers:
|
|
12
|
+
* DETERMINISTIC (default, always on): parties, dates, defined-term inventory,
|
|
13
|
+
the CLAUSE MAP, governing law, best-effort term/notice/value. Pure
|
|
14
|
+
regex/structure -- no network, no LLM.
|
|
15
|
+
* LLM (opt-in via --llm only): the fuzzy fields (renewal mechanics,
|
|
16
|
+
obligation phrasing, ambiguous governing law). Always skippable; the
|
|
17
|
+
deterministic core is fully useful without it.
|
|
18
|
+
|
|
19
|
+
Every extracted field carries a `confidence` and a `source` in
|
|
20
|
+
{deterministic, llm, none} -- downstream tools treat fields as "verify, not
|
|
21
|
+
trust".
|
|
22
|
+
|
|
23
|
+
Stdlib-only. Single file. The clause-detection cascade (H2 -> bold-numbered ->
|
|
24
|
+
ALL-CAPS) and the canonical-vocabulary alias normalization are ported from
|
|
25
|
+
template-vault-cli so a foreign document's clauses land on the suite's shared
|
|
26
|
+
clause vocabulary.
|
|
27
|
+
|
|
28
|
+
Part of the contract-ops CLI suite. See docs/INTEROP.md.
|
|
29
|
+
"""
|
|
30
|
+
from __future__ import annotations
|
|
31
|
+
|
|
32
|
+
import argparse
|
|
33
|
+
import datetime as _dt
|
|
34
|
+
import hashlib
|
|
35
|
+
import importlib.util
|
|
36
|
+
import json
|
|
37
|
+
import os
|
|
38
|
+
import re
|
|
39
|
+
import sys
|
|
40
|
+
import urllib.error
|
|
41
|
+
import urllib.request
|
|
42
|
+
from pathlib import Path
|
|
43
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
44
|
+
|
|
45
|
+
# Package version string.
__version__ = "0.1.0"

# Bumped independently of the package version when the *extraction logic*
# changes in a way downstream consumers should notice. Embedded in `_meta`.
EXTRACTOR_VERSION = "0.1.0"

# JSON Schema version of the output contract (docs/spec/extract-output.schema.json).
SCHEMA_VERSION = 1

# Annotation alias for the plain string-keyed dicts the extractors return.
JSON = Dict[str, Any]

# Name of this tool. NOTE(review): its consumers are outside this excerpt.
CLI_NAME = "extract-cli"
|
|
57
|
+
|
|
58
|
+
# ---------------------------------------------------------------------------
|
|
59
|
+
# Streams / color (convention-shared with the suite; see docs/INTEROP.md)
|
|
60
|
+
# ---------------------------------------------------------------------------
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _color_enabled(stream: Any = None) -> bool:
|
|
64
|
+
"""Auto-detect color support: opt out via NO_COLOR (https://no-color.org/),
|
|
65
|
+
force on via FORCE_COLOR, otherwise only when the stream is a tty."""
|
|
66
|
+
if os.environ.get("NO_COLOR"):
|
|
67
|
+
return False
|
|
68
|
+
if os.environ.get("FORCE_COLOR"):
|
|
69
|
+
return True
|
|
70
|
+
s = stream if stream is not None else sys.stdout
|
|
71
|
+
try:
|
|
72
|
+
return bool(s.isatty())
|
|
73
|
+
except Exception:
|
|
74
|
+
return False
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _c(text: str, code: str) -> str:
    """Wrap `text` in the ANSI SGR sequence `code` followed by a reset, or
    return it untouched when `_color_enabled()` says color is off."""
    if _color_enabled():
        return f"\033[{code}m{text}\033[0m"
    return text
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _green(s: str) -> str:
    """Render `s` through `_c` with SGR code 32 (green)."""
    return _c(text=s, code="32")
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _yellow(s: str) -> str:
    """Render `s` through `_c` with SGR code 33 (yellow)."""
    return _c(text=s, code="33")
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _red(s: str) -> str:
    """Render `s` through `_c` with SGR code 31 (red)."""
    return _c(text=s, code="31")
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _bold(s: str) -> str:
    """Render `s` through `_c` with SGR code 1 (bold)."""
    return _c(text=s, code="1")
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _dim(s: str) -> str:
    """Render `s` through `_c` with SGR code 2 (dim)."""
    return _c(text=s, code="2")
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _eprint(*args: Any, **kwargs: Any) -> None:
|
|
104
|
+
print(*args, file=sys.stderr, **kwargs)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _why_print(args_ns: argparse.Namespace, header: str, *lines: str) -> None:
    """Write a `--why` explanation block to stderr, keeping stdout clean for
    piping.

    Nothing is printed unless `args_ns.why` is truthy. The block is plain
    text: a blank line, a `[why] <header>` line, then each entry of `lines`
    on its own indented line."""
    if getattr(args_ns, "why", False):
        _eprint(f"\n[why] {header}")
        for detail in lines:
            _eprint(f" {detail}")
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _warn(args_ns: Optional[argparse.Namespace], msg: str) -> None:
    """Print a `warning: <msg>` diagnostic to stderr.

    Suppressed when `args_ns` is given and its `silent` attribute is truthy
    (-q/--silent). The yellow label is only colored when *stderr* -- the
    stream actually written to -- qualifies for color, so redirecting stderr
    to a file does not capture escape sequences just because stdout is a
    terminal."""
    if args_ns is not None and getattr(args_ns, "silent", False):
        return
    label = "warning:"
    if _color_enabled(sys.stderr):
        # SGR 33 is the yellow used for warnings elsewhere in this file.
        label = f"\033[33m{label}\033[0m"
    _eprint(f"{label} {msg}")
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
# ---------------------------------------------------------------------------
|
|
126
|
+
# Errors
|
|
127
|
+
# ---------------------------------------------------------------------------
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class ExtractError(Exception):
    """User-actionable error. main() prints it and exits non-zero.

    Plain `Exception` subclass with no extra state; the message passed to the
    constructor is what gets reported."""
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# ---------------------------------------------------------------------------
|
|
135
|
+
# Clause-detection cascade (ported from template-vault-cli `template_vault_cli.py`)
|
|
136
|
+
#
|
|
137
|
+
# Tier 1: H2 headings (`## Title`) -- Markdown-native templates.
|
|
138
|
+
# Tier 2: bold-numbered (`**1. Purpose**`) -- typical of DOCX -> text.
|
|
139
|
+
# Tier 3: ALL-CAPS standalone lines -- typical of legal PDFs.
|
|
140
|
+
# The fallback tiers only run when the prior tier finds nothing, so they can't
|
|
141
|
+
# shadow real structure. Foreign clauses are then normalized onto the suite's
|
|
142
|
+
# canonical vocabulary via the alias index below.
|
|
143
|
+
# ---------------------------------------------------------------------------
|
|
144
|
+
|
|
145
|
+
# Auto-detect clause headers by H2 only (not H3+). Anchored at line start.
# Group 1 is the heading text with trailing spaces/tabs excluded.
H2_RE = re.compile(r"^##[ \t]+(.+?)[ \t]*$", re.MULTILINE)

# Bold-numbered: **1. Purpose** / **Section 4. Term** / **(1) Scope**
# The whole line must be a single `**...**` run that opens with a numbering
# token; group 1 is the title that follows the numbering.
_BOLD_HEADING_RE = re.compile(
    r"^\*\*\s*"
    r"(?:"
    r"(?:Article|Section|Sec\.?|Art\.?|Clause|Part|§)\s+\S+\.?"  # word-prefixed
    r"|"
    r"\(\d+\)"  # (1)
    r"|"
    r"\d+(?:\.\d+)*"  # 1 / 1.2.3
    r")"
    r"[\.\):\s]+"
    r"([^\*\n]+?)"
    r"\s*\*\*\s*$",
    re.MULTILINE,
)

# ALL-CAPS standalone heading: blank-line framed on both sides (so inline
# shouts in prose don't qualify); doesn't start with `[` (so `[BRACKETED]`
# placeholders never match). Single-token lines need >= 4 ASCII letters
# (enforced in _qualifies_as_all_caps_heading).
# No re.MULTILINE here, so `^` only matches at the very start of the text.
# NOTE(review): the match consumes the blank line after the heading, so two
# headings separated by just one blank line cannot both be found by
# finditer() -- confirm that is acceptable.
_ALL_CAPS_HEADING_RE = re.compile(
    r"(?:^|\n)\n([A-Z][A-Z0-9 \-/&,]{1,}[A-Z0-9])\s*\n\n",
)
|
|
171
|
+
|
|
172
|
+
# Roman numerals 1-39 -- covers virtually all legal-document section numbering.
|
|
173
|
+
# Longer alternatives come first within each group so the regex engine doesn't
|
|
174
|
+
# short-circuit on a prefix match (bare V / X must still match).
|
|
175
|
+
_ROMAN_RE = (
|
|
176
|
+
r"(?:(?:XXX|XX|X)(?:IX|IV|VIII|VII|VI|V|III|II|I)?"
|
|
177
|
+
r"|IX|IV|VIII|VII|VI|V|III|II|I)"
|
|
178
|
+
)
|
|
179
|
+
|
|
180
|
+
# Leading numbering tokens to strip from a clause title. Order matters: longer
|
|
181
|
+
# Article/Section forms come before bare numbers so they're consumed first.
|
|
182
|
+
_NUMBER_PREFIX_RE = re.compile(
|
|
183
|
+
r"^\s*(?:"
|
|
184
|
+
r"(?:Article|Section|Sec\.?|Art\.?|Clause|Part)\s+"
|
|
185
|
+
r"(?:" + _ROMAN_RE + r"|\d+(?:\.\d+)*)"
|
|
186
|
+
r"|"
|
|
187
|
+
r"§\s*\d+(?:\.\d+)*"
|
|
188
|
+
r"|"
|
|
189
|
+
r"\(\d+\)"
|
|
190
|
+
r"|"
|
|
191
|
+
r"\[\d+\]"
|
|
192
|
+
r"|"
|
|
193
|
+
r"\d+(?:\.\d+)+"
|
|
194
|
+
r"|"
|
|
195
|
+
r"\d+"
|
|
196
|
+
r")"
|
|
197
|
+
r"[\.\)\]:\s]*",
|
|
198
|
+
re.IGNORECASE,
|
|
199
|
+
)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _strip_clause_number(s: str) -> str:
|
|
203
|
+
"""Remove a leading numbering token (`1.`, `1)`, `(1)`, `[1]`, `1.2.3`,
|
|
204
|
+
`Article I.`, `Section 4.`, `§ 4.2`). Idempotent."""
|
|
205
|
+
return _NUMBER_PREFIX_RE.sub("", s, count=1).strip()
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _qualifies_as_all_caps_heading(title: str) -> bool:
|
|
209
|
+
"""Single-token ALL-CAPS lines need >= 4 ASCII letters (so 'TER' doesn't
|
|
210
|
+
qualify but 'TERM' does). Multi-token lines pass through."""
|
|
211
|
+
tokens = title.split()
|
|
212
|
+
if len(tokens) >= 2:
|
|
213
|
+
return True
|
|
214
|
+
return sum(1 for ch in title if "A" <= ch <= "Z") >= 4
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def detect_clauses(text: str) -> List[JSON]:
    """Find clause headings with a three-tier cascade; the first tier that
    fires wins.

    Tiers, in order: H2 headings (one hit is enough), bold-numbered headings
    (at least two hits), blank-line-framed ALL-CAPS lines that pass
    `_qualifies_as_all_caps_heading` (at least two hits). The two-hit minimum
    on the fallbacks guards against false positives.

    Returns the list built by `_matches_to_clauses`
    ([{title, detected, anchor, start, end, tier}, ...]), or [] when no tier
    fires."""
    h2_hits = list(H2_RE.finditer(text))
    if h2_hits:
        return _matches_to_clauses(text, h2_hits, group=1, tier="h2")

    bold_hits = list(_BOLD_HEADING_RE.finditer(text))
    if len(bold_hits) >= 2:
        return _matches_to_clauses(text, bold_hits, group=1, tier="bold-numbered")

    caps_hits = []
    for candidate in _ALL_CAPS_HEADING_RE.finditer(text):
        if _qualifies_as_all_caps_heading(candidate.group(1)):
            caps_hits.append(candidate)
    if len(caps_hits) < 2:
        return []
    return _matches_to_clauses(text, caps_hits, group=1, tier="all-caps")
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def _matches_to_clauses(text: str, matches: List["re.Match[str]"], group: int,
                        tier: str) -> List[JSON]:
    """Build clause dicts from regex matches whose `group` holds the title.

    For each match the result carries:
      * `title`    -- the captured heading with leading numbering stripped;
      * `anchor`   -- the full source line containing the heading;
      * `detected` -- `anchor` with surrounding whitespace removed;
      * `start`    -- offset of the first character of the heading line;
      * `end`      -- start offset of the next match, or len(text) for the last;
      * `tier`     -- the `tier` label passed in.
    """
    out: List[JSON] = []
    total = len(matches)
    for i, m in enumerate(matches):
        title = _strip_clause_number(m.group(group).strip())
        # Locate the heading by the group's own offset. For ALL-CAPS matches
        # this steps past the leading newline gap the regex consumed, and it
        # cannot land on another occurrence of the same text inside the match
        # the way a substring search could.
        anchor_start = m.start(group)
        line_start = text.rfind("\n", 0, anchor_start) + 1
        line_end = text.find("\n", line_start)
        if line_end == -1:
            line_end = len(text)
        anchor = text[line_start:line_end]
        end = matches[i + 1].start() if i + 1 < total else len(text)
        out.append({
            "title": title,
            "detected": anchor.strip(),
            "anchor": anchor,
            "start": line_start,
            "end": end,
            "tier": tier,
        })
    return out
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def _norm_clause_key(s: str) -> str:
    """Produce the lookup key for a clause title or alias: leading numbering
    removed, surrounding whitespace trimmed, lowercased."""
    unnumbered = _strip_clause_number(s)
    return unnumbered.strip().lower()
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
# ---------------------------------------------------------------------------
|
|
274
|
+
# Canonical clause vocabulary
|
|
275
|
+
#
|
|
276
|
+
# template-vault-cli stores `clause_aliases` per-template (canonical_title ->
|
|
277
|
+
# [alias, ...]). A FOREIGN document carries no such map, so extract-cli ships a
|
|
278
|
+
# built-in default vocabulary -- the suite's shared clause names -- and maps a
|
|
279
|
+
# document's detected clause titles onto it. This is the differentiator: it
|
|
280
|
+
# turns "whatever the counterparty called their sections" into the canonical
|
|
281
|
+
# vocabulary nda-review-cli / compare-cli already speak.
|
|
282
|
+
# ---------------------------------------------------------------------------
|
|
283
|
+
|
|
284
|
+
# Canonical clause title -> lowercase aliases that should map onto it.
# _build_alias_index() flattens this into a key -> canonical lookup.
CANONICAL_CLAUSE_ALIASES: Dict[str, List[str]] = {
    "Definitions": ["definitions", "defined terms", "interpretation", "construction"],
    "Confidentiality": [
        "confidentiality", "non-disclosure", "nondisclosure", "confidential information",
        "confidentiality obligations", "secrecy", "protection of confidential information",
    ],
    "Term": ["term", "duration", "agreement term", "term of agreement"],
    "Termination": ["termination", "term and termination", "right to terminate", "termination for cause"],
    "Governing Law": [
        "governing law", "applicable law", "choice of law", "law and jurisdiction",
        "governing law and jurisdiction",
    ],
    "Dispute Resolution": ["dispute resolution", "arbitration", "disputes", "mediation"],
    "Indemnification": ["indemnification", "indemnity", "hold harmless", "indemnities"],
    "Limitation of Liability": [
        "limitation of liability", "liability", "limitation on liability", "liability cap",
        "exclusion of liability",
    ],
    "Intellectual Property": [
        "intellectual property", "ip rights", "ownership of ip", "proprietary rights",
        "intellectual property rights", "ownership",
    ],
    "Payment": ["payment", "fees", "compensation", "fees and payment", "consideration", "pricing"],
    "Warranties": [
        "warranties", "representations and warranties", "warranty", "reps and warranties",
        "representations",
    ],
    "Assignment": ["assignment", "assignability", "assignment and delegation"],
    "Notices": ["notices", "notice"],
    "Force Majeure": ["force majeure", "acts of god"],
    "Entire Agreement": ["entire agreement", "integration", "complete agreement"],
    "Severability": ["severability", "severance"],
    "Waiver": ["waiver", "no waiver"],
    "Non-Compete": [
        "non-compete", "noncompete", "noncompetition", "non-competition",
        "covenant not to compete",
    ],
    "Non-Solicitation": ["non-solicit", "non-solicitation", "nonsolicitation", "no solicitation"],
    "Data Protection": ["data protection", "data privacy", "gdpr", "privacy", "personal data"],
    "Insurance": ["insurance"],
    "Counterparts": ["counterparts"],
    "Survival": ["survival", "survival of obligations"],
    "Amendment": ["amendment", "amendments", "modification", "modifications", "changes"],
    "Relationship of the Parties": [
        "relationship of the parties", "independent contractor", "no partnership", "no agency",
    ],
    "Compliance with Laws": ["compliance with laws", "compliance", "anti-corruption"],
    "Publicity": ["publicity", "announcements", "press releases"],
}
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
def _build_alias_index() -> Dict[str, str]:
    """Flatten `CANONICAL_CLAUSE_ALIASES` into a normalized-key -> canonical
    title lookup.

    Each canonical title is registered under its own normalized key first,
    then under each alias in order; a key that repeats is overwritten by the
    later entry."""
    index: Dict[str, str] = {}
    for canonical, aliases in CANONICAL_CLAUSE_ALIASES.items():
        for label in (canonical, *aliases):
            index[_norm_clause_key(label)] = canonical
    return index


# Built once at import time; read by _canonicalize_clause().
_ALIAS_INDEX = _build_alias_index()
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def _canonicalize_clause(detected_title: str) -> Tuple[Optional[str], bool]:
    """Map a detected clause title to a canonical suite title.

    Returns `(canonical_title, mapped)`:
      * `(None, False)` when the title normalizes to an empty key;
      * the canonical name with `mapped=True` on an exact alias/canonical hit;
      * otherwise the canonical name of the longest index key (>= 5 chars)
        that either occurs inside the title (so 'Confidentiality and
        Non-Disclosure' still maps) or contains the title as a whole word;
      * failing that, a Title-Cased copy of the detected title with
        `mapped=False`.
    """
    key = _norm_clause_key(detected_title)
    if not key:
        return None, False
    canon = _ALIAS_INDEX.get(key)
    if canon is not None:
        return canon, True

    # A short title is only allowed to match *inside* an alias as a complete
    # word. A raw substring test let "ip" match "relationship of the parties"
    # and outrank the real "ownership of ip" alias on length.
    whole_key = re.compile(r"(?<!\w)" + re.escape(key) + r"(?!\w)")

    best: Optional[str] = None
    best_len = 0
    for alias_key, canonical in _ALIAS_INDEX.items():
        alias_len = len(alias_key)
        if alias_len < 5 or alias_len <= best_len:
            continue
        if alias_key in key or whole_key.search(alias_key):
            best, best_len = canonical, alias_len
    if best is not None:
        return best, True
    return _titlecase(detected_title), False
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
# ---------------------------------------------------------------------------
|
|
373
|
+
# Confidence model + field envelope
|
|
374
|
+
# ---------------------------------------------------------------------------
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def _field(value: Any, confidence: float, source: str = "deterministic") -> JSON:
|
|
378
|
+
"""Wrap an extracted value with a confidence and a source. A `None` value
|
|
379
|
+
collapses to the canonical 'not found' envelope."""
|
|
380
|
+
if value is None:
|
|
381
|
+
return {"value": None, "confidence": 0.0, "source": "none"}
|
|
382
|
+
return {"value": value, "confidence": round(float(confidence), 2), "source": source}
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def _none_field() -> JSON:
|
|
386
|
+
return {"value": None, "confidence": 0.0, "source": "none"}
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def _titlecase(s: str) -> str:
|
|
390
|
+
s = s.strip()
|
|
391
|
+
if not s:
|
|
392
|
+
return s
|
|
393
|
+
# A fully-shouted heading (ALL-CAPS, e.g. from a PDF) is title-cased
|
|
394
|
+
# outright; in a mixed-case title a short all-caps word is treated as a
|
|
395
|
+
# deliberate acronym ("IP Rights") and preserved.
|
|
396
|
+
whole_upper = s.isupper()
|
|
397
|
+
parts = []
|
|
398
|
+
for w in s.split():
|
|
399
|
+
if not whole_upper and w.isupper() and len(w) <= 4:
|
|
400
|
+
parts.append(w)
|
|
401
|
+
else:
|
|
402
|
+
parts.append(w[:1].upper() + w[1:].lower())
|
|
403
|
+
return " ".join(parts)
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
# ---------------------------------------------------------------------------
|
|
407
|
+
# Deterministic extractors
|
|
408
|
+
# ---------------------------------------------------------------------------
|
|
409
|
+
|
|
410
|
+
# Alternation of full and abbreviated English month names, spliced into the
# date patterns below.
_MONTHS = (
    "January|February|March|April|May|June|July|August|September|October|"
    "November|December|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec"
)
# Date shapes recognized: ISO (YYYY-MM-DD), slashed numeric, "Month D, YYYY",
# and "D [day of] Month YYYY" (ordinal suffixes and a period after the month
# are tolerated).
_DATE_PAT = (
    r"(?:"
    r"\d{4}-\d{2}-\d{2}"
    r"|\d{1,2}/\d{1,2}/\d{2,4}"
    r"|(?:" + _MONTHS + r")\.?\s+\d{1,2}(?:st|nd|rd|th)?,?\s+\d{4}"
    r"|\d{1,2}(?:st|nd|rd|th)?\s+(?:day\s+of\s+)?(?:" + _MONTHS + r")\.?,?\s+\d{4}"
    r")"
)
_DATE_RE = re.compile(_DATE_PAT, re.IGNORECASE)

# An "effective / dated / made / entered into" cue followed by a date;
# group 1 is the date text.
_EFFECTIVE_RE = re.compile(
    r"(?:effective(?:\s+date)?(?:\s+(?:as\s+of|date|on))?|"
    r"dated(?:\s+as\s+of)?|"
    r"made(?:\s+and\s+entered\s+into)?(?:\s+as\s+of|\s+on)?|"
    r"entered\s+into(?:\s+as\s+of|\s+on)?)"
    r"[\s:,]+(?:the\s+)?(" + _DATE_PAT + r")",
    re.IGNORECASE,
)
# An expiry cue (expires / terminates on / ends on / until / through ...)
# followed by a date; group 1 is the date text.
_EXPIRE_RE = re.compile(
    r"(?:expir\w*|terminat\w*\s+on|end(?:s|ing)?\s+on|until|through|"
    r"remain\s+in\s+effect\s+until)"
    r"[\s:,]+(?:the\s+)?(" + _DATE_PAT + r")",
    re.IGNORECASE,
)

# "[by and] between <A> and <B>" preamble. Groups 1 and 2 are the two party
# chunks (2-200 chars each, may span lines because of DOTALL); the lookahead
# stops the second chunk at punctuation, a newline, or a follow-on keyword.
_PARTY_BLOCK_RE = re.compile(
    r"\b(?:by\s+and\s+between|between)\s+(.{2,200}?)\s+\band\b\s+(.{2,200}?)"
    r"(?=[\.;\n]|\bwhereas\b|\beffective\b|\bdated\b|\bhaving\b|\bwith\s+offices\b|$)",
    re.IGNORECASE | re.DOTALL,
)
# A parenthetical such as (the "Disclosing Party"); group 1 is the text
# inside, without the optional leading "the" and surrounding quotes.
_ROLE_PAREN_RE = re.compile(
    r"\(\s*(?:the\s+)?[\"“]?([^\"”()]+?)[\"”]?\s*\)"
)

# Keyword portion is case-insensitive via an inline (?i:...) group; the
# jurisdiction capture stays case-sensitive so a leading [A-Z] actually
# enforces a capitalized proper noun (a global re.IGNORECASE would defeat that
# and over-capture trailing lowercase clauses like ", without regard to ...").
_GOV_LAW_RE = re.compile(
    r"(?i:governed\s+by(?:\s+and\s+construed\s+in\s+accordance\s+with)?\s+"
    r"(?:the\s+)?laws?\s+of\s+(?:the\s+)?)"
    r"([A-Z][A-Za-z\.\- ]+?(?:,\s*[A-Z][A-Za-z\.\- ]+?)?)"
    r"(?=[\.,;\n)]|\s+and\b|\s+without\b|$)",
)

# Anchor on a term/period/duration keyword, then allow a short same-sentence
# gap before the "<number> <unit>" so phrasings like "the initial term of this
# Agreement is three (3) years" match as well as "for a period of two years".
# Group 1 is the number (digits or a single word), group 2 the unit.
_TERM_LEN_RE = re.compile(
    r"(?:(?:initial\s+)?term|period|duration|"
    r"in\s+(?:full\s+)?(?:force\s+and\s+)?effect\s+for)"
    r"[^.\n]{0,40}?\b(\d+|[A-Za-z]+)(?:\s*\(\d+\))?\s+(years?|months?|weeks?|days?)\b",
    re.IGNORECASE,
)
# "<number> days' [prior] [written] notice"; group 1 is the number (digits or
# a single word).
_NOTICE_RE = re.compile(
    r"(\d+|[A-Za-z]+)(?:\s*\(\d+\))?\s+days?[’'`]?s?\s+"
    r"(?:prior\s+)?(?:written\s+)?notice",
    re.IGNORECASE,
)
# Phrases that indicate the agreement renews automatically.
_AUTORENEW_POS_RE = re.compile(
    r"automatic(?:ally)?\s+renew|auto-?renew|renew(?:s|ed)?\s+automatically|"
    r"successive\s+(?:\d+|[A-Za-z]+)[\s-]+(?:year|month)|"
    r"shall\s+(?:automatically\s+)?renew\s+for",
    re.IGNORECASE,
)
# Strong negations only. Deliberately excludes a bare "non-renewal", which in
# practice appears in "...notice of non-renewal" -- the opt-OUT mechanism of a
# contract that DOES auto-renew, not a statement that it doesn't.
_AUTORENEW_NEG_RE = re.compile(
    r"(?:shall|will|does|may)\s+not\s+(?:automatically\s+)?renew|"
    r"no\s+automatic\s+renewal|"
    r"not\s+(?:be\s+)?renewed?\s+automatically|"
    r"shall\s+not\s+(?:be\s+)?(?:automatically\s+)?renewed?",
    re.IGNORECASE,
)
|
|
489
|
+
_MONEY_RE = re.compile(
|
|
490
|
+
r"(?:\$|US\$|USD\s?|EUR\s?|€|£|GBP\s?)"
|
|
491
|
+
r"\s?\d{1,3}(?:,\d{3})*(?:\.\d+)?"
|
|
492
|
+
r"(?:\s?(?:million|billion|thousand|bn|m|k))?",
|
|
493
|
+
re.IGNORECASE,
|
|
494
|
+
)
|
|
495
|
+
# A quoted, capitalized defined term such as "Confidential Information";
# group 1 is the text between the quotes (3 to 62 characters, starting with
# an ASCII capital).
_DEFTERM_QUOTED_RE = re.compile(
    r"[\"“]([A-Z][A-Za-z0-9][A-Za-z0-9 \-'/&]{1,60})[\"”]"
)
# A parenthesized defined term such as (the "Company") or (Agreement);
# group 1 is the term (3 to 42 characters), without the optional leading
# "the" and quotes.
_DEFTERM_PAREN_RE = re.compile(
    r"\(\s*(?:the\s+)?[\"“]?([A-Z][A-Za-z0-9][A-Za-z0-9 \-'/&]{1,40})[\"”]?\s*\)"
)
|
|
501
|
+
|
|
502
|
+
_WORD_NUMBERS: Dict[str, int] = {
|
|
503
|
+
"zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
|
|
504
|
+
"six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10, "eleven": 11,
|
|
505
|
+
"twelve": 12, "thirteen": 13, "fourteen": 14, "fifteen": 15,
|
|
506
|
+
"twenty": 20, "thirty": 30, "forty": 40, "fifty": 50, "sixty": 60,
|
|
507
|
+
"seventy": 70, "eighty": 80, "ninety": 90, "hundred": 100,
|
|
508
|
+
}
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
def _word_to_int(token: str) -> Optional[int]:
|
|
512
|
+
token = token.strip().lower()
|
|
513
|
+
if token.isdigit():
|
|
514
|
+
return int(token)
|
|
515
|
+
return _WORD_NUMBERS.get(token)
|
|
516
|
+
|
|
517
|
+
|
|
518
|
+
def _parse_date_to_iso(s: str) -> Optional[str]:
|
|
519
|
+
"""Best-effort normalization of a matched date string to ISO (YYYY-MM-DD).
|
|
520
|
+
Returns None when no known format parses."""
|
|
521
|
+
cleaned = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", s.strip().rstrip("."), flags=re.IGNORECASE)
|
|
522
|
+
cleaned = re.sub(r"\bday\s+of\s+", "", cleaned, flags=re.IGNORECASE)
|
|
523
|
+
cleaned = cleaned.replace(",", " ")
|
|
524
|
+
cleaned = re.sub(r"\s+", " ", cleaned).strip()
|
|
525
|
+
fmts = (
|
|
526
|
+
"%Y-%m-%d", "%B %d %Y", "%b %d %Y", "%d %B %Y", "%d %b %Y",
|
|
527
|
+
"%m/%d/%Y", "%m/%d/%y", "%d/%m/%Y",
|
|
528
|
+
)
|
|
529
|
+
for f in fmts:
|
|
530
|
+
try:
|
|
531
|
+
return _dt.datetime.strptime(cleaned, f).date().isoformat()
|
|
532
|
+
except ValueError:
|
|
533
|
+
continue
|
|
534
|
+
return None
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
def _date_field(match: Optional["re.Match[str]"]) -> JSON:
    """Turn a date-regex match into a field envelope.

    No match gives the 'not found' envelope. Otherwise group 1 is normalized
    with `_parse_date_to_iso`: a parsed date is reported as ISO text at
    confidence 0.85, an unparseable one as the raw matched text at 0.55."""
    if match is None:
        return _none_field()
    raw = match.group(1).strip()
    iso = _parse_date_to_iso(raw)
    if iso is None:
        return _field(raw, 0.55)
    return _field(iso, 0.85)
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
def _split_name_role(s: str) -> Tuple[str, Optional[str]]:
    """Split one party chunk into `(name, role)`.

    The first parenthetical found by `_ROLE_PAREN_RE` is treated as the role
    when it is at most 40 characters and not just an article ("a", "an",
    "the"); in that case it is cut out of the name. The name is then stripped
    of surrounding commas/quotes and its whitespace collapsed. `role` is None
    when no acceptable parenthetical exists."""
    name = s.strip().strip(",").strip()
    role: Optional[str] = None
    paren = _ROLE_PAREN_RE.search(name)
    if paren is not None:
        label = paren.group(1).strip()
        # Only treat short, role-like parentheticals as roles.
        role_like = len(label) <= 40 and label.lower() not in ("a", "an", "the")
        if role_like:
            role = label
            remainder = name[: paren.start()] + name[paren.end():]
            name = remainder.strip().rstrip(",").strip()
    name = name.strip("\"“”").strip()
    return re.sub(r"\s+", " ", name), role
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
def extract_parties(text: str) -> List[JSON]:
    """Extract the two parties from the first "between A and B" preamble.

    Returns a list of `{name, confidence, source, role}` dicts (confidence
    0.9, source "deterministic"), or [] when no preamble matches. A chunk
    whose cleaned name is empty, shorter than 2, or longer than 120
    characters is dropped."""
    block = _PARTY_BLOCK_RE.search(text)
    if block is None:
        return []
    parties: List[JSON] = []
    for chunk in block.group(1, 2):
        # Party names can wrap across lines ("...(the \"Disclosing\nParty\")");
        # collapse whitespace rather than truncating at the first newline.
        flattened = re.sub(r"\s+", " ", chunk).strip()
        name, role = _split_name_role(flattened)
        if not name or not (2 <= len(name) <= 120):
            continue
        parties.append({
            "name": name,
            "confidence": 0.9,
            "source": "deterministic",
            "role": role,
        })
    return parties
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
def extract_dates(text: str) -> JSON:
    """Return `{"effective": ..., "expiration": ...}`, each a field envelope
    built from the first match of the corresponding date-cue regex."""
    effective = _date_field(_EFFECTIVE_RE.search(text))
    expiration = _date_field(_EXPIRE_RE.search(text))
    return {"effective": effective, "expiration": expiration}
|
|
585
|
+
|
|
586
|
+
|
|
587
|
+
def extract_governing_law(text: str) -> JSON:
    """Return the governing-law jurisdiction as a field envelope.

    Uses the first `_GOV_LAW_RE` match; the captured jurisdiction has
    trailing periods/commas removed and whitespace collapsed, and is reported
    at confidence 0.85. No match, or an empty result after cleanup, gives the
    'not found' envelope."""
    hit = _GOV_LAW_RE.search(text)
    if hit is None:
        return _none_field()
    trimmed = hit.group(1).strip().rstrip(".,")
    juris = re.sub(r"\s+", " ", trimmed).strip()
    if juris:
        return _field(juris, 0.85)
    return _none_field()
|
|
595
|
+
|
|
596
|
+
|
|
597
|
+
def extract_term(text: str) -> JSON:
    """Extract term length, auto-renewal, and notice period.

    Returns `{"length", "auto_renew", "notice_period_days"}`, each a field
    envelope:
      * length -- "<n> <unit>[s]" at 0.7 when the quantity converts to an
        int, else the raw "<word> <unit>" text at 0.5;
      * notice_period_days -- an int at 0.7, only when the quantity converts;
      * auto_renew -- False at 0.7 if a strong negation is present, else True
        at 0.65 if a renewal phrase is present.
    Anything not found is the 'not found' envelope."""
    length = _none_field()
    term_hit = _TERM_LEN_RE.search(text)
    if term_hit is not None:
        quantity, unit_word = term_hit.group(1), term_hit.group(2)
        count = _word_to_int(quantity)
        if count is None:
            length = _field(f"{quantity} {unit_word}".strip(), 0.5)
        else:
            singular = unit_word.lower().rstrip("s")
            length = _field(f"{count} {singular}{'s' if count != 1 else ''}", 0.7)

    notice = _none_field()
    notice_hit = _NOTICE_RE.search(text)
    if notice_hit is not None:
        days = _word_to_int(notice_hit.group(1))
        if days is not None:
            notice = _field(days, 0.7)

    # The negation is checked first so "shall not automatically renew" is not
    # read as a positive.
    if _AUTORENEW_NEG_RE.search(text):
        auto = _field(False, 0.7)
    elif _AUTORENEW_POS_RE.search(text):
        auto = _field(True, 0.65)
    else:
        auto = _none_field()

    return {"length": length, "auto_renew": auto, "notice_period_days": notice}
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
def extract_value(text: str) -> JSON:
    """Return the first monetary amount in `text` as a field envelope
    (whitespace collapsed, confidence 0.6), or the 'not found' envelope."""
    hit = _MONEY_RE.search(text)
    if hit is None:
        return _none_field()
    amount = re.sub(r"\s+", " ", hit.group(0).strip())
    return _field(amount, 0.6)
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
def extract_defined_terms(text: str) -> List[JSON]:
    """Inventory the defined terms in `text`.

    Scans for quoted terms first, then parenthesized ones, keeping each
    distinct term once in first-seen order. Candidates shorter than 2
    characters, longer than 6 words, or not starting with an upper-case
    letter are rejected. At most 50 terms are returned; the cap applies to
    the combined result of both patterns.

    Returns a list of `{term, confidence, source}` dicts (confidence 0.6,
    source "deterministic")."""
    max_terms = 50
    seen: Dict[str, None] = {}  # dict used as an insertion-ordered set

    def _as_fields() -> List[JSON]:
        return [{"term": t, "confidence": 0.6, "source": "deterministic"} for t in seen]

    for rx in (_DEFTERM_QUOTED_RE, _DEFTERM_PAREN_RE):
        for m in rx.finditer(text):
            term = re.sub(r"\s+", " ", m.group(1).strip())
            # Reject sentence-like or lowercase-y captures.
            if len(term) < 2 or len(term.split()) > 6:
                continue
            if not term[0].isupper():
                continue
            if term in seen:
                continue
            seen[term] = None
            if len(seen) >= max_terms:
                # Stop scanning entirely, not just the current pattern.
                return _as_fields()
    return _as_fields()
|
|
645
|
+
|
|
646
|
+
|
|
647
|
+
def extract_clauses(text: str) -> List[JSON]:
    """Build the clause map: every clause found by `detect_clauses`, mapped
    onto the canonical vocabulary.

    Each entry carries `canonical_title`, `detected_title`, `tier`, `span`
    ({start, end} offsets), `confidence`, `source` and `mapped`. Confidence is
    a per-tier base (h2/explicit 0.95, bold-numbered 0.85, all-caps 0.75,
    anything else 0.7), scaled by 0.75 when the title did not map to a
    canonical name, rounded to two decimals."""
    tier_base = {"h2": 0.95, "bold-numbered": 0.85, "all-caps": 0.75, "explicit": 0.95}
    clause_map: List[JSON] = []
    for clause in detect_clauses(text):
        canonical, mapped = _canonicalize_clause(clause["title"])
        tier = clause["tier"]
        penalty = 1.0 if mapped else 0.75
        confidence = round(tier_base.get(tier, 0.7) * penalty, 2)
        span = {"start": int(clause["start"]), "end": int(clause["end"])}
        clause_map.append({
            "canonical_title": canonical,
            "detected_title": clause["detected"],
            "tier": tier,
            "span": span,
            "confidence": confidence,
            "source": "deterministic",
            "mapped": mapped,
        })
    return clause_map
|
|
664
|
+
|
|
665
|
+
|
|
666
|
+
def extract_title(text: str, path: Optional[Path], fmt: str) -> Optional[str]:
    """Pick a document title.

    Preference order: the first Markdown H1 (`# Title`); else the first
    non-blank line (leading `#` removed) if it is at most 90 characters; else
    a Title-Cased form of the file stem with `_`/`-` turned into spaces; else
    None when no `path` is available. `fmt` is accepted but not consulted."""
    h1 = re.search(r"^#\s+(.+?)\s*$", text, re.MULTILINE)
    if h1 is not None:
        return h1.group(1).strip()

    stripped_lines = (ln.strip().lstrip("#").strip() for ln in text.splitlines())
    first_text = next((ln for ln in stripped_lines if ln), "")
    if first_text and len(first_text) <= 90:
        return first_text

    if path is None:
        return None
    return _titlecase(path.stem.replace("_", " ").replace("-", " "))
|
|
679
|
+
|
|
680
|
+
|
|
681
|
+
# ---------------------------------------------------------------------------
|
|
682
|
+
# Input readers
|
|
683
|
+
# ---------------------------------------------------------------------------
|
|
684
|
+
|
|
685
|
+
|
|
686
|
+
def _detect_format(path: Path, raw: bytes) -> str:
|
|
687
|
+
ext = path.suffix.lower()
|
|
688
|
+
if ext in (".md", ".markdown"):
|
|
689
|
+
return "markdown"
|
|
690
|
+
if ext == ".txt":
|
|
691
|
+
return "text"
|
|
692
|
+
if ext == ".docx":
|
|
693
|
+
return "docx"
|
|
694
|
+
if ext == ".pdf":
|
|
695
|
+
return "pdf"
|
|
696
|
+
if raw[:4] == b"%PDF":
|
|
697
|
+
return "pdf"
|
|
698
|
+
if raw[:2] == b"PK":
|
|
699
|
+
return "docx"
|
|
700
|
+
return "text"
|
|
701
|
+
|
|
702
|
+
|
|
703
|
+
def _read_docx(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[str, List[str]]:
    """Extract text from a .docx and return ``(text, warnings)``.

    Uses python-docx for higher fidelity when the optional [docx] extra is
    installed; otherwise (or if python-docx fails) the stdlib zipfile/XML
    reader handles the document.  Paragraphs whose visible runs are all bold
    are wrapped in ``**...**``.  Never raises: an unparseable document yields
    empty text plus a warning.

    ``path`` is kept for interface compatibility; the document is parsed from
    ``raw`` so the text always corresponds to the bytes that were read.

    `prefer_optional=False` forces the stdlib reader regardless of what's
    installed -- used to pin reproducible golden fixtures."""
    warnings: List[str] = []
    if prefer_optional and importlib.util.find_spec("docx") is not None:
        try:
            import io

            document_cls = getattr(importlib.import_module("docx"), "Document")
            # python-docx accepts a file-like object; feeding it `raw` avoids a
            # second read of the file from disk.
            doc = document_cls(io.BytesIO(raw))
            lines: List[str] = []
            for para in doc.paragraphs:
                line = (para.text or "").strip()
                # Judge boldness on runs that carry visible text only, and
                # require at least one such run: all() over an empty sequence
                # is True and would mark a run-less paragraph as bold.
                visible = [r for r in para.runs if (r.text or "").strip()]
                if line and visible and all(getattr(r, "bold", False) for r in visible):
                    line = f"**{line}**"
                lines.append(line)
            for table in getattr(doc, "tables", []):
                for row in table.rows:
                    for cell in row.cells:
                        ct = (cell.text or "").strip()
                        if ct:
                            lines.append(ct)
            return "\n\n".join(lines), warnings
        except Exception as e:  # pragma: no cover - fidelity path
            warnings.append(f"python-docx read failed ({e}); falling back to stdlib reader")
    try:
        return _read_docx_stdlib(raw), warnings
    except Exception as e:
        warnings.append(f"could not parse .docx ({e}); treating as empty")
        return "", warnings
|
|
736
|
+
|
|
737
|
+
|
|
738
|
+
def _read_docx_stdlib(raw: bytes) -> str:
    """Stdlib-only .docx reader: unzip ``word/document.xml`` and join paragraphs.

    Every ``w:p`` element in document order (including those inside table
    cells) becomes one entry, joined with blank lines; empty paragraphs are
    kept as empty entries.  A paragraph whose visible runs are all bold is
    wrapped in ``**...**``.

    Raises whatever ``zipfile`` / ``ElementTree`` raise on a malformed archive;
    the caller converts that into a warning.
    """
    import io
    import zipfile
    import xml.etree.ElementTree as ET

    w = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    # Attribute values that switch a toggle property such as w:b OFF.
    off_values = ("0", "false", "off")

    with zipfile.ZipFile(io.BytesIO(raw)) as z:
        root = ET.fromstring(z.read("word/document.xml"))

    paras: List[str] = []
    # iter over w:p in document order (includes paragraphs inside table cells).
    for p in root.iter(w + "p"):
        run_texts: List[str] = []
        all_bold = True
        for r in p.iter(w + "r"):
            rpr = r.find(w + "rPr")
            b = rpr.find(w + "b") if rpr is not None else None
            # <w:b/> means bold, but <w:b w:val="0"/> explicitly disables it.
            bold = b is not None and (b.get(w + "val") or "true").lower() not in off_values
            txt = "".join(t.text or "" for t in r.iter(w + "t"))
            # Whitespace-only runs (e.g. an unformatted space between two bold
            # runs) must not veto boldness -- same rule as the python-docx path.
            if txt.strip() and not bold:
                all_bold = False
            run_texts.append(txt)
        line = "".join(run_texts).strip()
        if line and all_bold:
            line = f"**{line}**"
        paras.append(line)
    return "\n\n".join(paras)
|
|
770
|
+
|
|
771
|
+
|
|
772
|
+
def _read_pdf(path: Path, raw: bytes, prefer_optional: bool = True) -> Tuple[str, List[str]]:
    """Extract text from a .pdf and return ``(text, warnings)``.

    pypdf is used when the optional [pdf] extra is importable; if it is absent
    or fails, a stdlib best-effort reader takes over.  Never raises: a PDF that
    cannot be parsed yields empty text plus a warning.

    `prefer_optional=False` forces the stdlib reader regardless of what's
    installed -- used to pin reproducible golden fixtures."""
    notes: List[str] = []
    use_pypdf = prefer_optional and importlib.util.find_spec("pypdf") is not None
    if use_pypdf:
        try:
            reader_cls = getattr(importlib.import_module("pypdf"), "PdfReader")
            import io

            page_texts: List[str] = []
            for page in reader_cls(io.BytesIO(raw)).pages:
                # extract_text() may hand back None/empty for a page.
                page_texts.append(page.extract_text() or "")
            return "\n\n".join(page_texts), notes
        except Exception as exc:  # pragma: no cover - fidelity path
            notes.append(f"pypdf read failed ({exc}); falling back to stdlib reader")
    try:
        return _read_pdf_stdlib(raw), notes
    except Exception as exc:
        notes.append(f"could not parse .pdf ({exc}); treating as empty")
        return "", notes
|
|
796
|
+
|
|
797
|
+
|
|
798
|
+
# Tokenizer for the stdlib PDF text reader.  Alternatives, in match order:
#   1. a parenthesised literal string "( ... )" -- backslash escapes allowed,
#      unescaped nested parentheses are not;
#   2. a bracketed array "[ ... ]" (presumably the operand of TJ);
#   3. the operator keywords Tj TJ Td TD T* BT ET and the ' and " operators.
_PDF_TOKEN_RE = re.compile(
    r"\((?:\\.|[^\\()])*\)|\[(?:\\.|[^\]\\])*\]|Tj|TJ|Td|TD|T\*|BT|ET|'|\""
)
|
|
801
|
+
|
|
802
|
+
|
|
803
|
+
def _pdf_unescape(s: str) -> str:
    """Decode the backslash escapes inside a PDF literal-string body.

    Handles ``\\(``, ``\\)``, ``\\\\``, ``\\n``, ``\\r``, ``\\t``, one-to-three
    digit octal codes (masked to a single byte) and drops ``\\b`` / ``\\f``.
    A backslash directly followed by an end-of-line (LF, CR or CRLF) is a line
    continuation: both the backslash and the EOL are removed.  For any other
    escaped character the backslash is dropped and the character kept; a lone
    trailing backslash is left untouched.
    """
    simple = {
        "(": "(", ")": ")", "\\": "\\",
        "n": "\n", "r": "\r", "t": "\t",
        "b": "", "f": "",
    }
    out: List[str] = []
    i = 0
    n = len(s)
    while i < n:
        ch = s[i]
        if ch != "\\" or i + 1 >= n:
            out.append(ch)
            i += 1
            continue
        nxt = s[i + 1]
        if nxt in simple:
            out.append(simple[nxt])
            i += 2
            continue
        if nxt in "\r\n":
            # Line continuation: the string is split across lines in the file
            # but the EOL is not part of the text.  CRLF counts as one EOL.
            i += 3 if s[i + 1:i + 3] == "\r\n" else 2
            continue
        octal = re.match(r"[0-7]{1,3}", s[i + 1:i + 4])
        if octal:
            out.append(chr(int(octal.group(0), 8) & 0xFF))
            i += 1 + len(octal.group(0))
            continue
        # Unknown escape: the backslash is ignored, the character is kept.
        out.append(nxt)
        i += 2
    return "".join(out)
|
|
834
|
+
|
|
835
|
+
|
|
836
|
+
def _pdf_text_from_content(content: bytes) -> str:
    """Best-effort text extraction from one (decoded) PDF content stream.

    The stream is tokenised with ``_PDF_TOKEN_RE``.  Literal strings -- bare or
    inside a bracketed array -- are unescaped and accumulated into the current
    line; the positioning / text-object operators (``Td``, ``TD``, ``T*``,
    ``BT``, ``ET``) end the current line.  The ``'`` and ``"`` operators move
    to the next line *before* showing their string operand, so the string that
    immediately precedes them starts a new line instead of being appended to
    the previous one.  Returns the lines joined with ``\\n``.
    """
    s = content.decode("latin-1", "replace")
    lines: List[str] = []
    cur: List[str] = []

    def flush() -> None:
        if cur:
            lines.append("".join(cur))
            cur.clear()

    # True only while the most recent token was a bare literal string, i.e.
    # the candidate operand of a following ' or " operator.
    prev_was_string = False
    for m in _PDF_TOKEN_RE.finditer(s):
        tok = m.group(0)
        if tok.startswith("("):
            cur.append(_pdf_unescape(tok[1:-1]))
            prev_was_string = True
            continue
        if tok.startswith("["):
            for sm in re.finditer(r"\((?:\\.|[^\\()])*\)", tok):
                cur.append(_pdf_unescape(sm.group(0)[1:-1]))
        elif tok in ("'", '"'):
            if prev_was_string:
                # Line break first, then the operand string.
                shown = cur.pop()
                flush()
                cur.append(shown)
            else:
                flush()
        elif tok in ("Td", "TD", "T*", "BT", "ET"):
            flush()
        prev_was_string = False
    flush()
    return "\n".join(lines)
|
|
857
|
+
|
|
858
|
+
|
|
859
|
+
def _read_pdf_stdlib(raw: bytes) -> str:
    """Stdlib-only, best-effort PDF text reader.

    Scans the file for ``stream`` ... ``endstream`` pairs, tries to inflate
    each body with zlib and, when that fails, treats the body as an
    uncompressed stream.  Each body is passed to the content-stream text
    extractor; the non-blank results are joined with newlines.
    """
    import zlib

    begin_kw = b"stream"
    end_kw = b"endstream"
    chunks: List[str] = []
    cursor = 0
    while True:
        start = raw.find(begin_kw, cursor)
        if start == -1:
            break
        stop = raw.find(end_kw, start)
        if stop == -1:
            break
        # Drop the end-of-line that follows the `stream` keyword.
        body = raw[start + len(begin_kw):stop].lstrip(b"\r\n")
        try:
            content = zlib.decompress(body)
        except zlib.error:
            # Not Flate-compressed (or corrupt): fall back to the raw bytes.
            # Only zlib's own error is expected here; anything else is a real
            # problem and is left for the caller to report.
            content = body
        chunks.append(_pdf_text_from_content(content))
        cursor = stop + len(end_kw)
    return "\n".join(c for c in chunks if c.strip())
|
|
879
|
+
|
|
880
|
+
|
|
881
|
+
def load_source(path: Path, prefer_optional: bool = True) -> Tuple[bytes, str, str, List[str]]:
    """Read a document from disk. Returns (raw_bytes, text, format, warnings).

    Raises ``ExtractError`` when the path does not exist, is a directory, or
    cannot be read.  Never raises on parse trouble -- that degrades to empty
    text with a warning.

    `prefer_optional=False` forces the stdlib readers for .docx/.pdf so output
    is reproducible regardless of which extras are installed (used by the
    golden fixtures). The CLI default (True) uses the best reader available."""
    if not path.exists():
        raise ExtractError(f"no such file: {path}")
    if path.is_dir():
        raise ExtractError(f"path is a directory, not a file: {path}")
    try:
        raw = path.read_bytes()
    except OSError as e:
        # e.g. permission denied: report it like the other input errors.
        raise ExtractError(f"could not read {path}: {e}") from e
    fmt = _detect_format(path, raw)
    warnings: List[str] = []
    if fmt == "docx":
        text, w = _read_docx(path, raw, prefer_optional)
        warnings += w
    elif fmt == "pdf":
        text, w = _read_pdf(path, raw, prefer_optional)
        warnings += w
    else:
        # markdown / text (and any unexpected format): decode as UTF-8.
        # "utf-8-sig" drops a leading byte-order mark, which would otherwise
        # sit in front of a first-line "# Title" and hide it from the
        # line-anchored patterns.
        text = raw.decode("utf-8-sig", "replace")
    if not text.strip():
        warnings.append(
            f"no extractable text from {fmt} input (scanned or image-only?); "
            "output will be sparse"
        )
    return raw, text, fmt, warnings
|
|
911
|
+
|
|
912
|
+
|
|
913
|
+
# ---------------------------------------------------------------------------
|
|
914
|
+
# Extraction orchestration
|
|
915
|
+
# ---------------------------------------------------------------------------
|
|
916
|
+
|
|
917
|
+
|
|
918
|
+
def build_extraction(text: str, raw: bytes, fmt: str,
                     source_path: Optional[str]) -> JSON:
    """Run the deterministic tier and assemble the output contract object.

    ``raw`` is only hashed (sha256); every extractor works on ``text``.  The
    payload keys are inserted in the order the output contract lists them.
    """
    digest = hashlib.sha256(raw).hexdigest()
    origin = Path(source_path) if source_path else None

    payload: JSON = {
        "document": {
            "title": extract_title(text, origin, fmt),
            "format": fmt,
            "sha256": digest,
            "source_path": source_path,
        },
    }
    # One extractor per top-level field, each taking the plain text.
    extractors = (
        ("parties", extract_parties),
        ("dates", extract_dates),
        ("term", extract_term),
        ("governing_law", extract_governing_law),
        ("clauses", extract_clauses),
        ("defined_terms", extract_defined_terms),
        ("value", extract_value),
    )
    for key, extractor in extractors:
        payload[key] = extractor(text)
    payload["_meta"] = {
        "extractor_version": EXTRACTOR_VERSION,
        "tiers_used": ["deterministic"],
        "llm_used": False,
    }
    return payload
|
|
942
|
+
|
|
943
|
+
|
|
944
|
+
def _is_low_signal(result: JSON) -> bool:
    """Report whether the extraction came back essentially empty.

    True only when there are no parties, no clauses, no defined terms, and
    neither the effective date nor the governing law has a source other than
    ``"none"`` (e.g. a scanned PDF).  Used to set a non-zero exit code as a
    'finding'.
    """
    # Checked lazily, left to right: the first signal found ends the search.
    found_something = (
        result["parties"]
        or result["clauses"]
        or result["dates"]["effective"]["source"] != "none"
        or result["governing_law"]["source"] != "none"
        or result["defined_terms"]
    )
    return not found_something
|
|
958
|
+
|
|
959
|
+
|
|
960
|
+
# ---------------------------------------------------------------------------
|
|
961
|
+
# LLM tier (opt-in only, never in a hot path)
|
|
962
|
+
# ---------------------------------------------------------------------------
|
|
963
|
+
|
|
964
|
+
# Candidate LLM config files, in lookup order: the per-user file under the
# home directory first, then a path relative to the current working directory.
LLM_CONFIG_PATHS = (
    Path.home() / ".config" / "contract-ops" / "llm.json",
    Path("config") / "llm.json",
)
|
|
968
|
+
|
|
969
|
+
|
|
970
|
+
def load_llm_config() -> Optional[JSON]:
    """Suite-shared LLM config lookup: ~/.config/contract-ops/llm.json first,
    then a repo-local ./config/llm.json. Returns the first valid one, else None.

    A candidate is valid when it is a readable file holding a JSON object with
    a truthy ``api_key``.  Unreadable, mis-encoded or malformed files are
    skipped rather than raising; a leading UTF-8 BOM is tolerated."""
    for candidate in LLM_CONFIG_PATHS:
        try:
            if not candidate.is_file():
                continue
            data = json.loads(candidate.read_text(encoding="utf-8-sig"))
        except (OSError, ValueError):
            # ValueError covers both json.JSONDecodeError (bad JSON) and
            # UnicodeDecodeError (file is not valid UTF-8).
            continue
        if isinstance(data, dict) and data.get("api_key"):
            return data
    return None
|
|
982
|
+
|
|
983
|
+
|
|
984
|
+
# Instruction prefix for the opt-in LLM tier.  It ends with the "CONTRACT:\n"
# marker so the contract text can be appended directly after it.
_LLM_PROMPT = (
    "You are a contract-extraction assistant. Given the contract text, return "
    "ONLY a compact JSON object with keys: renewal_mechanics (string or null), "
    "obligations (array of short strings, max 5), governing_law (string or "
    "null). Base answers strictly on the text. No prose, JSON only.\n\n"
    "CONTRACT:\n"
)
|
|
991
|
+
|
|
992
|
+
|
|
993
|
+
def _llm_request(cfg: JSON, prompt: str, timeout: float = 30.0) -> Optional[str]:
    """POST ``prompt`` to the configured LLM provider and return the reply text.

    ``cfg`` must contain ``api_key``; ``provider`` defaults to "anthropic"
    (any other value uses the OpenAI-style chat-completions API at
    ``base_url``), and ``model`` falls back to a per-provider default.

    Returns the reply text, or ``None`` when the response does not have the
    expected shape or carries no textual content.  Network and HTTP errors
    from ``urllib`` and JSON decoding errors propagate to the caller.
    """
    provider = str(cfg.get("provider", "anthropic")).lower()
    is_anthropic = provider == "anthropic"
    model = cfg.get("model") or ("claude-sonnet-4-6" if is_anthropic else "gpt-4o-mini")
    api_key = cfg["api_key"]
    if is_anthropic:
        url = "https://api.anthropic.com/v1/messages"
        payload = {
            "model": model,
            "max_tokens": 1024,
            "messages": [{"role": "user", "content": prompt}],
        }
        headers = {
            "content-type": "application/json",
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
        }
    else:
        base = str(cfg.get("base_url") or "https://api.openai.com/v1").rstrip("/")
        url = f"{base}/chat/completions"
        payload = {
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
        }
        headers = {
            "content-type": "application/json",
            "authorization": f"Bearer {api_key}",
        }
    req = urllib.request.Request(
        url, data=json.dumps(payload).encode("utf-8"), headers=headers, method="POST"
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:  # nosec - opt-in
        body = json.loads(resp.read().decode("utf-8"))

    # The response is external input: validate its shape instead of assuming
    # it, so a surprising payload means "no content" rather than a crash.
    if not isinstance(body, dict):
        return None
    if is_anthropic:
        parts = body.get("content")
        if not isinstance(parts, list):
            return None
        return "".join(
            p["text"] for p in parts
            if isinstance(p, dict) and isinstance(p.get("text"), str)
        )
    choices = body.get("choices")
    if not isinstance(choices, list) or not choices or not isinstance(choices[0], dict):
        return None
    message = choices[0].get("message")
    content = message.get("content") if isinstance(message, dict) else None
    # A null content must not be stringified into the literal text "None".
    return content if isinstance(content, str) else None
|
|
1032
|
+
|
|
1033
|
+
|
|
1034
|
+
def _extract_json_object(s: str) -> Optional[JSON]:
    """Pull a JSON object out of free-form text.

    Takes the span from the first ``{`` to the last ``}`` and parses it.
    Returns the resulting dict, or ``None`` when there is no such span or it
    is not valid JSON.
    """
    try:
        lo = s.index("{")
        hi = s.rindex("}")
    except ValueError:
        # One of the braces is missing altogether.
        return None
    if hi < lo:
        return None
    try:
        parsed = json.loads(s[lo:hi + 1])
    except json.JSONDecodeError:
        return None
    if isinstance(parsed, dict):
        return parsed
    return None
|
|
1044
|
+
|
|
1045
|
+
|
|
1046
|
+
def llm_enrich(result: JSON, text: str, args_ns: argparse.Namespace) -> None:
    """Opt-in enrichment of fuzzy fields. Mutates `result` in place. Any
    failure (no config, network error, bad JSON) degrades gracefully: a warning
    to stderr and the deterministic output is left untouched.

    Only the first 12000 characters of ``text`` are sent.  On a parsed reply,
    ``term.renewal_mechanics`` and ``obligations`` (at most 5) are set from the
    LLM; ``governing_law`` is filled only when the deterministic tier found
    none.  ``_meta.llm_used`` is set whenever a reply was parsed, and "llm" is
    added to ``_meta.tiers_used`` only if at least one field was enriched."""
    cfg = load_llm_config()
    if cfg is None:
        _warn(args_ns, "no LLM config found (~/.config/contract-ops/llm.json or "
                       "./config/llm.json); skipping --llm enrichment")
        return
    prompt = _LLM_PROMPT + text[:12000]
    try:
        raw = _llm_request(cfg, prompt)
    except Exception as e:
        # Deliberately broad: this is the boundary of an optional tier and the
        # contract above is that *no* failure there (URL/HTTP errors, truncated
        # reads, malformed response shapes, ...) may break the deterministic
        # output.  The failure is reported, not swallowed.
        _warn(args_ns, f"LLM request failed ({e}); keeping deterministic output only")
        return
    if not raw:
        _warn(args_ns, "LLM returned no content; keeping deterministic output only")
        return
    obj = _extract_json_object(raw)
    if obj is None:
        _warn(args_ns, "could not parse LLM JSON response; keeping deterministic output only")
        return

    enriched = False
    rm = obj.get("renewal_mechanics")
    if isinstance(rm, str) and rm.strip():
        result["term"]["renewal_mechanics"] = _field(rm.strip(), 0.6, "llm")
        enriched = True
    obligations = obj.get("obligations")
    if isinstance(obligations, list):
        # Clean first, then decide: a list of blanks/nulls is not an enrichment
        # and must not leave an empty "obligations" key behind.
        cleaned = [
            str(o).strip() for o in obligations[:5]
            if o is not None and str(o).strip()
        ]
        if cleaned:
            result["obligations"] = [
                {"text": t, "confidence": 0.55, "source": "llm"} for t in cleaned
            ]
            enriched = True
    gl = obj.get("governing_law")
    if isinstance(gl, str) and gl.strip() and result["governing_law"]["source"] == "none":
        result["governing_law"] = _field(gl.strip(), 0.6, "llm")
        enriched = True

    result["_meta"]["llm_used"] = True
    if enriched and "llm" not in result["_meta"]["tiers_used"]:
        result["_meta"]["tiers_used"].append("llm")
|
|
1089
|
+
|
|
1090
|
+
|
|
1091
|
+
# ---------------------------------------------------------------------------
|
|
1092
|
+
# Output rendering
|
|
1093
|
+
# ---------------------------------------------------------------------------
|
|
1094
|
+
|
|
1095
|
+
# Top-level payload keys filled by the deterministic tier, in output order
# (the `_meta` provenance block is not listed here).
TOP_LEVEL_FIELDS = (
    "document", "parties", "dates", "term", "governing_law",
    "clauses", "defined_terms", "value",
)
|
|
1099
|
+
|
|
1100
|
+
|
|
1101
|
+
def _apply_field_subset(result: JSON, fields: List[str]) -> JSON:
    """Project ``result`` onto the requested top-level fields.

    ``fields`` holds user-supplied names; blanks are ignored and surrounding
    whitespace is stripped.  A dotted name from the field catalog (for example
    ``dates.effective``) selects its whole top-level parent (``dates``) rather
    than silently matching nothing.  Unknown names select nothing.  The
    original key order is preserved and ``_meta`` is always included.
    """
    wanted = {
        name.partition(".")[0].strip()
        for name in fields
        if name.strip()
    }
    out: JSON = {k: v for k, v in result.items() if k in wanted}
    out["_meta"] = result["_meta"]  # provenance always travels with the payload
    return out
|
|
1106
|
+
|
|
1107
|
+
|
|
1108
|
+
def _strip_confidence(obj: Any) -> Any:
    """Recursively drop confidence/source markers for the --no-confidence view.

    A dict that carried markers and is left with a single key collapses to
    that key's value ({"value": x, ...} -> x, {"term": t, ...} -> t).  Dicts
    that never carried markers are kept as dicts even when they hold a single
    key, so structural wrappers such as a payload containing only ``_meta``
    keep their shape."""
    markers = ("confidence", "source")
    if isinstance(obj, dict):
        kept = {k: _strip_confidence(v) for k, v in obj.items() if k not in markers}
        # len(kept) < len(obj) <=> at least one marker was removed here.
        if len(kept) == 1 and len(kept) < len(obj):
            return next(iter(kept.values()))
        return kept
    if isinstance(obj, list):
        return [_strip_confidence(v) for v in obj]
    return obj
|
|
1120
|
+
|
|
1121
|
+
|
|
1122
|
+
def render_json(result: JSON, no_confidence: bool) -> str:
    """Serialise the payload as 2-space-indented, ASCII-only JSON.

    With ``no_confidence`` set, the confidence/source markers are stripped
    first; key order is kept as given.
    """
    if no_confidence:
        payload = _strip_confidence(result)
    else:
        payload = result
    return json.dumps(payload, sort_keys=False, ensure_ascii=True, indent=2)
|
|
1125
|
+
|
|
1126
|
+
|
|
1127
|
+
def _fv(field: JSON) -> str:
    """Format a field's ``value`` for the table view.

    A missing or ``None`` value renders as a dimmed "(not found)"; anything
    else is passed through ``str``.
    """
    value = field.get("value")
    return _dim("(not found)") if value is None else str(value)
|
|
1132
|
+
|
|
1133
|
+
|
|
1134
|
+
def render_table(result: JSON, no_confidence: bool) -> str:
    """Render the extraction as a human-readable, sectioned text view.

    A section is emitted only for keys present in ``result`` (so a field
    subset renders cleanly); the provenance footer is always emitted.
    Per-item confidence tags are omitted when ``no_confidence`` is true.
    LLM-sourced obligations, when present, get their own section, and a
    defined-term list longer than 20 entries ends with a "(+N more)" hint
    instead of being cut off silently.
    """
    lines: List[str] = []

    def conf_tag(item):
        # Dimmed " [0.85]"-style suffix, or nothing in --no-confidence mode.
        return "" if no_confidence else _dim(f" [{item.get('confidence')}]")

    doc = result.get("document", {})
    if doc:
        lines.append(_bold("Document"))
        lines.append(f" title : {doc.get('title') or _dim('(none)')}")
        lines.append(f" format : {doc.get('format')}")
        lines.append(f" sha256 : {str(doc.get('sha256'))[:16]}...")

    parties = result.get("parties")
    if parties is not None:
        lines.append(_bold("Parties"))
        if parties:
            for p in parties:
                role = f" ({p['role']})" if p.get("role") else ""
                lines.append(f" - {p['name']}{role}{conf_tag(p)}")
        else:
            lines.append(" " + _dim("(none detected)"))

    dates = result.get("dates")
    if dates is not None:
        lines.append(_bold("Dates"))
        lines.append(f" effective : {_fv(dates['effective'])}")
        lines.append(f" expiration : {_fv(dates['expiration'])}")

    term = result.get("term")
    if term is not None:
        lines.append(_bold("Term"))
        lines.append(f" length : {_fv(term['length'])}")
        lines.append(f" auto_renew : {_fv(term['auto_renew'])}")
        lines.append(f" notice_days : {_fv(term['notice_period_days'])}")
        if "renewal_mechanics" in term:
            lines.append(f" renewal : {_fv(term['renewal_mechanics'])} {_dim('[llm]')}")

    if "governing_law" in result:
        lines.append(_bold("Governing law"))
        lines.append(f" {_fv(result['governing_law'])}")

    if "value" in result:
        lines.append(_bold("Value"))
        lines.append(f" {_fv(result['value'])}")

    clauses = result.get("clauses")
    if clauses is not None:
        lines.append(_bold(f"Clause map ({len(clauses)})"))
        if clauses:
            lines.append(" " + _dim("canonical tier detected"))
            for c in clauses:
                canon = (c.get("canonical_title") or "")[:20].ljust(20)
                tier = str(c.get("tier"))[:14].ljust(14)
                det = c.get("detected_title", "")
                flag = "" if c.get("mapped") else _yellow(" *")
                lines.append(f" {canon} {tier} {det}{flag}{conf_tag(c)}")
            if any(not c.get("mapped") for c in clauses):
                lines.append(" " + _dim("* = not mapped to suite vocabulary"))
        else:
            lines.append(" " + _dim("(no clause structure detected)"))

    terms = result.get("defined_terms")
    if terms is not None:
        lines.append(_bold(f"Defined terms ({len(terms)})"))
        if terms:
            shown = ", ".join(t["term"] for t in terms[:20])
            hidden = len(terms) - 20
            if hidden > 0:
                # Make the truncation visible; the header shows the full count.
                shown += _dim(f" (+{hidden} more)")
            lines.append(" " + shown)
        else:
            lines.append(" " + _dim("(none detected)"))

    # Only present after LLM enrichment.
    obligations = result.get("obligations")
    if obligations:
        lines.append(_bold(f"Obligations ({len(obligations)})"))
        for o in obligations:
            lines.append(f" - {o.get('text', '')}{conf_tag(o)} {_dim('[llm]')}")

    meta = result.get("_meta", {})
    lines.append(_dim(
        f"tiers={','.join(meta.get('tiers_used', []))} "
        f"llm={meta.get('llm_used')} extractor={meta.get('extractor_version')}"
    ))
    return "\n".join(lines)
|
|
1200
|
+
|
|
1201
|
+
|
|
1202
|
+
# ---------------------------------------------------------------------------
|
|
1203
|
+
# Output JSON Schema (the cross-CLI contract; source of truth for docs/spec/)
|
|
1204
|
+
# ---------------------------------------------------------------------------
|
|
1205
|
+
|
|
1206
|
+
|
|
1207
|
+
def output_schema() -> JSON:
    """Build the JSON Schema (draft 2020-12) for the default JSON output.

    Every object in the contract is closed (``additionalProperties: false``).
    Scalar fields share the ``#/$defs/field`` shape (value + confidence +
    source); list items carry confidence and source inline.
    """

    def closed_object(required, properties):
        # Key order matters: the schema is dumped without sort_keys.
        return {
            "type": "object",
            "required": list(required),
            "properties": properties,
            "additionalProperties": False,
        }

    def array_of(item_schema):
        return {"type": "array", "items": item_schema}

    def provenance():
        # Fresh dict per use: the confidence/source pair every item carries.
        return {
            "confidence": {"$ref": "#/$defs/confidence"},
            "source": {"$ref": "#/$defs/source"},
        }

    def nullable_string():
        return {"type": ["string", "null"]}

    field_ref = {"$ref": "#/$defs/field"}

    definitions = {
        "source": {"enum": ["deterministic", "llm", "none"]},
        "confidence": {"type": "number", "minimum": 0, "maximum": 1},
        "field": closed_object(
            ["value", "confidence", "source"],
            {"value": {}, **provenance()},
        ),
    }

    document = closed_object(
        ["title", "format", "sha256", "source_path"],
        {
            "title": nullable_string(),
            "format": {"enum": ["markdown", "text", "docx", "pdf"]},
            "sha256": {"type": "string", "pattern": "^[0-9a-f]{64}$"},
            "source_path": nullable_string(),
        },
    )
    parties = array_of(closed_object(
        ["name", "confidence", "source"],
        {"name": {"type": "string"}, "role": nullable_string(), **provenance()},
    ))
    dates = closed_object(
        ["effective", "expiration"],
        {"effective": field_ref, "expiration": field_ref},
    )
    term = closed_object(
        ["length", "auto_renew", "notice_period_days"],
        {
            "length": field_ref,
            "auto_renew": field_ref,
            "notice_period_days": field_ref,
            "renewal_mechanics": field_ref,
        },
    )
    span = closed_object(
        ["start", "end"],
        {
            "start": {"type": "integer", "minimum": 0},
            "end": {"type": "integer", "minimum": 0},
        },
    )
    clauses = array_of(closed_object(
        [
            "canonical_title", "detected_title", "tier",
            "span", "confidence", "source", "mapped",
        ],
        {
            "canonical_title": nullable_string(),
            "detected_title": {"type": "string"},
            "tier": {"enum": ["h2", "bold-numbered", "all-caps", "explicit", "llm"]},
            "span": span,
            **provenance(),
            "mapped": {"type": "boolean"},
        },
    ))
    defined_terms = array_of(closed_object(
        ["term", "confidence", "source"],
        {"term": {"type": "string"}, **provenance()},
    ))
    obligations = array_of(closed_object(
        ["text", "confidence", "source"],
        {"text": {"type": "string"}, **provenance()},
    ))
    meta = closed_object(
        ["extractor_version", "tiers_used", "llm_used"],
        {
            "extractor_version": {"type": "string"},
            "tiers_used": {"type": "array", "items": {"enum": ["deterministic", "llm"]}},
            "llm_used": {"type": "boolean"},
        },
    )

    return {
        "$schema": "https://json-schema.org/draft/2020-12/schema",
        "$id": "https://github.com/DrBaher/extract-cli/blob/main/docs/spec/extract-output.schema.json",
        "title": f"extract-cli output schema (v{SCHEMA_VERSION})",
        "description": (
            "Structured payload emitted by `extract <path>` (default JSON output). "
            "The cross-CLI contract that nda-review-cli, compare-cli and "
            "contract-vault consume. Every extracted field carries a confidence "
            "and a source in {deterministic, llm, none}: downstream treats fields "
            "as 'verify, not trust'. Note: the --no-confidence view is a reduced "
            "convenience projection NOT governed by this schema."
        ),
        "type": "object",
        "required": [
            "document", "parties", "dates", "term", "governing_law",
            "clauses", "defined_terms", "value", "_meta",
        ],
        "additionalProperties": False,
        "$defs": definitions,
        "properties": {
            "document": document,
            "parties": parties,
            "dates": dates,
            "term": term,
            "governing_law": field_ref,
            "clauses": clauses,
            "defined_terms": defined_terms,
            "value": field_ref,
            "obligations": obligations,
            "_meta": meta,
        },
    }
|
|
1352
|
+
|
|
1353
|
+
|
|
1354
|
+
# ---------------------------------------------------------------------------
|
|
1355
|
+
# Field catalog (for `extract fields`)
|
|
1356
|
+
# ---------------------------------------------------------------------------
|
|
1357
|
+
|
|
1358
|
+
# Catalog rows are (field path, tier that produces it, human description).
# Dotted paths name a key nested under a top-level payload field.
FIELD_CATALOG: Tuple[Tuple[str, str, str], ...] = (
    ("document.title", "deterministic", "Document title (first heading or filename)"),
    ("parties", "deterministic", "Contracting parties ('between X and Y')"),
    ("dates.effective", "deterministic", "Effective date (ISO-normalized when parseable)"),
    ("dates.expiration", "deterministic", "Expiration date"),
    ("term.length", "deterministic", "Term length, best-effort"),
    ("term.notice_period_days", "deterministic", "Notice period in days, best-effort"),
    ("term.auto_renew", "deterministic", "Auto-renewal flag, best-effort"),
    ("governing_law", "deterministic", "Governing law / jurisdiction"),
    ("clauses", "deterministic", "Clause map normalized to the suite's canonical vocabulary"),
    ("defined_terms", "deterministic", "Defined-term inventory (quoted / parenthetical)"),
    ("value", "deterministic", "Headline monetary value"),
    ("term.renewal_mechanics", "llm", "Renewal mechanics (fuzzy; --llm only)"),
    ("obligations", "llm", "Key obligation phrasing (fuzzy; --llm only)"),
)
|
|
1373
|
+
|
|
1374
|
+
|
|
1375
|
+
# ---------------------------------------------------------------------------
|
|
1376
|
+
# Bundled demo fixture (so `extract demo` works from an installed wheel)
|
|
1377
|
+
# ---------------------------------------------------------------------------
|
|
1378
|
+
|
|
1379
|
+
DEMO_DOCUMENT = """# Mutual Non-Disclosure Agreement
|
|
1380
|
+
|
|
1381
|
+
This Mutual Non-Disclosure Agreement (the "Agreement") is made and entered into
|
|
1382
|
+
as of March 1, 2024, by and between Acme Robotics, Inc. (the "Disclosing Party")
|
|
1383
|
+
and Beta Logistics LLC (the "Receiving Party").
|
|
1384
|
+
|
|
1385
|
+
## Definitions
|
|
1386
|
+
|
|
1387
|
+
For purposes of this Agreement, "Confidential Information" means any non-public
|
|
1388
|
+
information disclosed by one party to the other.
|
|
1389
|
+
|
|
1390
|
+
## Confidentiality Obligations
|
|
1391
|
+
|
|
1392
|
+
The Receiving Party shall protect the Confidential Information using no less than
|
|
1393
|
+
reasonable care and shall not disclose it to any third party.
|
|
1394
|
+
|
|
1395
|
+
## Term
|
|
1396
|
+
|
|
1397
|
+
This Agreement shall remain in effect for a period of three (3) years from the
|
|
1398
|
+
Effective Date and shall automatically renew for successive one-year terms unless
|
|
1399
|
+
either party gives sixty (60) days' written notice of non-renewal.
|
|
1400
|
+
|
|
1401
|
+
## Limitation of Liability
|
|
1402
|
+
|
|
1403
|
+
In no event shall either party's aggregate liability exceed $50,000.
|
|
1404
|
+
|
|
1405
|
+
## Governing Law
|
|
1406
|
+
|
|
1407
|
+
This Agreement shall be governed by and construed in accordance with the laws of
|
|
1408
|
+
the State of Delaware, without regard to its conflict-of-laws principles.
|
|
1409
|
+
"""
|
|
1410
|
+
|
|
1411
|
+
|
|
1412
|
+
# ---------------------------------------------------------------------------
|
|
1413
|
+
# Command handlers
|
|
1414
|
+
# ---------------------------------------------------------------------------
|
|
1415
|
+
|
|
1416
|
+
|
|
1417
|
+
def cmd_extract(args: argparse.Namespace) -> int:
    """Extract the document at ``args.path`` and print the result to stdout.

    Loader warnings are forwarded through ``_warn``. ``--llm`` triggers
    ``llm_enrich``; ``--fields`` narrows the result via
    ``_apply_field_subset``. ``--json`` overrides ``--format``, and
    ``--silent`` suppresses any non-JSON output.

    Returns:
        1 when no field subset was requested and ``_is_low_signal`` reports
        the result as low-signal; otherwise 0.
    """
    source_path = Path(args.path)
    raw, text, fmt, load_warnings = load_source(source_path)
    for message in load_warnings:
        _warn(args, message)

    result = build_extraction(text, raw, fmt, str(args.path))
    if args.llm:
        llm_enrich(result, text, args)

    # --json always wins over --format.
    if args.json:
        fmt_out = "json"
    else:
        fmt_out = args.format

    subset_requested = bool(args.fields)
    if subset_requested:
        result = _apply_field_subset(result, args.fields.split(","))

    # Rationale lines are computed from the (possibly subsetted) result.
    meta = result["_meta"]
    counts_line = (
        f"format={fmt} parties={len(result.get('parties', []))} "
        f"clauses={len(result.get('clauses', []))}"
    )
    tiers_line = (
        f"tiers={','.join(meta['tiers_used'])} "
        f"llm_used={meta['llm_used']}"
    )
    if subset_requested:
        signal_line = "fields_subset=on"
    else:
        signal_line = f"low_signal={_is_low_signal(result)}"
    _why_print(args, f"extracted {source_path.name}", counts_line, tiers_line, signal_line)

    # silent suppresses the human table; JSON is the machine payload
    suppress_output = args.silent and fmt_out != "json"
    if not suppress_output:
        renderer = render_table if fmt_out == "table" else render_json
        print(renderer(result, args.no_confidence))

    if not subset_requested and _is_low_signal(result):
        _warn(args, "document produced no high-signal fields (parties/clauses/dates); "
                    "it may be scanned, image-only, or unstructured")
        return 1
    return 0
|
|
1453
|
+
|
|
1454
|
+
|
|
1455
|
+
def cmd_schema(args: argparse.Namespace) -> int:
    """Print ``output_schema()`` as ASCII-only JSON with a 2-space indent."""
    schema_text = json.dumps(output_schema(), indent=2, ensure_ascii=True)
    print(schema_text)
    return 0
|
|
1458
|
+
|
|
1459
|
+
|
|
1460
|
+
def cmd_fields(args: argparse.Namespace) -> int:
    """List every ``FIELD_CATALOG`` entry, as JSON or as a colored listing.

    With ``--json`` the catalog is printed as a JSON array of
    ``{"field", "tier", "description"}`` objects; otherwise one line per
    field is printed under a heading. Always returns 0.
    """
    if not args.json:
        heading = _bold("Extractable fields") + _dim(" (tier = which extraction tier produces it)")
        print(heading)
        for name, tier, description in FIELD_CATALOG:
            # Deterministic-tier fields are green; every other tier is yellow.
            colorize = _green if tier == "deterministic" else _yellow
            tier_tag = colorize(tier)
            print(f" {name.ljust(26)} {tier_tag.ljust(22)} {_dim(description)}")
        return 0

    rows = []
    for name, tier, description in FIELD_CATALOG:
        rows.append({"field": name, "tier": tier, "description": description})
    print(json.dumps(rows, indent=2, ensure_ascii=True))
    return 0
|
|
1473
|
+
|
|
1474
|
+
|
|
1475
|
+
def cmd_demo(args: argparse.Namespace) -> int:
    """Run the extraction on the bundled ``DEMO_DOCUMENT`` and print it.

    The explanatory banner and the closing hint go to stderr via
    ``_eprint`` unless ``args.silent`` is set. The extraction itself is
    always printed to stdout, as a table only when ``--format table`` is
    given without ``--json``. Always returns 0.
    """
    chatty = not args.silent
    result = build_extraction(
        DEMO_DOCUMENT,
        DEMO_DOCUMENT.encode("utf-8"),
        "markdown",
        "(bundled demo fixture)",
    )

    if chatty:
        _eprint(_bold("extract-cli demo") + " -- the suite's passport control")
        blurb = (
            " A foreign document comes in (here: a bundled NDA). The deterministic\n"
            " tier maps its clauses onto the suite's canonical vocabulary and pulls\n"
            " parties/dates/term/governing-law -- no LLM, no network. The JSON below\n"
            " is what nda-review-cli / compare-cli / contract-vault consume.\n"
        )
        _eprint(_dim(blurb))

    # --json overrides --format, so the table needs both conditions.
    wants_table = (not args.json) and args.format == "table"
    renderer = render_table if wants_table else render_json
    print(renderer(result, args.no_confidence))

    if chatty:
        _eprint(_dim("\n Try: extract demo --format json | jq '.clauses[].canonical_title'"))
    return 0
|
|
1494
|
+
|
|
1495
|
+
|
|
1496
|
+
# ---------------------------------------------------------------------------
|
|
1497
|
+
# Shell completion
|
|
1498
|
+
# ---------------------------------------------------------------------------
|
|
1499
|
+
|
|
1500
|
+
# Subcommand names: printed by the hidden `__complete commands` handler and
# used by main() to decide whether argv goes through the full parser.
_SUBCOMMANDS = (
    "schema",
    "fields",
    "demo",
    "completion",
)
# Flag spellings printed by the hidden `__complete flags` handler.
_GLOBAL_FLAGS = (
    "--json",
    "--why",
    "-q",
    "--silent",
    "--no-color",
    "--llm",
    "--format",
    "--fields",
    "--no-confidence",
    "-V",
    "--version",
    "-h",
    "--help",
)
|
|
1505
|
+
|
|
1506
|
+
_BASH_COMPLETION = r"""# extract-cli bash completion
|
|
1507
|
+
# eval "$(extract completion bash)"
|
|
1508
|
+
_extract_completions() {
|
|
1509
|
+
local cur prev
|
|
1510
|
+
cur="${COMP_WORDS[COMP_CWORD]}"
|
|
1511
|
+
local cmds="schema fields demo completion"
|
|
1512
|
+
local flags="--json --why -q --silent --no-color --llm --format --fields --no-confidence -V --version -h --help"
|
|
1513
|
+
if [ "$COMP_CWORD" -eq 1 ]; then
|
|
1514
|
+
COMPREPLY=( $(compgen -W "${cmds}" -- "${cur}") $(compgen -f -- "${cur}") )
|
|
1515
|
+
return 0
|
|
1516
|
+
fi
|
|
1517
|
+
if [[ "${cur}" == -* ]]; then
|
|
1518
|
+
COMPREPLY=( $(compgen -W "${flags}" -- "${cur}") )
|
|
1519
|
+
return 0
|
|
1520
|
+
fi
|
|
1521
|
+
COMPREPLY=( $(compgen -f -- "${cur}") )
|
|
1522
|
+
}
|
|
1523
|
+
complete -F _extract_completions extract
|
|
1524
|
+
"""
|
|
1525
|
+
|
|
1526
|
+
# Zsh completion script written verbatim to stdout by `extract completion zsh`.
# At word position 2 it offers the described subcommands plus file names; at
# later positions it offers file names plus the flag list. The flag list
# includes -h/--help so it matches _GLOBAL_FLAGS and the bash script.
_ZSH_COMPLETION = r"""# extract-cli zsh completion
# eval "$(extract completion zsh)"
_extract() {
    local -a cmds flags
    cmds=(
        'schema:Print the output JSON Schema (the cross-CLI contract)'
        'fields:List extractable fields and their tier'
        'demo:Run extraction on a bundled fixture'
        'completion:Emit a shell completion script'
    )
    flags=(
        '--json' '--why' '-q' '--silent' '--no-color' '--llm'
        '--format' '--fields' '--no-confidence' '-V' '--version'
        '-h' '--help'
    )
    if (( CURRENT == 2 )); then
        _describe 'command' cmds
        _files
        return
    fi
    _files
    compadd -- ${flags}
}
compdef _extract extract
"""
|
|
1550
|
+
|
|
1551
|
+
|
|
1552
|
+
def cmd_completion(args: argparse.Namespace) -> int:
    """Write the completion script for ``args.shell`` to stdout.

    The shell name is matched case-insensitively against ``bash`` and
    ``zsh``.

    Returns:
        0 after writing the script.

    Raises:
        ExtractError: if the shell is neither ``bash`` nor ``zsh``.
    """
    requested = (args.shell or "").lower()
    scripts = {"bash": _BASH_COMPLETION, "zsh": _ZSH_COMPLETION}
    script = scripts.get(requested)
    if script is None:
        raise ExtractError(f"unsupported shell: {args.shell!r}. Supported: bash, zsh.")
    sys.stdout.write(script)
    return 0
|
|
1561
|
+
|
|
1562
|
+
|
|
1563
|
+
def _completion_handler(argv: List[str]) -> int:
    """Serve the hidden `__complete` request by printing one candidate per line.

    ``argv[0]`` selects the list: ``"commands"`` prints ``_SUBCOMMANDS`` and
    ``"flags"`` prints ``_GLOBAL_FLAGS``. An empty ``argv`` or any other
    selector prints nothing. Always returns 0.
    """
    if argv:
        selector = argv[0]
        if selector == "commands":
            candidates = _SUBCOMMANDS
        elif selector == "flags":
            candidates = _GLOBAL_FLAGS
        else:
            candidates = ()
        for candidate in candidates:
            print(candidate)
    return 0
|
|
1575
|
+
|
|
1576
|
+
|
|
1577
|
+
# ---------------------------------------------------------------------------
|
|
1578
|
+
# Argument parsing + main
|
|
1579
|
+
# ---------------------------------------------------------------------------
|
|
1580
|
+
|
|
1581
|
+
|
|
1582
|
+
def _add_common_output_flags(p: argparse.ArgumentParser) -> None:
    """Register on ``p`` the output flags shared by the extract and demo parsers.

    Adds, in order: ``--json``, ``--format {json,table}`` (default ``json``),
    ``--no-confidence``, ``--why``, and ``-q/--silent/--quiet`` (stored as
    ``silent``).
    """

    def add_switch(*names: str, text: str, **extra: str) -> None:
        # Every boolean flag here is a plain store_true switch.
        p.add_argument(*names, action="store_true", help=text, **extra)

    add_switch("--json", text="Force JSON output to stdout (the default).")
    p.add_argument(
        "--format",
        choices=("json", "table"),
        default="json",
        help="Output format (default: json).",
    )
    add_switch(
        "--no-confidence",
        text="Omit confidence/source markers (reduced convenience view).",
    )
    add_switch("--why", text="Print a rationale block to stderr.")
    add_switch(
        "-q", "--silent", "--quiet",
        dest="silent",
        text="Suppress non-error diagnostics (and the human table).",
    )
|
|
1593
|
+
|
|
1594
|
+
|
|
1595
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the full parser: global options plus every subcommand.

    Subcommands are registered in the order schema, fields, demo,
    completion, extract, and each one stores its handler in ``func``. The
    chosen subcommand name is stored in ``command``.
    """
    root = argparse.ArgumentParser(
        prog="extract",
        description=(
            "Ingest any contract (.md/.txt/.docx/.pdf) and emit structured "
            "JSON for the contract-ops CLI suite. See docs/INTEROP.md."
        ),
    )
    root.add_argument(
        "-V", "--version",
        action="version",
        version=f"{CLI_NAME} {__version__}",
    )
    root.add_argument(
        "--no-color",
        action="store_true",
        help="Disable ANSI color (also honors NO_COLOR / FORCE_COLOR).",
    )

    commands = root.add_subparsers(dest="command")

    schema_cmd = commands.add_parser(
        "schema", help="Print the output JSON Schema (the contract).")
    schema_cmd.set_defaults(func=cmd_schema)

    fields_cmd = commands.add_parser(
        "fields", help="List extractable fields and their tier.")
    fields_cmd.add_argument("--json", action="store_true", help="Emit JSON.")
    fields_cmd.set_defaults(func=cmd_fields)

    demo_cmd = commands.add_parser(
        "demo", help="Run extraction on a bundled fixture.")
    _add_common_output_flags(demo_cmd)
    # --llm and --fields are accepted on `demo` but kept out of its --help.
    demo_cmd.add_argument("--llm", action="store_true", help=argparse.SUPPRESS)
    demo_cmd.add_argument("--fields", default="", help=argparse.SUPPRESS)
    demo_cmd.set_defaults(func=cmd_demo)

    completion_cmd = commands.add_parser(
        "completion", help="Emit a shell completion script (bash or zsh).")
    completion_cmd.add_argument("shell", choices=("bash", "zsh"))
    completion_cmd.set_defaults(func=cmd_completion)

    # The extract subcommand gets its arguments and handler from the shared
    # helper, which the bare `extract <path>` parser also uses.
    extract_cmd = commands.add_parser(
        "extract", help="Extract a document (explicit form of the default).")
    _build_extract_args(extract_cmd)

    return root
|
|
1629
|
+
|
|
1630
|
+
|
|
1631
|
+
def _build_extract_args(p: argparse.ArgumentParser) -> None:
    """Attach the extract action's arguments to ``p`` and bind ``cmd_extract``.

    Adds the positional ``path``, ``--llm``, ``--fields`` (default ``""``),
    then the shared output flags, and sets ``func`` to ``cmd_extract``.
    """
    llm_help = (
        "Opt-in LLM enrichment of fuzzy fields (renewal, obligations). "
        "Off by default; the deterministic core is fully useful without it."
    )
    fields_help = (
        "Comma-separated subset of top-level fields to emit "
        "(e.g. parties,clauses,governing_law)."
    )

    p.add_argument("path", help="Path to the document (.md/.txt/.docx/.pdf).")
    p.add_argument("--llm", action="store_true", help=llm_help)
    p.add_argument("--fields", default="", help=fields_help)
    _add_common_output_flags(p)
    p.set_defaults(func=cmd_extract)
|
|
1641
|
+
|
|
1642
|
+
|
|
1643
|
+
def _build_default_extract_parser() -> argparse.ArgumentParser:
    """Build the parser for the bare `extract <path>` form (no subcommand).

    It accepts ``--no-color`` plus everything ``_build_extract_args`` adds.
    """
    default_parser = argparse.ArgumentParser(
        prog="extract",
        description="Extract a document into structured JSON (default action).",
    )
    default_parser.add_argument(
        "--no-color",
        action="store_true",
        help="Disable ANSI color (also honors NO_COLOR / FORCE_COLOR).",
    )
    _build_extract_args(default_parser)
    return default_parser
|
|
1653
|
+
|
|
1654
|
+
|
|
1655
|
+
def main(argv: Optional[List[str]] = None) -> int:
    """CLI entry point; returns the process exit status.

    Args:
        argv: Argument list without the program name. Defaults to
            ``sys.argv[1:]``.

    Returns:
        The handler's status (a falsy result is reported as 0), 0 after
        printing help, 2 when an ``ExtractError`` is raised, 0 when the
        reader of stdout has gone away, or 130 on Ctrl-C.
    """
    # Locale-safe stdout/stderr: POSIX/C locale (common on macOS CI runners)
    # leaves the streams in ASCII mode, so any non-ASCII char would raise
    # UnicodeEncodeError. Force UTF-8 regardless of LANG/LC_ALL.
    for _stream in (sys.stdout, sys.stderr):
        if hasattr(_stream, "reconfigure"):
            try:
                _stream.reconfigure(encoding="utf-8", errors="replace")
            except Exception:
                # Deliberate best effort: keep the stream's current encoding.
                pass

    argv = sys.argv[1:] if argv is None else argv

    # Global --no-color before argparse so it works on every form.
    if "--no-color" in argv:
        os.environ["NO_COLOR"] = "1"
        argv = [a for a in argv if a != "--no-color"]

    # Hidden completion handler (kept out of argparse / --help).
    if argv and argv[0] == "__complete":
        return _completion_handler(argv[1:])

    if not argv:
        build_parser().print_help()
        return 0

    # Route: a known subcommand or -V/-h go through the full parser; anything
    # else is treated as the default `extract <path>` action.
    known = set(_SUBCOMMANDS) | {"extract", "-V", "--version", "-h", "--help"}
    first = argv[0]
    try:
        if first in known:
            parser = build_parser()
            args = parser.parse_args(argv)
            if not getattr(args, "func", None):
                parser.print_help()
                return 0
        else:
            args = _build_default_extract_parser().parse_args(argv)
        status = args.func(args) or 0
        # Piped stdout is block-buffered, so a closed pipe is normally only
        # noticed when the buffer is flushed. Flush here, inside the try, so
        # the BrokenPipeError handler below sees it instead of the
        # interpreter reporting it at shutdown.
        if sys.stdout is not None:
            sys.stdout.flush()
        return status
    except ExtractError as e:
        _eprint(_red("error:") + f" {e}")
        return 2
    except BrokenPipeError:  # e.g. `extract foo.md | head`
        # Point stdout at devnull so the flush at interpreter exit cannot
        # raise again (the approach shown in the Python signal docs).
        try:
            devnull = os.open(os.devnull, os.O_WRONLY)
            try:
                os.dup2(devnull, sys.stdout.fileno())
            finally:
                os.close(devnull)
        except (OSError, ValueError):
            # stdout has no usable file descriptor; fall back to closing it.
            try:
                sys.stdout.close()
            except Exception:
                pass
        return 0
    except KeyboardInterrupt:  # pragma: no cover
        _eprint("interrupted")
        return 130
|
|
1707
|
+
|
|
1708
|
+
|
|
1709
|
+
if __name__ == "__main__":
    # Exit with main()'s return value as the process status.
    raise SystemExit(main())
|