contract-archive-cli 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contract_archive/__init__.py +2 -0
- contract_archive/archive/__init__.py +64 -0
- contract_archive/archive/db.py +126 -0
- contract_archive/archive/ingest.py +667 -0
- contract_archive/archive/migrations/001_init.sql +62 -0
- contract_archive/archive/migrations/002_obligations.sql +25 -0
- contract_archive/archive/migrations/003_document_types.sql +31 -0
- contract_archive/archive/migrations/004_seals_subjects.sql +36 -0
- contract_archive/archive/migrations/005_completeness.sql +18 -0
- contract_archive/archive/party_registry.py +276 -0
- contract_archive/archive/paths.py +113 -0
- contract_archive/archive/repository.py +918 -0
- contract_archive/cli.py +455 -0
- contract_archive/cli_common.py +293 -0
- contract_archive/cli_config.py +96 -0
- contract_archive/cli_introspect.py +204 -0
- contract_archive/cli_party.py +166 -0
- contract_archive/cli_query.py +492 -0
- contract_archive/cli_render.py +575 -0
- contract_archive/config.py +257 -0
- contract_archive/errors.py +163 -0
- contract_archive/extraction/__init__.py +14 -0
- contract_archive/extraction/amount_check.py +87 -0
- contract_archive/extraction/contract_extractor.py +103 -0
- contract_archive/extraction/document_extractor.py +546 -0
- contract_archive/extraction/evidence_page_fix.py +99 -0
- contract_archive/extraction/llm_extractor.py +207 -0
- contract_archive/extraction/normalize.py +210 -0
- contract_archive/extraction/property_fee.py +79 -0
- contract_archive/extraction/vision_seal.py +390 -0
- contract_archive/pipelines/__init__.py +9 -0
- contract_archive/pipelines/mineru_pipeline.py +955 -0
- contract_archive/pipelines/vl_ocr.py +160 -0
- contract_archive/schemas/__init__.py +67 -0
- contract_archive/schemas/document.py +408 -0
- contract_archive/utils/__init__.py +27 -0
- contract_archive/utils/device.py +51 -0
- contract_archive/utils/http_env.py +54 -0
- contract_archive/utils/pdf.py +207 -0
- contract_archive_cli-0.2.7.dist-info/METADATA +386 -0
- contract_archive_cli-0.2.7.dist-info/RECORD +44 -0
- contract_archive_cli-0.2.7.dist-info/WHEEL +4 -0
- contract_archive_cli-0.2.7.dist-info/entry_points.txt +2 -0
- contract_archive_cli-0.2.7.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,575 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLI 渲染层:把 DocumentRow / IngestResult 等数据对象格式化成展示字符串 / JSON dict / rich Table。
|
|
3
|
+
|
|
4
|
+
这里只放与 typer/console 无关的纯函数(输入数据对象,输出字符串/dict/Table 对象,
|
|
5
|
+
不碰 stdout——打印交给 cli.py 的 console),便于单测、也让 cli.py 专注命令定义与参数解析。
|
|
6
|
+
函数对入参做鸭子类型,不依赖具体 model 类型,避免反向 import。
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Optional
|
|
14
|
+
|
|
15
|
+
from rich.table import Table
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def status_color(s: str) -> str:
    """Wrap a status string in rich colour markup.

    ``ok`` renders green, ``partial`` yellow and ``failed`` red; any other
    value falls back to white.
    """
    palette = {"ok": "green", "partial": "yellow", "failed": "red"}
    tint = palette.get(s, "white")
    return f"[{tint}]{s}[/{tint}]"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def subject_of(r) -> str:
    """Build the "subject" cell used by the list view.

    The ``parties`` entry of ``r.details()`` wins; when it is empty the
    non-empty values of ``r.party_a`` / ``r.party_b`` are used instead.
    At most two names are joined with "、", and anything longer than 20
    characters is cut to 19 characters plus an ellipsis so the column
    stays narrow. Returns "-" when no name is available.
    """
    names = r.details().get("parties") or [
        name for name in (r.party_a, r.party_b) if name
    ]
    if not names:
        return "-"
    joined = "、".join(names[:2])
    if len(joined) > 20:
        return joined[:19] + "…"
    return joined
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def display_amount(r) -> str:
    """Build the amount cell used by the list view.

    A numeric ``computed_total_value`` in ``r.details()`` is shown first and
    tagged with a cyan ``*``; otherwise ``r.primary_amount_value`` is shown;
    when neither exists the cell is "-".
    """
    computed = r.details().get("computed_total_value")
    if isinstance(computed, (int, float)):
        return f"¥{computed:,.0f}[cyan]*[/cyan]"
    extracted = r.primary_amount_value
    return "-" if extracted is None else f"¥{extracted:,.0f}"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def completeness_mark(r) -> str:
    """Build the "completeness" cell used by the list view.

    Reads ``r.completeness_status`` (treated as ``None`` when the attribute
    is missing): ``incomplete`` gives a red warning, ``complete`` a green
    tick, ``unknown`` a yellow question mark, and anything else a dim dash.
    """
    marks = {
        "incomplete": "[red]⚠ 疑似缺[/red]",
        "complete": "[green]✓[/green]",
        "unknown": "[yellow]?[/yellow]",
    }
    state = getattr(r, "completeness_status", None)
    return marks.get(state, "[dim]-[/dim]")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def period_str(a: dict) -> str:
    """Return the dimmed coverage-period suffix for an amount entry.

    Uses the ``period_start`` / ``period_end`` keys of ``a``. A missing side
    is shown as ``?``; when both are missing the result is an empty string.
    """
    begin = a.get("period_start")
    finish = a.get("period_end")
    if begin or finish:
        return f" [dim][{begin or '?'}~{finish or '?'}][/dim]"
    return ""
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def local_time(iso_utc: Optional[str]) -> str:
    """Format a stored ISO-8601 timestamp in the machine's local timezone.

    The canonical stored form is ``2026-05-24T23:05:04Z``. ISO variants that
    carry fractional seconds or an explicit UTC offset (for example
    ``2026-05-24T23:05:04.250Z`` or ``2026-05-24T23:05:04+00:00``) are
    accepted as well, so they are converted instead of being echoed back.

    Returns:
        ``"-"`` for an empty value; ``YYYY-MM-DD HH:MM:SS`` in local time on
        success; the input unchanged when it cannot be parsed or carries no
        timezone information (its zone would have to be guessed).
    """
    if not iso_utc:
        return "-"
    try:
        try:
            # Fast path: the exact format the archive writes.
            moment = datetime.strptime(iso_utc, "%Y-%m-%dT%H:%M:%SZ").replace(
                tzinfo=timezone.utc
            )
        except ValueError:
            # Older Pythons' fromisoformat() rejects a trailing "Z", so it is
            # rewritten as an explicit zero offset first.
            candidate = iso_utc[:-1] + "+00:00" if iso_utc.endswith("Z") else iso_utc
            moment = datetime.fromisoformat(candidate)
            if moment.tzinfo is None:
                return iso_utc
        return moment.astimezone().strftime("%Y-%m-%d %H:%M:%S")
    except (ValueError, TypeError):
        return iso_utc
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def ingest_result_to_dict(r) -> dict:
    """Convert an ingest result object into a JSON-friendly dict.

    ``pdf_path`` is stringified. ``error`` is the output of
    ``r.error.model_dump()`` when an error object is present, otherwise
    ``None``. All other fields are copied from the object as they are.
    """
    payload = {"pdf_path": str(r.pdf_path)}
    for attr in (
        "sha256",
        "status",
        "doc_id",
        "mineru_duration_s",
        "llm_duration_s",
        "error_message",
    ):
        payload[attr] = getattr(r, attr)
    failure = r.error
    # Structured error; None when there is nothing to report.
    payload["error"] = failure.model_dump() if failure else None
    payload["skipped_reason"] = r.skipped_reason
    return payload
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def archived_doc_dir(row, archive_root: Optional[Path | str] = None) -> Path:
    """Return the archive directory shown for a document.

    With ``archive_root`` the result is always
    ``<archive_root>/documents/<first 12 chars of sha256>``. Without it,
    ``row.output_dir`` is used when set, else the relative path
    ``documents/<first 12 chars of sha256>``.
    """
    if archive_root is None:
        if row.output_dir:
            return Path(row.output_dir)
        base = Path("documents")
    else:
        base = Path(archive_root) / "documents"
    return base / row.sha256[:12]
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def archived_source_path(row, archive_root: Optional[Path | str] = None) -> Path:
    """Return the path of the retained PDF: ``source.pdf`` inside the
    directory given by ``archived_doc_dir(row, archive_root)``."""
    doc_dir = archived_doc_dir(row, archive_root)
    return doc_dir.joinpath("source.pdf")
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def archive_source_status(row, archive_root: Optional[Path | str] = None) -> dict:
    """Describe the retained PDF: its path and whether the file exists.

    Returns a dict with ``path`` (str), ``exists`` (bool) and ``status``
    (``"present"`` or ``"missing"``).
    """
    pdf = archived_source_path(row, archive_root)
    present = pdf.is_file()
    status = "present" if present else "missing"
    return dict(path=str(pdf), exists=present, status=status)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def seal_rows_to_dict(rows) -> list[dict]:
    """Convert seal rows into a list of JSON-friendly dicts.

    Each dict carries ``doc_id``, ``title``, ``owner``, ``seal_type`` and
    ``raw_text``, copied from the row attribute of the same name.
    """
    columns = ("doc_id", "title", "owner", "seal_type", "raw_text")
    return [{name: getattr(seal, name) for name in columns} for seal in rows]
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def row_to_dict(
    r,
    *,
    archive_root: Optional[Path | str] = None,
    include_original_source: bool = True,
) -> dict:
    """Convert a document row into a JSON-friendly dict.

    When ``archive_root`` is given, ``source_path`` and ``output_dir`` are
    reported as paths derived from the archive root rather than the values
    stored on the row. The stored source path is additionally exposed as
    ``original_source_path`` unless ``include_original_source`` is false.
    """
    det = r.details()
    doc_dir = archived_doc_dir(r, archive_root)
    pdf = archive_source_status(r, archive_root)
    rooted = archive_root is not None

    # Envelope columns copied straight from the row.
    payload = {
        name: getattr(r, name)
        for name in (
            "id",
            "sha256",
            "status",
            "doc_type",
            "title",
            "summary",
            "primary_date",
            "primary_amount_value",
        )
    }
    # Selected entries lifted out of the details dict for convenience.
    for key in ("computed_total_value", "seals", "sub_agreements", "completeness"):
        payload[key] = det.get(key)
    payload["completeness_status"] = r.completeness_status
    payload["llm_model"] = det.get("llm_model")
    payload["details"] = det

    # Contract-specific columns.
    for name in (
        "contract_name",
        "party_a",
        "party_b",
        "amount_text",
        "amount_value",
        "sign_date",
        "expire_date",
    ):
        payload[name] = getattr(r, name)
    renewal = r.auto_renewal
    payload["auto_renewal"] = None if renewal is None else bool(renewal)
    payload["risk_clauses"] = r.risk_clauses
    payload["obligations"] = [
        {
            "actor": duty.actor,
            "action": duty.action,
            "deadline": duty.deadline,
            "evidence": duty.evidence,
        }
        for duty in r.obligations
    ]
    payload["overall_confidence"] = r.overall_confidence

    # Location fields.
    payload["source_path"] = pdf["path"] if rooted else r.source_path
    payload["archive_source_path"] = pdf["path"]
    payload["archive_source_exists"] = pdf["exists"]
    payload["archive_source_status"] = pdf["status"]
    payload["output_dir"] = str(doc_dir) if rooted else r.output_dir
    payload["archive_dir"] = str(doc_dir)
    payload["ingested_at"] = r.ingested_at
    if include_original_source:
        payload["original_source_path"] = r.source_path
    return payload
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def build_show_table(row, archive_root: Optional[Path | str] = None) -> Table:
    """Build the single-document detail table for the ``show`` command.

    Creates a two-column rich ``Table`` (``field`` / ``value``) and lets each
    ``_show_*_rows`` helper append its section in a fixed order. Nothing is
    printed here; the table is returned to the caller.
    """
    heading = f"Document #{row.id} · {row.doc_type or '?'} ({status_color(row.status)})"
    table = Table(title=heading)
    table.add_column("field", style="cyan", no_wrap=True)
    table.add_column("value", overflow="fold")

    details = row.details()
    _show_header_rows(table, row, details, archive_root)
    _show_amount_rows(table, row, details)
    _show_identity_rows(table, details)
    # The remaining sections read what they need from the row themselves.
    for section in (_show_seal_rows, _show_completeness_rows, _show_footer_rows):
        section(table, row)
    return table
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def _show_header_rows(
    table: Table,
    row,
    det: dict,
    archive_root: Optional[Path | str] = None,
) -> None:
    """Append the header section of the ``show`` table.

    Emits, in order: sha256, retained-PDF path (flagged in red when the file
    is missing), archive dir, local ingest time, optional error, then doc
    type / title / optional summary. Rows that carry a contract name or
    either party get the contract columns (parties, sign/expire date,
    auto-renewal); other rows get the parties and key dates from ``det``.
    """
    table.add_row("sha256", row.sha256)
    pdf = archive_source_status(row, archive_root)
    if not pdf["exists"]:
        table.add_row("[red]source_pdf[/red]", f"{pdf['path']} [red](留档文件丢失)[/red]")
    else:
        table.add_row("source_pdf", pdf["path"])
    table.add_row("archive_dir", str(archived_doc_dir(row, archive_root)))
    table.add_row("ingested_at", local_time(row.ingested_at))
    # Processing durations are operational telemetry, not archive content,
    # so they are deliberately not shown here.
    if row.error_message:
        table.add_row("[red]error[/red]", row.error_message)

    table.add_row("", "")
    table.add_row("[bold]doc_type[/bold]", row.doc_type or "-")
    table.add_row("[bold]title[/bold]", row.title or row.contract_name or "-")
    if row.summary:
        table.add_row("summary", row.summary)

    if row.contract_name or row.party_a or row.party_b:
        # Contract rows: dedicated columns for parties, dates and renewal.
        table.add_row("", "")
        for column in ("party_a", "party_b", "sign_date", "expire_date"):
            table.add_row(column, getattr(row, column) or "-")
        renewal = row.auto_renewal
        if renewal == 1:
            renewal_text = "是"
        elif renewal == 0:
            renewal_text = "否"
        else:
            renewal_text = "-"
        table.add_row("auto_renewal", renewal_text)
        return

    # Other document types: parties and dates come from the details dict.
    names = det.get("parties") or []
    if names:
        table.add_row("主体", "\n".join(f"• {name}" for name in names))
    dates = det.get("key_dates") or []
    if dates:
        table.add_row(
            "日期",
            "\n".join(
                f"• {entry.get('label', '')}: {entry.get('date') or '-'}"
                for entry in dates
            ),
        )
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def _show_amount_rows(table: Table, row, det: dict) -> None:
    """Append the amount-related rows of the ``show`` table.

    Covers the itemised amounts (or, when ``det`` has none, the row's own
    ``amount_text`` / ``amount_value``), the computed total, the estimated
    monthly property fee and the free-form ``fields`` list.
    """
    entries = det.get("amounts") or []
    if entries:
        rendered = []
        for item in entries:
            value = item.get("value")
            unit = item.get("unit")
            if not isinstance(value, (int, float)):
                shown = ""
            elif unit:
                # Unit-price item: show the value with its unit, no currency sign.
                shown = f"({value:g} {unit})"
            else:
                shown = f"(¥{value:,.2f})"
            flags = ""
            if item.get("is_total_component"):
                flags = " [cyan]✓计入合计[/cyan]"
            if item.get("is_installment"):
                flags += " [magenta]分期[/magenta]"
            rendered.append(
                f"• {item.get('label', '')}: {item.get('text', '')}{shown}{period_str(item)}{flags}"
            )
            quote = item.get("evidence") or ""
            if quote:
                rendered.append(f" [dim]↳ 出处:{quote}[/dim]")
        table.add_row("金额", "\n".join(rendered))
    elif row.amount_text:
        # No itemised amounts in details: fall back to the row's main amount.
        cell = row.amount_text
        if row.amount_value is not None:
            cell = f"{row.amount_text} (¥{row.amount_value:,.2f})"
        table.add_row("金额", cell)

    computed = det.get("computed_total_value")
    if isinstance(computed, (int, float)):
        table.add_row(
            "[bold]合计(计算)[/bold]",
            f"[cyan]¥{computed:,.2f}[/cyan] [dim](上方标✓项之和,非抽取值)[/dim]",
        )

    monthly_fee = det.get("monthly_property_fee_value")
    if isinstance(monthly_fee, (int, float)):
        fee_note = det.get("monthly_property_fee_text") or ""
        suffix = f" [dim]({fee_note})[/dim]" if fee_note else ""
        table.add_row(
            "[bold]月物业费(估算)[/bold]",
            f"[cyan]¥{monthly_fee:,.2f}/月[/cyan]{suffix}",
        )

    extra_fields = det.get("fields") or []
    if extra_fields:
        table.add_row(
            "字段",
            "\n".join(
                f"• {entry.get('label', '')}: {entry.get('value', '')}"
                for entry in extra_fields
            ),
        )
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def _show_identity_rows(table: Table, det: dict) -> None:
    """Append the per-person identity row of the ``show`` table.

    Reads ``person_identities`` from ``det``. Each person contributes a bold
    name line (with the role dimmed, when present) followed by one line per
    identifier. Nothing is added when the list is empty.
    """
    people = det.get("person_identities") or []
    if not people:
        return
    out = []
    for person in people:
        role = person.get("role")
        suffix = f" [dim]({role})[/dim]" if role else ""
        out.append(f"[bold]{person.get('name', '?')}[/bold]" + suffix)
        out.extend(
            f" • {ident.get('label', '')}: {ident.get('value', '')}"
            for ident in person.get("identifiers") or []
        )
    table.add_row("身份标识", "\n".join(out))
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def _show_seal_rows(table: Table, row) -> None:
    """Append the seal row and the sub-agreement row of the ``show`` table.

    Both lists are read from a fresh ``row.details()`` call. A seal line
    shows the owner (``?`` when absent), the seal type when present, and the
    raw seal text dimmed. A sub-agreement contributes its title, optional
    sign date and summary, then either its own seals or a dimmed "no seal"
    line.
    """
    seal_lines = []
    for seal in row.details().get("seals") or []:
        label = seal.get("owner") or "?"
        kind = seal.get("seal_type")
        if kind:
            label += f" · {kind}"
        inscription = seal.get("raw_text") or ""
        if inscription:
            seal_lines.append(f"• {label} [dim]{inscription}[/dim]")
        else:
            seal_lines.append(f"• {label}")
    if seal_lines:
        table.add_row("[bold]印章[/bold]", "\n".join(seal_lines))

    annex_lines = []
    for annex in row.details().get("sub_agreements") or []:
        heading = f"[bold]{annex.get('title') or '附属协议'}[/bold]"
        signed = annex.get("sign_date")
        if signed:
            heading += f" [dim]{signed}[/dim]"
        annex_lines.append(heading)
        digest = annex.get("summary")
        if digest:
            annex_lines.append(f" {digest}")
        annex_seals = annex.get("seals") or []
        if not annex_seals:
            annex_lines.append(" [dim]印章: 无[/dim]")
            continue
        for seal in annex_seals:
            holder = seal.get("owner") or "?"
            kind = seal.get("seal_type")
            annex_lines.append(f" 印章: {holder}" + (f" · {kind}" if kind else ""))
    if annex_lines:
        table.add_row("[bold]补充协议[/bold]", "\n".join(annex_lines))
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def _show_completeness_rows(table: Table, row) -> None:
    """Append the completeness-check row of the ``show`` table.

    Reads ``completeness`` from ``row.details()`` and adds nothing when it is
    absent. ``complete`` gives a green confirmation; ``incomplete`` gives a
    red warning followed by one bullet per issue (category label, item,
    optional detail and evidence); any other status is reported as
    undetermined.
    """
    report = row.details().get("completeness")
    if not report:
        return
    verdict = report.get("status")
    problems = report.get("issues") or []

    if verdict == "complete":
        table.add_row("[bold]完整性[/bold]", "[green]✓ 要素与签章齐全[/green]")
        return
    if verdict != "incomplete":
        # Everything else is treated as "unknown".
        table.add_row("[bold]完整性[/bold]", "[yellow]? 信息不足,未能判定[/yellow]")
        return

    category_names = {"signature": "签章", "amount": "金额", "field": "要素"}
    out = ["[red]⚠ 疑似不完整[/red] [dim](签章经落款页核查;要素/金额据原文,可翻回核对)[/dim]"]
    for problem in problems:
        # Unrecognised categories are shown under the generic "field" label.
        category = category_names.get(problem.get("category"), "要素")
        note = problem.get("detail") or ""
        suffix = f" — [dim]{note}[/dim]" if note else ""
        out.append(f"• [{category}] {problem.get('item', '')}{suffix}")
        quote = problem.get("evidence") or ""
        if quote:
            out.append(f" [dim]↳ 出处:{quote}[/dim]")
    table.add_row("[bold]完整性[/bold]", "\n".join(out))
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
def _show_footer_rows(table: Table, row) -> None:
    """Append the closing rows of the ``show`` table.

    In order: identity mismatches (``identity_issues`` in the details),
    ``llm_model``, ``overall_confidence``, obligations grouped by actor
    (``party_a``, ``party_b``, ``both``), and the risk clauses.
    """
    mismatches = row.details().get("identity_issues") or []
    if mismatches:
        out = ["[red]⚠ 与基准库不一致[/red] [dim](known_parties 跨文档核对,请人工确认)[/dim]"]
        for problem in mismatches:
            note = problem.get("detail") or ""
            suffix = f" — [dim]{note}[/dim]" if note else ""
            out.append(f"• {problem.get('item', '')}{suffix}")
            quote = problem.get("evidence") or ""
            if quote:
                out.append(f" [dim]↳ {quote}[/dim]")
        table.add_row("[bold]身份核对[/bold]", "\n".join(out))

    model_name = row.details().get("llm_model")
    table.add_row("llm_model", model_name or "[dim]- (旧抽取未记录,重抽后显示)[/dim]")

    confidence = "-"
    if row.overall_confidence is not None:
        confidence = f"{row.overall_confidence:.2f}"
    table.add_row("overall_confidence", confidence)

    if row.obligations:
        table.add_row("", "")
        actor_headings = (
            ("party_a", "[bold]甲方动作[/bold]"),
            ("party_b", "[bold]乙方动作[/bold]"),
            ("both", "[bold]双方动作[/bold]"),
        )
        for actor, heading in actor_headings:
            duties = [
                f"• [{duty.deadline or '[dim]无日期[/dim]'}] {duty.action}"
                for duty in row.obligations
                if duty.actor == actor
            ]
            if duties:
                table.add_row(heading, "\n".join(duties))

    if row.risk_clauses:
        table.add_row(
            "[bold]risk_clauses[/bold]",
            "\n".join(f"• {clause}" for clause in row.risk_clauses),
        )
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
def build_list_table(rows, root) -> Table:
    """Build the archive listing table for the ``list`` command.

    Takes the rows to show and the archive root (used only in the title) and
    returns a rich ``Table``; nothing is printed here.
    """
    table = Table(
        title=f"Archive · {root} ({len(rows)} of total)",
        caption="amount 带 * 为计算合计(如收入证明=年税前+股权),无 * 为抽取的主金额",
        caption_justify="left",
    )
    column_specs = (
        ("id", {"style": "cyan", "justify": "right"}),
        ("status", {}),
        ("type", {"style": "magenta"}),
        ("完整", {}),  # contract completeness mark
        ("title", {"overflow": "fold"}),
        ("主体", {"overflow": "fold"}),  # tells similar documents apart
        ("date", {}),
        ("amount", {"justify": "right"}),
        ("ingested", {"style": "dim"}),
    )
    for header, options in column_specs:
        table.add_column(header, **options)

    for doc in rows:
        cells = [
            str(doc.id),
            status_color(doc.status),
            doc.doc_type or "-",
            completeness_mark(doc),
            doc.title or doc.contract_name or "-",
            subject_of(doc),
            doc.primary_date or "-",
            display_amount(doc),
            # Keep only the date part of the formatted ingest time.
            local_time(doc.ingested_at)[:10],
        ]
        table.add_row(*cells)
    return table
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
# ---------- raw command: source-text highlighting ----------
#
# Terminal colouring is done with ANSI escape codes. Whether colour is used
# at all is decided by the caller; this section only turns data into coloured
# strings, which keeps it easy to unit-test. Terms are coloured by the
# category they were extracted as.

_HL_RESET = "\033[0m"
_HL_STYLES: dict[str, str] = dict(
    party="\033[1;36m",   # bold cyan: parties / subjects / seal owners
    amount="\033[1;33m",  # bold yellow: amounts
    date="\033[1;34m",    # bold blue: dates
    risk="\033[1;31m",    # bold red: risk clauses
    field="\033[1;35m",   # bold magenta: other field values / evidence / seal text
)
# (category key, legend label) pairs in the order the legend lists them.
_HL_LABELS = [
    ("party", "当事人"),
    ("amount", "金额"),
    ("date", "日期"),
    ("risk", "风险"),
    ("field", "字段"),
]
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
def extracted_terms(row) -> dict[str, str]:
    """Collect extracted strings worth highlighting, mapped to a category key.

    Values are gathered from the row's contract columns, its risk clauses and
    obligation evidence, and from the ``parties`` / ``amounts`` /
    ``key_dates`` / ``fields`` / ``seals`` entries of ``row.details()``.
    Only ``str`` values are kept; each is stripped, and anything shorter than
    two characters is dropped so single characters do not match everywhere.
    When the same string is seen more than once, the last category wins.
    """

    def candidates():
        # Contract-specific top-level columns.
        yield row.contract_name, "field"
        yield row.party_a, "party"
        yield row.party_b, "party"
        yield row.amount_text, "amount"
        yield row.sign_date, "date"
        yield row.expire_date, "date"
        for clause in row.risk_clauses:
            yield clause, "risk"
        for duty in row.obligations:
            yield duty.evidence, "field"
        # Generic entries from the details dict.
        det = row.details()
        for name in det.get("parties") or []:
            yield name, "party"
        for amount in det.get("amounts") or []:
            yield amount.get("text"), "amount"
            yield amount.get("evidence"), "amount"
        for entry in det.get("key_dates") or []:
            yield entry.get("date"), "date"
        for entry in det.get("fields") or []:
            yield entry.get("value"), "field"
        for seal in det.get("seals") or []:
            yield seal.get("owner"), "party"
            yield seal.get("raw_text"), "field"

    found: dict[str, str] = {}
    for raw, category in candidates():
        if not isinstance(raw, str):
            continue
        cleaned = raw.strip()
        if len(cleaned) < 2:
            continue
        found[cleaned] = category
    return found
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
def render_highlighted(text: str, terms: dict[str, str]) -> str:
    """Wrap every occurrence of an extracted term in ANSI colour codes.

    Args:
        text: The source text to colour.
        terms: Mapping of term -> highlight category key. A category missing
            from the style table falls back to the ``field`` style.

    Returns:
        A new string with each hit wrapped in its style code and a reset
        code. ``text`` is returned unchanged when there is nothing to match.

    Terms are placed into one regex alternation, longest first, so a longer
    term wins over a shorter one starting at the same position, and matches
    never overlap. Empty-string terms are skipped: an empty alternative
    matches at every position and would scatter escape codes between all
    characters.
    """
    if not terms:
        return text
    needles = sorted((term for term in terms if term), key=len, reverse=True)
    if not needles:
        return text
    pattern = re.compile("|".join(re.escape(term) for term in needles))

    def paint(match) -> str:
        hit = match.group()
        style = _HL_STYLES.get(terms.get(hit, "field"), _HL_STYLES["field"])
        return f"{style}{hit}{_HL_RESET}"

    return pattern.sub(paint, text)
|
|
542
|
+
|
|
543
|
+
|
|
544
|
+
def color_legend(terms: dict[str, str]) -> str:
    """Return a one-line ANSI legend for the categories present in ``terms``.

    Every category that appears as a value in ``terms`` gets a coloured
    swatch and its label, in the fixed legend order. Returns an empty string
    when no known category is present.
    """
    active = set(terms.values())
    swatches = []
    for key, label in _HL_LABELS:
        if key in active:
            swatches.append(f"{_HL_STYLES[key]}■{label}{_HL_RESET}")
    if not swatches:
        return ""
    return "图例 " + " ".join(swatches)
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
def build_search_table(rows) -> Table:
    """Build the hit list table for the ``search`` command.

    Returns a rich ``Table`` with one line per row; nothing is printed here.
    """
    table = Table(title=f"Search · {len(rows)} hit(s)")
    column_specs = (
        ("id", {"style": "cyan", "justify": "right"}),
        ("name", {"overflow": "fold"}),
        ("party_a", {"overflow": "fold"}),
        ("party_b", {"overflow": "fold"}),
        ("amount", {"justify": "right"}),
        ("sign_date", {}),
        ("expire_date", {}),
        ("risks", {"justify": "right"}),
    )
    for header, options in column_specs:
        table.add_column(header, **options)

    for hit in rows:
        if hit.amount_value is None:
            amount_cell = "-"
        else:
            amount_cell = f"¥{hit.amount_value:,.0f}"
        table.add_row(
            str(hit.id),
            hit.contract_name or "-",
            hit.party_a or "-",
            hit.party_b or "-",
            amount_cell,
            hit.sign_date or "-",
            hit.expire_date or "-",
            str(len(hit.risk_clauses)),  # number of risk clauses
        )
    return table
|