contract-archive-cli 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contract_archive/__init__.py +2 -0
- contract_archive/archive/__init__.py +64 -0
- contract_archive/archive/db.py +126 -0
- contract_archive/archive/ingest.py +667 -0
- contract_archive/archive/migrations/001_init.sql +62 -0
- contract_archive/archive/migrations/002_obligations.sql +25 -0
- contract_archive/archive/migrations/003_document_types.sql +31 -0
- contract_archive/archive/migrations/004_seals_subjects.sql +36 -0
- contract_archive/archive/migrations/005_completeness.sql +18 -0
- contract_archive/archive/party_registry.py +276 -0
- contract_archive/archive/paths.py +113 -0
- contract_archive/archive/repository.py +918 -0
- contract_archive/cli.py +455 -0
- contract_archive/cli_common.py +293 -0
- contract_archive/cli_config.py +96 -0
- contract_archive/cli_introspect.py +204 -0
- contract_archive/cli_party.py +166 -0
- contract_archive/cli_query.py +492 -0
- contract_archive/cli_render.py +575 -0
- contract_archive/config.py +257 -0
- contract_archive/errors.py +163 -0
- contract_archive/extraction/__init__.py +14 -0
- contract_archive/extraction/amount_check.py +87 -0
- contract_archive/extraction/contract_extractor.py +103 -0
- contract_archive/extraction/document_extractor.py +546 -0
- contract_archive/extraction/evidence_page_fix.py +99 -0
- contract_archive/extraction/llm_extractor.py +207 -0
- contract_archive/extraction/normalize.py +210 -0
- contract_archive/extraction/property_fee.py +79 -0
- contract_archive/extraction/vision_seal.py +390 -0
- contract_archive/pipelines/__init__.py +9 -0
- contract_archive/pipelines/mineru_pipeline.py +955 -0
- contract_archive/pipelines/vl_ocr.py +160 -0
- contract_archive/schemas/__init__.py +67 -0
- contract_archive/schemas/document.py +408 -0
- contract_archive/utils/__init__.py +27 -0
- contract_archive/utils/device.py +51 -0
- contract_archive/utils/http_env.py +54 -0
- contract_archive/utils/pdf.py +207 -0
- contract_archive_cli-0.2.7.dist-info/METADATA +386 -0
- contract_archive_cli-0.2.7.dist-info/RECORD +44 -0
- contract_archive_cli-0.2.7.dist-info/WHEEL +4 -0
- contract_archive_cli-0.2.7.dist-info/entry_points.txt +2 -0
- contract_archive_cli-0.2.7.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""
|
|
2
|
+
`party` 子命令:管理 known_parties 身份基准库(查看/录入/删除主体固有标识)。
|
|
3
|
+
|
|
4
|
+
独立文件——cli.py 已逼近 1000 行红线,不能再塞。known_parties.json 含真实 PII,
|
|
5
|
+
故本命令只在本地档案库读写,不提供导出/分享。基准的"首见入库"由 ingest 自动完成,
|
|
6
|
+
本命令组负责人工查看与修正:set 覆盖(纠正被 OCR 读错的首见基准)、rm 删除。
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json as _json
|
|
11
|
+
import sys
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Optional
|
|
14
|
+
|
|
15
|
+
import typer
|
|
16
|
+
from rich.table import Table
|
|
17
|
+
|
|
18
|
+
from .archive.party_registry import PartyRegistry, group_by_value
|
|
19
|
+
from .archive.paths import ArchivePaths, default_archive_root
|
|
20
|
+
# 复用 cli_common 的全局 console(理由同 cli_config):自建实例会让全局 --no-color 失效。
|
|
21
|
+
from .cli_common import OutputFormat, console, err_console
|
|
22
|
+
from .config import load_settings
|
|
23
|
+
|
|
24
|
+
# Typer sub-application that groups the `party` subcommands defined below.
# pretty_exceptions_show_locals=False: keeps tracebacks from dumping local
# variables (which may hold PII) to the terminal.
party_app = typer.Typer(
    help="管理 known_parties 身份基准库(主体固有标识的跨文档核对基准)",
    pretty_exceptions_show_locals=False,
    no_args_is_help=True,  # clig.dev: a bare `party` shows the help listing instead of "Missing command"
    context_settings={"help_option_names": ["-h", "--help"]},
)
|
|
31
|
+
|
|
32
|
+
# Shared `--archive` / `-a` option declaration, used as the default value of the
# `archive` parameter by every command in this module. Defaults to None, in which
# case _resolve_archive falls back to the configured / default archive root.
_archive_opt = typer.Option(
    None,
    "--archive",
    "-a",
    help="档案库根目录;不传则用 CONTRACT_ARCHIVE_DIR 或 XDG 默认",
)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _resolve_archive(archive_opt: Optional[Path]) -> ArchivePaths:
    """Build the ArchivePaths for the archive root this invocation should use.

    Precedence: the explicit ``--archive`` value, then ``load_settings().archive_dir``,
    then ``default_archive_root()``. The chosen root is user-expanded and resolved
    before being handed to ``ArchivePaths``.

    Kept as a local implementation (mirroring the one used by the main CLI) to
    avoid a circular import.
    """
    if archive_opt:
        # An explicit flag wins; settings are not even loaded in this case.
        return ArchivePaths(root=archive_opt.expanduser().resolve())

    from_settings = load_settings().archive_dir
    fallback = Path(from_settings) if from_settings else default_archive_root()
    return ArchivePaths(root=fallback.expanduser().resolve())
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _load_registry(archive_opt: Optional[Path]) -> PartyRegistry:
    """Load the PartyRegistry stored at the resolved archive's known_parties path."""
    registry_file = _resolve_archive(archive_opt).known_parties_path
    return PartyRegistry.load(registry_file)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@party_app.command("list")
def list_parties(
    archive: Optional[Path] = _archive_opt,
    fmt: OutputFormat = typer.Option(OutputFormat.table, "--format", help="table | json"),
) -> None:
    """列出基准库里所有主体及其固有标识。"""
    # The docstring above is the command's user-facing help text and is kept verbatim.
    # Summary: print every party in the registry with its identifiers, either as
    # raw JSON on stdout or as a Rich table.
    all_entries = _load_registry(archive).all_parties()

    if fmt is OutputFormat.json:
        # Machine-readable path: dump the registry mapping as-is (no folding),
        # even when it is empty. The data contains PII and only goes to local stdout.
        print(_json.dumps(all_entries, ensure_ascii=False, indent=2))
        return

    if not all_entries:
        # Table mode with nothing to show: hint goes to stderr, stdout stays empty.
        err_console.print("[yellow]known_parties 为空——入库文档后会自动录入首见标识。[/yellow]")
        return

    grid = Table(title=f"known_parties · {len(all_entries)} 个主体")
    grid.add_column("主体", style="cyan", no_wrap=True)
    grid.add_column("标识")
    grid.add_column("值", overflow="fold")
    grid.add_column("首见", style="dim")

    # group_by_value folds labels that share a value into one row; the party name
    # is printed only on the first row of each party to reduce repetition.
    for party_name, identifiers in all_entries.items():
        for position, (label, record) in enumerate(group_by_value(identifiers)):
            grid.add_row(
                party_name if position == 0 else "",
                label,
                record.get("value", ""),
                str(record.get("first_seen_doc", ""))[:12],  # first 12 chars only
            )
    console.print(grid)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@party_app.command("show")
def show_party(
    name: str = typer.Argument(..., help="主体名(姓名或机构全称)"),
    archive: Optional[Path] = _archive_opt,
    fmt: OutputFormat = typer.Option(OutputFormat.table, "--format", help="table | json"),
) -> None:
    """查看某主体的全部标识基准。"""
    # The docstring above is the command's user-facing help text and is kept verbatim.
    # Summary: show every identifier recorded for one party; exits with status 1
    # when the party has no entry.
    wants_json = fmt is OutputFormat.json
    identifiers = _load_registry(archive).get(name)

    if not identifiers:
        if wants_json:
            # JSON consumers get a not_found envelope on stdout rather than empty input.
            print(_json.dumps({"error": "not_found", "name": name}, ensure_ascii=False))
        else:
            err_console.print(f"[red]未找到主体: {name}[/red]")
        raise typer.Exit(1)

    if wants_json:
        print(_json.dumps(identifiers, ensure_ascii=False, indent=2))
        return

    # Labels sharing one value are folded into a single row, so the count shown
    # in the title is the number of folded rows, not the number of raw labels.
    folded = group_by_value(identifiers)
    grid = Table(title=f"{name} · {len(folded)} 项标识")
    grid.add_column("标识", style="cyan")
    grid.add_column("值", overflow="fold")
    grid.add_column("角色", style="dim")
    grid.add_column("首见出处", style="dim")
    for label, record in folded:
        grid.add_row(
            label,
            record.get("value", ""),
            record.get("role", ""),
            str(record.get("first_seen_doc", "")),
        )
    console.print(grid)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@party_app.command("set")
def set_party(
    name: str = typer.Argument(..., help="主体名"),
    label: str = typer.Argument(..., help="标识名,如 身份证号 / 电话 / 银行账号"),
    value: str = typer.Argument(..., help="标识值"),
    archive: Optional[Path] = _archive_opt,
) -> None:
    """手动录入/修正某主体的标识基准(覆盖既有值;用于纠正首见时被 OCR 读错的基准)。"""
    # The docstring above is the command's user-facing help text and is kept verbatim.
    # Summary: set one identifier for a party and persist the registry; a
    # ValueError from the registry is reported on stderr and exits with status 1.
    layout = _resolve_archive(archive)
    registry = PartyRegistry.load(layout.known_parties_path)
    try:
        registry.set(name, label, value)
    except ValueError as problem:
        err_console.print(f"[red]{problem}[/red]")
        raise typer.Exit(1)
    else:
        # Only persist when the in-memory update was accepted.
        registry.save()

    # State-change confirmations go to stderr so stdout stays reserved for data.
    err_console.print(f"[green]已设置[/green] {name}·{label} → {layout.known_parties_path}")
    err_console.print(
        "[yellow]注意:known_parties.json 明文存 PII,已设为仅本人可读(0600),请勿提交或分享。[/yellow]"
    )
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
@party_app.command("rm")
def rm_party(
    name: str = typer.Argument(..., help="主体名"),
    label: Optional[str] = typer.Argument(None, help="标识名;省略则删除该主体全部标识"),
    archive: Optional[Path] = _archive_opt,
    yes: bool = typer.Option(False, "--yes", "-y", help="跳过确认(删整个主体时需要)"),
) -> None:
    """删除某主体的某标识;不给 label 则删除整个主体。"""
    # The docstring above is the command's user-facing help text and is kept verbatim.
    # Summary: remove one identifier of a party, or the whole party when no label
    # is given. Exits with status 1 when nothing matched or when a whole-party
    # deletion is attempted non-interactively without --yes; exits 0 when the
    # user declines the confirmation prompt.
    paths = _resolve_archive(archive)
    reg = PartyRegistry.load(paths.known_parties_path)
    target = f"{name}·{label}" if label else name

    # Removing a whole party is the dangerous path, so it is guarded: outside a
    # TTY an explicit --yes is required, and inside a TTY the user must confirm.
    # Removing a single, explicitly named label stays lightweight.
    # The guard uses the same truthiness test as `target` above, so an
    # empty-string label (reported as a whole-party target) cannot slip past
    # the confirmation the way a strict `label is None` check would allow.
    if not label and not yes:
        if not sys.stdin.isatty():
            err_console.print(
                f"[red]拒绝在非交互环境删除整个主体 {name}:请加 --yes 确认[/red]"
            )
            raise typer.Exit(1)
        if not typer.confirm(f"删除主体 {name} 的全部标识基准?", default=False):
            err_console.print("[yellow]aborted[/yellow]")
            raise typer.Exit(0)

    if reg.remove(name, label):
        # Persist only when something was actually removed.
        reg.save()
        err_console.print(f"[green]已删除[/green] {target}")
    else:
        err_console.print(f"[red]未找到: {target}[/red]")
        raise typer.Exit(1)
|
|
@@ -0,0 +1,492 @@
|
|
|
1
|
+
"""
|
|
2
|
+
只读查询/展示命令:list / search / show / raw / stats / todo / seals。
|
|
3
|
+
|
|
4
|
+
这些命令不写库、不调用付费 API,是档案库的"读侧"。它们用 @app.command 挂到
|
|
5
|
+
cli_common 的主 app 上——import 本模块即触发注册(见 cli.py 的组装段)。
|
|
6
|
+
|
|
7
|
+
依赖方向:本模块只 import cli_common(基础设施)+ archive 读函数 + cli_render
|
|
8
|
+
(纯渲染),绝不回头 import cli(写命令模块),以保持 DAG、避免循环 import。
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json as _json
|
|
13
|
+
import sys
|
|
14
|
+
from dataclasses import asdict
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Optional
|
|
17
|
+
|
|
18
|
+
import typer
|
|
19
|
+
from rich.table import Table
|
|
20
|
+
|
|
21
|
+
from .archive import (
|
|
22
|
+
SearchFilter,
|
|
23
|
+
Stats,
|
|
24
|
+
collect_stats,
|
|
25
|
+
list_documents,
|
|
26
|
+
list_obligations,
|
|
27
|
+
list_seals,
|
|
28
|
+
load_document_text,
|
|
29
|
+
open_archive_db,
|
|
30
|
+
search_documents,
|
|
31
|
+
)
|
|
32
|
+
from .cli_common import (
|
|
33
|
+
Actor,
|
|
34
|
+
ColorWhen,
|
|
35
|
+
DocStatus,
|
|
36
|
+
DocType,
|
|
37
|
+
OrderBy,
|
|
38
|
+
OutputFormat,
|
|
39
|
+
_archive_empty,
|
|
40
|
+
_archive_opt,
|
|
41
|
+
_resolve_archive,
|
|
42
|
+
_resolve_ident,
|
|
43
|
+
app,
|
|
44
|
+
color_disabled,
|
|
45
|
+
console,
|
|
46
|
+
err_console,
|
|
47
|
+
not_found_json,
|
|
48
|
+
)
|
|
49
|
+
from .cli_render import (
|
|
50
|
+
build_list_table,
|
|
51
|
+
build_search_table,
|
|
52
|
+
build_show_table,
|
|
53
|
+
color_legend,
|
|
54
|
+
extracted_terms,
|
|
55
|
+
render_highlighted,
|
|
56
|
+
row_to_dict,
|
|
57
|
+
seal_rows_to_dict,
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
# ---------- list ----------
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@app.command("list")
def list_cmd(
    archive: Optional[Path] = _archive_opt,
    limit: int = typer.Option(50, "--limit", "-n", help="最多返回 N 条(按排序截断)"),
    order_by: OrderBy = typer.Option(
        OrderBy.ingested_at, "--order-by", help="排序字段"
    ),
    status: Optional[DocStatus] = typer.Option(
        None, "--status", help="过滤状态;默认全部"
    ),
    doc_type: Optional[DocType] = typer.Option(
        None, "--type", help="按文档类型过滤"
    ),
    incomplete: bool = typer.Option(
        False, "--incomplete", help="只列疑似不完整的合同(缺签章/缺要素)"
    ),
    fmt: OutputFormat = typer.Option(OutputFormat.table, "--format", help="table | json"),
) -> None:
    """列出档案库内已索引文档。"""
    # The docstring above is the command's user-facing help text and is kept verbatim.
    # Summary: list indexed documents with optional status/type/incomplete
    # filters, as a JSON array on stdout or as a Rich table.
    paths = _resolve_archive(archive)
    if _archive_empty(paths, fmt):
        # The shared helper has already handled the empty-archive case.
        return

    conn = open_archive_db(paths.db_path)
    try:
        rows = list_documents(
            conn,
            limit=limit,
            order_by=order_by.value,
            status=status.value if status else None,
            doc_type=doc_type.value if doc_type else None,
            incomplete=incomplete,
        )
    finally:
        # Close the connection even when the query raises.
        conn.close()

    if fmt is OutputFormat.json:
        print(_json.dumps([row_to_dict(r) for r in rows], ensure_ascii=False, indent=2))
        return

    console.print(build_list_table(rows, paths.root))
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# ---------- search ----------
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
@app.command()
def search(
    archive: Optional[Path] = _archive_opt,
    name: Optional[str] = typer.Option(None, "--name", help="合同名包含(LIKE)"),
    party: Optional[str] = typer.Option(
        None, "--party", help="甲方 OR 乙方包含(LIKE)"
    ),
    amount_min: Optional[float] = typer.Option(
        None, "--amount-min", help="金额下限(元)"
    ),
    amount_max: Optional[float] = typer.Option(
        None, "--amount-max", help="金额上限(元)"
    ),
    signed_after: Optional[str] = typer.Option(
        None, "--signed-after", help="签订日 ≥ YYYY-MM-DD"
    ),
    signed_before: Optional[str] = typer.Option(
        None, "--signed-before", help="签订日 ≤ YYYY-MM-DD"
    ),
    expire_before: Optional[str] = typer.Option(
        None, "--expire-before", help="到期日 ≤ YYYY-MM-DD(找快到期)"
    ),
    auto_renewal: Optional[bool] = typer.Option(
        None,
        "--auto-renewal/--no-auto-renewal",
        help="是否自动续约",
    ),
    has_risk: bool = typer.Option(False, "--has-risk", help="只显示有风险条款的"),
    deadline_before: Optional[str] = typer.Option(
        None,
        "--deadline-before",
        help="存在 deadline ≤ YYYY-MM-DD 的义务(找近期待办合同)",
    ),
    deadline_after: Optional[str] = typer.Option(
        None, "--deadline-after", help="存在 deadline ≥ YYYY-MM-DD 的义务"
    ),
    actor: Optional[Actor] = typer.Option(
        None, "--actor", help="义务 actor"
    ),
    status: Optional[DocStatus] = typer.Option(None, "--status", help="过滤状态"),
    has_seal: Optional[bool] = typer.Option(
        None, "--has-seal/--no-seal", help="有/无印章(默认不过滤);只想列章本身用 seals 命令"
    ),
    seal_owner: Optional[str] = typer.Option(
        None, "--seal-owner", help="盖章主体包含(LIKE)"
    ),
    seal_type: Optional[str] = typer.Option(
        None, "--seal-type", help="印章类型包含(LIKE),如 合同专用章/公章"
    ),
    subject: Optional[str] = typer.Option(
        None, "--subject", help="主体包含(LIKE),覆盖所有文档类型(含合同甲乙方)"
    ),
    limit: int = typer.Option(50, "--limit", "-n", help="最多返回 N 条(按排序截断)"),
    fmt: OutputFormat = typer.Option(OutputFormat.table, "--format", help="table | json"),
) -> None:
    """多字段 AND 过滤查询。"""
    # The docstring above is the command's user-facing help text and is kept verbatim.
    # Summary: translate the CLI options into a SearchFilter, run it, and print
    # the matches as a JSON array on stdout or as a Rich table.
    paths = _resolve_archive(archive)
    if _archive_empty(paths, fmt):
        # The shared helper has already handled the empty-archive case.
        return

    # Build the filter before opening the database so that a failure while
    # converting option values cannot leave a connection open.
    flt = SearchFilter(
        name=name,
        party=party,
        # Amounts are entered in yuan and passed on as integer cents.
        amount_min_cents=int(round(amount_min * 100)) if amount_min is not None else None,
        amount_max_cents=int(round(amount_max * 100)) if amount_max is not None else None,
        signed_after=signed_after,
        signed_before=signed_before,
        expire_before=expire_before,
        auto_renewal=auto_renewal,
        has_risk=has_risk,
        deadline_before=deadline_before,
        deadline_after=deadline_after,
        actor=actor.value if actor else None,
        status=status.value if status else None,
        has_seal=has_seal,
        seal_owner=seal_owner,
        seal_type=seal_type,
        subject=subject,
        limit=limit,
    )

    conn = open_archive_db(paths.db_path)
    try:
        rows = search_documents(conn, flt)
    finally:
        # Close the connection even when the query raises.
        conn.close()

    if fmt is OutputFormat.json:
        print(_json.dumps([row_to_dict(r) for r in rows], ensure_ascii=False, indent=2))
        return

    console.print(build_search_table(rows))
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
# ---------- show ----------
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
@app.command()
def show(
    ident: str = typer.Argument(..., help="档案 id (整数) 或 sha 前缀 (>=4 字符)"),
    archive: Optional[Path] = _archive_opt,
    fmt: OutputFormat = typer.Option(OutputFormat.table, "--format", help="table | json"),
) -> None:
    """显示单条档案详情。"""
    # The docstring above is the command's user-facing help text and is kept verbatim.
    # Summary: show the details of one archived document. A missing database
    # file or an unresolved identifier is an error (exit status 1): table mode
    # reports it on stderr, JSON mode calls not_found_json so that a downstream
    # `| jq` does not receive empty input.
    paths = _resolve_archive(archive)
    wants_json = fmt is OutputFormat.json

    if not paths.db_path.exists():
        if wants_json:
            not_found_json(ident)
        else:
            err_console.print(f"[yellow]archive empty: {paths.db_path}[/yellow]")
        raise typer.Exit(1)

    conn = open_archive_db(paths.db_path)
    try:
        row = _resolve_ident(conn, ident)
    finally:
        # Close the connection even when identifier resolution raises.
        conn.close()

    if not row:
        if wants_json:
            not_found_json(ident)
        else:
            err_console.print(f"[red]not found: {ident}[/red]")
        raise typer.Exit(1)

    if wants_json:
        print(_json.dumps(
            row_to_dict(row, archive_root=paths.root, include_original_source=False),
            ensure_ascii=False,
            indent=2,
        ))
        return

    console.print(build_show_table(row, paths.root))
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
# ---------- raw ----------
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
@app.command()
def raw(
    ident: str = typer.Argument(..., help="档案 id (整数) 或 sha 前缀 (>=4 字符)"),
    archive: Optional[Path] = _archive_opt,
    color: ColorWhen = typer.Option(
        ColorWhen.auto, "--color",
        help="auto=仅 TTY 上色(管道纯文本)| always(配 less -R)| never",
    ),
) -> None:
    """
    打印文档原文(MinerU OCR 输出的纯文本)到 stdout。

    与 show 互补:show 看 LLM 抽出的结构化字段,raw 看抽取所依据的原始文本——
    这正是抽取时喂给 LLM 的同一份内容(raw_text.txt,缺失则退回 markdown.md),
    用于核对抽取结果是否忠于原文。

    交互终端下默认按抽取来源给命中关键字着色(当事人/金额/日期/风险/字段),
    一眼看出哪些被 LLM 识别到;管道(非 TTY)时输出纯文本,不破坏 raw|grep / raw|less。
    """
    # The docstring above is the command's user-facing help text and is kept verbatim.
    # Summary: print a document's OCR text to stdout, optionally highlighting
    # the extracted terms. Every failure (no database file, unknown identifier,
    # no text) is reported on stderr and exits with status 1.
    paths = _resolve_archive(archive)
    if not paths.db_path.exists():
        err_console.print(f"[yellow]archive empty: {paths.db_path}[/yellow]")
        raise typer.Exit(1)

    conn = open_archive_db(paths.db_path)
    try:
        row = _resolve_ident(conn, ident)
    finally:
        # Close the connection even when identifier resolution raises.
        conn.close()

    if not row:
        err_console.print(f"[red]not found: {ident}[/red]")
        raise typer.Exit(1)

    # output_dir may be empty (e.g. a record whose ingestion failed). In that
    # case Path("") / "mineru" is the relative path "mineru" under the current
    # working directory, which could be an unrelated folder, so the loader is
    # skipped and the record is treated as having no text.
    mineru_dir = Path(row.output_dir or "") / "mineru"
    text = load_document_text(mineru_dir) if row.output_dir else ""
    if not text:
        err_console.print(
            f"[red]no OCR text for id={row.id} sha={row.short_sha}: {mineru_dir}[/red]"
        )
        raise typer.Exit(1)

    # Colouring decision: `always` forces it; `auto` colours only when stdout is
    # a TTY and colour has not been globally disabled; `never` turns it off.
    # Piped output therefore stays plain text, keeping raw|grep and raw|less intact.
    use_color = color is ColorWhen.always or (
        color is ColorWhen.auto and sys.stdout.isatty() and not color_disabled()
    )
    if not use_color:
        print(text)
        return

    terms = extracted_terms(row)
    # The legend explaining the colours goes to stderr (and only when stderr is
    # a TTY) so that stdout carries nothing but the document text.
    legend = color_legend(terms)
    if legend and sys.stderr.isatty():
        print(legend, file=sys.stderr)
    sys.stdout.write(render_highlighted(text, terms))
    if not text.endswith("\n"):
        # Ensure the output ends with a newline, matching the print() path.
        sys.stdout.write("\n")
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
# ---------- stats ----------
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
@app.command()
def stats(
    archive: Optional[Path] = _archive_opt,
    fmt: OutputFormat = typer.Option(OutputFormat.table, "--format", help="table | json"),
) -> None:
    """档案库统计:总数 / status 分布 / 按月签订分布 / 近 30 天到期数。"""
    # The docstring above is the command's user-facing help text and is kept verbatim.
    # Summary: print archive statistics as a JSON object on stdout or as a
    # Rich table.
    paths = _resolve_archive(archive)

    # A missing database is treated as an archive with zero documents: a
    # zero-valued Stats goes through the same rendering path, so the JSON
    # output is always an object rather than degrading to a different shape.
    if paths.db_path.exists():
        conn = open_archive_db(paths.db_path)
        try:
            s = collect_stats(conn)
        finally:
            # Close the connection even when collecting statistics raises.
            conn.close()
    else:
        s = Stats(
            total=0, by_status={}, by_sign_month={},
            new_this_month=0, expiring_within_30d=0,
        )

    if fmt is OutputFormat.json:
        print(_json.dumps(asdict(s), ensure_ascii=False, indent=2))
        return

    table = Table(title=f"Archive Stats · {paths.root}")
    table.add_column("metric", style="cyan")
    table.add_column("value")
    table.add_row("total", str(s.total))
    table.add_row(
        "by_status",
        # Sorted by status name; "-" when there are no entries.
        ", ".join(f"{k}={v}" for k, v in sorted(s.by_status.items())) or "-",
    )
    table.add_row("new_this_month", str(s.new_this_month))
    table.add_row("expiring_within_30d", str(s.expiring_within_30d))
    table.add_row(
        "by_sign_month",
        # One "month: count" line per entry, in the mapping's own order.
        "\n".join(f"{m}: {c}" for m, c in s.by_sign_month.items()) or "-",
    )
    console.print(table)
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
# ---------- todo ----------
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
@app.command()
def todo(
    archive: Optional[Path] = _archive_opt,
    actor: Optional[Actor] = typer.Option(
        None, "--actor", help="义务 actor"
    ),
    before: Optional[str] = typer.Option(
        None, "--before", help="deadline ≤ YYYY-MM-DD"
    ),
    after: Optional[str] = typer.Option(
        None, "--after", help="deadline ≥ YYYY-MM-DD"
    ),
    include_undated: bool = typer.Option(
        False, "--include-undated", help="同时显示无 deadline 的义务"
    ),
    within_days: Optional[int] = typer.Option(
        None,
        "--within-days",
        help="便捷选项:deadline 在今天到 N 天内(等价于 --after today --before today+N)",
    ),
    limit: int = typer.Option(50, "--limit", "-n", help="最多返回 N 条(按排序截断)"),
    fmt: OutputFormat = typer.Option(OutputFormat.table, "--format", help="table | json"),
) -> None:
    """
    跨合同列出待办义务("催办看板")。按 deadline 升序。

    用例:
      contract-archive todo --within-days 30 本月需要做的事
      contract-archive todo --actor party_b 乙方所有待办
      contract-archive todo --actor party_a --before 2026-12-31
      contract-archive todo --include-undated 含无日期的(如"签订当日支付定金")
    """
    # The docstring above is the command's user-facing help text.
    # Summary: list obligations across documents, filtered by actor and deadline
    # window, as a JSON array on stdout or as a Rich table.
    from datetime import date, timedelta

    if within_days is not None:
        # Read the clock once so both ends of the window are derived from the
        # same day, even when the command runs across midnight.
        today = date.today()
        # Explicit --before / --after values take precedence over the shortcut.
        before = before or (today + timedelta(days=within_days)).isoformat()
        after = after or today.isoformat()

    paths = _resolve_archive(archive)
    if _archive_empty(paths, fmt):
        # The shared helper has already handled the empty-archive case.
        return

    conn = open_archive_db(paths.db_path)
    try:
        items = list_obligations(
            conn,
            actor=actor.value if actor else None,
            before=before,
            after=after,
            include_undated=include_undated,
            limit=limit,
        )
    finally:
        # Close the connection even when the query raises.
        conn.close()

    if fmt is OutputFormat.json:
        print(
            _json.dumps(
                [
                    {
                        "doc_id": it.doc_id,
                        "contract_name": it.contract_name,
                        "actor": it.actor,
                        "action": it.action,
                        "deadline": it.deadline,
                        "evidence": it.evidence,
                    }
                    for it in items
                ],
                ensure_ascii=False,
                indent=2,
            )
        )
        return

    table = Table(title=f"Todo · {len(items)} obligation(s)")
    table.add_column("deadline", style="cyan")
    table.add_column("actor")
    table.add_column("action", overflow="fold")
    table.add_column("contract", overflow="fold", style="dim")
    table.add_column("doc", justify="right", style="dim")
    # Display labels for the known actor codes; unknown codes are shown as-is.
    actor_label = {"party_a": "甲方", "party_b": "乙方", "both": "双方"}
    for it in items:
        deadline = it.deadline or "[dim]无日期[/dim]"
        table.add_row(
            deadline,
            actor_label.get(it.actor, it.actor),
            it.action,
            it.contract_name or "-",
            f"#{it.doc_id}",
        )
    console.print(table)
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
# ---------- seals ----------
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
@app.command("seals")
def seals_cmd(
    archive: Optional[Path] = _archive_opt,
    owner: Optional[str] = typer.Option(
        None, "--owner", "--seal-owner", help="盖章主体包含(LIKE);与 search 的 --seal-owner 同义"
    ),
    seal_type: Optional[str] = typer.Option(
        None, "--type", "--seal-type",
        help="印章类型包含(LIKE),如 合同专用章/公章;与 search 的 --seal-type 同义",
    ),
    limit: int = typer.Option(200, "--limit", "-n", help="最多列 N 枚印章"),
    fmt: OutputFormat = typer.Option(OutputFormat.table, "--format", help="table | json"),
) -> None:
    """
    跨文档列印章:某主体有哪些章、各出现在哪些文档(按主体/类型聚合阅读)。

    与 search 互补:seals 列【印章】本身;要按印章条件筛【文档】(哪些合同盖了某章)
    用 `search --has-seal/--seal-owner/--seal-type`。

    用例:
      contract-archive seals 全部印章
      contract-archive seals --seal-owner 示例公司 某公司的章
      contract-archive seals --seal-type 合同专用章
    """
    # The docstring above is the command's user-facing help text.
    # Summary: list seals across documents, optionally filtered by owner and
    # seal type, as JSON on stdout or as a Rich table.
    paths = _resolve_archive(archive)
    if _archive_empty(paths, fmt):
        # The shared helper has already handled the empty-archive case.
        return

    conn = open_archive_db(paths.db_path)
    try:
        rows = list_seals(conn, owner=owner, seal_type=seal_type, limit=limit)
    finally:
        # Close the connection even when the query raises.
        conn.close()

    if fmt is OutputFormat.json:
        print(_json.dumps(seal_rows_to_dict(rows), ensure_ascii=False, indent=2))
        return

    table = Table(title=f"Seals · {len(rows)} 枚")
    table.add_column("owner", overflow="fold", style="magenta")
    table.add_column("type")
    table.add_column("raw_text", overflow="fold", style="dim")
    table.add_column("doc", overflow="fold")
    table.add_column("id", justify="right", style="dim")
    for r in rows:
        table.add_row(
            r.owner or "?",      # placeholder when the owner is missing
            r.seal_type or "-",
            r.raw_text,
            r.title or "-",
            f"#{r.doc_id}",
        )
    console.print(table)
|