contract-archive-cli 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contract_archive/__init__.py +2 -0
- contract_archive/archive/__init__.py +64 -0
- contract_archive/archive/db.py +126 -0
- contract_archive/archive/ingest.py +667 -0
- contract_archive/archive/migrations/001_init.sql +62 -0
- contract_archive/archive/migrations/002_obligations.sql +25 -0
- contract_archive/archive/migrations/003_document_types.sql +31 -0
- contract_archive/archive/migrations/004_seals_subjects.sql +36 -0
- contract_archive/archive/migrations/005_completeness.sql +18 -0
- contract_archive/archive/party_registry.py +276 -0
- contract_archive/archive/paths.py +113 -0
- contract_archive/archive/repository.py +918 -0
- contract_archive/cli.py +455 -0
- contract_archive/cli_common.py +293 -0
- contract_archive/cli_config.py +96 -0
- contract_archive/cli_introspect.py +204 -0
- contract_archive/cli_party.py +166 -0
- contract_archive/cli_query.py +492 -0
- contract_archive/cli_render.py +575 -0
- contract_archive/config.py +257 -0
- contract_archive/errors.py +163 -0
- contract_archive/extraction/__init__.py +14 -0
- contract_archive/extraction/amount_check.py +87 -0
- contract_archive/extraction/contract_extractor.py +103 -0
- contract_archive/extraction/document_extractor.py +546 -0
- contract_archive/extraction/evidence_page_fix.py +99 -0
- contract_archive/extraction/llm_extractor.py +207 -0
- contract_archive/extraction/normalize.py +210 -0
- contract_archive/extraction/property_fee.py +79 -0
- contract_archive/extraction/vision_seal.py +390 -0
- contract_archive/pipelines/__init__.py +9 -0
- contract_archive/pipelines/mineru_pipeline.py +955 -0
- contract_archive/pipelines/vl_ocr.py +160 -0
- contract_archive/schemas/__init__.py +67 -0
- contract_archive/schemas/document.py +408 -0
- contract_archive/utils/__init__.py +27 -0
- contract_archive/utils/device.py +51 -0
- contract_archive/utils/http_env.py +54 -0
- contract_archive/utils/pdf.py +207 -0
- contract_archive_cli-0.2.7.dist-info/METADATA +386 -0
- contract_archive_cli-0.2.7.dist-info/RECORD +44 -0
- contract_archive_cli-0.2.7.dist-info/WHEEL +4 -0
- contract_archive_cli-0.2.7.dist-info/entry_points.txt +2 -0
- contract_archive_cli-0.2.7.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,390 @@
|
|
|
1
|
+
"""
|
|
2
|
+
多模态签章核查:对落款页图像调 qwen-vl,确证每个落款区甲/乙方的盖章/签字有无。
|
|
3
|
+
|
|
4
|
+
为什么要看图:MinerU 把落款签章区当 image 抠出,手写签字和红章都没被 OCR 成文字
|
|
5
|
+
(layout 也无 signature/stamp 类型)——纯文本判签章既会误报(签了但读不到)又会漏判。
|
|
6
|
+
只有看图才能确证。文本抽取负责要素核查,签章核查交这里。
|
|
7
|
+
|
|
8
|
+
降级:无落款页图 / 无 key / VL 调用失败时,调用方保留原文本签章判断(不破坏 --no-llm)。
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import base64
|
|
13
|
+
import json
|
|
14
|
+
import logging
|
|
15
|
+
import re
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Optional
|
|
18
|
+
|
|
19
|
+
from ..config import get_timeout_s, load_settings
|
|
20
|
+
from ..schemas import (
|
|
21
|
+
Completeness,
|
|
22
|
+
CompletenessIssue,
|
|
23
|
+
DocumentExtraction,
|
|
24
|
+
LabeledValue,
|
|
25
|
+
PersonIdentity,
|
|
26
|
+
)
|
|
27
|
+
from .llm_extractor import _parse_json_loose
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Prompt sent to the vision-language model together with the signature-page
# images. It asks for JSON only: one "unit" per signing area, each with a
# per-party seal / handwritten-signature verdict plus whatever can be read off
# the seal face. Both party templates list the same keys, including "seal_no",
# so the model is asked for the seal number of every party (_seal_number()
# prefers that field over digits scraped from seal_text).
VL_PROMPT = """你是严谨的合同签章核查员。下面是合同的落款/签署页图像,请逐个落款区核查
每一方的签署情况,只看图、据实判断。

定义:
- 盖章(seal):该方位置有红色印章图案。
- 签字(signature):该方位置有手写笔迹姓名。
- 空白:该方位置既无红章也无手写签字。

只输出 JSON,不要解释、不要 markdown 代码块:
{
"units": [
{
"agreement": "落款所属协议(如 主协议 / 补充协议)",
"page": 该落款区所在页码数字(看图前的【第X页】标注,必须填),
"parties": [
{"role": "甲方", "has_seal": true_or_false, "has_signature": true_or_false, "seal_owner": "章上识别到的主体全称(无章填 null)", "seal_text": "章上完整文字(无章填 null)", "seal_no": "章上的编号数字串,如 33010000000001(读不到填 null)", "signature_name": "手写签字处的姓名(无签字填 null)", "note": "说明"},
{"role": "乙方", "has_seal": true_or_false, "has_signature": true_or_false, "seal_owner": "...", "seal_text": "...", "seal_no": "...", "signature_name": "...", "note": "..."}
]
}
]
}

要点:
- 一份文档可能有多个落款区,不同页通常是不同协议(主协议、补充协议)的落款。
- 每张图前有【第X页】标注,每个 unit 的 page 必须填它所在的那一页,便于追溯出处。
- 红章可能较淡或被文字压住,仔细看;拿不准 has_seal 填 false 并在 note 里说明。
- 有红章时尽量读出章面文字:主体全称填 seal_owner,章类型与全文填 seal_text,
**章上的编号数字串单独填 seal_no(如 33010000000001)**——编号是跨合同核对的关键。
章面模糊就读多少填多少,**禁止编造编号**;无章则三者填 null。
- 手写签字哪怕潦草也算 has_signature=true,并尽量辨识姓名填 signature_name;无签字填 null。
- 只核查"甲方(签章)""乙方(签章)"这类落款签署位,不要把正文印章/骑缝章当落款。
"""
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# Marker words for signature pages: they show up in signing blocks and rarely
# in body text. "盖章" is deliberately left out because body text often says
# "加盖公章", which would flag whole pages by mistake. "签章" covers
# "甲方(签章)"-style blocks; "委托代理人" / "经办人" cover subscription-type
# contracts whose signing blocks are labelled "买受人" / "出卖人" instead.
_SIGN_PAGE_MARKERS = ("签章", "委托代理人", "经办人")


def locate_signature_pages(mineru_dir: Path, max_pages: int = 4) -> list[Path]:
    """Locate preview images of pages that look like signature pages.

    Reads one ``_mineru_raw/*/auto/*_content_list.json`` under *mineru_dir*
    (the first in sorted order), collects the ``page_idx`` of every item whose
    ``text`` contains one of ``_SIGN_PAGE_MARKERS``, and maps the lowest
    *max_pages* indexes to ``preview_images/page_NNN.png`` where NNN is
    ``page_idx + 1`` zero-padded to three digits.

    :param mineru_dir: directory holding ``preview_images`` and ``_mineru_raw``.
    :param max_pages: upper bound on the number of page indexes considered.
    :return: existing preview image paths in ascending page order; ``[]`` when
        the preview directory or content list is missing, the content list
        cannot be read or decoded, or it is not a JSON array.
    """
    preview = mineru_dir / "preview_images"
    if not preview.is_dir():
        return []
    # glob() order depends on the filesystem; sort so the pick is deterministic.
    content_lists = sorted(mineru_dir.glob("_mineru_raw/*/auto/*_content_list.json"))
    if not content_lists:
        return []
    try:
        items = json.loads(content_lists[0].read_text(encoding="utf-8"))
    except (ValueError, OSError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        logger.warning("读取 content_list 失败: %s", e)
        return []
    if not isinstance(items, list):
        logger.warning("content_list is not a JSON array: %s", content_lists[0])
        return []

    hits: set[int] = set()
    for it in items:
        if not isinstance(it, dict):
            continue
        idx = it.get("page_idx")
        # Only real, non-negative integers can be sorted together and
        # formatted with ":03d"; bool is excluded although it subclasses int.
        if not isinstance(idx, int) or isinstance(idx, bool) or idx < 0:
            continue
        if any(m in str(it.get("text", "")) for m in _SIGN_PAGE_MARKERS):
            hits.add(idx)

    out: list[Path] = []
    for idx in sorted(hits)[:max_pages]:
        # page_idx is zero-based, preview file names are one-based.
        img = preview / f"page_{idx + 1:03d}.png"
        if img.exists():
            out.append(img)
    return out
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _encode_image(path: Path) -> str:
|
|
100
|
+
"""本地图 → data URI。OpenAI 兼容接口不收 file://,用 base64 内联。"""
|
|
101
|
+
data = base64.b64encode(path.read_bytes()).decode("ascii")
|
|
102
|
+
return f"data:image/png;base64,{data}"
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _call_vl(
    image_paths: list[Path], model: str, api_key: str, base_url: str
) -> Optional[str]:
    """Ask the multimodal model to inspect the signature-page images.

    The request goes through the OpenAI-compatible endpoint obtained by
    replacing ``/api/v1`` in *base_url* with ``/compatible-mode/v1``. The
    message holds the prompt, then for each image a page label followed by the
    image as a base64 data URI, then a closing instruction.

    Every step that can fail sits inside one ``try``: importing the SDK,
    reading and encoding the images, the HTTP call, and unpacking the
    response. Any failure is logged and turned into ``None`` so the caller can
    fall back instead of aborting ingestion.

    :param image_paths: preview images named ``page_NNN.png``.
    :param model: model name passed to the chat completion call.
    :param api_key: API key for the endpoint.
    :param base_url: native base URL; rewritten to the compatible-mode URL.
    :return: the raw message content, or ``None`` on any failure.
    """
    try:
        # Imported lazily, and inside the try, so a missing SDK degrades the
        # same way as a failed call.
        from openai import OpenAI

        compat_url = base_url.replace("/api/v1", "/compatible-mode/v1")
        content: list[dict] = [{"type": "text", "text": VL_PROMPT}]
        for p in image_paths:
            # The page label precedes each image so the model can report
            # which page a signing area was found on.
            content.append({"type": "text", "text": f"【第 {_page_no(p)} 页】"})
            content.append({"type": "image_url", "image_url": {"url": _encode_image(p)}})
        content.append({"type": "text", "text": "请逐页核查落款签章,按要求输出 JSON(每个 unit 回填 page)。"})

        # Explicit timeout (300s unless DASHSCOPE_TIMEOUT_S overrides it):
        # several inlined page images make a large request that can hang.
        client = OpenAI(
            api_key=api_key, base_url=compat_url,
            timeout=get_timeout_s("DASHSCOPE_TIMEOUT_S", 300.0),
        )
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": content}],
            temperature=0.1,
        )
        return resp.choices[0].message.content
    except Exception as e:  # noqa: BLE001 - external call: degrade on any error
        logger.exception("VL 签章核查调用失败: %s", e)
        return None
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _page_no(image: Path) -> str:
|
|
141
|
+
"""从 preview 文件名 page_NNN.png 提取页码(去前导零)。"""
|
|
142
|
+
return image.stem.replace("page_", "").lstrip("0") or "0"
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _issues_from_vision(parsed: dict, fallback_evidence: str = "") -> list[CompletenessIssue]:
|
|
146
|
+
"""
|
|
147
|
+
VL 结果 → 签章缺陷 issues:某方既无章又无签字即为缺,只列缺的。
|
|
148
|
+
|
|
149
|
+
出处优先用该 unit 自己回填的 page(各落款区各自归属的页,如主协议→第8页、
|
|
150
|
+
补充协议→第9页);unit 没给 page 才退回 fallback(所有落款页的笼统出处)。
|
|
151
|
+
"""
|
|
152
|
+
issues: list[CompletenessIssue] = []
|
|
153
|
+
for unit in parsed.get("units") or []:
|
|
154
|
+
if not isinstance(unit, dict):
|
|
155
|
+
continue
|
|
156
|
+
agreement = str(unit.get("agreement") or "协议").strip()
|
|
157
|
+
page = str(unit.get("page") or "").strip()
|
|
158
|
+
evidence = f"据落款页图:第 {page} 页" if page else fallback_evidence
|
|
159
|
+
for party in unit.get("parties") or []:
|
|
160
|
+
if not isinstance(party, dict):
|
|
161
|
+
continue
|
|
162
|
+
role = str(party.get("role") or "").strip()
|
|
163
|
+
if not role:
|
|
164
|
+
continue
|
|
165
|
+
if not bool(party.get("has_seal")) and not bool(party.get("has_signature")):
|
|
166
|
+
issues.append(CompletenessIssue(
|
|
167
|
+
item=f"{agreement}·{role}签章",
|
|
168
|
+
category="signature",
|
|
169
|
+
detail="落款页图像显示该处空白,无红章也无手写签字",
|
|
170
|
+
evidence=evidence,
|
|
171
|
+
))
|
|
172
|
+
return issues
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _signature_evidence(images: list[Path]) -> str:
|
|
176
|
+
"""所有落款页的笼统出处(VL 未回填 unit.page 时的兜底)。"""
|
|
177
|
+
return f"据落款页图:第 {'、'.join(_page_no(p) for p in images)} 页"
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def read_seals_on_images(
    images: list[Path],
    model: str | None = None,
    api_key: str | None = None,
    base_url: str | None = None,
) -> Optional[dict]:
    """Run the VL model on signature-page images and return the parsed result.

    The result is the full structure requested by the prompt:
    ``units[].parties[]`` with ``has_seal`` / ``has_signature`` and the seal
    readings. Both ``check_seals_on_images`` and
    ``augment_completeness_with_vision`` build on this function, so the model
    is called once per document.

    :param images: signature-page preview images.
    :param model: VL model override; ``None`` uses ``settings.dashscope_vl_model``.
    :param api_key: API key override; ``None`` uses ``settings.dashscope_api_key``.
    :param base_url: base URL override; ``None`` uses ``settings.dashscope_base_url``.
    :return: the parsed dict (``{"units": []}`` when *images* is empty), or
        ``None`` when there is no API key, the call failed, or the response
        is not a usable JSON object. ``None`` tells the caller to fall back.
    """
    if not images:
        return {"units": []}
    settings = load_settings()
    model = model or settings.dashscope_vl_model
    api_key = api_key or settings.dashscope_api_key
    base_url = base_url or settings.dashscope_base_url
    if not api_key:
        logger.warning("DASHSCOPE_API_KEY missing; skip VL seal check")
        return None
    text = _call_vl(images, model, api_key, base_url)
    if not text:
        return None
    parsed = _parse_json_loose(text)
    if not parsed:
        logger.warning("VL 签章响应无法解析为 JSON: %s", text[:200])
        return None
    if not isinstance(parsed, dict):
        # Callers use parsed.get("units"); a top-level list or scalar would
        # crash them, so treat it as an unusable response.
        logger.warning("VL seal response is not a JSON object: %s", text[:200])
        return None
    return parsed
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def check_seals_on_images(
    images: list[Path],
    model: str | None = None,
    api_key: str | None = None,
    base_url: str | None = None,
) -> Optional[list[CompletenessIssue]]:
    """Check seals and signatures on the given images; return only the gaps.

    Thin wrapper over ``read_seals_on_images``. Per the original author's
    note it exists so evaluations can compare different VL models.

    :return: list of missing-signature issues (``[]`` means none were found),
        or ``None`` when ``read_seals_on_images`` returned ``None`` so the
        caller can fall back to the text-based verdict.
    """
    vision = read_seals_on_images(images, model, api_key, base_url)
    return None if vision is None else _issues_from_vision(vision, _signature_evidence(images))
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
_SEAL_NO_RE = re.compile(r"\d{6,}") # 章编号:连续 6+ 位数字
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def _seal_number(party: dict) -> str:
|
|
240
|
+
"""
|
|
241
|
+
章号核对值:优先用 VL 给的 seal_no,否则从 seal_text 提取最长数字串。
|
|
242
|
+
|
|
243
|
+
为何用章号而非章面主体名/全文:章号是章上刻死的编号,比 VL 看红章读出的主体名稳定
|
|
244
|
+
(实测同一枚章主体名会被读成"浙典"/"浙奥",但编号一致)。用编号核对避开这种误差。
|
|
245
|
+
"""
|
|
246
|
+
no = str(party.get("seal_no") or "").strip()
|
|
247
|
+
if no:
|
|
248
|
+
return no
|
|
249
|
+
nums = _SEAL_NO_RE.findall(str(party.get("seal_text") or ""))
|
|
250
|
+
return max(nums, key=len) if nums else ""
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
# 落款 role ↔ 头部 role 同义归组:VL 按 prompt 输出"甲方/乙方",但头部主体的 role
|
|
254
|
+
# 可能是认购协议的"出卖人/买受人"、租赁的"出租方/承租方"等。按"甲方阵营/乙方阵营"
|
|
255
|
+
# 归组匹配,章号才能绑到正确头部主体——只死认"甲/乙"字会让认购协议(甲方=出卖人)漏匹配,
|
|
256
|
+
# 反被幻觉的"甲方|X"主体截走章号(实测的浙典/浙奥分裂成因之一)。
|
|
257
|
+
_ROLE_GROUP_A = ("甲", "出卖", "卖方", "出租", "转让", "出借", "供方", "发包")
|
|
258
|
+
_ROLE_GROUP_B = ("乙", "买受", "买方", "承租", "受让", "借款", "需方", "承包")
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def _role_group(role: str) -> str:
|
|
262
|
+
"""把 role 归到甲方阵营('A') / 乙方阵营('B');两边都不沾返回 ''。"""
|
|
263
|
+
text = role or ""
|
|
264
|
+
if any(k in text for k in _ROLE_GROUP_A):
|
|
265
|
+
return "A"
|
|
266
|
+
if any(k in text for k in _ROLE_GROUP_B):
|
|
267
|
+
return "B"
|
|
268
|
+
return ""
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def _match_head_party(env: DocumentExtraction, vl_role: str) -> Optional[PersonIdentity]:
    """Map a VL signing-block role to a header party of the same camp.

    Returns the first entry of ``env.person_identities`` whose role falls in
    the same camp as *vl_role*, or ``None`` when *vl_role* belongs to no camp
    or no header party matches. The caller handles the ``None`` case.

    Per the original author's note, header party names come from body-text
    extraction and are treated as the anchor because they are more reliable
    than the entity name the VL model reads off a seal.
    """
    wanted = _role_group(vl_role)
    if not wanted:
        return None
    return next(
        (person for person in env.person_identities
         if _role_group(person.role or "") == wanted),
        None,
    )
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def _attach_seal_identities(env: DocumentExtraction, parsed: dict) -> None:
|
|
289
|
+
"""
|
|
290
|
+
把 VL 读出的章号绑定到**头部声明的对应主体**(按 role 匹配),作 identifier(label="印章")
|
|
291
|
+
追加进 env.person_identities——后续 known_parties reconcile 即自动跨合同核对章号一致性。
|
|
292
|
+
|
|
293
|
+
为何绑头部主体而非章面 owner:头部主体名来自正文 OCR(稳定、有明确指向),VL 看红章读出的
|
|
294
|
+
章面主体名常误读("浙典"→"浙奥")。用稳定的头部主体名作锚点、章号编号作核对值,才是用户
|
|
295
|
+
要的"头部主体 ↔ 落款签章"对应。头部匹配不到对应方时,退回用章面 owner 兜底(有总比丢好)。
|
|
296
|
+
"""
|
|
297
|
+
for unit in parsed.get("units") or []:
|
|
298
|
+
if not isinstance(unit, dict):
|
|
299
|
+
continue
|
|
300
|
+
for party in unit.get("parties") or []:
|
|
301
|
+
if not isinstance(party, dict) or not bool(party.get("has_seal")):
|
|
302
|
+
continue
|
|
303
|
+
seal_no = _seal_number(party)
|
|
304
|
+
if not seal_no:
|
|
305
|
+
continue
|
|
306
|
+
role = str(party.get("role") or "").strip()
|
|
307
|
+
pid = _match_head_party(env, role)
|
|
308
|
+
if pid is None:
|
|
309
|
+
owner = str(party.get("seal_owner") or "").strip()
|
|
310
|
+
if not owner:
|
|
311
|
+
continue
|
|
312
|
+
pid = PersonIdentity(name=owner, role=role or None)
|
|
313
|
+
env.person_identities.append(pid)
|
|
314
|
+
if not any(i.label == "印章" and i.value == seal_no for i in pid.identifiers):
|
|
315
|
+
pid.identifiers.append(LabeledValue(label="印章", value=seal_no))
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def _signatory_mismatch_issues(env: DocumentExtraction, parsed: dict) -> list[CompletenessIssue]:
|
|
319
|
+
"""
|
|
320
|
+
落款签字人 vs 当事人名单一致性核查:VL 读出的 signature_name 若不在 env.parties 中,
|
|
321
|
+
报疑似代签/冒签/笔误。例:补充协议乙方落款"王五",而乙方(买受人)是张三、李四。
|
|
322
|
+
|
|
323
|
+
保守判定:手写签字 OCR 不可靠,一律标"疑似,需人工复核";无 signature_name 的不判。
|
|
324
|
+
委托代理人代签等也会触发——作为异常交人工核对是对的(宁可疑,不漏冒签)。
|
|
325
|
+
名字匹配用双向子串("张三" ↔ "张三(买受人)"算同一人)。
|
|
326
|
+
"""
|
|
327
|
+
parties = [p for p in (env.parties or []) if p]
|
|
328
|
+
if not parties:
|
|
329
|
+
return []
|
|
330
|
+
|
|
331
|
+
def norm(s: str) -> str:
|
|
332
|
+
return "".join((s or "").split())
|
|
333
|
+
|
|
334
|
+
def in_parties(name: str) -> bool:
|
|
335
|
+
n = norm(name)
|
|
336
|
+
return bool(n) and any(n in norm(p) or norm(p) in n for p in parties)
|
|
337
|
+
|
|
338
|
+
issues: list[CompletenessIssue] = []
|
|
339
|
+
for unit in parsed.get("units") or []:
|
|
340
|
+
if not isinstance(unit, dict):
|
|
341
|
+
continue
|
|
342
|
+
agreement = str(unit.get("agreement") or "协议").strip()
|
|
343
|
+
page = str(unit.get("page") or "").strip()
|
|
344
|
+
for party in unit.get("parties") or []:
|
|
345
|
+
if not isinstance(party, dict):
|
|
346
|
+
continue
|
|
347
|
+
signer = str(party.get("signature_name") or "").strip()
|
|
348
|
+
if not signer or in_parties(signer):
|
|
349
|
+
continue
|
|
350
|
+
role = str(party.get("role") or "").strip()
|
|
351
|
+
issues.append(CompletenessIssue(
|
|
352
|
+
item=f"{agreement}·{role}落款人与当事人不符",
|
|
353
|
+
category="signature",
|
|
354
|
+
detail=f"落款签字「{signer}」不在当事人名单({'、'.join(parties)})中,"
|
|
355
|
+
"疑似代签/冒签/笔误,需人工复核",
|
|
356
|
+
evidence=f"据落款页图:第 {page} 页" if page else "",
|
|
357
|
+
))
|
|
358
|
+
return issues
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def augment_completeness_with_vision(env: DocumentExtraction, mineru_dir: Path) -> bool:
    """Re-judge signatures from the signature-page images and update *env*.

    On success, ``env.completeness`` is rebuilt: issues whose category is not
    ``signature`` are kept, and the ``signature`` issues are replaced by what
    the VL model saw (blank signing slots plus signers that are not among the
    parties). The seal numbers read by the model are then attached to the
    header parties in ``env.person_identities``.

    Only applies when ``env.doc_type`` is "合同协议".

    :return: ``True`` when *env* was updated. ``False`` when the document type
        does not apply, no signature page image was found, the VL call gave no
        result, or the result has no usable signing unit. In every ``False``
        case *env* is left untouched so the caller keeps the text-based
        signature verdict.
    """
    if env.doc_type != "合同协议":
        return False
    images = locate_signature_pages(mineru_dir)
    if not images:
        logger.info("未定位到落款页图,跳过 VL 签章核查")
        return False
    parsed = read_seals_on_images(images)
    if not isinstance(parsed, dict):  # None (degraded) or an unusable shape
        return False
    units = parsed.get("units")
    if not isinstance(units, list) or not any(isinstance(u, dict) for u in units):
        # The model reported no signing area at all, so nothing was checked.
        # Replacing the text-based signature issues with an empty list would
        # turn "not checked" into "complete".
        logger.info("VL returned no signature units; keeping text-based signature verdict")
        return False

    sig_issues = _issues_from_vision(parsed, _signature_evidence(images))
    # Cross-check signers against the party list; mismatches are also
    # reported under the "signature" category.
    sig_issues += _signatory_mismatch_issues(env, parsed)
    # Keep the non-signature issues found from text (field / amount ...).
    existing = (env.completeness.issues or []) if env.completeness else []
    field_issues = [i for i in existing if i.category != "signature"]
    all_issues = field_issues + sig_issues
    env.completeness = Completeness(
        status="incomplete" if all_issues else "complete",
        issues=all_issues,
    )
    # Bind seal readings to header parties so the later known_parties
    # reconcile step can compare seal numbers across contracts.
    _attach_seal_identities(env, parsed)
    return True
|