contract-archive-cli 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contract_archive/__init__.py +2 -0
- contract_archive/archive/__init__.py +64 -0
- contract_archive/archive/db.py +126 -0
- contract_archive/archive/ingest.py +667 -0
- contract_archive/archive/migrations/001_init.sql +62 -0
- contract_archive/archive/migrations/002_obligations.sql +25 -0
- contract_archive/archive/migrations/003_document_types.sql +31 -0
- contract_archive/archive/migrations/004_seals_subjects.sql +36 -0
- contract_archive/archive/migrations/005_completeness.sql +18 -0
- contract_archive/archive/party_registry.py +276 -0
- contract_archive/archive/paths.py +113 -0
- contract_archive/archive/repository.py +918 -0
- contract_archive/cli.py +455 -0
- contract_archive/cli_common.py +293 -0
- contract_archive/cli_config.py +96 -0
- contract_archive/cli_introspect.py +204 -0
- contract_archive/cli_party.py +166 -0
- contract_archive/cli_query.py +492 -0
- contract_archive/cli_render.py +575 -0
- contract_archive/config.py +257 -0
- contract_archive/errors.py +163 -0
- contract_archive/extraction/__init__.py +14 -0
- contract_archive/extraction/amount_check.py +87 -0
- contract_archive/extraction/contract_extractor.py +103 -0
- contract_archive/extraction/document_extractor.py +546 -0
- contract_archive/extraction/evidence_page_fix.py +99 -0
- contract_archive/extraction/llm_extractor.py +207 -0
- contract_archive/extraction/normalize.py +210 -0
- contract_archive/extraction/property_fee.py +79 -0
- contract_archive/extraction/vision_seal.py +390 -0
- contract_archive/pipelines/__init__.py +9 -0
- contract_archive/pipelines/mineru_pipeline.py +955 -0
- contract_archive/pipelines/vl_ocr.py +160 -0
- contract_archive/schemas/__init__.py +67 -0
- contract_archive/schemas/document.py +408 -0
- contract_archive/utils/__init__.py +27 -0
- contract_archive/utils/device.py +51 -0
- contract_archive/utils/http_env.py +54 -0
- contract_archive/utils/pdf.py +207 -0
- contract_archive_cli-0.2.7.dist-info/METADATA +386 -0
- contract_archive_cli-0.2.7.dist-info/RECORD +44 -0
- contract_archive_cli-0.2.7.dist-info/WHEEL +4 -0
- contract_archive_cli-0.2.7.dist-info/entry_points.txt +2 -0
- contract_archive_cli-0.2.7.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
"""
|
|
2
|
+
全局配置:XDG 配置文件 + 进程环境的统一加载。
|
|
3
|
+
|
|
4
|
+
设计(对齐 clig.dev + 本项目的简洁取向):
|
|
5
|
+
- 配置文件走 XDG:$XDG_CONFIG_HOME/contract-archive/config.json(默认 ~/.config/...)。
|
|
6
|
+
- 优先级 env > 配置文件 > 默认值,纯只读短路:`os.getenv() or file or default`。
|
|
7
|
+
cli.py 的 load_dotenv() 已把项目 .env 注入 os.environ(override=False,shell export
|
|
8
|
+
仍优先),所以 .env 天然落在 env 层——保留老 .env 用户零中断。
|
|
9
|
+
- load_settings() 只读,**绝不回写 os.environ**;任何字段缺失都不报错
|
|
10
|
+
(api_key 缺失返回空串,由调用方在真要调 LLM 时降级,沿用既有"返回 {}+warning"语义)。
|
|
11
|
+
- secret(api_key)落盘是明文,靠目录 0700 + 文件 0600 + 展示掩码保护;
|
|
12
|
+
不上 keyring(对单用户本地 CLI 过重,边际收益低)。
|
|
13
|
+
- 故意不收:XDG_DATA_HOME(由 archive/paths.py 决定 data 位置,收进来是循环依赖)、
|
|
14
|
+
MinerU 子进程 environ(基础设施,非配置项);COMPUTE_DEVICE/LOG_LEVEL 是运行时旋钮,
|
|
15
|
+
保持 env-only,不做持久配置。
|
|
16
|
+
"""
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import json
|
|
20
|
+
import logging
|
|
21
|
+
import os
|
|
22
|
+
from dataclasses import dataclass
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
|
|
25
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Directory and file name of the config file below the XDG config base directory.
APP_CONFIG_DIR = "contract-archive"
CONFIG_FILENAME = "config.json"

# Built-in DashScope defaults.
DEFAULT_DASHSCOPE_BASE_URL = "https://dashscope.aliyuncs.com/api/v1"
DEFAULT_DASHSCOPE_MODEL = "qwen3.7-max"
# Multimodal seal verification (OpenAI-compatible endpoint); qwen3.6-plus is more accurate.
DEFAULT_DASHSCOPE_VL_MODEL = "qwen3.6-flash"
# Dedicated OCR model for the OCR stage, called page by page (maxInput 30000, so
# several pages cannot be packed into one request).
DEFAULT_DASHSCOPE_OCR_MODEL = "qwen-vl-ocr-latest"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass(frozen=True, slots=True)
|
|
36
|
+
class ConfigKey:
|
|
37
|
+
"""一个受支持的全局配置键。"""
|
|
38
|
+
|
|
39
|
+
name: str # 用户输入名 / 文件键名,如 "dashscope.api_key"
|
|
40
|
+
env_name: str # 对应环境变量名(严格沿用现存名,保证老 .env 值仍被读到)
|
|
41
|
+
secret: bool = False # 敏感项,展示时掩码
|
|
42
|
+
default: str | None = None
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# Registry of supported keys: the DashScope settings (LLM extraction / VL seal
# check / OCR) plus the archive directory. Each env_name must be identical to
# the already-existing environment variable name.
CONFIG_KEYS: tuple[ConfigKey, ...] = (
    ConfigKey(name="dashscope.api_key", env_name="DASHSCOPE_API_KEY", secret=True),
    ConfigKey(
        name="dashscope.base_url",
        env_name="DASHSCOPE_BASE_URL",
        default=DEFAULT_DASHSCOPE_BASE_URL,
    ),
    ConfigKey(
        name="dashscope.model",
        env_name="DASHSCOPE_LLM_MODEL",
        default=DEFAULT_DASHSCOPE_MODEL,
    ),
    ConfigKey(
        name="dashscope.vl_model",
        env_name="DASHSCOPE_VL_MODEL",
        default=DEFAULT_DASHSCOPE_VL_MODEL,
    ),
    ConfigKey(
        name="dashscope.ocr_model",
        env_name="DASHSCOPE_OCR_MODEL",
        default=DEFAULT_DASHSCOPE_OCR_MODEL,
    ),
    ConfigKey(name="archive.dir", env_name="CONTRACT_ARCHIVE_DIR"),
)
# Lookup table: key name -> key definition.
_KEYS_BY_NAME = {spec.name: spec for spec in CONFIG_KEYS}
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass(slots=True)
|
|
58
|
+
class Settings:
|
|
59
|
+
"""运行时配置:全局配置文件 + 进程环境合并后的取值。"""
|
|
60
|
+
|
|
61
|
+
dashscope_api_key: str
|
|
62
|
+
dashscope_base_url: str
|
|
63
|
+
dashscope_model: str
|
|
64
|
+
dashscope_vl_model: str
|
|
65
|
+
dashscope_ocr_model: str
|
|
66
|
+
archive_dir: str | None
|
|
67
|
+
config_path: Path
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def find_key(name: str) -> ConfigKey | None:
    """Look up a config key definition by name.

    Surrounding whitespace in ``name`` is ignored. Returns None when the name
    is not registered.
    """
    normalized = name.strip()
    return _KEYS_BY_NAME.get(normalized)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def config_path() -> Path:
    """Return the config file path: <config base>/contract-archive/config.json.

    The base directory is resolved from XDG_CONFIG_HOME, with ~/.config as
    the fallback.
    """
    base_dir = _xdg_base_dir("XDG_CONFIG_HOME", Path.home() / ".config")
    return base_dir.joinpath(APP_CONFIG_DIR, CONFIG_FILENAME)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _xdg_base_dir(env_name: str, fallback: Path) -> Path:
|
|
81
|
+
"""返回绝对 XDG 基目录,否则回退(与 archive/paths.py 同风格:仅绝对路径生效)。"""
|
|
82
|
+
raw = os.getenv(env_name)
|
|
83
|
+
if not raw:
|
|
84
|
+
return fallback
|
|
85
|
+
candidate = Path(raw).expanduser()
|
|
86
|
+
return candidate if candidate.is_absolute() else fallback
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def load_config_values(path: Path | None = None) -> dict[str, str]:
    """Read the config file into a ``{name: value}`` mapping.

    This is the read-only path and must never raise: a broken config must not
    take every command down. A missing file yields ``{}``; an unreadable,
    non-UTF-8 or non-JSON file, or one whose top level is not an object, is
    ignored with a warning. Unknown keys are skipped with a warning; ``null``
    values and values that are empty after stripping are dropped silently.

    :param path: config file to read; defaults to ``config_path()``.
    :returns: stripped, non-empty string values of the registered keys.
    """
    cfg = path or config_path()
    if not cfg.exists():
        return {}
    try:
        payload = json.loads(cfg.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError;
        # the latter is raised by read_text() for a file that is not valid UTF-8.
        logger.warning("配置文件读取失败,忽略 %s: %s", cfg, e)
        return {}
    if not isinstance(payload, dict):
        logger.warning("配置文件必须是 JSON 对象,忽略: %s", cfg)
        return {}
    out: dict[str, str] = {}
    for raw_key, raw_value in payload.items():
        name = str(raw_key).strip()
        if name not in _KEYS_BY_NAME:
            logger.warning("配置文件含未知键,跳过: %s", name)
            continue
        if raw_value is None:
            continue
        value = str(raw_value).strip()
        if value:
            out[name] = value
    return out
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def save_config_values(values: dict[str, str], path: Path | None = None) -> Path:
    """Write the config file with directory mode 0700 and file mode 0600.

    The file may hold a plaintext secret, so it is created with mode 0600
    rather than being written with umask-default permissions and restricted
    afterwards. A pre-existing file is tightened to 0600 before the new
    content is written. Both chmod calls run on every save so that a
    permissive umask or earlier loose permissions are always corrected.

    :param values: mapping to serialise as indented UTF-8 JSON.
    :param path: target file; defaults to ``config_path()``.
    :returns: the path that was written.
    """
    cfg = path or config_path()
    cfg.parent.mkdir(parents=True, exist_ok=True)
    cfg.parent.chmod(0o700)
    # Serialise first so a serialisation error cannot leave a truncated file behind.
    payload = json.dumps(values, ensure_ascii=False, indent=2) + "\n"
    fd = os.open(cfg, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w", encoding="utf-8") as fh:
        # The mode passed to os.open() only applies to newly created files;
        # tighten an existing file before any secret is written to it.
        cfg.chmod(0o600)
        fh.write(payload)
    return cfg
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _read_value(values: dict[str, str], key: ConfigKey) -> str | None:
|
|
130
|
+
"""
|
|
131
|
+
优先级 os.getenv(含 .env 注入) > 配置文件 > 默认值。只读,不回写 os.environ,不报错。
|
|
132
|
+
|
|
133
|
+
strip 后判 truthy,把"未设 / 空串 / 纯空白"三者一视同仁地回落下一层——一把消除
|
|
134
|
+
特殊情况(空串与空白串行为一致),与历史 _resolve_archive 的 truthy 语义一致。
|
|
135
|
+
默认值在此兜底(base_url/model 有 default),故 load_settings 无需再 `or DEFAULT`,
|
|
136
|
+
默认保持单一真相源(只在 CONFIG_KEYS 里定义一次)。
|
|
137
|
+
"""
|
|
138
|
+
for candidate in (os.getenv(key.env_name), values.get(key.name), key.default):
|
|
139
|
+
if candidate and candidate.strip():
|
|
140
|
+
return candidate.strip()
|
|
141
|
+
return None
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def load_settings(path: Path | None = None) -> Settings:
    """Load the runtime settings.

    Nothing is cached: the file and the environment are read on every call,
    so a ``config set`` in the same process takes effect immediately and
    tests are not polluted. Missing fields never raise.

    :param path: config file to read; defaults to ``config_path()``.
    :returns: Settings resolved per key as environment > file > default.
    """
    file_values = load_config_values(path)
    resolved = {spec.name: _read_value(file_values, spec) for spec in CONFIG_KEYS}

    # The defaults live in CONFIG_KEYS and are already applied by _read_value.
    # The `or DEFAULT_*` / `or ""` below refer to the same constants and only
    # narrow `str | None` to the `str` that the Settings fields require.
    return Settings(
        dashscope_api_key=resolved["dashscope.api_key"] or "",
        dashscope_base_url=resolved["dashscope.base_url"] or DEFAULT_DASHSCOPE_BASE_URL,
        dashscope_model=resolved["dashscope.model"] or DEFAULT_DASHSCOPE_MODEL,
        dashscope_vl_model=resolved["dashscope.vl_model"] or DEFAULT_DASHSCOPE_VL_MODEL,
        dashscope_ocr_model=resolved["dashscope.ocr_model"] or DEFAULT_DASHSCOPE_OCR_MODEL,
        archive_dir=resolved["archive.dir"],
        config_path=path or config_path(),
    )
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def set_value(key: str, value: str, path: Path | None = None) -> Path:
    """Store one config entry in the config file.

    :param key: must be a registered key name.
    :param value: new value; stored stripped.
    :param path: config file; defaults to ``config_path()``.
    :returns: the path that was written.
    :raises ValueError: for an unregistered key (the message lists the supported keys).
    """
    name = _validate_key(key)
    current = load_config_values(path)
    current.update({name: value.strip()})
    return save_config_values(current, path)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def unset_value(key: str, path: Path | None = None) -> Path:
    """Remove one entry from the config file; environment variables are untouched.

    The file is rewritten even when the key was not present.

    :param key: must be a registered key name.
    :param path: config file; defaults to ``config_path()``.
    :returns: the path that was written.
    :raises ValueError: for an unregistered key.
    """
    name = _validate_key(key)
    current = load_config_values(path)
    if name in current:
        del current[name]
    return save_config_values(current, path)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _validate_key(key: str) -> str:
    """Return the stripped key name if it is registered.

    :raises ValueError: when the key is unknown; the message echoes the key
        as given and lists every supported key name.
    """
    name = key.strip()
    if name in _KEYS_BY_NAME:
        return name
    supported = ", ".join(spec.name for spec in CONFIG_KEYS)
    raise ValueError(f"不支持的配置键: {key}。支持的键: {supported}")
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def get_timeout_s(env_name: str, default: float) -> float:
    """Read a "seconds" runtime knob from the environment.

    Examples are DASHSCOPE_TIMEOUT_S and CONTRACT_ARCHIVE_MINERU_TIMEOUT_S.
    Timeouts are runtime knobs rather than persisted configuration, so they
    stay environment-only and are not part of CONFIG_KEYS.

    Bad values never raise. An unset or blank variable returns ``default``
    silently. A non-numeric, non-positive, NaN or infinite value returns
    ``default`` after a warning.

    :param env_name: environment variable to read.
    :param default: value returned when the variable is missing or unusable.
    :returns: a finite, strictly positive number of seconds, or ``default``.
    """
    raw = os.getenv(env_name)
    if not raw or not raw.strip():
        return default
    try:
        val = float(raw.strip())
    except ValueError:
        logger.warning("%s=%r 不是合法数字,回退默认 %ss", env_name, raw, default)
        return default
    if val <= 0:
        logger.warning("%s=%r 非正数,回退默认 %ss", env_name, raw, default)
        return default
    # float() accepts "nan" and "inf". NaN fails every comparison, so it slips
    # past the check above; +inf is positive but not a usable timeout.
    if not val < float("inf"):
        logger.warning("%s=%r 不是有限数值,回退默认 %ss", env_name, raw, default)
        return default
    return val
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def display_value(key: ConfigKey, value: str | None, *, reveal: bool) -> str:
    """Render a value for display.

    :param key: key definition; its ``secret`` flag decides masking.
    :param value: resolved value, possibly None or empty.
    :param reveal: show secrets in clear text when True.
    :returns: ``"<unset>"`` for an empty value, a fixed mask for a secret
        that is not revealed, otherwise the value itself.
    """
    if value:
        hide = key.secret and not reveal
        return "********" if hide else value
    return "<unset>"
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def visible_items(*, reveal: bool = False, path: Path | None = None) -> list[tuple[str, str]]:
    """Return ``(name, display value)`` pairs in registry order, for ``config show``.

    Values are resolved as environment > file > default and then passed
    through ``display_value``, so secrets are masked unless ``reveal`` is set.
    """
    file_values = load_config_values(path)
    rows: list[tuple[str, str]] = []
    for spec in CONFIG_KEYS:
        resolved = _read_value(file_values, spec)
        rows.append((spec.name, display_value(spec, resolved, reveal=reveal)))
    return rows
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def describe_items(*, reveal: bool = False, path: Path | None = None) -> list[dict[str, object]]:
    """Return a structured description of every config key, for ``config show --format json``.

    Each entry holds: ``key``, ``env`` (the environment variable name),
    ``secret``, ``default``, ``value`` (resolved as environment > file >
    default, with secrets masked unless ``reveal``) and ``source``, one of
    ``env`` / ``file`` / ``default`` / ``unset``. The source is decided in
    the same order that the value is resolved.
    """
    file_values = load_config_values(path)
    described: list[dict[str, object]] = []
    for spec in CONFIG_KEYS:
        from_env = os.getenv(spec.env_name)
        from_file = file_values.get(spec.name)
        # The first layer holding a usable value names the source.
        layers = (
            ("env", bool(from_env and from_env.strip())),
            ("file", bool(from_file and from_file.strip())),
            ("default", bool(spec.default)),
        )
        origin = next((label for label, present in layers if present), "unset")
        shown = display_value(spec, _read_value(file_values, spec), reveal=reveal)
        described.append(
            {
                "key": spec.name,
                "env": spec.env_name,
                "secret": spec.secret,
                "default": spec.default,
                "value": shown,
                "source": origin,
            }
        )
    return described
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
"""
|
|
2
|
+
结构化错误模型——给机器(Agent / 自动化编排)一个可据以决策的错误信号。
|
|
3
|
+
|
|
4
|
+
为什么存在:历史上错误是自由文本(`f"mineru: {e}"`),退出码只有 0/1,
|
|
5
|
+
Agent 无法区分「限流(该退避重试)」与「缺 API key(重试无用,该停下改配置)」,
|
|
6
|
+
只能正则匹配错误串——供应商一改措辞就崩。这里把错误归一成
|
|
7
|
+
`code / category / retryable`,让上层(尤其 ingest 的 JSON 输出)携带可判定信号。
|
|
8
|
+
|
|
9
|
+
设计原则:
|
|
10
|
+
- 分类靠 duck-typing(异常类名 + status_code),**不 import openai**——
|
|
11
|
+
errors 是底层模块,不该绑死某个 SDK 的异常类层级,也避免无谓的强依赖。
|
|
12
|
+
- ErrorInfo 是 pydantic 模型,可直接嵌进 DocumentExtraction 落盘、可 model_dump 进 JSON 输出。
|
|
13
|
+
- retryable 是给 Agent 的核心信号:transient 类(限流/超时/网络/5xx)为 True,
|
|
14
|
+
config/permission/validation/user 类为 False。
|
|
15
|
+
"""
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from enum import Enum
|
|
19
|
+
from typing import Optional
|
|
20
|
+
|
|
21
|
+
from pydantic import BaseModel
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class ErrorCategory(str, Enum):
    """Broad error classes; the transient class is the one treated as retryable."""

    # Bad user input (unknown id / not a PDF).
    user = "user"
    # Invalid arguments or upstream response (400, empty extraction).
    validation = "validation"
    # Missing or invalid configuration (no API key, key no longer valid).
    config = "config"
    # Authenticated but not permitted (403).
    permission = "permission"
    # Transient failure where a retry may succeed (rate limit / timeout / network / 5xx).
    transient = "transient"
    # Infrastructure or external tool failure (MinerU crash, DB lock).
    infra = "infra"
    # Could not be classified.
    unknown = "unknown"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class ErrorInfo(BaseModel):
    """One structured error record.

    It is embedded in the persisted extraction envelope and emitted as-is by
    the CLI's ``--format json`` output.
    """

    # Stable machine-readable error code (e.g. RATE_LIMITED) for an agent to switch on.
    code: str
    # Error class; see ErrorCategory.
    category: str
    # Human-readable detail, truncated so long tracebacks do not end up in the JSON.
    message: str
    # Core signal for an agent: whether backing off and retrying is worthwhile.
    retryable: bool
    # Suggested wait in seconds before retrying (rate limiting); None when unknown.
    retry_after_s: Optional[float] = None
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# message 截断长度:够定位问题,又不至于把整个 traceback/超长上游响应灌进 JSON。
|
|
55
|
+
_MAX_MESSAGE_LEN = 500
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _short(text: str) -> str:
|
|
59
|
+
"""错误文本归一:去首尾空白 + 截断,保证 JSON 体积可控。"""
|
|
60
|
+
text = (text or "").strip()
|
|
61
|
+
return text if len(text) <= _MAX_MESSAGE_LEN else text[: _MAX_MESSAGE_LEN - 1] + "…"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def classify_exception(exc: BaseException) -> ErrorInfo:
    """Classify an exception (typically from an external API call) as an ErrorInfo.

    Classification is duck-typed on the exception class name and an integer
    ``status_code`` attribute, not on ``isinstance`` against SDK classes. It
    therefore recognises the openai SDK's exceptions as well as look-alikes
    raised by custom gateways or other SDKs. The built-in ``TimeoutError``
    and ``ConnectionError`` families are matched by ``isinstance``, so a
    plain socket-level failure with an empty message is still treated as
    transient.

    Checks run in this order, first match wins: rate limit (429), auth (401),
    permission (403), bad request (400), timeout, connection / 5xx.
    Anything else is UNKNOWN and, conservatively, not retryable.

    :param exc: the exception to classify.
    :returns: an ErrorInfo whose message is the truncated ``str(exc)``.
    """
    name = type(exc).__name__
    status = getattr(exc, "status_code", None)
    if not isinstance(status, int):
        status = None
    msg = _short(str(exc))
    lowered = msg.lower()

    # Rate limiting: 429 / RateLimitError. Retryable, with a back-off hint when available.
    if name == "RateLimitError" or status == 429:
        return ErrorInfo(
            code="RATE_LIMITED", category=ErrorCategory.transient.value,
            message=msg, retryable=True, retry_after_s=_retry_after(exc),
        )
    # Authentication failed / invalid key: 401. A configuration problem; retrying is useless.
    if name == "AuthenticationError" or status == 401:
        return ErrorInfo(
            code="AUTH_FAILED", category=ErrorCategory.config.value,
            message=msg, retryable=False,
        )
    # Authenticated but not permitted: 403.
    if name == "PermissionDeniedError" or status == 403:
        return ErrorInfo(
            code="PERMISSION_DENIED", category=ErrorCategory.permission.value,
            message=msg, retryable=False,
        )
    # Invalid request: 400 (wrong model id, illegal parameter). Retrying is useless.
    if name == "BadRequestError" or status == 400:
        return ErrorInfo(
            code="BAD_REQUEST", category=ErrorCategory.validation.value,
            message=msg, retryable=False,
        )
    # Timeout: 408, the built-in TimeoutError family, openai / httpx timeout
    # class names, or a message that mentions a timeout. Transient, retryable.
    if (
        isinstance(exc, TimeoutError)
        or name in ("APITimeoutError", "ReadTimeout", "ConnectTimeout", "WriteTimeout", "PoolTimeout")
        or status == 408
        or "timed out" in lowered
        or "timeout" in lowered
    ):
        return ErrorInfo(
            code="TIMEOUT", category=ErrorCategory.transient.value,
            message=msg, retryable=True,
        )
    # Connection failure or upstream 5xx: transient, retryable. Covers the
    # built-in ConnectionError family and a bare httpx ConnectError.
    if (
        isinstance(exc, ConnectionError)
        or name in ("APIConnectionError", "ConnectError", "InternalServerError")
        or (status is not None and status >= 500)
    ):
        return ErrorInfo(
            code="UPSTREAM_ERROR", category=ErrorCategory.transient.value,
            message=msg, retryable=True,
        )
    return ErrorInfo(
        code="UNKNOWN", category=ErrorCategory.unknown.value,
        message=msg, retryable=False,
    )
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _retry_after(exc: BaseException) -> Optional[float]:
|
|
126
|
+
"""尽力从异常/响应头取 Retry-After 秒数;取不到返回 None(由调用方自定退避)。"""
|
|
127
|
+
resp = getattr(exc, "response", None)
|
|
128
|
+
headers = getattr(resp, "headers", None)
|
|
129
|
+
if headers:
|
|
130
|
+
raw = headers.get("retry-after") or headers.get("Retry-After")
|
|
131
|
+
if raw:
|
|
132
|
+
try:
|
|
133
|
+
return float(raw)
|
|
134
|
+
except (TypeError, ValueError):
|
|
135
|
+
return None
|
|
136
|
+
return None
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
# ---------- 非异常来源的常见错误构造器(语义清晰,省得各处手拼 ErrorInfo) ----------
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def config_missing(detail: str) -> ErrorInfo:
    """Build the error for missing required configuration.

    The most common case is a missing DASHSCOPE_API_KEY. Not retryable: the
    configuration has to be fixed first.
    """
    summary = _short(detail)
    return ErrorInfo(
        code="CONFIG_MISSING",
        category=ErrorCategory.config.value,
        message=summary,
        retryable=False,
    )
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def extract_empty(detail: str) -> ErrorInfo:
    """Build the error for an LLM call that succeeded but extracted nothing.

    This is distinct from the missing-key case. Conservatively marked as not
    retryable.
    """
    summary = _short(detail)
    return ErrorInfo(
        code="EXTRACT_EMPTY",
        category=ErrorCategory.validation.value,
        message=summary,
        retryable=False,
    )
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def mineru_failed(detail: str) -> ErrorInfo:
    """Build the error for a failed MinerU parse (non-zero exit or crash of the subprocess).

    Classified as an infrastructure error and marked as not retryable.
    """
    summary = _short(detail)
    return ErrorInfo(
        code="MINERU_FAILED",
        category=ErrorCategory.infra.value,
        message=summary,
        retryable=False,
    )
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from .contract_extractor import extract_contract
|
|
2
|
+
from .document_extractor import call_llm_document, extract_document
|
|
3
|
+
from .llm_extractor import call_llm_extract
|
|
4
|
+
from .normalize import coerce_obligations, normalize_date, parse_money_value
|
|
5
|
+
|
|
6
|
+
# Public API of the extraction package: the extractor entry points, followed by
# the normalisation helpers.
__all__ = [
    "extract_contract", "extract_document",
    "call_llm_document", "call_llm_extract",
    "normalize_date", "parse_money_value", "coerce_obligations",
]
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""
|
|
2
|
+
金额自洽性校验:纯确定性数值规则,不依赖 LLM 算术。
|
|
3
|
+
|
|
4
|
+
为什么不交给 LLM:抽取时 LLM 会忠实抄录原文金额,但对"首期 500000 却 > 总价
|
|
5
|
+
200000"这类数量矛盾毫无警觉(实测 29 号车位即 LLM 照抄矛盾数字而未报)。算术与
|
|
6
|
+
比较交给代码,LLM 只负责语义标注(is_total_component / is_installment)。
|
|
7
|
+
|
|
8
|
+
两条规则,都只产出"疑似异常、请人工核对"——属辅助筛查、非终判:
|
|
9
|
+
规则A 分期超额:各分期付款项(is_installment)之和 > 总价(合计) 即报(如首期50W+
|
|
10
|
+
余款15W=65W > 总价20W,首期多打一个0)。**只报正差**——分期和 < 总价是认购预付、
|
|
11
|
+
首付+贷款等正常场景(认购预付50W+定金50W ≪ 房屋总价1228W),报了是误报。
|
|
12
|
+
规则B 单项越界:未计入合计、也未标分期的单项金额却 > 合计,疑似多填/笔误,
|
|
13
|
+
作为 LLM 漏标分期时的兜底。
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from ..schemas import CompletenessIssue, LabeledAmount
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _tolerance(total: float) -> float:
|
|
21
|
+
"""金额比较容差:取「1元」与「合计的 1%」中较大者。
|
|
22
|
+
|
|
23
|
+
既容忍分/角小数舍入,又不会把数量级矛盾(50W vs 20W)当成噪声放过。
|
|
24
|
+
"""
|
|
25
|
+
return max(1.0, abs(total) * 0.01)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _merge_evidence(amounts: list[LabeledAmount]) -> str:
|
|
29
|
+
"""拼接各分期项的出处,便于翻回原文逐笔核对。"""
|
|
30
|
+
parts = [f"{a.label}:{a.evidence}" for a in amounts if a.evidence]
|
|
31
|
+
return ";".join(parts)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def check_amount_consistency(
    amounts: list[LabeledAmount], computed_total: float | None
) -> list[CompletenessIssue]:
    """Run the deterministic amount consistency rules and return ``amount`` issues.

    Without a usable total (``computed_total`` missing, zero or negative)
    there is no reference to compare against, so nothing is checked and an
    empty list is returned.

    Rule A: the sum of the installment amounts exceeds the total by more than
    the tolerance. Only overshoot is reported, because installments adding up
    to less than the total is normal (deposit, prepayment, down payment plus loan).

    Rule B: a single amount that is neither part of the total, nor an
    installment, nor a unit price exceeds the total plus the tolerance. This
    is a fallback for installments the LLM failed to label.

    :param amounts: labelled amounts extracted from the document.
    :param computed_total: the reference total, if any.
    :returns: issues asking for a manual check, Rule A first, then Rule B in input order.
    """
    if not computed_total or computed_total <= 0:
        return []

    found: list[CompletenessIssue] = []
    tol = _tolerance(computed_total)

    # Rule A: installments summing to more than the total is a hard contradiction.
    scheduled = [a for a in amounts if a.is_installment and a.value is not None]
    if scheduled:
        paid_total = round(sum(a.value for a in scheduled), 2)
        excess = round(paid_total - computed_total, 2)
        if excess > tol:
            joined = "+".join(a.label for a in scheduled)
            detail_a = (
                f"分期款之和({joined})={paid_total:,.0f}元,"
                f"超过总价/合计 {computed_total:,.0f}元(多 {excess:,.0f}元),"
                f"疑似金额笔误,请人工核对"
            )
            found.append(CompletenessIssue(
                item="分期款超过总价",
                category="amount",
                detail=detail_a,
                evidence=_merge_evidence(scheduled),
            ))

    # Rule B: an unlabelled single amount larger than the total.
    ceiling = computed_total + tol
    for entry in amounts:
        if entry.is_total_component or entry.is_installment:
            continue
        if entry.unit:
            # Unit prices (per month, per square metre, ...) have a different
            # dimension and are not compared with the contract amount.
            continue
        if entry.value is None or not entry.value > ceiling:
            continue
        detail_b = (
            f"{entry.label}={entry.value:,.0f}元 超过合计 {computed_total:,.0f}元,"
            f"疑似多填/笔误,请人工核对"
        )
        found.append(CompletenessIssue(
            item=f"{entry.label}超过合计",
            category="amount",
            detail=detail_b,
            evidence=entry.evidence,
        ))
    return found
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""
|
|
2
|
+
合同字段抽取(LLM-only)。
|
|
3
|
+
|
|
4
|
+
Phase 2 起退役了 rule_extractor + rule/LLM hybrid 合并:合同抽取与其余文档类型
|
|
5
|
+
一样走纯 LLM,只对 LLM 输出做确定性数值归一化(金额→数值、日期→ISO)。
|
|
6
|
+
保留 ContractExtraction schema 与合同专属列/搜索(party/到期/续约/风险/义务),
|
|
7
|
+
是因为这些是文档化的能力,不破坏。
|
|
8
|
+
|
|
9
|
+
合同的 LLM prompt 仍是那份调校过的 LLM_SYSTEM_PROMPT(见 llm_extractor)。
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import logging
|
|
14
|
+
|
|
15
|
+
from ..schemas import (
|
|
16
|
+
ContractExtraction,
|
|
17
|
+
ExtractionConfidence,
|
|
18
|
+
FieldConfidence,
|
|
19
|
+
)
|
|
20
|
+
from .llm_extractor import call_llm_extract
|
|
21
|
+
from .normalize import (
|
|
22
|
+
coerce_bool,
|
|
23
|
+
coerce_obligations,
|
|
24
|
+
coerce_str_list,
|
|
25
|
+
normalize_date,
|
|
26
|
+
parse_money_value,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
# 参与 overall 置信度均值的字段(与历史口径一致)
|
|
32
|
+
_SCORED_FIELDS = (
|
|
33
|
+
"contract_name", "party_a", "party_b", "amount",
|
|
34
|
+
"sign_date", "expire_date", "auto_renewal", "risk_clauses",
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _build_confidence(ext: ContractExtraction) -> ExtractionConfidence:
    """Build the LLM-only confidence record for an extraction.

    A field with a value is scored as source ``llm`` with confidence 0.7; a
    field without one as ``missing`` with 0.0. ``rule_hit`` is always False
    and ``llm_agreed`` always None, since no rule-based cross-check exists.
    ``overall`` is the mean over ``_SCORED_FIELDS``.
    """

    def score(present: bool) -> FieldConfidence:
        if present:
            source, value = "llm", 0.7
        else:
            source, value = "missing", 0.0
        return FieldConfidence(
            value_source=source,
            confidence=value,
            rule_hit=False,
            llm_agreed=None,
        )

    # auto_renewal is a tri-state flag: an explicit False still counts as present.
    presence = {
        "contract_name": bool(ext.contract_name),
        "party_a": bool(ext.party_a),
        "party_b": bool(ext.party_b),
        "amount": bool(ext.amount),
        "sign_date": bool(ext.sign_date),
        "expire_date": bool(ext.expire_date),
        "auto_renewal": ext.auto_renewal is not None,
        "risk_clauses": bool(ext.risk_clauses),
    }
    result = ExtractionConfidence()
    for field_name, has_value in presence.items():
        setattr(result, field_name, score(has_value))

    per_field = [getattr(result, field_name).confidence for field_name in _SCORED_FIELDS]
    result.overall = sum(per_field) / len(per_field)
    return result
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def extract_contract(
    document_text: str,
    llm_enabled: bool = True,
    model: str | None = None,
) -> tuple[ContractExtraction, ExtractionConfidence]:
    """Main entry point for contract field extraction (LLM-only).

    :param document_text: contract text handed to the LLM.
    :param llm_enabled: when False, no LLM call is made and empty results are
        returned. The same empty pair is returned when the LLM call yields no
        parsed payload; there is no rule-based fallback on this path.
    :param model: overrides the model passed to ``call_llm_extract``; None
        leaves the choice to that function.
    :returns: the normalised extraction and its confidence record.
    """
    raw = call_llm_extract(document_text, model=model).parsed if llm_enabled else None
    if not raw:
        return ContractExtraction(), ExtractionConfidence()

    # Dates are only normalised when the LLM returned them as strings.
    sign_raw = raw.get("sign_date")
    expire_raw = raw.get("expire_date")
    ext = ContractExtraction(
        contract_name=raw.get("contract_name") or None,
        party_a=raw.get("party_a") or None,
        party_b=raw.get("party_b") or None,
        amount=raw.get("amount") or None,
        sign_date=normalize_date(sign_raw) if isinstance(sign_raw, str) else None,
        expire_date=normalize_date(expire_raw) if isinstance(expire_raw, str) else None,
        auto_renewal=coerce_bool(raw.get("auto_renewal")),
        risk_clauses=coerce_str_list(raw.get("risk_clauses")),
        obligations=coerce_obligations(raw.get("obligations")),
    )
    ext.amount_value = parse_money_value(ext.amount)

    # Record an "[LLM] <value>" evidence entry for each populated headline field.
    evidence: dict = {}
    for field_name in ("contract_name", "party_a", "party_b", "amount", "sign_date"):
        field_value = getattr(ext, field_name)
        if field_value:
            evidence[field_name] = f"[LLM] {field_value}"
    ext.raw_evidence = evidence
    return ext, _build_confidence(ext)
|