contract-archive-cli 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contract_archive/__init__.py +2 -0
- contract_archive/archive/__init__.py +64 -0
- contract_archive/archive/db.py +126 -0
- contract_archive/archive/ingest.py +667 -0
- contract_archive/archive/migrations/001_init.sql +62 -0
- contract_archive/archive/migrations/002_obligations.sql +25 -0
- contract_archive/archive/migrations/003_document_types.sql +31 -0
- contract_archive/archive/migrations/004_seals_subjects.sql +36 -0
- contract_archive/archive/migrations/005_completeness.sql +18 -0
- contract_archive/archive/party_registry.py +276 -0
- contract_archive/archive/paths.py +113 -0
- contract_archive/archive/repository.py +918 -0
- contract_archive/cli.py +455 -0
- contract_archive/cli_common.py +293 -0
- contract_archive/cli_config.py +96 -0
- contract_archive/cli_introspect.py +204 -0
- contract_archive/cli_party.py +166 -0
- contract_archive/cli_query.py +492 -0
- contract_archive/cli_render.py +575 -0
- contract_archive/config.py +257 -0
- contract_archive/errors.py +163 -0
- contract_archive/extraction/__init__.py +14 -0
- contract_archive/extraction/amount_check.py +87 -0
- contract_archive/extraction/contract_extractor.py +103 -0
- contract_archive/extraction/document_extractor.py +546 -0
- contract_archive/extraction/evidence_page_fix.py +99 -0
- contract_archive/extraction/llm_extractor.py +207 -0
- contract_archive/extraction/normalize.py +210 -0
- contract_archive/extraction/property_fee.py +79 -0
- contract_archive/extraction/vision_seal.py +390 -0
- contract_archive/pipelines/__init__.py +9 -0
- contract_archive/pipelines/mineru_pipeline.py +955 -0
- contract_archive/pipelines/vl_ocr.py +160 -0
- contract_archive/schemas/__init__.py +67 -0
- contract_archive/schemas/document.py +408 -0
- contract_archive/utils/__init__.py +27 -0
- contract_archive/utils/device.py +51 -0
- contract_archive/utils/http_env.py +54 -0
- contract_archive/utils/pdf.py +207 -0
- contract_archive_cli-0.2.7.dist-info/METADATA +386 -0
- contract_archive_cli-0.2.7.dist-info/RECORD +44 -0
- contract_archive_cli-0.2.7.dist-info/WHEEL +4 -0
- contract_archive_cli-0.2.7.dist-info/entry_points.txt +2 -0
- contract_archive_cli-0.2.7.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LLM-based 合同字段抽取。
|
|
3
|
+
|
|
4
|
+
模型:DashScope qwen3.7-max(严格按用户指定的 model id,不要替换)
|
|
5
|
+
|
|
6
|
+
策略:
|
|
7
|
+
- 一次性把 markdown / raw_text 喂给 LLM,让其返回结构化 JSON
|
|
8
|
+
- 用 JSON Schema/示例约束输出
|
|
9
|
+
- 失败/超时时返回空字典,由调用方(contract_extractor)处理为空抽取
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
import logging
|
|
15
|
+
import re
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
from typing import Any, Optional
|
|
18
|
+
|
|
19
|
+
from ..config import get_timeout_s, load_settings
|
|
20
|
+
from ..errors import ErrorInfo, classify_exception, config_missing
|
|
21
|
+
from ..utils.http_env import sanitized_httpx_proxy_env
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class LlmResult:
    """Outcome of a single LLM call together with its metadata.

    The parsed payload, the model id and the token usage travel together as
    the return value, so callers do not have to intercept the SDK to learn
    which model ran or how many tokens were spent.

    On failure ``parsed`` is an empty dict, so callers can keep testing
    ``if not res.parsed``.
    """

    # Parsed JSON payload; an empty dict when the call or the parsing failed.
    parsed: dict[str, Any]
    # Model id that was requested for this call.
    model: str
    # Token usage as reported by the response; None when it cannot be read.
    usage: Optional[dict[str, Any]] = None
    # Structured error (missing key / API exception); None on success.
    error: Optional[ErrorInfo] = None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
LLM_SYSTEM_PROMPT = """你是一名严谨的法律助理。请从给定的合同文本中抽取结构化字段。
|
|
44
|
+
铁律:
|
|
45
|
+
1. 只输出 JSON,不要任何解释、前缀、Markdown 代码块标记。
|
|
46
|
+
2. 抽不到的字段填 null,禁止猜测、禁止拼凑。
|
|
47
|
+
3. 如果合同里某个日期是占位符或空白(如"___年__月__日"、"2026年5月_日"),返回 null,不要补全。
|
|
48
|
+
4. 日期统一为 ISO 8601 (YYYY-MM-DD)。
|
|
49
|
+
5. 金额保留原文(如"人民币贰万元整"或"210000 元整")。
|
|
50
|
+
6. 签订日期(sign_date):仅取合同最后落款/签字处的日期;如未明确则 null,不要把"付款日"或"交付日"当签订日。
|
|
51
|
+
7. 到期日期(expire_date):仅取合同明确的有效期/失效日;车位转让、买卖等一次性合同通常没有,应填 null。
|
|
52
|
+
8. party_a / party_b:填全称(含公司类型如"有限公司"或买受人完整身份描述)。
|
|
53
|
+
9. risk_clauses 是字符串数组,每条 ≤80 字,仅列违约/赔偿/争议解决/不可抗力/保密/管辖等"出问题后果"型条款。
|
|
54
|
+
10. obligations 是动作清单——"X 方应/须于 Y 之前做 Z"型条款。
|
|
55
|
+
与 risk_clauses 严格区分:
|
|
56
|
+
- 动作类(要做某事、按时交付、提交资料、付款、验收、盖章、签订其他合同)→ obligations
|
|
57
|
+
- 后果类(违约金、解除权、争议解决、滞纳金、赔偿、不可抗力免责)→ risk_clauses
|
|
58
|
+
每条 obligation 必须含:actor、action、deadline(若无明确日期则 null)、evidence(原文片段≤120字)
|
|
59
|
+
actor 只能是 "party_a"|"party_b"|"both",不要写实际人名/公司名。
|
|
60
|
+
action 用动宾短语,≤30字,例如"递交审贷资料"、"交付车位"、"支付定金"。
|
|
61
|
+
deadline 是 ISO 'YYYY-MM-DD';原文为"签订本协议当日"/"30 日内"等相对时间无法换算时填 null。
|
|
62
|
+
宁缺毋滥:抽不出动作不要硬凑;典型合同 obligations 5-15 条为正常。
|
|
63
|
+
|
|
64
|
+
JSON 字段定义:
|
|
65
|
+
{
|
|
66
|
+
"contract_name": "合同名称",
|
|
67
|
+
"party_a": "甲方全称",
|
|
68
|
+
"party_b": "乙方全称(如有多人/多主体用顿号分隔)",
|
|
69
|
+
"amount": "合同金额原文",
|
|
70
|
+
"sign_date": "签订日期 ISO 8601 或 null",
|
|
71
|
+
"expire_date": "到期/终止日期 ISO 8601 或 null",
|
|
72
|
+
"auto_renewal": true/false/null,
|
|
73
|
+
"risk_clauses": ["违约金/赔偿/解除/争议解决等罚则", "..."],
|
|
74
|
+
"obligations": [
|
|
75
|
+
{
|
|
76
|
+
"actor": "party_a"|"party_b"|"both",
|
|
77
|
+
"action": "动宾短语",
|
|
78
|
+
"deadline": "YYYY-MM-DD 或 null",
|
|
79
|
+
"evidence": "原文片段"
|
|
80
|
+
}
|
|
81
|
+
]
|
|
82
|
+
}
|
|
83
|
+
"""
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def call_llm_extract(
    document_text: str,
    model: str | None = None,
    api_key: str | None = None,
    base_url: str | None = None,
    max_chars: int = 24000,
) -> LlmResult:
    """Extract contract fields by calling the LLM through the OpenAI-compatible API.

    :param document_text: full contract text to send to the model
    :param model: model id; falls back to ``settings.dashscope_model``
    :param api_key: API key; falls back to ``settings.dashscope_api_key``
    :param base_url: endpoint base; falls back to ``settings.dashscope_base_url``
    :param max_chars: truncation threshold applied to ``document_text``
    :return: ``LlmResult``; ``parsed`` is ``{}`` when the key is missing, the
        call raises, the response is empty, or the response is not parseable
    """
    # Explicit arguments win; anything falsy is filled in from the settings.
    cfg = load_settings()
    model = model if model else cfg.dashscope_model
    api_key = api_key if api_key else cfg.dashscope_api_key
    base_url = base_url if base_url else cfg.dashscope_base_url

    if not api_key:
        logger.warning("DASHSCOPE_API_KEY missing; skip LLM extraction")
        missing_key = config_missing("DASHSCOPE_API_KEY 缺失,跳过 LLM 抽取")
        return LlmResult(parsed={}, model=model, error=missing_key)

    clipped = _truncate_middle(document_text, max_chars)
    user_msg = f"以下是合同正文,请抽取字段:\n\n{clipped}"
    try:
        content, usage = _call_openai_compat(
            LLM_SYSTEM_PROMPT, user_msg, model, api_key, base_url
        )
    except Exception as exc:  # noqa: BLE001 - degrade to an empty result, keep a structured error
        logger.exception("DashScope LLM call failed: %s", exc)
        return LlmResult(parsed={}, model=model, error=classify_exception(exc))

    if not content:
        logger.warning("LLM empty response")
        return LlmResult(parsed={}, model=model, usage=usage)

    parsed = _parse_json_loose(content)
    if not parsed:
        # Only the first 200 characters are logged.
        logger.warning("LLM response not parseable as JSON: %s", content[:200])
    return LlmResult(parsed=parsed, model=model, usage=usage)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _truncate_middle(text: str, max_chars: int) -> str:
|
|
131
|
+
"""超长则头 1/3 尾 2/3 截断——尾部承载签字/金额/到期日等关键信息,权重更高。"""
|
|
132
|
+
if len(text) <= max_chars:
|
|
133
|
+
return text
|
|
134
|
+
head = max_chars // 3
|
|
135
|
+
return text[:head] + "\n\n[...省略中段...]\n\n" + text[-(max_chars - head):]
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _usage_from_openai(resp: Any) -> dict[str, Any] | None:
|
|
139
|
+
"""OpenAI 兼容响应的 token 用量 → 归一化 input/output/total_tokens。读不到返回 None。"""
|
|
140
|
+
u = getattr(resp, "usage", None)
|
|
141
|
+
if u is None:
|
|
142
|
+
return None
|
|
143
|
+
out = {
|
|
144
|
+
"input_tokens": getattr(u, "prompt_tokens", None),
|
|
145
|
+
"output_tokens": getattr(u, "completion_tokens", None),
|
|
146
|
+
"total_tokens": getattr(u, "total_tokens", None),
|
|
147
|
+
}
|
|
148
|
+
return out if any(v is not None for v in out.values()) else None
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _call_openai_compat(
    system_prompt: str, user_content: str, model: str, api_key: str, base_url: str
) -> tuple[str, dict[str, Any] | None]:
    """Call a text model through the OpenAI-compatible endpoint and return ``(content, usage)``.

    Exceptions are not caught here; the caller is expected to degrade.

    ``response_format`` is ``json_object`` and ``max_tokens`` is deliberately
    left unset so the JSON is not cut off mid-document.

    The client timeout comes from ``get_timeout_s("DASHSCOPE_TIMEOUT_S", 300.0)``.
    """
    from openai import OpenAI

    # Rewrite the native API path to the OpenAI-compatible one.
    compat_url = base_url.replace("/api/v1", "/compatible-mode/v1")
    request_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]
    # NOTE(review): indentation is not recoverable from this view; both the
    # client construction and the request are kept inside the sanitized
    # proxy-env context. Confirm against the original file.
    with sanitized_httpx_proxy_env():
        timeout_s = get_timeout_s("DASHSCOPE_TIMEOUT_S", 300.0)
        client = OpenAI(api_key=api_key, base_url=compat_url, timeout=timeout_s)
        resp = client.chat.completions.create(
            model=model,
            messages=request_messages,
            temperature=0.1,
            top_p=0.5,
            response_format={"type": "json_object"},
        )
        text = resp.choices[0].message.content or ""
        return text, _usage_from_openai(resp)
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _parse_json_loose(text: str) -> dict[str, Any]:
|
|
186
|
+
"""
|
|
187
|
+
LLM 偶尔会带 markdown 代码块或前缀文字,做一次 best-effort 解析。
|
|
188
|
+
"""
|
|
189
|
+
text = text.strip()
|
|
190
|
+
if text.startswith("```"):
|
|
191
|
+
# 去掉 ```json ... ``` 包裹
|
|
192
|
+
text = re.sub(r"^```(?:json)?\s*", "", text)
|
|
193
|
+
text = re.sub(r"\s*```$", "", text)
|
|
194
|
+
# 抓第一个 {...} 块
|
|
195
|
+
m = re.search(r"\{[\s\S]*\}", text)
|
|
196
|
+
if not m:
|
|
197
|
+
return {}
|
|
198
|
+
try:
|
|
199
|
+
return json.loads(m.group(0))
|
|
200
|
+
except json.JSONDecodeError:
|
|
201
|
+
# 修一下常见问题:单引号 / trailing comma
|
|
202
|
+
repaired = m.group(0).replace("'", '"')
|
|
203
|
+
repaired = re.sub(r",\s*([}\]])", r"\1", repaired)
|
|
204
|
+
try:
|
|
205
|
+
return json.loads(repaired)
|
|
206
|
+
except json.JSONDecodeError:
|
|
207
|
+
return {}
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""
|
|
2
|
+
确定性数值/结构归一化工具。
|
|
3
|
+
|
|
4
|
+
这是项目里**唯一保留的"死代码 rule"**——但它做的不是字段抽取,而是把 LLM
|
|
5
|
+
吐出的原文值规整成可存储/可比较的形态(中文大写金额→数值、日期→ISO、
|
|
6
|
+
LLM 数组→强类型对象)。LLM 在精确数值/日期换算上不可靠,这类确定性转换
|
|
7
|
+
交给代码更稳。字段"抽什么"全归 LLM。
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
from ..schemas import ObligationItem
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# ---------- 日期 ----------
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def normalize_date(value: str | None) -> str | None:
    """Coarsely normalise a date taken from contract text to ISO 8601.

    Accepts ``YYYY年M月D日`` and ``YYYY-M-D`` / ``YYYY/M/D`` / ``YYYY.M.D`` with
    optional spaces around the separators, plus Chinese-numeral years, months
    and days.

    :param value: raw date text; falsy input yields None
    :return: ``YYYY-MM-DD`` on success; the original ``value`` unchanged when
        it does not look like a date or does not name a real calendar day
    """
    if not value:
        return None
    from datetime import date  # local import keeps this block self-contained

    m = re.match(
        # Leading whitespace is tolerated (OCR / LLM padding).
        r"^\s*((?:19|20)\d{2}"
        # Four Chinese digits, including years ending in 〇 such as 二〇二〇.
        r"|[〇零一二三四五六七八九]{4}"
        # Legacy, looser spelling kept for backward compatibility.
        r"|二[〇零]{1,3}[一二三四五六七八九十]{1,3})"
        # Tolerate spaces between the digits and 年/月 in OCR output.
        r"\s*[年\-./]\s*(\d{1,2}|[一二三四五六七八九十]{1,3})"
        r"\s*[月\-./]\s*(\d{1,2}|[一二三四五六七八九十]{1,3})",
        value,
    )
    if not m:
        return value
    year, month, day = m.group(1), m.group(2), m.group(3)
    if not year.isdigit():
        year = _cn_year_to_int(year)
    if not month.isdigit():
        month = _cn_num_to_int(month)
    if not day.isdigit():
        day = _cn_num_to_int(day)
    try:
        # date() rejects impossible values (month 13, day 45, year > 9999), so
        # those fall back to the raw text instead of an ISO-looking string.
        return date(int(year), int(month), int(day)).isoformat()
    except (ValueError, TypeError, OverflowError):
        return value
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
_CN_DIGITS = {"〇": 0, "零": 0, "一": 1, "二": 2, "三": 3, "四": 4, "五": 5,
|
|
47
|
+
"六": 6, "七": 7, "八": 8, "九": 9, "十": 10}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _cn_year_to_int(s: str) -> str:
    """Convert a Chinese-numeral year to a digit string, one character at a time.

    Characters missing from ``_CN_DIGITS`` are skipped; when nothing converts,
    the input is returned unchanged.
    """
    converted = "".join(str(_CN_DIGITS[ch]) for ch in s if ch in _CN_DIGITS)
    return converted if converted else s
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _cn_num_to_int(s: str) -> str:
    """Convert a small Chinese numeral (month/day style) to a digit string.

    When the text contains 十 it is read as tens-十-ones, with a missing tens
    digit counting as 1 and a missing ones digit counting as 0. Otherwise each
    character is mapped individually; the input is returned unchanged when
    nothing converts.
    """
    if "十" not in s:
        digits = "".join(str(_CN_DIGITS.get(ch, "")) for ch in s)
        return digits or s
    pieces = s.split("十")
    before, after = pieces[0], pieces[1]
    tens = _CN_DIGITS.get(before, 1) if before else 1
    ones = _CN_DIGITS.get(after, 0) if after else 0
    return str(tens * 10 + ones)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# ---------- 金额 ----------
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
_CN_MONEY_DIGITS = {
|
|
68
|
+
"零": 0, "壹": 1, "贰": 2, "叁": 3, "肆": 4, "伍": 5,
|
|
69
|
+
"陆": 6, "柒": 7, "捌": 8, "玖": 9,
|
|
70
|
+
# 小写也支持(偶尔混用)
|
|
71
|
+
"一": 1, "二": 2, "三": 3, "四": 4, "五": 5,
|
|
72
|
+
"六": 6, "七": 7, "八": 8, "九": 9, "两": 2,
|
|
73
|
+
}
|
|
74
|
+
_CN_MONEY_UNITS = {
|
|
75
|
+
"拾": 10, "佰": 100, "仟": 1000, "万": 10000, "亿": 100000000,
|
|
76
|
+
"十": 10, "百": 100, "千": 1000,
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _cn_money_to_value(s: str) -> float | None:
|
|
81
|
+
"""
|
|
82
|
+
中文大写金额 → 数值(元)。
|
|
83
|
+
"壹仟贰佰贰拾柒万玖仟捌佰捌拾玖元整" → 12279889.0
|
|
84
|
+
"陆拾贰万壹仟壹佰零陆元柒角壹分" → 621106.71
|
|
85
|
+
解析失败返回 None。
|
|
86
|
+
"""
|
|
87
|
+
s = s.replace("人民币", "").replace("圆", "元").replace("整", "").strip()
|
|
88
|
+
if not s:
|
|
89
|
+
return None
|
|
90
|
+
# 小数部分:元后的 角(0.1) / 分(0.01),如"…元柒角壹分" = .71
|
|
91
|
+
frac = 0.0
|
|
92
|
+
if "元" in s:
|
|
93
|
+
intpart, _, fracpart = s.partition("元")
|
|
94
|
+
for unit_ch, unit_val in (("角", 0.1), ("分", 0.01)):
|
|
95
|
+
idx = fracpart.find(unit_ch)
|
|
96
|
+
if idx >= 1:
|
|
97
|
+
d = _CN_MONEY_DIGITS.get(fracpart[idx - 1])
|
|
98
|
+
if d is not None:
|
|
99
|
+
frac += d * unit_val
|
|
100
|
+
s = intpart
|
|
101
|
+
total = 0
|
|
102
|
+
section = 0 # 当前"万"以下的累加
|
|
103
|
+
digit = 0 # 上一个数字
|
|
104
|
+
saw_any = False
|
|
105
|
+
for ch in s:
|
|
106
|
+
if ch in _CN_MONEY_DIGITS:
|
|
107
|
+
digit = _CN_MONEY_DIGITS[ch]
|
|
108
|
+
saw_any = True
|
|
109
|
+
elif ch in _CN_MONEY_UNITS:
|
|
110
|
+
unit = _CN_MONEY_UNITS[ch]
|
|
111
|
+
if unit >= 10000:
|
|
112
|
+
# 万 / 亿 触发段结算
|
|
113
|
+
section = (section + digit) * unit
|
|
114
|
+
total += section
|
|
115
|
+
section = 0
|
|
116
|
+
else:
|
|
117
|
+
# 拾/佰/仟:digit 为 0 时按 1("拾" 单独 = 10)
|
|
118
|
+
section += (digit if digit else 1) * unit
|
|
119
|
+
digit = 0
|
|
120
|
+
saw_any = True
|
|
121
|
+
elif ch.isspace():
|
|
122
|
+
continue
|
|
123
|
+
else:
|
|
124
|
+
return None # 不认识的字符(含混阿拉伯/中文),交给阿拉伯路径
|
|
125
|
+
section += digit
|
|
126
|
+
total += section
|
|
127
|
+
result = float(total) + frac
|
|
128
|
+
return result if (saw_any or frac) and result > 0 else None
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def parse_money_value(value: str | None) -> float | None:
    """Extract a numeric amount in yuan from raw amount text.

    Arabic digits are tried first; the first number found is scaled by an
    immediately following unit (亿元/亿, 万元/万, 千元, 百元, 元/圆; no unit
    means yuan). Text without any Arabic digit falls back to the
    Chinese-numeral parser.

    :param value: raw amount text; falsy input yields None
    :return: the amount in yuan, or None when nothing can be parsed
    """
    if not value:
        return None
    # Thousands separators may be ASCII or full-width commas. 亿 is listed
    # before 万 so "2.5亿元" is scaled rather than read as 2.5 yuan.
    m = re.search(
        r"([0-9]+(?:[,,]\d{3})*(?:\.\d+)?)\s*(亿元|亿|万元|万|千元|百元|元|圆)?",
        value,
    )
    if m:
        num = float(m.group(1).replace(",", "").replace(",", ""))
        unit = m.group(2) or "元"
        multiplier = {
            "亿元": 100000000, "亿": 100000000,
            "万元": 10000, "万": 10000,
            "千元": 1000, "百元": 100,
            "元": 1, "圆": 1,
        }.get(unit, 1)
        return num * multiplier
    # No Arabic digits at all: fall back to Chinese numerals.
    return _cn_money_to_value(value)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# ---------- LLM 数组 → 强类型 ----------
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def coerce_bool(value: Any) -> bool | None:
    """Coerce a loosely typed boolean field (e.g. ``auto_renewal``) to ``bool | None``.

    Real booleans pass through; None stays None; anything else is compared,
    stripped and lower-cased, against true/是/yes/1 and false/否/no/0.
    Unrecognised values yield None.
    """
    if value is None:
        return None
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    known = {
        "true": True, "是": True, "yes": True, "1": True,
        "false": False, "否": False, "no": False, "0": False,
    }
    return known.get(token)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def coerce_str_list(value: Any, max_len: int = 200) -> list[str]:
    """Clean a string array (e.g. ``risk_clauses``) coming from the LLM.

    Non-list input yields ``[]``. Each item is stringified and stripped; None
    and blank items are dropped, and the rest are cut to ``max_len`` characters.
    """
    if not isinstance(value, list):
        return []
    stripped = (str(item).strip() for item in value if item is not None)
    return [text[:max_len] for text in stripped if text]
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def coerce_obligations(raw: Any) -> list[ObligationItem]:
    """Turn the LLM's ``obligations`` array (a list of dicts) into ``ObligationItem`` objects.

    Invalid entries are skipped instead of raising: non-dict items, items
    whose actor is not party_a / party_b / both (Chinese and English aliases
    accepted, ignoring case, spaces, ``_`` and ``-``), and items with a
    missing, null or blank action.

    :param raw: value of the ``obligations`` field; anything but a list yields ``[]``
    :return: the valid obligations, in input order
    """
    if not isinstance(raw, list):
        return []
    # Keys are compared after removing spaces/underscores/hyphens and
    # case-folding, so "Party A", "partyA", "party_a" and "PARTY-A" all match.
    actor_alias = {
        "甲方": "party_a", "partya": "party_a",
        "乙方": "party_b", "partyb": "party_b",
        "双方": "both", "both": "both",
    }

    def _text(field: Any) -> str:
        # JSON null must count as "missing"; str(None) would give "None".
        return "" if field is None else str(field).strip()

    out: list[ObligationItem] = []
    for item in raw:
        if not isinstance(item, dict):
            continue
        actor_key = re.sub(r"[\s_\-]+", "", _text(item.get("actor"))).casefold()
        actor = actor_alias.get(actor_key)
        if actor is None:
            continue
        action = _text(item.get("action"))
        if not action:
            continue
        deadline = item.get("deadline")
        # Only string deadlines are normalised; everything else becomes None.
        deadline = (normalize_date(deadline) or None) if isinstance(deadline, str) else None
        evidence = _text(item.get("evidence"))[:500]
        out.append(
            ObligationItem(
                actor=actor,  # type: ignore[arg-type]
                action=action[:200],
                deadline=deadline,
                evidence=evidence,
            )
        )
    return out
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""
|
|
2
|
+
周期性费用的月度估算:纯确定性派生,不依赖 LLM 算术。
|
|
3
|
+
|
|
4
|
+
为什么交给代码(同 amount_check.py 的哲学):合同只给**单价**(物业服务费 2.25 元/月·㎡、
|
|
5
|
+
服务费 4.55、能耗费 0.8),买受人真正关心的是"一个月实付多少钱"。LLM 照抄单价可靠,
|
|
6
|
+
但让它把同量纲单价相加再乘建筑面积,既易算错又不可审计。算术与量纲判断交给代码,
|
|
7
|
+
LLM 只负责抽出每项单价及其 unit。
|
|
8
|
+
|
|
9
|
+
口径:
|
|
10
|
+
月物业费 ≈ Σ(按建筑面积计价的物业类单价,元/月·㎡) × 建筑面积
|
|
11
|
+
- **只并入「元/月·㎡」量纲**的项(unit 含 ㎡/平方米 且 含 月);车位管理费"元/个/月"
|
|
12
|
+
量纲不同(应乘车位数而非建筑面积),不并入——混加是量纲错误。
|
|
13
|
+
- 建筑面积取「建筑面积/预测建筑面积」,**排除「套内/分摊」**——合同物业费明示按建筑面积计。
|
|
14
|
+
|
|
15
|
+
产出"估算、供参考"——单价/面积任一抽不到即返回 (None, None),不硬凑。
|
|
16
|
+
"""
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import re
|
|
20
|
+
from typing import Optional
|
|
21
|
+
|
|
22
|
+
from ..schemas import LabeledAmount, LabeledValue
|
|
23
|
+
|
|
24
|
+
# 面积数值解析:从"286.92 平方米""286.92㎡"里取第一个数。
|
|
25
|
+
_NUM_RE = re.compile(r"(\d+(?:\.\d+)?)")
|
|
26
|
+
# 建筑面积标签里需排除的限定词——要的是「总建筑面积」,非套内/分摊。
|
|
27
|
+
_AREA_EXCLUDE = ("套内", "分摊", "共有")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _is_per_sqm_month(unit: Optional[str]) -> bool:
|
|
31
|
+
"""unit 是否为「元/月·㎡」量纲(按建筑面积、按月计的单价)。"""
|
|
32
|
+
u = unit or ""
|
|
33
|
+
has_sqm = "㎡" in u or "平方米" in u or "平米" in u or "m²" in u or "m2" in u.lower()
|
|
34
|
+
has_month = "月" in u
|
|
35
|
+
return has_sqm and has_month
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _find_building_area(fields: list[LabeledValue]) -> Optional[float]:
    """Find the building area among the labelled fields.

    A field qualifies when its label contains 建筑面积, contains none of the
    ``_AREA_EXCLUDE`` qualifiers, and its value contains a number. Among the
    qualifying fields, the first whose label contains 预测 or is exactly
    建筑面积 wins; otherwise the first qualifying field is used.

    :return: the first number in the chosen field's value, or None when no
        field qualifies
    """
    matches: list[tuple[str, float]] = []
    for field in fields:
        label = field.label or ""
        if "建筑面积" not in label:
            continue
        if any(word in label for word in _AREA_EXCLUDE):
            continue
        hit = _NUM_RE.search(field.value or "")
        if hit is None:
            continue
        matches.append((label, float(hit.group(1))))

    if not matches:
        return None

    # Prefer the predicted / plain building-area label over other variants.
    preferred = next(
        (area for label, area in matches
         if "预测" in label or label.strip() == "建筑面积"),
        None,
    )
    if preferred is not None:
        return preferred
    return matches[0][1]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def estimate_monthly_property_fee(
    amounts: list[LabeledAmount], fields: list[LabeledValue]
) -> tuple[Optional[float], Optional[str]]:
    """Estimate the monthly property fee as (sum of per-sqm-month unit prices) x building area.

    :param amounts: labelled amounts; only those with a value and a
        per-square-metre, per-month unit are used
    :param fields: labelled values searched for the building area
    :return: ``(monthly_fee, formula_text)``; ``(None, None)`` when no usable
        unit price or no building area is found. The formula text keeps every
        unit price and the area so the figure can be checked by hand.
    """
    priced = []
    for amount in amounts:
        if amount.value is not None and _is_per_sqm_month(amount.unit):
            priced.append(amount)
    if not priced:
        return None, None

    area = _find_building_area(fields)
    if not area:
        return None, None

    unit_sum = round(sum(item.value for item in priced if item.value is not None), 4)
    monthly = round(unit_sum * area, 2)
    parts = "+".join(f"{item.label}{item.value:g}" for item in priced)
    text = f"({parts})={unit_sum:g}元/月·㎡ × {area:g}㎡ ≈ {monthly:,.2f}元/月"
    return monthly, text
|