contract-archive-cli 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contract_archive/__init__.py +2 -0
- contract_archive/archive/__init__.py +64 -0
- contract_archive/archive/db.py +126 -0
- contract_archive/archive/ingest.py +667 -0
- contract_archive/archive/migrations/001_init.sql +62 -0
- contract_archive/archive/migrations/002_obligations.sql +25 -0
- contract_archive/archive/migrations/003_document_types.sql +31 -0
- contract_archive/archive/migrations/004_seals_subjects.sql +36 -0
- contract_archive/archive/migrations/005_completeness.sql +18 -0
- contract_archive/archive/party_registry.py +276 -0
- contract_archive/archive/paths.py +113 -0
- contract_archive/archive/repository.py +918 -0
- contract_archive/cli.py +455 -0
- contract_archive/cli_common.py +293 -0
- contract_archive/cli_config.py +96 -0
- contract_archive/cli_introspect.py +204 -0
- contract_archive/cli_party.py +166 -0
- contract_archive/cli_query.py +492 -0
- contract_archive/cli_render.py +575 -0
- contract_archive/config.py +257 -0
- contract_archive/errors.py +163 -0
- contract_archive/extraction/__init__.py +14 -0
- contract_archive/extraction/amount_check.py +87 -0
- contract_archive/extraction/contract_extractor.py +103 -0
- contract_archive/extraction/document_extractor.py +546 -0
- contract_archive/extraction/evidence_page_fix.py +99 -0
- contract_archive/extraction/llm_extractor.py +207 -0
- contract_archive/extraction/normalize.py +210 -0
- contract_archive/extraction/property_fee.py +79 -0
- contract_archive/extraction/vision_seal.py +390 -0
- contract_archive/pipelines/__init__.py +9 -0
- contract_archive/pipelines/mineru_pipeline.py +955 -0
- contract_archive/pipelines/vl_ocr.py +160 -0
- contract_archive/schemas/__init__.py +67 -0
- contract_archive/schemas/document.py +408 -0
- contract_archive/utils/__init__.py +27 -0
- contract_archive/utils/device.py +51 -0
- contract_archive/utils/http_env.py +54 -0
- contract_archive/utils/pdf.py +207 -0
- contract_archive_cli-0.2.7.dist-info/METADATA +386 -0
- contract_archive_cli-0.2.7.dist-info/RECORD +44 -0
- contract_archive_cli-0.2.7.dist-info/WHEEL +4 -0
- contract_archive_cli-0.2.7.dist-info/entry_points.txt +2 -0
- contract_archive_cli-0.2.7.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,667 @@
|
|
|
1
|
+
"""
|
|
2
|
+
单 PDF 入库流水线。
|
|
3
|
+
|
|
4
|
+
流程(每个 PDF 一次调用):
|
|
5
|
+
1) 流式 SHA256
|
|
6
|
+
2) 查 documents.sha256 → 命中 + 非 reingest 直接 skip
|
|
7
|
+
3) 在 tmp/<sha-short>/ 先留 source.pdf,再跑 OCR pipeline + 抽取
|
|
8
|
+
4) 全成功后 os.rename(tmp → documents/<sha-short>/) 是事务边界
|
|
9
|
+
5) DB 写入 documents + risk_clauses(单事务,由 repository 保证)
|
|
10
|
+
6) 追加一行 ingest.jsonl 总日志
|
|
11
|
+
7) 失败时:仍保留 documents/<sha-short>/source.pdf + ingest.log,记 status=failed
|
|
12
|
+
|
|
13
|
+
状态语义:
|
|
14
|
+
- ok OCR + 抽取都成功
|
|
15
|
+
- partial OCR 成功但 LLM 失败 → markdown 可用,可后续 extract 命令重跑
|
|
16
|
+
- failed OCR 失败 → 没有 OCR 产物,但 source.pdf 留档可查
|
|
17
|
+
"""
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import json
|
|
21
|
+
import logging
|
|
22
|
+
import sqlite3
|
|
23
|
+
import time
|
|
24
|
+
import traceback
|
|
25
|
+
from dataclasses import dataclass
|
|
26
|
+
from datetime import datetime, timezone
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
from typing import Optional
|
|
29
|
+
|
|
30
|
+
from ..errors import ErrorInfo, classify_exception, extract_empty, mineru_failed
|
|
31
|
+
from ..extraction import extract_contract, extract_document
|
|
32
|
+
from ..extraction.vision_seal import augment_completeness_with_vision
|
|
33
|
+
from ..extraction.evidence_page_fix import correct_evidence_pages
|
|
34
|
+
from ..pipelines import MinerUPipeline
|
|
35
|
+
from ..schemas import (
|
|
36
|
+
FILE_EXTRACTION,
|
|
37
|
+
FILE_EXTRACTION_CONF,
|
|
38
|
+
FILE_MARKDOWN,
|
|
39
|
+
FILE_RAW_TEXT,
|
|
40
|
+
ContractExtraction,
|
|
41
|
+
DocumentExtraction,
|
|
42
|
+
ExtractionConfidence,
|
|
43
|
+
)
|
|
44
|
+
from .party_registry import PartyRegistry
|
|
45
|
+
from .paths import ArchivePaths, SHA_SHORT_LEN, link_or_copy, safe_rmtree, sha256_of_file
|
|
46
|
+
from .repository import (
|
|
47
|
+
contract_to_envelope,
|
|
48
|
+
find_by_sha,
|
|
49
|
+
get_document,
|
|
50
|
+
insert_document,
|
|
51
|
+
replace_document,
|
|
52
|
+
update_extraction,
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# ---------- 结果类型 ----------
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
class IngestResult:
    """Outcome of ingesting a single PDF; the CLI uses it to summarise and print results."""

    pdf_path: Path
    sha256: str
    status: str  # ok | partial | failed | skipped
    doc_id: Optional[int]  # documents.id that was written, or that already existed
    mineru_duration_s: Optional[float] = None
    llm_duration_s: Optional[float] = None
    error_message: Optional[str] = None  # human-readable error (also stored in DB documents.error_message)
    error: Optional[ErrorInfo] = None  # structured error (only for CLI --format json output; not persisted)
    skipped_reason: Optional[str] = None
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# ---------- 抽取调度(LLM-first) ----------
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _envelope_confidence(env: DocumentExtraction) -> float:
|
|
80
|
+
"""
|
|
81
|
+
非合同文档的总体置信度启发式(LLM-first,无 rule 交叉验证)。
|
|
82
|
+
有标题/摘要算基础 0.5,每多一类柔性信息(主体/金额/字段/日期)+0.1,封顶 0.9。
|
|
83
|
+
"""
|
|
84
|
+
if not env.title and not env.summary:
|
|
85
|
+
return 0.0
|
|
86
|
+
rich = sum(bool(x) for x in (env.parties, env.amounts, env.fields, env.key_dates))
|
|
87
|
+
return min(0.9, 0.5 + 0.1 * rich)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _run_extraction(
    document_text: str, llm_enabled: bool
) -> tuple[ContractExtraction, ExtractionConfidence, DocumentExtraction]:
    """
    LLM-first extraction dispatcher.

    The document type and the generic envelope are extracted first; when the
    document is a contract, the contract extraction runs as well to fill the
    contract-specific columns. (Per the original notes, contract extraction has
    been LLM-only since Phase 2; there is no rule/hybrid mode any more.)

    :return: ``(contract extraction, confidence, generic envelope)``, handed
        together to the repository for persistence.
    """
    if llm_enabled:
        doc_envelope = extract_document(document_text, llm_enabled=llm_enabled)
        is_contract = doc_envelope.doc_type == "合同协议" and llm_enabled

        if not is_contract:
            # Non-contract: no contract-specific columns, so the overall score
            # comes from the envelope heuristic.
            heuristic = ExtractionConfidence()
            heuristic.overall = _envelope_confidence(doc_envelope)
            return ContractExtraction(), heuristic, doc_envelope

        contract, contract_conf = extract_contract(document_text, llm_enabled=llm_enabled)
        # Obligations come from the contract extraction (the original note says
        # its dedicated prompt separates obligations and penalties more finely).
        doc_envelope.obligations = contract.obligations
        # Fall back to the envelope title when the contract extraction gave none.
        if not contract.contract_name and doc_envelope.title:
            contract.contract_name = doc_envelope.title
        return contract, contract_conf, doc_envelope

    # LLM disabled: rule extraction was retired in Phase 2, so
    # extract_contract(llm_enabled=False) returns an empty object. With --no-llm
    # the extraction fields stay empty (only MinerU output is archived) and can
    # be filled in later with `extract <id>`.
    contract, contract_conf = extract_contract(document_text, llm_enabled=False)
    return contract, contract_conf, contract_to_envelope(contract)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _ensure_archived_source(paths: ArchivePaths, sha: str, pdf_path: Path) -> Path:
|
|
120
|
+
"""
|
|
121
|
+
幂等保证 archive 可控目录内有 source.pdf。
|
|
122
|
+
|
|
123
|
+
重复 ingest 命中 skip 时也走这里:如果历史产物被误删,当前这次 ingest 仍会
|
|
124
|
+
把源 PDF 补回 documents/<sha-short>/source.pdf。
|
|
125
|
+
"""
|
|
126
|
+
source_pdf = paths.doc_dir(sha) / "source.pdf"
|
|
127
|
+
if not source_pdf.exists():
|
|
128
|
+
link_or_copy(pdf_path, source_pdf)
|
|
129
|
+
return source_pdf
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
# ---------- 入口 ----------
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def ingest_pdf(
    pdf_path: Path,
    paths: ArchivePaths,
    conn: sqlite3.Connection,
    *,
    reingest: bool = False,
    llm_enabled: bool = True,
    pipeline: Optional[MinerUPipeline] = None,
) -> IngestResult:
    """
    Ingest a single PDF into the archive.

    Steps: hash the file; skip when the hash is already archived (unless
    ``reingest`` is set or the previous attempt failed); run OCR and extraction
    inside a temporary directory; rename that directory into the document
    directory; write the database row; append one line to the archive-wide
    JSONL log.

    :param pdf_path: absolute or relative path of the PDF
    :param paths: archive root paths object
    :param conn: an open, already migrated sqlite3 connection
    :param reingest: when True, rerun even if the sha256 is already archived
    :param llm_enabled: when False, LLM extraction is skipped and the extraction
        fields stay empty (they can be filled in later with ``extract <id>``)
    :param pipeline: injectable MinerUPipeline instance (reuses loaded models;
        the original note says batch callers must pass one)
    :return: an IngestResult whose status is ok, partial, failed or skipped
    :raises FileNotFoundError: if ``pdf_path`` is not an existing file
    """
    pdf_path = pdf_path.resolve()
    if not pdf_path.is_file():
        raise FileNotFoundError(pdf_path)

    paths.ensure()

    logger.info("hashing %s", pdf_path.name)
    sha = sha256_of_file(pdf_path)
    sha_short = sha[:SHA_SHORT_LEN]
    logger.info("sha256=%s", sha_short)

    existing_id = find_by_sha(conn, sha)
    if existing_id and not reingest:
        prev = get_document(conn, existing_id)
        prev_status = prev.status if prev else None
        if prev_status == "failed":
            # A previous failure does not count as "already archived": running
            # again means the user wants a retry, so treat it as a reingest
            # instead of skipping and telling them to add --reingest.
            logger.info("sha=%s 上次 ingest 失败,自动重试", sha_short)
            reingest = True
        else:
            _ensure_archived_source(paths, sha, pdf_path)
            if prev_status == "partial":
                hint = f"(OCR 已完成、抽取未完成;用 `extract {existing_id}` 只重跑抽取,省去 OCR)"
            else:
                hint = "(已成功入库)"
            return IngestResult(
                pdf_path=pdf_path,
                sha256=sha,
                status="skipped",
                doc_id=existing_id,
                skipped_reason=f"sha256 已在档案库{hint};要强制重跑整条流程加 --reingest",
            )

    # Work in tmp; rename to documents/<sha-short>/ only after everything succeeded.
    tmp_doc_dir = paths.tmp_dir / sha_short
    safe_rmtree(tmp_doc_dir)
    tmp_doc_dir.mkdir(parents=True, exist_ok=True)
    mineru_dir = tmp_doc_dir / "mineru"

    mineru_duration: Optional[float] = None
    llm_duration: Optional[float] = None
    extraction: Optional[ContractExtraction] = None
    confidence: Optional[ExtractionConfidence] = None
    envelope: Optional[DocumentExtraction] = None
    error_message: Optional[str] = None
    error_info: Optional[ErrorInfo] = None
    status = "ok"

    # Per-document plain-text log, complementing the archive-wide JSONL log.
    log_path = tmp_doc_dir / "ingest.log"
    log_handle = log_path.open("w", encoding="utf-8")

    try:
        # These first writes and the source copy run inside the try so that the
        # finally clause closes the handle if any of them raises.
        log_handle.write(f"# ingest started at {_utc_now()}\n# pdf={pdf_path}\n")
        link_strategy = link_or_copy(pdf_path, tmp_doc_dir / "source.pdf")
        log_handle.write(f"[source.pdf] {link_strategy}ed from {pdf_path}\n")

        # ---- 1. OCR parsing ----
        pl = pipeline or MinerUPipeline(allow_vl_fallback=llm_enabled)
        t0 = time.perf_counter()
        try:
            pl.run(pdf_path, mineru_dir)
            mineru_duration = time.perf_counter() - t0
            log_handle.write(f"\n[ocr] OK in {mineru_duration:.2f}s\n")
        except Exception as e:
            mineru_duration = time.perf_counter() - t0
            status = "failed"
            error_message = f"ocr: {e}"
            log_handle.write(f"\n[ocr] FAILED: {error_message}\n")
            log_handle.write(traceback.format_exc())
            return _commit_failed(
                conn=conn,
                paths=paths,
                pdf_path=pdf_path,
                sha=sha,
                tmp_doc_dir=tmp_doc_dir,
                log_handle=log_handle,
                existing_id=existing_id,
                mineru_duration=mineru_duration,
                error_message=error_message,
                error=mineru_failed(str(e)),
            )

        # ---- 2. Extraction (text loaded from the MinerU output directory) ----
        document_text = load_document_text(mineru_dir)
        if not document_text:
            log_handle.write("\n[extract] WARNING: no text found in mineru output\n")
        t1 = time.perf_counter()
        try:
            extraction, confidence, envelope = _run_extraction(
                document_text, llm_enabled=llm_enabled
            )
            llm_duration = time.perf_counter() - t1
            log_handle.write(
                f"[extract] OK in {llm_duration:.2f}s (doc_type={envelope.doc_type})\n"
            )
            # Guard against an empty extraction: the LLM was enabled but nothing
            # came back (per the original note, most often a missing
            # DASHSCOPE_API_KEY). Do not silently report ok.
            if (
                llm_enabled
                and not extraction.contract_name
                and not envelope.title
                and not envelope.fields
                and not envelope.amounts
                and not envelope.seals
            ):
                status = "partial"
                # Prefer the structured error carried by the envelope and fall
                # back to EXTRACT_EMPTY only when it is absent; error_message
                # stays the human-readable hint.
                error_info = envelope.extraction_error or extract_empty("LLM 抽取为空")
                error_message = (
                    "LLM 抽取为空——通常是缺 DASHSCOPE_API_KEY(全局工具需在 shell "
                    "export,不读项目 .env)或 LLM 调用失败;补好后用 `extract <id>` 重抽"
                )
                log_handle.write(f"\n[extract] WARNING: {error_message}\n")
        except Exception as e:
            llm_duration = time.perf_counter() - t1
            status = "partial"
            error_message = f"extract: {e}"
            error_info = classify_exception(e)
            extraction = ContractExtraction()
            confidence = ExtractionConfidence()
            envelope = DocumentExtraction()
            log_handle.write(f"\n[extract] FAILED (status=partial): {error_message}\n")
            log_handle.write(traceback.format_exc())

        # ---- 2.5 Multimodal seal check on the signature-page image ----
        # The OCR-failure path returned above, so status is never "failed" here.
        if status != "failed" and llm_enabled:
            try:
                if augment_completeness_with_vision(envelope, mineru_dir):
                    log_handle.write("[seal-vision] 签章核查完成(看落款页图)\n")
            except Exception as e:  # noqa: BLE001 - a vision failure must not abort the ingest
                log_handle.write(f"[seal-vision] 跳过(异常): {e}\n")

        # ---- 2.6 Evidence page correction using the MinerU output ----
        try:
            if correct_evidence_pages(envelope, mineru_dir):
                log_handle.write("[page-fix] 出处页码已据 content_list 校正\n")
        except Exception as e:  # noqa: BLE001 - a page-fix failure must not abort the ingest
            log_handle.write(f"[page-fix] 跳过(异常): {e}\n")

        # ---- 2.7 Identity reconciliation against the known-parties registry ----
        # The extracted person_identities are reconciled with the cross-document
        # registry; the returned issues are stored on the envelope.
        try:
            registry = PartyRegistry.load(paths.known_parties_path)
            id_issues = registry.reconcile(envelope.person_identities, sha)
            if registry.dirty:
                registry.save()
            envelope.identity_issues = id_issues
            if id_issues:
                log_handle.write(f"[identity] 身份核对:{len(id_issues)} 项与基准不一致\n")
            elif envelope.person_identities:
                log_handle.write("[identity] 身份核对:与基准一致(或首见已入库)\n")
        except Exception as e:  # noqa: BLE001 - a reconciliation failure must not abort the ingest
            log_handle.write(f"[identity] 跳过(异常): {e}\n")

        # ---- 3. Write the extraction files (even for partial, so `extract` can rerun) ----
        (tmp_doc_dir / FILE_EXTRACTION).write_text(
            envelope.model_dump_json(indent=2), encoding="utf-8"
        )
        (tmp_doc_dir / FILE_EXTRACTION_CONF).write_text(
            confidence.model_dump_json(indent=2), encoding="utf-8"
        )

        # ---- 4. Transaction boundary: rename tmp -> documents/<sha-short>/ ----
        final_doc_dir = paths.doc_dir(sha)
        safe_rmtree(final_doc_dir)
        final_doc_dir.parent.mkdir(parents=True, exist_ok=True)
        tmp_doc_dir.rename(final_doc_dir)
        # The path changed with the rename, so close the handle and reopen the
        # log at its new location for the remaining appends.
        log_handle.close()
        log_handle = (final_doc_dir / "ingest.log").open("a", encoding="utf-8")

        # ---- 5. Database write ----
        if existing_id:
            replace_document(
                conn,
                existing_id,
                source_path=str(pdf_path),
                output_dir=str(final_doc_dir),
                status=status,
                mineru_duration_s=mineru_duration,
                llm_duration_s=llm_duration,
                error_message=error_message,
                extraction=extraction,
                confidence=confidence,
                envelope=envelope,
            )
            doc_id = existing_id
            log_handle.write(f"\n[db] replaced id={doc_id} status={status}\n")
        else:
            doc_id = insert_document(
                conn,
                sha256=sha,
                source_path=str(pdf_path),
                output_dir=str(final_doc_dir),
                status=status,
                mineru_duration_s=mineru_duration,
                llm_duration_s=llm_duration,
                error_message=error_message,
                extraction=extraction,
                confidence=confidence,
                envelope=envelope,
            )
            # Rare race: another worker inserted this sha between our hash and
            # our insert, so reuse the id it created.
            if doc_id is None:
                doc_id = find_by_sha(conn, sha)
                log_handle.write(
                    f"\n[db] race: sha already inserted by peer, reusing id={doc_id}\n"
                )
            else:
                log_handle.write(f"\n[db] inserted id={doc_id} status={status}\n")

        # ---- 6. Archive-wide JSONL log ----
        _append_jsonl(
            paths.ingest_log,
            {
                "ts": _utc_now(),
                "pdf": str(pdf_path),
                "sha": sha,
                "doc_id": doc_id,
                "status": status,
                "mineru_s": mineru_duration,
                "llm_s": llm_duration,
                "error": error_message,
            },
        )

        return IngestResult(
            pdf_path=pdf_path,
            sha256=sha,
            status=status,
            doc_id=doc_id,
            mineru_duration_s=mineru_duration,
            llm_duration_s=llm_duration,
            error_message=error_message,
            error=error_info,
        )
    finally:
        # Best-effort close; a failure to close must not mask the real outcome.
        try:
            log_handle.close()
        except Exception:
            pass
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def _commit_failed(
    *,
    conn: sqlite3.Connection,
    paths: ArchivePaths,
    pdf_path: Path,
    sha: str,
    tmp_doc_dir: Path,
    log_handle,
    existing_id: Optional[int],
    mineru_duration: Optional[float],
    error_message: str,
    error: Optional[ErrorInfo] = None,
) -> IngestResult:
    """
    Finish an ingest whose OCR step failed.

    The database still records a row with status=failed, and the source PDF is
    kept inside the archive.

    If a forced reingest of a document that was previously ok/partial fails, the
    old output_dir artifacts are kept and only this run's log is moved to the
    archive root. For a new document, or one whose previous status was already
    failed, the tmp directory is committed as documents/<sha-short>/ so that at
    least source.pdf and ingest.log remain.

    :param log_handle: open per-document log handle; it is closed here
    :param existing_id: id of an existing row for this sha, if any
    :param error: optional structured error passed through to the result
    :return: an IngestResult with status "failed"
    """
    log_handle.close()

    final_doc_dir = paths.doc_dir(sha)
    existing = get_document(conn, existing_id) if existing_id else None
    old_output_dir = Path(existing.output_dir) if existing and existing.output_dir else None
    keep_old_outputs = (
        existing is not None
        and existing.status in {"ok", "partial"}
        and old_output_dir is not None
        and old_output_dir.exists()
    )

    failed_log: Optional[Path]
    if keep_old_outputs:
        # The old OCR artifacts are still usable and must not be overwritten by
        # one failed reingest; still make sure the archived PDF is in place.
        _ensure_archived_source(paths, sha, pdf_path)
        failed_log = paths.root / f"failed_{sha[:SHA_SHORT_LEN]}_{int(time.time())}.log"
        try:
            (tmp_doc_dir / "ingest.log").rename(failed_log)
        except OSError:
            # The log could not be moved; report no log path rather than a wrong one.
            failed_log = None
        safe_rmtree(tmp_doc_dir)
        output_dir = str(old_output_dir)
    else:
        safe_rmtree(final_doc_dir)
        final_doc_dir.parent.mkdir(parents=True, exist_ok=True)
        tmp_doc_dir.rename(final_doc_dir)
        output_dir = str(final_doc_dir)
        failed_log = final_doc_dir / "ingest.log"

    if existing_id:
        replace_document(
            conn,
            existing_id,
            source_path=str(pdf_path),
            output_dir=output_dir,
            status="failed",
            mineru_duration_s=mineru_duration,
            llm_duration_s=None,
            error_message=error_message,
            extraction=ContractExtraction(),
            confidence=ExtractionConfidence(),
        )
        doc_id = existing_id
    else:
        doc_id = insert_document(
            conn,
            sha256=sha,
            source_path=str(pdf_path),
            output_dir=output_dir,
            status="failed",
            mineru_duration_s=mineru_duration,
            llm_duration_s=None,
            error_message=error_message,
            extraction=ContractExtraction(),
            confidence=ExtractionConfidence(),
        )
        # ingest_pdf treats a None return from insert_document as "another
        # worker inserted this sha first". Handle it the same way here so the
        # log line and the result do not carry doc_id=None.
        if doc_id is None:
            doc_id = find_by_sha(conn, sha)

    _append_jsonl(
        paths.ingest_log,
        {
            "ts": _utc_now(),
            "pdf": str(pdf_path),
            "sha": sha,
            "doc_id": doc_id,
            "status": "failed",
            "mineru_s": mineru_duration,
            "llm_s": None,
            "error": error_message,
            "log_path": str(failed_log) if failed_log else None,
        },
    )

    return IngestResult(
        pdf_path=pdf_path,
        sha256=sha,
        status="failed",
        doc_id=doc_id,
        mineru_duration_s=mineru_duration,
        error_message=error_message,
        error=error,
    )
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
# ---------- 复跑抽取(partial 状态修复,不重跑 OCR) ----------
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
def re_extract(
    doc_id: int,
    paths: ArchivePaths,
    conn: sqlite3.Connection,
    *,
    llm_enabled: bool = True,
) -> IngestResult:
    """
    Rerun extraction on top of existing MinerU output.

    Used to repair documents in the partial state, or to re-extract in bulk
    after a prompt change. The MinerU artifacts are left alone, as are
    sha256 / source_path / ingested_at.

    :param doc_id: documents.id of the document to re-extract
    :param paths: archive paths object
    :param conn: open sqlite3 connection
    :param llm_enabled: forwarded to the extraction dispatcher
    :return: an IngestResult with status ok or partial
    :raises ValueError: if no document has this id
    :raises FileNotFoundError: if the document's mineru directory is missing
    """
    record = get_document(conn, doc_id)
    if not record:
        raise ValueError(f"document id={doc_id} not found")

    output_root = Path(record.output_dir)
    mineru_dir = output_root / "mineru"
    if not mineru_dir.exists():
        raise FileNotFoundError(
            f"mineru output missing for id={doc_id}: {mineru_dir}"
        )

    document_text = load_document_text(mineru_dir)
    started = time.perf_counter()
    status = "ok"
    error_message: Optional[str] = None
    error_info: Optional[ErrorInfo] = None
    envelope = DocumentExtraction()
    try:
        extraction, confidence, envelope = _run_extraction(
            document_text, llm_enabled=llm_enabled
        )
        # Empty-extraction guard, mirroring ingest_pdf: the LLM was enabled yet
        # nothing came back, so report partial with a structured error instead
        # of a misleading ok.
        if llm_enabled and not (
            extraction.contract_name
            or envelope.title
            or envelope.fields
            or envelope.amounts
            or envelope.seals
        ):
            status = "partial"
            error_info = envelope.extraction_error or extract_empty("LLM 抽取为空")
            error_message = (
                "LLM 抽取为空——通常是缺 DASHSCOPE_API_KEY 或 LLM 调用失败;"
                "补好后重跑 `extract <id>`"
            )
    except Exception as exc:
        status = "partial"
        error_message = f"extract: {exc}"
        error_info = classify_exception(exc)
        extraction = ContractExtraction()
        confidence = ExtractionConfidence()
        envelope = DocumentExtraction()
    llm_duration = time.perf_counter() - started

    # Multimodal seal check on the signature-page image; only after a clean extraction.
    if status == "ok" and llm_enabled:
        try:
            augment_completeness_with_vision(envelope, mineru_dir)
        except Exception as exc:  # noqa: BLE001 - a vision failure must not abort re-extraction
            logger.warning("seal-vision 跳过(异常): %s", exc)

    try:
        correct_evidence_pages(envelope, mineru_dir)
    except Exception as exc:  # noqa: BLE001 - a page-fix failure must not abort re-extraction
        logger.warning("page-fix 跳过(异常): %s", exc)

    # Identity reconciliation, same as step 2.7 of ingest_pdf; without it a
    # re-extraction would wipe previously found identity_issues and the two
    # code paths would diverge.
    try:
        registry = PartyRegistry.load(paths.known_parties_path)
        found_issues = registry.reconcile(envelope.person_identities, record.sha256)
        if registry.dirty:
            registry.save()
        envelope.identity_issues = found_issues
    except Exception as exc:  # noqa: BLE001 - a reconciliation failure must not abort re-extraction
        logger.warning("identity 跳过(异常): %s", exc)

    # Persist the fresh extraction files (generic envelope + confidence).
    envelope_json = envelope.model_dump_json(indent=2)
    (output_root / FILE_EXTRACTION).write_text(envelope_json, encoding="utf-8")
    confidence_json = confidence.model_dump_json(indent=2)
    (output_root / FILE_EXTRACTION_CONF).write_text(confidence_json, encoding="utf-8")

    update_extraction(
        conn,
        doc_id,
        status=status,
        llm_duration_s=llm_duration,
        error_message=error_message,
        extraction=extraction,
        confidence=confidence,
        envelope=envelope,
    )

    log_entry = {
        "ts": _utc_now(),
        "op": "re_extract",
        "doc_id": doc_id,
        "sha": record.sha256,
        "status": status,
        "llm_s": llm_duration,
        "error": error_message,
    }
    _append_jsonl(paths.ingest_log, log_entry)

    return IngestResult(
        pdf_path=Path(record.source_path),
        sha256=record.sha256,
        status=status,
        doc_id=doc_id,
        llm_duration_s=llm_duration,
        error_message=error_message,
        error=error_info or envelope.extraction_error,
    )
|
|
625
|
+
|
|
626
|
+
|
|
627
|
+
# ---------- 工具 ----------
|
|
628
|
+
|
|
629
|
+
|
|
630
|
+
def load_document_text(mineru_dir: Path) -> str:
    """Return the document text: the raw-text file (already cleaned) if present, otherwise the markdown file, otherwise ""."""
    candidates = (mineru_dir / FILE_RAW_TEXT, mineru_dir / FILE_MARKDOWN)
    for candidate in candidates:
        if candidate.exists():
            return candidate.read_text(encoding="utf-8")
    return ""
|
|
639
|
+
|
|
640
|
+
|
|
641
|
+
def _append_jsonl(path: Path, payload: dict) -> None:
|
|
642
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
643
|
+
with path.open("a", encoding="utf-8") as f:
|
|
644
|
+
f.write(json.dumps(payload, ensure_ascii=False) + "\n")
|
|
645
|
+
|
|
646
|
+
|
|
647
|
+
def _utc_now() -> str:
|
|
648
|
+
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
649
|
+
|
|
650
|
+
|
|
651
|
+
# ---------- 目录递归收集 PDF ----------
|
|
652
|
+
|
|
653
|
+
|
|
654
|
+
def discover_pdfs(path: Path) -> list[Path]:
    """
    Collect the PDF files denoted by ``path``.

    A file yields a one-element list. A directory is searched recursively for
    files whose suffix is ``.pdf`` in any letter case (matching the
    case-insensitive check applied to a single file); entries whose relative
    path has a component starting with ``.`` are skipped, and so are
    directories that merely carry a ``.pdf`` name.

    :param path: a PDF file or a directory to search
    :return: sorted list of resolved PDF paths
    :raises ValueError: if ``path`` is a file without a ``.pdf`` suffix
    :raises FileNotFoundError: if ``path`` is neither a file nor a directory
    """
    path = path.resolve()
    if path.is_file():
        if path.suffix.lower() != ".pdf":
            raise ValueError(f"not a PDF: {path}")
        return [path]
    if not path.is_dir():
        raise FileNotFoundError(path)
    return sorted(
        candidate
        for candidate in path.rglob("*")
        # Cheap suffix test first; the stat call in is_file() runs last.
        if candidate.suffix.lower() == ".pdf"
        and not any(part.startswith(".") for part in candidate.relative_to(path).parts)
        and candidate.is_file()
    )
|