contract-archive-cli 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. contract_archive/__init__.py +2 -0
  2. contract_archive/archive/__init__.py +64 -0
  3. contract_archive/archive/db.py +126 -0
  4. contract_archive/archive/ingest.py +667 -0
  5. contract_archive/archive/migrations/001_init.sql +62 -0
  6. contract_archive/archive/migrations/002_obligations.sql +25 -0
  7. contract_archive/archive/migrations/003_document_types.sql +31 -0
  8. contract_archive/archive/migrations/004_seals_subjects.sql +36 -0
  9. contract_archive/archive/migrations/005_completeness.sql +18 -0
  10. contract_archive/archive/party_registry.py +276 -0
  11. contract_archive/archive/paths.py +113 -0
  12. contract_archive/archive/repository.py +918 -0
  13. contract_archive/cli.py +455 -0
  14. contract_archive/cli_common.py +293 -0
  15. contract_archive/cli_config.py +96 -0
  16. contract_archive/cli_introspect.py +204 -0
  17. contract_archive/cli_party.py +166 -0
  18. contract_archive/cli_query.py +492 -0
  19. contract_archive/cli_render.py +575 -0
  20. contract_archive/config.py +257 -0
  21. contract_archive/errors.py +163 -0
  22. contract_archive/extraction/__init__.py +14 -0
  23. contract_archive/extraction/amount_check.py +87 -0
  24. contract_archive/extraction/contract_extractor.py +103 -0
  25. contract_archive/extraction/document_extractor.py +546 -0
  26. contract_archive/extraction/evidence_page_fix.py +99 -0
  27. contract_archive/extraction/llm_extractor.py +207 -0
  28. contract_archive/extraction/normalize.py +210 -0
  29. contract_archive/extraction/property_fee.py +79 -0
  30. contract_archive/extraction/vision_seal.py +390 -0
  31. contract_archive/pipelines/__init__.py +9 -0
  32. contract_archive/pipelines/mineru_pipeline.py +955 -0
  33. contract_archive/pipelines/vl_ocr.py +160 -0
  34. contract_archive/schemas/__init__.py +67 -0
  35. contract_archive/schemas/document.py +408 -0
  36. contract_archive/utils/__init__.py +27 -0
  37. contract_archive/utils/device.py +51 -0
  38. contract_archive/utils/http_env.py +54 -0
  39. contract_archive/utils/pdf.py +207 -0
  40. contract_archive_cli-0.2.7.dist-info/METADATA +386 -0
  41. contract_archive_cli-0.2.7.dist-info/RECORD +44 -0
  42. contract_archive_cli-0.2.7.dist-info/WHEEL +4 -0
  43. contract_archive_cli-0.2.7.dist-info/entry_points.txt +2 -0
  44. contract_archive_cli-0.2.7.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,667 @@
1
+ """
2
+ 单 PDF 入库流水线。
3
+
4
+ 流程(每个 PDF 一次调用):
5
+ 1) 流式 SHA256
6
+ 2) 查 documents.sha256 → 命中 + 非 reingest 直接 skip
7
+ 3) 在 tmp/<sha-short>/ 先留 source.pdf,再跑 OCR pipeline + 抽取
8
+ 4) 全成功后 os.rename(tmp → documents/<sha-short>/) 是事务边界
9
+ 5) DB 写入 documents + risk_clauses(单事务,由 repository 保证)
10
+ 6) 追加一行 ingest.jsonl 总日志
11
+ 7) 失败时:仍保留 documents/<sha-short>/source.pdf + ingest.log,记 status=failed
12
+
13
+ 状态语义:
14
+ - ok OCR + 抽取都成功
15
+ - partial OCR 成功但 LLM 失败 → markdown 可用,可后续 extract 命令重跑
16
+ - failed OCR 失败 → 没有 OCR 产物,但 source.pdf 留档可查
17
+ """
18
+ from __future__ import annotations
19
+
20
+ import json
21
+ import logging
22
+ import sqlite3
23
+ import time
24
+ import traceback
25
+ from dataclasses import dataclass
26
+ from datetime import datetime, timezone
27
+ from pathlib import Path
28
+ from typing import Optional
29
+
30
+ from ..errors import ErrorInfo, classify_exception, extract_empty, mineru_failed
31
+ from ..extraction import extract_contract, extract_document
32
+ from ..extraction.vision_seal import augment_completeness_with_vision
33
+ from ..extraction.evidence_page_fix import correct_evidence_pages
34
+ from ..pipelines import MinerUPipeline
35
+ from ..schemas import (
36
+ FILE_EXTRACTION,
37
+ FILE_EXTRACTION_CONF,
38
+ FILE_MARKDOWN,
39
+ FILE_RAW_TEXT,
40
+ ContractExtraction,
41
+ DocumentExtraction,
42
+ ExtractionConfidence,
43
+ )
44
+ from .party_registry import PartyRegistry
45
+ from .paths import ArchivePaths, SHA_SHORT_LEN, link_or_copy, safe_rmtree, sha256_of_file
46
+ from .repository import (
47
+ contract_to_envelope,
48
+ find_by_sha,
49
+ get_document,
50
+ insert_document,
51
+ replace_document,
52
+ update_extraction,
53
+ )
54
+
55
+ logger = logging.getLogger(__name__)
56
+
57
+
58
+ # ---------- 结果类型 ----------
59
+
60
+
61
@dataclass
class IngestResult:
    """Outcome of processing one PDF; the CLI aggregates and prints these."""

    # The PDF this result refers to.
    pdf_path: Path
    # SHA-256 digest string of the source PDF.
    sha256: str
    # One of: ok | partial | failed | skipped.
    status: str
    # documents.id of the row that was written, or of the pre-existing row.
    doc_id: Optional[int]
    # Wall-clock seconds spent in the OCR pipeline, when it ran.
    mineru_duration_s: Optional[float] = None
    # Wall-clock seconds spent in extraction, when it ran.
    llm_duration_s: Optional[float] = None
    # Human-readable error (the same text is stored in documents.error_message).
    error_message: Optional[str] = None
    # Structured error (only emitted by CLI --format json; not persisted).
    error: Optional[ErrorInfo] = None
    # Explanation shown to the user when status == "skipped".
    skipped_reason: Optional[str] = None
74
+
75
+
76
+ # ---------- 抽取调度(LLM-first) ----------
77
+
78
+
79
def _envelope_confidence(env: DocumentExtraction) -> float:
    """
    Heuristic overall confidence for non-contract documents.

    These documents are extracted LLM-first with no rule-based cross-check, so
    the score is derived from how much the envelope contains: nothing at all
    when both title and summary are missing, otherwise a base of 0.5 plus 0.1
    for each populated category (parties / amounts / fields / key dates),
    capped at 0.9.
    """
    if not (env.title or env.summary):
        return 0.0
    categories = (env.parties, env.amounts, env.fields, env.key_dates)
    populated = len([category for category in categories if category])
    return min(0.9, 0.5 + 0.1 * populated)
88
+
89
+
90
def _run_extraction(
    document_text: str, llm_enabled: bool
) -> tuple[ContractExtraction, ExtractionConfidence, DocumentExtraction]:
    """
    LLM-first extraction dispatcher.

    The document is first classified and a generic envelope is extracted; only
    when it is a contract is the contract-specific extraction run on top to
    fill the contract-only columns. (Contract extraction has been pure LLM
    since Phase 2; there is no rule/hybrid mode any more.)

    Returns ``(contract extraction, confidence, generic envelope)``; all three
    are handed to the repository together.
    """
    if not llm_enabled:
        # Without the LLM there is nothing to extract: rule extraction was
        # retired in Phase 2, so extract_contract(llm_enabled=False) yields an
        # empty object. Under --no-llm the fields stay blank (only the MinerU
        # output is archived) and can be filled later via `extract <id>`.
        contract, score = extract_contract(document_text, llm_enabled=False)
        return contract, score, contract_to_envelope(contract)

    envelope = extract_document(document_text, llm_enabled=llm_enabled)
    is_contract = envelope.doc_type == "合同协议" and llm_enabled

    if not is_contract:
        # Non-contract: no contract-only columns; overall confidence comes
        # from the envelope heuristic.
        score = ExtractionConfidence()
        score.overall = _envelope_confidence(envelope)
        return ContractExtraction(), score, envelope

    contract, score = extract_contract(document_text, llm_enabled=llm_enabled)
    # Prefer the obligations from the contract extraction: its dedicated
    # prompt separates obligations from penalties more finely.
    envelope.obligations = contract.obligations
    # Fall back to the envelope title when the contract extraction gave none.
    if envelope.title and not contract.contract_name:
        contract.contract_name = envelope.title
    return contract, score, envelope
117
+
118
+
119
def _ensure_archived_source(paths: ArchivePaths, sha: str, pdf_path: Path) -> Path:
    """
    Idempotently make sure the archive-controlled directory holds source.pdf.

    This also runs when a repeated ingest is skipped: if the archived copy was
    deleted by mistake, the current ingest puts the source PDF back at
    documents/<sha-short>/source.pdf.

    Returns the path of the archived source.pdf.
    """
    archived = paths.doc_dir(sha) / "source.pdf"
    if archived.exists():
        return archived
    link_or_copy(pdf_path, archived)
    return archived
130
+
131
+
132
+ # ---------- 入口 ----------
133
+
134
+
135
def ingest_pdf(
    pdf_path: Path,
    paths: ArchivePaths,
    conn: sqlite3.Connection,
    *,
    reingest: bool = False,
    llm_enabled: bool = True,
    pipeline: Optional[MinerUPipeline] = None,
) -> IngestResult:
    """
    Ingest a single PDF into the archive.

    The work happens in tmp/<sha-short>/ and is renamed into
    documents/<sha-short>/ before the database row is written.

    :param pdf_path: absolute or relative path of the PDF
    :param paths: archive root path object
    :param conn: an opened and already migrated sqlite3 connection
    :param reingest: when True, rerun the whole flow even if the sha256 is
        already archived (a previous ``failed`` row is retried automatically)
    :param llm_enabled: when False the LLM extraction is skipped, so the
        extracted fields stay empty (only the OCR output is archived) and can
        be filled later with ``extract <id>``; it is also passed as
        ``allow_vl_fallback`` when no pipeline is injected
    :param pipeline: injectable MinerUPipeline instance (reuses loaded models;
        should always be passed for batch runs)
    :returns: an IngestResult whose status is ok / partial / failed / skipped
    :raises FileNotFoundError: if ``pdf_path`` is not an existing file
    """
    pdf_path = pdf_path.resolve()
    if not pdf_path.is_file():
        raise FileNotFoundError(pdf_path)

    paths.ensure()

    logger.info("hashing %s", pdf_path.name)
    sha = sha256_of_file(pdf_path)
    sha_short = sha[:SHA_SHORT_LEN]
    logger.info("sha256=%s", sha_short)

    existing_id = find_by_sha(conn, sha)
    if existing_id and not reingest:
        prev = get_document(conn, existing_id)
        prev_status = prev.status if prev else None
        if prev_status == "failed":
            # A previous failure does not count as "already archived": running
            # again means the user wants a retry, so treat it as a reingest
            # instead of skipping and telling them to add --reingest.
            logger.info("sha=%s 上次 ingest 失败,自动重试", sha_short)
            reingest = True
        else:
            _ensure_archived_source(paths, sha, pdf_path)
            if prev_status == "partial":
                hint = f"(OCR 已完成、抽取未完成;用 `extract {existing_id}` 只重跑抽取,省去 OCR)"
            else:
                hint = "(已成功入库)"
            return IngestResult(
                pdf_path=pdf_path,
                sha256=sha,
                status="skipped",
                doc_id=existing_id,
                skipped_reason=f"sha256 已在档案库{hint};要强制重跑整条流程加 --reingest",
            )

    # Work inside tmp; rename to documents/<sha-short>/ once everything is done.
    tmp_doc_dir = paths.tmp_dir / sha_short
    safe_rmtree(tmp_doc_dir)
    tmp_doc_dir.mkdir(parents=True, exist_ok=True)
    mineru_dir = tmp_doc_dir / "mineru"

    # Per-document plain-text log, complementary to the archive-wide jsonl.
    log_path = tmp_doc_dir / "ingest.log"
    log_handle = log_path.open("w", encoding="utf-8")
    # Everything after the open runs under try/finally so the handle is closed
    # even when writing the header or linking the source PDF fails.
    try:
        log_handle.write(f"# ingest started at {_utc_now()}\n# pdf={pdf_path}\n")
        link_strategy = link_or_copy(pdf_path, tmp_doc_dir / "source.pdf")
        log_handle.write(f"[source.pdf] {link_strategy}ed from {pdf_path}\n")

        mineru_duration: Optional[float] = None
        llm_duration: Optional[float] = None
        extraction: Optional[ContractExtraction] = None
        confidence: Optional[ExtractionConfidence] = None
        envelope: Optional[DocumentExtraction] = None
        error_message: Optional[str] = None
        error_info: Optional[ErrorInfo] = None
        status = "ok"

        # ---- 1. OCR ----
        pl = pipeline or MinerUPipeline(allow_vl_fallback=llm_enabled)
        t0 = time.perf_counter()
        try:
            pl.run(pdf_path, mineru_dir)
            mineru_duration = time.perf_counter() - t0
            log_handle.write(f"\n[ocr] OK in {mineru_duration:.2f}s\n")
        except Exception as e:
            mineru_duration = time.perf_counter() - t0
            status = "failed"
            error_message = f"ocr: {e}"
            log_handle.write(f"\n[ocr] FAILED: {error_message}\n")
            log_handle.write(traceback.format_exc())
            # _commit_failed closes the handle itself; the close in the
            # finally clause below is then a no-op.
            return _commit_failed(
                conn=conn,
                paths=paths,
                pdf_path=pdf_path,
                sha=sha,
                tmp_doc_dir=tmp_doc_dir,
                log_handle=log_handle,
                existing_id=existing_id,
                mineru_duration=mineru_duration,
                error_message=error_message,
                error=mineru_failed(str(e)),
            )

        # ---- 2. Extraction (prefers raw_text.txt from the OCR output) ----
        document_text = load_document_text(mineru_dir)
        if not document_text:
            log_handle.write("\n[extract] WARNING: no text found in mineru output\n")
        t1 = time.perf_counter()
        try:
            extraction, confidence, envelope = _run_extraction(
                document_text, llm_enabled=llm_enabled
            )
            llm_duration = time.perf_counter() - t1
            log_handle.write(
                f"[extract] OK in {llm_duration:.2f}s (doc_type={envelope.doc_type})\n"
            )
            # Empty-extraction guard: the LLM was enabled yet nothing came
            # back (most often a missing DASHSCOPE_API_KEY: a globally
            # installed tool needs it exported in the shell and does not read
            # the project .env). Do not silently report "ok".
            if (
                llm_enabled
                and not extraction.contract_name
                and not envelope.title
                and not envelope.fields
                and not envelope.amounts
                and not envelope.seals
            ):
                status = "partial"
                # Prefer the structured error surfaced by the envelope (it
                # distinguishes missing key / rate limit / network); fall back
                # to EXTRACT_EMPTY. error_message stays the human-readable hint.
                error_info = envelope.extraction_error or extract_empty("LLM 抽取为空")
                error_message = (
                    "LLM 抽取为空——通常是缺 DASHSCOPE_API_KEY(全局工具需在 shell "
                    "export,不读项目 .env)或 LLM 调用失败;补好后用 `extract <id>` 重抽"
                )
                log_handle.write(f"\n[extract] WARNING: {error_message}\n")
        except Exception as e:
            # OCR succeeded, so an extraction failure only degrades to
            # "partial"; empty objects are stored so a later extract can rerun.
            llm_duration = time.perf_counter() - t1
            status = "partial"
            error_message = f"extract: {e}"
            error_info = classify_exception(e)
            extraction = ContractExtraction()
            confidence = ExtractionConfidence()
            envelope = DocumentExtraction()
            log_handle.write(f"\n[extract] FAILED (status=partial): {error_message}\n")
            log_handle.write(traceback.format_exc())

        # ---- 2.5 Multimodal seal check: let the signature-page image
        # override the text-based seal judgement ----
        if status != "failed" and llm_enabled:
            try:
                if augment_completeness_with_vision(envelope, mineru_dir):
                    log_handle.write("[seal-vision] 签章核查完成(看落款页图)\n")
            except Exception as e:  # noqa: BLE001 - a VL failure must not abort ingest
                log_handle.write(f"[seal-vision] 跳过(异常): {e}\n")

        # ---- 2.6 Evidence page fix: override LLM-guessed page numbers with
        # the page_idx from content_list ----
        try:
            if correct_evidence_pages(envelope, mineru_dir):
                log_handle.write("[page-fix] 出处页码已据 content_list 校正\n")
        except Exception as e:  # noqa: BLE001 - a page-fix failure must not abort ingest
            log_handle.write(f"[page-fix] 跳过(异常): {e}\n")

        # ---- 2.7 Identity reconciliation against the known_parties baseline:
        # first sighting is recorded as the baseline, later sightings are
        # compared with it and mismatches are reported as identity issues ----
        try:
            registry = PartyRegistry.load(paths.known_parties_path)
            id_issues = registry.reconcile(envelope.person_identities, sha)
            if registry.dirty:
                registry.save()
            envelope.identity_issues = id_issues
            if id_issues:
                log_handle.write(f"[identity] 身份核对:{len(id_issues)} 项与基准不一致\n")
            elif envelope.person_identities:
                log_handle.write("[identity] 身份核对:与基准一致(或首见已入库)\n")
        except Exception as e:  # noqa: BLE001 - a reconcile failure must not abort ingest
            log_handle.write(f"[identity] 跳过(异常): {e}\n")

        # ---- 3. Persist the extraction files (the generic envelope; written
        # even for partial so a later extract can rerun) ----
        (tmp_doc_dir / FILE_EXTRACTION).write_text(
            envelope.model_dump_json(indent=2), encoding="utf-8"
        )
        (tmp_doc_dir / FILE_EXTRACTION_CONF).write_text(
            confidence.model_dump_json(indent=2), encoding="utf-8"
        )

        # ---- 4. Transaction boundary: rename tmp -> documents/<sha-short>/ ----
        final_doc_dir = paths.doc_dir(sha)
        safe_rmtree(final_doc_dir)
        final_doc_dir.parent.mkdir(parents=True, exist_ok=True)
        tmp_doc_dir.rename(final_doc_dir)
        # The directory has moved, so close the handle and reopen the log at
        # its new path in append mode for the remaining entries.
        log_handle.close()
        log_handle = (final_doc_dir / "ingest.log").open("a", encoding="utf-8")

        # ---- 5. Database write ----
        if existing_id:
            replace_document(
                conn,
                existing_id,
                source_path=str(pdf_path),
                output_dir=str(final_doc_dir),
                status=status,
                mineru_duration_s=mineru_duration,
                llm_duration_s=llm_duration,
                error_message=error_message,
                extraction=extraction,
                confidence=confidence,
                envelope=envelope,
            )
            doc_id = existing_id
            log_handle.write(f"\n[db] replaced id={doc_id} status={status}\n")
        else:
            doc_id = insert_document(
                conn,
                sha256=sha,
                source_path=str(pdf_path),
                output_dir=str(final_doc_dir),
                status=status,
                mineru_duration_s=mineru_duration,
                llm_duration_s=llm_duration,
                error_message=error_message,
                extraction=extraction,
                confidence=confidence,
                envelope=envelope,
            )
            # Rare race: another worker inserted this sha between our hash
            # lookup and the insert.
            if doc_id is None:
                doc_id = find_by_sha(conn, sha)
                log_handle.write(
                    f"\n[db] race: sha already inserted by peer, reusing id={doc_id}\n"
                )
            else:
                log_handle.write(f"\n[db] inserted id={doc_id} status={status}\n")

        # ---- 6. Archive-wide jsonl log ----
        _append_jsonl(
            paths.ingest_log,
            {
                "ts": _utc_now(),
                "pdf": str(pdf_path),
                "sha": sha,
                "doc_id": doc_id,
                "status": status,
                "mineru_s": mineru_duration,
                "llm_s": llm_duration,
                "error": error_message,
            },
        )

        return IngestResult(
            pdf_path=pdf_path,
            sha256=sha,
            status=status,
            doc_id=doc_id,
            mineru_duration_s=mineru_duration,
            llm_duration_s=llm_duration,
            error_message=error_message,
            error=error_info,
        )
    finally:
        # Best effort: a failing close must not mask the result or the
        # exception already propagating from the body.
        try:
            log_handle.close()
        except Exception:  # noqa: BLE001
            logger.debug("closing ingest.log failed", exc_info=True)
398
+
399
+
400
def _commit_failed(
    *,
    conn: sqlite3.Connection,
    paths: ArchivePaths,
    pdf_path: Path,
    sha: str,
    tmp_doc_dir: Path,
    log_handle,
    existing_id: Optional[int],
    mineru_duration: Optional[float],
    error_message: str,
    error: Optional[ErrorInfo] = None,
) -> IngestResult:
    """
    Finish an ingest whose OCR step failed.

    A ``status=failed`` row is still written and source.pdf is kept inside the
    archive.

    If a document that was previously ok/partial fails on a forced reingest,
    its old output directory is kept and only this run's log is moved to the
    archive root. For a new document, or one whose previous state was already
    failed, the tmp directory is committed as documents/<sha-short>/ so that at
    least source.pdf and ingest.log remain.

    :param log_handle: the open per-document log; it is closed here so the
        file can be moved
    :param existing_id: documents.id of a pre-existing row for this sha, if any
    :param error: structured error forwarded unchanged into the result
    :returns: an IngestResult with ``status="failed"``
    """
    # Close before renaming/moving the log file.
    log_handle.close()

    final_doc_dir = paths.doc_dir(sha)
    existing = get_document(conn, existing_id) if existing_id else None
    old_output_dir = Path(existing.output_dir) if existing and existing.output_dir else None
    keep_old_outputs = (
        existing is not None
        and existing.status in {"ok", "partial"}
        and old_output_dir is not None
        and old_output_dir.exists()
    )

    failed_log: Optional[Path]
    if keep_old_outputs:
        # The old OCR output is still usable and must not be overwritten by one
        # failed reingest; still make sure the archived PDF is in place.
        _ensure_archived_source(paths, sha, pdf_path)
        failed_log = paths.root / f"failed_{sha[:SHA_SHORT_LEN]}_{int(time.time())}.log"
        try:
            (tmp_doc_dir / "ingest.log").rename(failed_log)
        except OSError:
            # The log could not be moved; report no log path rather than a
            # path that does not exist.
            failed_log = None
        safe_rmtree(tmp_doc_dir)
        output_dir = str(old_output_dir)
    else:
        safe_rmtree(final_doc_dir)
        final_doc_dir.parent.mkdir(parents=True, exist_ok=True)
        tmp_doc_dir.rename(final_doc_dir)
        output_dir = str(final_doc_dir)
        failed_log = final_doc_dir / "ingest.log"

    if existing_id:
        replace_document(
            conn,
            existing_id,
            source_path=str(pdf_path),
            output_dir=output_dir,
            status="failed",
            mineru_duration_s=mineru_duration,
            llm_duration_s=None,
            error_message=error_message,
            extraction=ContractExtraction(),
            confidence=ExtractionConfidence(),
        )
        doc_id = existing_id
    else:
        doc_id = insert_document(
            conn,
            sha256=sha,
            source_path=str(pdf_path),
            output_dir=output_dir,
            status="failed",
            mineru_duration_s=mineru_duration,
            llm_duration_s=None,
            error_message=error_message,
            extraction=ContractExtraction(),
            confidence=ExtractionConfidence(),
        )
        # Same race handling as ingest_pdf: insert_document returns None when
        # a peer inserted this sha first, so reuse the peer's row id instead
        # of reporting doc_id=None.
        if doc_id is None:
            doc_id = find_by_sha(conn, sha)

    _append_jsonl(
        paths.ingest_log,
        {
            "ts": _utc_now(),
            "pdf": str(pdf_path),
            "sha": sha,
            "doc_id": doc_id,
            "status": "failed",
            "mineru_s": mineru_duration,
            "llm_s": None,
            "error": error_message,
            "log_path": str(failed_log) if failed_log else None,
        },
    )

    return IngestResult(
        pdf_path=pdf_path,
        sha256=sha,
        status="failed",
        doc_id=doc_id,
        mineru_duration_s=mineru_duration,
        error_message=error_message,
        error=error,
    )
502
+
503
+
504
+ # ---------- 复跑抽取(partial 状态修复,不重跑 OCR) ----------
505
+
506
+
507
def re_extract(
    doc_id: int,
    paths: ArchivePaths,
    conn: sqlite3.Connection,
    *,
    llm_enabled: bool = True,
) -> IngestResult:
    """
    Rerun extraction on top of the existing MinerU output.

    Used to repair ``partial`` documents or to re-extract in bulk after a
    prompt change. The MinerU output is not touched.

    :param doc_id: documents.id of the document to re-extract
    :param paths: archive root path object
    :param conn: an opened sqlite3 connection
    :param llm_enabled: forwarded to the extraction step; also gates the
        multimodal seal check
    :returns: an IngestResult with status ok or partial
    :raises ValueError: if no document with ``doc_id`` exists
    :raises FileNotFoundError: if the document has no recorded output
        directory or its mineru directory is missing
    """
    doc = get_document(conn, doc_id)
    if not doc:
        raise ValueError(f"document id={doc_id} not found")
    if not doc.output_dir:
        # Path(None) would raise an opaque TypeError; report it the same way
        # as a missing OCR directory.
        raise FileNotFoundError(
            f"mineru output missing for id={doc_id}: output_dir is not recorded"
        )
    output_dir = Path(doc.output_dir)
    mineru_dir = output_dir / "mineru"
    if not mineru_dir.exists():
        raise FileNotFoundError(
            f"mineru output missing for id={doc_id}: {mineru_dir}"
        )

    document_text = load_document_text(mineru_dir)
    t0 = time.perf_counter()
    error_message: Optional[str] = None
    error_info: Optional[ErrorInfo] = None
    status = "ok"
    envelope = DocumentExtraction()
    try:
        extraction, confidence, envelope = _run_extraction(
            document_text, llm_enabled=llm_enabled
        )
        # Empty-extraction guard (aligned with ingest_pdf): the LLM was enabled
        # yet nothing was extracted (most often a missing key). Do not report
        # "ok"; give a structured signal based on envelope.extraction_error.
        if (
            llm_enabled
            and not extraction.contract_name
            and not envelope.title
            and not envelope.fields
            and not envelope.amounts
            and not envelope.seals
        ):
            status = "partial"
            error_info = envelope.extraction_error or extract_empty("LLM 抽取为空")
            error_message = (
                "LLM 抽取为空——通常是缺 DASHSCOPE_API_KEY 或 LLM 调用失败;"
                "补好后重跑 `extract <id>`"
            )
    except Exception as e:
        # Degrade to partial and store empty objects so the run can be retried.
        status = "partial"
        error_message = f"extract: {e}"
        error_info = classify_exception(e)
        extraction = ContractExtraction()
        confidence = ExtractionConfidence()
        envelope = DocumentExtraction()
    llm_duration = time.perf_counter() - t0

    # Multimodal seal check: re-judge seals from the signature-page image.
    if llm_enabled and status == "ok":
        try:
            augment_completeness_with_vision(envelope, mineru_dir)
        except Exception as e:  # noqa: BLE001 - a VL failure must not abort re-extraction
            logger.warning("seal-vision 跳过(异常): %s", e)
    try:
        correct_evidence_pages(envelope, mineru_dir)
    except Exception as e:  # noqa: BLE001 - a page-fix failure must not abort re-extraction
        logger.warning("page-fix 跳过(异常): %s", e)

    # Identity reconciliation, same as step 2.7 of ingest: without it a
    # re-extraction would wipe previously found identity_issues and make
    # ingest and extract behave differently.
    try:
        registry = PartyRegistry.load(paths.known_parties_path)
        id_issues = registry.reconcile(envelope.person_identities, doc.sha256)
        if registry.dirty:
            registry.save()
        envelope.identity_issues = id_issues
    except Exception as e:  # noqa: BLE001 - a reconcile failure must not abort re-extraction
        logger.warning("identity 跳过(异常): %s", e)

    # Write the new extraction files (generic envelope + confidence).
    (output_dir / FILE_EXTRACTION).write_text(
        envelope.model_dump_json(indent=2), encoding="utf-8"
    )
    (output_dir / FILE_EXTRACTION_CONF).write_text(
        confidence.model_dump_json(indent=2), encoding="utf-8"
    )

    update_extraction(
        conn,
        doc_id,
        status=status,
        llm_duration_s=llm_duration,
        error_message=error_message,
        extraction=extraction,
        confidence=confidence,
        envelope=envelope,
    )

    _append_jsonl(
        paths.ingest_log,
        {
            "ts": _utc_now(),
            "op": "re_extract",
            "doc_id": doc_id,
            "sha": doc.sha256,
            "status": status,
            "llm_s": llm_duration,
            "error": error_message,
        },
    )

    return IngestResult(
        pdf_path=Path(doc.source_path),
        sha256=doc.sha256,
        status=status,
        doc_id=doc_id,
        llm_duration_s=llm_duration,
        error_message=error_message,
        error=error_info or envelope.extraction_error,
    )
625
+
626
+
627
+ # ---------- 工具 ----------
628
+
629
+
630
def load_document_text(mineru_dir: Path) -> str:
    """Return the document text: raw_text.txt (already cleaned) first, markdown.md as fallback.

    Returns an empty string when neither file exists in ``mineru_dir``.
    """
    # Candidates in priority order; the first one that exists wins.
    for filename in (FILE_RAW_TEXT, FILE_MARKDOWN):
        candidate = mineru_dir / filename
        if candidate.exists():
            return candidate.read_text(encoding="utf-8")
    return ""
639
+
640
+
641
def _append_jsonl(path: Path, payload: dict) -> None:
    """Append ``payload`` as one JSON line to ``path``, creating parent directories as needed.

    Non-ASCII characters are written as-is (``ensure_ascii=False``), UTF-8 encoded.
    """
    log_dir = path.parent
    log_dir.mkdir(parents=True, exist_ok=True)
    with path.open("a", encoding="utf-8") as sink:
        record = json.dumps(payload, ensure_ascii=False)
        sink.write(record + "\n")
645
+
646
+
647
def _utc_now() -> str:
    """Return the current UTC time as a second-resolution ``YYYY-MM-DDTHH:MM:SSZ`` string."""
    moment = datetime.now(tz=timezone.utc)
    return f"{moment:%Y-%m-%dT%H:%M:%SZ}"
649
+
650
+
651
+ # ---------- 目录递归收集 PDF ----------
652
+
653
+
654
def discover_pdfs(path: Path) -> list[Path]:
    """
    Collect the PDFs designated by ``path``.

    A file yields a single-element list. A directory is searched recursively
    for regular files whose suffix is ``.pdf`` in any letter case, skipping
    anything that is hidden or lives under a hidden directory (a path
    component starting with ``.``). The result is sorted.

    :raises ValueError: if ``path`` is a file without a ``.pdf`` suffix
    :raises FileNotFoundError: if ``path`` is neither a file nor a directory
    """
    root = path.resolve()
    if root.is_file():
        if root.suffix.lower() != ".pdf":
            raise ValueError(f"not a PDF: {root}")
        return [root]
    if not root.is_dir():
        raise FileNotFoundError(root)

    def _is_visible(candidate: Path) -> bool:
        # Only components below the search root count, so a hidden ancestor of
        # the root itself does not exclude everything.
        return not any(part.startswith(".") for part in candidate.relative_to(root).parts)

    # Match the suffix case-insensitively (consistent with the single-file
    # branch above) and keep regular files only, so a directory that happens
    # to be named "*.pdf" is not handed to the ingest step.
    return sorted(
        candidate
        for candidate in root.rglob("*")
        if candidate.suffix.lower() == ".pdf"
        and _is_visible(candidate)
        and candidate.is_file()
    )