contract-archive-cli 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. contract_archive/__init__.py +2 -0
  2. contract_archive/archive/__init__.py +64 -0
  3. contract_archive/archive/db.py +126 -0
  4. contract_archive/archive/ingest.py +667 -0
  5. contract_archive/archive/migrations/001_init.sql +62 -0
  6. contract_archive/archive/migrations/002_obligations.sql +25 -0
  7. contract_archive/archive/migrations/003_document_types.sql +31 -0
  8. contract_archive/archive/migrations/004_seals_subjects.sql +36 -0
  9. contract_archive/archive/migrations/005_completeness.sql +18 -0
  10. contract_archive/archive/party_registry.py +276 -0
  11. contract_archive/archive/paths.py +113 -0
  12. contract_archive/archive/repository.py +918 -0
  13. contract_archive/cli.py +455 -0
  14. contract_archive/cli_common.py +293 -0
  15. contract_archive/cli_config.py +96 -0
  16. contract_archive/cli_introspect.py +204 -0
  17. contract_archive/cli_party.py +166 -0
  18. contract_archive/cli_query.py +492 -0
  19. contract_archive/cli_render.py +575 -0
  20. contract_archive/config.py +257 -0
  21. contract_archive/errors.py +163 -0
  22. contract_archive/extraction/__init__.py +14 -0
  23. contract_archive/extraction/amount_check.py +87 -0
  24. contract_archive/extraction/contract_extractor.py +103 -0
  25. contract_archive/extraction/document_extractor.py +546 -0
  26. contract_archive/extraction/evidence_page_fix.py +99 -0
  27. contract_archive/extraction/llm_extractor.py +207 -0
  28. contract_archive/extraction/normalize.py +210 -0
  29. contract_archive/extraction/property_fee.py +79 -0
  30. contract_archive/extraction/vision_seal.py +390 -0
  31. contract_archive/pipelines/__init__.py +9 -0
  32. contract_archive/pipelines/mineru_pipeline.py +955 -0
  33. contract_archive/pipelines/vl_ocr.py +160 -0
  34. contract_archive/schemas/__init__.py +67 -0
  35. contract_archive/schemas/document.py +408 -0
  36. contract_archive/utils/__init__.py +27 -0
  37. contract_archive/utils/device.py +51 -0
  38. contract_archive/utils/http_env.py +54 -0
  39. contract_archive/utils/pdf.py +207 -0
  40. contract_archive_cli-0.2.7.dist-info/METADATA +386 -0
  41. contract_archive_cli-0.2.7.dist-info/RECORD +44 -0
  42. contract_archive_cli-0.2.7.dist-info/WHEEL +4 -0
  43. contract_archive_cli-0.2.7.dist-info/entry_points.txt +2 -0
  44. contract_archive_cli-0.2.7.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,918 @@
1
+ """
2
+ 档案库 DAO(数据访问层)。
3
+
4
+ 只暴露业务操作,调用方不直接拼 SQL。所有写操作:
5
+ - 显式事务(with transaction(conn))
6
+ - INSERT documents 用 ON CONFLICT(sha256) DO NOTHING 避免吃 autoincrement seq
7
+ - reingest 时 risk_clauses 先 DELETE 再批量 INSERT,同一事务
8
+
9
+ 不引入 ORM —— 单表项目,dict ↔ row 手写更轻。
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import logging
14
+ import sqlite3
15
+ from dataclasses import dataclass, field
16
+ from typing import Any, Iterable, Optional
17
+
18
+ from ..schemas import (
19
+ ContractExtraction,
20
+ DocumentExtraction,
21
+ ExtractionConfidence,
22
+ LabeledAmount,
23
+ LabeledDate,
24
+ ObligationItem,
25
+ Seal,
26
+ )
27
+ from .db import transaction, utc_now_iso
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
+ # ---------- 类型 ----------
33
+
34
+
35
@dataclass
class DocumentRow:
    """One archive record: a ``documents`` row plus its child-table aggregates.

    Scalar fields mirror the ``documents`` columns read by
    ``_row_to_document``; ``risk_clauses`` and ``obligations`` are supplied
    separately by the caller that builds the row.
    """

    id: int
    sha256: str
    source_path: str
    output_dir: str
    ingested_at: str
    mineru_duration_s: Optional[float]
    llm_duration_s: Optional[float]
    status: str
    error_message: Optional[str]
    # Generic envelope columns (filled for every document type).
    doc_type: str
    title: Optional[str]
    summary: Optional[str]
    primary_date: Optional[str]
    primary_amount_cents: Optional[int]
    details_json: Optional[str]
    # Contract-only columns (filled when doc_type is 合同协议, otherwise NULL).
    contract_name: Optional[str]
    party_a: Optional[str]
    party_b: Optional[str]
    amount_text: Optional[str]
    amount_cents: Optional[int]
    sign_date: Optional[str]
    expire_date: Optional[str]
    auto_renewal: Optional[int]
    overall_confidence: Optional[float]
    # Completeness-check status (only judged for contracts, otherwise NULL).
    # The full result lives under details_json["completeness"]; this column
    # exists so `list --incomplete` can filter in a WHERE clause. It defaults
    # to None so older construction sites keep working.
    completeness_status: Optional[str] = None
    risk_clauses: list[str] = field(default_factory=list)
    obligations: list[ObligationItem] = field(default_factory=list)

    @property
    def primary_amount_value(self) -> Optional[float]:
        """Return ``primary_amount_cents`` divided by 100, or None if unset."""
        if self.primary_amount_cents is None:
            return None
        return self.primary_amount_cents / 100.0

    def details(self) -> dict:
        """Parse ``details_json`` into a dict.

        Returns an empty dict when the column is empty, is not valid JSON,
        or decodes to something other than a JSON object (list, string,
        number, ``null``), so callers can always use ``.get()`` on the result.
        """
        import json

        if not self.details_json:
            return {}
        try:
            parsed = json.loads(self.details_json)
        except (json.JSONDecodeError, TypeError):
            return {}
        # json.loads happily returns lists/scalars/None; honour the `-> dict`
        # contract instead of leaking those to callers.
        return parsed if isinstance(parsed, dict) else {}

    @property
    def amount_value(self) -> Optional[float]:
        """Return ``amount_cents`` divided by 100, for display; None if unset."""
        if self.amount_cents is None:
            return None
        return self.amount_cents / 100.0

    @property
    def short_sha(self) -> str:
        """Return the first 12 characters of ``sha256``."""
        return self.sha256[:12]
93
+
94
+
95
+ # ---------- 工具 ----------
96
+
97
+
98
def _amount_to_cents(value: Optional[float]) -> Optional[int]:
    """Convert an amount in yuan to integer cents; ``None`` passes through.

    Rounds half-up to the nearest cent using decimal arithmetic on the
    float's shortest repr, so values such as ``1.005`` become ``101`` rather
    than ``100`` (binary-float error plus banker's rounding in ``round``).

    Non-finite inputs (NaN / infinity) fail exactly as the plain float
    conversion does (``ValueError`` / ``OverflowError``).
    """
    if value is None:
        return None

    import math
    from decimal import ROUND_HALF_UP, Decimal

    if not math.isfinite(value):
        # Keep the historical failure mode for NaN/inf instead of surfacing
        # a decimal-specific exception type.
        return int(round(value * 100))

    # str(float) yields the shortest round-tripping decimal, i.e. what the
    # extractor actually meant; multiplying that by 100 is exact in Decimal.
    cents = (Decimal(str(value)) * 100).to_integral_value(rounding=ROUND_HALF_UP)
    return int(cents)
103
+
104
+
105
def _completeness_status(env: DocumentExtraction) -> Optional[str]:
    """Return ``env.completeness.status`` for the indexable column.

    Yields ``None`` when the envelope carries no (truthy) completeness result.
    """
    if not env.completeness:
        return None
    return env.completeness.status
108
+
109
+
110
def contract_to_envelope(ext: ContractExtraction) -> DocumentExtraction:
    """Derive a generic document envelope from a contract extraction.

    Used when the caller did not supply an envelope, so that contract rows
    still get the unified doc_type / title / primary_* fields.
    """
    if ext.amount:
        amounts = [
            LabeledAmount(label="合同金额", text=ext.amount, value=ext.amount_value)
        ]
    else:
        amounts = []

    # Only dates that are actually present become labeled key dates,
    # signing date first, expiry second.
    dated = (("签订日", ext.sign_date), ("到期日", ext.expire_date))
    key_dates = [LabeledDate(label=tag, date=day) for tag, day in dated if day]

    parties = [name for name in (ext.party_a, ext.party_b) if name]

    return DocumentExtraction(
        doc_type="合同协议",
        title=ext.contract_name,
        summary=ext.contract_name,
        parties=parties,
        primary_date=ext.sign_date,
        primary_amount_text=ext.amount,
        primary_amount_value=ext.amount_value,
        key_dates=key_dates,
        amounts=amounts,
        obligations=ext.obligations,
    )
136
+
137
+
138
def _row_to_document(
    row: sqlite3.Row,
    risks: list[str],
    obligations: list[ObligationItem],
) -> DocumentRow:
    """Build a ``DocumentRow`` from a ``documents`` row and its child data.

    Each listed column is copied to the dataclass field of the same name;
    ``risks`` and ``obligations`` are attached as given.
    """
    column_names = (
        "id",
        "sha256",
        "source_path",
        "output_dir",
        "ingested_at",
        "mineru_duration_s",
        "llm_duration_s",
        "status",
        "error_message",
        "doc_type",
        "title",
        "summary",
        "primary_date",
        "primary_amount_cents",
        "details_json",
        "contract_name",
        "party_a",
        "party_b",
        "amount_text",
        "amount_cents",
        "sign_date",
        "expire_date",
        "auto_renewal",
        "overall_confidence",
        "completeness_status",
    )
    scalars = {name: row[name] for name in column_names}
    return DocumentRow(risk_clauses=risks, obligations=obligations, **scalars)
172
+
173
+
174
+ # ---------- 查询 ----------
175
+
176
+
177
def find_by_sha(conn: sqlite3.Connection, sha256: str) -> Optional[int]:
    """Look up a document id by exact sha256; ``None`` when no row matches.

    Requires a connection whose rows support access by column name.
    """
    cursor = conn.execute("SELECT id FROM documents WHERE sha256 = ?", (sha256,))
    hit = cursor.fetchone()
    if hit:
        return hit["id"]
    return None
183
+
184
+
185
def get_document(conn: sqlite3.Connection, doc_id: int) -> Optional[DocumentRow]:
    """Fetch one document by primary key, hydrated with its child rows.

    Returns ``None`` when no ``documents`` row has that id.
    """
    cursor = conn.execute("SELECT * FROM documents WHERE id = ?", (doc_id,))
    record = cursor.fetchone()
    return _hydrate(conn, record) if record else None
192
+
193
+
194
def find_by_sha_prefix(
    conn: sqlite3.Connection, prefix: str
) -> list[DocumentRow]:
    """Find documents whose sha256 starts with ``prefix`` (used by ``show``).

    Results are ordered newest-ingested first and fully hydrated.

    The prefix is matched literally: LIKE wildcards (``%``, ``_``) in the
    input are escaped, so a prefix such as ``"____"`` can no longer match
    every row in the table.

    Raises:
        ValueError: if ``prefix`` is shorter than 4 characters.
    """
    if len(prefix) < 4:
        raise ValueError("sha prefix must be >= 4 chars to disambiguate")
    # Escape the escape character first, then the two LIKE wildcards.
    literal = (
        prefix.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    )
    rows = conn.execute(
        "SELECT * FROM documents WHERE sha256 LIKE ? ESCAPE '\\' "
        "ORDER BY ingested_at DESC",
        (literal + "%",),
    ).fetchall()
    return [_hydrate(conn, r) for r in rows]
208
+
209
+
210
def _hydrate(conn: sqlite3.Connection, row: sqlite3.Row) -> DocumentRow:
    """Assemble a ``DocumentRow`` from a main-table row plus child-table data."""
    doc_id = row["id"]
    risks = _load_risks(conn, doc_id)
    duties = _load_obligations(conn, doc_id)
    return _row_to_document(row, risks, duties)
217
+
218
+
219
def _load_risks(conn: sqlite3.Connection, doc_id: int) -> list[str]:
    """Return the risk clause texts of one document, in insertion (id) order."""
    query = "SELECT clause_text FROM risk_clauses WHERE doc_id = ? ORDER BY id"
    texts: list[str] = []
    for record in conn.execute(query, (doc_id,)).fetchall():
        texts.append(record["clause_text"])
    return texts
225
+
226
+
227
@dataclass
class TodoItem:
    """Cross-contract obligation view: one row returned by ``list_obligations``."""

    obligation_id: int  # obligations.id
    doc_id: int  # id of the owning documents row
    contract_name: Optional[str]  # taken from the joined documents row
    party_a: Optional[str]
    party_b: Optional[str]
    actor: str
    action: str
    deadline: Optional[str]  # None for undated obligations
    evidence: str  # list_obligations substitutes "" for NULL
240
+
241
+
242
def list_obligations(
    conn: sqlite3.Connection,
    *,
    actor: Optional[str] = None,
    before: Optional[str] = None,
    after: Optional[str] = None,
    include_undated: bool = False,
    limit: int = 50,
) -> list[TodoItem]:
    """List obligations across all contracts (the to-do board).

    By default only obligations with a deadline are returned, ordered by
    deadline ascending. With ``include_undated=True`` undated obligations are
    returned as well, sorted after the dated ones.

    Raises:
        ValueError: if ``actor`` is given but is not party_a/party_b/both.
    """
    if actor and actor not in ("party_a", "party_b", "both"):
        raise ValueError(f"actor must be party_a/party_b/both, got {actor!r}")

    # Each entry pairs a WHERE fragment with the values bound to its
    # placeholders, keeping SQL text and parameters in lockstep.
    filters: list[tuple[str, tuple]] = []
    if not include_undated:
        filters.append(("o.deadline IS NOT NULL", ()))
    if actor:
        filters.append(("o.actor = ?", (actor,)))
    if before:
        filters.append(("(o.deadline IS NOT NULL AND o.deadline <= ?)", (before,)))
    if after:
        filters.append(("(o.deadline IS NOT NULL AND o.deadline >= ?)", (after,)))

    query = """
        SELECT o.id AS oid, o.doc_id, o.actor, o.action, o.deadline, o.evidence,
               d.contract_name, d.party_a, d.party_b
        FROM obligations o JOIN documents d ON d.id = o.doc_id
    """
    if filters:
        query += " WHERE " + " AND ".join(fragment for fragment, _ in filters)
    # `(deadline IS NULL)` sorts 0 before 1, pushing NULL deadlines to the end.
    query += " ORDER BY (o.deadline IS NULL), o.deadline ASC, o.doc_id LIMIT ?"

    bind: list[Any] = [value for _, values in filters for value in values]
    bind.append(limit)

    todo: list[TodoItem] = []
    for rec in conn.execute(query, bind).fetchall():
        todo.append(
            TodoItem(
                obligation_id=rec["oid"],
                doc_id=rec["doc_id"],
                contract_name=rec["contract_name"],
                party_a=rec["party_a"],
                party_b=rec["party_b"],
                actor=rec["actor"],
                action=rec["action"],
                deadline=rec["deadline"],
                evidence=rec["evidence"] or "",
            )
        )
    return todo
299
+
300
+
301
@dataclass
class SealRow:
    """Cross-document seal view: one row returned by ``list_seals``."""

    seal_id: int  # document_seals.id
    doc_id: int  # id of the owning documents row
    title: Optional[str]  # document title: COALESCE(title, contract_name)
    owner: Optional[str]
    seal_type: Optional[str]
    raw_text: str
311
+
312
+
313
def list_seals(
    conn: sqlite3.Connection,
    *,
    owner: Optional[str] = None,
    seal_type: Optional[str] = None,
    limit: int = 200,
) -> list[SealRow]:
    """List seals across documents (which seals an owner has, and where).

    ``owner`` and ``seal_type`` are substring (LIKE) filters. Rows are sorted
    by owner (NULL owners last), then seal type, then document id.
    """
    like_filters = [
        (column, needle)
        for column, needle in (("s.owner", owner), ("s.seal_type", seal_type))
        if needle
    ]

    query = """
        SELECT s.id AS sid, s.doc_id, s.owner, s.seal_type, s.raw_text,
               COALESCE(d.title, d.contract_name) AS title
        FROM document_seals s JOIN documents d ON d.id = s.doc_id
    """
    if like_filters:
        query += " WHERE " + " AND ".join(
            f"{column} LIKE ?" for column, _ in like_filters
        )
    query += " ORDER BY s.owner IS NULL, s.owner, s.seal_type, s.doc_id LIMIT ?"

    bind: list[Any] = [f"%{needle}%" for _, needle in like_filters]
    bind.append(limit)

    seals: list[SealRow] = []
    for rec in conn.execute(query, bind).fetchall():
        seals.append(
            SealRow(
                seal_id=rec["sid"],
                doc_id=rec["doc_id"],
                title=rec["title"],
                owner=rec["owner"],
                seal_type=rec["seal_type"],
                raw_text=rec["raw_text"],
            )
        )
    return seals
355
+
356
+
357
def _load_obligations(conn: sqlite3.Connection, doc_id: int) -> list[ObligationItem]:
    """Load one document's obligations, ordered by ``ordering`` then ``id``.

    A NULL ``evidence`` column is returned as an empty string.
    """
    query = (
        "SELECT actor, action, deadline, evidence "
        "FROM obligations WHERE doc_id = ? "
        "ORDER BY ordering, id"
    )
    loaded: list[ObligationItem] = []
    for rec in conn.execute(query, (doc_id,)).fetchall():
        loaded.append(
            ObligationItem(
                actor=rec["actor"],
                action=rec["action"],
                deadline=rec["deadline"],
                evidence=rec["evidence"] or "",
            )
        )
    return loaded
373
+
374
+
375
def list_documents(
    conn: sqlite3.Connection,
    limit: int = 50,
    order_by: str = "ingested_at",
    status: Optional[str] = None,
    doc_type: Optional[str] = None,
    incomplete: bool = False,
) -> list[DocumentRow]:
    """Implementation of the ``list`` command.

    ``status`` / ``doc_type`` left as ``None`` mean "no filter";
    ``incomplete=True`` restricts to rows flagged as incomplete. Rows are
    sorted by ``order_by`` descending.

    Raises:
        ValueError: if ``order_by`` is not one of the whitelisted columns.
    """
    allowed_order = {
        "ingested_at", "sign_date", "expire_date", "amount_cents",
        "primary_date", "primary_amount_cents",
    }
    # order_by is interpolated into the SQL text below, so it must come from
    # this whitelist.
    if order_by not in allowed_order:
        raise ValueError(f"order_by must be one of {allowed_order}")

    equality_filters = (("status", status), ("doc_type", doc_type))
    conditions = [f"{column} = ?" for column, wanted in equality_filters if wanted]
    bind: list[Any] = [wanted for _, wanted in equality_filters if wanted]
    if incomplete:
        conditions.append("completeness_status = 'incomplete'")

    query = "SELECT * FROM documents"
    if conditions:
        query += " WHERE " + " AND ".join(conditions)
    query += f" ORDER BY {order_by} DESC LIMIT ?"
    bind.append(limit)

    found = conn.execute(query, bind).fetchall()
    return [_hydrate(conn, record) for record in found]
409
+
410
+
411
@dataclass
class SearchFilter:
    """Filter parameters for the ``search`` command; unset fields are ignored."""

    name: Optional[str] = None  # LIKE substring match on contract_name
    party: Optional[str] = None  # LIKE substring match on party_a or party_b
    amount_min_cents: Optional[int] = None
    amount_max_cents: Optional[int] = None
    signed_after: Optional[str] = None
    signed_before: Optional[str] = None
    expire_before: Optional[str] = None
    auto_renewal: Optional[bool] = None
    has_risk: bool = False
    status: Optional[str] = None
    # Obligation filters: applied through a cross-table EXISTS subquery.
    deadline_before: Optional[str] = None  # find to-dos that are due soon
    deadline_after: Optional[str] = None
    actor: Optional[str] = None  # party_a / party_b / both
    # Seal / subject filters: cross-table EXISTS.
    has_seal: Optional[bool] = None  # True = has a seal / False = none / None = no filter
    seal_owner: Optional[str] = None  # LIKE on the sealing entity
    seal_type: Optional[str] = None  # LIKE on the seal type
    subject: Optional[str] = None  # LIKE on subject (covers all document types)
    limit: int = 50
435
+
436
+
437
def search_documents(
    conn: sqlite3.Connection, flt: SearchFilter
) -> list[DocumentRow]:
    """Multi-field filtered query; all active filters are ANDed together.

    - name / party / seal / subject use ``LIKE '%kw%'``; conditions on child
      tables use EXISTS subqueries (plain LIKE rather than FTS5).
    - Every WHERE fragment is registered together with the values for its
      placeholders, so SQL text and bound parameters always stay aligned.
    - Results are ordered by ``ingested_at`` descending and capped at
      ``flt.limit``.

    Raises:
        ValueError: if ``flt.actor`` is set but not party_a/party_b/both.
    """
    clauses: list[str] = []
    bind: list[Any] = []

    def require(fragment: str, *values: Any) -> None:
        """Register one WHERE fragment and the values for its placeholders."""
        clauses.append(fragment)
        bind.extend(values)

    # ---- plain column filters on documents ----
    if flt.name:
        require("contract_name LIKE ?", f"%{flt.name}%")
    if flt.party:
        pattern = f"%{flt.party}%"
        require("(party_a LIKE ? OR party_b LIKE ?)", pattern, pattern)
    if flt.amount_min_cents is not None:
        require("amount_cents >= ?", flt.amount_min_cents)
    if flt.amount_max_cents is not None:
        require("amount_cents <= ?", flt.amount_max_cents)
    if flt.signed_after:
        require("sign_date >= ?", flt.signed_after)
    if flt.signed_before:
        require("sign_date <= ?", flt.signed_before)
    if flt.expire_before:
        require("expire_date <= ?", flt.expire_before)
    if flt.auto_renewal is not None:
        require("auto_renewal = ?", 1 if flt.auto_renewal else 0)
    if flt.has_risk:
        require("EXISTS (SELECT 1 FROM risk_clauses WHERE doc_id = documents.id)")
    if flt.status:
        require("status = ?", flt.status)

    # ---- obligation filters ----
    # One EXISTS subquery with an AND chain, so every obligation condition
    # has to hold for the same obligation row.
    obl_parts: list[str] = []
    obl_values: list[Any] = []
    if flt.deadline_before:
        obl_parts.append("deadline IS NOT NULL AND deadline <= ?")
        obl_values.append(flt.deadline_before)
    if flt.deadline_after:
        obl_parts.append("deadline IS NOT NULL AND deadline >= ?")
        obl_values.append(flt.deadline_after)
    if flt.actor:
        if flt.actor not in ("party_a", "party_b", "both"):
            raise ValueError(f"actor must be party_a/party_b/both, got {flt.actor!r}")
        obl_parts.append("actor = ?")
        obl_values.append(flt.actor)
    if obl_parts:
        require(
            "EXISTS (SELECT 1 FROM obligations WHERE doc_id = documents.id AND "
            + " AND ".join(obl_parts)
            + ")",
            *obl_values,
        )

    # ---- seal filters: presence (no parameters) + owner/type (LIKE) ----
    if flt.has_seal is True:
        require("EXISTS (SELECT 1 FROM document_seals WHERE doc_id = documents.id)")
    elif flt.has_seal is False:
        require("NOT EXISTS (SELECT 1 FROM document_seals WHERE doc_id = documents.id)")

    seal_parts: list[str] = []
    seal_values: list[Any] = []
    if flt.seal_owner:
        seal_parts.append("owner LIKE ?")
        seal_values.append(f"%{flt.seal_owner}%")
    if flt.seal_type:
        seal_parts.append("seal_type LIKE ?")
        seal_values.append(f"%{flt.seal_type}%")
    if seal_parts:
        require(
            "EXISTS (SELECT 1 FROM document_seals WHERE doc_id = documents.id AND "
            + " AND ".join(seal_parts)
            + ")",
            *seal_values,
        )

    # ---- subject filter: document_subjects spans every document type ----
    if flt.subject:
        require(
            "EXISTS (SELECT 1 FROM document_subjects "
            "WHERE doc_id = documents.id AND subject LIKE ?)",
            f"%{flt.subject}%",
        )

    query = "SELECT * FROM documents"
    if clauses:
        query += " WHERE " + " AND ".join(clauses)
    query += " ORDER BY ingested_at DESC LIMIT ?"
    bind.append(flt.limit)

    found = conn.execute(query, bind).fetchall()
    return [_hydrate(conn, record) for record in found]
545
+
546
+
547
+ # ---------- 写入 ----------
548
+
549
+
550
def insert_document(
    conn: sqlite3.Connection,
    *,
    sha256: str,
    source_path: str,
    output_dir: str,
    status: str,
    mineru_duration_s: Optional[float],
    llm_duration_s: Optional[float],
    error_message: Optional[str],
    extraction: Optional[ContractExtraction],
    confidence: Optional[ExtractionConfidence],
    envelope: Optional[DocumentExtraction] = None,
) -> Optional[int]:
    """Insert a new archive record and its child rows in one transaction.

    The ``documents`` row is written with ``ON CONFLICT(sha256) DO NOTHING``;
    when no row was inserted (the sha256 already exists) the function returns
    ``None`` and writes nothing else. Otherwise risk clauses, obligations,
    seals and subjects are inserted and the new document id is returned.

    When ``envelope`` is omitted it is derived from the contract extraction.
    Obligations are taken from the envelope; risk clauses from the extraction.
    """
    ext = extraction or ContractExtraction()
    conf = confidence or ExtractionConfidence()
    env = envelope or contract_to_envelope(ext)

    with transaction(conn):
        # Column -> value, in insertion order; the column list and the
        # placeholder list are both generated from this single mapping.
        record: dict[str, Any] = {
            "sha256": sha256,
            "source_path": source_path,
            "output_dir": output_dir,
            "ingested_at": utc_now_iso(),
            "mineru_duration_s": mineru_duration_s,
            "llm_duration_s": llm_duration_s,
            "status": status,
            "error_message": error_message,
            "doc_type": env.doc_type,
            "title": env.title,
            "summary": env.summary,
            "details_json": env.model_dump_json(),
            "primary_date": env.primary_date,
            "primary_amount_cents": _amount_to_cents(env.primary_amount_value),
            "contract_name": ext.contract_name,
            "party_a": ext.party_a,
            "party_b": ext.party_b,
            "amount_text": ext.amount,
            "amount_cents": _amount_to_cents(ext.amount_value),
            "sign_date": ext.sign_date,
            "expire_date": ext.expire_date,
            "auto_renewal": (
                None if ext.auto_renewal is None else int(ext.auto_renewal)
            ),
            "overall_confidence": conf.overall,
            "completeness_status": _completeness_status(env),
        }
        columns = ", ".join(record)
        placeholders = ", ".join("?" for _ in record)
        cursor = conn.execute(
            f"INSERT INTO documents ({columns}) VALUES ({placeholders}) "
            "ON CONFLICT(sha256) DO NOTHING",
            tuple(record.values()),
        )
        if cursor.rowcount == 0:
            # Conflict: a document with this sha256 already exists.
            return None
        doc_id = cursor.lastrowid
        _insert_risks(conn, doc_id, ext.risk_clauses)
        _insert_obligations(conn, doc_id, env.obligations)
        _insert_seals(conn, doc_id, _collect_seals(env))
        _insert_subjects(conn, doc_id, _subjects_for(env, ext))
        return doc_id
625
+
626
+
627
def update_extraction(
    conn: sqlite3.Connection,
    doc_id: int,
    *,
    status: str,
    llm_duration_s: Optional[float],
    error_message: Optional[str],
    extraction: ContractExtraction,
    confidence: ExtractionConfidence,
    envelope: Optional[DocumentExtraction] = None,
) -> None:
    """Update a document's extracted fields after re-running extraction.

    Runs in a single transaction: the ``documents`` row is updated, then the
    risk clause, obligation, seal and subject rows for ``doc_id`` are deleted
    and re-inserted so repeated runs do not pile up duplicates.

    When ``envelope`` is omitted it is derived from the contract extraction.
    """
    ext = extraction
    env = envelope or contract_to_envelope(ext)

    with transaction(conn):
        # Column -> new value; the SET list is generated from this mapping.
        changes: dict[str, Any] = {
            "status": status,
            "llm_duration_s": llm_duration_s,
            "error_message": error_message,
            "doc_type": env.doc_type,
            "title": env.title,
            "summary": env.summary,
            "details_json": env.model_dump_json(),
            "primary_date": env.primary_date,
            "primary_amount_cents": _amount_to_cents(env.primary_amount_value),
            "contract_name": ext.contract_name,
            "party_a": ext.party_a,
            "party_b": ext.party_b,
            "amount_text": ext.amount,
            "amount_cents": _amount_to_cents(ext.amount_value),
            "sign_date": ext.sign_date,
            "expire_date": ext.expire_date,
            "auto_renewal": (
                None if ext.auto_renewal is None else int(ext.auto_renewal)
            ),
            "overall_confidence": confidence.overall,
            "completeness_status": _completeness_status(env),
        }
        assignments = ", ".join(f"{column} = ?" for column in changes)
        conn.execute(
            f"UPDATE documents SET {assignments} WHERE id = ?",
            (*changes.values(), doc_id),
        )

        # Clear the child tables before re-inserting the fresh extraction.
        for child_table in (
            "risk_clauses",
            "obligations",
            "document_seals",
            "document_subjects",
        ):
            conn.execute(f"DELETE FROM {child_table} WHERE doc_id = ?", (doc_id,))

        _insert_risks(conn, doc_id, ext.risk_clauses)
        _insert_obligations(conn, doc_id, env.obligations)
        _insert_seals(conn, doc_id, _collect_seals(env))
        _insert_subjects(conn, doc_id, _subjects_for(env, ext))
690
+
691
+
692
def replace_document(
    conn: sqlite3.Connection,
    doc_id: int,
    *,
    source_path: str,
    output_dir: str,
    status: str,
    mineru_duration_s: Optional[float],
    llm_duration_s: Optional[float],
    error_message: Optional[str],
    extraction: ContractExtraction,
    confidence: ExtractionConfidence,
    envelope: Optional[DocumentExtraction] = None,
) -> None:
    """Re-ingest: overwrite a document after re-running both parsing and extraction.

    Compared with ``update_extraction`` this additionally updates
    ``source_path``, ``output_dir`` and ``mineru_duration_s``. The statement
    does not touch ``sha256``, ``id`` or ``ingested_at``. Child rows (risk
    clauses, obligations, seals, subjects) are deleted and re-inserted inside
    the same transaction.
    """
    ext = extraction
    env = envelope or contract_to_envelope(ext)

    with transaction(conn):
        # Column -> new value; the SET list is generated from this mapping.
        changes: dict[str, Any] = {
            "source_path": source_path,
            "output_dir": output_dir,
            "mineru_duration_s": mineru_duration_s,
            "llm_duration_s": llm_duration_s,
            "status": status,
            "error_message": error_message,
            "doc_type": env.doc_type,
            "title": env.title,
            "summary": env.summary,
            "details_json": env.model_dump_json(),
            "primary_date": env.primary_date,
            "primary_amount_cents": _amount_to_cents(env.primary_amount_value),
            "contract_name": ext.contract_name,
            "party_a": ext.party_a,
            "party_b": ext.party_b,
            "amount_text": ext.amount,
            "amount_cents": _amount_to_cents(ext.amount_value),
            "sign_date": ext.sign_date,
            "expire_date": ext.expire_date,
            "auto_renewal": (
                None if ext.auto_renewal is None else int(ext.auto_renewal)
            ),
            "overall_confidence": confidence.overall,
            "completeness_status": _completeness_status(env),
        }
        assignments = ", ".join(f"{column} = ?" for column in changes)
        conn.execute(
            f"UPDATE documents SET {assignments} WHERE id = ?",
            (*changes.values(), doc_id),
        )

        # Clear the child tables before re-inserting the fresh extraction.
        for child_table in (
            "risk_clauses",
            "obligations",
            "document_seals",
            "document_subjects",
        ):
            conn.execute(f"DELETE FROM {child_table} WHERE doc_id = ?", (doc_id,))

        _insert_risks(conn, doc_id, ext.risk_clauses)
        _insert_obligations(conn, doc_id, env.obligations)
        _insert_seals(conn, doc_id, _collect_seals(env))
        _insert_subjects(conn, doc_id, _subjects_for(env, ext))
761
+
762
+
763
def _insert_risks(
    conn: sqlite3.Connection, doc_id: int, clauses: Iterable[str]
) -> None:
    """Bulk-insert risk clauses for one document.

    Empty and whitespace-only clauses are skipped; kept clauses are stored
    as given (not stripped). Only ``doc_id`` and ``clause_text`` are written.
    """
    payload = []
    for clause in clauses:
        if clause and clause.strip():
            payload.append((doc_id, clause))
    if payload:
        conn.executemany(
            "INSERT INTO risk_clauses(doc_id, clause_text) VALUES (?, ?)",
            payload,
        )
774
+
775
+
776
def _insert_obligations(
    conn: sqlite3.Connection,
    doc_id: int,
    items: Iterable[ObligationItem],
) -> None:
    """Bulk-insert obligations for one document.

    ``ordering`` is the item's position in ``items``. Items whose action is
    empty or whitespace-only are skipped, which leaves a gap in the ordering
    values rather than renumbering.
    """
    payload = []
    for position, item in enumerate(items):
        if not (item.action and item.action.strip()):
            continue
        payload.append(
            (doc_id, item.actor, item.action, item.deadline, item.evidence, position)
        )
    if payload:
        conn.executemany(
            """INSERT INTO obligations(doc_id, actor, action, deadline, evidence, ordering)
               VALUES (?, ?, ?, ?, ?, ?)""",
            payload,
        )
794
+
795
+
796
def _collect_seals(env: DocumentExtraction) -> list[Seal]:
    """Gather every seal of a document into one new list.

    The result is the main document's seals followed by the seals of each
    sub-agreement, in order, so seals stamped on supplementary agreements
    are not lost when writing the ``document_seals`` table.
    """
    nested = [seal for sub in env.sub_agreements for seal in sub.seals]
    return [*env.seals, *nested]
805
+
806
+
807
def _insert_seals(
    conn: sqlite3.Connection, doc_id: int, seals: Iterable[Seal]
) -> None:
    """Bulk-insert seals for one document.

    Seals whose ``raw_text`` and ``owner`` are both empty/blank are dropped
    as junk. ``raw_text`` is stored stripped (``""`` when missing); ``owner``
    and ``seal_type`` are stored as given. ``ordering`` is the seal's
    position in ``seals``, counting skipped entries.
    """

    def has_text(text: Optional[str]) -> bool:
        return bool(text and text.strip())

    payload = []
    for position, seal in enumerate(seals):
        if has_text(seal.raw_text) or has_text(seal.owner):
            payload.append(
                (
                    doc_id,
                    seal.owner,
                    seal.seal_type,
                    (seal.raw_text or "").strip(),
                    position,
                )
            )
    if payload:
        conn.executemany(
            """INSERT INTO document_seals(doc_id, owner, seal_type, raw_text, ordering)
               VALUES (?, ?, ?, ?, ?)""",
            payload,
        )
823
+
824
+
825
def _insert_subjects(
    conn: sqlite3.Connection, doc_id: int, subjects: Iterable[str]
) -> None:
    """Bulk-insert subjects for one document, ``ordering`` = list position.

    No filtering happens here; the caller is expected to pass subjects that
    are already de-duplicated and non-empty.
    """
    payload = []
    for position, subject in enumerate(subjects):
        payload.append((doc_id, subject, position))
    if payload:
        conn.executemany(
            "INSERT INTO document_subjects(doc_id, subject, ordering) VALUES (?, ?, ?)",
            payload,
        )
836
+
837
+
838
def _subjects_for(env: DocumentExtraction, ext: ContractExtraction) -> list[str]:
    """Collect a document's subjects: envelope parties plus contract party A/B.

    Names are stripped, blanks dropped, and duplicates removed while keeping
    first-seen order. Party A/B are included so a subject search also hits
    contracts whose envelope parties do not list them.
    """
    candidates = [*env.parties, ext.party_a, ext.party_b]
    trimmed = (raw.strip() for raw in candidates if raw)
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(name for name in trimmed if name))
848
+
849
+
850
def delete_document(conn: sqlite3.Connection, doc_id: int) -> Optional[str]:
    """Delete an archive record and return its ``output_dir``.

    Returns ``None`` when no document has that id. The returned path lets the
    caller decide whether to remove the files on disk as well.

    Only the ``documents`` row is deleted here; child rows (risk clauses,
    obligations, seals, subjects) are expected to be removed by the schema's
    ON DELETE CASCADE, which needs ``PRAGMA foreign_keys=ON`` on the
    connection. Neither is checked by this function.
    """
    with transaction(conn):
        found = conn.execute(
            "SELECT output_dir FROM documents WHERE id = ?", (doc_id,)
        ).fetchone()
        if not found:
            return None
        output_dir = found["output_dir"]
        conn.execute("DELETE FROM documents WHERE id = ?", (doc_id,))
    return output_dir
864
+
865
+
866
+ # ---------- 统计 ----------
867
+
868
+
869
@dataclass
class Stats:
    """Aggregate archive statistics, as returned by ``collect_stats``."""

    total: int  # number of rows in documents
    by_status: dict[str, int]  # status -> count
    by_sign_month: dict[str, int]  # 'YYYY-MM' -> count
    new_this_month: int  # rows whose ingested_at falls in the current month
    expiring_within_30d: int  # rows whose expire_date is within the next 30 days
876
+
877
+
878
def collect_stats(conn: sqlite3.Connection) -> Stats:
    """Compute archive-wide statistics with a handful of aggregate queries.

    "Now" is SQLite's ``'now'`` for both the current-month and the 30-day
    expiry window calculations.
    """

    def count(sql: str) -> int:
        """Run a single-row query and return its ``c`` column."""
        return conn.execute(sql).fetchone()["c"]

    total = count("SELECT COUNT(*) AS c FROM documents")

    status_counts: dict[str, int] = {}
    for rec in conn.execute(
        "SELECT status, COUNT(*) AS c FROM documents GROUP BY status"
    ):
        status_counts[rec["status"]] = rec["c"]

    # Group by the 'YYYY-MM' prefix of sign_date, oldest month first.
    month_counts: dict[str, int] = {}
    for rec in conn.execute(
        """
        SELECT substr(sign_date, 1, 7) AS m, COUNT(*) AS c
        FROM documents WHERE sign_date IS NOT NULL
        GROUP BY m ORDER BY m
        """
    ):
        month_counts[rec["m"]] = rec["c"]

    added_this_month = count(
        "SELECT COUNT(*) AS c FROM documents WHERE substr(ingested_at, 1, 7) = strftime('%Y-%m', 'now')"
    )

    expiring_soon = count(
        """
        SELECT COUNT(*) AS c FROM documents
        WHERE expire_date IS NOT NULL
          AND expire_date >= date('now')
          AND expire_date <= date('now', '+30 days')
        """
    )

    return Stats(
        total=total,
        by_status=status_counts,
        by_sign_month=month_counts,
        new_this_month=added_this_month,
        expiring_within_30d=expiring_soon,
    )