contract-archive-cli 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. contract_archive/__init__.py +2 -0
  2. contract_archive/archive/__init__.py +64 -0
  3. contract_archive/archive/db.py +126 -0
  4. contract_archive/archive/ingest.py +667 -0
  5. contract_archive/archive/migrations/001_init.sql +62 -0
  6. contract_archive/archive/migrations/002_obligations.sql +25 -0
  7. contract_archive/archive/migrations/003_document_types.sql +31 -0
  8. contract_archive/archive/migrations/004_seals_subjects.sql +36 -0
  9. contract_archive/archive/migrations/005_completeness.sql +18 -0
  10. contract_archive/archive/party_registry.py +276 -0
  11. contract_archive/archive/paths.py +113 -0
  12. contract_archive/archive/repository.py +918 -0
  13. contract_archive/cli.py +455 -0
  14. contract_archive/cli_common.py +293 -0
  15. contract_archive/cli_config.py +96 -0
  16. contract_archive/cli_introspect.py +204 -0
  17. contract_archive/cli_party.py +166 -0
  18. contract_archive/cli_query.py +492 -0
  19. contract_archive/cli_render.py +575 -0
  20. contract_archive/config.py +257 -0
  21. contract_archive/errors.py +163 -0
  22. contract_archive/extraction/__init__.py +14 -0
  23. contract_archive/extraction/amount_check.py +87 -0
  24. contract_archive/extraction/contract_extractor.py +103 -0
  25. contract_archive/extraction/document_extractor.py +546 -0
  26. contract_archive/extraction/evidence_page_fix.py +99 -0
  27. contract_archive/extraction/llm_extractor.py +207 -0
  28. contract_archive/extraction/normalize.py +210 -0
  29. contract_archive/extraction/property_fee.py +79 -0
  30. contract_archive/extraction/vision_seal.py +390 -0
  31. contract_archive/pipelines/__init__.py +9 -0
  32. contract_archive/pipelines/mineru_pipeline.py +955 -0
  33. contract_archive/pipelines/vl_ocr.py +160 -0
  34. contract_archive/schemas/__init__.py +67 -0
  35. contract_archive/schemas/document.py +408 -0
  36. contract_archive/utils/__init__.py +27 -0
  37. contract_archive/utils/device.py +51 -0
  38. contract_archive/utils/http_env.py +54 -0
  39. contract_archive/utils/pdf.py +207 -0
  40. contract_archive_cli-0.2.7.dist-info/METADATA +386 -0
  41. contract_archive_cli-0.2.7.dist-info/RECORD +44 -0
  42. contract_archive_cli-0.2.7.dist-info/WHEEL +4 -0
  43. contract_archive_cli-0.2.7.dist-info/entry_points.txt +2 -0
  44. contract_archive_cli-0.2.7.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,2 @@
1
"""Local document archive CLI: OCR parsing + qwen3.7-max field extraction + SQLite indexing."""

# Kept in sync with the distribution version (the wheel / dist-info are 0.2.7).
__version__ = "0.2.7"
@@ -0,0 +1,64 @@
1
"""
Local contract archive (SQLite + filesystem).

Module layout:
- db.py          connection / PRAGMA / migrations engine
- repository.py  DAO (CRUD + search + stats)
- ingest.py      single-PDF ingest pipeline (hash → MinerU → extract → rename → DB)
- paths.py       archive path conventions + hard-link / copy helpers
"""
from .db import checkpoint, open_archive_db, transaction, utc_now_iso
from .ingest import IngestResult, discover_pdfs, ingest_pdf, load_document_text, re_extract
from .paths import ArchivePaths, default_archive_root, link_or_copy, sha256_of_file
from .repository import (
    DocumentRow,
    SealRow,
    SearchFilter,
    Stats,
    TodoItem,
    collect_stats,
    delete_document,
    find_by_sha,
    find_by_sha_prefix,
    get_document,
    insert_document,
    list_documents,
    list_obligations,
    list_seals,
    replace_document,
    search_documents,
    update_extraction,
)

# Names re-exported as this package's public API; every entry is imported above.
__all__ = [
    # from .db
    "open_archive_db",
    "transaction",
    "checkpoint",
    "utc_now_iso",
    # from .paths
    "ArchivePaths",
    "default_archive_root",
    "link_or_copy",
    "sha256_of_file",
    # from .repository
    "DocumentRow",
    "SealRow",
    "SearchFilter",
    "Stats",
    "TodoItem",
    "list_obligations",
    "list_seals",
    "find_by_sha",
    "find_by_sha_prefix",
    "get_document",
    "list_documents",
    "search_documents",
    "insert_document",
    "update_extraction",
    "replace_document",
    "delete_document",
    "collect_stats",
    # from .ingest
    "IngestResult",
    "ingest_pdf",
    "re_extract",
    "discover_pdfs",
    "load_document_text",
]
@@ -0,0 +1,126 @@
1
"""
SQLite connection handling and schema migrations.

Design notes:
- Uses the stdlib sqlite3 module instead of SQLAlchemy (single table, single
  process: an ORM would be a liability).
- Every connection applies these PRAGMAs: WAL journal mode, foreign_keys=ON,
  busy_timeout=5000.
- A schema_version table plus migrations/*.sql files, executed in version order.
- wal_checkpoint(TRUNCATE) before exit empties the -wal file, so copying the
  db file does not lose data.
"""
from __future__ import annotations

import logging
import re
import sqlite3
from contextlib import contextmanager
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterator

logger = logging.getLogger(__name__)

# Directory that holds the migration scripts, located next to this module.
MIGRATIONS_DIR = Path(__file__).parent / "migrations"
# Migration file names: three-digit version, underscore, description, ".sql".
# Group 1 captures the version digits.
MIGRATION_PATTERN = re.compile(r"^(\d{3})_.+\.sql$")
24
+
25
+
26
def utc_now_iso() -> str:
    """Return the current UTC time as an ISO 8601 string with a ``Z`` suffix.

    The fixed-width, second-resolution format means lexicographic order of
    the strings equals chronological order.
    """
    now = datetime.now(tz=timezone.utc)
    return now.strftime("%Y-%m-%dT%H:%M:%SZ")
29
+
30
+
31
def connect(db_path: Path) -> sqlite3.Connection:
    """
    Open a SQLite connection and apply the required PRAGMAs.

    The parent directory of ``db_path`` is created when it does not exist.

    Notes:
    - foreign_keys is not a persistent PRAGMA: it defaults to OFF on every
      new connection and has to be switched on explicitly.
    - busy_timeout keeps concurrent writers from failing immediately with
      SQLITE_BUSY.
    - row_factory is set to sqlite3.Row so results support
      ``row["col_name"]`` access.

    Args:
        db_path: Location of the database file.

    Returns:
        An open connection in autocommit mode (``isolation_level=None``);
        transactions are controlled with explicit BEGIN/COMMIT.

    Raises:
        sqlite3.Error: If a PRAGMA cannot be applied (for example the file
            is not a SQLite database). The connection is closed before the
            error propagates, so no handle is leaked.
    """
    db_path = Path(db_path)
    db_path.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(
        db_path,
        isolation_level=None,  # autocommit; transactions use explicit BEGIN/COMMIT
        timeout=10.0,
    )
    try:
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA journal_mode = WAL")
        conn.execute("PRAGMA foreign_keys = ON")
        conn.execute("PRAGMA busy_timeout = 5000")
    except Exception:
        # Do not hand the caller an exception AND leave the file handle open.
        conn.close()
        raise
    return conn
52
+
53
+
54
@contextmanager
def transaction(conn: sqlite3.Connection) -> Iterator[sqlite3.Connection]:
    """
    Run the ``with`` body inside an explicit write transaction.

    ``BEGIN IMMEDIATE`` takes the write lock up front, which avoids the
    deadlock that can occur when a read transaction is upgraded later.
    The transaction is committed when the body finishes normally and rolled
    back when the body (or the COMMIT itself) raises; the exception is
    always re-raised.

    Args:
        conn: Connection to run the transaction on.

    Yields:
        The same connection, for convenience.
    """
    conn.execute("BEGIN IMMEDIATE")
    try:
        yield conn
        conn.execute("COMMIT")
    except BaseException:
        # BaseException so that KeyboardInterrupt does not leave the write
        # transaction open on the connection. The in_transaction guard
        # matters because the transaction may already be gone (SQLite rolls
        # back by itself on some errors); an unconditional ROLLBACK would
        # then raise and mask the original exception.
        if conn.in_transaction:
            conn.execute("ROLLBACK")
        raise
64
+
65
+
66
def checkpoint(conn: sqlite3.Connection) -> None:
    """
    Force a WAL checkpoint in TRUNCATE mode to empty the -wal file.

    Intended to be called before exit so that copying the db file alone does
    not lose data. This is best effort: failures are logged as warnings and
    never raised.

    Args:
        conn: Connection to checkpoint.
    """
    try:
        row = conn.execute("PRAGMA wal_checkpoint(TRUNCATE)").fetchone()
    except sqlite3.OperationalError as e:
        logger.warning("wal_checkpoint failed: %s", e)
        return
    # The pragma reports a blocked checkpoint through the first result column
    # (busy flag) instead of raising, so check it rather than assume success.
    if row is not None and row[0]:
        logger.warning("wal_checkpoint(TRUNCATE) was blocked; -wal file not emptied")
72
+
73
+
74
def get_schema_version(conn: sqlite3.Connection) -> int:
    """
    Return the highest version recorded in the schema_version table.

    A missing or empty schema_version table counts as version 0 (fresh
    database). Table existence is checked explicitly, so unrelated failures
    such as "database is locked" propagate instead of being mistaken for a
    fresh database.

    Args:
        conn: Connection to inspect; any row factory works.

    Returns:
        The maximum recorded version, or 0 when nothing is recorded.

    Raises:
        sqlite3.OperationalError: If the database cannot be read.
    """
    table_exists = conn.execute(
        "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = 'schema_version'"
    ).fetchone()
    if table_exists is None:
        return 0
    row = conn.execute(
        "SELECT MAX(version) AS v FROM schema_version"
    ).fetchone()
    # Positional access works for sqlite3.Row and plain tuples alike.
    return int(row[0]) if row and row[0] is not None else 0
83
+
84
+
85
def discover_migrations() -> list[tuple[int, Path]]:
    """
    Scan the migrations directory for migration scripts.

    Returns:
        ``[(version, path), ...]`` sorted by ascending version number for
        every entry whose name matches ``MIGRATION_PATTERN``; an empty list
        when the directory does not exist.
    """
    if not MIGRATIONS_DIR.exists():
        return []
    candidates = (
        (MIGRATION_PATTERN.match(entry.name), entry)
        for entry in MIGRATIONS_DIR.iterdir()
    )
    versioned = [(int(hit.group(1)), entry) for hit, entry in candidates if hit]
    return sorted(versioned, key=lambda pair: pair[0])
96
+
97
+
98
def migrate(conn: sqlite3.Connection) -> int:
    """
    Apply every migration newer than the recorded schema version.

    Each pending script is run with ``executescript()``, which commits any
    open transaction before running; wrapping the call in ``transaction()``
    would therefore fail with "no transaction is active". Migration files
    are expected not to contain their own BEGIN/COMMIT.

    NOTE(review): this function never writes schema_version itself, so the
    scripts presumably record their own version; confirm against the .sql
    files. Also confirm whether a script that fails midway can leave the
    schema partially applied, since no explicit transaction wraps it here.

    Args:
        conn: Connection to migrate.

    Returns:
        The schema version read back after all pending scripts have run.
    """
    baseline = get_schema_version(conn)
    pending = [
        (version, script_path)
        for version, script_path in discover_migrations()
        if version > baseline
    ]
    for version, script_path in pending:
        script = script_path.read_text(encoding="utf-8")
        logger.info("applying migration %s (version=%d)", script_path.name, version)
        conn.executescript(script)
    final = get_schema_version(conn)
    if pending:
        logger.info("migrations applied: %d, schema_version=%d", len(pending), final)
    return final
120
+
121
+
122
def open_archive_db(db_path: Path) -> sqlite3.Connection:
    """
    Open the archive database, creating tables and migrating when needed.

    Args:
        db_path: Location of the database file.

    Returns:
        A connection whose schema has been brought up to date by ``migrate``.

    Raises:
        Exception: Whatever ``migrate`` raises; the connection is closed
            before the error propagates so a failed migration does not leak
            an open handle.
    """
    conn = connect(db_path)
    try:
        migrate(conn)
    except Exception:
        conn.close()
        raise
    return conn