contract-archive-cli 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contract_archive/__init__.py +2 -0
- contract_archive/archive/__init__.py +64 -0
- contract_archive/archive/db.py +126 -0
- contract_archive/archive/ingest.py +667 -0
- contract_archive/archive/migrations/001_init.sql +62 -0
- contract_archive/archive/migrations/002_obligations.sql +25 -0
- contract_archive/archive/migrations/003_document_types.sql +31 -0
- contract_archive/archive/migrations/004_seals_subjects.sql +36 -0
- contract_archive/archive/migrations/005_completeness.sql +18 -0
- contract_archive/archive/party_registry.py +276 -0
- contract_archive/archive/paths.py +113 -0
- contract_archive/archive/repository.py +918 -0
- contract_archive/cli.py +455 -0
- contract_archive/cli_common.py +293 -0
- contract_archive/cli_config.py +96 -0
- contract_archive/cli_introspect.py +204 -0
- contract_archive/cli_party.py +166 -0
- contract_archive/cli_query.py +492 -0
- contract_archive/cli_render.py +575 -0
- contract_archive/config.py +257 -0
- contract_archive/errors.py +163 -0
- contract_archive/extraction/__init__.py +14 -0
- contract_archive/extraction/amount_check.py +87 -0
- contract_archive/extraction/contract_extractor.py +103 -0
- contract_archive/extraction/document_extractor.py +546 -0
- contract_archive/extraction/evidence_page_fix.py +99 -0
- contract_archive/extraction/llm_extractor.py +207 -0
- contract_archive/extraction/normalize.py +210 -0
- contract_archive/extraction/property_fee.py +79 -0
- contract_archive/extraction/vision_seal.py +390 -0
- contract_archive/pipelines/__init__.py +9 -0
- contract_archive/pipelines/mineru_pipeline.py +955 -0
- contract_archive/pipelines/vl_ocr.py +160 -0
- contract_archive/schemas/__init__.py +67 -0
- contract_archive/schemas/document.py +408 -0
- contract_archive/utils/__init__.py +27 -0
- contract_archive/utils/device.py +51 -0
- contract_archive/utils/http_env.py +54 -0
- contract_archive/utils/pdf.py +207 -0
- contract_archive_cli-0.2.7.dist-info/METADATA +386 -0
- contract_archive_cli-0.2.7.dist-info/RECORD +44 -0
- contract_archive_cli-0.2.7.dist-info/WHEEL +4 -0
- contract_archive_cli-0.2.7.dist-info/entry_points.txt +2 -0
- contract_archive_cli-0.2.7.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""
|
|
2
|
+
本地合同档案库(SQLite + 文件系统)。
|
|
3
|
+
|
|
4
|
+
模块结构:
|
|
5
|
+
- db.py 连接 / PRAGMA / migrations 引擎
|
|
6
|
+
- repository.py DAO(CRUD + search + stats)
|
|
7
|
+
- ingest.py 单 PDF 入库流水线(hash → MinerU → extract → rename → DB)
|
|
8
|
+
- paths.py 档案库路径约定 + 硬链接/拷贝工具
|
|
9
|
+
"""
|
|
10
|
+
from .db import checkpoint, open_archive_db, transaction, utc_now_iso
|
|
11
|
+
from .ingest import IngestResult, discover_pdfs, ingest_pdf, load_document_text, re_extract
|
|
12
|
+
from .paths import ArchivePaths, default_archive_root, link_or_copy, sha256_of_file
|
|
13
|
+
from .repository import (
|
|
14
|
+
DocumentRow,
|
|
15
|
+
SealRow,
|
|
16
|
+
SearchFilter,
|
|
17
|
+
Stats,
|
|
18
|
+
TodoItem,
|
|
19
|
+
collect_stats,
|
|
20
|
+
delete_document,
|
|
21
|
+
find_by_sha,
|
|
22
|
+
find_by_sha_prefix,
|
|
23
|
+
get_document,
|
|
24
|
+
insert_document,
|
|
25
|
+
list_documents,
|
|
26
|
+
list_obligations,
|
|
27
|
+
list_seals,
|
|
28
|
+
replace_document,
|
|
29
|
+
search_documents,
|
|
30
|
+
update_extraction,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# Public API of the archive package, grouped by the submodule each name is
# imported from at the top of this file.
__all__ = [
    # .db
    "open_archive_db",
    "transaction",
    "checkpoint",
    "utc_now_iso",
    # .paths
    "ArchivePaths",
    "default_archive_root",
    "link_or_copy",
    "sha256_of_file",
    # .repository
    "DocumentRow",
    "SealRow",
    "SearchFilter",
    "Stats",
    "TodoItem",
    "list_obligations",
    "list_seals",
    "find_by_sha",
    "find_by_sha_prefix",
    "get_document",
    "list_documents",
    "search_documents",
    "insert_document",
    "update_extraction",
    "replace_document",
    "delete_document",
    "collect_stats",
    # .ingest
    "IngestResult",
    "ingest_pdf",
    "re_extract",
    "discover_pdfs",
    "load_document_text",
]
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SQLite 连接 + schema 迁移。
|
|
3
|
+
|
|
4
|
+
设计要点:
|
|
5
|
+
- 用 stdlib sqlite3,不引 SQLAlchemy(单表单进程,ORM 是负债)
|
|
6
|
+
- 每个连接强制执行 PRAGMA:WAL + foreign_keys=ON + busy_timeout=5000
|
|
7
|
+
- schema_version 表 + migrations/*.sql 文件按版本顺序执行
|
|
8
|
+
- 退出前 wal_checkpoint(TRUNCATE):清空 -wal 文件,避免拷贝 db 时丢数据
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
import re
|
|
14
|
+
import sqlite3
|
|
15
|
+
from contextlib import contextmanager
|
|
16
|
+
from datetime import datetime, timezone
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Iterator
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
# Directory expected to hold the numbered ``*.sql`` migration scripts; it is
# resolved relative to this module's own location.
MIGRATIONS_DIR = Path(__file__).parent / "migrations"
# Migration file names look like ``NNN_<name>.sql``; group 1 captures the
# three-digit version number.
MIGRATION_PATTERN = re.compile(r"^(\d{3})_.+\.sql$")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def utc_now_iso() -> str:
    """Return the current UTC time as an ISO 8601 string with a ``Z`` suffix.

    The layout is fixed-width (``YYYY-MM-DDTHH:MM:SSZ``, second precision),
    so sorting the strings lexicographically also sorts them chronologically.
    """
    moment = datetime.now(tz=timezone.utc)
    return moment.strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def connect(db_path: Path) -> sqlite3.Connection:
    """Open a SQLite connection and apply the per-connection PRAGMAs.

    The parent directory of ``db_path`` is created when it does not exist.

    Args:
        db_path: Location of the database file (anything ``Path`` accepts).

    Returns:
        A connection in autocommit mode (``isolation_level=None``) whose rows
        are ``sqlite3.Row`` objects, so columns can be read as
        ``row["col_name"]``.

    Raises:
        sqlite3.Error: If any PRAGMA fails. The half-configured connection is
            closed before the error propagates, so no handle is leaked.
    """
    db_path = Path(db_path)
    db_path.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(
        db_path,
        # Autocommit mode: transactions are driven by explicit BEGIN/COMMIT.
        isolation_level=None,
        # NOTE(review): the busy_timeout PRAGMA below appears to supersede
        # this 10 s value with 5 s -- confirm which one is intended.
        timeout=10.0,
    )
    try:
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA journal_mode = WAL")
        # foreign_keys is not a persistent PRAGMA: it is OFF by default on
        # every new connection and has to be enabled each time.
        conn.execute("PRAGMA foreign_keys = ON")
        # Wait on a locked database instead of failing with SQLITE_BUSY
        # immediately when another writer is active.
        conn.execute("PRAGMA busy_timeout = 5000")
    except BaseException:
        # Do not leak the connection when configuration fails.
        conn.close()
        raise
    return conn
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@contextmanager
def transaction(conn: sqlite3.Connection) -> Iterator[sqlite3.Connection]:
    """Run the ``with`` body inside an explicit write transaction.

    ``BEGIN IMMEDIATE`` acquires the write lock up front instead of upgrading
    from a read lock later, which avoids lock-upgrade deadlocks. The
    transaction is committed when the body finishes normally and rolled back
    when the body (or the COMMIT itself) raises; the exception is re-raised.

    Args:
        conn: Connection that is not already inside a transaction.

    Yields:
        The same ``conn`` object.
    """
    conn.execute("BEGIN IMMEDIATE")
    try:
        yield conn
        conn.execute("COMMIT")
    except BaseException:
        # BaseException (not just Exception) so KeyboardInterrupt/SystemExit
        # do not leave the write transaction open. The transaction may already
        # be gone (SQLite auto-rollback, or the body ended it); a second
        # ROLLBACK would raise and mask the original error, so only roll back
        # while a transaction is still active.
        if conn.in_transaction:
            conn.execute("ROLLBACK")
        raise
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def checkpoint(conn: sqlite3.Connection) -> None:
    """Run ``PRAGMA wal_checkpoint(TRUNCATE)`` as a best-effort cleanup.

    Intended to be called before exit so the ``-wal`` file is emptied and
    copying the database file alone does not lose data. Problems are logged
    as warnings and never raised.

    Args:
        conn: Open connection to the database.
    """
    try:
        row = conn.execute("PRAGMA wal_checkpoint(TRUNCATE)").fetchone()
    except sqlite3.OperationalError as e:
        logger.warning("wal_checkpoint failed: %s", e)
        return
    # The PRAGMA returns one row (busy, log frames, checkpointed frames); a
    # non-zero first column means the checkpoint was blocked and did not
    # complete, which would otherwise go unnoticed.
    if row is not None and row[0]:
        logger.warning("wal_checkpoint incomplete: database busy")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def get_schema_version(conn: sqlite3.Connection) -> int:
    """Return the highest version recorded in the ``schema_version`` table.

    A database without that table, or with an empty one, counts as version 0
    (a new database).

    Args:
        conn: Open connection to the database.

    Returns:
        The maximum ``version`` value, or 0 when none is recorded.

    Raises:
        sqlite3.OperationalError: For any failure other than the table being
            absent (for example a locked database). Reporting version 0 in
            that situation would make the caller re-apply every migration to
            an already-migrated database.
    """
    try:
        row = conn.execute(
            "SELECT MAX(version) AS v FROM schema_version"
        ).fetchone()
    except sqlite3.OperationalError as e:
        # Only a missing table means "fresh database".
        if "no such table" in str(e).lower():
            return 0
        raise
    # Positional access works with plain tuples as well as sqlite3.Row.
    return int(row[0]) if row and row[0] is not None else 0
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def discover_migrations() -> list[tuple[int, Path]]:
    """List the migration scripts found in ``MIGRATIONS_DIR``.

    Only entries whose name matches ``MIGRATION_PATTERN`` are kept; the
    leading three digits of the name become the version number.

    Returns:
        ``(version, path)`` pairs sorted by ascending version, or an empty
        list when the migrations directory does not exist.
    """
    if not MIGRATIONS_DIR.exists():
        return []
    candidates = (
        (MIGRATION_PATTERN.match(entry.name), entry)
        for entry in MIGRATIONS_DIR.iterdir()
    )
    versioned = [(int(hit.group(1)), entry) for hit, entry in candidates if hit]
    return sorted(versioned, key=lambda pair: pair[0])
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def migrate(conn: sqlite3.Connection) -> int:
    """Apply every migration newer than the current schema version.

    The version is read once up front; each discovered script with a higher
    version is then executed, in ascending order, via ``executescript()``.

    ``executescript()`` commits any open transaction before running, so this
    function must not be wrapped in ``transaction()`` (that would end in a
    "no transaction is active" error).

    NOTE(review): this function never writes to ``schema_version`` itself, so
    each script presumably records its own version -- verify against the
    migration files. Nothing here wraps a script in a transaction either, so
    a script that fails midway may leave its earlier statements applied --
    confirm whether that is acceptable.

    Args:
        conn: Open connection to the database.

    Returns:
        The schema version read back after all pending scripts have run.
    """
    baseline = get_schema_version(conn)
    pending = [
        (version, path)
        for version, path in discover_migrations()
        if version > baseline
    ]
    for version, path in pending:
        script = path.read_text(encoding="utf-8")
        logger.info("applying migration %s (version=%d)", path.name, version)
        conn.executescript(script)
    final = get_schema_version(conn)
    if pending:
        logger.info(
            "migrations applied: %d, schema_version=%d", len(pending), final
        )
    return final
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def open_archive_db(db_path: Path) -> sqlite3.Connection:
    """Open the archive database, applying pending migrations first.

    Args:
        db_path: Location of the database file, passed to ``connect()``.

    Returns:
        The connection returned by ``connect()`` after ``migrate()`` has run.

    Raises:
        Exception: Whatever ``migrate()`` raises; the connection is closed
            before the error propagates so the handle is not leaked.
    """
    conn = connect(db_path)
    try:
        migrate(conn)
    except BaseException:
        # The caller never receives the connection on failure, so nobody else
        # could close it.
        conn.close()
        raise
    return conn
|