datamask-store 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamask_store/__init__.py +22 -0
- datamask_store/audit_store.py +49 -0
- datamask_store/labeling_store.py +72 -0
- datamask_store/py.typed +0 -0
- datamask_store/user_store.py +35 -0
- datamask_store-1.0.0.dist-info/METADATA +16 -0
- datamask_store-1.0.0.dist-info/RECORD +9 -0
- datamask_store-1.0.0.dist-info/WHEEL +5 -0
- datamask_store-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""
|
|
2
|
+
datamask-store — 存储层 (W11.3)
|
|
3
|
+
|
|
4
|
+
W6.11 已实现: 多用户隔离
|
|
5
|
+
- 每用户独立 SQLite (./data/users/<user_id>/)
|
|
6
|
+
- 命名空间化 task_id (user:task)
|
|
7
|
+
- 路径穿越防御
|
|
8
|
+
|
|
9
|
+
W9.P2.2 已实现: SQLite WAL 模式
|
|
10
|
+
- 4 workers 并发 database is locked 修复
|
|
11
|
+
- WAL + busy_timeout 5000 + 5 retry
|
|
12
|
+
"""
|
|
13
|
+
from datamask_store.user_store import UserStore
|
|
14
|
+
from datamask_store.audit_store import AuditStore
|
|
15
|
+
from datamask_store.labeling_store import LabelingStore
|
|
16
|
+
|
|
17
|
+
# Keep in sync with the distribution metadata ("Version: 1.0.0" in METADATA);
# the previous value "0.1.0" did not match the released wheel.
__version__ = "1.0.0"
|
|
18
|
+
# Names re-exported from the package root.
__all__ = ["UserStore", "AuditStore", "LabelingStore"]
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""
|
|
2
|
+
datamask-store — 审计日志存储 (W7.B.2)
|
|
3
|
+
"""
|
|
4
|
+
import sqlite3
|
|
5
|
+
import json
|
|
6
|
+
import time
|
|
7
|
+
from typing import Dict, Any, Optional, List
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AuditStore:
    """Audit-trace storage on top of a caller-supplied SQLite connection.

    Rows are kept in the ``audit_traces`` table, which is created on
    construction if it does not exist yet.

    NOTE(review): the original note on this class says the W7.B.2 audit hook
    was moved down into the ``backend/main.py`` middleware; that middleware is
    not part of this package, so confirm who actually calls ``log``.
    """

    def __init__(self, conn: sqlite3.Connection):
        """Keep ``conn`` and make sure the audit table and its index exist."""
        self.conn = conn
        self._init_table()

    def _init_table(self) -> None:
        """Create ``audit_traces`` and its ``task_id`` index if missing."""
        # ``with conn`` commits on success and rolls back on error.
        with self.conn:
            self.conn.execute("""
                CREATE TABLE IF NOT EXISTS audit_traces (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    task_id TEXT NOT NULL,
                    user_id TEXT,
                    action TEXT NOT NULL,
                    entity_type TEXT,
                    entity_text TEXT,
                    meta_json TEXT,
                    created_at REAL NOT NULL
                )
            """)
            self.conn.execute(
                "CREATE INDEX IF NOT EXISTS idx_audit_task ON audit_traces(task_id)"
            )

    def log(self, task_id: str, action: str, user_id: Optional[str] = None,
            entity_type: Optional[str] = None, entity_text: Optional[str] = None,
            meta: Optional[Dict[str, Any]] = None) -> None:
        """Append one audit record and commit it.

        Args:
            task_id: Task the record belongs to (NOT NULL column).
            action: Action name (NOT NULL column).
            user_id: Optional acting user.
            entity_type: Optional entity type.
            entity_text: Optional entity text.
            meta: Optional extra data, stored as JSON in ``meta_json``. A
                missing or empty mapping is stored as NULL.

        Raises:
            sqlite3.IntegrityError: If a NOT NULL column receives ``None``.
                The failed write is rolled back, so the connection is not
                left inside an open transaction.
        """
        # ensure_ascii=False stores non-ASCII text readably, matching how
        # LabelingStore serialises its JSON column.
        meta_json = json.dumps(meta, ensure_ascii=False) if meta else None
        with self.conn:
            self.conn.execute(
                "INSERT INTO audit_traces (task_id, user_id, action, entity_type, entity_text, meta_json, created_at) "
                "VALUES (?, ?, ?, ?, ?, ?, ?)",
                (task_id, user_id, action, entity_type, entity_text,
                 meta_json, time.time())
            )

    def query_by_task(self, task_id: str, limit: int = 100) -> List[Dict]:
        """Return up to ``limit`` records for ``task_id``, newest id first.

        Each record is a dict keyed by column name; ``meta_json`` is returned
        as the stored JSON string (or ``None``), not decoded.
        """
        cursor = self.conn.execute(
            "SELECT * FROM audit_traces WHERE task_id = ? ORDER BY id DESC LIMIT ?",
            (task_id, limit)
        )
        # Column names are the same for every row, so compute them once.
        columns = [col[0] for col in cursor.description]
        return [dict(zip(columns, row)) for row in cursor.fetchall()]
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""
|
|
2
|
+
datamask-store — 标注数据存储 (W7.D.1)
|
|
3
|
+
"""
|
|
4
|
+
import sqlite3
|
|
5
|
+
import json
|
|
6
|
+
import time
|
|
7
|
+
from typing import Dict, Any, Optional, List
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class LabelingStore:
    """Labeling-sample storage on top of a caller-supplied SQLite connection.

    Two tables are created on construction if missing:

    * ``samples`` - base annotations, unique per ``(text, source)``.
    * ``samples_extended`` - same columns without the uniqueness constraint
      (the standalone table introduced by W9.P2.2).
    """

    def __init__(self, conn: sqlite3.Connection):
        """Keep ``conn`` and make sure both sample tables exist."""
        self.conn = conn
        self._init_tables()

    def _init_tables(self) -> None:
        """Create ``samples`` and ``samples_extended`` if they do not exist."""
        # ``with conn`` commits on success and rolls back on error.
        with self.conn:
            # Base annotations.
            self.conn.execute("""
                CREATE TABLE IF NOT EXISTS samples (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    text TEXT NOT NULL,
                    entities_json TEXT NOT NULL,
                    confidence REAL NOT NULL,
                    source TEXT,
                    created_at REAL NOT NULL,
                    UNIQUE(text, source)
                )
            """)
            # Extended entity annotations (W9.P2.2): no UNIQUE(text, source),
            # so repeated texts are stored as separate rows.
            self.conn.execute("""
                CREATE TABLE IF NOT EXISTS samples_extended (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    text TEXT NOT NULL,
                    entities_json TEXT NOT NULL,
                    confidence REAL NOT NULL,
                    source TEXT,
                    created_at REAL NOT NULL
                )
            """)

    def upsert_sample(self, text: str, entities: list, confidence: float,
                      source: str = "deepseek", extended: bool = False) -> int:
        """Insert a sample, or refresh the existing ``(text, source)`` row.

        W7.D.3: defensive upsert added as the fix for the P0 key-conflict bug.

        Args:
            text: Sample text (NOT NULL column).
            entities: JSON-serialisable entity list, stored in
                ``entities_json`` with non-ASCII characters kept as-is.
            confidence: Confidence score (NOT NULL column).
            source: Origin label of the annotation.
            extended: Write to ``samples_extended`` instead of ``samples``.

        Returns:
            The new row id when a row was inserted, or ``-1`` when an existing
            row was updated instead.

        Raises:
            sqlite3.IntegrityError: If the insert failed for a reason other
                than an existing ``(text, source)`` row (for example a
                ``None`` in a NOT NULL column). Previously such errors were
                swallowed and reported as ``-1`` although nothing was stored.
        """
        # The table name is chosen from two fixed literals, never from input.
        table = "samples_extended" if extended else "samples"
        entities_json = json.dumps(entities, ensure_ascii=False)
        now = time.time()
        try:
            with self.conn:
                cur = self.conn.execute(
                    f"INSERT INTO {table} (text, entities_json, confidence, source, created_at) VALUES (?, ?, ?, ?, ?)",
                    (text, entities_json, confidence, source, now)
                )
            return cur.lastrowid
        except sqlite3.IntegrityError:
            # Fall back to updating the conflicting row.
            with self.conn:
                cur = self.conn.execute(
                    f"UPDATE {table} SET entities_json=?, confidence=?, created_at=? WHERE text=? AND source=?",
                    (entities_json, confidence, now, text, source)
                )
                if cur.rowcount == 0:
                    # No row matched, so the insert did not fail on the
                    # UNIQUE constraint; surface the original error.
                    raise
            return -1

    def get_pending(self, limit: int = 50, min_confidence: float = 0.0) -> List[Dict]:
        """Return up to ``limit`` rows from ``samples``, lowest confidence first.

        Only rows with ``confidence >= min_confidence`` are returned. Each
        item has the keys ``id``, ``text``, ``entities`` (decoded from
        ``entities_json``) and ``confidence``. ``samples_extended`` is not
        consulted.
        """
        cursor = self.conn.execute(
            "SELECT id, text, entities_json, confidence FROM samples WHERE confidence >= ? ORDER BY confidence ASC LIMIT ?",
            (min_confidence, limit)
        )
        return [
            {"id": row[0], "text": row[1], "entities": json.loads(row[2]), "confidence": row[3]}
            for row in cursor.fetchall()
        ]
|
datamask_store/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""
|
|
2
|
+
datamask-store — 多用户隔离存储 (W6.11)
|
|
3
|
+
"""
|
|
4
|
+
import sqlite3
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Optional, List, Dict, Any
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Allowed user ids: 1-32 ASCII letters, digits, "_" or "-".
# Anchored with \A...\Z rather than ^...$ because "$" also matches just
# before a trailing newline, which let ids such as "alice\n" through.
USER_ID_PATTERN = re.compile(r'\A[a-zA-Z0-9_-]{1,32}\Z')


class UserStore:
    """One SQLite database per user under a common base directory (W6.11)."""

    def __init__(self, base_dir: str = "./data/users"):
        """Remember ``base_dir`` and create it (with parents) if missing."""
        self.base_dir = Path(base_dir)
        self.base_dir.mkdir(parents=True, exist_ok=True)

    def get_user_db_path(self, user_id: str) -> Path:
        """Return ``<base_dir>/<user_id>/data.db``, creating the user directory.

        The id is validated against ``USER_ID_PATTERN`` before it is used as
        a path component; this is the path-traversal defence.

        Raises:
            ValueError: If ``user_id`` is not a string or does not consist of
                1-32 characters from ``[a-zA-Z0-9_-]``.
        """
        # fullmatch requires the whole string to match, so nothing can trail
        # the allowed characters.
        if not isinstance(user_id, str) or not USER_ID_PATTERN.fullmatch(user_id):
            raise ValueError(f"Invalid user_id: {user_id}")
        user_dir = self.base_dir / user_id
        user_dir.mkdir(parents=True, exist_ok=True)
        return user_dir / "data.db"

    def connect(self, user_id: str) -> sqlite3.Connection:
        """Open the user's database in WAL mode with a 5 second busy timeout.

        The caller owns the returned connection and must close it.

        Raises:
            ValueError: If ``user_id`` is invalid (see ``get_user_db_path``).
            sqlite3.Error: If a PRAGMA fails; the connection is closed before
                the error propagates.
        """
        db_path = self.get_user_db_path(user_id)
        conn = sqlite3.connect(str(db_path), timeout=5.0)
        try:
            conn.execute("PRAGMA journal_mode=WAL")
            conn.execute("PRAGMA busy_timeout=5000")
        except sqlite3.Error:
            # Do not leak the handle when configuration fails.
            conn.close()
            raise
        return conn
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datamask-store
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: DataMask 存储层 — SQLite WAL 模式、多用户隔离、审计追溯、标注持久化
|
|
5
|
+
Author-email: TianluAudit <contact@datamask.cn>
|
|
6
|
+
License: Proprietary
|
|
7
|
+
Project-URL: Homepage, https://datamask.cn
|
|
8
|
+
Project-URL: Documentation, https://datamask.cn/docs
|
|
9
|
+
Keywords: data-masking,privacy,NER,entity-recognition,FPE
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Requires-Python: >=3.9
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
datamask_store/__init__.py,sha256=bCs24eL3ckzG_j9dhyNGRFR9f69n1a9KYV0hRApcJlw,555
|
|
2
|
+
datamask_store/audit_store.py,sha256=c3m4whx1wEyknoat1VFgxKf3Qu0xREikDgwoF28shxA,1799
|
|
3
|
+
datamask_store/labeling_store.py,sha256=L6pfLQmBvcv_cSkogY79frxg-IXZXoHEn7mXdLfGhMs,2710
|
|
4
|
+
datamask_store/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
datamask_store/user_store.py,sha256=64Z3Ad0HCr1CN5TpYAUZPNtX-ZRnikk8qPfL7qq5z4U,1121
|
|
6
|
+
datamask_store-1.0.0.dist-info/METADATA,sha256=MUwpI7veVXypdnIJYUUnhYYrPlL3i4tRhug8C-UCuak,695
|
|
7
|
+
datamask_store-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
8
|
+
datamask_store-1.0.0.dist-info/top_level.txt,sha256=hZChoVV5m36ZfQW16r2Nh08H5Lq5mDgEYxi3muTlAlM,15
|
|
9
|
+
datamask_store-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
datamask_store
|