nutricare-data-packages 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nutricare_data_packages-0.1.0/PKG-INFO +59 -0
- nutricare_data_packages-0.1.0/README.md +48 -0
- nutricare_data_packages-0.1.0/nutricare_data_packages/__init__.py +47 -0
- nutricare_data_packages-0.1.0/nutricare_data_packages/metadata_storage.py +237 -0
- nutricare_data_packages-0.1.0/nutricare_data_packages/mongo_repository_base.py +98 -0
- nutricare_data_packages-0.1.0/nutricare_data_packages/storage_base.py +193 -0
- nutricare_data_packages-0.1.0/nutricare_data_packages.egg-info/PKG-INFO +59 -0
- nutricare_data_packages-0.1.0/nutricare_data_packages.egg-info/SOURCES.txt +11 -0
- nutricare_data_packages-0.1.0/nutricare_data_packages.egg-info/dependency_links.txt +1 -0
- nutricare_data_packages-0.1.0/nutricare_data_packages.egg-info/requires.txt +1 -0
- nutricare_data_packages-0.1.0/nutricare_data_packages.egg-info/top_level.txt +1 -0
- nutricare_data_packages-0.1.0/pyproject.toml +27 -0
- nutricare_data_packages-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nutricare-data-packages
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Nutricare data access package
|
|
5
|
+
Author: Nutricare Team
|
|
6
|
+
Project-URL: Homepage, https://example.com
|
|
7
|
+
Project-URL: Repository, https://example.com/nutricare-data-packages
|
|
8
|
+
Requires-Python: >=3.9
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: pymongo>=4.0.0
|
|
11
|
+
|
|
12
|
+
# nutricare-data-packages
|
|
13
|
+
|
|
14
|
+
Nutricare 的数据访问基础包,封装 MongoDB 仓储与本地存储抽象基类。
|
|
15
|
+
|
|
16
|
+
## 安装
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install nutricare-data-packages
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## 快速使用
|
|
23
|
+
|
|
24
|
+
```python
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
|
|
27
|
+
from nutricare_data_packages import MetadataStore, StorageBase
|
|
28
|
+
|
|
29
|
+
store = MetadataStore(
|
|
30
|
+
mongodb_url="mongodb://localhost:27017",
|
|
31
|
+
db_name="nutricare",
|
|
32
|
+
auth_source="admin",
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
class ReportStorage(StorageBase):
|
|
36
|
+
@property
|
|
37
|
+
def storage_subdir(self) -> str:
|
|
38
|
+
return "reports"
|
|
39
|
+
|
|
40
|
+
def _write_impl(self, relative_path: str, data: bytes) -> Path:
|
|
41
|
+
target = self.resolve_path(relative_path)
|
|
42
|
+
target.parent.mkdir(parents=True, exist_ok=True)
|
|
43
|
+
target.write_bytes(data)
|
|
44
|
+
return target
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## 本地构建与发布
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
python -m pip install --upgrade build twine
|
|
51
|
+
python -m build
|
|
52
|
+
python -m twine check dist/*
|
|
53
|
+
python -m twine upload dist/*
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## 说明
|
|
57
|
+
|
|
58
|
+
- 包名(pip 安装名):`nutricare-data-packages`
|
|
59
|
+
- 导入名(Python import):`nutricare_data_packages`
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# nutricare-data-packages
|
|
2
|
+
|
|
3
|
+
Nutricare 的数据访问基础包,封装 MongoDB 仓储与本地存储抽象基类。
|
|
4
|
+
|
|
5
|
+
## 安装
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install nutricare-data-packages
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## 快速使用
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
from nutricare_data_packages import MetadataStore, StorageBase
|
|
17
|
+
|
|
18
|
+
store = MetadataStore(
|
|
19
|
+
mongodb_url="mongodb://localhost:27017",
|
|
20
|
+
db_name="nutricare",
|
|
21
|
+
auth_source="admin",
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
class ReportStorage(StorageBase):
|
|
25
|
+
@property
|
|
26
|
+
def storage_subdir(self) -> str:
|
|
27
|
+
return "reports"
|
|
28
|
+
|
|
29
|
+
def _write_impl(self, relative_path: str, data: bytes) -> Path:
|
|
30
|
+
target = self.resolve_path(relative_path)
|
|
31
|
+
target.parent.mkdir(parents=True, exist_ok=True)
|
|
32
|
+
target.write_bytes(data)
|
|
33
|
+
return target
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## 本地构建与发布
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
python -m pip install --upgrade build twine
|
|
40
|
+
python -m build
|
|
41
|
+
python -m twine check dist/*
|
|
42
|
+
python -m twine upload dist/*
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## 说明
|
|
46
|
+
|
|
47
|
+
- 包名(pip 安装名):`nutricare-data-packages`
|
|
48
|
+
- 导入名(Python import):`nutricare_data_packages`
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Nutricare data packages."""
|
|
2
|
+
|
|
3
|
+
from .metadata_storage import (
|
|
4
|
+
INDEX_KEYS,
|
|
5
|
+
KEY_CONTENT_HASH,
|
|
6
|
+
KEY_CREATED_AT,
|
|
7
|
+
KEY_MD5,
|
|
8
|
+
KEY_METADATA,
|
|
9
|
+
KEY_MIME_TYPE,
|
|
10
|
+
KEY_PHASH,
|
|
11
|
+
KEY_RELATIVE_PATH,
|
|
12
|
+
KEY_SIZE,
|
|
13
|
+
KEY_STATUS,
|
|
14
|
+
KEY_STORAGE_SUBDIR,
|
|
15
|
+
KEY_TASK_ID,
|
|
16
|
+
KEY_TICKET_ID,
|
|
17
|
+
KEY_UPDATED_AT,
|
|
18
|
+
METADATA_COLLECTION,
|
|
19
|
+
MetadataStore,
|
|
20
|
+
make_document,
|
|
21
|
+
make_filter,
|
|
22
|
+
)
|
|
23
|
+
from .mongo_repository_base import MongoRepositoryBase
|
|
24
|
+
from .storage_base import StorageBase
|
|
25
|
+
|
|
26
|
+
__all__ = [
|
|
27
|
+
"MongoRepositoryBase",
|
|
28
|
+
"MetadataStore",
|
|
29
|
+
"StorageBase",
|
|
30
|
+
"METADATA_COLLECTION",
|
|
31
|
+
"INDEX_KEYS",
|
|
32
|
+
"KEY_STORAGE_SUBDIR",
|
|
33
|
+
"KEY_RELATIVE_PATH",
|
|
34
|
+
"KEY_TICKET_ID",
|
|
35
|
+
"KEY_TASK_ID",
|
|
36
|
+
"KEY_SIZE",
|
|
37
|
+
"KEY_CONTENT_HASH",
|
|
38
|
+
"KEY_MD5",
|
|
39
|
+
"KEY_PHASH",
|
|
40
|
+
"KEY_MIME_TYPE",
|
|
41
|
+
"KEY_STATUS",
|
|
42
|
+
"KEY_METADATA",
|
|
43
|
+
"KEY_CREATED_AT",
|
|
44
|
+
"KEY_UPDATED_AT",
|
|
45
|
+
"make_document",
|
|
46
|
+
"make_filter",
|
|
47
|
+
]
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
# 元数据存储 - 定义 doc_metadata 表结构、存储与更新方法
|
|
2
|
+
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
from typing import Any, Optional
|
|
5
|
+
|
|
6
|
+
from pymongo.collection import Collection
|
|
7
|
+
from pymongo.cursor import Cursor
|
|
8
|
+
from pymongo.results import DeleteResult, InsertOneResult, UpdateResult
|
|
9
|
+
|
|
10
|
+
from .mongo_repository_base import MongoRepositoryBase
|
|
11
|
+
|
|
12
|
+
# ---------------------------------------------------------------------------
|
|
13
|
+
# 集合名与表结构:字段名常量与说明
|
|
14
|
+
# ---------------------------------------------------------------------------
|
|
15
|
+
|
|
16
|
+
# Name of the MongoDB collection that holds document metadata.
METADATA_COLLECTION = "doc_metadata"

# Lookup key used when upserting: storage sub-directory + relative path.
KEY_STORAGE_SUBDIR = "storage_subdir"
KEY_RELATIVE_PATH = "relative_path"

# Identifiers attached to a document.
KEY_TICKET_ID = "ticket_id"
KEY_TASK_ID = "task_id"

# Content descriptors.
KEY_SIZE = "size"
KEY_CONTENT_HASH = "content_hash"
KEY_MD5 = "md5"
KEY_PHASH = "phash"
KEY_MIME_TYPE = "mime_type"

# Free-form state and payload.
KEY_STATUS = "status"
KEY_METADATA = "metadata"

# Timestamps.
KEY_CREATED_AT = "created_at"
KEY_UPDATED_AT = "updated_at"

# Field pair recommended for a unique index: (storage_subdir, relative_path).
INDEX_KEYS = [KEY_STORAGE_SUBDIR, KEY_RELATIVE_PATH]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _now_utc() -> datetime:
    """Return the current time as a timezone-aware UTC ``datetime``."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def make_document(
    *,
    storage_subdir: str,
    relative_path: str,
    size: Optional[int] = None,
    content_hash: Optional[str] = None,
    md5: Optional[str] = None,
    phash: Optional[str] = None,
    mime_type: Optional[str] = None,
    ticket_id: Optional[str] = None,
    task_id: Optional[str] = None,
    status: Optional[str] = None,
    metadata: Optional[dict[str, Any]] = None,
    created_at: Optional[datetime] = None,
    updated_at: Optional[datetime] = None,
) -> dict[str, Any]:
    """
    Build a document matching the metadata collection layout (without ``_id``).

    ``storage_subdir`` is stripped; ``relative_path`` has backslashes turned
    into forward slashes, surrounding whitespace removed and leading slashes
    dropped. ``created_at`` / ``updated_at`` default to one shared "now" (UTC).

    Optional text fields are included only when they are not ``None`` and not
    blank after stripping. ``size`` is included whenever it is not ``None``
    and ``metadata`` only when it is a ``dict``.

    :return: the assembled document as a plain ``dict``.
    """
    now = _now_utc()
    doc: dict[str, Any] = {
        KEY_STORAGE_SUBDIR: storage_subdir.strip(),
        KEY_RELATIVE_PATH: relative_path.replace("\\", "/").strip().lstrip("/"),
        KEY_CREATED_AT: created_at or now,
        KEY_UPDATED_AT: updated_at or now,
    }
    if size is not None:
        doc[KEY_SIZE] = size

    # Kept in the same order the keys were originally inserted.
    optional_text_fields = (
        (KEY_CONTENT_HASH, content_hash),
        (KEY_MD5, md5),
        (KEY_PHASH, phash),
        (KEY_MIME_TYPE, mime_type),
        (KEY_TICKET_ID, ticket_id),
        (KEY_TASK_ID, task_id),
        (KEY_STATUS, status),
    )
    for key, value in optional_text_fields:
        if value is None:
            continue
        # Coerce once and reuse the result: the blank check and the stored
        # value now agree, so a non-str value (e.g. an integer id) is stored
        # as text instead of raising AttributeError on ``.strip()``.
        text = str(value).strip()
        if text:
            doc[key] = text

    if isinstance(metadata, dict):
        doc[KEY_METADATA] = metadata
    return doc
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def make_filter(storage_subdir: str, relative_path: str) -> dict[str, Any]:
    """Build the query filter that selects a record by sub-directory and relative path."""
    subdir = storage_subdir.strip()
    # Backslashes become forward slashes; surrounding whitespace and leading
    # slashes are removed.
    path = relative_path.replace("\\", "/").strip().lstrip("/")
    return {KEY_STORAGE_SUBDIR: subdir, KEY_RELATIVE_PATH: path}
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class MetadataStore:
    """
    Metadata store: create/read/update/delete helpers over ``METADATA_COLLECTION``.

    A ``MongoRepositoryBase`` is instantiated internally from the connection
    parameters and its collection is used for every operation.
    """

    def __init__(
        self,
        *,
        mongodb_url: str,
        db_name: str,
        auth_source: Optional[str] = None,
    ) -> None:
        """
        Create the underlying repository bound to ``METADATA_COLLECTION``.

        :param mongodb_url: MongoDB connection URL (stripped before use).
        :param db_name: database name (stripped before use).
        :param auth_source: optional authentication database, passed through.
        :raises ValueError: if ``mongodb_url`` or ``db_name`` is empty or blank.
        """
        if not mongodb_url or not str(mongodb_url).strip():
            raise ValueError("mongodb_url 不能为空")
        if not db_name or not str(db_name).strip():
            raise ValueError("db_name 不能为空")
        self._repo = MongoRepositoryBase(
            mongodb_url=mongodb_url.strip(),
            db_name=db_name.strip(),
            collection_name=METADATA_COLLECTION,
            auth_source=auth_source,
        )
        self._coll = self._repo.collection

    @property
    def collection(self) -> Collection:
        """The underlying collection object."""
        return self._coll

    # The return annotation is quoted so it is not evaluated when the class
    # is defined: the ``X | Y`` operator on classes needs Python 3.10+, while
    # the package declares support for Python 3.9.
    def save(
        self,
        *,
        storage_subdir: str,
        relative_path: str,
        size: Optional[int] = None,
        content_hash: Optional[str] = None,
        md5: Optional[str] = None,
        phash: Optional[str] = None,
        mime_type: Optional[str] = None,
        ticket_id: Optional[str] = None,
        task_id: Optional[str] = None,
        status: Optional[str] = None,
        metadata: Optional[dict[str, Any]] = None,
    ) -> "InsertOneResult | UpdateResult":
        """
        Store one metadata record.

        If a record with the same (storage_subdir, relative_path) exists it is
        updated: every field produced by ``make_document`` except
        ``created_at`` is ``$set``, including a fresh ``updated_at``.
        Otherwise the new document is inserted with both timestamps.

        :return: ``UpdateResult`` when an existing record was updated,
            ``InsertOneResult`` when a new one was inserted.
        """
        doc = make_document(
            storage_subdir=storage_subdir,
            relative_path=relative_path,
            size=size,
            content_hash=content_hash,
            md5=md5,
            phash=phash,
            mime_type=mime_type,
            ticket_id=ticket_id,
            task_id=task_id,
            status=status,
            metadata=metadata,
        )
        flt = make_filter(storage_subdir, relative_path)
        # NOTE(review): find-then-write is not atomic. Two concurrent saves
        # of a new path can both insert unless a unique index on INDEX_KEYS
        # exists. Left unchanged so the insert path keeps returning
        # InsertOneResult.
        if self._coll.find_one(flt):
            # ``doc`` already carries a fresh updated_at; only created_at
            # must be kept from the stored record.
            changes = {k: v for k, v in doc.items() if k != KEY_CREATED_AT}
            return self._coll.update_one(flt, {"$set": changes})
        return self._coll.insert_one(doc)

    def update_by_path(
        self,
        storage_subdir: str,
        relative_path: str,
        *,
        size_bytes: Optional[int] = None,
        content_hash: Optional[str] = None,
        md5: Optional[str] = None,
        phash: Optional[str] = None,
        mime_type: Optional[str] = None,
        ticket_id: Optional[str] = None,
        task_id: Optional[str] = None,
        status: Optional[str] = None,
        metadata: Optional[dict[str, Any]] = None,
    ) -> UpdateResult:
        """
        Update selected fields of the record at (storage_subdir, relative_path).

        ``updated_at`` is always refreshed. Arguments left as ``None`` are not
        touched. A text argument given as an empty string stores ``None``;
        any other text is stored stripped.
        """
        flt = make_filter(storage_subdir, relative_path)
        set_payload: dict[str, Any] = {KEY_UPDATED_AT: _now_utc()}
        if size_bytes is not None:
            set_payload[KEY_SIZE] = size_bytes

        text_fields = (
            (KEY_CONTENT_HASH, content_hash),
            (KEY_MD5, md5),
            (KEY_PHASH, phash),
            (KEY_MIME_TYPE, mime_type),
            (KEY_TICKET_ID, ticket_id),
            (KEY_TASK_ID, task_id),
            (KEY_STATUS, status),
        )
        for key, value in text_fields:
            if value is not None:
                set_payload[key] = value.strip() if value else None

        if metadata is not None:
            set_payload[KEY_METADATA] = metadata
        return self._coll.update_one(flt, {"$set": set_payload})

    def update_one(
        self, filter: dict[str, Any], update: dict[str, Any], **kwargs: Any
    ) -> UpdateResult:
        """Pass straight through to the collection's ``update_one`` for custom updates."""
        return self._coll.update_one(filter, update, **kwargs)

    def find_by_path(
        self, storage_subdir: str, relative_path: str
    ) -> Optional[dict[str, Any]]:
        """Fetch the single record at (storage_subdir, relative_path), if any."""
        return self._coll.find_one(make_filter(storage_subdir, relative_path))

    # Quoted for the same reason as ``save``: the subscript would otherwise
    # be evaluated at definition time.
    def list_by_storage_subdir(
        self, storage_subdir: str, **kwargs: Any
    ) -> "Cursor[dict[str, Any]]":
        """Query every record under ``storage_subdir``; returns the cursor from ``find``."""
        return self._coll.find({KEY_STORAGE_SUBDIR: storage_subdir.strip()}, **kwargs)

    def delete_by_path(
        self, storage_subdir: str, relative_path: str
    ) -> DeleteResult:
        """Delete the single record at (storage_subdir, relative_path)."""
        return self._coll.delete_one(make_filter(storage_subdir, relative_path))
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# MongoDB 连接与数据库访问(基类:子类需传入连接参数与集合名)
|
|
2
|
+
|
|
3
|
+
from typing import Any, Optional
|
|
4
|
+
|
|
5
|
+
from pymongo import MongoClient
|
|
6
|
+
from pymongo.collection import Collection
|
|
7
|
+
from pymongo.cursor import Cursor
|
|
8
|
+
from pymongo.results import DeleteResult, InsertOneResult, InsertManyResult, UpdateResult
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class MongoRepositoryBase:
    """
    Base class for access to a single MongoDB collection.

    When instantiating (directly or from a subclass) the caller must supply:
    - connection parameters: ``mongodb_url``, ``db_name`` and optionally
      ``auth_source``
    - the collection name: ``collection_name``

    Instances can be used as context managers; leaving the ``with`` block
    closes the client.

    Example:
        class MyRepo(MongoRepositoryBase):
            def __init__(self):
                super().__init__(
                    mongodb_url="mongodb://localhost:27017",
                    db_name="mydb",
                    collection_name="my_coll",
                    auth_source="admin",  # optional
                )
    """

    def __init__(
        self,
        *,
        mongodb_url: str,
        db_name: str,
        collection_name: str,
        auth_source: Optional[str] = None,
    ) -> None:
        """
        Create the client and bind the target database and collection.

        :raises ValueError: if ``mongodb_url``, ``db_name`` or
            ``collection_name`` is empty or blank.
        """
        if not mongodb_url or not str(mongodb_url).strip():
            raise ValueError("mongodb_url 不能为空")
        if not db_name or not str(db_name).strip():
            raise ValueError("db_name 不能为空")
        if not collection_name or not str(collection_name).strip():
            raise ValueError("collection_name 不能为空")
        kwargs: dict[str, Any] = {}
        # authSource is only passed when a non-blank value was given.
        auth = str(auth_source).strip() if auth_source else ""
        if auth:
            kwargs["authSource"] = auth
        self._client = MongoClient(mongodb_url.strip(), **kwargs)
        self._db = self._client[db_name.strip()]
        self._coll = self._db[collection_name.strip()]

    @property
    def collection(self) -> Collection:
        """The bound collection."""
        return self._coll

    def close(self) -> None:
        """Close the underlying client so its connections are released."""
        self._client.close()

    def __enter__(self) -> "MongoRepositoryBase":
        """Return the repository itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type: Any, exc: Any, tb: Any) -> None:
        """Close the client on exit; exceptions are not suppressed."""
        self.close()

    def find(self, filter: Optional[dict[str, Any]] = None, **kwargs: Any) -> Cursor:
        """Query multiple documents; a ``None`` (or empty) filter matches everything."""
        return self._coll.find(filter or {}, **kwargs)

    def find_one(
        self, filter: Optional[dict[str, Any]] = None, **kwargs: Any
    ) -> Optional[dict[str, Any]]:
        """Query a single document."""
        return self._coll.find_one(filter or {}, **kwargs)

    def insert_one(self, document: dict[str, Any], **kwargs: Any) -> InsertOneResult:
        """Insert one document."""
        return self._coll.insert_one(document, **kwargs)

    def insert_many(
        self, documents: list[dict[str, Any]], **kwargs: Any
    ) -> InsertManyResult:
        """Insert several documents."""
        return self._coll.insert_many(documents, **kwargs)

    def update_one(
        self, filter: dict[str, Any], update: dict[str, Any], **kwargs: Any
    ) -> UpdateResult:
        """Update one document."""
        return self._coll.update_one(filter, update, **kwargs)

    def update_many(
        self, filter: dict[str, Any], update: dict[str, Any], **kwargs: Any
    ) -> UpdateResult:
        """Update several documents."""
        return self._coll.update_many(filter, update, **kwargs)

    def delete_one(self, filter: dict[str, Any], **kwargs: Any) -> DeleteResult:
        """Delete one document."""
        return self._coll.delete_one(filter, **kwargs)

    def delete_many(self, filter: dict[str, Any], **kwargs: Any) -> DeleteResult:
        """Delete several documents."""
        return self._coll.delete_many(filter, **kwargs)

    def aggregate(self, pipeline: list[dict[str, Any]], **kwargs: Any) -> Any:
        """Run an aggregation pipeline."""
        return self._coll.aggregate(pipeline, **kwargs)
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
# 本地存储基类 - 约定存储位置与规则,子类继承后按约束执行
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
import re
|
|
5
|
+
import threading
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
from pymongo.collection import Collection
|
|
10
|
+
from pymongo.results import InsertOneResult, UpdateResult
|
|
11
|
+
|
|
12
|
+
from .metadata_storage import MetadataStore
|
|
13
|
+
|
|
14
|
+
# Root directory under which every storage sub-directory lives.
DOCUMENTS_DIR = "/data/documents"

# Characters allowed in a relative path by default: ASCII letters, digits,
# underscore, hyphen, dot and forward slash. The pattern alone does not
# reject ".."; StorageBase._check_relative_path tests for that separately.
_RELATIVE_PATH_PATTERN = re.compile(r"^[a-zA-Z0-9_.\-/]+$")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class StorageBase(ABC):
    """
    Top-level base class for local file storage.

    Conventions:
    1. Location: the root is ``DOCUMENTS_DIR``; a subclass picks its
       sub-directory through ``storage_subdir``.
    2. Rules: every path is relative to "root/storage_subdir". Path traversal
       (``..``), absolute paths and illegal characters are rejected.
    3. Subclasses must resolve paths through ``self._resolve(relative_path)``
       and must not bypass these checks when writing files.
    4. Subclasses must implement ``storage_subdir``.
    5. The base class creates the metadata store for ``METADATA_COLLECTION``;
       subclasses reach it through ``metadata_collection`` or
       ``_metadata_repo``.
    """

    @property
    @abstractmethod
    def storage_subdir(self) -> str:
        """Name of the storage sub-directory; subclasses must return a non-empty value."""
        ...

    def __init__(
        self,
        *,
        mongodb_url: str,
        db_name: str,
        auth_source: Optional[str] = None,
    ) -> None:
        """Create the storage root directory, then the metadata store."""
        self._root = self._init_root()
        self._metadata_repo = MetadataStore(
            mongodb_url=mongodb_url,
            db_name=db_name,
            auth_source=auth_source,
        )

    @property
    def metadata_collection(self) -> Collection:
        """The metadata collection used to create, read, update and delete file metadata."""
        return self._metadata_repo.collection

    def _init_root(self) -> Path:
        """
        Compute and create the storage root ``DOCUMENTS_DIR/storage_subdir``.

        :raises ValueError: if ``storage_subdir`` is empty or blank.
        """
        if not (self.storage_subdir and self.storage_subdir.strip()):
            raise ValueError("子类必须指定非空的 storage_subdir")
        root = Path(DOCUMENTS_DIR).resolve() / self.storage_subdir.strip()
        root.mkdir(parents=True, exist_ok=True)
        return root

    @staticmethod
    def _normalize_relative_path(relative_path: str) -> str:
        """Use forward slashes, trim surrounding whitespace, drop leading slashes."""
        return relative_path.replace("\\", "/").strip().lstrip("/")

    def _resolve(self, relative_path: str) -> Path:
        """
        Resolve a relative path to an absolute one and enforce the storage rules.

        Rules:
        - The path must stay under the current storage root (``_root``) and
          must not contain ``..`` or be absolute.
        - Only letters, digits, underscore, hyphen, dot and slash are allowed
          (subclasses may override ``_check_relative_path`` to loosen or
          tighten this).

        :param relative_path: path relative to the storage root, e.g.
            "20260228_013509_pubmed/main.py"
        :return: the resolved absolute path
        :raises ValueError: if the path is illegal (escapes the root or
            violates the naming rules)
        """
        relative_path = self._normalize_relative_path(relative_path)
        self._check_relative_path(relative_path)
        full = (self._root / relative_path).resolve()
        try:
            # Re-check after resolution so the resolved target cannot end up
            # outside the root.
            full.relative_to(self._root.resolve())
        except ValueError:
            raise ValueError("路径不允许超出存储根目录") from None
        return full

    def _check_relative_path(self, relative_path: str) -> None:
        """
        Validate a relative path against the storage rules. Subclasses may override.

        Default: reject empty paths, any occurrence of "..", absolute-looking
        paths, and characters outside the safe set.

        :raises ValueError: if any rule is violated.
        """
        if not relative_path:
            raise ValueError("相对路径不能为空")
        if ".." in relative_path:
            raise ValueError("相对路径不允许包含 ..")
        if Path(relative_path).is_absolute() or relative_path.startswith("/"):
            raise ValueError("相对路径不能为绝对路径")
        # fullmatch, not match: with match() the pattern's "$" also matches
        # just before a trailing newline, letting "name\n" slip through.
        if not _RELATIVE_PATH_PATTERN.fullmatch(relative_path):
            raise ValueError("相对路径仅允许字母、数字、下划线、短横线、点和斜杠")

    def get_root(self) -> Path:
        """Return the absolute path of the current storage root."""
        return self._root

    def resolve_path(self, relative_path: str) -> Path:
        """
        Public entry point: resolve a relative path to an absolute one,
        applying the storage rules. Subclasses should use this or
        ``_resolve`` to obtain target paths when reading or writing.
        """
        return self._resolve(relative_path)

    # The return annotation is quoted so it is not evaluated when the class
    # is defined: the ``X | Y`` operator on classes needs Python 3.10+, while
    # the package declares support for Python 3.9.
    def write_metadata(
        self,
        *,
        relative_path: str,
        size: Optional[int] = None,
        content_hash: Optional[str] = None,
        md5: Optional[str] = None,
        phash: Optional[str] = None,
        mime_type: Optional[str] = None,
        status: Optional[str] = None,
        metadata: Optional[dict[str, object]] = None,
    ) -> "InsertOneResult | UpdateResult":
        """
        Write one file-metadata record, keyed by storage_subdir + relative_path.

        ``ticket_id`` and ``task_id`` are read from attributes of the current
        thread object. If ``ticket_id`` is missing and the thread name starts
        with "collection-", the remainder of the name is used as
        ``ticket_id``, and also as ``task_id`` when that is missing too.

        :raises ValueError: if ``ticket_id`` or ``task_id`` is still missing,
            or if the relative path is invalid.
        """
        thread = threading.current_thread()
        ticket_id = getattr(thread, "ticket_id", None)
        task_id = getattr(thread, "task_id", None)

        ticket_missing = not ticket_id or not str(ticket_id).strip()
        if ticket_missing and thread.name and thread.name.startswith("collection-"):
            fallback_id = thread.name.split("collection-", 1)[1].strip()
            if fallback_id:
                ticket_id = fallback_id
                if not task_id or not str(task_id).strip():
                    task_id = ticket_id

        if not ticket_id or not str(ticket_id).strip():
            raise ValueError("线程上下文缺少 ticket_id")
        if not task_id or not str(task_id).strip():
            raise ValueError("线程上下文缺少 task_id")

        normalized = self._normalize_relative_path(relative_path)
        self._check_relative_path(normalized)
        return self._metadata_repo.save(
            storage_subdir=self.storage_subdir,
            relative_path=normalized,
            size=size,
            content_hash=content_hash,
            md5=md5,
            phash=phash,
            mime_type=mime_type,
            ticket_id=ticket_id,
            task_id=task_id,
            status=status,
            metadata=metadata,
        )

    def _ensure_metadata_exists(self, relative_path: str) -> None:
        """
        Verify before a file write that a metadata record exists for the path.

        :raises ValueError: if the path is invalid or no record is found.
        """
        normalized = self._normalize_relative_path(relative_path)
        self._check_relative_path(normalized)
        doc = self._metadata_repo.find_by_path(self.storage_subdir, normalized)
        if not doc:
            raise ValueError(f"未找到对应元数据,禁止写入文件: {normalized}")

    def write(self, relative_path: str, data: bytes) -> Path:
        """
        Write ``data`` to ``relative_path``.

        The base class first checks that the matching metadata record exists,
        then delegates the actual write to the subclass.

        :param relative_path: path relative to the storage root
        :param data: bytes to write
        :return: whatever ``_write_impl`` returns (the absolute path written)
        """
        self._ensure_metadata_exists(relative_path)
        return self._write_impl(relative_path, data)

    @abstractmethod
    def _write_impl(self, relative_path: str, data: bytes) -> Path:
        """Subclass hook: perform the actual write (the metadata check is already done)."""
        ...

    def exists(self, relative_path: str) -> bool:
        """
        Report whether the file or directory at ``relative_path`` exists.

        An invalid path yields ``False`` rather than an exception.
        Subclasses may use this as-is or override it.
        """
        try:
            return self._resolve(relative_path).exists()
        except ValueError:
            return False
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nutricare-data-packages
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Nutricare data access package
|
|
5
|
+
Author: Nutricare Team
|
|
6
|
+
Project-URL: Homepage, https://example.com
|
|
7
|
+
Project-URL: Repository, https://example.com/nutricare-data-packages
|
|
8
|
+
Requires-Python: >=3.9
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: pymongo>=4.0.0
|
|
11
|
+
|
|
12
|
+
# nutricare-data-packages
|
|
13
|
+
|
|
14
|
+
Nutricare 的数据访问基础包,封装 MongoDB 仓储与本地存储抽象基类。
|
|
15
|
+
|
|
16
|
+
## 安装
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install nutricare-data-packages
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## 快速使用
|
|
23
|
+
|
|
24
|
+
```python
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
|
|
27
|
+
from nutricare_data_packages import MetadataStore, StorageBase
|
|
28
|
+
|
|
29
|
+
store = MetadataStore(
|
|
30
|
+
mongodb_url="mongodb://localhost:27017",
|
|
31
|
+
db_name="nutricare",
|
|
32
|
+
auth_source="admin",
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
class ReportStorage(StorageBase):
|
|
36
|
+
@property
|
|
37
|
+
def storage_subdir(self) -> str:
|
|
38
|
+
return "reports"
|
|
39
|
+
|
|
40
|
+
def _write_impl(self, relative_path: str, data: bytes) -> Path:
|
|
41
|
+
target = self.resolve_path(relative_path)
|
|
42
|
+
target.parent.mkdir(parents=True, exist_ok=True)
|
|
43
|
+
target.write_bytes(data)
|
|
44
|
+
return target
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## 本地构建与发布
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
python -m pip install --upgrade build twine
|
|
51
|
+
python -m build
|
|
52
|
+
python -m twine check dist/*
|
|
53
|
+
python -m twine upload dist/*
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## 说明
|
|
57
|
+
|
|
58
|
+
- 包名(pip 安装名):`nutricare-data-packages`
|
|
59
|
+
- 导入名(Python import):`nutricare_data_packages`
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
nutricare_data_packages/__init__.py
|
|
4
|
+
nutricare_data_packages/metadata_storage.py
|
|
5
|
+
nutricare_data_packages/mongo_repository_base.py
|
|
6
|
+
nutricare_data_packages/storage_base.py
|
|
7
|
+
nutricare_data_packages.egg-info/PKG-INFO
|
|
8
|
+
nutricare_data_packages.egg-info/SOURCES.txt
|
|
9
|
+
nutricare_data_packages.egg-info/dependency_links.txt
|
|
10
|
+
nutricare_data_packages.egg-info/requires.txt
|
|
11
|
+
nutricare_data_packages.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pymongo>=4.0.0
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
nutricare_data_packages
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "nutricare-data-packages"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Nutricare data access package"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
authors = [
|
|
12
|
+
{ name = "Nutricare Team" }
|
|
13
|
+
]
|
|
14
|
+
dependencies = [
|
|
15
|
+
"pymongo>=4.0.0"
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.urls]
|
|
19
|
+
Homepage = "https://example.com"
|
|
20
|
+
Repository = "https://example.com/nutricare-data-packages"
|
|
21
|
+
|
|
22
|
+
[tool.setuptools]
|
|
23
|
+
include-package-data = true
|
|
24
|
+
|
|
25
|
+
[tool.setuptools.packages.find]
|
|
26
|
+
where = ["."]
|
|
27
|
+
include = ["nutricare_data_packages*"]
|