xparse-client 0.2.19__py3-none-any.whl → 0.3.0b8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- example/1_basic_api_usage.py +198 -0
- example/2_async_job.py +210 -0
- example/3_local_workflow.py +300 -0
- example/4_advanced_workflow.py +327 -0
- example/README.md +128 -0
- example/config_example.json +95 -0
- tests/conftest.py +310 -0
- tests/unit/__init__.py +1 -0
- tests/unit/api/__init__.py +1 -0
- tests/unit/api/test_extract.py +232 -0
- tests/unit/api/test_local.py +231 -0
- tests/unit/api/test_parse.py +374 -0
- tests/unit/api/test_pipeline.py +369 -0
- tests/unit/api/test_workflows.py +108 -0
- tests/unit/connectors/test_ftp.py +525 -0
- tests/unit/connectors/test_local_connectors.py +324 -0
- tests/unit/connectors/test_milvus.py +368 -0
- tests/unit/connectors/test_qdrant.py +399 -0
- tests/unit/connectors/test_s3.py +598 -0
- tests/unit/connectors/test_smb.py +442 -0
- tests/unit/connectors/test_utils.py +335 -0
- tests/unit/models/test_local.py +54 -0
- tests/unit/models/test_pipeline_stages.py +144 -0
- tests/unit/models/test_workflows.py +55 -0
- tests/unit/test_base.py +437 -0
- tests/unit/test_client.py +110 -0
- tests/unit/test_config.py +160 -0
- tests/unit/test_exceptions.py +182 -0
- tests/unit/test_http.py +562 -0
- xparse_client/__init__.py +111 -20
- xparse_client/_base.py +188 -0
- xparse_client/_client.py +218 -0
- xparse_client/_config.py +221 -0
- xparse_client/_http.py +351 -0
- xparse_client/api/__init__.py +14 -0
- xparse_client/api/extract.py +109 -0
- xparse_client/api/local.py +225 -0
- xparse_client/api/parse.py +209 -0
- xparse_client/api/pipeline.py +134 -0
- xparse_client/api/workflows.py +204 -0
- xparse_client/connectors/__init__.py +45 -0
- xparse_client/connectors/_utils.py +138 -0
- xparse_client/connectors/destinations/__init__.py +45 -0
- xparse_client/connectors/destinations/base.py +116 -0
- xparse_client/connectors/destinations/local.py +91 -0
- xparse_client/connectors/destinations/milvus.py +229 -0
- xparse_client/connectors/destinations/qdrant.py +238 -0
- xparse_client/connectors/destinations/s3.py +163 -0
- xparse_client/connectors/sources/__init__.py +45 -0
- xparse_client/connectors/sources/base.py +74 -0
- xparse_client/connectors/sources/ftp.py +278 -0
- xparse_client/connectors/sources/local.py +176 -0
- xparse_client/connectors/sources/s3.py +232 -0
- xparse_client/connectors/sources/smb.py +259 -0
- xparse_client/exceptions.py +398 -0
- xparse_client/models/__init__.py +60 -0
- xparse_client/models/chunk.py +39 -0
- xparse_client/models/embed.py +62 -0
- xparse_client/models/extract.py +41 -0
- xparse_client/models/local.py +38 -0
- xparse_client/models/parse.py +132 -0
- xparse_client/models/pipeline.py +134 -0
- xparse_client/models/workflows.py +74 -0
- xparse_client-0.3.0b8.dist-info/METADATA +1075 -0
- xparse_client-0.3.0b8.dist-info/RECORD +68 -0
- {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b8.dist-info}/WHEEL +1 -1
- {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b8.dist-info}/licenses/LICENSE +1 -1
- {xparse_client-0.2.19.dist-info → xparse_client-0.3.0b8.dist-info}/top_level.txt +2 -0
- xparse_client/pipeline/__init__.py +0 -3
- xparse_client/pipeline/config.py +0 -129
- xparse_client/pipeline/destinations.py +0 -489
- xparse_client/pipeline/pipeline.py +0 -690
- xparse_client/pipeline/sources.py +0 -583
- xparse_client-0.2.19.dist-info/METADATA +0 -1050
- xparse_client-0.2.19.dist-info/RECORD +0 -11
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
"""FTP data source."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import ftplib
|
|
6
|
+
import logging
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
|
+
from io import BytesIO
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from ...exceptions import SourceError
|
|
12
|
+
from .._utils import match_file_pattern, normalize_wildcard_patterns, to_millis_timestamp
|
|
13
|
+
from .base import Source
|
|
14
|
+
|
|
15
|
+
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FtpSource(Source):
    """FTP data source.

    Reads files from an FTP server. The control connection is opened and
    authenticated in the constructor and reused by every method.

    Attributes:
        host: FTP host.
        port: FTP port.
        username: Login user name.
        password: Login password.
        pattern: File-matching patterns, as returned by
            ``normalize_wildcard_patterns``.
        recursive: Whether ``list_files`` descends into sub-directories.
        client: The underlying ``ftplib.FTP`` connection.

    Example:
        >>> source = FtpSource(
        ...     host="ftp.example.com",
        ...     port=21,
        ...     username="user",
        ...     password="pass",
        ...     pattern=["*.pdf"],
        ... )
        >>> files = source.list_files()
    """

    def __init__(
        self,
        host: str,
        port: int,
        username: str,
        password: str,
        pattern: list[str] | None = None,
        recursive: bool = False,
    ) -> None:
        """Connect to the FTP server and log in.

        Args:
            host: FTP host address.
            port: FTP port.
            username: Login user name.
            password: Login password.
            pattern: List of file-matching patterns.
            recursive: Whether to descend into sub-directories, default False.

        Raises:
            SourceError: If connecting or logging in fails.
        """
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self.pattern = normalize_wildcard_patterns(pattern)
        self.recursive = recursive

        try:
            self.client = ftplib.FTP()
            self.client.connect(self.host, self.port)
            self.client.login(self.username, self.password)
            logger.info(f"FTP 连接成功: {host}:{port}")
        except Exception as e:
            # connect() may have succeeded before login() failed; release the
            # control socket so a failed construction does not leak it.
            client = getattr(self, "client", None)
            if client is not None:
                try:
                    client.close()
                except Exception as close_error:
                    logger.debug(f"FTP 关闭连接失败: {close_error}")
            raise SourceError(
                f"FTP 连接失败: {e}",
                connector_type="ftp",
                operation="connect",
                details={"host": host, "port": port},
            ) from e

    def list_files(self) -> list[str]:
        """List the files on the FTP server that match ``self.pattern``.

        Listing starts at the session's current working directory, which is
        restored afterwards even when listing fails.

        Returns:
            File paths relative to the starting directory.

        Raises:
            SourceError: If listing fails.
        """
        try:
            files: list[str] = []
            current_dir = self.client.pwd()

            try:
                if self.recursive:
                    self._list_recursive("", files)
                else:
                    self._list_current_dir(files)
            finally:
                # Best effort: always try to return to the starting directory
                # so later RETR commands resolve relative paths correctly.
                try:
                    self.client.cwd(current_dir)
                except Exception as restore_error:
                    logger.debug(
                        f"FTP 恢复工作目录失败 {current_dir}: {restore_error}"
                    )

            logger.info(f"FTP 找到 {len(files)} 个文件")
            return files

        except SourceError:
            raise
        except Exception as e:
            raise SourceError(
                f"列出 FTP 文件失败: {e}",
                connector_type="ftp",
                operation="list_files",
            ) from e

    def _list_recursive(self, path: str, files: list[str]) -> None:
        """Recursively collect matching files below ``path``.

        Args:
            path: Directory to list, relative to the current working
                directory; an empty string means the current directory.
            files: Output list; matching relative paths are appended to it.

        Failures are logged as warnings instead of being raised, so one
        unreadable directory does not abort the whole listing.
        """
        try:
            original_dir = self.client.pwd()
        except Exception as e:
            logger.warning(f"FTP 列出路径失败 {path}: {e}")
            return

        if path:
            try:
                self.client.cwd(path)
            except Exception:
                # Not a directory, or not accessible: nothing to list.
                return

        try:
            self._walk_current_dir(path, files)
        except Exception as e:
            logger.warning(f"FTP 列出路径失败 {path}: {e}")
        finally:
            # Restore the directory on both the success and the failure path.
            try:
                self.client.cwd(original_dir)
            except Exception as e:
                logger.warning(f"FTP 列出路径失败 {path}: {e}")

    def _walk_current_dir(self, prefix: str, files: list[str]) -> None:
        """List the current working directory and descend into sub-directories.

        Args:
            prefix: Path of the current working directory relative to the
                listing root; used only to build the reported file paths.
            files: Output list; matching relative paths are appended to it.
        """
        for item_name, is_dir in self._get_dir_items():
            if item_name in (".", ".."):
                continue

            full_path = f"{prefix}/{item_name}" if prefix else item_name

            if not is_dir:
                relative_path = full_path.lstrip("/")
                if match_file_pattern(relative_path, self.pattern):
                    files.append(relative_path)
                continue

            # The session is already inside ``prefix``, so change into the
            # child by its own name. Using the accumulated relative path here
            # would resolve to ``prefix/prefix/child`` and fail.
            parent_dir = self.client.pwd()
            try:
                self.client.cwd(item_name)
            except Exception:
                continue
            try:
                self._walk_current_dir(full_path, files)
            except Exception as e:
                logger.warning(f"FTP 列出路径失败 {full_path}: {e}")
            finally:
                # If this fails the error propagates: continuing from an
                # unknown directory would produce wrong paths.
                self.client.cwd(parent_dir)

    def _list_current_dir(self, files: list[str]) -> None:
        """Collect matching files of the current directory (non-recursive).

        Args:
            files: Output list; matching file names are appended to it.
        """
        for item_name, is_dir in self._get_dir_items():
            if is_dir or item_name in (".", ".."):
                continue
            if match_file_pattern(item_name, self.pattern):
                files.append(item_name)

    def _get_dir_items(self) -> list[tuple[str, bool]]:
        """Return the entries of the current working directory.

        Tries MLSD first, then LIST, then NLST. Each strategy builds its own
        list, so a failed attempt never leaks partial results into the next.

        Returns:
            List of ``(name, is_directory)`` tuples, excluding "." and "..".
            Empty if every strategy fails.
        """
        for strategy in (self._items_via_mlsd, self._items_via_list):
            try:
                return strategy()
            except Exception as e:
                logger.debug(f"FTP {strategy.__name__} 失败: {e}")

        try:
            return self._items_via_nlst()
        except Exception as e:
            logger.debug(f"FTP _items_via_nlst 失败: {e}")
            return []

    def _items_via_mlsd(self) -> list[tuple[str, bool]]:
        """List the current directory using the machine-readable MLSD command."""
        items: list[tuple[str, bool]] = []
        for item_name, item_info in self.client.mlsd():
            if item_name in (".", ".."):
                continue
            entry_type = (item_info.get("type") or "").lower()
            # "cdir"/"pdir" describe the listed directory itself and its
            # parent; servers may report them under a real path name.
            if entry_type in ("cdir", "pdir"):
                continue
            items.append((item_name, entry_type == "dir"))
        return items

    def _items_via_list(self) -> list[tuple[str, bool]]:
        """List the current directory by parsing Unix-style LIST output."""
        lines: list[str] = []
        self.client.retrlines("LIST", lines.append)

        items: list[tuple[str, bool]] = []
        for line in lines:
            # Split off only the first eight columns so that repeated spaces
            # inside the file name are preserved.
            parts = line.split(None, 8)
            if len(parts) < 9:
                continue
            item_name = parts[8]
            if parts[0].startswith("l"):
                # Symlinks are shown as "name -> target"; keep the name only.
                item_name = item_name.split(" -> ", 1)[0]
            if item_name in (".", ".."):
                continue
            items.append((item_name, parts[0].startswith("d")))
        return items

    def _items_via_nlst(self) -> list[tuple[str, bool]]:
        """List the current directory with NLST, probing each name with CWD."""
        items: list[tuple[str, bool]] = []
        for item_name in self.client.nlst():
            if item_name in (".", ".."):
                continue
            # NLST returns names only; an entry counts as a directory when we
            # can change into it and back.
            try:
                self.client.cwd(item_name)
                self.client.cwd("..")
                is_dir = True
            except Exception:
                is_dir = False
            items.append((item_name, is_dir))
        return items

    def read_file(self, file_path: str) -> tuple[bytes, dict[str, Any]]:
        """Download a file and build its source metadata.

        Args:
            file_path: Path of the file on the server.

        Returns:
            ``(content, metadata)`` tuple. The metadata holds the ``ftp://``
            URL, a version / timestamps derived from the MDTM reply, and a
            record locator.

        Raises:
            SourceError: If the file cannot be read.
        """
        try:
            buffer = BytesIO()
            self.client.retrbinary(f"RETR {file_path}", buffer.write)

            # Modification time is optional: not every server supports MDTM.
            date_modified = None
            try:
                resp = self.client.sendcmd(f"MDTM {file_path}")
                parts = resp.split()
                if len(parts) == 2 and parts[0] == "213":
                    # The reply may carry fractional seconds
                    # ("YYYYMMDDHHMMSS.sss"); only whole seconds are parsed.
                    stamp = parts[1].split(".", 1)[0]
                    dt = datetime.strptime(stamp, "%Y%m%d%H%M%S")
                    date_modified = dt.replace(tzinfo=timezone.utc).timestamp()
            except Exception as e:
                logger.debug(f"FTP 获取文件时间失败 {file_path}: {e}")

            normalized_path = file_path.lstrip("/")
            version = to_millis_timestamp(date_modified)

            data_source = {
                "url": f"ftp://{self.host}:{self.port}/{normalized_path}",
                "version": version,
                "date_created": version,
                "date_modified": version,
                "record_locator": {
                    "server": f"{self.host}:{self.port}",
                    "protocol": "ftp",
                    "remote_file_path": normalized_path,
                },
            }

            return buffer.getvalue(), data_source

        except Exception as e:
            raise SourceError(
                f"读取 FTP 文件失败: {file_path}, {e}",
                connector_type="ftp",
                operation="read_file",
            ) from e

    def close(self) -> None:
        """Close the FTP connection.

        Sends QUIT first; if that fails, the socket is closed directly.
        Errors are logged at debug level and never raised.
        """
        try:
            self.client.quit()
        except Exception as quit_error:
            logger.debug(f"FTP QUIT 失败: {quit_error}")
            try:
                self.client.close()
            except Exception as close_error:
                logger.debug(f"FTP 关闭连接失败: {close_error}")

    def __repr__(self) -> str:
        return f"<FtpSource host={self.host}:{self.port}>"
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
# Public API of this module: only FtpSource is exported by a star-import.
__all__ = ["FtpSource"]
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
"""Local file-system data source."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from ...exceptions import SourceError
|
|
10
|
+
from .._utils import match_file_pattern, normalize_wildcard_patterns, to_millis_timestamp
|
|
11
|
+
from .base import Source
|
|
12
|
+
|
|
13
|
+
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class LocalSource(Source):
    """Local file-system data source.

    Reads files from a directory on the local machine.

    Attributes:
        directory: Resolved absolute path of the source directory.
        pattern: File-matching patterns, as returned by
            ``normalize_wildcard_patterns``.
        recursive: Whether sub-directories are included.

    Example:
        >>> source = LocalSource(
        ...     directory="./documents",
        ...     pattern=["*.pdf", "*.docx"],
        ...     recursive=True,
        ... )
        >>> files = source.list_files()
        >>> content, metadata = source.read_file(files[0])
    """

    def __init__(
        self,
        directory: str,
        pattern: list[str] | None = None,
        recursive: bool = False,
    ) -> None:
        """Validate the directory and store the listing options.

        Args:
            directory: Directory path.
            pattern: List of file-matching patterns, e.g. ["*.pdf", "*.docx"].
            recursive: Whether to include sub-directories, default False.

        Raises:
            SourceError: If the path does not exist or is not a directory.
        """
        self.directory = Path(directory).resolve()
        self.pattern = normalize_wildcard_patterns(pattern)
        self.recursive = recursive

        # Work out which (if any) validation failed, then raise in one place.
        problem = None
        if not self.directory.exists():
            problem = f"目录不存在: {directory}"
        elif not self.directory.is_dir():
            problem = f"路径不是目录: {directory}"

        if problem is not None:
            raise SourceError(
                problem,
                connector_type="local",
                operation="init",
            )

        logger.info(f"本地数据源初始化: {self.directory}")

    def list_files(self) -> list[str]:
        """List the files in the directory that match ``self.pattern``.

        Returns:
            Sorted list of paths relative to ``self.directory``.

        Raises:
            SourceError: If listing fails.
        """
        try:
            entries = self.directory.glob("**/*" if self.recursive else "*")

            # Keep regular files only, expressed relative to the root.
            candidates = [
                str(entry.relative_to(self.directory))
                for entry in entries
                if entry.is_file()
            ]

            # Apply the wildcard filter.
            matched = [
                name for name in candidates
                if match_file_pattern(name, self.pattern)
            ]

            logger.info(f"本地数据源找到 {len(matched)} 个文件")
            return sorted(matched)

        except SourceError:
            raise
        except Exception as e:
            raise SourceError(
                f"列出文件失败: {e}",
                connector_type="local",
                operation="list_files",
            ) from e

    def read_file(self, file_path: str) -> tuple[bytes, dict[str, Any]]:
        """Read a file below the source directory.

        Args:
            file_path: Path relative to ``self.directory``.

        Returns:
            ``(content, metadata)`` tuple. The metadata holds the ``file://``
            URL, a version derived from ``st_mtime_ns``, timestamps derived
            from ``st_ctime`` / ``st_mtime`` and a record locator.

        Raises:
            SourceError: If the path escapes the directory or reading fails.
        """
        full_path = (self.directory / file_path).resolve()

        # Safety check: the resolved path must stay inside the directory.
        try:
            full_path.relative_to(self.directory)
        except ValueError as e:
            raise SourceError(
                f"非法路径,不在目录内: {file_path}",
                connector_type="local",
                operation="read_file",
            ) from e

        try:
            with open(full_path, "rb") as handle:
                file_bytes = handle.read()

            # Gather file metadata.
            stats = full_path.stat()
            created_ms = to_millis_timestamp(stats.st_ctime)
            modified_ms = to_millis_timestamp(stats.st_mtime)
            locator = {
                "protocol": "file",
                "remote_file_path": str(full_path),
            }

            data_source = {
                "url": full_path.as_uri(),
                "version": str(int(stats.st_mtime_ns)),
                "date_created": created_ms,
                "date_modified": modified_ms,
                "record_locator": locator,
            }

            return file_bytes, data_source

        except Exception as e:
            # Pick the message matching the failure, then raise once.
            if isinstance(e, FileNotFoundError):
                message = f"文件不存在: {full_path}"
            elif isinstance(e, PermissionError):
                message = f"无权限读取文件: {full_path}"
            else:
                message = f"读取文件失败: {e}"
            raise SourceError(
                message,
                connector_type="local",
                operation="read_file",
            ) from e

    def __repr__(self) -> str:
        return f"<LocalSource directory={self.directory}>"
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
# Public API of this module: only LocalSource is exported by a star-import.
__all__ = ["LocalSource"]
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
"""S3/MinIO data source (boto3 is imported lazily)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from datetime import timezone
|
|
7
|
+
from email.utils import parsedate_to_datetime
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from ...exceptions import SourceError
|
|
11
|
+
from .._utils import match_file_pattern, normalize_wildcard_patterns, to_millis_timestamp
|
|
12
|
+
from .base import Source
|
|
13
|
+
|
|
14
|
+
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _get_boto3():
|
|
18
|
+
"""懒加载 boto3
|
|
19
|
+
|
|
20
|
+
Returns:
|
|
21
|
+
(boto3, Config) 元组
|
|
22
|
+
|
|
23
|
+
Raises:
|
|
24
|
+
ImportError: boto3 未安装
|
|
25
|
+
"""
|
|
26
|
+
try:
|
|
27
|
+
import boto3
|
|
28
|
+
from botocore.config import Config
|
|
29
|
+
|
|
30
|
+
return boto3, Config
|
|
31
|
+
except ImportError as e:
|
|
32
|
+
raise ImportError(
|
|
33
|
+
"使用 S3Source 需要安装 boto3: pip install xparse-client[s3]"
|
|
34
|
+
) from e
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class S3Source(Source):
    """S3/MinIO data source.

    Reads files from S3 or a compatible object store (MinIO, Alibaba Cloud
    OSS, Huawei Cloud OBS, ...). boto3 is imported lazily when an instance is
    created.

    Attributes:
        endpoint: S3 endpoint URL.
        bucket: Bucket name.
        prefix: Object key prefix.
        pattern: File-matching patterns, as returned by
            ``normalize_wildcard_patterns``.
        recursive: Whether keys below nested "/" levels are listed.
        client: The boto3 S3 client.

    Example:
        >>> source = S3Source(
        ...     endpoint="https://s3.amazonaws.com",
        ...     access_key="your-access-key",
        ...     secret_key="your-secret-key",
        ...     bucket="my-bucket",
        ...     prefix="documents/",
        ...     pattern=["*.pdf"],
        ... )
        >>> files = source.list_files()
        >>> content, metadata = source.read_file(files[0])
    """

    def __init__(
        self,
        endpoint: str,
        access_key: str,
        secret_key: str,
        bucket: str,
        prefix: str = "",
        region: str = "us-east-1",
        pattern: list[str] | None = None,
        recursive: bool = False,
    ) -> None:
        """Create the S3 client and verify that the bucket is reachable.

        Args:
            endpoint: S3 endpoint URL.
            access_key: Access key.
            secret_key: Secret key.
            bucket: Bucket name.
            prefix: Object key prefix (optional).
            region: Region, default us-east-1.
            pattern: List of file-matching patterns.
            recursive: Whether to list recursively, default False.

        Raises:
            ImportError: If boto3 is not installed.
            SourceError: If the ``head_bucket`` connectivity check fails.
        """
        boto3, Config = _get_boto3()

        self.endpoint = endpoint
        self.bucket = bucket
        self.prefix = prefix
        self.pattern = normalize_wildcard_patterns(pattern)
        self.recursive = recursive

        # Choose signature version / addressing style by endpoint.
        if endpoint == "https://textin-minio-api.ai.intsig.net":
            config = Config(signature_version="s3v4")
        elif endpoint.endswith("aliyuncs.com"):
            config = Config(signature_version="s3", s3={"addressing_style": "virtual"})
        elif endpoint.endswith("myhuaweicloud.com"):
            config = Config(signature_version="s3", s3={"addressing_style": "virtual"})
        else:
            config = Config(signature_version="s3v4", s3={"addressing_style": "virtual"})

        self.client = boto3.client(
            "s3",
            endpoint_url=endpoint,
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
            region_name=region,
            config=config,
        )

        # Verify the connection up front so misconfiguration fails early.
        try:
            self.client.head_bucket(Bucket=bucket)
            logger.info(f"S3 连接成功: {endpoint}/{bucket}")
        except Exception as e:
            raise SourceError(
                f"S3 连接失败: {e}",
                connector_type="s3",
                operation="connect",
                details={"endpoint": endpoint, "bucket": bucket},
            ) from e

    def list_files(self) -> list[str]:
        """List the object keys in the bucket that match ``self.pattern``.

        Returns:
            List of object keys.

        Raises:
            SourceError: If listing fails.
        """
        try:
            files = []
            paginator = self.client.get_paginator("list_objects_v2")

            params = {"Bucket": self.bucket}
            if self.prefix:
                params["Prefix"] = self.prefix
            if not self.recursive:
                # Non-recursive mode: a delimiter groups deeper keys away.
                params["Delimiter"] = "/"

            for page in paginator.paginate(**params):
                for obj in page.get("Contents", []):
                    key = obj["Key"]
                    # Skip directory markers and placeholder temp files.
                    if key.endswith(("/", "empty.tmp")):
                        continue
                    if match_file_pattern(key, self.pattern):
                        files.append(key)

            logger.info(f"S3 找到 {len(files)} 个文件")
            return files

        except SourceError:
            raise
        except Exception as e:
            raise SourceError(
                f"列出 S3 对象失败: {e}",
                connector_type="s3",
                operation="list_files",
            ) from e

    def read_file(self, file_path: str) -> tuple[bytes, dict[str, Any]]:
        """Download an object and build its source metadata.

        Args:
            file_path: Object key.

        Returns:
            ``(content, metadata)`` tuple. The metadata holds the ``s3://``
            URL, the ETag as version, timestamps derived from the
            ``Last-Modified`` header, and a record locator.

        Raises:
            SourceError: If the object does not exist or cannot be read.
        """
        try:
            response = self.client.get_object(Bucket=self.bucket, Key=file_path)
            body = response["Body"]
            try:
                file_bytes = body.read()
            finally:
                # Release the underlying HTTP connection even if read() fails.
                body.close()

            # Extract metadata from the HTTP response headers.
            headers = response.get("ResponseMetadata", {}).get("HTTPHeaders", {})
            version = headers.get("etag", "").strip('"')
            last_modified = headers.get("last-modified")
            server = headers.get("server", "unknown")

            date_modified = None
            if last_modified:
                try:
                    dt = parsedate_to_datetime(last_modified)
                    date_modified = dt.astimezone(timezone.utc).timestamp()
                except Exception as e:
                    # The timestamp is optional metadata; keep going without
                    # it, but leave a trace instead of failing silently.
                    logger.debug(f"S3 获取文件时间失败 {file_path}: {e}")

            normalized_key = file_path.lstrip("/")
            data_source = {
                "url": f"s3://{self.bucket}/{normalized_key}",
                "version": version,
                "date_created": to_millis_timestamp(date_modified),
                "date_modified": to_millis_timestamp(date_modified),
                "record_locator": {
                    "server": server,
                    "protocol": "s3",
                    "remote_file_path": normalized_key,
                },
            }

            return file_bytes, data_source

        except self.client.exceptions.NoSuchKey as e:
            raise SourceError(
                f"S3 对象不存在: {file_path}",
                connector_type="s3",
                operation="read_file",
            ) from e
        except Exception as e:
            raise SourceError(
                f"读取 S3 对象失败: {file_path}, {e}",
                connector_type="s3",
                operation="read_file",
            ) from e

    def __repr__(self) -> str:
        return f"<S3Source endpoint={self.endpoint} bucket={self.bucket}>"
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
# Public API of this module: only S3Source is exported by a star-import.
__all__ = ["S3Source"]
|