xparse-client 0.2.20__py3-none-any.whl → 0.3.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. example/1_basic_api_usage.py +198 -0
  2. example/2_async_job.py +210 -0
  3. example/3_local_workflow.py +300 -0
  4. example/4_advanced_workflow.py +327 -0
  5. example/README.md +128 -0
  6. example/config_example.json +95 -0
  7. tests/conftest.py +310 -0
  8. tests/unit/__init__.py +1 -0
  9. tests/unit/api/__init__.py +1 -0
  10. tests/unit/api/test_extract.py +232 -0
  11. tests/unit/api/test_local.py +231 -0
  12. tests/unit/api/test_parse.py +374 -0
  13. tests/unit/api/test_pipeline.py +369 -0
  14. tests/unit/api/test_workflows.py +108 -0
  15. tests/unit/connectors/test_ftp.py +525 -0
  16. tests/unit/connectors/test_local_connectors.py +324 -0
  17. tests/unit/connectors/test_milvus.py +368 -0
  18. tests/unit/connectors/test_qdrant.py +399 -0
  19. tests/unit/connectors/test_s3.py +598 -0
  20. tests/unit/connectors/test_smb.py +442 -0
  21. tests/unit/connectors/test_utils.py +335 -0
  22. tests/unit/models/test_local.py +54 -0
  23. tests/unit/models/test_pipeline_stages.py +144 -0
  24. tests/unit/models/test_workflows.py +55 -0
  25. tests/unit/test_base.py +437 -0
  26. tests/unit/test_client.py +110 -0
  27. tests/unit/test_config.py +160 -0
  28. tests/unit/test_exceptions.py +182 -0
  29. tests/unit/test_http.py +562 -0
  30. xparse_client/__init__.py +110 -20
  31. xparse_client/_base.py +179 -0
  32. xparse_client/_client.py +218 -0
  33. xparse_client/_config.py +221 -0
  34. xparse_client/_http.py +350 -0
  35. xparse_client/api/__init__.py +14 -0
  36. xparse_client/api/extract.py +109 -0
  37. xparse_client/api/local.py +185 -0
  38. xparse_client/api/parse.py +209 -0
  39. xparse_client/api/pipeline.py +132 -0
  40. xparse_client/api/workflows.py +204 -0
  41. xparse_client/connectors/__init__.py +45 -0
  42. xparse_client/connectors/_utils.py +138 -0
  43. xparse_client/connectors/destinations/__init__.py +45 -0
  44. xparse_client/connectors/destinations/base.py +116 -0
  45. xparse_client/connectors/destinations/local.py +91 -0
  46. xparse_client/connectors/destinations/milvus.py +229 -0
  47. xparse_client/connectors/destinations/qdrant.py +238 -0
  48. xparse_client/connectors/destinations/s3.py +163 -0
  49. xparse_client/connectors/sources/__init__.py +45 -0
  50. xparse_client/connectors/sources/base.py +74 -0
  51. xparse_client/connectors/sources/ftp.py +278 -0
  52. xparse_client/connectors/sources/local.py +176 -0
  53. xparse_client/connectors/sources/s3.py +232 -0
  54. xparse_client/connectors/sources/smb.py +259 -0
  55. xparse_client/exceptions.py +398 -0
  56. xparse_client/models/__init__.py +60 -0
  57. xparse_client/models/chunk.py +39 -0
  58. xparse_client/models/embed.py +62 -0
  59. xparse_client/models/extract.py +41 -0
  60. xparse_client/models/local.py +38 -0
  61. xparse_client/models/parse.py +136 -0
  62. xparse_client/models/pipeline.py +132 -0
  63. xparse_client/models/workflows.py +74 -0
  64. xparse_client-0.3.0b1.dist-info/METADATA +1075 -0
  65. xparse_client-0.3.0b1.dist-info/RECORD +68 -0
  66. {xparse_client-0.2.20.dist-info → xparse_client-0.3.0b1.dist-info}/WHEEL +1 -1
  67. {xparse_client-0.2.20.dist-info → xparse_client-0.3.0b1.dist-info}/licenses/LICENSE +1 -1
  68. {xparse_client-0.2.20.dist-info → xparse_client-0.3.0b1.dist-info}/top_level.txt +2 -0
  69. xparse_client/pipeline/__init__.py +0 -3
  70. xparse_client/pipeline/config.py +0 -163
  71. xparse_client/pipeline/destinations.py +0 -489
  72. xparse_client/pipeline/pipeline.py +0 -860
  73. xparse_client/pipeline/sources.py +0 -583
  74. xparse_client-0.2.20.dist-info/METADATA +0 -1050
  75. xparse_client-0.2.20.dist-info/RECORD +0 -11
@@ -0,0 +1,278 @@
1
+ """FTP 数据源"""
2
+
3
+ from __future__ import annotations
4
+
5
+ import ftplib
6
+ import logging
7
+ from datetime import datetime, timezone
8
+ from io import BytesIO
9
+ from typing import Any
10
+
11
+ from ...exceptions import SourceError
12
+ from .._utils import match_file_pattern, normalize_wildcard_patterns, to_millis_timestamp
13
+ from .base import Source
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class FtpSource(Source):
    """Source connector that reads files from an FTP server.

    The constructor connects and logs in immediately; every later call reuses
    that single ``ftplib.FTP`` session. Call :meth:`close` when done.

    Attributes:
        host: FTP host.
        port: FTP port.
        username: Login user name.
        password: Login password.
        pattern: Result of ``normalize_wildcard_patterns(pattern)``.
        recursive: Whether ``list_files`` descends into sub-directories.
        client: The underlying ``ftplib.FTP`` session.

    Example:
        >>> source = FtpSource(
        ...     host="ftp.example.com",
        ...     port=21,
        ...     username="user",
        ...     password="pass",
        ...     pattern=["*.pdf"],
        ... )
        >>> files = source.list_files()
    """

    def __init__(
        self,
        host: str,
        port: int,
        username: str,
        password: str,
        pattern: list[str] | None = None,
        recursive: bool = False,
    ) -> None:
        """Connect to the FTP server and log in.

        Args:
            host: FTP host address.
            port: FTP port.
            username: Login user name.
            password: Login password.
            pattern: List of file-name wildcard patterns.
            recursive: Whether to list sub-directories too. Defaults to False.

        Raises:
            SourceError: The connection or the login failed.
        """
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self.pattern = normalize_wildcard_patterns(pattern)
        self.recursive = recursive

        client = None
        try:
            client = ftplib.FTP()
            client.connect(self.host, self.port)
            client.login(self.username, self.password)
        except Exception as e:
            # connect() may have succeeded before login() failed; release the
            # socket instead of leaking it. Cleanup is best-effort only.
            if client is not None:
                try:
                    client.close()
                except Exception:
                    pass
            raise SourceError(
                f"FTP 连接失败: {e}",
                connector_type="ftp",
                operation="connect",
                details={"host": host, "port": port},
            ) from e

        self.client = client
        logger.info(f"FTP 连接成功: {host}:{port}")

    def list_files(self) -> list[str]:
        """List the files visible from the session's current directory.

        Returns:
            Paths relative to the current working directory that match
            ``self.pattern``.

        Raises:
            SourceError: Listing failed.
        """
        try:
            files: list[str] = []
            current_dir = self.client.pwd()

            try:
                if self.recursive:
                    self._list_recursive("", files)
                else:
                    self._list_current_dir(files)
            finally:
                # Always try to return to the starting directory, even when
                # listing failed, so later relative RETR commands still resolve.
                try:
                    self.client.cwd(current_dir)
                except Exception as e:
                    logger.debug(f"FTP 恢复工作目录失败 {current_dir}: {e}")

            logger.info(f"FTP 找到 {len(files)} 个文件")
            return files

        except SourceError:
            raise
        except Exception as e:
            raise SourceError(
                f"列出 FTP 文件失败: {e}",
                connector_type="ftp",
                operation="list_files",
            ) from e

    def _list_recursive(self, path: str, files: list[str]) -> None:
        """Collect matching files below ``path`` into ``files``.

        Args:
            path: Directory to start from, relative to the session's current
                directory. An empty string means the current directory.
            files: Output list; matching relative paths are appended in place.

        Failures are logged as warnings and never raised, so one unreadable
        directory does not abort the whole listing.
        """
        try:
            original_dir = self.client.pwd()
            if path:
                try:
                    self.client.cwd(path)
                except Exception:
                    return

            try:
                self._walk_current_dir(path, files)
            finally:
                # Restore the directory even if walking raised part-way.
                self.client.cwd(original_dir)

        except Exception as e:
            logger.warning(f"FTP 列出路径失败 {path}: {e}")

    def _walk_current_dir(self, prefix: str, files: list[str]) -> None:
        """Walk the session's current directory, naming entries under ``prefix``.

        Sub-directories are entered by their own name (relative to the
        directory being walked), not by the accumulated ``prefix``; entering
        by ``prefix`` from inside the parent would resolve to a non-existent
        path and silently skip everything deeper than one level.
        """
        parent_dir = self.client.pwd()

        for item_name, is_dir in self._get_dir_items():
            if item_name in (".", ".."):
                continue

            full_path = f"{prefix}/{item_name}" if prefix else item_name

            if not is_dir:
                relative_path = full_path.lstrip("/")
                if match_file_pattern(relative_path, self.pattern):
                    files.append(relative_path)
                continue

            try:
                self.client.cwd(item_name)
            except Exception:
                # Directory cannot be entered; skip it and keep going.
                continue

            try:
                self._walk_current_dir(full_path, files)
            except Exception as e:
                logger.warning(f"FTP 列出路径失败 {full_path}: {e}")
            finally:
                # Return via the recorded path rather than "..", which can
                # land elsewhere when the entry was a symlinked directory.
                self.client.cwd(parent_dir)

    def _list_current_dir(self, files: list[str]) -> None:
        """Append matching files of the current directory (non-recursive)."""
        for item_name, is_dir in self._get_dir_items():
            if is_dir or item_name in (".", ".."):
                continue

            if match_file_pattern(item_name, self.pattern):
                files.append(item_name)

    def _get_dir_items(self) -> list[tuple[str, bool]]:
        """Return the entries of the session's current directory.

        Tries MLSD first, then a Unix-style LIST, then NLST. Each strategy
        fills its own list so a strategy that fails part-way cannot leave
        duplicate entries behind for the next one.

        Returns:
            List of ``(name, is_directory)`` tuples; empty if every strategy
            failed.
        """
        # Strategy 1: MLSD gives machine-readable facts per entry.
        try:
            items: list[tuple[str, bool]] = []
            for item_name, item_info in self.client.mlsd():
                entry_type = (item_info.get("type") or "").lower()
                # "cdir"/"pdir" describe the listed directory and its parent;
                # they are not children even when reported under a real name.
                if item_name in (".", "..") or entry_type in ("cdir", "pdir"):
                    continue
                items.append((item_name, entry_type == "dir"))
            return items
        except Exception:
            pass

        # Strategy 2: parse a Unix "ls -l" style LIST reply.
        try:
            lines: list[str] = []
            self.client.retrlines("LIST", lines.append)
            items = []
            for line in lines:
                # Split off only the first 8 columns so that repeated spaces
                # inside the file name are preserved.
                parts = line.split(None, 8)
                if len(parts) < 9:
                    continue
                item_name = parts[8]
                if parts[0].startswith("l") and " -> " in item_name:
                    # Symlink lines read "name -> target"; keep the name only.
                    item_name = item_name.split(" -> ", 1)[0]
                if item_name in (".", ".."):
                    continue
                items.append((item_name, parts[0].startswith("d")))
            return items
        except Exception:
            pass

        # Strategy 3: NLST returns names only, so probe each one with CWD to
        # find out whether it is a directory.
        items = []
        try:
            for item_name in self.client.nlst():
                if item_name in (".", ".."):
                    continue
                try:
                    self.client.cwd(item_name)
                    self.client.cwd("..")
                    is_dir = True
                except Exception:
                    is_dir = False
                items.append((item_name, is_dir))
        except Exception:
            pass

        return items

    def read_file(self, file_path: str) -> tuple[bytes, dict[str, Any]]:
        """Download one file and build its metadata record.

        Args:
            file_path: Path of the file on the server.

        Returns:
            ``(content, data_source)`` where ``data_source`` holds ``url``,
            ``version``, ``date_created``, ``date_modified`` and
            ``record_locator``. The three time-based fields all come from the
            MDTM reply passed through ``to_millis_timestamp``.

        Raises:
            SourceError: The download failed.
        """
        try:
            buffer = BytesIO()
            self.client.retrbinary(f"RETR {file_path}", buffer.write)

            # Modification time is optional; failing to get it is not fatal.
            date_modified = None
            try:
                resp = self.client.sendcmd(f"MDTM {file_path}")
                parts = resp.split()
                if len(parts) == 2 and parts[0] == "213":
                    # Keep only YYYYMMDDHHMMSS; some servers append ".sss".
                    dt = datetime.strptime(parts[1][:14], "%Y%m%d%H%M%S")
                    # MDTM values are interpreted as UTC here.
                    date_modified = dt.replace(tzinfo=timezone.utc).timestamp()
            except Exception as e:
                logger.debug(f"FTP 获取文件时间失败 {file_path}: {e}")

            normalized_path = file_path.lstrip("/")
            version = to_millis_timestamp(date_modified)

            data_source = {
                "url": f"ftp://{self.host}:{self.port}/{normalized_path}",
                "version": version,
                "date_created": version,
                "date_modified": version,
                "record_locator": {
                    "server": f"{self.host}:{self.port}",
                    "protocol": "ftp",
                    "remote_file_path": normalized_path,
                },
            }

            return buffer.getvalue(), data_source

        except Exception as e:
            raise SourceError(
                f"读取 FTP 文件失败: {file_path}, {e}",
                connector_type="ftp",
                operation="read_file",
            ) from e

    def close(self) -> None:
        """Close the FTP session, never raising.

        Tries a polite QUIT first and falls back to dropping the socket.
        """
        try:
            self.client.quit()
        except Exception:
            try:
                self.client.close()
            except Exception:
                pass

    def __repr__(self) -> str:
        return f"<FtpSource host={self.host}:{self.port}>"
276
+
277
+
278
+ __all__ = ["FtpSource"]
@@ -0,0 +1,176 @@
1
+ """本地文件系统数据源"""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from ...exceptions import SourceError
10
+ from .._utils import match_file_pattern, normalize_wildcard_patterns, to_millis_timestamp
11
+ from .base import Source
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class LocalSource(Source):
    """Source connector that reads files from a local directory.

    Attributes:
        directory: Resolved absolute ``Path`` of the root directory.
        pattern: Result of ``normalize_wildcard_patterns(pattern)``.
        recursive: Whether sub-directories are included in listings.

    Example:
        >>> source = LocalSource(
        ...     directory="./documents",
        ...     pattern=["*.pdf", "*.docx"],
        ...     recursive=True,
        ... )
        >>> files = source.list_files()
        >>> content, metadata = source.read_file(files[0])
    """

    def __init__(
        self,
        directory: str,
        pattern: list[str] | None = None,
        recursive: bool = False,
    ) -> None:
        """Validate the directory and store the listing options.

        Args:
            directory: Path of the root directory.
            pattern: List of wildcard patterns, e.g. ``["*.pdf", "*.docx"]``.
            recursive: Whether to include sub-directories. Defaults to False.

        Raises:
            SourceError: The path does not exist or is not a directory.
        """
        self.directory = Path(directory).resolve()
        self.pattern = normalize_wildcard_patterns(pattern)
        self.recursive = recursive

        if not self.directory.exists():
            raise SourceError(
                f"目录不存在: {directory}",
                connector_type="local",
                operation="init",
            )

        if not self.directory.is_dir():
            raise SourceError(
                f"路径不是目录: {directory}",
                connector_type="local",
                operation="init",
            )

        logger.info(f"本地数据源初始化: {self.directory}")

    def list_files(self) -> list[str]:
        """List the files under the root directory.

        Returns:
            Sorted list of paths relative to ``self.directory`` that match
            ``self.pattern``.

        Raises:
            SourceError: Listing failed.
        """
        try:
            glob_pattern = "**/*" if self.recursive else "*"

            # Collect regular files as paths relative to the root.
            all_files = [
                str(entry.relative_to(self.directory))
                for entry in self.directory.glob(glob_pattern)
                if entry.is_file()
            ]

            # Keep only the paths accepted by the configured patterns.
            files = [f for f in all_files if match_file_pattern(f, self.pattern)]

            logger.info(f"本地数据源找到 {len(files)} 个文件")
            return sorted(files)

        except SourceError:
            raise
        except Exception as e:
            raise SourceError(
                f"列出文件失败: {e}",
                connector_type="local",
                operation="list_files",
            ) from e

    def read_file(self, file_path: str) -> tuple[bytes, dict[str, Any]]:
        """Read one file and build its metadata record.

        Args:
            file_path: Path of the file relative to the root directory.

        Returns:
            ``(content, data_source)`` where ``data_source`` holds ``url``,
            ``version`` (mtime in nanoseconds, as a string), ``date_created``,
            ``date_modified`` and ``record_locator``.

        Raises:
            SourceError: The path escapes the root directory, the file is
                missing or unreadable, or reading failed for another reason.
        """
        full_path = (self.directory / file_path).resolve()

        # Safety check: after resolving ".." and symlinks the target must
        # still live inside the root directory.
        try:
            full_path.relative_to(self.directory)
        except ValueError as e:
            raise SourceError(
                f"非法路径,不在目录内: {file_path}",
                connector_type="local",
                operation="read_file",
            ) from e

        try:
            with open(full_path, "rb") as f:
                file_bytes = f.read()

            stats = full_path.stat()
            # st_ctime is the inode-change time on POSIX, not the creation
            # time. Use the real birth time where the platform exposes it and
            # fall back to st_ctime elsewhere.
            date_created = getattr(stats, "st_birthtime", stats.st_ctime)
            date_modified = stats.st_mtime
            version = str(stats.st_mtime_ns)

            data_source = {
                "url": full_path.as_uri(),
                "version": version,
                "date_created": to_millis_timestamp(date_created),
                "date_modified": to_millis_timestamp(date_modified),
                "record_locator": {
                    "protocol": "file",
                    "remote_file_path": str(full_path),
                },
            }

            return file_bytes, data_source

        except FileNotFoundError as e:
            raise SourceError(
                f"文件不存在: {full_path}",
                connector_type="local",
                operation="read_file",
            ) from e
        except PermissionError as e:
            raise SourceError(
                f"无权限读取文件: {full_path}",
                connector_type="local",
                operation="read_file",
            ) from e
        except Exception as e:
            raise SourceError(
                f"读取文件失败: {e}",
                connector_type="local",
                operation="read_file",
            ) from e

    def __repr__(self) -> str:
        return f"<LocalSource directory={self.directory}>"
174
+
175
+
176
+ __all__ = ["LocalSource"]
@@ -0,0 +1,232 @@
1
+ """S3/MinIO 数据源(懒加载 boto3)"""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from datetime import timezone
7
+ from email.utils import parsedate_to_datetime
8
+ from typing import Any
9
+
10
+ from ...exceptions import SourceError
11
+ from .._utils import match_file_pattern, normalize_wildcard_patterns, to_millis_timestamp
12
+ from .base import Source
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ def _get_boto3():
18
+ """懒加载 boto3
19
+
20
+ Returns:
21
+ (boto3, Config) 元组
22
+
23
+ Raises:
24
+ ImportError: boto3 未安装
25
+ """
26
+ try:
27
+ import boto3
28
+ from botocore.config import Config
29
+
30
+ return boto3, Config
31
+ except ImportError as e:
32
+ raise ImportError(
33
+ "使用 S3Source 需要安装 boto3: pip install xparse-client[s3]"
34
+ ) from e
35
+
36
+
37
class S3Source(Source):
    """Source connector that reads objects from S3 or a compatible store.

    Intended for S3 and S3-compatible services (MinIO, Alibaba Cloud OSS,
    Huawei Cloud OBS, ...). boto3 is imported lazily, when an instance is
    created.

    Attributes:
        endpoint: S3 endpoint URL.
        bucket: Bucket name.
        prefix: Object key prefix used when listing.
        pattern: Result of ``normalize_wildcard_patterns(pattern)``.
        recursive: Whether listings go below the first ``/`` after the prefix.
        client: The boto3 S3 client.

    Example:
        >>> source = S3Source(
        ...     endpoint="https://s3.amazonaws.com",
        ...     access_key="your-access-key",
        ...     secret_key="your-secret-key",
        ...     bucket="my-bucket",
        ...     prefix="documents/",
        ...     pattern=["*.pdf"],
        ... )
        >>> files = source.list_files()
        >>> content, metadata = source.read_file(files[0])
    """

    def __init__(
        self,
        endpoint: str,
        access_key: str,
        secret_key: str,
        bucket: str,
        prefix: str = "",
        region: str = "us-east-1",
        pattern: list[str] | None = None,
        recursive: bool = False,
    ) -> None:
        """Create the S3 client and verify that the bucket is reachable.

        Args:
            endpoint: S3 endpoint URL.
            access_key: Access key id.
            secret_key: Secret access key.
            bucket: Bucket name.
            prefix: Object key prefix (optional).
            region: Region name. Defaults to ``us-east-1``.
            pattern: List of wildcard patterns.
            recursive: Whether to list below sub-"directories". Defaults to
                False.

        Raises:
            ImportError: boto3 is not installed.
            SourceError: The bucket check (``head_bucket``) failed.
        """
        boto3, Config = _get_boto3()

        self.endpoint = endpoint
        self.bucket = bucket
        self.prefix = prefix
        self.pattern = normalize_wildcard_patterns(pattern)
        self.recursive = recursive

        # Pick the signature version / addressing style per vendor. Compare
        # on a normalised copy so a trailing "/" or upper-case host name does
        # not select the wrong branch; the client still gets the original URL.
        normalized_endpoint = endpoint.rstrip("/").lower()
        if normalized_endpoint == "https://textin-minio-api.ai.intsig.net":
            config = Config(signature_version="s3v4")
        elif normalized_endpoint.endswith(("aliyuncs.com", "myhuaweicloud.com")):
            config = Config(signature_version="s3", s3={"addressing_style": "virtual"})
        else:
            config = Config(signature_version="s3v4", s3={"addressing_style": "virtual"})

        self.client = boto3.client(
            "s3",
            endpoint_url=endpoint,
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
            region_name=region,
            config=config,
        )

        # Verify the connection and bucket access up front.
        try:
            self.client.head_bucket(Bucket=bucket)
            logger.info(f"S3 连接成功: {endpoint}/{bucket}")
        except Exception as e:
            raise SourceError(
                f"S3 连接失败: {e}",
                connector_type="s3",
                operation="connect",
                details={"endpoint": endpoint, "bucket": bucket},
            ) from e

    def list_files(self) -> list[str]:
        """List the object keys in the bucket.

        Returns:
            Object keys that match ``self.pattern``. Keys ending in ``/`` or
            ``empty.tmp`` are skipped.

        Raises:
            SourceError: Listing failed.
        """
        try:
            files: list[str] = []
            paginator = self.client.get_paginator("list_objects_v2")

            params = {"Bucket": self.bucket}
            if self.prefix:
                params["Prefix"] = self.prefix
            if not self.recursive:
                # A delimiter makes the service group deeper keys instead of
                # returning them under "Contents".
                params["Delimiter"] = "/"

            for page in paginator.paginate(**params):
                for obj in page.get("Contents", ()):
                    key = obj["Key"]
                    # Skip directory markers and placeholder files.
                    if key.endswith(("/", "empty.tmp")):
                        continue
                    if match_file_pattern(key, self.pattern):
                        files.append(key)

            logger.info(f"S3 找到 {len(files)} 个文件")
            return files

        except SourceError:
            raise
        except Exception as e:
            raise SourceError(
                f"列出 S3 对象失败: {e}",
                connector_type="s3",
                operation="list_files",
            ) from e

    def read_file(self, file_path: str) -> tuple[bytes, dict[str, Any]]:
        """Download one object and build its metadata record.

        Args:
            file_path: Object key.

        Returns:
            ``(content, data_source)`` where ``data_source`` holds ``url``,
            ``version`` (the ETag header without quotes), ``date_created``,
            ``date_modified`` (both from the Last-Modified header) and
            ``record_locator``.

        Raises:
            SourceError: The object does not exist or the download failed.
        """
        try:
            response = self.client.get_object(Bucket=self.bucket, Key=file_path)
            body = response["Body"]
            try:
                file_bytes = body.read()
            finally:
                # Release the HTTP stream even when read() fails part-way.
                close = getattr(body, "close", None)
                if callable(close):
                    close()

            # Metadata is taken from the raw HTTP response headers.
            headers = response.get("ResponseMetadata", {}).get("HTTPHeaders", {})
            version = headers.get("etag", "").strip('"')
            last_modified = headers.get("last-modified")
            server = headers.get("server", "unknown")

            date_modified = None
            if last_modified:
                try:
                    dt = parsedate_to_datetime(last_modified)
                    date_modified = dt.astimezone(timezone.utc).timestamp()
                except Exception:
                    # An unparsable date header only costs the timestamps.
                    pass

            normalized_key = file_path.lstrip("/")
            data_source = {
                "url": f"s3://{self.bucket}/{normalized_key}",
                "version": version,
                "date_created": to_millis_timestamp(date_modified),
                "date_modified": to_millis_timestamp(date_modified),
                "record_locator": {
                    "server": server,
                    "protocol": "s3",
                    "remote_file_path": normalized_key,
                },
            }

            return file_bytes, data_source

        except self.client.exceptions.NoSuchKey as e:
            raise SourceError(
                f"S3 对象不存在: {file_path}",
                connector_type="s3",
                operation="read_file",
            ) from e
        except Exception as e:
            raise SourceError(
                f"读取 S3 对象失败: {file_path}, {e}",
                connector_type="s3",
                operation="read_file",
            ) from e

    def __repr__(self) -> str:
        return f"<S3Source endpoint={self.endpoint} bucket={self.bucket}>"
230
+
231
+
232
+ __all__ = ["S3Source"]