xparse-client 0.2.11__py3-none-any.whl → 0.3.0b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- example/1_basic_api_usage.py +198 -0
- example/2_async_job.py +210 -0
- example/3_local_workflow.py +300 -0
- example/4_advanced_workflow.py +327 -0
- example/README.md +128 -0
- example/config_example.json +95 -0
- tests/conftest.py +310 -0
- tests/unit/__init__.py +1 -0
- tests/unit/api/__init__.py +1 -0
- tests/unit/api/test_extract.py +232 -0
- tests/unit/api/test_local.py +231 -0
- tests/unit/api/test_parse.py +374 -0
- tests/unit/api/test_pipeline.py +369 -0
- tests/unit/api/test_workflows.py +108 -0
- tests/unit/connectors/test_ftp.py +525 -0
- tests/unit/connectors/test_local_connectors.py +324 -0
- tests/unit/connectors/test_milvus.py +368 -0
- tests/unit/connectors/test_qdrant.py +399 -0
- tests/unit/connectors/test_s3.py +598 -0
- tests/unit/connectors/test_smb.py +442 -0
- tests/unit/connectors/test_utils.py +335 -0
- tests/unit/models/test_local.py +54 -0
- tests/unit/models/test_pipeline_stages.py +144 -0
- tests/unit/models/test_workflows.py +55 -0
- tests/unit/test_base.py +437 -0
- tests/unit/test_client.py +110 -0
- tests/unit/test_config.py +160 -0
- tests/unit/test_exceptions.py +182 -0
- tests/unit/test_http.py +562 -0
- xparse_client/__init__.py +111 -20
- xparse_client/_base.py +179 -0
- xparse_client/_client.py +218 -0
- xparse_client/_config.py +221 -0
- xparse_client/_http.py +350 -0
- xparse_client/api/__init__.py +14 -0
- xparse_client/api/extract.py +109 -0
- xparse_client/api/local.py +215 -0
- xparse_client/api/parse.py +209 -0
- xparse_client/api/pipeline.py +134 -0
- xparse_client/api/workflows.py +204 -0
- xparse_client/connectors/__init__.py +45 -0
- xparse_client/connectors/_utils.py +138 -0
- xparse_client/connectors/destinations/__init__.py +45 -0
- xparse_client/connectors/destinations/base.py +116 -0
- xparse_client/connectors/destinations/local.py +91 -0
- xparse_client/connectors/destinations/milvus.py +229 -0
- xparse_client/connectors/destinations/qdrant.py +238 -0
- xparse_client/connectors/destinations/s3.py +163 -0
- xparse_client/connectors/sources/__init__.py +45 -0
- xparse_client/connectors/sources/base.py +74 -0
- xparse_client/connectors/sources/ftp.py +278 -0
- xparse_client/connectors/sources/local.py +176 -0
- xparse_client/connectors/sources/s3.py +232 -0
- xparse_client/connectors/sources/smb.py +259 -0
- xparse_client/exceptions.py +398 -0
- xparse_client/models/__init__.py +60 -0
- xparse_client/models/chunk.py +39 -0
- xparse_client/models/embed.py +62 -0
- xparse_client/models/extract.py +41 -0
- xparse_client/models/local.py +38 -0
- xparse_client/models/parse.py +136 -0
- xparse_client/models/pipeline.py +134 -0
- xparse_client/models/workflows.py +74 -0
- xparse_client-0.3.0b3.dist-info/METADATA +1075 -0
- xparse_client-0.3.0b3.dist-info/RECORD +68 -0
- {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/WHEEL +1 -1
- {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/licenses/LICENSE +1 -1
- {xparse_client-0.2.11.dist-info → xparse_client-0.3.0b3.dist-info}/top_level.txt +1 -0
- example/run_pipeline.py +0 -506
- example/run_pipeline_test.py +0 -458
- xparse_client/pipeline/__init__.py +0 -3
- xparse_client/pipeline/config.py +0 -129
- xparse_client/pipeline/destinations.py +0 -487
- xparse_client/pipeline/pipeline.py +0 -622
- xparse_client/pipeline/sources.py +0 -585
- xparse_client-0.2.11.dist-info/METADATA +0 -1050
- xparse_client-0.2.11.dist-info/RECORD +0 -13
|
@@ -1,585 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
# -*- encoding: utf-8 -*-
|
|
3
|
-
|
|
4
|
-
import json
|
|
5
|
-
import logging
|
|
6
|
-
import boto3
|
|
7
|
-
import ftplib
|
|
8
|
-
|
|
9
|
-
from abc import ABC, abstractmethod
|
|
10
|
-
from datetime import datetime, timezone
|
|
11
|
-
from email.utils import parsedate_to_datetime
|
|
12
|
-
from fnmatch import fnmatch
|
|
13
|
-
from pathlib import Path
|
|
14
|
-
from typing import List, Dict, Any, Tuple, Optional
|
|
15
|
-
|
|
16
|
-
from smb.SMBConnection import SMBConnection
|
|
17
|
-
from botocore.config import Config
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
logger = logging.getLogger(__name__)
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
def _normalize_wildcard_patterns(pattern: Optional[List[str]]) -> Optional[List[str]]:
|
|
24
|
-
"""规范化通配符模式列表
|
|
25
|
-
|
|
26
|
-
Args:
|
|
27
|
-
pattern: 通配符模式列表,如果为 None 或空列表则返回 None(表示匹配所有文件)
|
|
28
|
-
|
|
29
|
-
Returns:
|
|
30
|
-
通配符模式列表,如果 pattern 是 None、空列表或包含 "*" 则返回 None(表示匹配所有文件)
|
|
31
|
-
"""
|
|
32
|
-
if pattern is None or not pattern:
|
|
33
|
-
return None # None 表示匹配所有文件
|
|
34
|
-
|
|
35
|
-
if not isinstance(pattern, list):
|
|
36
|
-
raise ValueError(f"pattern 类型错误: {type(pattern)}")
|
|
37
|
-
|
|
38
|
-
# 过滤空字符串并去除空格
|
|
39
|
-
normalized = [p.strip() for p in pattern if p and p.strip()]
|
|
40
|
-
|
|
41
|
-
if not normalized:
|
|
42
|
-
return None
|
|
43
|
-
|
|
44
|
-
# 如果包含 "*",直接返回 None(匹配所有文件,减少后续开销)
|
|
45
|
-
if '*' in normalized:
|
|
46
|
-
return None
|
|
47
|
-
|
|
48
|
-
return normalized
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
def _match_file_extension(file_path: str, wildcard_patterns: Optional[List[str]]) -> bool:
|
|
52
|
-
"""检查文件路径是否匹配通配符模式
|
|
53
|
-
|
|
54
|
-
Args:
|
|
55
|
-
file_path: 文件路径
|
|
56
|
-
wildcard_patterns: 已规范化的通配符模式列表(如 ['*.pdf', '*.docx'])
|
|
57
|
-
|
|
58
|
-
Returns:
|
|
59
|
-
如果匹配返回 True,否则返回 False
|
|
60
|
-
"""
|
|
61
|
-
# 如果 wildcard_patterns 是 None 或空列表,匹配所有文件
|
|
62
|
-
if wildcard_patterns is None:
|
|
63
|
-
return True
|
|
64
|
-
|
|
65
|
-
# 检查是否匹配任何一个通配符模式
|
|
66
|
-
for wildcard_pattern in wildcard_patterns:
|
|
67
|
-
if fnmatch(file_path, wildcard_pattern):
|
|
68
|
-
return True
|
|
69
|
-
|
|
70
|
-
return False
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
def _to_millis_timestamp_string(timestamp):
|
|
74
|
-
"""将时间戳转换为毫秒时间戳字符串
|
|
75
|
-
|
|
76
|
-
Args:
|
|
77
|
-
timestamp: 时间戳(秒或毫秒),可以是 int、float 或 None
|
|
78
|
-
|
|
79
|
-
Returns:
|
|
80
|
-
str: 毫秒时间戳字符串,如果输入为 None 则返回空字符串
|
|
81
|
-
"""
|
|
82
|
-
if timestamp is None:
|
|
83
|
-
return ""
|
|
84
|
-
|
|
85
|
-
# 如果已经是毫秒时间戳(大于 1e12),直接转换
|
|
86
|
-
if isinstance(timestamp, (int, float)):
|
|
87
|
-
if timestamp > 1e12:
|
|
88
|
-
# 已经是毫秒时间戳
|
|
89
|
-
return str(int(timestamp))
|
|
90
|
-
else:
|
|
91
|
-
# 秒级时间戳,转换为毫秒
|
|
92
|
-
return str(int(timestamp * 1000))
|
|
93
|
-
|
|
94
|
-
return str(timestamp)
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
class Source(ABC):
    """Abstract base class for data sources.

    Concrete sources must implement both ``list_files`` and ``read_file``.
    """

    @abstractmethod
    def list_files(self) -> List[str]:
        """Return the paths of all files exposed by this source."""
        raise NotImplementedError

    @abstractmethod
    def read_file(self, file_path: str) -> Tuple[bytes, Dict[str, Any]]:
        """Return the content of ``file_path`` and a dict describing its origin."""
        raise NotImplementedError
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
class S3Source(Source):
    """S3/MinIO data source."""

    def __init__(self, endpoint: str, access_key: str, secret_key: str,
                 bucket: str, prefix: str = '', region: str = 'us-east-1', pattern: Optional[List[str]] = None, recursive: bool = False):
        """Create the S3 client and verify that the bucket is reachable.

        Args:
            endpoint: Endpoint URL passed to ``boto3.client``.
            access_key: Access key id.
            secret_key: Secret access key.
            bucket: Bucket to read from.
            prefix: Optional key prefix used when listing.
            region: Region name passed to ``boto3.client``.
            pattern: Wildcard patterns used to filter listed keys.
            recursive: When false, listing uses ``/`` as delimiter.

        Raises:
            Exception: Whatever ``head_bucket`` raises is re-raised after
                being printed.
        """
        self.endpoint = endpoint
        self.bucket = bucket
        self.prefix = prefix
        self.recursive = recursive
        # Normalized once here so list_files can use it directly.
        self.pattern = _normalize_wildcard_patterns(pattern)

        # Signature version / addressing style depend on the endpoint.
        config_kwargs = {'signature_version': 's3v4'}
        if self.endpoint != 'https://textin-minio-api.ai.intsig.net':
            config_kwargs['s3'] = {'addressing_style': 'virtual'}
            if self.endpoint.endswith('aliyuncs.com'):
                config_kwargs['signature_version'] = 's3'
        config = Config(**config_kwargs)

        self.client = boto3.client(
            's3',
            endpoint_url=endpoint,
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
            region_name=region,
            config=config,
        )

        try:
            self.client.head_bucket(Bucket=bucket)
            print(f"✓ S3 连接成功: {endpoint}/{bucket}")
            logger.info(f"S3 连接成功: {endpoint}/{bucket}")
        except Exception as e:
            print(f"✗ S3 连接失败: {str(e)}")
            raise

    def list_files(self) -> List[str]:
        """Return the object keys that match the configured pattern.

        Keys ending in ``/`` or ``empty.tmp`` are skipped. In non-recursive
        mode the request uses ``Delimiter='/'``; the ``CommonPrefixes``
        entries of each page are not consulted.
        """
        paginator = self.client.get_paginator('list_objects_v2')

        request = {'Bucket': self.bucket}
        if self.prefix:
            request['Prefix'] = self.prefix
        if not self.recursive:
            request['Delimiter'] = '/'

        matched = []
        for page in paginator.paginate(**request):
            for entry in page.get('Contents', ()):
                object_key = entry['Key']
                if object_key.endswith(('/', 'empty.tmp')):
                    continue
                if _match_file_extension(object_key, self.pattern):
                    matched.append(object_key)

        print(f"✓ S3 找到 {len(matched)} 个文件")
        return matched

    def read_file(self, file_path: str) -> Tuple[bytes, Dict[str, Any]]:
        """Download one object and describe where it came from.

        Args:
            file_path: Object key inside the bucket.

        Returns:
            The object bytes and a data-source dict. ``version`` is the
            ``etag`` response header without surrounding double quotes;
            ``date_created`` and ``date_modified`` are both derived from the
            ``last-modified`` header.
        """
        response = self.client.get_object(Bucket=self.bucket, Key=file_path)
        payload = response['Body'].read()

        http_headers = response.get('ResponseMetadata', {}).get('HTTPHeaders', {})

        etag = http_headers.get('etag') or ""
        if etag.startswith('"') and etag.endswith('"'):
            etag = etag[1:-1]

        server_name = http_headers.get('server') or "unknown"

        modified_ts = None
        raw_last_modified = http_headers.get('last-modified')
        if raw_last_modified:
            try:
                parsed = parsedate_to_datetime(raw_last_modified)
                modified_ts = parsed.astimezone(timezone.utc).timestamp()
            except Exception as exc:
                logger.debug(f"S3 解析 last-modified 失败 {file_path}: {exc}")

        key = file_path.lstrip('/')
        record_locator = {
            'server': server_name,
            'protocol': 's3',
            'remote_file_path': key,
        }
        data_source = {
            'url': f"s3://{self.bucket}/{key}",
            'version': etag,
            'date_created': _to_millis_timestamp_string(modified_ts),
            'date_modified': _to_millis_timestamp_string(modified_ts),
            'record_locator': record_locator,
        }

        return payload, data_source
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
class LocalSource(Source):
    """Local file system data source."""

    def __init__(self, directory: str, pattern: Optional[List[str]] = None, recursive: bool = False):
        """Remember the root directory and listing options.

        Args:
            directory: Root directory to read from.
            pattern: Wildcard patterns used to filter listed files.
            recursive: Whether ``list_files`` descends into subdirectories.

        Raises:
            ValueError: If ``directory`` does not exist.
        """
        self.directory = Path(directory)
        self.recursive = recursive
        # Normalized once here so list_files can use it directly.
        self.pattern = _normalize_wildcard_patterns(pattern)

        if not self.directory.exists():
            raise ValueError(f"目录不存在: {directory}")

        print(f"✓ 本地目录: {self.directory}")
        logger.info(f"本地目录: {self.directory}")

    def list_files(self) -> List[str]:
        """Return matching file paths relative to the root directory."""
        iterate = self.directory.rglob if self.recursive else self.directory.glob
        candidates = [
            str(entry.relative_to(self.directory))
            for entry in iterate('*')
            if entry.is_file()
        ]

        if self.pattern is None:
            selected = list(candidates)
        else:
            selected = [
                name for name in candidates
                if _match_file_extension(name, self.pattern)
            ]

        print(f"✓ 本地找到 {len(selected)} 个文件")
        return selected

    def read_file(self, file_path: str) -> Tuple[bytes, Dict[str, Any]]:
        """Read one file and describe where it came from.

        Args:
            file_path: Path relative to the root directory.

        Returns:
            The file bytes and a data-source dict. ``version`` is the
            modification time in nanoseconds as a string; it stays ``None``
            (and both dates become empty strings) if the file can no longer
            be stat'ed after it was read.
        """
        target = (self.directory / file_path).resolve()
        content = target.read_bytes()

        created_ts = modified_ts = version = None
        try:
            info = target.stat()
        except FileNotFoundError:
            logger.warning(f"本地文件不存在,无法获取 metadata: {target}")
        else:
            created_ts = info.st_ctime
            modified_ts = info.st_mtime
            version = str(int(info.st_mtime_ns))

        record_locator = {
            'protocol': 'file',
            'remote_file_path': str(target),
        }
        data_source = {
            'url': target.as_uri(),
            'version': version,
            'date_created': _to_millis_timestamp_string(created_ts),
            'date_modified': _to_millis_timestamp_string(modified_ts),
            'record_locator': record_locator,
        }
        return content, data_source
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
class FtpSource(Source):
    """FTP data source.

    A single ``ftplib.FTP`` session is opened and logged in on construction
    and reused by ``list_files`` and ``read_file``. Listed paths are relative
    to the directory the session is in when ``list_files`` is called.
    """

    def __init__(self, host: str, port: int, username: str, password: str, pattern: Optional[List[str]] = None, recursive: bool = False):
        """Connect and log in to the FTP server.

        Args:
            host: Server host name or address.
            port: Server port.
            username: Login name.
            password: Login password.
            pattern: Wildcard patterns used to filter listed files.
            recursive: Whether ``list_files`` descends into subdirectories.
        """
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        # Normalized once here so list_files can use it directly.
        self.pattern = _normalize_wildcard_patterns(pattern)
        self.recursive = recursive

        self.client = ftplib.FTP()
        self.client.connect(self.host, self.port)
        self.client.login(self.username, self.password)

        print(f"✓ FTP 连接成功: {self.host}:{self.port}")
        logger.info(f"FTP 连接成功: {self.host}:{self.port}")

    def _list_current_dir(self) -> List[Tuple[str, str]]:
        """Return ``(name, type)`` pairs for the current remote directory.

        Tries MLSD first, then falls back to parsing LIST output, then to
        NLST. The type is the MLSD ``type`` fact, ``'dir'``/``'file'`` derived
        from the first LIST column, or ``'unknown'`` when it cannot be told.
        Each attempt builds its own list, so a command that fails part-way
        does not leave duplicate entries behind.
        """
        try:
            return [
                (name, facts.get('type', 'unknown'))
                for name, facts in self.client.mlsd()
            ]
        except Exception:
            pass  # MLSD not available; fall back to LIST.

        try:
            lines = []
            self.client.retrlines('LIST', lines.append)
            entries = []
            for line in lines:
                parts = line.split()
                if len(parts) >= 9:
                    # The first column starts with 'd' for directories; the
                    # name is everything from the ninth column on.
                    name = ' '.join(parts[8:])
                    kind = 'dir' if parts[0].startswith('d') else 'file'
                    entries.append((name, kind))
            return entries
        except Exception:
            pass  # LIST failed as well; fall back to NLST.

        # NLST gives names only, so the type has to be probed by the caller.
        return [(name, 'unknown') for name in self.client.nlst()]

    def _walk(self, rel_dir: str, files: List[str]) -> None:
        """Collect matching files below the current remote directory.

        Args:
            rel_dir: Path of the current remote directory relative to the
                directory where the listing started (``''`` for the start).
            files: Output list that matching relative paths are appended to.

        Subdirectories are entered with a name relative to the current
        directory and left again by changing back to the absolute path
        reported by ``pwd()``, so directories at any depth are reached.
        Failures are logged and end the walk of this directory only.
        """
        try:
            here = self.client.pwd()
            for name, kind in self._list_current_dir():
                if name in ('.', '..'):
                    continue

                rel_path = f"{rel_dir}/{name}" if rel_dir else name

                if kind in ('dir', 'unknown'):
                    try:
                        self.client.cwd(name)
                    except Exception as exc:
                        if kind == 'dir':
                            # A known directory that cannot be entered is
                            # not a downloadable file; skip it.
                            logger.warning(f"FTP 列出路径失败 {rel_path}: {str(exc)}")
                            continue
                        # Unknown type and CWD refused: treat it as a file.
                    else:
                        self._walk(rel_path, files)
                        self.client.cwd(here)
                        continue

                relative_path = rel_path.lstrip('/')
                if _match_file_extension(relative_path, self.pattern):
                    files.append(relative_path)
        except Exception as e:
            logger.warning(f"FTP 列出路径失败 {rel_dir}: {str(e)}")

    def list_files(self) -> List[str]:
        """List files under the session's current directory.

        In recursive mode every subdirectory is walked. In non-recursive
        mode only entries of the current directory are returned: entries
        typed ``'file'`` are kept, entries of unknown type are kept unless a
        CWD into them succeeds, and every other type is skipped.

        Returns:
            Paths, relative to the current directory, that match the
            configured pattern.
        """
        files: List[str] = []
        start_dir = self.client.pwd()

        if self.recursive:
            self._walk('', files)
        else:
            for name, kind in self._list_current_dir():
                if name in ('.', '..'):
                    continue
                if kind == 'unknown':
                    try:
                        self.client.cwd(name)
                    except Exception:
                        pass  # CWD refused: it is a file.
                    else:
                        # CWD worked: it is a directory; go back and skip it.
                        self.client.cwd(start_dir)
                        continue
                elif kind != 'file':
                    continue
                if _match_file_extension(name, self.pattern):
                    files.append(name)

        # Best effort: leave the session where it started.
        try:
            self.client.cwd(start_dir)
        except Exception:
            pass

        print(f"✓ FTP 找到 {len(files)} 个文件 (匹配 pattern)")
        return files

    def read_file(self, file_path: str) -> Tuple[bytes, Dict[str, Any]]:
        """Download one file and describe where it came from.

        Args:
            file_path: Remote path as returned by ``list_files``.

        Returns:
            The file bytes and a data-source dict. ``version``,
            ``date_created`` and ``date_modified`` all carry the MDTM
            modification time in milliseconds, or ``""`` when MDTM fails or
            its reply is not of the form ``213 YYYYMMDDHHMMSS``.
        """
        from io import BytesIO

        buffer = BytesIO()
        self.client.retrbinary(f'RETR {file_path}', buffer.write)

        date_modified = None
        try:
            resp = self.client.sendcmd(f"MDTM {file_path}")
            parts = resp.split()
            if len(parts) == 2 and parts[0] == '213':
                dt = datetime.strptime(parts[1], "%Y%m%d%H%M%S")
                # The parsed value is interpreted as UTC.
                date_modified = dt.replace(tzinfo=timezone.utc).timestamp()
        except Exception as exc:
            logger.debug(f"FTP 获取文件时间失败 {file_path}: {exc}")

        normalized_path = file_path.lstrip('/')
        version = _to_millis_timestamp_string(date_modified)
        data_source = {
            'url': f"ftp://{self.host}:{self.port}/{normalized_path}",
            'version': version,
            'date_created': version,
            'date_modified': version,
            'record_locator': {
                'server': f"{self.host}:{self.port}",
                'protocol': 'ftp',
                'remote_file_path': normalized_path
            }
        }

        return buffer.getvalue(), data_source
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
class SmbSource(Source):
    """SMB/CIFS data source."""

    def __init__(self, host: str, share_name: str, username: str, password: str,
                 domain: str = '', port: int = 445, path: str = '', pattern: Optional[List[str]] = None, recursive: bool = False):
        """Open an authenticated SMB connection.

        Args:
            host: Server host name or address.
            share_name: Name of the share to read from.
            username: Login name.
            password: Login password.
            domain: Optional domain.
            port: Server port.
            path: Optional base path inside the share; leading and trailing
                slashes and backslashes are stripped.
            pattern: Wildcard patterns used to filter listed files.
            recursive: Whether ``list_files`` descends into subdirectories.

        Raises:
            ConnectionError: If the connection cannot be established or the
                server rejects the credentials.
        """
        self.host = host
        self.share_name = share_name
        self.username = username
        self.password = password
        self.domain = domain
        self.port = port
        self.path = path.strip('/').strip('\\') if path else ''
        # Normalized once here so list_files can use it directly.
        self.pattern = _normalize_wildcard_patterns(pattern)
        self.recursive = recursive

        # NOTE(review): is_direct_tcp is left at the library default although
        # the default port here is 445 - confirm against the pysmb docs
        # whether direct TCP should be enabled for that port.
        self.conn = SMBConnection(
            username,
            password,
            '',
            host,
            domain=domain,
            use_ntlm_v2=True
        )

        try:
            authenticated = self.conn.connect(host, port)
        except Exception as e:
            error_msg = f"无法连接到 SMB 服务器 {host}:{port}: {str(e)}"
            print(f"✗ SMB 连接失败: {error_msg}")
            logger.error(f"SMB 连接失败: {error_msg}")
            raise ConnectionError(error_msg) from e

        # connect() reports the authentication outcome as its return value
        # instead of raising, so a rejected login must be checked explicitly.
        if not authenticated:
            error_msg = f"无法连接到 SMB 服务器 {host}:{port}: 认证失败"
            print(f"✗ SMB 连接失败: {error_msg}")
            logger.error(f"SMB 连接失败: {error_msg}")
            raise ConnectionError(error_msg)

    def list_files(self) -> List[str]:
        """Return matching file paths relative to the configured base path.

        Entries whose name starts with ``.`` are skipped. Subdirectories are
        only entered in recursive mode. A directory that cannot be listed is
        logged and skipped.
        """
        files = []
        base_path = '/' if not self.path else f'/{self.path}'

        def _list_recursive(conn, share, current_path):
            try:
                items = conn.listPath(share, current_path)
                for item in items:
                    # Also covers '.' and '..'.
                    if item.filename.startswith('.'):
                        continue
                    if current_path != '/':
                        item_path = f"{current_path.rstrip('/')}/{item.filename}"
                    else:
                        item_path = f"/{item.filename}"
                    relative_path = item_path[len(base_path):].lstrip('/')
                    if item.isDirectory:
                        if self.recursive:
                            _list_recursive(conn, share, item_path)
                        # Non-recursive mode ignores subdirectories.
                    elif _match_file_extension(relative_path, self.pattern):
                        files.append(relative_path)
            except Exception as e:
                logger.warning(f"列出路径失败 {current_path}: {str(e)}")

        _list_recursive(self.conn, self.share_name, base_path)

        print(f"✓ SMB 找到 {len(files)} 个文件")
        return files

    def read_file(self, file_path: str) -> Tuple[bytes, Dict[str, Any]]:
        """Download one file and describe where it came from.

        Args:
            file_path: Path relative to the configured base path.

        Returns:
            The file bytes and a data-source dict. The dates are empty
            strings when the file attributes cannot be fetched.

        Raises:
            IOError: If the file cannot be retrieved.
        """
        from io import BytesIO

        base_path = '/' if not self.path else f'/{self.path}'
        full_path = f"{base_path.rstrip('/')}/{file_path.lstrip('/')}" if base_path != '/' else f"/{file_path.lstrip('/')}"

        file_obj = BytesIO()
        try:
            self.conn.retrieveFile(self.share_name, full_path, file_obj)
        except Exception as e:
            raise IOError(f"读取文件失败 {full_path}: {str(e)}") from e

        def _to_timestamp(value):
            # Accept either a datetime or an already-numeric timestamp.
            if isinstance(value, datetime):
                return value.astimezone(timezone.utc).timestamp()
            if isinstance(value, (int, float)):
                return value
            return None

        date_created = None
        date_modified = None
        try:
            attrs = self.conn.getAttributes(self.share_name, full_path)
            date_created = _to_timestamp(getattr(attrs, 'create_time', None))
            date_modified = _to_timestamp(getattr(attrs, 'last_write_time', None))
        except Exception as exc:
            logger.debug(f"SMB 获取文件属性失败 {full_path}: {exc}")

        smb_url = f"smb://{self.host}/{self.share_name}{full_path}"
        data_source = {
            'url': smb_url,
            'version': _to_millis_timestamp_string(date_modified),
            'date_created': _to_millis_timestamp_string(date_created),
            'date_modified': _to_millis_timestamp_string(date_modified),
            'record_locator': {
                'server': self.host,
                'share': self.share_name,
                'protocol': 'smb',
                'remote_file_path': full_path
            }
        }

        file_obj.seek(0)
        return file_obj.read(), data_source

    def __del__(self):
        """Close the SMB connection, ignoring errors during teardown."""
        if hasattr(self, 'conn') and self.conn:
            try:
                self.conn.close()
            except Exception:
                pass
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
__all__ = [
|
|
579
|
-
'Source',
|
|
580
|
-
'S3Source',
|
|
581
|
-
'LocalSource',
|
|
582
|
-
'FtpSource',
|
|
583
|
-
'SmbSource',
|
|
584
|
-
]
|
|
585
|
-
|