xparse-client 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- example/run_pipeline.py +479 -0
- example/run_pipeline_test.py +458 -0
- xparse_client/__init__.py +21 -1
- xparse_client/pipeline/__init__.py +3 -0
- xparse_client/pipeline/config.py +128 -0
- xparse_client/pipeline/destinations.py +250 -0
- xparse_client/pipeline/pipeline.py +440 -0
- xparse_client/pipeline/sources.py +342 -0
- {xparse_client-0.2.2.dist-info → xparse_client-0.2.4.dist-info}/METADATA +1 -1
- xparse_client-0.2.4.dist-info/RECORD +13 -0
- {xparse_client-0.2.2.dist-info → xparse_client-0.2.4.dist-info}/top_level.txt +1 -0
- xparse_client-0.2.2.dist-info/RECORD +0 -6
- {xparse_client-0.2.2.dist-info → xparse_client-0.2.4.dist-info}/WHEEL +0 -0
- {xparse_client-0.2.2.dist-info → xparse_client-0.2.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- encoding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
import boto3
|
|
7
|
+
import ftplib
|
|
8
|
+
|
|
9
|
+
from abc import ABC, abstractmethod
|
|
10
|
+
from datetime import datetime, timezone
|
|
11
|
+
from email.utils import parsedate_to_datetime
|
|
12
|
+
from fnmatch import fnmatch
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import List, Dict, Any, Tuple
|
|
15
|
+
|
|
16
|
+
from smb.SMBConnection import SMBConnection
|
|
17
|
+
from botocore.config import Config
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Module-level logger, named after this module, used by the source classes below.
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class Source(ABC):
    """Abstract base class for a place documents can be read from.

    Concrete subclasses provide two operations: enumerating the available
    files and fetching one file together with a dictionary describing where
    the data came from.
    """

    @abstractmethod
    def list_files(self) -> List[str]:
        """Return the identifiers (paths/keys) of every available file."""
        raise NotImplementedError

    @abstractmethod
    def read_file(self, file_path: str) -> Tuple[bytes, Dict[str, Any]]:
        """Return the raw content of ``file_path`` and its data-source info.

        Args:
            file_path: An identifier as produced by :meth:`list_files`.

        Returns:
            A ``(content, data_source)`` tuple, where ``content`` is the file
            as bytes and ``data_source`` is a metadata dictionary.
        """
        raise NotImplementedError
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class S3Source(Source):
    """Source backed by an S3-compatible object store (S3 / MinIO).

    Objects are listed from ``bucket`` (optionally restricted to ``prefix``)
    and filtered by an ``fnmatch`` pattern applied to the full object key.
    """

    def __init__(self, endpoint: str, access_key: str, secret_key: str,
                 bucket: str, prefix: str = '', region: str = 'us-east-1', pattern: str = '*'):
        """Create the S3 client and verify that ``bucket`` is reachable.

        Args:
            endpoint: Endpoint URL passed to ``boto3.client``.
            access_key: Access key id.
            secret_key: Secret access key.
            bucket: Bucket to read from.
            prefix: Optional key prefix used when listing objects.
            region: Region name passed to ``boto3.client``.
            pattern: ``fnmatch`` pattern for object keys; empty means ``'*'``.

        Raises:
            Exception: Whatever ``head_bucket`` raises when the bucket cannot
                be reached; the error is reported and then re-raised as-is.
        """
        self.endpoint = endpoint
        self.bucket = bucket
        self.prefix = prefix
        self.pattern = pattern or '*'

        # This one endpoint is left on botocore's default addressing style;
        # every other endpoint is forced to virtual-hosted-style addressing.
        # NOTE(review): presumably that deployment cannot serve
        # virtual-hosted-style requests - confirm.
        if self.endpoint == 'https://textin-minio-api.ai.intsig.net':
            config = Config(signature_version='s3v4')
        else:
            config = Config(signature_version='s3v4', s3={'addressing_style': 'virtual'})

        self.client = boto3.client(
            's3',
            endpoint_url=endpoint,
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
            region_name=region,
            config=config
        )

        try:
            self.client.head_bucket(Bucket=bucket)
        except Exception as e:
            print(f"✗ S3 连接失败: {str(e)}")
            # Also record the failure in the log, not only on stdout.
            logger.error(f"S3 连接失败: {str(e)}")
            raise
        else:
            print(f"✓ S3 连接成功: {endpoint}/{bucket}")
            logger.info(f"S3 连接成功: {endpoint}/{bucket}")

    def list_files(self) -> List[str]:
        """Return every object key in the bucket that matches ``pattern``.

        Keys ending in ``/`` and keys ending in ``empty.tmp`` are skipped.
        """
        params = {'Bucket': self.bucket}
        if self.prefix:
            params['Prefix'] = self.prefix

        files = []
        paginator = self.client.get_paginator('list_objects_v2')
        for page in paginator.paginate(**params):
            # A page has no 'Contents' entry when it holds no objects.
            for obj in page.get('Contents', []):
                key = obj['Key']
                if key.endswith(('/', 'empty.tmp')):
                    continue
                if fnmatch(key, self.pattern):
                    files.append(key)

        print(f"✓ S3 找到 {len(files)} 个文件")
        return files

    def read_file(self, file_path: str) -> Tuple[bytes, Dict[str, Any]]:
        """Download one object and describe where it came from.

        Args:
            file_path: Object key to download.

        Returns:
            ``(content, data_source)``. ``data_source`` carries the ``s3://``
            URL, the version id (if any), the modification time as a UTC
            epoch timestamp (also used for ``date_created``), and a
            ``record_locator`` with server, protocol and key.
        """
        response = self.client.get_object(Bucket=self.bucket, Key=file_path)
        body = response['Body']
        try:
            file_bytes = body.read()
        finally:
            # Close the streaming body even if the read fails, so the
            # underlying HTTP connection is released.
            body.close()

        headers = response.get('ResponseMetadata', {}).get('HTTPHeaders', {})
        version = response.get('VersionId') or headers.get('x-amz-version-id')
        last_modified = headers.get('last-modified')
        server = headers.get('server') or "unknown"

        date_modified = None
        if last_modified:
            try:
                dt = parsedate_to_datetime(last_modified)
                if dt.tzinfo is None:
                    # A naive result means the header carried no usable zone;
                    # treat it as UTC instead of the machine's local time.
                    dt = dt.replace(tzinfo=timezone.utc)
                date_modified = dt.astimezone(timezone.utc).timestamp()
            except Exception as exc:
                # Best effort: a malformed header only costs the timestamp.
                logger.debug(f"S3 解析 last-modified 失败 {file_path}: {exc}")

        normalized_key = file_path.lstrip('/')
        data_source = {
            'url': f"s3://{self.bucket}/{normalized_key}",
            'version': version,
            # Only the modification time is read here; it doubles as the
            # creation time.
            'date_created': date_modified,
            'date_modified': date_modified,
            'record_locator': {
                'server': server,
                'protocol': 's3',
                'remote_file_path': normalized_key
            }
        }

        return file_bytes, data_source
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class LocalSource(Source):
    """Source that reads files from a directory on the local filesystem."""

    def __init__(self, directory: str, pattern: str = '*'):
        """Remember the directory to scan and the glob pattern to apply.

        Args:
            directory: Directory that is searched recursively.
            pattern: Glob pattern handed to ``Path.rglob``; empty means ``'*'``.

        Raises:
            ValueError: If ``directory`` does not exist or is not a directory.
        """
        self.directory = Path(directory)
        self.pattern = pattern or '*'

        # is_dir() is False for a missing path and for a regular file; a
        # plain exists() check would accept a file and then list nothing.
        if not self.directory.is_dir():
            raise ValueError(f"目录不存在: {directory}")

        print(f"✓ 本地目录: {self.directory}")
        logger.info(f"本地目录: {self.directory}")

    def list_files(self) -> List[str]:
        """Return matching files as paths relative to the directory.

        The result is sorted so repeated runs see the files in the same
        order regardless of the order the filesystem reports them in.
        """
        files = sorted(
            str(f.relative_to(self.directory))
            for f in self.directory.rglob(self.pattern)
            if f.is_file()
        )
        print(f"✓ 本地找到 {len(files)} 个文件")
        return files

    def read_file(self, file_path: str) -> Tuple[bytes, Dict[str, Any]]:
        """Read one file and describe where it came from.

        Args:
            file_path: Path relative to the source directory.

        Returns:
            ``(content, data_source)``. ``data_source`` holds the ``file://``
            URL, a version string derived from the modification time in
            nanoseconds, ``st_ctime``/``st_mtime`` timestamps and a
            ``record_locator`` with the resolved absolute path.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        import os

        full_path = (self.directory / file_path).resolve()
        with open(full_path, 'rb') as f:
            file_bytes = f.read()
            # Stat the open handle rather than the path, so the metadata
            # belongs to the bytes just read even if the path is replaced or
            # removed concurrently.
            stats = os.fstat(f.fileno())

        data_source = {
            'url': full_path.as_uri(),
            'version': str(int(stats.st_mtime_ns)),
            # NOTE(review): st_ctime is reported as the creation date; on
            # some platforms it is the metadata-change time - confirm intent.
            'date_created': stats.st_ctime,
            'date_modified': stats.st_mtime,
            'record_locator': {
                'protocol': 'file',
                'remote_file_path': str(full_path)
            }
        }
        return file_bytes, data_source
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
class FtpSource(Source):
    """Source that reads files from an FTP server."""

    def __init__(self, host: str, port: int, username: str, password: str, pattern: str = '*'):
        """Connect and log in to the FTP server.

        Args:
            host: Server host name or address.
            port: Server port.
            username: Login user.
            password: Login password.
            pattern: ``fnmatch`` pattern for listed names; empty means ``'*'``.

        Raises:
            Exception: Whatever ``ftplib`` raises when connecting or logging
                in fails; the connection is closed before re-raising.
        """
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self.pattern = pattern or '*'

        self.client = ftplib.FTP()
        try:
            self.client.connect(self.host, self.port)
            self.client.login(self.username, self.password)
        except Exception:
            # Do not leak the control socket when connect/login fails.
            self.client.close()
            raise

        print(f"✓ FTP 连接成功: {self.host}:{self.port}")
        logger.info(f"FTP 连接成功: {self.host}:{self.port}")

    def list_files(self) -> List[str]:
        """Return the names reported by ``NLST`` that match ``pattern``.

        Only the server's current directory is listed; there is no recursion.
        """
        raw_files = self.client.nlst()
        files = [f for f in raw_files if fnmatch(f, self.pattern)]
        print(f"✓ FTP 找到 {len(files)} 个文件 (匹配 pattern)")
        return files

    @staticmethod
    def _parse_mdtm(resp: str):
        """Convert an ``MDTM`` reply into a UTC epoch timestamp.

        Returns ``None`` when the reply is not of the form ``213 <time-val>``.
        A fractional-seconds suffix (``YYYYMMDDHHMMSS.sss``) is accepted and
        the fraction is discarded.

        Raises:
            ValueError: If the time value is not a valid timestamp.
        """
        parts = resp.split()
        if len(parts) != 2 or parts[0] != '213':
            return None
        # Keep only the whole-second part before parsing.
        stamp = parts[1].split('.', 1)[0]
        dt = datetime.strptime(stamp, "%Y%m%d%H%M%S")
        return dt.replace(tzinfo=timezone.utc).timestamp()

    def read_file(self, file_path: str) -> Tuple[bytes, Dict[str, Any]]:
        """Download one file and describe where it came from.

        Args:
            file_path: Remote path used in the ``RETR`` and ``MDTM`` commands.

        Returns:
            ``(content, data_source)``. ``data_source`` holds the ``ftp://``
            URL, the modification time as a UTC epoch timestamp (``None``
            when it cannot be obtained) and a ``record_locator``.

        Raises:
            Exception: Whatever ``ftplib`` raises when the download fails.
        """
        from io import BytesIO

        buffer = BytesIO()
        self.client.retrbinary(f'RETR {file_path}', buffer.write)

        date_modified = None
        try:
            resp = self.client.sendcmd(f"MDTM {file_path}")
            date_modified = self._parse_mdtm(resp)
        except Exception as exc:
            # Best effort: failing to get the time only costs the timestamp.
            logger.debug(f"FTP 获取文件时间失败 {file_path}: {exc}")

        normalized_path = file_path.lstrip('/')
        data_source = {
            'url': f"ftp://{self.host}:{self.port}/{normalized_path}",
            'version': None,
            'date_created': None,
            'date_modified': date_modified,
            'record_locator': {
                'server': f"{self.host}:{self.port}",
                'protocol': 'ftp',
                'remote_file_path': normalized_path
            }
        }

        return buffer.getvalue(), data_source

    def __del__(self):
        """Close the FTP connection when the source is garbage-collected."""
        client = getattr(self, 'client', None)
        if client is not None:
            try:
                client.close()
            except Exception:
                # Nothing useful can be done about a failed close while the
                # object is being destroyed.
                pass
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
class SmbSource(Source):
    """Source that reads files from an SMB/CIFS share."""

    def __init__(self, host: str, share_name: str, username: str, password: str,
                 domain: str = '', port: int = 445, path: str = '', pattern: str = '*'):
        """Connect and authenticate against the SMB server.

        Args:
            host: Server host name or address.
            share_name: Name of the share to read from.
            username: Login user.
            password: Login password.
            domain: Optional authentication domain.
            port: Server port.
            path: Optional directory inside the share used as the root;
                leading/trailing ``/`` and ``\\`` are removed.
            pattern: ``fnmatch`` pattern applied to paths relative to
                ``path``; empty means ``'*'``.

        Raises:
            ConnectionError: If the server cannot be reached or rejects the
                credentials.
        """
        self.host = host
        self.share_name = share_name
        self.username = username
        self.password = password
        self.domain = domain
        self.port = port
        # Strip both separator kinds in one pass so mixed leading/trailing
        # separators (e.g. "\\/dir") are removed too.
        self.path = path.strip('/\\') if path else ''
        self.pattern = pattern or '*'

        self.conn = SMBConnection(
            username,
            password,
            '',
            host,
            domain=domain,
            use_ntlm_v2=True,
            # Port 445 is direct-hosted SMB over TCP; pysmb defaults to
            # NetBIOS session framing unless told otherwise.
            is_direct_tcp=(port == 445)
        )

        try:
            authenticated = self.conn.connect(host, port)
        except Exception as e:
            error_msg = f"无法连接到 SMB 服务器 {host}:{port}: {str(e)}"
            print(f"✗ SMB 连接失败: {error_msg}")
            logger.error(f"SMB 连接失败: {error_msg}")
            raise ConnectionError(error_msg) from e

        # connect() reports an authentication failure through its return
        # value rather than an exception, so check it explicitly.
        if not authenticated:
            error_msg = f"无法连接到 SMB 服务器 {host}:{port}: 认证失败"
            print(f"✗ SMB 连接失败: {error_msg}")
            logger.error(f"SMB 连接失败: {error_msg}")
            try:
                self.conn.close()
            except Exception:
                # The connection is unusable either way; report the
                # authentication failure rather than the close error.
                pass
            raise ConnectionError(error_msg)

        print(f"✓ SMB 连接成功: {host}:{port}/{share_name}")
        logger.info(f"SMB 连接成功: {host}:{port}/{share_name}")

    def _base_path(self) -> str:
        """Return the absolute share path that acts as this source's root."""
        return f'/{self.path}' if self.path else '/'

    @staticmethod
    def _to_timestamp(value):
        """Normalise a time attribute to an epoch timestamp, or ``None``.

        Numbers are returned unchanged; ``datetime`` values are converted
        via UTC; anything else yields ``None``.
        """
        if isinstance(value, datetime):
            return value.astimezone(timezone.utc).timestamp()
        if isinstance(value, (int, float)):
            return value
        return None

    def list_files(self) -> List[str]:
        """Recursively list files below the root that match ``pattern``.

        Returned paths are relative to the root. Entries whose name starts
        with ``.`` (including ``.`` and ``..``) are skipped. A directory that
        cannot be listed is logged and skipped instead of aborting the walk.
        """
        files = []
        base_path = self._base_path()

        def _walk(current_path):
            try:
                for item in self.conn.listPath(self.share_name, current_path):
                    # Covers '.', '..' and hidden entries alike.
                    if item.filename.startswith('.'):
                        continue
                    # rstrip keeps the root ('/') from producing '//name'.
                    item_path = f"{current_path.rstrip('/')}/{item.filename}"
                    if item.isDirectory:
                        _walk(item_path)
                        continue
                    relative_path = item_path[len(base_path):].lstrip('/')
                    if fnmatch(relative_path, self.pattern):
                        files.append(relative_path)
            except Exception as e:
                logger.warning(f"列出路径失败 {current_path}: {str(e)}")

        _walk(base_path)

        print(f"✓ SMB 找到 {len(files)} 个文件")
        return files

    def read_file(self, file_path: str) -> Tuple[bytes, Dict[str, Any]]:
        """Download one file and describe where it came from.

        Args:
            file_path: Path relative to the root, as returned by
                :meth:`list_files`.

        Returns:
            ``(content, data_source)``. ``data_source`` holds the ``smb://``
            URL, creation/modification timestamps (``None`` when the
            attributes cannot be read) and a ``record_locator``.

        Raises:
            IOError: If the file cannot be retrieved.
        """
        from io import BytesIO

        full_path = f"{self._base_path().rstrip('/')}/{file_path.lstrip('/')}"

        file_obj = BytesIO()
        try:
            self.conn.retrieveFile(self.share_name, full_path, file_obj)
        except Exception as e:
            raise IOError(f"读取文件失败 {full_path}: {str(e)}") from e

        date_created = None
        date_modified = None
        try:
            attrs = self.conn.getAttributes(self.share_name, full_path)
            date_created = self._to_timestamp(getattr(attrs, 'create_time', None))
            date_modified = self._to_timestamp(getattr(attrs, 'last_write_time', None))
        except Exception as exc:
            # Best effort: missing attributes only cost the timestamps.
            logger.debug(f"SMB 获取文件属性失败 {full_path}: {exc}")

        smb_url = f"smb://{self.host}/{self.share_name}{full_path}"
        data_source = {
            'url': smb_url,
            'version': None,
            'date_created': date_created,
            'date_modified': date_modified,
            'record_locator': {
                'server': self.host,
                'share': self.share_name,
                'protocol': 'smb',
                'remote_file_path': full_path
            }
        }

        return file_obj.getvalue(), data_source

    def __del__(self):
        """Close the SMB connection when the source is garbage-collected."""
        if hasattr(self, 'conn') and self.conn:
            try:
                self.conn.close()
            except Exception:
                # Nothing useful can be done about a failed close while the
                # object is being destroyed.
                pass
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
# Names exported by ``from <this module> import *``.
__all__ = ['Source', 'S3Source', 'LocalSource', 'FtpSource', 'SmbSource']
|
|
342
|
+
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
example/run_pipeline.py,sha256=6gavTizAIqD62g4n9Pjq2-yW57ItZMJOOw8GEKm0Byk,15125
|
|
2
|
+
example/run_pipeline_test.py,sha256=uIU09FTv_VnTQS1Lc94ydc3kaD86eHkaHQbVXpsGEcA,14861
|
|
3
|
+
xparse_client/__init__.py,sha256=je1ena3HwLL4CRtLU4r6EAzoOIJthlPjTwshxZnzQDM,1677
|
|
4
|
+
xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
|
|
5
|
+
xparse_client/pipeline/config.py,sha256=gkhAF-55PNvPPyfTZ0HkP95XB_K0HKCyYl6R4PTQLhI,4045
|
|
6
|
+
xparse_client/pipeline/destinations.py,sha256=rqcxmsn1YGClVxGQxSVmyr-uumOVilOv_vX82fUBj-I,9859
|
|
7
|
+
xparse_client/pipeline/pipeline.py,sha256=oz_BKWLbslkuRsxG0zEfh9url7saLWgtoTH1mrK6gCc,18282
|
|
8
|
+
xparse_client/pipeline/sources.py,sha256=-0Eutg9t8xni12cfv2bdQVdImlkCQ7gWlOXIFBt6tpE,11568
|
|
9
|
+
xparse_client-0.2.4.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
|
|
10
|
+
xparse_client-0.2.4.dist-info/METADATA,sha256=IMgXO9a7wnN0Ygzauk7eOkyrRFE3A2rq73eofvq3wBs,26508
|
|
11
|
+
xparse_client-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
12
|
+
xparse_client-0.2.4.dist-info/top_level.txt,sha256=bfX8BWo1sEEQVsI4Ql4Uu80vrfEh5zfajU9YqFTzxMo,22
|
|
13
|
+
xparse_client-0.2.4.dist-info/RECORD,,
|
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
xparse_client/__init__.py,sha256=3c_nFCooim4J31P0QWPM2VqdFJiHNFdH44IRmIIHvjk,901
|
|
2
|
-
xparse_client-0.2.2.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
|
|
3
|
-
xparse_client-0.2.2.dist-info/METADATA,sha256=F390ItwP4PGN-9FvJGOgqjtX7YWgMJolSJnxqbEBq08,26508
|
|
4
|
-
xparse_client-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
5
|
-
xparse_client-0.2.2.dist-info/top_level.txt,sha256=W5PeQwOyfo_Od3d26-gcOtan7rHYk1q3SP1phYedat4,14
|
|
6
|
-
xparse_client-0.2.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|