xparse-client 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,342 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ import json
5
+ import logging
6
+ import boto3
7
+ import ftplib
8
+
9
+ from abc import ABC, abstractmethod
10
+ from datetime import datetime, timezone
11
+ from email.utils import parsedate_to_datetime
12
+ from fnmatch import fnmatch
13
+ from pathlib import Path
14
+ from typing import List, Dict, Any, Tuple
15
+
16
+ from smb.SMBConnection import SMBConnection
17
+ from botocore.config import Config
18
+
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class Source(ABC):
    """Abstract base class for data sources.

    A concrete source knows how to enumerate the files it exposes and how to
    fetch a single file together with a dictionary describing its origin.
    """

    @abstractmethod
    def read_file(self, file_path: str) -> Tuple[bytes, Dict[str, Any]]:
        """Read a file and return its content along with data-source info.

        Args:
            file_path: Identifier of the file, as produced by ``list_files``.

        Returns:
            A ``(content, data_source)`` tuple: the raw bytes of the file and
            a dictionary describing where the file came from.
        """
        raise NotImplementedError()

    @abstractmethod
    def list_files(self) -> List[str]:
        """List all files exposed by this source.

        Returns:
            The file identifiers that can be passed to ``read_file``.
        """
        raise NotImplementedError()
35
+
36
+
37
class S3Source(Source):
    """S3 / MinIO data source.

    Builds a boto3 S3 client with SigV4 signing, checks the bucket with a
    ``head_bucket`` call at construction time, and exposes the objects under
    an optional key prefix whose keys match an fnmatch-style pattern.
    """

    def __init__(self, endpoint: str, access_key: str, secret_key: str,
                 bucket: str, prefix: str = '', region: str = 'us-east-1', pattern: str = '*'):
        """Create the client and verify that the bucket is reachable.

        Args:
            endpoint: Endpoint URL passed to boto3 as ``endpoint_url``.
            access_key: Access key id.
            secret_key: Secret access key.
            bucket: Bucket to read from.
            prefix: Optional key prefix used to narrow the listing.
            region: Region name passed to boto3.
            pattern: fnmatch pattern applied to object keys; a falsy value
                falls back to ``'*'``.

        Raises:
            Exception: Whatever ``head_bucket`` raises is re-raised after the
                failure has been printed.
        """
        self.endpoint = endpoint
        self.bucket = bucket
        self.prefix = prefix
        self.pattern = pattern or '*'

        # One specific endpoint keeps botocore's default addressing style;
        # every other endpoint is configured for virtual-hosted-style.
        config_options = {'signature_version': 's3v4'}
        if self.endpoint != 'https://textin-minio-api.ai.intsig.net':
            config_options['s3'] = {'addressing_style': 'virtual'}
        config = Config(**config_options)

        self.client = boto3.client(
            's3',
            endpoint_url=endpoint,
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
            region_name=region,
            config=config,
        )

        try:
            self.client.head_bucket(Bucket=bucket)
            print(f"✓ S3 连接成功: {endpoint}/{bucket}")
            logger.info(f"S3 连接成功: {endpoint}/{bucket}")
        except Exception as e:
            print(f"✗ S3 连接失败: {str(e)}")
            raise

    def list_files(self) -> List[str]:
        """Return the keys of all matching objects in the bucket.

        Keys ending in ``/`` or ``empty.tmp`` are skipped, and the remaining
        keys are kept only when they match ``self.pattern``.
        """
        request = {'Bucket': self.bucket}
        if self.prefix:
            request['Prefix'] = self.prefix

        pages = self.client.get_paginator('list_objects_v2').paginate(**request)

        matched = []
        for page in pages:
            # Pages without a 'Contents' entry contribute nothing.
            for entry in page.get('Contents', ()):
                key = entry['Key']
                is_placeholder = key.endswith(('/', 'empty.tmp'))
                if not is_placeholder and fnmatch(key, self.pattern):
                    matched.append(key)

        print(f"✓ S3 找到 {len(matched)} 个文件")
        return matched

    def read_file(self, file_path: str) -> Tuple[bytes, Dict[str, Any]]:
        """Download one object and describe where it came from.

        Args:
            file_path: Object key inside the bucket.

        Returns:
            ``(content, data_source)`` where ``data_source`` carries the
            ``s3://`` URL, the version id (if any), timestamps derived from
            the ``last-modified`` response header, and a record locator.
        """
        response = self.client.get_object(Bucket=self.bucket, Key=file_path)
        payload = response['Body'].read()

        http_headers = response.get('ResponseMetadata', {}).get('HTTPHeaders', {})
        version = response.get('VersionId') or http_headers.get('x-amz-version-id')
        raw_last_modified = http_headers.get('last-modified')
        server = http_headers.get('server') or "unknown"

        modified_ts = None
        if raw_last_modified:
            try:
                parsed = parsedate_to_datetime(raw_last_modified)
                modified_ts = parsed.astimezone(timezone.utc).timestamp()
            except Exception as exc:
                logger.debug(f"S3 解析 last-modified 失败 {file_path}: {exc}")

        normalized_key = file_path.lstrip('/')
        record_locator = {
            'server': server,
            'protocol': 's3',
            'remote_file_path': normalized_key
        }
        # The same last-modified timestamp is reported as both creation and
        # modification time.
        data_source = {
            'url': f"s3://{self.bucket}/{normalized_key}",
            'version': version,
            'date_created': modified_ts,
            'date_modified': modified_ts,
            'record_locator': record_locator
        }

        return payload, data_source
119
+
120
+
121
class LocalSource(Source):
    """Local filesystem data source.

    Files are discovered recursively below ``directory`` and identified by
    their path relative to it.
    """

    def __init__(self, directory: str, pattern: str = '*'):
        """Remember the root directory and the glob pattern.

        Args:
            directory: Root directory to scan.
            pattern: Glob pattern handed to ``Path.rglob``; a falsy value
                falls back to ``'*'``.

        Raises:
            ValueError: If ``directory`` does not exist or is not a directory.
        """
        self.directory = Path(directory)
        self.pattern = pattern or '*'

        # is_dir() is False for a missing path and also for a path that exists
        # but is a regular file; a file can never be scanned, so reject both.
        if not self.directory.is_dir():
            raise ValueError(f"目录不存在: {directory}")

        print(f"✓ 本地目录: {self.directory}")
        logger.info(f"本地目录: {self.directory}")

    def list_files(self) -> List[str]:
        """Return every regular file below the root that matches the pattern.

        Returns:
            Paths relative to the root directory, as strings.
        """
        files = [
            str(f.relative_to(self.directory))
            for f in self.directory.rglob(self.pattern)
            if f.is_file()
        ]
        print(f"✓ 本地找到 {len(files)} 个文件")
        return files

    def read_file(self, file_path: str) -> Tuple[bytes, Dict[str, Any]]:
        """Read one file and describe where it came from.

        Args:
            file_path: Path relative to the root directory.

        Returns:
            ``(content, data_source)`` where ``data_source`` carries the
            ``file://`` URL, a version string derived from the modification
            time in nanoseconds, creation/modification timestamps and a
            record locator.
        """
        full_path = (self.directory / file_path).resolve()
        with open(full_path, 'rb') as f:
            file_bytes = f.read()

        date_created = None
        date_modified = None
        version = None
        try:
            stats = full_path.stat()
        except FileNotFoundError:
            # The file vanished between the read and the stat; the content is
            # still returned, only the metadata is left empty.
            logger.warning(f"本地文件不存在,无法获取 metadata: {full_path}")
        else:
            # st_ctime is the inode-change time on Unix, not the creation
            # time; use the real birth time where the platform provides it.
            birth_time = getattr(stats, 'st_birthtime', None)
            date_created = birth_time if birth_time is not None else stats.st_ctime
            date_modified = stats.st_mtime
            version = str(int(stats.st_mtime_ns))

        data_source = {
            'url': full_path.as_uri(),
            'version': version,
            'date_created': date_created,
            'date_modified': date_modified,
            'record_locator': {
                'protocol': 'file',
                'remote_file_path': str(full_path)
            }
        }
        return file_bytes, data_source
170
+
171
+
172
class FtpSource(Source):
    """FTP data source.

    Opens one control connection at construction time and reuses it for
    listing and downloading files.
    """

    def __init__(self, host: str, port: int, username: str, password: str, pattern: str = '*'):
        """Connect to the server and log in.

        Args:
            host: FTP server host name or address.
            port: FTP server port.
            username: Login user name.
            password: Login password.
            pattern: fnmatch pattern applied to listed names; a falsy value
                falls back to ``'*'``.

        Raises:
            Exception: Whatever ``ftplib`` raises while connecting or logging
                in is re-raised after the connection has been closed.
        """
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self.pattern = pattern or '*'

        self.client = ftplib.FTP()
        try:
            self.client.connect(self.host, self.port)
            self.client.login(self.username, self.password)
        except Exception:
            # Do not leave a half-open control connection behind when the
            # handshake or the login fails.
            self.client.close()
            raise

        print(f"✓ FTP 连接成功: {self.host}:{self.port}")
        logger.info(f"FTP 连接成功: {self.host}:{self.port}")

    def list_files(self) -> List[str]:
        """Return the names reported by ``NLST`` that match the pattern."""
        raw_files = self.client.nlst()
        files = [f for f in raw_files if fnmatch(f, self.pattern)]
        print(f"✓ FTP 找到 {len(files)} 个文件 (匹配 pattern)")
        return files

    @staticmethod
    def _parse_mdtm(response: str):
        """Convert an ``MDTM`` reply into a UTC epoch timestamp.

        Args:
            response: Full reply line, expected as ``213 YYYYMMDDHHMMSS`` with
                an optional ``.fraction`` suffix on the time value.

        Returns:
            The timestamp as a float, or ``None`` when the reply does not
            have the expected two-token ``213`` shape.

        Raises:
            ValueError: If the time value cannot be parsed.
        """
        parts = response.split()
        if len(parts) != 2 or parts[0] != '213':
            return None

        # Servers may append fractional seconds, which the fixed strptime
        # format would reject, so the fraction is handled separately.
        whole, _, fraction = parts[1].partition('.')
        moment = datetime.strptime(whole, "%Y%m%d%H%M%S").replace(tzinfo=timezone.utc)
        timestamp = moment.timestamp()
        if fraction:
            if not fraction.isdigit():
                raise ValueError(f"invalid MDTM fraction: {parts[1]!r}")
            timestamp += float(f"0.{fraction}")
        return timestamp

    def read_file(self, file_path: str) -> Tuple[bytes, Dict[str, Any]]:
        """Download one file and describe where it came from.

        Args:
            file_path: Remote path sent verbatim in the ``RETR`` command.

        Returns:
            ``(content, data_source)`` where ``data_source`` carries the
            ``ftp://`` URL, the modification time obtained through ``MDTM``
            (``None`` when unavailable) and a record locator.
        """
        from io import BytesIO
        buffer = BytesIO()
        self.client.retrbinary(f'RETR {file_path}', buffer.write)

        # The modification time is best-effort: a failing or malformed MDTM
        # reply must not fail the download.
        date_modified = None
        try:
            resp = self.client.sendcmd(f"MDTM {file_path}")
            date_modified = self._parse_mdtm(resp)
        except Exception as exc:
            logger.debug(f"FTP 获取文件时间失败 {file_path}: {exc}")

        normalized_path = file_path.lstrip('/')
        data_source = {
            'url': f"ftp://{self.host}:{self.port}/{normalized_path}",
            'version': None,
            'date_created': None,
            'date_modified': date_modified,
            'record_locator': {
                'server': f"{self.host}:{self.port}",
                'protocol': 'ftp',
                'remote_file_path': normalized_path
            }
        }

        return buffer.getvalue(), data_source

    def __del__(self):
        """Close the control connection when the source is garbage-collected."""
        client = getattr(self, 'client', None)
        if client is not None:
            try:
                client.close()
            except Exception:
                # Best-effort cleanup during finalisation; nothing useful can
                # be done with an error here.
                pass
224
+
225
+
226
class SmbSource(Source):
    """SMB/CIFS data source.

    Connects to one share on an SMB server and exposes the files found
    recursively below an optional sub-path of that share.
    """

    def __init__(self, host: str, share_name: str, username: str, password: str,
                 domain: str = '', port: int = 445, path: str = '', pattern: str = '*'):
        """Open and authenticate the SMB connection.

        Args:
            host: Server host name or address.
            share_name: Name of the share to read from.
            username: Login user name.
            password: Login password.
            domain: Optional authentication domain.
            port: Server port.
            path: Optional sub-path inside the share; surrounding slashes and
                backslashes are stripped.
            pattern: fnmatch pattern applied to paths relative to ``path``;
                a falsy value falls back to ``'*'``.

        Raises:
            ConnectionError: If the server cannot be reached or rejects the
                credentials.
        """
        self.host = host
        self.share_name = share_name
        self.username = username
        self.password = password
        self.domain = domain
        self.port = port
        self.path = path.strip('/').strip('\\') if path else ''
        self.pattern = pattern or '*'

        self.conn = SMBConnection(
            username,
            password,
            '',
            host,
            domain=domain,
            use_ntlm_v2=True,
            # Port 445 speaks SMB directly over TCP; pysmb defaults to
            # NetBIOS-over-TCP framing (port 139) unless told otherwise.
            is_direct_tcp=(port == 445)
        )

        try:
            authenticated = self.conn.connect(host, port)
        except Exception as e:
            error_msg = f"无法连接到 SMB 服务器 {host}:{port}: {str(e)}"
            print(f"✗ SMB 连接失败: {error_msg}")
            logger.error(f"SMB 连接失败: {error_msg}")
            raise ConnectionError(error_msg) from e

        # connect() reports rejected credentials through its return value
        # rather than an exception, so it has to be checked explicitly.
        if not authenticated:
            error_msg = f"无法连接到 SMB 服务器 {host}:{port}: 认证失败"
            print(f"✗ SMB 连接失败: {error_msg}")
            logger.error(f"SMB 连接失败: {error_msg}")
            raise ConnectionError(error_msg)

        print(f"✓ SMB 连接成功: {host}:{port}/{share_name}")
        logger.info(f"SMB 连接成功: {host}:{port}/{share_name}")

    def _base_path(self) -> str:
        """Return the absolute path inside the share that listing starts from."""
        return f'/{self.path}' if self.path else '/'

    @staticmethod
    def _to_timestamp(value):
        """Normalise a file-time attribute to an epoch timestamp.

        ``datetime`` values are converted to UTC, numbers are returned
        unchanged, and anything else yields ``None``.
        """
        if isinstance(value, datetime):
            return value.astimezone(timezone.utc).timestamp()
        if isinstance(value, (int, float)):
            return value
        return None

    def list_files(self) -> List[str]:
        """Return all matching files below the base path.

        Entries whose name starts with a dot (including ``.`` and ``..``) are
        skipped. A directory that cannot be listed is logged and skipped
        instead of aborting the whole walk.

        Returns:
            Paths relative to the base path.
        """
        files: List[str] = []
        base_path = self._base_path()

        def _list_recursive(current_path: str) -> None:
            try:
                for item in self.conn.listPath(self.share_name, current_path):
                    name = item.filename
                    if name.startswith('.'):
                        continue
                    # rstrip makes the join correct for the share root too.
                    item_path = f"{current_path.rstrip('/')}/{name}"
                    if item.isDirectory:
                        _list_recursive(item_path)
                        continue
                    relative_path = item_path[len(base_path):].lstrip('/')
                    if fnmatch(relative_path, self.pattern):
                        files.append(relative_path)
            except Exception as e:
                logger.warning(f"列出路径失败 {current_path}: {str(e)}")

        _list_recursive(base_path)

        print(f"✓ SMB 找到 {len(files)} 个文件")
        return files

    def read_file(self, file_path: str) -> Tuple[bytes, Dict[str, Any]]:
        """Download one file and describe where it came from.

        Args:
            file_path: Path relative to the base path, as returned by
                ``list_files``.

        Returns:
            ``(content, data_source)`` where ``data_source`` carries the
            ``smb://`` URL, creation/modification timestamps (``None`` when
            the attributes cannot be fetched) and a record locator.

        Raises:
            IOError: If the file cannot be retrieved.
        """
        from io import BytesIO

        full_path = f"{self._base_path().rstrip('/')}/{file_path.lstrip('/')}"

        file_obj = BytesIO()
        try:
            self.conn.retrieveFile(self.share_name, full_path, file_obj)
        except Exception as e:
            raise IOError(f"读取文件失败 {full_path}: {str(e)}") from e

        # Timestamps are best-effort: a failing attribute lookup must not
        # fail the download.
        date_created = None
        date_modified = None
        try:
            attrs = self.conn.getAttributes(self.share_name, full_path)
            date_created = self._to_timestamp(getattr(attrs, 'create_time', None))
            date_modified = self._to_timestamp(getattr(attrs, 'last_write_time', None))
        except Exception as exc:
            logger.debug(f"SMB 获取文件属性失败 {full_path}: {exc}")

        smb_url = f"smb://{self.host}/{self.share_name}{full_path}"
        data_source = {
            'url': smb_url,
            'version': None,
            'date_created': date_created,
            'date_modified': date_modified,
            'record_locator': {
                'server': self.host,
                'share': self.share_name,
                'protocol': 'smb',
                'remote_file_path': full_path
            }
        }

        return file_obj.getvalue(), data_source

    def __del__(self):
        """Close the SMB connection when the source is garbage-collected."""
        if hasattr(self, 'conn') and self.conn:
            try:
                self.conn.close()
            except Exception:
                # Best-effort cleanup during finalisation.
                pass
333
+
334
+
335
+ __all__ = [
336
+ 'Source',
337
+ 'S3Source',
338
+ 'LocalSource',
339
+ 'FtpSource',
340
+ 'SmbSource',
341
+ ]
342
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xparse-client
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: 面向Agent和RAG的新一代文档处理 AI Infra
5
5
  License-Expression: MIT
6
6
  Project-URL: Homepage, https://gitlab.intsig.net/xparse1/xparse-pipeline
@@ -0,0 +1,13 @@
1
+ example/run_pipeline.py,sha256=6gavTizAIqD62g4n9Pjq2-yW57ItZMJOOw8GEKm0Byk,15125
2
+ example/run_pipeline_test.py,sha256=uIU09FTv_VnTQS1Lc94ydc3kaD86eHkaHQbVXpsGEcA,14861
3
+ xparse_client/__init__.py,sha256=je1ena3HwLL4CRtLU4r6EAzoOIJthlPjTwshxZnzQDM,1677
4
+ xparse_client/pipeline/__init__.py,sha256=TVlb2AGCNKP0jrv3p4ZLZCPKp68hTVMFi00DTdi6QAo,49
5
+ xparse_client/pipeline/config.py,sha256=gkhAF-55PNvPPyfTZ0HkP95XB_K0HKCyYl6R4PTQLhI,4045
6
+ xparse_client/pipeline/destinations.py,sha256=rqcxmsn1YGClVxGQxSVmyr-uumOVilOv_vX82fUBj-I,9859
7
+ xparse_client/pipeline/pipeline.py,sha256=oz_BKWLbslkuRsxG0zEfh9url7saLWgtoTH1mrK6gCc,18282
8
+ xparse_client/pipeline/sources.py,sha256=-0Eutg9t8xni12cfv2bdQVdImlkCQ7gWlOXIFBt6tpE,11568
9
+ xparse_client-0.2.4.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
10
+ xparse_client-0.2.4.dist-info/METADATA,sha256=IMgXO9a7wnN0Ygzauk7eOkyrRFE3A2rq73eofvq3wBs,26508
11
+ xparse_client-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
12
+ xparse_client-0.2.4.dist-info/top_level.txt,sha256=bfX8BWo1sEEQVsI4Ql4Uu80vrfEh5zfajU9YqFTzxMo,22
13
+ xparse_client-0.2.4.dist-info/RECORD,,
@@ -1,6 +0,0 @@
1
- xparse_client/__init__.py,sha256=3c_nFCooim4J31P0QWPM2VqdFJiHNFdH44IRmIIHvjk,901
2
- xparse_client-0.2.2.dist-info/licenses/LICENSE,sha256=ckIP-MbocsP9nqYnta5KgfAicYF196B5TNdHIR6kOO0,1075
3
- xparse_client-0.2.2.dist-info/METADATA,sha256=F390ItwP4PGN-9FvJGOgqjtX7YWgMJolSJnxqbEBq08,26508
4
- xparse_client-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
5
- xparse_client-0.2.2.dist-info/top_level.txt,sha256=W5PeQwOyfo_Od3d26-gcOtan7rHYk1q3SP1phYedat4,14
6
- xparse_client-0.2.2.dist-info/RECORD,,