xparse-client 0.2.20__py3-none-any.whl → 0.3.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. example/1_basic_api_usage.py +198 -0
  2. example/2_async_job.py +210 -0
  3. example/3_local_workflow.py +300 -0
  4. example/4_advanced_workflow.py +327 -0
  5. example/README.md +128 -0
  6. example/config_example.json +95 -0
  7. tests/conftest.py +310 -0
  8. tests/unit/__init__.py +1 -0
  9. tests/unit/api/__init__.py +1 -0
  10. tests/unit/api/test_extract.py +232 -0
  11. tests/unit/api/test_local.py +231 -0
  12. tests/unit/api/test_parse.py +374 -0
  13. tests/unit/api/test_pipeline.py +369 -0
  14. tests/unit/api/test_workflows.py +108 -0
  15. tests/unit/connectors/test_ftp.py +525 -0
  16. tests/unit/connectors/test_local_connectors.py +324 -0
  17. tests/unit/connectors/test_milvus.py +368 -0
  18. tests/unit/connectors/test_qdrant.py +399 -0
  19. tests/unit/connectors/test_s3.py +598 -0
  20. tests/unit/connectors/test_smb.py +442 -0
  21. tests/unit/connectors/test_utils.py +335 -0
  22. tests/unit/models/test_local.py +54 -0
  23. tests/unit/models/test_pipeline_stages.py +144 -0
  24. tests/unit/models/test_workflows.py +55 -0
  25. tests/unit/test_base.py +437 -0
  26. tests/unit/test_client.py +110 -0
  27. tests/unit/test_config.py +160 -0
  28. tests/unit/test_exceptions.py +182 -0
  29. tests/unit/test_http.py +562 -0
  30. xparse_client/__init__.py +110 -20
  31. xparse_client/_base.py +179 -0
  32. xparse_client/_client.py +218 -0
  33. xparse_client/_config.py +221 -0
  34. xparse_client/_http.py +350 -0
  35. xparse_client/api/__init__.py +14 -0
  36. xparse_client/api/extract.py +109 -0
  37. xparse_client/api/local.py +188 -0
  38. xparse_client/api/parse.py +209 -0
  39. xparse_client/api/pipeline.py +132 -0
  40. xparse_client/api/workflows.py +204 -0
  41. xparse_client/connectors/__init__.py +45 -0
  42. xparse_client/connectors/_utils.py +138 -0
  43. xparse_client/connectors/destinations/__init__.py +45 -0
  44. xparse_client/connectors/destinations/base.py +116 -0
  45. xparse_client/connectors/destinations/local.py +91 -0
  46. xparse_client/connectors/destinations/milvus.py +229 -0
  47. xparse_client/connectors/destinations/qdrant.py +238 -0
  48. xparse_client/connectors/destinations/s3.py +163 -0
  49. xparse_client/connectors/sources/__init__.py +45 -0
  50. xparse_client/connectors/sources/base.py +74 -0
  51. xparse_client/connectors/sources/ftp.py +278 -0
  52. xparse_client/connectors/sources/local.py +176 -0
  53. xparse_client/connectors/sources/s3.py +232 -0
  54. xparse_client/connectors/sources/smb.py +259 -0
  55. xparse_client/exceptions.py +398 -0
  56. xparse_client/models/__init__.py +60 -0
  57. xparse_client/models/chunk.py +39 -0
  58. xparse_client/models/embed.py +62 -0
  59. xparse_client/models/extract.py +41 -0
  60. xparse_client/models/local.py +38 -0
  61. xparse_client/models/parse.py +136 -0
  62. xparse_client/models/pipeline.py +132 -0
  63. xparse_client/models/workflows.py +74 -0
  64. xparse_client-0.3.0b2.dist-info/METADATA +1075 -0
  65. xparse_client-0.3.0b2.dist-info/RECORD +68 -0
  66. {xparse_client-0.2.20.dist-info → xparse_client-0.3.0b2.dist-info}/WHEEL +1 -1
  67. {xparse_client-0.2.20.dist-info → xparse_client-0.3.0b2.dist-info}/licenses/LICENSE +1 -1
  68. {xparse_client-0.2.20.dist-info → xparse_client-0.3.0b2.dist-info}/top_level.txt +2 -0
  69. xparse_client/pipeline/__init__.py +0 -3
  70. xparse_client/pipeline/config.py +0 -163
  71. xparse_client/pipeline/destinations.py +0 -489
  72. xparse_client/pipeline/pipeline.py +0 -860
  73. xparse_client/pipeline/sources.py +0 -583
  74. xparse_client-0.2.20.dist-info/METADATA +0 -1050
  75. xparse_client-0.2.20.dist-info/RECORD +0 -11
@@ -1,583 +0,0 @@
1
- #!/usr/bin/env python
2
- # -*- encoding: utf-8 -*-
3
-
4
- import json
5
- import logging
6
- import boto3
7
- import ftplib
8
-
9
- from abc import ABC, abstractmethod
10
- from datetime import datetime, timezone
11
- from email.utils import parsedate_to_datetime
12
- from fnmatch import fnmatch
13
- from pathlib import Path
14
- from typing import List, Dict, Any, Tuple, Optional
15
-
16
- from smb.SMBConnection import SMBConnection
17
- from botocore.config import Config
18
-
19
-
20
- logger = logging.getLogger(__name__)
21
-
22
-
23
- def _normalize_wildcard_patterns(pattern: Optional[List[str]]) -> Optional[List[str]]:
24
- """规范化通配符模式列表
25
-
26
- Args:
27
- pattern: 通配符模式列表,如果为 None 或空列表则返回 None(表示匹配所有文件)
28
-
29
- Returns:
30
- 通配符模式列表,如果 pattern 是 None、空列表或包含 "*" 则返回 None(表示匹配所有文件)
31
- """
32
- if pattern is None or not pattern:
33
- return None # None 表示匹配所有文件
34
-
35
- if not isinstance(pattern, list):
36
- raise ValueError(f"pattern 类型错误: {type(pattern)}")
37
-
38
- # 过滤空字符串并去除空格
39
- normalized = [p.strip() for p in pattern if p and p.strip()]
40
-
41
- if not normalized:
42
- return None
43
-
44
- # 如果包含 "*",直接返回 None(匹配所有文件,减少后续开销)
45
- if '*' in normalized:
46
- return None
47
-
48
- return normalized
49
-
50
-
51
- def _match_file_extension(file_path: str, wildcard_patterns: Optional[List[str]]) -> bool:
52
- """检查文件路径是否匹配通配符模式
53
-
54
- Args:
55
- file_path: 文件路径
56
- wildcard_patterns: 已规范化的通配符模式列表(如 ['*.pdf', '*.docx'])
57
-
58
- Returns:
59
- 如果匹配返回 True,否则返回 False
60
- """
61
- # 如果 wildcard_patterns 是 None 或空列表,匹配所有文件
62
- if wildcard_patterns is None:
63
- return True
64
-
65
- # 检查是否匹配任何一个通配符模式
66
- for wildcard_pattern in wildcard_patterns:
67
- if fnmatch(file_path, wildcard_pattern):
68
- return True
69
-
70
- return False
71
-
72
-
73
- def _to_millis_timestamp_string(timestamp):
74
- """将时间戳转换为毫秒时间戳字符串
75
-
76
- Args:
77
- timestamp: 时间戳(秒或毫秒),可以是 int、float 或 None
78
-
79
- Returns:
80
- str: 毫秒时间戳字符串,如果输入为 None 则返回空字符串
81
- """
82
- if timestamp is None:
83
- return ""
84
-
85
- # 如果已经是毫秒时间戳(大于 1e12),直接转换
86
- if isinstance(timestamp, (int, float)):
87
- if timestamp > 1e12:
88
- # 已经是毫秒时间戳
89
- return str(int(timestamp))
90
- else:
91
- # 秒级时间戳,转换为毫秒
92
- return str(int(timestamp * 1000))
93
-
94
- return str(timestamp)
95
-
96
-
97
class Source(ABC):
    """Abstract base class for all data sources."""

    @abstractmethod
    def list_files(self) -> List[str]:
        """Return the paths of every file available from this source."""
        raise NotImplementedError

    @abstractmethod
    def read_file(self, file_path: str) -> Tuple[bytes, Dict[str, Any]]:
        """Return the file's raw bytes together with its data-source metadata."""
        raise NotImplementedError
109
-
110
-
111
class S3Source(Source):
    """S3/MinIO data source.

    Lists and reads objects from an S3-compatible bucket, optionally
    restricted to a key prefix and filtered by wildcard patterns.
    """

    def __init__(self, endpoint: str, access_key: str, secret_key: str,
                 bucket: str, prefix: str = '', region: str = 'us-east-1', pattern: Optional[List[str]] = None, recursive: bool = False):
        """Create the S3 client and verify bucket access with a HEAD request.

        Args:
            endpoint: S3/MinIO endpoint URL.
            access_key: Access key id.
            secret_key: Secret access key.
            bucket: Bucket name.
            prefix: Key prefix used to narrow listings.
            region: Region name passed to the client.
            pattern: Wildcard patterns (e.g. ['*.pdf']); None matches all files.
            recursive: When False, listing stops at the first '/' after the prefix.

        Raises:
            Exception: re-raised from head_bucket when the bucket is unreachable.
        """
        self.endpoint = endpoint
        self.bucket = bucket
        self.prefix = prefix
        self.pattern = _normalize_wildcard_patterns(pattern)  # normalize once at init time
        self.recursive = recursive

        # This one known MinIO endpoint is configured without virtual-host
        # addressing; every other endpoint uses virtual-host style.
        # NOTE(review): presumably that endpoint only supports path-style
        # requests — confirm before generalizing.
        if self.endpoint == 'https://textin-minio-api.ai.intsig.net':
            config = Config(signature_version='s3v4')
        else:
            config = Config(signature_version='s3v4', s3={'addressing_style': 'virtual'})

        self.client = boto3.client(
            's3',
            endpoint_url=endpoint,
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
            region_name=region,
            config=config
        )

        # Fail fast if the bucket is unreachable or credentials are wrong.
        try:
            self.client.head_bucket(Bucket=bucket)
            print(f"✓ S3 连接成功: {endpoint}/{bucket}")
            logger.info(f"S3 连接成功: {endpoint}/{bucket}")
        except Exception as e:
            print(f"✗ S3 连接失败: {str(e)}")
            raise

    def list_files(self) -> List[str]:
        """List object keys under the configured prefix that match the patterns.

        Returns:
            Object keys, excluding directory placeholders ('.../') and
            'empty.tmp' marker objects.
        """
        files = []
        paginator = self.client.get_paginator('list_objects_v2')

        params = {'Bucket': self.bucket}
        if self.prefix:
            params['Prefix'] = self.prefix
        if not self.recursive:
            # Non-recursive mode: a '/' delimiter limits results to the
            # current "directory" level.
            params['Delimiter'] = '/'

        for page in paginator.paginate(**params):
            if 'Contents' in page:
                for obj in page['Contents']:
                    key = obj['Key']
                    # Skip directory placeholder keys and empty.tmp markers.
                    if key.endswith('/') or key.endswith('empty.tmp'):
                        continue
                    if _match_file_extension(key, self.pattern):
                        files.append(key)

            # In non-recursive mode CommonPrefixes holds sub-directories,
            # which are deliberately ignored.
            if not self.recursive and 'CommonPrefixes' in page:
                pass

        print(f"✓ S3 找到 {len(files)} 个文件")
        return files

    def read_file(self, file_path: str) -> Tuple[bytes, Dict[str, Any]]:
        """Download an object and build its data-source metadata.

        Args:
            file_path: Object key within the bucket.

        Returns:
            Tuple of (file bytes, data-source metadata dict with url,
            version, timestamps and a record locator).
        """
        response = self.client.get_object(Bucket=self.bucket, Key=file_path)
        file_bytes = response['Body'].read()

        # Derive version/time metadata from the raw HTTP response headers.
        headers = response.get('ResponseMetadata', {}).get('HTTPHeaders', {})
        version = headers.get('etag') or ""
        # ETags arrive wrapped in double quotes; strip them.
        if version.startswith('"') and version.endswith('"'):
            version = version[1:-1]
        last_modified = headers.get('last-modified')
        server = headers.get('server') or "unknown"
        date_modified = None
        if last_modified:
            try:
                # last-modified is an RFC 2822 date; convert to a UTC epoch.
                dt = parsedate_to_datetime(last_modified)
                date_modified = dt.astimezone(timezone.utc).timestamp()
            except Exception as exc:
                logger.debug(f"S3 解析 last-modified 失败 {file_path}: {exc}")

        normalized_key = file_path.lstrip('/')
        # S3 exposes no creation time, so date_created reuses date_modified.
        data_source = {
            'url': f"s3://{self.bucket}/{normalized_key}",
            'version': version,
            'date_created': _to_millis_timestamp_string(date_modified),
            'date_modified': _to_millis_timestamp_string(date_modified),
            'record_locator': {
                'server': server,
                'protocol': 's3',
                'remote_file_path': normalized_key
            }
        }

        return file_bytes, data_source
204
-
205
-
206
class LocalSource(Source):
    """Local filesystem data source."""

    def __init__(self, directory: str, pattern: Optional[List[str]] = None, recursive: bool = False):
        """Validate the directory and store the normalized patterns.

        Args:
            directory: Root directory to read files from.
            pattern: Wildcard patterns (e.g. ['*.pdf']); None matches all files.
            recursive: When True, descend into sub-directories.

        Raises:
            ValueError: if the directory does not exist.
        """
        self.directory = Path(directory)
        self.pattern = _normalize_wildcard_patterns(pattern)  # normalize once up front
        self.recursive = recursive

        if not self.directory.exists():
            raise ValueError(f"目录不存在: {directory}")

        print(f"✓ 本地目录: {self.directory}")
        logger.info(f"本地目录: {self.directory}")

    def list_files(self) -> List[str]:
        """Return paths (relative to the root) of every matching file."""
        # Pick the walk strategy first, then collect plain files only.
        walker = self.directory.rglob('*') if self.recursive else self.directory.glob('*')
        candidates = [
            str(entry.relative_to(self.directory))
            for entry in walker
            if entry.is_file()
        ]

        # Apply wildcard filtering when patterns were configured.
        if self.pattern is None:
            matched = list(candidates)
        else:
            matched = [name for name in candidates if _match_file_extension(name, self.pattern)]

        print(f"✓ 本地找到 {len(matched)} 个文件")
        return matched

    def read_file(self, file_path: str) -> Tuple[bytes, Dict[str, Any]]:
        """Read a file and build its data-source metadata.

        Args:
            file_path: Path relative to the configured root directory.

        Returns:
            Tuple of (file bytes, data-source metadata dict).
        """
        full_path = (self.directory / file_path).resolve()
        file_bytes = full_path.read_bytes()

        date_created = None
        date_modified = None
        version = None
        try:
            stats = full_path.stat()
        except FileNotFoundError:
            # Best-effort metadata: the file may vanish between read and stat.
            logger.warning(f"本地文件不存在,无法获取 metadata: {full_path}")
        else:
            date_created = stats.st_ctime
            date_modified = stats.st_mtime
            # Nanosecond mtime doubles as a cheap content version marker.
            version = str(int(stats.st_mtime_ns))

        data_source = {
            'url': full_path.as_uri(),
            'version': version,
            'date_created': _to_millis_timestamp_string(date_created),
            'date_modified': _to_millis_timestamp_string(date_modified),
            'record_locator': {
                'protocol': 'file',
                'remote_file_path': str(full_path)
            }
        }
        return file_bytes, data_source
274
-
275
-
276
class FtpSource(Source):
    """FTP data source.

    Connects once at construction time and reuses the same ftplib client
    for listing and downloads; listing falls back from MLSD to LIST to
    NLST depending on what the server supports.
    """

    def __init__(self, host: str, port: int, username: str, password: str, pattern: Optional[List[str]] = None, recursive: bool = False):
        """Connect and log in to the FTP server.

        Args:
            host: FTP server hostname.
            port: FTP server port.
            username: Login user.
            password: Login password.
            pattern: Wildcard patterns (e.g. ['*.pdf']); None matches all files.
            recursive: When True, walk sub-directories as well.
        """
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self.pattern = _normalize_wildcard_patterns(pattern)  # normalize once at init time
        self.recursive = recursive

        self.client = ftplib.FTP()
        self.client.connect(self.host, self.port)
        self.client.login(self.username, self.password)

        print(f"✓ FTP 连接成功: {self.host}:{self.port}")
        logger.info(f"FTP 连接成功: {self.host}:{self.port}")

    def list_files(self) -> List[str]:
        """List files matching the configured patterns.

        Returns:
            File paths relative to the login directory. Directory entries
            are excluded; detection degrades from MLSD metadata to LIST
            parsing to a cwd-probe when the server supports less.
        """
        if self.recursive:
            # Recursive mode: walk the directory tree depth-first.
            files = []
            current_dir = self.client.pwd()

            def _list_recursive(path=''):
                # Lists one directory and recurses into its sub-directories.
                # The client's working directory is mutated and restored, so
                # the exact cwd call order matters.
                try:
                    # Remember where we were so we can return afterwards.
                    original_dir = self.client.pwd()
                    if path:
                        try:
                            self.client.cwd(path)
                        except:
                            return

                    items = []
                    try:
                        # Prefer MLSD: it reports an explicit entry type.
                        items = []
                        for item in self.client.mlsd():
                            items.append(item)
                    except:
                        # MLSD unsupported: fall back to parsing LIST output.
                        try:
                            lines = []
                            self.client.retrlines('LIST', lines.append)
                            for line in lines:
                                parts = line.split()
                                if len(parts) >= 9:
                                    # Unix-style LIST: first char of the mode
                                    # field flags directories; the name may
                                    # contain spaces, hence the re-join.
                                    item_name = ' '.join(parts[8:])
                                    is_dir = parts[0].startswith('d')
                                    items.append((item_name, {'type': 'dir' if is_dir else 'file'}))
                        except:
                            # Last resort: NLST gives names only, with no way
                            # to tell files from directories here.
                            for item_name in self.client.nlst():
                                items.append((item_name, {'type': 'unknown'}))

                    for item_name, item_info in items:
                        if item_name in ['.', '..']:
                            continue

                        item_type = item_info.get('type', 'unknown')
                        full_path = f"{path}/{item_name}" if path else item_name

                        if item_type == 'dir' or item_type == 'unknown':
                            # Probe with cwd to decide whether it is a directory.
                            try:
                                self.client.cwd(item_name)
                                self.client.cwd('..')
                                # cwd succeeded: it is a directory — recurse.
                                _list_recursive(full_path)
                            except:
                                # cwd failed: treat it as a regular file.
                                relative_path = full_path.lstrip('/')
                                if _match_file_extension(relative_path, self.pattern):
                                    files.append(relative_path)
                        else:
                            # Explicitly reported as a file.
                            relative_path = full_path.lstrip('/')
                            if _match_file_extension(relative_path, self.pattern):
                                files.append(relative_path)

                    # Return to the directory we entered with.
                    self.client.cwd(original_dir)
                except Exception as e:
                    logger.warning(f"FTP 列出路径失败 {path}: {str(e)}")
                    try:
                        self.client.cwd(current_dir)
                    except:
                        pass

            _list_recursive()
            # Make sure we end up back in the starting directory.
            try:
                self.client.cwd(current_dir)
            except:
                pass
        else:
            # Non-recursive mode: only files in the current directory
            # (directories are skipped).
            files = []
            current_dir = self.client.pwd()

            try:
                # Prefer MLSD: it reports an explicit entry type.
                items = []
                for item_name, item_info in self.client.mlsd():
                    if item_name in ['.', '..']:
                        continue
                    item_type = item_info.get('type', 'unknown')
                    # Keep files only; skip directory entries.
                    if item_type == 'file' or (item_type == 'unknown' and not item_info.get('type', '').startswith('dir')):
                        if _match_file_extension(item_name, self.pattern):
                            files.append(item_name)
            except:
                # MLSD unsupported: fall back to parsing LIST output.
                try:
                    lines = []
                    self.client.retrlines('LIST', lines.append)
                    for line in lines:
                        parts = line.split()
                        if len(parts) >= 9:
                            # Unix-style LIST: first char flags directories.
                            item_name = ' '.join(parts[8:])
                            if item_name in ['.', '..']:
                                continue
                            is_dir = parts[0].startswith('d')
                            # Keep files only; skip directory entries.
                            if not is_dir and _match_file_extension(item_name, self.pattern):
                                files.append(item_name)
                except:
                    # Last resort: NLST plus a cwd probe per entry to detect
                    # directories.
                    raw_items = self.client.nlst()
                    for item_name in raw_items:
                        if item_name in ['.', '..']:
                            continue
                        # Probe with cwd to decide whether it is a directory.
                        try:
                            self.client.cwd(item_name)
                            self.client.cwd('..')
                            # cwd succeeded: a directory — skip it.
                            continue
                        except:
                            # cwd failed: treat it as a regular file.
                            if _match_file_extension(item_name, self.pattern):
                                files.append(item_name)

            # Make sure we end up back in the starting directory.
            try:
                self.client.cwd(current_dir)
            except:
                pass

        print(f"✓ FTP 找到 {len(files)} 个文件 (匹配 pattern)")
        return files

    def read_file(self, file_path: str) -> Tuple[bytes, Dict[str, Any]]:
        """Download a file via RETR and build its data-source metadata.

        Args:
            file_path: Path relative to the login directory.

        Returns:
            Tuple of (file bytes, data-source metadata dict).
        """
        from io import BytesIO
        buffer = BytesIO()
        self.client.retrbinary(f'RETR {file_path}', buffer.write)

        date_modified = None
        try:
            # MDTM replies "213 YYYYMMDDHHMMSS" with the modification time.
            resp = self.client.sendcmd(f"MDTM {file_path}")
            parts = resp.split()
            if len(parts) == 2 and parts[0] == '213':
                dt = datetime.strptime(parts[1], "%Y%m%d%H%M%S")
                # MDTM times are interpreted as UTC here — per RFC 3659 the
                # server reports UTC.
                date_modified = dt.replace(tzinfo=timezone.utc).timestamp()
        except Exception as exc:
            logger.debug(f"FTP 获取文件时间失败 {file_path}: {exc}")

        normalized_path = file_path.lstrip('/')
        # FTP exposes no creation time, so all three fields reuse the
        # modification time.
        version = _to_millis_timestamp_string(date_modified)
        data_source = {
            'url': f"ftp://{self.host}:{self.port}/{normalized_path}",
            'version': version,
            'date_created': version,
            'date_modified': version,
            'record_locator': {
                'server': f"{self.host}:{self.port}",
                'protocol': 'ftp',
                'remote_file_path': normalized_path
            }
        }

        return buffer.getvalue(), data_source
461
-
462
-
463
class SmbSource(Source):
    """SMB/CIFS data source.

    Uses a single pysmb SMBConnection, established at construction time,
    to list and read files from one share.
    """

    def __init__(self, host: str, share_name: str, username: str, password: str,
                 domain: str = '', port: int = 445, path: str = '', pattern: Optional[List[str]] = None, recursive: bool = False):
        """Connect to the SMB server.

        Args:
            host: SMB server hostname or address.
            share_name: Name of the share to read from.
            username: Login user.
            password: Login password.
            domain: NT domain (empty for none).
            port: SMB port (445 by default).
            path: Base path inside the share; leading/trailing slashes are stripped.
            pattern: Wildcard patterns (e.g. ['*.pdf']); None matches all files.
            recursive: When True, walk sub-directories as well.

        Raises:
            ConnectionError: when the TCP/SMB connection cannot be established.
        """
        self.host = host
        self.share_name = share_name
        self.username = username
        self.password = password
        self.domain = domain
        self.port = port
        # Normalize the base path: no leading/trailing '/' or '\'.
        self.path = path.strip('/').strip('\\') if path else ''
        self.pattern = _normalize_wildcard_patterns(pattern)  # normalize once at init time
        self.recursive = recursive

        self.conn = SMBConnection(
            username,
            password,
            '',
            host,
            domain=domain,
            use_ntlm_v2=True
        )

        try:
            self.conn.connect(host, port)
        except Exception as e:
            error_msg = f"无法连接到 SMB 服务器 {host}:{port}: {str(e)}"
            print(f"✗ SMB 连接失败: {error_msg}")
            logger.error(f"SMB 连接失败: {error_msg}")
            raise ConnectionError(error_msg)

    def list_files(self) -> List[str]:
        """List files under the base path that match the configured patterns.

        Returns:
            File paths relative to the base path. Hidden entries (names
            starting with '.') are skipped; listing errors per directory
            are logged and swallowed.
        """
        files = []
        base_path = '/' if not self.path else f'/{self.path}'

        def _list_recursive(conn, share, current_path):
            # Lists one directory, appending matches to the enclosing
            # `files` list; recurses only when self.recursive is set.
            try:
                items = conn.listPath(share, current_path)
                for item in items:
                    # Skip '.', '..' and hidden (dot-prefixed) entries.
                    if item.filename in ['.', '..'] or item.filename.startswith('.'):
                        continue
                    item_path = f"{current_path.rstrip('/')}/{item.filename}" if current_path != '/' else f"/{item.filename}"
                    # Path relative to the configured base path.
                    relative_path = item_path[len(base_path):].lstrip('/')
                    if item.isDirectory:
                        if self.recursive:
                            # Recursive mode: descend into the sub-directory.
                            _list_recursive(conn, share, item_path)
                        # Non-recursive mode: sub-directories are ignored.
                    else:
                        if _match_file_extension(relative_path, self.pattern):
                            files.append(relative_path)
            except Exception as e:
                logger.warning(f"列出路径失败 {current_path}: {str(e)}")

        _list_recursive(self.conn, self.share_name, base_path)

        print(f"✓ SMB 找到 {len(files)} 个文件")
        return files

    def read_file(self, file_path: str) -> Tuple[bytes, Dict[str, Any]]:
        """Retrieve a file from the share and build its metadata.

        Args:
            file_path: Path relative to the configured base path.

        Returns:
            Tuple of (file bytes, data-source metadata dict).

        Raises:
            IOError: when the file cannot be retrieved.
        """
        from io import BytesIO

        # Rebuild the absolute path inside the share from the base path.
        base_path = '/' if not self.path else f'/{self.path}'
        full_path = f"{base_path.rstrip('/')}/{file_path.lstrip('/')}" if base_path != '/' else f"/{file_path.lstrip('/')}"

        file_obj = BytesIO()
        try:
            self.conn.retrieveFile(self.share_name, full_path, file_obj)
        except Exception as e:
            raise IOError(f"读取文件失败 {full_path}: {str(e)}")

        def _to_timestamp(value):
            # Normalize pysmb attribute times (datetime or numeric epoch)
            # to a UTC epoch-seconds float; None when unparseable.
            if isinstance(value, datetime):
                return value.astimezone(timezone.utc).timestamp()
            if isinstance(value, (int, float)):
                return value
            return None

        date_created = None
        date_modified = None
        try:
            attrs = self.conn.getAttributes(self.share_name, full_path)
            date_created = _to_timestamp(getattr(attrs, 'create_time', None))
            date_modified = _to_timestamp(getattr(attrs, 'last_write_time', None))
        except Exception as exc:
            # Best-effort metadata: attribute failures are only logged.
            logger.debug(f"SMB 获取文件属性失败 {full_path}: {exc}")

        smb_url = f"smb://{self.host}/{self.share_name}{full_path}"
        data_source = {
            'url': smb_url,
            'version': _to_millis_timestamp_string(date_modified),
            'date_created': _to_millis_timestamp_string(date_created),
            'date_modified': _to_millis_timestamp_string(date_modified),
            'record_locator': {
                'server': self.host,
                'share': self.share_name,
                'protocol': 'smb',
                'remote_file_path': full_path
            }
        }

        file_obj.seek(0)
        return file_obj.read(), data_source

    def __del__(self):
        # Best-effort cleanup of the SMB connection on garbage collection.
        if hasattr(self, 'conn') and self.conn:
            try:
                self.conn.close()
            except Exception:
                pass
574
-
575
-
576
# Public API of this module: the abstract base class plus the four
# concrete data-source implementations.
__all__ = [
    'Source',
    'S3Source',
    'LocalSource',
    'FtpSource',
    'SmbSource',
]
583
-