synapse-sdk 1.0.0b23__py3-none-any.whl → 2025.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of synapse-sdk has been flagged by the registry as possibly problematic.
- synapse_sdk/clients/agent/ray.py +50 -0
- synapse_sdk/devtools/docs/docs/api/clients/ray.md +24 -3
- synapse_sdk/devtools/docs/docs/api/index.md +5 -5
- synapse_sdk/devtools/docs/docs/features/utils/file.md +415 -0
- synapse_sdk/devtools/docs/docs/{api → features}/utils/network.md +1 -1
- synapse_sdk/devtools/docs/docs/plugins/export-plugins.md +140 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/clients/ray.md +24 -3
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/index.md +5 -5
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/features/index.md +5 -5
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/features/utils/file.md +415 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/{api → features}/utils/network.md +1 -1
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/export-plugins.md +138 -0
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/plugins.md +48 -2
- synapse_sdk/devtools/docs/sidebars.ts +10 -10
- synapse_sdk/plugins/categories/export/templates/plugin/__init__.py +17 -2
- synapse_sdk/utils/file/__init__.py +39 -0
- synapse_sdk/utils/file/archive.py +32 -0
- synapse_sdk/utils/file/checksum.py +56 -0
- synapse_sdk/utils/file/chunking.py +31 -0
- synapse_sdk/utils/file/download.py +124 -0
- synapse_sdk/utils/file/encoding.py +40 -0
- synapse_sdk/utils/file/io.py +22 -0
- synapse_sdk/utils/file/video/__init__.py +29 -0
- synapse_sdk/utils/file/video/transcode.py +307 -0
- {synapse_sdk-1.0.0b23.dist-info → synapse_sdk-2025.9.1.dist-info}/METADATA +3 -2
- {synapse_sdk-1.0.0b23.dist-info → synapse_sdk-2025.9.1.dist-info}/RECORD +35 -26
- synapse_sdk/devtools/docs/docs/api/utils/file.md +0 -195
- synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/api/utils/file.md +0 -195
- /synapse_sdk/devtools/docs/docs/{api → features}/utils/storage.md +0 -0
- /synapse_sdk/devtools/docs/docs/{api → features}/utils/types.md +0 -0
- /synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/{api → features}/utils/storage.md +0 -0
- /synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/{api → features}/utils/types.md +0 -0
- /synapse_sdk/utils/{file.py → file.py.backup} +0 -0
- {synapse_sdk-1.0.0b23.dist-info → synapse_sdk-2025.9.1.dist-info}/WHEEL +0 -0
- {synapse_sdk-1.0.0b23.dist-info → synapse_sdk-2025.9.1.dist-info}/entry_points.txt +0 -0
- {synapse_sdk-1.0.0b23.dist-info → synapse_sdk-2025.9.1.dist-info}/licenses/LICENSE +0 -0
- {synapse_sdk-1.0.0b23.dist-info → synapse_sdk-2025.9.1.dist-info}/top_level.txt +0 -0
synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/export-plugins.md
CHANGED
````diff
@@ -213,6 +213,7 @@ class Exporter(BaseExporter):
 - **before_convert()**: Pre-process data before conversion
 - **after_convert()**: Post-process data after conversion
 - **process_file_saving()**: Custom file-saving logic
+- **additional_file_saving()**: Save additional files after all export items are processed
 
 ### Helper methods
 
@@ -225,6 +226,143 @@ class Exporter(BaseExporter):
 - Logging via `self.run.log_message()` and other run methods
 - Error handling and metrics collection via run methods
 
+## Additional File Saving
+
+The `additional_file_saving()` method is called after every export item has been processed. It is designed for saving files that depend on the collective data of all processed items, and is useful for:
+
+- Metadata files (e.g., dataset statistics, class mappings)
+- Configuration files (e.g., dataset.yaml and classes.txt for YOLO)
+- Summary files (e.g., export reports, processing logs)
+- Index files (e.g., file listings, directory structures)
+
+### Method signature
+
+```python
+def additional_file_saving(self, unique_export_path):
+    """Save additional files after all export items are processed.
+
+    This method is called after the main export loop completes and is intended
+    for saving files that must be generated from the collective data of all
+    processed export items (e.g., metadata files, configuration files,
+    summary files).
+
+    Args:
+        unique_export_path (str): The unique export directory path where
+            additional files are saved.
+    """
+    pass
+```
+
+### Usage example
+
+```python
+class YOLOExporter(BaseExporter):
+    def __init__(self, run, export_items, path_root, **params):
+        super().__init__(run, export_items, path_root, **params)
+        self.class_names = set()
+        self.dataset_stats = {
+            'total_images': 0,
+            'total_annotations': 0,
+            'class_distribution': {}
+        }
+
+    def convert_data(self, data):
+        # Track classes and statistics during conversion
+        for annotation in data.get('annotations', []):
+            class_name = annotation['class_name']
+            self.class_names.add(class_name)
+            self.dataset_stats['class_distribution'][class_name] = \
+                self.dataset_stats['class_distribution'].get(class_name, 0) + 1
+
+        self.dataset_stats['total_images'] += 1
+        self.dataset_stats['total_annotations'] += len(data.get('annotations', []))
+
+        return data  # ... remaining conversion logic
+
+    def additional_file_saving(self, unique_export_path):
+        """Save YOLO configuration and metadata files."""
+        data_dir = Path(unique_export_path) / 'data'
+        data_dir.mkdir(exist_ok=True)
+
+        # 1. Save the classes.txt file
+        classes_file = data_dir / 'classes.txt'
+        with classes_file.open('w') as f:
+            for class_name in sorted(self.class_names):
+                f.write(f"{class_name}\n")
+        self.run.log_message(f"Saved classes file: {classes_file}")
+
+        # 2. Save the dataset.yaml file
+        dataset_config = {
+            'path': str(unique_export_path),
+            'train': 'images',
+            'val': 'images',
+            'names': {i: name for i, name in enumerate(sorted(self.class_names))}
+        }
+
+        dataset_file = data_dir / 'dataset.yaml'
+        with dataset_file.open('w') as f:
+            yaml.dump(dataset_config, f, default_flow_style=False)
+        self.run.log_message(f"Saved dataset config: {dataset_file}")
+
+        # 3. Save export statistics
+        stats_file = data_dir / 'export_stats.json'
+        with stats_file.open('w') as f:
+            json.dump(self.dataset_stats, f, indent=2)
+        self.run.log_message(f"Saved export stats: {stats_file}")
+```
+
+### Common use cases
+
+#### 1. Dataset configuration files
+```python
+def additional_file_saving(self, unique_export_path):
+    # Build a dataset configuration for the training framework
+    config = {
+        'dataset_name': self.params.get('name'),
+        'created_at': datetime.now().isoformat(),
+        'total_samples': len(self.processed_items),
+        'classes': list(self.class_mapping.keys())
+    }
+
+    config_file = Path(unique_export_path) / 'dataset_config.json'
+    with config_file.open('w') as f:
+        json.dump(config, f, indent=2)
+```
+
+#### 2. Export summary reports
+```python
+def additional_file_saving(self, unique_export_path):
+    # Build the export summary
+    summary = {
+        'export_info': {
+            'plugin_name': self.__class__.__name__,
+            'export_time': datetime.now().isoformat(),
+            'export_path': str(unique_export_path)
+        },
+        'statistics': self.get_export_statistics(),
+        'errors': self.get_error_summary()
+    }
+
+    summary_file = Path(unique_export_path) / 'export_summary.json'
+    with summary_file.open('w') as f:
+        json.dump(summary, f, indent=2)
+```
+
+#### 3. Index and manifest files
+```python
+def additional_file_saving(self, unique_export_path):
+    # Build a file index of the processed items
+    file_index = []
+    for item in self.processed_items:
+        file_index.append({
+            'original_file': item['original_filename'],
+            'json_file': f"{item['stem']}.json",
+            'processed_at': item['timestamp']
+        })
+
+    index_file = Path(unique_export_path) / 'file_index.json'
+    with index_file.open('w') as f:
+        json.dump(file_index, f, indent=2)
+```
+
 ## Key features
 
 - **Progress tracking**: Built-in progress monitoring with `run.set_progress()`
````
synapse_sdk/devtools/docs/i18n/ko/docusaurus-plugin-content-docs/current/plugins/plugins.md
CHANGED

````diff
@@ -67,5 +67,51 @@
 - `ground_truth` - Export ground-truth dataset versions
 - `task` - Export task data with related annotations
 
-For more details about export plugins, the BaseExporter class structure, implementation examples, and best practices, see [Export Plugins](
-
+For more details about export plugins, the BaseExporter class structure, implementation examples, and best practices, see the [Export Plugins](./export-plugins) documentation.
+
+## Plugin Configuration
+
+Plugins are configured with a YAML configuration file that defines metadata, dependencies, package-management options, and actions.
+
+### Package management
+
+```yaml
+# Plugin metadata (required)
+name: "my-ml-plugin"              # Required: plugin name
+version: "1.0.0"                  # Required: version
+category: "neural_net"            # Required: category
+description: "Custom ML plugin"   # Required: description
+
+# Package management (optional)
+package_manager: "pip"            # Optional: "pip" or "uv" (default: "pip")
+
+# Package manager options (optional)
+# Supported for uv only; pip_install_options: planned
+# For uv the default is ['--no-cache']
+package_manager_options: ["--no-cache", "--quiet"]  # Optional: install options
+
+# Action definitions (required)
+actions:                          # Required: at least one action
+  train:
+    entrypoint: "plugin.train.TrainAction"  # Required
+    method: "job"                 # Required: execution method
+```
+
+**Configuration options:**
+
+**Required fields:**
+- `name`: Plugin name
+- `version`: Plugin version
+- `category`: Plugin category (`neural_net`, `export`, etc.)
+- `description`: Plugin description
+- `actions`: Plugin action definitions (at least one required)
+  - `entrypoint`: Entry-point class for the action
+  - `method`: Execution method (`job`, `task`, `serve`, etc.)
+
+**Optional fields:**
+- `package_manager`: Package manager used to install dependencies (default: `"pip"`)
+  - Supported values: `"pip"` or `"uv"`
+- `package_manager_options`: User-supplied options for the package install command
+  - Currently only `uv` is supported on Ray 2.44.1
+  - For `uv` the default is `["--no-cache"]`
+  - User-supplied options are merged with the defaults
````
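The required and optional fields above can be checked mechanically before a plugin is packaged. Below is a minimal sketch, not part of synapse-sdk, that loads a plugin config with PyYAML and verifies the documented required fields; the filename `config.yaml` and the validation rules are assumptions drawn from the list above.

```python
# Minimal validation sketch (not part of synapse-sdk). Assumes the plugin
# config above is saved as "config.yaml"; the checks mirror the required
# fields documented in this section.
import yaml

REQUIRED_FIELDS = ('name', 'version', 'category', 'description', 'actions')
REQUIRED_ACTION_KEYS = ('entrypoint', 'method')

with open('config.yaml') as f:
    config = yaml.safe_load(f)

missing = [field for field in REQUIRED_FIELDS if field not in config]
if missing:
    raise ValueError(f'missing required fields: {missing}')
if not config['actions']:
    raise ValueError('at least one action is required')

for action_name, action in config['actions'].items():
    for key in REQUIRED_ACTION_KEYS:
        if key not in action:
            raise ValueError(f'action {action_name!r} is missing {key!r}')

# Optional fields fall back to the documented defaults.
package_manager = config.get('package_manager', 'pip')
default_options = ['--no-cache'] if package_manager == 'uv' else []
options = config.get('package_manager_options', default_options)
print(package_manager, options)
```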
synapse_sdk/devtools/docs/sidebars.ts
CHANGED

```diff
@@ -25,6 +25,16 @@ const sidebars: SidebarsConfig = {
         'features/converters/converters',
       ],
     },
+    {
+      type: 'category',
+      label: 'Utilities',
+      items: [
+        'features/utils/file',
+        'features/utils/network',
+        'features/utils/storage',
+        'features/utils/types',
+      ],
+    },
     {
       type: 'category',
       label: 'Plugin System',
@@ -49,16 +59,6 @@ const sidebars: SidebarsConfig = {
         'api/clients/base',
       ],
     },
-    {
-      type: 'category',
-      label: 'Utilities',
-      items: [
-        'api/utils/file',
-        'api/utils/network',
-        'api/utils/storage',
-        'api/utils/types',
-      ],
-    },
   ],
 },
 'configuration',
```
synapse_sdk/plugins/categories/export/templates/plugin/__init__.py
CHANGED

```diff
@@ -306,6 +306,20 @@ class BaseExporter:
 
         return True
 
+    def additional_file_saving(self, unique_export_path):
+        """Save additional files after processing all export items.
+
+        This method is called after the main export loop completes and is intended
+        for saving files that need to be created based on the collective data from
+        all processed export items (e.g., metadata files, configuration files,
+        summary files, etc.).
+
+        Args:
+            unique_export_path (str): The unique export directory path where
+                additional files should be saved.
+        """
+        pass
+
     def export(self, export_items=None, results=None, **_kwargs) -> dict:
         """Main export method that can be overridden by subclasses for custom logic.
 
@@ -340,8 +354,8 @@ class BaseExporter:
 
         total = self.params['count']
 
-        original_file_metrics_record = self.run.MetricsRecord(stand_by=total)
-        data_file_metrics_record = self.run.MetricsRecord(stand_by=total)
+        original_file_metrics_record = self.run.MetricsRecord(stand_by=total, success=0, failed=0)
+        data_file_metrics_record = self.run.MetricsRecord(stand_by=total, success=0, failed=0)
 
         # progress init
         self.run.set_progress(0, total, category='dataset_conversion')
@@ -367,6 +381,7 @@ class BaseExporter:
             if not should_continue:
                 continue
 
+        self.additional_file_saving(unique_export_path)
         self.run.end_log()
 
         # Save error list files
```
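For context, a minimal sketch of how a plugin might use the new hook: a subclass overrides `additional_file_saving()`, and the base `export()` loop (per the diff above) invokes it once after all items are processed. `BaseExporter`, `self.run`, and `self.params['count']` are assumed to behave as shown in the template and docs; the manifest file itself is purely illustrative.

```python
# Minimal sketch (assumes the BaseExporter template shown in the diff above):
# the subclass only overrides additional_file_saving(), and export() calls it
# once after the main loop. self.run.log_message() and self.params['count']
# are taken from the template/docs; the manifest content is illustrative.
import json
from pathlib import Path


class ManifestExporter(BaseExporter):
    def additional_file_saving(self, unique_export_path):
        manifest = {'expected_item_count': self.params.get('count', 0)}
        manifest_path = Path(unique_export_path) / 'manifest.json'
        manifest_path.write_text(json.dumps(manifest, indent=2))
        self.run.log_message(f'Wrote manifest: {manifest_path}')
```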
synapse_sdk/utils/file/__init__.py
ADDED

```python
# File utilities module
# Maintains backward compatibility by re-exporting all functions

from .archive import archive, unarchive
from .checksum import calculate_checksum, get_checksum_from_file
from .chunking import read_file_in_chunks
from .download import (
    adownload_file,
    afiles_url_to_path,
    afiles_url_to_path_from_objs,
    download_file,
    files_url_to_path,
    files_url_to_path_from_objs,
)
from .encoding import convert_file_to_base64
from .io import get_dict_from_file, get_temp_path

__all__ = [
    # Chunking
    'read_file_in_chunks',
    # Download
    'download_file',
    'adownload_file',
    'files_url_to_path',
    'afiles_url_to_path',
    'files_url_to_path_from_objs',
    'afiles_url_to_path_from_objs',
    # Checksum
    'calculate_checksum',
    'get_checksum_from_file',
    # Archive
    'archive',
    'unarchive',
    # Encoding
    'convert_file_to_base64',
    # I/O
    'get_dict_from_file',
    'get_temp_path',
]
```
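Because the new package re-exports everything at `synapse_sdk.utils.file`, imports that previously targeted the flat `file.py` module should keep working. A minimal sketch of the equivalence, assuming the package is installed:

```python
# Both import paths resolve to the same function objects because the package
# __init__ above re-exports them.
from synapse_sdk.utils.file import calculate_checksum, download_file
from synapse_sdk.utils.file.download import download_file as download_file_direct

assert download_file is download_file_direct
```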
synapse_sdk/utils/file/archive.py
ADDED

```python
import zipfile
from pathlib import Path


def archive(input_path, output_path, append=False):
    input_path = Path(input_path)
    output_path = Path(output_path)

    mode = 'a' if append and output_path.exists() else 'w'
    with zipfile.ZipFile(output_path, mode=mode, compression=zipfile.ZIP_DEFLATED) as zipf:
        if input_path.is_file():
            zipf.write(input_path, input_path.name)
        else:
            for file_path in input_path.rglob('*'):
                if file_path.is_file():  # Only add files, skip directories
                    arcname = file_path.relative_to(input_path.parent)
                    zipf.write(file_path, arcname)


def unarchive(file_path, output_path):
    """
    Unarchives a ZIP file to a given directory.

    Parameters:
        file_path (str | Path): The path to the ZIP file.
        output_path (str): The directory where the files will be extracted.
    """
    output_path = Path(output_path)
    output_path.mkdir(parents=True, exist_ok=True)

    with zipfile.ZipFile(str(file_path), 'r') as zip_ref:
        zip_ref.extractall(output_path)
```
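A minimal usage sketch for the archive helpers, with illustrative paths; note that `archive()` stores directory entries relative to the parent of `input_path`, so the top-level folder name is preserved inside the ZIP.

```python
# Illustrative paths; the directory being archived is assumed to exist.
from pathlib import Path

from synapse_sdk.utils.file import archive, unarchive

export_dir = Path('/tmp/export_run')
zip_path = Path('/tmp/export_run.zip')

archive(export_dir, zip_path)                                  # zip the whole directory
archive(Path('/tmp/notes/readme.txt'), zip_path, append=True)  # append a single file
unarchive(zip_path, '/tmp/export_run_restored')                # extract everything
```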
synapse_sdk/utils/file/checksum.py
ADDED

````python
import hashlib
from typing import IO, Any, Callable


def calculate_checksum(file_path, prefix=''):
    md5_hash = hashlib.md5()
    with open(file_path, 'rb') as f:
        for byte_block in iter(lambda: f.read(4096), b''):
            md5_hash.update(byte_block)
    checksum = md5_hash.hexdigest()
    if prefix:
        return f'dev-{checksum}'
    return checksum


def get_checksum_from_file(file: IO[Any], digest_mod: Callable[[], Any] = hashlib.sha1) -> str:
    """
    Calculate checksum for a file-like object.

    Args:
        file (IO[Any]): File-like object with read() method that supports reading in chunks
        digest_mod (Callable[[], Any]): Hash algorithm from hashlib (defaults to hashlib.sha1)

    Returns:
        str: Hexadecimal digest of the file contents

    Example:
        ```python
        import hashlib
        from io import BytesIO
        from synapse_sdk.utils.file import get_checksum_from_file

        # With BytesIO
        data = BytesIO(b'Hello, world!')
        checksum = get_checksum_from_file(data)

        # With different hash algorithm
        checksum = get_checksum_from_file(data, digest_mod=hashlib.sha256)
        ```
    """
    digest = digest_mod()
    chunk_size = 4096

    # Reset file pointer to beginning if possible
    if hasattr(file, 'seek'):
        file.seek(0)

    while True:
        chunk = file.read(chunk_size)
        if not chunk:
            break
        if isinstance(chunk, str):
            chunk = chunk.encode('utf-8')
        digest.update(chunk)

    return digest.hexdigest()
````
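A short usage sketch for `calculate_checksum()`; as written above, any truthy `prefix` argument produces a fixed `dev-` prefix rather than echoing the value passed in, which the second call illustrates (the file path is hypothetical).

```python
from synapse_sdk.utils.file import calculate_checksum

plain = calculate_checksum('model.bin')              # 32-character MD5 hex digest
dev = calculate_checksum('model.bin', prefix='dev')  # 'dev-<md5>' regardless of the prefix value
```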
synapse_sdk/utils/file/chunking.py
ADDED

````python
def read_file_in_chunks(file_path, chunk_size=1024 * 1024 * 50):
    """
    Read a file in chunks for efficient memory usage during file processing.

    This function is particularly useful for large files or when you need to process
    files in chunks, such as for uploading or hashing.

    Args:
        file_path (str | Path): Path to the file to read
        chunk_size (int, optional): Size of each chunk in bytes. Defaults to 50MB (1024 * 1024 * 50)

    Yields:
        bytes: File content chunks

    Raises:
        FileNotFoundError: If the file doesn't exist
        PermissionError: If the file can't be read due to permissions
        OSError: If there's an OS-level error reading the file

    Example:
        ```python
        from synapse_sdk.utils.file import read_file_in_chunks

        # Read a file in 10MB chunks
        for chunk in read_file_in_chunks('large_file.bin', chunk_size=1024*1024*10):
            process_chunk(chunk)
        ```
    """
    with open(file_path, 'rb') as file:
        while chunk := file.read(chunk_size):
            yield chunk
````
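A minimal sketch of the hashing use case mentioned in the docstring, combining `read_file_in_chunks()` with `hashlib` (the file name is illustrative).

```python
import hashlib

from synapse_sdk.utils.file import read_file_in_chunks

# Hash a large file without loading it into memory all at once.
sha256 = hashlib.sha256()
for chunk in read_file_in_chunks('large_file.bin', chunk_size=1024 * 1024 * 10):
    sha256.update(chunk)
print(sha256.hexdigest())
```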
synapse_sdk/utils/file/download.py
ADDED

```python
import asyncio
import operator
from functools import reduce
from pathlib import Path

import aiohttp
import requests

from synapse_sdk.utils.network import clean_url
from synapse_sdk.utils.string import hash_text

from .io import get_temp_path


def download_file(url, path_download, name=None, coerce=None, use_cached=True):
    chunk_size = 1024 * 1024 * 50
    cleaned_url = clean_url(url)  # remove query params and fragment

    if name:
        use_cached = False
    else:
        name = hash_text(cleaned_url)

    name += Path(cleaned_url).suffix

    path = Path(path_download) / name

    if not use_cached or not path.is_file():
        response = requests.get(url, allow_redirects=True, stream=True)
        response.raise_for_status()

        with path.open('wb') as file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                file.write(chunk)

    if coerce:
        path = coerce(path)

    return path


def files_url_to_path(files, coerce=None, file_field=None):
    path_download = get_temp_path('media')
    path_download.mkdir(parents=True, exist_ok=True)
    if file_field:
        files[file_field] = download_file(files[file_field], path_download, coerce=coerce)
    else:
        for file_name in files:
            if isinstance(files[file_name], str):
                files[file_name] = download_file(files[file_name], path_download, coerce=coerce)
            else:
                files[file_name]['path'] = download_file(files[file_name].pop('url'), path_download, coerce=coerce)


def files_url_to_path_from_objs(objs, files_fields, coerce=None, is_list=False, is_async=False):
    if is_async:
        asyncio.run(afiles_url_to_path_from_objs(objs, files_fields, coerce=coerce, is_list=is_list))
    else:
        if not is_list:
            objs = [objs]

        for obj in objs:
            for files_field in files_fields:
                try:
                    files = reduce(operator.getitem, files_field.split('.'), obj)
                    if isinstance(files, str):
                        files_url_to_path(obj, coerce=coerce, file_field=files_field)
                    else:
                        files_url_to_path(files, coerce=coerce)
                except KeyError:
                    pass


async def adownload_file(url, path_download, name=None, coerce=None, use_cached=True):
    chunk_size = 1024 * 1024 * 50
    cleaned_url = clean_url(url)  # remove query params and fragment

    if name:
        use_cached = False
    else:
        name = hash_text(cleaned_url)

    name += Path(cleaned_url).suffix

    path = Path(path_download) / name

    if not use_cached or not path.is_file():
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                with path.open('wb') as file:
                    while chunk := await response.content.read(chunk_size):
                        file.write(chunk)

    if coerce:
        path = coerce(path)

    return path


async def afiles_url_to_path(files, coerce=None):
    path_download = get_temp_path('media')
    path_download.mkdir(parents=True, exist_ok=True)
    for file_name in files:
        if isinstance(files[file_name], str):
            files[file_name] = await adownload_file(files[file_name], path_download, coerce=coerce)
        else:
            files[file_name]['path'] = await adownload_file(files[file_name].pop('url'), path_download, coerce=coerce)


async def afiles_url_to_path_from_objs(objs, files_fields, coerce=None, is_list=False):
    if not is_list:
        objs = [objs]

    tasks = []

    for obj in objs:
        for files_field in files_fields:
            try:
                files = reduce(operator.getitem, files_field.split('.'), obj)
                tasks.append(afiles_url_to_path(files, coerce=coerce))
            except KeyError:
                pass

    await asyncio.gather(*tasks)
```
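A minimal usage sketch for the download helpers; URLs and paths are illustrative. `download_file()` caches by the hash of the URL with query string and fragment stripped, so two URLs that differ only in their query parameters resolve to the same local file.

```python
from pathlib import Path

from synapse_sdk.utils.file import download_file, files_url_to_path

Path('/tmp/media').mkdir(parents=True, exist_ok=True)

# The query string is stripped before hashing, so both calls map to the same
# cached file and the second call skips the download.
path = download_file('https://example.com/data/image.jpg?sig=abc', '/tmp/media')
path_again = download_file('https://example.com/data/image.jpg?sig=def', '/tmp/media')
assert path == path_again

# Replace URL values in a mapping with local paths (downloads go to /tmp/datamaker/media).
files = {'image': 'https://example.com/data/image.jpg'}
files_url_to_path(files)
print(files['image'])  # now a local Path
```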
synapse_sdk/utils/file/encoding.py
ADDED

```python
import base64
import mimetypes
from pathlib import Path


def convert_file_to_base64(file_path):
    """
    Convert a file to base64 using pathlib.

    Args:
        file_path (str): Path to the file to convert

    Returns:
        str: Base64 encoded string of the file contents
    """
    # FIXME base64 is sent sometimes.
    if file_path.startswith('data:'):
        return file_path

    # Convert string path to Path object
    path = Path(file_path)

    try:
        # Read binary content of the file
        binary_content = path.read_bytes()

        # Convert to base64
        base64_encoded = base64.b64encode(binary_content).decode('utf-8')

        # Get the MIME type of the file
        mime_type, _ = mimetypes.guess_type(path)
        assert mime_type is not None, 'MIME type cannot be guessed'

        # Convert bytes to string for readable output
        return f'data:{mime_type};base64,{base64_encoded}'

    except FileNotFoundError:
        raise FileNotFoundError(f'File not found: {file_path}')
    except Exception as e:
        raise Exception(f'Error converting file to base64: {str(e)}')
```
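A short usage sketch for `convert_file_to_base64()`; the file path is illustrative. The function returns a `data:` URI and passes strings that already start with `data:` through unchanged.

```python
from synapse_sdk.utils.file import convert_file_to_base64

data_uri = convert_file_to_base64('/tmp/thumbnail.png')
print(data_uri[:22])  # e.g. 'data:image/png;base64,'

assert convert_file_to_base64(data_uri) == data_uri  # data: URIs pass through unchanged
```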
synapse_sdk/utils/file/io.py
ADDED

```python
import json
from pathlib import Path

import yaml


def get_dict_from_file(file_path):
    if isinstance(file_path, str):
        file_path = Path(file_path)

    with open(file_path) as f:
        if file_path.suffix == '.yaml':
            return yaml.safe_load(f)
        else:
            return json.load(f)


def get_temp_path(sub_path=None):
    path = Path('/tmp/datamaker')
    if sub_path:
        path = path / sub_path
    return path
```
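A short usage sketch for the I/O helpers; file names are illustrative. `get_dict_from_file()` picks its parser from the `.yaml` suffix (anything else is parsed as JSON), and `get_temp_path()` only builds a path under `/tmp/datamaker` without creating it.

```python
from synapse_sdk.utils.file import get_dict_from_file, get_temp_path

config = get_dict_from_file('config.yaml')    # parsed with yaml.safe_load
payload = get_dict_from_file('payload.json')  # parsed with json.load

media_dir = get_temp_path('media')            # Path('/tmp/datamaker/media')
media_dir.mkdir(parents=True, exist_ok=True)  # the caller creates it when needed
```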
synapse_sdk/utils/file/video/__init__.py
ADDED

```python
# Video processing utilities

from .transcode import (
    FFmpegNotFoundError,
    TranscodeConfig,
    TranscodingFailedError,
    UnsupportedFormatError,
    VideoTranscodeError,
    atranscode_video,
    get_video_info,
    optimize_for_web,
    transcode_batch,
    transcode_video,
    validate_video_format,
)

__all__ = [
    'TranscodeConfig',
    'VideoTranscodeError',
    'UnsupportedFormatError',
    'FFmpegNotFoundError',
    'TranscodingFailedError',
    'transcode_video',
    'atranscode_video',
    'get_video_info',
    'validate_video_format',
    'optimize_for_web',
    'transcode_batch',
]
```