synapse-sdk 1.0.0b5__py3-none-any.whl → 2025.12.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (167) hide show
  1. synapse_sdk/__init__.py +24 -0
  2. synapse_sdk/cli/code_server.py +305 -33
  3. synapse_sdk/clients/agent/__init__.py +2 -1
  4. synapse_sdk/clients/agent/container.py +143 -0
  5. synapse_sdk/clients/agent/ray.py +296 -38
  6. synapse_sdk/clients/backend/annotation.py +1 -1
  7. synapse_sdk/clients/backend/core.py +31 -4
  8. synapse_sdk/clients/backend/data_collection.py +82 -7
  9. synapse_sdk/clients/backend/hitl.py +1 -1
  10. synapse_sdk/clients/backend/ml.py +1 -1
  11. synapse_sdk/clients/base.py +211 -61
  12. synapse_sdk/loggers.py +46 -0
  13. synapse_sdk/plugins/README.md +1340 -0
  14. synapse_sdk/plugins/categories/base.py +59 -9
  15. synapse_sdk/plugins/categories/export/actions/__init__.py +3 -0
  16. synapse_sdk/plugins/categories/export/actions/export/__init__.py +28 -0
  17. synapse_sdk/plugins/categories/export/actions/export/action.py +165 -0
  18. synapse_sdk/plugins/categories/export/actions/export/enums.py +113 -0
  19. synapse_sdk/plugins/categories/export/actions/export/exceptions.py +53 -0
  20. synapse_sdk/plugins/categories/export/actions/export/models.py +74 -0
  21. synapse_sdk/plugins/categories/export/actions/export/run.py +195 -0
  22. synapse_sdk/plugins/categories/export/actions/export/utils.py +187 -0
  23. synapse_sdk/plugins/categories/export/templates/config.yaml +19 -1
  24. synapse_sdk/plugins/categories/export/templates/plugin/__init__.py +390 -0
  25. synapse_sdk/plugins/categories/export/templates/plugin/export.py +153 -177
  26. synapse_sdk/plugins/categories/neural_net/actions/train.py +1130 -32
  27. synapse_sdk/plugins/categories/neural_net/actions/tune.py +157 -4
  28. synapse_sdk/plugins/categories/neural_net/templates/config.yaml +7 -4
  29. synapse_sdk/plugins/categories/pre_annotation/actions/__init__.py +4 -0
  30. synapse_sdk/plugins/categories/pre_annotation/actions/pre_annotation/__init__.py +3 -0
  31. synapse_sdk/plugins/categories/pre_annotation/actions/pre_annotation/action.py +10 -0
  32. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/__init__.py +28 -0
  33. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/action.py +148 -0
  34. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/enums.py +269 -0
  35. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/exceptions.py +14 -0
  36. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/factory.py +76 -0
  37. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/models.py +100 -0
  38. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/orchestrator.py +248 -0
  39. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/run.py +64 -0
  40. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/__init__.py +17 -0
  41. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/annotation.py +265 -0
  42. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/base.py +170 -0
  43. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/extraction.py +83 -0
  44. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/metrics.py +92 -0
  45. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/preprocessor.py +243 -0
  46. synapse_sdk/plugins/categories/pre_annotation/actions/to_task/strategies/validation.py +143 -0
  47. synapse_sdk/plugins/categories/upload/actions/upload/__init__.py +19 -0
  48. synapse_sdk/plugins/categories/upload/actions/upload/action.py +236 -0
  49. synapse_sdk/plugins/categories/upload/actions/upload/context.py +185 -0
  50. synapse_sdk/plugins/categories/upload/actions/upload/enums.py +493 -0
  51. synapse_sdk/plugins/categories/upload/actions/upload/exceptions.py +36 -0
  52. synapse_sdk/plugins/categories/upload/actions/upload/factory.py +138 -0
  53. synapse_sdk/plugins/categories/upload/actions/upload/models.py +214 -0
  54. synapse_sdk/plugins/categories/upload/actions/upload/orchestrator.py +183 -0
  55. synapse_sdk/plugins/categories/upload/actions/upload/registry.py +113 -0
  56. synapse_sdk/plugins/categories/upload/actions/upload/run.py +179 -0
  57. synapse_sdk/plugins/categories/upload/actions/upload/steps/__init__.py +1 -0
  58. synapse_sdk/plugins/categories/upload/actions/upload/steps/base.py +107 -0
  59. synapse_sdk/plugins/categories/upload/actions/upload/steps/cleanup.py +62 -0
  60. synapse_sdk/plugins/categories/upload/actions/upload/steps/collection.py +63 -0
  61. synapse_sdk/plugins/categories/upload/actions/upload/steps/generate.py +91 -0
  62. synapse_sdk/plugins/categories/upload/actions/upload/steps/initialize.py +82 -0
  63. synapse_sdk/plugins/categories/upload/actions/upload/steps/metadata.py +235 -0
  64. synapse_sdk/plugins/categories/upload/actions/upload/steps/organize.py +201 -0
  65. synapse_sdk/plugins/categories/upload/actions/upload/steps/upload.py +104 -0
  66. synapse_sdk/plugins/categories/upload/actions/upload/steps/validate.py +71 -0
  67. synapse_sdk/plugins/categories/upload/actions/upload/strategies/__init__.py +1 -0
  68. synapse_sdk/plugins/categories/upload/actions/upload/strategies/base.py +82 -0
  69. synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/__init__.py +1 -0
  70. synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/batch.py +39 -0
  71. synapse_sdk/plugins/categories/upload/actions/upload/strategies/data_unit/single.py +29 -0
  72. synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/__init__.py +1 -0
  73. synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/flat.py +300 -0
  74. synapse_sdk/plugins/categories/upload/actions/upload/strategies/file_discovery/recursive.py +287 -0
  75. synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/__init__.py +1 -0
  76. synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/excel.py +174 -0
  77. synapse_sdk/plugins/categories/upload/actions/upload/strategies/metadata/none.py +16 -0
  78. synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/__init__.py +1 -0
  79. synapse_sdk/plugins/categories/upload/actions/upload/strategies/upload/sync.py +84 -0
  80. synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/__init__.py +1 -0
  81. synapse_sdk/plugins/categories/upload/actions/upload/strategies/validation/default.py +60 -0
  82. synapse_sdk/plugins/categories/upload/actions/upload/utils.py +250 -0
  83. synapse_sdk/plugins/categories/upload/templates/README.md +470 -0
  84. synapse_sdk/plugins/categories/upload/templates/config.yaml +28 -2
  85. synapse_sdk/plugins/categories/upload/templates/plugin/__init__.py +310 -0
  86. synapse_sdk/plugins/categories/upload/templates/plugin/upload.py +82 -20
  87. synapse_sdk/plugins/models.py +111 -9
  88. synapse_sdk/plugins/templates/plugin-config-schema.json +7 -0
  89. synapse_sdk/plugins/templates/schema.json +7 -0
  90. synapse_sdk/plugins/utils/__init__.py +3 -0
  91. synapse_sdk/plugins/utils/ray_gcs.py +66 -0
  92. synapse_sdk/shared/__init__.py +25 -0
  93. synapse_sdk/utils/converters/dm/__init__.py +42 -41
  94. synapse_sdk/utils/converters/dm/base.py +137 -0
  95. synapse_sdk/utils/converters/dm/from_v1.py +208 -562
  96. synapse_sdk/utils/converters/dm/to_v1.py +258 -304
  97. synapse_sdk/utils/converters/dm/tools/__init__.py +214 -0
  98. synapse_sdk/utils/converters/dm/tools/answer.py +95 -0
  99. synapse_sdk/utils/converters/dm/tools/bounding_box.py +132 -0
  100. synapse_sdk/utils/converters/dm/tools/bounding_box_3d.py +121 -0
  101. synapse_sdk/utils/converters/dm/tools/classification.py +75 -0
  102. synapse_sdk/utils/converters/dm/tools/keypoint.py +117 -0
  103. synapse_sdk/utils/converters/dm/tools/named_entity.py +111 -0
  104. synapse_sdk/utils/converters/dm/tools/polygon.py +122 -0
  105. synapse_sdk/utils/converters/dm/tools/polyline.py +124 -0
  106. synapse_sdk/utils/converters/dm/tools/prompt.py +94 -0
  107. synapse_sdk/utils/converters/dm/tools/relation.py +86 -0
  108. synapse_sdk/utils/converters/dm/tools/segmentation.py +141 -0
  109. synapse_sdk/utils/converters/dm/tools/segmentation_3d.py +83 -0
  110. synapse_sdk/utils/converters/dm/types.py +168 -0
  111. synapse_sdk/utils/converters/dm/utils.py +162 -0
  112. synapse_sdk/utils/converters/dm_legacy/__init__.py +56 -0
  113. synapse_sdk/utils/converters/dm_legacy/from_v1.py +627 -0
  114. synapse_sdk/utils/converters/dm_legacy/to_v1.py +367 -0
  115. synapse_sdk/utils/file/__init__.py +58 -0
  116. synapse_sdk/utils/file/archive.py +32 -0
  117. synapse_sdk/utils/file/checksum.py +56 -0
  118. synapse_sdk/utils/file/chunking.py +31 -0
  119. synapse_sdk/utils/file/download.py +385 -0
  120. synapse_sdk/utils/file/encoding.py +40 -0
  121. synapse_sdk/utils/file/io.py +22 -0
  122. synapse_sdk/utils/file/upload.py +165 -0
  123. synapse_sdk/utils/file/video/__init__.py +29 -0
  124. synapse_sdk/utils/file/video/transcode.py +307 -0
  125. synapse_sdk/utils/{file.py → file.py.backup} +77 -0
  126. synapse_sdk/utils/network.py +272 -0
  127. synapse_sdk/utils/storage/__init__.py +6 -2
  128. synapse_sdk/utils/storage/providers/file_system.py +6 -0
  129. {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/METADATA +19 -2
  130. {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/RECORD +134 -74
  131. synapse_sdk/devtools/docs/.gitignore +0 -20
  132. synapse_sdk/devtools/docs/README.md +0 -41
  133. synapse_sdk/devtools/docs/blog/2019-05-28-first-blog-post.md +0 -12
  134. synapse_sdk/devtools/docs/blog/2019-05-29-long-blog-post.md +0 -44
  135. synapse_sdk/devtools/docs/blog/2021-08-01-mdx-blog-post.mdx +0 -24
  136. synapse_sdk/devtools/docs/blog/2021-08-26-welcome/docusaurus-plushie-banner.jpeg +0 -0
  137. synapse_sdk/devtools/docs/blog/2021-08-26-welcome/index.md +0 -29
  138. synapse_sdk/devtools/docs/blog/authors.yml +0 -25
  139. synapse_sdk/devtools/docs/blog/tags.yml +0 -19
  140. synapse_sdk/devtools/docs/docusaurus.config.ts +0 -138
  141. synapse_sdk/devtools/docs/package-lock.json +0 -17455
  142. synapse_sdk/devtools/docs/package.json +0 -47
  143. synapse_sdk/devtools/docs/sidebars.ts +0 -44
  144. synapse_sdk/devtools/docs/src/components/HomepageFeatures/index.tsx +0 -71
  145. synapse_sdk/devtools/docs/src/components/HomepageFeatures/styles.module.css +0 -11
  146. synapse_sdk/devtools/docs/src/css/custom.css +0 -30
  147. synapse_sdk/devtools/docs/src/pages/index.module.css +0 -23
  148. synapse_sdk/devtools/docs/src/pages/index.tsx +0 -21
  149. synapse_sdk/devtools/docs/src/pages/markdown-page.md +0 -7
  150. synapse_sdk/devtools/docs/static/.nojekyll +0 -0
  151. synapse_sdk/devtools/docs/static/img/docusaurus-social-card.jpg +0 -0
  152. synapse_sdk/devtools/docs/static/img/docusaurus.png +0 -0
  153. synapse_sdk/devtools/docs/static/img/favicon.ico +0 -0
  154. synapse_sdk/devtools/docs/static/img/logo.png +0 -0
  155. synapse_sdk/devtools/docs/static/img/undraw_docusaurus_mountain.svg +0 -171
  156. synapse_sdk/devtools/docs/static/img/undraw_docusaurus_react.svg +0 -170
  157. synapse_sdk/devtools/docs/static/img/undraw_docusaurus_tree.svg +0 -40
  158. synapse_sdk/devtools/docs/tsconfig.json +0 -8
  159. synapse_sdk/plugins/categories/export/actions/export.py +0 -346
  160. synapse_sdk/plugins/categories/export/enums.py +0 -7
  161. synapse_sdk/plugins/categories/neural_net/actions/gradio.py +0 -151
  162. synapse_sdk/plugins/categories/pre_annotation/actions/to_task.py +0 -943
  163. synapse_sdk/plugins/categories/upload/actions/upload.py +0 -954
  164. {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/WHEEL +0 -0
  165. {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/entry_points.txt +0 -0
  166. {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/licenses/LICENSE +0 -0
  167. {synapse_sdk-1.0.0b5.dist-info → synapse_sdk-2025.12.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,300 @@
1
+ from datetime import datetime
2
+ from pathlib import Path
3
+ from typing import Dict, List
4
+
5
+ from ..base import FileDiscoveryStrategy
6
+
7
+
8
+ class FlatFileDiscoveryStrategy(FileDiscoveryStrategy):
9
+ """Non-recursive file discovery strategy."""
10
+
11
def discover(self, path: Path, recursive: bool) -> List[Path]:
    """Discover files in the given path.

    The system files ``.DS_Store``, ``Thumbs.db`` and ``desktop.ini`` are
    always skipped. In recursive mode, files located below a housekeeping
    directory (``@eaDir``, ``#recycle``, ...) are skipped as well.

    Args:
        path: Directory path to search
        recursive: If True, search recursively; if False, search only in the given directory

    Returns:
        List of discovered file paths
    """
    # Exclude system files
    excluded_files = {'.DS_Store', 'Thumbs.db', 'desktop.ini'}

    if not recursive:
        # Flat mode: use glob (non-recursive)
        return [
            file_path
            for file_path in path.glob('*')
            if file_path.is_file() and file_path.name not in excluded_files
        ]

    # Recursive mode: use rglob
    # Exclude system directories
    excluded_dirs = {'@eaDir', '.@__thumb', '@Recycle', '#recycle', '.DS_Store', 'Thumbs.db', '.synology'}

    def exclude_dirs(file_path: Path) -> bool:
        """Check if the path below the search root contains excluded directories."""
        # Only components *below* ``path`` are inspected. Looking at the full
        # path would discard every file whenever the search root itself lives
        # under a directory that happens to carry an excluded name.
        return not excluded_dirs.isdisjoint(file_path.relative_to(path).parts)

    return [
        file_path
        for file_path in path.rglob('*')
        if file_path.is_file() and file_path.name not in excluded_files and not exclude_dirs(file_path)
    ]
45
+
46
+ def organize(self, files: List[Path], specs: Dict, metadata: Dict, type_dirs: Dict = None) -> List[Dict]:
47
+ """Organize files according to specifications with metadata."""
48
+ organized_files = []
49
+
50
+ # Use provided type_dirs or create fallback mapping
51
+ if type_dirs is None:
52
+ type_dirs = {}
53
+ for spec in specs:
54
+ spec_name = spec['name']
55
+ # Fallback: extract spec directory from file paths
56
+ for file_path in files:
57
+ # Check if this file's path contains the spec_name as a directory
58
+ path_parts = file_path.parts
59
+ if spec_name in path_parts:
60
+ # Find the index of spec_name and reconstruct the path up to that directory
61
+ spec_index = path_parts.index(spec_name)
62
+ spec_dir = Path(*path_parts[: spec_index + 1])
63
+ if spec_dir.exists() and spec_dir.is_dir():
64
+ type_dirs[spec_name] = spec_dir
65
+ break
66
+
67
+ if not type_dirs:
68
+ return organized_files
69
+
70
+ # Performance optimization 2: Build metadata index for faster lookups
71
+ metadata_index = self._build_metadata_index(metadata)
72
+
73
+ # Group files by dataset_key (stem-based matching) - flat discovery (no subdirectories)
74
+ # Strategy:
75
+ # 1. Group all files (required + optional) by their file stem
76
+ # 2. Only create data units for groups that have ALL required files
77
+ # 3. Optional files are automatically included if they match the stem
78
+ dataset_files = {}
79
+ required_specs = [spec['name'] for spec in specs if spec.get('is_required', False)]
80
+
81
+ for file_path in files:
82
+ # Find all matching directories for this file
83
+ matched_specs = []
84
+ for spec_name, dir_path in type_dirs.items():
85
+ # Check if file is within this directory (supports recursive)
86
+ try:
87
+ file_path.relative_to(dir_path)
88
+ matched_specs.append((spec_name, dir_path))
89
+ except ValueError:
90
+ continue
91
+
92
+ # If no matches found, skip this file
93
+ if not matched_specs:
94
+ continue
95
+
96
+ # Select the most specific (deepest) directory match
97
+ # This prevents files in subdirectories from being assigned to parent directory specs
98
+ best_match = max(matched_specs, key=lambda x: len(x[1].parts))
99
+ spec_name, dir_path = best_match
100
+
101
+ file_name = file_path.stem
102
+
103
+ if file_name not in dataset_files:
104
+ dataset_files[file_name] = {}
105
+
106
+ if spec_name not in dataset_files[file_name]:
107
+ dataset_files[file_name][spec_name] = file_path
108
+ else:
109
+ # Keep the most recent file - only stat when needed
110
+ existing_file = dataset_files[file_name][spec_name]
111
+ try:
112
+ if file_path.stat().st_mtime > existing_file.stat().st_mtime:
113
+ dataset_files[file_name][spec_name] = file_path
114
+ except (OSError, IOError):
115
+ # If stat fails, keep existing file
116
+ pass
117
+
118
+ # Create organized files ONLY for datasets with ALL required files
119
+ # Optional files are included automatically if they match the stem
120
+ for file_name, files_dict in sorted(dataset_files.items()):
121
+ # Check if all required files are present
122
+ has_all_required = all(req in files_dict for req in required_specs)
123
+
124
+ if has_all_required:
125
+ # Extract original file stem from actual file paths (more reliable)
126
+ # Collect stems from all files in the group
127
+ file_stems = {}
128
+ file_extensions = {}
129
+
130
+ for file_path in files_dict.values():
131
+ stem = file_path.stem
132
+ ext = file_path.suffix.lower()
133
+
134
+ # Count stems (to handle multiple files with slightly different names)
135
+ if stem:
136
+ file_stems[stem] = file_stems.get(stem, 0) + 1
137
+
138
+ # Count extensions
139
+ if ext:
140
+ file_extensions[ext] = file_extensions.get(ext, 0) + 1
141
+
142
+ # Use the most common stem (usually they're all the same)
143
+ original_stem = max(file_stems, key=file_stems.get) if file_stems else file_name
144
+ origin_file_extension = max(file_extensions, key=file_extensions.get) if file_extensions else ''
145
+
146
+ meta_data = {
147
+ 'origin_file_stem': original_stem,
148
+ 'origin_file_extension': origin_file_extension,
149
+ 'created_at': datetime.now().isoformat(),
150
+ }
151
+
152
+ # Add metadata if available - using optimized index lookup
153
+ if metadata_index:
154
+ matched_metadata = self._find_matching_metadata_optimized(file_name, files_dict, metadata_index)
155
+ if matched_metadata:
156
+ meta_data.update(matched_metadata)
157
+
158
+ organized_files.append({'files': files_dict, 'meta': meta_data})
159
+
160
+ return organized_files
161
+
162
+ def _build_metadata_index(self, metadata: Dict) -> Dict:
163
+ """Build metadata index for faster lookups."""
164
+ if not metadata:
165
+ return {}
166
+
167
+ metadata_index = {'exact_stem': {}, 'exact_name': {}, 'stem_lookup': {}, 'partial_paths': {}, 'full_paths': {}}
168
+
169
+ for meta_key, meta_value in metadata.items():
170
+ meta_path = Path(meta_key)
171
+
172
+ # Index by stem
173
+ stem = meta_path.stem
174
+ if stem:
175
+ metadata_index['exact_stem'][stem] = meta_value
176
+ metadata_index['stem_lookup'][stem] = meta_value
177
+
178
+ # Index by full name
179
+ name = meta_path.name
180
+ if name:
181
+ metadata_index['exact_name'][name] = meta_value
182
+
183
+ # Index for partial path matching
184
+ metadata_index['partial_paths'][meta_key] = meta_value
185
+
186
+ # Index for full path matching
187
+ metadata_index['full_paths'][meta_key] = meta_value
188
+
189
+ return metadata_index
190
+
191
+ def _find_matching_metadata_optimized(self, file_name: str, files_dict: Dict, metadata_index: Dict) -> Dict:
192
+ """Find matching metadata using optimized index lookups."""
193
+ if not metadata_index:
194
+ return {}
195
+
196
+ # Strategy 1: Exact stem match (O(1) lookup)
197
+ if file_name in metadata_index['exact_stem']:
198
+ return metadata_index['exact_stem'][file_name]
199
+
200
+ # Strategy 2: Exact filename match with extension (O(1) lookup)
201
+ sample_file = list(files_dict.values())[0] if files_dict else None
202
+ if sample_file:
203
+ full_filename = f'{file_name}{sample_file.suffix}'
204
+ if full_filename in metadata_index['exact_name']:
205
+ return metadata_index['exact_name'][full_filename]
206
+
207
+ # Try sample file name
208
+ sample_filename = sample_file.name
209
+ if sample_filename in metadata_index['exact_name']:
210
+ return metadata_index['exact_name'][sample_filename]
211
+
212
+ # Strategy 3: Stem lookup (already optimized above)
213
+ # This is covered by exact_stem lookup
214
+
215
+ # Strategy 4 & 5: Partial and full path matching (fallback to original logic for complex cases)
216
+ if sample_file:
217
+ file_path_str = str(sample_file)
218
+ file_path_posix = sample_file.as_posix()
219
+
220
+ # Check partial paths
221
+ for meta_key in metadata_index['partial_paths']:
222
+ if (
223
+ meta_key in file_path_str
224
+ or meta_key in file_path_posix
225
+ or file_path_str in meta_key
226
+ or file_path_posix in meta_key
227
+ ):
228
+ return metadata_index['partial_paths'][meta_key]
229
+
230
+ return {}
231
+
232
+ def _find_matching_metadata(self, file_name: str, files_dict: Dict, metadata: Dict) -> Dict:
233
+ """Find matching metadata using comprehensive pattern matching.
234
+
235
+ Matching priority:
236
+ 1. Exact stem match (highest priority)
237
+ 2. Exact filename match (with extension)
238
+ 3. Metadata key stem matches file stem
239
+ 4. Partial path matching
240
+ 5. Full path matching
241
+ """
242
+ if not metadata:
243
+ return {}
244
+
245
+ # Get sample file for extension and path information
246
+ sample_file = list(files_dict.values())[0] if files_dict else None
247
+
248
+ # Strategy 1: Exact stem match (highest priority)
249
+ if file_name in metadata:
250
+ return metadata[file_name]
251
+
252
+ # Strategy 2: Exact filename match (with extension)
253
+ if sample_file:
254
+ full_filename = f'{file_name}{sample_file.suffix}'
255
+ if full_filename in metadata:
256
+ return metadata[full_filename]
257
+
258
+ # Also try with sample file name
259
+ sample_filename = sample_file.name
260
+ if sample_filename in metadata:
261
+ return metadata[sample_filename]
262
+
263
+ # Strategy 3: Metadata key stem matches file stem
264
+ for meta_key in metadata.keys():
265
+ meta_stem = Path(meta_key).stem
266
+ if meta_stem == file_name:
267
+ return metadata[meta_key]
268
+
269
+ # Strategy 4: Partial path matching
270
+ if sample_file:
271
+ file_path_parts = sample_file.parts
272
+ for meta_key in metadata.keys():
273
+ meta_path = Path(meta_key)
274
+ # Check if any part of the metadata key matches our file path parts
275
+ for part in file_path_parts:
276
+ if part in str(meta_path) or str(meta_path) in part:
277
+ # Additional validation: ensure it's a reasonable match
278
+ if meta_path.stem == file_name or meta_path.name == sample_file.name or part == meta_path.stem:
279
+ return metadata[meta_key]
280
+
281
+ # Strategy 5: Full path matching
282
+ if sample_file:
283
+ full_path_str = str(sample_file)
284
+ full_path_posix = sample_file.as_posix()
285
+
286
+ for meta_key in metadata.keys():
287
+ # Direct path match
288
+ if meta_key == full_path_str or meta_key == full_path_posix:
289
+ return metadata[meta_key]
290
+
291
+ # Relative path match (check if meta_key is contained in our path)
292
+ if meta_key in full_path_str or meta_key in full_path_posix:
293
+ return metadata[meta_key]
294
+
295
+ # Reverse match (check if our path is contained in meta_key)
296
+ if full_path_str in meta_key or full_path_posix in meta_key:
297
+ return metadata[meta_key]
298
+
299
+ # No match found
300
+ return {}
@@ -0,0 +1,287 @@
1
+ from datetime import datetime
2
+ from pathlib import Path
3
+ from typing import Dict, List
4
+
5
+ from ..base import FileDiscoveryStrategy
6
+
7
+
8
+ class RecursiveFileDiscoveryStrategy(FileDiscoveryStrategy):
9
+ """Recursive file discovery strategy."""
10
+
11
def discover(self, path: Path, recursive: bool) -> List[Path]:
    """Discover files recursively in the given path.

    Files below a housekeeping directory (``@eaDir``, ``#recycle``, ...) and
    the system files ``.DS_Store``, ``Thumbs.db`` and ``desktop.ini`` are
    skipped.

    Args:
        path: Directory path to search
        recursive: Accepted for interface compatibility; this strategy always
            searches recursively.

    Returns:
        List of discovered file paths
    """
    # Exclude system files (same set the flat strategy skips)
    excluded_files = {'.DS_Store', 'Thumbs.db', 'desktop.ini'}
    # Exclude system directories
    excluded_dirs = {'@eaDir', '.@__thumb', '@Recycle', '#recycle', '.DS_Store', 'Thumbs.db', '.synology'}

    def exclude_dirs(file_path: Path) -> bool:
        """Check if the path below the search root contains excluded directories."""
        # Only components *below* ``path`` are inspected. Looking at the full
        # path would discard every file whenever the search root itself lives
        # under a directory that happens to carry an excluded name.
        return not excluded_dirs.isdisjoint(file_path.relative_to(path).parts)

    return [
        file_path
        for file_path in path.rglob('*')
        if file_path.is_file() and file_path.name not in excluded_files and not exclude_dirs(file_path)
    ]
21
+
22
+ def organize(self, files: List[Path], specs: Dict, metadata: Dict, type_dirs: Dict = None) -> List[Dict]:
23
+ """Organize files according to specifications with metadata."""
24
+ organized_files = []
25
+
26
+ # Use provided type_dirs or create fallback mapping
27
+ if type_dirs is None:
28
+ type_dirs = {}
29
+ for spec in specs:
30
+ spec_name = spec['name']
31
+ # Fallback: extract spec directory from file paths
32
+ for file_path in files:
33
+ # Check if this file's path contains the spec_name as a directory
34
+ path_parts = file_path.parts
35
+ if spec_name in path_parts:
36
+ # Find the index of spec_name and reconstruct the path up to that directory
37
+ spec_index = path_parts.index(spec_name)
38
+ spec_dir = Path(*path_parts[: spec_index + 1])
39
+ if spec_dir.exists() and spec_dir.is_dir():
40
+ type_dirs[spec_name] = spec_dir
41
+ break
42
+
43
+ if not type_dirs:
44
+ return organized_files
45
+
46
+ # Performance optimization 1: Path caching - avoid repeated string conversions
47
+ path_cache = {dir_path: str(dir_path) for dir_path in type_dirs.values()}
48
+
49
+ # Performance optimization 2: Build metadata index for faster lookups
50
+ metadata_index = self._build_metadata_index(metadata)
51
+
52
+ # Group files by dataset_key (stem-based matching)
53
+ # Strategy:
54
+ # 1. Group all files (required + optional) by their file stem
55
+ # 2. Only create data units for groups that have ALL required files
56
+ # 3. Optional files are automatically included if they match the stem
57
+ dataset_files = {}
58
+ required_specs = [spec['name'] for spec in specs if spec.get('is_required', False)]
59
+
60
+ for file_path in files:
61
+ # Find all matching directories for this file
62
+ matched_specs = []
63
+ for spec_name, dir_path in type_dirs.items():
64
+ # Check if file is under this spec's directory
65
+ # Use try/except for relative_to to ensure proper path matching
66
+ try:
67
+ relative_path = file_path.relative_to(dir_path)
68
+ matched_specs.append((spec_name, dir_path, relative_path))
69
+ except ValueError:
70
+ # File is not under this directory
71
+ continue
72
+
73
+ # If no matches found, skip this file
74
+ if not matched_specs:
75
+ continue
76
+
77
+ # Select the most specific (deepest) directory match
78
+ # This prevents files in subdirectories from being assigned to parent directory specs
79
+ best_match = max(matched_specs, key=lambda x: len(x[1].parts))
80
+ spec_name, dir_path, relative_path = best_match
81
+
82
+ # Create unique dataset key using relative path from spec directory
83
+ # Use parent directory + stem as unique key to group related files
84
+ if relative_path.parent != Path('.'):
85
+ dataset_key = f'{relative_path.parent}_{file_path.stem}'
86
+ else:
87
+ dataset_key = file_path.stem
88
+
89
+ if dataset_key not in dataset_files:
90
+ dataset_files[dataset_key] = {}
91
+
92
+ if spec_name not in dataset_files[dataset_key]:
93
+ dataset_files[dataset_key][spec_name] = file_path
94
+ else:
95
+ # Keep the most recent file - only stat when needed
96
+ existing_file = dataset_files[dataset_key][spec_name]
97
+ try:
98
+ if file_path.stat().st_mtime > existing_file.stat().st_mtime:
99
+ dataset_files[dataset_key][spec_name] = file_path
100
+ except (OSError, IOError):
101
+ # If stat fails, keep existing file
102
+ pass
103
+
104
+ # Create organized files ONLY for datasets with ALL required files
105
+ # Optional files are included automatically if they match the stem
106
+ for dataset_key, files_dict in sorted(dataset_files.items()):
107
+ # Check if all required files are present
108
+ has_all_required = all(req in files_dict for req in required_specs)
109
+
110
+ if has_all_required:
111
+ # Extract original file stem from actual file paths (more reliable than parsing dataset_key)
112
+ # Collect stems from all files in the group
113
+ file_stems = {}
114
+ file_extensions = {}
115
+
116
+ for file_path in files_dict.values():
117
+ stem = file_path.stem
118
+ ext = file_path.suffix.lower()
119
+
120
+ # Count stems (to handle multiple files with slightly different names)
121
+ if stem:
122
+ file_stems[stem] = file_stems.get(stem, 0) + 1
123
+
124
+ # Count extensions
125
+ if ext:
126
+ file_extensions[ext] = file_extensions.get(ext, 0) + 1
127
+
128
+ # Use the most common stem (usually they're all the same)
129
+ original_stem = max(file_stems, key=file_stems.get) if file_stems else dataset_key
130
+ origin_file_extension = max(file_extensions, key=file_extensions.get) if file_extensions else ''
131
+
132
+ meta_data = {
133
+ 'origin_file_stem': original_stem,
134
+ 'origin_file_extension': origin_file_extension,
135
+ 'created_at': datetime.now().isoformat(),
136
+ 'dataset_key': dataset_key, # Add dataset key for debugging
137
+ }
138
+
139
+ # Add metadata if available - using optimized index lookup
140
+ if metadata_index:
141
+ matched_metadata = self._find_matching_metadata_optimized(original_stem, files_dict, metadata_index)
142
+ if matched_metadata:
143
+ meta_data.update(matched_metadata)
144
+
145
+ organized_files.append({'files': files_dict, 'meta': meta_data})
146
+
147
+ return organized_files
148
+
149
+ def _build_metadata_index(self, metadata: Dict) -> Dict:
150
+ """Build metadata index for faster lookups."""
151
+ if not metadata:
152
+ return {}
153
+
154
+ metadata_index = {'exact_stem': {}, 'exact_name': {}, 'stem_lookup': {}, 'partial_paths': {}, 'full_paths': {}}
155
+
156
+ for meta_key, meta_value in metadata.items():
157
+ meta_path = Path(meta_key)
158
+
159
+ # Index by stem
160
+ stem = meta_path.stem
161
+ if stem:
162
+ metadata_index['exact_stem'][stem] = meta_value
163
+ metadata_index['stem_lookup'][stem] = meta_value
164
+
165
+ # Index by full name
166
+ name = meta_path.name
167
+ if name:
168
+ metadata_index['exact_name'][name] = meta_value
169
+
170
+ # Index for partial path matching
171
+ metadata_index['partial_paths'][meta_key] = meta_value
172
+
173
+ # Index for full path matching
174
+ metadata_index['full_paths'][meta_key] = meta_value
175
+
176
+ return metadata_index
177
+
178
+ def _find_matching_metadata_optimized(self, file_name: str, files_dict: Dict, metadata_index: Dict) -> Dict:
179
+ """Find matching metadata using optimized index lookups."""
180
+ if not metadata_index:
181
+ return {}
182
+
183
+ # Strategy 1: Exact stem match (O(1) lookup)
184
+ if file_name in metadata_index['exact_stem']:
185
+ return metadata_index['exact_stem'][file_name]
186
+
187
+ # Strategy 2: Exact filename match with extension (O(1) lookup)
188
+ sample_file = list(files_dict.values())[0] if files_dict else None
189
+ if sample_file:
190
+ full_filename = f'{file_name}{sample_file.suffix}'
191
+ if full_filename in metadata_index['exact_name']:
192
+ return metadata_index['exact_name'][full_filename]
193
+
194
+ # Try sample file name
195
+ sample_filename = sample_file.name
196
+ if sample_filename in metadata_index['exact_name']:
197
+ return metadata_index['exact_name'][sample_filename]
198
+
199
+ # Strategy 3: Stem lookup (already optimized above)
200
+ # This is covered by exact_stem lookup
201
+
202
+ # Strategy 4 & 5: Partial and full path matching (fallback to original logic for complex cases)
203
+ if sample_file:
204
+ file_path_str = str(sample_file)
205
+ file_path_posix = sample_file.as_posix()
206
+
207
+ # Check partial paths
208
+ for meta_key in metadata_index['partial_paths']:
209
+ if (
210
+ meta_key in file_path_str
211
+ or meta_key in file_path_posix
212
+ or file_path_str in meta_key
213
+ or file_path_posix in meta_key
214
+ ):
215
+ return metadata_index['partial_paths'][meta_key]
216
+
217
+ return {}
218
+
219
+ def _find_matching_metadata(self, file_name: str, files_dict: Dict, metadata: Dict) -> Dict:
220
+ """Find matching metadata using comprehensive pattern matching.
221
+
222
+ Matching priority:
223
+ 1. Exact stem match (highest priority)
224
+ 2. Exact filename match (with extension)
225
+ 3. Metadata key stem matches file stem
226
+ 4. Partial path matching
227
+ 5. Full path matching
228
+ """
229
+ if not metadata:
230
+ return {}
231
+
232
+ # Get sample file for extension and path information
233
+ sample_file = list(files_dict.values())[0] if files_dict else None
234
+
235
+ # Strategy 1: Exact stem match (highest priority)
236
+ if file_name in metadata:
237
+ return metadata[file_name]
238
+
239
+ # Strategy 2: Exact filename match (with extension)
240
+ if sample_file:
241
+ full_filename = f'{file_name}{sample_file.suffix}'
242
+ if full_filename in metadata:
243
+ return metadata[full_filename]
244
+
245
+ # Also try with sample file name
246
+ sample_filename = sample_file.name
247
+ if sample_filename in metadata:
248
+ return metadata[sample_filename]
249
+
250
+ # Strategy 3: Metadata key stem matches file stem
251
+ for meta_key in metadata.keys():
252
+ meta_stem = Path(meta_key).stem
253
+ if meta_stem == file_name:
254
+ return metadata[meta_key]
255
+
256
+ # Strategy 4: Partial path matching
257
+ if sample_file:
258
+ file_path_parts = sample_file.parts
259
+ for meta_key in metadata.keys():
260
+ meta_path = Path(meta_key)
261
+ # Check if any part of the metadata key matches our file path parts
262
+ for part in file_path_parts:
263
+ if part in str(meta_path) or str(meta_path) in part:
264
+ # Additional validation: ensure it's a reasonable match
265
+ if meta_path.stem == file_name or meta_path.name == sample_file.name or part == meta_path.stem:
266
+ return metadata[meta_key]
267
+
268
+ # Strategy 5: Full path matching
269
+ if sample_file:
270
+ full_path_str = str(sample_file)
271
+ full_path_posix = sample_file.as_posix()
272
+
273
+ for meta_key in metadata.keys():
274
+ # Direct path match
275
+ if meta_key == full_path_str or meta_key == full_path_posix:
276
+ return metadata[meta_key]
277
+
278
+ # Relative path match (check if meta_key is contained in our path)
279
+ if meta_key in full_path_str or meta_key in full_path_posix:
280
+ return metadata[meta_key]
281
+
282
+ # Reverse match (check if our path is contained in meta_key)
283
+ if full_path_str in meta_key or full_path_posix in meta_key:
284
+ return metadata[meta_key]
285
+
286
+ # No match found
287
+ return {}
@@ -0,0 +1 @@
1
+ # Metadata strategy implementations