nv-ingest-client 2025.7.24.dev20250724__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-client has been flagged as potentially problematic; review the release details and the file-level diff below before installing.

Files changed (38)
  1. nv_ingest_client/cli/util/click.py +182 -30
  2. nv_ingest_client/cli/util/processing.py +0 -393
  3. nv_ingest_client/client/client.py +561 -207
  4. nv_ingest_client/client/ingest_job_handler.py +412 -0
  5. nv_ingest_client/client/interface.py +466 -59
  6. nv_ingest_client/client/util/processing.py +11 -1
  7. nv_ingest_client/nv_ingest_cli.py +58 -6
  8. nv_ingest_client/primitives/jobs/job_spec.py +32 -10
  9. nv_ingest_client/primitives/tasks/__init__.py +6 -4
  10. nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
  11. nv_ingest_client/primitives/tasks/caption.py +10 -16
  12. nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
  13. nv_ingest_client/primitives/tasks/dedup.py +12 -21
  14. nv_ingest_client/primitives/tasks/embed.py +37 -76
  15. nv_ingest_client/primitives/tasks/extract.py +68 -169
  16. nv_ingest_client/primitives/tasks/filter.py +22 -28
  17. nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
  18. nv_ingest_client/primitives/tasks/split.py +17 -18
  19. nv_ingest_client/primitives/tasks/store.py +29 -29
  20. nv_ingest_client/primitives/tasks/task_base.py +1 -72
  21. nv_ingest_client/primitives/tasks/task_factory.py +10 -11
  22. nv_ingest_client/primitives/tasks/udf.py +349 -0
  23. nv_ingest_client/util/dataset.py +8 -2
  24. nv_ingest_client/util/document_analysis.py +314 -0
  25. nv_ingest_client/util/image_disk_utils.py +300 -0
  26. nv_ingest_client/util/transport.py +12 -6
  27. nv_ingest_client/util/util.py +66 -0
  28. nv_ingest_client/util/vdb/milvus.py +220 -75
  29. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/METADATA +1 -3
  30. nv_ingest_client-2025.11.2.dev20251102.dist-info/RECORD +55 -0
  31. nv_ingest_client/cli/util/tasks.py +0 -3
  32. nv_ingest_client/primitives/exceptions.py +0 -0
  33. nv_ingest_client/primitives/tasks/transform.py +0 -0
  34. nv_ingest_client-2025.7.24.dev20250724.dist-info/RECORD +0 -54
  35. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
  36. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/entry_points.txt +0 -0
  37. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
  38. {nv_ingest_client-2025.7.24.dev20250724.dist-info → nv_ingest_client-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
@@ -10,8 +10,8 @@ import logging
10
10
  from typing import Dict
11
11
  from typing import Literal
12
12
 
13
- from pydantic import BaseModel
14
- from pydantic import model_validator
13
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreSchema
14
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskStoreEmbedSchema
15
15
 
16
16
  from .task_base import Task
17
17
 
@@ -20,28 +20,6 @@ logger = logging.getLogger(__name__)
20
20
  _DEFAULT_STORE_METHOD = "minio"
21
21
 
22
22
 
23
- class StoreEmbedTaskSchema(BaseModel):
24
-
25
- class Config:
26
- extra = "allow"
27
-
28
-
29
- class StoreTaskSchema(BaseModel):
30
- store_method: str = None
31
-
32
- @model_validator(mode="before")
33
- @classmethod
34
- def set_default_store_method(cls, values):
35
- store_method = values.get("store_method")
36
-
37
- if store_method is None:
38
- values["store_method"] = _DEFAULT_STORE_METHOD
39
- return values
40
-
41
- class Config:
42
- extra = "allow"
43
-
44
-
45
23
  class StoreTask(Task):
46
24
  """
47
25
  Object for image storage task.
@@ -64,10 +42,22 @@ class StoreTask(Task):
64
42
  """
65
43
  super().__init__()
66
44
 
67
- self._structured = structured
68
- self._images = images
69
- self._store_method = store_method or "minio"
70
- self._params = params
45
+ # Handle None params by converting to empty dict for backward compatibility
46
+ if params is None:
47
+ params = {}
48
+
49
+ # Merge extra_params into params for API schema compatibility
50
+ merged_params = {**params, **extra_params}
51
+
52
+ # Use the API schema for validation
53
+ validated_data = IngestTaskStoreSchema(
54
+ structured=structured, images=images, method=store_method or _DEFAULT_STORE_METHOD, params=merged_params
55
+ )
56
+
57
+ self._structured = validated_data.structured
58
+ self._images = validated_data.images
59
+ self._store_method = validated_data.method
60
+ self._params = validated_data.params
71
61
  self._extra_params = extra_params
72
62
 
73
63
  def __str__(self) -> str:
@@ -116,7 +106,17 @@ class StoreEmbedTask(Task):
116
106
  """
117
107
  super().__init__()
118
108
 
119
- self._params = params or {}
109
+ # Handle None params by converting to empty dict for backward compatibility
110
+ if params is None:
111
+ params = {}
112
+
113
+ # Merge extra_params into params for API schema compatibility
114
+ merged_params = {**params, **extra_params}
115
+
116
+ # Use the API schema for validation
117
+ validated_data = IngestTaskStoreEmbedSchema(params=merged_params)
118
+
119
+ self._params = validated_data.params
120
120
  self._extra_params = extra_params
121
121
 
122
122
  def __str__(self) -> str:
@@ -27,6 +27,7 @@ class TaskType(Enum):
27
27
  STORE_EMBEDDING = auto()
28
28
  TABLE_DATA_EXTRACT = auto()
29
29
  TRANSFORM = auto()
30
+ UDF = auto()
30
31
  VDB_UPLOAD = auto()
31
32
 
32
33
 
@@ -71,75 +72,3 @@ class Task:
71
72
  tasks that are then submitted to the redis client
72
73
  """
73
74
  return {}
74
-
75
-
76
- # class ExtractUnstructuredTask(ExtractTask):
77
- # """
78
- # Object for document unstructured extraction task
79
- # extract_method = ["unstructured_local", "unstructured_service"]
80
- # """
81
- #
82
- # def __init__(
83
- # self,
84
- # extract_method: ExtractTask._Type_Extract_Method,
85
- # document_type: ExtractTask._TypeDocumentType,
86
- # api_key: str,
87
- # uri: str,
88
- # ) -> None:
89
- # """
90
- # Setup Extract Task Config
91
- # """
92
- # super().__init__(extract_method, document_type)
93
- # self._api_key = api_key
94
- # self._uri = uri
95
- #
96
- # def __str__(self) -> str:
97
- # """
98
- # Returns a string with the object's config and run time state
99
- # """
100
- # info = ""
101
- # info += super().__str__()
102
- # info += f"unstructured uri: {self._uri}\n"
103
- # return info
104
- #
105
- # def to_dict(self) -> Dict:
106
- # """
107
- # Convert to a dict for submission to redis (fixme)
108
- # """
109
- # unstructured_properties = {
110
- # "api_key": self._api_key,
111
- # "unstructured_url": self._uri,
112
- # }
113
- # task_desc = super().to_dict()
114
- # task_desc["task_properties"]["params"].update(unstructured_properties)
115
- # return task_desc
116
-
117
-
118
- # class ExtractLlamaParseTask(ExtractTask):
119
- # """
120
- # Object for document llama extraction task
121
- # extract_method = ["llama_parse"]
122
- # """
123
- #
124
- # def __init__(
125
- # self,
126
- # extract_method: ExtractTask._Type_Extract_Method,
127
- # document_type: ExtractTask._TypeDocumentType,
128
- # api_key: str,
129
- # ) -> None:
130
- # """
131
- # Setup Extract Task Config
132
- # """
133
- # super().__init__(extract_method, document_type)
134
- # self._api_key = api_key
135
- #
136
- # def to_dict(self) -> Dict:
137
- # """
138
- # Convert to a dict for submission to redis (fixme)
139
- # """
140
- # llama_parse_properties = {
141
- # "api_key": self._api_key,
142
- # }
143
- # task_desc = super().to_dict()
144
- # task_desc["task_properties"]["params"].update(llama_parse_properties)
145
- # return task_desc
@@ -8,17 +8,15 @@ from typing import Dict
8
8
  from typing import Type
9
9
  from typing import Union
10
10
 
11
- from .caption import CaptionTask
12
- from .dedup import DedupTask
13
- from .embed import EmbedTask
14
- from .extract import ExtractTask
15
- from .filter import FilterTask
16
- from .split import SplitTask
17
- from .store import StoreEmbedTask
18
- from .store import StoreTask
19
- from .task_base import Task
20
- from .task_base import TaskType
21
- from .task_base import is_valid_task_type
11
+ from nv_ingest_client.primitives.tasks.task_base import Task, TaskType, is_valid_task_type
12
+ from nv_ingest_client.primitives.tasks.caption import CaptionTask
13
+ from nv_ingest_client.primitives.tasks.dedup import DedupTask
14
+ from nv_ingest_client.primitives.tasks.embed import EmbedTask
15
+ from nv_ingest_client.primitives.tasks.extract import ExtractTask
16
+ from nv_ingest_client.primitives.tasks.filter import FilterTask
17
+ from nv_ingest_client.primitives.tasks.split import SplitTask
18
+ from nv_ingest_client.primitives.tasks.store import StoreEmbedTask, StoreTask
19
+ from nv_ingest_client.primitives.tasks.udf import UDFTask
22
20
 
23
21
 
24
22
  class TaskUnimplemented(Task):
@@ -42,6 +40,7 @@ _TASK_MAP: Dict[TaskType, Callable] = {
42
40
  TaskType.STORE_EMBEDDING: StoreEmbedTask,
43
41
  TaskType.STORE: StoreTask,
44
42
  TaskType.TRANSFORM: TaskUnimplemented,
43
+ TaskType.UDF: UDFTask,
45
44
  }
46
45
 
47
46
 
@@ -0,0 +1,349 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ # pylint: disable=too-few-public-methods
7
+ # pylint: disable=too-many-arguments
8
+
9
+ import importlib.util
10
+ import logging
11
+ import importlib
12
+ import inspect
13
+ import ast
14
+ import re
15
+ from typing import Dict, Optional, Union
16
+
17
+ from nv_ingest_api.internal.enums.common import PipelinePhase
18
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskUDFSchema
19
+ from nv_ingest_client.primitives.tasks.task_base import Task
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ def _load_function_from_import_path(import_path: str):
25
+ """Load a function from an import path like 'module.submodule.function'."""
26
+ try:
27
+ parts = import_path.split(".")
28
+ module_path = ".".join(parts[:-1])
29
+ function_name = parts[-1]
30
+
31
+ module = importlib.import_module(module_path)
32
+ func = getattr(module, function_name)
33
+
34
+ if not callable(func):
35
+ raise ValueError(f"'{function_name}' is not callable in module '{module_path}'")
36
+
37
+ return func
38
+ except ImportError as e:
39
+ raise ValueError(f"Failed to import module from '{import_path}': {e}")
40
+ except AttributeError as e:
41
+ raise ValueError(f"Function '{function_name}' not found in module '{module_path}': {e}")
42
+
43
+
44
+ def _load_function_from_file_path(file_path: str, function_name: str):
45
+ """Load a function from a file path."""
46
+ try:
47
+
48
+ # Create a module spec from the file
49
+ spec = importlib.util.spec_from_file_location("udf_module", file_path)
50
+ if spec is None:
51
+ raise ValueError(f"Could not create module spec from file: {file_path}")
52
+
53
+ module = importlib.util.module_from_spec(spec)
54
+
55
+ # Execute the module to load its contents
56
+ spec.loader.exec_module(module)
57
+
58
+ # Get the function
59
+ func = getattr(module, function_name)
60
+
61
+ if not callable(func):
62
+ raise ValueError(f"'{function_name}' is not callable in file '{file_path}'")
63
+
64
+ return func
65
+ except Exception as e:
66
+ raise ValueError(f"Failed to load function '{function_name}' from file '{file_path}': {e}")
67
+
68
+
69
+ def _extract_function_with_context(file_path: str, function_name: str) -> str:
70
+ """
71
+ Extract a function from a file while preserving the full module context.
72
+
73
+ This includes all imports, module-level variables, and other functions
74
+ that the target function might depend on.
75
+
76
+ Parameters
77
+ ----------
78
+ file_path : str
79
+ Path to the Python file containing the function
80
+ function_name : str
81
+ Name of the function to extract
82
+
83
+ Returns
84
+ -------
85
+ str
86
+ Complete module source code with the target function
87
+ """
88
+ try:
89
+ with open(file_path, "r", encoding="utf-8") as f:
90
+ module_source = f.read()
91
+
92
+ # Parse the module to verify the function exists
93
+ try:
94
+ tree = ast.parse(module_source)
95
+ function_found = False
96
+
97
+ for node in ast.walk(tree):
98
+ if isinstance(node, ast.FunctionDef) and node.name == function_name:
99
+ function_found = True
100
+ break
101
+
102
+ if not function_found:
103
+ raise ValueError(f"Function '{function_name}' not found in file '{file_path}'")
104
+
105
+ except SyntaxError as e:
106
+ raise ValueError(f"Syntax error in file '{file_path}': {e}")
107
+
108
+ return module_source
109
+
110
+ except FileNotFoundError:
111
+ raise ValueError(f"File not found: {file_path}")
112
+ except Exception as e:
113
+ raise ValueError(f"Failed to read file '{file_path}': {e}")
114
+
115
+
116
+ def _resolve_udf_function(udf_function_spec: str) -> str:
117
+ """
118
+ Resolve UDF function specification to function string.
119
+
120
+ Supports four formats:
121
+ 1. Inline function string: 'def my_func(control_message): ...'
122
+ 2. Module path with colon: 'my_module.my_submodule:my_function' (preserves imports)
123
+ 3. File path: '/path/to/file.py:my_function'
124
+ 4. Legacy import path: 'my_module.my_function' (function name only, no imports)
125
+ """
126
+ # Default to treating as inline unless it clearly matches a
127
+ # module/file specification. This avoids misclassifying inline code that
128
+ # contains colons, imports, or annotations before the def line.
129
+
130
+ spec = udf_function_spec.strip()
131
+
132
+ # 1) File path with function: /path/to/file.py:function_name
133
+ if ".py:" in spec:
134
+ file_path, function_name = spec.split(":", 1)
135
+ return _extract_function_with_context(file_path, function_name)
136
+
137
+ # 2) File path without function name is an explicit error
138
+ if spec.endswith(".py"):
139
+ raise ValueError(
140
+ f"File path '{udf_function_spec}' is missing function name. Use format 'file.py:function_name'."
141
+ )
142
+
143
+ # 3) Module path with colon: my.module:function
144
+ # Be strict: only letters, numbers, underscore, and dots on the left; valid identifier on the right;
145
+ # no whitespace/newlines.
146
+ module_colon_pattern = re.compile(r"^[A-Za-z_][\w\.]*:[A-Za-z_][\w]*$")
147
+ if module_colon_pattern.match(spec):
148
+ module_path, function_name = spec.split(":", 1)
149
+ try:
150
+ module = importlib.import_module(module_path)
151
+ module_file = inspect.getfile(module)
152
+ return _extract_function_with_context(module_file, function_name)
153
+ except ImportError as e:
154
+ raise ValueError(f"Failed to import module '{module_path}': {e}")
155
+ except Exception as e:
156
+ raise ValueError(f"Failed to resolve module path '{module_path}': {e}")
157
+
158
+ # 4) Legacy import path: my.module.function (no colon)
159
+ legacy_import_pattern = re.compile(r"^[A-Za-z_][\w\.]*\.[A-Za-z_][\w]*$")
160
+ if legacy_import_pattern.match(spec):
161
+ func = _load_function_from_import_path(spec)
162
+ try:
163
+ source = inspect.getsource(func)
164
+ return source
165
+ except (OSError, TypeError) as e:
166
+ raise ValueError(f"Could not get source code for function from '{udf_function_spec}': {e}")
167
+
168
+ # 5) Default: treat as inline UDF source (entire string)
169
+ return udf_function_spec
170
+
171
+
172
+ class UDFTask(Task):
173
+ """
174
+ User-Defined Function (UDF) task for custom processing logic.
175
+
176
+ This task allows users to provide custom Python functions that will be executed
177
+ during the ingestion pipeline. The UDF function must accept a control_message
178
+ parameter and return an IngestControlMessage.
179
+
180
+ Supports four UDF function specification formats:
181
+ 1. Inline function string: 'def my_func(control_message): ...'
182
+ 2. Module path with colon: 'my_module.my_submodule:my_function' (preserves imports)
183
+ 3. File path: '/path/to/file.py:my_function'
184
+ 4. Legacy import path: 'my_module.my_function' (function name only, no imports)
185
+ """
186
+
187
+ def __init__(
188
+ self,
189
+ udf_function: Optional[str] = None,
190
+ udf_function_name: Optional[str] = None,
191
+ phase: Union[PipelinePhase, int, str, None] = PipelinePhase.RESPONSE,
192
+ target_stage: Optional[str] = None,
193
+ run_before: bool = False,
194
+ run_after: bool = False,
195
+ ) -> None:
196
+ super().__init__()
197
+ self._udf_function = udf_function
198
+ self._udf_function_name = udf_function_name
199
+ self._target_stage = target_stage
200
+ self._run_before = run_before
201
+ self._run_after = run_after
202
+
203
+ # Convert phase to the appropriate format for API schema
204
+ # If target_stage is provided and phase is None, don't convert phase
205
+ if target_stage is not None and phase is None:
206
+ converted_phase = None
207
+ self._phase = None # Set to None when using target_stage
208
+ else:
209
+ converted_phase = self._convert_phase(phase)
210
+ self._phase = PipelinePhase(converted_phase) # Convert back to enum for internal use
211
+
212
+ # Use the API schema for validation
213
+ _ = IngestTaskUDFSchema(
214
+ udf_function=udf_function or "",
215
+ udf_function_name=udf_function_name or "",
216
+ phase=converted_phase,
217
+ target_stage=target_stage,
218
+ run_before=run_before,
219
+ run_after=run_after,
220
+ )
221
+ self._resolved_udf_function = None
222
+
223
+ def _convert_phase(self, phase: Union[PipelinePhase, int, str]) -> int:
224
+ """Convert phase to integer for API schema validation."""
225
+ if isinstance(phase, PipelinePhase):
226
+ return phase.value
227
+
228
+ if isinstance(phase, int):
229
+ try:
230
+ PipelinePhase(phase) # Validate it's a valid phase number
231
+ return phase
232
+ except ValueError:
233
+ valid_values = [p.value for p in PipelinePhase]
234
+ raise ValueError(f"Invalid phase number {phase}. Valid values are: {valid_values}")
235
+
236
+ if isinstance(phase, str):
237
+ # Convert string to uppercase and try to match enum name
238
+ phase_name = phase.upper().strip()
239
+
240
+ # Handle common aliases and variations
241
+ phase_aliases = {
242
+ "EXTRACT": "EXTRACTION",
243
+ "PREPROCESS": "PRE_PROCESSING",
244
+ "PRE_PROCESS": "PRE_PROCESSING",
245
+ "PREPROCESSING": "PRE_PROCESSING",
246
+ "POSTPROCESS": "POST_PROCESSING",
247
+ "POST_PROCESS": "POST_PROCESSING",
248
+ "POSTPROCESSING": "POST_PROCESSING",
249
+ "MUTATE": "MUTATION",
250
+ }
251
+
252
+ # Apply alias if exists
253
+ if phase_name in phase_aliases:
254
+ phase_name = phase_aliases[phase_name]
255
+
256
+ try:
257
+ return PipelinePhase[phase_name].value
258
+ except KeyError:
259
+ valid_names = [p.name for p in PipelinePhase]
260
+ valid_aliases = list(phase_aliases.keys())
261
+ raise ValueError(
262
+ f"Invalid phase name '{phase}'. Valid phase names are: {valid_names}. "
263
+ f"Also supported aliases: {valid_aliases}"
264
+ )
265
+
266
+ raise ValueError(f"Phase must be a PipelinePhase enum, integer, or string, got {type(phase)}")
267
+
268
+ @property
269
+ def udf_function(self) -> Optional[str]:
270
+ """
271
+ Returns the UDF function string or specification.
272
+ """
273
+ return self._udf_function
274
+
275
+ @property
276
+ def udf_function_name(self) -> Optional[str]:
277
+ """
278
+ Returns the UDF function name.
279
+ """
280
+ return self._udf_function_name
281
+
282
+ @property
283
+ def phase(self) -> PipelinePhase:
284
+ """
285
+ Returns the pipeline phase for this UDF task.
286
+ """
287
+ return self._phase
288
+
289
+ def __str__(self) -> str:
290
+ """
291
+ Returns a string with the object's config and run time state
292
+ """
293
+ info = ""
294
+ info += "User-Defined Function (UDF) Task:\n"
295
+
296
+ if self._udf_function:
297
+ # Show first 100 characters of the function for brevity
298
+ function_preview = self._udf_function[:100]
299
+ if len(self._udf_function) > 100:
300
+ function_preview += "..."
301
+ info += f" udf_function: {function_preview}\n"
302
+ else:
303
+ info += " udf_function: None\n"
304
+
305
+ # Display phase information
306
+ if isinstance(self._phase, PipelinePhase):
307
+ info += f" phase: {self._phase.name} ({self._phase.value})\n"
308
+ else:
309
+ info += f" phase: {self._phase}\n"
310
+
311
+ return info
312
+
313
+ def to_dict(self) -> Dict:
314
+ """
315
+ Convert to a dict for submission to redis
316
+ """
317
+ task_properties = {}
318
+
319
+ if self._udf_function:
320
+ # Resolve the UDF function specification to function string
321
+ resolved_function = self._resolve_udf_function()
322
+ task_properties["udf_function"] = resolved_function
323
+
324
+ if self._udf_function_name:
325
+ task_properties["udf_function_name"] = self._udf_function_name
326
+
327
+ # Convert phase to integer value for serialization
328
+ if isinstance(self._phase, PipelinePhase):
329
+ task_properties["phase"] = self._phase.value
330
+ else:
331
+ task_properties["phase"] = self._phase
332
+
333
+ # Add new stage targeting parameters
334
+ if self._target_stage:
335
+ task_properties["target_stage"] = self._target_stage
336
+
337
+ task_properties["run_before"] = self._run_before
338
+ task_properties["run_after"] = self._run_after
339
+
340
+ return {
341
+ "type": "udf",
342
+ "task_properties": task_properties,
343
+ }
344
+
345
+ def _resolve_udf_function(self):
346
+ """Resolve UDF function specification to function string."""
347
+ if self._resolved_udf_function is None and self._udf_function:
348
+ self._resolved_udf_function = _resolve_udf_function(self._udf_function)
349
+ return self._resolved_udf_function
@@ -94,8 +94,14 @@ def get_dataset_files(dataset_bytes: BytesIO, shuffle: bool = False) -> list:
94
94
  dataset_bytes.seek(0)
95
95
  dataset = json.load(dataset_bytes)
96
96
  sampled_files = dataset.get("sampled_files", [])
97
- if shuffle:
98
- random.shuffle(sampled_files)
97
+ if shuffle and len(sampled_files) > 1:
98
+ original = list(sampled_files)
99
+ # Create a shuffled copy without mutating the original list
100
+ shuffled = random.sample(sampled_files, k=len(sampled_files))
101
+ # Guard against seeded RNG or accidental identity by forcing a different order
102
+ if shuffled == original:
103
+ shuffled = shuffled[1:] + shuffled[:1]
104
+ return shuffled
99
105
  return sampled_files
100
106
  except json.JSONDecodeError as err:
101
107
  raise ValueError(f"{err}")