nextmv 0.33.0.dev0__py3-none-any.whl → 0.34.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nextmv/local/executor.py CHANGED
@@ -16,6 +16,10 @@ process_run_input
16
16
  Function to process the run input based on the format.
17
17
  process_run_output
18
18
  Function to process the run output and handle results.
19
+ resolve_output_format
20
+ Function to determine the output format from manifest or directory structure.
21
+ process_run_information
22
+ Function to update run metadata including duration and status.
19
23
  process_run_logs
20
24
  Function to process and save run logs.
21
25
  process_run_statistics
@@ -26,8 +30,13 @@ process_run_solutions
26
30
  Function to process and save run solutions.
27
31
  process_run_visuals
28
32
  Function to process and save run visuals.
33
+ resolve_stdout
34
+ Function to parse subprocess stdout output.
35
+ ignore_patterns
36
+ Function to filter files and directories during source code copying.
29
37
  """
30
38
 
39
+ import hashlib
31
40
  import json
32
41
  import os
33
42
  import shutil
@@ -84,25 +93,26 @@ def execute_run(
84
93
  input_data: Optional[Union[dict[str, Any], str]] = None,
85
94
  ) -> None:
86
95
  """
87
- This function actually executes the decision model run, using a
88
- subprocess to call the entrypoint script with the appropriate input and
89
- options.
96
+ Executes the decision model run using a subprocess to call the entrypoint
97
+ script with the appropriate input and options.
90
98
 
91
99
  Parameters
92
100
  ----------
101
+ run_id : str
102
+ The unique identifier for the run.
93
103
  src : str
94
104
  The path to the application source code.
95
- manifest_entrypoint : str
96
- The entrypoint script as defined in the application manifest.
105
+ manifest_dict : dict[str, Any]
106
+ The manifest dictionary containing application configuration.
97
107
  run_dir : str
98
- The path to the run directory.
108
+ The path to the run directory where outputs will be stored.
99
109
  run_config : dict[str, Any]
100
- The run configuration.
110
+ The run configuration containing format and other settings.
101
111
  inputs_dir_path : Optional[str], optional
102
112
  The path to the directory containing input files, by default None. If
103
113
  provided, this parameter takes precedence over `input_data`.
104
114
  options : Optional[dict[str, Any]], optional
105
- Additional options for the run, by default None.
115
+ Additional command-line options for the run, by default None.
106
116
  input_data : Optional[Union[dict[str, Any], str]], optional
107
117
  The input data for the run, by default None. If `inputs_dir_path` is
108
118
  provided, this parameter is ignored.
@@ -119,7 +129,7 @@ def execute_run(
119
129
  # place to work from, and be cleaned up afterwards.
120
130
  with tempfile.TemporaryDirectory() as temp_dir:
121
131
  temp_src = os.path.join(temp_dir, "src")
122
- shutil.copytree(src, temp_src, ignore=shutil.ignore_patterns(NEXTMV_DIR))
132
+ shutil.copytree(src, temp_src, ignore=ignore_patterns)
123
133
 
124
134
  manifest = Manifest.from_dict(manifest_dict)
125
135
 
@@ -162,6 +172,7 @@ def execute_run(
162
172
  temp_src=temp_src,
163
173
  result=result,
164
174
  run_dir=run_dir,
175
+ src=src,
165
176
  )
166
177
 
167
178
  except Exception as e:
@@ -290,29 +301,30 @@ def process_run_output(
290
301
  temp_src: str,
291
302
  result: subprocess.CompletedProcess[str],
292
303
  run_dir: str,
304
+ src: str,
293
305
  ) -> None:
294
306
  """
295
307
  Processes the result of the subprocess run. This function is in charge of
296
308
  handling the run results, including solutions, statistics, logs, assets,
297
- etc.
309
+ and visuals.
298
310
 
299
311
  Parameters
300
312
  ----------
301
313
  manifest : Manifest
302
- The application manifest.
314
+ The application manifest containing configuration details.
315
+ run_id : str
316
+ The unique identifier for the run.
303
317
  temp_src : str
304
318
  The path to the temporary source directory.
305
319
  result : subprocess.CompletedProcess[str]
306
- The result of the subprocess run.
320
+ The result of the subprocess run containing stdout, stderr, and return code.
307
321
  run_dir : str
308
- The path to the run directory.
322
+ The path to the run directory where outputs will be stored.
323
+ src : str
324
+ The path to the application source code.
309
325
  """
310
326
 
311
- # Parse stdout as JSON, if possible.
312
- stdout_output = {}
313
- raw_output = result.stdout
314
- if raw_output.strip() != "":
315
- stdout_output = json.loads(raw_output)
327
+ stdout_output = resolve_stdout(result)
316
328
 
317
329
  # Create outputs directory.
318
330
  outputs_dir = os.path.join(run_dir, OUTPUTS_KEY)
@@ -324,7 +336,6 @@ def process_run_output(
324
336
  temp_run_outputs_dir=temp_run_outputs_dir,
325
337
  temp_src=temp_src,
326
338
  )
327
-
328
339
  process_run_information(
329
340
  run_id=run_id,
330
341
  run_dir=run_dir,
@@ -342,6 +353,7 @@ def process_run_output(
342
353
  stdout_output=stdout_output,
343
354
  temp_src=temp_src,
344
355
  manifest=manifest,
356
+ src=src,
345
357
  )
346
358
  process_run_assets(
347
359
  temp_run_outputs_dir=temp_run_outputs_dir,
@@ -349,6 +361,7 @@ def process_run_output(
349
361
  stdout_output=stdout_output,
350
362
  temp_src=temp_src,
351
363
  manifest=manifest,
364
+ src=src,
352
365
  )
353
366
  process_run_solutions(
354
367
  run_id=run_id,
@@ -359,6 +372,7 @@ def process_run_output(
359
372
  stdout_output=stdout_output,
360
373
  output_format=output_format,
361
374
  manifest=manifest,
375
+ src=src,
362
376
  )
363
377
  process_run_visuals(
364
378
  run_dir=run_dir,
@@ -381,11 +395,16 @@ def resolve_output_format(
381
395
  Parameters
382
396
  ----------
383
397
  manifest : Manifest
384
- The application manifest.
398
+ The application manifest containing configuration details.
385
399
  temp_run_outputs_dir : str
386
400
  The path to the temporary outputs directory.
387
401
  temp_src : str
388
402
  The path to the temporary source directory.
403
+
404
+ Returns
405
+ -------
406
+ OutputFormat
407
+ The determined output format (JSON, CSV_ARCHIVE, or MULTI_FILE).
389
408
  """
390
409
 
391
410
  if manifest.configuration is not None and manifest.configuration.content is not None:
@@ -433,7 +452,8 @@ def process_run_information(run_id: str, run_dir: str, result: subprocess.Comple
433
452
  error = ""
434
453
  if result.returncode != 0:
435
454
  status = StatusV2.failed.value
436
- error = result.stderr if result.stderr else "unknown error"
455
+ # Truncate error message so that Cloud does not complain.
456
+ error = (result.stderr.strip().replace("\n", " ") if result.stderr else "unknown error")[:60]
437
457
 
438
458
  # Update the run info file.
439
459
  info["metadata"]["duration"] = duration
@@ -448,29 +468,34 @@ def process_run_logs(
448
468
  output_format: OutputFormat,
449
469
  run_dir: str,
450
470
  result: subprocess.CompletedProcess[str],
451
- stdout_output: dict[str, Any],
471
+ stdout_output: Union[str, dict[str, Any]],
452
472
  ) -> None:
453
473
  """
454
474
  Processes the logs of the run. Writes the logs to a logs directory.
475
+ For multi-file format, stdout is written to logs if present.
455
476
 
456
477
  Parameters
457
478
  ----------
458
479
  output_format : OutputFormat
459
- The output format of the run.
480
+ The output format of the run (JSON, CSV_ARCHIVE, or MULTI_FILE).
460
481
  run_dir : str
461
- The path to the run directory.
482
+ The path to the run directory where logs will be stored.
462
483
  result : subprocess.CompletedProcess[str]
463
- The result of the subprocess run.
464
- stdout_output : dict[str, Any]
465
- The stdout output of the run, parsed as a dictionary.
484
+ The result of the subprocess run containing stderr output.
485
+ stdout_output : Union[str, dict[str, Any]]
486
+ The stdout output of the run, either as raw string or parsed dictionary.
466
487
  """
467
488
 
468
489
  logs_dir = os.path.join(run_dir, LOGS_KEY)
469
490
  os.makedirs(logs_dir, exist_ok=True)
470
491
  std_err = result.stderr
471
492
  with open(os.path.join(logs_dir, LOGS_FILE), "w") as f:
472
- if output_format == OutputFormat.MULTI_FILE and stdout_output != {}:
473
- f.write(json.dumps(stdout_output))
493
+ if output_format == OutputFormat.MULTI_FILE and bool(stdout_output):
494
+ if isinstance(stdout_output, dict):
495
+ f.write(json.dumps(stdout_output))
496
+ elif isinstance(stdout_output, str):
497
+ f.write(stdout_output)
498
+
474
499
  if std_err:
475
500
  f.write("\n")
476
501
 
@@ -480,14 +505,15 @@ def process_run_logs(
480
505
  def process_run_statistics(
481
506
  temp_run_outputs_dir: str,
482
507
  outputs_dir: str,
483
- stdout_output: dict[str, Any],
508
+ stdout_output: Union[str, dict[str, Any]],
484
509
  temp_src: str,
485
510
  manifest: Manifest,
511
+ src: str,
486
512
  ) -> None:
487
513
  """
488
- Processes the statistics of the run. Check for an outputs/statistics folder
489
- being created by the run. If it exists, copy it to the run directory. If it
490
- doesn't exist, attempt to get the stats from stdout.
514
+ Processes the statistics of the run. Checks for an outputs/statistics folder
515
+ or custom statistics file location from manifest. If found, copies to run
516
+ directory. Otherwise, attempts to extract statistics from stdout.
491
517
 
492
518
  Parameters
493
519
  ----------
@@ -495,12 +521,15 @@ def process_run_statistics(
495
521
  The path to the temporary outputs directory.
496
522
  outputs_dir : str
497
523
  The path to the outputs directory in the run directory.
498
- stdout_output : dict[str, Any]
499
- The stdout output of the run, parsed as a dictionary.
524
+ stdout_output : Union[str, dict[str, Any]]
525
+ The stdout output of the run, either as raw string or parsed dictionary.
500
526
  temp_src : str
501
527
  The path to the temporary source directory.
502
528
  manifest : Manifest
503
- The application manifest.
529
+ The application manifest containing configuration and custom paths.
530
+ src : str
531
+ The path to the original application source code, used to avoid copying
532
+ files that are already part of the source.
504
533
  """
505
534
 
506
535
  stats_dst = os.path.join(outputs_dir, STATISTICS_KEY)
@@ -524,7 +553,10 @@ def process_run_statistics(
524
553
 
525
554
  stats_src = os.path.join(temp_run_outputs_dir, STATISTICS_KEY)
526
555
  if os.path.exists(stats_src) and os.path.isdir(stats_src):
527
- shutil.copytree(stats_src, stats_dst, dirs_exist_ok=True)
556
+ _copy_new_or_modified_files(stats_src, stats_dst, src)
557
+ return
558
+
559
+ if not isinstance(stdout_output, dict):
528
560
  return
529
561
 
530
562
  if STATISTICS_KEY not in stdout_output:
@@ -538,14 +570,15 @@ def process_run_statistics(
538
570
  def process_run_assets(
539
571
  temp_run_outputs_dir: str,
540
572
  outputs_dir: str,
541
- stdout_output: dict[str, Any],
573
+ stdout_output: Union[str, dict[str, Any]],
542
574
  temp_src: str,
543
575
  manifest: Manifest,
576
+ src: str,
544
577
  ) -> None:
545
578
  """
546
- Processes the assets of the run. Check for an outputs/assets folder being
547
- created by the run. If it exists, copy it to the run directory. If it
548
- doesn't exist, attempt to get the assets from stdout.
579
+ Processes the assets of the run. Checks for an outputs/assets folder or
580
+ custom assets file location from manifest. If found, copies to run directory.
581
+ Otherwise, attempts to extract assets from stdout.
549
582
 
550
583
  Parameters
551
584
  ----------
@@ -553,12 +586,15 @@ def process_run_assets(
553
586
  The path to the temporary outputs directory.
554
587
  outputs_dir : str
555
588
  The path to the outputs directory in the run directory.
556
- stdout_output : dict[str, Any]
557
- The stdout output of the run, parsed as a dictionary.
589
+ stdout_output : Union[str, dict[str, Any]]
590
+ The stdout output of the run, either as raw string or parsed dictionary.
558
591
  temp_src : str
559
592
  The path to the temporary source directory.
560
593
  manifest : Manifest
561
- The application manifest.
594
+ The application manifest containing configuration and custom paths.
595
+ src : str
596
+ The path to the original application source code, used to avoid copying
597
+ files that are already part of the source.
562
598
  """
563
599
 
564
600
  assets_dst = os.path.join(outputs_dir, ASSETS_KEY)
@@ -582,7 +618,10 @@ def process_run_assets(
582
618
 
583
619
  assets_src = os.path.join(temp_run_outputs_dir, ASSETS_KEY)
584
620
  if os.path.exists(assets_src) and os.path.isdir(assets_src):
585
- shutil.copytree(assets_src, assets_dst, dirs_exist_ok=True)
621
+ _copy_new_or_modified_files(assets_src, assets_dst, src)
622
+ return
623
+
624
+ if not isinstance(stdout_output, dict):
586
625
  return
587
626
 
588
627
  if ASSETS_KEY not in stdout_output:
@@ -599,37 +638,42 @@ def process_run_solutions(
599
638
  temp_run_outputs_dir: str,
600
639
  temp_src: str,
601
640
  outputs_dir: str,
602
- stdout_output: dict[str, Any],
641
+ stdout_output: Union[str, dict[str, Any]],
603
642
  output_format: OutputFormat,
604
643
  manifest: Manifest,
644
+ src: str,
605
645
  ) -> None:
606
646
  """
607
- Processes the solutions (output) of the run. This method has the handle all
608
- the different formats for processing solutions. This includes looking for
609
- an `output` directory (`csv-archive`), an `outputs/solutions` directory
610
- (`multi-file`), or looking for solutions in the stdout output (`json` or
611
- `text`). For flexibility, we copy whatever is in the `output` and
612
- `outputs/solutions` directories, if they exist. If neither exist, we
613
- attempt to get the solution from stdout.
647
+ Processes the solutions (output) of the run. Handles all different output
648
+ formats including CSV-archive, multi-file, JSON, and text. Looks for
649
+ `output` directory (csv-archive), `outputs/solutions` directory (multi-file),
650
+ or custom solutions path from manifest. Falls back to stdout for JSON/text.
651
+ Updates run metadata with output size and format information.
652
+
653
+ Only copies files that are truly new outputs, excluding files that already
654
+ exist in the original source code, inputs, statistics, or assets directories
655
+ to prevent copying application data as solutions.
614
656
 
615
657
  Parameters
616
658
  ----------
617
659
  run_id : str
618
- The ID of the run.
660
+ The unique identifier of the run.
619
661
  run_dir : str
620
- The path to the run directory.
662
+ The path to the run directory where outputs are stored.
621
663
  temp_run_outputs_dir : str
622
664
  The path to the temporary outputs directory.
623
665
  temp_src : str
624
666
  The path to the temporary source directory.
625
667
  outputs_dir : str
626
668
  The path to the outputs directory in the run directory.
627
- stdout_output : dict[str, Any]
628
- The stdout output of the run, parsed as a dictionary.
669
+ stdout_output : Union[str, dict[str, Any]]
670
+ The stdout output of the run, either as raw string or parsed dictionary.
629
671
  output_format : OutputFormat
630
- The output format of the run.
672
+ The determined output format (JSON, CSV_ARCHIVE, MULTI_FILE, or TEXT).
631
673
  manifest : Manifest
632
- The application manifest.
674
+ The application manifest containing configuration and custom paths.
675
+ src : str
676
+ The path to the application source code.
633
677
  """
634
678
 
635
679
  info_file = os.path.join(run_dir, f"{run_id}.json")
@@ -640,9 +684,12 @@ def process_run_solutions(
640
684
  solutions_dst = os.path.join(outputs_dir, SOLUTIONS_KEY)
641
685
  os.makedirs(solutions_dst, exist_ok=True)
642
686
 
687
+ # Build list of directories to exclude from copying
688
+ exclusion_dirs = _build_exclusion_directories(src, manifest, outputs_dir, run_dir)
689
+
643
690
  if output_format == OutputFormat.CSV_ARCHIVE:
644
691
  output_src = os.path.join(temp_src, OUTPUT_KEY)
645
- shutil.copytree(output_src, solutions_dst, dirs_exist_ok=True)
692
+ _copy_new_or_modified_files(output_src, solutions_dst, src, exclusion_dirs)
646
693
  elif output_format == OutputFormat.MULTI_FILE:
647
694
  solutions_src = os.path.join(temp_run_outputs_dir, SOLUTIONS_KEY)
648
695
  if (
@@ -653,11 +700,14 @@ def process_run_solutions(
653
700
  ):
654
701
  solutions_src = os.path.join(temp_src, manifest.configuration.content.multi_file.output.solutions)
655
702
 
656
- shutil.copytree(solutions_src, solutions_dst, dirs_exist_ok=True)
703
+ _copy_new_or_modified_files(solutions_src, solutions_dst, src, exclusion_dirs)
657
704
  else:
658
- if stdout_output:
705
+ if bool(stdout_output):
659
706
  with open(os.path.join(solutions_dst, DEFAULT_OUTPUT_JSON_FILE), "w") as f:
660
- json.dump(stdout_output, f, indent=2)
707
+ if isinstance(stdout_output, dict):
708
+ json.dump(stdout_output, f, indent=2)
709
+ elif isinstance(stdout_output, str):
710
+ f.write(stdout_output)
661
711
 
662
712
  # Update the run information file with the output size and type.
663
713
  calculate_files_size(run_dir, run_id, solutions_dst, metadata_key="output_size")
@@ -669,14 +719,15 @@ def process_run_solutions(
669
719
  def process_run_visuals(run_dir: str, outputs_dir: str) -> None:
670
720
  """
671
721
  Processes the visuals from the assets in the run output. This function looks
672
- for Plotly assets and generates HTML files for each visual.
722
+ for visual assets (Plotly and GeoJSON) in the assets.json file and generates
723
+ HTML files for each visual. ChartJS visuals are ignored for local runs.
673
724
 
674
725
  Parameters
675
726
  ----------
676
727
  run_dir : str
677
- The path to the run directory.
728
+ The path to the run directory where visuals will be stored.
678
729
  outputs_dir : str
679
- The path to the outputs directory in the run directory.
730
+ The path to the outputs directory in the run directory containing assets.
680
731
  """
681
732
 
682
733
  # Get the assets.
@@ -710,5 +761,265 @@ def process_run_visuals(run_dir: str, outputs_dir: str) -> None:
710
761
  # so we ignore it for now.
711
762
 
712
763
 
764
def resolve_stdout(result: subprocess.CompletedProcess[str]) -> Union[str, dict[str, Any]]:
    """
    Resolves the stdout output of the subprocess run. If the stdout is a JSON
    object, it returns the parsed dictionary. Otherwise, it returns the raw
    string output.

    Parameters
    ----------
    result : subprocess.CompletedProcess[str]
        The result of the subprocess run.

    Returns
    -------
    Union[str, dict[str, Any]]
        An empty string if stdout is missing or only whitespace, the parsed
        dictionary if stdout is a JSON object, otherwise the raw string
        output.
    """
    # stdout is None when the subprocess was run without capturing output.
    raw_output = result.stdout
    if raw_output is None or raw_output.strip() == "":
        return ""

    try:
        parsed = json.loads(raw_output)
    except json.JSONDecodeError:
        return raw_output

    # Valid JSON is not necessarily an object: arrays, numbers, strings,
    # booleans and null also parse. Only a dict honours the declared return
    # type; anything else is handed back as the raw text so that it is never
    # silently discarded by code that only checks for `dict` or `str`.
    if isinstance(parsed, dict):
        return parsed

    return raw_output
789
+
790
+
791
def ignore_patterns(dir_path: str, names: list[str]) -> list[str]:
    """
    Ignore callback for `shutil.copytree` used when copying the application
    source code.

    An entry is ignored when any of the following holds:

    - Its name is the nextmv directory, a well-known virtual environment
      name, or `__pycache__`.
    - It is a regular file that is neither a `.py` file nor `app.yaml`.
    - It is not a regular file and its name ends in `.pyc`.

    Parameters
    ----------
    dir_path : str
        The path to the directory being processed.
    names : list[str]
        The names of the entries found in `dir_path`.

    Returns
    -------
    list[str]
        The subset of `names` to leave out of the copy, in the original order.
    """
    always_skipped = {
        NEXTMV_DIR,
        "venv",
        ".venv",
        "env",
        ".env",
        "virtualenv",
        ".virtualenv",
        "__pycache__",
    }

    def _is_ignored(entry: str) -> bool:
        if entry in always_skipped:
            return True

        if os.path.isfile(os.path.join(dir_path, entry)):
            # Only Python sources and the application manifest are kept.
            return not (entry.endswith(".py") or entry == "app.yaml")

        return entry.endswith(".pyc")

    return [entry for entry in names if _is_ignored(entry)]
841
+
842
+
843
def _build_exclusion_directories(src: str, manifest: Manifest, outputs_dir: str, run_dir: str) -> list[str]:
    """
    Collect the existing directories whose files must not be copied as
    solution files.

    The candidates, in order, are: the inputs directory of the original
    source, the custom inputs directory declared in a multi-file manifest, the
    inputs directory of the run, and the statistics and assets directories of
    the run outputs. Only candidates that exist on disk are returned.

    Parameters
    ----------
    src : str
        The path to the original application source code.
    manifest : Manifest
        The application manifest containing configuration.
    outputs_dir : str
        The path to the outputs directory in the run directory.
    run_dir : str
        The path to the run directory.

    Returns
    -------
    list[str]
        List of existing directory paths to exclude from copying.
    """
    candidates = [os.path.join(src, INPUTS_KEY)]

    # A custom inputs location only applies to multi-file manifests.
    configuration = manifest.configuration
    content = configuration.content if configuration is not None else None
    if content is not None and content.format == InputFormat.MULTI_FILE and content.multi_file is not None:
        candidates.append(os.path.join(src, content.multi_file.input.path))

    candidates.append(os.path.join(run_dir, INPUTS_KEY))
    candidates.append(os.path.join(outputs_dir, STATISTICS_KEY))
    candidates.append(os.path.join(outputs_dir, ASSETS_KEY))

    return [candidate for candidate in candidates if os.path.exists(candidate)]
896
+
897
+
898
def _copy_new_or_modified_files(
    src_dir: str, dst_dir: str, original_src_dir: Optional[str] = None, exclusion_dirs: Optional[list[str]] = None
) -> None:
    """
    Mirror `src_dir` into `dst_dir`, copying only the files that qualify.

    A file is copied when both conditions hold:

    1. No file exists at the same relative path in `original_src_dir` or in
       any of the `exclusion_dirs` (when those are provided).
    2. `_should_copy_file` approves the copy for the source and destination
       paths.

    The directory structure of `src_dir` is recreated under `dst_dir` even
    for directories from which no file ends up being copied.

    Parameters
    ----------
    src_dir : str
        The source directory path to copy from.
    dst_dir : str
        The destination directory path to copy to.
    original_src_dir : Optional[str], optional
        The original source directory to check against. Files present in this
        directory will NOT be copied, by default None.
    exclusion_dirs : Optional[list[str]], optional
        Additional directories to check against. Files present in any of these
        directories will NOT be copied, by default None.
    """
    excluded_roots = [] if original_src_dir is None else [original_src_dir]
    excluded_roots += exclusion_dirs or []

    for current_dir, _subdirs, file_names in os.walk(src_dir):
        relative_dir = os.path.relpath(current_dir, src_dir)
        target_dir = os.path.join(dst_dir, relative_dir) if relative_dir != "." else dst_dir
        os.makedirs(target_dir, exist_ok=True)

        for file_name in file_names:
            excluded = bool(excluded_roots) and _file_exists_in_exclusion_dirs(
                file_name, relative_dir, excluded_roots
            )
            if excluded:
                continue

            source_path = os.path.join(current_dir, file_name)
            target_path = os.path.join(target_dir, file_name)
            if _should_copy_file(source_path, target_path):
                shutil.copy2(source_path, target_path)
945
+
946
+
947
+ def _should_copy_file(src_file: str, dst_file: str) -> bool:
948
+ """
949
+ Determine if a file should be copied based on existence and content.
950
+
951
+ Parameters
952
+ ----------
953
+ src_file : str
954
+ Path to the source file.
955
+ dst_file : str
956
+ Path to the destination file.
957
+
958
+ Returns
959
+ -------
960
+ bool
961
+ True if the file should be copied, False otherwise.
962
+ """
963
+ if not os.path.exists(dst_file):
964
+ return True
965
+
966
+ try:
967
+ src_checksum = _calculate_file_checksum(src_file)
968
+ dst_checksum = _calculate_file_checksum(dst_file)
969
+ return src_checksum != dst_checksum
970
+ except OSError:
971
+ return True
972
+
973
+
974
+ def _calculate_file_checksum(file_path: str) -> str:
975
+ """
976
+ Calculate MD5 checksum of a file.
977
+
978
+ Parameters
979
+ ----------
980
+ file_path : str
981
+ The path to the file.
982
+
983
+ Returns
984
+ -------
985
+ str
986
+ The MD5 checksum of the file.
987
+ """
988
+ hash_md5 = hashlib.md5()
989
+ with open(file_path, "rb") as f:
990
+ for chunk in iter(lambda: f.read(4096), b""):
991
+ hash_md5.update(chunk)
992
+ return hash_md5.hexdigest()
993
+
994
+
995
+ def _file_exists_in_exclusion_dirs(file_name: str, rel_root: str, exclusion_dirs: list[str]) -> bool:
996
+ """
997
+ Check if a file exists in any of the exclusion directories.
998
+
999
+ Parameters
1000
+ ----------
1001
+ file_name : str
1002
+ The name of the file to check.
1003
+ rel_root : str
1004
+ The relative root path from the source directory.
1005
+ exclusion_dirs : list[str]
1006
+ List of directories to check against.
1007
+
1008
+ Returns
1009
+ -------
1010
+ bool
1011
+ True if the file exists in any exclusion directory, False otherwise.
1012
+ """
1013
+ for exclusion_dir in exclusion_dirs:
1014
+ if rel_root != ".":
1015
+ exclusion_file = os.path.join(exclusion_dir, rel_root, file_name)
1016
+ else:
1017
+ exclusion_file = os.path.join(exclusion_dir, file_name)
1018
+
1019
+ if os.path.exists(exclusion_file):
1020
+ return True
1021
+ return False
1022
+
1023
+
713
1024
  if __name__ == "__main__":
714
1025
  main()