runzi-0.9.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. runzi-0.9.0/PKG-INFO +61 -0
  2. runzi-0.9.0/README.md +31 -0
  3. runzi-0.9.0/pyproject.toml +118 -0
  4. runzi-0.9.0/runzi/__init__.py +1 -0
  5. runzi-0.9.0/runzi/arguments.py +119 -0
  6. runzi-0.9.0/runzi/automation/__init__.py +0 -0
  7. runzi-0.9.0/runzi/automation/azimuthal_rupture_set_builder_task.py +180 -0
  8. runzi-0.9.0/runzi/automation/file_utils.py +178 -0
  9. runzi-0.9.0/runzi/automation/inversion_hazard_report_task.py +272 -0
  10. runzi-0.9.0/runzi/automation/local_config.py +72 -0
  11. runzi-0.9.0/runzi/automation/master.py +20 -0
  12. runzi-0.9.0/runzi/automation/opensha_task_factory.py +236 -0
  13. runzi-0.9.0/runzi/automation/python_task_factory.py +151 -0
  14. runzi-0.9.0/runzi/automation/schedule_tasks.py +50 -0
  15. runzi-0.9.0/runzi/automation/task_config.py +25 -0
  16. runzi-0.9.0/runzi/automation/task_utils.py +52 -0
  17. runzi-0.9.0/runzi/automation/toshi_api/__init__.py +2 -0
  18. runzi-0.9.0/runzi/automation/toshi_api/aggregate_inversion_solution.py +159 -0
  19. runzi-0.9.0/runzi/automation/toshi_api/automation_task.py +159 -0
  20. runzi-0.9.0/runzi/automation/toshi_api/general_task.py +209 -0
  21. runzi-0.9.0/runzi/automation/toshi_api/inversion_solution.py +149 -0
  22. runzi-0.9.0/runzi/automation/toshi_api/inversion_solution_nrml.py +99 -0
  23. runzi-0.9.0/runzi/automation/toshi_api/openquake_hazard/__init__.py +4 -0
  24. runzi-0.9.0/runzi/automation/toshi_api/openquake_hazard/openquake_hazard_config.py +82 -0
  25. runzi-0.9.0/runzi/automation/toshi_api/openquake_hazard/openquake_hazard_solution.py +88 -0
  26. runzi-0.9.0/runzi/automation/toshi_api/openquake_hazard/openquake_hazard_task.py +131 -0
  27. runzi-0.9.0/runzi/automation/toshi_api/scaled_inversion_solution.py +127 -0
  28. runzi-0.9.0/runzi/automation/toshi_api/time_dependent_inversion_solution.py +111 -0
  29. runzi-0.9.0/runzi/automation/toshi_api/toshi_api.py +303 -0
  30. runzi-0.9.0/runzi/automation/worker.py +20 -0
  31. runzi-0.9.0/runzi/aws/__init__.py +1 -0
  32. runzi-0.9.0/runzi/aws/aws.py +208 -0
  33. runzi-0.9.0/runzi/aws/s3_folder_upload.py +89 -0
  34. runzi-0.9.0/runzi/build_tasks.py +99 -0
  35. runzi-0.9.0/runzi/cli/__init__.py +1 -0
  36. runzi-0.9.0/runzi/cli/hazard_cli.py +39 -0
  37. runzi-0.9.0/runzi/cli/inversion_cli.py +40 -0
  38. runzi-0.9.0/runzi/cli/inversion_post_process_cli.py +58 -0
  39. runzi-0.9.0/runzi/cli/reports_cli.py +46 -0
  40. runzi-0.9.0/runzi/cli/runzi_cli.py +16 -0
  41. runzi-0.9.0/runzi/cli/rupture_sets_cli.py +36 -0
  42. runzi-0.9.0/runzi/cli/save_bg_file_archives.py +72 -0
  43. runzi-0.9.0/runzi/cli/save_distseis_mastertable.py +67 -0
  44. runzi-0.9.0/runzi/cli/utils_cli.py +48 -0
  45. runzi-0.9.0/runzi/job_runner.py +133 -0
  46. runzi-0.9.0/runzi/tasks/__init__.py +0 -0
  47. runzi-0.9.0/runzi/tasks/average_solutions/__init__.py +2 -0
  48. runzi-0.9.0/runzi/tasks/average_solutions/average_solutions_runner.py +43 -0
  49. runzi-0.9.0/runzi/tasks/average_solutions/average_solutions_task.py +245 -0
  50. runzi-0.9.0/runzi/tasks/coulomb_rupture_sets/__init__.py +2 -0
  51. runzi-0.9.0/runzi/tasks/coulomb_rupture_sets/coulomb_rupture_set_builder_task.py +297 -0
  52. runzi-0.9.0/runzi/tasks/coulomb_rupture_sets/coulomb_rupture_sets_runner.py +25 -0
  53. runzi-0.9.0/runzi/tasks/get_config.py +25 -0
  54. runzi-0.9.0/runzi/tasks/inversion/__init__.py +4 -0
  55. runzi-0.9.0/runzi/tasks/inversion/crustal_inversion_runner.py +25 -0
  56. runzi-0.9.0/runzi/tasks/inversion/crustal_inversion_solution_task.py +205 -0
  57. runzi-0.9.0/runzi/tasks/inversion/inversion_solution_builder.py +476 -0
  58. runzi-0.9.0/runzi/tasks/inversion/inversion_sub_solution_task.py +156 -0
  59. runzi-0.9.0/runzi/tasks/inversion/subduction_inversion_runner.py +25 -0
  60. runzi-0.9.0/runzi/tasks/inversion/subduction_inversion_solution_task.py +109 -0
  61. runzi-0.9.0/runzi/tasks/inversion_report/__init__.py +2 -0
  62. runzi-0.9.0/runzi/tasks/inversion_report/inversion_diags_report_task.py +159 -0
  63. runzi-0.9.0/runzi/tasks/inversion_report/inversion_report_runner.py +39 -0
  64. runzi-0.9.0/runzi/tasks/oq_hazard/__init__.py +3 -0
  65. runzi-0.9.0/runzi/tasks/oq_hazard/execute_openquake.py +133 -0
  66. runzi-0.9.0/runzi/tasks/oq_hazard/hazard_args.py +257 -0
  67. runzi-0.9.0/runzi/tasks/oq_hazard/oq_disagg_runner.py +134 -0
  68. runzi-0.9.0/runzi/tasks/oq_hazard/oq_disagg_task.py +334 -0
  69. runzi-0.9.0/runzi/tasks/oq_hazard/oq_hazard_runner.py +111 -0
  70. runzi-0.9.0/runzi/tasks/oq_hazard/oq_hazard_task.py +328 -0
  71. runzi-0.9.0/runzi/tasks/oq_opensha_convert/__init__.py +2 -0
  72. runzi-0.9.0/runzi/tasks/oq_opensha_convert/oq_convert_solution_runner.py +30 -0
  73. runzi-0.9.0/runzi/tasks/oq_opensha_convert/oq_opensha_convert_task.py +199 -0
  74. runzi-0.9.0/runzi/tasks/rupset_report/__init__.py +2 -0
  75. runzi-0.9.0/runzi/tasks/rupset_report/ruptset_diags_report_task.py +116 -0
  76. runzi-0.9.0/runzi/tasks/rupset_report/rupture_set_report_runner.py +37 -0
  77. runzi-0.9.0/runzi/tasks/scale_solution/__init__.py +2 -0
  78. runzi-0.9.0/runzi/tasks/scale_solution/scale_solution_runner.py +29 -0
  79. runzi-0.9.0/runzi/tasks/scale_solution/scale_solution_task.py +202 -0
  80. runzi-0.9.0/runzi/tasks/subduction_rupture_sets/__init__.py +2 -0
  81. runzi-0.9.0/runzi/tasks/subduction_rupture_sets/subduction_rupture_set_builder_task.py +210 -0
  82. runzi-0.9.0/runzi/tasks/subduction_rupture_sets/subduction_rupture_sets_runner.py +23 -0
  83. runzi-0.9.0/runzi/tasks/time_dependent_solution/__init__.py +2 -0
  84. runzi-0.9.0/runzi/tasks/time_dependent_solution/time_dependent_solution_runner.py +44 -0
  85. runzi-0.9.0/runzi/tasks/time_dependent_solution/time_dependent_solution_task.py +181 -0
  86. runzi-0.9.0/runzi/tasks/toshi_utils.py +32 -0
  87. runzi-0.9.0/runzi/tasks/utils/__init__.py +2 -0
  88. runzi-0.9.0/runzi/tasks/utils/build_manual_index.py +240 -0
  89. runzi-0.9.0/runzi/tasks/utils/save_file_archive.py +138 -0
  90. runzi-0.9.0/runzi/utils.py +20 -0
runzi-0.9.0/PKG-INFO ADDED
@@ -0,0 +1,61 @@
+ Metadata-Version: 2.4
+ Name: runzi
+ Version: 0.9.0
+ Summary: scripting and cli for the NSHM
+ License-Expression: GPL-3.0-only
+ Author: Chris DiCaprio
+ Author-email: christopher.dicaprio@gmail.com
+ Requires-Python: >=3.11,<3.12
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.11
+ Requires-Dist: boto3 (>=1.26.43)
+ Requires-Dist: click (>=8.1.7)
+ Requires-Dist: gitpython (>=3.1.46)
+ Requires-Dist: lxml (>=4.9.2)
+ Requires-Dist: nshm-toshi-client (>=1.1.0,<2.0.0)
+ Requires-Dist: nzshm-common[geometry] (>=0.8.4)
+ Requires-Dist: nzshm-hazlab (>=0.1.1,<0.2.0)
+ Requires-Dist: nzshm-model (>=0.13.6)
+ Requires-Dist: py4j (==0.10.9.1)
+ Requires-Dist: pydantic (>=2.12)
+ Requires-Dist: python-dateutil (>=2.8.2)
+ Requires-Dist: python-dotenv (>=1.1.0)
+ Requires-Dist: pytz (>=2025.1)
+ Requires-Dist: solvis (>=1.2.0,<2.0.0)
+ Requires-Dist: toshi-hazard-store (>=1.2.3,<2.0.0)
+ Requires-Dist: tqdm (>=4.67.3)
+ Requires-Dist: typer (>=0.17.4,<0.18.0)
+ Description-Content-Type: text/markdown
+
+ # nzshm-runzi
+
+
+ [![pypi](https://img.shields.io/pypi/v/nzshm-runzi.svg)](https://pypi.org/project/nzshm-runzi/)
+ [![python](https://img.shields.io/pypi/pyversions/nzshm-runzi.svg)](https://pypi.org/project/nzshm-runzi/)
+ [![Build Status](https://github.com/GNS-Science/nzshm-runzi/actions/workflows/dev.yml/badge.svg)](https://github.com/GNS-Science/nzshm-runzi/actions/workflows/dev.yml)
+ [![codecov](https://codecov.io/gh/GNS-Science/nzshm-runzi/branch/main/graphs/badge.svg)](https://codecov.io/github/GNS-Science/nzshm-runzi)
+
+ * Documentation: <https://GNS-Science.github.io/nzshm-runzi>
+ * GitHub: <https://github.com/GNS-Science/nzshm-runzi>
+ * PyPI: <https://pypi.org/project/nzshm-runzi/>
+ * Free software: GPL-3.0-only
+
+ Python application for running, scheduling, and collecting inputs & outputs of NZSHM jobs on workstations, AWS cloud, and HPC cluster
+
+ runzi is used by the ESNZ NSHM programme to run OpenSHA style inversions, hazard calculations, and other computational tasks.
+
+ - Provides a CLI for launching jobs locally or using AWS EC2 services (HPC is currently unsupported after the move from PBS to Slurm).
+ - Coordinates with [toshi API](https://github.com/GNS-Science/nshm-toshi-api) and [toshi-hazard-store](https://github.com/GNS-Science/toshi-hazard-store) to look up and store results and metadata.
+
+ ## Run
+ ```console
+ $ runzi [OPTIONS] COMMAND [ARGS]...
+ ```
+
+ ```console
+ $ runzi --help
+ ```
+
+
+
+
runzi-0.9.0/README.md ADDED
@@ -0,0 +1,31 @@
+ # nzshm-runzi
+
+
+ [![pypi](https://img.shields.io/pypi/v/nzshm-runzi.svg)](https://pypi.org/project/nzshm-runzi/)
+ [![python](https://img.shields.io/pypi/pyversions/nzshm-runzi.svg)](https://pypi.org/project/nzshm-runzi/)
+ [![Build Status](https://github.com/GNS-Science/nzshm-runzi/actions/workflows/dev.yml/badge.svg)](https://github.com/GNS-Science/nzshm-runzi/actions/workflows/dev.yml)
+ [![codecov](https://codecov.io/gh/GNS-Science/nzshm-runzi/branch/main/graphs/badge.svg)](https://codecov.io/github/GNS-Science/nzshm-runzi)
+
+ * Documentation: <https://GNS-Science.github.io/nzshm-runzi>
+ * GitHub: <https://github.com/GNS-Science/nzshm-runzi>
+ * PyPI: <https://pypi.org/project/nzshm-runzi/>
+ * Free software: GPL-3.0-only
+
+ Python application for running, scheduling, and collecting inputs & outputs of NZSHM jobs on workstations, AWS cloud, and HPC cluster
+
+ runzi is used by the ESNZ NSHM programme to run OpenSHA style inversions, hazard calculations, and other computational tasks.
+
+ - Provides a CLI for launching jobs locally or using AWS EC2 services (HPC is currently unsupported after the move from PBS to Slurm).
+ - Coordinates with [toshi API](https://github.com/GNS-Science/nshm-toshi-api) and [toshi-hazard-store](https://github.com/GNS-Science/toshi-hazard-store) to look up and store results and metadata.
+
+ ## Run
+ ```console
+ $ runzi [OPTIONS] COMMAND [ARGS]...
+ ```
+
+ ```console
+ $ runzi --help
+ ```
+
+
+
runzi-0.9.0/pyproject.toml ADDED
@@ -0,0 +1,118 @@
+ [project]
+ name = "runzi"
+ version = "0.9.0"
+ readme = "README.md"
+ authors = [
+     {name = "Chris DiCaprio", email = "christopher.dicaprio@gmail.com"},
+     {name = "Chris Chamberlain", email = "chrisbc@artisan.co.nz"},
+     {name = "Oakley Jurgens", email = "o.jurgens@gns.cri.nz"},
+ ]
+ description = "scripting and cli for the NSHM"
+ license = "GPL-3.0-only"
+ requires-python = ">=3.11,<3.12"
+ packages = [
+     { include = "runzi" },
+     { include = "tests", format = "sdist" },
+ ]
+ dependencies = [
+     "gitpython (>=3.1.46)",
+     "py4j (==0.10.9.1)",
+     "boto3 (>=1.26.43)",
+     "lxml (>=4.9.2)",
+     "tqdm (>=4.67.3)",
+     "click (>=8.1.7)",
+     "python-dateutil (>=2.8.2)",
+     "nzshm-common[geometry] (>=0.8.4)",
+     "pytz (>=2025.1)",
+     "pydantic (>=2.12)",
+     "nzshm-model (>=0.13.6)",
+     "python-dotenv (>=1.1.0)",
+     "toshi-hazard-store (>=1.2.3,<2.0.0)",
+     "typer (>=0.17.4,<0.18.0)",
+     "solvis (>=1.2.0,<2.0.0)",
+     "nshm-toshi-client (>=1.1.0,<2.0.0)",
+     "nzshm-hazlab (>=0.1.1,<0.2.0)",
+ ]
+
+
+
+
+ [project.scripts]
+ runzi = 'runzi.cli.runzi_cli:app'
+
+
+ [dependency-groups]
+ dev = [
+     "pytest",
+     "flake8",
+     "black",
+     "isort",
+     "bump2version",
+     "tox",
+     "mypy",
+     "pytest-cov",
+     "types-requests",
+     "types-pytz",
+     "types-python-dateutil",
+     "pytest-mock",
+     "vulture",
+     "safety",
+     "pip-audit",
+     "chardet (<6)",
+ ]
+
+
+ doc = [
+     "mkdocs",
+     "mkdocs-material",
+     "mkdocs-include-markdown-plugin",
+     "mkdocstrings",
+     "mkdocstrings-python",
+     "griffe (>=2.0.0,<3.0.0)",
+     "griffe-pydantic (>=1.3.1,<2.0.0)",
+     "griffe-fieldz (>=0.4.0,<0.5.0)",
+     "mkdocs-gen-files (>=0.6.0,<0.7.0)",
+ ]
+
+
+
+ [tool.isort]
+ multi_line_output = 3
+ include_trailing_comma = true
+ force_grid_wrap = 0
+ use_parentheses = true
+ ensure_newline_before_comments = true
+ line_length = 120
+ skip_gitignore = true
+
+ [tool.black]
+ line-length = 120
+ skip-string-normalization = true
+ target-version = ['py311']
+ include = '\.pyi?$'
+ exclude = '''
+ /(
+     \.eggs
+     | \.git
+     | \.hg
+     | \.mypy_cache
+     | \.tox
+     | \.venv
+     | _build
+     | buck-out
+     | build
+     | dist
+     | runzi/CONFIG
+ )/
+ '''
+
+ [tool.mypy]
+ ignore_missing_imports = true
+
+ [tool.poetry.requires-plugins]
+ poetry-plugin-export = ">=1.8"
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"
+
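The `[project.scripts]` table above maps the `runzi` console command to the `app` object exported by `runzi.cli.runzi_cli` (presumably a Typer application, given the `typer` dependency). A minimal sketch of what that entry point resolves to once the package is installed; the `__main__` guard here is illustrative, not part of the package:

```python
# Hypothetical equivalent of the generated `runzi` console script:
# it imports the callable named in [project.scripts] and invokes it.
from runzi.cli.runzi_cli import app

if __name__ == "__main__":
    app()  # same effect as typing `runzi` in a shell
```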
runzi-0.9.0/runzi/__init__.py ADDED
@@ -0,0 +1 @@
+ __version__ = '0.9.0'
runzi-0.9.0/runzi/arguments.py ADDED
@@ -0,0 +1,119 @@
+ import copy
+ import json
+ from enum import Enum
+ from pathlib import Path
+ from typing import Any, Generator, Optional, Sequence
+
+ from pydantic import BaseModel
+ from typing_extensions import Self
+
+ from runzi.aws import BatchEnvironmentSetting
+
+
+ class TaskLanguage(Enum):
+     PYTHON = 'python'
+     JAVA = 'java'
+
+
+ class SystemArgs(BaseModel):
+
+     task_language: TaskLanguage
+     general_task_id: Optional[str] = None
+     task_count: int = 0
+     use_api: bool
+
+     java_threads: Optional[int] = None  # only used for pbs mode, which is no longer supported
+     jvm_heap_max: Optional[int] = None
+     java_gateway_port: Optional[int] = None
+
+     ecs_max_job_time_min: int
+     ecs_memory: int
+     ecs_vcpu: int
+     ecs_job_definition: str
+     ecs_job_queue: str
+     ecs_extra_env: Optional[list[BatchEnvironmentSetting]] = None
+
+
+ class ArgSweeper:
+     """Class to hold argument prototype and swept arguments."""
+
+     def __init__(
+         self,
+         prototype_args: BaseModel,
+         swept_args: dict[str, Sequence[Any]],
+         title: str,
+         description: str,
+         sys_arg_overrides: Optional[dict[str, Any]] = None,
+     ):
+         """Initialize an ArgSweeper instance.
+
+         Args:
+             prototype_args: The prototype job argument object.
+             swept_args: A dictionary of argument names to lists of values to be swept.
+             title: The title for the job.
+             description: The description for the job.
+             sys_arg_overrides: System arguments to override from the default of the JobRunner.
+         """
+
+         self.prototype_args = prototype_args
+         self.swept_args = swept_args
+         self.title = title
+         self.description = description
+         self.sys_arg_overrides = sys_arg_overrides or {}
+
+     @classmethod
+     def from_config_file(cls, config_file: Path | str, args_class: type[BaseModel]) -> Self:
+         """Create a prototype job argument object and a dict of arguments to be swept.
+
+         Config files are json format and can optionally contain a "swept_args" object that specifies the names and
+         list of values for an argument to take in the jobs to be created. The prototype object is generated from the
+         first value from each of the swept arguments. The dict keys are the argument names and values are lists of
+         argument values.
+
+         Args:
+             config_file: File-like object or path to configuration file.
+             args_class: The type (class) of the configuration/arguments object.
+
+         Returns:
+             An ArgSweeper holding the prototype config object and the dictionary of arguments to be swept.
+         """
+
+         json_str = Path(config_file).read_text()
+         data = json.loads(json_str)
+         title = data.pop("title")
+         description = data.pop("description")
+         swept_args = data.pop("swept_args", {})
+         sys_arg_overrides = data.pop("sys_arg_overrides", {})
+
+         if swept_args:
+             for k, v in swept_args.items():
+                 if k in data:
+                     raise ValueError(f"Swept argument '{k}' also specified in unswept arguments")
+                 if not all(isinstance(item, type(v[0])) for item in v):
+                     raise ValueError(f"All values for swept argument '{k}' must be of the same type")
+                 data[k] = v[0]
+         # we include the base_path context so that any args_class that needs to
+         # resolve absolute paths can (e.g., used by HazardArgs)
+         prototype = args_class.model_validate(
+             data, extra='forbid', context={"base_path": Path(config_file).parent.resolve()}
+         )
+
+         return cls(prototype, swept_args, title, description, sys_arg_overrides)
+
+     def get_tasks(self) -> Generator[BaseModel, None, None]:
+         """Generate all combinations of swept arguments as job argument objects.
+
+         Yields:
+             Job argument objects for each combination of swept arguments.
+         """
+         from itertools import product
+
+         if not self.swept_args:
+             yield self.prototype_args
+             return
+
+         prototype_data = self.prototype_args.model_dump()
+         for values in product(*self.swept_args.values()):
+             update_data = dict(zip(self.swept_args.keys(), values))
+             prototype_data_copy = copy.deepcopy(prototype_data)
+             yield self.prototype_args.model_validate(prototype_data_copy | update_data)
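`ArgSweeper.get_tasks` expands `swept_args` into the Cartesian product of all swept values, overlaying each combination on the prototype before re-validating. A self-contained sketch of that expansion using plain dicts; the field names and values here are illustrative, not taken from a real runzi config:

```python
# Sketch of the sweep expansion performed by ArgSweeper.get_tasks(),
# using plain dicts instead of pydantic models.
from itertools import product

prototype = {"fault_model": "CFM_0_3_SANSTVZ", "max_jump_distance": 5.0, "thinning_factor": 0.0}
swept = {"max_jump_distance": [5.0, 10.0, 15.0], "thinning_factor": [0.0, 0.1]}

# one argument set per combination of swept values, overlaid on the prototype
tasks = [prototype | dict(zip(swept, values)) for values in product(*swept.values())]
assert len(tasks) == 6  # 3 values x 2 values
```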
runzi-0.9.0/runzi/automation/__init__.py ADDED
File without changes
runzi-0.9.0/runzi/automation/azimuthal_rupture_set_builder_task.py ADDED
@@ -0,0 +1,180 @@
+ import argparse
+ import datetime as dt
+ import json
+ import os
+ import platform
+ import time
+ from pathlib import PurePath
+
+ import git
+ from dateutil.tz import tzutc
+ from nshm_toshi_client.general_task import GeneralTask
+ from nshm_toshi_client.rupture_generation_task import RuptureGenerationTask
+ from nshm_toshi_client.task_relation import TaskRelation
+ from py4j.java_gateway import GatewayParameters, JavaGateway
+
+ API_URL = os.getenv('NZSHM22_TOSHI_API_URL', "http://127.0.0.1:5000/graphql")
+ API_KEY = os.getenv('NZSHM22_TOSHI_API_KEY', "")
+ S3_URL = os.getenv('NZSHM22_TOSHI_S3_URL', "http://localhost:4569")
+
+
+ class RuptureSetBuilderTask:
+     """
+     The python client for a RuptureSetBuildTask
+     """
+
+     def __init__(self, job_args):
+
+         self.use_api = job_args.get('use_api', False)
+
+         # set up the java gateway binding
+         gateway = JavaGateway(gateway_parameters=GatewayParameters(port=job_args['java_gateway_port']))
+         app = gateway.entry_point
+         self._builder = app.getAzimuthalRuptureSetBuilder()
+
+         # get the root path for the task local data
+         # root_folder = PurePath(os.getcwd())
+
+         repos = ["opensha", "nshm-nz-opensha"]
+         # repo_root = root_folder
+         self._output_folder = PurePath(
+             job_args.get('working_path')
+         )  # .joinpath('tmp').joinpath(dt.datetime.utcnow().isoformat().replace(':','-'))
+         # os.mkdir(self._output_folder)
+
+         # set up the csv (backup) task recorder
+         self._writer = None  # CSVResultWriter(open(self._output_folder.joinpath('results.csv'), 'w'), repos)
+         self._repoheads = get_repo_heads(PurePath(job_args['root_folder']), repos)
+
+         if self.use_api:
+             headers = {"x-api-key": API_KEY}
+             self._ruptgen_api = RuptureGenerationTask(
+                 API_URL, S3_URL, None, with_schema_validation=True, headers=headers
+             )
+             self._general_api = GeneralTask(API_URL, S3_URL, None, with_schema_validation=True, headers=headers)
+             self._task_relation_api = TaskRelation(API_URL, None, with_schema_validation=True, headers=headers)
+
+     def ruptureSetMetrics(self):
+         metrics = {}
+         metrics["subsection_count"] = self._builder.getSubSections().size()
+         metrics["rupture_count"] = self._builder.getRuptures().size()
+         # metrics["possible_cluster_connections"] = conf.getConnectionStrategy().getClusterConnectionCount()
+
+         # get info from the configuration
+         conf = self._builder.getPlausibilityConfig()
+         conf_diags = json.loads(conf.toJSON())
+         conns = 0
+         for cluster in conf_diags['connectionStrategy']['clusters']:
+             conns += len(cluster.get('connections', []))
+         metrics["cluster_connections"] = conns
+
+         return metrics
+
+     def run(self, task_arguments, job_arguments):
+
+         # print(task_arguments)
+         # print(job_arguments)
+
+         t0 = dt.datetime.utcnow()
+
+         environment = {
+             "host": platform.node(),
+             "gitref_opensha": self._repoheads['opensha'],
+             "gitref_nshm-nz-opensha": self._repoheads['nshm-nz-opensha'],
+         }
+
+         if self.use_api:
+             # create new task in toshi_api
+             task_id = self._ruptgen_api.create_task(
+                 dict(created=dt.datetime.now(tzutc()).isoformat()), arguments=task_arguments, environment=environment
+             )
+
+             # link task to the parent task
+             self._task_relation_api.create_task_relation(job_arguments['general_task_id'], task_id)
+             # # link task to the input datafile (*.XML)
+             # self._ruptgen_api.link_task_file(task_id, crustal_id, 'READ')
+
+         else:
+             task_id = None
+
+         # Run the task....
+         ta = task_arguments
+         # for crustal
+         self._builder.setMaxFaultSections(int(ta["max_sections"])).setMaxJumpDistance(
+             float(ta["max_jump_distance"])
+         ).setPermutationStrategy(ta["connection_strategy"]).setMaxSubSectionLength(
+             float(ta["down_dip_width"])
+         ).setMinSubSectsPerParent(
+             int(ta["min_sub_sects_per_parent"])
+         ).setMinSubSections(
+             int(ta["min_sub_sections"])
+         ).setMaxCumulativeAzimuthChange(
+             float(ta["max_cumulative_azimuth"])
+         ).setThinningFactor(
+             float(ta["thinning_factor"])
+         ).setFaultModel(
+             ta["fault_model"]
+         )
+
+         # name the output file
+         outputfile = self._output_folder.joinpath(self._builder.getDescriptiveName() + ".zip")
+         print("building %s started at %s" % (outputfile, dt.datetime.utcnow().isoformat()), end=' ')
+
+         self._builder.setNumThreads(int(job_arguments["java_threads"])).buildRuptureSet()
+
+         # capture task metrics
+         duration = (dt.datetime.utcnow() - t0).total_seconds()
+         metrics = self.ruptureSetMetrics()
+
+         # write the result
+         self._builder.writeRuptureSet(str(outputfile))
+
+         if self.use_api:
+             # record the completed task
+             done_args = {
+                 'task_id': task_id,
+                 'duration': duration,
+                 'result': "SUCCESS",
+                 'state': "DONE",
+             }
+             self._ruptgen_api.complete_task(done_args, metrics)
+
+             # upload the task output
+             self._ruptgen_api.upload_task_file(task_id, outputfile, 'WRITE', meta=task_arguments)
+
+             # and the log files, why not
+             java_log_file = self._output_folder.joinpath(f"java_app.{job_arguments['java_gateway_port']}.log")
+             self._ruptgen_api.upload_task_file(task_id, java_log_file, 'WRITE')
+             pyth_log_file = self._output_folder.joinpath(f"python_script.{job_arguments['java_gateway_port']}.log")
+             self._ruptgen_api.upload_task_file(task_id, pyth_log_file, 'WRITE')
+
+         print("; took %s secs" % (dt.datetime.utcnow() - t0).total_seconds())
+
+
+ def get_repo_heads(rootdir, repos):
+     result = {}
+     for reponame in repos:
+         repo = git.Repo(rootdir.joinpath(reponame))
+         headcommit = repo.head.commit
+         result[reponame] = headcommit.hexsha
+     return result
+
+
+ if __name__ == "__main__":
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument("config")
+     args = parser.parse_args()
+
+     config_file = args.config
+     f = open(config_file, 'r', encoding='utf-8')
+     config = json.load(f)
+
+     # maybe the JVM App is a little slow to get listening
+     time.sleep(5)
+     # Wait for some more time, scaled by taskid to avoid S3 consistency issue
+     time.sleep(config['job_arguments']['task_id'] * 5)
+
+     # print(config)
+     task = RuptureSetBuilderTask(config['job_arguments'])
+     task.run(**config)
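The `__main__` block above expects a JSON config containing `job_arguments` and `task_arguments` objects, which `task.run(**config)` unpacks as keyword arguments. A hedged sketch of that shape, with keys inferred from the reads in the task code and all values purely illustrative:

```python
# Inferred shape of the JSON config consumed by the __main__ block above;
# keys come from the code, values are placeholders only.
config = {
    "job_arguments": {
        "task_id": 0,                     # used to stagger startup sleeps
        "use_api": False,                 # skip toshi API record-keeping
        "general_task_id": None,          # parent task, needed when use_api is True
        "java_gateway_port": 25333,       # py4j default gateway port
        "java_threads": 4,
        "working_path": "/tmp/runzi",
        "root_folder": "/path/to/repos",  # contains opensha and nshm-nz-opensha checkouts
    },
    "task_arguments": {
        "max_sections": 1000,
        "max_jump_distance": 15.0,
        "connection_strategy": "UCERF3",  # placeholder permutation strategy name
        "down_dip_width": 0.5,
        "min_sub_sects_per_parent": 2,
        "min_sub_sections": 2,
        "max_cumulative_azimuth": 560.0,
        "thinning_factor": 0.0,
        "fault_model": "CFM_0_3_SANSTVZ",
    },
}
```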
runzi-0.9.0/runzi/automation/file_utils.py ADDED
@@ -0,0 +1,178 @@
+ #!python3
+ """
+ helpers for upstream file retrieval
+
+ """
+
+ import os
+ from pathlib import Path, PurePath
+ from typing import TYPE_CHECKING, Any, Generator, Iterable
+
+ import requests
+
+ if TYPE_CHECKING:
+     from runzi.automation.toshi_api import ToshiApi
+
+
+ def get_output_file_ids(general_task_api, upstream_task_id, file_extension='zip'):
+
+     api_result = general_task_api.get_subtask_files(upstream_task_id)
+     for subtask in api_result['children']['edges']:
+
+         # get rupture set fault model
+         fault_model = ""
+         for filenode in subtask['node']['child']['files']['edges']:
+             # print("FN:", filenode)
+             if filenode['node']['role'] == 'READ' and filenode['node']['file']['file_name'][-3:] == file_extension:
+                 for kv in filenode['node']['file'].get('meta', []):
+                     if kv.get('k') == 'fault_model':
+                         fault_model = kv.get('v')
+                         break
+
+         # get rupture set max jump distance
+         max_jump_distance = ""
+         for filenode in subtask['node']['child']['files']['edges']:
+             # print("FN:", filenode)
+             if filenode['node']['file'].get('meta', []):
+                 for kv in filenode['node']['file'].get('meta', []):
+                     if kv.get('k') == 'max_jump_distance':
+                         max_jump_distance = kv.get('v')
+                         break
+
+         for filenode in subtask['node']['child']['files']['edges']:
+             # skip task inputs
+             if filenode['node']['role'] == 'READ':
+                 continue
+             if filenode['node']['file']['file_name'][-3:] == file_extension:
+                 # inversion_meta = dict()  ## this relies on order of
+                 # for kv in filenode['node']['file']['meta']:
+                 #     inversion_meta[kv['k']] = kv['v']
+                 res = dict(
+                     id=filenode['node']['file']['id'],
+                     file_name=filenode['node']['file']['file_name'],
+                     file_size=filenode['node']['file']['file_size'],
+                 )
+
+                 if fault_model:
+                     res['fault_model'] = fault_model
+                 if max_jump_distance:
+                     res['max_jump_distance'] = max_jump_distance
+                 yield res
+                 # TESTING
+                 # return
+
+
+ def get_output_file_id(file_api: 'ToshiApi', single_file_id: str) -> Generator[dict[str, Any], None, None]:
+
+     api_result = file_api.get_file_detail(single_file_id)
+     fault_model = ""
+     max_jump_distance = ""
+
+     print("FN:", api_result)
+     if api_result['file_name'][-3:] == "zip":
+         res = dict(id=api_result['id'], file_name=api_result['file_name'], file_size=api_result['file_size'])
+
+         if api_result.get('meta'):
+             for kv in api_result['meta']:
+                 if kv.get('k') == 'fault_model':
+                     fault_model = kv.get('v')
+
+             for kv in api_result['meta']:
+                 if kv.get('k') == 'max_jump_distance':
+                     max_jump_distance = kv.get('v')
+
+         if fault_model:
+             res['fault_model'] = fault_model
+         if max_jump_distance:
+             res['max_jump_distance'] = max_jump_distance
+         yield res  # yep, yield just the one
+
+     return
+
+
+ def get_file_meta(file_api, single_file_id):
+
+     api_result = file_api.get_file_detail(single_file_id)
+     # return api_result.get('meta')
+     res = dict()
+
+     if api_result.get('meta'):
+         for kv in api_result['meta']:
+             res[kv['k']] = kv['v']
+         return res
+     else:
+         return None
+
+
+ def get_download_info(file_api: 'ToshiApi', file_infos: Iterable[dict[str, Any]]) -> Generator[dict, None, None]:
+     """
+     [{'id': 'RmlsZToyOS4wRUVjV0E=',
+     'file_name':
+     'RupSet_Cl_FM(CFM_0_3_SANSTVZ)_noInP(T)_slRtP(0.05)_slInL(F)_cfFr(0.75)_cfRN(2)_cfRTh(0.5)_cfRP(0.01)_
+     fvJm(T)_jmPTh(0.001)_cmRkTh(360)_mxJmD(15)_plCn(T)_adMnD(6)_adScFr(0)_bi(F)_stGrSp(2)_coFr(0.5).zip',
+     'file_size': 2498443,
+     'short_name': None}]
+     """
+     # file_info = {}
+     for itm in file_infos:
+         api_result = file_api.get_file_download_url(itm['id'])
+         # print(api_result)
+         yield dict(dict(file_url=api_result['file_url']), **itm)  # merge the dicts
+
+
+ def download_files(
+     file_api: 'ToshiApi',
+     file_generator: Iterable[dict[str, Any]],
+     dest_folder: str,
+     id_suffix: bool = False,
+     overwrite: bool = False,
+     skip_existing: bool = False,
+     skip_download: bool = False,
+ ) -> dict[str, dict]:
+     """
+     file_generator = get_output_file_ids(general_api, upstream_task_id)  # for files by upstream task ID
+
+     or
+
+     file_generator = get_output_file_id(file_api, file_id)  # for file by file ID
+     """
+     downloads = dict()
+
+     for info in get_download_info(file_api, file_generator):
+
+         folder = Path(dest_folder, 'downloads', info['id'])
+         folder.mkdir(parents=True, exist_ok=True)
+
+         # we can skip if file exists and has correct file_size
+         file_path: str | PurePath = PurePath(folder, info['file_name'])
+
+         if id_suffix:
+             file_path = str(file_path).replace('.zip', f"_{info['id']}.zip")
+
+         # shortname = info['short_name'] or info['id']
+         if skip_existing and os.path.isfile(file_path):
+             print(f"Don't reprocess existing file: {file_path}")
+             continue
+
+         downloads[info['id']] = dict(id=info['id'], filepath=str(file_path), info=info)
+
+         if not overwrite and os.path.isfile(file_path):
+             print(f"Skip DL for existing file: {file_path}")
+             continue
+
+         if skip_download:
+             print("Skipping download -> aws mode")
+             continue
+
+         # here we pull the file
+         # print(info['file_url'])
+         # r0 = requests.head(info['file_url'])
+         r1 = requests.get(info['file_url'])
+         with open(str(file_path), 'wb') as f:
+             f.write(r1.content)
+             f.flush()
+         print("downloaded input file:", file_path, f)
+         if os.path.getsize(file_path) != info['file_size']:
+             raise RuntimeError("downloaded file size mismatch")
+
+     return downloads
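Tying these helpers together, the `download_files` docstring names two entry points: a generator of file records (by upstream task ID via `get_output_file_ids`, or by single file ID via `get_output_file_id`) piped into the downloader. A hedged usage sketch, assuming an already-constructed `ToshiApi` client; the client's constructor is omitted here because its signature is not shown in this diff (see `runzi/automation/toshi_api/toshi_api.py`):

```python
# Usage sketch based on the download_files docstring; `file_api` is an
# already-configured ToshiApi client supplied by the caller.
from typing import Any

from runzi.automation.file_utils import download_files, get_output_file_id


def fetch_one(file_api: Any, file_id: str, dest_folder: str) -> dict:
    """Download a single toshi file by ID and return the local download records."""
    file_generator = get_output_file_id(file_api, file_id)  # yields at most one record
    return download_files(file_api, file_generator, dest_folder, skip_existing=True)
```

Each returned record maps the toshi file ID to its verified local `filepath`, so callers can hand the downloaded archives straight to the next task in the pipeline.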