PyPI - proceed - Versions diffs - 0.0.1__py3-none-any.whl - Mend

proceed 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

proceed/__about__.py +1 -0
proceed/__init__.py +0 -0
proceed/aggregator.py +123 -0
proceed/cli.py +161 -0
proceed/config_options.py +265 -0
proceed/docker_runner.py +325 -0
proceed/file_matching.py +52 -0
proceed/model.py +813 -0
proceed/yaml_data.py +97 -0
proceed-0.0.1.dist-info/METADATA +83 -0
proceed-0.0.1.dist-info/RECORD +14 -0
proceed-0.0.1.dist-info/WHEEL +4 -0
proceed-0.0.1.dist-info/entry_points.txt +2 -0
proceed-0.0.1.dist-info/licenses/LICENSE +24 -0

proceed/__about__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "0.0.1"

proceed/__init__.py ADDED Viewed

File without changes

proceed/aggregator.py ADDED Viewed

@@ -0,0 +1,123 @@
+import logging
+from typing import Any
+from pathlib import Path
+from pandas import DataFrame
+import yaml
+from proceed.model import ExecutionRecord, Pipeline, Step, Timing, StepResult
+from proceed.file_matching import flatten_matches, file_summary, hash_contents
def summarize_results(results_path: Path, columns: list[str] = None, sort_rows_by: list[str] = None) -> DataFrame:
    """Collect execution records found under results_path into one summary table.

    Looks for files laid out as ``<results_path>/<group>/<id>/execution_record.yaml``
    and appends the rows that summarize_execution() produces for each readable record.
    Entries that are not directories are ignored at both the group and id levels.

    Args:
        results_path: directory that contains one subdirectory per results group.
        columns: column names to keep, in this order.  Names that are not present
            in the summary are ignored.  None or empty keeps all columns.
        sort_rows_by: column names to sort rows by.  Names that are not present in
            the summary are ignored.  None or empty leaves rows in collection order.

    Returns:
        A DataFrame of summary rows, which is empty when no execution records were found.
    """
    summary_rows = []
    for group_path in results_path.iterdir():
        if not group_path.is_dir():
            continue
        for id_path in group_path.iterdir():
            if not id_path.is_dir():
                continue
            for yaml_file in id_path.glob("execution_record.yaml"):
                execution_record = safe_read_execution_record(yaml_file)
                if execution_record:
                    # Use .name rather than .stem: these are directory names, and .stem
                    # would truncate a name like "v1.2" to "v1".
                    summary_rows.extend(summarize_execution(id_path.name, group_path.name, execution_record))

    summary = DataFrame(summary_rows)

    if columns:
        usable_columns = [column for column in columns if column in summary.columns]
        summary = summary.filter(items=usable_columns)

    if sort_rows_by:
        usable_columns = [column for column in sort_rows_by if column in summary.columns]
        # With no usable sort columns (for example an empty summary) there is nothing to sort by.
        if usable_columns:
            summary = summary.sort_values(usable_columns)

    return summary
def safe_read_execution_record(yaml_file: Path) -> ExecutionRecord | None:
    """Read an execution record from the given YAML file, or return None if that fails.

    Any error while opening, reading, or parsing the file is logged and treated as
    "this is not a Proceed execution record", so that one bad file does not stop
    a summary of many results.

    Args:
        yaml_file: path of the file to read.

    Returns:
        The parsed ExecutionRecord, or None when the file could not be read or parsed.
    """
    try:
        with open(yaml_file) as f:
            return ExecutionRecord.from_yaml(f.read())
    except Exception:
        # Catch Exception rather than using a bare "except:", so that
        # KeyboardInterrupt and SystemExit still propagate.
        logging.error(f"Skipping file that seems not to be a Proceed execution record: {yaml_file}")
        logging.debug("Reason for skipping the file:", exc_info=True)
        return None
def summarize_execution(results_id: str, group: str, execution_record: ExecutionRecord) -> list[dict[str, str]]:
    """Build the summary rows for one execution record.

    Each step of the amended pipeline is paired with its step result.  Every row that
    summarize_step_and_result() returns for a pair is prefixed with the pipeline-level
    columns from summarize_pipeline().

    Args:
        results_id: id to record in each row.
        group: results group to record in each row.
        execution_record: the record to summarize.

    Returns:
        A flat list of row dictionaries, in step order.
    """
    amended_pipeline = execution_record.amended
    pipeline_columns = summarize_pipeline(results_id, group, amended_pipeline, execution_record.timing)

    per_step_rows = [
        summarize_step_and_result(step, result)
        for step, result in zip(amended_pipeline.steps, execution_record.step_results)
    ]

    rows = []
    for step_rows in per_step_rows:
        for step_row in step_rows:
            rows.append({**pipeline_columns, **step_row})
    return rows
def summarize_pipeline(results_id: str, group: str, pipeline: Pipeline, timing: Timing) -> dict[str, str]:
    """Summarize pipeline-level info as a flat dictionary of column names and values.

    Args:
        results_id: value for the "results_id" column.
        group: value for the "results_group" column.
        pipeline: supplies the version, description, and args.
        timing: supplies the start, finish, and duration.

    Returns:
        The fixed pipeline columns followed by one "arg_<name>" column per pipeline arg.
    """
    summary = dict(
        proceed_version=pipeline.version,
        results_id=results_id,
        results_group=group,
        pipeline_description=pipeline.description,
        pipeline_start=timing.start,
        pipeline_finish=timing.finish,
        pipeline_duration=timing.duration,
    )
    for arg_name, arg_value in pipeline.args.items():
        summary[f"arg_{arg_name}"] = arg_value
    return summary
def summarize_step_and_result(step: Step, result: StepResult) -> list[dict[str, Any]]:
    """Summarize one step and its result as a list of rows, one row per file.

    Every row repeats the step columns, the result columns, and any custom columns
    collected from the step's summary files.  The rows differ in their file columns:
    the first row is for the log file (blank when the result has no log file),
    followed by rows for the "done", "in", "out", and "summary" files.

    Args:
        step: the step definition; its to_dict() items become "step_<key>" columns.
        result: the step result; its to_dict() items become "step_<key>" columns too,
            except for the attributes that are expanded separately here.

    Returns:
        A list of row dictionaries, which always contains at least the log file row.
    """
    step_columns = {}
    for key, value in step.to_dict().items():
        step_columns[f"step_{key}"] = str(value)

    # These result attributes are expanded into their own columns or rows below.
    expanded_separately = ("timing", "log_file", "files_done", "files_in", "files_out", "files_summary")
    result_columns = {}
    for key, value in result.to_dict().items():
        if key in expanded_separately:
            continue
        result_columns[f"step_{key}"] = str(value)
    result_columns["step_start"] = result.timing.start
    result_columns["step_finish"] = result.timing.finish
    result_columns["step_duration"] = result.timing.duration

    if not result.log_file:
        log_row = file_summary(volume="", path="", digest="", file_role="log")
    else:
        log_path = Path(result.log_file)
        log_digest = hash_contents(log_path)
        log_row = file_summary(
            volume=log_path.parent.as_posix(),
            path=log_path.name,
            digest=log_digest,
            file_role="log",
        )

    done_rows = flatten_matches(result.files_done, file_role="done")
    in_rows = flatten_matches(result.files_in, file_role="in")
    out_rows = flatten_matches(result.files_out, file_role="out")
    summary_rows = flatten_matches(result.files_summary, file_role="summary")
    file_rows = [log_row] + done_rows + in_rows + out_rows + summary_rows

    # Later summary files win when they define the same custom column.
    custom_columns = {}
    for summary_row in summary_rows:
        custom_columns.update(collect_custom_columns(summary_row["file_volume"], summary_row["file_path"]))

    rows = []
    for file_row in file_rows:
        rows.append({**step_columns, **result_columns, **file_row, **custom_columns})
    return rows
def collect_custom_columns(file_volume: str, file_path: str) -> dict[str, str]:
    """Read custom summary columns from a file.

    A file that parses as a non-empty YAML dictionary contributes that dictionary.
    Any other file is treated as plain text and contributes a single column, named
    after the file stem, that holds the stripped file content.

    Args:
        file_volume: directory that contains the file.
        file_path: path of the file within file_volume.

    Returns:
        A dictionary of custom columns, which is empty when the path is not an existing file.
    """
    path = Path(file_volume, file_path)
    # is_file() is already False for a path that doesn't exist.
    if not path.is_file():
        return {}

    with open(path) as f:
        content = f.read()

    try:
        parsed = yaml.safe_load(content)
    except yaml.YAMLError:
        # YAMLError is the base class of PyYAML's scanner, parser, composer, and
        # constructor errors.  Catching only ParserError let plain text files like
        # "a: b: c" crash the summary with a ScannerError.
        logging.info(f"Treating non-YAML file as plain text: {path.as_posix()}")
    else:
        if parsed and isinstance(parsed, dict):
            return parsed
        logging.info(f"Treating non-dictionary YAML as plain text: {path.as_posix()}")

    return {path.stem: content.strip()}

proceed/cli.py ADDED Viewed

@@ -0,0 +1,161 @@
+import sys
+import logging
+import yaml
+from pathlib import Path
+from datetime import datetime, timezone
+from argparse import ArgumentParser
+from typing import Optional, Sequence
+from proceed.model import Pipeline
+from proceed.config_options import ConfigOptions, resolve_config_options
+from proceed.docker_runner import run_pipeline
+from proceed.aggregator import summarize_results
+from proceed.__about__ import __version__ as proceed_version
# Human-readable name and version, for "--version" and for the top of each log.
version_string = f"Proceed {proceed_version}"


def set_up_logging(log_file: str | Path | None = None):
    """Configure the root logger to write to the console and, optionally, to a file.

    This may be called more than once: each call replaces the previous configuration.

    Args:
        log_file: path of a log file to write to in addition to stdout, or None
            to log to stdout only.
    """
    handlers = [
        logging.StreamHandler(sys.stdout)
    ]
    if log_file:
        handlers.append(logging.FileHandler(log_file))

    # force=True removes and closes any handlers already on the root logger.
    # Assigning "logging.root.handlers = []" dropped them without closing them,
    # which leaked the open log file of any previous FileHandler.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=handlers,
        force=True
    )
    logging.info(version_string)
def run(spec: str, config_options: ConfigOptions) -> int:
    """Execute a pipeline for "proceed run spec ...".

    Outputs go to ``<results_dir>/<results_group>/<results_id>/``.  The group defaults
    to the base name of the spec file and the id defaults to the current UTC datetime.
    That directory receives "proceed.log", "effective_options.yaml", and
    "execution_record.yaml".

    Args:
        spec: path of the YAML pipeline specification to run.
        config_options: the effective config options for this run.

    Returns:
        0 when no step had a nonzero exit code, otherwise the number of steps that
        had a nonzero exit code, or -1 when no spec was given.
    """
    if not spec:
        logging.error("You must provide a pipeline spec to the run operation.")
        return -1

    # Choose where to write outputs.
    out_path = Path(config_options.results_dir.value).expanduser()
    if config_options.results_group.value:
        group_path = Path(out_path, config_options.results_group.value)
    else:
        spec_path = Path(spec)
        group_path = Path(out_path, spec_path.stem)

    if config_options.results_id.value:
        execution_path = Path(group_path, config_options.results_id.value)
    else:
        execution_time = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%S%Z')
        execution_path = Path(group_path, execution_time)

    execution_path.mkdir(parents=True, exist_ok=True)

    # Log to the output path and to the console.
    log_path = Path(execution_path, "proceed.log")
    set_up_logging(log_path)

    logging.info(f"Using output directory: {execution_path.as_posix()}")

    # Record the effective options we're using for this run.
    effective_options_path = Path(execution_path, "effective_options.yaml")
    logging.info(f"Writing effective config options to: {effective_options_path.as_posix()}")
    effective_options_yaml = yaml.safe_dump(config_options.to_dict(), **config_options.yaml_options.value)
    with open(effective_options_path, "w") as f:
        f.write(effective_options_yaml)

    logging.info(f"Parsing pipeline specification from: {spec}")
    with open(spec) as f:
        pipeline = Pipeline.from_yaml(f.read())

    logging.info(f"Running pipeline with args: {config_options.args.value}")
    pipeline_result = run_pipeline(
        original=pipeline,
        execution_path=execution_path,
        args=config_options.args.value,
        force_rerun=config_options.force_rerun.value,
        step_names=config_options.step_names.value)

    record_path = Path(execution_path, "execution_record.yaml")
    logging.info(f"Writing execution record to: {record_path}")
    with open(record_path, "w") as record:
        record.write(pipeline_result.to_yaml(
            skip_empty=config_options.yaml_skip_empty.value,
            dump_args=config_options.yaml_options.value
        ))

    # An exit code of 0 or None counts as success.
    failed_results = [step_result for step_result in pipeline_result.step_results if step_result.exit_code]
    if failed_results:
        logging.error(f"{len(failed_results)} step(s) had nonzero exit codes:")
        # Report only the steps that failed, not every step, at error level.
        for step_result in failed_results:
            logging.error(f"{step_result.name} exit code: {step_result.exit_code}")
        return len(failed_results)

    logging.info(f"Completed {len(pipeline_result.step_results)} steps successfully.")
    return 0
def summarize(config_options: ConfigOptions) -> int:
    """Collect and organize results for "proceed summarize ...".

    Reads execution records from the results_dir option and writes a CSV summary
    to the summary_file option.

    Args:
        config_options: the effective config options, which supply results_dir,
            summary_file, summary_columns, and summary_sort_rows_by.

    Returns:
        0, always.
    """
    # Choose where to look for previous results.
    # Expand "~" here as run() does, so both operations resolve the same results_dir.
    results_path = Path(config_options.results_dir.value).expanduser()
    logging.info(f"Summarizing results from {results_path.as_posix()}")
    summary = summarize_results(results_path, columns=config_options.summary_columns.value,
                                sort_rows_by=config_options.summary_sort_rows_by.value)

    # Choose where to write the summary of results.
    out_file = Path(config_options.summary_file.value).expanduser()
    logging.info(f"Writing summary to {out_file.as_posix()}")
    summary.to_csv(out_file)
    return 0
+def main(argv: Optional[Sequence[str]] = None) -> int:
+    parser = ArgumentParser(description="Declarative file processing with YAML and containers.")
+    parser.add_argument("operation",
+                        type=str,
+                        choices=["run", "summarize"],
+                        help="operation to perform: run a pipeline or summarize results from multiple runs"),
+    parser.add_argument("spec",
+                        type=str,
+                        nargs="?",
+                        help="YAML file with pipeline specification to run")
+    parser.add_argument("--version", "-v", action="version", version=version_string)
+    default_config_options = ConfigOptions()
+    for option_name in default_config_options.option_names():
+        config_option = default_config_options.config_option(option_name)
+        parser.add_argument(
+            config_option.cli_long_name,
+            config_option.cli_short_name,
+            **config_option.cli_kwargs()
+        )
+    cli_args = parser.parse_args(argv)
+    set_up_logging()
+    preferred_options = vars(cli_args)
+    config_options = resolve_config_options(preferred_options)
+    match cli_args.operation:
+        case "run":
+            exit_code = run(cli_args.spec, config_options)
+        case "summarize":
+            exit_code = summarize(config_options)
+        case _:  # pragma: no cover
+            # We don't expect this to happen -- argparse should error before we get here.
+            logging.error(f"Unsupported operation: {cli_args.operation}")
+            exit_code = -2
+    if exit_code:
+        logging.error(f"Completed with errors.")
+    else:
+        logging.info(f"OK.")
+    return exit_code

proceed/config_options.py ADDED Viewed

@@ -0,0 +1,265 @@
+import logging
+import yaml
+from typing import Any
+from pathlib import Path
+from argparse import Action
+from dataclasses import dataclass, field, fields
def parse_key_value_pairs(values: list[str], delimiter: str = "=", convert_values: bool = False) -> dict[str, Any]:
    """Parse a list of strings like "key=value" into a dictionary.

    Each string is split at the first delimiter only, so values may themselves
    contain the delimiter, as in "url=http://host/?a=b".

    Args:
        values: the strings to parse.
        delimiter: the text that separates each key from its value.
        convert_values: whether to parse each value as YAML, for example to get
            a bool or a number, instead of keeping it as a string.

    Returns:
        A dictionary of keys and values.  When a key repeats, the last value wins.

    Raises:
        ValueError: when one of the strings doesn't contain the delimiter.
    """
    key_value_pairs = {}
    for kvp in values:
        (k, found, v) = kvp.partition(delimiter)
        if not found:
            raise ValueError(f"Expected key{delimiter}value but got: {kvp}")
        if convert_values:
            v = yaml.safe_load(v)
        key_value_pairs[k] = v
    return key_value_pairs
class KeyValuePairsAction(Action):
    """An argparse action that stores arguments like "key=value" as a dictionary of strings."""

    def __call__(self, parser, namespace, values, option_string=None):
        # Values are kept as strings, with no conversion.
        setattr(namespace, self.dest, parse_key_value_pairs(values, convert_values=False))
class ConvertingKeyValuePairsAction(Action):
    """An argparse action that stores arguments like "key=value" as a dictionary, parsing each value as YAML."""

    def __call__(self, parser, namespace, values, option_string=None):
        # Values are converted, so that "width=1000" gives an int and "sort_keys=False" gives a bool.
        setattr(namespace, self.dest, parse_key_value_pairs(values, convert_values=True))
+@dataclass
+class ConfigOption():
+    value: Any = None
+    cli_long_name: str = None
+    cli_short_name: str = None
+    cli_nargs: str = None
+    cli_type: type = str
+    cli_action: Any = None
+    cli_help: str = None
+    cli_help_default: str = "%(default)s"
+    def cli_help_with_default(self):
+        return f"{self.cli_help} (default: {self.cli_help_default})"
+    def cli_kwargs(self) -> dict[str, Any]:
+        kwargs = {
+            "default": self.value,
+            "action": self.cli_action,
+            "help": self.cli_help_with_default(),
+        }
+        # Annoying: actions like "store_true" blow up when unused args provided.
+        if self.cli_type:
+            kwargs["type"] = self.cli_type
+        if self.cli_nargs:
+            kwargs["nargs"] = self.cli_nargs
+        return kwargs
def _parse_bool(text: Any) -> bool:
    """Convert command line text like "true" or "False" to a bool, for use as an argparse type."""
    if isinstance(text, bool):
        return text
    normalized = str(text).strip().lower()
    if normalized in {"true", "yes", "on", "1"}:
        return True
    if normalized in {"false", "no", "off", "0"}:
        return False
    # Imported here because this module only imports Action from argparse.
    from argparse import ArgumentTypeError
    raise ArgumentTypeError(f"expected a boolean like true or false, but got: {text}")


@dataclass
class ConfigOptions():
    """The set of options that configure the "proceed" command line operations.

    Each field is a :class:`ConfigOption` that holds the current value of the option
    along with the metadata used to expose it as a command line argument.
    The options cover: where to look for options files, where to write results,
    which pipeline args and steps to run, how to summarize results, and how to write YAML.
    """

    user_options_file: ConfigOption = field(default_factory=lambda: ConfigOption(
        value="~/proceed_options.yaml",
        cli_long_name="--user-options-file",
        cli_short_name="-u",
        cli_help="a user-level options file to search for",
    ))

    local_options_file: ConfigOption = field(default_factory=lambda: ConfigOption(
        value="./proceed_options.yaml",
        cli_long_name="--local-options-file",
        cli_short_name="-l",
        cli_help="a local options file to search for",
    ))

    custom_options_file: ConfigOption = field(default_factory=lambda: ConfigOption(
        cli_long_name="--custom-options-file",
        cli_short_name="-o",
        cli_help="an arbitrary, custom options file to apply, for example: -o my_options.yaml",
    ))

    results_dir: ConfigOption = field(default_factory=lambda: ConfigOption(
        value="./proceed_out",
        cli_long_name="--results-dir",
        cli_short_name="-d",
        cli_help="working dir to receive logs and execution records",
    ))

    results_group: ConfigOption = field(default_factory=lambda: ConfigOption(
        cli_long_name="--results-group",
        cli_short_name="-g",
        cli_help="working subdir grouping outputs from the same spec",
        cli_help_default="base name of the given spec",
    ))

    results_id: ConfigOption = field(default_factory=lambda: ConfigOption(
        cli_long_name="--results-id",
        cli_short_name="-i",
        cli_help="working subdir with outputs from the current run",
        cli_help_default="UTC datetime",
    ))

    args: ConfigOption = field(default_factory=lambda: ConfigOption(
        value={},
        cli_long_name="--args",
        cli_short_name="-a",
        cli_nargs="+",
        cli_action=KeyValuePairsAction,
        cli_help="one or more arg=value assignments to apply to the pipeline, for example: --args foo=bar baz=quux",
        cli_help_default="no args",
    ))

    force_rerun: ConfigOption = field(default_factory=lambda: ConfigOption(
        value=False,
        cli_long_name="--force-rerun",
        cli_short_name="-F",
        cli_action="store_true",
        cli_type=None,
        cli_help="force steps to rerun, even if they have done files",
    ))

    step_names: ConfigOption = field(default_factory=lambda: ConfigOption(
        cli_long_name="--step-names",
        cli_short_name="-n",
        cli_nargs="+",
        cli_type=str,
        cli_help="explicit list of step names to run",
        cli_help_default="run all steps",
    ))

    summary_file: ConfigOption = field(default_factory=lambda: ConfigOption(
        value="./summary.csv",
        cli_long_name="--summary-file",
        cli_short_name="-f",
        cli_help="output file to receive summary of results from multiple runs",
    ))

    summary_sort_rows_by: ConfigOption = field(default_factory=lambda: ConfigOption(
        value=["step_start", "file_path"],
        cli_long_name="--summary-sort-rows-by",
        cli_short_name="-s",
        cli_nargs="+",
        cli_help="summary column names by which to sort summary rows",
        cli_help_default="-s step_start file_path",
    ))

    summary_columns: ConfigOption = field(default_factory=lambda: ConfigOption(
        cli_long_name="--summary-columns",
        cli_short_name="-c",
        cli_nargs="+",
        cli_help="column names to keep in the summary",
        cli_help_default="all columns",
    ))

    # Note: cli_type=bool would be wrong here, since bool("False") is True.
    yaml_skip_empty: ConfigOption = field(default_factory=lambda: ConfigOption(
        value=True,
        cli_long_name="--yaml-skip-empty",
        cli_short_name="-e",
        cli_type=_parse_bool,
        cli_help="whether to omit null and empty values from YAML outputs",
    ))

    yaml_options: ConfigOption = field(default_factory=lambda: ConfigOption(
        value={"sort_keys": False, "default_flow_style": None, "width": 1000},
        cli_long_name="--yaml-options",
        cli_short_name="-y",
        cli_nargs="+",
        cli_action=ConvertingKeyValuePairsAction,
        cli_help="one or more key=value assignments to pass as keyword args to PyYAML safe_dump()",
        cli_help_default="-y sort_keys=False default_flow_style=null width=1000",
    ))

    def option_names(self) -> list[str]:
        """Return a list of field names so we can iterate over the options."""
        # Check the field values rather than the declared field types, because declared
        # types become plain strings under "from __future__ import annotations".
        return [
            option_field.name
            for option_field in fields(self)
            if isinstance(getattr(self, option_field.name), ConfigOption)
        ]

    def config_option(self, option_name: str) -> ConfigOption:
        """Get the :class:`ConfigOption` with the given name -- which includes value and cli metadata."""
        return getattr(self, option_name)

    def get_value(self, option_name: str) -> Any:
        """Get the value of the option with the given name."""
        return self.config_option(option_name).value

    def set_value(self, option_name: str, value: Any):
        """Set the given value to the option with the given name."""
        self.config_option(option_name).value = value

    def update_values(self, values: dict[str, Any]):
        """Set any non-default option values from the given dictionary.

        Keys that don't name an option are ignored.  A given value that equals the
        built-in default for its option is ignored too, so that defaults filled in by
        argparse don't overwrite values that came from options files.  When the given
        value and the current value are both dictionaries, the given items are merged
        into the current dictionary instead of replacing it.
        """
        if not values:
            return

        default_config_options = ConfigOptions()
        for option_name in self.option_names():
            if option_name in values:
                value = values[option_name]
                self_value = self.get_value(option_name)
                default_value = default_config_options.get_value(option_name)
                if isinstance(self_value, dict) and isinstance(value, dict):
                    self_value.update(value)
                elif value != default_value:
                    self.set_value(option_name, value)

    def to_dict(self) -> dict[str, Any]:
        """Return a dictionary with the names and values of all options, omitting cli metadata."""
        return {option_name: self.get_value(option_name) for option_name in self.option_names()}
def resolve_config_options(preferred_options: dict[str, Any] | None = None) -> ConfigOptions:
    """Resolve the combined, effective config options from among several possible sources.

    Search for Proceed :class:`ConfigOptions` from several possible sources.
    Return a single, effective config options combining all the sources found, in the following order:

    #. general defaults from the :class:`ConfigOptions` source code (least preferred)
    #. user-level options file, by default: ``~/proceed_options.yaml``
    #. local options file, by default: ``./proceed_options.yaml``
    #. custom options file, as passed on the command line, for example ``proceed --custom-options-file=my_options.yaml ...``
    #. explicit options values, as passed on the command line (see ``proceed --help``) (most preferred)

    Args:
        preferred_options: option names and values that take precedence over all of
            the options files, for example as parsed from the command line.
            None means no preferred options.

    Returns:
        A new :class:`ConfigOptions` with the combined values.
    """
    # Use None as the default, rather than a mutable {} shared between calls.
    if preferred_options is None:
        preferred_options = {}

    config_options = ConfigOptions()

    # Apply the options files from least to most preferred.
    # Each file name is looked up just before use, after less preferred files were applied.
    for file_option_name in ("user_options_file", "local_options_file", "custom_options_file"):
        options_file = preferred_options.get(file_option_name, config_options.get_value(file_option_name))
        config_options.update_values(safe_load_config_options(options_file))

    config_options.update_values(preferred_options)
    return config_options
+def safe_load_config_options(options_file: str) -> dict[str, Any]:
+    if not options_file:
+        print("nothing")
+        return None
+    logging.info(f"Looking for config options in file: {options_file}")
+    options_path = Path(options_file).expanduser()
+    if not options_path.is_file() or not options_path.exists():
+        print(f"Skipping not a file or doesn't exist: {options_file}")
+        logging.info(f"Skipping not a file or doesn't exist: {options_file}")
+        return None
+    # Let read and parse errors bubble up / blow up the whole thing.
+    # Otherwise a pipeline might run with config that wasn't intended.
+    with open(options_path) as f:
+        options_yaml = f.read()
+    options = yaml.safe_load(options_yaml)
+    logging.info(f"Found config options in file: {options_file}")
+    return options