proceed 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
proceed/__about__.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "0.0.1"
proceed/__init__.py ADDED
File without changes
proceed/aggregator.py ADDED
@@ -0,0 +1,123 @@
1
+ import logging
2
+ from typing import Any
3
+ from pathlib import Path
4
+ from pandas import DataFrame
5
+ import yaml
6
+ from proceed.model import ExecutionRecord, Pipeline, Step, Timing, StepResult
7
+ from proceed.file_matching import flatten_matches, file_summary, hash_contents
8
+
9
def summarize_results(results_path: Path, columns: list[str] = None, sort_rows_by: list[str] = None) -> DataFrame:
    """Build one summary table from the execution records found under a results directory.

    Looks for files named ``execution_record.yaml`` two directory levels below
    ``results_path`` (``results_path/<group>/<id>/execution_record.yaml``).  Files that
    can't be read as execution records are skipped.

    Args:
        results_path: Directory to search.  It must exist.
        columns: Optional column names to keep, in this order.  Names that are not
            present in the summary are ignored.
        sort_rows_by: Optional column names to sort rows by.  Names that are not
            present in the summary are ignored.

    Returns:
        A DataFrame with one row per summarized file, possibly empty.
    """
    summary_rows = []
    for group_path in results_path.iterdir():
        if not group_path.is_dir():
            continue
        for id_path in group_path.iterdir():
            if not id_path.is_dir():
                continue
            for yaml_file in id_path.glob("execution_record.yaml"):
                execution_record = safe_read_execution_record(yaml_file)
                if execution_record:
                    # Extend in place rather than rebuilding the list for every record.
                    summary_rows.extend(summarize_execution(id_path.stem, group_path.stem, execution_record))

    summary = DataFrame(summary_rows)

    if columns:
        available = list(summary.columns)
        usable_columns = [column for column in columns if column in available]
        summary = summary.filter(items=usable_columns)

    if sort_rows_by:
        available = list(summary.columns)
        usable_columns = [column for column in sort_rows_by if column in available]
        # Sorting by an empty list of columns is an error on some pandas versions,
        # and there is nothing to sort by anyway.
        if usable_columns:
            summary = summary.sort_values(usable_columns)

    return summary
34
+
35
+
36
def safe_read_execution_record(yaml_file: Path) -> "ExecutionRecord | None":
    """Read an execution record from a YAML file, or return None if that fails.

    Any ordinary error while opening, reading, or parsing the file is logged and
    treated as "this is not an execution record".  Interpreter-level signals like
    KeyboardInterrupt and SystemExit are not swallowed.

    Args:
        yaml_file: Path of the file to read.

    Returns:
        The parsed ExecutionRecord, or None when the file could not be read or parsed.
    """
    try:
        with open(yaml_file) as f:
            return ExecutionRecord.from_yaml(f.read())
    except Exception:
        logging.error(f"Skipping file that seems not to be a Proceed execution record: {yaml_file}")
        return None
43
+
44
+
45
def summarize_execution(results_id: str, group: str, execution_record: ExecutionRecord) -> list[dict[str, str]]:
    """Flatten one execution record into summary rows.

    Each row combines the pipeline-level columns with the columns of one file row
    produced for one step.  Steps of the amended pipeline are paired with step
    results by position.

    Args:
        results_id: Identifier of the execution, copied into every row.
        group: Results group of the execution, copied into every row.
        execution_record: The record to summarize.

    Returns:
        A list of row dictionaries, one per file row of each step.
    """
    amended_pipeline = execution_record.amended
    pipeline_columns = summarize_pipeline(results_id, group, amended_pipeline, execution_record.timing)

    per_step_rows = [
        summarize_step_and_result(step, result)
        for step, result in zip(amended_pipeline.steps, execution_record.step_results)
    ]

    rows = []
    for step_rows in per_step_rows:
        for file_columns in step_rows:
            rows.append({**pipeline_columns, **file_columns})
    return rows
53
+
54
+
55
def summarize_pipeline(results_id: str, group: str, pipeline: Pipeline, timing: Timing) -> dict[str, str]:
    """Collect the pipeline-level columns shared by every row of one execution.

    Args:
        results_id: Identifier of the execution.
        group: Results group of the execution.
        pipeline: Pipeline whose version, description, and args are reported.
        timing: Start, finish, and duration of the whole pipeline run.

    Returns:
        A dictionary of fixed columns followed by one ``arg_<name>`` column per pipeline arg.
    """
    summary = dict(
        proceed_version=pipeline.version,
        results_id=results_id,
        results_group=group,
        pipeline_description=pipeline.description,
        pipeline_start=timing.start,
        pipeline_finish=timing.finish,
        pipeline_duration=timing.duration,
    )

    # Pipeline args come after the fixed columns, prefixed to keep them apart.
    for key, value in pipeline.args.items():
        summary[f"arg_{key}"] = value

    return summary
70
+
71
+
72
def summarize_step_and_result(step: Step, result: StepResult) -> list[dict[str, Any]]:
    """Flatten one step and its result into one row per associated file.

    Every row repeats the step columns, the result columns, and any custom columns
    read from the step's summary files, and adds the columns describing one file:
    the log file first, then done, in, out, and summary files.

    Args:
        step: The step that was run.
        result: The result recorded for that step.

    Returns:
        A list of row dictionaries; it always contains at least the log file row.
    """
    step_columns = {}
    for key, value in step.to_dict().items():
        step_columns[f"step_{key}"] = str(value)

    # These result attributes are reported through dedicated columns or file rows below.
    reported_separately = {"timing", "log_file", "files_done", "files_in", "files_out", "files_summary"}
    result_columns = {}
    for key, value in result.to_dict().items():
        if key in reported_separately:
            continue
        result_columns[f"step_{key}"] = str(value)

    step_timing = result.timing
    result_columns["step_start"] = step_timing.start
    result_columns["step_finish"] = step_timing.finish
    result_columns["step_duration"] = step_timing.duration

    if not result.log_file:
        # Keep a placeholder log row so that each step produces at least one row.
        log_row = file_summary(volume="", path="", digest="", file_role="log")
    else:
        log_path = Path(result.log_file)
        log_digest = hash_contents(log_path)
        log_row = file_summary(
            volume=log_path.parent.as_posix(),
            path=log_path.name,
            digest=log_digest,
            file_role="log",
        )

    done_rows = flatten_matches(result.files_done, file_role="done")
    in_rows = flatten_matches(result.files_in, file_role="in")
    out_rows = flatten_matches(result.files_out, file_role="out")
    summary_rows = flatten_matches(result.files_summary, file_role="summary")

    file_rows = [log_row] + done_rows + in_rows + out_rows + summary_rows

    # Later summary files win when they define the same custom column.
    custom_columns = {}
    for summary_row in summary_rows:
        custom_columns.update(collect_custom_columns(summary_row["file_volume"], summary_row["file_path"]))

    return [
        {**step_columns, **result_columns, **file_columns, **custom_columns}
        for file_columns in file_rows
    ]
103
+
104
+
105
def collect_custom_columns(file_volume: str, file_path: str) -> dict[str, str]:
    """Read custom summary columns from a summary file.

    If the file holds a non-empty YAML mapping, that mapping is returned as-is.
    Otherwise -- including when the content is not valid YAML at all -- the whole
    stripped file content is returned as a single column named after the file stem.

    Args:
        file_volume: Directory part of the file location.
        file_path: Path of the file within ``file_volume``.

    Returns:
        A dictionary of custom columns, empty when the path is not an existing file.
    """
    path = Path(file_volume, file_path)
    # is_file() is already False for paths that don't exist.
    if not path.is_file():
        return {}

    with open(path) as f:
        content = f.read()

    try:
        parsed = yaml.safe_load(content)
    except yaml.YAMLError:
        # YAMLError covers scanner errors as well as parser errors, so arbitrary
        # plain text falls back to the text column instead of aborting the summary.
        logging.info(f"Treating non-YAML file as plain text: {path.as_posix()}")
    else:
        if parsed and isinstance(parsed, dict):
            return parsed
        logging.info(f"Treating non-dictionary YAML as plain text: {path.as_posix()}")

    return {path.stem: content.strip()}
proceed/cli.py ADDED
@@ -0,0 +1,161 @@
1
+ import sys
2
+ import logging
3
+ import yaml
4
+ from pathlib import Path
5
+ from datetime import datetime, timezone
6
+ from argparse import ArgumentParser
7
+ from typing import Optional, Sequence
8
+ from proceed.model import Pipeline
9
+ from proceed.config_options import ConfigOptions, resolve_config_options
10
+ from proceed.docker_runner import run_pipeline
11
+ from proceed.aggregator import summarize_results
12
+ from proceed.__about__ import __version__ as proceed_version
13
+
14
+ version_string = f"Proceed {proceed_version}"
15
+
16
+
17
def set_up_logging(log_file: str = None):
    """Configure root logging to the console and, optionally, to a log file.

    This may be called more than once.  Each call replaces the previous root
    handlers; ``force=True`` makes ``basicConfig`` remove and close them, so an
    earlier log file handle is not left open.

    Args:
        log_file: Optional path of a file that should receive log messages in
            addition to stdout.
    """
    handlers = [logging.StreamHandler(sys.stdout)]
    if log_file:
        handlers.append(logging.FileHandler(log_file))
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=handlers,
        force=True,
    )
    logging.info(version_string)
30
+
31
+
32
def run(spec: str, config_options: ConfigOptions) -> int:
    """Run the pipeline described by ``spec``, for "proceed run spec ...".

    Outputs go to ``<results_dir>/<results_group>/<results_id>/``: the log file,
    the effective config options, and the execution record.

    Args:
        spec: Path of the YAML pipeline specification to run.
        config_options: Effective config options for this run.

    Returns:
        0 when all steps had zero exit codes, the number of steps with nonzero
        exit codes otherwise, or -1 when no spec was given.
    """
    if not spec:
        logging.error("You must provide a pipeline spec to the run operation.")
        return -1

    # Choose where to write outputs, falling back to the spec name and current UTC time.
    results_group = config_options.results_group.value
    if not results_group:
        results_group = Path(spec).stem

    results_id = config_options.results_id.value
    if not results_id:
        results_id = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%S%Z')

    results_dir = Path(config_options.results_dir.value).expanduser()
    execution_path = Path(results_dir, results_group, results_id)
    execution_path.mkdir(parents=True, exist_ok=True)

    # From here on, log to the output path as well as to the console.
    set_up_logging(Path(execution_path, "proceed.log"))
    logging.info(f"Using output directory: {execution_path.as_posix()}")

    # Record the effective options we're using for this run.
    effective_options_path = Path(execution_path, "effective_options.yaml")
    logging.info(f"Writing effective config options to: {effective_options_path.as_posix()}")
    effective_options_yaml = yaml.safe_dump(config_options.to_dict(), **config_options.yaml_options.value)
    with open(effective_options_path, "w") as f:
        f.write(effective_options_yaml)

    logging.info(f"Parsing pipeline specification from: {spec}")
    with open(spec) as f:
        spec_yaml = f.read()
        pipeline = Pipeline.from_yaml(spec_yaml)

    logging.info(f"Running pipeline with args: {config_options.args.value}")
    pipeline_result = run_pipeline(
        original=pipeline,
        execution_path=execution_path,
        args=config_options.args.value,
        force_rerun=config_options.force_rerun.value,
        step_names=config_options.step_names.value,
    )

    record_path = Path(execution_path, "execution_record.yaml")
    logging.info(f"Writing execution record to: {record_path}")
    with open(record_path, "w") as record:
        record_yaml = pipeline_result.to_yaml(
            skip_empty=config_options.yaml_skip_empty.value,
            dump_args=config_options.yaml_options.value,
        )
        record.write(record_yaml)

    step_results = pipeline_result.step_results
    error_count = len([step_result for step_result in step_results if step_result.exit_code])
    if not error_count:
        logging.info(f"Completed {len(pipeline_result.step_results)} steps successfully.")
        return 0

    # Report the exit code of every step, so failures can be seen in context.
    logging.error(f"{error_count} step(s) had nonzero exit codes:")
    for step_result in step_results:
        logging.error(f"{step_result.name} exit code: {step_result.exit_code}")
    return error_count
98
+
99
+
100
def summarize(config_options: ConfigOptions) -> int:
    """Collect and organize results, for "proceed summarize ...".

    Reads execution records from the configured results dir and writes the
    summary table as CSV to the configured summary file.

    Args:
        config_options: Effective config options; this reads ``results_dir``,
            ``summary_columns``, ``summary_sort_rows_by``, and ``summary_file``.

    Returns:
        Always 0.
    """
    # Choose where to look for previous results.
    # Expand "~" the same way the run operation does, so that both operations
    # agree on where a results dir like "~/proceed_out" actually is.
    results_path = Path(config_options.results_dir.value).expanduser()
    logging.info(f"Summarizing results from {results_path.as_posix()}")

    summary = summarize_results(
        results_path,
        columns=config_options.summary_columns.value,
        sort_rows_by=config_options.summary_sort_rows_by.value,
    )

    # Choose where to write the summary of results.
    out_file = Path(config_options.summary_file.value).expanduser()
    logging.info(f"Writing summary to {out_file.as_posix()}")
    summary.to_csv(out_file)

    return 0
116
+
117
+
118
def main(argv: Optional[Sequence[str]] = None) -> int:
    """Parse command line arguments and dispatch to the requested operation.

    Args:
        argv: Argument list to parse.  None is passed through to
            ``ArgumentParser.parse_args``.

    Returns:
        The exit code of the operation: 0 for success, nonzero for errors.
    """
    parser = ArgumentParser(description="Declarative file processing with YAML and containers.")
    parser.add_argument(
        "operation",
        type=str,
        choices=["run", "summarize"],
        help="operation to perform: run a pipeline or summarize results from multiple runs",
    )
    parser.add_argument(
        "spec",
        type=str,
        nargs="?",
        help="YAML file with pipeline specification to run",
    )
    parser.add_argument("--version", "-v", action="version", version=version_string)

    # Every config option is also a command line option, declared by its own cli metadata.
    defaults = ConfigOptions()
    for name in defaults.option_names():
        option = defaults.config_option(name)
        parser.add_argument(option.cli_long_name, option.cli_short_name, **option.cli_kwargs())

    cli_args = parser.parse_args(argv)

    set_up_logging()

    config_options = resolve_config_options(vars(cli_args))

    operation = cli_args.operation
    if operation == "run":
        exit_code = run(cli_args.spec, config_options)
    elif operation == "summarize":
        exit_code = summarize(config_options)
    else:  # pragma: no cover
        # We don't expect this to happen -- argparse should error before we get here.
        logging.error(f"Unsupported operation: {cli_args.operation}")
        exit_code = -2

    if exit_code:
        logging.error("Completed with errors.")
    else:
        logging.info("OK.")

    return exit_code
proceed/config_options.py ADDED
@@ -0,0 +1,265 @@
1
+ import logging
2
+ import yaml
3
+ from typing import Any
4
+ from pathlib import Path
5
+ from argparse import Action
6
+ from dataclasses import dataclass, field, fields
7
+
8
+
9
def parse_key_value_pairs(values: list[str], delimiter: str = "=", convert_values: bool = False):
    """Parse strings like ``key=value`` into a dictionary.

    Each string is split at the first occurrence of the delimiter only, so values
    may themselves contain the delimiter, as in ``url=http://host/?a=b``.

    Args:
        values: Strings to parse, each of the form ``key<delimiter>value``.
        delimiter: The text that separates a key from its value.
        convert_values: When True, parse each value as YAML with ``yaml.safe_load``
            instead of keeping it as a string.

    Returns:
        A dictionary of keys to values.  Later duplicates of a key replace earlier ones.

    Raises:
        ValueError: If one of the strings does not contain the delimiter.
    """
    key_value_pairs = {}
    for kvp in values:
        k, found_delimiter, v = kvp.partition(delimiter)
        if not found_delimiter:
            raise ValueError(f"Expected key{delimiter}value but got: {kvp}")
        if convert_values:
            v = yaml.safe_load(v)
        key_value_pairs[k] = v
    return key_value_pairs
17
+
18
+
19
class KeyValuePairsAction(Action):
    """Argparse action that stores ``key=value`` arguments as a dictionary of strings."""

    def __call__(self, parser, namespace, values, option_string=None):
        # Values are kept as plain strings, not converted.
        parsed_pairs = parse_key_value_pairs(values, convert_values=False)
        setattr(namespace, self.dest, parsed_pairs)
23
+
24
+
25
class ConvertingKeyValuePairsAction(Action):
    """Argparse action that stores ``key=value`` arguments as a dictionary of converted values."""

    def __call__(self, parser, namespace, values, option_string=None):
        # Values are converted from their string form by parse_key_value_pairs.
        parsed_pairs = parse_key_value_pairs(values, convert_values=True)
        setattr(namespace, self.dest, parsed_pairs)
29
+
30
+
31
@dataclass
class ConfigOption():
    """One config option: its current value plus the metadata for its command line argument."""

    # Current value of the option, also used as the command line default.
    value: Any = None
    # Long and short flags for the command line.
    cli_long_name: str = None
    cli_short_name: str = None
    # Passed to argparse as "nargs", "type", and "action"; "type" and "nargs" only when truthy.
    cli_nargs: str = None
    cli_type: type = str
    cli_action: Any = None
    # Help text, and how to describe the default within the help text.
    cli_help: str = None
    cli_help_default: str = "%(default)s"

    def cli_help_with_default(self):
        """Return the help text with a description of the default appended."""
        return "{} (default: {})".format(self.cli_help, self.cli_help_default)

    def cli_kwargs(self) -> dict[str, Any]:
        """Return keyword arguments describing this option for ``ArgumentParser.add_argument``."""
        kwargs = dict(
            default=self.value,
            action=self.cli_action,
            help=self.cli_help_with_default(),
        )

        # Annoying: actions like "store_true" blow up when unused args provided.
        # So only pass along "type" and "nargs" when they are actually set.
        for name, setting in (("type", self.cli_type), ("nargs", self.cli_nargs)):
            if setting:
                kwargs[name] = setting

        return kwargs
60
+
61
+
62
def _parse_bool(text: str) -> bool:
    """Convert command line text like "true" or "no" to a bool, raising ValueError for other text."""
    normalized = str(text).strip().lower()
    if normalized in ("true", "yes", "y", "on", "1"):
        return True
    if normalized in ("false", "no", "n", "off", "0"):
        return False
    raise ValueError(f"Expected a boolean like true or false but got: {text}")


@dataclass
class ConfigOptions():
    """The full set of Proceed config options, each with a value and command line metadata.

    Every field is a :class:`ConfigOption`.  Values can be read and written by option
    name, merged from dictionaries with :meth:`update_values`, and exported as a plain
    dictionary with :meth:`to_dict`.
    """

    user_options_file: ConfigOption = field(default_factory=lambda: ConfigOption(
        value="~/proceed_options.yaml",
        cli_long_name="--user-options-file",
        cli_short_name="-u",
        cli_help="a user-level options file to search for",
    ))

    local_options_file: ConfigOption = field(default_factory=lambda: ConfigOption(
        value="./proceed_options.yaml",
        cli_long_name="--local-options-file",
        cli_short_name="-l",
        cli_help="a local options file to search for",
    ))

    custom_options_file: ConfigOption = field(default_factory=lambda: ConfigOption(
        cli_long_name="--custom-options-file",
        cli_short_name="-o",
        cli_help="an arbitrary, custom options file to apply, for example: -o my_options.yaml",
    ))

    results_dir: ConfigOption = field(default_factory=lambda: ConfigOption(
        value="./proceed_out",
        cli_long_name="--results-dir",
        cli_short_name="-d",
        cli_help="working dir to receive logs and execution records",
    ))

    results_group: ConfigOption = field(default_factory=lambda: ConfigOption(
        cli_long_name="--results-group",
        cli_short_name="-g",
        cli_help="working subdir grouping outputs from the same spec",
        cli_help_default="base name of the given spec",
    ))

    results_id: ConfigOption = field(default_factory=lambda: ConfigOption(
        cli_long_name="--results-id",
        cli_short_name="-i",
        cli_help="working subdir with outputs from the current run",
        cli_help_default="UTC datetime",
    ))

    args: ConfigOption = field(default_factory=lambda: ConfigOption(
        value={},
        cli_long_name="--args",
        cli_short_name="-a",
        cli_nargs="+",
        cli_action=KeyValuePairsAction,
        cli_help="one or more arg=value assignments to apply to the pipeline, for example: --args foo=bar baz=quux",
        cli_help_default="no args",
    ))

    force_rerun: ConfigOption = field(default_factory=lambda: ConfigOption(
        value=False,
        cli_long_name="--force-rerun",
        cli_short_name="-F",
        cli_action="store_true",
        cli_type=None,
        cli_help="force steps to rerun, even if they have done files",
    ))

    step_names: ConfigOption = field(default_factory=lambda: ConfigOption(
        cli_long_name="--step-names",
        cli_short_name="-n",
        cli_nargs="+",
        cli_type=str,
        cli_help="explicit list of step names to run",
        cli_help_default="run all steps",
    ))

    summary_file: ConfigOption = field(default_factory=lambda: ConfigOption(
        value="./summary.csv",
        cli_long_name="--summary-file",
        cli_short_name="-f",
        cli_help="output file to receive summary of results from multiple runs",
    ))

    summary_sort_rows_by: ConfigOption = field(default_factory=lambda: ConfigOption(
        value=["step_start", "file_path"],
        cli_long_name="--summary-sort-rows-by",
        cli_short_name="-s",
        cli_nargs="+",
        cli_help="summary column names by which to sort summary rows",
        cli_help_default="-s step_start file_path",
    ))

    summary_columns: ConfigOption = field(default_factory=lambda: ConfigOption(
        cli_long_name="--summary-columns",
        cli_short_name="-c",
        cli_nargs="+",
        cli_help="column names to keep in the summary",
        cli_help_default="all columns",
    ))

    # Not cli_type=bool: bool("false") is True, so any non-empty text would mean True.
    yaml_skip_empty: ConfigOption = field(default_factory=lambda: ConfigOption(
        value=True,
        cli_long_name="--yaml-skip-empty",
        cli_short_name="-e",
        cli_type=_parse_bool,
        cli_help="whether to omit null and empty values from YAML outputs",
    ))

    yaml_options: ConfigOption = field(default_factory=lambda: ConfigOption(
        value={"sort_keys": False, "default_flow_style": None, "width": 1000},
        cli_long_name="--yaml-options",
        cli_short_name="-y",
        cli_nargs="+",
        cli_action=ConvertingKeyValuePairsAction,
        cli_help="one or more key=value assignments to pass as keyword args to PyYAML safe_dump()",
        cli_help_default="-y sort_keys=False default_flow_style=null width=1000",
    ))

    def option_names(self) -> list[str]:
        """Return a list of field names so we can iterate over the options."""
        return [option_field.name for option_field in fields(self) if option_field.type == ConfigOption]

    def config_option(self, option_name: str) -> ConfigOption:
        """Get the :class:`ConfigOption` with the given name -- which includes value and cli metadata."""
        return getattr(self, option_name)

    def get_value(self, option_name: str) -> Any:
        """Get the value of the option with the given name."""
        return self.config_option(option_name).value

    def set_value(self, option_name: str, value: Any):
        """Set the given value to the option with the given name."""
        self.config_option(option_name).value = value

    def update_values(self, values: dict[str, Any]):
        """Set any non-default option values from the given dictionary.

        Keys that are not option names are ignored.  When both the current value and
        the given value are dictionaries, the given entries are merged into the current
        value.  Otherwise the given value replaces the current value, but only if it
        differs from the built-in default for that option.
        """
        if not values:
            return

        default_config_options = ConfigOptions()
        for option_name in self.option_names():
            if option_name not in values:
                continue

            value = values[option_name]
            self_value = self.get_value(option_name)
            if isinstance(self_value, dict) and isinstance(value, dict):
                self_value.update(value)
            elif value != default_config_options.get_value(option_name):
                self.set_value(option_name, value)

    def to_dict(self) -> dict[str, Any]:
        """Return a dictionary with the names and values of all options, omitting cli metadata."""
        return {option_name: self.get_value(option_name) for option_name in self.option_names()}
212
+
213
+
214
def resolve_config_options(preferred_options: dict[str, Any] = None) -> ConfigOptions:
    """Resolve the combined, effective config options from among several possible sources.

    Search for Proceed :class:`ConfigOptions` from several possible sources.
    Return a single, effective config options combining all the sources found, in the following order:

     #. general defaults from the :class:`ConfigOptions` source code (least preferred)
     #. user-level options file, by default: ``~/proceed_options.yaml``
     #. local options file, by default: ``./proceed_options.yaml``
     #. custom options file, as passed on the command line, for example
        ``proceed --custom-options-file=my_options.yaml ...``
     #. explicit options values, as passed on the command line (see ``proceed --help``) (most preferred)

    Args:
        preferred_options: Option values by option name, most preferred of all sources.
            This may also name the options files to search.  None means no preferred options.

    Returns:
        A new :class:`ConfigOptions` with the combined values.
    """
    if preferred_options is None:
        preferred_options = {}

    config_options = ConfigOptions()

    # Apply the options files from least to most preferred.
    # Each file name is looked up just before use, since an earlier file may have changed it.
    for file_option_name in ("user_options_file", "local_options_file", "custom_options_file"):
        options_file = preferred_options.get(file_option_name, config_options.get_value(file_option_name))
        config_options.update_values(safe_load_config_options(options_file))

    config_options.update_values(preferred_options)

    return config_options
241
+
242
+
243
def safe_load_config_options(options_file: str) -> dict[str, Any]:
    """Load config option values from a YAML file, if the file exists.

    Args:
        options_file: Path of the options file, where a leading ``~`` means the
            user's home directory.  May be None or empty.

    Returns:
        The dictionary parsed from the file, or None when no file name was given,
        the file doesn't exist, or the file is empty.

    Raises:
        ValueError: If the file parses as YAML but is not a mapping of option names to values.
    """
    if not options_file:
        return None

    logging.info(f"Looking for config options in file: {options_file}")

    options_path = Path(options_file).expanduser()
    # is_file() is already False for paths that don't exist.
    if not options_path.is_file():
        logging.info(f"Skipping not a file or doesn't exist: {options_file}")
        return None

    # Let read and parse errors bubble up / blow up the whole thing.
    # Otherwise a pipeline might run with config that wasn't intended.

    with open(options_path) as f:
        options_yaml = f.read()

    options = yaml.safe_load(options_yaml)
    if options is not None and not isinstance(options, dict):
        # Same reasoning as above: fail here, clearly, rather than later and obscurely.
        raise ValueError(f"Expected a mapping of config options in file: {options_file}")

    logging.info(f"Found config options in file: {options_file}")

    return options