calkit-python 0.6.0__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. {calkit_python-0.6.0 → calkit_python-0.7.0}/PKG-INFO +27 -7
  2. {calkit_python-0.6.0 → calkit_python-0.7.0}/README.md +26 -6
  3. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/__init__.py +1 -1
  4. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/cli/main.py +9 -2
  5. calkit_python-0.7.0/calkit/core.py +202 -0
  6. calkit_python-0.7.0/calkit/magics.py +257 -0
  7. calkit_python-0.7.0/calkit/tests/test_magics.py +36 -0
  8. calkit_python-0.7.0/docs/tutorials/notebook-pipeline.md +158 -0
  9. calkit_python-0.7.0/test/pipeline.ipynb +93 -0
  10. calkit_python-0.6.0/calkit/core.py +0 -98
  11. {calkit_python-0.6.0 → calkit_python-0.7.0}/.github/FUNDING.yml +0 -0
  12. {calkit_python-0.6.0 → calkit_python-0.7.0}/.github/workflows/publish-test.yml +0 -0
  13. {calkit_python-0.6.0 → calkit_python-0.7.0}/.github/workflows/publish.yml +0 -0
  14. {calkit_python-0.6.0 → calkit_python-0.7.0}/.gitignore +0 -0
  15. {calkit_python-0.6.0 → calkit_python-0.7.0}/LICENSE +0 -0
  16. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/cli/__init__.py +0 -0
  17. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/cli/config.py +0 -0
  18. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/cli/core.py +0 -0
  19. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/cli/import_.py +0 -0
  20. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/cli/list.py +0 -0
  21. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/cli/new.py +0 -0
  22. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/cli/notebooks.py +0 -0
  23. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/cli/office.py +0 -0
  24. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/cloud.py +0 -0
  25. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/conda.py +0 -0
  26. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/config.py +0 -0
  27. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/data.py +0 -0
  28. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/docker.py +0 -0
  29. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/dvc.py +0 -0
  30. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/git.py +0 -0
  31. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/gui.py +0 -0
  32. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/jupyter.py +0 -0
  33. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/models.py +0 -0
  34. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/office.py +0 -0
  35. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/server.py +0 -0
  36. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/templates/__init__.py +0 -0
  37. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/templates/core.py +0 -0
  38. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/templates/latex/__init__.py +0 -0
  39. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/templates/latex/article/paper.tex +0 -0
  40. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/templates/latex/core.py +0 -0
  41. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/templates/latex/jfm/jfm.bst +0 -0
  42. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/templates/latex/jfm/jfm.cls +0 -0
  43. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/templates/latex/jfm/lineno-FLM.sty +0 -0
  44. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/templates/latex/jfm/paper.tex +0 -0
  45. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/templates/latex/jfm/upmath.sty +0 -0
  46. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/tests/__init__.py +0 -0
  47. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/tests/cli/__init__.py +0 -0
  48. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/tests/cli/test_list.py +0 -0
  49. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/tests/cli/test_main.py +0 -0
  50. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/tests/cli/test_new.py +0 -0
  51. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/tests/test_core.py +0 -0
  52. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/tests/test_dvc.py +0 -0
  53. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/tests/test_jupyter.py +0 -0
  54. {calkit_python-0.6.0 → calkit_python-0.7.0}/calkit/tests/test_templates.py +0 -0
  55. {calkit_python-0.6.0 → calkit_python-0.7.0}/docs/tutorials/adding-latex-pub-docker.md +0 -0
  56. {calkit_python-0.6.0 → calkit_python-0.7.0}/docs/tutorials/conda-envs.md +0 -0
  57. {calkit_python-0.6.0 → calkit_python-0.7.0}/docs/tutorials/img/run-proc.png +0 -0
  58. {calkit_python-0.6.0 → calkit_python-0.7.0}/docs/tutorials/procedures.md +0 -0
  59. {calkit_python-0.6.0 → calkit_python-0.7.0}/pyproject.toml +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: calkit-python
3
- Version: 0.6.0
3
+ Version: 0.7.0
4
4
  Summary: Reproducibility simplified.
5
5
  Project-URL: Homepage, https://github.com/calkit/calkit
6
6
  Project-URL: Issues, https://github.com/calkit/calkit/issues
@@ -31,20 +31,40 @@ Description-Content-Type: text/markdown
31
31
 
32
32
  # Calkit
33
33
 
34
- [Calkit](https://calkit.io) helps simplify reproducibility,
35
- acting as a layer on top of
36
- [Git](https://git-scm.com/), [DVC](https://dvc.org/),
37
- [Docker](https://docker.com),
38
- and adds a domain-specific data model
34
+ Calkit is a lightweight framework for doing reproducible research.
35
+ It acts as a top-level layer to integrate and simplify the use of enabling
36
+ technologies such as
37
+ [Git](https://git-scm.com/),
38
+ [DVC](https://dvc.org/),
39
+ [Conda](https://docs.conda.io/en/latest/),
40
+ and [Docker](https://docker.com).
41
+ Calkit also adds a domain-specific data model
39
42
  such that all aspects of the research process can be fully described in a
40
43
  single repository and therefore easily consumed by others.
41
44
 
45
+ Our goal is to make reproducibility easier so it becomes more common.
46
+ To do this, we try to make it easy for users to follow two simple rules:
47
+
48
+ 1. **Keep everything in version control.** This includes large files like
49
+ datasets, enabled by DVC. The [Calkit cloud](https://calkit.io)
50
+ serves as a simple default DVC remote storage location for those who do not
51
+ want to manage their own infrastructure.
52
+ 2. **Generate all important artifacts with a single pipeline.** There should be
53
+ no special instructions required to reproduce a project's artifacts.
54
+ It should be as simple as calling `calkit run`.
55
+ The DVC pipeline (in a project's `dvc.yaml` file) is therefore the main
56
+ thing to "build" throughout a research project.
57
+ Calkit provides helper functionality to build pipeline stages that
58
+ keep computational environments up-to-date and label their outputs for
59
+ convenient reuse.
60
+
42
61
  ## Tutorials
43
62
 
63
+ - [Jupyter notebook as a DVC pipeline](docs/tutorials/notebook-pipeline.md)
44
64
  - [Keeping track of conda environments](docs/tutorials/conda-envs.md)
45
65
  - [Defining and executing manual procedures](docs/tutorials/procedures.md)
46
66
  - [Adding a new LaTeX-based publication with its own Docker build environment](docs/tutorials/adding-latex-pub-docker.md)
47
- - [A reproducibly workflow using Microsoft Office (Word and Excel)](https://petebachant.me/office-repro/)
67
+ - [A reproducible workflow using Microsoft Office (Word and Excel)](https://petebachant.me/office-repro/)
48
68
  - [Reproducible OpenFOAM simulations](https://petebachant.me/reproducible-openfoam/)
49
69
 
50
70
  ## Why does reproducibility matter?
@@ -1,19 +1,39 @@
1
1
  # Calkit
2
2
 
3
- [Calkit](https://calkit.io) helps simplify reproducibility,
4
- acting as a layer on top of
5
- [Git](https://git-scm.com/), [DVC](https://dvc.org/),
6
- [Docker](https://docker.com),
7
- and adds a domain-specific data model
3
+ Calkit is a lightweight framework for doing reproducible research.
4
+ It acts as a top-level layer to integrate and simplify the use of enabling
5
+ technologies such as
6
+ [Git](https://git-scm.com/),
7
+ [DVC](https://dvc.org/),
8
+ [Conda](https://docs.conda.io/en/latest/),
9
+ and [Docker](https://docker.com).
10
+ Calkit also adds a domain-specific data model
8
11
  such that all aspects of the research process can be fully described in a
9
12
  single repository and therefore easily consumed by others.
10
13
 
14
+ Our goal is to make reproducibility easier so it becomes more common.
15
+ To do this, we try to make it easy for users to follow two simple rules:
16
+
17
+ 1. **Keep everything in version control.** This includes large files like
18
+ datasets, enabled by DVC. The [Calkit cloud](https://calkit.io)
19
+ serves as a simple default DVC remote storage location for those who do not
20
+ want to manage their own infrastructure.
21
+ 2. **Generate all important artifacts with a single pipeline.** There should be
22
+ no special instructions required to reproduce a project's artifacts.
23
+ It should be as simple as calling `calkit run`.
24
+ The DVC pipeline (in a project's `dvc.yaml` file) is therefore the main
25
+ thing to "build" throughout a research project.
26
+ Calkit provides helper functionality to build pipeline stages that
27
+ keep computational environments up-to-date and label their outputs for
28
+ convenient reuse.
29
+
11
30
  ## Tutorials
12
31
 
32
+ - [Jupyter notebook as a DVC pipeline](docs/tutorials/notebook-pipeline.md)
13
33
  - [Keeping track of conda environments](docs/tutorials/conda-envs.md)
14
34
  - [Defining and executing manual procedures](docs/tutorials/procedures.md)
15
35
  - [Adding a new LaTeX-based publication with its own Docker build environment](docs/tutorials/adding-latex-pub-docker.md)
16
- - [A reproducibly workflow using Microsoft Office (Word and Excel)](https://petebachant.me/office-repro/)
36
+ - [A reproducible workflow using Microsoft Office (Word and Excel)](https://petebachant.me/office-repro/)
17
37
  - [Reproducible OpenFOAM simulations](https://petebachant.me/reproducible-openfoam/)
18
38
 
19
39
  ## Why does reproducibility matter?
@@ -1,4 +1,4 @@
1
- __version__ = "0.6.0"
1
+ __version__ = "0.7.0"
2
2
 
3
3
  from .core import *
4
4
  from . import git
@@ -404,7 +404,12 @@ def run_dvc_repro(
404
404
  f"Stage {stage_name} does not have exactly one output"
405
405
  )
406
406
  cktype = ckmeta.get("type")
407
- if cktype not in ["figure", "dataset", "publication"]:
407
+ if cktype not in [
408
+ "figure",
409
+ "dataset",
410
+ "publication",
411
+ "notebook",
412
+ ]:
408
413
  raise_error(f"Invalid Calkit output type '{cktype}'")
409
414
  objects.append(
410
415
  dict(path=outs[0]) | ckmeta | dict(stage=stage_name)
@@ -553,7 +558,9 @@ def run_in_env(
553
558
  typer.echo(f"Running command: {docker_cmd}")
554
559
  subprocess.call(docker_cmd, cwd=wdir)
555
560
  elif env["kind"] == "conda":
556
- cmd = ["conda", "run", "-n", env_name] + cmd
561
+ with open(env["path"]) as f:
562
+ conda_env = calkit.ryaml.load(f)
563
+ cmd = ["conda", "run", "-n", conda_env["name"]] + cmd
557
564
  if verbose:
558
565
  typer.echo(f"Running command: {cmd}")
559
566
  subprocess.call(cmd, cwd=wdir)
@@ -0,0 +1,202 @@
1
+ """Core functionality."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import glob
6
+ import json
7
+ import logging
8
+ import os
9
+ import pickle
10
+ from datetime import UTC, datetime
11
+ from typing import Literal
12
+
13
+ import ruamel.yaml
14
+ from git import Repo
15
+ from git.exc import InvalidGitRepositoryError
16
+
17
+ logging.basicConfig(level=logging.INFO)
18
+ logger = logging.getLogger(__package__)
19
+
20
+ ryaml = ruamel.yaml.YAML()
21
+ ryaml.indent(mapping=2, sequence=4, offset=2)
22
+ ryaml.preserve_quotes = True
23
+ ryaml.width = 70
24
+
25
+
26
def find_project_dirs(relative=False, max_depth=3) -> list[str]:
    """Find all Calkit project directories.

    Searches up to ``max_depth`` directory levels below the home directory
    (or below the current directory when ``relative`` is true) for
    ``calkit.yaml`` files, including a ``*/GitHub/*`` pattern for GitHub
    Desktop users, and keeps only directories that are valid Git repos.
    """
    start = "" if relative else os.path.expanduser("~")
    candidates = []
    for depth in range(1, max_depth + 1):
        wildcards = ["*"] * depth
        candidates += glob.glob(
            os.path.join(start, *wildcards, "calkit.yaml")
        )
        # GitHub Desktop users keep repos under a 'GitHub' folder
        candidates += glob.glob(
            os.path.join(start, "*", "GitHub", *wildcards, "calkit.yaml")
        )
    project_dirs = []
    for ck_fpath in candidates:
        project_dir = os.path.dirname(ck_fpath)
        # Only keep directories that are actually Git repositories
        try:
            Repo(project_dir)
        except InvalidGitRepositoryError:
            continue
        project_dirs.append(project_dir)
    return project_dirs
51
+
52
+
53
def load_calkit_info(
    wdir=None, process_includes: bool | str | list[str] = False
) -> dict:
    """Load Calkit project information.

    Parameters
    ----------
    wdir : str
        Working directory. Defaults to current working directory.
    process_includes: bool, string or list of strings
        Whether or not to process any '_include' keys for a given kind of
        object. If a string is passed, only process includes for that kind.
        Similarly, if a list of strings is passed, only process those kinds.
        If True, process all default kinds.

    Returns
    -------
    dict
        The project metadata, or an empty dict if no (or an empty)
        ``calkit.yaml`` file exists.
    """
    info = {}
    fpath = "calkit.yaml"
    if wdir is not None:
        fpath = os.path.join(wdir, fpath)
    if os.path.isfile(fpath):
        with open(fpath) as f:
            info = ryaml.load(f)
        # An empty YAML file loads as None; normalize to a dict so callers
        # can rely on the declared return type and membership tests below
        if info is None:
            info = {}
    # Check for any includes, i.e., entities with an _include key, for which
    # we should merge in another file
    default_includes_enabled = ["environments", "procedures"]
    if process_includes:
        if isinstance(process_includes, bool):
            includes_enabled = default_includes_enabled
        elif isinstance(process_includes, str):
            includes_enabled = [process_includes]
        elif isinstance(process_includes, list):
            includes_enabled = process_includes
        else:
            # Previously an unexpected type caused an UnboundLocalError
            raise TypeError(
                "process_includes must be a bool, string, or list of strings"
            )
        for kind in includes_enabled:
            if kind in info:
                for obj_name, obj in info[kind].items():
                    if "_include" in obj:
                        include_fpath = obj.pop("_include")
                        with open(include_fpath) as f:
                            include_data = ryaml.load(f)
                        info[kind][obj_name] |= include_data
    return info
94
+
95
+
96
+ def utcnow(remove_tz=True) -> datetime:
97
+ """Return now in UTC, optionally stripping timezone information."""
98
+ dt = datetime.now(UTC)
99
+ if remove_tz:
100
+ dt = dt.replace(tzinfo=None)
101
+ return dt
102
+
103
+
104
+ NOTEBOOK_STAGE_OUT_FORMATS = ["pickle", "parquet", "json", "yaml", "csv"]
105
+
106
+
107
def get_notebook_stage_dir(stage_name: str) -> str:
    """Return the directory holding generated assets for a notebook stage."""
    # Forward slashes are used deliberately so DVC paths stay portable
    return "/".join([".calkit", "notebook-stages", stage_name])
109
+
110
+
111
def get_notebook_stage_script_path(stage_name: str) -> str:
    """Return the path of the auto-generated script for a notebook stage."""
    stage_dir = get_notebook_stage_dir(stage_name)
    return os.path.join(stage_dir, "script.py")
113
+
114
+
115
def get_notebook_stage_out_dir(stage_name: str) -> str:
    """Return the directory where a notebook stage's outputs are stored."""
    stage_dir = get_notebook_stage_dir(stage_name)
    return os.path.join(stage_dir, "outs")
117
+
118
+
119
def get_notebook_stage_out_path(
    stage_name: str,
    out_name: str,
    fmt: Literal["pickle", "parquet", "json", "yaml", "csv"] = "pickle",
) -> str:
    """Return the file path where a stage output variable is stored.

    Raises ``ValueError`` if ``fmt`` is not a supported output format.
    """
    if fmt not in NOTEBOOK_STAGE_OUT_FORMATS:
        raise ValueError(f"Invalid output format '{fmt}'")
    filename = f"{out_name}.{fmt}"
    return os.path.join(get_notebook_stage_out_dir(stage_name), filename)
129
+
130
+
131
def load_notebook_stage_out(
    stage_name: str,
    out_name: str,
    fmt: Literal["pickle", "parquet", "json", "yaml", "csv"] = "pickle",
    engine: Literal["pandas", "polars"] | None = None,
):
    """Load a saved notebook stage output variable from disk.

    ``engine`` is only valid for the tabular formats (CSV and Parquet);
    raises ``ValueError`` on incompatible format/engine combinations.
    """
    fpath = get_notebook_stage_out_path(stage_name, out_name, fmt=fmt)
    if engine is not None and fmt in ("pickle", "json", "yaml"):
        raise ValueError(
            f"Engine '{engine}' not compatible with format '{fmt}'"
        )
    # Engine-less formats first
    if fmt == "pickle":
        with open(fpath, "rb") as f:
            return pickle.load(f)
    if fmt == "yaml":
        with open(fpath) as f:
            return ryaml.load(f)
    if fmt == "json":
        with open(fpath) as f:
            return json.load(f)
    # Only CSV/Parquet remain (fmt was validated above); dispatch on engine
    if engine == "pandas":
        import pandas as pd

        reader = pd.read_csv if fmt == "csv" else pd.read_parquet
        return reader(fpath)
    if engine == "polars":
        import polars as pl

        reader = pl.read_csv if fmt == "csv" else pl.read_parquet
        return reader(fpath)
    raise ValueError(f"Unsupported format '{fmt}' for engine '{engine}'")
168
+
169
+
170
def save_notebook_stage_out(
    obj,
    stage_name: str,
    out_name: str,
    fmt: Literal["pickle", "parquet", "json", "yaml", "csv"] = "pickle",
    engine: Literal["pandas", "polars"] | None = None,
):
    """Serialize a notebook stage output variable to disk.

    ``engine`` is only valid for the tabular formats (CSV and Parquet);
    raises ``ValueError`` on incompatible format/engine combinations.
    """
    fpath = get_notebook_stage_out_path(stage_name, out_name, fmt=fmt)
    os.makedirs(os.path.dirname(fpath), exist_ok=True)
    if engine is not None and fmt in ("pickle", "json", "yaml"):
        raise ValueError(
            f"Engine '{engine}' not compatible with format '{fmt}'"
        )
    if fmt == "pickle":
        with open(fpath, "wb") as f:
            pickle.dump(obj, f)
    elif fmt == "json":
        with open(fpath, "w") as f:
            json.dump(obj, f)
    elif fmt == "yaml":
        with open(fpath, "w") as f:
            ryaml.dump(obj, f)
    elif engine == "pandas":
        # NOTE(review): pandas to_csv writes the index by default, so a CSV
        # round-trip via load_notebook_stage_out gains an extra index
        # column — confirm this is intended
        if fmt == "csv":
            obj.to_csv(fpath)
        else:
            obj.to_parquet(fpath)
    elif engine == "polars":
        if fmt == "csv":
            obj.write_csv(fpath)
        else:
            obj.write_parquet(fpath)
    else:
        raise ValueError(f"Unsupported format '{fmt}' for engine '{engine}'")
@@ -0,0 +1,257 @@
1
+ """IPython magics."""
2
+
3
+ import ast
4
+ import os
5
+ import subprocess
6
+
7
+ from IPython.core import magic_arguments
8
+ from IPython.core.magic import Magics, cell_magic, magics_class
9
+
10
+ import calkit
11
+
12
+
13
@magics_class
class Calkit(Magics):
    """Calkit cell magics for building DVC pipeline stages from notebooks."""

    @magic_arguments.magic_arguments()
    @magic_arguments.argument(
        "-n", "--name", help="Stage name.", required=True
    )
    @magic_arguments.argument(
        "--dep",
        "-d",
        help=(
            "Declare another stage's output variable as a dependency. "
            "Should be in the format '{stage_name}:{var_name}'. "
            "Optionally, the output format and engine, if applicable, can be "
            "appended like 'my-stage:some_dict:yaml' or "
            "'my-stage:df:parquet:pandas'."
        ),
        nargs="+",
    )
    @magic_arguments.argument(
        "--out",
        "-o",
        help=(
            "Declare a variable as an output. "
            "Optionally, the output format can be specified like "
            "'my_dict:json' or both the output format and engine can be "
            "specified like 'df:parquet:polars'."
        ),
        nargs="+",
    )
    @magic_arguments.argument(
        "--dep-path",
        "-D",
        help=(
            "Declare a path as a dependency, so that if that path changes, "
            "the stage will be rerun."
        ),
        nargs="+",
    )
    @magic_arguments.argument(
        "--out-path",
        "-O",
        help=(
            "Declare an output path written to by this cell, e.g., "
            "if a figure is saved to a file."
        ),
        nargs="+",
    )
    @magic_arguments.argument(
        "--out-type",
        "-t",
        choices=["figure", "dataset"],
        help=(
            "Declare the output as a type of Calkit object. If --out-path "
            "is specified, that will be used as the object path, else its "
            "path will be set as the output variable path. "
            "Note that there must only be one output to use this option."
        ),
    )
    @magic_arguments.argument(
        "--out-title", help="Title for Calkit output object."
    )
    @magic_arguments.argument(
        "--out-desc", help="Description for Calkit output object."
    )
    @cell_magic
    def stage(self, line, cell):
        """Turn a notebook cell into a DVC pipeline stage.

        Note that all dependencies must be declared since the cell will be
        first turned into a script and then run as part of the DVC pipeline.

        Then variables will be loaded back into the user namespace state by
        loading the DVC output.
        """
        args = magic_arguments.parse_argstring(self.stage, line)
        # If an output object type is specified, make sure we only have one
        # output
        if args.out_type:
            all_outs = []
            if args.out:
                all_outs += args.out
            if args.out_path:
                # --out-path takes precedence as the object path
                all_outs = args.out_path
            if len(all_outs) != 1:
                raise ValueError(
                    "Only one output can be defined if declaring as a "
                    "Calkit object"
                )
            # Parse calkit object parameters
            out_params = {}
            if args.out_title:
                out_params["title"] = ast.literal_eval(args.out_title)
            if args.out_desc:
                out_params["description"] = ast.literal_eval(args.out_desc)
            # Ensure we have required keys
            # TODO: Use Pydantic here
            if "title" not in out_params:
                raise ValueError(
                    f"Calkit type {args.out_type} requires a title"
                )
            # Parse output path
            if args.out_path:
                out_params["path"] = args.out_path[0]
            elif args.out:
                out = args.out[0]
                out_split = out.split(":")
                kws = dict(stage_name=args.name, out_name=out_split[0])
                if len(out_split) > 1:
                    kws["fmt"] = out_split[1]
                out_path = calkit.get_notebook_stage_out_path(**kws)
                out_params["path"] = out_path
            out_params["stage"] = args.name
            # Save in calkit.yaml, replacing any existing object at this path
            ck_info = calkit.load_calkit_info()
            objs = ck_info.get(args.out_type + "s", [])
            objs = [obj for obj in objs if obj["path"] != out_params["path"]]
            objs.append(out_params)
            ck_info[args.out_type + "s"] = objs
            with open("calkit.yaml", "w") as f:
                calkit.ryaml.dump(ck_info, f)
        # First, let's write this cell out to a script, ensuring that we
        # load the important state at the top
        script_txt = "# This script was automatically generated by Calkit\n\n"
        script_txt += "import calkit\n\n"
        if args.dep:
            for d in args.dep:
                dep_split = d.split(":")
                stage = dep_split[0]
                varname = dep_split[1]
                fmt_string = ""
                eng_string = ""
                if len(dep_split) >= 3:
                    fmt_string = f", fmt='{dep_split[2]}'"
                if len(dep_split) == 4:
                    eng_string = f", engine='{dep_split[3]}'"
                script_txt += (
                    f"{varname} = calkit.load_notebook_stage_out("
                    f"stage_name='{stage}', out_name='{varname}'"
                    f"{fmt_string}{eng_string})\n\n"
                )
        script_txt += cell
        # Add lines that save our output variables to files
        if args.out:
            for out in args.out:
                fmt_string = ""
                eng_string = ""
                out_split = out.split(":")
                outvar = out_split[0]
                if len(out_split) > 1:
                    fmt_string = f", fmt='{out_split[1]}'"
                if len(out_split) == 3:
                    eng_string = f", engine='{out_split[2]}'"
                script_txt += (
                    f"calkit.save_notebook_stage_out("
                    f"{outvar}, stage_name='{args.name}', out_name='{outvar}'"
                    f"{fmt_string}{eng_string})\n"
                )
        # Save the script to a Python file
        script_fpath = calkit.get_notebook_stage_script_path(args.name)
        script_dir = os.path.dirname(script_fpath)
        os.makedirs(script_dir, exist_ok=True)
        outs_dir = calkit.get_notebook_stage_out_dir(stage_name=args.name)
        os.makedirs(outs_dir, exist_ok=True)
        with open(script_fpath, "w") as f:
            f.write(script_txt)
        # Create a DVC stage that runs the script, defining the appropriate
        # dependencies and outputs, and run it
        cmd = [
            "dvc",
            "stage",
            "add",
            "-q",
            "-n",
            args.name,
            "--run",
            "--force",
            "-d",
            script_fpath,
        ]
        if args.dep:
            for dep in args.dep:
                dep_split = dep.split(":")
                stage = dep_split[0]
                varname = dep_split[1]
                kws = dict(stage_name=stage, out_name=varname)
                if len(dep_split) > 2:
                    kws["fmt"] = dep_split[2]
                cmd += [
                    "-d",
                    calkit.get_notebook_stage_out_path(**kws),
                ]
        if args.dep_path:
            for dep in args.dep_path:
                # Pass the path as-is: subprocess uses shell=False, so
                # wrapping in literal quotes would make the quote characters
                # part of the dependency path DVC records
                cmd += ["-d", dep]
        if args.out:
            for out in args.out:
                out_split = out.split(":")
                out_name = out_split[0]
                kws = dict(stage_name=args.name, out_name=out_name)
                if len(out_split) > 1:
                    kws["fmt"] = out_split[1]
                cmd += [
                    "-o",
                    calkit.get_notebook_stage_out_path(**kws),
                ]
        if args.out_path:
            for path in args.out_path:
                cmd += ["-o", path]
        # Quotes are correct here: this string is the stage command stored
        # in dvc.yaml, which is later interpreted by a shell
        cmd.append(f"python '{script_fpath}'")
        try:
            subprocess.run(cmd, check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as e:
            print(f"Error: {e.stderr}")
            raise
        # Now let's read in and inject the outputs back into the IPython state
        if args.out:
            for out in args.out:
                out_split = out.split(":")
                out_name = out_split[0]
                kws = dict(stage_name=args.name, out_name=out_name)
                if len(out_split) > 1:
                    kws["fmt"] = out_split[1]
                if len(out_split) > 2:
                    kws["engine"] = out_split[2]
                self.shell.user_ns[out_name] = calkit.load_notebook_stage_out(
                    **kws
                )
        # If the last line of the cell has no equals signs, run that command,
        # since it's probably meant for display
        last_line = cell.strip().split("\n")[-1]
        if "=" not in last_line:
            self.shell.run_cell(last_line)
246
+
247
+
248
def load_ipython_extension(ipython):
    """Register the Calkit magics with an IPython shell.

    Any module that defines a function named `load_ipython_extension` can be
    loaded via `%load_ext module.path` or configured to be autoloaded by
    IPython at startup time.

    See https://ipython.readthedocs.io/en/stable/config/custommagics.html
    """
    # Registering the class itself (not an instance) lets IPython call the
    # default constructor on it
    ipython.register_magics(Calkit)
@@ -0,0 +1,36 @@
1
+ """Tests for ``calkit.magics``."""
2
+
3
+ import os
4
+ import shutil
5
+ import subprocess
6
+
7
+ import calkit
8
+
9
+
10
def test_stage(tmp_dir):
    """Execute the example notebook and verify the generated pipeline."""
    # Initialize fresh Git and DVC repos in the temp dir
    subprocess.check_call(["git", "init"])
    subprocess.check_call(["dvc", "init"])
    # Copy in the test notebook and execute it end-to-end
    src = os.path.join(
        os.path.dirname(__file__), "..", "..", "test", "pipeline.ipynb"
    )
    shutil.copy(src, "notebook.ipynb")
    subprocess.check_call(
        ["jupyter", "nbconvert", "--execute", "notebook.ipynb", "--to", "html"]
    )
    # The generated dvc.yaml should declare the auto-written script as a dep
    with open("dvc.yaml") as f:
        pipeline = calkit.ryaml.load(f)
    assert (
        ".calkit/notebook-stages/get-data/script.py"
        in pipeline["stages"]["get-data"]["deps"]
    )
    # The figure declared via %%stage should be recorded in calkit.yaml
    fig = calkit.load_calkit_info()["figures"][0]
    assert fig["path"] == "figures/plot.png"
    assert fig["title"] == "A plot of the data"
    assert fig["description"] == "This is a plot of the data."
    assert fig["stage"] == "plot-fig"
@@ -0,0 +1,158 @@
1
+ # Using a Jupyter Notebook as a reproducible pipeline
2
+
3
+ Jupyter Notebooks are great tools for exploration,
4
+ but they can cause real headaches when it comes to managing state,
5
+ since they can be executed out-of-order.
6
+ This can lead to bad practices like only running certain cells
7
+ since others are too expensive or failing.
8
+ This means it's very possible for a result from a notebook to be
9
+ non-reproducible.
10
+
11
+ Here we're going to show how to use Calkit to turn a Jupyter Notebook
12
+ into a DVC pipeline,
13
+ as well as label our artifacts.
14
+
15
+ The natural process would be something like:
16
+
17
+ 1. Prototype a cell by running whatever commands make sense.
18
+ 2. Convert cells that are working and valuable into pipeline
19
+ stages, and delete anything else.
20
+
21
+ We should also be using [`nbstripout`](https://github.com/kynan/nbstripout)
22
+ to strip notebook outputs before we commit to the repo,
23
+ since the important ones will be produced as part of the pipeline
24
+ and cached with DVC.
25
+
26
+ At the end of this process we should be left with a notebook that runs
27
+ very quickly after it's been run once,
28
+ and all of our important outputs will be cached and pushed to the cloud,
29
+ but kept out of our Git repo.
30
+
31
+ Alright, so let's show how to convert a notebook into a reproducible
32
+ DVC pipeline without leaving the notebook interface.
33
+
34
+ First, let's write a cell to fetch a dataset,
35
+ and let's assume this is expensive,
36
+ maybe because we had to fetch it from a database.
37
+ To simulate that expense we'll use a call to `time.sleep`.
38
+
39
+ ```python
40
+ import pandas as pd
41
+ import time
42
+
43
+ time.sleep(10)
44
+
45
+ df = pd.DataFrame({"col1": range(1000)})
46
+ df.describe()
47
+ ```
48
+
49
+ In order to convert this cell into a pipeline stage,
50
+ we'll need to load the Calkit magics in our notebook.
51
+ This only needs to be run once, so it can be at the very top:
52
+
53
+ ```python
54
+ %load_ext calkit.magics
55
+ ```
56
+
57
+ Next we simply call the `%%stage` magic with the appropriate arguments to
58
+ convert the cell into a pipeline stage and run it externally with DVC:
59
+
60
+ ```python
61
+ %%stage --name get-data --out df
62
+
63
+ import pandas as pd
64
+ import time
65
+
66
+ time.sleep(10)
67
+
68
+ df = pd.DataFrame({"col1": range(1000)})
69
+ df.describe()
70
+ ```
71
+
72
+ In the magic call, we gave the stage a name and declared an output `df`.
73
+ When we run the cell, we'll see it takes at least 10 seconds the first time,
74
+ but if we run it a second time,
75
+ it will be much faster, since our output is being fetched from the DVC cache.
76
+ If we run `calkit status`, we can see we have some new data to commit and
77
+ push to the DVC remote.
78
+ If we do that, anyone else who clones this project will be able to
79
+ pull in the cache, and the cell will run quickly for them.
80
+
81
+ ## Saving outputs in different formats
82
+
83
+ By default, our output variables will be pickled,
84
+ which is not the most portable format.
85
+ Let's instead save our DataFrame to Parquet format.
86
+ To do this, all we need to do is adjust the `--out` value to add the format
87
+ and DataFrame library
88
+ (Calkit currently supports both Pandas and Polars DataFrames).
89
+ So change the call to the magic to be:
90
+
91
+ ```python
92
+ %%stage --name get-data --out df:parquet:pandas
93
+ ```
94
+
95
+ ## Using the output of one cell as a dependency in another
96
+
97
+ Let's imagine that now we want to create a visualization of our data.
98
+ Just like if we were creating a typical DVC stage in a `dvc.yaml` file,
99
+ we can declare a cell to depend on the output of another cell with the
100
+ `--dep` option.
101
+ For example:
102
+
103
+ ```python
104
+ %%stage --name plot --dep get-data:df:parquet:pandas --out fig
105
+
106
+ fig = df.plot(backend="plotly")
107
+ fig
108
+ ```
109
+
110
+ In this case, we need to specify what DataFrame library to use to read in
111
+ this dependency.
112
+ Here we tell Calkit that it's a Parquet file to be read with Pandas.
113
+ Calkit will ensure this dependency is loaded into memory before running the
114
+ cell as part of the pipeline.
115
+
116
+ ## Declaring an output as a figure saved to a different path
117
+
118
+ In the cell above we end up pickling `fig` into the DVC cache,
119
+ which is fine if we only ever want to view the figure through the notebook
120
+ interface,
121
+ but what if we want to declare this as a figure and, e.g.,
122
+ use it in a publication?
123
+ We can add a line that saves the figure and declare an additional output path
124
+ and metadata like (note this requires `plotly` and `kaleido` to be installed):
125
+
126
+ ```python
127
+ %%stage --name plot --dep get-data:df:parquet:pandas --out fig --out-path figures/plot.png --out-type figure --out-title "A plot of the data" --out-desc "This is a plot of the data."
128
+
129
+ import os
130
+
131
+ os.makedirs("figures", exist_ok=True)
132
+
133
+ fig = df.plot(backend="plotly")
134
+ fig.write_image("figures/plot.png")
135
+ fig
136
+ ```
137
+
138
+ If we call `calkit list figures`, we'll see our figure,
139
+ and after pushing to the cloud, we'll be able to see it there as well.
140
+
141
+ Note that we could also go back and add `--out-type=dataset` to the
142
+ `get-data` cell,
143
+ which will similarly add that dataset to our project metadata
144
+ for searchability and reuse.
145
+
146
+ ## Running the pipeline outside the notebook
147
+
148
+ One cool feature about building the pipeline this way is that it actually
149
+ creates runnable stages in `dvc.yaml`,
150
+ so `calkit run` or `dvc repro` will run all the same operations that
151
+ executing the notebook would.
152
+
153
+ ## Further exploration
154
+
155
+ If you'd like to try this out or explore further,
156
+ you can view this project up on
157
+ [GitHub](https://github.com/calkit/example-notebook-pipeline)
158
+ or the [Calkit cloud](https://calkit.io/calkit/example-notebook-pipeline).
@@ -0,0 +1,93 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Notebook as a pipeline test"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": null,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "%load_ext calkit.magics"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "metadata": {},
23
+ "outputs": [],
24
+ "source": [
25
+ "%%stage --name get-data-pickle --out df\n",
26
+ "\n",
27
+ "import pandas as pd\n",
28
+ "\n",
29
+ "df = pd.DataFrame({\"col1\": range(1000)})\n",
30
+ "df.describe()"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": null,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "%%stage --name get-data --out df:parquet:pandas\n",
40
+ "\n",
41
+ "import pandas as pd\n",
42
+ "import time\n",
43
+ "\n",
44
+ "time.sleep(10)\n",
45
+ "\n",
46
+ "df = pd.DataFrame({\"col1\": range(1000)})\n",
47
+ "df.describe()"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": null,
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "%%stage --name plot --dep get-data:df:parquet:pandas --out fig\n",
57
+ "\n",
58
+ "fig = df.plot(backend=\"plotly\")\n",
59
+ "fig"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": null,
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "%%stage --name plot-fig --dep get-data:df:parquet:pandas --out fig --out-path figures/plot.png --out-type figure --out-title \"A plot of the data\" --out-desc \"This is a plot of the data.\"\n",
69
+ "\n",
70
+ "import os\n",
71
+ "\n",
72
+ "os.makedirs(\"figures\", exist_ok=True)\n",
73
+ "\n",
74
+ "fig = df.plot(backend=\"plotly\")\n",
75
+ "fig.write_image(\"figures/plot.png\")\n",
76
+ "fig"
77
+ ]
78
+ }
79
+ ],
80
+ "metadata": {
81
+ "kernelspec": {
82
+ "display_name": "base",
83
+ "language": "python",
84
+ "name": "python3"
85
+ },
86
+ "language_info": {
87
+ "name": "python",
88
+ "version": "3.12.4"
89
+ }
90
+ },
91
+ "nbformat": 4,
92
+ "nbformat_minor": 2
93
+ }
@@ -1,98 +0,0 @@
1
- """Core functionality."""
2
-
3
- from __future__ import annotations
4
-
5
- import glob
6
- import logging
7
- import os
8
- from datetime import UTC, datetime
9
-
10
- import ruamel.yaml
11
- from git import Repo
12
- from git.exc import InvalidGitRepositoryError
13
-
14
- logging.basicConfig(level=logging.INFO)
15
- logger = logging.getLogger(__package__)
16
-
17
- ryaml = ruamel.yaml.YAML()
18
- ryaml.indent(mapping=2, sequence=4, offset=2)
19
- ryaml.preserve_quotes = True
20
- ryaml.width = 70
21
-
22
-
23
- def find_project_dirs(relative=False, max_depth=3) -> list[str]:
24
- """Find all Calkit project directories."""
25
- if relative:
26
- start = ""
27
- else:
28
- start = os.path.expanduser("~")
29
- res = []
30
- for i in range(max_depth):
31
- pattern = os.path.join(start, *["*"] * (i + 1), "calkit.yaml")
32
- res += glob.glob(pattern)
33
- # Check GitHub documents for users who use GitHub Desktop
34
- pattern = os.path.join(
35
- start, "*", "GitHub", *["*"] * (i + 1), "calkit.yaml"
36
- )
37
- res += glob.glob(pattern)
38
- final_res = []
39
- for ck_fpath in res:
40
- path = os.path.dirname(ck_fpath)
41
- # Make sure this path is a Git repo
42
- try:
43
- Repo(path)
44
- except InvalidGitRepositoryError:
45
- continue
46
- final_res.append(path)
47
- return final_res
48
-
49
-
50
- def load_calkit_info(
51
- wdir=None, process_includes: bool | str | list[str] = False
52
- ) -> dict:
53
- """Load Calkit project information.
54
-
55
- Parameters
56
- ----------
57
- wdir : str
58
- Working directory. Defaults to current working directory.
59
- process_includes: bool, string or list of strings
60
- Whether or not to process any '_include' keys for a given kind of
61
- object. If a string is passed, only process includes for that kind.
62
- Similarly, if a list of strings is passed, only process those kinds.
63
- If True, process all default kinds.
64
- """
65
- info = {}
66
- fpath = "calkit.yaml"
67
- if wdir is not None:
68
- fpath = os.path.join(wdir, fpath)
69
- if os.path.isfile(fpath):
70
- with open(fpath) as f:
71
- info = ryaml.load(f)
72
- # Check for any includes, i.e., entities with an _include key, for which
73
- # we should merge in another file
74
- default_includes_enabled = ["environments", "procedures"]
75
- if process_includes:
76
- if isinstance(process_includes, bool):
77
- includes_enabled = default_includes_enabled
78
- elif isinstance(process_includes, str):
79
- includes_enabled = [process_includes]
80
- elif isinstance(process_includes, list):
81
- includes_enabled = process_includes
82
- for kind in includes_enabled:
83
- if kind in info:
84
- for obj_name, obj in info[kind].items():
85
- if "_include" in obj:
86
- include_fpath = obj.pop("_include")
87
- with open(include_fpath) as f:
88
- include_data = ryaml.load(f)
89
- info[kind][obj_name] |= include_data
90
- return info
91
-
92
-
93
- def utcnow(remove_tz=True) -> datetime:
94
- """Return now in UTC, optionally stripping timezone information."""
95
- dt = datetime.now(UTC)
96
- if remove_tz:
97
- dt = dt.replace(tzinfo=None)
98
- return dt
File without changes
File without changes