calkit-python 0.7.0__tar.gz → 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {calkit_python-0.7.0 → calkit_python-0.8.0}/PKG-INFO +1 -1
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/__init__.py +1 -1
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/cli/main.py +9 -2
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/cli/new.py +114 -2
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/conda.py +14 -4
- {calkit_python-0.7.0 → calkit_python-0.8.0}/docs/tutorials/conda-envs.md +20 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/docs/tutorials/notebook-pipeline.md +43 -24
- {calkit_python-0.7.0 → calkit_python-0.8.0}/.github/FUNDING.yml +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/.github/workflows/publish-test.yml +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/.github/workflows/publish.yml +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/.gitignore +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/LICENSE +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/README.md +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/cli/__init__.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/cli/config.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/cli/core.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/cli/import_.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/cli/list.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/cli/notebooks.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/cli/office.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/cloud.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/config.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/core.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/data.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/docker.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/dvc.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/git.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/gui.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/jupyter.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/magics.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/models.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/office.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/server.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/templates/__init__.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/templates/core.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/templates/latex/__init__.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/templates/latex/article/paper.tex +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/templates/latex/core.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/templates/latex/jfm/jfm.bst +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/templates/latex/jfm/jfm.cls +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/templates/latex/jfm/lineno-FLM.sty +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/templates/latex/jfm/paper.tex +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/templates/latex/jfm/upmath.sty +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/tests/__init__.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/tests/cli/__init__.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/tests/cli/test_list.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/tests/cli/test_main.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/tests/cli/test_new.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/tests/test_core.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/tests/test_dvc.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/tests/test_jupyter.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/tests/test_magics.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/tests/test_templates.py +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/docs/tutorials/adding-latex-pub-docker.md +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/docs/tutorials/img/run-proc.png +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/docs/tutorials/procedures.md +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/pyproject.toml +0 -0
- {calkit_python-0.7.0 → calkit_python-0.8.0}/test/pipeline.ipynb +0 -0
|
@@ -606,6 +606,9 @@ def build_docker(
|
|
|
606
606
|
fpath: Annotated[
|
|
607
607
|
str, typer.Option("-i", "--input", help="Path to input Dockerfile.")
|
|
608
608
|
] = "Dockerfile",
|
|
609
|
+
platform: Annotated[
|
|
610
|
+
str, typer.Option("--platform", help="Which platform(s) to build for.")
|
|
611
|
+
] = None,
|
|
609
612
|
):
|
|
610
613
|
def get_docker_inspect():
|
|
611
614
|
out = json.loads(
|
|
@@ -645,7 +648,11 @@ def build_docker(
|
|
|
645
648
|
"Layers"
|
|
646
649
|
] or dockerfile_md5 != lock[0].get("DockerfileMD5")
|
|
647
650
|
if rebuild:
|
|
648
|
-
|
|
651
|
+
cmd = ["docker", "build", "-t", tag, "-f", fpath]
|
|
652
|
+
if platform is not None:
|
|
653
|
+
cmd += ["--platform", platform]
|
|
654
|
+
cmd.append(".")
|
|
655
|
+
subprocess.check_call(cmd)
|
|
649
656
|
# Write the lock file
|
|
650
657
|
inspect = get_docker_inspect()
|
|
651
658
|
inspect[0]["DockerfileMD5"] = dockerfile_md5
|
|
@@ -811,7 +818,7 @@ def check_conda_env(
|
|
|
811
818
|
"-o",
|
|
812
819
|
help=(
|
|
813
820
|
"Path to which existing environment should be exported. "
|
|
814
|
-
"If not specified, will have the same filename with '-
|
|
821
|
+
"If not specified, will have the same filename with '-lock' "
|
|
815
822
|
"appended to it, keeping the same extension."
|
|
816
823
|
),
|
|
817
824
|
),
|
|
@@ -222,6 +222,9 @@ def new_docker_env(
|
|
|
222
222
|
wdir: Annotated[
|
|
223
223
|
str, typer.Option("--wdir", help="Working directory.")
|
|
224
224
|
] = "/work",
|
|
225
|
+
platform: Annotated[
|
|
226
|
+
str, typer.Option("--platform", help="Which platform(s) to build for.")
|
|
227
|
+
] = None,
|
|
225
228
|
description: Annotated[
|
|
226
229
|
str, typer.Option("--description", help="Description.")
|
|
227
230
|
] = None,
|
|
@@ -284,6 +287,8 @@ def new_docker_env(
|
|
|
284
287
|
env["description"] = description
|
|
285
288
|
if layers:
|
|
286
289
|
env["layers"] = layers
|
|
290
|
+
if platform:
|
|
291
|
+
env["platform"] = platform
|
|
287
292
|
envs[name] = env
|
|
288
293
|
ck_info["environments"] = envs
|
|
289
294
|
with open("calkit.yaml", "w") as f:
|
|
@@ -291,7 +296,13 @@ def new_docker_env(
|
|
|
291
296
|
# If we're creating a stage, do so with DVC
|
|
292
297
|
if stage:
|
|
293
298
|
typer.echo(f"Creating DVC stage {stage}")
|
|
294
|
-
|
|
299
|
+
if not os.path.isfile(".dvc/config"):
|
|
300
|
+
typer.echo(f"Running dvc init")
|
|
301
|
+
subprocess.check_call(["dvc", "init"])
|
|
302
|
+
ck_cmd = f"calkit build-docker {image_name} -i {path}"
|
|
303
|
+
if platform:
|
|
304
|
+
ck_cmd += f" --platform {platform}"
|
|
305
|
+
subprocess.check_call(
|
|
295
306
|
[
|
|
296
307
|
"dvc",
|
|
297
308
|
"stage",
|
|
@@ -304,7 +315,7 @@ def new_docker_env(
|
|
|
304
315
|
path,
|
|
305
316
|
"--outs-persist-no-cache",
|
|
306
317
|
f"{path}-lock.json",
|
|
307
|
-
|
|
318
|
+
ck_cmd,
|
|
308
319
|
]
|
|
309
320
|
)
|
|
310
321
|
repo.git.add("calkit.yaml")
|
|
@@ -659,3 +670,104 @@ def new_publication(
|
|
|
659
670
|
repo.git.add("dvc.yaml")
|
|
660
671
|
if not no_commit and repo.git.diff("--staged"):
|
|
661
672
|
repo.git.commit(["-m", f"Add new publication {pub_fpath}"])
|
|
673
|
+
|
|
674
|
+
|
|
675
|
+
@new_app.command("conda-env")
|
|
676
|
+
def new_conda_env(
|
|
677
|
+
packages: Annotated[
|
|
678
|
+
list[str],
|
|
679
|
+
typer.Argument(help="Packages to include in the environment."),
|
|
680
|
+
],
|
|
681
|
+
name: Annotated[
|
|
682
|
+
str, typer.Option("--name", "-n", help="Environment name.")
|
|
683
|
+
],
|
|
684
|
+
path: Annotated[
|
|
685
|
+
str, typer.Option("--path", help="Environment YAML file path.")
|
|
686
|
+
] = "environment.yml",
|
|
687
|
+
pip_packages: Annotated[
|
|
688
|
+
list[str], typer.Option("--pip", help="Packages to install with pip.")
|
|
689
|
+
] = [],
|
|
690
|
+
stage: Annotated[
|
|
691
|
+
str,
|
|
692
|
+
typer.Option("--stage", help="DVC pipeline stage name for checking."),
|
|
693
|
+
] = None,
|
|
694
|
+
description: Annotated[
|
|
695
|
+
str, typer.Option("--description", help="Description.")
|
|
696
|
+
] = None,
|
|
697
|
+
overwrite: Annotated[
|
|
698
|
+
bool,
|
|
699
|
+
typer.Option(
|
|
700
|
+
"--overwrite",
|
|
701
|
+
"-f",
|
|
702
|
+
help="Overwrite any existing environment with this name.",
|
|
703
|
+
),
|
|
704
|
+
] = False,
|
|
705
|
+
no_commit: Annotated[
|
|
706
|
+
bool, typer.Option("--no-commit", help="Do not commit changes.")
|
|
707
|
+
] = False,
|
|
708
|
+
):
|
|
709
|
+
"""Create a new Conda environment."""
|
|
710
|
+
if os.path.isfile(path) and not overwrite:
|
|
711
|
+
raise_error("Output path already exists (use -f to overwrite)")
|
|
712
|
+
repo = git.Repo()
|
|
713
|
+
# Add environment to Calkit info
|
|
714
|
+
ck_info = calkit.load_calkit_info()
|
|
715
|
+
# If environments is a list instead of a dict, reformulate it
|
|
716
|
+
envs = ck_info.get("environments", {})
|
|
717
|
+
if isinstance(envs, list):
|
|
718
|
+
typer.echo("Converting environments from list to dict")
|
|
719
|
+
envs = {env.pop("name"): env for env in envs}
|
|
720
|
+
if name in envs and not overwrite:
|
|
721
|
+
raise_error(
|
|
722
|
+
f"Environment with name {name} already exists "
|
|
723
|
+
"(use -f to overwrite)"
|
|
724
|
+
)
|
|
725
|
+
# Write environment to path
|
|
726
|
+
conda_env = dict(
|
|
727
|
+
name=name, channels=["conda-forge"], dependencies=packages
|
|
728
|
+
)
|
|
729
|
+
if pip_packages:
|
|
730
|
+
conda_env["dependencies"].append(dict(pip=pip_packages))
|
|
731
|
+
with open(path, "w") as f:
|
|
732
|
+
ryaml.dump(conda_env, f)
|
|
733
|
+
repo.git.add(path)
|
|
734
|
+
typer.echo("Adding environment to calkit.yaml")
|
|
735
|
+
env = dict(path=path, kind="conda")
|
|
736
|
+
if stage is not None:
|
|
737
|
+
env["stage"] = stage
|
|
738
|
+
if description is not None:
|
|
739
|
+
env["description"] = description
|
|
740
|
+
envs[name] = env
|
|
741
|
+
ck_info["environments"] = envs
|
|
742
|
+
with open("calkit.yaml", "w") as f:
|
|
743
|
+
ryaml.dump(ck_info, f)
|
|
744
|
+
# If we're creating a stage, do so with DVC
|
|
745
|
+
if stage:
|
|
746
|
+
typer.echo(f"Creating DVC stage {stage}")
|
|
747
|
+
if not os.path.isfile(".dvc/config"):
|
|
748
|
+
typer.echo(f"Running dvc init")
|
|
749
|
+
subprocess.check_call(["dvc", "init"])
|
|
750
|
+
ck_cmd = f"calkit check-conda-env -f {path}"
|
|
751
|
+
fname, ext = os.path.splitext(path)
|
|
752
|
+
lock_path = f"{fname}-lock{ext}"
|
|
753
|
+
subprocess.check_call(
|
|
754
|
+
[
|
|
755
|
+
"dvc",
|
|
756
|
+
"stage",
|
|
757
|
+
"add",
|
|
758
|
+
"-f",
|
|
759
|
+
"-n",
|
|
760
|
+
stage,
|
|
761
|
+
"--always-changed",
|
|
762
|
+
"-d",
|
|
763
|
+
path,
|
|
764
|
+
"--outs-persist-no-cache",
|
|
765
|
+
lock_path,
|
|
766
|
+
ck_cmd,
|
|
767
|
+
]
|
|
768
|
+
)
|
|
769
|
+
repo.git.add("calkit.yaml")
|
|
770
|
+
if stage:
|
|
771
|
+
repo.git.add("dvc.yaml")
|
|
772
|
+
if not no_commit and repo.git.diff("--staged"):
|
|
773
|
+
repo.git.commit(["-m", f"Add Conda environment {name}"])
|
|
@@ -84,10 +84,18 @@ def check_env(
|
|
|
84
84
|
ryaml.dump(env_check, f)
|
|
85
85
|
# Determine if the env matches
|
|
86
86
|
env_needs_rebuild = False
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
87
|
+
if isinstance(env_check["dependencies"][-1], dict):
|
|
88
|
+
existing_conda_deps = env_check["dependencies"][:-1]
|
|
89
|
+
existing_pip_deps = env_check["dependencies"][-1]["pip"]
|
|
90
|
+
else:
|
|
91
|
+
existing_conda_deps = env_check["dependencies"]
|
|
92
|
+
existing_pip_deps = []
|
|
93
|
+
if isinstance(env_spec["dependencies"][-1], dict):
|
|
94
|
+
required_conda_deps = env_spec["dependencies"][:-1]
|
|
95
|
+
required_pip_deps = env_spec["dependencies"][-1]["pip"]
|
|
96
|
+
else:
|
|
97
|
+
required_conda_deps = env_spec["dependencies"]
|
|
98
|
+
required_pip_deps = []
|
|
91
99
|
log_func("Checking conda dependencies")
|
|
92
100
|
for dep in required_conda_deps:
|
|
93
101
|
dep_split = dep.split("=")
|
|
@@ -101,6 +109,8 @@ def check_env(
|
|
|
101
109
|
env_needs_rebuild = True
|
|
102
110
|
break
|
|
103
111
|
elif version is None:
|
|
112
|
+
# TODO: This does not handle specification of only major or
|
|
113
|
+
# major+minor version
|
|
104
114
|
if package not in [
|
|
105
115
|
d.split("=")[0] for d in existing_conda_deps
|
|
106
116
|
]:
|
|
@@ -62,3 +62,23 @@ Note that this pattern can also be expanded to projects that use multiple
|
|
|
62
62
|
conda environments.
|
|
63
63
|
For example, if an environment spec is saved to `env-2.yml`,
|
|
64
64
|
we can call `calkit check-conda-env -f env-2.yml`.
|
|
65
|
+
|
|
66
|
+
## Adding a Conda environment to a Calkit project
|
|
67
|
+
|
|
68
|
+
If you run something like:
|
|
69
|
+
|
|
70
|
+
```sh
|
|
71
|
+
calkit new conda-env \
|
|
72
|
+
-n my-project-py11 \
|
|
73
|
+
python=3.11 \
|
|
74
|
+
pip \
|
|
75
|
+
matplotlib \
|
|
76
|
+
pandas \
|
|
77
|
+
jupyter \
|
|
78
|
+
--pip tensorflow \
|
|
79
|
+
--stage check-conda-env
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Calkit will create an environment definition in `calkit.yaml` for use with
|
|
83
|
+
`calkit runenv`, and since `--stage` was specified, Calkit will also add
|
|
84
|
+
an environment check stage to the pipeline automatically.
|
|
@@ -1,40 +1,59 @@
|
|
|
1
1
|
# Using a Jupyter Notebook as a reproducible pipeline
|
|
2
2
|
|
|
3
|
-
Jupyter Notebooks are great tools for exploration,
|
|
4
|
-
but they can
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
3
|
+
Jupyter Notebooks are great tools for exploration and prototyping,
|
|
4
|
+
but they can be troublesome if relied upon to produce permanent
|
|
5
|
+
artifacts like figures, datasets, or machine learning models.
|
|
6
|
+
Their strength for ad hoc work is their weakness for "production" work,
|
|
7
|
+
namely that their cells can be executed in any order,
|
|
8
|
+
and they can be difficult to use with Git,
|
|
9
|
+
hindering their reproducibility.
|
|
10
|
+
Furthermore, expensive cells may inspire home-grown caches that
|
|
11
|
+
can be cumbersome to invalidate or share between collaborators.
|
|
12
|
+
|
|
13
|
+
It's typically recommended to move anything important or production-ready
|
|
14
|
+
out of notebooks and into modules and/or scripts so they can be easily
|
|
15
|
+
version-controlled and run as part of a reproducible pipeline.
|
|
16
|
+
However, Calkit includes a Jupyter
|
|
17
|
+
cell magic
|
|
18
|
+
to help "productionize" notebook cells as DVC pipeline stages without
|
|
19
|
+
needing to cut/paste anything.
|
|
20
|
+
|
|
21
|
+
This enables a workflow like:
|
|
17
22
|
1. Prototype a cell by running whatever commands make sense.
|
|
18
23
|
2. Convert cells that are working and valuable into pipeline
|
|
19
24
|
stages, and delete anything else.
|
|
20
25
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
26
|
+
In the process of making notebook cells into pipeline stages,
|
|
27
|
+
we will need to be explicit about what variables our
|
|
28
|
+
cells depend on and which are outputs,
|
|
29
|
+
since the cells will be executed outside of our Jupyter kernel in a
|
|
30
|
+
separate process.
|
|
31
|
+
Those processes won't have access to any state that isn't declared as
|
|
32
|
+
a dependency or created by the code itself,
|
|
33
|
+
thereby negating some of the state management traps
|
|
34
|
+
one can run into if running cells out of order,
|
|
35
|
+
changing cells but forgetting to rerun them, etc.
|
|
25
36
|
|
|
26
37
|
At the end of this process we should be left with a notebook that runs
|
|
27
38
|
very quickly after it's been run once,
|
|
28
39
|
and all of our important outputs will be cached and pushed to the cloud,
|
|
29
40
|
but kept out of our Git repo.
|
|
41
|
+
Our collaborators will be able to pull everything and similarly
|
|
42
|
+
run the notebook very quickly on the first go,
|
|
43
|
+
and if/when cells are changed,
|
|
44
|
+
DVC will only rerun what is necessary to rerun.
|
|
30
45
|
|
|
31
|
-
|
|
32
|
-
|
|
46
|
+
Side note:
|
|
47
|
+
We should also be using [`nbstripout`](https://github.com/kynan/nbstripout)
|
|
48
|
+
to strip notebook outputs before we commit to the repo,
|
|
49
|
+
since the important ones will be produced as part of the pipeline
|
|
50
|
+
and cached with DVC.
|
|
33
51
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
52
|
+
Now let's work through an example.
|
|
53
|
+
First, we'll write a cell to simulate fetching a dataset.
|
|
54
|
+
To simulate this being an expensive call,
|
|
55
|
+
e.g., if we had to query a database,
|
|
56
|
+
we'll use a call to `time.sleep`.
|
|
38
57
|
|
|
39
58
|
```python
|
|
40
59
|
import pandas as pd
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|