calkit-python 0.7.0__tar.gz → 0.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. {calkit_python-0.7.0 → calkit_python-0.8.0}/PKG-INFO +1 -1
  2. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/__init__.py +1 -1
  3. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/cli/main.py +9 -2
  4. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/cli/new.py +114 -2
  5. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/conda.py +14 -4
  6. {calkit_python-0.7.0 → calkit_python-0.8.0}/docs/tutorials/conda-envs.md +20 -0
  7. {calkit_python-0.7.0 → calkit_python-0.8.0}/docs/tutorials/notebook-pipeline.md +43 -24
  8. {calkit_python-0.7.0 → calkit_python-0.8.0}/.github/FUNDING.yml +0 -0
  9. {calkit_python-0.7.0 → calkit_python-0.8.0}/.github/workflows/publish-test.yml +0 -0
  10. {calkit_python-0.7.0 → calkit_python-0.8.0}/.github/workflows/publish.yml +0 -0
  11. {calkit_python-0.7.0 → calkit_python-0.8.0}/.gitignore +0 -0
  12. {calkit_python-0.7.0 → calkit_python-0.8.0}/LICENSE +0 -0
  13. {calkit_python-0.7.0 → calkit_python-0.8.0}/README.md +0 -0
  14. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/cli/__init__.py +0 -0
  15. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/cli/config.py +0 -0
  16. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/cli/core.py +0 -0
  17. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/cli/import_.py +0 -0
  18. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/cli/list.py +0 -0
  19. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/cli/notebooks.py +0 -0
  20. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/cli/office.py +0 -0
  21. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/cloud.py +0 -0
  22. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/config.py +0 -0
  23. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/core.py +0 -0
  24. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/data.py +0 -0
  25. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/docker.py +0 -0
  26. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/dvc.py +0 -0
  27. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/git.py +0 -0
  28. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/gui.py +0 -0
  29. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/jupyter.py +0 -0
  30. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/magics.py +0 -0
  31. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/models.py +0 -0
  32. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/office.py +0 -0
  33. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/server.py +0 -0
  34. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/templates/__init__.py +0 -0
  35. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/templates/core.py +0 -0
  36. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/templates/latex/__init__.py +0 -0
  37. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/templates/latex/article/paper.tex +0 -0
  38. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/templates/latex/core.py +0 -0
  39. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/templates/latex/jfm/jfm.bst +0 -0
  40. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/templates/latex/jfm/jfm.cls +0 -0
  41. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/templates/latex/jfm/lineno-FLM.sty +0 -0
  42. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/templates/latex/jfm/paper.tex +0 -0
  43. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/templates/latex/jfm/upmath.sty +0 -0
  44. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/tests/__init__.py +0 -0
  45. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/tests/cli/__init__.py +0 -0
  46. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/tests/cli/test_list.py +0 -0
  47. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/tests/cli/test_main.py +0 -0
  48. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/tests/cli/test_new.py +0 -0
  49. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/tests/test_core.py +0 -0
  50. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/tests/test_dvc.py +0 -0
  51. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/tests/test_jupyter.py +0 -0
  52. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/tests/test_magics.py +0 -0
  53. {calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/tests/test_templates.py +0 -0
  54. {calkit_python-0.7.0 → calkit_python-0.8.0}/docs/tutorials/adding-latex-pub-docker.md +0 -0
  55. {calkit_python-0.7.0 → calkit_python-0.8.0}/docs/tutorials/img/run-proc.png +0 -0
  56. {calkit_python-0.7.0 → calkit_python-0.8.0}/docs/tutorials/procedures.md +0 -0
  57. {calkit_python-0.7.0 → calkit_python-0.8.0}/pyproject.toml +0 -0
  58. {calkit_python-0.7.0 → calkit_python-0.8.0}/test/pipeline.ipynb +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: calkit-python
3
- Version: 0.7.0
3
+ Version: 0.8.0
4
4
  Summary: Reproducibility simplified.
5
5
  Project-URL: Homepage, https://github.com/calkit/calkit
6
6
  Project-URL: Issues, https://github.com/calkit/calkit/issues
@@ -1,4 +1,4 @@
1
- __version__ = "0.7.0"
1
+ __version__ = "0.8.0"
2
2
 
3
3
  from .core import *
4
4
  from . import git
@@ -606,6 +606,9 @@ def build_docker(
606
606
  fpath: Annotated[
607
607
  str, typer.Option("-i", "--input", help="Path to input Dockerfile.")
608
608
  ] = "Dockerfile",
609
+ platform: Annotated[
610
+ str, typer.Option("--platform", help="Which platform(s) to build for.")
611
+ ] = None,
609
612
  ):
610
613
  def get_docker_inspect():
611
614
  out = json.loads(
@@ -645,7 +648,11 @@ def build_docker(
645
648
  "Layers"
646
649
  ] or dockerfile_md5 != lock[0].get("DockerfileMD5")
647
650
  if rebuild:
648
- subprocess.check_call(["docker", "build", "-t", tag, "-f", fpath, "."])
651
+ cmd = ["docker", "build", "-t", tag, "-f", fpath]
652
+ if platform is not None:
653
+ cmd += ["--platform", platform]
654
+ cmd.append(".")
655
+ subprocess.check_call(cmd)
649
656
  # Write the lock file
650
657
  inspect = get_docker_inspect()
651
658
  inspect[0]["DockerfileMD5"] = dockerfile_md5
@@ -811,7 +818,7 @@ def check_conda_env(
811
818
  "-o",
812
819
  help=(
813
820
  "Path to which existing environment should be exported. "
814
- "If not specified, will have the same filename with '-loc' "
821
+ "If not specified, will have the same filename with '-lock' "
815
822
  "appended to it, keeping the same extension."
816
823
  ),
817
824
  ),
@@ -222,6 +222,9 @@ def new_docker_env(
222
222
  wdir: Annotated[
223
223
  str, typer.Option("--wdir", help="Working directory.")
224
224
  ] = "/work",
225
+ platform: Annotated[
226
+ str, typer.Option("--platform", help="Which platform(s) to build for.")
227
+ ] = None,
225
228
  description: Annotated[
226
229
  str, typer.Option("--description", help="Description.")
227
230
  ] = None,
@@ -284,6 +287,8 @@ def new_docker_env(
284
287
  env["description"] = description
285
288
  if layers:
286
289
  env["layers"] = layers
290
+ if platform:
291
+ env["platform"] = platform
287
292
  envs[name] = env
288
293
  ck_info["environments"] = envs
289
294
  with open("calkit.yaml", "w") as f:
@@ -291,7 +296,13 @@ def new_docker_env(
291
296
  # If we're creating a stage, do so with DVC
292
297
  if stage:
293
298
  typer.echo(f"Creating DVC stage {stage}")
294
- subprocess.call(
299
+ if not os.path.isfile(".dvc/config"):
300
+ typer.echo(f"Running dvc init")
301
+ subprocess.check_call(["dvc", "init"])
302
+ ck_cmd = f"calkit build-docker {image_name} -i {path}"
303
+ if platform:
304
+ ck_cmd += f" --platform {platform}"
305
+ subprocess.check_call(
295
306
  [
296
307
  "dvc",
297
308
  "stage",
@@ -304,7 +315,7 @@ def new_docker_env(
304
315
  path,
305
316
  "--outs-persist-no-cache",
306
317
  f"{path}-lock.json",
307
- f"calkit build-docker {image_name} -i {path}",
318
+ ck_cmd,
308
319
  ]
309
320
  )
310
321
  repo.git.add("calkit.yaml")
@@ -659,3 +670,104 @@ def new_publication(
659
670
  repo.git.add("dvc.yaml")
660
671
  if not no_commit and repo.git.diff("--staged"):
661
672
  repo.git.commit(["-m", f"Add new publication {pub_fpath}"])
673
+
674
+
675
+ @new_app.command("conda-env")
676
+ def new_conda_env(
677
+ packages: Annotated[
678
+ list[str],
679
+ typer.Argument(help="Packages to include in the environment."),
680
+ ],
681
+ name: Annotated[
682
+ str, typer.Option("--name", "-n", help="Environment name.")
683
+ ],
684
+ path: Annotated[
685
+ str, typer.Option("--path", help="Environment YAML file path.")
686
+ ] = "environment.yml",
687
+ pip_packages: Annotated[
688
+ list[str], typer.Option("--pip", help="Packages to install with pip.")
689
+ ] = [],
690
+ stage: Annotated[
691
+ str,
692
+ typer.Option("--stage", help="DVC pipeline stage name for checking."),
693
+ ] = None,
694
+ description: Annotated[
695
+ str, typer.Option("--description", help="Description.")
696
+ ] = None,
697
+ overwrite: Annotated[
698
+ bool,
699
+ typer.Option(
700
+ "--overwrite",
701
+ "-f",
702
+ help="Overwrite any existing environment with this name.",
703
+ ),
704
+ ] = False,
705
+ no_commit: Annotated[
706
+ bool, typer.Option("--no-commit", help="Do not commit changes.")
707
+ ] = False,
708
+ ):
709
+ """Create a new Conda environment."""
710
+ if os.path.isfile(path) and not overwrite:
711
+ raise_error("Output path already exists (use -f to overwrite)")
712
+ repo = git.Repo()
713
+ # Add environment to Calkit info
714
+ ck_info = calkit.load_calkit_info()
715
+ # If environments is a list instead of a dict, reformulate it
716
+ envs = ck_info.get("environments", {})
717
+ if isinstance(envs, list):
718
+ typer.echo("Converting environments from list to dict")
719
+ envs = {env.pop("name"): env for env in envs}
720
+ if name in envs and not overwrite:
721
+ raise_error(
722
+ f"Environment with name {name} already exists "
723
+ "(use -f to overwrite)"
724
+ )
725
+ # Write environment to path
726
+ conda_env = dict(
727
+ name=name, channels=["conda-forge"], dependencies=packages
728
+ )
729
+ if pip_packages:
730
+ conda_env["dependencies"].append(dict(pip=pip_packages))
731
+ with open(path, "w") as f:
732
+ ryaml.dump(conda_env, f)
733
+ repo.git.add(path)
734
+ typer.echo("Adding environment to calkit.yaml")
735
+ env = dict(path=path, kind="conda")
736
+ if stage is not None:
737
+ env["stage"] = stage
738
+ if description is not None:
739
+ env["description"] = description
740
+ envs[name] = env
741
+ ck_info["environments"] = envs
742
+ with open("calkit.yaml", "w") as f:
743
+ ryaml.dump(ck_info, f)
744
+ # If we're creating a stage, do so with DVC
745
+ if stage:
746
+ typer.echo(f"Creating DVC stage {stage}")
747
+ if not os.path.isfile(".dvc/config"):
748
+ typer.echo(f"Running dvc init")
749
+ subprocess.check_call(["dvc", "init"])
750
+ ck_cmd = f"calkit check-conda-env -f {path}"
751
+ fname, ext = os.path.splitext(path)
752
+ lock_path = f"{fname}-lock{ext}"
753
+ subprocess.check_call(
754
+ [
755
+ "dvc",
756
+ "stage",
757
+ "add",
758
+ "-f",
759
+ "-n",
760
+ stage,
761
+ "--always-changed",
762
+ "-d",
763
+ path,
764
+ "--outs-persist-no-cache",
765
+ lock_path,
766
+ ck_cmd,
767
+ ]
768
+ )
769
+ repo.git.add("calkit.yaml")
770
+ if stage:
771
+ repo.git.add("dvc.yaml")
772
+ if not no_commit and repo.git.diff("--staged"):
773
+ repo.git.commit(["-m", f"Add Conda environment {name}"])
@@ -84,10 +84,18 @@ def check_env(
84
84
  ryaml.dump(env_check, f)
85
85
  # Determine if the env matches
86
86
  env_needs_rebuild = False
87
- existing_conda_deps = env_check["dependencies"][:-1]
88
- existing_pip_deps = env_check["dependencies"][-1]["pip"]
89
- required_conda_deps = env_spec["dependencies"][:-1]
90
- required_pip_deps = env_spec["dependencies"][-1]["pip"]
87
+ if isinstance(env_check["dependencies"][-1], dict):
88
+ existing_conda_deps = env_check["dependencies"][:-1]
89
+ existing_pip_deps = env_check["dependencies"][-1]["pip"]
90
+ else:
91
+ existing_conda_deps = env_check["dependencies"]
92
+ existing_pip_deps = []
93
+ if isinstance(env_spec["dependencies"][-1], dict):
94
+ required_conda_deps = env_spec["dependencies"][:-1]
95
+ required_pip_deps = env_spec["dependencies"][-1]["pip"]
96
+ else:
97
+ required_conda_deps = env_spec["dependencies"]
98
+ required_pip_deps = []
91
99
  log_func("Checking conda dependencies")
92
100
  for dep in required_conda_deps:
93
101
  dep_split = dep.split("=")
@@ -101,6 +109,8 @@ def check_env(
101
109
  env_needs_rebuild = True
102
110
  break
103
111
  elif version is None:
112
+ # TODO: This does not handle specification of only major or
113
+ # major+minor version
104
114
  if package not in [
105
115
  d.split("=")[0] for d in existing_conda_deps
106
116
  ]:
@@ -62,3 +62,23 @@ Note that this pattern can also be expanded to projects that use multiple
62
62
  conda environments.
63
63
  For example, if an environment spec is saved to `env-2.yml`,
64
64
  we can call `calkit check-conda-env -f env-2.yml`.
65
+
66
+ ## Adding a Conda environment to a Calkit project
67
+
68
+ If you run something like:
69
+
70
+ ```sh
71
+ calkit new conda-env \
72
+ -n my-project-py11 \
73
+ python=3.11 \
74
+ pip \
75
+ matplotlib \
76
+ pandas \
77
+ jupyter \
78
+ --pip tensorflow \
79
+ --stage check-conda-env
80
+ ```
81
+
82
+ Calkit will create an environment definition in `calkit.yaml` for use with
83
+ `calkit runenv`, and since `--stage` was specified, Calkit will also add
84
+ an environment check stage to the pipeline automatically.
@@ -1,40 +1,59 @@
1
1
  # Using a Jupyter Notebook as a reproducible pipeline
2
2
 
3
- Jupyter Notebooks are great tools for exploration,
4
- but they can cause real headaches when it comes to managing state,
5
- since they can be executed out-of-order.
6
- This can lead to bad practices like only running certain cells
7
- since others are too expensive or failing.
8
- This means it's very possible for a result from a notebook to be
9
- non-reproducible.
10
-
11
- Here we're going to show how to use Calkit to turn a Jupyter Notebook
12
- into a DVC pipeline,
13
- as well as label our artifacts.
14
-
15
- The natural process would be something like:
16
-
3
+ Jupyter Notebooks are great tools for exploration and prototyping,
4
+ but they can be troublesome if relied upon to produce permanent
5
+ artifacts like figures, datasets, or machine learning models.
6
+ Their strength for ad hoc work is their weakness for "production" work,
7
+ namely that their cells can be executed in any order,
8
+ and they can be difficult to use with Git,
9
+ hindering their reproducibility.
10
+ Furthermore, expensive cells may inspire homegrown caches that
11
+ can be cumbersome to invalidate or share between collaborators.
12
+
13
+ It's typically recommended to move anything important or production-ready
14
+ out of notebooks and into modules and/or scripts so they can be easily
15
+ version-controlled and run as part of a reproducible pipeline.
16
+ However, Calkit includes a Jupyter
17
+ cell magic
18
+ to help "productionize" notebook cells as DVC pipeline stages without
19
+ needing to cut/paste anything.
20
+
21
+ This enables a workflow like:
17
22
  1. Prototype a cell by running whatever commands make sense.
18
23
  2. Convert cells that are working and valuable into pipeline
19
24
  stages, and delete anything else.
20
25
 
21
- We should also be using [`nbstripout`](https://github.com/kynan/nbstripout)
22
- to strip notebook outputs before we commit to the repo,
23
- since the important ones will be produced as part of the pipeline
24
- and cached with DVC.
26
+ In the process of making notebook cells into pipeline stages,
27
+ we will need to be explicit about what variables our
28
+ cells depend on and which are outputs,
29
+ since the cells will be executed outside of our Jupyter kernel in a
30
+ separate process.
31
+ Those processes won't have access to any state that isn't declared as
32
+ a dependency or created by the code itself,
33
+ thereby negating some of the state management traps
34
+ one can run into if running cells out of order,
35
+ changing cells but forgetting to rerun them, etc.
25
36
 
26
37
  At the end of this process we should be left with a notebook that runs
27
38
  very quickly after it's been run once,
28
39
  and all of our important outputs will be cached and pushed to the cloud,
29
40
  but kept out of our Git repo.
41
+ Our collaborators will be able to pull everything and similarly
42
+ run the notebook very quickly on the first go,
43
+ and if/when cells are changed,
44
+ DVC will only rerun what is necessary to rerun.
30
45
 
31
- Alright, so let's show how to convert a notebook into a reproducible
32
- DVC pipeline without leaving the notebook interface.
46
+ Side note:
47
+ We should also be using [`nbstripout`](https://github.com/kynan/nbstripout)
48
+ to strip notebook outputs before we commit to the repo,
49
+ since the important ones will be produced as part of the pipeline
50
+ and cached with DVC.
33
51
 
34
- First, let's write a cell to fetch a dataset,
35
- and let's assume this is expensive,
36
- maybe because we had to fetch it from a database.
37
- To simulate that expense we'll use a call to `time.sleep`.
52
+ Now let's work through an example.
53
+ First, we'll write a cell to simulate fetching a dataset.
54
+ To simulate this being an expensive call,
55
+ e.g., if we had to query a database,
56
+ we'll use a call to `time.sleep`.
38
57
 
39
58
  ```python
40
59
  import pandas as pd
File without changes
File without changes
File without changes