PyPI - calkit-python - Versions diffs - 0.7.0__tar.gz → 0.8.0__tar.gz - Mend

calkit-python 0.7.0.tar.gz → 0.8.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58) hide show

{calkit_python-0.7.0 → calkit_python-0.8.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: calkit-python
-Version: 0.7.0
+Version: 0.8.0
 Summary: Reproducibility simplified.
 Project-URL: Homepage, https://github.com/calkit/calkit
 Project-URL: Issues, https://github.com/calkit/calkit/issues

{calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
-__version__ = "0.7.0"
+__version__ = "0.8.0"
 from .core import *
 from . import git

{calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/cli/main.py RENAMED Viewed

@@ -606,6 +606,9 @@ def build_docker(
     fpath: Annotated[
         str, typer.Option("-i", "--input", help="Path to input Dockerfile.")
     ] = "Dockerfile",
+    platform: Annotated[
+        str, typer.Option("--platform", help="Which platform(s) to build for.")
+    ] = None,
 ):
     def get_docker_inspect():
         out = json.loads(
@@ -645,7 +648,11 @@ def build_docker(
             "Layers"
         ] or dockerfile_md5 != lock[0].get("DockerfileMD5")
     if rebuild:
-        subprocess.check_call(["docker", "build", "-t", tag, "-f", fpath, "."])
+        cmd = ["docker", "build", "-t", tag, "-f", fpath]
+        if platform is not None:
+            cmd += ["--platform", platform]
+        cmd.append(".")
+        subprocess.check_call(cmd)
     # Write the lock file
     inspect = get_docker_inspect()
     inspect[0]["DockerfileMD5"] = dockerfile_md5
@@ -811,7 +818,7 @@ def check_conda_env(
             "-o",
             help=(
                 "Path to which existing environment should be exported. "
-                "If not specified, will have the same filename with '-loc' "
+                "If not specified, will have the same filename with '-lock' "
                 "appended to it, keeping the same extension."
             ),
         ),

{calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/cli/new.py RENAMED Viewed

@@ -222,6 +222,9 @@ def new_docker_env(
     wdir: Annotated[
         str, typer.Option("--wdir", help="Working directory.")
     ] = "/work",
+    platform: Annotated[
+        str, typer.Option("--platform", help="Which platform(s) to build for.")
+    ] = None,
     description: Annotated[
         str, typer.Option("--description", help="Description.")
     ] = None,
@@ -284,6 +287,8 @@ def new_docker_env(
         env["description"] = description
     if layers:
         env["layers"] = layers
+    if platform:
+        env["platform"] = platform
     envs[name] = env
     ck_info["environments"] = envs
     with open("calkit.yaml", "w") as f:
@@ -291,7 +296,13 @@ def new_docker_env(
     # If we're creating a stage, do so with DVC
     if stage:
         typer.echo(f"Creating DVC stage {stage}")
-        subprocess.call(
+        if not os.path.isfile(".dvc/config"):
+            typer.echo(f"Running dvc init")
+            subprocess.check_call(["dvc", "init"])
+        ck_cmd = f"calkit build-docker {image_name} -i {path}"
+        if platform:
+            ck_cmd += f" --platform {platform}"
+        subprocess.check_call(
             [
                 "dvc",
                 "stage",
@@ -304,7 +315,7 @@ def new_docker_env(
                 path,
                 "--outs-persist-no-cache",
                 f"{path}-lock.json",
-                f"calkit build-docker {image_name} -i {path}",
+                ck_cmd,
             ]
         )
     repo.git.add("calkit.yaml")
@@ -659,3 +670,104 @@ def new_publication(
         repo.git.add("dvc.yaml")
     if not no_commit and repo.git.diff("--staged"):
         repo.git.commit(["-m", f"Add new publication {pub_fpath}"])
+@new_app.command("conda-env")
+def new_conda_env(
+    packages: Annotated[
+        list[str],
+        typer.Argument(help="Packages to include in the environment."),
+    ],
+    name: Annotated[
+        str, typer.Option("--name", "-n", help="Environment name.")
+    ],
+    path: Annotated[
+        str, typer.Option("--path", help="Environment YAML file path.")
+    ] = "environment.yml",
+    pip_packages: Annotated[
+        list[str], typer.Option("--pip", help="Packages to install with pip.")
+    ] = [],
+    stage: Annotated[
+        str,
+        typer.Option("--stage", help="DVC pipeline stage name for checking."),
+    ] = None,
+    description: Annotated[
+        str, typer.Option("--description", help="Description.")
+    ] = None,
+    overwrite: Annotated[
+        bool,
+        typer.Option(
+            "--overwrite",
+            "-f",
+            help="Overwrite any existing environment with this name.",
+        ),
+    ] = False,
+    no_commit: Annotated[
+        bool, typer.Option("--no-commit", help="Do not commit changes.")
+    ] = False,
+):
+    """Create a new Conda environment."""
+    if os.path.isfile(path) and not overwrite:
+        raise_error("Output path already exists (use -f to overwrite)")
+    repo = git.Repo()
+    # Add environment to Calkit info
+    ck_info = calkit.load_calkit_info()
+    # If environments is a list instead of a dict, reformulate it
+    envs = ck_info.get("environments", {})
+    if isinstance(envs, list):
+        typer.echo("Converting environments from list to dict")
+        envs = {env.pop("name"): env for env in envs}
+    if name in envs and not overwrite:
+        raise_error(
+            f"Environment with name {name} already exists "
+            "(use -f to overwrite)"
+        )
+    # Write environment to path
+    conda_env = dict(
+        name=name, channels=["conda-forge"], dependencies=packages
+    )
+    if pip_packages:
+        conda_env["dependencies"].append(dict(pip=pip_packages))
+    with open(path, "w") as f:
+        ryaml.dump(conda_env, f)
+    repo.git.add(path)
+    typer.echo("Adding environment to calkit.yaml")
+    env = dict(path=path, kind="conda")
+    if stage is not None:
+        env["stage"] = stage
+    if description is not None:
+        env["description"] = description
+    envs[name] = env
+    ck_info["environments"] = envs
+    with open("calkit.yaml", "w") as f:
+        ryaml.dump(ck_info, f)
+    # If we're creating a stage, do so with DVC
+    if stage:
+        typer.echo(f"Creating DVC stage {stage}")
+        if not os.path.isfile(".dvc/config"):
+            typer.echo(f"Running dvc init")
+            subprocess.check_call(["dvc", "init"])
+        ck_cmd = f"calkit check-conda-env -f {path}"
+        fname, ext = os.path.splitext(path)
+        lock_path = f"{fname}-lock{ext}"
+        subprocess.check_call(
+            [
+                "dvc",
+                "stage",
+                "add",
+                "-f",
+                "-n",
+                stage,
+                "--always-changed",
+                "-d",
+                path,
+                "--outs-persist-no-cache",
+                lock_path,
+                ck_cmd,
+            ]
+        )
+    repo.git.add("calkit.yaml")
+    if stage:
+        repo.git.add("dvc.yaml")
+    if not no_commit and repo.git.diff("--staged"):
+        repo.git.commit(["-m", f"Add Conda environment {name}"])

{calkit_python-0.7.0 → calkit_python-0.8.0}/calkit/conda.py RENAMED Viewed

@@ -84,10 +84,18 @@ def check_env(
                 ryaml.dump(env_check, f)
         # Determine if the env matches
         env_needs_rebuild = False
-        existing_conda_deps = env_check["dependencies"][:-1]
-        existing_pip_deps = env_check["dependencies"][-1]["pip"]
-        required_conda_deps = env_spec["dependencies"][:-1]
-        required_pip_deps = env_spec["dependencies"][-1]["pip"]
+        if isinstance(env_check["dependencies"][-1], dict):
+            existing_conda_deps = env_check["dependencies"][:-1]
+            existing_pip_deps = env_check["dependencies"][-1]["pip"]
+        else:
+            existing_conda_deps = env_check["dependencies"]
+            existing_pip_deps = []
+        if isinstance(env_spec["dependencies"][-1], dict):
+            required_conda_deps = env_spec["dependencies"][:-1]
+            required_pip_deps = env_spec["dependencies"][-1]["pip"]
+        else:
+            required_conda_deps = env_spec["dependencies"]
+            required_pip_deps = []
         log_func("Checking conda dependencies")
         for dep in required_conda_deps:
             dep_split = dep.split("=")
@@ -101,6 +109,8 @@ def check_env(
                 env_needs_rebuild = True
                 break
             elif version is None:
+                # TODO: This does not handle specification of only major or
+                # major+minor version
                 if package not in [
                     d.split("=")[0] for d in existing_conda_deps
                 ]:

{calkit_python-0.7.0 → calkit_python-0.8.0}/docs/tutorials/conda-envs.md RENAMED Viewed

@@ -62,3 +62,23 @@ Note that this pattern can also be expanded to projects that use multiple
 conda environments.
 For example, if an environment spec is saved to `env-2.yml`,
 we can call `calkit check-conda-env -f env-2.yml`.
+## Adding a Conda environment to a Calkit project
+If you run something like:
+```sh
+calkit new conda-env \
+    -n my-project-py11 \
+    python=3.11 \
+    pip \
+    matplotlib \
+    pandas \
+    jupyter \
+    --pip tensorflow \
+    --stage check-conda-env
+```
+Calkit will create an environment definition in `calkit.yaml` for use with
+`calkit runenv`, and since `--stage` was specified, Calkit will also add
+an environment check stage to the pipeline automatically.

{calkit_python-0.7.0 → calkit_python-0.8.0}/docs/tutorials/notebook-pipeline.md RENAMED Viewed

@@ -1,40 +1,59 @@
 # Using a Jupyter Notebook as a reproducible pipeline
-Jupyter Notebooks are great tools for exploration,
-but they can cause real headaches when it comes to managing state,
-since they can be executed out-of-order.
-This can lead to bad practices like only running certain cells
-since others are too expensive or failing.
-This means it's very possible for a result from a notebook to be
-non-reproducible.
-Here we're going to show how to use Calkit to turn a Jupyter Notebook
-into a DVC pipeline,
-as well as label our artifacts.
-The natural process would be something like:
+Jupyter Notebooks are great tools for exploration and prototyping,
+but they can be troublesome if relied upon to produce permanent
+artifacts like figures, datasets, or machine learning models.
+Their strength for ad hoc work is their weakness for "production" work,
+namely that their cells can be executed in any order,
+and they can be difficult to use with Git,
+hindering their reproducibility.
+Furthermore, expensive cells may inspire home grown caches that
+can be cumbersome to invalidate or share between collaborators.
+It's typically recommended to move anything important or production-ready
+out of notebooks and into modules and/or scripts so they can be easily
+version-controlled and run as part of a reproducible pipeline.
+However, Calkit includes a Jupyter
+cell magic
+to help "productionize" notebook cells as DVC pipeline stages without
+needing to cut/paste anything.
+This enables a workflow like:
 1. Prototype a cell by running whatever commands make sense.
 2. Convert cells that are working and valuable into pipeline
    stages, and delete anything else.
-We should also be using [`nbstripout`](https://github.com/kynan/nbstripout)
-to strip notebook outputs before we commit to the repo,
-since the important ones will be produced as part of the pipeline
-and cached with DVC.
+In the process of making notebook cells into pipeline stages,
+we will need to be explicit about what variables our
+cells depend on and which are outputs,
+since the cells will be executed outside of our Jupyter kernel in a
+separate process.
+Those processes won't have access to any state that isn't declared as
+a dependency or created by the code itself,
+thereby negating some of the state management traps
+one can run into if running cells out of order,
+changing cells but forgetting to rerun them, etc.
 At the end of this process we should be left with a notebook that runs
 very quickly after it's been run once,
 and all of our important outputs will be cached and pushed to the cloud,
 but kept out of our Git repo.
+Our collaborators will be able to pull everything and similarly
+run the notebook very quickly on the first go,
+and if/when cells are changed,
+DVC will only rerun what is necessary to rerun.
-Alright, so let's show how to convert a notebook into a reproducible
-DVC pipeline without leaving the notebook interface.
+Side note:
+We should also be using [`nbstripout`](https://github.com/kynan/nbstripout)
+to strip notebook outputs before we commit to the repo,
+since the important ones will be produced as part of the pipeline
+and cached with DVC.
-First, let's write a cell to fetch a dataset,
-and let's assume this is expensive,
-maybe because we had to fetch it from a database.
-To simulate that expense we'll use a call to `time.sleep`.
+Now let's work through an example.
+First, we'll write a cell to simulate fetching a dataset.
+To simulate this being an expensive call,
+e.g., if we had to query a database,
+we'll use a call to `time.sleep`.
 ```python
 import pandas as pd