dvc-utils 0.3.0__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,87 @@
1
+ name: Verify README examples, release to PyPI
2
+ on:
3
+ push:
4
+ branches: [ "main" ]
5
+ tags: [ "v**" ]
6
+ pull_request:
7
+ branches: [ "main" ]
8
+ workflow_dispatch:
9
+ env:
10
+ AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
11
+ AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
12
+ jobs:
13
+ test:
14
+ name: Test (Python ${{ matrix.python-version }})
15
+ runs-on: ubuntu-latest
16
+ strategy:
17
+ matrix:
18
+ python-version: ['3.10', '3.11', '3.12', '3.13']
19
+ steps:
20
+ - uses: actions/checkout@v4
21
+ with:
22
+ fetch-depth: 0
23
+ submodules: true
24
+ - uses: astral-sh/setup-uv@v5
25
+ with:
26
+ enable-cache: true
27
+ - name: Set up Python ${{ matrix.python-version }}
28
+ run: uv python install ${{ matrix.python-version }}
29
+ - uses: dtolnay/rust-toolchain@stable
30
+ - uses: Swatinem/rust-cache@v2
31
+ - name: Install parquet2json
32
+ run: |
33
+ if ! command -v parquet2json &> /dev/null; then
34
+ # Bootstrap cargo-binstall if it is missing (faster install); falls back to `cargo install` below
35
+ if ! command -v cargo-binstall &> /dev/null; then
36
+ curl -L --proto '=https' --tlsv1.2 -sSf https://raw.githubusercontent.com/cargo-bins/cargo-binstall/main/install-from-binstall-release.sh | bash
37
+ fi
38
+ cargo binstall -y parquet2json || cargo install parquet2json
39
+ else
40
+ echo "parquet2json already installed: $(parquet2json --version)"
41
+ fi
42
+ - name: Install dependencies
43
+ run: uv sync --extra ci --extra test
44
+ - name: Run pytest
45
+ run: |
46
+ source .venv/bin/activate
47
+ pytest
48
+ - name: '`dvc pull` test/data'
49
+ working-directory: test/data
50
+ run: |
51
+ source ../../.venv/bin/activate
52
+ dvc pull -r s3 -R -A
53
+ - name: Set up parquet-helpers
54
+ uses: actions/checkout@v4
55
+ with:
56
+ repository: ryan-williams/parquet-helpers
57
+ path: pqt
58
+ - name: Verify README examples
59
+ env:
60
+ # Evaluate README examples from within the `test/data` submodule
61
+ BMDF_WORKDIR: test/data
62
+ run: |
63
+ source .venv/bin/activate
64
+ export PATH="$PWD/pqt:$PATH"
65
+ . pqt/.pqt-rc
66
+ export SHELL
67
+ mdcmd
68
+ git diff --exit-code
69
+ release:
70
+ name: Release to PyPI
71
+ if: startsWith(github.ref, 'refs/tags/')
72
+ needs: test
73
+ runs-on: ubuntu-latest
74
+ steps:
75
+ - uses: actions/checkout@v4
76
+ - uses: astral-sh/setup-uv@v5
77
+ - name: Build package
78
+ run: uv build
79
+ - name: Publish to PyPI
80
+ run: uv publish --username __token__ --password ${{ secrets.PYPI_TOKEN }}
81
+ - name: Create GitHub Release
82
+ env:
83
+ GH_TOKEN: ${{ github.token }}
84
+ run: |
85
+ gh release create ${{ github.ref_name }} \
86
+ --title "${{ github.ref_name }}" \
87
+ --generate-notes
@@ -0,0 +1,3 @@
1
+ [submodule "test/data"]
2
+ path = test/data
3
+ url = https://github.com/ryan-williams/dvc-helpers
@@ -0,0 +1,130 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ `dvc-utils` is a Python CLI tool for diffing DVC-tracked files between commits or against the worktree. It can optionally pipe file contents through other commands before diffing (e.g., `parquet2json`, `gunzip`, etc.), enabling semantic diffs of binary or compressed formats.
8
+
9
+ Core functionality: Compare DVC-tracked files at different Git commits by resolving their MD5 hashes from `.dvc` files, looking up cached content in the DVC cache, and piping through optional preprocessing commands before diffing.
10
+
11
+ ## Build & Development Commands
12
+
13
+ This project uses `uv` for dependency management:
14
+
15
+ ```bash
16
+ # Initialize project (first time setup)
17
+ uv init
18
+
19
+ # Install dependencies
20
+ uv sync
21
+
22
+ # Install with test dependencies
23
+ uv sync --extra test
24
+
25
+ # Run tests
26
+ pytest
27
+
28
+ # Run tests (with activated venv)
29
+ source .venv/bin/activate
30
+ pytest
31
+
32
+ # Build the package
33
+ uv build
34
+
35
+ # Install in development mode
36
+ pip install -e .
37
+ ```
38
+
39
+ ## Testing
40
+
41
+ Tests are in the `tests/` directory. The test suite uses a DVC-tracked test data repository at `test/data` (a submodule of [ryan-williams/dvc-helpers@test](https://github.com/ryan-williams/dvc-helpers)).
42
+
43
+ Key test commands:
44
+ ```bash
45
+ # Run all tests
46
+ pytest
47
+
48
+ # Run specific test file
49
+ pytest tests/test_diff_exit_codes.py
50
+
51
+ # Run tests with verbose output (already configured in pytest.ini)
52
+ pytest -v
53
+ ```
54
+
55
+ The test data directory requires DVC pull:
56
+ ```bash
57
+ cd test/data
58
+ dvc pull -r s3 -R -A
59
+ ```
60
+
61
+ ## Architecture
62
+
63
+ ### Core Module Structure
64
+
65
+ - **`src/dvc_utils/cli.py`**: Minimal Click group definition (entry point)
66
+ - **`src/dvc_utils/main.py`**: Main entry point that invokes the CLI
67
+ - **`src/dvc_utils/diff.py`**: Core `dvc-diff` command implementation
68
+ - Parses refspecs (e.g., `HEAD^..HEAD` or single commit)
69
+ - Resolves DVC-tracked file paths to cache paths via MD5 lookups
70
+ - Handles both individual files and DVC-tracked directories
71
+ - Uses `dffs.join_pipelines` to run preprocessing commands on both sides of diff
72
+ - **`src/dvc_utils/path.py`**: DVC path resolution utilities
73
+ - `dvc_paths()`: Normalizes path/dvc_path pairs
74
+ - `dvc_md5()`: Extracts MD5 from `.dvc` file at specific Git ref
75
+ - `dvc_cache_path()`: Resolves MD5 to actual cache file path
76
+ - `dvc_cache_dir()`: Finds DVC cache directory (respects `DVC_UTILS_CACHE_DIR` env var)
77
+ - **`src/dvc_utils/sync.py`**: Incomplete `pull-x` command (not currently functional)
78
+
79
+ ### Key Dependencies
80
+
81
+ - **`dffs`**: Provides `join_pipelines()` for executing parallel command pipelines and diffing their outputs
82
+ - **`utz`**: Utility library with subprocess wrappers (`utz.process`), error printing (`err`), and file hashing
83
+ - **`click`**: CLI framework
84
+ - **`pyyaml`**: For parsing `.dvc` YAML files
85
+
86
+ ### How DVC Diffing Works
87
+
88
+ 1. Parse the path argument to get both the data path and `.dvc` file path
89
+ 2. For each side of the diff (before/after commits):
90
+ - Use `git show <ref>:<path>.dvc` to get the DVC file content at that ref
91
+ - Parse the YAML to extract the MD5 hash
92
+ - Construct cache path: `<cache_dir>/files/md5/<first_2_chars>/<remaining_chars>`
93
+ 3. If preprocessing commands are specified (e.g., `dvc-diff wc -l foo.dvc`):
94
+ - Use `dffs.join_pipelines()` to run commands on both cache files and diff the outputs
95
+ 4. Otherwise, run `diff` directly on the two cache files
96
+
97
+ ### DVC Directory Handling
98
+
99
+ For DVC-tracked directories (`.dvc` files with multiple entries), the tool:
100
+ - Parses the directory's `.dvc` file JSON structure
101
+ - Compares MD5 hashes for each file in the directory
102
+ - Outputs changes as: `filename: <old_md5> -> <new_md5>`
103
+
104
+ ### Exit Code Behavior
105
+
106
+ The tool propagates exit codes correctly:
107
+ - `0`: No differences found
108
+ - `1`: Differences found (standard `diff` behavior)
109
+ - `>1`: Error in preprocessing pipeline or diff execution
110
+
111
+ ## Entry Points
112
+
113
+ Defined in `pyproject.toml`:
114
+ - `dvc-utils`: Main entry point (currently just invokes CLI group)
115
+ - `dvc-diff`: Direct entry to diff command
116
+
117
+ ## Environment Variables
118
+
119
+ - `DVC_UTILS_CACHE_DIR`: Override DVC cache directory location (relative to git root)
120
+ - `SHELL`: Used as default shell for executing preprocessing commands
121
+ - `BMDF_WORKDIR`: Used in CI for running README example verification from subdirectory
122
+
123
+ ## CI/CD Notes
124
+
125
+ - Uses GitHub Actions (`.github/workflows/ci.yml`)
126
+ - Tests run on every push/PR to main
127
+ - Releases to PyPI on version tags (`v**`)
128
+ - README examples are verified using `bmdf` (bash-markdown-fence) in the `test/data` directory
129
+ - Requires `parquet2json` (Rust tool) for Parquet-related examples
130
+ - AWS credentials needed for DVC remote access in tests
@@ -1,22 +1,23 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dvc-utils
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: CLI for diffing DVC-tracked files at two commits (or one commit vs. current worktree), optionally passing both through another command first
5
- Author-email: Ryan Williams <ryan@runsascoded.com>
6
- License: MIT
7
5
  Project-URL: Homepage, https://github.com/runsascoded/dvc-utils
8
6
  Project-URL: Author URL, https://github.com/ryan-williams
9
- Requires-Python: >=3.9
10
- Description-Content-Type: text/markdown
7
+ Author-email: Ryan Williams <ryan@runsascoded.com>
8
+ License: MIT
11
9
  License-File: LICENSE
10
+ Requires-Python: >=3.10
12
11
  Requires-Dist: click
13
- Requires-Dist: dffs>=0.0.5
12
+ Requires-Dist: dffs>=0.0.7
14
13
  Requires-Dist: pyyaml
15
14
  Requires-Dist: utz>=0.20.0
16
15
  Provides-Extra: ci
17
- Requires-Dist: bmdf==0.5.2; extra == "ci"
18
- Requires-Dist: dvc-s3; extra == "ci"
19
- Dynamic: license-file
16
+ Requires-Dist: bmdf==0.5.2; extra == 'ci'
17
+ Requires-Dist: dvc-s3; extra == 'ci'
18
+ Provides-Extra: test
19
+ Requires-Dist: pytest>=7.0.0; extra == 'test'
20
+ Description-Content-Type: text/markdown
20
21
 
21
22
  # dvc-utils
22
23
  Diff [DVC] files, optionally piping through other commands first.
@@ -61,6 +62,10 @@ dvc-diff
61
62
  # optional) at HEAD (last committed value) vs. the current worktree content.
62
63
  #
63
64
  # Options:
65
+ # -b, --both / -B, --no-both Merge stderr into stdout in pipeline commands
66
+ # (like shell `2>&1`), so stderr is included in
67
+ # diff output. Default: stderr shown only on
68
+ # command failures.
64
69
  # -c, --color / -C, --no-color Force or prevent colorized output
65
70
  # -r, --refspec TEXT <commit 1>..<commit 2> (compare two commits)
66
71
  # or <commit> (compare <commit> to the worktree)
@@ -41,6 +41,10 @@ dvc-diff
41
41
  # optional) at HEAD (last committed value) vs. the current worktree content.
42
42
  #
43
43
  # Options:
44
+ # -b, --both / -B, --no-both Merge stderr into stdout in pipeline commands
45
+ # (like shell `2>&1`), so stderr is included in
46
+ # diff output. Default: stderr shown only on
47
+ # command failures.
44
48
  # -c, --color / -C, --no-color Force or prevent colorized output
45
49
  # -r, --refspec TEXT <commit 1>..<commit 2> (compare two commits)
46
50
  # or <commit> (compare <commit> to the worktree)
@@ -1,20 +1,20 @@
1
1
  [build-system]
2
- requires = ["setuptools>=75"]
3
- build-backend = "setuptools.build_meta"
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "dvc-utils"
7
- version = "0.3.0"
7
+ version = "0.3.2"
8
8
  description = "CLI for diffing DVC-tracked files at two commits (or one commit vs. current worktree), optionally passing both through another command first"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
11
11
  authors = [
12
12
  {name = "Ryan Williams", email = "ryan@runsascoded.com"}
13
13
  ]
14
- requires-python = ">=3.9"
14
+ requires-python = ">=3.10"
15
15
  dependencies = [
16
16
  "click",
17
- "dffs>=0.0.5",
17
+ "dffs>=0.0.7",
18
18
  "pyyaml",
19
19
  "utz>=0.20.0",
20
20
  ]
@@ -24,6 +24,9 @@ ci = [
24
24
  "bmdf==0.5.2",
25
25
  "dvc-s3",
26
26
  ]
27
+ test = [
28
+ "pytest>=7.0.0",
29
+ ]
27
30
 
28
31
  [project.urls]
29
32
  Homepage = "https://github.com/runsascoded/dvc-utils"
@@ -33,8 +36,10 @@ Homepage = "https://github.com/runsascoded/dvc-utils"
33
36
  dvc-utils = "dvc_utils.main:main"
34
37
  dvc-diff = "dvc_utils.diff:dvc_diff"
35
38
 
36
- [tool.setuptools]
37
- package-dir = {"" = "src"}
39
+ [dependency-groups]
40
+ dev = [
41
+ "pytest>=7.0.0",
42
+ ]
38
43
 
39
- [tool.setuptools.packages.find]
40
- where = ["src"]
44
+ [tool.hatch.build.targets.wheel]
45
+ packages = ["src/dvc_utils"]
@@ -0,0 +1,6 @@
1
+ [pytest]
2
+ testpaths = tests
3
+ python_files = test_*.py
4
+ python_classes = Test*
5
+ python_functions = test_*
6
+ addopts = -v --tb=short
@@ -18,6 +18,7 @@ from dvc_utils.path import dvc_paths, dvc_cache_path
18
18
  short_help='Diff a DVC-tracked file at two commits (or one commit vs. current worktree), optionally passing both through another command first',
19
19
  no_args_is_help=True,
20
20
  )
21
+ @option('-b/-B', '--both/--no-both', default=False, help='Merge stderr into stdout in pipeline commands (like shell `2>&1`), so stderr is included in diff output. Default: stderr shown only on command failures.')
21
22
  @option('-c/-C', '--color/--no-color', default=None, help='Force or prevent colorized output')
22
23
  @option('-r', '--refspec', help='<commit 1>..<commit 2> (compare two commits) or <commit> (compare <commit> to the worktree)')
23
24
  @option('-R', '--ref', help='Shorthand for `-r <ref>^..<ref>`, i.e. inspect a specific commit (vs. its parent)')
@@ -29,6 +30,7 @@ from dvc_utils.path import dvc_paths, dvc_cache_path
29
30
  @option('-x', '--exec-cmd', 'exec_cmds', multiple=True, help='Command(s) to execute before diffing; alternate syntax to passing commands as positional arguments')
30
31
  @argument('args', metavar='[exec_cmd...] <path>', nargs=-1)
31
32
  def dvc_diff(
33
+ both: bool,
32
34
  color: bool | None,
33
35
  refspec: str | None,
34
36
  ref: str | None,
@@ -111,14 +113,16 @@ def dvc_diff(
111
113
  cmds1 = [ shlex.split(cmd) for cmd in cmds1 ]
112
114
  cmds2 = [ shlex.split(cmd) for cmd in cmds2 ]
113
115
 
114
- join_pipelines(
116
+ returncode = join_pipelines(
115
117
  base_cmd=['diff', *diff_args],
116
118
  cmds1=cmds1,
117
119
  cmds2=cmds2,
118
120
  verbose=verbose,
119
121
  shell=shell,
120
122
  executable=shell_executable,
123
+ both=both,
121
124
  )
125
+ exit(returncode)
122
126
  else:
123
127
  res = process.run('diff', *diff_args, path1 or '/dev/null', path2 or '/dev/null', log=log, check=False)
124
128
  exit(res.returncode)
@@ -60,7 +60,8 @@ def dvc_md5(
60
60
  relpath = basename(dvc_path)
61
61
  if relpath.endswith(".dvc"):
62
62
  relpath = relpath[:-len(".dvc")]
63
- while cur_dir and cur_dir != '.':
63
+ prev_dir = None
64
+ while cur_dir and cur_dir != '.' and cur_dir != prev_dir:
64
65
  dir_cache_path = dvc_cache_path(ref=git_ref, dvc_path=f"{cur_dir}.dvc", log=log)
65
66
  if dir_cache_path:
66
67
  with open(dir_cache_path, 'r') as f:
@@ -71,6 +72,7 @@ def dvc_md5(
71
72
  else:
72
73
  raise RuntimeError(f"{relpath=} not found in DVC-tracked dir {cur_dir}")
73
74
  relpath = join(basename(cur_dir), relpath)
75
+ prev_dir = cur_dir
74
76
  cur_dir = dirname(cur_dir)
75
77
  return None
76
78
  dvc_obj = yaml.safe_load(dvc_spec)
File without changes
@@ -0,0 +1,83 @@
1
+ """Tests for dvc-diff exit code handling."""
2
+ import pytest
3
+ import subprocess
4
+ from pathlib import Path
5
+ import tempfile
6
+
7
+
8
+ class TestDiffExitCodes:
9
+ """Test that dvc-diff properly propagates exit codes from pipeline commands."""
10
+
11
+ def test_successful_pipeline_returns_zero(self, tmp_path):
12
+ """Test that successful identical pipeline returns 0."""
13
+ # Create test files
14
+ file1 = tmp_path / "test1.txt"
15
+ file2 = tmp_path / "test2.txt"
16
+ file1.write_text("foo\nbar\n")
17
+ file2.write_text("foo\nbar\n")
18
+
19
+ # Run `diff-x` rather than `dvc-diff`; both go through the same `join_pipelines` code
20
+ result = subprocess.run(
21
+ ["diff-x", "cat", str(file1), str(file2)],
22
+ capture_output=True,
23
+ )
24
+ assert result.returncode == 0
25
+
26
+ def test_diff_found_returns_one(self, tmp_path):
27
+ """Test that differences found returns 1."""
28
+ file1 = tmp_path / "test1.txt"
29
+ file2 = tmp_path / "test2.txt"
30
+ file1.write_text("foo\n")
31
+ file2.write_text("bar\n")
32
+
33
+ result = subprocess.run(
34
+ ["diff-x", "cat", str(file1), str(file2)],
35
+ capture_output=True,
36
+ )
37
+ assert result.returncode == 1
38
+
39
+ def test_pipeline_error_propagates(self, tmp_path):
40
+ """Test that pipeline command errors propagate to exit code."""
41
+ file1 = tmp_path / "test1.txt"
42
+ file2 = tmp_path / "test2.txt"
43
+ file1.write_text("foo\n")
44
+ file2.write_text("bar\n")
45
+
46
+ # Use a command that will fail
47
+ result = subprocess.run(
48
+ ["diff-x", "cat /nonexistent/file/that/does/not/exist ||", str(file1), str(file2)],
49
+ capture_output=True,
50
+ shell=False,
51
+ )
52
+ # Should return non-zero due to cat failing
53
+ assert result.returncode != 0
54
+
55
+ def test_false_command_propagates_error(self, tmp_path):
56
+ """Test that 'false' command in pipeline propagates error."""
57
+ file1 = tmp_path / "test1.txt"
58
+ file2 = tmp_path / "test2.txt"
59
+ file1.write_text("foo\n")
60
+ file2.write_text("bar\n")
61
+
62
+ # Use `false`, which always exits with status 1
63
+ result = subprocess.run(
64
+ ["diff-x", "cat", "false", str(file1), str(file2)],
65
+ capture_output=True,
66
+ )
67
+ # Should return non-zero due to false in pipeline
68
+ assert result.returncode != 0
69
+
70
+ def test_multi_stage_pipeline_error(self, tmp_path):
71
+ """Test that errors in multi-stage pipelines are detected."""
72
+ file1 = tmp_path / "test1.txt"
73
+ file2 = tmp_path / "test2.txt"
74
+ file1.write_text("foo\nbar\n")
75
+ file2.write_text("bar\nfoo\n")
76
+
77
+ # Pipeline: sort (succeeds) | false (fails)
78
+ result = subprocess.run(
79
+ ["diff-x", "-x", "sort", "-x", "false", str(file1), str(file2)],
80
+ capture_output=True,
81
+ )
82
+ # Should return non-zero due to false in pipeline
83
+ assert result.returncode != 0