dvc-utils 0.1.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dvc-utils-0.1.0/dvc_utils.egg-info → dvc_utils-0.3.0}/PKG-INFO +166 -39
- dvc-utils-0.1.0/PKG-INFO → dvc_utils-0.3.0/README.md +151 -44
- dvc_utils-0.3.0/pyproject.toml +40 -0
- {dvc-utils-0.1.0 → dvc_utils-0.3.0/src}/dvc_utils/__init__.py +1 -1
- dvc_utils-0.3.0/src/dvc_utils/cli.py +17 -0
- dvc_utils-0.3.0/src/dvc_utils/diff.py +124 -0
- dvc_utils-0.3.0/src/dvc_utils/main.py +9 -0
- dvc_utils-0.3.0/src/dvc_utils/path.py +106 -0
- dvc_utils-0.3.0/src/dvc_utils/sync.py +13 -0
- dvc-utils-0.1.0/README.md → dvc_utils-0.3.0/src/dvc_utils.egg-info/PKG-INFO +171 -33
- dvc_utils-0.3.0/src/dvc_utils.egg-info/SOURCES.txt +15 -0
- dvc_utils-0.3.0/src/dvc_utils.egg-info/entry_points.txt +3 -0
- dvc_utils-0.3.0/src/dvc_utils.egg-info/requires.txt +8 -0
- dvc-utils-0.1.0/dvc_utils/cli.py +0 -95
- dvc-utils-0.1.0/dvc_utils/path.py +0 -60
- dvc-utils-0.1.0/dvc_utils.egg-info/SOURCES.txt +0 -12
- dvc-utils-0.1.0/dvc_utils.egg-info/entry_points.txt +0 -3
- dvc-utils-0.1.0/dvc_utils.egg-info/requires.txt +0 -4
- dvc-utils-0.1.0/setup.py +0 -22
- {dvc-utils-0.1.0 → dvc_utils-0.3.0}/LICENSE +0 -0
- {dvc-utils-0.1.0 → dvc_utils-0.3.0}/setup.cfg +0 -0
- {dvc-utils-0.1.0 → dvc_utils-0.3.0/src}/dvc_utils.egg-info/dependency_links.txt +0 -0
- {dvc-utils-0.1.0 → dvc_utils-0.3.0/src}/dvc_utils.egg-info/top_level.txt +0 -0
@@ -1,13 +1,22 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: dvc-utils
|
3
|
-
Version: 0.
|
4
|
-
Summary: CLI for diffing DVC files at two commits (or one commit vs. current worktree), optionally passing both through another command first
|
5
|
-
|
6
|
-
Author: Ryan Williams
|
7
|
-
Author-email: ryan@runsascoded.com
|
3
|
+
Version: 0.3.0
|
4
|
+
Summary: CLI for diffing DVC-tracked files at two commits (or one commit vs. current worktree), optionally passing both through another command first
|
5
|
+
Author-email: Ryan Williams <ryan@runsascoded.com>
|
8
6
|
License: MIT
|
7
|
+
Project-URL: Homepage, https://github.com/runsascoded/dvc-utils
|
8
|
+
Project-URL: Author URL, https://github.com/ryan-williams
|
9
|
+
Requires-Python: >=3.9
|
9
10
|
Description-Content-Type: text/markdown
|
10
11
|
License-File: LICENSE
|
12
|
+
Requires-Dist: click
|
13
|
+
Requires-Dist: dffs>=0.0.5
|
14
|
+
Requires-Dist: pyyaml
|
15
|
+
Requires-Dist: utz>=0.20.0
|
16
|
+
Provides-Extra: ci
|
17
|
+
Requires-Dist: bmdf==0.5.2; extra == "ci"
|
18
|
+
Requires-Dist: dvc-s3; extra == "ci"
|
19
|
+
Dynamic: license-file
|
11
20
|
|
12
21
|
# dvc-utils
|
13
22
|
Diff [DVC] files, optionally piping through other commands first.
|
@@ -32,25 +41,11 @@ pip install dvc-utils
|
|
32
41
|
```
|
33
42
|
|
34
43
|
## Usage <a id="usage"></a>
|
35
|
-
|
36
|
-
```bash
|
37
|
-
dvc-utils --help
|
38
|
-
# Usage: dvc-utils [OPTIONS] COMMAND [ARGS]...
|
39
|
-
#
|
40
|
-
# Options:
|
41
|
-
# --help Show this message and exit.
|
42
|
-
#
|
43
|
-
# Commands:
|
44
|
-
# diff Diff a DVC-tracked file at two commits (or one commit vs. current
|
45
|
-
# worktree), optionally passing both through another command first
|
46
|
-
```
|
47
|
-
|
48
|
-
The single subcommand, `dvc-utils diff`, is also exposed directly as `dvc-dff`:
|
44
|
+
Currently one command is exposed, `dvc-diff`:
|
49
45
|
|
50
|
-
|
51
|
-
<!-- `bmdf -- dvc-diff --help` -->
|
46
|
+
<!-- `bmdf -- dvc-diff` -->
|
52
47
|
```bash
|
53
|
-
dvc-diff
|
48
|
+
dvc-diff
|
54
49
|
# Usage: dvc-diff [OPTIONS] [exec_cmd...] <path>
|
55
50
|
#
|
56
51
|
# Diff a file at two commits (or one commit vs. current worktree), optionally
|
@@ -66,25 +61,144 @@ dvc-diff --help
|
|
66
61
|
# optional) at HEAD (last committed value) vs. the current worktree content.
|
67
62
|
#
|
68
63
|
# Options:
|
69
|
-
# -c, --color
|
70
|
-
# -r, --refspec TEXT
|
71
|
-
#
|
72
|
-
# -
|
73
|
-
#
|
74
|
-
# -
|
75
|
-
#
|
76
|
-
# -
|
77
|
-
#
|
78
|
-
# -
|
79
|
-
#
|
80
|
-
#
|
81
|
-
# -
|
82
|
-
#
|
83
|
-
#
|
84
|
-
#
|
64
|
+
# -c, --color / -C, --no-color Force or prevent colorized output
|
65
|
+
# -r, --refspec TEXT <commit 1>..<commit 2> (compare two commits)
|
66
|
+
# or <commit> (compare <commit> to the worktree)
|
67
|
+
# -R, --ref TEXT Shorthand for `-r <ref>^..<ref>`, i.e. inspect
|
68
|
+
# a specific commit (vs. its parent)
|
69
|
+
# -s, --shell-executable TEXT Shell to use for executing commands; defaults
|
70
|
+
# to $SHELL
|
71
|
+
# -S, --no-shell Don't pass `shell=True` to Python
|
72
|
+
# `subprocess`es
|
73
|
+
# -U, --unified INTEGER Number of lines of context to show (passes
|
74
|
+
# through to `diff`)
|
75
|
+
# -v, --verbose Log intermediate commands to stderr
|
76
|
+
# -w, --ignore-whitespace Ignore whitespace differences (pass `-w` to
|
77
|
+
# `diff`)
|
78
|
+
# -x, --exec-cmd TEXT Command(s) to execute before diffing;
|
79
|
+
# alternate syntax to passing commands as
|
80
|
+
# positional arguments
|
81
|
+
# --help Show this message and exit.
|
85
82
|
```
|
86
83
|
|
87
84
|
## Examples <a id="examples"></a>
|
85
|
+
- Examples below are verified with [`mdcmd`] and `$BMDF_WORKDIR=test/data` (see [.github/workflows/ci.yml](.github/workflows/ci.yml)).
|
86
|
+
- [test/data] is a clone of [ryan-williams/dvc-helpers@test], which contains simple DVC-tracked files (used in that repo for testing [`git-diff-dvc.sh`]).
|
87
|
+
|
88
|
+
[`8ec2060`] added a DVC-tracked text file, `test.txt`:
|
89
|
+
|
90
|
+
<!-- `bmdf -- dvc-diff -R 8ec2060 test.txt` -->
|
91
|
+
```bash
|
92
|
+
dvc-diff -R 8ec2060 test.txt
|
93
|
+
# 0a1,10
|
94
|
+
# > 1
|
95
|
+
# > 2
|
96
|
+
# > 3
|
97
|
+
# > 4
|
98
|
+
# > 5
|
99
|
+
# > 6
|
100
|
+
# > 7
|
101
|
+
# > 8
|
102
|
+
# > 9
|
103
|
+
# > 10
|
104
|
+
```
|
105
|
+
|
106
|
+
[`0455b50`] appended some lines to `test.txt`:
|
107
|
+
|
108
|
+
<!-- `bmdf -- dvc-diff -R 0455b50 test.txt` -->
|
109
|
+
```bash
|
110
|
+
dvc-diff -R 0455b50 test.txt
|
111
|
+
# 10a11,15
|
112
|
+
# > 11
|
113
|
+
# > 12
|
114
|
+
# > 13
|
115
|
+
# > 14
|
116
|
+
# > 15
|
117
|
+
```
|
118
|
+
|
119
|
+
[`f92c1d2`] added `test.parquet`:
|
120
|
+
|
121
|
+
<!-- `bmdf -- dvc-diff -R f92c1d2 pqa test.parquet` -->
|
122
|
+
```bash
|
123
|
+
dvc-diff -R f92c1d2 pqa test.parquet
|
124
|
+
# 0a1,27
|
125
|
+
# > MD5: 4379600b26647a50dfcd0daa824e8219
|
126
|
+
# > 1635 bytes
|
127
|
+
# > 5 rows
|
128
|
+
# > message schema {
|
129
|
+
# > OPTIONAL INT64 num;
|
130
|
+
# > OPTIONAL BYTE_ARRAY str (STRING);
|
131
|
+
# > }
|
132
|
+
# > {
|
133
|
+
# > "num": 111,
|
134
|
+
# > "str": "aaa"
|
135
|
+
# > }
|
136
|
+
# > {
|
137
|
+
# > "num": 222,
|
138
|
+
# > "str": "bbb"
|
139
|
+
# > }
|
140
|
+
# > {
|
141
|
+
# > "num": 333,
|
142
|
+
# > "str": "ccc"
|
143
|
+
# > }
|
144
|
+
# > {
|
145
|
+
# > "num": 444,
|
146
|
+
# > "str": "ddd"
|
147
|
+
# > }
|
148
|
+
# > {
|
149
|
+
# > "num": 555,
|
150
|
+
# > "str": "eee"
|
151
|
+
# > }
|
152
|
+
```
|
153
|
+
|
154
|
+
[`f29e52a`] updated `test.parquet`:
|
155
|
+
|
156
|
+
<!-- `bmdf -E PQT_TXT_OPTS=-n2 -- dvc-diff -R f29e52a pqa test.parquet` -->
|
157
|
+
```bash
|
158
|
+
PQT_TXT_OPTS=-n2 dvc-diff -R f29e52a pqa test.parquet
|
159
|
+
# 1,3c1,3
|
160
|
+
# < MD5: 4379600b26647a50dfcd0daa824e8219
|
161
|
+
# < 1635 bytes
|
162
|
+
# < 5 rows
|
163
|
+
# ---
|
164
|
+
# > MD5: be082c87786f3364ca9efec061a3cc21
|
165
|
+
# > 1622 bytes
|
166
|
+
# > 8 rows
|
167
|
+
# 5c5
|
168
|
+
# < OPTIONAL INT64 num;
|
169
|
+
# ---
|
170
|
+
# > OPTIONAL INT32 num;
|
171
|
+
# 19,20c19,20
|
172
|
+
# < "num": 444,
|
173
|
+
# < "str": "ddd"
|
174
|
+
# ---
|
175
|
+
# > "num": 777,
|
176
|
+
# > "str": "ggg"
|
177
|
+
# 23,24c23,24
|
178
|
+
# < "num": 555,
|
179
|
+
# < "str": "eee"
|
180
|
+
# ---
|
181
|
+
# > "num": 888,
|
182
|
+
# > "str": "hhh"
|
183
|
+
```
|
184
|
+
|
185
|
+
[`3257258`] added a DVC-tracked directory `data/` (including `test.{txt,parquet}`), and removed the top-level `test.{txt,parquet}`.
|
186
|
+
|
187
|
+
<!-- `bmdf -- dvc-diff -R 3257258 data` -->
|
188
|
+
```bash
|
189
|
+
dvc-diff -R 3257258 data
|
190
|
+
# test.parquet: None -> c07bba3fae2b64207aa92f422506e4a2
|
191
|
+
# test.txt: None -> e20b902b49a98b1a05ed62804c757f94
|
192
|
+
```
|
193
|
+
|
194
|
+
[`ae8638a`] changed values in `data/test.parquet`, and added rows to `data/test.txt`:
|
195
|
+
|
196
|
+
<!-- `bmdf -- dvc-diff -R ae8638a data` -->
|
197
|
+
```bash
|
198
|
+
dvc-diff -R ae8638a data
|
199
|
+
# test.parquet: c07bba3fae2b64207aa92f422506e4a2 -> f46dd86f608b1dc00993056c9fc55e6e
|
200
|
+
# test.txt: e20b902b49a98b1a05ed62804c757f94 -> 9306ec0709cc72558045559ada26573b
|
201
|
+
```
|
88
202
|
|
89
203
|
### Parquet <a id="parquet-diff"></a>
|
90
204
|
See sample commands and output below for inspecting changes to [a DVC-tracked Parquet file][commit path] in [a given commit][commit].
|
@@ -334,3 +448,16 @@ This helped me see that the data update in question (`c0..c1`) dropped some fiel
|
|
334
448
|
[`kcr`]: https://github.com/ryan-williams/arg-helpers/blob/a8c60809f8878fa38b3c03614778fcf29132538e/.arg-rc#L118
|
335
449
|
[`snc`]: https://github.com/ryan-williams/case-helpers/blob/c40a62a9656f0d52d68fb3a108ae6bb3eed3c7bd/.case-rc#L9
|
336
450
|
[`sdf`]: https://github.com/ryan-williams/arg-helpers/blob/a8c60809f8878fa38b3c03614778fcf29132538e/.arg-rc#L138
|
451
|
+
|
452
|
+
[`mdcmd`]: https://github.com/runsascoded/bash-markdown-fence?tab=readme-ov-file#bmdf
|
453
|
+
[`test/data`]: test/data
|
454
|
+
[test/data]: test/data
|
455
|
+
[ryan-williams/dvc-helpers@test]: https://github.com/ryan-williams/dvc-helpers/tree/test
|
456
|
+
[`git-diff-dvc.sh`]: https://github.com/ryan-williams/dvc-helpers/blob/main/git-diff-dvc.sh
|
457
|
+
|
458
|
+
[`8ec2060`]: https://github.com/ryan-williams/dvc-helpers/commit/8ec2060
|
459
|
+
[`0455b50`]: https://github.com/ryan-williams/dvc-helpers/commit/0455b50
|
460
|
+
[`f92c1d2`]: https://github.com/ryan-williams/dvc-helpers/commit/f92c1d2
|
461
|
+
[`f29e52a`]: https://github.com/ryan-williams/dvc-helpers/commit/f29e52a
|
462
|
+
[`3257258`]: https://github.com/ryan-williams/dvc-helpers/commit/3257258
|
463
|
+
[`ae8638a`]: https://github.com/ryan-williams/dvc-helpers/commit/ae8638a
|
@@ -1,14 +1,3 @@
|
|
1
|
-
Metadata-Version: 2.1
|
2
|
-
Name: dvc-utils
|
3
|
-
Version: 0.1.0
|
4
|
-
Summary: CLI for diffing DVC files at two commits (or one commit vs. current worktree), optionally passing both through another command first
|
5
|
-
Home-page: https://github.com/runsascoded/dvc-utils
|
6
|
-
Author: Ryan Williams
|
7
|
-
Author-email: ryan@runsascoded.com
|
8
|
-
License: MIT
|
9
|
-
Description-Content-Type: text/markdown
|
10
|
-
License-File: LICENSE
|
11
|
-
|
12
1
|
# dvc-utils
|
13
2
|
Diff [DVC] files, optionally piping through other commands first.
|
14
3
|
|
@@ -32,25 +21,11 @@ pip install dvc-utils
|
|
32
21
|
```
|
33
22
|
|
34
23
|
## Usage <a id="usage"></a>
|
35
|
-
|
36
|
-
```bash
|
37
|
-
dvc-utils --help
|
38
|
-
# Usage: dvc-utils [OPTIONS] COMMAND [ARGS]...
|
39
|
-
#
|
40
|
-
# Options:
|
41
|
-
# --help Show this message and exit.
|
42
|
-
#
|
43
|
-
# Commands:
|
44
|
-
# diff Diff a DVC-tracked file at two commits (or one commit vs. current
|
45
|
-
# worktree), optionally passing both through another command first
|
46
|
-
```
|
24
|
+
Currently one command is exposed, `dvc-diff`:
|
47
25
|
|
48
|
-
|
49
|
-
|
50
|
-
### `dvc-diff` <a id="dvc-diff"></a>
|
51
|
-
<!-- `bmdf -- dvc-diff --help` -->
|
26
|
+
<!-- `bmdf -- dvc-diff` -->
|
52
27
|
```bash
|
53
|
-
dvc-diff
|
28
|
+
dvc-diff
|
54
29
|
# Usage: dvc-diff [OPTIONS] [exec_cmd...] <path>
|
55
30
|
#
|
56
31
|
# Diff a file at two commits (or one commit vs. current worktree), optionally
|
@@ -66,25 +41,144 @@ dvc-diff --help
|
|
66
41
|
# optional) at HEAD (last committed value) vs. the current worktree content.
|
67
42
|
#
|
68
43
|
# Options:
|
69
|
-
# -c, --color
|
70
|
-
# -r, --refspec TEXT
|
71
|
-
#
|
72
|
-
# -
|
73
|
-
#
|
74
|
-
# -
|
75
|
-
#
|
76
|
-
# -
|
77
|
-
#
|
78
|
-
# -
|
79
|
-
#
|
80
|
-
#
|
81
|
-
# -
|
82
|
-
#
|
83
|
-
#
|
84
|
-
#
|
44
|
+
# -c, --color / -C, --no-color Force or prevent colorized output
|
45
|
+
# -r, --refspec TEXT <commit 1>..<commit 2> (compare two commits)
|
46
|
+
# or <commit> (compare <commit> to the worktree)
|
47
|
+
# -R, --ref TEXT Shorthand for `-r <ref>^..<ref>`, i.e. inspect
|
48
|
+
# a specific commit (vs. its parent)
|
49
|
+
# -s, --shell-executable TEXT Shell to use for executing commands; defaults
|
50
|
+
# to $SHELL
|
51
|
+
# -S, --no-shell Don't pass `shell=True` to Python
|
52
|
+
# `subprocess`es
|
53
|
+
# -U, --unified INTEGER Number of lines of context to show (passes
|
54
|
+
# through to `diff`)
|
55
|
+
# -v, --verbose Log intermediate commands to stderr
|
56
|
+
# -w, --ignore-whitespace Ignore whitespace differences (pass `-w` to
|
57
|
+
# `diff`)
|
58
|
+
# -x, --exec-cmd TEXT Command(s) to execute before diffing;
|
59
|
+
# alternate syntax to passing commands as
|
60
|
+
# positional arguments
|
61
|
+
# --help Show this message and exit.
|
85
62
|
```
|
86
63
|
|
87
64
|
## Examples <a id="examples"></a>
|
65
|
+
- Examples below are verified with [`mdcmd`] and `$BMDF_WORKDIR=test/data` (see [.github/workflows/ci.yml](.github/workflows/ci.yml)).
|
66
|
+
- [test/data] is a clone of [ryan-williams/dvc-helpers@test], which contains simple DVC-tracked files (used in that repo for testing [`git-diff-dvc.sh`]).
|
67
|
+
|
68
|
+
[`8ec2060`] added a DVC-tracked text file, `test.txt`:
|
69
|
+
|
70
|
+
<!-- `bmdf -- dvc-diff -R 8ec2060 test.txt` -->
|
71
|
+
```bash
|
72
|
+
dvc-diff -R 8ec2060 test.txt
|
73
|
+
# 0a1,10
|
74
|
+
# > 1
|
75
|
+
# > 2
|
76
|
+
# > 3
|
77
|
+
# > 4
|
78
|
+
# > 5
|
79
|
+
# > 6
|
80
|
+
# > 7
|
81
|
+
# > 8
|
82
|
+
# > 9
|
83
|
+
# > 10
|
84
|
+
```
|
85
|
+
|
86
|
+
[`0455b50`] appended some lines to `test.txt`:
|
87
|
+
|
88
|
+
<!-- `bmdf -- dvc-diff -R 0455b50 test.txt` -->
|
89
|
+
```bash
|
90
|
+
dvc-diff -R 0455b50 test.txt
|
91
|
+
# 10a11,15
|
92
|
+
# > 11
|
93
|
+
# > 12
|
94
|
+
# > 13
|
95
|
+
# > 14
|
96
|
+
# > 15
|
97
|
+
```
|
98
|
+
|
99
|
+
[`f92c1d2`] added `test.parquet`:
|
100
|
+
|
101
|
+
<!-- `bmdf -- dvc-diff -R f92c1d2 pqa test.parquet` -->
|
102
|
+
```bash
|
103
|
+
dvc-diff -R f92c1d2 pqa test.parquet
|
104
|
+
# 0a1,27
|
105
|
+
# > MD5: 4379600b26647a50dfcd0daa824e8219
|
106
|
+
# > 1635 bytes
|
107
|
+
# > 5 rows
|
108
|
+
# > message schema {
|
109
|
+
# > OPTIONAL INT64 num;
|
110
|
+
# > OPTIONAL BYTE_ARRAY str (STRING);
|
111
|
+
# > }
|
112
|
+
# > {
|
113
|
+
# > "num": 111,
|
114
|
+
# > "str": "aaa"
|
115
|
+
# > }
|
116
|
+
# > {
|
117
|
+
# > "num": 222,
|
118
|
+
# > "str": "bbb"
|
119
|
+
# > }
|
120
|
+
# > {
|
121
|
+
# > "num": 333,
|
122
|
+
# > "str": "ccc"
|
123
|
+
# > }
|
124
|
+
# > {
|
125
|
+
# > "num": 444,
|
126
|
+
# > "str": "ddd"
|
127
|
+
# > }
|
128
|
+
# > {
|
129
|
+
# > "num": 555,
|
130
|
+
# > "str": "eee"
|
131
|
+
# > }
|
132
|
+
```
|
133
|
+
|
134
|
+
[`f29e52a`] updated `test.parquet`:
|
135
|
+
|
136
|
+
<!-- `bmdf -E PQT_TXT_OPTS=-n2 -- dvc-diff -R f29e52a pqa test.parquet` -->
|
137
|
+
```bash
|
138
|
+
PQT_TXT_OPTS=-n2 dvc-diff -R f29e52a pqa test.parquet
|
139
|
+
# 1,3c1,3
|
140
|
+
# < MD5: 4379600b26647a50dfcd0daa824e8219
|
141
|
+
# < 1635 bytes
|
142
|
+
# < 5 rows
|
143
|
+
# ---
|
144
|
+
# > MD5: be082c87786f3364ca9efec061a3cc21
|
145
|
+
# > 1622 bytes
|
146
|
+
# > 8 rows
|
147
|
+
# 5c5
|
148
|
+
# < OPTIONAL INT64 num;
|
149
|
+
# ---
|
150
|
+
# > OPTIONAL INT32 num;
|
151
|
+
# 19,20c19,20
|
152
|
+
# < "num": 444,
|
153
|
+
# < "str": "ddd"
|
154
|
+
# ---
|
155
|
+
# > "num": 777,
|
156
|
+
# > "str": "ggg"
|
157
|
+
# 23,24c23,24
|
158
|
+
# < "num": 555,
|
159
|
+
# < "str": "eee"
|
160
|
+
# ---
|
161
|
+
# > "num": 888,
|
162
|
+
# > "str": "hhh"
|
163
|
+
```
|
164
|
+
|
165
|
+
[`3257258`] added a DVC-tracked directory `data/` (including `test.{txt,parquet}`), and removed the top-level `test.{txt,parquet}`.
|
166
|
+
|
167
|
+
<!-- `bmdf -- dvc-diff -R 3257258 data` -->
|
168
|
+
```bash
|
169
|
+
dvc-diff -R 3257258 data
|
170
|
+
# test.parquet: None -> c07bba3fae2b64207aa92f422506e4a2
|
171
|
+
# test.txt: None -> e20b902b49a98b1a05ed62804c757f94
|
172
|
+
```
|
173
|
+
|
174
|
+
[`ae8638a`] changed values in `data/test.parquet`, and added rows to `data/test.txt`:
|
175
|
+
|
176
|
+
<!-- `bmdf -- dvc-diff -R ae8638a data` -->
|
177
|
+
```bash
|
178
|
+
dvc-diff -R ae8638a data
|
179
|
+
# test.parquet: c07bba3fae2b64207aa92f422506e4a2 -> f46dd86f608b1dc00993056c9fc55e6e
|
180
|
+
# test.txt: e20b902b49a98b1a05ed62804c757f94 -> 9306ec0709cc72558045559ada26573b
|
181
|
+
```
|
88
182
|
|
89
183
|
### Parquet <a id="parquet-diff"></a>
|
90
184
|
See sample commands and output below for inspecting changes to [a DVC-tracked Parquet file][commit path] in [a given commit][commit].
|
@@ -334,3 +428,16 @@ This helped me see that the data update in question (`c0..c1`) dropped some fiel
|
|
334
428
|
[`kcr`]: https://github.com/ryan-williams/arg-helpers/blob/a8c60809f8878fa38b3c03614778fcf29132538e/.arg-rc#L118
|
335
429
|
[`snc`]: https://github.com/ryan-williams/case-helpers/blob/c40a62a9656f0d52d68fb3a108ae6bb3eed3c7bd/.case-rc#L9
|
336
430
|
[`sdf`]: https://github.com/ryan-williams/arg-helpers/blob/a8c60809f8878fa38b3c03614778fcf29132538e/.arg-rc#L138
|
431
|
+
|
432
|
+
[`mdcmd`]: https://github.com/runsascoded/bash-markdown-fence?tab=readme-ov-file#bmdf
|
433
|
+
[`test/data`]: test/data
|
434
|
+
[test/data]: test/data
|
435
|
+
[ryan-williams/dvc-helpers@test]: https://github.com/ryan-williams/dvc-helpers/tree/test
|
436
|
+
[`git-diff-dvc.sh`]: https://github.com/ryan-williams/dvc-helpers/blob/main/git-diff-dvc.sh
|
437
|
+
|
438
|
+
[`8ec2060`]: https://github.com/ryan-williams/dvc-helpers/commit/8ec2060
|
439
|
+
[`0455b50`]: https://github.com/ryan-williams/dvc-helpers/commit/0455b50
|
440
|
+
[`f92c1d2`]: https://github.com/ryan-williams/dvc-helpers/commit/f92c1d2
|
441
|
+
[`f29e52a`]: https://github.com/ryan-williams/dvc-helpers/commit/f29e52a
|
442
|
+
[`3257258`]: https://github.com/ryan-williams/dvc-helpers/commit/3257258
|
443
|
+
[`ae8638a`]: https://github.com/ryan-williams/dvc-helpers/commit/ae8638a
|
@@ -0,0 +1,40 @@
|
|
1
|
+
[build-system]
|
2
|
+
requires = ["setuptools>=75"]
|
3
|
+
build-backend = "setuptools.build_meta"
|
4
|
+
|
5
|
+
[project]
|
6
|
+
name = "dvc-utils"
|
7
|
+
version = "0.3.0"
|
8
|
+
description = "CLI for diffing DVC-tracked files at two commits (or one commit vs. current worktree), optionally passing both through another command first"
|
9
|
+
readme = "README.md"
|
10
|
+
license = {text = "MIT"}
|
11
|
+
authors = [
|
12
|
+
{name = "Ryan Williams", email = "ryan@runsascoded.com"}
|
13
|
+
]
|
14
|
+
requires-python = ">=3.9"
|
15
|
+
dependencies = [
|
16
|
+
"click",
|
17
|
+
"dffs>=0.0.5",
|
18
|
+
"pyyaml",
|
19
|
+
"utz>=0.20.0",
|
20
|
+
]
|
21
|
+
|
22
|
+
[project.optional-dependencies]
|
23
|
+
ci = [
|
24
|
+
"bmdf==0.5.2",
|
25
|
+
"dvc-s3",
|
26
|
+
]
|
27
|
+
|
28
|
+
[project.urls]
|
29
|
+
Homepage = "https://github.com/runsascoded/dvc-utils"
|
30
|
+
"Author URL" = "https://github.com/ryan-williams"
|
31
|
+
|
32
|
+
[project.scripts]
|
33
|
+
dvc-utils = "dvc_utils.main:main"
|
34
|
+
dvc-diff = "dvc_utils.diff:dvc_diff"
|
35
|
+
|
36
|
+
[tool.setuptools]
|
37
|
+
package-dir = {"" = "src"}
|
38
|
+
|
39
|
+
[tool.setuptools.packages.find]
|
40
|
+
where = ["src"]
|
@@ -1,2 +1,2 @@
|
|
1
|
-
from . import cli, path
|
1
|
+
from . import cli, diff, path
|
2
2
|
from .path import dvc_cache_dir, dvc_md5, dvc_paths, dvc_path
|
@@ -0,0 +1,17 @@
|
|
1
|
+
import json
|
2
|
+
import shlex
|
3
|
+
from os import listdir
|
4
|
+
from os.path import isdir, join
|
5
|
+
from typing import Tuple
|
6
|
+
|
7
|
+
import click
|
8
|
+
from click import option, argument, group
|
9
|
+
from dffs import join_pipelines
|
10
|
+
from utz import process, err, hash_file
|
11
|
+
|
12
|
+
from dvc_utils.path import dvc_paths, dvc_cache_path
|
13
|
+
|
14
|
+
|
15
|
+
# Root `click` command group for the package. Sub-commands attach themselves to
# it from their own modules via `@cli.command(...)` (e.g. `diff`, `pull-x`).
# Documented with comments rather than a docstring because click would display
# a docstring as the group's help text.
@group()
def cli():
    # Intentionally empty: the group only dispatches to registered sub-commands.
    pass
|
@@ -0,0 +1,124 @@
|
|
1
|
+
import json
|
2
|
+
import shlex
|
3
|
+
from os import listdir
|
4
|
+
from os.path import isdir, join
|
5
|
+
from typing import Tuple
|
6
|
+
|
7
|
+
import click
|
8
|
+
from click import option, argument, group
|
9
|
+
from dffs import join_pipelines
|
10
|
+
from utz import process, err, hash_file
|
11
|
+
|
12
|
+
from dvc_utils.cli import cli
|
13
|
+
from dvc_utils.path import dvc_paths, dvc_cache_path
|
14
|
+
|
15
|
+
|
16
|
+
def _load_dir_manifest(manifest_path):
    """Load the JSON file at ``manifest_path`` into a ``{relpath: md5}`` dict.

    The file is expected to contain a JSON list of objects, each with
    ``"relpath"`` and ``"md5"`` keys.
    """
    with open(manifest_path, 'r') as f:
        return { entry["relpath"]: entry["md5"] for entry in json.load(f) }


# `diff` sub-command (also the target of the `dvc-diff` console script).
#
# The trailing positional argument is the path to diff; any positional
# arguments before it, plus any `-x/--exec-cmd` values, form a pipeline that
# both sides are passed through before `diff` runs.
#
# NOTE: the docstring below is rendered by click as the command's help text, so
# implementation notes live in comments instead.
#
# Annotations are written as strings because `X | None` is only valid at
# runtime on Python >= 3.10, while the package declares `requires-python >= 3.9`
# and this module does not use `from __future__ import annotations`.
@cli.command(
    'diff',
    short_help='Diff a DVC-tracked file at two commits (or one commit vs. current worktree), optionally passing both through another command first',
    no_args_is_help=True,
)
@option('-c/-C', '--color/--no-color', default=None, help='Force or prevent colorized output')
@option('-r', '--refspec', help='<commit 1>..<commit 2> (compare two commits) or <commit> (compare <commit> to the worktree)')
@option('-R', '--ref', help='Shorthand for `-r <ref>^..<ref>`, i.e. inspect a specific commit (vs. its parent)')
@option('-s', '--shell-executable', help='Shell to use for executing commands; defaults to $SHELL')
@option('-S', '--no-shell', is_flag=True, help="Don't pass `shell=True` to Python `subprocess`es")
@option('-U', '--unified', type=int, help='Number of lines of context to show (passes through to `diff`)')
@option('-v', '--verbose', is_flag=True, help="Log intermediate commands to stderr")
@option('-w', '--ignore-whitespace', is_flag=True, help="Ignore whitespace differences (pass `-w` to `diff`)")
@option('-x', '--exec-cmd', 'exec_cmds', multiple=True, help='Command(s) to execute before diffing; alternate syntax to passing commands as positional arguments')
@argument('args', metavar='[exec_cmd...] <path>', nargs=-1)
def dvc_diff(
    color: "bool | None",
    refspec: "str | None",
    ref: "str | None",
    shell_executable: "str | None",
    no_shell: bool,
    unified: "int | None",
    verbose: bool,
    ignore_whitespace: bool,
    exec_cmds: Tuple[str, ...],
    args: Tuple[str, ...],
):
    """Diff a file at two commits (or one commit vs. current worktree), optionally passing both through `cmd` first

    Examples:

    dvc-utils diff -r HEAD^..HEAD wc -l foo.dvc # Compare the number of lines (`wc -l`) in `foo` (the file referenced by `foo.dvc`) at the previous vs. current commit (`HEAD^..HEAD`).

    dvc-utils diff md5sum foo # Diff the `md5sum` of `foo` (".dvc" extension is optional) at HEAD (last committed value) vs. the current worktree content.
    """
    import sys  # local import: only needed to propagate `diff`'s exit code

    if not args:
        raise click.UsageError('Must specify [cmd...] <path>')

    shell = not no_shell
    # Last positional is the path; everything before it is a command.
    # `-x` commands run ahead of positional ones.
    *cmds, path = args
    cmds = list(exec_cmds) + cmds

    path, dvc_path = dvc_paths(path)

    if refspec and ref:
        # Report as a usage error (like the check above) instead of a raw traceback.
        raise click.UsageError("Specify -r/--refspec xor -R/--ref")
    if ref:
        refspec = f'{ref}^..{ref}'
    elif not refspec:
        refspec = 'HEAD'

    # "<before>..<after>" compares two commits; a bare "<before>" compares that
    # commit to the worktree (signalled by `after is None`).
    before, dots, after = refspec.partition('..')
    if not dots:
        after = None

    log = err if verbose else False
    path1 = dvc_cache_path(before, dvc_path, log=log)
    path2 = path if after is None else dvc_cache_path(after, dvc_path, log=log)

    if isdir(path):
        # Directory: compare per-file MD5s instead of running `diff`.
        dir_json1 = _load_dir_manifest(path1) if path1 else {}
        if not path2:
            dir_json2 = {}
        elif after is None:
            # Worktree side: hash the directory's immediate entries.
            # NOTE(review): `listdir` is not recursive, so nested
            # sub-directories are presumably not handled here; confirm.
            dir_json2 = {
                file: hash_file(join(path2, file), hash_name='md5')
                for file in listdir(path2)
            }
        else:
            dir_json2 = _load_dir_manifest(path2)

        for relpath in sorted(set(dir_json1) | set(dir_json2)):
            md5_1 = dir_json1.get(relpath)
            md5_2 = dir_json2.get(relpath)
            if md5_1 != md5_2:
                print(f'{relpath}: {md5_1} -> {md5_2}')
    else:
        diff_args = [
            *(['-w'] if ignore_whitespace else []),
            *(['-U', str(unified)] if unified is not None else []),
            *(['--color=always'] if color is True else ['--color=never'] if color is False else []),
        ]
        if cmds:
            # The first command receives the file path as its argument; the
            # remaining commands are passed along as subsequent pipeline stages.
            cmd, *sub_cmds = cmds
            cmds1 = [ 'cat /dev/null' ] if path1 is None else [ f'{cmd} {path1 or "/dev/null"}', *sub_cmds ]
            cmds2 = [ 'cat /dev/null' ] if path2 is None else [ f'{cmd} {path2 or "/dev/null"}', *sub_cmds ]
            if not shell:
                # Without a shell, each stage must be an argv list.
                cmds1 = [ shlex.split(c) for c in cmds1 ]
                cmds2 = [ shlex.split(c) for c in cmds2 ]

            join_pipelines(
                base_cmd=['diff', *diff_args],
                cmds1=cmds1,
                cmds2=cmds2,
                verbose=verbose,
                shell=shell,
                executable=shell_executable,
            )
        else:
            res = process.run('diff', *diff_args, path1 or '/dev/null', path2 or '/dev/null', log=log, check=False)
            # `sys.exit` rather than the `site`-provided `exit` builtin, which
            # is meant for interactive use and is absent under `python -S`.
            sys.exit(res.returncode)
|
@@ -0,0 +1,106 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import json
|
4
|
+
from functools import cache
|
5
|
+
from os import environ as env, getcwd
|
6
|
+
from os.path import join, relpath, dirname, basename, sep
|
7
|
+
from subprocess import DEVNULL
|
8
|
+
from typing import Tuple
|
9
|
+
|
10
|
+
import yaml
|
11
|
+
from utz import process, err, singleton
|
12
|
+
|
13
|
+
|
14
|
+
def dvc_paths(path: str) -> Tuple[str, str]:
    """Return ``(data_path, dvc_file_path)`` for ``path``.

    ``path`` may name either the tracked data path or its ``.dvc`` file. One
    trailing path separator is dropped first, so ``"data/"`` is treated like
    ``"data"``.
    """
    suffix = '.dvc'
    if path.endswith(sep):
        path = path[:-len(sep)]
    if not path.endswith(suffix):
        # Given the data path: derive the `.dvc` file name.
        return path, f'{path}{suffix}'
    # Given the `.dvc` file: strip the extension to get the data path.
    return path[:-len(suffix)], path
|
23
|
+
|
24
|
+
|
25
|
+
@cache
def get_git_root() -> str:
    """Return the result of ``git rev-parse --show-toplevel``.

    That command prints the top-level directory of the current Git working
    tree. The result is memoized, so Git is invoked at most once per process.
    """
    toplevel_cmd = ('git', 'rev-parse', '--show-toplevel')
    return process.line(*toplevel_cmd, log=False)
|
28
|
+
|
29
|
+
|
30
|
+
@cache
def get_dir_path() -> str:
    """Return the current working directory relative to the Git root.

    The value is ``"."`` when the working directory is the root itself. It is
    memoized, so a later change of directory is not reflected.
    """
    cwd = getcwd()
    git_root = get_git_root()
    return relpath(cwd, git_root)
|
33
|
+
|
34
|
+
|
35
|
+
@cache
def dvc_cache_dir(log: bool = False) -> str:
    """Return the DVC cache directory.

    If the ``DVC_UTILS_CACHE_DIR`` environment variable is set (and non-empty),
    it is joined onto the Git root; otherwise the output line of
    ``dvc cache dir`` is returned. Results are memoized per ``log`` value, so
    the environment is consulted only on the first call for each value.
    """
    override = env.get('DVC_UTILS_CACHE_DIR')
    if not override:
        return process.line('dvc', 'cache', 'dir', log=log)
    return join(get_git_root(), override)
|
42
|
+
|
43
|
+
|
44
|
+
def _show_dvc_spec(git_ref: str, root_dvc_path: str, log: bool = False):
    """Return the output of ``git show <git_ref>:<root_dvc_path>``.

    ``root_dvc_path`` must already be relative to the Git root. The result is
    falsy when the command produces no output (errors are tolerated via
    ``err_ok=True``).
    """
    return process.output(
        'git', 'show', f'{git_ref}:{root_dvc_path}',
        err_ok=True,
        log=err if log else None,
        stderr=None if log else DEVNULL,
    )


def _spec_md5(dvc_spec):
    """Parse ``.dvc`` YAML content and return the ``md5`` of its single ``outs`` entry."""
    dvc_obj = yaml.safe_load(dvc_spec)
    out = singleton(dvc_obj['outs'], dedupe=False)
    return out['md5']


def dvc_md5(
    git_ref: str,
    dvc_path: str,
    log: bool = False,
) -> str | None:
    """Return the MD5 recorded for ``dvc_path`` at ``git_ref``, or ``None``.

    ``dvc_path`` is a ``.dvc`` file path relative to the current working
    directory. If no such file exists at ``git_ref``, each ancestor directory
    is checked (nearest first) for a ``<dir>.dvc`` file; when one is found, the
    MD5 is looked up by relative path in that directory's cached JSON listing.

    Returns:
        The MD5 string, or ``None`` if neither the ``.dvc`` file nor a tracked
        ancestor directory was found.

    Raises:
        RuntimeError: a tracked ancestor directory was found, but its listing
            does not contain exactly one entry for the requested path.
    """
    # `git show <ref>:<path>` takes paths relative to the repo root, so prefix
    # the cwd's root-relative location exactly once.
    dir_path = get_dir_path()
    dir_path = '' if dir_path == '.' else f'{dir_path}{sep}'
    dvc_path = f"{dir_path}{dvc_path}"

    dvc_spec = _show_dvc_spec(git_ref, dvc_path, log)
    if dvc_spec:
        return _spec_md5(dvc_spec)

    cur_dir = dirname(dvc_path)
    entry = basename(dvc_path)
    if entry.endswith(".dvc"):
        entry = entry[:-len(".dvc")]

    # Walk up through ancestors using root-relative paths directly. Going back
    # through `dvc_cache_path(ref, path)` here would re-apply the cwd prefix and
    # recursively walk ancestors again, breaking lookups outside the Git root
    # and for files nested more than one level inside a tracked directory.
    while cur_dir and cur_dir != '.':
        dir_spec = _show_dvc_spec(git_ref, f"{cur_dir}.dvc", log)
        dir_md5 = _spec_md5(dir_spec) if dir_spec else None
        if dir_md5:
            # With no `dvc_path` argument, `dvc_cache_path` treats its first
            # argument as an MD5 and returns the corresponding cache file path.
            dir_cache_path = dvc_cache_path(dir_md5, log=log)
            with open(dir_cache_path, 'r') as f:
                dir_entries = json.load(f)
            md5s = [ e["md5"] for e in dir_entries if e["relpath"] == entry ]
            if len(md5s) == 1:
                return md5s[0]
            raise RuntimeError(f"relpath={entry!r} not found in DVC-tracked dir {cur_dir}")
        parent = dirname(cur_dir)
        if parent == cur_dir:
            # `dirname` reached a fixed point (e.g. a filesystem root); stop
            # rather than loop forever.
            break
        entry = join(basename(cur_dir), entry)
        cur_dir = parent
    return None
|
80
|
+
|
81
|
+
|
82
|
+
def dvc_path(
    ref: str,
    dvc_path: str | None = None,
    log: bool = False,
) -> str | None:
    """Return the DVC cache file path for a tracked object, or ``None``.

    The MD5 is resolved in one of three ways:

    - ``dvc_path`` given: ``ref`` is a Git ref and the MD5 is looked up via
      ``dvc_md5`` (a missing ``.dvc`` extension is appended first).
    - ``ref`` of the form ``"<git_ref>:<dvc_path>"``: split on the first ``:``
      and looked up via ``dvc_md5`` (the path is used as given).
    - otherwise: ``ref`` is taken to be the MD5 itself.

    Returns ``None`` when the lookup yields no MD5; otherwise
    ``<cache dir>/files/md5/<first 2 chars>/<remaining chars>``.
    """
    if dvc_path:
        if not dvc_path.endswith('.dvc'):
            dvc_path = f'{dvc_path}.dvc'
        md5 = dvc_md5(ref, dvc_path, log=log)
    elif ':' in ref:
        git_ref, _, spec_path = ref.partition(':')
        md5 = dvc_md5(git_ref, spec_path, log=log)
    else:
        md5 = ref

    if md5 is None:
        return None

    prefix, rest = md5[:2], md5[2:]
    return join(dvc_cache_dir(log=log), 'files', 'md5', prefix, rest)
|
104
|
+
|
105
|
+
|
106
|
+
# Alias for `dvc_path`: lets callers refer to the function even where a
# parameter or local variable named `dvc_path` shadows it.
dvc_cache_path = dvc_path
|
@@ -0,0 +1,13 @@
|
|
1
|
+
from click import option
|
2
|
+
|
3
|
+
from dvc_utils.cli import cli
|
4
|
+
from git import Repo
|
5
|
+
|
6
|
+
|
7
|
+
# `pull-x` sub-command.
# NOTE(review): this looks like an unfinished stub; the body only constructs a
# `Repo` and never reads `dry_run`, `paths` or `refs`. Confirm before relying on it.
# NOTE(review): `git` (imported in this module for `Repo`) is not among the
# dependencies declared in pyproject.toml; verify it is installed wherever this
# module gets imported.
# Documented with comments rather than a docstring because click would display
# a docstring as the command's help text.
@cli.command('pull-x', short_help='Sync DVC cache files from an S3 remote')
@option('-n', '--dry-run', is_flag=True, help='Print files that would be synced, don\'t actually perform sync')
@option('-p', '--path', 'paths', multiple=True, help='Path globs to sync')
@option('-r', '--ref', 'refs', multiple=True, help='Git refs to sync DVC files from')
def pull_x(dry_run, paths, refs):
    # Presumably opens the repository for the current directory; result is unused so far.
    repo = Repo()
|
13
|
+
|
@@ -1,3 +1,23 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: dvc-utils
|
3
|
+
Version: 0.3.0
|
4
|
+
Summary: CLI for diffing DVC-tracked files at two commits (or one commit vs. current worktree), optionally passing both through another command first
|
5
|
+
Author-email: Ryan Williams <ryan@runsascoded.com>
|
6
|
+
License: MIT
|
7
|
+
Project-URL: Homepage, https://github.com/runsascoded/dvc-utils
|
8
|
+
Project-URL: Author URL, https://github.com/ryan-williams
|
9
|
+
Requires-Python: >=3.9
|
10
|
+
Description-Content-Type: text/markdown
|
11
|
+
License-File: LICENSE
|
12
|
+
Requires-Dist: click
|
13
|
+
Requires-Dist: dffs>=0.0.5
|
14
|
+
Requires-Dist: pyyaml
|
15
|
+
Requires-Dist: utz>=0.20.0
|
16
|
+
Provides-Extra: ci
|
17
|
+
Requires-Dist: bmdf==0.5.2; extra == "ci"
|
18
|
+
Requires-Dist: dvc-s3; extra == "ci"
|
19
|
+
Dynamic: license-file
|
20
|
+
|
1
21
|
# dvc-utils
|
2
22
|
Diff [DVC] files, optionally piping through other commands first.
|
3
23
|
|
@@ -21,25 +41,11 @@ pip install dvc-utils
|
|
21
41
|
```
|
22
42
|
|
23
43
|
## Usage <a id="usage"></a>
|
24
|
-
|
25
|
-
```bash
|
26
|
-
dvc-utils --help
|
27
|
-
# Usage: dvc-utils [OPTIONS] COMMAND [ARGS]...
|
28
|
-
#
|
29
|
-
# Options:
|
30
|
-
# --help Show this message and exit.
|
31
|
-
#
|
32
|
-
# Commands:
|
33
|
-
# diff Diff a DVC-tracked file at two commits (or one commit vs. current
|
34
|
-
# worktree), optionally passing both through another command first
|
35
|
-
```
|
36
|
-
|
37
|
-
The single subcommand, `dvc-utils diff`, is also exposed directly as `dvc-dff`:
|
44
|
+
Currently one command is exposed, `dvc-diff`:
|
38
45
|
|
39
|
-
|
40
|
-
<!-- `bmdf -- dvc-diff --help` -->
|
46
|
+
<!-- `bmdf -- dvc-diff` -->
|
41
47
|
```bash
|
42
|
-
dvc-diff
|
48
|
+
dvc-diff
|
43
49
|
# Usage: dvc-diff [OPTIONS] [exec_cmd...] <path>
|
44
50
|
#
|
45
51
|
# Diff a file at two commits (or one commit vs. current worktree), optionally
|
@@ -55,25 +61,144 @@ dvc-diff --help
|
|
55
61
|
# optional) at HEAD (last committed value) vs. the current worktree content.
|
56
62
|
#
|
57
63
|
# Options:
|
58
|
-
# -c, --color
|
59
|
-
# -r, --refspec TEXT
|
60
|
-
#
|
61
|
-
# -
|
62
|
-
#
|
63
|
-
# -
|
64
|
-
#
|
65
|
-
# -
|
66
|
-
#
|
67
|
-
# -
|
68
|
-
#
|
69
|
-
#
|
70
|
-
# -
|
71
|
-
#
|
72
|
-
#
|
73
|
-
#
|
64
|
+
# -c, --color / -C, --no-color Force or prevent colorized output
|
65
|
+
# -r, --refspec TEXT <commit 1>..<commit 2> (compare two commits)
|
66
|
+
# or <commit> (compare <commit> to the worktree)
|
67
|
+
# -R, --ref TEXT Shorthand for `-r <ref>^..<ref>`, i.e. inspect
|
68
|
+
# a specific commit (vs. its parent)
|
69
|
+
# -s, --shell-executable TEXT Shell to use for executing commands; defaults
|
70
|
+
# to $SHELL
|
71
|
+
# -S, --no-shell Don't pass `shell=True` to Python
|
72
|
+
# `subprocess`es
|
73
|
+
# -U, --unified INTEGER Number of lines of context to show (passes
|
74
|
+
# through to `diff`)
|
75
|
+
# -v, --verbose Log intermediate commands to stderr
|
76
|
+
# -w, --ignore-whitespace Ignore whitespace differences (pass `-w` to
|
77
|
+
# `diff`)
|
78
|
+
# -x, --exec-cmd TEXT Command(s) to execute before diffing;
|
79
|
+
# alternate syntax to passing commands as
|
80
|
+
# positional arguments
|
81
|
+
# --help Show this message and exit.
|
74
82
|
```
|
75
83
|
|
76
84
|
## Examples <a id="examples"></a>
|
85
|
+
- Examples below are verified with [`mdcmd`] and `$BMDF_WORKDIR=test/data` (see [.github/workflows/ci.yml](.github/workflows/ci.yml)).
|
86
|
+
- [test/data] is a clone of [ryan-williams/dvc-helpers@test], which contains simple DVC-tracked files (used in that repo for testing [`git-diff-dvc.sh`]).
|
87
|
+
|
88
|
+
[`8ec2060`] added a DVC-tracked text file, `test.txt`:
|
89
|
+
|
90
|
+
<!-- `bmdf -- dvc-diff -R 8ec2060 test.txt` -->
|
91
|
+
```bash
|
92
|
+
dvc-diff -R 8ec2060 test.txt
|
93
|
+
# 0a1,10
|
94
|
+
# > 1
|
95
|
+
# > 2
|
96
|
+
# > 3
|
97
|
+
# > 4
|
98
|
+
# > 5
|
99
|
+
# > 6
|
100
|
+
# > 7
|
101
|
+
# > 8
|
102
|
+
# > 9
|
103
|
+
# > 10
|
104
|
+
```
|
105
|
+
|
106
|
+
[`0455b50`] appended some lines to `test.txt`:
|
107
|
+
|
108
|
+
<!-- `bmdf -- dvc-diff -R 0455b50 test.txt` -->
|
109
|
+
```bash
|
110
|
+
dvc-diff -R 0455b50 test.txt
|
111
|
+
# 10a11,15
|
112
|
+
# > 11
|
113
|
+
# > 12
|
114
|
+
# > 13
|
115
|
+
# > 14
|
116
|
+
# > 15
|
117
|
+
```
|
118
|
+
|
119
|
+
[`f92c1d2`] added `test.parquet`:
|
120
|
+
|
121
|
+
<!-- `bmdf -- dvc-diff -R f92c1d2 pqa test.parquet` -->
|
122
|
+
```bash
|
123
|
+
dvc-diff -R f92c1d2 pqa test.parquet
|
124
|
+
# 0a1,27
|
125
|
+
# > MD5: 4379600b26647a50dfcd0daa824e8219
|
126
|
+
# > 1635 bytes
|
127
|
+
# > 5 rows
|
128
|
+
# > message schema {
|
129
|
+
# > OPTIONAL INT64 num;
|
130
|
+
# > OPTIONAL BYTE_ARRAY str (STRING);
|
131
|
+
# > }
|
132
|
+
# > {
|
133
|
+
# > "num": 111,
|
134
|
+
# > "str": "aaa"
|
135
|
+
# > }
|
136
|
+
# > {
|
137
|
+
# > "num": 222,
|
138
|
+
# > "str": "bbb"
|
139
|
+
# > }
|
140
|
+
# > {
|
141
|
+
# > "num": 333,
|
142
|
+
# > "str": "ccc"
|
143
|
+
# > }
|
144
|
+
# > {
|
145
|
+
# > "num": 444,
|
146
|
+
# > "str": "ddd"
|
147
|
+
# > }
|
148
|
+
# > {
|
149
|
+
# > "num": 555,
|
150
|
+
# > "str": "eee"
|
151
|
+
# > }
|
152
|
+
```
|
153
|
+
|
154
|
+
[`f29e52a`] updated `test.parquet`:
|
155
|
+
|
156
|
+
<!-- `bmdf -E PQT_TXT_OPTS=-n2 -- dvc-diff -R f29e52a pqa test.parquet` -->
|
157
|
+
```bash
|
158
|
+
PQT_TXT_OPTS=-n2 dvc-diff -R f29e52a pqa test.parquet
|
159
|
+
# 1,3c1,3
|
160
|
+
# < MD5: 4379600b26647a50dfcd0daa824e8219
|
161
|
+
# < 1635 bytes
|
162
|
+
# < 5 rows
|
163
|
+
# ---
|
164
|
+
# > MD5: be082c87786f3364ca9efec061a3cc21
|
165
|
+
# > 1622 bytes
|
166
|
+
# > 8 rows
|
167
|
+
# 5c5
|
168
|
+
# < OPTIONAL INT64 num;
|
169
|
+
# ---
|
170
|
+
# > OPTIONAL INT32 num;
|
171
|
+
# 19,20c19,20
|
172
|
+
# < "num": 444,
|
173
|
+
# < "str": "ddd"
|
174
|
+
# ---
|
175
|
+
# > "num": 777,
|
176
|
+
# > "str": "ggg"
|
177
|
+
# 23,24c23,24
|
178
|
+
# < "num": 555,
|
179
|
+
# < "str": "eee"
|
180
|
+
# ---
|
181
|
+
# > "num": 888,
|
182
|
+
# > "str": "hhh"
|
183
|
+
```
|
184
|
+
|
185
|
+
[`3257258`] added a DVC-tracked directory `data/` (including `test.{txt,parquet}`), and removed the top-level `test.{txt,parquet}`.
|
186
|
+
|
187
|
+
<!-- `bmdf -- dvc-diff -R 3257258 data` -->
|
188
|
+
```bash
|
189
|
+
dvc-diff -R 3257258 data
|
190
|
+
# test.parquet: None -> c07bba3fae2b64207aa92f422506e4a2
|
191
|
+
# test.txt: None -> e20b902b49a98b1a05ed62804c757f94
|
192
|
+
```
|
193
|
+
|
194
|
+
[`ae8638a`] changed values in `data/test.parquet`, and added rows to `data/test.txt`:
|
195
|
+
|
196
|
+
<!-- `bmdf -- dvc-diff -R ae8638a data` -->
|
197
|
+
```bash
|
198
|
+
dvc-diff -R ae8638a data
|
199
|
+
# test.parquet: c07bba3fae2b64207aa92f422506e4a2 -> f46dd86f608b1dc00993056c9fc55e6e
|
200
|
+
# test.txt: e20b902b49a98b1a05ed62804c757f94 -> 9306ec0709cc72558045559ada26573b
|
201
|
+
```
|
77
202
|
|
78
203
|
### Parquet <a id="parquet-diff"></a>
|
79
204
|
See sample commands and output below for inspecting changes to [a DVC-tracked Parquet file][commit path] in [a given commit][commit].
|
@@ -323,3 +448,16 @@ This helped me see that the data update in question (`c0..c1`) dropped some fiel
|
|
323
448
|
[`kcr`]: https://github.com/ryan-williams/arg-helpers/blob/a8c60809f8878fa38b3c03614778fcf29132538e/.arg-rc#L118
|
324
449
|
[`snc`]: https://github.com/ryan-williams/case-helpers/blob/c40a62a9656f0d52d68fb3a108ae6bb3eed3c7bd/.case-rc#L9
|
325
450
|
[`sdf`]: https://github.com/ryan-williams/arg-helpers/blob/a8c60809f8878fa38b3c03614778fcf29132538e/.arg-rc#L138
|
451
|
+
|
452
|
+
[`mdcmd`]: https://github.com/runsascoded/bash-markdown-fence?tab=readme-ov-file#bmdf
|
453
|
+
[`test/data`]: test/data
|
454
|
+
[test/data]: test/data
|
455
|
+
[ryan-williams/dvc-helpers@test]: https://github.com/ryan-williams/dvc-helpers/tree/test
|
456
|
+
[`git-diff-dvc.sh`]: https://github.com/ryan-williams/dvc-helpers/blob/main/git-diff-dvc.sh
|
457
|
+
|
458
|
+
[`8ec2060`]: https://github.com/ryan-williams/dvc-helpers/commit/8ec2060
|
459
|
+
[`0455b50`]: https://github.com/ryan-williams/dvc-helpers/commit/0455b50
|
460
|
+
[`f92c1d2`]: https://github.com/ryan-williams/dvc-helpers/commit/f92c1d2
|
461
|
+
[`f29e52a`]: https://github.com/ryan-williams/dvc-helpers/commit/f29e52a
|
462
|
+
[`3257258`]: https://github.com/ryan-williams/dvc-helpers/commit/3257258
|
463
|
+
[`ae8638a`]: https://github.com/ryan-williams/dvc-helpers/commit/ae8638a
|
@@ -0,0 +1,15 @@
|
|
1
|
+
LICENSE
|
2
|
+
README.md
|
3
|
+
pyproject.toml
|
4
|
+
src/dvc_utils/__init__.py
|
5
|
+
src/dvc_utils/cli.py
|
6
|
+
src/dvc_utils/diff.py
|
7
|
+
src/dvc_utils/main.py
|
8
|
+
src/dvc_utils/path.py
|
9
|
+
src/dvc_utils/sync.py
|
10
|
+
src/dvc_utils.egg-info/PKG-INFO
|
11
|
+
src/dvc_utils.egg-info/SOURCES.txt
|
12
|
+
src/dvc_utils.egg-info/dependency_links.txt
|
13
|
+
src/dvc_utils.egg-info/entry_points.txt
|
14
|
+
src/dvc_utils.egg-info/requires.txt
|
15
|
+
src/dvc_utils.egg-info/top_level.txt
|
dvc-utils-0.1.0/dvc_utils/cli.py
DELETED
@@ -1,95 +0,0 @@
|
|
1
|
-
import shlex
|
2
|
-
from os import environ as env
|
3
|
-
from typing import Tuple
|
4
|
-
|
5
|
-
import click
|
6
|
-
from click import option, argument, group
|
7
|
-
from utz import process, err
|
8
|
-
from qmdx import join_pipelines
|
9
|
-
|
10
|
-
from dvc_utils.path import dvc_paths, dvc_path as dvc_cache_path
|
11
|
-
|
12
|
-
|
13
|
-
# Root command group for the `dvc-utils` console script. It takes no options
# of its own; subcommands are registered on it via `@cli.command(...)`.
# NOTE: deliberately left without a docstring, since click would surface a
# docstring as the group's `--help` text.
@group()
def cli():
    return None
|
16
|
-
|
17
|
-
|
18
|
-
# `dvc-utils diff` subcommand.
#
# Maintainer notes live in comments rather than in the docstring, because click
# renders the function docstring verbatim as the command's `--help` output.
#
# Flow:
#   1. Split positional `args` into optional pipeline commands plus a final path.
#   2. Resolve the path to its `.dvc` file and look up the DVC cache file for
#      the "before" ref (and the "after" ref, if the refspec names two commits;
#      otherwise the worktree file itself is the "after" side).
#   3. Run `diff` on the two files, or on the output of the same command
#      pipeline applied to each of them.
@cli.command('diff', short_help='Diff a DVC-tracked file at two commits (or one commit vs. current worktree), optionally passing both through another command first')
@option('-c', '--color', is_flag=True, help='Colorize the output')
@option('-r', '--refspec', default='HEAD', help='<commit 1>..<commit 2> (compare two commits) or <commit> (compare <commit> to the worktree)')
@option('-s', '--shell-executable', help=f'Shell to use for executing commands; defaults to $SHELL ({env.get("SHELL")})')
@option('-S', '--no-shell', is_flag=True, help="Don't pass `shell=True` to Python `subprocess`es")
@option('-U', '--unified', type=int, help='Number of lines of context to show (passes through to `diff`)')
@option('-v', '--verbose', is_flag=True, help="Log intermediate commands to stderr")
@option('-w', '--ignore-whitespace', is_flag=True, help="Ignore whitespace differences (pass `-w` to `diff`)")
@option('-x', '--exec-cmd', 'exec_cmds', multiple=True, help='Command(s) to execute before diffing; alternate syntax to passing commands as positional arguments')
@argument('args', metavar='[exec_cmd...] <path>', nargs=-1)
def dvc_utils_diff(
    color: bool,
    refspec: str | None,
    shell_executable: str | None,
    no_shell: bool,
    unified: int | None,
    verbose: bool,
    ignore_whitespace: bool,
    exec_cmds: Tuple[str, ...],
    args: Tuple[str, ...],
):
    """Diff a file at two commits (or one commit vs. current worktree), optionally passing both through `cmd` first

    Examples:

    dvc-utils diff -r HEAD^..HEAD wc -l foo.dvc # Compare the number of lines (`wc -l`) in `foo` (the file referenced by `foo.dvc`) at the previous vs. current commit (`HEAD^..HEAD`).

    dvc-utils diff md5sum foo # Diff the `md5sum` of `foo` (".dvc" extension is optional) at HEAD (last committed value) vs. the current worktree content.
    """
    if not args:
        raise click.UsageError('Must specify [cmd...] <path>')

    shell = not no_shell
    # The last positional argument is the path; everything before it is a
    # pipeline stage. Commands given via `-x` run ahead of positional ones.
    *cmds, path = args
    cmds = list(exec_cmds) + cmds

    path, dvc_path = dvc_paths(path)

    # "<before>..<after>" compares two commits; a bare "<before>" compares that
    # commit to the worktree. `partition` returns an empty separator when ".."
    # is absent, so no "invalid refspec" branch is needed (the previous
    # `split('..', 1)` could never produce more than two pieces either).
    before, sep, after = refspec.partition('..')
    if not sep:
        after = None

    log = err if verbose else False
    path1 = dvc_cache_path(before, dvc_path, log=log)
    path2 = path if after is None else dvc_cache_path(after, dvc_path, log=log)

    diff_args = [
        *(['-w'] if ignore_whitespace else []),
        *(['-U', str(unified)] if unified is not None else []),
        *(['--color=always'] if color else []),
    ]
    if cmds:
        cmd, *sub_cmds = cmds
        # Only the first pipeline stage receives the file path. Quote it so
        # that paths containing whitespace or shell metacharacters survive both
        # the shell (`shell=True`) and the `shlex.split` below (`--no-shell`).
        # `shlex.quote` leaves "safe" paths unchanged.
        cmds1 = [ f'{cmd} {shlex.quote(path1)}', *sub_cmds ]
        cmds2 = [ f'{cmd} {shlex.quote(path2)}', *sub_cmds ]
        if not shell:
            cmds1 = [ shlex.split(cmd) for cmd in cmds1 ]
            cmds2 = [ shlex.split(cmd) for cmd in cmds2 ]

        # NOTE(review): `join_pipelines` comes from `qmdx`; presumably it runs
        # both pipelines and feeds their outputs to `base_cmd` — confirm there.
        join_pipelines(
            base_cmd=['diff', *diff_args],
            cmds1=cmds1,
            cmds2=cmds2,
            verbose=verbose,
            shell=shell,
            shell_executable=shell_executable,
        )
    else:
        process.run('diff', *diff_args, path1, path2, log=log)
|
92
|
-
|
93
|
-
|
94
|
-
if __name__ == '__main__':
|
95
|
-
cli()
|
@@ -1,60 +0,0 @@
|
|
1
|
-
from functools import cache
|
2
|
-
from os import environ as env, getcwd
|
3
|
-
from os.path import join, relpath
|
4
|
-
from typing import Optional, Tuple
|
5
|
-
|
6
|
-
import yaml
|
7
|
-
from utz import process, err, singleton
|
8
|
-
|
9
|
-
|
10
|
-
def dvc_paths(path: str) -> Tuple[str, str]:
    """Return ``(data_path, dvc_file_path)`` for a path given with or without ``.dvc``.

    Args:
        path: Path to a DVC-tracked file or directory, or to its ``.dvc`` file.
            Trailing slashes (e.g. from shell tab-completion of a directory)
            are ignored.

    Returns:
        A 2-tuple: the path without the ``.dvc`` suffix, and the path with it.
    """
    # Without this, "data/" would map to the bogus "data/.dvc" instead of
    # "data.dvc". A path made up solely of slashes is left untouched.
    trimmed = path.rstrip('/')
    if trimmed:
        path = trimmed

    if path.endswith('.dvc'):
        return path[:-len('.dvc')], path
    return path, f'{path}.dvc'
|
17
|
-
|
18
|
-
|
19
|
-
@cache
def get_git_root() -> str:
    """Return the output line of ``git rev-parse --show-toplevel``.

    The result is memoized with ``functools.cache``.
    """
    toplevel_cmd = ('git', 'rev-parse', '--show-toplevel')
    return process.line(*toplevel_cmd, log=False)
|
22
|
-
|
23
|
-
|
24
|
-
@cache
def get_dir_path() -> str:
    """Return the current working directory relative to the Git root (memoized)."""
    cwd = getcwd()
    git_root = get_git_root()
    return relpath(cwd, git_root)
|
27
|
-
|
28
|
-
|
29
|
-
@cache
def dvc_cache_dir(log: bool = False) -> str:
    """Return the DVC cache directory (memoized per ``log`` value).

    If the ``DVC_UTILS_CACHE_DIR`` environment variable is set and non-empty,
    it is joined onto the Git root; otherwise the output line of
    ``dvc cache dir`` is returned.

    Args:
        log: Forwarded to ``process.line`` when ``dvc cache dir`` is invoked.
    """
    override = env.get('DVC_UTILS_CACHE_DIR')
    if not override:
        return process.line('dvc', 'cache', 'dir', log=log)
    return join(get_git_root(), override)
|
36
|
-
|
37
|
-
|
38
|
-
def dvc_md5(git_ref: str, dvc_path: str, log: bool = False) -> str:
    """Return the ``md5`` recorded in a ``.dvc`` file as of a given Git ref.

    Args:
        git_ref: Git ref whose version of the ``.dvc`` file is read.
        dvc_path: Path to the ``.dvc`` file, relative to the current directory.
        log: When truthy, ``err`` is passed as the logger for ``git show``.

    Returns:
        The ``md5`` value of the ``.dvc`` file's ``outs`` entry.
    """
    # `git show <ref>:<path>` is given a path relative to the Git root, so
    # prefix the current directory's offset from it (nothing when at the root).
    rel_dir = get_dir_path()
    prefix = f'{rel_dir}/' if rel_dir != '.' else ''
    blob_spec = f'{git_ref}:{prefix}{dvc_path}'
    dvc_yaml = process.output('git', 'show', blob_spec, log=err if log else None)
    # NOTE(review): `singleton` (from `utz`) presumably requires exactly one
    # `outs` entry — confirm against its documentation.
    tracked_out = singleton(yaml.safe_load(dvc_yaml)['outs'], dedupe=False)
    return tracked_out['md5']
|
46
|
-
|
47
|
-
|
48
|
-
def dvc_path(ref: str, dvc_path: Optional[str] = None, log: bool = False) -> str:
    """Return the DVC cache path ``<cache dir>/files/md5/<md5[:2]>/<md5[2:]>``.

    The MD5 is determined in one of three ways:

    - ``dvc_path`` given: ``ref`` is a Git ref and the MD5 is read from that
      ``.dvc`` file at that ref (a missing ``.dvc`` suffix is appended).
    - ``dvc_path`` omitted and ``ref`` contains ``:``: ``ref`` is split at the
      first ``:`` into a Git ref and a ``.dvc`` file path (used as-is).
    - otherwise: ``ref`` is taken to be the MD5 itself.

    Args:
        ref: Git ref, ``<git ref>:<dvc path>`` pair, or MD5 (see above).
        dvc_path: Optional path to the ``.dvc`` file.
        log: Forwarded to ``dvc_md5`` and ``dvc_cache_dir``.
    """
    if dvc_path:
        if not dvc_path.endswith('.dvc'):
            dvc_path += '.dvc'
        md5 = dvc_md5(ref, dvc_path, log=log)
    elif ':' in ref:
        git_ref, _, dvc_file = ref.partition(':')
        md5 = dvc_md5(git_ref, dvc_file, log=log)
    else:
        md5 = ref
    # The first two hex characters of the MD5 name the subdirectory.
    return join(dvc_cache_dir(log=log), 'files', 'md5', md5[:2], md5[2:])
|
@@ -1,12 +0,0 @@
|
|
1
|
-
LICENSE
|
2
|
-
README.md
|
3
|
-
setup.py
|
4
|
-
dvc_utils/__init__.py
|
5
|
-
dvc_utils/cli.py
|
6
|
-
dvc_utils/path.py
|
7
|
-
dvc_utils.egg-info/PKG-INFO
|
8
|
-
dvc_utils.egg-info/SOURCES.txt
|
9
|
-
dvc_utils.egg-info/dependency_links.txt
|
10
|
-
dvc_utils.egg-info/entry_points.txt
|
11
|
-
dvc_utils.egg-info/requires.txt
|
12
|
-
dvc_utils.egg-info/top_level.txt
|
dvc-utils-0.1.0/setup.py
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
from pathlib import Path

from setuptools import setup

# Read auxiliary files via `Path.read_text`, which closes the file handle and
# lets us pin the encoding (bare `open(...).read()` did neither).
long_description = Path("README.md").read_text(encoding="utf-8")

# NOTE(review): `requirements.txt` does not appear in this package's
# SOURCES.txt, so building from the sdist would presumably fail here — confirm
# and ship the file (e.g. via MANIFEST.in) if so.
install_requires = [
    line.strip()
    for line in Path("requirements.txt").read_text(encoding="utf-8").splitlines()
    if line.strip() and not line.lstrip().startswith("#")
]

setup(
    name='dvc-utils',
    version="0.1.0",
    description="CLI for diffing DVC files at two commits (or one commit vs. current worktree), optionally passing both through another command first",
    long_description=long_description,
    long_description_content_type="text/markdown",
    packages=['dvc_utils'],
    install_requires=install_requires,
    entry_points={
        'console_scripts': [
            'dvc-utils = dvc_utils.cli:cli',
            'dvc-diff = dvc_utils.cli:dvc_utils_diff',
        ],
    },
    license="MIT",
    author="Ryan Williams",
    author_email="ryan@runsascoded.com",
    url="https://github.com/runsascoded/dvc-utils",
    # `author_url` is not a setuptools keyword (it is ignored with an "unknown
    # distribution option" warning); publish the link as a project URL instead.
    project_urls={
        "Author URL": "https://github.com/ryan-williams",
    },
)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|