dvc-utils 0.0.2__tar.gz → 0.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dvc-utils-0.0.2 → dvc-utils-0.0.4}/PKG-INFO +46 -24
- {dvc-utils-0.0.2 → dvc-utils-0.0.4}/README.md +45 -23
- dvc-utils-0.0.4/dvc_utils/main.py +183 -0
- {dvc-utils-0.0.2 → dvc-utils-0.0.4}/dvc_utils/named_pipes.py +1 -1
- {dvc-utils-0.0.2 → dvc-utils-0.0.4}/dvc_utils.egg-info/PKG-INFO +46 -24
- {dvc-utils-0.0.2 → dvc-utils-0.0.4}/setup.py +1 -1
- dvc-utils-0.0.2/dvc_utils/main.py +0 -119
- {dvc-utils-0.0.2 → dvc-utils-0.0.4}/LICENSE +0 -0
- {dvc-utils-0.0.2 → dvc-utils-0.0.4}/dvc_utils/__init__.py +0 -0
- {dvc-utils-0.0.2 → dvc-utils-0.0.4}/dvc_utils.egg-info/SOURCES.txt +0 -0
- {dvc-utils-0.0.2 → dvc-utils-0.0.4}/dvc_utils.egg-info/dependency_links.txt +0 -0
- {dvc-utils-0.0.2 → dvc-utils-0.0.4}/dvc_utils.egg-info/entry_points.txt +0 -0
- {dvc-utils-0.0.2 → dvc-utils-0.0.4}/dvc_utils.egg-info/top_level.txt +0 -0
- {dvc-utils-0.0.2 → dvc-utils-0.0.4}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: dvc-utils
|
3
|
-
Version: 0.0.2
|
3
|
+
Version: 0.0.4
|
4
4
|
Summary: CLI for diffing DVC files at two commits (or one commit vs. current worktree), optionally passing both through another command first
|
5
5
|
Home-page: https://github.com/runsascoded/dvc-utils
|
6
6
|
Author: Ryan Williams
|
@@ -10,14 +10,26 @@ Description-Content-Type: text/markdown
|
|
10
10
|
License-File: LICENSE
|
11
11
|
|
12
12
|
# dvc-utils
|
13
|
-
CLI for diffing [DVC] files
|
14
|
-
|
15
|
-
|
13
|
+
CLI for diffing [DVC] files, optionally passing both through another command first
|
14
|
+
|
15
|
+
<!-- toc -->
|
16
|
+
- [Installation](#installation)
|
17
|
+
- [Usage](#usage)
|
18
|
+
- [`dvc-utils diff`](#dvc-utils-diff)
|
19
|
+
- [Examples](#examples)
|
20
|
+
- [Parquet file](#parquet-diff)
|
21
|
+
- [Schema diff](#parquet-schema-diff)
|
22
|
+
- [Row diff](#parquet-row-diff)
|
23
|
+
- [Row count diff](#parquet-row-count-diff)
|
24
|
+
<!-- /toc -->
|
25
|
+
|
26
|
+
## Installation <a id="installation"></a>
|
16
27
|
```bash
|
17
28
|
pip install dvc-utils
|
18
29
|
```
|
19
30
|
|
20
|
-
## Usage
|
31
|
+
## Usage <a id="usage"></a>
|
32
|
+
<!-- `bmdf -- dvc-utils --help` -->
|
21
33
|
```bash
|
22
34
|
dvc-utils --help
|
23
35
|
# Usage: dvc-utils [OPTIONS] COMMAND [ARGS]...
|
@@ -30,7 +42,8 @@ dvc-utils --help
|
|
30
42
|
# worktree), optionally passing both through another command first
|
31
43
|
```
|
32
44
|
|
33
|
-
### `dvc-utils diff`
|
45
|
+
### `dvc-utils diff` <a id="dvc-utils-diff"></a>
|
46
|
+
<!-- `bmdf -- dvc-utils diff --help` -->
|
34
47
|
```bash
|
35
48
|
dvc-utils diff --help
|
36
49
|
# Usage: dvc-utils diff [OPTIONS] [cmd...] <path>
|
@@ -48,24 +61,31 @@ dvc-utils diff --help
|
|
48
61
|
# optional) at HEAD (last committed value) vs. the current worktree content.
|
49
62
|
#
|
50
63
|
# Options:
|
51
|
-
# -
|
52
|
-
#
|
53
|
-
#
|
54
|
-
# -
|
55
|
-
# --
|
64
|
+
# -c, --color Colorize the output
|
65
|
+
# -r, --refspec TEXT <commit 1>..<commit 2> (compare two commits) or
|
66
|
+
# <commit> (compare <commit> to the worktree)
|
67
|
+
# -S, --no-shell Don't pass `shell=True` to Python `subprocess`es
|
68
|
+
# -U, --unified INTEGER Number of lines of context to show (passes through
|
69
|
+
# to `diff`)
|
70
|
+
# -v, --verbose Log intermediate commands to stderr
|
71
|
+
# -w, --ignore-whitespace Ignore whitespace differences (pass `-w` to `diff`)
|
72
|
+
# --help Show this message and exit.
|
56
73
|
```
|
57
74
|
|
58
|
-
## Examples
|
75
|
+
## Examples <a id="examples"></a>
|
76
|
+
|
77
|
+
### Parquet file <a id="parquet-diff"></a>
|
59
78
|
See sample commands and output below for inspecting changes to [a DVC-tracked Parquet file][commit path] in [a given commit][commit].
|
60
79
|
|
80
|
+
Setup:
|
61
81
|
```bash
|
62
|
-
git clone https://github.com/
|
63
|
-
commit=c8ae28e
|
64
|
-
path=njdot/data/2001/NewJersey2001Accidents.pqt.dvc
|
82
|
+
git clone https://github.com/hudcostreets/nj-crashes && cd nj-crashes # Clone + enter example repo
|
83
|
+
commit=c8ae28e # Example commit that changed some DVC-tracked Parquet files
|
84
|
+
path=njdot/data/2001/NewJersey2001Accidents.pqt.dvc # One of the changed files
|
65
85
|
```
|
66
86
|
|
67
|
-
|
68
|
-
Use [`parquet2json`] to observe schema changes to a Parquet file
|
87
|
+
#### Schema diff <a id="parquet-schema-diff"></a>
|
88
|
+
Use [`parquet2json`] to observe schema changes to a Parquet file:
|
69
89
|
```bash
|
70
90
|
parquet_schema() {
|
71
91
|
parquet2json "$1" schema
|
@@ -120,11 +140,12 @@ Here we can see that various date/time columns were consolidated, and several st
|
|
120
140
|
|
121
141
|
</details>
|
122
142
|
|
123
|
-
|
124
|
-
Diff the first row of the Parquet file above (pretty-printed as JSON), before and after the given commit:
|
143
|
+
#### Row diff <a id="parquet-row-diff"></a>
|
144
|
+
Diff the first row of the Parquet file above (pretty-printed as JSON using [`jq`]), before and after the given commit:
|
125
145
|
|
126
146
|
```bash
|
127
147
|
pretty_print_first_row() {
|
148
|
+
# Print first row of Parquet file as JSON, pretty-print with jq
|
128
149
|
parquet2json "$1" cat -l 1 | jq .
|
129
150
|
}
|
130
151
|
export -f pretty_print_first_row
|
@@ -181,7 +202,7 @@ This reflects the schema changes above.
|
|
181
202
|
|
182
203
|
</details>
|
183
204
|
|
184
|
-
|
205
|
+
#### Row count diff <a id="parquet-row-count-diff"></a>
|
185
206
|
```bash
|
186
207
|
parquet_row_count() {
|
187
208
|
parquet2json "$1" rowcount
|
@@ -194,8 +215,9 @@ This time we get no output; [the given `$commit`][commit] didn't change the row
|
|
194
215
|
|
195
216
|
[DVC]: https://dvc.org/
|
196
217
|
[`parquet2json`]: https://github.com/jupiter/parquet2json
|
197
|
-
[
|
218
|
+
[hudcostreets/nj-crashes]: https://github.com/hudcostreets/nj-crashes
|
198
219
|
[Parquet]: https://parquet.apache.org/
|
199
|
-
[commit]: https://github.com/
|
200
|
-
[commit path]: https://github.com/
|
201
|
-
[
|
220
|
+
[commit]: https://github.com/hudcostreets/nj-crashes/commit/c8ae28e64f4917895d84074913f48e0a7afbc3d7
|
221
|
+
[commit path]: https://github.com/hudcostreets/nj-crashes/commit/c8ae28e64f4917895d84074913f48e0a7afbc3d7#diff-7f812dce61e0996354f4af414203e0933ccdfe9613cb406c40c1c41a14b9769c
|
222
|
+
[hudcostreets/nj-crashes]: https://github.com/hudcostreets/nj-crashes
|
223
|
+
[`jq`]: https://jqlang.github.io/jq/
|
@@ -1,12 +1,24 @@
|
|
1
1
|
# dvc-utils
|
2
|
-
CLI for diffing [DVC] files
|
3
|
-
|
4
|
-
|
2
|
+
CLI for diffing [DVC] files, optionally passing both through another command first
|
3
|
+
|
4
|
+
<!-- toc -->
|
5
|
+
- [Installation](#installation)
|
6
|
+
- [Usage](#usage)
|
7
|
+
- [`dvc-utils diff`](#dvc-utils-diff)
|
8
|
+
- [Examples](#examples)
|
9
|
+
- [Parquet file](#parquet-diff)
|
10
|
+
- [Schema diff](#parquet-schema-diff)
|
11
|
+
- [Row diff](#parquet-row-diff)
|
12
|
+
- [Row count diff](#parquet-row-count-diff)
|
13
|
+
<!-- /toc -->
|
14
|
+
|
15
|
+
## Installation <a id="installation"></a>
|
5
16
|
```bash
|
6
17
|
pip install dvc-utils
|
7
18
|
```
|
8
19
|
|
9
|
-
## Usage
|
20
|
+
## Usage <a id="usage"></a>
|
21
|
+
<!-- `bmdf -- dvc-utils --help` -->
|
10
22
|
```bash
|
11
23
|
dvc-utils --help
|
12
24
|
# Usage: dvc-utils [OPTIONS] COMMAND [ARGS]...
|
@@ -19,7 +31,8 @@ dvc-utils --help
|
|
19
31
|
# worktree), optionally passing both through another command first
|
20
32
|
```
|
21
33
|
|
22
|
-
### `dvc-utils diff`
|
34
|
+
### `dvc-utils diff` <a id="dvc-utils-diff"></a>
|
35
|
+
<!-- `bmdf -- dvc-utils diff --help` -->
|
23
36
|
```bash
|
24
37
|
dvc-utils diff --help
|
25
38
|
# Usage: dvc-utils diff [OPTIONS] [cmd...] <path>
|
@@ -37,24 +50,31 @@ dvc-utils diff --help
|
|
37
50
|
# optional) at HEAD (last committed value) vs. the current worktree content.
|
38
51
|
#
|
39
52
|
# Options:
|
40
|
-
# -
|
41
|
-
#
|
42
|
-
#
|
43
|
-
# -
|
44
|
-
# --
|
53
|
+
# -c, --color Colorize the output
|
54
|
+
# -r, --refspec TEXT <commit 1>..<commit 2> (compare two commits) or
|
55
|
+
# <commit> (compare <commit> to the worktree)
|
56
|
+
# -S, --no-shell Don't pass `shell=True` to Python `subprocess`es
|
57
|
+
# -U, --unified INTEGER Number of lines of context to show (passes through
|
58
|
+
# to `diff`)
|
59
|
+
# -v, --verbose Log intermediate commands to stderr
|
60
|
+
# -w, --ignore-whitespace Ignore whitespace differences (pass `-w` to `diff`)
|
61
|
+
# --help Show this message and exit.
|
45
62
|
```
|
46
63
|
|
47
|
-
## Examples
|
64
|
+
## Examples <a id="examples"></a>
|
65
|
+
|
66
|
+
### Parquet file <a id="parquet-diff"></a>
|
48
67
|
See sample commands and output below for inspecting changes to [a DVC-tracked Parquet file][commit path] in [a given commit][commit].
|
49
68
|
|
69
|
+
Setup:
|
50
70
|
```bash
|
51
|
-
git clone https://github.com/
|
52
|
-
commit=c8ae28e
|
53
|
-
path=njdot/data/2001/NewJersey2001Accidents.pqt.dvc
|
71
|
+
git clone https://github.com/hudcostreets/nj-crashes && cd nj-crashes # Clone + enter example repo
|
72
|
+
commit=c8ae28e # Example commit that changed some DVC-tracked Parquet files
|
73
|
+
path=njdot/data/2001/NewJersey2001Accidents.pqt.dvc # One of the changed files
|
54
74
|
```
|
55
75
|
|
56
|
-
|
57
|
-
Use [`parquet2json`] to observe schema changes to a Parquet file
|
76
|
+
#### Schema diff <a id="parquet-schema-diff"></a>
|
77
|
+
Use [`parquet2json`] to observe schema changes to a Parquet file:
|
58
78
|
```bash
|
59
79
|
parquet_schema() {
|
60
80
|
parquet2json "$1" schema
|
@@ -109,11 +129,12 @@ Here we can see that various date/time columns were consolidated, and several st
|
|
109
129
|
|
110
130
|
</details>
|
111
131
|
|
112
|
-
|
113
|
-
Diff the first row of the Parquet file above (pretty-printed as JSON), before and after the given commit:
|
132
|
+
#### Row diff <a id="parquet-row-diff"></a>
|
133
|
+
Diff the first row of the Parquet file above (pretty-printed as JSON using [`jq`]), before and after the given commit:
|
114
134
|
|
115
135
|
```bash
|
116
136
|
pretty_print_first_row() {
|
137
|
+
# Print first row of Parquet file as JSON, pretty-print with jq
|
117
138
|
parquet2json "$1" cat -l 1 | jq .
|
118
139
|
}
|
119
140
|
export -f pretty_print_first_row
|
@@ -170,7 +191,7 @@ This reflects the schema changes above.
|
|
170
191
|
|
171
192
|
</details>
|
172
193
|
|
173
|
-
|
194
|
+
#### Row count diff <a id="parquet-row-count-diff"></a>
|
174
195
|
```bash
|
175
196
|
parquet_row_count() {
|
176
197
|
parquet2json "$1" rowcount
|
@@ -183,8 +204,9 @@ This time we get no output; [the given `$commit`][commit] didn't change the row
|
|
183
204
|
|
184
205
|
[DVC]: https://dvc.org/
|
185
206
|
[`parquet2json`]: https://github.com/jupiter/parquet2json
|
186
|
-
[
|
207
|
+
[hudcostreets/nj-crashes]: https://github.com/hudcostreets/nj-crashes
|
187
208
|
[Parquet]: https://parquet.apache.org/
|
188
|
-
[commit]: https://github.com/
|
189
|
-
[commit path]: https://github.com/
|
190
|
-
[
|
209
|
+
[commit]: https://github.com/hudcostreets/nj-crashes/commit/c8ae28e64f4917895d84074913f48e0a7afbc3d7
|
210
|
+
[commit path]: https://github.com/hudcostreets/nj-crashes/commit/c8ae28e64f4917895d84074913f48e0a7afbc3d7#diff-7f812dce61e0996354f4af414203e0933ccdfe9613cb406c40c1c41a14b9769c
|
211
|
+
[hudcostreets/nj-crashes]: https://github.com/hudcostreets/nj-crashes
|
212
|
+
[`jq`]: https://jqlang.github.io/jq/
|
@@ -0,0 +1,183 @@
|
|
1
|
+
from functools import cache
|
2
|
+
from os import environ as env, getcwd
|
3
|
+
|
4
|
+
from typing import Optional, Tuple
|
5
|
+
|
6
|
+
import shlex
|
7
|
+
from os.path import join, relpath
|
8
|
+
|
9
|
+
from click import option, argument, group
|
10
|
+
from subprocess import Popen
|
11
|
+
|
12
|
+
import click
|
13
|
+
import yaml
|
14
|
+
from utz import process, singleton, err
|
15
|
+
|
16
|
+
from dvc_utils.named_pipes import named_pipes
|
17
|
+
|
18
|
+
|
19
|
+
@group()
|
20
|
+
def cli():
|
21
|
+
pass
|
22
|
+
|
23
|
+
|
24
|
+
def dvc_paths(path: str) -> Tuple[str, str]:
|
25
|
+
if path.endswith('.dvc'):
|
26
|
+
dvc_path = path
|
27
|
+
path = dvc_path[:-len('.dvc')]
|
28
|
+
else:
|
29
|
+
dvc_path = f'{path}.dvc'
|
30
|
+
return path, dvc_path
|
31
|
+
|
32
|
+
|
33
|
+
@cache
|
34
|
+
def get_git_root() -> str:
|
35
|
+
return process.line('git', 'rev-parse', '--show-toplevel', log=False)
|
36
|
+
|
37
|
+
|
38
|
+
@cache
|
39
|
+
def get_dir_path() -> str:
|
40
|
+
return relpath(getcwd(), get_git_root())
|
41
|
+
|
42
|
+
|
43
|
+
@cache
|
44
|
+
def dvc_cache_dir(log: bool = False) -> str:
|
45
|
+
dvc_cache_relpath = env.get('DVC_UTILS_CACHE_DIR')
|
46
|
+
if dvc_cache_relpath:
|
47
|
+
return join(get_git_root(), dvc_cache_relpath)
|
48
|
+
else:
|
49
|
+
return process.line('dvc', 'cache', 'dir', log=log)
|
50
|
+
|
51
|
+
|
52
|
+
def dvc_md5(git_ref: str, dvc_path: str, log: bool = False) -> str:
|
53
|
+
dir_path = get_dir_path()
|
54
|
+
dir_path = '' if dir_path == '.' else f'{dir_path}/'
|
55
|
+
dvc_spec = process.output('git', 'show', f'{git_ref}:{dir_path}{dvc_path}', log=log)
|
56
|
+
dvc_obj = yaml.safe_load(dvc_spec)
|
57
|
+
out = singleton(dvc_obj['outs'], dedupe=False)
|
58
|
+
md5 = out['md5']
|
59
|
+
return md5
|
60
|
+
|
61
|
+
|
62
|
+
def dvc_cache_path(ref: str, dvc_path: Optional[str] = None, log: bool = False) -> str:
|
63
|
+
if dvc_path:
|
64
|
+
md5 = dvc_md5(ref, dvc_path, log=log)
|
65
|
+
elif ':' in ref:
|
66
|
+
git_ref, dvc_path = ref.split(':', 1)
|
67
|
+
md5 = dvc_md5(git_ref, dvc_path, log=log)
|
68
|
+
else:
|
69
|
+
md5 = ref
|
70
|
+
dirname = md5[:2]
|
71
|
+
basename = md5[2:]
|
72
|
+
return join(dvc_cache_dir(log=log), 'files', 'md5', dirname, basename)
|
73
|
+
|
74
|
+
|
75
|
+
def diff_cmds(
|
76
|
+
cmd1: str,
|
77
|
+
cmd2: str,
|
78
|
+
verbose: bool = False,
|
79
|
+
color: bool = False,
|
80
|
+
unified: int | None = None,
|
81
|
+
ignore_whitespace: bool = False,
|
82
|
+
**kwargs,
|
83
|
+
):
|
84
|
+
"""Run two commands and diff their output.
|
85
|
+
|
86
|
+
Adapted from https://stackoverflow.com/a/28840955"""
|
87
|
+
with named_pipes(n=2) as pipes:
|
88
|
+
(pipe1, pipe2) = pipes
|
89
|
+
diff_cmd = [
|
90
|
+
'diff',
|
91
|
+
*(['-w'] if ignore_whitespace else []),
|
92
|
+
*(['-U', str(unified)] if unified is not None else []),
|
93
|
+
*(['--color=always'] if color else []),
|
94
|
+
pipe1,
|
95
|
+
pipe2,
|
96
|
+
]
|
97
|
+
diff = Popen(diff_cmd)
|
98
|
+
processes = []
|
99
|
+
for path, cmd in ((pipe1, cmd1), (pipe2, cmd2)):
|
100
|
+
with open(path, 'wb', 0) as pipe:
|
101
|
+
if verbose:
|
102
|
+
err(f"Running: {cmd}")
|
103
|
+
processes.append(Popen(cmd, stdout=pipe, close_fds=True, **kwargs))
|
104
|
+
for p in [diff] + processes:
|
105
|
+
p.wait()
|
106
|
+
|
107
|
+
|
108
|
+
@cli.command('diff', short_help='Diff a DVC-tracked file at two commits (or one commit vs. current worktree), optionally passing both through another command first')
|
109
|
+
@option('-c', '--color', is_flag=True, help='Colorize the output')
|
110
|
+
@option('-r', '--refspec', default='HEAD', help='<commit 1>..<commit 2> (compare two commits) or <commit> (compare <commit> to the worktree)')
|
111
|
+
@option('-S', '--no-shell', is_flag=True, help="Don't pass `shell=True` to Python `subprocess`es")
|
112
|
+
@option('-U', '--unified', type=int, help='Number of lines of context to show (passes through to `diff`)')
|
113
|
+
@option('-v', '--verbose', is_flag=True, help="Log intermediate commands to stderr")
|
114
|
+
@option('-w', '--ignore-whitespace', is_flag=True, help="Ignore whitespace differences (pass `-w` to `diff`)")
|
115
|
+
@argument('args', metavar='[cmd...] <path>', nargs=-1)
|
116
|
+
def dvc_utils_diff(
|
117
|
+
color: bool,
|
118
|
+
refspec: str | None,
|
119
|
+
no_shell: bool,
|
120
|
+
unified: int | None,
|
121
|
+
verbose: bool,
|
122
|
+
ignore_whitespace: bool,
|
123
|
+
args: Tuple[str, ...],
|
124
|
+
):
|
125
|
+
"""Diff a file at two commits (or one commit vs. current worktree), optionally passing both through `cmd` first
|
126
|
+
|
127
|
+
Examples:
|
128
|
+
|
129
|
+
dvc-utils diff -r HEAD^..HEAD wc -l foo.dvc # Compare the number of lines (`wc -l`) in `foo` (the file referenced by `foo.dvc`) at the previous vs. current commit (`HEAD^..HEAD`).
|
130
|
+
|
131
|
+
dvc-utils diff md5sum foo # Diff the `md5sum` of `foo` (".dvc" extension is optional) at HEAD (last committed value) vs. the current worktree content.
|
132
|
+
"""
|
133
|
+
if not args:
|
134
|
+
raise click.UsageError('Must specify [cmd...] <path>')
|
135
|
+
|
136
|
+
shell = not no_shell
|
137
|
+
if len(args) == 2:
|
138
|
+
cmd, path = args
|
139
|
+
cmd = shlex.split(cmd)
|
140
|
+
elif len(args) == 1:
|
141
|
+
cmd = None
|
142
|
+
path, = args
|
143
|
+
else:
|
144
|
+
raise click.UsageError('Maximum 2 positional args: [cmd] <path>')
|
145
|
+
|
146
|
+
path, dvc_path = dvc_paths(path)
|
147
|
+
|
148
|
+
pcs = refspec.split('..', 1)
|
149
|
+
if len(pcs) == 1:
|
150
|
+
before = pcs[0]
|
151
|
+
after = None
|
152
|
+
elif len(pcs) == 2:
|
153
|
+
before, after = pcs
|
154
|
+
else:
|
155
|
+
raise ValueError(f"Invalid refspec: {refspec}")
|
156
|
+
|
157
|
+
log = err if verbose else False
|
158
|
+
before_path = dvc_cache_path(before, dvc_path, log=log)
|
159
|
+
after_path = path if after is None else dvc_cache_path(after, dvc_path, log=log)
|
160
|
+
|
161
|
+
if cmd:
|
162
|
+
def args(path: str):
|
163
|
+
arr = cmd + [path]
|
164
|
+
return shlex.join(arr) if shell else arr
|
165
|
+
|
166
|
+
shell_kwargs = dict(shell=shell) if shell else {}
|
167
|
+
before_cmd = args(before_path)
|
168
|
+
after_cmd = args(after_path)
|
169
|
+
diff_cmds(
|
170
|
+
before_cmd,
|
171
|
+
after_cmd,
|
172
|
+
verbose=verbose,
|
173
|
+
color=color,
|
174
|
+
unified=unified,
|
175
|
+
ignore_whitespace=ignore_whitespace,
|
176
|
+
**shell_kwargs,
|
177
|
+
)
|
178
|
+
else:
|
179
|
+
process.run('diff', before_path, after_path, log=log)
|
180
|
+
|
181
|
+
|
182
|
+
if __name__ == '__main__':
|
183
|
+
cli()
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: dvc-utils
|
3
|
-
Version: 0.0.2
|
3
|
+
Version: 0.0.4
|
4
4
|
Summary: CLI for diffing DVC files at two commits (or one commit vs. current worktree), optionally passing both through another command first
|
5
5
|
Home-page: https://github.com/runsascoded/dvc-utils
|
6
6
|
Author: Ryan Williams
|
@@ -10,14 +10,26 @@ Description-Content-Type: text/markdown
|
|
10
10
|
License-File: LICENSE
|
11
11
|
|
12
12
|
# dvc-utils
|
13
|
-
CLI for diffing [DVC] files
|
14
|
-
|
15
|
-
|
13
|
+
CLI for diffing [DVC] files, optionally passing both through another command first
|
14
|
+
|
15
|
+
<!-- toc -->
|
16
|
+
- [Installation](#installation)
|
17
|
+
- [Usage](#usage)
|
18
|
+
- [`dvc-utils diff`](#dvc-utils-diff)
|
19
|
+
- [Examples](#examples)
|
20
|
+
- [Parquet file](#parquet-diff)
|
21
|
+
- [Schema diff](#parquet-schema-diff)
|
22
|
+
- [Row diff](#parquet-row-diff)
|
23
|
+
- [Row count diff](#parquet-row-count-diff)
|
24
|
+
<!-- /toc -->
|
25
|
+
|
26
|
+
## Installation <a id="installation"></a>
|
16
27
|
```bash
|
17
28
|
pip install dvc-utils
|
18
29
|
```
|
19
30
|
|
20
|
-
## Usage
|
31
|
+
## Usage <a id="usage"></a>
|
32
|
+
<!-- `bmdf -- dvc-utils --help` -->
|
21
33
|
```bash
|
22
34
|
dvc-utils --help
|
23
35
|
# Usage: dvc-utils [OPTIONS] COMMAND [ARGS]...
|
@@ -30,7 +42,8 @@ dvc-utils --help
|
|
30
42
|
# worktree), optionally passing both through another command first
|
31
43
|
```
|
32
44
|
|
33
|
-
### `dvc-utils diff`
|
45
|
+
### `dvc-utils diff` <a id="dvc-utils-diff"></a>
|
46
|
+
<!-- `bmdf -- dvc-utils diff --help` -->
|
34
47
|
```bash
|
35
48
|
dvc-utils diff --help
|
36
49
|
# Usage: dvc-utils diff [OPTIONS] [cmd...] <path>
|
@@ -48,24 +61,31 @@ dvc-utils diff --help
|
|
48
61
|
# optional) at HEAD (last committed value) vs. the current worktree content.
|
49
62
|
#
|
50
63
|
# Options:
|
51
|
-
# -
|
52
|
-
#
|
53
|
-
#
|
54
|
-
# -
|
55
|
-
# --
|
64
|
+
# -c, --color Colorize the output
|
65
|
+
# -r, --refspec TEXT <commit 1>..<commit 2> (compare two commits) or
|
66
|
+
# <commit> (compare <commit> to the worktree)
|
67
|
+
# -S, --no-shell Don't pass `shell=True` to Python `subprocess`es
|
68
|
+
# -U, --unified INTEGER Number of lines of context to show (passes through
|
69
|
+
# to `diff`)
|
70
|
+
# -v, --verbose Log intermediate commands to stderr
|
71
|
+
# -w, --ignore-whitespace Ignore whitespace differences (pass `-w` to `diff`)
|
72
|
+
# --help Show this message and exit.
|
56
73
|
```
|
57
74
|
|
58
|
-
## Examples
|
75
|
+
## Examples <a id="examples"></a>
|
76
|
+
|
77
|
+
### Parquet file <a id="parquet-diff"></a>
|
59
78
|
See sample commands and output below for inspecting changes to [a DVC-tracked Parquet file][commit path] in [a given commit][commit].
|
60
79
|
|
80
|
+
Setup:
|
61
81
|
```bash
|
62
|
-
git clone https://github.com/
|
63
|
-
commit=c8ae28e
|
64
|
-
path=njdot/data/2001/NewJersey2001Accidents.pqt.dvc
|
82
|
+
git clone https://github.com/hudcostreets/nj-crashes && cd nj-crashes # Clone + enter example repo
|
83
|
+
commit=c8ae28e # Example commit that changed some DVC-tracked Parquet files
|
84
|
+
path=njdot/data/2001/NewJersey2001Accidents.pqt.dvc # One of the changed files
|
65
85
|
```
|
66
86
|
|
67
|
-
|
68
|
-
Use [`parquet2json`] to observe schema changes to a Parquet file
|
87
|
+
#### Schema diff <a id="parquet-schema-diff"></a>
|
88
|
+
Use [`parquet2json`] to observe schema changes to a Parquet file:
|
69
89
|
```bash
|
70
90
|
parquet_schema() {
|
71
91
|
parquet2json "$1" schema
|
@@ -120,11 +140,12 @@ Here we can see that various date/time columns were consolidated, and several st
|
|
120
140
|
|
121
141
|
</details>
|
122
142
|
|
123
|
-
|
124
|
-
Diff the first row of the Parquet file above (pretty-printed as JSON), before and after the given commit:
|
143
|
+
#### Row diff <a id="parquet-row-diff"></a>
|
144
|
+
Diff the first row of the Parquet file above (pretty-printed as JSON using [`jq`]), before and after the given commit:
|
125
145
|
|
126
146
|
```bash
|
127
147
|
pretty_print_first_row() {
|
148
|
+
# Print first row of Parquet file as JSON, pretty-print with jq
|
128
149
|
parquet2json "$1" cat -l 1 | jq .
|
129
150
|
}
|
130
151
|
export -f pretty_print_first_row
|
@@ -181,7 +202,7 @@ This reflects the schema changes above.
|
|
181
202
|
|
182
203
|
</details>
|
183
204
|
|
184
|
-
|
205
|
+
#### Row count diff <a id="parquet-row-count-diff"></a>
|
185
206
|
```bash
|
186
207
|
parquet_row_count() {
|
187
208
|
parquet2json "$1" rowcount
|
@@ -194,8 +215,9 @@ This time we get no output; [the given `$commit`][commit] didn't change the row
|
|
194
215
|
|
195
216
|
[DVC]: https://dvc.org/
|
196
217
|
[`parquet2json`]: https://github.com/jupiter/parquet2json
|
197
|
-
[
|
218
|
+
[hudcostreets/nj-crashes]: https://github.com/hudcostreets/nj-crashes
|
198
219
|
[Parquet]: https://parquet.apache.org/
|
199
|
-
[commit]: https://github.com/
|
200
|
-
[commit path]: https://github.com/
|
201
|
-
[
|
220
|
+
[commit]: https://github.com/hudcostreets/nj-crashes/commit/c8ae28e64f4917895d84074913f48e0a7afbc3d7
|
221
|
+
[commit path]: https://github.com/hudcostreets/nj-crashes/commit/c8ae28e64f4917895d84074913f48e0a7afbc3d7#diff-7f812dce61e0996354f4af414203e0933ccdfe9613cb406c40c1c41a14b9769c
|
222
|
+
[hudcostreets/nj-crashes]: https://github.com/hudcostreets/nj-crashes
|
223
|
+
[`jq`]: https://jqlang.github.io/jq/
|
@@ -2,7 +2,7 @@ from setuptools import setup
|
|
2
2
|
|
3
3
|
setup(
|
4
4
|
name='dvc-utils',
|
5
|
-
version="0.0.2",
|
5
|
+
version="0.0.4",
|
6
6
|
description="CLI for diffing DVC files at two commits (or one commit vs. current worktree), optionally passing both through another command first",
|
7
7
|
long_description=open("README.md").read(),
|
8
8
|
long_description_content_type="text/markdown",
|
@@ -1,119 +0,0 @@
|
|
1
|
-
import shlex
|
2
|
-
from os.path import join
|
3
|
-
from subprocess import Popen
|
4
|
-
|
5
|
-
import click
|
6
|
-
import yaml
|
7
|
-
from utz import process, singleton, err
|
8
|
-
|
9
|
-
from dvc_utils.named_pipes import named_pipes
|
10
|
-
|
11
|
-
|
12
|
-
@click.group()
|
13
|
-
def cli():
|
14
|
-
pass
|
15
|
-
|
16
|
-
|
17
|
-
def dvc_paths(path):
|
18
|
-
if path.endswith('.dvc'):
|
19
|
-
dvc_path = path
|
20
|
-
path = dvc_path[:-len('.dvc')]
|
21
|
-
else:
|
22
|
-
dvc_path = f'{path}.dvc'
|
23
|
-
return path, dvc_path
|
24
|
-
|
25
|
-
|
26
|
-
def dvc_md5(git_ref, dvc_path, log=False):
|
27
|
-
dvc_spec = process.output('git', 'show', f'{git_ref}:{dvc_path}', log=log)
|
28
|
-
dvc_obj = yaml.safe_load(dvc_spec)
|
29
|
-
out = singleton(dvc_obj['outs'], dedupe=False)
|
30
|
-
md5 = out['md5']
|
31
|
-
return md5
|
32
|
-
|
33
|
-
|
34
|
-
_dvc_cache_dir = None
|
35
|
-
def dvc_cache_dir(log=False):
|
36
|
-
global _dvc_cache_dir
|
37
|
-
if _dvc_cache_dir is None:
|
38
|
-
_dvc_cache_dir = process.line('dvc', 'cache', 'dir', log=log)
|
39
|
-
return _dvc_cache_dir
|
40
|
-
|
41
|
-
|
42
|
-
def dvc_cache_path(spec, dvc_path=None, log=False):
|
43
|
-
if dvc_path:
|
44
|
-
md5 = dvc_md5(spec, dvc_path, log=log)
|
45
|
-
elif ':' in spec:
|
46
|
-
git_ref, dvc_path = spec.split(':', 1)
|
47
|
-
md5 = dvc_md5(git_ref, dvc_path, log=log)
|
48
|
-
else:
|
49
|
-
md5 = spec
|
50
|
-
dirname = md5[:2]
|
51
|
-
basename = md5[2:]
|
52
|
-
return join(dvc_cache_dir(log=log), 'files', 'md5', dirname, basename)
|
53
|
-
|
54
|
-
|
55
|
-
def diff_cmds(cmd1, cmd2, **kwargs):
|
56
|
-
"""Run two commands and diff their output.
|
57
|
-
|
58
|
-
Adapted from https://stackoverflow.com/a/28840955"""
|
59
|
-
with named_pipes(n=2) as paths:
|
60
|
-
someprogram = Popen(['diff'] + paths)
|
61
|
-
processes = []
|
62
|
-
for path, cmd in zip(paths, [ cmd1, cmd2 ]):
|
63
|
-
with open(path, 'wb', 0) as pipe:
|
64
|
-
processes.append(Popen(cmd, stdout=pipe, close_fds=True, **kwargs))
|
65
|
-
for p in [someprogram] + processes:
|
66
|
-
p.wait()
|
67
|
-
|
68
|
-
|
69
|
-
@cli.command('diff', short_help='Diff a DVC-tracked file at two commits (or one commit vs. current worktree), optionally passing both through another command first')
|
70
|
-
@click.option('-r', '--refspec', default='HEAD', help='<commit 1>..<commit 2> (compare two commits) or <commit> (compare <commit> to the worktree)')
|
71
|
-
@click.option('-S', '--no-shell', is_flag=True, help="Don't pass `shell=True` to Python `subprocess`es")
|
72
|
-
@click.option('-v', '--verbose', is_flag=True, help="Log intermediate commands to stderr")
|
73
|
-
@click.argument('args', metavar='[cmd...] <path>', nargs=-1)
|
74
|
-
def dvc_utils_diff(refspec, no_shell, verbose, args):
|
75
|
-
"""Diff a file at two commits (or one commit vs. current worktree), optionally passing both through `cmd` first
|
76
|
-
|
77
|
-
Examples:
|
78
|
-
|
79
|
-
dvc-utils diff -r HEAD^..HEAD wc -l foo.dvc # Compare the number of lines (`wc -l`) in `foo` (the file referenced by `foo.dvc`) at the previous vs. current commit (`HEAD^..HEAD`).
|
80
|
-
|
81
|
-
dvc-utils diff md5sum foo # Diff the `md5sum` of `foo` (".dvc" extension is optional) at HEAD (last committed value) vs. the current worktree content.
|
82
|
-
"""
|
83
|
-
if not args:
|
84
|
-
raise click.UsageError('Must specify [cmd...] <path>')
|
85
|
-
|
86
|
-
shell = not no_shell
|
87
|
-
(*cmd, path) = args
|
88
|
-
if path.endswith('.dvc'):
|
89
|
-
dvc_path = path
|
90
|
-
path = dvc_path[:-len('.dvc')]
|
91
|
-
else:
|
92
|
-
dvc_path = f'{path}.dvc'
|
93
|
-
|
94
|
-
pcs = refspec.split('..', 1)
|
95
|
-
if len(pcs) == 1:
|
96
|
-
before = pcs[0]
|
97
|
-
after = None
|
98
|
-
else:
|
99
|
-
before, after = pcs
|
100
|
-
|
101
|
-
log = err if verbose else False
|
102
|
-
before_path = dvc_cache_path(before, dvc_path, log=log)
|
103
|
-
after_path = path if after is None else dvc_cache_path(after, dvc_path, log=log)
|
104
|
-
|
105
|
-
if cmd:
|
106
|
-
def args(path):
|
107
|
-
arr = cmd + [path]
|
108
|
-
return shlex.join(arr) if shell else arr
|
109
|
-
|
110
|
-
shell_kwargs = dict(shell=shell) if shell else {}
|
111
|
-
before_cmd = args(before_path)
|
112
|
-
after_cmd = args(after_path)
|
113
|
-
diff_cmds(before_cmd, after_cmd, **shell_kwargs)
|
114
|
-
else:
|
115
|
-
process.run('diff', before_path, after_path, log=log)
|
116
|
-
|
117
|
-
|
118
|
-
if __name__ == '__main__':
|
119
|
-
cli()
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|