dvc-utils 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dvc_utils/main.py CHANGED
@@ -1,5 +1,12 @@
1
+ from functools import cache
2
+ from os import environ as env, getcwd
3
+
4
+ from typing import Optional, Tuple
5
+
1
6
  import shlex
2
- from os.path import join
7
+ from os.path import join, relpath
8
+
9
+ from click import option, argument, group
3
10
  from subprocess import Popen
4
11
 
5
12
  import click
@@ -9,12 +16,12 @@ from utz import process, singleton, err
9
16
  from dvc_utils.named_pipes import named_pipes
10
17
 
11
18
 
12
- @click.group()
19
+ @group()
13
20
  def cli():
14
21
  pass
15
22
 
16
23
 
17
- def dvc_paths(path):
24
+ def dvc_paths(path: str) -> Tuple[str, str]:
18
25
  if path.endswith('.dvc'):
19
26
  dvc_path = path
20
27
  path = dvc_path[:-len('.dvc')]
@@ -23,54 +30,70 @@ def dvc_paths(path):
23
30
  return path, dvc_path
24
31
 
25
32
 
26
- def dvc_md5(git_ref, dvc_path, log=False):
27
- dvc_spec = process.output('git', 'show', f'{git_ref}:{dvc_path}', log=log)
33
+ @cache
34
+ def get_git_root() -> str:
35
+ return process.line('git', 'rev-parse', '--show-toplevel', log=False)
36
+
37
+
38
+ @cache
39
+ def get_dir_path() -> str:
40
+ return relpath(getcwd(), get_git_root())
41
+
42
+
43
+ @cache
44
+ def dvc_cache_dir(log: bool = False) -> str:
45
+ dvc_cache_relpath = env.get('DVC_UTILS_CACHE_DIR')
46
+ if dvc_cache_relpath:
47
+ return join(get_git_root(), dvc_cache_relpath)
48
+ else:
49
+ return process.line('dvc', 'cache', 'dir', log=log)
50
+
51
+
52
+ def dvc_md5(git_ref: str, dvc_path: str, log: bool = False) -> str:
53
+ dir_path = get_dir_path()
54
+ dir_path = '' if dir_path == '.' else f'{dir_path}/'
55
+ dvc_spec = process.output('git', 'show', f'{git_ref}:{dir_path}{dvc_path}', log=log)
28
56
  dvc_obj = yaml.safe_load(dvc_spec)
29
57
  out = singleton(dvc_obj['outs'], dedupe=False)
30
58
  md5 = out['md5']
31
59
  return md5
32
60
 
33
61
 
34
- _dvc_cache_dir = None
35
- def dvc_cache_dir(log=False):
36
- global _dvc_cache_dir
37
- if _dvc_cache_dir is None:
38
- _dvc_cache_dir = process.line('dvc', 'cache', 'dir', log=log)
39
- return _dvc_cache_dir
40
-
41
-
42
- def dvc_cache_path(spec, dvc_path=None, log=False):
62
+ def dvc_cache_path(ref: str, dvc_path: Optional[str] = None, log: bool = False) -> str:
43
63
  if dvc_path:
44
- md5 = dvc_md5(spec, dvc_path, log=log)
45
- elif ':' in spec:
46
- git_ref, dvc_path = spec.split(':', 1)
64
+ md5 = dvc_md5(ref, dvc_path, log=log)
65
+ elif ':' in ref:
66
+ git_ref, dvc_path = ref.split(':', 1)
47
67
  md5 = dvc_md5(git_ref, dvc_path, log=log)
48
68
  else:
49
- md5 = spec
69
+ md5 = ref
50
70
  dirname = md5[:2]
51
71
  basename = md5[2:]
52
72
  return join(dvc_cache_dir(log=log), 'files', 'md5', dirname, basename)
53
73
 
54
74
 
55
- def diff_cmds(cmd1, cmd2, **kwargs):
75
+ def diff_cmds(cmd1: str, cmd2: str, verbose: bool = False, **kwargs):
56
76
  """Run two commands and diff their output.
57
77
 
58
78
  Adapted from https://stackoverflow.com/a/28840955"""
59
- with named_pipes(n=2) as paths:
60
- someprogram = Popen(['diff'] + paths)
79
+ with named_pipes(n=2) as pipes:
80
+ (pipe1, pipe2) = pipes
81
+ diff = Popen(['diff'] + pipes)
61
82
  processes = []
62
- for path, cmd in zip(paths, [ cmd1, cmd2 ]):
83
+ for path, cmd in ((pipe1, cmd1), (pipe2, cmd2)):
63
84
  with open(path, 'wb', 0) as pipe:
85
+ if verbose:
86
+ err(f"Running: {cmd}")
64
87
  processes.append(Popen(cmd, stdout=pipe, close_fds=True, **kwargs))
65
- for p in [someprogram] + processes:
88
+ for p in [diff] + processes:
66
89
  p.wait()
67
90
 
68
91
 
69
92
  @cli.command('diff', short_help='Diff a DVC-tracked file at two commits (or one commit vs. current worktree), optionally passing both through another command first')
70
- @click.option('-r', '--refspec', default='HEAD', help='<commit 1>..<commit 2> (compare two commits) or <commit> (compare <commit> to the worktree)')
71
- @click.option('-S', '--no-shell', is_flag=True, help="Don't pass `shell=True` to Python `subprocess`es")
72
- @click.option('-v', '--verbose', is_flag=True, help="Log intermediate commands to stderr")
73
- @click.argument('args', metavar='[cmd...] <path>', nargs=-1)
93
+ @option('-r', '--refspec', default='HEAD', help='<commit 1>..<commit 2> (compare two commits) or <commit> (compare <commit> to the worktree)')
94
+ @option('-S', '--no-shell', is_flag=True, help="Don't pass `shell=True` to Python `subprocess`es")
95
+ @option('-v', '--verbose', is_flag=True, help="Log intermediate commands to stderr")
96
+ @argument('args', metavar='[cmd...] <path>', nargs=-1)
74
97
  def dvc_utils_diff(refspec, no_shell, verbose, args):
75
98
  """Diff a file at two commits (or one commit vs. current worktree), optionally passing both through `cmd` first
76
99
 
@@ -84,33 +107,39 @@ def dvc_utils_diff(refspec, no_shell, verbose, args):
84
107
  raise click.UsageError('Must specify [cmd...] <path>')
85
108
 
86
109
  shell = not no_shell
87
- (*cmd, path) = args
88
- if path.endswith('.dvc'):
89
- dvc_path = path
90
- path = dvc_path[:-len('.dvc')]
110
+ if len(args) == 2:
111
+ cmd, path = args
112
+ cmd = shlex.split(cmd)
113
+ elif len(args) == 1:
114
+ cmd = None
115
+ path, = args
91
116
  else:
92
- dvc_path = f'{path}.dvc'
117
+ raise click.UsageError('Maximum 2 positional args: [cmd] <path>')
118
+
119
+ path, dvc_path = dvc_paths(path)
93
120
 
94
121
  pcs = refspec.split('..', 1)
95
122
  if len(pcs) == 1:
96
123
  before = pcs[0]
97
124
  after = None
98
- else:
125
+ elif len(pcs) == 2:
99
126
  before, after = pcs
127
+ else:
128
+ raise ValueError(f"Invalid refspec: {refspec}")
100
129
 
101
130
  log = err if verbose else False
102
131
  before_path = dvc_cache_path(before, dvc_path, log=log)
103
132
  after_path = path if after is None else dvc_cache_path(after, dvc_path, log=log)
104
133
 
105
134
  if cmd:
106
- def args(path):
135
+ def args(path: str):
107
136
  arr = cmd + [path]
108
137
  return shlex.join(arr) if shell else arr
109
138
 
110
139
  shell_kwargs = dict(shell=shell) if shell else {}
111
140
  before_cmd = args(before_path)
112
141
  after_cmd = args(after_path)
113
- diff_cmds(before_cmd, after_cmd, **shell_kwargs)
142
+ diff_cmds(before_cmd, after_cmd, verbose=verbose, **shell_kwargs)
114
143
  else:
115
144
  process.run('diff', before_path, after_path, log=log)
116
145
 
dvc_utils/named_pipes.py CHANGED
@@ -5,7 +5,7 @@ from contextlib import contextmanager
5
5
 
6
6
 
7
7
  @contextmanager
8
- def named_pipes(n=1):
8
+ def named_pipes(n: int = 1):
9
9
  """Yield a list of paths to named pipes that are created and destroyed
10
10
 
11
11
  From https://stackoverflow.com/a/28840955"""
@@ -0,0 +1,217 @@
1
+ Metadata-Version: 2.1
2
+ Name: dvc-utils
3
+ Version: 0.0.3
4
+ Summary: CLI for diffing DVC files at two commits (or one commit vs. current worktree), optionally passing both through another command first
5
+ Home-page: https://github.com/runsascoded/dvc-utils
6
+ Author: Ryan Williams
7
+ Author-email: ryan@runsascoded.com
8
+ License: MIT
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+
12
+ # dvc-utils
13
+ CLI for diffing [DVC] files, optionally passing both through another command first
14
+
15
+ <!-- toc -->
16
+ - [Installation](#installation)
17
+ - [Usage](#usage)
18
+ - [`dvc-utils diff`](#dvc-utils-diff)
19
+ - [Examples](#examples)
20
+ - [Parquet file](#parquet-diff)
21
+ - [Schema diff](#parquet-schema-diff)
22
+ - [Row diff](#parquet-row-diff)
23
+ - [Row count diff](#parquet-row-count-diff)
24
+ <!-- /toc -->
25
+
26
+ ## Installation <a id="installation"></a>
27
+ ```bash
28
+ pip install dvc-utils
29
+ ```
30
+
31
+ ## Usage <a id="usage"></a>
32
+ ```bash
33
+ dvc-utils --help
34
+ # Usage: dvc-utils [OPTIONS] COMMAND [ARGS]...
35
+ #
36
+ # Options:
37
+ # --help Show this message and exit.
38
+ #
39
+ # Commands:
40
+ # diff Diff a DVC-tracked file at two commits (or one commit vs. current
41
+ # worktree), optionally passing both through another command first
42
+ ```
43
+
44
+ ### `dvc-utils diff` <a id="dvc-utils-diff"></a>
45
+ ```bash
46
+ dvc-utils diff --help
47
+ # Usage: dvc-utils diff [OPTIONS] [cmd...] <path>
48
+ #
49
+ # Diff a file at two commits (or one commit vs. current worktree), optionally
50
+ # passing both through `cmd` first
51
+ #
52
+ # Examples:
53
+ #
54
+ # dvc-utils diff -r HEAD^..HEAD wc -l foo.dvc # Compare the number of lines
55
+ # (`wc -l`) in `foo` (the file referenced by `foo.dvc`) at the previous vs.
56
+ # current commit (`HEAD^..HEAD`).
57
+ #
58
+ # dvc-utils diff md5sum foo # Diff the `md5sum` of `foo` (".dvc" extension is
59
+ # optional) at HEAD (last committed value) vs. the current worktree content.
60
+ #
61
+ # Options:
62
+ # -r, --refspec TEXT <commit 1>..<commit 2> (compare two commits) or <commit>
63
+ # (compare <commit> to the worktree)
64
+ # -S, --no-shell Don't pass `shell=True` to Python `subprocess`es
65
+ # -v, --verbose Log intermediate commands to stderr
66
+ # --help Show this message and exit.
67
+ ```
68
+
69
+ ## Examples <a id="examples"></a>
70
+
71
+ ### Parquet file <a id="parquet-diff"></a>
72
+ See sample commands and output below for inspecting changes to [a DVC-tracked Parquet file][commit path] in [a given commit][commit].
73
+
74
+ Setup:
75
+ ```bash
76
+ git clone https://github.com/hudcostreets/nj-crashes && cd nj-crashes # Clone + enter example repo
77
+ commit=c8ae28e # Example commit that changed some DVC-tracked Parquet files
78
+ path=njdot/data/2001/NewJersey2001Accidents.pqt.dvc # One of the changed files
79
+ ```
80
+
81
+ #### Schema diff <a id="parquet-schema-diff"></a>
82
+ Use [`parquet2json`] to observe schema changes to a Parquet file:
83
+ ```bash
84
+ parquet_schema() {
85
+ parquet2json "$1" schema
86
+ }
87
+ export -f parquet_schema
88
+ dvc-utils diff -r $commit^..$commit parquet_schema $path
89
+ ```
90
+ <details><summary>Output</summary>
91
+
92
+ ```diff
93
+ 2d1
94
+ < OPTIONAL BYTE_ARRAY Year (STRING);
95
+ 8,10d6
96
+ < OPTIONAL BYTE_ARRAY Crash Date (STRING);
97
+ < OPTIONAL BYTE_ARRAY Crash Day Of Week (STRING);
98
+ < OPTIONAL BYTE_ARRAY Crash Time (STRING);
99
+ 14,17c10,13
100
+ < OPTIONAL BYTE_ARRAY Total Killed (STRING);
101
+ < OPTIONAL BYTE_ARRAY Total Injured (STRING);
102
+ < OPTIONAL BYTE_ARRAY Pedestrians Killed (STRING);
103
+ < OPTIONAL BYTE_ARRAY Pedestrians Injured (STRING);
104
+ ---
105
+ > OPTIONAL INT64 Total Killed;
106
+ > OPTIONAL INT64 Total Injured;
107
+ > OPTIONAL INT64 Pedestrians Killed;
108
+ > OPTIONAL INT64 Pedestrians Injured;
109
+ 20,21c16,17
110
+ < OPTIONAL BYTE_ARRAY Alcohol Involved (STRING);
111
+ < OPTIONAL BYTE_ARRAY HazMat Involved (STRING);
112
+ ---
113
+ > OPTIONAL BOOLEAN Alcohol Involved;
114
+ > OPTIONAL BOOLEAN HazMat Involved;
115
+ 23c19
116
+ < OPTIONAL BYTE_ARRAY Total Vehicles Involved (STRING);
117
+ ---
118
+ > OPTIONAL INT64 Total Vehicles Involved;
119
+ 29c25
120
+ < OPTIONAL BYTE_ARRAY Mile Post (STRING);
121
+ ---
122
+ > OPTIONAL DOUBLE Mile Post;
123
+ 47,48c43,44
124
+ < OPTIONAL BYTE_ARRAY Latitude (STRING);
125
+ < OPTIONAL BYTE_ARRAY Longitude (STRING);
126
+ ---
127
+ > OPTIONAL DOUBLE Latitude;
128
+ > OPTIONAL DOUBLE Longitude;
129
+ 51a48
130
+ > OPTIONAL INT64 Date (TIMESTAMP(MICROS,false));
131
+ ```
132
+
133
+ Here we can see that various date/time columns were consolidated, and several stringly-typed columns were converted to ints, floats, and booleans.
134
+
135
+ </details>
136
+
137
+ #### Row diff <a id="parquet-row-diff"></a>
138
+ Diff the first row of the Parquet file above (pretty-printed as JSON using [`jq`]), before and after the given commit:
139
+
140
+ ```bash
141
+ pretty_print_first_row() {
142
+ # Print first row of Parquet file as JSON, pretty-print with jq
143
+ parquet2json "$1" cat -l 1 | jq .
144
+ }
145
+ export -f pretty_print_first_row
146
+ dvc-utils diff -r $commit^..$commit pretty_print_first_row $path
147
+ ```
148
+
149
+ <details><summary>Output</summary>
150
+
151
+ ```diff
152
+ 2d1
153
+ < "Year": "2001",
154
+ 8,10d6
155
+ < "Crash Date": "12/21/2001",
156
+ < "Crash Day Of Week": "F",
157
+ < "Crash Time": "1834",
158
+ 14,17c10,13
159
+ < "Total Killed": "0",
160
+ < "Total Injured": "0",
161
+ < "Pedestrians Killed": "0",
162
+ < "Pedestrians Injured": "0",
163
+ ---
164
+ > "Total Killed": 0,
165
+ > "Total Injured": 0,
166
+ > "Pedestrians Killed": 0,
167
+ > "Pedestrians Injured": 0,
168
+ 20,21c16,17
169
+ < "Alcohol Involved": "N",
170
+ < "HazMat Involved": "N",
171
+ ---
172
+ > "Alcohol Involved": false,
173
+ > "HazMat Involved": false,
174
+ 23c19
175
+ < "Total Vehicles Involved": "2",
176
+ ---
177
+ > "Total Vehicles Involved": 2,
178
+ 29c25
179
+ < "Mile Post": "",
180
+ ---
181
+ > "Mile Post": null,
182
+ 47,48c43,44
183
+ < "Latitude": "",
184
+ < "Longitude": "",
185
+ ---
186
+ > "Latitude": null,
187
+ > "Longitude": null,
188
+ 51c47,48
189
+ < "Reporting Badge No.": "830"
190
+ ---
191
+ > "Reporting Badge No.": "830",
192
+ > "Date": "2001-12-21 18:34:00 +00:00"
193
+ ```
194
+
195
+ This reflects the schema changes above.
196
+
197
+ </details>
198
+
199
+ #### Row count diff <a id="parquet-row-count-diff"></a>
200
+ ```bash
201
+ parquet_row_count() {
202
+ parquet2json "$1" rowcount
203
+ }
204
+ export -f parquet_row_count
205
+ dvc-utils diff -r $commit^..$commit parquet_row_count $path
206
+ ```
207
+
208
+ This time we get no output; [the given `$commit`][commit] didn't change the row count in the DVC-tracked Parquet file [`$path`][commit path].
209
+
210
+ [DVC]: https://dvc.org/
211
+ [`parquet2json`]: https://github.com/jupiter/parquet2json
212
+ [hudcostreets/nj-crashes]: https://github.com/hudcostreets/nj-crashes
213
+ [Parquet]: https://parquet.apache.org/
214
+ [commit]: https://github.com/hudcostreets/nj-crashes/commit/c8ae28e64f4917895d84074913f48e0a7afbc3d7
215
+ [commit path]: https://github.com/hudcostreets/nj-crashes/commit/c8ae28e64f4917895d84074913f48e0a7afbc3d7#diff-7f812dce61e0996354f4af414203e0933ccdfe9613cb406c40c1c41a14b9769c
216
+ [hudcostreets/nj-crashes]: https://github.com/hudcostreets/nj-crashes
217
+ [`jq`]: https://jqlang.github.io/jq/
@@ -0,0 +1,9 @@
1
+ dvc_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ dvc_utils/main.py,sha256=0rpJptELszWdFLoCKoYOY6VQtAm2mp-3_-kN075T-TE,4743
3
+ dvc_utils/named_pipes.py,sha256=VQ2t9BYCazFq_-MABj4t2HS7GHDvSqXXx8fOLz5DsTc,492
4
+ dvc_utils-0.0.3.dist-info/LICENSE,sha256=ZS8AReay7xmQzBAHwxIuTouGXz3SKgUa2_Sz8Ip0EzQ,1070
5
+ dvc_utils-0.0.3.dist-info/METADATA,sha256=ExDOJDxXQTUrklQfYm6qTdWNW8Le6xknle4mG2sQFpg,6572
6
+ dvc_utils-0.0.3.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
7
+ dvc_utils-0.0.3.dist-info/entry_points.txt,sha256=W9OuZ6CX8QF9ojbqLtfXFo8Q2hnJ-zlcGY4_7nO8paM,49
8
+ dvc_utils-0.0.3.dist-info/top_level.txt,sha256=jT0-PJa2t_eFRE9rn-52AjdnZ8nQeEHllf2kJmaGh80,10
9
+ dvc_utils-0.0.3.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.41.3)
2
+ Generator: bdist_wheel (0.44.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,8 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: dvc-utils
3
- Version: 0.0.1
4
- License-File: LICENSE
5
- Requires-Dist: click
6
- Requires-Dist: pyyaml
7
- Requires-Dist: utz
8
-
@@ -1,9 +0,0 @@
1
- dvc_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- dvc_utils/main.py,sha256=_D_FVRVnBGRHb6XIeTLOOFKc0v5J9D8_O6thG8_lSmU,3863
3
- dvc_utils/named_pipes.py,sha256=GqWvsvTMmnkjk0gPM1aXBIW5dUsSkW-eblerHJ18B68,485
4
- dvc_utils-0.0.1.dist-info/LICENSE,sha256=ZS8AReay7xmQzBAHwxIuTouGXz3SKgUa2_Sz8Ip0EzQ,1070
5
- dvc_utils-0.0.1.dist-info/METADATA,sha256=YRQeC1fJaUdBHluoCt6BXsEM5LozJesFVqRnILU9Nhs,138
6
- dvc_utils-0.0.1.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
7
- dvc_utils-0.0.1.dist-info/entry_points.txt,sha256=W9OuZ6CX8QF9ojbqLtfXFo8Q2hnJ-zlcGY4_7nO8paM,49
8
- dvc_utils-0.0.1.dist-info/top_level.txt,sha256=jT0-PJa2t_eFRE9rn-52AjdnZ8nQeEHllf2kJmaGh80,10
9
- dvc_utils-0.0.1.dist-info/RECORD,,