dvc-utils 0.0.2__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dvc_utils/main.py CHANGED
@@ -1,5 +1,12 @@
1
+ from functools import cache
2
+ from os import environ as env, getcwd
3
+
4
+ from typing import Optional, Tuple
5
+
1
6
  import shlex
2
- from os.path import join
7
+ from os.path import join, relpath
8
+
9
+ from click import option, argument, group
3
10
  from subprocess import Popen
4
11
 
5
12
  import click
@@ -9,12 +16,12 @@ from utz import process, singleton, err
9
16
  from dvc_utils.named_pipes import named_pipes
10
17
 
11
18
 
12
- @click.group()
19
+ @group()
13
20
  def cli():
14
21
  pass
15
22
 
16
23
 
17
- def dvc_paths(path):
24
+ def dvc_paths(path: str) -> Tuple[str, str]:
18
25
  if path.endswith('.dvc'):
19
26
  dvc_path = path
20
27
  path = dvc_path[:-len('.dvc')]
@@ -23,54 +30,70 @@ def dvc_paths(path):
23
30
  return path, dvc_path
24
31
 
25
32
 
26
- def dvc_md5(git_ref, dvc_path, log=False):
27
- dvc_spec = process.output('git', 'show', f'{git_ref}:{dvc_path}', log=log)
33
+ @cache
34
+ def get_git_root() -> str:
35
+ return process.line('git', 'rev-parse', '--show-toplevel', log=False)
36
+
37
+
38
+ @cache
39
+ def get_dir_path() -> str:
40
+ return relpath(getcwd(), get_git_root())
41
+
42
+
43
+ @cache
44
+ def dvc_cache_dir(log: bool = False) -> str:
45
+ dvc_cache_relpath = env.get('DVC_UTILS_CACHE_DIR')
46
+ if dvc_cache_relpath:
47
+ return join(get_git_root(), dvc_cache_relpath)
48
+ else:
49
+ return process.line('dvc', 'cache', 'dir', log=log)
50
+
51
+
52
+ def dvc_md5(git_ref: str, dvc_path: str, log: bool = False) -> str:
53
+ dir_path = get_dir_path()
54
+ dir_path = '' if dir_path == '.' else f'{dir_path}/'
55
+ dvc_spec = process.output('git', 'show', f'{git_ref}:{dir_path}{dvc_path}', log=log)
28
56
  dvc_obj = yaml.safe_load(dvc_spec)
29
57
  out = singleton(dvc_obj['outs'], dedupe=False)
30
58
  md5 = out['md5']
31
59
  return md5
32
60
 
33
61
 
34
- _dvc_cache_dir = None
35
- def dvc_cache_dir(log=False):
36
- global _dvc_cache_dir
37
- if _dvc_cache_dir is None:
38
- _dvc_cache_dir = process.line('dvc', 'cache', 'dir', log=log)
39
- return _dvc_cache_dir
40
-
41
-
42
- def dvc_cache_path(spec, dvc_path=None, log=False):
62
+ def dvc_cache_path(ref: str, dvc_path: Optional[str] = None, log: bool = False) -> str:
43
63
  if dvc_path:
44
- md5 = dvc_md5(spec, dvc_path, log=log)
45
- elif ':' in spec:
46
- git_ref, dvc_path = spec.split(':', 1)
64
+ md5 = dvc_md5(ref, dvc_path, log=log)
65
+ elif ':' in ref:
66
+ git_ref, dvc_path = ref.split(':', 1)
47
67
  md5 = dvc_md5(git_ref, dvc_path, log=log)
48
68
  else:
49
- md5 = spec
69
+ md5 = ref
50
70
  dirname = md5[:2]
51
71
  basename = md5[2:]
52
72
  return join(dvc_cache_dir(log=log), 'files', 'md5', dirname, basename)
53
73
 
54
74
 
55
- def diff_cmds(cmd1, cmd2, **kwargs):
75
+ def diff_cmds(cmd1: str, cmd2: str, verbose: bool = False, **kwargs):
56
76
  """Run two commands and diff their output.
57
77
 
58
78
  Adapted from https://stackoverflow.com/a/28840955"""
59
- with named_pipes(n=2) as paths:
60
- someprogram = Popen(['diff'] + paths)
79
+ with named_pipes(n=2) as pipes:
80
+ (pipe1, pipe2) = pipes
81
+ diff = Popen(['diff'] + pipes)
61
82
  processes = []
62
- for path, cmd in zip(paths, [ cmd1, cmd2 ]):
83
+ for path, cmd in ((pipe1, cmd1), (pipe2, cmd2)):
63
84
  with open(path, 'wb', 0) as pipe:
85
+ if verbose:
86
+ err(f"Running: {cmd}")
64
87
  processes.append(Popen(cmd, stdout=pipe, close_fds=True, **kwargs))
65
- for p in [someprogram] + processes:
88
+ for p in [diff] + processes:
66
89
  p.wait()
67
90
 
68
91
 
69
92
  @cli.command('diff', short_help='Diff a DVC-tracked file at two commits (or one commit vs. current worktree), optionally passing both through another command first')
70
- @click.option('-r', '--refspec', default='HEAD', help='<commit 1>..<commit 2> (compare two commits) or <commit> (compare <commit> to the worktree)')
71
- @click.option('-S', '--no-shell', is_flag=True, help="Don't pass `shell=True` to Python `subprocess`es")
72
- @click.option('-v', '--verbose', is_flag=True, help="Log intermediate commands to stderr")
73
- @click.argument('args', metavar='[cmd...] <path>', nargs=-1)
93
+ @option('-r', '--refspec', default='HEAD', help='<commit 1>..<commit 2> (compare two commits) or <commit> (compare <commit> to the worktree)')
94
+ @option('-S', '--no-shell', is_flag=True, help="Don't pass `shell=True` to Python `subprocess`es")
95
+ @option('-v', '--verbose', is_flag=True, help="Log intermediate commands to stderr")
96
+ @argument('args', metavar='[cmd...] <path>', nargs=-1)
74
97
  def dvc_utils_diff(refspec, no_shell, verbose, args):
75
98
  """Diff a file at two commits (or one commit vs. current worktree), optionally passing both through `cmd` first
76
99
 
@@ -84,33 +107,39 @@ def dvc_utils_diff(refspec, no_shell, verbose, args):
84
107
  raise click.UsageError('Must specify [cmd...] <path>')
85
108
 
86
109
  shell = not no_shell
87
- (*cmd, path) = args
88
- if path.endswith('.dvc'):
89
- dvc_path = path
90
- path = dvc_path[:-len('.dvc')]
110
+ if len(args) == 2:
111
+ cmd, path = args
112
+ cmd = shlex.split(cmd)
113
+ elif len(args) == 1:
114
+ cmd = None
115
+ path, = args
91
116
  else:
92
- dvc_path = f'{path}.dvc'
117
+ raise click.UsageError('Maximum 2 positional args: [cmd] <path>')
118
+
119
+ path, dvc_path = dvc_paths(path)
93
120
 
94
121
  pcs = refspec.split('..', 1)
95
122
  if len(pcs) == 1:
96
123
  before = pcs[0]
97
124
  after = None
98
- else:
125
+ elif len(pcs) == 2:
99
126
  before, after = pcs
127
+ else:
128
+ raise ValueError(f"Invalid refspec: {refspec}")
100
129
 
101
130
  log = err if verbose else False
102
131
  before_path = dvc_cache_path(before, dvc_path, log=log)
103
132
  after_path = path if after is None else dvc_cache_path(after, dvc_path, log=log)
104
133
 
105
134
  if cmd:
106
- def args(path):
135
+ def args(path: str):
107
136
  arr = cmd + [path]
108
137
  return shlex.join(arr) if shell else arr
109
138
 
110
139
  shell_kwargs = dict(shell=shell) if shell else {}
111
140
  before_cmd = args(before_path)
112
141
  after_cmd = args(after_path)
113
- diff_cmds(before_cmd, after_cmd, **shell_kwargs)
142
+ diff_cmds(before_cmd, after_cmd, verbose=verbose, **shell_kwargs)
114
143
  else:
115
144
  process.run('diff', before_path, after_path, log=log)
116
145
 
dvc_utils/named_pipes.py CHANGED
@@ -5,7 +5,7 @@ from contextlib import contextmanager
5
5
 
6
6
 
7
7
  @contextmanager
8
- def named_pipes(n=1):
8
+ def named_pipes(n: int = 1):
9
9
  """Yield a list of paths to named pipes that are created and destroyed
10
10
 
11
11
  From https://stackoverflow.com/a/28840955"""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dvc-utils
3
- Version: 0.0.2
3
+ Version: 0.0.3
4
4
  Summary: CLI for diffing DVC files at two commits (or one commit vs. current worktree), optionally passing both through another command first
5
5
  Home-page: https://github.com/runsascoded/dvc-utils
6
6
  Author: Ryan Williams
@@ -10,14 +10,25 @@ Description-Content-Type: text/markdown
10
10
  License-File: LICENSE
11
11
 
12
12
  # dvc-utils
13
- CLI for diffing [DVC] files at two commits (or one commit vs. current worktree), optionally passing both through another command first
14
-
15
- ## Installation
13
+ CLI for diffing [DVC] files, optionally passing both through another command first
14
+
15
+ <!-- toc -->
16
+ - [Installation](#installation)
17
+ - [Usage](#usage)
18
+ - [`dvc-utils diff`](#dvc-utils-diff)
19
+ - [Examples](#examples)
20
+ - [Parquet file](#parquet-diff)
21
+ - [Schema diff](#parquet-schema-diff)
22
+ - [Row diff](#parquet-row-diff)
23
+ - [Row count diff](#parquet-row-count-diff)
24
+ <!-- /toc -->
25
+
26
+ ## Installation <a id="installation"></a>
16
27
  ```bash
17
28
  pip install dvc-utils
18
29
  ```
19
30
 
20
- ## Usage
31
+ ## Usage <a id="usage"></a>
21
32
  ```bash
22
33
  dvc-utils --help
23
34
  # Usage: dvc-utils [OPTIONS] COMMAND [ARGS]...
@@ -30,7 +41,7 @@ dvc-utils --help
30
41
  # worktree), optionally passing both through another command first
31
42
  ```
32
43
 
33
- ### `dvc-utils diff`
44
+ ### `dvc-utils diff` <a id="dvc-utils-diff"></a>
34
45
  ```bash
35
46
  dvc-utils diff --help
36
47
  # Usage: dvc-utils diff [OPTIONS] [cmd...] <path>
@@ -55,17 +66,20 @@ dvc-utils diff --help
55
66
  # --help Show this message and exit.
56
67
  ```
57
68
 
58
- ## Examples
69
+ ## Examples <a id="examples"></a>
70
+
71
+ ### Parquet file <a id="parquet-diff"></a>
59
72
  See sample commands and output below for inspecting changes to [a DVC-tracked Parquet file][commit path] in [a given commit][commit].
60
73
 
74
+ Setup:
61
75
  ```bash
62
- git clone https://github.com/neighbor-ryan/nj-crashes
63
- commit=c8ae28e
64
- path=njdot/data/2001/NewJersey2001Accidents.pqt.dvc
76
+ git clone https://github.com/hudcostreets/nj-crashes && cd nj-crashes # Clone + enter example repo
77
+ commit=c8ae28e # Example commit that changed some DVC-tracked Parquet files
78
+ path=njdot/data/2001/NewJersey2001Accidents.pqt.dvc # One of the changed files
65
79
  ```
66
80
 
67
- ### Parquet schema diff
68
- Use [`parquet2json`] to observe schema changes to a Parquet file, in [a given commit][commit] from [neighbor-ryan/nj-crashes]:
81
+ #### Schema diff <a id="parquet-schema-diff"></a>
82
+ Use [`parquet2json`] to observe schema changes to a Parquet file:
69
83
  ```bash
70
84
  parquet_schema() {
71
85
  parquet2json "$1" schema
@@ -120,11 +134,12 @@ Here we can see that various date/time columns were consolidated, and several st
120
134
 
121
135
  </details>
122
136
 
123
- ### Parquet row diff
124
- Diff the first row of the Parquet file above (pretty-printed as JSON), before and after the given commit:
137
+ #### Row diff <a id="parquet-row-diff"></a>
138
+ Diff the first row of the Parquet file above (pretty-printed as JSON using [`jq`]), before and after the given commit:
125
139
 
126
140
  ```bash
127
141
  pretty_print_first_row() {
142
+ # Print first row of Parquet file as JSON, pretty-print with jq
128
143
  parquet2json "$1" cat -l 1 | jq .
129
144
  }
130
145
  export -f pretty_print_first_row
@@ -181,7 +196,7 @@ This reflects the schema changes above.
181
196
 
182
197
  </details>
183
198
 
184
- ### Parquet row count diff
199
+ #### Row count diff <a id="parquet-row-count-diff"></a>
185
200
  ```bash
186
201
  parquet_row_count() {
187
202
  parquet2json "$1" rowcount
@@ -194,8 +209,9 @@ This time we get no output; [the given `$commit`][commit] didn't change the row
194
209
 
195
210
  [DVC]: https://dvc.org/
196
211
  [`parquet2json`]: https://github.com/jupiter/parquet2json
197
- [neighbor-ryan/nj-crashes]: https://github.com/neighbor-ryan/nj-crashes
212
+ [hudcostreets/nj-crashes]: https://github.com/hudcostreets/nj-crashes
198
213
  [Parquet]: https://parquet.apache.org/
199
- [commit]: https://github.com/neighbor-ryan/nj-crashes/commit/c8ae28e64f4917895d84074913f48e0a7afbc3d7
200
- [commit path]: https://github.com/neighbor-ryan/nj-crashes/commit/c8ae28e64f4917895d84074913f48e0a7afbc3d7#diff-7f812dce61e0996354f4af414203e0933ccdfe9613cb406c40c1c41a14b9769c
201
- [neighbor-ryan/nj-crashes]: https://github.com/neighbor-ryan/nj-crashes
214
+ [commit]: https://github.com/hudcostreets/nj-crashes/commit/c8ae28e64f4917895d84074913f48e0a7afbc3d7
215
+ [commit path]: https://github.com/hudcostreets/nj-crashes/commit/c8ae28e64f4917895d84074913f48e0a7afbc3d7#diff-7f812dce61e0996354f4af414203e0933ccdfe9613cb406c40c1c41a14b9769c
216
+ [hudcostreets/nj-crashes]: https://github.com/hudcostreets/nj-crashes
217
+ [`jq`]: https://jqlang.github.io/jq/
@@ -0,0 +1,9 @@
1
+ dvc_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ dvc_utils/main.py,sha256=0rpJptELszWdFLoCKoYOY6VQtAm2mp-3_-kN075T-TE,4743
3
+ dvc_utils/named_pipes.py,sha256=VQ2t9BYCazFq_-MABj4t2HS7GHDvSqXXx8fOLz5DsTc,492
4
+ dvc_utils-0.0.3.dist-info/LICENSE,sha256=ZS8AReay7xmQzBAHwxIuTouGXz3SKgUa2_Sz8Ip0EzQ,1070
5
+ dvc_utils-0.0.3.dist-info/METADATA,sha256=ExDOJDxXQTUrklQfYm6qTdWNW8Le6xknle4mG2sQFpg,6572
6
+ dvc_utils-0.0.3.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
7
+ dvc_utils-0.0.3.dist-info/entry_points.txt,sha256=W9OuZ6CX8QF9ojbqLtfXFo8Q2hnJ-zlcGY4_7nO8paM,49
8
+ dvc_utils-0.0.3.dist-info/top_level.txt,sha256=jT0-PJa2t_eFRE9rn-52AjdnZ8nQeEHllf2kJmaGh80,10
9
+ dvc_utils-0.0.3.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.41.3)
2
+ Generator: bdist_wheel (0.44.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,9 +0,0 @@
1
- dvc_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- dvc_utils/main.py,sha256=_D_FVRVnBGRHb6XIeTLOOFKc0v5J9D8_O6thG8_lSmU,3863
3
- dvc_utils/named_pipes.py,sha256=GqWvsvTMmnkjk0gPM1aXBIW5dUsSkW-eblerHJ18B68,485
4
- dvc_utils-0.0.2.dist-info/LICENSE,sha256=ZS8AReay7xmQzBAHwxIuTouGXz3SKgUa2_Sz8Ip0EzQ,1070
5
- dvc_utils-0.0.2.dist-info/METADATA,sha256=92Uu2g1qLI9qpHoKvvndj5xapXnPcQN-sxhxK2PTvow,5896
6
- dvc_utils-0.0.2.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
7
- dvc_utils-0.0.2.dist-info/entry_points.txt,sha256=W9OuZ6CX8QF9ojbqLtfXFo8Q2hnJ-zlcGY4_7nO8paM,49
8
- dvc_utils-0.0.2.dist-info/top_level.txt,sha256=jT0-PJa2t_eFRE9rn-52AjdnZ8nQeEHllf2kJmaGh80,10
9
- dvc_utils-0.0.2.dist-info/RECORD,,