dvc-utils 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dvc_utils/cli.py CHANGED
@@ -1,13 +1,15 @@
1
+ import json
1
2
  import shlex
2
- from os import environ as env
3
+ from os import listdir
4
+ from os.path import isdir, join
3
5
  from typing import Tuple
4
6
 
5
7
  import click
6
8
  from click import option, argument, group
7
- from utz import process, err
8
9
  from qmdx import join_pipelines
10
+ from utz import process, err, hash_file
9
11
 
10
- from dvc_utils.path import dvc_paths, dvc_path as dvc_cache_path
12
+ from dvc_utils.path import dvc_paths, dvc_cache_path
11
13
 
12
14
 
13
15
  @group()
@@ -16,9 +18,10 @@ def cli():
16
18
 
17
19
 
18
20
  @cli.command('diff', short_help='Diff a DVC-tracked file at two commits (or one commit vs. current worktree), optionally passing both through another command first')
19
- @option('-c', '--color', is_flag=True, help='Colorize the output')
20
- @option('-r', '--refspec', default='HEAD', help='<commit 1>..<commit 2> (compare two commits) or <commit> (compare <commit> to the worktree)')
21
- @option('-s', '--shell-executable', help=f'Shell to use for executing commands; defaults to $SHELL ({env.get("SHELL")})')
21
+ @option('-c/-C', '--color/--no-color', default=None, help='Force or prevent colorized output')
22
+ @option('-r', '--refspec', help='<commit 1>..<commit 2> (compare two commits) or <commit> (compare <commit> to the worktree)')
23
+ @option('-R', '--ref', help='Shorthand for `-r <ref>^..<ref>`, i.e. inspect a specific commit (vs. its parent)')
24
+ @option('-s', '--shell-executable', help=f'Shell to use for executing commands; defaults to $SHELL')
22
25
  @option('-S', '--no-shell', is_flag=True, help="Don't pass `shell=True` to Python `subprocess`es")
23
26
  @option('-U', '--unified', type=int, help='Number of lines of context to show (passes through to `diff`)')
24
27
  @option('-v', '--verbose', is_flag=True, help="Log intermediate commands to stderr")
@@ -26,8 +29,9 @@ def cli():
26
29
  @option('-x', '--exec-cmd', 'exec_cmds', multiple=True, help='Command(s) to execute before diffing; alternate syntax to passing commands as positional arguments')
27
30
  @argument('args', metavar='[exec_cmd...] <path>', nargs=-1)
28
31
  def dvc_utils_diff(
29
- color: bool,
32
+ color: bool | None,
30
33
  refspec: str | None,
34
+ ref: str | None,
31
35
  shell_executable: str | None,
32
36
  no_shell: bool,
33
37
  unified: int | None,
@@ -53,6 +57,13 @@ def dvc_utils_diff(
53
57
 
54
58
  path, dvc_path = dvc_paths(path)
55
59
 
60
+ if refspec and ref:
61
+ raise ValueError("Specify -r/--refspec xor -R/--ref")
62
+ if ref:
63
+ refspec = f'{ref}^..{ref}'
64
+ elif not refspec:
65
+ refspec = 'HEAD'
66
+
56
67
  pcs = refspec.split('..', 1)
57
68
  if len(pcs) == 1:
58
69
  before = pcs[0]
@@ -64,31 +75,53 @@ def dvc_utils_diff(
64
75
 
65
76
  log = err if verbose else False
66
77
  path1 = dvc_cache_path(before, dvc_path, log=log)
67
- path2 = path if after is None else dvc_cache_path(after, dvc_path, log=log)
68
-
69
- diff_args = [
70
- *(['-w'] if ignore_whitespace else []),
71
- *(['-U', str(unified)] if unified is not None else []),
72
- *(['--color=always'] if color else []),
73
- ]
74
- if cmds:
75
- cmd, *sub_cmds = cmds
76
- cmds1 = [ f'{cmd} {path1}', *sub_cmds ]
77
- cmds2 = [ f'{cmd} {path2}', *sub_cmds ]
78
- if not shell:
79
- cmds1 = [ shlex.split(cmd) for cmd in cmds1 ]
80
- cmds2 = [ shlex.split(cmd) for cmd in cmds2 ]
81
-
82
- join_pipelines(
83
- base_cmd=['diff', *diff_args],
84
- cmds1=cmds1,
85
- cmds2=cmds2,
86
- verbose=verbose,
87
- shell=shell,
88
- shell_executable=shell_executable,
89
- )
78
+ path2 = (path if after is None else dvc_cache_path(after, dvc_path, log=log))
79
+
80
+ if isdir(path):
81
+ dir_json1 = dir_json2 = {}
82
+ if path1:
83
+ with open(path1, 'r') as f:
84
+ obj = json.load(f)
85
+ dir_json1 = { e["relpath"]: e["md5"] for e in obj }
86
+ if path2:
87
+ if path2 == path and after is None:
88
+ dir_json2 = {}
89
+ for file in listdir(path2):
90
+ md5 = hash_file(join(path2, file), hash_name='md5')
91
+ dir_json2[file] = md5
92
+ else:
93
+ with open(path2, 'r') as f:
94
+ dir_json2 = { obj["relpath"]: obj["md5"] for obj in json.load(f) }
95
+ for relpath in sorted(set(dir_json1) | set(dir_json2)):
96
+ md5_1 = dir_json1.get(relpath)
97
+ md5_2 = dir_json2.get(relpath)
98
+ if md5_1 != md5_2:
99
+ print(f'{relpath}: {md5_1} -> {md5_2}')
90
100
  else:
91
- process.run('diff', *diff_args, path1, path2, log=log)
101
+ diff_args = [
102
+ *(['-w'] if ignore_whitespace else []),
103
+ *(['-U', str(unified)] if unified is not None else []),
104
+ *(['--color=always'] if color is True else ['--color=never'] if color is False else []),
105
+ ]
106
+ if cmds:
107
+ cmd, *sub_cmds = cmds
108
+ cmds1 = [ 'cat /dev/null' ] if path1 is None else [ f'{cmd} {path1 or "/dev/null"}', *sub_cmds ]
109
+ cmds2 = [ 'cat /dev/null' ] if path2 is None else [ f'{cmd} {path2 or "/dev/null"}', *sub_cmds ]
110
+ if not shell:
111
+ cmds1 = [ shlex.split(cmd) for cmd in cmds1 ]
112
+ cmds2 = [ shlex.split(cmd) for cmd in cmds2 ]
113
+
114
+ join_pipelines(
115
+ base_cmd=['diff', *diff_args],
116
+ cmds1=cmds1,
117
+ cmds2=cmds2,
118
+ verbose=verbose,
119
+ shell=shell,
120
+ executable=shell_executable,
121
+ )
122
+ else:
123
+ res = process.run('diff', *diff_args, path1 or '/dev/null', path2 or '/dev/null', log=log, check=False)
124
+ exit(res.returncode)
92
125
 
93
126
 
94
127
  if __name__ == '__main__':
dvc_utils/path.py CHANGED
@@ -1,13 +1,19 @@
1
+ from __future__ import annotations
2
+
3
+ import json
1
4
  from functools import cache
2
5
  from os import environ as env, getcwd
3
- from os.path import join, relpath
4
- from typing import Optional, Tuple
6
+ from os.path import join, relpath, dirname, basename, sep
7
+ from subprocess import DEVNULL
8
+ from typing import Tuple
5
9
 
6
10
  import yaml
7
11
  from utz import process, err, singleton
8
12
 
9
13
 
10
14
  def dvc_paths(path: str) -> Tuple[str, str]:
15
+ if path.endswith(sep):
16
+ path = path[:-len(sep)]
11
17
  if path.endswith('.dvc'):
12
18
  dvc_path = path
13
19
  path = dvc_path[:-len('.dvc')]
@@ -35,19 +41,47 @@ def dvc_cache_dir(log: bool = False) -> str:
35
41
  return process.line('dvc', 'cache', 'dir', log=log)
36
42
 
37
43
 
38
- def dvc_md5(git_ref: str, dvc_path: str, log: bool = False) -> str:
44
+ def dvc_md5(
45
+ git_ref: str,
46
+ dvc_path: str,
47
+ log: bool = False,
48
+ ) -> str | None:
39
49
  dir_path = get_dir_path()
40
- dir_path = '' if dir_path == '.' else f'{dir_path}/'
41
- dvc_spec = process.output('git', 'show', f'{git_ref}:{dir_path}{dvc_path}', log=err if log else None)
50
+ dir_path = '' if dir_path == '.' else f'{dir_path}{sep}'
51
+ dvc_path = f"{dir_path}{dvc_path}"
52
+ dvc_spec = process.output('git', 'show', f'{git_ref}:{dvc_path}', log=err if log else None, err_ok=True, stderr=DEVNULL)
53
+ if dvc_spec is None:
54
+ cur_dir = dirname(dvc_path)
55
+ relpath = basename(dvc_path)
56
+ if relpath.endswith(".dvc"):
57
+ relpath = relpath[:-len(".dvc")]
58
+ while cur_dir and cur_dir != '.':
59
+ dir_cache_path = dvc_cache_path(ref=git_ref, dvc_path=f"{cur_dir}.dvc", log=log)
60
+ if dir_cache_path:
61
+ with open(dir_cache_path, 'r') as f:
62
+ dir_entries = json.load(f)
63
+ md5s = [ e["md5"] for e in dir_entries if e["relpath"] == relpath ]
64
+ if len(md5s) == 1:
65
+ return md5s[0]
66
+ else:
67
+ raise RuntimeError(f"{relpath=} not found in DVC-tracked dir {cur_dir}")
68
+ relpath = join(basename(cur_dir), relpath)
69
+ cur_dir = dirname(cur_dir)
70
+ return None
42
71
  dvc_obj = yaml.safe_load(dvc_spec)
43
72
  out = singleton(dvc_obj['outs'], dedupe=False)
44
73
  md5 = out['md5']
45
74
  return md5
46
75
 
47
76
 
48
- def dvc_path(ref: str, dvc_path: Optional[str] = None, log: bool = False) -> str:
77
+ def dvc_path(
78
+ ref: str,
79
+ dvc_path: str | None = None,
80
+ log: bool = False,
81
+ ) -> str | None:
49
82
  if dvc_path and not dvc_path.endswith('.dvc'):
50
83
  dvc_path += '.dvc'
84
+
51
85
  if dvc_path:
52
86
  md5 = dvc_md5(ref, dvc_path, log=log)
53
87
  elif ':' in ref:
@@ -55,6 +89,13 @@ def dvc_path(ref: str, dvc_path: Optional[str] = None, log: bool = False) -> str
55
89
  md5 = dvc_md5(git_ref, dvc_path, log=log)
56
90
  else:
57
91
  md5 = ref
58
- dirname = md5[:2]
59
- basename = md5[2:]
60
- return join(dvc_cache_dir(log=log), 'files', 'md5', dirname, basename)
92
+
93
+ if md5 is None:
94
+ return None
95
+ else:
96
+ dirname = md5[:2]
97
+ basename = md5[2:]
98
+ return join(dvc_cache_dir(log=log), 'files', 'md5', dirname, basename)
99
+
100
+
101
+ dvc_cache_path = dvc_path
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dvc-utils
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: CLI for diffing DVC files at two commits (or one commit vs. current worktree), optionally passing both through another command first
5
5
  Home-page: https://github.com/runsascoded/dvc-utils
6
6
  Author: Ryan Williams
@@ -10,8 +10,8 @@ Description-Content-Type: text/markdown
10
10
  License-File: LICENSE
11
11
  Requires-Dist: click
12
12
  Requires-Dist: pyyaml
13
- Requires-Dist: qmdx
14
- Requires-Dist: utz>=0.11.3
13
+ Requires-Dist: qmdx>=0.0.5
14
+ Requires-Dist: utz>=0.13.0
15
15
 
16
16
  # dvc-utils
17
17
  Diff [DVC] files, optionally piping through other commands first.
@@ -70,25 +70,146 @@ dvc-diff --help
70
70
  # optional) at HEAD (last committed value) vs. the current worktree content.
71
71
  #
72
72
  # Options:
73
- # -c, --color Colorize the output
74
- # -r, --refspec TEXT <commit 1>..<commit 2> (compare two commits) or
75
- # <commit> (compare <commit> to the worktree)
76
- # -s, --shell-executable TEXT Shell to use for executing commands; defaults
77
- # to $SHELL (/bin/bash)
78
- # -S, --no-shell Don't pass `shell=True` to Python
79
- # `subprocess`es
80
- # -U, --unified INTEGER Number of lines of context to show (passes
81
- # through to `diff`)
82
- # -v, --verbose Log intermediate commands to stderr
83
- # -w, --ignore-whitespace Ignore whitespace differences (pass `-w` to
84
- # `diff`)
85
- # -x, --exec-cmd TEXT Command(s) to execute before diffing; alternate
86
- # syntax to passing commands as positional
87
- # arguments
88
- # --help Show this message and exit.
73
+ # -c, --color / -C, --no-color Force or prevent colorized output
74
+ # -r, --refspec TEXT <commit 1>..<commit 2> (compare two commits)
75
+ # or <commit> (compare <commit> to the worktree)
76
+ # -R, --ref TEXT Shorthand for `-r <ref>^..<ref>`, i.e. inspect
77
+ # a specific commit (vs. its parent)
78
+ # -s, --shell-executable TEXT Shell to use for executing commands; defaults
79
+ # to $SHELL
80
+ # -S, --no-shell Don't pass `shell=True` to Python
81
+ # `subprocess`es
82
+ # -U, --unified INTEGER Number of lines of context to show (passes
83
+ # through to `diff`)
84
+ # -v, --verbose Log intermediate commands to stderr
85
+ # -w, --ignore-whitespace Ignore whitespace differences (pass `-w` to
86
+ # `diff`)
87
+ # -x, --exec-cmd TEXT Command(s) to execute before diffing;
88
+ # alternate syntax to passing commands as
89
+ # positional arguments
90
+ # --help Show this message and exit.
89
91
  ```
90
92
 
91
93
  ## Examples <a id="examples"></a>
94
+ These examples are verified with [`mdcmd`] and `$BMDF_WORKDIR=test/data`
95
+
96
+ ([`test/data`] is a clone of [ryan-williams/dvc-helpers@test], which contains simple DVC-tracked files used for testing [`git-diff-dvc.sh`])
97
+
98
+ [`8ec2060`] added a DVC-tracked text file, `test.txt`:
99
+
100
+ <!-- `bmdf -- dvc-diff -R 8ec2060 test.txt` -->
101
+ ```bash
102
+ dvc-diff -R 8ec2060 test.txt
103
+ # 0a1,10
104
+ # > 1
105
+ # > 2
106
+ # > 3
107
+ # > 4
108
+ # > 5
109
+ # > 6
110
+ # > 7
111
+ # > 8
112
+ # > 9
113
+ # > 10
114
+ ```
115
+
116
+ [`0455b50`] appended some lines to `test.txt`:
117
+
118
+ <!-- `bmdf -- dvc-diff -R 0455b50 test.txt` -->
119
+ ```bash
120
+ dvc-diff -R 0455b50 test.txt
121
+ # 10a11,15
122
+ # > 11
123
+ # > 12
124
+ # > 13
125
+ # > 14
126
+ # > 15
127
+ ```
128
+
129
+ [`f92c1d2`] added `test.parquet`:
130
+
131
+ <!-- `bmdf -- dvc-diff -R f92c1d2 pqa test.parquet` -->
132
+ ```bash
133
+ dvc-diff -R f92c1d2 pqa test.parquet
134
+ # 0a1,27
135
+ # > MD5: 4379600b26647a50dfcd0daa824e8219
136
+ # > 1635 bytes
137
+ # > 5 rows
138
+ # > message schema {
139
+ # > OPTIONAL INT64 num;
140
+ # > OPTIONAL BYTE_ARRAY str (STRING);
141
+ # > }
142
+ # > {
143
+ # > "num": 111,
144
+ # > "str": "aaa"
145
+ # > }
146
+ # > {
147
+ # > "num": 222,
148
+ # > "str": "bbb"
149
+ # > }
150
+ # > {
151
+ # > "num": 333,
152
+ # > "str": "ccc"
153
+ # > }
154
+ # > {
155
+ # > "num": 444,
156
+ # > "str": "ddd"
157
+ # > }
158
+ # > {
159
+ # > "num": 555,
160
+ # > "str": "eee"
161
+ # > }
162
+ ```
163
+
164
+ [`f29e52a`] updated `test.parquet`:
165
+
166
+ <!-- `bmdf -- dvc-diff -R f29e52a pqa test.parquet` -->
167
+ ```bash
168
+ dvc-diff -R f29e52a pqa test.parquet
169
+ # 1,3c1,3
170
+ # < MD5: 4379600b26647a50dfcd0daa824e8219
171
+ # < 1635 bytes
172
+ # < 5 rows
173
+ # ---
174
+ # > MD5: be082c87786f3364ca9efec061a3cc21
175
+ # > 1622 bytes
176
+ # > 8 rows
177
+ # 5c5
178
+ # < OPTIONAL INT64 num;
179
+ # ---
180
+ # > OPTIONAL INT32 num;
181
+ # 26a27,38
182
+ # > }
183
+ # > {
184
+ # > "num": 666,
185
+ # > "str": "fff"
186
+ # > }
187
+ # > {
188
+ # > "num": 777,
189
+ # > "str": "ggg"
190
+ # > }
191
+ # > {
192
+ # > "num": 888,
193
+ # > "str": "hhh"
194
+ ```
195
+
196
+ [`3257258`] added a DVC-tracked directory `data/`, including `test.{txt,parquet}`), and removed the top-level `test.{txt,parquet}`.
197
+
198
+ <!-- `bmdf -- dvc-diff -R 3257258 data` -->
199
+ ```bash
200
+ dvc-diff -R 3257258 data
201
+ # test.parquet: None -> c07bba3fae2b64207aa92f422506e4a2
202
+ # test.txt: None -> e20b902b49a98b1a05ed62804c757f94
203
+ ```
204
+
205
+ [`ae8638a`] changed values in `data/test.parquet`, and added rows to `data/test.txt`:
206
+
207
+ <!-- `bmdf -- dvc-diff -R ae8638a data` -->
208
+ ```bash
209
+ dvc-diff -R ae8638a data
210
+ # test.parquet: c07bba3fae2b64207aa92f422506e4a2 -> f46dd86f608b1dc00993056c9fc55e6e
211
+ # test.txt: e20b902b49a98b1a05ed62804c757f94 -> 9306ec0709cc72558045559ada26573b
212
+ ```
92
213
 
93
214
  ### Parquet <a id="parquet-diff"></a>
94
215
  See sample commands and output below for inspecting changes to [a DVC-tracked Parquet file][commit path] in [a given commit][commit].
@@ -338,3 +459,15 @@ This helped me see that the data update in question (`c0..c1`) dropped some fiel
338
459
  [`kcr`]: https://github.com/ryan-williams/arg-helpers/blob/a8c60809f8878fa38b3c03614778fcf29132538e/.arg-rc#L118
339
460
  [`snc`]: https://github.com/ryan-williams/case-helpers/blob/c40a62a9656f0d52d68fb3a108ae6bb3eed3c7bd/.case-rc#L9
340
461
  [`sdf`]: https://github.com/ryan-williams/arg-helpers/blob/a8c60809f8878fa38b3c03614778fcf29132538e/.arg-rc#L138
462
+
463
+ [`mdcmd`]: https://github.com/runsascoded/bash-markdown-fence?tab=readme-ov-file#bmdf
464
+ [`test/data`]: test/data
465
+ [ryan-williams/dvc-helpers@test]: https://github.com/ryan-williams/dvc-helpers/tree/test
466
+ [`git-diff-dvc.sh`]: https://github.com/ryan-williams/dvc-helpers/blob/main/git-diff-dvc.sh
467
+
468
+ [`8ec2060`]: https://github.com/ryan-williams/dvc-helpers/commit/8ec2060
469
+ [`0455b50`]: https://github.com/ryan-williams/dvc-helpers/commit/0455b50
470
+ [`f92c1d2`]: https://github.com/ryan-williams/dvc-helpers/commit/f92c1d2
471
+ [`f29e52a`]: https://github.com/ryan-williams/dvc-helpers/commit/f29e52a
472
+ [`3257258`]: https://github.com/ryan-williams/dvc-helpers/commit/3257258
473
+ [`ae8638a`]: https://github.com/ryan-williams/dvc-helpers/commit/ae8638a
@@ -0,0 +1,9 @@
1
+ dvc_utils/__init__.py,sha256=mP-p1Sl2JMMShM_hRhu86pFNfIq_8E_feh1CN47LWcs,86
2
+ dvc_utils/cli.py,sha256=d27Q8K77ZxwWRtHJqn_70MhfOYF8ybsCEamG7wrAyjU,5079
3
+ dvc_utils/path.py,sha256=eEP-r6o33BZsEz6JNllzEbU8ficW9slAFMJ2l02IRrY,2903
4
+ dvc_utils-0.2.0.dist-info/LICENSE,sha256=ZS8AReay7xmQzBAHwxIuTouGXz3SKgUa2_Sz8Ip0EzQ,1070
5
+ dvc_utils-0.2.0.dist-info/METADATA,sha256=ppHbZlkyxL015AhfAWcx9H6UGWLNt2078FjdMIQgCok,14977
6
+ dvc_utils-0.2.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
7
+ dvc_utils-0.2.0.dist-info/entry_points.txt,sha256=luxCQr8OS-jMSyyDhB9KDQhUbP8UH6UMcy-vkfXX7Gg,88
8
+ dvc_utils-0.2.0.dist-info/top_level.txt,sha256=jT0-PJa2t_eFRE9rn-52AjdnZ8nQeEHllf2kJmaGh80,10
9
+ dvc_utils-0.2.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.45.0)
2
+ Generator: bdist_wheel (0.45.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,9 +0,0 @@
1
- dvc_utils/__init__.py,sha256=mP-p1Sl2JMMShM_hRhu86pFNfIq_8E_feh1CN47LWcs,86
2
- dvc_utils/cli.py,sha256=CcOa5Qmkry3PASz8nu_XqcBB6GnGmcf0e8zsuqSDsCM,3534
3
- dvc_utils/path.py,sha256=PoAbeaqRPDksY2hcUeF8xZ6Nr6hLIZprey3VNT4V5bc,1727
4
- dvc_utils-0.1.0.dist-info/LICENSE,sha256=ZS8AReay7xmQzBAHwxIuTouGXz3SKgUa2_Sz8Ip0EzQ,1070
5
- dvc_utils-0.1.0.dist-info/METADATA,sha256=F3GUp8NMg0oEqsqtI_jECiWkoO7-sGYb8KJT4upGqNM,11722
6
- dvc_utils-0.1.0.dist-info/WHEEL,sha256=bFJAMchF8aTQGUgMZzHJyDDMPTO3ToJ7x23SLJa1SVo,92
7
- dvc_utils-0.1.0.dist-info/entry_points.txt,sha256=luxCQr8OS-jMSyyDhB9KDQhUbP8UH6UMcy-vkfXX7Gg,88
8
- dvc_utils-0.1.0.dist-info/top_level.txt,sha256=jT0-PJa2t_eFRE9rn-52AjdnZ8nQeEHllf2kJmaGh80,10
9
- dvc_utils-0.1.0.dist-info/RECORD,,