dvc-utils 0.0.8__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dvc_utils/__init__.py CHANGED
@@ -0,0 +1,2 @@
1
+ from . import cli, path
2
+ from .path import dvc_cache_dir, dvc_md5, dvc_paths, dvc_path
@@ -1,13 +1,13 @@
1
- from functools import cache
2
- from os import environ as env, getcwd
3
- from os.path import join, relpath
4
1
  import shlex
5
- from typing import Optional, Tuple
2
+ from os import environ as env
3
+ from typing import Tuple
6
4
 
7
- from click import option, argument, group
8
5
  import click
9
- import yaml
10
- from utz import diff_cmds, process, err, singleton
6
+ from click import option, argument, group
7
+ from utz import process, err
8
+ from qmdx import join_pipelines
9
+
10
+ from dvc_utils.path import dvc_paths, dvc_path as dvc_cache_path
11
11
 
12
12
 
13
13
  @group()
@@ -15,57 +15,6 @@ def cli():
15
15
  pass
16
16
 
17
17
 
18
- def dvc_paths(path: str) -> Tuple[str, str]:
19
- if path.endswith('.dvc'):
20
- dvc_path = path
21
- path = dvc_path[:-len('.dvc')]
22
- else:
23
- dvc_path = f'{path}.dvc'
24
- return path, dvc_path
25
-
26
-
27
- @cache
28
- def get_git_root() -> str:
29
- return process.line('git', 'rev-parse', '--show-toplevel', log=False)
30
-
31
-
32
- @cache
33
- def get_dir_path() -> str:
34
- return relpath(getcwd(), get_git_root())
35
-
36
-
37
- @cache
38
- def dvc_cache_dir(log: bool = False) -> str:
39
- dvc_cache_relpath = env.get('DVC_UTILS_CACHE_DIR')
40
- if dvc_cache_relpath:
41
- return join(get_git_root(), dvc_cache_relpath)
42
- else:
43
- return process.line('dvc', 'cache', 'dir', log=log)
44
-
45
-
46
- def dvc_md5(git_ref: str, dvc_path: str, log: bool = False) -> str:
47
- dir_path = get_dir_path()
48
- dir_path = '' if dir_path == '.' else f'{dir_path}/'
49
- dvc_spec = process.output('git', 'show', f'{git_ref}:{dir_path}{dvc_path}', log=err if log else None)
50
- dvc_obj = yaml.safe_load(dvc_spec)
51
- out = singleton(dvc_obj['outs'], dedupe=False)
52
- md5 = out['md5']
53
- return md5
54
-
55
-
56
- def dvc_cache_path(ref: str, dvc_path: Optional[str] = None, log: bool = False) -> str:
57
- if dvc_path:
58
- md5 = dvc_md5(ref, dvc_path, log=log)
59
- elif ':' in ref:
60
- git_ref, dvc_path = ref.split(':', 1)
61
- md5 = dvc_md5(git_ref, dvc_path, log=log)
62
- else:
63
- md5 = ref
64
- dirname = md5[:2]
65
- basename = md5[2:]
66
- return join(dvc_cache_dir(log=log), 'files', 'md5', dirname, basename)
67
-
68
-
69
18
  @cli.command('diff', short_help='Diff a DVC-tracked file at two commits (or one commit vs. current worktree), optionally passing both through another command first')
70
19
  @option('-c', '--color', is_flag=True, help='Colorize the output')
71
20
  @option('-r', '--refspec', default='HEAD', help='<commit 1>..<commit 2> (compare two commits) or <commit> (compare <commit> to the worktree)')
@@ -114,39 +63,32 @@ def dvc_utils_diff(
114
63
  raise ValueError(f"Invalid refspec: {refspec}")
115
64
 
116
65
  log = err if verbose else False
117
- before_path = dvc_cache_path(before, dvc_path, log=log)
118
- after_path = path if after is None else dvc_cache_path(after, dvc_path, log=log)
119
-
66
+ path1 = dvc_cache_path(before, dvc_path, log=log)
67
+ path2 = path if after is None else dvc_cache_path(after, dvc_path, log=log)
68
+
69
+ diff_args = [
70
+ *(['-w'] if ignore_whitespace else []),
71
+ *(['-U', str(unified)] if unified is not None else []),
72
+ *(['--color=always'] if color else []),
73
+ ]
120
74
  if cmds:
121
75
  cmd, *sub_cmds = cmds
76
+ cmds1 = [ f'{cmd} {path1}', *sub_cmds ]
77
+ cmds2 = [ f'{cmd} {path2}', *sub_cmds ]
122
78
  if not shell:
123
- sub_cmds = [ shlex.split(c) for c in sub_cmds ]
124
- before_cmds = [
125
- shlex.split(f'{cmd} {before_path}'),
126
- *sub_cmds,
127
- ]
128
- after_cmds = [
129
- shlex.split(f'{cmd} {after_path}'),
130
- *sub_cmds,
131
- ]
132
- shell_kwargs = {}
133
- else:
134
- before_cmds = [ f'{cmd} {before_path}', *sub_cmds ]
135
- after_cmds = [ f'{cmd} {after_path}', *sub_cmds ]
136
- shell_kwargs = dict(shell=shell)
137
-
138
- diff_cmds(
139
- before_cmds,
140
- after_cmds,
79
+ cmds1 = [ shlex.split(cmd) for cmd in cmds1 ]
80
+ cmds2 = [ shlex.split(cmd) for cmd in cmds2 ]
81
+
82
+ join_pipelines(
83
+ base_cmd=['diff', *diff_args],
84
+ cmds1=cmds1,
85
+ cmds2=cmds2,
141
86
  verbose=verbose,
142
- color=color,
143
- unified=unified,
144
- ignore_whitespace=ignore_whitespace,
87
+ shell=shell,
145
88
  shell_executable=shell_executable,
146
- **shell_kwargs,
147
89
  )
148
90
  else:
149
- process.run('diff', before_path, after_path, log=log)
91
+ process.run('diff', *diff_args, path1, path2, log=log)
150
92
 
151
93
 
152
94
  if __name__ == '__main__':
dvc_utils/path.py ADDED
@@ -0,0 +1,60 @@
1
+ from functools import cache
2
+ from os import environ as env, getcwd
3
+ from os.path import join, relpath
4
+ from typing import Optional, Tuple
5
+
6
+ import yaml
7
+ from utz import process, err, singleton
8
+
9
+
10
+ def dvc_paths(path: str) -> Tuple[str, str]:
11
+ if path.endswith('.dvc'):
12
+ dvc_path = path
13
+ path = dvc_path[:-len('.dvc')]
14
+ else:
15
+ dvc_path = f'{path}.dvc'
16
+ return path, dvc_path
17
+
18
+
19
+ @cache
20
+ def get_git_root() -> str:
21
+ return process.line('git', 'rev-parse', '--show-toplevel', log=False)
22
+
23
+
24
+ @cache
25
+ def get_dir_path() -> str:
26
+ return relpath(getcwd(), get_git_root())
27
+
28
+
29
+ @cache
30
+ def dvc_cache_dir(log: bool = False) -> str:
31
+ dvc_cache_relpath = env.get('DVC_UTILS_CACHE_DIR')
32
+ if dvc_cache_relpath:
33
+ return join(get_git_root(), dvc_cache_relpath)
34
+ else:
35
+ return process.line('dvc', 'cache', 'dir', log=log)
36
+
37
+
38
+ def dvc_md5(git_ref: str, dvc_path: str, log: bool = False) -> str:
39
+ dir_path = get_dir_path()
40
+ dir_path = '' if dir_path == '.' else f'{dir_path}/'
41
+ dvc_spec = process.output('git', 'show', f'{git_ref}:{dir_path}{dvc_path}', log=err if log else None)
42
+ dvc_obj = yaml.safe_load(dvc_spec)
43
+ out = singleton(dvc_obj['outs'], dedupe=False)
44
+ md5 = out['md5']
45
+ return md5
46
+
47
+
48
+ def dvc_path(ref: str, dvc_path: Optional[str] = None, log: bool = False) -> str:
49
+ if dvc_path and not dvc_path.endswith('.dvc'):
50
+ dvc_path += '.dvc'
51
+ if dvc_path:
52
+ md5 = dvc_md5(ref, dvc_path, log=log)
53
+ elif ':' in ref:
54
+ git_ref, dvc_path = ref.split(':', 1)
55
+ md5 = dvc_md5(git_ref, dvc_path, log=log)
56
+ else:
57
+ md5 = ref
58
+ dirname = md5[:2]
59
+ basename = md5[2:]
60
+ return join(dvc_cache_dir(log=log), 'files', 'md5', dirname, basename)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dvc-utils
3
- Version: 0.0.8
3
+ Version: 0.1.0
4
4
  Summary: CLI for diffing DVC files at two commits (or one commit vs. current worktree), optionally passing both through another command first
5
5
  Home-page: https://github.com/runsascoded/dvc-utils
6
6
  Author: Ryan Williams
@@ -8,19 +8,26 @@ Author-email: ryan@runsascoded.com
8
8
  License: MIT
9
9
  Description-Content-Type: text/markdown
10
10
  License-File: LICENSE
11
+ Requires-Dist: click
12
+ Requires-Dist: pyyaml
13
+ Requires-Dist: qmdx
14
+ Requires-Dist: utz>=0.11.3
11
15
 
12
16
  # dvc-utils
13
- CLI for diffing [DVC] files, optionally passing both through another command first
17
+ Diff [DVC] files, optionally piping through other commands first.
18
+
19
+ [![dvc-utils on PyPI](https://img.shields.io/pypi/v/dvc-utils?label=dvc-utils)][PyPI]
14
20
 
15
21
  <!-- toc -->
16
22
  - [Installation](#installation)
17
23
  - [Usage](#usage)
18
24
  - [`dvc-diff`](#dvc-diff)
19
25
  - [Examples](#examples)
20
- - [Parquet file](#parquet-diff)
26
+ - [Parquet](#parquet-diff)
21
27
  - [Schema diff](#parquet-schema-diff)
22
28
  - [Row diff](#parquet-row-diff)
23
29
  - [Row count diff](#parquet-row-count-diff)
30
+ - [GZipped CSVs](#csv-gz)
24
31
  <!-- /toc -->
25
32
 
26
33
  ## Installation <a id="installation"></a>
@@ -83,7 +90,7 @@ dvc-diff --help
83
90
 
84
91
  ## Examples <a id="examples"></a>
85
92
 
86
- ### Parquet file <a id="parquet-diff"></a>
93
+ ### Parquet <a id="parquet-diff"></a>
87
94
  See sample commands and output below for inspecting changes to [a DVC-tracked Parquet file][commit path] in [a given commit][commit].
88
95
 
89
96
  Setup:
@@ -222,11 +229,112 @@ dvc-diff -r $commit^..$commit parquet_row_count $path
222
229
 
223
230
  This time we get no output; [the given `$commit`][commit] didn't change the row count in the DVC-tracked Parquet file [`$path`][commit path].
224
231
 
232
+ ### GZipped CSVs <a id="csv-gz"></a>
233
+
234
+ Here's a "one-liner" I used in [ctbk.dev][ctbk.dev gh] to normalize and compare the headers of `.csv.gz.dvc` files between two commits:
235
+
236
+ ```bash
237
+ # Save some `sed` substitution commands to file `seds`:
238
+ cat <<EOF >seds
239
+ s/station_//
240
+ s/latitude/lat/
241
+ s/longitude/lng/
242
+ s/starttime/started_at/
243
+ s/stoptime/ended_at/
244
+ s/usertype/member_casual/
245
+ EOF
246
+ # Commit range to diff; branch `c0` is an initial commit of some `.csv.gz` files, branch `c1` is a later commit after some updates
247
+ r=c0..c1
248
+ # List files changed in commit range `$r`, in the `s3/ctbk/csvs/` dir, piping through several post-processing commands:
249
+ gdno $r s3/ctbk/csvs/ | \
250
+ pel "ddcr $r guc h1 spc kq kcr snc 'sdf seds' sort"
251
+ ```
252
+
253
+ <details>
254
+ <summary>
255
+ Explanation of aliases
256
+ </summary>
257
+
258
+ - [`gdno`] (`git diff --name-only`): list files changed in the given commit range and directory
259
+ - [`pel`]: [`parallel`] alias that prepends an `echo {}` to the command
260
+ - [`ddcr`] (`dvc-diff -cr`): colorized `diff` output, revision range `$r`
261
+ - [`guc`] (`gunzip -c`): uncompress the `.csv.gz` files
262
+ - [`h1`] (`head -n1`): only examine each file's header line
263
+ - [`spc`] (`tr , $'\n'`): **sp**lit the header line by **c**ommas (so each column name will be on one line, for easier `diff`ing below)
264
+ - [`kq`] (`tr -d '"'`): **k**ill **q**uote characters (in this case, header-column name quoting changed, but I don't care about that)
265
+ - [`kcr`] (`tr -d '\r'`): **k**ill **c**arriage **r**eturns (line endings also changed)
266
+ - [`snc`] (`sed -f 'snake_case.sed'`): snake-case column names
267
+ - [`sdf`] (`sed -f`): execute the `sed` substitution commands defined in the `seds` file above
268
+ - `sort`: sort the column names alphabetically (to identify missing or added columns, ignore rearrangements)
269
+
270
+ Note:
271
+ - Most of these are exported Bash functions, allowing them to be used inside the [`parallel`] command.
272
+ - I was able to build this pipeline iteratively, adding steps to normalize out the bits I didn't care about (and accumulating the `seds` commands).
273
+ </details>
274
+
275
+ Example output:
276
+ ```diff
277
+
278
+ s3/ctbk/csvs/201910-citibike-tripdata.csv.gz.dvc:
279
+ s3/ctbk/csvs/201911-citibike-tripdata.csv.gz.dvc:
280
+ s3/ctbk/csvs/201912-citibike-tripdata.csv.gz.dvc:
281
+ s3/ctbk/csvs/202001-citibike-tripdata.csv.gz.dvc:
282
+ 1,2d0
283
+ < bikeid
284
+ < birth_year
285
+ 8d5
286
+ < gender
287
+ 9a7,8
288
+ > ride_id
289
+ > rideable_type
290
+ 15d13
291
+ < tripduration
292
+ s3/ctbk/csvs/202002-citibike-tripdata.csv.gz.dvc:
293
+ 1,2d0
294
+ < bikeid
295
+ < birth_year
296
+ 8d5
297
+ < gender
298
+ 9a7,8
299
+ > ride_id
300
+ > rideable_type
301
+ 15d13
302
+ < tripduration
303
+ s3/ctbk/csvs/202003-citibike-tripdata.csv.gz.dvc:
304
+ 1,2d0
305
+ < bikeid
306
+ < birth_year
307
+ 8d5
308
+ < gender
309
+ 9a7,8
310
+ > ride_id
311
+ > rideable_type
312
+ 15d13
313
+ < tripduration
314
+
315
+ ```
316
+
317
+ This helped me see that the data update in question (`c0..c1`) dropped some fields (`bikeid`, `birth_year`, `gender`, `tripduration`) and added others (`ride_id`, `rideable_type`), for `202001` and later.
318
+
225
319
  [DVC]: https://dvc.org/
320
+ [PyPI]: https://pypi.org/project/dvc-utils/
226
321
  [`parquet2json`]: https://github.com/jupiter/parquet2json
227
322
  [hudcostreets/nj-crashes]: https://github.com/hudcostreets/nj-crashes
228
323
  [Parquet]: https://parquet.apache.org/
229
324
  [commit]: https://github.com/hudcostreets/nj-crashes/commit/c8ae28e64f4917895d84074913f48e0a7afbc3d7
230
325
  [commit path]: https://github.com/hudcostreets/nj-crashes/commit/c8ae28e64f4917895d84074913f48e0a7afbc3d7#diff-7f812dce61e0996354f4af414203e0933ccdfe9613cb406c40c1c41a14b9769c
231
326
  [hudcostreets/nj-crashes]: https://github.com/hudcostreets/nj-crashes
327
+ [ctbk.dev gh]: https://github.com/neighbor-ryan/ctbk.dev
232
328
  [`jq`]: https://jqlang.github.io/jq/
329
+ [`parallel`]: https://www.gnu.org/software/parallel/
330
+
331
+ [`gdno`]: https://github.com/ryan-williams/git-helpers/blob/96560df1406f41676f293becefb423895a755faf/diff/.gitconfig#L31
332
+ [`pel`]: https://github.com/ryan-williams/parallel-helpers/blob/e7ee109c4085c04036840ea78999cff73fcf9502/.parallel-rc#L6-L17
333
+ [`ddcr`]: https://github.com/ryan-williams/aws-helpers/blob/8a314f1e6b336833c772459de6b739f5c06a51a3/.dvc-rc#L84
334
+ [`guc`]: https://github.com/ryan-williams/zip-helpers/blob/c67d84fb06c0ab3609dacb68d900344d3b8e8f04/.zip-rc#L16
335
+ [`h1`]: https://github.com/ryan-williams/head-tail-helpers/blob/9715690f47ceeff6b6948b2093901f2b0830114b/.head-tail-rc#L3
336
+ [`spc`]: https://github.com/ryan-williams/col-helpers/blob/9493d003224249ee240d023f71ab03bdd4174b88/.cols-rc#L8
337
+ [`kq`]: https://github.com/ryan-williams/arg-helpers/blob/a8c60809f8878fa38b3c03614778fcf29132538e/.arg-rc#L115
338
+ [`kcr`]: https://github.com/ryan-williams/arg-helpers/blob/a8c60809f8878fa38b3c03614778fcf29132538e/.arg-rc#L118
339
+ [`snc`]: https://github.com/ryan-williams/case-helpers/blob/c40a62a9656f0d52d68fb3a108ae6bb3eed3c7bd/.case-rc#L9
340
+ [`sdf`]: https://github.com/ryan-williams/arg-helpers/blob/a8c60809f8878fa38b3c03614778fcf29132538e/.arg-rc#L138
@@ -0,0 +1,9 @@
1
+ dvc_utils/__init__.py,sha256=mP-p1Sl2JMMShM_hRhu86pFNfIq_8E_feh1CN47LWcs,86
2
+ dvc_utils/cli.py,sha256=CcOa5Qmkry3PASz8nu_XqcBB6GnGmcf0e8zsuqSDsCM,3534
3
+ dvc_utils/path.py,sha256=PoAbeaqRPDksY2hcUeF8xZ6Nr6hLIZprey3VNT4V5bc,1727
4
+ dvc_utils-0.1.0.dist-info/LICENSE,sha256=ZS8AReay7xmQzBAHwxIuTouGXz3SKgUa2_Sz8Ip0EzQ,1070
5
+ dvc_utils-0.1.0.dist-info/METADATA,sha256=F3GUp8NMg0oEqsqtI_jECiWkoO7-sGYb8KJT4upGqNM,11722
6
+ dvc_utils-0.1.0.dist-info/WHEEL,sha256=bFJAMchF8aTQGUgMZzHJyDDMPTO3ToJ7x23SLJa1SVo,92
7
+ dvc_utils-0.1.0.dist-info/entry_points.txt,sha256=luxCQr8OS-jMSyyDhB9KDQhUbP8UH6UMcy-vkfXX7Gg,88
8
+ dvc_utils-0.1.0.dist-info/top_level.txt,sha256=jT0-PJa2t_eFRE9rn-52AjdnZ8nQeEHllf2kJmaGh80,10
9
+ dvc_utils-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ dvc-diff = dvc_utils.cli:dvc_utils_diff
3
+ dvc-utils = dvc_utils.cli:cli
@@ -1,8 +0,0 @@
1
- dvc_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- dvc_utils/main.py,sha256=05Cv6jlKfNnmRQjXOs9dq1eBzT2gIdwnayOsiEjmVZE,5205
3
- dvc_utils-0.0.8.dist-info/LICENSE,sha256=ZS8AReay7xmQzBAHwxIuTouGXz3SKgUa2_Sz8Ip0EzQ,1070
4
- dvc_utils-0.0.8.dist-info/METADATA,sha256=EplLe4Eaqnfu1LGalvWaVOvoKdUVpTjZjxpljp4YtmU,7384
5
- dvc_utils-0.0.8.dist-info/WHEEL,sha256=bFJAMchF8aTQGUgMZzHJyDDMPTO3ToJ7x23SLJa1SVo,92
6
- dvc_utils-0.0.8.dist-info/entry_points.txt,sha256=0JqFeb29N5ZrvwmIJrg4ii32xNFOXRul-HMq-3GOIw4,90
7
- dvc_utils-0.0.8.dist-info/top_level.txt,sha256=jT0-PJa2t_eFRE9rn-52AjdnZ8nQeEHllf2kJmaGh80,10
8
- dvc_utils-0.0.8.dist-info/RECORD,,
@@ -1,3 +0,0 @@
1
- [console_scripts]
2
- dvc-diff = dvc_utils.main:dvc_utils_diff
3
- dvc-utils = dvc_utils.main:cli