dvc-utils 0.0.8__py3-none-any.whl → 0.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dvc_utils/__init__.py +2 -0
- dvc_utils/{main.py → cli.py} +6 -58
- dvc_utils/path.py +60 -0
- {dvc_utils-0.0.8.dist-info → dvc_utils-0.0.9.dist-info}/METADATA +105 -4
- dvc_utils-0.0.9.dist-info/RECORD +9 -0
- dvc_utils-0.0.9.dist-info/entry_points.txt +3 -0
- dvc_utils-0.0.8.dist-info/RECORD +0 -8
- dvc_utils-0.0.8.dist-info/entry_points.txt +0 -3
- {dvc_utils-0.0.8.dist-info → dvc_utils-0.0.9.dist-info}/LICENSE +0 -0
- {dvc_utils-0.0.8.dist-info → dvc_utils-0.0.9.dist-info}/WHEEL +0 -0
- {dvc_utils-0.0.8.dist-info → dvc_utils-0.0.9.dist-info}/top_level.txt +0 -0
dvc_utils/__init__.py
CHANGED
dvc_utils/{main.py → cli.py}
RENAMED
@@ -1,13 +1,12 @@
|
|
1
|
-
from functools import cache
|
2
|
-
from os import environ as env, getcwd
|
3
|
-
from os.path import join, relpath
|
4
1
|
import shlex
|
5
|
-
from typing import Optional, Tuple
|
2
|
+
from os import environ as env
|
3
|
+
from typing import Tuple
|
6
4
|
|
7
|
-
from click import option, argument, group
|
8
5
|
import click
|
9
|
-
import yaml
|
10
|
-
from utz import diff_cmds, process, err, singleton
|
6
|
+
from click import option, argument, group
|
7
|
+
from utz import diff_cmds, process, err
|
8
|
+
|
9
|
+
from dvc_utils.path import dvc_paths, dvc_path as dvc_cache_path
|
11
10
|
|
12
11
|
|
13
12
|
@group()
|
@@ -15,57 +14,6 @@ def cli():
|
|
15
14
|
pass
|
16
15
|
|
17
16
|
|
18
|
-
def dvc_paths(path: str) -> Tuple[str, str]:
|
19
|
-
if path.endswith('.dvc'):
|
20
|
-
dvc_path = path
|
21
|
-
path = dvc_path[:-len('.dvc')]
|
22
|
-
else:
|
23
|
-
dvc_path = f'{path}.dvc'
|
24
|
-
return path, dvc_path
|
25
|
-
|
26
|
-
|
27
|
-
@cache
|
28
|
-
def get_git_root() -> str:
|
29
|
-
return process.line('git', 'rev-parse', '--show-toplevel', log=False)
|
30
|
-
|
31
|
-
|
32
|
-
@cache
|
33
|
-
def get_dir_path() -> str:
|
34
|
-
return relpath(getcwd(), get_git_root())
|
35
|
-
|
36
|
-
|
37
|
-
@cache
|
38
|
-
def dvc_cache_dir(log: bool = False) -> str:
|
39
|
-
dvc_cache_relpath = env.get('DVC_UTILS_CACHE_DIR')
|
40
|
-
if dvc_cache_relpath:
|
41
|
-
return join(get_git_root(), dvc_cache_relpath)
|
42
|
-
else:
|
43
|
-
return process.line('dvc', 'cache', 'dir', log=log)
|
44
|
-
|
45
|
-
|
46
|
-
def dvc_md5(git_ref: str, dvc_path: str, log: bool = False) -> str:
|
47
|
-
dir_path = get_dir_path()
|
48
|
-
dir_path = '' if dir_path == '.' else f'{dir_path}/'
|
49
|
-
dvc_spec = process.output('git', 'show', f'{git_ref}:{dir_path}{dvc_path}', log=err if log else None)
|
50
|
-
dvc_obj = yaml.safe_load(dvc_spec)
|
51
|
-
out = singleton(dvc_obj['outs'], dedupe=False)
|
52
|
-
md5 = out['md5']
|
53
|
-
return md5
|
54
|
-
|
55
|
-
|
56
|
-
def dvc_cache_path(ref: str, dvc_path: Optional[str] = None, log: bool = False) -> str:
|
57
|
-
if dvc_path:
|
58
|
-
md5 = dvc_md5(ref, dvc_path, log=log)
|
59
|
-
elif ':' in ref:
|
60
|
-
git_ref, dvc_path = ref.split(':', 1)
|
61
|
-
md5 = dvc_md5(git_ref, dvc_path, log=log)
|
62
|
-
else:
|
63
|
-
md5 = ref
|
64
|
-
dirname = md5[:2]
|
65
|
-
basename = md5[2:]
|
66
|
-
return join(dvc_cache_dir(log=log), 'files', 'md5', dirname, basename)
|
67
|
-
|
68
|
-
|
69
17
|
@cli.command('diff', short_help='Diff a DVC-tracked file at two commits (or one commit vs. current worktree), optionally passing both through another command first')
|
70
18
|
@option('-c', '--color', is_flag=True, help='Colorize the output')
|
71
19
|
@option('-r', '--refspec', default='HEAD', help='<commit 1>..<commit 2> (compare two commits) or <commit> (compare <commit> to the worktree)')
|
dvc_utils/path.py
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
from functools import cache
|
2
|
+
from os import environ as env, getcwd
|
3
|
+
from os.path import join, relpath
|
4
|
+
from typing import Optional, Tuple
|
5
|
+
|
6
|
+
import yaml
|
7
|
+
from utz import process, err, singleton
|
8
|
+
|
9
|
+
|
10
|
+
def dvc_paths(path: str) -> Tuple[str, str]:
|
11
|
+
if path.endswith('.dvc'):
|
12
|
+
dvc_path = path
|
13
|
+
path = dvc_path[:-len('.dvc')]
|
14
|
+
else:
|
15
|
+
dvc_path = f'{path}.dvc'
|
16
|
+
return path, dvc_path
|
17
|
+
|
18
|
+
|
19
|
+
@cache
|
20
|
+
def get_git_root() -> str:
|
21
|
+
return process.line('git', 'rev-parse', '--show-toplevel', log=False)
|
22
|
+
|
23
|
+
|
24
|
+
@cache
|
25
|
+
def get_dir_path() -> str:
|
26
|
+
return relpath(getcwd(), get_git_root())
|
27
|
+
|
28
|
+
|
29
|
+
@cache
|
30
|
+
def dvc_cache_dir(log: bool = False) -> str:
|
31
|
+
dvc_cache_relpath = env.get('DVC_UTILS_CACHE_DIR')
|
32
|
+
if dvc_cache_relpath:
|
33
|
+
return join(get_git_root(), dvc_cache_relpath)
|
34
|
+
else:
|
35
|
+
return process.line('dvc', 'cache', 'dir', log=log)
|
36
|
+
|
37
|
+
|
38
|
+
def dvc_md5(git_ref: str, dvc_path: str, log: bool = False) -> str:
|
39
|
+
dir_path = get_dir_path()
|
40
|
+
dir_path = '' if dir_path == '.' else f'{dir_path}/'
|
41
|
+
dvc_spec = process.output('git', 'show', f'{git_ref}:{dir_path}{dvc_path}', log=err if log else None)
|
42
|
+
dvc_obj = yaml.safe_load(dvc_spec)
|
43
|
+
out = singleton(dvc_obj['outs'], dedupe=False)
|
44
|
+
md5 = out['md5']
|
45
|
+
return md5
|
46
|
+
|
47
|
+
|
48
|
+
def dvc_path(ref: str, dvc_path: Optional[str] = None, log: bool = False) -> str:
|
49
|
+
if dvc_path and not dvc_path.endswith('.dvc'):
|
50
|
+
dvc_path += '.dvc'
|
51
|
+
if dvc_path:
|
52
|
+
md5 = dvc_md5(ref, dvc_path, log=log)
|
53
|
+
elif ':' in ref:
|
54
|
+
git_ref, dvc_path = ref.split(':', 1)
|
55
|
+
md5 = dvc_md5(git_ref, dvc_path, log=log)
|
56
|
+
else:
|
57
|
+
md5 = ref
|
58
|
+
dirname = md5[:2]
|
59
|
+
basename = md5[2:]
|
60
|
+
return join(dvc_cache_dir(log=log), 'files', 'md5', dirname, basename)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: dvc-utils
|
3
|
-
Version: 0.0.8
|
3
|
+
Version: 0.0.9
|
4
4
|
Summary: CLI for diffing DVC files at two commits (or one commit vs. current worktree), optionally passing both through another command first
|
5
5
|
Home-page: https://github.com/runsascoded/dvc-utils
|
6
6
|
Author: Ryan Williams
|
@@ -10,17 +10,18 @@ Description-Content-Type: text/markdown
|
|
10
10
|
License-File: LICENSE
|
11
11
|
|
12
12
|
# dvc-utils
|
13
|
-
|
13
|
+
Diff [DVC] files, optionally piping through other commands first.
|
14
14
|
|
15
15
|
<!-- toc -->
|
16
16
|
- [Installation](#installation)
|
17
17
|
- [Usage](#usage)
|
18
18
|
- [`dvc-diff`](#dvc-diff)
|
19
19
|
- [Examples](#examples)
|
20
|
-
- [Parquet
|
20
|
+
- [Parquet](#parquet-diff)
|
21
21
|
- [Schema diff](#parquet-schema-diff)
|
22
22
|
- [Row diff](#parquet-row-diff)
|
23
23
|
- [Row count diff](#parquet-row-count-diff)
|
24
|
+
- [GZipped CSVs](#csv-gz)
|
24
25
|
<!-- /toc -->
|
25
26
|
|
26
27
|
## Installation <a id="installation"></a>
|
@@ -83,7 +84,7 @@ dvc-diff --help
|
|
83
84
|
|
84
85
|
## Examples <a id="examples"></a>
|
85
86
|
|
86
|
-
### Parquet
|
87
|
+
### Parquet <a id="parquet-diff"></a>
|
87
88
|
See sample commands and output below for inspecting changes to [a DVC-tracked Parquet file][commit path] in [a given commit][commit].
|
88
89
|
|
89
90
|
Setup:
|
@@ -222,6 +223,93 @@ dvc-diff -r $commit^..$commit parquet_row_count $path
|
|
222
223
|
|
223
224
|
This time we get no output; [the given `$commit`][commit] didn't change the row count in the DVC-tracked Parquet file [`$path`][commit path].
|
224
225
|
|
226
|
+
### GZipped CSVs <a id="csv-gz"></a>
|
227
|
+
|
228
|
+
Here's a "one-liner" I used in [ctbk.dev][ctbk.dev gh], to normalize and compare headers of `.csv.gz.dvc` files between two commits:
|
229
|
+
|
230
|
+
```bash
|
231
|
+
# Save some `sed` substitution commands to file `seds`:
|
232
|
+
cat <<EOF >seds
|
233
|
+
s/station_//
|
234
|
+
s/latitude/lat/
|
235
|
+
s/longitude/lng/
|
236
|
+
s/starttime/started_at/
|
237
|
+
s/stoptime/ended_at/
|
238
|
+
s/usertype/member_casual/
|
239
|
+
EOF
|
240
|
+
# Commit range to diff; branch `c0` is an initial commit of some `.csv.gz` files, branch `c1` is a later commit after some updates
|
241
|
+
r=c0..c1
|
242
|
+
# List files changed in commit range `$r`, in the `s3/ctbk/csvs/` dir, piping through several post-processing commands:
|
243
|
+
gdno $r s3/ctbk/csvs/ | \
|
244
|
+
pel "ddcr $r guc h1 spc kq kcr snc 'sdf seds' sort"
|
245
|
+
```
|
246
|
+
|
247
|
+
<details>
|
248
|
+
<summary>
|
249
|
+
Aliases used in the pipeline:
|
250
|
+
</summary>
|
251
|
+
|
252
|
+
- [`gdno`] (`git diff --name-only`): list files changed in the given commit range and directory
|
253
|
+
- [`pel`]: [`parallel`] alias that prepends an `echo {}` to the command
|
254
|
+
- [`ddcr`] (`dvc-diff -cr`): colorized `diff` output, revision range `$r`
|
255
|
+
- [`guc`] (`gunzip -c`): uncompress the `.csv.gz` files
|
256
|
+
- [`h1`] (`head -n1`): only examine each file's header line
|
257
|
+
- [`spc`] (`tr , $'\n'`): split the header line by commas (so each column name will be on one line, for easier `diff`ing below)
|
258
|
+
- [`kq`] (`tr -d '"'`): kill quote characters (in this case, header-column name quoting changed, but I don't care about that)
|
259
|
+
- [`kcr`] (`tr -d '\r'`): kill carriage returns (line endings also changed)
|
260
|
+
- [`snc`] (`sed -f 'snake_case.sed'`): snake-case column names
|
261
|
+
- [`sdf`] (`sed -f`): execute the `sed` substitution commands defined in the `seds` file above
|
262
|
+
- `sort`: sort the column names alphabetically (to identify missing or added columns, ignore rearrangements)
|
263
|
+
|
264
|
+
Note:
|
265
|
+
- Most of these are exported Bash functions, allowing them to be used inside the [`parallel`] command.
|
266
|
+
- I was able to build this pipeline iteratively, adding steps to normalize out the bits I didn't care about (and accumulating the `seds` commands).
|
267
|
+
</details>
|
268
|
+
|
269
|
+
Example output:
|
270
|
+
```diff
|
271
|
+
…
|
272
|
+
s3/ctbk/csvs/201910-citibike-tripdata.csv.gz.dvc:
|
273
|
+
s3/ctbk/csvs/201911-citibike-tripdata.csv.gz.dvc:
|
274
|
+
s3/ctbk/csvs/201912-citibike-tripdata.csv.gz.dvc:
|
275
|
+
s3/ctbk/csvs/202001-citibike-tripdata.csv.gz.dvc:
|
276
|
+
1,2d0
|
277
|
+
< bikeid
|
278
|
+
< birth_year
|
279
|
+
8d5
|
280
|
+
< gender
|
281
|
+
9a7,8
|
282
|
+
> ride_id
|
283
|
+
> rideable_type
|
284
|
+
15d13
|
285
|
+
< tripduration
|
286
|
+
s3/ctbk/csvs/202002-citibike-tripdata.csv.gz.dvc:
|
287
|
+
1,2d0
|
288
|
+
< bikeid
|
289
|
+
< birth_year
|
290
|
+
8d5
|
291
|
+
< gender
|
292
|
+
9a7,8
|
293
|
+
> ride_id
|
294
|
+
> rideable_type
|
295
|
+
15d13
|
296
|
+
< tripduration
|
297
|
+
s3/ctbk/csvs/202003-citibike-tripdata.csv.gz.dvc:
|
298
|
+
1,2d0
|
299
|
+
< bikeid
|
300
|
+
< birth_year
|
301
|
+
8d5
|
302
|
+
< gender
|
303
|
+
9a7,8
|
304
|
+
> ride_id
|
305
|
+
> rideable_type
|
306
|
+
15d13
|
307
|
+
< tripduration
|
308
|
+
…
|
309
|
+
```
|
310
|
+
|
311
|
+
This helped me see that the data update in question (`c0..c1`) dropped some fields (`bikeid`, `birth_year`, `gender`, `tripduration`) and added others (`ride_id`, `rideable_type`), for `202001` and later.
|
312
|
+
|
225
313
|
[DVC]: https://dvc.org/
|
226
314
|
[`parquet2json`]: https://github.com/jupiter/parquet2json
|
227
315
|
[hudcostreets/nj-crashes]: https://github.com/hudcostreets/nj-crashes
|
@@ -229,4 +317,17 @@ This time we get no output; [the given `$commit`][commit] didn't change the row
|
|
229
317
|
[commit]: https://github.com/hudcostreets/nj-crashes/commit/c8ae28e64f4917895d84074913f48e0a7afbc3d7
|
230
318
|
[commit path]: https://github.com/hudcostreets/nj-crashes/commit/c8ae28e64f4917895d84074913f48e0a7afbc3d7#diff-7f812dce61e0996354f4af414203e0933ccdfe9613cb406c40c1c41a14b9769c
|
231
319
|
[hudcostreets/nj-crashes]: https://github.com/hudcostreets/nj-crashes
|
320
|
+
[ctbk.dev gh]: https://github.com/neighbor-ryan/ctbk.dev
|
232
321
|
[`jq`]: https://jqlang.github.io/jq/
|
322
|
+
[`parallel`]: https://www.gnu.org/software/parallel/
|
323
|
+
|
324
|
+
[`gdno`]: https://github.com/ryan-williams/git-helpers/blob/96560df1406f41676f293becefb423895a755faf/diff/.gitconfig#L31
|
325
|
+
[`pel`]: https://github.com/ryan-williams/parallel-helpers/blob/e7ee109c4085c04036840ea78999cff73fcf9502/.parallel-rc#L6-L17
|
326
|
+
[`ddcr`]: https://github.com/ryan-williams/aws-helpers/blob/8a314f1e6b336833c772459de6b739f5c06a51a3/.dvc-rc#L84
|
327
|
+
[`guc`]: https://github.com/ryan-williams/zip-helpers/blob/c67d84fb06c0ab3609dacb68d900344d3b8e8f04/.zip-rc#L16
|
328
|
+
[`h1`]: https://github.com/ryan-williams/head-tail-helpers/blob/9715690f47ceeff6b6948b2093901f2b0830114b/.head-tail-rc#L3
|
329
|
+
[`spc`]: https://github.com/ryan-williams/col-helpers/blob/9493d003224249ee240d023f71ab03bdd4174b88/.cols-rc#L8
|
330
|
+
[`kq`]: https://github.com/ryan-williams/arg-helpers/blob/a8c60809f8878fa38b3c03614778fcf29132538e/.arg-rc#L115
|
331
|
+
[`kcr`]: https://github.com/ryan-williams/arg-helpers/blob/a8c60809f8878fa38b3c03614778fcf29132538e/.arg-rc#L118
|
332
|
+
[`snc`]: https://github.com/ryan-williams/case-helpers/blob/c40a62a9656f0d52d68fb3a108ae6bb3eed3c7bd/.case-rc#L9
|
333
|
+
[`sdf`]: https://github.com/ryan-williams/arg-helpers/blob/a8c60809f8878fa38b3c03614778fcf29132538e/.arg-rc#L138
|
@@ -0,0 +1,9 @@
|
|
1
|
+
dvc_utils/__init__.py,sha256=mP-p1Sl2JMMShM_hRhu86pFNfIq_8E_feh1CN47LWcs,86
|
2
|
+
dvc_utils/cli.py,sha256=D5GGDPpMlKG-8IHkTXPIB2z-QAKHzbm5gQXqj7vCquY,3701
|
3
|
+
dvc_utils/path.py,sha256=PoAbeaqRPDksY2hcUeF8xZ6Nr6hLIZprey3VNT4V5bc,1727
|
4
|
+
dvc_utils-0.0.9.dist-info/LICENSE,sha256=ZS8AReay7xmQzBAHwxIuTouGXz3SKgUa2_Sz8Ip0EzQ,1070
|
5
|
+
dvc_utils-0.0.9.dist-info/METADATA,sha256=zhFkhQbnbOYwwhHJiXUj8otqRw5tfmVsXx-b5MlByvw,11480
|
6
|
+
dvc_utils-0.0.9.dist-info/WHEEL,sha256=bFJAMchF8aTQGUgMZzHJyDDMPTO3ToJ7x23SLJa1SVo,92
|
7
|
+
dvc_utils-0.0.9.dist-info/entry_points.txt,sha256=luxCQr8OS-jMSyyDhB9KDQhUbP8UH6UMcy-vkfXX7Gg,88
|
8
|
+
dvc_utils-0.0.9.dist-info/top_level.txt,sha256=jT0-PJa2t_eFRE9rn-52AjdnZ8nQeEHllf2kJmaGh80,10
|
9
|
+
dvc_utils-0.0.9.dist-info/RECORD,,
|
dvc_utils-0.0.8.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
|
|
1
|
-
dvc_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
dvc_utils/main.py,sha256=05Cv6jlKfNnmRQjXOs9dq1eBzT2gIdwnayOsiEjmVZE,5205
|
3
|
-
dvc_utils-0.0.8.dist-info/LICENSE,sha256=ZS8AReay7xmQzBAHwxIuTouGXz3SKgUa2_Sz8Ip0EzQ,1070
|
4
|
-
dvc_utils-0.0.8.dist-info/METADATA,sha256=EplLe4Eaqnfu1LGalvWaVOvoKdUVpTjZjxpljp4YtmU,7384
|
5
|
-
dvc_utils-0.0.8.dist-info/WHEEL,sha256=bFJAMchF8aTQGUgMZzHJyDDMPTO3ToJ7x23SLJa1SVo,92
|
6
|
-
dvc_utils-0.0.8.dist-info/entry_points.txt,sha256=0JqFeb29N5ZrvwmIJrg4ii32xNFOXRul-HMq-3GOIw4,90
|
7
|
-
dvc_utils-0.0.8.dist-info/top_level.txt,sha256=jT0-PJa2t_eFRE9rn-52AjdnZ8nQeEHllf2kJmaGh80,10
|
8
|
-
dvc_utils-0.0.8.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|