sacsv 1.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sacsv-1.5.1/.gitignore +1 -0
- sacsv-1.5.1/LICENSE.md +5 -0
- sacsv-1.5.1/PKG-INFO +62 -0
- sacsv-1.5.1/README.md +51 -0
- sacsv-1.5.1/examples/csvparallel.png +0 -0
- sacsv-1.5.1/pyproject.toml +47 -0
- sacsv-1.5.1/src/sacsv/__init__.py +0 -0
- sacsv-1.5.1/src/sacsv/csv2jsonl.py +33 -0
- sacsv-1.5.1/src/sacsv/csvaddrandom.py +29 -0
- sacsv-1.5.1/src/sacsv/csvadduniqueid.py +43 -0
- sacsv-1.5.1/src/sacsv/csvaggregate.py +59 -0
- sacsv-1.5.1/src/sacsv/csvappend.py +52 -0
- sacsv-1.5.1/src/sacsv/csvcheck.py +232 -0
- sacsv-1.5.1/src/sacsv/csvdropdups.py +60 -0
- sacsv-1.5.1/src/sacsv/csvfindsortkey.py +62 -0
- sacsv-1.5.1/src/sacsv/csvkeepmax.py +52 -0
- sacsv-1.5.1/src/sacsv/csvleftjoin.py +59 -0
- sacsv-1.5.1/src/sacsv/csvop.py +49 -0
- sacsv-1.5.1/src/sacsv/csvparallel.py +118 -0
- sacsv-1.5.1/src/sacsv/csvrename.py +27 -0
- sacsv-1.5.1/src/sacsv/csvreorder.py +31 -0
- sacsv-1.5.1/src/sacsv/csvsed.py +30 -0
- sacsv-1.5.1/src/sacsv/csvsort.py +30 -0
- sacsv-1.5.1/src/sacsv/csvtranspose.py +18 -0
- sacsv-1.5.1/src/sacsv/fw2csv.py +76 -0
- sacsv-1.5.1/src/sacsv/longcsv2wide.py +94 -0
- sacsv-1.5.1/src/sacsv/widecsv2long.py +129 -0
- sacsv-1.5.1/uv.lock +290 -0
sacsv-1.5.1/.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
*.pyc
|
sacsv-1.5.1/LICENSE.md
ADDED
sacsv-1.5.1/PKG-INFO
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sacsv
|
|
3
|
+
Version: 1.5.1
|
|
4
|
+
Summary: Swiss Army csv: command-line tools to manipulate csv-formatted data
|
|
5
|
+
Project-URL: Homepage, https://github.com/gn0/sacsv
|
|
6
|
+
Author-email: Gabor Nyeki <gabor.nyeki@alumni.duke.edu>
|
|
7
|
+
License-File: LICENSE.md
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Requires-Dist: argh>=0.31.3
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# sacsv: Swiss Army csv
|
|
14
|
+
|
|
15
|
+
This Python package provides an assortment of command-line tools to manipulate csv-formatted data.
|
|
16
|
+
The tools are:
|
|
17
|
+
|
|
18
|
+
- `csv2jsonl`: converts csv input into jsonlines
|
|
19
|
+
- `csvaddrandom`: adds a column with a random number
|
|
20
|
+
- `csvadduniqueid`: adds a column with a unique record identifier
|
|
21
|
+
- `csvaggregate`: applies an arbitrary Python function to every value of a column, possibly within groups
|
|
22
|
+
- `csvappend`: appends two or more csv files
|
|
23
|
+
- `csvdropdups`: drops duplicate records
|
|
24
|
+
- `csvfindsortkey`: attempts to find the column that the input is sorted by
|
|
25
|
+
- `csvkeepmax`: keeps the record that has the maximum value in a column
|
|
26
|
+
- `csvleftjoin`: merges two csv files
|
|
27
|
+
- `csvop`: applies an arbitrary Python function to every record and saves the return value in a new column
|
|
28
|
+
- `csvparallel`: parallelizes arbitrary commands that read a csv input and write a csv output
|
|
29
|
+
- `csvrename`: changes the name of a column
|
|
30
|
+
- `csvreorder`: changes the order of columns
|
|
31
|
+
- `csvsed`: applies a substitution rule, using regular expressions, to every value of a column
|
|
32
|
+
- `csvsort`: sorts the input
|
|
33
|
+
- `csvtranspose`: transposes the input
|
|
34
|
+
- `fw2csv`: converts fixed-width input, potentially with multi-line records, into csv
|
|
35
|
+
- `longcsv2wide`: converts the input from long to wide form
|
|
36
|
+
- `widecsv2long`: converts the input from wide to long form
|
|
37
|
+
|
|
38
|
+

|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
To install this package using pip, type
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
pip install git+https://github.com/gn0/sacsv
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
or, alternatively,
|
|
49
|
+
|
|
50
|
+
```
|
|
51
|
+
git clone https://github.com/gn0/sacsv
|
|
52
|
+
pip install ./sacsv
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Author
|
|
56
|
+
|
|
57
|
+
Gabor Nyeki. Contact information is available at http://www.gabornyeki.com/.
|
|
58
|
+
|
|
59
|
+
## License
|
|
60
|
+
|
|
61
|
+
This package is licensed under the Creative Commons Attribution 4.0 International License: http://creativecommons.org/licenses/by/4.0/.
|
|
62
|
+
|
sacsv-1.5.1/README.md
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
|
|
2
|
+
# sacsv: Swiss Army csv
|
|
3
|
+
|
|
4
|
+
This Python package provides an assortment of command-line tools to manipulate csv-formatted data.
|
|
5
|
+
The tools are:
|
|
6
|
+
|
|
7
|
+
- `csv2jsonl`: converts csv input into jsonlines
|
|
8
|
+
- `csvaddrandom`: adds a column with a random number
|
|
9
|
+
- `csvadduniqueid`: adds a column with a unique record identifier
|
|
10
|
+
- `csvaggregate`: applies an arbitrary Python function to every value of a column, possibly within groups
|
|
11
|
+
- `csvappend`: appends two or more csv files
|
|
12
|
+
- `csvdropdups`: drops duplicate records
|
|
13
|
+
- `csvfindsortkey`: attempts to find the column that the input is sorted by
|
|
14
|
+
- `csvkeepmax`: keeps the record that has the maximum value in a column
|
|
15
|
+
- `csvleftjoin`: merges two csv files
|
|
16
|
+
- `csvop`: applies an arbitrary Python function to every record and saves the return value in a new column
|
|
17
|
+
- `csvparallel`: parallelizes arbitrary commands that read a csv input and write a csv output
|
|
18
|
+
- `csvrename`: changes the name of a column
|
|
19
|
+
- `csvreorder`: changes the order of columns
|
|
20
|
+
- `csvsed`: applies a substitution rule, using regular expressions, to every value of a column
|
|
21
|
+
- `csvsort`: sorts the input
|
|
22
|
+
- `csvtranspose`: transposes the input
|
|
23
|
+
- `fw2csv`: converts fixed-width input, potentially with multi-line records, into csv
|
|
24
|
+
- `longcsv2wide`: converts the input from long to wide form
|
|
25
|
+
- `widecsv2long`: converts the input from wide to long form
|
|
26
|
+
|
|
27
|
+

|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
To install this package using pip, type
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
pip install git+https://github.com/gn0/sacsv
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
or, alternatively,
|
|
38
|
+
|
|
39
|
+
```
|
|
40
|
+
git clone https://github.com/gn0/sacsv
|
|
41
|
+
pip install ./sacsv
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Author
|
|
45
|
+
|
|
46
|
+
Gabor Nyeki. Contact information is available at http://www.gabornyeki.com/.
|
|
47
|
+
|
|
48
|
+
## License
|
|
49
|
+
|
|
50
|
+
This package is licensed under the Creative Commons Attribution 4.0 International License: http://creativecommons.org/licenses/by/4.0/.
|
|
51
|
+
|
|
Binary file
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "sacsv"
|
|
7
|
+
version = "1.5.1"
|
|
8
|
+
description = "Swiss Army csv: command-line tools to manipulate csv-formatted data"
|
|
9
|
+
authors = [
|
|
10
|
+
{ name = "Gabor Nyeki", email = "gabor.nyeki@alumni.duke.edu" }
|
|
11
|
+
]
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.10"
|
|
14
|
+
dependencies = [
|
|
15
|
+
"argh>=0.31.3",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.scripts]
|
|
19
|
+
csv2jsonl = "sacsv.csv2jsonl:dispatch"
|
|
20
|
+
csvaddrandom = "sacsv.csvaddrandom:dispatch"
|
|
21
|
+
csvadduniqueid = "sacsv.csvadduniqueid:dispatch"
|
|
22
|
+
csvaggregate = "sacsv.csvaggregate:dispatch"
|
|
23
|
+
csvappend = "sacsv.csvappend:dispatch"
|
|
24
|
+
csvcheck = "sacsv.csvcheck:dispatch"
|
|
25
|
+
csvdropdups = "sacsv.csvdropdups:dispatch"
|
|
26
|
+
csvfindsortkey = "sacsv.csvfindsortkey:dispatch"
|
|
27
|
+
csvkeepmax = "sacsv.csvkeepmax:dispatch"
|
|
28
|
+
csvleftjoin = "sacsv.csvleftjoin:dispatch"
|
|
29
|
+
csvop = "sacsv.csvop:dispatch"
|
|
30
|
+
csvparallel = "sacsv.csvparallel:dispatch"
|
|
31
|
+
csvrename = "sacsv.csvrename:dispatch"
|
|
32
|
+
csvreorder = "sacsv.csvreorder:dispatch"
|
|
33
|
+
csvsed = "sacsv.csvsed:dispatch"
|
|
34
|
+
csvsort = "sacsv.csvsort:dispatch"
|
|
35
|
+
csvtranspose = "sacsv.csvtranspose:dispatch"
|
|
36
|
+
fw2csv = "sacsv.fw2csv:dispatch"
|
|
37
|
+
longcsv2wide = "sacsv.longcsv2wide:dispatch"
|
|
38
|
+
widecsv2long = "sacsv.widecsv2long:dispatch"
|
|
39
|
+
|
|
40
|
+
[project.urls]
|
|
41
|
+
Homepage = "https://github.com/gn0/sacsv"
|
|
42
|
+
|
|
43
|
+
[dependency-groups]
|
|
44
|
+
dev = [
|
|
45
|
+
"pytest>=9.0.2",
|
|
46
|
+
"pytest-cov>=7.0.0",
|
|
47
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import argh
|
|
2
|
+
import csv
|
|
3
|
+
import json
|
|
4
|
+
import sys
|
|
5
|
+
import collections
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def cast(obj):
    """Return ``obj`` converted to ``float`` when possible, else ``obj`` itself.

    Only conversion failures are swallowed: ``float()`` raises ``ValueError``
    for text that is not a number and ``TypeError`` for unsupported types.
    Anything else (e.g. ``KeyboardInterrupt``) propagates to the caller.
    """
    try:
        return float(obj)
    except (TypeError, ValueError):
        # Not numeric: leave the value exactly as it was read.
        return obj
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@argh.arg("-a", "--auto-cast", default=False)
def main(auto_cast=None):
    """Read csv from standard input and print one JSON object per record.

    The first input row supplies the keys. With --auto-cast, each value is
    passed through cast() before being serialized.
    """
    rows = csv.reader(sys.stdin)
    column_names = next(rows)

    # Pick the per-value conversion once instead of testing the flag per cell.
    convert = cast if auto_cast else (lambda value: value)

    for row in rows:
        pairs = [
            (name, convert(value))
            for name, value in zip(column_names, row)
        ]
        print(json.dumps(collections.OrderedDict(pairs)))

    sys.stdout.flush()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def dispatch():
    """Console-script entry point: let argh parse the arguments for main."""
    argh.dispatch_command(main)


# Allow running the module directly as a script.
if __name__ == "__main__":
    dispatch()
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import argh
|
|
2
|
+
import random
|
|
3
|
+
import csv
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@argh.arg("-s", "--seed", type=int, required=True)
@argh.arg("-c", "--column-name", type=str, required=True)
def main(column_name=None, seed=None):
    """Copy csv from standard input to standard output with one extra column.

    The new column is appended last, is named by --column-name, and holds a
    random integer between 1 and 2**31 (both inclusive) drawn from a
    generator seeded with --seed.
    """
    random.seed(seed)

    source = csv.reader(sys.stdin)
    sink = csv.writer(sys.stdout)

    column_names = next(source)
    sink.writerow([*column_names, column_name])

    for row in source:
        draw = random.randint(1, 2**31)
        sink.writerow([*row, draw])
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def dispatch():
    """Console-script entry point: let argh parse the arguments for main."""
    argh.dispatch_command(main)


# Allow running the module directly as a script.
if __name__ == "__main__":
    dispatch()
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import argh
|
|
2
|
+
import sys
|
|
3
|
+
import csv
|
|
4
|
+
import itertools as it
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@argh.arg("-g", "--group-by", type=str, nargs="+")
@argh.arg("-s", "--sort-by", type=str, nargs="+")
@argh.arg("-c", "--column-name", type=str, required=True)
def main(group_by=None, sort_by=None, column_name=None):
    """Prepend a running record number to csv read from standard input.

    Records are sorted by the --group-by columns and numbered from 1 within
    each group, in the order given by the --sort-by columns. Without
    --group-by all records form a single group; without --sort-by the
    records keep their relative order inside a group. Column values are
    compared as text.
    """
    rows = csv.reader(sys.stdin)
    column_names = next(rows)

    def make_selector(names):
        # No columns requested: every record gets the same constant key.
        if names is None:
            return lambda row: 1

        positions = tuple(column_names.index(name) for name in names)
        return lambda row: tuple(row[p] for p in positions)

    group_key = make_selector(group_by)
    sort_key = make_selector(sort_by)

    out = csv.writer(sys.stdout)
    out.writerow([column_name] + column_names)

    # groupby only merges adjacent records, hence the sort by the same key.
    grouped = it.groupby(sorted(rows, key=group_key), group_key)

    for _, members in grouped:
        ordered = sorted(members, key=sort_key)
        for serial, row in enumerate(ordered, start=1):
            out.writerow([serial] + row)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def dispatch():
    """Console-script entry point: let argh parse the arguments for main."""
    argh.dispatch_command(main)


# Allow running the module directly as a script.
if __name__ == "__main__":
    dispatch()
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import argh
|
|
2
|
+
import importlib
|
|
3
|
+
import csv
|
|
4
|
+
import sys
|
|
5
|
+
import operator as op
|
|
6
|
+
import itertools as it
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@argh.arg("-m", "--import-mod", type=str, nargs="+", required=False)
@argh.arg("-c", "--columns", type=str, nargs="+", required=True)
@argh.arg("-g", "--group-by", type=str, nargs="+", required=False)
@argh.arg("-f", "--func-def", type=str, required=True)
def main(import_mod=None, columns=None, group_by=None, func_def=None):
    """Aggregate csv columns from standard input with a user-supplied function.

    The expression given by --func-def is evaluated to a callable, which is
    then applied to the list of values of each --columns column. With
    --group-by, records are grouped by those columns first and one output
    row is written per group, prefixed by the group's key values. Modules
    named by --import-mod are imported before the expression is evaluated
    so that it can refer to them.
    """
    for m in import_mod or ():
        # Bind the top-level package name so that "a.b" is reachable as
        # a.b inside the evaluated expression.
        top_level = m.split(".")[0]
        globals()[top_level] = importlib.import_module(top_level)
        importlib.import_module(m)

    # SECURITY: eval() executes arbitrary Python taken from the command
    # line. This is the purpose of the tool, but it must never be fed
    # input from an untrusted source.
    f = eval(func_def)

    reader = csv.reader(sys.stdin)
    header = next(reader)
    pickers = tuple(op.itemgetter(header.index(c)) for c in columns)

    writer = csv.writer(sys.stdout)

    if group_by is None:
        group_key = lambda r: 1
        writer.writerow(columns)
    else:
        # Resolve the column positions once; the key function is called
        # for every record, both while sorting and while grouping.
        group_indices = tuple(header.index(c) for c in group_by)
        group_key = lambda r: tuple(r[i] for i in group_indices)
        writer.writerow(group_by + columns)

    # groupby only merges adjacent records, hence the sort by the same key.
    for group_id, record_iter in it.groupby(
            sorted(reader, key=group_key), group_key):
        records = list(record_iter)
        aggregates = tuple(
            f([pick(record) for record in records])
            for pick in pickers)

        if group_by is None:
            writer.writerow(aggregates)
        else:
            writer.writerow(group_id + aggregates)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def dispatch():
    """Console-script entry point: let argh parse the arguments for main."""
    argh.dispatch_command(main)


# Allow running the module directly as a script.
if __name__ == "__main__":
    dispatch()
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import argh
|
|
2
|
+
import csv
|
|
3
|
+
import sys
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def get_fieldnames(fieldnames_iter):
    """Return the sorted union of the field names of several inputs.

    ``fieldnames_iter`` yields one iterable of field names per input; the
    result is a sorted list in which every distinct name appears once.
    """
    union = set()

    for fieldnames in fieldnames_iter:
        union.update(fieldnames)

    return sorted(union)


# Import-time self-check.
assert get_fieldnames(
    (("a", "b"), ("a", "b", "c"))) == ["a", "b", "c"]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def make_get_fields(*fieldnames):
    """Build a function that extracts ``fieldnames`` from a mapping.

    The returned function takes a record supporting ``.get`` and returns a
    tuple with one entry per field name, in the order given here. Fields
    absent from the record yield ``None``.
    """
    def get_fields(record):
        return tuple(map(record.get, fieldnames))

    return get_fields


# Import-time self-check.
assert make_get_fields("a", "b")(dict(a=1)) == (1, None)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@argh.arg("csv_filename", nargs="+")
def main(csv_filename):
    """Concatenate the records of several csv files on standard output.

    The output header is the sorted union of the input headers. Each record
    is written with its values placed under the matching column; columns a
    file does not have are left empty for that file's records.
    """
    import contextlib

    # ExitStack closes every opened input file when the block exits, also
    # when a later file cannot be opened or writing fails midway.
    with contextlib.ExitStack() as stack:
        dict_readers = tuple(
            csv.DictReader(stack.enter_context(open(filename, "r")))
            for filename in csv_filename)

        fieldnames = get_fieldnames(r.fieldnames for r in dict_readers)
        get_fields = make_get_fields(*fieldnames)

        writer = csv.writer(sys.stdout)

        writer.writerow(fieldnames)
        writer.writerows(
            get_fields(record)
            for dict_reader in dict_readers
            for record in dict_reader)

        sys.stdout.flush()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def dispatch():
    """Console-script entry point: let argh parse the arguments for main."""
    argh.dispatch_command(main)


# Allow running the module directly as a script.
if __name__ == "__main__":
    dispatch()
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
import argh
|
|
2
|
+
import sys
|
|
3
|
+
import csv
|
|
4
|
+
|
|
5
|
+
from collections import namedtuple
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Outcome of one check on one column:
#   success  -- False when the check found a problem
#   message  -- description of the problem, or None
#   problems -- indices of the offending values, or None
CheckResult = namedtuple("CheckResult", ["success", "message", "problems"])
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def column_values(rows, index):
    """Return, as a tuple, the value at position ``index`` of every row."""
    picked = []

    for row in rows:
        picked.append(row[index])

    return tuple(picked)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def print_problems_markdown(header, records):
    """Print ``records`` under ``header`` as a Markdown table, then a blank line.

    Every column is as wide as its longest cell (header included) and the
    cells are right-aligned.
    """
    # Width of each column: longest cell among the header and all records.
    widths = []
    for position, name in enumerate(header):
        cells = [name]
        cells.extend(record[position] for record in records)
        widths.append(max(len(cell) for cell in cells))

    def format_row(cells):
        return "|" + "".join(
            " %*s |" % (width, cell)
            for width, cell in zip(widths, cells))

    # Header.
    #

    print(format_row(header))
    print("|" + "".join(" %s |" % ("-" * width) for width in widths))

    # Body.
    #

    for record in records:
        print(format_row(record))

    print()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def share_samelength_among_nonmissing(values):
    """Placeholder: not implemented yet; always returns ``None``."""
    return None
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def is_nonmissing(value):
    """Return True unless ``value`` is a missing marker: ``""`` or ``"NA"``."""
    missing_markers = ("", "NA")

    return value not in missing_markers
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def is_numeric(value):
    """Return True when ``float(value)`` succeeds, False on ``ValueError``."""
    try:
        float(value)
    except ValueError:
        return False
    else:
        return True
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def is_text(value):
    """Return True when ``value`` is not numeric according to ``is_numeric``."""
    parses_as_number = is_numeric(value)

    return not parses_as_number
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def is_unique(value, values):
    """Return True when ``value`` occurs at most once in ``values``.

    A value that does not occur at all also counts as unique.
    """
    occurrences = values.count(value)

    return occurrences <= 1
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def share_type_among_nonmissing(type_func, values):
    """Return the fraction of non-missing ``values`` for which ``type_func`` holds.

    Missing values (per ``is_nonmissing``) are ignored. Returns ``None``
    when no non-missing value is left to compute a share from.
    """
    considered = 0
    matching = 0

    for value in values:
        if not is_nonmissing(value):
            continue

        considered += 1
        matching += type_func(value)

    if considered == 0:
        return None

    return matching / considered
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def share_numeric_among_nonmissing(values):
    """Return the share of non-missing ``values`` that are numeric.

    The result is whatever ``share_type_among_nonmissing`` returns for the
    ``is_numeric`` predicate.
    """
    return share_type_among_nonmissing(type_func=is_numeric, values=values)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def share_text_among_nonmissing(values):
    """Return the share of non-missing ``values`` that are text.

    The result is whatever ``share_type_among_nonmissing`` returns for the
    ``is_text`` predicate.
    """
    return share_type_among_nonmissing(type_func=is_text, values=values)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def share_unique_among_nonmissing(values):
    """Return the share of non-missing ``values`` that occur only once.

    Occurrences are counted over all of ``values`` in a single pass, so the
    cost stays linear in the number of values instead of rescanning the
    whole column for each value. The result is whatever
    ``share_type_among_nonmissing`` returns for that predicate.
    """
    from collections import Counter

    # Counter needs hashable values; the values are assumed to be the
    # strings produced by csv.reader.
    occurrences = Counter(values)

    return share_type_among_nonmissing(
        lambda x: occurrences[x] <= 1,
        values)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def mostly_samelength(values):
    """Placeholder for a same-length heuristic; currently always False."""
    return False  # TODO
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def mostly_numeric(values):
    """Return True when at least 90% of the non-missing ``values`` are numeric.

    Also True when ``share_numeric_among_nonmissing`` reports no share
    (``None``).
    """
    share = share_numeric_among_nonmissing(values)

    if share is None:
        return True

    return share >= .9
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def mostly_text(values):
    """Return True when at least 90% of the non-missing ``values`` are text.

    Also True when ``share_text_among_nonmissing`` reports no share
    (``None``).
    """
    share = share_text_among_nonmissing(values)

    if share is None:
        return True

    return share >= .9
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def mostly_unique(values):
    """Return True when at least 90% of the non-missing ``values`` are unique.

    Also True when ``share_unique_among_nonmissing`` reports no share
    (``None``).
    """
    share = share_unique_among_nonmissing(values)

    if share is None:
        return True

    return share >= .9
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def check_column_type_consistency(values):
    """Checks that if most values in a column are a certain
    type, then all values in that column are that type.

    Returns a ``CheckResult``. On failure, ``problems`` holds the indices
    of the values whose type disagrees with the majority; on success,
    ``message`` and ``problems`` are ``None``.
    """

    success = True
    message = None
    problems = None

    if mostly_samelength(values):
        # E.g., phone numbers that each have 11 digits.
        #
        pass # TODO
    elif mostly_numeric(values):
        text_share = share_text_among_nonmissing(values)

        # The share is None when the column has no non-missing values.
        # mostly_numeric() is True in that case too, and comparing None
        # with a number would raise TypeError.
        if text_share is not None and text_share > 0:
            success = False
            message = "Text value in mostly numeric column."
            problems = tuple(
                i
                for i, value in enumerate(values)
                if is_nonmissing(value) and is_text(value))
    elif mostly_text(values):
        numeric_share = share_numeric_among_nonmissing(values)

        # Same guard as above for a share of None.
        if numeric_share is not None and numeric_share > 0:
            success = False
            message = "Numeric value in mostly text column."
            problems = tuple(
                i
                for i, value in enumerate(values)
                if is_nonmissing(value) and is_numeric(value))

    return CheckResult(
        success=success,
        message=message,
        problems=problems)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def check_column_no_odd_value_repetitions(values):
    """Checks that if most values in a column are unique, then
    all values in that column are unique.

    Returns a ``CheckResult``. On failure, ``problems`` holds the indices
    of the non-missing values that occur more than once; on success,
    ``message`` and ``problems`` are ``None``.
    """

    success = True
    message = None
    problems = None

    if mostly_unique(values):
        unique_share = share_unique_among_nonmissing(values)

        # The share is None when the column has no non-missing values.
        # mostly_unique() is True in that case too, and comparing None
        # with a number would raise TypeError.
        if unique_share is not None and unique_share < 1:
            success = False
            message = "Repeated values in column with mostly unique values."
            problems = tuple(
                i
                for i, value in enumerate(values)
                if is_nonmissing(value)
                and not is_unique(value, values))

    return CheckResult(
        success=success,
        message=message,
        problems=problems)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def main():
    """Run every column check on csv read from standard input.

    For each failed check a warning naming the column is printed, followed
    by a Markdown table of the offending records. The process exits with
    status 1 when at least one check failed.
    """
    rows = csv.reader(sys.stdin)

    header = next(rows)
    records = tuple(rows)

    column_checks = (
        check_column_type_consistency,
        check_column_no_odd_value_repetitions,
    )

    # TODO
    # row_checks = (
    #     check_row_no_odd_value_repetitions,)

    failures = 0

    for check in column_checks:
        for position, column in enumerate(header):
            outcome = check(column_values(records, position))

            if outcome.success:
                continue

            failures += 1

            print("warning: %s: %s"
                  % (column, outcome.message))

            offending = tuple(records[index] for index in outcome.problems)
            print_problems_markdown(header, offending)

    if failures:
        sys.exit(1)
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def dispatch():
    """Console-script entry point: let argh parse the arguments for main."""
    argh.dispatch_command(main)


# Allow running the module directly as a script.
if __name__ == "__main__":
    dispatch()
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import argh
|
|
2
|
+
import csv
|
|
3
|
+
import sys
|
|
4
|
+
import itertools as it
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def make_key(key_columns, columns):
    """Build a sort/group key function over the named columns.

    ``columns`` is the header and ``key_columns`` the subset of names to
    key on. The returned function maps a record to the tuple of its values
    in ``key_columns`` order. With no key columns, it maps every record to
    the constant ``1``, so that all records compare equal.

    Raises ``ValueError`` if a key column is not in ``columns``.
    """
    if len(key_columns) == 0:
        return lambda x: 1

    # Resolve the positions once: the key function runs for every record,
    # and columns.index() is a linear scan of the header.
    positions = tuple(columns.index(c) for c in key_columns)

    def key(item):
        return tuple(item[p] for p in positions)

    return key
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@argh.arg("-k", "--key", type=str, nargs="+", required=True)
@argh.arg("-f", "--keep-first", type=str, nargs="*")
@argh.arg("-l", "--keep-last", type=str, nargs="*")
def main(key=None, keep_first=None, keep_last=None):
    """Drop duplicate records from csv read from standard input.

    Records sharing the same values in the --key columns are duplicates.
    Within each set of duplicates, the records are ordered by the columns
    given to --keep-first or --keep-last, and the first or the last one,
    respectively, is written out. Exactly one of the two options must be
    given. Column values are compared as text.
    """
    first_given = keep_first is not None
    last_given = keep_last is not None

    if not first_given and not last_given:
        raise argh.CommandError(
            "Must specify either --keep-first or --keep-last.")
    if first_given and last_given:
        raise argh.CommandError(
            "Must specify either --keep-first or --keep-last "
            + "but not both.")

    rows = csv.reader(sys.stdin)
    columns = next(rows)

    primary_key = make_key(key, columns)
    secondary_key = make_key(
        keep_first if first_given else keep_last, columns)

    out = csv.writer(sys.stdout)
    out.writerow(columns)

    # Index of the survivor inside each ordered set of duplicates.
    survivor = 0 if first_given else -1

    # groupby only merges adjacent records, hence the sort by the same key.
    for _, duplicates in it.groupby(
            sorted(rows, key=primary_key), primary_key):
        ranked = sorted(duplicates, key=secondary_key)
        out.writerow(ranked[survivor])
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def dispatch():
    """Console-script entry point: let argh parse the arguments for main."""
    argh.dispatch_command(main)


# Allow running the module directly as a script.
if __name__ == "__main__":
    dispatch()
|