sacsv 1.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sacsv-1.5.1/.gitignore ADDED
@@ -0,0 +1 @@
1
+ *.pyc
sacsv-1.5.1/LICENSE.md ADDED
@@ -0,0 +1,5 @@
1
+
2
+ Copyright (C) 2018 Gabor Nyeki
3
+
4
+ This package is licensed under the Creative Commons Attribution 4.0 International License: http://creativecommons.org/licenses/by/4.0/.
5
+
sacsv-1.5.1/PKG-INFO ADDED
@@ -0,0 +1,62 @@
1
+ Metadata-Version: 2.4
2
+ Name: sacsv
3
+ Version: 1.5.1
4
+ Summary: Swiss Army csv: command-line tools to manipulate csv-formatted data
5
+ Project-URL: Homepage, https://github.com/gn0/sacsv
6
+ Author-email: Gabor Nyeki <gabor.nyeki@alumni.duke.edu>
7
+ License-File: LICENSE.md
8
+ Requires-Python: >=3.10
9
+ Requires-Dist: argh>=0.31.3
10
+ Description-Content-Type: text/markdown
11
+
12
+
13
+ # sacsv: Swiss Army csv
14
+
15
+ This Python package provides an assortment of command-line tools to manipulate csv-formatted data.
16
+ The tools are:
17
+
18
+ - `csv2jsonl`: converts csv input into jsonlines
19
+ - `csvaddrandom`: adds a column with a random number
20
+ - `csvadduniqueid`: adds a column with a unique record identifier
21
+ - `csvaggregate`: applies an arbitrary Python function to every value of a column, possibly within groups
22
+ - `csvappend`: appends two or more csv files
23
+ - `csvdropdups`: drops duplicate records
24
+ - `csvfindsortkey`: attempts to find the column that the input is sorted by
25
+ - `csvkeepmax`: keeps the record that has the maximum value in a column
26
+ - `csvleftjoin`: merges two csv files
27
+ - `csvop`: applies an arbitrary Python function to every record and saves the return value in a new column
28
+ - `csvparallel`: parallelizes arbitrary commands that read a csv input and write a csv output
29
+ - `csvrename`: changes the name of a column
30
+ - `csvreorder`: changes the order of columns
31
+ - `csvsed`: applies a substitution rule, using regular expressions, to every value of a column
32
+ - `csvsort`: sorts the input
33
+ - `csvtranspose`: transposes the input
34
+ - `fw2csv`: converts fixed-width input, potentially with multi-line records, into csv
35
+ - `longcsv2wide`: converts the input from long to wide form
36
+ - `widecsv2long`: converts the input from wide to long form
37
+
38
+ ![Illustration of `csvparallel` executing `f.py` in 8 jobs](examples/csvparallel.png)
39
+
40
+ ## Installation
41
+
42
+ To install this package using pip, type
43
+
44
+ ```
45
+ pip install git+https://github.com/gn0/sacsv
46
+ ```
47
+
48
+ or, alternatively,
49
+
50
+ ```
51
+ git clone https://github.com/gn0/sacsv
52
+ pip install ./sacsv
53
+ ```
54
+
55
+ ## Author
56
+
57
+ Gabor Nyeki. Contact information is available at http://www.gabornyeki.com/.
58
+
59
+ ## License
60
+
61
+ This package is licensed under the Creative Commons Attribution 4.0 International License: http://creativecommons.org/licenses/by/4.0/.
62
+
sacsv-1.5.1/README.md ADDED
@@ -0,0 +1,51 @@
1
+
2
+ # sacsv: Swiss Army csv
3
+
4
+ This Python package provides an assortment of command-line tools to manipulate csv-formatted data.
5
+ The tools are:
6
+
7
+ - `csv2jsonl`: converts csv input into jsonlines
8
+ - `csvaddrandom`: adds a column with a random number
9
+ - `csvadduniqueid`: adds a column with a unique record identifier
10
+ - `csvaggregate`: applies an arbitrary Python function to every value of a column, possibly within groups
11
+ - `csvappend`: appends two or more csv files
12
+ - `csvdropdups`: drops duplicate records
13
+ - `csvfindsortkey`: attempts to find the column that the input is sorted by
14
+ - `csvkeepmax`: keeps the record that has the maximum value in a column
15
+ - `csvleftjoin`: merges two csv files
16
+ - `csvop`: applies an arbitrary Python function to every record and saves the return value in a new column
17
+ - `csvparallel`: parallelizes arbitrary commands that read a csv input and write a csv output
18
+ - `csvrename`: changes the name of a column
19
+ - `csvreorder`: changes the order of columns
20
+ - `csvsed`: applies a substitution rule, using regular expressions, to every value of a column
21
+ - `csvsort`: sorts the input
22
+ - `csvtranspose`: transposes the input
23
+ - `fw2csv`: converts fixed-width input, potentially with multi-line records, into csv
24
+ - `longcsv2wide`: converts the input from long to wide form
25
+ - `widecsv2long`: converts the input from wide to long form
26
+
27
+ ![Illustration of `csvparallel` executing `f.py` in 8 jobs](examples/csvparallel.png)
28
+
29
+ ## Installation
30
+
31
+ To install this package using pip, type
32
+
33
+ ```
34
+ pip install git+https://github.com/gn0/sacsv
35
+ ```
36
+
37
+ or, alternatively,
38
+
39
+ ```
40
+ git clone https://github.com/gn0/sacsv
41
+ pip install ./sacsv
42
+ ```
43
+
44
+ ## Author
45
+
46
+ Gabor Nyeki. Contact information is available at http://www.gabornyeki.com/.
47
+
48
+ ## License
49
+
50
+ This package is licensed under the Creative Commons Attribution 4.0 International License: http://creativecommons.org/licenses/by/4.0/.
51
+
Binary file
@@ -0,0 +1,47 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "sacsv"
7
+ version = "1.5.1"
8
+ description = "Swiss Army csv: command-line tools to manipulate csv-formatted data"
9
+ authors = [
10
+ { name = "Gabor Nyeki", email = "gabor.nyeki@alumni.duke.edu" }
11
+ ]
12
+ readme = "README.md"
13
+ requires-python = ">=3.10"
14
+ dependencies = [
15
+ "argh>=0.31.3",
16
+ ]
17
+
18
+ [project.scripts]
19
+ csv2jsonl = "sacsv.csv2jsonl:dispatch"
20
+ csvaddrandom = "sacsv.csvaddrandom:dispatch"
21
+ csvadduniqueid = "sacsv.csvadduniqueid:dispatch"
22
+ csvaggregate = "sacsv.csvaggregate:dispatch"
23
+ csvappend = "sacsv.csvappend:dispatch"
24
+ csvcheck = "sacsv.csvcheck:dispatch"
25
+ csvdropdups = "sacsv.csvdropdups:dispatch"
26
+ csvfindsortkey = "sacsv.csvfindsortkey:dispatch"
27
+ csvkeepmax = "sacsv.csvkeepmax:dispatch"
28
+ csvleftjoin = "sacsv.csvleftjoin:dispatch"
29
+ csvop = "sacsv.csvop:dispatch"
30
+ csvparallel = "sacsv.csvparallel:dispatch"
31
+ csvrename = "sacsv.csvrename:dispatch"
32
+ csvreorder = "sacsv.csvreorder:dispatch"
33
+ csvsed = "sacsv.csvsed:dispatch"
34
+ csvsort = "sacsv.csvsort:dispatch"
35
+ csvtranspose = "sacsv.csvtranspose:dispatch"
36
+ fw2csv = "sacsv.fw2csv:dispatch"
37
+ longcsv2wide = "sacsv.longcsv2wide:dispatch"
38
+ widecsv2long = "sacsv.widecsv2long:dispatch"
39
+
40
+ [project.urls]
41
+ Homepage = "https://github.com/gn0/sacsv"
42
+
43
+ [dependency-groups]
44
+ dev = [
45
+ "pytest>=9.0.2",
46
+ "pytest-cov>=7.0.0",
47
+ ]
File without changes
@@ -0,0 +1,33 @@
1
+ import argh
2
+ import csv
3
+ import json
4
+ import sys
5
+ import collections
6
+
7
+
8
def cast(obj):
    """Convert `obj` to a float if possible; otherwise return it unchanged.

    Only the errors that `float()` raises for unconvertible input are
    handled.  A bare `except:` would also swallow `KeyboardInterrupt`
    and `SystemExit`, which must be allowed to propagate.
    """
    try:
        return float(obj)
    except (TypeError, ValueError, OverflowError):
        return obj
13
+
14
+
15
@argh.arg("-a", "--auto-cast", default=False)
def main(auto_cast=None):
    # Reads csv from stdin and prints one JSON object per record to
    # stdout, keyed by the header row.  With --auto-cast, every value
    # is passed through `cast` first.  (Kept as a comment: argh would
    # show a docstring as the command's help text.)
    rows = csv.reader(sys.stdin)
    column_names = next(rows)

    for row in rows:
        if auto_cast:
            values = (cast(value) for value in row)
        else:
            values = row

        # zip() stops at the shorter of header and row, so surplus
        # cells on either side are dropped.
        line = json.dumps(
            collections.OrderedDict(zip(column_names, values)))
        print(line)

    sys.stdout.flush()
26
+
27
+
28
def dispatch():
    """Console-script entry point: let argh parse the command line and
    invoke `main`."""
    argh.dispatch_command(main)


if __name__ == "__main__":
    dispatch()
@@ -0,0 +1,29 @@
1
+ import argh
2
+ import random
3
+ import csv
4
+ import sys
5
+
6
+
7
@argh.arg("-s", "--seed", type=int, required=True)
@argh.arg("-c", "--column-name", type=str, required=True)
def main(column_name=None, seed=None):
    # Copies csv from stdin to stdout, appending a column named
    # `column_name` that holds a random integer between 1 and 2**31
    # (inclusive) for every record.  Seeding makes the output
    # reproducible for a given input.
    random.seed(seed)

    rows = csv.reader(sys.stdin)
    out = csv.writer(sys.stdout)

    out.writerow([*next(rows), column_name])
    out.writerows(
        [*row, random.randint(1, 2**31)]
        for row in rows)
22
+
23
+
24
def dispatch():
    """Console-script entry point: let argh parse the command line and
    invoke `main`."""
    argh.dispatch_command(main)


if __name__ == "__main__":
    dispatch()
@@ -0,0 +1,43 @@
1
+ import argh
2
+ import sys
3
+ import csv
4
+ import itertools as it
5
+
6
+
7
@argh.arg("-g", "--group-by", type=str, nargs="+")
@argh.arg("-s", "--sort-by", type=str, nargs="+")
@argh.arg("-c", "--column-name", type=str, required=True)
def main(group_by=None, sort_by=None, column_name=None):
    # Prepends a column named `column_name` that numbers the records
    # 1, 2, 3, ... within each --group-by group, in --sort-by order.
    # The numbering restarts at 1 for every group.  Keys are compared
    # as the strings read from the csv input.
    rows = csv.reader(sys.stdin)
    header = next(rows)

    def key_for(names):
        # Without any columns, a constant key puts every record in
        # one group (or leaves the order untouched, since sorted() is
        # stable).
        if names is None:
            return lambda row: 1

        positions = tuple(header.index(name) for name in names)
        return lambda row: tuple(row[p] for p in positions)

    group_key = key_for(group_by)
    sort_key = key_for(sort_by)

    out = csv.writer(sys.stdout)
    out.writerow([column_name, *header])

    # groupby() only merges adjacent records, hence the sort by the
    # same key first.
    grouped = it.groupby(sorted(rows, key=group_key), group_key)

    for _, members in grouped:
        ordered = sorted(members, key=sort_key)

        for unique_id, row in enumerate(ordered, 1):
            out.writerow([unique_id, *row])
36
+
37
+
38
def dispatch():
    """Console-script entry point: let argh parse the command line and
    invoke `main`."""
    argh.dispatch_command(main)


if __name__ == "__main__":
    dispatch()
@@ -0,0 +1,59 @@
1
+ import argh
2
+ import importlib
3
+ import csv
4
+ import sys
5
+ import operator as op
6
+ import itertools as it
7
+
8
+
9
@argh.arg("-m", "--import-mod", type=str, nargs="+", required=False)
@argh.arg("-c", "--columns", type=str, nargs="+", required=True)
@argh.arg("-g", "--group-by", type=str, nargs="+", required=False)
@argh.arg("-f", "--func-def", type=str, required=True)
def main(import_mod=None, columns=None, group_by=None, func_def=None):
    # Applies the function given by --func-def to the list of values
    # of each --columns column, once per --group-by group (or once
    # over the whole input when no grouping is requested), and writes
    # one output record per group.
    for module_name in import_mod or ():
        # Bind the top-level package in the module globals so that
        # the --func-def expression can refer to it by name, then
        # import the full dotted path so that submodules are loaded.
        top_level = module_name.split(".")[0]
        globals()[top_level] = importlib.import_module(top_level)
        importlib.import_module(module_name)

    # SECURITY: this evaluates an arbitrary Python expression taken
    # from the command line.  That is the purpose of the tool, but
    # --func-def must never be populated from untrusted input.
    f = eval(func_def)

    reader = csv.reader(sys.stdin)
    header = next(reader)
    pickers = tuple(op.itemgetter(header.index(c)) for c in columns)

    writer = csv.writer(sys.stdout)

    if group_by is None:
        group_key = lambda r: 1
        writer.writerow(columns)
    else:
        # Resolve the column positions once instead of calling
        # header.index() for every record; an unknown group column now
        # fails here, before any output is written.
        group_indices = tuple(header.index(c) for c in group_by)
        group_key = lambda r: tuple(r[i] for i in group_indices)
        writer.writerow(group_by + columns)

    # groupby() only merges adjacent records, hence the sort by the
    # same key first.
    for group_id, record_iter in it.groupby(
            sorted(reader, key=group_key),
            group_key):
        records = tuple(record_iter)

        # Collect every column's values before calling `f`, so that a
        # malformed record is reported before any aggregation runs.
        values = [[pick(record) for record in records]
                  for pick in pickers]
        aggregates = tuple(f(column) for column in values)

        if group_by is None:
            writer.writerow(aggregates)
        else:
            writer.writerow(group_id + aggregates)
52
+
53
+
54
def dispatch():
    """Console-script entry point: let argh parse the command line and
    invoke `main`."""
    argh.dispatch_command(main)


if __name__ == "__main__":
    dispatch()
@@ -0,0 +1,52 @@
1
+ import argh
2
+ import csv
3
+ import sys
4
+
5
+
6
def get_fieldnames(fieldnames_iter):
    """Return the sorted union of the field names found in all of the
    given header sequences."""
    unique_names = set()

    for fieldnames in fieldnames_iter:
        unique_names.update(fieldnames)

    return sorted(unique_names)


# Import-time self-check.
assert (get_fieldnames(
            (("a", "b"), ("a", "b", "c")))
        == ["a", "b", "c"])
16
+
17
+
18
def make_get_fields(*fieldnames):
    """Return a function that maps a dict-like record to the tuple of
    its values for `fieldnames`, in that order.

    Fields that the record lacks come out as None, because the values
    are looked up with `record.get`.
    """
    return lambda record: tuple(
        record.get(name) for name in fieldnames)


# Import-time self-check.
assert make_get_fields("a", "b")(dict(a=1)) == (1, None)
26
+
27
+
28
@argh.arg("csv_filename", nargs="+")
def main(csv_filename):
    # Concatenates the records of all the given csv files and writes
    # them to stdout under a header that is the sorted union of the
    # files' field names.  Fields that a file lacks are written empty.
    from contextlib import ExitStack

    # The ExitStack closes every input file on exit, including when
    # an error interrupts the run; previously the files were opened
    # and never closed.
    with ExitStack() as stack:
        # newline="" is what the csv module requires for files it
        # reads, so that newlines embedded in quoted fields survive.
        dict_readers = tuple(
            csv.DictReader(
                stack.enter_context(open(filename, "r", newline="")))
            for filename in csv_filename)

        fieldnames = get_fieldnames(r.fieldnames for r in dict_readers)
        get_fields = make_get_fields(*fieldnames)

        writer = csv.writer(sys.stdout)

        writer.writerow(fieldnames)
        writer.writerows(
            get_fields(record)
            for dict_reader in dict_readers
            for record in dict_reader)

        sys.stdout.flush()
45
+
46
+
47
def dispatch():
    """Console-script entry point: let argh parse the command line and
    invoke `main`."""
    argh.dispatch_command(main)


if __name__ == "__main__":
    dispatch()
@@ -0,0 +1,232 @@
1
+ import argh
2
+ import sys
3
+ import csv
4
+
5
+ from collections import namedtuple
6
+
7
+
8
# Outcome of one column check: `success` is the verdict, `message`
# describes a failure, and `problems` holds the indices of the
# offending values (the latter two are None when the check passes).
CheckResult = namedtuple("CheckResult", "success message problems")
11
+
12
+
13
def column_values(rows, index):
    """Return the tuple of the `index`-th cell of every row in `rows`.

    Raises IndexError if any row has no cell at `index`.
    """
    cells = [row[index] for row in rows]
    return tuple(cells)
15
+
16
+
17
def print_problems_markdown(header, records):
    """Print `records` under `header` as a Markdown table on stdout.

    Every cell is right-aligned and padded to the width of the longest
    cell in its column, the header cell included.  A blank line is
    printed after the table.
    """
    widths = tuple(
        max(len(cell)
            for cell in (name,) + column_values(records, position))
        for position, name in enumerate(header))

    def render(cells):
        # One table line: "| cell | cell |" with padded cells.
        padded = (
            " " + cell.rjust(width) + " |"
            for cell, width in zip(cells, widths))
        return "|" + "".join(padded)

    # Header.
    #
    print(render(header))
    print("|" + "".join(" " + "-" * width + " |" for width in widths))

    # Body.
    #
    for record in records:
        print(render(record))

    print()
51
+
52
+
53
def share_samelength_among_nonmissing(values):
    """Placeholder that is not implemented yet; always returns None."""
    return None
55
+
56
+
57
def is_nonmissing(value):
    """Return True unless `value` is one of the missing-value markers,
    the empty string or "NA"."""
    missing_markers = ("", "NA")
    return value not in missing_markers
59
+
60
+
61
def is_numeric(value):
    """Return True if `float(value)` succeeds, and False if it raises
    ValueError."""
    try:
        float(value)
    except ValueError:
        parses = False
    else:
        parses = True

    return parses
68
+
69
+
70
def is_text(value):
    """Return True when `is_numeric` rejects `value`."""
    if is_numeric(value):
        return False

    return True
72
+
73
+
74
def is_unique(value, values):
    """Return True if `value` occurs at most once in the sequence
    `values`, and False if it occurs two or more times."""
    occurrences = values.count(value)
    return occurrences < 2
81
+
82
+
83
def share_type_among_nonmissing(type_func, values):
    """Return the share of nonmissing `values` for which `type_func`
    holds, or None if there is no nonmissing value at all."""
    n_nonmissing = 0
    n_matching = 0

    for value in values:
        if not is_nonmissing(value):
            continue

        n_nonmissing += 1
        n_matching += type_func(value)

    if n_nonmissing == 0:
        return None

    return n_matching / n_nonmissing
93
+
94
+
95
def share_numeric_among_nonmissing(values):
    """Return the share of nonmissing `values` that `is_numeric`
    accepts, as computed by `share_type_among_nonmissing`."""
    return share_type_among_nonmissing(
        type_func=is_numeric,
        values=values)
98
+
99
+
100
def share_text_among_nonmissing(values):
    """Return the share of nonmissing `values` that `is_text` accepts,
    as computed by `share_type_among_nonmissing`."""
    return share_type_among_nonmissing(
        type_func=is_text,
        values=values)
103
+
104
+
105
def share_unique_among_nonmissing(values):
    """Return the share of nonmissing `values` that occur exactly once
    in `values`, or None if there is no nonmissing value.

    The occurrences are counted in a single pass up front; scanning
    `values` again for every value made this quadratic in the number
    of records.
    """
    from collections import Counter

    counts = Counter(values)

    return share_type_among_nonmissing(
        lambda x: counts[x] == 1,
        values)
109
+
110
+
111
def mostly_samelength(values):
    """Placeholder that is not implemented yet; always returns False."""
    # TODO
    return False
113
+
114
+
115
def mostly_numeric(values):
    """Return True if at least 90% of the nonmissing `values` are
    numeric, or if there is no nonmissing value."""
    numeric_share = share_numeric_among_nonmissing(values)

    if numeric_share is None:
        return True

    return numeric_share >= .9
119
+
120
+
121
def mostly_text(values):
    """Return True if at least 90% of the nonmissing `values` are
    text, or if there is no nonmissing value."""
    text_share = share_text_among_nonmissing(values)

    if text_share is None:
        return True

    return text_share >= .9
125
+
126
+
127
def mostly_unique(values):
    """Return True if at least 90% of the nonmissing `values` are
    unique, or if there is no nonmissing value."""
    unique_share = share_unique_among_nonmissing(values)

    if unique_share is None:
        return True

    return unique_share >= .9
131
+
132
+
133
def check_column_type_consistency(values):
    """Checks that if most values in a column are a certain
    type, then all values in that column are that type.

    Returns a `CheckResult`.  On failure, `problems` holds the indices
    into `values` of the offending nonmissing values; on success,
    `message` and `problems` are None.

    A column that is empty or has only missing values passes.  The
    verdict is derived from the offending indices themselves: the
    shares are None for such a column, and comparing None with a
    number raises TypeError.
    """

    success = True
    message = None
    problems = None

    if mostly_samelength(values):
        # E.g., phone numbers that each have 11 digits.
        #
        pass # TODO
    elif mostly_numeric(values):
        offenders = tuple(
            i
            for i, value in enumerate(values)
            if is_nonmissing(value) and is_text(value))

        if offenders:
            success = False
            message = "Text value in mostly numeric column."
            problems = offenders
    elif mostly_text(values):
        offenders = tuple(
            i
            for i, value in enumerate(values)
            if is_nonmissing(value) and is_numeric(value))

        if offenders:
            success = False
            message = "Numeric value in mostly text column."
            problems = offenders

    return CheckResult(
        success=success,
        message=message,
        problems=problems)
166
+
167
+
168
def check_column_no_odd_value_repetitions(values):
    """Checks that if most values in a column are unique, then
    all values in that column are unique.

    Returns a `CheckResult`.  On failure, `problems` holds the indices
    into `values` of every nonmissing value that occurs more than
    once; on success, `message` and `problems` are None.

    A column that is empty or has only missing values passes.  The
    verdict is derived from the repeated indices themselves: the share
    is None for such a column, and comparing None with a number raises
    TypeError.
    """
    from collections import Counter

    success = True
    message = None
    problems = None

    if mostly_unique(values):
        # Count the occurrences once instead of rescanning `values`
        # for every value.
        counts = Counter(values)

        repeated = tuple(
            i
            for i, value in enumerate(values)
            if is_nonmissing(value)
            and counts[value] > 1)

        if repeated:
            success = False
            message = "Repeated values in column with mostly unique values."
            problems = repeated

    return CheckResult(
        success=success,
        message=message,
        problems=problems)
190
+
191
+
192
def main():
    # Reads csv from stdin, runs every column check on every column,
    # prints a warning plus a Markdown table of the offending records
    # for each failed check, and exits with status 1 if any check
    # failed.
    rows = csv.reader(sys.stdin)

    header = next(rows)
    records = tuple(rows)

    column_checks = (
        check_column_type_consistency,
        check_column_no_odd_value_repetitions)

    # TODO
    # row_checks = (
    #     check_row_no_odd_value_repetitions,)

    n_failures = 0

    for check in column_checks:
        for position, column in enumerate(header):
            outcome = check(column_values(records, position))

            if outcome.success:
                continue

            n_failures += 1

            print("warning: %s: %s"
                  % (column, outcome.message))

            offending = tuple(
                records[index]
                for index in outcome.problems)
            print_problems_markdown(header, offending)

    if n_failures > 0:
        sys.exit(1)
225
+
226
+
227
def dispatch():
    """Console-script entry point: let argh parse the command line and
    invoke `main`."""
    argh.dispatch_command(main)


if __name__ == "__main__":
    dispatch()
@@ -0,0 +1,60 @@
1
+ import argh
2
+ import csv
3
+ import sys
4
+ import itertools as it
5
+
6
+
7
def make_key(key_columns, columns):
    """Build a key function for sorting or grouping records.

    `key_columns` names the columns to use and `columns` is the header
    row.  The returned function maps a record to the tuple of its
    values in `key_columns`, in that order.  With no key columns it
    returns the constant 1, so that all records compare equal.

    The column positions are resolved once, here, rather than on every
    call of the key function.  An unknown column name therefore raises
    ValueError immediately instead of when the first record is
    processed.
    """
    if len(key_columns) == 0:
        return lambda x: 1

    indices = tuple(columns.index(c) for c in key_columns)

    def key(item):
        return tuple(item[i] for i in indices)

    return key
15
+
16
+
17
@argh.arg("-k", "--key", type=str, nargs="+", required=True)
@argh.arg("-f", "--keep-first", type=str, nargs="*")
@argh.arg("-l", "--keep-last", type=str, nargs="*")
def main(key=None, keep_first=None, keep_last=None):
    # Writes one record per distinct --key value.  Within each group
    # of records that share a key, the records are sorted by the
    # columns given to --keep-first or --keep-last, and the first or
    # the last one is kept, respectively.
    want_first = keep_first is not None
    want_last = keep_last is not None

    if not (want_first or want_last):
        raise argh.CommandError(
            "Must specify either --keep-first or --keep-last.")

    if want_first and want_last:
        raise argh.CommandError(
            "Must specify either --keep-first or --keep-last "
            "but not both.")

    rows = csv.reader(sys.stdin)
    columns = next(rows)

    primary_key = make_key(key, columns)
    secondary_key = make_key(
        keep_first if want_first else keep_last,
        columns)

    # Index of the survivor within each sorted group.
    survivor = 0 if want_first else -1

    out = csv.writer(sys.stdout)
    out.writerow(columns)

    # groupby() only merges adjacent records, hence the sort by the
    # same key first.
    grouped = it.groupby(sorted(rows, key=primary_key), primary_key)

    for _, members in grouped:
        ordered = sorted(members, key=secondary_key)
        out.writerow(ordered[survivor])
53
+
54
+
55
def dispatch():
    """Console-script entry point: let argh parse the command line and
    invoke `main`."""
    argh.dispatch_command(main)


if __name__ == "__main__":
    dispatch()