csvpath 0.0.21__tar.gz → 0.0.23__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csvpath-0.0.23/PKG-INFO +298 -0
- csvpath-0.0.23/README.md +281 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/csvpath.py +67 -31
- csvpath-0.0.23/csvpath/csvpaths.py +88 -0
- csvpath-0.0.23/csvpath/matching/functions/any.md +33 -0
- csvpath-0.0.23/csvpath/matching/functions/any.py +105 -0
- csvpath-0.0.23/csvpath/matching/functions/average.md +21 -0
- csvpath-0.0.23/csvpath/matching/functions/column.py +23 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/concat.py +6 -7
- csvpath-0.0.23/csvpath/matching/functions/count_lines.py +10 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/function.py +1 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/function_factory.py +28 -2
- csvpath-0.0.23/csvpath/matching/functions/header.md +13 -0
- csvpath-0.0.23/csvpath/matching/functions/header.py +10 -0
- csvpath-0.0.23/csvpath/matching/functions/increment.md +57 -0
- csvpath-0.0.23/csvpath/matching/functions/increment.py +66 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/length.py +0 -5
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/minf.py +17 -12
- csvpath-0.0.23/csvpath/matching/functions/nonef.py +10 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/notf.py +0 -2
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/now.py +0 -6
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/print.md +1 -1
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/printf.py +11 -12
- csvpath-0.0.23/csvpath/matching/functions/regex.py +35 -0
- csvpath-0.0.23/csvpath/matching/functions/stop.md +24 -0
- csvpath-0.0.23/csvpath/matching/functions/stop.py +23 -0
- csvpath-0.0.23/csvpath/matching/functions/substring.py +31 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/subtract.py +0 -1
- csvpath-0.0.23/csvpath/matching/functions/variable.md +13 -0
- csvpath-0.0.23/csvpath/matching/functions/variable.py +10 -0
- csvpath-0.0.23/csvpath/matching/functions/when.md +16 -0
- csvpath-0.0.23/csvpath/matching/functions/when.py +25 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/matcher.py +21 -12
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/matching_lexer.py +4 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/parser.out +334 -236
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/parsetab.py +377 -221
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/productions/equality.py +2 -3
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/productions/expression.py +1 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/productions/header.py +2 -1
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/productions/matchable.py +37 -1
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/productions/term.py +4 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/productions/variable.py +1 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/pyproject.toml +1 -1
- csvpath-0.0.21/PKG-INFO +0 -199
- csvpath-0.0.21/README.md +0 -182
- csvpath-0.0.21/csvpath/matching/functions/count_lines.py +0 -12
- csvpath-0.0.21/csvpath/matching/functions/regex.py +0 -38
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/__init__.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/__init__.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/expression_encoder.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/expression_math.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/expression_utility.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/above.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/add.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/below.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/count.md +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/count.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/count_scans.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/divide.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/end.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/every.md +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/every.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/first.md +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/first.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/in.md +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/inf.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/is_instance.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/lower.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/multiply.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/no.md +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/no.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/now.md +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/orf.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/percent.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/random.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/tally.md +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/tally.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/upper.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/yes.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/parser_utility.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/scanning/__init__.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/scanning/parser.out +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/scanning/parsetab.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/scanning/scanner.py +0 -0
- {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/scanning/scanning_lexer.py +0 -0
csvpath-0.0.23/PKG-INFO
ADDED
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: csvpath
|
|
3
|
+
Version: 0.0.23
|
|
4
|
+
Summary:
|
|
5
|
+
Author: David Kershaw
|
|
6
|
+
Author-email: dk107dk@hotmail.com
|
|
7
|
+
Requires-Python: >=3.12,<4.0
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
10
|
+
Requires-Dist: pandas (>=2.2.2,<3.0.0)
|
|
11
|
+
Requires-Dist: ply (>=3.11,<4.0)
|
|
12
|
+
Requires-Dist: polars (>=1.1.0,<2.0.0)
|
|
13
|
+
Requires-Dist: pytest (>=8.2.2,<9.0.0)
|
|
14
|
+
Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# CsvPath
|
|
19
|
+
|
|
20
|
+
CsvPath defines a declarative syntax for inspecting and updating CSV files. Though much simpler, it is similar to:
|
|
21
|
+
- XPath: CsvPath is to a CSV file like XPath is to an XML file
|
|
22
|
+
- Schematron: Schematron validation is basically XPath rules applied using XSLT. CsvPath paths can be used as validation rules.
|
|
23
|
+
- CSS selectors: CsvPath picks out structured data in a conceptually similar way to how CSS selectors pick out HTML structures.
|
|
24
|
+
|
|
25
|
+
CsvPath is intended to fit with other DataOps and data quality tools. Files are streamed. The interface is simple. Custom functions can be added.
|
|
26
|
+
|
|
27
|
+
# Usage
|
|
28
|
+
CsvPath paths have three parts:
|
|
29
|
+
- a "root" file name
|
|
30
|
+
- a scanning part
|
|
31
|
+
- a matching part
|
|
32
|
+
|
|
33
|
+
The root starts with `$`. The match and scan parts are enclosed by brackets.
|
|
34
|
+
|
|
35
|
+
A very simple csvpath might look like this:
|
|
36
|
+
|
|
37
|
+
$filename[*][yes()]
|
|
38
|
+
|
|
39
|
+
This path says open the file named `filename`, scan all the lines, and match every line scanned.
|
|
40
|
+
|
|
41
|
+
The filename following the `$` can be an actual relative or absolute file path. It could alternatively be a logical identifier that points indirectly to a physical file, as described below.
|
|
42
|
+
|
|
43
|
+
This is a very basic use. For more usage, see the unit tests.
|
|
44
|
+
|
|
45
|
+
path = CsvPath()
|
|
46
|
+
path.parse("""$test.csv
|
|
47
|
+
[5-25]
|
|
48
|
+
[
|
|
49
|
+
#0=="Frog"
|
|
50
|
+
@lastname.onmatch="Bats"
|
|
51
|
+
count()==2
|
|
52
|
+
]
|
|
53
|
+
""")
|
|
54
|
+
for i, line in enumerate( path.next() ):
|
|
55
|
+
print(f"{i}: {line}")
|
|
56
|
+
print(f"path vars: {path.variables}")
|
|
57
|
+
|
|
58
|
+
The csvpath says:
|
|
59
|
+
- Open test.csv
|
|
60
|
+
- Scan lines 5 through 25
|
|
61
|
+
- Match the second time we see a line where the first column equals "Frog" and set the variable called "lastname" to "Bats"
|
|
62
|
+
|
|
63
|
+
Another path that does the same thing might look like:
|
|
64
|
+
|
|
65
|
+
path = CsvPath()
|
|
66
|
+
path.parse("""$test.csv
|
|
67
|
+
[5-25]
|
|
68
|
+
[
|
|
69
|
+
#0=="Frog"
|
|
70
|
+
@lastname.onmatch="Bats"
|
|
71
|
+
count()==2
|
|
72
|
+
print( count()==2, "$.match_count: $.line")
|
|
73
|
+
]
|
|
74
|
+
""")
|
|
75
|
+
path.fast_forward()
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
You can use the `CsvPaths` class to set up a list of named file paths so that you can have more concise csvpaths. Named paths can take the form of:
|
|
79
|
+
- A JSON file with a dictionary of file paths under name keys
|
|
80
|
+
- A dict object passed into the CsvPaths object containing the same named path structure
|
|
81
|
+
- The path to a csv file that will be put into the named paths dict under its name minus extension
|
|
82
|
+
- A file system path pointing to a directory that will be used to populate the named paths dict with all contained files
|
|
83
|
+
|
|
84
|
+
You can then use a csvpath like `$logical_name[*][yes()]` to apply the csvpath to the file named `logical_name` in the CsvPaths object's named paths dict. This use is nearly transparent:
|
|
85
|
+
|
|
86
|
+
paths = CsvPaths(filename = "my_named_paths.json")
|
|
87
|
+
path = paths.csvpath()
|
|
88
|
+
path.parse( """$test[*][#firstname=="Fred"]""" )
|
|
89
|
+
path.collect()
|
|
90
|
+
|
|
91
|
+
If my_named_paths.json contains the following structure, the name `test` will be used to find `tests/test_resources/test.csv`. The parse method will apply the csvpath and the collect method will gather all the matched rows.
|
|
92
|
+
|
|
93
|
+
{ "test":"tests/test_resources/test.csv" }
|
|
94
|
+
|
|
95
|
+
# Scanning
|
|
96
|
+
The scanner enumerates lines. For each line returned, the line number, the scanned line count, and the match count are available. The set of line numbers scanned is also available.
|
|
97
|
+
|
|
98
|
+
The scan part of the path starts with a dollar sign to indicate the root, meaning the file from the top. After the dollar sign comes the file path. The scanning instructions are in a bracket. The rules are:
|
|
99
|
+
- `[*]` means all
|
|
100
|
+
- `[3*]` means starting from line 3 and going to the end of the file
|
|
101
|
+
- `[3]` by itself means just line 3
|
|
102
|
+
- `[1-3]` means lines 1 through 3
|
|
103
|
+
- `[1+3]` means line 1 and line 3
|
|
104
|
+
- `[1+3-8]` means line 1 and lines 3 through 8
|
|
105
|
+
|
|
106
|
+
# Matching
|
|
107
|
+
The match part is also bracketed. Matches have space separated components or "values" that are ANDed together. The components' order is important. A match component is one of several types:
|
|
108
|
+
<table>
|
|
109
|
+
<tr>
|
|
110
|
+
<td>Type</td>
|
|
111
|
+
<td>Returns</td>
|
|
112
|
+
<td>Matches</td>
|
|
113
|
+
<td>Description</td>
|
|
114
|
+
<td>Examples</td>
|
|
115
|
+
</tr>
|
|
116
|
+
<tr>
|
|
117
|
+
<td>Term </td><td> Value </td><td> True when used alone, otherwise calculated </td>
|
|
118
|
+
<td>A quoted string or date, optionally quoted number, or
|
|
119
|
+
regex. Regex features are limited. A regex is wrapped in "/" characters and
|
|
120
|
+
only has regex functionality when used in the regex() function.</td>
|
|
121
|
+
<td>
|
|
122
|
+
<li/> `"Massachusetts"`
|
|
123
|
+
<li/> `89.7`
|
|
124
|
+
<li/> `/[0-9a-zA-Z]+!/`
|
|
125
|
+
</td>
|
|
126
|
+
</tr>
|
|
127
|
+
<tr>
|
|
128
|
+
<td>Function </td><td> Calculated </td><td> Calculated </td>
|
|
129
|
+
<td>A function name followed by parentheses. Functions can
|
|
130
|
+
contain terms, variables, headers and other functions. Some functions
|
|
131
|
+
take a specific or unlimited number of types as arguments.
|
|
132
|
+
Certain functions can take qualifiers. An `onmatch` qualifier indicates that
|
|
133
|
+
the function should be applied only when the whole path matches.
|
|
134
|
+
Some functions optionally take an arbitrary name qualifier to better name a tracking variable.
|
|
135
|
+
Qualifiers are described below. </td>
|
|
136
|
+
<td>
|
|
137
|
+
<li/> `not(count()==2)`
|
|
138
|
+
<li/> `add( 5, 3, 1 )`
|
|
139
|
+
<li/> `concat( end(), regex(#0, /[0-5]+abc/))`
|
|
140
|
+
</td>
|
|
141
|
+
</tr>
|
|
142
|
+
<tr>
|
|
143
|
+
<td>Variable </td>
|
|
144
|
+
<td>Value</td>
|
|
145
|
+
<td>True/False when value tested. True when set, True/False existence when used alone</td>
|
|
146
|
+
<td>An @ followed by a name. A variable is
|
|
147
|
+
set or tested depending on the usage. By itself, it is an existence test. When used as
|
|
148
|
+
the left hand side of an "=" its value is set.
|
|
149
|
+
When it is used on either side of an "==" it is an equality test.
|
|
150
|
+
Variables can take an `onmatch` qualifier to indicate that the variable should
|
|
151
|
+
only be set when the row matches all parts of the path.</td>
|
|
152
|
+
<td>
|
|
153
|
+
<li/> `@weather="cloudy"`
|
|
154
|
+
<li/> `count(@weather=="sunny")`
|
|
155
|
+
<li/> `@weather`
|
|
156
|
+
<li/> `#summer==@weather`
|
|
157
|
+
|
|
158
|
+
The first example is an assignment that sets the variable and returns True. The second is an argument used as a test in a way that is specific to the function. The third is an existence test. The fourth is an equality test.
|
|
159
|
+
</td>
|
|
160
|
+
</tr>
|
|
161
|
+
<tr>
|
|
162
|
+
<td>Header </td>
|
|
163
|
+
<td>Value </td>
|
|
164
|
+
<td>A True/False existence test when used alone, otherwise calculated</td>
|
|
165
|
+
<td>A # followed by a name or integer. The name references a value in line 0, the header
|
|
166
|
+
row. A number references a column by the 0-based column order. </td>
|
|
167
|
+
<td>
|
|
168
|
+
<li/> `#firstname`
|
|
169
|
+
<li/> `#3`
|
|
170
|
+
</td>
|
|
171
|
+
</tr>
|
|
172
|
+
<tr>
|
|
173
|
+
<td>Equality</td>
|
|
174
|
+
<td>Calculated </td>
|
|
175
|
+
<td>True at assignment, otherwise calculated </td>
|
|
176
|
+
<td>Two of the other types joined with an "=" or "==".</td>
|
|
177
|
+
<td>
|
|
178
|
+
<li/> `@type_of_tree="Oak"`
|
|
179
|
+
<li/> `#name == @type_of_tree`
|
|
180
|
+
</td>
|
|
181
|
+
</tr>
|
|
182
|
+
</table>
|
|
183
|
+
|
|
184
|
+
A variable can be assigned early in the match part of a path and used later in that same path. The assignment and use will both be in the context of the same row in the file. For example:
|
|
185
|
+
|
|
186
|
+
[@a=#b #c==@a]
|
|
187
|
+
|
|
188
|
+
Can also be written as:
|
|
189
|
+
|
|
190
|
+
[#c==#b]
|
|
191
|
+
|
|
192
|
+
Variables and some functions can take qualifiers on their name. A qualifier takes the form of a dot plus a qualification name. At the moment there are only two qualifiers:
|
|
193
|
+
|
|
194
|
+
- `onmatch` to indicate that action on the variable or function only happens when the whole path matches a row
|
|
195
|
+
- An arbitrary string to add a name for the function's internal use, typically to name a variable
|
|
196
|
+
|
|
197
|
+
Qualifiers look like:
|
|
198
|
+
|
|
199
|
+
[ @myvar.onmatch = yes() ]
|
|
200
|
+
|
|
201
|
+
Or:
|
|
202
|
+
|
|
203
|
+
[ @i = increment.this_is_my_increment.onmatch(yes(), 3) ]
|
|
204
|
+
|
|
205
|
+
When multiple qualifiers are used order is not important.
|
|
206
|
+
|
|
207
|
+
## Example
|
|
208
|
+
[ #common_name #0=="field" @tail.onmatch=end() not(in(@tail, 'short|medium')) ]
|
|
209
|
+
|
|
210
|
+
In the path above, the rules applied are:
|
|
211
|
+
- `#common_name` indicates a header named "common_name". Headers are the values in the 0th line. This component of the match is an existence test.
|
|
212
|
+
- `#0` means the 1st column, counting from 0
|
|
213
|
+
- Functions and column references are ANDed together
|
|
214
|
+
- `@tail` creates a variable named "tail" and sets it to the value of the last column if all else matches
|
|
215
|
+
- Functions can contain functions, equality tests, and/or literals
|
|
216
|
+
|
|
217
|
+
Variables are always set unless they are flagged with `.onmatch`. That means:
|
|
218
|
+
|
|
219
|
+
$file.csv[*][ @imcounting.onmatch = count_lines() no()]
|
|
220
|
+
|
|
221
|
+
will never set `imcounting`, because of the `no()` function disallowing any matches, but:
|
|
222
|
+
|
|
223
|
+
$file.csv[*][ @imcounting = count_lines() no()]
|
|
224
|
+
|
|
225
|
+
will always set it.
|
|
226
|
+
|
|
227
|
+
Most of the work of matching is done in functions. The match functions are the following.
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
<table>
|
|
231
|
+
<tr><th> Group </th><th>Function </th><th> What it does </th></tr>
|
|
232
|
+
<tr><td> Boolean </td><td> </td><td> </td></tr>
|
|
233
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/any.md'>any(value, value)</a> </td><td> existence test across a range of places </td></tr>
|
|
234
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/no.md'>no()</a> </td><td> always false </td></tr>
|
|
235
|
+
<tr><td> </td><td> not(value) </td><td> negates a value </td></tr>
|
|
236
|
+
<tr><td> </td><td> or(value, value,...) </td><td> match any one </td></tr>
|
|
237
|
+
<tr><td> </td><td> yes() </td><td> always true </td></tr>
|
|
238
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/in.md'>in(value, list)</a> </td><td> match in a pipe-delimited list </td></tr>
|
|
239
|
+
<tr><td> Math </td><td> </td><td> </td></tr>
|
|
240
|
+
<tr><td> </td><td> add(value, value, ...) </td><td> adds numbers </td></tr>
|
|
241
|
+
<tr><td> </td><td> divide(value, value, ...) </td><td> divides numbers </td></tr>
|
|
242
|
+
<tr><td> </td><td> multiply(value, value, ...) </td><td> multiplies numbers </td></tr>
|
|
243
|
+
<tr><td> </td><td> subtract(value, value, ...) </td><td> subtracts numbers </td></tr>
|
|
244
|
+
<tr><td> </td><td> after(value) </td><td> finds things after a date, number, string </td></tr>
|
|
245
|
+
<tr><td> </td><td> before(value) </td><td> finds things before a date, number, string </td></tr>
|
|
246
|
+
<tr><td> Stats </td><td> </td><td> </td></tr>
|
|
247
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/average.md'>average(number, type)</a> </td><td> returns the average up to current "line", "scan", "match" </td></tr>
|
|
248
|
+
<tr><td> </td><td> median(value, type) </td><td> median value up to current "line", "scan", "match" </td></tr>
|
|
249
|
+
<tr><td> </td><td> max(value, type) </td><td> largest value seen up to current "line", "scan", "match" </td></tr>
|
|
250
|
+
<tr><td> </td><td> min(value, type) </td><td> smallest value seen up to current "line", "scan", "match" </td></tr>
|
|
251
|
+
<tr><td> </td><td> percent(type) </td><td> % of total lines for "scan", "match", "line" </td></tr>
|
|
252
|
+
<tr><td> Counting </td><td> </td><td> </td></tr>
|
|
253
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/count.md'>count()</a> </td><td> counts the number of matches </td></tr>
|
|
254
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/count.md'>count(value)</a> </td><td> count matches of value </td></tr>
|
|
255
|
+
<tr><td> </td><td> count_lines() </td><td> count lines to this point in the file </td></tr>
|
|
256
|
+
<tr><td> </td><td> count_scans() </td><td> count lines we checked for match </td></tr>
|
|
257
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/first.md'>first(value, value, ...)</a> </td><td> match the first occurrence and capture line </td></tr>
|
|
258
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/increment.md'>increment(value, n)</a> </td><td> increments a variable by n each time seen </td></tr>
|
|
259
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/every.md'>every(value, number)</a> </td><td> match every Nth time a value is seen </td></tr>
|
|
260
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/tally.md'>tally(value, value, ...)</a></td><td> counts times values are seen, including as a set </td></tr>
|
|
261
|
+
<tr><td> Strings </td><td> </td><td> </td></tr>
|
|
262
|
+
<tr><td> </td><td> concat(value, value) </td><td> joins two values </td></tr>
|
|
263
|
+
<tr><td> </td><td> length(value) </td><td> returns the length of the value </td></tr>
|
|
264
|
+
<tr><td> </td><td> lower(value) </td><td> makes value lowercase </td></tr>
|
|
265
|
+
<tr><td> </td><td> regex(regex-string, value) </td><td> match on a regular expression </td></tr>
|
|
266
|
+
<tr><td> </td><td> substring(value, int) </td><td> returns the first n chars from the value </td></tr>
|
|
267
|
+
<tr><td> </td><td> upper(value) </td><td> makes value uppercase </td></tr>
|
|
268
|
+
<tr><td> Columns </td><td> </td><td> </td></tr>
|
|
269
|
+
<tr><td> </td><td> end() </td><td> returns the value of the last column </td></tr>
|
|
270
|
+
<tr><td> </td><td> column(value) </td><td> returns column name for an index or index for a name </td></tr>
|
|
271
|
+
<tr><td> Other </td><td> </td><td> </td></tr>
|
|
272
|
+
<tr><td> </td><td> header() </td><td> indicates to another function to look in headers </td></tr>
|
|
273
|
+
<tr><td> </td><td> isinstance(value, typestr) </td><td> tests for "int","float","complex","bool","usd" </td></tr>
|
|
274
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/now.md'>now(format)</a></td><td> a datetime, optionally formatted </td></tr>
|
|
275
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/print.md'>print(value, str)</a></td><td> when matches prints the interpolated string </td></tr>
|
|
276
|
+
<tr><td> </td><td> random(starting, ending) </td><td> generates a random int from starting to ending </td></tr>
|
|
277
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/stop.md'>stop(value)</a> </td><td> stops path scanning if a condition is met </td></tr>
|
|
278
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/when.md'>when(value, value)</a> </td><td> activate a value when a condition matches </td></tr>
|
|
279
|
+
<tr><td> </td><td> variable() </td><td> indicates to another function to look in variables </td></tr>
|
|
280
|
+
</tr>
|
|
281
|
+
</table>
|
|
282
|
+
|
|
283
|
+
# Not Ready For Production
|
|
284
|
+
Anything could change and performance could be better. This project is a hobby.
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
|
csvpath-0.0.23/README.md
ADDED
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
|
|
2
|
+
# CsvPath
|
|
3
|
+
|
|
4
|
+
CsvPath defines a declarative syntax for inspecting and updating CSV files. Though much simpler, it is similar to:
|
|
5
|
+
- XPath: CsvPath is to a CSV file like XPath is to an XML file
|
|
6
|
+
- Schematron: Schematron validation is basically XPath rules applied using XSLT. CsvPath paths can be used as validation rules.
|
|
7
|
+
- CSS selectors: CsvPath picks out structured data in a conceptually similar way to how CSS selectors pick out HTML structures.
|
|
8
|
+
|
|
9
|
+
CsvPath is intended to fit with other DataOps and data quality tools. Files are streamed. The interface is simple. Custom functions can be added.
|
|
10
|
+
|
|
11
|
+
# Usage
|
|
12
|
+
CsvPath paths have three parts:
|
|
13
|
+
- a "root" file name
|
|
14
|
+
- a scanning part
|
|
15
|
+
- a matching part
|
|
16
|
+
|
|
17
|
+
The root starts with `$`. The match and scan parts are enclosed by brackets.
|
|
18
|
+
|
|
19
|
+
A very simple csvpath might look like this:
|
|
20
|
+
|
|
21
|
+
$filename[*][yes()]
|
|
22
|
+
|
|
23
|
+
This path says open the file named `filename`, scan all the lines, and match every line scanned.
|
|
24
|
+
|
|
25
|
+
The filename following the `$` can be an actual relative or absolute file path. It could alternatively be a logical identifier that points indirectly to a physical file, as described below.
|
|
26
|
+
|
|
27
|
+
This is a very basic use. For more usage, see the unit tests.
|
|
28
|
+
|
|
29
|
+
path = CsvPath()
|
|
30
|
+
path.parse("""$test.csv
|
|
31
|
+
[5-25]
|
|
32
|
+
[
|
|
33
|
+
#0=="Frog"
|
|
34
|
+
@lastname.onmatch="Bats"
|
|
35
|
+
count()==2
|
|
36
|
+
]
|
|
37
|
+
""")
|
|
38
|
+
for i, line in enumerate( path.next() ):
|
|
39
|
+
print(f"{i}: {line}")
|
|
40
|
+
print(f"path vars: {path.variables}")
|
|
41
|
+
|
|
42
|
+
The csvpath says:
|
|
43
|
+
- Open test.csv
|
|
44
|
+
- Scan lines 5 through 25
|
|
45
|
+
- Match the second time we see a line where the first column equals "Frog" and set the variable called "lastname" to "Bats"
|
|
46
|
+
|
|
47
|
+
Another path that does the same thing might look like:
|
|
48
|
+
|
|
49
|
+
path = CsvPath()
|
|
50
|
+
path.parse("""$test.csv
|
|
51
|
+
[5-25]
|
|
52
|
+
[
|
|
53
|
+
#0=="Frog"
|
|
54
|
+
@lastname.onmatch="Bats"
|
|
55
|
+
count()==2
|
|
56
|
+
print( count()==2, "$.match_count: $.line")
|
|
57
|
+
]
|
|
58
|
+
""")
|
|
59
|
+
path.fast_forward()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
You can use the `CsvPaths` class to set up a list of named file paths so that you can have more concise csvpaths. Named paths can take the form of:
|
|
63
|
+
- A JSON file with a dictionary of file paths under name keys
|
|
64
|
+
- A dict object passed into the CsvPaths object containing the same named path structure
|
|
65
|
+
- The path to a csv file that will be put into the named paths dict under its name minus extension
|
|
66
|
+
- A file system path pointing to a directory that will be used to populate the named paths dict with all contained files
|
|
67
|
+
|
|
68
|
+
You can then use a csvpath like `$logical_name[*][yes()]` to apply the csvpath to the file named `logical_name` in the CsvPaths object's named paths dict. This use is nearly transparent:
|
|
69
|
+
|
|
70
|
+
paths = CsvPaths(filename = "my_named_paths.json")
|
|
71
|
+
path = paths.csvpath()
|
|
72
|
+
path.parse( """$test[*][#firstname=="Fred"]""" )
|
|
73
|
+
path.collect()
|
|
74
|
+
|
|
75
|
+
If my_named_paths.json contains the following structure, the name `test` will be used to find `tests/test_resources/test.csv`. The parse method will apply the csvpath and the collect method will gather all the matched rows.
|
|
76
|
+
|
|
77
|
+
{ "test":"tests/test_resources/test.csv" }
|
|
78
|
+
|
|
79
|
+
# Scanning
|
|
80
|
+
The scanner enumerates lines. For each line returned, the line number, the scanned line count, and the match count are available. The set of line numbers scanned is also available.
|
|
81
|
+
|
|
82
|
+
The scan part of the path starts with a dollar sign to indicate the root, meaning the file from the top. After the dollar sign comes the file path. The scanning instructions are in a bracket. The rules are:
|
|
83
|
+
- `[*]` means all
|
|
84
|
+
- `[3*]` means starting from line 3 and going to the end of the file
|
|
85
|
+
- `[3]` by itself means just line 3
|
|
86
|
+
- `[1-3]` means lines 1 through 3
|
|
87
|
+
- `[1+3]` means line 1 and line 3
|
|
88
|
+
- `[1+3-8]` means line 1 and lines 3 through 8
|
|
89
|
+
|
|
90
|
+
# Matching
|
|
91
|
+
The match part is also bracketed. Matches have space separated components or "values" that are ANDed together. The components' order is important. A match component is one of several types:
|
|
92
|
+
<table>
|
|
93
|
+
<tr>
|
|
94
|
+
<td>Type</td>
|
|
95
|
+
<td>Returns</td>
|
|
96
|
+
<td>Matches</td>
|
|
97
|
+
<td>Description</td>
|
|
98
|
+
<td>Examples</td>
|
|
99
|
+
</tr>
|
|
100
|
+
<tr>
|
|
101
|
+
<td>Term </td><td> Value </td><td> True when used alone, otherwise calculated </td>
|
|
102
|
+
<td>A quoted string or date, optionally quoted number, or
|
|
103
|
+
regex. Regex features are limited. A regex is wrapped in "/" characters and
|
|
104
|
+
only has regex functionality when used in the regex() function.</td>
|
|
105
|
+
<td>
|
|
106
|
+
<li/> `"Massachusetts"`
|
|
107
|
+
<li/> `89.7`
|
|
108
|
+
<li/> `/[0-9a-zA-Z]+!/`
|
|
109
|
+
</td>
|
|
110
|
+
</tr>
|
|
111
|
+
<tr>
|
|
112
|
+
<td>Function </td><td> Calculated </td><td> Calculated </td>
|
|
113
|
+
<td>A function name followed by parentheses. Functions can
|
|
114
|
+
contain terms, variables, headers and other functions. Some functions
|
|
115
|
+
take a specific or unlimited number of types as arguments.
|
|
116
|
+
Certain functions can take qualifiers. An `onmatch` qualifier indicates that
|
|
117
|
+
the function should be applied only when the whole path matches.
|
|
118
|
+
Some functions optionally take an arbitrary name qualifier to better name a tracking variable.
|
|
119
|
+
Qualifiers are described below. </td>
|
|
120
|
+
<td>
|
|
121
|
+
<li/> `not(count()==2)`
|
|
122
|
+
<li/> `add( 5, 3, 1 )`
|
|
123
|
+
<li/> `concat( end(), regex(#0, /[0-5]+abc/))`
|
|
124
|
+
</td>
|
|
125
|
+
</tr>
|
|
126
|
+
<tr>
|
|
127
|
+
<td>Variable </td>
|
|
128
|
+
<td>Value</td>
|
|
129
|
+
<td>True/False when value tested. True when set, True/False existence when used alone</td>
|
|
130
|
+
<td>An @ followed by a name. A variable is
|
|
131
|
+
set or tested depending on the usage. By itself, it is an existence test. When used as
|
|
132
|
+
the left hand side of an "=" its value is set.
|
|
133
|
+
When it is used on either side of an "==" it is an equality test.
|
|
134
|
+
Variables can take an `onmatch` qualifier to indicate that the variable should
|
|
135
|
+
only be set when the row matches all parts of the path.</td>
|
|
136
|
+
<td>
|
|
137
|
+
<li/> `@weather="cloudy"`
|
|
138
|
+
<li/> `count(@weather=="sunny")`
|
|
139
|
+
<li/> `@weather`
|
|
140
|
+
<li/> `#summer==@weather`
|
|
141
|
+
|
|
142
|
+
The first example is an assignment that sets the variable and returns True. The second is an argument used as a test in a way that is specific to the function. The third is an existence test. The fourth is an equality test.
|
|
143
|
+
</td>
|
|
144
|
+
</tr>
|
|
145
|
+
<tr>
|
|
146
|
+
<td>Header </td>
|
|
147
|
+
<td>Value </td>
|
|
148
|
+
<td>A True/False existence test when used alone, otherwise calculated</td>
|
|
149
|
+
<td>A # followed by a name or integer. The name references a value in line 0, the header
|
|
150
|
+
row. A number references a column by the 0-based column order. </td>
|
|
151
|
+
<td>
|
|
152
|
+
<li/> `#firstname`
|
|
153
|
+
<li/> `#3`
|
|
154
|
+
</td>
|
|
155
|
+
</tr>
|
|
156
|
+
<tr>
|
|
157
|
+
<td>Equality</td>
|
|
158
|
+
<td>Calculated </td>
|
|
159
|
+
<td>True at assignment, otherwise calculated </td>
|
|
160
|
+
<td>Two of the other types joined with an "=" or "==".</td>
|
|
161
|
+
<td>
|
|
162
|
+
<li/> `@type_of_tree="Oak"`
|
|
163
|
+
<li/> `#name == @type_of_tree`
|
|
164
|
+
</td>
|
|
165
|
+
</tr>
|
|
166
|
+
</table>
|
|
167
|
+
|
|
168
|
+
A variable can be assigned early in the match part of a path and used later in that same path. The assignment and use will both be in the context of the same row in the file. For example:
|
|
169
|
+
|
|
170
|
+
[@a=#b #c==@a]
|
|
171
|
+
|
|
172
|
+
Can also be written as:
|
|
173
|
+
|
|
174
|
+
[#c==#b]
|
|
175
|
+
|
|
176
|
+
Variables and some functions can take qualifiers on their name. A qualifier takes the form of a dot plus a qualification name. At the moment there are only two qualifiers:
|
|
177
|
+
|
|
178
|
+
- `onmatch` to indicate that action on the variable or function only happens when the whole path matches a row
|
|
179
|
+
- An arbitrary string to add a name for the function's internal use, typically to name a variable
|
|
180
|
+
|
|
181
|
+
Qualifiers look like:
|
|
182
|
+
|
|
183
|
+
[ @myvar.onmatch = yes() ]
|
|
184
|
+
|
|
185
|
+
Or:
|
|
186
|
+
|
|
187
|
+
[ @i = increment.this_is_my_increment.onmatch(yes(), 3) ]
|
|
188
|
+
|
|
189
|
+
When multiple qualifiers are used, order is not important.
|
|
190
|
+
|
|
191
|
+
## Example
|
|
192
|
+
[ #common_name #0=="field" @tail.onmatch=end() not(in(@tail, 'short|medium')) ]
|
|
193
|
+
|
|
194
|
+
In the path above, the rules applied are:
|
|
195
|
+
- `#common_name` indicates a header named "common_name". Headers are the values in the 0th line. This component of the match is an existence test.
|
|
196
|
+
- `#0` means the 1st column, counting from 0
|
|
197
|
+
- Functions and column references are ANDed together
|
|
198
|
+
- `@tail` creates a variable named "tail" and sets it to the value of the last column if all else matches
|
|
199
|
+
- Functions can contain functions, equality tests, and/or literals
|
|
200
|
+
|
|
201
|
+
Variables are always set unless they are flagged with `.onmatch`. That means:
|
|
202
|
+
|
|
203
|
+
$file.csv[*][ @imcounting.onmatch = count_lines() no()]
|
|
204
|
+
|
|
205
|
+
will never set `imcounting`, because of the `no()` function disallowing any matches, but:
|
|
206
|
+
|
|
207
|
+
$file.csv[*][ @imcounting = count_lines() no()]
|
|
208
|
+
|
|
209
|
+
will always set it.
|
|
210
|
+
|
|
211
|
+
Most of the work of matching is done in functions. The match functions are the following.
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
<table>
|
|
215
|
+
<tr><th> Group </th><th>Function </th><th> What it does </th></tr>
|
|
216
|
+
<tr><td> Boolean </td><td> </td><td> </td></tr>
|
|
217
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/any.md'>any(value, value)</a> </td><td> existence test across a range of places </td></tr>
|
|
218
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/no.md'>no()</a> </td><td> always false </td></tr>
|
|
219
|
+
<tr><td> </td><td> not(value) </td><td> negates a value </td></tr>
|
|
220
|
+
<tr><td> </td><td> or(value, value,...) </td><td> match any one </td></tr>
|
|
221
|
+
<tr><td> </td><td> yes() </td><td> always true </td></tr>
|
|
222
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/in.md'>in(value, list)</a> </td><td> match in a pipe-delimited list </td></tr>
|
|
223
|
+
<tr><td> Math </td><td> </td><td> </td></tr>
|
|
224
|
+
<tr><td> </td><td> add(value, value, ...) </td><td> adds numbers </td></tr>
|
|
225
|
+
<tr><td> </td><td> divide(value, value, ...) </td><td> divides numbers </td></tr>
|
|
226
|
+
<tr><td> </td><td> multiply(value, value, ...) </td><td> multiplies numbers </td></tr>
|
|
227
|
+
<tr><td> </td><td> subtract(value, value, ...) </td><td> subtracts numbers </td></tr>
|
|
228
|
+
<tr><td> </td><td> after(value) </td><td> finds things after a date, number, string </td></tr>
|
|
229
|
+
<tr><td> </td><td> before(value) </td><td> finds things before a date, number, string </td></tr>
|
|
230
|
+
<tr><td> Stats </td><td> </td><td> </td></tr>
|
|
231
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/average.md'>average(number, type)</a> </td><td> returns the average up to current "line", "scan", "match" </td></tr>
|
|
232
|
+
<tr><td> </td><td> median(value, type) </td><td> median value up to current "line", "scan", "match" </td></tr>
|
|
233
|
+
<tr><td> </td><td> max(value, type) </td><td> largest value seen up to current "line", "scan", "match" </td></tr>
|
|
234
|
+
<tr><td> </td><td> min(value, type) </td><td> smallest value seen up to current "line", "scan", "match" </td></tr>
|
|
235
|
+
<tr><td> </td><td> percent(type) </td><td> % of total lines for "scan", "match", "line" </td></tr>
|
|
236
|
+
<tr><td> Counting </td><td> </td><td> </td></tr>
|
|
237
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/count.md'>count()</a> </td><td> counts the number of matches </td></tr>
|
|
238
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/count.md'>count(value)</a> </td><td> count matches of value </td></tr>
|
|
239
|
+
<tr><td> </td><td> count_lines() </td><td> count lines to this point in the file </td></tr>
|
|
240
|
+
<tr><td> </td><td> count_scans() </td><td> count lines we checked for match </td></tr>
|
|
241
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/first.md'>first(value, value, ...)</a> </td><td> match the first occurrence and capture line </td></tr>
|
|
242
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/increment.md'>increment(value, n)</a> </td><td> increments a variable by n each time seen </td></tr>
|
|
243
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/every.md'>every(value, number)</a> </td><td> match every Nth time a value is seen </td></tr>
|
|
244
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/tally.md'>tally(value, value, ...)</a></td><td> counts times values are seen, including as a set </td></tr>
|
|
245
|
+
<tr><td> Strings </td><td> </td><td> </td></tr>
|
|
246
|
+
<tr><td> </td><td> concat(value, value) </td><td> joins two values </td></tr>
|
|
247
|
+
<tr><td> </td><td> length(value) </td><td> returns the length of the value </td></tr>
|
|
248
|
+
<tr><td> </td><td> lower(value) </td><td> makes value lowercase </td></tr>
|
|
249
|
+
<tr><td> </td><td> regex(regex-string, value) </td><td> match on a regular expression </td></tr>
|
|
250
|
+
<tr><td> </td><td> substring(value, int) </td><td> returns the first n chars from the value </td></tr>
|
|
251
|
+
<tr><td> </td><td> upper(value) </td><td> makes value uppercase </td></tr>
|
|
252
|
+
<tr><td> Columns </td><td> </td><td> </td></tr>
|
|
253
|
+
<tr><td> </td><td> end() </td><td> returns the value of the last column </td></tr>
|
|
254
|
+
<tr><td> </td><td> column(value) </td><td> returns column name for an index or index for a name </td></tr>
|
|
255
|
+
<tr><td> Other </td><td> </td><td> </td></tr>
|
|
256
|
+
<tr><td> </td><td> header() </td><td> indicates to another function to look in headers </td></tr>
|
|
257
|
+
<tr><td> </td><td> isinstance(value, typestr) </td><td> tests for "int","float","complex","bool","usd" </td></tr>
|
|
258
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/now.md'>now(format)</a></td><td> a datetime, optionally formatted </td></tr>
|
|
259
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/print.md'>print(value, str)</a></td><td> when matches prints the interpolated string </td></tr>
|
|
260
|
+
<tr><td> </td><td> random(starting, ending) </td><td> generates a random int from starting to ending </td></tr>
|
|
261
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/stop.md'>stop(value)</a> </td><td> stops path scanning if a condition is met </td></tr>
|
|
262
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/when.md'>when(value, value)</a> </td><td> activate a value when a condition matches </td></tr>
|
|
263
|
+
<tr><td> </td><td> variable() </td><td> indicates to another function to look in variables </td></tr>
|
|
264
|
+
</tr>
|
|
265
|
+
</table>
|
|
266
|
+
|
|
267
|
+
# Not Ready For Production
|
|
268
|
+
Anything could change and performance could be better. This project is a hobby.
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
|