csvpath 0.0.2__tar.gz → 0.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csvpath-0.0.4/PKG-INFO +326 -0
- csvpath-0.0.4/README.md +309 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/csvpath.py +139 -123
- csvpath-0.0.4/csvpath/csvpaths.py +88 -0
- csvpath-0.0.4/csvpath/matching/expression_utility.py +94 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/add.py +3 -1
- csvpath-0.0.4/csvpath/matching/functions/any.md +33 -0
- csvpath-0.0.4/csvpath/matching/functions/any.py +105 -0
- csvpath-0.0.4/csvpath/matching/functions/average.md +21 -0
- csvpath-0.0.4/csvpath/matching/functions/column.py +23 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/concat.py +7 -7
- csvpath-0.0.4/csvpath/matching/functions/count.md +28 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/count.py +12 -2
- csvpath-0.0.4/csvpath/matching/functions/count_lines.py +10 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/divide.py +2 -2
- csvpath-0.0.4/csvpath/matching/functions/equals.py +42 -0
- csvpath-0.0.4/csvpath/matching/functions/every.md +58 -0
- csvpath-0.0.4/csvpath/matching/functions/every.py +48 -0
- csvpath-0.0.4/csvpath/matching/functions/exists.py +28 -0
- csvpath-0.0.4/csvpath/matching/functions/first.md +23 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/first.py +10 -4
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/function.py +5 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/function_factory.py +56 -3
- csvpath-0.0.4/csvpath/matching/functions/header.md +13 -0
- csvpath-0.0.4/csvpath/matching/functions/header.py +10 -0
- csvpath-0.0.4/csvpath/matching/functions/in.md +25 -0
- csvpath-0.0.4/csvpath/matching/functions/increment.md +57 -0
- csvpath-0.0.4/csvpath/matching/functions/increment.py +69 -0
- csvpath-0.0.4/csvpath/matching/functions/last.md +12 -0
- csvpath-0.0.4/csvpath/matching/functions/last.py +14 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/length.py +0 -5
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/minf.py +17 -12
- csvpath-0.0.4/csvpath/matching/functions/mod.py +27 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/multiply.py +1 -1
- csvpath-0.0.4/csvpath/matching/functions/no.md +13 -0
- csvpath-0.0.4/csvpath/matching/functions/nonef.py +10 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/notf.py +0 -2
- csvpath-0.0.4/csvpath/matching/functions/now.md +13 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/now.py +0 -7
- csvpath-0.0.4/csvpath/matching/functions/print.md +37 -0
- csvpath-0.0.4/csvpath/matching/functions/printf.py +143 -0
- csvpath-0.0.4/csvpath/matching/functions/regex.py +35 -0
- csvpath-0.0.4/csvpath/matching/functions/stop.md +24 -0
- csvpath-0.0.4/csvpath/matching/functions/stop.py +23 -0
- csvpath-0.0.4/csvpath/matching/functions/substring.py +32 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/subtract.py +1 -2
- csvpath-0.0.4/csvpath/matching/functions/tally.md +29 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/tally.py +3 -3
- csvpath-0.0.4/csvpath/matching/functions/variable.md +13 -0
- csvpath-0.0.4/csvpath/matching/functions/variable.py +10 -0
- csvpath-0.0.4/csvpath/matching/functions/when._ +16 -0
- csvpath-0.0.4/csvpath/matching/functions/when.__ +29 -0
- csvpath-0.0.4/csvpath/matching/functions/yes.py +10 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/matcher.py +65 -28
- csvpath-0.0.4/csvpath/matching/matching_lexer.py +122 -0
- csvpath-0.0.4/csvpath/matching/parser.out +2096 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/parsetab.py +1485 -551
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/productions/equality.py +39 -56
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/productions/expression.py +5 -0
- csvpath-0.0.4/csvpath/matching/productions/header.py +40 -0
- csvpath-0.0.4/csvpath/matching/productions/matchable.py +121 -0
- csvpath-0.0.4/csvpath/matching/productions/term.py +18 -0
- csvpath-0.0.4/csvpath/matching/productions/variable.py +46 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/parser_utility.py +5 -3
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/scanning/scanner.py +32 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/pyproject.toml +3 -1
- csvpath-0.0.2/PKG-INFO +0 -184
- csvpath-0.0.2/README.md +0 -169
- csvpath-0.0.2/csvpath/matching/expression_math.py +0 -123
- csvpath-0.0.2/csvpath/matching/expression_utility.py +0 -29
- csvpath-0.0.2/csvpath/matching/functions/count_lines.py +0 -12
- csvpath-0.0.2/csvpath/matching/functions/every.py +0 -33
- csvpath-0.0.2/csvpath/matching/functions/regex.py +0 -38
- csvpath-0.0.2/csvpath/matching/matching_lexer.py +0 -66
- csvpath-0.0.2/csvpath/matching/parser.out +0 -1287
- csvpath-0.0.2/csvpath/matching/productions/header.py +0 -30
- csvpath-0.0.2/csvpath/matching/productions/matchable.py +0 -41
- csvpath-0.0.2/csvpath/matching/productions/term.py +0 -11
- csvpath-0.0.2/csvpath/matching/productions/variable.py +0 -15
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/__init__.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/__init__.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/expression_encoder.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/above.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/below.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/count_scans.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/end.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/inf.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/is_instance.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/lower.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/no.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/orf.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/percent.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/random.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/matching/functions/upper.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/scanning/__init__.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/scanning/parser.out +0 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/scanning/parsetab.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.4}/csvpath/scanning/scanning_lexer.py +0 -0
csvpath-0.0.4/PKG-INFO
ADDED
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: csvpath
|
|
3
|
+
Version: 0.0.4
|
|
4
|
+
Summary:
|
|
5
|
+
Author: David Kershaw
|
|
6
|
+
Author-email: dk107dk@hotmail.com
|
|
7
|
+
Requires-Python: >=3.12,<4.0
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
10
|
+
Requires-Dist: pandas (>=2.2.2,<3.0.0)
|
|
11
|
+
Requires-Dist: ply (>=3.11,<4.0)
|
|
12
|
+
Requires-Dist: polars (>=1.1.0,<2.0.0)
|
|
13
|
+
Requires-Dist: pytest (>=8.2.2,<9.0.0)
|
|
14
|
+
Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# CsvPath
|
|
19
|
+
|
|
20
|
+
CsvPath defines a declarative syntax for inspecting and updating CSV files. Though much simpler, it is similar to:
|
|
21
|
+
- XPath: CsvPath is to a CSV file like XPath is to an XML file
|
|
22
|
+
- Schematron: Schematron validation is basically XPath rules applied using XSLT. CsvPath paths can be used as validation rules.
|
|
23
|
+
- CSS selectors: CsvPath picks out structured data in a conceptually similar way to how CSS selectors pick out HTML structures.
|
|
24
|
+
|
|
25
|
+
CsvPath is intended to fit with other DataOps and data quality tools. Files are streamed. The interface is simple. Custom functions can be added.
|
|
26
|
+
|
|
27
|
+
# Usage
|
|
28
|
+
CsvPath paths have three parts:
|
|
29
|
+
- a "root" file name
|
|
30
|
+
- a scanning part
|
|
31
|
+
- a matching part
|
|
32
|
+
|
|
33
|
+
The root starts with `$`. The match and scan parts are enclosed by brackets.
|
|
34
|
+
|
|
35
|
+
A very simple csvpath might look like this:
|
|
36
|
+
|
|
37
|
+
$filename[*][yes()]
|
|
38
|
+
|
|
39
|
+
This path says open the file named `filename`, scan all the lines, and match every line scanned.
|
|
40
|
+
|
|
41
|
+
The filename following the `$` can be an actual relative or absolute file path. It could alternatively be a logical identifier that points indirectly to a physical file, as described below.
|
|
42
|
+
|
|
43
|
+
This is a very basic use. For more usage, see the unit tests.
|
|
44
|
+
|
|
45
|
+
path = CsvPath()
|
|
46
|
+
path.parse("""$test.csv
|
|
47
|
+
[5-25]
|
|
48
|
+
[
|
|
49
|
+
#0=="Frog" @lastname.onmatch="Bats" count()==2
|
|
50
|
+
]
|
|
51
|
+
""")
|
|
52
|
+
for i, line in enumerate( path.next() ):
|
|
53
|
+
print(f"{i}: {line}")
|
|
54
|
+
print(f"path vars: {path.variables}")
|
|
55
|
+
|
|
56
|
+
The csvpath says:
|
|
57
|
+
- Open test.csv
|
|
58
|
+
- Scan lines 5 through 25
|
|
59
|
+
- Match the second time we see a line where the first column equals "Frog" and set the variable called "lastname" to "Bats"
|
|
60
|
+
|
|
61
|
+
Another path that does the same thing might look like:
|
|
62
|
+
|
|
63
|
+
"""$test.csv
|
|
64
|
+
[5-25]
|
|
65
|
+
[
|
|
66
|
+
#0=="Frog"
|
|
67
|
+
@lastname.onmatch="Bats"
|
|
68
|
+
count()==2
|
|
69
|
+
count()==2 -> print( "$.match_count: $.line")
|
|
70
|
+
]
|
|
71
|
+
"""
|
|
72
|
+
|
|
73
|
+
In this case we're using the "when" operator, `->`, to determine when to print.
|
|
74
|
+
|
|
75
|
+
You can use the `CsvPaths` class to set up a list of named file paths so that you can have more concise csvpaths. Named paths can take the form of:
|
|
76
|
+
- A JSON file with a dictionary of file paths under name keys
|
|
77
|
+
- A dict object passed into the CsvPaths object containing the same named path structure
|
|
78
|
+
- The path to a csv file that will be put into the named paths dict under its name minus extension
|
|
79
|
+
- A file system path pointing to a directory that will be used to populate the named paths dict with all contained files
|
|
80
|
+
|
|
81
|
+
You can then use a csvpath like `$logical_name[*][yes()]` to apply the csvpath to the file named `logical_name` in the CsvPaths object's named paths dict. This use is nearly transparent:
|
|
82
|
+
|
|
83
|
+
paths = CsvPaths(filename = "my_named_paths.json")
|
|
84
|
+
path = paths.csvpath()
|
|
85
|
+
path.parse( """$test[*][#firstname=="Fred"]""" )
|
|
86
|
+
path.collect()
|
|
87
|
+
|
|
88
|
+
If my_named_paths.json contains the following structure, the name `test` will be used to find `tests/test_resources/test.csv`. The parse method will apply the csvpath and the collect method will gather all the matched rows.
|
|
89
|
+
|
|
90
|
+
{ "test":"tests/test_resources/test.csv" }
|
|
91
|
+
|
|
92
|
+
# Scanning
|
|
93
|
+
The scanner enumerates lines. For each line returned, the line number, the scanned line count, and the match count are available. The set of line numbers scanned is also available.
|
|
94
|
+
|
|
95
|
+
The scan part of the path starts with a dollar sign to indicate the root, meaning the file from the top. After the dollar sign comes the file path. The scanning instructions are in a bracket. The rules are:
|
|
96
|
+
- `[*]` means all
|
|
97
|
+
- `[3*]` means starting from line 3 and going to the end of the file
|
|
98
|
+
- `[3]` by itself means just line 3
|
|
99
|
+
- `[1-3]` means lines 1 through 3
|
|
100
|
+
- `[1+3]` means line 1 and line 3
|
|
101
|
+
- `[1+3-8]` means line 1 and lines 3 through 8
|
|
102
|
+
|
|
103
|
+
# Matching
|
|
104
|
+
The match part is also bracketed. Matches have space separated components or "values" that are ANDed together. The components' order is important. A match component is one of several types:
|
|
105
|
+
<table>
|
|
106
|
+
<tr>
|
|
107
|
+
<td>Type</td>
|
|
108
|
+
<td>Returns</td>
|
|
109
|
+
<td>Matches</td>
|
|
110
|
+
<td>Description</td>
|
|
111
|
+
<td>Examples</td>
|
|
112
|
+
</tr>
|
|
113
|
+
<tr>
|
|
114
|
+
<td>Term </td><td> Value </td><td> True when used alone, otherwise calculated </td>
|
|
115
|
+
<td>A quoted string or date, optionally quoted number, or
|
|
116
|
+
regex. Regex features are limited. A regex is wrapped in "/" characters and
|
|
117
|
+
only has regex functionality when used in the regex() function.</td>
|
|
118
|
+
<td>
|
|
119
|
+
<li/> `"Massachusetts"`
|
|
120
|
+
<li/> `89.7`
|
|
121
|
+
<li/> `/[0-9a-zA-Z]+!/`
|
|
122
|
+
</td>
|
|
123
|
+
</tr>
|
|
124
|
+
<tr>
|
|
125
|
+
<td>Function </td><td> Calculated </td><td> Calculated </td>
|
|
126
|
+
<td>A function name followed by parentheses. Functions can
|
|
127
|
+
contain terms, variables, headers and other functions. Some functions
|
|
128
|
+
take a specific or unlimited number of types as arguments.
|
|
129
|
+
Certain functions can take qualifiers. An `onmatch` qualifier indicates that
|
|
130
|
+
the function should be applied only when the whole path matches.
|
|
131
|
+
Some functions optionally take an arbitrary name qualifier to better name a tracking variable.
|
|
132
|
+
Qualifiers are described below. </td>
|
|
133
|
+
<td>
|
|
134
|
+
<li/> `not(count()==2)`
|
|
135
|
+
<li/> `add( 5, 3, 1 )`
|
|
136
|
+
<li/> `concat( end(), regex(#0, /[0-5]+abc/))`
|
|
137
|
+
</td>
|
|
138
|
+
</tr>
|
|
139
|
+
<tr>
|
|
140
|
+
<td>Variable </td>
|
|
141
|
+
<td>Value</td>
|
|
142
|
+
<td>True when set unless `onchange` determines True/False.</td>
|
|
143
|
+
<td>
|
|
144
|
+
<p>
|
|
145
|
+
An @ followed by a name. A variable is set or tested depending on the usage. When used as the left hand side of an "=" its value is set. When it is used on either side of an "==" it is an equality test.
|
|
146
|
+
</p>
|
|
147
|
+
<p>
|
|
148
|
+
Variables can take an `onmatch` qualifier to indicate that the variable should only be set when the row matches all parts of the path.
|
|
149
|
+
</p>
|
|
150
|
+
<p>
|
|
151
|
+
A variable can also take an `onchange` qualifier to make its assignment only match when its value changes. In the usual case, a variable assignment always matches, making it not a factor in the row's matching or not matching. With `onchange` the assignment can determine if the row fails to match the csvpath.
|
|
152
|
+
</p>
|
|
153
|
+
<p>
|
|
154
|
+
Note that at present a variable assignment of an equality test is not possible using `==`. In the future the csvpath grammar may be improved to address this gap. In the interim, use the `equals(value,value)` function. I.e., instead of
|
|
155
|
+
@test = @cat == @hat
|
|
156
|
+
use
|
|
157
|
+
@test = equals(@cat, @hat)
|
|
158
|
+
</p>
|
|
159
|
+
</td><td>
|
|
160
|
+
<li/> `@weather="cloudy"`
|
|
161
|
+
<li/> `count(@weather=="sunny")`
|
|
162
|
+
<li/> `#summer==@weather`
|
|
163
|
+
<li/> `@happy.onchange=#weather`
|
|
164
|
+
|
|
165
|
+
#1 is an assignment that sets the variable and returns True. #2 is an argument used as a test in a way that is specific to the function. #3 is a test. #4 sets the `happy` variable to the value of the `weather` header and fails the row matching until `happy`'s value changes.
|
|
166
|
+
</td>
|
|
167
|
+
</tr>
|
|
168
|
+
<tr>
|
|
169
|
+
<td>Header </td>
|
|
170
|
+
<td>Value </td>
|
|
171
|
+
<td>Calculated</td>
|
|
172
|
+
<td>A # followed by a name or integer. The name references a value in line 0, the header
|
|
173
|
+
row. A number references a column by the 0-based column order. </td>
|
|
174
|
+
<td>
|
|
175
|
+
<li/> `#firstname`
|
|
176
|
+
<li/> `#3`
|
|
177
|
+
</td>
|
|
178
|
+
</tr>
|
|
179
|
+
<tr>
|
|
180
|
+
<td>Equality</td>
|
|
181
|
+
<td>Calculated </td>
|
|
182
|
+
<td>True at assignment, otherwise calculated </td>
|
|
183
|
+
<td>Two of the other types joined with an "=" or "==".</td>
|
|
184
|
+
<td>
|
|
185
|
+
<li/> `@type_of_tree="Oak"`
|
|
186
|
+
<li/> `#name == @type_of_tree`
|
|
187
|
+
</td>
|
|
188
|
+
</tr>
|
|
189
|
+
</table>
|
|
190
|
+
|
|
191
|
+
## Qualifiers
|
|
192
|
+
|
|
193
|
+
Variables and some functions can take qualifiers on their name. A qualifier takes the form of a dot plus a qualification name. At the moment there are only three qualifiers:
|
|
194
|
+
|
|
195
|
+
- `onmatch` to indicate that action on the variable or function only happens when the whole path matches a row
|
|
196
|
+
- `onchange` set on a variable to indicate that a row should only match when the variable is set to a new value
|
|
197
|
+
- An arbitrary string to add a name for the function's internal use, typically to name a variable
|
|
198
|
+
|
|
199
|
+
Qualifiers look like:
|
|
200
|
+
|
|
201
|
+
[ @myvar.onmatch = yes() ]
|
|
202
|
+
|
|
203
|
+
Or:
|
|
204
|
+
|
|
205
|
+
[ @i = increment.this_is_my_increment.onmatch(yes(), 3) ]
|
|
206
|
+
|
|
207
|
+
When multiple qualifiers are used order is not important.
|
|
208
|
+
|
|
209
|
+
## Variables
|
|
210
|
+
|
|
211
|
+
A variable can be assigned early in the match part of a path and used later in that same path. The assignment and use will both be in the context of the same row in the file. For example:
|
|
212
|
+
|
|
213
|
+
[@a=#b #c==@a]
|
|
214
|
+
|
|
215
|
+
Can also be written as:
|
|
216
|
+
|
|
217
|
+
[#c==#b]
|
|
218
|
+
|
|
219
|
+
Variables are always set unless they are flagged with the `.onmatch` qualifier. That means:
|
|
220
|
+
|
|
221
|
+
$file.csv[*][ @imcounting.onmatch = count_lines() no()]
|
|
222
|
+
|
|
223
|
+
will never set `imcounting`, because of the `no()` function disallowing any matches, but:
|
|
224
|
+
|
|
225
|
+
$file.csv[*][ @imcounting = count_lines() no()]
|
|
226
|
+
|
|
227
|
+
will always set it.
|
|
228
|
+
|
|
229
|
+
As noted above, a variable can be flagged with the `onchange` qualifier. The effect is that a row will only match if the variable qualified by `onchange` changes in value.
|
|
230
|
+
|
|
231
|
+
## The when operator
|
|
232
|
+
|
|
233
|
+
`->`, the "when" operator, is used to act on a condition. `->` can take an equality or function on the left and trigger an equality, assignment, or function on the right. For example:
|
|
234
|
+
|
|
235
|
+
[ last() -> print("this is the last line") ]
|
|
236
|
+
|
|
237
|
+
Prints `this is the last line` just before the scan ends.
|
|
238
|
+
|
|
239
|
+
[ exists(#0) -> @firstname = #0 ]
|
|
240
|
+
|
|
241
|
+
Says to set the `firstname` variable to the value of the first column when the first column has a value.
|
|
242
|
+
|
|
243
|
+
## Match functions
|
|
244
|
+
|
|
245
|
+
Most of the work of matching is done in functions. The match functions are the following.
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
<table>
|
|
249
|
+
<tr><th> Group </th><th>Function </th><th> What it does </th></tr>
|
|
250
|
+
<tr><td> Boolean </td><td> </td><td> </td></tr>
|
|
251
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/any.md'>any(value, value)</a> </td><td> existence test across a range of places </td></tr>
|
|
252
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/no.md'>no()</a> </td><td> always false </td></tr>
|
|
253
|
+
<tr><td> </td><td> not(value) </td><td> negates a value </td></tr>
|
|
254
|
+
<tr><td> </td><td> or(value, value,...) </td><td> match any one </td></tr>
|
|
255
|
+
<tr><td> </td><td> yes() </td><td> always true </td></tr>
|
|
256
|
+
<tr><td> </td><td> exists(value) </td><td> tests if the value exists </td></tr>
|
|
257
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/in.md'>in(value, list)</a> </td><td> match in a pipe-delimited list </td></tr>
|
|
258
|
+
<tr><td> Math </td><td> </td><td> </td></tr>
|
|
259
|
+
<tr><td> </td><td> add(value, value, ...) </td><td> adds numbers </td></tr>
|
|
260
|
+
<tr><td> </td><td> divide(value, value, ...) </td><td> divides numbers </td></tr>
|
|
261
|
+
<tr><td> </td><td> multiply(value, value, ...) </td><td> multiplies numbers </td></tr>
|
|
262
|
+
<tr><td> </td><td> subtract(value, value, ...) </td><td> subtracts numbers </td></tr>
|
|
263
|
+
<tr><td> </td><td> after(value) </td><td> finds things after a date, number, string </td></tr>
|
|
264
|
+
<tr><td> </td><td> before(value) </td><td> finds things before a date, number, string </td></tr>
|
|
265
|
+
<tr><td> Stats </td><td> </td><td> </td></tr>
|
|
266
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/average.md'>average(number, type)</a> </td><td> returns the average up to current "line", "scan", "match" </td></tr>
|
|
267
|
+
<tr><td> </td><td> median(value, type) </td><td> median value up to current "line", "scan", "match" </td></tr>
|
|
268
|
+
<tr><td> </td><td> max(value, type) </td><td> largest value seen up to current "line", "scan", "match" </td></tr>
|
|
269
|
+
<tr><td> </td><td> min(value, type) </td><td> smallest value seen up to current "line", "scan", "match" </td></tr>
|
|
270
|
+
<tr><td> </td><td> percent(type) </td><td> % of total lines for "scan", "match", "line" </td></tr>
|
|
271
|
+
<tr><td> Counting </td><td> </td><td> </td></tr>
|
|
272
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/count.md'>count()</a> </td><td> counts the number of matches </td></tr>
|
|
273
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/count.md'>count(value)</a> </td><td> count matches of value </td></tr>
|
|
274
|
+
<tr><td> </td><td> count_lines() </td><td> count lines to this point in the file </td></tr>
|
|
275
|
+
<tr><td> </td><td> count_scans() </td><td> count lines we checked for match </td></tr>
|
|
276
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/first.md'>first(value, value, ...)</a> </td><td> match the first occurrence and capture line </td></tr>
|
|
277
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/increment.md'>increment(value, n)</a> </td><td> increments a variable by n each time seen </td></tr>
|
|
278
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/every.md'>every(value, number)</a> </td><td> match every Nth time a value is seen </td></tr>
|
|
279
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/tally.md'>tally(value, value, ...)</a></td><td> counts times values are seen, including as a set </td></tr>
|
|
280
|
+
<tr><td> Strings </td><td> </td><td> </td></tr>
|
|
281
|
+
<tr><td> </td><td> concat(value, value) </td><td> joins two values </td></tr>
|
|
282
|
+
<tr><td> </td><td> length(value) </td><td> returns the length of the value </td></tr>
|
|
283
|
+
<tr><td> </td><td> lower(value) </td><td> makes value lowercase </td></tr>
|
|
284
|
+
<tr><td> </td><td> regex(regex-string, value) </td><td> match on a regular expression </td></tr>
|
|
285
|
+
<tr><td> </td><td> substring(value, int) </td><td> returns the first n chars from the value </td></tr>
|
|
286
|
+
<tr><td> </td><td> upper(value) </td><td> makes value uppercase </td></tr>
|
|
287
|
+
<tr><td> Columns </td><td> </td><td> </td></tr>
|
|
288
|
+
<tr><td> </td><td> end() </td><td> returns the value of the last column </td></tr>
|
|
289
|
+
<tr><td> </td><td> column(value) </td><td> returns column name for an index or index for a name </td></tr>
|
|
290
|
+
<tr><td> Other </td><td> </td><td> </td></tr>
|
|
291
|
+
<tr><td> </td><td> header() </td><td> indicates to another function to look in headers </td></tr>
|
|
292
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/now.md'>now(format)</a></td><td> a datetime, optionally formatted </td></tr>
|
|
293
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/print.md'>print(value, str)</a></td><td> when matches prints the interpolated string </td></tr>
|
|
294
|
+
<tr><td> </td><td> random(starting, ending) </td><td> generates a random int from starting to ending </td></tr>
|
|
295
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/stop.md'>stop(value)</a> </td><td> stops path scanning if a condition is met </td></tr>
|
|
296
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/when.md'>when(value, value)</a> </td><td> activate a value when a condition matches </td></tr>
|
|
297
|
+
<tr><td> </td><td> variable() </td><td> indicates to another function to look in variables </td></tr>
|
|
298
|
+
</tr>
|
|
299
|
+
</table>
|
|
300
|
+
|
|
301
|
+
## Another Example
|
|
302
|
+
[ exists(#common_name) #0=="field" @tail.onmatch=end() not(in(@tail, 'short|medium')) ]
|
|
303
|
+
|
|
304
|
+
In the path above, the rules applied are:
|
|
305
|
+
- The exists test of `#common_name` checks if the header named "common_name" has a value. Headers are the values in the 0th line.
|
|
306
|
+
- `#0` means the 1st column, counting from 0
|
|
307
|
+
- Functions and column references are ANDed together
|
|
308
|
+
- `@tail` creates a variable named "tail" and sets it to the value of the last column if all else matches
|
|
309
|
+
- Functions can contain functions, equality tests, and/or literals
|
|
310
|
+
|
|
311
|
+
# Not Ready For Production
|
|
312
|
+
Anything could change and performance could be better. This project is a hobby.
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
|
csvpath-0.0.4/README.md
ADDED
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
|
|
2
|
+
# CsvPath
|
|
3
|
+
|
|
4
|
+
CsvPath defines a declarative syntax for inspecting and updating CSV files. Though much simpler, it is similar to:
|
|
5
|
+
- XPath: CsvPath is to a CSV file like XPath is to an XML file
|
|
6
|
+
- Schematron: Schematron validation is basically XPath rules applied using XSLT. CsvPath paths can be used as validation rules.
|
|
7
|
+
- CSS selectors: CsvPath picks out structured data in a conceptually similar way to how CSS selectors pick out HTML structures.
|
|
8
|
+
|
|
9
|
+
CsvPath is intended to fit with other DataOps and data quality tools. Files are streamed. The interface is simple. Custom functions can be added.
|
|
10
|
+
|
|
11
|
+
# Usage
|
|
12
|
+
CsvPath paths have three parts:
|
|
13
|
+
- a "root" file name
|
|
14
|
+
- a scanning part
|
|
15
|
+
- a matching part
|
|
16
|
+
|
|
17
|
+
The root starts with `$`. The match and scan parts are enclosed by brackets.
|
|
18
|
+
|
|
19
|
+
A very simple csvpath might look like this:
|
|
20
|
+
|
|
21
|
+
$filename[*][yes()]
|
|
22
|
+
|
|
23
|
+
This path says open the file named `filename`, scan all the lines, and match every line scanned.
|
|
24
|
+
|
|
25
|
+
The filename following the `$` can be an actual relative or absolute file path. It could alternatively be a logical identifier that points indirectly to a physical file, as described below.
|
|
26
|
+
|
|
27
|
+
This is a very basic use. For more usage, see the unit tests.
|
|
28
|
+
|
|
29
|
+
path = CsvPath()
|
|
30
|
+
path.parse("""$test.csv
|
|
31
|
+
[5-25]
|
|
32
|
+
[
|
|
33
|
+
#0=="Frog" @lastname.onmatch="Bats" count()==2
|
|
34
|
+
]
|
|
35
|
+
""")
|
|
36
|
+
for i, line in enumerate( path.next() ):
|
|
37
|
+
print(f"{i}: {line}")
|
|
38
|
+
print(f"path vars: {path.variables}")
|
|
39
|
+
|
|
40
|
+
The csvpath says:
|
|
41
|
+
- Open test.csv
|
|
42
|
+
- Scan lines 5 through 25
|
|
43
|
+
- Match the second time we see a line where the first column equals "Frog" and set the variable called "lastname" to "Bats"
|
|
44
|
+
|
|
45
|
+
Another path that does the same thing might look like:
|
|
46
|
+
|
|
47
|
+
"""$test.csv
|
|
48
|
+
[5-25]
|
|
49
|
+
[
|
|
50
|
+
#0=="Frog"
|
|
51
|
+
@lastname.onmatch="Bats"
|
|
52
|
+
count()==2
|
|
53
|
+
count()==2 -> print( "$.match_count: $.line")
|
|
54
|
+
]
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
In this case we're using the "when" operator, `->`, to determine when to print.
|
|
58
|
+
|
|
59
|
+
You can use the `CsvPaths` class to set up a list of named file paths so that you can have more concise csvpaths. Named paths can take the form of:
|
|
60
|
+
- A JSON file with a dictionary of file paths under name keys
|
|
61
|
+
- A dict object passed into the CsvPaths object containing the same named path structure
|
|
62
|
+
- The path to a csv file that will be put into the named paths dict under its name minus extension
|
|
63
|
+
- A file system path pointing to a directory that will be used to populate the named paths dict with all contained files
|
|
64
|
+
|
|
65
|
+
You can then use a csvpath like `$logical_name[*][yes()]` to apply the csvpath to the file named `logical_name` in the CsvPaths object's named paths dict. This use is nearly transparent:
|
|
66
|
+
|
|
67
|
+
paths = CsvPaths(filename = "my_named_paths.json")
|
|
68
|
+
path = paths.csvpath()
|
|
69
|
+
path.parse( """$test[*][#firstname=="Fred"]""" )
|
|
70
|
+
path.collect()
|
|
71
|
+
|
|
72
|
+
If my_named_paths.json contains the following structure, the name `test` will be used to find `tests/test_resources/test.csv`. The parse method will apply the csvpath and the collect method will gather all the matched rows.
|
|
73
|
+
|
|
74
|
+
{ "test":"tests/test_resources/test.csv" }
|
|
75
|
+
|
|
76
|
+
# Scanning
|
|
77
|
+
The scanner enumerates lines. For each line returned, the line number, the scanned line count, and the match count are available. The set of line numbers scanned is also available.
|
|
78
|
+
|
|
79
|
+
The scan part of the path starts with a dollar sign to indicate the root, meaning the file from the top. After the dollar sign comes the file path. The scanning instructions are in a bracket. The rules are:
|
|
80
|
+
- `[*]` means all
|
|
81
|
+
- `[3*]` means starting from line 3 and going to the end of the file
|
|
82
|
+
- `[3]` by itself means just line 3
|
|
83
|
+
- `[1-3]` means lines 1 through 3
|
|
84
|
+
- `[1+3]` means line 1 and line 3
|
|
85
|
+
- `[1+3-8]` means line 1 and lines 3 through 8
|
|
86
|
+
|
|
87
|
+
# Matching
|
|
88
|
+
The match part is also bracketed. Matches have space separated components or "values" that are ANDed together. The components' order is important. A match component is one of several types:
|
|
89
|
+
<table>
|
|
90
|
+
<tr>
|
|
91
|
+
<td>Type</td>
|
|
92
|
+
<td>Returns</td>
|
|
93
|
+
<td>Matches</td>
|
|
94
|
+
<td>Description</td>
|
|
95
|
+
<td>Examples</td>
|
|
96
|
+
</tr>
|
|
97
|
+
<tr>
|
|
98
|
+
<td>Term </td><td> Value </td><td> True when used alone, otherwise calculated </td>
|
|
99
|
+
<td>A quoted string or date, optionally quoted number, or
|
|
100
|
+
regex. Regex features are limited. A regex is wrapped in "/" characters and
|
|
101
|
+
only has regex functionality when used in the regex() function.</td>
|
|
102
|
+
<td>
|
|
103
|
+
<li/> `"Massachusetts"`
|
|
104
|
+
<li/> `89.7`
|
|
105
|
+
<li/> `/[0-9a-zA-Z]+!/`
|
|
106
|
+
</td>
|
|
107
|
+
</tr>
|
|
108
|
+
<tr>
|
|
109
|
+
<td>Function </td><td> Calculated </td><td> Calculated </td>
|
|
110
|
+
<td>A function name followed by parentheses. Functions can
|
|
111
|
+
contain terms, variables, headers and other functions. Some functions
|
|
112
|
+
take a specific or unlimited number of types as arguments.
|
|
113
|
+
Certain functions can take qualifiers. An `onmatch` qualifier indicates that
|
|
114
|
+
the function should be applied only when the whole path matches.
|
|
115
|
+
Some functions optionally take an arbitrary name qualifier to better name a tracking variable.
|
|
116
|
+
Qualifiers are described below. </td>
|
|
117
|
+
<td>
|
|
118
|
+
<li/> `not(count()==2)`
|
|
119
|
+
<li/> `add( 5, 3, 1 )`
|
|
120
|
+
<li/> `concat( end(), regex(#0, /[0-5]+abc/))`
|
|
121
|
+
</td>
|
|
122
|
+
</tr>
|
|
123
|
+
<tr>
|
|
124
|
+
<td>Variable </td>
|
|
125
|
+
<td>Value</td>
|
|
126
|
+
<td>True when set unless `onchange` determines True/False.</td>
|
|
127
|
+
<td>
|
|
128
|
+
<p>
|
|
129
|
+
An @ followed by a name. A variable is set or tested depending on the usage. When used as the left hand side of an "=" its value is set. When it is used on either side of an "==" it is an equality test.
|
|
130
|
+
</p>
|
|
131
|
+
<p>
|
|
132
|
+
Variables can take an `onmatch` qualifier to indicate that the variable should only be set when the row matches all parts of the path.
|
|
133
|
+
</p>
|
|
134
|
+
<p>
|
|
135
|
+
A variable can also take an `onchange` qualifier to make its assignment only match when its value changes. In the usual case, a variable assignment always matches, making it not a factor in the row's matching or not matching. With `onchange` the assignment can determine if the row fails to match the csvpath.
|
|
136
|
+
</p>
|
|
137
|
+
<p>
|
|
138
|
+
Note that at present a variable assignment of an equality test is not possible using `==`. In the future the csvpath grammar may be improved to address this gap. In the interim, use the `equals(value,value)` function. I.e., instead of
|
|
139
|
+
@test = @cat == @hat
|
|
140
|
+
use
|
|
141
|
+
@test = equals(@cat, @hat)
|
|
142
|
+
</p>
|
|
143
|
+
</td><td>
|
|
144
|
+
<li/> `@weather="cloudy"`
|
|
145
|
+
<li/> `count(@weather=="sunny")`
|
|
146
|
+
<li/> `#summer==@weather`
|
|
147
|
+
<li/> `@happy.onchange=#weather`
|
|
148
|
+
|
|
149
|
+
The first example is an assignment that sets the variable and returns True. The second is an argument used as a test in a way that is specific to the function. The third is a test. The fourth sets the `happy` variable to the value of the `weather` header and fails the row matching until `happy`'s value changes.
|
|
150
|
+
</td>
|
|
151
|
+
</tr>
|
|
152
|
+
<tr>
|
|
153
|
+
<td>Header </td>
|
|
154
|
+
<td>Value </td>
|
|
155
|
+
<td>Calculated</td>
|
|
156
|
+
<td>A # followed by a name or integer. The name references a value in line 0, the header
|
|
157
|
+
row. A number references a column by the 0-based column order. </td>
|
|
158
|
+
<td>
|
|
159
|
+
<li/> `#firstname`
|
|
160
|
+
<li/> `#3`
|
|
161
|
+
</td>
|
|
162
|
+
</tr>
|
|
163
|
+
<tr>
|
|
164
|
+
<td>Equality</td>
|
|
165
|
+
<td>Calculated </td>
|
|
166
|
+
<td>True at assignment, otherwise calculated </td>
|
|
167
|
+
<td>Two of the other types joined with an "=" or "==".</td>
|
|
168
|
+
<td>
|
|
169
|
+
<li/> `@type_of_tree="Oak"`
|
|
170
|
+
<li/> `#name == @type_of_tree`
|
|
171
|
+
</td>
|
|
172
|
+
</tr>
|
|
173
|
+
</table>
|
|
174
|
+
|
|
175
|
+
## Qualifiers
|
|
176
|
+
|
|
177
|
+
Variables and some functions can take qualifiers on their name. A qualifier takes the form of a dot plus a qualification name. At the moment there are only three qualifiers:
|
|
178
|
+
|
|
179
|
+
- `onmatch` to indicate that action on the variable or function only happens when the whole path matches a row
|
|
180
|
+
- `onchange` set on a variable to indicate that a row should only match when the variable is set to a new value
|
|
181
|
+
- An arbitrary string to add a name for the function's internal use, typically to name a variable
|
|
182
|
+
|
|
183
|
+
Qualifiers look like:
|
|
184
|
+
|
|
185
|
+
[ @myvar.onmatch = yes() ]
|
|
186
|
+
|
|
187
|
+
Or:
|
|
188
|
+
|
|
189
|
+
[ @i = increment.this_is_my_increment.onmatch(yes(), 3) ]
|
|
190
|
+
|
|
191
|
+
When multiple qualifiers are used, order is not important.
|
|
192
|
+
|
|
193
|
+
## Variables
|
|
194
|
+
|
|
195
|
+
A variable can be assigned early in the match part of a path and used later in that same path. The assignment and use will both be in the context of the same row in the file. For example:
|
|
196
|
+
|
|
197
|
+
[@a=#b #c==@a]
|
|
198
|
+
|
|
199
|
+
Can also be written as:
|
|
200
|
+
|
|
201
|
+
[#c==#b]
|
|
202
|
+
|
|
203
|
+
Variables are always set unless they are flagged with the `.onmatch` qualifier. That means:
|
|
204
|
+
|
|
205
|
+
$file.csv[*][ @imcounting.onmatch = count_lines() no()]
|
|
206
|
+
|
|
207
|
+
will never set `imcounting`, because of the `no()` function disallowing any matches, but:
|
|
208
|
+
|
|
209
|
+
$file.csv[*][ @imcounting = count_lines() no()]
|
|
210
|
+
|
|
211
|
+
will always set it.
|
|
212
|
+
|
|
213
|
+
As noted above, a variable can be flagged with the `onchange` qualifier. The effect is that a row will only match if the variable qualified by `onchange` changes in value.
|
|
214
|
+
|
|
215
|
+
## The when operator
|
|
216
|
+
|
|
217
|
+
`->`, the "when" operator, is used to act on a condition. `->` can take an equality or function on the left and trigger an equality, assignment, or function on the right. For example:
|
|
218
|
+
|
|
219
|
+
[ last() -> print("this is the last line") ]
|
|
220
|
+
|
|
221
|
+
Prints `this is the last line` just before the scan ends.
|
|
222
|
+
|
|
223
|
+
[ exists(#0) -> @firstname = #0 ]
|
|
224
|
+
|
|
225
|
+
Says to set the `firstname` variable to the value of the first column when the first column has a value.
|
|
226
|
+
|
|
227
|
+
## Match functions
|
|
228
|
+
|
|
229
|
+
Most of the work of matching is done in functions. The match functions are the following.
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
<table>
|
|
233
|
+
<tr><th> Group </th><th>Function </th><th> What it does </th></tr>
|
|
234
|
+
<tr><td> Boolean </td><td> </td><td> </td></tr>
|
|
235
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/any.md'>any(value, value)</a> </td><td> existence test across a range of places </td></tr>
|
|
236
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/no.md'>no()</a> </td><td> always false </td></tr>
|
|
237
|
+
<tr><td> </td><td> not(value) </td><td> negates a value </td></tr>
|
|
238
|
+
<tr><td> </td><td> or(value, value,...) </td><td> match any one </td></tr>
|
|
239
|
+
<tr><td> </td><td> yes() </td><td> always true </td></tr>
|
|
240
|
+
<tr><td> </td><td> exists(value) </td><td> tests if the value exists </td></tr>
|
|
241
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/in.md'>in(value, list)</a> </td><td> match in a pipe-delimited list </td></tr>
|
|
242
|
+
<tr><td> Math </td><td> </td><td> </td></tr>
|
|
243
|
+
<tr><td> </td><td> add(value, value, ...) </td><td> adds numbers </td></tr>
|
|
244
|
+
<tr><td> </td><td> divide(value, value, ...) </td><td> divides numbers </td></tr>
|
|
245
|
+
<tr><td> </td><td> multiply(value, value, ...) </td><td> multiplies numbers </td></tr>
|
|
246
|
+
<tr><td> </td><td> subtract(value, value, ...) </td><td> subtracts numbers </td></tr>
|
|
247
|
+
<tr><td> </td><td> after(value) </td><td> finds things after a date, number, string </td></tr>
|
|
248
|
+
<tr><td> </td><td> before(value) </td><td> finds things before a date, number, string </td></tr>
|
|
249
|
+
<tr><td> Stats </td><td> </td><td> </td></tr>
|
|
250
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/average.md'>average(number, type)</a> </td><td> returns the average up to current "line", "scan", "match" </td></tr>
|
|
251
|
+
<tr><td> </td><td> median(value, type) </td><td> median value up to current "line", "scan", "match" </td></tr>
|
|
252
|
+
<tr><td> </td><td> max(value, type) </td><td> largest value seen up to current "line", "scan", "match" </td></tr>
|
|
253
|
+
<tr><td> </td><td> min(value, type) </td><td> smallest value seen up to current "line", "scan", "match" </td></tr>
|
|
254
|
+
<tr><td> </td><td> percent(type) </td><td> % of total lines for "scan", "match", "line" </td></tr>
|
|
255
|
+
<tr><td> Counting </td><td> </td><td> </td></tr>
|
|
256
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/count.md'>count()</a> </td><td> counts the number of matches </td></tr>
|
|
257
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/count.md'>count(value)</a> </td><td> count matches of value </td></tr>
|
|
258
|
+
<tr><td> </td><td> count_lines() </td><td> count lines to this point in the file </td></tr>
|
|
259
|
+
<tr><td> </td><td> count_scans() </td><td> count lines we checked for match </td></tr>
|
|
260
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/first.md'>first(value, value, ...)</a> </td><td> match the first occurrence and capture line </td></tr>
|
|
261
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/increment.md'>increment(value, n)</a> </td><td> increments a variable by n each time seen </td></tr>
|
|
262
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/every.md'>every(value, number)</a> </td><td> match every Nth time a value is seen </td></tr>
|
|
263
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/tally.md'>tally(value, value, ...)</a></td><td> counts times values are seen, including as a set </td></tr>
|
|
264
|
+
<tr><td> Strings </td><td> </td><td> </td></tr>
|
|
265
|
+
<tr><td> </td><td> concat(value, value) </td><td> joins two values </td></tr>
|
|
266
|
+
<tr><td> </td><td> length(value) </td><td> returns the length of the value </td></tr>
|
|
267
|
+
<tr><td> </td><td> lower(value) </td><td> makes value lowercase </td></tr>
|
|
268
|
+
<tr><td> </td><td> regex(regex-string, value) </td><td> match on a regular expression </td></tr>
|
|
269
|
+
<tr><td> </td><td> substring(value, int) </td><td> returns the first n chars from the value </td></tr>
|
|
270
|
+
<tr><td> </td><td> upper(value) </td><td> makes value uppercase </td></tr>
|
|
271
|
+
<tr><td> Columns </td><td> </td><td> </td></tr>
|
|
272
|
+
<tr><td> </td><td> end() </td><td> returns the value of the last column </td></tr>
|
|
273
|
+
<tr><td> </td><td> column(value) </td><td> returns column name for an index or index for a name </td></tr>
|
|
274
|
+
<tr><td> Other </td><td> </td><td> </td></tr>
|
|
275
|
+
<tr><td> </td><td> header() </td><td> indicates to another function to look in headers </td></tr>
|
|
276
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/now.md'>now(format)</a></td><td> a datetime, optionally formatted </td></tr>
|
|
277
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/print.md'>print(value, str)</a></td><td> when matches prints the interpolated string </td></tr>
|
|
278
|
+
<tr><td> </td><td> random(starting, ending) </td><td> generates a random int from starting to ending </td></tr>
|
|
279
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/stop.md'>stop(value)</a> </td><td> stops path scanning if a condition is met </td></tr>
|
|
280
|
+
<tr><td> </td><td> <a href='csvpath/matching/functions/when.md'>when(value, value)</a> </td><td> activate a value when a condition matches </td></tr>
|
|
281
|
+
<tr><td> </td><td> variable() </td><td> indicates to another function to look in variables </td></tr>
|
|
282
|
+
</tr>
|
|
283
|
+
</table>
|
|
284
|
+
|
|
285
|
+
## Another Example
|
|
286
|
+
[ exists(#common_name) #0=="field" @tail.onmatch=end() not(in(@tail, 'short|medium')) ]
|
|
287
|
+
|
|
288
|
+
In the path above, the rules applied are:
|
|
289
|
+
- The exists test of `#common_name` checks if the header named "common_name" has a value. Headers are the values in the 0th line.
|
|
290
|
+
- `#0` means the 1st column, counting from 0
|
|
291
|
+
- Functions and column references are ANDed together
|
|
292
|
+
- `@tail` creates a variable named "tail" and sets it to the value of the last column if all else matches
|
|
293
|
+
- Functions can contain functions, equality tests, and/or literals
|
|
294
|
+
|
|
295
|
+
# Not Ready For Production
|
|
296
|
+
Anything could change and performance could be better. This project is a hobby.
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
|