csvpath 0.0.21__tar.gz → 0.0.23__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. csvpath-0.0.23/PKG-INFO +298 -0
  2. csvpath-0.0.23/README.md +281 -0
  3. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/csvpath.py +67 -31
  4. csvpath-0.0.23/csvpath/csvpaths.py +88 -0
  5. csvpath-0.0.23/csvpath/matching/functions/any.md +33 -0
  6. csvpath-0.0.23/csvpath/matching/functions/any.py +105 -0
  7. csvpath-0.0.23/csvpath/matching/functions/average.md +21 -0
  8. csvpath-0.0.23/csvpath/matching/functions/column.py +23 -0
  9. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/concat.py +6 -7
  10. csvpath-0.0.23/csvpath/matching/functions/count_lines.py +10 -0
  11. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/function.py +1 -0
  12. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/function_factory.py +28 -2
  13. csvpath-0.0.23/csvpath/matching/functions/header.md +13 -0
  14. csvpath-0.0.23/csvpath/matching/functions/header.py +10 -0
  15. csvpath-0.0.23/csvpath/matching/functions/increment.md +57 -0
  16. csvpath-0.0.23/csvpath/matching/functions/increment.py +66 -0
  17. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/length.py +0 -5
  18. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/minf.py +17 -12
  19. csvpath-0.0.23/csvpath/matching/functions/nonef.py +10 -0
  20. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/notf.py +0 -2
  21. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/now.py +0 -6
  22. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/print.md +1 -1
  23. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/printf.py +11 -12
  24. csvpath-0.0.23/csvpath/matching/functions/regex.py +35 -0
  25. csvpath-0.0.23/csvpath/matching/functions/stop.md +24 -0
  26. csvpath-0.0.23/csvpath/matching/functions/stop.py +23 -0
  27. csvpath-0.0.23/csvpath/matching/functions/substring.py +31 -0
  28. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/subtract.py +0 -1
  29. csvpath-0.0.23/csvpath/matching/functions/variable.md +13 -0
  30. csvpath-0.0.23/csvpath/matching/functions/variable.py +10 -0
  31. csvpath-0.0.23/csvpath/matching/functions/when.md +16 -0
  32. csvpath-0.0.23/csvpath/matching/functions/when.py +25 -0
  33. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/matcher.py +21 -12
  34. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/matching_lexer.py +4 -0
  35. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/parser.out +334 -236
  36. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/parsetab.py +377 -221
  37. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/productions/equality.py +2 -3
  38. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/productions/expression.py +1 -0
  39. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/productions/header.py +2 -1
  40. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/productions/matchable.py +37 -1
  41. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/productions/term.py +4 -0
  42. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/productions/variable.py +1 -0
  43. {csvpath-0.0.21 → csvpath-0.0.23}/pyproject.toml +1 -1
  44. csvpath-0.0.21/PKG-INFO +0 -199
  45. csvpath-0.0.21/README.md +0 -182
  46. csvpath-0.0.21/csvpath/matching/functions/count_lines.py +0 -12
  47. csvpath-0.0.21/csvpath/matching/functions/regex.py +0 -38
  48. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/__init__.py +0 -0
  49. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/__init__.py +0 -0
  50. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/expression_encoder.py +0 -0
  51. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/expression_math.py +0 -0
  52. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/expression_utility.py +0 -0
  53. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/above.py +0 -0
  54. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/add.py +0 -0
  55. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/below.py +0 -0
  56. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/count.md +0 -0
  57. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/count.py +0 -0
  58. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/count_scans.py +0 -0
  59. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/divide.py +0 -0
  60. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/end.py +0 -0
  61. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/every.md +0 -0
  62. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/every.py +0 -0
  63. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/first.md +0 -0
  64. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/first.py +0 -0
  65. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/in.md +0 -0
  66. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/inf.py +0 -0
  67. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/is_instance.py +0 -0
  68. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/lower.py +0 -0
  69. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/multiply.py +0 -0
  70. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/no.md +0 -0
  71. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/no.py +0 -0
  72. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/now.md +0 -0
  73. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/orf.py +0 -0
  74. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/percent.py +0 -0
  75. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/random.py +0 -0
  76. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/tally.md +0 -0
  77. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/tally.py +0 -0
  78. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/upper.py +0 -0
  79. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/matching/functions/yes.py +0 -0
  80. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/parser_utility.py +0 -0
  81. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/scanning/__init__.py +0 -0
  82. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/scanning/parser.out +0 -0
  83. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/scanning/parsetab.py +0 -0
  84. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/scanning/scanner.py +0 -0
  85. {csvpath-0.0.21 → csvpath-0.0.23}/csvpath/scanning/scanning_lexer.py +0 -0
@@ -0,0 +1,298 @@
1
+ Metadata-Version: 2.1
2
+ Name: csvpath
3
+ Version: 0.0.23
4
+ Summary:
5
+ Author: David Kershaw
6
+ Author-email: dk107dk@hotmail.com
7
+ Requires-Python: >=3.12,<4.0
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Requires-Dist: pandas (>=2.2.2,<3.0.0)
11
+ Requires-Dist: ply (>=3.11,<4.0)
12
+ Requires-Dist: polars (>=1.1.0,<2.0.0)
13
+ Requires-Dist: pytest (>=8.2.2,<9.0.0)
14
+ Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
15
+ Description-Content-Type: text/markdown
16
+
17
+
18
+ # CsvPath
19
+
20
+ CsvPath defines a declarative syntax for inspecting and updating CSV files. Though much simpler, it is similar to:
21
+ - XPath: CsvPath is to a CSV file like XPath is to an XML file
22
+ - Schematron: Schematron validation is basically XPath rules applied using XSLT. CsvPath paths can be used as validation rules.
23
+ - CSS selectors: CsvPath picks out structured data in a conceptually similar way to how CSS selectors pick out HTML structures.
24
+
25
+ CsvPath is intended to fit with other DataOps and data quality tools. Files are streamed. The interface is simple. Custom functions can be added.
26
+
27
+ # Usage
28
+ CsvPath paths have three parts:
29
+ - a "root" file name
30
+ - a scanning part
31
+ - a matching part
32
+
33
+ The root starts with `$`. The match and scan parts are enclosed by brackets.
34
+
35
+ A very simple csvpath might look like this:
36
+
37
+ $filename[*][yes()]
38
+
39
+ This path says open the file named `filename`, scan all the lines, and match every line scanned.
40
+
41
+ The filename following the `$` can be an actual relative or absolute file path. It could alternatively be a logical identifier that points indirectly to a physical file, as described below.
42
+
43
+ This is a very basic use. For more usage, see the unit tests.
44
+
45
+ path = CsvPath()
46
+ path.parse("""$test.csv
47
+ [5-25]
48
+ [
49
+ #0=="Frog"
50
+ @lastname.onmatch="Bats"
51
+ count()==2
52
+ ]
53
+ """)
54
+ for i, line in enumerate( path.next() ):
55
+ print(f"{i}: {line}")
56
+ print(f"path vars: {path.variables}")
57
+
58
+ The csvpath says:
59
+ - Open test.csv
60
+ - Scan lines 5 through 25
61
+ - Match the second time we see a line where the first column equals "Frog" and set the variable called "lastname" to "Bats"
62
+
63
+ Another path that does the same thing might look like:
64
+
65
+ path = CsvPath()
66
+ path.parse("""$test.csv
67
+ [5-25]
68
+ [
69
+ #0=="Frog"
70
+ @lastname.onmatch="Bats"
71
+ count()==2
72
+ print( count()==2, "$.match_count: $.line")
73
+ ]
74
+ """)
75
+ path.fast_forward()
76
+
77
+
78
+ You can use the `CsvPaths` class to set up a list of named file paths so that you can have more concise csvpaths. Named paths can take the form of:
79
+ - A JSON file with a dictionary of file paths under name keys
80
+ - A dict object passed into the CsvPaths object containing the same named path structure
81
+ - The path to a csv file that will be put into the named paths dict under its name minus extension
82
+ - A file system path pointing to a directory that will be used to populate the named paths dict with all contained files
83
+
84
+ You can then use a csvpath like `$logical_name[*][yes()]` to apply the csvpath to the file named `logical_name` in the CsvPaths object's named paths dict. This use is nearly transparent:
85
+
86
+ paths = CsvPaths(filename = "my_named_paths.json")
87
+ path = paths.csvpath()
88
+ path.parse( """$test[*][#firstname=="Fred"]""" )
89
+ path.collect()
90
+
91
+ If my_named_paths.json contains the following structure, the name `test` will be used to find `test/test_resources/test.csv`. The parse method will apply the csvpath and the collect method will gather all the matched rows.
92
+
93
+ { "test":"test/test_resources/test.csv" }
94
+
95
+ # Scanning
96
+ The scanner enumerates lines. For each line returned, the line number, the scanned line count, and the match count are available. The set of line numbers scanned is also available.
97
+
98
+ The scan part of the path starts with a dollar sign to indicate the root, meaning the file from the top. After the dollar sign comes the file path. The scanning instructions are in a bracket. The rules are:
99
+ - `[*]` means all
100
+ - `[3*]` means starting from line 3 and going to the end of the file
101
+ - `[3]` by itself means just line 3
102
+ - `[1-3]` means lines 1 through 3
103
+ - `[1+3]` means line 1 and line 3
104
+ - `[1+3-8]` means line 1 and lines 3 through 8
105
+
106
+ # Matching
107
+ The match part is also bracketed. Matches have space separated components or "values" that are ANDed together. The components' order is important. A match component is one of several types:
108
+ <table>
109
+ <tr>
110
+ <td>Type</td>
111
+ <td>Returns</td>
112
+ <td>Matches</td>
113
+ <td>Description</td>
114
+ <td>Examples</td>
115
+ </tr>
116
+ <tr>
117
+ <td>Term </td><td> Value </td><td> True when used alone, otherwise calculated </td>
118
+ <td>A quoted string or date, optionally quoted number, or
119
+ regex. Regex features are limited. A regex is wrapped in "/" characters and
120
+ only has regex functionality when used in the regex() function.</td>
121
+ <td>
122
+ <li/> `"Massachusetts"`
123
+ <li/> `89.7`
124
+ <li/> `/[0-9a-zA-Z]+!/`
125
+ </td>
126
+ </tr>
127
+ <tr>
128
+ <td>Function </td><td> Calculated </td><td> Calculated </td>
129
+ <td>A function name followed by parentheses. Functions can
130
+ contain terms, variables, headers and other functions. Some functions
131
+ take a specific or unlimited number of types as arguments.
132
+ Certain functions can take qualifiers. An `onmatch` qualifier indicates that
133
+ the function should be applied only when the whole path matches.
134
+ Some functions optionally take an arbitrary name qualifier to better name a tracking variable.
135
+ Qualifiers are described below. </td>
136
+ <td>
137
+ <li/> `not(count()==2)`
138
+ <li/> `add( 5, 3, 1 )`
139
+ <li/> `concat( end(), regex(#0, /[0-5]+abc/))`
140
+ </td>
141
+ </tr>
142
+ <tr>
143
+ <td>Variable </td>
144
+ <td>Value</td>
145
+ <td>True/False when value tested. True when set, True/False existence when used alone</td>
146
+ <td>An @ followed by a name. A variable is
147
+ set or tested depending on the usage. By itself, it is an existence test. When used as
148
+ the left hand side of an "=" its value is set.
149
+ When it is used on either side of an "==" it is an equality test.
150
+ Variables can take an `onmatch` qualifier to indicate that the variable should
151
+ only be set when the row matches all parts of the path.</td>
152
+ <td>
153
+ <li/> `@weather="cloudy"`
154
+ <li/> `count(@weather=="sunny")`
155
+ <li/> `@weather`
156
+ <li/> `#summer==@weather`
157
+
158
+ #1 is an assignment that sets the variable and returns True. #2 is an argument used as a test in a way that is specific to the function. #3 is an existence test. #4 is a test.
159
+ </td>
160
+ </tr>
161
+ <tr>
162
+ <td>Header </td>
163
+ <td>Value </td>
164
+ <td>A True/False existence test when used alone, otherwise calculated</td>
165
+ <td>A # followed by a name or integer. The name references a value in line 0, the header
166
+ row. A number references a column by the 0-based column order. </td>
167
+ <td>
168
+ <li/> `#firstname`
169
+ <li/> `#3`
170
+ </td>
171
+ </tr>
172
+ <tr>
173
+ <td>Equality</td>
174
+ <td>Calculated </td>
175
+ <td>True at assignment, otherwise calculated </td>
176
+ <td>Two of the other types joined with an "=" or "==".</td>
177
+ <td>
178
+ <li/> `@type_of_tree="Oak"`
179
+ <li/> `#name == @type_of_tree`
180
+ </td>
181
+ </tr>
182
+ </table>
183
+
184
+ A variable can be assigned early in the match part of a path and used later in that same path. The assignment and use will both be in the context of the same row in the file. For example:
185
+
186
+ [@a=#b #c==@a]
187
+
188
+ Can also be written as:
189
+
190
+ [#c==#b]
191
+
192
+ Variables and some functions can take qualifiers on their name. A qualifier takes the form of a dot plus a qualification name. At the moment there are only two qualifiers:
193
+
194
+ - `onmatch` to indicate that action on the variable or function only happens when the whole path matches a row
195
+ - An arbitrary string to add a name for the function's internal use, typically to name a variable
196
+
197
+ Qualifiers look like:
198
+
199
+ [ @myvar.onmatch = yes() ]
200
+
201
+ Or:
202
+
203
+ [ @i = increment.this_is_my_increment.onmatch(yes(), 3) ]
204
+
205
+ When multiple qualifiers are used, order is not important.
206
+
207
+ ## Example
208
+ [ #common_name #0=="field" @tail.onmatch=end() not(in(@tail, 'short|medium')) ]
209
+
210
+ In the path above, the rules applied are:
211
+ - `#common_name` indicates a header named "common_name". Headers are the values in the 0th line. This component of the match is an existence test.
212
+ - `#0` means the 1st column, counting from 0
213
+ - Functions and column references are ANDed together
214
+ - `@tail` creates a variable named "tail" and sets it to the value of the last column if all else matches
215
+ - Functions can contain functions, equality tests, and/or literals
216
+
217
+ Variables are always set unless they are flagged with `.onmatch`. That means:
218
+
219
+ $file.csv[*][ @imcounting.onmatch = count_lines() no()]
220
+
221
+ will never set `imcounting`, because of the `no()` function disallowing any matches, but:
222
+
223
+ $file.csv[*][ @imcounting = count_lines() no()]
224
+
225
+ will always set it.
226
+
227
+ Most of the work of matching is done in functions. The match functions are the following.
228
+
229
+
230
+ <table>
231
+ <tr><th> Group </th><th>Function </th><th> What it does </th></tr>
232
+ <tr><td> Boolean </td><td> </td><td> </td></tr>
233
+ <tr><td> </td><td> <a href='csvpath/matching/functions/any.md'>any(value, value)</a> </td><td> existence test across a range of places </td></tr>
234
+ <tr><td> </td><td> <a href='csvpath/matching/functions/no.md'>no()</a> </td><td> always false </td></tr>
235
+ <tr><td> </td><td> not(value) </td><td> negates a value </td></tr>
236
+ <tr><td> </td><td> or(value, value,...) </td><td> match any one </td></tr>
237
+ <tr><td> </td><td> yes() </td><td> always true </td></tr>
238
+ <tr><td> </td><td> <a href='csvpath/matching/functions/in.md'>in(value, list)</a> </td><td> match in a pipe-delimited list </td></tr>
239
+ <tr><td> Math </td><td> </td><td> </td></tr>
240
+ <tr><td> </td><td> add(value, value, ...) </td><td> adds numbers </td></tr>
241
+ <tr><td> </td><td> divide(value, value, ...) </td><td> divides numbers </td></tr>
242
+ <tr><td> </td><td> multiply(value, value, ...) </td><td> multiplies numbers </td></tr>
243
+ <tr><td> </td><td> subtract(value, value, ...) </td><td> subtracts numbers </td></tr>
244
+ <tr><td> </td><td> after(value) </td><td> finds things after a date, number, string </td></tr>
245
+ <tr><td> </td><td> before(value) </td><td> finds things before a date, number, string </td></tr>
246
+ <tr><td> Stats </td><td> </td><td> </td></tr>
247
+ <tr><td> </td><td> <a href='csvpath/matching/functions/average.md'>average(number, type)</a> </td><td> returns the average up to current "line", "scan", "match" </td></tr>
248
+ <tr><td> </td><td> median(value, type) </td><td> median value up to current "line", "scan", "match" </td></tr>
249
+ <tr><td> </td><td> max(value, type) </td><td> largest value seen up to current "line", "scan", "match" </td></tr>
250
+ <tr><td> </td><td> min(value, type) </td><td> smallest value seen up to current "line", "scan", "match" </td></tr>
251
+ <tr><td> </td><td> percent(type) </td><td> % of total lines for "scan", "match", "line" </td></tr>
252
+ <tr><td> Counting </td><td> </td><td> </td></tr>
253
+ <tr><td> </td><td> <a href='csvpath/matching/functions/count.md'>count()</a> </td><td> counts the number of matches </td></tr>
254
+ <tr><td> </td><td> <a href='csvpath/matching/functions/count.md'>count(value)</a> </td><td> count matches of value </td></tr>
255
+ <tr><td> </td><td> count_lines() </td><td> count lines to this point in the file </td></tr>
256
+ <tr><td> </td><td> count_scans() </td><td> count lines we checked for match </td></tr>
257
+ <tr><td> </td><td> <a href='csvpath/matching/functions/first.md'>first(value, value, ...)</a> </td><td> match the first occurrence and capture line </td></tr>
258
+ <tr><td> </td><td> <a href='csvpath/matching/functions/increment.md'>increment(value, n)</a> </td><td> increments a variable by n each time seen </td></tr>
259
+ <tr><td> </td><td> <a href='csvpath/matching/functions/every.md'>every(value, number)</a> </td><td> match every Nth time a value is seen </td></tr>
260
+ <tr><td> </td><td> <a href='csvpath/matching/functions/tally.md'>tally(value, value, ...)</a></td><td> counts times values are seen, including as a set </td></tr>
261
+ <tr><td> Strings </td><td> </td><td> </td></tr>
262
+ <tr><td> </td><td> concat(value, value) </td><td> joins two values </td></tr>
263
+ <tr><td> </td><td> length(value) </td><td> returns the length of the value </td></tr>
264
+ <tr><td> </td><td> lower(value) </td><td> makes value lowercase </td></tr>
265
+ <tr><td> </td><td> regex(regex-string, value) </td><td> match on a regular expression </td></tr>
266
+ <tr><td> </td><td> substring(value, int) </td><td> returns the first n chars from the value </td></tr>
267
+ <tr><td> </td><td> upper(value) </td><td> makes value uppercase </td></tr>
268
+ <tr><td> Columns </td><td> </td><td> </td></tr>
269
+ <tr><td> </td><td> end() </td><td> returns the value of the last column </td></tr>
270
+ <tr><td> </td><td> column(value) </td><td> returns column name for an index or index for a name </td></tr>
271
+ <tr><td> Other </td><td> </td><td> </td></tr>
272
+ <tr><td> </td><td> header() </td><td> indicates to another function to look in headers </td></tr>
273
+ <tr><td> </td><td> isinstance(value, typestr) </td><td> tests for "int","float","complex","bool","usd" </td></tr>
274
+ <tr><td> </td><td> <a href='csvpath/matching/functions/now.md'>now(format)</a></td><td> a datetime, optionally formatted </td></tr>
275
+ <tr><td> </td><td> <a href='csvpath/matching/functions/print.md'>print(value, str)</a></td><td> when matches prints the interpolated string </td></tr>
276
+ <tr><td> </td><td> random(starting, ending) </td><td> generates a random int from starting to ending </td></tr>
277
+ <tr><td> </td><td> <a href='csvpath/matching/functions/stop.md'>stop(value)</a> </td><td> stops path scanning if a condition is met </td></tr>
278
+ <tr><td> </td><td> <a href='csvpath/matching/functions/when.md'>when(value, value)</a> </td><td> activate a value when a condition matches </td></tr>
279
+ <tr><td> </td><td> variable() </td><td> indicates to another function to look in variables </td></tr>
280
+
281
+ </table>
282
+
283
+ # Not Ready For Production
284
+ Anything could change and performance could be better. This project is a hobby.
285
+
286
+
287
+
288
+
289
+
290
+
291
+
292
+
293
+
294
+
295
+
296
+
297
+
298
+
@@ -0,0 +1,281 @@
1
+
2
+ # CsvPath
3
+
4
+ CsvPath defines a declarative syntax for inspecting and updating CSV files. Though much simpler, it is similar to:
5
+ - XPath: CsvPath is to a CSV file like XPath is to an XML file
6
+ - Schematron: Schematron validation is basically XPath rules applied using XSLT. CsvPath paths can be used as validation rules.
7
+ - CSS selectors: CsvPath picks out structured data in a conceptually similar way to how CSS selectors pick out HTML structures.
8
+
9
+ CsvPath is intended to fit with other DataOps and data quality tools. Files are streamed. The interface is simple. Custom functions can be added.
10
+
11
+ # Usage
12
+ CsvPath paths have three parts:
13
+ - a "root" file name
14
+ - a scanning part
15
+ - a matching part
16
+
17
+ The root starts with `$`. The match and scan parts are enclosed by brackets.
18
+
19
+ A very simple csvpath might look like this:
20
+
21
+ $filename[*][yes()]
22
+
23
+ This path says open the file named `filename`, scan all the lines, and match every line scanned.
24
+
25
+ The filename following the `$` can be an actual relative or absolute file path. It could alternatively be a logical identifier that points indirectly to a physical file, as described below.
26
+
27
+ This is a very basic use. For more usage, see the unit tests.
28
+
29
+ path = CsvPath()
30
+ path.parse("""$test.csv
31
+ [5-25]
32
+ [
33
+ #0=="Frog"
34
+ @lastname.onmatch="Bats"
35
+ count()==2
36
+ ]
37
+ """)
38
+ for i, line in enumerate( path.next() ):
39
+ print(f"{i}: {line}")
40
+ print(f"path vars: {path.variables}")
41
+
42
+ The csvpath says:
43
+ - Open test.csv
44
+ - Scan lines 5 through 25
45
+ - Match the second time we see a line where the first column equals "Frog" and set the variable called "lastname" to "Bats"
46
+
47
+ Another path that does the same thing might look like:
48
+
49
+ path = CsvPath()
50
+ path.parse("""$test.csv
51
+ [5-25]
52
+ [
53
+ #0=="Frog"
54
+ @lastname.onmatch="Bats"
55
+ count()==2
56
+ print( count()==2, "$.match_count: $.line")
57
+ ]
58
+ """)
59
+ path.fast_forward()
60
+
61
+
62
+ You can use the `CsvPaths` class to set up a list of named file paths so that you can have more concise csvpaths. Named paths can take the form of:
63
+ - A JSON file with a dictionary of file paths under name keys
64
+ - A dict object passed into the CsvPaths object containing the same named path structure
65
+ - The path to a csv file that will be put into the named paths dict under its name minus extension
66
+ - A file system path pointing to a directory that will be used to populate the named paths dict with all contained files
67
+
68
+ You can then use a csvpath like `$logical_name[*][yes()]` to apply the csvpath to the file named `logical_name` in the CsvPaths object's named paths dict. This use is nearly transparent:
69
+
70
+ paths = CsvPaths(filename = "my_named_paths.json")
71
+ path = paths.csvpath()
72
+ path.parse( """$test[*][#firstname=="Fred"]""" )
73
+ path.collect()
74
+
75
+ If my_named_paths.json contains the following structure, the name `test` will be used to find `test/test_resources/test.csv`. The parse method will apply the csvpath and the collect method will gather all the matched rows.
76
+
77
+ { "test":"test/test_resources/test.csv" }
78
+
79
+ # Scanning
80
+ The scanner enumerates lines. For each line returned, the line number, the scanned line count, and the match count are available. The set of line numbers scanned is also available.
81
+
82
+ The scan part of the path starts with a dollar sign to indicate the root, meaning the file from the top. After the dollar sign comes the file path. The scanning instructions are in a bracket. The rules are:
83
+ - `[*]` means all
84
+ - `[3*]` means starting from line 3 and going to the end of the file
85
+ - `[3]` by itself means just line 3
86
+ - `[1-3]` means lines 1 through 3
87
+ - `[1+3]` means line 1 and line 3
88
+ - `[1+3-8]` means line 1 and lines 3 through 8
89
+
90
+ # Matching
91
+ The match part is also bracketed. Matches have space separated components or "values" that are ANDed together. The components' order is important. A match component is one of several types:
92
+ <table>
93
+ <tr>
94
+ <td>Type</td>
95
+ <td>Returns</td>
96
+ <td>Matches</td>
97
+ <td>Description</td>
98
+ <td>Examples</td>
99
+ </tr>
100
+ <tr>
101
+ <td>Term </td><td> Value </td><td> True when used alone, otherwise calculated </td>
102
+ <td>A quoted string or date, optionally quoted number, or
103
+ regex. Regex features are limited. A regex is wrapped in "/" characters and
104
+ only has regex functionality when used in the regex() function.</td>
105
+ <td>
106
+ <li/> `"Massachusetts"`
107
+ <li/> `89.7`
108
+ <li/> `/[0-9a-zA-Z]+!/`
109
+ </td>
110
+ </tr>
111
+ <tr>
112
+ <td>Function </td><td> Calculated </td><td> Calculated </td>
113
+ <td>A function name followed by parentheses. Functions can
114
+ contain terms, variables, headers and other functions. Some functions
115
+ take a specific or unlimited number of types as arguments.
116
+ Certain functions can take qualifiers. An `onmatch` qualifier indicates that
117
+ the function should be applied only when the whole path matches.
118
+ Some functions optionally take an arbitrary name qualifier to better name a tracking variable.
119
+ Qualifiers are described below. </td>
120
+ <td>
121
+ <li/> `not(count()==2)`
122
+ <li/> `add( 5, 3, 1 )`
123
+ <li/> `concat( end(), regex(#0, /[0-5]+abc/))`
124
+ </td>
125
+ </tr>
126
+ <tr>
127
+ <td>Variable </td>
128
+ <td>Value</td>
129
+ <td>True/False when value tested. True when set, True/False existence when used alone</td>
130
+ <td>An @ followed by a name. A variable is
131
+ set or tested depending on the usage. By itself, it is an existence test. When used as
132
+ the left hand side of an "=" its value is set.
133
+ When it is used on either side of an "==" it is an equality test.
134
+ Variables can take an `onmatch` qualifier to indicate that the variable should
135
+ only be set when the row matches all parts of the path.</td>
136
+ <td>
137
+ <li/> `@weather="cloudy"`
138
+ <li/> `count(@weather=="sunny")`
139
+ <li/> `@weather`
140
+ <li/> `#summer==@weather`
141
+
142
+ #1 is an assignment that sets the variable and returns True. #2 is an argument used as a test in a way that is specific to the function. #3 is an existence test. #4 is a test.
143
+ </td>
144
+ </tr>
145
+ <tr>
146
+ <td>Header </td>
147
+ <td>Value </td>
148
+ <td>A True/False existence test when used alone, otherwise calculated</td>
149
+ <td>A # followed by a name or integer. The name references a value in line 0, the header
150
+ row. A number references a column by the 0-based column order. </td>
151
+ <td>
152
+ <li/> `#firstname`
153
+ <li/> `#3`
154
+ </td>
155
+ </tr>
156
+ <tr>
157
+ <td>Equality</td>
158
+ <td>Calculated </td>
159
+ <td>True at assignment, otherwise calculated </td>
160
+ <td>Two of the other types joined with an "=" or "==".</td>
161
+ <td>
162
+ <li/> `@type_of_tree="Oak"`
163
+ <li/> `#name == @type_of_tree`
164
+ </td>
165
+ </tr>
166
+ </table>
167
+
168
+ A variable can be assigned early in the match part of a path and used later in that same path. The assignment and use will both be in the context of the same row in the file. For example:
169
+
170
+ [@a=#b #c==@a]
171
+
172
+ Can also be written as:
173
+
174
+ [#c==#b]
175
+
176
+ Variables and some functions can take qualifiers on their name. A qualifier takes the form of a dot plus a qualification name. At the moment there are only two qualifiers:
177
+
178
+ - `onmatch` to indicate that action on the variable or function only happens when the whole path matches a row
179
+ - An arbitrary string to add a name for the function's internal use, typically to name a variable
180
+
181
+ Qualifiers look like:
182
+
183
+ [ @myvar.onmatch = yes() ]
184
+
185
+ Or:
186
+
187
+ [ @i = increment.this_is_my_increment.onmatch(yes(), 3) ]
188
+
189
+ When multiple qualifiers are used, order is not important.
190
+
191
+ ## Example
192
+ [ #common_name #0=="field" @tail.onmatch=end() not(in(@tail, 'short|medium')) ]
193
+
194
+ In the path above, the rules applied are:
195
+ - `#common_name` indicates a header named "common_name". Headers are the values in the 0th line. This component of the match is an existence test.
196
+ - `#0` means the 1st column, counting from 0
197
+ - Functions and column references are ANDed together
198
+ - `@tail` creates a variable named "tail" and sets it to the value of the last column if all else matches
199
+ - Functions can contain functions, equality tests, and/or literals
200
+
201
+ Variables are always set unless they are flagged with `.onmatch`. That means:
202
+
203
+ $file.csv[*][ @imcounting.onmatch = count_lines() no()]
204
+
205
+ will never set `imcounting`, because of the `no()` function disallowing any matches, but:
206
+
207
+ $file.csv[*][ @imcounting = count_lines() no()]
208
+
209
+ will always set it.
210
+
211
+ Most of the work of matching is done in functions. The match functions are the following.
212
+
213
+
214
+ <table>
215
+ <tr><th> Group </th><th>Function </th><th> What it does </th></tr>
216
+ <tr><td> Boolean </td><td> </td><td> </td></tr>
217
+ <tr><td> </td><td> <a href='csvpath/matching/functions/any.md'>any(value, value)</a> </td><td> existence test across a range of places </td></tr>
218
+ <tr><td> </td><td> <a href='csvpath/matching/functions/no.md'>no()</a> </td><td> always false </td></tr>
219
+ <tr><td> </td><td> not(value) </td><td> negates a value </td></tr>
220
+ <tr><td> </td><td> or(value, value,...) </td><td> match any one </td></tr>
221
+ <tr><td> </td><td> yes() </td><td> always true </td></tr>
222
+ <tr><td> </td><td> <a href='csvpath/matching/functions/in.md'>in(value, list)</a> </td><td> match in a pipe-delimited list </td></tr>
223
+ <tr><td> Math </td><td> </td><td> </td></tr>
224
+ <tr><td> </td><td> add(value, value, ...) </td><td> adds numbers </td></tr>
225
+ <tr><td> </td><td> divide(value, value, ...) </td><td> divides numbers </td></tr>
226
+ <tr><td> </td><td> multiply(value, value, ...) </td><td> multiplies numbers </td></tr>
227
+ <tr><td> </td><td> subtract(value, value, ...) </td><td> subtracts numbers </td></tr>
228
+ <tr><td> </td><td> after(value) </td><td> finds things after a date, number, string </td></tr>
229
+ <tr><td> </td><td> before(value) </td><td> finds things before a date, number, string </td></tr>
230
+ <tr><td> Stats </td><td> </td><td> </td></tr>
231
+ <tr><td> </td><td> <a href='csvpath/matching/functions/average.md'>average(number, type)</a> </td><td> returns the average up to current "line", "scan", "match" </td></tr>
232
+ <tr><td> </td><td> median(value, type) </td><td> median value up to current "line", "scan", "match" </td></tr>
233
+ <tr><td> </td><td> max(value, type) </td><td> largest value seen up to current "line", "scan", "match" </td></tr>
234
+ <tr><td> </td><td> min(value, type) </td><td> smallest value seen up to current "line", "scan", "match" </td></tr>
235
+ <tr><td> </td><td> percent(type) </td><td> % of total lines for "scan", "match", "line" </td></tr>
236
+ <tr><td> Counting </td><td> </td><td> </td></tr>
237
+ <tr><td> </td><td> <a href='csvpath/matching/functions/count.md'>count()</a> </td><td> counts the number of matches </td></tr>
238
+ <tr><td> </td><td> <a href='csvpath/matching/functions/count.md'>count(value)</a> </td><td> count matches of value </td></tr>
239
+ <tr><td> </td><td> count_lines() </td><td> count lines to this point in the file </td></tr>
240
+ <tr><td> </td><td> count_scans() </td><td> counts the lines we checked for a match </td></tr>
241
+ <tr><td> </td><td> <a href='csvpath/matching/functions/first.md'>first(value, value, ...)</a> </td><td> match the first occurrence and capture line </td></tr>
242
+ <tr><td> </td><td> <a href='csvpath/matching/functions/increment.md'>increment(value, n)</a> </td><td> increments a variable by n each time seen </td></tr>
243
+ <tr><td> </td><td> <a href='csvpath/matching/functions/every.md'>every(value, number)</a> </td><td> match every Nth time a value is seen </td></tr>
244
+ <tr><td> </td><td> <a href='csvpath/matching/functions/tally.md'>tally(value, value, ...)</a></td><td> counts times values are seen, including as a set </td></tr>
245
+ <tr><td> Strings </td><td> </td><td> </td></tr>
246
+ <tr><td> </td><td> concat(value, value) </td><td> joins two values </td></tr>
247
+ <tr><td> </td><td> length(value) </td><td> returns the length of the value </td></tr>
248
+ <tr><td> </td><td> lower(value) </td><td> makes value lowercase </td></tr>
249
+ <tr><td> </td><td> regex(regex-string, value) </td><td> match on a regular expression </td></tr>
250
+ <tr><td> </td><td> substring(value, int) </td><td> returns the first n chars from the value </td></tr>
251
+ <tr><td> </td><td> upper(value) </td><td> makes value uppercase </td></tr>
252
+ <tr><td> Columns </td><td> </td><td> </td></tr>
253
+ <tr><td> </td><td> end() </td><td> returns the value of the last column </td></tr>
254
+ <tr><td> </td><td> column(value) </td><td> returns column name for an index or index for a name </td></tr>
255
+ <tr><td> Other </td><td> </td><td> </td></tr>
256
+ <tr><td> </td><td> header() </td><td> indicates to another function to look in headers </td></tr>
257
+ <tr><td> </td><td> isinstance(value, typestr) </td><td> tests for "int","float","complex","bool","usd" </td></tr>
258
+ <tr><td> </td><td> <a href='csvpath/matching/functions/now.md'>now(format)</a></td><td> a datetime, optionally formatted </td></tr>
259
+ <tr><td> </td><td> <a href='csvpath/matching/functions/print.md'>print(value, str)</a></td><td> prints the interpolated string when the value matches </td></tr>
260
+ <tr><td> </td><td> random(starting, ending) </td><td> generates a random int from starting to ending </td></tr>
261
+ <tr><td> </td><td> <a href='csvpath/matching/functions/stop.md'>stop(value)</a> </td><td> stops path scanning if a condition is met </td></tr>
262
+ <tr><td> </td><td> <a href='csvpath/matching/functions/when.md'>when(value, value)</a> </td><td> activate a value when a condition matches </td></tr>
263
+ <tr><td> </td><td> variable() </td><td> indicates to another function to look in variables </td></tr>
264
+ </tr>
265
+ </table>
266
+
267
+ # Not Ready For Production
268
+ Anything could change and performance could be better. This project is a hobby.
269
+
270
+
271
+
272
+
273
+
274
+
275
+
276
+
277
+
278
+
279
+
280
+
281
+