csvpath 0.0.22__tar.gz → 0.0.41__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. {csvpath-0.0.22 → csvpath-0.0.41}/PKG-INFO +175 -34
  2. {csvpath-0.0.22 → csvpath-0.0.41}/README.md +174 -33
  3. csvpath-0.0.41/csvpath/__init__.py +8 -0
  4. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/csvpath.py +87 -55
  5. csvpath-0.0.41/csvpath/csvpaths.py +88 -0
  6. csvpath-0.0.41/csvpath/matching/expression_utility.py +112 -0
  7. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/above.py +1 -1
  8. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/add.py +5 -3
  9. csvpath-0.0.41/csvpath/matching/functions/any.md +33 -0
  10. csvpath-0.0.41/csvpath/matching/functions/any.py +104 -0
  11. csvpath-0.0.41/csvpath/matching/functions/average.md +21 -0
  12. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/below.py +1 -1
  13. csvpath-0.0.41/csvpath/matching/functions/column.py +23 -0
  14. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/concat.py +8 -8
  15. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/count.md +2 -2
  16. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/count.py +3 -3
  17. csvpath-0.0.41/csvpath/matching/functions/count_lines.py +10 -0
  18. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/count_scans.py +1 -1
  19. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/divide.py +4 -4
  20. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/end.py +1 -1
  21. csvpath-0.0.41/csvpath/matching/functions/equals.py +39 -0
  22. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/every.py +3 -2
  23. csvpath-0.0.41/csvpath/matching/functions/exists.py +27 -0
  24. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/first.py +2 -2
  25. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/function.py +1 -0
  26. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/function_factory.py +63 -34
  27. csvpath-0.0.41/csvpath/matching/functions/header.md +13 -0
  28. csvpath-0.0.41/csvpath/matching/functions/header.py +10 -0
  29. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/increment.md +6 -6
  30. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/increment.py +7 -11
  31. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/inf.py +1 -6
  32. csvpath-0.0.41/csvpath/matching/functions/last.md +12 -0
  33. csvpath-0.0.41/csvpath/matching/functions/last.py +14 -0
  34. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/length.py +1 -6
  35. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/lower.py +1 -1
  36. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/minf.py +19 -15
  37. csvpath-0.0.41/csvpath/matching/functions/mod.py +27 -0
  38. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/multiply.py +3 -3
  39. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/no.py +1 -1
  40. csvpath-0.0.41/csvpath/matching/functions/nonef.py +10 -0
  41. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/notf.py +1 -3
  42. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/now.py +1 -7
  43. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/orf.py +2 -2
  44. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/percent.py +1 -1
  45. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/print.md +3 -1
  46. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/printf.py +29 -22
  47. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/random.py +1 -1
  48. csvpath-0.0.41/csvpath/matching/functions/regex.py +35 -0
  49. csvpath-0.0.41/csvpath/matching/functions/stop.md +24 -0
  50. csvpath-0.0.41/csvpath/matching/functions/stop.py +23 -0
  51. csvpath-0.0.41/csvpath/matching/functions/substring.py +32 -0
  52. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/subtract.py +3 -4
  53. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/tally.py +2 -2
  54. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/upper.py +1 -1
  55. csvpath-0.0.41/csvpath/matching/functions/variable.md +13 -0
  56. csvpath-0.0.41/csvpath/matching/functions/variable.py +10 -0
  57. csvpath-0.0.41/csvpath/matching/functions/when._ +16 -0
  58. csvpath-0.0.41/csvpath/matching/functions/when.__ +29 -0
  59. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/yes.py +1 -1
  60. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/matcher.py +42 -42
  61. csvpath-0.0.41/csvpath/matching/matching_lexer.py +122 -0
  62. csvpath-0.0.41/csvpath/matching/parser.out +2275 -0
  63. csvpath-0.0.41/csvpath/matching/parsetab.py +2490 -0
  64. csvpath-0.0.41/csvpath/matching/productions/__init__.py +8 -0
  65. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/productions/equality.py +27 -47
  66. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/productions/expression.py +1 -0
  67. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/productions/header.py +14 -6
  68. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/productions/matchable.py +59 -4
  69. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/productions/term.py +4 -0
  70. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/productions/variable.py +22 -3
  71. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/parser_utility.py +5 -3
  72. {csvpath-0.0.22 → csvpath-0.0.41}/pyproject.toml +1 -1
  73. csvpath-0.0.22/csvpath/matching/expression_math.py +0 -123
  74. csvpath-0.0.22/csvpath/matching/expression_utility.py +0 -29
  75. csvpath-0.0.22/csvpath/matching/functions/count_lines.py +0 -12
  76. csvpath-0.0.22/csvpath/matching/functions/is_instance.py +0 -95
  77. csvpath-0.0.22/csvpath/matching/functions/regex.py +0 -38
  78. csvpath-0.0.22/csvpath/matching/matching_lexer.py +0 -66
  79. csvpath-0.0.22/csvpath/matching/parser.out +0 -1287
  80. csvpath-0.0.22/csvpath/matching/parsetab.py +0 -1427
  81. {csvpath-0.0.22/csvpath → csvpath-0.0.41/csvpath/matching}/__init__.py +0 -0
  82. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/expression_encoder.py +0 -0
  83. {csvpath-0.0.22/csvpath/matching → csvpath-0.0.41/csvpath/matching/functions}/__init__.py +0 -0
  84. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/every.md +0 -0
  85. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/first.md +0 -0
  86. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/in.md +0 -0
  87. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/no.md +0 -0
  88. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/now.md +0 -0
  89. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/matching/functions/tally.md +0 -0
  90. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/scanning/__init__.py +0 -0
  91. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/scanning/parser.out +0 -0
  92. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/scanning/parsetab.py +0 -0
  93. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/scanning/scanner.py +0 -0
  94. {csvpath-0.0.22 → csvpath-0.0.41}/csvpath/scanning/scanning_lexer.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: csvpath
3
- Version: 0.0.22
3
+ Version: 0.0.41
4
4
  Summary:
5
5
  Author: David Kershaw
6
6
  Author-email: dk107dk@hotmail.com
@@ -19,26 +19,120 @@ Description-Content-Type: text/markdown
19
19
 
20
20
  CsvPath defines a declarative syntax for inspecting and updating CSV files. Though much simpler, it is similar to:
21
21
  - XPath: CsvPath is to a CSV file like XPath is to an XML file
22
- - Schematron: Schematron is basically XPath rules applied using XSLT. CsvPath paths can be used as validation rules.
23
- - CSS selectors: CsvPath picks out structured data in a similar way to how CSS selectors pick out HTML structures.
22
+ - Schematron: Schematron validation is basically XPath rules applied using XSLT. CsvPath paths can be used as validation rules.
23
+ - CSS selectors: CsvPath picks out structured data in a conceptually similar way to how CSS selectors pick out HTML structures.
24
24
 
25
25
  CsvPath is intended to fit with other DataOps and data quality tools. Files are streamed. The interface is simple. Custom functions can be added.
26
26
 
27
27
  # Usage
28
- CsvPath paths have two parts, scanning and matching. For usage, see the unit tests in [tests/test_scanner.py](tests/test_scanner.py), [tests/test_matcher.py](tests/test_matcher.py) and [tests/test_functions.py](tests/test_functions.py).
29
28
 
30
- path = CsvPath(delimiter=",")
31
- path.parse("$test.csv[5-25][#0=="Frog" @lastname="Bats" count()==2]")
29
+ CsvPath paths have three parts:
30
+ - a "root" file name
31
+ - a scanning part
32
+ - a matching part
33
+
34
+ The root starts with `$`. The match and scan parts are enclosed by brackets.
35
+
36
+ A very simple csvpath might look like this:
37
+
38
+ $filename[*][yes()]
39
+
40
+ This path says open the file named `filename`, scan all the lines, and match every line scanned.
41
+
42
+ The filename following the `$` can be an actual relative or absolute file path. It could alternatively be a logical identifier that points indirectly to a physical file, as described below.
43
+
44
+ ## Running CsvPath
45
+
46
+ There are two classes that do all the work: CsvPath and CsvPaths. Each has very few external methods.
47
+ - CsvPath
48
+ - parse() applies a csvpath to a file
49
+ - next() iterates over the matched rows
50
+ - fast_forward() processes all rows
51
+ - collect() processes all rows and collects the lines that matched as lists
52
+ - CsvPaths
53
+ - csvpath() gets a CsvPath that knows all the file names available
54
+ - set_named_files() sets the file names as a Dict[str,str] of named paths
55
+ - set_file_path() sets the file names from a JSON file of named paths or a single .csv file or a directory of .csv files
56
+
57
+ This is a very basic use of CsvPath. For more usage, see the unit tests.
58
+
59
+ path = CsvPath()
60
+ path.parse("""$test.csv
61
+ [5-25]
62
+ [
63
+ #0=="Frog"
64
+ @lastname.onmatch="Bats"
65
+ count()==2
66
+ ]
67
+ """)
32
68
  for i, line in enumerate( path.next() ):
33
69
  print(f"{i}: {line}")
34
-
35
70
  print(f"path vars: {path.variables}")
36
71
 
37
- This scanning and matching path says:
72
+ The csvpath says:
38
73
  - Open test.csv
39
74
  - Scan lines 5 through 25
40
75
  - Match the second time we see a line where the first column equals "Frog" and set the variable called "lastname" to "Bats"
41
76
 
77
+ Another path that does the same thing a bit more simply might look like:
78
+
79
+ """$test.csv
80
+ [5-25]
81
+ [
82
+ #0=="Frog"
83
+ @lastname.onmatch="Bats"
84
+ count()==2 -> print( "$.match_count: $.line")
85
+ ]
86
+ """
87
+
88
+ In this case we're using the "when" operator, `->`, to determine when to print.
89
+
90
+ ## The print function
91
+
92
+ The `print` function has several uses, including:
93
+ - Debugging csvpaths
94
+ - Validating CSV files
95
+ - Creating new CSV files based on an existing file
96
+
97
+ ### Validating CSV
98
+
99
+ CsvPath paths can be used for rules-based validation. Rules-based validation checks a file against content and structure rules but does not validate the file's structure against a schema. This validation approach is similar to XML's Schematron validation, where XPath rules are applied to XML.
100
+
101
+ There is no "standard" way to do CsvPath validation. The simplest way is to create csvpaths that print a validation message when a rule fails. For example:
102
+
103
+ $test.csv[*][@failed = equals(#firstname, "Frog")
104
+ @failed.asbool -> print("Error: Check line $.line_count for a row with the name Frog")]
105
+
106
+ Several rules can exist in the same csvpath for convenience and/or performance. Alternatively, you can run separate csvpaths for each rule.
107
+
108
+ ### Creating new CSV files
109
+
110
+ Csvpaths can use the `print` function to generate new file content on standard output. Redirecting the output to a file is an easy way to create a new CSV file based on an existing file. For example:
111
+
112
+ $test.csv[*][ line_count()==0 -> print("lastname, firstname, say")
113
+ above(line_count(), 0) -> print("$.headers.lastname, $.headers.firstname, $.headers.say")]
114
+
115
+ This csvpath reorders the headers of the test file at `tests/test_resources/test.csv`. The output file will have a header row.
116
+
117
+ ## Named files
118
+
119
+ You can use the `CsvPaths` class to set up a list of named file paths so that you can have more concise csvpaths. Named paths can take the form of:
120
+ - A JSON file with a dictionary of file paths under name keys
121
+ - A dict object passed into the CsvPaths object containing the same named path structure
122
+ - The path to a csv file that will be put into the named paths dict under its name minus extension
123
+ - A file system path pointing to a directory that will be used to populate the named paths dict with all contained files
124
+
125
+ You can then use a csvpath like `$logical_name[*][yes()]` to apply the csvpath to the file named `logical_name` in the CsvPaths object's named paths dict. This use is nearly transparent:
126
+
127
+ paths = CsvPaths(filename = "my_named_paths.json")
128
+ path = paths.csvpath()
129
+ path.parse( """$test[*][#firstname=="Fred"]""" )
130
+ path.collect()
131
+
132
+ If my_named_paths.json contains the following structure, the name `test` will be used to find `tests/test_resources/test.csv`. The parse method will apply the csvpath and the collect method will gather all the matched rows.
133
+
134
+ { "test":"tests/test_resources/test.csv" }
135
+
42
136
  # Scanning
43
137
  The scanner enumerates lines. For each line returned, the line number, the scanned line count, and the match count are available. The set of line numbers scanned is also available.
44
138
 
@@ -51,8 +145,7 @@ The scan part of the path starts with a dollar sign to indicate the root, meanin
51
145
  - `[1+3-8]` means line 1 and lines 3 through eight
52
146
 
53
147
  # Matching
54
- The match part is also bracketed. Matches have space separated
55
- components or "values" that are ANDed together. A match component is one of several types:
148
+ The match part is also bracketed. Matches have space separated components or "values" that are ANDed together. The components' order is important. A match component is one of several types:
56
149
  <table>
57
150
  <tr>
58
151
  <td>Type</td>
@@ -90,26 +183,36 @@ Qualifiers are described below. </td>
90
183
  <tr>
91
184
  <td>Variable </td>
92
185
  <td>Value</td>
93
- <td>True/False when value tested. True when set, True/False existence when used alone</td>
94
- <td>An @ followed by a name. A variable is
95
- set or tested depending on the usage. By itself, it is an existence test. When used as
96
- the left hand side of an "=" its value is set.
97
- When it is used on either side of an "==" it is an equality test.
98
- Variables can take an `onmatch` qualifier to indicate that the variable should
99
- only be set when the row matches all parts of the path.
186
+ <td>True when set unless `onchange` determines True/False.</td>
187
+ <td>
188
+ <p>
189
+ An @ followed by a name. A variable is set or tested depending on the usage. When used as the left hand side of an "=" its value is set. When it is used on either side of an "==" it is an equality test.
190
+ </p>
191
+ <p>
192
+ Variables can take an `onmatch` qualifier to indicate that the variable should only be set when the row matches all parts of the path.
193
+ </p>
194
+ <p>
195
+ A variable can also take an `onchange` qualifier to make its assignment only match when its value changes. In the usual case, a variable assignment always matches, making it not a factor in the row's matching or not matching. With `onchange` the assignment can determine if the row fails to match the csvpath.
196
+ </p>
197
+ <p>
198
+ Note that at present a variable assignment of an equality test is not possible using `==`. In the future the csvpath grammar may be improved to address this gap. In the interim, use the `equals(value,value)` function. I.e., instead of
199
+ @test = @cat == @hat
200
+ use
201
+ @test = equals(@cat, @hat)
202
+ </p>
100
203
  <td>
101
204
  <li/> `@weather="cloudy"`
102
205
  <li/> `count(@weather=="sunny")`
103
- <li/> `@weather`
104
206
  <li/> `#summer==@weather`
207
+ <li/> `@happy.onchange=#weather`
105
208
 
106
- #1 is an assignment that sets the variable and returns True. #2 is an argument used as a test in a way that is specific to the function. #3 is an existence test. #4 is a test.
209
+ #1 is an assignment that sets the variable and returns True. #2 is an argument used as a test in a way that is specific to the function. #3 is a test. #4 sets the `happy` variable to the value of the `weather` header and fails the row matching until `happy`'s value changes.
107
210
  </td>
108
211
  </tr>
109
212
  <tr>
110
213
  <td>Header </td>
111
214
  <td>Value </td>
112
- <td>A True/False existence test when used alone, otherwise calculated</td>
215
+ <td>Calculated</td>
113
216
  <td>A # followed by a name or integer. The name references a value in line 0, the header
114
217
  row. A number references a column by the 0-based column order. </td>
115
218
  <td>
@@ -129,9 +232,13 @@ only be set when the row matches all parts of the path.
129
232
  </tr>
130
233
  <table>
131
234
 
132
- Variables and some functions can take qualifiers on their name. A qualifier takes the form of a dot plus a qualification name. At the moment there are only two qualifiers:
235
+ ## Qualifiers
236
+
237
+ Variables and some functions can take qualifiers on their name. A qualifier takes the form of a dot plus a qualification name. At the moment there are only four qualifiers:
133
238
 
134
239
  - `onmatch` to indicate that action on the variable or function only happens when the whole path matches a row
240
+ - `onchange` set on a variable to indicate that a row should only match when the variable is set to a new value
241
+ - `asbool` set on a variable or header to have its value interpreted as a bool rather than just a simple `is not None` test
135
242
  - An arbitrary string to add a name for the function's internal use, typically to name a variable
136
243
 
137
244
  Qualifiers look like:
@@ -144,36 +251,54 @@ Or:
144
251
 
145
252
  When multiple qualifiers are used order is not important.
146
253
 
147
- ## Example
148
- [ #common_name #0=="field" @tail.onmatch=end() not(in(@tail, 'short</td><td>medium')) ]
254
+ ## Variables
149
255
 
150
- In the path above, the rules applied are:
151
- - `#common_name` indicates a header named "common_name". Headers are the values in the 0th line. This component of the match is an existence test.
152
- - `#2` means the 3rd column, counting from 0
153
- - Functions and column references are ANDed together
154
- - `@tail` creates a variable named "tail" and sets it to the value of the last column if all else matches
155
- - Functions can contain functions, equality tests, and/or literals
256
+ A variable can be assigned early in the match part of a path and used later in that same path. The assignment and use will both be in the context of the same row in the file. For example:
257
+
258
+ [@a=#b #c==@a]
259
+
260
+ Can also be written as:
261
+
262
+ [#c==#b]
156
263
 
157
- Variables are always set unless they are flagged with `.onmatch`. That means:
264
+ Variables are always set unless they are flagged with the `.onmatch` qualifier. That means:
158
265
 
159
266
  $file.csv[*][ @imcounting.onmatch = count_lines() no()]
160
267
 
161
- will never set `imcounting`, but:
268
+ will never set `imcounting`, because of the `no()` function disallowing any matches, but:
162
269
 
163
270
  $file.csv[*][ @imcounting = count_lines() no()]
164
271
 
165
272
  will always set it.
166
273
 
274
+ As noted above, a variable can be flagged with the `onchange` qualifier. The effect is that a row will only match if the variable qualified by `onchange` changes in value.
275
+
276
+ ## The when operator
277
+
278
+ `->`, the "when" operator, is used to act on a condition. `->` can take an equality or function on the left and trigger an equality, assignment, or function on the right. For example:
279
+
280
+ [ last() -> print("this is the last line") ]
281
+
282
+ Prints `this is the last line` just before the scan ends.
283
+
284
+ [ exists(#0) -> @firstname = #0 ]
285
+
286
+ Says to set the `firstname` variable to the value of the first column when the first column has a value.
287
+
288
+ ## Match functions
289
+
167
290
  Most of the work of matching is done in functions. The match functions are the following.
168
291
 
169
292
 
170
293
  <table>
171
294
  <tr><th> Group </th><th>Function </th><th> What it does </th></tr>
172
295
  <tr><td> Boolean </td><td> </td><td> </td></tr>
296
+ <tr><td> </td><td> <a href='csvpath/matching/functions/any.md'>any(value, value)</a> </td><td> existence test across a range of places </td></tr>
173
297
  <tr><td> </td><td> <a href='csvpath/matching/functions/no.md'>no()</a> </td><td> always false </td></tr>
174
298
  <tr><td> </td><td> not(value) </td><td> negates a value </td></tr>
175
299
  <tr><td> </td><td> or(value, value,...) </td><td> match any one </td></tr>
176
300
  <tr><td> </td><td> yes() </td><td> always true </td></tr>
301
+ <tr><td> </td><td> exists(value) </td><td> tests if the value exists </td></tr>
177
302
  <tr><td> </td><td> <a href='csvpath/matching/functions/in.md'>in(value, list)</a> </td><td> match in a pipe-delimited list </td></tr>
178
303
  <tr><td> Math </td><td> </td><td> </td></tr>
179
304
  <tr><td> </td><td> add(value, value, ...) </td><td> adds numbers </td></tr>
@@ -183,7 +308,7 @@ Most of the work of matching is done in functions. The match functions are the f
183
308
  <tr><td> </td><td> after(value) </td><td> finds things after a date, number, string </td></tr>
184
309
  <tr><td> </td><td> before(value) </td><td> finds things before a date, number, string </td></tr>
185
310
  <tr><td> Stats </td><td> </td><td> </td></tr>
186
- <tr><td> </td><td> average(number, type) </td><td> returns the average up to current "line", "scan", "match" </td></tr>
311
+ <tr><td> </td><td> <a href='csvpath/matching/functions/average.md'>average(number, type)</a> </td><td> returns the average up to current "line", "scan", "match" </td></tr>
187
312
  <tr><td> </td><td> median(value, type) </td><td> median value up to current "line", "scan", "match" </td></tr>
188
313
  <tr><td> </td><td> max(value, type) </td><td> largest value seen up to current "line", "scan", "match" </td></tr>
189
314
  <tr><td> </td><td> min(value, type) </td><td> smallest value seen up to current "line", "scan", "match" </td></tr>
@@ -202,16 +327,32 @@ Most of the work of matching is done in functions. The match functions are the f
202
327
  <tr><td> </td><td> length(value) </td><td> returns the length of the value </td></tr>
203
328
  <tr><td> </td><td> lower(value) </td><td> makes value lowercase </td></tr>
204
329
  <tr><td> </td><td> regex(regex-string, value) </td><td> match on a regular expression </td></tr>
330
+ <tr><td> </td><td> substring(value, int) </td><td> returns the first n chars from the value </td></tr>
205
331
  <tr><td> </td><td> upper(value) </td><td> makes value uppercase </td></tr>
206
- <tr><td> Other </td><td> </td><td> </td></tr>
332
+ <tr><td> Columns </td><td> </td><td> </td></tr>
207
333
  <tr><td> </td><td> end() </td><td> returns the value of the last column </td></tr>
208
- <tr><td> </td><td> isinstance(value, typestr) </td><td> tests for "int","float","complex","bool","usd" </td></tr>
334
+ <tr><td> </td><td> column(value) </td><td> returns column name for an index or index for a name </td></tr>
335
+ <tr><td> Other </td><td> </td><td> </td></tr>
336
+ <tr><td> </td><td> header() </td><td> indicates to another function to look in headers </td></tr>
209
337
  <tr><td> </td><td> <a href='csvpath/matching/functions/now.md'>now(format)</a></td><td> a datetime, optionally formatted </td></tr>
210
338
  <tr><td> </td><td> <a href='csvpath/matching/functions/print.md'>print(value, str)</a></td><td> when matches prints the interpolated string </td></tr>
211
339
  <tr><td> </td><td> random(starting, ending) </td><td> generates a random int from starting to ending </td>
340
+ <tr><td> </td><td> <a href='csvpath/matching/functions/stop.md'>stop(value)</a> </td><td> stops path scanning if a condition is met </td></tr>
341
+ <tr><td> </td><td> <a href='csvpath/matching/functions/when.md'>when(value, value)</a> </td><td> activate a value when a condition matches </td></tr>
342
+ <tr><td> </td><td> variable() </td><td> indicates to another function to look in variables </td></tr>
212
343
  </tr>
213
344
  </table>
214
345
 
346
+ ## Another Example
347
+ [ exists(#common_name) #0=="field" @tail.onmatch=end() not(in(@tail, 'short|medium')) ]
348
+
349
+ In the path above, the rules applied are:
350
+ - The exists test of `#common_name` checks if the header named "common_name" has a value. Headers are the values in the 0th line.
351
+ - `#2` means the 3rd column, counting from 0
352
+ - Functions and column references are ANDed together
353
+ - `@tail` creates a variable named "tail" and sets it to the value of the last column if all else matches
354
+ - Functions can contain functions, equality tests, and/or literals
355
+
215
356
  # Not Ready For Production
216
357
  Anything could change and performance could be better. This project is a hobby.
217
358
 
@@ -3,26 +3,120 @@
3
3
 
4
4
  CsvPath defines a declarative syntax for inspecting and updating CSV files. Though much simpler, it is similar to:
5
5
  - XPath: CsvPath is to a CSV file like XPath is to an XML file
6
- - Schematron: Schematron is basically XPath rules applied using XSLT. CsvPath paths can be used as validation rules.
7
- - CSS selectors: CsvPath picks out structured data in a similar way to how CSS selectors pick out HTML structures.
6
+ - Schematron: Schematron validation is basically XPath rules applied using XSLT. CsvPath paths can be used as validation rules.
7
+ - CSS selectors: CsvPath picks out structured data in a conceptually similar way to how CSS selectors pick out HTML structures.
8
8
 
9
9
  CsvPath is intended to fit with other DataOps and data quality tools. Files are streamed. The interface is simple. Custom functions can be added.
10
10
 
11
11
  # Usage
12
- CsvPath paths have two parts, scanning and matching. For usage, see the unit tests in [tests/test_scanner.py](tests/test_scanner.py), [tests/test_matcher.py](tests/test_matcher.py) and [tests/test_functions.py](tests/test_functions.py).
13
12
 
14
- path = CsvPath(delimiter=",")
15
- path.parse("$test.csv[5-25][#0=="Frog" @lastname="Bats" count()==2]")
13
+ CsvPath paths have three parts:
14
+ - a "root" file name
15
+ - a scanning part
16
+ - a matching part
17
+
18
+ The root starts with `$`. The match and scan parts are enclosed by brackets.
19
+
20
+ A very simple csvpath might look like this:
21
+
22
+ $filename[*][yes()]
23
+
24
+ This path says open the file named `filename`, scan all the lines, and match every line scanned.
25
+
26
+ The filename following the `$` can be an actual relative or absolute file path. It could alternatively be a logical identifier that points indirectly to a physical file, as described below.
27
+
28
+ ## Running CsvPath
29
+
30
+ There are two classes that do all the work: CsvPath and CsvPaths. Each has very few external methods.
31
+ - CsvPath
32
+ - parse() applies a csvpath to a file
33
+ - next() iterates over the matched rows
34
+ - fast_forward() processes all rows
35
+ - collect() processes all rows and collects the lines that matched as lists
36
+ - CsvPaths
37
+ - csvpath() gets a CsvPath that knows all the file names available
38
+ - set_named_files() sets the file names as a Dict[str,str] of named paths
39
+ - set_file_path() sets the file names from a JSON file of named paths or a single .csv file or a directory of .csv files
40
+
41
+ This is a very basic use of CsvPath. For more usage, see the unit tests.
42
+
43
+ path = CsvPath()
44
+ path.parse("""$test.csv
45
+ [5-25]
46
+ [
47
+ #0=="Frog"
48
+ @lastname.onmatch="Bats"
49
+ count()==2
50
+ ]
51
+ """)
16
52
  for i, line in enumerate( path.next() ):
17
53
  print(f"{i}: {line}")
18
-
19
54
  print(f"path vars: {path.variables}")
20
55
 
21
- This scanning and matching path says:
56
+ The csvpath says:
22
57
  - Open test.csv
23
58
  - Scan lines 5 through 25
24
59
  - Match the second time we see a line where the first column equals "Frog" and set the variable called "lastname" to "Bats"
25
60
 
61
+ Another path that does the same thing a bit more simply might look like:
62
+
63
+ """$test.csv
64
+ [5-25]
65
+ [
66
+ #0=="Frog"
67
+ @lastname.onmatch="Bats"
68
+ count()==2 -> print( "$.match_count: $.line")
69
+ ]
70
+ """
71
+
72
+ In this case we're using the "when" operator, `->`, to determine when to print.
73
+
74
+ ## The print function
75
+
76
+ The `print` function has several uses, including:
77
+ - Debugging csvpaths
78
+ - Validating CSV files
79
+ - Creating new CSV files based on an existing file
80
+
81
+ ### Validating CSV
82
+
83
+ CsvPath paths can be used for rules-based validation. Rules-based validation checks a file against content and structure rules but does not validate the file's structure against a schema. This validation approach is similar to XML's Schematron validation, where XPath rules are applied to XML.
84
+
85
+ There is no "standard" way to do CsvPath validation. The simplest way is to create csvpaths that print a validation message when a rule fails. For example:
86
+
87
+ $test.csv[*][@failed = equals(#firstname, "Frog")
88
+ @failed.asbool -> print("Error: Check line $.line_count for a row with the name Frog")]
89
+
90
+ Several rules can exist in the same csvpath for convenience and/or performance. Alternatively, you can run separate csvpaths for each rule.
91
+
92
+ ### Creating new CSV files
93
+
94
+ Csvpaths can use the `print` function to generate new file content on standard output. Redirecting the output to a file is an easy way to create a new CSV file based on an existing file. For example:
95
+
96
+ $test.csv[*][ line_count()==0 -> print("lastname, firstname, say")
97
+ above(line_count(), 0) -> print("$.headers.lastname, $.headers.firstname, $.headers.say")]
98
+
99
+ This csvpath reorders the headers of the test file at `tests/test_resources/test.csv`. The output file will have a header row.
100
+
101
+ ## Named files
102
+
103
+ You can use the `CsvPaths` class to set up a list of named file paths so that you can have more concise csvpaths. Named paths can take the form of:
104
+ - A JSON file with a dictionary of file paths under name keys
105
+ - A dict object passed into the CsvPaths object containing the same named path structure
106
+ - The path to a csv file that will be put into the named paths dict under its name minus extension
107
+ - A file system path pointing to a directory that will be used to populate the named paths dict with all contained files
108
+
109
+ You can then use a csvpath like `$logical_name[*][yes()]` to apply the csvpath to the file named `logical_name` in the CsvPaths object's named paths dict. This use is nearly transparent:
110
+
111
+ paths = CsvPaths(filename = "my_named_paths.json")
112
+ path = paths.csvpath()
113
+ path.parse( """$test[*][#firstname=="Fred"]""" )
114
+ path.collect()
115
+
116
+ If my_named_paths.json contains the following structure, the name `test` will be used to find `tests/test_resources/test.csv`. The parse method will apply the csvpath and the collect method will gather all the matched rows.
117
+
118
+ { "test":"tests/test_resources/test.csv" }
119
+
26
120
  # Scanning
27
121
  The scanner enumerates lines. For each line returned, the line number, the scanned line count, and the match count are available. The set of line numbers scanned is also available.
28
122
 
@@ -35,8 +129,7 @@ The scan part of the path starts with a dollar sign to indicate the root, meanin
35
129
  - `[1+3-8]` means line 1 and lines 3 through eight
36
130
 
37
131
  # Matching
38
- The match part is also bracketed. Matches have space separated
39
- components or "values" that are ANDed together. A match component is one of several types:
132
+ The match part is also bracketed. Matches have space separated components or "values" that are ANDed together. The components' order is important. A match component is one of several types:
40
133
  <table>
41
134
  <tr>
42
135
  <td>Type</td>
@@ -74,26 +167,36 @@ Qualifiers are described below. </td>
74
167
  <tr>
75
168
  <td>Variable </td>
76
169
  <td>Value</td>
77
- <td>True/False when value tested. True when set, True/False existence when used alone</td>
78
- <td>An @ followed by a name. A variable is
79
- set or tested depending on the usage. By itself, it is an existence test. When used as
80
- the left hand side of an "=" its value is set.
81
- When it is used on either side of an "==" it is an equality test.
82
- Variables can take an `onmatch` qualifier to indicate that the variable should
83
- only be set when the row matches all parts of the path.
170
+ <td>True when set unless `onchange` determines True/False.</td>
171
+ <td>
172
+ <p>
173
+ An @ followed by a name. A variable is set or tested depending on the usage. When used as the left hand side of an "=" its value is set. When it is used on either side of an "==" it is an equality test.
174
+ </p>
175
+ <p>
176
+ Variables can take an `onmatch` qualifier to indicate that the variable should only be set when the row matches all parts of the path.
177
+ </p>
178
+ <p>
179
+ A variable can also take an `onchange` qualifier to make its assignment only match when its value changes. In the usual case, a variable assignment always matches, making it not a factor in the row's matching or not matching. With `onchange` the assignment can determine if the row fails to match the csvpath.
180
+ </p>
181
+ <p>
182
+ Note that at present a variable assignment of an equality test is not possible using `==`. In the future the csvpath grammar may be improved to address this gap. In the interim, use the `equals(value,value)` function. I.e., instead of
183
+ @test = @cat == @hat
184
+ use
185
+ @test = equals(@cat, @hat)
186
+ </p>
84
187
  <td>
85
188
  <li/> `@weather="cloudy"`
86
189
  <li/> `count(@weather=="sunny")`
87
- <li/> `@weather`
88
190
  <li/> `#summer==@weather`
191
+ <li/> `@happy.onchange=#weather`
89
192
 
90
- #1 is an assignment that sets the variable and returns True. #2 is an argument used as a test in a way that is specific to the function. #3 is an existence test. #4 is a test.
193
+ #1 is an assignment that sets the variable and returns True. #2 is an argument used as a test in a way that is specific to the function. #3 is a test. #4 sets the `happy` variable to the value of the `weather` header and fails the row matching until `happy`'s value changes.
91
194
  </td>
92
195
  </tr>
93
196
  <tr>
94
197
  <td>Header </td>
95
198
  <td>Value </td>
96
- <td>A True/False existence test when used alone, otherwise calculated</td>
199
+ <td>Calculated</td>
97
200
  <td>A # followed by a name or integer. The name references a value in line 0, the header
98
201
  row. A number references a column by the 0-based column order. </td>
99
202
  <td>
@@ -113,9 +216,13 @@ only be set when the row matches all parts of the path.
113
216
  </tr>
114
217
  </table>
115
218
 
116
- Variables and some functions can take qualifiers on their name. A qualifier takes the form of a dot plus a qualification name. At the moment there are only two qualifiers:
219
+ ## Qualifiers
220
+
221
+ Variables and some functions can take qualifiers on their name. A qualifier takes the form of a dot plus a qualification name. At the moment there are only four qualifiers:
117
222
 
118
223
  - `onmatch` to indicate that action on the variable or function only happens when the whole path matches a row
224
+ - `onchange` set on a variable to indicate that a row should only match when the variable is set to a new value
225
+ - `asbool` set on a variable or header to have its value interpreted as a bool rather than just a simple `is not None` test
119
226
  - An arbitrary string to add a name for the function's internal use, typically to name a variable
120
227
 
121
228
  Qualifiers look like:
@@ -128,36 +235,54 @@ Or:
128
235
 
129
236
  When multiple qualifiers are used order is not important.
130
237
 
131
- ## Example
132
- [ #common_name #0=="field" @tail.onmatch=end() not(in(@tail, 'short</td><td>medium')) ]
238
+ ## Variables
133
239
 
134
- In the path above, the rules applied are:
135
- - `#common_name` indicates a header named "common_name". Headers are the values in the 0th line. This component of the match is an existence test.
136
- - `#2` means the 3rd column, counting from 0
137
- - Functions and column references are ANDed together
138
- - `@tail` creates a variable named "tail" and sets it to the value of the last column if all else matches
139
- - Functions can contain functions, equality tests, and/or literals
240
+ A variable can be assigned early in the match part of a path and used later in that same path. The assignment and use will both be in the context of the same row in the file. For example:
241
+
242
+ [@a=#b #c==@a]
243
+
244
+ Can also be written as:
245
+
246
+ [#c==#b]
140
247
 
141
- Variables are always set unless they are flagged with `.onmatch`. That means:
248
+ Variables are always set unless they are flagged with the `.onmatch` qualifier. That means:
142
249
 
143
250
  $file.csv[*][ @imcounting.onmatch = count_lines() no()]
144
251
 
145
- will never set `imcounting`, but:
252
+ will never set `imcounting`, because of the `no()` function disallowing any matches, but:
146
253
 
147
254
  $file.csv[*][ @imcounting = count_lines() no()]
148
255
 
149
256
  will always set it.
150
257
 
258
+ As noted above, a variable can be flagged with the `onchange` qualifier. The effect is that a row will only match if the variable qualified by `onchange` changes in value.
259
+
260
+ ## The when operator
261
+
262
+ `->`, the "when" operator, is used to act on a condition. `->` can take an equality or function on the left and trigger an equality, assignment, or function on the right. For example:
263
+
264
+ [ last() -> print("this is the last line") ]
265
+
266
+ Prints `this is the last line` just before the scan ends.
267
+
268
+ [ exists(#0) -> @firstname = #0 ]
269
+
270
+ Says to set the `firstname` variable to the value of the first column when the first column has a value.
271
+
272
+ ## Match functions
273
+
151
274
  Most of the work of matching is done in functions. The match functions are the following.
152
275
 
153
276
 
154
277
  <table>
155
278
  <tr><th> Group </th><th>Function </th><th> What it does </th></tr>
156
279
  <tr><td> Boolean </td><td> </td><td> </td></tr>
280
+ <tr><td> </td><td> <a href='csvpath/matching/functions/any.md'>any(value, value)</a> </td><td> existence test across a range of places </td></tr>
157
281
  <tr><td> </td><td> <a href='csvpath/matching/functions/no.md'>no()</a> </td><td> always false </td></tr>
158
282
  <tr><td> </td><td> not(value) </td><td> negates a value </td></tr>
159
283
  <tr><td> </td><td> or(value, value,...) </td><td> match any one </td></tr>
160
284
  <tr><td> </td><td> yes() </td><td> always true </td></tr>
285
+ <tr><td> </td><td> exists(value) </td><td> tests if the value exists </td></tr>
161
286
  <tr><td> </td><td> <a href='csvpath/matching/functions/in.md'>in(value, list)</a> </td><td> match in a pipe-delimited list </td></tr>
162
287
  <tr><td> Math </td><td> </td><td> </td></tr>
163
288
  <tr><td> </td><td> add(value, value, ...) </td><td> adds numbers </td></tr>
@@ -167,7 +292,7 @@ Most of the work of matching is done in functions. The match functions are the f
167
292
  <tr><td> </td><td> after(value) </td><td> finds things after a date, number, string </td></tr>
168
293
  <tr><td> </td><td> before(value) </td><td> finds things before a date, number, string </td></tr>
169
294
  <tr><td> Stats </td><td> </td><td> </td></tr>
170
- <tr><td> </td><td> average(number, type) </td><td> returns the average up to current "line", "scan", "match" </td></tr>
295
+ <tr><td> </td><td> <a href='csvpath/matching/functions/average.md'>average(number, type)</a> </td><td> returns the average up to current "line", "scan", "match" </td></tr>
171
296
  <tr><td> </td><td> median(value, type) </td><td> median value up to current "line", "scan", "match" </td></tr>
172
297
  <tr><td> </td><td> max(value, type) </td><td> largest value seen up to current "line", "scan", "match" </td></tr>
173
298
  <tr><td> </td><td> min(value, type) </td><td> smallest value seen up to current "line", "scan", "match" </td></tr>
@@ -186,16 +311,32 @@ Most of the work of matching is done in functions. The match functions are the f
186
311
  <tr><td> </td><td> length(value) </td><td> returns the length of the value </td></tr>
187
312
  <tr><td> </td><td> lower(value) </td><td> makes value lowercase </td></tr>
188
313
  <tr><td> </td><td> regex(regex-string, value) </td><td> match on a regular expression </td></tr>
314
+ <tr><td> </td><td> substring(value, int) </td><td> returns the first n chars from the value </td></tr>
189
315
  <tr><td> </td><td> upper(value) </td><td> makes value uppercase </td></tr>
190
- <tr><td> Other </td><td> </td><td> </td></tr>
316
+ <tr><td> Columns </td><td> </td><td> </td></tr>
191
317
  <tr><td> </td><td> end() </td><td> returns the value of the last column </td></tr>
192
- <tr><td> </td><td> isinstance(value, typestr) </td><td> tests for "int","float","complex","bool","usd" </td></tr>
318
+ <tr><td> </td><td> column(value) </td><td> returns column name for an index or index for a name </td></tr>
319
+ <tr><td> Other </td><td> </td><td> </td></tr>
320
+ <tr><td> </td><td> header() </td><td> indicates to another function to look in headers </td></tr>
193
321
  <tr><td> </td><td> <a href='csvpath/matching/functions/now.md'>now(format)</a></td><td> a datetime, optionally formatted </td></tr>
194
322
  <tr><td> </td><td> <a href='csvpath/matching/functions/print.md'>print(value, str)</a></td><td> when matches prints the interpolated string </td></tr>
195
323
  <tr><td> </td><td> random(starting, ending) </td><td> generates a random int from starting to ending </td></tr>
324
+ <tr><td> </td><td> <a href='csvpath/matching/functions/stop.md'>stop(value)</a> </td><td> stops path scanning if a condition is met </td></tr>
325
+ <tr><td> </td><td> <a href='csvpath/matching/functions/when.md'>when(value, value)</a> </td><td> activate a value when a condition matches </td></tr>
326
+ <tr><td> </td><td> variable() </td><td> indicates to another function to look in variables </td></tr>
196
327
  </tr>
197
328
  </table>
198
329
 
330
+ ## Another Example
331
+ [ exists(#common_name) #0=="field" @tail.onmatch=end() not(in(@tail, 'short|medium')) ]
332
+
333
+ In the path above, the rules applied are:
334
+ - The exists test of `#common_name` checks if the header named "common_name" has a value. Headers are the values in the 0th line.
335
+ - `#0` means the 1st column, counting from 0
336
+ - Functions and column references are ANDed together
337
+ - `@tail` creates a variable named "tail" and sets it to the value of the last column if all else matches
338
+ - Functions can contain functions, equality tests, and/or literals
339
+
199
340
  # Not Ready For Production
200
341
  Anything could change and performance could be better. This project is a hobby.
201
342
 
@@ -0,0 +1,8 @@
1
+ from csvpath.matching.matcher import Matcher
2
+ from csvpath.matching.expression_encoder import ExpressionEncoder
3
+ from csvpath.scanning.scanner import Scanner
4
+ from csvpath.csvpath import CsvPath
5
+ from csvpath.csvpaths import CsvPaths
6
+
7
+
8
+ __all__ = ["CsvPath", "CsvPaths"]