csvpath 0.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. csvpath-0.0.2/PKG-INFO +184 -0
  2. csvpath-0.0.2/README.md +169 -0
  3. csvpath-0.0.2/csvpath/__init__.py +1 -0
  4. csvpath-0.0.2/csvpath/csvpath.py +368 -0
  5. csvpath-0.0.2/csvpath/matching/__init__.py +1 -0
  6. csvpath-0.0.2/csvpath/matching/expression_encoder.py +108 -0
  7. csvpath-0.0.2/csvpath/matching/expression_math.py +123 -0
  8. csvpath-0.0.2/csvpath/matching/expression_utility.py +29 -0
  9. csvpath-0.0.2/csvpath/matching/functions/above.py +36 -0
  10. csvpath-0.0.2/csvpath/matching/functions/add.py +24 -0
  11. csvpath-0.0.2/csvpath/matching/functions/below.py +36 -0
  12. csvpath-0.0.2/csvpath/matching/functions/concat.py +25 -0
  13. csvpath-0.0.2/csvpath/matching/functions/count.py +44 -0
  14. csvpath-0.0.2/csvpath/matching/functions/count_lines.py +12 -0
  15. csvpath-0.0.2/csvpath/matching/functions/count_scans.py +13 -0
  16. csvpath-0.0.2/csvpath/matching/functions/divide.py +30 -0
  17. csvpath-0.0.2/csvpath/matching/functions/end.py +18 -0
  18. csvpath-0.0.2/csvpath/matching/functions/every.py +33 -0
  19. csvpath-0.0.2/csvpath/matching/functions/first.py +46 -0
  20. csvpath-0.0.2/csvpath/matching/functions/function.py +31 -0
  21. csvpath-0.0.2/csvpath/matching/functions/function_factory.py +114 -0
  22. csvpath-0.0.2/csvpath/matching/functions/inf.py +38 -0
  23. csvpath-0.0.2/csvpath/matching/functions/is_instance.py +95 -0
  24. csvpath-0.0.2/csvpath/matching/functions/length.py +33 -0
  25. csvpath-0.0.2/csvpath/matching/functions/lower.py +21 -0
  26. csvpath-0.0.2/csvpath/matching/functions/minf.py +167 -0
  27. csvpath-0.0.2/csvpath/matching/functions/multiply.py +27 -0
  28. csvpath-0.0.2/csvpath/matching/functions/no.py +10 -0
  29. csvpath-0.0.2/csvpath/matching/functions/notf.py +26 -0
  30. csvpath-0.0.2/csvpath/matching/functions/now.py +33 -0
  31. csvpath-0.0.2/csvpath/matching/functions/orf.py +28 -0
  32. csvpath-0.0.2/csvpath/matching/functions/percent.py +29 -0
  33. csvpath-0.0.2/csvpath/matching/functions/random.py +33 -0
  34. csvpath-0.0.2/csvpath/matching/functions/regex.py +38 -0
  35. csvpath-0.0.2/csvpath/matching/functions/subtract.py +28 -0
  36. csvpath-0.0.2/csvpath/matching/functions/tally.py +36 -0
  37. csvpath-0.0.2/csvpath/matching/functions/upper.py +21 -0
  38. csvpath-0.0.2/csvpath/matching/matcher.py +215 -0
  39. csvpath-0.0.2/csvpath/matching/matching_lexer.py +66 -0
  40. csvpath-0.0.2/csvpath/matching/parser.out +1287 -0
  41. csvpath-0.0.2/csvpath/matching/parsetab.py +1427 -0
  42. csvpath-0.0.2/csvpath/matching/productions/equality.py +158 -0
  43. csvpath-0.0.2/csvpath/matching/productions/expression.py +16 -0
  44. csvpath-0.0.2/csvpath/matching/productions/header.py +30 -0
  45. csvpath-0.0.2/csvpath/matching/productions/matchable.py +41 -0
  46. csvpath-0.0.2/csvpath/matching/productions/term.py +11 -0
  47. csvpath-0.0.2/csvpath/matching/productions/variable.py +15 -0
  48. csvpath-0.0.2/csvpath/parser_utility.py +39 -0
  49. csvpath-0.0.2/csvpath/scanning/__init__.py +1 -0
  50. csvpath-0.0.2/csvpath/scanning/parser.out +1 -0
  51. csvpath-0.0.2/csvpath/scanning/parsetab.py +231 -0
  52. csvpath-0.0.2/csvpath/scanning/scanner.py +165 -0
  53. csvpath-0.0.2/csvpath/scanning/scanning_lexer.py +47 -0
  54. csvpath-0.0.2/pyproject.toml +22 -0
csvpath-0.0.2/PKG-INFO ADDED
@@ -0,0 +1,184 @@
1
+ Metadata-Version: 2.1
2
+ Name: csvpath
3
+ Version: 0.0.2
4
+ Summary:
5
+ Author: David Kershaw
6
+ Author-email: dk107dk@hotmail.com
7
+ Requires-Python: >=3.12,<4.0
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Requires-Dist: ply (>=3.11,<4.0)
11
+ Requires-Dist: pytest (>=8.2.2,<9.0.0)
12
+ Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
13
+ Description-Content-Type: text/markdown
14
+
15
+
16
+ # CsvPath
17
+
18
+ CsvPath defines a declarative syntax for inspecting and updating CSV files. Though much simpler, it is similar to:
19
+ - XPath: CsvPath is to a CSV file like XPath is to an XML file
20
+ - Schematron: Schematron is basically XPath rules applied using XSLT. CsvPath paths can be used as validation rules.
21
+ - CSS selectors: CsvPath picks out structured data in a similar way to how CSS selectors pick out HTML structures.
22
+
23
+ CsvPath is intended to fit with other DataOps and data quality tools. Files are streamed. The interface is simple. Custom functions can be added.
24
+
25
+ # Usage
26
+ CsvPath paths have two parts, scanning and matching. For usage, see the unit tests in [tests/test_scanner.py](tests/test_scanner.py), [tests/test_matcher.py](tests/test_matcher.py) and [tests/test_functions.py](tests/test_functions.py).
27
+
28
+ path = CsvPath(delimiter=",")
29
+ path.parse('$test.csv[5-25][#0=="Frog" @lastname="Bats" count()==2]')
30
+ for i, line in enumerate( path.next() ):
31
+ print(f"{i}: {line}")
32
+
33
+ print(f"path vars: {path.variables}")
34
+
35
+ This scanning and matching path says:
36
+ - Open test.csv
37
+ - Scan lines 5 through 25
38
+ - Match the second time we see a line where the first column equals "Frog" and set the variable called "lastname" to "Bats"
39
+
40
+ # Scanning
41
+ The scanner enumerates lines. For each line returned, the line number, the scanned line count, and the match count are available. The set of line numbers scanned is also available.
42
+
43
+ The scan part of the path starts with '$' to indicate the root, meaning the file from the top. After the '$' comes the file path. The scanning instructions are in a bracket. The rules are:
44
+ - `[*]` means all
45
+ - `[3*]` means starting from line 3 and going to the end of the file
46
+ - `[3]` by itself means just line 3
47
+ - `[1-3]` means lines 1 through 3
48
+ - `[1+3]` means line 1 and line 3
49
+ - `[1+3-8]` means line 1 and lines 3 through 8
50
+
51
+ # Matching
52
+ The match part is also bracketed. Matches have space separated
53
+ components that are ANDed together. A match component is one of several types:
54
+ <table>
55
+ <tr>
56
+ <td>Type</td>
57
+ <td>Returns</td>
58
+ <td>Matches</td>
59
+ <td>Description</td>
60
+ <td>Examples</td>
61
+ </tr>
62
+ <tr>
63
+ <td>Term </td><td> Value </td><td> True when used alone, otherwise calculated </td>
64
+ <td>A quoted string or date, optionally quoted number, or
65
+ regex. Regex features are limited. A regex is wrapped in "/" characters.</td>
66
+ <td>
67
+ <li/> `"Massachusetts"`
68
+ <li/> `89.7`
69
+ <li/> `/[0-9a-zA-Z]+!/`
70
+ </td>
71
+ </tr>
72
+ <tr>
73
+ <td>Function </td><td> Calculated </td><td> Calculated </td>
74
+ <td>A function name followed by parentheses. Functions can
75
+ contain terms, variables, headers and other functions. Some functions
76
+ take a specific or unlimited number of types as arguments. </td>
77
+ <td>
78
+ <li/> `not(count()==2)`
79
+ </td>
80
+ </tr>
81
+ <tr>
82
+ <td>Variable </td>
83
+ <td>Value</td>
84
+ <td>True/False when value tested. True when set, True/False existence when used alone</td>
85
+ <td>An @ followed by a name. A variable is
86
+ set or tested depending on the usage. By itself, it is an existence test. When used as
87
+ the left hand side of an "=" its value is set.
88
+ When it is used on either side of an "==" it is an equality test.</td>
89
+ <td>
90
+ <li/> `@weather="cloudy"`
91
+ <li/> `count(@weather=="sunny")`
92
+ <li/> `@weather`
93
+ <li/> `#summer==@weather`
94
+
95
+ The first example is an assignment that sets the variable and returns True. The second is an argument used as a test in a way that is specific to the function. The third is an existence test. The fourth is an equality test.
96
+ </td>
97
+ </tr>
98
+ <tr>
99
+ <td>Header </td>
100
+ <td>Value </td>
101
+ <td>True/False existence when used alone, otherwise calculated</td>
102
+ <td>A # followed by a name or integer. The name references a value in line 0, the header
103
+ row. A number references a column by the 0-based column order. </td>
104
+ <td>
105
+ <li/> `#firstname`
106
+ <li/> `#3`
107
+ </td>
108
+ </tr>
109
+ <tr>
110
+ <td>Equality</td>
111
+ <td>Calculated </td>
112
+ <td>True at assignment, otherwise calculated </td>
113
+ <td>Two of the other types joined with an "=" or "==".</td>
114
+ <td>
115
+ <li/> `@type_of_tree="Oak"`
116
+ <li/> `#name == @type_of_tree`
117
+ </td>
118
+ </tr>
119
+ </table>
120
+
121
+ [ #common_name #0=="field" @tail=end() not(in(@tail, 'short|medium')) ]
122
+
123
+ In the path above, the rules applied are:
124
+ - `#common_name` indicates a header named "common_name". Headers are the values in the 0th line. This component of the match is an existence test.
125
+ - `#0` means the 1st column, counting from 0
126
+ - Functions and column references are ANDed together
127
+ - `@tail` creates a variable named "tail" and sets it to the value of the last column
128
+ - Functions can contain functions, equality tests, and/or literals
129
+
130
+ Most of the work of matching is done in functions. The match functions are:
131
+
132
+ | Function | What it does |Done|
133
+ |-------------------------------|-----------------------------------------------------------|----|
134
+ | add(value, value, ...) | adds numbers | X |
135
+ | after(value) | finds things after a date, number, string | X |
136
+ | average(number, type) | returns the average up to current "line", "scan", "match" | X |
137
+ | before(value) | finds things before a date, number, string | X |
138
+ | concat(value, value) | joins two values into one string | X |
139
+ | count() | counts the number of matches | X |
140
+ | count(value) | count matches of value | X |
141
+ | count_lines() | count lines to this point in the file | X |
142
+ | count_scans() | count lines we checked for match | X |
143
+ | divide(value, value, ...) | divides numbers | X |
144
+ | end() | returns the value of the last column | X |
145
+ | every(value, number) | match every Nth time a value is seen | X |
146
+ | first(value) | match the first occurrence and capture line | X |
147
+ | in(value, list) | match in a pipe-delimited list | X |
148
+ | increment(value, n) | increments a variable by n each time seen | |
149
+ | isinstance(value, typestr) | tests for "int","float","complex","bool","usd" | X |
150
+ | length(value) | returns the length of the value | X |
151
+ | lower(value) | makes value lowercase | X |
152
+ | max(value, type) | largest value seen up to current "line", "scan", "match" | X |
153
+ | median(value, type) | median value up to current "line", "scan", "match" | X |
154
+ | min(value, type) | smallest value seen up to current "line", "scan", "match" | X |
155
+ | multiply(value, value, ...) | multiplies numbers | X |
156
+ | no() | always false | X |
157
+ | not(value) | negates a value | X |
158
+ | now(format) | a datetime, optionally formatted | X |
159
+ | or(value, value,...) | match any one | X |
160
+ | percent(type) | % of total lines for "scan", "match", "line" | X |
161
+ | random(list) | pick from a list | |
162
+ | random(starting, ending) | generates a random int from starting to ending | X |
163
+ | regex(regex-string) | match on a regular expression | X |
164
+ | subtract(value, value, ...) | subtracts numbers | X |
165
+ | tally(value, value, ...) | counts times values are seen, including as a set | X |
166
+ | then(y,m,d,hh,mm,ss,format) | a datetime, optionally formatted | |
167
+ | upper(value) | makes value uppercase | X |
168
+
169
+ # Not Ready For Production
170
+ Anything could change. This project is a hobby.
171
+
172
+
173
+
174
+
175
+
176
+
177
+
178
+
179
+
180
+
181
+
182
+
183
+
184
+
@@ -0,0 +1,169 @@
1
+
2
+ # CsvPath
3
+
4
+ CsvPath defines a declarative syntax for inspecting and updating CSV files. Though much simpler, it is similar to:
5
+ - XPath: CsvPath is to a CSV file like XPath is to an XML file
6
+ - Schematron: Schematron is basically XPath rules applied using XSLT. CsvPath paths can be used as validation rules.
7
+ - CSS selectors: CsvPath picks out structured data in a similar way to how CSS selectors pick out HTML structures.
8
+
9
+ CsvPath is intended to fit with other DataOps and data quality tools. Files are streamed. The interface is simple. Custom functions can be added.
10
+
11
+ # Usage
12
+ CsvPath paths have two parts, scanning and matching. For usage, see the unit tests in [tests/test_scanner.py](tests/test_scanner.py), [tests/test_matcher.py](tests/test_matcher.py) and [tests/test_functions.py](tests/test_functions.py).
13
+
14
+ path = CsvPath(delimiter=",")
15
+ path.parse('$test.csv[5-25][#0=="Frog" @lastname="Bats" count()==2]')
16
+ for i, line in enumerate( path.next() ):
17
+ print(f"{i}: {line}")
18
+
19
+ print(f"path vars: {path.variables}")
20
+
21
+ This scanning and matching path says:
22
+ - Open test.csv
23
+ - Scan lines 5 through 25
24
+ - Match the second time we see a line where the first column equals "Frog" and set the variable called "lastname" to "Bats"
25
+
26
+ # Scanning
27
+ The scanner enumerates lines. For each line returned, the line number, the scanned line count, and the match count are available. The set of line numbers scanned is also available.
28
+
29
+ The scan part of the path starts with '$' to indicate the root, meaning the file from the top. After the '$' comes the file path. The scanning instructions are in a bracket. The rules are:
30
+ - `[*]` means all
31
+ - `[3*]` means starting from line 3 and going to the end of the file
32
+ - `[3]` by itself means just line 3
33
+ - `[1-3]` means lines 1 through 3
34
+ - `[1+3]` means line 1 and line 3
35
+ - `[1+3-8]` means line 1 and lines 3 through 8
36
+
37
+ # Matching
38
+ The match part is also bracketed. Matches have space separated
39
+ components that are ANDed together. A match component is one of several types:
40
+ <table>
41
+ <tr>
42
+ <td>Type</td>
43
+ <td>Returns</td>
44
+ <td>Matches</td>
45
+ <td>Description</td>
46
+ <td>Examples</td>
47
+ </tr>
48
+ <tr>
49
+ <td>Term </td><td> Value </td><td> True when used alone, otherwise calculated </td>
50
+ <td>A quoted string or date, optionally quoted number, or
51
+ regex. Regex features are limited. A regex is wrapped in "/" characters.</td>
52
+ <td>
53
+ <li/> `"Massachusetts"`
54
+ <li/> `89.7`
55
+ <li/> `/[0-9a-zA-Z]+!/`
56
+ </td>
57
+ </tr>
58
+ <tr>
59
+ <td>Function </td><td> Calculated </td><td> Calculated </td>
60
+ <td>A function name followed by parentheses. Functions can
61
+ contain terms, variables, headers and other functions. Some functions
62
+ take a specific or unlimited number of types as arguments. </td>
63
+ <td>
64
+ <li/> `not(count()==2)`
65
+ </td>
66
+ </tr>
67
+ <tr>
68
+ <td>Variable </td>
69
+ <td>Value</td>
70
+ <td>True/False when value tested. True when set, True/False existence when used alone</td>
71
+ <td>An @ followed by a name. A variable is
72
+ set or tested depending on the usage. By itself, it is an existence test. When used as
73
+ the left hand side of an "=" its value is set.
74
+ When it is used on either side of an "==" it is an equality test.</td>
75
+ <td>
76
+ <li/> `@weather="cloudy"`
77
+ <li/> `count(@weather=="sunny")`
78
+ <li/> `@weather`
79
+ <li/> `#summer==@weather`
80
+
81
+ The first example is an assignment that sets the variable and returns True. The second is an argument used as a test in a way that is specific to the function. The third is an existence test. The fourth is an equality test.
82
+ </td>
83
+ </tr>
84
+ <tr>
85
+ <td>Header </td>
86
+ <td>Value </td>
87
+ <td>True/False existence when used alone, otherwise calculated</td>
88
+ <td>A # followed by a name or integer. The name references a value in line 0, the header
89
+ row. A number references a column by the 0-based column order. </td>
90
+ <td>
91
+ <li/> `#firstname`
92
+ <li/> `#3`
93
+ </td>
94
+ </tr>
95
+ <tr>
96
+ <td>Equality</td>
97
+ <td>Calculated </td>
98
+ <td>True at assignment, otherwise calculated </td>
99
+ <td>Two of the other types joined with an "=" or "==".</td>
100
+ <td>
101
+ <li/> `@type_of_tree="Oak"`
102
+ <li/> `#name == @type_of_tree`
103
+ </td>
104
+ </tr>
105
+ </table>
106
+
107
+ [ #common_name #0=="field" @tail=end() not(in(@tail, 'short|medium')) ]
108
+
109
+ In the path above, the rules applied are:
110
+ - `#common_name` indicates a header named "common_name". Headers are the values in the 0th line. This component of the match is an existence test.
111
+ - `#0` means the 1st column, counting from 0
112
+ - Functions and column references are ANDed together
113
+ - `@tail` creates a variable named "tail" and sets it to the value of the last column
114
+ - Functions can contain functions, equality tests, and/or literals
115
+
116
+ Most of the work of matching is done in functions. The match functions are:
117
+
118
+ | Function | What it does |Done|
119
+ |-------------------------------|-----------------------------------------------------------|----|
120
+ | add(value, value, ...) | adds numbers | X |
121
+ | after(value) | finds things after a date, number, string | X |
122
+ | average(number, type) | returns the average up to current "line", "scan", "match" | X |
123
+ | before(value) | finds things before a date, number, string | X |
124
+ | concat(value, value) | joins two values into one string | X |
125
+ | count() | counts the number of matches | X |
126
+ | count(value) | count matches of value | X |
127
+ | count_lines() | count lines to this point in the file | X |
128
+ | count_scans() | count lines we checked for match | X |
129
+ | divide(value, value, ...) | divides numbers | X |
130
+ | end() | returns the value of the last column | X |
131
+ | every(value, number) | match every Nth time a value is seen | X |
132
+ | first(value) | match the first occurrence and capture line | X |
133
+ | in(value, list) | match in a pipe-delimited list | X |
134
+ | increment(value, n) | increments a variable by n each time seen | |
135
+ | isinstance(value, typestr) | tests for "int","float","complex","bool","usd" | X |
136
+ | length(value) | returns the length of the value | X |
137
+ | lower(value) | makes value lowercase | X |
138
+ | max(value, type) | largest value seen up to current "line", "scan", "match" | X |
139
+ | median(value, type) | median value up to current "line", "scan", "match" | X |
140
+ | min(value, type) | smallest value seen up to current "line", "scan", "match" | X |
141
+ | multiply(value, value, ...) | multiplies numbers | X |
142
+ | no() | always false | X |
143
+ | not(value) | negates a value | X |
144
+ | now(format) | a datetime, optionally formatted | X |
145
+ | or(value, value,...) | match any one | X |
146
+ | percent(type) | % of total lines for "scan", "match", "line" | X |
147
+ | random(list) | pick from a list | |
148
+ | random(starting, ending) | generates a random int from starting to ending | X |
149
+ | regex(regex-string) | match on a regular expression | X |
150
+ | subtract(value, value, ...) | subtracts numbers | X |
151
+ | tally(value, value, ...) | counts times values are seen, including as a set | X |
152
+ | then(y,m,d,hh,mm,ss,format) | a datetime, optionally formatted | |
153
+ | upper(value) | makes value uppercase | X |
154
+
155
+ # Not Ready For Production
156
+ Anything could change. This project is a hobby.
157
+
158
+
159
+
160
+
161
+
162
+
163
+
164
+
165
+
166
+
167
+
168
+
169
+
@@ -0,0 +1 @@
1
+