csvpath 0.0.2__tar.gz → 0.0.21__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csvpath-0.0.2/README.md → csvpath-0.0.21/PKG-INFO +41 -11
- csvpath-0.0.2/PKG-INFO → csvpath-0.0.21/README.md +24 -26
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/csvpath.py +77 -88
- csvpath-0.0.21/csvpath/matching/functions/count.md +28 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/count.py +11 -1
- csvpath-0.0.21/csvpath/matching/functions/every.md +58 -0
- csvpath-0.0.21/csvpath/matching/functions/every.py +47 -0
- csvpath-0.0.21/csvpath/matching/functions/first.md +23 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/first.py +10 -4
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/function.py +4 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/function_factory.py +21 -0
- csvpath-0.0.21/csvpath/matching/functions/in.md +25 -0
- csvpath-0.0.21/csvpath/matching/functions/no.md +13 -0
- csvpath-0.0.21/csvpath/matching/functions/now.md +13 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/now.py +0 -1
- csvpath-0.0.21/csvpath/matching/functions/print.md +37 -0
- csvpath-0.0.21/csvpath/matching/functions/printf.py +145 -0
- csvpath-0.0.21/csvpath/matching/functions/tally.md +29 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/tally.py +3 -3
- csvpath-0.0.21/csvpath/matching/functions/yes.py +10 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/matcher.py +45 -10
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/matching_lexer.py +1 -1
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/productions/equality.py +15 -12
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/productions/expression.py +4 -0
- csvpath-0.0.21/csvpath/matching/productions/header.py +39 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/productions/matchable.py +8 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/productions/term.py +3 -0
- csvpath-0.0.21/csvpath/matching/productions/variable.py +33 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/scanning/scanner.py +32 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/pyproject.toml +3 -1
- csvpath-0.0.2/csvpath/matching/functions/every.py +0 -33
- csvpath-0.0.2/csvpath/matching/productions/header.py +0 -30
- csvpath-0.0.2/csvpath/matching/productions/variable.py +0 -15
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/__init__.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/__init__.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/expression_encoder.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/expression_math.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/expression_utility.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/above.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/add.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/below.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/concat.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/count_lines.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/count_scans.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/divide.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/end.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/inf.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/is_instance.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/length.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/lower.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/minf.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/multiply.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/no.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/notf.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/orf.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/percent.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/random.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/regex.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/subtract.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/functions/upper.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/parser.out +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/matching/parsetab.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/parser_utility.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/scanning/__init__.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/scanning/parser.out +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/scanning/parsetab.py +0 -0
- {csvpath-0.0.2 → csvpath-0.0.21}/csvpath/scanning/scanning_lexer.py +0 -0
|
@@ -1,3 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: csvpath
|
|
3
|
+
Version: 0.0.21
|
|
4
|
+
Summary:
|
|
5
|
+
Author: David Kershaw
|
|
6
|
+
Author-email: dk107dk@hotmail.com
|
|
7
|
+
Requires-Python: >=3.12,<4.0
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
10
|
+
Requires-Dist: pandas (>=2.2.2,<3.0.0)
|
|
11
|
+
Requires-Dist: ply (>=3.11,<4.0)
|
|
12
|
+
Requires-Dist: polars (>=1.1.0,<2.0.0)
|
|
13
|
+
Requires-Dist: pytest (>=8.2.2,<9.0.0)
|
|
14
|
+
Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
1
17
|
|
|
2
18
|
# CsvPath
|
|
3
19
|
|
|
@@ -104,15 +120,26 @@ take a specific or unlimited number of types as arguments. </td>
|
|
|
104
120
|
</tr>
|
|
105
121
|
</table>
|
|
106
122
|
|
|
107
|
-
|
|
123
|
+
## Example
|
|
124
|
+
[ #common_name #0=="field" @tail.onmatch=end() not(in(@tail, 'short|medium')) ]
|
|
108
125
|
|
|
109
126
|
In the path above, the rules applied are:
|
|
110
127
|
- `#common_name` indicates a header named "common_name". Headers are the values in the 0th line. This component of the match is an existence test.
|
|
111
128
|
- `#2` means the 3rd column, counting from 0
|
|
112
129
|
- Functions and column references are ANDed together
|
|
113
|
-
- `@tail` creates a variable named "tail" and sets it to the value of the last column
|
|
130
|
+
- `@tail` creates a variable named "tail" and sets it to the value of the last column if all else matches
|
|
114
131
|
- Functions can contain functions, equality tests, and/or literals
|
|
115
132
|
|
|
133
|
+
Variables are always set unless they are flagged with `.onmatch`. That means:
|
|
134
|
+
|
|
135
|
+
$file.csv[*][ @imcounting.onmatch = count_lines() no()]
|
|
136
|
+
|
|
137
|
+
will never set `imcounting`, but:
|
|
138
|
+
|
|
139
|
+
$file.csv[*][ @imcounting = count_lines() no()]
|
|
140
|
+
|
|
141
|
+
will always set it.
|
|
142
|
+
|
|
116
143
|
Most of the work of matching is done in functions. The match functions are:
|
|
117
144
|
|
|
118
145
|
| Function | What it does |Done|
|
|
@@ -122,15 +149,15 @@ Most of the work of matching is done in functions. The match functions are:
|
|
|
122
149
|
| average(number, type) | returns the average up to current "line", "scan", "match" | X |
|
|
123
150
|
| before(value) | finds things before a date, number, string | X |
|
|
124
151
|
| concat(value, value) | concatenates two values | X |
|
|
125
|
-
| count() | counts the number of matches | X |
|
|
126
|
-
| count(value) | count matches of value | X |
|
|
152
|
+
| [count()](csvpath/matching/functions/count.md) | counts the number of matches | X |
|
|
153
|
+
| [count(value)](csvpath/matching/functions/count.md) | count matches of value | X |
|
|
127
154
|
| count_lines() | count lines to this point in the file | X |
|
|
128
155
|
| count_scans() | count lines we checked for match | X |
|
|
129
156
|
| divide(value, value, ...) | divides numbers | X |
|
|
130
157
|
| end() | returns the value of the last column | X |
|
|
131
|
-
| every(value, number) | match every Nth time a value is seen | X |
|
|
132
|
-
| first(value)
|
|
133
|
-
| in(value, list) | match in a pipe-delimited list | X |
|
|
158
|
+
| [every(value, number)](csvpath/matching/functions/every.md) | match every Nth time a value is seen | X |
|
|
159
|
+
| [first(value, value, ...)](csvpath/matching/functions/first.md) | match the first occurrence and capture line | X |
|
|
160
|
+
| [in(value, list)](csvpath/matching/functions/in.md) | match in a pipe-delimited list | X |
|
|
134
161
|
| increment(value, n) | increments a variable by n each time seen | |
|
|
135
162
|
| isinstance(value, typestr) | tests for "int","float","complex","bool","usd" | X |
|
|
136
163
|
| length(value) | returns the length of the value | X |
|
|
@@ -139,21 +166,24 @@ Most of the work of matching is done in functions. The match functions are:
|
|
|
139
166
|
| median(value, type) | median value up to current "line", "scan", "match" | X |
|
|
140
167
|
| min(value, type) | smallest value seen up to current "line", "scan", "match" | X |
|
|
141
168
|
| multiply(value, value, ...) | multiplies numbers | X |
|
|
142
|
-
| no() | always false | X |
|
|
169
|
+
| [no()](csvpath/matching/functions/no.md) | always false | X |
|
|
143
170
|
| not(value) | negates a value | X |
|
|
144
|
-
| now(format) | a datetime, optionally formatted | X |
|
|
171
|
+
| [now(format)](csvpath/matching/functions/now.md) | a datetime, optionally formatted | X |
|
|
145
172
|
| or(value, value,...) | match any one | X |
|
|
146
173
|
| percent(type) | % of total lines for "scan", "match", "line" | X |
|
|
174
|
+
| [print(value, str)](csvpath/matching/functions/print.md) | when matches prints the interpolated string | X |
|
|
147
175
|
| random(list) | pick from a list | |
|
|
148
176
|
| random(starting, ending) | generates a random int from starting to ending | X |
|
|
149
177
|
| regex(regex-string) | match on a regular expression | X |
|
|
150
178
|
| subtract(value, value, ...) | subtracts numbers | X |
|
|
151
|
-
| tally(value, value, ...) | counts times values are seen, including as a set | X |
|
|
179
|
+
| [tally(value, value, ...)](csvpath/matching/functions/tally.md) | counts times values are seen, including as a set | X |
|
|
152
180
|
| then(y,m,d,hh,mm,ss,format) | a datetime, optionally formatted | |
|
|
153
181
|
| upper(value) | makes value uppercase | X |
|
|
182
|
+
| yes() | always true | X |
|
|
154
183
|
|
|
155
184
|
# Not Ready For Production
|
|
156
|
-
Anything could change. This project is a hobby.
|
|
185
|
+
Anything could change and performance could be better. This project is a hobby.
|
|
186
|
+
|
|
157
187
|
|
|
158
188
|
|
|
159
189
|
|
|
@@ -1,17 +1,3 @@
|
|
|
1
|
-
Metadata-Version: 2.1
|
|
2
|
-
Name: csvpath
|
|
3
|
-
Version: 0.0.2
|
|
4
|
-
Summary:
|
|
5
|
-
Author: David Kershaw
|
|
6
|
-
Author-email: dk107dk@hotmail.com
|
|
7
|
-
Requires-Python: >=3.12,<4.0
|
|
8
|
-
Classifier: Programming Language :: Python :: 3
|
|
9
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
10
|
-
Requires-Dist: ply (>=3.11,<4.0)
|
|
11
|
-
Requires-Dist: pytest (>=8.2.2,<9.0.0)
|
|
12
|
-
Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
|
|
13
|
-
Description-Content-Type: text/markdown
|
|
14
|
-
|
|
15
1
|
|
|
16
2
|
# CsvPath
|
|
17
3
|
|
|
@@ -118,15 +104,26 @@ take a specific or unlimited number of types as arguments. </td>
|
|
|
118
104
|
</tr>
|
|
119
105
|
</table>
|
|
120
106
|
|
|
121
|
-
|
|
107
|
+
## Example
|
|
108
|
+
[ #common_name #0=="field" @tail.onmatch=end() not(in(@tail, 'short|medium')) ]
|
|
122
109
|
|
|
123
110
|
In the path above, the rules applied are:
|
|
124
111
|
- `#common_name` indicates a header named "common_name". Headers are the values in the 0th line. This component of the match is an existence test.
|
|
125
112
|
- `#2` means the 3rd column, counting from 0
|
|
126
113
|
- Functions and column references are ANDed together
|
|
127
|
-
- `@tail` creates a variable named "tail" and sets it to the value of the last column
|
|
114
|
+
- `@tail` creates a variable named "tail" and sets it to the value of the last column if all else matches
|
|
128
115
|
- Functions can contain functions, equality tests, and/or literals
|
|
129
116
|
|
|
117
|
+
Variables are always set unless they are flagged with `.onmatch`. That means:
|
|
118
|
+
|
|
119
|
+
$file.csv[*][ @imcounting.onmatch = count_lines() no()]
|
|
120
|
+
|
|
121
|
+
will never set `imcounting`, but:
|
|
122
|
+
|
|
123
|
+
$file.csv[*][ @imcounting = count_lines() no()]
|
|
124
|
+
|
|
125
|
+
will always set it.
|
|
126
|
+
|
|
130
127
|
Most of the work of matching is done in functions. The match functions are:
|
|
131
128
|
|
|
132
129
|
| Function | What it does |Done|
|
|
@@ -136,15 +133,15 @@ Most of the work of matching is done in functions. The match functions are:
|
|
|
136
133
|
| average(number, type) | returns the average up to current "line", "scan", "match" | X |
|
|
137
134
|
| before(value) | finds things before a date, number, string | X |
|
|
138
135
|
| concat(value, value) | concatenates two values | X |
|
|
139
|
-
| count() | counts the number of matches | X |
|
|
140
|
-
| count(value) | count matches of value | X |
|
|
136
|
+
| [count()](csvpath/matching/functions/count.md) | counts the number of matches | X |
|
|
137
|
+
| [count(value)](csvpath/matching/functions/count.md) | count matches of value | X |
|
|
141
138
|
| count_lines() | count lines to this point in the file | X |
|
|
142
139
|
| count_scans() | count lines we checked for match | X |
|
|
143
140
|
| divide(value, value, ...) | divides numbers | X |
|
|
144
141
|
| end() | returns the value of the last column | X |
|
|
145
|
-
| every(value, number) | match every Nth time a value is seen | X |
|
|
146
|
-
| first(value)
|
|
147
|
-
| in(value, list) | match in a pipe-delimited list | X |
|
|
142
|
+
| [every(value, number)](csvpath/matching/functions/every.md) | match every Nth time a value is seen | X |
|
|
143
|
+
| [first(value, value, ...)](csvpath/matching/functions/first.md) | match the first occurrence and capture line | X |
|
|
144
|
+
| [in(value, list)](csvpath/matching/functions/in.md) | match in a pipe-delimited list | X |
|
|
148
145
|
| increment(value, n) | increments a variable by n each time seen | |
|
|
149
146
|
| isinstance(value, typestr) | tests for "int","float","complex","bool","usd" | X |
|
|
150
147
|
| length(value) | returns the length of the value | X |
|
|
@@ -153,22 +150,23 @@ Most of the work of matching is done in functions. The match functions are:
|
|
|
153
150
|
| median(value, type) | median value up to current "line", "scan", "match" | X |
|
|
154
151
|
| min(value, type) | smallest value seen up to current "line", "scan", "match" | X |
|
|
155
152
|
| multiply(value, value, ...) | multiplies numbers | X |
|
|
156
|
-
| no() | always false | X |
|
|
153
|
+
| [no()](csvpath/matching/functions/no.md) | always false | X |
|
|
157
154
|
| not(value) | negates a value | X |
|
|
158
|
-
| now(format) | a datetime, optionally formatted | X |
|
|
155
|
+
| [now(format)](csvpath/matching/functions/now.md) | a datetime, optionally formatted | X |
|
|
159
156
|
| or(value, value,...) | match any one | X |
|
|
160
157
|
| percent(type) | % of total lines for "scan", "match", "line" | X |
|
|
158
|
+
| [print(value, str)](csvpath/matching/functions/print.md) | when matches prints the interpolated string | X |
|
|
161
159
|
| random(list) | pick from a list | |
|
|
162
160
|
| random(starting, ending) | generates a random int from starting to ending | X |
|
|
163
161
|
| regex(regex-string) | match on a regular expression | X |
|
|
164
162
|
| subtract(value, value, ...) | subtracts numbers | X |
|
|
165
|
-
| tally(value, value, ...) | counts times values are seen, including as a set | X |
|
|
163
|
+
| [tally(value, value, ...)](csvpath/matching/functions/tally.md) | counts times values are seen, including as a set | X |
|
|
166
164
|
| then(y,m,d,hh,mm,ss,format) | a datetime, optionally formatted | |
|
|
167
165
|
| upper(value) | makes value uppercase | X |
|
|
166
|
+
| yes() | always true | X |
|
|
168
167
|
|
|
169
168
|
# Not Ready For Production
|
|
170
|
-
Anything could change. This project is a hobby.
|
|
171
|
-
|
|
169
|
+
Anything could change and performance could be better. This project is a hobby.
|
|
172
170
|
|
|
173
171
|
|
|
174
172
|
|
|
@@ -5,6 +5,7 @@ from csvpath.matching.matcher import Matcher
|
|
|
5
5
|
from csvpath.matching.expression_encoder import ExpressionEncoder
|
|
6
6
|
from csvpath.matching.expression_math import ExpressionMath
|
|
7
7
|
from csvpath.scanning.scanner import Scanner
|
|
8
|
+
import time
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
class NoFileException(Exception):
|
|
@@ -13,7 +14,13 @@ class NoFileException(Exception):
|
|
|
13
14
|
|
|
14
15
|
class CsvPath:
|
|
15
16
|
def __init__(
|
|
16
|
-
self,
|
|
17
|
+
self,
|
|
18
|
+
*,
|
|
19
|
+
filename=None,
|
|
20
|
+
delimiter=",",
|
|
21
|
+
quotechar='"',
|
|
22
|
+
block_print=True,
|
|
23
|
+
skip_blank_lines=True,
|
|
17
24
|
):
|
|
18
25
|
self.filename = filename
|
|
19
26
|
self.scanner = None
|
|
@@ -30,60 +37,35 @@ class CsvPath:
|
|
|
30
37
|
self.quotechar = quotechar
|
|
31
38
|
self.block_print = block_print
|
|
32
39
|
self.total_lines = -1
|
|
33
|
-
self._verbose = False
|
|
34
40
|
self._dump_json = False
|
|
35
41
|
self._do_math = False # off by default, still experimental
|
|
36
42
|
self._collect_matchers = False
|
|
37
43
|
self.matchers = []
|
|
38
44
|
self.jsons = []
|
|
45
|
+
self.matcher = None
|
|
46
|
+
self.skip_blank_lines = skip_blank_lines
|
|
39
47
|
|
|
40
48
|
def dump_json(self):
|
|
41
49
|
self._dump_json = not self._dump_json
|
|
42
50
|
|
|
43
51
|
def parse(self, data):
|
|
52
|
+
start = time.time()
|
|
44
53
|
self.scanner = Scanner()
|
|
45
54
|
s, mat, mod = self._find_scan_match_modify(data)
|
|
46
55
|
self.scan = s
|
|
47
56
|
self.match = mat
|
|
48
57
|
self.modify = mod
|
|
49
58
|
self.scanner.parse(s)
|
|
50
|
-
|
|
51
|
-
|
|
59
|
+
end = time.time()
|
|
60
|
+
print(f"parsed: {end - start}")
|
|
61
|
+
self.get_total_lines_and_headers()
|
|
52
62
|
return self.scanner
|
|
53
63
|
|
|
54
|
-
def verbose(self, set_verbose: bool = True) -> None:
|
|
55
|
-
self._verbose = set_verbose
|
|
56
|
-
|
|
57
|
-
# prints what the user needs to see
|
|
58
|
-
def verbosity(self, msg: Any) -> None:
|
|
59
|
-
if self._verbose:
|
|
60
|
-
print(f"{msg}")
|
|
61
|
-
|
|
62
64
|
# prints what the developer needs to see
|
|
63
65
|
def print(self, msg: str) -> None:
|
|
64
66
|
if not self.block_print:
|
|
65
67
|
print(msg)
|
|
66
68
|
|
|
67
|
-
def _load_headers(self) -> None:
|
|
68
|
-
with open(self.scanner.filename, "r") as file:
|
|
69
|
-
reader = csv.reader(
|
|
70
|
-
file, delimiter=self.delimiter, quotechar=self.quotechar
|
|
71
|
-
)
|
|
72
|
-
for row in reader:
|
|
73
|
-
self.headers = row
|
|
74
|
-
break
|
|
75
|
-
hs = self.headers[:]
|
|
76
|
-
self.headers = []
|
|
77
|
-
for header in hs:
|
|
78
|
-
header = header.strip()
|
|
79
|
-
header = header.replace(";", "")
|
|
80
|
-
header = header.replace(",", "")
|
|
81
|
-
header = header.replace("|", "")
|
|
82
|
-
header = header.replace("\t", "")
|
|
83
|
-
header = header.replace("`", "")
|
|
84
|
-
self.headers.append(header)
|
|
85
|
-
self.verbosity(f"header: {header}")
|
|
86
|
-
|
|
87
69
|
def _find_scan_match_modify(self, data):
|
|
88
70
|
scan = ""
|
|
89
71
|
matches = ""
|
|
@@ -104,9 +86,6 @@ class CsvPath:
|
|
|
104
86
|
matches = matches if len(matches) > 0 else None
|
|
105
87
|
modify = modify.strip()
|
|
106
88
|
modify = modify if len(modify) > 0 else None
|
|
107
|
-
self.verbosity(f"scan: {scan}")
|
|
108
|
-
self.verbosity(f"matches: {matches}")
|
|
109
|
-
self.verbosity(f"modify: {modify}")
|
|
110
89
|
return scan, matches, modify
|
|
111
90
|
|
|
112
91
|
def __str__(self):
|
|
@@ -158,38 +137,81 @@ class CsvPath:
|
|
|
158
137
|
def next(self):
|
|
159
138
|
if self.scanner.filename is None:
|
|
160
139
|
raise NoFileException("there is no filename")
|
|
161
|
-
self.verbosity(f"filename: {self.scanner.filename}")
|
|
162
|
-
total_lines = -1
|
|
163
|
-
if self._verbose:
|
|
164
|
-
total_lines = self.get_total_lines()
|
|
165
|
-
self.verbosity(f"total lines: {total_lines}")
|
|
166
|
-
|
|
167
140
|
with open(self.scanner.filename, "r") as file:
|
|
168
141
|
reader = csv.reader(
|
|
169
142
|
file, delimiter=self.delimiter, quotechar=self.quotechar
|
|
170
143
|
)
|
|
144
|
+
start = time.time()
|
|
171
145
|
for line in reader:
|
|
172
|
-
self.
|
|
173
|
-
|
|
146
|
+
if self.skip_blank_lines and len(line) == 0:
|
|
147
|
+
continue
|
|
148
|
+
if self.scanner.includes(self.line_number):
|
|
174
149
|
self.scan_count = self.scan_count + 1
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
150
|
+
# from datetime import timedelta
|
|
151
|
+
# startmatch = time.perf_counter()
|
|
152
|
+
b = self.matches(line)
|
|
153
|
+
# endmatch = time.time()
|
|
154
|
+
# duration = timedelta(seconds=time.perf_counter()-startmatch)
|
|
155
|
+
if b:
|
|
178
156
|
self.match_count = self.match_count + 1
|
|
179
|
-
self.verbosity(f"match count: {self.match_count}")
|
|
180
157
|
yield line
|
|
158
|
+
# if self.scan_count < 100:
|
|
159
|
+
# print(f"match {self.scan_count}: {duration}")
|
|
181
160
|
self.line_number = self.line_number + 1
|
|
161
|
+
end = time.time()
|
|
162
|
+
print(f"iterated: {end - start}")
|
|
182
163
|
|
|
183
164
|
def get_total_lines(self) -> int:
|
|
184
165
|
if self.total_lines == -1:
|
|
166
|
+
return self.get_total_lines_and_headers()
|
|
167
|
+
return self.total_lines
|
|
168
|
+
|
|
169
|
+
def get_total_lines_and_headers(self) -> int:
|
|
170
|
+
if self.total_lines == -1:
|
|
171
|
+
start = time.time()
|
|
185
172
|
with open(self.scanner.filename, "r") as file:
|
|
186
173
|
reader = csv.reader(
|
|
187
174
|
file, delimiter=self.delimiter, quotechar=self.quotechar
|
|
188
175
|
)
|
|
176
|
+
i = 0
|
|
189
177
|
for line in reader:
|
|
178
|
+
if i == 0:
|
|
179
|
+
self.headers = line
|
|
180
|
+
i += 1
|
|
190
181
|
self.total_lines += 1
|
|
182
|
+
hs = self.headers[:]
|
|
183
|
+
self.headers = []
|
|
184
|
+
for header in hs:
|
|
185
|
+
header = header.strip()
|
|
186
|
+
header = header.replace(";", "")
|
|
187
|
+
header = header.replace(",", "")
|
|
188
|
+
header = header.replace("|", "")
|
|
189
|
+
header = header.replace("\t", "")
|
|
190
|
+
header = header.replace("`", "")
|
|
191
|
+
self.headers.append(header)
|
|
192
|
+
end = time.time()
|
|
193
|
+
print(f"lines and headers: {end - start}")
|
|
191
194
|
return self.total_lines
|
|
192
195
|
|
|
196
|
+
def _load_headers(self) -> None:
|
|
197
|
+
with open(self.scanner.filename, "r") as file:
|
|
198
|
+
reader = csv.reader(
|
|
199
|
+
file, delimiter=self.delimiter, quotechar=self.quotechar
|
|
200
|
+
)
|
|
201
|
+
for row in reader:
|
|
202
|
+
self.headers = row
|
|
203
|
+
break
|
|
204
|
+
hs = self.headers[:]
|
|
205
|
+
self.headers = []
|
|
206
|
+
for header in hs:
|
|
207
|
+
header = header.strip()
|
|
208
|
+
header = header.replace(";", "")
|
|
209
|
+
header = header.replace(",", "")
|
|
210
|
+
header = header.replace("|", "")
|
|
211
|
+
header = header.replace("\t", "")
|
|
212
|
+
header = header.replace("`", "")
|
|
213
|
+
self.headers.append(header)
|
|
214
|
+
|
|
193
215
|
def current_line_number(self) -> int:
|
|
194
216
|
return self.line_number
|
|
195
217
|
|
|
@@ -208,11 +230,14 @@ class CsvPath:
|
|
|
208
230
|
def matches(self, line) -> bool:
|
|
209
231
|
if not self.match:
|
|
210
232
|
return True
|
|
211
|
-
self.
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
233
|
+
if self.matcher is None:
|
|
234
|
+
self.matcher = Matcher(
|
|
235
|
+
csvpath=self, data=self.match, line=line, headers=self.headers
|
|
236
|
+
)
|
|
237
|
+
else:
|
|
238
|
+
self.matcher.reset()
|
|
239
|
+
self.matcher.line = line
|
|
240
|
+
matcher = self.matcher
|
|
216
241
|
|
|
217
242
|
if self._do_math:
|
|
218
243
|
em = ExpressionMath()
|
|
@@ -272,42 +297,6 @@ class CsvPath:
|
|
|
272
297
|
thevalue = self.variables[name]
|
|
273
298
|
return thevalue
|
|
274
299
|
|
|
275
|
-
def includes(self, line: int) -> bool:
|
|
276
|
-
from_line = self.scanner.from_line
|
|
277
|
-
to_line = self.scanner.to_line
|
|
278
|
-
all_lines = self.scanner.all_lines
|
|
279
|
-
these = self.scanner.these
|
|
280
|
-
return self._includes(
|
|
281
|
-
line, from_line=from_line, to_line=to_line, all_lines=all_lines, these=these
|
|
282
|
-
)
|
|
283
|
-
|
|
284
|
-
def _includes(
|
|
285
|
-
self,
|
|
286
|
-
line: int,
|
|
287
|
-
*,
|
|
288
|
-
from_line: int = None,
|
|
289
|
-
to_line: int = None,
|
|
290
|
-
all_lines: bool = None,
|
|
291
|
-
these: List[int] = [],
|
|
292
|
-
) -> bool:
|
|
293
|
-
if line is None:
|
|
294
|
-
return False
|
|
295
|
-
if from_line is None and all_lines:
|
|
296
|
-
return True
|
|
297
|
-
if from_line is not None and all_lines:
|
|
298
|
-
return line >= from_line
|
|
299
|
-
if from_line == line:
|
|
300
|
-
return True
|
|
301
|
-
if from_line is not None and to_line is not None and from_line > to_line:
|
|
302
|
-
return line >= to_line and line <= from_line
|
|
303
|
-
if from_line is not None and to_line is not None:
|
|
304
|
-
return line >= from_line and line <= to_line
|
|
305
|
-
if line in these:
|
|
306
|
-
return True
|
|
307
|
-
if to_line is not None:
|
|
308
|
-
return line < to_line
|
|
309
|
-
return False
|
|
310
|
-
|
|
311
300
|
def line_numbers(self) -> Iterator[int | str]:
|
|
312
301
|
these = self.scanner.these
|
|
313
302
|
from_line = self.scanner.from_line
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
|
|
2
|
+
# Count
|
|
3
|
+
|
|
4
|
+
Returns the number of matches. When used alone count() gives the total matches seen up to the current line in the file.
|
|
5
|
+
|
|
6
|
+
Matches can be scoped down to a contained existence test or equality. Counting an equality means a function, term, variable, or header compared to another function, term, variable, or header.
|
|
7
|
+
|
|
8
|
+
When the counted match is scoped to the contained existence or equality, the count is of values seen. When counting values seen the count function stores the value-integer pairs in a dict within CsvPath's variables under a key identifying the count function. The ID of the count function is a hash by default, making it difficult for a human to understand which count the key represents. To name the count use a qualifier on the count function. A qualifier is a name that follows the function name separated by a dot, as:
|
|
9
|
+
|
|
10
|
+
count.my_named_count(#0=True)
|
|
11
|
+
|
|
12
|
+
For example, you can do something like this:
|
|
13
|
+
|
|
14
|
+
$file.csv [*]
|
|
15
|
+
[
|
|
16
|
+
@t.onmatch=count.firstname_match(#firstname=="Ants")
|
|
17
|
+
#firstname=="Ants"
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
This path counts the matches of firstname and stores the count in the path's variables, so that the variables look like:
|
|
21
|
+
|
|
22
|
+
{'firstname_match':{True:1}}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
## Examples
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
|
|
@@ -20,6 +20,9 @@ class Count(Function):
|
|
|
20
20
|
# contribute to if there's a match
|
|
21
21
|
return self.value # or not. we have to act as if.
|
|
22
22
|
|
|
23
|
+
def matches(self, *, skip=[]) -> bool:
|
|
24
|
+
return self.value
|
|
25
|
+
|
|
23
26
|
def _get_match_count(self) -> int:
|
|
24
27
|
if not self.matcher or not self.matcher.csvpath:
|
|
25
28
|
print("WARNING: no csvpath. are we testing?")
|
|
@@ -31,13 +34,20 @@ class Count(Function):
|
|
|
31
34
|
# need to apply this count function to the contained obj's value
|
|
32
35
|
#
|
|
33
36
|
b = self._function_or_equality.matches(skip=skip)
|
|
34
|
-
|
|
37
|
+
if not b:
|
|
38
|
+
return False
|
|
39
|
+
self._id = (
|
|
40
|
+
self.qualifier
|
|
41
|
+
if self.qualifier is not None
|
|
42
|
+
else self.get_id(self._function_or_equality)
|
|
43
|
+
)
|
|
35
44
|
#
|
|
36
45
|
# to_value() is often going to be a bool based on matches().
|
|
37
46
|
# but in a case like: count(now('yyyy-mm-dd')) it would not be
|
|
38
47
|
#
|
|
39
48
|
tracked_value = self._function_or_equality.to_value(skip=skip)
|
|
40
49
|
cnt = self.matcher.get_variable(self._id, tracking=tracked_value, set_if_none=0)
|
|
50
|
+
# print(f"count: cnt: {cnt}, b: {b}, tracked value: {tracked_value}")
|
|
41
51
|
if b:
|
|
42
52
|
cnt += 1
|
|
43
53
|
self.matcher.set_variable(self._id, tracking=tracked_value, value=cnt)
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
|
|
2
|
+
# Every
|
|
3
|
+
|
|
4
|
+
Matches every N times a value is seen. Every takes two arguments: a value in the form of a function, header, or variable
|
|
5
|
+
and an int that indicates how many of the value must be seen for the counter to be increased.
|
|
6
|
+
|
|
7
|
+
Every creates two variables. One tracks the number of times a value is seen. The other tracks the number of times every() matched or didn't match.
|
|
8
|
+
|
|
9
|
+
## Examples
|
|
10
|
+
|
|
11
|
+
$file.csv[*]
|
|
12
|
+
[
|
|
13
|
+
@t.onmatch=count()
|
|
14
|
+
every.who(#lastname, 2)
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
This path matches every other time a given value of `lastname` is seen. It results in a variable like:
|
|
19
|
+
|
|
20
|
+
{'who_every': {'lastname': 1, 'Kermit': 1, 'Bat': 7}, 'who': {False: 6, True: 3}, 't': 3}
|
|
21
|
+
|
|
22
|
+
This result indicates that the lastname column had:
|
|
23
|
+
- 1 'lastname'
|
|
24
|
+
- 1 'Kermit'
|
|
25
|
+
- 7 'Bat'
|
|
26
|
+
|
|
27
|
+
Those counts resulted in 3 matches and 6 times no match. 'lastname' and 'Kermit' didn't match because they only appear 1 time each. We would have to see 'Kermit' 2 times in order to get a match on 'Kermit'.
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
$file.csv[*]
|
|
31
|
+
[
|
|
32
|
+
@t.onmatch=count()
|
|
33
|
+
every.fish(#lastname=="Bat", 2)
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
For a certain .csv file, this path matches 3 times and returns variables like:
|
|
37
|
+
|
|
38
|
+
{'fish_every': {False: 2, True: 7}, 'fish': {False: 5, True: 4}, 't': 4}
|
|
39
|
+
|
|
40
|
+
This means that `#lastname` was "Bat" seven times. There were 2 times `#lastname` was not "Bat". This result could be problematic because it doesn't indicate which of the rows it collects are the `False` rows and which are the `True` ones. If we care only about the `True` matches, we could filter out the `False` rows by selecting for `#lastname == "Bat"` only.
|
|
41
|
+
|
|
42
|
+
$file.csv[*]
|
|
43
|
+
[
|
|
44
|
+
@t.onmatch=count()
|
|
45
|
+
every.fish(#lastname=="Bat", 2)
|
|
46
|
+
#lastname=="Bat"
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
This results in `t==3` and the list of matched rows including only the 3 matched rows. The variables look like:
|
|
50
|
+
|
|
51
|
+
{'fish_every': {False: 2, True: 7}, 'fish': {False: 5, True: 4}, 't': 3}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
from csvpath.matching.functions.function import Function, ChildrenException
|
|
3
|
+
from csvpath.matching.productions.equality import Equality
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Every(Function):
|
|
7
|
+
def to_value(self, *, skip=[]) -> Any:
|
|
8
|
+
return self.matches(skip=skip)
|
|
9
|
+
|
|
10
|
+
def matches(self, *, skip=[]) -> bool:
|
|
11
|
+
if self.value is None:
|
|
12
|
+
if len(self.children) != 1:
|
|
13
|
+
raise ChildrenException("no children. there must be 1 equality child")
|
|
14
|
+
child = self.children[0]
|
|
15
|
+
if not isinstance(child, Equality):
|
|
16
|
+
raise ChildrenException("must be 1 equality child")
|
|
17
|
+
|
|
18
|
+
###
|
|
19
|
+
# 1. we store a count of values under the ID of left. this is the value.to_value
|
|
20
|
+
# 2. we store the every-N-seen count under the qualifier or ID of every
|
|
21
|
+
# 3. we match based on count % n == 0
|
|
22
|
+
#
|
|
23
|
+
self._id = (
|
|
24
|
+
self.qualifier if self.qualifier is not None else self.get_id(self)
|
|
25
|
+
)
|
|
26
|
+
allcount = f"{self.get_id(self)}_{'every'}"
|
|
27
|
+
tracked_value = self.children[0].left.to_value(skip=skip)
|
|
28
|
+
print(f"Every.matches: tracked_value: {tracked_value}")
|
|
29
|
+
cnt = self.matcher.get_variable(
|
|
30
|
+
allcount, tracking=tracked_value, set_if_none=0
|
|
31
|
+
)
|
|
32
|
+
cnt += 1
|
|
33
|
+
self.matcher.set_variable(allcount, tracking=tracked_value, value=cnt)
|
|
34
|
+
every = self.children[0].right.to_value()
|
|
35
|
+
print(
|
|
36
|
+
f"Every.matches: {self._id}: every: {every}, cnt: {cnt} % {every} = {cnt % every}"
|
|
37
|
+
)
|
|
38
|
+
if cnt % every == 0:
|
|
39
|
+
self.value = True
|
|
40
|
+
else:
|
|
41
|
+
self.value = False
|
|
42
|
+
everycount = self.matcher.get_variable(
|
|
43
|
+
self._id, tracking=self.value, set_if_none=0
|
|
44
|
+
)
|
|
45
|
+
everycount += 1
|
|
46
|
+
self.matcher.set_variable(self._id, tracking=self.value, value=everycount)
|
|
47
|
+
return self.value
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
|
|
2
|
+
# First
|
|
3
|
+
|
|
4
|
+
Matches the first time a value is seen. A variable tracks the first line numbers for each value. First tracks None and other values that could be hard to interpret. Internally, the magic number First.NEVER = -9999999999 indicates an unset value.
|
|
5
|
+
|
|
6
|
+
## Examples
|
|
7
|
+
|
|
8
|
+
$file.csv[*][first.folks(#firstname)]
|
|
9
|
+
|
|
10
|
+
This path matches when the value of the `firstname` has not been seen before. It results in a variable like:
|
|
11
|
+
|
|
12
|
+
{'folks': {'David': 1}}
|
|
13
|
+
|
|
14
|
+
Multiple values can be used as arguments to first().
|
|
15
|
+
|
|
16
|
+
$file.csv[*][first.dude(#firstname, #lastname)]
|
|
17
|
+
|
|
18
|
+
This path matches the first instance of the firstname and lastname column values together. The comparison simply concatenates the values. The result is a variable like:
|
|
19
|
+
|
|
20
|
+
{'dude': {'DavidKermit': 5}}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
|