malcolm3utils 0.5.7__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,27 +1,31 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.4
2
2
  Name: malcolm3utils
3
- Version: 0.5.7
3
+ Version: 0.6.0
4
4
  Summary: Collection of Utility Scripts and Packages
5
5
  License: BSD-3-Clause
6
+ License-File: LICENCE
6
7
  Author: Malcolm E. Davis
7
8
  Author-email: mnjjunk@comcast.net
8
- Requires-Python: >=3.9,<4.0
9
+ Requires-Python: >=3.11,<4.0
9
10
  Classifier: Development Status :: 4 - Beta
10
11
  Classifier: Intended Audience :: Developers
11
12
  Classifier: License :: OSI Approved :: BSD License
12
13
  Classifier: Operating System :: OS Independent
13
14
  Classifier: Programming Language :: Python
14
15
  Classifier: Programming Language :: Python :: 3
15
- Classifier: Programming Language :: Python :: 3.9
16
- Classifier: Programming Language :: Python :: 3.10
17
16
  Classifier: Programming Language :: Python :: 3.11
18
17
  Classifier: Programming Language :: Python :: 3.12
19
18
  Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Programming Language :: Python :: 3.14
20
+ Classifier: Programming Language :: Python :: 3.10
20
21
  Classifier: Topic :: Desktop Environment
21
22
  Classifier: Typing :: Typed
23
+ Requires-Dist: click (>=8.3.1,<9.0.0)
22
24
  Requires-Dist: click-logging (>=1.0.1,<2.0.0)
25
+ Requires-Dist: lark (>=1.3.1,<2.0.0)
26
+ Requires-Dist: pandas (>=3.0.0,<4.0.0)
23
27
  Requires-Dist: requests (>=2.32.4)
24
- Requires-Dist: urllib3 (>=2.5.0)
28
+ Requires-Dist: urllib3 (>=2.6.3)
25
29
  Project-URL: Documentation, https://malcolm-3.github.io/malcolm3utils
26
30
  Project-URL: Homepage, https://malcolm-3.github.io/malcolm3utils
27
31
  Project-URL: Repository, https://github.com/malcolm-3/malcolm3utils
@@ -63,6 +67,8 @@ This package provides the following command line tools
63
67
  - A tool for extracting columns of data by column header name or column id
64
68
  - ``merge``
65
69
  - A version of the ``join`` command that doesn't require pre-sorting
70
+ - ``filter``
71
+ - Filter csv files using expressions containing the column headers
66
72
 
67
73
  ## Development
68
74
 
@@ -34,6 +34,8 @@ This package provides the following command line tools
34
34
  - A tool for extracting columns of data by column header name or column id
35
35
  - ``merge``
36
36
  - A version of the ``join`` command that doesn't require pre-sorting
37
+ - ``filter``
38
+ - Filter csv files using expressions containing the column headers
37
39
 
38
40
  ## Development
39
41
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "malcolm3utils"
3
- version = "0.5.7"
3
+ version = "0.6.0"
4
4
  description = "Collection of Utility Scripts and Packages"
5
5
  authors = [
6
6
  "Malcolm E. Davis <mnjjunk@comcast.net>",
@@ -30,10 +30,13 @@ packages = [
30
30
  ]
31
31
 
32
32
  [tool.poetry.dependencies]
33
- python = ">=3.9, <4.0"
33
+ python = ">=3.11, <4.0"
34
34
  click-logging = "^1.0.1"
35
35
  requests = ">=2.32.4"
36
- urllib3 = ">=2.5.0"
36
+ urllib3 = ">=2.6.3"
37
+ pandas = "^3.0.0"
38
+ click = "^8.3.1"
39
+ lark = "^1.3.1"
37
40
 
38
41
  [tool.poetry.group.dev.dependencies]
39
42
  autoflake = "*"
@@ -60,6 +63,7 @@ python-kacl = "*"
60
63
  pyupgrade = "*"
61
64
  tryceratops = "*"
62
65
  setuptools = "*"
66
+ pandas-stubs = "^3.0.0.260204"
63
67
 
64
68
  [build-system]
65
69
  requires = ["poetry-core>=1.0.0"]
@@ -79,7 +83,6 @@ pythonpath = [
79
83
  ]
80
84
  addopts = """\
81
85
  --cov malcolm3utils \
82
- --cov tests \
83
86
  --cov-report term-missing \
84
87
  --no-cov-on-fail \
85
88
  """
@@ -101,6 +104,10 @@ warn_redundant_casts = true
101
104
  warn_return_any = true
102
105
  check_untyped_defs = true
103
106
  show_error_codes = true
107
+ exclude = [
108
+ "tests",
109
+ "venv",
110
+ ]
104
111
 
105
112
  [[tool.mypy.overrides]]
106
113
  module = "click_logging"
@@ -114,3 +121,4 @@ replace = '__version__ = "{new_version}"'
114
121
  touch_latest = 'malcolm3utils.scripts.touch_latest:touch_latest'
115
122
  getcol = 'malcolm3utils.scripts.getcol:getcol'
116
123
  merge = 'malcolm3utils.scripts.merge:merge'
124
+ filter = 'malcolm3utils.scripts.filter:cli'
@@ -1,2 +1,2 @@
1
- __version__ = "0.5.7"
1
+ __version__ = "0.6.0"
2
2
  __version_message__ = "%(prog)s, malcolm3utils version %(version)s"
@@ -0,0 +1,139 @@
1
+ import logging
2
+ import sys
3
+ from csv import DictReader, DictWriter
4
+ from io import TextIOWrapper
5
+ from typing import Any, TextIO, Tuple
6
+
7
+ import click
8
+ import click_logging
9
+
10
+ from malcolm3utils import __version__, __version_message__
11
+ from malcolm3utils.utils.filter_parser import create_filter
12
+
13
+ logger = logging.getLogger()
14
+
15
+
16
+ @click.command(
17
+ "csv-filter",
18
+ help="""
19
+ Filter the input csv files.
20
+
21
+ The filter expression can be a combination of simple
22
+ arithmetic and logical expressions that will evaluate to
23
+ True or False, with a numerical answer being True if
24
+ non-zero. Fieldnames can be used as variables in
25
+ this expression.
26
+
27
+ For example 'age + 1 < 4' would check each row and
28
+ see if row['age'] + 1 is less than 4, and keep or discard
29
+ rows for which the expression is True.
30
+
31
+ \b
32
+ Available operators are:
33
+ +, -, *, /, %, //, ==, !=, <, <=, >, >=, not, and, or
34
+ String literals can be specified using single quotes.
35
+ Field names with spaces should be surrounded by double quotes.
36
+
37
+ If no csv_files are specified, read from stdin.
38
+
39
+ If no --output specified, write to stdout.
40
+
41
+ Input files do not all have to have the same columns.
42
+ The output will have all columns.
43
+ To achieve this all csv_files are opened at program initiation.
44
+ This may cause problems with your system's open file limit if
45
+ you are attempting to filter a large number of files at once.
46
+ """,
47
+ )
48
+ @click.argument(
49
+ "filter_expression",
50
+ type=str,
51
+ required=True,
52
+ )
53
+ @click.argument(
54
+ "csv_files",
55
+ type=click.Path(exists=True),
56
+ nargs=-1,
57
+ metavar="csv_file",
58
+ required=False,
59
+ )
60
+ @click.option(
61
+ "--keep/--discard",
62
+ is_flag=True,
63
+ help="keep or discard entries for which the expression is true (default=keep)",
64
+ default=True,
65
+ )
66
+ @click.option(
67
+ "--output",
68
+ type=click.Path(exists=False),
69
+ help="output file name",
70
+ )
71
+ @click.option(
72
+ "-d",
73
+ "--delimiter",
74
+ type=str,
75
+ help="column delimiter (default=COMMA)",
76
+ default=",",
77
+ )
78
+ @click.option(
79
+ "--output-delimiter",
80
+ type=str,
81
+ help="output column delimiter (default=input delimiter)",
82
+ )
83
+ @click.version_option(__version__, message=__version_message__)
84
+ @click_logging.simple_verbosity_option(logger)
85
+ def cli( # noqa: C901
86
+ filter_expression: str,
87
+ csv_files: Tuple[click.Path, ...] = (),
88
+ keep: bool = True,
89
+ output: click.Path | None = None,
90
+ delimiter: str = ",",
91
+ output_delimiter: str | None = None,
92
+ ) -> None:
93
+
94
+ if output_delimiter is None:
95
+ output_delimiter = delimiter
96
+ filter_function = create_filter(filter_expression)
97
+ input_fhs = []
98
+ readers = []
99
+ fieldnames = []
100
+ output_fh = None
101
+ try:
102
+ if output is None: # pragma: no cover
103
+ output_fh = sys.stdout
104
+ else:
105
+ output_fh = open(str(output), "w")
106
+
107
+ if csv_files:
108
+ for csv_file in csv_files:
109
+ input_fh: TextIOWrapper[Any] | TextIO | Any = open(str(csv_file))
110
+ input_fhs.append(input_fh)
111
+ reader = DictReader(input_fh, delimiter=delimiter)
112
+ if reader.fieldnames is not None:
113
+ fieldnames.extend(
114
+ [x for x in reader.fieldnames if x not in fieldnames]
115
+ )
116
+ readers.append(reader)
117
+ else: # pragma: no cover
118
+ input_fh = sys.stdin
119
+ input_fhs.append(input_fh)
120
+ reader = DictReader(input_fh)
121
+ if reader.fieldnames is not None:
122
+ fieldnames.extend(reader.fieldnames)
123
+ readers.append(reader)
124
+
125
+ writer = DictWriter(
126
+ output_fh, fieldnames=fieldnames, delimiter=output_delimiter
127
+ )
128
+ writer.writeheader()
129
+
130
+ for reader in readers:
131
+ for row in reader:
132
+ if filter_function(row) == keep:
133
+ writer.writerow(row)
134
+
135
+ finally:
136
+ for input_fh in input_fhs:
137
+ input_fh.close()
138
+ if output_fh is not None:
139
+ output_fh.close()
@@ -10,16 +10,14 @@ import click
10
10
  from .. import __version__, __version_message__
11
11
 
12
12
 
13
- @click.command(
14
- help="""
13
+ @click.command(help="""
15
14
  Read the specified file and write out just the specified columns to stdout.
16
15
 
17
16
  The column_spec is a comma separated list of column headers, column indexes (one-based),
18
17
  or column ranges (e.g. 4-6 for columns 4 through 6 inclusive).
19
18
 
20
19
  If no file_to_read is specified, then input is read from stdin.
21
- """
22
- )
20
+ """)
23
21
  @click.option(
24
22
  "-d", "--delimiter", type=str, help="column delimiter (default=TAB)", default="\t"
25
23
  )
@@ -14,8 +14,7 @@ logger = logging.getLogger(__name__)
14
14
  click_logging.basic_config(logger)
15
15
 
16
16
 
17
- @click.command(
18
- help="""
17
+ @click.command(help="""
19
18
  Merge the specified delimited files with column headings, joining entries with
20
19
  the same key field value.
21
20
 
@@ -34,8 +33,7 @@ header will be the header from the first file.
34
33
  If -k is used to specify alternative keys columns for subsequent files, but
35
34
  those files have a column with the same name as the output key column, that
36
35
  will be ignored.
37
- """
38
- )
36
+ """)
39
37
  @click_logging.simple_verbosity_option(logger)
40
38
  @click.option(
41
39
  "-d", "--delimiter", type=str, help="column delimiter (default=TAB)", default="\t"
@@ -0,0 +1,77 @@
1
+ import logging # noqa: A005
2
+ from pathlib import Path
3
+ from typing import Any, Callable, Hashable
4
+
5
+ import pandas as pd
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
+ def read_keyed_csv_data(
11
+ csv_file: Path,
12
+ keyfield: str,
13
+ skiprows: list[int] | int | Callable[[Hashable], bool] | None = None,
14
+ multiple: bool = False,
15
+ ) -> dict[Any, dict[str, Any]] | dict[Any, list[dict[str, Any]]]:
16
+ """
17
+ Instead of using DictReader which imports all values as strings,
18
+ we use pandas.read_csv which handles all of the data conversion
19
+
20
+ Values are returned as a keyed dictionary rather than a list
21
+ as we may need to be able to look up the entries by key.
22
+
23
+ Because MDBs do not support boolean values, we convert all
24
+ boolean values to integer 0/1 fields.
25
+
26
+ Skiprows can be
27
+ - a list of 0-based line indexes to skip
28
+ - an integer giving the number of initial lines to skip
29
+ - a callable that takes the line index and returns True to skip that line
30
+ - None to skip no rows (default)
31
+
32
+ If multiple is true, then the value of the nested dict will be a list
33
+ with each row that matches the key being appended to that list.
34
+
35
+ :param csv_file: CSV file to be read.
36
+ :param keyfield: Field to use as the key in the returned dictionary.
37
+ :param skiprows: rows to skip
38
+ :return: keyed dictionary of each row of data.
39
+ :param multiple: indicates there may be multiple rows for each key
40
+ """
41
+ if multiple:
42
+ result: dict[str, list[dict[str, Any]]] = {}
43
+ for entry in read_csv_data(csv_file, skiprows=skiprows):
44
+ key = entry[keyfield]
45
+ if key not in result:
46
+ result[key] = []
47
+ result[key].append(entry)
48
+ return result
49
+ else:
50
+ return {x[keyfield]: x for x in read_csv_data(csv_file)}
51
+
52
+
53
+ def read_csv_data(
54
+ csv_file: Path,
55
+ skiprows: list[int] | int | Callable[[Hashable], bool] | None = None,
56
+ ) -> list[dict[str, Any]]:
57
+ """
58
+ Use Pandas to read a CSV into a simple list of dictionaries.
59
+
60
+ Rows can be filtered by specifying a skiprows option.
61
+
62
+ Skiprows can be
63
+ - a list of 0-based line indexes to skip
64
+ - an integer giving the number of initial lines to skip
65
+ - a callable that takes the line index and returns True to skip that line
66
+ - None to skip no rows (default)
67
+
68
+
69
+ :param csv_file: file to be read
70
+ :param skiprows: rows to skip
71
+ :return: list of dictionary entries
72
+ """
73
+ logger.debug('...............reading CSV data from "%s"', csv_file)
74
+ pandas_csv_data = pd.read_csv(str(csv_file), skiprows=skiprows)
75
+ for key in pandas_csv_data.select_dtypes("bool").keys():
76
+ pandas_csv_data[key] = pandas_csv_data[key].astype(int)
77
+ return list(pandas_csv_data.transpose().to_dict().values())
@@ -0,0 +1,177 @@
1
+ import logging
2
+ from typing import Any, Generator
3
+
4
+ from lark import Lark, Transformer, v_args
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+ filter_grammar = """
9
+ ?start: or_test_
10
+ ?or_test_: and_test_ ("or" and_test_)* -> or_test
11
+ ?and_test_: not_test_ ("and" not_test_)* -> and_test
12
+ ?not_test_: "not" not_test_ -> not_test
13
+ | "True" -> true
14
+ | "False" -> false
15
+ | sum -> num_test
16
+ | comp
17
+ | "(" or_test_ ")"
18
+
19
+ ?comp: sum "==" sum -> eq
20
+ | sum "!=" sum -> ne
21
+ | sum ">" sum -> gt
22
+ | sum ">=" sum -> ge
23
+ | sum "<" sum -> lt
24
+ | sum "<=" sum -> le
25
+ | "(" comp ")"
26
+
27
+ ?sum: product
28
+ | sum "+" product -> add
29
+ | sum "-" product -> sub
30
+
31
+ ?product: atom
32
+ | product "*" atom -> mul
33
+ | product "/" atom -> div
34
+ | product "%" atom -> mod
35
+ | product "//" atom -> floordiv
36
+
37
+ ?atom: NUMBER -> number
38
+ | STRING_LITERAL -> strlit
39
+ | "-" atom -> neg
40
+ | key -> key
41
+ | "(" sum ")"
42
+
43
+ ?key: CNAME | ESCAPED_STRING
44
+ STRING_LITERAL: "'" _STRING_ESC_INNER "'"
45
+
46
+ %import common.ESCAPED_STRING
47
+ %import common._STRING_ESC_INNER
48
+ %import common.CNAME
49
+ %import common.NUMBER
50
+ %import common.WS_INLINE
51
+
52
+ %ignore WS_INLINE
53
+ """
54
+
55
+
56
+ def to_number_or_string(s: Any) -> float | int | str:
57
+ if isinstance(s, float) or isinstance(s, int):
58
+ return s
59
+ elif isinstance(s, str):
60
+ try:
61
+ return int(s)
62
+ except ValueError:
63
+ try:
64
+ return float(s)
65
+ except ValueError:
66
+ return s
67
+ return str(s)
68
+
69
+
70
+ def applyall(d: dict[str, Any], *args) -> Generator[bool, None, None]: # type: ignore[no-untyped-def]
71
+ for a in args:
72
+ yield a(d)
73
+
74
+
75
+ @v_args(inline=True)
76
+ class FilterParser(Transformer):
77
+ # making methods static breaks the @v_args functionality
78
+
79
+ def or_test(self, *args): # type: ignore[no-untyped-def]
80
+ logger.debug("Or test: %s", args)
81
+ return lambda d: any(applyall(d, *args))
82
+
83
+ def and_test(self, *args): # type: ignore[no-untyped-def]
84
+ logger.debug("And test: %s", args)
85
+ return lambda d: all(applyall(d, *args))
86
+
87
+ def not_test(self, a): # type: ignore[no-untyped-def]
88
+ logger.debug("Not test: %s", a)
89
+ return lambda d: not a(d)
90
+
91
+ def num_test(self, a): # type: ignore[no-untyped-def]
92
+ logger.debug("Num test: %s", a)
93
+ return lambda d: a(d) != 0
94
+
95
+ def eq(self, a, b): # type: ignore[no-untyped-def]
96
+ logger.debug("Eq test: %s and %s", a, b)
97
+ return lambda d: a(d) == b(d)
98
+
99
+ def ne(self, a, b): # type: ignore[no-untyped-def]
100
+ logger.debug("Ne test: %s and %s", a, b)
101
+ return lambda d: a(d) != b(d)
102
+
103
+ def gt(self, a, b): # type: ignore[no-untyped-def]
104
+ logger.debug("Gt test: %s and %s", a, b)
105
+ return lambda d: a(d) > b(d)
106
+
107
+ def ge(self, a, b): # type: ignore[no-untyped-def]
108
+ logger.debug("Ge test: %s and %s", a, b)
109
+ return lambda d: a(d) >= b(d)
110
+
111
+ def lt(self, a, b): # type: ignore[no-untyped-def]
112
+ logger.debug("Lt test: %s and %s", a, b)
113
+ return lambda d: a(d) < b(d)
114
+
115
+ def le(self, a, b): # type: ignore[no-untyped-def]
116
+ logger.debug("Le test: %s and %s", a, b)
117
+ return lambda d: a(d) <= b(d)
118
+
119
+ def add(self, a, b): # type: ignore[no-untyped-def]
120
+ logger.debug("add: %s and %s", a, b)
121
+ return lambda d: a(d) + b(d)
122
+
123
+ def sub(self, a, b): # type: ignore[no-untyped-def]
124
+ logger.debug("sub: %s and %s", a, b)
125
+ return lambda d: a(d) - b(d)
126
+
127
+ def mul(self, a, b): # type: ignore[no-untyped-def]
128
+ logger.debug("mul: %s and %s", a, b)
129
+ return lambda d: a(d) * b(d)
130
+
131
+ def div(self, a, b): # type: ignore[no-untyped-def]
132
+ logger.debug("div: %s and %s", a, b)
133
+ return lambda d: a(d) / b(d)
134
+
135
+ def mod(self, a, b): # type: ignore[no-untyped-def]
136
+ logger.debug("mod: %s and %s", a, b)
137
+ return lambda d: a(d) % b(d)
138
+
139
+ def floordiv(self, a, b): # type: ignore[no-untyped-def]
140
+ logger.debug("floordiv: %s and %s", a, b)
141
+ return lambda d: a(d) // b(d)
142
+
143
+ def number(self, value): # type: ignore[no-untyped-def]
144
+ logger.debug("Number: %s", value)
145
+ return lambda d: to_number_or_string(value)
146
+
147
+ def strlit(self, v): # type: ignore[no-untyped-def]
148
+ logger.debug("String: %s", v)
149
+ vv = v.strip("'")
150
+ return lambda d: vv
151
+
152
+ def neg(self, a): # type: ignore[no-untyped-def]
153
+ logger.debug("Negation: %s", a)
154
+ return lambda d: -a(d)
155
+
156
+ def key(self, a): # type: ignore[no-untyped-def]
157
+ logger.debug("Key: %s", a)
158
+ b = a.strip('"')
159
+ return lambda d: to_number_or_string(d[b])
160
+
161
+ def true(self): # type: ignore[no-untyped-def]
162
+ logger.debug("True")
163
+ return lambda d: True
164
+
165
+ def false(self): # type: ignore[no-untyped-def]
166
+ logger.debug("False")
167
+ return lambda d: False
168
+
169
+
170
+ def create_filter(filter_spec: str): # type: ignore[no-untyped-def]
171
+ """
172
+ Convert an expression string into a function that takes a dictionary as an argument
173
+ and returns a boolean
174
+ """
175
+ filter_parser = Lark(filter_grammar, parser="lalr", transformer=FilterParser())
176
+ filter_generator = filter_parser.parse
177
+ return filter_generator(filter_spec)
File without changes