python-jack-knife 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pjk/__init__.py +5 -0
- pjk/base.py +377 -0
- pjk/common.py +150 -0
- pjk/log.py +67 -0
- pjk/main.py +106 -0
- pjk/man_page.py +125 -0
- pjk/parser.py +284 -0
- pjk/pipes/__init__.py +0 -0
- pjk/pipes/denorm.py +68 -0
- pjk/pipes/factory.py +62 -0
- pjk/pipes/filter.py +57 -0
- pjk/pipes/head.py +34 -0
- pjk/pipes/join.py +85 -0
- pjk/pipes/let_reduce.py +198 -0
- pjk/pipes/map.py +91 -0
- pjk/pipes/move_field.py +36 -0
- pjk/pipes/postgres_pipe.py +209 -0
- pjk/pipes/remove_field.py +36 -0
- pjk/pipes/select.py +42 -0
- pjk/pipes/sort.py +63 -0
- pjk/pipes/tail.py +39 -0
- pjk/pipes/user_pipe_factory.py +45 -0
- pjk/pipes/where.py +49 -0
- pjk/registry.py +143 -0
- pjk/sinks/__init__.py +0 -0
- pjk/sinks/csv_sink.py +33 -0
- pjk/sinks/ddb.py +54 -0
- pjk/sinks/devnull.py +31 -0
- pjk/sinks/dir_sink.py +59 -0
- pjk/sinks/expect.py +53 -0
- pjk/sinks/factory.py +108 -0
- pjk/sinks/graph.py +57 -0
- pjk/sinks/graph_bar_line.py +229 -0
- pjk/sinks/graph_cumulative.py +55 -0
- pjk/sinks/graph_hist.py +72 -0
- pjk/sinks/graph_scatter.py +29 -0
- pjk/sinks/json_sink.py +23 -0
- pjk/sinks/s3_sink.py +100 -0
- pjk/sinks/sinks.py +68 -0
- pjk/sinks/stdout.py +44 -0
- pjk/sinks/tsv_sink.py +22 -0
- pjk/sinks/user_sink_factory.py +43 -0
- pjk/sources/__init__.py +0 -0
- pjk/sources/csv_source.py +28 -0
- pjk/sources/dir_source.py +69 -0
- pjk/sources/factory.py +100 -0
- pjk/sources/format_usage.py +11 -0
- pjk/sources/inline_source.py +56 -0
- pjk/sources/json_source.py +35 -0
- pjk/sources/lazy_file.py +16 -0
- pjk/sources/lazy_file_local.py +22 -0
- pjk/sources/lazy_file_s3.py +28 -0
- pjk/sources/parquet_source.py +32 -0
- pjk/sources/s3_source.py +146 -0
- pjk/sources/source_list.py +23 -0
- pjk/sources/sql_source.py +32 -0
- pjk/sources/tsv_source.py +15 -0
- pjk/sources/user_source_factory.py +33 -0
- pjk/version.py +4 -0
- python_jack_knife-0.5.0.dist-info/METADATA +254 -0
- python_jack_knife-0.5.0.dist-info/RECORD +65 -0
- python_jack_knife-0.5.0.dist-info/WHEEL +5 -0
- python_jack_knife-0.5.0.dist-info/entry_points.txt +2 -0
- python_jack_knife-0.5.0.dist-info/licenses/LICENSE +202 -0
- python_jack_knife-0.5.0.dist-info/top_level.txt +1 -0
pjk/pipes/join.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
# djk/pipes/join.py
|
|
5
|
+
|
|
6
|
+
from pjk.base import Pipe, Usage, UsageError, ParsedToken, KeyedSource
|
|
7
|
+
|
|
8
|
+
class JoinPipe(Pipe):
    """Pipe that merges each left-hand record with its match from a keyed right source.

    The join mode ('left', 'inner' or 'outer') decides what happens to
    left records without a match and, for 'outer', whether right-hand
    records that were never looked up are appended at the end.
    """
    arity = 2  # left = record stream, right = KeyedSource

    @classmethod
    def usage(cls):
        """Return the Usage descriptor (argument, syntax line and examples) for 'join'."""
        spec = Usage(
            name='join',
            desc="Join records against a keyed source on shared fields",
            component_class=cls
        )
        spec.def_arg(
            name='mode',
            usage="'left', 'inner', or 'outer' join behavior",
            valid_values={'left', 'inner', 'outer'}
        )
        spec.def_syntax("pjk <left_source> <map_source> map:<how>:<key> join:<mode> <sink>")

        # The three examples share their inputs and differ only in the join
        # token and the expected output.
        left_rows = "[{color:'blue'},{color:'green'}]"
        right_rows = "[{color:'blue', price:50}, {color:'red', price:20}]"
        cases = (
            ("join:left", "[{color:'blue', price:50}, {color:'green'}]"),
            ("join:inner", "[{color:'blue', price:50}]"),
            ("join:outer", "[{color:'blue', price:50}, {color:'green'}, {color:'red', price: 20}]"),
        )
        for join_token, expected in cases:
            spec.def_example(
                expr_tokens=[left_rows, right_rows, 'map:o:color', join_token],
                expect=expected)
        return spec

    def __init__(self, ptok: ParsedToken, usage: Usage):
        """Store the join mode taken from the bound usage and clear both inputs."""
        super().__init__(ptok)

        self.mode = usage.get_arg('mode')
        # Both inputs start unset; presumably the framework binds them
        # before iteration -- TODO confirm against pjk.base.Pipe.
        self.left = None
        self.right = None
        self._pending_right = None
        self._check_right = False

    def reset(self):
        """Restore the two private bookkeeping attributes to their initial values."""
        self._pending_right = None
        self._check_right = False

    def __iter__(self):
        """Yield joined records.

        Raises:
            UsageError: if the right input is not a KeyedSource.
        """
        if not isinstance(self.right, KeyedSource):
            raise UsageError("right source must be a KeyedSource")

        for left_rec in self.left:
            match = self.right.lookup(left_rec)

            if match is None:
                # Unmatched left records survive in 'left' and 'outer' mode
                # and are dropped otherwise.
                if self.mode in ("left", "outer"):
                    yield left_rec
                continue

            # Right-hand values win on key collisions.
            yield {**left_rec, **match}

        if self.mode == "outer":
            yield from self.right.get_unlookedup_records()
|
pjk/pipes/let_reduce.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
# djk/pipes/let_reduce.py
|
|
5
|
+
|
|
6
|
+
from pjk.base import Pipe, ParsedToken, NoBindUsage, Usage, UsageError, TokenError
|
|
7
|
+
from pjk.common import SafeNamespace, ReducingNamespace
|
|
8
|
+
import re
|
|
9
|
+
import ast
|
|
10
|
+
import json
|
|
11
|
+
|
|
12
|
+
# --- Shared Utilities ---
|
|
13
|
+
# <field><op><rest>, where <op> is exactly one of the supported operators.
# Two-character operators are listed first so that '+=' is not read as a
# bare '=' preceded by junk.  An explicit alternation (instead of a greedy
# character class) keeps a leading sign in the value: 'x=-1' is op '=' with
# rest '-1', not op '=-' with rest '1'.
_ASSIGN_TOKEN_RE = re.compile(
    r'(?P<field>\w+)(?P<op>\+=|-=|\*=|/=|=|:)(?P<rest>.+)'
)


def parse_args(token: str):
    """Split an assignment-style token into its field, operator and right-hand side.

    Args:
        token: text such as ``total+=f.size``, ``foo:bar`` or ``x=-1``.

    Returns:
        A dict with the keys ``field``, ``op`` and ``rest``.

    Raises:
        ValueError: if the token is not ``<field><op><rest>`` with one of the
            operators ``:``, ``=``, ``+=``, ``-=``, ``*=``, ``/=``.
    """
    match = _ASSIGN_TOKEN_RE.fullmatch(token)
    if not match:
        raise ValueError(f"Invalid token syntax: {token!r}")
    return match.groupdict()
|
|
19
|
+
|
|
20
|
+
def do_eval(expr, env):
    """Evaluate a Python expression with ``env`` plus the ``json`` module in scope.

    The names are supplied as the *globals* of the evaluation rather than as
    locals: nested scopes inside the expression (generator expressions,
    lambdas, comprehensions) resolve free names through globals, so passing
    them only as locals makes e.g. ``sum(f.x * i for i in f.items)`` fail
    with a NameError.

    Args:
        expr: expression source text.
        env: mapping of names visible to the expression; it is copied and
            never modified.

    Returns:
        The value of the expression.

    Raises:
        UsageError: if evaluation fails for any reason; the original
            exception is attached as ``__cause__``.
    """
    # NOTE(review): eval runs arbitrary Python; acceptable only if expr
    # comes from the user's own command-line tokens -- confirm.
    try:
        scope = dict(env)
        scope['json'] = json
        return eval(expr, scope)
    except Exception as exc:
        raise UsageError(f"UsageError in expression: {expr}") from exc
|
|
27
|
+
|
|
28
|
+
def eval_regular(expr: str, record: dict):
    """Evaluate the right-hand side of a ``let`` against one record.

    A right-hand side made up only of letters, digits and underscores is
    returned unchanged as a string; anything else is evaluated as a Python
    expression in which ``f`` is a SafeNamespace wrapping the record.
    """
    namespace = SafeNamespace(record)
    is_bare_word = re.match(r'[a-zA-Z0-9_]+$', expr) is not None
    if is_bare_word:
        return expr
    return do_eval(expr, {'f': namespace})
|
|
33
|
+
|
|
34
|
+
def _uses_acc_name(expr: str) -> bool:
    """Return True if ``expr`` references the identifier ``acc``.

    Works on the parsed expression, so ``f.account`` or the string ``'acc'``
    do not count.  Unparseable input yields False; the caller reports the
    syntax error itself.
    """
    try:
        tree = ast.parse(expr, mode='eval')
    except SyntaxError:
        return False
    return any(isinstance(n, ast.Name) and n.id == 'acc' for n in ast.walk(tree))


def eval_accumulating(expr: str, record: dict, op: str, acc=None):
    """Fold one record into an accumulator and return the new accumulator.

    Args:
        expr: right-hand side expression; ``f`` is the current record and
            ``acc`` the accumulator so far.
        record: the current record.
        op: the accumulating operator ('+=', '-=', '*=' or '/=').
        acc: accumulator value carried over from the previous record.

    Returns:
        The updated accumulator:
        * list/set/dict comprehension: results are appended to, unioned
          with, or merged into ``acc``;
        * '+=': numbers are added, strings concatenated, lists extended,
          any other value is appended to a list;
        * '-=', '*=', '/=': the value of ``acc <op> (expr)``, or of ``expr``
          itself when it already references ``acc``.

    Raises:
        UsageError: if the expression is not valid Python or fails during
            evaluation through do_eval.
    """
    if op in ('-=', '*=', '/=') and not _uses_acc_name(expr):
        expr = f'acc {op[0]} ({expr})'

    env = {
        'f': SafeNamespace(record),
        'acc': acc
    }

    try:
        node = ast.parse(expr, mode='eval').body
    except SyntaxError as exc:
        raise UsageError(f"Invalid expression: {expr}") from exc

    if isinstance(node, (ast.ListComp, ast.SetComp, ast.DictComp)):
        env['f'] = ReducingNamespace(record)
        if isinstance(node, ast.ListComp):
            label = '<reduce:listcomp>'
        elif isinstance(node, ast.SetComp):
            label = '<reduce:setcomp>'
        else:
            label = '<reduce:dictcomp>'

        # env is passed as globals so that `f` and `acc` are also visible in
        # the comprehension's element and condition expressions.
        values = eval(compile(ast.Expression(node), label, 'eval'), env)

        if isinstance(node, ast.ListComp):
            return (acc or []) + list(values)
        if isinstance(node, ast.SetComp):
            return (acc or set()).union(values)
        return {**(acc or {}), **values}

    if op == '+=':
        # Routed through do_eval like every other operator, so `json` is
        # available and failures surface as UsageError.
        value = do_eval(expr, env)
        if isinstance(value, (int, float)):
            return (acc or 0) + value
        elif isinstance(value, str):
            return str(acc or '') + value
        elif isinstance(value, list):
            return (acc or []) + value
        else:
            return (acc or []) + [value]

    return do_eval(expr, env)
|
|
78
|
+
|
|
79
|
+
# --- LetPipe (simple field assignment) ---
|
|
80
|
+
class LetPipe(Pipe):
    """Pipe that assigns one field on every record that passes through.

    ``let:<field>:<text>`` stores the literal text; any other operator
    hands the right-hand side to eval_regular.
    """

    @classmethod
    def usage(cls):
        """Return the usage descriptor (argument and examples) for 'let'."""
        spec = NoBindUsage(  # can't use bound usage because of complicated parsing
            name='let',
            desc="set a new field equal to a rhs python expression",
            component_class=cls
        )
        spec.def_arg(name='rhs', usage="python rhs expression (use f.<field> syntax)")

        # All examples start from the same single-record input.
        examples = (
            ('let:there=f.hello + 1', "{hello:0, there: 1}"),
            ('let:foo:bar', "{hello:0, foo: 'bar'}"),
            ('let:foo=int(1)', "{hello:0, foo: 1}"),
        )
        for token, expected in examples:
            spec.def_example(expr_tokens=['{hello:0}', token], expect=expected)
        return spec

    def __init__(self, ptok: ParsedToken, usage: Usage):
        """Parse ``<field><op><rhs>`` from the token text after the first ':'.

        Raises:
            TokenError: if an accumulating operator (+=, -=, *=, /=) is used.
        """
        super().__init__(ptok)
        assignment = ptok.whole_token.split(':', 1)[-1]
        parsed = parse_args(assignment)
        self.field = parsed['field']
        self.op = parsed['op']
        self.rest = parsed['rest']

        accumulating_ops = ('+=', '-=', '*=', '/=')
        if self.op in accumulating_ops:
            raise TokenError("Aggregation operator not allowed in let, use reduce:")

    def reset(self):
        """Nothing to reset: the pipe keeps no per-run state."""
        pass  # stateless

    def __iter__(self):
        """Set the target field on each incoming record in place and yield it."""
        for record in self.left:
            record[self.field] = (
                self.rest if self.op == ':' else eval_regular(self.rest, record)
            )
            yield record
|
|
114
|
+
|
|
115
|
+
# --- ReducePipe (stateful accumulator) ---
|
|
116
|
+
def is_comprehension(expr: str) -> bool:
    """Return True if ``expr`` parses as a list, set or dict comprehension.

    Generator expressions and text that is not a valid Python expression
    give False.
    """
    try:
        parsed = ast.parse(expr, mode='eval')
    except SyntaxError:
        return False
    comprehension_nodes = (ast.ListComp, ast.SetComp, ast.DictComp)
    return isinstance(parsed.body, comprehension_nodes)
|
|
122
|
+
|
|
123
|
+
class ReducePipe(Pipe):
    """Pipe that folds every record it sees into a single accumulated value.

    Records pass through unchanged; the (field, value) result is read
    afterwards via get_subexp_result().
    """

    @classmethod
    def usage(cls):
        """Return the usage descriptor (argument and examples) for 'reduce'."""
        spec = NoBindUsage(  # can't use bound usage because of complicated parsing
            name='reduce',
            desc=(
                "set a new field equal to a reduction over records of a sub or main expression\n"
                "rhs operators must be accumulating, e.g. +=, -=, *=, /=\n"
                "or use list or dictionary comprehension"
            ),
            component_class=cls
        )
        spec.def_arg(name='rhs', usage="accumulating python rhs expression (use f.<field> syntax)")

        examples = (
            (
                ["{ferry:'orca', cars:[{make: 'ford', size:9}, {make:'bmw', size:4}]}",
                 '[', 'reduce:total_size+=f.size', 'over:cars'],
                "{ferry:'orca', cars:[{make: 'ford', size:9}, {make:'bmw', size:4}], total_size: 13}",
            ),
            (
                ["[{make: 'honda'}, {make: 'ford'}, {make:'bmw'}]",
                 'reduce:cars=[x for x in f.make]'],
                "{cars:['honda', 'ford', 'bmw']}",
            ),
            (
                ["[{i:[1,2]},{i:[3]}]",
                 'reduce:flattened=[x for x in f.i]'],
                "{flattened:[1, 2, 3]}",
            ),
            (
                ["[{i:1},{i:3}, {i:7}]",
                 'reduce:diff-=f.i'],
                "{diff:-11}",
            ),
            (
                ["[{i:1},{i:3}, {i:7}]",
                 'reduce:product*=f.i'],
                "{product:21}",
            ),
        )
        for tokens, expected in examples:
            spec.def_example(expr_tokens=tokens, expect=expected)

        return spec

    def __init__(self, ptok: ParsedToken, usage: Usage):
        """Parse ``<field><op><rhs>`` from the token text after the first ':'.

        A comprehension on the right-hand side is accepted with any
        operator and is treated as '+='.

        Raises:
            TokenError: if the operator is not accumulating and the
                right-hand side is not a comprehension.
        """
        super().__init__(ptok)
        parsed = parse_args(ptok.whole_token.split(':', 1)[-1])
        self.field = parsed['field']
        self.op = parsed['op']
        self.rest = parsed['rest']

        if self.op not in ('+=', '-=', '*=', '/='):
            if not is_comprehension(self.rest):
                raise TokenError("Reduce pipe requires an accumulating operator (+=, -=, etc.), unless RHS is a comprehension")
            self.op = '+='

        self.accum_value = self.initial_acc_value()

    def initial_acc_value(self):
        """Return the identity element for the operator (None if unrecognised)."""
        identities = {'+=': 0, '*=': 1, '-=': 0, '/=': 1.0}
        return identities.get(self.op)

    def reset(self):
        """Start a fresh accumulation."""
        self.accum_value = self.initial_acc_value()

    def __iter__(self):
        """Update the accumulator from each record, then pass the record on."""
        for record in self.left:
            self.accum_value = eval_accumulating(self.rest, record, self.op, self.accum_value)
            yield record

    def get_subexp_result(self):
        """Return ``(field_name, accumulated_value)``."""
        return (self.field, self.accum_value)
|
pjk/pipes/map.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
# pjk/pipes/map.py
|
|
5
|
+
|
|
6
|
+
from typing import Optional
|
|
7
|
+
from pjk.base import ParsedToken, Usage, Pipe, KeyedSource
|
|
8
|
+
|
|
9
|
+
class MapPipe(Pipe, KeyedSource):
    """Pipe that indexes its input by one or more key fields.

    In override mode ('o') the last record seen for a key replaces earlier
    ones; in group mode ('g') the key fields are moved out of each record
    and the remainders are collected under 'child'.  The result can be
    iterated or queried with lookup().
    """

    @classmethod
    def usage(cls):
        """Return the Usage descriptor (arguments and examples) for 'map'."""
        spec = Usage(
            name='map',
            desc="maps records to key, either overriding or grouping duplicates. Creates Keyed Source for join or filter.",
            component_class=cls
        )
        spec.def_arg(name='how', usage="'o' for override, 'g' for group", valid_values={'o', 'g'})
        spec.def_arg(name='key', usage='comma separated fields to map by')

        three_rows = "[{id: 1, color:'blue'}, {id:1, color:'green'}, {id:2, color:'red'}]"
        spec.def_example(
            expr_tokens=[three_rows, 'map:o:id'],
            expect="[{id:2, color:'red'}, {id:1, color:'green'}]")
        spec.def_example(
            expr_tokens=[three_rows, 'map:g:id'],
            expect="[{id:2, child:[{color:'red'}]}, {id:1, child:[{color:'blue'},{color: 'green'}]}]")
        spec.def_example(
            expr_tokens=["[{id: 1, color:'blue', size:5}, {id:1, color:'green', size:10}]", 'map:o:id,color'],
            expect="[{id:1, color:'green', size: 10}, {id:1, color:'blue', size:5}]")

        return spec

    def __init__(self, ptok: ParsedToken, usage: Usage):
        """Read the mode and key fields from the bound usage; start empty and unloaded."""
        super().__init__(ptok)
        self.is_group = usage.get_arg('how') == 'g'
        self.fields = usage.get_arg('key').split(',')
        self.rec_map = {}       # key tuple -> record not yet returned by lookup()
        self.matched_map = {}   # key tuple -> record already returned by lookup()
        self._rec_list = None   # pending output for __iter__, built on first use
        self.is_loaded = False

    def reset(self):
        """Discard everything loaded so far so the input is read again on next use."""
        self.rec_map.clear()
        self.matched_map.clear()
        self._rec_list = None
        self.is_loaded = False

    def load(self):
        """Read the whole left input into rec_map (only on the first call)."""
        if self.is_loaded:
            return
        self.is_loaded = True

        for record in self.left:
            if self.is_group:
                # Grouping strips the key fields from the record itself.
                key_rec = {name: record.pop(name, None) for name in self.fields}
            else:
                key_rec = {name: record.get(name) for name in self.fields}

            key = tuple(key_rec.values())
            existing = self.rec_map.get(key)

            if not self.is_group:
                # Override mode: the latest record for a key always wins.
                self.rec_map[key] = record
            elif existing:
                existing['child'].append(record)
            else:
                key_rec['child'] = [record]
                self.rec_map[key] = key_rec

    def __iter__(self):
        """Yield the mapped records, taking them from the end of the pending list."""
        if not self.is_loaded:
            self.load()
        if self._rec_list is None:
            self._rec_list = list(self.rec_map.values())

        pending = self._rec_list
        while pending:
            yield pending.pop()

    def lookup(self, left_rec) -> Optional[dict]:
        """Return the mapped record whose key fields equal those of ``left_rec``.

        The first hit moves the record from rec_map to matched_map; later
        lookups of the same key are served from matched_map.  Returns None
        when the key is unknown.
        """
        if not self.is_loaded:
            self.load()

        key = tuple(left_rec.get(name) for name in self.fields)
        found = self.rec_map.pop(key, None)
        if found is None:
            return self.matched_map.get(key)
        self.matched_map[key] = found
        return found

    def get_unlookedup_records(self):
        """Return a list of the records that lookup() has not returned yet."""
        if not self.is_loaded:
            self.load()
        return list(self.rec_map.values())
|
pjk/pipes/move_field.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
# djk/pipes/move_field.py
|
|
5
|
+
|
|
6
|
+
from pjk.base import Pipe, ParsedToken, Usage
|
|
7
|
+
|
|
8
|
+
class MoveField(Pipe):
    """Pipe that renames one field of each record (registered as 'as')."""

    @classmethod
    def usage(cls):
        """Return the Usage descriptor (arguments and example) for 'as'."""
        spec = Usage(
            name='as',
            desc='Move one field to another key in the record',
            component_class=cls
        )
        for arg_name, arg_help in (('src', 'Source field name'),
                                   ('dst', 'Destination field name')):
            spec.def_arg(name=arg_name, usage=arg_help)
        spec.def_example(expr_tokens=['{up:1}', 'as:up:down'], expect="{down:1}")

        return spec

    def __init__(self, ptok: ParsedToken, usage: Usage):
        """Read source and destination field names; start the record counter at zero."""
        super().__init__(ptok)
        self.src = usage.get_arg('src')
        self.dst = usage.get_arg('dst')
        self.count = 0

    def reset(self):
        """Zero the record counter."""
        self.count = 0

    def __iter__(self):
        """Yield each record, with the source field moved to the destination key if present."""
        for record in self.left:
            self.count += 1
            if self.src not in record:
                # Nothing to move; pass the record through untouched.
                yield record
                continue
            record[self.dst] = record.pop(self.src)
            yield record
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
#
|
|
4
|
+
# djk/pipes/postgres_pipe.py
|
|
5
|
+
|
|
6
|
+
import base64
|
|
7
|
+
import datetime as _dt
|
|
8
|
+
import uuid
|
|
9
|
+
from decimal import Decimal
|
|
10
|
+
from typing import Any, Dict, Optional
|
|
11
|
+
|
|
12
|
+
from pjk.base import Pipe, ParsedToken, NoBindUsage, Usage, TokenError
|
|
13
|
+
from pjk.common import Lookups
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class DBClient:
    """Simple shared-connection wrapper for pg8000.

    The first instance opens a connection and stores it on the class; later
    instances reuse it, whatever parameters they were given.
    """
    # Connection shared by all instances; None while no connection is open.
    _connection = None

    def __init__(self, host: str, username: str, password: Optional[str],
                 dbname: str, port: int = 5432, ssl: bool = False):
        """Attach to the shared connection, opening it first if necessary.

        Args:
            host, username, password, dbname, port: connection parameters
                passed to ``pg8000.connect``.
            ssl: when true, connect with a default SSL context.

        Raises:
            Exception: whatever pg8000 raises on a failed connection, after
                printing a short notice.
        """
        import pg8000  # lazy import
        if DBClient._connection is None:
            kwargs = dict(user=username, password=password, host=host, database=dbname, port=port)
            if ssl:
                import ssl as _ssl
                kwargs["ssl_context"] = _ssl.create_default_context()
            try:
                conn = pg8000.connect(**kwargs)
                conn.autocommit = True
            except Exception:
                print("Failed to connect to DB")
                raise
            # Publish only a fully configured connection so a failure above
            # cannot leave a half-initialised one behind for reuse.
            DBClient._connection = conn
        self.conn = DBClient._connection

    def close(self):
        """Close this client's connection; calling it again is a no-op."""
        conn = self.conn
        if conn is None:
            return
        # Forget the connection first so a repeated close() does nothing.
        self.conn = None
        try:
            conn.close()
        finally:
            # Clear the shared slot only if it still holds the connection we
            # just closed, not one opened since by another instance.
            if DBClient._connection is conn:
                DBClient._connection = None
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _iso_dt(x: _dt.datetime) -> str:
    """Format a datetime as ISO 8601, writing a '+00:00' offset as 'Z'."""
    return x.isoformat().replace("+00:00", "Z")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def normalize(obj: Any) -> Any:
    """Convert a value into plain JSON/YAML-friendly data.

    Conversions:
    - Decimal -> fixed-point string (no exponent)
    - datetime -> ISO-8601 string via _iso_dt
    - date / time -> ISO-8601 string
    - UUID -> string
    - bytes / bytearray / memoryview -> base64 text
    - dict -> dict with normalized values (keys untouched)
    - list / tuple / set -> list of normalized items
    Everything else, including None, is returned as-is.
    """
    if obj is None:
        return None

    # Containers first: recurse into their contents.
    if isinstance(obj, dict):
        return {key: normalize(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple, set)):
        return [normalize(item) for item in obj]

    # Scalars that need a textual representation.
    if isinstance(obj, Decimal):
        return format(obj, "f")  # exact value as string
    if isinstance(obj, _dt.datetime):
        # Must be tested before date: datetime is a subclass of date.
        return _iso_dt(obj)
    if isinstance(obj, (_dt.date, _dt.time)):
        return obj.isoformat()
    if isinstance(obj, uuid.UUID):
        return str(obj)
    if isinstance(obj, (bytes, bytearray, memoryview)):
        raw = bytes(obj)
        return base64.b64encode(raw).decode("ascii")

    return obj
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _row_to_dict(cursor, row) -> Dict[str, Any]:
    """Pair the cursor's column names with a row's values, normalizing each value.

    Column names are the first item of every entry in ``cursor.description``.
    """
    result: Dict[str, Any] = {}
    names = [column[0] for column in cursor.description]
    for name, value in zip(names, row):
        result[name] = normalize(value)
    return result
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class PostgresPipe(Pipe):
    """Pipe that runs the SQL found in each input record and emits the result rows.

    Each input record supplies the statement under 'query' and, optionally,
    its parameters under 'params'.  Connection settings are read from the
    Lookups entry named by the 'dbname' argument.
    """

    @classmethod
    def usage(cls):
        """Return the Usage descriptor (argument, parameter and examples) for 'pgres'."""
        spec = Usage(
            name="pgres",
            desc="Postgres query pipe; executes SQL from input record['query'].",
            component_class=cls,
        )
        spec.def_arg(
            "dbname",
            "name of db. Entry in ~/.pjk/lookups.yaml containing host, user, password"
        )
        spec.def_param(
            "header",
            usage="emit header record before query results",
            valid_values={"true", "false"}, default='true',
        )

        spec.def_example(expr_tokens=['myquery.sql', 'pgres:mydb'], expect=None)
        spec.def_example(expr_tokens=["{'query': 'SELECT * from MY_TABLE;'}", 'pgres:mydb'], expect=None)
        return spec

    def __init__(self, ptok: ParsedToken, usage: Usage):
        """Resolve connection settings for the requested database.

        Raises:
            TokenError: if Lookups has no (or an empty) entry for the
                database name.
        """
        super().__init__(ptok, usage)

        self.dbname = usage.get_arg("dbname")
        settings = Lookups().get(self.dbname)
        if not settings:
            # f-string so dbname prints correctly
            raise TokenError(
                f"~/.pjk/lookups.yaml must contain entry for '{self.dbname}' with host, user, password."
            )

        self.db_host = settings.get("host")
        self.db_user = settings.get("user")
        self.db_pass = settings.get("password")
        self.db_port = int(settings.get("port", 5432))
        self.db_ssl = bool(settings.get("ssl", False))

        self.query_field = "query"      # SQL string
        self.params_field = "params"    # optional: list/tuple (positional) or dict (named)
        self.do_header = usage.get_param("header") == "true"

    def reset(self):
        """Nothing to reset."""
        # stateless across reset
        pass

    def _make_header(self, cur, query: str, params=None) -> Dict[str, Any]:
        """Build the ``{"header": {...}}`` record describing an executed statement.

        When the result has the single column 'ingest_event', its one row is
        fetched (and discarded) here and the header names the function
        instead of a row count.
        """
        header = {
            "query": query,
            "db": self.dbname,
            "dbhost": self.db_host,
        }
        if params:
            header["params"] = params

        if not cur.description:
            # Statement produced no result set.
            header["result"] = "ok"
            header["rowcount"] = cur.rowcount
            return {"header": header}

        columns = [d[0] for d in cur.description]
        if len(columns) == 1 and columns[0] == "ingest_event":
            _ = cur.fetchone()  # consume void row
            header["result"] = "ok"
            header["function"] = "ingest_event"
        else:
            header["result"] = "ok"
            # -1 is reported as None.
            header["rowcount"] = cur.rowcount if cur.rowcount != -1 else None

        return {"header": header}

    def __iter__(self):
        """Execute each input record's query and yield header and row records.

        Records without a query produce ``{"_error": "missing query"}``.
        The cursor is closed after every statement and the client when
        iteration ends.
        """
        client = DBClient(
            host=self.db_host,
            username=self.db_user,
            password=self.db_pass,
            dbname=self.dbname,
            port=self.db_port,
            ssl=self.db_ssl,
        )
        try:
            for input_record in self.left:
                query = input_record.get(self.query_field)
                if not query:
                    yield {"_error": "missing query"}
                    continue
                params = input_record.get(self.params_field)

                cur = client.conn.cursor()
                try:
                    # execute
                    if params is None:
                        cur.execute(query)
                    elif isinstance(params, (list, tuple, dict)):
                        cur.execute(query, params)
                    else:
                        # A lone scalar is wrapped as a one-element tuple.
                        cur.execute(query, (params,))

                    # yield header first
                    if self.do_header:
                        yield self._make_header(cur, query, params)

                    # then stream rows if it was a real SELECT with results
                    if cur.description:
                        columns = [d[0] for d in cur.description]
                        is_void_ingest = len(columns) == 1 and columns[0] == "ingest_event"
                        if not is_void_ingest:
                            for row in cur:
                                yield _row_to_dict(cur, row)
                finally:
                    cur.close()
        finally:
            client.close()
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
# djk/pipes/remove_field.py
|
|
5
|
+
|
|
6
|
+
from pjk.base import Pipe, ParsedToken, Usage, UsageError
|
|
7
|
+
|
|
8
|
+
class RemoveField(Pipe):
    """Pipe that deletes the listed fields from every record (registered as 'drop')."""

    @classmethod
    def usage(cls):
        """Return the Usage descriptor (argument and example) for 'drop'."""
        usage = Usage(
            name='drop',
            desc='Remove one or more fields from each record',
            component_class=cls
        )
        usage.def_arg(name='fields', usage='Comma-separated list of field names to drop')
        usage.def_example(expr_tokens=["{id:1, dir:'up', color:'blue'}", 'drop:id,color'], expect="dir: 'up'")
        return usage

    def __init__(self, ptok: ParsedToken, usage: Usage):
        """Parse the comma-separated field list, ignoring blanks and surrounding spaces.

        Raises:
            UsageError: if no field name remains after trimming.
        """
        super().__init__(ptok)
        arg_string = usage.get_arg('fields')
        self.fields = [f.strip() for f in arg_string.split(',') if f.strip()]
        if not self.fields:
            # The message names the command as the user typed it ('drop').
            raise UsageError("drop must include at least one valid field name")
        self.count = 0

    def reset(self):
        """Zero the record counter."""
        self.count = 0

    def __iter__(self):
        """Yield each record after removing the configured fields in place.

        Fields that are absent from a record are ignored.
        """
        for record in self.left:
            self.count += 1
            for field in self.fields:
                record.pop(field, None)
            yield record
|