python-jack-knife 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pjk/__init__.py +5 -0
- pjk/base.py +377 -0
- pjk/common.py +150 -0
- pjk/log.py +67 -0
- pjk/main.py +106 -0
- pjk/man_page.py +125 -0
- pjk/parser.py +284 -0
- pjk/pipes/__init__.py +0 -0
- pjk/pipes/denorm.py +68 -0
- pjk/pipes/factory.py +62 -0
- pjk/pipes/filter.py +57 -0
- pjk/pipes/head.py +34 -0
- pjk/pipes/join.py +85 -0
- pjk/pipes/let_reduce.py +198 -0
- pjk/pipes/map.py +91 -0
- pjk/pipes/move_field.py +36 -0
- pjk/pipes/postgres_pipe.py +209 -0
- pjk/pipes/remove_field.py +36 -0
- pjk/pipes/select.py +42 -0
- pjk/pipes/sort.py +63 -0
- pjk/pipes/tail.py +39 -0
- pjk/pipes/user_pipe_factory.py +45 -0
- pjk/pipes/where.py +49 -0
- pjk/registry.py +143 -0
- pjk/sinks/__init__.py +0 -0
- pjk/sinks/csv_sink.py +33 -0
- pjk/sinks/ddb.py +54 -0
- pjk/sinks/devnull.py +31 -0
- pjk/sinks/dir_sink.py +59 -0
- pjk/sinks/expect.py +53 -0
- pjk/sinks/factory.py +108 -0
- pjk/sinks/graph.py +57 -0
- pjk/sinks/graph_bar_line.py +229 -0
- pjk/sinks/graph_cumulative.py +55 -0
- pjk/sinks/graph_hist.py +72 -0
- pjk/sinks/graph_scatter.py +29 -0
- pjk/sinks/json_sink.py +23 -0
- pjk/sinks/s3_sink.py +100 -0
- pjk/sinks/sinks.py +68 -0
- pjk/sinks/stdout.py +44 -0
- pjk/sinks/tsv_sink.py +22 -0
- pjk/sinks/user_sink_factory.py +43 -0
- pjk/sources/__init__.py +0 -0
- pjk/sources/csv_source.py +28 -0
- pjk/sources/dir_source.py +69 -0
- pjk/sources/factory.py +100 -0
- pjk/sources/format_usage.py +11 -0
- pjk/sources/inline_source.py +56 -0
- pjk/sources/json_source.py +35 -0
- pjk/sources/lazy_file.py +16 -0
- pjk/sources/lazy_file_local.py +22 -0
- pjk/sources/lazy_file_s3.py +28 -0
- pjk/sources/parquet_source.py +32 -0
- pjk/sources/s3_source.py +146 -0
- pjk/sources/source_list.py +23 -0
- pjk/sources/sql_source.py +32 -0
- pjk/sources/tsv_source.py +15 -0
- pjk/sources/user_source_factory.py +33 -0
- pjk/version.py +4 -0
- python_jack_knife-0.5.0.dist-info/METADATA +254 -0
- python_jack_knife-0.5.0.dist-info/RECORD +65 -0
- python_jack_knife-0.5.0.dist-info/WHEEL +5 -0
- python_jack_knife-0.5.0.dist-info/entry_points.txt +2 -0
- python_jack_knife-0.5.0.dist-info/licenses/LICENSE +202 -0
- python_jack_knife-0.5.0.dist-info/top_level.txt +1 -0
pjk/pipes/select.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
# pjk/pipes/select.py
|
|
5
|
+
|
|
6
|
+
from pjk.base import Pipe, Usage, ParsedToken, UsageError
|
|
7
|
+
|
|
8
|
+
class SelectFields(Pipe):
    """Pipe that keeps only the fields named in its ``fields`` argument.

    Each record read from ``self.left`` is trimmed in place and then yielded.
    """

    deep_copyable: bool = True

    @classmethod
    def usage(cls):
        """Return the ``sel`` usage descriptor: one ``fields`` arg plus an example."""
        spec = Usage(
            name='sel',
            desc='Keep only the specified fields from each record',
            component_class=cls
        )
        spec.def_arg(name='fields', usage='Comma-separated list of fields to retain')
        spec.def_example(expr_tokens=["{id:1, dir:'up', color:'blue'}", 'sel:id,color'], expect="id: 1, color:'blue'")
        return spec

    def __init__(self, ptok: ParsedToken, usage: Usage):
        """Parse the comma-separated ``fields`` argument into ``self.keep_fields``.

        Raises:
            UsageError: if the argument is empty/missing, or if it contains
                no non-blank field name (e.g. ``","``).
        """
        super().__init__(ptok)

        raw_fields = usage.get_arg('fields')
        if not raw_fields:
            raise UsageError("select:<f1,f2,...> requires at least one field")

        # Blank entries (from "a,,b" or surrounding spaces) are dropped.
        stripped = (piece.strip() for piece in raw_fields.split(','))
        self.keep_fields = {name for name in stripped if name}
        if not self.keep_fields:
            raise UsageError("select must include at least one valid field name")

    def reset(self):
        """No-op: this pipe keeps no per-run state."""
        return None

    def __iter__(self):
        """Yield every record from ``self.left`` with the unwanted keys removed."""
        for rec in self.left:
            # Snapshot the keys to drop first so the record is not mutated
            # while its keys are being walked.
            unwanted = [key for key in rec.keys() if key not in self.keep_fields]
            for key in unwanted:
                rec.pop(key)
            yield rec
|
pjk/pipes/sort.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
# pjk/pipes/sort.py
|
|
5
|
+
|
|
6
|
+
from pjk.base import Pipe, ParsedToken, Usage, UsageError
|
|
7
|
+
|
|
8
|
+
class SortPipe(Pipe):
    """Buffer every record from ``self.left`` and emit them sorted by one field.

    Records whose sort field is absent or ``None`` are always emitted after
    the sorted records, whatever the sort direction.
    """

    @classmethod
    def usage(cls):
        """Return the ``sort`` usage descriptor (one ``field`` arg, two examples)."""
        usage = Usage(
            name='sort',
            desc="Sort records by a single field (records with missing field sort last).",
            component_class=cls
        )
        usage.def_arg(name='field', usage="+name or -name for ascending or decending sort by field 'name'.")
        usage.def_example(expr_tokens=["[{id:17}, {id:10}, {id:1}]", 'sort:+id'], expect="[{id:1}, {id:10}, {id:17}]")
        usage.def_example(expr_tokens=["[{id:1}, {color:'blue'}, {color:'green'}]", 'sort:-color'], expect="[{color:'green'}, {color:'blue'}, {id:1}]")
        return usage

    def __init__(self, ptok: ParsedToken, usage: Usage):
        """Parse the ``[+-]<field>`` argument into ``self.field`` / ``self.reverse``.

        Raises:
            UsageError: if the argument is missing, does not start with
                ``+`` or ``-``, or names no field after the direction sign.
        """
        super().__init__(ptok)

        arg_string = usage.get_arg('field')
        if not arg_string:
            raise UsageError("sort:[+-]<field> requires direction and field name")

        if arg_string.startswith("-"):
            self.reverse = True
        elif arg_string.startswith("+"):
            self.reverse = False
        else:
            raise UsageError("sort:[+-]<field> must start with '+' or '-'")

        self.field = arg_string[1:]
        if not self.field:
            # A bare "+" or "-" would otherwise treat every record as
            # "missing the field" and silently return the input unsorted.
            raise UsageError("sort:[+-]<field> requires direction and field name")

        self._buffer = None  # sorted records, filled on first iteration
        self._index = 0      # position of the next record to yield

    def reset(self):
        """Forget the buffered records and restart from the beginning."""
        self._buffer = None
        self._index = 0

    def __iter__(self):
        """Yield the sorted records, resuming from ``self._index``.

        The input is read and sorted only when no buffer exists yet; the
        yield position persists across iterations until ``reset()`` is called.
        """
        if self._buffer is None:
            # Single pass: split records by whether the sort field is usable.
            present, missing = [], []
            for record in self.left:
                bucket = missing if record.get(self.field) is None else present
                bucket.append(record)

            present.sort(
                key=lambda r: r.get(self.field),
                reverse=self.reverse
            )

            self._buffer = present + missing  # always push missing to the end

        while self._index < len(self._buffer):
            yield self._buffer[self._index]
            self._index += 1
|
|
62
|
+
|
|
63
|
+
|
pjk/pipes/tail.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
# pjk/pipes/tail.py
|
|
5
|
+
|
|
6
|
+
from pjk.base import Pipe, ParsedToken, Usage
|
|
7
|
+
|
|
8
|
+
class TailPipe(Pipe):
    """Emit only the last ``limit`` records read from ``self.left``."""

    @classmethod
    def usage(cls):
        """Return the ``tail`` usage descriptor (numeric ``limit`` arg, one example)."""
        usage = Usage(
            name='tail',
            desc='take last records of input (when single-threaded)',
            component_class=cls
        )
        usage.def_arg(name='limit', usage='number of records', is_num=True)
        usage.def_example(expr_tokens=['[{id:1}, {id:2}]', 'tail:1'], expect="{id:2}")
        return usage

    def __init__(self, ptok: ParsedToken, usage: Usage):
        """Store the ``limit`` argument and start with an empty buffer."""
        super().__init__(ptok)
        # Declared with is_num=True above; presumably already numeric here.
        self.limit = usage.get_arg('limit')

        self.buffer = []     # the retained tail, filled on first iteration
        self.ready = False   # True once the input has been fully consumed

    def reset(self):
        """Drop the retained records so the next iteration re-reads the input."""
        self.buffer.clear()
        self.ready = False

    def __iter__(self):
        """Consume the whole input once, then yield the retained tail."""
        if not self.ready:
            # Local import keeps this class self-contained.
            from collections import deque

            # A bounded deque evicts the oldest record in O(1); the previous
            # list.pop(0) per record made long inputs quadratic.  Limits of
            # zero or below keep nothing, exactly as before.
            keep = max(int(self.limit), 0)
            self.buffer = list(deque(self.left, maxlen=keep))
            self.ready = True

        yield from self.buffer
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
# pjk/pipes/user_pipe_factory.py
|
|
5
|
+
|
|
6
|
+
import importlib.util
|
|
7
|
+
from typing import Optional
|
|
8
|
+
from pjk.base import Pipe, Sink, ParsedToken, UsageError
|
|
9
|
+
|
|
10
|
+
class UserPipeFactory:
    """Builds a pipe from a user-supplied Python script."""

    @staticmethod
    def create(ptok: ParsedToken) -> Optional[Pipe]:
        """Load the script named by ``ptok.pre_colon`` and instantiate its Pipe.

        The script is executed as a module named ``user_pipe``.  It must
        define at most one class that subclasses ``Pipe`` (and is neither a
        ``Sink`` nor ``Pipe`` itself).

        Returns:
            The constructed pipe, or ``None`` when the script defines no
            qualifying class.

        Raises:
            UsageError: if the file cannot be loaded or executed, or if it
                defines more than one qualifying class.
        """
        script_path = ptok.pre_colon

        try:
            # Load module dynamically from script path
            spec = importlib.util.spec_from_file_location("user_pipe", script_path)
            if spec is None or spec.loader is None:
                raise UsageError(f"Could not load Python file: {script_path}")

            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
        except UsageError:
            # Already a user-facing error; do not wrap it a second time.
            raise
        except Exception as e:
            # Chain the cause so the script's own traceback stays reachable.
            raise UsageError(f"Failed to import {script_path}: {e}") from e

        # Look for exactly one top-level Pipe class that isn't a Sink or base Pipe
        pipe_cls = None
        for value in vars(module).values():
            if (
                isinstance(value, type)
                and issubclass(value, Pipe)
                and not issubclass(value, Sink)
                and value is not Pipe
                and value.__module__ == module.__name__  # defined in the script, not imported
            ):
                if pipe_cls is not None:
                    raise UsageError(f"Multiple Pipe classes found in {script_path}. Only one is allowed.")
                pipe_cls = value

        if pipe_cls is None:
            return None

        usage = pipe_cls.usage()
        usage.bind(ptok)
        return pipe_cls(ptok, usage)
|
pjk/pipes/where.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
# pjk/pipes/where.py
|
|
5
|
+
|
|
6
|
+
from pjk.base import Pipe, ParsedToken, NoBindUsage, Usage, UsageError
|
|
7
|
+
from pjk.common import SafeNamespace
|
|
8
|
+
|
|
9
|
+
class WherePipe(Pipe):
    """Pass through only the records for which a Python expression is truthy.

    The expression sees the current record as ``f`` (a ``SafeNamespace``
    wrapping the record).
    """

    @classmethod
    def usage(cls):
        """Return the ``where`` usage descriptor (one ``expr`` arg, two examples)."""
        usage = NoBindUsage(
            name='where',
            desc="Filter records using a Python expression over fields",
            component_class=cls
        )
        usage.def_arg(name='expr', usage='Python expression using \'f.<field>\' syntax')
        usage.def_example(expr_tokens=["[{size:1}, {size:5}, {size:10}]", "where:f.size >= 5"], expect="[{size:5}, {size:10}]")
        usage.def_example(expr_tokens=["[{color:'blue'}, {color:'red'}, {color:'black'}]", "where:f.color.startswith('bl')"], expect="[{color:'blue'}, {color:'black'}]")
        return usage

    def __init__(self, ptok: ParsedToken, usage: Usage):
        """Compile everything after the first ``:`` of the token as an expression.

        Raises:
            UsageError: if the token has no ``:`` separator, or if the
                expression does not compile.
        """
        super().__init__(ptok, usage)

        # Take the raw text after the first colon; the expression itself may
        # contain further colons (slices, dict literals, ...).
        _, separator, expr = ptok.whole_token.partition(':')
        if not separator:
            # Previously surfaced as a bare IndexError.
            raise UsageError("where:<expr> requires a Python expression")

        self.expr = expr
        try:
            self.code = compile(self.expr, '<where>', 'eval')
        except Exception as e:
            raise UsageError(f"Invalid where expression: {self.expr}") from e

    def reset(self):
        """No-op: this pipe keeps no per-run state."""
        pass  # stateless

    def __iter__(self):
        """Yield the records from ``self.left`` that satisfy the expression."""
        for record in self.left:
            f = SafeNamespace(record)
            try:
                # NOTE(security): eval() runs arbitrary Python taken from the
                # command-line token.  Empty globals do not sandbox it; never
                # feed this pipe an expression from an untrusted source.
                if eval(self.code, {}, {'f': f}):
                    yield record
            except Exception:
                # Deliberate best effort: a record the expression cannot be
                # evaluated against is dropped rather than aborting the run.
                continue  # ignore eval errors

    def deep_copy(self):
        """Return a new WherePipe over a deep copy of the input, or ``None``.

        ``None`` is returned when ``self.left.deep_copy()`` yields a falsy value.
        """
        source_clone = self.left.deep_copy()
        if source_clone:
            pipe = WherePipe(self.ptok, self.usage)
            pipe.add_source(source_clone)
            return pipe
        else:
            return None
|
pjk/registry.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
from pjk.sinks.factory import SinkFactory
|
|
6
|
+
from pjk.pipes.factory import PipeFactory
|
|
7
|
+
from pjk.sources.factory import SourceFactory
|
|
8
|
+
import importlib.util
|
|
9
|
+
import importlib
|
|
10
|
+
import importlib.metadata
|
|
11
|
+
from pjk.base import Pipe, Source, Sink
|
|
12
|
+
|
|
13
|
+
class ComponentRegistry:
    """Owns the source, pipe and sink factories and routes components to them."""

    def __init__(self):
        """Create the three factories, then load user plugins and package extras."""
        self.source_factory = SourceFactory()
        self.pipe_factory = PipeFactory()
        self.sink_factory = SinkFactory()

        sources, pipes, sinks = load_user_components()
        for name, comp in sources.items():
            self.source_factory.register(name, comp)

        for name, comp in pipes.items():
            self.pipe_factory.register(name, comp)

        for name, comp in sinks.items():
            self.sink_factory.register(name, comp)

        load_package_extras()

    def register(self, name, comp):
        """Register component class ``comp`` with the factory matching its kind.

        Sinks are tested first, then pipes, then sources, so a class is
        claimed by the most specific base it derives from.  Anything that is
        not a class, or is one of the base classes themselves, is ignored.
        For pipes that expose ``usage()``, the name declared there replaces
        ``name``.

        The previous implementation called the module-level ``is_pipe`` /
        ``is_sink`` / ``is_source`` helpers without their required ``module``
        argument, so every call raised ``TypeError``; it also invoked the
        source factory object itself instead of its ``register`` method.
        """
        if not isinstance(comp, type) or any(comp is base for base in (Pipe, Source, Sink)):
            return

        if issubclass(comp, Sink):
            self.sink_factory.register(name, comp)
        elif issubclass(comp, Pipe):
            if hasattr(comp, "usage"):
                name = comp.usage().name
            self.pipe_factory.register(name, comp)
        elif issubclass(comp, Source):
            self.source_factory.register(name, comp)

    def create_source(self, token: str):
        """Delegate to the source factory's ``create``."""
        return self.source_factory.create(token)

    def create_pipe(self, token: str):
        """Delegate to the pipe factory's ``create``."""
        return self.pipe_factory.create(token)

    def create_sink(self, token: str):
        """Delegate to the sink factory's ``create``."""
        return self.sink_factory.create(token)

    def get_factories(self):
        """Return the factories as ``[source, pipe, sink]``."""
        return [self.source_factory, self.pipe_factory, self.sink_factory]

    def print_usage(self):
        """Print the command synopsis followed by each factory's descriptions."""
        print('Usage: pjk <source> [<pipe> ...] <sink>')
        print('       pjk man <component> | --all')
        print('       pjk examples')
        print()
        self.source_factory.print_descriptions()
        print()
        self.pipe_factory.print_descriptions()
        print()
        self.sink_factory.print_descriptions()
|
|
67
|
+
|
|
68
|
+
def is_source(obj, module):
    """Return True if ``obj`` is a Source class defined in ``module``.

    ``obj`` qualifies when it is a class deriving from ``Source`` but from
    neither ``Pipe`` nor ``Sink``, is not ``Source`` itself, and its
    ``__module__`` equals ``module.__name__`` (so imported classes are
    excluded).
    """
    if not isinstance(obj, type):
        return False
    if not issubclass(obj, Source):
        return False
    if issubclass(obj, Pipe) or issubclass(obj, Sink):
        return False
    if obj is Source:
        return False
    # Only classes defined by the module itself count.
    return obj.__module__ == module.__name__
|
|
77
|
+
|
|
78
|
+
def is_pipe(obj, module):
    """Return True if ``obj`` is a Pipe class defined in ``module``.

    ``obj`` qualifies when it is a class deriving from ``Pipe`` but not from
    ``Sink``, is not ``Pipe`` itself, and its ``__module__`` equals
    ``module.__name__``.
    """
    if not isinstance(obj, type):
        return False
    if not issubclass(obj, Pipe) or issubclass(obj, Sink):
        return False
    if obj is Pipe:
        return False
    return obj.__module__ == module.__name__
|
|
86
|
+
|
|
87
|
+
def is_sink(obj, module):
    """Return True if ``obj`` is a Sink class defined in ``module``.

    ``obj`` qualifies when it is a class deriving from ``Sink``, is not
    ``Sink`` itself, and its ``__module__`` equals ``module.__name__``.
    """
    if not isinstance(obj, type):
        return False
    if not issubclass(obj, Sink):
        return False
    if obj is Sink:
        return False
    return obj.__module__ == module.__name__
|
|
94
|
+
|
|
95
|
+
def load_user_components(path=os.path.expanduser("~/.pjk/plugins")):
    """Import every ``*.py`` file in ``path`` and collect the components it defines.

    Each file is executed as a module named ``user_component_<stem>``.  Every
    class defined in that module is classified as a sink, pipe or source and
    stored under the name reported by its ``usage()``.

    Args:
        path: Plugin directory.  The default is resolved once, when this
            function is defined.

    Returns:
        A ``(sources, pipes, sinks)`` tuple of ``{name: class}`` dicts; all
        three are empty when ``path`` is not a directory.

    A file that fails to import is reported on stderr and skipped.
    """
    import sys  # local import: only needed for diagnostics

    sources = {}
    pipes = {}
    sinks = {}

    if not os.path.isdir(path):
        return sources, pipes, sinks

    # Sorted so that, when two plugins declare the same component name, the
    # winner does not depend on directory listing order.
    for fname in sorted(os.listdir(path)):
        if not fname.endswith(".py"):
            continue
        fpath = os.path.join(path, fname)
        modname = f"user_component_{fname[:-3]}"
        spec = importlib.util.spec_from_file_location(modname, fpath)
        if not spec or not spec.loader:
            continue
        module = importlib.util.module_from_spec(spec)
        try:
            spec.loader.exec_module(module)
        except Exception as e:
            print(f"[pjk] Failed to load {fname}: {e}", file=sys.stderr)
            continue

        for obj in vars(module).values():
            if not isinstance(obj, type):
                continue

            # Classify first.  usage() used to be called on every class that
            # had one, including base classes the plugin merely imported, and
            # a class without usage() reused the previous class's name (or
            # hit an unbound local).
            if is_sink(obj, module):
                bucket = sinks
            elif is_pipe(obj, module):
                bucket = pipes
            elif is_source(obj, module):
                bucket = sources
            else:
                continue

            if not hasattr(obj, "usage"):
                print(f"[pjk] Skipping {obj.__name__} in {fname}: no usage() defined", file=sys.stderr)
                continue

            bucket[obj.usage().name] = obj

    return sources, pipes, sinks
|
|
133
|
+
|
|
134
|
+
def load_package_extras():
|
|
135
|
+
"""
|
|
136
|
+
Discover and import all installed pjk extras (via entry points).
|
|
137
|
+
"""
|
|
138
|
+
for ep in importlib.metadata.entry_points(group="pjk.package_extras"):
|
|
139
|
+
try:
|
|
140
|
+
importlib.import_module(ep.value)
|
|
141
|
+
print(f"[pjk] loaded package extra: {ep.name} -> {ep.value}")
|
|
142
|
+
except Exception as e:
|
|
143
|
+
print(f"[pjk] failed to load extra {ep.name}: {e}")
|
pjk/sinks/__init__.py
ADDED
|
File without changes
|
pjk/sinks/csv_sink.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
import csv
|
|
5
|
+
from pjk.base import Sink, Source, ParsedToken, Usage
|
|
6
|
+
|
|
7
|
+
class CSVSink(Sink):
    """Sink that writes its input records to ``<path>.csv``.

    The header row is taken from the keys of the first record.
    """

    is_format = True

    @classmethod
    def usage(cls):
        """Return the ``csv`` usage descriptor with its single ``path`` arg."""
        spec = Usage(
            name='csv',
            desc='Write records to a CSV file with dynamic header from first record',
            component_class=cls
        )
        spec.def_arg('path', usage='Path prefix (no extension)')
        return spec

    def __init__(self, ptok: ParsedToken, usage: Usage):
        """Derive the output file name by appending ``.csv`` to the ``path`` arg."""
        super().__init__(ptok, usage)
        self.path = f"{usage.get_arg('path')}.csv"

    def process(self) -> None:
        """Write every record from ``self.input`` as one CSV row.

        The file is opened (and truncated) before the first record is read,
        so an empty input still produces an empty file.
        """
        with open(self.path, 'w', newline='') as out:
            writer = None

            for position, row in enumerate(self.input):
                if position == 0:
                    # The first record fixes the column set for the whole file.
                    writer = csv.DictWriter(out, fieldnames=row.keys())
                    writer.writeheader()
                writer.writerow(row)
|
pjk/sinks/ddb.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
from pjk.base import Sink, Source, ParsedToken, Usage
|
|
5
|
+
from decimal import Decimal
|
|
6
|
+
|
|
7
|
+
class DDBSink(Sink):
    """Sink that writes its input records to a DynamoDB table in batches."""

    @classmethod
    def usage(cls):
        """Return the ``ddb`` usage descriptor (``table`` arg, ``batch_size`` param)."""
        usage = Usage(
            name='ddb',
            desc='Write records to a DynamoDB table via batch_writer()',
            component_class=cls
        )
        usage.def_arg('table', usage='DynamoDB table name')
        usage.def_param('batch_size', usage='How many records to write per batch (max 25)')
        return usage

    # NOTE(review): every other sink visible alongside this one takes
    # (ptok, usage) and calls super().__init__(ptok, usage), and DirSink builds
    # sinks as sink_class(ptok, usage).  This three-argument form looks stale;
    # confirm against Sink.__init__ and the sink factory before relying on it.
    def __init__(self, input_source: Source, ptok: ParsedToken, usage: Usage):
        """Read the table name and batch size and open the DynamoDB table handle."""
        super().__init__(input_source)
        import boto3  # lazy import

        self.table_name = usage.get_arg('table')
        self.batch_size = int(usage.get_param('batch_size', default='10'))
        self.num_recs = 0   # records consumed so far
        self.batch = []     # records waiting to be written

        dynamodb = boto3.resource('dynamodb')
        self.table = dynamodb.Table(self.table_name)

    @staticmethod
    def _ddb_safe(value):
        """Return ``value`` with every float, at any depth, replaced by a Decimal.

        Only dicts, lists and tuples are descended into (tuples come back as
        lists); any other value is returned unchanged.
        """
        if isinstance(value, float):
            # Go through str() so 0.1 becomes Decimal('0.1'), not its binary expansion.
            return Decimal(str(value))
        if isinstance(value, dict):
            return {k: DDBSink._ddb_safe(v) for k, v in value.items()}
        if isinstance(value, (list, tuple)):
            return [DDBSink._ddb_safe(v) for v in value]
        return value

    def process_batch(self):
        """Write the pending records through ``batch_writer()`` and clear the batch."""
        if not self.batch:
            return

        with self.table.batch_writer() as batch:
            for item in self.batch:
                # Previously only top-level floats were converted, so a float
                # nested inside a list or map still reached boto3 as a float.
                clean_item = {k: self._ddb_safe(v) for k, v in item.items()}
                batch.put_item(Item=clean_item)

        self.batch = []

    def process(self):
        """Consume ``self.input``, flushing each time ``batch_size`` records are pending."""
        for record in self.input:
            self.batch.append(record)
            self.num_recs += 1

            if len(self.batch) >= self.batch_size:
                self.process_batch()

        self.process_batch()  # flush the final partial batch
        print(f"DDBSink wrote {self.num_recs} records.")
|
pjk/sinks/devnull.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
# pjk/sinks/devnull.py
|
|
5
|
+
|
|
6
|
+
from pjk.base import Sink, Source, ParsedToken, Usage
|
|
7
|
+
|
|
8
|
+
class DevNullSink(Sink):
    """Sink that reads and discards every record, keeping only a count."""

    @classmethod
    def usage(cls):
        """Return the ``devnull`` usage descriptor (no args, one example)."""
        spec = Usage(
            name='devnull',
            desc='Consume all input records and discard them (debug/testing)',
            component_class=cls
        )
        spec.def_example(expr_tokens=['{id:1}', 'devnull'], expect=None)
        return spec

    def __init__(self, ptok: ParsedToken, usage: Usage):
        """Start with a record count of zero."""
        super().__init__(ptok, usage)
        self.count = 0

    def process(self):
        """Drain ``self.input``, incrementing ``self.count`` once per record."""
        for _ in self.input:
            self.count = self.count + 1

    def print_info(self):
        """Print the number of records consumed so far."""
        print(f'num_recs:{self.count}')

    def deep_copy(self):
        """Always return ``None``: copying is not supported yet."""
        # until we implement cross-thread coordination
        return None
|
pjk/sinks/dir_sink.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
from pjk.base import Source, Sink, ParsedToken, Usage
|
|
6
|
+
from pjk.log import logger
|
|
7
|
+
|
|
8
|
+
class DirSink(Sink):
    """Sink that writes its input into a numbered file inside a directory.

    The actual writing is delegated to an instance of ``sink_class`` created
    for a file named ``file-NNNN`` under the target directory.
    """

    @classmethod
    def usage(cls):
        """Return the ``<format>`` usage descriptor with its single ``dir`` arg."""
        spec = Usage(
            name='<format>',
            desc='Write records to a local directory in the given <format> (e.g., csv)',
            component_class=cls
        )
        spec.def_arg(name='dir', usage='Path to output directory')
        return spec

    def __init__(self, ptok: ParsedToken, usage: Usage, sink_class: type, is_gz: bool, fileno: int = 0):
        """Record the construction arguments and make sure the directory exists.

        Args:
            ptok: Token this sink was built from; kept for ``deep_copy``.
            usage: Bound usage; supplies the ``dir`` argument.
            sink_class: Sink class instantiated in ``process`` to write the file.
            is_gz: Interpolated into the token handed to ``sink_class``.
            fileno: Number used in this instance's ``file-NNNN`` name.
        """
        super().__init__(ptok, usage)
        self.ptok = ptok
        self.usage = usage
        self.sink_class = sink_class
        self.is_gz = is_gz
        self.fileno = fileno
        # Number handed to the next clone made by deep_copy().
        self.num_files = 1
        # Read the directory from the bound usage rather than from ptok.
        self.dir_path = usage.get_arg('dir')

        os.makedirs(self.dir_path, exist_ok=True)

    def process(self):
        """Build the per-file sink, attach ``self.input`` to it and run it."""
        target = os.path.join(self.dir_path, f'file-{self.fileno:04d}')
        inner_ptok = ParsedToken(f'{target}:{self.is_gz}')
        inner_usage = self.sink_class.usage()
        inner_usage.bind(inner_ptok)

        inner_sink = self.sink_class(inner_ptok, inner_usage)
        inner_sink.add_source(self.input)

        logger.debug(f'in process sinking to: {target}')
        inner_sink.process()

    def deep_copy(self):
        """Return a DirSink over a copy of the input, writing to the next file number.

        Returns ``None`` when ``self.input.deep_copy()`` yields a falsy value.
        """
        input_copy = self.input.deep_copy()
        if input_copy:
            twin = DirSink(
                ptok=self.ptok,
                usage=self.usage,
                sink_class=self.sink_class,
                is_gz=self.is_gz,
                fileno=self.num_files
            )
            twin.add_source(input_copy)

            # Advance only after a clone was actually created.
            self.num_files += 1
            return twin

        return None
|
pjk/sinks/expect.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
from pjk.base import Source, Sink, ParsedToken, Usage
|
|
5
|
+
from pjk.sources.inline_source import InlineSource
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
class ExpectSink(Sink):
    """Sink that checks its input, record by record, against an inline expectation."""

    # NOTE: ExpectSink intentionally does NOT use Usage due to raw JSON argument parsing
    # e.g., expect:'[{a:1},{a:2}]' must preserve the entire post-colon string unparsed

    def __init__(self, ptok: ParsedToken, usage: Usage):
        """Turn everything after the first ``:`` of the token into the expected records."""
        super().__init__(ptok, usage)
        self.inline = ptok.whole_token.split(':', 1)[-1]
        self.expect_source = InlineSource(self.inline)
        self._expect_iter = iter(self.expect_source)

    @staticmethod
    def _command_text():
        """Return the command-line arguments minus the program name and the last token."""
        return ' '.join(sys.argv[1:-1])  # omit 'pjk' and 'expect'

    def _failure(self, command, expected, got):
        """Build (without raising) the ValueError describing one mismatch."""
        return ValueError(
            f"expect failure: {command}\n"
            f"expected_record:{expected}\n"
            f"got_record:{got}\n"
            f"entire_expected:{self.inline}"
        )

    def print_info(self):
        """Print the success line for the command."""
        print(f'{self._command_text()} ==> OK!\n')  # only prints on success

    def process(self) -> None:
        """Compare ``self.input`` with the expected records, in order.

        Raises:
            ValueError: on the first differing record, when the input has
                more records than expected, or when expected records remain
                after the input is exhausted.
        """
        command = self._command_text()

        for actual in self.input:
            try:
                wanted = next(self._expect_iter)
            except StopIteration:
                # The input produced a record beyond the end of the expectation.
                raise self._failure(command, None, actual)

            if actual != wanted:
                raise self._failure(command, wanted, actual)

        # The input is exhausted; any expected record still pending is a failure.
        try:
            leftover = next(self._expect_iter)
        except StopIteration:
            return
        raise self._failure(command, leftover, None)
|