python-jack-knife 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. pjk/__init__.py +5 -0
  2. pjk/base.py +377 -0
  3. pjk/common.py +150 -0
  4. pjk/log.py +67 -0
  5. pjk/main.py +106 -0
  6. pjk/man_page.py +125 -0
  7. pjk/parser.py +284 -0
  8. pjk/pipes/__init__.py +0 -0
  9. pjk/pipes/denorm.py +68 -0
  10. pjk/pipes/factory.py +62 -0
  11. pjk/pipes/filter.py +57 -0
  12. pjk/pipes/head.py +34 -0
  13. pjk/pipes/join.py +85 -0
  14. pjk/pipes/let_reduce.py +198 -0
  15. pjk/pipes/map.py +91 -0
  16. pjk/pipes/move_field.py +36 -0
  17. pjk/pipes/postgres_pipe.py +209 -0
  18. pjk/pipes/remove_field.py +36 -0
  19. pjk/pipes/select.py +42 -0
  20. pjk/pipes/sort.py +63 -0
  21. pjk/pipes/tail.py +39 -0
  22. pjk/pipes/user_pipe_factory.py +45 -0
  23. pjk/pipes/where.py +49 -0
  24. pjk/registry.py +143 -0
  25. pjk/sinks/__init__.py +0 -0
  26. pjk/sinks/csv_sink.py +33 -0
  27. pjk/sinks/ddb.py +54 -0
  28. pjk/sinks/devnull.py +31 -0
  29. pjk/sinks/dir_sink.py +59 -0
  30. pjk/sinks/expect.py +53 -0
  31. pjk/sinks/factory.py +108 -0
  32. pjk/sinks/graph.py +57 -0
  33. pjk/sinks/graph_bar_line.py +229 -0
  34. pjk/sinks/graph_cumulative.py +55 -0
  35. pjk/sinks/graph_hist.py +72 -0
  36. pjk/sinks/graph_scatter.py +29 -0
  37. pjk/sinks/json_sink.py +23 -0
  38. pjk/sinks/s3_sink.py +100 -0
  39. pjk/sinks/sinks.py +68 -0
  40. pjk/sinks/stdout.py +44 -0
  41. pjk/sinks/tsv_sink.py +22 -0
  42. pjk/sinks/user_sink_factory.py +43 -0
  43. pjk/sources/__init__.py +0 -0
  44. pjk/sources/csv_source.py +28 -0
  45. pjk/sources/dir_source.py +69 -0
  46. pjk/sources/factory.py +100 -0
  47. pjk/sources/format_usage.py +11 -0
  48. pjk/sources/inline_source.py +56 -0
  49. pjk/sources/json_source.py +35 -0
  50. pjk/sources/lazy_file.py +16 -0
  51. pjk/sources/lazy_file_local.py +22 -0
  52. pjk/sources/lazy_file_s3.py +28 -0
  53. pjk/sources/parquet_source.py +32 -0
  54. pjk/sources/s3_source.py +146 -0
  55. pjk/sources/source_list.py +23 -0
  56. pjk/sources/sql_source.py +32 -0
  57. pjk/sources/tsv_source.py +15 -0
  58. pjk/sources/user_source_factory.py +33 -0
  59. pjk/version.py +4 -0
  60. python_jack_knife-0.5.0.dist-info/METADATA +254 -0
  61. python_jack_knife-0.5.0.dist-info/RECORD +65 -0
  62. python_jack_knife-0.5.0.dist-info/WHEEL +5 -0
  63. python_jack_knife-0.5.0.dist-info/entry_points.txt +2 -0
  64. python_jack_knife-0.5.0.dist-info/licenses/LICENSE +202 -0
  65. python_jack_knife-0.5.0.dist-info/top_level.txt +1 -0
pjk/man_page.py ADDED
@@ -0,0 +1,125 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ from pjk.pipes.factory import PipeFactory
5
+ from pjk.sources.factory import SourceFactory
6
+ from pjk.sinks.factory import SinkFactory
7
+ from pjk.parser import ExpressionParser
8
+ from pjk.base import Usage
9
+ from pjk.registry import ComponentRegistry
10
+ from pjk.common import pager_stdout, highlight
11
+ from contextlib import nullcontext
12
+
13
def smart_print(expr_tokens: list[str], name: str):
    """Print an expression as a ``pjk`` command line.

    Each token is quoted only when it contains characters outside a small
    safe set, the tokens are joined with single spaces, and the joined text
    is passed to ``highlight(..., 'bold', name)`` before being printed after
    the literal word ``pjk``.
    """
    import re

    # Tokens consisting solely of these characters are printed unquoted.
    safe_pattern = re.compile(r"^[a-zA-Z0-9._/:=+-]+$")

    def quote(token: str) -> str:
        if safe_pattern.fullmatch(token):
            return token
        if "'" not in token:
            return f"'{token}'"
        if '"' not in token:
            return f'"{token}"'
        # Both quote characters occur: wrap in double quotes, escaping inner ones.
        escaped = token.replace('"', '\\"')
        return '"' + escaped + '"'

    quoted_tokens = [quote(t) for t in expr_tokens]
    expr_str = highlight(' '.join(quoted_tokens), 'bold', name)

    print('pjk', expr_str)
32
+
33
def do_man(name: str, registry: ComponentRegistry):
    """Print the man page for ``name``, or every man page for ``--all``.

    When ``name`` contains ``--all`` the work is delegated to ``do_all_man``;
    a trailing ``+`` on ``name`` is forwarded as ``no_pager=True``. Otherwise
    every factory of the registry is asked for a usage of ``name`` and each
    hit is printed. ``unknown: <name>`` is printed when no factory knows it.
    """
    if '--all' in name:
        do_all_man(registry, no_pager=name.endswith('+'))
        return

    # Sources and sinks may share a name, so all factories are consulted and
    # more than one page may be printed.
    found = False
    for factory in registry.get_factories():
        usage = factory.get_usage(name)
        if not usage:
            continue
        print_man(registry, name, usage)
        found = True

    if not found:
        print(f'unknown: {name}')
49
+
50
def do_all_man(registry: ComponentRegistry, no_pager: bool = True):
    """Print the man page of every component known to every factory.

    Args:
        registry: supplies the factories via ``get_factories()``.
        no_pager: when true the output is printed directly; otherwise it is
            produced inside the ``pager_stdout()`` context manager.
    """
    with (nullcontext() if no_pager else pager_stdout()):
        for factory in registry.get_factories():
            # Pages are printed in the iteration order of factory.components.
            for name in factory.components:
                print_man(registry, name, factory.get_usage(name))
                print()
59
+
60
def print_man(registry: ComponentRegistry, name: str, usage: Usage):
    """Print one man page: a banner, the usage text and any examples.

    Args:
        registry: passed through to ``print_example``.
        name: component name shown (and highlighted) in the banner.
        usage: provides the base-class name, the usage text and the examples.
    """
    comp_type = usage.get_base_class(as_string=True)
    header = f'{name} is a {comp_type}'
    rule = '==================================='
    print(rule)
    print(' ', highlight(header, 'bold', name))
    print(rule)

    print()
    print(usage.get_usage_text())

    examples = usage.get_examples()
    if not examples:
        return

    print()
    print('examples:')
    print()

    # Reuse the list fetched above instead of asking the usage a second time.
    for expr_tokens, expect in examples:  # expect in InlineSource format
        print_example(registry, expr_tokens, expect, name)
80
+
81
def do_examples(token: str, registry: ComponentRegistry):
    """Print (and, via ``print_example``, possibly run) every component's examples.

    Args:
        token: the invoking token; a trailing ``+`` disables the pager.
        registry: supplies the factories and is passed to ``print_example``.

    For each component a banner is printed; components without examples get
    a ``<name> needs examples`` note instead.
    """
    no_pager = token.endswith('+')
    rule = '==================================='
    with (nullcontext() if no_pager else pager_stdout()):
        for factory in registry.get_factories():
            for name, comp_class in factory.components.items():
                usage = comp_class.usage()

                comp_type = usage.get_base_class(as_string=True)
                header = f'{name} is a {comp_type}'
                print(rule)
                print(' ', highlight(header, 'bold', name))
                print(rule)

                examples = usage.get_examples()
                if not examples:
                    print(f'{name} needs examples')
                    print()
                    # Nothing to iterate; also avoids looping over None.
                    continue

                for expr_tokens, expect in examples:
                    print_example(registry, expr_tokens, expect, name)
103
+
104
def print_example(registry: ComponentRegistry, expr_tokens: list[str], expect: str, name: str):
    """Print one example expression and, when it has an expectation, run it.

    Args:
        registry: used to build an ``ExpressionParser`` for each run.
        expr_tokens: the example expression without its sink. The list is
            not modified; every run works on a copy.
        expect: expected output; when falsy the example is only printed.
        name: component name forwarded to ``smart_print`` for highlighting.

    Raises:
        ValueError: if parsing or draining the example raises ``ValueError``;
            the original exception is attached as the cause.
    """
    if not expect:  # if no expect, don't run them, just print them
        smart_print(expr_tokens, name)
        print()
        return

    try:
        # First run: end the expression with an expect sink and drain it so
        # the expectation is actually checked.
        check_tokens = [*expr_tokens, f'expect:{expect}']
        ExpressionParser(registry).parse(check_tokens).drain()

        # Shown to the reader with the simple stdout sink '-'.
        smart_print([*expr_tokens, '-'], name)

        # Second run: print the records; no less since man is doing less.
        show_tokens = [*expr_tokens, '-@less=false']
        ExpressionParser(registry).parse(show_tokens).drain()
        print()

    except ValueError as e:
        # Raising a bare string is itself a TypeError; raise a real exception
        # and keep the original failure as its cause.
        raise ValueError('error executing example') from e
pjk/parser.py ADDED
@@ -0,0 +1,284 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ from typing import Any, List, Callable
5
+ import os
6
+ import shlex
7
+ from typing import Optional, Any, List
8
+ from pjk.base import Source, Pipe, Sink, TokenError, UsageError, ParsedToken, Usage
9
+ from pjk.pipes.user_pipe_factory import UserPipeFactory
10
+ from pjk.pipes.let_reduce import ReducePipe
11
+ from pjk.registry import ComponentRegistry
12
+
13
def expand_macros(tokens: List[str]) -> List[str]:
    """Replace every ``*.pjk`` token with the tokens stored in that file.

    A token ending in ``.pjk`` is treated as a macro file path. Each line of
    the file is split with ``shlex.split(..., comments=True, posix=True)``
    and the resulting tokens are spliced in at the macro's position. All
    other tokens are copied through unchanged. Expansion is a single pass:
    tokens read from a macro file are not expanded again.

    Raises:
        FileNotFoundError: if a ``.pjk`` token is not an existing file.
        UsageError: if ``shlex`` cannot parse a line of a macro file.
    """
    result: List[str] = []
    for tok in tokens:
        if not tok.endswith(".pjk"):
            result.append(tok)
            continue

        if not os.path.isfile(tok):
            raise FileNotFoundError(f"Macro file not found: {tok}")
        with open(tok, "r") as fh:
            macro_lines = fh.readlines()

        # shlex drops comments that sit outside quotes, then splits the line.
        macro_tokens: List[str] = []
        for macro_line in macro_lines:
            try:
                macro_tokens += shlex.split(macro_line, comments=True, posix=True)
            except ValueError as e:
                raise UsageError(f"Error parsing {tok}: {e}")
        result.extend(macro_tokens)
    return result
34
+
35
class ExpressionParser:
    """Builds a sink from a token list of the form source, pipes..., sink.

    Operators are pushed onto ``self.stack`` through a ``StackLoader``; the
    final token must create the sink, which consumes the single source left
    on the stack.
    """

    # Syntax reminder appended to most token errors.
    _SYNTAX_HINT = 'pjk <source> [<pipe> ...] <sink>'

    def __init__(self, registry: ComponentRegistry):
        self.registry = registry
        self.stack: List[Any] = []

    def get_sink(self, stack_helper, token):
        """Create the sink for ``token`` and attach the remaining source.

        Raises ``TokenError`` when the stack does not hold exactly one
        source or when ``token`` does not create a sink.
        """
        if not self.stack:
            raise TokenError.from_list(['expression must include source and sink.',
                                        self._SYNTAX_HINT])

        source = self.stack.pop()
        if self.stack:
            raise TokenError.from_list(['A sink can only consume one source.',
                                        self._SYNTAX_HINT])

        # if there's top level aggregation for reduction, it wraps the source
        aggregator = stack_helper.get_reducer_aggregator()
        if aggregator:
            aggregator.add_source(source)
            source = aggregator

        sink = self.registry.create_sink(token)
        if not sink:
            raise TokenError.from_list(['expression must end in a sink.',
                                        self._SYNTAX_HINT])

        sink.add_source(source)
        return sink

    def parse(self, tokens: List[str]) -> Sink:
        """Parse ``tokens`` (after macro expansion) and return the sink.

        Any ``TokenError`` raised on the way is re-raised as ``UsageError``
        carrying the expanded tokens and the position of the failing token.
        """
        self.tokens = expand_macros(tokens)
        usage_error_message = "You've got a problem here."
        stack_helper = StackLoader()
        pos = 0

        try:
            if len(self.tokens) < 2:
                raise TokenError.from_list(['expression must include source and sink.',
                                            self._SYNTAX_HINT])

            last_pos = len(self.tokens) - 1
            for pos, token in enumerate(self.tokens):
                if pos == last_pos:  # token should be THE sink
                    return self.get_sink(stack_helper, token)

                # Try source, then sub-expression marker, then pipe; the
                # first factory that yields something wins.
                operator = (self.registry.create_source(token)
                            or SubExpression.create(token)
                            or self.registry.create_pipe(token))
                if operator:
                    stack_helper.add_operator(operator, self.stack)
                    continue

                # Unrecognized token: it could be a sink in the WRONG
                # position, so check that for a better error message.
                sink = self.registry.create_sink(token, None)
                if sink:
                    raise TokenError.from_list(['sink may only occur in final position.',
                                                self._SYNTAX_HINT])
                raise TokenError.from_list([token, 'unrecognized token'])

        except TokenError as e:
            raise UsageError(usage_error_message, self.tokens, pos, e)
105
+
106
class ReducerAggregatorPipe(Pipe):
    """Pipe that drains its input and emits one record of reducer results.

    The single yielded record maps each reducer's name to its value, both
    taken from ``reducer.get_subexp_result()``.
    """

    def __init__(self, top_level_reducers: List[Any]):
        super().__init__(None)
        self.top_level_reducers = top_level_reducers
        self.reduction = {}
        self.done = False

    def reset(self):
        """Forget the collected results so the next iteration recomputes them."""
        self.reduction.clear()
        self.done = False

    def __iter__(self):
        if not self.done:
            # Consume all input; the values are read from the reducers
            # afterwards, not from the records themselves.
            for _ in self.left:
                pass
            for reducer in self.top_level_reducers:
                key, result = reducer.get_subexp_result()
                self.reduction[key] = result
            self.done = True
        yield self.reduction
126
+
127
class StackLoader:
    """Pushes operators onto a parse stack and wires up their inputs.

    Also remembers every ``ReducePipe`` added outside a sub-expression so a
    ``ReducerAggregatorPipe`` can be built for them later.
    """

    def __init__(self):
        self.top_level_reducers = []

    def get_reducer_aggregator(self) -> ReducerAggregatorPipe:
        """Return an aggregator over the recorded reducers, or None if there are none."""
        if self.top_level_reducers:
            return ReducerAggregatorPipe(top_level_reducers=self.top_level_reducers)
        return None

    def add_operator(self, op, stack):
        """Add ``op`` to ``stack``, connecting it to the operators it consumes.

        Raises ``UsageError`` when a pipe needs more inputs than the stack holds.
        """
        top = stack[-1] if stack else None

        # An open sub-expression on top of the stack captures the operator.
        if isinstance(top, Pipe) and isinstance(top, SubExpression):
            if not isinstance(op, SubExpressionOver):
                # an operator within the subexpression
                top.add_subop(op)
                return
            # 'over' closes the sub-expression and takes its place on the stack.
            subexp_begin = stack.pop()
            subexp_begin.set_over_arg(op.get_over_arg())
            op.add_source(subexp_begin)
            stack.append(op)
            return

        # order matters: the Pipe check must come before the Source check
        if isinstance(op, Pipe):
            needed = op.arity  # class level attribute
            if len(stack) < needed:
                raise UsageError(f"'{op}' requires {needed} input(s)")
            for _ in range(needed):
                op.add_source(stack.pop())
            stack.append(op)

            if isinstance(op, ReducePipe):
                self.top_level_reducers.append(op)
        elif isinstance(op, Source):
            stack.append(op)
169
+
170
+ # special upstream source put in subexp stack for flexibility
171
+ # when we don't know what that upstream source will be.
172
class UpstreamSource(Source):
    """Placeholder source at the bottom of a sub-expression stack.

    It yields from an inner source when one has been set; otherwise it
    yields the items stored in ``data``.
    """

    def __init__(self):
        self.inner_source = None
        self.data = []

    def set_source(self, source: Source):
        """Make iteration delegate to ``source``."""
        self.inner_source = source

    def set_list(self, items):
        """Replace the stored items; a falsy ``items`` becomes an empty list."""
        self.data = items or []

    def add_item(self, rec):
        """Append one record to the stored items."""
        self.data.append(rec)

    def reset(self):
        # nothing needed in generator model
        pass

    def __iter__(self):
        if self.inner_source:
            yield from self.inner_source
        else:
            yield from self.data
196
+
197
class SubExpression(Pipe):
    """Pipe for a ``[ ... over:<arg>`` sub-expression.

    Operators added between ``[`` and ``over`` form an inner stack that is
    re-run for every input record, fed either from one field of the record
    or, when the ``over`` argument ends in ``.py``, from a user pipe.
    """

    @classmethod
    def create(cls, token: str) -> Pipe:
        """Return the sub-expression operator for ``token``, or None.

        ``[`` creates a ``SubExpression`` and ``over`` a ``SubExpressionOver``.
        """
        ptok = ParsedToken(token)
        keyword = ptok.pre_colon
        if keyword == '[':
            return SubExpression(ptok, None)
        if keyword == 'over':
            return SubExpressionOver(ptok, None)
        return None

    def __init__(self, ptok: ParsedToken, usage: Usage):
        super().__init__(ptok)
        self.upstream_source = UpstreamSource()
        self.subexp_stack = [self.upstream_source]
        self.subexp_ops = []
        self.stack_helper = StackLoader()
        self.over_arg = None
        self.over_field = None
        self.over_pipe = None

    def add_subop(self, op):
        """Record ``op`` and push it onto the inner stack."""
        self.subexp_ops.append(op)
        self.stack_helper.add_operator(op, self.subexp_stack)

    def set_over_arg(self, over_arg):
        """Configure where the inner stack gets its input from."""
        self.over_arg = over_arg
        if not over_arg.endswith('.py'):
            self.over_field = over_arg
            return

        # A user pipe feeds the inner stack; results are stored under 'child'.
        self.over_field = 'child'
        self.over_pipe = UserPipeFactory.create(over_arg)
        self.upstream_source.set_source(self.over_pipe)
        self.subexp_ops.append(self.over_pipe)

    def reset(self):
        for op in self.subexp_ops:
            if isinstance(op, Pipe):
                op.reset()

    def __iter__(self):
        for record in self.left:
            if self.over_pipe:
                # Hand the whole record to the user pipe as a one-item source.
                single = UpstreamSource()
                single.add_item(record)
                self.over_pipe.set_sources([single])
            else:
                field_data = record.pop(self.over_field, None)
                if not field_data:
                    # Nothing to run the inner stack over; pass the record on.
                    yield record
                    continue
                items = field_data if isinstance(field_data, list) else [field_data]
                self.upstream_source.set_list(items)

            # Reset sub-pipe stack
            for op in self.subexp_ops:
                op.reset()

            record[self.over_field] = list(self.subexp_stack[-1])

            # Copy results of inner operators that expose one onto the record.
            for op in self.subexp_ops:
                get_subexp = getattr(op, "get_subexp_result", None)
                if not get_subexp:
                    continue
                result_name, result_value = get_subexp()
                if result_name:
                    record[result_name] = result_value

            yield record
270
+
271
class SubExpressionOver(Pipe):
    """Pipe created for the ``over`` token; forwards its input unchanged.

    It keeps the first argument of the token so it can be read back with
    ``get_over_arg()``.
    """

    def __init__(self, ptok: ParsedToken, usage: Usage):
        super().__init__(ptok)
        self.over_arg = ptok.get_arg(0)

    def get_over_arg(self):
        """Return the argument given to ``over``."""
        return self.over_arg

    def reset(self):
        # stateless
        pass

    def __iter__(self):
        yield from self.left
284
+
pjk/pipes/__init__.py ADDED
File without changes
pjk/pipes/denorm.py ADDED
@@ -0,0 +1,68 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ # pjk/pipes/denorm.py
5
+
6
+ from pjk.base import Pipe, ParsedToken, Usage, UsageError
7
+ from typing import Iterator
8
+
9
class Denormer:
    """Iterable that flattens one nested field of a record into records.

    The field is popped from ``record``. A list value yields one output per
    element and a dict value yields one output; each output is the remaining
    record merged with the element (non-dict elements are stored under the
    field name). When the field is missing or falsy, the record itself is
    the only output.
    """

    def __init__(self, record, field):
        self.field = field
        data = record.pop(field, None)

        if data:
            self.base_record = record
            if isinstance(data, list):
                self.subrec_list = data
            elif isinstance(data, dict):
                self.subrec_list = [data]
            else:
                raise UsageError("can only denorm sub-records")
        else:
            # Nothing to flatten: emit the (already popped) record as is.
            self.subrec_list = [record]
            self.base_record = {}

    def __iter__(self) -> Iterator[dict]:
        for item in self.subrec_list:
            fields = item if isinstance(item, dict) else {self.field: item}
            merged = self.base_record.copy()
            merged.update(fields)  # sub-record values win on key clashes
            yield merged
35
+
36
+
37
class DenormPipe(Pipe):
    """The ``explode`` pipe: flattens a nested field into separate records.

    Every input record is handed to a ``Denormer`` together with the
    configured field, and all records it produces are emitted.
    """

    @classmethod
    def usage(cls):
        usage = Usage(
            name='explode',
            desc='Explode a nested list/dict field into separate flattened records',
            component_class=cls
        )
        usage.def_arg(name='field', usage='Field to explode')
        usage.def_example(expr_tokens=["{ferry:'orca', cars:[{make: 'ford', size:9}, {make:'bmw', size:4}]}",
                                       'explode:cars'
                                       ],
                          expect="[{ferry:'orca', make: 'ford', size:9}, {ferry:'orca', make:'bmw', size:4}]")
        return usage

    def __init__(self, ptok: ParsedToken, usage: Usage):
        super().__init__(ptok)

        self.field = usage.get_arg('field')
        if not self.field:
            # Name the component the way the user typed it ('explode').
            raise UsageError("explode must include a field name")

        # Not read anywhere in this class; kept so the attribute still exists.
        self._pending_iter = None

    def reset(self):
        self._pending_iter = None

    def __iter__(self):
        for record in self.left:
            yield from Denormer(record, self.field)
pjk/pipes/factory.py ADDED
@@ -0,0 +1,62 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ # pjk/pipes/factory.py
5
+ from pjk.base import Usage, Pipe, ParsedToken
6
+ from pjk.common import ComponentFactory
7
+ from pjk.pipes.move_field import MoveField
8
+ from pjk.pipes.remove_field import RemoveField
9
+ from pjk.pipes.let_reduce import LetPipe
10
+ from pjk.pipes.let_reduce import ReducePipe
11
+ from pjk.pipes.head import HeadPipe
12
+ from pjk.pipes.tail import TailPipe
13
+ from pjk.pipes.sort import SortPipe
14
+ from pjk.pipes.where import WherePipe
15
+ from pjk.pipes.map import MapPipe
16
+ from pjk.pipes.join import JoinPipe
17
+ from pjk.pipes.filter import FilterPipe
18
+ from pjk.pipes.select import SelectFields
19
+ from pjk.pipes.denorm import DenormPipe
20
+ from pjk.pipes.postgres_pipe import PostgresPipe
21
+ from pjk.pipes.user_pipe_factory import UserPipeFactory
22
+
23
# Built-in pipes, keyed by the name used in front of the colon in a token.
# The entry order is the dict's iteration order, so it is left unchanged.
COMPONENTS = {
    'head': HeadPipe,
    'tail': TailPipe,
    'join': JoinPipe,
    'filter': FilterPipe,
    'map': MapPipe,
    'as': MoveField,
    'drop': RemoveField,
    'let': LetPipe,
    'reduce': ReducePipe,
    'sort': SortPipe,
    'where': WherePipe,
    'sel': SelectFields,
    'explode': DenormPipe,
    'pgres': PostgresPipe,
}
39
+
40
class PipeFactory(ComponentFactory):
    """Factory that creates pipes from tokens using the ``COMPONENTS`` table."""

    def __init__(self):
        super().__init__(COMPONENTS, 'pipe')

    def create(self, token: str) -> Pipe:
        """Create the pipe named by ``token``; return None if it is not a pipe.

        A token whose name ends in ``.py`` is first offered to
        ``UserPipeFactory``; if that yields nothing the built-in table is
        consulted as usual.
        """
        ptok = ParsedToken(token)

        if ptok.pre_colon.endswith('.py'):
            user_pipe = UserPipeFactory.create(ptok)
            if user_pipe:
                return user_pipe
            # else keep looking among the built-in components

        pipe_cls = self.components.get(ptok.pre_colon)
        if not pipe_cls:
            return None

        # Bind the token's arguments to the component's usage definition
        # before constructing the pipe.
        usage = pipe_cls.usage()
        usage.bind(ptok)
        return pipe_cls(ptok, usage)
62
+
pjk/pipes/filter.py ADDED
@@ -0,0 +1,57 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ from pjk.base import Pipe, Usage, UsageError, ParsedToken, KeyedSource
5
+
6
class FilterPipe(Pipe):
    """Two-input pipe that keeps or drops left records by a keyed lookup.

    In mode ``+`` a left record is emitted when ``right.lookup(record)``
    returns something other than None; in mode ``-`` it is emitted when the
    lookup returns None.
    """

    arity = 2  # left = record stream, right = keyed source

    @classmethod
    def usage(cls):
        usage = Usage(
            name="filter",
            desc="Filters left records based on presence in right keyed source",
            component_class=cls
        )
        usage.def_arg("mode", "'+' to include matches, '-' to exclude matches",
                      valid_values={'+', '-'})
        usage.def_syntax("pjk <left_source> <map_source> map:<how>:<key> filter:<mode> <sink>")

        # Both examples share their inputs and differ only in the mode.
        left_records = "[{id:1}, {id:2}, {id:3}, {id:4}, {id:5}]"
        right_records = "[{id:1}, {id:3}, {id:5}]"
        for mode, expected in (("+", "[{id:1}, {id:3}, {id:5}]"),
                               ("-", "[{id:2}, {id:4}]")):
            usage.def_example(
                expr_tokens=[left_records, right_records, 'map:o:id', f"filter:{mode}"],
                expect=expected)
        return usage

    def __init__(self, ptok: ParsedToken, usage: Usage):
        super().__init__(ptok)
        self.mode = usage.get_arg('mode')
        self.left = None
        self.right = None

    def reset(self):
        # stateless
        pass

    def __iter__(self):
        if not isinstance(self.right, KeyedSource):
            raise UsageError("Right input to filter must be a KeyedSource")

        for record in self.left:
            found = self.right.lookup(record) is not None
            if self.mode == "+":
                keep = found
            elif self.mode == "-":
                keep = not found
            else:
                keep = False  # any other mode emits nothing
            if keep:
                yield record
pjk/pipes/head.py ADDED
@@ -0,0 +1,34 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ # pjk/pipes/head.py
5
+
6
+ from typing import Optional
7
+ from pjk.base import Pipe, ParsedToken, Usage
8
+
9
class HeadPipe(Pipe):
    """The ``head`` pipe: emits at most ``limit`` records of its input.

    The number of records emitted so far is kept in ``self.count`` and is
    only cleared by ``reset()``.
    """

    @classmethod
    def usage(cls):
        usage = Usage(
            name='head',
            desc='take first records of input (when single-threaded)',
            component_class=cls
        )
        usage.def_arg(name='limit', usage='number of records', is_num=True)
        usage.def_example(expr_tokens=['[{id:1}, {id:2}]', 'head:1'], expect="{id:1}")
        return usage

    def __init__(self, ptok: ParsedToken, usage: Usage):
        super().__init__(ptok)
        self.limit = usage.get_arg('limit')
        self.count = 0

    def __iter__(self):
        # Check the limit before touching the input so that no record beyond
        # the limit is pulled from upstream (and none at all for a limit
        # that is already reached).
        if self.count >= self.limit:
            return
        for record in self.left:
            self.count += 1
            yield record
            if self.count >= self.limit:
                return

    def reset(self):
        self.count = 0