python-jack-knife 0.6.16__tar.gz → 0.6.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/PKG-INFO +1 -1
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/components.py +9 -5
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/parser.py +3 -2
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/pipes/denorm.py +2 -2
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/pipes/filter.py +2 -2
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/pipes/join.py +2 -2
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/pipes/let_reduce.py +5 -5
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/pipes/map.py +2 -2
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/pipes/query_pipe.py +1 -1
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/pipes/remove_field.py +1 -1
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/pipes/select.py +1 -1
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/pipes/sort.py +1 -1
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/pipes/tail.py +1 -1
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/pipes/where.py +5 -3
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/progress.py +61 -46
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sinks/graph.py +2 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sinks/graph_bar_line.py +14 -14
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sinks/s3_sink.py +1 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sources/csv_source.py +1 -0
- python_jack_knife-0.6.17/src/pjk/sources/dir_source.py +181 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sources/format_source.py +3 -1
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sources/inline_source.py +1 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sources/json_source.py +1 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sources/s3_source.py +1 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sources/sql_source.py +1 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/version.py +1 -1
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/python_jack_knife.egg-info/PKG-INFO +1 -1
- python_jack_knife-0.6.16/src/pjk/sources/dir_source.py +0 -82
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/LICENSE +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/README.md +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/pyproject.toml +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/setup.cfg +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/__init__.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/common.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/integrations/opensearch_client.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/integrations/opensearch_index_sink.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/integrations/opensearch_query_pipe.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/integrations/postgres_pipe.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/integrations/snowflake_pipe.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/log.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/main.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/man_page.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/pipes/__init__.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/pipes/factory.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/pipes/head.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/pipes/move_field.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/pipes/progress_pipe.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/pipes/sample.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/pipes/user_pipe_factory.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/registry.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sinks/__init__.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sinks/create_sink.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sinks/csv_sink.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sinks/devnull.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sinks/dir_sink.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sinks/expect.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sinks/factory.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sinks/format_sink.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sinks/graph_cumulative.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sinks/graph_hist.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sinks/graph_scatter.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sinks/json_sink.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sinks/s3_stream.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sinks/sinks.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sinks/stdout.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sinks/tsv_sink.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sinks/user_sink_factory.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sources/__init__.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sources/configs_source.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sources/factory.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sources/favorite_source.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sources/lazy_file.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sources/lazy_file_local.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sources/lazy_file_s3.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sources/macro_source.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sources/npy_source.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sources/parquet_source.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sources/source_list.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sources/tsv_source.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sources/user_source_factory.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/usage.py +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/python_jack_knife.egg-info/SOURCES.txt +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/python_jack_knife.egg-info/dependency_links.txt +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/python_jack_knife.egg-info/entry_points.txt +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/python_jack_knife.egg-info/requires.txt +0 -0
- {python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/python_jack_knife.egg-info/top_level.txt +0 -0
|
@@ -35,9 +35,12 @@ class Source(ABC):
|
|
|
35
35
|
component_class=cls
|
|
36
36
|
)
|
|
37
37
|
|
|
38
|
+
def __init__(self, root = None):
|
|
39
|
+
self.root = root
|
|
40
|
+
|
|
38
41
|
@abstractmethod
|
|
39
42
|
def __iter__(self):
|
|
40
|
-
|
|
43
|
+
pass
|
|
41
44
|
|
|
42
45
|
def __next__(self):
|
|
43
46
|
# lazily create an internal iterator the first time next() is called
|
|
@@ -57,7 +60,8 @@ class Source(ABC):
|
|
|
57
60
|
class Pipe(Source):
|
|
58
61
|
arity: int = 1
|
|
59
62
|
|
|
60
|
-
def __init__(self, ptok: ParsedToken, usage: Usage = None):
|
|
63
|
+
def __init__(self, ptok: ParsedToken, usage: Usage, root = None):
|
|
64
|
+
self.root = root
|
|
61
65
|
self.ptok = ptok
|
|
62
66
|
self.usage = usage
|
|
63
67
|
self.left = None # left source for convience
|
|
@@ -95,7 +99,7 @@ class DeepCopyPipe(Pipe):
|
|
|
95
99
|
return None
|
|
96
100
|
|
|
97
101
|
# re-instantiate using the actual subclass
|
|
98
|
-
pipe = type(self)(self.ptok, self.usage)
|
|
102
|
+
pipe = type(self)(self.ptok, self.usage, self) # this self is the root
|
|
99
103
|
pipe.add_source(source_clone)
|
|
100
104
|
return pipe
|
|
101
105
|
|
|
@@ -108,7 +112,8 @@ class Sink(ABC):
|
|
|
108
112
|
component_class=cls
|
|
109
113
|
)
|
|
110
114
|
|
|
111
|
-
def __init__(self, ptok: ParsedToken, usage: Usage = None):
|
|
115
|
+
def __init__(self, ptok: ParsedToken, usage: Usage, root = None):
|
|
116
|
+
self.root = root
|
|
112
117
|
self.ptok = ptok
|
|
113
118
|
self.usage = usage
|
|
114
119
|
|
|
@@ -135,4 +140,3 @@ class Sink(ABC):
|
|
|
135
140
|
|
|
136
141
|
def deep_copy(self):
|
|
137
142
|
return None
|
|
138
|
-
|
|
@@ -187,7 +187,7 @@ class ExpressionParser:
|
|
|
187
187
|
|
|
188
188
|
class ReducerAggregatorPipe(Pipe):
|
|
189
189
|
def __init__(self, top_level_reducers: List[Any]):
|
|
190
|
-
super().__init__(None)
|
|
190
|
+
super().__init__(None, None)
|
|
191
191
|
self.top_level_reducers = top_level_reducers
|
|
192
192
|
self.reduction = {}
|
|
193
193
|
self.done = False
|
|
@@ -275,6 +275,7 @@ class UpstreamSource(Source):
|
|
|
275
275
|
return u
|
|
276
276
|
|
|
277
277
|
def __init__(self):
|
|
278
|
+
super().__init__(root=None)
|
|
278
279
|
self.data = []
|
|
279
280
|
self.inner_source = None
|
|
280
281
|
self.sub_recs_in = papi.get_counter(self, var_label='sub_recs_in')
|
|
@@ -338,7 +339,7 @@ class SubExpression(Pipe, ProgressIgnore):
|
|
|
338
339
|
return None
|
|
339
340
|
|
|
340
341
|
def __init__(self, ptok: ParsedToken, usage: Usage):
|
|
341
|
-
super().__init__(ptok)
|
|
342
|
+
super().__init__(ptok, usage)
|
|
342
343
|
self.subexp_ops = []
|
|
343
344
|
self.stack_helper = StackLoader()
|
|
344
345
|
self.subexp_stack = OperandStack()
|
|
@@ -52,10 +52,10 @@ class DenormPipe(Pipe):
|
|
|
52
52
|
return usage
|
|
53
53
|
|
|
54
54
|
def __init__(self, ptok: ParsedToken, usage: Usage):
|
|
55
|
-
super().__init__(ptok)
|
|
55
|
+
super().__init__(ptok, usage)
|
|
56
56
|
|
|
57
57
|
self.field = usage.get_arg('field')
|
|
58
|
-
self.recs_in = papi.get_counter(self,
|
|
58
|
+
self.recs_in = papi.get_counter(self, 'recs_in', display=False)
|
|
59
59
|
self.recs_out = papi.get_percentage_counter(self, 'recs_out', self.recs_in)
|
|
60
60
|
|
|
61
61
|
self._pending_iter = None
|
|
@@ -39,11 +39,11 @@ class FilterPipe(Pipe):
|
|
|
39
39
|
return usage
|
|
40
40
|
|
|
41
41
|
def __init__(self, ptok: ParsedToken, usage: Usage):
|
|
42
|
-
super().__init__(ptok)
|
|
42
|
+
super().__init__(ptok, usage)
|
|
43
43
|
self.mode = usage.get_arg('mode')
|
|
44
44
|
self.left = None
|
|
45
45
|
self.right = None
|
|
46
|
-
self.recs_in = papi.get_counter(self,
|
|
46
|
+
self.recs_in = papi.get_counter(self, 'recs_in', display=False)
|
|
47
47
|
self.recs_out = papi.get_percentage_counter(self, 'recs_out', self.recs_in)
|
|
48
48
|
|
|
49
49
|
def reset(self):
|
|
@@ -60,7 +60,7 @@ class JoinPipe(Pipe):
|
|
|
60
60
|
return usage
|
|
61
61
|
|
|
62
62
|
def __init__(self, ptok: ParsedToken, usage: Usage):
|
|
63
|
-
super().__init__(ptok)
|
|
63
|
+
super().__init__(ptok, usage)
|
|
64
64
|
|
|
65
65
|
self.mode = usage.get_arg('mode')
|
|
66
66
|
self.left = None
|
|
@@ -68,7 +68,7 @@ class JoinPipe(Pipe):
|
|
|
68
68
|
self._pending_right = None
|
|
69
69
|
self._check_right = False
|
|
70
70
|
|
|
71
|
-
self.recs_in = papi.get_counter(self,
|
|
71
|
+
self.recs_in = papi.get_counter(self, 'recs_in', display=False)
|
|
72
72
|
self.matches = papi.get_percentage_counter(self, 'matches', self.recs_in)
|
|
73
73
|
self.recs_out = papi.get_counter(self, 'recs_out')
|
|
74
74
|
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
|
|
4
4
|
# djk/pipes/let_reduce.py
|
|
5
5
|
|
|
6
|
-
from pjk.components import
|
|
6
|
+
from pjk.components import DeepCopyPipe
|
|
7
7
|
from pjk.usage import ParsedToken, Usage, UsageError, TokenError, NoBindUsage
|
|
8
8
|
from pjk.common import SafeNamespace, ReducingNamespace
|
|
9
9
|
import re
|
|
@@ -78,7 +78,7 @@ def eval_accumulating(expr: str, record: dict, op: str, acc=None):
|
|
|
78
78
|
return do_eval(expr, env)
|
|
79
79
|
|
|
80
80
|
# --- LetPipe (simple field assignment) ---
|
|
81
|
-
class LetPipe(
|
|
81
|
+
class LetPipe(DeepCopyPipe):
|
|
82
82
|
@classmethod
|
|
83
83
|
def usage(cls):
|
|
84
84
|
usage = NoBindUsage( # can't use bound usage because of complicated parsing
|
|
@@ -93,7 +93,7 @@ class LetPipe(Pipe):
|
|
|
93
93
|
return usage
|
|
94
94
|
|
|
95
95
|
def __init__(self, ptok: ParsedToken, usage: Usage):
|
|
96
|
-
super().__init__(ptok)
|
|
96
|
+
super().__init__(ptok, usage)
|
|
97
97
|
args = parse_args(ptok.whole_token.split(':', 1)[-1])
|
|
98
98
|
self.field = args['field']
|
|
99
99
|
self.op = args['op']
|
|
@@ -121,7 +121,7 @@ def is_comprehension(expr: str) -> bool:
|
|
|
121
121
|
except SyntaxError:
|
|
122
122
|
return False
|
|
123
123
|
|
|
124
|
-
class ReducePipe(
|
|
124
|
+
class ReducePipe(DeepCopyPipe):
|
|
125
125
|
@classmethod
|
|
126
126
|
def usage(cls):
|
|
127
127
|
usage = NoBindUsage( # can't use bound usage because of complicated parsing
|
|
@@ -161,7 +161,7 @@ class ReducePipe(Pipe):
|
|
|
161
161
|
return usage
|
|
162
162
|
|
|
163
163
|
def __init__(self, ptok: ParsedToken, usage: Usage):
|
|
164
|
-
super().__init__(ptok)
|
|
164
|
+
super().__init__(ptok, usage)
|
|
165
165
|
args = parse_args(ptok.whole_token.split(':', 1)[-1])
|
|
166
166
|
self.field = args['field']
|
|
167
167
|
self.op = args['op']
|
|
@@ -28,7 +28,7 @@ class MapByPipe(Pipe, KeyedSource):
|
|
|
28
28
|
return u
|
|
29
29
|
|
|
30
30
|
def __init__(self, ptok: ParsedToken, usage: Usage):
|
|
31
|
-
super().__init__(ptok)
|
|
31
|
+
super().__init__(ptok, usage)
|
|
32
32
|
self.is_group = False
|
|
33
33
|
self.fields = usage.get_arg('key').split(',')
|
|
34
34
|
self.rec_map = {}
|
|
@@ -37,7 +37,7 @@ class MapByPipe(Pipe, KeyedSource):
|
|
|
37
37
|
self.do_count = usage.get_param(name='count').lower() == 'true'
|
|
38
38
|
self.counts = {}
|
|
39
39
|
self.missing_keys = papi.get_counter(self, 'missing_keys')
|
|
40
|
-
self.recs_in = papi.get_counter(self,
|
|
40
|
+
self.recs_in = papi.get_counter(self, 'recs_in', display=False)
|
|
41
41
|
# recs_out = distinct_keys
|
|
42
42
|
self.distinct_keys = papi.get_percentage_counter(self, 'recs_out', self.recs_in)
|
|
43
43
|
|
|
@@ -40,7 +40,7 @@ class QueryPipe(Pipe):
|
|
|
40
40
|
self.output_shape = usage.get_param('shape')
|
|
41
41
|
self.count = usage.get_param('count')
|
|
42
42
|
self.query_field = 'query' # for all subclasses
|
|
43
|
-
self.inrecs = papi.get_counter(self, var_label=
|
|
43
|
+
self.inrecs = papi.get_counter(self, var_label='recs_in')
|
|
44
44
|
self.outrecs = papi.get_percentage_counter(self, var_label='recs_out', denom_counter=self.inrecs)
|
|
45
45
|
|
|
46
46
|
@abstractmethod
|
|
@@ -19,7 +19,7 @@ class RemoveField(DeepCopyPipe):
|
|
|
19
19
|
return usage
|
|
20
20
|
|
|
21
21
|
def __init__(self, ptok: ParsedToken, usage: Usage):
|
|
22
|
-
super().__init__(ptok)
|
|
22
|
+
super().__init__(ptok, usage)
|
|
23
23
|
arg_string = usage.get_arg('fields')
|
|
24
24
|
self.fields = [f.strip() for f in arg_string.split(',') if f.strip()]
|
|
25
25
|
if not self.fields:
|
|
@@ -22,10 +22,11 @@ class WherePipe(DeepCopyPipe):
|
|
|
22
22
|
u.def_example(expr_tokens=["[{color:'blue'}, {color:'red'}, {color:'black'}]", "where:f.color.startswith('bl')"], expect="[{color:'blue'}, {color:'black'}]")
|
|
23
23
|
return u
|
|
24
24
|
|
|
25
|
-
def __init__(self, ptok: ParsedToken, usage: Usage):
|
|
26
|
-
super().__init__(ptok, usage)
|
|
25
|
+
def __init__(self, ptok: ParsedToken, usage: Usage, root = None):
|
|
26
|
+
super().__init__(ptok, usage, root)
|
|
27
27
|
self.expr = ptok.whole_token.split(':', 1)[1]
|
|
28
|
-
|
|
28
|
+
|
|
29
|
+
self.inrecs = papi.get_counter(self, var_label='recs_in', display=False)
|
|
29
30
|
self.outrecs = papi.get_percentage_counter(self, var_label='recs_out', denom_counter=self.inrecs)
|
|
30
31
|
try:
|
|
31
32
|
self.code = compile(self.expr, '<where>', 'eval')
|
|
@@ -46,3 +47,4 @@ class WherePipe(DeepCopyPipe):
|
|
|
46
47
|
except Exception:
|
|
47
48
|
continue # ignore eval errors
|
|
48
49
|
|
|
50
|
+
|
|
@@ -13,18 +13,26 @@ class ProgressIgnore:
|
|
|
13
13
|
|
|
14
14
|
class Report:
|
|
15
15
|
def __init__(self):
|
|
16
|
-
self.
|
|
16
|
+
self._values: dict[str, Any] = {}
|
|
17
17
|
self.parse_level = -1
|
|
18
|
+
self.invisibles = set()
|
|
18
19
|
|
|
19
|
-
def
|
|
20
|
-
|
|
20
|
+
def set_or_get_value(self, name, value):
|
|
21
|
+
# store once; subsequent calls return the existing object
|
|
22
|
+
return self._values.setdefault(name, value)
|
|
23
|
+
|
|
24
|
+
def get_value(self, name):
|
|
25
|
+
return self._values.get(name)
|
|
21
26
|
|
|
22
27
|
def get_name_value_tuples(self):
|
|
23
|
-
return self.
|
|
28
|
+
return self._values.items()
|
|
24
29
|
|
|
25
30
|
def set_parse_level(self, level: int):
|
|
26
31
|
self.parse_level = level
|
|
27
32
|
|
|
33
|
+
def make_invisible(self, var_label:str):
|
|
34
|
+
self.invisibles.add(var_label)
|
|
35
|
+
|
|
28
36
|
def get_parse_level(self):
|
|
29
37
|
return self.parse_level
|
|
30
38
|
|
|
@@ -58,29 +66,7 @@ class ProgressDisplay:
|
|
|
58
66
|
while not self._stop_event.is_set():
|
|
59
67
|
snap = self.api.snapshot()
|
|
60
68
|
lines = self._render_lines(snap)
|
|
61
|
-
|
|
62
|
-
# Move up to overwrite previous block
|
|
63
|
-
if self._last_lines:
|
|
64
|
-
self.stream.write(f"{CSI}{self._last_lines}F") # move cursor up N lines, to column 1
|
|
65
|
-
|
|
66
|
-
# Write fresh lines
|
|
67
|
-
for line in lines:
|
|
68
|
-
self.stream.write(line + "\n")
|
|
69
|
-
|
|
70
|
-
# Erase extra old lines if the block got shorter
|
|
71
|
-
if self._last_lines > len(lines):
|
|
72
|
-
diff = self._last_lines - len(lines)
|
|
73
|
-
for _ in range(diff):
|
|
74
|
-
self.stream.write(" " * 120 + "\n")
|
|
75
|
-
# move cursor up to top of block again
|
|
76
|
-
self.stream.write(f"{CSI}{self._last_lines}F")
|
|
77
|
-
|
|
78
|
-
try:
|
|
79
|
-
self.stream.flush()
|
|
80
|
-
except Exception:
|
|
81
|
-
pass
|
|
82
|
-
|
|
83
|
-
self._last_lines = len(lines)
|
|
69
|
+
self._write_lines(lines)
|
|
84
70
|
|
|
85
71
|
if self._stop_event.wait(self.interval):
|
|
86
72
|
break
|
|
@@ -88,18 +74,38 @@ class ProgressDisplay:
|
|
|
88
74
|
# --- FINAL REFRESH ON SHUTDOWN ---
|
|
89
75
|
reports = self.api.snapshot()
|
|
90
76
|
lines = self._render_lines(reports)
|
|
77
|
+
self._write_lines(lines, final=True)
|
|
78
|
+
|
|
79
|
+
def _write_lines(self, lines, final: bool = False):
|
|
80
|
+
"""
|
|
81
|
+
Render output either by rewriting the previous block (TTY) or by
|
|
82
|
+
printing a fresh snapshot (non-TTY fallback).
|
|
83
|
+
"""
|
|
84
|
+
prev_lines = self._last_lines
|
|
91
85
|
|
|
92
|
-
if self.
|
|
93
|
-
|
|
86
|
+
if self._use_ansi:
|
|
87
|
+
if prev_lines:
|
|
88
|
+
# Move cursor up to the beginning of the old block
|
|
89
|
+
self.stream.write(f"{CSI}{prev_lines}F")
|
|
94
90
|
|
|
95
|
-
|
|
96
|
-
|
|
91
|
+
for line in lines:
|
|
92
|
+
self.stream.write(line + "\n")
|
|
97
93
|
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
94
|
+
if prev_lines > len(lines):
|
|
95
|
+
diff = prev_lines - len(lines)
|
|
96
|
+
blank = " " * 120
|
|
97
|
+
for _ in range(diff):
|
|
98
|
+
self.stream.write(blank + "\n")
|
|
99
|
+
# move cursor back to sit just below the freshly written block
|
|
100
|
+
self.stream.write(f"{CSI}{diff}F")
|
|
101
|
+
else:
|
|
102
|
+
# Best-effort fallback when we cannot reposition the cursor.
|
|
103
|
+
if prev_lines and not final:
|
|
104
|
+
self.stream.write("\n")
|
|
105
|
+
for line in lines:
|
|
106
|
+
self.stream.write(line + "\n")
|
|
107
|
+
if prev_lines and not final:
|
|
108
|
+
self.stream.write("-" * 40 + "\n")
|
|
103
109
|
|
|
104
110
|
try:
|
|
105
111
|
self.stream.flush()
|
|
@@ -125,6 +131,9 @@ class ProgressDisplay:
|
|
|
125
131
|
label = f'{indent}{key}'
|
|
126
132
|
parts = [f"{label:<{KEY_W}.{KEY_W}}"] # left col, truncated if too long
|
|
127
133
|
for name, val in report.get_name_value_tuples():
|
|
134
|
+
if name in report.invisibles:
|
|
135
|
+
continue
|
|
136
|
+
|
|
128
137
|
token = f"{name}={val}" # __str__ handles formatting
|
|
129
138
|
parts.append(f"{token:<{COL_W}}") # left-justify, hard truncate at COL_W
|
|
130
139
|
return highlight(" ".join(parts), 'bold', key)
|
|
@@ -210,8 +219,8 @@ class ProgressAPI:
|
|
|
210
219
|
self._parse_depth: Dict[int, int] = {} # component id -> level
|
|
211
220
|
self.level = 0
|
|
212
221
|
|
|
213
|
-
def get_counter(self, component: Source | Sink, var_label: str) -> SafeCounter:
|
|
214
|
-
return self._update_storage(component, var_label=var_label, value=SafeCounter())
|
|
222
|
+
def get_counter(self, component: Source | Sink, var_label: str, display: bool = True) -> SafeCounter:
|
|
223
|
+
return self._update_storage(component, var_label=var_label, value=SafeCounter(), display=display)
|
|
215
224
|
|
|
216
225
|
# returns the numerator counter
|
|
217
226
|
def get_percentage_counter(self, component: Source | Sink, var_label: str, denom_counter: SafeCounter):
|
|
@@ -229,7 +238,7 @@ class ProgressAPI:
|
|
|
229
238
|
report.set_parse_level(level)
|
|
230
239
|
return self._reports
|
|
231
240
|
|
|
232
|
-
# could happen before or after update storage
|
|
241
|
+
# could happen before or after update storage (done in operand stack to get levels right)
|
|
233
242
|
def register_component(self, component: Source | Sink, stack_level: int):
|
|
234
243
|
if isinstance(component, ProgressIgnore):
|
|
235
244
|
return # um, ignore
|
|
@@ -238,20 +247,26 @@ class ProgressAPI:
|
|
|
238
247
|
self._parse_depth[comp_id] = stack_level
|
|
239
248
|
self._update_storage(component, var_label=None, value=None) # just register, no values
|
|
240
249
|
|
|
241
|
-
def _update_storage(self, component: Source | Sink, var_label: str, value: Any):
|
|
250
|
+
def _update_storage(self, component: Source | Sink, var_label: str, value: Any, display:bool = True):
|
|
242
251
|
# we can have multiple instances of a component type in an expression so we need to
|
|
243
252
|
# differentiate by id when we put them in the _store.
|
|
244
253
|
component_label = self._get_component_label(component)
|
|
245
|
-
|
|
254
|
+
|
|
255
|
+
# create a unique id for the variable that is common across clones
|
|
256
|
+
comp_id = id(component) if component.root is None else id(component.root)
|
|
257
|
+
|
|
258
|
+
store_key = (component_label, comp_id)
|
|
246
259
|
report = self._reports.setdefault(store_key, Report())
|
|
247
|
-
if
|
|
260
|
+
if value is None: # when just registering component
|
|
248
261
|
return None
|
|
249
262
|
|
|
250
|
-
if var_label:
|
|
251
|
-
|
|
252
|
-
report.add_value(var_label, value)
|
|
263
|
+
if not var_label:
|
|
264
|
+
raise Exception('unique var_label is required')
|
|
253
265
|
|
|
254
|
-
|
|
266
|
+
if not display:
|
|
267
|
+
report.make_invisible(var_label)
|
|
268
|
+
|
|
269
|
+
return report.set_or_get_value(var_label, value)
|
|
255
270
|
|
|
256
271
|
# some hacking to get at reasonable labels
|
|
257
272
|
def _get_component_label(self, component: Source | Sink):
|
|
@@ -21,6 +21,7 @@ class GraphSink(Sink):
|
|
|
21
21
|
usage.def_param(name='x', usage='x-axis field', default='x')
|
|
22
22
|
usage.def_param(name='y', usage='comma separated list of y-axis fields', default='y')
|
|
23
23
|
usage.def_param(name='pause', usage='Seconds to show graph', is_num=True, default='-1')
|
|
24
|
+
usage.def_param(name='title', usage='A title for the graph', is_num=False)
|
|
24
25
|
return usage
|
|
25
26
|
|
|
26
27
|
def __init__(self, ptok: ParsedToken, usage: Usage):
|
|
@@ -30,6 +31,7 @@ class GraphSink(Sink):
|
|
|
30
31
|
self.x_field = usage.get_param('x')
|
|
31
32
|
self.y_field = usage.get_param('y')
|
|
32
33
|
self.pause = usage.get_param('pause')
|
|
34
|
+
self.title = usage.get_param('title')
|
|
33
35
|
|
|
34
36
|
def process(self):
|
|
35
37
|
import matplotlib.pyplot as plt # lazy import
|
|
@@ -137,8 +137,8 @@ class SingleYWithSetsAdapter:
|
|
|
137
137
|
# ----------------------------- Plotter -----------------------------
|
|
138
138
|
class GraphPlotter:
|
|
139
139
|
def __init__(self, params: GraphParams):
|
|
140
|
-
self.
|
|
141
|
-
self.y_fields = list(dict.fromkeys(self.
|
|
140
|
+
self.pms = params
|
|
141
|
+
self.y_fields = list(dict.fromkeys(self.pms.y_fields)) # dedupe, preserve order
|
|
142
142
|
|
|
143
143
|
def plot(self, chart_type: str = "line"):
|
|
144
144
|
import matplotlib.pyplot as plt
|
|
@@ -149,8 +149,8 @@ class GraphPlotter:
|
|
|
149
149
|
|
|
150
150
|
# Multi-Y path (preferred)
|
|
151
151
|
if len(self.y_fields) > 1:
|
|
152
|
-
df = MultiYAdapter.to_df(self.
|
|
153
|
-
is_time = self.
|
|
152
|
+
df = MultiYAdapter.to_df(self.pms.records, self.pms.x_field, self.y_fields)
|
|
153
|
+
is_time = self.pms.x_is_time if isinstance(self.pms.x_is_time, bool) else TimeDetector.is_time(df["x"])
|
|
154
154
|
if is_time:
|
|
155
155
|
df["ts"] = TimeDetector.parse_times(df["x"])
|
|
156
156
|
df = df.dropna(subset=["ts"]).sort_values("ts")
|
|
@@ -165,12 +165,12 @@ class GraphPlotter:
|
|
|
165
165
|
else:
|
|
166
166
|
self._lines_categorical(ax, df, self.y_fields)
|
|
167
167
|
self._format_categorical_axis(ax, df)
|
|
168
|
-
title = self.
|
|
168
|
+
title = self.pms.title or ("Line over time" if is_time and chart_type=="line" else
|
|
169
169
|
"Bar over time" if is_time else
|
|
170
170
|
"Line by category" if chart_type=="line" else
|
|
171
171
|
"Bar by category")
|
|
172
172
|
ax.set_title(title)
|
|
173
|
-
ax.set_xlabel(self.
|
|
173
|
+
ax.set_xlabel(self.pms.x_field)
|
|
174
174
|
ax.set_ylabel(", ".join(self.y_fields))
|
|
175
175
|
ax.legend(title="Series")
|
|
176
176
|
self._apply_args_dict()
|
|
@@ -180,13 +180,13 @@ class GraphPlotter:
|
|
|
180
180
|
|
|
181
181
|
# Single-Y legacy path (maybe with set_name)
|
|
182
182
|
y = self.y_fields[0]
|
|
183
|
-
sdf = SingleYWithSetsAdapter.to_df(self.
|
|
183
|
+
sdf = SingleYWithSetsAdapter.to_df(self.pms.records, self.pms.x_field, y)
|
|
184
184
|
if sdf.empty:
|
|
185
|
-
print(f"No valid '{self.
|
|
185
|
+
print(f"No valid '{self.pms.x_field}' and '{y}' records found.")
|
|
186
186
|
return fig, ax
|
|
187
187
|
|
|
188
188
|
# time vs categorical
|
|
189
|
-
is_time = self.
|
|
189
|
+
is_time = self.pms.x_is_time if isinstance(self.pms.x_is_time, bool) else TimeDetector.is_time(sdf["x"])
|
|
190
190
|
if is_time:
|
|
191
191
|
sdf["ts"] = TimeDetector.parse_times(sdf["x"])
|
|
192
192
|
sdf = sdf.dropna(subset=["ts"]) # might be empty
|
|
@@ -206,8 +206,8 @@ class GraphPlotter:
|
|
|
206
206
|
else:
|
|
207
207
|
ax.plot(s.index, s.values, label=label)
|
|
208
208
|
self._format_time_axis(ax, sdf.rename(columns={"ts":"ts"}))
|
|
209
|
-
ax.set_title(self.
|
|
210
|
-
ax.set_xlabel(self.
|
|
209
|
+
ax.set_title(self.pms.title or f"{y} over time")
|
|
210
|
+
ax.set_xlabel(self.pms.x_field)
|
|
211
211
|
ax.set_ylabel(y)
|
|
212
212
|
if any(s != "__default__" for s in sdf["set"].unique()):
|
|
213
213
|
ax.legend(title="data set")
|
|
@@ -243,8 +243,8 @@ class GraphPlotter:
|
|
|
243
243
|
tick_idx = idx
|
|
244
244
|
tick_lbl = x_vals
|
|
245
245
|
ax.set_xticks(tick_idx, tick_lbl, rotation=45)
|
|
246
|
-
ax.set_title(self.
|
|
247
|
-
ax.set_xlabel(self.
|
|
246
|
+
ax.set_title(self.pms.title or f"{y} by {self.pms.x_field}")
|
|
247
|
+
ax.set_xlabel(self.pms.x_field)
|
|
248
248
|
ax.set_ylabel(y)
|
|
249
249
|
if len(set_names) > 1 or "__default__" not in set_names:
|
|
250
250
|
ax.legend(title="data set")
|
|
@@ -323,7 +323,7 @@ class GraphPlotter:
|
|
|
323
323
|
# ---------- Misc ----------
|
|
324
324
|
def _apply_args_dict(self) -> None:
|
|
325
325
|
import matplotlib.pyplot as plt
|
|
326
|
-
for name, val in getattr(self.
|
|
326
|
+
for name, val in getattr(self.pms, "args_dict", {}).items():
|
|
327
327
|
fn = getattr(plt, name, None)
|
|
328
328
|
if callable(fn):
|
|
329
329
|
try:
|
|
@@ -26,6 +26,7 @@ class S3Sink(Sink):
|
|
|
26
26
|
_FILENAME_DIGITS: int = 4
|
|
27
27
|
|
|
28
28
|
def __init__(self, sink_class: Type[Sink], path_no_ext: str, is_gz: bool, fileno: int):
|
|
29
|
+
super().__init__(root=None, ptok=None, usage=None)
|
|
29
30
|
self.path_no_ext = path_no_ext if not path_no_ext.startswith('//') else path_no_ext[2:] # strip leading //
|
|
30
31
|
self.sink_class = sink_class
|
|
31
32
|
self.is_gz = is_gz
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import threading
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
from pjk.components import Source
|
|
9
|
+
from pjk.sources.lazy_file_local import LazyFileLocal
|
|
10
|
+
from pjk.log import logger
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class DirSource(Source):
|
|
14
|
+
"""
|
|
15
|
+
Iterate over files in a directory, materializing a concrete Source per file.
|
|
16
|
+
Coordination between clones is handled by a shared file iterator protected
|
|
17
|
+
by a lock. No queues, no is_root, no done_event.
|
|
18
|
+
"""
|
|
19
|
+
extension = 'dir' # ducklike hack so like FormatSource without the hassle
|
|
20
|
+
|
|
21
|
+
def __init__(self, root: Source, file_iter = None, source_classes: dict = None, format_override: str = None):
|
|
22
|
+
super().__init__(root=root)
|
|
23
|
+
self.current = None
|
|
24
|
+
if not root: # WE! are the root
|
|
25
|
+
if not file_iter:
|
|
26
|
+
raise Exception('root creation must include file_iter')
|
|
27
|
+
self.file_iter = file_iter
|
|
28
|
+
self.iterator_lock = threading.Lock()
|
|
29
|
+
self.format_override = format_override
|
|
30
|
+
self.source_classes = source_classes
|
|
31
|
+
|
|
32
|
+
else:
|
|
33
|
+
self.file_iter = root.file_iter
|
|
34
|
+
self.source_classes = root.source_classes
|
|
35
|
+
self.format_override = root.format_override
|
|
36
|
+
self.iterator_lock = root.iterator_lock
|
|
37
|
+
|
|
38
|
+
# ---------------------------------------------------------------------
|
|
39
|
+
# Iteration
|
|
40
|
+
# ---------------------------------------------------------------------
|
|
41
|
+
|
|
42
|
+
def __iter__(self):
|
|
43
|
+
while True:
|
|
44
|
+
if self.current is None:
|
|
45
|
+
# Pull the next file-backed Source (skip unsupported files)
|
|
46
|
+
self.current = self._get_next_source()
|
|
47
|
+
if self.current is None:
|
|
48
|
+
return # exhausted
|
|
49
|
+
|
|
50
|
+
try:
|
|
51
|
+
for record in self.current:
|
|
52
|
+
yield record
|
|
53
|
+
finally:
|
|
54
|
+
# move on after this inner source is exhausted
|
|
55
|
+
self.current = None
|
|
56
|
+
|
|
57
|
+
# ---------------------------------------------------------------------
|
|
58
|
+
# Contention boundary: only here we touch the shared iterator
|
|
59
|
+
# ---------------------------------------------------------------------
|
|
60
|
+
|
|
61
|
+
# needed in deep_clone to stop iteration
|
|
62
|
+
def has_next(self):
    """
    Report whether an inner source is available to iterate.

    When nothing is pending, this claims the next source via
    _get_next_source and keeps it in self.current, so a True result means
    the source is already reserved for this instance.
    """
    if self.current is None:
        self.current = self._get_next_source()
    return self.current is not None
|
|
68
|
+
|
|
69
|
+
def get_next_file(self) -> Optional[str]:
    """
    Advance the shared file iterator while holding iterator_lock.

    Returns the next file path, or None once the iterator is exhausted.
    After exhaustion this instance's file_iter is set to None, so later
    calls return None without touching the iterator again.
    """
    with self.iterator_lock:
        if self.file_iter is None:
            return None
        try:
            path = next(self.file_iter)
        except StopIteration:
            self.file_iter = None
            logger.debug('get_next_file -> None (exhausted)')
            return None
        else:
            logger.debug(f'get_next_file -> {path}')
            return path
|
|
85
|
+
|
|
86
|
+
def _get_next_source(self) -> Optional[Source]:
    """
    Draw files until one can be turned into a Source, or none are left.

    Files for which _file_to_source returns None are logged and skipped.
    Returns None when get_next_file reports exhaustion.
    """
    # iter(callable, sentinel) keeps calling get_next_file until it returns None.
    for file in iter(self.get_next_file, None):
        src = self._file_to_source(file)
        if src is not None:
            logger.debug(f'next source (from file) = {src}')
            return src
        logger.debug(f'skipping unsupported file: {file}')
    return None
|
|
100
|
+
|
|
101
|
+
# ---------------------------------------------------------------------
|
|
102
|
+
# Helpers
|
|
103
|
+
# ---------------------------------------------------------------------
|
|
104
|
+
|
|
105
|
+
def _file_to_source(self, file: str) -> Optional[Source]:
    """
    Build the Source for one file path, or return None if it is unsupported.

    The format is the last dot-separated piece of the path, after dropping a
    trailing 'gz' piece (which also marks the file as gzipped). A truthy
    format_override replaces both the format and the gzip flag. None is
    returned when no format results or source_classes has no entry for it.
    """
    pieces = file.split('.')
    # str.split always returns at least one piece, so pieces[-1] is safe here.
    is_gz = pieces[-1] == 'gz'
    if is_gz:
        pieces.pop()

    # A path that is exactly 'gz' leaves nothing behind after the pop.
    fmt = pieces[-1] if pieces else None

    override = self.format_override
    if override:
        fmt, is_gz = self.get_format_gz(override)

    if not fmt:
        return None

    factory = self.source_classes.get(fmt)
    if not factory:
        return None

    return factory(LazyFileLocal(file, is_gz))
|
|
127
|
+
|
|
128
|
+
def deep_copy(self):
    """
    Return a clone built with this instance as its root, or None.

    The clone is only handed out if has_next() succeeds on it, i.e. it has
    already claimed an inner source; otherwise None is returned.
    """
    clone = DirSource(self)
    return clone if clone.has_next() else None
|
|
134
|
+
|
|
135
|
+
# ---------------------------------------------------------------------
|
|
136
|
+
# Class utilities
|
|
137
|
+
# ---------------------------------------------------------------------
|
|
138
|
+
|
|
139
|
+
@classmethod
|
|
140
|
+
def get_format_gz(cls, input_str: str):
    """
    Split a format string into (format, is_gz).

    A trailing '.gz' is removed and reported as is_gz=True; any other
    string is returned unchanged with is_gz=False.
    """
    gz_suffix = '.gz'
    if not input_str.endswith(gz_suffix):
        return input_str, False
    return input_str[:-len(gz_suffix)], True
|
|
147
|
+
|
|
148
|
+
@classmethod
|
|
149
|
+
def _iter_files(cls, path: str, recursive: bool):
    """
    Lazily yield the paths of regular files under path.

    With recursive false only the directory's own entries are considered;
    otherwise the tree is walked top-down without following directory
    symlinks. Entries that os.path.isfile rejects are left out.
    """
    if recursive:
        for dirpath, _, filenames in os.walk(path, topdown=True, followlinks=False):
            candidates = (os.path.join(dirpath, name) for name in filenames)
            yield from filter(os.path.isfile, candidates)
    else:
        candidates = (os.path.join(path, entry) for entry in os.listdir(path))
        yield from filter(os.path.isfile, candidates)
|
|
162
|
+
|
|
163
|
+
@classmethod
|
|
164
|
+
def create(
    cls,
    source_classes: dict,
    path_no_ext: str,
    format_override: Optional[str] = None,
    recursive: bool = False,
):
    """
    Factory: build the root DirSource for a directory.

    The files under path_no_ext are enumerated lazily through _iter_files;
    recursive selects whether subdirectories are included. source_classes
    and format_override are passed straight to the constructor.
    """
    return DirSource(
        root=None,  # the instance built here is the root
        file_iter=cls._iter_files(path_no_ext, recursive),
        source_classes=source_classes,
        format_override=format_override,
    )
|
|
@@ -16,6 +16,7 @@ class SourceFormatUsage(NoBindUsage):
|
|
|
16
16
|
self.def_syntax("") # no syntax for these
|
|
17
17
|
# default = None because for source, format is an OVERRIDE
|
|
18
18
|
self.def_param('format', 'file format', is_num=False, valid_values={'json', 'csv', 'tsv', 'json.gz', 'tsv.gz', 'csv.gz'}, default=None)
|
|
19
|
+
self.def_param('recursive', 'for local direcories only', is_num=False, valid_values={'true', 'false'}, default=False)
|
|
19
20
|
self.def_example(expr_tokens=[f"myfile.{name}", "-"], expect=None)
|
|
20
21
|
self.def_example(expr_tokens=["mydir", "-"], expect=None)
|
|
21
22
|
self.def_example(expr_tokens=[f"s3://mybucket/myfile.{name}", "-"], expect=None)
|
|
@@ -94,7 +95,8 @@ class FormatSource(Source):
|
|
|
94
95
|
return S3Source.create(sources, path_no_ext, ext, format_override=format_override)
|
|
95
96
|
|
|
96
97
|
if os.path.isdir(path_no_ext):
|
|
97
|
-
|
|
98
|
+
recursive = usage.get_param('recursive') == 'true'
|
|
99
|
+
return DirSource.create(sources, path_no_ext, format_override=format_override, recursive=recursive)
|
|
98
100
|
|
|
99
101
|
return None
|
|
100
102
|
|
|
@@ -1,82 +0,0 @@
|
|
|
1
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
-
# Copyright 2024 Mike Schultz
|
|
3
|
-
|
|
4
|
-
import os
|
|
5
|
-
from typing import Any
|
|
6
|
-
from queue import Queue, Empty
|
|
7
|
-
from pjk.components import Source
|
|
8
|
-
from pjk.sources.lazy_file_local import LazyFileLocal
|
|
9
|
-
from pjk.log import logger
|
|
10
|
-
|
|
11
|
-
class DirSource(Source):
    """
    Source that reads the files of one directory through a shared queue.

    create() builds one inner source per supported file and puts them on a
    Queue. The original instance and any copies from deep_copy() take inner
    sources off that same queue until it is empty.
    """
    extension = 'dir' # ducklike hack so like FormatSource without the hassle

    def __init__(self, source_queue: Queue, in_source: Source = None):
        """
        source_queue: queue of inner sources shared with all copies.
        in_source: optional inner source to iterate first.
        """
        self.source_queue = source_queue
        self.current = in_source

    def __iter__(self):
        """Yield the records of each inner source until the queue is empty."""
        while True:
            if self.current is None:
                try:
                    self.current = self.source_queue.get_nowait()
                    logger.debug(f'next source={self.current}')
                except Empty:
                    return # end of all sources

            try:
                for record in self.current:
                    yield record
            finally:
                self.current = None # move to next source after exhaustion

    def deep_copy(self):
        """
        Return a copy that starts on the next queued source, or None.

        None is returned when at most one source is queued, so the remaining
        file stays with the original.
        """
        if self.source_queue.qsize() <= 1:
            return None # leave remaining files to original
        try:
            next_source = self.source_queue.get_nowait()
            logger.debug(f'deep_copy next_source={next_source}')
        except Empty:
            return None

        return DirSource(self.source_queue, next_source)

    @classmethod
    def get_format_gz(cls, input:str):
        """Split a format string into (format, is_gz), stripping a trailing '.gz'."""
        if input.endswith('.gz'):
            return input[:-3], True
        return input, False

    @classmethod
    def create(cls, sources: dict, path_no_ext: str, format_override: str = None):
        """
        Build a DirSource over the regular files directly inside path_no_ext.

        sources: mapping from format name to the source class for that format.
        format_override: when given, replaces the format and gzip flag that
            would otherwise be taken from each file name.

        Files whose format has no entry in sources are skipped. Returns None
        when no file yields a source.
        """
        files = [
            os.path.join(path_no_ext, f)
            for f in os.listdir(path_no_ext)
            if os.path.isfile(os.path.join(path_no_ext, f))
        ]

        source_queue = Queue()
        for file in files:
            parts = file.split('.')
            is_gz = False

            if parts[-1] == 'gz':
                is_gz = True
                parts.pop()

            # A path that is exactly 'gz' leaves no parts after the pop.
            fmt = parts[-1] if parts else None

            if format_override:
                fmt, is_gz = cls.get_format_gz(format_override)

            source_class = sources.get(fmt)
            if source_class is None:
                # Unregistered extension: calling None here would raise
                # TypeError, so skip the file instead.
                logger.debug(f'skipping unsupported file: {file}')
                continue

            lazy_file = LazyFileLocal(file, is_gz)
            source_queue.put(source_class(lazy_file))

        if source_queue.empty():
            return None

        return DirSource(source_queue)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/integrations/opensearch_client.py
RENAMED
|
File without changes
|
{python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/integrations/opensearch_index_sink.py
RENAMED
|
File without changes
|
{python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/integrations/opensearch_query_pipe.py
RENAMED
|
File without changes
|
|
File without changes
|
{python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/integrations/snowflake_pipe.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/pjk/sources/user_source_factory.py
RENAMED
|
File without changes
|
|
File without changes
|
{python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/python_jack_knife.egg-info/SOURCES.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/python_jack_knife.egg-info/requires.txt
RENAMED
|
File without changes
|
{python_jack_knife-0.6.16 → python_jack_knife-0.6.17}/src/python_jack_knife.egg-info/top_level.txt
RENAMED
|
File without changes
|