python-jack-knife 0.6.15__tar.gz → 0.6.17__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/PKG-INFO +1 -1
  2. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/components.py +9 -5
  3. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/parser.py +3 -2
  4. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/denorm.py +2 -2
  5. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/filter.py +2 -2
  6. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/join.py +2 -2
  7. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/let_reduce.py +5 -5
  8. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/map.py +2 -2
  9. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/query_pipe.py +1 -1
  10. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/remove_field.py +1 -1
  11. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/select.py +1 -1
  12. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/sort.py +1 -1
  13. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/tail.py +1 -1
  14. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/where.py +5 -3
  15. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/progress.py +61 -46
  16. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/graph.py +2 -0
  17. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/graph_bar_line.py +14 -14
  18. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/s3_sink.py +1 -0
  19. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/csv_source.py +1 -0
  20. python_jack_knife-0.6.17/src/pjk/sources/dir_source.py +181 -0
  21. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/format_source.py +3 -1
  22. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/inline_source.py +1 -0
  23. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/json_source.py +1 -0
  24. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/s3_source.py +1 -0
  25. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/sql_source.py +1 -0
  26. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/version.py +1 -1
  27. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/python_jack_knife.egg-info/PKG-INFO +1 -1
  28. python_jack_knife-0.6.15/src/pjk/sources/dir_source.py +0 -82
  29. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/LICENSE +0 -0
  30. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/README.md +0 -0
  31. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/pyproject.toml +0 -0
  32. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/setup.cfg +0 -0
  33. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/__init__.py +0 -0
  34. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/common.py +0 -0
  35. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/integrations/opensearch_client.py +0 -0
  36. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/integrations/opensearch_index_sink.py +0 -0
  37. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/integrations/opensearch_query_pipe.py +0 -0
  38. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/integrations/postgres_pipe.py +0 -0
  39. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/integrations/snowflake_pipe.py +0 -0
  40. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/log.py +0 -0
  41. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/main.py +0 -0
  42. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/man_page.py +0 -0
  43. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/__init__.py +0 -0
  44. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/factory.py +0 -0
  45. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/head.py +0 -0
  46. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/move_field.py +0 -0
  47. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/progress_pipe.py +0 -0
  48. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/sample.py +0 -0
  49. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/user_pipe_factory.py +0 -0
  50. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/registry.py +0 -0
  51. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/__init__.py +0 -0
  52. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/create_sink.py +0 -0
  53. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/csv_sink.py +0 -0
  54. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/devnull.py +0 -0
  55. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/dir_sink.py +0 -0
  56. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/expect.py +0 -0
  57. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/factory.py +0 -0
  58. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/format_sink.py +0 -0
  59. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/graph_cumulative.py +0 -0
  60. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/graph_hist.py +0 -0
  61. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/graph_scatter.py +0 -0
  62. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/json_sink.py +0 -0
  63. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/s3_stream.py +0 -0
  64. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/sinks.py +0 -0
  65. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/stdout.py +0 -0
  66. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/tsv_sink.py +0 -0
  67. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/user_sink_factory.py +0 -0
  68. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/__init__.py +0 -0
  69. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/configs_source.py +0 -0
  70. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/factory.py +0 -0
  71. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/favorite_source.py +0 -0
  72. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/lazy_file.py +0 -0
  73. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/lazy_file_local.py +0 -0
  74. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/lazy_file_s3.py +0 -0
  75. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/macro_source.py +0 -0
  76. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/npy_source.py +0 -0
  77. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/parquet_source.py +0 -0
  78. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/source_list.py +0 -0
  79. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/tsv_source.py +0 -0
  80. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/user_source_factory.py +0 -0
  81. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/usage.py +0 -0
  82. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/python_jack_knife.egg-info/SOURCES.txt +0 -0
  83. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/python_jack_knife.egg-info/dependency_links.txt +0 -0
  84. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/python_jack_knife.egg-info/entry_points.txt +0 -0
  85. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/python_jack_knife.egg-info/requires.txt +0 -0
  86. {python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/python_jack_knife.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-jack-knife
3
- Version: 0.6.15
3
+ Version: 0.6.17
4
4
  Summary: Python Jack Knife – a command line data processor
5
5
  Author-email: Mike Schultz <mike.schultz@gmail.com>
6
6
  License:
@@ -35,9 +35,12 @@ class Source(ABC):
35
35
  component_class=cls
36
36
  )
37
37
 
38
+ def __init__(self, root = None):
39
+ self.root = root
40
+
38
41
  @abstractmethod
39
42
  def __iter__(self):
40
- raise NotImplementedError("__iter__ must be implemented by subclasses")
43
+ pass
41
44
 
42
45
  def __next__(self):
43
46
  # lazily create an internal iterator the first time next() is called
@@ -57,7 +60,8 @@ class Source(ABC):
57
60
  class Pipe(Source):
58
61
  arity: int = 1
59
62
 
60
- def __init__(self, ptok: ParsedToken, usage: Usage = None):
63
+ def __init__(self, ptok: ParsedToken, usage: Usage, root = None):
64
+ self.root = root
61
65
  self.ptok = ptok
62
66
  self.usage = usage
63
67
  self.left = None # left source for convenience
@@ -95,7 +99,7 @@ class DeepCopyPipe(Pipe):
95
99
  return None
96
100
 
97
101
  # re-instantiate using the actual subclass
98
- pipe = type(self)(self.ptok, self.usage)
102
+ pipe = type(self)(self.ptok, self.usage, self) # this self is the root
99
103
  pipe.add_source(source_clone)
100
104
  return pipe
101
105
 
@@ -108,7 +112,8 @@ class Sink(ABC):
108
112
  component_class=cls
109
113
  )
110
114
 
111
- def __init__(self, ptok: ParsedToken, usage: Usage = None):
115
+ def __init__(self, ptok: ParsedToken, usage: Usage, root = None):
116
+ self.root = root
112
117
  self.ptok = ptok
113
118
  self.usage = usage
114
119
 
@@ -135,4 +140,3 @@ class Sink(ABC):
135
140
 
136
141
  def deep_copy(self):
137
142
  return None
138
-
@@ -187,7 +187,7 @@ class ExpressionParser:
187
187
 
188
188
  class ReducerAggregatorPipe(Pipe):
189
189
  def __init__(self, top_level_reducers: List[Any]):
190
- super().__init__(None)
190
+ super().__init__(None, None)
191
191
  self.top_level_reducers = top_level_reducers
192
192
  self.reduction = {}
193
193
  self.done = False
@@ -275,6 +275,7 @@ class UpstreamSource(Source):
275
275
  return u
276
276
 
277
277
  def __init__(self):
278
+ super().__init__(root=None)
278
279
  self.data = []
279
280
  self.inner_source = None
280
281
  self.sub_recs_in = papi.get_counter(self, var_label='sub_recs_in')
@@ -338,7 +339,7 @@ class SubExpression(Pipe, ProgressIgnore):
338
339
  return None
339
340
 
340
341
  def __init__(self, ptok: ParsedToken, usage: Usage):
341
- super().__init__(ptok)
342
+ super().__init__(ptok, usage)
342
343
  self.subexp_ops = []
343
344
  self.stack_helper = StackLoader()
344
345
  self.subexp_stack = OperandStack()
@@ -52,10 +52,10 @@ class DenormPipe(Pipe):
52
52
  return usage
53
53
 
54
54
  def __init__(self, ptok: ParsedToken, usage: Usage):
55
- super().__init__(ptok)
55
+ super().__init__(ptok, usage)
56
56
 
57
57
  self.field = usage.get_arg('field')
58
- self.recs_in = papi.get_counter(self, None) # don't display
58
+ self.recs_in = papi.get_counter(self, 'recs_in', display=False)
59
59
  self.recs_out = papi.get_percentage_counter(self, 'recs_out', self.recs_in)
60
60
 
61
61
  self._pending_iter = None
@@ -39,11 +39,11 @@ class FilterPipe(Pipe):
39
39
  return usage
40
40
 
41
41
  def __init__(self, ptok: ParsedToken, usage: Usage):
42
- super().__init__(ptok)
42
+ super().__init__(ptok, usage)
43
43
  self.mode = usage.get_arg('mode')
44
44
  self.left = None
45
45
  self.right = None
46
- self.recs_in = papi.get_counter(self, None) # don't display
46
+ self.recs_in = papi.get_counter(self, 'recs_in', display=False)
47
47
  self.recs_out = papi.get_percentage_counter(self, 'recs_out', self.recs_in)
48
48
 
49
49
  def reset(self):
@@ -60,7 +60,7 @@ class JoinPipe(Pipe):
60
60
  return usage
61
61
 
62
62
  def __init__(self, ptok: ParsedToken, usage: Usage):
63
- super().__init__(ptok)
63
+ super().__init__(ptok, usage)
64
64
 
65
65
  self.mode = usage.get_arg('mode')
66
66
  self.left = None
@@ -68,7 +68,7 @@ class JoinPipe(Pipe):
68
68
  self._pending_right = None
69
69
  self._check_right = False
70
70
 
71
- self.recs_in = papi.get_counter(self, None) # don't display
71
+ self.recs_in = papi.get_counter(self, 'recs_in', display=False)
72
72
  self.matches = papi.get_percentage_counter(self, 'matches', self.recs_in)
73
73
  self.recs_out = papi.get_counter(self, 'recs_out')
74
74
 
@@ -3,7 +3,7 @@
3
3
 
4
4
  # djk/pipes/let_reduce.py
5
5
 
6
- from pjk.components import Pipe
6
+ from pjk.components import DeepCopyPipe
7
7
  from pjk.usage import ParsedToken, Usage, UsageError, TokenError, NoBindUsage
8
8
  from pjk.common import SafeNamespace, ReducingNamespace
9
9
  import re
@@ -78,7 +78,7 @@ def eval_accumulating(expr: str, record: dict, op: str, acc=None):
78
78
  return do_eval(expr, env)
79
79
 
80
80
  # --- LetPipe (simple field assignment) ---
81
- class LetPipe(Pipe):
81
+ class LetPipe(DeepCopyPipe):
82
82
  @classmethod
83
83
  def usage(cls):
84
84
  usage = NoBindUsage( # can't use bound usage because of complicated parsing
@@ -93,7 +93,7 @@ class LetPipe(Pipe):
93
93
  return usage
94
94
 
95
95
  def __init__(self, ptok: ParsedToken, usage: Usage):
96
- super().__init__(ptok)
96
+ super().__init__(ptok, usage)
97
97
  args = parse_args(ptok.whole_token.split(':', 1)[-1])
98
98
  self.field = args['field']
99
99
  self.op = args['op']
@@ -121,7 +121,7 @@ def is_comprehension(expr: str) -> bool:
121
121
  except SyntaxError:
122
122
  return False
123
123
 
124
- class ReducePipe(Pipe):
124
+ class ReducePipe(DeepCopyPipe):
125
125
  @classmethod
126
126
  def usage(cls):
127
127
  usage = NoBindUsage( # can't use bound usage because of complicated parsing
@@ -161,7 +161,7 @@ class ReducePipe(Pipe):
161
161
  return usage
162
162
 
163
163
  def __init__(self, ptok: ParsedToken, usage: Usage):
164
- super().__init__(ptok)
164
+ super().__init__(ptok, usage)
165
165
  args = parse_args(ptok.whole_token.split(':', 1)[-1])
166
166
  self.field = args['field']
167
167
  self.op = args['op']
@@ -28,7 +28,7 @@ class MapByPipe(Pipe, KeyedSource):
28
28
  return u
29
29
 
30
30
  def __init__(self, ptok: ParsedToken, usage: Usage):
31
- super().__init__(ptok)
31
+ super().__init__(ptok, usage)
32
32
  self.is_group = False
33
33
  self.fields = usage.get_arg('key').split(',')
34
34
  self.rec_map = {}
@@ -37,7 +37,7 @@ class MapByPipe(Pipe, KeyedSource):
37
37
  self.do_count = usage.get_param(name='count').lower() == 'true'
38
38
  self.counts = {}
39
39
  self.missing_keys = papi.get_counter(self, 'missing_keys')
40
- self.recs_in = papi.get_counter(self, None) # don't display
40
+ self.recs_in = papi.get_counter(self, 'recs_in', display=False)
41
41
  # recs_out = distinct_keys
42
42
  self.distinct_keys = papi.get_percentage_counter(self, 'recs_out', self.recs_in)
43
43
 
@@ -40,7 +40,7 @@ class QueryPipe(Pipe):
40
40
  self.output_shape = usage.get_param('shape')
41
41
  self.count = usage.get_param('count')
42
42
  self.query_field = 'query' # for all subclasses
43
- self.inrecs = papi.get_counter(self, var_label=None) # don't display progress
43
+ self.inrecs = papi.get_counter(self, var_label='recs_in')
44
44
  self.outrecs = papi.get_percentage_counter(self, var_label='recs_out', denom_counter=self.inrecs)
45
45
 
46
46
  @abstractmethod
@@ -19,7 +19,7 @@ class RemoveField(DeepCopyPipe):
19
19
  return usage
20
20
 
21
21
  def __init__(self, ptok: ParsedToken, usage: Usage):
22
- super().__init__(ptok)
22
+ super().__init__(ptok, usage)
23
23
  arg_string = usage.get_arg('fields')
24
24
  self.fields = [f.strip() for f in arg_string.split(',') if f.strip()]
25
25
  if not self.fields:
@@ -19,7 +19,7 @@ class SelectFields(DeepCopyPipe):
19
19
  return usage
20
20
 
21
21
  def __init__(self, ptok: ParsedToken, usage: Usage):
22
- super().__init__(ptok)
22
+ super().__init__(ptok, usage)
23
23
 
24
24
  arg_string = usage.get_arg('fields')
25
25
  if not arg_string:
@@ -21,7 +21,7 @@ class SortPipe(Pipe):
21
21
  return usage
22
22
 
23
23
  def __init__(self, ptok: ParsedToken, usage: Usage):
24
- super().__init__(ptok)
24
+ super().__init__(ptok, usage)
25
25
 
26
26
  arg_string = usage.get_arg('field')
27
27
  if arg_string.startswith("-"):
@@ -19,7 +19,7 @@ class TailPipe(Pipe):
19
19
  return usage
20
20
 
21
21
  def __init__(self, ptok: ParsedToken, usage: Usage):
22
- super().__init__(ptok)
22
+ super().__init__(ptok, usage)
23
23
  self.limit = usage.get_arg('limit')
24
24
 
25
25
  self.buffer = []
@@ -22,10 +22,11 @@ class WherePipe(DeepCopyPipe):
22
22
  u.def_example(expr_tokens=["[{color:'blue'}, {color:'red'}, {color:'black'}]", "where:f.color.startswith('bl')"], expect="[{color:'blue'}, {color:'black'}]")
23
23
  return u
24
24
 
25
- def __init__(self, ptok: ParsedToken, usage: Usage):
26
- super().__init__(ptok, usage)
25
+ def __init__(self, ptok: ParsedToken, usage: Usage, root = None):
26
+ super().__init__(ptok, usage, root)
27
27
  self.expr = ptok.whole_token.split(':', 1)[1]
28
- self.inrecs = papi.get_counter(self, var_label=None) # don't display progress
28
+
29
+ self.inrecs = papi.get_counter(self, var_label='recs_in', display=False)
29
30
  self.outrecs = papi.get_percentage_counter(self, var_label='recs_out', denom_counter=self.inrecs)
30
31
  try:
31
32
  self.code = compile(self.expr, '<where>', 'eval')
@@ -46,3 +47,4 @@ class WherePipe(DeepCopyPipe):
46
47
  except Exception:
47
48
  continue # ignore eval errors
48
49
 
50
+
@@ -13,18 +13,26 @@ class ProgressIgnore:
13
13
 
14
14
  class Report:
15
15
  def __init__(self):
16
- self.name_value_tuples = []
16
+ self._values: dict[str, Any] = {}
17
17
  self.parse_level = -1
18
+ self.invisibles = set()
18
19
 
19
- def add_value(self, name, value):
20
- self.name_value_tuples.append((name, value))
20
+ def set_or_get_value(self, name, value):
21
+ # store once; subsequent calls return the existing object
22
+ return self._values.setdefault(name, value)
23
+
24
+ def get_value(self, name):
25
+ return self._values.get(name)
21
26
 
22
27
  def get_name_value_tuples(self):
23
- return self.name_value_tuples
28
+ return self._values.items()
24
29
 
25
30
  def set_parse_level(self, level: int):
26
31
  self.parse_level = level
27
32
 
33
+ def make_invisible(self, var_label:str):
34
+ self.invisibles.add(var_label)
35
+
28
36
  def get_parse_level(self):
29
37
  return self.parse_level
30
38
 
@@ -58,29 +66,7 @@ class ProgressDisplay:
58
66
  while not self._stop_event.is_set():
59
67
  snap = self.api.snapshot()
60
68
  lines = self._render_lines(snap)
61
-
62
- # Move up to overwrite previous block
63
- if self._last_lines:
64
- self.stream.write(f"{CSI}{self._last_lines}F") # move cursor up N lines, to column 1
65
-
66
- # Write fresh lines
67
- for line in lines:
68
- self.stream.write(line + "\n")
69
-
70
- # Erase extra old lines if the block got shorter
71
- if self._last_lines > len(lines):
72
- diff = self._last_lines - len(lines)
73
- for _ in range(diff):
74
- self.stream.write(" " * 120 + "\n")
75
- # move cursor up to top of block again
76
- self.stream.write(f"{CSI}{self._last_lines}F")
77
-
78
- try:
79
- self.stream.flush()
80
- except Exception:
81
- pass
82
-
83
- self._last_lines = len(lines)
69
+ self._write_lines(lines)
84
70
 
85
71
  if self._stop_event.wait(self.interval):
86
72
  break
@@ -88,18 +74,38 @@ class ProgressDisplay:
88
74
  # --- FINAL REFRESH ON SHUTDOWN ---
89
75
  reports = self.api.snapshot()
90
76
  lines = self._render_lines(reports)
77
+ self._write_lines(lines, final=True)
78
+
79
+ def _write_lines(self, lines, final: bool = False):
80
+ """
81
+ Render output either by rewriting the previous block (TTY) or by
82
+ printing a fresh snapshot (non-TTY fallback).
83
+ """
84
+ prev_lines = self._last_lines
91
85
 
92
- if self._last_lines:
93
- self.stream.write(f"{CSI}{self._last_lines}F")
86
+ if self._use_ansi:
87
+ if prev_lines:
88
+ # Move cursor up to the beginning of the old block
89
+ self.stream.write(f"{CSI}{prev_lines}F")
94
90
 
95
- for line in lines:
96
- self.stream.write(line + "\n")
91
+ for line in lines:
92
+ self.stream.write(line + "\n")
97
93
 
98
- if self._last_lines > len(lines):
99
- diff = self._last_lines - len(lines)
100
- for _ in range(diff):
101
- self.stream.write(" " * 120 + "\n")
102
- self.stream.write(f"{CSI}{self._last_lines}F")
94
+ if prev_lines > len(lines):
95
+ diff = prev_lines - len(lines)
96
+ blank = " " * 120
97
+ for _ in range(diff):
98
+ self.stream.write(blank + "\n")
99
+ # move cursor back to sit just below the freshly written block
100
+ self.stream.write(f"{CSI}{diff}F")
101
+ else:
102
+ # Best-effort fallback when we cannot reposition the cursor.
103
+ if prev_lines and not final:
104
+ self.stream.write("\n")
105
+ for line in lines:
106
+ self.stream.write(line + "\n")
107
+ if prev_lines and not final:
108
+ self.stream.write("-" * 40 + "\n")
103
109
 
104
110
  try:
105
111
  self.stream.flush()
@@ -125,6 +131,9 @@ class ProgressDisplay:
125
131
  label = f'{indent}{key}'
126
132
  parts = [f"{label:<{KEY_W}.{KEY_W}}"] # left col, truncated if too long
127
133
  for name, val in report.get_name_value_tuples():
134
+ if name in report.invisibles:
135
+ continue
136
+
128
137
  token = f"{name}={val}" # __str__ handles formatting
129
138
  parts.append(f"{token:<{COL_W}}") # left-justify, hard truncate at COL_W
130
139
  return highlight(" ".join(parts), 'bold', key)
@@ -210,8 +219,8 @@ class ProgressAPI:
210
219
  self._parse_depth: Dict[int, int] = {} # component id -> level
211
220
  self.level = 0
212
221
 
213
- def get_counter(self, component: Source | Sink, var_label: str) -> SafeCounter:
214
- return self._update_storage(component, var_label=var_label, value=SafeCounter())
222
+ def get_counter(self, component: Source | Sink, var_label: str, display: bool = True) -> SafeCounter:
223
+ return self._update_storage(component, var_label=var_label, value=SafeCounter(), display=display)
215
224
 
216
225
  # returns the numerator counter
217
226
  def get_percentage_counter(self, component: Source | Sink, var_label: str, denom_counter: SafeCounter):
@@ -229,7 +238,7 @@ class ProgressAPI:
229
238
  report.set_parse_level(level)
230
239
  return self._reports
231
240
 
232
- # could happen before or after update storage
241
+ # could happen before or after update storage (done in operand stack to get levels right)
233
242
  def register_component(self, component: Source | Sink, stack_level: int):
234
243
  if isinstance(component, ProgressIgnore):
235
244
  return # um, ignore
@@ -238,20 +247,26 @@ class ProgressAPI:
238
247
  self._parse_depth[comp_id] = stack_level
239
248
  self._update_storage(component, var_label=None, value=None) # just register, no values
240
249
 
241
- def _update_storage(self, component: Source | Sink, var_label: str, value: Any):
250
+ def _update_storage(self, component: Source | Sink, var_label: str, value: Any, display:bool = True):
242
251
  # we can have multiple instances of a component type in an expression so we need to
243
252
  # differentiate by id when we put them in the _store.
244
253
  component_label = self._get_component_label(component)
245
- store_key = (component_label, id(component))
254
+
255
+ # create a unique id for the variable that is common across clones
256
+ comp_id = id(component) if component.root is None else id(component.root)
257
+
258
+ store_key = (component_label, comp_id)
246
259
  report = self._reports.setdefault(store_key, Report())
247
- if not value: # when just registering component
260
+ if value is None: # when just registering component
248
261
  return None
249
262
 
250
- if var_label:
251
- # only when var_label not None, do we want the stat displayed
252
- report.add_value(var_label, value)
263
+ if not var_label:
264
+ raise Exception('unique var_label is required')
253
265
 
254
- return value
266
+ if not display:
267
+ report.make_invisible(var_label)
268
+
269
+ return report.set_or_get_value(var_label, value)
255
270
 
256
271
  # some hacking to get at reasonable labels
257
272
  def _get_component_label(self, component: Source | Sink):
@@ -21,6 +21,7 @@ class GraphSink(Sink):
21
21
  usage.def_param(name='x', usage='x-axis field', default='x')
22
22
  usage.def_param(name='y', usage='comma separated list of y-axis fields', default='y')
23
23
  usage.def_param(name='pause', usage='Seconds to show graph', is_num=True, default='-1')
24
+ usage.def_param(name='title', usage='A title for the graph', is_num=False)
24
25
  return usage
25
26
 
26
27
  def __init__(self, ptok: ParsedToken, usage: Usage):
@@ -30,6 +31,7 @@ class GraphSink(Sink):
30
31
  self.x_field = usage.get_param('x')
31
32
  self.y_field = usage.get_param('y')
32
33
  self.pause = usage.get_param('pause')
34
+ self.title = usage.get_param('title')
33
35
 
34
36
  def process(self):
35
37
  import matplotlib.pyplot as plt # lazy import
@@ -137,8 +137,8 @@ class SingleYWithSetsAdapter:
137
137
  # ----------------------------- Plotter -----------------------------
138
138
  class GraphPlotter:
139
139
  def __init__(self, params: GraphParams):
140
- self.p = params
141
- self.y_fields = list(dict.fromkeys(self.p.y_fields)) # dedupe, preserve order
140
+ self.pms = params
141
+ self.y_fields = list(dict.fromkeys(self.pms.y_fields)) # dedupe, preserve order
142
142
 
143
143
  def plot(self, chart_type: str = "line"):
144
144
  import matplotlib.pyplot as plt
@@ -149,8 +149,8 @@ class GraphPlotter:
149
149
 
150
150
  # Multi-Y path (preferred)
151
151
  if len(self.y_fields) > 1:
152
- df = MultiYAdapter.to_df(self.p.records, self.p.x_field, self.y_fields)
153
- is_time = self.p.x_is_time if isinstance(self.p.x_is_time, bool) else TimeDetector.is_time(df["x"])
152
+ df = MultiYAdapter.to_df(self.pms.records, self.pms.x_field, self.y_fields)
153
+ is_time = self.pms.x_is_time if isinstance(self.pms.x_is_time, bool) else TimeDetector.is_time(df["x"])
154
154
  if is_time:
155
155
  df["ts"] = TimeDetector.parse_times(df["x"])
156
156
  df = df.dropna(subset=["ts"]).sort_values("ts")
@@ -165,12 +165,12 @@ class GraphPlotter:
165
165
  else:
166
166
  self._lines_categorical(ax, df, self.y_fields)
167
167
  self._format_categorical_axis(ax, df)
168
- title = self.p.title or ("Line over time" if is_time and chart_type=="line" else
168
+ title = self.pms.title or ("Line over time" if is_time and chart_type=="line" else
169
169
  "Bar over time" if is_time else
170
170
  "Line by category" if chart_type=="line" else
171
171
  "Bar by category")
172
172
  ax.set_title(title)
173
- ax.set_xlabel(self.p.x_field)
173
+ ax.set_xlabel(self.pms.x_field)
174
174
  ax.set_ylabel(", ".join(self.y_fields))
175
175
  ax.legend(title="Series")
176
176
  self._apply_args_dict()
@@ -180,13 +180,13 @@ class GraphPlotter:
180
180
 
181
181
  # Single-Y legacy path (maybe with set_name)
182
182
  y = self.y_fields[0]
183
- sdf = SingleYWithSetsAdapter.to_df(self.p.records, self.p.x_field, y)
183
+ sdf = SingleYWithSetsAdapter.to_df(self.pms.records, self.pms.x_field, y)
184
184
  if sdf.empty:
185
- print(f"No valid '{self.p.x_field}' and '{y}' records found.")
185
+ print(f"No valid '{self.pms.x_field}' and '{y}' records found.")
186
186
  return fig, ax
187
187
 
188
188
  # time vs categorical
189
- is_time = self.p.x_is_time if isinstance(self.p.x_is_time, bool) else TimeDetector.is_time(sdf["x"])
189
+ is_time = self.pms.x_is_time if isinstance(self.pms.x_is_time, bool) else TimeDetector.is_time(sdf["x"])
190
190
  if is_time:
191
191
  sdf["ts"] = TimeDetector.parse_times(sdf["x"])
192
192
  sdf = sdf.dropna(subset=["ts"]) # might be empty
@@ -206,8 +206,8 @@ class GraphPlotter:
206
206
  else:
207
207
  ax.plot(s.index, s.values, label=label)
208
208
  self._format_time_axis(ax, sdf.rename(columns={"ts":"ts"}))
209
- ax.set_title(self.p.title or f"{y} over time")
210
- ax.set_xlabel(self.p.x_field)
209
+ ax.set_title(self.pms.title or f"{y} over time")
210
+ ax.set_xlabel(self.pms.x_field)
211
211
  ax.set_ylabel(y)
212
212
  if any(s != "__default__" for s in sdf["set"].unique()):
213
213
  ax.legend(title="data set")
@@ -243,8 +243,8 @@ class GraphPlotter:
243
243
  tick_idx = idx
244
244
  tick_lbl = x_vals
245
245
  ax.set_xticks(tick_idx, tick_lbl, rotation=45)
246
- ax.set_title(self.p.title or f"{y} by {self.p.x_field}")
247
- ax.set_xlabel(self.p.x_field)
246
+ ax.set_title(self.pms.title or f"{y} by {self.pms.x_field}")
247
+ ax.set_xlabel(self.pms.x_field)
248
248
  ax.set_ylabel(y)
249
249
  if len(set_names) > 1 or "__default__" not in set_names:
250
250
  ax.legend(title="data set")
@@ -323,7 +323,7 @@ class GraphPlotter:
323
323
  # ---------- Misc ----------
324
324
  def _apply_args_dict(self) -> None:
325
325
  import matplotlib.pyplot as plt
326
- for name, val in getattr(self.p, "args_dict", {}).items():
326
+ for name, val in getattr(self.pms, "args_dict", {}).items():
327
327
  fn = getattr(plt, name, None)
328
328
  if callable(fn):
329
329
  try:
@@ -26,6 +26,7 @@ class S3Sink(Sink):
26
26
  _FILENAME_DIGITS: int = 4
27
27
 
28
28
  def __init__(self, sink_class: Type[Sink], path_no_ext: str, is_gz: bool, fileno: int):
29
+ super().__init__(root=None, ptok=None, usage=None)
29
30
  self.path_no_ext = path_no_ext if not path_no_ext.startswith('//') else path_no_ext[2:] # strip leading //
30
31
  self.sink_class = sink_class
31
32
  self.is_gz = is_gz
@@ -14,6 +14,7 @@ class CSVSource(FormatSource):
14
14
  extension = 'csv'
15
15
 
16
16
  def __init__(self, lazy_file: LazyFile, delimiter: str = ","):
17
+ super().__init__(lazy_file)
17
18
  self.lazy_file = lazy_file
18
19
  self.delimiter = delimiter
19
20
  self.num_recs = 0
@@ -0,0 +1,181 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ import os
5
+ import threading
6
+ from typing import Optional
7
+
8
+ from pjk.components import Source
9
+ from pjk.sources.lazy_file_local import LazyFileLocal
10
+ from pjk.log import logger
11
+
12
+
13
class DirSource(Source):
    """
    Iterate over files in a directory, materializing a concrete Source per file.

    Coordination between clones is handled by a shared file iterator protected
    by a lock: the root instance owns the iterator, the lock, the
    format -> Source class mapping and the optional format override, and every
    clone created from it references those same objects.
    No queues, no is_root, no done_event.
    """
    extension = 'dir' # ducklike hack so like FormatSource without the hassle

    def __init__(
        self,
        root: Optional[Source],
        file_iter=None,
        source_classes: Optional[dict] = None,
        format_override: Optional[str] = None,
    ):
        """
        Build either the root instance or a clone of an existing one.

        Args:
            root: None when creating the root; otherwise the DirSource whose
                shared iterator, lock and configuration this clone reuses.
            file_iter: iterable of file paths (root only, required).
            source_classes: mapping of format name -> Source class (root only).
            format_override: format applied to every file instead of the one
                derived from its name, e.g. 'json' or 'csv.gz' (root only).

        Raises:
            ValueError: if a root is created without file_iter.
        """
        super().__init__(root=root)
        # Per-file Source currently being drained; None means "fetch the next one".
        self.current = None
        if root is None:  # WE! are the root
            # Compare against None explicitly: an empty iterable is a valid
            # (already exhausted) listing, not a missing argument.
            if file_iter is None:
                raise ValueError('root creation must include file_iter')
            # iter() accepts any iterable (list, tuple, generator); an iterator
            # such as the generator built by create() is returned unchanged.
            self.file_iter = iter(file_iter)
            self.iterator_lock = threading.Lock()
            self.format_override = format_override
            self.source_classes = source_classes
        else:
            # Clones share the root's objects, never copies of them.
            self.file_iter = root.file_iter
            self.source_classes = root.source_classes
            self.format_override = root.format_override
            self.iterator_lock = root.iterator_lock

    # ---------------------------------------------------------------------
    # Iteration
    # ---------------------------------------------------------------------

    def __iter__(self):
        """Yield the records of each file-backed Source this instance obtains, one file after another."""
        while True:
            if self.current is None:
                # Pull the next file-backed Source (skip unsupported files)
                self.current = self._get_next_source()
                if self.current is None:
                    return  # exhausted

            try:
                for record in self.current:
                    yield record
            finally:
                # move on after this inner source is exhausted
                self.current = None

    # ---------------------------------------------------------------------
    # Contention boundary: only here we touch the shared iterator
    # ---------------------------------------------------------------------

    # needed in deep_copy to stop iteration
    def has_next(self):
        """
        Return True if this instance holds, or can obtain, a Source to read.

        When nothing is pending this pulls the next Source from the shared
        iterator and keeps it in self.current for __iter__ to consume.
        """
        if self.current is not None:
            return True

        self.current = self._get_next_source()
        return self.current is not None

    def get_next_file(self) -> Optional[str]:
        """
        Thread-safe advancement of the shared file iterator.
        Returns the next file path, or None when exhausted.
        """
        with self.iterator_lock:
            if self.file_iter is None:
                return None
            try:
                path = next(self.file_iter)
                logger.debug(f'get_next_file -> {path}')
                return path
            except StopIteration:
                # Remember exhaustion on this instance so later calls return early.
                self.file_iter = None
                logger.debug('get_next_file -> None (exhausted)')
                return None

    def _get_next_source(self) -> Optional[Source]:
        """
        Keep drawing files until we either exhaust or we can construct a Source.
        """
        while True:
            file = self.get_next_file()
            if file is None:
                return None
            src = self._file_to_source(file)
            if src is None:
                logger.debug(f'skipping unsupported file: {file}')
                continue
            logger.debug(f'next source (from file) = {src}')
            return src

    # ---------------------------------------------------------------------
    # Helpers
    # ---------------------------------------------------------------------

    def _file_to_source(self, file: str) -> Optional[Source]:
        """
        Build the Source for one file, or return None if its format is unknown.

        The format is the last dot-separated component of the path, after an
        optional trailing 'gz'; format_override, when set, replaces both the
        format and the gz flag.
        """
        parts = file.split('.')
        is_gz = False

        if parts and parts[-1] == 'gz':
            is_gz = True
            parts.pop()

        fmt = parts[-1] if parts else None

        if self.format_override:
            fmt, is_gz = self.get_format_gz(self.format_override)

        if not fmt:
            return None

        source_class = self.source_classes.get(fmt)
        if not source_class:
            return None

        lazy_file = LazyFileLocal(file, is_gz)
        return source_class(lazy_file)

    def deep_copy(self):
        """
        Return a clone that already holds a Source to read, or None when no
        further Source can be obtained from the shared iterator.
        """
        clone = DirSource(self)
        if clone.has_next():
            return clone
        else:
            return None

    # ---------------------------------------------------------------------
    # Class utilities
    # ---------------------------------------------------------------------

    @classmethod
    def get_format_gz(cls, input_str: str):
        """Split a format string such as 'csv.gz' into ('csv', True); 'csv' gives ('csv', False)."""
        is_gz = False
        fmt = input_str
        if input_str.endswith('.gz'):
            is_gz = True
            fmt = input_str[:-3]
        return fmt, is_gz

    @classmethod
    def _iter_files(cls, path: str, recursive: bool):
        """
        Lazily yield the paths of regular files under path.

        Non-recursive: only the direct entries of path. Recursive: every file
        found by os.walk, without following symlinked directories.
        """
        if not recursive:
            for f in os.listdir(path):
                full = os.path.join(path, f)
                if os.path.isfile(full):
                    yield full
            return

        for dirpath, _, filenames in os.walk(path, topdown=True, followlinks=False):
            for name in filenames:
                full = os.path.join(dirpath, name)
                if os.path.isfile(full):
                    yield full

    @classmethod
    def create(
        cls,
        source_classes: dict,
        path_no_ext: str,
        format_override: Optional[str] = None,
        recursive: bool = False,
    ):
        """
        Factory: returns a DirSource that will lazily enumerate files.
        """
        file_iter = cls._iter_files(path_no_ext, recursive)

        return DirSource(
            root=None,  # THIS is the root
            file_iter=file_iter,
            source_classes=source_classes,
            format_override=format_override,
        )
@@ -16,6 +16,7 @@ class SourceFormatUsage(NoBindUsage):
16
16
  self.def_syntax("") # no syntax for these
17
17
  # default = None because for source, format is an OVERRIDE
18
18
  self.def_param('format', 'file format', is_num=False, valid_values={'json', 'csv', 'tsv', 'json.gz', 'tsv.gz', 'csv.gz'}, default=None)
19
+ self.def_param('recursive', 'for local direcories only', is_num=False, valid_values={'true', 'false'}, default=False)
19
20
  self.def_example(expr_tokens=[f"myfile.{name}", "-"], expect=None)
20
21
  self.def_example(expr_tokens=["mydir", "-"], expect=None)
21
22
  self.def_example(expr_tokens=[f"s3://mybucket/myfile.{name}", "-"], expect=None)
@@ -94,7 +95,8 @@ class FormatSource(Source):
94
95
  return S3Source.create(sources, path_no_ext, ext, format_override=format_override)
95
96
 
96
97
  if os.path.isdir(path_no_ext):
97
- return DirSource.create(sources, path_no_ext, format_override=format_override)
98
+ recursive = usage.get_param('recursive') == 'true'
99
+ return DirSource.create(sources, path_no_ext, format_override=format_override, recursive=recursive)
98
100
 
99
101
  return None
100
102
 
@@ -33,6 +33,7 @@ class InlineSource(Source):
33
33
  return usage
34
34
 
35
35
  def __init__(self, inline_expr):
36
+ super().__init__(root=None)
36
37
  self.num_recs = 0
37
38
  try:
38
39
  obj = hjson.loads(inline_expr)
@@ -13,6 +13,7 @@ class JsonSource(FormatSource):
13
13
  extension = 'json'
14
14
 
15
15
  def __init__(self, lazy_file: LazyFile):
16
+ super().__init__(root=None)
16
17
  self.lazy_file = lazy_file
17
18
  self.num_recs = 0
18
19
 
@@ -96,6 +96,7 @@ class S3Source(Source):
96
96
  """
97
97
 
98
98
  def __init__(self, shared_state: _SharedS3State, reserved: Optional[Source] = None):
99
+ super().__init__(root=None)
99
100
  self._state = shared_state
100
101
  self._current: Optional[Source] = reserved
101
102
 
@@ -13,6 +13,7 @@ class SQLSource(FormatSource):
13
13
  desc_override = "SQL source. Emits SQL in single record in 'query' field."
14
14
 
15
15
  def __init__(self, lazy_file: LazyFile):
16
+ super().__init__(root=None)
16
17
  self.lazy_file = lazy_file
17
18
  self.num_recs = 0
18
19
 
@@ -1,4 +1,4 @@
1
1
  # SPDX-License-Identifier: Apache-2.0
2
2
  # Copyright 2024 Mike Schultz
3
3
 
4
- __version__ = "0.6.15"
4
+ __version__ = "0.6.17"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-jack-knife
3
- Version: 0.6.15
3
+ Version: 0.6.17
4
4
  Summary: Python Jack Knife – a command line data processor
5
5
  Author-email: Mike Schultz <mike.schultz@gmail.com>
6
6
  License:
@@ -1,82 +0,0 @@
1
- # SPDX-License-Identifier: Apache-2.0
2
- # Copyright 2024 Mike Schultz
3
-
4
- import os
5
- from typing import Any
6
- from queue import Queue, Empty
7
- from pjk.components import Source
8
- from pjk.sources.lazy_file_local import LazyFileLocal
9
- from pjk.log import logger
10
-
11
class DirSource(Source):
    """
    Source over the files of a directory.

    create() wraps every supported file in its concrete Source and places it on
    a queue; this instance and the copies returned by deep_copy() all draw
    from that same queue.
    """
    extension = 'dir' # ducklike hack so like FormatSource without the hassle

    def __init__(self, source_queue: Queue, in_source: Source = None):
        """
        Args:
            source_queue: queue of per-file Sources shared with all copies.
            in_source: optional Source to read first, before touching the queue.
        """
        self.source_queue = source_queue
        self.current = in_source

    def __iter__(self):
        """Yield the records of each Source taken from the queue until it is empty."""
        while True:
            if self.current is None:
                try:
                    self.current = self.source_queue.get_nowait()
                    logger.debug(f'next source={self.current}')
                except Empty:
                    return # end of all sources

            try:
                for record in self.current:
                    yield record
            finally:
                self.current = None # move to next source after exhaustion

    def deep_copy(self):
        """
        Return a new DirSource holding one Source taken from the queue, or
        None when at most one Source is left (or the queue is already empty).
        """
        if self.source_queue.qsize() <= 1:
            return None # leave remaining files to original
        try:
            next_source = self.source_queue.get_nowait()
            logger.debug(f'deep_copy next_source={next_source}')
        except Empty:
            return None

        return DirSource(self.source_queue, next_source)

    @classmethod
    def get_format_gz(cls, input:str):
        """Split a format string such as 'csv.gz' into ('csv', True); 'csv' gives ('csv', False)."""
        # `input` shadows the builtin but is part of the public signature, so it stays.
        if input.endswith('.gz'):
            return input[:-3], True
        return input, False

    @classmethod
    def create(cls, sources: dict, path_no_ext: str, format_override: str = None):
        """
        Build a DirSource over the regular files directly inside path_no_ext.

        Args:
            sources: mapping of format name -> Source class.
            path_no_ext: directory to list (not recursive).
            format_override: format applied to every file instead of the one
                derived from its name, e.g. 'json' or 'csv.gz'.

        Returns:
            A DirSource, or None when no file could be turned into a Source.
        """
        files = [
            os.path.join(path_no_ext, f)
            for f in os.listdir(path_no_ext)
            if os.path.isfile(os.path.join(path_no_ext, f))
        ]

        # The override is the same for every file: parse it once, not per file.
        override = cls.get_format_gz(format_override) if format_override else None

        source_queue = Queue()
        for file in files:
            parts = file.split('.')
            is_gz = False

            if parts[-1] == 'gz':
                is_gz = True
                parts.pop()

            fmt = parts[-1] if parts else None

            if override is not None:
                fmt, is_gz = override

            source_class = sources.get(fmt)
            if source_class is None:
                # Unknown extension: skip the file instead of calling None.
                logger.debug(f'skipping unsupported file: {file}')
                continue

            lazy_file = LazyFileLocal(file, is_gz)
            source_queue.put(source_class(lazy_file))

        if source_queue.empty():
            return None

        return DirSource(source_queue)