PyPI - python-jack-knife - Versions diffs - 0.6.15__tar.gz → 0.6.17__tar.gz - Mend

python-jack-knife 0.6.15 (tar.gz) → 0.6.17 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (86) hide show

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: python-jack-knife
-Version: 0.6.15
+Version: 0.6.17
 Summary: Python Jack Knife – a command line data processor
 Author-email: Mike Schultz <mike.schultz@gmail.com>
 License:

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/components.py RENAMED Viewed

@@ -35,9 +35,12 @@ class Source(ABC):
             component_class=cls
         )
+    def __init__(self, root = None):
+        self.root = root
     @abstractmethod
     def __iter__(self):
-        raise NotImplementedError("__iter__ must be implemented by subclasses")
+        pass
     def __next__(self):
         # lazily create an internal iterator the first time next() is called
@@ -57,7 +60,8 @@ class Source(ABC):
 class Pipe(Source):
     arity: int = 1
-    def __init__(self, ptok: ParsedToken, usage: Usage = None):
+    def __init__(self, ptok: ParsedToken, usage: Usage, root = None):
+        self.root = root
         self.ptok = ptok
         self.usage = usage
         self.left = None  # left source for convience
@@ -95,7 +99,7 @@ class DeepCopyPipe(Pipe):
             return None
         # re-instantiate using the actual subclass
-        pipe = type(self)(self.ptok, self.usage)
+        pipe = type(self)(self.ptok, self.usage, self) # this self is the root
         pipe.add_source(source_clone)
         return pipe
@@ -108,7 +112,8 @@ class Sink(ABC):
             component_class=cls
         )
-    def __init__(self, ptok: ParsedToken, usage: Usage = None):
+    def __init__(self, ptok: ParsedToken, usage: Usage, root = None):
+        self.root = root
         self.ptok = ptok
         self.usage = usage
@@ -135,4 +140,3 @@ class Sink(ABC):
     def deep_copy(self):
         return None

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/parser.py RENAMED Viewed

@@ -187,7 +187,7 @@ class ExpressionParser:
 class ReducerAggregatorPipe(Pipe):
     def __init__(self, top_level_reducers: List[Any]):
-        super().__init__(None)
+        super().__init__(None, None)
         self.top_level_reducers = top_level_reducers
         self.reduction = {}
         self.done = False
@@ -275,6 +275,7 @@ class UpstreamSource(Source):
         return u
     def __init__(self):
+        super().__init__(root=None)
         self.data = []
         self.inner_source = None
         self.sub_recs_in = papi.get_counter(self, var_label='sub_recs_in')
@@ -338,7 +339,7 @@ class SubExpression(Pipe, ProgressIgnore):
         return None
     def __init__(self, ptok: ParsedToken, usage: Usage):
-        super().__init__(ptok)
+        super().__init__(ptok, usage)
         self.subexp_ops = []
         self.stack_helper = StackLoader()
         self.subexp_stack = OperandStack()

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/denorm.py RENAMED Viewed

@@ -52,10 +52,10 @@ class DenormPipe(Pipe):
         return usage
     def __init__(self, ptok: ParsedToken, usage: Usage):
-        super().__init__(ptok)
+        super().__init__(ptok, usage)
         self.field = usage.get_arg('field')
-        self.recs_in = papi.get_counter(self, None) # don't display
+        self.recs_in = papi.get_counter(self, 'recs_in', display=False)
         self.recs_out = papi.get_percentage_counter(self, 'recs_out', self.recs_in)
         self._pending_iter = None

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/filter.py RENAMED Viewed

@@ -39,11 +39,11 @@ class FilterPipe(Pipe):
         return usage
     def __init__(self, ptok: ParsedToken, usage: Usage):
-        super().__init__(ptok)
+        super().__init__(ptok, usage)
         self.mode = usage.get_arg('mode')
         self.left = None
         self.right = None
-        self.recs_in = papi.get_counter(self, None) # don't display
+        self.recs_in = papi.get_counter(self, 'recs_in', display=False)
         self.recs_out = papi.get_percentage_counter(self, 'recs_out', self.recs_in)
     def reset(self):

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/join.py RENAMED Viewed

@@ -60,7 +60,7 @@ class JoinPipe(Pipe):
         return usage
     def __init__(self, ptok: ParsedToken, usage: Usage):
-        super().__init__(ptok)
+        super().__init__(ptok, usage)
         self.mode = usage.get_arg('mode')
         self.left = None
@@ -68,7 +68,7 @@ class JoinPipe(Pipe):
         self._pending_right = None
         self._check_right = False
-        self.recs_in = papi.get_counter(self, None) # don't display
+        self.recs_in = papi.get_counter(self, 'recs_in', display=False)
         self.matches = papi.get_percentage_counter(self, 'matches', self.recs_in)
         self.recs_out = papi.get_counter(self, 'recs_out')

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/let_reduce.py RENAMED Viewed

@@ -3,7 +3,7 @@
 # djk/pipes/let_reduce.py
-from pjk.components import Pipe
+from pjk.components import DeepCopyPipe
 from pjk.usage import ParsedToken, Usage, UsageError, TokenError, NoBindUsage
 from pjk.common import SafeNamespace, ReducingNamespace
 import re
@@ -78,7 +78,7 @@ def eval_accumulating(expr: str, record: dict, op: str, acc=None):
     return do_eval(expr, env)
 # --- LetPipe (simple field assignment) ---
-class LetPipe(Pipe):
+class LetPipe(DeepCopyPipe):
     @classmethod
     def usage(cls):
         usage = NoBindUsage( # can't use bound usage because of complicated parsing
@@ -93,7 +93,7 @@ class LetPipe(Pipe):
         return usage
     def __init__(self, ptok: ParsedToken, usage: Usage):
-        super().__init__(ptok)
+        super().__init__(ptok, usage)
         args = parse_args(ptok.whole_token.split(':', 1)[-1])
         self.field = args['field']
         self.op = args['op']
@@ -121,7 +121,7 @@ def is_comprehension(expr: str) -> bool:
     except SyntaxError:
         return False
-class ReducePipe(Pipe):
+class ReducePipe(DeepCopyPipe):
     @classmethod
     def usage(cls):
         usage = NoBindUsage( # can't use bound usage because of complicated parsing
@@ -161,7 +161,7 @@ class ReducePipe(Pipe):
         return usage
     def __init__(self, ptok: ParsedToken, usage: Usage):
-        super().__init__(ptok)
+        super().__init__(ptok, usage)
         args = parse_args(ptok.whole_token.split(':', 1)[-1])
         self.field = args['field']
         self.op = args['op']

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/map.py RENAMED Viewed

@@ -28,7 +28,7 @@ class MapByPipe(Pipe, KeyedSource):
         return u
     def __init__(self, ptok: ParsedToken, usage: Usage):
-        super().__init__(ptok)
+        super().__init__(ptok, usage)
         self.is_group = False
         self.fields = usage.get_arg('key').split(',')
         self.rec_map = {}
@@ -37,7 +37,7 @@ class MapByPipe(Pipe, KeyedSource):
         self.do_count = usage.get_param(name='count').lower() == 'true'
         self.counts = {}
         self.missing_keys = papi.get_counter(self, 'missing_keys')
-        self.recs_in = papi.get_counter(self, None) # don't display
+        self.recs_in = papi.get_counter(self, 'recs_in', display=False)
         # recs_out = distinct_keys
         self.distinct_keys = papi.get_percentage_counter(self, 'recs_out', self.recs_in)

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/query_pipe.py RENAMED Viewed

@@ -40,7 +40,7 @@ class QueryPipe(Pipe):
         self.output_shape = usage.get_param('shape')
         self.count = usage.get_param('count')
         self.query_field = 'query' # for all subclasses
-        self.inrecs = papi.get_counter(self, var_label=None) # don't display progress
+        self.inrecs = papi.get_counter(self, var_label='recs_in')
         self.outrecs = papi.get_percentage_counter(self, var_label='recs_out', denom_counter=self.inrecs)
     @abstractmethod

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/remove_field.py RENAMED Viewed

@@ -19,7 +19,7 @@ class RemoveField(DeepCopyPipe):
         return usage
     def __init__(self, ptok: ParsedToken, usage: Usage):
-        super().__init__(ptok)
+        super().__init__(ptok, usage)
         arg_string = usage.get_arg('fields')
         self.fields = [f.strip() for f in arg_string.split(',') if f.strip()]
         if not self.fields:

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/select.py RENAMED Viewed

@@ -19,7 +19,7 @@ class SelectFields(DeepCopyPipe):
         return usage
     def __init__(self, ptok: ParsedToken, usage: Usage):
-        super().__init__(ptok)
+        super().__init__(ptok, usage)
         arg_string = usage.get_arg('fields')
         if not arg_string:

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/sort.py RENAMED Viewed

@@ -21,7 +21,7 @@ class SortPipe(Pipe):
         return usage
     def __init__(self, ptok: ParsedToken, usage: Usage):
-        super().__init__(ptok)
+        super().__init__(ptok, usage)
         arg_string = usage.get_arg('field')
         if arg_string.startswith("-"):

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/tail.py RENAMED Viewed

@@ -19,7 +19,7 @@ class TailPipe(Pipe):
         return usage
     def __init__(self, ptok: ParsedToken, usage: Usage):
-        super().__init__(ptok)
+        super().__init__(ptok, usage)
         self.limit = usage.get_arg('limit')
         self.buffer = []

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/pipes/where.py RENAMED Viewed

@@ -22,10 +22,11 @@ class WherePipe(DeepCopyPipe):
         u.def_example(expr_tokens=["[{color:'blue'}, {color:'red'}, {color:'black'}]", "where:f.color.startswith('bl')"], expect="[{color:'blue'}, {color:'black'}]")
         return u
-    def __init__(self, ptok: ParsedToken, usage: Usage):
-        super().__init__(ptok, usage)
+    def __init__(self, ptok: ParsedToken, usage: Usage, root = None):
+        super().__init__(ptok, usage, root)
         self.expr = ptok.whole_token.split(':', 1)[1]
-        self.inrecs = papi.get_counter(self, var_label=None) # don't display progress
+        self.inrecs = papi.get_counter(self, var_label='recs_in', display=False)
         self.outrecs = papi.get_percentage_counter(self, var_label='recs_out', denom_counter=self.inrecs)
         try:
             self.code = compile(self.expr, '<where>', 'eval')
@@ -46,3 +47,4 @@ class WherePipe(DeepCopyPipe):
             except Exception:
                 continue  # ignore eval errors

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/progress.py RENAMED Viewed

@@ -13,18 +13,26 @@ class ProgressIgnore:
 class Report:
     def __init__(self):
-        self.name_value_tuples = []
+        self._values: dict[str, Any] = {}
         self.parse_level = -1
+        self.invisibles = set()
-    def add_value(self, name, value):
-        self.name_value_tuples.append((name, value))
+    def set_or_get_value(self, name, value):
+        # store once; subsequent calls return the existing object
+        return self._values.setdefault(name, value)
+    def get_value(self, name):
+        return self._values.get(name)
     def get_name_value_tuples(self):
-        return self.name_value_tuples
+        return self._values.items()
     def set_parse_level(self, level: int):
         self.parse_level = level
+    def make_invisible(self, var_label:str):
+        self.invisibles.add(var_label)
     def get_parse_level(self):
         return self.parse_level
@@ -58,29 +66,7 @@ class ProgressDisplay:
         while not self._stop_event.is_set():
             snap = self.api.snapshot()
             lines = self._render_lines(snap)
-            # Move up to overwrite previous block
-            if self._last_lines:
-                self.stream.write(f"{CSI}{self._last_lines}F")  # move cursor up N lines, to column 1
-            # Write fresh lines
-            for line in lines:
-                self.stream.write(line + "\n")
-            # Erase extra old lines if the block got shorter
-            if self._last_lines > len(lines):
-                diff = self._last_lines - len(lines)
-                for _ in range(diff):
-                    self.stream.write(" " * 120 + "\n")
-                # move cursor up to top of block again
-                self.stream.write(f"{CSI}{self._last_lines}F")
-            try:
-                self.stream.flush()
-            except Exception:
-                pass
-            self._last_lines = len(lines)
+            self._write_lines(lines)
             if self._stop_event.wait(self.interval):
                 break
@@ -88,18 +74,38 @@ class ProgressDisplay:
         # --- FINAL REFRESH ON SHUTDOWN ---
         reports = self.api.snapshot()
         lines = self._render_lines(reports)
+        self._write_lines(lines, final=True)
+    def _write_lines(self, lines, final: bool = False):
+        """
+        Render output either by rewriting the previous block (TTY) or by
+        printing a fresh snapshot (non-TTY fall back).
+        """
+        prev_lines = self._last_lines
-        if self._last_lines:
-            self.stream.write(f"{CSI}{self._last_lines}F")
+        if self._use_ansi:
+            if prev_lines:
+                # Move cursor up to the beginning of the old block
+                self.stream.write(f"{CSI}{prev_lines}F")
-        for line in lines:
-            self.stream.write(line + "\n")
+            for line in lines:
+                self.stream.write(line + "\n")
-        if self._last_lines > len(lines):
-            diff = self._last_lines - len(lines)
-            for _ in range(diff):
-                self.stream.write(" " * 120 + "\n")
-            self.stream.write(f"{CSI}{self._last_lines}F")
+            if prev_lines > len(lines):
+                diff = prev_lines - len(lines)
+                blank = " " * 120
+                for _ in range(diff):
+                    self.stream.write(blank + "\n")
+                # move cursor back to sit just below the freshly written block
+                self.stream.write(f"{CSI}{diff}F")
+        else:
+            # Best-effort fallback when we cannot reposition the cursor.
+            if prev_lines and not final:
+                self.stream.write("\n")
+            for line in lines:
+                self.stream.write(line + "\n")
+            if prev_lines and not final:
+                self.stream.write("-" * 40 + "\n")
         try:
             self.stream.flush()
@@ -125,6 +131,9 @@ class ProgressDisplay:
         label = f'{indent}{key}'
         parts = [f"{label:<{KEY_W}.{KEY_W}}"]           # left col, truncated if too long
         for name, val in report.get_name_value_tuples():
+            if name in report.invisibles:
+                continue
             token = f"{name}={val}"                   # __str__ handles formatting
             parts.append(f"{token:<{COL_W}}") # left-justify, hard truncate at COL_W
         return highlight(" ".join(parts), 'bold', key)
@@ -210,8 +219,8 @@ class ProgressAPI:
         self._parse_depth: Dict[int, int] = {} # component id -> level
         self.level = 0
-    def get_counter(self, component: Source | Sink, var_label: str) -> SafeCounter:
-        return self._update_storage(component, var_label=var_label, value=SafeCounter())
+    def get_counter(self, component: Source | Sink, var_label: str, display: bool = True) -> SafeCounter:
+        return self._update_storage(component, var_label=var_label, value=SafeCounter(), display=display)
     # returns the numerator counter
     def get_percentage_counter(self, component: Source | Sink, var_label: str, denom_counter: SafeCounter):
@@ -229,7 +238,7 @@ class ProgressAPI:
             report.set_parse_level(level)
         return self._reports
-    # could happen before or after update storage
+    # could happen before or after update storage, done in operand stack to get levels right)
     def register_component(self, component: Source | Sink, stack_level: int):
         if isinstance(component, ProgressIgnore):
             return # um, ignore
@@ -238,20 +247,26 @@ class ProgressAPI:
         self._parse_depth[comp_id] = stack_level
         self._update_storage(component, var_label=None, value=None) # just register, no values
-    def _update_storage(self, component: Source | Sink, var_label: str, value: Any):
+    def _update_storage(self, component: Source | Sink, var_label: str, value: Any, display:bool = True):
         # we can have multiple instances of a component type in an expression so we need to
         # differentiate by id when we put them in the _store.
         component_label = self._get_component_label(component)
-        store_key = (component_label, id(component))
+        # create an uniq id for variable that is common across clones
+        comp_id = id(component) if component.root is None else id(component.root)
+        store_key = (component_label, comp_id)
         report = self._reports.setdefault(store_key, Report())
-        if not value: # when just registering component
+        if value is None: # when just registering component
             return None
-        if var_label:
-            # only when var_label not None, do we want the stat displayed
-            report.add_value(var_label, value)
+        if not var_label:
+            raise Exception('unique var_label is required')
-        return value
+        if not display:
+            report.make_invisible(var_label)
+        return report.set_or_get_value(var_label, value)
     # some hacking to get at reasonable labels
     def _get_component_label(self, component: Source | Sink):

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/graph.py RENAMED Viewed

@@ -21,6 +21,7 @@ class GraphSink(Sink):
         usage.def_param(name='x', usage='x-axis field', default='x')
         usage.def_param(name='y', usage='comma separated list of y-axis fields', default='y')
         usage.def_param(name='pause', usage='Seconds to show graph', is_num=True, default='-1')
+        usage.def_param(name='title', usage='A title for the graph', is_num=False)
         return usage
     def __init__(self, ptok: ParsedToken, usage: Usage):
@@ -30,6 +31,7 @@ class GraphSink(Sink):
         self.x_field = usage.get_param('x')
         self.y_field = usage.get_param('y')
         self.pause = usage.get_param('pause')
+        self.title = usage.get_param('title')
     def process(self):
         import matplotlib.pyplot as plt # lazy import

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/graph_bar_line.py RENAMED Viewed

@@ -137,8 +137,8 @@ class SingleYWithSetsAdapter:
 # ----------------------------- Plotter -----------------------------
 class GraphPlotter:
     def __init__(self, params: GraphParams):
-        self.p = params
-        self.y_fields = list(dict.fromkeys(self.p.y_fields))  # dedupe, preserve order
+        self.pms = params
+        self.y_fields = list(dict.fromkeys(self.pms.y_fields))  # dedupe, preserve order
     def plot(self, chart_type: str = "line"):
         import matplotlib.pyplot as plt
@@ -149,8 +149,8 @@ class GraphPlotter:
         # Multi-Y path (preferred)
         if len(self.y_fields) > 1:
-            df = MultiYAdapter.to_df(self.p.records, self.p.x_field, self.y_fields)
-            is_time = self.p.x_is_time if isinstance(self.p.x_is_time, bool) else TimeDetector.is_time(df["x"])
+            df = MultiYAdapter.to_df(self.pms.records, self.pms.x_field, self.y_fields)
+            is_time = self.pms.x_is_time if isinstance(self.pms.x_is_time, bool) else TimeDetector.is_time(df["x"])
             if is_time:
                 df["ts"] = TimeDetector.parse_times(df["x"])
                 df = df.dropna(subset=["ts"]).sort_values("ts")
@@ -165,12 +165,12 @@ class GraphPlotter:
                 else:
                     self._lines_categorical(ax, df, self.y_fields)
                 self._format_categorical_axis(ax, df)
-            title = self.p.title or ("Line over time" if is_time and chart_type=="line" else
+            title = self.pms.title or ("Line over time" if is_time and chart_type=="line" else
                                      "Bar over time" if is_time else
                                      "Line by category" if chart_type=="line" else
                                      "Bar by category")
             ax.set_title(title)
-            ax.set_xlabel(self.p.x_field)
+            ax.set_xlabel(self.pms.x_field)
             ax.set_ylabel(", ".join(self.y_fields))
             ax.legend(title="Series")
             self._apply_args_dict()
@@ -180,13 +180,13 @@ class GraphPlotter:
         # Single-Y legacy path (maybe with set_name)
         y = self.y_fields[0]
-        sdf = SingleYWithSetsAdapter.to_df(self.p.records, self.p.x_field, y)
+        sdf = SingleYWithSetsAdapter.to_df(self.pms.records, self.pms.x_field, y)
         if sdf.empty:
-            print(f"No valid '{self.p.x_field}' and '{y}' records found.")
+            print(f"No valid '{self.pms.x_field}' and '{y}' records found.")
             return fig, ax
         # time vs categorical
-        is_time = self.p.x_is_time if isinstance(self.p.x_is_time, bool) else TimeDetector.is_time(sdf["x"])
+        is_time = self.pms.x_is_time if isinstance(self.pms.x_is_time, bool) else TimeDetector.is_time(sdf["x"])
         if is_time:
             sdf["ts"] = TimeDetector.parse_times(sdf["x"])
             sdf = sdf.dropna(subset=["ts"])  # might be empty
@@ -206,8 +206,8 @@ class GraphPlotter:
                 else:
                     ax.plot(s.index, s.values, label=label)
             self._format_time_axis(ax, sdf.rename(columns={"ts":"ts"}))
-            ax.set_title(self.p.title or f"{y} over time")
-            ax.set_xlabel(self.p.x_field)
+            ax.set_title(self.pms.title or f"{y} over time")
+            ax.set_xlabel(self.pms.x_field)
             ax.set_ylabel(y)
             if any(s != "__default__" for s in sdf["set"].unique()):
                 ax.legend(title="data set")
@@ -243,8 +243,8 @@ class GraphPlotter:
                 tick_idx = idx
                 tick_lbl = x_vals
             ax.set_xticks(tick_idx, tick_lbl, rotation=45)
-            ax.set_title(self.p.title or f"{y} by {self.p.x_field}")
-            ax.set_xlabel(self.p.x_field)
+            ax.set_title(self.pms.title or f"{y} by {self.pms.x_field}")
+            ax.set_xlabel(self.pms.x_field)
             ax.set_ylabel(y)
             if len(set_names) > 1 or "__default__" not in set_names:
                 ax.legend(title="data set")
@@ -323,7 +323,7 @@ class GraphPlotter:
     # ---------- Misc ----------
     def _apply_args_dict(self) -> None:
         import matplotlib.pyplot as plt
-        for name, val in getattr(self.p, "args_dict", {}).items():
+        for name, val in getattr(self.pms, "args_dict", {}).items():
             fn = getattr(plt, name, None)
             if callable(fn):
                 try:

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sinks/s3_sink.py RENAMED Viewed

@@ -26,6 +26,7 @@ class S3Sink(Sink):
     _FILENAME_DIGITS: int = 4
     def __init__(self, sink_class: Type[Sink], path_no_ext: str, is_gz: bool, fileno: int):
+        super().__init__(root=None, ptok=None, usage=None)
         self.path_no_ext = path_no_ext if not path_no_ext.startswith('//') else path_no_ext[2:] # strip leading //
         self.sink_class = sink_class
         self.is_gz = is_gz

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/csv_source.py RENAMED Viewed

@@ -14,6 +14,7 @@ class CSVSource(FormatSource):
     extension = 'csv'
     def __init__(self, lazy_file: LazyFile, delimiter: str = ","):
+        super().__init__(lazy_file)
         self.lazy_file = lazy_file
         self.delimiter = delimiter
         self.num_recs = 0

python_jack_knife-0.6.17/src/pjk/sources/dir_source.py ADDED Viewed

@@ -0,0 +1,181 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright 2024 Mike Schultz
+import os
+import threading
+from typing import Optional
+from pjk.components import Source
+from pjk.sources.lazy_file_local import LazyFileLocal
+from pjk.log import logger
+class DirSource(Source):
+    """
+    Iterate over files in a directory, materializing a concrete Source per file.
+    Coordination between clones is handled by a shared file iterator protected
+    by a lock. No queues, no is_root, no done_event.
+    """
+    extension = 'dir'  # ducklike hack so like FormatSource without the hassle
+    def __init__(self, root: Source, file_iter = None, source_classes: dict = None, format_override: str = None):
+        super().__init__(root=root)
+        self.current = None
+        if not root: # WE! are the root
+            if not file_iter:
+                raise Exception('root creation must include file_iter')
+            self.file_iter = file_iter
+            self.iterator_lock = threading.Lock()
+            self.format_override = format_override
+            self.source_classes = source_classes
+        else:
+            self.file_iter = root.file_iter
+            self.source_classes = root.source_classes
+            self.format_override = root.format_override
+            self.iterator_lock = root.iterator_lock
+    # ---------------------------------------------------------------------
+    # Iteration
+    # ---------------------------------------------------------------------
+    def __iter__(self):
+        while True:
+            if self.current is None:
+                # Pull the next file-backed Source (skip unsupported files)
+                self.current = self._get_next_source()
+                if self.current is None:
+                    return  # exhausted
+            try:
+                for record in self.current:
+                    yield record
+            finally:
+                # move on after this inner source is exhausted
+                self.current = None
+    # ---------------------------------------------------------------------
+    # Contention boundary: only here we touch the shared iterator
+    # ---------------------------------------------------------------------
+    # needed for in deep_clone to stop itereration
+    def has_next(self):
+        if self.current is not None:
+            return True
+        self.current = self._get_next_source()
+        return self.current is not None
+    def get_next_file(self) -> Optional[str]:
+        """
+        Thread-safe advancement of the shared file iterator.
+        Returns the next file path, or None when exhausted.
+        """
+        with self.iterator_lock:
+            if self.file_iter is None:
+                return None
+            try:
+                path = next(self.file_iter)
+                logger.debug(f'get_next_file -> {path}')
+                return path
+            except StopIteration:
+                self.file_iter = None
+                logger.debug('get_next_file -> None (exhausted)')
+                return None
+    def _get_next_source(self) -> Optional[Source]:
+        """
+        Keep drawing files until we either exhaust or we can construct a Source.
+        """
+        while True:
+            file = self.get_next_file()
+            if file is None:
+                return None
+            src = self._file_to_source(file)
+            if src is None:
+                logger.debug(f'skipping unsupported file: {file}')
+                continue
+            logger.debug(f'next source (from file) = {src}')
+            return src
+    # ---------------------------------------------------------------------
+    # Helpers
+    # ---------------------------------------------------------------------
+    def _file_to_source(self, file: str) -> Optional[Source]:
+        parts = file.split('.')
+        is_gz = False
+        if parts and parts[-1] == 'gz':
+            is_gz = True
+            parts.pop()
+        fmt = parts[-1] if parts else None
+        if self.format_override:
+            fmt, is_gz = self.get_format_gz(self.format_override)
+        if not fmt:
+            return None
+        source_class = self.source_classes.get(fmt)
+        if not source_class:
+            return None
+        lazy_file = LazyFileLocal(file, is_gz)
+        return source_class(lazy_file)
+    def deep_copy(self):
+        clone = DirSource(self)
+        if clone.has_next():
+            return clone
+        else:
+            return None
+    # ---------------------------------------------------------------------
+    # Class utilities
+    # ---------------------------------------------------------------------
+    @classmethod
+    def get_format_gz(cls, input_str: str):
+        is_gz = False
+        fmt = input_str
+        if input_str.endswith('.gz'):
+            is_gz = True
+            fmt = input_str[:-3]
+        return fmt, is_gz
+    @classmethod
+    def _iter_files(cls, path: str, recursive: bool):
+        if not recursive:
+            for f in os.listdir(path):
+                full = os.path.join(path, f)
+                if os.path.isfile(full):
+                    yield full
+            return
+        for dirpath, _, filenames in os.walk(path, topdown=True, followlinks=False):
+            for name in filenames:
+                full = os.path.join(dirpath, name)
+                if os.path.isfile(full):
+                    yield full
+    @classmethod
+    def create(
+        cls,
+        source_classes: dict,
+        path_no_ext: str,
+        format_override: Optional[str] = None,
+        recursive: bool = False,
+    ):
+        """
+        Factory: returns a DirSource that will lazily enumerate files.
+        """
+        file_iter = cls._iter_files(path_no_ext, recursive)
+        return DirSource(
+            root = None, # THIS is the root
+            file_iter=file_iter,
+            source_classes=source_classes,
+            format_override=format_override
+        )

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/format_source.py RENAMED Viewed

@@ -16,6 +16,7 @@ class SourceFormatUsage(NoBindUsage):
         self.def_syntax("") # no syntax for these
         # default = None because for source, format is an OVERRIDE
         self.def_param('format', 'file format', is_num=False, valid_values={'json', 'csv', 'tsv', 'json.gz', 'tsv.gz', 'csv.gz'}, default=None)
+        self.def_param('recursive', 'for local direcories only', is_num=False, valid_values={'true', 'false'}, default=False)
         self.def_example(expr_tokens=[f"myfile.{name}", "-"], expect=None)
         self.def_example(expr_tokens=["mydir", "-"], expect=None)
         self.def_example(expr_tokens=[f"s3://mybucket/myfile.{name}", "-"], expect=None)
@@ -94,7 +95,8 @@ class FormatSource(Source):
                 return S3Source.create(sources, path_no_ext, ext, format_override=format_override)
             if os.path.isdir(path_no_ext):
-                return DirSource.create(sources, path_no_ext, format_override=format_override)
+                recursive = usage.get_param('recursive') == 'true'
+                return DirSource.create(sources, path_no_ext, format_override=format_override, recursive=recursive)
             return None

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/inline_source.py RENAMED Viewed

@@ -33,6 +33,7 @@ class InlineSource(Source):
         return usage
     def __init__(self, inline_expr):
+        super().__init__(root=None)
         self.num_recs = 0
         try:
             obj = hjson.loads(inline_expr)

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/json_source.py RENAMED Viewed

@@ -13,6 +13,7 @@ class JsonSource(FormatSource):
     extension = 'json'
     def __init__(self, lazy_file: LazyFile):
+        super().__init__(root=None)
         self.lazy_file = lazy_file
         self.num_recs = 0

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/s3_source.py RENAMED Viewed

@@ -96,6 +96,7 @@ class S3Source(Source):
     """
     def __init__(self, shared_state: _SharedS3State, reserved: Optional[Source] = None):
+        super().__init__(root=None)
         self._state = shared_state
         self._current: Optional[Source] = reserved

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/sources/sql_source.py RENAMED Viewed

@@ -13,6 +13,7 @@ class SQLSource(FormatSource):
     desc_override = "SQL source. Emits SQL in single record in 'query' field."
     def __init__(self, lazy_file: LazyFile):
+        super().__init__(root=None)
         self.lazy_file = lazy_file
         self.num_recs = 0

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/pjk/version.py RENAMED Viewed

@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2024 Mike Schultz
-__version__ = "0.6.15"
+__version__ = "0.6.17"

{python_jack_knife-0.6.15 → python_jack_knife-0.6.17}/src/python_jack_knife.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: python-jack-knife
-Version: 0.6.15
+Version: 0.6.17
 Summary: Python Jack Knife – a command line data processor
 Author-email: Mike Schultz <mike.schultz@gmail.com>
 License:

python_jack_knife-0.6.15/src/pjk/sources/dir_source.py DELETED Viewed

@@ -1,82 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# Copyright 2024 Mike Schultz
-import os
-from typing import Any
-from queue import Queue, Empty
-from pjk.components import Source
-from pjk.sources.lazy_file_local import LazyFileLocal
-from pjk.log import logger
-class DirSource(Source):
-    """
-    Source that yields the records of several per-file sources, one after another.
-    Pending sources wait in a shared Queue; deep_copy() hands one queued source
-    to a new DirSource backed by that same queue.
-    """
-    extension = 'dir' # ducklike hack so like FormatSource without the hassle
-    def __init__(self, source_queue: Queue, in_source: Source = None):
-        """
-        Args:
-            source_queue: queue of sources that have not been consumed yet.
-            in_source: optional source to read first, before pulling from the queue.
-        """
-        self.source_queue = source_queue
-        self.current = in_source
-    def __iter__(self):
-        """Yield every record of the current source, then of each queued source in turn."""
-        while True:
-            if self.current is None:
-                try:
-                    self.current = self.source_queue.get_nowait()
-                    logger.debug(f'next source={self.current}')
-                except Empty:
-                    return  # end of all sources
-            try:
-                for record in self.current:
-                    yield record
-            finally:
-                self.current = None  # move to next source after exhaustion
-    def deep_copy(self):
-        """
-        Return a new DirSource that starts on one source taken from the shared
-        queue, or None when at most one source is queued.
-        """
-        if self.source_queue.qsize() <= 1:
-            return None  # leave remaining files to original
-        try:
-            next_source = self.source_queue.get_nowait()
-            logger.debug(f'deep_copy next_source={next_source}')
-        except Empty:
-            return None
-        return DirSource(self.source_queue, next_source)
-    @classmethod
-    def get_format_gz(cls, input:str):
-        """Split a format such as 'json.gz' into ('json', True); 'json' gives ('json', False)."""
-        if input.endswith('.gz'):
-            return input[:-3], True
-        return input, False
-    @classmethod
-    def create(cls, sources: dict, path_no_ext: str, format_override: str = None):
-        """
-        Build a DirSource over the regular files directly inside path_no_ext.
-        Args:
-            sources: maps a format name (e.g. 'json') to the source class that reads it.
-            path_no_ext: directory to list (not recursive).
-            format_override: if given, used for every file instead of the file's
-                own extension; may carry a '.gz' suffix.
-        Returns:
-            A DirSource, or None when no file could be queued.
-        """
-        # The override is the same for every file, so parse it once up front.
-        override = cls.get_format_gz(format_override) if format_override else None
-        source_queue = Queue()
-        for name in os.listdir(path_no_ext):
-            file = os.path.join(path_no_ext, name)
-            if not os.path.isfile(file):
-                continue
-            if override is not None:
-                fmt, is_gz = override
-            else:
-                # Split the file NAME, not the joined path, so a dot in a
-                # directory name is never mistaken for an extension.
-                parts = name.split('.')
-                is_gz = parts[-1] == 'gz'
-                if is_gz:
-                    parts.pop()
-                # A name with no extension left has no format at all.
-                fmt = parts[-1] if len(parts) > 1 else None
-            source_class = sources.get(fmt)
-            if source_class is None:
-                # No reader registered for this file: skip it rather than call None.
-                logger.debug(f'skipping {file}: no source for format {fmt!r}')
-                continue
-            lazy_file = LazyFileLocal(file, is_gz)
-            source_queue.put(source_class(lazy_file))
-        if source_queue.empty():
-            return None
-        return DirSource(source_queue)