python-jack-knife 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. pjk/__init__.py +5 -0
  2. pjk/base.py +377 -0
  3. pjk/common.py +150 -0
  4. pjk/log.py +67 -0
  5. pjk/main.py +106 -0
  6. pjk/man_page.py +125 -0
  7. pjk/parser.py +284 -0
  8. pjk/pipes/__init__.py +0 -0
  9. pjk/pipes/denorm.py +68 -0
  10. pjk/pipes/factory.py +62 -0
  11. pjk/pipes/filter.py +57 -0
  12. pjk/pipes/head.py +34 -0
  13. pjk/pipes/join.py +85 -0
  14. pjk/pipes/let_reduce.py +198 -0
  15. pjk/pipes/map.py +91 -0
  16. pjk/pipes/move_field.py +36 -0
  17. pjk/pipes/postgres_pipe.py +209 -0
  18. pjk/pipes/remove_field.py +36 -0
  19. pjk/pipes/select.py +42 -0
  20. pjk/pipes/sort.py +63 -0
  21. pjk/pipes/tail.py +39 -0
  22. pjk/pipes/user_pipe_factory.py +45 -0
  23. pjk/pipes/where.py +49 -0
  24. pjk/registry.py +143 -0
  25. pjk/sinks/__init__.py +0 -0
  26. pjk/sinks/csv_sink.py +33 -0
  27. pjk/sinks/ddb.py +54 -0
  28. pjk/sinks/devnull.py +31 -0
  29. pjk/sinks/dir_sink.py +59 -0
  30. pjk/sinks/expect.py +53 -0
  31. pjk/sinks/factory.py +108 -0
  32. pjk/sinks/graph.py +57 -0
  33. pjk/sinks/graph_bar_line.py +229 -0
  34. pjk/sinks/graph_cumulative.py +55 -0
  35. pjk/sinks/graph_hist.py +72 -0
  36. pjk/sinks/graph_scatter.py +29 -0
  37. pjk/sinks/json_sink.py +23 -0
  38. pjk/sinks/s3_sink.py +100 -0
  39. pjk/sinks/sinks.py +68 -0
  40. pjk/sinks/stdout.py +44 -0
  41. pjk/sinks/tsv_sink.py +22 -0
  42. pjk/sinks/user_sink_factory.py +43 -0
  43. pjk/sources/__init__.py +0 -0
  44. pjk/sources/csv_source.py +28 -0
  45. pjk/sources/dir_source.py +69 -0
  46. pjk/sources/factory.py +100 -0
  47. pjk/sources/format_usage.py +11 -0
  48. pjk/sources/inline_source.py +56 -0
  49. pjk/sources/json_source.py +35 -0
  50. pjk/sources/lazy_file.py +16 -0
  51. pjk/sources/lazy_file_local.py +22 -0
  52. pjk/sources/lazy_file_s3.py +28 -0
  53. pjk/sources/parquet_source.py +32 -0
  54. pjk/sources/s3_source.py +146 -0
  55. pjk/sources/source_list.py +23 -0
  56. pjk/sources/sql_source.py +32 -0
  57. pjk/sources/tsv_source.py +15 -0
  58. pjk/sources/user_source_factory.py +33 -0
  59. pjk/version.py +4 -0
  60. python_jack_knife-0.5.0.dist-info/METADATA +254 -0
  61. python_jack_knife-0.5.0.dist-info/RECORD +65 -0
  62. python_jack_knife-0.5.0.dist-info/WHEEL +5 -0
  63. python_jack_knife-0.5.0.dist-info/entry_points.txt +2 -0
  64. python_jack_knife-0.5.0.dist-info/licenses/LICENSE +202 -0
  65. python_jack_knife-0.5.0.dist-info/top_level.txt +1 -0
pjk/sinks/factory.py ADDED
@@ -0,0 +1,108 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ from typing import Any, List, Callable
5
+ import os
6
+ from pjk.base import Source, Sink, ParsedToken
7
+ from pjk.common import ComponentFactory
8
+ from pjk.sinks.stdout import StdoutSink
9
+ from pjk.sinks.json_sink import JsonSink
10
+ from pjk.sinks.devnull import DevNullSink
11
+ from pjk.sinks.graph import GraphSink
12
+ from pjk.sinks.csv_sink import CSVSink
13
+ from pjk.sinks.tsv_sink import TSVSink
14
+ from pjk.sinks.ddb import DDBSink
15
+ from pjk.sinks.dir_sink import DirSink
16
+ from pjk.sinks.expect import ExpectSink
17
+ from pjk.sinks.user_sink_factory import UserSinkFactory
18
+
19
# Built-in sinks, keyed by the token prefix (or file extension) that
# SinkFactory looks up when resolving a sink token.
COMPONENTS = {
    "-": StdoutSink,
    "devnull": DevNullSink,
    "graph": GraphSink,
    "ddb": DDBSink,
    "json": JsonSink,
    "csv": CSVSink,
    "tsv": TSVSink,
}
28
+
29
class SinkFactory(ComponentFactory):
    """Factory that resolves a sink token into a sink instance.

    ``create`` tries, in order: the special ``expect`` sink, user supplied
    ``*.py`` sinks, format sinks (``<format>:<dir>`` or ``file.<format>[.gz]``)
    and finally a plain lookup of the token prefix in the component table.
    """

    def __init__(self):
        super().__init__(COMPONENTS, 'sink')

    def create(self, token: str) -> "Sink | None":
        """Return a sink built from ``token``, or ``None`` if nothing matches.

        Args:
            token: Raw sink token; surrounding whitespace is ignored.
        """
        ptok = ParsedToken(token.strip())

        # non-usage sink (bind incompatible)
        if ptok.pre_colon == 'expect':
            return ExpectSink(ptok, None)

        if ptok.pre_colon.endswith('.py'):
            sink = UserSinkFactory.create(ptok)
            if sink:
                return sink

        # NOTE: dispatch to an S3 sink was commented out in the original and
        # is still not wired in here.

        # check for format sinks
        sink = self._attempt_format(ptok)
        if sink:
            return sink

        sink_cls = self.components.get(ptok.pre_colon)
        if not sink_cls:
            return None

        usage = sink_cls.usage()
        usage.bind(ptok)
        return sink_cls(ptok, usage)

    def _attempt_format(self, ptok: ParsedToken):
        """Handle ``<format>[.gz]:<path>`` tokens (directory of files).

        Falls back to the ``myfile.<format>`` form when the token prefix is
        not a registered component. Returns ``None`` when the prefix names a
        component that is not a format sink.
        """
        fmt = ptok.pre_colon
        is_gz = fmt.endswith('.gz')
        if is_gz:
            fmt = fmt[:-3]

        sink_cls = self.components.get(fmt)  # <format>: directory case
        if not sink_cls:
            # attempt case -> myfile.<format>
            return self._attempt_format_file(ptok)

        # case -> <format>:<path> local dir
        # getattr: not every sink class visibly declares ``is_format``.
        if getattr(sink_cls, 'is_format', False):
            dir_usage = DirSink.usage()
            dir_usage.bind(ptok)
            return DirSink(ptok, dir_usage, sink_cls, is_gz)

        return None

    def _attempt_format_file(self, ptok: ParsedToken):
        """Handle ``path/to/file.<format>`` and ``path/to/file.<format>.gz``.

        Returns ``None`` when the extension is not a registered component.
        """
        path, ext = os.path.splitext(ptok.all_but_params)
        # Exact match: a substring test would also accept e.g. '.gzip'.
        is_gz = ext == '.gz'
        if is_gz:
            path, ext = os.path.splitext(path)

        file_ext = ext.lstrip('.')  # removes the leading dot

        sink_cls = self.components.get(file_ext)
        if not sink_cls:
            return None
        # NOTE(review): non-format components (e.g. 'foo.graph') are accepted
        # here as well; confirm whether this should require ``is_format``.

        # hack so user can do .json.gz: the gz flag travels as the first arg
        file_ptok = ParsedToken(f'{path}:{is_gz}')

        usage = sink_cls.usage()
        usage.bind(file_ptok)  # not sure we'll ever use since we're hacking above

        return sink_cls(file_ptok, usage)
107
+
108
+
pjk/sinks/graph.py ADDED
@@ -0,0 +1,57 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ from pjk.base import Sink, ParsedToken, Usage, TokenError
5
+
6
+ from pjk.sinks.graph_cumulative import graph_cumulative
7
+ from pjk.sinks.graph_hist import graph_hist
8
+ from pjk.sinks.graph_scatter import graph_scatter
9
+ from pjk.sinks.graph_bar_line import graph_bar_line
10
+
11
class GraphSink(Sink):
    """Sink that buffers every input record and renders one matplotlib graph."""

    @classmethod
    def usage(cls):
        usage = Usage(
            name='graph',
            desc='Display various kinds of graphs from streamed records',
            component_class=cls
        )
        usage.def_arg(name='kind', usage='hist|scatter|bar|line|cumulative')
        usage.def_param(name='x', usage='Name of x-axis field', default='x')
        usage.def_param(name='y', usage='Name of y-axis field', default='y')
        usage.def_param(name='pause', usage='Seconds to show graph', is_num=True, default='-1')
        return usage

    def __init__(self, ptok: ParsedToken, usage: Usage):
        super().__init__(ptok, usage)
        self.records = []
        self.kind = usage.get_arg('kind')
        self.x_field = usage.get_param('x')
        self.y_field = usage.get_param('y')
        self.pause = usage.get_param('pause')

    def _pause_seconds(self) -> float:
        """Return the display time in seconds; <= 0 means "until closed"."""
        try:
            return float(self.pause)
        except (TypeError, ValueError):
            return 0.0

    def process(self):
        """Collect all records, draw the requested graph and display it.

        Raises:
            TokenError: if ``kind`` is not one of the supported graph types.
        """
        import matplotlib.pyplot as plt  # lazy import

        self.records.extend(self.input)

        renderers = {
            "scatter": graph_scatter,
            "hist": graph_hist,
            "cumulative": graph_cumulative,
            "bar": lambda sink: graph_bar_line(sink, 'bar'),
            "line": lambda sink: graph_bar_line(sink, 'line'),
        }
        render = renderers.get(self.kind)
        if render is None:
            raise TokenError(f"Unsupported graph type: {self.kind}")
        render(self)

        # The default pause of -1 is truthy, so a plain truthiness test sent
        # it down the timed branch. Non-positive values mean: block until the
        # window is closed.
        seconds = self._pause_seconds()
        if seconds <= 0:
            plt.show()
        else:
            plt.show(block=False)
            plt.pause(seconds)
            plt.close()
@@ -0,0 +1,229 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ import re
5
+ from datetime import date, datetime
6
+ from collections import defaultdict
7
+
8
def graph_bar_line(obj, type):
    """
    Plot grouped data as a bar or line chart. Automatically detects time-series X
    unless explicitly overridden via `obj.x_is_time = True/False`.

    Required fields on `obj`:
      - obj.records: iterable of dicts
      - obj.x_field: str
      - obj.y_field: str
      - obj.args_dict: optional dict of {matplotlib.pyplot function name -> value}
      - obj.x_is_time: optional bool to force/disable time-series mode

    `type` should be 'bar' or 'line'. It only affects the categorical/numeric
    path; time-series data is always drawn as lines.

    Records whose y value cannot be converted to float are skipped. Records
    may carry a "set_name" key to split the data into several series.
    Nothing is drawn (a message is printed instead) when no usable records
    exist. Returns None.
    """
    if not obj.x_field or not obj.y_field:
        print("Both x_field and y_field must be specified.")
        return

    # ---------- collect raw rows ----------
    rows = []  # (x, y, set_name)
    for r in obj.records:
        if obj.x_field in r and obj.y_field in r:
            try:
                y = float(r[obj.y_field])
            except (TypeError, ValueError, OverflowError):
                continue  # y is not numeric: skip the record
            rows.append((r[obj.x_field], y, r.get("set_name", "__default__")))

    if not rows:
        print(f"No valid '{obj.x_field}' and '{obj.y_field}' records found.")
        return
    count = len(rows)

    # Heavy imports are deferred until there is something to plot.
    import matplotlib.pyplot as plt  # lazy imports
    import matplotlib.dates as mdates
    import numpy as np
    import pandas as pd

    # ---------- robust time-series detection (without false positives) ----------
    df = pd.DataFrame(rows, columns=["x", "y", "set"])

    # Optional explicit override from caller
    x_is_time_override = getattr(obj, "x_is_time", None)

    def looks_like_datetime_str(s: str) -> bool:
        # quick heuristics for common date/time strings (ISO, with separators, or HH:MM)
        return bool(
            re.search(r"\d{4}[-/]\d{1,2}[-/]\d{1,2}", s) or   # YYYY-MM-DD or YYYY/MM/DD
            re.search(r"\d{1,2}[-/]\d{1,2}[-/]\d{2,4}", s) or  # MM-DD-YYYY, etc.
            re.search(r"\d{1,2}:\d{2}", s) or                  # HH:MM present
            ("T" in s) or ("Z" in s)                           # ISO 8601 hints
        )

    def detect_time_series(xs: pd.Series) -> bool:
        # Already datetime dtype?
        if pd.api.types.is_datetime64_any_dtype(xs):
            return True

        # Native datetime-like objects
        if (xs.map(lambda v: isinstance(v, (datetime, date, pd.Timestamp, np.datetime64))).mean() >= 0.9):
            return True

        # String-looking dates
        str_mask = xs.map(lambda v: isinstance(v, str))
        if str_mask.mean() >= 0.9:
            looks_mask = xs[str_mask].map(looks_like_datetime_str)
            if looks_mask.mean() >= 0.9:
                parsed = pd.to_datetime(xs[str_mask][looks_mask], errors="coerce", utc=False)
                if parsed.notna().mean() >= 0.9:
                    return True

        # Numeric epoch seconds/millis (avoid generic integer -> ns trap)
        num = pd.to_numeric(xs, errors="coerce")
        if num.notna().mean() >= 0.9:
            # use quantiles to ignore a few bad points
            q05, q95 = num.quantile(0.05), num.quantile(0.95)
            # seconds since epoch ~ [1e9, 2e9] for 2001-2033
            seconds_like = 1e9 <= q05 <= 2.2e9 and 1e9 <= q95 <= 2.2e9
            # millis since epoch ~ [1e12, 2e12]
            millis_like = 1e12 <= q05 <= 2.2e12 and 1e12 <= q95 <= 2.2e12
            if seconds_like or millis_like:
                return True

        return False

    if isinstance(x_is_time_override, bool):
        is_time_series = x_is_time_override
    else:
        is_time_series = detect_time_series(df["x"])

    if is_time_series:
        # ===== time-series path (real datetime index) =====
        # Parse with care: handle epoch seconds/millis and strings
        x_series = df["x"]

        # Try numeric epochs first
        numeric = pd.to_numeric(x_series, errors="coerce")
        parsed = None
        if numeric.notna().mean() >= 0.9:
            q05, q95 = numeric.quantile(0.05), numeric.quantile(0.95)
            if 1e12 <= q05 <= 2.2e12 and 1e12 <= q95 <= 2.2e12:
                parsed = pd.to_datetime(numeric, unit="ms", errors="coerce", utc=False)
            elif 1e9 <= q05 <= 2.2e9 and 1e9 <= q95 <= 2.2e9:
                parsed = pd.to_datetime(numeric, unit="s", errors="coerce", utc=False)

        # Fallback to generic parser for strings/datetimes
        if parsed is None:
            parsed = pd.to_datetime(x_series, errors="coerce", utc=False)

        df["ts"] = parsed
        df = df.dropna(subset=["ts"])
        if df.empty:
            print("No parsable datetime values in x_field.")
            return
        # Report only the points that survived timestamp parsing.
        count = len(df)

        # aggregate duplicates at same timestamp per set
        df = df.groupby(["ts", "set"], as_index=False)["y"].sum()
        sets = sorted(df["set"].unique())

        plt.figure()
        ax = plt.gca()
        for sname in sets:
            s = df[df["set"] == sname].set_index("ts")["y"].sort_index()
            label = None if sname == "__default__" else sname
            # line plot for time series (clean)
            ax.plot(s.index, s.values, label=label)

        # ---- sparse, readable ticks (dynamic) ----
        ts_min, ts_max = df["ts"].min(), df["ts"].max()
        span_hours = max((ts_max - ts_min).total_seconds() / 3600.0, 1)

        if span_hours <= 72:
            major = mdates.HourLocator(interval=6)    # every 6 hours
            fmt = mdates.DateFormatter("%m-%d %H:%M")
        elif span_hours <= 14 * 24:
            major = mdates.HourLocator(interval=12)   # every 12 hours
            fmt = mdates.DateFormatter("%m-%d %H:%M")
        elif span_hours <= 90 * 24:
            major = mdates.DayLocator(interval=1)     # daily
            fmt = mdates.DateFormatter("%Y-%m-%d")
        else:
            major = mdates.WeekdayLocator(byweekday=mdates.MO, interval=1)  # weekly
            fmt = mdates.DateFormatter("%Y-%m-%d")

        ax.xaxis.set_major_locator(major)
        ax.xaxis.set_major_formatter(fmt)
        # no minor tick labels; rotate to avoid overlap
        plt.gcf().autofmt_xdate()

        plt.xlabel(obj.x_field)
        plt.ylabel(obj.y_field)
        if any(s != "__default__" for s in sets):
            plt.legend(title="data set")
        plt.title(f"{obj.y_field} over time")
        plt.text(1.0, 0.95, f"{count} data points", transform=ax.transAxes,
                 ha='right', va='top', fontsize=10, color='gray')

    else:
        # ===== categorical / numeric path (with tick thinning) =====
        data = defaultdict(float)  # (x, set_name) -> summed y
        for x, y, set_name in rows:
            data[(x, set_name)] += y

        # unique x values in first-seen order
        x_vals = list(dict.fromkeys(x for x, _, _ in rows))
        set_names = sorted({set_name for _, _, set_name in rows})
        x_indices = np.arange(len(x_vals))
        width = 0.8 / len(set_names) if len(set_names) > 1 else 0.6

        plt.figure()
        for i, set_name in enumerate(set_names):
            heights = [data.get((x, set_name), 0) for x in x_vals]
            label = None if set_name == "__default__" else set_name
            # centre the group of bars on each x index
            offset = (i - (len(set_names) - 1) / 2) * width

            if type == 'bar':
                plt.bar(x_indices + offset, heights, width=width, label=label, edgecolor='black')
            else:
                plt.plot(x_indices, heights, marker='o', label=label)

        # ---- thin xticks to at most 12 labels ----
        max_ticks = 12
        if len(x_vals) > max_ticks:
            step = int(np.ceil(len(x_vals) / max_ticks))
            tick_idx = x_indices[::step]
            tick_lbl = [x_vals[i] for i in tick_idx]
        else:
            tick_idx = x_indices
            tick_lbl = x_vals

        plt.xticks(tick_idx, tick_lbl, rotation=45)
        plt.xlabel(obj.x_field)
        plt.ylabel(obj.y_field)
        if len(set_names) > 1 or "__default__" not in set_names:
            plt.legend(title="data set")
        plt.title(f"{obj.y_field} by {obj.x_field}")
        plt.text(1.0, 0.95, f"{count} data points", transform=plt.gca().transAxes,
                 ha='right', va='top', fontsize=10, color='gray')

    # ---------- optional plt args ----------
    for name, val in getattr(obj, "args_dict", {}).items():
        fn = getattr(plt, name, None)
        if callable(fn):
            try:
                fn(val)
            except Exception:
                # Deliberately best-effort: a bad styling argument must not
                # prevent the graph from being shown.
                pass

    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
@@ -0,0 +1,55 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
def graph_cumulative(obj):
    """Plot the running total of ``obj.y_field`` against sorted ``obj.x_field``.

    Reads ``obj.records`` (iterable of dicts), ``obj.x_field`` and
    ``obj.y_field``. Records missing either field are ignored, and records
    whose y value cannot be added to the running total are skipped. Prints a
    message and returns without plotting when the x values cannot be sorted
    or no usable data remains. Returns None.
    """
    # Filter and sort records by x
    records = [
        r for r in obj.records
        if obj.x_field in r and obj.y_field in r
    ]
    try:
        sorted_records = sorted(records, key=lambda r: r[obj.x_field])
    except TypeError:
        print(f"Unable to sort records by '{obj.x_field}' — incompatible types.")
        return

    x_vals = []
    y_vals = []
    total = 0
    for r in sorted_records:
        try:
            total += r[obj.y_field]
        except TypeError:
            continue  # y is not addable to the running total: skip the record
        x_vals.append(r[obj.x_field])
        y_vals.append(total)
    count = len(x_vals)

    if not x_vals:
        print(f"No valid '{obj.x_field}' and '{obj.y_field}' data for cumulative plot.")
        return

    # Imported only once there is data, so the no-data paths stay cheap.
    import matplotlib.pyplot as plt  # lazy import

    plt.figure()
    plt.plot(x_vals, y_vals, marker='o', linestyle='-', label='Cumulative')

    plt.xlabel(obj.x_field)
    plt.ylabel(f"cumulative({obj.y_field})")
    plt.text(1.0, 1.0, f"Cumulative {obj.y_field} over {obj.x_field}", transform=plt.gca().transAxes,
             ha='right', va='top', fontsize=10, color='gray')
    plt.text(1.0, 0.95, f"{count} data points", transform=plt.gca().transAxes,
             ha='right', va='top', fontsize=10, color='gray')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
54
+
55
+
@@ -0,0 +1,72 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ from collections import defaultdict
5
+
6
def aggregate_ys(obj):
    """Sum ``obj.y_field`` per distinct ``obj.x_field`` value.

    Records missing either field are ignored, as are records whose y value
    cannot be added to a float or whose x value cannot be used as a dict key.

    Returns:
        ``(count, agg)``: the number of records aggregated, and a
        ``defaultdict(float)`` mapping each x value to its summed y.
    """
    agg = defaultdict(float)
    count = 0
    for r in obj.records:
        if obj.x_field in r and obj.y_field in r:
            key = r[obj.x_field]
            try:
                # ``.get`` rather than ``agg[key] +=``: indexing a defaultdict
                # inserts a 0.0 entry before the addition can fail, which
                # would leave a phantom key behind for a malformed record.
                new_total = agg.get(key, 0.0) + r[obj.y_field]
            except (TypeError, ValueError, ArithmeticError):
                continue  # skip malformed record
            agg[key] = new_total
            count += 1

    return count, agg
18
+
19
def graph_hist(obj):
    """Draw a histogram-style chart of ``obj.x_field``.

    Two modes, chosen by what ``aggregate_ys`` returns:
      - sum mode: when records carry both x and y, draw one bar per distinct
        x with height sum(y);
      - count mode: otherwise draw a histogram of the x values using
        unit-wide bins.

    Prints a message and returns without plotting when there are no x
    values. Returns None.
    """
    import matplotlib.pyplot as plt  # lazy imports
    import numpy as np

    count, agg = aggregate_ys(obj)
    if agg:
        x_vals = sorted(agg)
        y_vals = [agg[x] for x in x_vals]

        plt.figure()
        plt.bar(x_vals, y_vals, edgecolor='black')

        ylabel = f"sum({obj.y_field})"
    else:
        # Count mode: standard histogram
        x_vals = [r[obj.x_field] for r in obj.records if obj.x_field in r]
        count = len(x_vals)

        if not x_vals:
            print(f"No valid '{obj.x_field}' data for histogram.")
            return

        bin_width = 1
        min_val = min(x_vals)
        max_val = max(x_vals)
        # Build edges so the maximum gets a bin of its own. Edges that stop
        # at max_val make the two largest integer values share the final
        # (closed) bin, and give a single edge when min_val == max_val.
        n_bins = int(np.floor((max_val - min_val) / bin_width)) + 1
        bins = min_val + bin_width * np.arange(n_bins + 1)

        plt.figure()
        plt.hist(
            x_vals,
            bins=bins,
            edgecolor='black',
            rwidth=0.8,
            align='mid'
        )

        ylabel = "count"

    plt.xlabel(obj.x_field)
    plt.ylabel(ylabel)
    plt.text(1.0, 1.0, f"Histogram of {obj.x_field}", transform=plt.gca().transAxes,
             ha='right', va='top', fontsize=10, color='gray')
    plt.text(1.0, 0.95, f"{count} data points", transform=plt.gca().transAxes,
             ha='right', va='top', fontsize=10, color='gray')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
72
+
@@ -0,0 +1,29 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
def graph_scatter(obj):
    """Draw a scatter plot of ``obj.y_field`` against ``obj.x_field``.

    Only records containing both fields are used. A least-squares line is
    overlaid when the x values are not all identical, and the correlation
    coefficient is shown in the title when both x and y vary (otherwise
    "n/a"). Prints a message and returns without plotting when no record has
    both fields. Returns None.
    """
    valid_records = [r for r in obj.records if obj.x_field in r and obj.y_field in r]
    x_vals = [r[obj.x_field] for r in valid_records]
    y_vals = [r[obj.y_field] for r in valid_records]

    if not x_vals or not y_vals:
        print(f"No valid '{obj.x_field}' and '{obj.y_field}' data for scatter plot.")
        return

    # Imported only once there is data, so the no-data path stays cheap.
    import matplotlib.pyplot as plt  # lazy imports
    import numpy as np

    xs = np.asarray(x_vals, dtype=float)
    ys = np.asarray(y_vals, dtype=float)
    # A line fit needs at least two distinct x values; a correlation needs
    # variance in both variables. Otherwise numpy emits warnings and NaNs.
    x_varies = xs.min() < xs.max()
    y_varies = ys.min() < ys.max()

    plt.figure()
    plt.scatter(x_vals, y_vals, label="Data")

    if x_varies:
        slope, intercept = np.polyfit(xs, ys, 1)
        regression_line = slope * xs + intercept
        plt.plot(x_vals, regression_line, color="red",
                 label=f"{obj.y_field} = {slope:.2f}*{obj.x_field} + {intercept:.2f}")

    if x_varies and y_varies:
        correlation_text = f"{np.corrcoef(xs, ys)[0, 1]:.3f}"
    else:
        correlation_text = "n/a"

    plt.xlabel(obj.x_field)
    plt.ylabel(obj.y_field)
    plt.title(f"Scatter Plot\nCorrelation: {correlation_text}")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
29
+
pjk/sinks/json_sink.py ADDED
@@ -0,0 +1,23 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ import os
5
+ import gzip
6
+ import json
7
+ from pjk.base import Sink, Source, ParsedToken, Usage
8
+
9
class JsonSink(Sink):
    """Format sink that writes one JSON document per line to a local file."""

    is_format = True

    def __init__(self, ptok: ParsedToken, usage: Usage):
        super().__init__(ptok, usage)
        # NOTE: ptok is built by the framework; values are read from it
        # directly rather than through usage.
        self.path_no_ext = ptok.pre_colon
        self.gz = ptok.get_arg(0) == 'True'

    def process(self) -> None:
        """Write every input record as a JSON line to ``<path>.json[.gz]``."""
        if self.gz:
            suffix, opener = '.json.gz', gzip.open
        else:
            suffix, opener = '.json', open
        target = self.path_no_ext + suffix

        with opener(target, 'wt', encoding='utf-8') as out:
            for record in self.input:
                line = json.dumps(record)
                out.write(line + '\n')
pjk/sinks/s3_sink.py ADDED
@@ -0,0 +1,100 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2025 Mike Schultz
3
+
4
+ from typing import Optional, Type
5
+ from pjk.base import Source, Sink, ParsedToken, Usage
6
+ from pjk.log import logger
7
+
8
+
9
class S3Sink(Sink):
    """
    Sink that delegates to a per-file format sink, giving each instance its
    own numbered object under a common S3 location:
        s3:{bucket}/{prefix}/file-0000
        s3:{bucket}/{prefix}/file-0001
    Args (via Usage):
      - path: 'bucket/path/to/files'  (bucket required, prefix optional)
    """

    _FILENAME_BASE: str = "file"
    _FILENAME_DIGITS: int = 4
    _SCHEME: str = "s3:"

    @classmethod
    def usage(cls):
        spec = Usage(
            name="<format>",
            desc="Write records to S3 in the given <format> (e.g., csv)",
            component_class=cls,
        )
        spec.def_arg(name="path", usage="bucket/path/to/files")
        return spec

    def __init__(
        self,
        ptok: ParsedToken,
        usage: Usage,
        sink_class: Type[Sink],
        is_gz: bool,
        fileno: int = 0,
    ):
        super().__init__(ptok, usage)

        raw_path: Optional[str] = usage.get_arg("path")
        if not raw_path:
            raise ValueError("S3Sink requires 'path' argument like 'bucket/path/to/files'")

        # Accept 's3:bucket/...' as well as '/bucket/...': drop the scheme and
        # any leading slashes.
        normalized = raw_path.strip()
        if normalized.startswith(self._SCHEME):
            normalized = normalized[len(self._SCHEME):]
        normalized = normalized.lstrip("/")

        # Always end with '/' so file names can simply be appended.
        if not normalized.endswith("/"):
            normalized = normalized + "/"
        self.base_path: str = normalized

        self.ptok = ptok
        # Kept on the instance so deep_copy can hand it to clones; note that
        # this shadows the ``usage`` classmethod on instances.
        self.usage = usage
        self.sink_class = sink_class
        self.is_gz = is_gz
        self.fileno = fileno
        self.num_files = 1  # next file index for deep_copy clones

    def _build_object_key(self, index: int) -> str:
        """Return '<base_path>file-NNNN' for the given file index."""
        digits = self._FILENAME_DIGITS
        return f"{self.base_path}{self._FILENAME_BASE}-{index:0{digits}d}"

    def _build_parsed_token_for_index(self, index: int) -> ParsedToken:
        """Build the token handed to the per-file sink: 's3:<key>:<is_gz>'."""
        object_key = self._build_object_key(index)
        return ParsedToken(f"{self._SCHEME}{object_key}:{self.is_gz}")

    def process(self):
        """Create the per-file sink for this instance's index and run it."""
        per_file_ptok = self._build_parsed_token_for_index(self.fileno)

        per_file_usage = self.sink_class.usage()
        per_file_usage.bind(per_file_ptok)

        delegate = self.sink_class(per_file_ptok, per_file_usage)
        delegate.add_source(self.input)

        logger.debug(
            f"in process sinking to: s3:{self.base_path} (object index {self.fileno:0{self._FILENAME_DIGITS}d})"
        )
        delegate.process()

    def deep_copy(self):
        """Clone this sink onto the next file index, or return None.

        Returns None when the input source cannot be copied. The index
        counter lives on the instance being copied and only advances after a
        clone has been built.
        """
        copied_input = self.input.deep_copy()
        if not copied_input:
            return None

        next_index = self.num_files
        clone = S3Sink(
            ptok=self.ptok,
            usage=self.usage,
            sink_class=self.sink_class,
            is_gz=self.is_gz,
            fileno=next_index,
        )
        clone.add_source(copied_input)

        self.num_files = next_index + 1
        return clone