python-jack-knife 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pjk/__init__.py +5 -0
- pjk/base.py +377 -0
- pjk/common.py +150 -0
- pjk/log.py +67 -0
- pjk/main.py +106 -0
- pjk/man_page.py +125 -0
- pjk/parser.py +284 -0
- pjk/pipes/__init__.py +0 -0
- pjk/pipes/denorm.py +68 -0
- pjk/pipes/factory.py +62 -0
- pjk/pipes/filter.py +57 -0
- pjk/pipes/head.py +34 -0
- pjk/pipes/join.py +85 -0
- pjk/pipes/let_reduce.py +198 -0
- pjk/pipes/map.py +91 -0
- pjk/pipes/move_field.py +36 -0
- pjk/pipes/postgres_pipe.py +209 -0
- pjk/pipes/remove_field.py +36 -0
- pjk/pipes/select.py +42 -0
- pjk/pipes/sort.py +63 -0
- pjk/pipes/tail.py +39 -0
- pjk/pipes/user_pipe_factory.py +45 -0
- pjk/pipes/where.py +49 -0
- pjk/registry.py +143 -0
- pjk/sinks/__init__.py +0 -0
- pjk/sinks/csv_sink.py +33 -0
- pjk/sinks/ddb.py +54 -0
- pjk/sinks/devnull.py +31 -0
- pjk/sinks/dir_sink.py +59 -0
- pjk/sinks/expect.py +53 -0
- pjk/sinks/factory.py +108 -0
- pjk/sinks/graph.py +57 -0
- pjk/sinks/graph_bar_line.py +229 -0
- pjk/sinks/graph_cumulative.py +55 -0
- pjk/sinks/graph_hist.py +72 -0
- pjk/sinks/graph_scatter.py +29 -0
- pjk/sinks/json_sink.py +23 -0
- pjk/sinks/s3_sink.py +100 -0
- pjk/sinks/sinks.py +68 -0
- pjk/sinks/stdout.py +44 -0
- pjk/sinks/tsv_sink.py +22 -0
- pjk/sinks/user_sink_factory.py +43 -0
- pjk/sources/__init__.py +0 -0
- pjk/sources/csv_source.py +28 -0
- pjk/sources/dir_source.py +69 -0
- pjk/sources/factory.py +100 -0
- pjk/sources/format_usage.py +11 -0
- pjk/sources/inline_source.py +56 -0
- pjk/sources/json_source.py +35 -0
- pjk/sources/lazy_file.py +16 -0
- pjk/sources/lazy_file_local.py +22 -0
- pjk/sources/lazy_file_s3.py +28 -0
- pjk/sources/parquet_source.py +32 -0
- pjk/sources/s3_source.py +146 -0
- pjk/sources/source_list.py +23 -0
- pjk/sources/sql_source.py +32 -0
- pjk/sources/tsv_source.py +15 -0
- pjk/sources/user_source_factory.py +33 -0
- pjk/version.py +4 -0
- python_jack_knife-0.5.0.dist-info/METADATA +254 -0
- python_jack_knife-0.5.0.dist-info/RECORD +65 -0
- python_jack_knife-0.5.0.dist-info/WHEEL +5 -0
- python_jack_knife-0.5.0.dist-info/entry_points.txt +2 -0
- python_jack_knife-0.5.0.dist-info/licenses/LICENSE +202 -0
- python_jack_knife-0.5.0.dist-info/top_level.txt +1 -0
pjk/sinks/factory.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
from typing import Any, List, Callable
|
|
5
|
+
import os
|
|
6
|
+
from pjk.base import Source, Sink, ParsedToken
|
|
7
|
+
from pjk.common import ComponentFactory
|
|
8
|
+
from pjk.sinks.stdout import StdoutSink
|
|
9
|
+
from pjk.sinks.json_sink import JsonSink
|
|
10
|
+
from pjk.sinks.devnull import DevNullSink
|
|
11
|
+
from pjk.sinks.graph import GraphSink
|
|
12
|
+
from pjk.sinks.csv_sink import CSVSink
|
|
13
|
+
from pjk.sinks.tsv_sink import TSVSink
|
|
14
|
+
from pjk.sinks.ddb import DDBSink
|
|
15
|
+
from pjk.sinks.dir_sink import DirSink
|
|
16
|
+
from pjk.sinks.expect import ExpectSink
|
|
17
|
+
from pjk.sinks.user_sink_factory import UserSinkFactory
|
|
18
|
+
|
|
19
|
+
# Registry of built-in sinks, keyed by the token prefix that selects them.
# 'expect' and user-supplied '*.py' sinks are not listed here; SinkFactory.create
# handles them before consulting this table.
COMPONENTS = {
    "-": StdoutSink,
    "devnull": DevNullSink,
    "graph": GraphSink,
    "ddb": DDBSink,
    "json": JsonSink,
    "csv": CSVSink,
    "tsv": TSVSink,
}
|
|
28
|
+
|
|
29
|
+
class SinkFactory(ComponentFactory):
    """Build the sink of a pjk expression from its command-line token.

    ``create`` tries, in order:

    1. ``expect`` tokens (constructed without a usage object),
    2. ``*.py`` tokens, delegated to ``UserSinkFactory``,
    3. format sinks: ``<format>[.gz]:<path>`` (wrapped in a ``DirSink``) or
       ``<path>.<format>[.gz]`` (a single file),
    4. any other sink registered in ``COMPONENTS`` under the token prefix.
    """

    def __init__(self):
        super().__init__(COMPONENTS, 'sink')

    def create(self, token: str) -> Callable[[Source], Sink]:
        """Return the sink selected by ``token``, or ``None`` if nothing matches.

        NOTE(review): the return annotation is kept as-is for interface
        stability, but every branch returns a sink instance (or ``None``),
        not a callable.
        """
        ptok = ParsedToken(token.strip())

        # non-usage sink (bind incompatible)
        if ptok.pre_colon == 'expect':
            return ExpectSink(ptok, None)

        if ptok.pre_colon.endswith('.py'):
            sink = UserSinkFactory.create(ptok)
            if sink:
                return sink
            # a falsy result falls through to the built-in lookups below

        # TODO: S3 destinations (S3Sink) are not wired in here yet.

        # check for format sinks
        sink = self._attempt_format(ptok)
        if sink:
            return sink

        sink_cls = self.components.get(ptok.pre_colon)
        if not sink_cls:
            return None

        usage = sink_cls.usage()
        usage.bind(ptok)
        return sink_cls(ptok, usage)

    def _attempt_format(self, ptok: ParsedToken):
        """Resolve ``<format>[.gz]:<path>`` tokens; otherwise try the file form.

        Returns a ``DirSink`` wrapping the format sink class when the token
        prefix names a registered format sink, the result of
        ``_attempt_format_file`` when the prefix is not registered at all, and
        ``None`` when the prefix names a registered non-format sink.
        """
        fmt = ptok.pre_colon
        is_gz = fmt.endswith('.gz')
        if is_gz:
            fmt = fmt[:-3]  # drop the '.gz' suffix to get the bare format name

        sink_cls = self.components.get(fmt)  # <format>: directory case
        if not sink_cls:
            # attempt case -> myfile.<format>
            return self._attempt_format_file(ptok)

        # case -> <format>:<path> local dir
        if sink_cls.is_format:
            dir_usage = DirSink.usage()
            dir_usage.bind(ptok)
            return DirSink(ptok, dir_usage, sink_cls, is_gz)

        return None

    def _attempt_format_file(self, ptok: ParsedToken):
        """Resolve ``<path>.<format>[.gz]`` tokens to a single-file sink.

        The sink class is looked up by file extension. It receives a synthetic
        token ``<path-without-extensions>:<is_gz>`` so that it can rebuild the
        file name itself. Returns ``None`` for unregistered extensions.
        """
        path, ext = os.path.splitext(ptok.all_but_params)
        # Exact match only: a substring test would also treat extensions such
        # as '.gzip' as gzip and silently rename the output to '.gz'.
        is_gz = ext == '.gz'
        if is_gz:
            path, ext = os.path.splitext(path)

        file_ext = ext.lstrip('.')  # removes the leading dot

        sink_cls = self.components.get(file_ext)
        if not sink_cls:
            return None

        file_token = f'{path}:{is_gz}'  # hack so user can do .json.gz
        file_ptok = ParsedToken(file_token)

        usage = sink_cls.usage()
        usage.bind(file_ptok)  # not sure we'll ever use since we're hacking above

        return sink_cls(file_ptok, usage)
|
|
107
|
+
|
|
108
|
+
|
pjk/sinks/graph.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
from pjk.base import Sink, ParsedToken, Usage, TokenError
|
|
5
|
+
|
|
6
|
+
from pjk.sinks.graph_cumulative import graph_cumulative
|
|
7
|
+
from pjk.sinks.graph_hist import graph_hist
|
|
8
|
+
from pjk.sinks.graph_scatter import graph_scatter
|
|
9
|
+
from pjk.sinks.graph_bar_line import graph_bar_line
|
|
10
|
+
|
|
11
|
+
class GraphSink(Sink):
    """Sink that buffers every input record and renders one matplotlib graph."""

    @classmethod
    def usage(cls):
        """Declare the ``graph`` sink: a ``kind`` argument plus x / y / pause params."""
        spec = Usage(
            name='graph',
            desc='Display various kinds of graphs from streamed records',
            component_class=cls
        )
        spec.def_arg(name='kind', usage='hist|scatter|bar|line|cumulative')
        spec.def_param(name='x', usage='Name of x-axis field', default='x')
        spec.def_param(name='y', usage='Name of y-axis field', default='y')
        spec.def_param(name='pause', usage='Seconds to show graph', is_num=True, default='-1')
        return spec

    def __init__(self, ptok: ParsedToken, usage: Usage):
        super().__init__(ptok, usage)
        self.records = []
        self.kind = usage.get_arg('kind')
        self.x_field = usage.get_param('x')
        self.y_field = usage.get_param('y')
        self.pause = usage.get_param('pause')

    def process(self):
        """Drain the input, draw the requested graph, then display it.

        Raises:
            TokenError: if ``kind`` is not one of the supported graph types
                (raised after the input has been consumed).
        """
        import matplotlib.pyplot as plt  # lazy import

        self.records.extend(self.input)

        # Each renderer draws onto the current pyplot state; showing is done below.
        renderers = {
            "scatter": lambda: graph_scatter(self),
            "hist": lambda: graph_hist(self),
            "cumulative": lambda: graph_cumulative(self),
            "bar": lambda: graph_bar_line(self, 'bar'),
            "line": lambda: graph_bar_line(self, 'line'),
        }
        render = renderers.get(self.kind)
        if render is None:
            raise TokenError(f"Unsupported graph type: {self.kind}")
        render()

        if self.pause:
            # Non-blocking show, hand control to the GUI loop for `pause`, then close.
            plt.show(block=False)
            plt.pause(int(self.pause))
            plt.close()
        else:
            # A falsy pause value means: block until the window is dismissed.
            plt.show()
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
import re
|
|
5
|
+
from datetime import date, datetime
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
|
|
8
|
+
def _collect_bar_line_rows(obj):
    """Return ``(x, float(y), set_name)`` tuples for records that carry both fields."""
    rows = []
    for r in obj.records:
        if obj.x_field in r and obj.y_field in r:
            try:
                y = float(r[obj.y_field])
            except (TypeError, ValueError, OverflowError):
                continue  # y is not numeric: skip the record
            rows.append((r[obj.x_field], y, r.get("set_name", "__default__")))
    return rows


def graph_bar_line(obj, type):
    """Plot grouped data as a bar or line chart on a new pyplot figure.

    X values are auto-detected as a time series unless the caller overrides
    that with ``obj.x_is_time = True/False``. Y values sharing the same x (or
    timestamp) and ``set_name`` are summed. The figure is drawn but not shown.

    Attributes read from ``obj``:
        records: iterable of dicts.
        x_field, y_field: names of the record fields to plot.
        args_dict: optional dict of {matplotlib.pyplot function name -> value};
            each named function is called with its value (best effort).
        x_is_time: optional bool to force/disable time-series mode.

    Args:
        obj: object carrying the attributes above.
        type: 'bar' or 'line'. Only used on the categorical path; the
            time-series path always draws lines.

    Returns:
        None. Prints a message and returns early when there is nothing to plot.
    """
    if not obj.x_field or not obj.y_field:
        print("Both x_field and y_field must be specified.")
        return

    # ---------- collect raw rows ----------
    rows = _collect_bar_line_rows(obj)
    if not rows:
        print(f"No valid '{obj.x_field}' and '{obj.y_field}' records found.")
        return
    count = len(rows)

    # Heavy plotting imports are deferred until there is something to draw.
    import matplotlib.pyplot as plt  # lazy imports
    import matplotlib.dates as mdates
    import numpy as np
    import pandas as pd

    # ---------- robust time-series detection (without false positives) ----------
    df = pd.DataFrame(rows, columns=["x", "y", "set"])

    # Optional explicit override from caller
    x_is_time_override = getattr(obj, "x_is_time", None)

    def looks_like_datetime_str(s: str) -> bool:
        # quick heuristics for common date/time strings (ISO, with separators, or HH:MM)
        return bool(
            re.search(r"\d{4}[-/]\d{1,2}[-/]\d{1,2}", s) or      # YYYY-MM-DD or YYYY/MM/DD
            re.search(r"\d{1,2}[-/]\d{1,2}[-/]\d{2,4}", s) or    # MM-DD-YYYY, etc.
            re.search(r"\d{1,2}:\d{2}", s) or                    # HH:MM present
            ("T" in s) or ("Z" in s)                             # ISO 8601 hints
        )

    def epoch_unit(values):
        """Return 'ms' or 's' when the 5%-95% range looks like epoch time, else None."""
        # use quantiles to ignore a few bad points
        q05, q95 = values.quantile(0.05), values.quantile(0.95)
        # millis since epoch ~ [1e12, 2.2e12]
        if 1e12 <= q05 <= 2.2e12 and 1e12 <= q95 <= 2.2e12:
            return "ms"
        # seconds since epoch ~ [1e9, 2.2e9]
        if 1e9 <= q05 <= 2.2e9 and 1e9 <= q95 <= 2.2e9:
            return "s"
        return None

    def detect_time_series(xs) -> bool:
        # Already datetime dtype?
        if pd.api.types.is_datetime64_any_dtype(xs):
            return True

        # Native datetime-like objects
        if xs.map(lambda v: isinstance(v, (datetime, date, pd.Timestamp, np.datetime64))).mean() >= 0.9:
            return True

        # String-looking dates: must both look like dates and actually parse
        str_mask = xs.map(lambda v: isinstance(v, str))
        if str_mask.mean() >= 0.9:
            looks_mask = xs[str_mask].map(looks_like_datetime_str)
            if looks_mask.mean() >= 0.9:
                parsed = pd.to_datetime(xs[str_mask][looks_mask], errors="coerce", utc=False)
                if parsed.notna().mean() >= 0.9:
                    return True

        # Numeric epoch seconds/millis (avoid generic integer -> ns trap)
        num = pd.to_numeric(xs, errors="coerce")
        if num.notna().mean() >= 0.9 and epoch_unit(num) is not None:
            return True

        return False

    if isinstance(x_is_time_override, bool):
        is_time_series = x_is_time_override
    else:
        is_time_series = detect_time_series(df["x"])

    if is_time_series:
        # ===== time-series path (real datetime index) =====
        # Parse with care: handle epoch seconds/millis and strings
        x_series = df["x"]

        # Try numeric epochs first
        numeric = pd.to_numeric(x_series, errors="coerce")
        parsed = None
        if numeric.notna().mean() >= 0.9:
            unit = epoch_unit(numeric)
            if unit is not None:
                parsed = pd.to_datetime(numeric, unit=unit, errors="coerce", utc=False)

        # Fallback to generic parser for strings/datetimes
        if parsed is None:
            parsed = pd.to_datetime(x_series, errors="coerce", utc=False)

        df["ts"] = parsed
        df = df.dropna(subset=["ts"])
        if df.empty:
            # Returning before any figure is created avoids leaving a blank window.
            print("No parsable datetime values in x_field.")
            return
        # Report only the rows that are actually plotted.
        count = len(df)

        # aggregate duplicates at same timestamp per set
        df = df.groupby(["ts", "set"], as_index=False)["y"].sum()
        sets = sorted(df["set"].unique())

        plt.figure()
        ax = plt.gca()
        for sname in sets:
            s = df[df["set"] == sname].set_index("ts")["y"].sort_index()
            label = None if sname == "__default__" else sname
            # line plot for time series (clean)
            ax.plot(s.index, s.values, label=label)

        # ---- sparse, readable ticks (dynamic) ----
        ts_min, ts_max = df["ts"].min(), df["ts"].max()
        span_hours = max((ts_max - ts_min).total_seconds() / 3600.0, 1)

        if span_hours <= 72:
            major = mdates.HourLocator(interval=6)  # every 6 hours
            fmt = mdates.DateFormatter("%m-%d %H:%M")
        elif span_hours <= 14 * 24:
            major = mdates.HourLocator(interval=12)  # every 12 hours
            fmt = mdates.DateFormatter("%m-%d %H:%M")
        elif span_hours <= 90 * 24:
            major = mdates.DayLocator(interval=1)  # daily
            fmt = mdates.DateFormatter("%Y-%m-%d")
        else:
            major = mdates.WeekdayLocator(byweekday=mdates.MO, interval=1)  # weekly
            fmt = mdates.DateFormatter("%Y-%m-%d")

        ax.xaxis.set_major_locator(major)
        ax.xaxis.set_major_formatter(fmt)
        # no minor tick labels; rotate to avoid overlap
        plt.gcf().autofmt_xdate()

        plt.xlabel(obj.x_field)
        plt.ylabel(obj.y_field)
        if any(s != "__default__" for s in sets):
            plt.legend(title="data set")
        plt.title(f"{obj.y_field} over time")
        plt.text(1.0, 0.95, f"{count} data points", transform=ax.transAxes,
                 ha='right', va='top', fontsize=10, color='gray')

    else:
        # ===== categorical / numeric path (with tick thinning) =====
        plt.figure()

        # Sum y per (x, set); x order follows first appearance in the input.
        data = defaultdict(float)
        for x, y, set_name in rows:
            data[(x, set_name)] += y

        x_vals = list(dict.fromkeys(x for x, _, _ in rows))
        set_names = sorted({set_name for _, _, set_name in rows})
        x_indices = np.arange(len(x_vals))
        width = 0.8 / len(set_names) if len(set_names) > 1 else 0.6

        for i, set_name in enumerate(set_names):
            heights = [data.get((x, set_name), 0) for x in x_vals]
            label = None if set_name == "__default__" else set_name
            # centre the group of bars for one x around its tick position
            offset = (i - (len(set_names) - 1) / 2) * width

            if type == 'bar':
                plt.bar(x_indices + offset, heights, width=width, label=label, edgecolor='black')
            else:
                plt.plot(x_indices, heights, marker='o', label=label)

        # ---- thin xticks to at most 12 labels ----
        max_ticks = 12
        if len(x_vals) > max_ticks:
            step = int(np.ceil(len(x_vals) / max_ticks))
            tick_idx = x_indices[::step]
            tick_lbl = x_vals[::step]
        else:
            tick_idx = x_indices
            tick_lbl = x_vals

        plt.xticks(tick_idx, tick_lbl, rotation=45)
        plt.xlabel(obj.x_field)
        plt.ylabel(obj.y_field)
        if len(set_names) > 1 or "__default__" not in set_names:
            plt.legend(title="data set")
        plt.title(f"{obj.y_field} by {obj.x_field}")
        plt.text(1.0, 0.95, f"{count} data points", transform=plt.gca().transAxes,
                 ha='right', va='top', fontsize=10, color='gray')

    # ---------- optional plt args ----------
    # `or {}` also covers an explicit args_dict=None.
    for name, val in (getattr(obj, "args_dict", None) or {}).items():
        fn = getattr(plt, name, None)
        if callable(fn):
            try:
                fn(val)
            except Exception:
                # Best-effort styling: a bad user-supplied option must not abort the plot.
                pass

    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
def graph_cumulative(obj):
    """Draw the running total of ``y_field`` against ``x_field`` on a new figure.

    Records missing either field are ignored. The remaining records are sorted
    by x, and records whose y value cannot be added to the running total are
    skipped. The figure is drawn but not shown.

    Args:
        obj: object exposing ``records`` (iterable of dicts), ``x_field`` and
            ``y_field``.

    Returns:
        None. Prints a message and returns early when the x values cannot be
        ordered or when no usable data remains.
    """
    # Filter and sort records by x
    records = [
        r for r in obj.records
        if obj.x_field in r and obj.y_field in r
    ]
    try:
        records.sort(key=lambda r: r[obj.x_field])
    except TypeError:
        print(f"Unable to sort records by '{obj.x_field}' — incompatible types.")
        return

    x_vals = []
    y_vals = []
    total = 0
    for r in records:
        try:
            total = total + r[obj.y_field]
        except (TypeError, ValueError):
            continue  # y is not addable to the running total: skip the record
        x_vals.append(r[obj.x_field])
        y_vals.append(total)

    if not x_vals:
        print(f"No valid '{obj.x_field}' and '{obj.y_field}' data for cumulative plot.")
        return

    # Deferred until there is something to draw.
    import matplotlib.pyplot as plt  # lazy import

    plt.figure()
    plt.plot(x_vals, y_vals, marker='o', linestyle='-', label='Cumulative')

    # TODO: user-specified pyplot styling (args_dict) is not applied here yet.

    plt.xlabel(obj.x_field)
    plt.ylabel(f"cumulative({obj.y_field})")
    plt.text(1.0, 1.0, f"Cumulative {obj.y_field} over {obj.x_field}", transform=plt.gca().transAxes,
             ha='right', va='top', fontsize=10, color='gray')
    plt.text(1.0, 0.95, f"{len(x_vals)} data points", transform=plt.gca().transAxes,
             ha='right', va='top', fontsize=10, color='gray')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
|
|
54
|
+
|
|
55
|
+
|
pjk/sinks/graph_hist.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
|
|
6
|
+
def aggregate_ys(obj):
    """Sum ``y_field`` per distinct ``x_field`` value over ``obj.records``.

    Only records containing both fields are considered. Records whose y value
    cannot be added (or whose x value is unhashable) are skipped and leave no
    trace in the result.

    Args:
        obj: object exposing ``records`` (iterable of dicts), ``x_field`` and
            ``y_field``.

    Returns:
        ``(count, agg)``: the number of records aggregated and a
        ``defaultdict(float)`` mapping x value -> summed y.
    """
    agg = defaultdict(float)
    count = 0
    for r in obj.records:
        if obj.x_field in r and obj.y_field in r:
            key = r[obj.x_field]
            try:
                # Compute the new total before storing it: `agg[key] += bad`
                # would insert `key: 0.0` before the addition raises.
                new_total = agg.get(key, 0.0) + r[obj.y_field]
                agg[key] = new_total
            except TypeError:
                continue  # skip malformed record
            count += 1

    return count, agg
|
|
18
|
+
|
|
19
|
+
def graph_hist(obj):
    """Draw a histogram-style chart of ``obj.records`` on a new pyplot figure.

    Two modes, chosen by the data:

    * sum mode - when ``aggregate_ys`` yields any data, draw one bar per
      distinct x value whose height is the summed y;
    * count mode - otherwise, draw a unit-width histogram of the x values.

    The figure is drawn but not shown.

    Args:
        obj: object exposing ``records`` (iterable of dicts), ``x_field`` and
            ``y_field``.

    Returns:
        None. Prints a message and returns early when there is no x data.
    """
    count, agg = aggregate_ys(obj)

    if not agg:
        # Count mode: standard histogram
        x_vals = [r[obj.x_field] for r in obj.records if obj.x_field in r]
        count = len(x_vals)
        if not x_vals:
            print(f"No valid '{obj.x_field}' data for histogram.")
            return

    # Deferred until there is something to draw.
    import matplotlib.pyplot as plt  # lazy imports
    import numpy as np

    plt.figure()
    if agg:
        x_sorted = sorted(agg)
        plt.bar(x_sorted, [agg[x] for x in x_sorted], edgecolor='black')
        ylabel = f"sum({obj.y_field})"
    else:
        bin_width = 1
        min_val = min(x_vals)
        max_val = max(x_vals)
        # Edges must extend one full bin past max_val. Ending the edges at
        # max_val itself makes the closed last bin absorb the two largest
        # values, and yields no bin at all when every value is identical.
        n_bins = int(np.floor((max_val - min_val) / bin_width)) + 1
        bins = min_val + bin_width * np.arange(n_bins + 1)

        plt.hist(
            x_vals,
            bins=bins,
            edgecolor='black',
            rwidth=0.8,
            align='mid'
        )
        ylabel = "count"

    # TODO: user-specified pyplot styling (args_dict) needs to be exposed as
    # usage params before it can be applied here.

    plt.xlabel(obj.x_field)
    plt.ylabel(ylabel)
    plt.text(1.0, 1.0, f"Histogram of {obj.x_field}", transform=plt.gca().transAxes,
             ha='right', va='top', fontsize=10, color='gray')
    plt.text(1.0, 0.95, f"{count} data points", transform=plt.gca().transAxes,
             ha='right', va='top', fontsize=10, color='gray')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
|
|
72
|
+
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
def graph_scatter(obj):
    """Draw a scatter plot of ``y_field`` vs ``x_field`` with a least-squares line.

    Records missing either field, or whose x or y cannot be converted to
    ``float``, are skipped. The title reports the correlation coefficient of
    x and y and the legend shows the fitted line's equation. The figure is
    drawn but not shown.

    Args:
        obj: object exposing ``records`` (iterable of dicts), ``x_field`` and
            ``y_field``.

    Returns:
        None. Prints a message and returns early when no usable points remain.
    """
    x_vals = []
    y_vals = []
    for r in obj.records:
        if obj.x_field in r and obj.y_field in r:
            try:
                x, y = float(r[obj.x_field]), float(r[obj.y_field])
            except (TypeError, ValueError, OverflowError):
                continue  # non-numeric point: the regression below needs numbers
            x_vals.append(x)
            y_vals.append(y)

    if not x_vals:
        print(f"No valid '{obj.x_field}' and '{obj.y_field}' data for scatter plot.")
        return

    # Deferred until there is something to draw.
    import matplotlib.pyplot as plt  # lazy imports
    import numpy as np

    correlation = np.corrcoef(x_vals, y_vals)[0, 1]
    slope, intercept = np.polyfit(x_vals, y_vals, 1)  # degree-1 fit: y = slope*x + intercept
    regression_line = [slope * x + intercept for x in x_vals]

    plt.figure()
    plt.scatter(x_vals, y_vals, label="Data")
    plt.plot(x_vals, regression_line, color="red", label=f"{obj.y_field} = {slope:.2f}*{obj.x_field} + {intercept:.2f}")
    plt.xlabel(obj.x_field)
    plt.ylabel(obj.y_field)
    plt.title(f"Scatter Plot\nCorrelation: {correlation:.3f}")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
|
|
29
|
+
|
pjk/sinks/json_sink.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import gzip
|
|
6
|
+
import json
|
|
7
|
+
from pjk.base import Sink, Source, ParsedToken, Usage
|
|
8
|
+
|
|
9
|
+
class JsonSink(Sink):
    """Format sink that writes each input record as one JSON document per line."""

    is_format = True

    def __init__(self, ptok: ParsedToken, usage: Usage):
        super().__init__(ptok, usage)
        # NOTE: ptok is built by the framework; neither value below comes from usage.
        self.path_no_ext = ptok.pre_colon       # output path without its extension
        self.gz = ptok.get_arg(0) == 'True'     # first token arg is the stringified gzip flag

    def process(self) -> None:
        """Write all input records to ``<path>.json`` (or ``<path>.json.gz``)."""
        if self.gz:
            opener, suffix = gzip.open, '.json.gz'
        else:
            opener, suffix = open, '.json'

        with opener(self.path_no_ext + suffix, 'wt', encoding='utf-8') as out:
            out.writelines(json.dumps(record) + '\n' for record in self.input)
|
pjk/sinks/s3_sink.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2025 Mike Schultz
|
|
3
|
+
|
|
4
|
+
from typing import Optional, Type
|
|
5
|
+
from pjk.base import Source, Sink, ParsedToken, Usage
|
|
6
|
+
from pjk.log import logger
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class S3Sink(Sink):
    """
    Write records to S3 in the given <format>, one object per sink instance:
        s3:{bucket}/{prefix}/file-0000
        s3:{bucket}/{prefix}/file-0001
    The actual writing is delegated to an instance of ``sink_class``.

    Args (via Usage):
      - path: 'bucket/path/to/files' (bucket required, prefix optional)
    """

    _FILENAME_BASE: str = "file"
    _FILENAME_DIGITS: int = 4
    _SCHEME: str = "s3:"

    @classmethod
    def usage(cls):
        """Declare the single required ``path`` argument."""
        spec = Usage(
            name="<format>",
            desc="Write records to S3 in the given <format> (e.g., csv)",
            component_class=cls,
        )
        spec.def_arg(name="path", usage="bucket/path/to/files")
        return spec

    def __init__(
        self,
        ptok: ParsedToken,
        usage: Usage,
        sink_class: Type[Sink],
        is_gz: bool,
        fileno: int = 0,
    ):
        """Normalise the target path and remember how to build the per-file sink.

        Raises:
            ValueError: if the bound usage carries no ``path`` argument.
        """
        super().__init__(ptok, usage)

        raw_path: Optional[str] = usage.get_arg("path")
        if not raw_path:
            raise ValueError("S3Sink requires 'path' argument like 'bucket/path/to/files'")

        # Accept 's3:bucket/...' or '/bucket/...': drop the scheme and leading slashes.
        cleaned = raw_path.strip()
        if cleaned.startswith(self._SCHEME):
            cleaned = cleaned[len(self._SCHEME):]
        cleaned = cleaned.lstrip("/")

        # Always end with a slash so object names can simply be appended.
        if not cleaned.endswith("/"):
            cleaned += "/"
        self.base_path: str = cleaned

        self.ptok = ptok
        self.usage = usage
        self.sink_class = sink_class
        self.is_gz = is_gz
        self.fileno = fileno
        self.num_files = 1  # next file index for deep_copy clones

    def _build_object_key(self, index: int) -> str:
        """Return ``<base_path>file-NNNN`` for the given zero-padded index."""
        return f"{self.base_path}{self._FILENAME_BASE}-{index:0{self._FILENAME_DIGITS}d}"

    def _build_parsed_token_for_index(self, index: int) -> ParsedToken:
        """Build the synthetic ``s3:<key>:<is_gz>`` token handed to the format sink."""
        object_key = self._build_object_key(index)
        return ParsedToken(f"{self._SCHEME}{object_key}:{self.is_gz}")

    def process(self):
        """Create the format sink for this instance's file index and run it."""
        target_ptok = self._build_parsed_token_for_index(self.fileno)

        target_usage = self.sink_class.usage()
        target_usage.bind(target_ptok)

        delegate = self.sink_class(target_ptok, target_usage)
        delegate.add_source(self.input)

        logger.debug(
            f"in process sinking to: s3:{self.base_path} (object index {self.fileno:0{self._FILENAME_DIGITS}d})"
        )
        delegate.process()

    def deep_copy(self):
        """Clone this sink onto a copy of its input, using the next file index.

        Returns ``None`` when the input source yields no copy.
        """
        source_clone: Optional[Source] = self.input.deep_copy()
        if not source_clone:
            return None

        clone = S3Sink(
            ptok=self.ptok,
            usage=self.usage,
            sink_class=self.sink_class,
            is_gz=self.is_gz,
            fileno=self.num_files,
        )
        clone.add_source(source_clone)

        # Reserve the index just handed out so the next clone gets a fresh one.
        self.num_files += 1
        return clone
|