python-jack-knife 0.7.0__tar.gz → 0.7.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. {python_jack_knife-0.7.0/src/python_jack_knife.egg-info → python_jack_knife-0.7.4}/PKG-INFO +1 -1
  2. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/history.py +3 -0
  3. python_jack_knife-0.7.4/src/pjk/integrations/postgres_pipe.py +268 -0
  4. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/factory.py +1 -1
  5. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/query_pipe.py +2 -2
  6. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/select.py +2 -2
  7. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/graph_bar_line.py +17 -10
  8. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/factory.py +13 -0
  9. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/npy_source.py +3 -4
  10. python_jack_knife-0.7.4/src/pjk/sources/s3_select_source.py +373 -0
  11. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/sql_source.py +13 -4
  12. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/version.py +1 -1
  13. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4/src/python_jack_knife.egg-info}/PKG-INFO +1 -1
  14. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/python_jack_knife.egg-info/SOURCES.txt +1 -0
  15. python_jack_knife-0.7.0/src/pjk/integrations/postgres_pipe.py +0 -218
  16. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/LICENSE +0 -0
  17. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/README.md +0 -0
  18. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/pyproject.toml +0 -0
  19. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/setup.cfg +0 -0
  20. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/__init__.py +0 -0
  21. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/common.py +0 -0
  22. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/components.py +0 -0
  23. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/integrations/opensearch_client.py +0 -0
  24. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/integrations/opensearch_index_sink.py +0 -0
  25. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/integrations/opensearch_query_pipe.py +0 -0
  26. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/integrations/snowflake_pipe.py +0 -0
  27. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/log.py +0 -0
  28. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/main.py +0 -0
  29. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/man_page.py +0 -0
  30. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/parser.py +0 -0
  31. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/__init__.py +0 -0
  32. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/denorm.py +0 -0
  33. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/filter.py +0 -0
  34. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/head.py +0 -0
  35. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/join.py +0 -0
  36. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/let_reduce.py +0 -0
  37. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/map.py +0 -0
  38. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/move_field.py +0 -0
  39. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/progress_pipe.py +0 -0
  40. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/remove_field.py +0 -0
  41. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/sample.py +0 -0
  42. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/sort.py +0 -0
  43. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/tail.py +0 -0
  44. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/user_pipe_factory.py +0 -0
  45. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/where.py +0 -0
  46. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/progress.py +0 -0
  47. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/registry.py +0 -0
  48. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/__init__.py +0 -0
  49. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/create_sink.py +0 -0
  50. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/csv_sink.py +0 -0
  51. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/devnull.py +0 -0
  52. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/dir_sink.py +0 -0
  53. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/expect.py +0 -0
  54. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/factory.py +0 -0
  55. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/format_sink.py +0 -0
  56. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/graph.py +0 -0
  57. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/graph_cumulative.py +0 -0
  58. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/graph_hist.py +0 -0
  59. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/graph_scatter.py +0 -0
  60. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/json_sink.py +0 -0
  61. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/s3_sink.py +0 -0
  62. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/s3_stream.py +0 -0
  63. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/sinks.py +0 -0
  64. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/stdout.py +0 -0
  65. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/tsv_sink.py +0 -0
  66. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/user_sink_factory.py +0 -0
  67. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/__init__.py +0 -0
  68. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/csv_source.py +0 -0
  69. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/dir_source.py +0 -0
  70. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/favorite_source.py +0 -0
  71. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/format_source.py +0 -0
  72. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/inline_source.py +0 -0
  73. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/json_source.py +0 -0
  74. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/lazy_file.py +0 -0
  75. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/lazy_file_local.py +0 -0
  76. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/lazy_file_s3.py +0 -0
  77. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/parquet_source.py +0 -0
  78. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/s3_source.py +0 -0
  79. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/source_list.py +0 -0
  80. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/tsv_source.py +0 -0
  81. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/user_source_factory.py +0 -0
  82. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/usage.py +0 -0
  83. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/python_jack_knife.egg-info/dependency_links.txt +0 -0
  84. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/python_jack_knife.egg-info/entry_points.txt +0 -0
  85. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/python_jack_knife.egg-info/requires.txt +0 -0
  86. {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/python_jack_knife.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-jack-knife
3
- Version: 0.7.0
3
+ Version: 0.7.4
4
4
  Summary: Python Jack Knife – a command line data processor
5
5
  Author-email: Mike Schultz <mike.schultz@gmail.com>
6
6
  License:
@@ -30,6 +30,9 @@ def read_history(log_path: str) -> List[int]:
30
30
  if not line:
31
31
  continue
32
32
 
33
+ if 'pjk ' in line: # legacy
34
+ line = line.split('pjk ', 1)[1]
35
+
33
36
  # Expected format: <command_string>
34
37
  line = line.strip()
35
38
 
@@ -0,0 +1,268 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+ #
4
+ # pjk/integrations/postgres_pipe.py
5
+
6
+ import base64
7
+ import datetime as _dt
8
+ import uuid
9
+ import time
10
+ from decimal import Decimal
11
+ from typing import Any, Dict, Optional
12
+
13
+ from pjk.usage import ParsedToken, Usage
14
+ from pjk.common import Integration
15
+ from pjk.pipes.query_pipe import QueryPipe
16
+
17
+ MAX_RETRIES = 3
18
+ BASE_DELAY = 0.1 # seconds
19
+
20
class DBClient:
    """Per-instance pg8000 connection wrapper. No shared state.

    The constructor opens one connection, switches it to autocommit and
    exposes it as ``self.conn``. ``close()`` may be called repeatedly.
    """

    def __init__(
        self,
        host: str,
        username: str,
        password: Optional[str],
        db_name: str,
        port: int = 5432,
        ssl: bool = False,
    ):
        """Connect to the database.

        Args:
            host: Server host name, passed to ``pg8000.connect`` as ``host``.
            username: Passed to ``pg8000.connect`` as ``user``.
            password: Passed to ``pg8000.connect`` as ``password``; may be None.
            db_name: Passed to ``pg8000.connect`` as ``database``.
            port: Server port.
            ssl: When true, connect with ``ssl.create_default_context()``.

        Raises:
            Exception: whatever ``pg8000.connect`` (or enabling autocommit)
                raises; it is re-raised unchanged after a diagnostic is
                written to stderr.
        """
        # Define the attribute first so a failed construction never leaves an
        # instance without ``conn``.
        self.conn = None

        import pg8000  # lazy import

        kwargs = dict(
            user=username,
            password=password,
            host=host,
            database=db_name,
            port=port,
        )
        if ssl:
            import ssl as _ssl

            kwargs["ssl_context"] = _ssl.create_default_context()

        try:
            conn = pg8000.connect(**kwargs)
            conn.autocommit = True
        except Exception:
            import sys

            # Diagnostics go to stderr so they cannot be mixed into record
            # output written to stdout.
            print("Failed to connect to DB", file=sys.stderr)
            raise  # bare raise keeps the original traceback
        self.conn = conn

    def close(self):
        """Close the connection if one is open; safe to call more than once."""
        if getattr(self, "conn", None) is None:
            return

        import pg8000  # lazy

        try:
            self.conn.close()
        except pg8000.exceptions.InterfaceError:
            # Already closed / broken; ignore.
            pass
        finally:
            self.conn = None
66
+
67
+
68
def _iso_dt(x: _dt.datetime) -> str:
    """Return ``x`` as ISO 8601 text, spelling a UTC offset as ``'Z'``.

    Only a trailing ``+00:00`` is rewritten. Naive datetimes and other
    offsets (including sub-minute ones such as ``+00:00:30``) are returned
    exactly as ``datetime.isoformat()`` produced them.
    """
    s = x.isoformat()
    suffix = "+00:00"
    # A plain str.replace would also hit the "+00:00" prefix of an offset
    # like "+00:00:30" and produce the invalid "Z:30".
    if s.endswith(suffix):
        return s[: -len(suffix)] + "Z"
    return s
72
+
73
+
74
def normalize(obj: Any) -> Any:
    """
    Make values JSON/YAML-safe and portable (schema-agnostic):
    - Decimal -> exact string (no sci-notation)
    - date/datetime/time -> ISO-8601 string (datetime keeps offset; UTC -> 'Z')
    - UUID -> string
    - bytes/bytearray/memoryview -> base64 string
    - lists/tuples/sets/frozensets -> list, normalized recursively
    - dicts -> dict with the same keys and recursively normalized values
    - leaves int/float/str/bool/None (and any other type) as-is
    """
    if obj is None:
        return None

    if isinstance(obj, Decimal):
        return format(obj, "f")  # exact value as string

    # datetime must be tested before date: datetime is a subclass of date.
    if isinstance(obj, _dt.datetime):
        return _iso_dt(obj)

    if isinstance(obj, (_dt.date, _dt.time)):
        return obj.isoformat()

    if isinstance(obj, uuid.UUID):
        return str(obj)

    if isinstance(obj, (bytes, bytearray, memoryview)):
        return base64.b64encode(bytes(obj)).decode("ascii")

    if isinstance(obj, dict):
        # Keys are passed through untouched; only values are normalized.
        return {k: normalize(v) for k, v in obj.items()}

    # frozenset is not a subclass of set, so it has to be listed explicitly.
    if isinstance(obj, (list, tuple, set, frozenset)):
        return [normalize(v) for v in obj]

    return obj
109
+
110
+
111
def _row_to_dict(cursor, row) -> Dict[str, Any]:
    """Pair each value of ``row`` with its column name from the cursor.

    Column names are the first element of every ``cursor.description``
    entry; each value is passed through ``normalize`` before being stored.
    """
    column_names = [column[0] for column in cursor.description]
    record: Dict[str, Any] = {}
    for column_name, value in zip(column_names, row):
        record[column_name] = normalize(value)
    return record
114
+
115
+
116
class PostgresPipe(QueryPipe, Integration):
    """Query pipe that executes the SQL held in each input record's query field.

    For every input record, ``execute_query_returning_S_xO_iterable`` yields
    one header record describing the execution, followed by one record per
    result row when a single statement returned rows.

    Optional parameter fields on the input record, in priority order:
    ``json_params`` > ``batch_params`` > ``params``.
    """

    name = "postgres"
    desc = "Postgres query pipe; executes SQL over input record['query']."
    arg0 = ("instance", "instance of database.")
    examples = [
        ["myquery.sql", "postgres:mydb", "-"],
        ["{'query': 'SELECT * from MY_TABLE;'}", "postgres:mydb", "-"],
        ["{'query': 'SELECT * FROM pg_catalog.pg_tables;'}", "postgres:mydb"],
        ["{'query': 'SELECT procedure_batch(%s, ...), batch_params:{...}"],
        ["{'query': 'SELECT procedure_jsonb(%s, ...), json_params:json_string"],
    ]

    # name, type, default
    config_tuples = [
        ("db_name", str, None),
        ("host", str, None),
        ("user", str, None),
        ("password", str, None),
        ("port", int, 5432),
        ("ssl", bool, False),
    ]

    def __init__(self, ptok: ParsedToken, u: Usage, root=None):
        """Read connection settings from ``u`` and open the connection."""
        super().__init__(ptok, u, root=root)

        self.db_name = u.get_config("db_name")
        self.db_host = u.get_config("host")
        self.db_user = u.get_config("user")
        self.db_pass = u.get_config("password")
        self.db_port = u.get_config("port")
        self.db_ssl = u.get_config("ssl")

        # Standard params field: single-exec params (list/tuple/dict/single value)
        self.params_field = "params"

        # Legacy batch path: list[tuple|list|dict] -> executemany
        self.batch_field = "batch_params"

        # Explicit JSON payload field (no query sniffing).
        # If present, this value is passed to cur.execute(query, json_params).
        self.json_params_field = "json_params"

        # One DB client (and thus one connection) per PostgresPipe instance.
        # NOTE(review): the original comment relies on a "one thread per pipe"
        # invariant for thread-safety; that invariant is not visible here.
        self.client = DBClient(
            host=self.db_host,
            username=self.db_user,
            password=self.db_pass,
            db_name=self.db_name,
            port=self.db_port,
            ssl=self.db_ssl,
        )

    def reset(self):
        """No-op: the pipe keeps no per-run state."""
        # stateless across reset
        pass

    def close(self):
        """Close the underlying client, if one was created."""
        # getattr: __init__ may have raised before self.client was assigned.
        client = getattr(self, "client", None)
        if client is not None:
            client.close()

    @staticmethod
    def _as_args(value):
        """Return ``value`` unchanged if it is a list/tuple/dict, else wrap it in a 1-tuple."""
        if isinstance(value, (list, tuple, dict)):
            return value
        return (value,)

    @staticmethod
    def _is_ingest_event(description) -> bool:
        """True when the result has exactly one column and it is named 'ingest_event'."""
        return len(description) == 1 and description[0][0] == "ingest_event"

    def _make_header(self, cur, query: str, params=None) -> Dict[str, Any]:
        """
        Inspect the cursor and build the header record for one execution.

        Always contains ``db``, ``dbhost`` and ``result`` ("ok"); ``params`` is
        added when given. A single-column ``ingest_event`` result has its one
        row consumed and is reported via ``function``; every other outcome is
        reported via ``rowcount`` (``None`` when a row-returning statement
        reports -1). ``query`` is accepted but not used.
        """
        h = {
            "db": self.db_name,
            "dbhost": self.db_host,
        }
        if params is not None:
            h["params"] = params

        h["result"] = "ok"
        if cur.description:
            if self._is_ingest_event(cur.description):
                cur.fetchone()  # consume void row
                h["function"] = "ingest_event"
            else:
                h["rowcount"] = cur.rowcount if cur.rowcount != -1 else None
        else:
            h["rowcount"] = cur.rowcount

        return h

    def execute_query_returning_S_xO_iterable(self, record):
        """Execute the record's query and yield a header followed by result rows.

        If the record has no (truthy) query, the record itself is yielded with
        ``_error`` set to "missing query" and nothing is executed.

        Rows are streamed only for a single ``execute`` that returned rows.
        ``executemany`` and the empty-batch placeholder statement yield the
        header only.
        """
        query = record.get(self.query_field)
        if not query:
            record["_error"] = "missing query"
            yield record
            return

        # Priority: json_params > batch_params > params
        json_params = record.get(self.json_params_field, None)
        batch = record.get(self.batch_field, None)
        params = record.get(self.params_field, None)

        cur = self.client.conn.cursor()
        try:
            stream_rows = True
            header_params = None

            # ---------- execute ----------
            if json_params is not None:
                # Explicit JSON payload; caller controls shape.
                # We don't inspect query or payload.
                cur.execute(query, self._as_args(json_params))
                header_params = {self.json_params_field: json_params}

            elif batch is not None:
                # Legacy executemany path; no magic.
                if len(batch) == 0:
                    # Nothing to run for the caller's query. The placeholder
                    # statement is kept so the header is built as before, but
                    # its row is not query output and must not be streamed.
                    cur.execute("SELECT 1")
                    stream_rows = False
                    header_params = {"batch_size": 0}
                elif len(batch) == 1:
                    cur.execute(query, batch[0])
                    header_params = {"batch_size": 1, "params": batch[0]}
                else:
                    cur.executemany(query, batch)
                    stream_rows = False
                    header_params = {"batch_size": len(batch)}

            elif params is not None:
                # Single-statement path with parameters.
                cur.execute(query, self._as_args(params))
                header_params = params

            else:
                # Single-statement path without parameters.
                cur.execute(query)

            # ---------- header ----------
            yield self._make_header(cur, query, header_params)

            # ---------- stream rows ----------
            if (
                stream_rows
                and cur.description
                and not self._is_ingest_event(cur.description)
            ):
                for row in cur:
                    yield _row_to_dict(cur, row)

        finally:
            cur.close()
            # connection stays open for this pipe; closed in .close()
@@ -38,7 +38,7 @@ COMPONENTS = {
38
38
  'reduce': ReducePipe,
39
39
  'sort': SortPipe,
40
40
  'where': WherePipe,
41
- 'sel': SelectFields,
41
+ 'select': SelectFields,
42
42
  'sample': SamplePipe,
43
43
  'explode': DenormPipe,
44
44
  'postgres': PostgresPipe,
@@ -35,8 +35,8 @@ class QueryPipe(Pipe):
35
35
  return u
36
36
 
37
37
 
38
- def __init__(self, ptok: ParsedToken, usage: Usage):
39
- super().__init__(ptok, usage)
38
+ def __init__(self, ptok: ParsedToken, usage: Usage, root = None):
39
+ super().__init__(ptok, usage, root=root)
40
40
  self.output_shape = usage.get_param('shape')
41
41
  self.count = usage.get_param('count')
42
42
  self.query_field = 'query' # for all subclasses
@@ -10,12 +10,12 @@ class SelectFields(DeepCopyPipe):
10
10
  @classmethod
11
11
  def usage(cls):
12
12
  usage = Usage(
13
- name='sel',
13
+ name='select',
14
14
  desc='Select specific fields from each record.',
15
15
  component_class=cls
16
16
  )
17
17
  usage.def_arg(name='fields', usage='Comma-separated list of fields to retain')
18
- usage.def_example(expr_tokens=["{id:1, dir:'up', color:'blue'}", 'sel:id,color'], expect="id: 1, color:'blue'")
18
+ usage.def_example(expr_tokens=["{id:1, dir:'up', color:'blue'}", 'select:id,color'], expect="id: 1, color:'blue'")
19
19
  return usage
20
20
 
21
21
  def __init__(self, ptok: ParsedToken, usage: Usage):
@@ -20,10 +20,6 @@ from typing import Any, Dict, Iterable, List, Optional, Sequence
20
20
  from datetime import date, datetime
21
21
  from collections import defaultdict
22
22
 
23
- import numpy as np
24
- import pandas as pd
25
-
26
-
27
23
  # ----------------------------- Public Params -----------------------------
28
24
  @dataclass
29
25
  class GraphParams:
@@ -48,6 +44,8 @@ class TimeDetector:
48
44
 
49
45
  @staticmethod
50
46
  def is_time(xs: pd.Series) -> bool:
47
+ import numpy as np # lazy
48
+ import pandas as pd # lazy
51
49
  # Already datetime dtype?
52
50
  if pd.api.types.is_datetime64_any_dtype(xs):
53
51
  return True
@@ -74,6 +72,7 @@ class TimeDetector:
74
72
 
75
73
  @staticmethod
76
74
  def parse_times(series: pd.Series) -> pd.Series:
75
+ import pandas as pd # lazy
77
76
  numeric = pd.to_numeric(series, errors="coerce")
78
77
  parsed = None
79
78
  if numeric.notna().mean() >= 0.9:
@@ -92,6 +91,8 @@ class MultiYAdapter:
92
91
  """Builds wide dataframe: columns = ['x'] + y_fields; sums duplicates of x."""
93
92
  @staticmethod
94
93
  def to_df(records: Iterable[Dict[str, Any]], x_field: str, y_fields: Sequence[str]) -> pd.DataFrame:
94
+ import pandas as pd # lazy
95
+ import numpy as np # lazy
95
96
  rows: List[Dict[str, Any]] = []
96
97
  for r in records:
97
98
  if x_field not in r:
@@ -120,7 +121,8 @@ class MultiYAdapter:
120
121
  class SingleYWithSetsAdapter:
121
122
  """Legacy: single y_field + optional per-row set_name to create series."""
122
123
  @staticmethod
123
- def to_df(records: Iterable[Dict[str, Any]], x_field: str, y_field: str) -> pd.DataFrame:
124
+ def to_df(records: Iterable[Dict[str, Any]], x_field: str, y_field: str):
125
+ import pandas as pd # lazy
124
126
  triplets = [] # (x, y, set_name)
125
127
  for r in records:
126
128
  if x_field in r and y_field in r:
@@ -141,8 +143,10 @@ class GraphPlotter:
141
143
  self.y_fields = list(dict.fromkeys(self.pms.y_fields)) # dedupe, preserve order
142
144
 
143
145
  def plot(self, chart_type: str = "line"):
144
- import matplotlib.pyplot as plt
145
- import matplotlib.dates as mdates
146
+ import matplotlib.pyplot as plt # lazy
147
+ import matplotlib.dates as mdates # lazy
148
+ import pandas as pd # lazy
149
+ import numpy as np # lazy
146
150
 
147
151
  fig = plt.figure()
148
152
  ax = plt.gca()
@@ -258,7 +262,7 @@ class GraphPlotter:
258
262
  # ---------- Formatting helpers ----------
259
263
  @staticmethod
260
264
  def _format_time_axis(ax, df: pd.DataFrame) -> None:
261
- import matplotlib.dates as mdates
265
+ import matplotlib.dates as mdates # lazy
262
266
  fig = ax.get_figure()
263
267
  ts = df["ts"]
264
268
  if ts.empty:
@@ -289,6 +293,7 @@ class GraphPlotter:
289
293
 
290
294
  def _bars_time(self, ax, df: pd.DataFrame, y_cols: Sequence[str]) -> None:
291
295
  # Grouped bars at each timestamp using index positions
296
+ import numpy as np # lazy
292
297
  x_vals = df["ts"].to_numpy(); idx = np.arange(len(x_vals))
293
298
  n = len(y_cols); width = 0.8 / max(n, 1)
294
299
  for i, y in enumerate(y_cols):
@@ -298,6 +303,7 @@ class GraphPlotter:
298
303
  ax.set_xticks(idx, [pd.to_datetime(t).strftime("%Y-%m-%d %H:%M") for t in x_vals], rotation=45)
299
304
 
300
305
  def _bars_categorical(self, ax, df: pd.DataFrame, y_cols: Sequence[str]) -> None:
306
+ import numpy as np # lazy
301
307
  seen = set(); ordered_x: List[Any] = []
302
308
  for x in df["x"].tolist():
303
309
  if x not in seen:
@@ -310,6 +316,7 @@ class GraphPlotter:
310
316
  ax.set_xticks(idx, ordered_x, rotation=45)
311
317
 
312
318
  def _lines_categorical(self, ax, df: pd.DataFrame, y_cols: Sequence[str]) -> None:
319
+ import numpy as np # lazy
313
320
  seen = set(); ordered_x: List[Any] = []
314
321
  for x in df["x"].tolist():
315
322
  if x not in seen:
@@ -322,7 +329,7 @@ class GraphPlotter:
322
329
 
323
330
  # ---------- Misc ----------
324
331
  def _apply_args_dict(self) -> None:
325
- import matplotlib.pyplot as plt
332
+ import matplotlib.pyplot as plt # lazy
326
333
  for name, val in getattr(self.pms, "args_dict", {}).items():
327
334
  fn = getattr(plt, name, None)
328
335
  if callable(fn):
@@ -345,7 +352,7 @@ def graph_bar_line(obj, type):
345
352
  Returns (fig, ax) for optional downstream tweaks (safe to ignore).
346
353
  """
347
354
  # Lazy import (ensures MPL backend)
348
- import matplotlib.pyplot as plt # noqa: F401
355
+ import matplotlib.pyplot as plt # noqa: F401 # lazy
349
356
 
350
357
  # Normalize y_fields from string or list
351
358
  raw_y = obj.y_field if isinstance(obj.y_field, str) else str(obj.y_field)
@@ -13,9 +13,11 @@ from pjk.sources.inline_source import InlineSource
13
13
  from pjk.sources.user_source_factory import UserSourceFactory
14
14
  from pjk.sources.parquet_source import ParquetSource
15
15
  from pjk.sources.format_source import FormatSource
16
+ from pjk.sources.s3_select_source import S3SelectSource
16
17
 
17
18
  COMPONENTS = {
18
19
  'inline': InlineSource,
20
+ 's3s': S3SelectSource,
19
21
  'json': JsonSource,
20
22
  'jsonl': JsonSource,
21
23
  'csv': CSVSource,
@@ -35,6 +37,11 @@ class SourceFactory(ComponentFactory):
35
37
  def create(self, token: str) -> Source:
36
38
  token = token.strip()
37
39
 
40
+ # s3s is a pseudo source only in the above list to provide easy man page
41
+ # it's instantiated by the parser when <file>.s3s, so disallow standard search for it.
42
+ if token == 's3s':
43
+ return None
44
+
38
45
  if InlineSource.is_inline(token):
39
46
  return InlineSource(token)
40
47
 
@@ -44,6 +51,12 @@ class SourceFactory(ComponentFactory):
44
51
  source = UserSourceFactory.create(ptok)
45
52
  if source:
46
53
  return source
54
+
55
+ # s3 select file
56
+ if ptok.pre_colon.endswith('.s3s'):
57
+ source = S3SelectSource(ptok, None)
58
+ if source:
59
+ return source
47
60
 
48
61
  source_cls = self.get_component_class(ptok.pre_colon)
49
62
  if source_cls and not issubclass(source_cls, FormatSource):
@@ -4,18 +4,15 @@
4
4
  import json
5
5
  from typing import Iterator, Dict, Any
6
6
 
7
- import numpy as np
8
- from pjk.usage import NoBindUsage
9
- from pjk.components import Source
10
7
  from pjk.sources.lazy_file import LazyFile
11
8
  from pjk.sources.format_source import FormatSource
12
9
  from pjk.log import logger
13
10
 
14
-
15
11
  class NpySource(FormatSource):
16
12
  extension = 'npy'
17
13
 
18
14
  def __init__(self, lazy_file: LazyFile):
15
+ super().__init__(root=None)
19
16
  self.lazy_file = lazy_file
20
17
  self.num_vecs = 0
21
18
 
@@ -32,9 +29,11 @@ class NpySource(FormatSource):
32
29
 
33
30
  try:
34
31
  # Use mmap to avoid loading entire array in RAM at once.
32
+ import numpy as np #lazy import
35
33
  arr = np.load(path, mmap_mode="r", allow_pickle=False)
36
34
  except Exception as e:
37
35
  logger.error(f"Failed to load .npy file at {path}: {e}")
36
+ raise Exception(f"Failed to load .npy file at {path}: {e}")
38
37
  return
39
38
 
40
39
  if arr.size == 0: