python-jack-knife 0.7.0__tar.gz → 0.7.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {python_jack_knife-0.7.0/src/python_jack_knife.egg-info → python_jack_knife-0.7.4}/PKG-INFO +1 -1
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/history.py +3 -0
- python_jack_knife-0.7.4/src/pjk/integrations/postgres_pipe.py +268 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/factory.py +1 -1
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/query_pipe.py +2 -2
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/select.py +2 -2
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/graph_bar_line.py +17 -10
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/factory.py +13 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/npy_source.py +3 -4
- python_jack_knife-0.7.4/src/pjk/sources/s3_select_source.py +373 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/sql_source.py +13 -4
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/version.py +1 -1
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4/src/python_jack_knife.egg-info}/PKG-INFO +1 -1
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/python_jack_knife.egg-info/SOURCES.txt +1 -0
- python_jack_knife-0.7.0/src/pjk/integrations/postgres_pipe.py +0 -218
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/LICENSE +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/README.md +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/pyproject.toml +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/setup.cfg +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/__init__.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/common.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/components.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/integrations/opensearch_client.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/integrations/opensearch_index_sink.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/integrations/opensearch_query_pipe.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/integrations/snowflake_pipe.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/log.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/main.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/man_page.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/parser.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/__init__.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/denorm.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/filter.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/head.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/join.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/let_reduce.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/map.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/move_field.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/progress_pipe.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/remove_field.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/sample.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/sort.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/tail.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/user_pipe_factory.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/pipes/where.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/progress.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/registry.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/__init__.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/create_sink.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/csv_sink.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/devnull.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/dir_sink.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/expect.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/factory.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/format_sink.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/graph.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/graph_cumulative.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/graph_hist.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/graph_scatter.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/json_sink.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/s3_sink.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/s3_stream.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/sinks.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/stdout.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/tsv_sink.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sinks/user_sink_factory.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/__init__.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/csv_source.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/dir_source.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/favorite_source.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/format_source.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/inline_source.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/json_source.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/lazy_file.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/lazy_file_local.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/lazy_file_s3.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/parquet_source.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/s3_source.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/source_list.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/tsv_source.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/sources/user_source_factory.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/pjk/usage.py +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/python_jack_knife.egg-info/dependency_links.txt +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/python_jack_knife.egg-info/entry_points.txt +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/python_jack_knife.egg-info/requires.txt +0 -0
- {python_jack_knife-0.7.0 → python_jack_knife-0.7.4}/src/python_jack_knife.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
#
|
|
4
|
+
# pjk/integrations/postgres_pipe.py
|
|
5
|
+
|
|
6
|
+
import base64
|
|
7
|
+
import datetime as _dt
|
|
8
|
+
import uuid
|
|
9
|
+
import time
|
|
10
|
+
from decimal import Decimal
|
|
11
|
+
from typing import Any, Dict, Optional
|
|
12
|
+
|
|
13
|
+
from pjk.usage import ParsedToken, Usage
|
|
14
|
+
from pjk.common import Integration
|
|
15
|
+
from pjk.pipes.query_pipe import QueryPipe
|
|
16
|
+
|
|
17
|
+
MAX_RETRIES = 3
|
|
18
|
+
BASE_DELAY = 0.1 # seconds
|
|
19
|
+
|
|
20
|
+
class DBClient:
    """Per-instance pg8000 connection wrapper. No shared state.

    The connection is opened in the constructor with autocommit enabled and
    is exposed as ``self.conn``. ``close()`` may be called any number of
    times, including on an instance whose constructor failed.
    """

    def __init__(
        self,
        host: str,
        username: str,
        password: Optional[str],
        db_name: str,
        port: int = 5432,
        ssl: bool = False,
    ):
        """Open a pg8000 connection.

        Args:
            host: Server host name.
            username: Login role, passed to pg8000 as ``user``.
            password: Password, or None.
            db_name: Database to connect to.
            port: Server port.
            ssl: When true, connect with ``ssl.create_default_context()``.

        Raises:
            Exception: whatever ``pg8000.connect`` (or enabling autocommit)
                raises; it is re-raised unchanged after a diagnostic line.
        """
        import sys  # local: only needed for the failure diagnostic
        import pg8000  # lazy import

        # Define the attribute before connecting so it always exists,
        # even when the connection attempt below fails.
        self.conn = None

        kwargs = dict(
            user=username,
            password=password,
            host=host,
            database=db_name,
            port=port,
        )
        if ssl:
            # The `ssl` parameter shadows the module name, hence the alias.
            import ssl as _ssl

            kwargs["ssl_context"] = _ssl.create_default_context()

        try:
            self.conn = pg8000.connect(**kwargs)
            self.conn.autocommit = True
        except Exception:
            # Diagnostics go to stderr: stdout may be carrying record output.
            print("Failed to connect to DB", file=sys.stderr)
            raise

    def close(self):
        """Close the connection if one is open; safe to call repeatedly."""
        if getattr(self, "conn", None) is None:
            return

        import pg8000  # lazy

        try:
            self.conn.close()
        except pg8000.exceptions.InterfaceError:
            # Already closed / broken; ignore.
            pass
        finally:
            self.conn = None
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _iso_dt(x: _dt.datetime) -> str:
    """Return *x* in ISO 8601 form, writing a zero UTC offset as 'Z'.

    Naive datetimes and non-zero offsets are returned exactly as
    ``datetime.isoformat()`` produces them.
    """
    s = x.isoformat()
    # Only a trailing "+00:00" is a zero offset. A blanket replace would also
    # rewrite the prefix of a sub-minute offset such as "+00:00:30" to "Z:30".
    if s.endswith("+00:00"):
        return s[: -len("+00:00")] + "Z"
    return s
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def normalize(obj: Any) -> Any:
    """
    Convert a value into a JSON/YAML-friendly form without schema knowledge.

    Conversions applied:
    - Decimal -> fixed-point string (never scientific notation)
    - datetime -> ISO-8601 string via _iso_dt
    - date / time -> ISO-8601 string via isoformat()
    - UUID -> string
    - bytes / bytearray / memoryview -> base64 text (ASCII)
    - dict -> new dict with the same keys and normalized values
    - list / tuple / set -> list of normalized items
    Anything else (including None, int, float, str, bool) is returned as-is.
    """
    if isinstance(obj, Decimal):
        return "{:f}".format(obj)

    # datetime is a subclass of date, so it has to be matched first.
    if isinstance(obj, _dt.datetime):
        return _iso_dt(obj)
    if isinstance(obj, (_dt.date, _dt.time)):
        return obj.isoformat()

    if isinstance(obj, uuid.UUID):
        return str(obj)

    if isinstance(obj, (bytes, bytearray, memoryview)):
        encoded = base64.b64encode(bytes(obj))
        return encoded.decode("ascii")

    # Containers: recurse into values / items.
    if isinstance(obj, dict):
        return dict((key, normalize(value)) for key, value in obj.items())
    if isinstance(obj, (list, tuple, set)):
        return list(map(normalize, obj))

    return obj
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _row_to_dict(cursor, row) -> Dict[str, Any]:
    """Pair one result row with the cursor's column names.

    The key for each value is the first element of the corresponding
    ``cursor.description`` entry; every value is passed through normalize().
    """
    record: Dict[str, Any] = {}
    for column, value in zip(cursor.description, row):
        record[column[0]] = normalize(value)
    return record
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class PostgresPipe(QueryPipe, Integration):
    """Query pipe that runs the SQL found in each input record against Postgres.

    For every input record the pipe yields one header record describing the
    execution, followed by one record per result row when the statement
    returns rows. Bind values are read from the record, in priority order:
    ``json_params``, then ``batch_params``, then ``params``.
    """

    name = "postgres"
    desc = "Postgres query pipe; executes SQL over input record['query']."
    arg0 = ("instance", "instance of database.")
    examples = [
        ["myquery.sql", "postgres:mydb", "-"],
        ["{'query': 'SELECT * from MY_TABLE;'}", "postgres:mydb", "-"],
        ["{'query': 'SELECT * FROM pg_catalog.pg_tables;'}", "postgres:mydb"],
        ["{'query': 'SELECT procedure_batch(%s, ...), batch_params:{...}"],
        ["{'query': 'SELECT procedure_jsonb(%s, ...), json_params:json_string"],
    ]

    # name, type, default
    config_tuples = [
        ("db_name", str, None),
        ("host", str, None),
        ("user", str, None),
        ("password", str, None),
        ("port", int, 5432),
        ("ssl", bool, False),
    ]

    def __init__(self, ptok: ParsedToken, u: Usage, root=None):
        """Read connection settings from *u* and open the database connection."""
        super().__init__(ptok, u, root=root)

        self.db_name = u.get_config("db_name")
        self.db_host = u.get_config("host")
        self.db_user = u.get_config("user")
        self.db_pass = u.get_config("password")
        self.db_port = u.get_config("port")
        self.db_ssl = u.get_config("ssl")

        # Standard params field: single-exec params (list/tuple/dict/single value)
        self.params_field = "params"

        # Legacy batch path: list[tuple|list|dict] -> executemany
        self.batch_field = "batch_params"

        # Explicit JSON payload field (no query sniffing).
        # If present, this value is passed to cur.execute(query, json_params).
        self.json_params_field = "json_params"

        # One DB client (and thus one connection) per PostgresPipe instance.
        # NOTE(review): thread-safety relies on one thread per pipe; that
        # invariant is not enforced here -- confirm with the caller.
        self.client = DBClient(
            host=self.db_host,
            username=self.db_user,
            password=self.db_pass,
            db_name=self.db_name,
            port=self.db_port,
            ssl=self.db_ssl,
        )

    def reset(self):
        """No-op: the pipe keeps no per-run state."""
        # stateless across reset
        pass

    def close(self):
        """Close the underlying DB client, if any."""
        if self.client is not None:
            self.client.close()

    def _make_header(self, cur, query: str, params=None) -> Dict[str, Any]:
        """
        Inspect the cursor and build the header record for one execution.

        The header always carries ``db``, ``dbhost`` and ``result``; ``params``
        is included only when given. A result set whose single column is named
        ``ingest_event`` is treated as a void function call: its one row is
        consumed here and ``function`` is reported instead of ``rowcount``.
        *query* is accepted for interface compatibility and is not used.
        """
        h = {
            "db": self.db_name,
            "dbhost": self.db_host,
        }
        if params is not None:
            h["params"] = params

        # Reaching this point means execution did not raise.
        h["result"] = "ok"

        if cur.description:
            cols = [d[0] for d in cur.description]
            if len(cols) == 1 and cols[0] == "ingest_event":
                cur.fetchone()  # consume void row
                h["function"] = "ingest_event"
            else:
                # -1 is reported as None in the header.
                h["rowcount"] = cur.rowcount if cur.rowcount != -1 else None
        else:
            h["rowcount"] = cur.rowcount

        return h

    def execute_query_returning_S_xO_iterable(self, record):
        """
        Execute the record's query and yield a header followed by result rows.

        A record without a query is yielded back with ``_error`` set to
        "missing query". Otherwise the statement is run with the first bind
        source present (json_params > batch_params > params). Rows are
        streamed only for a real single execute: not for executemany, and not
        for the placeholder statement used when ``batch_params`` is empty.
        """
        query = record.get(self.query_field)
        if not query:
            record["_error"] = "missing query"
            yield record
            return

        # Priority: json_params > batch_params > params
        json_params = record.get(self.json_params_field, None)
        batch = record.get(self.batch_field, None)
        params = record.get(self.params_field, None)

        cur = self.client.conn.cursor()
        try:
            stream_rows = True
            header_params = None

            # ---------- execute ----------
            if json_params is not None:
                # Explicit JSON payload; caller controls shape.
                # We don't inspect query or payload.
                if isinstance(json_params, (list, tuple, dict)):
                    cur.execute(query, json_params)
                else:
                    cur.execute(query, (json_params,))
                header_params = {self.json_params_field: json_params}

            elif batch is not None:
                # Legacy executemany path; no magic.
                if len(batch) == 0:
                    # Placeholder round trip so the header can still be built
                    # from a live cursor. Its "?column?" row is not query
                    # output and must not be streamed downstream.
                    # NOTE(review): the header's rowcount still reflects this
                    # placeholder statement -- confirm whether 0 is wanted.
                    cur.execute("SELECT 1")
                    stream_rows = False
                    header_params = {"batch_size": 0}
                elif len(batch) == 1:
                    cur.execute(query, batch[0])
                    header_params = {"batch_size": 1, "params": batch[0]}
                else:
                    cur.executemany(query, batch)
                    stream_rows = False
                    header_params = {"batch_size": len(batch)}

            elif params is not None:
                # Single-statement path with bind values; a scalar is wrapped
                # into a one-element tuple.
                if isinstance(params, (list, tuple, dict)):
                    cur.execute(query, params)
                else:
                    cur.execute(query, (params,))
                header_params = params

            else:
                # Single-statement path without bind values.
                cur.execute(query)

            # ---------- header ----------
            yield self._make_header(cur, query, header_params)

            # ---------- stream rows (only for a single execute that returns rows) ----------
            if stream_rows and cur.description:
                cols = [d[0] for d in cur.description]
                # The void ingest_event row was already consumed by the header.
                if not (len(cols) == 1 and cols[0] == "ingest_event"):
                    for row in cur:
                        yield _row_to_dict(cur, row)

        finally:
            cur.close()
            # connection stays open for this pipe; closed in .close()
|
|
@@ -35,8 +35,8 @@ class QueryPipe(Pipe):
|
|
|
35
35
|
return u
|
|
36
36
|
|
|
37
37
|
|
|
38
|
-
def __init__(self, ptok: ParsedToken, usage: Usage):
|
|
39
|
-
super().__init__(ptok, usage)
|
|
38
|
+
def __init__(self, ptok: ParsedToken, usage: Usage, root = None):
|
|
39
|
+
super().__init__(ptok, usage, root=root)
|
|
40
40
|
self.output_shape = usage.get_param('shape')
|
|
41
41
|
self.count = usage.get_param('count')
|
|
42
42
|
self.query_field = 'query' # for all subclasses
|
|
@@ -10,12 +10,12 @@ class SelectFields(DeepCopyPipe):
|
|
|
10
10
|
@classmethod
|
|
11
11
|
def usage(cls):
|
|
12
12
|
usage = Usage(
|
|
13
|
-
name='
|
|
13
|
+
name='select',
|
|
14
14
|
desc='Select specific fields from each record.',
|
|
15
15
|
component_class=cls
|
|
16
16
|
)
|
|
17
17
|
usage.def_arg(name='fields', usage='Comma-separated list of fields to retain')
|
|
18
|
-
usage.def_example(expr_tokens=["{id:1, dir:'up', color:'blue'}", '
|
|
18
|
+
usage.def_example(expr_tokens=["{id:1, dir:'up', color:'blue'}", 'select:id,color'], expect="id: 1, color:'blue'")
|
|
19
19
|
return usage
|
|
20
20
|
|
|
21
21
|
def __init__(self, ptok: ParsedToken, usage: Usage):
|
|
@@ -20,10 +20,6 @@ from typing import Any, Dict, Iterable, List, Optional, Sequence
|
|
|
20
20
|
from datetime import date, datetime
|
|
21
21
|
from collections import defaultdict
|
|
22
22
|
|
|
23
|
-
import numpy as np
|
|
24
|
-
import pandas as pd
|
|
25
|
-
|
|
26
|
-
|
|
27
23
|
# ----------------------------- Public Params -----------------------------
|
|
28
24
|
@dataclass
|
|
29
25
|
class GraphParams:
|
|
@@ -48,6 +44,8 @@ class TimeDetector:
|
|
|
48
44
|
|
|
49
45
|
@staticmethod
|
|
50
46
|
def is_time(xs: pd.Series) -> bool:
|
|
47
|
+
import numpy as np # lazy
|
|
48
|
+
import pandas as pd # lazy
|
|
51
49
|
# Already datetime dtype?
|
|
52
50
|
if pd.api.types.is_datetime64_any_dtype(xs):
|
|
53
51
|
return True
|
|
@@ -74,6 +72,7 @@ class TimeDetector:
|
|
|
74
72
|
|
|
75
73
|
@staticmethod
|
|
76
74
|
def parse_times(series: pd.Series) -> pd.Series:
|
|
75
|
+
import pandas as pd # lazy
|
|
77
76
|
numeric = pd.to_numeric(series, errors="coerce")
|
|
78
77
|
parsed = None
|
|
79
78
|
if numeric.notna().mean() >= 0.9:
|
|
@@ -92,6 +91,8 @@ class MultiYAdapter:
|
|
|
92
91
|
"""Builds wide dataframe: columns = ['x'] + y_fields; sums duplicates of x."""
|
|
93
92
|
@staticmethod
|
|
94
93
|
def to_df(records: Iterable[Dict[str, Any]], x_field: str, y_fields: Sequence[str]) -> pd.DataFrame:
|
|
94
|
+
import pandas as pd # lazy
|
|
95
|
+
import numpy as np # lazy
|
|
95
96
|
rows: List[Dict[str, Any]] = []
|
|
96
97
|
for r in records:
|
|
97
98
|
if x_field not in r:
|
|
@@ -120,7 +121,8 @@ class MultiYAdapter:
|
|
|
120
121
|
class SingleYWithSetsAdapter:
|
|
121
122
|
"""Legacy: single y_field + optional per-row set_name to create series."""
|
|
122
123
|
@staticmethod
|
|
123
|
-
def to_df(records: Iterable[Dict[str, Any]], x_field: str, y_field: str)
|
|
124
|
+
def to_df(records: Iterable[Dict[str, Any]], x_field: str, y_field: str):
|
|
125
|
+
import pandas as pd # lazy
|
|
124
126
|
triplets = [] # (x, y, set_name)
|
|
125
127
|
for r in records:
|
|
126
128
|
if x_field in r and y_field in r:
|
|
@@ -141,8 +143,10 @@ class GraphPlotter:
|
|
|
141
143
|
self.y_fields = list(dict.fromkeys(self.pms.y_fields)) # dedupe, preserve order
|
|
142
144
|
|
|
143
145
|
def plot(self, chart_type: str = "line"):
|
|
144
|
-
import matplotlib.pyplot as plt
|
|
145
|
-
import matplotlib.dates as mdates
|
|
146
|
+
import matplotlib.pyplot as plt # lazy
|
|
147
|
+
import matplotlib.dates as mdates # lazy
|
|
148
|
+
import pandas as pd # lazy
|
|
149
|
+
import numpy as np # lazy
|
|
146
150
|
|
|
147
151
|
fig = plt.figure()
|
|
148
152
|
ax = plt.gca()
|
|
@@ -258,7 +262,7 @@ class GraphPlotter:
|
|
|
258
262
|
# ---------- Formatting helpers ----------
|
|
259
263
|
@staticmethod
|
|
260
264
|
def _format_time_axis(ax, df: pd.DataFrame) -> None:
|
|
261
|
-
import matplotlib.dates as mdates
|
|
265
|
+
import matplotlib.dates as mdates # lazy
|
|
262
266
|
fig = ax.get_figure()
|
|
263
267
|
ts = df["ts"]
|
|
264
268
|
if ts.empty:
|
|
@@ -289,6 +293,7 @@ class GraphPlotter:
|
|
|
289
293
|
|
|
290
294
|
def _bars_time(self, ax, df: pd.DataFrame, y_cols: Sequence[str]) -> None:
|
|
291
295
|
# Grouped bars at each timestamp using index positions
|
|
296
|
+
import numpy as np # lazy
|
|
292
297
|
x_vals = df["ts"].to_numpy(); idx = np.arange(len(x_vals))
|
|
293
298
|
n = len(y_cols); width = 0.8 / max(n, 1)
|
|
294
299
|
for i, y in enumerate(y_cols):
|
|
@@ -298,6 +303,7 @@ class GraphPlotter:
|
|
|
298
303
|
ax.set_xticks(idx, [pd.to_datetime(t).strftime("%Y-%m-%d %H:%M") for t in x_vals], rotation=45)
|
|
299
304
|
|
|
300
305
|
def _bars_categorical(self, ax, df: pd.DataFrame, y_cols: Sequence[str]) -> None:
|
|
306
|
+
import numpy as np # lazy
|
|
301
307
|
seen = set(); ordered_x: List[Any] = []
|
|
302
308
|
for x in df["x"].tolist():
|
|
303
309
|
if x not in seen:
|
|
@@ -310,6 +316,7 @@ class GraphPlotter:
|
|
|
310
316
|
ax.set_xticks(idx, ordered_x, rotation=45)
|
|
311
317
|
|
|
312
318
|
def _lines_categorical(self, ax, df: pd.DataFrame, y_cols: Sequence[str]) -> None:
|
|
319
|
+
import numpy as np # lazy
|
|
313
320
|
seen = set(); ordered_x: List[Any] = []
|
|
314
321
|
for x in df["x"].tolist():
|
|
315
322
|
if x not in seen:
|
|
@@ -322,7 +329,7 @@ class GraphPlotter:
|
|
|
322
329
|
|
|
323
330
|
# ---------- Misc ----------
|
|
324
331
|
def _apply_args_dict(self) -> None:
|
|
325
|
-
import matplotlib.pyplot as plt
|
|
332
|
+
import matplotlib.pyplot as plt # lazy
|
|
326
333
|
for name, val in getattr(self.pms, "args_dict", {}).items():
|
|
327
334
|
fn = getattr(plt, name, None)
|
|
328
335
|
if callable(fn):
|
|
@@ -345,7 +352,7 @@ def graph_bar_line(obj, type):
|
|
|
345
352
|
Returns (fig, ax) for optional downstream tweaks (safe to ignore).
|
|
346
353
|
"""
|
|
347
354
|
# Lazy import (ensures MPL backend)
|
|
348
|
-
import matplotlib.pyplot as plt # noqa: F401
|
|
355
|
+
import matplotlib.pyplot as plt # noqa: F401 # lazy
|
|
349
356
|
|
|
350
357
|
# Normalize y_fields from string or list
|
|
351
358
|
raw_y = obj.y_field if isinstance(obj.y_field, str) else str(obj.y_field)
|
|
@@ -13,9 +13,11 @@ from pjk.sources.inline_source import InlineSource
|
|
|
13
13
|
from pjk.sources.user_source_factory import UserSourceFactory
|
|
14
14
|
from pjk.sources.parquet_source import ParquetSource
|
|
15
15
|
from pjk.sources.format_source import FormatSource
|
|
16
|
+
from pjk.sources.s3_select_source import S3SelectSource
|
|
16
17
|
|
|
17
18
|
COMPONENTS = {
|
|
18
19
|
'inline': InlineSource,
|
|
20
|
+
's3s': S3SelectSource,
|
|
19
21
|
'json': JsonSource,
|
|
20
22
|
'jsonl': JsonSource,
|
|
21
23
|
'csv': CSVSource,
|
|
@@ -35,6 +37,11 @@ class SourceFactory(ComponentFactory):
|
|
|
35
37
|
def create(self, token: str) -> Source:
|
|
36
38
|
token = token.strip()
|
|
37
39
|
|
|
40
|
+
# s3s is a pseudo source only in the above list to provide easy man page
|
|
41
|
+
# it's instantiated by the parser when <file>.s3s, so disallow standard search for it.
|
|
42
|
+
if token == 's3s':
|
|
43
|
+
return None
|
|
44
|
+
|
|
38
45
|
if InlineSource.is_inline(token):
|
|
39
46
|
return InlineSource(token)
|
|
40
47
|
|
|
@@ -44,6 +51,12 @@ class SourceFactory(ComponentFactory):
|
|
|
44
51
|
source = UserSourceFactory.create(ptok)
|
|
45
52
|
if source:
|
|
46
53
|
return source
|
|
54
|
+
|
|
55
|
+
# s3 select file
|
|
56
|
+
if ptok.pre_colon.endswith('.s3s'):
|
|
57
|
+
source = S3SelectSource(ptok, None)
|
|
58
|
+
if source:
|
|
59
|
+
return source
|
|
47
60
|
|
|
48
61
|
source_cls = self.get_component_class(ptok.pre_colon)
|
|
49
62
|
if source_cls and not issubclass(source_cls, FormatSource):
|
|
@@ -4,18 +4,15 @@
|
|
|
4
4
|
import json
|
|
5
5
|
from typing import Iterator, Dict, Any
|
|
6
6
|
|
|
7
|
-
import numpy as np
|
|
8
|
-
from pjk.usage import NoBindUsage
|
|
9
|
-
from pjk.components import Source
|
|
10
7
|
from pjk.sources.lazy_file import LazyFile
|
|
11
8
|
from pjk.sources.format_source import FormatSource
|
|
12
9
|
from pjk.log import logger
|
|
13
10
|
|
|
14
|
-
|
|
15
11
|
class NpySource(FormatSource):
|
|
16
12
|
extension = 'npy'
|
|
17
13
|
|
|
18
14
|
def __init__(self, lazy_file: LazyFile):
|
|
15
|
+
super().__init__(root=None)
|
|
19
16
|
self.lazy_file = lazy_file
|
|
20
17
|
self.num_vecs = 0
|
|
21
18
|
|
|
@@ -32,9 +29,11 @@ class NpySource(FormatSource):
|
|
|
32
29
|
|
|
33
30
|
try:
|
|
34
31
|
# Use mmap to avoid loading entire array in RAM at once.
|
|
32
|
+
import numpy as np #lazy import
|
|
35
33
|
arr = np.load(path, mmap_mode="r", allow_pickle=False)
|
|
36
34
|
except Exception as e:
|
|
37
35
|
logger.error(f"Failed to load .npy file at {path}: {e}")
|
|
36
|
+
raise Exception(f"Failed to load .npy file at {path}: {e}")
|
|
38
37
|
return
|
|
39
38
|
|
|
40
39
|
if arr.size == 0:
|