python-jack-knife 0.5.0__tar.gz → 0.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/PKG-INFO +1 -1
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/base.py +32 -12
- python_jack_knife-0.5.1/src/pjk/log.py +62 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/main.py +6 -2
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/parser.py +1 -1
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/head.py +3 -3
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/where.py +2 -10
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/registry.py +9 -2
- python_jack_knife-0.5.1/src/pjk/sinks/csv_sink.py +22 -0
- python_jack_knife-0.5.1/src/pjk/sinks/dir_sink.py +71 -0
- python_jack_knife-0.5.1/src/pjk/sinks/factory.py +55 -0
- python_jack_knife-0.5.1/src/pjk/sinks/format_sink.py +126 -0
- python_jack_knife-0.5.1/src/pjk/sinks/json_sink.py +14 -0
- python_jack_knife-0.5.1/src/pjk/sinks/s3_sink.py +90 -0
- python_jack_knife-0.5.1/src/pjk/sinks/s3_stream.py +134 -0
- python_jack_knife-0.5.1/src/pjk/sinks/tsv_sink.py +12 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/csv_source.py +3 -6
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/dir_source.py +28 -17
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/factory.py +6 -17
- python_jack_knife-0.5.1/src/pjk/sources/format_source.py +114 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/json_source.py +3 -7
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/parquet_source.py +3 -7
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/s3_source.py +40 -50
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/sql_source.py +4 -11
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/tsv_source.py +2 -6
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/version.py +1 -1
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/python_jack_knife.egg-info/PKG-INFO +1 -1
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/python_jack_knife.egg-info/SOURCES.txt +3 -1
- python_jack_knife-0.5.0/src/pjk/log.py +0 -67
- python_jack_knife-0.5.0/src/pjk/sinks/csv_sink.py +0 -33
- python_jack_knife-0.5.0/src/pjk/sinks/dir_sink.py +0 -59
- python_jack_knife-0.5.0/src/pjk/sinks/factory.py +0 -108
- python_jack_knife-0.5.0/src/pjk/sinks/json_sink.py +0 -23
- python_jack_knife-0.5.0/src/pjk/sinks/s3_sink.py +0 -100
- python_jack_knife-0.5.0/src/pjk/sinks/tsv_sink.py +0 -22
- python_jack_knife-0.5.0/src/pjk/sources/format_usage.py +0 -11
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/LICENSE +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/README.md +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/pyproject.toml +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/setup.cfg +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/__init__.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/common.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/man_page.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/__init__.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/denorm.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/factory.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/filter.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/join.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/let_reduce.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/map.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/move_field.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/postgres_pipe.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/remove_field.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/select.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/sort.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/tail.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/user_pipe_factory.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/__init__.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/ddb.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/devnull.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/expect.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/graph.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/graph_bar_line.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/graph_cumulative.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/graph_hist.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/graph_scatter.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/sinks.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/stdout.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/user_sink_factory.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/__init__.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/inline_source.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/lazy_file.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/lazy_file_local.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/lazy_file_s3.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/source_list.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/user_source_factory.py +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/python_jack_knife.egg-info/dependency_links.txt +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/python_jack_knife.egg-info/entry_points.txt +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/python_jack_knife.egg-info/requires.txt +0 -0
- {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/python_jack_knife.egg-info/top_level.txt +0 -0
|
@@ -152,16 +152,17 @@ class Usage:
|
|
|
152
152
|
lines.append(self.desc)
|
|
153
153
|
|
|
154
154
|
syntax_str = self.get_token_syntax() # might be ''
|
|
155
|
-
if
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
155
|
+
if not syntax_str:
|
|
156
|
+
return '\n'.join(lines)
|
|
157
|
+
|
|
158
|
+
lines.append('')
|
|
159
|
+
lines.append(f'syntax:')
|
|
160
|
+
lines.append(f' {self.get_token_syntax()}')
|
|
160
161
|
lines.extend(f"{line}" for line in self.get_arg_param_desc())
|
|
161
162
|
return '\n'.join(lines)
|
|
162
163
|
|
|
163
164
|
def get_token_syntax(self):
|
|
164
|
-
if self.syntax
|
|
165
|
+
if not self.syntax:
|
|
165
166
|
return self.syntax # else piece it together
|
|
166
167
|
|
|
167
168
|
token = f'{self.name}'
|
|
@@ -216,7 +217,10 @@ class Usage:
|
|
|
216
217
|
self.args[name] = self._get_val(val_str, is_num, valid_values)
|
|
217
218
|
except (ValueError, TypeError) as e:
|
|
218
219
|
raise TokenError.from_list([f"wrong value for '{name}' arg.", '', self.get_usage_text()])
|
|
219
|
-
|
|
220
|
+
|
|
221
|
+
self.bind_params(ptok)
|
|
222
|
+
|
|
223
|
+
def bind_params(self, ptok: ParsedToken):
|
|
220
224
|
for name, str_val in ptok.get_params().items():
|
|
221
225
|
usage = self.param_usages.get(name, None)
|
|
222
226
|
if not usage:
|
|
@@ -276,8 +280,6 @@ class KeyedSource(ABC):
|
|
|
276
280
|
return None
|
|
277
281
|
|
|
278
282
|
class Source(ABC):
|
|
279
|
-
is_format = False
|
|
280
|
-
|
|
281
283
|
@classmethod
|
|
282
284
|
def usage(cls):
|
|
283
285
|
return NoBindUsage(
|
|
@@ -296,7 +298,6 @@ class Source(ABC):
|
|
|
296
298
|
self._iter = iter(self)
|
|
297
299
|
return next(self._iter)
|
|
298
300
|
|
|
299
|
-
|
|
300
301
|
def deep_copy(self):
|
|
301
302
|
return None # Default: not copyable unless overridden
|
|
302
303
|
|
|
@@ -307,6 +308,7 @@ class Pipe(Source):
|
|
|
307
308
|
|
|
308
309
|
def __init__(self, ptok: ParsedToken, usage: Usage = None):
|
|
309
310
|
self.ptok = ptok
|
|
311
|
+
self.usage = usage
|
|
310
312
|
self.left = None # left source for convience
|
|
311
313
|
self.right = None # right source for convience
|
|
312
314
|
self.inputs: List[Source] = []
|
|
@@ -339,9 +341,22 @@ class Pipe(Source):
|
|
|
339
341
|
|
|
340
342
|
return clone
|
|
341
343
|
|
|
342
|
-
class
|
|
343
|
-
|
|
344
|
+
class DeepCopyPipe(Pipe):
    """Pipe base class that provides a generic ``deep_copy`` implementation.

    Subclasses must be constructible as ``SubClass(ptok, usage)``, because the
    copy is made by re-instantiating the concrete class with the stored
    ``ptok`` and ``usage``.
    """

    def deep_copy(self):
        """
        Generic deep_copy: clone the left source, re-instantiate this pipe's
        concrete class with the same ptok/usage, and attach the clone.

        Returns:
            The new pipe, or None when no left source is attached yet or the
            left source reports that it cannot be copied.
        """
        if self.left is None:
            # Nothing upstream to clone; report "not copyable" instead of
            # failing with AttributeError on None.
            return None

        source_clone = self.left.deep_copy()
        if not source_clone:
            return None

        # re-instantiate using the actual subclass
        pipe = type(self)(self.ptok, self.usage)
        pipe.add_source(source_clone)
        return pipe
|
|
358
|
+
|
|
359
|
+
class Sink(ABC):
|
|
345
360
|
@classmethod
|
|
346
361
|
def usage(cls):
|
|
347
362
|
return NoBindUsage(
|
|
@@ -356,6 +371,11 @@ class Sink(ABC):
|
|
|
356
371
|
|
|
357
372
|
def drain(self):
    """Run ``process()`` and then ``close()``.

    ``close()`` is called even when ``process()`` raises, so a sink that
    holds an open output handle releases it on failure as well.
    """
    try:
        self.process()
    finally:
        self.close()

# optional
def close(self):
    """Release resources held by the sink; the default does nothing."""
    pass
|
|
359
379
|
|
|
360
380
|
def print_info(self):
|
|
361
381
|
pass
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
import logging, os, tempfile
|
|
5
|
+
from logging.handlers import RotatingFileHandler
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger("pjk")

# Defaults used when the matching DJK_* variable is unset or malformed.
_DEFAULT_LOG_MAX_MB = 2.0
_DEFAULT_LOG_BACKUPS = 3


def _truthy(v: Optional[str]) -> bool:
    """Return True when *v* spells an affirmative flag (1/true/yes/on)."""
    return str(v).strip().lower() in ("1", "true", "yes", "on")


def _env_number(name: str, default, cast):
    """Read env var *name* through *cast*, returning *default* if unset or malformed."""
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        return cast(raw)
    except ValueError:
        # A typo in a tuning knob should not disable file logging.
        return default


def init(force: bool = False, level: Optional[int] = None):
    """
    Initialize 'pjk' logging.

    - Rotates at DJK_LOG_MAX_MB (default 2 MB), keeps DJK_LOG_BACKUPS (default 3).
      Malformed values for either fall back to the default.
    - Files under ~/.pjk/logs by default; override with DJK_LOG_DIR / DJK_LOG_FILE.
    - Set DJK_DEBUG=1|true|yes|on for DEBUG, else INFO (or pass explicit level).
    - If the log file cannot be set up, fall back to console logging
      (stderr → CloudWatch in AWS).
    - Set force=True to replace existing handlers; replaced handlers are closed.
    """
    if logger.handlers and not force:
        return

    # Detach and close old handlers; clearing the list alone would leave
    # their file descriptors open.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()

    level = level or (logging.DEBUG if _truthy(os.getenv("DJK_DEBUG")) else logging.INFO)
    formatter = logging.Formatter("[%(levelname)s] [%(threadName)s] %(message)s")

    fallback_reason = None
    try:
        # Preferred: rotating file handler under ~/.pjk/logs
        log_dir = Path(os.getenv("DJK_LOG_DIR", Path.home() / ".pjk" / "logs"))
        log_dir.mkdir(parents=True, exist_ok=True)

        log_file = log_dir / os.getenv("DJK_LOG_FILE", "pjk.log")
        max_mb = _env_number("DJK_LOG_MAX_MB", _DEFAULT_LOG_MAX_MB, float)
        backups = _env_number("DJK_LOG_BACKUPS", _DEFAULT_LOG_BACKUPS, int)

        handler = RotatingFileHandler(
            log_file,
            maxBytes=int(max_mb * 1024 * 1024),
            backupCount=backups,
            encoding="utf-8",
            delay=False,
        )
    except Exception as exc:
        # Deliberately broad: logging setup is best-effort and must not stop
        # the tool from running. The cause is reported in the warning below.
        handler = logging.StreamHandler()
        fallback_reason = exc

    handler.setLevel(level)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    logger.setLevel(level)
    # Do not propagate to root
    logger.propagate = False

    if fallback_reason is not None:
        logger.warning(
            "Falling back to console logging (log file not writable): %s",
            fallback_reason,
        )
|
|
@@ -24,8 +24,12 @@ def write_history(tokens):
|
|
|
24
24
|
log_path = ".pjk-history.txt"
|
|
25
25
|
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
|
|
26
26
|
command = " ".join(tokens)
|
|
27
|
-
|
|
28
|
-
|
|
27
|
+
|
|
28
|
+
try:
|
|
29
|
+
with open(log_path, "a") as f:
|
|
30
|
+
f.write(f"{timestamp}\tpjk {command}\n")
|
|
31
|
+
except (PermissionError, OSError):
|
|
32
|
+
pass
|
|
29
33
|
|
|
30
34
|
def execute_threaded(sinks):
|
|
31
35
|
# Choose a max thread limit (explicitly)
|
|
@@ -94,7 +94,7 @@ class ExpressionParser:
|
|
|
94
94
|
|
|
95
95
|
else: # unrecognized token
|
|
96
96
|
# could be sink in WRONG position, let's see for better error message
|
|
97
|
-
sink = self.registry.create_sink(token
|
|
97
|
+
sink = self.registry.create_sink(token)
|
|
98
98
|
if sink:
|
|
99
99
|
raise TokenError.from_list(['sink may only occur in final position.',
|
|
100
100
|
'pjk <source> [<pipe> ...] <sink>'])
|
|
@@ -4,9 +4,9 @@
|
|
|
4
4
|
# djk/pipes/head.py
|
|
5
5
|
|
|
6
6
|
from typing import Optional
|
|
7
|
-
from pjk.base import Pipe, ParsedToken, Usage
|
|
7
|
+
from pjk.base import Pipe, ParsedToken, Usage, DeepCopyPipe
|
|
8
8
|
|
|
9
|
-
class HeadPipe(
|
|
9
|
+
class HeadPipe(DeepCopyPipe):
|
|
10
10
|
@classmethod
|
|
11
11
|
def usage(cls):
|
|
12
12
|
usage = Usage(
|
|
@@ -19,7 +19,7 @@ class HeadPipe(Pipe):
|
|
|
19
19
|
return usage
|
|
20
20
|
|
|
21
21
|
def __init__(self, ptok: ParsedToken, usage: Usage):
|
|
22
|
-
super().__init__(ptok)
|
|
22
|
+
super().__init__(ptok, usage)
|
|
23
23
|
self.limit = usage.get_arg('limit')
|
|
24
24
|
self.count = 0
|
|
25
25
|
|
|
@@ -3,10 +3,10 @@
|
|
|
3
3
|
|
|
4
4
|
# djk/pipes/where.py
|
|
5
5
|
|
|
6
|
-
from pjk.base import Pipe, ParsedToken, NoBindUsage, Usage, UsageError
|
|
6
|
+
from pjk.base import Pipe, ParsedToken, NoBindUsage, Usage, UsageError, DeepCopyPipe
|
|
7
7
|
from pjk.common import SafeNamespace
|
|
8
8
|
|
|
9
|
-
class WherePipe(
|
|
9
|
+
class WherePipe(DeepCopyPipe):
|
|
10
10
|
@classmethod
|
|
11
11
|
def usage(cls):
|
|
12
12
|
usage = NoBindUsage(
|
|
@@ -39,11 +39,3 @@ class WherePipe(Pipe):
|
|
|
39
39
|
except Exception:
|
|
40
40
|
continue # ignore eval errors
|
|
41
41
|
|
|
42
|
-
def deep_copy(self):
|
|
43
|
-
source_clone = self.left.deep_copy()
|
|
44
|
-
if source_clone:
|
|
45
|
-
pipe = WherePipe(self.ptok, self.usage)
|
|
46
|
-
pipe.add_source(source_clone)
|
|
47
|
-
return pipe
|
|
48
|
-
else:
|
|
49
|
-
return None
|
|
@@ -30,7 +30,6 @@ class ComponentRegistry:
|
|
|
30
30
|
|
|
31
31
|
def register(self, name, comp):
|
|
32
32
|
if is_pipe(comp):
|
|
33
|
-
print('HELEELELELELELEEE')
|
|
34
33
|
if hasattr(comp, "usage"):
|
|
35
34
|
usage = comp.usage()
|
|
36
35
|
name = usage.name
|
|
@@ -131,11 +130,19 @@ def load_user_components(path=os.path.expanduser("~/.pjk/plugins")):
|
|
|
131
130
|
|
|
132
131
|
return sources, pipes, sinks
|
|
133
132
|
|
|
133
|
+
def iter_entry_points(group: str):
    """Return the installed entry points registered under *group*.

    Works with both shapes of ``importlib.metadata.entry_points()``: the
    selectable object (which offers ``select``) and the older mapping of
    group name to entry points.
    """
    discovered = importlib.metadata.entry_points()
    selector = getattr(discovered, "select", None)
    if selector is None:
        # Python 3.9 and older
        return discovered.get(group, [])
    # Python 3.10+ (importlib.metadata.EntryPoints)
    return selector(group=group)
|
|
140
|
+
|
|
134
141
|
def load_package_extras():
|
|
135
142
|
"""
|
|
136
143
|
Discover and import all installed pjk extras (via entry points).
|
|
137
144
|
"""
|
|
138
|
-
for ep in
|
|
145
|
+
for ep in iter_entry_points("pjk.package_extras"):
|
|
139
146
|
try:
|
|
140
147
|
importlib.import_module(ep.value)
|
|
141
148
|
print(f"[pjk] loaded package extra: {ep.name} -> {ep.value}")
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
import csv
|
|
5
|
+
from typing import IO, Dict, Any
|
|
6
|
+
from .format_sink import FormatSink
|
|
7
|
+
|
|
8
|
+
class CSVSink(FormatSink):
    """Format sink that writes records as delimited text with a header row.

    The column set and order are taken from the first record. No header is
    written when the input yields no records.
    """

    extension = "csv"

    def __init__(self, outfile: IO[str], delimiter: str = ','):
        """
        Args:
            outfile: open text stream that receives the rows.
            delimiter: field separator handed to ``csv.DictWriter``.
        """
        super().__init__(outfile=outfile)
        self.delimiter = delimiter

    def process(self) -> None:
        """Write a header from the first record, then one row per record."""
        writer = None
        for record in self.input:
            if writer is None:
                # Snapshot the first record's keys. Passing record.keys()
                # would give DictWriter a live view that follows later
                # mutation of that dict.
                writer = csv.DictWriter(
                    self.outfile,
                    fieldnames=list(record),
                    delimiter=self.delimiter,
                )
                writer.writeheader()
            writer.writerow(record)
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
import os, gzip, shutil
|
|
5
|
+
from pjk.base import Sink, ParsedToken, Usage
|
|
6
|
+
from typing import Optional, Type
|
|
7
|
+
from .format_sink import Sink
|
|
8
|
+
from pjk.log import logger
|
|
9
|
+
import gzip
|
|
10
|
+
|
|
11
|
+
class DirSink(Sink):
    """Write records into a local directory, one ``file-NNNN.<ext>[.gz]`` per sink.

    The primary sink (``fileno == 0``) empties or creates the directory.
    Clones made by ``deep_copy`` take the next file index and share that
    directory.
    """

    def __init__(self, sink_class: Type[Sink], path_no_ext: str, is_gz: bool, fileno: int = 0):
        """
        Args:
            sink_class: format sink class; instantiated with the open output
                stream, and its ``extension`` names the file suffix.
            path_no_ext: output directory.
            is_gz: write gzip-compressed files with a trailing ``.gz``.
            fileno: index used in this sink's file name.
        """
        super().__init__(None, None)
        self.sink_class = sink_class
        self.path_no_ext = path_no_ext
        self.is_gz = is_gz
        self.fileno = fileno
        self.num_files = 1

        if fileno == 0 and os.path.isdir(self.path_no_ext):
            # Only the primary sink clears old output. A clone doing so
            # could delete files its siblings have already written.
            self._clear_directory()
        else:
            os.makedirs(self.path_no_ext, exist_ok=True)

    def _clear_directory(self):
        """Remove every file, link and subdirectory inside the output directory."""
        for entry in os.listdir(self.path_no_ext):
            full = os.path.join(self.path_no_ext, entry)
            if os.path.isfile(full) or os.path.islink(full):
                os.unlink(full)
            elif os.path.isdir(full):
                shutil.rmtree(full)

    def process(self):
        """Stream this sink's input into its own file through ``sink_class``."""
        # build the base filename
        base = os.path.join(self.path_no_ext, f"file-{self.fileno:04d}")

        # include extension here (format sink name + gz logic)
        filename = f"{base}.{self.sink_class.extension}"
        if self.is_gz:
            filename += ".gz"

        opener = gzip.open if self.is_gz else open
        # newline="" leaves line endings to the format sink, matching how
        # single-file sinks are opened. The with-block closes the file even
        # if the format sink raises part-way through.
        with opener(filename, "wt", encoding="utf-8", newline="") as outfile:
            # create the format-specific sink with the open handle
            file_sink = self.sink_class(outfile)
            file_sink.add_source(self.input)

            logger.debug(f"in process sinking to local file: {filename}")
            file_sink.process()

    def deep_copy(self):
        """Return a sibling DirSink fed by a copy of the input, or None if the input cannot be copied."""
        # Ask the upstream source to duplicate itself
        source_clone = self.input.deep_copy()
        if source_clone is None:
            return None

        # Create a new DirSink with the next file index
        clone = DirSink(
            sink_class=self.sink_class,
            path_no_ext=self.path_no_ext,
            is_gz=self.is_gz,
            fileno=self.num_files,
        )

        # Wire up the cloned source to the new sink
        clone.add_source(source_clone)

        # Increment file counter for the next clone
        self.num_files += 1
        return clone
|
|
71
|
+
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
from typing import Callable
|
|
5
|
+
import os
|
|
6
|
+
import gzip
|
|
7
|
+
from pjk.base import Source, Sink, ParsedToken
|
|
8
|
+
from pjk.common import ComponentFactory
|
|
9
|
+
from pjk.sinks.stdout import StdoutSink
|
|
10
|
+
from pjk.sinks.json_sink import JsonSink
|
|
11
|
+
from pjk.sinks.devnull import DevNullSink
|
|
12
|
+
from pjk.sinks.graph import GraphSink
|
|
13
|
+
from pjk.sinks.csv_sink import CSVSink
|
|
14
|
+
from pjk.sinks.tsv_sink import TSVSink
|
|
15
|
+
from pjk.sinks.ddb import DDBSink
|
|
16
|
+
from pjk.sinks.dir_sink import DirSink
|
|
17
|
+
from pjk.sinks.s3_sink import S3Sink
|
|
18
|
+
from pjk.sinks.expect import ExpectSink
|
|
19
|
+
from pjk.sinks.format_sink import FormatSink
|
|
20
|
+
from pjk.sinks.user_sink_factory import UserSinkFactory
|
|
21
|
+
|
|
22
|
+
# Sink classes addressable by the token text before the colon.
COMPONENTS = {
    '-': StdoutSink,
    'devnull': DevNullSink,
    'graph': GraphSink,
    'ddb': DDBSink,
    'json': JsonSink,
    'csv': CSVSink,
    'tsv': TSVSink,
}


class SinkFactory(ComponentFactory):
    """Builds the sink named by a command-line token."""

    def __init__(self):
        super().__init__(COMPONENTS, 'sink')

    def create(self, token: str) -> Callable[[Source], Sink]:
        """Resolve *token* to a sink.

        Tried in order: the ``expect`` sink, a user ``.py`` sink, a registered
        sink that is not a format sink, and finally ``FormatSink.create`` for
        everything else.
        """
        ptok = ParsedToken(token.strip())

        # non-usage sink (bind incompatible)
        if ptok.pre_colon == 'expect':
            return ExpectSink(ptok, None)

        if ptok.pre_colon.endswith('.py'):
            user_sink = UserSinkFactory.create(ptok)
            if user_sink:
                return user_sink

        registered = self.components.get(ptok.pre_colon)
        if not registered or issubclass(registered, FormatSink):
            # Unregistered names and format sinks are resolved from the
            # token's path/extension by FormatSink.
            return FormatSink.create(ptok, COMPONENTS)

        bound_usage = registered.usage()
        bound_usage.bind(ptok)
        return registered(ptok, bound_usage)
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
from pjk.base import Sink, ParsedToken, NoBindUsage
|
|
2
|
+
from pjk.sinks.s3_sink import S3Sink
|
|
3
|
+
from pjk.sinks.dir_sink import DirSink
|
|
4
|
+
from typing import IO
|
|
5
|
+
import re
|
|
6
|
+
import gzip
|
|
7
|
+
|
|
8
|
+
class SinkFormatUsage(NoBindUsage):
    """Usage description shared by the format sinks (json, csv, tsv, ...)."""

    def __init__(self, name: str, component_class: type, desc_override: str = None):
        """
        Args:
            name: format name; also used as the file extension in the examples.
            component_class: the sink class this usage describes.
            desc_override: description to use instead of the generated one.
        """
        if desc_override is None:
            # This usage describes a sink; the earlier text said "source".
            desc = (
                f'{name} sink for s3 and local files/directories.\n'
                "s3 defaults to 'json.gz', others require format param"
            )
        else:
            desc = desc_override
        super().__init__(name, desc, component_class)

        self.def_syntax("")  # don't use generated syntax for these, rely on examples
        self.def_param(
            'format',
            'file format',
            is_num=False,
            valid_values={'json', 'csv', 'tsv', 'json.gz', 'tsv.gz', 'csv.gz'},
            default='json.gz',
        )
        self.def_example(expr_tokens=["{hello: 'world'}", f"myfile.{name}"], expect=None)
        self.def_example(expr_tokens=["{hello: 'world'}", f"{name}:mydir"], expect=None)
        self.def_example(expr_tokens=["{hello: 'world'}", f"s3://mybucket/myfile.{name}"], expect=None)
        self.def_example(expr_tokens=["{hello: 'world'}", f"s3://mybucket/myfiles@format={name}"], expect=None)
|
|
19
|
+
|
|
20
|
+
class FormatSink(Sink):
    """Base class for sinks that serialize records to an open text stream.

    Subclasses set ``extension`` (the format name / file suffix) and
    implement ``process``. ``create`` turns a sink token into the matching
    single-file, local-directory or S3 sink.
    """

    extension: str = None
    desc_override = None

    # [<pre_colon>:]<path>[.<ext>[.gz]] -- compiled once for all calls.
    _TOKEN_PATTERN = re.compile(
        r'^(?:(?P<pre_colon>[^:]+):)?'      # optional precolon
        r'(?P<path>[^:]+?)'                 # main path
        r'(?:\.(?P<ext>\w+(?:\.gz)?))?$'    # optional extension, e.g. json, csv, json.gz
    )

    @classmethod
    def usage(cls):
        return SinkFormatUsage(name=cls.extension,
                               component_class=cls,
                               desc_override=cls.desc_override)

    def __init__(self, outfile: IO[str]):
        super().__init__(None, None)
        self.outfile = outfile

    def close(self):
        """Close the output stream."""
        self.outfile.close()

    @classmethod
    def get_format_gz(cls, input: str):
        """Split a format spec such as ``'csv.gz'`` into ``('csv', True)``.

        Returns:
            ``(format, is_gz)``; ``is_gz`` is True when the spec ends in ``.gz``.
        """
        if input.endswith('.gz'):
            return input[:-3], True
        return input, False

    @staticmethod
    def _format_class(sinks, fmt):
        """Return the FormatSink subclass registered for *fmt*, or None."""
        candidate = sinks.get(fmt)
        if candidate and issubclass(candidate, FormatSink):
            return candidate
        return None

    @classmethod
    def create(cls, ptok: ParsedToken, sinks):
        """
        Build the sink described by *ptok*, looking format classes up in *sinks*.

        use cases covered:
        1) foo.<format>                 # local single file
        2) <format>:foo                 # local directory
        3) s3://bucket/prefix.<format>  # s3 single file
        4) s3://bucket/prefix           # s3 directory (@format=<format> parameter, default json.gz)

        format = json, csv, tsv, and also json.gz etc.

        Returns:
            The sink, or None when the token does not describe a format sink.

        Raises:
            ValueError: a local directory sink was given a file extension, or
                the requested format is not a registered format sink.
        """
        # we don't use framework token parsing (except for params) cuz too complicated
        match = cls._TOKEN_PATTERN.match(ptok.all_but_params)
        if not match:
            return None

        pre_colon = match.group('pre_colon')
        path_no_ext = match.group('path')
        ext = match.group('ext')

        usage = cls.usage()
        usage.bind_params(ptok)  # only bind params

        if pre_colon and pre_colon != 's3':  # local dir case
            fmt, is_gz = cls.get_format_gz(pre_colon)
            sink_class = cls._format_class(sinks, fmt)
            if sink_class is None:
                return None
            if ext:
                raise ValueError(
                    f"local directory sink '{pre_colon}:' expects a directory path, "
                    f"not a file name with extension '.{ext}'"
                )
            return DirSink(sink_class, path_no_ext, is_gz, fileno=0)

        if ext and not pre_colon:  # single local file case
            fmt, is_gz = cls.get_format_gz(ext)
            sink_class = cls._format_class(sinks, fmt)
            if sink_class is None:
                raise ValueError(f"unrecognized output file format '.{ext}'")

            filename = f'{path_no_ext}.{fmt}'
            if is_gz:
                filename += '.gz'

            # open the output file stream; the returned sink owns it and
            # releases it in close()
            opener = gzip.open if is_gz else open
            outfile = opener(filename, "wt", encoding="utf-8", newline="")

            # instantiate the sink with the prepared stream
            return sink_class(outfile)

        if pre_colon == 's3':
            # An extension means a single object; otherwise the format comes
            # from the 'format' param.
            spec = ext if ext else usage.get_param('format')
            fmt, is_gz = cls.get_format_gz(spec)
            sink_class = cls._format_class(sinks, fmt)
            if sink_class is None:
                raise ValueError(f"unrecognized s3 output format '{spec}'")

            fileno = -1 if ext else 0  # -1 tells s3 single file, no threading
            return S3Sink(sink_class, path_no_ext, is_gz, fileno)

        return None
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
from typing import IO
|
|
6
|
+
from .format_sink import FormatSink
|
|
7
|
+
|
|
8
|
+
class JsonSink(FormatSink):
    """Format sink that writes one JSON document per line."""

    extension = 'json'

    def process(self) -> None:
        """Serialize each input record with ``json.dumps`` and write it as a line."""
        emit = self.outfile.write
        for record in self.input:
            line = json.dumps(record)
            emit(line + "\n")
        # Caller (DirSink/S3Sink) owns closing the outfile
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2025 Mike Schultz
|
|
3
|
+
|
|
4
|
+
import io
|
|
5
|
+
import gzip
|
|
6
|
+
from typing import Optional, Type
|
|
7
|
+
from pjk.base import Source, Sink
|
|
8
|
+
from pjk.log import logger
|
|
9
|
+
from pjk.sinks.s3_stream import S3MultipartWriter
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class S3Sink(Sink):
    """
    Write records to S3 in the given <format>.

    - Folder mode (path without extension):
        s3://bucket/prefix  → prefix/file-0000.ext, prefix/file-0001.ext, ...
    - Single-file mode (path ends with .ext or .ext.gz):
        s3://bucket/prefix/output.csv[.gz]

    Constructor args:
      - sink_class:  format sink class; instantiated with the text stream,
                     and its ``extension`` names the object suffix
      - path_no_ext: '[//]bucket/path/to/files' without the format extension
      - is_gz:       gzip-compress the object and append '.gz'
      - fileno:      file index in folder mode; -1 selects single-file mode
    """

    _FILENAME_BASE: str = "file"
    _FILENAME_DIGITS: int = 4

    def __init__(self, sink_class: Type[Sink], path_no_ext: str, is_gz: bool, fileno: int):
        # Initialize the Sink base the same way DirSink and FormatSink do.
        super().__init__(None, None)
        self.path_no_ext = path_no_ext if not path_no_ext.startswith('//') else path_no_ext[2:]  # strip leading //
        self.sink_class = sink_class
        self.is_gz = is_gz
        self.fileno = fileno
        self.is_single_file = fileno == -1
        if self.path_no_ext.endswith('/') and not self.is_single_file:
            self.path_no_ext = self.path_no_ext[:-1]

        self.num_files = 1

    def _build_object_key(self, index: int) -> str:
        """Return 'bucket/key' for this sink; *index* is used only in folder mode."""
        if self.is_single_file:
            file_name = f'{self.path_no_ext}.{self.sink_class.extension}'
        else:
            file_name = f"{self.path_no_ext}/{self._FILENAME_BASE}-{index:0{self._FILENAME_DIGITS}d}.{self.sink_class.extension}"

        if self.is_gz:
            file_name += ".gz"

        return file_name

    def _stream_records(self, binary_stream, message: str):
        """Wrap *binary_stream* as UTF-8 text and run the format sink over the input."""
        with io.TextIOWrapper(binary_stream, encoding="utf-8", newline="") as outfile:
            file_sink = self.sink_class(outfile)
            file_sink.add_source(self.input)
            logger.debug(message)
            file_sink.process()

    def process(self):
        """Stream the input to this sink's S3 object.

        Raises:
            ValueError: the path does not contain both a bucket and a key.
        """
        object_key = self._build_object_key(self.fileno)
        bucket, sep, key = object_key.partition("/")
        if not (sep and bucket and key):
            raise ValueError(f"s3 sink path must be 'bucket/key', got '{object_key}'")

        with S3MultipartWriter(bucket, key) as writer:
            if self.is_gz:
                # gzip needs a binary sink → use writer directly
                with gzip.GzipFile(fileobj=writer, mode="wb") as gz:
                    self._stream_records(gz, f"S3Sink streaming GZ to s3://{bucket}/{key}")
            else:
                # plain text path
                self._stream_records(writer, f"S3Sink streaming to s3://{bucket}/{key}")

    def deep_copy(self):
        """Return a sibling sink for the next file index, or None when fan-out is not possible."""
        if self.is_single_file:
            # Single-file mode: no fanout allowed
            return None

        source_clone: Optional[Source] = self.input.deep_copy()
        if not source_clone:
            return None

        clone = S3Sink(
            sink_class=self.sink_class,
            path_no_ext=self.path_no_ext,
            is_gz=self.is_gz,
            fileno=self.num_files,
        )
        clone.add_source(source_clone)

        self.num_files += 1
        return clone
|