python-jack-knife 0.5.0__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/PKG-INFO +1 -1
  2. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/base.py +32 -12
  3. python_jack_knife-0.5.1/src/pjk/log.py +62 -0
  4. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/main.py +6 -2
  5. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/parser.py +1 -1
  6. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/head.py +3 -3
  7. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/where.py +2 -10
  8. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/registry.py +9 -2
  9. python_jack_knife-0.5.1/src/pjk/sinks/csv_sink.py +22 -0
  10. python_jack_knife-0.5.1/src/pjk/sinks/dir_sink.py +71 -0
  11. python_jack_knife-0.5.1/src/pjk/sinks/factory.py +55 -0
  12. python_jack_knife-0.5.1/src/pjk/sinks/format_sink.py +126 -0
  13. python_jack_knife-0.5.1/src/pjk/sinks/json_sink.py +14 -0
  14. python_jack_knife-0.5.1/src/pjk/sinks/s3_sink.py +90 -0
  15. python_jack_knife-0.5.1/src/pjk/sinks/s3_stream.py +134 -0
  16. python_jack_knife-0.5.1/src/pjk/sinks/tsv_sink.py +12 -0
  17. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/csv_source.py +3 -6
  18. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/dir_source.py +28 -17
  19. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/factory.py +6 -17
  20. python_jack_knife-0.5.1/src/pjk/sources/format_source.py +114 -0
  21. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/json_source.py +3 -7
  22. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/parquet_source.py +3 -7
  23. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/s3_source.py +40 -50
  24. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/sql_source.py +4 -11
  25. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/tsv_source.py +2 -6
  26. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/version.py +1 -1
  27. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/python_jack_knife.egg-info/PKG-INFO +1 -1
  28. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/python_jack_knife.egg-info/SOURCES.txt +3 -1
  29. python_jack_knife-0.5.0/src/pjk/log.py +0 -67
  30. python_jack_knife-0.5.0/src/pjk/sinks/csv_sink.py +0 -33
  31. python_jack_knife-0.5.0/src/pjk/sinks/dir_sink.py +0 -59
  32. python_jack_knife-0.5.0/src/pjk/sinks/factory.py +0 -108
  33. python_jack_knife-0.5.0/src/pjk/sinks/json_sink.py +0 -23
  34. python_jack_knife-0.5.0/src/pjk/sinks/s3_sink.py +0 -100
  35. python_jack_knife-0.5.0/src/pjk/sinks/tsv_sink.py +0 -22
  36. python_jack_knife-0.5.0/src/pjk/sources/format_usage.py +0 -11
  37. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/LICENSE +0 -0
  38. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/README.md +0 -0
  39. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/pyproject.toml +0 -0
  40. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/setup.cfg +0 -0
  41. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/__init__.py +0 -0
  42. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/common.py +0 -0
  43. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/man_page.py +0 -0
  44. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/__init__.py +0 -0
  45. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/denorm.py +0 -0
  46. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/factory.py +0 -0
  47. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/filter.py +0 -0
  48. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/join.py +0 -0
  49. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/let_reduce.py +0 -0
  50. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/map.py +0 -0
  51. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/move_field.py +0 -0
  52. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/postgres_pipe.py +0 -0
  53. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/remove_field.py +0 -0
  54. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/select.py +0 -0
  55. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/sort.py +0 -0
  56. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/tail.py +0 -0
  57. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/pipes/user_pipe_factory.py +0 -0
  58. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/__init__.py +0 -0
  59. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/ddb.py +0 -0
  60. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/devnull.py +0 -0
  61. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/expect.py +0 -0
  62. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/graph.py +0 -0
  63. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/graph_bar_line.py +0 -0
  64. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/graph_cumulative.py +0 -0
  65. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/graph_hist.py +0 -0
  66. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/graph_scatter.py +0 -0
  67. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/sinks.py +0 -0
  68. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/stdout.py +0 -0
  69. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sinks/user_sink_factory.py +0 -0
  70. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/__init__.py +0 -0
  71. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/inline_source.py +0 -0
  72. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/lazy_file.py +0 -0
  73. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/lazy_file_local.py +0 -0
  74. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/lazy_file_s3.py +0 -0
  75. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/source_list.py +0 -0
  76. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/pjk/sources/user_source_factory.py +0 -0
  77. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/python_jack_knife.egg-info/dependency_links.txt +0 -0
  78. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/python_jack_knife.egg-info/entry_points.txt +0 -0
  79. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/python_jack_knife.egg-info/requires.txt +0 -0
  80. {python_jack_knife-0.5.0 → python_jack_knife-0.5.1}/src/python_jack_knife.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-jack-knife
3
- Version: 0.5.0
3
+ Version: 0.5.1
4
4
  Summary: Python Jack Knife – a command line data processor
5
5
  Author-email: Mike Schultz <mike.schultz@gmail.com>
6
6
  License:
@@ -152,16 +152,17 @@ class Usage:
152
152
  lines.append(self.desc)
153
153
 
154
154
  syntax_str = self.get_token_syntax() # might be ''
155
- if len(syntax_str) > 0:
156
- lines.append('')
157
- lines.append(f'syntax:')
158
- lines.append(f' {self.get_token_syntax()}')
159
-
155
+ if not syntax_str:
156
+ return '\n'.join(lines)
157
+
158
+ lines.append('')
159
+ lines.append(f'syntax:')
160
+ lines.append(f' {self.get_token_syntax()}')
160
161
  lines.extend(f"{line}" for line in self.get_arg_param_desc())
161
162
  return '\n'.join(lines)
162
163
 
163
164
  def get_token_syntax(self):
164
- if self.syntax != None:
165
+ if not self.syntax:
165
166
  return self.syntax # else piece it together
166
167
 
167
168
  token = f'{self.name}'
@@ -216,7 +217,10 @@ class Usage:
216
217
  self.args[name] = self._get_val(val_str, is_num, valid_values)
217
218
  except (ValueError, TypeError) as e:
218
219
  raise TokenError.from_list([f"wrong value for '{name}' arg.", '', self.get_usage_text()])
219
-
220
+
221
+ self.bind_params(ptok)
222
+
223
+ def bind_params(self, ptok: ParsedToken):
220
224
  for name, str_val in ptok.get_params().items():
221
225
  usage = self.param_usages.get(name, None)
222
226
  if not usage:
@@ -276,8 +280,6 @@ class KeyedSource(ABC):
276
280
  return None
277
281
 
278
282
  class Source(ABC):
279
- is_format = False
280
-
281
283
  @classmethod
282
284
  def usage(cls):
283
285
  return NoBindUsage(
@@ -296,7 +298,6 @@ class Source(ABC):
296
298
  self._iter = iter(self)
297
299
  return next(self._iter)
298
300
 
299
-
300
301
  def deep_copy(self):
301
302
  return None # Default: not copyable unless overridden
302
303
 
@@ -307,6 +308,7 @@ class Pipe(Source):
307
308
 
308
309
  def __init__(self, ptok: ParsedToken, usage: Usage = None):
309
310
  self.ptok = ptok
311
+ self.usage = usage
310
312
  self.left = None # left source for convience
311
313
  self.right = None # right source for convience
312
314
  self.inputs: List[Source] = []
@@ -339,9 +341,22 @@ class Pipe(Source):
339
341
 
340
342
  return clone
341
343
 
342
- class Sink(ABC):
343
- is_format = False
344
class DeepCopyPipe(Pipe):
    """Pipe base class that supplies a generic ``deep_copy``.

    Subclasses must be constructible as ``SubClass(ptok, usage)``, because
    that is how the copy is re-instantiated.
    """

    def deep_copy(self):
        """
        Generic deep_copy: clone left source, re-instantiate
        this pipe class with the same ptok/usage, and attach.

        Returns:
            A new pipe of the same concrete class wired to a clone of the
            left source, or None when there is no left source yet or the
            left source reports that it cannot be copied.
        """
        # A pipe that has not been wired to a source cannot be cloned;
        # report "not copyable" instead of raising AttributeError on None.
        if self.left is None:
            return None

        source_clone = self.left.deep_copy()
        if not source_clone:
            return None

        # re-instantiate using the actual subclass
        pipe = type(self)(self.ptok, self.usage)
        pipe.add_source(source_clone)
        return pipe
358
+
359
+ class Sink(ABC):
345
360
  @classmethod
346
361
  def usage(cls):
347
362
  return NoBindUsage(
@@ -356,6 +371,11 @@ class Sink(ABC):
356
371
 
357
372
  def drain(self):
358
373
  self.process()
374
+ self.close()
375
+
376
+ # optional
377
+ def close(self):
378
+ pass
359
379
 
360
380
  def print_info(self):
361
381
  pass
@@ -0,0 +1,62 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ import logging, os, tempfile
5
+ from logging.handlers import RotatingFileHandler
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+ logger = logging.getLogger("pjk")
10
+
11
+ def _truthy(v: Optional[str]) -> bool:
12
+ return str(v).lower() in ("1", "true", "yes", "on")
13
+
14
def init(force: bool = False, level: Optional[int] = None):
    """
    Initialize 'pjk' logging.

    - Rotates at DJK_LOG_MAX_MB (default 2 MB), keeps DJK_LOG_BACKUPS (default 3).
    - Files under ~/.pjk/logs by default; override with DJK_LOG_DIR / DJK_LOG_FILE.
    - Set DJK_DEBUG=1|true|yes for DEBUG, else INFO (or pass explicit level).
    - If the file handler cannot be set up, fall back to console logging
      (stderr → CloudWatch in AWS).
    - Set force=True to replace existing handlers.

    Args:
        force: when False and the 'pjk' logger already has handlers, do nothing.
        level: explicit logging level; None means "derive from DJK_DEBUG".
    """
    if logger.handlers and not force:
        return

    # Close handlers as they are removed so a forced re-init does not leak
    # the previously opened log file.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()

    # `is None` rather than `or`: an explicit level of 0 (logging.NOTSET)
    # must not be silently replaced by the environment default.
    if level is None:
        level = logging.DEBUG if _truthy(os.getenv("DJK_DEBUG")) else logging.INFO
    formatter = logging.Formatter("[%(levelname)s] [%(threadName)s] %(message)s")

    # Configure the logger itself before any message is emitted so the
    # fallback warning below honours the level and is not propagated to root.
    logger.setLevel(level)
    logger.propagate = False

    try:
        # Preferred: rotating file handler under ~/.pjk/logs
        log_dir = Path(os.getenv("DJK_LOG_DIR", Path.home() / ".pjk" / "logs"))
        log_dir.mkdir(parents=True, exist_ok=True)

        log_file = log_dir / os.getenv("DJK_LOG_FILE", "pjk.log")
        max_bytes = int(float(os.getenv("DJK_LOG_MAX_MB", "2")) * 1024 * 1024)
        backups = int(os.getenv("DJK_LOG_BACKUPS", "3"))

        fh = RotatingFileHandler(
            log_file,
            maxBytes=max_bytes,
            backupCount=backups,
            encoding="utf-8",
            delay=False,
        )
        fh.setLevel(level)
        fh.setFormatter(formatter)
        logger.addHandler(fh)
    except Exception as exc:
        # Deliberately broad: logging setup is best-effort and must never
        # stop the program. Fallback: console handler.
        ch = logging.StreamHandler()
        ch.setLevel(level)
        ch.setFormatter(formatter)
        logger.addHandler(ch)
        logger.warning("Falling back to console logging (log file not writable)")
        # Keep the real cause available (bad env value, permissions, ...).
        logger.debug("file logging setup failed: %r", exc)
@@ -24,8 +24,12 @@ def write_history(tokens):
24
24
  log_path = ".pjk-history.txt"
25
25
  timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
26
26
  command = " ".join(tokens)
27
- with open(log_path, "a") as f:
28
- f.write(f"{timestamp}\tpjk {command}\n")
27
+
28
+ try:
29
+ with open(log_path, "a") as f:
30
+ f.write(f"{timestamp}\tpjk {command}\n")
31
+ except (PermissionError, OSError):
32
+ pass
29
33
 
30
34
  def execute_threaded(sinks):
31
35
  # Choose a max thread limit (explicitly)
@@ -94,7 +94,7 @@ class ExpressionParser:
94
94
 
95
95
  else: # unrecognized token
96
96
  # could be sink in WRONG position, let's see for better error message
97
- sink = self.registry.create_sink(token, None)
97
+ sink = self.registry.create_sink(token)
98
98
  if sink:
99
99
  raise TokenError.from_list(['sink may only occur in final position.',
100
100
  'pjk <source> [<pipe> ...] <sink>'])
@@ -4,9 +4,9 @@
4
4
  # djk/pipes/head.py
5
5
 
6
6
  from typing import Optional
7
- from pjk.base import Pipe, ParsedToken, Usage
7
+ from pjk.base import Pipe, ParsedToken, Usage, DeepCopyPipe
8
8
 
9
- class HeadPipe(Pipe):
9
+ class HeadPipe(DeepCopyPipe):
10
10
  @classmethod
11
11
  def usage(cls):
12
12
  usage = Usage(
@@ -19,7 +19,7 @@ class HeadPipe(Pipe):
19
19
  return usage
20
20
 
21
21
  def __init__(self, ptok: ParsedToken, usage: Usage):
22
- super().__init__(ptok)
22
+ super().__init__(ptok, usage)
23
23
  self.limit = usage.get_arg('limit')
24
24
  self.count = 0
25
25
 
@@ -3,10 +3,10 @@
3
3
 
4
4
  # djk/pipes/where.py
5
5
 
6
- from pjk.base import Pipe, ParsedToken, NoBindUsage, Usage, UsageError
6
+ from pjk.base import Pipe, ParsedToken, NoBindUsage, Usage, UsageError, DeepCopyPipe
7
7
  from pjk.common import SafeNamespace
8
8
 
9
- class WherePipe(Pipe):
9
+ class WherePipe(DeepCopyPipe):
10
10
  @classmethod
11
11
  def usage(cls):
12
12
  usage = NoBindUsage(
@@ -39,11 +39,3 @@ class WherePipe(Pipe):
39
39
  except Exception:
40
40
  continue # ignore eval errors
41
41
 
42
- def deep_copy(self):
43
- source_clone = self.left.deep_copy()
44
- if source_clone:
45
- pipe = WherePipe(self.ptok, self.usage)
46
- pipe.add_source(source_clone)
47
- return pipe
48
- else:
49
- return None
@@ -30,7 +30,6 @@ class ComponentRegistry:
30
30
 
31
31
  def register(self, name, comp):
32
32
  if is_pipe(comp):
33
- print('HELEELELELELELEEE')
34
33
  if hasattr(comp, "usage"):
35
34
  usage = comp.usage()
36
35
  name = usage.name
@@ -131,11 +130,19 @@ def load_user_components(path=os.path.expanduser("~/.pjk/plugins")):
131
130
 
132
131
  return sources, pipes, sinks
133
132
 
133
def iter_entry_points(group: str):
    """Return the installed entry points registered under *group*.

    Handles both shapes returned by ``importlib.metadata.entry_points()``:
    objects exposing ``select`` (Python 3.10+) and the older mapping of
    group name to entry points (Python 3.9 and older).
    """
    discovered = importlib.metadata.entry_points()
    if not hasattr(discovered, "select"):
        # Python 3.9 and older: plain mapping keyed by group name
        return discovered.get(group, [])
    # Python 3.10+ (importlib.metadata.EntryPoints)
    return discovered.select(group=group)
140
+
134
141
  def load_package_extras():
135
142
  """
136
143
  Discover and import all installed pjk extras (via entry points).
137
144
  """
138
- for ep in importlib.metadata.entry_points(group="pjk.package_extras"):
145
+ for ep in iter_entry_points("pjk.package_extras"):
139
146
  try:
140
147
  importlib.import_module(ep.value)
141
148
  print(f"[pjk] loaded package extra: {ep.name} -> {ep.value}")
@@ -0,0 +1,22 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ import csv
5
+ from typing import IO, Dict, Any
6
+ from .format_sink import FormatSink
7
+
8
class CSVSink(FormatSink):
    """Format sink that writes records as delimited text with a header row."""

    extension = "csv"

    def __init__(self, outfile: IO[str], delimiter: str = ','):
        """
        Args:
            outfile: open text stream the rows are written to.
            delimiter: field separator handed to ``csv.DictWriter``.
        """
        super().__init__(outfile=outfile)
        self.delimiter = delimiter

    def process(self) -> None:
        """Write every input record as one row.

        The header is taken from the keys of the first record. With the
        default ``csv.DictWriter`` settings, a later record carrying a key
        that is not in the header raises ValueError, and a missing key is
        written as an empty field. Nothing is written when the input is empty.
        """
        writer = None
        for record in self.input:
            if writer is None:
                # Snapshot the keys: a live dict_keys view would change the
                # header columns if the first record were mutated later, and
                # DictWriter documents `fieldnames` as a sequence.
                writer = csv.DictWriter(
                    self.outfile,
                    fieldnames=list(record.keys()),
                    delimiter=self.delimiter,
                )
                writer.writeheader()
            writer.writerow(record)
@@ -0,0 +1,71 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ import os, gzip, shutil
5
+ from pjk.base import Sink, ParsedToken, Usage
6
+ from typing import Optional, Type
7
+ from .format_sink import Sink
8
+ from pjk.log import logger
9
+ import gzip
10
+
11
class DirSink(Sink):
    """Sink that writes its input to one numbered file inside a local directory.

    Files are named ``file-NNNN.<extension>[.gz]``, where the extension comes
    from ``sink_class.extension``. ``deep_copy`` hands out further instances
    that target the same directory with increasing file numbers.
    """

    def __init__(self, sink_class: Type[Sink], path_no_ext: str, is_gz: bool,
                 fileno: int = 0, clear_dir: bool = True):
        """
        Args:
            sink_class: format sink class instantiated with the open file.
            path_no_ext: output directory path.
            is_gz: write gzip-compressed text and append ``.gz`` to the name.
            fileno: index used in this instance's file name.
            clear_dir: when True (default), empty an existing directory.
                Clones pass False so they never delete sibling output.
        """
        super().__init__(None, None)
        self.sink_class = sink_class
        self.path_no_ext = path_no_ext
        self.is_gz = is_gz
        self.fileno = fileno
        self.num_files = 1  # next file index handed out by deep_copy

        if clear_dir and os.path.isdir(self.path_no_ext):
            self._empty_directory()
        else:
            os.makedirs(self.path_no_ext, exist_ok=True)

    def _empty_directory(self):
        """Remove every file, symlink and sub-directory inside the output directory."""
        for entry in os.listdir(self.path_no_ext):
            full = os.path.join(self.path_no_ext, entry)
            if os.path.isfile(full) or os.path.islink(full):
                os.unlink(full)
            elif os.path.isdir(full):
                shutil.rmtree(full)

    def process(self):
        """Open this instance's output file and let the format sink write to it."""
        # build the base filename
        base = os.path.join(self.path_no_ext, f"file-{self.fileno:04d}")

        # include extension here (format sink name + gz logic)
        filename = f"{base}.{self.sink_class.extension}"
        if self.is_gz:
            filename += ".gz"

        opener = gzip.open if self.is_gz else open
        # `with` guarantees the handle is closed even if the format sink
        # raises; newline="" matches how the other sinks open text streams
        # and is what the csv module expects.
        with opener(filename, "wt", encoding="utf-8", newline="") as outfile:
            # create the format-specific sink with the open handle
            file_sink = self.sink_class(outfile)
            file_sink.add_source(self.input)

            logger.debug(f"in process sinking to local file: {filename}")
            file_sink.process()

    def deep_copy(self):
        """Return a DirSink for the next file index, or None if the input cannot be copied."""
        # Ask the upstream source to duplicate itself
        source_clone = self.input.deep_copy()
        if source_clone is None:
            return None

        # Create a new DirSink with the next file index; it must not wipe
        # the directory this sink already prepared.
        clone = DirSink(
            sink_class=self.sink_class,
            path_no_ext=self.path_no_ext,
            is_gz=self.is_gz,
            fileno=self.num_files,
            clear_dir=False,
        )

        # Wire up the cloned source to the new sink
        clone.add_source(source_clone)

        # Increment file counter for the next clone
        self.num_files += 1
        return clone
71
+
@@ -0,0 +1,55 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ from typing import Callable
5
+ import os
6
+ import gzip
7
+ from pjk.base import Source, Sink, ParsedToken
8
+ from pjk.common import ComponentFactory
9
+ from pjk.sinks.stdout import StdoutSink
10
+ from pjk.sinks.json_sink import JsonSink
11
+ from pjk.sinks.devnull import DevNullSink
12
+ from pjk.sinks.graph import GraphSink
13
+ from pjk.sinks.csv_sink import CSVSink
14
+ from pjk.sinks.tsv_sink import TSVSink
15
+ from pjk.sinks.ddb import DDBSink
16
+ from pjk.sinks.dir_sink import DirSink
17
+ from pjk.sinks.s3_sink import S3Sink
18
+ from pjk.sinks.expect import ExpectSink
19
+ from pjk.sinks.format_sink import FormatSink
20
+ from pjk.sinks.user_sink_factory import UserSinkFactory
21
+
22
+ COMPONENTS = {
23
+ '-': StdoutSink,
24
+ 'devnull': DevNullSink,
25
+ 'graph': GraphSink,
26
+ 'ddb': DDBSink,
27
+ 'json': JsonSink,
28
+ 'csv': CSVSink,
29
+ 'tsv': TSVSink,
30
+ }
31
+
32
class SinkFactory(ComponentFactory):
    """Component factory that builds the sink named by a command-line token."""

    def __init__(self):
        super().__init__(COMPONENTS, 'sink')

    def create(self, token: str) -> Callable[[Source], Sink]:
        """Resolve *token* to a sink.

        Resolution order: the ``expect`` sink, user ``.py`` sinks, registered
        non-format sinks (bound through their usage), and finally whatever
        ``FormatSink.create`` makes of the token.
        """
        ptok = ParsedToken(token.strip())

        # non-usage sink (bind incompatible)
        if ptok.pre_colon == 'expect':
            return ExpectSink(ptok, None)

        if ptok.pre_colon.endswith('.py'):
            user_sink = UserSinkFactory.create(ptok)
            if user_sink:
                return user_sink

        registered = self.components.get(ptok.pre_colon)
        if not registered or issubclass(registered, FormatSink):
            # unknown names and format sinks are both parsed by FormatSink
            return FormatSink.create(ptok, COMPONENTS)

        bound_usage = registered.usage()
        bound_usage.bind(ptok)
        return registered(ptok, bound_usage)
@@ -0,0 +1,126 @@
1
+ from pjk.base import Sink, ParsedToken, NoBindUsage
2
+ from pjk.sinks.s3_sink import S3Sink
3
+ from pjk.sinks.dir_sink import DirSink
4
+ from typing import IO
5
+ import re
6
+ import gzip
7
+
8
class SinkFormatUsage(NoBindUsage):
    """Usage description shared by the format sinks.

    Declares an empty syntax string, the ``format`` parameter and a set of
    examples built from the format *name*.
    """

    def __init__(self, name: str, component_class: type, desc_override: str = None):
        """
        Args:
            name: format name used in the description and examples.
            component_class: sink class this usage belongs to.
            desc_override: replaces the generated description when not None.
        """
        if desc_override is None:
            # This usage describes sinks; the text previously said "source".
            desc = (f'{name} sink for s3 and local files/directories.\n'
                    "s3 defaults to 'json.gz', others require format param")
        else:
            desc = desc_override
        super().__init__(name, desc, component_class)

        self.def_syntax("")  # don't use generated syntax for these, rely on examples
        self.def_param(
            'format',
            'file format',
            is_num=False,
            valid_values={'json', 'csv', 'tsv', 'json.gz', 'tsv.gz', 'csv.gz'},
            default='json.gz',
        )
        self.def_example(expr_tokens=["{hello: 'world'}", f"myfile.{name}"], expect=None)
        # closing quote restored in the inline record of this example
        self.def_example(expr_tokens=["{hello: 'world'}", f"{name}:mydir"], expect=None)
        self.def_example(expr_tokens=["{hello: 'world'}", f"s3://mybucket/myfile.{name}"], expect=None)
        self.def_example(expr_tokens=["{hello: 'world'}", f"s3://mybucket/myfiles@format={name}"], expect=None)
19
+
20
class FormatSink(Sink):
    """Base class for sinks that serialize records to an open text stream.

    Subclasses set ``extension`` (the format name, also used as the file
    extension) and implement ``process``. ``create`` is the factory that maps
    a sink token to a single-file sink, a ``DirSink`` or an ``S3Sink``.
    """

    extension: str = None
    desc_override = None

    # Splits a token (params already removed) into an optional "<pre_colon>:",
    # the path, and an optional trailing ".<ext>" / ".<ext>.gz".
    # Compiled once instead of on every create() call.
    _TOKEN_PATTERN = re.compile(
        r'^(?:(?P<pre_colon>[^:]+):)?'      # optional precolon
        r'(?P<path>[^:]+?)'                 # main path
        r'(?:\.(?P<ext>\w+(?:\.gz)?))?$'    # optional extension, e.g. json, csv, json.gz
    )

    @classmethod
    def usage(cls):
        return SinkFormatUsage(name=cls.extension,
                               component_class=cls,
                               desc_override=cls.desc_override)

    def __init__(self, outfile: IO[str]):
        super().__init__(None, None)
        self.outfile = outfile

    def close(self):
        """Close the underlying output stream."""
        self.outfile.close()

    @classmethod
    def get_format_gz(cls, input: str):
        """Split ``'json.gz'`` into ``('json', True)``; ``'json'`` gives ``('json', False)``."""
        if input.endswith('.gz'):
            return input[:-3], True
        return input, False

    @classmethod
    def _lookup_format_class(cls, fmt: str, sinks):
        """Return the FormatSink subclass registered for *fmt*, or None."""
        sink_class = sinks.get(fmt)
        if sink_class and issubclass(sink_class, FormatSink):
            return sink_class
        return None

    @classmethod
    def create(cls, ptok: ParsedToken, sinks):
        """
        use cases covered:
        1) foo.<format>                 # local single file
        2) <format>:foo                 # local directory
        3) s3://bucket/prefix.<format>  # s3 single file
        4) s3://bucket/prefix           # s3 directory (@format=<format>, default json.gz)

        format = json, csv, tsv, and also json.gz etc.

        Args:
            ptok: parsed sink token; only its params are bound through usage.
            sinks: mapping of format name to sink class.

        Returns:
            The sink for the token, or None when the token matches none of
            the cases above.

        Raises:
            Exception: a local directory sink carries a file extension, or
                the extension/format is not a registered format sink.
                (The plain Exception type is kept for existing callers.)
        """
        # we don't use framework token parsing (except for params) cuz too complicated
        match = cls._TOKEN_PATTERN.match(ptok.all_but_params)
        if not match:
            return None

        pre_colon = match.group('pre_colon')
        path_no_ext = match.group('path')
        ext = match.group('ext')

        usage = cls.usage()
        usage.bind_params(ptok)  # only bind params

        if pre_colon and pre_colon != 's3':  # local dir case
            fmt, is_gz = cls.get_format_gz(pre_colon)
            sink_class = cls._lookup_format_class(fmt, sinks)
            if sink_class is None:
                return None
            if ext:
                raise Exception(
                    f"local directory sink '{pre_colon}:{path_no_ext}' must not "
                    f"have a file extension (got '.{ext}')")
            return DirSink(sink_class, path_no_ext, is_gz, fileno=0)

        if ext and not pre_colon:  # single local file case
            fmt, is_gz = cls.get_format_gz(ext)
            # Require a format sink: other registered sinks do not take an
            # open file as their constructor argument.
            sink_class = cls._lookup_format_class(fmt, sinks)
            if sink_class is None:
                raise Exception(f"unrecognized file format '.{ext}' for single-file sink")

            filename = f'{path_no_ext}.{fmt}'

            # open the output file stream
            if is_gz:
                outfile = gzip.open(f'{filename}.gz', "wt", encoding="utf-8", newline="")
            else:
                outfile = open(filename, "wt", encoding="utf-8", newline="")

            # instantiate the sink with the prepared stream; do not leak the
            # handle if construction fails
            try:
                return sink_class(outfile)
            except Exception:
                outfile.close()
                raise

        if pre_colon == 's3':
            # single file takes the format from the extension, a directory
            # from the 'format' param
            requested = ext if ext else usage.get_param('format')
            fmt, is_gz = cls.get_format_gz(requested)
            sink_class = cls._lookup_format_class(fmt, sinks)
            if sink_class is None:
                raise Exception(f"unrecognized file format '{requested}' for s3 sink")

            fileno = -1 if ext else 0  # -1 tells s3 single file, no threading
            return S3Sink(sink_class, path_no_ext, is_gz, fileno)

        return None
124
+
125
+
126
+
@@ -0,0 +1,14 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ import json
5
+ from typing import IO
6
+ from .format_sink import FormatSink
7
+
8
class JsonSink(FormatSink):
    """Format sink that emits one JSON document per line."""

    extension = 'json'

    def process(self) -> None:
        """Serialize each input record with ``json.dumps`` and write it plus a newline.

        The output stream is not closed here.
        """
        for rec in self.input:
            line = f"{json.dumps(rec)}\n"
            self.outfile.write(line)
@@ -0,0 +1,90 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2025 Mike Schultz
3
+
4
+ import io
5
+ import gzip
6
+ from typing import Optional, Type
7
+ from pjk.base import Source, Sink
8
+ from pjk.log import logger
9
+ from pjk.sinks.s3_stream import S3MultipartWriter
10
+
11
+
12
class S3Sink(Sink):
    """
    Write records to S3 in the given <format>.

    - Folder mode (path without extension):
        s3://bucket/prefix/   → file-0000.ext, file-0001.ext, ...
    - Single-file mode (path ends with .ext or .ext.gz):
        s3://bucket/prefix/output.csv[.gz]

    Constructor args:
      - sink_class: format sink class; its ``extension`` names the files.
      - path_no_ext: 'bucket/path/to/files' (a leading '//' is stripped).
      - is_gz: gzip-compress the stream and append '.gz' to the key.
      - fileno: file index in folder mode; -1 selects single-file mode.
    """

    _FILENAME_BASE: str = "file"
    _FILENAME_DIGITS: int = 4

    def __init__(self, sink_class: Type[Sink], path_no_ext: str, is_gz: bool, fileno: int):
        # Initialize the Sink base the same way DirSink and FormatSink do.
        super().__init__(None, None)
        self.path_no_ext = path_no_ext if not path_no_ext.startswith('//') else path_no_ext[2:]  # strip leading //
        self.sink_class = sink_class
        self.is_gz = is_gz
        self.fileno = fileno
        self.is_single_file = fileno == -1
        if self.path_no_ext.endswith('/') and not self.is_single_file:
            self.path_no_ext = self.path_no_ext[:-1]

        self.num_files = 1  # next file index handed out by deep_copy

    def _build_object_key(self, index: int) -> str:
        """Return '<bucket>/<key>' for file *index* (index is ignored in single-file mode)."""
        if self.is_single_file:
            file_name = f'{self.path_no_ext}.{self.sink_class.extension}'
        else:
            file_name = f"{self.path_no_ext}/{self._FILENAME_BASE}-{index:0{self._FILENAME_DIGITS}d}.{self.sink_class.extension}"

        if self.is_gz:
            file_name += ".gz"

        return file_name

    def _write_records(self, binary_stream) -> None:
        """Wrap *binary_stream* as UTF-8 text and let the format sink write the input to it."""
        with io.TextIOWrapper(binary_stream, encoding="utf-8", newline="") as outfile:
            file_sink = self.sink_class(outfile)
            file_sink.add_source(self.input)
            file_sink.process()

    def process(self):
        """Stream the input to a single S3 object through a multipart writer.

        Raises:
            ValueError: the path does not contain a '/' separating the
                bucket from the key.
        """
        object_key = self._build_object_key(self.fileno)
        bucket, sep, key = object_key.partition("/")
        if not sep:
            # Same exception type the tuple unpacking used to raise, with a
            # message that names the actual problem.
            raise ValueError(f"s3 sink path must look like 'bucket/key', got {object_key!r}")

        with S3MultipartWriter(bucket, key) as writer:
            if self.is_gz:
                # gzip needs a binary sink → use writer directly
                with gzip.GzipFile(fileobj=writer, mode="wb") as gz:
                    logger.debug(f"S3Sink streaming GZ to s3://{bucket}/{key}")
                    self._write_records(gz)
            else:
                # plain text path
                logger.debug(f"S3Sink streaming to s3://{bucket}/{key}")
                self._write_records(writer)

    def deep_copy(self):
        """Return an S3Sink for the next file index, or None when fan-out is not possible."""
        if self.is_single_file:
            # Single-file mode: no fanout allowed
            return None

        source_clone: Optional[Source] = self.input.deep_copy()
        if not source_clone:
            return None

        clone = S3Sink(
            sink_class=self.sink_class,
            path_no_ext=self.path_no_ext,
            is_gz=self.is_gz,
            fileno=self.num_files,
        )
        clone.add_source(source_clone)

        self.num_files += 1
        return clone