python-jack-knife 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pjk/__init__.py +5 -0
- pjk/base.py +377 -0
- pjk/common.py +150 -0
- pjk/log.py +67 -0
- pjk/main.py +106 -0
- pjk/man_page.py +125 -0
- pjk/parser.py +284 -0
- pjk/pipes/__init__.py +0 -0
- pjk/pipes/denorm.py +68 -0
- pjk/pipes/factory.py +62 -0
- pjk/pipes/filter.py +57 -0
- pjk/pipes/head.py +34 -0
- pjk/pipes/join.py +85 -0
- pjk/pipes/let_reduce.py +198 -0
- pjk/pipes/map.py +91 -0
- pjk/pipes/move_field.py +36 -0
- pjk/pipes/postgres_pipe.py +209 -0
- pjk/pipes/remove_field.py +36 -0
- pjk/pipes/select.py +42 -0
- pjk/pipes/sort.py +63 -0
- pjk/pipes/tail.py +39 -0
- pjk/pipes/user_pipe_factory.py +45 -0
- pjk/pipes/where.py +49 -0
- pjk/registry.py +143 -0
- pjk/sinks/__init__.py +0 -0
- pjk/sinks/csv_sink.py +33 -0
- pjk/sinks/ddb.py +54 -0
- pjk/sinks/devnull.py +31 -0
- pjk/sinks/dir_sink.py +59 -0
- pjk/sinks/expect.py +53 -0
- pjk/sinks/factory.py +108 -0
- pjk/sinks/graph.py +57 -0
- pjk/sinks/graph_bar_line.py +229 -0
- pjk/sinks/graph_cumulative.py +55 -0
- pjk/sinks/graph_hist.py +72 -0
- pjk/sinks/graph_scatter.py +29 -0
- pjk/sinks/json_sink.py +23 -0
- pjk/sinks/s3_sink.py +100 -0
- pjk/sinks/sinks.py +68 -0
- pjk/sinks/stdout.py +44 -0
- pjk/sinks/tsv_sink.py +22 -0
- pjk/sinks/user_sink_factory.py +43 -0
- pjk/sources/__init__.py +0 -0
- pjk/sources/csv_source.py +28 -0
- pjk/sources/dir_source.py +69 -0
- pjk/sources/factory.py +100 -0
- pjk/sources/format_usage.py +11 -0
- pjk/sources/inline_source.py +56 -0
- pjk/sources/json_source.py +35 -0
- pjk/sources/lazy_file.py +16 -0
- pjk/sources/lazy_file_local.py +22 -0
- pjk/sources/lazy_file_s3.py +28 -0
- pjk/sources/parquet_source.py +32 -0
- pjk/sources/s3_source.py +146 -0
- pjk/sources/source_list.py +23 -0
- pjk/sources/sql_source.py +32 -0
- pjk/sources/tsv_source.py +15 -0
- pjk/sources/user_source_factory.py +33 -0
- pjk/version.py +4 -0
- python_jack_knife-0.5.0.dist-info/METADATA +254 -0
- python_jack_knife-0.5.0.dist-info/RECORD +65 -0
- python_jack_knife-0.5.0.dist-info/WHEEL +5 -0
- python_jack_knife-0.5.0.dist-info/entry_points.txt +2 -0
- python_jack_knife-0.5.0.dist-info/licenses/LICENSE +202 -0
- python_jack_knife-0.5.0.dist-info/top_level.txt +1 -0
pjk/sinks/sinks.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
import sys
|
|
5
|
+
import yaml
|
|
6
|
+
import subprocess
|
|
7
|
+
import shutil
|
|
8
|
+
from pjk.base import Sink, Source, ParsedToken, Usage
|
|
9
|
+
|
|
10
|
+
class YamlSink(Sink):
    """Sink that writes every input record to one YAML file as a multi-document stream."""

    @classmethod
    def usage(cls):
        """Build the usage descriptor: component name 'yaml' with a single 'path' argument."""
        spec = Usage(
            name='yaml',
            desc='Write all records to a YAML file as multi-doc stream'
        )
        spec.def_arg(name='path', usage='Path to output YAML file')
        return spec

    def __init__(self, ptok: ParsedToken, usage: Usage):
        """Initialise the base sink and remember the output path bound to the 'path' argument."""
        super().__init__(ptok, usage)
        self.path = usage.get_arg('path')

    def process(self) -> None:
        """Dump all records from self.input into the file at self.path, keeping key order."""
        # NOTE(review): self.input is presumably provided by the Sink base class — confirm in pjk.base.
        with open(self.path, 'w') as out_file:
            yaml.dump_all(documents=self.input, stream=out_file, sort_keys=False)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class StdoutYamlSink(Sink):
    """Write records as YAML documents to stdout, optionally through the `less` pager.

    There is no usage() here: the sink is not token-based and is intended for internal use.
    """

    def __init__(self, input_source: Source, use_pager: bool = True):
        """Store the pager preference; the input source is handed to the base class."""
        super().__init__(input_source)
        self.use_pager = use_pager
        # NOTE(review): presumably tells the framework not to print a summary report — confirm in pjk.base.
        self.suppress_report = True

    def process(self) -> None:
        """Emit one YAML document per record, stopping quietly if the pager goes away."""
        pager = None
        if self.use_pager and shutil.which("less"):
            pager = subprocess.Popen(
                ["less", "-FRSX"],
                stdin=subprocess.PIPE,
                text=True
            )

        # Without a pager the documents go straight to stdout.
        target = sys.stdout if pager is None else pager.stdin

        dump_options = {
            "sort_keys": False,
            "explicit_start": True,
            "width": float("inf"),
        }

        try:
            for rec in self.input:
                try:
                    yaml.dump(rec, target, **dump_options)
                except BrokenPipeError:
                    break  # user quit pager
        except BrokenPipeError:
            pass
        finally:
            if pager is not None:
                # Closing flushes buffered output, which can itself hit the closed pipe.
                try:
                    target.close()
                except BrokenPipeError:
                    pass
                pager.wait()
|
pjk/sinks/stdout.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
import sys
|
|
5
|
+
import yaml
|
|
6
|
+
from pjk.base import Sink, Source, ParsedToken, Usage
|
|
7
|
+
from pjk.common import pager_stdout
|
|
8
|
+
|
|
9
|
+
class StdoutSink(Sink):
    """Sink registered as '-' that prints records to stdout as YAML documents."""

    @classmethod
    def usage(cls):
        """Build the usage descriptor, including the optional 'less' parameter (default 'true')."""
        usage = Usage(
            name='-',
            desc='display records in yaml format to stdout through less',
            component_class=cls
        )
        usage.def_param('less', usage='use less to display', valid_values=['true', 'false'], default='true')
        usage.def_example(["{hello:'world!'}"], "{hello:'world!'}")
        return usage

    def __init__(self, ptok: ParsedToken, usage: Usage):
        """Initialise the base sink and derive the pager setting from the 'less' parameter."""
        super().__init__(ptok, usage)

        # The pager is on unless the 'less' parameter is explicitly something other than 'true'.
        less_param = usage.get_param('less')
        self.use_pager = less_param is None or less_param == 'true'

    def process(self) -> None:
        """Dump every input record as an explicit-start YAML document to sys.stdout."""
        # pager_stdout comes from pjk.common; per the original comments it routes
        # sys.stdout into `less` while the block runs (when enabled).
        try:
            with pager_stdout(self.use_pager):
                for record in self.input:
                    try:
                        yaml.dump(
                            record,
                            sys.stdout,  # now points to less (if enabled)
                            sort_keys=False,
                            explicit_start=True,
                            width=10**9  # effectively no wrap without using a float
                        )
                    except BrokenPipeError:
                        break  # user quit pager
        except BrokenPipeError:
            # Swallow if pager closed early
            pass
|
pjk/sinks/tsv_sink.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
from pjk.sinks.csv_sink import CSVSink
|
|
5
|
+
from pjk.base import Source, ParsedToken, Usage
|
|
6
|
+
|
|
7
|
+
class TSVSink(CSVSink):
    """CSVSink variant configured with a tab delimiter and the 'tsv' extension."""

    is_format = True

    @classmethod
    def usage(cls):
        """Build the usage descriptor: component name 'tsv' with a single 'path' argument."""
        spec = Usage(
            name='tsv',
            desc='Write records to a .tsv file (tab-separated values)',
            component_class=cls
        )
        spec.def_arg(name='path', usage='Path prefix (no extension)')
        return spec

    def __init__(self, ptok: ParsedToken, usage: Usage):
        """Forward the bound 'path' argument to CSVSink with tab-separated settings."""
        super().__init__(usage.get_arg('path'), delimiter="\t", ext='tsv')
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
import importlib.util
|
|
5
|
+
from typing import Optional
|
|
6
|
+
from pjk.base import Source, Sink, UsageError, ParsedToken
|
|
7
|
+
|
|
8
|
+
class UserSinkFactory:
    """Factory that builds a Sink from a user-supplied Python script."""

    @staticmethod
    def create_from_path(ptok: ParsedToken) -> Optional[Sink]:
        """Import the script named by ``ptok.pre_colon`` and instantiate its Sink subclass.

        Returns the first class defined in the script that subclasses Sink
        (constructed with the token and its bound usage), or None when the
        script defines no such class.

        Raises:
            UsageError: if no import spec can be built for the path, or if
                importing/executing the script fails.
        """
        script_path = ptok.pre_colon

        try:
            spec = importlib.util.spec_from_file_location("user_sink", script_path)
        except Exception as e:
            raise UsageError(f"Failed to import {script_path}: {e}") from e

        # Checked outside the try so this error is not re-wrapped as an import failure.
        if spec is None or spec.loader is None:
            raise UsageError(f"Could not load Python file: {script_path}")

        try:
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
        except Exception as e:
            raise UsageError(f"Failed to import {script_path}: {e}") from e

        for value in vars(module).values():
            if (
                isinstance(value, type)
                and issubclass(value, Sink)
                and value is not Sink
                # Only classes defined in the script itself, not ones it imported.
                and value.__module__ == module.__name__
            ):
                usage = value.usage()
                usage.bind(ptok)

                return value(ptok, usage)

        return None

    @classmethod
    def create(cls, ptok: ParsedToken) -> Optional[Sink]:
        """Return a user-defined Sink when the token names a ``.py`` file, else None."""
        if ptok.pre_colon.endswith('.py'):
            sink = cls.create_from_path(ptok)
            if sink:
                return sink

        return None
|
|
42
|
+
|
|
43
|
+
|
pjk/sources/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
import sys
|
|
5
|
+
import csv
|
|
6
|
+
from pjk.base import Source, NoBindUsage
|
|
7
|
+
from pjk.sources.format_usage import FormatUsage
|
|
8
|
+
from pjk.sources.lazy_file import LazyFile
|
|
9
|
+
|
|
10
|
+
# Raise the csv field size cap as high as the platform accepts. sys.maxsize can
# exceed a C long on some platforms (notably Windows), in which case
# csv.field_size_limit raises OverflowError; halve until a value is accepted.
_field_size_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_size_limit)
        break
    except OverflowError:
        _field_size_limit //= 2
|
|
11
|
+
|
|
12
|
+
class CSVSource(Source):
    """Source that yields one dict per CSV row read from a lazily opened file."""

    is_format = True

    @classmethod
    def usage(cls):
        """Return the shared format usage descriptor for 'csv'."""
        return FormatUsage('csv', component_class=cls)

    def __init__(self, lazy_file: LazyFile, delimiter: str = ","):
        """Remember the file handle factory and delimiter; no I/O happens here."""
        self.num_recs = 0
        self.delimiter = delimiter
        self.lazy_file = lazy_file

    def __iter__(self):
        """Open the file and yield each row as a dict, counting rows in num_recs."""
        with self.lazy_file.open() as handle:
            for record in csv.DictReader(handle, delimiter=self.delimiter):
                # Count before yielding so num_recs includes the row being handed out.
                self.num_recs += 1
                yield record
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
from typing import Any
|
|
6
|
+
from queue import Queue, Empty
|
|
7
|
+
from pjk.base import Source, ParsedToken
|
|
8
|
+
from pjk.sources.lazy_file_local import LazyFileLocal
|
|
9
|
+
from pjk.log import logger
|
|
10
|
+
|
|
11
|
+
class DirSource(Source):
    """Source that yields the records of several per-file sources taken from a shared queue."""

    def __init__(self, source_queue: Queue, in_source: Source = None):
        """Store the shared queue and, optionally, a source to read before pulling from it."""
        self.source_queue = source_queue
        self.current = in_source

    def __iter__(self):
        """Yield records from the current source, then from each queued source until the queue is empty."""
        while True:
            if self.current is None:
                try:
                    self.current = self.source_queue.get_nowait()
                    logger.debug(f'next source={self.current}')
                except Empty:
                    return  # end of all sources

            try:
                for record in self.current:
                    yield record
            finally:
                self.current = None  # move to next source after exhaustion

    def deep_copy(self):
        """Return a new DirSource that shares this queue and starts on the next queued source.

        Returns None when at most one source is queued (the remainder is left
        to the original) or when the queue turns out to be empty.
        """
        if self.source_queue.qsize() <= 1:
            return None  # leave remaining files to original
        try:
            next_source = self.source_queue.get_nowait()
            logger.debug(f'deep_copy next_source={next_source}')
        except Empty:
            return None

        return DirSource(self.source_queue, next_source)

    @classmethod
    def create(cls, ptok: ParsedToken, get_format_class_gz: Any):
        """Build a DirSource over every regular file directly inside the token's directory.

        Args:
            ptok: token whose ``all_but_params`` is the directory path; an optional
                'format' parameter is appended to each per-file token.
            get_format_class_gz: callable taking a ParsedToken and returning
                ``(format_class, is_gz)``.

        Returns:
            A DirSource, or None when the directory contains no files.

        Raises:
            RuntimeError: if no format class is found for a file.
        """
        params = ptok.get_params()
        override = params.get('format', None)
        path = ptok.all_but_params

        # os.listdir returns entries in arbitrary order; sort so files are
        # processed in a deterministic order on every platform.
        candidates = (os.path.join(path, entry) for entry in os.listdir(path))
        files = sorted(full for full in candidates if os.path.isfile(full))

        source_queue = Queue()
        for file in files:
            file_token = file if not override else f"{file}@format={override}"
            file_ptok = ParsedToken(file_token)

            format_class, is_gz = get_format_class_gz(file_ptok)
            if not format_class:
                raise RuntimeError(f"No format for file: {file}")

            lazy_file = LazyFileLocal(file, is_gz)
            source_queue.put(format_class(lazy_file))

        if source_queue.empty():
            return None

        return DirSource(source_queue)
|
pjk/sources/factory.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import queue
|
|
6
|
+
from pjk.base import Source, ParsedToken
|
|
7
|
+
from pjk.common import ComponentFactory
|
|
8
|
+
from pjk.sources.json_source import JsonSource
|
|
9
|
+
from pjk.sources.csv_source import CSVSource
|
|
10
|
+
from pjk.sources.sql_source import SQLSource
|
|
11
|
+
from pjk.sources.tsv_source import TSVSource
|
|
12
|
+
from pjk.sources.s3_source import S3Source
|
|
13
|
+
from pjk.sources.inline_source import InlineSource
|
|
14
|
+
from pjk.sources.dir_source import DirSource
|
|
15
|
+
from pjk.sources.user_source_factory import UserSourceFactory
|
|
16
|
+
from pjk.sources.lazy_file import LazyFile
|
|
17
|
+
from pjk.sources.lazy_file_local import LazyFileLocal
|
|
18
|
+
from pjk.sources.parquet_source import ParquetSource
|
|
19
|
+
|
|
20
|
+
# Maps a source name (also used as a file-extension / format lookup key) to its Source class.
COMPONENTS = dict(
    inline=InlineSource,
    json=JsonSource,
    csv=CSVSource,
    tsv=TSVSource,
    sql=SQLSource,
    parquet=ParquetSource,
)
|
|
28
|
+
|
|
29
|
+
class SourceFactory(ComponentFactory):
    """Factory that turns a command-line token into a Source."""

    def __init__(self):
        """Register the built-in source components under the 'source' kind."""
        super().__init__(COMPONENTS, 'source')

    def get_format_class_gz(self, ptok: ParsedToken):
        """Resolve the format class and gzip flag for a file-like token.

        The format comes from the 'format' parameter when present (e.g. ``json``
        or ``json.gz``), otherwise from the path's extension (e.g. ``foo.json``
        or ``foo.json.gz``).

        Returns:
            ``(format_class, is_gz)``, or ``(None, None)`` when no registered
            format class matches.
        """
        params = ptok.get_params()
        override = params.get('format', None)  # e.g. json or json.gz

        path = ptok.all_but_params
        # Require the dot: a bare 'gz' suffix would also match names such as 'x.tgz'.
        is_gz = path.endswith('.gz')

        if override:
            if override.endswith('.gz'):
                is_gz = True
                override = override.removesuffix('.gz')
            lookup = override
        else:  # e.g. foo.json or foo.json.gz
            if is_gz:
                path = path.removesuffix('.gz')
            _, ext = os.path.splitext(path)  # e.g. path=foo.json -> ext=.json
            lookup = ext.removeprefix('.')

        format_class = self.components.get(lookup, None)
        if not format_class:
            return None, None

        # Only classes flagged as file formats qualify; treat a missing flag as
        # "not a format" instead of raising AttributeError.
        if not getattr(format_class, 'is_format', False):
            return None, None  # raise ?

        return format_class, is_gz

    def create(self, token: str) -> Source:
        """Create the Source described by *token*, or return None if nothing matches.

        Tried in order: inline expression, user ``.py`` script, registered
        component name, s3 location, local directory, local file.
        """
        token = token.strip()

        if InlineSource.is_inline(token):
            return InlineSource(token)

        ptok = ParsedToken(token)

        if ptok.pre_colon.endswith('.py'):
            source = UserSourceFactory.create(ptok)
            if source:
                return source

        source_cls = self.components.get(ptok.pre_colon)
        if source_cls:
            usage = source_cls.usage()
            usage.bind(ptok)

            source = source_cls(ptok, usage)
            return source

        if ptok.all_but_params.startswith('s3'):
            return S3Source.create(ptok, get_format_class_gz=self.get_format_class_gz)

        if os.path.isdir(ptok.all_but_params):
            return DirSource.create(ptok, get_format_class_gz=self.get_format_class_gz)

        # individual file
        if os.path.isfile(ptok.all_but_params):
            source_class, is_gz = self.get_format_class_gz(ptok)
            if source_class:
                lazy_file = LazyFileLocal(ptok.all_but_params, is_gz)
                return source_class(lazy_file)

        return None
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from pjk.base import Source, NoBindUsage
|
|
2
|
+
|
|
3
|
+
class FormatUsage(NoBindUsage):
    """Usage descriptor shared by the file-format sources (csv, json, parquet, ...)."""

    def __init__(self, name: str, component_class: type, desc_override: str = None):
        """Set the description and register the standard file, directory and s3 examples.

        Args:
            name: format name; also used as the extension in the file example.
            component_class: the source class this usage describes.
            desc_override: description to use instead of the generated default.
        """
        if desc_override is None:
            desc = f'{name} source for s3 and local files/directories.'
        else:
            desc = desc_override
        super().__init__(name, desc, component_class)

        self.def_syntax("")  # no syntax for these
        self.def_example(expr_tokens=[f"myfile.{name}", "-"], expect=None)
        self.def_example(expr_tokens=["mydir", "-"], expect=None)
        self.def_example(expr_tokens=["s3://mybucket/path/to/files", "-"], expect=None)
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
# pjk/sources/inline_source.py
|
|
5
|
+
|
|
6
|
+
import hjson
|
|
7
|
+
from hjson import HjsonDecodeError
|
|
8
|
+
from typing import Optional
|
|
9
|
+
from collections import OrderedDict
|
|
10
|
+
from pjk.base import Source, TokenError, Usage
|
|
11
|
+
|
|
12
|
+
def to_builtin(obj):
    """Recursively convert mappings to plain dicts and rebuild lists.

    Any dict (OrderedDict included, since it subclasses dict) becomes a plain
    dict with its values converted; lists are rebuilt with converted items;
    every other value is returned unchanged. Key order is preserved.
    """
    # Checking dict rather than OrderedDict means an OrderedDict nested inside
    # a plain dict is converted too.
    if isinstance(obj, dict):
        return {k: to_builtin(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [to_builtin(v) for v in obj]
    return obj
|
|
20
|
+
|
|
21
|
+
class InlineSource(Source):
    """Source whose records are parsed from an inline hjson expression."""

    @classmethod
    def usage(cls):
        """Build the usage descriptor with single-record and list examples."""
        spec = Usage(
            name='inline',
            desc="simplified json lines format (uses hjson)",
            component_class=cls
        )
        spec.def_syntax('')
        spec.def_example(expr_tokens=["{hello: 'world!'}"], expect="{hello: 'world!'}")
        spec.def_example(expr_tokens=["[{id:1, dir:'up'},{id:2, dir:'down'}]"], expect="[{id:1, dir:'up'}, {id:2, dir:'down'}]")
        return spec

    def __init__(self, inline_expr):
        """Parse *inline_expr* with hjson into a list of records.

        Raises:
            TokenError: if the text is not valid hjson, or parses to something
                other than a dict or a list.
        """
        self.num_recs = 0
        try:
            parsed = hjson.loads(inline_expr)
        except HjsonDecodeError:
            raise TokenError('incorrect hjson line syntax')

        if isinstance(parsed, list):
            self.records = parsed
        elif isinstance(parsed, dict):
            # A single object is treated as a one-record list.
            self.records = [parsed]
        else:
            raise TokenError(f'"{inline_expr}"')

    def __iter__(self):
        """Yield each record converted to builtin containers via to_builtin."""
        yield from (to_builtin(item) for item in self.records)

    @classmethod
    def is_inline(cls, token):
        """Return True when *token* is at least two characters and wrapped in {} or []."""
        brackets = (('{', '}'), ('[', ']'))
        return len(token) >= 2 and (token[0], token[-1]) in brackets
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
from pjk.base import Source, NoBindUsage
|
|
6
|
+
from pjk.sources.lazy_file import LazyFile
|
|
7
|
+
from pjk.sources.format_usage import FormatUsage
|
|
8
|
+
from pjk.log import logger
|
|
9
|
+
|
|
10
|
+
class JsonSource(Source):
    """Source that reads one JSON document per line from a lazily opened file."""

    is_format = True

    @classmethod
    def usage(cls):
        """Return the shared format usage descriptor for 'json'."""
        return FormatUsage('json', component_class=cls)

    def __init__(self, lazy_file: LazyFile):
        """Remember the file handle factory; no I/O happens here."""
        self.lazy_file = lazy_file
        self.num_recs = 0

    def __iter__(self):
        """Yield the parsed value of each line; invalid lines are logged and skipped.

        num_recs counts every line read, including the skipped ones.
        """
        with self.lazy_file.open() as f:
            for line in f:
                self.num_recs += 1
                try:
                    record = json.loads(line)
                except json.JSONDecodeError as e:
                    print('json decode error, see ~/.pjk/logs')
                    snippet = line.strip()
                    if len(snippet) > 200:
                        snippet = snippet[:200] + "…"
                    # Use name(), which every LazyFile declares; not every
                    # implementation has a `path` attribute.
                    logger.warning(
                        f"Skipping invalid JSON at line {self.num_recs} "
                        f"in {self.lazy_file.name()}: {e} | data: {snippet}"
                    )
                    continue
                # Yield outside the try so only parse errors are handled above.
                yield record
|
pjk/sources/lazy_file.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
from typing import IO
|
|
6
|
+
|
|
7
|
+
class LazyFile(ABC):
    """Abstract handle on a file that is opened only when its contents are needed."""

    @abstractmethod
    def open(self, binary: bool = False) -> IO:
        """Open and return a file-like object.

        Args:
            binary: when False (the default) return a text-mode object; when
                True return a bytes-mode object. Callers in this package pass
                ``binary=True`` for binary formats such as parquet.
        """
        pass

    @abstractmethod
    def name(self) -> str:
        """Return a descriptive identifier (e.g. path or URI)."""
        pass
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
import gzip
|
|
5
|
+
import io
|
|
6
|
+
from typing import IO
|
|
7
|
+
from pjk.sources.lazy_file import LazyFile
|
|
8
|
+
|
|
9
|
+
class LazyFileLocal(LazyFile):
    """LazyFile backed by a path on the local filesystem, with optional gzip decompression."""

    def __init__(self, path: str, is_gz: bool = False):
        """Store the path and gzip flag; the file is not opened here."""
        self.path = path
        self.is_gz = is_gz

    def open(self, binary: bool = False) -> IO:
        """Open the file, decompressing when the path ends in ``.gz`` or is_gz is set.

        Args:
            binary: when True return the bytes stream itself; otherwise wrap it
                in a text layer. Defaults to False, so existing callers get
                text as before.
        """
        if self.path.endswith(".gz") or self.is_gz:
            # gzip.open owns the underlying file and closes it on close();
            # GzipFile(fileobj=...) would leave the raw handle open.
            stream = gzip.open(self.path, "rb")
        else:
            stream = open(self.path, "rb")

        if binary:
            return stream
        return io.TextIOWrapper(stream)

    def name(self) -> str:
        """Return the local path."""
        return self.path
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
import io
|
|
5
|
+
import gzip
|
|
6
|
+
from typing import IO
|
|
7
|
+
from pjk.sources.lazy_file import LazyFile
|
|
8
|
+
|
|
9
|
+
class LazyFileS3(LazyFile):
    """LazyFile backed by an S3 object, with optional gzip decompression."""

    def __init__(self, bucket: str, key: str, is_gz: bool):
        """Create an S3 client and remember which object to fetch; nothing is downloaded here."""
        import boto3  # lazy import
        self.s3 = boto3.client('s3')  # for each thread
        self.bucket = bucket
        self.key = key
        self.is_gz = is_gz

    def open(self, binary=False) -> IO:
        """Download the whole object into memory and return a file-like view of it.

        Args:
            binary: when True return a bytes stream (decompressed if is_gz);
                otherwise return UTF-8 decoded text.
        """
        obj = self.s3.get_object(Bucket=self.bucket, Key=self.key)
        raw_body = obj['Body'].read()

        if self.is_gz:
            stream = gzip.GzipFile(fileobj=io.BytesIO(raw_body))
            if binary:
                return stream
            # Decode as UTF-8, matching the uncompressed text branch below.
            return io.TextIOWrapper(stream, encoding="utf-8")

        if binary:
            return io.BytesIO(raw_body)
        return io.StringIO(raw_body.decode("utf-8"))

    def name(self) -> str:
        """Return the object's s3:// URI."""
        return f"s3://{self.bucket}/{self.key}"
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2024 Mike Schultz
|
|
3
|
+
|
|
4
|
+
from pjk.base import Source
|
|
5
|
+
from pjk.sources.lazy_file import LazyFile
|
|
6
|
+
from pjk.sources.format_usage import FormatUsage
|
|
7
|
+
|
|
8
|
+
class ParquetSource(Source):
    """Source that reads a parquet file and yields one dict per row."""

    is_format = True  # enables format-based routing

    @classmethod
    def usage(cls):
        """Return the shared format usage descriptor for 'parquet'."""
        return FormatUsage('parquet', component_class=cls)

    def __init__(self, lazy_file: LazyFile):
        """Remember the file handle factory; no I/O happens here."""
        self.num_recs = 0
        self.lazy_file = lazy_file

    def __iter__(self):
        """Read the whole table, then yield each row as a column-name -> value dict."""
        import pyarrow.parquet as pq  # lazy import

        with self.lazy_file.open(binary=True) as handle:
            columns = pq.read_table(handle).to_pydict()

            if not columns:
                return  # no columns = no rows

            names = list(columns)
            # Walk the column lists in lockstep to rebuild rows.
            for values in zip(*(columns[name] for name in names)):
                row = dict(zip(names, values))
                self.num_recs += 1
                yield row
|