python-jack-knife 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. pjk/__init__.py +5 -0
  2. pjk/base.py +377 -0
  3. pjk/common.py +150 -0
  4. pjk/log.py +67 -0
  5. pjk/main.py +106 -0
  6. pjk/man_page.py +125 -0
  7. pjk/parser.py +284 -0
  8. pjk/pipes/__init__.py +0 -0
  9. pjk/pipes/denorm.py +68 -0
  10. pjk/pipes/factory.py +62 -0
  11. pjk/pipes/filter.py +57 -0
  12. pjk/pipes/head.py +34 -0
  13. pjk/pipes/join.py +85 -0
  14. pjk/pipes/let_reduce.py +198 -0
  15. pjk/pipes/map.py +91 -0
  16. pjk/pipes/move_field.py +36 -0
  17. pjk/pipes/postgres_pipe.py +209 -0
  18. pjk/pipes/remove_field.py +36 -0
  19. pjk/pipes/select.py +42 -0
  20. pjk/pipes/sort.py +63 -0
  21. pjk/pipes/tail.py +39 -0
  22. pjk/pipes/user_pipe_factory.py +45 -0
  23. pjk/pipes/where.py +49 -0
  24. pjk/registry.py +143 -0
  25. pjk/sinks/__init__.py +0 -0
  26. pjk/sinks/csv_sink.py +33 -0
  27. pjk/sinks/ddb.py +54 -0
  28. pjk/sinks/devnull.py +31 -0
  29. pjk/sinks/dir_sink.py +59 -0
  30. pjk/sinks/expect.py +53 -0
  31. pjk/sinks/factory.py +108 -0
  32. pjk/sinks/graph.py +57 -0
  33. pjk/sinks/graph_bar_line.py +229 -0
  34. pjk/sinks/graph_cumulative.py +55 -0
  35. pjk/sinks/graph_hist.py +72 -0
  36. pjk/sinks/graph_scatter.py +29 -0
  37. pjk/sinks/json_sink.py +23 -0
  38. pjk/sinks/s3_sink.py +100 -0
  39. pjk/sinks/sinks.py +68 -0
  40. pjk/sinks/stdout.py +44 -0
  41. pjk/sinks/tsv_sink.py +22 -0
  42. pjk/sinks/user_sink_factory.py +43 -0
  43. pjk/sources/__init__.py +0 -0
  44. pjk/sources/csv_source.py +28 -0
  45. pjk/sources/dir_source.py +69 -0
  46. pjk/sources/factory.py +100 -0
  47. pjk/sources/format_usage.py +11 -0
  48. pjk/sources/inline_source.py +56 -0
  49. pjk/sources/json_source.py +35 -0
  50. pjk/sources/lazy_file.py +16 -0
  51. pjk/sources/lazy_file_local.py +22 -0
  52. pjk/sources/lazy_file_s3.py +28 -0
  53. pjk/sources/parquet_source.py +32 -0
  54. pjk/sources/s3_source.py +146 -0
  55. pjk/sources/source_list.py +23 -0
  56. pjk/sources/sql_source.py +32 -0
  57. pjk/sources/tsv_source.py +15 -0
  58. pjk/sources/user_source_factory.py +33 -0
  59. pjk/version.py +4 -0
  60. python_jack_knife-0.5.0.dist-info/METADATA +254 -0
  61. python_jack_knife-0.5.0.dist-info/RECORD +65 -0
  62. python_jack_knife-0.5.0.dist-info/WHEEL +5 -0
  63. python_jack_knife-0.5.0.dist-info/entry_points.txt +2 -0
  64. python_jack_knife-0.5.0.dist-info/licenses/LICENSE +202 -0
  65. python_jack_knife-0.5.0.dist-info/top_level.txt +1 -0
pjk/sinks/sinks.py ADDED
@@ -0,0 +1,68 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ import sys
5
+ import yaml
6
+ import subprocess
7
+ import shutil
8
+ from pjk.base import Sink, Source, ParsedToken, Usage
9
+
10
class YamlSink(Sink):
    """Sink that writes every input record into a single multi-document YAML file."""

    @classmethod
    def usage(cls):
        """Return the Usage descriptor: component 'yaml' taking one 'path' argument."""
        spec = Usage(
            name='yaml',
            desc='Write all records to a YAML file as multi-doc stream'
        )
        spec.def_arg(name='path', usage='Path to output YAML file')
        return spec

    def __init__(self, ptok: ParsedToken, usage: Usage):
        super().__init__(ptok, usage)
        # Output location comes from the 'path' argument declared in usage().
        self.path = usage.get_arg('path')

    def process(self) -> None:
        """Dump all records from self.input to self.path, keeping key order."""
        with open(self.path, 'w') as out_file:
            yaml.dump_all(self.input, out_file, sort_keys=False)
27
+
28
+
29
class StdoutYamlSink(Sink):
    """Internal sink that prints records as YAML documents, optionally via ``less``.

    There is no usage(): the sink is built directly from an input source
    rather than from a parsed token.
    """

    def __init__(self, input_source: Source, use_pager: bool = True):
        super().__init__(input_source)
        self.use_pager = use_pager
        self.suppress_report = True

    def process(self) -> None:
        """Write each record as an explicit-start YAML document.

        Output is piped to ``less -FRSX`` when paging is requested and ``less``
        is found on PATH; otherwise it goes to sys.stdout. A BrokenPipeError
        (e.g. the user quitting the pager) ends the output silently.
        """
        pager = None
        target = sys.stdout

        if self.use_pager and shutil.which("less"):
            pager = subprocess.Popen(
                ["less", "-FRSX"],
                stdin=subprocess.PIPE,
                text=True,
            )
            target = pager.stdin

        dump_options = {
            "sort_keys": False,
            "explicit_start": True,
            "width": float("inf"),
        }

        try:
            for record in self.input:
                yaml.dump(record, target, **dump_options)
        except BrokenPipeError:
            # The reader went away; stop writing.
            pass
        finally:
            if pager is not None:
                try:
                    target.close()
                except BrokenPipeError:
                    pass
                # Block until the pager exits so the terminal is handed back cleanly.
                pager.wait()
pjk/sinks/stdout.py ADDED
@@ -0,0 +1,44 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ import sys
5
+ import yaml
6
+ from pjk.base import Sink, Source, ParsedToken, Usage
7
+ from pjk.common import pager_stdout
8
+
9
class StdoutSink(Sink):
    """Sink that prints records as YAML documents to stdout, paged through ``less``."""

    @classmethod
    def usage(cls):
        """Return the Usage descriptor for the '-' sink and its 'less' parameter."""
        usage = Usage(
            name='-',
            desc='display records in yaml format to stdout through less',
            component_class=cls
        )
        usage.def_param('less', usage='use less to display', valid_values=['true', 'false'], default='true')
        usage.def_example(["{hello:'world!'}"], "{hello:'world!'}")
        return usage

    def __init__(self, ptok: ParsedToken, usage: Usage):
        super().__init__(ptok, usage)

        # Paging is on unless the 'less' parameter is explicitly something
        # other than 'true'; a missing parameter (None) means "use the pager".
        less_param = usage.get_param('less')
        self.use_pager = less_param is None or less_param == 'true'

    def process(self) -> None:
        """Dump every input record to sys.stdout inside the pager_stdout context.

        A BrokenPipeError raised while dumping stops the loop; one raised by
        the surrounding context is swallowed.
        """
        try:
            with pager_stdout(self.use_pager):
                for record in self.input:
                    try:
                        yaml.dump(
                            record,
                            sys.stdout,  # looked up per record, inside the pager_stdout context
                            sort_keys=False,
                            explicit_start=True,
                            width=10**9  # effectively no wrap without using a float
                        )
                    except BrokenPipeError:
                        break  # user quit pager
        except BrokenPipeError:
            # Swallow if pager closed early
            pass
pjk/sinks/tsv_sink.py ADDED
@@ -0,0 +1,22 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ from pjk.sinks.csv_sink import CSVSink
5
+ from pjk.base import Source, ParsedToken, Usage
6
+
7
class TSVSink(CSVSink):
    """CSVSink variant configured with a tab delimiter and the 'tsv' extension."""

    is_format = True

    @classmethod
    def usage(cls):
        """Return the Usage descriptor: component 'tsv' taking one 'path' argument."""
        spec = Usage(
            name='tsv',
            desc='Write records to a .tsv file (tab-separated values)',
            component_class=cls
        )
        spec.def_arg(name='path', usage='Path prefix (no extension)')
        return spec

    def __init__(self, ptok: ParsedToken, usage: Usage):
        # Delegate to CSVSink with the bound path prefix; only delimiter and
        # extension differ from the CSV case.
        super().__init__(usage.get_arg('path'), delimiter="\t", ext='tsv')
@@ -0,0 +1,43 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ import importlib.util
5
+ from typing import Optional
6
+ from pjk.base import Source, Sink, UsageError, ParsedToken
7
+
8
class UserSinkFactory:
    """Creates a Sink from a user-supplied Python script named by the token."""

    @staticmethod
    def create_from_path(ptok: ParsedToken) -> Optional[Sink]:
        """Import the script at ``ptok.pre_colon`` and instantiate its Sink subclass.

        The first class defined in that module which subclasses Sink (and is
        not Sink itself) is bound to the token and constructed.

        Returns:
            The constructed sink, or None when the module defines no Sink subclass.

        Raises:
            UsageError: if the file cannot be loaded or raises while importing.
        """
        script_path = ptok.pre_colon
        try:
            spec = importlib.util.spec_from_file_location("user_sink", script_path)
            if spec is None or spec.loader is None:
                raise UsageError(f"Could not load Python file: {script_path}")

            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
        except UsageError:
            # Already a user-facing error; don't wrap it a second time.
            raise
        except Exception as e:
            raise UsageError(f"Failed to import {script_path}: {e}") from e

        for value in vars(module).values():
            if (
                isinstance(value, type)
                and issubclass(value, Sink)
                and value is not Sink
                # Skip Sink subclasses the script merely imported from elsewhere.
                and value.__module__ == module.__name__
            ):
                usage = value.usage()
                usage.bind(ptok)

                return value(ptok, usage)

        return None

    @classmethod
    def create(cls, ptok: ParsedToken) -> Optional[Sink]:
        """Return a user-defined sink when the token names a ``.py`` file, else None."""
        if ptok.pre_colon.endswith('.py'):
            sink = cls.create_from_path(ptok)
            if sink:
                return sink

        return None
42
+
43
+
File without changes
@@ -0,0 +1,28 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ import sys
5
+ import csv
6
+ from pjk.base import Source, NoBindUsage
7
+ from pjk.sources.format_usage import FormatUsage
8
+ from pjk.sources.lazy_file import LazyFile
9
+
10
# Raise the csv module's per-field size cap as high as the platform allows.
# csv.field_size_limit raises OverflowError when the value does not fit in a
# C long (e.g. sys.maxsize on Windows), so halve the request until accepted.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 2
11
+
12
class CSVSource(Source):
    """Format source that yields one dict per row of a delimited text file."""

    is_format = True

    @classmethod
    def usage(cls):
        """Return the shared FormatUsage descriptor registered under 'csv'."""
        return FormatUsage('csv', component_class=cls)

    def __init__(self, lazy_file: LazyFile, delimiter: str = ","):
        self.num_recs = 0  # rows yielded so far
        self.delimiter = delimiter
        self.lazy_file = lazy_file

    def __iter__(self):
        """Open the file and yield each row produced by csv.DictReader."""
        with self.lazy_file.open() as handle:
            rows = csv.DictReader(handle, delimiter=self.delimiter)
            for record in rows:
                self.num_recs += 1
                yield record
@@ -0,0 +1,69 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ import os
5
+ from typing import Any
6
+ from queue import Queue, Empty
7
+ from pjk.base import Source, ParsedToken
8
+ from pjk.sources.lazy_file_local import LazyFileLocal
9
+ from pjk.log import logger
10
+
11
class DirSource(Source):
    """Source that drains a shared queue of per-file sources, one after another."""

    def __init__(self, source_queue: Queue, in_source: Source = None):
        self.source_queue = source_queue
        # Source currently being read; None means "take the next one from the queue".
        self.current = in_source

    def __iter__(self):
        """Yield records from the current source, then from queued sources until empty."""
        while True:
            if self.current is None:
                try:
                    self.current = self.source_queue.get_nowait()
                except Empty:
                    return  # end of all sources
                logger.debug(f'next source={self.current}')

            try:
                for record in self.current:
                    yield record
            finally:
                # Drop the source whether it was exhausted or iteration was cut short.
                self.current = None

    def deep_copy(self):
        """Return a new DirSource seeded with one queued source, sharing the queue.

        Returns None when at most one source is queued (those are left to the
        original) or when the queue turns out to be empty.
        """
        if self.source_queue.qsize() <= 1:
            return None
        try:
            claimed = self.source_queue.get_nowait()
        except Empty:
            return None
        logger.debug(f'deep_copy next_source={claimed}')

        return DirSource(self.source_queue, claimed)

    @classmethod
    def create(cls, ptok: ParsedToken, get_format_class_gz: Any):
        """Build a DirSource over the regular files directly inside a directory.

        Args:
            ptok: token whose ``all_but_params`` is the directory path; an
                optional 'format' param is appended to every per-file token.
            get_format_class_gz: callable mapping a ParsedToken to a
                ``(format_class, is_gz)`` pair.

        Returns:
            A DirSource, or None when the directory holds no files.

        Raises:
            RuntimeError: if a file has no matching format class.
        """
        format_override = ptok.get_params().get('format', None)
        dir_path = ptok.all_but_params

        joined = [os.path.join(dir_path, entry) for entry in os.listdir(dir_path)]
        file_paths = [candidate for candidate in joined if os.path.isfile(candidate)]

        pending = Queue()
        for file_path in file_paths:
            if format_override:
                token_text = f"{file_path}@format={format_override}"
            else:
                token_text = file_path

            format_class, is_gz = get_format_class_gz(ParsedToken(token_text))
            if not format_class:
                raise RuntimeError(f"No format for file: {file_path}")
            pending.put(format_class(LazyFileLocal(file_path, is_gz)))

        if pending.empty():
            return None

        return DirSource(pending)
pjk/sources/factory.py ADDED
@@ -0,0 +1,100 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ import os
5
+ import queue
6
+ from pjk.base import Source, ParsedToken
7
+ from pjk.common import ComponentFactory
8
+ from pjk.sources.json_source import JsonSource
9
+ from pjk.sources.csv_source import CSVSource
10
+ from pjk.sources.sql_source import SQLSource
11
+ from pjk.sources.tsv_source import TSVSource
12
+ from pjk.sources.s3_source import S3Source
13
+ from pjk.sources.inline_source import InlineSource
14
+ from pjk.sources.dir_source import DirSource
15
+ from pjk.sources.user_source_factory import UserSourceFactory
16
+ from pjk.sources.lazy_file import LazyFile
17
+ from pjk.sources.lazy_file_local import LazyFileLocal
18
+ from pjk.sources.parquet_source import ParquetSource
19
+
20
# Source component names mapped to their implementing classes.
COMPONENTS = {
    'inline': InlineSource,
    'json': JsonSource,
    'csv': CSVSource,
    'tsv': TSVSource,
    'sql': SQLSource,
    'parquet': ParquetSource,
}


class SourceFactory(ComponentFactory):
    """Resolves a source token to a concrete Source instance."""

    def __init__(self):
        super().__init__(COMPONENTS, 'source')

    def get_format_class_gz(self, ptok: ParsedToken):
        """Pick the format class for a file token and detect gzip compression.

        The format comes from the 'format' param when given (e.g. ``json`` or
        ``json.gz``), otherwise from the file extension (``foo.json`` or
        ``foo.json.gz``).

        Returns:
            ``(format_class, is_gz)``, or ``(None, None)`` when no registered
            format-capable class matches.
        """
        override = ptok.get_params().get('format', None)  # e.g. json or json.gz
        path = ptok.all_but_params

        # Require the dot: a name that merely ends in "gz" is not a gzip file.
        is_gz = path.endswith('.gz')

        if override:
            if override.endswith('.gz'):
                is_gz = True
                override = override.removesuffix('.gz')
            lookup = override
        else:  # e.g. foo.json or foo.json.gz
            _, ext = os.path.splitext(path.removesuffix('.gz'))
            lookup = ext.removeprefix('.')

        format_class = self.components.get(lookup, None)
        if not format_class:
            return None, None

        # Only classes flagged is_format may be chosen by file format. getattr
        # guards against a class that does not define the flag at all
        # (NOTE(review): Source may already default it; not visible here).
        if not getattr(format_class, 'is_format', False):
            return None, None

        return format_class, is_gz

    def create(self, token: str) -> Source:
        """Create the source for ``token``; returns None when nothing matches.

        Tried in order: inline expression, user ``.py`` script, registered
        component name, ``s3`` path, local directory, local file.
        """
        token = token.strip()

        if InlineSource.is_inline(token):
            return InlineSource(token)

        ptok = ParsedToken(token)

        if ptok.pre_colon.endswith('.py'):
            source = UserSourceFactory.create(ptok)
            if source:
                return source

        source_cls = self.components.get(ptok.pre_colon)
        if source_cls:
            usage = source_cls.usage()
            usage.bind(ptok)

            source = source_cls(ptok, usage)
            return source

        if ptok.all_but_params.startswith('s3'):
            return S3Source.create(ptok, get_format_class_gz=self.get_format_class_gz)

        if os.path.isdir(ptok.all_but_params):
            return DirSource.create(ptok, get_format_class_gz=self.get_format_class_gz)

        # individual file
        if os.path.isfile(ptok.all_but_params):
            source_class, is_gz = self.get_format_class_gz(ptok)
            if source_class:
                lazy_file = LazyFileLocal(ptok.all_but_params, is_gz)
                return source_class(lazy_file)

        return None
@@ -0,0 +1,11 @@
1
+ from pjk.base import Source, NoBindUsage
2
+
3
class FormatUsage(NoBindUsage):
    """Usage descriptor shared by the file-format sources."""

    def __init__(self, name: str, component_class: type, desc_override: str = None):
        """Register the description, an empty syntax and three example expressions.

        Args:
            name: format name, also used as the example file extension.
            component_class: the source class this usage describes.
            desc_override: replaces the generated description when not None.
        """
        if desc_override is None:
            desc = f'{name} source for s3 and local files/directories.'
        else:
            desc = desc_override
        super().__init__(name, desc, component_class)

        self.def_syntax("")  # no syntax for these
        self.def_example(expr_tokens=[f"myfile.{name}", "-"], expect=None)
        self.def_example(expr_tokens=["mydir", "-"], expect=None)
        self.def_example(expr_tokens=["s3://mybucket/path/to/files", "-"], expect=None)
@@ -0,0 +1,56 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ # pjk/sources/inline_source.py
5
+
6
+ import hjson
7
+ from hjson import HjsonDecodeError
8
+ from typing import Optional
9
+ from collections import OrderedDict
10
+ from pjk.base import Source, TokenError, Usage
11
+
12
def to_builtin(obj):
    """Recursively convert mappings (including OrderedDict) and lists to plain dict/list.

    Any dict subclass becomes a plain ``dict`` and every list is rebuilt, so
    an OrderedDict nested inside a plain dict is converted as well. Other
    values are returned unchanged.
    """
    if isinstance(obj, dict):
        return {key: to_builtin(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [to_builtin(item) for item in obj]
    return obj
20
+
21
class InlineSource(Source):
    """Source whose records are given inline on the command line as hjson."""

    @classmethod
    def usage(cls):
        """Return the Usage descriptor for 'inline', with two example expressions."""
        spec = Usage(
            name='inline',
            desc="simplified json lines format (uses hjson)",
            component_class=cls
        )
        spec.def_syntax('')
        spec.def_example(expr_tokens=["{hello: 'world!'}"], expect="{hello: 'world!'}")
        spec.def_example(expr_tokens=["[{id:1, dir:'up'},{id:2, dir:'down'}]"], expect="[{id:1, dir:'up'}, {id:2, dir:'down'}]")
        return spec

    def __init__(self, inline_expr):
        """Parse the expression; a single object becomes a one-record list.

        Raises:
            TokenError: if the text is not valid hjson, or parses to something
                other than an object or a list.
        """
        self.num_recs = 0
        try:
            parsed = hjson.loads(inline_expr)
        except HjsonDecodeError:
            raise TokenError('incorrect hjson line syntax')

        if isinstance(parsed, list):
            self.records = parsed
        elif isinstance(parsed, dict):
            self.records = [parsed]
        else:
            raise TokenError(f'"{inline_expr}"')

    def __iter__(self):
        """Yield each stored record converted to builtin containers."""
        for entry in self.records:
            yield to_builtin(entry)

    @classmethod
    def is_inline(cls, token):
        """Return True when the token is wrapped in ``{...}`` or ``[...]``."""
        brackets = {('{', '}'), ('[', ']')}
        return len(token) >= 2 and (token[0], token[-1]) in brackets
@@ -0,0 +1,35 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ import json
5
+ from pjk.base import Source, NoBindUsage
6
+ from pjk.sources.lazy_file import LazyFile
7
+ from pjk.sources.format_usage import FormatUsage
8
+ from pjk.log import logger
9
+
10
class JsonSource(Source):
    """Format source that reads one JSON document per line."""

    is_format = True

    @classmethod
    def usage(cls):
        """Return the shared FormatUsage descriptor registered under 'json'."""
        return FormatUsage('json', component_class=cls)

    def __init__(self, lazy_file: LazyFile):
        self.lazy_file = lazy_file
        self.num_recs = 0  # lines read so far, including ones skipped as invalid

    def __iter__(self):
        """Yield the parsed value of every line; invalid lines are logged and skipped."""
        with self.lazy_file.open() as f:
            for line in f:
                self.num_recs += 1
                try:
                    record = json.loads(line)
                except json.JSONDecodeError as e:
                    print('json decode error, see ~/.pjk/logs')
                    snippet = line.strip()
                    if len(snippet) > 200:
                        snippet = snippet[:200] + "…"
                    # Use the LazyFile interface's name(): not every
                    # implementation has a `path` attribute (LazyFileS3 does not).
                    logger.warning(
                        f"Skipping invalid JSON at line {self.num_recs} "
                        f"in {self.lazy_file.name()}: {e} | data: {snippet}"
                    )
                    continue
                yield record
@@ -0,0 +1,16 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ from abc import ABC, abstractmethod
5
+ from typing import IO
6
+
7
class LazyFile(ABC):
    """Abstract handle to a file that is opened only when its data is needed."""

    @abstractmethod
    def open(self) -> IO[str]:
        """Open and return a text-mode file-like object."""

    @abstractmethod
    def name(self) -> str:
        """Return a descriptive identifier (e.g. path or URI)."""
@@ -0,0 +1,22 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ import gzip
5
+ import io
6
+ from typing import IO
7
+ from pjk.sources.lazy_file import LazyFile
8
+
9
class LazyFileLocal(LazyFile):
    """LazyFile backed by a path on the local filesystem, optionally gzip-compressed."""

    def __init__(self, path: str, is_gz: bool = False):
        self.path = path
        self.is_gz = is_gz

    def open(self, binary: bool = False) -> IO:
        """Open the file, transparently decompressing gzip content.

        Args:
            binary: when True return a bytes stream instead of a text stream.
                ParquetSource passes ``binary=True``, which this method
                previously rejected with a TypeError.

        Returns:
            A file-like object; closing it also closes the underlying file.
        """
        compressed = self.is_gz or self.path.endswith(".gz")
        if compressed:
            # gzip.open owns the underlying file and closes it with the stream,
            # unlike GzipFile(fileobj=...), which leaves the raw file open.
            return gzip.open(self.path, "rb" if binary else "rt")

        raw = open(self.path, "rb")
        if binary:
            return raw
        return io.TextIOWrapper(raw)

    def name(self) -> str:
        """Return the local path."""
        return self.path
@@ -0,0 +1,28 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ import io
5
+ import gzip
6
+ from typing import IO
7
+ from pjk.sources.lazy_file import LazyFile
8
+
9
class LazyFileS3(LazyFile):
    """LazyFile backed by an S3 object, optionally gzip-compressed."""

    def __init__(self, bucket: str, key: str, is_gz: bool):
        import boto3  # lazy import
        self.s3 = boto3.client('s3')  # for each thread
        self.bucket = bucket
        self.key = key
        self.is_gz = is_gz

    def open(self, binary=False) -> IO:
        """Download the whole object and return an in-memory stream over it.

        Args:
            binary: when True return a bytes stream (decompressed for gzip
                objects); otherwise the content is decoded as UTF-8 text.
        """
        obj = self.s3.get_object(Bucket=self.bucket, Key=self.key)
        raw_body = obj['Body'].read()

        if self.is_gz:
            unzipped = gzip.GzipFile(fileobj=io.BytesIO(raw_body))
            if binary:
                return unzipped
            # Decode as UTF-8 explicitly so gzip and plain objects agree,
            # instead of falling back to the locale default.
            return io.TextIOWrapper(unzipped, encoding="utf-8")

        if binary:
            return io.BytesIO(raw_body)
        return io.StringIO(raw_body.decode("utf-8"))

    def name(self) -> str:
        """Return the object's ``s3://bucket/key`` URI."""
        return f"s3://{self.bucket}/{self.key}"
@@ -0,0 +1,32 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright 2024 Mike Schultz
3
+
4
+ from pjk.base import Source
5
+ from pjk.sources.lazy_file import LazyFile
6
+ from pjk.sources.format_usage import FormatUsage
7
+
8
class ParquetSource(Source):
    """Format source that yields one dict per row of a Parquet file."""

    is_format = True  # enables format-based routing

    @classmethod
    def usage(cls):
        """Return the shared FormatUsage descriptor registered under 'parquet'."""
        return FormatUsage('parquet', component_class=cls)

    def __init__(self, lazy_file: LazyFile):
        self.num_recs = 0  # rows yielded so far
        self.lazy_file = lazy_file

    def __iter__(self):
        """Read the whole table, then yield each row as a column-name -> value dict."""
        import pyarrow.parquet as pq  # lazy import

        with self.lazy_file.open(binary=True) as handle:
            columns = pq.read_table(handle).to_pydict()

            if not columns:
                return  # no columns = no rows

            names = list(columns)
            # Walk the column value lists in lockstep; each tuple is one row.
            for row_values in zip(*(columns[name] for name in names)):
                row = dict(zip(names, row_values))
                self.num_recs += 1
                yield row