pbz2 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pbz2/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ """pbz2: stream and parallel-process `.bz2` files via pbzip2."""
2
+
3
+ from .parallel import process_parallel
4
+ from .reader import iter_chunks, iter_jsonl, iter_lines, open_decompress
5
+
6
+ __all__ = [
7
+ "iter_chunks",
8
+ "iter_jsonl",
9
+ "iter_lines",
10
+ "open_decompress",
11
+ "process_parallel",
12
+ ]
pbz2/cli.py ADDED
@@ -0,0 +1,27 @@
1
+ """pbz2 CLI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ import typer
8
+
9
+ from .reader import iter_lines
10
+
11
+ app = typer.Typer(help="Stream `.bz2` files via pbzip2.")
12
+
13
+
14
@app.command()
def count(path: Path) -> None:
    """Count lines in a `.bz2` file."""
    # Tally with an explicit counter; `iter_lines` only yields non-empty
    # lines, so blank lines do not contribute to the total.
    total = 0
    for _ in iter_lines(path):
        total += 1
    typer.echo(total)
19
+
20
+
21
@app.command()
def head(path: Path, n: int = typer.Option(10, "-n", help="Number of lines.")) -> None:
    """Print the first N lines of a `.bz2` file."""
    # Track how many lines have been printed and stop as soon as the quota is
    # reached; a non-positive `n` therefore prints nothing.
    printed = 0
    for text in iter_lines(path):
        if printed >= n:
            break
        typer.echo(text)
        printed += 1
pbz2/parallel.py ADDED
@@ -0,0 +1,77 @@
1
+ """Parallel processing of `.bz2` streams via a process pool.
2
+
3
+ Streams chunks of newline-terminated records from a `.bz2` file through a
4
+ worker function in a `ProcessPoolExecutor`, dispatching each result to an
5
+ optional handler in the main process.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ import os
12
+ from collections.abc import Callable, Sequence
13
+ from concurrent.futures import FIRST_COMPLETED, ProcessPoolExecutor, wait
14
+ from typing import Any
15
+
16
+ from .reader import (
17
+ DEFAULT_BUFSIZE_MB,
18
+ DEFAULT_STREAM_BUFFER_MB,
19
+ iter_chunks,
20
+ )
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
def process_parallel(
    path: str | os.PathLike,
    worker_fn: Callable[..., Any],
    *,
    on_result: Callable[[Any], None] | None = None,
    worker_args: Sequence[Any] = (),
    num_processes: int | None = None,
    max_pending: int | None = None,
    bufsize_mb: int = DEFAULT_BUFSIZE_MB,
    stream_buffer_mb: int = DEFAULT_STREAM_BUFFER_MB,
) -> None:
    """Process newline-aligned chunks of `path` in a pool of worker processes.

    `worker_fn(chunk, *worker_args)` runs in worker processes; each chunk is a
    `str` of complete newline-terminated records. Chunks that contain only
    whitespace are skipped. Results are passed to `on_result` in the main
    process as their futures complete, which is not necessarily the order in
    which the chunks were submitted.

    Args:
        path: The `.bz2` file to read.
        worker_fn: Callable executed in the pool for every chunk.
        on_result: Optional handler called in the main process with each
            worker result. Results are discarded when it is `None`.
        worker_args: Extra positional arguments appended to every
            `worker_fn` call.
        num_processes: Pool size, also forwarded to `iter_chunks` as
            `num_processors`. Defaults to half of `os.cpu_count()`, at
            least 1.
        max_pending: Maximum number of in-flight futures. Once reached, the
            producer blocks until at least one future completes, which bounds
            memory. Defaults to twice the pool size.
        bufsize_mb: Forwarded to `iter_chunks`.
        stream_buffer_mb: Forwarded to `iter_chunks`.

    Raises:
        ValueError: If `max_pending` is less than 1.
        Exception: Anything raised by `worker_fn` or `on_result` propagates
            to the caller.
    """
    if max_pending is not None and max_pending < 1:
        # A non-positive cap can never be satisfied: the back-pressure loop
        # below would spin forever on an empty pending list.
        raise ValueError(f"max_pending must be >= 1, got {max_pending!r}")

    nproc = num_processes or max(1, (os.cpu_count() or 2) // 2)
    cap = max_pending if max_pending is not None else nproc * 2

    def drain_done(pending: list, *, block: bool = False) -> list:
        """Dispatch finished futures and return the ones still running."""
        if block and pending:
            wait(pending, return_when=FIRST_COMPLETED)
        still_pending = []
        for fut in pending:
            if fut.done():
                # `result()` re-raises any exception raised in the worker.
                result = fut.result()
                if on_result is not None:
                    on_result(result)
            else:
                still_pending.append(fut)
        return still_pending

    with ProcessPoolExecutor(max_workers=nproc) as executor:
        pending: list = []
        try:
            for chunk in iter_chunks(
                path,
                num_processors=nproc,
                bufsize_mb=bufsize_mb,
                stream_buffer_mb=stream_buffer_mb,
            ):
                if not chunk.strip():
                    continue
                pending.append(executor.submit(worker_fn, chunk, *worker_args))
                # Opportunistically hand off whatever has already finished,
                # then apply back-pressure if too much work is in flight.
                pending = drain_done(pending)
                while len(pending) >= cap:
                    pending = drain_done(pending, block=True)

            while pending:
                pending = drain_done(pending, block=True)
        except BaseException:
            # Leaving the `with` block waits for outstanding work; cancel the
            # futures that have not started so a failure does not keep
            # processing chunks whose results would be thrown away.
            for fut in pending:
                fut.cancel()
            raise
pbz2/py.typed ADDED
File without changes
pbz2/reader.py ADDED
@@ -0,0 +1,136 @@
1
+ """Streaming readers for `.bz2` files using pbzip2 for parallel decompression.
2
+
3
+ Falls back to the stdlib `bz2` module when the `pbzip2` binary is unavailable.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import bz2
9
+ import codecs
10
+ import logging
11
+ import os
12
+ import shutil
13
+ import subprocess
14
+ from collections.abc import Callable, Iterator
15
+ from typing import IO, Any, cast
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ DEFAULT_BUFSIZE_MB = 32 # Buffer size (MB) of the Python file object wrapping pbzip2's stdout; passed as Popen `bufsize`
20
+ DEFAULT_STREAM_BUFFER_MB = 4 # Python-side read chunk size
21
+
22
+
23
def _has_pbzip2() -> bool:
    """Report whether `shutil.which` can locate a `pbzip2` executable."""
    located = shutil.which("pbzip2")
    return located is not None
25
+
26
+
27
def open_decompress(
    path: str | os.PathLike,
    *,
    num_processors: int | None = None,
    bufsize_mb: int = DEFAULT_BUFSIZE_MB,
) -> tuple[IO[bytes], subprocess.Popen | None]:
    """Open a binary stream that yields decompressed bytes from `path`.

    Args:
        path: The `.bz2` file to decompress.
        num_processors: Value for pbzip2's `-p` option. Defaults to
            `os.cpu_count() - 1`, at least 1. Unused by the stdlib fallback.
        bufsize_mb: Size in MiB passed as `bufsize` to `subprocess.Popen`,
            i.e. the buffer of the Python file object wrapping pbzip2's
            stdout. Unused by the stdlib fallback.

    Returns:
        `(stream, process)`. `process` is the pbzip2 subprocess when pbzip2 is
        available, else `None` (stdlib `bz2.open` fallback). The caller is
        responsible for closing `stream` and waiting on `process`.

    Raises:
        OSError: If `path` cannot be opened for reading (for example
            `FileNotFoundError`), regardless of which backend is used.
    """
    filename = os.fspath(path)

    if _has_pbzip2():
        # pbzip2's stderr is discarded below, so a missing or unreadable input
        # would otherwise show up only as a silently empty stream. Opening the
        # file first raises the same OSError the `bz2.open` fallback raises.
        with open(filename, "rb"):
            pass

        nproc = num_processors or max(1, (os.cpu_count() or 2) - 1)
        cmd = [
            "pbzip2",
            "-dc",
            f"-p{int(nproc)}",
            filename,
        ]
        logger.info("decompress cmd: %s", " ".join(cmd))
        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            bufsize=bufsize_mb * 1024 * 1024,
        )
        # stdout=PIPE guarantees a stream; the assert only narrows the type.
        assert proc.stdout is not None
        return proc.stdout, proc

    logger.info("pbzip2 not found; falling back to stdlib bz2 for %s", path)
    return cast(IO[bytes], bz2.open(filename, "rb")), None
58
+
59
+
60
def iter_chunks(
    path: str | os.PathLike,
    *,
    num_processors: int | None = None,
    bufsize_mb: int = DEFAULT_BUFSIZE_MB,
    stream_buffer_mb: int = DEFAULT_STREAM_BUFFER_MB,
) -> Iterator[str]:
    """Yield UTF-8 text chunks of complete newline-terminated records.

    Every yielded chunk ends on a line-feed boundary, except possibly the last
    one when the input does not end with a newline. Records are delimited by
    the line-feed character only, so callers should split chunks on that
    character. UTF-8 multibyte characters split across read boundaries are
    handled by an incremental decoder.

    Args:
        path: The `.bz2` file to read.
        num_processors: Forwarded to `open_decompress`.
        bufsize_mb: Forwarded to `open_decompress`.
        stream_buffer_mb: Number of MiB requested per read from the
            decompressed stream.

    Raises:
        UnicodeDecodeError: If the decompressed data is not valid UTF-8.
        subprocess.CalledProcessError: If the whole stream was read but the
            pbzip2 subprocess exited with a non-zero status (for example on
            corrupt or truncated input).
    """
    stream, proc = open_decompress(
        path,
        num_processors=num_processors,
        bufsize_mb=bufsize_mb,
    )
    read_size = stream_buffer_mb * 1024 * 1024
    decoder = codecs.getincrementaldecoder("utf-8")()
    # Text decoded since the last yielded newline. It never contains a
    # newline itself, so only freshly decoded text has to be searched.
    carry: list[str] = []
    reached_eof = False
    returncode = 0

    try:
        while True:
            data = stream.read(read_size)
            if not data:
                reached_eof = True
                break

            text = decoder.decode(data)
            cut = text.rfind("\n")
            if cut == -1:
                carry.append(text)
                continue
            carry.append(text[: cut + 1])
            yield "".join(carry)
            carry = [text[cut + 1 :]]

        # Flush any final bytes through the incremental decoder.
        carry.append(decoder.decode(b"", final=True))
        tail = "".join(carry)
        if tail:
            yield tail
    finally:
        stream.close()
        if proc is not None:
            if not reached_eof and proc.poll() is None:
                # The consumer stopped early or an error occurred: stop the
                # decompressor rather than waiting for it to notice that its
                # output pipe was closed.
                proc.terminate()
            returncode = proc.wait()

    # Only reached when the stream was consumed to the end; an early close or
    # an exception leaves through the `finally` block above instead.
    if proc is not None and returncode != 0:
        raise subprocess.CalledProcessError(returncode, proc.args)
101
+
102
+
103
def iter_lines(
    path: str | os.PathLike,
    **kwargs: Any,
) -> Iterator[str]:
    """Yield non-empty UTF-8 lines (without trailing newline) from a `.bz2` file.

    Lines are delimited by the line-feed character only; a carriage return
    directly before it (CRLF input) is removed as well. Other characters that
    `str.splitlines()` treats as line boundaries (for example U+2028, U+2029
    or U+0085) are left inside the line, so a record that legitimately
    contains them, such as a JSON string, is not cut apart.

    Args:
        path: The `.bz2` file to read.
        **kwargs: Forwarded to `iter_chunks`.
    """
    for chunk in iter_chunks(path, **kwargs):
        for line in chunk.split("\n"):
            line = line.removesuffix("\r")
            if line:
                yield line
112
+
113
+
114
def iter_jsonl(
    path: str | os.PathLike,
    *,
    loads: Callable[[str | bytes], Any] | None = None,
    **kwargs: Any,
) -> Iterator[Any]:
    """Parse each line of a `.json.bz2` file and yield the resulting objects.

    The parser is `loads` when given. Otherwise `orjson.loads` is used if
    `orjson` can be imported, with stdlib `json.loads` as the fallback.
    Remaining keyword arguments are forwarded to `iter_lines`.
    """
    parse = loads
    if parse is None:
        try:
            import orjson
        except ImportError:
            import json

            parse = json.loads
        else:
            parse = orjson.loads

    yield from map(parse, iter_lines(path, **kwargs))
@@ -0,0 +1,94 @@
1
+ Metadata-Version: 2.4
2
+ Name: pbz2
3
+ Version: 0.1.0
4
+ Summary: Stream and parallel-process .bz2 files via pbzip2.
5
+ Project-URL: repository, https://github.com/gitronald/pbz2
6
+ Author-email: gitronald <gitronald@users.noreply.github.com>
7
+ License-Expression: MIT
8
+ License-File: LICENSE
9
+ Requires-Python: >=3.11
10
+ Requires-Dist: orjson>=3.11.9
11
+ Requires-Dist: typer
12
+ Description-Content-Type: text/markdown
13
+
14
+ # pbz2
15
+
16
+ Stream and parallel-process `.bz2` files via [pbzip2](http://compression.great-site.net/pbzip2/) (parallel bzip2). Falls back to the stdlib `bz2` module when the `pbzip2` binary is unavailable.
17
+
18
+ ## Install
19
+
20
+ ```bash
21
+ uv add pbz2
22
+ ```
23
+
24
+ Install `pbzip2` for parallel decompression:
25
+
26
+ ```bash
27
+ sudo apt install pbzip2 # Debian/Ubuntu
28
+ brew install pbzip2 # macOS
29
+ ```
30
+
31
+ ## Usage
32
+
33
+ ### Iterate
34
+
35
+ ```python
36
+ import pbz2
37
+
38
+ # Parsed JSON objects from a .json.bz2 file
39
+ for obj in pbz2.iter_jsonl("data.json.bz2"):
40
+ ...
41
+
42
+ # Raw UTF-8 lines
43
+ for line in pbz2.iter_lines("data.txt.bz2"):
44
+ ...
45
+
46
+ # Newline-aligned text chunks (useful for batched processing)
47
+ for chunk in pbz2.iter_chunks("data.txt.bz2"):
48
+ ...
49
+ ```
50
+
51
+ ### Parallel processing
52
+
53
+ `process_parallel` streams chunks of newline-terminated records through a worker pool. The worker function receives raw text chunks (so parsing happens in the worker, not the main process), and `on_result` runs in the main process to handle each result as it completes.
54
+
55
+ ```python
56
+ import json
57
+ import pbz2
58
+
59
+ def parse_chunk(chunk: str) -> list[dict]:
60
+ return [json.loads(line) for line in chunk.splitlines() if line]
61
+
62
+ def save(records: list[dict]) -> None:
63
+ ... # write to db, file, etc.
64
+
65
+ pbz2.process_parallel(
66
+ "data.json.bz2",
67
+ worker_fn=parse_chunk,
68
+ on_result=save,
69
+ num_processes=8,
70
+ )
71
+ ```
72
+
73
+ ### CLI
74
+
75
+ ```bash
76
+ pbz2 count data.json.bz2
77
+ pbz2 head data.json.bz2 -n 5
78
+ ```
79
+
80
+ ## API
81
+
82
+ | Function | Description |
83
+ | --- | --- |
84
+ | `iter_chunks(path, **opts)` | Yield UTF-8 text chunks ending on a newline boundary. |
85
+ | `iter_lines(path, **opts)` | Yield non-empty UTF-8 lines (no trailing newline). |
86
+ | `iter_jsonl(path, *, loads=None, **opts)` | Yield parsed JSON objects (uses `orjson` if installed). |
87
+ | `process_parallel(path, worker_fn, *, on_result=None, worker_args=(), num_processes=None, max_pending=None, ...)` | Run `worker_fn(chunk, *worker_args)` in a process pool, dispatching results to `on_result`. |
88
+ | `open_decompress(path, **opts)` | Low-level: open a binary stream of decompressed bytes. |
89
+
90
+ ### Common options
91
+
92
+ - `num_processors` — pbzip2 worker count (default: cpu_count - 1)
93
+ - `bufsize_mb` — buffer size of the Python file object reading pbzip2's stdout (default: 32 MB)
94
+ - `stream_buffer_mb` — Python-side read chunk size (default: 4 MB)
@@ -0,0 +1,10 @@
1
+ pbz2/__init__.py,sha256=iUiLFKg8eYEHID2xQ83-X9WygPjnr_im9HgIppET3o8,295
2
+ pbz2/cli.py,sha256=_cqQkeDAdnOLGGdcE5uN3MeudSVenrfDixwWggqg7Pw,597
3
+ pbz2/parallel.py,sha256=kLJUr1nWET-ol6_spyXnq9T4AHVIoAvGrh4wzO3Vk3s,2521
4
+ pbz2/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ pbz2/reader.py,sha256=VmLSnUJFiYzJvEFCFgiPI6VFrxBnh_kf9iqu0qRfDlg,3876
6
+ pbz2-0.1.0.dist-info/METADATA,sha256=0-ChnaBE5GNZSaBEPAEvsEaBLFRahwgEcV1dTi7hMlg,2686
7
+ pbz2-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
8
+ pbz2-0.1.0.dist-info/entry_points.txt,sha256=ov0ld90XMBPEvkBaCVnPD0lJgM278YBjZ9hvuutRL-Y,38
9
+ pbz2-0.1.0.dist-info/licenses/LICENSE,sha256=a1bWEq8sJGCTdJ_BtJVrfK44EUtx2ToP2sLCcKl7xkU,1077
10
+ pbz2-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ pbz2 = pbz2.cli:app
@@ -0,0 +1,22 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Ronald E. Robertson
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+