pbz2 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pbz2/__init__.py +12 -0
- pbz2/cli.py +27 -0
- pbz2/parallel.py +77 -0
- pbz2/py.typed +0 -0
- pbz2/reader.py +136 -0
- pbz2-0.1.0.dist-info/METADATA +94 -0
- pbz2-0.1.0.dist-info/RECORD +10 -0
- pbz2-0.1.0.dist-info/WHEEL +4 -0
- pbz2-0.1.0.dist-info/entry_points.txt +2 -0
- pbz2-0.1.0.dist-info/licenses/LICENSE +22 -0
pbz2/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""pbz2: stream and parallel-process `.bz2` files via pbzip2."""
|
|
2
|
+
|
|
3
|
+
from .parallel import process_parallel
|
|
4
|
+
from .reader import iter_chunks, iter_jsonl, iter_lines, open_decompress
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"iter_chunks",
|
|
8
|
+
"iter_jsonl",
|
|
9
|
+
"iter_lines",
|
|
10
|
+
"open_decompress",
|
|
11
|
+
"process_parallel",
|
|
12
|
+
]
|
pbz2/cli.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""pbz2 CLI."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import typer
|
|
8
|
+
|
|
9
|
+
from .reader import iter_lines
|
|
10
|
+
|
|
11
|
+
app = typer.Typer(help="Stream `.bz2` files via pbzip2.")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@app.command()
def count(path: Path) -> None:
    """Count lines in a `.bz2` file."""
    # iter_lines drops empty lines, so the total is the number of non-empty
    # records. Counting with a plain loop keeps memory use constant.
    total = 0
    for _ in iter_lines(path):
        total += 1
    typer.echo(total)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@app.command()
def head(path: Path, n: int = typer.Option(10, "-n", help="Number of lines.")) -> None:
    """Print the first N lines of a `.bz2` file."""
    # Nothing to print: return before iter_lines starts any decompression.
    if n <= 0:
        return
    # Check the running count *after* echoing, so the loop stops right at the
    # n-th line instead of pulling (and decompressing) one more record that
    # would never be shown.
    for shown, line in enumerate(iter_lines(path), start=1):
        typer.echo(line)
        if shown >= n:
            break
|
pbz2/parallel.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""Parallel processing of `.bz2` streams via a process pool.
|
|
2
|
+
|
|
3
|
+
Streams chunks of newline-terminated records from a `.bz2` file through a
|
|
4
|
+
worker function in a `ProcessPoolExecutor`, dispatching each result to an
|
|
5
|
+
optional handler in the main process.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
import os
|
|
12
|
+
from collections.abc import Callable, Sequence
|
|
13
|
+
from concurrent.futures import FIRST_COMPLETED, ProcessPoolExecutor, wait
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
from .reader import (
|
|
17
|
+
DEFAULT_BUFSIZE_MB,
|
|
18
|
+
DEFAULT_STREAM_BUFFER_MB,
|
|
19
|
+
iter_chunks,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def process_parallel(
    path: str | os.PathLike,
    worker_fn: Callable[..., Any],
    *,
    on_result: Callable[[Any], None] | None = None,
    worker_args: Sequence[Any] = (),
    num_processes: int | None = None,
    max_pending: int | None = None,
    bufsize_mb: int = DEFAULT_BUFSIZE_MB,
    stream_buffer_mb: int = DEFAULT_STREAM_BUFFER_MB,
) -> None:
    """Process chunks of `path` in parallel.

    `worker_fn(chunk, *worker_args)` runs in worker processes; each chunk is a
    `str` of complete newline-terminated records. Results are dispatched to
    `on_result` in the main process as they are found complete, which is not
    necessarily the order of the chunks in the file.

    Args:
        path: The `.bz2` file to read.
        worker_fn: Callable executed in the pool for every non-blank chunk.
        on_result: Optional callable invoked in the main process with each
            value returned by `worker_fn`.
        worker_args: Extra positional arguments appended to every
            `worker_fn` call.
        num_processes: Pool size, also forwarded to `iter_chunks` as
            `num_processors`. Defaults to half the CPU count (at least 1).
        max_pending: Maximum number of in-flight futures; the producer stops
            reading while that many are outstanding, which bounds memory.
            Defaults to `2 * num_processes`. Must be at least 1.
        bufsize_mb: Forwarded to `iter_chunks`.
        stream_buffer_mb: Forwarded to `iter_chunks`.

    Raises:
        ValueError: If `max_pending` is smaller than 1.
        Exception: Whatever `worker_fn` or `on_result` raises is re-raised
            here; futures that have not started yet are cancelled first.
    """
    nproc = num_processes or max(1, (os.cpu_count() or 2) // 2)
    cap = max_pending if max_pending is not None else nproc * 2
    if cap < 1:
        # With a cap below 1 the back-pressure loop below could never finish:
        # an empty pending list would still satisfy `len(pending) >= cap`.
        raise ValueError(f"max_pending must be >= 1, got {max_pending!r}")

    def drain_done(pending: list, *, block: bool = False) -> list:
        """Dispatch finished futures and return the ones still running."""
        if block and pending:
            wait(pending, return_when=FIRST_COMPLETED)
        still_pending = []
        for fut in pending:
            if fut.done():
                # .result() re-raises any exception from the worker.
                result = fut.result()
                if on_result is not None:
                    on_result(result)
            else:
                still_pending.append(fut)
        return still_pending

    with ProcessPoolExecutor(max_workers=nproc) as executor:
        pending: list = []
        try:
            for chunk in iter_chunks(
                path,
                num_processors=nproc,
                bufsize_mb=bufsize_mb,
                stream_buffer_mb=stream_buffer_mb,
            ):
                # Skip whitespace-only chunks; isspace() avoids the full copy
                # that strip() would make of a multi-megabyte string.
                if not chunk or chunk.isspace():
                    continue
                pending.append(executor.submit(worker_fn, chunk, *worker_args))
                pending = drain_done(pending)
                # Back-pressure: stop reading until the pool catches up.
                while len(pending) >= cap:
                    pending = drain_done(pending, block=True)

            while pending:
                pending = drain_done(pending, block=True)
        except BaseException:
            # The executor's __exit__ waits for submitted work; cancel what
            # has not started so a failure is reported without first running
            # chunks whose results would be thrown away.
            for fut in pending:
                fut.cancel()
            raise
|
pbz2/py.typed
ADDED
|
File without changes
|
pbz2/reader.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Streaming readers for `.bz2` files using pbzip2 for parallel decompression.
|
|
2
|
+
|
|
3
|
+
Falls back to the stdlib `bz2` module when the `pbzip2` binary is unavailable.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import bz2
|
|
9
|
+
import codecs
|
|
10
|
+
import logging
|
|
11
|
+
import os
|
|
12
|
+
import shutil
|
|
13
|
+
import subprocess
|
|
14
|
+
from collections.abc import Callable, Iterator
|
|
15
|
+
from typing import IO, Any, cast
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
DEFAULT_BUFSIZE_MB = 32  # Buffer size of Python's reader on the pbzip2 stdout pipe (Popen bufsize)
|
|
20
|
+
DEFAULT_STREAM_BUFFER_MB = 4 # Python-side read chunk size
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _has_pbzip2() -> bool:
    """Report whether a `pbzip2` executable can be located on the PATH."""
    located = shutil.which("pbzip2")
    return located is not None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def open_decompress(
    path: str | os.PathLike,
    *,
    num_processors: int | None = None,
    bufsize_mb: int = DEFAULT_BUFSIZE_MB,
) -> tuple[IO[bytes], subprocess.Popen | None]:
    """Open a binary stream of the decompressed contents of `path`.

    Returns a `(stream, process)` pair. When the `pbzip2` binary is found,
    `stream` is the stdout pipe of a `pbzip2 -dc` child and `process` is that
    child; the caller is responsible for closing the stream and waiting on the
    process. Otherwise the file is opened with stdlib `bz2.open` and `process`
    is `None`.

    `num_processors` sets pbzip2's `-p` value (default: CPU count minus one,
    at least 1). `bufsize_mb` is passed to `subprocess.Popen` as `bufsize`,
    in megabytes; neither option affects the stdlib fallback.
    """
    if not _has_pbzip2():
        logger.info("pbzip2 not found; falling back to stdlib bz2 for %s", path)
        return cast(IO[bytes], bz2.open(os.fspath(path), "rb")), None

    threads = num_processors or max(1, (os.cpu_count() or 2) - 1)
    argv = ["pbzip2", "-dc", f"-p{int(threads)}", os.fspath(path)]
    logger.info("decompress cmd: %s", " ".join(argv))
    # pbzip2's stderr is discarded; only its stdout is piped back to us.
    child = subprocess.Popen(
        argv,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
        bufsize=bufsize_mb * 1024 * 1024,
    )
    # stdout=PIPE guarantees a stream object; the assert narrows the type.
    assert child.stdout is not None
    return child.stdout, child
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def iter_chunks(
    path: str | os.PathLike,
    *,
    num_processors: int | None = None,
    bufsize_mb: int = DEFAULT_BUFSIZE_MB,
    stream_buffer_mb: int = DEFAULT_STREAM_BUFFER_MB,
) -> Iterator[str]:
    """Yield UTF-8 text chunks of complete newline-terminated records.

    Every yielded chunk ends right after a `"\\n"`, so callers can split it on
    `"\\n"` without cutting a record in half; only the very last chunk may lack
    a trailing newline, when the file itself does. UTF-8 multibyte characters
    split across read boundaries are reassembled by an incremental decoder.

    Args:
        path: The `.bz2` file to read.
        num_processors: Forwarded to `open_decompress`.
        bufsize_mb: Forwarded to `open_decompress`.
        stream_buffer_mb: Size in megabytes of each read from the decompressed
            stream. Must be at least 1.

    Raises:
        ValueError: If `stream_buffer_mb` is smaller than 1.
        subprocess.CalledProcessError: If pbzip2 exits with a non-zero status
            (for example a missing or corrupt file). Chunks decoded before the
            failure have already been yielded.
        UnicodeDecodeError: If the decompressed data is not valid UTF-8.
    """
    if stream_buffer_mb < 1:
        # A zero-byte read returns b"", which is indistinguishable from EOF
        # and used to make the whole file look empty.
        raise ValueError(f"stream_buffer_mb must be >= 1, got {stream_buffer_mb!r}")

    stream, proc = open_decompress(
        path,
        num_processors=num_processors,
        bufsize_mb=bufsize_mb,
    )
    read_size = stream_buffer_mb * 1024 * 1024
    decoder = codecs.getincrementaldecoder("utf-8")()
    # Text decoded since the last newline boundary. Kept as pieces so that a
    # very long record is not re-scanned and re-copied on every read.
    pieces: list[str] = []

    try:
        while True:
            data = stream.read(read_size)
            if not data:
                # EOF on the pipe: pbzip2 has closed stdout, so waiting cannot
                # block on output. Its stderr is discarded, which makes the
                # exit status the only evidence of a failed decompression.
                if proc is not None:
                    returncode = proc.wait()
                    if returncode != 0:
                        raise subprocess.CalledProcessError(returncode, proc.args)
                # Flush any final bytes through the incremental decoder
                pieces.append(decoder.decode(b"", final=True))
                tail = "".join(pieces)
                if tail:
                    yield tail
                return

            text = decoder.decode(data)
            # The carried-over pieces never contain a newline, so only the
            # newly decoded text needs to be searched.
            cut = text.rfind("\n") + 1
            if cut == 0:
                pieces.append(text)
                continue
            pieces.append(text[:cut])
            yield "".join(pieces)
            pieces = [text[cut:]]
    finally:
        # Also runs when the consumer stops early: closing the pipe lets
        # pbzip2 terminate, and wait() reaps it.
        stream.close()
        if proc is not None:
            proc.wait()
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def iter_lines(
    path: str | os.PathLike,
    **kwargs: Any,
) -> Iterator[str]:
    """Yield UTF-8 lines (without trailing newline) from a `.bz2` file.

    Lines are delimited by `"\\n"` only, matching how `iter_chunks` cuts its
    chunks; a `"\\r"` directly before the newline (CRLF files) is removed too.
    Empty lines are skipped. Keyword arguments are forwarded to `iter_chunks`.

    `str.splitlines()` is deliberately not used: it also breaks on characters
    such as U+2028, U+2029 and U+0085, which may appear unescaped inside a
    record (for instance in a JSON string) and would cut that record in two.
    """
    for chunk in iter_chunks(path, **kwargs):
        for line in chunk.split("\n"):
            line = line.removesuffix("\r")
            if line:
                yield line
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def iter_jsonl(
    path: str | os.PathLike,
    *,
    loads: Callable[[str | bytes], Any] | None = None,
    **kwargs: Any,
) -> Iterator[Any]:
    """Yield one parsed JSON value per line of a `.json.bz2` file.

    The parser is `loads` when given; otherwise `orjson.loads` if `orjson`
    can be imported, falling back to stdlib `json.loads`. Remaining keyword
    arguments are forwarded to `iter_lines`.
    """
    parse = loads
    if parse is None:
        # Resolve the default parser lazily so orjson stays optional here.
        try:
            from orjson import loads as parse
        except ImportError:
            from json import loads as parse

    for record in iter_lines(path, **kwargs):
        yield parse(record)
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pbz2
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Stream and parallel-process .bz2 files via pbzip2.
|
|
5
|
+
Project-URL: repository, https://github.com/gitronald/pbz2
|
|
6
|
+
Author-email: gitronald <gitronald@users.noreply.github.com>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Python: >=3.11
|
|
10
|
+
Requires-Dist: orjson>=3.11.9
|
|
11
|
+
Requires-Dist: typer
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# pbz2
|
|
15
|
+
|
|
16
|
+
Stream and parallel-process `.bz2` files via [pbzip2](http://compression.great-site.net/pbzip2/) (parallel bzip2). Falls back to the stdlib `bz2` module when the `pbzip2` binary is unavailable.
|
|
17
|
+
|
|
18
|
+
## Install
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
uv add pbz2
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Install `pbzip2` for parallel decompression:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
sudo apt install pbzip2 # Debian/Ubuntu
|
|
28
|
+
brew install pbzip2 # macOS
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Usage
|
|
32
|
+
|
|
33
|
+
### Iterate
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
import pbz2
|
|
37
|
+
|
|
38
|
+
# Parsed JSON objects from a .json.bz2 file
|
|
39
|
+
for obj in pbz2.iter_jsonl("data.json.bz2"):
|
|
40
|
+
...
|
|
41
|
+
|
|
42
|
+
# Raw UTF-8 lines
|
|
43
|
+
for line in pbz2.iter_lines("data.txt.bz2"):
|
|
44
|
+
...
|
|
45
|
+
|
|
46
|
+
# Newline-aligned text chunks (useful for batched processing)
|
|
47
|
+
for chunk in pbz2.iter_chunks("data.txt.bz2"):
|
|
48
|
+
...
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### Parallel processing
|
|
52
|
+
|
|
53
|
+
`process_parallel` streams chunks of newline-terminated records through a worker pool. The worker function receives raw text chunks (so parsing happens in the worker, not the main process), and `on_result` runs in the main process to handle each result as it completes.
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
import json
|
|
57
|
+
import pbz2
|
|
58
|
+
|
|
59
|
+
def parse_chunk(chunk: str) -> list[dict]:
|
|
60
|
+
return [json.loads(line) for line in chunk.splitlines() if line]
|
|
61
|
+
|
|
62
|
+
def save(records: list[dict]) -> None:
|
|
63
|
+
... # write to db, file, etc.
|
|
64
|
+
|
|
65
|
+
pbz2.process_parallel(
|
|
66
|
+
"data.json.bz2",
|
|
67
|
+
worker_fn=parse_chunk,
|
|
68
|
+
on_result=save,
|
|
69
|
+
num_processes=8,
|
|
70
|
+
)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### CLI
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pbz2 count data.json.bz2
|
|
77
|
+
pbz2 head data.json.bz2 -n 5
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## API
|
|
81
|
+
|
|
82
|
+
| Function | Description |
|
|
83
|
+
| --- | --- |
|
|
84
|
+
| `iter_chunks(path, **opts)` | Yield UTF-8 text chunks ending on a newline boundary. |
|
|
85
|
+
| `iter_lines(path, **opts)` | Yield non-empty UTF-8 lines (no trailing newline). |
|
|
86
|
+
| `iter_jsonl(path, *, loads=None, **opts)` | Yield parsed JSON objects (uses `orjson` if installed). |
|
|
87
|
+
| `process_parallel(path, worker_fn, *, on_result=None, worker_args=(), num_processes=None, max_pending=None, ...)` | Run `worker_fn(chunk, *worker_args)` in a process pool, dispatching results to `on_result`. |
|
|
88
|
+
| `open_decompress(path, **opts)` | Low-level: open a binary stream of decompressed bytes. |
|
|
89
|
+
|
|
90
|
+
### Common options
|
|
91
|
+
|
|
92
|
+
- `num_processors` — pbzip2 worker count (default: cpu_count - 1)
|
|
93
|
+
- `bufsize_mb` — size of Python's read buffer on the pbzip2 output pipe, passed to `subprocess.Popen` as `bufsize` (default: 32 MB)
|
|
94
|
+
- `stream_buffer_mb` — Python-side read chunk size (default: 4 MB)
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
pbz2/__init__.py,sha256=iUiLFKg8eYEHID2xQ83-X9WygPjnr_im9HgIppET3o8,295
|
|
2
|
+
pbz2/cli.py,sha256=_cqQkeDAdnOLGGdcE5uN3MeudSVenrfDixwWggqg7Pw,597
|
|
3
|
+
pbz2/parallel.py,sha256=kLJUr1nWET-ol6_spyXnq9T4AHVIoAvGrh4wzO3Vk3s,2521
|
|
4
|
+
pbz2/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
pbz2/reader.py,sha256=VmLSnUJFiYzJvEFCFgiPI6VFrxBnh_kf9iqu0qRfDlg,3876
|
|
6
|
+
pbz2-0.1.0.dist-info/METADATA,sha256=0-ChnaBE5GNZSaBEPAEvsEaBLFRahwgEcV1dTi7hMlg,2686
|
|
7
|
+
pbz2-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
8
|
+
pbz2-0.1.0.dist-info/entry_points.txt,sha256=ov0ld90XMBPEvkBaCVnPD0lJgM278YBjZ9hvuutRL-Y,38
|
|
9
|
+
pbz2-0.1.0.dist-info/licenses/LICENSE,sha256=a1bWEq8sJGCTdJ_BtJVrfK44EUtx2ToP2sLCcKl7xkU,1077
|
|
10
|
+
pbz2-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Ronald E. Robertson
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|