pbz2 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,69 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(date:*)",
5
+ "Bash(diff:*)",
6
+ "Bash(du:*)",
7
+ "Bash(file:*)",
8
+ "Bash(find:*)",
9
+ "Bash(gh api:*)",
10
+ "Bash(gh issue:*)",
11
+ "Bash(gh pr create:*)",
12
+ "Bash(gh pr diff:*)",
13
+ "Bash(gh pr list:*)",
14
+ "Bash(gh pr merge:*)",
15
+ "Bash(gh pr view:*)",
16
+ "Bash(gh repo:*)",
17
+ "Bash(git add:*)",
18
+ "Bash(git branch:*)",
19
+ "Bash(git checkout:*)",
20
+ "Bash(git commit:*)",
21
+ "Bash(git config:*)",
22
+ "Bash(git diff:*)",
23
+ "Bash(git fetch:*)",
24
+ "Bash(git log:*)",
25
+ "Bash(git merge:*)",
26
+ "Bash(git mv:*)",
27
+ "Bash(git pull:*)",
28
+ "Bash(git remote:*)",
29
+ "Bash(git show:*)",
30
+ "Bash(git stash:*)",
31
+ "Bash(git status:*)",
32
+ "Bash(git switch:*)",
33
+ "Bash(git tag:*)",
34
+ "Bash(grep:*)",
35
+ "Bash(head:*)",
36
+ "Bash(jq:*)",
37
+ "Bash(ls:*)",
38
+ "Bash(sqlite3:*)",
39
+ "Bash(stanza:*)",
40
+ "Bash(test:*)",
41
+ "Bash(tree:*)",
42
+ "Bash(uv add:*)",
43
+ "Bash(uv build:*)",
44
+ "Bash(uv lock:*)",
45
+ "Bash(uv pip:*)",
46
+ "Bash(uv remove:*)",
47
+ "Bash(uv run python:*)",
48
+ "Bash(uv run:*)",
49
+ "Bash(uv sync:*)",
50
+ "Bash(wc:*)",
51
+ "Bash(which:*)",
52
+ "Bash(xxd:*)",
53
+ "Edit(.claude/**)"
54
+ ],
55
+ "deny": [
56
+ "Bash(git clean:*)",
57
+ "Bash(git push --force:*)",
58
+ "Bash(git reset --hard:*)",
59
+ "Bash(rm -rf:*)"
60
+ ],
61
+ "ask": [
62
+ "Bash(git checkout .:*)",
63
+ "Bash(git push:*)",
64
+ "Bash(git rebase:*)",
65
+ "Bash(git restore:*)",
66
+ "Bash(rm:*)"
67
+ ]
68
+ }
69
+ }
@@ -0,0 +1,10 @@
1
+ version: 2
2
+ updates:
3
+ - package-ecosystem: github-actions
4
+ directory: /
5
+ schedule:
6
+ interval: weekly
7
+ - package-ecosystem: uv
8
+ directory: /
9
+ schedule:
10
+ interval: weekly
@@ -0,0 +1,5 @@
1
+ ## Summary
2
+ <!-- What does this PR do and why? -->
3
+
4
+ ## Test plan
5
+ <!-- How was this tested? -->
@@ -0,0 +1,31 @@
1
+ name: Publish
2
+
3
+ on:
4
+ push:
5
+ tags: ["v*"]
6
+
7
+ jobs:
8
+ build:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v6
12
+ - uses: astral-sh/setup-uv@v8.1.0
13
+ - run: uv build
14
+ - uses: actions/upload-artifact@v7
15
+ with:
16
+ name: dist
17
+ path: dist/
18
+
19
+ publish:
20
+ needs: build
21
+ runs-on: ubuntu-latest
22
+ environment: pypi
23
+ permissions:
24
+ id-token: write
25
+ contents: read
26
+ steps:
27
+ - uses: actions/download-artifact@v8
28
+ with:
29
+ name: dist
30
+ path: dist/
31
+ - uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,26 @@
1
+ name: Tests
2
+
3
+ on:
4
+ push:
5
+ branches: [dev, main]
6
+ pull_request:
7
+ branches: [dev, main]
8
+
9
+ permissions:
10
+ contents: read
11
+
12
+ jobs:
13
+ test:
14
+ runs-on: ubuntu-latest
15
+ strategy:
16
+ matrix:
17
+ python-version: ["3.11", "3.12", "3.13", "3.14"]
18
+ steps:
19
+ - uses: actions/checkout@v6
20
+ - uses: astral-sh/setup-uv@v8.1.0
21
+ - run: uv python install ${{ matrix.python-version }}
22
+ - run: uv sync --all-groups --python ${{ matrix.python-version }}
23
+ - run: uv run ruff check .
24
+ - run: uv run ruff format --check .
25
+ - run: uv run pyrefly check
26
+ - run: uv run pytest --cov --cov-report=term-missing
pbz2-0.1.0/.gitignore ADDED
@@ -0,0 +1,13 @@
1
+ .archive/
2
+ .pytest_cache/
3
+ .ruff_cache/
4
+ .venv/
5
+ __pycache__/
6
+
7
+ .DS_Store
8
+
9
+ .env
10
+ .env.*
11
+ !.env.example
12
+
13
+ !.claude/settings.local.json
@@ -0,0 +1,16 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.15.5
4
+ hooks:
5
+ - id: ruff-format
6
+ - id: ruff
7
+ args: [--fix]
8
+ - repo: local
9
+ hooks:
10
+ - id: pyrefly-check
11
+ name: pyrefly check
12
+ entry: uv run pyrefly check
13
+ language: system
14
+ types_or: [python, pyi]
15
+ pass_filenames: false
16
+ require_serial: true
@@ -0,0 +1 @@
1
+ 3.14
@@ -0,0 +1,34 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/).
7
+
8
+ ## [Unreleased]
9
+
10
+ ### Added
11
+
12
+ ### Changed
13
+
14
+ ### Deprecated
15
+
16
+ ### Removed
17
+
18
+ ### Fixed
19
+
20
+ ### Security
21
+
22
+ ## [0.1.0] - 2026-05-10
23
+
24
+ Initial release.
25
+
26
+ ### Added
27
+
28
+ - `open_decompress` — file-like reader that streams a `.bz2` file through `pbzip2 -dc`.
29
+ - `iter_chunks` — iterate over newline-aligned UTF-8 text chunks from a decompressed stream.
30
+ - `iter_lines` — iterate over decoded text lines from a decompressed stream.
31
+ - `iter_jsonl` — iterate over parsed JSON records from a decompressed JSONL stream, backed by `orjson`.
32
+ - `process_parallel` — fan out decompressed records across worker processes.
33
+ - `pbz2` CLI entry point (Typer-based) for streaming and processing `.bz2` files from the shell.
34
+ - Python 3.11–3.14 support.
pbz2-0.1.0/CLAUDE.md ADDED
@@ -0,0 +1,28 @@
1
+ # Claude Settings
2
+
3
+ This file provides guidance to [Claude Code](https://claude.ai/code).
4
+
5
+ ## Package Structure
6
+
7
+ ```
8
+ pbz2/
9
+ ├── cli.py # Typer CLI entry point
10
+ └── __init__.py
11
+ ```
12
+
13
+ ## Development
14
+
15
+ - Install: `uv sync --all-groups`
16
+ - Tests: `uv run pytest`
17
+ - Linting: pre-commit hooks run ruff format + lint on commit
18
+ - Type checking: pre-commit hooks run pyrefly on commit
19
+ - CI: GitHub Actions runs lint + type check + test matrix (Python 3.11–3.14) on push/PR to dev/main
20
+
21
+ ## Release Automation
22
+
23
+ Use [stanza](https://github.com/gitronald/stanza) for release workflows:
24
+
25
+ ```bash
26
+ stanza release [patch|minor|major|prerelease]
27
+ stanza init
28
+ ```
pbz2-0.1.0/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Ronald E. Robertson
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
pbz2-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,94 @@
1
+ Metadata-Version: 2.4
2
+ Name: pbz2
3
+ Version: 0.1.0
4
+ Summary: Stream and parallel-process .bz2 files via pbzip2.
5
+ Project-URL: repository, https://github.com/gitronald/pbz2
6
+ Author-email: gitronald <gitronald@users.noreply.github.com>
7
+ License-Expression: MIT
8
+ License-File: LICENSE
9
+ Requires-Python: >=3.11
10
+ Requires-Dist: orjson>=3.11.9
11
+ Requires-Dist: typer
12
+ Description-Content-Type: text/markdown
13
+
14
+ # pbz2
15
+
16
+ Stream and parallel-process `.bz2` files via [pbzip2](http://compression.great-site.net/pbzip2/) (parallel bzip2). Falls back to the stdlib `bz2` module when the `pbzip2` binary is unavailable.
17
+
18
+ ## Install
19
+
20
+ ```bash
21
+ uv add pbz2
22
+ ```
23
+
24
+ Install `pbzip2` for parallel decompression:
25
+
26
+ ```bash
27
+ sudo apt install pbzip2 # Debian/Ubuntu
28
+ brew install pbzip2 # macOS
29
+ ```
30
+
31
+ ## Usage
32
+
33
+ ### Iterate
34
+
35
+ ```python
36
+ import pbz2
37
+
38
+ # Parsed JSON objects from a .json.bz2 file
39
+ for obj in pbz2.iter_jsonl("data.json.bz2"):
40
+ ...
41
+
42
+ # Raw UTF-8 lines
43
+ for line in pbz2.iter_lines("data.txt.bz2"):
44
+ ...
45
+
46
+ # Newline-aligned text chunks (useful for batched processing)
47
+ for chunk in pbz2.iter_chunks("data.txt.bz2"):
48
+ ...
49
+ ```
50
+
51
+ ### Parallel processing
52
+
53
+ `process_parallel` streams chunks of newline-terminated records through a worker pool. The worker function receives raw text chunks (so parsing happens in the worker, not the main process), and `on_result` runs in the main process to handle each result as it completes.
54
+
55
+ ```python
56
+ import json
57
+ import pbz2
58
+
59
+ def parse_chunk(chunk: str) -> list[dict]:
60
+ return [json.loads(line) for line in chunk.splitlines() if line]
61
+
62
+ def save(records: list[dict]) -> None:
63
+ ... # write to db, file, etc.
64
+
65
+ pbz2.process_parallel(
66
+ "data.json.bz2",
67
+ worker_fn=parse_chunk,
68
+ on_result=save,
69
+ num_processes=8,
70
+ )
71
+ ```
72
+
73
+ ### CLI
74
+
75
+ ```bash
76
+ pbz2 count data.json.bz2
77
+ pbz2 head data.json.bz2 -n 5
78
+ ```
79
+
80
+ ## API
81
+
82
+ | Function | Description |
83
+ | --- | --- |
84
+ | `iter_chunks(path, **opts)` | Yield UTF-8 text chunks ending on a newline boundary. |
85
+ | `iter_lines(path, **opts)` | Yield non-empty UTF-8 lines (no trailing newline). |
86
+ | `iter_jsonl(path, *, loads=None, **opts)` | Yield parsed JSON objects (uses `orjson`, a required dependency). |
87
+ | `process_parallel(path, worker_fn, *, on_result=None, worker_args=(), num_processes=None, max_pending=None, ...)` | Run `worker_fn(chunk, *worker_args)` in a process pool, dispatching results to `on_result`. |
88
+ | `open_decompress(path, **opts)` | Low-level: open a binary stream of decompressed bytes. |
89
+
90
+ ### Common options
91
+
92
+ - `num_processors` — pbzip2 worker count (default: cpu_count - 1)
93
+ - `bufsize_mb` — OS pipe buffer between pbzip2 and Python (default: 32 MB)
94
+ - `stream_buffer_mb` — Python-side read chunk size (default: 4 MB)
pbz2-0.1.0/README.md ADDED
@@ -0,0 +1,81 @@
1
+ # pbz2
2
+
3
+ Stream and parallel-process `.bz2` files via [pbzip2](http://compression.great-site.net/pbzip2/) (parallel bzip2). Falls back to the stdlib `bz2` module when the `pbzip2` binary is unavailable.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ uv add pbz2
9
+ ```
10
+
11
+ Install `pbzip2` for parallel decompression:
12
+
13
+ ```bash
14
+ sudo apt install pbzip2 # Debian/Ubuntu
15
+ brew install pbzip2 # macOS
16
+ ```
17
+
18
+ ## Usage
19
+
20
+ ### Iterate
21
+
22
+ ```python
23
+ import pbz2
24
+
25
+ # Parsed JSON objects from a .json.bz2 file
26
+ for obj in pbz2.iter_jsonl("data.json.bz2"):
27
+ ...
28
+
29
+ # Raw UTF-8 lines
30
+ for line in pbz2.iter_lines("data.txt.bz2"):
31
+ ...
32
+
33
+ # Newline-aligned text chunks (useful for batched processing)
34
+ for chunk in pbz2.iter_chunks("data.txt.bz2"):
35
+ ...
36
+ ```
37
+
38
+ ### Parallel processing
39
+
40
+ `process_parallel` streams chunks of newline-terminated records through a worker pool. The worker function receives raw text chunks (so parsing happens in the worker, not the main process), and `on_result` runs in the main process to handle each result as it completes.
41
+
42
+ ```python
43
+ import json
44
+ import pbz2
45
+
46
+ def parse_chunk(chunk: str) -> list[dict]:
47
+ return [json.loads(line) for line in chunk.splitlines() if line]
48
+
49
+ def save(records: list[dict]) -> None:
50
+ ... # write to db, file, etc.
51
+
52
+ pbz2.process_parallel(
53
+ "data.json.bz2",
54
+ worker_fn=parse_chunk,
55
+ on_result=save,
56
+ num_processes=8,
57
+ )
58
+ ```
59
+
60
+ ### CLI
61
+
62
+ ```bash
63
+ pbz2 count data.json.bz2
64
+ pbz2 head data.json.bz2 -n 5
65
+ ```
66
+
67
+ ## API
68
+
69
+ | Function | Description |
70
+ | --- | --- |
71
+ | `iter_chunks(path, **opts)` | Yield UTF-8 text chunks ending on a newline boundary. |
72
+ | `iter_lines(path, **opts)` | Yield non-empty UTF-8 lines (no trailing newline). |
73
+ | `iter_jsonl(path, *, loads=None, **opts)` | Yield parsed JSON objects (uses `orjson`, a required dependency). |
74
+ | `process_parallel(path, worker_fn, *, on_result=None, worker_args=(), num_processes=None, max_pending=None, ...)` | Run `worker_fn(chunk, *worker_args)` in a process pool, dispatching results to `on_result`. |
75
+ | `open_decompress(path, **opts)` | Low-level: open a binary stream of decompressed bytes. |
76
+
77
+ ### Common options
78
+
79
+ - `num_processors` — pbzip2 worker count (default: cpu_count - 1)
80
+ - `bufsize_mb` — OS pipe buffer between pbzip2 and Python (default: 32 MB)
81
+ - `stream_buffer_mb` — Python-side read chunk size (default: 4 MB)
pbz2-0.1.0/TODO.md ADDED
@@ -0,0 +1 @@
1
+ # TODO
@@ -0,0 +1,13 @@
1
+ # Documentation
2
+
3
+ ## Plans
4
+
5
+ Implementation plans for project features and improvements. Each plan has YAML frontmatter tracking status, branch, and timestamps. Plans are linked from `TODO.md` in the project root — open tasks reference their plan file, and completed tasks retain the link as a historical record.
6
+
7
+ See [`plans/`](plans/).
8
+
9
+ ## Guides
10
+
11
+ Reference guides for project tasks or setup.
12
+
13
+ See [`guides/`](guides/).
File without changes
File without changes
@@ -0,0 +1,12 @@
1
+ """pbz2: stream and parallel-process `.bz2` files via pbzip2."""
2
+
3
+ from .parallel import process_parallel
4
+ from .reader import iter_chunks, iter_jsonl, iter_lines, open_decompress
5
+
6
+ __all__ = [
7
+ "iter_chunks",
8
+ "iter_jsonl",
9
+ "iter_lines",
10
+ "open_decompress",
11
+ "process_parallel",
12
+ ]
pbz2-0.1.0/pbz2/cli.py ADDED
@@ -0,0 +1,27 @@
1
+ """pbz2 CLI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ import typer
8
+
9
+ from .reader import iter_lines
10
+
11
+ app = typer.Typer(help="Stream `.bz2` files via pbzip2.")
12
+
13
+
14
@app.command()
def count(path: Path) -> None:
    """Count lines in a `.bz2` file."""
    # Tally while iterating; only the running counter is kept.
    total = 0
    for _line in iter_lines(path):
        total += 1
    typer.echo(total)
19
+
20
+
21
@app.command()
def head(path: Path, n: int = typer.Option(10, "-n", help="Number of lines.")) -> None:
    """Print the first N lines of a `.bz2` file."""
    emitted = 0
    for text in iter_lines(path):
        # Check the quota before printing so that n <= 0 prints nothing.
        if emitted >= n:
            break
        typer.echo(text)
        emitted += 1
@@ -0,0 +1,77 @@
1
+ """Parallel processing of `.bz2` streams via a process pool.
2
+
3
+ Streams chunks of newline-terminated records from a `.bz2` file through a
4
+ worker function in a `ProcessPoolExecutor`, dispatching each result to an
5
+ optional handler in the main process.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ import os
12
+ from collections.abc import Callable, Sequence
13
+ from concurrent.futures import FIRST_COMPLETED, ProcessPoolExecutor, wait
14
+ from typing import Any
15
+
16
+ from .reader import (
17
+ DEFAULT_BUFSIZE_MB,
18
+ DEFAULT_STREAM_BUFFER_MB,
19
+ iter_chunks,
20
+ )
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
def process_parallel(
    path: str | os.PathLike,
    worker_fn: Callable[..., Any],
    *,
    on_result: Callable[[Any], None] | None = None,
    worker_args: Sequence[Any] = (),
    num_processes: int | None = None,
    max_pending: int | None = None,
    bufsize_mb: int = DEFAULT_BUFSIZE_MB,
    stream_buffer_mb: int = DEFAULT_STREAM_BUFFER_MB,
) -> None:
    """Process chunks of `path` in parallel.

    `worker_fn(chunk, *worker_args)` runs in worker processes; each chunk is a
    `str` of complete newline-terminated records. Results are dispatched to
    `on_result` in the main process as they complete, which is not necessarily
    the order in which the chunks were submitted.

    The number of in-flight futures is always bounded: the producer pauses
    while `max_pending` futures (by default twice the pool size) are
    outstanding, which bounds memory.

    Args:
        path: The `.bz2` file to read; passed through to `iter_chunks`.
        worker_fn: Called in a worker process as
            `worker_fn(chunk, *worker_args)`.
        on_result: Optional handler invoked in the main process with each
            worker return value. When omitted, return values are discarded.
        worker_args: Extra positional arguments appended to every worker call.
        num_processes: Size of the process pool; also forwarded to
            `iter_chunks` as `num_processors`. Defaults to half the CPU count
            (at least 1).
        max_pending: Maximum number of in-flight futures before the producer
            blocks. Defaults to twice the pool size. Values below 1 are
            treated as 1.
        bufsize_mb: Forwarded unchanged to `iter_chunks`.
        stream_buffer_mb: Forwarded unchanged to `iter_chunks`.

    Raises:
        Exception: Anything raised by `worker_fn` (re-raised when its result
            is collected) or by `on_result` propagates to the caller.
    """
    nproc = num_processes or max(1, (os.cpu_count() or 2) // 2)
    requested_cap = max_pending if max_pending is not None else nproc * 2
    # A cap below 1 can never be satisfied: once every future had drained, the
    # back-pressure loop below would spin forever on an empty list.
    cap = max(1, requested_cap)

    def drain_done(pending: list, *, block: bool = False) -> list:
        """Dispatch finished futures and return the ones still running."""
        if block and pending:
            # Sleep until at least one future finishes instead of busy-polling.
            wait(pending, return_when=FIRST_COMPLETED)
        still_pending = []
        for fut in pending:
            if fut.done():
                # result() re-raises any exception raised inside the worker.
                result = fut.result()
                if on_result is not None:
                    on_result(result)
            else:
                still_pending.append(fut)
        return still_pending

    with ProcessPoolExecutor(max_workers=nproc) as executor:
        pending: list = []
        for chunk in iter_chunks(
            path,
            num_processors=nproc,
            bufsize_mb=bufsize_mb,
            stream_buffer_mb=stream_buffer_mb,
        ):
            # Whitespace-only chunks carry no records; don't spend a task on them.
            if not chunk.strip():
                continue
            pending.append(executor.submit(worker_fn, chunk, *worker_args))
            # Opportunistically collect anything already finished ...
            pending = drain_done(pending)
            # ... then apply back-pressure until we are under the cap again.
            while len(pending) >= cap:
                pending = drain_done(pending, block=True)

        # Input exhausted: wait for the remaining futures and dispatch them.
        while pending:
            pending = drain_done(pending, block=True)
File without changes