pbz2 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pbz2-0.1.0/.claude/settings.local.json +69 -0
- pbz2-0.1.0/.github/dependabot.yml +10 -0
- pbz2-0.1.0/.github/pull_request_template.md +5 -0
- pbz2-0.1.0/.github/workflows/publish.yml +31 -0
- pbz2-0.1.0/.github/workflows/test.yml +26 -0
- pbz2-0.1.0/.gitignore +13 -0
- pbz2-0.1.0/.pre-commit-config.yaml +16 -0
- pbz2-0.1.0/.python-version +1 -0
- pbz2-0.1.0/CHANGELOG.md +34 -0
- pbz2-0.1.0/CLAUDE.md +28 -0
- pbz2-0.1.0/LICENSE +22 -0
- pbz2-0.1.0/PKG-INFO +94 -0
- pbz2-0.1.0/README.md +81 -0
- pbz2-0.1.0/TODO.md +1 -0
- pbz2-0.1.0/docs/README.md +13 -0
- pbz2-0.1.0/docs/guides/.gitkeep +0 -0
- pbz2-0.1.0/docs/plans/.gitkeep +0 -0
- pbz2-0.1.0/pbz2/__init__.py +12 -0
- pbz2-0.1.0/pbz2/cli.py +27 -0
- pbz2-0.1.0/pbz2/parallel.py +77 -0
- pbz2-0.1.0/pbz2/py.typed +0 -0
- pbz2-0.1.0/pbz2/reader.py +136 -0
- pbz2-0.1.0/pyproject.toml +52 -0
- pbz2-0.1.0/tests/__init__.py +0 -0
- pbz2-0.1.0/tests/test_pbz2.py +76 -0
- pbz2-0.1.0/uv.lock +611 -0
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"Bash(date:*)",
|
|
5
|
+
"Bash(diff:*)",
|
|
6
|
+
"Bash(du:*)",
|
|
7
|
+
"Bash(file:*)",
|
|
8
|
+
"Bash(find:*)",
|
|
9
|
+
"Bash(gh api:*)",
|
|
10
|
+
"Bash(gh issue:*)",
|
|
11
|
+
"Bash(gh pr create:*)",
|
|
12
|
+
"Bash(gh pr diff:*)",
|
|
13
|
+
"Bash(gh pr list:*)",
|
|
14
|
+
"Bash(gh pr merge:*)",
|
|
15
|
+
"Bash(gh pr view:*)",
|
|
16
|
+
"Bash(gh repo:*)",
|
|
17
|
+
"Bash(git add:*)",
|
|
18
|
+
"Bash(git branch:*)",
|
|
19
|
+
"Bash(git checkout:*)",
|
|
20
|
+
"Bash(git commit:*)",
|
|
21
|
+
"Bash(git config:*)",
|
|
22
|
+
"Bash(git diff:*)",
|
|
23
|
+
"Bash(git fetch:*)",
|
|
24
|
+
"Bash(git log:*)",
|
|
25
|
+
"Bash(git merge:*)",
|
|
26
|
+
"Bash(git mv:*)",
|
|
27
|
+
"Bash(git pull:*)",
|
|
28
|
+
"Bash(git remote:*)",
|
|
29
|
+
"Bash(git show:*)",
|
|
30
|
+
"Bash(git stash:*)",
|
|
31
|
+
"Bash(git status:*)",
|
|
32
|
+
"Bash(git switch:*)",
|
|
33
|
+
"Bash(git tag:*)",
|
|
34
|
+
"Bash(grep:*)",
|
|
35
|
+
"Bash(head:*)",
|
|
36
|
+
"Bash(jq:*)",
|
|
37
|
+
"Bash(ls:*)",
|
|
38
|
+
"Bash(sqlite3:*)",
|
|
39
|
+
"Bash(stanza:*)",
|
|
40
|
+
"Bash(test:*)",
|
|
41
|
+
"Bash(tree:*)",
|
|
42
|
+
"Bash(uv add:*)",
|
|
43
|
+
"Bash(uv build:*)",
|
|
44
|
+
"Bash(uv lock:*)",
|
|
45
|
+
"Bash(uv pip:*)",
|
|
46
|
+
"Bash(uv remove:*)",
|
|
47
|
+
"Bash(uv run python:*)",
|
|
48
|
+
"Bash(uv run:*)",
|
|
49
|
+
"Bash(uv sync:*)",
|
|
50
|
+
"Bash(wc:*)",
|
|
51
|
+
"Bash(which:*)",
|
|
52
|
+
"Bash(xxd:*)",
|
|
53
|
+
"Edit(.claude/**)"
|
|
54
|
+
],
|
|
55
|
+
"deny": [
|
|
56
|
+
"Bash(git clean:*)",
|
|
57
|
+
"Bash(git push --force:*)",
|
|
58
|
+
"Bash(git reset --hard:*)",
|
|
59
|
+
"Bash(rm -rf:*)"
|
|
60
|
+
],
|
|
61
|
+
"ask": [
|
|
62
|
+
"Bash(git checkout .:*)",
|
|
63
|
+
"Bash(git push:*)",
|
|
64
|
+
"Bash(git rebase:*)",
|
|
65
|
+
"Bash(git restore:*)",
|
|
66
|
+
"Bash(rm:*)"
|
|
67
|
+
]
|
|
68
|
+
}
|
|
69
|
+
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
name: Publish
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags: ["v*"]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
build:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
steps:
|
|
11
|
+
- uses: actions/checkout@v6
|
|
12
|
+
- uses: astral-sh/setup-uv@v8.1.0
|
|
13
|
+
- run: uv build
|
|
14
|
+
- uses: actions/upload-artifact@v7
|
|
15
|
+
with:
|
|
16
|
+
name: dist
|
|
17
|
+
path: dist/
|
|
18
|
+
|
|
19
|
+
publish:
|
|
20
|
+
needs: build
|
|
21
|
+
runs-on: ubuntu-latest
|
|
22
|
+
environment: pypi
|
|
23
|
+
permissions:
|
|
24
|
+
id-token: write
|
|
25
|
+
contents: read
|
|
26
|
+
steps:
|
|
27
|
+
- uses: actions/download-artifact@v8
|
|
28
|
+
with:
|
|
29
|
+
name: dist
|
|
30
|
+
path: dist/
|
|
31
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
name: Tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [dev, main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [dev, main]
|
|
8
|
+
|
|
9
|
+
permissions:
|
|
10
|
+
contents: read
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
test:
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
strategy:
|
|
16
|
+
matrix:
|
|
17
|
+
python-version: ["3.11", "3.12", "3.13", "3.14"]
|
|
18
|
+
steps:
|
|
19
|
+
- uses: actions/checkout@v6
|
|
20
|
+
- uses: astral-sh/setup-uv@v8.1.0
|
|
21
|
+
- run: uv python install ${{ matrix.python-version }}
|
|
22
|
+
- run: uv sync --all-groups --python ${{ matrix.python-version }}
|
|
23
|
+
- run: uv run ruff check .
|
|
24
|
+
- run: uv run ruff format --check .
|
|
25
|
+
- run: uv run pyrefly check
|
|
26
|
+
- run: uv run pytest --cov --cov-report=term-missing
|
pbz2-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
3
|
+
rev: v0.15.5
|
|
4
|
+
hooks:
|
|
5
|
+
- id: ruff-format
|
|
6
|
+
- id: ruff
|
|
7
|
+
args: [--fix]
|
|
8
|
+
- repo: local
|
|
9
|
+
hooks:
|
|
10
|
+
- id: pyrefly-check
|
|
11
|
+
name: pyrefly check
|
|
12
|
+
entry: uv run pyrefly check
|
|
13
|
+
language: system
|
|
14
|
+
types_or: [python, pyi]
|
|
15
|
+
pass_filenames: false
|
|
16
|
+
require_serial: true
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.14
|
pbz2-0.1.0/CHANGELOG.md
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
### Changed
|
|
13
|
+
|
|
14
|
+
### Deprecated
|
|
15
|
+
|
|
16
|
+
### Removed
|
|
17
|
+
|
|
18
|
+
### Fixed
|
|
19
|
+
|
|
20
|
+
### Security
|
|
21
|
+
|
|
22
|
+
## [0.1.0] - 2026-05-10
|
|
23
|
+
|
|
24
|
+
Initial release.
|
|
25
|
+
|
|
26
|
+
### Added
|
|
27
|
+
|
|
28
|
+
- `open_decompress` — file-like reader that streams a `.bz2` file through `pbzip2 -dc`.
|
|
29
|
+
- `iter_chunks` — iterate over newline-aligned UTF-8 text chunks from a decompressed stream.
|
|
30
|
+
- `iter_lines` — iterate over decoded text lines from a decompressed stream.
|
|
31
|
+
- `iter_jsonl` — iterate over parsed JSON records from a decompressed JSONL stream, backed by `orjson`.
|
|
32
|
+
- `process_parallel` — fan out decompressed records across worker processes.
|
|
33
|
+
- `pbz2` CLI entry point (Typer-based) for streaming and processing `.bz2` files from the shell.
|
|
34
|
+
- Python 3.11–3.14 support.
|
pbz2-0.1.0/CLAUDE.md
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Claude Settings
|
|
2
|
+
|
|
3
|
+
This file provides guidance to [Claude Code](https://claude.ai/code).
|
|
4
|
+
|
|
5
|
+
## Package Structure
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
pbz2/
|
|
9
|
+
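├── cli.py # Typer CLI entry point
├── parallel.py # process_parallel (process-pool fan-out over chunks)
├── reader.py # open_decompress, iter_chunks, iter_lines, iter_jsonl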
|
|
10
|
+
└── __init__.py
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Development
|
|
14
|
+
|
|
15
|
+
- Install: `uv sync --all-groups`
|
|
16
|
+
- Tests: `uv run pytest`
|
|
17
|
+
- Linting: pre-commit hooks run ruff format + lint on commit
|
|
18
|
+
- Type checking: pre-commit hooks run pyrefly on commit
|
|
19
|
+
- CI: GitHub Actions runs lint + type check + test matrix (Python 3.11–3.14) on push/PR to dev/main
|
|
20
|
+
|
|
21
|
+
## Release Automation
|
|
22
|
+
|
|
23
|
+
Use [stanza](https://github.com/gitronald/stanza) for release workflows:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
stanza release [patch|minor|major|prerelease]
|
|
27
|
+
stanza init
|
|
28
|
+
```
|
pbz2-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Ronald E. Robertson
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
pbz2-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pbz2
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Stream and parallel-process .bz2 files via pbzip2.
|
|
5
|
+
Project-URL: repository, https://github.com/gitronald/pbz2
|
|
6
|
+
Author-email: gitronald <gitronald@users.noreply.github.com>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Python: >=3.11
|
|
10
|
+
Requires-Dist: orjson>=3.11.9
|
|
11
|
+
Requires-Dist: typer
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# pbz2
|
|
15
|
+
|
|
16
|
+
Stream and parallel-process `.bz2` files via [pbzip2](http://compression.great-site.net/pbzip2/) (parallel bzip2). Falls back to the stdlib `bz2` module when the `pbzip2` binary is unavailable.
|
|
17
|
+
|
|
18
|
+
## Install
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
uv add pbz2
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Install `pbzip2` for parallel decompression:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
sudo apt install pbzip2 # Debian/Ubuntu
|
|
28
|
+
brew install pbzip2 # macOS
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Usage
|
|
32
|
+
|
|
33
|
+
### Iterate
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
import pbz2
|
|
37
|
+
|
|
38
|
+
# Parsed JSON objects from a .json.bz2 file
|
|
39
|
+
for obj in pbz2.iter_jsonl("data.json.bz2"):
|
|
40
|
+
...
|
|
41
|
+
|
|
42
|
+
# Raw UTF-8 lines
|
|
43
|
+
for line in pbz2.iter_lines("data.txt.bz2"):
|
|
44
|
+
...
|
|
45
|
+
|
|
46
|
+
# Newline-aligned text chunks (useful for batched processing)
|
|
47
|
+
for chunk in pbz2.iter_chunks("data.txt.bz2"):
|
|
48
|
+
...
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### Parallel processing
|
|
52
|
+
|
|
53
|
+
`process_parallel` streams chunks of newline-terminated records through a worker pool. The worker function receives raw text chunks (so parsing happens in the worker, not the main process), and `on_result` runs in the main process to handle each result as it completes.
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
import json
|
|
57
|
+
import pbz2
|
|
58
|
+
|
|
59
|
+
def parse_chunk(chunk: str) -> list[dict]:
|
|
60
|
+
return [json.loads(line) for line in chunk.splitlines() if line]
|
|
61
|
+
|
|
62
|
+
def save(records: list[dict]) -> None:
|
|
63
|
+
... # write to db, file, etc.
|
|
64
|
+
|
|
65
|
+
pbz2.process_parallel(
|
|
66
|
+
"data.json.bz2",
|
|
67
|
+
worker_fn=parse_chunk,
|
|
68
|
+
on_result=save,
|
|
69
|
+
num_processes=8,
|
|
70
|
+
)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### CLI
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pbz2 count data.json.bz2
|
|
77
|
+
pbz2 head data.json.bz2 -n 5
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## API
|
|
81
|
+
|
|
82
|
+
| Function | Description |
|
|
83
|
+
| --- | --- |
|
|
84
|
+
| `iter_chunks(path, **opts)` | Yield UTF-8 text chunks ending on a newline boundary. |
|
|
85
|
+
| `iter_lines(path, **opts)` | Yield non-empty UTF-8 lines (no trailing newline). |
|
|
86
|
+
| `iter_jsonl(path, *, loads=None, **opts)` | Yield parsed JSON objects (uses `orjson` if installed). |
|
|
87
|
+
| `process_parallel(path, worker_fn, *, on_result=None, worker_args=(), num_processes=None, max_pending=None, ...)` | Run `worker_fn(chunk, *worker_args)` in a process pool, dispatching results to `on_result`. |
|
|
88
|
+
| `open_decompress(path, **opts)` | Low-level: open a binary stream of decompressed bytes. |
|
|
89
|
+
|
|
90
|
+
### Common options
|
|
91
|
+
|
|
92
|
+
- `num_processors` — pbzip2 worker count (default: cpu_count - 1)
|
|
93
|
+
- `bufsize_mb` — OS pipe buffer between pbzip2 and Python (default: 32 MB)
|
|
94
|
+
- `stream_buffer_mb` — Python-side read chunk size (default: 4 MB)
|
pbz2-0.1.0/README.md
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# pbz2
|
|
2
|
+
|
|
3
|
+
Stream and parallel-process `.bz2` files via [pbzip2](http://compression.great-site.net/pbzip2/) (parallel bzip2). Falls back to the stdlib `bz2` module when the `pbzip2` binary is unavailable.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
uv add pbz2
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Install `pbzip2` for parallel decompression:
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
sudo apt install pbzip2 # Debian/Ubuntu
|
|
15
|
+
brew install pbzip2 # macOS
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Usage
|
|
19
|
+
|
|
20
|
+
### Iterate
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
import pbz2
|
|
24
|
+
|
|
25
|
+
# Parsed JSON objects from a .json.bz2 file
|
|
26
|
+
for obj in pbz2.iter_jsonl("data.json.bz2"):
|
|
27
|
+
...
|
|
28
|
+
|
|
29
|
+
# Raw UTF-8 lines
|
|
30
|
+
for line in pbz2.iter_lines("data.txt.bz2"):
|
|
31
|
+
...
|
|
32
|
+
|
|
33
|
+
# Newline-aligned text chunks (useful for batched processing)
|
|
34
|
+
for chunk in pbz2.iter_chunks("data.txt.bz2"):
|
|
35
|
+
...
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
### Parallel processing
|
|
39
|
+
|
|
40
|
+
`process_parallel` streams chunks of newline-terminated records through a worker pool. The worker function receives raw text chunks (so parsing happens in the worker, not the main process), and `on_result` runs in the main process to handle each result as it completes.
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
import json
|
|
44
|
+
import pbz2
|
|
45
|
+
|
|
46
|
+
def parse_chunk(chunk: str) -> list[dict]:
|
|
47
|
+
return [json.loads(line) for line in chunk.splitlines() if line]
|
|
48
|
+
|
|
49
|
+
def save(records: list[dict]) -> None:
|
|
50
|
+
... # write to db, file, etc.
|
|
51
|
+
|
|
52
|
+
pbz2.process_parallel(
|
|
53
|
+
"data.json.bz2",
|
|
54
|
+
worker_fn=parse_chunk,
|
|
55
|
+
on_result=save,
|
|
56
|
+
num_processes=8,
|
|
57
|
+
)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### CLI
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
pbz2 count data.json.bz2
|
|
64
|
+
pbz2 head data.json.bz2 -n 5
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## API
|
|
68
|
+
|
|
69
|
+
| Function | Description |
|
|
70
|
+
| --- | --- |
|
|
71
|
+
| `iter_chunks(path, **opts)` | Yield UTF-8 text chunks ending on a newline boundary. |
|
|
72
|
+
| `iter_lines(path, **opts)` | Yield non-empty UTF-8 lines (no trailing newline). |
|
|
73
|
+
| `iter_jsonl(path, *, loads=None, **opts)` | Yield parsed JSON objects (uses `orjson` if installed). |
|
|
74
|
+
| `process_parallel(path, worker_fn, *, on_result=None, worker_args=(), num_processes=None, max_pending=None, ...)` | Run `worker_fn(chunk, *worker_args)` in a process pool, dispatching results to `on_result`. |
|
|
75
|
+
| `open_decompress(path, **opts)` | Low-level: open a binary stream of decompressed bytes. |
|
|
76
|
+
|
|
77
|
+
### Common options
|
|
78
|
+
|
|
79
|
+
- `num_processors` — pbzip2 worker count (default: cpu_count - 1)
|
|
80
|
+
- `bufsize_mb` — OS pipe buffer between pbzip2 and Python (default: 32 MB)
|
|
81
|
+
- `stream_buffer_mb` — Python-side read chunk size (default: 4 MB)
|
pbz2-0.1.0/TODO.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# TODO
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Documentation
|
|
2
|
+
|
|
3
|
+
## Plans
|
|
4
|
+
|
|
5
|
+
Implementation plans for project features and improvements. Each plan has YAML frontmatter tracking status, branch, and timestamps. Plans are linked from `TODO.md` in the project root — open tasks reference their plan file, and completed tasks retain the link as a historical record.
|
|
6
|
+
|
|
7
|
+
See [`plans/`](plans/).
|
|
8
|
+
|
|
9
|
+
## Guides
|
|
10
|
+
|
|
11
|
+
Reference guides for project tasks or setup.
|
|
12
|
+
|
|
13
|
+
See [`guides/`](guides/).
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""pbz2: stream and parallel-process `.bz2` files via pbzip2."""
|
|
2
|
+
|
|
3
|
+
from .parallel import process_parallel
|
|
4
|
+
from .reader import iter_chunks, iter_jsonl, iter_lines, open_decompress
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"iter_chunks",
|
|
8
|
+
"iter_jsonl",
|
|
9
|
+
"iter_lines",
|
|
10
|
+
"open_decompress",
|
|
11
|
+
"process_parallel",
|
|
12
|
+
]
|
pbz2-0.1.0/pbz2/cli.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""pbz2 CLI."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import typer
|
|
8
|
+
|
|
9
|
+
from .reader import iter_lines
|
|
10
|
+
|
|
11
|
+
app = typer.Typer(help="Stream `.bz2` files via pbzip2.")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@app.command()
def count(path: Path) -> None:
    """Count lines in a `.bz2` file."""
    # The docstring above doubles as the command's help text in Typer, so it
    # is kept verbatim. The count is a tally of the items `iter_lines` yields.
    total = 0
    for _ in iter_lines(path):
        total += 1
    typer.echo(total)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@app.command()
def head(path: Path, n: int = typer.Option(10, "-n", help="Number of lines.")) -> None:
    """Print the first N lines of a `.bz2` file."""
    # The docstring above doubles as the command's help text in Typer, so it
    # is kept verbatim.
    for index, text in enumerate(iter_lines(path)):
        if index < n:
            typer.echo(text)
        else:
            # Limit reached (this also covers n <= 0): stop reading the stream.
            break
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""Parallel processing of `.bz2` streams via a process pool.
|
|
2
|
+
|
|
3
|
+
Streams chunks of newline-terminated records from a `.bz2` file through a
|
|
4
|
+
worker function in a `ProcessPoolExecutor`, dispatching each result to an
|
|
5
|
+
optional handler in the main process.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
import os
|
|
12
|
+
from collections.abc import Callable, Sequence
|
|
13
|
+
from concurrent.futures import FIRST_COMPLETED, ProcessPoolExecutor, wait
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
from .reader import (
|
|
17
|
+
DEFAULT_BUFSIZE_MB,
|
|
18
|
+
DEFAULT_STREAM_BUFFER_MB,
|
|
19
|
+
iter_chunks,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def process_parallel(
    path: str | os.PathLike,
    worker_fn: Callable[..., Any],
    *,
    on_result: Callable[[Any], None] | None = None,
    worker_args: Sequence[Any] = (),
    num_processes: int | None = None,
    max_pending: int | None = None,
    bufsize_mb: int = DEFAULT_BUFSIZE_MB,
    stream_buffer_mb: int = DEFAULT_STREAM_BUFFER_MB,
) -> None:
    """Process chunks of `path` in parallel.

    `worker_fn(chunk, *worker_args)` runs in worker processes; each chunk is a
    `str` of complete newline-terminated records. Results are dispatched to
    `on_result` in the main process as they complete. Whitespace-only chunks
    are skipped and never submitted.

    Args:
        path: The `.bz2` file to read; passed through to `iter_chunks`.
        worker_fn: Callable submitted to the process pool once per chunk.
        on_result: Optional callback invoked in the main process with each
            value returned by `worker_fn`. When omitted, results are
            discarded.
        worker_args: Extra positional arguments passed after the chunk.
        num_processes: Size of the process pool; the same value is forwarded
            to `iter_chunks` as `num_processors`. Defaults to half of
            `os.cpu_count()`, with a minimum of 1.
        max_pending: Maximum number of in-flight futures before the producer
            pauses, which bounds memory. Defaults to `2 * num_processes`.
            Values below 1 are treated as 1.
        bufsize_mb: Forwarded to `iter_chunks`.
        stream_buffer_mb: Forwarded to `iter_chunks`.

    Raises:
        Exception: Anything raised by `worker_fn` (re-raised when its future
            is collected) or by `on_result` propagates to the caller.
    """
    nproc = num_processes or max(1, (os.cpu_count() or 2) // 2)
    # Clamp to at least 1. With a cap of 0 (or less) the back-pressure loop
    # below could never terminate: `len([]) >= 0` is always true and draining
    # an empty list returns immediately, so the call would spin forever.
    cap = max(1, max_pending if max_pending is not None else nproc * 2)

    def drain_done(pending: list, *, block: bool = False) -> list:
        """Dispatch finished futures and return the ones still running."""
        if block and pending:
            # Sleep until at least one future finishes instead of polling.
            wait(pending, return_when=FIRST_COMPLETED)
        still_pending = []
        for fut in pending:
            if fut.done():
                # `result()` re-raises any exception from the worker.
                result = fut.result()
                if on_result is not None:
                    on_result(result)
            else:
                still_pending.append(fut)
        return still_pending

    with ProcessPoolExecutor(max_workers=nproc) as executor:
        pending: list = []
        for chunk in iter_chunks(
            path,
            num_processors=nproc,
            bufsize_mb=bufsize_mb,
            stream_buffer_mb=stream_buffer_mb,
        ):
            if not chunk.strip():
                continue
            pending.append(executor.submit(worker_fn, chunk, *worker_args))
            # Opportunistically collect whatever has already finished.
            pending = drain_done(pending)
            # Back-pressure: stop reading further chunks while the number of
            # in-flight futures is at the cap.
            while len(pending) >= cap:
                pending = drain_done(pending, block=True)

        # Input exhausted: collect the remaining results.
        while pending:
            pending = drain_done(pending, block=True)
|
pbz2-0.1.0/pbz2/py.typed
ADDED
|
File without changes
|