ducl 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ducl-0.1.0/.gitignore +12 -0
- ducl-0.1.0/PKG-INFO +15 -0
- ducl-0.1.0/README.md +47 -0
- ducl-0.1.0/hatch_build.py +21 -0
- ducl-0.1.0/pyproject.toml +35 -0
- ducl-0.1.0/src/ducl/__init__.py +3 -0
- ducl-0.1.0/src/ducl/agg.py +88 -0
- ducl-0.1.0/src/ducl/cli.py +113 -0
- ducl-0.1.0/src/ducl/dashboard.html +1129 -0
- ducl-0.1.0/src/ducl/dashboard.py +913 -0
- ducl-0.1.0/src/ducl/ext.py +20 -0
- ducl-0.1.0/src/ducl/fmt.py +21 -0
- ducl-0.1.0/src/ducl/pwalk2/Makefile +11 -0
- ducl-0.1.0/src/ducl/pwalk2/pw2_output.c +175 -0
- ducl-0.1.0/src/ducl/pwalk2/pw2_uring.h +236 -0
- ducl-0.1.0/src/ducl/pwalk2/pw2_worker.c +651 -0
- ducl-0.1.0/src/ducl/pwalk2/pwalk2.c +398 -0
- ducl-0.1.0/src/ducl/pwalk2/pwalk2.h +178 -0
- ducl-0.1.0/src/ducl/query.py +161 -0
- ducl-0.1.0/src/ducl/s3scan.py +448 -0
- ducl-0.1.0/src/ducl/scan.py +153 -0
- ducl-0.1.0/src/ducl/schema.py +81 -0
- ducl-0.1.0/tests/conftest.py +322 -0
- ducl-0.1.0/tests/test_agg.py +150 -0
- ducl-0.1.0/tests/test_build.py +347 -0
- ducl-0.1.0/tests/test_scan.py +101 -0
- ducl-0.1.0/tests/test_update.py +251 -0
ducl-0.1.0/.gitignore
ADDED
ducl-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ducl
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Disk Usage Command Line toolkit — scan, dashboard, query
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Requires-Dist: click>=8.0
|
|
8
|
+
Requires-Dist: numpy>=1.24
|
|
9
|
+
Requires-Dist: polars>=1.0
|
|
10
|
+
Requires-Dist: pyarrow>=14.0
|
|
11
|
+
Provides-Extra: dev
|
|
12
|
+
Requires-Dist: boto3>=1.28; extra == 'dev'
|
|
13
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
14
|
+
Provides-Extra: s3
|
|
15
|
+
Requires-Dist: boto3>=1.28; extra == 's3'
|
ducl-0.1.0/README.md
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# ducl — Disk Usage Command Line toolkit
|
|
2
|
+
|
|
3
|
+
Scan filesystems (or S3 buckets), build interactive dashboards, and query scan data — all from one CLI.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install ducl # from PyPI
|
|
9
|
+
pip install "ducl[s3]" # with S3 support (boto3)
|
|
10
|
+
pip install -e ".[dev]" # development (editable + pytest)
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
On Linux the bundled **pwalk2** C binary is compiled automatically at install time.
|
|
14
|
+
|
|
15
|
+
## Usage
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
# Scan a filesystem → Feather file + dashboard
|
|
19
|
+
ducl scan /mnt/data -o scan.feather
|
|
20
|
+
|
|
21
|
+
# Scan an S3 bucket
|
|
22
|
+
ducl scan my-bucket -o bucket.feather --s3
|
|
23
|
+
|
|
24
|
+
# Build dashboard from existing Feather
|
|
25
|
+
ducl dashboard scan.feather ./output/
|
|
26
|
+
|
|
27
|
+
# Incremental update after rescanning a subtree
|
|
28
|
+
ducl update ./output/ subtree.feather
|
|
29
|
+
|
|
30
|
+
# Query scan data
|
|
31
|
+
ducl query scan.feather --under /mnt/data/models/ --ext wav --top 10
|
|
32
|
+
|
|
33
|
+
# Run bundled pwalk2 directly
|
|
34
|
+
ducl pwalk2 /mnt/data --threads 64
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Development
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
cd v3
|
|
41
|
+
pip install -e ".[dev]"
|
|
42
|
+
pytest -v
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## License
|
|
46
|
+
|
|
47
|
+
MIT
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Hatchling build hook -- compiles pwalk2 from bundled C sources."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import platform
|
|
5
|
+
import subprocess
|
|
6
|
+
|
|
7
|
+
from hatchling.builders.hooks.plugin.interface import BuildHookInterface
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class CustomBuildHook(BuildHookInterface):
    """Compile the bundled pwalk2 C binary and include it in the wheel.

    Runs only on Linux (pwalk2 uses io_uring per the bundled sources); on
    other platforms the hook is a no-op.  If the C toolchain is missing,
    the build is skipped with a warning rather than failing the whole
    install — the ``ducl pwalk2`` CLI command already reports a missing
    binary gracefully at runtime.
    """

    def initialize(self, version, build_data):
        if platform.system() != "Linux":
            return  # pwalk2 is Linux-only; skip silently
        pw2_dir = os.path.join(self.root, "src", "ducl", "pwalk2")
        if not os.path.isdir(pw2_dir):
            return  # C sources not present in this tree; nothing to build
        try:
            subprocess.check_call(["make", "-C", pw2_dir, "pwalk2"])
        except (OSError, subprocess.CalledProcessError) as exc:
            # No make/compiler available (or the build failed): warn and
            # continue so `pip install` still succeeds without the binary.
            print(f"warning: skipping pwalk2 build: {exc}")
            return
        # Ship the compiled binary inside the wheel next to the package.
        binary = os.path.join(pw2_dir, "pwalk2")
        if os.path.isfile(binary):
            build_data.setdefault("force-include", {})[binary] = "ducl/pwalk2/pwalk2"
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "ducl"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Disk Usage Command Line toolkit — scan, dashboard, query"
|
|
9
|
+
requires-python = ">=3.10"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"click>=8.0",
|
|
13
|
+
"polars>=1.0",
|
|
14
|
+
"pyarrow>=14.0",
|
|
15
|
+
"numpy>=1.24",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.optional-dependencies]
|
|
19
|
+
s3 = ["boto3>=1.28"]
|
|
20
|
+
dev = ["pytest>=7.0", "boto3>=1.28"]
|
|
21
|
+
|
|
22
|
+
[project.scripts]
|
|
23
|
+
ducl = "ducl.cli:cli"
|
|
24
|
+
|
|
25
|
+
[tool.hatch.build.targets.wheel]
|
|
26
|
+
packages = ["src/ducl"]
|
|
27
|
+
|
|
28
|
+
[tool.hatch.build.targets.wheel.hooks.custom]
|
|
29
|
+
path = "hatch_build.py"
|
|
30
|
+
|
|
31
|
+
[tool.hatch.build.targets.sdist]
|
|
32
|
+
include = ["src/ducl/**", "tests/**", "pyproject.toml", "README.md", "hatch_build.py"]
|
|
33
|
+
|
|
34
|
+
[tool.pytest.ini_options]
|
|
35
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Batch processing for aggregation sidecars."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import polars as pl
|
|
7
|
+
import pyarrow as pa
|
|
8
|
+
|
|
9
|
+
from .schema import AGG_GROUP_COLS, EXAMPLES_TOP_K, HIST_EDGES
|
|
10
|
+
from .ext import normalize_ext_expr
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def process_batch(batch: pa.RecordBatch) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Process a single pyarrow RecordBatch into agg and example DataFrames.

    The batch is reduced to file rows only, enriched with path-derived
    columns and a histogram size bin, then summarized two ways:
    an aggregate frame grouped by AGG_GROUP_COLS, and an "examples" frame
    holding the largest files per (dir_path, size_bin) bucket.

    Returns (agg_df, examples_df).
    """
    df = pl.from_arrow(batch)

    # Filter to files only
    # NOTE(review): child_count == -1 appears to be the scan schema's marker
    # for "this row is a file, not a directory" — confirm against schema.py.
    df = df.filter(pl.col("child_count") == -1)
    if df.height == 0:
        # No file rows in this batch: return empty frames with the exact
        # schemas downstream compaction expects, so pl.concat stays valid.
        agg_empty = pl.DataFrame(schema={
            "dir_path": pl.Utf8, "ext": pl.Utf8, "leaf_folder": pl.Utf8,
            "size_bin": pl.Int8, "n_components": pl.Int32,
            "file_count": pl.Int64, "total_size": pl.Int64,
        })
        ex_empty = pl.DataFrame(schema={
            "dir_path": pl.Utf8, "size_bin": pl.Int8, "filename": pl.Utf8,
            "ext": pl.Utf8, "size": pl.Int64,
        })
        return agg_empty, ex_empty

    # Compute derived columns
    # dir_path: strip the final "/<name>" component; leaf_folder: the last
    # directory component; filename: everything after the final slash.
    df = df.select("path", "ext", "size").with_columns(
        pl.col("path").str.replace(r"/[^/]+$", "").alias("dir_path"),
        pl.col("path").str.extract(r"([^/]+)/[^/]+$").alias("leaf_folder"),
        pl.col("path").str.extract(r"([^/]+)$").alias("filename"),
    ).with_columns(
        normalize_ext_expr(),
        # Path depth: slash count of dir_path plus 2 (dir itself + filename).
        (pl.col("dir_path").str.count_matches("/") + 2).cast(pl.Int32).alias("n_components"),
    )

    # Assign size bins
    # np.digitize maps each size to its histogram bucket index over
    # HIST_EDGES; Int8 suffices since the edge list is small.
    sizes_np = df["size"].to_numpy().astype(float)
    bins = np.digitize(sizes_np, HIST_EDGES).astype(np.int8)
    df = df.with_columns(pl.lit(pl.Series("size_bin", bins)).alias("size_bin"))

    # Agg: group by (dir_path, ext, leaf_folder, size_bin, n_components)
    agg_df = (
        df.group_by(AGG_GROUP_COLS)
        .agg(
            pl.len().cast(pl.Int64).alias("file_count"),
            pl.col("size").sum().alias("total_size"),
        )
    )

    # Examples: top-3 largest files per (dir_path, size_bin)
    # Sort first, then take the first EXAMPLES_TOP_K rows of each group —
    # GroupBy.head keeps the pre-group row order, so these are the largest.
    examples_df = (
        df.sort("size", descending=True)
        .group_by(["dir_path", "size_bin"])
        .head(EXAMPLES_TOP_K)
        .select("dir_path", "size_bin", "filename", "ext", "size")
    )

    return agg_df, examples_df
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def compact_agg(batches: list[pl.DataFrame]) -> pl.DataFrame:
    """Fold accumulated agg batches into one frame, re-summing the metrics.

    Called periodically so in-memory sidecar state stays bounded: duplicate
    group keys across batches collapse into a single row.
    """
    metric_sums = [
        pl.col("file_count").sum().alias("file_count"),
        pl.col("total_size").sum().alias("total_size"),
    ]
    return pl.concat(batches).group_by(AGG_GROUP_COLS).agg(*metric_sums)
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def compact_examples(batches: list[pl.DataFrame]) -> pl.DataFrame:
    """Merge example batches, keeping only the largest EXAMPLES_TOP_K
    files per (dir_path, size_bin) bucket."""
    merged = pl.concat(batches).sort("size", descending=True)
    return merged.group_by(["dir_path", "size_bin"]).head(EXAMPLES_TOP_K)
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""ducl CLI — one entry point for disk-usage operations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
import click
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Root Click group: every subcommand below registers onto this entry point,
# which pyproject.toml exposes as the `ducl` console script.
@click.group()
@click.version_option(package_name="ducl")  # version read from installed package metadata
def cli():
    """Disk Usage Command Line toolkit."""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@cli.command()
@click.argument("path")
@click.option("-o", "--output", required=True, help="Output .feather file path")
@click.option("--s3", "use_s3", is_flag=True, help="Treat PATH as an S3 bucket name")
@click.option("--no-agg", is_flag=True, help="Skip sidecar aggregation")
@click.option("--no-dashboard", is_flag=True, help="Skip automatic dashboard build")
@click.option("--block-size", type=int, default=256 << 20, help="CSV read block size (default 256 MiB)")
@click.option("-w", "--workers", type=int, default=32, help="S3 parallel listing threads")
@click.option("--discover-depth", type=int, default=2, help="S3 prefix discovery depth")
@click.option("--endpoint-url", help="S3-compatible endpoint")
@click.option("--profile", help="AWS profile name")
@click.option("--region", help="AWS region")
def scan(path, output, use_s3, no_agg, no_dashboard, block_size, workers,
         discover_depth, endpoint_url, profile, region):
    """Scan a filesystem or S3 bucket into a Feather file."""
    do_agg = not no_agg

    # Dispatch to the appropriate backend; imports are deferred so the CLI
    # starts without loading the scanning stacks.
    if use_s3:
        from .s3scan import scan_bucket
        s3_kwargs = dict(
            bucket=path,
            output=output,
            workers=workers,
            discover_depth=discover_depth,
            do_agg=do_agg,
            endpoint_url=endpoint_url,
            profile=profile,
            region=region,
        )
        scan_bucket(**s3_kwargs)
    else:
        from .scan import scan_filesystem
        scan_filesystem(path, output, do_agg=do_agg, block_size=block_size)

    # Guard clause: skip the dashboard step when disabled or when the
    # backend produced no output file.
    if no_dashboard or not os.path.exists(output):
        return
    out_path = Path(output)
    out_dir = out_path.parent / f"{out_path.stem}_dashboard"
    click.echo(f"Building dashboard in {out_dir} ...", err=True)
    from .dashboard import build
    build(output, out_dir)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@cli.command()
@click.argument("feather")
@click.argument("output_dir")
def dashboard(feather, output_dir):
    """Build a dashboard from a Feather scan file."""
    # Local import — presumably to keep CLI startup light; it also avoids
    # this command name shadowing the .dashboard module at module scope.
    from .dashboard import build
    build(feather, output_dir)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# Registered under the name "update" explicitly; the function is named
# update_cmd so it does not collide with dashboard.update imported below.
@cli.command("update")
@click.argument("dashboard_dir")
@click.argument("subtree_feather")
def update_cmd(dashboard_dir, subtree_feather):
    """Incrementally update a dashboard after a subtree rescan."""
    from .dashboard import update
    update(dashboard_dir, subtree_feather)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@cli.command()
@click.argument("feather")
@click.option("--under", help="Path prefix filter")
@click.option("--name", help="Exact match on any path component")
@click.option("--name-glob", help="Glob pattern on any path component")
@click.option("--ext", multiple=True, help="Extension filter (repeatable)")
@click.option("--min-size", help="Minimum file size (K/M/G/T suffixes)")
@click.option("--max-size", help="Maximum file size (K/M/G/T suffixes)")
@click.option("--path-contains", help="Substring match on full path")
@click.option("--dirs", is_flag=True, help="Directories only (default: files only)")
@click.option("--top", type=int, help="Show N largest entries")
@click.option("--delete", is_flag=True, help="Delete matching files from disk")
def query(feather, **kwargs):
    """Query a .feather scan file with du-style output."""
    from .query import run_query
    # Click delivers the repeatable --ext option as a tuple; run_query
    # expects a list, or None when the option was not given.
    exts = kwargs.pop("ext")
    run_query(feather, ext=list(exts) if exts else None, **kwargs)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@cli.command(
    "pwalk2",
    context_settings=dict(
        ignore_unknown_options=True,
        allow_extra_args=True,
    ),
)
@click.argument("args", nargs=-1, type=click.UNPROCESSED)
def pwalk2_cmd(args):
    """Run the bundled pwalk2 binary (all arguments forwarded)."""
    exe = Path(__file__).parent / "pwalk2" / "pwalk2"
    if exe.exists():
        exe_str = str(exe)
        # exec replaces this Python process with pwalk2; it never returns
        # on success, so signals and exit codes pass straight through.
        os.execvp(exe_str, [exe_str, *args])
    click.echo(
        "Error: pwalk2 binary not found. "
        "pwalk2 requires Linux and is compiled at install time.",
        err=True,
    )
    sys.exit(1)
|