ducl 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ducl-0.1.0/.gitignore ADDED
@@ -0,0 +1,12 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ .eggs/
7
+ *.feather
8
+ *.parquet
9
+ src/ducl/pwalk2/pwalk2
10
+ !src/ducl/pwalk2/pwalk2.c
11
+ .pytest_cache/
12
+ .venv/
ducl-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.4
2
+ Name: ducl
3
+ Version: 0.1.0
4
+ Summary: Disk Usage Command Line toolkit — scan, dashboard, query
5
+ License-Expression: MIT
6
+ Requires-Python: >=3.10
7
+ Requires-Dist: click>=8.0
8
+ Requires-Dist: numpy>=1.24
9
+ Requires-Dist: polars>=1.0
10
+ Requires-Dist: pyarrow>=14.0
11
+ Provides-Extra: dev
12
+ Requires-Dist: boto3>=1.28; extra == 'dev'
13
+ Requires-Dist: pytest>=7.0; extra == 'dev'
14
+ Provides-Extra: s3
15
+ Requires-Dist: boto3>=1.28; extra == 's3'
ducl-0.1.0/README.md ADDED
@@ -0,0 +1,47 @@
1
+ # ducl — Disk Usage Command Line toolkit
2
+
3
+ Scan filesystems (or S3 buckets), build interactive dashboards, and query scan data — all from one CLI.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install ducl # from PyPI
9
+ pip install "ducl[s3]"     # with S3 support (boto3)
10
+ pip install -e ".[dev]" # development (editable + pytest)
11
+ ```
12
+
13
+ On Linux the bundled **pwalk2** C binary is compiled automatically at install time.
14
+
15
+ ## Usage
16
+
17
+ ```bash
18
+ # Scan a filesystem → Feather file + dashboard
19
+ ducl scan /mnt/data -o scan.feather
20
+
21
+ # Scan an S3 bucket
22
+ ducl scan my-bucket -o bucket.feather --s3
23
+
24
+ # Build dashboard from existing Feather
25
+ ducl dashboard scan.feather ./output/
26
+
27
+ # Incremental update after rescanning a subtree
28
+ ducl update ./output/ subtree.feather
29
+
30
+ # Query scan data
31
+ ducl query scan.feather --under /mnt/data/models/ --ext wav --top 10
32
+
33
+ # Run bundled pwalk2 directly
34
+ ducl pwalk2 /mnt/data --threads 64
35
+ ```
36
+
37
+ ## Development
38
+
39
+ ```bash
40
+ cd v3
41
+ pip install -e ".[dev]"
42
+ pytest -v
43
+ ```
44
+
45
+ ## License
46
+
47
+ MIT
@@ -0,0 +1,21 @@
1
+ """Hatchling build hook -- compiles pwalk2 from bundled C sources."""
2
+
3
+ import os
4
+ import platform
5
+ import subprocess
6
+
7
+ from hatchling.builders.hooks.plugin.interface import BuildHookInterface
8
+
9
+
10
class CustomBuildHook(BuildHookInterface):
    """Compile the bundled pwalk2 C binary and ship it inside the wheel."""

    def initialize(self, version, build_data):
        # pwalk2 is Linux-only; on any other platform skip without error.
        if platform.system() != "Linux":
            return
        source_dir = os.path.join(self.root, "src", "ducl", "pwalk2")
        # Sources may be absent (e.g. a pruned sdist) -- nothing to build then.
        if not os.path.isdir(source_dir):
            return
        # Build via the bundled Makefile; a failure aborts the install loudly.
        subprocess.check_call(["make", "-C", source_dir, "pwalk2"])
        compiled = os.path.join(source_dir, "pwalk2")
        if os.path.isfile(compiled):
            # Map the on-disk binary to its wheel path ducl/pwalk2/pwalk2.
            includes = build_data.setdefault("force-include", {})
            includes[compiled] = "ducl/pwalk2/pwalk2"
@@ -0,0 +1,35 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "ducl"
7
+ version = "0.1.0"
8
+ description = "Disk Usage Command Line toolkit — scan, dashboard, query"
9
+ requires-python = ">=3.10"
10
+ license = "MIT"
11
+ dependencies = [
12
+ "click>=8.0",
13
+ "polars>=1.0",
14
+ "pyarrow>=14.0",
15
+ "numpy>=1.24",
16
+ ]
17
+
18
+ [project.optional-dependencies]
19
+ s3 = ["boto3>=1.28"]
20
+ dev = ["pytest>=7.0", "boto3>=1.28"]
21
+
22
+ [project.scripts]
23
+ ducl = "ducl.cli:cli"
24
+
25
+ [tool.hatch.build.targets.wheel]
26
+ packages = ["src/ducl"]
27
+
28
+ [tool.hatch.build.targets.wheel.hooks.custom]
29
+ path = "hatch_build.py"
30
+
31
+ [tool.hatch.build.targets.sdist]
32
+ include = ["src/ducl/**", "tests/**", "pyproject.toml", "README.md", "hatch_build.py"]
33
+
34
+ [tool.pytest.ini_options]
35
+ testpaths = ["tests"]
@@ -0,0 +1,3 @@
1
"""ducl — Disk Usage Command Line toolkit."""

# Keep in sync with [project].version in pyproject.toml.
__version__ = "0.1.0"
@@ -0,0 +1,88 @@
1
+ """Batch processing for aggregation sidecars."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ import polars as pl
7
+ import pyarrow as pa
8
+
9
+ from .schema import AGG_GROUP_COLS, EXAMPLES_TOP_K, HIST_EDGES
10
+ from .ext import normalize_ext_expr
11
+
12
+
13
def process_batch(batch: pa.RecordBatch) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Process a single pyarrow RecordBatch into agg and example DataFrames.

    Returns (agg_df, examples_df): per-group file counts/byte totals, and
    the EXAMPLES_TOP_K largest files for each (dir_path, size_bin) cell.
    """
    frame = pl.from_arrow(batch)

    # Keep files only: file rows are flagged with child_count == -1.
    frame = frame.filter(pl.col("child_count") == -1)
    if frame.height == 0:
        # Preserve the downstream schema even for an all-directory batch.
        empty_agg = pl.DataFrame(schema={
            "dir_path": pl.Utf8, "ext": pl.Utf8, "leaf_folder": pl.Utf8,
            "size_bin": pl.Int8, "n_components": pl.Int32,
            "file_count": pl.Int64, "total_size": pl.Int64,
        })
        empty_examples = pl.DataFrame(schema={
            "dir_path": pl.Utf8, "size_bin": pl.Int8, "filename": pl.Utf8,
            "ext": pl.Utf8, "size": pl.Int64,
        })
        return empty_agg, empty_examples

    # Derive path components, then normalized extension and path depth.
    frame = (
        frame.select("path", "ext", "size")
        .with_columns(
            pl.col("path").str.replace(r"/[^/]+$", "").alias("dir_path"),
            pl.col("path").str.extract(r"([^/]+)/[^/]+$").alias("leaf_folder"),
            pl.col("path").str.extract(r"([^/]+)$").alias("filename"),
        )
        .with_columns(
            normalize_ext_expr(),
            (pl.col("dir_path").str.count_matches("/") + 2).cast(pl.Int32).alias("n_components"),
        )
    )

    # Bucket file sizes into histogram bins with numpy.
    size_values = frame["size"].to_numpy().astype(float)
    bin_ids = np.digitize(size_values, HIST_EDGES).astype(np.int8)
    frame = frame.with_columns(pl.lit(pl.Series("size_bin", bin_ids)).alias("size_bin"))

    # Aggregate: file count and total bytes per AGG_GROUP_COLS group.
    agg_df = frame.group_by(AGG_GROUP_COLS).agg(
        pl.len().cast(pl.Int64).alias("file_count"),
        pl.col("size").sum().alias("total_size"),
    )

    # Examples: largest EXAMPLES_TOP_K files within each (dir_path, size_bin).
    examples_df = (
        frame.sort("size", descending=True)
        .group_by(["dir_path", "size_bin"])
        .head(EXAMPLES_TOP_K)
        .select("dir_path", "size_bin", "filename", "ext", "size")
    )

    return agg_df, examples_df
67
+
68
+
69
def compact_agg(batches: list[pl.DataFrame]) -> pl.DataFrame:
    """Re-aggregate accumulated agg batches to bound memory."""
    merged = pl.concat(batches)
    # Summing file_count/total_size over the same group keys is idempotent,
    # so this can be applied repeatedly as batches accumulate.
    summed = merged.group_by(AGG_GROUP_COLS).agg(
        pl.col("file_count").sum().alias("file_count"),
        pl.col("total_size").sum().alias("total_size"),
    )
    return summed
79
+
80
+
81
def compact_examples(batches: list[pl.DataFrame]) -> pl.DataFrame:
    """Re-pick the top EXAMPLES_TOP_K from accumulated example batches."""
    merged = pl.concat(batches)
    # Sort globally by size, then keep the head of each group cell.
    ranked = merged.sort("size", descending=True)
    return ranked.group_by(["dir_path", "size_bin"]).head(EXAMPLES_TOP_K)
@@ -0,0 +1,113 @@
1
+ """ducl CLI — one entry point for disk-usage operations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ import click
10
+
11
+
12
# Root command group; subcommands register themselves via @cli.command() below.
@click.group()
@click.version_option(package_name="ducl")  # version read from installed package metadata
def cli():
    """Disk Usage Command Line toolkit."""
16
+
17
+
18
@cli.command()
@click.argument("path")
@click.option("-o", "--output", required=True, help="Output .feather file path")
@click.option("--s3", "use_s3", is_flag=True, help="Treat PATH as an S3 bucket name")
@click.option("--no-agg", is_flag=True, help="Skip sidecar aggregation")
@click.option("--no-dashboard", is_flag=True, help="Skip automatic dashboard build")
@click.option("--block-size", type=int, default=256 << 20, help="CSV read block size (default 256 MiB)")
@click.option("-w", "--workers", type=int, default=32, help="S3 parallel listing threads")
@click.option("--discover-depth", type=int, default=2, help="S3 prefix discovery depth")
@click.option("--endpoint-url", help="S3-compatible endpoint")
@click.option("--profile", help="AWS profile name")
@click.option("--region", help="AWS region")
def scan(path, output, use_s3, no_agg, no_dashboard, block_size, workers,
         discover_depth, endpoint_url, profile, region):
    """Scan a filesystem or S3 bucket into a Feather file."""
    do_agg = not no_agg

    # Scanner backends are imported lazily so the unused one never loads.
    if not use_s3:
        from .scan import scan_filesystem
        scan_filesystem(path, output, do_agg=do_agg, block_size=block_size)
    else:
        from .s3scan import scan_bucket
        scan_bucket(
            bucket=path,
            output=output,
            workers=workers,
            discover_depth=discover_depth,
            do_agg=do_agg,
            endpoint_url=endpoint_url,
            profile=profile,
            region=region,
        )

    # Unless suppressed (or the scan produced nothing), build a sibling
    # "<stem>_dashboard" directory next to the output file.
    if no_dashboard or not os.path.exists(output):
        return
    feather_path = Path(output)
    out_dir = feather_path.parent / f"{feather_path.stem}_dashboard"
    click.echo(f"Building dashboard in {out_dir} ...", err=True)
    from .dashboard import build
    build(output, out_dir)
56
+
57
+
58
@cli.command()
@click.argument("feather")
@click.argument("output_dir")
def dashboard(feather, output_dir):
    """Build a dashboard from a Feather scan file."""
    # Imported lazily so the CLI starts fast when this command is unused.
    from .dashboard import build
    build(feather, output_dir)
65
+
66
+
67
# Named "update" on the CLI; the function is update_cmd to avoid shadowing
# the .dashboard.update helper imported below.
@cli.command("update")
@click.argument("dashboard_dir")
@click.argument("subtree_feather")
def update_cmd(dashboard_dir, subtree_feather):
    """Incrementally update a dashboard after a subtree rescan."""
    # Imported lazily so the CLI starts fast when this command is unused.
    from .dashboard import update
    update(dashboard_dir, subtree_feather)
74
+
75
+
76
@cli.command()
@click.argument("feather")
@click.option("--under", help="Path prefix filter")
@click.option("--name", help="Exact match on any path component")
@click.option("--name-glob", help="Glob pattern on any path component")
@click.option("--ext", multiple=True, help="Extension filter (repeatable)")
@click.option("--min-size", help="Minimum file size (K/M/G/T suffixes)")
@click.option("--max-size", help="Maximum file size (K/M/G/T suffixes)")
@click.option("--path-contains", help="Substring match on full path")
@click.option("--dirs", is_flag=True, help="Directories only (default: files only)")
@click.option("--top", type=int, help="Show N largest entries")
@click.option("--delete", is_flag=True, help="Delete matching files from disk")
def query(feather, **kwargs):
    """Query a .feather scan file with du-style output."""
    from .query import run_query
    # click delivers the repeatable --ext as a tuple; pass None when unset.
    selected_exts = list(kwargs.pop("ext"))
    run_query(feather, ext=selected_exts if selected_exts else None, **kwargs)
93
+
94
+
95
@cli.command(
    "pwalk2",
    context_settings=dict(
        ignore_unknown_options=True,
        allow_extra_args=True,
    ),
)
@click.argument("args", nargs=-1, type=click.UNPROCESSED)
def pwalk2_cmd(args):
    """Run the bundled pwalk2 binary (all arguments forwarded)."""
    exe = Path(__file__).parent / "pwalk2" / "pwalk2"
    if exe.exists():
        # Replace the current process with pwalk2; never returns on success.
        os.execvp(str(exe), [str(exe), *args])
    click.echo(
        "Error: pwalk2 binary not found. "
        "pwalk2 requires Linux and is compiled at install time.",
        err=True,
    )
    sys.exit(1)