csvnorm-0.3.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
csvnorm/__init__.py ADDED
@@ -0,0 +1,8 @@
+ """csvnorm - Validate and normalize CSV files."""
+
+ __version__ = "0.3.3"
+ __all__ = ["normalize_csv", "detect_encoding", "process_csv"]
+
+ from csvnorm.core import process_csv
+ from csvnorm.encoding import detect_encoding
+ from csvnorm.validation import normalize_csv
csvnorm/__main__.py ADDED
@@ -0,0 +1,6 @@
+ """Entry point for python -m csvnorm."""
+
+ from csvnorm.cli import main
+
+ if __name__ == "__main__":
+     main()
csvnorm/cli.py ADDED
@@ -0,0 +1,150 @@
+ """Command-line interface for csvnorm."""
+
+ import argparse
+ import sys
+ from pathlib import Path
+
+ from rich.console import Console
+ from rich_argparse import RichHelpFormatter
+
+ from csvnorm import __version__
+ from csvnorm.core import process_csv
+ from csvnorm.utils import setup_logger
+
+ console = Console()
+
+
+ def show_banner() -> None:
+     """Show ASCII art banner."""
+     from pyfiglet import figlet_format
+     banner = figlet_format("csvnorm", font="slant")
+     console.print(banner, style="bold cyan")
+
+
+ class VersionAction(argparse.Action):
+     """Custom action to show banner with version."""
+
+     def __call__(self, parser, _namespace, _values, _option_string=None):
+         show_banner()
+         console.print(f"csvnorm {__version__}", style="bold")
+         console.print()
+         console.print("Validate and normalize CSV files for exploratory data analysis", style="dim")
+         console.print()
+         console.print("Author: aborruso", style="dim")
+         console.print("Repository: https://github.com/aborruso/csvnorm", style="dim cyan")
+         console.print("License: MIT", style="dim")
+         parser.exit()
+
+
+ def create_parser() -> argparse.ArgumentParser:
+     """Create and return the argument parser."""
+     parser = argparse.ArgumentParser(
+         prog="csvnorm",
+         description="Validate and normalize CSV files for exploratory data analysis",
+         formatter_class=RichHelpFormatter,
+         epilog="""\
+ Examples:
+ csvnorm data.csv -d ';' -o output_folder --force
+ csvnorm data.csv --keep-names --delimiter '\\t'
+ csvnorm data.csv -V
+ """,
+     )
+
+     parser.add_argument(
+         "input_file",
+         type=Path,
+         help="Input CSV file path",
+     )
+
+     parser.add_argument(
+         "-f",
+         "--force",
+         action="store_true",
+         help="Force overwrite of existing output files",
+     )
+
+     parser.add_argument(
+         "-k",
+         "--keep-names",
+         action="store_true",
+         help=(
+             "Keep original column names (disable snake_case normalization). "
+             "By default, column names are converted to snake_case format "
+             "(e.g., 'Column Name' becomes 'column_name')."
+         ),
+     )
+
+     parser.add_argument(
+         "-d",
+         "--delimiter",
+         default=",",
+         help="Set custom field delimiter (default: comma). Example: -d ';'",
+     )
+
+     parser.add_argument(
+         "-o",
+         "--output-dir",
+         type=Path,
+         default=Path.cwd(),
+         help="Set custom output directory (default: current working directory)",
+     )
+
+     parser.add_argument(
+         "-V",
+         "--verbose",
+         action="store_true",
+         help="Enable verbose output for debugging",
+     )
+
+     parser.add_argument(
+         "-v",
+         "--version",
+         action=VersionAction,
+         nargs=0,
+         help="Show version number with banner",
+     )
+
+     return parser
+
+
+ def main(argv: list[str] | None = None) -> int:
+     """Main entry point for the CLI.
+
+     Args:
+         argv: Command line arguments (defaults to sys.argv[1:]).
+
+     Returns:
+         Exit code: 0 for success, 1 for error.
+     """
+     parser = create_parser()
+
+     # Handle missing arguments gracefully
+     if argv is None:
+         argv = sys.argv[1:]
+
+     if not argv or (len(argv) == 1 and argv[0] in ['-h', '--help']):
+         parser.print_help()
+         return 0 if argv else 2
+
+     args = parser.parse_args(argv)
+
+     # Show banner in verbose mode
+     if args.verbose:
+         show_banner()
+
+     # Setup logging
+     setup_logger(args.verbose)
+
+     # Run processing
+     return process_csv(
+         input_file=args.input_file,
+         output_dir=args.output_dir,
+         force=args.force,
+         keep_names=args.keep_names,
+         delimiter=args.delimiter,
+         verbose=args.verbose,
+     )
+
+
+ if __name__ == "__main__":
+     sys.exit(main())
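
For orientation, a minimal sketch of driving this CLI from Python rather than the shell; it relies only on the `main(argv)` signature and the flags defined in `create_parser()` above, and the file name is a placeholder.

```python
# Illustrative only: invoke the argparse-based entry point directly,
# passing the same arguments the shell command would receive.
from csvnorm.cli import main

# "data.csv" is a placeholder input; -d/-o/--force are defined in create_parser().
exit_code = main(["data.csv", "-d", ";", "-o", "out", "--force"])
print(exit_code)  # 0 on success, 1 on error
```
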
csvnorm/core.py ADDED
@@ -0,0 +1,189 @@
+ """Core processing logic for csvnorm."""
+
+ import logging
+ from pathlib import Path
+
+ from rich.console import Console
+ from rich.panel import Panel
+ from rich.progress import Progress, SpinnerColumn, TextColumn
+ from rich.table import Table
+
+ from csvnorm.encoding import convert_to_utf8, detect_encoding, needs_conversion
+ from csvnorm.utils import ensure_output_dir, to_snake_case, validate_delimiter
+ from csvnorm.validation import normalize_csv, validate_csv
+
+ logger = logging.getLogger("csvnorm")
+ console = Console()
+
+
+ def process_csv(
+     input_file: Path,
+     output_dir: Path,
+     force: bool = False,
+     keep_names: bool = False,
+     delimiter: str = ",",
+     verbose: bool = False,
+ ) -> int:
+     """Main CSV processing pipeline.
+
+     Args:
+         input_file: Path to input CSV file.
+         output_dir: Directory for output files.
+         force: If True, overwrite existing output files.
+         keep_names: If True, keep original column names.
+         delimiter: Output field delimiter.
+         verbose: If True, enable debug logging.
+
+     Returns:
+         Exit code: 0 for success, 1 for error.
+     """
+     # Validate inputs
+     if not input_file.exists():
+         console.print(Panel(
+             f"[bold red]Error:[/bold red] Input file not found\n{input_file}",
+             border_style="red"
+         ))
+         return 1
+
+     if not input_file.is_file():
+         console.print(Panel(
+             f"[bold red]Error:[/bold red] Not a file\n{input_file}",
+             border_style="red"
+         ))
+         return 1
+
+     try:
+         validate_delimiter(delimiter)
+     except ValueError as e:
+         console.print(Panel(
+             f"[bold red]Error:[/bold red] {e}",
+             border_style="red"
+         ))
+         return 1
+
+     # Setup paths
+     base_name = to_snake_case(input_file.name)
+     ensure_output_dir(output_dir)
+
+     output_file = output_dir / f"{base_name}.csv"
+     reject_file = output_dir / f"{base_name}_reject_errors.csv"
+     temp_utf8_file = output_dir / f"{base_name}_utf8.csv"
+
+     # Check if output exists
+     if output_file.exists() and not force:
+         console.print(Panel(
+             f"[bold yellow]Warning:[/bold yellow] Output file already exists\n\n"
+             f"{output_file}\n\n"
+             f"Use [bold]--force[/bold] to overwrite.",
+             border_style="yellow"
+         ))
+         return 1
+
+     # Clean up previous reject file
+     if reject_file.exists():
+         reject_file.unlink()
+
+     # Track files to clean up
+     temp_files: list[Path] = []
+
+     try:
+         with Progress(
+             SpinnerColumn(),
+             TextColumn("[progress.description]{task.description}"),
+             console=console,
+             transient=True
+         ) as progress:
+             # Step 1: Detect encoding
+             task = progress.add_task("[cyan]Detecting encoding...", total=None)
+             try:
+                 encoding = detect_encoding(input_file)
+             except ValueError as e:
+                 progress.stop()
+                 console.print(Panel(
+                     f"[bold red]Error:[/bold red] {e}",
+                     border_style="red"
+                 ))
+                 return 1
+
+             logger.debug(f"Detected encoding: {encoding}")
+             progress.update(task, description=f"[green]✓[/green] Detected encoding: {encoding}")
+
+             # Step 2: Convert to UTF-8 if needed
+             working_file = input_file
+             if needs_conversion(encoding):
+                 progress.update(task, description=f"[cyan]Converting from {encoding} to UTF-8...")
+                 try:
+                     convert_to_utf8(input_file, temp_utf8_file, encoding)
+                     working_file = temp_utf8_file
+                     temp_files.append(temp_utf8_file)
+                     progress.update(task, description=f"[green]✓[/green] Converted to UTF-8")
+                 except (UnicodeDecodeError, LookupError) as e:
+                     progress.stop()
+                     console.print(Panel(
+                         f"[bold red]Error:[/bold red] Encoding conversion failed\n{e}",
+                         border_style="red"
+                     ))
+                     return 1
+             else:
+                 progress.update(task, description=f"[green]✓[/green] Encoding: {encoding} (no conversion needed)")
+
+             # Step 3: Validate CSV
+             progress.update(task, description="[cyan]Validating CSV...")
+             logger.debug("Validating CSV with DuckDB...")
+             is_valid = validate_csv(working_file, reject_file)
+
+             if not is_valid:
+                 progress.stop()
+                 console.print(Panel(
+                     "[bold red]Error:[/bold red] DuckDB encountered invalid rows\n\n"
+                     f"Details: [cyan]{reject_file}[/cyan]\n\n"
+                     "Please fix the issues and try again.",
+                     border_style="red"
+                 ))
+                 return 1
+
+             progress.update(task, description="[green]✓[/green] CSV validated")
+
+             # Step 4: Normalize and write output
+             progress.update(task, description="[cyan]Normalizing and writing output...")
+             logger.debug("Normalizing CSV...")
+             normalize_csv(
+                 input_path=working_file,
+                 output_path=output_file,
+                 delimiter=delimiter,
+                 normalize_names=not keep_names,
+             )
+
+             logger.debug(f"Output written to: {output_file}")
+             progress.update(task, description="[green]✓[/green] Complete")
+
+         # Success summary table
+         table = Table(show_header=False, box=None, padding=(0, 1))
+         table.add_row("[green]✓[/green] Success", "")
+         table.add_row("Input:", f"[cyan]{input_file}[/cyan]")
+         table.add_row("Output:", f"[cyan]{output_file}[/cyan]")
+         table.add_row("Encoding:", encoding)
+         if delimiter != ",":
+             table.add_row("Delimiter:", repr(delimiter))
+         if not keep_names:
+             table.add_row("Headers:", "normalized to snake_case")
+
+         console.print()
+         console.print(table)
+
+     finally:
+         # Cleanup temp files
+         for temp_file in temp_files:
+             if temp_file.exists():
+                 logger.debug(f"Removing temp file: {temp_file}")
+                 temp_file.unlink()
+
+         # Remove reject file if empty (only header)
+         if reject_file.exists():
+             with open(reject_file, "r") as f:
+                 line_count = sum(1 for _ in f)
+             if line_count <= 1:
+                 logger.debug(f"Removing empty reject file: {reject_file}")
+                 reject_file.unlink()
+
+     return 0
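
A minimal sketch of calling the pipeline above as a library function; the keyword arguments mirror the `process_csv` signature, and the paths are placeholders.

```python
from pathlib import Path

from csvnorm.core import process_csv  # also re-exported as csvnorm.process_csv

# Placeholder paths; force=True overwrites an existing <base_name>.csv in output/.
code = process_csv(
    input_file=Path("data.csv"),
    output_dir=Path("output"),
    force=True,
    keep_names=False,   # headers normalized to snake_case
    delimiter=",",
    verbose=False,
)
raise SystemExit(code)  # 0 on success, 1 on error
```
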
csvnorm/encoding.py ADDED
@@ -0,0 +1,119 @@
+ """Encoding detection and conversion for CSV files."""
+
+ import codecs
+ import logging
+ from pathlib import Path
+
+ from charset_normalizer import from_path
+
+ logger = logging.getLogger("csvnorm")
+
+ # Encoding alias mapping for Python codec compatibility
+ ENCODING_ALIASES: dict[str, str] = {
+     "macroman": "mac_roman",
+     "macintosh": "mac_roman",
+     "utf_8": "utf-8",
+     "utf_8_sig": "utf-8-sig",
+     "ascii": "ascii",
+ }
+
+ # Encodings that don't need conversion
+ UTF8_ENCODINGS = frozenset({"utf-8", "ascii", "utf-8-sig"})
+
+
+ def normalize_encoding_name(encoding: str) -> str:
+     """Normalize encoding name to Python codec name.
+
+     Args:
+         encoding: Raw encoding name from detection.
+
+     Returns:
+         Normalized encoding name compatible with Python codecs.
+     """
+     encoding_lower = encoding.lower().replace("-", "_")
+
+     # Check alias mapping
+     if encoding_lower in ENCODING_ALIASES:
+         return ENCODING_ALIASES[encoding_lower]
+
+     # Try to normalize with underscores to dashes
+     return encoding_lower.replace("_", "-")
+
+
+ def detect_encoding(file_path: Path) -> str:
+     """Detect the encoding of a file using charset_normalizer.
+
+     Args:
+         file_path: Path to the file to analyze.
+
+     Returns:
+         Detected encoding name (normalized for Python codecs).
+
+     Raises:
+         ValueError: If encoding cannot be detected.
+     """
+     logger.debug(f"Detecting encoding for: {file_path}")
+
+     result = from_path(file_path)
+     best = result.best()
+
+     if best is None:
+         logger.debug("charset_normalizer failed, cannot detect encoding")
+         raise ValueError(f"Cannot detect encoding for: {file_path}")
+
+     encoding = best.encoding
+     logger.debug(f"Detected encoding: {encoding}")
+
+     # Normalize the encoding name
+     normalized = normalize_encoding_name(encoding)
+     if normalized != encoding.lower():
+         logger.debug(f"Normalized encoding: {encoding} -> {normalized}")
+
+     return normalized
+
+
+ def needs_conversion(encoding: str) -> bool:
+     """Check if file needs encoding conversion to UTF-8.
+
+     Args:
+         encoding: Detected encoding name.
+
+     Returns:
+         True if conversion is needed, False otherwise.
+     """
+     encoding_lower = encoding.lower()
+     return encoding_lower not in UTF8_ENCODINGS
+
+
+ def convert_to_utf8(input_path: Path, output_path: Path, source_encoding: str) -> Path:
+     """Convert file from source encoding to UTF-8.
+
+     Args:
+         input_path: Path to input file.
+         output_path: Path for UTF-8 output file.
+         source_encoding: Source file encoding.
+
+     Returns:
+         Path to the converted file.
+
+     Raises:
+         UnicodeDecodeError: If file cannot be decoded with the specified encoding.
+         LookupError: If the encoding is not supported.
+     """
+     logger.debug(f"Converting from {source_encoding} to UTF-8")
+
+     # Validate encoding exists
+     try:
+         codecs.lookup(source_encoding)
+     except LookupError as e:
+         raise LookupError(f"Unknown encoding: {source_encoding}") from e
+
+     # Read with source encoding, write as UTF-8
+     with open(input_path, "r", encoding=source_encoding, errors="strict") as f:
+         content = f.read()
+
+     with open(output_path, "w", encoding="utf-8") as f:
+         f.write(content)
+
+     logger.debug(f"Converted file written to: {output_path}")
+     return output_path
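
A small sketch of using these helpers on their own, assuming only the behaviour documented above (detection via charset_normalizer, conversion only when needed); the file names are placeholders.

```python
from pathlib import Path

from csvnorm.encoding import convert_to_utf8, detect_encoding, needs_conversion

src = Path("legacy.csv")          # placeholder input file
enc = detect_encoding(src)        # e.g. "mac_roman" after alias normalization
if needs_conversion(enc):
    convert_to_utf8(src, Path("legacy_utf8.csv"), enc)
```
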
csvnorm/utils.py ADDED
@@ -0,0 +1,71 @@
+ """Utility functions for csvnorm."""
+
+ import logging
+ import re
+ from pathlib import Path
+
+ from rich.logging import RichHandler
+
+
+ def to_snake_case(name: str) -> str:
+     """Convert filename to clean snake_case.
+
+     Replicates the bash logic:
+         tr '[:upper:]' '[:lower:]' |
+         sed -E 's/[^a-z0-9]+/_/g' |
+         sed -E 's/_+/_/g' |
+         sed -E 's/^_|_$//g'
+     """
+     # Remove .csv extension if present
+     if name.lower().endswith(".csv"):
+         name = name[:-4]
+
+     # Convert to lowercase
+     name = name.lower()
+
+     # Replace non-alphanumeric with underscore
+     name = re.sub(r"[^a-z0-9]+", "_", name)
+
+     # Collapse multiple underscores
+     name = re.sub(r"_+", "_", name)
+
+     # Remove leading/trailing underscores
+     name = name.strip("_")
+
+     return name
+
+
+ def setup_logger(verbose: bool = False) -> logging.Logger:
+     """Setup and return a logger instance with rich formatting.
+
+     Args:
+         verbose: If True, set log level to DEBUG, else INFO.
+     """
+     logger = logging.getLogger("csvnorm")
+
+     if not logger.handlers:
+         handler = RichHandler(
+             show_time=False,
+             show_path=verbose,
+             markup=True,
+             rich_tracebacks=True
+         )
+         logger.addHandler(handler)
+
+     logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+     return logger
+
+
+ def validate_delimiter(delimiter: str) -> None:
+     """Validate that delimiter is a single character.
+
+     Raises:
+         ValueError: If delimiter is not exactly one character.
+     """
+     if len(delimiter) != 1:
+         raise ValueError("--delimiter must be a single character")
+
+
+ def ensure_output_dir(output_dir: Path) -> None:
+     """Create output directory if it doesn't exist."""
+     output_dir.mkdir(parents=True, exist_ok=True)
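
A quick sketch of the helpers above; the expected results follow directly from the regex steps in `to_snake_case` and the single-character check in `validate_delimiter`, and the sample strings are arbitrary.

```python
from csvnorm.utils import to_snake_case, validate_delimiter

print(to_snake_case("My Data (2024).CSV"))  # -> "my_data_2024"
validate_delimiter(";")                     # OK: single character
validate_delimiter("||")                    # raises ValueError
```
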
csvnorm/validation.py ADDED
@@ -0,0 +1,109 @@
+ """CSV validation and normalization using DuckDB."""
+
+ import logging
+ from pathlib import Path
+
+ import duckdb
+
+ logger = logging.getLogger("csvnorm")
+
+
+ def validate_csv(file_path: Path, reject_file: Path) -> bool:
+     """Validate CSV file using DuckDB and export rejected rows.
+
+     Args:
+         file_path: Path to CSV file to validate.
+         reject_file: Path to write rejected rows.
+
+     Returns:
+         True if validation passes (no rejected rows), False otherwise.
+     """
+     logger.debug(f"Validating CSV: {file_path}")
+
+     conn = duckdb.connect()
+
+     try:
+         # Read CSV with store_rejects to capture malformed rows
+         # Use all_varchar=true to avoid type inference failures
+         conn.execute(f"""
+             COPY (
+                 FROM read_csv(
+                     '{file_path}',
+                     store_rejects=true,
+                     sample_size=-1,
+                     all_varchar=true
+                 )
+             ) TO '/dev/null'
+         """)
+
+         # Export rejected rows to file
+         conn.execute(f"COPY (FROM reject_errors) TO '{reject_file}'")
+
+     finally:
+         conn.close()
+
+     # Check if there are rejected rows (more than just header)
+     reject_count = _count_lines(reject_file)
+     logger.debug(f"Reject file lines: {reject_count}")
+
+     return reject_count <= 1
+
+
+ def normalize_csv(
+     input_path: Path,
+     output_path: Path,
+     delimiter: str = ",",
+     normalize_names: bool = True,
+ ) -> None:
+     """Normalize CSV file using DuckDB.
+
+     Args:
+         input_path: Path to input CSV file.
+         output_path: Path for normalized output file.
+         delimiter: Output field delimiter.
+         normalize_names: If True, convert column names to snake_case.
+     """
+     logger.debug(f"Normalizing CSV: {input_path} -> {output_path}")
+
+     conn = duckdb.connect()
+
+     try:
+         # Build read options
+         read_opts = "sample_size=-1, all_varchar=true"
+         if normalize_names:
+             read_opts += ", normalize_names=true"
+
+         # Build copy options
+         copy_opts = "header true, format csv"
+         if delimiter != ",":
+             copy_opts += f", delimiter '{delimiter}'"
+
+         query = f"""
+             COPY (
+                 SELECT * FROM read_csv('{input_path}', {read_opts})
+             ) TO '{output_path}' ({copy_opts})
+         """
+
+         logger.debug(f"DuckDB query: {query}")
+         conn.execute(query)
+
+     finally:
+         conn.close()
+
+     logger.debug(f"Normalized file written to: {output_path}")
+
+
+ def _count_lines(file_path: Path) -> int:
+     """Count lines in a file.
+
+     Args:
+         file_path: Path to file.
+
+     Returns:
+         Number of lines in file, or 0 if file doesn't exist.
+     """
+     if not file_path.exists():
+         return 0
+
+     with open(file_path, "r") as f:
+         return sum(1 for _ in f)
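
A minimal sketch of using the two DuckDB helpers above directly, in the same order `process_csv` calls them in core.py; the paths are placeholders.

```python
from pathlib import Path

from csvnorm.validation import normalize_csv, validate_csv

src = Path("data_utf8.csv")               # placeholder, already UTF-8
rejects = Path("data_reject_errors.csv")  # reject report written by DuckDB

if validate_csv(src, rejects):
    normalize_csv(src, Path("data_clean.csv"), delimiter=";", normalize_names=True)
else:
    print(f"invalid rows found, see {rejects}")
```
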
csvnorm-0.3.3.dist-info/METADATA ADDED
@@ -0,0 +1,240 @@
+ Metadata-Version: 2.4
+ Name: csvnorm
+ Version: 0.3.3
+ Summary: A command-line utility to validate and normalize CSV files
+ Author-email: aborruso <aborruso@gmail.com>
+ License: MIT License
+
+ Copyright (c) 2026 aborruso@gmail.com
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+ Project-URL: Homepage, https://github.com/aborruso/prepare_data
+ Project-URL: Issues, https://github.com/aborruso/prepare_data/issues
+ Keywords: csv,data,normalization,validation,etl
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
+ Classifier: Topic :: Software Development :: Libraries
+ Classifier: Topic :: Utilities
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: charset-normalizer>=3.0.0
+ Requires-Dist: duckdb>=0.9.0
+ Requires-Dist: rich>=13.0.0
+ Requires-Dist: rich-argparse>=1.0.0
+ Requires-Dist: pyfiglet>=1.0.0
+ Provides-Extra: dev
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
+ Provides-Extra: banner
+ Requires-Dist: pyfiglet>=1.0.0; extra == "banner"
+ Dynamic: license-file
+
+ [![PyPI version](https://badge.fury.io/py/csvnorm.svg)](https://pypi.org/project/csvnorm/)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
+ [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/aborruso/csvnorm)
+
+ # csvnorm
+
+ A command-line utility to validate and normalize CSV files for initial exploration.
+
+ ## Installation
+
+ Recommended (uv):
+
+ ```bash
+ uv tool install csvnorm
+ ```
+
+ Or with pip:
+
+ ```bash
+ pip install csvnorm
+ ```
+
+ For ASCII art banner (shown with `--version` and `-V`):
+
+ ```bash
+ uv tool install 'csvnorm[banner]'
+ # or
+ pip install 'csvnorm[banner]'
+ ```
+
+ Example with banner:
+ ```bash
+ csvnorm --version
+ # Output:
+ #    ___________   ______  ____  _________ ___
+ #   / ___/ ___/ | / / __ \/ __ \/ ___/ __ `__ \
+ #  / /__(__  )| |/ / / / / /_/ / /  / / / / / /
+ #  \___/____/ |___/_/ /_/\____/_/  /_/ /_/ /_/
+ #
+ # csvnorm 0.3.1
+ ```
+
+ ## Purpose
+
+ This tool prepares CSV files for **basic exploratory data analysis (EDA)**, not for complex transformations. It focuses on achieving a clean, standardized baseline format that allows you to quickly assess data quality and structure before designing more sophisticated ETL pipelines.
+
+ **What it does:**
+ - Validates CSV structure and reports errors
+ - Normalizes encoding to UTF-8
+ - Normalizes delimiters and field names
+ - Creates a consistent starting point for data exploration
+
+ **What it doesn't do:**
+ - Complex data transformations or business logic
+ - Type inference or data validation beyond structure
+ - Heavy processing or aggregations
+
+ ## Features
+
+ - **CSV Validation**: Checks for common CSV errors and inconsistencies using DuckDB
+ - **Delimiter Normalization**: Converts all field separators to standard commas (`,`)
+ - **Field Name Normalization**: Converts column headers to snake_case format
+ - **Encoding Normalization**: Auto-detects encoding and converts to UTF-8
+ - **Error Reporting**: Exports detailed error file for invalid rows
+
+ ## Usage
+
+ ```bash
+ csvnorm input.csv [options]
+ ```
+
+ ### Options
+
+ | Option | Description |
+ |--------|-------------|
+ | `-f, --force` | Force overwrite of existing output files |
+ | `-k, --keep-names` | Keep original column names (disable snake_case) |
+ | `-d, --delimiter CHAR` | Set custom output delimiter (default: `,`) |
+ | `-o, --output-dir DIR` | Set output directory (default: current dir) |
+ | `-V, --verbose` | Enable verbose output for debugging |
+ | `-v, --version` | Show version number |
+ | `-h, --help` | Show help message |
+
+ ### Examples
+
+ ```bash
+ # Basic usage
+ csvnorm data.csv
+
+ # With semicolon delimiter
+ csvnorm data.csv -d ';'
+
+ # Custom output directory
+ csvnorm data.csv -o ./output
+
+ # Keep original headers
+ csvnorm data.csv --keep-names
+
+ # Force overwrite with verbose output
+ csvnorm data.csv -f -V
+ ```
+
+ ### Output
+
+ Creates a normalized CSV file in the specified output directory with:
+ - UTF-8 encoding
+ - Consistent field delimiters
+ - Normalized column names (unless `--keep-names` is specified)
+ - Error report if any invalid rows are found (saved as `{input_name}_reject_errors.csv`)
+
+ The tool provides modern terminal output with:
+ - Progress indicators for multi-step processing
+ - Color-coded error messages with panels
+ - Success summary table showing encoding, paths, and settings
+ - Optional ASCII art banner with `--version` and `-V` verbose mode (requires `pyfiglet`)
+
+ ### Exit Codes
+
+ | Code | Meaning |
+ |------|---------|
+ | 0 | Success |
+ | 1 | Error (validation failed, file not found, etc.) |
+
+ ## Requirements
+
+ - Python 3.9+
+ - Dependencies (automatically installed):
+   - `charset-normalizer>=3.0.0` - Encoding detection
+   - `duckdb>=0.9.0` - CSV validation and normalization
+   - `rich>=13.0.0` - Modern terminal output formatting
+   - `rich-argparse>=1.0.0` - Enhanced CLI help formatting
+
+ Optional extras:
+ - `[banner]` - ASCII art banner for `--version` and `-V` verbose mode (`pyfiglet>=1.0.0`)
+ - `[dev]` - Development dependencies (`pytest>=7.0.0`, `pytest-cov>=4.0.0`, `ruff>=0.1.0`)
+
+ ## Development
+
+ ### Setup
+
+ ```bash
+ git clone https://github.com/aborruso/csvnorm
+ cd csvnorm
+
+ # Create and activate venv with uv (recommended)
+ uv venv
+ source .venv/bin/activate
+ uv pip install -e ".[dev]"
+
+ # Or with pip
+ pip install -e ".[dev]"
+ ```
+
+ ### Testing
+
+ ```bash
+ pytest tests/ -v
+ ```
+
+ ### Project Structure
+
+ ```
+ prepare_data/
+ ├── src/csvnorm/
+ │   ├── __init__.py      # Package version
+ │   ├── __main__.py      # python -m support
+ │   ├── cli.py           # CLI argument parsing
+ │   ├── core.py          # Main processing pipeline
+ │   ├── encoding.py      # Encoding detection/conversion
+ │   ├── validation.py    # DuckDB validation
+ │   └── utils.py         # Helper functions
+ ├── tests/               # Test suite
+ ├── test/                # CSV fixtures
+ └── pyproject.toml       # Package configuration
+ ```
+
+ ## License
+
+ MIT License (c) 2026 aborruso@gmail.com - See LICENSE file for details
csvnorm-0.3.3.dist-info/RECORD ADDED
@@ -0,0 +1,13 @@
+ csvnorm/__init__.py,sha256=8njXIycxL0qSI5Q9bVGyTaM41j_kKX9jV7TeQOSAQGE,263
+ csvnorm/__main__.py,sha256=WDURvgm7E-yMN_ZvbBBmgzIJz5naonZxQ9RYcoD7ves,110
+ csvnorm/cli.py,sha256=MwIPahLktbulF6NYRWyBsE4s9Al9_aSdA1zvzuI0AiQ,3815
+ csvnorm/core.py,sha256=_kTaui_2IhqrN_UxJpcjwXYXEvqaRMhML49Xlx-e0p0,6633
+ csvnorm/encoding.py,sha256=kV7J8nVWDv_F6bKUCrMj2rReMB4Zecf4gwFeebdjm98,3334
+ csvnorm/utils.py,sha256=gvwDToOx3YoKCfVPyCmxcSa7teCWFB2SmAGr-jV5w_Y,1761
+ csvnorm/validation.py,sha256=iXdfalAGDNB9kPefyzHXGI9uc-HLAG5pQ_-T93ShppY,2815
+ csvnorm-0.3.3.dist-info/licenses/LICENSE,sha256=iPlktyjNwNMal_HUjGRTAavIF8mHV3CxGW4g5XmG3ZU,1075
+ csvnorm-0.3.3.dist-info/METADATA,sha256=xKJmLVX9RoB22KwAAlxAvWB_KA9h68m5V-UyFaS_DGo,7840
+ csvnorm-0.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ csvnorm-0.3.3.dist-info/entry_points.txt,sha256=6GmXNAdKzhHle3KV6Gbe95RS6edR0Fe6PVqdy1ED23k,45
+ csvnorm-0.3.3.dist-info/top_level.txt,sha256=jMenLN9avlYH8z8v23sReLFkUYx0awPGFm8cTnoAl-0,8
+ csvnorm-0.3.3.dist-info/RECORD,,
csvnorm-0.3.3.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
csvnorm-0.3.3.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ csvnorm = csvnorm.cli:main
csvnorm-0.3.3.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 aborruso@gmail.com
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
csvnorm-0.3.3.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ csvnorm