csvnorm 0.3.3__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
csvnorm/__init__.py CHANGED
@@ -1,6 +1,6 @@
1
1
  """csvnorm - Validate and normalize CSV files."""
2
2
 
3
- __version__ = "0.3.3"
3
+ __version__ = "0.3.11"
4
4
  __all__ = ["normalize_csv", "detect_encoding", "process_csv"]
5
5
 
6
6
  from csvnorm.core import process_csv
csvnorm/cli.py CHANGED
@@ -3,6 +3,7 @@
3
3
  import argparse
4
4
  import sys
5
5
  from pathlib import Path
6
+ from typing import Optional
6
7
 
7
8
  from rich.console import Console
8
9
  from rich_argparse import RichHelpFormatter
@@ -15,10 +16,10 @@ console = Console()
15
16
 
16
17
 
17
18
  def show_banner() -> None:
18
- """Show ASCII art banner."""
19
- from pyfiglet import figlet_format
20
- banner = figlet_format("csvnorm", font="slant")
21
- console.print(banner, style="bold cyan")
19
+ """Show simple styled banner."""
20
+ console.print()
21
+ console.print(" csvnorm ", style="bold cyan on black", justify="center")
22
+ console.print()
22
23
 
23
24
 
24
25
  class VersionAction(argparse.Action):
@@ -28,10 +29,15 @@ class VersionAction(argparse.Action):
28
29
  show_banner()
29
30
  console.print(f"csvnorm {__version__}", style="bold")
30
31
  console.print()
31
- console.print("Validate and normalize CSV files for exploratory data analysis", style="dim")
32
+ console.print(
33
+ "Validate and normalize CSV files for exploratory data analysis",
34
+ style="dim",
35
+ )
32
36
  console.print()
33
37
  console.print("Author: aborruso", style="dim")
34
- console.print("Repository: https://github.com/aborruso/csvnorm", style="dim cyan")
38
+ console.print(
39
+ "Repository: https://github.com/aborruso/csvnorm", style="dim cyan"
40
+ )
35
41
  console.print("License: MIT", style="dim")
36
42
  parser.exit()
37
43
 
@@ -44,16 +50,17 @@ def create_parser() -> argparse.ArgumentParser:
44
50
  formatter_class=RichHelpFormatter,
45
51
  epilog="""\
46
52
  Examples:
47
- csvnorm data.csv -d ';' -o output_folder --force
53
+ csvnorm data.csv -d ';' -o output.csv --force
48
54
  csvnorm data.csv --keep-names --delimiter '\\t'
55
+ csvnorm https://example.com/data.csv -o processed/data.csv
49
56
  csvnorm data.csv -V
50
57
  """,
51
58
  )
52
59
 
53
60
  parser.add_argument(
54
61
  "input_file",
55
- type=Path,
56
- help="Input CSV file path",
62
+ type=str,
63
+ help="Input CSV file path or HTTP/HTTPS URL",
57
64
  )
58
65
 
59
66
  parser.add_argument(
@@ -83,10 +90,9 @@ Examples:
83
90
 
84
91
  parser.add_argument(
85
92
  "-o",
86
- "--output-dir",
93
+ "--output-file",
87
94
  type=Path,
88
- default=Path.cwd(),
89
- help="Set custom output directory (default: current working directory)",
95
+ help="Set output file path (absolute or relative)",
90
96
  )
91
97
 
92
98
  parser.add_argument(
@@ -107,7 +113,7 @@ Examples:
107
113
  return parser
108
114
 
109
115
 
110
- def main(argv: list[str] | None = None) -> int:
116
+ def main(argv: Optional[list[str]] = None) -> int:
111
117
  """Main entry point for the CLI.
112
118
 
113
119
  Args:
@@ -122,7 +128,7 @@ def main(argv: list[str] | None = None) -> int:
122
128
  if argv is None:
123
129
  argv = sys.argv[1:]
124
130
 
125
- if not argv or (len(argv) == 1 and argv[0] in ['-h', '--help']):
131
+ if not argv or (len(argv) == 1 and argv[0] in ["-h", "--help"]):
126
132
  parser.print_help()
127
133
  return 0 if argv else 2
128
134
 
@@ -135,10 +141,17 @@ def main(argv: list[str] | None = None) -> int:
135
141
  # Setup logging
136
142
  setup_logger(args.verbose)
137
143
 
144
+ # Determine output file (default: input filename in current directory)
145
+ if args.output_file is None:
146
+ input_name = Path(args.input_file).name
147
+ output_file = Path.cwd() / input_name
148
+ else:
149
+ output_file = args.output_file
150
+
138
151
  # Run processing
139
152
  return process_csv(
140
153
  input_file=args.input_file,
141
- output_dir=args.output_dir,
154
+ output_file=output_file,
142
155
  force=args.force,
143
156
  keep_names=args.keep_names,
144
157
  delimiter=args.delimiter,
csvnorm/core.py CHANGED
@@ -1,15 +1,29 @@
1
1
  """Core processing logic for csvnorm."""
2
2
 
3
3
  import logging
4
+ import tempfile
4
5
  from pathlib import Path
6
+ from typing import Union
5
7
 
6
8
  from rich.console import Console
7
- from rich.panel import Panel
8
9
  from rich.progress import Progress, SpinnerColumn, TextColumn
9
- from rich.table import Table
10
10
 
11
11
  from csvnorm.encoding import convert_to_utf8, detect_encoding, needs_conversion
12
- from csvnorm.utils import ensure_output_dir, to_snake_case, validate_delimiter
12
+ from csvnorm.ui import (
13
+ show_error_panel,
14
+ show_success_table,
15
+ show_validation_error_panel,
16
+ show_warning_panel,
17
+ )
18
+ from csvnorm.utils import (
19
+ extract_filename_from_url,
20
+ get_column_count,
21
+ get_row_count,
22
+ is_url,
23
+ to_snake_case,
24
+ validate_delimiter,
25
+ validate_url,
26
+ )
13
27
  from csvnorm.validation import normalize_csv, validate_csv
14
28
 
15
29
  logger = logging.getLogger("csvnorm")
@@ -17,8 +31,8 @@ console = Console()
17
31
 
18
32
 
19
33
  def process_csv(
20
- input_file: Path,
21
- output_dir: Path,
34
+ input_file: str,
35
+ output_file: Path,
22
36
  force: bool = False,
23
37
  keep_names: bool = False,
24
38
  delimiter: str = ",",
@@ -27,8 +41,8 @@ def process_csv(
27
41
  """Main CSV processing pipeline.
28
42
 
29
43
  Args:
30
- input_file: Path to input CSV file.
31
- output_dir: Directory for output files.
44
+ input_file: Path to input CSV file or HTTP/HTTPS URL.
45
+ output_file: Full path for output file.
32
46
  force: If True, overwrite existing output files.
33
47
  keep_names: If True, keep original column names.
34
48
  delimiter: Output field delimiter.
@@ -37,111 +51,169 @@ def process_csv(
37
51
  Returns:
38
52
  Exit code: 0 for success, 1 for error.
39
53
  """
40
- # Validate inputs
41
- if not input_file.exists():
42
- console.print(Panel(
43
- f"[bold red]Error:[/bold red] Input file not found\n{input_file}",
44
- border_style="red"
45
- ))
46
- return 1
54
+ # Detect if input is URL or file
55
+ is_remote = is_url(input_file)
47
56
 
48
- if not input_file.is_file():
49
- console.print(Panel(
50
- f"[bold red]Error:[/bold red] Not a file\n{input_file}",
51
- border_style="red"
52
- ))
53
- return 1
57
+ input_path: Union[str, Path]
58
+ if is_remote:
59
+ # Validate URL
60
+ try:
61
+ validate_url(input_file)
62
+ except ValueError as e:
63
+ show_error_panel(str(e))
64
+ return 1
65
+ base_name = extract_filename_from_url(input_file)
66
+ input_path = input_file # Keep as string for DuckDB
67
+ else:
68
+ # Validate local file
69
+ file_path = Path(input_file)
70
+ if not file_path.exists():
71
+ show_error_panel(f"Input file not found\n{file_path}")
72
+ return 1
73
+
74
+ if not file_path.is_file():
75
+ show_error_panel(f"Not a file\n{file_path}")
76
+ return 1
77
+
78
+ base_name = to_snake_case(file_path.name)
79
+ input_path = file_path
54
80
 
55
81
  try:
56
82
  validate_delimiter(delimiter)
57
83
  except ValueError as e:
58
- console.print(Panel(
59
- f"[bold red]Error:[/bold red] {e}",
60
- border_style="red"
61
- ))
84
+ show_error_panel(str(e))
62
85
  return 1
63
86
 
64
87
  # Setup paths
65
- base_name = to_snake_case(input_file.name)
66
- ensure_output_dir(output_dir)
67
-
68
- output_file = output_dir / f"{base_name}.csv"
69
- reject_file = output_dir / f"{base_name}_reject_errors.csv"
70
- temp_utf8_file = output_dir / f"{base_name}_utf8.csv"
88
+ output_dir = output_file.parent
89
+ temp_dir = Path(tempfile.mkdtemp(prefix="csvnorm_"))
90
+ reject_file = output_dir / f"{output_file.stem}_reject_errors.csv"
91
+ temp_utf8_file = temp_dir / f"{output_file.stem}_utf8.csv"
71
92
 
72
93
  # Check if output exists
73
94
  if output_file.exists() and not force:
74
- console.print(Panel(
75
- f"[bold yellow]Warning:[/bold yellow] Output file already exists\n\n"
95
+ show_warning_panel(
96
+ f"Output file already exists\n\n"
76
97
  f"{output_file}\n\n"
77
- f"Use [bold]--force[/bold] to overwrite.",
78
- border_style="yellow"
79
- ))
98
+ f"Use [bold]--force[/bold] to overwrite."
99
+ )
80
100
  return 1
81
101
 
82
- # Clean up previous reject file
102
+ # Clean up previous reject file (always overwrite)
83
103
  if reject_file.exists():
84
104
  reject_file.unlink()
85
105
 
86
106
  # Track files to clean up
87
- temp_files: list[Path] = []
107
+ temp_files: list[Path] = [temp_dir]
88
108
 
89
109
  try:
90
110
  with Progress(
91
111
  SpinnerColumn(),
92
112
  TextColumn("[progress.description]{task.description}"),
93
113
  console=console,
94
- transient=True
114
+ transient=True,
95
115
  ) as progress:
96
- # Step 1: Detect encoding
97
- task = progress.add_task("[cyan]Detecting encoding...", total=None)
98
- try:
99
- encoding = detect_encoding(input_file)
100
- except ValueError as e:
101
- progress.stop()
102
- console.print(Panel(
103
- f"[bold red]Error:[/bold red] {e}",
104
- border_style="red"
105
- ))
106
- return 1
116
+ task = progress.add_task("[cyan]Processing...", total=None)
107
117
 
108
- logger.debug(f"Detected encoding: {encoding}")
109
- progress.update(task, description=f"[green]✓[/green] Detected encoding: {encoding}")
118
+ # For remote URLs, skip encoding detection/conversion
119
+ if is_remote:
120
+ progress.update(
121
+ task,
122
+ description="[green]✓[/green] Remote URL (encoding handled by DuckDB)",
123
+ )
124
+ working_file = input_path # Keep URL as string
125
+ encoding = "remote"
126
+ else:
127
+ # Step 1: Detect encoding (local files only)
128
+ # input_path is Path here (set in else block above)
129
+ file_input_path = input_path # Type narrowing for mypy
130
+ assert isinstance(file_input_path, Path)
110
131
 
111
- # Step 2: Convert to UTF-8 if needed
112
- working_file = input_file
113
- if needs_conversion(encoding):
114
- progress.update(task, description=f"[cyan]Converting from {encoding} to UTF-8...")
132
+ progress.update(task, description="[cyan]Detecting encoding...")
115
133
  try:
116
- convert_to_utf8(input_file, temp_utf8_file, encoding)
117
- working_file = temp_utf8_file
118
- temp_files.append(temp_utf8_file)
119
- progress.update(task, description=f"[green]✓[/green] Converted to UTF-8")
120
- except (UnicodeDecodeError, LookupError) as e:
134
+ encoding = detect_encoding(file_input_path)
135
+ except ValueError as e:
121
136
  progress.stop()
122
- console.print(Panel(
123
- f"[bold red]Error:[/bold red] Encoding conversion failed\n{e}",
124
- border_style="red"
125
- ))
137
+ show_error_panel(str(e))
126
138
  return 1
127
- else:
128
- progress.update(task, description=f"[green]✓[/green] Encoding: {encoding} (no conversion needed)")
139
+
140
+ logger.debug(f"Detected encoding: {encoding}")
141
+ progress.update(
142
+ task, description=f"[green]✓[/green] Detected encoding: {encoding}"
143
+ )
144
+
145
+ # Step 2: Convert to UTF-8 if needed
146
+ working_file = file_input_path
147
+ if needs_conversion(encoding):
148
+ progress.update(
149
+ task,
150
+ description=f"[cyan]Converting from {encoding} to UTF-8...",
151
+ )
152
+ try:
153
+ convert_to_utf8(file_input_path, temp_utf8_file, encoding)
154
+ working_file = temp_utf8_file
155
+ temp_files.append(temp_utf8_file)
156
+ progress.update(
157
+ task, description="[green]✓[/green] Converted to UTF-8"
158
+ )
159
+ except (UnicodeDecodeError, LookupError) as e:
160
+ progress.stop()
161
+ show_error_panel(f"Encoding conversion failed\n{e}")
162
+ return 1
163
+ else:
164
+ progress.update(
165
+ task,
166
+ description=f"[green]✓[/green] Encoding: {encoding} (no conversion needed)",
167
+ )
129
168
 
130
169
  # Step 3: Validate CSV
131
170
  progress.update(task, description="[cyan]Validating CSV...")
132
171
  logger.debug("Validating CSV with DuckDB...")
133
- is_valid = validate_csv(working_file, reject_file)
134
172
 
135
- if not is_valid:
173
+ try:
174
+ reject_count, error_types = validate_csv(
175
+ working_file, reject_file, is_remote=is_remote
176
+ )
177
+ except Exception as e:
136
178
  progress.stop()
137
- console.print(Panel(
138
- "[bold red]Error:[/bold red] DuckDB encountered invalid rows\n\n"
139
- f"Details: [cyan]{reject_file}[/cyan]\n\n"
140
- "Please fix the issues and try again.",
141
- border_style="red"
142
- ))
179
+ error_msg = str(e)
180
+
181
+ # Check for common HTTP errors
182
+ if "HTTP Error" in error_msg or "HTTPException" in error_msg:
183
+ if "404" in error_msg:
184
+ show_error_panel(
185
+ f"Remote CSV file not found (HTTP 404)\n\n"
186
+ f"URL: [cyan]{input_file}[/cyan]\n\n"
187
+ "Please check the URL is correct."
188
+ )
189
+ elif "401" in error_msg or "403" in error_msg:
190
+ show_error_panel(
191
+ f"Authentication required (HTTP 401/403)\n\n"
192
+ f"URL: [cyan]{input_file}[/cyan]\n\n"
193
+ "This tool only supports public URLs without authentication.\n"
194
+ "Please download the file manually first."
195
+ )
196
+ elif (
197
+ "timeout" in error_msg.lower()
198
+ or "timed out" in error_msg.lower()
199
+ ):
200
+ show_error_panel(
201
+ f"HTTP request timeout (30 seconds)\n\n"
202
+ f"URL: [cyan]{input_file}[/cyan]\n\n"
203
+ "The remote server took too long to respond.\n"
204
+ "Try again later or download the file manually."
205
+ )
206
+ else:
207
+ show_error_panel(f"HTTP request failed\n\n{error_msg}")
208
+ else:
209
+ # Re-raise non-HTTP errors
210
+ raise
143
211
  return 1
144
212
 
213
+ has_validation_errors = reject_count > 1
214
+ if has_validation_errors:
215
+ progress.stop()
216
+
145
217
  progress.update(task, description="[green]✓[/green] CSV validated")
146
218
 
147
219
  # Step 4: Normalize and write output
@@ -152,31 +224,50 @@ def process_csv(
152
224
  output_path=output_file,
153
225
  delimiter=delimiter,
154
226
  normalize_names=not keep_names,
227
+ is_remote=is_remote,
155
228
  )
156
229
 
157
230
  logger.debug(f"Output written to: {output_file}")
158
231
  progress.update(task, description="[green]✓[/green] Complete")
159
232
 
160
- # Success summary table
161
- table = Table(show_header=False, box=None, padding=(0, 1))
162
- table.add_row("[green]✓[/green] Success", "")
163
- table.add_row("Input:", f"[cyan]{input_file}[/cyan]")
164
- table.add_row("Output:", f"[cyan]{output_file}[/cyan]")
165
- table.add_row("Encoding:", encoding)
166
- if delimiter != ",":
167
- table.add_row("Delimiter:", repr(delimiter))
168
- if not keep_names:
169
- table.add_row("Headers:", "normalized to snake_case")
233
+ # Collect statistics
234
+ input_size = (
235
+ working_file.stat().st_size if isinstance(working_file, Path) else 0
236
+ )
237
+ output_size = output_file.stat().st_size
238
+ row_count = get_row_count(output_file)
239
+ column_count = get_column_count(output_file, delimiter)
170
240
 
171
- console.print()
172
- console.print(table)
241
+ # Show success summary
242
+ show_success_table(
243
+ input_file=input_file,
244
+ output_file=output_file,
245
+ encoding=encoding,
246
+ is_remote=is_remote,
247
+ row_count=row_count,
248
+ column_count=column_count,
249
+ input_size=input_size,
250
+ output_size=output_size,
251
+ delimiter=delimiter,
252
+ keep_names=keep_names,
253
+ )
254
+
255
+ # Show validation errors if any
256
+ if has_validation_errors:
257
+ show_validation_error_panel(reject_count, error_types, reject_file)
258
+ return 1
173
259
 
174
260
  finally:
175
- # Cleanup temp files
176
- for temp_file in temp_files:
177
- if temp_file.exists():
178
- logger.debug(f"Removing temp file: {temp_file}")
179
- temp_file.unlink()
261
+ # Cleanup temp directory
262
+ import shutil
263
+
264
+ for temp_path in temp_files:
265
+ if temp_path.exists():
266
+ logger.debug(f"Removing temp path: {temp_path}")
267
+ if temp_path.is_dir():
268
+ shutil.rmtree(temp_path)
269
+ else:
270
+ temp_path.unlink()
180
271
 
181
272
  # Remove reject file if empty (only header)
182
273
  if reject_file.exists():
csvnorm/ui.py ADDED
@@ -0,0 +1,124 @@
1
+ """UI formatting functions for csvnorm terminal output."""
2
+
3
+ from pathlib import Path
4
+
5
+ from rich.console import Console
6
+ from rich.panel import Panel
7
+ from rich.table import Table
8
+
9
+ from csvnorm.encoding import needs_conversion
10
+ from csvnorm.utils import format_file_size
11
+
12
+ console = Console()
13
+
14
+
15
+ def show_error_panel(message: str, title: str = "Error") -> None:
16
+ """Display an error panel with red border.
17
+
18
+ Args:
19
+ message: Error message to display.
20
+ title: Panel title (default: "Error").
21
+ """
22
+ console.print(Panel(f"[bold red]{title}:[/bold red] {message}", border_style="red"))
23
+
24
+
25
+ def show_warning_panel(message: str, title: str = "Warning") -> None:
26
+ """Display a warning panel with yellow border.
27
+
28
+ Args:
29
+ message: Warning message to display.
30
+ title: Panel title (default: "Warning").
31
+ """
32
+ console.print(
33
+ Panel(f"[bold yellow]{title}:[/bold yellow] {message}", border_style="yellow")
34
+ )
35
+
36
+
37
+ def show_success_table(
38
+ input_file: str,
39
+ output_file: Path,
40
+ encoding: str,
41
+ is_remote: bool,
42
+ row_count: int,
43
+ column_count: int,
44
+ input_size: int,
45
+ output_size: int,
46
+ delimiter: str,
47
+ keep_names: bool,
48
+ ) -> None:
49
+ """Display success summary table with processing results.
50
+
51
+ Args:
52
+ input_file: Input CSV file path or URL.
53
+ output_file: Output CSV file path.
54
+ encoding: Detected encoding (or "remote" for URLs).
55
+ is_remote: Whether input was a remote URL.
56
+ row_count: Number of data rows in output.
57
+ column_count: Number of columns in output.
58
+ input_size: Input file size in bytes (0 for remote).
59
+ output_size: Output file size in bytes.
60
+ delimiter: Output delimiter character.
61
+ keep_names: Whether original column names were kept.
62
+ """
63
+ table = Table(show_header=False, box=None, padding=(0, 1))
64
+ table.add_row("[green]✓[/green] Success", "")
65
+ table.add_row("Input:", f"[cyan]{input_file}[/cyan]")
66
+ table.add_row("Output:", f"[cyan]{output_file}[/cyan]")
67
+
68
+ # Encoding info
69
+ if not is_remote:
70
+ if needs_conversion(encoding):
71
+ table.add_row("Encoding:", f"{encoding} → UTF-8 [dim](converted)[/dim]")
72
+ else:
73
+ table.add_row("Encoding:", f"{encoding} [dim](no conversion needed)[/dim]")
74
+ else:
75
+ table.add_row("Encoding:", "remote [dim](handled by DuckDB)[/dim]")
76
+
77
+ # Statistics
78
+ table.add_row("Rows:", f"{row_count:,}")
79
+ table.add_row("Columns:", f"{column_count}")
80
+ table.add_row("Input size:", format_file_size(input_size))
81
+ table.add_row("Output size:", format_file_size(output_size))
82
+
83
+ # Optional fields
84
+ if delimiter != ",":
85
+ table.add_row("Delimiter:", repr(delimiter))
86
+ if not keep_names:
87
+ table.add_row("Headers:", "normalized to snake_case")
88
+
89
+ console.print()
90
+ console.print(table)
91
+
92
+
93
+ def show_validation_error_panel(
94
+ reject_count: int, error_types: list[str], reject_file: Path
95
+ ) -> None:
96
+ """Display validation error summary panel.
97
+
98
+ Args:
99
+ reject_count: Number of rejected rows (including header).
100
+ error_types: List of error type descriptions.
101
+ reject_file: Path to reject errors CSV file.
102
+ """
103
+ console.print()
104
+ error_lines = []
105
+ error_lines.append("[bold red]Validation Errors:[/bold red]")
106
+ error_lines.append("")
107
+ error_lines.append(f"Rejected rows: [yellow]{reject_count - 1}[/yellow]")
108
+
109
+ if error_types:
110
+ error_lines.append("")
111
+ error_lines.append("[dim]Error types:[/dim]")
112
+ for error_type in error_types:
113
+ error_lines.append(f" • {error_type}")
114
+
115
+ error_lines.append("")
116
+ error_lines.append(f"Details: [cyan]{reject_file}[/cyan]")
117
+
118
+ console.print(
119
+ Panel(
120
+ "\n".join(error_lines),
121
+ border_style="yellow",
122
+ title="[yellow]![/yellow] Validation Failed",
123
+ )
124
+ )
csvnorm/utils.py CHANGED
@@ -3,6 +3,8 @@
3
3
  import logging
4
4
  import re
5
5
  from pathlib import Path
6
+ from typing import Union
7
+ from urllib.parse import urlparse
6
8
 
7
9
  from rich.logging import RichHandler
8
10
 
@@ -45,10 +47,7 @@ def setup_logger(verbose: bool = False) -> logging.Logger:
45
47
 
46
48
  if not logger.handlers:
47
49
  handler = RichHandler(
48
- show_time=False,
49
- show_path=verbose,
50
- markup=True,
51
- rich_tracebacks=True
50
+ show_time=False, show_path=verbose, markup=True, rich_tracebacks=True
52
51
  )
53
52
  logger.addHandler(handler)
54
53
 
@@ -69,3 +68,125 @@ def validate_delimiter(delimiter: str) -> None:
69
68
  def ensure_output_dir(output_dir: Path) -> None:
70
69
  """Create output directory if it doesn't exist."""
71
70
  output_dir.mkdir(parents=True, exist_ok=True)
71
+
72
+
73
+ def is_url(input_str: str) -> bool:
74
+ """Check if input string is an HTTP/HTTPS URL.
75
+
76
+ Args:
77
+ input_str: String to check.
78
+
79
+ Returns:
80
+ True if input is HTTP/HTTPS URL, False otherwise.
81
+ """
82
+ try:
83
+ result = urlparse(input_str)
84
+ return result.scheme in ("http", "https") and bool(result.netloc)
85
+ except Exception:
86
+ return False
87
+
88
+
89
+ def validate_url(url: str) -> None:
90
+ """Validate URL has HTTP/HTTPS protocol.
91
+
92
+ Args:
93
+ url: URL to validate.
94
+
95
+ Raises:
96
+ ValueError: If URL protocol is not HTTP/HTTPS.
97
+ """
98
+ parsed = urlparse(url)
99
+ if parsed.scheme not in ("http", "https"):
100
+ raise ValueError(f"Only HTTP/HTTPS URLs are supported. Got: {parsed.scheme}://")
101
+
102
+
103
+ def extract_filename_from_url(url: str) -> str:
104
+ """Extract and normalize filename from URL.
105
+
106
+ Args:
107
+ url: URL to extract filename from.
108
+
109
+ Returns:
110
+ Normalized snake_case filename without extension.
111
+ """
112
+ from urllib.parse import unquote
113
+
114
+ parsed = urlparse(url)
115
+ # Get last path segment, ignore query/fragment
116
+ path = parsed.path.rstrip("/")
117
+ filename = path.split("/")[-1] if path else "data"
118
+
119
+ # Decode URL encoding (%20 -> space, etc.)
120
+ filename = unquote(filename)
121
+
122
+ # Remove extension if present
123
+ if filename.lower().endswith(".csv"):
124
+ filename = filename[:-4]
125
+
126
+ # Apply snake_case normalization
127
+ return to_snake_case(filename) if filename else "data"
128
+
129
+
130
+ def format_file_size(size_bytes: int) -> str:
131
+ """Format file size in human-readable format.
132
+
133
+ Args:
134
+ size_bytes: File size in bytes.
135
+
136
+ Returns:
137
+ Formatted size string (e.g., "1.5 MB", "256 KB").
138
+ """
139
+ for unit in ["B", "KB", "MB", "GB"]:
140
+ if size_bytes < 1024.0:
141
+ return f"{size_bytes:.1f} {unit}" if unit != "B" else f"{size_bytes} B"
142
+ size_bytes /= 1024.0
143
+ return f"{size_bytes:.1f} TB"
144
+
145
+
146
+ def get_row_count(file_path: Union[Path, str]) -> int:
147
+ """Count number of rows in a CSV file.
148
+
149
+ Args:
150
+ file_path: Path to CSV file.
151
+
152
+ Returns:
153
+ Number of data rows (excluding header), or 0 if file doesn't exist.
154
+ """
155
+ if not isinstance(file_path, Path) or not file_path.exists():
156
+ return 0
157
+
158
+ try:
159
+ with open(file_path, "r") as f:
160
+ # Skip header
161
+ next(f, None)
162
+ return sum(1 for _ in f)
163
+ except Exception:
164
+ return 0
165
+
166
+
167
+ def get_column_count(file_path: Union[Path, str], delimiter: str = ",") -> int:
168
+ """Count number of columns in a CSV file using DuckDB.
169
+
170
+ Args:
171
+ file_path: Path to CSV file.
172
+ delimiter: Field delimiter used in the CSV file.
173
+
174
+ Returns:
175
+ Number of columns in the CSV, or 0 if file doesn't exist or error.
176
+ """
177
+ if not isinstance(file_path, Path) or not file_path.exists():
178
+ return 0
179
+
180
+ try:
181
+ import duckdb
182
+
183
+ conn = duckdb.connect(":memory:")
184
+ # Get column names from CSV using DuckDB DESCRIBE
185
+ columns = conn.execute(
186
+ f"DESCRIBE SELECT * FROM read_csv('{file_path}', delim='{delimiter}', header=true, sample_size=1)"
187
+ ).fetchall()
188
+ conn.close()
189
+
190
+ return len(columns)
191
+ except Exception:
192
+ return 0
csvnorm/validation.py CHANGED
@@ -2,27 +2,36 @@
2
2
 
3
3
  import logging
4
4
  from pathlib import Path
5
+ from typing import Union
5
6
 
6
7
  import duckdb
7
8
 
8
9
  logger = logging.getLogger("csvnorm")
9
10
 
10
11
 
11
- def validate_csv(file_path: Path, reject_file: Path) -> bool:
12
+ def validate_csv(
13
+ file_path: Union[Path, str], reject_file: Path, is_remote: bool = False
14
+ ) -> tuple[int, list[str]]:
12
15
  """Validate CSV file using DuckDB and export rejected rows.
13
16
 
14
17
  Args:
15
- file_path: Path to CSV file to validate.
18
+ file_path: Path to CSV file to validate or URL string.
16
19
  reject_file: Path to write rejected rows.
20
+ is_remote: True if file_path is a remote URL.
17
21
 
18
22
  Returns:
19
- True if validation passes (no rejected rows), False otherwise.
23
+ Tuple of (reject_count, error_types) where error_types is list of
24
+ up to 3 unique error reasons from reject file.
20
25
  """
21
26
  logger.debug(f"Validating CSV: {file_path}")
22
27
 
23
28
  conn = duckdb.connect()
24
29
 
25
30
  try:
31
+ # Set HTTP timeout for remote URLs (30 seconds)
32
+ if is_remote:
33
+ conn.execute("SET http_timeout=30000")
34
+
26
35
  # Read CSV with store_rejects to capture malformed rows
27
36
  # Use all_varchar=true to avoid type inference failures
28
37
  conn.execute(f"""
@@ -46,28 +55,39 @@ def validate_csv(file_path: Path, reject_file: Path) -> bool:
46
55
  reject_count = _count_lines(reject_file)
47
56
  logger.debug(f"Reject file lines: {reject_count}")
48
57
 
49
- return reject_count <= 1
58
+ # Collect sample error types from reject file
59
+ error_types = []
60
+ if reject_count > 1:
61
+ error_types = _get_error_types(reject_file)
62
+
63
+ return reject_count, error_types
50
64
 
51
65
 
52
66
  def normalize_csv(
53
- input_path: Path,
67
+ input_path: Union[Path, str],
54
68
  output_path: Path,
55
69
  delimiter: str = ",",
56
70
  normalize_names: bool = True,
71
+ is_remote: bool = False,
57
72
  ) -> None:
58
73
  """Normalize CSV file using DuckDB.
59
74
 
60
75
  Args:
61
- input_path: Path to input CSV file.
76
+ input_path: Path to input CSV file or URL string.
62
77
  output_path: Path for normalized output file.
63
78
  delimiter: Output field delimiter.
64
79
  normalize_names: If True, convert column names to snake_case.
80
+ is_remote: True if input_path is a remote URL.
65
81
  """
66
82
  logger.debug(f"Normalizing CSV: {input_path} -> {output_path}")
67
83
 
68
84
  conn = duckdb.connect()
69
85
 
70
86
  try:
87
+ # Set HTTP timeout for remote URLs (30 seconds)
88
+ if is_remote:
89
+ conn.execute("SET http_timeout=30000")
90
+
71
91
  # Build read options
72
92
  read_opts = "sample_size=-1, all_varchar=true"
73
93
  if normalize_names:
@@ -107,3 +127,36 @@ def _count_lines(file_path: Path) -> int:
107
127
 
108
128
  with open(file_path, "r") as f:
109
129
  return sum(1 for _ in f)
130
+
131
+
132
+ def _get_error_types(reject_file: Path) -> list[str]:
133
+ """Extract sample error types from reject file.
134
+
135
+ Args:
136
+ reject_file: Path to reject_errors.csv file.
137
+
138
+ Returns:
139
+ List of up to 3 unique error reasons.
140
+ """
141
+ if not reject_file.exists():
142
+ return []
143
+
144
+ error_types: set[str] = set()
145
+ try:
146
+ with open(reject_file, "r") as f:
147
+ # Skip header
148
+ next(f, None)
149
+ for line in f:
150
+ # Error message is in the last column
151
+ parts = line.rstrip("\n").split(",")
152
+ if parts:
153
+ error_reason = parts[-1].strip()
154
+ if error_reason and error_reason != "error":
155
+ error_types.add(error_reason)
156
+ if len(error_types) >= 3:
157
+ break
158
+ except Exception as e:
159
+ logger.warning(f"Failed to extract error types: {e}")
160
+ return []
161
+
162
+ return list(error_types)[:3]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: csvnorm
3
- Version: 0.3.3
3
+ Version: 0.3.11
4
4
  Summary: A command-line utility to validate and normalize CSV files
5
5
  Author-email: aborruso <aborruso@gmail.com>
6
6
  License: MIT License
@@ -34,7 +34,6 @@ Classifier: Intended Audience :: Science/Research
34
34
  Classifier: License :: OSI Approved :: MIT License
35
35
  Classifier: Operating System :: OS Independent
36
36
  Classifier: Programming Language :: Python :: 3
37
- Classifier: Programming Language :: Python :: 3.8
38
37
  Classifier: Programming Language :: Python :: 3.9
39
38
  Classifier: Programming Language :: Python :: 3.10
40
39
  Classifier: Programming Language :: Python :: 3.11
@@ -49,18 +48,15 @@ Requires-Dist: charset-normalizer>=3.0.0
49
48
  Requires-Dist: duckdb>=0.9.0
50
49
  Requires-Dist: rich>=13.0.0
51
50
  Requires-Dist: rich-argparse>=1.0.0
52
- Requires-Dist: pyfiglet>=1.0.0
53
51
  Provides-Extra: dev
54
52
  Requires-Dist: pytest>=7.0.0; extra == "dev"
55
53
  Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
56
54
  Requires-Dist: ruff>=0.1.0; extra == "dev"
57
- Provides-Extra: banner
58
- Requires-Dist: pyfiglet>=1.0.0; extra == "banner"
59
55
  Dynamic: license-file
60
56
 
61
57
  [![PyPI version](https://badge.fury.io/py/csvnorm.svg)](https://pypi.org/project/csvnorm/)
62
58
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
63
- [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
59
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
64
60
  [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/aborruso/csvnorm)
65
61
 
66
62
  # csvnorm
@@ -81,26 +77,6 @@ Or with pip:
81
77
  pip install csvnorm
82
78
  ```
83
79
 
84
- For ASCII art banner (shown with `--version` and `-V`):
85
-
86
- ```bash
87
- uv tool install 'csvnorm[banner]'
88
- # or
89
- pip install 'csvnorm[banner]'
90
- ```
91
-
92
- Example with banner:
93
- ```bash
94
- csvnorm --version
95
- # Output:
96
- # ___________ ______ ____ _________ ___
97
- # / ___/ ___/ | / / __ \/ __ \/ ___/ __ `__ \
98
- # / /__(__ )| |/ / / / / /_/ / / / / / / / /
99
- # \___/____/ |___/_/ /_/\____/_/ /_/ /_/ /_/
100
- #
101
- # csvnorm 0.3.1
102
- ```
103
-
104
80
  ## Purpose
105
81
 
106
82
  This tool prepares CSV files for **basic exploratory data analysis (EDA)**, not for complex transformations. It focuses on achieving a clean, standardized baseline format that allows you to quickly assess data quality and structure before designing more sophisticated ETL pipelines.
@@ -122,7 +98,9 @@ This tool prepares CSV files for **basic exploratory data analysis (EDA)**, not
122
98
  - **Delimiter Normalization**: Converts all field separators to standard commas (`,`)
123
99
  - **Field Name Normalization**: Converts column headers to snake_case format
124
100
  - **Encoding Normalization**: Auto-detects encoding and converts to UTF-8
125
- - **Error Reporting**: Exports detailed error file for invalid rows
101
+ - **Processing Summary**: Displays comprehensive statistics (rows, columns, file sizes) and error details
102
+ - **Error Reporting**: Exports detailed error file for invalid rows with summary panel
103
+ - **Remote URL Support**: Process CSV files directly from HTTP/HTTPS URLs without downloading
126
104
 
127
105
  ## Usage
128
106
 
@@ -148,6 +126,9 @@ csvnorm input.csv [options]
148
126
  # Basic usage
149
127
  csvnorm data.csv
150
128
 
129
+ # Process remote CSV from URL
130
+ csvnorm "https://raw.githubusercontent.com/aborruso/csvnorm/refs/heads/main/test/Trasporto%20Pubblico%20Locale%20Settore%20Pubblico%20Allargato%20-%20Indicatore%202000-2020%20Trasferimenti%20Correnti%20su%20Entrate%20Correnti.csv"
131
+
151
132
  # With semicolon delimiter
152
133
  csvnorm data.csv -d ';'
153
134
 
@@ -163,17 +144,63 @@ csvnorm data.csv -f -V
163
144
 
164
145
  ### Output
165
146
 
166
- Creates a normalized CSV file in the specified output directory with:
147
 + Creates a normalized CSV file in the specified output directory with:
167
148
  - UTF-8 encoding
168
149
  - Consistent field delimiters
169
150
  - Normalized column names (unless `--keep-names` is specified)
170
151
  - Error report if any invalid rows are found (saved as `{input_name}_reject_errors.csv`)
171
152
 
153
+ For remote URLs:
154
 + - The output filename is derived from the URL's last path segment
155
+ - Encoding is handled automatically by DuckDB
156
+ - HTTP timeout is set to 30 seconds
157
+ - Only public URLs are supported (no authentication)
158
+
172
159
  The tool provides modern terminal output with:
173
160
  - Progress indicators for multi-step processing
174
161
  - Color-coded error messages with panels
175
- - Success summary table showing encoding, paths, and settings
176
- - Optional ASCII art banner with `--version` and `-V` verbose mode (requires `pyfiglet`)
162
+ - Success summary table with statistics (rows, columns, file sizes)
163
+ - Encoding conversion status (converted/no conversion/remote)
164
+ - Error summary panel with reject count and error types when validation fails
165
+ - ASCII art banner with `--version` and `-V` verbose mode
166
+
167
+ **Success Example:**
168
+ ```
169
+ ✓ Success
170
+ Input: test/utf8_basic.csv
171
+ Output: output/utf8_basic.csv
172
+ Encoding: ascii (no conversion needed)
173
+ Rows: 2
174
+ Columns: 3
175
+ Input size: 42 B
176
+ Output size: 43 B
177
+ Headers: normalized to snake_case
178
+ ```
179
+
180
+ **Error Example:**
181
+ ```
182
+ ✓ Success
183
+ Input: test/malformed_rows.csv
184
+ Output: output/malformed_rows.csv
185
+ Encoding: ascii (no conversion needed)
186
+ Rows: 1
187
+ Columns: 4
188
+ Input size: 24 B
189
+ Output size: 40 B
190
+ Headers: normalized to snake_case
191
+
192
+ ╭──────────────────────────── ! Validation Failed ─────────────────────────────╮
193
+ │ Validation Errors: │
194
+ │ │
195
+ │ Rejected rows: 2 │
196
+ │ │
197
+ │ Error types: │
198
+ │ • Expected Number of Columns: 3 Found: 2 │
199
+ │ • Expected Number of Columns: 3 Found: 4 │
200
+ │ │
201
+ │ Details: output/malformed_rows_reject_errors.csv │
202
+ ╰──────────────────────────────────────────────────────────────────────────────╯
203
+ ```
177
204
 
178
205
  ### Exit Codes
179
206
 
@@ -184,15 +211,15 @@ The tool provides modern terminal output with:
184
211
 
185
212
  ## Requirements
186
213
 
187
- - Python 3.8+
214
+ - Python 3.9+
188
215
  - Dependencies (automatically installed):
189
216
  - `charset-normalizer>=3.0.0` - Encoding detection
190
217
  - `duckdb>=0.9.0` - CSV validation and normalization
191
218
  - `rich>=13.0.0` - Modern terminal output formatting
192
219
  - `rich-argparse>=1.0.0` - Enhanced CLI help formatting
220
+ - `pyfiglet>=0.8.post1,<1.0.0` - ASCII art banner
193
221
 
194
222
  Optional extras:
195
- - `[banner]` - ASCII art banner for `--version` and `-V` verbose mode (`pyfiglet>=1.0.0`)
196
223
  - `[dev]` - Development dependencies (`pytest>=7.0.0`, `pytest-cov>=4.0.0`, `ruff>=0.1.0`)
197
224
 
198
225
  ## Development
@@ -221,7 +248,7 @@ pytest tests/ -v
221
248
  ### Project Structure
222
249
 
223
250
  ```
224
- prepare_data/
251
+ csvnorm/
225
252
  ├── src/csvnorm/
226
253
  │ ├── __init__.py # Package version
227
254
  │ ├── __main__.py # python -m support
@@ -0,0 +1,14 @@
1
+ csvnorm/__init__.py,sha256=frEketezK5MWX8eiy1mFgw_3QeMcH4cVgVsNXtD1Jgg,264
2
+ csvnorm/__main__.py,sha256=WDURvgm7E-yMN_ZvbBBmgzIJz5naonZxQ9RYcoD7ves,110
3
+ csvnorm/cli.py,sha256=UEe0hRGWx9m6ZLGLd9TIaJ_uayclNTh_i0fO_JEgTXY,4166
4
+ csvnorm/core.py,sha256=0tgOmPr4JSMSzgSxT8ffCk_IrOWGLI2hTzhV9_xNQQ8,9945
5
+ csvnorm/encoding.py,sha256=kV7J8nVWDv_F6bKUCrMj2rReMB4Zecf4gwFeebdjm98,3334
6
+ csvnorm/ui.py,sha256=rOfVYjnTImplMMc-QGmcYUXzzZ513Y1bCjlO2jPxG2A,3893
7
+ csvnorm/utils.py,sha256=slV2aADBDfg9RHZJE-jmRuzPfY1RX0Wq-D1A4oBN7Yo,5020
8
+ csvnorm/validation.py,sha256=I7m_nxsGDROy5pBkNU-H7qEVYEAT19vw5alkrvZqGh4,4539
9
+ csvnorm-0.3.11.dist-info/licenses/LICENSE,sha256=iPlktyjNwNMal_HUjGRTAavIF8mHV3CxGW4g5XmG3ZU,1075
10
+ csvnorm-0.3.11.dist-info/METADATA,sha256=7c2Bu-M-4UiOqqVOC5Nm-I88ZhmC2BquMSiGRjD9VBo,9808
11
+ csvnorm-0.3.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
12
+ csvnorm-0.3.11.dist-info/entry_points.txt,sha256=6GmXNAdKzhHle3KV6Gbe95RS6edR0Fe6PVqdy1ED23k,45
13
+ csvnorm-0.3.11.dist-info/top_level.txt,sha256=jMenLN9avlYH8z8v23sReLFkUYx0awPGFm8cTnoAl-0,8
14
+ csvnorm-0.3.11.dist-info/RECORD,,
@@ -1,13 +0,0 @@
1
- csvnorm/__init__.py,sha256=8njXIycxL0qSI5Q9bVGyTaM41j_kKX9jV7TeQOSAQGE,263
2
- csvnorm/__main__.py,sha256=WDURvgm7E-yMN_ZvbBBmgzIJz5naonZxQ9RYcoD7ves,110
3
- csvnorm/cli.py,sha256=MwIPahLktbulF6NYRWyBsE4s9Al9_aSdA1zvzuI0AiQ,3815
4
- csvnorm/core.py,sha256=_kTaui_2IhqrN_UxJpcjwXYXEvqaRMhML49Xlx-e0p0,6633
5
- csvnorm/encoding.py,sha256=kV7J8nVWDv_F6bKUCrMj2rReMB4Zecf4gwFeebdjm98,3334
6
- csvnorm/utils.py,sha256=gvwDToOx3YoKCfVPyCmxcSa7teCWFB2SmAGr-jV5w_Y,1761
7
- csvnorm/validation.py,sha256=iXdfalAGDNB9kPefyzHXGI9uc-HLAG5pQ_-T93ShppY,2815
8
- csvnorm-0.3.3.dist-info/licenses/LICENSE,sha256=iPlktyjNwNMal_HUjGRTAavIF8mHV3CxGW4g5XmG3ZU,1075
9
- csvnorm-0.3.3.dist-info/METADATA,sha256=xKJmLVX9RoB22KwAAlxAvWB_KA9h68m5V-UyFaS_DGo,7840
10
- csvnorm-0.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
11
- csvnorm-0.3.3.dist-info/entry_points.txt,sha256=6GmXNAdKzhHle3KV6Gbe95RS6edR0Fe6PVqdy1ED23k,45
12
- csvnorm-0.3.3.dist-info/top_level.txt,sha256=jMenLN9avlYH8z8v23sReLFkUYx0awPGFm8cTnoAl-0,8
13
- csvnorm-0.3.3.dist-info/RECORD,,