csvnorm-0.3.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
csvnorm/__init__.py ADDED
@@ -0,0 +1,8 @@
+ """csvnorm - Validate and normalize CSV files."""
+
+ __version__ = "0.3.3"
+ __all__ = ["normalize_csv", "detect_encoding", "process_csv"]
+
+ from csvnorm.core import process_csv
+ from csvnorm.encoding import detect_encoding
+ from csvnorm.validation import normalize_csv
csvnorm/__main__.py ADDED
@@ -0,0 +1,6 @@
+ """Entry point for python -m csvnorm."""
+
+ from csvnorm.cli import main
+
+ if __name__ == "__main__":
+     main()
csvnorm/cli.py ADDED
@@ -0,0 +1,150 @@
+ """Command-line interface for csvnorm."""
+
+ import argparse
+ import sys
+ from pathlib import Path
+
+ from rich.console import Console
+ from rich_argparse import RichHelpFormatter
+
+ from csvnorm import __version__
+ from csvnorm.core import process_csv
+ from csvnorm.utils import setup_logger
+
+ console = Console()
+
+
+ def show_banner() -> None:
+     """Show ASCII art banner."""
+     from pyfiglet import figlet_format
+     banner = figlet_format("csvnorm", font="slant")
+     console.print(banner, style="bold cyan")
+
+
+ class VersionAction(argparse.Action):
+     """Custom action to show banner with version."""
+
+     def __call__(self, parser, _namespace, _values, _option_string=None):
+         show_banner()
+         console.print(f"csvnorm {__version__}", style="bold")
+         console.print()
+         console.print("Validate and normalize CSV files for exploratory data analysis", style="dim")
+         console.print()
+         console.print("Author: aborruso", style="dim")
+         console.print("Repository: https://github.com/aborruso/csvnorm", style="dim cyan")
+         console.print("License: MIT", style="dim")
+         parser.exit()
+
+
+ def create_parser() -> argparse.ArgumentParser:
+     """Create and return the argument parser."""
+     parser = argparse.ArgumentParser(
+         prog="csvnorm",
+         description="Validate and normalize CSV files for exploratory data analysis",
+         formatter_class=RichHelpFormatter,
+         epilog="""\
+ Examples:
+ csvnorm data.csv -d ';' -o output_folder --force
+ csvnorm data.csv --keep-names --delimiter '\\t'
+ csvnorm data.csv -V
+ """,
+     )
+
+     parser.add_argument(
+         "input_file",
+         type=Path,
+         help="Input CSV file path",
+     )
+
+     parser.add_argument(
+         "-f",
+         "--force",
+         action="store_true",
+         help="Force overwrite of existing output files",
+     )
+
+     parser.add_argument(
+         "-k",
+         "--keep-names",
+         action="store_true",
+         help=(
+             "Keep original column names (disable snake_case normalization). "
+             "By default, column names are converted to snake_case format "
+             "(e.g., 'Column Name' becomes 'column_name')."
+         ),
+     )
+
+     parser.add_argument(
+         "-d",
+         "--delimiter",
+         default=",",
+         help="Set custom field delimiter (default: comma). Example: -d ';'",
+     )
+
+     parser.add_argument(
+         "-o",
+         "--output-dir",
+         type=Path,
+         default=Path.cwd(),
+         help="Set custom output directory (default: current working directory)",
+     )
+
+     parser.add_argument(
+         "-V",
+         "--verbose",
+         action="store_true",
+         help="Enable verbose output for debugging",
+     )
+
+     parser.add_argument(
+         "-v",
+         "--version",
+         action=VersionAction,
+         nargs=0,
+         help="Show version number with banner",
+     )
+
+     return parser
+
+
+ def main(argv: list[str] | None = None) -> int:
+     """Main entry point for the CLI.
+
+     Args:
+         argv: Command line arguments (defaults to sys.argv[1:]).
+
+     Returns:
+         Exit code: 0 for success, 1 for error.
+     """
+     parser = create_parser()
+
+     # Handle missing arguments gracefully
+     if argv is None:
+         argv = sys.argv[1:]
+
+     if not argv or (len(argv) == 1 and argv[0] in ['-h', '--help']):
+         parser.print_help()
+         return 0 if argv else 2
+
+     args = parser.parse_args(argv)
+
+     # Show banner in verbose mode
+     if args.verbose:
+         show_banner()
+
+     # Setup logging
+     setup_logger(args.verbose)
+
+     # Run processing
+     return process_csv(
+         input_file=args.input_file,
+         output_dir=args.output_dir,
+         force=args.force,
+         keep_names=args.keep_names,
+         delimiter=args.delimiter,
+         verbose=args.verbose,
+     )
+
+
+ if __name__ == "__main__":
+     sys.exit(main())
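
For orientation, a minimal sketch of driving this CLI from Python rather than the shell; it relies only on the `main(argv)` signature and the flags defined in `create_parser()` above, and the file name is a placeholder.

```python
# Illustrative only: invoke the argparse-based entry point directly,
# passing the same arguments the shell command would receive.
from csvnorm.cli import main

# "data.csv" is a placeholder input; -d/-o/--force are defined in create_parser().
exit_code = main(["data.csv", "-d", ";", "-o", "out", "--force"])
print(exit_code)  # 0 on success, 1 on error
```
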
csvnorm/core.py ADDED
@@ -0,0 +1,189 @@
+ """Core processing logic for csvnorm."""
+
+ import logging
+ from pathlib import Path
+
+ from rich.console import Console
+ from rich.panel import Panel
+ from rich.progress import Progress, SpinnerColumn, TextColumn
+ from rich.table import Table
+
+ from csvnorm.encoding import convert_to_utf8, detect_encoding, needs_conversion
+ from csvnorm.utils import ensure_output_dir, to_snake_case, validate_delimiter
+ from csvnorm.validation import normalize_csv, validate_csv
+
+ logger = logging.getLogger("csvnorm")
+ console = Console()
+
+
+ def process_csv(
+     input_file: Path,
+     output_dir: Path,
+     force: bool = False,
+     keep_names: bool = False,
+     delimiter: str = ",",
+     verbose: bool = False,
+ ) -> int:
+     """Main CSV processing pipeline.
+
+     Args:
+         input_file: Path to input CSV file.
+         output_dir: Directory for output files.
+         force: If True, overwrite existing output files.
+         keep_names: If True, keep original column names.
+         delimiter: Output field delimiter.
+         verbose: If True, enable debug logging.
+
+     Returns:
+         Exit code: 0 for success, 1 for error.
+     """
+     # Validate inputs
+     if not input_file.exists():
+         console.print(Panel(
+             f"[bold red]Error:[/bold red] Input file not found\n{input_file}",
+             border_style="red"
+         ))
+         return 1
+
+     if not input_file.is_file():
+         console.print(Panel(
+             f"[bold red]Error:[/bold red] Not a file\n{input_file}",
+             border_style="red"
+         ))
+         return 1
+
+     try:
+         validate_delimiter(delimiter)
+     except ValueError as e:
+         console.print(Panel(
+             f"[bold red]Error:[/bold red] {e}",
+             border_style="red"
+         ))
+         return 1
+
+     # Setup paths
+     base_name = to_snake_case(input_file.name)
+     ensure_output_dir(output_dir)
+
+     output_file = output_dir / f"{base_name}.csv"
+     reject_file = output_dir / f"{base_name}_reject_errors.csv"
+     temp_utf8_file = output_dir / f"{base_name}_utf8.csv"
+
+     # Check if output exists
+     if output_file.exists() and not force:
+         console.print(Panel(
+             f"[bold yellow]Warning:[/bold yellow] Output file already exists\n\n"
+             f"{output_file}\n\n"
+             f"Use [bold]--force[/bold] to overwrite.",
+             border_style="yellow"
+         ))
+         return 1
+
+     # Clean up previous reject file
+     if reject_file.exists():
+         reject_file.unlink()
+
+     # Track files to clean up
+     temp_files: list[Path] = []
+
+     try:
+         with Progress(
+             SpinnerColumn(),
+             TextColumn("[progress.description]{task.description}"),
+             console=console,
+             transient=True
+         ) as progress:
+             # Step 1: Detect encoding
+             task = progress.add_task("[cyan]Detecting encoding...", total=None)
+             try:
+                 encoding = detect_encoding(input_file)
+             except ValueError as e:
+                 progress.stop()
+                 console.print(Panel(
+                     f"[bold red]Error:[/bold red] {e}",
+                     border_style="red"
+                 ))
+                 return 1
+
+             logger.debug(f"Detected encoding: {encoding}")
+             progress.update(task, description=f"[green]✓[/green] Detected encoding: {encoding}")
+
+             # Step 2: Convert to UTF-8 if needed
+             working_file = input_file
+             if needs_conversion(encoding):
+                 progress.update(task, description=f"[cyan]Converting from {encoding} to UTF-8...")
+                 try:
+                     convert_to_utf8(input_file, temp_utf8_file, encoding)
+                     working_file = temp_utf8_file
+                     temp_files.append(temp_utf8_file)
+                     progress.update(task, description=f"[green]✓[/green] Converted to UTF-8")
+                 except (UnicodeDecodeError, LookupError) as e:
+                     progress.stop()
+                     console.print(Panel(
+                         f"[bold red]Error:[/bold red] Encoding conversion failed\n{e}",
+                         border_style="red"
+                     ))
+                     return 1
+             else:
+                 progress.update(task, description=f"[green]✓[/green] Encoding: {encoding} (no conversion needed)")
+
+             # Step 3: Validate CSV
+             progress.update(task, description="[cyan]Validating CSV...")
+             logger.debug("Validating CSV with DuckDB...")
+             is_valid = validate_csv(working_file, reject_file)
+
+             if not is_valid:
+                 progress.stop()
+                 console.print(Panel(
+                     "[bold red]Error:[/bold red] DuckDB encountered invalid rows\n\n"
+                     f"Details: [cyan]{reject_file}[/cyan]\n\n"
+                     "Please fix the issues and try again.",
+                     border_style="red"
+                 ))
+                 return 1
+
+             progress.update(task, description="[green]✓[/green] CSV validated")
+
+             # Step 4: Normalize and write output
+             progress.update(task, description="[cyan]Normalizing and writing output...")
+             logger.debug("Normalizing CSV...")
+             normalize_csv(
+                 input_path=working_file,
+                 output_path=output_file,
+                 delimiter=delimiter,
+                 normalize_names=not keep_names,
+             )
+
+             logger.debug(f"Output written to: {output_file}")
+             progress.update(task, description="[green]✓[/green] Complete")
+
+         # Success summary table
+         table = Table(show_header=False, box=None, padding=(0, 1))
+         table.add_row("[green]✓[/green] Success", "")
+         table.add_row("Input:", f"[cyan]{input_file}[/cyan]")
+         table.add_row("Output:", f"[cyan]{output_file}[/cyan]")
+         table.add_row("Encoding:", encoding)
+         if delimiter != ",":
+             table.add_row("Delimiter:", repr(delimiter))
+         if not keep_names:
+             table.add_row("Headers:", "normalized to snake_case")
+
+         console.print()
+         console.print(table)
+
+     finally:
+         # Cleanup temp files
+         for temp_file in temp_files:
+             if temp_file.exists():
+                 logger.debug(f"Removing temp file: {temp_file}")
+                 temp_file.unlink()
+
+         # Remove reject file if empty (only header)
+         if reject_file.exists():
+             with open(reject_file, "r") as f:
+                 line_count = sum(1 for _ in f)
+             if line_count <= 1:
+                 logger.debug(f"Removing empty reject file: {reject_file}")
+                 reject_file.unlink()
+
+     return 0
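
A minimal sketch of calling the pipeline above as a library function; the keyword arguments mirror the `process_csv` signature, and the paths are placeholders.

```python
from pathlib import Path

from csvnorm.core import process_csv  # also re-exported as csvnorm.process_csv

# Placeholder paths; force=True overwrites an existing <base_name>.csv in output/.
code = process_csv(
    input_file=Path("data.csv"),
    output_dir=Path("output"),
    force=True,
    keep_names=False,   # headers normalized to snake_case
    delimiter=",",
    verbose=False,
)
raise SystemExit(code)  # 0 on success, 1 on error
```
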
csvnorm/encoding.py ADDED
@@ -0,0 +1,119 @@
+ """Encoding detection and conversion for CSV files."""
+
+ import codecs
+ import logging
+ from pathlib import Path
+
+ from charset_normalizer import from_path
+
+ logger = logging.getLogger("csvnorm")
+
+ # Encoding alias mapping for Python codec compatibility
+ ENCODING_ALIASES: dict[str, str] = {
+     "macroman": "mac_roman",
+     "macintosh": "mac_roman",
+     "utf_8": "utf-8",
+     "utf_8_sig": "utf-8-sig",
+     "ascii": "ascii",
+ }
+
+ # Encodings that don't need conversion
+ UTF8_ENCODINGS = frozenset({"utf-8", "ascii", "utf-8-sig"})
+
+
+ def normalize_encoding_name(encoding: str) -> str:
+     """Normalize encoding name to Python codec name.
+
+     Args:
+         encoding: Raw encoding name from detection.
+
+     Returns:
+         Normalized encoding name compatible with Python codecs.
+     """
+     encoding_lower = encoding.lower().replace("-", "_")
+
+     # Check alias mapping
+     if encoding_lower in ENCODING_ALIASES:
+         return ENCODING_ALIASES[encoding_lower]
+
+     # Try to normalize with underscores to dashes
+     return encoding_lower.replace("_", "-")
+
+
+ def detect_encoding(file_path: Path) -> str:
+     """Detect the encoding of a file using charset_normalizer.
+
+     Args:
+         file_path: Path to the file to analyze.
+
+     Returns:
+         Detected encoding name (normalized for Python codecs).
+
+     Raises:
+         ValueError: If encoding cannot be detected.
+     """
+     logger.debug(f"Detecting encoding for: {file_path}")
+
+     result = from_path(file_path)
+     best = result.best()
+
+     if best is None:
+         logger.debug("charset_normalizer failed, cannot detect encoding")
+         raise ValueError(f"Cannot detect encoding for: {file_path}")
+
+     encoding = best.encoding
+     logger.debug(f"Detected encoding: {encoding}")
+
+     # Normalize the encoding name
+     normalized = normalize_encoding_name(encoding)
+     if normalized != encoding.lower():
+         logger.debug(f"Normalized encoding: {encoding} -> {normalized}")
+
+     return normalized
+
+
+ def needs_conversion(encoding: str) -> bool:
+     """Check if file needs encoding conversion to UTF-8.
+
+     Args:
+         encoding: Detected encoding name.
+
+     Returns:
+         True if conversion is needed, False otherwise.
+     """
+     encoding_lower = encoding.lower()
+     return encoding_lower not in UTF8_ENCODINGS
+
+
+ def convert_to_utf8(input_path: Path, output_path: Path, source_encoding: str) -> Path:
+     """Convert file from source encoding to UTF-8.
+
+     Args:
+         input_path: Path to input file.
+         output_path: Path for UTF-8 output file.
+         source_encoding: Source file encoding.
+
+     Returns:
+         Path to the converted file.
+
+     Raises:
+         UnicodeDecodeError: If file cannot be decoded with the specified encoding.
+         LookupError: If the encoding is not supported.
+     """
+     logger.debug(f"Converting from {source_encoding} to UTF-8")
+
+     # Validate encoding exists
+     try:
+         codecs.lookup(source_encoding)
+     except LookupError as e:
+         raise LookupError(f"Unknown encoding: {source_encoding}") from e
+
+     # Read with source encoding, write as UTF-8
+     with open(input_path, "r", encoding=source_encoding, errors="strict") as f:
+         content = f.read()
+
+     with open(output_path, "w", encoding="utf-8") as f:
+         f.write(content)
+
+     logger.debug(f"Converted file written to: {output_path}")
+     return output_path
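
A small sketch of using these helpers on their own, assuming only the behaviour documented above (detection via charset_normalizer, conversion only when needed); the file names are placeholders.

```python
from pathlib import Path

from csvnorm.encoding import convert_to_utf8, detect_encoding, needs_conversion

src = Path("legacy.csv")          # placeholder input file
enc = detect_encoding(src)        # e.g. "mac_roman" after alias normalization
if needs_conversion(enc):
    convert_to_utf8(src, Path("legacy_utf8.csv"), enc)
```
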
csvnorm/utils.py ADDED
@@ -0,0 +1,71 @@
+ """Utility functions for csvnorm."""
+
+ import logging
+ import re
+ from pathlib import Path
+
+ from rich.logging import RichHandler
+
+
+ def to_snake_case(name: str) -> str:
+     """Convert filename to clean snake_case.
+
+     Replicates the bash logic:
+         tr '[:upper:]' '[:lower:]' |
+         sed -E 's/[^a-z0-9]+/_/g' |
+         sed -E 's/_+/_/g' |
+         sed -E 's/^_|_$//g'
+     """
+     # Remove .csv extension if present
+     if name.lower().endswith(".csv"):
+         name = name[:-4]
+
+     # Convert to lowercase
+     name = name.lower()
+
+     # Replace non-alphanumeric with underscore
+     name = re.sub(r"[^a-z0-9]+", "_", name)
+
+     # Collapse multiple underscores
+     name = re.sub(r"_+", "_", name)
+
+     # Remove leading/trailing underscores
+     name = name.strip("_")
+
+     return name
+
+
+ def setup_logger(verbose: bool = False) -> logging.Logger:
+     """Setup and return a logger instance with rich formatting.
+
+     Args:
+         verbose: If True, set log level to DEBUG, else INFO.
+     """
+     logger = logging.getLogger("csvnorm")
+
+     if not logger.handlers:
+         handler = RichHandler(
+             show_time=False,
+             show_path=verbose,
+             markup=True,
+             rich_tracebacks=True
+         )
+         logger.addHandler(handler)
+
+     logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+     return logger
+
+
+ def validate_delimiter(delimiter: str) -> None:
+     """Validate that delimiter is a single character.
+
+     Raises:
+         ValueError: If delimiter is not exactly one character.
+     """
+     if len(delimiter) != 1:
+         raise ValueError("--delimiter must be a single character")
+
+
+ def ensure_output_dir(output_dir: Path) -> None:
+     """Create output directory if it doesn't exist."""
+     output_dir.mkdir(parents=True, exist_ok=True)
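
A quick sketch of the helpers above; the expected results follow directly from the regex steps in `to_snake_case` and the single-character check in `validate_delimiter`, and the sample strings are arbitrary.

```python
from csvnorm.utils import to_snake_case, validate_delimiter

print(to_snake_case("My Data (2024).CSV"))  # -> "my_data_2024"
validate_delimiter(";")                     # OK: single character
validate_delimiter("||")                    # raises ValueError
```
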
csvnorm/validation.py ADDED
@@ -0,0 +1,109 @@
+ """CSV validation and normalization using DuckDB."""
+
+ import logging
+ from pathlib import Path
+
+ import duckdb
+
+ logger = logging.getLogger("csvnorm")
+
+
+ def validate_csv(file_path: Path, reject_file: Path) -> bool:
+     """Validate CSV file using DuckDB and export rejected rows.
+
+     Args:
+         file_path: Path to CSV file to validate.
+         reject_file: Path to write rejected rows.
+
+     Returns:
+         True if validation passes (no rejected rows), False otherwise.
+     """
+     logger.debug(f"Validating CSV: {file_path}")
+
+     conn = duckdb.connect()
+
+     try:
+         # Read CSV with store_rejects to capture malformed rows
+         # Use all_varchar=true to avoid type inference failures
+         conn.execute(f"""
+             COPY (
+                 FROM read_csv(
+                     '{file_path}',
+                     store_rejects=true,
+                     sample_size=-1,
+                     all_varchar=true
+                 )
+             ) TO '/dev/null'
+         """)
+
+         # Export rejected rows to file
+         conn.execute(f"COPY (FROM reject_errors) TO '{reject_file}'")
+
+     finally:
+         conn.close()
+
+     # Check if there are rejected rows (more than just header)
+     reject_count = _count_lines(reject_file)
+     logger.debug(f"Reject file lines: {reject_count}")
+
+     return reject_count <= 1
+
+
+ def normalize_csv(
+     input_path: Path,
+     output_path: Path,
+     delimiter: str = ",",
+     normalize_names: bool = True,
+ ) -> None:
+     """Normalize CSV file using DuckDB.
+
+     Args:
+         input_path: Path to input CSV file.
+         output_path: Path for normalized output file.
+         delimiter: Output field delimiter.
+         normalize_names: If True, convert column names to snake_case.
+     """
+     logger.debug(f"Normalizing CSV: {input_path} -> {output_path}")
+
+     conn = duckdb.connect()
+
+     try:
+         # Build read options
+         read_opts = "sample_size=-1, all_varchar=true"
+         if normalize_names:
+             read_opts += ", normalize_names=true"
+
+         # Build copy options
+         copy_opts = "header true, format csv"
+         if delimiter != ",":
+             copy_opts += f", delimiter '{delimiter}'"
+
+         query = f"""
+             COPY (
+                 SELECT * FROM read_csv('{input_path}', {read_opts})
+             ) TO '{output_path}' ({copy_opts})
+         """
+
+         logger.debug(f"DuckDB query: {query}")
+         conn.execute(query)
+
+     finally:
+         conn.close()
+
+     logger.debug(f"Normalized file written to: {output_path}")
+
+
+ def _count_lines(file_path: Path) -> int:
+     """Count lines in a file.
+
+     Args:
+         file_path: Path to file.
+
+     Returns:
+         Number of lines in file, or 0 if file doesn't exist.
+     """
+     if not file_path.exists():
+         return 0
+
+     with open(file_path, "r") as f:
+         return sum(1 for _ in f)
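
A minimal sketch of using the two DuckDB helpers above directly, in the same order `process_csv` calls them in core.py; the paths are placeholders.

```python
from pathlib import Path

from csvnorm.validation import normalize_csv, validate_csv

src = Path("data_utf8.csv")               # placeholder, already UTF-8
rejects = Path("data_reject_errors.csv")  # reject report written by DuckDB

if validate_csv(src, rejects):
    normalize_csv(src, Path("data_clean.csv"), delimiter=";", normalize_names=True)
else:
    print(f"invalid rows found, see {rejects}")
```
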
csvnorm-0.3.3.dist-info/METADATA ADDED
@@ -0,0 +1,240 @@
+ Metadata-Version: 2.4
+ Name: csvnorm
+ Version: 0.3.3
+ Summary: A command-line utility to validate and normalize CSV files
+ Author-email: aborruso <aborruso@gmail.com>
+ License: MIT License
+
+ Copyright (c) 2026 aborruso@gmail.com
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+ Project-URL: Homepage, https://github.com/aborruso/prepare_data
+ Project-URL: Issues, https://github.com/aborruso/prepare_data/issues
+ Keywords: csv,data,normalization,validation,etl
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
+ Classifier: Topic :: Software Development :: Libraries
+ Classifier: Topic :: Utilities
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: charset-normalizer>=3.0.0
+ Requires-Dist: duckdb>=0.9.0
+ Requires-Dist: rich>=13.0.0
+ Requires-Dist: rich-argparse>=1.0.0
+ Requires-Dist: pyfiglet>=1.0.0
+ Provides-Extra: dev
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
+ Provides-Extra: banner
+ Requires-Dist: pyfiglet>=1.0.0; extra == "banner"
+ Dynamic: license-file
+
+ [![PyPI version](https://badge.fury.io/py/csvnorm.svg)](https://pypi.org/project/csvnorm/)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
+ [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/aborruso/csvnorm)
+
+ # csvnorm
+
+ A command-line utility to validate and normalize CSV files for initial exploration.
+
+ ## Installation
+
+ Recommended (uv):
+
+ ```bash
+ uv tool install csvnorm
+ ```
+
+ Or with pip:
+
+ ```bash
+ pip install csvnorm
+ ```
+
+ For ASCII art banner (shown with `--version` and `-V`):
+
+ ```bash
+ uv tool install 'csvnorm[banner]'
+ # or
+ pip install 'csvnorm[banner]'
+ ```
+
+ Example with banner:
+ ```bash
+ csvnorm --version
+ # Output:
+ #    ___________   ______  ____  _________ ___
+ #   / ___/ ___/ | / / __ \/ __ \/ ___/ __ `__ \
+ #  / /__(__  )| |/ / / / / /_/ / /  / / / / / /
+ #  \___/____/ |___/_/ /_/\____/_/  /_/ /_/ /_/
+ #
+ # csvnorm 0.3.1
+ ```
+
+ ## Purpose
+
+ This tool prepares CSV files for **basic exploratory data analysis (EDA)**, not for complex transformations. It focuses on achieving a clean, standardized baseline format that allows you to quickly assess data quality and structure before designing more sophisticated ETL pipelines.
+
+ **What it does:**
+ - Validates CSV structure and reports errors
+ - Normalizes encoding to UTF-8
+ - Normalizes delimiters and field names
+ - Creates a consistent starting point for data exploration
+
+ **What it doesn't do:**
+ - Complex data transformations or business logic
+ - Type inference or data validation beyond structure
+ - Heavy processing or aggregations
+
+ ## Features
+
+ - **CSV Validation**: Checks for common CSV errors and inconsistencies using DuckDB
+ - **Delimiter Normalization**: Converts all field separators to standard commas (`,`)
+ - **Field Name Normalization**: Converts column headers to snake_case format
+ - **Encoding Normalization**: Auto-detects encoding and converts to UTF-8
+ - **Error Reporting**: Exports detailed error file for invalid rows
+
+ ## Usage
+
+ ```bash
+ csvnorm input.csv [options]
+ ```
+
+ ### Options
+
+ | Option | Description |
+ |--------|-------------|
+ | `-f, --force` | Force overwrite of existing output files |
+ | `-k, --keep-names` | Keep original column names (disable snake_case) |
+ | `-d, --delimiter CHAR` | Set custom output delimiter (default: `,`) |
+ | `-o, --output-dir DIR` | Set output directory (default: current dir) |
+ | `-V, --verbose` | Enable verbose output for debugging |
+ | `-v, --version` | Show version number |
+ | `-h, --help` | Show help message |
+
+ ### Examples
+
+ ```bash
+ # Basic usage
+ csvnorm data.csv
+
+ # With semicolon delimiter
+ csvnorm data.csv -d ';'
+
+ # Custom output directory
+ csvnorm data.csv -o ./output
+
+ # Keep original headers
+ csvnorm data.csv --keep-names
+
+ # Force overwrite with verbose output
+ csvnorm data.csv -f -V
+ ```
+
+ ### Output
+
+ Creates a normalized CSV file in the specified output directory with:
+ - UTF-8 encoding
+ - Consistent field delimiters
+ - Normalized column names (unless `--keep-names` is specified)
+ - Error report if any invalid rows are found (saved as `{input_name}_reject_errors.csv`)
+
+ The tool provides modern terminal output with:
+ - Progress indicators for multi-step processing
+ - Color-coded error messages with panels
+ - Success summary table showing encoding, paths, and settings
+ - Optional ASCII art banner with `--version` and `-V` verbose mode (requires `pyfiglet`)
+
+ ### Exit Codes
+
+ | Code | Meaning |
+ |------|---------|
+ | 0 | Success |
+ | 1 | Error (validation failed, file not found, etc.) |
+
+ ## Requirements
+
+ - Python 3.9+
+ - Dependencies (automatically installed):
+   - `charset-normalizer>=3.0.0` - Encoding detection
+   - `duckdb>=0.9.0` - CSV validation and normalization
+   - `rich>=13.0.0` - Modern terminal output formatting
+   - `rich-argparse>=1.0.0` - Enhanced CLI help formatting
+
+ Optional extras:
+ - `[banner]` - ASCII art banner for `--version` and `-V` verbose mode (`pyfiglet>=1.0.0`)
+ - `[dev]` - Development dependencies (`pytest>=7.0.0`, `pytest-cov>=4.0.0`, `ruff>=0.1.0`)
+
+ ## Development
+
+ ### Setup
+
+ ```bash
+ git clone https://github.com/aborruso/csvnorm
+ cd csvnorm
+
+ # Create and activate venv with uv (recommended)
+ uv venv
+ source .venv/bin/activate
+ uv pip install -e ".[dev]"
+
+ # Or with pip
+ pip install -e ".[dev]"
+ ```
+
+ ### Testing
+
+ ```bash
+ pytest tests/ -v
+ ```
+
+ ### Project Structure
+
+ ```
+ prepare_data/
+ ├── src/csvnorm/
+ │   ├── __init__.py      # Package version
+ │   ├── __main__.py      # python -m support
+ │   ├── cli.py           # CLI argument parsing
+ │   ├── core.py          # Main processing pipeline
+ │   ├── encoding.py      # Encoding detection/conversion
+ │   ├── validation.py    # DuckDB validation
+ │   └── utils.py         # Helper functions
+ ├── tests/               # Test suite
+ ├── test/                # CSV fixtures
+ └── pyproject.toml       # Package configuration
+ ```
+
+ ## License
+
+ MIT License (c) 2026 aborruso@gmail.com - See LICENSE file for details
csvnorm-0.3.3.dist-info/RECORD ADDED
@@ -0,0 +1,13 @@
+ csvnorm/__init__.py,sha256=8njXIycxL0qSI5Q9bVGyTaM41j_kKX9jV7TeQOSAQGE,263
+ csvnorm/__main__.py,sha256=WDURvgm7E-yMN_ZvbBBmgzIJz5naonZxQ9RYcoD7ves,110
+ csvnorm/cli.py,sha256=MwIPahLktbulF6NYRWyBsE4s9Al9_aSdA1zvzuI0AiQ,3815
+ csvnorm/core.py,sha256=_kTaui_2IhqrN_UxJpcjwXYXEvqaRMhML49Xlx-e0p0,6633
+ csvnorm/encoding.py,sha256=kV7J8nVWDv_F6bKUCrMj2rReMB4Zecf4gwFeebdjm98,3334
+ csvnorm/utils.py,sha256=gvwDToOx3YoKCfVPyCmxcSa7teCWFB2SmAGr-jV5w_Y,1761
+ csvnorm/validation.py,sha256=iXdfalAGDNB9kPefyzHXGI9uc-HLAG5pQ_-T93ShppY,2815
+ csvnorm-0.3.3.dist-info/licenses/LICENSE,sha256=iPlktyjNwNMal_HUjGRTAavIF8mHV3CxGW4g5XmG3ZU,1075
+ csvnorm-0.3.3.dist-info/METADATA,sha256=xKJmLVX9RoB22KwAAlxAvWB_KA9h68m5V-UyFaS_DGo,7840
+ csvnorm-0.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ csvnorm-0.3.3.dist-info/entry_points.txt,sha256=6GmXNAdKzhHle3KV6Gbe95RS6edR0Fe6PVqdy1ED23k,45
+ csvnorm-0.3.3.dist-info/top_level.txt,sha256=jMenLN9avlYH8z8v23sReLFkUYx0awPGFm8cTnoAl-0,8
+ csvnorm-0.3.3.dist-info/RECORD,,
csvnorm-0.3.3.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
csvnorm-0.3.3.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ csvnorm = csvnorm.cli:main
csvnorm-0.3.3.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 aborruso@gmail.com
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
csvnorm-0.3.3.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ csvnorm