csvnorm 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
csvnorm-0.3.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 aborruso@gmail.com
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
csvnorm-0.3.0/PKG-INFO ADDED
@@ -0,0 +1,217 @@
1
+ Metadata-Version: 2.4
2
+ Name: csvnorm
3
+ Version: 0.3.0
4
+ Summary: A command-line utility to validate and normalize CSV files
5
+ Author-email: aborruso <aborruso@gmail.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 aborruso@gmail.com
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/aborruso/prepare_data
29
+ Project-URL: Issues, https://github.com/aborruso/prepare_data/issues
30
+ Keywords: csv,data,normalization,validation,etl
31
+ Classifier: Development Status :: 4 - Beta
32
+ Classifier: Intended Audience :: Developers
33
+ Classifier: Intended Audience :: Science/Research
34
+ Classifier: License :: OSI Approved :: MIT License
35
+ Classifier: Operating System :: OS Independent
36
+ Classifier: Programming Language :: Python :: 3
37
+ Classifier: Programming Language :: Python :: 3.8
38
+ Classifier: Programming Language :: Python :: 3.9
39
+ Classifier: Programming Language :: Python :: 3.10
40
+ Classifier: Programming Language :: Python :: 3.11
41
+ Classifier: Programming Language :: Python :: 3.12
42
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
43
+ Classifier: Topic :: Software Development :: Libraries
44
+ Classifier: Topic :: Utilities
45
+ Requires-Python: >=3.9
46
+ Description-Content-Type: text/markdown
47
+ License-File: LICENSE
48
+ Requires-Dist: charset-normalizer>=3.0.0
49
+ Requires-Dist: duckdb>=0.9.0
50
+ Requires-Dist: rich>=13.0.0
51
+ Requires-Dist: rich-argparse>=1.0.0
52
+ Provides-Extra: dev
53
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
54
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
55
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
56
+ Provides-Extra: banner
57
+ Requires-Dist: pyfiglet>=1.0.0; extra == "banner"
58
+ Dynamic: license-file
59
+
60
+ [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/aborruso/prepare_data)
61
+ [![PyPI version](https://badge.fury.io/py/csvnorm.svg)](https://pypi.org/project/csvnorm/)
62
+
63
+ # csvnorm
64
+
65
+ A command-line utility to validate and normalize CSV files for initial exploration.
66
+
67
+ ## Installation
68
+
69
+ Recommended (uv):
70
+
71
+ ```bash
72
+ uv tool install csvnorm
73
+ ```
74
+
75
+ Or with pip:
76
+
77
+ ```bash
78
+ pip install csvnorm
79
+ ```
80
+
81
+ For ASCII banner in verbose mode:
82
+
83
+ ```bash
84
+ uv tool install csvnorm[banner]
85
+ # or
86
+ pip install csvnorm[banner]
87
+ ```
88
+
89
+ ## Purpose
90
+
91
+ This tool prepares CSV files for **basic exploratory data analysis (EDA)**, not for complex transformations. It focuses on achieving a clean, standardized baseline format that allows you to quickly assess data quality and structure before designing more sophisticated ETL pipelines.
92
+
93
+ **What it does:**
94
+ - Validates CSV structure and reports errors
95
+ - Normalizes encoding to UTF-8
96
+ - Normalizes delimiters and field names
97
+ - Creates a consistent starting point for data exploration
98
+
99
+ **What it doesn't do:**
100
+ - Complex data transformations or business logic
101
+ - Type inference or data validation beyond structure
102
+ - Heavy processing or aggregations
103
+
104
+ ## Features
105
+
106
+ - **CSV Validation**: Checks for common CSV errors and inconsistencies using DuckDB
107
+ - **Delimiter Normalization**: Converts all field separators to standard commas (`,`) by default, or to the delimiter given with `-d`/`--delimiter`
108
+ - **Field Name Normalization**: Converts column headers to snake_case format
109
+ - **Encoding Normalization**: Auto-detects encoding and converts to UTF-8
110
+ - **Error Reporting**: Exports detailed error file for invalid rows
111
+
112
+ ## Usage
113
+
114
+ ```bash
115
+ csvnorm input.csv [options]
116
+ ```
117
+
118
+ ### Options
119
+
120
+ | Option | Description |
121
+ |--------|-------------|
122
+ | `-f, --force` | Force overwrite of existing output files |
123
+ | `-n, --keep-names` | Keep original column names (disable snake_case) |
124
+ | `-d, --delimiter CHAR` | Set custom output delimiter (default: `,`) |
125
+ | `-o, --output-dir DIR` | Set output directory (default: current dir) |
126
+ | `-v, --verbose` | Enable verbose output for debugging |
127
+ | `--version` | Show version number |
128
+ | `-h, --help` | Show help message |
129
+
130
+ ### Examples
131
+
132
+ ```bash
133
+ # Basic usage
134
+ csvnorm data.csv
135
+
136
+ # With semicolon delimiter
137
+ csvnorm data.csv -d ';'
138
+
139
+ # Custom output directory
140
+ csvnorm data.csv -o ./output
141
+
142
+ # Keep original headers
143
+ csvnorm data.csv --keep-names
144
+
145
+ # Force overwrite with verbose output
146
+ csvnorm data.csv -f -v
147
+ ```
148
+
149
+ ### Output
150
+
151
+ Creates a normalized CSV file in the specified output directory with:
152
+ - UTF-8 encoding
153
+ - Consistent field delimiters
154
+ - Normalized column names (unless `--keep-names` is specified)
155
+ - Error report if any invalid rows are found (saved as `{input_name}_reject_errors.csv`)
156
+
157
+ The tool provides modern terminal output with:
158
+ - Progress indicators for multi-step processing
159
+ - Color-coded error messages with panels
160
+ - Success summary table showing encoding, paths, and settings
161
+ - Optional ASCII art banner in verbose mode (requires `pyfiglet`)
162
+
163
+ ### Exit Codes
164
+
165
+ | Code | Meaning |
166
+ |------|---------|
167
+ | 0 | Success |
168
+ | 1 | Error (validation failed, file not found, etc.) |
169
+
170
+ ## Requirements
171
+
172
+ - Python 3.9+
173
+ - Dependencies (automatically installed):
174
+ - `charset-normalizer>=3.0.0` - Encoding detection
175
+ - `duckdb>=0.9.0` - CSV validation and normalization
176
+ - `rich>=13.0.0` - Modern terminal output formatting
177
+ - `rich-argparse>=1.0.0` - Enhanced CLI help formatting
178
+
179
+ Optional:
180
+ - `pyfiglet>=1.0.0` - ASCII art banner in verbose mode (install with `pip install csvnorm[banner]`)
181
+
182
+ ## Development
183
+
184
+ ### Setup
185
+
186
+ ```bash
187
+ git clone https://github.com/aborruso/prepare_data
188
+ cd prepare_data
189
+ pip install -e ".[dev]"
190
+ ```
191
+
192
+ ### Testing
193
+
194
+ ```bash
195
+ pytest tests/ -v
196
+ ```
197
+
198
+ ### Project Structure
199
+
200
+ ```
201
+ prepare_data/
202
+ ├── src/csvnorm/
203
+ │ ├── __init__.py # Package version
204
+ │ ├── __main__.py # python -m support
205
+ │ ├── cli.py # CLI argument parsing
206
+ │ ├── core.py # Main processing pipeline
207
+ │ ├── encoding.py # Encoding detection/conversion
208
+ │ ├── validation.py # DuckDB validation
209
+ │ └── utils.py # Helper functions
210
+ ├── tests/ # Test suite
211
+ ├── test/ # CSV fixtures
212
+ └── pyproject.toml # Package configuration
213
+ ```
214
+
215
+ ## License
216
+
217
+ MIT License (c) 2026 aborruso@gmail.com - See LICENSE file for details
@@ -0,0 +1,158 @@
1
+ [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/aborruso/prepare_data)
2
+ [![PyPI version](https://badge.fury.io/py/csvnorm.svg)](https://pypi.org/project/csvnorm/)
3
+
4
+ # csvnorm
5
+
6
+ A command-line utility to validate and normalize CSV files for initial exploration.
7
+
8
+ ## Installation
9
+
10
+ Recommended (uv):
11
+
12
+ ```bash
13
+ uv tool install csvnorm
14
+ ```
15
+
16
+ Or with pip:
17
+
18
+ ```bash
19
+ pip install csvnorm
20
+ ```
21
+
22
+ For ASCII banner in verbose mode:
23
+
24
+ ```bash
25
+ uv tool install csvnorm[banner]
26
+ # or
27
+ pip install csvnorm[banner]
28
+ ```
29
+
30
+ ## Purpose
31
+
32
+ This tool prepares CSV files for **basic exploratory data analysis (EDA)**, not for complex transformations. It focuses on achieving a clean, standardized baseline format that allows you to quickly assess data quality and structure before designing more sophisticated ETL pipelines.
33
+
34
+ **What it does:**
35
+ - Validates CSV structure and reports errors
36
+ - Normalizes encoding to UTF-8
37
+ - Normalizes delimiters and field names
38
+ - Creates a consistent starting point for data exploration
39
+
40
+ **What it doesn't do:**
41
+ - Complex data transformations or business logic
42
+ - Type inference or data validation beyond structure
43
+ - Heavy processing or aggregations
44
+
45
+ ## Features
46
+
47
+ - **CSV Validation**: Checks for common CSV errors and inconsistencies using DuckDB
48
+ - **Delimiter Normalization**: Converts all field separators to standard commas (`,`) by default, or to the delimiter given with `-d`/`--delimiter`
49
+ - **Field Name Normalization**: Converts column headers to snake_case format
50
+ - **Encoding Normalization**: Auto-detects encoding and converts to UTF-8
51
+ - **Error Reporting**: Exports detailed error file for invalid rows
52
+
53
+ ## Usage
54
+
55
+ ```bash
56
+ csvnorm input.csv [options]
57
+ ```
58
+
59
+ ### Options
60
+
61
+ | Option | Description |
62
+ |--------|-------------|
63
+ | `-f, --force` | Force overwrite of existing output files |
64
+ | `-n, --keep-names` | Keep original column names (disable snake_case) |
65
+ | `-d, --delimiter CHAR` | Set custom output delimiter (default: `,`) |
66
+ | `-o, --output-dir DIR` | Set output directory (default: current dir) |
67
+ | `-v, --verbose` | Enable verbose output for debugging |
68
+ | `--version` | Show version number |
69
+ | `-h, --help` | Show help message |
70
+
71
+ ### Examples
72
+
73
+ ```bash
74
+ # Basic usage
75
+ csvnorm data.csv
76
+
77
+ # With semicolon delimiter
78
+ csvnorm data.csv -d ';'
79
+
80
+ # Custom output directory
81
+ csvnorm data.csv -o ./output
82
+
83
+ # Keep original headers
84
+ csvnorm data.csv --keep-names
85
+
86
+ # Force overwrite with verbose output
87
+ csvnorm data.csv -f -v
88
+ ```
89
+
90
+ ### Output
91
+
92
+ Creates a normalized CSV file in the specified output directory with:
93
+ - UTF-8 encoding
94
+ - Consistent field delimiters
95
+ - Normalized column names (unless `--keep-names` is specified)
96
+ - Error report if any invalid rows are found (saved as `{input_name}_reject_errors.csv`)
97
+
98
+ The tool provides modern terminal output with:
99
+ - Progress indicators for multi-step processing
100
+ - Color-coded error messages with panels
101
+ - Success summary table showing encoding, paths, and settings
102
+ - Optional ASCII art banner in verbose mode (requires `pyfiglet`)
103
+
104
+ ### Exit Codes
105
+
106
+ | Code | Meaning |
107
+ |------|---------|
108
+ | 0 | Success |
109
+ | 1 | Error (validation failed, file not found, etc.) |
110
+
111
+ ## Requirements
112
+
113
+ - Python 3.9+
114
+ - Dependencies (automatically installed):
115
+ - `charset-normalizer>=3.0.0` - Encoding detection
116
+ - `duckdb>=0.9.0` - CSV validation and normalization
117
+ - `rich>=13.0.0` - Modern terminal output formatting
118
+ - `rich-argparse>=1.0.0` - Enhanced CLI help formatting
119
+
120
+ Optional:
121
+ - `pyfiglet>=1.0.0` - ASCII art banner in verbose mode (install with `pip install csvnorm[banner]`)
122
+
123
+ ## Development
124
+
125
+ ### Setup
126
+
127
+ ```bash
128
+ git clone https://github.com/aborruso/prepare_data
129
+ cd prepare_data
130
+ pip install -e ".[dev]"
131
+ ```
132
+
133
+ ### Testing
134
+
135
+ ```bash
136
+ pytest tests/ -v
137
+ ```
138
+
139
+ ### Project Structure
140
+
141
+ ```
142
+ prepare_data/
143
+ ├── src/csvnorm/
144
+ │ ├── __init__.py # Package version
145
+ │ ├── __main__.py # python -m support
146
+ │ ├── cli.py # CLI argument parsing
147
+ │ ├── core.py # Main processing pipeline
148
+ │ ├── encoding.py # Encoding detection/conversion
149
+ │ ├── validation.py # DuckDB validation
150
+ │ └── utils.py # Helper functions
151
+ ├── tests/ # Test suite
152
+ ├── test/ # CSV fixtures
153
+ └── pyproject.toml # Package configuration
154
+ ```
155
+
156
+ ## License
157
+
158
+ MIT License (c) 2026 aborruso@gmail.com - See LICENSE file for details
@@ -0,0 +1,62 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "csvnorm"
7
+ version = "0.3.0"
8
+ description = "A command-line utility to validate and normalize CSV files"
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = {file = "LICENSE"}
12
+ authors = [
13
+ {name = "aborruso", email = "aborruso@gmail.com"}
14
+ ]
15
+ keywords = ["csv", "data", "normalization", "validation", "etl"]
16
+ classifiers = [
17
+ "Development Status :: 4 - Beta",
18
+ "Intended Audience :: Developers",
19
+ "Intended Audience :: Science/Research",
20
+ "License :: OSI Approved :: MIT License",
21
+ "Operating System :: OS Independent",
22
+ "Programming Language :: Python :: 3",
23
+ "Programming Language :: Python :: 3.8",
24
+ "Programming Language :: Python :: 3.9",
25
+ "Programming Language :: Python :: 3.10",
26
+ "Programming Language :: Python :: 3.11",
27
+ "Programming Language :: Python :: 3.12",
28
+ "Topic :: Scientific/Engineering :: Information Analysis",
29
+ "Topic :: Software Development :: Libraries",
30
+ "Topic :: Utilities",
31
+ ]
32
+
33
+ dependencies = [
34
+ "charset-normalizer>=3.0.0",
35
+ "duckdb>=0.9.0",
36
+ "rich>=13.0.0",
37
+ "rich-argparse>=1.0.0",
38
+ ]
39
+
40
+ [project.optional-dependencies]
41
+ dev = [
42
+ "pytest>=7.0.0",
43
+ "pytest-cov>=4.0.0",
44
+ "ruff>=0.1.0",
45
+ ]
46
+ banner = [
47
+ "pyfiglet>=1.0.0",
48
+ ]
49
+
50
+ [project.urls]
51
+ Homepage = "https://github.com/aborruso/prepare_data"
52
+ Issues = "https://github.com/aborruso/prepare_data/issues"
53
+
54
+ [project.scripts]
55
+ csvnorm = "csvnorm.cli:main"
56
+
57
+ [tool.setuptools.packages.find]
58
+ where = ["src"]
59
+
60
+ [tool.ruff]
61
+ line-length = 88
62
+ target-version = "py39"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,8 @@
1
+ """csvnorm - Validate and normalize CSV files."""
2
+
3
# Package version, reported by `csvnorm --version`; keep it in sync with
# `version` in pyproject.toml.
__version__ = "0.3.0"

# Public API, re-exported from the submodules imported below.
__all__ = [
    "normalize_csv",
    "detect_encoding",
    "process_csv",
]
5
+
6
+ from csvnorm.core import process_csv
7
+ from csvnorm.encoding import detect_encoding
8
+ from csvnorm.validation import normalize_csv
@@ -0,0 +1,6 @@
1
+ """Entry point for python -m csvnorm."""
2
+
3
+ from csvnorm.cli import main
4
+
5
if __name__ == "__main__":
    # main() returns the process exit code; raising SystemExit propagates it
    # so `python -m csvnorm` reports failures to the shell instead of always
    # exiting with status 0.
    raise SystemExit(main())
@@ -0,0 +1,137 @@
1
+ """Command-line interface for csvnorm."""
2
+
3
+ import argparse
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ from rich.console import Console
8
+ from rich_argparse import RichHelpFormatter
9
+
10
+ from csvnorm import __version__
11
+ from csvnorm.core import process_csv
12
+ from csvnorm.utils import setup_logger
13
+
14
# Module-level Rich console shared by the CLI helpers (used to print the banner).
console = Console()
15
+
16
+
17
def show_banner() -> None:
    """Print the "csvnorm" ASCII-art banner if pyfiglet can be imported.

    pyfiglet is an optional extra; when it is not installed the function
    returns silently without printing anything.
    """
    try:
        from pyfiglet import figlet_format

        console.print(figlet_format("csvnorm", font="slant"), style="bold cyan")
    except ImportError:
        # Optional dependency missing: the banner is purely cosmetic, skip it.
        return
26
+
27
+
28
def create_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the ``csvnorm`` command.

    Returns:
        A parser with one positional argument (``input_file``) followed by the
        ``--force``, ``--keep-names``, ``--delimiter``, ``--output-dir``,
        ``--verbose`` and ``--version`` options, registered in that order.
    """
    usage_examples = """\
Examples:
csvnorm data.csv -d ';' -o output_folder --force
csvnorm data.csv --keep-names --delimiter '\\t'
csvnorm data.csv -v
"""
    keep_names_help = (
        "Keep original column names (disable snake_case normalization). "
        "By default, column names are converted to snake_case format "
        "(e.g., 'Column Name' becomes 'column_name')."
    )

    cli = argparse.ArgumentParser(
        prog="csvnorm",
        description="Validate and normalize CSV files for exploratory data analysis",
        formatter_class=RichHelpFormatter,
        epilog=usage_examples,
    )
    add = cli.add_argument

    # Positional argument: the CSV file to process.
    add("input_file", type=Path, help="Input CSV file path")

    # Options, registered in the order they should appear in --help.
    add(
        "-f",
        "--force",
        action="store_true",
        help="Force overwrite of existing output files",
    )
    add("-n", "--keep-names", action="store_true", help=keep_names_help)
    add(
        "-d",
        "--delimiter",
        default=",",
        help="Set custom field delimiter (default: comma). Example: -d ';'",
    )
    add(
        "-o",
        "--output-dir",
        type=Path,
        default=Path.cwd(),
        help="Set custom output directory (default: current working directory)",
    )
    add(
        "-v",
        "--verbose",
        action="store_true",
        help="Enable verbose output for debugging",
    )
    add("--version", action="version", version=f"%(prog)s {__version__}")

    return cli
95
+
96
+
97
def main(argv: "list[str] | None" = None) -> int:
    """Run the csvnorm command-line interface.

    Args:
        argv: Command line arguments (defaults to ``sys.argv[1:]``).

    Returns:
        Process exit code: 2 when called with no arguments at all (the help
        text is printed as a usage hint), 0 when the only argument is
        ``-h``/``--help``, otherwise the value returned by ``process_csv``.

    Raises:
        SystemExit: Raised by ``argparse`` for invalid arguments and for
            ``--version``.
    """
    # NOTE: the ``argv`` annotation is quoted on purpose. Unquoted,
    # ``list[str] | None`` is evaluated when the function is defined and
    # raises TypeError on Python 3.9, which this package still supports.
    parser = create_parser()

    if argv is None:
        argv = sys.argv[1:]

    # No arguments at all: show the help text but signal a usage error.
    if not argv:
        parser.print_help()
        return 2

    # A lone help flag: show the help text and report success.
    if len(argv) == 1 and argv[0] in ("-h", "--help"):
        parser.print_help()
        return 0

    args = parser.parse_args(argv)

    # The banner is shown only in verbose mode.
    if args.verbose:
        show_banner()

    setup_logger(args.verbose)

    # Hand over to the processing pipeline; its return value is the exit code.
    return process_csv(
        input_file=args.input_file,
        output_dir=args.output_dir,
        force=args.force,
        keep_names=args.keep_names,
        delimiter=args.delimiter,
        verbose=args.verbose,
    )
134
+
135
+
136
if __name__ == "__main__":
    # Run as a script: exit the interpreter with main()'s return code.
    raise SystemExit(main())