csvnorm 0.3.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csvnorm-0.3.12/LICENSE +21 -0
- csvnorm-0.3.12/PKG-INFO +279 -0
- csvnorm-0.3.12/README.md +223 -0
- csvnorm-0.3.12/pyproject.toml +63 -0
- csvnorm-0.3.12/setup.cfg +4 -0
- csvnorm-0.3.12/src/csvnorm/__init__.py +8 -0
- csvnorm-0.3.12/src/csvnorm/__main__.py +6 -0
- csvnorm-0.3.12/src/csvnorm/cli.py +163 -0
- csvnorm-0.3.12/src/csvnorm/core.py +284 -0
- csvnorm-0.3.12/src/csvnorm/encoding.py +119 -0
- csvnorm-0.3.12/src/csvnorm/ui.py +127 -0
- csvnorm-0.3.12/src/csvnorm/utils.py +192 -0
- csvnorm-0.3.12/src/csvnorm/validation.py +195 -0
- csvnorm-0.3.12/src/csvnorm.egg-info/PKG-INFO +279 -0
- csvnorm-0.3.12/src/csvnorm.egg-info/SOURCES.txt +22 -0
- csvnorm-0.3.12/src/csvnorm.egg-info/dependency_links.txt +1 -0
- csvnorm-0.3.12/src/csvnorm.egg-info/entry_points.txt +2 -0
- csvnorm-0.3.12/src/csvnorm.egg-info/requires.txt +9 -0
- csvnorm-0.3.12/src/csvnorm.egg-info/top_level.txt +1 -0
- csvnorm-0.3.12/tests/test_cli.py +211 -0
- csvnorm-0.3.12/tests/test_encoding.py +134 -0
- csvnorm-0.3.12/tests/test_integration.py +231 -0
- csvnorm-0.3.12/tests/test_utils.py +156 -0
- csvnorm-0.3.12/tests/test_validation.py +101 -0
csvnorm-0.3.12/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 aborruso@gmail.com
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
csvnorm-0.3.12/PKG-INFO
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: csvnorm
|
|
3
|
+
Version: 0.3.12
|
|
4
|
+
Summary: A command-line utility to validate and normalize CSV files
|
|
5
|
+
Author-email: aborruso <aborruso@gmail.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 aborruso@gmail.com
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/aborruso/prepare_data
|
|
29
|
+
Project-URL: Issues, https://github.com/aborruso/prepare_data/issues
|
|
30
|
+
Keywords: csv,data,normalization,validation,etl
|
|
31
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
32
|
+
Classifier: Intended Audience :: Developers
|
|
33
|
+
Classifier: Intended Audience :: Science/Research
|
|
34
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
35
|
+
Classifier: Operating System :: OS Independent
|
|
36
|
+
Classifier: Programming Language :: Python :: 3
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
41
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
42
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
43
|
+
Classifier: Topic :: Utilities
|
|
44
|
+
Requires-Python: >=3.9
|
|
45
|
+
Description-Content-Type: text/markdown
|
|
46
|
+
License-File: LICENSE
|
|
47
|
+
Requires-Dist: charset-normalizer>=3.0.0
|
|
48
|
+
Requires-Dist: duckdb>=0.9.0
|
|
49
|
+
Requires-Dist: rich>=13.0.0
|
|
50
|
+
Requires-Dist: rich-argparse>=1.0.0
|
|
51
|
+
Provides-Extra: dev
|
|
52
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
53
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
54
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
55
|
+
Dynamic: license-file
|
|
56
|
+
|
|
57
|
+
[PyPI package](https://pypi.org/project/csvnorm/)
|
|
58
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
59
|
+
[Python 3.9+](https://www.python.org/downloads/)
|
|
60
|
+
[DeepWiki](https://deepwiki.com/aborruso/csvnorm)
|
|
61
|
+
|
|
62
|
+
# csvnorm
|
|
63
|
+
|
|
64
|
+
A command-line utility to validate and normalize CSV files for initial exploration.
|
|
65
|
+
|
|
66
|
+
## Installation
|
|
67
|
+
|
|
68
|
+
Recommended (uv):
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
uv tool install csvnorm
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Or with pip:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
pip install csvnorm
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Purpose
|
|
81
|
+
|
|
82
|
+
This tool prepares CSV files for **basic exploratory data analysis (EDA)**, not for complex transformations. It focuses on achieving a clean, standardized baseline format that allows you to quickly assess data quality and structure before designing more sophisticated ETL pipelines.
|
|
83
|
+
|
|
84
|
+
**What it does:**
|
|
85
|
+
- Validates CSV structure and reports errors
|
|
86
|
+
- Normalizes encoding to UTF-8 when needed
|
|
87
|
+
- Normalizes delimiters and field names
|
|
88
|
+
- Creates a consistent starting point for data exploration
|
|
89
|
+
|
|
90
|
+
**What it doesn't do:**
|
|
91
|
+
- Complex data transformations or business logic
|
|
92
|
+
- Type inference or data validation beyond structure
|
|
93
|
+
- Heavy processing or aggregations
|
|
94
|
+
|
|
95
|
+
## Features
|
|
96
|
+
|
|
97
|
+
- **CSV Validation**: Checks for common CSV errors and inconsistencies using DuckDB
|
|
98
|
+
- **Delimiter Normalization**: Converts all field separators to standard commas (`,`)
|
|
99
|
+
- **Field Name Normalization**: Converts column headers to snake_case format
|
|
100
|
+
- **Encoding Normalization**: Auto-detects encoding and converts to UTF-8 when needed (ASCII is already UTF-8 compatible)
|
|
101
|
+
- **Processing Summary**: Displays comprehensive statistics (rows, columns, file sizes) and error details
|
|
102
|
+
- **Error Reporting**: Exports detailed error file for invalid rows with summary panel
|
|
103
|
+
- **Remote URL Support**: Process CSV files directly from HTTP/HTTPS URLs without downloading
|
|
104
|
+
|
|
105
|
+
## Usage
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
csvnorm input.csv [options]
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Options
|
|
112
|
+
|
|
113
|
+
| Option | Description |
|
|
114
|
+
|--------|-------------|
|
|
115
|
+
| `-f, --force` | Force overwrite of existing output files |
|
|
116
|
+
| `-k, --keep-names` | Keep original column names (disable snake_case) |
|
|
117
|
+
| `-d, --delimiter CHAR` | Set custom output delimiter (default: `,`) |
|
|
118
|
+
| `-o, --output-file PATH` | Set output file path (absolute or relative) |
|
|
119
|
+
| `-V, --verbose` | Enable verbose output for debugging |
|
|
120
|
+
| `-v, --version` | Show version number |
|
|
121
|
+
| `-h, --help` | Show help message |
|
|
122
|
+
|
|
123
|
+
### Examples
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
# Basic usage (output: data.csv in current directory)
|
|
127
|
+
csvnorm data.csv
|
|
128
|
+
|
|
129
|
+
# Specify output file path
|
|
130
|
+
csvnorm data.csv -o output/processed.csv
|
|
131
|
+
|
|
132
|
+
# Use absolute path
|
|
133
|
+
csvnorm data.csv -o /tmp/data_normalized.csv
|
|
134
|
+
|
|
135
|
+
# Process remote CSV from URL
|
|
136
|
+
csvnorm "https://raw.githubusercontent.com/aborruso/csvnorm/refs/heads/main/test/Trasporto%20Pubblico%20Locale%20Settore%20Pubblico%20Allargato%20-%20Indicatore%202000-2020%20Trasferimenti%20Correnti%20su%20Entrate%20Correnti.csv" -o output.csv
|
|
137
|
+
|
|
138
|
+
# With semicolon delimiter
|
|
139
|
+
csvnorm data.csv -d ';' -o data_semicolon.csv
|
|
140
|
+
|
|
141
|
+
# Keep original headers
|
|
142
|
+
csvnorm data.csv --keep-names -o output.csv
|
|
143
|
+
|
|
144
|
+
# Force overwrite with verbose output
|
|
145
|
+
csvnorm data.csv -f -V -o processed.csv
|
|
146
|
+
|
|
147
|
+
# Custom output name and extension
|
|
148
|
+
csvnorm data.csv -o results.txt
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
### Output
|
|
152
|
+
|
|
153
|
+
Creates a normalized CSV file at the specified path with:
|
|
154
|
+
- UTF-8 encoding
|
|
155
|
+
- Consistent field delimiters
|
|
156
|
+
- Normalized column names (unless `--keep-names` is specified)
|
|
157
|
+
- Error report if any invalid rows are found (saved as `{output_name}_reject_errors.csv` in the same directory)
|
|
158
|
+
- Temporary encoding conversion files stored in system temp directory with auto-cleanup
|
|
159
|
+
|
|
160
|
+
Output file path behavior:
|
|
161
|
+
- If `-o` is specified: uses the exact path provided (supports absolute and relative paths)
|
|
162
|
+
- If `-o` is omitted: uses the input filename in the current working directory
|
|
163
|
+
- Any file extension is allowed (not limited to `.csv`)
|
|
164
|
+
|
|
165
|
+
For remote URLs:
|
|
166
|
+
- You must specify `-o` to set the output filename
|
|
167
|
+
- Encoding is handled automatically by DuckDB
|
|
168
|
+
- HTTP timeout is set to 30 seconds
|
|
169
|
+
- Only public URLs are supported (no authentication)
|
|
170
|
+
|
|
171
|
+
The tool provides modern terminal output with:
|
|
172
|
+
- Progress indicators for multi-step processing
|
|
173
|
+
- Color-coded error messages with panels
|
|
174
|
+
- Success summary table with statistics (rows, columns, file sizes)
|
|
175
|
+
- Encoding conversion status (converted/no conversion/remote; ASCII is already UTF-8 compatible)
|
|
176
|
+
- Error summary panel with reject count and error types when validation fails
|
|
177
|
+
- ASCII art banner with `--version` and `-V` verbose mode
|
|
178
|
+
|
|
179
|
+
**Success Example:**
|
|
180
|
+
```
|
|
181
|
+
✓ Success
|
|
182
|
+
Input: test/utf8_basic.csv
|
|
183
|
+
Output: output/utf8_basic.csv
|
|
184
|
+
Encoding: ascii (ASCII is UTF-8 compatible; no conversion needed)
|
|
185
|
+
Rows: 2
|
|
186
|
+
Columns: 3
|
|
187
|
+
Input size: 42 B
|
|
188
|
+
Output size: 43 B
|
|
189
|
+
Headers: normalized to snake_case
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
**Error Example:**
|
|
193
|
+
```
|
|
194
|
+
✓ Success
|
|
195
|
+
Input: test/malformed_rows.csv
|
|
196
|
+
Output: output/malformed_rows.csv
|
|
197
|
+
Encoding: ascii (ASCII is UTF-8 compatible; no conversion needed)
|
|
198
|
+
Rows: 1
|
|
199
|
+
Columns: 4
|
|
200
|
+
Input size: 24 B
|
|
201
|
+
Output size: 40 B
|
|
202
|
+
Headers: normalized to snake_case
|
|
203
|
+
|
|
204
|
+
╭──────────────────────────── ! Validation Failed ─────────────────────────────╮
|
|
205
|
+
│ Validation Errors: │
|
|
206
|
+
│ │
|
|
207
|
+
│ Rejected rows: 2 │
|
|
208
|
+
│ │
|
|
209
|
+
│ Error types: │
|
|
210
|
+
│ • Expected Number of Columns: 3 Found: 2 │
|
|
211
|
+
│ • Expected Number of Columns: 3 Found: 4 │
|
|
212
|
+
│ │
|
|
213
|
+
│ Details: output/malformed_rows_reject_errors.csv │
|
|
214
|
+
╰──────────────────────────────────────────────────────────────────────────────╯
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
### Exit Codes
|
|
218
|
+
|
|
219
|
+
| Code | Meaning |
|
|
220
|
+
|------|---------|
|
|
221
|
+
| 0 | Success |
|
|
222
|
+
| 1 | Error (validation failed, file not found, etc.) |
|
|
223
|
+
|
|
224
|
+
## Requirements
|
|
225
|
+
|
|
226
|
+
- Python 3.9+
|
|
227
|
+
- Dependencies (automatically installed):
|
|
228
|
+
- `charset-normalizer>=3.0.0` - Encoding detection
|
|
229
|
+
- `duckdb>=0.9.0` - CSV validation and normalization
|
|
230
|
+
- `rich>=13.0.0` - Modern terminal output formatting
|
|
231
|
+
- `rich-argparse>=1.0.0` - Enhanced CLI help formatting
|
|
232
|
+
- `pyfiglet>=0.8.post1,<1.0.0` - ASCII art banner
|
|
233
|
+
|
|
234
|
+
Optional extras:
|
|
235
|
+
- `[dev]` - Development dependencies (`pytest>=7.0.0`, `pytest-cov>=4.0.0`, `ruff>=0.1.0`)
|
|
236
|
+
|
|
237
|
+
## Development
|
|
238
|
+
|
|
239
|
+
### Setup
|
|
240
|
+
|
|
241
|
+
```bash
|
|
242
|
+
git clone https://github.com/aborruso/csvnorm
|
|
243
|
+
cd csvnorm
|
|
244
|
+
|
|
245
|
+
# Create and activate venv with uv (recommended)
|
|
246
|
+
uv venv
|
|
247
|
+
source .venv/bin/activate
|
|
248
|
+
uv pip install -e ".[dev]"
|
|
249
|
+
|
|
250
|
+
# Or with pip
|
|
251
|
+
pip install -e ".[dev]"
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
### Testing
|
|
255
|
+
|
|
256
|
+
```bash
|
|
257
|
+
pytest tests/ -v
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
### Project Structure
|
|
261
|
+
|
|
262
|
+
```
|
|
263
|
+
csvnorm/
|
|
264
|
+
├── src/csvnorm/
|
|
265
|
+
│ ├── __init__.py # Package version
|
|
266
|
+
│ ├── __main__.py # python -m support
|
|
267
|
+
│ ├── cli.py # CLI argument parsing
|
|
268
|
+
│ ├── core.py # Main processing pipeline
|
|
269
|
+
│ ├── encoding.py # Encoding detection/conversion
|
|
270
|
+
│ ├── validation.py # DuckDB validation
|
|
271
|
+
│ └── utils.py # Helper functions
|
|
272
|
+
├── tests/ # Test suite
|
|
273
|
+
├── test/ # CSV fixtures
|
|
274
|
+
└── pyproject.toml # Package configuration
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
## License
|
|
278
|
+
|
|
279
|
+
MIT License (c) 2026 aborruso@gmail.com - See LICENSE file for details
|
csvnorm-0.3.12/README.md
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
[PyPI package](https://pypi.org/project/csvnorm/)
|
|
2
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
3
|
+
[Python 3.9+](https://www.python.org/downloads/)
|
|
4
|
+
[DeepWiki](https://deepwiki.com/aborruso/csvnorm)
|
|
5
|
+
|
|
6
|
+
# csvnorm
|
|
7
|
+
|
|
8
|
+
A command-line utility to validate and normalize CSV files for initial exploration.
|
|
9
|
+
|
|
10
|
+
## Installation
|
|
11
|
+
|
|
12
|
+
Recommended (uv):
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
uv tool install csvnorm
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
Or with pip:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install csvnorm
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Purpose
|
|
25
|
+
|
|
26
|
+
This tool prepares CSV files for **basic exploratory data analysis (EDA)**, not for complex transformations. It focuses on achieving a clean, standardized baseline format that allows you to quickly assess data quality and structure before designing more sophisticated ETL pipelines.
|
|
27
|
+
|
|
28
|
+
**What it does:**
|
|
29
|
+
- Validates CSV structure and reports errors
|
|
30
|
+
- Normalizes encoding to UTF-8 when needed
|
|
31
|
+
- Normalizes delimiters and field names
|
|
32
|
+
- Creates a consistent starting point for data exploration
|
|
33
|
+
|
|
34
|
+
**What it doesn't do:**
|
|
35
|
+
- Complex data transformations or business logic
|
|
36
|
+
- Type inference or data validation beyond structure
|
|
37
|
+
- Heavy processing or aggregations
|
|
38
|
+
|
|
39
|
+
## Features
|
|
40
|
+
|
|
41
|
+
- **CSV Validation**: Checks for common CSV errors and inconsistencies using DuckDB
|
|
42
|
+
- **Delimiter Normalization**: Converts all field separators to standard commas (`,`)
|
|
43
|
+
- **Field Name Normalization**: Converts column headers to snake_case format
|
|
44
|
+
- **Encoding Normalization**: Auto-detects encoding and converts to UTF-8 when needed (ASCII is already UTF-8 compatible)
|
|
45
|
+
- **Processing Summary**: Displays comprehensive statistics (rows, columns, file sizes) and error details
|
|
46
|
+
- **Error Reporting**: Exports detailed error file for invalid rows with summary panel
|
|
47
|
+
- **Remote URL Support**: Process CSV files directly from HTTP/HTTPS URLs without downloading
|
|
48
|
+
|
|
49
|
+
## Usage
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
csvnorm input.csv [options]
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Options
|
|
56
|
+
|
|
57
|
+
| Option | Description |
|
|
58
|
+
|--------|-------------|
|
|
59
|
+
| `-f, --force` | Force overwrite of existing output files |
|
|
60
|
+
| `-k, --keep-names` | Keep original column names (disable snake_case) |
|
|
61
|
+
| `-d, --delimiter CHAR` | Set custom output delimiter (default: `,`) |
|
|
62
|
+
| `-o, --output-file PATH` | Set output file path (absolute or relative) |
|
|
63
|
+
| `-V, --verbose` | Enable verbose output for debugging |
|
|
64
|
+
| `-v, --version` | Show version number |
|
|
65
|
+
| `-h, --help` | Show help message |
|
|
66
|
+
|
|
67
|
+
### Examples
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
# Basic usage (output: data.csv in current directory)
|
|
71
|
+
csvnorm data.csv
|
|
72
|
+
|
|
73
|
+
# Specify output file path
|
|
74
|
+
csvnorm data.csv -o output/processed.csv
|
|
75
|
+
|
|
76
|
+
# Use absolute path
|
|
77
|
+
csvnorm data.csv -o /tmp/data_normalized.csv
|
|
78
|
+
|
|
79
|
+
# Process remote CSV from URL
|
|
80
|
+
csvnorm "https://raw.githubusercontent.com/aborruso/csvnorm/refs/heads/main/test/Trasporto%20Pubblico%20Locale%20Settore%20Pubblico%20Allargato%20-%20Indicatore%202000-2020%20Trasferimenti%20Correnti%20su%20Entrate%20Correnti.csv" -o output.csv
|
|
81
|
+
|
|
82
|
+
# With semicolon delimiter
|
|
83
|
+
csvnorm data.csv -d ';' -o data_semicolon.csv
|
|
84
|
+
|
|
85
|
+
# Keep original headers
|
|
86
|
+
csvnorm data.csv --keep-names -o output.csv
|
|
87
|
+
|
|
88
|
+
# Force overwrite with verbose output
|
|
89
|
+
csvnorm data.csv -f -V -o processed.csv
|
|
90
|
+
|
|
91
|
+
# Custom output name and extension
|
|
92
|
+
csvnorm data.csv -o results.txt
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Output
|
|
96
|
+
|
|
97
|
+
Creates a normalized CSV file at the specified path with:
|
|
98
|
+
- UTF-8 encoding
|
|
99
|
+
- Consistent field delimiters
|
|
100
|
+
- Normalized column names (unless `--keep-names` is specified)
|
|
101
|
+
- Error report if any invalid rows are found (saved as `{output_name}_reject_errors.csv` in the same directory)
|
|
102
|
+
- Temporary encoding conversion files stored in system temp directory with auto-cleanup
|
|
103
|
+
|
|
104
|
+
Output file path behavior:
|
|
105
|
+
- If `-o` is specified: uses the exact path provided (supports absolute and relative paths)
|
|
106
|
+
- If `-o` is omitted: uses the input filename in the current working directory
|
|
107
|
+
- Any file extension is allowed (not limited to `.csv`)
|
|
108
|
+
|
|
109
|
+
For remote URLs:
|
|
110
|
+
- You must specify `-o` to set the output filename
|
|
111
|
+
- Encoding is handled automatically by DuckDB
|
|
112
|
+
- HTTP timeout is set to 30 seconds
|
|
113
|
+
- Only public URLs are supported (no authentication)
|
|
114
|
+
|
|
115
|
+
The tool provides modern terminal output with:
|
|
116
|
+
- Progress indicators for multi-step processing
|
|
117
|
+
- Color-coded error messages with panels
|
|
118
|
+
- Success summary table with statistics (rows, columns, file sizes)
|
|
119
|
+
- Encoding conversion status (converted/no conversion/remote; ASCII is already UTF-8 compatible)
|
|
120
|
+
- Error summary panel with reject count and error types when validation fails
|
|
121
|
+
- ASCII art banner with `--version` and `-V` verbose mode
|
|
122
|
+
|
|
123
|
+
**Success Example:**
|
|
124
|
+
```
|
|
125
|
+
✓ Success
|
|
126
|
+
Input: test/utf8_basic.csv
|
|
127
|
+
Output: output/utf8_basic.csv
|
|
128
|
+
Encoding: ascii (ASCII is UTF-8 compatible; no conversion needed)
|
|
129
|
+
Rows: 2
|
|
130
|
+
Columns: 3
|
|
131
|
+
Input size: 42 B
|
|
132
|
+
Output size: 43 B
|
|
133
|
+
Headers: normalized to snake_case
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
**Error Example:**
|
|
137
|
+
```
|
|
138
|
+
✓ Success
|
|
139
|
+
Input: test/malformed_rows.csv
|
|
140
|
+
Output: output/malformed_rows.csv
|
|
141
|
+
Encoding: ascii (ASCII is UTF-8 compatible; no conversion needed)
|
|
142
|
+
Rows: 1
|
|
143
|
+
Columns: 4
|
|
144
|
+
Input size: 24 B
|
|
145
|
+
Output size: 40 B
|
|
146
|
+
Headers: normalized to snake_case
|
|
147
|
+
|
|
148
|
+
╭──────────────────────────── ! Validation Failed ─────────────────────────────╮
|
|
149
|
+
│ Validation Errors: │
|
|
150
|
+
│ │
|
|
151
|
+
│ Rejected rows: 2 │
|
|
152
|
+
│ │
|
|
153
|
+
│ Error types: │
|
|
154
|
+
│ • Expected Number of Columns: 3 Found: 2 │
|
|
155
|
+
│ • Expected Number of Columns: 3 Found: 4 │
|
|
156
|
+
│ │
|
|
157
|
+
│ Details: output/malformed_rows_reject_errors.csv │
|
|
158
|
+
╰──────────────────────────────────────────────────────────────────────────────╯
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### Exit Codes
|
|
162
|
+
|
|
163
|
+
| Code | Meaning |
|
|
164
|
+
|------|---------|
|
|
165
|
+
| 0 | Success |
|
|
166
|
+
| 1 | Error (validation failed, file not found, etc.) |
|
|
167
|
+
|
|
168
|
+
## Requirements
|
|
169
|
+
|
|
170
|
+
- Python 3.9+
|
|
171
|
+
- Dependencies (automatically installed):
|
|
172
|
+
- `charset-normalizer>=3.0.0` - Encoding detection
|
|
173
|
+
- `duckdb>=0.9.0` - CSV validation and normalization
|
|
174
|
+
- `rich>=13.0.0` - Modern terminal output formatting
|
|
175
|
+
- `rich-argparse>=1.0.0` - Enhanced CLI help formatting
|
|
176
|
+
- `pyfiglet>=0.8.post1,<1.0.0` - ASCII art banner
|
|
177
|
+
|
|
178
|
+
Optional extras:
|
|
179
|
+
- `[dev]` - Development dependencies (`pytest>=7.0.0`, `pytest-cov>=4.0.0`, `ruff>=0.1.0`)
|
|
180
|
+
|
|
181
|
+
## Development
|
|
182
|
+
|
|
183
|
+
### Setup
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
git clone https://github.com/aborruso/csvnorm
|
|
187
|
+
cd csvnorm
|
|
188
|
+
|
|
189
|
+
# Create and activate venv with uv (recommended)
|
|
190
|
+
uv venv
|
|
191
|
+
source .venv/bin/activate
|
|
192
|
+
uv pip install -e ".[dev]"
|
|
193
|
+
|
|
194
|
+
# Or with pip
|
|
195
|
+
pip install -e ".[dev]"
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
### Testing
|
|
199
|
+
|
|
200
|
+
```bash
|
|
201
|
+
pytest tests/ -v
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
### Project Structure
|
|
205
|
+
|
|
206
|
+
```
|
|
207
|
+
csvnorm/
|
|
208
|
+
├── src/csvnorm/
|
|
209
|
+
│ ├── __init__.py # Package version
|
|
210
|
+
│ ├── __main__.py # python -m support
|
|
211
|
+
│ ├── cli.py # CLI argument parsing
|
|
212
|
+
│ ├── core.py # Main processing pipeline
|
|
213
|
+
│ ├── encoding.py # Encoding detection/conversion
|
|
214
|
+
│ ├── validation.py # DuckDB validation
|
|
215
|
+
│ └── utils.py # Helper functions
|
|
216
|
+
├── tests/ # Test suite
|
|
217
|
+
├── test/ # CSV fixtures
|
|
218
|
+
└── pyproject.toml # Package configuration
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
## License
|
|
222
|
+
|
|
223
|
+
MIT License (c) 2026 aborruso@gmail.com - See LICENSE file for details
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "csvnorm"
|
|
7
|
+
version = "0.3.12"
|
|
8
|
+
description = "A command-line utility to validate and normalize CSV files"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = {file = "LICENSE"}
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "aborruso", email = "aborruso@gmail.com"}
|
|
14
|
+
]
|
|
15
|
+
keywords = ["csv", "data", "normalization", "validation", "etl"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 5 - Production/Stable",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Intended Audience :: Science/Research",
|
|
20
|
+
"License :: OSI Approved :: MIT License",
|
|
21
|
+
"Operating System :: OS Independent",
|
|
22
|
+
"Programming Language :: Python :: 3",
|
|
23
|
+
"Programming Language :: Python :: 3.9",
|
|
24
|
+
"Programming Language :: Python :: 3.10",
|
|
25
|
+
"Programming Language :: Python :: 3.11",
|
|
26
|
+
"Programming Language :: Python :: 3.12",
|
|
27
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
28
|
+
"Topic :: Software Development :: Libraries",
|
|
29
|
+
"Topic :: Utilities",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
dependencies = [
|
|
33
|
+
"charset-normalizer>=3.0.0",
|
|
34
|
+
"duckdb>=0.9.0",
|
|
35
|
+
"rich>=13.0.0",
|
|
36
|
+
"rich-argparse>=1.0.0",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[project.optional-dependencies]
|
|
40
|
+
dev = [
|
|
41
|
+
"pytest>=7.0.0",
|
|
42
|
+
"pytest-cov>=4.0.0",
|
|
43
|
+
"ruff>=0.1.0",
|
|
44
|
+
]
|
|
45
|
+
|
|
46
|
+
[project.urls]
|
|
47
|
+
Homepage = "https://github.com/aborruso/csvnorm"
|
|
48
|
+
Issues = "https://github.com/aborruso/csvnorm/issues"
|
|
49
|
+
|
|
50
|
+
[project.scripts]
|
|
51
|
+
csvnorm = "csvnorm.cli:main"
|
|
52
|
+
|
|
53
|
+
[tool.setuptools.packages.find]
|
|
54
|
+
where = ["src"]
|
|
55
|
+
|
|
56
|
+
[tool.ruff]
|
|
57
|
+
line-length = 88
|
|
58
|
+
target-version = "py39"
|
|
59
|
+
|
|
60
|
+
[tool.pytest.ini_options]
|
|
61
|
+
markers = [
|
|
62
|
+
"network: tests that require network access",
|
|
63
|
+
]
|
csvnorm-0.3.12/setup.cfg
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""csvnorm - Validate and normalize CSV files."""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.3.12"
|
|
4
|
+
__all__ = ["normalize_csv", "detect_encoding", "process_csv"]
|
|
5
|
+
|
|
6
|
+
from csvnorm.core import process_csv
|
|
7
|
+
from csvnorm.encoding import detect_encoding
|
|
8
|
+
from csvnorm.validation import normalize_csv
|