csvnorm 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csvnorm-0.3.0/LICENSE +21 -0
- csvnorm-0.3.0/PKG-INFO +217 -0
- csvnorm-0.3.0/README.md +158 -0
- csvnorm-0.3.0/pyproject.toml +62 -0
- csvnorm-0.3.0/setup.cfg +4 -0
- csvnorm-0.3.0/src/csvnorm/__init__.py +8 -0
- csvnorm-0.3.0/src/csvnorm/__main__.py +6 -0
- csvnorm-0.3.0/src/csvnorm/cli.py +137 -0
- csvnorm-0.3.0/src/csvnorm/core.py +189 -0
- csvnorm-0.3.0/src/csvnorm/encoding.py +119 -0
- csvnorm-0.3.0/src/csvnorm/utils.py +71 -0
- csvnorm-0.3.0/src/csvnorm/validation.py +109 -0
- csvnorm-0.3.0/src/csvnorm.egg-info/PKG-INFO +217 -0
- csvnorm-0.3.0/src/csvnorm.egg-info/SOURCES.txt +19 -0
- csvnorm-0.3.0/src/csvnorm.egg-info/dependency_links.txt +1 -0
- csvnorm-0.3.0/src/csvnorm.egg-info/entry_points.txt +2 -0
- csvnorm-0.3.0/src/csvnorm.egg-info/requires.txt +12 -0
- csvnorm-0.3.0/src/csvnorm.egg-info/top_level.txt +1 -0
- csvnorm-0.3.0/tests/test_encoding.py +80 -0
- csvnorm-0.3.0/tests/test_integration.py +115 -0
- csvnorm-0.3.0/tests/test_utils.py +69 -0
csvnorm-0.3.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 aborruso@gmail.com
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
csvnorm-0.3.0/PKG-INFO
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: csvnorm
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: A command-line utility to validate and normalize CSV files
|
|
5
|
+
Author-email: aborruso <aborruso@gmail.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 aborruso@gmail.com
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/aborruso/prepare_data
|
|
29
|
+
Project-URL: Issues, https://github.com/aborruso/prepare_data/issues
|
|
30
|
+
Keywords: csv,data,normalization,validation,etl
|
|
31
|
+
Classifier: Development Status :: 4 - Beta
|
|
32
|
+
Classifier: Intended Audience :: Developers
|
|
33
|
+
Classifier: Intended Audience :: Science/Research
|
|
34
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
35
|
+
Classifier: Operating System :: OS Independent
|
|
36
|
+
Classifier: Programming Language :: Python :: 3
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
41
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
42
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
43
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
44
|
+
Classifier: Topic :: Utilities
|
|
45
|
+
Requires-Python: >=3.9
|
|
46
|
+
Description-Content-Type: text/markdown
|
|
47
|
+
License-File: LICENSE
|
|
48
|
+
Requires-Dist: charset-normalizer>=3.0.0
|
|
49
|
+
Requires-Dist: duckdb>=0.9.0
|
|
50
|
+
Requires-Dist: rich>=13.0.0
|
|
51
|
+
Requires-Dist: rich-argparse>=1.0.0
|
|
52
|
+
Provides-Extra: dev
|
|
53
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
54
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
55
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
56
|
+
Provides-Extra: banner
|
|
57
|
+
Requires-Dist: pyfiglet>=1.0.0; extra == "banner"
|
|
58
|
+
Dynamic: license-file
|
|
59
|
+
|
|
60
|
+
[DeepWiki](https://deepwiki.com/aborruso/prepare_data)
|
|
61
|
+
[PyPI](https://pypi.org/project/csvnorm/)
|
|
62
|
+
|
|
63
|
+
# csvnorm
|
|
64
|
+
|
|
65
|
+
A command-line utility to validate and normalize CSV files for initial exploration.
|
|
66
|
+
|
|
67
|
+
## Installation
|
|
68
|
+
|
|
69
|
+
Recommended (uv):
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
uv tool install csvnorm
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Or with pip:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
pip install csvnorm
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
For ASCII banner in verbose mode:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
uv tool install csvnorm[banner]
|
|
85
|
+
# or
|
|
86
|
+
pip install csvnorm[banner]
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Purpose
|
|
90
|
+
|
|
91
|
+
This tool prepares CSV files for **basic exploratory data analysis (EDA)**, not for complex transformations. It focuses on achieving a clean, standardized baseline format that allows you to quickly assess data quality and structure before designing more sophisticated ETL pipelines.
|
|
92
|
+
|
|
93
|
+
**What it does:**
|
|
94
|
+
- Validates CSV structure and reports errors
|
|
95
|
+
- Normalizes encoding to UTF-8
|
|
96
|
+
- Normalizes delimiters and field names
|
|
97
|
+
- Creates a consistent starting point for data exploration
|
|
98
|
+
|
|
99
|
+
**What it doesn't do:**
|
|
100
|
+
- Complex data transformations or business logic
|
|
101
|
+
- Type inference or data validation beyond structure
|
|
102
|
+
- Heavy processing or aggregations
|
|
103
|
+
|
|
104
|
+
## Features
|
|
105
|
+
|
|
106
|
+
- **CSV Validation**: Checks for common CSV errors and inconsistencies using DuckDB
|
|
107
|
+
- **Delimiter Normalization**: Converts all field separators to standard commas (`,`)
|
|
108
|
+
- **Field Name Normalization**: Converts column headers to snake_case format
|
|
109
|
+
- **Encoding Normalization**: Auto-detects encoding and converts to UTF-8
|
|
110
|
+
- **Error Reporting**: Exports detailed error file for invalid rows
|
|
111
|
+
|
|
112
|
+
## Usage
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
csvnorm input.csv [options]
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Options
|
|
119
|
+
|
|
120
|
+
| Option | Description |
|
|
121
|
+
|--------|-------------|
|
|
122
|
+
| `-f, --force` | Force overwrite of existing output files |
|
|
123
|
+
| `-n, --keep-names` | Keep original column names (disable snake_case) |
|
|
124
|
+
| `-d, --delimiter CHAR` | Set custom output delimiter (default: `,`) |
|
|
125
|
+
| `-o, --output-dir DIR` | Set output directory (default: current dir) |
|
|
126
|
+
| `-v, --verbose` | Enable verbose output for debugging |
|
|
127
|
+
| `--version` | Show version number |
|
|
128
|
+
| `-h, --help` | Show help message |
|
|
129
|
+
|
|
130
|
+
### Examples
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
# Basic usage
|
|
134
|
+
csvnorm data.csv
|
|
135
|
+
|
|
136
|
+
# With semicolon delimiter
|
|
137
|
+
csvnorm data.csv -d ';'
|
|
138
|
+
|
|
139
|
+
# Custom output directory
|
|
140
|
+
csvnorm data.csv -o ./output
|
|
141
|
+
|
|
142
|
+
# Keep original headers
|
|
143
|
+
csvnorm data.csv --keep-names
|
|
144
|
+
|
|
145
|
+
# Force overwrite with verbose output
|
|
146
|
+
csvnorm data.csv -f -v
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### Output
|
|
150
|
+
|
|
151
|
+
Creates a normalized CSV file in the specified output directory with:
|
|
152
|
+
- UTF-8 encoding
|
|
153
|
+
- Consistent field delimiters
|
|
154
|
+
- Normalized column names (unless `--keep-names` is specified)
|
|
155
|
+
- Error report if any invalid rows are found (saved as `{input_name}_reject_errors.csv`)
|
|
156
|
+
|
|
157
|
+
The tool provides modern terminal output with:
|
|
158
|
+
- Progress indicators for multi-step processing
|
|
159
|
+
- Color-coded error messages with panels
|
|
160
|
+
- Success summary table showing encoding, paths, and settings
|
|
161
|
+
- Optional ASCII art banner in verbose mode (requires `pyfiglet`)
|
|
162
|
+
|
|
163
|
+
### Exit Codes
|
|
164
|
+
|
|
165
|
+
| Code | Meaning |
|
|
166
|
+
|------|---------|
|
|
167
|
+
| 0 | Success |
|
|
168
|
+
| 1 | Error (validation failed, file not found, etc.) |
|
|
169
|
+
|
|
170
|
+
## Requirements
|
|
171
|
+
|
|
172
|
+
- Python 3.9+
|
|
173
|
+
- Dependencies (automatically installed):
|
|
174
|
+
- `charset-normalizer>=3.0.0` - Encoding detection
|
|
175
|
+
- `duckdb>=0.9.0` - CSV validation and normalization
|
|
176
|
+
- `rich>=13.0.0` - Modern terminal output formatting
|
|
177
|
+
- `rich-argparse>=1.0.0` - Enhanced CLI help formatting
|
|
178
|
+
|
|
179
|
+
Optional:
|
|
180
|
+
- `pyfiglet>=1.0.0` - ASCII art banner in verbose mode (install with `pip install csvnorm[banner]`)
|
|
181
|
+
|
|
182
|
+
## Development
|
|
183
|
+
|
|
184
|
+
### Setup
|
|
185
|
+
|
|
186
|
+
```bash
|
|
187
|
+
git clone https://github.com/aborruso/prepare_data
|
|
188
|
+
cd prepare_data
|
|
189
|
+
pip install -e ".[dev]"
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
### Testing
|
|
193
|
+
|
|
194
|
+
```bash
|
|
195
|
+
pytest tests/ -v
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
### Project Structure
|
|
199
|
+
|
|
200
|
+
```
|
|
201
|
+
prepare_data/
|
|
202
|
+
├── src/csvnorm/
|
|
203
|
+
│ ├── __init__.py # Package version
|
|
204
|
+
│ ├── __main__.py # python -m support
|
|
205
|
+
│ ├── cli.py # CLI argument parsing
|
|
206
|
+
│ ├── core.py # Main processing pipeline
|
|
207
|
+
│ ├── encoding.py # Encoding detection/conversion
|
|
208
|
+
│ ├── validation.py # DuckDB validation
|
|
209
|
+
│ └── utils.py # Helper functions
|
|
210
|
+
├── tests/ # Test suite
|
|
211
|
+
├── test/ # CSV fixtures
|
|
212
|
+
└── pyproject.toml # Package configuration
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
## License
|
|
216
|
+
|
|
217
|
+
MIT License (c) 2026 aborruso@gmail.com - See LICENSE file for details
|
csvnorm-0.3.0/README.md
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
[DeepWiki](https://deepwiki.com/aborruso/prepare_data)
|
|
2
|
+
[PyPI](https://pypi.org/project/csvnorm/)
|
|
3
|
+
|
|
4
|
+
# csvnorm
|
|
5
|
+
|
|
6
|
+
A command-line utility to validate and normalize CSV files for initial exploration.
|
|
7
|
+
|
|
8
|
+
## Installation
|
|
9
|
+
|
|
10
|
+
Recommended (uv):
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
uv tool install csvnorm
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
Or with pip:
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install csvnorm
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
For ASCII banner in verbose mode:
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
uv tool install csvnorm[banner]
|
|
26
|
+
# or
|
|
27
|
+
pip install csvnorm[banner]
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Purpose
|
|
31
|
+
|
|
32
|
+
This tool prepares CSV files for **basic exploratory data analysis (EDA)**, not for complex transformations. It focuses on achieving a clean, standardized baseline format that allows you to quickly assess data quality and structure before designing more sophisticated ETL pipelines.
|
|
33
|
+
|
|
34
|
+
**What it does:**
|
|
35
|
+
- Validates CSV structure and reports errors
|
|
36
|
+
- Normalizes encoding to UTF-8
|
|
37
|
+
- Normalizes delimiters and field names
|
|
38
|
+
- Creates a consistent starting point for data exploration
|
|
39
|
+
|
|
40
|
+
**What it doesn't do:**
|
|
41
|
+
- Complex data transformations or business logic
|
|
42
|
+
- Type inference or data validation beyond structure
|
|
43
|
+
- Heavy processing or aggregations
|
|
44
|
+
|
|
45
|
+
## Features
|
|
46
|
+
|
|
47
|
+
- **CSV Validation**: Checks for common CSV errors and inconsistencies using DuckDB
|
|
48
|
+
- **Delimiter Normalization**: Converts all field separators to standard commas (`,`)
|
|
49
|
+
- **Field Name Normalization**: Converts column headers to snake_case format
|
|
50
|
+
- **Encoding Normalization**: Auto-detects encoding and converts to UTF-8
|
|
51
|
+
- **Error Reporting**: Exports detailed error file for invalid rows
|
|
52
|
+
|
|
53
|
+
## Usage
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
csvnorm input.csv [options]
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Options
|
|
60
|
+
|
|
61
|
+
| Option | Description |
|
|
62
|
+
|--------|-------------|
|
|
63
|
+
| `-f, --force` | Force overwrite of existing output files |
|
|
64
|
+
| `-n, --keep-names` | Keep original column names (disable snake_case) |
|
|
65
|
+
| `-d, --delimiter CHAR` | Set custom output delimiter (default: `,`) |
|
|
66
|
+
| `-o, --output-dir DIR` | Set output directory (default: current dir) |
|
|
67
|
+
| `-v, --verbose` | Enable verbose output for debugging |
|
|
68
|
+
| `--version` | Show version number |
|
|
69
|
+
| `-h, --help` | Show help message |
|
|
70
|
+
|
|
71
|
+
### Examples
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
# Basic usage
|
|
75
|
+
csvnorm data.csv
|
|
76
|
+
|
|
77
|
+
# With semicolon delimiter
|
|
78
|
+
csvnorm data.csv -d ';'
|
|
79
|
+
|
|
80
|
+
# Custom output directory
|
|
81
|
+
csvnorm data.csv -o ./output
|
|
82
|
+
|
|
83
|
+
# Keep original headers
|
|
84
|
+
csvnorm data.csv --keep-names
|
|
85
|
+
|
|
86
|
+
# Force overwrite with verbose output
|
|
87
|
+
csvnorm data.csv -f -v
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Output
|
|
91
|
+
|
|
92
|
+
Creates a normalized CSV file in the specified output directory with:
|
|
93
|
+
- UTF-8 encoding
|
|
94
|
+
- Consistent field delimiters
|
|
95
|
+
- Normalized column names (unless `--keep-names` is specified)
|
|
96
|
+
- Error report if any invalid rows are found (saved as `{input_name}_reject_errors.csv`)
|
|
97
|
+
|
|
98
|
+
The tool provides modern terminal output with:
|
|
99
|
+
- Progress indicators for multi-step processing
|
|
100
|
+
- Color-coded error messages with panels
|
|
101
|
+
- Success summary table showing encoding, paths, and settings
|
|
102
|
+
- Optional ASCII art banner in verbose mode (requires `pyfiglet`)
|
|
103
|
+
|
|
104
|
+
### Exit Codes
|
|
105
|
+
|
|
106
|
+
| Code | Meaning |
|
|
107
|
+
|------|---------|
|
|
108
|
+
| 0 | Success |
|
|
109
|
+
| 1 | Error (validation failed, file not found, etc.) |
|
|
110
|
+
|
|
111
|
+
## Requirements
|
|
112
|
+
|
|
113
|
+
- Python 3.9+
|
|
114
|
+
- Dependencies (automatically installed):
|
|
115
|
+
- `charset-normalizer>=3.0.0` - Encoding detection
|
|
116
|
+
- `duckdb>=0.9.0` - CSV validation and normalization
|
|
117
|
+
- `rich>=13.0.0` - Modern terminal output formatting
|
|
118
|
+
- `rich-argparse>=1.0.0` - Enhanced CLI help formatting
|
|
119
|
+
|
|
120
|
+
Optional:
|
|
121
|
+
- `pyfiglet>=1.0.0` - ASCII art banner in verbose mode (install with `pip install csvnorm[banner]`)
|
|
122
|
+
|
|
123
|
+
## Development
|
|
124
|
+
|
|
125
|
+
### Setup
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
git clone https://github.com/aborruso/prepare_data
|
|
129
|
+
cd prepare_data
|
|
130
|
+
pip install -e ".[dev]"
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Testing
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
pytest tests/ -v
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### Project Structure
|
|
140
|
+
|
|
141
|
+
```
|
|
142
|
+
prepare_data/
|
|
143
|
+
├── src/csvnorm/
|
|
144
|
+
│ ├── __init__.py # Package version
|
|
145
|
+
│ ├── __main__.py # python -m support
|
|
146
|
+
│ ├── cli.py # CLI argument parsing
|
|
147
|
+
│ ├── core.py # Main processing pipeline
|
|
148
|
+
│ ├── encoding.py # Encoding detection/conversion
|
|
149
|
+
│ ├── validation.py # DuckDB validation
|
|
150
|
+
│ └── utils.py # Helper functions
|
|
151
|
+
├── tests/ # Test suite
|
|
152
|
+
├── test/ # CSV fixtures
|
|
153
|
+
└── pyproject.toml # Package configuration
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
## License
|
|
157
|
+
|
|
158
|
+
MIT License (c) 2026 aborruso@gmail.com - See LICENSE file for details
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "csvnorm"
|
|
7
|
+
version = "0.3.0"
|
|
8
|
+
description = "A command-line utility to validate and normalize CSV files"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = {file = "LICENSE"}
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "aborruso", email = "aborruso@gmail.com"}
|
|
14
|
+
]
|
|
15
|
+
keywords = ["csv", "data", "normalization", "validation", "etl"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Intended Audience :: Science/Research",
|
|
20
|
+
"License :: OSI Approved :: MIT License",
|
|
21
|
+
"Operating System :: OS Independent",
|
|
22
|
+
"Programming Language :: Python :: 3",
|
|
23
|
+
"Programming Language :: Python :: 3.8",
|
|
24
|
+
"Programming Language :: Python :: 3.9",
|
|
25
|
+
"Programming Language :: Python :: 3.10",
|
|
26
|
+
"Programming Language :: Python :: 3.11",
|
|
27
|
+
"Programming Language :: Python :: 3.12",
|
|
28
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
29
|
+
"Topic :: Software Development :: Libraries",
|
|
30
|
+
"Topic :: Utilities",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
dependencies = [
|
|
34
|
+
"charset-normalizer>=3.0.0",
|
|
35
|
+
"duckdb>=0.9.0",
|
|
36
|
+
"rich>=13.0.0",
|
|
37
|
+
"rich-argparse>=1.0.0",
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
[project.optional-dependencies]
|
|
41
|
+
dev = [
|
|
42
|
+
"pytest>=7.0.0",
|
|
43
|
+
"pytest-cov>=4.0.0",
|
|
44
|
+
"ruff>=0.1.0",
|
|
45
|
+
]
|
|
46
|
+
banner = [
|
|
47
|
+
"pyfiglet>=1.0.0",
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
[project.urls]
|
|
51
|
+
Homepage = "https://github.com/aborruso/prepare_data"
|
|
52
|
+
Issues = "https://github.com/aborruso/prepare_data/issues"
|
|
53
|
+
|
|
54
|
+
[project.scripts]
|
|
55
|
+
csvnorm = "csvnorm.cli:main"
|
|
56
|
+
|
|
57
|
+
[tool.setuptools.packages.find]
|
|
58
|
+
where = ["src"]
|
|
59
|
+
|
|
60
|
+
[tool.ruff]
line-length = 88
# Must track the minimum interpreter declared in `requires-python` (">=3.9");
# targeting py38 would lint against a version the package does not support.
target-version = "py39"
|
csvnorm-0.3.0/setup.cfg
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""csvnorm - Validate and normalize CSV files."""
|
|
2
|
+
|
|
3
|
+
# Package version string; the CLI's --version flag reports this value.
__version__ = "0.3.0"
# Public API of the package; each name is imported further down in this module.
__all__ = ["normalize_csv", "detect_encoding", "process_csv"]
|
|
5
|
+
|
|
6
|
+
from csvnorm.core import process_csv
|
|
7
|
+
from csvnorm.encoding import detect_encoding
|
|
8
|
+
from csvnorm.validation import normalize_csv
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""Command-line interface for csvnorm."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
from rich_argparse import RichHelpFormatter
|
|
9
|
+
|
|
10
|
+
from csvnorm import __version__
|
|
11
|
+
from csvnorm.core import process_csv
|
|
12
|
+
from csvnorm.utils import setup_logger
|
|
13
|
+
|
|
14
|
+
# Module-level Rich console; show_banner() prints through it.
console = Console()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def show_banner() -> None:
    """Print a "csvnorm" ASCII-art banner when pyfiglet can be imported.

    pyfiglet is an optional dependency. If an ImportError is raised while
    importing or rendering, the function returns without printing anything.
    """
    try:
        from pyfiglet import figlet_format as render_figlet

        console.print(
            render_figlet("csvnorm", font="slant"),
            style="bold cyan",
        )
    except ImportError:
        # Optional dependency is absent: no banner, no error.
        return
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def create_parser() -> argparse.ArgumentParser:
    """Build the ``csvnorm`` command-line argument parser.

    Returns:
        An ``argparse.ArgumentParser`` accepting one positional
        ``input_file`` plus the ``--force``, ``--keep-names``,
        ``--delimiter``, ``--output-dir``, ``--verbose`` and ``--version``
        options.
    """
    # Shown after the option list in --help output.
    usage_examples = """\
Examples:
csvnorm data.csv -d ';' -o output_folder --force
csvnorm data.csv --keep-names --delimiter '\\t'
csvnorm data.csv -v
"""
    keep_names_help = (
        "Keep original column names (disable snake_case normalization). "
        "By default, column names are converted to snake_case format "
        "(e.g., 'Column Name' becomes 'column_name')."
    )

    cli = argparse.ArgumentParser(
        prog="csvnorm",
        description="Validate and normalize CSV files for exploratory data analysis",
        formatter_class=RichHelpFormatter,
        epilog=usage_examples,
    )

    # Positional argument: the CSV file to process.
    cli.add_argument("input_file", type=Path, help="Input CSV file path")

    # Options are registered in the order they should appear in --help.
    cli.add_argument(
        "-f", "--force",
        action="store_true",
        help="Force overwrite of existing output files",
    )
    cli.add_argument(
        "-n", "--keep-names",
        action="store_true",
        help=keep_names_help,
    )
    cli.add_argument(
        "-d", "--delimiter",
        default=",",
        help="Set custom field delimiter (default: comma). Example: -d ';'",
    )
    cli.add_argument(
        "-o", "--output-dir",
        type=Path,
        # Resolved when the parser is built, not when arguments are parsed.
        default=Path.cwd(),
        help="Set custom output directory (default: current working directory)",
    )
    cli.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Enable verbose output for debugging",
    )
    cli.add_argument(
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
    )

    return cli
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# NOTE: the annotation is quoted on purpose. Unquoted, ``list[str] | None`` is
# evaluated when the function is defined and raises TypeError on Python 3.9
# (the ``X | Y`` union operator on types arrived in 3.10), yet the package
# metadata declares Python >=3.9 as supported.
def main(argv: "list[str] | None" = None) -> int:
    """Main entry point for the CLI.

    Args:
        argv: Command line arguments (defaults to sys.argv[1:]).

    Returns:
        Exit code. 2 when invoked with no arguments at all (help is printed),
        0 when the only argument is ``-h``/``--help`` (help is printed),
        otherwise the value returned by ``process_csv`` (described by the
        original contract as 0 for success, 1 for error).

    Raises:
        SystemExit: Raised by argparse itself for invalid arguments or for
            ``--version``.
    """
    parser = create_parser()

    if argv is None:
        argv = sys.argv[1:]

    # No arguments at all: show the help text but report a usage error.
    if not argv:
        parser.print_help()
        return 2

    # A lone help flag is answered here so the function returns 0 instead of
    # letting argparse raise SystemExit.
    if len(argv) == 1 and argv[0] in ("-h", "--help"):
        parser.print_help()
        return 0

    args = parser.parse_args(argv)

    # Show banner in verbose mode
    if args.verbose:
        show_banner()

    # Setup logging
    setup_logger(args.verbose)

    # Run processing
    return process_csv(
        input_file=args.input_file,
        output_dir=args.output_dir,
        force=args.force,
        keep_names=args.keep_names,
        delimiter=args.delimiter,
        verbose=args.verbose,
    )
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
# Script entry point: run the CLI and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|