csvnorm 0.3.4__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csvnorm/__init__.py +1 -1
- csvnorm/cli.py +18 -12
- csvnorm/core.py +84 -103
- csvnorm/ui.py +124 -0
- csvnorm/utils.py +66 -0
- csvnorm/validation.py +42 -3
- {csvnorm-0.3.4.dist-info → csvnorm-0.3.11.dist-info}/METADATA +50 -11
- csvnorm-0.3.11.dist-info/RECORD +14 -0
- csvnorm-0.3.4.dist-info/RECORD +0 -13
- {csvnorm-0.3.4.dist-info → csvnorm-0.3.11.dist-info}/WHEEL +0 -0
- {csvnorm-0.3.4.dist-info → csvnorm-0.3.11.dist-info}/entry_points.txt +0 -0
- {csvnorm-0.3.4.dist-info → csvnorm-0.3.11.dist-info}/licenses/LICENSE +0 -0
- {csvnorm-0.3.4.dist-info → csvnorm-0.3.11.dist-info}/top_level.txt +0 -0
csvnorm/__init__.py
CHANGED
csvnorm/cli.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
import argparse
|
|
4
4
|
import sys
|
|
5
5
|
from pathlib import Path
|
|
6
|
+
from typing import Optional
|
|
6
7
|
|
|
7
8
|
from rich.console import Console
|
|
8
9
|
from rich_argparse import RichHelpFormatter
|
|
@@ -15,11 +16,10 @@ console = Console()
|
|
|
15
16
|
|
|
16
17
|
|
|
17
18
|
def show_banner() -> None:
|
|
18
|
-
"""Show
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
console.print(banner, style="bold cyan")
|
|
19
|
+
"""Show simple styled banner."""
|
|
20
|
+
console.print()
|
|
21
|
+
console.print(" csvnorm ", style="bold cyan on black", justify="center")
|
|
22
|
+
console.print()
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
class VersionAction(argparse.Action):
|
|
@@ -50,9 +50,9 @@ def create_parser() -> argparse.ArgumentParser:
|
|
|
50
50
|
formatter_class=RichHelpFormatter,
|
|
51
51
|
epilog="""\
|
|
52
52
|
Examples:
|
|
53
|
-
csvnorm data.csv -d ';' -o
|
|
53
|
+
csvnorm data.csv -d ';' -o output.csv --force
|
|
54
54
|
csvnorm data.csv --keep-names --delimiter '\\t'
|
|
55
|
-
csvnorm https://example.com/data.csv -o
|
|
55
|
+
csvnorm https://example.com/data.csv -o processed/data.csv
|
|
56
56
|
csvnorm data.csv -V
|
|
57
57
|
""",
|
|
58
58
|
)
|
|
@@ -90,10 +90,9 @@ Examples:
|
|
|
90
90
|
|
|
91
91
|
parser.add_argument(
|
|
92
92
|
"-o",
|
|
93
|
-
"--output-
|
|
93
|
+
"--output-file",
|
|
94
94
|
type=Path,
|
|
95
|
-
|
|
96
|
-
help="Set custom output directory (default: current working directory)",
|
|
95
|
+
help="Set output file path (absolute or relative)",
|
|
97
96
|
)
|
|
98
97
|
|
|
99
98
|
parser.add_argument(
|
|
@@ -114,7 +113,7 @@ Examples:
|
|
|
114
113
|
return parser
|
|
115
114
|
|
|
116
115
|
|
|
117
|
-
def main(argv: list[str]
|
|
116
|
+
def main(argv: Optional[list[str]] = None) -> int:
|
|
118
117
|
"""Main entry point for the CLI.
|
|
119
118
|
|
|
120
119
|
Args:
|
|
@@ -142,10 +141,17 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
142
141
|
# Setup logging
|
|
143
142
|
setup_logger(args.verbose)
|
|
144
143
|
|
|
144
|
+
# Determine output file (default: input filename in current directory)
|
|
145
|
+
if args.output_file is None:
|
|
146
|
+
input_name = Path(args.input_file).name
|
|
147
|
+
output_file = Path.cwd() / input_name
|
|
148
|
+
else:
|
|
149
|
+
output_file = args.output_file
|
|
150
|
+
|
|
145
151
|
# Run processing
|
|
146
152
|
return process_csv(
|
|
147
153
|
input_file=args.input_file,
|
|
148
|
-
|
|
154
|
+
output_file=output_file,
|
|
149
155
|
force=args.force,
|
|
150
156
|
keep_names=args.keep_names,
|
|
151
157
|
delimiter=args.delimiter,
|
csvnorm/core.py
CHANGED
|
@@ -1,18 +1,24 @@
|
|
|
1
1
|
"""Core processing logic for csvnorm."""
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
+
import tempfile
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
from typing import Union
|
|
6
7
|
|
|
7
8
|
from rich.console import Console
|
|
8
|
-
from rich.panel import Panel
|
|
9
9
|
from rich.progress import Progress, SpinnerColumn, TextColumn
|
|
10
|
-
from rich.table import Table
|
|
11
10
|
|
|
12
11
|
from csvnorm.encoding import convert_to_utf8, detect_encoding, needs_conversion
|
|
12
|
+
from csvnorm.ui import (
|
|
13
|
+
show_error_panel,
|
|
14
|
+
show_success_table,
|
|
15
|
+
show_validation_error_panel,
|
|
16
|
+
show_warning_panel,
|
|
17
|
+
)
|
|
13
18
|
from csvnorm.utils import (
|
|
14
|
-
ensure_output_dir,
|
|
15
19
|
extract_filename_from_url,
|
|
20
|
+
get_column_count,
|
|
21
|
+
get_row_count,
|
|
16
22
|
is_url,
|
|
17
23
|
to_snake_case,
|
|
18
24
|
validate_delimiter,
|
|
@@ -26,7 +32,7 @@ console = Console()
|
|
|
26
32
|
|
|
27
33
|
def process_csv(
|
|
28
34
|
input_file: str,
|
|
29
|
-
|
|
35
|
+
output_file: Path,
|
|
30
36
|
force: bool = False,
|
|
31
37
|
keep_names: bool = False,
|
|
32
38
|
delimiter: str = ",",
|
|
@@ -36,7 +42,7 @@ def process_csv(
|
|
|
36
42
|
|
|
37
43
|
Args:
|
|
38
44
|
input_file: Path to input CSV file or HTTP/HTTPS URL.
|
|
39
|
-
|
|
45
|
+
output_file: Full path for output file.
|
|
40
46
|
force: If True, overwrite existing output files.
|
|
41
47
|
keep_names: If True, keep original column names.
|
|
42
48
|
delimiter: Output field delimiter.
|
|
@@ -54,7 +60,7 @@ def process_csv(
|
|
|
54
60
|
try:
|
|
55
61
|
validate_url(input_file)
|
|
56
62
|
except ValueError as e:
|
|
57
|
-
|
|
63
|
+
show_error_panel(str(e))
|
|
58
64
|
return 1
|
|
59
65
|
base_name = extract_filename_from_url(input_file)
|
|
60
66
|
input_path = input_file # Keep as string for DuckDB
|
|
@@ -62,21 +68,11 @@ def process_csv(
|
|
|
62
68
|
# Validate local file
|
|
63
69
|
file_path = Path(input_file)
|
|
64
70
|
if not file_path.exists():
|
|
65
|
-
|
|
66
|
-
Panel(
|
|
67
|
-
f"[bold red]Error:[/bold red] Input file not found\n{file_path}",
|
|
68
|
-
border_style="red",
|
|
69
|
-
)
|
|
70
|
-
)
|
|
71
|
+
show_error_panel(f"Input file not found\n{file_path}")
|
|
71
72
|
return 1
|
|
72
73
|
|
|
73
74
|
if not file_path.is_file():
|
|
74
|
-
|
|
75
|
-
Panel(
|
|
76
|
-
f"[bold red]Error:[/bold red] Not a file\n{file_path}",
|
|
77
|
-
border_style="red",
|
|
78
|
-
)
|
|
79
|
-
)
|
|
75
|
+
show_error_panel(f"Not a file\n{file_path}")
|
|
80
76
|
return 1
|
|
81
77
|
|
|
82
78
|
base_name = to_snake_case(file_path.name)
|
|
@@ -85,34 +81,30 @@ def process_csv(
|
|
|
85
81
|
try:
|
|
86
82
|
validate_delimiter(delimiter)
|
|
87
83
|
except ValueError as e:
|
|
88
|
-
|
|
84
|
+
show_error_panel(str(e))
|
|
89
85
|
return 1
|
|
90
86
|
|
|
91
87
|
# Setup paths
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
temp_utf8_file = output_dir / f"{base_name}_utf8.csv"
|
|
88
|
+
output_dir = output_file.parent
|
|
89
|
+
temp_dir = Path(tempfile.mkdtemp(prefix="csvnorm_"))
|
|
90
|
+
reject_file = output_dir / f"{output_file.stem}_reject_errors.csv"
|
|
91
|
+
temp_utf8_file = temp_dir / f"{output_file.stem}_utf8.csv"
|
|
97
92
|
|
|
98
93
|
# Check if output exists
|
|
99
94
|
if output_file.exists() and not force:
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
f"Use [bold]--force[/bold] to overwrite.",
|
|
105
|
-
border_style="yellow",
|
|
106
|
-
)
|
|
95
|
+
show_warning_panel(
|
|
96
|
+
f"Output file already exists\n\n"
|
|
97
|
+
f"{output_file}\n\n"
|
|
98
|
+
f"Use [bold]--force[/bold] to overwrite."
|
|
107
99
|
)
|
|
108
100
|
return 1
|
|
109
101
|
|
|
110
|
-
# Clean up previous reject file
|
|
102
|
+
# Clean up previous reject file (always overwrite)
|
|
111
103
|
if reject_file.exists():
|
|
112
104
|
reject_file.unlink()
|
|
113
105
|
|
|
114
106
|
# Track files to clean up
|
|
115
|
-
temp_files: list[Path] = []
|
|
107
|
+
temp_files: list[Path] = [temp_dir]
|
|
116
108
|
|
|
117
109
|
try:
|
|
118
110
|
with Progress(
|
|
@@ -142,9 +134,7 @@ def process_csv(
|
|
|
142
134
|
encoding = detect_encoding(file_input_path)
|
|
143
135
|
except ValueError as e:
|
|
144
136
|
progress.stop()
|
|
145
|
-
|
|
146
|
-
Panel(f"[bold red]Error:[/bold red] {e}", border_style="red")
|
|
147
|
-
)
|
|
137
|
+
show_error_panel(str(e))
|
|
148
138
|
return 1
|
|
149
139
|
|
|
150
140
|
logger.debug(f"Detected encoding: {encoding}")
|
|
@@ -164,16 +154,11 @@ def process_csv(
|
|
|
164
154
|
working_file = temp_utf8_file
|
|
165
155
|
temp_files.append(temp_utf8_file)
|
|
166
156
|
progress.update(
|
|
167
|
-
task, description=
|
|
157
|
+
task, description="[green]✓[/green] Converted to UTF-8"
|
|
168
158
|
)
|
|
169
159
|
except (UnicodeDecodeError, LookupError) as e:
|
|
170
160
|
progress.stop()
|
|
171
|
-
|
|
172
|
-
Panel(
|
|
173
|
-
f"[bold red]Error:[/bold red] Encoding conversion failed\n{e}",
|
|
174
|
-
border_style="red",
|
|
175
|
-
)
|
|
176
|
-
)
|
|
161
|
+
show_error_panel(f"Encoding conversion failed\n{e}")
|
|
177
162
|
return 1
|
|
178
163
|
else:
|
|
179
164
|
progress.update(
|
|
@@ -186,7 +171,9 @@ def process_csv(
|
|
|
186
171
|
logger.debug("Validating CSV with DuckDB...")
|
|
187
172
|
|
|
188
173
|
try:
|
|
189
|
-
|
|
174
|
+
reject_count, error_types = validate_csv(
|
|
175
|
+
working_file, reject_file, is_remote=is_remote
|
|
176
|
+
)
|
|
190
177
|
except Exception as e:
|
|
191
178
|
progress.stop()
|
|
192
179
|
error_msg = str(e)
|
|
@@ -194,61 +181,38 @@ def process_csv(
|
|
|
194
181
|
# Check for common HTTP errors
|
|
195
182
|
if "HTTP Error" in error_msg or "HTTPException" in error_msg:
|
|
196
183
|
if "404" in error_msg:
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
"Please check the URL is correct.",
|
|
202
|
-
border_style="red",
|
|
203
|
-
)
|
|
184
|
+
show_error_panel(
|
|
185
|
+
f"Remote CSV file not found (HTTP 404)\n\n"
|
|
186
|
+
f"URL: [cyan]{input_file}[/cyan]\n\n"
|
|
187
|
+
"Please check the URL is correct."
|
|
204
188
|
)
|
|
205
189
|
elif "401" in error_msg or "403" in error_msg:
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
"Please download the file manually first.",
|
|
212
|
-
border_style="red",
|
|
213
|
-
)
|
|
190
|
+
show_error_panel(
|
|
191
|
+
f"Authentication required (HTTP 401/403)\n\n"
|
|
192
|
+
f"URL: [cyan]{input_file}[/cyan]\n\n"
|
|
193
|
+
"This tool only supports public URLs without authentication.\n"
|
|
194
|
+
"Please download the file manually first."
|
|
214
195
|
)
|
|
215
196
|
elif (
|
|
216
197
|
"timeout" in error_msg.lower()
|
|
217
198
|
or "timed out" in error_msg.lower()
|
|
218
199
|
):
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
"Try again later or download the file manually.",
|
|
225
|
-
border_style="red",
|
|
226
|
-
)
|
|
200
|
+
show_error_panel(
|
|
201
|
+
f"HTTP request timeout (30 seconds)\n\n"
|
|
202
|
+
f"URL: [cyan]{input_file}[/cyan]\n\n"
|
|
203
|
+
"The remote server took too long to respond.\n"
|
|
204
|
+
"Try again later or download the file manually."
|
|
227
205
|
)
|
|
228
206
|
else:
|
|
229
|
-
|
|
230
|
-
Panel(
|
|
231
|
-
f"[bold red]Error:[/bold red] HTTP request failed\n\n"
|
|
232
|
-
f"{error_msg}",
|
|
233
|
-
border_style="red",
|
|
234
|
-
)
|
|
235
|
-
)
|
|
207
|
+
show_error_panel(f"HTTP request failed\n\n{error_msg}")
|
|
236
208
|
else:
|
|
237
209
|
# Re-raise non-HTTP errors
|
|
238
210
|
raise
|
|
239
211
|
return 1
|
|
240
212
|
|
|
241
|
-
|
|
213
|
+
has_validation_errors = reject_count > 1
|
|
214
|
+
if has_validation_errors:
|
|
242
215
|
progress.stop()
|
|
243
|
-
console.print(
|
|
244
|
-
Panel(
|
|
245
|
-
"[bold red]Error:[/bold red] DuckDB encountered invalid rows\n\n"
|
|
246
|
-
f"Details: [cyan]{reject_file}[/cyan]\n\n"
|
|
247
|
-
"Please fix the issues and try again.",
|
|
248
|
-
border_style="red",
|
|
249
|
-
)
|
|
250
|
-
)
|
|
251
|
-
return 1
|
|
252
216
|
|
|
253
217
|
progress.update(task, description="[green]✓[/green] CSV validated")
|
|
254
218
|
|
|
@@ -266,27 +230,44 @@ def process_csv(
|
|
|
266
230
|
logger.debug(f"Output written to: {output_file}")
|
|
267
231
|
progress.update(task, description="[green]✓[/green] Complete")
|
|
268
232
|
|
|
269
|
-
#
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
233
|
+
# Collect statistics
|
|
234
|
+
input_size = (
|
|
235
|
+
working_file.stat().st_size if isinstance(working_file, Path) else 0
|
|
236
|
+
)
|
|
237
|
+
output_size = output_file.stat().st_size
|
|
238
|
+
row_count = get_row_count(output_file)
|
|
239
|
+
column_count = get_column_count(output_file, delimiter)
|
|
240
|
+
|
|
241
|
+
# Show success summary
|
|
242
|
+
show_success_table(
|
|
243
|
+
input_file=input_file,
|
|
244
|
+
output_file=output_file,
|
|
245
|
+
encoding=encoding,
|
|
246
|
+
is_remote=is_remote,
|
|
247
|
+
row_count=row_count,
|
|
248
|
+
column_count=column_count,
|
|
249
|
+
input_size=input_size,
|
|
250
|
+
output_size=output_size,
|
|
251
|
+
delimiter=delimiter,
|
|
252
|
+
keep_names=keep_names,
|
|
253
|
+
)
|
|
254
|
+
|
|
255
|
+
# Show validation errors if any
|
|
256
|
+
if has_validation_errors:
|
|
257
|
+
show_validation_error_panel(reject_count, error_types, reject_file)
|
|
258
|
+
return 1
|
|
283
259
|
|
|
284
260
|
finally:
|
|
285
|
-
# Cleanup temp
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
261
|
+
# Cleanup temp directory
|
|
262
|
+
import shutil
|
|
263
|
+
|
|
264
|
+
for temp_path in temp_files:
|
|
265
|
+
if temp_path.exists():
|
|
266
|
+
logger.debug(f"Removing temp path: {temp_path}")
|
|
267
|
+
if temp_path.is_dir():
|
|
268
|
+
shutil.rmtree(temp_path)
|
|
269
|
+
else:
|
|
270
|
+
temp_path.unlink()
|
|
290
271
|
|
|
291
272
|
# Remove reject file if empty (only header)
|
|
292
273
|
if reject_file.exists():
|
csvnorm/ui.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""UI formatting functions for csvnorm terminal output."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from rich.console import Console
|
|
6
|
+
from rich.panel import Panel
|
|
7
|
+
from rich.table import Table
|
|
8
|
+
|
|
9
|
+
from csvnorm.encoding import needs_conversion
|
|
10
|
+
from csvnorm.utils import format_file_size
|
|
11
|
+
|
|
12
|
+
console = Console()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def show_error_panel(message: str, title: str = "Error") -> None:
|
|
16
|
+
"""Display an error panel with red border.
|
|
17
|
+
|
|
18
|
+
Args:
|
|
19
|
+
message: Error message to display.
|
|
20
|
+
title: Panel title (default: "Error").
|
|
21
|
+
"""
|
|
22
|
+
console.print(Panel(f"[bold red]{title}:[/bold red] {message}", border_style="red"))
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def show_warning_panel(message: str, title: str = "Warning") -> None:
|
|
26
|
+
"""Display a warning panel with yellow border.
|
|
27
|
+
|
|
28
|
+
Args:
|
|
29
|
+
message: Warning message to display.
|
|
30
|
+
title: Panel title (default: "Warning").
|
|
31
|
+
"""
|
|
32
|
+
console.print(
|
|
33
|
+
Panel(f"[bold yellow]{title}:[/bold yellow] {message}", border_style="yellow")
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def show_success_table(
|
|
38
|
+
input_file: str,
|
|
39
|
+
output_file: Path,
|
|
40
|
+
encoding: str,
|
|
41
|
+
is_remote: bool,
|
|
42
|
+
row_count: int,
|
|
43
|
+
column_count: int,
|
|
44
|
+
input_size: int,
|
|
45
|
+
output_size: int,
|
|
46
|
+
delimiter: str,
|
|
47
|
+
keep_names: bool,
|
|
48
|
+
) -> None:
|
|
49
|
+
"""Display success summary table with processing results.
|
|
50
|
+
|
|
51
|
+
Args:
|
|
52
|
+
input_file: Input CSV file path or URL.
|
|
53
|
+
output_file: Output CSV file path.
|
|
54
|
+
encoding: Detected encoding (or "remote" for URLs).
|
|
55
|
+
is_remote: Whether input was a remote URL.
|
|
56
|
+
row_count: Number of data rows in output.
|
|
57
|
+
column_count: Number of columns in output.
|
|
58
|
+
input_size: Input file size in bytes (0 for remote).
|
|
59
|
+
output_size: Output file size in bytes.
|
|
60
|
+
delimiter: Output delimiter character.
|
|
61
|
+
keep_names: Whether original column names were kept.
|
|
62
|
+
"""
|
|
63
|
+
table = Table(show_header=False, box=None, padding=(0, 1))
|
|
64
|
+
table.add_row("[green]✓[/green] Success", "")
|
|
65
|
+
table.add_row("Input:", f"[cyan]{input_file}[/cyan]")
|
|
66
|
+
table.add_row("Output:", f"[cyan]{output_file}[/cyan]")
|
|
67
|
+
|
|
68
|
+
# Encoding info
|
|
69
|
+
if not is_remote:
|
|
70
|
+
if needs_conversion(encoding):
|
|
71
|
+
table.add_row("Encoding:", f"{encoding} → UTF-8 [dim](converted)[/dim]")
|
|
72
|
+
else:
|
|
73
|
+
table.add_row("Encoding:", f"{encoding} [dim](no conversion needed)[/dim]")
|
|
74
|
+
else:
|
|
75
|
+
table.add_row("Encoding:", "remote [dim](handled by DuckDB)[/dim]")
|
|
76
|
+
|
|
77
|
+
# Statistics
|
|
78
|
+
table.add_row("Rows:", f"{row_count:,}")
|
|
79
|
+
table.add_row("Columns:", f"{column_count}")
|
|
80
|
+
table.add_row("Input size:", format_file_size(input_size))
|
|
81
|
+
table.add_row("Output size:", format_file_size(output_size))
|
|
82
|
+
|
|
83
|
+
# Optional fields
|
|
84
|
+
if delimiter != ",":
|
|
85
|
+
table.add_row("Delimiter:", repr(delimiter))
|
|
86
|
+
if not keep_names:
|
|
87
|
+
table.add_row("Headers:", "normalized to snake_case")
|
|
88
|
+
|
|
89
|
+
console.print()
|
|
90
|
+
console.print(table)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def show_validation_error_panel(
|
|
94
|
+
reject_count: int, error_types: list[str], reject_file: Path
|
|
95
|
+
) -> None:
|
|
96
|
+
"""Display validation error summary panel.
|
|
97
|
+
|
|
98
|
+
Args:
|
|
99
|
+
reject_count: Number of rejected rows (including header).
|
|
100
|
+
error_types: List of error type descriptions.
|
|
101
|
+
reject_file: Path to reject errors CSV file.
|
|
102
|
+
"""
|
|
103
|
+
console.print()
|
|
104
|
+
error_lines = []
|
|
105
|
+
error_lines.append("[bold red]Validation Errors:[/bold red]")
|
|
106
|
+
error_lines.append("")
|
|
107
|
+
error_lines.append(f"Rejected rows: [yellow]{reject_count - 1}[/yellow]")
|
|
108
|
+
|
|
109
|
+
if error_types:
|
|
110
|
+
error_lines.append("")
|
|
111
|
+
error_lines.append("[dim]Error types:[/dim]")
|
|
112
|
+
for error_type in error_types:
|
|
113
|
+
error_lines.append(f" • {error_type}")
|
|
114
|
+
|
|
115
|
+
error_lines.append("")
|
|
116
|
+
error_lines.append(f"Details: [cyan]{reject_file}[/cyan]")
|
|
117
|
+
|
|
118
|
+
console.print(
|
|
119
|
+
Panel(
|
|
120
|
+
"\n".join(error_lines),
|
|
121
|
+
border_style="yellow",
|
|
122
|
+
title="[yellow]![/yellow] Validation Failed",
|
|
123
|
+
)
|
|
124
|
+
)
|
csvnorm/utils.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
import logging
|
|
4
4
|
import re
|
|
5
5
|
from pathlib import Path
|
|
6
|
+
from typing import Union
|
|
6
7
|
from urllib.parse import urlparse
|
|
7
8
|
|
|
8
9
|
from rich.logging import RichHandler
|
|
@@ -124,3 +125,68 @@ def extract_filename_from_url(url: str) -> str:
|
|
|
124
125
|
|
|
125
126
|
# Apply snake_case normalization
|
|
126
127
|
return to_snake_case(filename) if filename else "data"
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def format_file_size(size_bytes: int) -> str:
|
|
131
|
+
"""Format file size in human-readable format.
|
|
132
|
+
|
|
133
|
+
Args:
|
|
134
|
+
size_bytes: File size in bytes.
|
|
135
|
+
|
|
136
|
+
Returns:
|
|
137
|
+
Formatted size string (e.g., "1.5 MB", "256 KB").
|
|
138
|
+
"""
|
|
139
|
+
for unit in ["B", "KB", "MB", "GB"]:
|
|
140
|
+
if size_bytes < 1024.0:
|
|
141
|
+
return f"{size_bytes:.1f} {unit}" if unit != "B" else f"{size_bytes} B"
|
|
142
|
+
size_bytes /= 1024.0
|
|
143
|
+
return f"{size_bytes:.1f} TB"
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def get_row_count(file_path: Union[Path, str]) -> int:
|
|
147
|
+
"""Count number of rows in a CSV file.
|
|
148
|
+
|
|
149
|
+
Args:
|
|
150
|
+
file_path: Path to CSV file.
|
|
151
|
+
|
|
152
|
+
Returns:
|
|
153
|
+
Number of data rows (excluding header), or 0 if file doesn't exist.
|
|
154
|
+
"""
|
|
155
|
+
if not isinstance(file_path, Path) or not file_path.exists():
|
|
156
|
+
return 0
|
|
157
|
+
|
|
158
|
+
try:
|
|
159
|
+
with open(file_path, "r") as f:
|
|
160
|
+
# Skip header
|
|
161
|
+
next(f, None)
|
|
162
|
+
return sum(1 for _ in f)
|
|
163
|
+
except Exception:
|
|
164
|
+
return 0
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def get_column_count(file_path: Union[Path, str], delimiter: str = ",") -> int:
|
|
168
|
+
"""Count number of columns in a CSV file using DuckDB.
|
|
169
|
+
|
|
170
|
+
Args:
|
|
171
|
+
file_path: Path to CSV file.
|
|
172
|
+
delimiter: Field delimiter used in the CSV file.
|
|
173
|
+
|
|
174
|
+
Returns:
|
|
175
|
+
Number of columns in the CSV, or 0 if file doesn't exist or error.
|
|
176
|
+
"""
|
|
177
|
+
if not isinstance(file_path, Path) or not file_path.exists():
|
|
178
|
+
return 0
|
|
179
|
+
|
|
180
|
+
try:
|
|
181
|
+
import duckdb
|
|
182
|
+
|
|
183
|
+
conn = duckdb.connect(":memory:")
|
|
184
|
+
# Get column names from CSV using DuckDB DESCRIBE
|
|
185
|
+
columns = conn.execute(
|
|
186
|
+
f"DESCRIBE SELECT * FROM read_csv('{file_path}', delim='{delimiter}', header=true, sample_size=1)"
|
|
187
|
+
).fetchall()
|
|
188
|
+
conn.close()
|
|
189
|
+
|
|
190
|
+
return len(columns)
|
|
191
|
+
except Exception:
|
|
192
|
+
return 0
|
csvnorm/validation.py
CHANGED
|
@@ -11,7 +11,7 @@ logger = logging.getLogger("csvnorm")
|
|
|
11
11
|
|
|
12
12
|
def validate_csv(
|
|
13
13
|
file_path: Union[Path, str], reject_file: Path, is_remote: bool = False
|
|
14
|
-
) ->
|
|
14
|
+
) -> tuple[int, list[str]]:
|
|
15
15
|
"""Validate CSV file using DuckDB and export rejected rows.
|
|
16
16
|
|
|
17
17
|
Args:
|
|
@@ -20,7 +20,8 @@ def validate_csv(
|
|
|
20
20
|
is_remote: True if file_path is a remote URL.
|
|
21
21
|
|
|
22
22
|
Returns:
|
|
23
|
-
|
|
23
|
+
Tuple of (reject_count, error_types) where error_types is list of
|
|
24
|
+
up to 3 unique error reasons from reject file.
|
|
24
25
|
"""
|
|
25
26
|
logger.debug(f"Validating CSV: {file_path}")
|
|
26
27
|
|
|
@@ -54,7 +55,12 @@ def validate_csv(
|
|
|
54
55
|
reject_count = _count_lines(reject_file)
|
|
55
56
|
logger.debug(f"Reject file lines: {reject_count}")
|
|
56
57
|
|
|
57
|
-
|
|
58
|
+
# Collect sample error types from reject file
|
|
59
|
+
error_types = []
|
|
60
|
+
if reject_count > 1:
|
|
61
|
+
error_types = _get_error_types(reject_file)
|
|
62
|
+
|
|
63
|
+
return reject_count, error_types
|
|
58
64
|
|
|
59
65
|
|
|
60
66
|
def normalize_csv(
|
|
@@ -121,3 +127,36 @@ def _count_lines(file_path: Path) -> int:
|
|
|
121
127
|
|
|
122
128
|
with open(file_path, "r") as f:
|
|
123
129
|
return sum(1 for _ in f)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _get_error_types(reject_file: Path) -> list[str]:
|
|
133
|
+
"""Extract sample error types from reject file.
|
|
134
|
+
|
|
135
|
+
Args:
|
|
136
|
+
reject_file: Path to reject_errors.csv file.
|
|
137
|
+
|
|
138
|
+
Returns:
|
|
139
|
+
List of up to 3 unique error reasons.
|
|
140
|
+
"""
|
|
141
|
+
if not reject_file.exists():
|
|
142
|
+
return []
|
|
143
|
+
|
|
144
|
+
error_types: set[str] = set()
|
|
145
|
+
try:
|
|
146
|
+
with open(reject_file, "r") as f:
|
|
147
|
+
# Skip header
|
|
148
|
+
next(f, None)
|
|
149
|
+
for line in f:
|
|
150
|
+
# Error message is in the last column
|
|
151
|
+
parts = line.rstrip("\n").split(",")
|
|
152
|
+
if parts:
|
|
153
|
+
error_reason = parts[-1].strip()
|
|
154
|
+
if error_reason and error_reason != "error":
|
|
155
|
+
error_types.add(error_reason)
|
|
156
|
+
if len(error_types) >= 3:
|
|
157
|
+
break
|
|
158
|
+
except Exception as e:
|
|
159
|
+
logger.warning(f"Failed to extract error types: {e}")
|
|
160
|
+
return []
|
|
161
|
+
|
|
162
|
+
return list(error_types)[:3]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: csvnorm
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.11
|
|
4
4
|
Summary: A command-line utility to validate and normalize CSV files
|
|
5
5
|
Author-email: aborruso <aborruso@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -34,7 +34,6 @@ Classifier: Intended Audience :: Science/Research
|
|
|
34
34
|
Classifier: License :: OSI Approved :: MIT License
|
|
35
35
|
Classifier: Operating System :: OS Independent
|
|
36
36
|
Classifier: Programming Language :: Python :: 3
|
|
37
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
38
37
|
Classifier: Programming Language :: Python :: 3.9
|
|
39
38
|
Classifier: Programming Language :: Python :: 3.10
|
|
40
39
|
Classifier: Programming Language :: Python :: 3.11
|
|
@@ -49,7 +48,6 @@ Requires-Dist: charset-normalizer>=3.0.0
|
|
|
49
48
|
Requires-Dist: duckdb>=0.9.0
|
|
50
49
|
Requires-Dist: rich>=13.0.0
|
|
51
50
|
Requires-Dist: rich-argparse>=1.0.0
|
|
52
|
-
Requires-Dist: pyfiglet>=1.0.0
|
|
53
51
|
Provides-Extra: dev
|
|
54
52
|
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
55
53
|
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
@@ -58,7 +56,7 @@ Dynamic: license-file
|
|
|
58
56
|
|
|
59
57
|
[](https://pypi.org/project/csvnorm/)
|
|
60
58
|
[](https://opensource.org/licenses/MIT)
|
|
61
|
-
[](https://www.python.org/downloads/)
|
|
62
60
|
[](https://deepwiki.com/aborruso/csvnorm)
|
|
63
61
|
|
|
64
62
|
# csvnorm
|
|
@@ -100,7 +98,8 @@ This tool prepares CSV files for **basic exploratory data analysis (EDA)**, not
|
|
|
100
98
|
- **Delimiter Normalization**: Converts all field separators to standard commas (`,`)
|
|
101
99
|
- **Field Name Normalization**: Converts column headers to snake_case format
|
|
102
100
|
- **Encoding Normalization**: Auto-detects encoding and converts to UTF-8
|
|
103
|
-
- **
|
|
101
|
+
- **Processing Summary**: Displays comprehensive statistics (rows, columns, file sizes) and error details
|
|
102
|
+
- **Error Reporting**: Exports detailed error file for invalid rows with summary panel
|
|
104
103
|
- **Remote URL Support**: Process CSV files directly from HTTP/HTTPS URLs without downloading
|
|
105
104
|
|
|
106
105
|
## Usage
|
|
@@ -145,14 +144,14 @@ csvnorm data.csv -f -V
|
|
|
145
144
|
|
|
146
145
|
### Output
|
|
147
146
|
|
|
148
|
-
Creates a normalized CSV file in
|
|
147
|
+
Creates a normalized CSV file in specified output directory with:
|
|
149
148
|
- UTF-8 encoding
|
|
150
149
|
- Consistent field delimiters
|
|
151
150
|
- Normalized column names (unless `--keep-names` is specified)
|
|
152
151
|
- Error report if any invalid rows are found (saved as `{input_name}_reject_errors.csv`)
|
|
153
152
|
|
|
154
153
|
For remote URLs:
|
|
155
|
-
- The output filename is derived from
|
|
154
|
+
- The output filename is derived from URL's last path segment
|
|
156
155
|
- Encoding is handled automatically by DuckDB
|
|
157
156
|
- HTTP timeout is set to 30 seconds
|
|
158
157
|
- Only public URLs are supported (no authentication)
|
|
@@ -160,9 +159,49 @@ For remote URLs:
|
|
|
160
159
|
The tool provides modern terminal output with:
|
|
161
160
|
- Progress indicators for multi-step processing
|
|
162
161
|
- Color-coded error messages with panels
|
|
163
|
-
- Success summary table
|
|
162
|
+
- Success summary table with statistics (rows, columns, file sizes)
|
|
163
|
+
- Encoding conversion status (converted/no conversion/remote)
|
|
164
|
+
- Error summary panel with reject count and error types when validation fails
|
|
164
165
|
- ASCII art banner with `--version` and `-V` verbose mode
|
|
165
166
|
|
|
167
|
+
**Success Example:**
|
|
168
|
+
```
|
|
169
|
+
✓ Success
|
|
170
|
+
Input: test/utf8_basic.csv
|
|
171
|
+
Output: output/utf8_basic.csv
|
|
172
|
+
Encoding: ascii (no conversion needed)
|
|
173
|
+
Rows: 2
|
|
174
|
+
Columns: 3
|
|
175
|
+
Input size: 42 B
|
|
176
|
+
Output size: 43 B
|
|
177
|
+
Headers: normalized to snake_case
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
**Error Example:**
|
|
181
|
+
```
|
|
182
|
+
✓ Success
|
|
183
|
+
Input: test/malformed_rows.csv
|
|
184
|
+
Output: output/malformed_rows.csv
|
|
185
|
+
Encoding: ascii (no conversion needed)
|
|
186
|
+
Rows: 1
|
|
187
|
+
Columns: 4
|
|
188
|
+
Input size: 24 B
|
|
189
|
+
Output size: 40 B
|
|
190
|
+
Headers: normalized to snake_case
|
|
191
|
+
|
|
192
|
+
╭──────────────────────────── ! Validation Failed ─────────────────────────────╮
|
|
193
|
+
│ Validation Errors: │
|
|
194
|
+
│ │
|
|
195
|
+
│ Rejected rows: 2 │
|
|
196
|
+
│ │
|
|
197
|
+
│ Error types: │
|
|
198
|
+
│ • Expected Number of Columns: 3 Found: 2 │
|
|
199
|
+
│ • Expected Number of Columns: 3 Found: 4 │
|
|
200
|
+
│ │
|
|
201
|
+
│ Details: output/malformed_rows_reject_errors.csv │
|
|
202
|
+
╰──────────────────────────────────────────────────────────────────────────────╯
|
|
203
|
+
```
|
|
204
|
+
|
|
166
205
|
### Exit Codes
|
|
167
206
|
|
|
168
207
|
| Code | Meaning |
|
|
@@ -172,13 +211,13 @@ The tool provides modern terminal output with:
|
|
|
172
211
|
|
|
173
212
|
## Requirements
|
|
174
213
|
|
|
175
|
-
- Python 3.
|
|
214
|
+
- Python 3.9+
|
|
176
215
|
- Dependencies (automatically installed):
|
|
177
216
|
- `charset-normalizer>=3.0.0` - Encoding detection
|
|
178
217
|
- `duckdb>=0.9.0` - CSV validation and normalization
|
|
179
218
|
- `rich>=13.0.0` - Modern terminal output formatting
|
|
180
219
|
- `rich-argparse>=1.0.0` - Enhanced CLI help formatting
|
|
181
|
-
- `pyfiglet>=1.0.0` - ASCII art banner
|
|
220
|
+
- `pyfiglet>=0.8.post1,<1.0.0` - ASCII art banner
|
|
182
221
|
|
|
183
222
|
Optional extras:
|
|
184
223
|
- `[dev]` - Development dependencies (`pytest>=7.0.0`, `pytest-cov>=4.0.0`, `ruff>=0.1.0`)
|
|
@@ -209,7 +248,7 @@ pytest tests/ -v
|
|
|
209
248
|
### Project Structure
|
|
210
249
|
|
|
211
250
|
```
|
|
212
|
-
|
|
251
|
+
csvnorm/
|
|
213
252
|
├── src/csvnorm/
|
|
214
253
|
│ ├── __init__.py # Package version
|
|
215
254
|
│ ├── __main__.py # python -m support
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
csvnorm/__init__.py,sha256=frEketezK5MWX8eiy1mFgw_3QeMcH4cVgVsNXtD1Jgg,264
|
|
2
|
+
csvnorm/__main__.py,sha256=WDURvgm7E-yMN_ZvbBBmgzIJz5naonZxQ9RYcoD7ves,110
|
|
3
|
+
csvnorm/cli.py,sha256=UEe0hRGWx9m6ZLGLd9TIaJ_uayclNTh_i0fO_JEgTXY,4166
|
|
4
|
+
csvnorm/core.py,sha256=0tgOmPr4JSMSzgSxT8ffCk_IrOWGLI2hTzhV9_xNQQ8,9945
|
|
5
|
+
csvnorm/encoding.py,sha256=kV7J8nVWDv_F6bKUCrMj2rReMB4Zecf4gwFeebdjm98,3334
|
|
6
|
+
csvnorm/ui.py,sha256=rOfVYjnTImplMMc-QGmcYUXzzZ513Y1bCjlO2jPxG2A,3893
|
|
7
|
+
csvnorm/utils.py,sha256=slV2aADBDfg9RHZJE-jmRuzPfY1RX0Wq-D1A4oBN7Yo,5020
|
|
8
|
+
csvnorm/validation.py,sha256=I7m_nxsGDROy5pBkNU-H7qEVYEAT19vw5alkrvZqGh4,4539
|
|
9
|
+
csvnorm-0.3.11.dist-info/licenses/LICENSE,sha256=iPlktyjNwNMal_HUjGRTAavIF8mHV3CxGW4g5XmG3ZU,1075
|
|
10
|
+
csvnorm-0.3.11.dist-info/METADATA,sha256=7c2Bu-M-4UiOqqVOC5Nm-I88ZhmC2BquMSiGRjD9VBo,9808
|
|
11
|
+
csvnorm-0.3.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
12
|
+
csvnorm-0.3.11.dist-info/entry_points.txt,sha256=6GmXNAdKzhHle3KV6Gbe95RS6edR0Fe6PVqdy1ED23k,45
|
|
13
|
+
csvnorm-0.3.11.dist-info/top_level.txt,sha256=jMenLN9avlYH8z8v23sReLFkUYx0awPGFm8cTnoAl-0,8
|
|
14
|
+
csvnorm-0.3.11.dist-info/RECORD,,
|
csvnorm-0.3.4.dist-info/RECORD
DELETED
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
csvnorm/__init__.py,sha256=OvADr4XLxKa9CBE8oTVtV_YBCvr7oV8cgLN68cUtC1E,263
|
|
2
|
-
csvnorm/__main__.py,sha256=WDURvgm7E-yMN_ZvbBBmgzIJz5naonZxQ9RYcoD7ves,110
|
|
3
|
-
csvnorm/cli.py,sha256=DZYclAKMhyZp234D_aWJUncVXZJDGO4u4Jh_fVHlz-g,3939
|
|
4
|
-
csvnorm/core.py,sha256=fby0c4Fs7zkvC3pqvxo8U-4eXm-SnQJ8UivluReDxxc,11126
|
|
5
|
-
csvnorm/encoding.py,sha256=kV7J8nVWDv_F6bKUCrMj2rReMB4Zecf4gwFeebdjm98,3334
|
|
6
|
-
csvnorm/utils.py,sha256=k5gYxlmdtJOJEhOU1UxnmPb8Akn3UUIsB02S-t5oj4c,3227
|
|
7
|
-
csvnorm/validation.py,sha256=cB0rASU-f7C8M539lFoR7bWhPmG5_LfM7f3S5mRSqAM,3321
|
|
8
|
-
csvnorm-0.3.4.dist-info/licenses/LICENSE,sha256=iPlktyjNwNMal_HUjGRTAavIF8mHV3CxGW4g5XmG3ZU,1075
|
|
9
|
-
csvnorm-0.3.4.dist-info/METADATA,sha256=BkJEIeef7w1IsC5TMOHH1DL-Hj6A6PD-ZbCDbwRmCpg,7857
|
|
10
|
-
csvnorm-0.3.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
11
|
-
csvnorm-0.3.4.dist-info/entry_points.txt,sha256=6GmXNAdKzhHle3KV6Gbe95RS6edR0Fe6PVqdy1ED23k,45
|
|
12
|
-
csvnorm-0.3.4.dist-info/top_level.txt,sha256=jMenLN9avlYH8z8v23sReLFkUYx0awPGFm8cTnoAl-0,8
|
|
13
|
-
csvnorm-0.3.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|