csvnorm 0.3.3__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csvnorm/__init__.py +1 -1
- csvnorm/cli.py +28 -15
- csvnorm/core.py +182 -91
- csvnorm/ui.py +124 -0
- csvnorm/utils.py +125 -4
- csvnorm/validation.py +59 -6
- {csvnorm-0.3.3.dist-info → csvnorm-0.3.11.dist-info}/METADATA +60 -33
- csvnorm-0.3.11.dist-info/RECORD +14 -0
- csvnorm-0.3.3.dist-info/RECORD +0 -13
- {csvnorm-0.3.3.dist-info → csvnorm-0.3.11.dist-info}/WHEEL +0 -0
- {csvnorm-0.3.3.dist-info → csvnorm-0.3.11.dist-info}/entry_points.txt +0 -0
- {csvnorm-0.3.3.dist-info → csvnorm-0.3.11.dist-info}/licenses/LICENSE +0 -0
- {csvnorm-0.3.3.dist-info → csvnorm-0.3.11.dist-info}/top_level.txt +0 -0
csvnorm/__init__.py
CHANGED
csvnorm/cli.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
import argparse
|
|
4
4
|
import sys
|
|
5
5
|
from pathlib import Path
|
|
6
|
+
from typing import Optional
|
|
6
7
|
|
|
7
8
|
from rich.console import Console
|
|
8
9
|
from rich_argparse import RichHelpFormatter
|
|
@@ -15,10 +16,10 @@ console = Console()
|
|
|
15
16
|
|
|
16
17
|
|
|
17
18
|
def show_banner() -> None:
|
|
18
|
-
"""Show
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
console.print(
|
|
19
|
+
"""Show simple styled banner."""
|
|
20
|
+
console.print()
|
|
21
|
+
console.print(" csvnorm ", style="bold cyan on black", justify="center")
|
|
22
|
+
console.print()
|
|
22
23
|
|
|
23
24
|
|
|
24
25
|
class VersionAction(argparse.Action):
|
|
@@ -28,10 +29,15 @@ class VersionAction(argparse.Action):
|
|
|
28
29
|
show_banner()
|
|
29
30
|
console.print(f"csvnorm {__version__}", style="bold")
|
|
30
31
|
console.print()
|
|
31
|
-
console.print(
|
|
32
|
+
console.print(
|
|
33
|
+
"Validate and normalize CSV files for exploratory data analysis",
|
|
34
|
+
style="dim",
|
|
35
|
+
)
|
|
32
36
|
console.print()
|
|
33
37
|
console.print("Author: aborruso", style="dim")
|
|
34
|
-
console.print(
|
|
38
|
+
console.print(
|
|
39
|
+
"Repository: https://github.com/aborruso/csvnorm", style="dim cyan"
|
|
40
|
+
)
|
|
35
41
|
console.print("License: MIT", style="dim")
|
|
36
42
|
parser.exit()
|
|
37
43
|
|
|
@@ -44,16 +50,17 @@ def create_parser() -> argparse.ArgumentParser:
|
|
|
44
50
|
formatter_class=RichHelpFormatter,
|
|
45
51
|
epilog="""\
|
|
46
52
|
Examples:
|
|
47
|
-
csvnorm data.csv -d ';' -o
|
|
53
|
+
csvnorm data.csv -d ';' -o output.csv --force
|
|
48
54
|
csvnorm data.csv --keep-names --delimiter '\\t'
|
|
55
|
+
csvnorm https://example.com/data.csv -o processed/data.csv
|
|
49
56
|
csvnorm data.csv -V
|
|
50
57
|
""",
|
|
51
58
|
)
|
|
52
59
|
|
|
53
60
|
parser.add_argument(
|
|
54
61
|
"input_file",
|
|
55
|
-
type=
|
|
56
|
-
help="Input CSV file path",
|
|
62
|
+
type=str,
|
|
63
|
+
help="Input CSV file path or HTTP/HTTPS URL",
|
|
57
64
|
)
|
|
58
65
|
|
|
59
66
|
parser.add_argument(
|
|
@@ -83,10 +90,9 @@ Examples:
|
|
|
83
90
|
|
|
84
91
|
parser.add_argument(
|
|
85
92
|
"-o",
|
|
86
|
-
"--output-
|
|
93
|
+
"--output-file",
|
|
87
94
|
type=Path,
|
|
88
|
-
|
|
89
|
-
help="Set custom output directory (default: current working directory)",
|
|
95
|
+
help="Set output file path (absolute or relative)",
|
|
90
96
|
)
|
|
91
97
|
|
|
92
98
|
parser.add_argument(
|
|
@@ -107,7 +113,7 @@ Examples:
|
|
|
107
113
|
return parser
|
|
108
114
|
|
|
109
115
|
|
|
110
|
-
def main(argv: list[str]
|
|
116
|
+
def main(argv: Optional[list[str]] = None) -> int:
|
|
111
117
|
"""Main entry point for the CLI.
|
|
112
118
|
|
|
113
119
|
Args:
|
|
@@ -122,7 +128,7 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
122
128
|
if argv is None:
|
|
123
129
|
argv = sys.argv[1:]
|
|
124
130
|
|
|
125
|
-
if not argv or (len(argv) == 1 and argv[0] in [
|
|
131
|
+
if not argv or (len(argv) == 1 and argv[0] in ["-h", "--help"]):
|
|
126
132
|
parser.print_help()
|
|
127
133
|
return 0 if argv else 2
|
|
128
134
|
|
|
@@ -135,10 +141,17 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
135
141
|
# Setup logging
|
|
136
142
|
setup_logger(args.verbose)
|
|
137
143
|
|
|
144
|
+
# Determine output file (default: input filename in current directory)
|
|
145
|
+
if args.output_file is None:
|
|
146
|
+
input_name = Path(args.input_file).name
|
|
147
|
+
output_file = Path.cwd() / input_name
|
|
148
|
+
else:
|
|
149
|
+
output_file = args.output_file
|
|
150
|
+
|
|
138
151
|
# Run processing
|
|
139
152
|
return process_csv(
|
|
140
153
|
input_file=args.input_file,
|
|
141
|
-
|
|
154
|
+
output_file=output_file,
|
|
142
155
|
force=args.force,
|
|
143
156
|
keep_names=args.keep_names,
|
|
144
157
|
delimiter=args.delimiter,
|
csvnorm/core.py
CHANGED
|
@@ -1,15 +1,29 @@
|
|
|
1
1
|
"""Core processing logic for csvnorm."""
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
+
import tempfile
|
|
4
5
|
from pathlib import Path
|
|
6
|
+
from typing import Union
|
|
5
7
|
|
|
6
8
|
from rich.console import Console
|
|
7
|
-
from rich.panel import Panel
|
|
8
9
|
from rich.progress import Progress, SpinnerColumn, TextColumn
|
|
9
|
-
from rich.table import Table
|
|
10
10
|
|
|
11
11
|
from csvnorm.encoding import convert_to_utf8, detect_encoding, needs_conversion
|
|
12
|
-
from csvnorm.
|
|
12
|
+
from csvnorm.ui import (
|
|
13
|
+
show_error_panel,
|
|
14
|
+
show_success_table,
|
|
15
|
+
show_validation_error_panel,
|
|
16
|
+
show_warning_panel,
|
|
17
|
+
)
|
|
18
|
+
from csvnorm.utils import (
|
|
19
|
+
extract_filename_from_url,
|
|
20
|
+
get_column_count,
|
|
21
|
+
get_row_count,
|
|
22
|
+
is_url,
|
|
23
|
+
to_snake_case,
|
|
24
|
+
validate_delimiter,
|
|
25
|
+
validate_url,
|
|
26
|
+
)
|
|
13
27
|
from csvnorm.validation import normalize_csv, validate_csv
|
|
14
28
|
|
|
15
29
|
logger = logging.getLogger("csvnorm")
|
|
@@ -17,8 +31,8 @@ console = Console()
|
|
|
17
31
|
|
|
18
32
|
|
|
19
33
|
def process_csv(
|
|
20
|
-
input_file:
|
|
21
|
-
|
|
34
|
+
input_file: str,
|
|
35
|
+
output_file: Path,
|
|
22
36
|
force: bool = False,
|
|
23
37
|
keep_names: bool = False,
|
|
24
38
|
delimiter: str = ",",
|
|
@@ -27,8 +41,8 @@ def process_csv(
|
|
|
27
41
|
"""Main CSV processing pipeline.
|
|
28
42
|
|
|
29
43
|
Args:
|
|
30
|
-
input_file: Path to input CSV file.
|
|
31
|
-
|
|
44
|
+
input_file: Path to input CSV file or HTTP/HTTPS URL.
|
|
45
|
+
output_file: Full path for output file.
|
|
32
46
|
force: If True, overwrite existing output files.
|
|
33
47
|
keep_names: If True, keep original column names.
|
|
34
48
|
delimiter: Output field delimiter.
|
|
@@ -37,111 +51,169 @@ def process_csv(
|
|
|
37
51
|
Returns:
|
|
38
52
|
Exit code: 0 for success, 1 for error.
|
|
39
53
|
"""
|
|
40
|
-
#
|
|
41
|
-
|
|
42
|
-
console.print(Panel(
|
|
43
|
-
f"[bold red]Error:[/bold red] Input file not found\n{input_file}",
|
|
44
|
-
border_style="red"
|
|
45
|
-
))
|
|
46
|
-
return 1
|
|
54
|
+
# Detect if input is URL or file
|
|
55
|
+
is_remote = is_url(input_file)
|
|
47
56
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
57
|
+
input_path: Union[str, Path]
|
|
58
|
+
if is_remote:
|
|
59
|
+
# Validate URL
|
|
60
|
+
try:
|
|
61
|
+
validate_url(input_file)
|
|
62
|
+
except ValueError as e:
|
|
63
|
+
show_error_panel(str(e))
|
|
64
|
+
return 1
|
|
65
|
+
base_name = extract_filename_from_url(input_file)
|
|
66
|
+
input_path = input_file # Keep as string for DuckDB
|
|
67
|
+
else:
|
|
68
|
+
# Validate local file
|
|
69
|
+
file_path = Path(input_file)
|
|
70
|
+
if not file_path.exists():
|
|
71
|
+
show_error_panel(f"Input file not found\n{file_path}")
|
|
72
|
+
return 1
|
|
73
|
+
|
|
74
|
+
if not file_path.is_file():
|
|
75
|
+
show_error_panel(f"Not a file\n{file_path}")
|
|
76
|
+
return 1
|
|
77
|
+
|
|
78
|
+
base_name = to_snake_case(file_path.name)
|
|
79
|
+
input_path = file_path
|
|
54
80
|
|
|
55
81
|
try:
|
|
56
82
|
validate_delimiter(delimiter)
|
|
57
83
|
except ValueError as e:
|
|
58
|
-
|
|
59
|
-
f"[bold red]Error:[/bold red] {e}",
|
|
60
|
-
border_style="red"
|
|
61
|
-
))
|
|
84
|
+
show_error_panel(str(e))
|
|
62
85
|
return 1
|
|
63
86
|
|
|
64
87
|
# Setup paths
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
reject_file = output_dir / f"{base_name}_reject_errors.csv"
|
|
70
|
-
temp_utf8_file = output_dir / f"{base_name}_utf8.csv"
|
|
88
|
+
output_dir = output_file.parent
|
|
89
|
+
temp_dir = Path(tempfile.mkdtemp(prefix="csvnorm_"))
|
|
90
|
+
reject_file = output_dir / f"{output_file.stem}_reject_errors.csv"
|
|
91
|
+
temp_utf8_file = temp_dir / f"{output_file.stem}_utf8.csv"
|
|
71
92
|
|
|
72
93
|
# Check if output exists
|
|
73
94
|
if output_file.exists() and not force:
|
|
74
|
-
|
|
75
|
-
f"
|
|
95
|
+
show_warning_panel(
|
|
96
|
+
f"Output file already exists\n\n"
|
|
76
97
|
f"{output_file}\n\n"
|
|
77
|
-
f"Use [bold]--force[/bold] to overwrite."
|
|
78
|
-
|
|
79
|
-
))
|
|
98
|
+
f"Use [bold]--force[/bold] to overwrite."
|
|
99
|
+
)
|
|
80
100
|
return 1
|
|
81
101
|
|
|
82
|
-
# Clean up previous reject file
|
|
102
|
+
# Clean up previous reject file (always overwrite)
|
|
83
103
|
if reject_file.exists():
|
|
84
104
|
reject_file.unlink()
|
|
85
105
|
|
|
86
106
|
# Track files to clean up
|
|
87
|
-
temp_files: list[Path] = []
|
|
107
|
+
temp_files: list[Path] = [temp_dir]
|
|
88
108
|
|
|
89
109
|
try:
|
|
90
110
|
with Progress(
|
|
91
111
|
SpinnerColumn(),
|
|
92
112
|
TextColumn("[progress.description]{task.description}"),
|
|
93
113
|
console=console,
|
|
94
|
-
transient=True
|
|
114
|
+
transient=True,
|
|
95
115
|
) as progress:
|
|
96
|
-
|
|
97
|
-
task = progress.add_task("[cyan]Detecting encoding...", total=None)
|
|
98
|
-
try:
|
|
99
|
-
encoding = detect_encoding(input_file)
|
|
100
|
-
except ValueError as e:
|
|
101
|
-
progress.stop()
|
|
102
|
-
console.print(Panel(
|
|
103
|
-
f"[bold red]Error:[/bold red] {e}",
|
|
104
|
-
border_style="red"
|
|
105
|
-
))
|
|
106
|
-
return 1
|
|
116
|
+
task = progress.add_task("[cyan]Processing...", total=None)
|
|
107
117
|
|
|
108
|
-
|
|
109
|
-
|
|
118
|
+
# For remote URLs, skip encoding detection/conversion
|
|
119
|
+
if is_remote:
|
|
120
|
+
progress.update(
|
|
121
|
+
task,
|
|
122
|
+
description="[green]✓[/green] Remote URL (encoding handled by DuckDB)",
|
|
123
|
+
)
|
|
124
|
+
working_file = input_path # Keep URL as string
|
|
125
|
+
encoding = "remote"
|
|
126
|
+
else:
|
|
127
|
+
# Step 1: Detect encoding (local files only)
|
|
128
|
+
# input_path is Path here (set in else block above)
|
|
129
|
+
file_input_path = input_path # Type narrowing for mypy
|
|
130
|
+
assert isinstance(file_input_path, Path)
|
|
110
131
|
|
|
111
|
-
|
|
112
|
-
working_file = input_file
|
|
113
|
-
if needs_conversion(encoding):
|
|
114
|
-
progress.update(task, description=f"[cyan]Converting from {encoding} to UTF-8...")
|
|
132
|
+
progress.update(task, description="[cyan]Detecting encoding...")
|
|
115
133
|
try:
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
temp_files.append(temp_utf8_file)
|
|
119
|
-
progress.update(task, description=f"[green]✓[/green] Converted to UTF-8")
|
|
120
|
-
except (UnicodeDecodeError, LookupError) as e:
|
|
134
|
+
encoding = detect_encoding(file_input_path)
|
|
135
|
+
except ValueError as e:
|
|
121
136
|
progress.stop()
|
|
122
|
-
|
|
123
|
-
f"[bold red]Error:[/bold red] Encoding conversion failed\n{e}",
|
|
124
|
-
border_style="red"
|
|
125
|
-
))
|
|
137
|
+
show_error_panel(str(e))
|
|
126
138
|
return 1
|
|
127
|
-
|
|
128
|
-
|
|
139
|
+
|
|
140
|
+
logger.debug(f"Detected encoding: {encoding}")
|
|
141
|
+
progress.update(
|
|
142
|
+
task, description=f"[green]✓[/green] Detected encoding: {encoding}"
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
# Step 2: Convert to UTF-8 if needed
|
|
146
|
+
working_file = file_input_path
|
|
147
|
+
if needs_conversion(encoding):
|
|
148
|
+
progress.update(
|
|
149
|
+
task,
|
|
150
|
+
description=f"[cyan]Converting from {encoding} to UTF-8...",
|
|
151
|
+
)
|
|
152
|
+
try:
|
|
153
|
+
convert_to_utf8(file_input_path, temp_utf8_file, encoding)
|
|
154
|
+
working_file = temp_utf8_file
|
|
155
|
+
temp_files.append(temp_utf8_file)
|
|
156
|
+
progress.update(
|
|
157
|
+
task, description="[green]✓[/green] Converted to UTF-8"
|
|
158
|
+
)
|
|
159
|
+
except (UnicodeDecodeError, LookupError) as e:
|
|
160
|
+
progress.stop()
|
|
161
|
+
show_error_panel(f"Encoding conversion failed\n{e}")
|
|
162
|
+
return 1
|
|
163
|
+
else:
|
|
164
|
+
progress.update(
|
|
165
|
+
task,
|
|
166
|
+
description=f"[green]✓[/green] Encoding: {encoding} (no conversion needed)",
|
|
167
|
+
)
|
|
129
168
|
|
|
130
169
|
# Step 3: Validate CSV
|
|
131
170
|
progress.update(task, description="[cyan]Validating CSV...")
|
|
132
171
|
logger.debug("Validating CSV with DuckDB...")
|
|
133
|
-
is_valid = validate_csv(working_file, reject_file)
|
|
134
172
|
|
|
135
|
-
|
|
173
|
+
try:
|
|
174
|
+
reject_count, error_types = validate_csv(
|
|
175
|
+
working_file, reject_file, is_remote=is_remote
|
|
176
|
+
)
|
|
177
|
+
except Exception as e:
|
|
136
178
|
progress.stop()
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
179
|
+
error_msg = str(e)
|
|
180
|
+
|
|
181
|
+
# Check for common HTTP errors
|
|
182
|
+
if "HTTP Error" in error_msg or "HTTPException" in error_msg:
|
|
183
|
+
if "404" in error_msg:
|
|
184
|
+
show_error_panel(
|
|
185
|
+
f"Remote CSV file not found (HTTP 404)\n\n"
|
|
186
|
+
f"URL: [cyan]{input_file}[/cyan]\n\n"
|
|
187
|
+
"Please check the URL is correct."
|
|
188
|
+
)
|
|
189
|
+
elif "401" in error_msg or "403" in error_msg:
|
|
190
|
+
show_error_panel(
|
|
191
|
+
f"Authentication required (HTTP 401/403)\n\n"
|
|
192
|
+
f"URL: [cyan]{input_file}[/cyan]\n\n"
|
|
193
|
+
"This tool only supports public URLs without authentication.\n"
|
|
194
|
+
"Please download the file manually first."
|
|
195
|
+
)
|
|
196
|
+
elif (
|
|
197
|
+
"timeout" in error_msg.lower()
|
|
198
|
+
or "timed out" in error_msg.lower()
|
|
199
|
+
):
|
|
200
|
+
show_error_panel(
|
|
201
|
+
f"HTTP request timeout (30 seconds)\n\n"
|
|
202
|
+
f"URL: [cyan]{input_file}[/cyan]\n\n"
|
|
203
|
+
"The remote server took too long to respond.\n"
|
|
204
|
+
"Try again later or download the file manually."
|
|
205
|
+
)
|
|
206
|
+
else:
|
|
207
|
+
show_error_panel(f"HTTP request failed\n\n{error_msg}")
|
|
208
|
+
else:
|
|
209
|
+
# Re-raise non-HTTP errors
|
|
210
|
+
raise
|
|
143
211
|
return 1
|
|
144
212
|
|
|
213
|
+
has_validation_errors = reject_count > 1
|
|
214
|
+
if has_validation_errors:
|
|
215
|
+
progress.stop()
|
|
216
|
+
|
|
145
217
|
progress.update(task, description="[green]✓[/green] CSV validated")
|
|
146
218
|
|
|
147
219
|
# Step 4: Normalize and write output
|
|
@@ -152,31 +224,50 @@ def process_csv(
|
|
|
152
224
|
output_path=output_file,
|
|
153
225
|
delimiter=delimiter,
|
|
154
226
|
normalize_names=not keep_names,
|
|
227
|
+
is_remote=is_remote,
|
|
155
228
|
)
|
|
156
229
|
|
|
157
230
|
logger.debug(f"Output written to: {output_file}")
|
|
158
231
|
progress.update(task, description="[green]✓[/green] Complete")
|
|
159
232
|
|
|
160
|
-
#
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
table.add_row("Delimiter:", repr(delimiter))
|
|
168
|
-
if not keep_names:
|
|
169
|
-
table.add_row("Headers:", "normalized to snake_case")
|
|
233
|
+
# Collect statistics
|
|
234
|
+
input_size = (
|
|
235
|
+
working_file.stat().st_size if isinstance(working_file, Path) else 0
|
|
236
|
+
)
|
|
237
|
+
output_size = output_file.stat().st_size
|
|
238
|
+
row_count = get_row_count(output_file)
|
|
239
|
+
column_count = get_column_count(output_file, delimiter)
|
|
170
240
|
|
|
171
|
-
|
|
172
|
-
|
|
241
|
+
# Show success summary
|
|
242
|
+
show_success_table(
|
|
243
|
+
input_file=input_file,
|
|
244
|
+
output_file=output_file,
|
|
245
|
+
encoding=encoding,
|
|
246
|
+
is_remote=is_remote,
|
|
247
|
+
row_count=row_count,
|
|
248
|
+
column_count=column_count,
|
|
249
|
+
input_size=input_size,
|
|
250
|
+
output_size=output_size,
|
|
251
|
+
delimiter=delimiter,
|
|
252
|
+
keep_names=keep_names,
|
|
253
|
+
)
|
|
254
|
+
|
|
255
|
+
# Show validation errors if any
|
|
256
|
+
if has_validation_errors:
|
|
257
|
+
show_validation_error_panel(reject_count, error_types, reject_file)
|
|
258
|
+
return 1
|
|
173
259
|
|
|
174
260
|
finally:
|
|
175
|
-
# Cleanup temp
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
261
|
+
# Cleanup temp directory
|
|
262
|
+
import shutil
|
|
263
|
+
|
|
264
|
+
for temp_path in temp_files:
|
|
265
|
+
if temp_path.exists():
|
|
266
|
+
logger.debug(f"Removing temp path: {temp_path}")
|
|
267
|
+
if temp_path.is_dir():
|
|
268
|
+
shutil.rmtree(temp_path)
|
|
269
|
+
else:
|
|
270
|
+
temp_path.unlink()
|
|
180
271
|
|
|
181
272
|
# Remove reject file if empty (only header)
|
|
182
273
|
if reject_file.exists():
|
csvnorm/ui.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""UI formatting functions for csvnorm terminal output."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from rich.console import Console
|
|
6
|
+
from rich.panel import Panel
|
|
7
|
+
from rich.table import Table
|
|
8
|
+
|
|
9
|
+
from csvnorm.encoding import needs_conversion
|
|
10
|
+
from csvnorm.utils import format_file_size
|
|
11
|
+
|
|
12
|
+
console = Console()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def show_error_panel(message: str, title: str = "Error") -> None:
    """Print an error message inside a red-bordered panel.

    Args:
        message: Error text placed after the title label. It is embedded in
            a Rich markup string as-is.
        title: Label shown in bold red before the message (default: "Error").
    """
    panel_text = f"[bold red]{title}:[/bold red] {message}"
    error_panel = Panel(panel_text, border_style="red")
    console.print(error_panel)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def show_warning_panel(message: str, title: str = "Warning") -> None:
    """Print a warning message inside a yellow-bordered panel.

    Args:
        message: Warning text placed after the title label. It is embedded
            in a Rich markup string as-is.
        title: Label shown in bold yellow before the message
            (default: "Warning").
    """
    panel_text = f"[bold yellow]{title}:[/bold yellow] {message}"
    warning_panel = Panel(panel_text, border_style="yellow")
    console.print(warning_panel)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def show_success_table(
    input_file: str,
    output_file: Path,
    encoding: str,
    is_remote: bool,
    row_count: int,
    column_count: int,
    input_size: int,
    output_size: int,
    delimiter: str,
    keep_names: bool,
) -> None:
    """Display success summary table with processing results.

    Args:
        input_file: Input CSV file path or URL.
        output_file: Output CSV file path.
        encoding: Detected encoding (or "remote" for URLs).
        is_remote: Whether input was a remote URL.
        row_count: Number of data rows in output.
        column_count: Number of columns in output.
        input_size: Input file size in bytes (0 for remote).
        output_size: Output file size in bytes.
        delimiter: Output delimiter character.
        keep_names: Whether original column names were kept.
    """
    from rich.markup import escape

    table = Table(show_header=False, box=None, padding=(0, 1))
    table.add_row("[green]✓[/green] Success", "")
    # Paths and URLs are user-supplied text: escape them so bracketed parts
    # of a name (e.g. "export[final].csv") are printed literally instead of
    # being parsed as Rich markup tags.
    table.add_row("Input:", f"[cyan]{escape(str(input_file))}[/cyan]")
    table.add_row("Output:", f"[cyan]{escape(str(output_file))}[/cyan]")

    # Encoding info
    if not is_remote:
        if needs_conversion(encoding):
            table.add_row("Encoding:", f"{encoding} → UTF-8 [dim](converted)[/dim]")
        else:
            table.add_row("Encoding:", f"{encoding} [dim](no conversion needed)[/dim]")
    else:
        table.add_row("Encoding:", "remote [dim](handled by DuckDB)[/dim]")

    # Statistics
    table.add_row("Rows:", f"{row_count:,}")
    table.add_row("Columns:", f"{column_count}")
    table.add_row("Input size:", format_file_size(input_size))
    table.add_row("Output size:", format_file_size(output_size))

    # Optional fields: only shown when they differ from the defaults
    if delimiter != ",":
        table.add_row("Delimiter:", repr(delimiter))
    if not keep_names:
        table.add_row("Headers:", "normalized to snake_case")

    console.print()
    console.print(table)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def show_validation_error_panel(
    reject_count: int, error_types: list[str], reject_file: Path
) -> None:
    """Display validation error summary panel.

    Args:
        reject_count: Number of lines in the reject file (including header);
            the panel reports ``reject_count - 1`` rejected rows.
        error_types: List of error type descriptions.
        reject_file: Path to reject errors CSV file.
    """
    from rich.markup import escape

    console.print()
    error_lines = [
        "[bold red]Validation Errors:[/bold red]",
        "",
        f"Rejected rows: [yellow]{reject_count - 1}[/yellow]",
    ]

    if error_types:
        error_lines.append("")
        error_lines.append("[dim]Error types:[/dim]")
        # Error reasons come from the reject file's content, so they may
        # contain square brackets; escape them so they print literally.
        error_lines.extend(f"  • {escape(error_type)}" for error_type in error_types)

    error_lines.append("")
    error_lines.append(f"Details: [cyan]{escape(str(reject_file))}[/cyan]")

    console.print(
        Panel(
            "\n".join(error_lines),
            border_style="yellow",
            title="[yellow]![/yellow] Validation Failed",
        )
    )
|
csvnorm/utils.py
CHANGED
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
import logging
|
|
4
4
|
import re
|
|
5
5
|
from pathlib import Path
|
|
6
|
+
from typing import Union
|
|
7
|
+
from urllib.parse import urlparse
|
|
6
8
|
|
|
7
9
|
from rich.logging import RichHandler
|
|
8
10
|
|
|
@@ -45,10 +47,7 @@ def setup_logger(verbose: bool = False) -> logging.Logger:
|
|
|
45
47
|
|
|
46
48
|
if not logger.handlers:
|
|
47
49
|
handler = RichHandler(
|
|
48
|
-
show_time=False,
|
|
49
|
-
show_path=verbose,
|
|
50
|
-
markup=True,
|
|
51
|
-
rich_tracebacks=True
|
|
50
|
+
show_time=False, show_path=verbose, markup=True, rich_tracebacks=True
|
|
52
51
|
)
|
|
53
52
|
logger.addHandler(handler)
|
|
54
53
|
|
|
@@ -69,3 +68,125 @@ def validate_delimiter(delimiter: str) -> None:
|
|
|
69
68
|
def ensure_output_dir(output_dir: Path) -> None:
|
|
70
69
|
"""Create output directory if it doesn't exist."""
|
|
71
70
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def is_url(input_str: str) -> bool:
    """Check if input string is an HTTP/HTTPS URL.

    Args:
        input_str: String to check.

    Returns:
        True if the input parses with an ``http`` or ``https`` scheme and a
        non-empty network location; False otherwise, including for
        non-string or unparsable input.
    """
    if not isinstance(input_str, str):
        return False
    try:
        parsed = urlparse(input_str)
    except ValueError:
        # urlparse raises ValueError for malformed netlocs, e.g. an
        # unterminated IPv6 bracket such as "http://[bad".
        return False
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def validate_url(url: str) -> None:
    """Validate URL has HTTP/HTTPS protocol.

    Args:
        url: URL to validate.

    Raises:
        ValueError: If URL protocol is not HTTP/HTTPS (or is missing).
    """
    parsed = urlparse(url)
    if parsed.scheme in ("http", "https"):
        return
    # Without a scheme the old message read "Got: ://", which is confusing;
    # say explicitly that no protocol was given.
    got = f"{parsed.scheme}://" if parsed.scheme else "no protocol"
    raise ValueError(f"Only HTTP/HTTPS URLs are supported. Got: {got}")
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def extract_filename_from_url(url: str) -> str:
    """Derive a base name from the last path segment of a URL.

    Args:
        url: URL to extract filename from.

    Returns:
        The percent-decoded last path segment, with a trailing ``.csv``
        (case-insensitive) removed and passed through ``to_snake_case``;
        ``"data"`` when the URL has no usable path segment.
    """
    from urllib.parse import unquote

    # Only the path component is used; query string and fragment are ignored.
    trimmed_path = urlparse(url).path.rstrip("/")
    last_segment = trimmed_path.split("/")[-1] if trimmed_path else "data"

    # Decode URL encoding (%20 -> space, etc.)
    decoded = unquote(last_segment)

    # Drop the extension if present
    if decoded.lower().endswith(".csv"):
        decoded = decoded[:-4]

    if not decoded:
        return "data"
    return to_snake_case(decoded)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def format_file_size(size_bytes: int) -> str:
    """Format file size in human-readable format.

    Uses binary multiples (1 KB = 1024 B). Sizes below 1024 are printed as
    whole bytes; larger sizes are printed with one decimal place.

    Args:
        size_bytes: File size in bytes.

    Returns:
        Formatted size string (e.g., "1.5 MB", "256.0 KB", "512 B").
    """
    if size_bytes < 1024:
        return f"{size_bytes} B"

    size = float(size_bytes)
    for unit in ("KB", "MB", "GB"):
        size /= 1024.0
        # Compare the *rounded* value so sizes just below a unit boundary
        # roll over (1048575 bytes -> "1.0 MB" rather than "1024.0 KB").
        if round(size, 1) < 1024.0:
            return f"{size:.1f} {unit}"
    return f"{size / 1024.0:.1f} TB"
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def get_row_count(file_path: Union[Path, str]) -> int:
    """Count number of data rows in a local CSV file.

    Rows are counted with the ``csv`` module rather than by physical line,
    so a quoted field that contains line breaks counts as one row.

    Args:
        file_path: Path to CSV file. Only ``Path`` objects are read; a plain
            string yields 0.

    Returns:
        Number of data rows (excluding header), or 0 if the file doesn't
        exist or cannot be read.
    """
    import csv

    if not isinstance(file_path, Path) or not file_path.exists():
        return 0

    try:
        # newline="" lets csv.reader handle embedded line breaks itself;
        # errors="replace" keeps a stray undecodable byte from zeroing the
        # count, and the explicit encoding avoids the platform default.
        with open(
            file_path, "r", encoding="utf-8", errors="replace", newline=""
        ) as f:
            reader = csv.reader(f)
            next(reader, None)  # Skip header
            return sum(1 for _ in reader)
    except (OSError, csv.Error):
        return 0
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def get_column_count(file_path: Union[Path, str], delimiter: str = ",") -> int:
    """Count number of columns in a CSV file using DuckDB.

    Args:
        file_path: Path to CSV file. Only ``Path`` objects are inspected; a
            plain string yields 0.
        delimiter: Field delimiter used in the CSV file.

    Returns:
        Number of columns in the CSV, or 0 if file doesn't exist or error.
    """
    if not isinstance(file_path, Path) or not file_path.exists():
        return 0

    def _sql_literal(value: str) -> str:
        """Quote ``value`` as a SQL string literal (doubling single quotes)."""
        return "'" + value.replace("'", "''") + "'"

    try:
        import duckdb

        conn = duckdb.connect(":memory:")
        try:
            # Get column names from CSV using DuckDB DESCRIBE. Path and
            # delimiter are quoted as literals so a "'" in either one
            # cannot break (or alter) the statement.
            columns = conn.execute(
                "DESCRIBE SELECT * FROM read_csv("
                f"{_sql_literal(str(file_path))}, "
                f"delim={_sql_literal(delimiter)}, "
                "header=true, sample_size=1)"
            ).fetchall()
        finally:
            # Close the connection even when the query raises.
            conn.close()

        return len(columns)
    except Exception as exc:
        # Best-effort statistic: never fail the run, but leave a trace.
        logging.getLogger("csvnorm").debug(
            "Could not count columns of %s: %s", file_path, exc
        )
        return 0
|
csvnorm/validation.py
CHANGED
|
@@ -2,27 +2,36 @@
|
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
4
|
from pathlib import Path
|
|
5
|
+
from typing import Union
|
|
5
6
|
|
|
6
7
|
import duckdb
|
|
7
8
|
|
|
8
9
|
logger = logging.getLogger("csvnorm")
|
|
9
10
|
|
|
10
11
|
|
|
11
|
-
def validate_csv(
|
|
12
|
+
def validate_csv(
|
|
13
|
+
file_path: Union[Path, str], reject_file: Path, is_remote: bool = False
|
|
14
|
+
) -> tuple[int, list[str]]:
|
|
12
15
|
"""Validate CSV file using DuckDB and export rejected rows.
|
|
13
16
|
|
|
14
17
|
Args:
|
|
15
|
-
file_path: Path to CSV file to validate.
|
|
18
|
+
file_path: Path to CSV file to validate or URL string.
|
|
16
19
|
reject_file: Path to write rejected rows.
|
|
20
|
+
is_remote: True if file_path is a remote URL.
|
|
17
21
|
|
|
18
22
|
Returns:
|
|
19
|
-
|
|
23
|
+
Tuple of (reject_count, error_types) where error_types is list of
|
|
24
|
+
up to 3 unique error reasons from reject file.
|
|
20
25
|
"""
|
|
21
26
|
logger.debug(f"Validating CSV: {file_path}")
|
|
22
27
|
|
|
23
28
|
conn = duckdb.connect()
|
|
24
29
|
|
|
25
30
|
try:
|
|
31
|
+
# Set HTTP timeout for remote URLs (30 seconds)
|
|
32
|
+
if is_remote:
|
|
33
|
+
conn.execute("SET http_timeout=30000")
|
|
34
|
+
|
|
26
35
|
# Read CSV with store_rejects to capture malformed rows
|
|
27
36
|
# Use all_varchar=true to avoid type inference failures
|
|
28
37
|
conn.execute(f"""
|
|
@@ -46,28 +55,39 @@ def validate_csv(file_path: Path, reject_file: Path) -> bool:
|
|
|
46
55
|
reject_count = _count_lines(reject_file)
|
|
47
56
|
logger.debug(f"Reject file lines: {reject_count}")
|
|
48
57
|
|
|
49
|
-
|
|
58
|
+
# Collect sample error types from reject file
|
|
59
|
+
error_types = []
|
|
60
|
+
if reject_count > 1:
|
|
61
|
+
error_types = _get_error_types(reject_file)
|
|
62
|
+
|
|
63
|
+
return reject_count, error_types
|
|
50
64
|
|
|
51
65
|
|
|
52
66
|
def normalize_csv(
|
|
53
|
-
input_path: Path,
|
|
67
|
+
input_path: Union[Path, str],
|
|
54
68
|
output_path: Path,
|
|
55
69
|
delimiter: str = ",",
|
|
56
70
|
normalize_names: bool = True,
|
|
71
|
+
is_remote: bool = False,
|
|
57
72
|
) -> None:
|
|
58
73
|
"""Normalize CSV file using DuckDB.
|
|
59
74
|
|
|
60
75
|
Args:
|
|
61
|
-
input_path: Path to input CSV file.
|
|
76
|
+
input_path: Path to input CSV file or URL string.
|
|
62
77
|
output_path: Path for normalized output file.
|
|
63
78
|
delimiter: Output field delimiter.
|
|
64
79
|
normalize_names: If True, convert column names to snake_case.
|
|
80
|
+
is_remote: True if input_path is a remote URL.
|
|
65
81
|
"""
|
|
66
82
|
logger.debug(f"Normalizing CSV: {input_path} -> {output_path}")
|
|
67
83
|
|
|
68
84
|
conn = duckdb.connect()
|
|
69
85
|
|
|
70
86
|
try:
|
|
87
|
+
# Set HTTP timeout for remote URLs (30 seconds)
|
|
88
|
+
if is_remote:
|
|
89
|
+
conn.execute("SET http_timeout=30000")
|
|
90
|
+
|
|
71
91
|
# Build read options
|
|
72
92
|
read_opts = "sample_size=-1, all_varchar=true"
|
|
73
93
|
if normalize_names:
|
|
@@ -107,3 +127,36 @@ def _count_lines(file_path: Path) -> int:
|
|
|
107
127
|
|
|
108
128
|
with open(file_path, "r") as f:
|
|
109
129
|
return sum(1 for _ in f)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _get_error_types(reject_file: Path) -> list[str]:
|
|
133
|
+
"""Extract sample error types from reject file.
|
|
134
|
+
|
|
135
|
+
Args:
|
|
136
|
+
reject_file: Path to reject_errors.csv file.
|
|
137
|
+
|
|
138
|
+
Returns:
|
|
139
|
+
List of up to 3 unique error reasons.
|
|
140
|
+
"""
|
|
141
|
+
if not reject_file.exists():
|
|
142
|
+
return []
|
|
143
|
+
|
|
144
|
+
error_types: set[str] = set()
|
|
145
|
+
try:
|
|
146
|
+
with open(reject_file, "r") as f:
|
|
147
|
+
# Skip header
|
|
148
|
+
next(f, None)
|
|
149
|
+
for line in f:
|
|
150
|
+
# Error message is in the last column
|
|
151
|
+
parts = line.rstrip("\n").split(",")
|
|
152
|
+
if parts:
|
|
153
|
+
error_reason = parts[-1].strip()
|
|
154
|
+
if error_reason and error_reason != "error":
|
|
155
|
+
error_types.add(error_reason)
|
|
156
|
+
if len(error_types) >= 3:
|
|
157
|
+
break
|
|
158
|
+
except Exception as e:
|
|
159
|
+
logger.warning(f"Failed to extract error types: {e}")
|
|
160
|
+
return []
|
|
161
|
+
|
|
162
|
+
return list(error_types)[:3]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: csvnorm
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.11
|
|
4
4
|
Summary: A command-line utility to validate and normalize CSV files
|
|
5
5
|
Author-email: aborruso <aborruso@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -34,7 +34,6 @@ Classifier: Intended Audience :: Science/Research
|
|
|
34
34
|
Classifier: License :: OSI Approved :: MIT License
|
|
35
35
|
Classifier: Operating System :: OS Independent
|
|
36
36
|
Classifier: Programming Language :: Python :: 3
|
|
37
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
38
37
|
Classifier: Programming Language :: Python :: 3.9
|
|
39
38
|
Classifier: Programming Language :: Python :: 3.10
|
|
40
39
|
Classifier: Programming Language :: Python :: 3.11
|
|
@@ -49,18 +48,15 @@ Requires-Dist: charset-normalizer>=3.0.0
|
|
|
49
48
|
Requires-Dist: duckdb>=0.9.0
|
|
50
49
|
Requires-Dist: rich>=13.0.0
|
|
51
50
|
Requires-Dist: rich-argparse>=1.0.0
|
|
52
|
-
Requires-Dist: pyfiglet>=1.0.0
|
|
53
51
|
Provides-Extra: dev
|
|
54
52
|
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
55
53
|
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
56
54
|
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
57
|
-
Provides-Extra: banner
|
|
58
|
-
Requires-Dist: pyfiglet>=1.0.0; extra == "banner"
|
|
59
55
|
Dynamic: license-file
|
|
60
56
|
|
|
61
57
|
[](https://pypi.org/project/csvnorm/)
|
|
62
58
|
[](https://opensource.org/licenses/MIT)
|
|
63
|
-
[](https://www.python.org/downloads/)
|
|
64
60
|
[](https://deepwiki.com/aborruso/csvnorm)
|
|
65
61
|
|
|
66
62
|
# csvnorm
|
|
@@ -81,26 +77,6 @@ Or with pip:
|
|
|
81
77
|
pip install csvnorm
|
|
82
78
|
```
|
|
83
79
|
|
|
84
|
-
For ASCII art banner (shown with `--version` and `-V`):
|
|
85
|
-
|
|
86
|
-
```bash
|
|
87
|
-
uv tool install 'csvnorm[banner]'
|
|
88
|
-
# or
|
|
89
|
-
pip install 'csvnorm[banner]'
|
|
90
|
-
```
|
|
91
|
-
|
|
92
|
-
Example with banner:
|
|
93
|
-
```bash
|
|
94
|
-
csvnorm --version
|
|
95
|
-
# Output:
|
|
96
|
-
# ___________ ______ ____ _________ ___
|
|
97
|
-
# / ___/ ___/ | / / __ \/ __ \/ ___/ __ `__ \
|
|
98
|
-
# / /__(__ )| |/ / / / / /_/ / / / / / / / /
|
|
99
|
-
# \___/____/ |___/_/ /_/\____/_/ /_/ /_/ /_/
|
|
100
|
-
#
|
|
101
|
-
# csvnorm 0.3.1
|
|
102
|
-
```
|
|
103
|
-
|
|
104
80
|
## Purpose
|
|
105
81
|
|
|
106
82
|
This tool prepares CSV files for **basic exploratory data analysis (EDA)**, not for complex transformations. It focuses on achieving a clean, standardized baseline format that allows you to quickly assess data quality and structure before designing more sophisticated ETL pipelines.
|
|
@@ -122,7 +98,9 @@ This tool prepares CSV files for **basic exploratory data analysis (EDA)**, not
|
|
|
122
98
|
- **Delimiter Normalization**: Converts all field separators to standard commas (`,`)
|
|
123
99
|
- **Field Name Normalization**: Converts column headers to snake_case format
|
|
124
100
|
- **Encoding Normalization**: Auto-detects encoding and converts to UTF-8
|
|
125
|
-
- **
|
|
101
|
+
- **Processing Summary**: Displays comprehensive statistics (rows, columns, file sizes) and error details
|
|
102
|
+
- **Error Reporting**: Exports detailed error file for invalid rows with summary panel
|
|
103
|
+
- **Remote URL Support**: Process CSV files directly from HTTP/HTTPS URLs without downloading
|
|
126
104
|
|
|
127
105
|
## Usage
|
|
128
106
|
|
|
@@ -148,6 +126,9 @@ csvnorm input.csv [options]
|
|
|
148
126
|
# Basic usage
|
|
149
127
|
csvnorm data.csv
|
|
150
128
|
|
|
129
|
+
# Process remote CSV from URL
|
|
130
|
+
csvnorm "https://raw.githubusercontent.com/aborruso/csvnorm/refs/heads/main/test/Trasporto%20Pubblico%20Locale%20Settore%20Pubblico%20Allargato%20-%20Indicatore%202000-2020%20Trasferimenti%20Correnti%20su%20Entrate%20Correnti.csv"
|
|
131
|
+
|
|
151
132
|
# With semicolon delimiter
|
|
152
133
|
csvnorm data.csv -d ';'
|
|
153
134
|
|
|
@@ -163,17 +144,63 @@ csvnorm data.csv -f -V
|
|
|
163
144
|
|
|
164
145
|
### Output
|
|
165
146
|
|
|
166
|
-
Creates a normalized CSV file in
|
|
147
|
+
Creates a normalized CSV file in the specified output directory with:
|
|
167
148
|
- UTF-8 encoding
|
|
168
149
|
- Consistent field delimiters
|
|
169
150
|
- Normalized column names (unless `--keep-names` is specified)
|
|
170
151
|
- Error report if any invalid rows are found (saved as `{input_name}_reject_errors.csv`)
|
|
171
152
|
|
|
153
|
+
For remote URLs:
|
|
154
|
+
- The output filename is derived from the URL's last path segment
|
|
155
|
+
- Encoding is handled automatically by DuckDB
|
|
156
|
+
- HTTP timeout is set to 30 seconds
|
|
157
|
+
- Only public URLs are supported (no authentication)
|
|
158
|
+
|
|
172
159
|
The tool provides modern terminal output with:
|
|
173
160
|
- Progress indicators for multi-step processing
|
|
174
161
|
- Color-coded error messages with panels
|
|
175
|
-
- Success summary table
|
|
176
|
-
-
|
|
162
|
+
- Success summary table with statistics (rows, columns, file sizes)
|
|
163
|
+
- Encoding conversion status (converted/no conversion/remote)
|
|
164
|
+
- Error summary panel with reject count and error types when validation fails
|
|
165
|
+
- Simple styled banner with `--version` and `-V` verbose mode
|
|
166
|
+
|
|
167
|
+
**Success Example:**
|
|
168
|
+
```
|
|
169
|
+
✓ Success
|
|
170
|
+
Input: test/utf8_basic.csv
|
|
171
|
+
Output: output/utf8_basic.csv
|
|
172
|
+
Encoding: ascii (no conversion needed)
|
|
173
|
+
Rows: 2
|
|
174
|
+
Columns: 3
|
|
175
|
+
Input size: 42 B
|
|
176
|
+
Output size: 43 B
|
|
177
|
+
Headers: normalized to snake_case
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
**Error Example:**
|
|
181
|
+
```
|
|
182
|
+
✓ Success
|
|
183
|
+
Input: test/malformed_rows.csv
|
|
184
|
+
Output: output/malformed_rows.csv
|
|
185
|
+
Encoding: ascii (no conversion needed)
|
|
186
|
+
Rows: 1
|
|
187
|
+
Columns: 4
|
|
188
|
+
Input size: 24 B
|
|
189
|
+
Output size: 40 B
|
|
190
|
+
Headers: normalized to snake_case
|
|
191
|
+
|
|
192
|
+
╭──────────────────────────── ! Validation Failed ─────────────────────────────╮
|
|
193
|
+
│ Validation Errors: │
|
|
194
|
+
│ │
|
|
195
|
+
│ Rejected rows: 2 │
|
|
196
|
+
│ │
|
|
197
|
+
│ Error types: │
|
|
198
|
+
│ • Expected Number of Columns: 3 Found: 2 │
|
|
199
|
+
│ • Expected Number of Columns: 3 Found: 4 │
|
|
200
|
+
│ │
|
|
201
|
+
│ Details: output/malformed_rows_reject_errors.csv │
|
|
202
|
+
╰──────────────────────────────────────────────────────────────────────────────╯
|
|
203
|
+
```
|
|
177
204
|
|
|
178
205
|
### Exit Codes
|
|
179
206
|
|
|
@@ -184,15 +211,15 @@ The tool provides modern terminal output with:
|
|
|
184
211
|
|
|
185
212
|
## Requirements
|
|
186
213
|
|
|
187
|
-
- Python 3.
|
|
214
|
+
- Python 3.9+
|
|
188
215
|
- Dependencies (automatically installed):
|
|
189
216
|
- `charset-normalizer>=3.0.0` - Encoding detection
|
|
190
217
|
- `duckdb>=0.9.0` - CSV validation and normalization
|
|
191
218
|
- `rich>=13.0.0` - Modern terminal output formatting
|
|
192
219
|
- `rich-argparse>=1.0.0` - Enhanced CLI help formatting
|
|
220
|
+
- `pyfiglet>=0.8.post1,<1.0.0` - ASCII art banner
|
|
193
221
|
|
|
194
222
|
Optional extras:
|
|
195
|
-
- `[banner]` - ASCII art banner for `--version` and `-V` verbose mode (`pyfiglet>=1.0.0`)
|
|
196
223
|
- `[dev]` - Development dependencies (`pytest>=7.0.0`, `pytest-cov>=4.0.0`, `ruff>=0.1.0`)
|
|
197
224
|
|
|
198
225
|
## Development
|
|
@@ -221,7 +248,7 @@ pytest tests/ -v
|
|
|
221
248
|
### Project Structure
|
|
222
249
|
|
|
223
250
|
```
|
|
224
|
-
|
|
251
|
+
csvnorm/
|
|
225
252
|
├── src/csvnorm/
|
|
226
253
|
│ ├── __init__.py # Package version
|
|
227
254
|
│ ├── __main__.py # python -m support
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
csvnorm/__init__.py,sha256=frEketezK5MWX8eiy1mFgw_3QeMcH4cVgVsNXtD1Jgg,264
|
|
2
|
+
csvnorm/__main__.py,sha256=WDURvgm7E-yMN_ZvbBBmgzIJz5naonZxQ9RYcoD7ves,110
|
|
3
|
+
csvnorm/cli.py,sha256=UEe0hRGWx9m6ZLGLd9TIaJ_uayclNTh_i0fO_JEgTXY,4166
|
|
4
|
+
csvnorm/core.py,sha256=0tgOmPr4JSMSzgSxT8ffCk_IrOWGLI2hTzhV9_xNQQ8,9945
|
|
5
|
+
csvnorm/encoding.py,sha256=kV7J8nVWDv_F6bKUCrMj2rReMB4Zecf4gwFeebdjm98,3334
|
|
6
|
+
csvnorm/ui.py,sha256=rOfVYjnTImplMMc-QGmcYUXzzZ513Y1bCjlO2jPxG2A,3893
|
|
7
|
+
csvnorm/utils.py,sha256=slV2aADBDfg9RHZJE-jmRuzPfY1RX0Wq-D1A4oBN7Yo,5020
|
|
8
|
+
csvnorm/validation.py,sha256=I7m_nxsGDROy5pBkNU-H7qEVYEAT19vw5alkrvZqGh4,4539
|
|
9
|
+
csvnorm-0.3.11.dist-info/licenses/LICENSE,sha256=iPlktyjNwNMal_HUjGRTAavIF8mHV3CxGW4g5XmG3ZU,1075
|
|
10
|
+
csvnorm-0.3.11.dist-info/METADATA,sha256=7c2Bu-M-4UiOqqVOC5Nm-I88ZhmC2BquMSiGRjD9VBo,9808
|
|
11
|
+
csvnorm-0.3.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
12
|
+
csvnorm-0.3.11.dist-info/entry_points.txt,sha256=6GmXNAdKzhHle3KV6Gbe95RS6edR0Fe6PVqdy1ED23k,45
|
|
13
|
+
csvnorm-0.3.11.dist-info/top_level.txt,sha256=jMenLN9avlYH8z8v23sReLFkUYx0awPGFm8cTnoAl-0,8
|
|
14
|
+
csvnorm-0.3.11.dist-info/RECORD,,
|
csvnorm-0.3.3.dist-info/RECORD
DELETED
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
csvnorm/__init__.py,sha256=8njXIycxL0qSI5Q9bVGyTaM41j_kKX9jV7TeQOSAQGE,263
|
|
2
|
-
csvnorm/__main__.py,sha256=WDURvgm7E-yMN_ZvbBBmgzIJz5naonZxQ9RYcoD7ves,110
|
|
3
|
-
csvnorm/cli.py,sha256=MwIPahLktbulF6NYRWyBsE4s9Al9_aSdA1zvzuI0AiQ,3815
|
|
4
|
-
csvnorm/core.py,sha256=_kTaui_2IhqrN_UxJpcjwXYXEvqaRMhML49Xlx-e0p0,6633
|
|
5
|
-
csvnorm/encoding.py,sha256=kV7J8nVWDv_F6bKUCrMj2rReMB4Zecf4gwFeebdjm98,3334
|
|
6
|
-
csvnorm/utils.py,sha256=gvwDToOx3YoKCfVPyCmxcSa7teCWFB2SmAGr-jV5w_Y,1761
|
|
7
|
-
csvnorm/validation.py,sha256=iXdfalAGDNB9kPefyzHXGI9uc-HLAG5pQ_-T93ShppY,2815
|
|
8
|
-
csvnorm-0.3.3.dist-info/licenses/LICENSE,sha256=iPlktyjNwNMal_HUjGRTAavIF8mHV3CxGW4g5XmG3ZU,1075
|
|
9
|
-
csvnorm-0.3.3.dist-info/METADATA,sha256=xKJmLVX9RoB22KwAAlxAvWB_KA9h68m5V-UyFaS_DGo,7840
|
|
10
|
-
csvnorm-0.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
11
|
-
csvnorm-0.3.3.dist-info/entry_points.txt,sha256=6GmXNAdKzhHle3KV6Gbe95RS6edR0Fe6PVqdy1ED23k,45
|
|
12
|
-
csvnorm-0.3.3.dist-info/top_level.txt,sha256=jMenLN9avlYH8z8v23sReLFkUYx0awPGFm8cTnoAl-0,8
|
|
13
|
-
csvnorm-0.3.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|