csvnorm 0.3.3__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csvnorm/__init__.py +1 -1
- csvnorm/cli.py +12 -5
- csvnorm/core.py +174 -64
- csvnorm/utils.py +59 -4
- csvnorm/validation.py +18 -4
- {csvnorm-0.3.3.dist-info → csvnorm-0.3.4.dist-info}/METADATA +13 -25
- csvnorm-0.3.4.dist-info/RECORD +13 -0
- csvnorm-0.3.3.dist-info/RECORD +0 -13
- {csvnorm-0.3.3.dist-info → csvnorm-0.3.4.dist-info}/WHEEL +0 -0
- {csvnorm-0.3.3.dist-info → csvnorm-0.3.4.dist-info}/entry_points.txt +0 -0
- {csvnorm-0.3.3.dist-info → csvnorm-0.3.4.dist-info}/licenses/LICENSE +0 -0
- {csvnorm-0.3.3.dist-info → csvnorm-0.3.4.dist-info}/top_level.txt +0 -0
csvnorm/__init__.py
CHANGED
csvnorm/cli.py
CHANGED
|
@@ -17,6 +17,7 @@ console = Console()
|
|
|
17
17
|
def show_banner() -> None:
|
|
18
18
|
"""Show ASCII art banner."""
|
|
19
19
|
from pyfiglet import figlet_format
|
|
20
|
+
|
|
20
21
|
banner = figlet_format("csvnorm", font="slant")
|
|
21
22
|
console.print(banner, style="bold cyan")
|
|
22
23
|
|
|
@@ -28,10 +29,15 @@ class VersionAction(argparse.Action):
|
|
|
28
29
|
show_banner()
|
|
29
30
|
console.print(f"csvnorm {__version__}", style="bold")
|
|
30
31
|
console.print()
|
|
31
|
-
console.print(
|
|
32
|
+
console.print(
|
|
33
|
+
"Validate and normalize CSV files for exploratory data analysis",
|
|
34
|
+
style="dim",
|
|
35
|
+
)
|
|
32
36
|
console.print()
|
|
33
37
|
console.print("Author: aborruso", style="dim")
|
|
34
|
-
console.print(
|
|
38
|
+
console.print(
|
|
39
|
+
"Repository: https://github.com/aborruso/csvnorm", style="dim cyan"
|
|
40
|
+
)
|
|
35
41
|
console.print("License: MIT", style="dim")
|
|
36
42
|
parser.exit()
|
|
37
43
|
|
|
@@ -46,14 +52,15 @@ def create_parser() -> argparse.ArgumentParser:
|
|
|
46
52
|
Examples:
|
|
47
53
|
csvnorm data.csv -d ';' -o output_folder --force
|
|
48
54
|
csvnorm data.csv --keep-names --delimiter '\\t'
|
|
55
|
+
csvnorm https://example.com/data.csv -o output
|
|
49
56
|
csvnorm data.csv -V
|
|
50
57
|
""",
|
|
51
58
|
)
|
|
52
59
|
|
|
53
60
|
parser.add_argument(
|
|
54
61
|
"input_file",
|
|
55
|
-
type=
|
|
56
|
-
help="Input CSV file path",
|
|
62
|
+
type=str,
|
|
63
|
+
help="Input CSV file path or HTTP/HTTPS URL",
|
|
57
64
|
)
|
|
58
65
|
|
|
59
66
|
parser.add_argument(
|
|
@@ -122,7 +129,7 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
122
129
|
if argv is None:
|
|
123
130
|
argv = sys.argv[1:]
|
|
124
131
|
|
|
125
|
-
if not argv or (len(argv) == 1 and argv[0] in [
|
|
132
|
+
if not argv or (len(argv) == 1 and argv[0] in ["-h", "--help"]):
|
|
126
133
|
parser.print_help()
|
|
127
134
|
return 0 if argv else 2
|
|
128
135
|
|
csvnorm/core.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
4
|
from pathlib import Path
|
|
5
|
+
from typing import Union
|
|
5
6
|
|
|
6
7
|
from rich.console import Console
|
|
7
8
|
from rich.panel import Panel
|
|
@@ -9,7 +10,14 @@ from rich.progress import Progress, SpinnerColumn, TextColumn
|
|
|
9
10
|
from rich.table import Table
|
|
10
11
|
|
|
11
12
|
from csvnorm.encoding import convert_to_utf8, detect_encoding, needs_conversion
|
|
12
|
-
from csvnorm.utils import
|
|
13
|
+
from csvnorm.utils import (
|
|
14
|
+
ensure_output_dir,
|
|
15
|
+
extract_filename_from_url,
|
|
16
|
+
is_url,
|
|
17
|
+
to_snake_case,
|
|
18
|
+
validate_delimiter,
|
|
19
|
+
validate_url,
|
|
20
|
+
)
|
|
13
21
|
from csvnorm.validation import normalize_csv, validate_csv
|
|
14
22
|
|
|
15
23
|
logger = logging.getLogger("csvnorm")
|
|
@@ -17,7 +25,7 @@ console = Console()
|
|
|
17
25
|
|
|
18
26
|
|
|
19
27
|
def process_csv(
|
|
20
|
-
input_file:
|
|
28
|
+
input_file: str,
|
|
21
29
|
output_dir: Path,
|
|
22
30
|
force: bool = False,
|
|
23
31
|
keep_names: bool = False,
|
|
@@ -27,7 +35,7 @@ def process_csv(
|
|
|
27
35
|
"""Main CSV processing pipeline.
|
|
28
36
|
|
|
29
37
|
Args:
|
|
30
|
-
input_file: Path to input CSV file.
|
|
38
|
+
input_file: Path to input CSV file or HTTP/HTTPS URL.
|
|
31
39
|
output_dir: Directory for output files.
|
|
32
40
|
force: If True, overwrite existing output files.
|
|
33
41
|
keep_names: If True, keep original column names.
|
|
@@ -37,32 +45,50 @@ def process_csv(
|
|
|
37
45
|
Returns:
|
|
38
46
|
Exit code: 0 for success, 1 for error.
|
|
39
47
|
"""
|
|
40
|
-
#
|
|
41
|
-
|
|
42
|
-
console.print(Panel(
|
|
43
|
-
f"[bold red]Error:[/bold red] Input file not found\n{input_file}",
|
|
44
|
-
border_style="red"
|
|
45
|
-
))
|
|
46
|
-
return 1
|
|
48
|
+
# Detect if input is URL or file
|
|
49
|
+
is_remote = is_url(input_file)
|
|
47
50
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
51
|
+
input_path: Union[str, Path]
|
|
52
|
+
if is_remote:
|
|
53
|
+
# Validate URL
|
|
54
|
+
try:
|
|
55
|
+
validate_url(input_file)
|
|
56
|
+
except ValueError as e:
|
|
57
|
+
console.print(Panel(f"[bold red]Error:[/bold red] {e}", border_style="red"))
|
|
58
|
+
return 1
|
|
59
|
+
base_name = extract_filename_from_url(input_file)
|
|
60
|
+
input_path = input_file # Keep as string for DuckDB
|
|
61
|
+
else:
|
|
62
|
+
# Validate local file
|
|
63
|
+
file_path = Path(input_file)
|
|
64
|
+
if not file_path.exists():
|
|
65
|
+
console.print(
|
|
66
|
+
Panel(
|
|
67
|
+
f"[bold red]Error:[/bold red] Input file not found\n{file_path}",
|
|
68
|
+
border_style="red",
|
|
69
|
+
)
|
|
70
|
+
)
|
|
71
|
+
return 1
|
|
72
|
+
|
|
73
|
+
if not file_path.is_file():
|
|
74
|
+
console.print(
|
|
75
|
+
Panel(
|
|
76
|
+
f"[bold red]Error:[/bold red] Not a file\n{file_path}",
|
|
77
|
+
border_style="red",
|
|
78
|
+
)
|
|
79
|
+
)
|
|
80
|
+
return 1
|
|
81
|
+
|
|
82
|
+
base_name = to_snake_case(file_path.name)
|
|
83
|
+
input_path = file_path
|
|
54
84
|
|
|
55
85
|
try:
|
|
56
86
|
validate_delimiter(delimiter)
|
|
57
87
|
except ValueError as e:
|
|
58
|
-
console.print(Panel(
|
|
59
|
-
f"[bold red]Error:[/bold red] {e}",
|
|
60
|
-
border_style="red"
|
|
61
|
-
))
|
|
88
|
+
console.print(Panel(f"[bold red]Error:[/bold red] {e}", border_style="red"))
|
|
62
89
|
return 1
|
|
63
90
|
|
|
64
91
|
# Setup paths
|
|
65
|
-
base_name = to_snake_case(input_file.name)
|
|
66
92
|
ensure_output_dir(output_dir)
|
|
67
93
|
|
|
68
94
|
output_file = output_dir / f"{base_name}.csv"
|
|
@@ -71,12 +97,14 @@ def process_csv(
|
|
|
71
97
|
|
|
72
98
|
# Check if output exists
|
|
73
99
|
if output_file.exists() and not force:
|
|
74
|
-
console.print(
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
100
|
+
console.print(
|
|
101
|
+
Panel(
|
|
102
|
+
f"[bold yellow]Warning:[/bold yellow] Output file already exists\n\n"
|
|
103
|
+
f"{output_file}\n\n"
|
|
104
|
+
f"Use [bold]--force[/bold] to overwrite.",
|
|
105
|
+
border_style="yellow",
|
|
106
|
+
)
|
|
107
|
+
)
|
|
80
108
|
return 1
|
|
81
109
|
|
|
82
110
|
# Clean up previous reject file
|
|
@@ -91,55 +119,135 @@ def process_csv(
|
|
|
91
119
|
SpinnerColumn(),
|
|
92
120
|
TextColumn("[progress.description]{task.description}"),
|
|
93
121
|
console=console,
|
|
94
|
-
transient=True
|
|
122
|
+
transient=True,
|
|
95
123
|
) as progress:
|
|
96
|
-
|
|
97
|
-
task = progress.add_task("[cyan]Detecting encoding...", total=None)
|
|
98
|
-
try:
|
|
99
|
-
encoding = detect_encoding(input_file)
|
|
100
|
-
except ValueError as e:
|
|
101
|
-
progress.stop()
|
|
102
|
-
console.print(Panel(
|
|
103
|
-
f"[bold red]Error:[/bold red] {e}",
|
|
104
|
-
border_style="red"
|
|
105
|
-
))
|
|
106
|
-
return 1
|
|
124
|
+
task = progress.add_task("[cyan]Processing...", total=None)
|
|
107
125
|
|
|
108
|
-
|
|
109
|
-
|
|
126
|
+
# For remote URLs, skip encoding detection/conversion
|
|
127
|
+
if is_remote:
|
|
128
|
+
progress.update(
|
|
129
|
+
task,
|
|
130
|
+
description="[green]✓[/green] Remote URL (encoding handled by DuckDB)",
|
|
131
|
+
)
|
|
132
|
+
working_file = input_path # Keep URL as string
|
|
133
|
+
encoding = "remote"
|
|
134
|
+
else:
|
|
135
|
+
# Step 1: Detect encoding (local files only)
|
|
136
|
+
# input_path is Path here (set in else block above)
|
|
137
|
+
file_input_path = input_path # Type narrowing for mypy
|
|
138
|
+
assert isinstance(file_input_path, Path)
|
|
110
139
|
|
|
111
|
-
|
|
112
|
-
working_file = input_file
|
|
113
|
-
if needs_conversion(encoding):
|
|
114
|
-
progress.update(task, description=f"[cyan]Converting from {encoding} to UTF-8...")
|
|
140
|
+
progress.update(task, description="[cyan]Detecting encoding...")
|
|
115
141
|
try:
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
temp_files.append(temp_utf8_file)
|
|
119
|
-
progress.update(task, description=f"[green]✓[/green] Converted to UTF-8")
|
|
120
|
-
except (UnicodeDecodeError, LookupError) as e:
|
|
142
|
+
encoding = detect_encoding(file_input_path)
|
|
143
|
+
except ValueError as e:
|
|
121
144
|
progress.stop()
|
|
122
|
-
console.print(
|
|
123
|
-
f"[bold red]Error:[/bold red]
|
|
124
|
-
|
|
125
|
-
))
|
|
145
|
+
console.print(
|
|
146
|
+
Panel(f"[bold red]Error:[/bold red] {e}", border_style="red")
|
|
147
|
+
)
|
|
126
148
|
return 1
|
|
127
|
-
|
|
128
|
-
|
|
149
|
+
|
|
150
|
+
logger.debug(f"Detected encoding: {encoding}")
|
|
151
|
+
progress.update(
|
|
152
|
+
task, description=f"[green]✓[/green] Detected encoding: {encoding}"
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
# Step 2: Convert to UTF-8 if needed
|
|
156
|
+
working_file = file_input_path
|
|
157
|
+
if needs_conversion(encoding):
|
|
158
|
+
progress.update(
|
|
159
|
+
task,
|
|
160
|
+
description=f"[cyan]Converting from {encoding} to UTF-8...",
|
|
161
|
+
)
|
|
162
|
+
try:
|
|
163
|
+
convert_to_utf8(file_input_path, temp_utf8_file, encoding)
|
|
164
|
+
working_file = temp_utf8_file
|
|
165
|
+
temp_files.append(temp_utf8_file)
|
|
166
|
+
progress.update(
|
|
167
|
+
task, description=f"[green]✓[/green] Converted to UTF-8"
|
|
168
|
+
)
|
|
169
|
+
except (UnicodeDecodeError, LookupError) as e:
|
|
170
|
+
progress.stop()
|
|
171
|
+
console.print(
|
|
172
|
+
Panel(
|
|
173
|
+
f"[bold red]Error:[/bold red] Encoding conversion failed\n{e}",
|
|
174
|
+
border_style="red",
|
|
175
|
+
)
|
|
176
|
+
)
|
|
177
|
+
return 1
|
|
178
|
+
else:
|
|
179
|
+
progress.update(
|
|
180
|
+
task,
|
|
181
|
+
description=f"[green]✓[/green] Encoding: {encoding} (no conversion needed)",
|
|
182
|
+
)
|
|
129
183
|
|
|
130
184
|
# Step 3: Validate CSV
|
|
131
185
|
progress.update(task, description="[cyan]Validating CSV...")
|
|
132
186
|
logger.debug("Validating CSV with DuckDB...")
|
|
133
|
-
|
|
187
|
+
|
|
188
|
+
try:
|
|
189
|
+
is_valid = validate_csv(working_file, reject_file, is_remote=is_remote)
|
|
190
|
+
except Exception as e:
|
|
191
|
+
progress.stop()
|
|
192
|
+
error_msg = str(e)
|
|
193
|
+
|
|
194
|
+
# Check for common HTTP errors
|
|
195
|
+
if "HTTP Error" in error_msg or "HTTPException" in error_msg:
|
|
196
|
+
if "404" in error_msg:
|
|
197
|
+
console.print(
|
|
198
|
+
Panel(
|
|
199
|
+
"[bold red]Error:[/bold red] Remote CSV file not found (HTTP 404)\n\n"
|
|
200
|
+
f"URL: [cyan]{input_file}[/cyan]\n\n"
|
|
201
|
+
"Please check the URL is correct.",
|
|
202
|
+
border_style="red",
|
|
203
|
+
)
|
|
204
|
+
)
|
|
205
|
+
elif "401" in error_msg or "403" in error_msg:
|
|
206
|
+
console.print(
|
|
207
|
+
Panel(
|
|
208
|
+
"[bold red]Error:[/bold red] Authentication required (HTTP 401/403)\n\n"
|
|
209
|
+
f"URL: [cyan]{input_file}[/cyan]\n\n"
|
|
210
|
+
"This tool only supports public URLs without authentication.\n"
|
|
211
|
+
"Please download the file manually first.",
|
|
212
|
+
border_style="red",
|
|
213
|
+
)
|
|
214
|
+
)
|
|
215
|
+
elif (
|
|
216
|
+
"timeout" in error_msg.lower()
|
|
217
|
+
or "timed out" in error_msg.lower()
|
|
218
|
+
):
|
|
219
|
+
console.print(
|
|
220
|
+
Panel(
|
|
221
|
+
"[bold red]Error:[/bold red] HTTP request timeout (30 seconds)\n\n"
|
|
222
|
+
f"URL: [cyan]{input_file}[/cyan]\n\n"
|
|
223
|
+
"The remote server took too long to respond.\n"
|
|
224
|
+
"Try again later or download the file manually.",
|
|
225
|
+
border_style="red",
|
|
226
|
+
)
|
|
227
|
+
)
|
|
228
|
+
else:
|
|
229
|
+
console.print(
|
|
230
|
+
Panel(
|
|
231
|
+
f"[bold red]Error:[/bold red] HTTP request failed\n\n"
|
|
232
|
+
f"{error_msg}",
|
|
233
|
+
border_style="red",
|
|
234
|
+
)
|
|
235
|
+
)
|
|
236
|
+
else:
|
|
237
|
+
# Re-raise non-HTTP errors
|
|
238
|
+
raise
|
|
239
|
+
return 1
|
|
134
240
|
|
|
135
241
|
if not is_valid:
|
|
136
242
|
progress.stop()
|
|
137
|
-
console.print(
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
243
|
+
console.print(
|
|
244
|
+
Panel(
|
|
245
|
+
"[bold red]Error:[/bold red] DuckDB encountered invalid rows\n\n"
|
|
246
|
+
f"Details: [cyan]{reject_file}[/cyan]\n\n"
|
|
247
|
+
"Please fix the issues and try again.",
|
|
248
|
+
border_style="red",
|
|
249
|
+
)
|
|
250
|
+
)
|
|
143
251
|
return 1
|
|
144
252
|
|
|
145
253
|
progress.update(task, description="[green]✓[/green] CSV validated")
|
|
@@ -152,6 +260,7 @@ def process_csv(
|
|
|
152
260
|
output_path=output_file,
|
|
153
261
|
delimiter=delimiter,
|
|
154
262
|
normalize_names=not keep_names,
|
|
263
|
+
is_remote=is_remote,
|
|
155
264
|
)
|
|
156
265
|
|
|
157
266
|
logger.debug(f"Output written to: {output_file}")
|
|
@@ -162,7 +271,8 @@ def process_csv(
|
|
|
162
271
|
table.add_row("[green]✓[/green] Success", "")
|
|
163
272
|
table.add_row("Input:", f"[cyan]{input_file}[/cyan]")
|
|
164
273
|
table.add_row("Output:", f"[cyan]{output_file}[/cyan]")
|
|
165
|
-
|
|
274
|
+
if not is_remote:
|
|
275
|
+
table.add_row("Encoding:", encoding)
|
|
166
276
|
if delimiter != ",":
|
|
167
277
|
table.add_row("Delimiter:", repr(delimiter))
|
|
168
278
|
if not keep_names:
|
csvnorm/utils.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
import logging
|
|
4
4
|
import re
|
|
5
5
|
from pathlib import Path
|
|
6
|
+
from urllib.parse import urlparse
|
|
6
7
|
|
|
7
8
|
from rich.logging import RichHandler
|
|
8
9
|
|
|
@@ -45,10 +46,7 @@ def setup_logger(verbose: bool = False) -> logging.Logger:
|
|
|
45
46
|
|
|
46
47
|
if not logger.handlers:
|
|
47
48
|
handler = RichHandler(
|
|
48
|
-
show_time=False,
|
|
49
|
-
show_path=verbose,
|
|
50
|
-
markup=True,
|
|
51
|
-
rich_tracebacks=True
|
|
49
|
+
show_time=False, show_path=verbose, markup=True, rich_tracebacks=True
|
|
52
50
|
)
|
|
53
51
|
logger.addHandler(handler)
|
|
54
52
|
|
|
@@ -69,3 +67,60 @@ def validate_delimiter(delimiter: str) -> None:
|
|
|
69
67
|
def ensure_output_dir(output_dir: Path) -> None:
|
|
70
68
|
"""Create output directory if it doesn't exist."""
|
|
71
69
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def is_url(input_str: str) -> bool:
    """Check if input string is an HTTP/HTTPS URL.

    Args:
        input_str: String to check.

    Returns:
        True if input is an HTTP/HTTPS URL with a non-empty host part,
        False otherwise (including non-string or unparsable input).
    """
    # Anything that is not a str can never match the str schemes below;
    # answering here lets the except clause stay narrow instead of
    # swallowing every exception type.
    if not isinstance(input_str, str):
        return False

    try:
        result = urlparse(input_str)
    except ValueError:
        # urlparse raises ValueError for malformed authorities,
        # e.g. unbalanced IPv6 brackets ("http://[invalid").
        return False

    # Require both a supported scheme and a host: "http://" alone is not usable.
    return result.scheme in ("http", "https") and bool(result.netloc)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def validate_url(url: str) -> None:
    """Validate that a URL uses HTTP/HTTPS and names a host.

    Args:
        url: URL to validate.

    Raises:
        ValueError: If the URL scheme is not HTTP/HTTPS, or if the URL
            has no host part (e.g. ``"http://"``).
    """
    parsed = urlparse(url)

    if parsed.scheme not in ("http", "https"):
        # Avoid the meaningless "Got: ://" when the input has no scheme at all.
        got = f"{parsed.scheme}://" if parsed.scheme else "no scheme"
        raise ValueError(f"Only HTTP/HTTPS URLs are supported. Got: {got}")

    # Same host requirement that is_url() applies, so both helpers agree on
    # what counts as a usable remote URL.
    if not parsed.netloc:
        raise ValueError(
            f"Only HTTP/HTTPS URLs are supported. Missing host in URL: {url}"
        )
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def extract_filename_from_url(url: str) -> str:
    """Extract and normalize filename from URL.

    The last path segment of the URL is percent-decoded, a trailing
    ``.csv`` extension (any letter case) is removed, and the result is passed
    through ``to_snake_case`` (defined elsewhere in this module).

    Args:
        url: URL to extract filename from.

    Returns:
        Normalized filename without the ``.csv`` extension, or ``"data"``
        when the URL yields no usable name.
    """
    from urllib.parse import unquote

    parsed = urlparse(url)

    # Only the path is considered, so query string and fragment are ignored.
    path = parsed.path.rstrip("/")
    filename = path.split("/")[-1] if path else "data"

    # Decode URL encoding (%20 -> space, etc.)
    filename = unquote(filename)

    # Remove extension if present
    if filename.lower().endswith(".csv"):
        filename = filename[:-4]

    if not filename:
        return "data"

    # Fall back again after normalization: a name such as "%20.csv" is
    # non-empty here but may normalize to an empty string, which would
    # otherwise produce an output file named just ".csv".
    return to_snake_case(filename) or "data"
|
csvnorm/validation.py
CHANGED
|
@@ -2,18 +2,22 @@
|
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
4
|
from pathlib import Path
|
|
5
|
+
from typing import Union
|
|
5
6
|
|
|
6
7
|
import duckdb
|
|
7
8
|
|
|
8
9
|
logger = logging.getLogger("csvnorm")
|
|
9
10
|
|
|
10
11
|
|
|
11
|
-
def validate_csv(
|
|
12
|
+
def validate_csv(
|
|
13
|
+
file_path: Union[Path, str], reject_file: Path, is_remote: bool = False
|
|
14
|
+
) -> bool:
|
|
12
15
|
"""Validate CSV file using DuckDB and export rejected rows.
|
|
13
16
|
|
|
14
17
|
Args:
|
|
15
|
-
file_path: Path to CSV file to validate.
|
|
18
|
+
file_path: Path to CSV file to validate or URL string.
|
|
16
19
|
reject_file: Path to write rejected rows.
|
|
20
|
+
is_remote: True if file_path is a remote URL.
|
|
17
21
|
|
|
18
22
|
Returns:
|
|
19
23
|
True if validation passes (no rejected rows), False otherwise.
|
|
@@ -23,6 +27,10 @@ def validate_csv(file_path: Path, reject_file: Path) -> bool:
|
|
|
23
27
|
conn = duckdb.connect()
|
|
24
28
|
|
|
25
29
|
try:
|
|
30
|
+
# Set HTTP timeout for remote URLs (30 seconds)
|
|
31
|
+
if is_remote:
|
|
32
|
+
conn.execute("SET http_timeout=30000")
|
|
33
|
+
|
|
26
34
|
# Read CSV with store_rejects to capture malformed rows
|
|
27
35
|
# Use all_varchar=true to avoid type inference failures
|
|
28
36
|
conn.execute(f"""
|
|
@@ -50,24 +58,30 @@ def validate_csv(file_path: Path, reject_file: Path) -> bool:
|
|
|
50
58
|
|
|
51
59
|
|
|
52
60
|
def normalize_csv(
|
|
53
|
-
input_path: Path,
|
|
61
|
+
input_path: Union[Path, str],
|
|
54
62
|
output_path: Path,
|
|
55
63
|
delimiter: str = ",",
|
|
56
64
|
normalize_names: bool = True,
|
|
65
|
+
is_remote: bool = False,
|
|
57
66
|
) -> None:
|
|
58
67
|
"""Normalize CSV file using DuckDB.
|
|
59
68
|
|
|
60
69
|
Args:
|
|
61
|
-
input_path: Path to input CSV file.
|
|
70
|
+
input_path: Path to input CSV file or URL string.
|
|
62
71
|
output_path: Path for normalized output file.
|
|
63
72
|
delimiter: Output field delimiter.
|
|
64
73
|
normalize_names: If True, convert column names to snake_case.
|
|
74
|
+
is_remote: True if input_path is a remote URL.
|
|
65
75
|
"""
|
|
66
76
|
logger.debug(f"Normalizing CSV: {input_path} -> {output_path}")
|
|
67
77
|
|
|
68
78
|
conn = duckdb.connect()
|
|
69
79
|
|
|
70
80
|
try:
|
|
81
|
+
# Set HTTP timeout for remote URLs (30 seconds)
|
|
82
|
+
if is_remote:
|
|
83
|
+
conn.execute("SET http_timeout=30000")
|
|
84
|
+
|
|
71
85
|
# Build read options
|
|
72
86
|
read_opts = "sample_size=-1, all_varchar=true"
|
|
73
87
|
if normalize_names:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: csvnorm
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.4
|
|
4
4
|
Summary: A command-line utility to validate and normalize CSV files
|
|
5
5
|
Author-email: aborruso <aborruso@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -54,8 +54,6 @@ Provides-Extra: dev
|
|
|
54
54
|
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
55
55
|
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
56
56
|
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
57
|
-
Provides-Extra: banner
|
|
58
|
-
Requires-Dist: pyfiglet>=1.0.0; extra == "banner"
|
|
59
57
|
Dynamic: license-file
|
|
60
58
|
|
|
61
59
|
[PyPI](https://pypi.org/project/csvnorm/)
|
|
@@ -81,26 +79,6 @@ Or with pip:
|
|
|
81
79
|
pip install csvnorm
|
|
82
80
|
```
|
|
83
81
|
|
|
84
|
-
For ASCII art banner (shown with `--version` and `-V`):
|
|
85
|
-
|
|
86
|
-
```bash
|
|
87
|
-
uv tool install 'csvnorm[banner]'
|
|
88
|
-
# or
|
|
89
|
-
pip install 'csvnorm[banner]'
|
|
90
|
-
```
|
|
91
|
-
|
|
92
|
-
Example with banner:
|
|
93
|
-
```bash
|
|
94
|
-
csvnorm --version
|
|
95
|
-
# Output:
|
|
96
|
-
# ___________ ______ ____ _________ ___
|
|
97
|
-
# / ___/ ___/ | / / __ \/ __ \/ ___/ __ `__ \
|
|
98
|
-
# / /__(__ )| |/ / / / / /_/ / / / / / / / /
|
|
99
|
-
# \___/____/ |___/_/ /_/\____/_/ /_/ /_/ /_/
|
|
100
|
-
#
|
|
101
|
-
# csvnorm 0.3.1
|
|
102
|
-
```
|
|
103
|
-
|
|
104
82
|
## Purpose
|
|
105
83
|
|
|
106
84
|
This tool prepares CSV files for **basic exploratory data analysis (EDA)**, not for complex transformations. It focuses on achieving a clean, standardized baseline format that allows you to quickly assess data quality and structure before designing more sophisticated ETL pipelines.
|
|
@@ -123,6 +101,7 @@ This tool prepares CSV files for **basic exploratory data analysis (EDA)**, not
|
|
|
123
101
|
- **Field Name Normalization**: Converts column headers to snake_case format
|
|
124
102
|
- **Encoding Normalization**: Auto-detects encoding and converts to UTF-8
|
|
125
103
|
- **Error Reporting**: Exports detailed error file for invalid rows
|
|
104
|
+
- **Remote URL Support**: Process CSV files directly from HTTP/HTTPS URLs without downloading
|
|
126
105
|
|
|
127
106
|
## Usage
|
|
128
107
|
|
|
@@ -148,6 +127,9 @@ csvnorm input.csv [options]
|
|
|
148
127
|
# Basic usage
|
|
149
128
|
csvnorm data.csv
|
|
150
129
|
|
|
130
|
+
# Process remote CSV from URL
|
|
131
|
+
csvnorm "https://raw.githubusercontent.com/aborruso/csvnorm/refs/heads/main/test/Trasporto%20Pubblico%20Locale%20Settore%20Pubblico%20Allargato%20-%20Indicatore%202000-2020%20Trasferimenti%20Correnti%20su%20Entrate%20Correnti.csv"
|
|
132
|
+
|
|
151
133
|
# With semicolon delimiter
|
|
152
134
|
csvnorm data.csv -d ';'
|
|
153
135
|
|
|
@@ -169,11 +151,17 @@ Creates a normalized CSV file in the specified output directory with:
|
|
|
169
151
|
- Normalized column names (unless `--keep-names` is specified)
|
|
170
152
|
- Error report if any invalid rows are found (saved as `{input_name}_reject_errors.csv`)
|
|
171
153
|
|
|
154
|
+
For remote URLs:
|
|
155
|
+
- The output filename is derived from the URL's last path segment
|
|
156
|
+
- Encoding is handled automatically by DuckDB
|
|
157
|
+
- HTTP timeout is set to 30 seconds
|
|
158
|
+
- Only public URLs are supported (no authentication)
|
|
159
|
+
|
|
172
160
|
The tool provides modern terminal output with:
|
|
173
161
|
- Progress indicators for multi-step processing
|
|
174
162
|
- Color-coded error messages with panels
|
|
175
163
|
- Success summary table showing encoding, paths, and settings
|
|
176
|
-
-
|
|
164
|
+
- ASCII art banner with `--version` and `-V` verbose mode
|
|
177
165
|
|
|
178
166
|
### Exit Codes
|
|
179
167
|
|
|
@@ -190,9 +178,9 @@ The tool provides modern terminal output with:
|
|
|
190
178
|
- `duckdb>=0.9.0` - CSV validation and normalization
|
|
191
179
|
- `rich>=13.0.0` - Modern terminal output formatting
|
|
192
180
|
- `rich-argparse>=1.0.0` - Enhanced CLI help formatting
|
|
181
|
+
- `pyfiglet>=1.0.0` - ASCII art banner
|
|
193
182
|
|
|
194
183
|
Optional extras:
|
|
195
|
-
- `[banner]` - ASCII art banner for `--version` and `-V` verbose mode (`pyfiglet>=1.0.0`)
|
|
196
184
|
- `[dev]` - Development dependencies (`pytest>=7.0.0`, `pytest-cov>=4.0.0`, `ruff>=0.1.0`)
|
|
197
185
|
|
|
198
186
|
## Development
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
csvnorm/__init__.py,sha256=OvADr4XLxKa9CBE8oTVtV_YBCvr7oV8cgLN68cUtC1E,263
|
|
2
|
+
csvnorm/__main__.py,sha256=WDURvgm7E-yMN_ZvbBBmgzIJz5naonZxQ9RYcoD7ves,110
|
|
3
|
+
csvnorm/cli.py,sha256=DZYclAKMhyZp234D_aWJUncVXZJDGO4u4Jh_fVHlz-g,3939
|
|
4
|
+
csvnorm/core.py,sha256=fby0c4Fs7zkvC3pqvxo8U-4eXm-SnQJ8UivluReDxxc,11126
|
|
5
|
+
csvnorm/encoding.py,sha256=kV7J8nVWDv_F6bKUCrMj2rReMB4Zecf4gwFeebdjm98,3334
|
|
6
|
+
csvnorm/utils.py,sha256=k5gYxlmdtJOJEhOU1UxnmPb8Akn3UUIsB02S-t5oj4c,3227
|
|
7
|
+
csvnorm/validation.py,sha256=cB0rASU-f7C8M539lFoR7bWhPmG5_LfM7f3S5mRSqAM,3321
|
|
8
|
+
csvnorm-0.3.4.dist-info/licenses/LICENSE,sha256=iPlktyjNwNMal_HUjGRTAavIF8mHV3CxGW4g5XmG3ZU,1075
|
|
9
|
+
csvnorm-0.3.4.dist-info/METADATA,sha256=BkJEIeef7w1IsC5TMOHH1DL-Hj6A6PD-ZbCDbwRmCpg,7857
|
|
10
|
+
csvnorm-0.3.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
11
|
+
csvnorm-0.3.4.dist-info/entry_points.txt,sha256=6GmXNAdKzhHle3KV6Gbe95RS6edR0Fe6PVqdy1ED23k,45
|
|
12
|
+
csvnorm-0.3.4.dist-info/top_level.txt,sha256=jMenLN9avlYH8z8v23sReLFkUYx0awPGFm8cTnoAl-0,8
|
|
13
|
+
csvnorm-0.3.4.dist-info/RECORD,,
|
csvnorm-0.3.3.dist-info/RECORD
DELETED
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
csvnorm/__init__.py,sha256=8njXIycxL0qSI5Q9bVGyTaM41j_kKX9jV7TeQOSAQGE,263
|
|
2
|
-
csvnorm/__main__.py,sha256=WDURvgm7E-yMN_ZvbBBmgzIJz5naonZxQ9RYcoD7ves,110
|
|
3
|
-
csvnorm/cli.py,sha256=MwIPahLktbulF6NYRWyBsE4s9Al9_aSdA1zvzuI0AiQ,3815
|
|
4
|
-
csvnorm/core.py,sha256=_kTaui_2IhqrN_UxJpcjwXYXEvqaRMhML49Xlx-e0p0,6633
|
|
5
|
-
csvnorm/encoding.py,sha256=kV7J8nVWDv_F6bKUCrMj2rReMB4Zecf4gwFeebdjm98,3334
|
|
6
|
-
csvnorm/utils.py,sha256=gvwDToOx3YoKCfVPyCmxcSa7teCWFB2SmAGr-jV5w_Y,1761
|
|
7
|
-
csvnorm/validation.py,sha256=iXdfalAGDNB9kPefyzHXGI9uc-HLAG5pQ_-T93ShppY,2815
|
|
8
|
-
csvnorm-0.3.3.dist-info/licenses/LICENSE,sha256=iPlktyjNwNMal_HUjGRTAavIF8mHV3CxGW4g5XmG3ZU,1075
|
|
9
|
-
csvnorm-0.3.3.dist-info/METADATA,sha256=xKJmLVX9RoB22KwAAlxAvWB_KA9h68m5V-UyFaS_DGo,7840
|
|
10
|
-
csvnorm-0.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
11
|
-
csvnorm-0.3.3.dist-info/entry_points.txt,sha256=6GmXNAdKzhHle3KV6Gbe95RS6edR0Fe6PVqdy1ED23k,45
|
|
12
|
-
csvnorm-0.3.3.dist-info/top_level.txt,sha256=jMenLN9avlYH8z8v23sReLFkUYx0awPGFm8cTnoAl-0,8
|
|
13
|
-
csvnorm-0.3.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|