csvnorm 0.3.3__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
csvnorm/__init__.py CHANGED
@@ -1,6 +1,6 @@
1
1
  """csvnorm - Validate and normalize CSV files."""
2
2
 
3
- __version__ = "0.3.3"
3
+ __version__ = "0.3.11"
4
4
  __all__ = ["normalize_csv", "detect_encoding", "process_csv"]
5
5
 
6
6
  from csvnorm.core import process_csv
csvnorm/cli.py CHANGED
@@ -3,6 +3,7 @@
3
3
  import argparse
4
4
  import sys
5
5
  from pathlib import Path
6
+ from typing import Optional
6
7
 
7
8
  from rich.console import Console
8
9
  from rich_argparse import RichHelpFormatter
@@ -15,10 +16,10 @@ console = Console()
15
16
 
16
17
 
17
18
  def show_banner() -> None:
18
- """Show ASCII art banner."""
19
- from pyfiglet import figlet_format
20
- banner = figlet_format("csvnorm", font="slant")
21
- console.print(banner, style="bold cyan")
19
+ """Show simple styled banner."""
20
+ console.print()
21
+ console.print(" csvnorm ", style="bold cyan on black", justify="center")
22
+ console.print()
22
23
 
23
24
 
24
25
  class VersionAction(argparse.Action):
@@ -28,10 +29,15 @@ class VersionAction(argparse.Action):
28
29
  show_banner()
29
30
  console.print(f"csvnorm {__version__}", style="bold")
30
31
  console.print()
31
- console.print("Validate and normalize CSV files for exploratory data analysis", style="dim")
32
+ console.print(
33
+ "Validate and normalize CSV files for exploratory data analysis",
34
+ style="dim",
35
+ )
32
36
  console.print()
33
37
  console.print("Author: aborruso", style="dim")
34
- console.print("Repository: https://github.com/aborruso/csvnorm", style="dim cyan")
38
+ console.print(
39
+ "Repository: https://github.com/aborruso/csvnorm", style="dim cyan"
40
+ )
35
41
  console.print("License: MIT", style="dim")
36
42
  parser.exit()
37
43
 
@@ -44,16 +50,17 @@ def create_parser() -> argparse.ArgumentParser:
44
50
  formatter_class=RichHelpFormatter,
45
51
  epilog="""\
46
52
  Examples:
47
- csvnorm data.csv -d ';' -o output_folder --force
53
+ csvnorm data.csv -d ';' -o output.csv --force
48
54
  csvnorm data.csv --keep-names --delimiter '\\t'
55
+ csvnorm https://example.com/data.csv -o processed/data.csv
49
56
  csvnorm data.csv -V
50
57
  """,
51
58
  )
52
59
 
53
60
  parser.add_argument(
54
61
  "input_file",
55
- type=Path,
56
- help="Input CSV file path",
62
+ type=str,
63
+ help="Input CSV file path or HTTP/HTTPS URL",
57
64
  )
58
65
 
59
66
  parser.add_argument(
@@ -83,10 +90,9 @@ Examples:
83
90
 
84
91
  parser.add_argument(
85
92
  "-o",
86
- "--output-dir",
93
+ "--output-file",
87
94
  type=Path,
88
- default=Path.cwd(),
89
- help="Set custom output directory (default: current working directory)",
95
+ help="Set output file path (absolute or relative)",
90
96
  )
91
97
 
92
98
  parser.add_argument(
@@ -107,7 +113,7 @@ Examples:
107
113
  return parser
108
114
 
109
115
 
110
- def main(argv: list[str] | None = None) -> int:
116
+ def main(argv: Optional[list[str]] = None) -> int:
111
117
  """Main entry point for the CLI.
112
118
 
113
119
  Args:
@@ -122,7 +128,7 @@ def main(argv: list[str] | None = None) -> int:
122
128
  if argv is None:
123
129
  argv = sys.argv[1:]
124
130
 
125
- if not argv or (len(argv) == 1 and argv[0] in ['-h', '--help']):
131
+ if not argv or (len(argv) == 1 and argv[0] in ["-h", "--help"]):
126
132
  parser.print_help()
127
133
  return 0 if argv else 2
128
134
 
@@ -135,10 +141,17 @@ def main(argv: list[str] | None = None) -> int:
135
141
  # Setup logging
136
142
  setup_logger(args.verbose)
137
143
 
144
+ # Determine output file (default: input filename in current directory)
145
+ if args.output_file is None:
146
+ input_name = Path(args.input_file).name
147
+ output_file = Path.cwd() / input_name
148
+ else:
149
+ output_file = args.output_file
150
+
138
151
  # Run processing
139
152
  return process_csv(
140
153
  input_file=args.input_file,
141
- output_dir=args.output_dir,
154
+ output_file=output_file,
142
155
  force=args.force,
143
156
  keep_names=args.keep_names,
144
157
  delimiter=args.delimiter,
csvnorm/core.py CHANGED
@@ -1,15 +1,29 @@
1
1
  """Core processing logic for csvnorm."""
2
2
 
3
3
  import logging
4
+ import tempfile
4
5
  from pathlib import Path
6
+ from typing import Union
5
7
 
6
8
  from rich.console import Console
7
- from rich.panel import Panel
8
9
  from rich.progress import Progress, SpinnerColumn, TextColumn
9
- from rich.table import Table
10
10
 
11
11
  from csvnorm.encoding import convert_to_utf8, detect_encoding, needs_conversion
12
- from csvnorm.utils import ensure_output_dir, to_snake_case, validate_delimiter
12
+ from csvnorm.ui import (
13
+ show_error_panel,
14
+ show_success_table,
15
+ show_validation_error_panel,
16
+ show_warning_panel,
17
+ )
18
+ from csvnorm.utils import (
19
+ extract_filename_from_url,
20
+ get_column_count,
21
+ get_row_count,
22
+ is_url,
23
+ to_snake_case,
24
+ validate_delimiter,
25
+ validate_url,
26
+ )
13
27
  from csvnorm.validation import normalize_csv, validate_csv
14
28
 
15
29
  logger = logging.getLogger("csvnorm")
@@ -17,8 +31,8 @@ console = Console()
17
31
 
18
32
 
19
33
  def process_csv(
20
- input_file: Path,
21
- output_dir: Path,
34
+ input_file: str,
35
+ output_file: Path,
22
36
  force: bool = False,
23
37
  keep_names: bool = False,
24
38
  delimiter: str = ",",
@@ -27,8 +41,8 @@ def process_csv(
27
41
  """Main CSV processing pipeline.
28
42
 
29
43
  Args:
30
- input_file: Path to input CSV file.
31
- output_dir: Directory for output files.
44
+ input_file: Path to input CSV file or HTTP/HTTPS URL.
45
+ output_file: Full path for output file.
32
46
  force: If True, overwrite existing output files.
33
47
  keep_names: If True, keep original column names.
34
48
  delimiter: Output field delimiter.
@@ -37,111 +51,169 @@ def process_csv(
37
51
  Returns:
38
52
  Exit code: 0 for success, 1 for error.
39
53
  """
40
- # Validate inputs
41
- if not input_file.exists():
42
- console.print(Panel(
43
- f"[bold red]Error:[/bold red] Input file not found\n{input_file}",
44
- border_style="red"
45
- ))
46
- return 1
54
+ # Detect if input is URL or file
55
+ is_remote = is_url(input_file)
47
56
 
48
- if not input_file.is_file():
49
- console.print(Panel(
50
- f"[bold red]Error:[/bold red] Not a file\n{input_file}",
51
- border_style="red"
52
- ))
53
- return 1
57
+ input_path: Union[str, Path]
58
+ if is_remote:
59
+ # Validate URL
60
+ try:
61
+ validate_url(input_file)
62
+ except ValueError as e:
63
+ show_error_panel(str(e))
64
+ return 1
65
+ base_name = extract_filename_from_url(input_file)
66
+ input_path = input_file # Keep as string for DuckDB
67
+ else:
68
+ # Validate local file
69
+ file_path = Path(input_file)
70
+ if not file_path.exists():
71
+ show_error_panel(f"Input file not found\n{file_path}")
72
+ return 1
73
+
74
+ if not file_path.is_file():
75
+ show_error_panel(f"Not a file\n{file_path}")
76
+ return 1
77
+
78
+ base_name = to_snake_case(file_path.name)
79
+ input_path = file_path
54
80
 
55
81
  try:
56
82
  validate_delimiter(delimiter)
57
83
  except ValueError as e:
58
- console.print(Panel(
59
- f"[bold red]Error:[/bold red] {e}",
60
- border_style="red"
61
- ))
84
+ show_error_panel(str(e))
62
85
  return 1
63
86
 
64
87
  # Setup paths
65
- base_name = to_snake_case(input_file.name)
66
- ensure_output_dir(output_dir)
67
-
68
- output_file = output_dir / f"{base_name}.csv"
69
- reject_file = output_dir / f"{base_name}_reject_errors.csv"
70
- temp_utf8_file = output_dir / f"{base_name}_utf8.csv"
88
+ output_dir = output_file.parent
89
+ temp_dir = Path(tempfile.mkdtemp(prefix="csvnorm_"))
90
+ reject_file = output_dir / f"{output_file.stem}_reject_errors.csv"
91
+ temp_utf8_file = temp_dir / f"{output_file.stem}_utf8.csv"
71
92
 
72
93
  # Check if output exists
73
94
  if output_file.exists() and not force:
74
- console.print(Panel(
75
- f"[bold yellow]Warning:[/bold yellow] Output file already exists\n\n"
95
+ show_warning_panel(
96
+ f"Output file already exists\n\n"
76
97
  f"{output_file}\n\n"
77
- f"Use [bold]--force[/bold] to overwrite.",
78
- border_style="yellow"
79
- ))
98
+ f"Use [bold]--force[/bold] to overwrite."
99
+ )
80
100
  return 1
81
101
 
82
- # Clean up previous reject file
102
+ # Clean up previous reject file (always overwrite)
83
103
  if reject_file.exists():
84
104
  reject_file.unlink()
85
105
 
86
106
  # Track files to clean up
87
- temp_files: list[Path] = []
107
+ temp_files: list[Path] = [temp_dir]
88
108
 
89
109
  try:
90
110
  with Progress(
91
111
  SpinnerColumn(),
92
112
  TextColumn("[progress.description]{task.description}"),
93
113
  console=console,
94
- transient=True
114
+ transient=True,
95
115
  ) as progress:
96
- # Step 1: Detect encoding
97
- task = progress.add_task("[cyan]Detecting encoding...", total=None)
98
- try:
99
- encoding = detect_encoding(input_file)
100
- except ValueError as e:
101
- progress.stop()
102
- console.print(Panel(
103
- f"[bold red]Error:[/bold red] {e}",
104
- border_style="red"
105
- ))
106
- return 1
116
+ task = progress.add_task("[cyan]Processing...", total=None)
107
117
 
108
- logger.debug(f"Detected encoding: {encoding}")
109
- progress.update(task, description=f"[green]✓[/green] Detected encoding: {encoding}")
118
+ # For remote URLs, skip encoding detection/conversion
119
+ if is_remote:
120
+ progress.update(
121
+ task,
122
+ description="[green]✓[/green] Remote URL (encoding handled by DuckDB)",
123
+ )
124
+ working_file = input_path # Keep URL as string
125
+ encoding = "remote"
126
+ else:
127
+ # Step 1: Detect encoding (local files only)
128
+ # input_path is Path here (set in else block above)
129
+ file_input_path = input_path # Type narrowing for mypy
130
+ assert isinstance(file_input_path, Path)
110
131
 
111
- # Step 2: Convert to UTF-8 if needed
112
- working_file = input_file
113
- if needs_conversion(encoding):
114
- progress.update(task, description=f"[cyan]Converting from {encoding} to UTF-8...")
132
+ progress.update(task, description="[cyan]Detecting encoding...")
115
133
  try:
116
- convert_to_utf8(input_file, temp_utf8_file, encoding)
117
- working_file = temp_utf8_file
118
- temp_files.append(temp_utf8_file)
119
- progress.update(task, description=f"[green]✓[/green] Converted to UTF-8")
120
- except (UnicodeDecodeError, LookupError) as e:
134
+ encoding = detect_encoding(file_input_path)
135
+ except ValueError as e:
121
136
  progress.stop()
122
- console.print(Panel(
123
- f"[bold red]Error:[/bold red] Encoding conversion failed\n{e}",
124
- border_style="red"
125
- ))
137
+ show_error_panel(str(e))
126
138
  return 1
127
- else:
128
- progress.update(task, description=f"[green]✓[/green] Encoding: {encoding} (no conversion needed)")
139
+
140
+ logger.debug(f"Detected encoding: {encoding}")
141
+ progress.update(
142
+ task, description=f"[green]✓[/green] Detected encoding: {encoding}"
143
+ )
144
+
145
+ # Step 2: Convert to UTF-8 if needed
146
+ working_file = file_input_path
147
+ if needs_conversion(encoding):
148
+ progress.update(
149
+ task,
150
+ description=f"[cyan]Converting from {encoding} to UTF-8...",
151
+ )
152
+ try:
153
+ convert_to_utf8(file_input_path, temp_utf8_file, encoding)
154
+ working_file = temp_utf8_file
155
+ temp_files.append(temp_utf8_file)
156
+ progress.update(
157
+ task, description="[green]✓[/green] Converted to UTF-8"
158
+ )
159
+ except (UnicodeDecodeError, LookupError) as e:
160
+ progress.stop()
161
+ show_error_panel(f"Encoding conversion failed\n{e}")
162
+ return 1
163
+ else:
164
+ progress.update(
165
+ task,
166
+ description=f"[green]✓[/green] Encoding: {encoding} (no conversion needed)",
167
+ )
129
168
 
130
169
  # Step 3: Validate CSV
131
170
  progress.update(task, description="[cyan]Validating CSV...")
132
171
  logger.debug("Validating CSV with DuckDB...")
133
- is_valid = validate_csv(working_file, reject_file)
134
172
 
135
- if not is_valid:
173
+ try:
174
+ reject_count, error_types = validate_csv(
175
+ working_file, reject_file, is_remote=is_remote
176
+ )
177
+ except Exception as e:
136
178
  progress.stop()
137
- console.print(Panel(
138
- "[bold red]Error:[/bold red] DuckDB encountered invalid rows\n\n"
139
- f"Details: [cyan]{reject_file}[/cyan]\n\n"
140
- "Please fix the issues and try again.",
141
- border_style="red"
142
- ))
179
+ error_msg = str(e)
180
+
181
+ # Check for common HTTP errors
182
+ if "HTTP Error" in error_msg or "HTTPException" in error_msg:
183
+ if "404" in error_msg:
184
+ show_error_panel(
185
+ f"Remote CSV file not found (HTTP 404)\n\n"
186
+ f"URL: [cyan]{input_file}[/cyan]\n\n"
187
+ "Please check the URL is correct."
188
+ )
189
+ elif "401" in error_msg or "403" in error_msg:
190
+ show_error_panel(
191
+ f"Authentication required (HTTP 401/403)\n\n"
192
+ f"URL: [cyan]{input_file}[/cyan]\n\n"
193
+ "This tool only supports public URLs without authentication.\n"
194
+ "Please download the file manually first."
195
+ )
196
+ elif (
197
+ "timeout" in error_msg.lower()
198
+ or "timed out" in error_msg.lower()
199
+ ):
200
+ show_error_panel(
201
+ f"HTTP request timeout (30 seconds)\n\n"
202
+ f"URL: [cyan]{input_file}[/cyan]\n\n"
203
+ "The remote server took too long to respond.\n"
204
+ "Try again later or download the file manually."
205
+ )
206
+ else:
207
+ show_error_panel(f"HTTP request failed\n\n{error_msg}")
208
+ else:
209
+ # Re-raise non-HTTP errors
210
+ raise
143
211
  return 1
144
212
 
213
+ has_validation_errors = reject_count > 1
214
+ if has_validation_errors:
215
+ progress.stop()
216
+
145
217
  progress.update(task, description="[green]✓[/green] CSV validated")
146
218
 
147
219
  # Step 4: Normalize and write output
@@ -152,31 +224,50 @@ def process_csv(
152
224
  output_path=output_file,
153
225
  delimiter=delimiter,
154
226
  normalize_names=not keep_names,
227
+ is_remote=is_remote,
155
228
  )
156
229
 
157
230
  logger.debug(f"Output written to: {output_file}")
158
231
  progress.update(task, description="[green]✓[/green] Complete")
159
232
 
160
- # Success summary table
161
- table = Table(show_header=False, box=None, padding=(0, 1))
162
- table.add_row("[green]✓[/green] Success", "")
163
- table.add_row("Input:", f"[cyan]{input_file}[/cyan]")
164
- table.add_row("Output:", f"[cyan]{output_file}[/cyan]")
165
- table.add_row("Encoding:", encoding)
166
- if delimiter != ",":
167
- table.add_row("Delimiter:", repr(delimiter))
168
- if not keep_names:
169
- table.add_row("Headers:", "normalized to snake_case")
233
+ # Collect statistics
234
+ input_size = (
235
+ working_file.stat().st_size if isinstance(working_file, Path) else 0
236
+ )
237
+ output_size = output_file.stat().st_size
238
+ row_count = get_row_count(output_file)
239
+ column_count = get_column_count(output_file, delimiter)
170
240
 
171
- console.print()
172
- console.print(table)
241
+ # Show success summary
242
+ show_success_table(
243
+ input_file=input_file,
244
+ output_file=output_file,
245
+ encoding=encoding,
246
+ is_remote=is_remote,
247
+ row_count=row_count,
248
+ column_count=column_count,
249
+ input_size=input_size,
250
+ output_size=output_size,
251
+ delimiter=delimiter,
252
+ keep_names=keep_names,
253
+ )
254
+
255
+ # Show validation errors if any
256
+ if has_validation_errors:
257
+ show_validation_error_panel(reject_count, error_types, reject_file)
258
+ return 1
173
259
 
174
260
  finally:
175
- # Cleanup temp files
176
- for temp_file in temp_files:
177
- if temp_file.exists():
178
- logger.debug(f"Removing temp file: {temp_file}")
179
- temp_file.unlink()
261
+ # Cleanup temp directory
262
+ import shutil
263
+
264
+ for temp_path in temp_files:
265
+ if temp_path.exists():
266
+ logger.debug(f"Removing temp path: {temp_path}")
267
+ if temp_path.is_dir():
268
+ shutil.rmtree(temp_path)
269
+ else:
270
+ temp_path.unlink()
180
271
 
181
272
  # Remove reject file if empty (only header)
182
273
  if reject_file.exists():
csvnorm/ui.py ADDED
@@ -0,0 +1,124 @@
1
+ """UI formatting functions for csvnorm terminal output."""
2
+
3
+ from pathlib import Path
4
+
5
+ from rich.console import Console
6
+ from rich.panel import Panel
7
+ from rich.table import Table
8
+
9
+ from csvnorm.encoding import needs_conversion
10
+ from csvnorm.utils import format_file_size
11
+
12
+ console = Console()
13
+
14
+
15
+ def show_error_panel(message: str, title: str = "Error") -> None:
16
+ """Display an error panel with red border.
17
+
18
+ Args:
19
+ message: Error message to display.
20
+ title: Panel title (default: "Error").
21
+ """
22
+ console.print(Panel(f"[bold red]{title}:[/bold red] {message}", border_style="red"))
23
+
24
+
25
+ def show_warning_panel(message: str, title: str = "Warning") -> None:
26
+ """Display a warning panel with yellow border.
27
+
28
+ Args:
29
+ message: Warning message to display.
30
+ title: Panel title (default: "Warning").
31
+ """
32
+ console.print(
33
+ Panel(f"[bold yellow]{title}:[/bold yellow] {message}", border_style="yellow")
34
+ )
35
+
36
+
37
+ def show_success_table(
38
+ input_file: str,
39
+ output_file: Path,
40
+ encoding: str,
41
+ is_remote: bool,
42
+ row_count: int,
43
+ column_count: int,
44
+ input_size: int,
45
+ output_size: int,
46
+ delimiter: str,
47
+ keep_names: bool,
48
+ ) -> None:
49
+ """Display success summary table with processing results.
50
+
51
+ Args:
52
+ input_file: Input CSV file path or URL.
53
+ output_file: Output CSV file path.
54
+ encoding: Detected encoding (or "remote" for URLs).
55
+ is_remote: Whether input was a remote URL.
56
+ row_count: Number of data rows in output.
57
+ column_count: Number of columns in output.
58
+ input_size: Input file size in bytes (0 for remote).
59
+ output_size: Output file size in bytes.
60
+ delimiter: Output delimiter character.
61
+ keep_names: Whether original column names were kept.
62
+ """
63
+ table = Table(show_header=False, box=None, padding=(0, 1))
64
+ table.add_row("[green]✓[/green] Success", "")
65
+ table.add_row("Input:", f"[cyan]{input_file}[/cyan]")
66
+ table.add_row("Output:", f"[cyan]{output_file}[/cyan]")
67
+
68
+ # Encoding info
69
+ if not is_remote:
70
+ if needs_conversion(encoding):
71
+ table.add_row("Encoding:", f"{encoding} → UTF-8 [dim](converted)[/dim]")
72
+ else:
73
+ table.add_row("Encoding:", f"{encoding} [dim](no conversion needed)[/dim]")
74
+ else:
75
+ table.add_row("Encoding:", "remote [dim](handled by DuckDB)[/dim]")
76
+
77
+ # Statistics
78
+ table.add_row("Rows:", f"{row_count:,}")
79
+ table.add_row("Columns:", f"{column_count}")
80
+ table.add_row("Input size:", format_file_size(input_size))
81
+ table.add_row("Output size:", format_file_size(output_size))
82
+
83
+ # Optional fields
84
+ if delimiter != ",":
85
+ table.add_row("Delimiter:", repr(delimiter))
86
+ if not keep_names:
87
+ table.add_row("Headers:", "normalized to snake_case")
88
+
89
+ console.print()
90
+ console.print(table)
91
+
92
+
93
+ def show_validation_error_panel(
94
+ reject_count: int, error_types: list[str], reject_file: Path
95
+ ) -> None:
96
+ """Display validation error summary panel.
97
+
98
+ Args:
99
+ reject_count: Number of rejected rows (including header).
100
+ error_types: List of error type descriptions.
101
+ reject_file: Path to reject errors CSV file.
102
+ """
103
+ console.print()
104
+ error_lines = []
105
+ error_lines.append("[bold red]Validation Errors:[/bold red]")
106
+ error_lines.append("")
107
+ error_lines.append(f"Rejected rows: [yellow]{reject_count - 1}[/yellow]")
108
+
109
+ if error_types:
110
+ error_lines.append("")
111
+ error_lines.append("[dim]Error types:[/dim]")
112
+ for error_type in error_types:
113
+ error_lines.append(f" • {error_type}")
114
+
115
+ error_lines.append("")
116
+ error_lines.append(f"Details: [cyan]{reject_file}[/cyan]")
117
+
118
+ console.print(
119
+ Panel(
120
+ "\n".join(error_lines),
121
+ border_style="yellow",
122
+ title="[yellow]![/yellow] Validation Failed",
123
+ )
124
+ )
csvnorm/utils.py CHANGED
@@ -3,6 +3,8 @@
3
3
  import logging
4
4
  import re
5
5
  from pathlib import Path
6
+ from typing import Union
7
+ from urllib.parse import urlparse
6
8
 
7
9
  from rich.logging import RichHandler
8
10
 
@@ -45,10 +47,7 @@ def setup_logger(verbose: bool = False) -> logging.Logger:
45
47
 
46
48
  if not logger.handlers:
47
49
  handler = RichHandler(
48
- show_time=False,
49
- show_path=verbose,
50
- markup=True,
51
- rich_tracebacks=True
50
+ show_time=False, show_path=verbose, markup=True, rich_tracebacks=True
52
51
  )
53
52
  logger.addHandler(handler)
54
53
 
@@ -69,3 +68,125 @@ def validate_delimiter(delimiter: str) -> None:
69
68
  def ensure_output_dir(output_dir: Path) -> None:
70
69
  """Create output directory if it doesn't exist."""
71
70
  output_dir.mkdir(parents=True, exist_ok=True)
71
+
72
+
73
+ def is_url(input_str: str) -> bool:
74
+ """Check if input string is an HTTP/HTTPS URL.
75
+
76
+ Args:
77
+ input_str: String to check.
78
+
79
+ Returns:
80
+ True if input is HTTP/HTTPS URL, False otherwise.
81
+ """
82
+ try:
83
+ result = urlparse(input_str)
84
+ return result.scheme in ("http", "https") and bool(result.netloc)
85
+ except Exception:
86
+ return False
87
+
88
+
89
+ def validate_url(url: str) -> None:
90
+ """Validate URL has HTTP/HTTPS protocol.
91
+
92
+ Args:
93
+ url: URL to validate.
94
+
95
+ Raises:
96
+ ValueError: If URL protocol is not HTTP/HTTPS.
97
+ """
98
+ parsed = urlparse(url)
99
+ if parsed.scheme not in ("http", "https"):
100
+ raise ValueError(f"Only HTTP/HTTPS URLs are supported. Got: {parsed.scheme}://")
101
+
102
+
103
+ def extract_filename_from_url(url: str) -> str:
104
+ """Extract and normalize filename from URL.
105
+
106
+ Args:
107
+ url: URL to extract filename from.
108
+
109
+ Returns:
110
+ Normalized snake_case filename without extension.
111
+ """
112
+ from urllib.parse import unquote
113
+
114
+ parsed = urlparse(url)
115
+ # Get last path segment, ignore query/fragment
116
+ path = parsed.path.rstrip("/")
117
+ filename = path.split("/")[-1] if path else "data"
118
+
119
+ # Decode URL encoding (%20 -> space, etc.)
120
+ filename = unquote(filename)
121
+
122
+ # Remove extension if present
123
+ if filename.lower().endswith(".csv"):
124
+ filename = filename[:-4]
125
+
126
+ # Apply snake_case normalization
127
+ return to_snake_case(filename) if filename else "data"
128
+
129
+
130
+ def format_file_size(size_bytes: int) -> str:
131
+ """Format file size in human-readable format.
132
+
133
+ Args:
134
+ size_bytes: File size in bytes.
135
+
136
+ Returns:
137
+ Formatted size string (e.g., "1.5 MB", "256 KB").
138
+ """
139
+ for unit in ["B", "KB", "MB", "GB"]:
140
+ if size_bytes < 1024.0:
141
+ return f"{size_bytes:.1f} {unit}" if unit != "B" else f"{size_bytes} B"
142
+ size_bytes /= 1024.0
143
+ return f"{size_bytes:.1f} TB"
144
+
145
+
146
+ def get_row_count(file_path: Union[Path, str]) -> int:
147
+ """Count number of rows in a CSV file.
148
+
149
+ Args:
150
+ file_path: Path to CSV file.
151
+
152
+ Returns:
153
+ Number of data rows (excluding header), or 0 if file doesn't exist.
154
+ """
155
+ if not isinstance(file_path, Path) or not file_path.exists():
156
+ return 0
157
+
158
+ try:
159
+ with open(file_path, "r") as f:
160
+ # Skip header
161
+ next(f, None)
162
+ return sum(1 for _ in f)
163
+ except Exception:
164
+ return 0
165
+
166
+
167
+ def get_column_count(file_path: Union[Path, str], delimiter: str = ",") -> int:
168
+ """Count number of columns in a CSV file using DuckDB.
169
+
170
+ Args:
171
+ file_path: Path to CSV file.
172
+ delimiter: Field delimiter used in the CSV file.
173
+
174
+ Returns:
175
+ Number of columns in the CSV, or 0 if file doesn't exist or error.
176
+ """
177
+ if not isinstance(file_path, Path) or not file_path.exists():
178
+ return 0
179
+
180
+ try:
181
+ import duckdb
182
+
183
+ conn = duckdb.connect(":memory:")
184
+ # Get column names from CSV using DuckDB DESCRIBE
185
+ columns = conn.execute(
186
+ f"DESCRIBE SELECT * FROM read_csv('{file_path}', delim='{delimiter}', header=true, sample_size=1)"
187
+ ).fetchall()
188
+ conn.close()
189
+
190
+ return len(columns)
191
+ except Exception:
192
+ return 0
csvnorm/validation.py CHANGED
@@ -2,27 +2,36 @@
2
2
 
3
3
  import logging
4
4
  from pathlib import Path
5
+ from typing import Union
5
6
 
6
7
  import duckdb
7
8
 
8
9
  logger = logging.getLogger("csvnorm")
9
10
 
10
11
 
11
- def validate_csv(file_path: Path, reject_file: Path) -> bool:
12
+ def validate_csv(
13
+ file_path: Union[Path, str], reject_file: Path, is_remote: bool = False
14
+ ) -> tuple[int, list[str]]:
12
15
  """Validate CSV file using DuckDB and export rejected rows.
13
16
 
14
17
  Args:
15
- file_path: Path to CSV file to validate.
18
+ file_path: Path to CSV file to validate or URL string.
16
19
  reject_file: Path to write rejected rows.
20
+ is_remote: True if file_path is a remote URL.
17
21
 
18
22
  Returns:
19
- True if validation passes (no rejected rows), False otherwise.
23
+ Tuple of (reject_count, error_types) where error_types is list of
24
+ up to 3 unique error reasons from reject file.
20
25
  """
21
26
  logger.debug(f"Validating CSV: {file_path}")
22
27
 
23
28
  conn = duckdb.connect()
24
29
 
25
30
  try:
31
+ # Set HTTP timeout for remote URLs (30 seconds)
32
+ if is_remote:
33
+ conn.execute("SET http_timeout=30000")
34
+
26
35
  # Read CSV with store_rejects to capture malformed rows
27
36
  # Use all_varchar=true to avoid type inference failures
28
37
  conn.execute(f"""
@@ -46,28 +55,39 @@ def validate_csv(file_path: Path, reject_file: Path) -> bool:
46
55
  reject_count = _count_lines(reject_file)
47
56
  logger.debug(f"Reject file lines: {reject_count}")
48
57
 
49
- return reject_count <= 1
58
+ # Collect sample error types from reject file
59
+ error_types = []
60
+ if reject_count > 1:
61
+ error_types = _get_error_types(reject_file)
62
+
63
+ return reject_count, error_types
50
64
 
51
65
 
52
66
  def normalize_csv(
53
- input_path: Path,
67
+ input_path: Union[Path, str],
54
68
  output_path: Path,
55
69
  delimiter: str = ",",
56
70
  normalize_names: bool = True,
71
+ is_remote: bool = False,
57
72
  ) -> None:
58
73
  """Normalize CSV file using DuckDB.
59
74
 
60
75
  Args:
61
- input_path: Path to input CSV file.
76
+ input_path: Path to input CSV file or URL string.
62
77
  output_path: Path for normalized output file.
63
78
  delimiter: Output field delimiter.
64
79
  normalize_names: If True, convert column names to snake_case.
80
+ is_remote: True if input_path is a remote URL.
65
81
  """
66
82
  logger.debug(f"Normalizing CSV: {input_path} -> {output_path}")
67
83
 
68
84
  conn = duckdb.connect()
69
85
 
70
86
  try:
87
+ # Set HTTP timeout for remote URLs (30 seconds)
88
+ if is_remote:
89
+ conn.execute("SET http_timeout=30000")
90
+
71
91
  # Build read options
72
92
  read_opts = "sample_size=-1, all_varchar=true"
73
93
  if normalize_names:
@@ -107,3 +127,36 @@ def _count_lines(file_path: Path) -> int:
107
127
 
108
128
  with open(file_path, "r") as f:
109
129
  return sum(1 for _ in f)
130
+
131
+
132
+ def _get_error_types(reject_file: Path) -> list[str]:
133
+ """Extract sample error types from reject file.
134
+
135
+ Args:
136
+ reject_file: Path to reject_errors.csv file.
137
+
138
+ Returns:
139
+ List of up to 3 unique error reasons.
140
+ """
141
+ if not reject_file.exists():
142
+ return []
143
+
144
+ error_types: set[str] = set()
145
+ try:
146
+ with open(reject_file, "r") as f:
147
+ # Skip header
148
+ next(f, None)
149
+ for line in f:
150
+ # Error message is in the last column
151
+ parts = line.rstrip("\n").split(",")
152
+ if parts:
153
+ error_reason = parts[-1].strip()
154
+ if error_reason and error_reason != "error":
155
+ error_types.add(error_reason)
156
+ if len(error_types) >= 3:
157
+ break
158
+ except Exception as e:
159
+ logger.warning(f"Failed to extract error types: {e}")
160
+ return []
161
+
162
+ return list(error_types)[:3]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: csvnorm
3
- Version: 0.3.3
3
+ Version: 0.3.11
4
4
  Summary: A command-line utility to validate and normalize CSV files
5
5
  Author-email: aborruso <aborruso@gmail.com>
6
6
  License: MIT License
@@ -34,7 +34,6 @@ Classifier: Intended Audience :: Science/Research
34
34
  Classifier: License :: OSI Approved :: MIT License
35
35
  Classifier: Operating System :: OS Independent
36
36
  Classifier: Programming Language :: Python :: 3
37
- Classifier: Programming Language :: Python :: 3.8
38
37
  Classifier: Programming Language :: Python :: 3.9
39
38
  Classifier: Programming Language :: Python :: 3.10
40
39
  Classifier: Programming Language :: Python :: 3.11
@@ -49,18 +48,15 @@ Requires-Dist: charset-normalizer>=3.0.0
49
48
  Requires-Dist: duckdb>=0.9.0
50
49
  Requires-Dist: rich>=13.0.0
51
50
  Requires-Dist: rich-argparse>=1.0.0
52
- Requires-Dist: pyfiglet>=1.0.0
53
51
  Provides-Extra: dev
54
52
  Requires-Dist: pytest>=7.0.0; extra == "dev"
55
53
  Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
56
54
  Requires-Dist: ruff>=0.1.0; extra == "dev"
57
- Provides-Extra: banner
58
- Requires-Dist: pyfiglet>=1.0.0; extra == "banner"
59
55
  Dynamic: license-file
60
56
 
61
57
  [![PyPI version](https://badge.fury.io/py/csvnorm.svg)](https://pypi.org/project/csvnorm/)
62
58
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
63
- [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
59
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
64
60
  [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/aborruso/csvnorm)
65
61
 
66
62
  # csvnorm
@@ -81,26 +77,6 @@ Or with pip:
81
77
  pip install csvnorm
82
78
  ```
83
79
 
84
- For ASCII art banner (shown with `--version` and `-V`):
85
-
86
- ```bash
87
- uv tool install 'csvnorm[banner]'
88
- # or
89
- pip install 'csvnorm[banner]'
90
- ```
91
-
92
- Example with banner:
93
- ```bash
94
- csvnorm --version
95
- # Output:
96
- # ___________ ______ ____ _________ ___
97
- # / ___/ ___/ | / / __ \/ __ \/ ___/ __ `__ \
98
- # / /__(__ )| |/ / / / / /_/ / / / / / / / /
99
- # \___/____/ |___/_/ /_/\____/_/ /_/ /_/ /_/
100
- #
101
- # csvnorm 0.3.1
102
- ```
103
-
104
80
  ## Purpose
105
81
 
106
82
  This tool prepares CSV files for **basic exploratory data analysis (EDA)**, not for complex transformations. It focuses on achieving a clean, standardized baseline format that allows you to quickly assess data quality and structure before designing more sophisticated ETL pipelines.
@@ -122,7 +98,9 @@ This tool prepares CSV files for **basic exploratory data analysis (EDA)**, not
122
98
  - **Delimiter Normalization**: Converts all field separators to standard commas (`,`)
123
99
  - **Field Name Normalization**: Converts column headers to snake_case format
124
100
  - **Encoding Normalization**: Auto-detects encoding and converts to UTF-8
125
- - **Error Reporting**: Exports detailed error file for invalid rows
101
+ - **Processing Summary**: Displays comprehensive statistics (rows, columns, file sizes) and error details
102
+ - **Error Reporting**: Exports detailed error file for invalid rows with summary panel
103
+ - **Remote URL Support**: Process CSV files directly from HTTP/HTTPS URLs without downloading
126
104
 
127
105
  ## Usage
128
106
 
@@ -148,6 +126,9 @@ csvnorm input.csv [options]
148
126
  # Basic usage
149
127
  csvnorm data.csv
150
128
 
129
+ # Process remote CSV from URL
130
+ csvnorm "https://raw.githubusercontent.com/aborruso/csvnorm/refs/heads/main/test/Trasporto%20Pubblico%20Locale%20Settore%20Pubblico%20Allargato%20-%20Indicatore%202000-2020%20Trasferimenti%20Correnti%20su%20Entrate%20Correnti.csv"
131
+
151
132
  # With semicolon delimiter
152
133
  csvnorm data.csv -d ';'
153
134
 
@@ -163,17 +144,63 @@ csvnorm data.csv -f -V
163
144
 
164
145
  ### Output
165
146
 
166
- Creates a normalized CSV file in the specified output directory with:
147
 + Creates a normalized CSV file in the specified output directory with:
167
148
  - UTF-8 encoding
168
149
  - Consistent field delimiters
169
150
  - Normalized column names (unless `--keep-names` is specified)
170
151
  - Error report if any invalid rows are found (saved as `{input_name}_reject_errors.csv`)
171
152
 
153
+ For remote URLs:
154
 + - The output filename is derived from the URL's last path segment
155
+ - Encoding is handled automatically by DuckDB
156
+ - HTTP timeout is set to 30 seconds
157
+ - Only public URLs are supported (no authentication)
158
+
172
159
  The tool provides modern terminal output with:
173
160
  - Progress indicators for multi-step processing
174
161
  - Color-coded error messages with panels
175
- - Success summary table showing encoding, paths, and settings
176
- - Optional ASCII art banner with `--version` and `-V` verbose mode (requires `pyfiglet`)
162
+ - Success summary table with statistics (rows, columns, file sizes)
163
+ - Encoding conversion status (converted/no conversion/remote)
164
+ - Error summary panel with reject count and error types when validation fails
165
+ - ASCII art banner with `--version` and `-V` verbose mode
166
+
167
+ **Success Example:**
168
+ ```
169
+ ✓ Success
170
+ Input: test/utf8_basic.csv
171
+ Output: output/utf8_basic.csv
172
+ Encoding: ascii (no conversion needed)
173
+ Rows: 2
174
+ Columns: 3
175
+ Input size: 42 B
176
+ Output size: 43 B
177
+ Headers: normalized to snake_case
178
+ ```
179
+
180
+ **Error Example:**
181
+ ```
182
+ ✓ Success
183
+ Input: test/malformed_rows.csv
184
+ Output: output/malformed_rows.csv
185
+ Encoding: ascii (no conversion needed)
186
+ Rows: 1
187
+ Columns: 4
188
+ Input size: 24 B
189
+ Output size: 40 B
190
+ Headers: normalized to snake_case
191
+
192
+ ╭──────────────────────────── ! Validation Failed ─────────────────────────────╮
193
+ │ Validation Errors: │
194
+ │ │
195
+ │ Rejected rows: 2 │
196
+ │ │
197
+ │ Error types: │
198
+ │ • Expected Number of Columns: 3 Found: 2 │
199
+ │ • Expected Number of Columns: 3 Found: 4 │
200
+ │ │
201
+ │ Details: output/malformed_rows_reject_errors.csv │
202
+ ╰──────────────────────────────────────────────────────────────────────────────╯
203
+ ```
177
204
 
178
205
  ### Exit Codes
179
206
 
@@ -184,15 +211,15 @@ The tool provides modern terminal output with:
184
211
 
185
212
  ## Requirements
186
213
 
187
- - Python 3.8+
214
+ - Python 3.9+
188
215
  - Dependencies (automatically installed):
189
216
  - `charset-normalizer>=3.0.0` - Encoding detection
190
217
  - `duckdb>=0.9.0` - CSV validation and normalization
191
218
  - `rich>=13.0.0` - Modern terminal output formatting
192
219
  - `rich-argparse>=1.0.0` - Enhanced CLI help formatting
220
+ - `pyfiglet>=0.8.post1,<1.0.0` - ASCII art banner
193
221
 
194
222
  Optional extras:
195
- - `[banner]` - ASCII art banner for `--version` and `-V` verbose mode (`pyfiglet>=1.0.0`)
196
223
  - `[dev]` - Development dependencies (`pytest>=7.0.0`, `pytest-cov>=4.0.0`, `ruff>=0.1.0`)
197
224
 
198
225
  ## Development
@@ -221,7 +248,7 @@ pytest tests/ -v
221
248
  ### Project Structure
222
249
 
223
250
  ```
224
- prepare_data/
251
+ csvnorm/
225
252
  ├── src/csvnorm/
226
253
  │ ├── __init__.py # Package version
227
254
  │ ├── __main__.py # python -m support
@@ -0,0 +1,14 @@
1
+ csvnorm/__init__.py,sha256=frEketezK5MWX8eiy1mFgw_3QeMcH4cVgVsNXtD1Jgg,264
2
+ csvnorm/__main__.py,sha256=WDURvgm7E-yMN_ZvbBBmgzIJz5naonZxQ9RYcoD7ves,110
3
+ csvnorm/cli.py,sha256=UEe0hRGWx9m6ZLGLd9TIaJ_uayclNTh_i0fO_JEgTXY,4166
4
+ csvnorm/core.py,sha256=0tgOmPr4JSMSzgSxT8ffCk_IrOWGLI2hTzhV9_xNQQ8,9945
5
+ csvnorm/encoding.py,sha256=kV7J8nVWDv_F6bKUCrMj2rReMB4Zecf4gwFeebdjm98,3334
6
+ csvnorm/ui.py,sha256=rOfVYjnTImplMMc-QGmcYUXzzZ513Y1bCjlO2jPxG2A,3893
7
+ csvnorm/utils.py,sha256=slV2aADBDfg9RHZJE-jmRuzPfY1RX0Wq-D1A4oBN7Yo,5020
8
+ csvnorm/validation.py,sha256=I7m_nxsGDROy5pBkNU-H7qEVYEAT19vw5alkrvZqGh4,4539
9
+ csvnorm-0.3.11.dist-info/licenses/LICENSE,sha256=iPlktyjNwNMal_HUjGRTAavIF8mHV3CxGW4g5XmG3ZU,1075
10
+ csvnorm-0.3.11.dist-info/METADATA,sha256=7c2Bu-M-4UiOqqVOC5Nm-I88ZhmC2BquMSiGRjD9VBo,9808
11
+ csvnorm-0.3.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
12
+ csvnorm-0.3.11.dist-info/entry_points.txt,sha256=6GmXNAdKzhHle3KV6Gbe95RS6edR0Fe6PVqdy1ED23k,45
13
+ csvnorm-0.3.11.dist-info/top_level.txt,sha256=jMenLN9avlYH8z8v23sReLFkUYx0awPGFm8cTnoAl-0,8
14
+ csvnorm-0.3.11.dist-info/RECORD,,
@@ -1,13 +0,0 @@
1
- csvnorm/__init__.py,sha256=8njXIycxL0qSI5Q9bVGyTaM41j_kKX9jV7TeQOSAQGE,263
2
- csvnorm/__main__.py,sha256=WDURvgm7E-yMN_ZvbBBmgzIJz5naonZxQ9RYcoD7ves,110
3
- csvnorm/cli.py,sha256=MwIPahLktbulF6NYRWyBsE4s9Al9_aSdA1zvzuI0AiQ,3815
4
- csvnorm/core.py,sha256=_kTaui_2IhqrN_UxJpcjwXYXEvqaRMhML49Xlx-e0p0,6633
5
- csvnorm/encoding.py,sha256=kV7J8nVWDv_F6bKUCrMj2rReMB4Zecf4gwFeebdjm98,3334
6
- csvnorm/utils.py,sha256=gvwDToOx3YoKCfVPyCmxcSa7teCWFB2SmAGr-jV5w_Y,1761
7
- csvnorm/validation.py,sha256=iXdfalAGDNB9kPefyzHXGI9uc-HLAG5pQ_-T93ShppY,2815
8
- csvnorm-0.3.3.dist-info/licenses/LICENSE,sha256=iPlktyjNwNMal_HUjGRTAavIF8mHV3CxGW4g5XmG3ZU,1075
9
- csvnorm-0.3.3.dist-info/METADATA,sha256=xKJmLVX9RoB22KwAAlxAvWB_KA9h68m5V-UyFaS_DGo,7840
10
- csvnorm-0.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
11
- csvnorm-0.3.3.dist-info/entry_points.txt,sha256=6GmXNAdKzhHle3KV6Gbe95RS6edR0Fe6PVqdy1ED23k,45
12
- csvnorm-0.3.3.dist-info/top_level.txt,sha256=jMenLN9avlYH8z8v23sReLFkUYx0awPGFm8cTnoAl-0,8
13
- csvnorm-0.3.3.dist-info/RECORD,,