csvnorm-0.3.4-py3-none-any.whl → csvnorm-0.3.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
csvnorm/__init__.py CHANGED
@@ -1,6 +1,6 @@
  """csvnorm - Validate and normalize CSV files."""
 
- __version__ = "0.3.4"
+ __version__ = "0.3.11"
  __all__ = ["normalize_csv", "detect_encoding", "process_csv"]
 
  from csvnorm.core import process_csv
csvnorm/cli.py CHANGED
@@ -3,6 +3,7 @@
  import argparse
  import sys
  from pathlib import Path
+ from typing import Optional
 
  from rich.console import Console
  from rich_argparse import RichHelpFormatter
@@ -15,11 +16,10 @@ console = Console()
 
 
  def show_banner() -> None:
- """Show ASCII art banner."""
- from pyfiglet import figlet_format
-
- banner = figlet_format("csvnorm", font="slant")
- console.print(banner, style="bold cyan")
+ """Show simple styled banner."""
+ console.print()
+ console.print(" csvnorm ", style="bold cyan on black", justify="center")
+ console.print()
 
 
  class VersionAction(argparse.Action):
@@ -50,9 +50,9 @@ def create_parser() -> argparse.ArgumentParser:
  formatter_class=RichHelpFormatter,
  epilog="""\
  Examples:
- csvnorm data.csv -d ';' -o output_folder --force
+ csvnorm data.csv -d ';' -o output.csv --force
  csvnorm data.csv --keep-names --delimiter '\\t'
- csvnorm https://example.com/data.csv -o output
+ csvnorm https://example.com/data.csv -o processed/data.csv
  csvnorm data.csv -V
  """,
  )
@@ -90,10 +90,9 @@ Examples:
 
  parser.add_argument(
  "-o",
- "--output-dir",
+ "--output-file",
  type=Path,
- default=Path.cwd(),
- help="Set custom output directory (default: current working directory)",
+ help="Set output file path (absolute or relative)",
  )
 
  parser.add_argument(
@@ -114,7 +113,7 @@ Examples:
  return parser
 
 
- def main(argv: list[str] | None = None) -> int:
+ def main(argv: Optional[list[str]] = None) -> int:
  """Main entry point for the CLI.
 
  Args:
@@ -142,10 +141,17 @@ def main(argv: list[str] | None = None) -> int:
  # Setup logging
  setup_logger(args.verbose)
 
+ # Determine output file (default: input filename in current directory)
+ if args.output_file is None:
+ input_name = Path(args.input_file).name
+ output_file = Path.cwd() / input_name
+ else:
+ output_file = args.output_file
+
  # Run processing
  return process_csv(
  input_file=args.input_file,
- output_dir=args.output_dir,
+ output_file=output_file,
  force=args.force,
  keep_names=args.keep_names,
  delimiter=args.delimiter,
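
The new `-o/--output-file` option replaces `--output-dir`: when it is omitted, the output lands in the current working directory under the input's filename. A minimal sketch of that resolution logic (the `resolve_output_file` helper and the paths are illustrative, not part of the package):

```python
from pathlib import Path
from typing import Optional


def resolve_output_file(input_file: str, output_file: Optional[Path]) -> Path:
    """Mirror the CLI default: reuse the input filename in the current directory."""
    if output_file is None:
        return Path.cwd() / Path(input_file).name
    return output_file


# With -o omitted, "data/raw/sales.csv" becomes "<cwd>/sales.csv";
# an explicit -o value is used as-is.
print(resolve_output_file("data/raw/sales.csv", None))
print(resolve_output_file("data/raw/sales.csv", Path("processed/sales.csv")))
```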
csvnorm/core.py CHANGED
@@ -1,18 +1,24 @@
  """Core processing logic for csvnorm."""
 
  import logging
+ import tempfile
  from pathlib import Path
  from typing import Union
 
  from rich.console import Console
- from rich.panel import Panel
  from rich.progress import Progress, SpinnerColumn, TextColumn
- from rich.table import Table
 
  from csvnorm.encoding import convert_to_utf8, detect_encoding, needs_conversion
+ from csvnorm.ui import (
+ show_error_panel,
+ show_success_table,
+ show_validation_error_panel,
+ show_warning_panel,
+ )
  from csvnorm.utils import (
- ensure_output_dir,
  extract_filename_from_url,
+ get_column_count,
+ get_row_count,
  is_url,
  to_snake_case,
  validate_delimiter,
@@ -26,7 +32,7 @@ console = Console()
 
  def process_csv(
  input_file: str,
- output_dir: Path,
+ output_file: Path,
  force: bool = False,
  keep_names: bool = False,
  delimiter: str = ",",
@@ -36,7 +42,7 @@ def process_csv(
 
  Args:
  input_file: Path to input CSV file or HTTP/HTTPS URL.
- output_dir: Directory for output files.
+ output_file: Full path for output file.
  force: If True, overwrite existing output files.
  keep_names: If True, keep original column names.
  delimiter: Output field delimiter.
@@ -54,7 +60,7 @@ def process_csv(
  try:
  validate_url(input_file)
  except ValueError as e:
- console.print(Panel(f"[bold red]Error:[/bold red] {e}", border_style="red"))
+ show_error_panel(str(e))
  return 1
  base_name = extract_filename_from_url(input_file)
  input_path = input_file  # Keep as string for DuckDB
@@ -62,21 +68,11 @@ def process_csv(
  # Validate local file
  file_path = Path(input_file)
  if not file_path.exists():
- console.print(
- Panel(
- f"[bold red]Error:[/bold red] Input file not found\n{file_path}",
- border_style="red",
- )
- )
+ show_error_panel(f"Input file not found\n{file_path}")
  return 1
 
  if not file_path.is_file():
- console.print(
- Panel(
- f"[bold red]Error:[/bold red] Not a file\n{file_path}",
- border_style="red",
- )
- )
+ show_error_panel(f"Not a file\n{file_path}")
  return 1
 
  base_name = to_snake_case(file_path.name)
@@ -85,34 +81,30 @@ def process_csv(
  try:
  validate_delimiter(delimiter)
  except ValueError as e:
- console.print(Panel(f"[bold red]Error:[/bold red] {e}", border_style="red"))
+ show_error_panel(str(e))
  return 1
 
  # Setup paths
- ensure_output_dir(output_dir)
-
- output_file = output_dir / f"{base_name}.csv"
- reject_file = output_dir / f"{base_name}_reject_errors.csv"
- temp_utf8_file = output_dir / f"{base_name}_utf8.csv"
+ output_dir = output_file.parent
+ temp_dir = Path(tempfile.mkdtemp(prefix="csvnorm_"))
+ reject_file = output_dir / f"{output_file.stem}_reject_errors.csv"
+ temp_utf8_file = temp_dir / f"{output_file.stem}_utf8.csv"
 
  # Check if output exists
  if output_file.exists() and not force:
- console.print(
- Panel(
- f"[bold yellow]Warning:[/bold yellow] Output file already exists\n\n"
- f"{output_file}\n\n"
- f"Use [bold]--force[/bold] to overwrite.",
- border_style="yellow",
- )
+ show_warning_panel(
+ f"Output file already exists\n\n"
+ f"{output_file}\n\n"
+ f"Use [bold]--force[/bold] to overwrite."
  )
  return 1
 
- # Clean up previous reject file
+ # Clean up previous reject file (always overwrite)
  if reject_file.exists():
  reject_file.unlink()
 
  # Track files to clean up
- temp_files: list[Path] = []
+ temp_files: list[Path] = [temp_dir]
 
  try:
  with Progress(
@@ -142,9 +134,7 @@
  encoding = detect_encoding(file_input_path)
  except ValueError as e:
  progress.stop()
- console.print(
- Panel(f"[bold red]Error:[/bold red] {e}", border_style="red")
- )
+ show_error_panel(str(e))
  return 1
 
  logger.debug(f"Detected encoding: {encoding}")
@@ -164,16 +154,11 @@
  working_file = temp_utf8_file
  temp_files.append(temp_utf8_file)
  progress.update(
- task, description=f"[green]✓[/green] Converted to UTF-8"
+ task, description="[green]✓[/green] Converted to UTF-8"
  )
  except (UnicodeDecodeError, LookupError) as e:
  progress.stop()
- console.print(
- Panel(
- f"[bold red]Error:[/bold red] Encoding conversion failed\n{e}",
- border_style="red",
- )
- )
+ show_error_panel(f"Encoding conversion failed\n{e}")
  return 1
  else:
  progress.update(
@@ -186,7 +171,9 @@
  logger.debug("Validating CSV with DuckDB...")
 
  try:
- is_valid = validate_csv(working_file, reject_file, is_remote=is_remote)
+ reject_count, error_types = validate_csv(
+ working_file, reject_file, is_remote=is_remote
+ )
  except Exception as e:
  progress.stop()
  error_msg = str(e)
@@ -194,61 +181,38 @@
  # Check for common HTTP errors
  if "HTTP Error" in error_msg or "HTTPException" in error_msg:
  if "404" in error_msg:
- console.print(
- Panel(
- "[bold red]Error:[/bold red] Remote CSV file not found (HTTP 404)\n\n"
- f"URL: [cyan]{input_file}[/cyan]\n\n"
- "Please check the URL is correct.",
- border_style="red",
- )
+ show_error_panel(
+ f"Remote CSV file not found (HTTP 404)\n\n"
+ f"URL: [cyan]{input_file}[/cyan]\n\n"
+ "Please check the URL is correct."
  )
  elif "401" in error_msg or "403" in error_msg:
- console.print(
- Panel(
- "[bold red]Error:[/bold red] Authentication required (HTTP 401/403)\n\n"
- f"URL: [cyan]{input_file}[/cyan]\n\n"
- "This tool only supports public URLs without authentication.\n"
- "Please download the file manually first.",
- border_style="red",
- )
+ show_error_panel(
+ f"Authentication required (HTTP 401/403)\n\n"
+ f"URL: [cyan]{input_file}[/cyan]\n\n"
+ "This tool only supports public URLs without authentication.\n"
+ "Please download the file manually first."
  )
  elif (
  "timeout" in error_msg.lower()
  or "timed out" in error_msg.lower()
  ):
- console.print(
- Panel(
- "[bold red]Error:[/bold red] HTTP request timeout (30 seconds)\n\n"
- f"URL: [cyan]{input_file}[/cyan]\n\n"
- "The remote server took too long to respond.\n"
- "Try again later or download the file manually.",
- border_style="red",
- )
+ show_error_panel(
+ f"HTTP request timeout (30 seconds)\n\n"
+ f"URL: [cyan]{input_file}[/cyan]\n\n"
+ "The remote server took too long to respond.\n"
+ "Try again later or download the file manually."
  )
  else:
- console.print(
- Panel(
- f"[bold red]Error:[/bold red] HTTP request failed\n\n"
- f"{error_msg}",
- border_style="red",
- )
- )
+ show_error_panel(f"HTTP request failed\n\n{error_msg}")
  else:
  # Re-raise non-HTTP errors
  raise
  return 1
 
- if not is_valid:
+ has_validation_errors = reject_count > 1
+ if has_validation_errors:
  progress.stop()
- console.print(
- Panel(
- "[bold red]Error:[/bold red] DuckDB encountered invalid rows\n\n"
- f"Details: [cyan]{reject_file}[/cyan]\n\n"
- "Please fix the issues and try again.",
- border_style="red",
- )
- )
- return 1
 
  progress.update(task, description="[green]✓[/green] CSV validated")
 
@@ -266,27 +230,44 @@
  logger.debug(f"Output written to: {output_file}")
  progress.update(task, description="[green]✓[/green] Complete")
 
- # Success summary table
- table = Table(show_header=False, box=None, padding=(0, 1))
- table.add_row("[green]✓[/green] Success", "")
- table.add_row("Input:", f"[cyan]{input_file}[/cyan]")
- table.add_row("Output:", f"[cyan]{output_file}[/cyan]")
- if not is_remote:
- table.add_row("Encoding:", encoding)
- if delimiter != ",":
- table.add_row("Delimiter:", repr(delimiter))
- if not keep_names:
- table.add_row("Headers:", "normalized to snake_case")
-
- console.print()
- console.print(table)
+ # Collect statistics
+ input_size = (
+ working_file.stat().st_size if isinstance(working_file, Path) else 0
+ )
+ output_size = output_file.stat().st_size
+ row_count = get_row_count(output_file)
+ column_count = get_column_count(output_file, delimiter)
+
+ # Show success summary
+ show_success_table(
+ input_file=input_file,
+ output_file=output_file,
+ encoding=encoding,
+ is_remote=is_remote,
+ row_count=row_count,
+ column_count=column_count,
+ input_size=input_size,
+ output_size=output_size,
+ delimiter=delimiter,
+ keep_names=keep_names,
+ )
+
+ # Show validation errors if any
+ if has_validation_errors:
+ show_validation_error_panel(reject_count, error_types, reject_file)
+ return 1
 
  finally:
- # Cleanup temp files
- for temp_file in temp_files:
- if temp_file.exists():
- logger.debug(f"Removing temp file: {temp_file}")
- temp_file.unlink()
+ # Cleanup temp directory
+ import shutil
+
+ for temp_path in temp_files:
+ if temp_path.exists():
+ logger.debug(f"Removing temp path: {temp_path}")
+ if temp_path.is_dir():
+ shutil.rmtree(temp_path)
+ else:
+ temp_path.unlink()
 
  # Remove reject file if empty (only header)
  if reject_file.exists():
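
With `output_dir` replaced by `output_file`, library callers now pass the full destination path rather than a directory. A minimal sketch of a programmatic call under the new signature (paths are placeholders):

```python
from pathlib import Path

from csvnorm.core import process_csv

# Returns a process-style exit code: 0 on success, 1 on any error,
# including validation failures reported after the summary table.
exit_code = process_csv(
    input_file="data/raw/sales.csv",          # local path or public HTTP(S) URL
    output_file=Path("processed/sales.csv"),  # full output path, not a directory
    force=True,                               # overwrite an existing output file
    keep_names=False,                         # normalize headers to snake_case
    delimiter=",",
)
print(exit_code)
```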
csvnorm/ui.py ADDED
@@ -0,0 +1,124 @@
+ """UI formatting functions for csvnorm terminal output."""
+
+ from pathlib import Path
+
+ from rich.console import Console
+ from rich.panel import Panel
+ from rich.table import Table
+
+ from csvnorm.encoding import needs_conversion
+ from csvnorm.utils import format_file_size
+
+ console = Console()
+
+
+ def show_error_panel(message: str, title: str = "Error") -> None:
+ """Display an error panel with red border.
+
+ Args:
+ message: Error message to display.
+ title: Panel title (default: "Error").
+ """
+ console.print(Panel(f"[bold red]{title}:[/bold red] {message}", border_style="red"))
+
+
+ def show_warning_panel(message: str, title: str = "Warning") -> None:
+ """Display a warning panel with yellow border.
+
+ Args:
+ message: Warning message to display.
+ title: Panel title (default: "Warning").
+ """
+ console.print(
+ Panel(f"[bold yellow]{title}:[/bold yellow] {message}", border_style="yellow")
+ )
+
+
+ def show_success_table(
+ input_file: str,
+ output_file: Path,
+ encoding: str,
+ is_remote: bool,
+ row_count: int,
+ column_count: int,
+ input_size: int,
+ output_size: int,
+ delimiter: str,
+ keep_names: bool,
+ ) -> None:
+ """Display success summary table with processing results.
+
+ Args:
+ input_file: Input CSV file path or URL.
+ output_file: Output CSV file path.
+ encoding: Detected encoding (or "remote" for URLs).
+ is_remote: Whether input was a remote URL.
+ row_count: Number of data rows in output.
+ column_count: Number of columns in output.
+ input_size: Input file size in bytes (0 for remote).
+ output_size: Output file size in bytes.
+ delimiter: Output delimiter character.
+ keep_names: Whether original column names were kept.
+ """
+ table = Table(show_header=False, box=None, padding=(0, 1))
+ table.add_row("[green]✓[/green] Success", "")
+ table.add_row("Input:", f"[cyan]{input_file}[/cyan]")
+ table.add_row("Output:", f"[cyan]{output_file}[/cyan]")
+
+ # Encoding info
+ if not is_remote:
+ if needs_conversion(encoding):
+ table.add_row("Encoding:", f"{encoding} → UTF-8 [dim](converted)[/dim]")
+ else:
+ table.add_row("Encoding:", f"{encoding} [dim](no conversion needed)[/dim]")
+ else:
+ table.add_row("Encoding:", "remote [dim](handled by DuckDB)[/dim]")
+
+ # Statistics
+ table.add_row("Rows:", f"{row_count:,}")
+ table.add_row("Columns:", f"{column_count}")
+ table.add_row("Input size:", format_file_size(input_size))
+ table.add_row("Output size:", format_file_size(output_size))
+
+ # Optional fields
+ if delimiter != ",":
+ table.add_row("Delimiter:", repr(delimiter))
+ if not keep_names:
+ table.add_row("Headers:", "normalized to snake_case")
+
+ console.print()
+ console.print(table)
+
+
+ def show_validation_error_panel(
+ reject_count: int, error_types: list[str], reject_file: Path
+ ) -> None:
+ """Display validation error summary panel.
+
+ Args:
+ reject_count: Number of rejected rows (including header).
+ error_types: List of error type descriptions.
+ reject_file: Path to reject errors CSV file.
+ """
+ console.print()
+ error_lines = []
+ error_lines.append("[bold red]Validation Errors:[/bold red]")
+ error_lines.append("")
+ error_lines.append(f"Rejected rows: [yellow]{reject_count - 1}[/yellow]")
+
+ if error_types:
+ error_lines.append("")
+ error_lines.append("[dim]Error types:[/dim]")
+ for error_type in error_types:
+ error_lines.append(f" • {error_type}")
+
+ error_lines.append("")
+ error_lines.append(f"Details: [cyan]{reject_file}[/cyan]")
+
+ console.print(
+ Panel(
+ "\n".join(error_lines),
+ border_style="yellow",
+ title="[yellow]![/yellow] Validation Failed",
+ )
+ )
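
The new `csvnorm.ui` module centralizes the Rich panels and summary table that `core.py` previously built inline. A small sketch of direct use (the message text, file names, and counts are illustrative; note that `reject_count` includes the reject file's header line):

```python
from pathlib import Path

from csvnorm.ui import show_error_panel, show_validation_error_panel, show_warning_panel

# Simple red/yellow panels; the title keyword defaults to "Error"/"Warning".
show_warning_panel("Output file already exists\n\nUse [bold]--force[/bold] to overwrite.")
show_error_panel("Input file not found\nmissing.csv")

# A reject_count of 3 means 2 rejected data rows plus the header line.
show_validation_error_panel(
    reject_count=3,
    error_types=["Expected Number of Columns: 3 Found: 2"],
    reject_file=Path("output/data_reject_errors.csv"),
)
```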
csvnorm/utils.py CHANGED
@@ -3,6 +3,7 @@
  import logging
  import re
  from pathlib import Path
+ from typing import Union
  from urllib.parse import urlparse
 
  from rich.logging import RichHandler
@@ -124,3 +125,68 @@ def extract_filename_from_url(url: str) -> str:
 
  # Apply snake_case normalization
  return to_snake_case(filename) if filename else "data"
+
+
+ def format_file_size(size_bytes: int) -> str:
+ """Format file size in human-readable format.
+
+ Args:
+ size_bytes: File size in bytes.
+
+ Returns:
+ Formatted size string (e.g., "1.5 MB", "256 KB").
+ """
+ for unit in ["B", "KB", "MB", "GB"]:
+ if size_bytes < 1024.0:
+ return f"{size_bytes:.1f} {unit}" if unit != "B" else f"{size_bytes} B"
+ size_bytes /= 1024.0
+ return f"{size_bytes:.1f} TB"
+
+
+ def get_row_count(file_path: Union[Path, str]) -> int:
+ """Count number of rows in a CSV file.
+
+ Args:
+ file_path: Path to CSV file.
+
+ Returns:
+ Number of data rows (excluding header), or 0 if file doesn't exist.
+ """
+ if not isinstance(file_path, Path) or not file_path.exists():
+ return 0
+
+ try:
+ with open(file_path, "r") as f:
+ # Skip header
+ next(f, None)
+ return sum(1 for _ in f)
+ except Exception:
+ return 0
+
+
+ def get_column_count(file_path: Union[Path, str], delimiter: str = ",") -> int:
+ """Count number of columns in a CSV file using DuckDB.
+
+ Args:
+ file_path: Path to CSV file.
+ delimiter: Field delimiter used in the CSV file.
+
+ Returns:
+ Number of columns in the CSV, or 0 if file doesn't exist or error.
+ """
+ if not isinstance(file_path, Path) or not file_path.exists():
+ return 0
+
+ try:
+ import duckdb
+
+ conn = duckdb.connect(":memory:")
+ # Get column names from CSV using DuckDB DESCRIBE
+ columns = conn.execute(
+ f"DESCRIBE SELECT * FROM read_csv('{file_path}', delim='{delimiter}', header=true, sample_size=1)"
+ ).fetchall()
+ conn.close()
+
+ return len(columns)
+ except Exception:
+ return 0
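
These three helpers feed the success summary: `format_file_size` pretty-prints byte counts, while `get_row_count` and `get_column_count` inspect the finished output file and return 0 rather than raising on problems. A quick sketch with placeholder paths:

```python
from pathlib import Path

from csvnorm.utils import format_file_size, get_column_count, get_row_count

print(format_file_size(512))        # "512 B"
print(format_file_size(1_572_864))  # "1.5 MB"

output_path = Path("processed/sales.csv")  # placeholder; any existing CSV works
print(get_row_count(output_path))                    # data rows, header excluded
print(get_column_count(output_path, delimiter=","))  # columns via DuckDB DESCRIBE, 0 on error
```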
csvnorm/validation.py CHANGED
@@ -11,7 +11,7 @@ logger = logging.getLogger("csvnorm")
 
  def validate_csv(
  file_path: Union[Path, str], reject_file: Path, is_remote: bool = False
- ) -> bool:
+ ) -> tuple[int, list[str]]:
  """Validate CSV file using DuckDB and export rejected rows.
 
  Args:
@@ -20,7 +20,8 @@ def validate_csv(
  is_remote: True if file_path is a remote URL.
 
  Returns:
- True if validation passes (no rejected rows), False otherwise.
+ Tuple of (reject_count, error_types) where error_types is list of
+ up to 3 unique error reasons from reject file.
  """
  logger.debug(f"Validating CSV: {file_path}")
 
@@ -54,7 +55,12 @@
  reject_count = _count_lines(reject_file)
  logger.debug(f"Reject file lines: {reject_count}")
 
- return reject_count <= 1
+ # Collect sample error types from reject file
+ error_types = []
+ if reject_count > 1:
+ error_types = _get_error_types(reject_file)
+
+ return reject_count, error_types
 
 
  def normalize_csv(
@@ -121,3 +127,36 @@ def _count_lines(file_path: Path) -> int:
 
  with open(file_path, "r") as f:
  return sum(1 for _ in f)
+
+
+ def _get_error_types(reject_file: Path) -> list[str]:
+ """Extract sample error types from reject file.
+
+ Args:
+ reject_file: Path to reject_errors.csv file.
+
+ Returns:
+ List of up to 3 unique error reasons.
+ """
+ if not reject_file.exists():
+ return []
+
+ error_types: set[str] = set()
+ try:
+ with open(reject_file, "r") as f:
+ # Skip header
+ next(f, None)
+ for line in f:
+ # Error message is in the last column
+ parts = line.rstrip("\n").split(",")
+ if parts:
+ error_reason = parts[-1].strip()
+ if error_reason and error_reason != "error":
+ error_types.add(error_reason)
+ if len(error_types) >= 3:
+ break
+ except Exception as e:
+ logger.warning(f"Failed to extract error types: {e}")
+ return []
+
+ return list(error_types)[:3]
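
Because `validate_csv` now returns `(reject_count, error_types)` instead of a bare boolean, callers decide what counts as a failure; `core.py` treats anything above one line in the reject file (the header) as a validation error. A minimal sketch of the new contract (paths are placeholders):

```python
from pathlib import Path

from csvnorm.validation import validate_csv

reject_file = Path("output/sales_reject_errors.csv")  # placeholder path
reject_count, error_types = validate_csv(Path("output/sales_utf8.csv"), reject_file)

# reject_count counts lines in the reject file, header included,
# so more than one line means at least one rejected row.
if reject_count > 1:
    print(f"{reject_count - 1} rejected rows; sample reasons: {error_types}")
else:
    print("CSV validated cleanly")
```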
{csvnorm-0.3.4.dist-info → csvnorm-0.3.11.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: csvnorm
- Version: 0.3.4
+ Version: 0.3.11
  Summary: A command-line utility to validate and normalize CSV files
  Author-email: aborruso <aborruso@gmail.com>
  License: MIT License
@@ -34,7 +34,6 @@ Classifier: Intended Audience :: Science/Research
  Classifier: License :: OSI Approved :: MIT License
  Classifier: Operating System :: OS Independent
  Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
@@ -49,7 +48,6 @@ Requires-Dist: charset-normalizer>=3.0.0
  Requires-Dist: duckdb>=0.9.0
  Requires-Dist: rich>=13.0.0
  Requires-Dist: rich-argparse>=1.0.0
- Requires-Dist: pyfiglet>=1.0.0
  Provides-Extra: dev
  Requires-Dist: pytest>=7.0.0; extra == "dev"
  Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
@@ -58,7 +56,7 @@ Dynamic: license-file
 
  [![PyPI version](https://badge.fury.io/py/csvnorm.svg)](https://pypi.org/project/csvnorm/)
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
- [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
  [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/aborruso/csvnorm)
 
  # csvnorm
@@ -100,7 +98,8 @@ This tool prepares CSV files for **basic exploratory data analysis (EDA)**, not
  - **Delimiter Normalization**: Converts all field separators to standard commas (`,`)
  - **Field Name Normalization**: Converts column headers to snake_case format
  - **Encoding Normalization**: Auto-detects encoding and converts to UTF-8
- - **Error Reporting**: Exports detailed error file for invalid rows
+ - **Processing Summary**: Displays comprehensive statistics (rows, columns, file sizes) and error details
+ - **Error Reporting**: Exports detailed error file for invalid rows with summary panel
  - **Remote URL Support**: Process CSV files directly from HTTP/HTTPS URLs without downloading
 
  ## Usage
@@ -145,14 +144,14 @@ csvnorm data.csv -f -V
 
  ### Output
 
- Creates a normalized CSV file in the specified output directory with:
+ Creates a normalized CSV file in specified output directory with:
  - UTF-8 encoding
  - Consistent field delimiters
  - Normalized column names (unless `--keep-names` is specified)
  - Error report if any invalid rows are found (saved as `{input_name}_reject_errors.csv`)
 
  For remote URLs:
- - The output filename is derived from the URL's last path segment
+ - The output filename is derived from URL's last path segment
  - Encoding is handled automatically by DuckDB
  - HTTP timeout is set to 30 seconds
  - Only public URLs are supported (no authentication)
@@ -160,9 +159,49 @@ For remote URLs:
  The tool provides modern terminal output with:
  - Progress indicators for multi-step processing
  - Color-coded error messages with panels
- - Success summary table showing encoding, paths, and settings
+ - Success summary table with statistics (rows, columns, file sizes)
+ - Encoding conversion status (converted/no conversion/remote)
+ - Error summary panel with reject count and error types when validation fails
  - ASCII art banner with `--version` and `-V` verbose mode
 
+ **Success Example:**
+ ```
+ ✓ Success
+ Input: test/utf8_basic.csv
+ Output: output/utf8_basic.csv
+ Encoding: ascii (no conversion needed)
+ Rows: 2
+ Columns: 3
+ Input size: 42 B
+ Output size: 43 B
+ Headers: normalized to snake_case
+ ```
+
+ **Error Example:**
+ ```
+ ✓ Success
+ Input: test/malformed_rows.csv
+ Output: output/malformed_rows.csv
+ Encoding: ascii (no conversion needed)
+ Rows: 1
+ Columns: 4
+ Input size: 24 B
+ Output size: 40 B
+ Headers: normalized to snake_case
+
+ ╭──────────────────────────── ! Validation Failed ─────────────────────────────╮
+ │ Validation Errors: │
+ │ │
+ │ Rejected rows: 2 │
+ │ │
+ │ Error types: │
+ │ • Expected Number of Columns: 3 Found: 2 │
+ │ • Expected Number of Columns: 3 Found: 4 │
+ │ │
+ │ Details: output/malformed_rows_reject_errors.csv │
+ ╰──────────────────────────────────────────────────────────────────────────────╯
+ ```
+
  ### Exit Codes
 
  | Code | Meaning |
@@ -172,13 +211,13 @@ The tool provides modern terminal output with:
 
  ## Requirements
 
- - Python 3.8+
+ - Python 3.9+
  - Dependencies (automatically installed):
  - `charset-normalizer>=3.0.0` - Encoding detection
  - `duckdb>=0.9.0` - CSV validation and normalization
  - `rich>=13.0.0` - Modern terminal output formatting
  - `rich-argparse>=1.0.0` - Enhanced CLI help formatting
- - `pyfiglet>=1.0.0` - ASCII art banner
+ - `pyfiglet>=0.8.post1,<1.0.0` - ASCII art banner
 
  Optional extras:
  - `[dev]` - Development dependencies (`pytest>=7.0.0`, `pytest-cov>=4.0.0`, `ruff>=0.1.0`)
@@ -209,7 +248,7 @@ pytest tests/ -v
  ### Project Structure
 
  ```
- prepare_data/
+ csvnorm/
  ├── src/csvnorm/
  │ ├── __init__.py # Package version
  │ ├── __main__.py # python -m support
csvnorm-0.3.11.dist-info/RECORD ADDED
@@ -0,0 +1,14 @@
+ csvnorm/__init__.py,sha256=frEketezK5MWX8eiy1mFgw_3QeMcH4cVgVsNXtD1Jgg,264
+ csvnorm/__main__.py,sha256=WDURvgm7E-yMN_ZvbBBmgzIJz5naonZxQ9RYcoD7ves,110
+ csvnorm/cli.py,sha256=UEe0hRGWx9m6ZLGLd9TIaJ_uayclNTh_i0fO_JEgTXY,4166
+ csvnorm/core.py,sha256=0tgOmPr4JSMSzgSxT8ffCk_IrOWGLI2hTzhV9_xNQQ8,9945
+ csvnorm/encoding.py,sha256=kV7J8nVWDv_F6bKUCrMj2rReMB4Zecf4gwFeebdjm98,3334
+ csvnorm/ui.py,sha256=rOfVYjnTImplMMc-QGmcYUXzzZ513Y1bCjlO2jPxG2A,3893
+ csvnorm/utils.py,sha256=slV2aADBDfg9RHZJE-jmRuzPfY1RX0Wq-D1A4oBN7Yo,5020
+ csvnorm/validation.py,sha256=I7m_nxsGDROy5pBkNU-H7qEVYEAT19vw5alkrvZqGh4,4539
+ csvnorm-0.3.11.dist-info/licenses/LICENSE,sha256=iPlktyjNwNMal_HUjGRTAavIF8mHV3CxGW4g5XmG3ZU,1075
+ csvnorm-0.3.11.dist-info/METADATA,sha256=7c2Bu-M-4UiOqqVOC5Nm-I88ZhmC2BquMSiGRjD9VBo,9808
+ csvnorm-0.3.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ csvnorm-0.3.11.dist-info/entry_points.txt,sha256=6GmXNAdKzhHle3KV6Gbe95RS6edR0Fe6PVqdy1ED23k,45
+ csvnorm-0.3.11.dist-info/top_level.txt,sha256=jMenLN9avlYH8z8v23sReLFkUYx0awPGFm8cTnoAl-0,8
+ csvnorm-0.3.11.dist-info/RECORD,,
csvnorm-0.3.4.dist-info/RECORD DELETED
@@ -1,13 +0,0 @@
- csvnorm/__init__.py,sha256=OvADr4XLxKa9CBE8oTVtV_YBCvr7oV8cgLN68cUtC1E,263
- csvnorm/__main__.py,sha256=WDURvgm7E-yMN_ZvbBBmgzIJz5naonZxQ9RYcoD7ves,110
- csvnorm/cli.py,sha256=DZYclAKMhyZp234D_aWJUncVXZJDGO4u4Jh_fVHlz-g,3939
- csvnorm/core.py,sha256=fby0c4Fs7zkvC3pqvxo8U-4eXm-SnQJ8UivluReDxxc,11126
- csvnorm/encoding.py,sha256=kV7J8nVWDv_F6bKUCrMj2rReMB4Zecf4gwFeebdjm98,3334
- csvnorm/utils.py,sha256=k5gYxlmdtJOJEhOU1UxnmPb8Akn3UUIsB02S-t5oj4c,3227
- csvnorm/validation.py,sha256=cB0rASU-f7C8M539lFoR7bWhPmG5_LfM7f3S5mRSqAM,3321
- csvnorm-0.3.4.dist-info/licenses/LICENSE,sha256=iPlktyjNwNMal_HUjGRTAavIF8mHV3CxGW4g5XmG3ZU,1075
- csvnorm-0.3.4.dist-info/METADATA,sha256=BkJEIeef7w1IsC5TMOHH1DL-Hj6A6PD-ZbCDbwRmCpg,7857
- csvnorm-0.3.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- csvnorm-0.3.4.dist-info/entry_points.txt,sha256=6GmXNAdKzhHle3KV6Gbe95RS6edR0Fe6PVqdy1ED23k,45
- csvnorm-0.3.4.dist-info/top_level.txt,sha256=jMenLN9avlYH8z8v23sReLFkUYx0awPGFm8cTnoAl-0,8
- csvnorm-0.3.4.dist-info/RECORD,,