splurge-dsv 2025.2.0-py3-none-any.whl → 2025.3.1-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published.
splurge_dsv/__init__.py CHANGED
@@ -13,9 +13,20 @@ Copyright (c) 2025 Jim Schilling
 # test cases may remove the process working directory which causes calls to
 # os.getcwd() to raise FileNotFoundError later during test execution. Guard
 # against that here by switching to this package directory when cwd is missing.
+# Ensure the required external implementation is available on import so the
+# rest of the package can rely on its APIs. Fail fast with a helpful message
+# instructing the user to install the package if it's missing.
+import importlib as _importlib
 import os
 from pathlib import Path as _Path
 
+try:  # pragma: no cover - import-time guard
+    _importlib.import_module("splurge_safe_io")
+except Exception as e:
+    raise ImportError(
+        "Missing required dependency 'splurge-safe-io'. Please install it: `pip install splurge-safe-io`"
+    ) from e
+
 try:
     try:
         # os.getcwd() can raise FileNotFoundError in CI/runner environments
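
The new import-time guard fails fast when `splurge_safe_io` is absent instead of deferring the failure to the first call site. A minimal sketch of what a caller observes, assuming only what the guard above shows (the handling here is illustrative, not part of the package):

    # Illustrative: exercising the fail-fast guard added above.
    try:
        import splurge_dsv  # raises ImportError if splurge-safe-io is not installed
    except ImportError as exc:
        # exc chains the original failure and carries the install hint:
        # "Missing required dependency 'splurge-safe-io'. Please install it: ..."
        print(exc)
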
@@ -35,11 +46,13 @@ except Exception:
 from splurge_dsv.dsv import Dsv, DsvConfig
 from splurge_dsv.dsv_helper import DsvHelper
 from splurge_dsv.exceptions import (
+    SplurgeDsvColumnMismatchError,
     SplurgeDsvConfigurationError,
     SplurgeDsvDataProcessingError,
     # canonical SplurgeDsv* exception names
     SplurgeDsvError,
     SplurgeDsvFileEncodingError,
+    SplurgeDsvFileExistsError,
     SplurgeDsvFileNotFoundError,
     SplurgeDsvFileOperationError,
     SplurgeDsvFilePermissionError,
@@ -56,11 +69,9 @@ from splurge_dsv.exceptions import (
     SplurgeDsvTypeConversionError,
     SplurgeDsvValidationError,
 )
-from splurge_dsv.path_validator import PathValidator
 from splurge_dsv.string_tokenizer import StringTokenizer
-from splurge_dsv.text_file_helper import TextFileHelper
 
-__version__ = "2025.2.0"
+__version__ = "2025.3.1"
 __author__ = "Jim Schilling"
 __license__ = "MIT"
 
@@ -79,6 +90,7 @@ __all__ = [
     "SplurgeDsvPathValidationError",
     "SplurgeDsvDataProcessingError",
     "SplurgeDsvParsingError",
+    "SplurgeDsvColumnMismatchError",
     "SplurgeDsvTypeConversionError",
     "SplurgeDsvStreamingError",
     "SplurgeDsvConfigurationError",
@@ -89,8 +101,7 @@ __all__ = [
     "SplurgeDsvParameterError",
     "SplurgeDsvRangeError",
     "SplurgeDsvFormatError",
+    "SplurgeDsvFileExistsError",
     # Utility classes
     "StringTokenizer",
-    "TextFileHelper",
-    "PathValidator",
 ]
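
Net effect on the public surface: `SplurgeDsvColumnMismatchError` and `SplurgeDsvFileExistsError` are now exported, while `TextFileHelper` and `PathValidator` are removed from both the imports and `__all__`. A quick compatibility sketch using only names visible in this diff:

    # New in 2025.3.1: these imports now succeed.
    from splurge_dsv import SplurgeDsvColumnMismatchError, SplurgeDsvFileExistsError

    # Removed in 2025.3.1: these imports now raise ImportError.
    # from splurge_dsv import TextFileHelper, PathValidator
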
splurge_dsv/cli.py CHANGED
@@ -23,6 +23,7 @@ from pathlib import Path
 # Local imports
 from splurge_dsv import __version__
 from splurge_dsv.dsv import Dsv, DsvConfig
+from splurge_dsv.dsv_helper import DsvHelper
 from splurge_dsv.exceptions import SplurgeDsvError
 
 
@@ -39,14 +40,31 @@ def parse_arguments() -> argparse.Namespace:
         epilog="""
 Examples:
   python -m splurge_dsv data.csv --delimiter ,
-  python -m splurge_dsv data.tsv --delimiter "\\t"
+  python -m splurge_dsv data.tsv --delimiter "\t"
   python -m splurge_dsv data.txt --delimiter "|" --bookend '"'
+  # Auto-detect the expected column count and normalize rows
+  python -m splurge_dsv data.csv --delimiter , --detect-columns --max-detect-chunks 5
+  # Stream a large file while attempting to detect the column count from the first non-blank logical row
+  python -m splurge_dsv large.csv --delimiter , --stream --detect-columns --max-detect-chunks 10
         """,
     )
 
     parser.add_argument("file_path", type=str, help="Path to the DSV file to parse")
 
-    parser.add_argument("--delimiter", "-d", type=str, required=True, help="Delimiter character to use for parsing")
+    parser.add_argument(
+        "--config",
+        "-c",
+        dest="config",
+        type=str,
+        help="Path to a YAML config file that mirrors CLI options (values overridden by CLI args)",
+    )
+
+    parser.add_argument(
+        "--delimiter",
+        "-d",
+        type=str,
+        help="Delimiter character to use for parsing (may also be provided via --config)",
+    )
 
     parser.add_argument("--bookend", "-b", type=str, help="Bookend character for text fields (e.g., '\"')")
 
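With `--delimiter` no longer `required=True`, the delimiter may instead come from the YAML file passed via `--config`. A plausible config file, assuming keys that mirror the `DsvConfig` field names documented further down in this diff (file name and values are illustrative):

    # dsv.yaml (hypothetical example)
    delimiter: ","
    strip: true
    bookend: '"'
    encoding: utf-8
    skip_header_rows: 1
    detect_columns: true
    max_detect_chunks: 5
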
@@ -64,7 +82,53 @@ Examples:
         "--stream", "-s", action="store_true", help="Stream the file in chunks instead of loading entirely into memory"
     )
 
-    parser.add_argument("--chunk-size", type=int, default=500, help="Chunk size for streaming (default: 500)")
+    parser.add_argument(
+        "--detect-columns",
+        action="store_true",
+        help=(
+            "Auto-detect the expected column count from the first non-blank logical row "
+            "and normalize subsequent rows to that count. For streamed parsing, the "
+            "detector may scan up to --max-detect-chunks chunks from the start of the file."
+        ),
+    )
+
+    parser.add_argument(
+        "--raise-on-missing-columns",
+        action="store_true",
+        help="Raise an error if a row has fewer columns than the detected/expected count",
+    )
+
+    parser.add_argument(
+        "--raise-on-extra-columns",
+        action="store_true",
+        help="Raise an error if a row has more columns than the detected/expected count",
+    )
+
+    parser.add_argument(
+        "--chunk-size",
+        type=int,
+        default=DsvHelper.DEFAULT_CHUNK_SIZE,
+        help=(
+            f"Chunk size for streaming (minimum: {DsvHelper.DEFAULT_MIN_CHUNK_SIZE}, "
+            f"default: {DsvHelper.DEFAULT_CHUNK_SIZE})"
+        ),
+    )
+
+    parser.add_argument(
+        "--max-detect-chunks",
+        type=int,
+        default=DsvHelper.MAX_DETECT_CHUNKS,
+        help=(
+            "When detecting columns while streaming (use --detect-columns), "
+            f"scan up to N chunks from the start of the stream before giving up (default: {DsvHelper.MAX_DETECT_CHUNKS})."
+        ),
+    )
+
+    parser.add_argument(
+        "--skip-empty-lines",
+        action="store_true",
+        help="Have the underlying reader skip raw empty logical lines (line.strip() == '') before parsing",
+    )
 
     parser.add_argument(
         "--output-format",
@@ -141,17 +205,56 @@ def run_cli() -> int:
         print(f"Error: '{args.file_path}' is not a file.", file=sys.stderr)
         return 1
 
+    # Build base config either from YAML file (if provided) or from CLI args
+    base_params = {}
+    if args.config:
+        try:
+            import yaml  # type: ignore
+
+            cfg_path = Path(args.config)
+            if not cfg_path.exists():
+                print(f"Error: Config file '{args.config}' not found.", file=sys.stderr)
+                return 1
+
+            with cfg_path.open("r", encoding="utf-8") as fh:
+                file_cfg = yaml.safe_load(fh) or {}
+
+            if not isinstance(file_cfg, dict):
+                print(f"Error: Config file '{args.config}' must contain a mapping/dictionary.", file=sys.stderr)
+                return 1
+
+            base_params.update(file_cfg)
+        except Exception as e:
+            print(f"Error reading config file '{args.config}': {e}", file=sys.stderr)
+            return 1
+
+    # CLI args override YAML values when provided. Build the parameter map
+    cli_params = {
+        "delimiter": args.delimiter,
+        "strip": not args.no_strip,
+        "bookend": args.bookend,
+        "bookend_strip": not args.no_bookend_strip,
+        "encoding": args.encoding,
+        "skip_header_rows": args.skip_header,
+        "skip_footer_rows": args.skip_footer,
+        "chunk_size": args.chunk_size,
+        "detect_columns": args.detect_columns,
+        "raise_on_missing_columns": args.raise_on_missing_columns,
+        "raise_on_extra_columns": args.raise_on_extra_columns,
+        "max_detect_chunks": args.max_detect_chunks,
+        "skip_empty_lines": args.skip_empty_lines,
+    }
+
+    # Merge: start from file (if any), then overlay CLI-provided values
+    merged = {**base_params, **{k: v for k, v in cli_params.items() if v is not None}}
+
     # Create configuration and Dsv instance for parsing
-    config = DsvConfig(
-        delimiter=args.delimiter,
-        strip=not args.no_strip,
-        bookend=args.bookend,
-        bookend_strip=not args.no_bookend_strip,
-        encoding=args.encoding,
-        skip_header_rows=args.skip_header,
-        skip_footer_rows=args.skip_footer,
-        chunk_size=args.chunk_size,
-    )
+    try:
+        config = DsvConfig.from_params(**merged)
+    except Exception as e:
+        print(f"Error building configuration: {e}", file=sys.stderr)
+        return 1
+    dsv = Dsv(config)
     dsv = Dsv(config)
 
     # Parse the file
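
The merge gives CLI values precedence over YAML, but only values that are not None survive the overlay. A small worked example of that rule, with invented values:

    # YAML supplies the base; argparse values overlay it when not None.
    base_params = {"delimiter": ",", "encoding": "utf-16"}   # from --config
    cli_params = {"delimiter": ";", "encoding": None}        # from argparse
    merged = {**base_params, **{k: v for k, v in cli_params.items() if v is not None}}
    assert merged == {"delimiter": ";", "encoding": "utf-16"}

One consequence worth noting: store_true flags and the computed booleans (`strip`, `bookend_strip`) are False rather than None when unset, so they always overlay the YAML value.
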
@@ -161,18 +264,26 @@ def run_cli() -> int:
             chunk_count = 0
             total_rows = 0
 
-            for chunk in dsv.parse_stream(file_path):
-                chunk_count += 1
-                total_rows += len(chunk)
-                if args.output_format == "json":
-                    print(json.dumps(chunk, ensure_ascii=False))
-                elif args.output_format == "ndjson":
-                    for row in chunk:
-                        print(json.dumps(row, ensure_ascii=False))
-                else:
-                    print(f"Chunk {chunk_count}: {len(chunk)} rows")
-                    print_results(chunk, args.delimiter)
-                    print()
+            try:
+                for chunk in dsv.parse_file_stream(file_path):
+                    chunk_count += 1
+                    total_rows += len(chunk)
+
+                    if args.output_format == "json":
+                        print(json.dumps(chunk, ensure_ascii=False))
+                    elif args.output_format == "ndjson":
+                        for row in chunk:
+                            print(json.dumps(row, ensure_ascii=False))
+                    else:
+                        print(f"Chunk {chunk_count}: {len(chunk)} rows")
+                        print_results(chunk, args.delimiter)
+                        print()
+            except Exception as e:
+                print(f"Error during streaming: {e}", file=sys.stderr)
+                import traceback
+
+                traceback.print_exc(file=sys.stderr)
+                return 1
 
         if args.output_format not in ["json", "ndjson"]:
             print(f"Total: {total_rows} rows in {chunk_count} chunks")
splurge_dsv/dsv.py CHANGED
@@ -9,7 +9,7 @@ files, and streaming large inputs.
 
 Public API:
 - DsvConfig: Configuration dataclass for parsing behavior.
-- Dsv: Parser instance that performs parse/parse_file/parse_stream.
+- Dsv: Parser instance that performs parse/parse_file/parse_file_stream.
 
 License: MIT
 
@@ -20,6 +20,7 @@ Copyright (c) 2025 Jim Schilling
 from collections.abc import Iterator
 from dataclasses import dataclass, fields
 from os import PathLike
+from pathlib import Path
 
 # Local imports
 from splurge_dsv.dsv_helper import DsvHelper
@@ -42,6 +43,10 @@ class DsvConfig:
         skip_header_rows: Number of header rows to skip when reading files.
         skip_footer_rows: Number of footer rows to skip when reading files.
         chunk_size: Size of chunks for streaming operations.
+        detect_columns: Whether to auto-detect column count from data.
+        raise_on_missing_columns: If True, raise an error if rows have fewer columns than detected
+        raise_on_extra_columns: If True, raise an error if rows have more columns than detected
+        max_detect_chunks: Maximum number of chunks to scan for column detection
 
     Raises:
         SplurgeDsvParameterError: If delimiter is empty, chunk_size is too
@@ -55,7 +60,16 @@ class DsvConfig:
     encoding: str = "utf-8"
     skip_header_rows: int = 0
     skip_footer_rows: int = 0
-    chunk_size: int = 500
+    # When True, instruct the underlying SafeTextFileReader to remove raw
+    # empty logical lines (where line.strip() == "") before returning
+    # content. Defaults to False to preserve historical behavior.
+    skip_empty_lines: bool = False
+    chunk_size: int = DsvHelper.DEFAULT_MIN_CHUNK_SIZE
+    # Column normalization and detection flags
+    detect_columns: bool = False
+    raise_on_missing_columns: bool = False
+    raise_on_extra_columns: bool = False
+    max_detect_chunks: int = DsvHelper.MAX_DETECT_CHUNKS
 
     def __post_init__(self) -> None:
         """Validate configuration after initialization.
@@ -136,6 +150,53 @@ class DsvConfig:
         filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_fields}
         return cls(**filtered_kwargs)
 
+    @classmethod
+    def from_file(cls, file_path: PathLike[str] | Path | str) -> "DsvConfig":
+        """
+        Load a YAML configuration file and return a DsvConfig instance.
+
+        The YAML should contain a mapping whose keys correspond to
+        DsvConfig field names (for example: delimiter, strip, bookend,
+        encoding, skip_header_rows, etc.). Unknown keys are ignored.
+
+        Args:
+            file_path: Path to the YAML configuration file.
+
+        Returns:
+            DsvConfig: Configuration object built from the YAML file.
+
+        Raises:
+            SplurgeDsvParameterError: If the file cannot be read, parsed,
+                or does not contain a mapping at the top level.
+        """
+        try:
+            import yaml  # type: ignore
+        except Exception as e:  # pragma: no cover - dependency issues surfaced elsewhere
+            raise SplurgeDsvParameterError(f"PyYAML is required to load config files: {e}") from e
+
+        p = Path(file_path)
+        if not p.exists():
+            raise SplurgeDsvParameterError(f"Config file '{file_path}' not found")
+
+        try:
+            with p.open("r", encoding="utf-8") as fh:
+                data = yaml.safe_load(fh) or {}
+        except Exception as e:
+            raise SplurgeDsvParameterError(f"Failed to read or parse config file '{file_path}': {e}") from e
+
+        if not isinstance(data, dict):
+            raise SplurgeDsvParameterError("Config file must contain a top-level mapping/dictionary of options")
+
+        # Filter and construct via existing from_params helper
+        valid_fields = {f.name for f in fields(cls)}
+        filtered = {k: v for k, v in data.items() if k in valid_fields}
+
+        # Ensure required values are present in the config (delimiter is required)
+        if "delimiter" not in filtered:
+            raise SplurgeDsvParameterError("Config file must include the required 'delimiter' option")
+
+        return cls.from_params(**filtered)
+
 
 class Dsv:
     """Parser class that binds a :class:`DsvConfig` to parsing operations.
@@ -172,6 +233,7 @@ class Dsv:
 
         Raises:
             SplurgeDsvParameterError: If the configured delimiter is invalid.
+            SplurgeDsvColumnMismatchError: If column validation fails.
         """
         return DsvHelper.parse(
             content,
@@ -179,6 +241,9 @@ class Dsv:
             strip=self.config.strip,
             bookend=self.config.bookend,
             bookend_strip=self.config.bookend_strip,
+            normalize_columns=0,
+            raise_on_missing_columns=self.config.raise_on_missing_columns,
+            raise_on_extra_columns=self.config.raise_on_extra_columns,
         )
 
     def parses(self, content: list[str]) -> list[list[str]]:
@@ -191,6 +256,10 @@ class Dsv:
         Returns:
             List of lists of parsed strings
 
+        Raises:
+            SplurgeDsvParameterError: If the configured delimiter is invalid.
+            SplurgeDsvColumnMismatchError: If column validation fails.
+
         Example:
             >>> parser = Dsv(DsvConfig(delimiter=","))
             >>> parser.parses(["a,b", "c,d"])
@@ -202,9 +271,13 @@ class Dsv:
             strip=self.config.strip,
             bookend=self.config.bookend,
             bookend_strip=self.config.bookend_strip,
+            normalize_columns=0,
+            raise_on_missing_columns=self.config.raise_on_missing_columns,
+            raise_on_extra_columns=self.config.raise_on_extra_columns,
+            detect_columns=self.config.detect_columns,
         )
 
-    def parse_file(self, file_path: PathLike[str] | str) -> list[list[str]]:
+    def parse_file(self, file_path: PathLike[str] | Path | str) -> list[list[str]]:
         """Parse a DSV file and return all rows as lists of strings.
 
         Args:
@@ -214,10 +287,13 @@ class Dsv:
             A list of rows, where each row is a list of string tokens.
 
         Raises:
+            SplurgeDsvPathValidationError: If the file path is invalid.
             SplurgeDsvFileNotFoundError: If the file cannot be found.
             SplurgeDsvFilePermissionError: If the file cannot be read.
-            SplurgeDsvFileEncodingError: If the file cannot be decoded with
-                the configured encoding.
+            SplurgeDsvFileDecodingError: If the file cannot be decoded with the configured encoding.
+            SplurgeDsvColumnMismatchError: If column validation fails.
+            SplurgeDsvParameterError: If the configured delimiter is invalid.
+            SplurgeDsvError: For other unexpected errors.
         """
         return DsvHelper.parse_file(
             file_path,
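
With the expanded Raises list, strict column validation becomes catchable by type. A sketch under the assumption that `SplurgeDsvColumnMismatchError` is raised when a row violates the configured expectations (names are those exported in `__init__.py` above; the file path is illustrative):

    from splurge_dsv import Dsv, DsvConfig, SplurgeDsvColumnMismatchError

    config = DsvConfig(delimiter=",", detect_columns=True, raise_on_extra_columns=True)
    dsv = Dsv(config)
    try:
        rows = dsv.parse_file("data.csv")
    except SplurgeDsvColumnMismatchError as exc:
        print(f"Ragged row rejected: {exc}")
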
@@ -227,10 +303,14 @@ class Dsv:
             bookend_strip=self.config.bookend_strip,
             encoding=self.config.encoding,
             skip_header_rows=self.config.skip_header_rows,
+            skip_empty_lines=self.config.skip_empty_lines,
             skip_footer_rows=self.config.skip_footer_rows,
+            detect_columns=self.config.detect_columns,
+            raise_on_missing_columns=self.config.raise_on_missing_columns,
+            raise_on_extra_columns=self.config.raise_on_extra_columns,
         )
 
-    def parse_stream(self, file_path: PathLike[str] | str) -> Iterator[list[list[str]]]:
+    def parse_file_stream(self, file_path: PathLike[str] | Path | str) -> Iterator[list[list[str]]]:
         """Stream-parse a DSV file, yielding chunks of parsed rows.
 
         The method yields lists of parsed rows (each row itself is a list of
@@ -242,8 +322,17 @@ class Dsv:
 
         Yields:
             Lists of parsed rows, each list containing up to ``chunk_size`` rows.
+
+        Raises:
+            SplurgeDsvPathValidationError: If the file path is invalid.
+            SplurgeDsvFileNotFoundError: If the file cannot be found.
+            SplurgeDsvFilePermissionError: If the file cannot be read.
+            SplurgeDsvFileDecodingError: If the file cannot be decoded with the configured encoding.
+            SplurgeDsvColumnMismatchError: If column validation fails.
+            SplurgeDsvParameterError: If the configured delimiter is invalid.
+            SplurgeDsvError: For other unexpected errors.
         """
-        return DsvHelper.parse_stream(
+        return DsvHelper.parse_file_stream(
             file_path,
             delimiter=self.config.delimiter,
             strip=self.config.strip,
@@ -251,6 +340,11 @@ class Dsv:
             bookend_strip=self.config.bookend_strip,
             encoding=self.config.encoding,
             skip_header_rows=self.config.skip_header_rows,
+            skip_empty_lines=self.config.skip_empty_lines,
             skip_footer_rows=self.config.skip_footer_rows,
+            detect_columns=self.config.detect_columns,
+            raise_on_missing_columns=self.config.raise_on_missing_columns,
+            raise_on_extra_columns=self.config.raise_on_extra_columns,
             chunk_size=self.config.chunk_size,
+            max_detect_chunks=self.config.max_detect_chunks,
         )
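
Callers of the old `parse_stream` must rename to `parse_file_stream`; the call shape is otherwise unchanged. A minimal streaming sketch with the new flags (the file path is illustrative):

    from splurge_dsv import Dsv, DsvConfig

    # detect_columns infers the expected width from the first non-blank logical
    # row; skip_empty_lines drops raw blank lines before parsing.
    config = DsvConfig(delimiter=",", detect_columns=True, skip_empty_lines=True)
    dsv = Dsv(config)

    total_rows = 0
    for chunk in dsv.parse_file_stream("large.csv"):  # was: dsv.parse_stream(...)
        total_rows += len(chunk)
    print(f"{total_rows} rows")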