splurge-dsv 2025.2.0__py3-none-any.whl → 2025.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- splurge_dsv/__init__.py +16 -5
- splurge_dsv/cli.py +137 -26
- splurge_dsv/dsv.py +101 -7
- splurge_dsv/dsv_helper.py +417 -43
- splurge_dsv/exceptions.py +22 -1
- splurge_dsv/string_tokenizer.py +7 -1
- {splurge_dsv-2025.2.0.dist-info → splurge_dsv-2025.3.1.dist-info}/METADATA +78 -5
- splurge_dsv-2025.3.1.dist-info/RECORD +13 -0
- splurge_dsv/path_validator.py +0 -298
- splurge_dsv/safe_text_file_reader.py +0 -177
- splurge_dsv/safe_text_file_writer.py +0 -136
- splurge_dsv/text_file_helper.py +0 -240
- splurge_dsv-2025.2.0.dist-info/RECORD +0 -17
- {splurge_dsv-2025.2.0.dist-info → splurge_dsv-2025.3.1.dist-info}/WHEEL +0 -0
- {splurge_dsv-2025.2.0.dist-info → splurge_dsv-2025.3.1.dist-info}/entry_points.txt +0 -0
- {splurge_dsv-2025.2.0.dist-info → splurge_dsv-2025.3.1.dist-info}/licenses/LICENSE +0 -0
- {splurge_dsv-2025.2.0.dist-info → splurge_dsv-2025.3.1.dist-info}/top_level.txt +0 -0
splurge_dsv/__init__.py
CHANGED
@@ -13,9 +13,20 @@ Copyright (c) 2025 Jim Schilling
|
|
13
13
|
# test cases may remove the process working directory which causes calls to
|
14
14
|
# os.getcwd() to raise FileNotFoundError later during test execution. Guard
|
15
15
|
# against that here by switching to this package directory when cwd is missing.
|
16
|
+
# Ensure the required external implementation is available on import so the
|
17
|
+
# rest of the package can rely on its APIs. Fail fast with a helpful message
|
18
|
+
# instructing the user to install the package if it's missing.
|
19
|
+
import importlib as _importlib
|
16
20
|
import os
|
17
21
|
from pathlib import Path as _Path
|
18
22
|
|
23
|
+
try: # pragma: no cover - import-time guard
|
24
|
+
_importlib.import_module("splurge_safe_io")
|
25
|
+
except Exception as e:
|
26
|
+
raise ImportError(
|
27
|
+
"Missing required dependency 'splurge-safe-io'. Please install it: `pip install splurge-safe-io`"
|
28
|
+
) from e
|
29
|
+
|
19
30
|
try:
|
20
31
|
try:
|
21
32
|
# os.getcwd() can raise FileNotFoundError in CI/runner environments
|
@@ -35,11 +46,13 @@ except Exception:
|
|
35
46
|
from splurge_dsv.dsv import Dsv, DsvConfig
|
36
47
|
from splurge_dsv.dsv_helper import DsvHelper
|
37
48
|
from splurge_dsv.exceptions import (
|
49
|
+
SplurgeDsvColumnMismatchError,
|
38
50
|
SplurgeDsvConfigurationError,
|
39
51
|
SplurgeDsvDataProcessingError,
|
40
52
|
# canonical SplurgeDsv* exception names
|
41
53
|
SplurgeDsvError,
|
42
54
|
SplurgeDsvFileEncodingError,
|
55
|
+
SplurgeDsvFileExistsError,
|
43
56
|
SplurgeDsvFileNotFoundError,
|
44
57
|
SplurgeDsvFileOperationError,
|
45
58
|
SplurgeDsvFilePermissionError,
|
@@ -56,11 +69,9 @@ from splurge_dsv.exceptions import (
|
|
56
69
|
SplurgeDsvTypeConversionError,
|
57
70
|
SplurgeDsvValidationError,
|
58
71
|
)
|
59
|
-
from splurge_dsv.path_validator import PathValidator
|
60
72
|
from splurge_dsv.string_tokenizer import StringTokenizer
|
61
|
-
from splurge_dsv.text_file_helper import TextFileHelper
|
62
73
|
|
63
|
-
__version__ = "2025.2.0"
|
74
|
+
__version__ = "2025.3.1"
|
64
75
|
__author__ = "Jim Schilling"
|
65
76
|
__license__ = "MIT"
|
66
77
|
|
@@ -79,6 +90,7 @@ __all__ = [
|
|
79
90
|
"SplurgeDsvPathValidationError",
|
80
91
|
"SplurgeDsvDataProcessingError",
|
81
92
|
"SplurgeDsvParsingError",
|
93
|
+
"SplurgeDsvColumnMismatchError",
|
82
94
|
"SplurgeDsvTypeConversionError",
|
83
95
|
"SplurgeDsvStreamingError",
|
84
96
|
"SplurgeDsvConfigurationError",
|
@@ -89,8 +101,7 @@ __all__ = [
|
|
89
101
|
"SplurgeDsvParameterError",
|
90
102
|
"SplurgeDsvRangeError",
|
91
103
|
"SplurgeDsvFormatError",
|
104
|
+
"SplurgeDsvFileExistsError",
|
92
105
|
# Utility classes
|
93
106
|
"StringTokenizer",
|
94
|
-
"TextFileHelper",
|
95
|
-
"PathValidator",
|
96
107
|
]
|
splurge_dsv/cli.py
CHANGED
@@ -23,6 +23,7 @@ from pathlib import Path
|
|
23
23
|
# Local imports
|
24
24
|
from splurge_dsv import __version__
|
25
25
|
from splurge_dsv.dsv import Dsv, DsvConfig
|
26
|
+
from splurge_dsv.dsv_helper import DsvHelper
|
26
27
|
from splurge_dsv.exceptions import SplurgeDsvError
|
27
28
|
|
28
29
|
|
@@ -39,14 +40,31 @@ def parse_arguments() -> argparse.Namespace:
|
|
39
40
|
epilog="""
|
40
41
|
Examples:
|
41
42
|
python -m splurge_dsv data.csv --delimiter ,
|
42
|
-
python -m splurge_dsv data.tsv --delimiter "
|
43
|
+
python -m splurge_dsv data.tsv --delimiter "\t"
|
43
44
|
python -m splurge_dsv data.txt --delimiter "|" --bookend '"'
|
44
|
-
|
45
|
+
# Auto-detect the expected column count and normalize rows
|
46
|
+
python -m splurge_dsv data.csv --delimiter , --detect-columns --max-detect-chunks 5
|
47
|
+
# Stream a large file while attempting to detect the column count from the first non-blank logical row
|
48
|
+
python -m splurge_dsv large.csv --delimiter , --stream --detect-columns --max-detect-chunks 10
|
49
|
+
""",
|
45
50
|
)
|
46
51
|
|
47
52
|
parser.add_argument("file_path", type=str, help="Path to the DSV file to parse")
|
48
53
|
|
49
|
-
parser.add_argument(
|
54
|
+
parser.add_argument(
|
55
|
+
"--config",
|
56
|
+
"-c",
|
57
|
+
dest="config",
|
58
|
+
type=str,
|
59
|
+
help="Path to a YAML config file that mirrors CLI options (values overridden by CLI args)",
|
60
|
+
)
|
61
|
+
|
62
|
+
parser.add_argument(
|
63
|
+
"--delimiter",
|
64
|
+
"-d",
|
65
|
+
type=str,
|
66
|
+
help="Delimiter character to use for parsing (may also be provided via --config)",
|
67
|
+
)
|
50
68
|
|
51
69
|
parser.add_argument("--bookend", "-b", type=str, help="Bookend character for text fields (e.g., '\"')")
|
52
70
|
|
@@ -64,7 +82,53 @@ Examples:
|
|
64
82
|
"--stream", "-s", action="store_true", help="Stream the file in chunks instead of loading entirely into memory"
|
65
83
|
)
|
66
84
|
|
67
|
-
parser.add_argument(
|
85
|
+
parser.add_argument(
|
86
|
+
"--detect-columns",
|
87
|
+
action="store_true",
|
88
|
+
help=(
|
89
|
+
"Auto-detect the expected column count from the first non-blank logical row "
|
90
|
+
"and normalize subsequent rows to that count. For streamed parsing, the "
|
91
|
+
"detector may scan up to --max-detect-chunks chunks from the start of the file."
|
92
|
+
),
|
93
|
+
)
|
94
|
+
|
95
|
+
parser.add_argument(
|
96
|
+
"--raise-on-missing-columns",
|
97
|
+
action="store_true",
|
98
|
+
help="Raise an error if a row has fewer columns than the detected/expected count",
|
99
|
+
)
|
100
|
+
|
101
|
+
parser.add_argument(
|
102
|
+
"--raise-on-extra-columns",
|
103
|
+
action="store_true",
|
104
|
+
help="Raise an error if a row has more columns than the detected/expected count",
|
105
|
+
)
|
106
|
+
|
107
|
+
parser.add_argument(
|
108
|
+
"--chunk-size",
|
109
|
+
type=int,
|
110
|
+
default=DsvHelper.DEFAULT_CHUNK_SIZE,
|
111
|
+
help=(
|
112
|
+
f"Chunk size for streaming (minimum: {DsvHelper.DEFAULT_MIN_CHUNK_SIZE}, "
|
113
|
+
f"default: {DsvHelper.DEFAULT_CHUNK_SIZE})"
|
114
|
+
),
|
115
|
+
)
|
116
|
+
|
117
|
+
parser.add_argument(
|
118
|
+
"--max-detect-chunks",
|
119
|
+
type=int,
|
120
|
+
default=DsvHelper.MAX_DETECT_CHUNKS,
|
121
|
+
help=(
|
122
|
+
"When detecting columns while streaming (use --detect-normalize-columns), "
|
123
|
+
f"scan up to N chunks from the start of the stream before giving up (default: {DsvHelper.MAX_DETECT_CHUNKS})."
|
124
|
+
),
|
125
|
+
)
|
126
|
+
|
127
|
+
parser.add_argument(
|
128
|
+
"--skip-empty-lines",
|
129
|
+
action="store_true",
|
130
|
+
help="Have the underlying reader skip raw empty logical lines (line.strip() == '') before parsing",
|
131
|
+
)
|
68
132
|
|
69
133
|
parser.add_argument(
|
70
134
|
"--output-format",
|
@@ -141,17 +205,56 @@ def run_cli() -> int:
|
|
141
205
|
print(f"Error: '{args.file_path}' is not a file.", file=sys.stderr)
|
142
206
|
return 1
|
143
207
|
|
208
|
+
# Build base config either from YAML file (if provided) or from CLI args
|
209
|
+
base_params = {}
|
210
|
+
if args.config:
|
211
|
+
try:
|
212
|
+
import yaml # type: ignore
|
213
|
+
|
214
|
+
cfg_path = Path(args.config)
|
215
|
+
if not cfg_path.exists():
|
216
|
+
print(f"Error: Config file '{args.config}' not found.", file=sys.stderr)
|
217
|
+
return 1
|
218
|
+
|
219
|
+
with cfg_path.open("r", encoding="utf-8") as fh:
|
220
|
+
file_cfg = yaml.safe_load(fh) or {}
|
221
|
+
|
222
|
+
if not isinstance(file_cfg, dict):
|
223
|
+
print(f"Error: Config file '{args.config}' must contain a mapping/dictionary.", file=sys.stderr)
|
224
|
+
return 1
|
225
|
+
|
226
|
+
base_params.update(file_cfg)
|
227
|
+
except Exception as e:
|
228
|
+
print(f"Error reading config file '{args.config}': {e}", file=sys.stderr)
|
229
|
+
return 1
|
230
|
+
|
231
|
+
# CLI args override YAML values when provided. Build the parameter map
|
232
|
+
cli_params = {
|
233
|
+
"delimiter": args.delimiter,
|
234
|
+
"strip": not args.no_strip,
|
235
|
+
"bookend": args.bookend,
|
236
|
+
"bookend_strip": not args.no_bookend_strip,
|
237
|
+
"encoding": args.encoding,
|
238
|
+
"skip_header_rows": args.skip_header,
|
239
|
+
"skip_footer_rows": args.skip_footer,
|
240
|
+
"chunk_size": args.chunk_size,
|
241
|
+
"detect_columns": args.detect_columns,
|
242
|
+
"raise_on_missing_columns": args.raise_on_missing_columns,
|
243
|
+
"raise_on_extra_columns": args.raise_on_extra_columns,
|
244
|
+
"max_detect_chunks": args.max_detect_chunks,
|
245
|
+
"skip_empty_lines": args.skip_empty_lines,
|
246
|
+
}
|
247
|
+
|
248
|
+
# Merge: start from file (if any), then overlay CLI-provided values
|
249
|
+
merged = {**base_params, **{k: v for k, v in cli_params.items() if v is not None}}
|
250
|
+
|
144
251
|
# Create configuration and Dsv instance for parsing
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
skip_header_rows=args.skip_header,
|
152
|
-
skip_footer_rows=args.skip_footer,
|
153
|
-
chunk_size=args.chunk_size,
|
154
|
-
)
|
252
|
+
try:
|
253
|
+
config = DsvConfig.from_params(**merged)
|
254
|
+
except Exception as e:
|
255
|
+
print(f"Error building configuration: {e}", file=sys.stderr)
|
256
|
+
return 1
|
257
|
+
dsv = Dsv(config)
|
155
258
|
dsv = Dsv(config)
|
156
259
|
|
157
260
|
# Parse the file
|
@@ -161,18 +264,26 @@ def run_cli() -> int:
|
|
161
264
|
chunk_count = 0
|
162
265
|
total_rows = 0
|
163
266
|
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
267
|
+
try:
|
268
|
+
for chunk in dsv.parse_file_stream(file_path):
|
269
|
+
chunk_count += 1
|
270
|
+
total_rows += len(chunk)
|
271
|
+
|
272
|
+
if args.output_format == "json":
|
273
|
+
print(json.dumps(chunk, ensure_ascii=False))
|
274
|
+
elif args.output_format == "ndjson":
|
275
|
+
for row in chunk:
|
276
|
+
print(json.dumps(row, ensure_ascii=False))
|
277
|
+
else:
|
278
|
+
print(f"Chunk {chunk_count}: {len(chunk)} rows")
|
279
|
+
print_results(chunk, args.delimiter)
|
280
|
+
print()
|
281
|
+
except Exception as e:
|
282
|
+
print(f"Error during streaming: {e}", file=sys.stderr)
|
283
|
+
import traceback
|
284
|
+
|
285
|
+
traceback.print_exc(file=sys.stderr)
|
286
|
+
return 1
|
176
287
|
|
177
288
|
if args.output_format not in ["json", "ndjson"]:
|
178
289
|
print(f"Total: {total_rows} rows in {chunk_count} chunks")
|
splurge_dsv/dsv.py
CHANGED
@@ -9,7 +9,7 @@ files, and streaming large inputs.
|
|
9
9
|
|
10
10
|
Public API:
|
11
11
|
- DsvConfig: Configuration dataclass for parsing behavior.
|
12
|
-
- Dsv: Parser instance that performs parse/parse_file/
|
12
|
+
- Dsv: Parser instance that performs parse/parse_file/parse_file_stream.
|
13
13
|
|
14
14
|
License: MIT
|
15
15
|
|
@@ -20,6 +20,7 @@ Copyright (c) 2025 Jim Schilling
|
|
20
20
|
from collections.abc import Iterator
|
21
21
|
from dataclasses import dataclass, fields
|
22
22
|
from os import PathLike
|
23
|
+
from pathlib import Path
|
23
24
|
|
24
25
|
# Local imports
|
25
26
|
from splurge_dsv.dsv_helper import DsvHelper
|
@@ -42,6 +43,10 @@ class DsvConfig:
|
|
42
43
|
skip_header_rows: Number of header rows to skip when reading files.
|
43
44
|
skip_footer_rows: Number of footer rows to skip when reading files.
|
44
45
|
chunk_size: Size of chunks for streaming operations.
|
46
|
+
detect_columns: Whether to auto-detect column count from data.
|
47
|
+
raise_on_missing_columns: If True, raise an error if rows have fewer columns than detected
|
48
|
+
raise_on_extra_columns: If True, raise an error if rows have more columns than detected
|
49
|
+
max_detect_chunks: Maximum number of chunks to scan for column detection
|
45
50
|
|
46
51
|
Raises:
|
47
52
|
SplurgeDsvParameterError: If delimiter is empty, chunk_size is too
|
@@ -55,7 +60,16 @@ class DsvConfig:
|
|
55
60
|
encoding: str = "utf-8"
|
56
61
|
skip_header_rows: int = 0
|
57
62
|
skip_footer_rows: int = 0
|
58
|
-
|
63
|
+
# When True, instruct the underlying SafeTextFileReader to remove raw
|
64
|
+
# empty logical lines (where line.strip() == "") before returning
|
65
|
+
# content. Defaults to False to preserve historical behavior.
|
66
|
+
skip_empty_lines: bool = False
|
67
|
+
chunk_size: int = DsvHelper.DEFAULT_MIN_CHUNK_SIZE
|
68
|
+
# Column normalization and detection flags
|
69
|
+
detect_columns: bool = False
|
70
|
+
raise_on_missing_columns: bool = False
|
71
|
+
raise_on_extra_columns: bool = False
|
72
|
+
max_detect_chunks: int = DsvHelper.MAX_DETECT_CHUNKS
|
59
73
|
|
60
74
|
def __post_init__(self) -> None:
|
61
75
|
"""Validate configuration after initialization.
|
@@ -136,6 +150,53 @@ class DsvConfig:
|
|
136
150
|
filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_fields}
|
137
151
|
return cls(**filtered_kwargs)
|
138
152
|
|
153
|
+
@classmethod
|
154
|
+
def from_file(cls, file_path: PathLike[str] | Path | str) -> "DsvConfig":
|
155
|
+
"""
|
156
|
+
Load a YAML configuration file and return a DsvConfig instance.
|
157
|
+
|
158
|
+
The YAML should contain a mapping whose keys correspond to
|
159
|
+
DsvConfig field names (for example: delimiter, strip, bookend,
|
160
|
+
encoding, skip_header_rows, etc.). Unknown keys are ignored.
|
161
|
+
|
162
|
+
Args:
|
163
|
+
file_path: Path to the YAML configuration file.
|
164
|
+
|
165
|
+
Returns:
|
166
|
+
DsvConfig: Configuration object built from the YAML file.
|
167
|
+
|
168
|
+
Raises:
|
169
|
+
SplurgeDsvParameterError: If the file cannot be read, parsed,
|
170
|
+
or does not contain a mapping at the top level.
|
171
|
+
"""
|
172
|
+
try:
|
173
|
+
import yaml # type: ignore
|
174
|
+
except Exception as e: # pragma: no cover - dependency issues surfaced elsewhere
|
175
|
+
raise SplurgeDsvParameterError(f"PyYAML is required to load config files: {e}") from e
|
176
|
+
|
177
|
+
p = Path(file_path)
|
178
|
+
if not p.exists():
|
179
|
+
raise SplurgeDsvParameterError(f"Config file '{file_path}' not found")
|
180
|
+
|
181
|
+
try:
|
182
|
+
with p.open("r", encoding="utf-8") as fh:
|
183
|
+
data = yaml.safe_load(fh) or {}
|
184
|
+
except Exception as e:
|
185
|
+
raise SplurgeDsvParameterError(f"Failed to read or parse config file '{file_path}': {e}") from e
|
186
|
+
|
187
|
+
if not isinstance(data, dict):
|
188
|
+
raise SplurgeDsvParameterError("Config file must contain a top-level mapping/dictionary of options")
|
189
|
+
|
190
|
+
# Filter and construct via existing from_params helper
|
191
|
+
valid_fields = {f.name for f in fields(cls)}
|
192
|
+
filtered = {k: v for k, v in data.items() if k in valid_fields}
|
193
|
+
|
194
|
+
# Ensure required values are present in the config (delimiter is required)
|
195
|
+
if "delimiter" not in filtered:
|
196
|
+
raise SplurgeDsvParameterError("Config file must include the required 'delimiter' option")
|
197
|
+
|
198
|
+
return cls.from_params(**filtered)
|
199
|
+
|
139
200
|
|
140
201
|
class Dsv:
|
141
202
|
"""Parser class that binds a :class:`DsvConfig` to parsing operations.
|
@@ -172,6 +233,7 @@ class Dsv:
|
|
172
233
|
|
173
234
|
Raises:
|
174
235
|
SplurgeDsvParameterError: If the configured delimiter is invalid.
|
236
|
+
SplurgeDsvColumnMismatchError: If column validation fails.
|
175
237
|
"""
|
176
238
|
return DsvHelper.parse(
|
177
239
|
content,
|
@@ -179,6 +241,9 @@ class Dsv:
|
|
179
241
|
strip=self.config.strip,
|
180
242
|
bookend=self.config.bookend,
|
181
243
|
bookend_strip=self.config.bookend_strip,
|
244
|
+
normalize_columns=0,
|
245
|
+
raise_on_missing_columns=self.config.raise_on_missing_columns,
|
246
|
+
raise_on_extra_columns=self.config.raise_on_extra_columns,
|
182
247
|
)
|
183
248
|
|
184
249
|
def parses(self, content: list[str]) -> list[list[str]]:
|
@@ -191,6 +256,10 @@ class Dsv:
|
|
191
256
|
Returns:
|
192
257
|
List of lists of parsed strings
|
193
258
|
|
259
|
+
Raises:
|
260
|
+
SplurgeDsvParameterError: If the configured delimiter is invalid.
|
261
|
+
SplurgeDsvColumnMismatchError: If column validation fails.
|
262
|
+
|
194
263
|
Example:
|
195
264
|
>>> parser = Dsv(DsvConfig(delimiter=","))
|
196
265
|
>>> parser.parses(["a,b", "c,d"])
|
@@ -202,9 +271,13 @@ class Dsv:
|
|
202
271
|
strip=self.config.strip,
|
203
272
|
bookend=self.config.bookend,
|
204
273
|
bookend_strip=self.config.bookend_strip,
|
274
|
+
normalize_columns=0,
|
275
|
+
raise_on_missing_columns=self.config.raise_on_missing_columns,
|
276
|
+
raise_on_extra_columns=self.config.raise_on_extra_columns,
|
277
|
+
detect_columns=self.config.detect_columns,
|
205
278
|
)
|
206
279
|
|
207
|
-
def parse_file(self, file_path: PathLike[str] | str) -> list[list[str]]:
|
280
|
+
def parse_file(self, file_path: PathLike[str] | Path | str) -> list[list[str]]:
|
208
281
|
"""Parse a DSV file and return all rows as lists of strings.
|
209
282
|
|
210
283
|
Args:
|
@@ -214,10 +287,13 @@ class Dsv:
|
|
214
287
|
A list of rows, where each row is a list of string tokens.
|
215
288
|
|
216
289
|
Raises:
|
290
|
+
SplurgeDsvPathValidationError: If the file path is invalid.
|
217
291
|
SplurgeDsvFileNotFoundError: If the file cannot be found.
|
218
292
|
SplurgeDsvFilePermissionError: If the file cannot be read.
|
219
|
-
|
220
|
-
|
293
|
+
SplurgeDsvFileDecodingError: If the file cannot be decoded with the configured encoding.
|
294
|
+
SplurgeDsvColumnMismatchError: If column validation fails.
|
295
|
+
SplurgeDsvParameterError: If the configured delimiter is invalid.
|
296
|
+
SplurgeDsvError: For other unexpected errors.
|
221
297
|
"""
|
222
298
|
return DsvHelper.parse_file(
|
223
299
|
file_path,
|
@@ -227,10 +303,14 @@ class Dsv:
|
|
227
303
|
bookend_strip=self.config.bookend_strip,
|
228
304
|
encoding=self.config.encoding,
|
229
305
|
skip_header_rows=self.config.skip_header_rows,
|
306
|
+
skip_empty_lines=self.config.skip_empty_lines,
|
230
307
|
skip_footer_rows=self.config.skip_footer_rows,
|
308
|
+
detect_columns=self.config.detect_columns,
|
309
|
+
raise_on_missing_columns=self.config.raise_on_missing_columns,
|
310
|
+
raise_on_extra_columns=self.config.raise_on_extra_columns,
|
231
311
|
)
|
232
312
|
|
233
|
-
def
|
313
|
+
def parse_file_stream(self, file_path: PathLike[str] | Path | str) -> Iterator[list[list[str]]]:
|
234
314
|
"""Stream-parse a DSV file, yielding chunks of parsed rows.
|
235
315
|
|
236
316
|
The method yields lists of parsed rows (each row itself is a list of
|
@@ -242,8 +322,17 @@ class Dsv:
|
|
242
322
|
|
243
323
|
Yields:
|
244
324
|
Lists of parsed rows, each list containing up to ``chunk_size`` rows.
|
325
|
+
|
326
|
+
Raises:
|
327
|
+
SplurgeDsvPathValidationError: If the file path is invalid.
|
328
|
+
SplurgeDsvFileNotFoundError: If the file cannot be found.
|
329
|
+
SplurgeDsvFilePermissionError: If the file cannot be read.
|
330
|
+
SplurgeDsvFileDecodingError: If the file cannot be decoded with the configured encoding.
|
331
|
+
SplurgeDsvColumnMismatchError: If column validation fails.
|
332
|
+
SplurgeDsvParameterError: If the configured delimiter is invalid.
|
333
|
+
SplurgeDsvError: For other unexpected errors.
|
245
334
|
"""
|
246
|
-
return DsvHelper.
|
335
|
+
return DsvHelper.parse_file_stream(
|
247
336
|
file_path,
|
248
337
|
delimiter=self.config.delimiter,
|
249
338
|
strip=self.config.strip,
|
@@ -251,6 +340,11 @@ class Dsv:
|
|
251
340
|
bookend_strip=self.config.bookend_strip,
|
252
341
|
encoding=self.config.encoding,
|
253
342
|
skip_header_rows=self.config.skip_header_rows,
|
343
|
+
skip_empty_lines=self.config.skip_empty_lines,
|
254
344
|
skip_footer_rows=self.config.skip_footer_rows,
|
345
|
+
detect_columns=self.config.detect_columns,
|
346
|
+
raise_on_missing_columns=self.config.raise_on_missing_columns,
|
347
|
+
raise_on_extra_columns=self.config.raise_on_extra_columns,
|
255
348
|
chunk_size=self.config.chunk_size,
|
349
|
+
max_detect_chunks=self.config.max_detect_chunks,
|
256
350
|
)
|