splurge-dsv 2025.2.1-py3-none-any.whl → 2025.3.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- splurge_dsv/__init__.py +16 -5
- splurge_dsv/cli.py +137 -26
- splurge_dsv/dsv.py +100 -30
- splurge_dsv/dsv_helper.py +415 -90
- splurge_dsv/exceptions.py +22 -1
- splurge_dsv/string_tokenizer.py +7 -1
- {splurge_dsv-2025.2.1.dist-info → splurge_dsv-2025.3.1.dist-info}/METADATA +78 -5
- splurge_dsv-2025.3.1.dist-info/RECORD +13 -0
- splurge_dsv/path_validator.py +0 -298
- splurge_dsv/safe_text_file_reader.py +0 -177
- splurge_dsv/safe_text_file_writer.py +0 -136
- splurge_dsv/text_file_helper.py +0 -240
- splurge_dsv-2025.2.1.dist-info/RECORD +0 -17
- {splurge_dsv-2025.2.1.dist-info → splurge_dsv-2025.3.1.dist-info}/WHEEL +0 -0
- {splurge_dsv-2025.2.1.dist-info → splurge_dsv-2025.3.1.dist-info}/entry_points.txt +0 -0
- {splurge_dsv-2025.2.1.dist-info → splurge_dsv-2025.3.1.dist-info}/licenses/LICENSE +0 -0
- {splurge_dsv-2025.2.1.dist-info → splurge_dsv-2025.3.1.dist-info}/top_level.txt +0 -0
splurge_dsv/__init__.py
CHANGED
@@ -13,9 +13,20 @@ Copyright (c) 2025 Jim Schilling
 # test cases may remove the process working directory which causes calls to
 # os.getcwd() to raise FileNotFoundError later during test execution. Guard
 # against that here by switching to this package directory when cwd is missing.
+# Ensure the required external implementation is available on import so the
+# rest of the package can rely on its APIs. Fail fast with a helpful message
+# instructing the user to install the package if it's missing.
+import importlib as _importlib
 import os
 from pathlib import Path as _Path

+try:  # pragma: no cover - import-time guard
+    _importlib.import_module("splurge_safe_io")
+except Exception as e:
+    raise ImportError(
+        "Missing required dependency 'splurge-safe-io'. Please install it: `pip install splurge-safe-io`"
+    ) from e
+
 try:
     try:
         # os.getcwd() can raise FileNotFoundError in CI/runner environments
@@ -35,11 +46,13 @@ except Exception:
 from splurge_dsv.dsv import Dsv, DsvConfig
 from splurge_dsv.dsv_helper import DsvHelper
 from splurge_dsv.exceptions import (
+    SplurgeDsvColumnMismatchError,
     SplurgeDsvConfigurationError,
     SplurgeDsvDataProcessingError,
     # canonical SplurgeDsv* exception names
     SplurgeDsvError,
     SplurgeDsvFileEncodingError,
+    SplurgeDsvFileExistsError,
     SplurgeDsvFileNotFoundError,
     SplurgeDsvFileOperationError,
     SplurgeDsvFilePermissionError,
@@ -56,11 +69,9 @@ from splurge_dsv.exceptions import (
     SplurgeDsvTypeConversionError,
     SplurgeDsvValidationError,
 )
-from splurge_dsv.path_validator import PathValidator
 from splurge_dsv.string_tokenizer import StringTokenizer
-from splurge_dsv.text_file_helper import TextFileHelper

-__version__ = "2025.2.1"
+__version__ = "2025.3.1"
 __author__ = "Jim Schilling"
 __license__ = "MIT"

@@ -79,6 +90,7 @@ __all__ = [
     "SplurgeDsvPathValidationError",
     "SplurgeDsvDataProcessingError",
     "SplurgeDsvParsingError",
+    "SplurgeDsvColumnMismatchError",
     "SplurgeDsvTypeConversionError",
     "SplurgeDsvStreamingError",
     "SplurgeDsvConfigurationError",
@@ -89,8 +101,7 @@ __all__ = [
     "SplurgeDsvParameterError",
     "SplurgeDsvRangeError",
     "SplurgeDsvFormatError",
+    "SplurgeDsvFileExistsError",
     # Utility classes
     "StringTokenizer",
-    "TextFileHelper",
-    "PathValidator",
 ]
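The practical effect of the new import-time guard: `import splurge_dsv` now fails immediately when the `splurge-safe-io` backend is missing, instead of failing later at the first file operation. A minimal caller-side sketch (the wrapper below is illustrative, not part of the package):

    # Sketch: observing the 2025.3.1 fail-fast import behavior.
    try:
        import splurge_dsv  # raises ImportError if splurge-safe-io is absent
    except ImportError as exc:
        # Message: "Missing required dependency 'splurge-safe-io'. Please install it: ..."
        raise SystemExit(str(exc))

Note also that `TextFileHelper` and `PathValidator` leave the public API (their modules are deleted in this release), so callers importing them from `splurge_dsv` will now get an ImportError; the new `splurge-safe-io` dependency presumably supplies the replacement file I/O and path validation.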
splurge_dsv/cli.py
CHANGED
@@ -23,6 +23,7 @@ from pathlib import Path
 # Local imports
 from splurge_dsv import __version__
 from splurge_dsv.dsv import Dsv, DsvConfig
+from splurge_dsv.dsv_helper import DsvHelper
 from splurge_dsv.exceptions import SplurgeDsvError


@@ -39,14 +40,31 @@ def parse_arguments() -> argparse.Namespace:
         epilog="""
 Examples:
     python -m splurge_dsv data.csv --delimiter ,
-    python -m splurge_dsv data.tsv --delimiter "	"
+    python -m splurge_dsv data.tsv --delimiter "\t"
     python -m splurge_dsv data.txt --delimiter "|" --bookend '"'
-    """,
+    # Auto-detect the expected column count and normalize rows
+    python -m splurge_dsv data.csv --delimiter , --detect-columns --max-detect-chunks 5
+    # Stream a large file while attempting to detect the column count from the first non-blank logical row
+    python -m splurge_dsv large.csv --delimiter , --stream --detect-columns --max-detect-chunks 10
+    """,
     )

     parser.add_argument("file_path", type=str, help="Path to the DSV file to parse")

-    parser.add_argument(
+    parser.add_argument(
+        "--config",
+        "-c",
+        dest="config",
+        type=str,
+        help="Path to a YAML config file that mirrors CLI options (values overridden by CLI args)",
+    )
+
+    parser.add_argument(
+        "--delimiter",
+        "-d",
+        type=str,
+        help="Delimiter character to use for parsing (may also be provided via --config)",
+    )

     parser.add_argument("--bookend", "-b", type=str, help="Bookend character for text fields (e.g., '\"')")

@@ -64,7 +82,53 @@ Examples:
         "--stream", "-s", action="store_true", help="Stream the file in chunks instead of loading entirely into memory"
     )

-    parser.add_argument(
+    parser.add_argument(
+        "--detect-columns",
+        action="store_true",
+        help=(
+            "Auto-detect the expected column count from the first non-blank logical row "
+            "and normalize subsequent rows to that count. For streamed parsing, the "
+            "detector may scan up to --max-detect-chunks chunks from the start of the file."
+        ),
+    )
+
+    parser.add_argument(
+        "--raise-on-missing-columns",
+        action="store_true",
+        help="Raise an error if a row has fewer columns than the detected/expected count",
+    )
+
+    parser.add_argument(
+        "--raise-on-extra-columns",
+        action="store_true",
+        help="Raise an error if a row has more columns than the detected/expected count",
+    )
+
+    parser.add_argument(
+        "--chunk-size",
+        type=int,
+        default=DsvHelper.DEFAULT_CHUNK_SIZE,
+        help=(
+            f"Chunk size for streaming (minimum: {DsvHelper.DEFAULT_MIN_CHUNK_SIZE}, "
+            f"default: {DsvHelper.DEFAULT_CHUNK_SIZE})"
+        ),
+    )
+
+    parser.add_argument(
+        "--max-detect-chunks",
+        type=int,
+        default=DsvHelper.MAX_DETECT_CHUNKS,
+        help=(
+            "When detecting columns while streaming (use --detect-columns), "
+            f"scan up to N chunks from the start of the stream before giving up (default: {DsvHelper.MAX_DETECT_CHUNKS})."
+        ),
+    )
+
+    parser.add_argument(
+        "--skip-empty-lines",
+        action="store_true",
+        help="Have the underlying reader skip raw empty logical lines (line.strip() == '') before parsing",
+    )

     parser.add_argument(
         "--output-format",
@@ -141,17 +205,56 @@ def run_cli() -> int:
         print(f"Error: '{args.file_path}' is not a file.", file=sys.stderr)
         return 1

+    # Build base config either from YAML file (if provided) or from CLI args
+    base_params = {}
+    if args.config:
+        try:
+            import yaml  # type: ignore
+
+            cfg_path = Path(args.config)
+            if not cfg_path.exists():
+                print(f"Error: Config file '{args.config}' not found.", file=sys.stderr)
+                return 1
+
+            with cfg_path.open("r", encoding="utf-8") as fh:
+                file_cfg = yaml.safe_load(fh) or {}
+
+            if not isinstance(file_cfg, dict):
+                print(f"Error: Config file '{args.config}' must contain a mapping/dictionary.", file=sys.stderr)
+                return 1
+
+            base_params.update(file_cfg)
+        except Exception as e:
+            print(f"Error reading config file '{args.config}': {e}", file=sys.stderr)
+            return 1
+
+    # CLI args override YAML values when provided. Build the parameter map
+    cli_params = {
+        "delimiter": args.delimiter,
+        "strip": not args.no_strip,
+        "bookend": args.bookend,
+        "bookend_strip": not args.no_bookend_strip,
+        "encoding": args.encoding,
+        "skip_header_rows": args.skip_header,
+        "skip_footer_rows": args.skip_footer,
+        "chunk_size": args.chunk_size,
+        "detect_columns": args.detect_columns,
+        "raise_on_missing_columns": args.raise_on_missing_columns,
+        "raise_on_extra_columns": args.raise_on_extra_columns,
+        "max_detect_chunks": args.max_detect_chunks,
+        "skip_empty_lines": args.skip_empty_lines,
+    }
+
+    # Merge: start from file (if any), then overlay CLI-provided values
+    merged = {**base_params, **{k: v for k, v in cli_params.items() if v is not None}}
+
     # Create configuration and Dsv instance for parsing
-    config = DsvConfig(
-        delimiter=args.delimiter,
-        strip=not args.no_strip,
-        bookend=args.bookend,
-        bookend_strip=not args.no_bookend_strip,
-        encoding=args.encoding,
-        skip_header_rows=args.skip_header,
-        skip_footer_rows=args.skip_footer,
-        chunk_size=args.chunk_size,
-    )
+    try:
+        config = DsvConfig.from_params(**merged)
+    except Exception as e:
+        print(f"Error building configuration: {e}", file=sys.stderr)
+        return 1
+    dsv = Dsv(config)
     dsv = Dsv(config)

     # Parse the file
@@ -161,18 +264,26 @@ def run_cli() -> int:
         chunk_count = 0
         total_rows = 0

-        for chunk in dsv.parse_file_stream(file_path):
-            chunk_count += 1
-            total_rows += len(chunk)
-            if args.output_format == "json":
-                print(json.dumps(chunk, ensure_ascii=False))
-            elif args.output_format == "ndjson":
-                for row in chunk:
-                    print(json.dumps(row, ensure_ascii=False))
-            else:
-                print(f"Chunk {chunk_count}: {len(chunk)} rows")
-                print_results(chunk, args.delimiter)
-                print()
+        try:
+            for chunk in dsv.parse_file_stream(file_path):
+                chunk_count += 1
+                total_rows += len(chunk)
+
+                if args.output_format == "json":
+                    print(json.dumps(chunk, ensure_ascii=False))
+                elif args.output_format == "ndjson":
+                    for row in chunk:
+                        print(json.dumps(row, ensure_ascii=False))
+                else:
+                    print(f"Chunk {chunk_count}: {len(chunk)} rows")
+                    print_results(chunk, args.delimiter)
+                    print()
+        except Exception as e:
+            print(f"Error during streaming: {e}", file=sys.stderr)
+            import traceback
+
+            traceback.print_exc(file=sys.stderr)
+            return 1

     if args.output_format not in ["json", "ndjson"]:
         print(f"Total: {total_rows} rows in {chunk_count} chunks")
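The new `--config`/`-c` option reads a YAML mapping whose keys mirror the CLI options (and the `DsvConfig` field names), with explicit CLI values taking precedence. A sketch; the file name and values below are illustrative:

    # dsv.yaml (illustrative) -- keys mirror DsvConfig fields
    delimiter: ","
    bookend: '"'
    skip_header_rows: 1
    detect_columns: true

    # CLI values override the file, so this run parses with "|":
    python -m splurge_dsv data.csv --config dsv.yaml --delimiter "|"

One caveat visible in the merge expression above (`{**base_params, **{k: v for k, v in cli_params.items() if v is not None}}`): only None-valued CLI entries defer to the file. Flags declared with `action="store_true"` and options with non-None defaults (such as `--chunk-size`) never produce None, so their CLI defaults appear to override YAML values like `detect_columns: true` even when the flag is not given on the command line.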
splurge_dsv/dsv.py
CHANGED
@@ -9,7 +9,7 @@ files, and streaming large inputs.

 Public API:
     - DsvConfig: Configuration dataclass for parsing behavior.
-    - Dsv: Parser instance that performs parse/parse_file/parse_stream.
+    - Dsv: Parser instance that performs parse/parse_file/parse_file_stream.

 License: MIT

@@ -17,10 +17,10 @@ Copyright (c) 2025 Jim Schilling
 """

 # Standard library imports
-import warnings
 from collections.abc import Iterator
 from dataclasses import dataclass, fields
 from os import PathLike
+from pathlib import Path

 # Local imports
 from splurge_dsv.dsv_helper import DsvHelper
@@ -43,6 +43,10 @@ class DsvConfig:
         skip_header_rows: Number of header rows to skip when reading files.
         skip_footer_rows: Number of footer rows to skip when reading files.
         chunk_size: Size of chunks for streaming operations.
+        detect_columns: Whether to auto-detect column count from data.
+        raise_on_missing_columns: If True, raise an error if rows have fewer columns than detected.
+        raise_on_extra_columns: If True, raise an error if rows have more columns than detected.
+        max_detect_chunks: Maximum number of chunks to scan for column detection.

     Raises:
         SplurgeDsvParameterError: If delimiter is empty, chunk_size is too
@@ -56,7 +60,16 @@ class DsvConfig:
     encoding: str = "utf-8"
     skip_header_rows: int = 0
     skip_footer_rows: int = 0
-    chunk_size: int = DsvHelper.DEFAULT_CHUNK_SIZE
+    # When True, instruct the underlying SafeTextFileReader to remove raw
+    # empty logical lines (where line.strip() == "") before returning
+    # content. Defaults to False to preserve historical behavior.
+    skip_empty_lines: bool = False
+    chunk_size: int = DsvHelper.DEFAULT_MIN_CHUNK_SIZE
+    # Column normalization and detection flags
+    detect_columns: bool = False
+    raise_on_missing_columns: bool = False
+    raise_on_extra_columns: bool = False
+    max_detect_chunks: int = DsvHelper.MAX_DETECT_CHUNKS

     def __post_init__(self) -> None:
         """Validate configuration after initialization.
@@ -137,6 +150,53 @@ class DsvConfig:
         filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_fields}
         return cls(**filtered_kwargs)

+    @classmethod
+    def from_file(cls, file_path: PathLike[str] | Path | str) -> "DsvConfig":
+        """
+        Load a YAML configuration file and return a DsvConfig instance.
+
+        The YAML should contain a mapping whose keys correspond to
+        DsvConfig field names (for example: delimiter, strip, bookend,
+        encoding, skip_header_rows, etc.). Unknown keys are ignored.
+
+        Args:
+            file_path: Path to the YAML configuration file.
+
+        Returns:
+            DsvConfig: Configuration object built from the YAML file.
+
+        Raises:
+            SplurgeDsvParameterError: If the file cannot be read, parsed,
+                or does not contain a mapping at the top level.
+        """
+        try:
+            import yaml  # type: ignore
+        except Exception as e:  # pragma: no cover - dependency issues surfaced elsewhere
+            raise SplurgeDsvParameterError(f"PyYAML is required to load config files: {e}") from e
+
+        p = Path(file_path)
+        if not p.exists():
+            raise SplurgeDsvParameterError(f"Config file '{file_path}' not found")
+
+        try:
+            with p.open("r", encoding="utf-8") as fh:
+                data = yaml.safe_load(fh) or {}
+        except Exception as e:
+            raise SplurgeDsvParameterError(f"Failed to read or parse config file '{file_path}': {e}") from e
+
+        if not isinstance(data, dict):
+            raise SplurgeDsvParameterError("Config file must contain a top-level mapping/dictionary of options")
+
+        # Filter and construct via existing from_params helper
+        valid_fields = {f.name for f in fields(cls)}
+        filtered = {k: v for k, v in data.items() if k in valid_fields}
+
+        # Ensure required values are present in the config (delimiter is required)
+        if "delimiter" not in filtered:
+            raise SplurgeDsvParameterError("Config file must include the required 'delimiter' option")
+
+        return cls.from_params(**filtered)
+

 class Dsv:
     """Parser class that binds a :class:`DsvConfig` to parsing operations.
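A short usage sketch for the `DsvConfig.from_file` classmethod added above (file names are illustrative; the YAML must be a mapping that includes the required `delimiter` key):

    from splurge_dsv import Dsv, DsvConfig

    # Unknown keys in the mapping are ignored; a missing 'delimiter' key
    # raises SplurgeDsvParameterError, per the docstring above.
    config = DsvConfig.from_file("dsv.yaml")
    rows = Dsv(config).parse_file("data.csv")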
@@ -173,6 +233,7 @@ class Dsv:

         Raises:
             SplurgeDsvParameterError: If the configured delimiter is invalid.
+            SplurgeDsvColumnMismatchError: If column validation fails.
         """
         return DsvHelper.parse(
             content,
@@ -180,6 +241,9 @@ class Dsv:
             strip=self.config.strip,
             bookend=self.config.bookend,
             bookend_strip=self.config.bookend_strip,
+            normalize_columns=0,
+            raise_on_missing_columns=self.config.raise_on_missing_columns,
+            raise_on_extra_columns=self.config.raise_on_extra_columns,
         )

     def parses(self, content: list[str]) -> list[list[str]]:
@@ -192,6 +256,10 @@ class Dsv:
         Returns:
             List of lists of parsed strings

+        Raises:
+            SplurgeDsvParameterError: If the configured delimiter is invalid.
+            SplurgeDsvColumnMismatchError: If column validation fails.
+
         Example:
             >>> parser = Dsv(DsvConfig(delimiter=","))
             >>> parser.parses(["a,b", "c,d"])
@@ -203,9 +271,13 @@ class Dsv:
             strip=self.config.strip,
             bookend=self.config.bookend,
             bookend_strip=self.config.bookend_strip,
+            normalize_columns=0,
+            raise_on_missing_columns=self.config.raise_on_missing_columns,
+            raise_on_extra_columns=self.config.raise_on_extra_columns,
+            detect_columns=self.config.detect_columns,
         )

-    def parse_file(self, file_path: PathLike[str] | str) -> list[list[str]]:
+    def parse_file(self, file_path: PathLike[str] | Path | str) -> list[list[str]]:
         """Parse a DSV file and return all rows as lists of strings.

         Args:
@@ -215,10 +287,13 @@ class Dsv:
             A list of rows, where each row is a list of string tokens.

         Raises:
+            SplurgeDsvPathValidationError: If the file path is invalid.
             SplurgeDsvFileNotFoundError: If the file cannot be found.
             SplurgeDsvFilePermissionError: If the file cannot be read.
-
-
+            SplurgeDsvFileDecodingError: If the file cannot be decoded with the configured encoding.
+            SplurgeDsvColumnMismatchError: If column validation fails.
+            SplurgeDsvParameterError: If the configured delimiter is invalid.
+            SplurgeDsvError: For other unexpected errors.
         """
         return DsvHelper.parse_file(
             file_path,
@@ -228,10 +303,14 @@ class Dsv:
             bookend_strip=self.config.bookend_strip,
             encoding=self.config.encoding,
             skip_header_rows=self.config.skip_header_rows,
+            skip_empty_lines=self.config.skip_empty_lines,
             skip_footer_rows=self.config.skip_footer_rows,
+            detect_columns=self.config.detect_columns,
+            raise_on_missing_columns=self.config.raise_on_missing_columns,
+            raise_on_extra_columns=self.config.raise_on_extra_columns,
         )

-    def parse_file_stream(self, file_path: PathLike[str] | str) -> Iterator[list[list[str]]]:
+    def parse_file_stream(self, file_path: PathLike[str] | Path | str) -> Iterator[list[list[str]]]:
         """Stream-parse a DSV file, yielding chunks of parsed rows.

         The method yields lists of parsed rows (each row itself is a list of
@@ -243,6 +322,15 @@ class Dsv:

         Yields:
             Lists of parsed rows, each list containing up to ``chunk_size`` rows.
+
+        Raises:
+            SplurgeDsvPathValidationError: If the file path is invalid.
+            SplurgeDsvFileNotFoundError: If the file cannot be found.
+            SplurgeDsvFilePermissionError: If the file cannot be read.
+            SplurgeDsvFileDecodingError: If the file cannot be decoded with the configured encoding.
+            SplurgeDsvColumnMismatchError: If column validation fails.
+            SplurgeDsvParameterError: If the configured delimiter is invalid.
+            SplurgeDsvError: For other unexpected errors.
         """
         return DsvHelper.parse_file_stream(
             file_path,
@@ -252,29 +340,11 @@ class Dsv:
             bookend_strip=self.config.bookend_strip,
             encoding=self.config.encoding,
             skip_header_rows=self.config.skip_header_rows,
+            skip_empty_lines=self.config.skip_empty_lines,
             skip_footer_rows=self.config.skip_footer_rows,
+            detect_columns=self.config.detect_columns,
+            raise_on_missing_columns=self.config.raise_on_missing_columns,
+            raise_on_extra_columns=self.config.raise_on_extra_columns,
             chunk_size=self.config.chunk_size,
+            max_detect_chunks=self.config.max_detect_chunks,
         )
-
-    def parse_stream(self, file_path: PathLike[str] | str) -> Iterator[list[list[str]]]:
-        """Stream-parse a DSV file, yielding chunks of parsed rows.
-
-        The method yields lists of parsed rows (each row itself is a list of
-        strings). Chunk sizing is controlled by the bound configuration's
-        ``chunk_size`` value.
-
-        Args:
-            file_path: Path to the file to parse.
-
-        Yields:
-            Lists of parsed rows, each list containing up to ``chunk_size`` rows.
-
-        Deprecated: Use `parse_file_stream` instead. This method will be removed in a future release.
-        """
-        # Emit a DeprecationWarning to signal removal in a future release
-        warnings.warn(
-            "Dsv.parse_stream() is deprecated and will be removed in a future release; use Dsv.parse_file_stream() instead.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        return Dsv.parse_file_stream(self, file_path)