splurge-dsv 2025.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- splurge_dsv/__init__.py +0 -0
- splurge_dsv/__main__.py +0 -0
- splurge_dsv/dsv_helper.py +263 -0
- splurge_dsv/exceptions.py +123 -0
- splurge_dsv/path_validator.py +262 -0
- splurge_dsv/resource_manager.py +432 -0
- splurge_dsv/string_tokenizer.py +136 -0
- splurge_dsv/text_file_helper.py +343 -0
- splurge_dsv-2025.1.0.dist-info/METADATA +292 -0
- splurge_dsv-2025.1.0.dist-info/RECORD +13 -0
- splurge_dsv-2025.1.0.dist-info/WHEEL +5 -0
- splurge_dsv-2025.1.0.dist-info/licenses/LICENSE +21 -0
- splurge_dsv-2025.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,343 @@
|
|
1
|
+
"""
|
2
|
+
Text file utility functions for common file operations.
|
3
|
+
|
4
|
+
This module provides helper methods for working with text files, including
|
5
|
+
line counting, file previewing, and file loading capabilities. The TextFileHelper
|
6
|
+
class implements static methods for efficient file operations without requiring
|
7
|
+
class instantiation.
|
8
|
+
|
9
|
+
Key features:
|
10
|
+
- Line counting for text files
|
11
|
+
- File previewing with configurable line limits
|
12
|
+
- Complete file loading with header/footer skipping
|
13
|
+
- Streaming file loading with configurable chunk sizes
|
14
|
+
- Configurable whitespace handling and encoding
|
15
|
+
- Secure file path validation
|
16
|
+
- Resource management with context managers
|
17
|
+
|
18
|
+
Copyright (c) 2025 Jim Schilling
|
19
|
+
|
20
|
+
Please preserve this header and all related material when sharing!
|
21
|
+
|
22
|
+
This module is licensed under the MIT License.
|
23
|
+
"""
|
24
|
+
|
25
|
+
from collections import deque
|
26
|
+
from os import PathLike
|
27
|
+
from pathlib import Path
|
28
|
+
from typing import Iterator
|
29
|
+
|
30
|
+
from splurge_dsv.exceptions import (
|
31
|
+
SplurgeParameterError,
|
32
|
+
SplurgeFileEncodingError
|
33
|
+
)
|
34
|
+
from splurge_dsv.path_validator import PathValidator
|
35
|
+
from splurge_dsv.resource_manager import safe_file_operation
|
36
|
+
|
37
|
+
|
38
|
+
class TextFileHelper:
    """
    Utility class for text file operations.

    All methods are classmethods/staticmethods and are memory efficient:
    files are consumed as streams rather than loaded wholesale wherever
    possible.
    """

    DEFAULT_ENCODING = "utf-8"
    DEFAULT_MAX_LINES = 100
    DEFAULT_CHUNK_SIZE = 500
    DEFAULT_MIN_CHUNK_SIZE = 100
    DEFAULT_SKIP_HEADER_ROWS = 0
    DEFAULT_SKIP_FOOTER_ROWS = 0
    DEFAULT_STRIP = True
    DEFAULT_MODE = "r"

    @classmethod
    def _validated_path(cls, file_path: PathLike[str] | str) -> Path:
        """
        Validate that *file_path* refers to an existing, readable file.

        Centralizes the ``PathValidator.validate_path`` call that was
        previously duplicated in every public method.

        Raises:
            SplurgeFileNotFoundError: If the file doesn't exist
            SplurgeFilePermissionError: If there are permission issues
            SplurgePathValidationError: If path validation fails
        """
        return PathValidator.validate_path(
            Path(file_path),
            must_exist=True,
            must_be_file=True,
            must_be_readable=True
        )

    @staticmethod
    def _process_line(line: str, *, strip: bool) -> str:
        """Strip a raw line fully when *strip* is true, else remove only the trailing newline."""
        return line.strip() if strip else line.rstrip("\n")

    @classmethod
    def line_count(
        cls,
        file_path: PathLike[str] | str,
        *,
        encoding: str = DEFAULT_ENCODING
    ) -> int:
        """
        Count the number of lines in a text file.

        This method efficiently counts lines by iterating through the file
        without loading it entirely into memory.

        Args:
            file_path: Path to the text file
            encoding: File encoding to use (default: 'utf-8')

        Returns:
            int: Number of lines in the file

        Raises:
            SplurgeFileNotFoundError: If the specified file doesn't exist
            SplurgeFilePermissionError: If there are permission issues
            SplurgeFileEncodingError: If the file cannot be decoded with the specified encoding
            SplurgePathValidationError: If file path validation fails
        """
        validated_path = cls._validated_path(file_path)

        try:
            with safe_file_operation(validated_path, encoding=encoding, mode=cls.DEFAULT_MODE) as stream:
                return sum(1 for _ in stream)
        except UnicodeDecodeError as e:
            # Translate to the domain exception promised by the docstring.
            # Previously only read() performed this translation; the other
            # methods could leak a raw UnicodeDecodeError to callers.
            raise SplurgeFileEncodingError(
                f"Encoding error reading file: {validated_path}",
                details=str(e)
            ) from e

    @classmethod
    def preview(
        cls,
        file_path: PathLike[str] | str,
        *,
        max_lines: int = DEFAULT_MAX_LINES,
        strip: bool = DEFAULT_STRIP,
        encoding: str = DEFAULT_ENCODING,
        skip_header_rows: int = DEFAULT_SKIP_HEADER_ROWS
    ) -> list[str]:
        """
        Preview the first N lines of a text file.

        This method reads up to max_lines from the beginning of the file,
        optionally stripping whitespace from each line and skipping header rows.

        Args:
            file_path: Path to the text file
            max_lines: Maximum number of lines to read (default: 100)
            strip: Whether to strip whitespace from lines (default: True)
            encoding: File encoding to use (default: 'utf-8')
            skip_header_rows: Number of rows to skip from the start (default: 0)

        Returns:
            list[str]: List of lines from the file

        Raises:
            SplurgeParameterError: If max_lines < 1
            SplurgeFileNotFoundError: If the specified file doesn't exist
            SplurgeFilePermissionError: If there are permission issues
            SplurgeFileEncodingError: If the file cannot be decoded with the specified encoding
            SplurgePathValidationError: If file path validation fails
        """
        if max_lines < 1:
            raise SplurgeParameterError(
                "TextFileHelper.preview: max_lines is less than 1",
                details="max_lines must be at least 1"
            )

        validated_path = cls._validated_path(file_path)

        # Negative values are treated as "skip nothing".
        skip_header_rows = max(skip_header_rows, cls.DEFAULT_SKIP_HEADER_ROWS)
        lines: list[str] = []

        try:
            with safe_file_operation(validated_path, encoding=encoding, mode=cls.DEFAULT_MODE) as stream:
                # Skip header rows; stop early if the file is shorter than the header.
                for _ in range(skip_header_rows):
                    if not stream.readline():
                        return lines

                # Read up to max_lines after skipping headers
                for _ in range(max_lines):
                    line = stream.readline()
                    if not line:
                        break
                    lines.append(cls._process_line(line, strip=strip))
        except UnicodeDecodeError as e:
            raise SplurgeFileEncodingError(
                f"Encoding error reading file: {validated_path}",
                details=str(e)
            ) from e

        return lines

    @classmethod
    def read_as_stream(
        cls,
        file_path: PathLike[str] | str,
        *,
        strip: bool = DEFAULT_STRIP,
        encoding: str = DEFAULT_ENCODING,
        skip_header_rows: int = DEFAULT_SKIP_HEADER_ROWS,
        skip_footer_rows: int = DEFAULT_SKIP_FOOTER_ROWS,
        chunk_size: int = DEFAULT_CHUNK_SIZE
    ) -> Iterator[list[str]]:
        """
        Read a text file as a stream of line chunks.

        This method yields chunks of lines from the file, allowing for
        memory-efficient processing of large files. Each chunk contains
        up to chunk_size lines. Uses a sliding window approach to handle
        footer row skipping without loading the entire file into memory.

        Args:
            file_path: Path to the text file
            strip: Whether to strip whitespace from lines (default: True)
            encoding: File encoding to use (default: 'utf-8')
            skip_header_rows: Number of rows to skip from the start (default: 0)
            skip_footer_rows: Number of rows to skip from the end (default: 0)
            chunk_size: Number of lines per chunk (default: 500, minimum: 100)

        Yields:
            list[str]: Chunks of lines from the file

        Raises:
            SplurgeFileNotFoundError: If the specified file doesn't exist
            SplurgeFilePermissionError: If there are permission issues
            SplurgeFileEncodingError: If the file cannot be decoded with the specified encoding
            SplurgePathValidationError: If file path validation fails
        """
        # Clamp parameters: enforce the minimum chunk size and treat
        # negative skip counts as "skip nothing".
        chunk_size = max(chunk_size, cls.DEFAULT_MIN_CHUNK_SIZE)
        skip_header_rows = max(skip_header_rows, cls.DEFAULT_SKIP_HEADER_ROWS)
        skip_footer_rows = max(skip_footer_rows, cls.DEFAULT_SKIP_FOOTER_ROWS)

        validated_path = cls._validated_path(file_path)

        try:
            with safe_file_operation(validated_path, encoding=encoding, mode=cls.DEFAULT_MODE) as stream:
                # Skip header rows; a short file yields nothing.
                for _ in range(skip_header_rows):
                    if not stream.readline():
                        return

                chunk: list[str] = []

                if skip_footer_rows > 0:
                    # Sliding window: hold the most recent skip_footer_rows + 1
                    # lines. Once the window is full, its oldest line cannot be
                    # part of the footer and is safe to emit.
                    buffer: deque[str] = deque(maxlen=skip_footer_rows + 1)

                    for line in stream:
                        buffer.append(cls._process_line(line, strip=strip))

                        # Not enough context yet to know this isn't footer.
                        if len(buffer) < skip_footer_rows + 1:
                            continue

                        chunk.append(buffer.popleft())
                        if len(chunk) >= chunk_size:
                            yield chunk
                            chunk = []
                    # At EOF the buffer holds exactly the footer rows to skip.
                else:
                    # No footer skipping needed - simple streaming.
                    for line in stream:
                        chunk.append(cls._process_line(line, strip=strip))
                        if len(chunk) >= chunk_size:
                            yield chunk
                            chunk = []

                # Yield any remaining lines in the final partial chunk.
                if chunk:
                    yield chunk
        except UnicodeDecodeError as e:
            raise SplurgeFileEncodingError(
                f"Encoding error reading file: {validated_path}",
                details=str(e)
            ) from e

    @classmethod
    def read(
        cls,
        file_path: PathLike[str] | str,
        *,
        strip: bool = DEFAULT_STRIP,
        encoding: str = DEFAULT_ENCODING,
        skip_header_rows: int = DEFAULT_SKIP_HEADER_ROWS,
        skip_footer_rows: int = DEFAULT_SKIP_FOOTER_ROWS
    ) -> list[str]:
        """
        Read the entire contents of a text file into a list of strings.

        This method reads the complete file into memory, with options to
        strip whitespace from each line and skip header/footer rows.

        Args:
            file_path: Path to the text file
            strip: Whether to strip whitespace from lines (default: True)
            encoding: File encoding to use (default: 'utf-8')
            skip_header_rows: Number of rows to skip from the start (default: 0)
            skip_footer_rows: Number of rows to skip from the end (default: 0)

        Returns:
            list[str]: List of all lines from the file, excluding skipped rows

        Raises:
            SplurgeFileNotFoundError: If the specified file doesn't exist
            SplurgeFilePermissionError: If there are permission issues
            SplurgeFileEncodingError: If the file cannot be decoded with the specified encoding
            SplurgePathValidationError: If file path validation fails
        """
        validated_path = cls._validated_path(file_path)

        skip_header_rows = max(skip_header_rows, cls.DEFAULT_SKIP_HEADER_ROWS)
        skip_footer_rows = max(skip_footer_rows, cls.DEFAULT_SKIP_FOOTER_ROWS)

        try:
            with safe_file_operation(validated_path, encoding=encoding, mode=cls.DEFAULT_MODE) as stream:
                # Header skipping is now inside the try so a decode error here
                # is also reported as SplurgeFileEncodingError, per the
                # documented contract.
                for _ in range(skip_header_rows):
                    if not stream.readline():
                        return []

                result: list[str] = []

                if skip_footer_rows > 0:
                    # Same sliding-window scheme as read_as_stream: a line is
                    # appended to the result only once skip_footer_rows newer
                    # lines exist, so the last skip_footer_rows lines are dropped.
                    buffer: deque[str] = deque(maxlen=skip_footer_rows + 1)

                    for line in stream:
                        buffer.append(cls._process_line(line, strip=strip))
                        if len(buffer) < skip_footer_rows + 1:
                            continue
                        result.append(buffer.popleft())
                    # At EOF the buffer holds exactly the footer rows to skip.
                else:
                    for line in stream:
                        result.append(cls._process_line(line, strip=strip))

                return result
        except UnicodeDecodeError as e:
            raise SplurgeFileEncodingError(
                f"Encoding error reading file: {validated_path}",
                details=str(e)
            ) from e
|
@@ -0,0 +1,292 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: splurge-dsv
|
3
|
+
Version: 2025.1.0
|
4
|
+
Summary: A utility library for working with DSV (Delimited String Values) files
|
5
|
+
Author: Jim Schilling
|
6
|
+
License-Expression: MIT
|
7
|
+
Project-URL: Homepage, https://github.com/jim-schilling/splurge-dsv
|
8
|
+
Project-URL: Repository, https://github.com/jim-schilling/splurge-dsv
|
9
|
+
Project-URL: Documentation, https://github.com/jim-schilling/splurge-dsv#readme
|
10
|
+
Project-URL: Bug Tracker, https://github.com/jim-schilling/splurge-dsv/issues
|
11
|
+
Keywords: dsv,csv,tsv,delimited,parsing,file-processing
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
13
|
+
Classifier: Intended Audience :: Developers
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
20
|
+
Classifier: Topic :: Text Processing :: Filters
|
21
|
+
Requires-Python: >=3.10
|
22
|
+
Description-Content-Type: text/markdown
|
23
|
+
License-File: LICENSE
|
24
|
+
Provides-Extra: dev
|
25
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
26
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
27
|
+
Requires-Dist: pytest-xdist>=3.0.0; extra == "dev"
|
28
|
+
Dynamic: license-file
|
29
|
+
|
30
|
+
# splurge-dsv
|
31
|
+
|
32
|
+
A robust Python library for parsing and processing delimited-separated value (DSV) files with advanced features for data validation, streaming, and error handling.
|
33
|
+
|
34
|
+
## Features
|
35
|
+
|
36
|
+
### 🔧 Core Functionality
|
37
|
+
- **Multi-format DSV Support**: Parse CSV, TSV, pipe-delimited, semicolon-delimited, and custom delimiter files
|
38
|
+
- **Flexible Parsing Options**: Configurable whitespace handling, bookend removal, and encoding support
|
39
|
+
- **Memory-Efficient Streaming**: Process large files without loading entire content into memory
|
40
|
+
- **Header/Footer Skipping**: Skip specified numbers of rows from start or end of files
|
41
|
+
- **Unicode Support**: Full Unicode character and delimiter support
|
42
|
+
|
43
|
+
### 🛡️ Security & Validation
|
44
|
+
- **Path Validation**: Comprehensive file path security validation with traversal attack prevention
|
45
|
+
- **File Permission Checks**: Automatic file accessibility and permission validation
|
46
|
+
- **Encoding Validation**: Robust encoding error detection and handling
|
47
|
+
- **Resource Management**: Automatic file handle cleanup and resource management
|
48
|
+
|
49
|
+
### 📊 Advanced Processing
|
50
|
+
- **Chunked Processing**: Configurable chunk sizes for streaming large datasets
|
51
|
+
- **Mixed Content Handling**: Support for quoted and unquoted values in the same file
|
52
|
+
- **Line Ending Flexibility**: Automatic handling of different line ending formats
|
53
|
+
- **Error Recovery**: Graceful error handling with detailed error messages
|
54
|
+
|
55
|
+
### 🧪 Testing & Quality
|
56
|
+
- **Comprehensive Test Suite**: 90%+ code coverage with 250+ tests
|
57
|
+
- **Cross-Platform Support**: Tested on Windows, Linux, and macOS
|
58
|
+
- **Type Safety**: Full type annotations and validation
|
59
|
+
- **Documentation**: Complete API documentation with examples
|
60
|
+
|
61
|
+
## Installation
|
62
|
+
|
63
|
+
```bash
|
64
|
+
pip install splurge-dsv
|
65
|
+
```
|
66
|
+
|
67
|
+
## Quick Start
|
68
|
+
|
69
|
+
### Basic CSV Parsing
|
70
|
+
|
71
|
+
```python
|
72
|
+
from splurge_dsv import DsvHelper
|
73
|
+
|
74
|
+
# Parse a simple CSV string
|
75
|
+
data = DsvHelper.parse("a,b,c", delimiter=",")
|
76
|
+
print(data) # ['a', 'b', 'c']
|
77
|
+
|
78
|
+
# Parse a CSV file
|
79
|
+
rows = DsvHelper.parse_file("data.csv", delimiter=",")
|
80
|
+
for row in rows:
|
81
|
+
print(row) # ['col1', 'col2', 'col3']
|
82
|
+
```
|
83
|
+
|
84
|
+
### Streaming Large Files
|
85
|
+
|
86
|
+
```python
|
87
|
+
from splurge_dsv import DsvHelper
|
88
|
+
|
89
|
+
# Stream a large CSV file in chunks
|
90
|
+
for chunk in DsvHelper.parse_stream("large_file.csv", delimiter=",", chunk_size=1000):
|
91
|
+
for row in chunk:
|
92
|
+
process_row(row)
|
93
|
+
```
|
94
|
+
|
95
|
+
### Advanced Parsing Options
|
96
|
+
|
97
|
+
```python
|
98
|
+
from splurge_dsv import DsvHelper
|
99
|
+
|
100
|
+
# Parse with custom options
|
101
|
+
data = DsvHelper.parse(
|
102
|
+
'"a","b","c"',
|
103
|
+
delimiter=",",
|
104
|
+
bookend='"',
|
105
|
+
strip=True,
|
106
|
+
bookend_strip=True
|
107
|
+
)
|
108
|
+
print(data) # ['a', 'b', 'c']
|
109
|
+
|
110
|
+
# Skip header and footer rows
|
111
|
+
rows = DsvHelper.parse_file(
|
112
|
+
"data.csv",
|
113
|
+
delimiter=",",
|
114
|
+
skip_header_rows=1,
|
115
|
+
skip_footer_rows=2
|
116
|
+
)
|
117
|
+
```
|
118
|
+
|
119
|
+
### Text File Operations
|
120
|
+
|
121
|
+
```python
|
122
|
+
from splurge_dsv import TextFileHelper
|
123
|
+
|
124
|
+
# Count lines in a file
|
125
|
+
line_count = TextFileHelper.line_count("data.txt")
|
126
|
+
|
127
|
+
# Preview first N lines
|
128
|
+
preview = TextFileHelper.preview("data.txt", max_lines=10)
|
129
|
+
|
130
|
+
# Read entire file with options
|
131
|
+
lines = TextFileHelper.read(
|
132
|
+
"data.txt",
|
133
|
+
strip=True,
|
134
|
+
skip_header_rows=1,
|
135
|
+
skip_footer_rows=1
|
136
|
+
)
|
137
|
+
|
138
|
+
# Stream file content
|
139
|
+
for chunk in TextFileHelper.read_as_stream("large_file.txt", chunk_size=500):
|
140
|
+
process_chunk(chunk)
|
141
|
+
```
|
142
|
+
|
143
|
+
### Path Validation
|
144
|
+
|
145
|
+
```python
|
146
|
+
from splurge_dsv import PathValidator
|
147
|
+
|
148
|
+
# Validate a file path
|
149
|
+
valid_path = PathValidator.validate_path(
|
150
|
+
"data.csv",
|
151
|
+
must_exist=True,
|
152
|
+
must_be_file=True,
|
153
|
+
must_be_readable=True
|
154
|
+
)
|
155
|
+
|
156
|
+
# Check if path is safe
|
157
|
+
is_safe = PathValidator.is_safe_path("user_input_path.txt")
|
158
|
+
```
|
159
|
+
|
160
|
+
## API Reference
|
161
|
+
|
162
|
+
### DsvHelper
|
163
|
+
|
164
|
+
Main class for DSV parsing operations.
|
165
|
+
|
166
|
+
#### Methods
|
167
|
+
|
168
|
+
- `parse(content, delimiter, strip=True, bookend=None, bookend_strip=True)` - Parse a single string
|
169
|
+
- `parses(content_list, delimiter, strip=True, bookend=None, bookend_strip=True)` - Parse multiple strings
|
170
|
+
- `parse_file(file_path, delimiter, strip=True, bookend=None, bookend_strip=True, skip_header_rows=0, skip_footer_rows=0, encoding='utf-8')` - Parse a file
|
171
|
+
- `parse_stream(file_path, delimiter, strip=True, bookend=None, bookend_strip=True, skip_header_rows=0, skip_footer_rows=0, encoding='utf-8', chunk_size=500)` - Stream parse a file
|
172
|
+
|
173
|
+
### TextFileHelper
|
174
|
+
|
175
|
+
Utility class for text file operations.
|
176
|
+
|
177
|
+
#### Methods
|
178
|
+
|
179
|
+
- `line_count(file_path, encoding='utf-8')` - Count lines in a file
|
180
|
+
- `preview(file_path, max_lines=100, strip=True, encoding='utf-8', skip_header_rows=0)` - Preview file content
|
181
|
+
- `read(file_path, strip=True, encoding='utf-8', skip_header_rows=0, skip_footer_rows=0)` - Read entire file
|
182
|
+
- `read_as_stream(file_path, strip=True, encoding='utf-8', skip_header_rows=0, skip_footer_rows=0, chunk_size=500)` - Stream read file
|
183
|
+
|
184
|
+
### PathValidator
|
185
|
+
|
186
|
+
Security-focused path validation utilities.
|
187
|
+
|
188
|
+
#### Methods
|
189
|
+
|
190
|
+
- `validate_path(file_path, must_exist=False, must_be_file=False, must_be_readable=False, allow_relative=False, base_directory=None)` - Validate file path
|
191
|
+
- `is_safe_path(file_path)` - Check if path is safe
|
192
|
+
- `sanitize_filename(filename, default_name='file')` - Sanitize filename
|
193
|
+
|
194
|
+
### ResourceManager
|
195
|
+
|
196
|
+
Context managers for safe resource handling.
|
197
|
+
|
198
|
+
#### Classes
|
199
|
+
|
200
|
+
- `FileResourceManager` - Context manager for file operations
|
201
|
+
- `StreamResourceManager` - Context manager for stream operations
|
202
|
+
|
203
|
+
#### Functions
|
204
|
+
|
205
|
+
- `safe_file_operation(file_path, mode='r', encoding='utf-8', ...)` - Safe file operation context manager
|
206
|
+
- `safe_stream_operation(stream, auto_close=True)` - Safe stream operation context manager
|
207
|
+
|
208
|
+
## Error Handling
|
209
|
+
|
210
|
+
The library provides comprehensive error handling with custom exception classes:
|
211
|
+
|
212
|
+
- `SplurgeParameterError` - Invalid parameter values
|
213
|
+
- `SplurgeFileNotFoundError` - File not found
|
214
|
+
- `SplurgeFilePermissionError` - File permission issues
|
215
|
+
- `SplurgeFileEncodingError` - File encoding problems
|
216
|
+
- `SplurgePathValidationError` - Path validation failures
|
217
|
+
- `SplurgeResourceAcquisitionError` - Resource acquisition failures
|
218
|
+
- `SplurgeResourceReleaseError` - Resource cleanup failures
|
219
|
+
|
220
|
+
## Development
|
221
|
+
|
222
|
+
### Running Tests
|
223
|
+
|
224
|
+
```bash
|
225
|
+
# Run all tests
|
226
|
+
pytest tests/ -v
|
227
|
+
|
228
|
+
# Run with coverage
|
229
|
+
pytest tests/ --cov=splurge_dsv --cov-report=html
|
230
|
+
|
231
|
+
# Run specific test file
|
232
|
+
pytest tests/test_dsv_helper.py -v
|
233
|
+
```
|
234
|
+
|
235
|
+
### Code Quality
|
236
|
+
|
237
|
+
The project follows strict coding standards:
|
238
|
+
- PEP 8 compliance
|
239
|
+
- Type annotations for all functions
|
240
|
+
- Google-style docstrings
|
241
|
+
- 90%+ test coverage requirement
|
242
|
+
- Comprehensive error handling
|
243
|
+
|
244
|
+
## Changelog
|
245
|
+
|
246
|
+
### 2025.1.0 (2025-08-25)
|
247
|
+
|
248
|
+
#### 🚀 Major Features
|
249
|
+
- **Complete DSV Parser**: Full-featured delimited-separated value parser with support for CSV, TSV, and custom delimiters
|
250
|
+
- **Streaming Support**: Memory-efficient streaming for large files with configurable chunk sizes
|
251
|
+
- **Advanced Parsing Options**: Bookend removal, whitespace handling, and encoding support
|
252
|
+
- **Header/Footer Skipping**: Skip specified numbers of rows from start or end of files
|
253
|
+
|
254
|
+
#### 🛡️ Security Enhancements
|
255
|
+
- **Path Validation System**: Comprehensive file path security validation with traversal attack prevention
|
256
|
+
- **File Permission Checks**: Automatic file accessibility and permission validation
|
257
|
+
- **Encoding Validation**: Robust encoding error detection and handling
|
258
|
+
|
259
|
+
#### 🔧 Core Components
|
260
|
+
- **DsvHelper**: Main DSV parsing class with parse, parses, parse_file, and parse_stream methods
|
261
|
+
- **TextFileHelper**: Utility class for text file operations (line counting, preview, reading, streaming)
|
262
|
+
- **PathValidator**: Security-focused path validation utilities
|
263
|
+
- **ResourceManager**: Context managers for safe resource handling
|
264
|
+
- **StringTokenizer**: Core string parsing functionality
|
265
|
+
|
266
|
+
#### 🧪 Testing & Quality
|
267
|
+
- **Comprehensive Test Suite**: 250+ tests with 90%+ code coverage
|
268
|
+
- **Cross-Platform Testing**: Tested on Windows, Linux, and macOS
|
269
|
+
- **Type Safety**: Full type annotations throughout the codebase
|
270
|
+
- **Error Handling**: Custom exception hierarchy with detailed error messages
|
271
|
+
|
272
|
+
#### 📚 Documentation
|
273
|
+
- **Complete API Documentation**: Google-style docstrings for all public methods
|
274
|
+
- **Usage Examples**: Comprehensive examples for all major features
|
275
|
+
- **Error Documentation**: Detailed error handling documentation
|
276
|
+
|
277
|
+
#### ⚡ Performance
|
278
|
+
- **Memory Efficiency**: Streaming support for large files
|
279
|
+
- **Optimized Parsing**: Efficient string tokenization and processing
|
280
|
+
- **Resource Management**: Automatic cleanup and resource management
|
281
|
+
|
282
|
+
## License
|
283
|
+
|
284
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
285
|
+
|
286
|
+
## Contributing
|
287
|
+
|
288
|
+
Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change.
|
289
|
+
|
290
|
+
## Support
|
291
|
+
|
292
|
+
For support, please open an issue on the GitHub repository or contact the maintainers.
|
@@ -0,0 +1,13 @@
|
|
1
|
+
splurge_dsv/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
+
splurge_dsv/__main__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
+
splurge_dsv/dsv_helper.py,sha256=gv9Wwf_soqdXuZWkBpxSyvJVqryKDfEoSf9SoAaRX1A,9651
|
4
|
+
splurge_dsv/exceptions.py,sha256=tPVLXxf8LPc0yd8L8xjik331B4-NUVYfyd6ifPLUtm4,3052
|
5
|
+
splurge_dsv/path_validator.py,sha256=2vXsNnmRTLbdYedDrF5kQZsOUvfLhSxkXyRj26OZyj8,10036
|
6
|
+
splurge_dsv/resource_manager.py,sha256=fldvZQpzznANf4ahHre_lSqQ5_FUWndFuSlyccBylGE,13013
|
7
|
+
splurge_dsv/string_tokenizer.py,sha256=GDAtJht-d9vt8Kb4yglxfl2iMfSMWGT3esksISNklMU,4203
|
8
|
+
splurge_dsv/text_file_helper.py,sha256=htYT1Z0aKuQt24X2IfSKnFHB1AOY3dV7lRU8Vr6iqn0,14237
|
9
|
+
splurge_dsv-2025.1.0.dist-info/licenses/LICENSE,sha256=fPgtg-tIFHinQvJH0arRfv50AuxikD5eHw6rrPy2A5w,1091
|
10
|
+
splurge_dsv-2025.1.0.dist-info/METADATA,sha256=MUa4M1124GYLHmOYzLIne6-s_biC8wFi9a1PaBEmMXA,10282
|
11
|
+
splurge_dsv-2025.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
12
|
+
splurge_dsv-2025.1.0.dist-info/top_level.txt,sha256=D6Si3FTfpRYqH7kzM7tSQAyaKbbraO6UPLpcqcY4XXM,12
|
13
|
+
splurge_dsv-2025.1.0.dist-info/RECORD,,
|
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2025 Jim Schilling
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
@@ -0,0 +1 @@
|
|
1
|
+
splurge_dsv
|