splurge-dsv 2025.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
File without changes
@@ -0,0 +1,263 @@
1
+ """
2
+ A utility module for working with DSV (Delimited String Values) files.
3
+
4
+ Copyright (c) 2025 Jim Schilling
5
+
6
+ Please preserve this header and all related material when sharing!
7
+
8
+ This module is licensed under the MIT License.
9
+ """
10
+
11
+ from os import PathLike
12
+ from typing import Iterator
13
+
14
+ from splurge_dsv.string_tokenizer import StringTokenizer
15
+ from splurge_dsv.text_file_helper import TextFileHelper
16
+ from splurge_dsv.exceptions import SplurgeParameterError
17
+
18
+ class DsvHelper:
19
+ """
20
+ Utility class for working with DSV (Delimited String Values) files.
21
+
22
+ Provides methods to parse DSV content from strings, lists of strings, and files.
23
+ Supports configurable delimiters, text bookends, and whitespace handling options.
24
+ """
25
+
26
+ DEFAULT_CHUNK_SIZE = 500 # Default chunk size for streaming operations
27
+ DEFAULT_ENCODING = "utf-8" # Default text encoding for file operations
28
+ DEFAULT_SKIP_HEADER_ROWS = 0 # Default number of header rows to skip
29
+ DEFAULT_SKIP_FOOTER_ROWS = 0 # Default number of footer rows to skip
30
+ DEFAULT_MIN_CHUNK_SIZE = 100
31
+ DEFAULT_STRIP = True
32
+ DEFAULT_BOOKEND_STRIP = True
33
+
34
+ @staticmethod
35
+ def parse(
36
+ content: str,
37
+ *,
38
+ delimiter: str,
39
+ strip: bool = DEFAULT_STRIP,
40
+ bookend: str | None = None,
41
+ bookend_strip: bool = DEFAULT_BOOKEND_STRIP
42
+ ) -> list[str]:
43
+ """
44
+ Parse a string into a list of strings.
45
+
46
+ Args:
47
+ content (str): The string to parse.
48
+ delimiter (str): The delimiter to use.
49
+ strip (bool): Whether to strip whitespace from the strings.
50
+ bookend (str | None): The bookend to use for text fields.
51
+ bookend_strip (bool): Whether to strip whitespace from the bookend.
52
+
53
+ Returns:
54
+ list[str]: The list of strings.
55
+
56
+ Raises:
57
+ SplurgeParameterError: If delimiter is empty or None.
58
+
59
+ Example:
60
+ >>> DsvHelper.parse("a,b,c", delimiter=",")
61
+ ['a', 'b', 'c']
62
+ >>> DsvHelper.parse('"a","b","c"', delimiter=",", bookend='"')
63
+ ['a', 'b', 'c']
64
+ """
65
+ if delimiter is None or delimiter == "":
66
+ raise SplurgeParameterError("delimiter cannot be empty or None")
67
+
68
+ tokens: list[str] = StringTokenizer.parse(content, delimiter=delimiter, strip=strip)
69
+
70
+ if bookend:
71
+ tokens = [
72
+ StringTokenizer.remove_bookends(token, bookend=bookend, strip=bookend_strip)
73
+ for token in tokens
74
+ ]
75
+
76
+ return tokens
77
+
78
+ @classmethod
79
+ def parses(
80
+ cls,
81
+ content: list[str],
82
+ *,
83
+ delimiter: str,
84
+ strip: bool = DEFAULT_STRIP,
85
+ bookend: str | None = None,
86
+ bookend_strip: bool = DEFAULT_BOOKEND_STRIP
87
+ ) -> list[list[str]]:
88
+ """
89
+ Parse a list of strings into a list of lists of strings.
90
+
91
+ Args:
92
+ content (list[str]): The list of strings to parse.
93
+ delimiter (str): The delimiter to use.
94
+ strip (bool): Whether to strip whitespace from the strings.
95
+ bookend (str | None): The bookend to use for text fields.
96
+ bookend_strip (bool): Whether to strip whitespace from the bookend.
97
+
98
+ Returns:
99
+ list[list[str]]: The list of lists of strings.
100
+
101
+ Raises:
102
+ SplurgeParameterError: If delimiter is empty or None.
103
+ SplurgeParameterError: If content is not a list of strings.
104
+
105
+ Example:
106
+ >>> DsvHelper.parses(["a,b,c", "d,e,f"], delimiter=",")
107
+ [['a', 'b', 'c'], ['d', 'e', 'f']]
108
+ """
109
+ if not isinstance(content, list):
110
+ raise SplurgeParameterError("content must be a list")
111
+
112
+ if not all(isinstance(item, str) for item in content):
113
+ raise SplurgeParameterError("content must be a list of strings")
114
+
115
+ return [
116
+ cls.parse(item, delimiter=delimiter, strip=strip, bookend=bookend, bookend_strip=bookend_strip)
117
+ for item in content
118
+ ]
119
+
120
+ @classmethod
121
+ def parse_file(
122
+ cls,
123
+ file_path: PathLike[str] | str,
124
+ *,
125
+ delimiter: str,
126
+ strip: bool = DEFAULT_STRIP,
127
+ bookend: str | None = None,
128
+ bookend_strip: bool = DEFAULT_BOOKEND_STRIP,
129
+ encoding: str = DEFAULT_ENCODING,
130
+ skip_header_rows: int = DEFAULT_SKIP_HEADER_ROWS,
131
+ skip_footer_rows: int = DEFAULT_SKIP_FOOTER_ROWS
132
+ ) -> list[list[str]]:
133
+ """
134
+ Parse a file into a list of lists of strings.
135
+
136
+ Args:
137
+ file_path (PathLike[str] | str): The path to the file to parse.
138
+ delimiter (str): The delimiter to use.
139
+ strip (bool): Whether to strip whitespace from the strings.
140
+ bookend (str | None): The bookend to use for text fields.
141
+ bookend_strip (bool): Whether to strip whitespace from the bookend.
142
+ encoding (str): The file encoding.
143
+ skip_header_rows (int): Number of header rows to skip.
144
+ skip_footer_rows (int): Number of footer rows to skip.
145
+
146
+ Returns:
147
+ list[list[str]]: The list of lists of strings.
148
+
149
+ Raises:
150
+ SplurgeParameterError: If delimiter is empty or None.
151
+ SplurgeFileNotFoundError: If the file does not exist.
152
+ SplurgeFilePermissionError: If the file cannot be accessed.
153
+ SplurgeFileEncodingError: If the file cannot be decoded with the specified encoding.
154
+
155
+ Example:
156
+ >>> DsvHelper.parse_file("data.csv", delimiter=",")
157
+ [['header1', 'header2'], ['value1', 'value2']]
158
+ """
159
+ lines: list[str] = TextFileHelper.read(
160
+ file_path,
161
+ encoding=encoding,
162
+ skip_header_rows=skip_header_rows,
163
+ skip_footer_rows=skip_footer_rows
164
+ )
165
+
166
+ return cls.parses(
167
+ lines,
168
+ delimiter=delimiter,
169
+ strip=strip,
170
+ bookend=bookend,
171
+ bookend_strip=bookend_strip
172
+ )
173
+
174
+ @classmethod
175
+ def _process_stream_chunk(
176
+ cls,
177
+ chunk: list[str],
178
+ *,
179
+ delimiter: str,
180
+ strip: bool = DEFAULT_STRIP,
181
+ bookend: str | None = None,
182
+ bookend_strip: bool = DEFAULT_BOOKEND_STRIP
183
+ ) -> list[list[str]]:
184
+ """
185
+ Process a chunk of lines from the stream.
186
+
187
+ Args:
188
+ chunk: List of lines to process
189
+ delimiter: Delimiter to use for parsing
190
+ strip: Whether to strip whitespace
191
+ bookend: Bookend character for text fields
192
+ bookend_strip: Whether to strip whitespace from bookends
193
+
194
+ Returns:
195
+ list[list[str]]: Parsed rows
196
+ """
197
+ return cls.parses(
198
+ chunk,
199
+ delimiter=delimiter,
200
+ strip=strip,
201
+ bookend=bookend,
202
+ bookend_strip=bookend_strip
203
+ )
204
+
205
+ @classmethod
206
+ def parse_stream(
207
+ cls,
208
+ file_path: PathLike[str] | str,
209
+ *,
210
+ delimiter: str,
211
+ strip: bool = DEFAULT_STRIP,
212
+ bookend: str | None = None,
213
+ bookend_strip: bool = DEFAULT_BOOKEND_STRIP,
214
+ encoding: str = DEFAULT_ENCODING,
215
+ skip_header_rows: int = DEFAULT_SKIP_HEADER_ROWS,
216
+ skip_footer_rows: int = DEFAULT_SKIP_FOOTER_ROWS,
217
+ chunk_size: int = DEFAULT_CHUNK_SIZE
218
+ ) -> Iterator[list[list[str]]]:
219
+ """
220
+ Stream-parse a DSV file in chunks of lines.
221
+
222
+ Args:
223
+ file_path (PathLike[str] | str): The path to the file to parse.
224
+ delimiter (str): The delimiter to use.
225
+ strip (bool): Whether to strip whitespace from the strings.
226
+ bookend (str | None): The bookend to use for text fields.
227
+ bookend_strip (bool): Whether to strip whitespace from the bookend.
228
+ encoding (str): The file encoding.
229
+ skip_header_rows (int): Number of header rows to skip.
230
+ skip_footer_rows (int): Number of footer rows to skip.
231
+ chunk_size (int): Number of lines per chunk (default: 100).
232
+
233
+ Yields:
234
+ list[list[str]]: Parsed rows for each chunk.
235
+
236
+ Raises:
237
+ SplurgeParameterError: If delimiter is empty or None.
238
+ SplurgeFileNotFoundError: If the file does not exist.
239
+ SplurgeFilePermissionError: If the file cannot be accessed.
240
+ SplurgeFileEncodingError: If the file cannot be decoded with the specified encoding.
241
+ """
242
+ if delimiter is None or delimiter == "":
243
+ raise SplurgeParameterError("delimiter cannot be empty or None")
244
+
245
+ chunk_size = max(chunk_size, cls.DEFAULT_MIN_CHUNK_SIZE)
246
+ skip_header_rows = max(skip_header_rows, cls.DEFAULT_SKIP_HEADER_ROWS)
247
+ skip_footer_rows = max(skip_footer_rows, cls.DEFAULT_SKIP_FOOTER_ROWS)
248
+
249
+ # Use TextFileHelper.read_as_stream for consistent error handling
250
+ for chunk in TextFileHelper.read_as_stream(
251
+ file_path,
252
+ encoding=encoding,
253
+ skip_header_rows=skip_header_rows,
254
+ skip_footer_rows=skip_footer_rows,
255
+ chunk_size=chunk_size
256
+ ):
257
+ yield cls._process_stream_chunk(
258
+ chunk,
259
+ delimiter=delimiter,
260
+ strip=strip,
261
+ bookend=bookend,
262
+ bookend_strip=bookend_strip
263
+ )
@@ -0,0 +1,123 @@
1
+ """
2
+ Custom exceptions for the splurge-dsv package.
3
+
4
+ This module provides a hierarchy of custom exceptions for better error handling
5
+ and more specific error messages throughout the package.
6
+
7
+ Copyright (c) 2025 Jim Schilling
8
+
9
+ Please preserve this header and all related material when sharing!
10
+
11
+ This module is licensed under the MIT License.
12
+ """
13
+
14
+
15
+ class SplurgeDsvError(Exception):
16
+ """Base exception for all splurge-dsv errors."""
17
+
18
+ def __init__(
19
+ self,
20
+ message: str,
21
+ *,
22
+ details: str | None = None
23
+ ) -> None:
24
+ """
25
+ Initialize SplurgeDsvError.
26
+
27
+ Args:
28
+ message: Primary error message
29
+ details: Additional error details
30
+ """
31
+ self.message = message
32
+ self.details = details
33
+ super().__init__(self.message)
34
+
35
+
36
class SplurgeValidationError(SplurgeDsvError):
    """Signals that data failed validation."""
39
+
40
+
41
class SplurgeFileOperationError(SplurgeDsvError):
    """Common parent for errors raised by file operations."""
44
+
45
+
46
class SplurgeFileNotFoundError(SplurgeFileOperationError):
    """Signals that a file could not be found."""
49
+
50
+
51
class SplurgeFilePermissionError(SplurgeFileOperationError):
    """Signals a permission problem during a file operation."""
54
+
55
+
56
class SplurgeFileEncodingError(SplurgeFileOperationError):
    """Signals an encoding problem during a file operation."""
59
+
60
+
61
class SplurgePathValidationError(SplurgeFileOperationError):
    """Signals that a file path did not pass validation."""
64
+
65
+
66
class SplurgeDataProcessingError(SplurgeDsvError):
    """Common parent for errors raised while processing data."""
69
+
70
+
71
class SplurgeParsingError(SplurgeDataProcessingError):
    """Signals that parsing of data failed."""
74
+
75
+
76
class SplurgeTypeConversionError(SplurgeDataProcessingError):
    """Signals that a type conversion failed."""
79
+
80
+
81
class SplurgeStreamingError(SplurgeDataProcessingError):
    """Signals that a streaming operation failed."""
84
+
85
+
86
class SplurgeConfigurationError(SplurgeDsvError):
    """Signals an invalid configuration."""
89
+
90
+
91
class SplurgeResourceError(SplurgeDsvError):
    """Common parent for resource management errors."""
94
+
95
+
96
class SplurgeResourceAcquisitionError(SplurgeResourceError):
    """Signals that acquiring a resource failed."""
99
+
100
+
101
class SplurgeResourceReleaseError(SplurgeResourceError):
    """Signals that releasing a resource failed."""
104
+
105
+
106
class SplurgePerformanceWarning(SplurgeDsvError, UserWarning):
    """
    Warning category for performance-related issues.

    Inherits from UserWarning in addition to SplurgeDsvError so the class can
    be passed as the ``category`` to ``warnings.warn()``, which only accepts
    Warning subclasses. It can still be raised and caught as a SplurgeDsvError.
    """
109
+
110
+
111
class SplurgeParameterError(SplurgeValidationError):
    """Signals that a function was called with invalid parameters."""
114
+
115
+
116
class SplurgeRangeError(SplurgeValidationError):
    """Signals that a value lies outside its expected range."""
119
+
120
+
121
class SplurgeFormatError(SplurgeValidationError):
    """Signals that data is not in a valid format."""
@@ -0,0 +1,262 @@
1
+ """
2
+ File path validation utilities for secure file operations.
3
+
4
+ This module provides utilities for validating file paths to prevent
5
+ path traversal attacks and ensure secure file operations.
6
+
7
+ Copyright (c) 2025 Jim Schilling
8
+
9
+ Please preserve this header and all related material when sharing!
10
+
11
+ This module is licensed under the MIT License.
12
+ """
13
+
14
+ import os
15
+ import re
16
+ from pathlib import Path
17
+
18
+ from splurge_dsv.exceptions import (
19
+ SplurgePathValidationError,
20
+ SplurgeFileNotFoundError,
21
+ SplurgeFilePermissionError
22
+ )
23
+
24
+
25
+ # Module-level constants for path validation
26
+ _MAX_PATH_LENGTH = 4096 # Maximum path length for most filesystems
27
+ _DEFAULT_FILENAME = "unnamed_file" # Default filename when sanitization results in empty string
28
+
29
+
30
+ class PathValidator:
31
+ """
32
+ Utility class for validating file paths securely.
33
+
34
+ This class provides methods to validate file paths and prevent
35
+ path traversal attacks and other security vulnerabilities.
36
+ """
37
+
38
+ # Private constants for path validation
39
+ _PATH_TRAVERSAL_PATTERNS = [
40
+ r'\.\.', # Directory traversal
41
+ r'//+', # Multiple forward slashes (including //)
42
+ r'\\{2,}', # Two or more consecutive backslashes (not normal Windows paths)
43
+ r'~', # Home directory expansion
44
+ ]
45
+
46
+ _DANGEROUS_CHARS = [
47
+ '<', '>', '"', '|', '?', '*', # Windows reserved characters (excluding ':' for drive letters)
48
+ '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', # Control characters
49
+ '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
50
+ '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
51
+ '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
52
+ ]
53
+
54
+ MAX_PATH_LENGTH = _MAX_PATH_LENGTH
55
+
56
+ @classmethod
57
+ def validate_path(
58
+ cls,
59
+ file_path: str | Path,
60
+ *,
61
+ must_exist: bool = False,
62
+ must_be_file: bool = False,
63
+ must_be_readable: bool = False,
64
+ allow_relative: bool = True,
65
+ base_directory: str | Path | None = None
66
+ ) -> Path:
67
+ """
68
+ Validate a file path for security and correctness.
69
+
70
+ Args:
71
+ file_path: Path to validate
72
+ must_exist: Whether the file must exist
73
+ must_be_file: Whether the path must be a file (not directory)
74
+ must_be_readable: Whether the file must be readable
75
+ allow_relative: Whether to allow relative paths
76
+ base_directory: Base directory for relative path resolution
77
+
78
+ Returns:
79
+ Normalized Path object
80
+
81
+ Raises:
82
+ SplurgePathValidationError: If path validation fails
83
+ SplurgeFileNotFoundError: If file doesn't exist when required
84
+ SplurgeFilePermissionError: If file is not readable when required
85
+ """
86
+ # Convert to Path object
87
+ path = Path(file_path) if isinstance(file_path, str) else file_path
88
+
89
+ # Get the original string for validation (before Path normalization)
90
+ path_str = str(file_path) if isinstance(file_path, str) else str(path)
91
+
92
+ # Check for dangerous characters
93
+ cls._check_dangerous_characters(path_str)
94
+
95
+ # Check for path traversal patterns
96
+ cls._check_path_traversal(path_str)
97
+
98
+ # Check path length
99
+ cls._check_path_length(path_str)
100
+
101
+ # Handle relative paths
102
+ if not path.is_absolute() and not allow_relative:
103
+ raise SplurgePathValidationError(
104
+ f"Relative paths are not allowed: {path}",
105
+ details="Set allow_relative=True to allow relative paths"
106
+ )
107
+
108
+ # Resolve path (handles symlinks and normalizes)
109
+ try:
110
+ if base_directory:
111
+ base_path = Path(base_directory).resolve()
112
+ if not path.is_absolute():
113
+ resolved_path = (base_path / path).resolve()
114
+ else:
115
+ resolved_path = path.resolve()
116
+
117
+ # Ensure resolved path is within base directory
118
+ try:
119
+ resolved_path.relative_to(base_path)
120
+ except ValueError:
121
+ raise SplurgePathValidationError(
122
+ f"Path {path} resolves outside base directory {base_directory}",
123
+ details="Path traversal detected"
124
+ )
125
+ else:
126
+ resolved_path = path.resolve()
127
+ except (OSError, RuntimeError) as e:
128
+ raise SplurgePathValidationError(
129
+ f"Failed to resolve path {path}: {e}",
130
+ details="Check if path contains invalid characters or symlinks"
131
+ )
132
+
133
+ # Check if file exists
134
+ if must_exist and not resolved_path.exists():
135
+ raise SplurgeFileNotFoundError(
136
+ f"File does not exist: {resolved_path}",
137
+ details="Set must_exist=False to allow non-existent files"
138
+ )
139
+
140
+ # Check if it's a file (not directory)
141
+ if must_be_file and resolved_path.exists() and not resolved_path.is_file():
142
+ raise SplurgePathValidationError(
143
+ f"Path is not a file: {resolved_path}",
144
+ details="Path exists but is not a regular file"
145
+ )
146
+
147
+ # Check if file is readable
148
+ if must_be_readable:
149
+ if not resolved_path.exists():
150
+ raise SplurgeFileNotFoundError(
151
+ f"Cannot check readability of non-existent file: {resolved_path}",
152
+ details="File must exist to check readability"
153
+ )
154
+
155
+ if not os.access(resolved_path, os.R_OK):
156
+ raise SplurgeFilePermissionError(
157
+ f"File is not readable: {resolved_path}",
158
+ details="Check file permissions"
159
+ )
160
+
161
+ return resolved_path
162
+
163
+ @classmethod
164
+ def _is_valid_windows_drive_pattern(cls, path_str: str) -> bool:
165
+ """
166
+ Check if a path string contains a valid Windows drive letter pattern.
167
+
168
+ Args:
169
+ path_str: Path string to validate
170
+
171
+ Returns:
172
+ True if the path contains a valid Windows drive letter pattern,
173
+ False otherwise
174
+ """
175
+ # Must be C: at the end of the string, or C:\ (or C:/) followed by path
176
+ return (re.match(r'^[A-Za-z]:$', path_str) or
177
+ re.match(r'^[A-Za-z]:[\\/]', path_str))
178
+
179
+ @classmethod
180
+ def _check_dangerous_characters(cls, path_str: str) -> None:
181
+ """Check for dangerous characters in path string."""
182
+ # Check for dangerous characters, but allow colons in Windows drive letters
183
+ for char in cls._DANGEROUS_CHARS:
184
+ if char in path_str:
185
+ raise SplurgePathValidationError(
186
+ f"Path contains dangerous character: {repr(char)}",
187
+ details=f"Character at position {path_str.find(char)}"
188
+ )
189
+
190
+ # Special handling for colons - only allow them in Windows drive letters (e.g., C:)
191
+ if ':' in path_str:
192
+ if not cls._is_valid_windows_drive_pattern(path_str):
193
+ raise SplurgePathValidationError(
194
+ "Path contains colon in invalid position",
195
+ details="Colons are only allowed in Windows drive letters (e.g., C: or C:\\)"
196
+ )
197
+
198
+ @classmethod
199
+ def _check_path_traversal(cls, path_str: str) -> None:
200
+ """Check for path traversal patterns."""
201
+ for pattern in cls._PATH_TRAVERSAL_PATTERNS:
202
+ if re.search(pattern, path_str):
203
+ raise SplurgePathValidationError(
204
+ f"Path contains traversal pattern: {pattern}",
205
+ details="Path traversal attacks are not allowed"
206
+ )
207
+
208
+ @classmethod
209
+ def _check_path_length(cls, path_str: str) -> None:
210
+ """Check if path length is within acceptable limits."""
211
+ if len(path_str) > cls.MAX_PATH_LENGTH:
212
+ raise SplurgePathValidationError(
213
+ f"Path is too long: {len(path_str)} characters",
214
+ details=f"Maximum allowed length is {cls.MAX_PATH_LENGTH} characters"
215
+ )
216
+
217
+ @classmethod
218
+ def sanitize_filename(cls, filename: str) -> str:
219
+ """
220
+ Sanitize a filename by removing dangerous characters.
221
+
222
+ Args:
223
+ filename: Original filename
224
+
225
+ Returns:
226
+ Sanitized filename
227
+ """
228
+ # Remove or replace dangerous characters
229
+ sanitized = filename
230
+
231
+ # Replace Windows reserved characters
232
+ for char in ['<', '>', ':', '"', '|', '?', '*']:
233
+ sanitized = sanitized.replace(char, '_')
234
+
235
+ # Remove control characters
236
+ sanitized = ''.join(char for char in sanitized if ord(char) >= 32)
237
+
238
+ # Remove leading/trailing spaces and dots
239
+ sanitized = sanitized.strip(' .')
240
+
241
+ # Ensure filename is not empty
242
+ if not sanitized:
243
+ sanitized = _DEFAULT_FILENAME
244
+
245
+ return sanitized
246
+
247
+ @classmethod
248
+ def is_safe_path(cls, file_path: str | Path) -> bool:
249
+ """
250
+ Check if a path is safe without raising exceptions.
251
+
252
+ Args:
253
+ file_path: Path to check
254
+
255
+ Returns:
256
+ True if path is safe, False otherwise
257
+ """
258
+ try:
259
+ cls.validate_path(file_path)
260
+ return True
261
+ except (SplurgePathValidationError, SplurgeFileNotFoundError, SplurgeFilePermissionError):
262
+ return False