splurge-dsv 2025.1.0__tar.gz → 2025.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {splurge_dsv-2025.1.0/splurge_dsv.egg-info → splurge_dsv-2025.1.2}/PKG-INFO +47 -1
- {splurge_dsv-2025.1.0 → splurge_dsv-2025.1.2}/README.md +46 -0
- {splurge_dsv-2025.1.0 → splurge_dsv-2025.1.2}/pyproject.toml +27 -1
- splurge_dsv-2025.1.2/splurge_dsv/__init__.py +84 -0
- splurge_dsv-2025.1.2/splurge_dsv/__main__.py +15 -0
- splurge_dsv-2025.1.2/splurge_dsv/cli.py +158 -0
- {splurge_dsv-2025.1.0 → splurge_dsv-2025.1.2}/splurge_dsv/dsv_helper.py +29 -46
- {splurge_dsv-2025.1.0 → splurge_dsv-2025.1.2}/splurge_dsv/exceptions.py +22 -9
- {splurge_dsv-2025.1.0 → splurge_dsv-2025.1.2}/splurge_dsv/path_validator.py +102 -79
- {splurge_dsv-2025.1.0 → splurge_dsv-2025.1.2}/splurge_dsv/resource_manager.py +77 -138
- {splurge_dsv-2025.1.0 → splurge_dsv-2025.1.2}/splurge_dsv/string_tokenizer.py +5 -24
- {splurge_dsv-2025.1.0 → splurge_dsv-2025.1.2}/splurge_dsv/text_file_helper.py +42 -64
- {splurge_dsv-2025.1.0 → splurge_dsv-2025.1.2/splurge_dsv.egg-info}/PKG-INFO +47 -1
- {splurge_dsv-2025.1.0 → splurge_dsv-2025.1.2}/splurge_dsv.egg-info/SOURCES.txt +2 -7
- splurge_dsv-2025.1.0/splurge_dsv/__init__.py +0 -0
- splurge_dsv-2025.1.0/splurge_dsv/__main__.py +0 -0
- splurge_dsv-2025.1.0/tests/test_dsv_helper.py +0 -521
- splurge_dsv-2025.1.0/tests/test_exceptions.py +0 -255
- splurge_dsv-2025.1.0/tests/test_path_validator.py +0 -413
- splurge_dsv-2025.1.0/tests/test_resource_manager.py +0 -504
- splurge_dsv-2025.1.0/tests/test_string_tokenizer.py +0 -297
- splurge_dsv-2025.1.0/tests/test_text_file_helper.py +0 -580
- {splurge_dsv-2025.1.0 → splurge_dsv-2025.1.2}/LICENSE +0 -0
- {splurge_dsv-2025.1.0 → splurge_dsv-2025.1.2}/setup.cfg +0 -0
- {splurge_dsv-2025.1.0 → splurge_dsv-2025.1.2}/splurge_dsv.egg-info/dependency_links.txt +0 -0
- {splurge_dsv-2025.1.0 → splurge_dsv-2025.1.2}/splurge_dsv.egg-info/requires.txt +0 -0
- {splurge_dsv-2025.1.0 → splurge_dsv-2025.1.2}/splurge_dsv.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: splurge-dsv
|
3
|
-
Version: 2025.1.0
|
3
|
+
Version: 2025.1.2
|
4
4
|
Summary: A utility library for working with DSV (Delimited String Values) files
|
5
5
|
Author: Jim Schilling
|
6
6
|
License-Expression: MIT
|
@@ -243,6 +243,52 @@ The project follows strict coding standards:
|
|
243
243
|
|
244
244
|
## Changelog
|
245
245
|
|
246
|
+
### 2025.1.2 (2025-09-02)
|
247
|
+
|
248
|
+
#### 🧪 Comprehensive End-to-End Testing
|
249
|
+
- **Complete E2E Test Suite**: Implemented 25 comprehensive end-to-end workflow tests covering all major CLI functionality
|
250
|
+
- **Real CLI Execution**: Tests run actual `splurge-dsv` commands with real files, not just mocked components
|
251
|
+
- **Workflow Coverage**: Tests cover CSV/TSV parsing, file operations, data processing, error handling, and performance scenarios
|
252
|
+
- **Cross-Platform Compatibility**: Handles Windows-specific encoding issues and platform differences gracefully
|
253
|
+
- **Performance Testing**: Large file processing tests (1,000+ and 10,000+ rows) with streaming and chunking validation
|
254
|
+
|
255
|
+
#### 📊 Test Coverage Improvements
|
256
|
+
- **CLI Coverage**: Increased from 64% to **95%** with comprehensive CLI workflow testing
|
257
|
+
- **DSV Helper Coverage**: Improved from 75% to **93%** with real-world usage scenarios
|
258
|
+
- **Overall Coverage**: Improved from 60% to **73%** across the entire codebase
|
259
|
+
- **Integration Testing**: Added real file system operations and complete pipeline validation
|
260
|
+
|
261
|
+
#### 🔄 Test Categories
|
262
|
+
- **CLI Workflows**: 19 tests covering basic parsing, custom delimiters, header/footer skipping, streaming, and error scenarios
|
263
|
+
- **Error Handling**: 3 tests for invalid arguments, missing parameters, and CLI error conditions
|
264
|
+
- **Integration Scenarios**: 3 tests for data analysis, transformation, and multi-format workflows
|
265
|
+
|
266
|
+
#### 📚 Documentation & Examples
|
267
|
+
- **E2E Testing Guide**: Created comprehensive documentation (`docs/e2e_testing_coverage.md`) explaining test coverage and usage
|
268
|
+
- **Real-World Examples**: Tests serve as practical examples of library usage patterns
|
269
|
+
- **Error Scenario Coverage**: Comprehensive testing of edge cases and failure conditions
|
270
|
+
|
271
|
+
### 2025.1.1 (2025-08-XX)
|
272
|
+
|
273
|
+
#### 🔧 Code Quality Improvements
|
274
|
+
- **Refactored Complex Regex Logic**: Extracted Windows drive letter validation logic from `_check_dangerous_characters` into a dedicated `_is_valid_windows_drive_pattern` helper method in `PathValidator` for better readability and maintainability
|
275
|
+
- **Exception Handling Consistency**: Fixed inconsistency in `ResourceManager.acquire()` method to properly re-raise `NotImplementedError` without wrapping it in `SplurgeResourceAcquisitionError`
|
276
|
+
- **Import Organization**: Moved all imports to the top of modules across the entire codebase for better code structure and PEP 8 compliance
|
277
|
+
|
278
|
+
#### 🧪 Testing Enhancements
|
279
|
+
- **Public API Focus**: Removed all tests that validated private implementation details, focusing exclusively on public API behavior validation
|
280
|
+
- **Comprehensive Resource Manager Tests**: Added extensive test suite for `ResourceManager` module covering all public methods, edge cases, error scenarios, and context manager behavior
|
281
|
+
- **Bookend Logic Clarification**: Updated and corrected all tests related to `StringTokenizer.remove_bookends` to properly reflect its single-character, symmetric bookend matching behavior
|
282
|
+
- **Path Validation Test Clarity**: Clarified test expectations and comments for Windows drive-relative paths (e.g., "C:file.txt") to reflect the validator's intentionally strict security design
|
283
|
+
|
284
|
+
#### 🐛 Bug Fixes
|
285
|
+
- **Test Reliability**: Fixed failing tests in `ResourceManager` context manager scenarios by properly handling file truncation and line ending normalization
|
286
|
+
- **Ruff Compliance**: Resolved all linting warnings including unused variables and imports
|
287
|
+
|
288
|
+
#### 📚 Documentation Updates
|
289
|
+
- **Method Documentation**: Updated `ResourceManager.acquire()` docstring to include `NotImplementedError` in the Raises section
|
290
|
+
- **Test Comments**: Enhanced test documentation with clearer explanations of expected behaviors and edge cases
|
291
|
+
|
246
292
|
### 2025.1.0 (2025-08-25)
|
247
293
|
|
248
294
|
#### 🎉 Major Features
|
@@ -214,6 +214,52 @@ The project follows strict coding standards:
|
|
214
214
|
|
215
215
|
## Changelog
|
216
216
|
|
217
|
+
### 2025.1.2 (2025-09-02)
|
218
|
+
|
219
|
+
#### 🧪 Comprehensive End-to-End Testing
|
220
|
+
- **Complete E2E Test Suite**: Implemented 25 comprehensive end-to-end workflow tests covering all major CLI functionality
|
221
|
+
- **Real CLI Execution**: Tests run actual `splurge-dsv` commands with real files, not just mocked components
|
222
|
+
- **Workflow Coverage**: Tests cover CSV/TSV parsing, file operations, data processing, error handling, and performance scenarios
|
223
|
+
- **Cross-Platform Compatibility**: Handles Windows-specific encoding issues and platform differences gracefully
|
224
|
+
- **Performance Testing**: Large file processing tests (1,000+ and 10,000+ rows) with streaming and chunking validation
|
225
|
+
|
226
|
+
#### 📊 Test Coverage Improvements
|
227
|
+
- **CLI Coverage**: Increased from 64% to **95%** with comprehensive CLI workflow testing
|
228
|
+
- **DSV Helper Coverage**: Improved from 75% to **93%** with real-world usage scenarios
|
229
|
+
- **Overall Coverage**: Improved from 60% to **73%** across the entire codebase
|
230
|
+
- **Integration Testing**: Added real file system operations and complete pipeline validation
|
231
|
+
|
232
|
+
#### 🔄 Test Categories
|
233
|
+
- **CLI Workflows**: 19 tests covering basic parsing, custom delimiters, header/footer skipping, streaming, and error scenarios
|
234
|
+
- **Error Handling**: 3 tests for invalid arguments, missing parameters, and CLI error conditions
|
235
|
+
- **Integration Scenarios**: 3 tests for data analysis, transformation, and multi-format workflows
|
236
|
+
|
237
|
+
#### 📚 Documentation & Examples
|
238
|
+
- **E2E Testing Guide**: Created comprehensive documentation (`docs/e2e_testing_coverage.md`) explaining test coverage and usage
|
239
|
+
- **Real-World Examples**: Tests serve as practical examples of library usage patterns
|
240
|
+
- **Error Scenario Coverage**: Comprehensive testing of edge cases and failure conditions
|
241
|
+
|
242
|
+
### 2025.1.1 (2025-08-XX)
|
243
|
+
|
244
|
+
#### 🔧 Code Quality Improvements
|
245
|
+
- **Refactored Complex Regex Logic**: Extracted Windows drive letter validation logic from `_check_dangerous_characters` into a dedicated `_is_valid_windows_drive_pattern` helper method in `PathValidator` for better readability and maintainability
|
246
|
+
- **Exception Handling Consistency**: Fixed inconsistency in `ResourceManager.acquire()` method to properly re-raise `NotImplementedError` without wrapping it in `SplurgeResourceAcquisitionError`
|
247
|
+
- **Import Organization**: Moved all imports to the top of modules across the entire codebase for better code structure and PEP 8 compliance
|
248
|
+
|
249
|
+
#### 🧪 Testing Enhancements
|
250
|
+
- **Public API Focus**: Removed all tests that validated private implementation details, focusing exclusively on public API behavior validation
|
251
|
+
- **Comprehensive Resource Manager Tests**: Added extensive test suite for `ResourceManager` module covering all public methods, edge cases, error scenarios, and context manager behavior
|
252
|
+
- **Bookend Logic Clarification**: Updated and corrected all tests related to `StringTokenizer.remove_bookends` to properly reflect its single-character, symmetric bookend matching behavior
|
253
|
+
- **Path Validation Test Clarity**: Clarified test expectations and comments for Windows drive-relative paths (e.g., "C:file.txt") to reflect the validator's intentionally strict security design
|
254
|
+
|
255
|
+
#### 🐛 Bug Fixes
|
256
|
+
- **Test Reliability**: Fixed failing tests in `ResourceManager` context manager scenarios by properly handling file truncation and line ending normalization
|
257
|
+
- **Ruff Compliance**: Resolved all linting warnings including unused variables and imports
|
258
|
+
|
259
|
+
#### 📚 Documentation Updates
|
260
|
+
- **Method Documentation**: Updated `ResourceManager.acquire()` docstring to include `NotImplementedError` in the Raises section
|
261
|
+
- **Test Comments**: Enhanced test documentation with clearer explanations of expected behaviors and edge cases
|
262
|
+
|
217
263
|
### 2025.1.0 (2025-08-25)
|
218
264
|
|
219
265
|
#### 🎉 Major Features
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "splurge-dsv"
|
7
|
-
version = "2025.1.0"
|
7
|
+
version = "2025.1.2"
|
8
8
|
description = "A utility library for working with DSV (Delimited String Values) files"
|
9
9
|
readme = "README.md"
|
10
10
|
requires-python = ">=3.10"
|
@@ -82,3 +82,29 @@ exclude_lines = [
|
|
82
82
|
|
83
83
|
[tool.coverage.html]
|
84
84
|
directory = "htmlcov"
|
85
|
+
|
86
|
+
[tool.ruff]
|
87
|
+
target-version = "py310"
|
88
|
+
line-length = 120
|
89
|
+
|
90
|
+
[tool.ruff.lint]
|
91
|
+
select = [
|
92
|
+
"E", # pycodestyle errors
|
93
|
+
"W", # pycodestyle warnings
|
94
|
+
"F", # pyflakes
|
95
|
+
"I", # isort
|
96
|
+
"B", # flake8-bugbear
|
97
|
+
"C4", # flake8-comprehensions
|
98
|
+
"UP", # pyupgrade
|
99
|
+
]
|
100
|
+
ignore = [
|
101
|
+
"E501", # line too long, handled by line-length
|
102
|
+
"B008", # do not perform function calls in argument defaults
|
103
|
+
"C901", # too complex
|
104
|
+
]
|
105
|
+
|
106
|
+
[tool.ruff.format]
|
107
|
+
quote-style = "double"
|
108
|
+
indent-style = "space"
|
109
|
+
skip-magic-trailing-comma = false
|
110
|
+
line-ending = "auto"
|
@@ -0,0 +1,84 @@
|
|
1
|
+
"""
|
2
|
+
Splurge DSV - A utility library for working with DSV (Delimited String Values) files.
|
3
|
+
|
4
|
+
This package provides utilities for parsing, processing, and manipulating
|
5
|
+
delimited string value files with support for various delimiters, text bookends,
|
6
|
+
and streaming operations.
|
7
|
+
|
8
|
+
Copyright (c) 2025 Jim Schilling
|
9
|
+
|
10
|
+
This module is licensed under the MIT License.
|
11
|
+
"""
|
12
|
+
|
13
|
+
# Local imports
|
14
|
+
from splurge_dsv.dsv_helper import DsvHelper
|
15
|
+
from splurge_dsv.exceptions import (
|
16
|
+
SplurgeConfigurationError,
|
17
|
+
SplurgeDataProcessingError,
|
18
|
+
SplurgeDsvError,
|
19
|
+
SplurgeFileEncodingError,
|
20
|
+
SplurgeFileNotFoundError,
|
21
|
+
SplurgeFileOperationError,
|
22
|
+
SplurgeFilePermissionError,
|
23
|
+
SplurgeFormatError,
|
24
|
+
SplurgeParameterError,
|
25
|
+
SplurgeParsingError,
|
26
|
+
SplurgePathValidationError,
|
27
|
+
SplurgePerformanceWarning,
|
28
|
+
SplurgeRangeError,
|
29
|
+
SplurgeResourceAcquisitionError,
|
30
|
+
SplurgeResourceError,
|
31
|
+
SplurgeResourceReleaseError,
|
32
|
+
SplurgeStreamingError,
|
33
|
+
SplurgeTypeConversionError,
|
34
|
+
SplurgeValidationError,
|
35
|
+
)
|
36
|
+
from splurge_dsv.path_validator import PathValidator
|
37
|
+
from splurge_dsv.resource_manager import (
|
38
|
+
FileResourceManager,
|
39
|
+
ResourceManager,
|
40
|
+
StreamResourceManager,
|
41
|
+
safe_file_operation,
|
42
|
+
safe_stream_operation,
|
43
|
+
)
|
44
|
+
from splurge_dsv.string_tokenizer import StringTokenizer
|
45
|
+
from splurge_dsv.text_file_helper import TextFileHelper
|
46
|
+
|
47
|
+
__version__ = "2025.1.2"
|
48
|
+
__author__ = "Jim Schilling"
|
49
|
+
__license__ = "MIT"
|
50
|
+
|
51
|
+
__all__ = [
|
52
|
+
# Main helper class
|
53
|
+
"DsvHelper",
|
54
|
+
# Exceptions
|
55
|
+
"SplurgeDsvError",
|
56
|
+
"SplurgeValidationError",
|
57
|
+
"SplurgeFileOperationError",
|
58
|
+
"SplurgeFileNotFoundError",
|
59
|
+
"SplurgeFilePermissionError",
|
60
|
+
"SplurgeFileEncodingError",
|
61
|
+
"SplurgePathValidationError",
|
62
|
+
"SplurgeDataProcessingError",
|
63
|
+
"SplurgeParsingError",
|
64
|
+
"SplurgeTypeConversionError",
|
65
|
+
"SplurgeStreamingError",
|
66
|
+
"SplurgeConfigurationError",
|
67
|
+
"SplurgeResourceError",
|
68
|
+
"SplurgeResourceAcquisitionError",
|
69
|
+
"SplurgeResourceReleaseError",
|
70
|
+
"SplurgePerformanceWarning",
|
71
|
+
"SplurgeParameterError",
|
72
|
+
"SplurgeRangeError",
|
73
|
+
"SplurgeFormatError",
|
74
|
+
# Utility classes
|
75
|
+
"StringTokenizer",
|
76
|
+
"TextFileHelper",
|
77
|
+
"PathValidator",
|
78
|
+
"ResourceManager",
|
79
|
+
"FileResourceManager",
|
80
|
+
"StreamResourceManager",
|
81
|
+
# Context managers
|
82
|
+
"safe_file_operation",
|
83
|
+
"safe_stream_operation",
|
84
|
+
]
|
@@ -0,0 +1,15 @@
|
|
1
|
+
"""
|
2
|
+
Command-line interface entry point for splurge-dsv.
|
3
|
+
|
4
|
+
This module serves as the entry point when running the package as a module.
|
5
|
+
It imports and calls the main CLI function from the cli module.
|
6
|
+
"""
|
7
|
+
|
8
|
+
# Standard library imports
|
9
|
+
import sys
|
10
|
+
|
11
|
+
# Local imports
|
12
|
+
from splurge_dsv.cli import main
|
13
|
+
|
14
|
+
if __name__ == "__main__":
|
15
|
+
sys.exit(main())
|
@@ -0,0 +1,158 @@
|
|
1
|
+
"""
|
2
|
+
Command-line interface for splurge-dsv.
|
3
|
+
|
4
|
+
This module provides a command-line interface for the splurge-dsv library,
|
5
|
+
allowing users to parse DSV files from the command line.
|
6
|
+
|
7
|
+
Usage:
|
8
|
+
python -m splurge_dsv <file_path> [options]
|
9
|
+
python -m splurge_dsv --help
|
10
|
+
"""
|
11
|
+
|
12
|
+
# Standard library imports
|
13
|
+
import argparse
|
14
|
+
import sys
|
15
|
+
from pathlib import Path
|
16
|
+
|
17
|
+
# Local imports
|
18
|
+
from splurge_dsv.dsv_helper import DsvHelper
|
19
|
+
from splurge_dsv.exceptions import SplurgeDsvError
|
20
|
+
|
21
|
+
|
22
|
+
def parse_arguments() -> argparse.Namespace:
|
23
|
+
"""Parse command line arguments."""
|
24
|
+
parser = argparse.ArgumentParser(
|
25
|
+
description="Parse DSV (Delimited String Values) files",
|
26
|
+
formatter_class=argparse.RawDescriptionHelpFormatter,
|
27
|
+
epilog="""
|
28
|
+
Examples:
|
29
|
+
python -m splurge_dsv data.csv --delimiter ,
|
30
|
+
python -m splurge_dsv data.tsv --delimiter "\\t"
|
31
|
+
python -m splurge_dsv data.txt --delimiter "|" --bookend '"'
|
32
|
+
""",
|
33
|
+
)
|
34
|
+
|
35
|
+
parser.add_argument("file_path", type=str, help="Path to the DSV file to parse")
|
36
|
+
|
37
|
+
parser.add_argument("--delimiter", "-d", type=str, required=True, help="Delimiter character to use for parsing")
|
38
|
+
|
39
|
+
parser.add_argument("--bookend", "-b", type=str, help="Bookend character for text fields (e.g., '\"')")
|
40
|
+
|
41
|
+
parser.add_argument("--no-strip", action="store_true", help="Don't strip whitespace from values")
|
42
|
+
|
43
|
+
parser.add_argument("--no-bookend-strip", action="store_true", help="Don't strip whitespace from bookends")
|
44
|
+
|
45
|
+
parser.add_argument("--encoding", "-e", type=str, default="utf-8", help="File encoding (default: utf-8)")
|
46
|
+
|
47
|
+
parser.add_argument("--skip-header", type=int, default=0, help="Number of header rows to skip (default: 0)")
|
48
|
+
|
49
|
+
parser.add_argument("--skip-footer", type=int, default=0, help="Number of footer rows to skip (default: 0)")
|
50
|
+
|
51
|
+
parser.add_argument(
|
52
|
+
"--stream", "-s", action="store_true", help="Stream the file in chunks instead of loading entirely into memory"
|
53
|
+
)
|
54
|
+
|
55
|
+
parser.add_argument("--chunk-size", type=int, default=500, help="Chunk size for streaming (default: 500)")
|
56
|
+
|
57
|
+
parser.add_argument("--version", action="version", version="%(prog)s 2025.1.2")
|
58
|
+
|
59
|
+
return parser.parse_args()
|
60
|
+
|
61
|
+
|
62
|
+
def print_results(rows: list[list[str]], delimiter: str) -> None:
|
63
|
+
"""Print parsed results in a formatted way."""
|
64
|
+
if not rows:
|
65
|
+
print("No data found.")
|
66
|
+
return
|
67
|
+
|
68
|
+
# Find the maximum width for each column
|
69
|
+
if rows:
|
70
|
+
max_widths = []
|
71
|
+
for col_idx in range(len(rows[0])):
|
72
|
+
max_width = max(len(str(row[col_idx])) for row in rows)
|
73
|
+
max_widths.append(max_width)
|
74
|
+
|
75
|
+
# Print header separator
|
76
|
+
print("-" * (sum(max_widths) + len(max_widths) * 3 - 1))
|
77
|
+
|
78
|
+
# Print each row
|
79
|
+
for row_idx, row in enumerate(rows):
|
80
|
+
formatted_row = []
|
81
|
+
for col_idx, value in enumerate(row):
|
82
|
+
formatted_value = str(value).ljust(max_widths[col_idx])
|
83
|
+
formatted_row.append(formatted_value)
|
84
|
+
print(f"| {' | '.join(formatted_row)} |")
|
85
|
+
|
86
|
+
# Print separator after header
|
87
|
+
if row_idx == 0:
|
88
|
+
print("-" * (sum(max_widths) + len(max_widths) * 3 - 1))
|
89
|
+
|
90
|
+
|
91
|
+
def main() -> int:
|
92
|
+
"""Main entry point for the command-line interface."""
|
93
|
+
try:
|
94
|
+
args = parse_arguments()
|
95
|
+
|
96
|
+
# Validate file path
|
97
|
+
file_path = Path(args.file_path)
|
98
|
+
if not file_path.exists():
|
99
|
+
print(f"Error: File '{args.file_path}' not found.", file=sys.stderr)
|
100
|
+
return 1
|
101
|
+
|
102
|
+
if not file_path.is_file():
|
103
|
+
print(f"Error: '{args.file_path}' is not a file.", file=sys.stderr)
|
104
|
+
return 1
|
105
|
+
|
106
|
+
# Parse the file
|
107
|
+
if args.stream:
|
108
|
+
print(f"Streaming file '{args.file_path}' with delimiter '{args.delimiter}'...")
|
109
|
+
chunk_count = 0
|
110
|
+
total_rows = 0
|
111
|
+
|
112
|
+
for chunk in DsvHelper.parse_stream(
|
113
|
+
file_path,
|
114
|
+
delimiter=args.delimiter,
|
115
|
+
strip=not args.no_strip,
|
116
|
+
bookend=args.bookend,
|
117
|
+
bookend_strip=not args.no_bookend_strip,
|
118
|
+
encoding=args.encoding,
|
119
|
+
skip_header_rows=args.skip_header,
|
120
|
+
skip_footer_rows=args.skip_footer,
|
121
|
+
chunk_size=args.chunk_size,
|
122
|
+
):
|
123
|
+
chunk_count += 1
|
124
|
+
total_rows += len(chunk)
|
125
|
+
print(f"Chunk {chunk_count}: {len(chunk)} rows")
|
126
|
+
print_results(chunk, args.delimiter)
|
127
|
+
print()
|
128
|
+
|
129
|
+
print(f"Total: {total_rows} rows in {chunk_count} chunks")
|
130
|
+
else:
|
131
|
+
print(f"Parsing file '{args.file_path}' with delimiter '{args.delimiter}'...")
|
132
|
+
rows = DsvHelper.parse_file(
|
133
|
+
file_path,
|
134
|
+
delimiter=args.delimiter,
|
135
|
+
strip=not args.no_strip,
|
136
|
+
bookend=args.bookend,
|
137
|
+
bookend_strip=not args.no_bookend_strip,
|
138
|
+
encoding=args.encoding,
|
139
|
+
skip_header_rows=args.skip_header,
|
140
|
+
skip_footer_rows=args.skip_footer,
|
141
|
+
)
|
142
|
+
|
143
|
+
print(f"Parsed {len(rows)} rows")
|
144
|
+
print_results(rows, args.delimiter)
|
145
|
+
|
146
|
+
return 0
|
147
|
+
|
148
|
+
except KeyboardInterrupt:
|
149
|
+
print("\nOperation cancelled by user.", file=sys.stderr)
|
150
|
+
return 130
|
151
|
+
except SplurgeDsvError as e:
|
152
|
+
print(f"Error: {e.message}", file=sys.stderr)
|
153
|
+
if e.details:
|
154
|
+
print(f"Details: {e.details}", file=sys.stderr)
|
155
|
+
return 1
|
156
|
+
except Exception as e:
|
157
|
+
print(f"Unexpected error: {e}", file=sys.stderr)
|
158
|
+
return 1
|
@@ -8,12 +8,15 @@ Please preserve this header and all related material when sharing!
|
|
8
8
|
This module is licensed under the MIT License.
|
9
9
|
"""
|
10
10
|
|
11
|
+
# Standard library imports
|
12
|
+
from collections.abc import Iterator
|
11
13
|
from os import PathLike
|
12
|
-
from typing import Iterator
|
13
14
|
|
15
|
+
# Local imports
|
16
|
+
from splurge_dsv.exceptions import SplurgeParameterError
|
14
17
|
from splurge_dsv.string_tokenizer import StringTokenizer
|
15
18
|
from splurge_dsv.text_file_helper import TextFileHelper
|
16
|
-
|
19
|
+
|
17
20
|
|
18
21
|
class DsvHelper:
|
19
22
|
"""
|
@@ -38,7 +41,7 @@ class DsvHelper:
|
|
38
41
|
delimiter: str,
|
39
42
|
strip: bool = DEFAULT_STRIP,
|
40
43
|
bookend: str | None = None,
|
41
|
-
bookend_strip: bool = DEFAULT_BOOKEND_STRIP
|
44
|
+
bookend_strip: bool = DEFAULT_BOOKEND_STRIP,
|
42
45
|
) -> list[str]:
|
43
46
|
"""
|
44
47
|
Parse a string into a list of strings.
|
@@ -68,10 +71,7 @@ class DsvHelper:
|
|
68
71
|
tokens: list[str] = StringTokenizer.parse(content, delimiter=delimiter, strip=strip)
|
69
72
|
|
70
73
|
if bookend:
|
71
|
-
tokens = [
|
72
|
-
StringTokenizer.remove_bookends(token, bookend=bookend, strip=bookend_strip)
|
73
|
-
for token in tokens
|
74
|
-
]
|
74
|
+
tokens = [StringTokenizer.remove_bookends(token, bookend=bookend, strip=bookend_strip) for token in tokens]
|
75
75
|
|
76
76
|
return tokens
|
77
77
|
|
@@ -83,7 +83,7 @@ class DsvHelper:
|
|
83
83
|
delimiter: str,
|
84
84
|
strip: bool = DEFAULT_STRIP,
|
85
85
|
bookend: str | None = None,
|
86
|
-
bookend_strip: bool = DEFAULT_BOOKEND_STRIP
|
86
|
+
bookend_strip: bool = DEFAULT_BOOKEND_STRIP,
|
87
87
|
) -> list[list[str]]:
|
88
88
|
"""
|
89
89
|
Parse a list of strings into a list of lists of strings.
|
@@ -108,7 +108,7 @@ class DsvHelper:
|
|
108
108
|
"""
|
109
109
|
if not isinstance(content, list):
|
110
110
|
raise SplurgeParameterError("content must be a list")
|
111
|
-
|
111
|
+
|
112
112
|
if not all(isinstance(item, str) for item in content):
|
113
113
|
raise SplurgeParameterError("content must be a list of strings")
|
114
114
|
|
@@ -128,7 +128,7 @@ class DsvHelper:
|
|
128
128
|
bookend_strip: bool = DEFAULT_BOOKEND_STRIP,
|
129
129
|
encoding: str = DEFAULT_ENCODING,
|
130
130
|
skip_header_rows: int = DEFAULT_SKIP_HEADER_ROWS,
|
131
|
-
skip_footer_rows: int = DEFAULT_SKIP_FOOTER_ROWS
|
131
|
+
skip_footer_rows: int = DEFAULT_SKIP_FOOTER_ROWS,
|
132
132
|
) -> list[list[str]]:
|
133
133
|
"""
|
134
134
|
Parse a file into a list of lists of strings.
|
@@ -157,19 +157,10 @@ class DsvHelper:
|
|
157
157
|
[['header1', 'header2'], ['value1', 'value2']]
|
158
158
|
"""
|
159
159
|
lines: list[str] = TextFileHelper.read(
|
160
|
-
file_path,
|
161
|
-
encoding=encoding,
|
162
|
-
skip_header_rows=skip_header_rows,
|
163
|
-
skip_footer_rows=skip_footer_rows
|
160
|
+
file_path, encoding=encoding, skip_header_rows=skip_header_rows, skip_footer_rows=skip_footer_rows
|
164
161
|
)
|
165
162
|
|
166
|
-
return cls.parses(
|
167
|
-
lines,
|
168
|
-
delimiter=delimiter,
|
169
|
-
strip=strip,
|
170
|
-
bookend=bookend,
|
171
|
-
bookend_strip=bookend_strip
|
172
|
-
)
|
163
|
+
return cls.parses(lines, delimiter=delimiter, strip=strip, bookend=bookend, bookend_strip=bookend_strip)
|
173
164
|
|
174
165
|
@classmethod
|
175
166
|
def _process_stream_chunk(
|
@@ -179,28 +170,22 @@ class DsvHelper:
|
|
179
170
|
delimiter: str,
|
180
171
|
strip: bool = DEFAULT_STRIP,
|
181
172
|
bookend: str | None = None,
|
182
|
-
bookend_strip: bool = DEFAULT_BOOKEND_STRIP
|
173
|
+
bookend_strip: bool = DEFAULT_BOOKEND_STRIP,
|
183
174
|
) -> list[list[str]]:
|
184
175
|
"""
|
185
176
|
Process a chunk of lines from the stream.
|
186
|
-
|
177
|
+
|
187
178
|
Args:
|
188
179
|
chunk: List of lines to process
|
189
180
|
delimiter: Delimiter to use for parsing
|
190
181
|
strip: Whether to strip whitespace
|
191
182
|
bookend: Bookend character for text fields
|
192
183
|
bookend_strip: Whether to strip whitespace from bookends
|
193
|
-
|
184
|
+
|
194
185
|
Returns:
|
195
186
|
list[list[str]]: Parsed rows
|
196
187
|
"""
|
197
|
-
return cls.parses(
|
198
|
-
chunk,
|
199
|
-
delimiter=delimiter,
|
200
|
-
strip=strip,
|
201
|
-
bookend=bookend,
|
202
|
-
bookend_strip=bookend_strip
|
203
|
-
)
|
188
|
+
return cls.parses(chunk, delimiter=delimiter, strip=strip, bookend=bookend, bookend_strip=bookend_strip)
|
204
189
|
|
205
190
|
@classmethod
|
206
191
|
def parse_stream(
|
@@ -214,7 +199,7 @@ class DsvHelper:
|
|
214
199
|
encoding: str = DEFAULT_ENCODING,
|
215
200
|
skip_header_rows: int = DEFAULT_SKIP_HEADER_ROWS,
|
216
201
|
skip_footer_rows: int = DEFAULT_SKIP_FOOTER_ROWS,
|
217
|
-
chunk_size: int = DEFAULT_CHUNK_SIZE
|
202
|
+
chunk_size: int = DEFAULT_CHUNK_SIZE,
|
218
203
|
) -> Iterator[list[list[str]]]:
|
219
204
|
"""
|
220
205
|
Stream-parse a DSV file in chunks of lines.
|
@@ -247,17 +232,15 @@ class DsvHelper:
|
|
247
232
|
skip_footer_rows = max(skip_footer_rows, cls.DEFAULT_SKIP_FOOTER_ROWS)
|
248
233
|
|
249
234
|
# Use TextFileHelper.read_as_stream for consistent error handling
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
bookend_strip=bookend_strip
|
263
|
-
)
|
235
|
+
yield from (
|
236
|
+
cls._process_stream_chunk(
|
237
|
+
chunk, delimiter=delimiter, strip=strip, bookend=bookend, bookend_strip=bookend_strip
|
238
|
+
)
|
239
|
+
for chunk in TextFileHelper.read_as_stream(
|
240
|
+
file_path,
|
241
|
+
encoding=encoding,
|
242
|
+
skip_header_rows=skip_header_rows,
|
243
|
+
skip_footer_rows=skip_footer_rows,
|
244
|
+
chunk_size=chunk_size,
|
245
|
+
)
|
246
|
+
)
|