splurge-dsv 2025.2.1__tar.gz → 2025.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {splurge_dsv-2025.2.1/splurge_dsv.egg-info → splurge_dsv-2025.3.1}/PKG-INFO +78 -5
- {splurge_dsv-2025.2.1 → splurge_dsv-2025.3.1}/README.md +75 -3
- {splurge_dsv-2025.2.1 → splurge_dsv-2025.3.1}/pyproject.toml +8 -4
- {splurge_dsv-2025.2.1 → splurge_dsv-2025.3.1}/splurge_dsv/__init__.py +16 -5
- {splurge_dsv-2025.2.1 → splurge_dsv-2025.3.1}/splurge_dsv/cli.py +137 -26
- {splurge_dsv-2025.2.1 → splurge_dsv-2025.3.1}/splurge_dsv/dsv.py +100 -30
- splurge_dsv-2025.3.1/splurge_dsv/dsv_helper.py +631 -0
- {splurge_dsv-2025.2.1 → splurge_dsv-2025.3.1}/splurge_dsv/exceptions.py +22 -1
- {splurge_dsv-2025.2.1 → splurge_dsv-2025.3.1}/splurge_dsv/string_tokenizer.py +7 -1
- {splurge_dsv-2025.2.1 → splurge_dsv-2025.3.1/splurge_dsv.egg-info}/PKG-INFO +78 -5
- {splurge_dsv-2025.2.1 → splurge_dsv-2025.3.1}/splurge_dsv.egg-info/SOURCES.txt +0 -4
- {splurge_dsv-2025.2.1 → splurge_dsv-2025.3.1}/splurge_dsv.egg-info/requires.txt +2 -1
- splurge_dsv-2025.2.1/splurge_dsv/dsv_helper.py +0 -306
- splurge_dsv-2025.2.1/splurge_dsv/path_validator.py +0 -298
- splurge_dsv-2025.2.1/splurge_dsv/safe_text_file_reader.py +0 -177
- splurge_dsv-2025.2.1/splurge_dsv/safe_text_file_writer.py +0 -136
- splurge_dsv-2025.2.1/splurge_dsv/text_file_helper.py +0 -240
- {splurge_dsv-2025.2.1 → splurge_dsv-2025.3.1}/LICENSE +0 -0
- {splurge_dsv-2025.2.1 → splurge_dsv-2025.3.1}/setup.cfg +0 -0
- {splurge_dsv-2025.2.1 → splurge_dsv-2025.3.1}/splurge_dsv/__main__.py +0 -0
- {splurge_dsv-2025.2.1 → splurge_dsv-2025.3.1}/splurge_dsv.egg-info/dependency_links.txt +0 -0
- {splurge_dsv-2025.2.1 → splurge_dsv-2025.3.1}/splurge_dsv.egg-info/entry_points.txt +0 -0
- {splurge_dsv-2025.2.1 → splurge_dsv-2025.3.1}/splurge_dsv.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: splurge-dsv
|
3
|
-
Version: 2025.2.1
|
3
|
+
Version: 2025.3.1
|
4
4
|
Summary: A utility library for working with DSV (Delimited String Values) files
|
5
5
|
Author: Jim Schilling
|
6
6
|
License-Expression: MIT
|
@@ -21,10 +21,11 @@ Classifier: Topic :: Text Processing :: Filters
|
|
21
21
|
Requires-Python: >=3.10
|
22
22
|
Description-Content-Type: text/markdown
|
23
23
|
License-File: LICENSE
|
24
|
+
Requires-Dist: splurge-safe-io>=2025.0.5
|
25
|
+
Requires-Dist: PyYAML>=6.0
|
24
26
|
Provides-Extra: dev
|
25
27
|
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
26
28
|
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
27
|
-
Requires-Dist: pytest-xdist>=3.0.0; extra == "dev"
|
28
29
|
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
29
30
|
Requires-Dist: ruff>=0.0.241; extra == "dev"
|
30
31
|
Requires-Dist: pytest-mock>=3.0.0; extra == "dev"
|
@@ -38,7 +39,7 @@ Dynamic: license-file
|
|
38
39
|
[](https://opensource.org/licenses/MIT)
|
39
40
|
|
40
41
|
[](https://github.com/jim-schilling/splurge-dsv/actions/workflows/ci-quick-test.yml)
|
41
|
-
[](https://github.com/jim-schilling/splurge-dsv)
|
42
43
|
[](https://github.com/astral-sh/ruff)
|
43
44
|
[](https://mypy-lang.org/)
|
44
45
|
|
@@ -46,12 +47,49 @@ A robust Python library for parsing and processing delimited-separated value (DS
|
|
46
47
|
|
47
48
|
## Features
|
48
49
|
|
49
|
-
- **Multi-format DSV Support**: Parse CSV, TSV, pipe-delimited, and custom delimiter files
|
50
|
+
- **Multi-format DSV Support**: Parse CSV, TSV, pipe-delimited, and custom delimiter separated value files/objects
|
51
|
+
- **Configurable Parsing**: Flexible options for delimiters, quote characters, escape characters, header/footer row(s) handling
|
50
52
|
- **Memory-Efficient Streaming**: Process large files without loading entire content into memory
|
51
53
|
- **Security & Validation**: Comprehensive path validation and file permission checks
|
52
54
|
- **Unicode Support**: Full Unicode character and encoding support
|
53
55
|
- **Type Safety**: Full type annotations with mypy validation
|
54
|
-
- **
|
56
|
+
- **Deterministic Newline Handling**: Consistent handling of CRLF, CR, and LF newlines across platforms
|
57
|
+
- **CLI Tool**: Command-line interface for quick parsing and inspection of DSV files
|
58
|
+
- **Robust Error Handling**: Clear and specific exceptions for various error scenarios
|
59
|
+
- **Modern API**: Object-oriented API with `Dsv` and `DsvConfig` classes for easy configuration and reuse
|
60
|
+
- **Comprehensive Documentation**: In-depth API reference and usage examples
|
61
|
+
- **Exhaustive Testing**: 283 tests with 93% code coverage including property-based testing, edge case testing, and cross-platform compatibility validation
|
62
|
+
|
63
|
+
**⚠️ CHANGES in v2025.3.1**
|
64
|
+
> - **skip_empty_lines** option added to `DsvConfig`, `DsvHelper`, and CLI.
|
65
|
+
> - This option allows users to skip logical empty lines when parsing DSV files.
|
66
|
+
|
67
|
+
**⚠️ CHANGES in v2025.3.0**
|
68
|
+
> - **Commit-Only Release**: v2025.3.0 is a commit-only release and will not be published to PyPI.
|
69
|
+
> - The legacy `parse_stream()` helpers were removed in release 2025.3.0.
|
70
|
+
> - Use `parse_file_stream()` on `Dsv`/`DsvHelper` for stream-based parsing of files. This standardizes the API naming and clarifies that streaming helpers accept file paths rather than arbitrary iterables.
|
71
|
+
> - TextFileHelper, SafeTextFileReader, SafeTextFileWriter, and PathValidator, as well as all their associated tests have been removed in this release.
|
72
|
+
> - Their functionality has moved to the `splurge-safe-io` package, which provides robust and secure file I/O operations.
|
73
|
+
> - This change reduces code duplication and improves maintainability by leveraging the functionality of `splurge-safe-io`.
|
74
|
+
> - Users should refer to the `splurge-safe-io` documentation for details on its usage and features.
|
75
|
+
> - **See API-REFERENCE.md for migration guidance and complete reference documentation, with usage examples.**
|
76
|
+
|
77
|
+
**⚠️ CHANGES in v2025.2.2**
|
78
|
+
> - **Deprecation Warning**: The following modules and their associated classes and functions are deprecated and will be removed in a future release (2025.3.0). Users are encouraged to transition to the `splurge-safe-io` package for these functionalities:
|
79
|
+
> - `splurge_dsv.safe_text_file_reader`
|
80
|
+
> - `splurge_dsv.safe_text_file_writer`
|
81
|
+
> - `splurge_dsv.path_validator`
|
82
|
+
> - `splurge_dsv.text_file_helper`
|
83
|
+
> - **New Exception**: Added `SplurgeDsvFileExistsError` to handle file existence errors.
|
84
|
+
> - **Fixed Exception Mapping**: Many errors were incorrectly mapped to SplurgeDsvEncodingError; this has been corrected to use appropriate exception types.
|
85
|
+
> - Some exceptions were not mapped to any SplurgeDsv* exception; these have also been corrected.
|
86
|
+
> - **3rd-Party Dependency Additions**: Added `splurge-safe-io (v2025.0.4)`.
|
87
|
+
> - `splurge-safe-io` is a new dependency that provides robust and secure file I/O operations, including safe text file reading and writing with deterministic newline handling and path validation.
|
88
|
+
> - This change reduces code duplication and improves maintainability by leveraging the functionality of `splurge-safe-io`.
|
89
|
+
> - Users should refer to the `splurge-safe-io` documentation for details on its usage and features.
|
90
|
+
> - **Code Refactoring**: Refactored `SafeTextFileReader`, `SafeTextFileWriter`, and `PathValidator` to utilize `splurge-safe-io` implementations internally, ensuring consistent behavior and reducing maintenance overhead.
|
91
|
+
> - **This release maintains backward compatibility** for existing users, but users are encouraged to transition to `splurge-safe-io` for future-proofing their codebases.
|
92
|
+
> - **_This release is a commit-only release and will not be published to PyPI._**
|
55
93
|
|
56
94
|
**⚠️ BREAKING CHANGES in v2025.2.0**
|
57
95
|
>
|
@@ -78,6 +116,41 @@ python -m splurge_dsv data.csv --delimiter ,
|
|
78
116
|
python -m splurge_dsv large_file.csv --delimiter , --stream --chunk-size 1000
|
79
117
|
```
|
80
118
|
|
119
|
+
### YAML configuration file
|
120
|
+
|
121
|
+
You can place CLI-equivalent options in a YAML file and pass it to the CLI
|
122
|
+
using `--config` (or `-c`). CLI arguments override values found in the
|
123
|
+
YAML file. Example `config.yaml`:
|
124
|
+
|
125
|
+
```yaml
|
126
|
+
delimiter: ","
|
127
|
+
strip: true
|
128
|
+
bookend: '"'
|
129
|
+
encoding: utf-8
|
130
|
+
skip_header_rows: 1
|
131
|
+
skip_footer_rows: 0
|
132
|
+
skip_empty_lines: false
|
133
|
+
detect_columns: true
|
134
|
+
chunk_size: 500
|
135
|
+
max_detect_chunks: 5
|
136
|
+
raise_on_missing_columns: false
|
137
|
+
raise_on_extra_columns: false
|
138
|
+
```
|
139
|
+
|
140
|
+
Usage with CLI:
|
141
|
+
|
142
|
+
```bash
|
143
|
+
python -m splurge_dsv data.csv --config config.yaml --delimiter "|"
|
144
|
+
# The CLI delimiter '|' overrides the YAML delimiter
|
145
|
+
```
|
146
|
+
|
147
|
+
Example using the shipped example config in the repository:
|
148
|
+
|
149
|
+
```bash
|
150
|
+
# Use the example file provided at examples/config.yaml
|
151
|
+
python -m splurge_dsv data.csv --config examples/config.yaml
|
152
|
+
```
|
153
|
+
|
81
154
|
### API Usage
|
82
155
|
|
83
156
|
```python
|
@@ -5,7 +5,7 @@
|
|
5
5
|
[](https://opensource.org/licenses/MIT)
|
6
6
|
|
7
7
|
[](https://github.com/jim-schilling/splurge-dsv/actions/workflows/ci-quick-test.yml)
|
8
|
-
[](https://github.com/jim-schilling/splurge-dsv)
|
9
9
|
[](https://github.com/astral-sh/ruff)
|
10
10
|
[](https://mypy-lang.org/)
|
11
11
|
|
@@ -13,12 +13,49 @@ A robust Python library for parsing and processing delimited-separated value (DS
|
|
13
13
|
|
14
14
|
## Features
|
15
15
|
|
16
|
-
- **Multi-format DSV Support**: Parse CSV, TSV, pipe-delimited, and custom delimiter files
|
16
|
+
- **Multi-format DSV Support**: Parse CSV, TSV, pipe-delimited, and custom delimiter separated value files/objects
|
17
|
+
- **Configurable Parsing**: Flexible options for delimiters, quote characters, escape characters, header/footer row(s) handling
|
17
18
|
- **Memory-Efficient Streaming**: Process large files without loading entire content into memory
|
18
19
|
- **Security & Validation**: Comprehensive path validation and file permission checks
|
19
20
|
- **Unicode Support**: Full Unicode character and encoding support
|
20
21
|
- **Type Safety**: Full type annotations with mypy validation
|
21
|
-
- **
|
22
|
+
- **Deterministic Newline Handling**: Consistent handling of CRLF, CR, and LF newlines across platforms
|
23
|
+
- **CLI Tool**: Command-line interface for quick parsing and inspection of DSV files
|
24
|
+
- **Robust Error Handling**: Clear and specific exceptions for various error scenarios
|
25
|
+
- **Modern API**: Object-oriented API with `Dsv` and `DsvConfig` classes for easy configuration and reuse
|
26
|
+
- **Comprehensive Documentation**: In-depth API reference and usage examples
|
27
|
+
- **Exhaustive Testing**: 283 tests with 93% code coverage including property-based testing, edge case testing, and cross-platform compatibility validation
|
28
|
+
|
29
|
+
**⚠️ CHANGES in v2025.3.1**
|
30
|
+
> - **skip_empty_lines** option added to `DsvConfig`, `DsvHelper`, and CLI.
|
31
|
+
> - This option allows users to skip logical empty lines when parsing DSV files.
|
32
|
+
|
33
|
+
**⚠️ CHANGES in v2025.3.0**
|
34
|
+
> - **Commit-Only Release**: v2025.3.0 is a commit-only release and will not be published to PyPI.
|
35
|
+
> - The legacy `parse_stream()` helpers were removed in release 2025.3.0.
|
36
|
+
> - Use `parse_file_stream()` on `Dsv`/`DsvHelper` for stream-based parsing of files. This standardizes the API naming and clarifies that streaming helpers accept file paths rather than arbitrary iterables.
|
37
|
+
> - TextFileHelper, SafeTextFileReader, SafeTextFileWriter, and PathValidator, as well as all their associated tests have been removed in this release.
|
38
|
+
> - Their functionality has moved to the `splurge-safe-io` package, which provides robust and secure file I/O operations.
|
39
|
+
> - This change reduces code duplication and improves maintainability by leveraging the functionality of `splurge-safe-io`.
|
40
|
+
> - Users should refer to the `splurge-safe-io` documentation for details on its usage and features.
|
41
|
+
> - **See API-REFERENCE.md for migration guidance and complete reference documentation, with usage examples.**
|
42
|
+
|
43
|
+
**⚠️ CHANGES in v2025.2.2**
|
44
|
+
> - **Deprecation Warning**: The following modules and their associated classes and functions are deprecated and will be removed in a future release (2025.3.0). Users are encouraged to transition to the `splurge-safe-io` package for these functionalities:
|
45
|
+
> - `splurge_dsv.safe_text_file_reader`
|
46
|
+
> - `splurge_dsv.safe_text_file_writer`
|
47
|
+
> - `splurge_dsv.path_validator`
|
48
|
+
> - `splurge_dsv.text_file_helper`
|
49
|
+
> - **New Exception**: Added `SplurgeDsvFileExistsError` to handle file existence errors.
|
50
|
+
> - **Fixed Exception Mapping**: Many errors were incorrectly mapped to SplurgeDsvEncodingError; this has been corrected to use appropriate exception types.
|
51
|
+
> - Some exceptions were not mapped to any SplurgeDsv* exception; these have also been corrected.
|
52
|
+
> - **3rd-Party Dependency Additions**: Added `splurge-safe-io (v2025.0.4)`.
|
53
|
+
> - `splurge-safe-io` is a new dependency that provides robust and secure file I/O operations, including safe text file reading and writing with deterministic newline handling and path validation.
|
54
|
+
> - This change reduces code duplication and improves maintainability by leveraging the functionality of `splurge-safe-io`.
|
55
|
+
> - Users should refer to the `splurge-safe-io` documentation for details on its usage and features.
|
56
|
+
> - **Code Refactoring**: Refactored `SafeTextFileReader`, `SafeTextFileWriter`, and `PathValidator` to utilize `splurge-safe-io` implementations internally, ensuring consistent behavior and reducing maintenance overhead.
|
57
|
+
> - **This release maintains backward compatibility** for existing users, but users are encouraged to transition to `splurge-safe-io` for future-proofing their codebases.
|
58
|
+
> - **_This release is a commit-only release and will not be published to PyPI._**
|
22
59
|
|
23
60
|
**⚠️ BREAKING CHANGES in v2025.2.0**
|
24
61
|
>
|
@@ -45,6 +82,41 @@ python -m splurge_dsv data.csv --delimiter ,
|
|
45
82
|
python -m splurge_dsv large_file.csv --delimiter , --stream --chunk-size 1000
|
46
83
|
```
|
47
84
|
|
85
|
+
### YAML configuration file
|
86
|
+
|
87
|
+
You can place CLI-equivalent options in a YAML file and pass it to the CLI
|
88
|
+
using `--config` (or `-c`). CLI arguments override values found in the
|
89
|
+
YAML file. Example `config.yaml`:
|
90
|
+
|
91
|
+
```yaml
|
92
|
+
delimiter: ","
|
93
|
+
strip: true
|
94
|
+
bookend: '"'
|
95
|
+
encoding: utf-8
|
96
|
+
skip_header_rows: 1
|
97
|
+
skip_footer_rows: 0
|
98
|
+
skip_empty_lines: false
|
99
|
+
detect_columns: true
|
100
|
+
chunk_size: 500
|
101
|
+
max_detect_chunks: 5
|
102
|
+
raise_on_missing_columns: false
|
103
|
+
raise_on_extra_columns: false
|
104
|
+
```
|
105
|
+
|
106
|
+
Usage with CLI:
|
107
|
+
|
108
|
+
```bash
|
109
|
+
python -m splurge_dsv data.csv --config config.yaml --delimiter "|"
|
110
|
+
# The CLI delimiter '|' overrides the YAML delimiter
|
111
|
+
```
|
112
|
+
|
113
|
+
Example using the shipped example config in the repository:
|
114
|
+
|
115
|
+
```bash
|
116
|
+
# Use the example file provided at examples/config.yaml
|
117
|
+
python -m splurge_dsv data.csv --config examples/config.yaml
|
118
|
+
```
|
119
|
+
|
48
120
|
### API Usage
|
49
121
|
|
50
122
|
```python
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "splurge-dsv"
|
7
|
-
version = "2025.2.1"
|
7
|
+
version = "2025.3.1"
|
8
8
|
description = "A utility library for working with DSV (Delimited String Values) files"
|
9
9
|
readme = "README.md"
|
10
10
|
requires-python = ">=3.10"
|
@@ -25,13 +25,15 @@ classifiers = [
|
|
25
25
|
"Topic :: Text Processing :: Filters",
|
26
26
|
]
|
27
27
|
|
28
|
-
dependencies = [
|
28
|
+
dependencies = [
|
29
|
+
"splurge-safe-io>=2025.0.5",
|
30
|
+
"PyYAML>=6.0",
|
31
|
+
]
|
29
32
|
|
30
33
|
[project.optional-dependencies]
|
31
34
|
dev = [
|
32
35
|
"pytest>=7.0.0",
|
33
36
|
"pytest-cov>=4.0.0",
|
34
|
-
"pytest-xdist>=3.0.0",
|
35
37
|
"mypy>=1.0.0",
|
36
38
|
"ruff>=0.0.241",
|
37
39
|
"pytest-mock>=3.0.0",
|
@@ -53,7 +55,7 @@ include = ["splurge_dsv*"]
|
|
53
55
|
|
54
56
|
[tool.pytest.ini_options]
|
55
57
|
minversion = "7.0"
|
56
|
-
addopts = "-x -v
|
58
|
+
addopts = "-x -v"
|
57
59
|
testpaths = ["tests"]
|
58
60
|
python_files = ["test_*.py"]
|
59
61
|
python_classes = ["Test*"]
|
@@ -128,3 +130,5 @@ line-ending = "auto"
|
|
128
130
|
files = ["splurge_dsv"]
|
129
131
|
# show_error_codes helps with diagnostics
|
130
132
|
show_error_codes = true
|
133
|
+
# Allow missing imports for vendored/external helper package used at runtime
|
134
|
+
ignore_missing_imports = true
|
@@ -13,9 +13,20 @@ Copyright (c) 2025 Jim Schilling
|
|
13
13
|
# test cases may remove the process working directory which causes calls to
|
14
14
|
# os.getcwd() to raise FileNotFoundError later during test execution. Guard
|
15
15
|
# against that here by switching to this package directory when cwd is missing.
|
16
|
+
# Ensure the required external implementation is available on import so the
|
17
|
+
# rest of the package can rely on its APIs. Fail fast with a helpful message
|
18
|
+
# instructing the user to install the package if it's missing.
|
19
|
+
import importlib as _importlib
|
16
20
|
import os
|
17
21
|
from pathlib import Path as _Path
|
18
22
|
|
23
|
+
try: # pragma: no cover - import-time guard
|
24
|
+
_importlib.import_module("splurge_safe_io")
|
25
|
+
except Exception as e:
|
26
|
+
raise ImportError(
|
27
|
+
"Missing required dependency 'splurge-safe-io'. Please install it: `pip install splurge-safe-io`"
|
28
|
+
) from e
|
29
|
+
|
19
30
|
try:
|
20
31
|
try:
|
21
32
|
# os.getcwd() can raise FileNotFoundError in CI/runner environments
|
@@ -35,11 +46,13 @@ except Exception:
|
|
35
46
|
from splurge_dsv.dsv import Dsv, DsvConfig
|
36
47
|
from splurge_dsv.dsv_helper import DsvHelper
|
37
48
|
from splurge_dsv.exceptions import (
|
49
|
+
SplurgeDsvColumnMismatchError,
|
38
50
|
SplurgeDsvConfigurationError,
|
39
51
|
SplurgeDsvDataProcessingError,
|
40
52
|
# canonical SplurgeDsv* exception names
|
41
53
|
SplurgeDsvError,
|
42
54
|
SplurgeDsvFileEncodingError,
|
55
|
+
SplurgeDsvFileExistsError,
|
43
56
|
SplurgeDsvFileNotFoundError,
|
44
57
|
SplurgeDsvFileOperationError,
|
45
58
|
SplurgeDsvFilePermissionError,
|
@@ -56,11 +69,9 @@ from splurge_dsv.exceptions import (
|
|
56
69
|
SplurgeDsvTypeConversionError,
|
57
70
|
SplurgeDsvValidationError,
|
58
71
|
)
|
59
|
-
from splurge_dsv.path_validator import PathValidator
|
60
72
|
from splurge_dsv.string_tokenizer import StringTokenizer
|
61
|
-
from splurge_dsv.text_file_helper import TextFileHelper
|
62
73
|
|
63
|
-
__version__ = "2025.2.1"
|
74
|
+
__version__ = "2025.3.1"
|
64
75
|
__author__ = "Jim Schilling"
|
65
76
|
__license__ = "MIT"
|
66
77
|
|
@@ -79,6 +90,7 @@ __all__ = [
|
|
79
90
|
"SplurgeDsvPathValidationError",
|
80
91
|
"SplurgeDsvDataProcessingError",
|
81
92
|
"SplurgeDsvParsingError",
|
93
|
+
"SplurgeDsvColumnMismatchError",
|
82
94
|
"SplurgeDsvTypeConversionError",
|
83
95
|
"SplurgeDsvStreamingError",
|
84
96
|
"SplurgeDsvConfigurationError",
|
@@ -89,8 +101,7 @@ __all__ = [
|
|
89
101
|
"SplurgeDsvParameterError",
|
90
102
|
"SplurgeDsvRangeError",
|
91
103
|
"SplurgeDsvFormatError",
|
104
|
+
"SplurgeDsvFileExistsError",
|
92
105
|
# Utility classes
|
93
106
|
"StringTokenizer",
|
94
|
-
"TextFileHelper",
|
95
|
-
"PathValidator",
|
96
107
|
]
|
@@ -23,6 +23,7 @@ from pathlib import Path
|
|
23
23
|
# Local imports
|
24
24
|
from splurge_dsv import __version__
|
25
25
|
from splurge_dsv.dsv import Dsv, DsvConfig
|
26
|
+
from splurge_dsv.dsv_helper import DsvHelper
|
26
27
|
from splurge_dsv.exceptions import SplurgeDsvError
|
27
28
|
|
28
29
|
|
@@ -39,14 +40,31 @@ def parse_arguments() -> argparse.Namespace:
|
|
39
40
|
epilog="""
|
40
41
|
Examples:
|
41
42
|
python -m splurge_dsv data.csv --delimiter ,
|
42
|
-
python -m splurge_dsv data.tsv --delimiter "
|
43
|
+
python -m splurge_dsv data.tsv --delimiter "\t"
|
43
44
|
python -m splurge_dsv data.txt --delimiter "|" --bookend '"'
|
44
|
-
|
45
|
+
# Auto-detect the expected column count and normalize rows
|
46
|
+
python -m splurge_dsv data.csv --delimiter , --detect-columns --max-detect-chunks 5
|
47
|
+
# Stream a large file while attempting to detect the column count from the first non-blank logical row
|
48
|
+
python -m splurge_dsv large.csv --delimiter , --stream --detect-columns --max-detect-chunks 10
|
49
|
+
""",
|
45
50
|
)
|
46
51
|
|
47
52
|
parser.add_argument("file_path", type=str, help="Path to the DSV file to parse")
|
48
53
|
|
49
|
-
parser.add_argument(
|
54
|
+
parser.add_argument(
|
55
|
+
"--config",
|
56
|
+
"-c",
|
57
|
+
dest="config",
|
58
|
+
type=str,
|
59
|
+
help="Path to a YAML config file that mirrors CLI options (values overridden by CLI args)",
|
60
|
+
)
|
61
|
+
|
62
|
+
parser.add_argument(
|
63
|
+
"--delimiter",
|
64
|
+
"-d",
|
65
|
+
type=str,
|
66
|
+
help="Delimiter character to use for parsing (may also be provided via --config)",
|
67
|
+
)
|
50
68
|
|
51
69
|
parser.add_argument("--bookend", "-b", type=str, help="Bookend character for text fields (e.g., '\"')")
|
52
70
|
|
@@ -64,7 +82,53 @@ Examples:
|
|
64
82
|
"--stream", "-s", action="store_true", help="Stream the file in chunks instead of loading entirely into memory"
|
65
83
|
)
|
66
84
|
|
67
|
-
parser.add_argument(
|
85
|
+
parser.add_argument(
|
86
|
+
"--detect-columns",
|
87
|
+
action="store_true",
|
88
|
+
help=(
|
89
|
+
"Auto-detect the expected column count from the first non-blank logical row "
|
90
|
+
"and normalize subsequent rows to that count. For streamed parsing, the "
|
91
|
+
"detector may scan up to --max-detect-chunks chunks from the start of the file."
|
92
|
+
),
|
93
|
+
)
|
94
|
+
|
95
|
+
parser.add_argument(
|
96
|
+
"--raise-on-missing-columns",
|
97
|
+
action="store_true",
|
98
|
+
help="Raise an error if a row has fewer columns than the detected/expected count",
|
99
|
+
)
|
100
|
+
|
101
|
+
parser.add_argument(
|
102
|
+
"--raise-on-extra-columns",
|
103
|
+
action="store_true",
|
104
|
+
help="Raise an error if a row has more columns than the detected/expected count",
|
105
|
+
)
|
106
|
+
|
107
|
+
parser.add_argument(
|
108
|
+
"--chunk-size",
|
109
|
+
type=int,
|
110
|
+
default=DsvHelper.DEFAULT_CHUNK_SIZE,
|
111
|
+
help=(
|
112
|
+
f"Chunk size for streaming (minimum: {DsvHelper.DEFAULT_MIN_CHUNK_SIZE}, "
|
113
|
+
f"default: {DsvHelper.DEFAULT_CHUNK_SIZE})"
|
114
|
+
),
|
115
|
+
)
|
116
|
+
|
117
|
+
parser.add_argument(
|
118
|
+
"--max-detect-chunks",
|
119
|
+
type=int,
|
120
|
+
default=DsvHelper.MAX_DETECT_CHUNKS,
|
121
|
+
help=(
|
122
|
+
"When detecting columns while streaming (use --detect-normalize-columns), "
|
123
|
+
f"scan up to N chunks from the start of the stream before giving up (default: {DsvHelper.MAX_DETECT_CHUNKS})."
|
124
|
+
),
|
125
|
+
)
|
126
|
+
|
127
|
+
parser.add_argument(
|
128
|
+
"--skip-empty-lines",
|
129
|
+
action="store_true",
|
130
|
+
help="Have the underlying reader skip raw empty logical lines (line.strip() == '') before parsing",
|
131
|
+
)
|
68
132
|
|
69
133
|
parser.add_argument(
|
70
134
|
"--output-format",
|
@@ -141,17 +205,56 @@ def run_cli() -> int:
|
|
141
205
|
print(f"Error: '{args.file_path}' is not a file.", file=sys.stderr)
|
142
206
|
return 1
|
143
207
|
|
208
|
+
# Build base config either from YAML file (if provided) or from CLI args
|
209
|
+
base_params = {}
|
210
|
+
if args.config:
|
211
|
+
try:
|
212
|
+
import yaml # type: ignore
|
213
|
+
|
214
|
+
cfg_path = Path(args.config)
|
215
|
+
if not cfg_path.exists():
|
216
|
+
print(f"Error: Config file '{args.config}' not found.", file=sys.stderr)
|
217
|
+
return 1
|
218
|
+
|
219
|
+
with cfg_path.open("r", encoding="utf-8") as fh:
|
220
|
+
file_cfg = yaml.safe_load(fh) or {}
|
221
|
+
|
222
|
+
if not isinstance(file_cfg, dict):
|
223
|
+
print(f"Error: Config file '{args.config}' must contain a mapping/dictionary.", file=sys.stderr)
|
224
|
+
return 1
|
225
|
+
|
226
|
+
base_params.update(file_cfg)
|
227
|
+
except Exception as e:
|
228
|
+
print(f"Error reading config file '{args.config}': {e}", file=sys.stderr)
|
229
|
+
return 1
|
230
|
+
|
231
|
+
# CLI args override YAML values when provided. Build the parameter map
|
232
|
+
cli_params = {
|
233
|
+
"delimiter": args.delimiter,
|
234
|
+
"strip": not args.no_strip,
|
235
|
+
"bookend": args.bookend,
|
236
|
+
"bookend_strip": not args.no_bookend_strip,
|
237
|
+
"encoding": args.encoding,
|
238
|
+
"skip_header_rows": args.skip_header,
|
239
|
+
"skip_footer_rows": args.skip_footer,
|
240
|
+
"chunk_size": args.chunk_size,
|
241
|
+
"detect_columns": args.detect_columns,
|
242
|
+
"raise_on_missing_columns": args.raise_on_missing_columns,
|
243
|
+
"raise_on_extra_columns": args.raise_on_extra_columns,
|
244
|
+
"max_detect_chunks": args.max_detect_chunks,
|
245
|
+
"skip_empty_lines": args.skip_empty_lines,
|
246
|
+
}
|
247
|
+
|
248
|
+
# Merge: start from file (if any), then overlay CLI-provided values
|
249
|
+
merged = {**base_params, **{k: v for k, v in cli_params.items() if v is not None}}
|
250
|
+
|
144
251
|
# Create configuration and Dsv instance for parsing
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
skip_header_rows=args.skip_header,
|
152
|
-
skip_footer_rows=args.skip_footer,
|
153
|
-
chunk_size=args.chunk_size,
|
154
|
-
)
|
252
|
+
try:
|
253
|
+
config = DsvConfig.from_params(**merged)
|
254
|
+
except Exception as e:
|
255
|
+
print(f"Error building configuration: {e}", file=sys.stderr)
|
256
|
+
return 1
|
257
|
+
dsv = Dsv(config)
|
155
258
|
dsv = Dsv(config)
|
156
259
|
|
157
260
|
# Parse the file
|
@@ -161,18 +264,26 @@ def run_cli() -> int:
|
|
161
264
|
chunk_count = 0
|
162
265
|
total_rows = 0
|
163
266
|
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
267
|
+
try:
|
268
|
+
for chunk in dsv.parse_file_stream(file_path):
|
269
|
+
chunk_count += 1
|
270
|
+
total_rows += len(chunk)
|
271
|
+
|
272
|
+
if args.output_format == "json":
|
273
|
+
print(json.dumps(chunk, ensure_ascii=False))
|
274
|
+
elif args.output_format == "ndjson":
|
275
|
+
for row in chunk:
|
276
|
+
print(json.dumps(row, ensure_ascii=False))
|
277
|
+
else:
|
278
|
+
print(f"Chunk {chunk_count}: {len(chunk)} rows")
|
279
|
+
print_results(chunk, args.delimiter)
|
280
|
+
print()
|
281
|
+
except Exception as e:
|
282
|
+
print(f"Error during streaming: {e}", file=sys.stderr)
|
283
|
+
import traceback
|
284
|
+
|
285
|
+
traceback.print_exc(file=sys.stderr)
|
286
|
+
return 1
|
176
287
|
|
177
288
|
if args.output_format not in ["json", "ndjson"]:
|
178
289
|
print(f"Total: {total_rows} rows in {chunk_count} chunks")
|