splurge-dsv 2025.2.0__tar.gz → 2025.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23) hide show
  1. {splurge_dsv-2025.2.0/splurge_dsv.egg-info → splurge_dsv-2025.3.1}/PKG-INFO +78 -5
  2. {splurge_dsv-2025.2.0 → splurge_dsv-2025.3.1}/README.md +75 -3
  3. {splurge_dsv-2025.2.0 → splurge_dsv-2025.3.1}/pyproject.toml +8 -4
  4. {splurge_dsv-2025.2.0 → splurge_dsv-2025.3.1}/splurge_dsv/__init__.py +16 -5
  5. {splurge_dsv-2025.2.0 → splurge_dsv-2025.3.1}/splurge_dsv/cli.py +137 -26
  6. {splurge_dsv-2025.2.0 → splurge_dsv-2025.3.1}/splurge_dsv/dsv.py +101 -7
  7. splurge_dsv-2025.3.1/splurge_dsv/dsv_helper.py +631 -0
  8. {splurge_dsv-2025.2.0 → splurge_dsv-2025.3.1}/splurge_dsv/exceptions.py +22 -1
  9. {splurge_dsv-2025.2.0 → splurge_dsv-2025.3.1}/splurge_dsv/string_tokenizer.py +7 -1
  10. {splurge_dsv-2025.2.0 → splurge_dsv-2025.3.1/splurge_dsv.egg-info}/PKG-INFO +78 -5
  11. {splurge_dsv-2025.2.0 → splurge_dsv-2025.3.1}/splurge_dsv.egg-info/SOURCES.txt +0 -4
  12. {splurge_dsv-2025.2.0 → splurge_dsv-2025.3.1}/splurge_dsv.egg-info/requires.txt +2 -1
  13. splurge_dsv-2025.2.0/splurge_dsv/dsv_helper.py +0 -257
  14. splurge_dsv-2025.2.0/splurge_dsv/path_validator.py +0 -298
  15. splurge_dsv-2025.2.0/splurge_dsv/safe_text_file_reader.py +0 -177
  16. splurge_dsv-2025.2.0/splurge_dsv/safe_text_file_writer.py +0 -136
  17. splurge_dsv-2025.2.0/splurge_dsv/text_file_helper.py +0 -240
  18. {splurge_dsv-2025.2.0 → splurge_dsv-2025.3.1}/LICENSE +0 -0
  19. {splurge_dsv-2025.2.0 → splurge_dsv-2025.3.1}/setup.cfg +0 -0
  20. {splurge_dsv-2025.2.0 → splurge_dsv-2025.3.1}/splurge_dsv/__main__.py +0 -0
  21. {splurge_dsv-2025.2.0 → splurge_dsv-2025.3.1}/splurge_dsv.egg-info/dependency_links.txt +0 -0
  22. {splurge_dsv-2025.2.0 → splurge_dsv-2025.3.1}/splurge_dsv.egg-info/entry_points.txt +0 -0
  23. {splurge_dsv-2025.2.0 → splurge_dsv-2025.3.1}/splurge_dsv.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: splurge-dsv
3
- Version: 2025.2.0
3
+ Version: 2025.3.1
4
4
  Summary: A utility library for working with DSV (Delimited String Values) files
5
5
  Author: Jim Schilling
6
6
  License-Expression: MIT
@@ -21,10 +21,11 @@ Classifier: Topic :: Text Processing :: Filters
21
21
  Requires-Python: >=3.10
22
22
  Description-Content-Type: text/markdown
23
23
  License-File: LICENSE
24
+ Requires-Dist: splurge-safe-io>=2025.0.5
25
+ Requires-Dist: PyYAML>=6.0
24
26
  Provides-Extra: dev
25
27
  Requires-Dist: pytest>=7.0.0; extra == "dev"
26
28
  Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
27
- Requires-Dist: pytest-xdist>=3.0.0; extra == "dev"
28
29
  Requires-Dist: mypy>=1.0.0; extra == "dev"
29
30
  Requires-Dist: ruff>=0.0.241; extra == "dev"
30
31
  Requires-Dist: pytest-mock>=3.0.0; extra == "dev"
@@ -38,7 +39,7 @@ Dynamic: license-file
38
39
  [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
39
40
 
40
41
  [![CI](https://github.com/jim-schilling/splurge-dsv/actions/workflows/ci-quick-test.yml/badge.svg)](https://github.com/jim-schilling/splurge-dsv/actions/workflows/ci-quick-test.yml)
41
- [![Coverage](https://img.shields.io/badge/coverage-94%25-brightgreen.svg)](https://github.com/jim-schilling/splurge-dsv)
42
+ [![Coverage](https://img.shields.io/badge/coverage-93%25-brightgreen.svg)](https://github.com/jim-schilling/splurge-dsv)
42
43
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
43
44
  [![mypy](https://img.shields.io/badge/mypy-checked-black)](https://mypy-lang.org/)
44
45
 
@@ -46,12 +47,49 @@ A robust Python library for parsing and processing delimited-separated value (DS
46
47
 
47
48
  ## Features
48
49
 
49
- - **Multi-format DSV Support**: Parse CSV, TSV, pipe-delimited, and custom delimiter files
50
+ - **Multi-format DSV Support**: Parse CSV, TSV, pipe-delimited, and custom-delimiter-separated value files/objects
51
+ - **Configurable Parsing**: Flexible options for delimiters, quote characters, escape characters, header/footer row(s) handling
50
52
  - **Memory-Efficient Streaming**: Process large files without loading entire content into memory
51
53
  - **Security & Validation**: Comprehensive path validation and file permission checks
52
54
  - **Unicode Support**: Full Unicode character and encoding support
53
55
  - **Type Safety**: Full type annotations with mypy validation
54
- - **Comprehensive Testing**: 420 tests (409 passed, 11 skipped) with 94% code coverage including property-based testing, edge case testing, and cross-platform compatibility
56
+ - **Deterministic Newline Handling**: Consistent handling of CRLF, CR, and LF newlines across platforms
57
+ - **CLI Tool**: Command-line interface for quick parsing and inspection of DSV files
58
+ - **Robust Error Handling**: Clear and specific exceptions for various error scenarios
59
+ - **Modern API**: Object-oriented API with `Dsv` and `DsvConfig` classes for easy configuration and reuse
60
+ - **Comprehensive Documentation**: In-depth API reference and usage examples
61
+ - **Exhaustive Testing**: 283 tests with 93% code coverage including property-based testing, edge case testing, and cross-platform compatibility validation
62
+
63
+ **⚠️ CHANGES in v2025.3.1**
64
+ > - **skip_empty_lines** option added to `DsvConfig`, `DsvHelper`, and CLI.
65
+ > - This option allows users to skip logical empty lines when parsing DSV files.
66
+
67
+ **⚠️ CHANGES in v2025.3.0**
68
+ > - **Commit-Only Release**: v2025.3.0 is a commit-only release and will not be published to PyPI.
69
+ > - The legacy `parse_stream()` helpers were removed in release 2025.3.0.
70
+ > - Use `parse_file_stream()` on `Dsv`/`DsvHelper` for stream-based parsing of files. This standardizes the API naming and clarifies that streaming helpers accept file paths rather than arbitrary iterables.
71
+ > - `TextFileHelper`, `SafeTextFileReader`, `SafeTextFileWriter`, and `PathValidator`, as well as all their associated tests, have been removed in this release.
72
+ > - Their functionality has been moved to the `splurge-safe-io` package, which provides robust and secure file I/O operations.
73
+ > - This change reduces code duplication and improves maintainability by leveraging the functionality of `splurge-safe-io`.
74
+ > - Users should refer to the `splurge-safe-io` documentation for details on its usage and features.
75
+ > - **See API-REFERENCE.md for migration guidance and complete reference documentation, with usage examples.**
76
+
77
+ **⚠️ CHANGES in v2025.2.2**
78
+ > - **Deprecation Warning**: The following modules and their associated classes and functions are deprecated and will be removed in a future release (2025.3.0). Users are encouraged to transition to the `splurge-safe-io` package for this functionality:
79
+ > - `splurge_dsv.safe_text_file_reader`
80
+ > - `splurge_dsv.safe_text_file_writer`
81
+ > - `splurge_dsv.path_validator`
82
+ > - `splurge_dsv.text_file_helper`
83
+ > - **New Exception**: Added `SplurgeDsvFileExistsError` to handle file existence errors.
84
+ > - **Fixed Exception Mapping**: Many errors were incorrectly mapped to SplurgeDsvEncodingError; this has been corrected to use appropriate exception types.
85
+ > - Some exceptions were not mapped to any SplurgeDsv* exception; these have also been corrected.
86
+ > - **3rd-Party Dependency Additions**: Added `splurge-safe-io (v2025.0.4)`.
87
+ > - `splurge-safe-io` is a new dependency that provides robust and secure file I/O operations, including safe text file reading and writing with deterministic newline handling and path validation.
88
+ > - This change reduces code duplication and improves maintainability by leveraging the functionality of `splurge-safe-io`.
89
+ > - Users should refer to the `splurge-safe-io` documentation for details on its usage and features.
90
+ > - **Code Refactoring**: Refactored `SafeTextFileReader`, `SafeTextFileWriter`, and `PathValidator` to utilize `splurge-safe-io` implementations internally, ensuring consistent behavior and reducing maintenance overhead.
91
+ > - **This release maintains backward compatibility** for existing users, but users are encouraged to transition to `splurge-safe-io` for future-proofing their codebases.
92
+ > - **_This release is a commit-only release and will not be published to PyPI._**
55
93
 
56
94
  **⚠️ BREAKING CHANGES in v2025.2.0**
57
95
  >
@@ -78,6 +116,41 @@ python -m splurge_dsv data.csv --delimiter ,
78
116
  python -m splurge_dsv large_file.csv --delimiter , --stream --chunk-size 1000
79
117
  ```
80
118
 
119
+ ### YAML configuration file
120
+
121
+ You can place CLI-equivalent options in a YAML file and pass it to the CLI
122
+ using `--config` (or `-c`). CLI arguments override values found in the
123
+ YAML file. Example `config.yaml`:
124
+
125
+ ```yaml
126
+ delimiter: ","
127
+ strip: true
128
+ bookend: '"'
129
+ encoding: utf-8
130
+ skip_header_rows: 1
131
+ skip_footer_rows: 0
132
+ skip_empty_lines: false
133
+ detect_columns: true
134
+ chunk_size: 500
135
+ max_detect_chunks: 5
136
+ raise_on_missing_columns: false
137
+ raise_on_extra_columns: false
138
+ ```
139
+
140
+ Usage with CLI:
141
+
142
+ ```bash
143
+ python -m splurge_dsv data.csv --config config.yaml --delimiter "|"
144
+ # The CLI delimiter '|' overrides the YAML delimiter
145
+ ```
146
+
147
+ Example using the shipped example config in the repository:
148
+
149
+ ```bash
150
+ # Use the example file provided at examples/config.yaml
151
+ python -m splurge_dsv data.csv --config examples/config.yaml
152
+ ```
153
+
81
154
  ### API Usage
82
155
 
83
156
  ```python
@@ -5,7 +5,7 @@
5
5
  [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
6
6
 
7
7
  [![CI](https://github.com/jim-schilling/splurge-dsv/actions/workflows/ci-quick-test.yml/badge.svg)](https://github.com/jim-schilling/splurge-dsv/actions/workflows/ci-quick-test.yml)
8
- [![Coverage](https://img.shields.io/badge/coverage-94%25-brightgreen.svg)](https://github.com/jim-schilling/splurge-dsv)
8
+ [![Coverage](https://img.shields.io/badge/coverage-93%25-brightgreen.svg)](https://github.com/jim-schilling/splurge-dsv)
9
9
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
10
10
  [![mypy](https://img.shields.io/badge/mypy-checked-black)](https://mypy-lang.org/)
11
11
 
@@ -13,12 +13,49 @@ A robust Python library for parsing and processing delimited-separated value (DS
13
13
 
14
14
  ## Features
15
15
 
16
- - **Multi-format DSV Support**: Parse CSV, TSV, pipe-delimited, and custom delimiter files
16
+ - **Multi-format DSV Support**: Parse CSV, TSV, pipe-delimited, and custom-delimiter-separated value files/objects
17
+ - **Configurable Parsing**: Flexible options for delimiters, quote characters, escape characters, header/footer row(s) handling
17
18
  - **Memory-Efficient Streaming**: Process large files without loading entire content into memory
18
19
  - **Security & Validation**: Comprehensive path validation and file permission checks
19
20
  - **Unicode Support**: Full Unicode character and encoding support
20
21
  - **Type Safety**: Full type annotations with mypy validation
21
- - **Comprehensive Testing**: 420 tests (409 passed, 11 skipped) with 94% code coverage including property-based testing, edge case testing, and cross-platform compatibility
22
+ - **Deterministic Newline Handling**: Consistent handling of CRLF, CR, and LF newlines across platforms
23
+ - **CLI Tool**: Command-line interface for quick parsing and inspection of DSV files
24
+ - **Robust Error Handling**: Clear and specific exceptions for various error scenarios
25
+ - **Modern API**: Object-oriented API with `Dsv` and `DsvConfig` classes for easy configuration and reuse
26
+ - **Comprehensive Documentation**: In-depth API reference and usage examples
27
+ - **Exhaustive Testing**: 283 tests with 93% code coverage including property-based testing, edge case testing, and cross-platform compatibility validation
28
+
29
+ **⚠️ CHANGES in v2025.3.1**
30
+ > - **skip_empty_lines** option added to `DsvConfig`, `DsvHelper`, and CLI.
31
+ > - This option allows users to skip logical empty lines when parsing DSV files.
32
+
33
+ **⚠️ CHANGES in v2025.3.0**
34
+ > - **Commit-Only Release**: v2025.3.0 is a commit-only release and will not be published to PyPI.
35
+ > - The legacy `parse_stream()` helpers were removed in release 2025.3.0.
36
+ > - Use `parse_file_stream()` on `Dsv`/`DsvHelper` for stream-based parsing of files. This standardizes the API naming and clarifies that streaming helpers accept file paths rather than arbitrary iterables.
37
+ > - `TextFileHelper`, `SafeTextFileReader`, `SafeTextFileWriter`, and `PathValidator`, as well as all their associated tests, have been removed in this release.
38
+ > - Their functionality has been moved to the `splurge-safe-io` package, which provides robust and secure file I/O operations.
39
+ > - This change reduces code duplication and improves maintainability by leveraging the functionality of `splurge-safe-io`.
40
+ > - Users should refer to the `splurge-safe-io` documentation for details on its usage and features.
41
+ > - **See API-REFERENCE.md for migration guidance and complete reference documentation, with usage examples.**
42
+
43
+ **⚠️ CHANGES in v2025.2.2**
44
+ > - **Deprecation Warning**: The following modules and their associated classes and functions are deprecated and will be removed in a future release (2025.3.0). Users are encouraged to transition to the `splurge-safe-io` package for this functionality:
45
+ > - `splurge_dsv.safe_text_file_reader`
46
+ > - `splurge_dsv.safe_text_file_writer`
47
+ > - `splurge_dsv.path_validator`
48
+ > - `splurge_dsv.text_file_helper`
49
+ > - **New Exception**: Added `SplurgeDsvFileExistsError` to handle file existence errors.
50
+ > - **Fixed Exception Mapping**: Many errors were incorrectly mapped to SplurgeDsvEncodingError; this has been corrected to use appropriate exception types.
51
+ > - Some exceptions were not mapped to any SplurgeDsv* exception; these have also been corrected.
52
+ > - **3rd-Party Dependency Additions**: Added `splurge-safe-io (v2025.0.4)`.
53
+ > - `splurge-safe-io` is a new dependency that provides robust and secure file I/O operations, including safe text file reading and writing with deterministic newline handling and path validation.
54
+ > - This change reduces code duplication and improves maintainability by leveraging the functionality of `splurge-safe-io`.
55
+ > - Users should refer to the `splurge-safe-io` documentation for details on its usage and features.
56
+ > - **Code Refactoring**: Refactored `SafeTextFileReader`, `SafeTextFileWriter`, and `PathValidator` to utilize `splurge-safe-io` implementations internally, ensuring consistent behavior and reducing maintenance overhead.
57
+ > - **This release maintains backward compatibility** for existing users, but users are encouraged to transition to `splurge-safe-io` for future-proofing their codebases.
58
+ > - **_This release is a commit-only release and will not be published to PyPI._**
22
59
 
23
60
  **⚠️ BREAKING CHANGES in v2025.2.0**
24
61
  >
@@ -45,6 +82,41 @@ python -m splurge_dsv data.csv --delimiter ,
45
82
  python -m splurge_dsv large_file.csv --delimiter , --stream --chunk-size 1000
46
83
  ```
47
84
 
85
+ ### YAML configuration file
86
+
87
+ You can place CLI-equivalent options in a YAML file and pass it to the CLI
88
+ using `--config` (or `-c`). CLI arguments override values found in the
89
+ YAML file. Example `config.yaml`:
90
+
91
+ ```yaml
92
+ delimiter: ","
93
+ strip: true
94
+ bookend: '"'
95
+ encoding: utf-8
96
+ skip_header_rows: 1
97
+ skip_footer_rows: 0
98
+ skip_empty_lines: false
99
+ detect_columns: true
100
+ chunk_size: 500
101
+ max_detect_chunks: 5
102
+ raise_on_missing_columns: false
103
+ raise_on_extra_columns: false
104
+ ```
105
+
106
+ Usage with CLI:
107
+
108
+ ```bash
109
+ python -m splurge_dsv data.csv --config config.yaml --delimiter "|"
110
+ # The CLI delimiter '|' overrides the YAML delimiter
111
+ ```
112
+
113
+ Example using the shipped example config in the repository:
114
+
115
+ ```bash
116
+ # Use the example file provided at examples/config.yaml
117
+ python -m splurge_dsv data.csv --config examples/config.yaml
118
+ ```
119
+
48
120
  ### API Usage
49
121
 
50
122
  ```python
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "splurge-dsv"
7
- version = "2025.2.0"
7
+ version = "2025.3.1"
8
8
  description = "A utility library for working with DSV (Delimited String Values) files"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -25,13 +25,15 @@ classifiers = [
25
25
  "Topic :: Text Processing :: Filters",
26
26
  ]
27
27
 
28
- dependencies = []
28
+ dependencies = [
29
+ "splurge-safe-io>=2025.0.5",
30
+ "PyYAML>=6.0",
31
+ ]
29
32
 
30
33
  [project.optional-dependencies]
31
34
  dev = [
32
35
  "pytest>=7.0.0",
33
36
  "pytest-cov>=4.0.0",
34
- "pytest-xdist>=3.0.0",
35
37
  "mypy>=1.0.0",
36
38
  "ruff>=0.0.241",
37
39
  "pytest-mock>=3.0.0",
@@ -53,7 +55,7 @@ include = ["splurge_dsv*"]
53
55
 
54
56
  [tool.pytest.ini_options]
55
57
  minversion = "7.0"
56
- addopts = "-x -v -n 4"
58
+ addopts = "-x -v"
57
59
  testpaths = ["tests"]
58
60
  python_files = ["test_*.py"]
59
61
  python_classes = ["Test*"]
@@ -128,3 +130,5 @@ line-ending = "auto"
128
130
  files = ["splurge_dsv"]
129
131
  # show_error_codes helps with diagnostics
130
132
  show_error_codes = true
133
+ # Allow missing imports for vendored/external helper package used at runtime
134
+ ignore_missing_imports = true
@@ -13,9 +13,20 @@ Copyright (c) 2025 Jim Schilling
13
13
  # test cases may remove the process working directory which causes calls to
14
14
  # os.getcwd() to raise FileNotFoundError later during test execution. Guard
15
15
  # against that here by switching to this package directory when cwd is missing.
16
+ # Ensure the required external implementation is available on import so the
17
+ # rest of the package can rely on its APIs. Fail fast with a helpful message
18
+ # instructing the user to install the package if it's missing.
19
+ import importlib as _importlib
16
20
  import os
17
21
  from pathlib import Path as _Path
18
22
 
23
+ try: # pragma: no cover - import-time guard
24
+ _importlib.import_module("splurge_safe_io")
25
+ except Exception as e:
26
+ raise ImportError(
27
+ "Missing required dependency 'splurge-safe-io'. Please install it: `pip install splurge-safe-io`"
28
+ ) from e
29
+
19
30
  try:
20
31
  try:
21
32
  # os.getcwd() can raise FileNotFoundError in CI/runner environments
@@ -35,11 +46,13 @@ except Exception:
35
46
  from splurge_dsv.dsv import Dsv, DsvConfig
36
47
  from splurge_dsv.dsv_helper import DsvHelper
37
48
  from splurge_dsv.exceptions import (
49
+ SplurgeDsvColumnMismatchError,
38
50
  SplurgeDsvConfigurationError,
39
51
  SplurgeDsvDataProcessingError,
40
52
  # canonical SplurgeDsv* exception names
41
53
  SplurgeDsvError,
42
54
  SplurgeDsvFileEncodingError,
55
+ SplurgeDsvFileExistsError,
43
56
  SplurgeDsvFileNotFoundError,
44
57
  SplurgeDsvFileOperationError,
45
58
  SplurgeDsvFilePermissionError,
@@ -56,11 +69,9 @@ from splurge_dsv.exceptions import (
56
69
  SplurgeDsvTypeConversionError,
57
70
  SplurgeDsvValidationError,
58
71
  )
59
- from splurge_dsv.path_validator import PathValidator
60
72
  from splurge_dsv.string_tokenizer import StringTokenizer
61
- from splurge_dsv.text_file_helper import TextFileHelper
62
73
 
63
- __version__ = "2025.2.0"
74
+ __version__ = "2025.3.1"
64
75
  __author__ = "Jim Schilling"
65
76
  __license__ = "MIT"
66
77
 
@@ -79,6 +90,7 @@ __all__ = [
79
90
  "SplurgeDsvPathValidationError",
80
91
  "SplurgeDsvDataProcessingError",
81
92
  "SplurgeDsvParsingError",
93
+ "SplurgeDsvColumnMismatchError",
82
94
  "SplurgeDsvTypeConversionError",
83
95
  "SplurgeDsvStreamingError",
84
96
  "SplurgeDsvConfigurationError",
@@ -89,8 +101,7 @@ __all__ = [
89
101
  "SplurgeDsvParameterError",
90
102
  "SplurgeDsvRangeError",
91
103
  "SplurgeDsvFormatError",
104
+ "SplurgeDsvFileExistsError",
92
105
  # Utility classes
93
106
  "StringTokenizer",
94
- "TextFileHelper",
95
- "PathValidator",
96
107
  ]
@@ -23,6 +23,7 @@ from pathlib import Path
23
23
  # Local imports
24
24
  from splurge_dsv import __version__
25
25
  from splurge_dsv.dsv import Dsv, DsvConfig
26
+ from splurge_dsv.dsv_helper import DsvHelper
26
27
  from splurge_dsv.exceptions import SplurgeDsvError
27
28
 
28
29
 
@@ -39,14 +40,31 @@ def parse_arguments() -> argparse.Namespace:
39
40
  epilog="""
40
41
  Examples:
41
42
  python -m splurge_dsv data.csv --delimiter ,
42
- python -m splurge_dsv data.tsv --delimiter "\\t"
43
+ python -m splurge_dsv data.tsv --delimiter "\t"
43
44
  python -m splurge_dsv data.txt --delimiter "|" --bookend '"'
44
- """,
45
+ # Auto-detect the expected column count and normalize rows
46
+ python -m splurge_dsv data.csv --delimiter , --detect-columns --max-detect-chunks 5
47
+ # Stream a large file while attempting to detect the column count from the first non-blank logical row
48
+ python -m splurge_dsv large.csv --delimiter , --stream --detect-columns --max-detect-chunks 10
49
+ """,
45
50
  )
46
51
 
47
52
  parser.add_argument("file_path", type=str, help="Path to the DSV file to parse")
48
53
 
49
- parser.add_argument("--delimiter", "-d", type=str, required=True, help="Delimiter character to use for parsing")
54
+ parser.add_argument(
55
+ "--config",
56
+ "-c",
57
+ dest="config",
58
+ type=str,
59
+ help="Path to a YAML config file that mirrors CLI options (values overridden by CLI args)",
60
+ )
61
+
62
+ parser.add_argument(
63
+ "--delimiter",
64
+ "-d",
65
+ type=str,
66
+ help="Delimiter character to use for parsing (may also be provided via --config)",
67
+ )
50
68
 
51
69
  parser.add_argument("--bookend", "-b", type=str, help="Bookend character for text fields (e.g., '\"')")
52
70
 
@@ -64,7 +82,53 @@ Examples:
64
82
  "--stream", "-s", action="store_true", help="Stream the file in chunks instead of loading entirely into memory"
65
83
  )
66
84
 
67
- parser.add_argument("--chunk-size", type=int, default=500, help="Chunk size for streaming (default: 500)")
85
+ parser.add_argument(
86
+ "--detect-columns",
87
+ action="store_true",
88
+ help=(
89
+ "Auto-detect the expected column count from the first non-blank logical row "
90
+ "and normalize subsequent rows to that count. For streamed parsing, the "
91
+ "detector may scan up to --max-detect-chunks chunks from the start of the file."
92
+ ),
93
+ )
94
+
95
+ parser.add_argument(
96
+ "--raise-on-missing-columns",
97
+ action="store_true",
98
+ help="Raise an error if a row has fewer columns than the detected/expected count",
99
+ )
100
+
101
+ parser.add_argument(
102
+ "--raise-on-extra-columns",
103
+ action="store_true",
104
+ help="Raise an error if a row has more columns than the detected/expected count",
105
+ )
106
+
107
+ parser.add_argument(
108
+ "--chunk-size",
109
+ type=int,
110
+ default=DsvHelper.DEFAULT_CHUNK_SIZE,
111
+ help=(
112
+ f"Chunk size for streaming (minimum: {DsvHelper.DEFAULT_MIN_CHUNK_SIZE}, "
113
+ f"default: {DsvHelper.DEFAULT_CHUNK_SIZE})"
114
+ ),
115
+ )
116
+
117
+ parser.add_argument(
118
+ "--max-detect-chunks",
119
+ type=int,
120
+ default=DsvHelper.MAX_DETECT_CHUNKS,
121
+ help=(
122
+ "When detecting columns while streaming (use --detect-normalize-columns), "
123
+ f"scan up to N chunks from the start of the stream before giving up (default: {DsvHelper.MAX_DETECT_CHUNKS})."
124
+ ),
125
+ )
126
+
127
+ parser.add_argument(
128
+ "--skip-empty-lines",
129
+ action="store_true",
130
+ help="Have the underlying reader skip raw empty logical lines (line.strip() == '') before parsing",
131
+ )
68
132
 
69
133
  parser.add_argument(
70
134
  "--output-format",
@@ -141,17 +205,56 @@ def run_cli() -> int:
141
205
  print(f"Error: '{args.file_path}' is not a file.", file=sys.stderr)
142
206
  return 1
143
207
 
208
+ # Build base config either from YAML file (if provided) or from CLI args
209
+ base_params = {}
210
+ if args.config:
211
+ try:
212
+ import yaml # type: ignore
213
+
214
+ cfg_path = Path(args.config)
215
+ if not cfg_path.exists():
216
+ print(f"Error: Config file '{args.config}' not found.", file=sys.stderr)
217
+ return 1
218
+
219
+ with cfg_path.open("r", encoding="utf-8") as fh:
220
+ file_cfg = yaml.safe_load(fh) or {}
221
+
222
+ if not isinstance(file_cfg, dict):
223
+ print(f"Error: Config file '{args.config}' must contain a mapping/dictionary.", file=sys.stderr)
224
+ return 1
225
+
226
+ base_params.update(file_cfg)
227
+ except Exception as e:
228
+ print(f"Error reading config file '{args.config}': {e}", file=sys.stderr)
229
+ return 1
230
+
231
+ # CLI args override YAML values when provided. Build the parameter map
232
+ cli_params = {
233
+ "delimiter": args.delimiter,
234
+ "strip": not args.no_strip,
235
+ "bookend": args.bookend,
236
+ "bookend_strip": not args.no_bookend_strip,
237
+ "encoding": args.encoding,
238
+ "skip_header_rows": args.skip_header,
239
+ "skip_footer_rows": args.skip_footer,
240
+ "chunk_size": args.chunk_size,
241
+ "detect_columns": args.detect_columns,
242
+ "raise_on_missing_columns": args.raise_on_missing_columns,
243
+ "raise_on_extra_columns": args.raise_on_extra_columns,
244
+ "max_detect_chunks": args.max_detect_chunks,
245
+ "skip_empty_lines": args.skip_empty_lines,
246
+ }
247
+
248
+ # Merge: start from file (if any), then overlay CLI-provided values
249
+ merged = {**base_params, **{k: v for k, v in cli_params.items() if v is not None}}
250
+
144
251
  # Create configuration and Dsv instance for parsing
145
- config = DsvConfig(
146
- delimiter=args.delimiter,
147
- strip=not args.no_strip,
148
- bookend=args.bookend,
149
- bookend_strip=not args.no_bookend_strip,
150
- encoding=args.encoding,
151
- skip_header_rows=args.skip_header,
152
- skip_footer_rows=args.skip_footer,
153
- chunk_size=args.chunk_size,
154
- )
252
+ try:
253
+ config = DsvConfig.from_params(**merged)
254
+ except Exception as e:
255
+ print(f"Error building configuration: {e}", file=sys.stderr)
256
+ return 1
257
+ dsv = Dsv(config)
155
258
  dsv = Dsv(config)
156
259
 
157
260
  # Parse the file
@@ -161,18 +264,26 @@ def run_cli() -> int:
161
264
  chunk_count = 0
162
265
  total_rows = 0
163
266
 
164
- for chunk in dsv.parse_stream(file_path):
165
- chunk_count += 1
166
- total_rows += len(chunk)
167
- if args.output_format == "json":
168
- print(json.dumps(chunk, ensure_ascii=False))
169
- elif args.output_format == "ndjson":
170
- for row in chunk:
171
- print(json.dumps(row, ensure_ascii=False))
172
- else:
173
- print(f"Chunk {chunk_count}: {len(chunk)} rows")
174
- print_results(chunk, args.delimiter)
175
- print()
267
+ try:
268
+ for chunk in dsv.parse_file_stream(file_path):
269
+ chunk_count += 1
270
+ total_rows += len(chunk)
271
+
272
+ if args.output_format == "json":
273
+ print(json.dumps(chunk, ensure_ascii=False))
274
+ elif args.output_format == "ndjson":
275
+ for row in chunk:
276
+ print(json.dumps(row, ensure_ascii=False))
277
+ else:
278
+ print(f"Chunk {chunk_count}: {len(chunk)} rows")
279
+ print_results(chunk, args.delimiter)
280
+ print()
281
+ except Exception as e:
282
+ print(f"Error during streaming: {e}", file=sys.stderr)
283
+ import traceback
284
+
285
+ traceback.print_exc(file=sys.stderr)
286
+ return 1
176
287
 
177
288
  if args.output_format not in ["json", "ndjson"]:
178
289
  print(f"Total: {total_rows} rows in {chunk_count} chunks")