splurge-dsv 2025.1.5-py3-none-any.whl → 2025.2.0-py3-none-any.whl

@@ -0,0 +1,177 @@
+"""Safe text file reader utilities.
+
+This module implements :class:`SafeTextFileReader`, a small helper that reads
+text files in binary mode and performs deterministic newline normalization.
+It intentionally decodes bytes explicitly to avoid platform newline
+translation side-effects and centralizes encoding error handling into a
+package-specific exception type.
+
+Public API summary:
+- SafeTextFileReader: Read, preview, and stream text files with normalized
+  newlines and optional header/footer skipping.
+- open_text: Context manager returning an in-memory text stream for
+  callers that expect a file-like object.
+
+Example:
+    reader = SafeTextFileReader("data.csv", encoding="utf-8")
+    lines = reader.read()
+
+License: MIT
+
+Copyright (c) 2025 Jim Schilling
+"""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+from contextlib import contextmanager
+from io import StringIO
+from pathlib import Path
+
+from splurge_dsv.exceptions import SplurgeDsvFileEncodingError
+
+
+class SafeTextFileReader:
+    """Read text files with deterministic newline normalization.
+
+    The class reads raw bytes from disk and decodes them using the provided
+    encoding. Newline sequences are normalized to ``\n`` (LF). Public
+    methods provide convenience wrappers for full reads, previews, and
+    chunked streaming.
+
+    Args:
+        file_path (Path | str): Path to the file to read.
+        encoding (str): Encoding to use when decoding bytes (default: utf-8).
+
+    Example:
+        reader = SafeTextFileReader("/tmp/data.csv", encoding="utf-8")
+        rows = reader.read(skip_header_rows=1)
+    """
+
+    def __init__(self, file_path: Path | str, *, encoding: str = "utf-8") -> None:
+        self.path = Path(file_path)
+        self.encoding = encoding
+
+    def _read_text(self) -> str:
+        """Read the file's bytes and return the decoded text; no newline normalization is applied.
+
+        Returns:
+            Decoded text (str).
+
+        Raises:
+            SplurgeDsvFileEncodingError: If decoding fails or the file cannot
+                be read.
+        """
+        try:
+            # Read raw bytes and decode explicitly to avoid the platform's
+            # text-mode newline translations, which can alter mixed line endings.
+            with self.path.open("rb") as fh:
+                raw = fh.read()
+            return raw.decode(self.encoding)
+        except Exception as e:
+            raise SplurgeDsvFileEncodingError(f"Encoding error reading file: {self.path}", details=str(e)) from e
+
+    def read(self, *, strip: bool = True, skip_header_rows: int = 0, skip_footer_rows: int = 0) -> list[str]:
+        """Read the entire file and return a list of normalized lines.
+
+        Newlines are normalized to ``\n``, and optional header/footer rows
+        can be skipped. If ``strip`` is True, whitespace surrounding each
+        line is removed.
+
+        Args:
+            strip (bool): Strip whitespace from each line (default: True).
+            skip_header_rows (int): Number of rows to skip at the start.
+            skip_footer_rows (int): Number of rows to skip at the end.
+
+        Returns:
+            List of lines as strings.
+        """
+        text = self._read_text()
+        # Normalize newlines to LF
+        normalized = text.replace("\r\n", "\n").replace("\r", "\n")
+        lines = normalized.splitlines()
+
+        if skip_header_rows:
+            lines = lines[skip_header_rows:]
+        if skip_footer_rows:
+            if skip_footer_rows >= len(lines):
+                return []
+            lines = lines[:-skip_footer_rows]
+
+        if strip:
+            return [ln.strip() for ln in lines]
+        return list(lines)
+
+    def preview(self, max_lines: int = 100, *, strip: bool = True, skip_header_rows: int = 0) -> list[str]:
+        """Return the first ``max_lines`` lines of the file after normalization.
+
+        Args:
+            max_lines (int): Maximum number of lines to return.
+            strip (bool): Strip whitespace from each returned line.
+            skip_header_rows (int): Number of header rows to skip before previewing.
+
+        Returns:
+            A list of preview lines.
+        """
+        text = self._read_text()
+        normalized = text.replace("\r\n", "\n").replace("\r", "\n")
+        lines = normalized.splitlines()
+        if skip_header_rows:
+            lines = lines[skip_header_rows:]
+        if max_lines < 1:
+            return []
+        result = lines[:max_lines]
+        return [ln.strip() for ln in result] if strip else list(result)
+
+    def read_as_stream(
+        self, *, strip: bool = True, skip_header_rows: int = 0, skip_footer_rows: int = 0, chunk_size: int = 500
+    ) -> Iterator[list[str]]:
+        """Yield chunks of lines from the file.
+
+        This convenience method currently reads the decoded file into memory
+        and yields chunks of ``chunk_size`` lines. For very large files this
+        could be optimized to stream from disk without full materialization.
+
+        Args:
+            strip (bool): Whether to strip whitespace from each line.
+            skip_header_rows (int): Number of header rows to skip.
+            skip_footer_rows (int): Number of footer rows to skip.
+            chunk_size (int): Number of lines per yielded chunk.
+
+        Yields:
+            Lists of lines (each list length <= chunk_size).
+        """
+        lines = self.read(strip=strip, skip_header_rows=skip_header_rows, skip_footer_rows=skip_footer_rows)
+        chunk: list[str] = []
+        for ln in lines:
+            chunk.append(ln)
+            if len(chunk) >= chunk_size:
+                yield chunk
+                chunk = []
+        if chunk:
+            yield chunk
+
+
+@contextmanager
+def open_text(file_path: Path | str, *, encoding: str = "utf-8"):
+    """Context manager returning a text stream (io.StringIO) with normalized newlines.
+
+    Useful when an API expects a file-like object. The returned StringIO
+    contains the normalized text (LF newlines) and is closed automatically
+    when the context exits.
+
+    Args:
+        file_path: Path to the file to open.
+        encoding: Encoding to decode the file with.
+
+    Yields:
+        io.StringIO: In-memory text buffer with normalized newlines.
+    """
+    reader = SafeTextFileReader(file_path, encoding=encoding)
+    text_lines = reader.read(strip=False)
+    text = "\n".join(text_lines)
+    sio = StringIO(text)
+    try:
+        yield sio
+    finally:
+        sio.close()
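
A minimal usage sketch of the reader API introduced above. The import path
splurge_dsv.safe_text_file_reader is an assumption (this diff omits file
names), and data.csv is a hypothetical input file:

    # Module path assumed; the diff does not show the new file's name.
    from splurge_dsv.safe_text_file_reader import SafeTextFileReader, open_text

    reader = SafeTextFileReader("data.csv", encoding="utf-8")

    # Full read: CRLF/CR are normalized to LF and one header row is skipped.
    rows = reader.read(skip_header_rows=1)

    # Chunked iteration: lists of at most 200 lines per chunk (note the file
    # is still read fully into memory, per the read_as_stream docstring).
    for chunk in reader.read_as_stream(chunk_size=200):
        print(len(chunk))

    # File-like access for APIs that expect a readable text stream.
    with open_text("data.csv", encoding="utf-8") as stream:
        header = stream.readline()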
@@ -0,0 +1,136 @@
+"""Deterministic text-only writer utilities.
+
+This module implements :class:`SafeTextFileWriter` and a convenience
+``open_text_writer`` context manager. Writes always use the configured
+encoding and normalize newline characters to a canonical form (LF) to
+ensure consistent files across platforms.
+
+Example:
+    with open_text_writer("out.txt") as buf:
+        buf.write("line1\nline2\n")
+
+Copyright (c) 2025 Jim Schilling
+Please preserve this header and all related material when sharing!
+
+License: MIT
+"""
+
+from __future__ import annotations
+
+import io
+from collections.abc import Iterable, Iterator
+from contextlib import contextmanager
+from pathlib import Path
+from typing import cast
+
+from .exceptions import SplurgeDsvFileEncodingError
+
+
+class SafeTextFileWriter:
+    """Helper for deterministic text writes with newline normalization.
+
+    Args:
+        file_path: Destination file path.
+        encoding: Text encoding to use (default: 'utf-8').
+        newline: Canonical newline sequence to write (default: '\n').
+
+    The class exposes a minimal file-like API and will raise
+    :class:`SplurgeDsvFileEncodingError` when the underlying file cannot be
+    opened with the requested encoding.
+    """
+
+    def __init__(self, file_path: Path, *, encoding: str = "utf-8", newline: str | None = "\n") -> None:
+        self._path = Path(file_path)
+        self._encoding = encoding
+        # newline is the canonical newline we will write; default to LF
+        self._newline = "\n" if newline is None else newline
+        self._file: io.TextIOBase | None = None
+
+    def open(self, mode: str = "w") -> io.TextIOBase:
+        """Open the underlying file for text writing.
+
+        Args:
+            mode: File open mode (default: 'w').
+
+        Returns:
+            The opened text file object.
+
+        Raises:
+            SplurgeDsvFileEncodingError: If the file cannot be opened with the
+                requested encoding or an underlying OS error occurs.
+        """
+        try:
+            # Open with newline="" so that newline normalization stays under
+            # our control rather than the platform's.
+            fp = open(self._path, mode, encoding=self._encoding, newline="")
+            # Cast to TextIOBase for precise typing.
+            self._file = cast(io.TextIOBase, fp)
+            return self._file
+        except (LookupError, OSError) as exc:
+            raise SplurgeDsvFileEncodingError(str(exc)) from exc
+
+    def write(self, text: str) -> int:
+        """Normalize newlines and write ``text`` to the opened file.
+
+        Args:
+            text: Text to write (newlines will be normalized).
+
+        Returns:
+            Number of characters written.
+        """
+        if self._file is None:
+            raise ValueError("file not opened")
+        normalized = text.replace("\r\n", "\n").replace("\r", "\n")
+        return self._file.write(normalized)
+
+    def writelines(self, lines: Iterable[str]) -> None:
+        if self._file is None:
+            raise ValueError("file not opened")
+        for line in lines:
+            self.write(line)
+
+    def flush(self) -> None:
+        if self._file is None:
+            return
+        self._file.flush()
+
+    def close(self) -> None:
+        if self._file is None:
+            return
+        try:
+            self._file.close()
+        finally:
+            self._file = None
+
+
+@contextmanager
+def open_text_writer(file_path: Path | str, *, encoding: str = "utf-8", mode: str = "w") -> Iterator[io.StringIO]:
+    """Context manager yielding an in-memory StringIO to accumulate text.
+
+    On successful exit, the buffered content is normalized and written to
+    disk using :class:`SafeTextFileWriter`. If an exception occurs inside
+    the context, nothing is written and the exception is propagated.
+
+    Args:
+        file_path: Destination path to write to on successful exit.
+        encoding: Encoding to use when writing.
+        mode: File open mode passed to the writer (default: 'w').
+
+    Yields:
+        io.StringIO: Buffer to write textual content into.
+    """
+    path = Path(file_path)
+    buffer = io.StringIO()
+    try:
+        yield buffer
+    except Exception:
+        # Do not write on exceptions; re-raise.
+        raise
+    else:
+        content = buffer.getvalue()
+        writer = SafeTextFileWriter(path, encoding=encoding)
+        try:
+            writer.open(mode=mode)
+            writer.write(content)
+            writer.flush()
+        finally:
+            writer.close()
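
A corresponding sketch for the writer, under the same module-path assumption
(splurge_dsv.safe_text_file_writer is not confirmed by this diff); out.txt is
a hypothetical destination:

    from pathlib import Path

    # Module path assumed; the diff does not show the new file's name.
    from splurge_dsv.safe_text_file_writer import SafeTextFileWriter, open_text_writer

    # Buffered form: content reaches disk only if the block exits cleanly.
    with open_text_writer("out.txt", encoding="utf-8") as buf:
        buf.write("line1\r\nline2\r")  # mixed endings become LF on write

    # Direct form: explicit open/write/close with the same normalization.
    writer = SafeTextFileWriter(Path("out.txt"), encoding="utf-8")
    try:
        writer.open(mode="w")
        writer.write("alpha\r\nbeta\n")
        writer.flush()
    finally:
        writer.close()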
@@ -12,7 +12,7 @@ This module is licensed under the MIT License.
 """
 
 # Local imports
-from splurge_dsv.exceptions import SplurgeParameterError
+from splurge_dsv.exceptions import SplurgeDsvParameterError
 
 
 class StringTokenizer:
@@ -29,21 +29,24 @@ class StringTokenizer:
 
     @staticmethod
     def parse(content: str | None, *, delimiter: str, strip: bool = DEFAULT_STRIP) -> list[str]:
-        """
-        Split a string into tokens based on a delimiter.
+        """Tokenize a single string using ``delimiter``.
+
+        The function preserves empty tokens (e.g. ``"a,,c"`` with
+        delimiter ``","`` yields ``['a', '', 'c']``). If ``content`` is
+        None, an empty list is returned.
 
         Args:
-            content (str | None): The input string to tokenize
-            delimiter (str): The character(s) to split the string on
-            strip (bool, optional): Whether to strip whitespace from tokens. Defaults to True.
+            content: The input string to tokenize, or ``None``.
+            delimiter: The delimiter string to split on.
+            strip: If True, strip leading/trailing whitespace from each token.
 
         Returns:
-            list[str]: List of tokens, preserving empty tokens
+            A list of tokens. Empty tokens are preserved.
 
         Raises:
-            SplurgeParameterError: If delimiter is empty or None
+            SplurgeDsvParameterError: If ``delimiter`` is empty or ``None``.
 
-        Example:
+        Examples:
             >>> StringTokenizer.parse("a,b,c", delimiter=",")
             ['a', 'b', 'c']
             >>> StringTokenizer.parse("a,,c", delimiter=",")
@@ -53,7 +56,7 @@ class StringTokenizer:
             return []
 
         if delimiter is None or delimiter == "":
-            raise SplurgeParameterError("delimiter cannot be empty or None")
+            raise SplurgeDsvParameterError("delimiter cannot be empty or None")
 
         if strip and not content.strip():
            return []
@@ -65,51 +68,56 @@ class StringTokenizer:
 
     @classmethod
     def parses(cls, content: list[str], *, delimiter: str, strip: bool = DEFAULT_STRIP) -> list[list[str]]:
-        """
-        Process multiple strings into lists of tokens.
+        """Tokenize multiple strings.
 
         Args:
-            content (list[str]): List of strings to tokenize
-            delimiter (str): The character(s) to split each string on
-            strip (bool, optional): Whether to strip whitespace from tokens. Defaults to True.
+            content: A list of strings to tokenize.
+            delimiter: The delimiter to use for splitting.
+            strip: If True, strip whitespace from tokens.
 
         Returns:
-            list[list[str]]: List of token lists, one for each input string
+            A list where each element is the token list for the corresponding
+            input string.
 
         Raises:
-            SplurgeParameterError: If delimiter is empty or None
+            SplurgeDsvParameterError: If ``delimiter`` is empty or ``None``.
 
         Example:
             >>> StringTokenizer.parses(["a,b", "c,d"], delimiter=",")
             [['a', 'b'], ['c', 'd']]
         """
         if delimiter is None or delimiter == "":
-            raise SplurgeParameterError("delimiter cannot be empty or None")
+            raise SplurgeDsvParameterError("delimiter cannot be empty or None")
 
         return [cls.parse(text, delimiter=delimiter, strip=strip) for text in content]
 
     @staticmethod
     def remove_bookends(content: str, *, bookend: str, strip: bool = DEFAULT_STRIP) -> str:
-        """
-        Remove matching characters from both ends of a string.
+        """Remove matching bookend characters from both ends of ``content``.
+
+        The function optionally strips surrounding whitespace before checking
+        for matching bookend characters. If both ends match the provided
+        ``bookend`` and the remaining content is long enough, the bookends are
+        removed; otherwise the possibly-stripped input is returned unchanged.
 
         Args:
-            content (str): The input string to process
-            bookend (str): The character(s) to remove from both ends
-            strip (bool, optional): Whether to strip whitespace first. Defaults to True.
+            content: The input string to process.
+            bookend: The bookend string to remove from both ends (e.g. '"').
+            strip: If True, strip whitespace prior to bookend removal.
 
         Returns:
-            str: The string with matching bookends removed
+            The input string with matching bookend characters removed when
+            applicable.
 
         Raises:
-            SplurgeParameterError: If bookend is empty or None
+            SplurgeDsvParameterError: If ``bookend`` is empty or ``None``.
 
         Example:
            >>> StringTokenizer.remove_bookends("'hello'", bookend="'")
            'hello'
        """
        if bookend is None or bookend == "":
-            raise SplurgeParameterError("bookend cannot be empty or None")
+            raise SplurgeDsvParameterError("bookend cannot be empty or None")
 
        value: str = content.strip() if strip else content
        if value.startswith(bookend) and value.endswith(bookend) and len(value) > 2 * len(bookend) - 1:
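
The hunks above amount to a docstring rewrite plus the SplurgeParameterError
to SplurgeDsvParameterError rename. A short sketch of the documented behavior
(the tokenizer's module path is assumed; the exceptions import matches the
first hunk above):

    from splurge_dsv.string_tokenizer import StringTokenizer  # path assumed
    from splurge_dsv.exceptions import SplurgeDsvParameterError

    StringTokenizer.parse("a,,c", delimiter=",")             # ['a', '', 'c']
    StringTokenizer.parses(["a,b", "c,d"], delimiter=",")    # [['a', 'b'], ['c', 'd']]
    StringTokenizer.remove_bookends("'hello'", bookend="'")  # 'hello'

    try:
        StringTokenizer.parse("a,b", delimiter="")
    except SplurgeDsvParameterError:
        pass  # empty delimiters now raise the renamed exception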