ledgerkit 1.0.0.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ledgerkit/loader.py ADDED
@@ -0,0 +1,311 @@
1
+ """File loader for ledgerkit.
2
+
3
+ Handles file I/O, include directive expansion, path resolution, glob
4
+ matching, and circular include detection. Calls parse_string() for
5
+ text-to-Journal conversion.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import glob as _glob_module
11
+ import os
12
+ import re
13
+ from pathlib import Path
14
+
15
+ from ledgerkit.models import Journal
16
+ from ledgerkit.parser import ParseError, parse_string
17
+
18
+
19
# Lower-cased file suffixes that may be loaded directly or pulled in through
# an include directive.
_SUPPORTED_EXTENSIONS: frozenset[str] = frozenset((".journal", ".ledger"))

# hledger format-type prefixes that can precede an include path
# (for example "timedot:" or "csv:").  ledgerkit v1 rejects every one of them.
_FORMAT_PREFIXES: frozenset[str] = frozenset(
    (
        "journal",
        "ledger",
        "timeclock",
        "timedot",
        "csv",
        "ssv",
        "tsv",
        "rules",
    )
)

# Recognises an hledger `include` directive line.
#
# Used to decide whether a line is an include directive and, if so, to pull
# out the target path text for later resolution.
#
# Capture groups:
#   (1) (.+)  everything after the whitespace that follows the "include"
#             keyword.  It can hold glob characters, tildes, slashes or
#             spaces; callers strip surrounding whitespace before using it.
#
# Behaviour worth knowing:
#   - The bare word "include" does not match: \s+ needs at least one
#     whitespace character after the keyword.
#   - The keyword followed by exactly one whitespace character does not match
#     either, because \s+ and (.+) each need a character of their own.  With
#     two or more trailing whitespace characters the line matches and group
#     (1) is whitespace-only; the caller raises ParseError after stripping.
#   - Lines that begin with spaces or tabs do not match, since ^ anchors the
#     keyword to the first column; indented posting-style lines are never
#     directives.
#   - "included" and "includes" do not match, since whitespace must follow
#     the exact word "include".
_INCLUDE_LINE = re.compile(r"^include\s+(.+)$")
49
+
50
+
51
def _validate_extension(
    path: Path,
    *,
    lineno: int | None = None,
    source: Path | None = None,
) -> None:
    """Reject paths whose suffix is not one of the supported extensions.

    The suffix is compared case-insensitively against
    ``_SUPPORTED_EXTENSIONS``.

    Args:
        path: Path whose suffix is checked.
        lineno: Line number passed through to the raised ParseError.
        source: When given, appended to the message as " in <source>".

    Raises:
        ParseError: if the lower-cased suffix of ``path`` is not supported.
    """
    suffix = path.suffix.lower()
    if suffix in _SUPPORTED_EXTENSIONS:
        return

    accepted = ", ".join(sorted(_SUPPORTED_EXTENSIONS))
    origin = "" if source is None else f" in {source}"
    raise ParseError(
        f"unsupported file format {suffix!r} — ledgerkit accepts: {accepted}{origin}",
        lineno,
    )
66
+
67
+
68
def _check_format_prefix(raw: str, lineno: int, source: Path) -> None:
    """Reject include paths that start with a known hledger format prefix.

    The text before the first ":" is lower-cased and looked up in
    ``_FORMAT_PREFIXES``.  Prefixes shorter than two characters are never
    looked up, so a Windows drive letter such as "C:" passes through.

    Args:
        raw: Stripped include target text.
        lineno: Line number passed through to the raised ParseError.
        source: File containing the directive (not used in the message).

    Raises:
        ParseError: if ``raw`` begins with "<prefix>:" for a known prefix.
    """
    head, colon, _tail = raw.partition(":")
    # No colon at all, or at most one character in front of it.
    if not colon or len(head) < 2:
        return

    prefix = head.lower()
    if prefix not in _FORMAT_PREFIXES:
        return

    raise ParseError(
        f"format prefixes not supported in ledgerkit v1 — "
        f"remove the '{prefix}:' prefix from the include path",
        lineno,
    )
84
+
85
+
86
+ def _resolve_include_path(raw: str, containing_dir: Path) -> list[Path]:
87
+ """Resolve a raw include path string to a sorted list of absolute Paths.
88
+
89
+ Handles tilde expansion, absolute paths, relative paths (relative to
90
+ containing_dir), and glob patterns. Returns an empty list when a glob
91
+ pattern matches no files. Does NOT filter out the calling file.
92
+
93
+ Args:
94
+ raw: The stripped path string from the include directive.
95
+ containing_dir: Absolute directory of the file containing the include.
96
+
97
+ Returns:
98
+ Sorted list of resolved absolute Path objects.
99
+ """
100
+ if raw.startswith("~"):
101
+ base = Path(raw).expanduser()
102
+ elif Path(raw).is_absolute():
103
+ base = Path(raw)
104
+ else:
105
+ base = containing_dir / raw
106
+
107
+ base_str = str(base)
108
+ if any(c in base_str for c in ("*", "?", "[")):
109
+ matches = _glob_module.glob(base_str, recursive=True)
110
+ return sorted(Path(m).resolve() for m in matches)
111
+
112
+ return [base.resolve()]
113
+
114
+
115
def _expand_includes(
    file_path: Path,
    visited: set[Path],
    line_map: list[tuple[Path, int]],
) -> str:
    """Recursively read file_path and expand all include directives.

    Appends one (file_path, lineno) entry to line_map for every line that
    appears in the returned text, so callers can map expanded-text line
    numbers back to their originating source file and line.

    The include directive lines themselves are consumed and produce no output
    or line_map entries; they are replaced by the expanded content of the
    referenced file(s).

    The containing file is dropped from *glob* results only.  A literal
    include that names the containing file (or any file already in the
    include chain) is reported as a circular include.

    Args:
        file_path: Resolved absolute path to the file to expand.
        visited: Set of absolute paths currently in the include chain.
            Mutated (add before recurse, removed afterwards even when the
            recursion raises) for cycle detection.
        line_map: Accumulator for source attribution.  Caller passes an empty
            list for the root file; recursive calls share the same list.

    Returns:
        Fully expanded text with all include directives substituted inline.

    Raises:
        FileNotFoundError: if file_path or a non-glob included file does not
            exist.  For included files the message names the missing path
            and the referencing file and line.
        ParseError: on circular include, unsupported extension, format prefix,
            missing include path, or a glob pattern that matches no files.
    """
    text = file_path.read_text(encoding="utf-8")
    output: list[str] = []

    for lineno, line in enumerate(text.splitlines(), 1):
        m = _INCLUDE_LINE.match(line)
        if not m:
            line_map.append((file_path, lineno))
            output.append(line + "\n")
            continue

        raw_target = m.group(1).strip()
        if not raw_target:
            raise ParseError("include: missing file path", lineno)

        _check_format_prefix(raw_target, lineno, file_path)

        is_glob = any(c in raw_target for c in ("*", "?", "["))
        targets = _resolve_include_path(raw_target, file_path.parent)

        if is_glob:
            # A glob never re-includes the file that contains it.
            targets = [t for t in targets if t != file_path]
            if not targets:
                raise ParseError(
                    f"include: no files matched {raw_target!r}",
                    lineno,
                )
        elif not targets:
            # Defensive: a literal target should always resolve to one path.
            # Report it as missing rather than indexing an empty list.
            literal = Path(raw_target).expanduser()
            if not literal.is_absolute():
                literal = file_path.parent / literal
            raise FileNotFoundError(
                f"include: file not found: {literal} "
                f"(referenced at {file_path}, line {lineno})"
            )

        for target in targets:
            _validate_extension(target, lineno=lineno, source=file_path)
            if target in visited:
                raise ParseError(
                    f"circular include detected: {target} is already being "
                    f"processed (referenced at {file_path}, line {lineno})",
                    lineno,
                )
            # Fail here, where the referencing file and line are known,
            # instead of letting read_text() raise a bare error one level down.
            if not target.exists():
                raise FileNotFoundError(
                    f"include: file not found: {target} "
                    f"(referenced at {file_path}, line {lineno})"
                )
            visited.add(target)
            try:
                output.append(_expand_includes(target, visited, line_map))
            finally:
                # Keep the chain accurate even if expansion of target failed.
                visited.discard(target)

    return "".join(output)
192
+
193
+
194
def load_journal_stdin() -> Journal:
    """Parse everything available on stdin into a Journal.

    The complete stdin contents are handed to ``parse_string()`` as-is;
    include directives are not expanded by this function.  The returned
    journal's ``source_file`` is then overwritten with ``"(stdin)"``.

    Returns:
        The :class:`~ledgerkit.models.Journal` produced by ``parse_string()``
        with ``source_file`` set to ``"(stdin)"``.

    Raises:
        ParseError: propagated from ``parse_string()`` when the stdin content
            is malformed.
    """
    import sys

    stdin_text = sys.stdin.read()
    parsed = parse_string(stdin_text)
    parsed.source_file = "(stdin)"
    return parsed
214
+
215
+
216
def merge_journals(journals: list[Journal]) -> Journal:
    """Combine several Journal objects into one.

    Each list-valued field (transactions, prices and the four ``declared_*``
    lists) is the concatenation of that field across ``journals`` in input
    order.  ``source_file`` comes from the first journal and
    ``included_files`` is the sum over all journals.

    Args:
        journals: Non-empty list of Journal objects to merge.

    Returns:
        A new :class:`~ledgerkit.models.Journal` holding the combined data.
        When the list has exactly one entry that same object is returned
        unchanged, not a copy.

    Raises:
        ValueError: if ``journals`` is empty.
    """
    if not journals:
        raise ValueError("merge_journals: at least one journal required")

    first = journals[0]
    if len(journals) == 1:
        return first

    # Fields that are simply concatenated, in the order they are passed on.
    list_fields = (
        "transactions",
        "prices",
        "declared_accounts",
        "declared_commodities",
        "declared_payees",
        "declared_tags",
    )
    combined = {
        name: [item for journal in journals for item in getattr(journal, name)]
        for name in list_fields
    }
    return Journal(
        **combined,
        source_file=first.source_file,
        included_files=sum(journal.included_files for journal in journals),
    )
249
+
250
+
251
def load_journal(path: str | os.PathLike) -> Journal:
    """Load a .journal or .ledger file and return a Journal object.

    The root file is resolved to an absolute path, its extension is
    validated, and every hledger ``include`` directive is expanded in place
    (recursively) before the combined text is parsed in a single
    ``parse_string()`` call.

    Path resolution in include directives:
      - ``~/...``     tilde expanded to the home directory
      - ``/abs/path`` used as-is (absolute)
      - ``relative``  resolved relative to the containing file's directory
      - Glob patterns (``*``, ``**``, ``?``, ``[range]``) are expanded via
        :func:`glob.glob`.

    When parsing fails with a ParseError whose line number falls inside the
    expanded text, the error is re-raised with the originating file prefixed
    to the message and the line number translated back to that file.  Any
    trailing " (line N)" in the original message is removed first.

    Args:
        path: Absolute or relative path to the root journal file.

    Returns:
        A :class:`~ledgerkit.models.Journal` whose ``source_file`` is the
        resolved root path and whose ``included_files`` is the number of
        distinct non-root files that contributed at least one line.

    Raises:
        FileNotFoundError: if the root path or a non-glob included file does
            not exist.
        ParseError: if an extension is unsupported, a format prefix is used,
            a circular include is detected, a glob matches nothing, or the
            file contents are malformed.
    """
    root = Path(os.fspath(path)).resolve()
    _validate_extension(root)

    # One (file, line) entry per line of the expanded text.
    origins: list[tuple[Path, int]] = []
    flattened = _expand_includes(root, {root}, origins)

    other_sources = {origin for origin, _ in origins if origin != root}
    included_total = len(other_sources)

    try:
        journal = parse_string(flattened, source_file=str(root))
    except ParseError as err:
        flat_line = err.line_number
        # Without a usable line number there is nothing to translate.
        if flat_line is None or not (1 <= flat_line <= len(origins)):
            raise

        culprit_file, culprit_line = origins[flat_line - 1]
        message = err.args[0]
        # Drop the stale location suffix; it refers to the expanded text.
        if " (line " in message:
            message = message.rpartition(" (line ")[0]
        raise ParseError(f"{culprit_file}: {message}", culprit_line) from err

    journal.source_file = str(root)
    journal.included_files = included_total
    return journal