ledgerkit 1.0.0.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ledgerkit/parser.py ADDED
@@ -0,0 +1,1547 @@
1
+ """Journal file parser for ledgerkit.
2
+
3
+ Converts raw .journal text into Journal/Transaction/Posting objects.
4
+ See docs/hledger-compatibility.md for the transaction block structure and
5
+ the list of supported format features.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import datetime
11
+ import re
12
+ from dataclasses import dataclass
13
+ from decimal import Decimal, InvalidOperation
14
+ from typing import Optional
15
+
16
+ from ledgerkit.models import Amount, BalanceAssertion, Journal, Posting, PriceDirective, SourceSpan, Transaction
17
+
18
+
19
+ @dataclass
20
+ class _ParseContext:
21
+ """Mutable parser state threaded through the parsing call chain.
22
+
23
+ Holds directive-accumulated values that must be visible to both
24
+ _parse_amount and _parse_posting without threading multiple individual
25
+ parameters. Fields are mutated in-place as directives are encountered.
26
+ """
27
+
28
+ default_year: int
29
+ decimal_mark: str
30
+ default_commodity: Optional[str] = None
31
+ account_prefix: Optional[str] = None
32
+
33
+
34
+ # ---------------------------------------------------------------------------
35
+ # Module-level constants
36
+ # ---------------------------------------------------------------------------
37
+
38
+ # Matches the two-or-more-space + comment-character separator used in directives.
39
+ #
40
+ # Purpose: split directive lines on the boundary between directive body and
41
+ # inline comment, per hledger's rule that a single space may appear
42
+ # inside a body value (e.g. an account name like "expenses:fun money")
43
+ # but two or more spaces before a comment character always begin a comment.
44
+ #
45
+ # Pattern: \s{2,}[;#]
46
+ # \s{2,} — two or more whitespace characters (spaces or tabs)
47
+ # [;#] — semicolon or hash: both are recognised comment introducers in ledgerkit
48
+ #
49
+ # Edge cases:
50
+ # - A single space before ';' or '#' is NOT a separator (belongs to the body)
51
+ # - Only the FIRST match is used (re.split with maxsplit=1)
52
+ # - Lines with no such pattern return the original body unchanged
53
# Used with split(..., maxsplit=1): the text before the first match is the
# directive body; the match itself (whitespace run plus ';' or '#') is dropped.
_TWO_SPACE_SEP = re.compile(r"\s{2,}[;#]")
54
+
55
+
56
class ParseError(ValueError):
    """Signal malformed hledger journal input.

    The exception text is the supplied message, followed by
    " (line N)" whenever a line number is given.

    Attributes:
        line_number: 1-based line number where the problem was detected,
            or None when no line applies.
    """

    def __init__(self, message: str, line_number: int | None = None) -> None:
        self.line_number = line_number
        if line_number is None:
            text = f"{message}"
        else:
            text = f"{message} (line {line_number})"
        super().__init__(text)
68
+
69
+
70
class ParseWarning(ParseError):
    """A non-fatal parse notice; does not prevent journal loading.

    Because it subclasses ParseError, an ``except ParseError`` (or
    ``except ValueError``) handler also catches it, and it takes the same
    ``(message, line_number)`` constructor arguments.
    """
72
+
73
+
74
+ # ---------------------------------------------------------------------------
75
+ # Regex patterns
76
+ # ---------------------------------------------------------------------------
77
+
78
+ # Matches a transaction header line in hledger journal format.
79
+ #
80
+ # Purpose: extract date, optional status flag, optional transaction code,
81
+ # description text, and optional inline comment from a single
82
+ # non-indented line that begins a transaction block.
83
+ #
84
+ # Group breakdown:
85
+ # (1) primary date (and optional =DATE2)
86
+ # — captured as a single raw string and passed to _parse_txn_header,
87
+ # which splits on '=' when present to obtain both dates.
88
+ # Handles YYYY-MM-DD, YYYY/MM/DD, YYYY.MM.DD, and year-omitted forms
89
+ # (M/DD, MM-DD, etc.); leading zeros optional on month and day.
90
+ # The secondary date uses the same format rules as the primary date.
91
+ # (2) [*!]? — status flag: '*' = cleared, '!' = pending;
92
+ # absent means uncleared
93
+ # (3) (?:\(([^)]*)\))? — transaction code in parentheses, e.g. (INV-42);
94
+ # outer parens consumed, only inner text captured;
95
+ # [^)]* prevents greedily crossing a closing paren
96
+ # (4) .*? — description: lazy so the trailing comment anchor
97
+ # can match; stripped of surrounding whitespace
98
+ # after the match
99
+ # (5) (?:\s*;\s*(.*))? — inline comment following ';'; the ';' itself and
100
+ # surrounding spaces are consumed but not captured
101
+ #
102
+ # Edge cases:
103
+ # - Description may be empty (e.g. "2024-01-01 *" with no text after the flag)
104
+ # - Code may be absent even when a flag is present, and vice versa
105
+ # - A bare ';' anywhere after the date is treated as the comment delimiter;
106
+ # this matches hledger's own behaviour (first ';' ends the description)
107
+ # - Mixed separators (e.g. "2024-01/15") are captured here without complaint;
108
+ # _parse_simple_date accepts them since each separator pair is matched
109
+ # independently
110
+ # - "2024-02-20=2024-02-22 * desc" → group 1 = "2024-02-20=2024-02-22";
111
+ # _parse_txn_header splits on '=' to obtain both dates
112
+ # - A trailing '=' without a following date2 pattern is not matched by the
113
+ # optional group, so group 1 contains only the primary date
114
_TXN_HEADER = re.compile(
    # Group 1: primary date, year optional ...
    r"^((?:\d{4}[-/.])?(?:\d{1,2})[-/.](?:\d{1,2})"
    # ... plus an optional "=DATE2" secondary date, still inside group 1.
    r"(?:=(?:\d{4}[-/.])?(?:\d{1,2})[-/.](?:\d{1,2}))?)"
    # Group 2: optional status flag, '*' or '!'.
    r"\s*([*!])?"
    # Group 3: optional code; only the text inside the parentheses is captured.
    r"\s*(?:\(([^)]*)\))?"
    # Group 4: description, lazy so the comment tail below can claim its part.
    r"\s*(.*?)"
    # Group 5: optional comment text following the first ';'.
    r"(?:\s*;\s*(.*))?$"
)
122
+
123
+ # Parses a simple date string captured by _TXN_HEADER into its year, month,
124
+ # and day components.
125
+ #
126
+ # Purpose: decompose a raw date token (any of the accepted formats) into
127
+ # integer components so that datetime.date() can validate and
128
+ # construct the final date object.
129
+ #
130
+ # Group breakdown:
131
+ # (1) (\d{4}) — four-digit year; the entire (?:...) wrapper is optional,
132
+ # so this group is None when the year is omitted
133
+ # [-/.] — separator: hyphen, forward-slash, or dot; not captured
134
+ # (2) (\d{1,2}) — month, 1–2 digits, leading zero optional
135
+ # [-/.] — separator (same character classes; mixing is tolerated)
136
+ # (3) (\d{1,2}) — day of month, 1–2 digits, leading zero optional
137
+ #
138
+ # Edge cases:
139
+ # - Year absent ("1/31", "01-31"): group 1 is None; caller supplies default_year
140
+ # - Invalid calendar values ("2024-13-01"): regex matches but datetime.date()
141
+ # raises ValueError, which _parse_simple_date converts to ParseError
142
+ # - Dot separator ("2010.1.31"): matched by [-/.]; note this is unambiguous in
143
+ # header context because amounts (which also use '.') are on indented lines
144
_SIMPLE_DATE = re.compile(
    # Groups: (1) optional four-digit year, (2) month, (3) day.
    # Anchored at both ends, so the whole token must be a date.
    r"^(?:(\d{4})[-/.])?(\d{1,2})[-/.](\d{1,2})$"
)
147
+
148
+ # Matches an hledger amount string in either prefix-symbol or suffix-symbol form.
149
+ #
150
+ # Purpose: parse the quantity and commodity out of an amount token that has
151
+ # already been separated from the account name. Supports both
152
+ # '£30.00' (symbol before number) and '30.00 EUR' (symbol after number),
153
+ # optional thousands-separator commas, a leading minus sign, and a
154
+ # mid-minus sign that appears after the prefix symbol (e.g. '$-300').
155
+ #
156
+ # Group breakdown:
157
+ # (1) (-?) — optional leading minus sign before the symbol;
158
+ # applies when the minus precedes everything
159
+ # (2) ([^\d,.\s-]*) — prefix commodity: any run of characters that
160
+ # are not digits, commas, dots, whitespace, or
161
+ # minus; matches '£', '$', '€', etc.; empty
162
+ # string when the commodity is a suffix
163
+ # (3) (-?) — optional mid-minus: sign that appears AFTER the
164
+ # prefix symbol, e.g. '$-300.00'; combined with
165
+ # group 1 in _parse_amount — effective sign is
166
+ # group 1 OR group 3
167
+ # (4) ([\d,]+(?:\.\d*)?(?:[Ee][+-]?\d+)?)
168
+ # — numeric quantity: one or more digits/commas
169
+ # optionally followed by a decimal part, then
170
+ # an optional E-notation exponent ([Ee][+-]?\d+);
171
+ # commas are stripped before Decimal conversion
172
+ # (5) ([A-Za-z][A-Za-z0-9]*)? — suffix commodity: a letter-started alphanumeric
173
+ # token (e.g. EUR, USD, AAPL); absent when the
174
+ # commodity is a prefix symbol
175
+ #
176
+ # Edge cases:
177
+ # - Normally exactly one of group 2 or group 5 is non-empty; if both are empty the
178
+ # caller falls back to ctx.default_commodity (ParseError only if unset); if both are set, group 2 wins
179
+ # - A space between prefix symbol and quantity is allowed: '£ 30.00' matches
180
+ # because \s* between groups 2/3 and 4 absorbs it
181
+ # - Integer quantities ('£100') are valid; the decimal part is optional
182
+ # - Negative suffix amounts ('-30.00 EUR'): the minus in group 1 precedes the
183
+ # empty group 2, empty group 3, then the quantity in group 4, then EUR in 5
184
+ # - Mid-sign after prefix symbol ('$-300'): group 1 is empty, group 3 is '-';
185
+ # combined so Decimal('-300') is produced correctly
186
+ # - Both group 1 and group 3 present is malformed but harmless; either '-'
187
+ # makes the effective sign negative
188
+ # - Trailing decimal with no fractional digits ('$1,000.') is accepted;
189
+ # Python's Decimal('1000.') is valid and equals Decimal('1000')
190
+ # - E-notation ('1E3 EUR', '1.5e-2 GBP'): Decimal natively handles these forms;
191
+ # no extra conversion needed beyond comma-stripping
192
_AMOUNT = re.compile(
    # Group 1: minus sign written before everything else.
    r"^(-?)"
    # Group 2: prefix commodity symbol (empty when the commodity is a suffix).
    r"([^\d,.\s-]*)"
    # Group 3: minus sign written after the prefix symbol, e.g. '$-300'.
    r"(-?)"
    # Group 4: quantity -- digits/commas, optional '.fraction', optional exponent.
    r"\s*([\d,]+(?:\.\d*)?(?:[Ee][+-]?\d+)?)"
    # Group 5: optional letter-started alphanumeric suffix commodity.
    r"\s*([A-Za-z][A-Za-z0-9]*)?"
    r"$"
)
200
+
201
+ # Matches an hledger amount string when `decimal-mark ,` is active (European
202
+ # notation where comma is the decimal mark and period is the thousands separator).
203
+ #
204
+ # Purpose: parse amounts of the form "1.234,56" or "€1.234,56" where the
205
+ # convention is the reverse of the default _AMOUNT regex. Selected
206
+ # by _parse_amount when decimal_mark == ",". Also supports a mid-minus
207
+ # sign after the prefix symbol (e.g. '€-1.234,56').
208
+ #
209
+ # Group breakdown: (mirrors _AMOUNT; only the numeric group differs)
210
+ # (1) (-?) — optional leading minus sign before the symbol
211
+ # (2) ([^\d,.\s-]*) — prefix commodity symbol (£, $, €, etc.)
212
+ # (3) (-?) — optional mid-minus: sign after prefix symbol;
213
+ # combined with group 1 in _parse_amount
214
+ # (4) ([\d.]*(?:,\d*)?(?:[Ee][+-]?\d+)?)
215
+ # — numeric quantity in comma-decimal form:
216
+ # zero or more digits/periods followed by
217
+ # optional comma+decimal digits, then
218
+ # an optional E-notation exponent;
219
+ # periods stripped and comma→period before Decimal
220
+ # (5) ([A-Za-z][A-Za-z0-9]*)? — suffix commodity symbol (EUR, USD, etc.)
221
+ #
222
+ # Edge cases:
223
+ # - "1.234,56" → period=thousands, comma=decimal → Decimal("1234.56")
224
+ # - "100,50" → no thousands separator → Decimal("100.50")
225
+ # - "1.234" → period=thousands, no decimal → Decimal("1234")
226
+ # - "100" → no separators at all → Decimal("100")
227
+ # - "€1.234,56" → prefix "€" + comma-decimal numeric
228
+ # - "€-1.234,56" → prefix "€", mid-minus '-', comma-decimal numeric
229
+ # - "1.234,56 EUR" → suffix "EUR" style
230
+ # - Trailing comma with no fractional digits ('1.234,') is accepted;
231
+ # Python's Decimal('1234.') is valid and equals Decimal('1234')
232
+ # - "1E3 EUR" in comma-decimal mode: no comma/period, exponent appended directly
233
+ _AMOUNT_COMMA = re.compile(
234
+ r"^(-?)"
235
+ r"([^\d,.\s-]*)"
236
+ r"(-?)"
237
+ r"\s*([\d.]*(?:,\d*)?(?:[Ee][+-]?\d+)?)"
238
+ r"\s*([A-Za-z][A-Za-z0-9]*)?"
239
+ r"$"
240
+ )
241
+
242
+ # Matches a P (market price) directive line.
243
+ #
244
+ # Purpose: detect a P directive and capture its three space-delimited components
245
+ # so the handler can extract date, commodity symbol, and price amount.
246
+ #
247
+ # Group breakdown:
248
+ # (1) (\S+) — raw date string; no whitespace; passed to _parse_simple_date
249
+ # (2) (\S+) — COMMODITY1SYMBOL: the commodity being priced, e.g. "€", "$", "AAPL";
250
+ # no whitespace — multi-word commodity names are not valid in this position
251
+ # (3) (.+) — raw COMMODITY2AMOUNT string, e.g. "$1.35", "1.40 USD";
252
+ # may include a trailing inline comment (" ; note") — caller strips
253
+ # with _strip_directive_comment before passing to _parse_amount
254
+ #
255
+ # Edge cases:
256
+ # - "P 2024-01-01 AAPL $179.00 ; note" → group 3 is "$179.00 ; note";
257
+ # _strip_directive_comment removes the trailing " ;" portion
258
+ # - "P 2024-01-01 AAPL" (no price amount) does not match; caller raises ParseError
259
+ # - Lines starting with "P" but not "P " never reach this handler because they
260
+ # are either transaction headers (start with a date digit) or posting lines
261
+ # (indented)
262
+ # - "P" alone (no whitespace) does not match because \s+ requires at least one space
263
# Groups: (1) date token, (2) priced commodity symbol, (3) rest of the line
# (price amount, possibly with a trailing inline comment).
_P_DIRECTIVE = re.compile(r"^P\s+(\S+)\s+(\S+)\s+(.+)$")
264
+
265
+ # Matches an alias directive line.
266
+ #
267
+ # Purpose: detect an alias directive and capture the entire body after
268
+ # "alias " so the handler can determine whether it is a basic
269
+ # alias or a regex alias and parse it accordingly.
270
+ #
271
+ # Group breakdown:
272
+ # (1) (.+) — the raw alias body, e.g. "checking = assets:bank" or
273
+ # "/^(.+):bank/ = \1" or "/old/=new ; note" or "/old/=new # note";
274
+ # inline comments ( ; or #) are stripped by _strip_directive_comment
275
+ # before further parsing
276
+ #
277
+ # Edge cases:
278
+ # - "alias" with no body does not match because \s+ requires at least
279
+ # one space and (.+) requires at least one character after it
280
+ # - "aliases" (plural) does not match because \s+ requires whitespace
281
+ # immediately after the exact word "alias"
282
+ # - Leading whitespace on line: the outer guard (not line[0:1].isspace())
283
+ # prevents indented lines from ever reaching this check
284
# Group 1 is everything after "alias" and its following whitespace, unparsed.
_ALIAS_DIRECTIVE = re.compile(r"^alias\s+(.+)$")
285
+
286
+ # Matches an end aliases directive line (exact keyword, no trailing content).
287
+ #
288
+ # Purpose: detect the "end aliases" directive that clears all currently
289
+ # active alias rules from the parse state. The handler passes the
290
+ # line through _strip_directive_comment first, so trailing "; comment"
291
+ # and "# comment" sequences (with two-or-more-space prefix) are stripped
292
+ # before this regex is applied.
293
+ #
294
+ # Group breakdown:
295
+ # No capture groups — presence of the directive is sufficient.
296
+ #
297
+ # Edge cases:
298
+ # - "end aliases" (two spaces between words) matches because \s+ allows
299
+ # multiple spaces — consistent with hledger's lenient whitespace handling
300
+ # - "end aliases ; comment" or "end aliases # comment": the handler strips
301
+ # the trailing comment via _strip_directive_comment before matching, so
302
+ # both forms are recognised correctly
303
+ # - "end aliasesX" does not match because $ anchors immediately after "aliases"
304
# No capture groups; anchored at both ends, so nothing may follow "aliases".
_END_ALIASES = re.compile(r"^end\s+aliases$")
305
+
306
+
307
+ # ---------------------------------------------------------------------------
308
+ # Private helpers
309
+ # ---------------------------------------------------------------------------
310
+
311
def _parse_simple_date(
    date_str: str, lineno: int, default_year: int
) -> datetime.date:
    """Parse a simple date string into a datetime.date.

    Accepts YYYY-MM-DD, YYYY/MM/DD, YYYY.MM.DD, and year-omitted forms such as
    M/DD or MM-DD. Leading zeros on month and day are optional.

    Args:
        date_str: The raw date token; it must match _SIMPLE_DATE in full.
        lineno: Line number attached to any ParseError raised.
        default_year: Year used when date_str carries no year component.

    Returns:
        The corresponding datetime.date.

    Raises:
        ParseError: if the string does not match the simple date pattern or the
            resulting calendar date is invalid (e.g. month 13). In the latter
            case the underlying ValueError is chained as ``__cause__``.
    """
    m = _SIMPLE_DATE.match(date_str)
    if m is None:
        raise ParseError(f"invalid date {date_str!r}", lineno)
    year_str, month_str, day_str = m.groups()
    # Group 1 is None when the year was omitted.
    year = int(year_str) if year_str else default_year
    try:
        return datetime.date(year, int(month_str), int(day_str))
    except ValueError as exc:
        # Chain explicitly so the traceback shows the calendar error as the
        # direct cause rather than as an unrelated error during handling.
        raise ParseError(f"invalid date {date_str!r}: {exc}", lineno) from exc
333
+
334
+
335
def _parse_txn_header(line: str, lineno: int, default_year: int) -> Transaction:
    """Build a posting-less Transaction from one transaction header line.

    The line is matched against _TXN_HEADER. The date group is split on the
    first '=' into a primary and (optional) secondary date, each parsed by
    _parse_simple_date with the same default_year.

    Raises:
        ParseError: if the line does not match _TXN_HEADER, or if
            _parse_simple_date rejects either date.
    """
    header = _TXN_HEADER.match(line)
    if header is None:
        raise ParseError(f"invalid transaction header: {line!r}", lineno)

    date_raw, flag, code, description, comment = header.groups()

    # partition() returns the whole token as the head when no '=' is present,
    # so the primary date is always the first element.
    primary_raw, equals, secondary_raw = date_raw.partition("=")
    date = _parse_simple_date(primary_raw, lineno, default_year)
    date2 = (
        _parse_simple_date(secondary_raw, lineno, default_year)
        if equals
        else None
    )

    comment_text = comment.strip() if comment else ""
    return Transaction(
        date=date,
        date2=date2,
        description=(description or "").strip(),
        postings=[],
        cleared=(flag == "*"),
        pending=(flag == "!"),
        code=(code or "").strip(),
        comment=comment_text,
        source_line=lineno,
        # Same text as `comment`, but None rather than "" when absent.
        inline_comment=comment_text or None,
    )
363
+
364
+
365
+ def _strip_cost_annotation(raw: str) -> tuple[str, str | None]:
366
+ """Strip a cost annotation (@/@@ PRICE) from a raw amount token.
367
+
368
+ Returns (cleaned_amount, cost_raw) where cost_raw is the text after the
369
+ marker (without the marker itself), or None if absent.
370
+ Checks @@ before @ so the two-char marker takes priority.
371
+ """
372
+ for marker in (" @@ ", " @ "):
373
+ idx = raw.find(marker)
374
+ if idx != -1:
375
+ return raw[:idx].strip(), raw[idx + len(marker):].strip()
376
+ return raw, None
377
+
378
+
379
+ # Matches one lot-annotation token appended to an amount in hledger journal format.
380
+ # Purpose: identify the four recognised annotation syntaxes so they can be removed
381
+ # before the amount regex is applied. hledger supports cost annotations
382
+ # ({AMOUNT}, {{AMOUNT}}), lot dates ([DATE]), and lot labels ((LABEL))
383
+ # appended after the commodity token with leading whitespace.
384
+ #
385
+ # Group breakdown: no capture groups — used only for substitution via re.sub
386
+ # \s+ — one or more whitespace chars separating amount from annotation
387
+ # (?:...) — non-capturing alternation of the four annotation forms:
388
+ # \{\{[^}]*\}\} — double-brace total-cost annotation: {{...}}
389
+ # \{[^}]*\} — single-brace per-unit-cost annotation: {...}
390
+ # \[[^\]]*\] — square-bracket lot date: [DATE]
391
+ # \([^)]*\) — round-bracket lot label: (LABEL)
392
+ #
393
+ # Edge cases:
394
+ # - Double-brace must be listed before single-brace (longer match wins)
395
+ # - Nested brackets are not supported (hledger itself does not allow them)
396
+ # - Multiple annotations in sequence (e.g. "{$1} [2024-01-01]") are all removed
397
+ # because re.sub replaces all non-overlapping matches
398
+ # - An annotation with no preceding whitespace is not matched (requires \s+);
399
+ # such input is malformed and will fail the amount regex anyway
400
+ _LOT_ANNOTATION_RE = re.compile(
401
+ r'\s+(?:\{\{[^}]*\}\}|\{[^}]*\}|\[[^\]]*\]|\([^)]*\))'
402
+ )
403
+
404
+
405
+ def _strip_lot_annotations(raw: str) -> str:
406
+ """Remove all lot-annotation tokens from a raw amount string."""
407
+ return _LOT_ANNOTATION_RE.sub("", raw).strip()
408
+
409
+
410
+ # Matches a space used as a digit-group separator between a digit and a group of
411
+ # exactly three digits (followed by another digit-group boundary or non-digit).
412
+ #
413
+ # Purpose: normalise space-separated amounts like '1 000 000 JPY' → '1000000 JPY'
414
+ # so the main amount regex (which does not allow spaces in the numeric
415
+ # group) can parse them. Applied iteratively in _normalise_space_separators.
416
+ #
417
+ # Group breakdown:
418
+ # (1) (\d) — the digit immediately before the space
419
+ # (2) (\d{3}) — exactly three digits forming the group after the space
420
+ # (?=\s|\D|$) — lookahead: next char is whitespace, non-digit, or end-of-string;
421
+ # prevents collapsing a space before a non-three-digit run
422
+ #
423
+ # Edge cases:
424
+ # - '1 000 000 JPY' → after two iterations → '1000000 JPY' ✓
425
+ # - '1 JPY' → single digit before space, followed by non-digit 'J' → NOT collapsed ✓
426
+ # - '1 00 JPY' → two-digit group → NOT collapsed (requires exactly 3 digits)
427
+ # - Applied iteratively because each pass collapses one separator; two passes
428
+ # handle '1 000 000' → '1000 000' → '1000000'
429
+ _SPACE_DIGIT_GROUP_RE = re.compile(r'(\d) (\d{3})(?=\s|\D|$)')
430
+
431
+
432
+ def _normalise_space_separators(s: str) -> str:
433
+ """Collapse space digit-group separators: '1 000 000 JPY' → '1000000 JPY'."""
434
+ prev = None
435
+ while prev != s:
436
+ prev = s
437
+ s = _SPACE_DIGIT_GROUP_RE.sub(r'\1\2', s)
438
+ return s
439
+
440
+
441
+ # Matches an amount with a quoted commodity suffix, e.g. '-3 "Chocolate Frogs"'.
442
+ #
443
+ # Purpose: handle commodity names that require double quotes because they
444
+ # contain spaces or start with non-letter characters. This branch is
445
+ # checked BEFORE the main _AMOUNT regex, which cannot match quoted names.
446
+ #
447
+ # Group breakdown:
448
+ # (1) (-?) — optional leading minus sign
449
+ # (2) (.*?) — numeric quantity, possibly with prefix symbol (lazy match
450
+ # stops before the quoted suffix); may include the sign if
451
+ # the commodity is a prefix symbol
452
+ # (3) ([^"]+) — quoted commodity name, inner text only; outer quotes consumed
453
+ #
454
+ # Edge cases:
455
+ # - '-3 "Chocolate Frogs"' → sign='-', numeric='3', sym='Chocolate Frogs'
456
+ # - '3 "Foo Bar"' → sign='', numeric='3', sym='Foo Bar'
457
+ # - Quantity with comma/period ('1,000 "AAAA"'): commas stripped in branch
458
+ # - Empty quoted name '""' is NOT matched: [^"]+ requires at least one character,
459
+ # so such input falls through to the main amount regex in _parse_amount
460
+ # - Cost/lot annotations are already stripped before this branch is reached,
461
+ # so "@@ $10" tails will never appear in raw at this point
462
# Groups: (1) optional leading minus, (2) lazily matched text before the
# quoted name, (3) the name inside the double quotes (at least one character).
_QUOTED_SUFFIX_RE = re.compile(r'^(-?)(.*?)\s+"([^"]+)"\s*$')
463
+
464
+
465
def _to_decimal_literal(quantity_str: str, decimal_mark: str) -> str:
    """Rewrite a journal-formatted number into the form Decimal() accepts."""
    if decimal_mark == ",":
        # Period is the digit-group mark; comma is the decimal mark.
        return quantity_str.replace(".", "").replace(",", ".")
    # Comma is the digit-group mark; period is the decimal mark (default).
    return quantity_str.replace(",", "")


def _parse_amount(raw: str, lineno: int, ctx: _ParseContext) -> tuple[Amount, str | None]:
    """Parse a raw amount string into an (Amount, cost_raw) tuple.

    Strips cost annotations (@ PRICE / @@ TOTAL) and lot annotations
    ({...}, {{...}}, [...], (...)) before parsing, then collapses
    space digit-group separators.

    Supports prefix commodity (£30.00), suffix commodity (30.00 EUR),
    double-quoted suffix commodity (3 "Chocolate Frogs"), negative amounts
    (-£5.00, $-300, -30.00 EUR), and digit-group separators.

    When ctx.decimal_mark is "." (default), commas are treated as thousands
    separators and periods as decimal marks (e.g. 1,234.56).
    When ctx.decimal_mark is ",", periods are thousands separators and commas
    are decimal marks (e.g. 1.234,56). This applies to quoted-commodity
    amounts as well.

    Args:
        raw: The amount token, already separated from the account name.
        lineno: Line number attached to any ParseError raised.
        ctx: Parser state; decimal_mark and default_commodity are read.

    Returns:
        (amount, cost_raw) where cost_raw is the stripped cost annotation
        text, or None if the token carried none.

    Raises:
        ParseError: if the token matches no amount pattern, has no commodity
            and ctx.default_commodity is unset, or its quantity is not a
            finite decimal number.
    """
    raw = raw.strip()
    raw, cost_raw = _strip_cost_annotation(raw)
    raw = _strip_lot_annotations(raw)
    raw = _normalise_space_separators(raw)

    # Quoted names cannot be matched by the main regexes, so try them first.
    m_qs = _QUOTED_SUFFIX_RE.match(raw)
    if m_qs:
        sign, numeric_part, quoted_sym = m_qs.groups()
        # Honour the active decimal mark here too; stripping only commas
        # would turn "1.234,56" into 1.23456 in comma-decimal mode.
        numeric_clean = _to_decimal_literal(numeric_part.strip(), ctx.decimal_mark)
        try:
            quantity = Decimal(sign + numeric_clean) if numeric_clean else Decimal(0)
        except InvalidOperation as exc:
            raise ParseError(f"invalid numeric quantity in amount: {raw!r}", lineno) from exc
        # The numeric part is free text in this branch, and Decimal() accepts
        # "NaN" and "Infinity"; neither is a usable quantity.
        if not quantity.is_finite():
            raise ParseError(f"invalid numeric quantity in amount: {raw!r}", lineno)
        return Amount(quantity=quantity, commodity=quoted_sym, raw=raw), cost_raw

    if ctx.decimal_mark == ",":
        m = _AMOUNT_COMMA.match(raw)
    else:
        m = _AMOUNT.match(raw)

    if not m:
        raise ParseError(f"invalid amount: {raw!r}", lineno)

    minus, prefix_sym, mid_minus, quantity_str, suffix_sym = m.groups()
    # Either sign position makes the amount negative; both yield a single "-".
    negative = minus or mid_minus

    # A prefix symbol wins over a suffix symbol when both are present.
    commodity = (prefix_sym or suffix_sym or "").strip()
    if not commodity:
        if ctx.default_commodity:
            commodity = ctx.default_commodity
        else:
            raise ParseError(f"amount has no commodity symbol: {raw!r}", lineno)

    quantity_clean = _to_decimal_literal(quantity_str, ctx.decimal_mark)

    try:
        quantity = Decimal(negative + quantity_clean)
    except InvalidOperation as exc:
        raise ParseError(f"invalid numeric quantity in amount: {raw!r}", lineno) from exc

    return Amount(quantity=quantity, commodity=commodity, raw=raw), cost_raw
526
+
527
+
528
def _strip_directive_comment(raw: str) -> str:
    """Return a directive body with any trailing inline comment removed.

    A comment begins at the first run of two or more whitespace characters
    that is immediately followed by ';' or '#' (the _TWO_SPACE_SEP pattern).
    Everything before that point is returned, stripped of surrounding
    whitespace. With no such separator the whole input is returned stripped.
    """
    separator = _TWO_SPACE_SEP.search(raw)
    body = raw if separator is None else raw[: separator.start()]
    return body.strip()
537
+
538
+
539
+ # Matches a commodity directive body where the symbol is a quoted trailing name,
540
+ # e.g. '1,000. "Chocolate Frogs"' or '0. "My Fund"'.
541
+ #
542
+ # Purpose: detect the trailing-quoted-symbol form BEFORE _COMMODITY_AMOUNT.match
543
+ # is tried, since _COMMODITY_AMOUNT's suffix group cannot match quoted names
544
+ # containing spaces.
545
+ #
546
+ # Group breakdown:
547
+ # (1) ([\d,. ]*) — numeric sample: digits, commas, dots, internal spaces;
548
+ # may be empty if the directive is just a quoted name
549
+ # (2) ([^"]+) — quoted commodity name, inner text only; outer quotes consumed
550
+ #
551
+ # Edge cases:
552
+ # - '1,000. "Chocolate Frogs"' → numeric='1,000. ', sym='Chocolate Frogs'
553
+ # - '"Chocolate Frogs"' → numeric='', sym='Chocolate Frogs'
554
+ # (but this form is caught by the earlier startswith('"') block)
555
+ # - Trailing whitespace after closing quote: absorbed by \s*$
556
+ # - Empty quoted name '""' is accepted (produces empty-string commodity)
557
+ _TRAILING_QUOTED_COMMODITY_RE = re.compile(r'^([\d,. ]*)\s*"([^"]+)"\s*$')
558
+
559
+
560
+ # Matches the numeric-and-optional-suffix part of a commodity sample amount.
561
+ #
562
+ # Purpose: extract the commodity symbol from a commodity directive whose body
563
+ # is an amount token (e.g. "$1,000.00", "1,000.00 EUR", "1000. AAAA").
564
+ # Also handles a bare symbol (e.g. "$", "INR") where no digits follow.
565
+ #
566
+ # Group breakdown:
567
+ # (1) [^\d,.\s"-]* — prefix symbol: any run of chars that are NOT digits,
568
+ # commas, dots, whitespace, quotes, or minus; captures
569
+ # '$', '£', '€', etc. when they lead the token; empty
570
+ # string when the commodity is a suffix token
571
+ # (2) [\d,. ]* — numeric portion: digits, commas, dots, and spaces
572
+ # (thousands-separated amounts can contain internal spaces)
573
+ # (3) \s*([^\d,.\s]*) — suffix symbol: any non-numeric run after the numeric
574
+ # portion, stripped of leading whitespace; captures
575
+ # 'EUR', 'USD', 'AAPL' etc.; empty when prefix symbol
576
+ #
577
+ # Edge cases:
578
+ # - "1000. AAAA" → prefix='' numeric='1000. ' suffix='AAAA'
579
+ # - "$1,000.00" → prefix='$' numeric='1,000.00' suffix=''
580
+ # - "$" → prefix='$' numeric='' suffix=''
581
+ # - "INR" → prefix='' numeric='' suffix='INR' (falls through to bare-symbol path)
582
+ # - '1 000 000.0000' → internal spaces handled by numeric group
583
_COMMODITY_AMOUNT = re.compile(
    # Group 1: prefix symbol (no digits, commas, dots, whitespace, quotes or minus).
    r'^([^\d,.\s"-]*)'
    # Group 2: numeric sample; may contain internal spaces and may be empty.
    r'([\d,. ]*)'
    # Group 3: suffix symbol; unlike group 1 this class does not exclude quotes or minus.
    r'\s*([^\d,.\s]*)'
    r'$'
)
589
+
590
+
591
def _extract_commodity_symbol(raw: str, lineno: int) -> str:
    """Return the commodity symbol named by a commodity directive body.

    Recognised forms, tried in this order:
        - Leading quote:   "AAPL 2023"        → AAPL 2023
                           ""                 → (empty string)
        - Trailing quote:  1,000. "My Fund"   → My Fund
        - Prefix + amount: $1,000.00          → $
        - Suffix + amount: 1,000.00 EUR       → EUR
        - Bare symbol:     INR, $             → INR, $
        - Numeric only:    1000.              → (empty string)

    Raises:
        ParseError: if the body is empty after stripping, if a leading
            quote is never closed, or if the body matches none of the
            forms above.
    """
    body = raw.strip()
    if not body:
        raise ParseError("commodity directive has no symbol", lineno)

    if body.startswith('"'):
        # Everything up to the next quote is the symbol; text after it is ignored.
        symbol, closing_quote, _rest = body[1:].partition('"')
        if not closing_quote:
            raise ParseError(f"commodity directive has unterminated quoted symbol: {body!r}", lineno)
        return symbol

    trailing_quoted = _TRAILING_QUOTED_COMMODITY_RE.match(body)
    if trailing_quoted is not None:
        return trailing_quoted.group(2)

    sample = _COMMODITY_AMOUNT.match(body)
    if sample is None:
        raise ParseError(f"commodity directive: cannot parse symbol from {body!r}", lineno)

    prefix, suffix = sample.group(1), sample.group(3)
    # A prefix symbol wins; with neither symbol present this yields "",
    # the no-symbol commodity.
    return prefix or suffix.strip()
633
+
634
+
635
+ # Matches a balance assertion marker embedded in a posting amount token.
636
+ # Purpose: detect the first occurrence of ==*, ==, =*, or = that is preceded
637
+ # and followed by whitespace, so we can split a posting amount token
638
+ # into "posting amount" and "assertion amount".
639
+ # Group breakdown:
640
+ # (1) ==* | == | =* | = — the assertion marker; alternatives ordered
641
+ # longest-first so ==* is tried before == and =* before =
642
+ # Edge cases:
643
+ # - ==* must precede == in the alternation to avoid consuming only ==
644
+ # - =* must precede = in the alternation to avoid consuming only =
645
+ # - surrounding \s+ prevents matching = inside commodity symbols or
646
+ # numbers (e.g. scientific notation, if ever supported)
647
+ # - a bare = at the start of amount_raw ("= $500") indicates a balance
648
+ # assignment (amount elided); this regex will not match it because
649
+ # there is no leading \s+ before the =
650
# Group 1 is the marker itself; whitespace is required on both sides, so a
# marker at the very start of a stripped amount token is not matched.
_ASSERTION_MARKER_RE = re.compile(r"\s+(==\*|==|=\*|=)\s+")
651
+
652
+
653
# Matches an assertion marker at the very start of the amount token, i.e.
# balance-assignment syntax where the posting amount is elided ("= $500").
# _ASSERTION_MARKER_RE cannot match this form because it requires whitespace
# before the marker and the amount token is stripped before matching.
# Alternatives are ordered longest-first, as in _ASSERTION_MARKER_RE.
_LEADING_ASSERTION_MARKER_RE = re.compile(r"(==\*|==|=\*|=)\s+")


def _parse_posting(line: str, lineno: int, ctx: _ParseContext) -> Posting:
    """Parse a single posting line (already stripped of leading whitespace).

    Splits on the first run of two or more whitespace characters to separate
    account from amount, so account names containing single spaces
    (e.g. "expenses:fun money") are preserved. If no amount token is present
    the posting is elided (amount=None).

    A balance assertion marker (=, ==, =*, ==*) splits the amount token into
    posting amount and assertion amount. When the marker comes first
    ("assets:bank  = $500") the posting amount is elided and only the
    assertion is recorded (balance assignment).

    Args:
        line: The posting line without its leading indentation.
        lineno: Line number recorded on the Posting and on any ParseError.
        ctx: Parser state passed through to _parse_amount; its decimal_mark
            controls how numeric portions are read ("." or ",").

    Returns:
        The parsed Posting.

    Raises:
        ParseError: if the account name is empty, or if _parse_amount rejects
            the posting amount or the assertion amount.
    """
    # maxsplit=1: only the first gap delimits; later double spaces stay in the amount.
    parts = re.split(r"\s{2,}", line, maxsplit=1)
    account = parts[0].strip()

    if not account:
        raise ParseError("posting has no account name", lineno)

    # Separate the inline comment (text after the first ';') from the amount.
    amount_raw = ""
    posting_inline_comment: str | None = None
    if len(parts) > 1:
        amount_part, semicolon, comment_text = parts[1].partition(";")
        if semicolon:
            posting_inline_comment = comment_text.strip() or None
        amount_raw = amount_part.strip()

    if not amount_raw:
        return Posting(account=account, amount=None, source_line=lineno, inline_comment=posting_inline_comment)

    # Look for an assertion marker: first at the start of the token (balance
    # assignment), then anywhere inside it surrounded by whitespace.
    assertion: BalanceAssertion | None = None
    am = _LEADING_ASSERTION_MARKER_RE.match(amount_raw) or _ASSERTION_MARKER_RE.search(amount_raw)
    if am:
        marker = am.group(1)
        posting_amount_raw = amount_raw[: am.start()].strip()
        assertion_amount_raw = amount_raw[am.end():].strip()
        # Any cost annotation on the assertion amount is discarded.
        assertion_amount, _ = _parse_amount(assertion_amount_raw, lineno, ctx)
        assertion = BalanceAssertion(
            amount=assertion_amount,
            inclusive="*" in marker,
            sole_commodity=marker.startswith("=="),
        )
        amount_raw = posting_amount_raw

        if not amount_raw:
            # Posting amount elided (balance assignment syntax) — amount stays None
            return Posting(
                account=account,
                amount=None,
                balance_assertion=assertion,
                source_line=lineno,
                inline_comment=posting_inline_comment,
            )

    posting_amount, cost_raw = _parse_amount(amount_raw, lineno, ctx)
    return Posting(
        account=account,
        amount=posting_amount,
        balance_assertion=assertion,
        cost_raw=cost_raw,
        source_line=lineno,
        inline_comment=posting_inline_comment,
    )
730
+
731
+
732
+ def _parse_alias_body(body: str, lineno: int) -> tuple[str, str, bool]:
733
+ """Parse alias directive body; return (old_or_pattern, replacement, is_regex)."""
734
+ if body.startswith("/"):
735
+ # Regex alias: /PATTERN/ = REPLACEMENT
736
+ # Scan forward to find the closing unescaped '/'.
737
+ i = 1
738
+ while i < len(body):
739
+ if body[i] == "/" and body[i - 1] != "\\":
740
+ break
741
+ i += 1
742
+ if i >= len(body):
743
+ raise ParseError(f"unclosed regex in alias directive: {body!r}", lineno)
744
+ pattern_str = body[1:i].replace("\\/", "/")
745
+ rest = body[i + 1 :].lstrip()
746
+ if not rest.startswith("="):
747
+ raise ParseError(f"missing '=' in alias directive: {body!r}", lineno)
748
+ replacement = rest[1:].lstrip()
749
+ try:
750
+ re.compile(pattern_str, re.IGNORECASE) # validate regex early
751
+ except re.error as exc:
752
+ raise ParseError(f"invalid regex in alias directive: {exc}", lineno)
753
+ return (pattern_str, replacement, True)
754
+ else:
755
+ # Basic alias: OLD = NEW (spaces around '=' are optional)
756
+ if "=" not in body:
757
+ raise ParseError(f"missing '=' in alias directive: {body!r}", lineno)
758
+ idx = body.index("=")
759
+ old = body[:idx].rstrip()
760
+ new = body[idx + 1 :].lstrip()
761
+ if not old:
762
+ raise ParseError(f"empty account name in alias directive: {body!r}", lineno)
763
+ return (old, new, False)
764
+
765
+
766
+ def _apply_aliases(account: str, aliases: list[tuple[str, str, bool]]) -> str:
767
+ """Apply active alias rules to an account name, most-recently-defined first (LIFO).
768
+
769
+ Basic aliases match as an exact name or colon-delimited prefix.
770
+ Regex aliases use re.sub with IGNORECASE and support backreferences.
771
+ """
772
+ for old, new, is_regex in reversed(aliases):
773
+ if is_regex:
774
+ # Purpose: substitute matching substring in account name.
775
+ # re.IGNORECASE per hledger spec ("REGEX is case-insensitive as usual").
776
+ # Backreferences in `new` (e.g. \1) are supported by re.sub natively.
777
+ account = re.sub(old, new, account, flags=re.IGNORECASE)
778
+ else:
779
+ # Basic alias: replace OLD as exact match or as colon-delimited prefix.
780
+ # "checking" rewrites "checking" → new and "checking:a" → new + ":a"
781
+ # but NOT "other:checking" (prefix boundary enforced by + ":").
782
+ if account == old:
783
+ account = new
784
+ elif account.startswith(old + ":"):
785
+ account = new + account[len(old):]
786
+ return account
787
+
788
+
789
+ # ---------------------------------------------------------------------------
790
+ # Public API
791
+ # ---------------------------------------------------------------------------
792
+
793
def resolve_elision(txn: Transaction) -> list[Posting]:
    """Return txn's postings with a single elided amount filled in.

    A posting is "elided" when its ``amount`` is None.  The result is always
    a new list; ``txn.postings`` itself is not modified.

    Behaviour by case:
      - No elided posting, or two or more: a plain copy of txn.postings.
      - Exactly one elided posting but no explicit amounts at all: a plain
        copy of txn.postings.
      - Exactly one elided posting otherwise: that posting is replaced, in
        position, by one inferred Posting per commodity found in the explicit
        postings (ordered by commodity symbol).  Each inferred amount is the
        negation of that commodity's summed quantity.

    Inferred postings reuse the elided posting's account and source_line and
    are created with inferred=True.

    Args:
        txn: A Transaction, parsed or constructed programmatically.

    Returns:
        List of Posting objects representing the resolved transaction.
    """
    postings = list(txn.postings)

    gaps = [pos for pos, entry in enumerate(postings) if entry.amount is None]
    if len(gaps) != 1:
        return postings
    (gap,) = gaps
    placeholder = postings[gap]

    # Net quantity per commodity across the postings that carry an amount.
    totals: dict[str, Decimal] = {}
    for entry in postings:
        amt = entry.amount
        if amt is not None:
            totals[amt.commodity] = totals.get(amt.commodity, Decimal(0)) + amt.quantity

    if not totals:
        return postings

    # One balancing posting per commodity; sorting keeps output deterministic.
    inferred_postings = []
    for symbol, net in sorted(totals.items()):
        inferred_postings.append(
            Posting(
                account=placeholder.account,
                amount=Amount(-net, symbol),
                source_line=placeholder.source_line,
                inferred=True,
            )
        )

    return postings[:gap] + inferred_postings + postings[gap + 1:]
847
+
848
+
849
def _flush_txn(
    txn: Transaction,
    end: int,
    all_lines: list[str],
    source_file: str,
) -> None:
    """Record where a finished transaction came from and its verbatim text.

    Sets ``txn.source_span`` to the 1-based line range from the transaction's
    ``source_line`` (or line 1 when that is unset/falsy) through ``end``, and
    sets ``txn.raw_text`` to those lines of ``all_lines`` joined with
    newlines plus a trailing newline.  The transaction is mutated in place.
    """
    first_line = txn.source_line or 1
    txn.source_span = SourceSpan(
        file=source_file,
        start_line=first_line,
        end_line=end,
    )
    # Line numbers are 1-based and `end` is inclusive, hence the -1 offset.
    covered = all_lines[first_line - 1 : end]
    txn.raw_text = "\n".join(covered) + "\n"
859
+
860
+
861
+ def _parse_string_impl(
862
+ text: str,
863
+ default_year: int,
864
+ errors_out: list[ParseError] | None,
865
+ source_file: str = "(string)",
866
+ ) -> Journal:
867
+ """Shared body for parse_string and parse_string_lenient.
868
+
869
+ When errors_out is None, raises ParseError on the first malformed line.
870
+ When errors_out is a list, appends errors and continues parsing; malformed
871
+ transactions are discarded and parsing resumes at the next boundary.
872
+ """
873
+ all_lines = text.splitlines()
874
+ transactions: list[Transaction] = []
875
+ prices: list[PriceDirective] = []
876
+ declared_accounts: list[str] = []
877
+ declared_commodities: list[str] = []
878
+ declared_payees: list[str] = []
879
+ declared_tags: list[str] = []
880
+ commodity_directive_raws: dict = {} # symbol → raw amount string from directive
881
+ aliases: list[tuple[str, str, bool]] = [] # (old_or_pattern, replacement, is_regex)
882
+ ctx = _ParseContext(default_year=default_year, decimal_mark=".")
883
+ current_txn: Transaction | None = None
884
+ current_txn_last_lineno: int | None = None # tracks end_line for source_span
885
+ last_posting_in_txn: Posting | None = None # for standalone comment attribution
886
+ in_block_comment = False
887
+ in_subdirective = False # True while consuming indented subdirective lines
888
+ skip_until_blank = False # lenient mode: True while skipping a malformed transaction
889
+
890
+ for lineno, raw in enumerate(all_lines, start=1):
891
+ line = raw.rstrip()
892
+
893
+ # --- Block comment mode ---
894
+ #
895
+ # Purpose: skip every line between a `comment` directive and its
896
+ # matching `end comment` directive (or EOF).
897
+ #
898
+ # `comment` and `end comment` are non-indented directives. The
899
+ # `comment` keyword may be followed by an inline comment but must be
900
+ # the first word on the line. `end comment` is matched after stripping
901
+ # any surrounding whitespace to be lenient about trailing spaces.
902
+ #
903
+ # Edge cases:
904
+ # - A `comment` block that reaches EOF without `end comment` is
905
+ # silently accepted; the parser simply consumes the rest of the file
906
+ # - A `comment` directive encountered mid-transaction flushes the open
907
+ # transaction first, then enters block-comment mode
908
+ # - `end comment` outside a block comment falls through to the
909
+ # "silently skip" branch (no error raised)
910
+ # - Nested `comment` directives inside a block comment are ignored
911
+ if in_block_comment:
912
+ if line.strip() == "end comment":
913
+ in_block_comment = False
914
+ continue
915
+
916
+ # --- Lenient mode: skip remainder of a malformed transaction ---
917
+ #
918
+ # When errors_out is not None and a ParseError occurs mid-transaction,
919
+ # skip_until_blank is set True and current_txn is cleared. Subsequent
920
+ # lines are discarded until a blank line or a new transaction header is
921
+ # encountered, at which point normal parsing resumes.
922
+ if skip_until_blank:
923
+ if not line.strip():
924
+ skip_until_blank = False
925
+ in_subdirective = False
926
+ continue
927
+ if not re.match(r"^(?:\d{4}[-/.])?(?:\d{1,2})[-/.](?:\d{1,2})(?=[\s*!(=]|$)", line):
928
+ continue
929
+ skip_until_blank = False
930
+ # Fall through: treat this line as a new transaction header.
931
+
932
+ # --- Blank line: end the current block ---
933
+ if not line.strip():
934
+ if current_txn is not None:
935
+ end = current_txn_last_lineno or (current_txn.source_line or 1)
936
+ _flush_txn(current_txn, end, all_lines, source_file)
937
+ transactions.append(current_txn)
938
+ current_txn = None
939
+ current_txn_last_lineno = None
940
+ last_posting_in_txn = None
941
+ in_subdirective = False
942
+ continue
943
+
944
+ # --- Comment-only line (whole-line or indented follow-on `;` / `#`) ---
945
+ #
946
+ # Two distinct sub-cases share this branch:
947
+ #
948
+ # 1. Column-0 (non-indented) `#` or `;` — a standalone top-level comment.
949
+ # These are ALWAYS silently skipped regardless of whether a transaction
950
+ # block is open. They never contribute to a transaction's comment fields
951
+ # and never extend its source_span.
952
+ #
953
+ # 2. Indented (leading whitespace) `#` or `;` inside an open transaction —
954
+ # a follow-on comment line. `;`-led lines are attached to the preceding
955
+ # posting's inline_comment (or to the transaction's inline_comment if no
956
+ # posting has been seen yet in this block). `#`-led lines update the span
957
+ # but do not attach their text.
958
+ #
959
+ # The `is_indented` flag disambiguates the two sub-cases; `lstrip()` is
960
+ # applied only for the startswith test, not for the indentation check.
961
+ #
962
+ # Edge cases:
963
+ # - A column-0 `;` with no blank line between two transactions is a
964
+ # top-level comment; it must NOT be captured as a follow-on comment
965
+ # on the previous transaction's last posting.
966
+ # - An empty `;` (nothing after the semicolon) sets inline_comment to
967
+ # None, not to the empty string.
968
+ is_indented = line[0:1].isspace()
969
+ stripped = line.lstrip()
970
+ if stripped.startswith(";") or stripped.startswith("#"):
971
+ if current_txn is not None and is_indented:
972
+ current_txn_last_lineno = lineno
973
+ if stripped.startswith(";"):
974
+ comment_text = stripped[1:].strip()
975
+ if last_posting_in_txn is None:
976
+ if current_txn.inline_comment:
977
+ current_txn.inline_comment += "\n" + comment_text
978
+ else:
979
+ current_txn.inline_comment = comment_text or None
980
+ else:
981
+ if last_posting_in_txn.inline_comment:
982
+ last_posting_in_txn.inline_comment += "\n" + comment_text
983
+ else:
984
+ last_posting_in_txn.inline_comment = comment_text or None
985
+ continue
986
+
987
+ # --- ~ (periodic transaction rule) ---
988
+ #
989
+ # Purpose: recognise a periodic transaction rule header and skip its
990
+ # posting lines without raising ParseError. The rule and its
991
+ # postings are not stored; --forecast expansion is out of scope
992
+ # for v1. A ParseWarning is appended in lenient mode (not a
993
+ # hard error).
994
+ #
995
+ # Group breakdown: no capture groups — match is a boolean gate only.
996
+ #
997
+ # Edge cases:
998
+ # - "~ monthly budget goals" — leading ~ is sufficient; rest ignored
999
+ # - If a transaction is currently open, flush it first (malformed input)
1000
+ # - skip_until_blank consumes indented postings until next blank line
1001
+ if not line[0:1].isspace() and line.startswith("~"):
1002
+ if current_txn is not None:
1003
+ end = current_txn_last_lineno or (current_txn.source_line or 1)
1004
+ _flush_txn(current_txn, end, all_lines, source_file)
1005
+ transactions.append(current_txn)
1006
+ current_txn = None
1007
+ current_txn_last_lineno = None
1008
+ last_posting_in_txn = None
1009
+ if errors_out is not None:
1010
+ errors_out.append(ParseWarning(
1011
+ "periodic transaction rule (~) skipped (not supported in v1)",
1012
+ lineno,
1013
+ ))
1014
+ skip_until_blank = True
1015
+ continue
1016
+
1017
+ # --- = (auto-posting rule) ---
1018
+ #
1019
+ # Purpose: recognise an auto-posting rule header and skip its posting
1020
+ # lines. Only the "= QUERY" form (space after =) is matched;
1021
+ # bare "=account" is not valid hledger auto-posting syntax.
1022
+ # A ParseWarning is appended in lenient mode (not a hard error).
1023
+ #
1024
+ # Group breakdown: no capture groups — match is a boolean gate only.
1025
+ #
1026
+ # Edge cases:
1027
+ # - "= expenses:food" matches; "=expenses:food" (no space) does NOT
1028
+ # - Balance-assignment postings begin with "=" but are indented; the
1029
+ # indentation guard (`not line[0:1].isspace()`) distinguishes them
1030
+ if not line[0:1].isspace() and re.match(r"^=\s+\S", line):
1031
+ if current_txn is not None:
1032
+ end = current_txn_last_lineno or (current_txn.source_line or 1)
1033
+ _flush_txn(current_txn, end, all_lines, source_file)
1034
+ transactions.append(current_txn)
1035
+ current_txn = None
1036
+ current_txn_last_lineno = None
1037
+ last_posting_in_txn = None
1038
+ if errors_out is not None:
1039
+ errors_out.append(ParseWarning(
1040
+ "auto-posting rule (=) skipped (not supported in v1)",
1041
+ lineno,
1042
+ ))
1043
+ skip_until_blank = True
1044
+ continue
1045
+
1046
+ # --- Transaction header (non-indented line starting with a simple date) ---
1047
+ #
1048
+ # Purpose: quickly determine whether a non-indented line opens a new
1049
+ # transaction block before handing off to _parse_txn_header.
1050
+ # This check runs before posting detection so that a date-like
1051
+ # token at column 0 is always treated as a new header, never as
1052
+ # an un-indented posting inside an open block.
1053
+ # Pattern: ^(?:\d{4}[-/.])?(?:\d{1,2})[-/.](?:\d{1,2})(?=[\s*!(=]|$)
1054
+ # ^ — anchored to start of the rstripped line
1055
+ # (?:\d{4}[-/.])? — optional four-digit year + separator
1056
+ # (?:\d{1,2})[-/.] — month (1–2 digits) + separator
1057
+ # (?:\d{1,2}) — day (1–2 digits)
1058
+ # (?=[\s*!(=]|$) — lookahead: must be followed by whitespace,
1059
+ # a status flag, the start of a code, an '='
1060
+ # introducing a secondary date, or end-of-line;
1061
+ # prevents matching bare numeric expressions
1062
+ # Edge cases:
1063
+ # - "2024-13-45 Bad" passes this check but fails in _parse_simple_date
1064
+ # when datetime.date() rejects the invalid calendar values
1065
+ # - "1.5" without a trailing space/flag does NOT match (lookahead fails),
1066
+ # preventing accidental collision with decimal amounts on directive lines
1067
+ # - "2024-02-20=2024-02-22" passes because '=' is in the lookahead set;
1068
+ # the full _TXN_HEADER regex validates the secondary date format
1069
+ if re.match(r"^(?:\d{4}[-/.])?(?:\d{1,2})[-/.](?:\d{1,2})(?=[\s*!(=]|$)", line):
1070
+ if current_txn is not None:
1071
+ # No blank line between transactions — flush previous block
1072
+ end = current_txn_last_lineno or (current_txn.source_line or 1)
1073
+ _flush_txn(current_txn, end, all_lines, source_file)
1074
+ transactions.append(current_txn)
1075
+ in_subdirective = False
1076
+ current_txn_last_lineno = None
1077
+ last_posting_in_txn = None
1078
+ try:
1079
+ current_txn = _parse_txn_header(line, lineno, default_year)
1080
+ current_txn_last_lineno = lineno
1081
+ except ParseError as _err:
1082
+ if errors_out is None:
1083
+ raise
1084
+ errors_out.append(_err)
1085
+ current_txn = None
1086
+ skip_until_blank = True
1087
+ continue
1088
+
1089
+ # --- Block comment start (`comment` directive) ---
1090
+ #
1091
+ # A non-indented line whose first whitespace-delimited token is exactly
1092
+ # "comment" opens a block comment. Anything after "comment" on the same
1093
+ # line is ignored (treated as inline commentary on the directive itself).
1094
+ # Any open transaction is flushed before entering block-comment mode so
1095
+ # that a `comment` block sitting between transactions is parsed cleanly.
1096
+ if not line[0:1].isspace() and line.split()[0] == "comment":
1097
+ if current_txn is not None:
1098
+ transactions.append(current_txn)
1099
+ current_txn = None
1100
+ in_subdirective = False
1101
+ in_block_comment = True
1102
+ continue
1103
+
1104
+ # --- Subdirective lines (indented lines following account/commodity/payee) ---
1105
+ #
1106
+ # Purpose: consume Ledger-style indented subdirectives (e.g. "format …"
1107
+ # below a commodity directive) without treating them as postings
1108
+ # or raising ParseError. in_subdirective is set to True whenever
1109
+ # we finish processing an account/commodity/payee directive line;
1110
+ # it is cleared on the next non-indented, non-blank line.
1111
+ #
1112
+ # Edge cases:
1113
+ # - Blank lines above already clear in_subdirective via the blank-line
1114
+ # branch (which sets current_txn=None and falls through; the next
1115
+ # non-blank line will hit this check with in_subdirective still True
1116
+ # only if there was no blank line — so blank lines naturally end the
1117
+ # subdirective block)
1118
+ # - An indented subdirective line that contains a valid posting syntax
1119
+ # is still skipped here (subdirective wins); the containing file is
1120
+ # expected to be well-formed per hledger conventions
1121
+ if in_subdirective:
1122
+ if line[0:1].isspace():
1123
+ continue # consume indented subdirective silently
1124
+ in_subdirective = False
1125
+ # fall through to process this non-indented line normally
1126
+
1127
+ # --- account directive ---
1128
+ #
1129
+ # Purpose: record a declared account name for strict-mode checking.
1130
+ # The account name follows the keyword and may contain spaces
1131
+ # and ';' characters; an inline comment is delimited by the
1132
+ # first occurrence of two-or-more spaces followed by ';'.
1133
+ # Indented lines that follow (Ledger-style subdirectives) are
1134
+ # consumed silently via the in_subdirective flag.
1135
+ #
1136
+ # Edge cases:
1137
+ # - "account a:b;c" (single space before ';') → name is "a:b;c"
1138
+ # - "account a:b ; note" → name is "a:b"
1139
+ # - "accounts" does not match because \s+ requires whitespace after
1140
+ # the exact word "account"
1141
+ if not line[0:1].isspace() and re.match(r"^account\s+", line):
1142
+ body = line[len("account"):].lstrip()
1143
+ account_name = _strip_directive_comment(body)
1144
+ if ctx.account_prefix:
1145
+ account_name = f"{ctx.account_prefix}:{account_name}"
1146
+ if account_name:
1147
+ declared_accounts.append(_apply_aliases(account_name, aliases))
1148
+ in_subdirective = True
1149
+ continue
1150
+
1151
+ # --- commodity directive ---
1152
+ #
1153
+ # Purpose: record a declared commodity symbol for strict-mode checking.
1154
+ # Supports all hledger commodity directive forms: sample amount
1155
+ # with prefix symbol ($1,000.00), sample amount with suffix
1156
+ # symbol (1,000.00 EUR), bare symbol ($, INR), quoted symbol
1157
+ # ("AAPL 2023"), empty-quoted no-symbol (""), and numeric-only
1158
+ # (1000.) for format declarations.
1159
+ #
1160
+ # Edge cases:
1161
+ # - "commodity" with no body raises ParseError (empty symbol)
1162
+ # - Indented "format" subdirectives are consumed via in_subdirective
1163
+ # - The same symbol may be declared more than once; deduplication is
1164
+ # done at check time, not parse time
1165
+ if not line[0:1].isspace() and re.match(r"^commodity(\s|$)", line):
1166
+ rest = line[len("commodity"):].strip()
1167
+ body = _strip_directive_comment(rest)
1168
+ symbol = _extract_commodity_symbol(body, lineno)
1169
+ declared_commodities.append(symbol)
1170
+ # Store the raw directive body for style inference if it looks like
1171
+ # a sample amount (contains at least one digit).
1172
+ if symbol and any(ch.isdigit() for ch in body):
1173
+ commodity_directive_raws[symbol] = body
1174
+ in_subdirective = True
1175
+ continue
1176
+
1177
+ # --- payee directive ---
1178
+ #
1179
+ # Purpose: record a declared payee name for strict-mode / payee checking.
1180
+ # The payee name follows the keyword; inline comments are stripped
1181
+ # with the 2-space rule. Quoted empty-string payee ("") is stored
1182
+ # as the empty string. Tags in comments are ignored per the spec.
1183
+ #
1184
+ # Edge cases:
1185
+ # - 'payee ""' → stored as "" (the no-payee sentinel)
1186
+ # - "payee Whole Foods ; comment" → "Whole Foods"
1187
+ # - Indented Ledger-style subdirectives are consumed silently
1188
+ if not line[0:1].isspace() and re.match(r"^payee\s+", line):
1189
+ body = line[len("payee"):].lstrip()
1190
+ payee_name = _strip_directive_comment(body)
1191
+ # Unquote a quoted payee name (e.g. payee "" or payee "Smith & Co")
1192
+ if payee_name.startswith('"') and payee_name.endswith('"'):
1193
+ payee_name = payee_name[1:-1]
1194
+ declared_payees.append(payee_name)
1195
+ in_subdirective = True
1196
+ continue
1197
+
1198
+ # --- tag directive ---
1199
+ #
1200
+ # Purpose: record a declared tag name for strict-mode / tag checking.
1201
+ # TAGNAME follows the keyword with no spaces. Inline comments
1202
+ # are stripped using the 2-space separator rule. Indented
1203
+ # subdirectives are consumed silently via in_subdirective.
1204
+ #
1205
+ # Edge cases:
1206
+ # - "tag item-id ; note" → tag name is "item-id"
1207
+ # - "tags" does not match because \s+ requires whitespace after
1208
+ # the exact word "tag"
1209
+ if not line[0:1].isspace() and re.match(r"^tag\s+", line):
1210
+ body = line[len("tag"):].lstrip()
1211
+ tag_name = _strip_directive_comment(body)
1212
+ if tag_name:
1213
+ declared_tags.append(tag_name)
1214
+ in_subdirective = True
1215
+ continue
1216
+
1217
+ # --- decimal-mark directive ---
1218
+ #
1219
+ # Purpose: declare which character is the decimal mark for amount
1220
+ # parsing in this file. Affects all postings from this point
1221
+ # forward (typically placed at the top of file).
1222
+ # Only "." (default) and "," are valid values.
1223
+ #
1224
+ # Edge cases:
1225
+ # - "decimal-mark ." → default; no change to behaviour
1226
+ # - "decimal-mark ," → commas are decimal marks; periods are
1227
+ # digit-group marks (e.g. 1.234,56 = 1234.56)
1228
+ # - Any other value raises ParseError
1229
+ # - No subdirectives are defined for decimal-mark; in_subdirective
1230
+ # is NOT set (nothing to consume)
1231
+ if not line[0:1].isspace() and re.match(r"^decimal-mark(\s|$)", line):
1232
+ rest = line[len("decimal-mark"):].strip()
1233
+ dm = _strip_directive_comment(rest)
1234
+ if dm not in (".", ","):
1235
+ _err = ParseError(
1236
+ f"decimal-mark must be '.' or ',',"
1237
+ f" got {dm!r}",
1238
+ lineno,
1239
+ )
1240
+ if errors_out is None:
1241
+ raise _err
1242
+ errors_out.append(_err)
1243
+ continue
1244
+ ctx.decimal_mark = dm
1245
+ continue
1246
+
1247
+ # --- P directive ---
1248
+ #
1249
+ # Purpose: record a market price declaration (commodity conversion rate
1250
+ # on a date). Stored in `prices` for later use by valuation
1251
+ # reports. No subdirectives are defined for P; in_subdirective
1252
+ # is NOT set.
1253
+ #
1254
+ # Edge cases:
1255
+ # - "P DATE COMMODITY PRICE ; comment" → comment stripped before parse
1256
+ # - A P directive encountered while a transaction is open does NOT close
1257
+ # the transaction (consistent with other directive handlers); P inside
1258
+ # a transaction block is malformed but handled leniently
1259
+ # - A "P " line that fails the full regex (missing commodity or amount)
1260
+ # raises ParseError immediately
1261
+ if not line[0:1].isspace() and line.startswith("P "):
1262
+ m_p = _P_DIRECTIVE.match(line)
1263
+ if not m_p:
1264
+ _err = ParseError(f"invalid P directive: {line!r}", lineno)
1265
+ if errors_out is None:
1266
+ raise _err
1267
+ errors_out.append(_err)
1268
+ continue
1269
+ date_str, commodity1, amount_raw = m_p.groups()
1270
+ amount_clean = _strip_directive_comment(amount_raw)
1271
+ try:
1272
+ p_date = _parse_simple_date(date_str, lineno, default_year)
1273
+ p_price, _ = _parse_amount(amount_clean, lineno, ctx)
1274
+ except ParseError as _err:
1275
+ if errors_out is None:
1276
+ raise
1277
+ errors_out.append(_err)
1278
+ continue
1279
+ prices.append(PriceDirective(date=p_date, commodity=commodity1, price=p_price))
1280
+ continue
1281
+
1282
+ # --- alias directive ---
1283
+ #
1284
+ # Purpose: register an account-name alias rule. Rules accumulate in `aliases`
1285
+ # and are applied to every posting account name parsed after this point.
1286
+ # Basic aliases match as an exact name or colon-delimited prefix;
1287
+ # regex aliases match any substring (case-insensitive, backrefs supported).
1288
+ # Aliases are also applied to account names in `account` directives.
1289
+ # No subdirectives; in_subdirective is NOT set.
1290
+ #
1291
+ # Edge cases:
1292
+ # - "alias /invalid[/ = x" → ParseError (invalid regex detected early)
1293
+ # - "alias = new" (empty OLD) → ParseError
1294
+ # - "alias old = new ; comment" or "alias old = new # comment"
1295
+ # → comment stripped by _strip_directive_comment before body parse
1296
+ # - Rules accumulate; use "end aliases" to clear all
1297
+ if not line[0:1].isspace() and _ALIAS_DIRECTIVE.match(line):
1298
+ m_alias = _ALIAS_DIRECTIVE.match(line)
1299
+ body = _strip_directive_comment(m_alias.group(1))
1300
+ try:
1301
+ old, new, is_regex = _parse_alias_body(body, lineno)
1302
+ except ParseError as _err:
1303
+ if errors_out is None:
1304
+ raise
1305
+ errors_out.append(_err)
1306
+ continue
1307
+ aliases.append((old, new, is_regex))
1308
+ continue
1309
+
1310
+ # --- end aliases directive ---
1311
+ #
1312
+ # Purpose: clear all active alias rules so postings after this line are
1313
+ # not rewritten. A no-op if no aliases are currently active.
1314
+ # _strip_directive_comment is applied first so that trailing
1315
+ # "; comment" or "# comment" (with two-space separation) are
1316
+ # stripped before the keyword is matched.
1317
+ #
1318
+ # Edge cases:
1319
+ # - "end aliases ; done" or "end aliases # done" → comment stripped,
1320
+ # directive recognised correctly
1321
+ # - "end aliases" with no active aliases: silently clears empty list
1322
+ # - Inside a block comment: skipped by the in_block_comment guard above
1323
+ if not line[0:1].isspace() and _END_ALIASES.match(_strip_directive_comment(line)):
1324
+ aliases.clear()
1325
+ continue
1326
+
1327
+ # --- Y directive (default year) ---
1328
+ #
1329
+ # Purpose: set the default year for all year-omitted dates in this file,
1330
+ # overriding the default_year parameter passed to parse_string.
1331
+ # All yearless dates parsed AFTER this directive use the declared
1332
+ # year. Multiple Y directives are allowed; last one wins.
1333
+ #
1334
+ # Group breakdown: no capture groups — year extracted from directive body.
1335
+ #
1336
+ # Edge cases:
1337
+ # - "Y 2024 ; comment" → comment stripped, year = 2024
1338
+ # - "Y 20xx" (non-integer body) → ParseError in lenient; raises in strict
1339
+ if not line[0:1].isspace() and re.match(r"^Y\s", line):
1340
+ body = _strip_directive_comment(line[len("Y"):].strip())
1341
+ try:
1342
+ default_year = int(body)
1343
+ except ValueError:
1344
+ _err = ParseError(f"invalid Y directive year: {body!r}", lineno)
1345
+ if errors_out is None:
1346
+ raise _err
1347
+ errors_out.append(_err)
1348
+ continue
1349
+
1350
+ # --- D directive (default commodity + style) ---
1351
+ #
1352
+ # Purpose: declare a default commodity symbol and display style for
1353
+ # amounts that carry no explicit symbol. e.g. "D $1,000.00"
1354
+ # sets ctx.default_commodity = "$". The raw sample amount is
1355
+ # stored in commodity_directive_raws for style inference, just
1356
+ # like a commodity directive.
1357
+ #
1358
+ # Group breakdown: no capture groups — body extracted from directive text.
1359
+ #
1360
+ # Edge cases:
1361
+ # - "D $1,000.00 ; comment" → comment stripped before symbol extraction
1362
+ # - "D EUR" (bare symbol, no numeric sample) → default_commodity = "EUR"
1363
+ # - Invalid or unparseable body → ParseError in lenient; raises in strict
1364
+ if not line[0:1].isspace() and re.match(r"^D\s", line):
1365
+ body = _strip_directive_comment(line[len("D"):].strip())
1366
+ try:
1367
+ sym = _extract_commodity_symbol(body, lineno)
1368
+ except ParseError as _err:
1369
+ if errors_out is None:
1370
+ raise
1371
+ errors_out.append(_err)
1372
+ continue
1373
+ ctx.default_commodity = sym
1374
+ if body and sym not in commodity_directive_raws:
1375
+ commodity_directive_raws[sym] = body
1376
+ continue
1377
+
1378
+ # --- apply account / end apply account directives ---
1379
+ #
1380
+ # Purpose: prepend PREFIX: to every account name in postings and account
1381
+ # directives encountered inside this block. Mirrors hledger's
1382
+ # "apply account" / "end apply account" syntax. Aliases are
1383
+ # applied to the base account name BEFORE the prefix is prepended.
1384
+ #
1385
+ # Group breakdown: no capture groups — prefix extracted from directive body.
1386
+ #
1387
+ # Edge cases:
1388
+ # - Nested apply account: second directive replaces first; ParseWarning emitted in lenient mode
1389
+ # - "end apply account" with no active prefix: silently ignored
1390
+ # - Empty body "apply account ; comment" → prefix set to None
1391
+ if not line[0:1].isspace() and re.match(r"^apply\s+account\s+", line):
1392
+ body = _strip_directive_comment(line[len("apply account"):].strip())
1393
+ if ctx.account_prefix is not None and errors_out is not None:
1394
+ errors_out.append(ParseWarning(
1395
+ "nested apply account: previous prefix replaced (nesting not supported)",
1396
+ lineno,
1397
+ ))
1398
+ ctx.account_prefix = body or None
1399
+ continue
1400
+
1401
+ if not line[0:1].isspace() and re.match(r"^end\s+apply\s+account\b", line):
1402
+ ctx.account_prefix = None
1403
+ continue
1404
+
1405
+ # --- Posting line ---
1406
+ #
1407
+ # Posting lines are conventionally written with 2+ leading spaces or a
1408
+ # tab, but indentation is not strictly required inside an open block.
1409
+ # Any line inside an open transaction block that has not been matched by
1410
+ # the blank-line, comment, transaction-header, or directive branches
1411
+ # above is treated as a posting.
1412
+ #
1413
+ # Indented lines (any leading space or tab) outside a transaction block still
1414
+ # raise ParseError — indentation unambiguously signals "this is a
1415
+ # posting", so encountering one with no open block is always an error.
1416
+ # Non-indented lines outside a block are silently skipped (directives,
1417
+ # stray text, etc.).
1418
+ if current_txn is None and (line.startswith(" ") or line.startswith("\t")):
1419
+ _err = ParseError("posting found outside a transaction block", lineno)
1420
+ if errors_out is None:
1421
+ raise _err
1422
+ errors_out.append(_err)
1423
+ skip_until_blank = True
1424
+ continue
1425
+ if current_txn is not None:
1426
+ try:
1427
+ posting = _parse_posting(stripped, lineno, ctx)
1428
+ except ParseError as _err:
1429
+ if errors_out is None:
1430
+ raise
1431
+ errors_out.append(_err)
1432
+ current_txn = None
1433
+ current_txn_last_lineno = None
1434
+ last_posting_in_txn = None
1435
+ skip_until_blank = True
1436
+ continue
1437
+ if aliases:
1438
+ posting = Posting(
1439
+ account=_apply_aliases(posting.account, aliases),
1440
+ amount=posting.amount,
1441
+ balance_assertion=posting.balance_assertion,
1442
+ cost_raw=posting.cost_raw,
1443
+ source_line=posting.source_line,
1444
+ inline_comment=posting.inline_comment,
1445
+ )
1446
+ if ctx.account_prefix:
1447
+ posting = Posting(
1448
+ account=f"{ctx.account_prefix}:{posting.account}",
1449
+ amount=posting.amount,
1450
+ balance_assertion=posting.balance_assertion,
1451
+ cost_raw=posting.cost_raw,
1452
+ source_line=posting.source_line,
1453
+ inline_comment=posting.inline_comment,
1454
+ )
1455
+ # Enforce at-most-one elided amount per block
1456
+ if posting.amount is None:
1457
+ elided = [p for p in current_txn.postings if p.amount is None]
1458
+ if elided:
1459
+ _err = ParseError(
1460
+ "a transaction block may have at most one elided amount", lineno
1461
+ )
1462
+ if errors_out is None:
1463
+ raise _err
1464
+ errors_out.append(_err)
1465
+ current_txn = None
1466
+ current_txn_last_lineno = None
1467
+ last_posting_in_txn = None
1468
+ skip_until_blank = True
1469
+ continue
1470
+ current_txn.postings.append(posting)
1471
+ current_txn_last_lineno = lineno
1472
+ last_posting_in_txn = posting
1473
+ continue
1474
+
1475
+ # --- Any other line outside a transaction block: silently skip ---
1476
+
1477
+ # Flush final block if file ends without a trailing blank line
1478
+ if current_txn is not None:
1479
+ end = current_txn_last_lineno or (current_txn.source_line or 1)
1480
+ _flush_txn(current_txn, end, all_lines, source_file)
1481
+ transactions.append(current_txn)
1482
+
1483
+ return Journal(
1484
+ transactions=transactions,
1485
+ prices=prices,
1486
+ declared_accounts=declared_accounts,
1487
+ declared_commodities=declared_commodities,
1488
+ declared_payees=declared_payees,
1489
+ declared_tags=declared_tags,
1490
+ _commodity_directive_raws=commodity_directive_raws,
1491
+ )
1492
+
1493
+
1494
def parse_string(
    text: str,
    default_year: int | None = None,
    source_file: str = "(string)",
) -> Journal:
    """Strictly parse journal text and return the resulting Journal.

    Dates may be written as YYYY-MM-DD, YYYY/MM/DD or YYYY.MM.DD, or with the
    year left out (for example M/DD or MM-DD). Month and day may be written
    with or without a leading zero.

    Args:
        text: Raw journal text.
        default_year: Year applied to dates written without one. When None,
            the current calendar year is used.
        source_file: Name recorded in every Transaction.source_span.file.
            Direct callers get "(string)" by default; loader.py passes the
            resolved absolute path.

    Returns:
        The parsed Journal.

    Raises:
        ParseError: if the input is not valid hledger journal syntax.
    """
    year = datetime.date.today().year if default_year is None else default_year
    # errors_out=None selects strict mode: the implementation raises the first
    # ParseError it hits instead of collecting it.
    return _parse_string_impl(text, year, errors_out=None, source_file=source_file)
1515
+
1516
+
1517
def parse_string_lenient(
    text: str,
    default_year: int | None = None,
    source_file: str = "(string)",
) -> tuple[Journal, list[ParseError]]:
    """Parse journal text, collecting parse problems instead of raising them.

    Parse errors are appended to the returned list rather than raised, and
    the transaction they occur in is discarded, so the Journal holds only the
    transactions that parsed successfully. Intended for editor integrations
    that re-parse on every text-changed event to show real-time diagnostics
    while the file is being edited.

    Args:
        text: Raw journal text.
        default_year: Year applied to dates written without one. When None,
            the current calendar year is used.
        source_file: Source name forwarded to the parser implementation;
            defaults to "(string)".

    Returns:
        A (journal, errors) tuple. errors is empty when the input is valid.
        NOTE(review): the parser also appends ParseWarning entries to this
        list (e.g. for a nested "apply account"); confirm that ParseWarning
        is compatible with the declared list[ParseError] element type.
    """
    collected: list[ParseError] = []
    year = default_year if default_year is not None else datetime.date.today().year
    # Passing a list as errors_out selects lenient mode: problems are appended
    # to it in place while parsing continues.
    parsed = _parse_string_impl(text, year, errors_out=collected, source_file=source_file)
    return parsed, collected
1546
+
1547
+