logseq-matryca-parser 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1047 @@
1
+ """Stack-machine parser for deterministic Logseq AST construction."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import logging
7
+ import os
8
+ import re
9
+ import uuid
10
+ from datetime import datetime, timezone
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ from .logos_core import LogseqNode, LogseqPage
15
+
16
# Compiled patterns for the Logseq inline syntax recognised by this parser.
LOGSEQ_PATTERNS: dict[str, re.Pattern[str]] = {
    # Whole line ``key:: value``: group 1 is the key, group 2 the (possibly empty) value.
    "property": re.compile(r"^([\w-]+)::\s*(.*)$"),
    # ``[[target]]``; non-greedy so several links on one line stay separate.
    "wikilink": re.compile(r"\[\[(.*?)\]\]"),
    # ``#[[bracketed tag]]`` (group 1) or bare ``#tag`` (group 2).
    "tag": re.compile(r"#\[\[([^\]]+)\]\]|#([^\s#\]]+)"),
    # Triple-paren reference with an optional ``[alias]`` prefix (group 1),
    # or a plain double-paren ``((id))`` reference (group 2).
    "block_ref": re.compile(
        r"(?:\[[^\]]+\])?\(\(\(([a-f0-9\-]{36})\)\)\)|\(\(([a-f0-9\-]{36})\)\)"
    ),
    # A line consisting solely of ``id:: <36-character id>``.
    "uuid_prop": re.compile(r"^id::\s*([a-f0-9\-]{36})$"),
    # ``id:: <36-character id>`` appearing anywhere inside a line.
    "inline_uuid_prop": re.compile(r"\bid::\s*([a-f0-9\-]{36})\b"),
}
# Task markers tested, in this order, as a prefix of a block's first line.
TASK_STATUSES: tuple[str, ...] = (
    "TODO",
    "DOING",
    "DONE",
    "LATER",
    "NOW",
    "WAITING",
    "CANCELED",
)
# ``SCHEDULED: <...>`` / ``DEADLINE: <...>``: group 1 is the marker, group 2 the bracketed payload.
TIME_PATTERN: re.Pattern[str] = re.compile(r"\b(SCHEDULED|DEADLINE):\s*(<[^>]+>)")
# Priority cookie such as ``[#A]``; group 1 is the single upper-case letter.
PRIORITY_PATTERN: re.Pattern[str] = re.compile(r"\[#([A-Z])\]")
# Prefix of the placeholder tokens substituted for shielded code/macro segments.
_SHIELD_TOKEN_PREFIX = "___LOGOS_SHIELD_TOKEN_"
# Markdown heading at the start of a string: group 1 is the ``#`` run, group 2 the title.
HEADING_PATTERN: re.Pattern[str] = re.compile(r"^(#{1,6})\s+(.+)$")
# Aliased block reference; group 1 captures the ``[alias]`` part so it can be kept.
ALIASED_BLOCK_REF_PATTERN: re.Pattern[str] = re.compile(
    r"(\[[^\]]+\])\(\(\([a-f0-9\-]{36}\)\)\)"
)
# Plain ``((id))`` block reference; group 1 is the 36-character id.
PLAIN_BLOCK_REF_PATTERN: re.Pattern[str] = re.compile(r"\(\(([a-f0-9\-]{36})\)\)")
43
+
44
# Lines treated as Logseq metadata/noise rather than block content.
# Each pattern is anchored at the start of the line and tolerates leading whitespace.
SYSTEM_BLOCK_PATTERNS: tuple[re.Pattern[str], ...] = (
    # Drawer openers ``:LOGBOOK:`` and ``:PROPERTIES:``.
    re.compile(r"^\s*:(?:LOGBOOK|PROPERTIES):", re.IGNORECASE),
    # Drawer terminator. Drawers are closed with ``:END:`` (leading colon); the bare
    # ``END:`` spelling is still accepted so previously matched lines keep matching.
    re.compile(r"^\s*:?END:", re.IGNORECASE),
    # Clock entries inside a logbook drawer.
    re.compile(r"^\s*CLOCK:", re.IGNORECASE),
    # The ``collapsed::`` UI-state property.
    re.compile(r"^\s*collapsed::", re.IGNORECASE),
)
50
+
51
# Bullet line: group 1 is the leading whitespace, group 2 the text after ``-``/``*``.
BULLET_PATTERN: re.Pattern[str] = re.compile(r"^(\s*)[-*]\s+(.*)$")
# Heading line without a bullet: group 1 is the leading whitespace, group 2 the heading text.
HEADING_BLOCK_PATTERN: re.Pattern[str] = re.compile(r"^(\s*)(#{1,6}\s+.+)$")
logger = logging.getLogger(__name__)

# Property keys consulted, in this order, for creation / update timestamps.
CREATED_AT_KEYS: tuple[str, ...] = ("created_at", "created-at", "createdat")
UPDATED_AT_KEYS: tuple[str, ...] = ("updated_at", "updated-at", "updatedat")
# Repeater cookie such as ``+1w``, ``++2d`` or ``.+3m``.
REPEATER_PATTERN: re.Pattern[str] = re.compile(r"(\.\+|\+\+|\+)\d+[hdwmy]")
# Closed clock entry ``CLOCK: [start]--[end] => HH:MM:SS``;
# groups are start text, end text and duration text.
CLOCK_PATTERN: re.Pattern[str] = re.compile(
    r"^\s*CLOCK:\s*\[([0-9]{4}-[0-9]{2}-[0-9]{2}\s+[A-Za-z]{3}\s+[0-9]{2}:[0-9]{2})\]\s*--\s*"
    r"\[([0-9]{4}-[0-9]{2}-[0-9]{2}\s+[A-Za-z]{3}\s+[0-9]{2}:[0-9]{2})\]\s*=>\s*([0-9]{2}:[0-9]{2}:[0-9]{2})\s*$"
)
62
+
63
+
64
def is_system_block(line: str) -> bool:
    """Tell whether ``line`` matches one of the ``SYSTEM_BLOCK_PATTERNS``."""
    for system_pattern in SYSTEM_BLOCK_PATTERNS:
        if system_pattern.match(line):
            return True
    return False
67
+
68
+
69
def clean_node_content(raw_content: str, properties: dict[str, Any]) -> str:
    """Return block text with Logseq markup removed.

    Fenced code is passed through (fence lines stripped, inner lines verbatim).
    Outside fences, lines that start with a known ``key::`` are dropped, and
    time markers, inline ids, block references, bullet/heading prefixes, the
    task marker on the first line and priority cookies are removed.
    """
    # ``str.startswith`` accepts a tuple; an empty tuple never matches.
    property_prefixes = tuple(f"{key}::" for key in properties)
    kept: list[str] = []
    inside_fence = False

    for position, source_line in enumerate(raw_content.splitlines()):
        bare = source_line.strip()

        if _is_code_fence_line(bare):
            inside_fence = not inside_fence
            kept.append(bare)
            continue
        if inside_fence:
            kept.append(source_line)
            continue
        if bare.startswith(property_prefixes):
            continue

        text = TIME_PATTERN.sub("", source_line)
        text = LOGSEQ_PATTERNS["inline_uuid_prop"].sub("", text)
        text = ALIASED_BLOCK_REF_PATTERN.sub(r"\1", text)
        text = PLAIN_BLOCK_REF_PATTERN.sub("", text)
        text = re.sub(r"^\*\*(.+?)\s\*\*$", r"\1", text.strip())
        text = re.sub(r"^\s*-\s+", "", text).strip()

        as_heading = HEADING_PATTERN.match(text)
        if as_heading is not None:
            text = as_heading.group(2).strip()
        if position == 0:
            # Only the first line of a block can carry the task marker.
            text = _extract_task_status(text)[1]

        text = PRIORITY_PATTERN.sub("", text).strip()
        text = re.sub(r"\s{2,}", " ", text).strip()
        if text:
            kept.append(text)

    return "\n".join(kept).strip()
104
+
105
+
106
+ def _is_code_fence_line(stripped_line: str) -> bool:
107
+ return stripped_line.startswith("```")
108
+
109
+
110
+ def _try_open_fence_line(content: str, line_start: int, n: int) -> tuple[int, int] | None:
111
+ """If the line starting at ``line_start`` opens a fenced code block, return (tick_index, tick_len)."""
112
+ k = line_start
113
+ while k < n and content[k] in " \t":
114
+ k += 1
115
+ if k >= n or content[k] != "`":
116
+ return None
117
+ tick_end = k
118
+ while tick_end < n and content[tick_end] == "`":
119
+ tick_end += 1
120
+ tick_len = tick_end - k
121
+ if tick_len < 3:
122
+ return None
123
+ return (k, tick_len)
124
+
125
+
126
+ def _fence_line_is_closing(line: str, tick_len: int) -> bool:
127
+ stripped = line.strip()
128
+ if not stripped or stripped[0] != "`":
129
+ return False
130
+ run = 0
131
+ while run < len(stripped) and stripped[run] == "`":
132
+ run += 1
133
+ remainder = stripped[run:].strip()
134
+ return run >= tick_len and remainder == ""
135
+
136
+
137
def _fence_region_end(content: str, tick_start: int, tick_len: int, n: int) -> int:
    """Locate the exclusive end of the fenced region opened at ``tick_start``.

    The region ends just after the newline of the first closing fence line,
    or at ``n`` when the opener has no following line or no closer is found.
    """
    opener_newline = content.find("\n", tick_start + tick_len)
    if opener_newline == -1:
        return n

    cursor = opener_newline + 1
    while cursor < n:
        newline_at = content.find("\n", cursor)
        is_last_line = newline_at == -1
        line_text = content[cursor:] if is_last_line else content[cursor:newline_at]
        if _fence_line_is_closing(line_text, tick_len):
            return n if is_last_line else newline_at + 1
        if is_last_line:
            break
        cursor = newline_at + 1
    return n
152
+
153
+
154
+ def _find_inline_code_close(content: str, body_start: int, tick_len: int, n: int) -> int:
155
+ p = body_start
156
+ while p < n:
157
+ if content[p] == "`":
158
+ q = p
159
+ while q < n and content[q] == "`":
160
+ q += 1
161
+ if q - p == tick_len:
162
+ return q
163
+ p += 1
164
+ return -1
165
+
166
+
167
def _consume_inline_code_span(content: str, i: int, n: int) -> tuple[str, int]:
    """Extract the inline code span whose opening backtick run starts at ``i``.

    Returns the span text (delimiters included) and the exclusive end index.
    Without a matching closer, everything up to ``n`` is returned.
    """
    body_start = i
    while body_start < n and content[body_start] == "`":
        body_start += 1

    span_end = _find_inline_code_close(content, body_start, body_start - i, n)
    if span_end == -1:
        span_end = n
    return content[i:span_end], span_end
177
+
178
+
179
def _shield_inline_code(content: str) -> tuple[str, list[str]]:
    """Replace code and macro segments with placeholder tokens.

    Fenced code regions (detected at line starts), ``{{...}}`` macros and inline
    code spans are each swapped for ``_SHIELD_TOKEN_PREFIX<index>___``.

    Returns:
        The masked text and the list of original segments, indexed by token number.
    """
    shielded_segments: list[str] = []
    output: list[str] = []
    total = len(content)
    cursor = 0

    def shield(raw: str) -> None:
        # The token number is the segment's position in ``shielded_segments``.
        output.append(f"{_SHIELD_TOKEN_PREFIX}{len(shielded_segments)}___")
        shielded_segments.append(raw)

    while cursor < total:
        # Fences are only recognised at the start of a line.
        if cursor == 0 or content[cursor - 1] == "\n":
            opener = _try_open_fence_line(content, cursor, total)
            if opener is not None:
                fence_stop = _fence_region_end(content, opener[0], opener[1], total)
                shield(content[cursor:fence_stop])
                cursor = fence_stop
                continue

        if content.startswith("{{", cursor):
            macro_close = content.find("}}", cursor + 2)
            if macro_close == -1:
                # Unterminated macro: mask the remainder and stop scanning.
                shield(content[cursor:total])
                break
            macro_stop = macro_close + 2
            shield(content[cursor:macro_stop])
            cursor = macro_stop
            continue

        if content[cursor] == "`":
            span_text, span_stop = _consume_inline_code_span(content, cursor, total)
            shield(span_text)
            cursor = span_stop
            continue

        output.append(content[cursor])
        cursor += 1

    return "".join(output), shielded_segments
221
+
222
+
223
def _extract_task_status(first_line: str) -> tuple[str | None, str]:
    """Split a leading task marker off ``first_line``.

    Returns ``(marker, remaining_text)`` when the line starts with one of
    ``TASK_STATUSES`` followed by a space, otherwise ``(None, first_line)``.
    """
    marker = next(
        (status for status in TASK_STATUSES if first_line.startswith(f"{status} ")),
        None,
    )
    if marker is None:
        return None, first_line
    # Skip the marker and the single space that follows it.
    return marker, first_line[len(marker) + 1 :].strip()
229
+
230
+
231
def _extract_time_properties(raw_content: str) -> dict[str, Any]:
    """Collect SCHEDULED / DEADLINE data from text outside fenced code.

    For each marker the raw ``<...>`` payload is stored under the lower-cased
    marker name. When the payload (minus any repeater) parses as a date,
    ``<marker>_journal_day``, ``<marker>_iso`` and ``<marker>_at`` are added.
    A repeater cookie, if present, is stored under ``"repeater"``.
    """
    extracted: dict[str, Any] = {}
    inside_fence = False

    for raw_line in raw_content.splitlines():
        if _is_code_fence_line(raw_line.strip()):
            inside_fence = not inside_fence
            continue
        if inside_fence:
            continue

        for marker_match in TIME_PATTERN.finditer(raw_line):
            marker, bracketed = marker_match.groups()
            name = marker.lower()
            extracted[name] = bracketed

            payload = bracketed.strip("<>")
            repeater_hit = REPEATER_PATTERN.search(payload)
            if repeater_hit is None:
                repeater = None
                date_text = payload
            else:
                repeater = repeater_hit.group(0)
                date_text = REPEATER_PATTERN.sub("", payload).strip()

            moment = _parse_logseq_datetime(date_text)
            if moment is not None:
                extracted[f"{name}_journal_day"] = int(moment.strftime("%Y%m%d"))
                extracted[f"{name}_iso"] = moment.isoformat(timespec="seconds")
                # The naive datetime is interpreted as UTC for the epoch value.
                extracted[f"{name}_at"] = int(
                    moment.replace(tzinfo=timezone.utc).timestamp()
                )
            if repeater is not None:
                extracted["repeater"] = repeater

    return extracted
260
+
261
+
262
def _extract_tags(raw_content: str) -> list[str]:
    """List the tags found outside shielded segments, trailing ``.,;:`` removed."""
    masked = _shield_inline_code(raw_content)[0]
    return [
        (bracketed or simple).rstrip(".,;:")
        for bracketed, simple in LOGSEQ_PATTERNS["tag"].findall(masked)
        if bracketed or simple
    ]
270
+
271
+
272
def _extract_block_refs(raw_content: str) -> list[str]:
    """List the block-reference ids found outside shielded segments."""
    masked = _shield_inline_code(raw_content)[0]
    return [
        alias_ref or plain_ref
        for alias_ref, plain_ref in LOGSEQ_PATTERNS["block_ref"].findall(masked)
        if alias_ref or plain_ref
    ]
280
+
281
+
282
def _extract_heading_level(content: str) -> int | None:
    """Return the number of ``#`` on the first line when it is a heading, else None."""
    lines = content.splitlines()
    if not lines:
        return None
    heading = HEADING_PATTERN.match(lines[0].strip())
    return len(heading.group(1)) if heading else None
288
+
289
+
290
+ def _strip_ordinal_suffix(value: str) -> str:
291
+ return re.sub(r"\b([0-9]{1,2})(st|nd|rd|th)\b", r"\1", value, flags=re.IGNORECASE)
292
+
293
+
294
def _parse_logseq_datetime(raw_value: str) -> datetime | None:
    """Parse a date/time string using the known layouts, tried in order.

    Ordinal suffixes are removed first. Returns a naive ``datetime`` for the
    first layout that fits, or None when none does.
    """
    candidate = _strip_ordinal_suffix(raw_value.strip())
    for layout in (
        "%Y-%m-%d %a %H:%M",
        "%Y-%m-%d %H:%M",
        "%Y-%m-%d %a",
        "%Y-%m-%d",
        "%b %d, %Y",
        "%Y_%m_%d",
        "%a, %d-%m-%Y",
    ):
        try:
            parsed = datetime.strptime(candidate, layout)
        except ValueError:
            continue
        return parsed
    return None
311
+
312
+
313
def resolve_journal_day(value: str) -> int | None:
    """Turn a journal-like string into a ``YYYYMMDD`` integer.

    Surrounding ``[[ ]]`` and a trailing ``.md`` (any case) are removed before
    parsing. Returns None for blank or unparseable input.
    """
    text = value.strip()
    if not text:
        return None

    is_wikilink = text.startswith("[[") and text.endswith("]]")
    if is_wikilink:
        text = text[2:-2].strip()
    if text.lower().endswith(".md"):
        text = text[:-3]

    moment = _parse_logseq_datetime(text)
    return None if moment is None else int(moment.strftime("%Y%m%d"))
326
+
327
+
328
def normalize_logseq_timestamp(value: Any) -> int | None:
    """Normalize Logseq-style timestamp values to unix epoch seconds.

    Accepted inputs:
        * ``int`` / ``float``: truncated to int; values >= 10**12 are treated
          as milliseconds and divided by 1000.
        * decimal-digit strings: handled like integers.
        * ISO-8601 strings (a ``Z`` is rewritten to ``+00:00``); naive values
          are interpreted as UTC.
        * strings understood by ``_parse_logseq_datetime``, plus the
          ``%Y/%m/%d`` and ``%Y%m%d`` layouts, all interpreted as UTC.

    Returns:
        Epoch seconds, or None for ``None``, booleans, non-finite floats,
        blank strings, unsupported types and unparseable text.
    """
    # ``bool`` is a subclass of ``int`` and must be rejected before the numeric branch.
    if value is None or isinstance(value, bool):
        return None

    if isinstance(value, (int, float)):
        try:
            timestamp = int(value)
        except (ValueError, OverflowError):
            # int(nan) raises ValueError, int(inf) raises OverflowError.
            return None
        return timestamp // 1000 if timestamp >= 10**12 else timestamp

    if not isinstance(value, str):
        return None

    candidate = value.strip()
    if not candidate:
        return None

    # ``isdecimal`` (not ``isdigit``): the latter also accepts characters such
    # as superscript digits, which ``int()`` rejects with ValueError.
    if candidate.isdecimal():
        parsed = int(candidate)
        return parsed // 1000 if parsed >= 10**12 else parsed

    iso_candidate = candidate.replace("Z", "+00:00")
    try:
        parsed_datetime = datetime.fromisoformat(iso_candidate)
        if parsed_datetime.tzinfo is None:
            parsed_datetime = parsed_datetime.replace(tzinfo=timezone.utc)
        return int(parsed_datetime.timestamp())
    except ValueError:
        pass

    parsed_logseq_date = _parse_logseq_datetime(candidate)
    if parsed_logseq_date is not None:
        return int(parsed_logseq_date.replace(tzinfo=timezone.utc).timestamp())

    for fmt in ("%Y/%m/%d", "%Y%m%d"):
        try:
            parsed_date = datetime.strptime(candidate, fmt).replace(tzinfo=timezone.utc)
        except ValueError:
            continue
        return int(parsed_date.timestamp())

    return None
370
+
371
+
372
def _first_normalized_timestamp(properties: dict[str, Any], keys: tuple[str, ...]) -> int | None:
    """Return the first normalizable timestamp among ``keys``, tried in order."""
    present_values = (properties[key] for key in keys if key in properties)
    for raw_value in present_values:
        epoch = normalize_logseq_timestamp(raw_value)
        if epoch is not None:
            return epoch
    return None
379
+
380
+
381
+ def _merge_refs(wikilinks: list[str], tags: list[str]) -> list[str]:
382
+ merged: list[str] = []
383
+ seen: set[str] = set()
384
+ for token in [*wikilinks, *tags]:
385
+ if token and token not in seen:
386
+ seen.add(token)
387
+ merged.append(token)
388
+ return merged
389
+
390
+
391
def _extract_property_graph_tokens(
    properties: dict[str, Any],
) -> tuple[list[str], list[str], list[str]]:
    """Gather wikilinks, tags and block refs from every string property value.

    Non-string values are ignored. Returns ``(wikilinks, tags, block_refs)``.
    """
    links: list[str] = []
    tags: list[str] = []
    block_refs: list[str] = []
    for text in (value for value in properties.values() if isinstance(value, str)):
        links += _extract_wikilinks(text)
        tags += _extract_tags(text)
        block_refs += _extract_block_refs(text)
    return links, tags, block_refs
404
+
405
+
406
def _extract_wikilinks(raw_content: str) -> list[str]:
    """List ``[[...]]`` targets found outside shielded segments."""
    masked = _shield_inline_code(raw_content)[0]
    wikilink_pattern = LOGSEQ_PATTERNS["wikilink"]
    return wikilink_pattern.findall(masked)
409
+
410
+
411
class PageRegistry:
    """Index of nodes keyed by uuid, used to look blocks up by reference."""

    def __init__(self) -> None:
        """Start with an empty uuid -> node mapping."""
        self.blocks: dict[str, LogseqNode] = {}

    def register(self, node: LogseqNode) -> None:
        """Store ``node`` under its uuid; nodes with a falsy uuid are ignored."""
        key = node.uuid
        if not key:
            return
        self.blocks[key] = node

    def resolve(self, node_uuid: str) -> LogseqNode | None:
        """Return the node registered under ``node_uuid``, or None."""
        try:
            return self.blocks[node_uuid]
        except KeyError:
            return None
423
+
424
+
425
+ class StackMachineParser:
426
+ """O(N) indentation parser that builds a strict immutable AST."""
427
+
428
def __init__(self, tab_size: int = 2) -> None:
    """Create a parser.

    Args:
        tab_size: Width of one indentation level in spaces; a tab counts as
            this many spaces when indentation is measured.
    """
    self.registry = PageRegistry()
    self.tab_size = tab_size
431
+
432
def parse(self, text: str, page_title: str = "untitled") -> LogseqPage:
    """Parse Logseq markdown text into a `LogseqPage`.

    Lines are consumed one at a time. A stack of open blocks (with their
    indent levels and raw indent strings) tracks the current ancestry:

    * bullet and heading lines open a new block under the nearest
      shallower ancestor;
    * ``key:: value`` lines before any block become page properties,
      afterwards they are attached to the current block;
    * ``:LOGBOOK:`` drawers are collected into the current block's
      ``logbook`` / ``clock`` properties;
    * any other non-blank line is appended to the block on top of the stack;
    * lines inside a fenced code block are appended verbatim.

    Args:
        text: Full markdown content of the page.
        page_title: Title stored on the page and used for synthetic ids.

    Returns:
        The page model with its root nodes, properties, refs and timestamps.
    """
    stack: list[LogseqNode] = []
    stack_columns: list[int] = []
    stack_indents: list[str] = []
    root_nodes: list[LogseqNode] = []
    page_properties: dict[str, Any] = {}
    current_node: LogseqNode | None = None
    frontmatter_active = True
    property_list_indent_level: int | None = None
    in_code_block = False
    in_drawer = False

    def open_block(
        block_text: str, indent_level: int, raw_indent: str, line_number: int
    ) -> LogseqNode:
        """Close deeper-or-equal blocks, then build, attach, push and register a new one."""
        while stack_columns and stack_columns[-1] >= indent_level:
            stack.pop()
            stack_columns.pop()
            stack_indents.pop()

        parent_uuid = self._resolve_parent_uuid_for_synthetic(stack)
        node = self._build_node(
            block_text=block_text,
            indent_level=indent_level,
            page_title=page_title,
            line_start=line_number,
            parent_uuid=parent_uuid,
        )

        node = self._initialize_node_graph_fields(node, stack, root_nodes)
        if stack:
            node = self._attach_node_to_parent(stack, root_nodes, node)
        else:
            root_nodes.append(node)

        stack.append(node)
        stack_columns.append(indent_level)
        stack_indents.append(raw_indent)
        self.registry.register(node)
        return node

    for line_number, raw_line in enumerate(text.splitlines(), start=1):
        stripped_line = raw_line.strip()

        # Inside a fence every line belongs verbatim to the current block.
        if in_code_block and current_node is not None:
            merged_content = f"{current_node.content}\n{raw_line}"
            updated = self._refresh_node(current_node, merged_content, line_end=line_number)
            self._replace_stack_tail_node(stack, root_nodes, updated)
            current_node = updated
            if _is_code_fence_line(stripped_line):
                in_code_block = False
            frontmatter_active = False
            property_list_indent_level = None
            continue

        if in_drawer:
            if stripped_line.upper() == ":END:":
                in_drawer = False
                continue
            if BULLET_PATTERN.match(raw_line):
                # An unterminated drawer ends at the next bullet, which is
                # then processed normally below.
                in_drawer = False
            else:
                if current_node is not None:
                    properties = dict(current_node.properties)
                    logbook_entries = list(properties.get("logbook", []))
                    logbook_entries.append(stripped_line)
                    properties["logbook"] = logbook_entries
                    clock_match = CLOCK_PATTERN.match(stripped_line)
                    if clock_match:
                        start_text, end_text, duration_text = clock_match.groups()
                        try:
                            start_dt = datetime.strptime(start_text, "%Y-%m-%d %a %H:%M")
                            end_dt = datetime.strptime(end_text, "%Y-%m-%d %a %H:%M")
                        except ValueError:
                            # The regex only checks the shape; an impossible date or an
                            # unknown weekday name must not abort the whole parse. The
                            # raw line is still kept in ``logbook``.
                            logger.warning(
                                "Unparseable CLOCK entry skipped line=%s", line_number
                            )
                        else:
                            hours, minutes, seconds = [
                                int(part) for part in duration_text.split(":")
                            ]
                            duration_seconds = (hours * 3600) + (minutes * 60) + seconds
                            clock_entries = list(properties.get("clock", []))
                            clock_entries.append(
                                {
                                    "start_iso": start_dt.isoformat(timespec="seconds"),
                                    "end_iso": end_dt.isoformat(timespec="seconds"),
                                    "duration": duration_text,
                                    "duration_seconds": duration_seconds,
                                }
                            )
                            properties["clock"] = clock_entries
                    updated = self._refresh_node(
                        current_node,
                        current_node.content,
                        properties_override=properties,
                        line_end=line_number,
                    )
                    self._replace_stack_tail_node(stack, root_nodes, updated)
                    current_node = updated
                continue

        if stripped_line.upper() == ":LOGBOOK:" and current_node is not None:
            in_drawer = True
            properties = dict(current_node.properties)
            properties.setdefault("logbook", [])
            updated = self._refresh_node(
                current_node,
                current_node.content,
                properties_override=properties,
                line_end=line_number,
            )
            self._replace_stack_tail_node(stack, root_nodes, updated)
            current_node = updated
            continue

        collapsed_match = re.match(r"^\s*collapsed::\s*(\S+)\s*$", raw_line, re.IGNORECASE)
        if collapsed_match and current_node is not None:
            collapsed_value = collapsed_match.group(1).lower() == "true"
            properties = dict(current_node.properties)
            properties["collapsed"] = collapsed_value
            updated = self._refresh_node(
                current_node,
                current_node.content,
                properties_override=properties,
                line_end=line_number,
            )
            self._replace_stack_tail_node(stack, root_nodes, updated)
            current_node = updated
            continue

        if not stripped_line or is_system_block(raw_line):
            continue

        bullet_match = BULLET_PATTERN.match(raw_line)
        if bullet_match:
            indent_level = self._compute_indent_level(bullet_match.group(1))
            # Bullets nested under an empty ``key::`` property are one level
            # deeper in the file than in the outline.
            if property_list_indent_level is not None and indent_level > property_list_indent_level:
                indent_level -= 1
            else:
                property_list_indent_level = None

            raw_indent = bullet_match.group(1)
            # Space-indented bullet directly after a tab-indented one that
            # computes exactly one level deeper: treat it as a sibling.
            if (
                stack_columns
                and "\t" in stack_indents[-1]
                and raw_indent
                and "\t" not in raw_indent
                and indent_level == stack_columns[-1] + 1
            ):
                indent_level = stack_columns[-1]

            current_node = open_block(
                bullet_match.group(2), indent_level, raw_indent, line_number
            )
            frontmatter_active = False

            # A bullet whose own text opens a fence (``- ```python``) starts a
            # code block. Without this the closing fence would be mistaken for
            # an opener and every following line swallowed into this block.
            bullet_text = bullet_match.group(2).strip()
            if _is_code_fence_line(bullet_text) and "```" not in bullet_text.lstrip("`"):
                in_code_block = True
            continue

        heading_match = HEADING_BLOCK_PATTERN.match(raw_line)
        if heading_match:
            indent_level = self._compute_indent_level(heading_match.group(1))
            property_list_indent_level = None
            current_node = open_block(
                heading_match.group(2), indent_level, heading_match.group(1), line_number
            )
            frontmatter_active = False
            continue

        property_match = LOGSEQ_PATTERNS["property"].match(raw_line.strip())
        if property_match:
            key, value = property_match.groups()

            # Properties before the first block belong to the page.
            if current_node is None and frontmatter_active:
                page_properties[key] = value
                continue

            if current_node is None:
                frontmatter_active = False
                continue

            properties = dict(current_node.properties)
            properties[key] = value
            properties_order = list(current_node.properties_order)
            if key not in properties_order:
                properties_order.append(key)

            updated = self._refresh_node(
                current_node,
                current_node.content,
                properties_override=properties,
                properties_order_override=properties_order,
                line_end=line_number,
            )
            if key == "id":
                # An explicit ``id::`` makes the block's identity non-synthetic.
                updated = updated.model_copy(
                    update={"source_uuid": value, "synthetic_id": False}
                )
            self._replace_stack_tail_node(stack, root_nodes, updated)
            current_node = updated
            self.registry.register(updated)

            raw_indent = raw_line[: len(raw_line) - len(raw_line.lstrip(" \t"))]
            # Remember the indent of an empty-valued property so bullets
            # listed beneath it can be re-levelled.
            property_list_indent_level = (
                self._compute_indent_level(raw_indent) if value.strip() == "" else None
            )
            frontmatter_active = False
            continue

        if not stack:
            # Free text before any block ends the front matter and is dropped.
            frontmatter_active = False
            continue

        # Soft-break continuation: append the line to the block on top of the stack.
        active_node = stack[-1]
        merged_content = f"{active_node.content}\n{raw_line}"
        updated = self._refresh_node(active_node, merged_content, line_end=line_number)
        self._replace_stack_tail_node(stack, root_nodes, updated)
        current_node = updated
        logger.debug(
            "Soft-break continuation merged into stack tip line=%s depth=%s",
            line_number,
            len(stack),
        )
        frontmatter_active = False
        property_list_indent_level = None
        if _is_code_fence_line(stripped_line):
            in_code_block = True

    self._validate_references(root_nodes)
    root_nodes = self._normalize_indent_levels(root_nodes)
    page_refs = self._collect_page_refs(root_nodes)
    created_at = _first_normalized_timestamp(page_properties, CREATED_AT_KEYS)
    updated_at = _first_normalized_timestamp(page_properties, UPDATED_AT_KEYS)
    # Every title segment except the last forms the namespace chain.
    title_segments = [segment for segment in page_title.split("/") if segment]
    namespace_chain = title_segments[:-1] if len(title_segments) > 1 else []
    return LogseqPage(
        title=page_title,
        raw_content=text,
        properties=page_properties,
        refs=page_refs,
        created_at=created_at,
        updated_at=updated_at,
        namespace_chain=namespace_chain,
        root_nodes=root_nodes,
    )
687
+
688
def parse_file(self, path: Path | str) -> list[LogseqNode]:
    """Compatibility wrapper: parse the file at ``path`` and return only its root nodes."""
    return self.parse_page_file(path).root_nodes
692
+
693
def parse_page_file(self, path: Path | str) -> LogseqPage:
    """Read a markdown file (UTF-8) and return its page model.

    A blank file yields an empty page titled with the file stem. Otherwise the
    text is parsed, and creation / update times missing from the page
    properties fall back to the file's ctime / mtime. The resolved source path
    is stamped on the page and on every node.
    """
    file_path = Path(path)
    raw_text = file_path.read_text(encoding="utf-8")

    if not raw_text.strip():
        logger.warning("Il file %s è vuoto.", file_path)
        return LogseqPage(
            title=file_path.stem,
            raw_content=raw_text,
            namespace_chain=[],
            source_path=str(file_path.resolve()),
            graph_root=str(file_path.resolve().parent),
        )

    parsed_page = self.parse(raw_text, page_title=self._derive_page_title(file_path))
    graph_root = self._derive_graph_root(file_path)

    created_at = parsed_page.created_at
    if created_at is None:
        created_at = int(os.path.getctime(file_path))
    updated_at = parsed_page.updated_at
    if updated_at is None:
        updated_at = int(os.path.getmtime(file_path))

    source_path = str(file_path.resolve())
    changes = {
        "source_path": source_path,
        "graph_root": str(graph_root),
        "created_at": created_at,
        "updated_at": updated_at,
        "root_nodes": self._apply_source_path(parsed_page.root_nodes, source_path),
    }
    return parsed_page.model_copy(update=changes)
726
+
727
+ def _derive_page_title(self, path: Path) -> str:
728
+ resolved_path = path.resolve()
729
+ if resolved_path.suffix == ".md":
730
+ resolved_path = resolved_path.with_suffix("")
731
+ parts = list(resolved_path.parts)
732
+ if "pages" in parts:
733
+ page_index = parts.index("pages")
734
+ return "/".join(parts[page_index + 1 :])
735
+ if "journals" in parts:
736
+ journal_index = parts.index("journals")
737
+ return "/".join(parts[journal_index + 1 :])
738
+ return path.stem
739
+
740
+ def _derive_graph_root(self, path: Path) -> Path:
741
+ resolved_path = path.resolve()
742
+ marker_dirs = {"pages", "journals", "assets", "logseq"}
743
+ for parent in resolved_path.parents:
744
+ if parent.name in marker_dirs:
745
+ return parent.parent.resolve()
746
+ return resolved_path.parent.resolve()
747
+
748
def _apply_source_path(self, nodes: list[LogseqNode], source_path: str) -> list[LogseqNode]:
    """Return copies of ``nodes`` (recursively) with ``source_path`` set on each."""
    stamped: list[LogseqNode] = []
    for original in nodes:
        stamped_children = self._apply_source_path(original.children, source_path)
        stamped.append(
            original.model_copy(
                update={"source_path": source_path, "children": stamped_children}
            )
        )
    return stamped
758
+
759
def _compute_indent_level(self, indentation: str) -> int:
    """Convert leading whitespace to an indent level.

    Each space counts as one column and each tab as ``tab_size`` columns;
    the level is the column count floor-divided by ``tab_size``.
    """
    width = indentation.count(" ") + indentation.count("\t") * self.tab_size
    level = width // self.tab_size
    logger.debug(
        "Computed indentation level via floor division: spaces=%s tab_size=%s level=%s",
        width,
        self.tab_size,
        level,
    )
    return level
768
+
769
def _resolve_parent_uuid_for_synthetic(self, stack: list[LogseqNode]) -> str | None:
    """Return the uuid of the block on top of ``stack``, or None when the stack is empty."""
    if stack:
        resolved_parent_uuid = stack[-1].uuid
        logger.debug(
            "Stack depth=%s: synthetic UUID parent_uuid=%s", len(stack), resolved_parent_uuid
        )
        return resolved_parent_uuid

    logger.debug("Stack empty: synthetic UUID parent_uuid=None (hashed as root sentinel)")
    return None
779
+
780
def _build_node(
    self,
    block_text: str,
    indent_level: int,
    page_title: str,
    line_start: int,
    parent_uuid: str | None,
) -> LogseqNode:
    """Create a childless node from the first line of a block.

    Args:
        block_text: Text after the bullet (or the heading text).
        indent_level: Outline depth computed by the caller.
        page_title: Page title, part of the synthetic id payload.
        line_start: 1-based line number of the block's first line.
        parent_uuid: Uuid of the enclosing block, or None at the root.

    Returns:
        A node with ``parent_id=None`` and no children; graph fields are
        filled in by the caller.
    """
    stripped_text = block_text.strip()
    properties: dict[str, Any] = {}

    # Both matches are taken against the text *before* the inline id is removed.
    uuid_match = LOGSEQ_PATTERNS["uuid_prop"].match(stripped_text)
    inline_uuid_match = LOGSEQ_PATTERNS["inline_uuid_prop"].search(stripped_text)
    if inline_uuid_match is not None:
        inline_uuid = inline_uuid_match.group(1)
        properties["id"] = inline_uuid
        stripped_text = LOGSEQ_PATTERNS["inline_uuid_prop"].sub("", stripped_text).strip()
    source_uuid = (
        uuid_match.group(1)
        if uuid_match
        else (inline_uuid_match.group(1) if inline_uuid_match else None)
    )
    # The node uuid is always the deterministic hash, even when the source carries an id.
    node_uuid = self._deterministic_uuid(page_title, line_start, stripped_text, parent_uuid)
    time_properties = _extract_time_properties(stripped_text)
    scheduled_at: int | None = None
    deadline_at: int | None = None
    if time_properties:
        scheduled_raw = time_properties.get("scheduled_at")
        deadline_raw = time_properties.get("deadline_at")
        scheduled_at = scheduled_raw if isinstance(scheduled_raw, int) else None
        deadline_at = deadline_raw if isinstance(deadline_raw, int) else None
        # The epoch values go to dedicated node fields, not into ``properties``.
        merge_time = {
            key: value
            for key, value in time_properties.items()
            if key not in ("scheduled_at", "deadline_at")
        }
        properties.update(merge_time)
    first_line = stripped_text.splitlines()[0].strip() if stripped_text else ""
    priority_match = PRIORITY_PATTERN.search(first_line)
    task_priority = priority_match.group(1) if priority_match else None
    task_status, _ = _extract_task_status(stripped_text)
    heading_level = _extract_heading_level(stripped_text)
    if heading_level is not None:
        properties["heading_level"] = heading_level
    # String property values may themselves contain links, tags and block refs.
    property_wikilinks, property_tags, property_block_refs = _extract_property_graph_tokens(
        properties
    )
    wikilinks = [*_extract_wikilinks(stripped_text), *property_wikilinks]
    tags = [*_extract_tags(stripped_text), *property_tags]
    properties_order = ["id"] if "id" in properties else []

    return LogseqNode(
        uuid=node_uuid,
        source_uuid=source_uuid,
        synthetic_id=source_uuid is None,
        content=stripped_text,
        clean_text=clean_node_content(stripped_text, properties),
        indent_level=indent_level,
        properties=properties,
        properties_order=properties_order,
        wikilinks=wikilinks,
        tags=tags,
        refs=_merge_refs(wikilinks, tags),
        block_refs=[*_extract_block_refs(stripped_text), *property_block_refs],
        task_status=task_status,
        task_priority=task_priority,
        scheduled_at=scheduled_at,
        deadline_at=deadline_at,
        repeater=properties.get("repeater") if isinstance(properties.get("repeater"), str) else None,
        parent_id=None,
        line_start=line_start,
        line_end=line_start,
        created_at=_first_normalized_timestamp(properties, CREATED_AT_KEYS),
        updated_at=_first_normalized_timestamp(properties, UPDATED_AT_KEYS),
        children=[],
    )
856
+
857
def _deterministic_uuid(
    self,
    page_title: str,
    line_start: int,
    content: str,
    parent_uuid: str | None,
) -> str:
    """Derive a stable id from page title, line number, parent and content.

    The four parts are joined with ``:`` (``"root"`` stands in for a missing
    parent), hashed with SHA-256, and the hex digest is turned into a
    version-5 UUID in the DNS namespace.
    """
    parent_token = parent_uuid if parent_uuid is not None else "root"
    logger.debug(
        "Stack-Machine synthetic UUID payload parent_token=%s line_start=%s page_title=%s",
        parent_token,
        line_start,
        page_title,
    )
    seed = ":".join((page_title, str(line_start), parent_token, content))
    hex_digest = hashlib.sha256(seed.encode("utf-8")).hexdigest()
    derived = uuid.uuid5(uuid.NAMESPACE_DNS, hex_digest)
    return str(derived)
874
+
875
def _replace_stack_tail_node(
    self,
    stack: list[LogseqNode],
    root_nodes: list[LogseqNode],
    updated_node: LogseqNode,
) -> None:
    """Swap the node on top of ``stack`` for ``updated_node`` and re-link ancestors.

    Nodes are replaced through ``model_copy`` instead of being mutated, so
    each ancestor on the stack must be rebuilt with its last child pointing
    at the fresh copy below it. The rebuilt ``stack[0]`` is then written to
    ``root_nodes[-1]`` so the root list never references a stale copy,
    whatever the nesting depth.

    Args:
        stack: Chain of open nodes whose last entry is the node being
            replaced. Every entry is updated in place.
        root_nodes: Collected top-level nodes; the last entry is overwritten
            with the rebuilt ``stack[0]``.
        updated_node: Replacement for ``stack[-1]``.

    Returns:
        None. Does nothing when ``stack`` is empty.
    """
    if not stack:
        return

    stack[-1] = updated_node
    descendant = updated_node

    # Walk from the direct parent up to the outermost ancestor. Stopping
    # early would leave outer ancestors (and root_nodes) holding the old
    # child object.
    for idx in range(len(stack) - 2, -1, -1):
        ancestor = stack[idx]
        relinked_children = list(ancestor.children)
        relinked_children[-1] = descendant
        descendant = ancestor.model_copy(update={"children": relinked_children})
        stack[idx] = descendant

    root_nodes[-1] = stack[0]
903
+
904
def _attach_node_to_parent(
    self,
    stack: list[LogseqNode],
    root_nodes: list[LogseqNode],
    node: LogseqNode,
) -> LogseqNode:
    """Append ``node`` as the newest child of ``stack[-1]`` and rebuild the chain.

    A copy of ``node`` with ``parent_id`` set to the parent's ``uuid`` is
    appended to the parent's children. Because nodes are copied rather than
    mutated, every ancestor on the stack is then rebuilt so that its last
    child is the freshly rebuilt descendant, and ``root_nodes[-1]`` is set to
    the rebuilt ``stack[0]``.

    Args:
        stack: Chain of open nodes; the last entry becomes the parent.
            Every entry is replaced in place by its rebuilt copy.
        root_nodes: Collected top-level nodes; the last entry is overwritten.
        node: Node to attach.

    Returns:
        The attached copy of ``node`` (carrying the new ``parent_id``).
    """
    tail = len(stack) - 1
    attached_node = node.model_copy(update={"parent_id": stack[-1].uuid})

    # The direct parent gains a new child ...
    direct_parent = stack[tail]
    rebuilt = direct_parent.model_copy(
        update={"children": [*direct_parent.children, attached_node]}
    )
    stack[tail] = rebuilt

    # ... while every outer ancestor only swaps its last child for the copy
    # rebuilt one level below.
    for position in reversed(range(tail)):
        holder = stack[position]
        relinked = [*holder.children]
        relinked[-1] = rebuilt
        rebuilt = holder.model_copy(update={"children": relinked})
        stack[position] = rebuilt

    root_nodes[-1] = stack[0]
    return attached_node
926
+
927
def _initialize_node_graph_fields(
    self,
    node: LogseqNode,
    stack: list[LogseqNode],
    root_nodes: list[LogseqNode],
) -> LogseqNode:
    """Return a copy of ``node`` with ``left_id``, ``path`` and ``outline_path`` set.

    With a non-empty ``stack`` the node is positioned under ``stack[-1]``:
    its ``path`` and ``outline_path`` extend the parent's, and its ordinal
    is one more than the parent's current child count. With an empty stack
    the node is a root and its ordinal is one more than ``len(root_nodes)``.
    ``left_id`` comes from ``_resolve_left_sibling_id``.

    Args:
        node: Node that is about to be inserted.
        stack: Chain of open nodes; the last entry is the prospective parent.
        root_nodes: Top-level nodes collected so far.

    Returns:
        A copy of ``node`` carrying the three graph fields.
    """
    left_id = self._resolve_left_sibling_id(stack, root_nodes)

    if not stack:
        # Root-level block: the path starts here and the ordinal counts roots.
        uuid_prefix: list[str] = []
        outline_prefix: list[int] = []
        ordinal = len(root_nodes) + 1
    else:
        container = stack[-1]
        uuid_prefix = list(container.path)
        outline_prefix = list(container.outline_path)
        ordinal = len(container.children) + 1

    graph_fields = {
        "left_id": left_id,
        "path": uuid_prefix + [node.uuid],
        "outline_path": outline_prefix + [ordinal],
    }
    return node.model_copy(update=graph_fields)
944
+
945
def _resolve_left_sibling_id(
    self, stack: list[LogseqNode], root_nodes: list[LogseqNode]
) -> str | None:
    """Return the ``uuid`` of the node that would precede a newly inserted node.

    The candidate siblings are the children of ``stack[-1]`` when the stack
    is non-empty, otherwise the root nodes. The result is the ``uuid`` of
    the last candidate, or ``None`` when there are no candidates.
    """
    siblings = stack[-1].children if stack else root_nodes
    if not siblings:
        return None
    return siblings[-1].uuid
952
+
953
def _collect_page_refs(self, roots: list[LogseqNode]) -> list[str]:
    """Gather every distinct ``refs`` token in the tree, in first-seen order.

    The tree is walked depth-first in document order: a node's own refs
    come first, then its children's, then its following siblings'. Each
    token is kept only the first time it is encountered.

    Args:
        roots: Top-level nodes of the page.

    Returns:
        De-duplicated tokens in order of first appearance.
    """
    ordered: dict[str, None] = {}
    # Explicit stack; entries are pushed reversed so that popping yields
    # nodes in their original left-to-right order.
    pending: list[LogseqNode] = list(reversed(roots))
    while pending:
        current = pending.pop()
        for token in current.refs:
            ordered.setdefault(token, None)
        pending.extend(reversed(current.children))
    return list(ordered)
967
+
968
def _validate_references(self, roots: list[LogseqNode]) -> None:
    """Reference-validation hook that currently performs no checks.

    ``roots`` is accepted and deliberately ignored; nothing is read,
    modified, or raised.
    """
    del roots  # intentionally unused
970
+
971
def _refresh_node(
    self,
    node: LogseqNode,
    content: str,
    properties_override: dict[str, Any] | None = None,
    properties_order_override: list[str] | None = None,
    line_end: int | None = None,
) -> LogseqNode:
    """Return a copy of ``node`` whose content-derived fields are recomputed.

    Everything that depends on the block text (clean text, task status and
    priority, scheduling timestamps, wikilinks, tags, refs, block refs,
    created/updated timestamps) is derived again from ``content`` and the
    effective properties.

    Args:
        node: Node to refresh; it is not modified.
        content: New full text of the block. May be empty.
        properties_override: Replaces ``node.properties`` when given.
        properties_order_override: Replaces ``node.properties_order`` when
            given.
        line_end: New last line of the block; ``node.line_end`` is kept
            when ``None``.

    Returns:
        The refreshed copy of ``node``.
    """
    # Always work on copies so neither the node nor the overrides are mutated.
    properties = dict(node.properties) if properties_override is None else dict(properties_override)
    properties_order = (
        list(node.properties_order)
        if properties_order_override is None
        else list(properties_order_override)
    )

    time_properties = _extract_time_properties(content)
    scheduled_at: int | None = None
    deadline_at: int | None = None
    if time_properties:
        scheduled_raw = time_properties.get("scheduled_at")
        deadline_raw = time_properties.get("deadline_at")
        # Only integer timestamps are promoted to the dedicated node fields.
        scheduled_at = scheduled_raw if isinstance(scheduled_raw, int) else None
        deadline_at = deadline_raw if isinstance(deadline_raw, int) else None
        # Every other extracted time entry is folded into the property map.
        merge_time = {
            key: value
            for key, value in time_properties.items()
            if key not in ("scheduled_at", "deadline_at")
        }
        properties.update(merge_time)

    heading_level = _extract_heading_level(content)
    if heading_level is not None:
        properties["heading_level"] = heading_level

    # Split once and guard the empty case; priority and task status are both
    # read from the first line only.
    content_lines = content.splitlines()
    first_line = content_lines[0].strip() if content_lines else ""
    priority_match = PRIORITY_PATTERN.search(first_line)
    task_priority = priority_match.group(1) if priority_match else None
    # Reuse the guarded first line: indexing ``splitlines()[0]`` directly
    # raises IndexError when ``content`` is empty.
    task_status, _ = _extract_task_status(first_line)

    property_wikilinks, property_tags, property_block_refs = _extract_property_graph_tokens(
        properties
    )
    wikilinks = [*_extract_wikilinks(content), *property_wikilinks]
    tags = [*_extract_tags(content), *property_tags]
    repeater = properties.get("repeater")

    return node.model_copy(
        update={
            "content": content,
            "properties": properties,
            "properties_order": properties_order,
            "clean_text": clean_node_content(content, properties),
            "task_status": task_status,
            "task_priority": task_priority,
            "scheduled_at": scheduled_at,
            "deadline_at": deadline_at,
            "repeater": repeater if isinstance(repeater, str) else None,
            "wikilinks": wikilinks,
            "tags": tags,
            "refs": _merge_refs(wikilinks, tags),
            "block_refs": [*_extract_block_refs(content), *property_block_refs],
            "line_end": line_end if line_end is not None else node.line_end,
            "created_at": _first_normalized_timestamp(properties, CREATED_AT_KEYS),
            "updated_at": _first_normalized_timestamp(properties, UPDATED_AT_KEYS),
        }
    )
1033
+
1034
def _normalize_indent_levels(
    self, nodes: list[LogseqNode], depth: int = 0
) -> list[LogseqNode]:
    """Return copies of ``nodes`` whose ``indent_level`` equals their tree depth.

    Each node is copied with ``indent_level`` set to ``depth`` and with its
    children normalized recursively at ``depth + 1``. The input nodes are
    not modified.

    Args:
        nodes: Sibling nodes located at ``depth``.
        depth: Depth assigned to ``nodes``; ``0`` for top-level nodes.

    Returns:
        A new list of normalized node copies, in the original order.
    """
    child_depth = depth + 1
    return [
        node.model_copy(
            update={
                "indent_level": depth,
                "children": self._normalize_indent_levels(node.children, child_depth),
            }
        )
        for node in nodes
    ]
1044
+
1045
+
1046
# Backward-compatible alias: ``LogosParser`` is bound to the same class object
# as ``StackMachineParser``, so both names refer to one parser implementation.
LogosParser = StackMachineParser