logseq-matryca-parser 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- logseq_matryca_parser/.gitignore +0 -0
- logseq_matryca_parser/NOTICE +7 -0
- logseq_matryca_parser/__init__.py +61 -0
- logseq_matryca_parser/__main__.py +8 -0
- logseq_matryca_parser/agent_press.py +99 -0
- logseq_matryca_parser/agent_writer.py +250 -0
- logseq_matryca_parser/exceptions.py +13 -0
- logseq_matryca_parser/forge.py +399 -0
- logseq_matryca_parser/graph.py +493 -0
- logseq_matryca_parser/kinetic.py +531 -0
- logseq_matryca_parser/lens.py +427 -0
- logseq_matryca_parser/logos_core.py +171 -0
- logseq_matryca_parser/logos_parser.py +1047 -0
- logseq_matryca_parser/pyproject.toml +0 -0
- logseq_matryca_parser/synapse.py +329 -0
- logseq_matryca_parser-0.3.0.dist-info/METADATA +279 -0
- logseq_matryca_parser-0.3.0.dist-info/RECORD +21 -0
- logseq_matryca_parser-0.3.0.dist-info/WHEEL +4 -0
- logseq_matryca_parser-0.3.0.dist-info/entry_points.txt +2 -0
- logseq_matryca_parser-0.3.0.dist-info/licenses/LICENSE +201 -0
- logseq_matryca_parser-0.3.0.dist-info/licenses/NOTICE +7 -0
|
@@ -0,0 +1,1047 @@
|
|
|
1
|
+
"""Stack-machine parser for deterministic Logseq AST construction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
import re
|
|
9
|
+
import uuid
|
|
10
|
+
from datetime import datetime, timezone
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
from .logos_core import LogseqNode, LogseqPage
|
|
15
|
+
|
|
16
|
+
# Inline-syntax patterns, keyed by the entity they recognise.
LOGSEQ_PATTERNS: dict[str, re.Pattern[str]] = {
    # ``key:: value`` line; group 1 is the key, group 2 the (possibly empty) raw value.
    "property": re.compile(r"^([\w-]+)::\s*(.*)$"),
    # ``[[Page]]`` link; non-greedy so adjacent links on one line match separately.
    "wikilink": re.compile(r"\[\[(.*?)\]\]"),
    # ``#[[bracketed tag]]`` (group 1) or bare ``#tag`` (group 2).
    "tag": re.compile(r"#\[\[([^\]]+)\]\]|#([^\s#\]]+)"),
    # Triple-paren reference with optional ``[alias]`` prefix (group 1) or plain ``((uuid))`` (group 2).
    "block_ref": re.compile(
        r"(?:\[[^\]]+\])?\(\(\(([a-f0-9\-]{36})\)\)\)|\(\(([a-f0-9\-]{36})\)\)"
    ),
    # A text that consists only of ``id:: <36-char uuid>``.
    "uuid_prop": re.compile(r"^id::\s*([a-f0-9\-]{36})$"),
    # ``id:: <36-char uuid>`` appearing anywhere inside a line.
    "inline_uuid_prop": re.compile(r"\bid::\s*([a-f0-9\-]{36})\b"),
}
# Task markers recognised at the very start of a block's first line.
TASK_STATUSES: tuple[str, ...] = (
    "TODO",
    "DOING",
    "DONE",
    "LATER",
    "NOW",
    "WAITING",
    "CANCELED",
)
# ``SCHEDULED: <...>`` / ``DEADLINE: <...>``; group 1 is the marker, group 2 the bracketed payload.
TIME_PATTERN: re.Pattern[str] = re.compile(r"\b(SCHEDULED|DEADLINE):\s*(<[^>]+>)")
# ``[#A]``-style priority cookie; group 1 is the single uppercase letter.
PRIORITY_PATTERN: re.Pattern[str] = re.compile(r"\[#([A-Z])\]")
# Prefix of the placeholder tokens that ``_shield_inline_code`` substitutes for masked spans.
_SHIELD_TOKEN_PREFIX = "___LOGOS_SHIELD_TOKEN_"
# Markdown heading on an already-stripped line; group 1 is the ``#`` run, group 2 the title.
HEADING_PATTERN: re.Pattern[str] = re.compile(r"^(#{1,6})\s+(.+)$")
# ``[alias](((uuid)))``; group 1 keeps the bracketed alias so it can survive cleaning.
ALIASED_BLOCK_REF_PATTERN: re.Pattern[str] = re.compile(
    r"(\[[^\]]+\])\(\(\([a-f0-9\-]{36}\)\)\)"
)
# Bare ``((uuid))`` block reference.
PLAIN_BLOCK_REF_PATTERN: re.Pattern[str] = re.compile(r"\(\(([a-f0-9\-]{36})\)\)")

# Line prefixes that ``is_system_block`` treats as metadata/noise (all case-insensitive).
SYSTEM_BLOCK_PATTERNS: tuple[re.Pattern[str], ...] = (
    re.compile(r"^\s*:(?:LOGBOOK|PROPERTIES):", re.IGNORECASE),
    re.compile(r"^\s*END:", re.IGNORECASE),
    re.compile(r"^\s*CLOCK:", re.IGNORECASE),
    re.compile(r"^\s*collapsed::", re.IGNORECASE),
)

# Bullet line: group 1 is the leading whitespace, group 2 the text after ``-``/``*``.
BULLET_PATTERN: re.Pattern[str] = re.compile(r"^(\s*)[-*]\s+(.*)$")
# Un-bulleted heading line: group 1 is the leading whitespace, group 2 the heading including ``#``s.
HEADING_BLOCK_PATTERN: re.Pattern[str] = re.compile(r"^(\s*)(#{1,6}\s+.+)$")
logger = logging.getLogger(__name__)

# Page-property spellings consulted, in order, for creation / update timestamps.
CREATED_AT_KEYS: tuple[str, ...] = ("created_at", "created-at", "createdat")
UPDATED_AT_KEYS: tuple[str, ...] = ("updated_at", "updated-at", "updatedat")
# Repeater cookie inside a SCHEDULED/DEADLINE payload, e.g. ``+1w``, ``++2d``, ``.+3m``.
REPEATER_PATTERN: re.Pattern[str] = re.compile(r"(\.\+|\+\+|\+)\d+[hdwmy]")
# Full ``CLOCK: [start]--[end] => HH:MM:SS`` line; groups are start, end and duration text.
CLOCK_PATTERN: re.Pattern[str] = re.compile(
    r"^\s*CLOCK:\s*\[([0-9]{4}-[0-9]{2}-[0-9]{2}\s+[A-Za-z]{3}\s+[0-9]{2}:[0-9]{2})\]\s*--\s*"
    r"\[([0-9]{4}-[0-9]{2}-[0-9]{2}\s+[A-Za-z]{3}\s+[0-9]{2}:[0-9]{2})\]\s*=>\s*([0-9]{2}:[0-9]{2}:[0-9]{2})\s*$"
)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def is_system_block(line: str) -> bool:
    """Tell whether ``line`` is Logseq bookkeeping rather than block content.

    Args:
        line: One raw line of page text (leading whitespace allowed).

    Returns:
        True as soon as any entry of ``SYSTEM_BLOCK_PATTERNS`` matches at the
        start of the line, otherwise False.
    """
    for system_pattern in SYSTEM_BLOCK_PATTERNS:
        if system_pattern.match(line):
            return True
    return False
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def clean_node_content(raw_content: str, properties: dict[str, Any]) -> str:
    """Reduce raw block text to its human-readable content.

    Fenced code is passed through untouched (the fence lines themselves are
    stripped of surrounding whitespace). Outside fences, lines that start with
    one of the block's own ``key::`` properties are dropped, and the remaining
    lines lose SCHEDULED/DEADLINE markers, inline ``id::`` properties, block
    references (an alias, if present, is kept), a leading bullet dash, heading
    hashes and priority cookies. A task marker is removed from the first line
    only. Lines that end up empty are discarded.

    Args:
        raw_content: The block text as stored on the node, possibly multi-line.
        properties: The block's properties; only the keys are used.

    Returns:
        The cleaned lines joined with ``"\\n"`` and stripped.
    """
    # ``str.startswith`` accepts a tuple; an empty tuple never matches.
    property_prefixes = tuple(f"{key}::" for key in properties)
    kept: list[str] = []
    inside_fence = False

    for index, source_line in enumerate(raw_content.splitlines()):
        bare = source_line.strip()

        if _is_code_fence_line(bare):
            inside_fence = not inside_fence
            kept.append(bare)
            continue
        if inside_fence:
            # Code keeps its original indentation.
            kept.append(source_line)
            continue
        if bare.startswith(property_prefixes):
            continue

        # Substitution order matters: aliased references must be rewritten
        # before the plain-reference pattern gets a chance to see them.
        text = TIME_PATTERN.sub("", source_line)
        text = LOGSEQ_PATTERNS["inline_uuid_prop"].sub("", text)
        text = ALIASED_BLOCK_REF_PATTERN.sub(r"\1", text)
        text = PLAIN_BLOCK_REF_PATTERN.sub("", text)
        text = re.sub(r"^\*\*(.+?)\s\*\*$", r"\1", text.strip())
        text = re.sub(r"^\s*-\s+", "", text).strip()

        heading = HEADING_PATTERN.match(text)
        if heading is not None:
            text = heading.group(2).strip()
        if index == 0:
            # Only the first line of a block can carry the task marker.
            text = _extract_task_status(text)[1]
        text = PRIORITY_PATTERN.sub("", text).strip()
        text = re.sub(r"\s{2,}", " ", text).strip()

        if text:
            kept.append(text)

    return "\n".join(kept).strip()
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _is_code_fence_line(stripped_line: str) -> bool:
|
|
107
|
+
return stripped_line.startswith("```")
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _try_open_fence_line(content: str, line_start: int, n: int) -> tuple[int, int] | None:
|
|
111
|
+
"""If the line starting at ``line_start`` opens a fenced code block, return (tick_index, tick_len)."""
|
|
112
|
+
k = line_start
|
|
113
|
+
while k < n and content[k] in " \t":
|
|
114
|
+
k += 1
|
|
115
|
+
if k >= n or content[k] != "`":
|
|
116
|
+
return None
|
|
117
|
+
tick_end = k
|
|
118
|
+
while tick_end < n and content[tick_end] == "`":
|
|
119
|
+
tick_end += 1
|
|
120
|
+
tick_len = tick_end - k
|
|
121
|
+
if tick_len < 3:
|
|
122
|
+
return None
|
|
123
|
+
return (k, tick_len)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _fence_line_is_closing(line: str, tick_len: int) -> bool:
|
|
127
|
+
stripped = line.strip()
|
|
128
|
+
if not stripped or stripped[0] != "`":
|
|
129
|
+
return False
|
|
130
|
+
run = 0
|
|
131
|
+
while run < len(stripped) and stripped[run] == "`":
|
|
132
|
+
run += 1
|
|
133
|
+
remainder = stripped[run:].strip()
|
|
134
|
+
return run >= tick_len and remainder == ""
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _fence_region_end(content: str, tick_start: int, tick_len: int, n: int) -> int:
    """Locate the exclusive end index of the fenced region opened at ``tick_start``.

    The region runs through the newline that terminates the closing fence
    line. An unterminated fence extends to ``n``.

    Args:
        content: Full text being scanned.
        tick_start: Index of the first backtick of the opening fence.
        tick_len: Number of backticks in the opening fence.
        n: Exclusive upper bound for scanning (callers pass ``len(content)``).
    """
    opener_eol = content.find("\n", tick_start + tick_len)
    if opener_eol == -1:
        return n

    cursor = opener_eol + 1
    while cursor < n:
        eol = content.find("\n", cursor)
        if eol == -1:
            # Final line without a newline: closing or not, the region ends at n.
            return n
        if _fence_line_is_closing(content[cursor:eol], tick_len):
            return eol + 1
        cursor = eol + 1
    return n
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _find_inline_code_close(content: str, body_start: int, tick_len: int, n: int) -> int:
|
|
155
|
+
p = body_start
|
|
156
|
+
while p < n:
|
|
157
|
+
if content[p] == "`":
|
|
158
|
+
q = p
|
|
159
|
+
while q < n and content[q] == "`":
|
|
160
|
+
q += 1
|
|
161
|
+
if q - p == tick_len:
|
|
162
|
+
return q
|
|
163
|
+
p += 1
|
|
164
|
+
return -1
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _consume_inline_code_span(content: str, i: int, n: int) -> tuple[str, int]:
    """Slice out the inline code span whose opening backticks start at ``i``.

    Returns:
        ``(span_text, end)`` where ``span_text`` includes both delimiters and
        ``end`` is the exclusive end index. When no closing run is found the
        span extends to ``n``.
    """
    opener_end = i
    while opener_end < n and content[opener_end] == "`":
        opener_end += 1

    span_end = _find_inline_code_close(content, opener_end, opener_end - i, n)
    if span_end == -1:
        span_end = n
    return content[i:span_end], span_end
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _shield_inline_code(content: str) -> tuple[str, list[str]]:
    """Replace code and macro spans with opaque placeholder tokens.

    Fenced code blocks, ``{{...}}`` macros and backtick-delimited inline code
    are each swapped for ``<_SHIELD_TOKEN_PREFIX><index>___`` so that entity
    regexes run afterwards cannot match inside them.

    Returns:
        ``(shielded_text, literals)`` where ``literals[index]`` is the original
        text behind the placeholder carrying that index.
    """
    literals: list[str] = []
    parts: list[str] = []
    total = len(content)
    pos = 0

    def mask(segment: str) -> None:
        # The token carries the index the segment is about to occupy.
        parts.append(f"{_SHIELD_TOKEN_PREFIX}{len(literals)}___")
        literals.append(segment)

    while pos < total:
        # Fences are only recognised at the start of a line.
        if pos == 0 or content[pos - 1] == "\n":
            fence = _try_open_fence_line(content, pos, total)
            if fence is not None:
                fence_end = _fence_region_end(content, fence[0], fence[1], total)
                mask(content[pos:fence_end])
                pos = fence_end
                continue

        if content.startswith("{{", pos):
            macro_close = content.find("}}", pos + 2)
            if macro_close == -1:
                # Unterminated macro swallows the rest of the text.
                mask(content[pos:total])
                break
            mask(content[pos : macro_close + 2])
            pos = macro_close + 2
            continue

        char = content[pos]
        if char == "`":
            span, pos = _consume_inline_code_span(content, pos, total)
            mask(span)
            continue

        parts.append(char)
        pos += 1

    return "".join(parts), literals
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _extract_task_status(first_line: str) -> tuple[str | None, str]:
    """Split a leading task marker off ``first_line``.

    Returns:
        ``(status, remainder)`` when the line starts with one of
        ``TASK_STATUSES`` followed by a space (the remainder is stripped), or
        ``(None, first_line)`` unchanged when no marker is present.
    """
    matched = next(
        (status for status in TASK_STATUSES if first_line.startswith(f"{status} ")),
        None,
    )
    if matched is None:
        return None, first_line
    # +1 skips the single space that follows the marker.
    return matched, first_line[len(matched) + 1 :].strip()
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _extract_time_properties(raw_content: str) -> dict[str, Any]:
    """Collect SCHEDULED/DEADLINE markers found outside fenced code.

    For each marker the raw ``<...>`` payload is stored under the lower-cased
    marker name. When the payload (minus any repeater cookie) parses as a
    date, ``<marker>_journal_day``, ``<marker>_iso`` and ``<marker>_at`` are
    added too; ``_at`` is an epoch computed by treating the naive datetime as
    UTC. A repeater cookie, if present, is stored under ``"repeater"``.
    """
    extracted: dict[str, Any] = {}
    inside_fence = False

    for raw_line in raw_content.splitlines():
        if _is_code_fence_line(raw_line.strip()):
            inside_fence = not inside_fence
            continue
        if inside_fence:
            continue

        for marker in TIME_PATTERN.finditer(raw_line):
            name = marker.group(1).lower()
            stamp = marker.group(2)
            extracted[name] = stamp

            payload = stamp.strip("<>")
            repeat_hit = REPEATER_PATTERN.search(payload)
            if repeat_hit is None:
                repeater = None
            else:
                repeater = repeat_hit.group(0)
                # The date parser does not understand repeater cookies.
                payload = REPEATER_PATTERN.sub("", payload).strip()

            moment = _parse_logseq_datetime(payload)
            if moment is not None:
                extracted.update(
                    {
                        f"{name}_journal_day": int(moment.strftime("%Y%m%d")),
                        f"{name}_iso": moment.isoformat(timespec="seconds"),
                        f"{name}_at": int(moment.replace(tzinfo=timezone.utc).timestamp()),
                    }
                )
            if repeater is not None:
                extracted["repeater"] = repeater

    return extracted
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def _extract_tags(raw_content: str) -> list[str]:
    """Return the tags mentioned in ``raw_content``, in order of appearance.

    Code spans and macros are shielded first so ``#`` inside them is ignored.
    Trailing ``. , ; :`` punctuation is trimmed from each tag, and a tag that
    is nothing but such punctuation (for example ``#.``) is discarded instead
    of being reported as an empty string.
    """
    shielded, _ = _shield_inline_code(raw_content)
    tags: list[str] = []
    for bracketed, simple in LOGSEQ_PATTERNS["tag"].findall(shielded):
        tag = (bracketed or simple).rstrip(".,;:")
        # Re-check after trimming: punctuation-only matches collapse to "".
        if tag:
            tags.append(tag)
    return tags
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def _extract_block_refs(raw_content: str) -> list[str]:
    """Return the UUIDs of every block reference in ``raw_content``, in order.

    Code spans and macros are shielded first. Each match contributes the UUID
    from whichever alternative of the ``block_ref`` pattern fired.
    """
    masked = _shield_inline_code(raw_content)[0]
    candidates = (
        aliased or plain
        for aliased, plain in LOGSEQ_PATTERNS["block_ref"].findall(masked)
    )
    return [ref for ref in candidates if ref]
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def _extract_heading_level(content: str) -> int | None:
    """Return the heading depth (1-6) of the first line, or None if it is not a heading."""
    lines = content.splitlines()
    opening = lines[0].strip() if lines else ""
    heading = HEADING_PATTERN.match(opening)
    if heading is None:
        return None
    # Depth equals the number of leading ``#`` characters.
    return len(heading.group(1))
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def _strip_ordinal_suffix(value: str) -> str:
|
|
291
|
+
return re.sub(r"\b([0-9]{1,2})(st|nd|rd|th)\b", r"\1", value, flags=re.IGNORECASE)
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def _parse_logseq_datetime(raw_value: str) -> datetime | None:
    """Parse a Logseq date/time string into a naive ``datetime``.

    Ordinal suffixes are removed first, then each known layout is tried in
    order and the first one that fits wins.

    Returns:
        The parsed naive datetime, or None when no layout matches.
    """
    candidate = _strip_ordinal_suffix(raw_value.strip())
    # Most specific layouts first so a time-of-day component is not lost.
    known_layouts = (
        "%Y-%m-%d %a %H:%M",
        "%Y-%m-%d %H:%M",
        "%Y-%m-%d %a",
        "%Y-%m-%d",
        "%b %d, %Y",
        "%Y_%m_%d",
        "%a, %d-%m-%Y",
    )
    for layout in known_layouts:
        try:
            parsed = datetime.strptime(candidate, layout)
        except ValueError:
            continue
        else:
            return parsed
    return None
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def resolve_journal_day(value: str) -> int | None:
    """Turn a journal-style string into a ``YYYYMMDD`` integer.

    Surrounding ``[[ ]]`` brackets and a trailing ``.md`` (any case) are
    removed before the remainder is handed to ``_parse_logseq_datetime``.

    Returns:
        The date as an integer such as ``20240131``, or None when the input is
        blank or does not parse as a date.
    """
    text = value.strip()
    if not text:
        return None

    if text.startswith("[[") and text.endswith("]]"):
        text = text[2:-2].strip()
    if text.lower().endswith(".md"):
        text = text[:-3]

    moment = _parse_logseq_datetime(text)
    if moment is None:
        return None
    # Same value as int(strftime("%Y%m%d")), computed arithmetically.
    return moment.year * 10000 + moment.month * 100 + moment.day
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def normalize_logseq_timestamp(value: Any) -> int | None:
    """Normalize Logseq-style timestamp values to unix epoch seconds.

    Accepted inputs:
      * ``int`` / ``float`` epochs; values of at least ``10**12`` are treated
        as milliseconds and divided by 1000.
      * Strings of ASCII digits, interpreted like numeric epochs.
      * ISO-8601 strings (a ``Z`` is read as ``+00:00``); naive values are
        taken as UTC.
      * Any layout understood by ``_parse_logseq_datetime``, plus
        ``YYYY/MM/DD`` and ``YYYYMMDD``, all taken as UTC.

    Args:
        value: The raw property value.

    Returns:
        Epoch seconds, or None for ``None``, booleans, non-finite floats,
        blank strings, unsupported types and anything unparseable.
    """
    # bool is an int subclass; True/False are never meaningful timestamps.
    if value is None or isinstance(value, bool):
        return None

    if isinstance(value, (int, float)):
        try:
            timestamp = int(value)
        except (ValueError, OverflowError):
            # int(nan) raises ValueError, int(+/-inf) raises OverflowError.
            return None
        return timestamp // 1000 if timestamp >= 10**12 else timestamp

    if isinstance(value, str):
        candidate = value.strip()
        if not candidate:
            return None
        # isdigit() alone also accepts characters such as "²" that int() rejects.
        if candidate.isascii() and candidate.isdigit():
            parsed = int(candidate)
            return parsed // 1000 if parsed >= 10**12 else parsed

        iso_candidate = candidate.replace("Z", "+00:00")
        try:
            parsed_datetime = datetime.fromisoformat(iso_candidate)
        except ValueError:
            pass
        else:
            if parsed_datetime.tzinfo is None:
                parsed_datetime = parsed_datetime.replace(tzinfo=timezone.utc)
            return int(parsed_datetime.timestamp())

        parsed_logseq_date = _parse_logseq_datetime(candidate)
        if parsed_logseq_date is not None:
            return int(parsed_logseq_date.replace(tzinfo=timezone.utc).timestamp())

        date_formats = ("%Y/%m/%d", "%Y%m%d")
        for fmt in date_formats:
            try:
                parsed_date = datetime.strptime(candidate, fmt).replace(tzinfo=timezone.utc)
            except ValueError:
                continue
            return int(parsed_date.timestamp())

    return None
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def _first_normalized_timestamp(properties: dict[str, Any], keys: tuple[str, ...]) -> int | None:
    """Return the first usable timestamp found under any of ``keys``.

    Keys are tried in the given order; a key whose value cannot be normalized
    is skipped rather than ending the search.
    """
    for candidate_key in keys:
        if candidate_key not in properties:
            continue
        stamp = normalize_logseq_timestamp(properties[candidate_key])
        if stamp is not None:
            return stamp
    return None
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def _merge_refs(wikilinks: list[str], tags: list[str]) -> list[str]:
|
|
382
|
+
merged: list[str] = []
|
|
383
|
+
seen: set[str] = set()
|
|
384
|
+
for token in [*wikilinks, *tags]:
|
|
385
|
+
if token and token not in seen:
|
|
386
|
+
seen.add(token)
|
|
387
|
+
merged.append(token)
|
|
388
|
+
return merged
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
def _extract_property_graph_tokens(
    properties: dict[str, Any],
) -> tuple[list[str], list[str], list[str]]:
    """Harvest graph entities that appear inside string property values.

    Non-string values are ignored.

    Returns:
        ``(wikilinks, tags, block_refs)``, each listing matches in property
        order with duplicates preserved.
    """
    text_values = [value for value in properties.values() if isinstance(value, str)]
    wikilinks = [link for text in text_values for link in _extract_wikilinks(text)]
    tags = [tag for text in text_values for tag in _extract_tags(text)]
    block_refs = [ref for text in text_values for ref in _extract_block_refs(text)]
    return wikilinks, tags, block_refs
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
def _extract_wikilinks(raw_content: str) -> list[str]:
    """Return the target of every ``[[...]]`` link outside code spans and macros."""
    masked = _shield_inline_code(raw_content)[0]
    return LOGSEQ_PATTERNS["wikilink"].findall(masked)
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
class PageRegistry:
    """UUID-keyed index of nodes, used to look blocks up by reference."""

    def __init__(self) -> None:
        # Maps node uuid -> most recently registered node carrying that uuid.
        self.blocks: dict[str, LogseqNode] = dict()

    def register(self, node: LogseqNode) -> None:
        """Index ``node`` under its uuid; nodes with a falsy uuid are ignored."""
        key = node.uuid
        if not key:
            return
        self.blocks[key] = node

    def resolve(self, node_uuid: str) -> LogseqNode | None:
        """Return the node registered under ``node_uuid``, or None if unknown."""
        return self.blocks.get(node_uuid, None)
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
class StackMachineParser:
|
|
426
|
+
"""O(N) indentation parser that builds a strict immutable AST."""
|
|
427
|
+
|
|
428
|
+
def __init__(self, tab_size: int = 2) -> None:
|
|
429
|
+
self.tab_size = tab_size
|
|
430
|
+
self.registry = PageRegistry()
|
|
431
|
+
|
|
432
|
+
def parse(self, text: str, page_title: str = "untitled") -> LogseqPage:
    """Parse Logseq markdown text into a `LogseqPage`.

    The text is consumed line by line. Bullets and un-bulleted headings open
    new nodes whose parent is decided by an indentation stack; every other
    line either updates the most recent node (properties, drawers,
    ``collapsed::``, soft-break continuations, fenced code) or, before the
    first node appears, contributes to the page-level properties.

    Args:
        text: Full markdown content of the page.
        page_title: Title stored on the page; ``/`` separates namespace
            segments.

    Returns:
        The assembled page with its root nodes, page properties, collected
        refs, optional created/updated timestamps and namespace chain.
    """
    # Three parallel stacks describing the chain of currently open ancestors:
    # the node, its logical indent level and its raw leading whitespace.
    stack: list[LogseqNode] = []
    stack_columns: list[int] = []
    stack_indents: list[str] = []
    root_nodes: list[LogseqNode] = []
    page_properties: dict[str, Any] = {}
    current_node: LogseqNode | None = None
    # True until the first line that is not a leading page property.
    frontmatter_active = True
    # Indent level of the last property with an empty value, if any; bullets
    # nested deeper than it are pulled up by one level.
    property_list_indent_level: int | None = None
    in_code_block = False
    in_drawer = False

    for line_number, raw_line in enumerate(text.splitlines(), start=1):
        stripped_line = raw_line.strip()

        # Inside a fence every line, including the closing fence, is appended
        # verbatim to the node that owns the fence.
        if in_code_block and current_node is not None:
            merged_content = f"{current_node.content}\n{raw_line}"
            updated = self._refresh_node(current_node, merged_content, line_end=line_number)
            self._replace_stack_tail_node(stack, root_nodes, updated)
            current_node = updated
            if _is_code_fence_line(stripped_line):
                in_code_block = False
            frontmatter_active = False
            property_list_indent_level = None
            continue

        if in_drawer:
            if stripped_line.upper() == ":END:":
                in_drawer = False
                continue
            if BULLET_PATTERN.match(raw_line):
                # A bullet implicitly ends an unterminated drawer; fall
                # through so the bullet is handled normally below.
                in_drawer = False
            else:
                if current_node is not None:
                    properties = dict(current_node.properties)
                    logbook_entries = list(properties.get("logbook", []))
                    logbook_entries.append(stripped_line)
                    properties["logbook"] = logbook_entries
                    clock_match = CLOCK_PATTERN.match(stripped_line)
                    if clock_match:
                        start_text, end_text, duration_text = clock_match.groups()
                        try:
                            start_dt = datetime.strptime(start_text, "%Y-%m-%d %a %H:%M")
                            end_dt = datetime.strptime(end_text, "%Y-%m-%d %a %H:%M")
                        except ValueError:
                            # The regex only checks the shape; an impossible
                            # date or an unrecognised weekday token must not
                            # abort the page. The raw line stays in "logbook".
                            logger.debug(
                                "Unparseable CLOCK timestamps at line=%s; "
                                "keeping raw logbook entry only",
                                line_number,
                            )
                        else:
                            hours, minutes, seconds = [
                                int(part) for part in duration_text.split(":")
                            ]
                            duration_seconds = (hours * 3600) + (minutes * 60) + seconds
                            clock_entries = list(properties.get("clock", []))
                            clock_entries.append(
                                {
                                    "start_iso": start_dt.isoformat(timespec="seconds"),
                                    "end_iso": end_dt.isoformat(timespec="seconds"),
                                    "duration": duration_text,
                                    "duration_seconds": duration_seconds,
                                }
                            )
                            properties["clock"] = clock_entries
                    updated = self._refresh_node(
                        current_node,
                        current_node.content,
                        properties_override=properties,
                        line_end=line_number,
                    )
                    self._replace_stack_tail_node(stack, root_nodes, updated)
                    current_node = updated
                continue

        # Opening a logbook drawer guarantees the node has a "logbook" list.
        if stripped_line.upper() == ":LOGBOOK:" and current_node is not None:
            in_drawer = True
            properties = dict(current_node.properties)
            properties.setdefault("logbook", [])
            updated = self._refresh_node(
                current_node,
                current_node.content,
                properties_override=properties,
                line_end=line_number,
            )
            self._replace_stack_tail_node(stack, root_nodes, updated)
            current_node = updated
            continue

        # ``collapsed::`` is stored as a real boolean instead of a string.
        collapsed_match = re.match(r"^\s*collapsed::\s*(\S+)\s*$", raw_line, re.IGNORECASE)
        if collapsed_match and current_node is not None:
            collapsed_value = collapsed_match.group(1).lower() == "true"
            properties = dict(current_node.properties)
            properties["collapsed"] = collapsed_value
            updated = self._refresh_node(
                current_node,
                current_node.content,
                properties_override=properties,
                line_end=line_number,
            )
            self._replace_stack_tail_node(stack, root_nodes, updated)
            current_node = updated
            continue

        if not stripped_line or is_system_block(raw_line):
            continue

        bullet_match = BULLET_PATTERN.match(raw_line)
        if bullet_match:
            indent_level = self._compute_indent_level(bullet_match.group(1))
            if property_list_indent_level is not None and indent_level > property_list_indent_level:
                indent_level -= 1
            else:
                property_list_indent_level = None

            raw_indent = bullet_match.group(1)
            # Mixed indentation: a space-indented bullet that computes exactly
            # one level below a tab-indented stack tip is treated as its sibling.
            if (
                stack_columns
                and "\t" in stack_indents[-1]
                and raw_indent
                and "\t" not in raw_indent
                and indent_level == stack_columns[-1] + 1
            ):
                indent_level = stack_columns[-1]

            # Close every open node that is not a strict ancestor of this bullet.
            while stack_columns and stack_columns[-1] >= indent_level:
                stack.pop()
                stack_columns.pop()
                stack_indents.pop()

            parent_uuid = self._resolve_parent_uuid_for_synthetic(stack)
            node = self._build_node(
                block_text=bullet_match.group(2),
                indent_level=indent_level,
                page_title=page_title,
                line_start=line_number,
                parent_uuid=parent_uuid,
            )

            node = self._initialize_node_graph_fields(node, stack, root_nodes)
            if stack:
                node = self._attach_node_to_parent(stack, root_nodes, node)
            else:
                root_nodes.append(node)

            stack.append(node)
            stack_columns.append(indent_level)
            stack_indents.append(raw_indent)
            current_node = node
            self.registry.register(node)
            frontmatter_active = False
            continue

        # Headings without a bullet open a node exactly like a bullet does,
        # minus the property-list and mixed-indent adjustments.
        heading_match = HEADING_BLOCK_PATTERN.match(raw_line)
        if heading_match:
            indent_level = self._compute_indent_level(heading_match.group(1))
            property_list_indent_level = None

            raw_indent = heading_match.group(1)

            while stack_columns and stack_columns[-1] >= indent_level:
                stack.pop()
                stack_columns.pop()
                stack_indents.pop()

            parent_uuid = self._resolve_parent_uuid_for_synthetic(stack)
            node = self._build_node(
                block_text=heading_match.group(2),
                indent_level=indent_level,
                page_title=page_title,
                line_start=line_number,
                parent_uuid=parent_uuid,
            )

            node = self._initialize_node_graph_fields(node, stack, root_nodes)
            if stack:
                node = self._attach_node_to_parent(stack, root_nodes, node)
            else:
                root_nodes.append(node)

            stack.append(node)
            stack_columns.append(indent_level)
            stack_indents.append(raw_indent)
            current_node = node
            self.registry.register(node)
            frontmatter_active = False
            continue

        property_match = LOGSEQ_PATTERNS["property"].match(raw_line.strip())
        if property_match:
            key, value = property_match.groups()

            # Properties ahead of the first block belong to the page.
            if current_node is None and frontmatter_active:
                page_properties[key] = value
                continue

            # Property before any block but after front matter ended: dropped.
            if current_node is None:
                frontmatter_active = False
                continue

            properties = dict(current_node.properties)
            properties[key] = value
            properties_order = list(current_node.properties_order)
            if key not in properties_order:
                properties_order.append(key)

            updated = self._refresh_node(
                current_node,
                current_node.content,
                properties_override=properties,
                properties_order_override=properties_order,
                line_end=line_number,
            )
            if key == "id":
                # An explicit id replaces the synthetic identity marker.
                updated = updated.model_copy(
                    update={"source_uuid": value, "synthetic_id": False}
                )
            self._replace_stack_tail_node(stack, root_nodes, updated)
            current_node = updated
            self.registry.register(updated)

            raw_indent = raw_line[: len(raw_line) - len(raw_line.lstrip(" \t"))]
            # An empty value marks the start of a property list; remember its
            # level so the bullets that follow can be re-levelled.
            property_list_indent_level = (
                self._compute_indent_level(raw_indent) if value.strip() == "" else None
            )
            frontmatter_active = False
            continue

        # Free text before any block ends the front matter and is discarded.
        if not stack:
            frontmatter_active = False
            continue

        # Anything else is a soft-break continuation of the innermost open node.
        active_node = stack[-1]
        merged_content = f"{active_node.content}\n{raw_line}"
        updated = self._refresh_node(active_node, merged_content, line_end=line_number)
        self._replace_stack_tail_node(stack, root_nodes, updated)
        current_node = updated
        logger.debug(
            "Soft-break continuation merged into stack tip line=%s depth=%s",
            line_number,
            len(stack),
        )
        frontmatter_active = False
        property_list_indent_level = None
        if _is_code_fence_line(stripped_line):
            in_code_block = True

    self._validate_references(root_nodes)
    root_nodes = self._normalize_indent_levels(root_nodes)
    page_refs = self._collect_page_refs(root_nodes)
    created_at = _first_normalized_timestamp(page_properties, CREATED_AT_KEYS)
    updated_at = _first_normalized_timestamp(page_properties, UPDATED_AT_KEYS)
    # Every title segment except the last forms the namespace chain.
    title_segments = [segment for segment in page_title.split("/") if segment]
    namespace_chain = title_segments[:-1] if len(title_segments) > 1 else []
    return LogseqPage(
        title=page_title,
        raw_content=text,
        properties=page_properties,
        refs=page_refs,
        created_at=created_at,
        updated_at=updated_at,
        namespace_chain=namespace_chain,
        root_nodes=root_nodes,
    )
|
|
687
|
+
|
|
688
|
+
def parse_file(self, path: Path | str) -> list[LogseqNode]:
    """Compatibility wrapper: parse ``path`` and hand back only its root nodes."""
    return self.parse_page_file(path).root_nodes
|
|
692
|
+
|
|
693
|
+
def parse_page_file(self, path: Path | str) -> LogseqPage:
    """Read a markdown file (UTF-8) and parse it into a page model.

    A file that is empty or whitespace-only yields a bare page titled with the
    file stem, after logging a warning. Otherwise the parsed page is enriched
    with the resolved source path, the derived graph root and, where the page
    properties supplied none, timestamps taken from the file system.

    Args:
        path: Location of the markdown file.

    Returns:
        The page, with ``source_path`` also stamped onto every node.
    """
    path = Path(path)
    content = path.read_text(encoding="utf-8")

    if not content.strip():
        logger.warning("Il file %s è vuoto.", path)
        resolved_empty = path.resolve()
        # NOTE(review): this branch uses the file stem and parent directory
        # rather than _derive_page_title/_derive_graph_root — confirm intended.
        return LogseqPage(
            title=path.stem,
            raw_content=content,
            namespace_chain=[],
            source_path=str(resolved_empty),
            graph_root=str(resolved_empty.parent),
        )

    page = self.parse(content, page_title=self._derive_page_title(path))
    graph_root = self._derive_graph_root(path)

    # Fall back to file-system times only when the page declared none.
    # (os.path.getctime is metadata-change time on Unix, creation time on Windows.)
    created_at = page.created_at if page.created_at is not None else int(os.path.getctime(path))
    updated_at = page.updated_at if page.updated_at is not None else int(os.path.getmtime(path))

    source_path = str(path.resolve())
    enriched_fields = {
        "source_path": source_path,
        "graph_root": str(graph_root),
        "created_at": created_at,
        "updated_at": updated_at,
        "root_nodes": self._apply_source_path(page.root_nodes, source_path),
    }
    return page.model_copy(update=enriched_fields)
|
|
726
|
+
|
|
727
|
+
def _derive_page_title(self, path: Path) -> str:
|
|
728
|
+
resolved_path = path.resolve()
|
|
729
|
+
if resolved_path.suffix == ".md":
|
|
730
|
+
resolved_path = resolved_path.with_suffix("")
|
|
731
|
+
parts = list(resolved_path.parts)
|
|
732
|
+
if "pages" in parts:
|
|
733
|
+
page_index = parts.index("pages")
|
|
734
|
+
return "/".join(parts[page_index + 1 :])
|
|
735
|
+
if "journals" in parts:
|
|
736
|
+
journal_index = parts.index("journals")
|
|
737
|
+
return "/".join(parts[journal_index + 1 :])
|
|
738
|
+
return path.stem
|
|
739
|
+
|
|
740
|
+
def _derive_graph_root(self, path: Path) -> Path:
|
|
741
|
+
resolved_path = path.resolve()
|
|
742
|
+
marker_dirs = {"pages", "journals", "assets", "logseq"}
|
|
743
|
+
for parent in resolved_path.parents:
|
|
744
|
+
if parent.name in marker_dirs:
|
|
745
|
+
return parent.parent.resolve()
|
|
746
|
+
return resolved_path.parent.resolve()
|
|
747
|
+
|
|
748
|
+
def _apply_source_path(self, nodes: list[LogseqNode], source_path: str) -> list[LogseqNode]:
|
|
749
|
+
return [
|
|
750
|
+
node.model_copy(
|
|
751
|
+
update={
|
|
752
|
+
"source_path": source_path,
|
|
753
|
+
"children": self._apply_source_path(node.children, source_path),
|
|
754
|
+
}
|
|
755
|
+
)
|
|
756
|
+
for node in nodes
|
|
757
|
+
]
|
|
758
|
+
|
|
759
|
+
def _compute_indent_level(self, indentation: str) -> int:
    """Convert a run of indentation characters into an indent level.

    Every space counts as one column and every tab as ``self.tab_size``
    columns; the level is the column total floor-divided by
    ``self.tab_size``.

    Args:
        indentation: The whitespace that precedes a block marker.

    Returns:
        The computed indentation level.
    """
    tab_width = self.tab_size
    width = indentation.count(" ") + indentation.count("\t") * tab_width
    level = width // tab_width
    logger.debug(
        "Computed indentation level via floor division: spaces=%s tab_size=%s level=%s",
        width,
        tab_width,
        level,
    )
    return level
|
|
768
|
+
|
|
769
|
+
def _resolve_parent_uuid_for_synthetic(self, stack: list[LogseqNode]) -> str | None:
    """Pick the parent UUID that feeds synthetic UUID hashing.

    Args:
        stack: Current ancestor stack; its last entry is the direct parent.

    Returns:
        The UUID of the node on top of the stack, or ``None`` when the stack
        is empty.
    """
    if stack:
        parent_uuid = stack[-1].uuid
        logger.debug(
            "Stack depth=%s: synthetic UUID parent_uuid=%s", len(stack), parent_uuid
        )
        return parent_uuid
    logger.debug("Stack empty: synthetic UUID parent_uuid=None (hashed as root sentinel)")
    return None
|
|
779
|
+
|
|
780
|
+
def _build_node(
    self,
    block_text: str,
    indent_level: int,
    page_title: str,
    line_start: int,
    parent_uuid: str | None,
) -> LogseqNode:
    """Create a childless ``LogseqNode`` from the raw text of one block.

    The text is stripped, an inline ``id`` property (when matched) is recorded
    and cut out of the content, and a deterministic UUID is derived from the
    page title, line number, remaining text and ``parent_uuid``. Time
    properties, task status and priority, heading level, wikilinks, tags and
    block references are then extracted through the module helpers.

    Args:
        block_text: Raw block text; surrounding whitespace is stripped.
        indent_level: Indentation level stored on the node as given.
        page_title: Page title fed into the deterministic UUID.
        line_start: Line number stored as both ``line_start`` and ``line_end``.
        parent_uuid: UUID of the parent block used for hashing, or ``None``.

    Returns:
        A new node with ``parent_id=None`` and an empty ``children`` list.
    """
    text = block_text.strip()
    props: dict[str, Any] = {}

    # Both patterns are matched against the text *before* the inline id is
    # removed from it.
    leading_id = LOGSEQ_PATTERNS["uuid_prop"].match(text)
    inline_pattern = LOGSEQ_PATTERNS["inline_uuid_prop"]
    inline_id = inline_pattern.search(text)
    if inline_id is not None:
        props["id"] = inline_id.group(1)
        text = inline_pattern.sub("", text).strip()

    # A match of the "uuid_prop" pattern wins over the inline form.
    if leading_id:
        source_uuid = leading_id.group(1)
    elif inline_id:
        source_uuid = inline_id.group(1)
    else:
        source_uuid = None

    node_uuid = self._deterministic_uuid(page_title, line_start, text, parent_uuid)

    scheduled_at: int | None = None
    deadline_at: int | None = None
    extracted_time = _extract_time_properties(text)
    if extracted_time:
        scheduled_candidate = extracted_time.get("scheduled_at")
        if isinstance(scheduled_candidate, int):
            scheduled_at = scheduled_candidate
        deadline_candidate = extracted_time.get("deadline_at")
        if isinstance(deadline_candidate, int):
            deadline_at = deadline_candidate
        # Everything except the two timestamps is merged into the properties.
        for key, value in extracted_time.items():
            if key not in ("scheduled_at", "deadline_at"):
                props[key] = value

    first_line = text.splitlines()[0].strip() if text else ""
    priority_hit = PRIORITY_PATTERN.search(first_line)
    task_priority = None if priority_hit is None else priority_hit.group(1)
    task_status, _ = _extract_task_status(text)

    heading_level = _extract_heading_level(text)
    if heading_level is not None:
        props["heading_level"] = heading_level

    prop_links, prop_tags, prop_block_refs = _extract_property_graph_tokens(props)
    wikilinks = [*_extract_wikilinks(text), *prop_links]
    tags = [*_extract_tags(text), *prop_tags]
    properties_order = ["id"] if "id" in props else []

    # Derived values are computed in the same order in which the node fields
    # are listed below.
    clean_text = clean_node_content(text, props)
    refs = _merge_refs(wikilinks, tags)
    block_refs = [*_extract_block_refs(text), *prop_block_refs]
    repeater_value = props.get("repeater")
    repeater = repeater_value if isinstance(repeater_value, str) else None
    created_at = _first_normalized_timestamp(props, CREATED_AT_KEYS)
    updated_at = _first_normalized_timestamp(props, UPDATED_AT_KEYS)

    return LogseqNode(
        uuid=node_uuid,
        source_uuid=source_uuid,
        synthetic_id=source_uuid is None,
        content=text,
        clean_text=clean_text,
        indent_level=indent_level,
        properties=props,
        properties_order=properties_order,
        wikilinks=wikilinks,
        tags=tags,
        refs=refs,
        block_refs=block_refs,
        task_status=task_status,
        task_priority=task_priority,
        scheduled_at=scheduled_at,
        deadline_at=deadline_at,
        repeater=repeater,
        parent_id=None,
        line_start=line_start,
        line_end=line_start,
        created_at=created_at,
        updated_at=updated_at,
        children=[],
    )
|
|
856
|
+
|
|
857
|
+
def _deterministic_uuid(
    self,
    page_title: str,
    line_start: int,
    content: str,
    parent_uuid: str | None,
) -> str:
    """Derive a reproducible UUID string for a block.

    The page title, line number, parent token and content are joined with
    ``:``, hashed with SHA-256, and the hex digest is turned into a version-5
    UUID under ``uuid.NAMESPACE_DNS``.

    Args:
        page_title: Title of the page the block belongs to.
        line_start: Line number of the block.
        content: Block text that takes part in the hash.
        parent_uuid: UUID of the parent block; ``None`` is hashed as ``"root"``.

    Returns:
        The string form of the derived UUID.
    """
    parent_token = parent_uuid if parent_uuid is not None else "root"
    logger.debug(
        "Stack-Machine synthetic UUID payload parent_token=%s line_start=%s page_title=%s",
        parent_token,
        line_start,
        page_title,
    )
    fingerprint = f"{page_title}:{line_start}:{parent_token}:{content}"
    hex_digest = hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()
    derived = uuid.uuid5(uuid.NAMESPACE_DNS, hex_digest)
    return str(derived)
|
|
874
|
+
|
|
875
|
+
def _replace_stack_tail_node(
|
|
876
|
+
self,
|
|
877
|
+
stack: list[LogseqNode],
|
|
878
|
+
root_nodes: list[LogseqNode],
|
|
879
|
+
updated_node: LogseqNode,
|
|
880
|
+
) -> None:
|
|
881
|
+
if not stack:
|
|
882
|
+
return
|
|
883
|
+
|
|
884
|
+
stack[-1] = updated_node
|
|
885
|
+
if len(stack) == 1:
|
|
886
|
+
root_nodes[-1] = updated_node
|
|
887
|
+
return
|
|
888
|
+
|
|
889
|
+
parent = stack[-2]
|
|
890
|
+
parent_children = list(parent.children)
|
|
891
|
+
parent_children[-1] = updated_node
|
|
892
|
+
updated_parent = parent.model_copy(update={"children": parent_children})
|
|
893
|
+
stack[-2] = updated_parent
|
|
894
|
+
|
|
895
|
+
if len(stack) == 2:
|
|
896
|
+
root_nodes[-1] = updated_parent
|
|
897
|
+
return
|
|
898
|
+
|
|
899
|
+
grand_parent = stack[-3]
|
|
900
|
+
grand_parent_children = list(grand_parent.children)
|
|
901
|
+
grand_parent_children[-1] = updated_parent
|
|
902
|
+
stack[-3] = grand_parent.model_copy(update={"children": grand_parent_children})
|
|
903
|
+
|
|
904
|
+
def _attach_node_to_parent(
|
|
905
|
+
self,
|
|
906
|
+
stack: list[LogseqNode],
|
|
907
|
+
root_nodes: list[LogseqNode],
|
|
908
|
+
node: LogseqNode,
|
|
909
|
+
) -> LogseqNode:
|
|
910
|
+
parent = stack[-1]
|
|
911
|
+
attached_node = node.model_copy(update={"parent_id": parent.uuid})
|
|
912
|
+
updated_ancestor = attached_node
|
|
913
|
+
|
|
914
|
+
for idx in range(len(stack) - 1, -1, -1):
|
|
915
|
+
ancestor = stack[idx]
|
|
916
|
+
ancestor_children = list(ancestor.children)
|
|
917
|
+
if idx == len(stack) - 1:
|
|
918
|
+
ancestor_children.append(updated_ancestor)
|
|
919
|
+
else:
|
|
920
|
+
ancestor_children[-1] = updated_ancestor
|
|
921
|
+
updated_ancestor = ancestor.model_copy(update={"children": ancestor_children})
|
|
922
|
+
stack[idx] = updated_ancestor
|
|
923
|
+
|
|
924
|
+
root_nodes[-1] = stack[0]
|
|
925
|
+
return attached_node
|
|
926
|
+
|
|
927
|
+
def _initialize_node_graph_fields(
|
|
928
|
+
self,
|
|
929
|
+
node: LogseqNode,
|
|
930
|
+
stack: list[LogseqNode],
|
|
931
|
+
root_nodes: list[LogseqNode],
|
|
932
|
+
) -> LogseqNode:
|
|
933
|
+
left_id = self._resolve_left_sibling_id(stack, root_nodes)
|
|
934
|
+
if stack:
|
|
935
|
+
parent = stack[-1]
|
|
936
|
+
path = [*parent.path, node.uuid]
|
|
937
|
+
outline_path = [*parent.outline_path, len(parent.children) + 1]
|
|
938
|
+
else:
|
|
939
|
+
path = [node.uuid]
|
|
940
|
+
outline_path = [len(root_nodes) + 1]
|
|
941
|
+
return node.model_copy(
|
|
942
|
+
update={"left_id": left_id, "path": path, "outline_path": outline_path}
|
|
943
|
+
)
|
|
944
|
+
|
|
945
|
+
def _resolve_left_sibling_id(
|
|
946
|
+
self, stack: list[LogseqNode], root_nodes: list[LogseqNode]
|
|
947
|
+
) -> str | None:
|
|
948
|
+
if stack:
|
|
949
|
+
parent = stack[-1]
|
|
950
|
+
return parent.children[-1].uuid if parent.children else None
|
|
951
|
+
return root_nodes[-1].uuid if root_nodes else None
|
|
952
|
+
|
|
953
|
+
def _collect_page_refs(self, roots: list[LogseqNode]) -> list[str]:
|
|
954
|
+
collected: list[str] = []
|
|
955
|
+
seen: set[str] = set()
|
|
956
|
+
|
|
957
|
+
def visit(nodes: list[LogseqNode]) -> None:
|
|
958
|
+
for node in nodes:
|
|
959
|
+
for token in node.refs:
|
|
960
|
+
if token not in seen:
|
|
961
|
+
seen.add(token)
|
|
962
|
+
collected.append(token)
|
|
963
|
+
visit(node.children)
|
|
964
|
+
|
|
965
|
+
visit(roots)
|
|
966
|
+
return collected
|
|
967
|
+
|
|
968
|
+
def _validate_references(self, roots: list[LogseqNode]) -> None:
|
|
969
|
+
_ = roots
|
|
970
|
+
|
|
971
|
+
def _refresh_node(
    self,
    node: LogseqNode,
    content: str,
    properties_override: dict[str, Any] | None = None,
    properties_order_override: list[str] | None = None,
    line_end: int | None = None,
) -> LogseqNode:
    """Return a copy of ``node`` whose derived fields are recomputed from ``content``.

    Properties start from ``properties_override`` (or a copy of the node's
    own properties), are extended with extracted time properties and heading
    level, and are then used to recompute clean text, task status/priority,
    schedule/deadline, repeater, wikilinks, tags, refs, block refs and the
    created/updated timestamps.

    Args:
        node: The node to refresh; it is not modified.
        content: New block content. May be empty.
        properties_override: Properties to use instead of ``node.properties``.
        properties_order_override: Property order to use instead of
            ``node.properties_order``.
        line_end: New ``line_end``; the node's current value is kept when
            ``None``.

    Returns:
        The refreshed copy of ``node``.
    """
    properties = dict(node.properties) if properties_override is None else dict(properties_override)
    properties_order = (
        list(node.properties_order)
        if properties_order_override is None
        else list(properties_order_override)
    )
    time_properties = _extract_time_properties(content)
    scheduled_at: int | None = None
    deadline_at: int | None = None
    if time_properties:
        scheduled_raw = time_properties.get("scheduled_at")
        deadline_raw = time_properties.get("deadline_at")
        # Only integer values are accepted for the two timestamp fields.
        scheduled_at = scheduled_raw if isinstance(scheduled_raw, int) else None
        deadline_at = deadline_raw if isinstance(deadline_raw, int) else None
        merge_time = {
            key: value
            for key, value in time_properties.items()
            if key not in ("scheduled_at", "deadline_at")
        }
        properties.update(merge_time)
    heading_level = _extract_heading_level(content)
    if heading_level is not None:
        properties["heading_level"] = heading_level
    # str.splitlines() returns [] for empty content, so the first line is
    # resolved once with a fallback and shared by the priority and the
    # task-status lookup; indexing [0] directly would raise IndexError.
    content_lines = content.splitlines()
    first_line = content_lines[0].strip() if content_lines else ""
    priority_match = PRIORITY_PATTERN.search(first_line)
    task_priority = priority_match.group(1) if priority_match else None
    task_status, _ = _extract_task_status(first_line)
    property_wikilinks, property_tags, property_block_refs = _extract_property_graph_tokens(
        properties
    )
    wikilinks = [*_extract_wikilinks(content), *property_wikilinks]
    tags = [*_extract_tags(content), *property_tags]
    return node.model_copy(
        update={
            "content": content,
            "properties": properties,
            "properties_order": properties_order,
            "clean_text": clean_node_content(content, properties),
            "task_status": task_status,
            "task_priority": task_priority,
            "scheduled_at": scheduled_at,
            "deadline_at": deadline_at,
            "repeater": (
                properties.get("repeater") if isinstance(properties.get("repeater"), str) else None
            ),
            "wikilinks": wikilinks,
            "tags": tags,
            "refs": _merge_refs(wikilinks, tags),
            "block_refs": [*_extract_block_refs(content), *property_block_refs],
            "line_end": line_end if line_end is not None else node.line_end,
            "created_at": _first_normalized_timestamp(properties, CREATED_AT_KEYS),
            "updated_at": _first_normalized_timestamp(properties, UPDATED_AT_KEYS),
        }
    )
|
|
1033
|
+
|
|
1034
|
+
def _normalize_indent_levels(
|
|
1035
|
+
self, nodes: list[LogseqNode], depth: int = 0
|
|
1036
|
+
) -> list[LogseqNode]:
|
|
1037
|
+
normalized_nodes: list[LogseqNode] = []
|
|
1038
|
+
for node in nodes:
|
|
1039
|
+
normalized_children = self._normalize_indent_levels(node.children, depth + 1)
|
|
1040
|
+
normalized_nodes.append(
|
|
1041
|
+
node.model_copy(update={"indent_level": depth, "children": normalized_children})
|
|
1042
|
+
)
|
|
1043
|
+
return normalized_nodes
|
|
1044
|
+
|
|
1045
|
+
|
|
1046
|
+
# Backward-compatible alias: ``LogosParser`` is bound to the same class object
# as ``StackMachineParser``.
LogosParser = StackMachineParser
|