logseq-matryca-parser 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,399 @@
1
+ """FORGE exporters implemented with AST visitors."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ import re
8
+ from collections.abc import Callable
9
+ from typing import Any
10
+
11
+ from .logos_core import ASTVisitor, LogseqNode, LogseqPage
12
+ from .logos_parser import LOGSEQ_PATTERNS
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ EmbedResolver = Callable[[str], tuple[str, str] | None]
17
+ """Maps a Logseq block id string to (Obsidian page title, anchor without leading ^)."""
18
+
19
+
20
def _flatten_nodes_preorder(nodes: list[LogseqNode]) -> list[LogseqNode]:
    """Return ``nodes`` and all of their descendants in depth-first preorder.

    Each node is listed before its children, and children keep the order they
    have in ``node.children``. Nodes with a falsy ``children`` attribute are
    treated as leaves.
    """
    ordered: list[LogseqNode] = []
    # Explicit stack: items are pushed reversed so the leftmost pops first.
    pending: list[LogseqNode] = list(reversed(nodes))
    while pending:
        current = pending.pop()
        ordered.append(current)
        if current.children:
            pending.extend(reversed(current.children))
    return ordered
27
+
28
+
29
def _node_identity_keys(node: LogseqNode) -> set[str]:
    """Return every lowercased id under which ``node`` can be matched.

    The set always contains ``node.uuid``. It also contains ``node.source_uuid``
    when that is truthy, and the ``id`` property when it is a string of exactly
    36 characters.
    """
    aliases = [node.uuid]
    if node.source_uuid:
        aliases.append(node.source_uuid)
    declared_id = node.properties.get("id")
    if isinstance(declared_id, str) and len(declared_id) == 36:
        aliases.append(declared_id)
    return {alias.lower() for alias in aliases}
37
+
38
+
39
def _outgoing_embed_ids(node: LogseqNode) -> set[str]:
    """Return the lowercased block ids that ``node`` points at.

    Ids come from two places: the entries of ``node.block_refs`` and every
    ``block_ref`` pattern match in ``node.clean_text`` (first capture group,
    falling back to the second).
    """
    collected: set[str] = {ref.lower() for ref in node.block_refs}
    inline_ids = (
        hit.group(1) or hit.group(2)
        for hit in LOGSEQ_PATTERNS["block_ref"].finditer(node.clean_text)
    )
    collected.update(ref_id.lower() for ref_id in inline_ids if ref_id)
    return collected
46
+
47
+
48
def _nodes_needing_trailing_anchor(
    flat: list[LogseqNode],
    *,
    vault_wide_ref_targets: set[str] | None = None,
) -> set[str]:
    """Synthetic UUIDs of nodes that should receive a trailing Obsidian block id.

    When ``vault_wide_ref_targets`` is provided (embed ids used anywhere in the
    vault; compared case-insensitively), any node whose identity keys intersect
    that set is marked — so cross-page ``((uuid))`` references still get a stable
    ``^`` anchor on the target block's line.

    When it is ``None``, only references made from within ``flat`` count: a node
    is marked when some *other* node (different ``uuid``) references one of its
    identity keys. A node that only references itself is not marked.

    Args:
        flat: Nodes of one page, typically in preorder.
        vault_wide_ref_targets: Optional set of referenced block ids.

    Returns:
        The ``uuid`` values of the nodes that need an anchor.
    """
    if vault_wide_ref_targets is not None:
        # Lowercase the targets once instead of once per node.
        wanted = {target.lower() for target in vault_wide_ref_targets}
        return {
            node.uuid
            for node in flat
            if not wanted.isdisjoint(_node_identity_keys(node))
        }

    # Scan each node's outgoing references exactly once and invert them into
    # "referenced id -> uuids of the nodes that reference it".
    referrers_by_id: dict[str, set[str]] = {}
    for referrer in flat:
        for ref_id in _outgoing_embed_ids(referrer):
            referrers_by_id.setdefault(ref_id, set()).add(referrer.uuid)

    need: set[str] = set()
    for target in flat:
        for key in _node_identity_keys(target):
            # Self-references do not count; another node must point here.
            if any(uuid != target.uuid for uuid in referrers_by_id.get(key, ())):
                need.add(target.uuid)
                break
    return need
73
+
74
+
75
def _allocate_obsidian_suffixes(flat: list[LogseqNode], need_anchor: set[str]) -> dict[str, str]:
    """Assign each anchored node a suffix (without ``^``) unique within ``flat``.

    Nodes are processed in list order and only those whose ``uuid`` is in
    ``need_anchor`` get an entry. Candidates are tried in this order:

    1. the first 8 characters of the uuid with hyphens removed,
    2. the whole uuid with hyphens removed,
    3. the uuid unchanged,
    4. the 8-character form followed by 1, 2, 3, ... until one is free.
    """
    taken: set[str] = set()
    suffix_by_uuid: dict[str, str] = {}
    for node in flat:
        uuid = node.uuid
        if uuid not in need_anchor:
            continue
        compact = uuid.replace("-", "")
        short = compact[:8]
        chosen = next(
            (option for option in (short, compact, uuid) if option not in taken),
            None,
        )
        counter = 0
        while chosen is None:
            counter += 1
            numbered = f"{short}{counter}"
            if numbered not in taken:
                chosen = numbered
        taken.add(chosen)
        suffix_by_uuid[uuid] = chosen
    return suffix_by_uuid
95
+
96
+
97
def _yaml_quote_key(key: str) -> str:
    """Return ``key`` ready for use as a YAML mapping key.

    Keys made of a letter or underscore followed by letters, digits,
    underscores or hyphens are returned unchanged; anything else is emitted as
    a JSON double-quoted string (non-ASCII characters kept as-is).
    """
    is_plain = re.fullmatch(r"[A-Za-z_][A-Za-z0-9_-]*", key) is not None
    return key if is_plain else json.dumps(key, ensure_ascii=False)
101
+
102
+
103
def _yaml_plain_scalar_is_ambiguous(text: str) -> bool:
    """Return True when ``text`` must be quoted to survive as a YAML string."""
    if not text:
        # ``key: `` with nothing after it parses as null, not "".
        return True
    if re.search(r'[:#"\[\]{}]|^\s|\s$', text):
        return True
    if text[0] in "-?!&*|>'%@`,":
        # Leading indicator characters start sequences, aliases, anchors, tags,
        # block scalars, quoted scalars or reserved tokens.
        return True
    if text.lower() in {"true", "false", "null", "yes", "no", "on", "off", "~"}:
        # Would be read back as a boolean or null instead of a string.
        return True
    number_like = (
        r"[-+]?(?:0x[0-9a-f_]+|0o[0-7_]+"
        r"|(?:\d[\d_]*\.?[\d_]*|\.\d[\d_]*)(?:e[-+]?\d+)?"
        r"|\.inf|\.nan)"
    )
    # A string such as "123" must not turn into a number when parsed.
    return re.fullmatch(number_like, text, re.IGNORECASE) is not None


def _yaml_quote_value(value: Any) -> str:
    """Render ``value`` as the right-hand side of a YAML ``key: value`` line.

    * ``None`` -> ``null``; booleans -> ``true`` / ``false``; ints -> ``str``.
    * Floats -> ``repr``, except NaN and infinities, which use the YAML
      spellings ``.nan``, ``.inf`` and ``-.inf``.
    * Strings containing a newline -> a ``|`` block scalar with each line
      indented. NOTE(review): plain ``|`` makes parsers append one trailing
      newline — confirm consumers tolerate that.
    * Other strings are emitted verbatim only when a YAML parser would read
      them back as the same string; empty strings, boolean/null look-alikes,
      number look-alikes, strings starting with a YAML indicator and strings
      with structural characters or edge whitespace are JSON double-quoted.
    * Anything else -> compact JSON, with unknown objects stringified.
    """
    if value is None:
        return "null"
    if isinstance(value, bool):
        # bool is a subclass of int, so it has to be handled first.
        return "true" if value else "false"
    if isinstance(value, int):
        return str(value)
    if isinstance(value, float):
        if value != value:  # NaN is the only float unequal to itself
            return ".nan"
        if value in (float("inf"), float("-inf")):
            return ".inf" if value > 0 else "-.inf"
        return repr(value)
    if isinstance(value, str):
        if "\n" in value:
            return "|\n" + "\n".join(" " + line for line in value.split("\n"))
        if _yaml_plain_scalar_is_ambiguous(value):
            return json.dumps(value, ensure_ascii=False)
        return value
    return json.dumps(value, default=str, ensure_ascii=False)
121
+
122
+
123
def _page_properties_to_yaml_frontmatter(properties: dict[str, Any]) -> str:
    """Render page properties as a ``---`` delimited YAML frontmatter block.

    Returns an empty string when ``properties`` is empty. Otherwise there is
    one ``key: value`` line per property, in dict order, and the block is
    followed by a blank line. Keys are passed through ``str`` before quoting.
    """
    if not properties:
        return ""
    rendered = ["---"]
    for name, raw_value in properties.items():
        rendered.append(f"{_yaml_quote_key(str(name))}: {_yaml_quote_value(raw_value)}")
    rendered.append("---")
    return "\n".join(rendered) + "\n\n"
128
+
129
+
130
def _build_local_embed_index(flat: list[LogseqNode]) -> dict[str, LogseqNode]:
    """Index the nodes of one page by every lowercased id they answer to.

    Each node is registered under its ``uuid``, its ``source_uuid`` when
    truthy, and its ``id`` property when that is a 36-character string. When
    two nodes share a key, the one later in ``flat`` wins.
    """
    lookup: dict[str, LogseqNode] = {}
    for block_node in flat:
        aliases = [block_node.uuid]
        if block_node.source_uuid:
            aliases.append(block_node.source_uuid)
        declared_id = block_node.properties.get("id")
        if isinstance(declared_id, str) and len(declared_id) == 36:
            aliases.append(declared_id)
        lookup.update((alias.lower(), block_node) for alias in aliases)
    return lookup
140
+
141
+
142
def _replace_block_refs_in_text(
    text: str,
    page_title: str,
    local_index: dict[str, LogseqNode],
    suffix_map: dict[str, str],
    embed_resolver: EmbedResolver | None,
) -> str:
    """Rewrite ``block_ref`` matches in ``text`` as ``[[Page#^anchor]]`` links.

    Resolution order for each matched id:

    1. ``embed_resolver`` (when given) — a non-``None`` result supplies the
       page title and anchor directly.
    2. ``local_index`` — looked up by the lowercased id; the link targets
       ``page_title`` with the anchor from ``suffix_map``, or the first 8
       characters of the target's hyphen-less uuid when it has no entry.
    3. Otherwise the matched text is left untouched.
    """

    def _link_for(hit: re.Match[str]) -> str:
        ref_id = hit.group(1) or hit.group(2)
        if not ref_id:
            return hit.group(0)

        # The resolver receives the id exactly as written (case preserved).
        external = embed_resolver(ref_id) if embed_resolver is not None else None
        if external is not None:
            target_page, target_anchor = external
            logger.debug(
                "embed_resolver mapped ref=%s -> [[%s#^%s]]",
                ref_id,
                target_page,
                target_anchor,
            )
            return f"[[{target_page}#^{target_anchor}]]"

        local_node = local_index.get(ref_id.lower())
        if local_node is None:
            logger.debug("Unresolved block ref %s in Obsidian export (same-page index miss)", ref_id)
            return hit.group(0)

        anchor = suffix_map.get(local_node.uuid)
        if anchor is None:
            anchor = local_node.uuid.replace("-", "")[:8]
        logger.debug("Same-page block ref %s -> [[%s#^%s]]", ref_id, page_title, anchor)
        return f"[[{page_title}#^{anchor}]]"

    return LOGSEQ_PATTERNS["block_ref"].sub(_link_for, text)
178
+
179
+
180
def _obsidian_line_source(node: LogseqNode) -> str:
    """Return the single-line text from which the node's Obsidian line is built.

    Uses the first line of ``node.content`` when it is non-empty, so that a
    ``((uuid))`` stripped from ``clean_text`` is still present; otherwise falls
    back to ``node.clean_text``. ``inline_uuid_prop`` matches are removed,
    remaining newlines become spaces, and surrounding whitespace is trimmed.
    """
    head = node.content.partition("\n")[0] if node.content else node.clean_text
    without_inline_id = LOGSEQ_PATTERNS["inline_uuid_prop"].sub("", head)
    return without_inline_id.replace("\n", " ").strip()
188
+
189
+
190
class JSONForgeVisitor(ASTVisitor):
    """Mirrors the visited tree as nested dicts that can be dumped to JSON."""

    def __init__(self) -> None:
        # Payloads of nodes visited while no other node was open.
        self._roots: list[dict[str, Any]] = []
        # Payloads of the currently open nodes, innermost last.
        self._stack: list[dict[str, Any]] = []

    def visit_node(self, node: LogseqNode) -> None:
        """Dump ``node``, attach it to the innermost open payload, and open it."""
        payload = {**node.model_dump(exclude={"children"}), "children": []}
        siblings = self._stack[-1]["children"] if self._stack else self._roots
        siblings.append(payload)
        self._stack.append(payload)

    def depart_node(self, node: LogseqNode) -> None:
        """Close the innermost open payload; ``node`` itself is not inspected."""
        self._stack.pop()

    def get_data(self) -> list[dict[str, Any]]:
        """Return the list of top-level payloads (the live list, not a copy)."""
        return self._roots

    def get_json(self, indent: int = 2) -> str:
        """Serialize the collected payloads with ``json.dumps``."""
        return json.dumps(self._roots, indent=indent)
215
+
216
+
217
class FlatListForgeVisitor(ASTVisitor):
    """Records one metadata dict per node, in the order nodes are visited."""

    def __init__(self) -> None:
        self._flat_items: list[dict[str, Any]] = []

    def visit_node(self, node: LogseqNode) -> None:
        """Store the node's dumped fields, leaving out ``children``."""
        snapshot = node.model_dump(exclude={"children"})
        self._flat_items.append(snapshot)

    def depart_node(self, node: LogseqNode) -> None:
        """Do nothing: the flat list keeps no nesting state to unwind."""

    def get_data(self) -> list[dict[str, Any]]:
        """Return the collected dicts (the live list, not a copy)."""
        return self._flat_items
231
+
232
+
233
class MarkdownForgeVisitor(ASTVisitor):
    """Renders the visited tree as a Markdown bullet list indented by depth."""

    def __init__(self) -> None:
        self._lines: list[str] = []
        # uuids of the currently open nodes; its length is the current depth.
        self._stack: list[str] = []

    def visit_node(self, node: LogseqNode) -> None:
        """Emit the node's bullet, then one line per property other than ``id``."""
        level = len(self._stack)
        flattened_text = node.clean_text.replace("\n", " ")
        self._lines.append(" " * level + "- " + flattened_text)
        if node.properties:
            self._lines.extend(
                f" {' ' * level} [:{name} {val}]"
                for name, val in node.properties.items()
                if name != "id"
            )
        self._stack.append(node.uuid)

    def depart_node(self, node: LogseqNode) -> None:
        """Step back out one nesting level; ``node`` itself is not inspected."""
        self._stack.pop()

    def get_markdown(self) -> str:
        """Return all emitted lines joined with newlines."""
        return "\n".join(self._lines)
257
+
258
+
259
class ObsidianForgeVisitor(ASTVisitor):
    """Renders the visited tree as an Obsidian list with ``^`` block ids.

    All lookup tables and the already rendered ``header`` are supplied by the
    caller; the visitor only formats lines.
    """

    def __init__(
        self,
        *,
        page_title: str,
        suffix_map: dict[str, str],
        needs_suffix: set[str],
        local_index: dict[str, LogseqNode],
        embed_resolver: EmbedResolver | None,
        header: str,
    ) -> None:
        # Rendering inputs.
        self._page_title = page_title
        self._suffix_map = suffix_map
        self._needs_suffix = needs_suffix
        self._local_index = local_index
        self._embed_resolver = embed_resolver
        self._header = header
        # Rendering state.
        self._lines: list[str] = []
        # uuids of the currently open nodes; its length is the current depth.
        self._stack: list[str] = []

    def visit_node(self, node: LogseqNode) -> None:
        """Emit one bullet for ``node`` with block refs rewritten as wikilinks.

        Nodes listed in ``needs_suffix`` get a trailing `` ^anchor``; the anchor
        comes from ``suffix_map`` or, when missing there, from the first 8
        characters of the hyphen-less uuid.
        """
        level = len(self._stack)
        rendered = _replace_block_refs_in_text(
            text=_obsidian_line_source(node),
            page_title=self._page_title,
            local_index=self._local_index,
            suffix_map=self._suffix_map,
            embed_resolver=self._embed_resolver,
        )
        if node.uuid in self._needs_suffix:
            fallback = node.uuid.replace("-", "")[:8]
            block_id = self._suffix_map.get(node.uuid, fallback)
            rendered = f"{rendered.rstrip()} ^{block_id}"
            logger.debug("Obsidian trailing block id uuid=%s ^%s", node.uuid, block_id)
        self._lines.append(" " * level + "- " + rendered)
        self._stack.append(node.uuid)

    def depart_node(self, node: LogseqNode) -> None:
        """Step back out one nesting level; ``node`` itself is not inspected."""
        self._stack.pop()

    def get_markdown(self) -> str:
        """Return the header followed by all emitted lines joined with newlines."""
        body = "\n".join(self._lines)
        return self._header + body
304
+
305
+
306
class ForgeExporter:
    """Entry points that turn Logseq nodes into export artifacts for AI ingestion."""

    @staticmethod
    def to_json(nodes: list[LogseqNode], indent: int = 2) -> str:
        """Return the trees rooted at ``nodes`` as one nested JSON document."""
        collector = JSONForgeVisitor()
        for root in nodes:
            root.accept(collector)
        return collector.get_json(indent=indent)

    @staticmethod
    def to_flat_list(nodes: list[LogseqNode]) -> list[dict[str, Any]]:
        """Return one metadata dict per node, without nesting, in visit order."""
        collector = FlatListForgeVisitor()
        for root in nodes:
            root.accept(collector)
        return collector.get_data()

    @staticmethod
    def to_clean_markdown(nodes: list[LogseqNode]) -> str:
        """Return the trees as a Markdown bullet list indented by depth."""
        renderer = MarkdownForgeVisitor()
        for root in nodes:
            root.accept(renderer)
        return renderer.get_markdown()

    @staticmethod
    def vault_wide_embed_targets(pages: list[LogseqPage]) -> set[str]:
        """Return the lowercased block ids referenced by any node of ``pages``.

        References are gathered from ``((uuid))`` matches and ``block_refs``.
        """
        referenced: set[str] = set()
        for page in pages:
            for block_node in _flatten_nodes_preorder(page.root_nodes):
                referenced.update(_outgoing_embed_ids(block_node))
        return referenced

    @staticmethod
    def build_vault_obsidian_suffix_map(
        pages: list[LogseqPage],
        *,
        vault_wide_ref_targets: set[str] | None = None,
    ) -> dict[str, str]:
        """Map block ``uuid`` to its ``^`` anchor suffix across all ``pages``.

        Only blocks that need an anchor (those matching a referenced id) get
        an entry. When ``vault_wide_ref_targets`` is ``None`` the referenced
        ids are computed from ``pages``. Suffixes are allocated page by page;
        a later page's entry replaces an earlier one for the same uuid.
        """
        if vault_wide_ref_targets is None:
            vault_wide_ref_targets = ForgeExporter.vault_wide_embed_targets(pages)
        combined: dict[str, str] = {}
        for page in pages:
            ordered = _flatten_nodes_preorder(page.root_nodes)
            anchored = _nodes_needing_trailing_anchor(
                ordered,
                vault_wide_ref_targets=vault_wide_ref_targets,
            )
            combined.update(_allocate_obsidian_suffixes(ordered, anchored))
        return combined

    @staticmethod
    def to_obsidian_markdown(
        nodes: list[LogseqNode],
        page_properties: dict[str, Any],
        *,
        embed_resolver: EmbedResolver | None = None,
        global_suffix_map: dict[str, str] | None = None,
        vault_wide_ref_targets: set[str] | None = None,
    ) -> str:
        """Render one page as Obsidian Markdown (frontmatter, list body, ``^`` ids).

        Args:
            nodes: Root nodes of the page.
            page_properties: Page properties; rendered as YAML frontmatter and
                the source of the page title (``title``, else ``Untitled``).
            embed_resolver: Optional callback tried first for each block ref.
            global_suffix_map: Optional uuid -> anchor map that takes
                precedence over anchors allocated for this page alone.
            vault_wide_ref_targets: Optional set of referenced block ids used
                to decide which blocks get a trailing anchor.
        """
        properties = dict(page_properties)
        title = str(properties.get("title") or "Untitled")
        ordered = _flatten_nodes_preorder(nodes)
        anchored = _nodes_needing_trailing_anchor(
            ordered,
            vault_wide_ref_targets=vault_wide_ref_targets,
        )
        page_local = _allocate_obsidian_suffixes(ordered, anchored)
        vault_level = global_suffix_map if global_suffix_map is not None else {}
        # Prefer the vault-level anchor; fall back to the page-local one.
        anchors: dict[str, str] = {
            block_node.uuid: (
                vault_level[block_node.uuid]
                if block_node.uuid in vault_level
                else page_local[block_node.uuid]
            )
            for block_node in ordered
            if block_node.uuid in anchored
        }
        renderer = ObsidianForgeVisitor(
            page_title=title,
            suffix_map=anchors,
            needs_suffix=anchored,
            local_index=_build_local_embed_index(ordered),
            embed_resolver=embed_resolver,
            header=_page_properties_to_yaml_frontmatter(properties),
        )
        for root in nodes:
            root.accept(renderer)
        return renderer.get_markdown()