logseq-matryca-parser 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,493 @@
1
+ """In-memory Logseq graph orchestration (no database)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import os
7
+ import uuid
8
+ from collections.abc import Callable
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
+ from pathlib import Path
11
+ from typing import Any, Self
12
+
13
+ from pydantic import BaseModel, ConfigDict, PrivateAttr
14
+
15
+ from logseq_matryca_parser.kinetic import _discover_graph_files
16
+ from logseq_matryca_parser.logos_core import LogseqNode, LogseqPage
17
+ from logseq_matryca_parser.logos_parser import StackMachineParser
18
+
19
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Ceiling for parser threads: CPU count (taken as 1 when it cannot be
# determined) plus four, and never more than 32.
_DEFAULT_MAX_WORKERS = min((os.cpu_count() or 1) + 4, 32)
22
+
23
+
24
+ class GraphQuery:
25
+ """Fluent, chainable filter pipeline over a fixed slice of ``LogseqNode`` instances."""
26
+
27
+ def __init__(self, graph: LogseqGraph, nodes: list[LogseqNode]) -> None:
28
+ self._graph = graph
29
+ self._nodes: list[LogseqNode] = list(nodes)
30
+ logger.debug(
31
+ "GraphQuery init graph_path=%s seed_nodes=%s",
32
+ self._graph.graph_path,
33
+ len(self._nodes),
34
+ )
35
+
36
+ def has_tag(self, tag: str) -> Self:
37
+ self._nodes = [n for n in self._nodes if tag in n.tags]
38
+ logger.debug("GraphQuery.has_tag tag=%s remaining=%s", tag, len(self._nodes))
39
+ return self
40
+
41
+ def with_priority(self, priority: str) -> Self:
42
+ self._nodes = [n for n in self._nodes if n.task_priority == priority]
43
+ logger.debug("GraphQuery.with_priority priority=%s remaining=%s", priority, len(self._nodes))
44
+ return self
45
+
46
+ def under_parent(self, parent_uuid: str) -> Self:
47
+ """Keep nodes whose ancestry chain (``path`` sans self) contains ``parent_uuid``."""
48
+ self._nodes = [n for n in self._nodes if len(n.path) > 1 and parent_uuid in n.path[:-1]]
49
+ logger.debug("GraphQuery.under_parent parent=%s remaining=%s", parent_uuid, len(self._nodes))
50
+ return self
51
+
52
+ def is_task_state(self, status: str) -> Self:
53
+ self._nodes = [n for n in self._nodes if n.task_status == status]
54
+ logger.debug("GraphQuery.is_task_state status=%s remaining=%s", status, len(self._nodes))
55
+ return self
56
+
57
+ def execute(self) -> list[LogseqNode]:
58
+ return list(self._nodes)
59
+
60
+
61
def _flatten_nodes(nodes: list[LogseqNode]) -> list[LogseqNode]:
    """Return every node reachable from ``nodes`` in depth-first pre-order.

    Each node is emitted before its ``children``, and siblings keep their
    original order. The walk is iterative, so outline depth is not bounded by
    the interpreter's recursion limit.

    Args:
        nodes: Root nodes of the forest to flatten; each node exposes a
            ``children`` iterable.

    Returns:
        A new list containing the roots and all of their descendants.
    """
    flat: list[LogseqNode] = []
    # Stack of partially consumed sibling iterators; the top one is the level
    # currently being walked.
    levels = [iter(nodes)]
    while levels:
        for node in levels[-1]:
            flat.append(node)
            if node.children:
                # Descend now; the parent iterator resumes after this subtree.
                levels.append(iter(node.children))
                break
        else:
            # Current level exhausted: go back up to the parent's siblings.
            levels.pop()
    return flat
69
+
70
+
71
def _normalize_backlink_key(token: str) -> str:
    """Canonicalize ``token`` into the key form used by the backlink registry.

    Surrounding whitespace is removed first. Text accepted by ``uuid.UUID`` is
    returned as that object's string form; any other text is lower-cased.
    Blank input yields the empty string.
    """
    candidate = token.strip()
    if candidate == "":
        return candidate
    try:
        parsed = uuid.UUID(candidate)
    except ValueError:
        # Not a UUID: treat it as a page title or tag name.
        return candidate.lower()
    return str(parsed)
80
+
81
+
82
def _append_backlink(registry: dict[str, list[str]], key: str, source_uuid: str) -> None:
    """Add ``source_uuid`` to the list stored under ``key``, creating it if needed.

    ``registry`` is mutated in place; duplicates are not filtered here.
    """
    registry.setdefault(key, []).append(source_uuid)
    logger.debug("backlink index: %s <- source=%s", key, source_uuid)
87
+
88
+
89
def _build_backlink_registry(pages: dict[str, LogseqPage]) -> dict[str, list[str]]:
    """Index every reference found in ``pages`` by its normalized target.

    For each node of each page, its wikilinks, then its tags, then its block
    refs are normalized with ``_normalize_backlink_key``; the node's UUID is
    appended under every non-empty key.
    """
    index: dict[str, list[str]] = {}
    for page in pages.values():
        for node in _flatten_nodes(page.root_nodes):
            # Same order as the three reference kinds are listed above.
            for references in (node.wikilinks, node.tags, node.block_refs):
                for raw in references:
                    target = _normalize_backlink_key(raw)
                    if target:
                        _append_backlink(index, target, node.uuid)
    logger.debug("backlink registry built: %s distinct targets", len(index))
    return index
108
+
109
+
110
def _parse_page_file_worker(path: Path) -> LogseqPage:
    """Parse the file at ``path`` with a parser created just for this call.

    A fresh ``StackMachineParser`` per invocation means no parser instance is
    shared between the pool threads that run this function.
    """
    parser = StackMachineParser()
    return parser.parse_page_file(path)
113
+
114
+
115
class LogseqGraph(BaseModel):
    """In-memory Logseq graph: parsed pages plus private lookup indexes.

    ``pages`` maps a page title to its parsed ``LogseqPage``. Two private
    dictionaries accompany it: node UUID -> ``LogseqNode``, and normalized
    link target -> UUIDs of the nodes that mention that target.
    """

    model_config = ConfigDict(strict=True, frozen=True)

    graph_path: Path
    pages: dict[str, LogseqPage]

    _node_registry: dict[str, LogseqNode] = PrivateAttr(default_factory=dict)
    _backlink_registry: dict[str, list[str]] = PrivateAttr(default_factory=dict)

    def __init__(
        self,
        graph_path: Path,
        pages: dict[str, LogseqPage],
        *,
        node_registry: dict[str, LogseqNode] | None = None,
        backlink_registry: dict[str, list[str]] | None = None,
    ) -> None:
        """Validate the public fields, then store shallow copies of the given indexes."""
        super().__init__(graph_path=graph_path, pages=pages)
        self._node_registry = {} if node_registry is None else dict(node_registry)
        self._backlink_registry = {} if backlink_registry is None else dict(backlink_registry)

    @classmethod
    def load_directory(cls, graph_path: Path) -> LogseqGraph:
        """Parse every discovered graph file on a thread pool and index the result.

        Files come from ``_discover_graph_files``. Parsed pages are inserted in
        order of their resolved source path, so a later path wins when two
        files yield the same title. A parse error raised by a worker propagates
        to the caller.
        """
        resolved = graph_path.expanduser().resolve()
        files = _discover_graph_files(resolved)

        if not files:
            logger.debug("LogseqGraph.load_directory: no markdown files under %s", resolved)
            return cls(graph_path=resolved, pages={}, node_registry={}, backlink_registry={})

        worker_count = min(_DEFAULT_MAX_WORKERS, len(files))
        logger.debug(
            "LogseqGraph.load_directory: parsing %s files with max_workers=%s",
            len(files),
            worker_count,
        )

        with ThreadPoolExecutor(max_workers=worker_count) as pool:
            pending = {pool.submit(_parse_page_file_worker, md_file): md_file for md_file in files}
            # ``result()`` re-raises whatever the worker raised.
            parsed = [(pending[done], done.result()) for done in as_completed(pending)]

        # Completion order is arbitrary; sort so indexing is reproducible.
        parsed.sort(key=lambda pair: str(pair[0].resolve()))

        pages: dict[str, LogseqPage] = {}
        node_registry: dict[str, LogseqNode] = {}
        for _source, page in parsed:
            pages[page.title] = page
            node_registry.update((node.uuid, node) for node in _flatten_nodes(page.root_nodes))

        backlink_registry = _build_backlink_registry(pages)

        logger.debug(
            "LogseqGraph.load_directory: indexed %s pages, %s nodes",
            len(pages),
            len(node_registry),
        )
        return cls(
            graph_path=resolved,
            pages=pages,
            node_registry=node_registry,
            backlink_registry=backlink_registry,
        )

    @property
    def tab_size(self) -> int:
        """Outline indentation width in spaces; always 2."""
        return 2

    def get_node_by_uuid(self, uuid: str) -> LogseqNode | None:
        """Look ``uuid`` up in the node registry; ``None`` when it is absent."""
        registry = self._node_registry
        return registry.get(uuid)

    def get_broken_references(self) -> list[LogseqNode]:
        """List nodes having at least one ``block_refs`` entry that is not a registry key.

        Each offending node appears once, reported for its first missing ref.
        """
        # NOTE(review): only registry keys are consulted here, whereas
        # ``get_node_by_embed_ref`` also matches ``source_uuid`` and
        # ``properties['id']`` - confirm this stricter check is intended.
        registry = self._node_registry
        nothing_missing = object()
        offenders: list[LogseqNode] = []
        for node in registry.values():
            first_missing = next(
                (ref for ref in (node.block_refs or ()) if ref not in registry),
                nothing_missing,
            )
            if first_missing is nothing_missing:
                continue
            offenders.append(node)
            logger.debug(
                "get_broken_references origin=%s missing_ref=%s",
                node.uuid,
                first_missing,
            )
        return offenders

    def get_node_by_embed_ref(self, block_ref: str) -> LogseqNode | None:
        """Find the node for ``block_ref``.

        Tries the registry key first, then scans for the first node whose
        ``source_uuid`` or ``properties['id']`` equals the stripped reference.
        """
        wanted = block_ref.strip()
        if not wanted:
            return None
        hit = self.get_node_by_uuid(wanted)
        if hit is not None:
            return hit
        for node in self._node_registry.values():
            if node.source_uuid == wanted or node.properties.get("id") == wanted:
                return node
        logger.debug("get_node_by_embed_ref: no node for ref=%s", wanted)
        return None

    def query(self) -> GraphQuery:
        """Start a ``GraphQuery`` seeded with every registered node."""
        everything = list(self._node_registry.values())
        return GraphQuery(self, everything)

    def get_backlinks(self, target: str) -> list[LogseqNode]:
        """Return the registered nodes indexed under the normalized ``target``.

        Source UUIDs are de-duplicated keeping first-seen order; UUIDs no
        longer in the node registry are skipped.
        """
        key = _normalize_backlink_key(target)
        if not key:
            return []
        registry = self._node_registry
        referrers = [
            node
            for source_id in dict.fromkeys(self._backlink_registry.get(key, []))
            if (node := registry.get(source_id)) is not None
        ]
        logger.debug("get_backlinks target=%s resolved=%s nodes", key, len(referrers))
        return referrers

    def _page_for_node(self, node: LogseqNode) -> LogseqPage | None:
        """Return the first page whose resolved ``source_path`` matches the node's."""
        if not node.source_path:
            return None
        wanted = Path(node.source_path).resolve()
        return next(
            (
                page
                for page in self.pages.values()
                if page.source_path and Path(page.source_path).resolve() == wanted
            ),
            None,
        )

    def get_effective_properties(self, node_uuid: str) -> dict[str, Any]:
        """Layer properties: owning page first, then each registered ``path`` entry in order.

        Later layers overwrite earlier ones. An unknown ``node_uuid`` gives ``{}``.
        """
        node = self.get_node_by_uuid(node_uuid)
        if node is None:
            return {}
        owner = self._page_for_node(node)
        merged: dict[str, Any] = {} if owner is None else dict(owner.properties)
        for step_uuid in node.path:
            step = self._node_registry.get(step_uuid)
            if step is not None:
                merged.update(step.properties)
        logger.debug(
            "get_effective_properties node_uuid=%s keys=%s",
            node_uuid,
            tuple(merged),
        )
        return merged

    def get_nodes_by_tag(self, tag: str) -> list[LogseqNode]:
        """Return every registered node whose ``tags`` contain ``tag``."""
        return [node for node in self._node_registry.values() if tag in node.tags]

    def search_content(self, query: str) -> list[LogseqNode]:
        """Return nodes whose ``clean_text`` contains ``query``; an empty query matches nothing."""
        if not query:
            return []
        return [node for node in self._node_registry.values() if query in node.clean_text]

    def resolve_relative_page_link(self, current_page_title: str, link_target: str) -> str | None:
        """Resolve ``link_target`` relative to the namespace of ``current_page_title``.

        Candidates are tried from the deepest namespace prefix of the current
        title outward, then the bare target; the first existing page title is
        returned, or ``None`` when none exists.
        """
        target = link_target.strip()
        if not target:
            return None
        scope = [segment for segment in current_page_title.split("/") if segment]
        while scope:
            candidate = "/".join(scope + [target])
            if candidate in self.pages:
                logger.debug(
                    "resolve_relative_page_link: contextual hit current=%s target=%s -> %s",
                    current_page_title,
                    link_target,
                    candidate,
                )
                return candidate
            scope.pop()  # widen: drop the innermost namespace segment
        if target not in self.pages:
            return None
        logger.debug(
            "resolve_relative_page_link: global fallback current=%s target=%s",
            current_page_title,
            link_target,
        )
        return target

    def get_namespace_children(self, namespace_prefix: str) -> list[LogseqPage]:
        """Return pages titled ``<prefix>/<name>`` where ``<name>`` has no ``/``, sorted by title."""
        prefix = namespace_prefix.strip().rstrip("/")
        if not prefix:
            return []
        needle = prefix + "/"
        skip = len(needle)
        children = sorted(
            (
                page
                for title, page in self.pages.items()
                if title.startswith(needle) and title[skip:] and "/" not in title[skip:]
            ),
            key=lambda child: child.title,
        )
        logger.debug(
            "get_namespace_children prefix=%s count=%s",
            prefix,
            len(children),
        )
        return children

    def _resolved_path_is_tracked_markdown(self, path: Path) -> bool:
        """Tell whether ``path`` is a ``.md`` path inside an existing ``pages/`` or ``journals/``."""
        candidate = path.resolve()
        if candidate.suffix.lower() != ".md":
            return False
        graph_root = self.graph_path.resolve()
        return any(
            folder.is_dir() and candidate.is_relative_to(folder)
            for folder in (graph_root / "pages", graph_root / "journals")
        )

    def _page_title_for_source_path(self, resolved_file: Path) -> str | None:
        """Return the title of the first page whose resolved source path is ``resolved_file``."""
        return next(
            (
                title
                for title, page in self.pages.items()
                if page.source_path and Path(page.source_path).resolve() == resolved_file
            ),
            None,
        )

    def _purge_stale_page_uuids(self, stale: set[str]) -> None:
        """Forget ``stale`` UUIDs: drop them as nodes and as backlink sources.

        Backlink targets left without any source are removed entirely.
        """
        for uid in stale:
            self._node_registry.pop(uid, None)
        backlinks = self._backlink_registry
        removed_targets = 0
        # Iterate over a snapshot of the keys so entries can be deleted in the loop.
        for target in list(backlinks):
            survivors = [source for source in backlinks[target] if source not in stale]
            if survivors:
                backlinks[target] = survivors
            else:
                del backlinks[target]
                removed_targets += 1
        logger.debug(
            "Stack-Machine incremental purge: stale_uuids=%s dead_backlink_keys=%s",
            len(stale),
            removed_targets,
        )

    def _register_page_nodes(self, page: LogseqPage) -> None:
        """Add every node of ``page`` to the node registry, keyed by UUID."""
        self._node_registry.update(
            (node.uuid, node) for node in _flatten_nodes(page.root_nodes)
        )

    def _append_page_backlinks(self, page: LogseqPage) -> None:
        """Index the wikilinks, tags and block refs of every node of ``page``."""
        backlinks = self._backlink_registry
        for node in _flatten_nodes(page.root_nodes):
            for references in (node.wikilinks, node.tags, node.block_refs):
                for raw in references:
                    target = _normalize_backlink_key(raw)
                    if target:
                        _append_backlink(backlinks, target, node.uuid)

    def invalidate_and_reload_page(self, file_path: Path) -> None:
        """Re-parse ``file_path`` and swap its page, nodes and backlinks into the graph.

        Paths that are not tracked Markdown are ignored. The file is parsed
        before anything is purged, so a parse error leaves the indexes as
        they were.
        """
        resolved = Path(file_path).expanduser().resolve()
        if not self._resolved_path_is_tracked_markdown(resolved):
            logger.debug("invalidate_and_reload_page: skip non-tracked path=%s", resolved)
            return
        fresh = StackMachineParser().parse_page_file(resolved)

        previous_title = self._page_title_for_source_path(resolved)
        if previous_title is None:
            stale: set[str] = set()
        else:
            stale = {node.uuid for node in _flatten_nodes(self.pages[previous_title].root_nodes)}
        self._purge_stale_page_uuids(stale)

        # Copy without the previous entry, then add the fresh page at the end.
        replacement = {
            title: page for title, page in self.pages.items() if title != previous_title
        }
        replacement[fresh.title] = fresh
        # The model is frozen, so the field is swapped via ``object.__setattr__``.
        object.__setattr__(self, "pages", replacement)

        self._register_page_nodes(fresh)
        self._append_page_backlinks(fresh)
        logger.debug(
            "Stack-Machine incremental re-hydrate: path=%s title=%s nodes=%s",
            resolved,
            fresh.title,
            len(_flatten_nodes(fresh.root_nodes)),
        )

    def start_watching(self, callback: Callable[[Path], None] | None = None) -> LogseqGraphWatcher:
        """Create a ``LogseqGraphWatcher`` for this graph, start it and return it.

        Starting the watcher imports ``watchdog``.
        """
        watcher = LogseqGraphWatcher(self, callback)
        return watcher.start()
441
+
442
+
443
class LogseqGraphWatcher:
    """Background ``watchdog`` observer that reloads Markdown pages as they change.

    File "modified" and "created" events under the graph directory are routed
    to ``LogseqGraph.invalidate_and_reload_page``; an optional user callback
    runs after each successful reload.
    """

    def __init__(self, graph: LogseqGraph, callback: Callable[[Path], None] | None = None) -> None:
        """Store the graph and optional callback; no observer runs until ``start``.

        Args:
            graph: Graph whose pages are reloaded on file events.
            callback: Called with the event path after a successful reload.
        """
        self._graph = graph
        self._callback = callback
        self._observer: Any = None

    def start(self) -> LogseqGraphWatcher:
        """Start observing the graph directory recursively and return ``self``.

        Calling ``start`` while an observer is already running is a no-op, so
        a second call cannot orphan the first observer thread.

        Raises:
            ImportError: If the optional ``watchdog`` package is not installed.
        """
        if self._observer is not None:
            logger.debug("Stack-Machine watcher: already started")
            return self

        from watchdog.events import FileSystemEventHandler
        from watchdog.observers import Observer

        graph = self._graph
        user_callback = self._callback

        def _route_event(event: Any) -> None:
            if getattr(event, "is_directory", False):
                return
            path = Path(str(event.src_path))
            if not graph._resolved_path_is_tracked_markdown(path):
                logger.debug("Stack-Machine watcher: ignore path=%s", path)
                return
            logger.debug("Stack-Machine watcher: invalidate path=%s", path)
            # This runs on the observer's thread. An exception escaping from
            # here would propagate into watchdog's dispatch loop, which is
            # expected to end event delivery, so failures are logged instead.
            try:
                graph.invalidate_and_reload_page(path)
                if user_callback is not None:
                    user_callback(path)
            except Exception:
                logger.exception("Stack-Machine watcher: reload failed path=%s", path)

        class _MarkdownGraphHandler(FileSystemEventHandler):
            def on_modified(self, event: Any) -> None:
                _route_event(event)

            def on_created(self, event: Any) -> None:
                _route_event(event)

        observer = Observer()
        observer.schedule(
            _MarkdownGraphHandler(),
            str(graph.graph_path.resolve()),
            recursive=True,
        )
        observer.start()
        self._observer = observer
        logger.debug("Stack-Machine watcher: started on graph_path=%s", graph.graph_path)
        return self

    def stop(self) -> None:
        """Stop the observer, wait up to five seconds for it, and forget it.

        Safe to call when the watcher was never started or is already stopped.
        """
        if self._observer is not None:
            self._observer.stop()
            self._observer.join(timeout=5)
            self._observer = None
            logger.debug("Stack-Machine watcher: stopped")