logseq-matryca-parser 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,329 @@
1
+ """SYNAPSE adapters implemented with AST visitors."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import re
7
+ from pathlib import Path
8
+ from typing import TYPE_CHECKING, Any
9
+
10
+ from logseq_matryca_parser.logos_core import ASTVisitor, LogseqNode, LogseqPage
11
+ from logseq_matryca_parser.logos_parser import LogosParser
12
+
13
+ if TYPE_CHECKING:
14
+ from logseq_matryca_parser.graph import LogseqGraph
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
# Matches a block embed macro of the form ``{{embed ((<uuid>))}}`` and captures the
# 8-4-4-4-12 hexadecimal uuid in the named group ``uuid``. Whitespace is optional
# after ``{{`` and before ``}}`` and required between ``embed`` and ``((``.
# NOTE(review): IGNORECASE is set here but not on the page pattern below, so an
# upper-case ``{{EMBED ...}}`` is recognised for blocks only - confirm this
# asymmetry is intended.
_BLOCK_EMBED_PATTERN = re.compile(
    r"\{\{\s*embed\s+\(\((?P<uuid>[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})\)\)\s*\}\}",
    re.IGNORECASE,
)
# Matches a page embed macro of the form ``{{embed [[Title]]}}`` and captures every
# character up to the first ``]`` in the named group ``title``.
_PAGE_EMBED_PATTERN = re.compile(r"\{\{\s*embed\s+\[\[(?P<title>[^\]]+)\]\]\s*\}\}")
23
+
24
+ Document: type[Any] | None
25
+ NodeRelationship: Any
26
+ RelatedNodeInfo: type[Any] | None
27
+ TextNode: type[Any] | None
28
+
29
+ try:
30
+ from langchain_core.documents import Document # type: ignore
31
+ except ImportError:
32
+ Document = None
33
+
34
+ try:
35
+ from llama_index.core.schema import ( # type: ignore
36
+ NodeRelationship,
37
+ RelatedNodeInfo,
38
+ TextNode,
39
+ )
40
+ except ImportError:
41
+ NodeRelationship = None
42
+ RelatedNodeInfo = None
43
+ TextNode = None
44
+
45
+
46
def _flatten_nodes_for_export(nodes: list[LogseqNode]) -> list[LogseqNode]:
    """Return every node of the forest ``nodes`` in depth-first pre-order.

    A node is always emitted before its ``children`` and siblings keep their
    original left-to-right order.
    """
    ordered: list[LogseqNode] = []
    # Explicit stack; siblings are pushed reversed so the leftmost one pops first.
    pending: list[LogseqNode] = list(nodes)[::-1]
    while pending:
        current = pending.pop()
        ordered.append(current)
        if current.children:
            pending.extend(list(current.children)[::-1])
    return ordered
54
+
55
+
56
def _strip_markdown_for_embedding(text: str) -> str:
    """Return ``text`` with common inline markup removed, for embedding-friendly breadcrumbs.

    The substitutions run in this order:

    1. ``#[[multi word tag]]`` tags are dropped entirely.
    2. ``[[Page]]`` / ``[[Page|alias]]`` links are reduced to ``Page``.
    3. ``**bold**``, ``*italic*`` and inline code spans keep only their inner text.
    4. ``#tag`` hashtags are dropped entirely.
    5. Runs of whitespace collapse to one space and the ends are trimmed.
    """
    s = text.strip()
    # Bracketed tags must go before wikilinks are unwrapped: otherwise
    # ``#[[deep work]]`` becomes ``#deep work`` and the hashtag rule below removes
    # only ``#deep``, leaking the rest of the tag into the output.
    s = re.sub(r"#\[\[[^\]]+\]\]", "", s)
    s = re.sub(r"\[\[([^\]|]+)(?:\|[^\]]+)?\]\]", r"\1", s)
    s = re.sub(r"\*\*([^*]+)\*\*", r"\1", s)
    # Lookarounds keep single-asterisk emphasis from matching leftover ``**`` runs.
    s = re.sub(r"(?<!\*)\*([^*]+)\*(?!\*)", r"\1", s)
    s = re.sub(r"`([^`]+)`", r"\1", s)
    s = re.sub(r"#([^\s#]+)", "", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s
66
+
67
+
68
def _expand_macros_and_embeds(text: str, graph: LogseqGraph, visited_uuids: set[str]) -> str:
    """Inline ``{{embed ((uuid))}}`` and ``{{embed [[page]]}}`` macros found in ``text``.

    Callers are expected to pass the raw block ``content`` rather than
    ``clean_text``, so that the ``((uuid))`` inside a macro is still present for
    the scanner. Page-cycle tracking starts from an empty set on every call.
    """
    pages_in_progress: set[str] = set()
    return _expand_macros_and_embeds_impl(text, graph, visited_uuids, pages_in_progress)
75
+
76
+
77
def _expand_macros_and_embeds_impl(
    text: str,
    graph: LogseqGraph,
    visited_uuids: set[str],
    visited_pages: set[str],
) -> str:
    """Expand block and page embed macros in ``text`` from left to right.

    ``visited_uuids`` holds the block uuids being expanded on the current recursion
    path and ``visited_pages`` the page titles being expanded. An embed that points
    back into either set is treated as a cycle and replaced by an empty string.
    An embed whose target the graph cannot resolve is left in the text verbatim.

    Scanning resumes *after* each substitution instead of restarting at offset 0.
    The substituted text has already been expanded by the recursive call, and an
    unresolved macro is re-inserted unchanged, so restarting would match the same
    macro again and never terminate.

    ``visited_pages`` is mutated while a page is being expanded and restored
    afterwards; ``visited_uuids`` is never mutated.
    """
    result = text
    cursor = 0  # Offset in ``result`` from which the next macro is searched.
    while True:
        bm = _BLOCK_EMBED_PATTERN.search(result, cursor)
        pm = _PAGE_EMBED_PATTERN.search(result, cursor)
        if bm is None and pm is None:
            break
        # Handle whichever macro starts first; a tie goes to the block embed.
        if bm is not None and (pm is None or bm.start() <= pm.start()):
            match = bm
            uid = match.group("uuid")
            if uid in visited_uuids:
                logger.debug("Stack-Machine embed: cyclic block uuid=%s", uid)
                replacement = ""
            else:
                target = graph.get_node_by_embed_ref(uid)
                if target is None:
                    logger.debug("Stack-Machine embed: unresolved block uuid=%s", uid)
                    replacement = match.group(0)
                else:
                    # Copy so that a sibling embed of the same block is not
                    # mistaken for a cycle.
                    next_seen = set(visited_uuids)
                    next_seen.add(uid)
                    replacement = _expand_macros_and_embeds_impl(
                        target.content, graph, next_seen, visited_pages
                    )
        else:
            assert pm is not None  # Type narrowing only: the None/None case exited above.
            match = pm
            title = match.group("title").strip()
            if title in visited_pages:
                logger.debug("Stack-Machine embed: cyclic page title=%s", title)
                replacement = ""
            else:
                page = graph.pages.get(title)
                if page is None:
                    logger.debug("Stack-Machine embed: unknown page title=%s", title)
                    replacement = match.group(0)
                else:
                    visited_pages.add(title)
                    try:
                        shared_blocks = set(visited_uuids)
                        pieces: list[str] = []
                        for n in _flatten_nodes_for_export(page.root_nodes):
                            frag = _expand_macros_and_embeds_impl(
                                n.content, graph, shared_blocks, visited_pages
                            )
                            stripped = frag.strip()
                            if stripped:
                                pieces.append(stripped)
                        replacement = "\n".join(pieces)
                    finally:
                        # The title guards only this expansion path; the same page
                        # may legitimately be embedded again later in the text.
                        visited_pages.discard(title)
        result = result[: match.start()] + replacement + result[match.end() :]
        # Skip past what was just inserted so the loop always makes progress.
        cursor = match.start() + len(replacement)
    return result
139
+
140
+
141
def _build_breadcrumbs(graph: LogseqGraph, node: LogseqNode) -> tuple[str, LogseqPage | None]:
    """Return a ``" > "``-joined breadcrumb trail for ``node`` and the page the graph reports for it.

    The trail begins with the markup-stripped page title (when the page exists and
    has a non-empty title), followed by the markup-stripped ``clean_text`` of every
    uuid in ``node.path`` except the last one. Uuids the graph cannot resolve and
    fragments that strip down to nothing are skipped.
    """
    owner = graph._page_for_node(node)
    trail: list[str] = []
    if owner is not None and owner.title:
        trail.append(_strip_markdown_for_embedding(owner.title))
    # NOTE(review): the last path entry is presumably the node itself - confirm.
    for ancestor_uuid in node.path[:-1]:
        ancestor = graph.get_node_by_uuid(ancestor_uuid)
        label = "" if ancestor is None else _strip_markdown_for_embedding(ancestor.clean_text)
        if label:
            trail.append(label)
    return " > ".join(trail), owner
156
+
157
+
158
class LangChainVisitor(ASTVisitor):
    """AST visitor that turns every visited node into one document object.

    The document class is injected, so this visitor never imports LangChain itself.
    """

    def __init__(self, source_name: str, document_cls: type[Any]) -> None:
        self._documents: list[Any] = []
        self._document_cls = document_cls
        self._source_name = source_name

    def visit_node(self, node: LogseqNode) -> None:
        """Append a document whose text is ``node.clean_text`` and whose metadata describes the node."""
        # Block properties are copied first so the structural keys below win on
        # any name collision.
        metadata: dict[str, Any] = dict(node.properties)
        metadata.update(
            {
                "uuid": node.uuid,
                "parent_id": node.parent_id,
                "indent_level": node.indent_level,
                "source": self._source_name,
                "path": node.path,
                "left_id": node.left_id,
                "refs": node.refs,
                "task_status": node.task_status,
                "repeater": node.repeater,
                "created_at": node.created_at,
            }
        )
        document = self._document_cls(page_content=node.clean_text, metadata=metadata)
        self._documents.append(document)

    def depart_node(self, node: LogseqNode) -> None:
        """Do nothing when leaving a node; no per-node state is kept."""
        del node  # Intentionally unused.

    def get_documents(self) -> list[Any]:
        """Return the documents collected so far, in visit order."""
        return self._documents
192
+
193
+
194
class LlamaIndexVisitor(ASTVisitor):
    """AST visitor that builds one text node per block and links parents with children.

    The node, relationship-enum and related-info classes are injected, so this
    visitor never imports LlamaIndex itself.
    """

    def __init__(
        self,
        text_node_cls: type[Any],
        node_relationship: Any,
        related_node_info_cls: type[Any],
    ) -> None:
        self._ordered_nodes: list[Any] = []
        self._nodes_by_id: dict[str, Any] = {}
        self._related_node_info_cls = related_node_info_cls
        self._node_relationship = node_relationship
        self._text_node_cls = text_node_cls

    def visit_node(self, node: LogseqNode) -> None:
        """Create the text node for ``node`` and record its PARENT / CHILD relationships."""
        # Block properties are copied first so the structural keys below win on
        # any name collision.
        metadata: dict[str, Any] = dict(node.properties)
        metadata.update(
            {
                "uuid": node.uuid,
                "indent_level": node.indent_level,
                "path": node.path,
                "left_id": node.left_id,
                "refs": node.refs,
                "task_status": node.task_status,
                "repeater": node.repeater,
                "created_at": node.created_at,
            }
        )
        text_node = self._text_node_cls(id_=node.uuid, text=node.clean_text, metadata=metadata)
        # Guarantee a mapping to write into even if the class did not provide one.
        if getattr(text_node, "relationships", None) is None:
            text_node.relationships = {}

        parent_id = node.parent_id
        if parent_id:
            relation = self._node_relationship
            make_ref = self._related_node_info_cls
            text_node.relationships[relation.PARENT] = make_ref(node_id=parent_id)
            # The back-link is only added when the parent was already visited.
            parent_node = self._nodes_by_id.get(parent_id)
            if parent_node is not None:
                parent_node.relationships.setdefault(relation.CHILD, []).append(
                    make_ref(node_id=node.uuid)
                )

        self._nodes_by_id[node.uuid] = text_node
        self._ordered_nodes.append(text_node)

    def depart_node(self, node: LogseqNode) -> None:
        """Do nothing when leaving a node; no per-node state is kept."""
        del node  # Intentionally unused.

    def get_nodes(self) -> list[Any]:
        """Return the text nodes built so far, in visit order."""
        return self._ordered_nodes
248
+
249
+
250
class SynapseAdapter:
    """Entry points that convert parsed Logseq nodes into LangChain / LlamaIndex objects."""

    @staticmethod
    def to_langchain_documents(nodes: list[LogseqNode], source_name: str) -> list[Any]:
        """Walk ``nodes`` with a ``LangChainVisitor`` and return the collected documents.

        Raises:
            ImportError: if ``langchain_core`` could not be imported.
        """
        if Document is None:
            raise ImportError("LangChain non rilevato. Installa 'langchain-core' per usare Synapse.")
        collector = LangChainVisitor(source_name=source_name, document_cls=Document)
        for root in nodes:
            root.accept(collector)
        return collector.get_documents()

    @staticmethod
    def to_llamaindex_nodes(nodes: list[LogseqNode]) -> list[Any]:
        """Walk ``nodes`` with a ``LlamaIndexVisitor`` and return the linked text nodes.

        Raises:
            ImportError: if the LlamaIndex schema classes could not be imported.
        """
        if any(dep is None for dep in (TextNode, NodeRelationship, RelatedNodeInfo)):
            raise ImportError("LlamaIndex non rilevato. Installa 'llama-index' per usare Synapse.")
        collector = LlamaIndexVisitor(
            text_node_cls=TextNode,
            node_relationship=NodeRelationship,
            related_node_info_cls=RelatedNodeInfo,
        )
        for root in nodes:
            root.accept(collector)
        return collector.get_nodes()

    @staticmethod
    def to_context_enriched_chunks(
        nodes: list[LogseqNode],
        graph: LogseqGraph,
        format_template: str = "[{breadcrumbs}] {content}",
    ) -> list[Any]:
        """Flatten ``nodes`` and emit one LangChain ``Document`` per block.

        ``page_content`` is ``format_template`` filled with the block's breadcrumb
        trail and its raw content after embed expansion. The metadata carries the
        block's own properties, structural fields and the effective properties the
        graph reports for the block's uuid.

        Raises:
            ImportError: if ``langchain_core`` could not be imported.
        """
        if Document is None:
            raise ImportError("LangChain non rilevato. Installa 'langchain-core' per usare Synapse.")
        chunks: list[Any] = []
        for block in _flatten_nodes_for_export(nodes):
            trail, owner = _build_breadcrumbs(graph, block)
            # Prefer the block's own file name; otherwise use the graph directory name.
            if block.source_path:
                origin = Path(block.source_path).name
            else:
                origin = str(graph.graph_path.name)
            body = _expand_macros_and_embeds(block.content, graph, set())
            rendered = format_template.format(breadcrumbs=trail, content=body)
            inherited = dict(graph.get_effective_properties(block.uuid))
            # Block properties are copied first so the structural keys below win
            # on any name collision.
            metadata: dict[str, Any] = dict(block.properties)
            metadata.update(
                {
                    "uuid": block.uuid,
                    "parent_id": block.parent_id,
                    "indent_level": block.indent_level,
                    "source": origin,
                    "path": block.path,
                    "left_id": block.left_id,
                    "refs": block.refs,
                    "task_status": block.task_status,
                    "repeater": block.repeater,
                    "created_at": block.created_at,
                    "clean_text": block.clean_text,
                    "page_title": "" if owner is None else owner.title,
                    "source_path": block.source_path,
                    "line_start": block.line_start,
                    "effective_properties": inherited,
                }
            )
            chunks.append(Document(page_content=rendered, metadata=metadata))
            logger.debug(
                "context chunk uuid=%s breadcrumbs_len=%s effective_keys=%s",
                block.uuid,
                len(trail),
                tuple(inherited.keys()),
            )
        return chunks

    @classmethod
    def load_and_convert(cls, file_path: Path) -> list[Any]:
        """Parse ``file_path`` with ``LogosParser`` and convert the result to LangChain documents.

        The file name (without directories) is used as the documents' ``source``.
        """
        parsed_nodes = LogosParser().parse_file(file_path)
        return cls.to_langchain_documents(parsed_nodes, source_name=file_path.name)
@@ -0,0 +1,279 @@
1
+ Metadata-Version: 2.4
2
+ Name: logseq-matryca-parser
3
+ Version: 0.3.0
4
+ Summary: The Logos Protocol: Deterministic Logseq AST parsing for Matryca.ai.
5
+ Project-URL: Homepage, https://github.com/MarcoPorcellato/logseq-matryca-parser
6
+ Project-URL: Bug Tracker, https://github.com/MarcoPorcellato/logseq-matryca-parser/issues
7
+ Project-URL: Matryca.ai, https://matryca.ai
8
+ Author-email: Marco Porcellato <marco@matryca.ai>
9
+ License: Apache-2.0
10
+ License-File: LICENSE
11
+ License-File: NOTICE
12
+ Keywords: ai,ast,knowledge-graph,logseq,parser,rag
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: Apache Software License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Requires-Python: >=3.12
19
+ Requires-Dist: pydantic>=2.7.0
20
+ Requires-Dist: rich>=13.7.1
21
+ Requires-Dist: typer>=0.12.0
22
+ Provides-Extra: ai
23
+ Requires-Dist: langchain-core; extra == 'ai'
24
+ Requires-Dist: llama-index-core; extra == 'ai'
25
+ Provides-Extra: all
26
+ Requires-Dist: langchain-core; extra == 'all'
27
+ Requires-Dist: llama-index-core; extra == 'all'
28
+ Requires-Dist: networkx>=3.0.0; extra == 'all'
29
+ Requires-Dist: pyvis>=0.3.2; extra == 'all'
30
+ Provides-Extra: viz
31
+ Requires-Dist: networkx>=3.0.0; extra == 'viz'
32
+ Requires-Dist: pyvis>=0.3.2; extra == 'viz'
33
+ Provides-Extra: watch
34
+ Requires-Dist: watchdog>=4.0.0; extra == 'watch'
35
+ Description-Content-Type: text/markdown
36
+
37
+ <div align="center">
38
+
39
+ # 🔱 Logseq Matryca Parser (The Logos Protocol)
40
+
41
+ **Stop feeding broken Markdown to your AI.**
42
+
43
+ [![CI/CD Status](https://github.com/MarcoPorcellato/logseq-matryca-parser/actions/workflows/ci.yml/badge.svg)](https://github.com/MarcoPorcellato/logseq-matryca-parser/actions)
44
+ [![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
45
+ [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/MarcoPorcellato/logseq-matryca-parser/blob/main/LICENSE)
46
+ [![PyPI](https://img.shields.io/badge/PyPI-install%20via%20GitHub-3775A9?logo=pypi&logoColor=white)](https://github.com/MarcoPorcellato/logseq-matryca-parser#-quickstart)
47
+ [![Status: Alpha](https://img.shields.io/badge/Status-Alpha-orange?style=for-the-badge)](#)
48
+ ![Origin: Matryca.ai](https://img.shields.io/badge/Origin-Matryca.ai-gold?style=for-the-badge)
49
+
50
+ > *Turning a forest of local plain-text files into a unified semantic powerhouse.*
51
+
52
+ <p align="center">
53
+ <video src="https://github.com/user-attachments/assets/24f73c6d-3eca-4adb-8442-981f2ba4cccd" autoplay loop muted playsinline width="800"></video>
54
+ </p>
55
+
56
+ [👉 **TRY THE LIVE INTERACTIVE DEMO**](https://MarcoPorcellato.github.io/logseq-matryca-parser/)
57
+
58
+ [📘 **READ THE ARCHITECTURE (LLM OS Vision)**](docs/ARCHITECTURE.md)
59
+
60
+ </div>
61
+
62
+ ---
63
+
64
+ ## 🌐 The Vision: Virtual Centralization vs. Binary Lock-in
65
+
66
+ The PKM (Personal Knowledge Management) world is currently forcing users to make a painful choice between **Data Longevity** and **AI Power**.
67
+
68
+ * **Vanilla Logseq / Obsidian** is a "Forest" of decentralized Markdown files. It benefits from the Lindy effect (plain text lasts forever) and perfect Git versioning, but standard AI chunkers act like a blender on it, destroying the outliner hierarchy.
69
+ * **Tana** is a centralized "Tree". It offers incredible semantic power, but traps your brain in a proprietary cloud database.
70
+ * **The new Logseq DB (SQLite)** aims for database speed, but at a huge cost: it locks your notes inside a binary `.db` file. You lose human-readable files, you lose line-by-line Git diffs, and you lose the immortality of plain-text.
71
+
72
+ ### 🔱 The Matryca Solution: The Best of Both Worlds
73
+ **Logseq Matryca Parser** is the ultimate bridge. It allows you to **keep your sovereign, future-proof Markdown files**, while synthesizing a **Virtual Global Graph** in RAM at runtime.
74
+
75
+ It acts as the strict **File System Driver** for your LLM OS. By using a deterministic Stack-Machine to parse your outliner topology, it feeds LangChain or LlamaIndex with the exact parent-child context of every single block.
76
+
77
+ *You get the reasoning power of a centralized relational database, without sacrificing the plain-text soul of your Second Brain in Logseq.*
78
+
79
+ ---
80
+
81
+ ## ⚖️ The PKM Landscape
82
+
83
+ | Feature | Vanilla Markdown | **Matryca Parser** | Logseq DB (SQLite) | Tana |
84
+ | :--- | :--- | :--- | :--- | :--- |
85
+ | **Data Format** | Plain-text (.md) | **Plain-text (.md)** | Binary (.db) | Proprietary Cloud |
86
+ | **Version Control** | Perfect (Git) | **Perfect (Git)** | Poor (Binary blob) | None |
87
+ | **Data Structure** | Decentralized Forest | **Virtually Centralized Graph** | Relational Database | Centralized Tree |
88
+ | **AI Readiness** | Low (Linear Chunks) | **High (Topological AST)** | TBD (Requires SQL) | High (Proprietary) |
89
+ | **Sovereignty** | 100% Local | **100% Local (Sovereign AI)** | 100% Local | Cloud-Only |
90
+
91
+ ---
92
+
93
+ ## 🧭 Matryca vs. naive framework loaders
94
+
95
+ | Capability | Typical LangChain / LlamaIndex Markdown loaders | **Matryca (LOGOS + SYNAPSE + graph)** |
96
+ | :--- | :--- | :--- |
97
+ | **Parent–child context** | Character or heading splits; children often orphaned from parents | **True outliner AST**: every block carries `parent_id`, `path`, `left_id` and visits in deterministic tree order |
98
+ | **Block references `((uuid))`** | Treated as opaque text or dropped | **Resolved** against `LogseqGraph`; optional **embed expansion** and **Obsidian `[[Page#^anchor]]`** export |
99
+ | **Property inheritance** | Page-level frontmatter at best | **`get_effective_properties`**: page + ancestor outline keys merged top-down (Org-mode style), then exposed on enriched chunks |
100
+ | **Live sync** | Re-read whole tree or poll | **`LogseqGraph.start_watching()`** (optional `watchdog`): **per-file invalidation** — re-parse one page, purge stale UUIDs from registries, refresh backlinks |
101
+
102
+ ---
103
+
104
+ ### 🚀 The Problem
105
+ Standard RAG pipelines act like a blender on your notes. They chop Markdown into random shards, destroying the **parent-child hierarchy** that makes Logseq powerful.
106
+
107
+ ```mermaid
108
+ graph TD
109
+ Raw[(Logseq Markdown\nFiles)]
110
+
111
+ subgraph Standard RAG
112
+ Blender[Standard Text Splitter\n'The Blender']
113
+ Chunk1[Chunk 1: Orphan text]
114
+ Chunk2[Chunk 2: Lost context]
115
+ Blender --> Chunk1 & Chunk2
116
+ end
117
+
118
+ subgraph Matryca Parser
119
+ Architect[Logos Engine\nStack-Machine]
120
+ Parent[Parent Node\n+ Properties]
121
+ Child[Child Node\n+ Task State & Time]
122
+ Architect --> Parent --> Child
123
+ end
124
+
125
+ Raw --> Blender
126
+ Raw --> Architect
127
+
128
+ classDef bad fill:#fee2e2,stroke:#ef4444,color:#000;
129
+ classDef good fill:#dcfce7,stroke:#22c55e,color:#000;
130
+ class Chunk1,Chunk2 bad;
131
+ class Parent,Child good;
132
+ ```
133
+
134
+ ### 🔱 The Solution
135
+ Logseq Matryca Parser is a deterministic **Stack-Machine engine** that acts as the **File System Driver** for your LLM. It preserves the true topology of your thoughts, ensuring AI understands spatial hierarchy, time, and block-lineage—including **structured task state** and **first-class temporal attributes** you can query in downstream graph databases and GraphRAG engines without re-parsing raw Markdown.
136
+
137
+ ---
138
+
139
+ ## ⚡ Recent superpowers (Waves 4–11)
140
+
141
+ ### Obsidian-native export
142
+ Compile an entire Logseq graph into an **Obsidian vault layout**: YAML frontmatter from page properties, list body preserved, Logseq `((uuid))` links rewritten to **`[[Page#^anchor]]`**, and trailing **`^block-id`** on referenced blocks. Namespace titles become nested folders (e.g. `Projects/AI/Demo.md`).
143
+
144
+ ```bash
145
+ matryca-parse export /path/to/logseq/graph /path/to/obsidian/vault --format obsidian
146
+ ```
147
+
148
+ > **Note:** Wikilinks currently use the **Logseq page title** (e.g. `[[Target#^…]]`). Vault files may live under namespace folders (`Projects/AI/Demo.md`). Obsidian usually resolves unique titles; aligning link text to folder paths is a possible future refinement.
149
+
150
+ ### Live incremental watcher
151
+ `LogseqGraph` supports **surgical file invalidation** (optional dependency: `pip install 'logseq-matryca-parser[watch]'`). `start_watching()` runs a recursive **watchdog** observer: on `created` / `modified` under `pages/` or `journals/`, only that file is re-parsed; stale synthetic UUIDs are purged from `_node_registry` and scrubbed from `_backlink_registry`—no full-graph cold reload.
152
+
153
+ ### Fluent topological queries
154
+ Filter the global node registry with a **chainable** API (tags, task state, ancestry under a parent UUID):
155
+
156
+ ```python
157
+ from logseq_matryca_parser.graph import LogseqGraph
158
+
159
+ graph = LogseqGraph.load_directory("/path/to/logseq/graph")
160
+ hits = (
161
+ graph.query()
162
+ .has_tag("idea")
163
+ .under_parent("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee")
164
+ .is_task_state("TODO")
165
+ .execute()
166
+ )
167
+ ```
168
+
169
+ ### Agent-Native X-Ray Mode (Token Optimization)
170
+ For autonomous LLM agents, passing raw Markdown into the context window wastes thousands of tokens on **36-character UUIDs**, hidden `id::` properties, drawers, and collapsed directives that carry no immediate semantic signal. **X-Ray mode** compresses the parsed AST into **ultra-dense, zero-fluff plain text**: each block becomes `{indent}[{alias}] {clean_text}`, with heavy Logseq UUIDs replaced by **sequential integer aliases** (`[0]`, `[1]`, …) held in a session registry. On typical outlines this can reduce context consumption by **up to ~35×** compared to dumping full block payloads.
171
+
172
+ ```bash
173
+ matryca-parse agent-read /path/to/graph --tag idea
174
+ matryca-parse agent-read /path/to/graph --query "quantum"
175
+ ```
176
+
177
+ The agent reads cheap topology now; the registry resolves aliases back to sovereign UUIDs when you wire targeted writes.
178
+
179
+ ---
180
+
181
+ ## 🏗️ Core Capabilities
182
+
183
+ | Feature | Description |
184
+ | :--- | :--- |
185
+ | **LOGOS Engine** | Deterministic AST parsing. No regex-guessing. Handles `id::`, aliases, and multiline blocks. |
186
+ | **Advanced Task Extraction** | Task **state** (TODO / DOING / …), **priority** markers `[#A]`–`[#C]` promoted to `task_priority`, and **SCHEDULED** / **DEADLINE** Logseq timestamps normalized to **UTC Unix epoch seconds** on `scheduled_at` / `deadline_at` for temporal graph and retrieval pipelines. |
187
+ | **SYNAPSE Adapter** | Native exports for **LangChain** and **LlamaIndex** with automated lineage metadata; **context-enriched** chunks with breadcrumbs, embed expansion, and inherited properties. |
188
+ | **FORGE** | JSON, clean Markdown, and **Obsidian** vault serialization (`ObsidianForgeVisitor`, `ForgeExporter.to_obsidian_markdown`). |
189
+ | **LENS Visualizer** | 60FPS interactive graph rendering (10k+ nodes) with Glassmorphism HUD. |
190
+ | **Agent-Native Printing Press** | [`agent_press.py`](src/logseq_matryca_parser/agent_press.py): **`SessionAliasRegistry`** maps session aliases ↔ block UUIDs; **`to_xray_markdown`** emits token-minimal outline text for autonomous agents (`matryca-parse agent-read`). |
191
+ | **Sovereign AI** | 100% Local. Zero telemetry. Private by design. |
192
+
193
+ ### Data model — `LogseqNode` task fields
194
+
195
+ Each AST block is a `LogseqNode`. Alongside `task_status`, the parser surfaces priority and schedule metadata as typed fields (epoch integers are **seconds since Unix epoch, UTC**):
196
+
197
+ ```json
198
+ {
199
+ "uuid": "6ba7b810-9dad-11d1-80b4-00c04fd430c8",
200
+ "task_status": "TODO",
201
+ "task_priority": "A",
202
+ "scheduled_at": 1641600000,
203
+ "deadline_at": 1641772800,
204
+ "clean_text": "Cut v0.3.0 release"
205
+ }
206
+ ```
207
+
208
+ Marker syntax (`[#A]`, `SCHEDULED: <...>`, `DEADLINE: <...>`) is stripped from `clean_text` so embeddings stay clean; the promoted fields carry the structured signal for downstream graph databases and GraphRAG engines.
209
+
210
+ ---
211
+
212
+ ## 🛠️ Quickstart
213
+
214
+ ```bash
215
+ # Install from GitHub (PyPI distribution tracked on roadmap)
216
+ pip install git+https://github.com/MarcoPorcellato/logseq-matryca-parser.git
217
+
218
+ # Optional: filesystem watcher for live incremental graph updates
219
+ pip install 'logseq-matryca-parser[watch]'
220
+
221
+ # 1. Visualize your local graph (LENS)
222
+ matryca-parse visualize /path/to/logseq/graph my-map.html
223
+
224
+ # 2. Export for AI / RAG (SYNAPSE)
225
+ matryca-parse export /path/to/logseq/graph output --format langchain
226
+
227
+ # 3. Context-enriched LangChain JSON (graph + inheritance + embed expansion)
228
+ matryca-parse export /path/to/logseq/graph output --format langchain-enriched
229
+
230
+ # 4. Obsidian vault (YAML frontmatter + ^ block ids)
231
+ matryca-parse export /path/to/logseq/graph output --format obsidian
232
+ ```
233
+
234
+ ### Python API
235
+ ```python
236
+ from logseq_matryca_parser.logos_parser import LogosParser
237
+ from logseq_matryca_parser.synapse import SynapseAdapter
238
+
239
+ # Parse to AST
240
+ page = LogosParser().parse_page_file("page.md")
241
+
242
+ # Export to LangChain with lineage metadata
243
+ docs = SynapseAdapter.to_langchain_documents(page.root_nodes, source_name=page.title)
244
+ ```
245
+
246
+ ### 🤖 Agentic Write Access (Append-Only)
247
+
248
+ Agents such as Hermes or OpenClaw can record structured notes into a Logseq graph **without rewriting existing pages**. The helper `logseq_agent_write` only **opens the weekly agent page in append mode** (`"a"`), writes a new bullet (journal link + optional tag links + body), and never truncates or replaces prior content—so routine logging cannot wipe blocks that already live in that file.
249
+
250
+ Point it at your graph’s **`pages`** directory and **`config.edn`** so journal titles match Logseq’s `:journal/page-title-format` (including ordinal days when you use `do` in the pattern).
251
+
252
+ ```python
253
+ from logseq_matryca_parser import logseq_agent_write
254
+
255
+ result = logseq_agent_write(
256
+ "Summarized user intent and proposed next steps.",
257
+ config_path="/path/to/logseq/config.edn",
258
+ pages_dir="/path/to/logseq/pages",
259
+ context_tags=["agent/hermes", "#session"],
260
+ )
261
+ assert result["status"] == "success"
262
+ # result["path"] → e.g. .../pages/2026-18-agent.md
263
+ ```
264
+ ---
265
+
266
+ ## 🗺️ Roadmap
267
+ - [ ] **Desktop GUI:** Standalone app for non-technical users. [(Join the RFC)](https://github.com/MarcoPorcellato/logseq-matryca-parser/issues/3)
268
+ - [x] **Obsidian Adapter:** Native CLI export (`--format obsidian`) with YAML frontmatter and `^` block anchors.
269
+ - [ ] **Ollama Integration:** One-click local RAG setup.
270
+
271
+ ## ☕ Support & Enterprise
272
+ Logseq Matryca Parser is open-source. If it powers your pipeline, consider a star ⭐ or a sponsorship!
273
+
274
+ **💖 [Sponsor me on GitHub](https://github.com/sponsors/MarcoPorcellato)**
275
+
276
+ Need custom RAG integrations or consulting? Contact: [marco@marcoporcellato.it](mailto:marco@marcoporcellato.it)
277
+
278
+ ---
279
+ Architected by **Marco Porcellato** | Powered by **Matryca.ai**
@@ -0,0 +1,21 @@
1
+ logseq_matryca_parser/.gitignore,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ logseq_matryca_parser/NOTICE,sha256=maIqe2nNKdNAhLqgjIbO4Xa86EpLKqQwFiOgSze3ewY,230
3
+ logseq_matryca_parser/__init__.py,sha256=yhAqWJMb-Dy62e_mBDYljqALTg3k5SXSCpyxGYGfGU0,1463
4
+ logseq_matryca_parser/__main__.py,sha256=L_hCPkZR0DWIqEka-hrolxdjAczy5AbbUyAe3DIMh60,215
5
+ logseq_matryca_parser/agent_press.py,sha256=LG_RIHBCaY6GG-4Q4jgemb4hiLtxC6q4W_zjcOJQb6s,3722
6
+ logseq_matryca_parser/agent_writer.py,sha256=gDWYmmqjFpar_Iziq3gormj0MRrJraz27SD6rO3mODI,8000
7
+ logseq_matryca_parser/exceptions.py,sha256=7RhkTwkcLT10YLPciD7ZJCnrRC2CZ776ZiZUhRAsWPY,359
8
+ logseq_matryca_parser/forge.py,sha256=F-FCRrfTnm2o_z5LSDzHEBF9Vrnmypk1uXssyMOBpEY,13948
9
+ logseq_matryca_parser/graph.py,sha256=oxnqRJjC-Tkrz_YJHrlyD2cqo0-Mgii0IBIrvBfeS7w,19433
10
+ logseq_matryca_parser/kinetic.py,sha256=jFF_6ddjvQ0UDTSa7YwaUcPM3qfxnyBlE1TSmaRuMlY,19256
11
+ logseq_matryca_parser/lens.py,sha256=2euDU4xU5OJglaeqzKhP9SdKyKZn5h9wyl3YmLULKQ4,16061
12
+ logseq_matryca_parser/logos_core.py,sha256=WUR4ecFyrrvfdCu0V7NESHEL14ZXRdbfegsWYWxT5c4,5896
13
+ logseq_matryca_parser/logos_parser.py,sha256=yRuec3Q6Rdh0OO0R7r8AQiCDdDLu98q_CnucoeMMnsU,40814
14
+ logseq_matryca_parser/pyproject.toml,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ logseq_matryca_parser/synapse.py,sha256=N5N6iVv-7s5rsRW9cOaWPB904tLRSjGSVhXwZQXQoZs,12632
16
+ logseq_matryca_parser-0.3.0.dist-info/METADATA,sha256=Slt5KLj-Pn5I_3882tyGFIwGGkJfS6FztemxkAsO1i8,14430
17
+ logseq_matryca_parser-0.3.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
18
+ logseq_matryca_parser-0.3.0.dist-info/entry_points.txt,sha256=wQfdsO_DVmG_RIh9oBMv40OIV9nQ_tZaq_w2CAz6Vus,68
19
+ logseq_matryca_parser-0.3.0.dist-info/licenses/LICENSE,sha256=E_Dz4jVCe2JdJH6VkLe7pKrCIr9mifszSIv1usGyLJ0,11334
20
+ logseq_matryca_parser-0.3.0.dist-info/licenses/NOTICE,sha256=maIqe2nNKdNAhLqgjIbO4Xa86EpLKqQwFiOgSze3ewY,230
21
+ logseq_matryca_parser-0.3.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ matryca-parse = logseq_matryca_parser.kinetic:app