logseq-matryca-parser 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- logseq_matryca_parser/.gitignore +0 -0
- logseq_matryca_parser/NOTICE +7 -0
- logseq_matryca_parser/__init__.py +61 -0
- logseq_matryca_parser/__main__.py +8 -0
- logseq_matryca_parser/agent_press.py +99 -0
- logseq_matryca_parser/agent_writer.py +250 -0
- logseq_matryca_parser/exceptions.py +13 -0
- logseq_matryca_parser/forge.py +399 -0
- logseq_matryca_parser/graph.py +493 -0
- logseq_matryca_parser/kinetic.py +531 -0
- logseq_matryca_parser/lens.py +427 -0
- logseq_matryca_parser/logos_core.py +171 -0
- logseq_matryca_parser/logos_parser.py +1047 -0
- logseq_matryca_parser/pyproject.toml +0 -0
- logseq_matryca_parser/synapse.py +329 -0
- logseq_matryca_parser-0.3.0.dist-info/METADATA +279 -0
- logseq_matryca_parser-0.3.0.dist-info/RECORD +21 -0
- logseq_matryca_parser-0.3.0.dist-info/WHEEL +4 -0
- logseq_matryca_parser-0.3.0.dist-info/entry_points.txt +2 -0
- logseq_matryca_parser-0.3.0.dist-info/licenses/LICENSE +201 -0
- logseq_matryca_parser-0.3.0.dist-info/licenses/NOTICE +7 -0
|
File without changes
|
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
"""SYNAPSE adapters implemented with AST visitors."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import re
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import TYPE_CHECKING, Any
|
|
9
|
+
|
|
10
|
+
from logseq_matryca_parser.logos_core import ASTVisitor, LogseqNode, LogseqPage
|
|
11
|
+
from logseq_matryca_parser.logos_parser import LogosParser
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from logseq_matryca_parser.graph import LogseqGraph
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
_BLOCK_EMBED_PATTERN = re.compile(
|
|
19
|
+
r"\{\{\s*embed\s+\(\((?P<uuid>[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})\)\)\s*\}\}",
|
|
20
|
+
re.IGNORECASE,
|
|
21
|
+
)
|
|
22
|
+
_PAGE_EMBED_PATTERN = re.compile(r"\{\{\s*embed\s+\[\[(?P<title>[^\]]+)\]\]\s*\}\}")
|
|
23
|
+
|
|
24
|
+
Document: type[Any] | None
|
|
25
|
+
NodeRelationship: Any
|
|
26
|
+
RelatedNodeInfo: type[Any] | None
|
|
27
|
+
TextNode: type[Any] | None
|
|
28
|
+
|
|
29
|
+
try:
|
|
30
|
+
from langchain_core.documents import Document # type: ignore
|
|
31
|
+
except ImportError:
|
|
32
|
+
Document = None
|
|
33
|
+
|
|
34
|
+
try:
|
|
35
|
+
from llama_index.core.schema import ( # type: ignore
|
|
36
|
+
NodeRelationship,
|
|
37
|
+
RelatedNodeInfo,
|
|
38
|
+
TextNode,
|
|
39
|
+
)
|
|
40
|
+
except ImportError:
|
|
41
|
+
NodeRelationship = None
|
|
42
|
+
RelatedNodeInfo = None
|
|
43
|
+
TextNode = None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _flatten_nodes_for_export(nodes: list[LogseqNode]) -> list[LogseqNode]:
|
|
47
|
+
"""Depth-first flattening of a node tree (same order as graph indexing)."""
|
|
48
|
+
flat: list[LogseqNode] = []
|
|
49
|
+
for node in nodes:
|
|
50
|
+
flat.append(node)
|
|
51
|
+
if node.children:
|
|
52
|
+
flat.extend(_flatten_nodes_for_export(node.children))
|
|
53
|
+
return flat
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _strip_markdown_for_embedding(text: str) -> str:
|
|
57
|
+
"""Remove common markdown noise from breadcrumb fragments for embedding-friendly strings."""
|
|
58
|
+
s = text.strip()
|
|
59
|
+
s = re.sub(r"\[\[([^\]|]+)(?:\|[^\]]+)?\]\]", r"\1", s)
|
|
60
|
+
s = re.sub(r"\*\*([^*]+)\*\*", r"\1", s)
|
|
61
|
+
s = re.sub(r"(?<!\*)\*([^*]+)\*(?!\*)", r"\1", s)
|
|
62
|
+
s = re.sub(r"`([^`]+)`", r"\1", s)
|
|
63
|
+
s = re.sub(r"#([^\s#]+)", "", s)
|
|
64
|
+
s = re.sub(r"\s+", " ", s).strip()
|
|
65
|
+
return s
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _expand_macros_and_embeds(text: str, graph: LogseqGraph, visited_uuids: set[str]) -> str:
    """Inline ``{{embed ((uuid))}}`` and ``{{embed [[page]]}}`` macros for RAG text.

    Callers pass the raw block ``content`` rather than ``clean_text`` so that the
    ``((uuid))`` reference inside an embed macro is still present for the scanner.

    Args:
        text: Raw block content to expand.
        graph: Graph used to resolve block references and page titles.
        visited_uuids: Block ids already being expanded (cycle guard).

    Returns:
        ``text`` with resolvable embeds replaced by the embedded content.
    """
    # Every top-level expansion starts with a fresh page-cycle guard.
    pages_in_progress: set[str] = set()
    return _expand_macros_and_embeds_impl(
        text,
        graph,
        visited_uuids,
        pages_in_progress,
    )
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _expand_macros_and_embeds_impl(
    text: str,
    graph: LogseqGraph,
    visited_uuids: set[str],
    visited_pages: set[str],
) -> str:
    """Shared worker: ``visited_uuids`` breaks block cycles; ``visited_pages`` breaks page cycles.

    Scans ``text`` left to right and handles whichever embed macro starts first:

    * Block embed: a uuid already in ``visited_uuids`` is replaced by an empty
      string; a uuid the graph cannot resolve is left verbatim; otherwise the
      target's ``content`` is expanded recursively with the uuid added to a
      *copy* of ``visited_uuids``.
    * Page embed: a title already in ``visited_pages`` is replaced by an empty
      string; an unknown title is left verbatim; otherwise every block of the page
      (depth-first) is expanded and the non-empty fragments are joined with
      newlines.

    Args:
        text: Raw block content to expand.
        graph: Provides ``get_node_by_embed_ref`` and the ``pages`` mapping.
        visited_uuids: Block ids on the current expansion path; never mutated here.
        visited_pages: Page titles on the current expansion path; a title is added
            while its page is being expanded and removed again afterwards.

    Returns:
        The text with all resolvable embeds inlined.
    """
    result = text
    # Everything before ``cursor`` is final. Advancing past each replacement is
    # what guarantees termination: an unresolved embed is substituted by its own
    # text, so rescanning from index 0 would find the same macro forever.
    # Replacement text never needs a rescan because it was already expanded by
    # the recursive call.
    cursor = 0
    while True:
        bm = _BLOCK_EMBED_PATTERN.search(result, cursor)
        pm = _PAGE_EMBED_PATTERN.search(result, cursor)
        if bm is None and pm is None:
            break
        if bm is not None and (pm is None or bm.start() <= pm.start()):
            match = bm
            uid = match.group("uuid")
            if uid in visited_uuids:
                logger.debug("Stack-Machine embed: cyclic block uuid=%s", uid)
                replacement = ""
            else:
                target = graph.get_node_by_embed_ref(uid)
                if target is None:
                    logger.debug("Stack-Machine embed: unresolved block uuid=%s", uid)
                    # Keep the macro verbatim so the unresolved reference stays visible.
                    replacement = match.group(0)
                else:
                    # Copy so sibling embeds in ``result`` are not affected by this path.
                    next_seen = set(visited_uuids)
                    next_seen.add(uid)
                    replacement = _expand_macros_and_embeds_impl(
                        target.content, graph, next_seen, visited_pages
                    )
        else:
            assert pm is not None  # type narrowing: at least one pattern matched
            match = pm
            title = match.group("title").strip()
            if title in visited_pages:
                logger.debug("Stack-Machine embed: cyclic page title=%s", title)
                replacement = ""
            else:
                page = graph.pages.get(title)
                if page is None:
                    logger.debug("Stack-Machine embed: unknown page title=%s", title)
                    # Keep the macro verbatim so the unknown page stays visible.
                    replacement = match.group(0)
                else:
                    visited_pages.add(title)
                    try:
                        shared_blocks = set(visited_uuids)
                        pieces: list[str] = []
                        for n in _flatten_nodes_for_export(page.root_nodes):
                            frag = _expand_macros_and_embeds_impl(
                                n.content, graph, shared_blocks, visited_pages
                            )
                            stripped = frag.strip()
                            if stripped:
                                pieces.append(stripped)
                        replacement = "\n".join(pieces)
                    finally:
                        # The title only guards the current path; release it even on error.
                        visited_pages.discard(title)
        result = result[: match.start()] + replacement + result[match.end() :]
        cursor = match.start() + len(replacement)
    return result
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _build_breadcrumbs(graph: LogseqGraph, node: LogseqNode) -> tuple[str, LogseqPage | None]:
    """Compose a ``Page > ancestor > ancestor`` trail for ``node``.

    The page comes from ``graph._page_for_node``; ancestors are the uuids in
    ``node.path`` except the last entry, resolved through ``graph.get_node_by_uuid``.
    Ancestors that cannot be resolved, or whose cleaned text is empty, are skipped.

    Args:
        graph: Graph registry used to resolve the page and the ancestor nodes.
        node: Block whose lineage is rendered.

    Returns:
        A ``(breadcrumbs, page)`` tuple; ``page`` is ``None`` when the graph does
        not know the owning page.
    """
    owner = graph._page_for_node(node)
    title = "" if owner is None else owner.title
    crumbs: list[str] = [_strip_markdown_for_embedding(title)] if title else []
    for ancestor_id in node.path[:-1]:
        ancestor_node = graph.get_node_by_uuid(ancestor_id)
        if ancestor_node is not None:
            label = _strip_markdown_for_embedding(ancestor_node.clean_text)
            if label:
                crumbs.append(label)
    trail = " > ".join(crumbs)
    return trail, owner
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
class LangChainVisitor(ASTVisitor):
    """Collect one LangChain document per visited block."""

    def __init__(self, source_name: str, document_cls: type[Any]) -> None:
        """Store the document factory and the ``source`` label put on every document."""
        self._documents: list[Any] = []
        self._document_cls = document_cls
        self._source_name = source_name

    def visit_node(self, node: LogseqNode) -> None:
        """Append a document whose text is ``node.clean_text`` plus lineage metadata."""
        # Block properties come first; the structural keys below win on collision.
        metadata = dict(node.properties)
        metadata.update(
            uuid=node.uuid,
            parent_id=node.parent_id,
            indent_level=node.indent_level,
            source=self._source_name,
            path=node.path,
            left_id=node.left_id,
            refs=node.refs,
            task_status=node.task_status,
            repeater=node.repeater,
            created_at=node.created_at,
        )
        document = self._document_cls(page_content=node.clean_text, metadata=metadata)
        self._documents.append(document)

    def depart_node(self, node: LogseqNode) -> None:
        """No work is needed when leaving a node."""
        del node  # intentionally unused

    def get_documents(self) -> list[Any]:
        """Return the documents collected so far, in visit order."""
        return self._documents
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
class LlamaIndexVisitor(ASTVisitor):
    """Create LlamaIndex text nodes and wire PARENT / CHILD relationships between them."""

    def __init__(
        self,
        text_node_cls: type[Any],
        node_relationship: Any,
        related_node_info_cls: type[Any],
    ) -> None:
        """Keep the injected LlamaIndex classes and start with empty registries."""
        self._ordered_nodes: list[Any] = []
        self._nodes_by_id: dict[str, Any] = {}
        self._related_node_info_cls = related_node_info_cls
        self._node_relationship = node_relationship
        self._text_node_cls = text_node_cls

    def visit_node(self, node: LogseqNode) -> None:
        """Build the text node for ``node`` and link it to an already-visited parent."""
        # Block properties come first; the structural keys below win on collision.
        metadata = dict(node.properties)
        metadata.update(
            uuid=node.uuid,
            indent_level=node.indent_level,
            path=node.path,
            left_id=node.left_id,
            refs=node.refs,
            task_status=node.task_status,
            repeater=node.repeater,
            created_at=node.created_at,
        )
        text_node = self._text_node_cls(id_=node.uuid, text=node.clean_text, metadata=metadata)
        # Make sure there is a mapping to write relationships into.
        if getattr(text_node, "relationships", None) is None:
            text_node.relationships = {}

        parent_id = node.parent_id
        if parent_id:
            relation = self._node_relationship
            make_ref = self._related_node_info_cls
            text_node.relationships[relation.PARENT] = make_ref(node_id=parent_id)
            # The CHILD back-link is only added when the parent was visited earlier.
            parent_node = self._nodes_by_id.get(parent_id)
            if parent_node is not None:
                children = parent_node.relationships.get(relation.CHILD, [])
                children.append(make_ref(node_id=node.uuid))
                parent_node.relationships[relation.CHILD] = children

        self._nodes_by_id[node.uuid] = text_node
        self._ordered_nodes.append(text_node)

    def depart_node(self, node: LogseqNode) -> None:
        """No work is needed when leaving a node."""
        del node  # intentionally unused

    def get_nodes(self) -> list[Any]:
        """Return the text nodes collected so far, in visit order."""
        return self._ordered_nodes
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
class SynapseAdapter:
    """Turn a Logseq block hierarchy into LangChain / LlamaIndex objects."""

    @staticmethod
    def to_langchain_documents(nodes: list[LogseqNode], source_name: str) -> list[Any]:
        """Walk ``nodes`` with a ``LangChainVisitor`` and return the collected documents.

        Raises:
            ImportError: If ``langchain_core`` could not be imported.
        """
        if Document is None:
            raise ImportError("LangChain non rilevato. Installa 'langchain-core' per usare Synapse.")
        collector = LangChainVisitor(source_name=source_name, document_cls=Document)
        for root in nodes:
            root.accept(collector)
        return collector.get_documents()

    @staticmethod
    def to_llamaindex_nodes(nodes: list[LogseqNode]) -> list[Any]:
        """Walk ``nodes`` with a ``LlamaIndexVisitor`` and return the linked text nodes.

        Raises:
            ImportError: If the LlamaIndex schema classes could not be imported.
        """
        if TextNode is None or NodeRelationship is None or RelatedNodeInfo is None:
            raise ImportError("LlamaIndex non rilevato. Installa 'llama-index' per usare Synapse.")
        collector = LlamaIndexVisitor(
            text_node_cls=TextNode,
            node_relationship=NodeRelationship,
            related_node_info_cls=RelatedNodeInfo,
        )
        for root in nodes:
            root.accept(collector)
        return collector.get_nodes()

    @staticmethod
    def to_context_enriched_chunks(
        nodes: list[LogseqNode],
        graph: LogseqGraph,
        format_template: str = "[{breadcrumbs}] {content}",
    ) -> list[Any]:
        """Emit one LangChain ``Document`` per block with breadcrumb-enriched text.

        ``nodes`` is flattened depth-first. For each block the page content is
        ``format_template`` filled with its breadcrumbs and its raw content after
        embed expansion; the metadata carries the block's own properties, its
        lineage fields and the graph's effective properties for that block.

        Args:
            nodes: Root nodes to export.
            graph: Graph used for breadcrumbs, embed resolution and effective properties.
            format_template: ``str.format`` template using ``{breadcrumbs}`` and ``{content}``.

        Raises:
            ImportError: If ``langchain_core`` could not be imported.
        """
        if Document is None:
            raise ImportError("LangChain non rilevato. Installa 'langchain-core' per usare Synapse.")
        documents: list[Any] = []
        for node in _flatten_nodes_for_export(nodes):
            breadcrumbs, page = _build_breadcrumbs(graph, node)
            # Prefer the block's own file name; fall back to the graph directory name.
            if node.source_path:
                source_name = Path(node.source_path).name
            else:
                source_name = str(graph.graph_path.name)
            page_content = format_template.format(
                breadcrumbs=breadcrumbs,
                content=_expand_macros_and_embeds(node.content, graph, set()),
            )
            effective_properties = dict(graph.get_effective_properties(node.uuid))
            # Block properties come first; the structural keys below win on collision.
            metadata = dict(node.properties)
            metadata.update(
                uuid=node.uuid,
                parent_id=node.parent_id,
                indent_level=node.indent_level,
                source=source_name,
                path=node.path,
                left_id=node.left_id,
                refs=node.refs,
                task_status=node.task_status,
                repeater=node.repeater,
                created_at=node.created_at,
                clean_text=node.clean_text,
                page_title="" if page is None else page.title,
                source_path=node.source_path,
                line_start=node.line_start,
                effective_properties=effective_properties,
            )
            documents.append(Document(page_content=page_content, metadata=metadata))
            logger.debug(
                "context chunk uuid=%s breadcrumbs_len=%s effective_keys=%s",
                node.uuid,
                len(breadcrumbs),
                tuple(effective_properties.keys()),
            )
        return documents

    @classmethod
    def load_and_convert(cls, file_path: Path) -> list[Any]:
        """Parse ``file_path`` with ``LogosParser`` and convert the result to LangChain documents."""
        parsed_nodes = LogosParser().parse_file(file_path)
        return cls.to_langchain_documents(parsed_nodes, source_name=file_path.name)
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: logseq-matryca-parser
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: The Logos Protocol: Deterministic Logseq AST parsing for Matryca.ai.
|
|
5
|
+
Project-URL: Homepage, https://github.com/MarcoPorcellato/logseq-matryca-parser
|
|
6
|
+
Project-URL: Bug Tracker, https://github.com/MarcoPorcellato/logseq-matryca-parser/issues
|
|
7
|
+
Project-URL: Matryca.ai, https://matryca.ai
|
|
8
|
+
Author-email: Marco Porcellato <marco@matryca.ai>
|
|
9
|
+
License: Apache-2.0
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
License-File: NOTICE
|
|
12
|
+
Keywords: ai,ast,knowledge-graph,logseq,parser,rag
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Requires-Python: >=3.12
|
|
19
|
+
Requires-Dist: pydantic>=2.7.0
|
|
20
|
+
Requires-Dist: rich>=13.7.1
|
|
21
|
+
Requires-Dist: typer>=0.12.0
|
|
22
|
+
Provides-Extra: ai
|
|
23
|
+
Requires-Dist: langchain-core; extra == 'ai'
|
|
24
|
+
Requires-Dist: llama-index-core; extra == 'ai'
|
|
25
|
+
Provides-Extra: all
|
|
26
|
+
Requires-Dist: langchain-core; extra == 'all'
|
|
27
|
+
Requires-Dist: llama-index-core; extra == 'all'
|
|
28
|
+
Requires-Dist: networkx>=3.0.0; extra == 'all'
|
|
29
|
+
Requires-Dist: pyvis>=0.3.2; extra == 'all'
|
|
30
|
+
Provides-Extra: viz
|
|
31
|
+
Requires-Dist: networkx>=3.0.0; extra == 'viz'
|
|
32
|
+
Requires-Dist: pyvis>=0.3.2; extra == 'viz'
|
|
33
|
+
Provides-Extra: watch
|
|
34
|
+
Requires-Dist: watchdog>=4.0.0; extra == 'watch'
|
|
35
|
+
Description-Content-Type: text/markdown
|
|
36
|
+
|
|
37
|
+
<div align="center">
|
|
38
|
+
|
|
39
|
+
# 🔱 Logseq Matryca Parser (The Logos Protocol)
|
|
40
|
+
|
|
41
|
+
**Stop feeding broken Markdown to your AI.**
|
|
42
|
+
|
|
43
|
+
[Build status](https://github.com/MarcoPorcellato/logseq-matryca-parser/actions)
|
|
44
|
+
[Python 3.12+](https://www.python.org/downloads/)
|
|
45
|
+
[License: Apache-2.0](https://github.com/MarcoPorcellato/logseq-matryca-parser/blob/main/LICENSE)
|
|
46
|
+
[Quickstart](https://github.com/MarcoPorcellato/logseq-matryca-parser#-quickstart)
|
|
47
|
+
[](#)
|
|
48
|
+

|
|
49
|
+
|
|
50
|
+
> *Turning a forest of local plain-text files into a unified semantic powerhouse.*
|
|
51
|
+
|
|
52
|
+
<p align="center">
|
|
53
|
+
<video src="https://github.com/user-attachments/assets/24f73c6d-3eca-4adb-8442-981f2ba4cccd" autoplay loop muted playsinline width="800"></video>
|
|
54
|
+
</p>
|
|
55
|
+
|
|
56
|
+
[👉 **TRY THE LIVE INTERACTIVE DEMO**](https://MarcoPorcellato.github.io/logseq-matryca-parser/)
|
|
57
|
+
|
|
58
|
+
[📘 **READ THE ARCHITECTURE (LLM OS Vision)**](docs/ARCHITECTURE.md)
|
|
59
|
+
|
|
60
|
+
</div>
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## 🌐 The Vision: Virtual Centralization vs. Binary Lock-in
|
|
65
|
+
|
|
66
|
+
The PKM (Personal Knowledge Management) world is currently forcing users to make a painful choice between **Data Longevity** and **AI Power**.
|
|
67
|
+
|
|
68
|
+
* **Vanilla Logseq / Obsidian** is a "Forest" of decentralized Markdown files. It guarantees the Lindy effect (plain-text lasts forever) and perfect Git versioning, but standard AI chunkers treat it like a blender, destroying the outliner hierarchy.
|
|
69
|
+
* **Tana** is a centralized "Tree". It offers incredible semantic power, but traps your brain in a proprietary cloud database.
|
|
70
|
+
* **The new Logseq DB (SQLite)** aims for database speed, but at a huge cost: it locks your notes inside a binary `.db` file. You lose human-readable files, you lose line-by-line Git diffs, and you lose the immortality of plain-text.
|
|
71
|
+
|
|
72
|
+
### 🔱 The Matryca Solution: The Best of Both Worlds
|
|
73
|
+
**Logseq Matryca Parser** is the ultimate bridge. It allows you to **keep your sovereign, future-proof Markdown files**, while synthesizing a **Virtual Global Graph** in RAM at runtime.
|
|
74
|
+
|
|
75
|
+
It acts as the strict **File System Driver** for your LLM OS. By using a deterministic Stack-Machine to parse your outliner topology, it feeds LangChain or LlamaIndex with the exact parent-child context of every single block.
|
|
76
|
+
|
|
77
|
+
*You get the reasoning power of a centralized relational database, without sacrificing the plain-text soul of your Second Brain in Logseq.*
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## ⚖️ The PKM Landscape
|
|
82
|
+
|
|
83
|
+
| Feature | Vanilla Markdown | **Matryca Parser** | Logseq DB (SQLite) | Tana |
|
|
84
|
+
| :--- | :--- | :--- | :--- | :--- |
|
|
85
|
+
| **Data Format** | Plain-text (.md) | **Plain-text (.md)** | Binary (.db) | Proprietary Cloud |
|
|
86
|
+
| **Version Control** | Perfect (Git) | **Perfect (Git)** | Poor (Binary blob) | None |
|
|
87
|
+
| **Data Structure** | Decentralized Forest | **Virtually Centralized Graph** | Relational Database | Centralized Tree |
|
|
88
|
+
| **AI Readiness** | Low (Linear Chunks) | **High (Topological AST)** | TBD (Requires SQL) | High (Proprietary) |
|
|
89
|
+
| **Sovereignty** | 100% Local | **100% Local (Sovereign AI)** | 100% Local | Cloud-Only |
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
## 🧭 Matryca vs. naive framework loaders
|
|
94
|
+
|
|
95
|
+
| Capability | Typical LangChain / LlamaIndex Markdown loaders | **Matryca (LOGOS + SYNAPSE + graph)** |
|
|
96
|
+
| :--- | :--- | :--- |
|
|
97
|
+
| **Parent–child context** | Character or heading splits; children often orphaned from parents | **True outliner AST**: every block carries `parent_id`, `path`, `left_id` and visits in deterministic tree order |
|
|
98
|
+
| **Block references `((uuid))`** | Treated as opaque text or dropped | **Resolved** against `LogseqGraph`; optional **embed expansion** and **Obsidian `[[Page#^anchor]]`** export |
|
|
99
|
+
| **Property inheritance** | Page-level frontmatter at best | **`get_effective_properties`**: page + ancestor outline keys merged top-down (Org-mode style), then exposed on enriched chunks |
|
|
100
|
+
| **Live sync** | Re-read whole tree or poll | **`LogseqGraph.start_watching()`** (optional `watchdog`): **per-file invalidation** — re-parse one page, purge stale UUIDs from registries, refresh backlinks |
|
|
101
|
+
|
|
102
|
+
---
|
|
103
|
+
|
|
104
|
+
### 🚀 The Problem
|
|
105
|
+
Standard RAG pipelines treat your notes like a blender. They chop Markdown into random shards, destroying the **parent-child hierarchy** that makes Logseq powerful.
|
|
106
|
+
|
|
107
|
+
```mermaid
|
|
108
|
+
graph TD
|
|
109
|
+
Raw[(Logseq Markdown\nFiles)]
|
|
110
|
+
|
|
111
|
+
subgraph Standard RAG
|
|
112
|
+
Blender[Standard Text Splitter\n'The Blender']
|
|
113
|
+
Chunk1[Chunk 1: Orphan text]
|
|
114
|
+
Chunk2[Chunk 2: Lost context]
|
|
115
|
+
Blender --> Chunk1 & Chunk2
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
subgraph Matryca Parser
|
|
119
|
+
Architect[Logos Engine\nStack-Machine]
|
|
120
|
+
Parent[Parent Node\n+ Properties]
|
|
121
|
+
Child[Child Node\n+ Task State & Time]
|
|
122
|
+
Architect --> Parent --> Child
|
|
123
|
+
end
|
|
124
|
+
|
|
125
|
+
Raw --> Blender
|
|
126
|
+
Raw --> Architect
|
|
127
|
+
|
|
128
|
+
classDef bad fill:#fee2e2,stroke:#ef4444,color:#000;
|
|
129
|
+
classDef good fill:#dcfce7,stroke:#22c55e,color:#000;
|
|
130
|
+
class Chunk1,Chunk2 bad;
|
|
131
|
+
class Parent,Child good;
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### 🔱 The Solution
|
|
135
|
+
Logseq Matryca Parser is a deterministic **Stack-Machine engine** that acts as the **File System Driver** for your LLM. It preserves the true topology of your thoughts, ensuring AI understands spatial hierarchy, time, and block-lineage—including **structured task state** and **first-class temporal attributes** you can query in downstream graph databases and GraphRAG engines without re-parsing raw Markdown.
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
## ⚡ Recent superpowers (Waves 4–11)
|
|
140
|
+
|
|
141
|
+
### Obsidian-native export
|
|
142
|
+
Compile an entire Logseq graph into an **Obsidian vault layout**: YAML frontmatter from page properties, list body preserved, Logseq `((uuid))` links rewritten to **`[[Page#^anchor]]`**, and trailing **`^block-id`** on referenced blocks. Namespace titles become nested folders (e.g. `Projects/AI/Demo.md`).
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
matryca-parse export /path/to/logseq/graph /path/to/obsidian/vault --format obsidian
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
> **Note:** Wikilinks currently use the **Logseq page title** (e.g. `[[Target#^…]]`). Vault files may live under namespace folders (`Projects/AI/Demo.md`). Obsidian usually resolves unique titles; aligning link text to folder paths is a possible future refinement.
|
|
149
|
+
|
|
150
|
+
### Live incremental watcher
|
|
151
|
+
`LogseqGraph` supports **surgical file invalidation** (optional dependency: `pip install 'logseq-matryca-parser[watch]'`). `start_watching()` runs a recursive **watchdog** observer: on `created` / `modified` under `pages/` or `journals/`, only that file is re-parsed; stale synthetic UUIDs are purged from `_node_registry` and scrubbed from `_backlink_registry`—no full-graph cold reload.
|
|
152
|
+
|
|
153
|
+
### Fluent topological queries
|
|
154
|
+
Filter the global node registry with a **chainable** API (tags, task state, ancestry under a parent UUID):
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
from logseq_matryca_parser.graph import LogseqGraph
|
|
158
|
+
|
|
159
|
+
graph = LogseqGraph.load_directory("/path/to/logseq/graph")
|
|
160
|
+
hits = (
|
|
161
|
+
graph.query()
|
|
162
|
+
.has_tag("idea")
|
|
163
|
+
.under_parent("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee")
|
|
164
|
+
.is_task_state("TODO")
|
|
165
|
+
.execute()
|
|
166
|
+
)
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### Agent-Native X-Ray Mode (Token Optimization)
|
|
170
|
+
For autonomous LLM agents, passing raw Markdown into the context window wastes thousands of tokens on **36-character UUIDs**, hidden `id::` properties, drawers, and collapsed directives that carry no immediate semantic signal. **X-Ray mode** compresses the parsed AST into **ultra-dense, zero-fluff plain text**: each block becomes `{indent}[{alias}] {clean_text}`, with heavy Logseq UUIDs replaced by **sequential integer aliases** (`[0]`, `[1]`, …) held in a session registry. On typical outlines this can reduce context consumption by **up to ~35×** compared to dumping full block payloads.
|
|
171
|
+
|
|
172
|
+
```bash
|
|
173
|
+
matryca-parse agent-read /path/to/graph --tag idea
|
|
174
|
+
matryca-parse agent-read /path/to/graph --query "quantum"
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
The agent reads cheap topology now; the registry resolves aliases back to sovereign UUIDs when you wire targeted writes.
|
|
178
|
+
|
|
179
|
+
---
|
|
180
|
+
|
|
181
|
+
## 🏗️ Core Capabilities
|
|
182
|
+
|
|
183
|
+
| Feature | Description |
|
|
184
|
+
| :--- | :--- |
|
|
185
|
+
| **LOGOS Engine** | Deterministic AST parsing. No regex-guessing. Handles `id::`, aliases, and multiline blocks. |
|
|
186
|
+
| **Advanced Task Extraction** | Task **state** (TODO / DOING / …), **priority** markers `[#A]`–`[#C]` promoted to `task_priority`, and **SCHEDULED** / **DEADLINE** Logseq timestamps normalized to **UTC Unix epoch seconds** on `scheduled_at` / `deadline_at` for temporal graph and retrieval pipelines. |
|
|
187
|
+
| **SYNAPSE Adapter** | Native exports for **LangChain** and **LlamaIndex** with automated lineage metadata; **context-enriched** chunks with breadcrumbs, embed expansion, and inherited properties. |
|
|
188
|
+
| **FORGE** | JSON, clean Markdown, and **Obsidian** vault serialization (`ObsidianForgeVisitor`, `ForgeExporter.to_obsidian_markdown`). |
|
|
189
|
+
| **LENS Visualizer** | 60FPS interactive graph rendering (10k+ nodes) with Glassmorphism HUD. |
|
|
190
|
+
| **Agent-Native Printing Press** | [`agent_press.py`](src/logseq_matryca_parser/agent_press.py): **`SessionAliasRegistry`** maps session aliases ↔ block UUIDs; **`to_xray_markdown`** emits token-minimal outline text for autonomous agents (`matryca-parse agent-read`). |
|
|
191
|
+
| **Sovereign AI** | 100% Local. Zero telemetry. Private by design. |
|
|
192
|
+
|
|
193
|
+
### Data model — `LogseqNode` task fields
|
|
194
|
+
|
|
195
|
+
Each AST block is a `LogseqNode`. Alongside `task_status`, the parser surfaces priority and schedule metadata as typed fields (epoch integers are **seconds since Unix epoch, UTC**):
|
|
196
|
+
|
|
197
|
+
```json
|
|
198
|
+
{
|
|
199
|
+
"uuid": "6ba7b810-9dad-11d1-80b4-00c04fd430c8",
|
|
200
|
+
"task_status": "TODO",
|
|
201
|
+
"task_priority": "A",
|
|
202
|
+
"scheduled_at": 1641600000,
|
|
203
|
+
"deadline_at": 1641772800,
|
|
204
|
+
"clean_text": "Cut v0.3.0 release"
|
|
205
|
+
}
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
Marker syntax (`[#A]`, `SCHEDULED: <...>`, `DEADLINE: <...>`) is stripped from `clean_text` so embeddings stay clean; the promoted fields carry the structured signal for downstream graph databases and GraphRAG engines.
|
|
209
|
+
|
|
210
|
+
---
|
|
211
|
+
|
|
212
|
+
## 🛠️ Quickstart
|
|
213
|
+
|
|
214
|
+
```bash
|
|
215
|
+
# Install from GitHub (PyPI distribution tracked on roadmap)
|
|
216
|
+
pip install git+https://github.com/MarcoPorcellato/logseq-matryca-parser.git
|
|
217
|
+
|
|
218
|
+
# Optional: filesystem watcher for live incremental graph updates
|
|
219
|
+
pip install 'logseq-matryca-parser[watch]'
|
|
220
|
+
|
|
221
|
+
# 1. Visualize your local graph (LENS)
|
|
222
|
+
matryca-parse visualize /path/to/logseq/graph my-map.html
|
|
223
|
+
|
|
224
|
+
# 2. Export for AI / RAG (SYNAPSE)
|
|
225
|
+
matryca-parse export /path/to/logseq/graph output --format langchain
|
|
226
|
+
|
|
227
|
+
# 3. Context-enriched LangChain JSON (graph + inheritance + embed expansion)
|
|
228
|
+
matryca-parse export /path/to/logseq/graph output --format langchain-enriched
|
|
229
|
+
|
|
230
|
+
# 4. Obsidian vault (YAML frontmatter + ^ block ids)
|
|
231
|
+
matryca-parse export /path/to/logseq/graph output --format obsidian
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
### Python API
|
|
235
|
+
```python
|
|
236
|
+
from logseq_matryca_parser.logos_parser import LogosParser
|
|
237
|
+
from logseq_matryca_parser.synapse import SynapseAdapter
|
|
238
|
+
|
|
239
|
+
# Parse to AST
|
|
240
|
+
page = LogosParser().parse_page_file("page.md")
|
|
241
|
+
|
|
242
|
+
# Export to LangChain with lineage metadata
|
|
243
|
+
docs = SynapseAdapter.to_langchain_documents(page.root_nodes, source_name=page.title)
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
### 🤖 Agentic Write Access (Append-Only)
|
|
247
|
+
|
|
248
|
+
Agents such as Hermes or OpenClaw can record structured notes into a Logseq graph **without rewriting existing pages**. The helper `logseq_agent_write` only **opens the weekly agent page in append mode** (`"a"`), writes a new bullet (journal link + optional tag links + body), and never truncates or replaces prior content—so routine logging cannot wipe blocks that already live in that file.
|
|
249
|
+
|
|
250
|
+
Point it at your graph’s **`pages`** directory and **`config.edn`** so journal titles match Logseq’s `:journal/page-title-format` (including ordinal days when you use `do` in the pattern).
|
|
251
|
+
|
|
252
|
+
```python
|
|
253
|
+
from logseq_matryca_parser import logseq_agent_write
|
|
254
|
+
|
|
255
|
+
result = logseq_agent_write(
|
|
256
|
+
"Summarized user intent and proposed next steps.",
|
|
257
|
+
config_path="/path/to/logseq/config.edn",
|
|
258
|
+
pages_dir="/path/to/logseq/pages",
|
|
259
|
+
context_tags=["agent/hermes", "#session"],
|
|
260
|
+
)
|
|
261
|
+
assert result["status"] == "success"
|
|
262
|
+
# result["path"] → e.g. .../pages/2026-18-agent.md
|
|
263
|
+
```
|
|
264
|
+
---
|
|
265
|
+
|
|
266
|
+
## 🗺️ Roadmap
|
|
267
|
+
- [ ] **Desktop GUI:** Standalone app for non-technical users. [(Join the RFC)](https://github.com/MarcoPorcellato/logseq-matryca-parser/issues/3)
|
|
268
|
+
- [x] **Obsidian Adapter:** Native CLI export (`--format obsidian`) with YAML frontmatter and `^` block anchors.
|
|
269
|
+
- [ ] **Ollama Integration:** One-click local RAG setup.
|
|
270
|
+
|
|
271
|
+
## ☕ Support & Enterprise
|
|
272
|
+
Logseq Matryca Parser is open-source. If it powers your pipeline, consider a star ⭐ or a sponsorship!
|
|
273
|
+
|
|
274
|
+
**💖 [Sponsor me on GitHub](https://github.com/sponsors/MarcoPorcellato)**
|
|
275
|
+
|
|
276
|
+
Need custom RAG integrations or consulting? Contact: [marco@marcoporcellato.it](mailto:marco@marcoporcellato.it)
|
|
277
|
+
|
|
278
|
+
---
|
|
279
|
+
Architected by **Marco Porcellato** | Powered by **Matryca.ai**
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
logseq_matryca_parser/.gitignore,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
logseq_matryca_parser/NOTICE,sha256=maIqe2nNKdNAhLqgjIbO4Xa86EpLKqQwFiOgSze3ewY,230
|
|
3
|
+
logseq_matryca_parser/__init__.py,sha256=yhAqWJMb-Dy62e_mBDYljqALTg3k5SXSCpyxGYGfGU0,1463
|
|
4
|
+
logseq_matryca_parser/__main__.py,sha256=L_hCPkZR0DWIqEka-hrolxdjAczy5AbbUyAe3DIMh60,215
|
|
5
|
+
logseq_matryca_parser/agent_press.py,sha256=LG_RIHBCaY6GG-4Q4jgemb4hiLtxC6q4W_zjcOJQb6s,3722
|
|
6
|
+
logseq_matryca_parser/agent_writer.py,sha256=gDWYmmqjFpar_Iziq3gormj0MRrJraz27SD6rO3mODI,8000
|
|
7
|
+
logseq_matryca_parser/exceptions.py,sha256=7RhkTwkcLT10YLPciD7ZJCnrRC2CZ776ZiZUhRAsWPY,359
|
|
8
|
+
logseq_matryca_parser/forge.py,sha256=F-FCRrfTnm2o_z5LSDzHEBF9Vrnmypk1uXssyMOBpEY,13948
|
|
9
|
+
logseq_matryca_parser/graph.py,sha256=oxnqRJjC-Tkrz_YJHrlyD2cqo0-Mgii0IBIrvBfeS7w,19433
|
|
10
|
+
logseq_matryca_parser/kinetic.py,sha256=jFF_6ddjvQ0UDTSa7YwaUcPM3qfxnyBlE1TSmaRuMlY,19256
|
|
11
|
+
logseq_matryca_parser/lens.py,sha256=2euDU4xU5OJglaeqzKhP9SdKyKZn5h9wyl3YmLULKQ4,16061
|
|
12
|
+
logseq_matryca_parser/logos_core.py,sha256=WUR4ecFyrrvfdCu0V7NESHEL14ZXRdbfegsWYWxT5c4,5896
|
|
13
|
+
logseq_matryca_parser/logos_parser.py,sha256=yRuec3Q6Rdh0OO0R7r8AQiCDdDLu98q_CnucoeMMnsU,40814
|
|
14
|
+
logseq_matryca_parser/pyproject.toml,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
+
logseq_matryca_parser/synapse.py,sha256=N5N6iVv-7s5rsRW9cOaWPB904tLRSjGSVhXwZQXQoZs,12632
|
|
16
|
+
logseq_matryca_parser-0.3.0.dist-info/METADATA,sha256=Slt5KLj-Pn5I_3882tyGFIwGGkJfS6FztemxkAsO1i8,14430
|
|
17
|
+
logseq_matryca_parser-0.3.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
18
|
+
logseq_matryca_parser-0.3.0.dist-info/entry_points.txt,sha256=wQfdsO_DVmG_RIh9oBMv40OIV9nQ_tZaq_w2CAz6Vus,68
|
|
19
|
+
logseq_matryca_parser-0.3.0.dist-info/licenses/LICENSE,sha256=E_Dz4jVCe2JdJH6VkLe7pKrCIr9mifszSIv1usGyLJ0,11334
|
|
20
|
+
logseq_matryca_parser-0.3.0.dist-info/licenses/NOTICE,sha256=maIqe2nNKdNAhLqgjIbO4Xa86EpLKqQwFiOgSze3ewY,230
|
|
21
|
+
logseq_matryca_parser-0.3.0.dist-info/RECORD,,
|