logseq-matryca-parser 1.0.0__tar.gz → 1.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/.cursorignore +28 -3
  2. logseq_matryca_parser-1.1.1/CHANGELOG.md +40 -0
  3. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/CONTRIBUTING.md +14 -0
  4. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/PKG-INFO +44 -7
  5. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/README.md +43 -6
  6. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/docs/ARCHITECTURE.md +30 -4
  7. logseq_matryca_parser-1.1.1/docs/RELEASE_PROCESS.md +67 -0
  8. logseq_matryca_parser-1.1.1/docs/logseq_ast_primer.md +152 -0
  9. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/pyproject.toml +1 -1
  10. logseq_matryca_parser-1.1.1/scripts/extract_changelog.py +138 -0
  11. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/src/logseq_matryca_parser/__init__.py +1 -1
  12. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/src/logseq_matryca_parser/graph.py +137 -16
  13. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/src/logseq_matryca_parser/logos_core.py +4 -5
  14. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/src/logseq_matryca_parser/logos_parser.py +316 -77
  15. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/src/logseq_matryca_parser/logseq_markdown.py +39 -14
  16. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/src/logseq_matryca_parser/logseq_paths.py +22 -1
  17. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/tests/test_graph.py +64 -0
  18. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/tests/test_logos_parser.py +228 -15
  19. logseq_matryca_parser-1.1.1/tests/test_logseq_markdown.py +203 -0
  20. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/tests/test_logseq_paths.py +11 -0
  21. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/uv.lock +4 -4
  22. logseq_matryca_parser-1.0.0/.cursorrules +0 -5
  23. logseq_matryca_parser-1.0.0/docs/logseq_ast_primer.md +0 -82
  24. logseq_matryca_parser-1.0.0/tests/test_logseq_markdown.py +0 -98
  25. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/.github/FUNDING.yml +0 -0
  26. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  27. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  28. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  29. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/.github/dependabot.yml +0 -0
  30. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/.github/workflows/ci.yml +0 -0
  31. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/.github/workflows/pypi_publish.yml +0 -0
  32. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/.gitignore +0 -0
  33. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/.pre-commit-config.yaml +0 -0
  34. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/.repomixignore +0 -0
  35. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/LICENSE +0 -0
  36. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/Makefile +0 -0
  37. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/NOTICE +0 -0
  38. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/ROADMAP_AGENT_NATIVE_XRAY.md +0 -0
  39. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/ROADMAP_HEADLESS_WRITER.md +0 -0
  40. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/ROADMAP_OBSIDIAN_ADAPTER.md +0 -0
  41. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/SECURITY.md +0 -0
  42. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/claude-skill-logseq-read/SKILL.md +0 -0
  43. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/claude-skill-logseq-read/scripts/parse_logseq.py +0 -0
  44. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/docs/design-docs/ARCHITECTURE_BLUEPRINT.md +0 -0
  45. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/docs/design-docs/CODE_SCAFFOLD.md +0 -0
  46. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/docs/design-docs/LOGSEQ_ASSET_RESOLUTION_SPEC.md +0 -0
  47. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/docs/design-docs/LOGSEQ_DATASCRIPT_MAPPING.md +0 -0
  48. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/docs/design-docs/LOGSEQ_TEMPORAL_ONTOLOGY.md +0 -0
  49. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/docs/design-docs/OFFICIAL_MLDOC_SPECS.md +0 -0
  50. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/docs/design-docs/REFERENCE_SPEC.md +0 -0
  51. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/docs/error_log.md +0 -0
  52. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/docs/roadmaps/ROADMAP_CLI_HYDRATION_AND_ENRICHMENT.md +0 -0
  53. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/docs/roadmaps/ROADMAP_CONTEXT_SYNTHESIS_AND_SCOPING.md +0 -0
  54. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/docs/roadmaps/ROADMAP_EMBED_EXPANSION_AND_FLUENT_QUERIES.md +0 -0
  55. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/docs/roadmaps/ROADMAP_GRAPH_RAG_SEMANTICS.md +0 -0
  56. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/docs/roadmaps/ROADMAP_INCREMENTAL_WATCHER.md +0 -0
  57. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/docs/roadmaps/ROADMAP_INLINE_SHIELD_AND_NAMESPACES.md +0 -0
  58. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/docs/roadmaps/ROADMAP_ROBUSTNESS_AND_SOFT_BREAKS.md +0 -0
  59. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/docs/roadmaps/ROADMAP_TOML_FIX_AND_PYPI_DISTRIBUTION.md +0 -0
  60. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/docs/roadmaps/ROADMAP_UUID_AND_GRAPH_SUPERPOWERS.md +0 -0
  61. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/examples/demo_logseq_journal.md +0 -0
  62. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/examples/run_demo.py +0 -0
  63. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/legacy/local_digestor.py +0 -0
  64. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/lib/bindings/utils.js +0 -0
  65. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/lib/tom-select/tom-select.complete.min.js +0 -0
  66. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/lib/tom-select/tom-select.css +0 -0
  67. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/lib/vis-9.1.2/vis-network.css +0 -0
  68. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/lib/vis-9.1.2/vis-network.min.js +0 -0
  69. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/repomix-output-parser.xml +0 -0
  70. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/src/logseq_matryca_parser/.gitignore +0 -0
  71. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/src/logseq_matryca_parser/NOTICE +0 -0
  72. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/src/logseq_matryca_parser/__main__.py +0 -0
  73. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/src/logseq_matryca_parser/agent_press.py +0 -0
  74. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/src/logseq_matryca_parser/agent_writer.py +0 -0
  75. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/src/logseq_matryca_parser/exceptions.py +0 -0
  76. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/src/logseq_matryca_parser/forge.py +0 -0
  77. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/src/logseq_matryca_parser/kinetic.py +0 -0
  78. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/src/logseq_matryca_parser/lens.py +0 -0
  79. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/src/logseq_matryca_parser/pyproject.toml +0 -0
  80. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/src/logseq_matryca_parser/synapse.py +0 -0
  81. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/tests/test_agent_press.py +0 -0
  82. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/tests/test_agent_writer.py +0 -0
  83. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/tests/test_forge.py +0 -0
  84. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/tests/test_kinetic.py +0 -0
  85. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/tests/test_lens.py +0 -0
  86. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/tests/test_package_version.py +0 -0
  87. {logseq_matryca_parser-1.0.0 → logseq_matryca_parser-1.1.1}/tests/test_synapse.py +0 -0
@@ -1,10 +1,35 @@
1
1
  # .cursorignore
2
2
  # (Nota: Cursor ignora già automaticamente tutto ciò che è nel .gitignore)
3
3
 
4
+ # =========================
5
+ # Python & Virtual Environments
6
+ # =========================
7
+ .venv/
8
+ venv/
9
+ env/
10
+ __pycache__/
11
+ *.pyc
12
+
13
+ # =========================
14
+ # Linter & Test Caches
15
+ # =========================
16
+ .ruff_cache/
17
+ .mypy_cache/
18
+ .pytest_cache/
19
+ .coverage
20
+ htmlcov/
21
+
22
+ # =========================
23
+ # Build Artifacts
24
+ # =========================
25
+ dist/
26
+ build/
27
+ *.egg-info/
28
+
4
29
  # =========================
5
30
  # Lockfiles (Letali per il Codebase Indexing)
6
31
  # =========================
7
- # I lockfile devono stare su Git, ma l'IA non deve MAI leggerli,
32
+ # I lockfile devono stare su Git, ma l'IA non deve MAI leggerli,
8
33
  # sono solo un muro di versioni incomprensibili.
9
34
  poetry.lock
10
35
  uv.lock
@@ -33,7 +58,7 @@ tests/fixtures/*.md
33
58
  # =========================
34
59
  # Assets Vettoriali
35
60
  # =========================
36
- # Le immagini PNG/JPG Cursor le ignora da solo, ma gli SVG sono file di testo!
61
+ # Le immagini PNG/JPG Cursor le ignora da solo, ma gli SVG sono file di testo!
37
62
  # Se l'IA legge un SVG, legge migliaia di coordinate matematiche inutili.
38
63
  *.svg
39
64
 
@@ -42,4 +67,4 @@ tests/fixtures/*.md
42
67
  # =========================
43
68
  .vscode/
44
69
  .idea/
45
- .clinerules
70
+ .clinerules
@@ -0,0 +1,40 @@
1
+ # Changelog
2
+
3
+ All notable changes to **logseq-matryca-parser** (The Logos Protocol) are documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [1.1.1] - 2026-05-28
11
+
12
+ ### Added
13
+
14
+ - **Graph page aliases** — `LogseqGraph.load_directory` honors `title::` and `alias::` / `aliases::` for `pages` lookup and backlinks; incremental reload re-applies this enrichment after watcher edits.
15
+ - **LaTeX math shielding** — `_shield_inline_code` masks `$$...$$` and `$...$` spans so wikilinks/tags inside equations are not extracted.
16
+ - **Datalog query dead zones** — `#+BEGIN_QUERY` … `#+END_QUERY` blocks are ignored for entity extraction (parse-loop state plus shielding).
17
+ - **Numbered list blocks** — `logos_parser.py` recognizes ordered-list markers (`1. `, `12. `, etc.) as outliner bullets alongside `-` and `*`.
18
+ - **Markdown task checkboxes** — `[ ]`, `[-]`, and `[x]`/`[X]` on block text map to `TODO`, `DOING`, and `DONE` before Org-mode prefix fallback.
19
+
20
+ ### Fixed
21
+
22
+ - **Logseq OG parity (parser)** — `{{embed [[Page]]}}` and similar macros expose nested wikilinks; Unicode tags and markdown boundaries (`**#tag**`, `==#tag==`); comma-separated `tags::` / `alias::` / `aliases::` inject implicit graph tokens; `~~~` fences share code-block immunity with ` ``` ` fences.
23
+ - **Property contiguity** — block `key:: value` lines apply only while contiguous below the bullet; after a soft-break, later `key::` lines stay in `content` / `clean_text` (Logseq-native behavior).
24
+ - **Property bullet lists** — empty `alias::` / `tags::` followed by indented `-` bullets serialize as `list[str]` without orphan AST children.
25
+ - **Case-insensitive property keys** — all property keys normalized to lowercase at parse time; `TITLE::` frontmatter overrides graph page titles like `title::`.
26
+ - **Extended task markers** — `DELEGATED`, `POSTPONED`, and `IN-PROGRESS` are now recognized (longest-prefix matching) alongside the existing Org-mode statuses.
27
+ - **Aliased block references** — for `[Visible](((uuid)))`, `clean_text` retains only the visible alias (no surrounding `[` `]`).
28
+
29
+ ## [1.0.0] - 2026-05-28
30
+
31
+ ### Added
32
+
33
+ - **LOGOS engine** — deterministic Stack-Machine parser (`StackMachineParser`) producing strict `LogseqPage` / `LogseqNode` ASTs from Spatial Markdown.
34
+ - **SYNAPSE adapters** — LangChain and LlamaIndex exporters with parent-child lineage metadata.
35
+ - **FORGE exporters** — JSON, Markdown, Obsidian, and enriched chunk payloads.
36
+ - **LENS visualizer** — interactive topology HTML via NetworkX / PyVis.
37
+ - **KINETIC CLI** — `matryca-parse` Typer entry point for export, visualization, and agent read/write workflows.
38
+ - **Headless CRUD** — append-only agent writer and X-Ray press utilities for sovereign graph mutation.
39
+ - **Logseq-native serialization** — round-trip page and block property layout via `logseq_markdown.py`.
40
+ - **Graph query layer** — `LogseqGraph` with backlinks, effective property inheritance, and optional filesystem watcher.
@@ -8,6 +8,20 @@ To maintain the architectural integrity of the project, please follow the guidel
8
8
 
9
9
  ---
10
10
 
11
+ ## 📚 Documentation
12
+
13
+ User-facing behavior is documented in:
14
+
15
+ - [`README.md`](README.md) — overview, quickstart, and feature matrix
16
+ - [`docs/ARCHITECTURE.md`](docs/ARCHITECTURE.md) — LOGOS, SYNAPSE, `LogseqGraph`, agents, and data flow
17
+ - [`docs/logseq_ast_primer.md`](docs/logseq_ast_primer.md) — Logseq Spatial Markdown domain rules
18
+ - [`CHANGELOG.md`](CHANGELOG.md) — shipped releases (current: **1.1.1**) and **Unreleased** changes (Keep a Changelog)
19
+ - [`docs/RELEASE_PROCESS.md`](docs/RELEASE_PROCESS.md) — version bump, tag, and PyPI publish checklist
20
+
21
+ When you add or change observable parser or graph behavior, update the relevant doc sections and add a bullet under **`## [Unreleased]`** in `CHANGELOG.md` (see [`.cursor/rules/05-auto-changelog.mdc`](.cursor/rules/05-auto-changelog.mdc)).
22
+
23
+ ---
24
+
11
25
  ## 🏛️ Architectural Philosophy
12
26
 
13
27
  Before writing any code, please understand our core principles:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: logseq-matryca-parser
3
- Version: 1.0.0
3
+ Version: 1.1.1
4
4
  Summary: The Logos Protocol: Deterministic Logseq AST parsing for Matryca.ai.
5
5
  Project-URL: Homepage, https://github.com/MarcoPorcellato/logseq-matryca-parser
6
6
  Project-URL: Bug Tracker, https://github.com/MarcoPorcellato/logseq-matryca-parser/issues
@@ -47,7 +47,7 @@ Description-Content-Type: text/markdown
47
47
  [![Status: Stable](https://img.shields.io/badge/Status-Stable-22c55e.svg?style=flat-square)](#)
48
48
  ![Origin: Matryca.ai](https://img.shields.io/badge/Origin-Matryca.ai-gold?style=for-the-badge)
49
49
 
50
- **v1.0.0 Stable** — heavily tested (156+ tests), full bidirectional Headless CRUD engine, native markdown serialization, and static typing; ready for production Enterprise integration.
50
+ **v1.1.1** — Logseq OG parity release (see [CHANGELOG](CHANGELOG.md)) — **200+ tests**, full bidirectional Headless CRUD engine, native markdown serialization, and static typing; ready for production Enterprise integration.
51
51
 
52
52
  > *Turning a forest of local plain-text files into a unified semantic powerhouse.*
53
53
 
@@ -57,7 +57,7 @@ Description-Content-Type: text/markdown
57
57
 
58
58
  [👉 **TRY THE LIVE INTERACTIVE DEMO**](https://MarcoPorcellato.github.io/logseq-matryca-parser/)
59
59
 
60
- [📘 **READ THE ARCHITECTURE (LLM OS Vision)**](docs/ARCHITECTURE.md)
60
+ [📘 **ARCHITECTURE**](docs/ARCHITECTURE.md) · [AST Primer](docs/logseq_ast_primer.md) · [Changelog](CHANGELOG.md) · [Release process](docs/RELEASE_PROCESS.md)
61
61
 
62
62
  </div>
63
63
 
@@ -100,6 +100,7 @@ It acts as the strict **File System Driver** for your LLM OS. By using a determi
100
100
  | **Block references `((uuid))`** | Treated as opaque text or dropped | **Resolved** against `LogseqGraph`; optional **embed expansion** and **Obsidian `[[Page#^anchor]]`** export |
101
101
  | **Property inheritance** | Page-level frontmatter at best | **`get_effective_properties`**: page + ancestor outline keys merged top-down (Org-mode style), then exposed on enriched chunks |
102
102
  | **Live sync** | Re-read whole tree or poll | **`LogseqGraph.start_watching()`** (optional `watchdog`): **per-file invalidation** — re-parse one page, purge stale UUIDs from registries, refresh backlinks |
103
+ | **Page aliases & titles** | Filename-only or manual link maps | **`title::`**, **`alias::`** / **`aliases::`** re-key `graph.pages` and wire **backlinks** for alias wikilinks |
103
104
 
104
105
  ---
105
106
 
@@ -138,7 +139,37 @@ Logseq Matryca Parser is a deterministic **Stack-Machine engine** that acts as t
138
139
 
139
140
  ---
140
141
 
141
- ## ⚡ Recent superpowers (Waves 4–12)
142
+ ## ⚡ Recent superpowers (v1.1.1)
143
+
144
+ ### Native parity (parser + graph)
145
+
146
+ | Area | Capability |
147
+ | :--- | :--- |
148
+ | **Graph index** | `title::` / `TITLE::` overrides the filename-derived page title; `alias::` / `aliases::` inject extra keys into `graph.pages` (comma-separated strings, bullet-list values, or Python lists). |
149
+ | **Backlinks** | `[[Dev]]` resolves against alias keys the same way as canonical titles (`get_backlinks("Dev")`). |
150
+ | **Incremental reload** | `invalidate_and_reload_page` re-applies title/alias enrichment after watcher edits. |
151
+ | **Parser shields** | LaTeX `$…$` / `$$…$$`, `#+BEGIN_QUERY` … `#+END_QUERY`, fenced code (` ``` ` and `~~~`), drawers, and `{{embed [[Page]]}}` macros do not emit false wikilinks/tags. |
152
+ | **Property contiguity** | `key:: value` lines apply only while contiguous under the bullet; after a soft-break, later property syntax stays in block text. |
153
+ | **Property bullet lists** | `alias::` / `tags::` with indented `-` children become `list[str]` properties — no spurious AST child nodes. |
154
+ | **Outliner bullets** | Ordered-list markers (`1. `, `12. `, …) are first-class bullets alongside `-` and `*`. |
155
+ | **Tasks** | GFM checkboxes (`[ ]`, `[-]`, `[x]`) plus Org-mode markers including `DELEGATED`, `POSTPONED`, `IN-PROGRESS`. |
156
+ | **Aliased block refs** | `[Label](((uuid)))` cleans to `Label` in `clean_text` for RAG-friendly prose. |
157
+
158
+ ```python
159
+ from logseq_matryca_parser.graph import LogseqGraph
160
+
161
+ graph = LogseqGraph.load_directory("/path/to/logseq/graph")
162
+
163
+ # file_name.md with frontmatter: title:: Custom Title
164
+ page = graph.pages["Custom Title"]
165
+
166
+ # Development.md with alias:: Dev, Coding — wikilinks to aliases resolve
167
+ assert graph.pages["Dev"] is graph.pages["Development"]
168
+ linker = graph.pages["Linker"].root_nodes[0]
169
+ assert linker in graph.get_backlinks("Dev")
170
+ ```
171
+
172
+ Deep dive: [Architecture §3.6 — LogseqGraph](docs/ARCHITECTURE.md#36-logseqgraph--namespace-scoping-o1-invalidation-live-watch) and [AST primer — page properties](docs/logseq_ast_primer.md#5-page-properties-title-aliases-and-graph-indexing).
142
173
 
143
174
  ### Obsidian-native export
144
175
  Compile an entire Logseq graph into an **Obsidian vault layout**: YAML frontmatter from page properties, list body preserved, Logseq `((uuid))` links rewritten to **`[[Page#^anchor]]`**, and trailing **`^block-id`** on referenced blocks. Namespace titles become nested folders (e.g. `Projects/AI/Demo.md`).
@@ -194,8 +225,9 @@ For graph hygiene, **`LogseqGraph.get_broken_references()`** flags nodes whose `
194
225
 
195
226
  | Feature | Description |
196
227
  | :--- | :--- |
197
- | **LOGOS Engine** | Deterministic AST parsing. No regex-guessing. Handles `id::`, aliases, and multiline blocks. |
198
- | **Advanced Task Extraction** | Task **state** (TODO / DOING / ), **priority** markers `[#A]`–`[#C]` promoted to `task_priority`, and **SCHEDULED** / **DEADLINE** Logseq timestamps normalized to **UTC Unix epoch seconds** on `scheduled_at` / `deadline_at` for temporal graph and retrieval pipelines. |
228
+ | **LOGOS Engine** | Deterministic AST parsing. Property contiguity, bullet-list properties, lowercase keys, multiline blocks, extended task markers, GFM checkboxes, numbered bullets, and **shielded** code/math/query regions. |
229
+ | **LogseqGraph** | In-memory vault: `pages` index (with **title/alias enrichment**), backlinks, effective properties, namespace resolution, fluent `GraphQuery`, optional **watchdog** invalidation. |
230
+ | **Advanced Task Extraction** | Task **state** (TODO / DOING / DELEGATED / IN-PROGRESS / …), **priority** markers `[#A]`–`[#C]` promoted to `task_priority`, and **SCHEDULED** / **DEADLINE** Logseq timestamps normalized to **UTC Unix epoch seconds** on `scheduled_at` / `deadline_at` for temporal graph and retrieval pipelines. |
199
231
  | **SYNAPSE Adapter** | Native exports for **LangChain** and **LlamaIndex** with automated lineage metadata; **context-enriched** chunks with breadcrumbs, embed expansion, and inherited properties. |
200
232
  | **FORGE** | JSON, clean Markdown, and **Obsidian** vault serialization (`ObsidianForgeVisitor`, `ForgeExporter.to_obsidian_markdown`). |
201
233
  | **LENS Visualizer** | 60FPS interactive graph rendering (10k+ nodes) with Glassmorphism HUD. |
@@ -248,12 +280,17 @@ matryca-parse export /path/to/logseq/graph output --format obsidian
248
280
 
249
281
  ### Python API
250
282
  ```python
283
+ from logseq_matryca_parser.graph import LogseqGraph
251
284
  from logseq_matryca_parser.logos_parser import LogosParser
252
285
  from logseq_matryca_parser.synapse import SynapseAdapter
253
286
 
254
- # Parse to AST
287
+ # Parse a single page to AST
255
288
  page = LogosParser().parse_page_file("page.md")
256
289
 
290
+ # Load the whole vault (pages, backlinks, node registry)
291
+ graph = LogseqGraph.load_directory("/path/to/logseq/graph")
292
+ effective = graph.get_effective_properties(graph.pages["My Page"].root_nodes[0].uuid)
293
+
257
294
  # Export to LangChain with lineage metadata
258
295
  docs = SynapseAdapter.to_langchain_documents(page.root_nodes, source_name=page.title)
259
296
  ```
@@ -11,7 +11,7 @@
11
11
  [![Status: Stable](https://img.shields.io/badge/Status-Stable-22c55e.svg?style=flat-square)](#)
12
12
  ![Origin: Matryca.ai](https://img.shields.io/badge/Origin-Matryca.ai-gold?style=for-the-badge)
13
13
 
14
- **v1.0.0 Stable** — heavily tested (156+ tests), full bidirectional Headless CRUD engine, native markdown serialization, and static typing; ready for production Enterprise integration.
14
+ **v1.1.1** — Logseq OG parity release (see [CHANGELOG](CHANGELOG.md)) — **200+ tests**, full bidirectional Headless CRUD engine, native markdown serialization, and static typing; ready for production Enterprise integration.
15
15
 
16
16
  > *Turning a forest of local plain-text files into a unified semantic powerhouse.*
17
17
 
@@ -21,7 +21,7 @@
21
21
 
22
22
  [👉 **TRY THE LIVE INTERACTIVE DEMO**](https://MarcoPorcellato.github.io/logseq-matryca-parser/)
23
23
 
24
- [📘 **READ THE ARCHITECTURE (LLM OS Vision)**](docs/ARCHITECTURE.md)
24
+ [📘 **ARCHITECTURE**](docs/ARCHITECTURE.md) · [AST Primer](docs/logseq_ast_primer.md) · [Changelog](CHANGELOG.md) · [Release process](docs/RELEASE_PROCESS.md)
25
25
 
26
26
  </div>
27
27
 
@@ -64,6 +64,7 @@ It acts as the strict **File System Driver** for your LLM OS. By using a determi
64
64
  | **Block references `((uuid))`** | Treated as opaque text or dropped | **Resolved** against `LogseqGraph`; optional **embed expansion** and **Obsidian `[[Page#^anchor]]`** export |
65
65
  | **Property inheritance** | Page-level frontmatter at best | **`get_effective_properties`**: page + ancestor outline keys merged top-down (Org-mode style), then exposed on enriched chunks |
66
66
  | **Live sync** | Re-read whole tree or poll | **`LogseqGraph.start_watching()`** (optional `watchdog`): **per-file invalidation** — re-parse one page, purge stale UUIDs from registries, refresh backlinks |
67
+ | **Page aliases & titles** | Filename-only or manual link maps | **`title::`**, **`alias::`** / **`aliases::`** re-key `graph.pages` and wire **backlinks** for alias wikilinks |
67
68
 
68
69
  ---
69
70
 
@@ -102,7 +103,37 @@ Logseq Matryca Parser is a deterministic **Stack-Machine engine** that acts as t
102
103
 
103
104
  ---
104
105
 
105
- ## ⚡ Recent superpowers (Waves 4–12)
106
+ ## ⚡ Recent superpowers (v1.1.1)
107
+
108
+ ### Native parity (parser + graph)
109
+
110
+ | Area | Capability |
111
+ | :--- | :--- |
112
+ | **Graph index** | `title::` / `TITLE::` overrides the filename-derived page title; `alias::` / `aliases::` inject extra keys into `graph.pages` (comma-separated strings, bullet-list values, or Python lists). |
113
+ | **Backlinks** | `[[Dev]]` resolves against alias keys the same way as canonical titles (`get_backlinks("Dev")`). |
114
+ | **Incremental reload** | `invalidate_and_reload_page` re-applies title/alias enrichment after watcher edits. |
115
+ | **Parser shields** | LaTeX `$…$` / `$$…$$`, `#+BEGIN_QUERY` … `#+END_QUERY`, fenced code (` ``` ` and `~~~`), drawers, and `{{embed [[Page]]}}` macros do not emit false wikilinks/tags. |
116
+ | **Property contiguity** | `key:: value` lines apply only while contiguous under the bullet; after a soft-break, later property syntax stays in block text. |
117
+ | **Property bullet lists** | `alias::` / `tags::` with indented `-` children become `list[str]` properties — no spurious AST child nodes. |
118
+ | **Outliner bullets** | Ordered-list markers (`1. `, `12. `, …) are first-class bullets alongside `-` and `*`. |
119
+ | **Tasks** | GFM checkboxes (`[ ]`, `[-]`, `[x]`) plus Org-mode markers including `DELEGATED`, `POSTPONED`, `IN-PROGRESS`. |
120
+ | **Aliased block refs** | `[Label](((uuid)))` cleans to `Label` in `clean_text` for RAG-friendly prose. |
121
+
122
+ ```python
123
+ from logseq_matryca_parser.graph import LogseqGraph
124
+
125
+ graph = LogseqGraph.load_directory("/path/to/logseq/graph")
126
+
127
+ # file_name.md with frontmatter: title:: Custom Title
128
+ page = graph.pages["Custom Title"]
129
+
130
+ # Development.md with alias:: Dev, Coding — wikilinks to aliases resolve
131
+ assert graph.pages["Dev"] is graph.pages["Development"]
132
+ linker = graph.pages["Linker"].root_nodes[0]
133
+ assert linker in graph.get_backlinks("Dev")
134
+ ```
135
+
136
+ Deep dive: [Architecture §3.6 — LogseqGraph](docs/ARCHITECTURE.md#36-logseqgraph--namespace-scoping-o1-invalidation-live-watch) and [AST primer — page properties](docs/logseq_ast_primer.md#5-page-properties-title-aliases-and-graph-indexing).
106
137
 
107
138
  ### Obsidian-native export
108
139
  Compile an entire Logseq graph into an **Obsidian vault layout**: YAML frontmatter from page properties, list body preserved, Logseq `((uuid))` links rewritten to **`[[Page#^anchor]]`**, and trailing **`^block-id`** on referenced blocks. Namespace titles become nested folders (e.g. `Projects/AI/Demo.md`).
@@ -158,8 +189,9 @@ For graph hygiene, **`LogseqGraph.get_broken_references()`** flags nodes whose `
158
189
 
159
190
  | Feature | Description |
160
191
  | :--- | :--- |
161
- | **LOGOS Engine** | Deterministic AST parsing. No regex-guessing. Handles `id::`, aliases, and multiline blocks. |
162
- | **Advanced Task Extraction** | Task **state** (TODO / DOING / ), **priority** markers `[#A]`–`[#C]` promoted to `task_priority`, and **SCHEDULED** / **DEADLINE** Logseq timestamps normalized to **UTC Unix epoch seconds** on `scheduled_at` / `deadline_at` for temporal graph and retrieval pipelines. |
192
+ | **LOGOS Engine** | Deterministic AST parsing. Property contiguity, bullet-list properties, lowercase keys, multiline blocks, extended task markers, GFM checkboxes, numbered bullets, and **shielded** code/math/query regions. |
193
+ | **LogseqGraph** | In-memory vault: `pages` index (with **title/alias enrichment**), backlinks, effective properties, namespace resolution, fluent `GraphQuery`, optional **watchdog** invalidation. |
194
+ | **Advanced Task Extraction** | Task **state** (TODO / DOING / DELEGATED / IN-PROGRESS / …), **priority** markers `[#A]`–`[#C]` promoted to `task_priority`, and **SCHEDULED** / **DEADLINE** Logseq timestamps normalized to **UTC Unix epoch seconds** on `scheduled_at` / `deadline_at` for temporal graph and retrieval pipelines. |
163
195
  | **SYNAPSE Adapter** | Native exports for **LangChain** and **LlamaIndex** with automated lineage metadata; **context-enriched** chunks with breadcrumbs, embed expansion, and inherited properties. |
164
196
  | **FORGE** | JSON, clean Markdown, and **Obsidian** vault serialization (`ObsidianForgeVisitor`, `ForgeExporter.to_obsidian_markdown`). |
165
197
  | **LENS Visualizer** | 60FPS interactive graph rendering (10k+ nodes) with Glassmorphism HUD. |
@@ -212,12 +244,17 @@ matryca-parse export /path/to/logseq/graph output --format obsidian
212
244
 
213
245
  ### Python API
214
246
  ```python
247
+ from logseq_matryca_parser.graph import LogseqGraph
215
248
  from logseq_matryca_parser.logos_parser import LogosParser
216
249
  from logseq_matryca_parser.synapse import SynapseAdapter
217
250
 
218
- # Parse to AST
251
+ # Parse a single page to AST
219
252
  page = LogosParser().parse_page_file("page.md")
220
253
 
254
+ # Load the whole vault (pages, backlinks, node registry)
255
+ graph = LogseqGraph.load_directory("/path/to/logseq/graph")
256
+ effective = graph.get_effective_properties(graph.pages["My Page"].root_nodes[0].uuid)
257
+
221
258
  # Export to LangChain with lineage metadata
222
259
  docs = SynapseAdapter.to_langchain_documents(page.root_nodes, source_name=page.title)
223
260
  ```
@@ -187,9 +187,17 @@ Auxiliary **FORGE** serialization (JSON / flat Markdown / Obsidian) appears as a
187
187
  - **Push** the freshly built `LogseqNode` onto the stack and register its UUID with `PageRegistry` for deterministic identity and future block-reference linkage.
188
188
  This yields **finite-state, linear-time** traversal with explicit ascend/descend behavior — not regex-driven whole-document guessing.
189
189
 
190
- - **Spatial indentation rules.** In Logseq, **indentation defines the AST**, not list decoration. Heading blocks and bullets both participate as first-class structural lines. Levels are **normalized post-pass** to tree depth (`_normalize_indent_levels`) so persisted `indent_level` reflects hierarchical depth independent of authoring quirks after stack repair.
190
+ - **Spatial indentation rules.** In Logseq, **indentation defines the AST**, not list decoration. Heading blocks and bullets both participate as first-class structural lines. The bullet detector accepts **unordered markers** (`-`, `*`) and **ordered-list markers** (`1. `, `12. `, …) via a shared `BULLET_PATTERN`, so numbered outlines participate in the stack machine like standard bullets. Levels are **normalized post-pass** to tree depth (`_normalize_indent_levels`) so persisted `indent_level` reflects hierarchical depth independent of authoring quirks after stack repair.
191
191
 
192
- - **Block properties & `id::`.** Subsequent lines matching `key:: value` attach to **`current_node`** (or accumulate into **frontmatter-derived page properties** when no node exists yet). Parsed properties live in **`LogseqNode.properties`**. Native **`id::`** values are preserved in **`source_uuid`** (and in **`properties["id"]`** when applicable) so **`((uuid))`** references match Logseq; the parser’s stable **`uuid`** field remains the synthetic identity used for AST wiring and adapters.
192
+ - **GFM task checkboxes (before Org-mode tasks).** On the first line of a block, GitHub-flavored checkboxes are recognized and mapped to **`task_status`** before Org-mode prefix fallback: `[ ]` → **`TODO`**, `[-]` → **`DOING`**, `[x]` / `[X]` → **`DONE`**. The checkbox token is stripped from **`clean_text`** so embeddings stay prose-only.
193
+
194
+ - **Org-mode task prefixes (extended).** After checkbox handling, **`_extract_task_status`** matches longest-first Org prefixes (`TODO`, `DOING`, `DELEGATED`, `IN-PROGRESS`, …) at the start of the first line and promotes the remainder to **`clean_text`**.
195
+
196
+ - **Protected regions (entity extraction dead zones).** Wikilink, tag, and block-reference harvesters run on **`_shield_inline_code`**-masked text so literals inside **fenced code** (backtick and tilde fences), **inline code**, **LaTeX** (`$…$` and `$$…$$`), **`#+BEGIN_QUERY` … `#+END_QUERY`** blocks (parse-loop state plus shielding), and **Org drawers** do not produce false graph tokens. **`{{embed [[Page]]}}`** and similar macros are **not** fully opaque: nested wikilinks inside embed bodies are harvested for graph indexing.
197
+
198
+ - **Block properties & `id::`.** Subsequent lines matching `key:: value` attach to **`current_node`** only while **`properties_allowed`** remains true (contiguous property window immediately under the bullet). A **soft-break** continuation disables further property extraction; later `key::` lines merge into **`content`** as plain text. Keys are normalized with **`_normalize_property_key`** (lowercase) for Datomic parity. An empty value (`alias::` with no inline text) opens a **pending bullet-list** accumulator: indented `-` / `*` lines deeper than the property line become **`list[str]`** values without creating child **`LogseqNode`** entries. Page frontmatter uses the same key normalization (`TITLE::` ≡ `title::`). Parsed properties live in **`LogseqNode.properties`**. Native **`id::`** values are preserved in **`source_uuid`** (and in **`properties["id"]`** when applicable) so **`((uuid))`** references match Logseq; the parser’s stable **`uuid`** field remains the synthetic identity used for AST wiring and adapters.
199
+
200
+ - **Aliased block references in `clean_text`.** Markdown links of the form **`[Visible](((uuid)))`** are reduced to **`Visible`** in **`clean_text`** (brackets stripped) while UUIDs still populate **`block_refs`** for graph resolution.
193
201
 
194
202
  #### Sovereign UUID architecture and zero-corruption guarantee
195
203
 
@@ -276,6 +284,24 @@ Both paths keep **existing topology intact** relative to their contract: append-
276
284
 
277
285
  The **in-memory graph** ([`graph.py`](../src/logseq_matryca_parser/graph.py)) is the runtime **RAM image** of the sovereign vault: `pages: dict[str, LogseqPage]`, a private **`_node_registry`** keyed by synthetic block UUID, and a **`_backlink_registry`** mapping normalized link targets to source node UUIDs.
278
286
 
287
+ #### Page title overrides and alias indexing (`_enrich_pages_index`)
288
+
289
+ After every bulk or incremental parse, the graph applies a **post-parse enrichment pass** before backlink construction:
290
+
291
+ 1. **Filename → canonical title.** Each markdown file is first keyed by **`derive_page_title_from_source_path`** (see §3.9).
292
+ 2. **`title::` override.** If page frontmatter contains a non-empty string **`title`**, the frozen `LogseqPage` is updated via **`model_copy(update={"title": custom})`**, the old filename key is removed from **`pages`**, and the page is re-inserted under the custom title (collision with another file’s title is skipped with a debug log).
293
+ 3. **Alias injection.** For each canonical dict entry where **`dict_key == page.title`**, values from **`alias::`** and **`aliases::`** are normalized (comma-separated strings or Python lists; `[[Page]]` / `#tag` adornments stripped using the same rules as [`logseq_markdown.py`](../src/logseq_matryca_parser/logseq_markdown.py)) and registered as **additional keys** pointing at the **same `LogseqPage` instance** — e.g. `pages["Dev"]` and `pages["Development"]` share identity.
294
+ 4. **Backlinks.** **`_build_backlink_registry`** walks **unique pages** (`id(page)` deduplication) so alias keys do not double-count outgoing links. Incoming wikilinks such as **`[[Dev]]`** normalize to lowercase registry keys and resolve through **`get_backlinks("Dev")`** like any other page title.
295
+
296
+ **Incremental parity:** **`invalidate_and_reload_page`** drops **all** `pages` keys tied to the file’s `source_path` (not only the first alias hit), merges the freshly parsed page, re-runs **`_enrich_pages_index`**, then re-registers nodes and appends backlinks for the enriched instance. **`_page_title_for_source_path`** returns the canonical **`page.title`**, not an arbitrary alias key.
297
+
298
+ ```python
299
+ graph = LogseqGraph.load_directory("/vault")
300
+ dev = graph.pages["Dev"] # alias key
301
+ assert dev is graph.pages["Development"]
302
+ assert linker in graph.get_backlinks("Dev")
303
+ ```
304
+
279
305
  #### Namespace shadowing (`resolve_relative_page_link`)
280
306
 
281
307
  Relative page resolution follows **Logseq-style longest-prefix wins**: for a current page title split on **`/`** (namespace segments), the resolver tries candidates **`prefix + "/" + link_target`** for prefixes from **full namespace down to empty**, and returns the **first title that exists** in `pages`. Only if no contextual page exists does it fall back to a **global** title match. Thus a contextual page **`Progetti/AI/Sviluppo`** **shadows** a global **`Sviluppo`** when resolving from **`Progetti/AI/Matryca`** — matching the **nested-namespace shadowing** semantics described in the scoping roadmap.
@@ -287,9 +313,9 @@ Full-directory loads are expensive for always-on agents. **`invalidate_and_reloa
287
313
  1. Ignore paths outside tracked **`pages/*.md`** and **`journals/*.md`**.
288
314
  2. Re-parse the file with **`StackMachineParser.parse_page_file`**, producing a fresh `LogseqPage`.
289
315
  3. If the path previously mapped to a page, collect **all synthetic UUIDs** from the old tree and call **`_purge_stale_page_uuids`**: remove each UUID from **`_node_registry`**, scrub those UUIDs from every **`_backlink_registry`** source list, and delete backlink keys that become empty.
290
- 4. Replace the **`pages`** dict entry (title may change if the file moved), then **`_register_page_nodes`** and **`_append_page_backlinks`** for the new AST.
316
+ 4. Remove every **`pages`** key whose value shares the file’s **`source_path`**, insert the freshly parsed page under its filename title, run **`_enrich_pages_index`** (title + aliases), then **`_register_page_nodes`** and **`_append_page_backlinks`** for the enriched page.
291
317
 
292
- This keeps **global indexes consistent** without rebuilding the entire graph.
318
+ This keeps **global indexes consistent** without rebuilding the entire graph — including alias keys and custom titles declared in frontmatter.
293
319
 
294
320
  #### Live filesystem watcher (`start_watching`)
295
321
 
@@ -0,0 +1,67 @@
1
+ # Release process
2
+
3
+ **Logseq Matryca Parser** (The Logos Protocol · Marco Porcellato · [Matryca.ai](https://matryca.ai)) uses a **curated** [`CHANGELOG.md`](../CHANGELOG.md) (Keep a Changelog). PyPI publishing is triggered when you push a `v*` git tag.
4
+
5
+ ---
6
+
7
+ ## During development
8
+
9
+ Add user-facing bullets under **`## [Unreleased]`** (`Added` / `Changed` / `Fixed` / `Removed` / `Security`). One line per notable change. See [`.cursor/rules/05-auto-changelog.mdc`](../.cursor/rules/05-auto-changelog.mdc).
10
+
11
+ ---
12
+
13
+ ## Release day (local)
14
+
15
+ Replace `X.Y.Z` with the semver you are shipping (no `v` prefix in `pyproject.toml`; use `vX.Y.Z` for the git tag).
16
+
17
+ ### 1. Prepare (Cursor or manual)
18
+
19
+ - [ ] Move everything from `[Unreleased]` to `## [X.Y.Z] - YYYY-MM-DD` in `CHANGELOG.md`
20
+ - [ ] Leave an empty `## [Unreleased]` section at the top
21
+ - [ ] Set `version = "X.Y.Z"` in `pyproject.toml`
22
+ - [ ] Run `make all` (ruff, mypy, pytest)
23
+
24
+ **Cursor shortcut:** ask the agent to *“prepare release vX.Y.Z”* (see [`.cursor/rules/04-release-preparation.mdc`](../.cursor/rules/04-release-preparation.mdc)).
25
+
26
+ ### 2. Verify release notes (optional but recommended)
27
+
28
+ ```bash
29
+ python scripts/extract_changelog.py vX.Y.Z | less
30
+ ```
31
+
32
+ You should see exactly the section that will appear on GitHub if you attach release notes manually.
33
+
34
+ ### 3. Commit, tag, push
35
+
36
+ ```bash
37
+ git add CHANGELOG.md pyproject.toml
38
+ git commit -m "chore: release X.Y.Z"
39
+ git tag vX.Y.Z
40
+ git push origin main
41
+ git push origin vX.Y.Z
42
+ ```
43
+
44
+ ### 4. CI does the rest
45
+
46
+ On tag push, [`.github/workflows/pypi_publish.yml`](../.github/workflows/pypi_publish.yml):
47
+
48
+ 1. Builds sdist and wheel with `python -m build`
49
+ 2. Publishes to PyPI (trusted publishing)
50
+
51
+ ---
52
+
53
+ ## Troubleshooting
54
+
55
+ | Problem | Fix |
56
+ |---------|-----|
57
+ | PyPI version already exists | Bump patch version; never re-use a published version. |
58
+ | Notes look wrong | Re-run locally: `python scripts/extract_changelog.py vX.Y.Z` and compare to `CHANGELOG.md`. |
59
+ | CI fails on tests | Run `make all` locally before tagging. |
60
+
61
+ ---
62
+
63
+ ## Related
64
+
65
+ - [`CHANGELOG.md`](../CHANGELOG.md)
66
+ - [`CONTRIBUTING.md`](../CONTRIBUTING.md) — quality gates before tag
67
+ - [`scripts/extract_changelog.py`](../scripts/extract_changelog.py)
@@ -0,0 +1,152 @@
1
+ # 🧠 Logseq AST Primer: Understanding Spatial Markdown and the Logos Protocol
2
+
3
+ To contribute to or understand the **Logos Protocol**, you must first understand the idiosyncratic nature of Logseq's data structure.
4
+
5
+ Logseq does not use standard Markdown; it uses an **Outliner-based Spatial Markdown**.
6
+ Standard NLP text splitters and RAG chunkers destroy Logseq data because they parse text linearly. Logseq must be parsed **topologically**.
7
+
8
+ Here is the domain logic that the Logos Stack-Machine is built to handle.
9
+
10
+ ---
11
+
12
+ ## 1. The Outliner Paradigm (Spatial Indentation)
13
+
14
+ In standard Markdown, lists are just formatting. In Logseq, **indentation dictates the Abstract Syntax Tree (AST)**. The physical space defines the semantic parent-child relationship.
15
+
16
+ **Example:**
17
+
18
+ ```markdown
19
+ - Strategy Meeting
20
+ - Discussed Q3 goals
21
+ - Marketing budget needs approval
22
+ - Fired the PR agency
23
+ ```
24
+
25
+ ### The AST Translation
26
+
27
+ - **"Strategy Meeting" (Level 0)** is the Parent.
28
+ - **"Discussed Q3 goals" (Level 1)** is a Child.
29
+
30
+ If you delete the parent, this child loses its context. Standard chunkers will split these lines into different vector embeddings, destroying the semantic link. The Logos engine preserves this topology via exact `parent_id` and `path` lineage.
31
+
32
+ ---
33
+
34
+ ## 2. Block Properties (The Metadata Layer)
35
+
36
+ Logseq allows injecting metadata directly into blocks. These are not standard Markdown frontmatter, but inline key-value pairs that must immediately follow the block's first line.
37
+
38
+ **Example:**
39
+
40
+ ```markdown
41
+ - Advanced RAG Architecture #[[AI]]
42
+ id:: 6628ec8c-5544-486a-8d77-62860c239851
43
+ collapsed:: true
44
+ custom_state:: verified
45
+ - First principle of data extraction...
46
+ ```
47
+
48
+ ### Parsing Rules for Logos
49
+
50
+ 1. Properties (like `id::`) belong to the block above them and must appear **contiguously** immediately after the bullet line (or after other property lines in the same window).
51
+ 2. If a **soft-break** plain-text line appears in the block body, the property window **closes**: later `key:: value` lines are **plain text**, not metadata.
52
+ 3. Property keys are stored **lowercase** (`Title::` → `title`) to match Logseq’s case-insensitive Datomic attributes.
53
+ 4. Property lines must be stripped from the raw text content to avoid polluting the AI's context window (`clean_node_content` uses case-insensitive matching).
54
+ 5. The `id::` property is sacred: it overrides any deterministic UUID generation because it is the native anchor for Logseq's internal block-references `((uuid))`.
55
+
56
+ **Bullet-list property values** (Logseq-native):
57
+
58
+ ```markdown
59
+ - Root block
60
+ tags::
61
+ - Alpha
62
+ - Beta
63
+ ```
64
+
65
+ When `key::` has no inline value, the indented bullet children are absorbed into a list-valued property — here **`properties["tags"] == ["Alpha", "Beta"]`** — and they do **not** become outline child nodes.
66
+
67
+ ---
68
+
69
+ ## 3. Soft Breaks vs. Hard Breaks (Multiline Blocks)
70
+
71
+ A single block in Logseq can contain multiple lines of text without creating a new node. Such in-block line breaks (soft breaks) are entered with `Shift+Enter`.
72
+
73
+ **Example:**
74
+
75
+ ```markdown
76
+ - This is the first line of the block.
77
+ This is the second line of the SAME block.
78
+ - This is a new child block.
79
+ ```
80
+
81
+ ### Parsing Rules for Logos
82
+
83
+ If a line does not start with a bullet marker (`-`, `*`, or an ordered-list marker such as `1. `) and is not a property line (`key:: value`), it is treated as a multiline continuation of the `current_node`.
84
+
85
+ **Ordered lists:** Lines matching `1. `, `12. `, etc. are treated as structural bullets with the same indentation rules as `-` and `*`.
86
+
87
+ **GFM checkboxes:** On the first line of a block, `[ ]`, `[-]`, and `[x]` / `[X]` map to `task_status` values `TODO`, `DOING`, and `DONE` respectively (checked before Org-mode prefixes).
88
+
89
+ **Org-mode task markers:** Prefixes such as `TODO`, `DOING`, `DELEGATED`, `POSTPONED`, and `IN-PROGRESS` (longest match first) set `task_status` and are removed from `clean_text`.
90
+
91
+ **Aliased block references:** `[My Label](((block-uuid)))` resolves the UUID for `block_refs` but `clean_text` exposes only `My Label` (no square brackets).
92
+
93
+ ---
94
+
95
+ ## 4. Page Properties (Frontmatter) and Graph Indexing
96
+
97
+ Unlike block properties, **page properties** live at the **top of the file** as raw `key:: value` lines (no leading `- `), followed by a blank line before the first outline bullet.
98
+
99
+ **Example:**
100
+
101
+ ```markdown
102
+ title:: Custom Title
103
+ alias:: Dev, Coding
104
+ tags:: parser, logseq
105
+
106
+ - First root block
107
+ ```
108
+
109
+ ### Parsing rules
110
+
111
+ 1. Keys and values are stored in **`LogseqPage.properties`** with **lowercase keys**; the parser does **not** automatically change **`LogseqPage.title`** (that remains filename-derived until graph load).
112
+ 2. **`alias::`** and **`aliases::`** accept comma-separated strings, **bullet-list** values (`alias::` followed by indented `-` lines), or Python `list` values after parse. Serialization strips `[[wikilink]]` and `#tag` adornments from each token.
113
+
114
+ ### `LogseqGraph` enrichment
115
+
116
+ When you call **`LogseqGraph.load_directory`**, a post-parse pass:
117
+
118
+ - Applies **`title::`** → updates **`page.title`** and re-keys **`graph.pages`**
119
+ - Injects **alias keys** → with the example above, `graph.pages["Dev"]` points to the same object as `graph.pages["Custom Title"]`
120
+ - Builds **backlinks** so `[[Dev]]` resolves like a canonical page link
121
+
122
+ See [Architecture §3.6](ARCHITECTURE.md#36-logseqgraph--namespace-scoping-o1-invalidation-live-watch) for the full pipeline.
123
+
124
+ ---
125
+
126
+ ## 5. Protected Regions (Dead Zones)
127
+
128
+ Logseq markdown often contains syntax that **looks** like links or tags but must not become graph entities:
129
+
130
+ | Region | Why it is shielded |
131
+ | :--- | :--- |
132
+ | Fenced / inline code | Literals such as `[[not-a-page]]` inside samples |
133
+ | LaTeX `$…$` and `$$…$$` | Equations may contain bracket-heavy notation |
134
+ | `#+BEGIN_QUERY` … `#+END_QUERY` | Datalog snippets reference `[[pages]]` symbolically |
135
+ | Org drawers (`:LOGBOOK:`, etc.) | System metadata, not prose |
136
+ | `{{embed [[Page]]}}` macros | **Exception — not fully shielded:** nested wikilinks inside embed bodies **are** extracted for the graph |
137
+
138
+ The LOGOS engine masks the shielded spans (all rows above except embed macros, whose nested wikilinks are still harvested) before wikilink/tag/block-ref extraction (see `_shield_inline_code` in `logos_parser.py`). This keeps vector pipelines and backlink registries aligned with **human navigation intent**, not accidental token matches inside non-prose regions.
139
+
140
+ ---
141
+
142
+ ## 6. The Matryca Moat: Why Standard RAG Fails
143
+
144
+ If you feed Logseq Markdown into `RecursiveCharacterTextSplitter` (LangChain) or similar naive chunkers:
145
+
146
+ - It splits blocks mid-sentence based on character count.
147
+ - It completely loses the parent-child indentation context.
148
+ - It ingests system properties (e.g., `collapsed:: true`) as semantic text, confusing the LLM.
149
+
150
+ The **Logos Protocol** solves this by walking the AST deterministically, isolating properties, shielding dead-zone literals, and using the `SYNAPSE` adapter to export native LangChain `Document` or LlamaIndex `TextNode` objects. Every generated object retains its exact hierarchical lineage in the metadata, feeding your local LLM perfectly structured data.
151
+
152
+ For vault-wide navigation (aliases, backlinks, namespace shadowing), load the graph with **`LogseqGraph`** — see the [README](../README.md) and [CHANGELOG](../CHANGELOG.md) (**v1.1.1** OG parity).
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "logseq-matryca-parser"
7
- version = "1.0.0"
7
+ version = "1.1.1"
8
8
  description = "The Logos Protocol: Deterministic Logseq AST parsing for Matryca.ai."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.12"