delfos 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. delfos-0.1.0/.devin/skills/llvm-cpp-toolchain/SKILL.md +104 -0
  2. delfos-0.1.0/.env.example +38 -0
  3. delfos-0.1.0/.github/workflows/ci.yml +72 -0
  4. delfos-0.1.0/.github/workflows/release.yml +77 -0
  5. delfos-0.1.0/.gitignore +41 -0
  6. delfos-0.1.0/.python-version +1 -0
  7. delfos-0.1.0/AGENTS.md +57 -0
  8. delfos-0.1.0/ARCHITECTURE.md +220 -0
  9. delfos-0.1.0/CLAUDE.md +129 -0
  10. delfos-0.1.0/CMakeLists.txt +112 -0
  11. delfos-0.1.0/CMakePresets.json +41 -0
  12. delfos-0.1.0/LICENSE +190 -0
  13. delfos-0.1.0/PKG-INFO +116 -0
  14. delfos-0.1.0/README.md +95 -0
  15. delfos-0.1.0/delfos/__init__.py +52 -0
  16. delfos-0.1.0/delfos/_delfos.pyi +119 -0
  17. delfos-0.1.0/delfos/_logging.py +33 -0
  18. delfos-0.1.0/delfos/cli/__init__.py +1 -0
  19. delfos-0.1.0/delfos/cli/__main__.py +10 -0
  20. delfos-0.1.0/delfos/cli/app.py +189 -0
  21. delfos-0.1.0/delfos/cli/render.py +82 -0
  22. delfos-0.1.0/delfos/config.py +268 -0
  23. delfos-0.1.0/delfos/indexer/__init__.py +31 -0
  24. delfos-0.1.0/delfos/indexer/embedder.py +106 -0
  25. delfos-0.1.0/delfos/indexer/extractor.py +241 -0
  26. delfos-0.1.0/delfos/indexer/parser.py +290 -0
  27. delfos-0.1.0/delfos/indexer/pipeline.py +303 -0
  28. delfos-0.1.0/delfos/mcp/__init__.py +1 -0
  29. delfos-0.1.0/delfos/mcp/__main__.py +67 -0
  30. delfos-0.1.0/delfos/mcp/config.py +19 -0
  31. delfos-0.1.0/delfos/mcp/server.py +176 -0
  32. delfos-0.1.0/delfos/mcp/views.py +113 -0
  33. delfos-0.1.0/delfos/py.typed +0 -0
  34. delfos-0.1.0/delfos/reconstruct/__init__.py +20 -0
  35. delfos-0.1.0/delfos/reconstruct/planner.py +62 -0
  36. delfos-0.1.0/delfos/reconstruct/planners/__init__.py +1 -0
  37. delfos-0.1.0/delfos/reconstruct/planners/fake.py +37 -0
  38. delfos-0.1.0/delfos/reconstruct/planners/openai.py +62 -0
  39. delfos-0.1.0/delfos/reconstruct/service.py +284 -0
  40. delfos-0.1.0/delfos/reconstruct/summaries.py +34 -0
  41. delfos-0.1.0/delfos/schema/__init__.py +41 -0
  42. delfos-0.1.0/delfos/schema/edges.py +34 -0
  43. delfos-0.1.0/delfos/schema/enums.py +90 -0
  44. delfos-0.1.0/delfos/schema/nodes.py +133 -0
  45. delfos-0.1.0/delfos/scip/__init__.py +0 -0
  46. delfos-0.1.0/delfos/scip/generate.py +88 -0
  47. delfos-0.1.0/delfos/scip/reader.py +137 -0
  48. delfos-0.1.0/delfos/scip/scip.proto +961 -0
  49. delfos-0.1.0/delfos/scip/scip_pb2.py +97 -0
  50. delfos-0.1.0/delfos/scip/scip_pb2.pyi +758 -0
  51. delfos-0.1.0/delfos/scip/service.py +59 -0
  52. delfos-0.1.0/delfos/store/__init__.py +11 -0
  53. delfos-0.1.0/delfos/store/base.py +179 -0
  54. delfos-0.1.0/delfos/store/native_store.py +355 -0
  55. delfos-0.1.0/delfos/workspace.py +209 -0
  56. delfos-0.1.0/docs/decisions.md +158 -0
  57. delfos-0.1.0/libdelfos/bench/CMakeLists.txt +11 -0
  58. delfos-0.1.0/libdelfos/bench/bench_vector.cpp +75 -0
  59. delfos-0.1.0/libdelfos/bindings/CMakeLists.txt +6 -0
  60. delfos-0.1.0/libdelfos/bindings/py_delfos.cpp +407 -0
  61. delfos-0.1.0/libdelfos/flatbuffers/delfos.fbs +60 -0
  62. delfos-0.1.0/libdelfos/flatbuffers/delfos_generated.h +1194 -0
  63. delfos-0.1.0/libdelfos/include/delfos/delfos.hpp +11 -0
  64. delfos-0.1.0/libdelfos/include/delfos/edge.hpp +24 -0
  65. delfos-0.1.0/libdelfos/include/delfos/graph.hpp +374 -0
  66. delfos-0.1.0/libdelfos/include/delfos/node.hpp +54 -0
  67. delfos-0.1.0/libdelfos/include/delfos/snapshot.hpp +273 -0
  68. delfos-0.1.0/libdelfos/include/delfos/types.hpp +22 -0
  69. delfos-0.1.0/libdelfos/include/delfos/vector_index.hpp +219 -0
  70. delfos-0.1.0/libdelfos/tests/CMakeLists.txt +20 -0
  71. delfos-0.1.0/libdelfos/tests/test_graph.cpp +711 -0
  72. delfos-0.1.0/libdelfos/tests/test_snapshot.cpp +415 -0
  73. delfos-0.1.0/libdelfos/tests/test_vector_index.cpp +293 -0
  74. delfos-0.1.0/pyproject.toml +84 -0
  75. delfos-0.1.0/tests/__init__.py +0 -0
  76. delfos-0.1.0/tests/cli/__init__.py +0 -0
  77. delfos-0.1.0/tests/cli/conftest.py +22 -0
  78. delfos-0.1.0/tests/cli/test_commands.py +51 -0
  79. delfos-0.1.0/tests/cli/test_doctor.py +67 -0
  80. delfos-0.1.0/tests/cli/test_dotenv.py +19 -0
  81. delfos-0.1.0/tests/cli/test_parser.py +58 -0
  82. delfos-0.1.0/tests/cli/test_render.py +68 -0
  83. delfos-0.1.0/tests/indexer/__init__.py +0 -0
  84. delfos-0.1.0/tests/indexer/test_parser.py +241 -0
  85. delfos-0.1.0/tests/indexer/test_pipeline_scip.py +195 -0
  86. delfos-0.1.0/tests/mcp/__init__.py +0 -0
  87. delfos-0.1.0/tests/mcp/conftest.py +26 -0
  88. delfos-0.1.0/tests/mcp/test_config.py +101 -0
  89. delfos-0.1.0/tests/mcp/test_server.py +217 -0
  90. delfos-0.1.0/tests/mcp/test_views.py +71 -0
  91. delfos-0.1.0/tests/reconstruct/__init__.py +0 -0
  92. delfos-0.1.0/tests/reconstruct/conftest.py +104 -0
  93. delfos-0.1.0/tests/reconstruct/test_exports.py +9 -0
  94. delfos-0.1.0/tests/reconstruct/test_fetch.py +63 -0
  95. delfos-0.1.0/tests/reconstruct/test_openai_planner.py +128 -0
  96. delfos-0.1.0/tests/reconstruct/test_planner.py +71 -0
  97. delfos-0.1.0/tests/reconstruct/test_reconstruct.py +185 -0
  98. delfos-0.1.0/tests/reconstruct/test_search.py +32 -0
  99. delfos-0.1.0/tests/reconstruct/test_summaries.py +63 -0
  100. delfos-0.1.0/tests/reconstruct/test_traverse.py +100 -0
  101. delfos-0.1.0/tests/scip/__init__.py +0 -0
  102. delfos-0.1.0/tests/scip/builders.py +91 -0
  103. delfos-0.1.0/tests/scip/test_generate.py +66 -0
  104. delfos-0.1.0/tests/scip/test_reader.py +146 -0
  105. delfos-0.1.0/tests/scip/test_service.py +108 -0
  106. delfos-0.1.0/tests/store/__init__.py +0 -0
  107. delfos-0.1.0/tests/store/test_native_store.py +388 -0
  108. delfos-0.1.0/tests/test_config.py +172 -0
  109. delfos-0.1.0/tests/test_e2e.py +170 -0
  110. delfos-0.1.0/tests/test_embedder.py +72 -0
  111. delfos-0.1.0/tests/test_logging.py +122 -0
  112. delfos-0.1.0/tests/test_workspace.py +133 -0
  113. delfos-0.1.0/uv.lock +952 -0
@@ -0,0 +1,104 @@
1
+ ---
2
+ name: llvm-cpp-toolchain
3
+ description: LLVM-based toolchain for debugging and profiling C++ code in this project. Covers sanitizers, profiling, optimization, and benchmarking — all Clang/LLVM, no GCC or Valgrind.
4
+ ---
5
+
6
+ # C++ Debugging & Profiling Toolchain (LLVM)
7
+
8
+ All debugging and performance work uses the LLVM/Clang toolchain exclusively.
9
+
10
+ ## Sanitizers (Debug/CI)
11
+
12
+ | Sanitizer | Flag | Catches | Overhead |
13
+ |---|---|---|---|
14
+ | AddressSanitizer | `-fsanitize=address` | Buffer overflows, use-after-free, double-free, leaks | ~2x |
15
+ | UndefinedBehaviorSanitizer | `-fsanitize=undefined` | Signed overflow, null deref, misaligned access | ~1.2x |
16
+ | ThreadSanitizer | `-fsanitize=thread` | Data races, lock-order inversions | ~5–15x |
17
+ | MemorySanitizer | `-fsanitize=memory` | Reads of uninitialized memory | ~3x |
18
+
19
+ Always compile with `-fno-omit-frame-pointer -g` alongside sanitizers. ASan+UBSan should run in CI on every commit. Run TSan in a separate build when concurrency is involved; it cannot be combined with ASan.
20
+
21
+ ## Profiling
22
+
23
+ | Tool | Purpose | Usage |
24
+ |---|---|---|
25
+ | `perf record -g` + flamegraphs | Find where CPU time goes | `perf record -g --call-graph=fp ./binary && perf script \| stackcollapse-perf.pl \| flamegraph.pl > flame.svg` |
26
+ | `llvm-xray` | Precise per-function instrumentation tracing | Compile with `-fxray-instrument`, run with `XRAY_OPTIONS="patch_premain=true xray_mode=xray-basic"`, analyze with `llvm-xray account` or convert to Perfetto/Chrome trace |
27
+ | `perf stat` hw counters | Cache misses, branch mispredicts | `perf stat -e cache-misses,branch-misses,LLC-load-misses ./binary` |
28
+ | MemProf | Allocation hot/cold analysis | Compile with `-fmemory-profile -gmlt`, merge with `llvm-profdata` |
29
+
30
+ Build for profiling: `CMAKE_BUILD_TYPE=RelWithDebInfo` + `-fno-omit-frame-pointer`.
31
+
32
+ ## Optimization
33
+
34
+ | Tool | What it does | When to use |
35
+ |---|---|---|
36
+ | PGO (`-fprofile-generate` / `-fprofile-use`) | Profile-guided code generation (10–20% gains) | Once code is stable, before release |
37
+ | LTO (`-flto=thin`) | Link-time optimization across translation units | Always in release builds |
38
+ | BOLT (`llvm-bolt`) | Post-link binary layout optimization (5–15% gains) | Final production binary |
39
+
40
+ PGO workflow:
41
+ ```bash
42
+ clang++ -fprofile-generate -O2 ... # instrumented build
43
+ ./binary --workload # run representative workload
44
+ llvm-profdata merge -o default.profdata *.profraw
45
+ clang++ -fprofile-use=default.profdata -O3 -flto=thin ... # optimized build
46
+ ```
47
+
48
+ ## Benchmarking
49
+
50
+ Use [nanobench](https://github.com/martinus/nanobench) (single header). It reads perf counters directly and reports median, percentiles, instructions/op, branch misses/op, cache misses/op.
51
+
52
+ ```cpp
53
+ #include <nanobench.h>
54
+
55
+ ankerl::nanobench::Bench().run("name", [&] {
56
+ auto result = function_under_test();
57
+ ankerl::nanobench::doNotOptimizeAway(result);
58
+ });
59
+ ```
60
+
61
+ ## CMake Presets
62
+
63
+ ```json
64
+ {
65
+ "configurePresets": [
66
+ {
67
+ "name": "debug",
68
+ "cacheVariables": {
69
+ "CMAKE_BUILD_TYPE": "Debug",
70
+ "CMAKE_CXX_COMPILER": "clang++",
71
+ "CMAKE_CXX_FLAGS": "-fsanitize=address,undefined -fno-omit-frame-pointer"
72
+ }
73
+ },
74
+ {
75
+ "name": "profile",
76
+ "cacheVariables": {
77
+ "CMAKE_BUILD_TYPE": "RelWithDebInfo",
78
+ "CMAKE_CXX_COMPILER": "clang++",
79
+ "CMAKE_CXX_FLAGS": "-fno-omit-frame-pointer -fxray-instrument"
80
+ }
81
+ },
82
+ {
83
+ "name": "release",
84
+ "cacheVariables": {
85
+ "CMAKE_BUILD_TYPE": "Release",
86
+ "CMAKE_CXX_COMPILER": "clang++",
87
+ "CMAKE_CXX_FLAGS": "-O3 -flto=thin -march=native"
88
+ }
89
+ }
90
+ ]
91
+ }
92
+ ```
93
+
94
+ ## Workflow
95
+
96
+ ```
97
+ 1. Develop with ASan+UBSan (catch memory/UB bugs immediately)
98
+ 2. Write nanobench micro-benchmarks (establish baseline)
99
+ 3. perf → flamegraph (identify hotspots)
100
+ 4. llvm-xray → Perfetto (precise function-level timing)
101
+ 5. perf stat (validate cache/branch behavior)
102
+ 6. PGO + LTO (profile-guided release build)
103
+ 7. BOLT (final binary layout optimization)
104
+ ```
@@ -0,0 +1,38 @@
1
+ # Copy to .env at your repo root and edit. Real environment variables always
2
+ # override these; this file is loaded explicitly at startup, anchored to the
3
+ # repo being served/indexed (via --repo or DELFOS_REPO) — never by walking up
4
+ # from wherever the process happens to be launched.
5
+ #
6
+ # llama.cpp note: one llama-server serves one model, so run TWO servers and
7
+ # point the chat/embed base URLs at their separate ports:
8
+ # llama-server -m chat.gguf --port 8080 --jinja
9
+ # llama-server -m embed.gguf --port 8081 --embedding --pooling mean
10
+ # (--jinja enables the chat template; --embedding turns on /v1/embeddings.)
11
+
12
+ # OpenAI-compatible endpoint for the hop planner (chat).
13
+ # llama.cpp: http://localhost:8080/v1
14
+ # Ollama: http://localhost:11434/v1
15
+ # LM Studio: http://localhost:1234/v1
16
+ DELFOS_LLM_BASE_URL=http://localhost:8080/v1
17
+
18
+ # Chat model that supports JSON-schema structured outputs.
19
+ # llama.cpp ignores this value (it serves the loaded model); set anything.
20
+ DELFOS_LLM_MODEL=local-chat
21
+
22
+ # Local servers ignore the value, but the client requires one.
23
+ DELFOS_LLM_API_KEY=local
24
+
25
+ # Embedding endpoint for semantic seeds. With llama.cpp this is a SEPARATE
26
+ # server/port from the chat one — set it explicitly; there is no automatic
27
+ # fallback to DELFOS_LLM_BASE_URL.
28
+ DELFOS_EMBED_BASE_URL=http://localhost:8081/v1
29
+
30
+ # Local embedding model and its output dimension (must match the model).
31
+ # nomic-embed-text -> 768. Set DELFOS_EMBED_DIM to your model's dimension.
32
+ DELFOS_EMBED_MODEL=local-embed
33
+ DELFOS_EMBED_DIM=768
34
+
35
+ # API key for the embedding endpoint. Local servers usually ignore the value,
36
+ # but a string is still required. A real key is required when
37
+ # DELFOS_EMBED_BASE_URL is unset, because you're then targeting the real OpenAI API.
38
+ DELFOS_EMBED_API_KEY=local
@@ -0,0 +1,72 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ concurrency:
10
+ group: ${{ github.workflow }}-${{ github.ref }}
11
+ cancel-in-progress: true
12
+
13
+ jobs:
14
+ lint:
15
+ runs-on: ubuntu-latest
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+ - uses: astral-sh/setup-uv@v5
19
+ with:
20
+ python-version: "3.12"
21
+ enable-cache: true
22
+ - name: Install dependencies
23
+ # Lint/type-check never import the compiled _delfos extension (a
24
+ # checked-in delfos/_delfos.pyi stub covers pyright), so skip building
25
+ # and installing the project itself — it's the slowest part of `uv sync`.
26
+ run: uv sync --no-install-project
27
+ - name: Ruff lint
28
+ # --no-sync: `uv run` auto-syncs (and would rebuild the project) otherwise.
29
+ run: uv run --no-sync ruff check .
30
+ - name: Ruff format check
31
+ run: uv run --no-sync ruff format --check .
32
+ - name: Pyright
33
+ run: uv run --no-sync pyright
34
+
35
+ test:
36
+ strategy:
37
+ fail-fast: false
38
+ matrix:
39
+ os: [ubuntu-latest, macos-latest]
40
+ runs-on: ${{ matrix.os }}
41
+ env:
42
+ FC_CACHE_DIR: ${{ github.workspace }}/.fc-cache
43
+ steps:
44
+ - uses: actions/checkout@v4
45
+ - uses: astral-sh/setup-uv@v5
46
+ with:
47
+ python-version: "3.12"
48
+ enable-cache: true
49
+ - name: Cache CMake FetchContent sources
50
+ uses: actions/cache@v4
51
+ with:
52
+ path: ${{ env.FC_CACHE_DIR }}
53
+ key: fc-deps-${{ matrix.os }}-${{ hashFiles('CMakeLists.txt') }}
54
+ restore-keys: |
55
+ fc-deps-${{ matrix.os }}-
56
+ - name: Install dependencies
57
+ # The project itself gets built by the editable install below, so
58
+ # skip building it here to avoid compiling the C++ extension twice.
59
+ run: uv sync --no-install-project
60
+ - name: Build and install extension (editable)
61
+ run: >
62
+ uv pip install -e .
63
+ --config-settings=cmake.define.FETCHCONTENT_BASE_DIR=${{ env.FC_CACHE_DIR }}
64
+ - name: Pytest
65
+ # --no-sync: `uv run` auto-syncs (and would rebuild the project) otherwise.
66
+ run: uv run --no-sync pytest
67
+ - name: CMake configure (debug, ASan+UBSan)
68
+ run: cmake --preset debug -DFETCHCONTENT_BASE_DIR=${{ env.FC_CACHE_DIR }}
69
+ - name: CMake build
70
+ run: cmake --build build/debug
71
+ - name: Ctest
72
+ run: ctest --test-dir build/debug --output-on-failure
@@ -0,0 +1,77 @@
1
+ name: Release
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ release:
6
+ types: [published]
7
+
8
+ jobs:
9
+ validate-tag:
10
+ name: Validate release tag
11
+ if: github.event_name == 'release'
12
+ runs-on: ubuntu-latest
13
+ steps:
14
+ - name: Check tag matches vMAJOR.MINOR.PATCH
15
+ env:
16
+ TAG: ${{ github.event.release.tag_name }}
17
+ run: |
18
+ if ! [[ "$TAG" =~ ^v[0-9]+\.[0-9]+\.[0-9]+(-[0-9A-Za-z.-]+)?(\+[0-9A-Za-z.-]+)?$ ]]; then
19
+ echo "::error::Tag '$TAG' does not match the required vMAJOR.MINOR.PATCH format (optionally with a -prerelease and/or +build suffix)"
20
+ exit 1
21
+ fi
22
+
23
+ build-wheels:
24
+ name: Build wheels on ${{ matrix.os }}
25
+ needs: [validate-tag]
26
+ if: always() && (needs.validate-tag.result == 'success' || needs.validate-tag.result == 'skipped')
27
+ strategy:
28
+ fail-fast: false
29
+ matrix:
30
+ os: [ubuntu-latest, macos-latest]
31
+ runs-on: ${{ matrix.os }}
32
+ steps:
33
+ - uses: actions/checkout@v6
34
+ with:
35
+ persist-credentials: false
36
+ fetch-depth: 0 # setuptools_scm needs full history + tags
37
+ - name: Build wheels
38
+ uses: pypa/cibuildwheel@v4.1.0
39
+ - uses: actions/upload-artifact@v4
40
+ with:
41
+ name: cibw-wheels-${{ matrix.os }}
42
+ path: ./wheelhouse/*.whl
43
+
44
+ build-sdist:
45
+ name: Build source distribution
46
+ needs: [validate-tag]
47
+ if: always() && (needs.validate-tag.result == 'success' || needs.validate-tag.result == 'skipped')
48
+ runs-on: ubuntu-latest
49
+ steps:
50
+ - uses: actions/checkout@v6
51
+ with:
52
+ persist-credentials: false
53
+ fetch-depth: 0 # setuptools_scm needs full history + tags
54
+ - uses: astral-sh/setup-uv@v5
55
+ with:
56
+ python-version: "3.12"
57
+ - run: uv build --sdist
58
+ - uses: actions/upload-artifact@v4
59
+ with:
60
+ name: cibw-sdist
61
+ path: dist/*.tar.gz
62
+
63
+ publish:
64
+ name: Publish to PyPI
65
+ needs: [build-wheels, build-sdist]
66
+ runs-on: ubuntu-latest
67
+ environment: pypi
68
+ permissions:
69
+ id-token: write
70
+ if: github.event_name == 'release' && github.event.action == 'published'
71
+ steps:
72
+ - uses: actions/download-artifact@v5
73
+ with:
74
+ pattern: cibw-*
75
+ path: dist
76
+ merge-multiple: true
77
+ - uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,41 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.egg-info/
6
+ .eggs/
7
+ build/
8
+ dist/
9
+ .venv/
10
+ venv/
11
+
12
+ # uv
13
+ .uv/
14
+
15
+ # Tooling caches
16
+ .ruff_cache/
17
+ .pyright/
18
+ .mypy_cache/
19
+ .pytest_cache/
20
+
21
+ # Databases / local artifacts
22
+ *.duckdb
23
+ *.duckdb.wal
24
+ *.sqlite
25
+ *.sqlite3
26
+ *.db
27
+
28
+ # Per-repo Delfos workspace (graph store, SCIP index, manifest — all regenerated)
29
+ .delfos/
30
+
31
+ # Legacy generated SCIP index location (pre-workspace)
32
+ index.scip
33
+
34
+ # Local env / secrets
35
+ .env
36
+ .env.local
37
+
38
+ # OS / editor
39
+ .DS_Store
40
+ .idea/
41
+ .vscode/
@@ -0,0 +1 @@
1
+ 3.12
delfos-0.1.0/AGENTS.md ADDED
@@ -0,0 +1,57 @@
1
+ # AGENTS.md
2
+
3
+ ## Cursor Cloud specific instructions
4
+
5
+ Delfos is a graph-memory **MCP server** for codebases, with a full CLI. The
6
+ `delfos` package provides the schema (`delfos.schema`), the graph store
7
+ (`delfos.store` — `NativeGraphStore` over the C++ `delfos._delfos` nanobind
8
+ extension; the earlier DuckDB store has been removed), the indexer
9
+ (`delfos.indexer`), the read path (`delfos.reconstruct`), SCIP cross-references
10
+ (`delfos.scip`), the FastMCP server (`delfos.mcp`), and the `delfos` CLI
11
+ (`delfos.cli`). See `ARCHITECTURE.md` for the overview and `docs/decisions.md`
12
+ for the rationale.
13
+
14
+ ### Tooling
15
+ - Project is managed by `uv` (see `README.md`/`CLAUDE.md` for the standard
16
+ commands). The cloud VM installs `uv` to `~/.local/bin`; it is added to PATH via
17
+ `~/.bashrc`, so interactive shells have it. The startup update script runs
18
+ `uv sync`.
19
+ - `uv pip install -e .` builds and installs the `_delfos` C++ extension (build
20
+ isolation pulls scikit-build-core + nanobind).
21
+ - Standard commands (already documented in `CLAUDE.md`): `uv run ruff check .`,
22
+ `uv run ruff format --check .`, `uv run pyright` (strict mode), `uv run pytest`.
23
+
24
+ ### C++ toolchain (for `libdelfos`, see `CMakePresets.json` and `CLAUDE.md`)
25
+ The cloud VM has the toolchain preinstalled: `clang++` 18
26
+ (LLVM; the build standardizes on clang, not GCC), `cmake` 3.28, `ninja`, and the
27
+ clang sanitizer runtimes (ASan/UBSan via `libclang-rt-18-dev`).
28
+ `CMakePresets.json` uses the Ninja generator with `clang++`. Build/test commands
29
+ are in `CLAUDE.md` (`cmake --preset debug`, `cmake --build build/debug`, etc.).
30
+
31
+ Non-obvious gotchas:
32
+ - `clang++` auto-selects the **GCC 14** libstdc++ install (highest version
33
+ present), so `libstdc++-14-dev` must be installed — not just `libstdc++-13-dev`.
34
+ Both are installed here; without the 14 dev package, links fail with
35
+ `cannot find -lstdc++` and `<span>`/`<cstdint>` headers are not found.
36
+ - All external C++ deps (USearch, FlatBuffers, nanobind, Catch2, nanobench) are
37
+ pulled via CMake `FetchContent` at **configure time** — the first configure
38
+ needs network access to GitHub and is slower while it clones/builds them.
39
+
40
+ ### Running it end-to-end
41
+ - Entry points: the `delfos` CLI (`index`, `status`, `doctor`, `search`,
42
+ `reconstruct`, `serve`) and `delfos-mcp` (the MCP read server over stdio).
43
+ Every command anchors on a repo's `.delfos/` workspace (`--repo`, default:
44
+ the current directory), which holds the store snapshot, `index.scip`,
45
+ `manifest.json`, and optional `config.toml`.
46
+ - Configuration is via `DELFOS_*` env variables (precedence and full table in
47
+ `README.md`). Defaults: `nomic-embed-text`, dim 768. Point
48
+ `DELFOS_EMBED_BASE_URL` at any OpenAI-compatible endpoint, or leave it unset
49
+ for OpenAI-hosted (then `DELFOS_EMBED_API_KEY` is required). Queries against
50
+ an already-indexed repo read the model/dim from the manifest and need only
51
+ credentials.
52
+ - For offline/deterministic runs, drive the library directly: supply any object
53
+ satisfying the `Embedder` protocol (`delfos.indexer.embedder.Embedder`) — a
54
+ deterministic hash embedder is enough to index, search, and traverse without
55
+ network access (this is what the tests do, along with a `FakeHopPlanner`).
56
+ The store's `embedding_dim`/`embedding_model` must match the embedder's, or
57
+ writes are rejected.
@@ -0,0 +1,220 @@
1
+ # Architecture
2
+
3
+ This document is the orientation pass for contributors: what Delfos is built
4
+ around, how the pieces fit, and which invariants hold the whole thing together.
5
+ It is deliberately not exhaustive — the *why* behind each decision lives in
6
+ [`docs/decisions.md`](docs/decisions.md), and the module docstrings are the
7
+ source of truth for details. Read this first, then go where your change takes
8
+ you.
9
+
10
+ ## The one idea everything serves
11
+
12
+ Delfos implements the active *reconstruction* model from ["Memory is
13
+ Reconstructed, Not Retrieved: Graph Memory for LLM Agents"](https://arxiv.org/abs/2606.06036)
14
+ (arXiv 2606.06036) over a code repository, and exposes it as an MCP server.
15
+
16
+ The premise: an agent looking for relevant code shouldn't get one shot at a
17
+ similarity search. It should *walk* — start from a semantic entry point, look at
18
+ the neighbors, decide what to keep, descend into the most promising branch,
19
+ backtrack when a path dries up. Memory as an iterative, reasoning-driven
20
+ traversal of a persistent graph, not a top-k lookup.
21
+
22
+ Every layer of this codebase is shaped by that premise. If a change makes the
23
+ walk less legible, less cheap, or less deterministic where it should be
24
+ deterministic, it's probably fighting the architecture.
25
+
26
+ ## The graph: Cue → Tag → Content
27
+
28
+ Three node types, defined as Pydantic models in `delfos/schema/`:
29
+
30
+ - **`CueNode`** — the entry points agents query by: function names, concept
31
+ strings, error messages. These are the *only* nodes that carry embeddings and
32
+ the only ones reachable by vector search.
33
+ - **`TagNode`** — categorical bridges (`MODULE_PATH`, `ARCH_LAYER`,
34
+ `PATTERN_TYPE`, `LANG_CONSTRUCT`, `LANGUAGE`). Filtered by category, never
35
+ embedded. Shared across files.
36
+ - **`ContentNode`** — the payload: functions, classes, modules, commits, tests.
37
+ What actually gets returned to the agent.
38
+
39
+ Edges are directional and typed: `CUE_OF`, `TAGGED_WITH`, `PART_OF_TOPIC`,
40
+ `REDIRECTS_TO`.
41
+
42
+ The separation is the point. Keeping cues (small, embedded, searchable) apart
43
+ from content (rich, returned, never embedded) keeps the vector index tiny and
44
+ the payloads unconstrained, and tags give the walk cheap categorical filtering
45
+ without more embeddings.
46
+
47
+ ## Two layers, one boundary
48
+
49
+ The codebase is Python orchestration over a C++ storage engine, and they meet
50
+ at exactly one seam: the `GraphStore` abstract base class
51
+ (`delfos/store/base.py`).
52
+
53
+ ```
54
+ indexer mcp server reconstruct cli
55
+ \ | | /
56
+ \ | | /
57
+ +---------- GraphStore (ABC) ----------+ ← the only boundary
58
+ |
59
+ NativeGraphStore
60
+ |
61
+ delfos._delfos (nanobind)
62
+ |
63
+ libdelfos (C++)
64
+ ```
65
+
66
+ **No component ever touches the C++ engine directly.** Not the indexer, not the
67
+ MCP tools, not the CLI. Everything codes against `GraphStore`;
68
+ `NativeGraphStore` (`delfos/store/native_store.py`) is the concrete backend.
69
+ This seam is what made it possible to delete the earlier DuckDB backend without
70
+ touching anything above it — keep it that way.
71
+
72
+ ### The engine: `libdelfos/`
73
+
74
+ Header-only C++ under `libdelfos/include/delfos/`:
75
+
76
+ - **`graph.hpp`** — an in-memory CSR directed property graph: O(1) ID lookup,
77
+ cache-friendly adjacency.
78
+ - **`vector_index.hpp`** — HNSW cosine similarity over USearch; <1ms for
79
+ 50K × dim=1536 k=5 queries.
80
+ - **`snapshot.hpp`** — persistence via FlatBuffers + USearch's native format,
81
+ with atomic rename for crash safety.
82
+
83
+ `libdelfos/bindings/py_delfos.cpp` exposes `Store` and `NodeData` to Python as
84
+ the `delfos._delfos` extension. It has its own tests (`libdelfos/tests/`,
85
+ Catch2, built with ASan+UBSan in the debug preset) and a nanobench benchmark
86
+ (`libdelfos/bench/`).
87
+
88
+ ## The write path: indexing
89
+
90
+ `delfos/indexer/` turns a repository into the graph. Four modules, in pipeline
91
+ order:
92
+
93
+ 1. **`parser.py`** — tree-sitter parse of Python source into a `ParsedModule`
94
+ IR. Error-tolerant: files with syntax errors are parsed partially instead of
95
+ raising.
96
+ 2. **`extractor.py`** — the pure, side-effect-free heart: `ParsedModule` in,
97
+ nodes and edges out. No I/O, no embedding, no persistence. Per file it emits
98
+ one module `ContentNode`, one `ContentNode` per definition, `CueNode`s for
99
+ symbols and raised error messages, and shared `TagNode`s.
100
+ 3. **`embedder.py`** — attaches vectors to cue nodes via an OpenAI-compatible
101
+ endpoint.
102
+ 4. **`pipeline.py`** — the `Indexer` that ties it together and owns all the
103
+ transactional discipline (below).
104
+
105
+ The crash-recovery model is simple and strict: **one file per transaction**.
106
+ For each file the pipeline computes the git blob SHA, skips it if the checkpoint
107
+ manifest already has that SHA, and otherwise deletes the file's prior
108
+ nodes/edges and writes the new ones — embeddings included — inside a single
109
+ transaction, together with the manifest entry. A crash mid-file leaves the store
110
+ untouched and the file is retried on the next run. Staleness handling is
111
+ **delete-and-reindex**: no symbol-level diffing, no tombstones, no rename
112
+ detection in v1.
113
+
114
+ Everything a run produces lands in one self-describing directory per repo, the
115
+ `.delfos/` workspace (`delfos/workspace.py`):
116
+
117
+ ```
118
+ <repo>/.delfos/
119
+ ├── store/ # NativeGraphStore snapshot (graph + vectors)
120
+ ├── index.scip # SCIP cross-reference index
121
+ ├── manifest.json # provenance + consistency metadata
122
+ └── config.toml # optional non-secret config
123
+ ```
124
+
125
+ The manifest records which run produced the graph and the SCIP index (so
126
+ inconsistency is detectable) and the embedding model/dimension the index was
127
+ built with (so queries don't need to re-specify them).
128
+
129
+ ## The read path: reconstruction
130
+
131
+ `delfos/reconstruct/` is the read-path service, sitting entirely on
132
+ `GraphStore`. The split that matters:
133
+
134
+ - **Three pure primitives** — `search` (vector search over cues),
135
+ `traverse_forward` (cues → content, with tag filters), `traverse_reverse`
136
+ (content → cues). Deterministic graph operations, independently testable,
137
+ no LLM anywhere near them.
138
+ - **One LLM-in-the-loop operation** — `reconstruct`, the depth-first walk. At
139
+ each hop a `HopPlanner` (`planner.py`, a Protocol) sees compact
140
+ `CandidateSummary`s of the neighbors and decides what to collect and which
141
+ *single* neighbor to descend into. `budget` is a hard ceiling on planner
142
+ calls (backtracking spends it too). `summaries.py` is the single place that
143
+ decides how much of a node the LLM sees.
144
+
145
+ All non-determinism and all token cost are confined to `reconstruct`. Tests
146
+ inject `FakeHopPlanner` and a fake embedder, so the whole layer runs offline.
147
+
148
+ ### The MCP server: the calling agent *is* the planner
149
+
150
+ Here's the twist worth understanding before touching `delfos/mcp/`: the MCP
151
+ server does **not** run a server-side planner LLM. The agent issuing the tool
152
+ calls (Claude Code, Cursor, …) is already an LLM capable of per-hop reasoning —
153
+ so the server exposes the *primitives* (`search`, `traverse_forward`,
154
+ `traverse_reverse`, `fetch`) plus a *prompt* that teaches the walk discipline
155
+ (seed → expand → descend one → respect budget → stop). The in-tree
156
+ `reconstruct` engine and its planners remain as an internal engine used by the
157
+ CLI, tests, and the smoke harness; they are not MCP tools.
158
+
159
+ Returns are tiered, following the MCP search+fetch convention: walk tools
160
+ return compact `NodeSummary`s, and a separate `fetch` returns full
161
+ `ContentDetail` bodies for the agent's final picks. Embeddings are never
162
+ serialized back to the agent. The MCP view models live in `mcp/views.py`,
163
+ deliberately separate from the planner's `CandidateSummary` so the tool surface
164
+ and the planner contract can evolve independently.
165
+
166
+ ### SCIP cross-references
167
+
168
+ `delfos/scip/` adds precise code navigation next to the semantic graph: at
169
+ index time, `scip-python` generates `index.scip`; at read time, `ScipService`
170
+ resolves a `ContentNode` to its references, implementations, and type
171
+ definitions. The trick that keeps this cheap: content node IDs *are* SCIP
172
+ symbol strings when SCIP coverage exists, so lookup is a direct key access with
173
+ no foreign-key indirection. The MCP server exposes these as three more tools
174
+ (`references`, `implementations`, `type_definition`).
175
+
176
+ ## Entry points and wiring
177
+
178
+ - **`delfos/cli/`** — the `delfos` command: `index`, `status`, `doctor`,
179
+ `search`, `reconstruct`, `serve`. Every command anchors on a repo's
180
+ `.delfos/` workspace.
181
+ - **`delfos/mcp/`** — the FastMCP read server (`delfos-mcp` / `delfos serve`).
182
+ Tool logic lives in plain `_`-prefixed functions so it's unit-testable
183
+ without an MCP transport; `build_server` registers thin wrappers.
184
+ - **`delfos/config.py`** — env-driven startup configuration (`DELFOS_*`
185
+ variables; precedence documented in the README). This is also where the
186
+ embedding-model startup check lives.
187
+
188
+ ## Invariants — the short list
189
+
190
+ Break one of these and things fail in ways that are hard to see:
191
+
192
+ 1. **Everything goes through `GraphStore`.** No reaching past the seam into
193
+ `delfos._delfos`.
194
+ 2. **Cue and content nodes always carry `source_file` + `git_sha`.**
195
+ Provenance is what makes delete-and-reindex possible. Tags are the
196
+ deliberate exception — they are shared across files and carry no
197
+ provenance; a re-index drops the file-scoped edges, not the tags.
198
+ 3. **One file per transaction**, manifest entry included. This is the entire
199
+ crash-recovery story.
200
+ 4. **One embedding model per store.** `NativeGraphStore` is constructed with
201
+ an `embedding_model` and rejects nodes whose model differs; the server
202
+ fails fast at startup if the query-time embedder doesn't match the index.
203
+ A mismatch here doesn't error at query time — it silently returns garbage,
204
+ which is why it's caught at the door.
205
+ 5. **Primitives stay pure.** LLM calls belong only in `reconstruct` (or in the
206
+ calling agent, in the MCP case).
207
+ 6. **Pyright strict mode and `extra="forbid"` on every Pydantic model** are
208
+ non-negotiable. Schema drift and untyped code surface immediately, by
209
+ design.
210
+
211
+ ## Where to go next
212
+
213
+ - [`docs/decisions.md`](docs/decisions.md) — the decision log: what was chosen,
214
+ why, and what was explicitly ruled out.
215
+ - [`README.md`](README.md) — setup, configuration, and build commands.
216
+ - The module docstrings — each pipeline stage and service opens with a
217
+ docstring stating its contract; they are kept accurate and are the fastest
218
+ way to understand a module before editing it.
219
+ - `tests/` mirrors the package layout; the reconstruct tests
220
+ (`tests/reconstruct/`) are the best executable walkthrough of the read path.