graphifyy 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. graphifyy-0.1.1/PKG-INFO +271 -0
  2. graphifyy-0.1.1/README.md +231 -0
  3. graphifyy-0.1.1/graphify/__init__.py +27 -0
  4. graphifyy-0.1.1/graphify/__main__.py +89 -0
  5. graphifyy-0.1.1/graphify/analyze.py +429 -0
  6. graphifyy-0.1.1/graphify/benchmark.py +126 -0
  7. graphifyy-0.1.1/graphify/build.py +31 -0
  8. graphifyy-0.1.1/graphify/cache.py +118 -0
  9. graphifyy-0.1.1/graphify/cluster.py +104 -0
  10. graphifyy-0.1.1/graphify/detect.py +274 -0
  11. graphifyy-0.1.1/graphify/export.py +656 -0
  12. graphifyy-0.1.1/graphify/extract.py +2440 -0
  13. graphifyy-0.1.1/graphify/ingest.py +289 -0
  14. graphifyy-0.1.1/graphify/manifest.py +4 -0
  15. graphifyy-0.1.1/graphify/report.py +133 -0
  16. graphifyy-0.1.1/graphify/security.py +166 -0
  17. graphifyy-0.1.1/graphify/serve.py +328 -0
  18. graphifyy-0.1.1/graphify/skill.md +1036 -0
  19. graphifyy-0.1.1/graphify/validate.py +71 -0
  20. graphifyy-0.1.1/graphify/watch.py +82 -0
  21. graphifyy-0.1.1/graphifyy.egg-info/PKG-INFO +271 -0
  22. graphifyy-0.1.1/graphifyy.egg-info/SOURCES.txt +42 -0
  23. graphifyy-0.1.1/graphifyy.egg-info/dependency_links.txt +1 -0
  24. graphifyy-0.1.1/graphifyy.egg-info/entry_points.txt +2 -0
  25. graphifyy-0.1.1/graphifyy.egg-info/requires.txt +37 -0
  26. graphifyy-0.1.1/graphifyy.egg-info/top_level.txt +1 -0
  27. graphifyy-0.1.1/pyproject.toml +47 -0
  28. graphifyy-0.1.1/setup.cfg +4 -0
  29. graphifyy-0.1.1/tests/test_analyze.py +179 -0
  30. graphifyy-0.1.1/tests/test_benchmark.py +119 -0
  31. graphifyy-0.1.1/tests/test_build.py +41 -0
  32. graphifyy-0.1.1/tests/test_cache.py +74 -0
  33. graphifyy-0.1.1/tests/test_cluster.py +52 -0
  34. graphifyy-0.1.1/tests/test_detect.py +71 -0
  35. graphifyy-0.1.1/tests/test_export.py +54 -0
  36. graphifyy-0.1.1/tests/test_extract.py +144 -0
  37. graphifyy-0.1.1/tests/test_ingest.py +68 -0
  38. graphifyy-0.1.1/tests/test_languages.py +219 -0
  39. graphifyy-0.1.1/tests/test_multilang.py +173 -0
  40. graphifyy-0.1.1/tests/test_report.py +63 -0
  41. graphifyy-0.1.1/tests/test_security.py +187 -0
  42. graphifyy-0.1.1/tests/test_serve.py +156 -0
  43. graphifyy-0.1.1/tests/test_validate.py +87 -0
  44. graphifyy-0.1.1/tests/test_watch.py +68 -0
@@ -0,0 +1,271 @@
1
+ Metadata-Version: 2.4
2
+ Name: graphifyy
3
+ Version: 0.1.1
4
+ Summary: Turn any codebase, docs, or images into a queryable knowledge graph
5
+ License: MIT
6
+ Requires-Python: >=3.10
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: networkx
9
+ Requires-Dist: graspologic
10
+ Requires-Dist: pyvis
11
+ Requires-Dist: tree-sitter
12
+ Requires-Dist: tree-sitter-python
13
+ Requires-Dist: tree-sitter-javascript
14
+ Requires-Dist: tree-sitter-typescript
15
+ Requires-Dist: tree-sitter-go
16
+ Requires-Dist: tree-sitter-rust
17
+ Requires-Dist: tree-sitter-java
18
+ Requires-Dist: tree-sitter-c
19
+ Requires-Dist: tree-sitter-cpp
20
+ Requires-Dist: tree-sitter-ruby
21
+ Requires-Dist: tree-sitter-c-sharp
22
+ Requires-Dist: tree-sitter-kotlin
23
+ Requires-Dist: tree-sitter-scala
24
+ Requires-Dist: tree-sitter-php
25
+ Provides-Extra: mcp
26
+ Requires-Dist: mcp; extra == "mcp"
27
+ Provides-Extra: neo4j
28
+ Requires-Dist: neo4j; extra == "neo4j"
29
+ Provides-Extra: pdf
30
+ Requires-Dist: pypdf; extra == "pdf"
31
+ Requires-Dist: html2text; extra == "pdf"
32
+ Provides-Extra: watch
33
+ Requires-Dist: watchdog; extra == "watch"
34
+ Provides-Extra: all
35
+ Requires-Dist: mcp; extra == "all"
36
+ Requires-Dist: neo4j; extra == "all"
37
+ Requires-Dist: pypdf; extra == "all"
38
+ Requires-Dist: html2text; extra == "all"
39
+ Requires-Dist: watchdog; extra == "all"
40
+
41
+ # graphify
42
+
43
+ [![CI](https://github.com/safishamsi/graphify/actions/workflows/ci.yml/badge.svg?branch=v1)](https://github.com/safishamsi/graphify/actions/workflows/ci.yml)
44
+
45
+ **A Claude Code skill.** Type `/graphify` in Claude Code — it reads your files, builds a knowledge graph, and gives you back structure you didn't know was there.
46
+
47
+ > Andrej Karpathy keeps a `/raw` folder where he drops papers, tweets, screenshots, and notes. The problem: that folder becomes opaque. You forget what's in it. You can't see what connects. graphify is the answer to that problem.
48
+
49
+ ```
50
+ /graphify ./raw
51
+ ```
52
+
53
+ ```
54
+ .graphify/
55
+ ├── obsidian/ open as Obsidian vault — visual graph, wikilinks, filter by community
56
+ ├── GRAPH_REPORT.md what the graph found: god nodes, surprising connections, suggested questions
57
+ ├── graph.json persistent graph — query it weeks later without re-reading anything
58
+ ├── cache/ per-file SHA256 cache — re-runs only process changed files
59
+ └── memory/ Q&A results filed back in — what you ask grows the graph on next --update
60
+ ```
61
+
62
+ ## Why this exists
63
+
64
+ graphify takes that observation and builds the missing infrastructure:
65
+
66
+ | His problem | What graphify adds |
67
+ |---|---|
68
+ | Folder becomes opaque | Community detection surfaces structure automatically |
69
+ | Forget what's in it | Persistent `graph.json` — query weeks later without re-reading |
70
+ | Can't see connections | Cross-community surprising connections as a first-class output |
71
+ | Claude hallucinates missing links | `EXTRACTED` / `INFERRED` / `AMBIGUOUS` — honest about what was found vs guessed |
72
+ | Context resets every session | Memory feedback loop — what you ask grows the graph on `--update` |
73
+ | Only works on text | PDFs, images, screenshots, tweets, any language via vision |
74
+
75
+ **What LLMs get wrong without it:** Naive summarization fills every gap confidently. You get output that sounds complete but you can't tell what was actually in the files vs invented. And next session, it's all gone.
76
+
77
+ **What graphify does differently:**
78
+
79
+ - **Persistent graph** — relationships stored in `.graphify/graph.json`, survive across sessions. Query weeks later without re-reading anything.
80
+ - **Honest audit trail** — every edge tagged `EXTRACTED` (explicitly stated), `INFERRED` (call-graph or reasonable deduction), or `AMBIGUOUS` (flagged for review). You always know what was found vs invented.
81
+ - **Cross-document surprise** — Leiden community detection finds clusters, then surfaces cross-community connections: the things you would never think to ask about directly.
82
+ - **Feedback loop** — every query answer saved to `.graphify/memory/`. On next `--update`, that Q&A becomes a node. The graph grows from what you ask, not just what you add.
83
+
84
+ The result: a navigable map of your corpus that is honest about what it knows and what it guessed.
85
+
86
+ ## Install
87
+
88
+ ```bash
89
+ pip install graphifyy && graphify install
90
+ ```
91
+
92
+ This copies the skill file into `~/.claude/skills/graphify/` and registers it in `~/.claude/CLAUDE.md`. The Python package and all dependencies install automatically on first `/graphify` run — you never touch pip again.
93
+
94
+ Then open Claude Code in any directory and type:
95
+
96
+ ```
97
+ /graphify .
98
+ ```
99
+
100
+ <details>
101
+ <summary>Manual install (curl)</summary>
102
+
103
+ **Step 1 — copy the skill file**
104
+
105
+ ```bash
106
+ mkdir -p ~/.claude/skills/graphify
107
+ curl -fsSL https://raw.githubusercontent.com/safishamsi/graphify/v1/skills/graphify/skill.md \
108
+ > ~/.claude/skills/graphify/SKILL.md
109
+ ```
110
+
111
+ **Step 2 — register it in Claude Code**
112
+
113
+ Add this to `~/.claude/CLAUDE.md` (create the file if it doesn't exist):
114
+
115
+ ```
116
+ - **graphify** (`~/.claude/skills/graphify/SKILL.md`) — any input to knowledge graph. Trigger: `/graphify`
117
+ When the user types `/graphify`, invoke the Skill tool with `skill: "graphify"` before doing anything else.
118
+ ```
119
+
120
+ </details>
121
+
122
+ ## Usage
123
+
124
+ All commands are typed inside Claude Code:
125
+
126
+ ```
127
+ /graphify # run on current directory
128
+ /graphify ./raw # run on a specific folder
129
+ /graphify ./raw --mode deep # more aggressive INFERRED edge extraction
130
+ /graphify ./raw --update # re-extract only changed files, merge into existing graph
131
+ /graphify ./raw --watch # notify when new files appear
132
+
133
+ /graphify add https://arxiv.org/abs/1706.03762 # fetch a paper, save, update graph
134
+ /graphify add https://x.com/karpathy/status/... # fetch a tweet
135
+ /graphify add <url> --author "Karpathy" --contributor "safi"
136
+
137
+ /graphify query "what connects attention to the optimizer?" # BFS — broad context
138
+ /graphify query "how does the encoder reach the loss?" --dfs # DFS — trace a path
139
+ /graphify query "..." --budget 1500 # cap at N tokens
140
+
141
+ /graphify path "DigestAuth" "Response" # shortest path between two concepts
142
+ /graphify explain "SwinTransformer" # plain-language node explanation
143
+
144
+ /graphify ./raw --html # also export graph.html (browser, no Obsidian needed)
145
+ /graphify ./raw --svg # also export graph.svg (embeds in Notion, GitHub)
146
+ /graphify ./raw --neo4j # generate cypher.txt for Neo4j import
147
+ /graphify ./raw --mcp # start MCP stdio server for agent access
148
+ ```
149
+
150
+ Works with any mix of file types in the same folder:
151
+
152
+ | Type | Extensions | How it's extracted |
153
+ |------|-----------|-------------------|
154
+ | Code | `.py .ts .tsx .js .go .rs .java .c .cpp .rb .cs .kt .scala .php` | AST via tree-sitter (deterministic) + call-graph pass (INFERRED) |
155
+ | Documents | `.md .txt .rst` | Concepts + relationships via Claude |
156
+ | Papers | `.pdf` | Citation mining + concept extraction |
157
+ | Images | `.png .jpg .webp .gif .svg` | Claude vision — screenshots, charts, whiteboards, any language |
158
+
159
+ ## What you get
160
+
161
+ After running, Claude outputs three things directly in chat:
162
+
163
+ **God nodes** — highest-degree concepts (what everything connects through)
164
+
165
+ **Surprising connections** — cross-community edges; relationships between concepts in different clusters that you didn't know to look for
166
+
167
+ **Suggested questions** — 4-5 questions the graph is uniquely positioned to answer, with the reason why (which bridge node makes it interesting, which community boundary it crosses)
168
+
169
+ The full GRAPH_REPORT.md adds community summaries with cohesion scores and a list of ambiguous edges for review.
170
+
171
+ ## Key files explained
172
+
173
+ | File | Purpose |
174
+ |------|---------|
175
+ | `GRAPH_REPORT.md` | The audit report. God nodes, surprising connections, community cohesion scores, ambiguous edge list, suggested questions. |
176
+ | `graph.json` | Persistent graph in node-link format. Load it with NetworkX or push to Neo4j. Survives sessions. |
177
+ | `obsidian/` | Wikilink vault. Open in Obsidian → enable graph view → see communities as clusters. Filter by tag, search across everything. |
178
+ | `.graphify/cache/` | SHA256-based per-file cache. A re-run on an unchanged corpus takes seconds. |
179
+ | `.graphify/memory/` | Q&A feedback loop. Every `/graphify query` answer is saved here. Next `--update` extracts it into the graph. |
180
+
181
+ ## What this skill will NOT do
182
+
183
+ - **Won't invent edges** — `AMBIGUOUS` exists so uncertain relationships are flagged, not hidden. If the connection isn't clear, it's tagged, not fabricated.
184
+ - **Won't claim the graph is useful when it isn't** — a corpus over 2M words or 200 files gets a cost warning before proceeding.
185
+ - **Won't re-extract unchanged files** — SHA256 cache ensures warm re-runs skip everything that hasn't changed.
186
+ - **Won't visualize graphs over 5,000 nodes** — use `--no-viz` or query instead.
187
+ - **Won't download datasets or set up infrastructure** — graphify reads your files. What you put in the folder is what it works with.
188
+ - **Won't implement baselines or run experiments** — it reads and maps. Analysis is yours.
189
+
190
+ ## Design principles
191
+
192
+ 1. **Extraction quality is everything** — clustering is downstream of it. A bad graph clusters into bad communities. The AST + call-graph pass exists because deterministic beats probabilistic for code.
193
+ 2. **Show the numbers** — cohesion is `0.91`, not "good". Token cost is always printed. You know what you spent.
194
+ 3. **The best output is what you didn't know** — Surprising Connections is not optional. God nodes you probably already suspected. Cross-community edges are what you came for.
195
+ 4. **The graph earns its complexity** — below a certain density, just use Claude directly. The graph adds value when you have more than you can hold in context across sessions.
196
+ 5. **What you ask grows the graph** — query results are filed back in automatically. The corpus is not static.
197
+ 6. **Honest uncertainty** — `EXTRACTED`, `INFERRED`, `AMBIGUOUS` are not cosmetic labels. They are the difference between trusting the graph and being misled by it.
198
+
199
+ ## Contributing
200
+
201
+ **Adding worked examples**
202
+
203
+ Worked examples are the most trust-building part of this project. To add one:
204
+
205
+ 1. Pick a real corpus (people should be able to verify the output)
206
+ 2. Run the skill: `/graphify <path>`
207
+ 3. Save the full output to `worked/{corpus_slug}/`
208
+ 4. Write a `review.md` that honestly evaluates:
209
+ - What the graph got right
210
+ - What edges it correctly flagged AMBIGUOUS
211
+ - Any mistakes or missed connections
212
+ - Any surprising connections that were genuinely surprising
213
+ 5. Submit a PR with all of the above
214
+
215
+ **Improving extraction**
216
+
217
+ If you find a file type or language where extraction is poor, open an issue with a minimal reproduction case. The best bug reports include: the input file, the extraction output (`.graphify/cache/` entry), and what was missed or invented.
218
+
219
+ **Adding domain knowledge**
220
+
221
+ If corpora in your domain consistently contain structures graphify doesn't extract well (e.g., legal documents, lab notebooks, musical scores), open a discussion with examples.
222
+
223
+ ## Worked examples
224
+
225
+ | Corpus | Type | Reduction | Eval report |
226
+ |--------|------|-----------|-------------|
227
+ | Karpathy repos + 5 research papers + 4 images | Mixed (code + papers + images) | **71.5x** | [`worked/karpathy-repos/review.md`](worked/karpathy-repos/review.md) |
228
+ | httpx (Python HTTP client) | Codebase | — | [`worked/httpx/review.md`](worked/httpx/review.md) + [`GRAPH_REPORT.md`](worked/httpx/GRAPH_REPORT.md) |
229
+ | Mixed corpus (code + paper + Arabic image) | Multi-type | — | [`worked/mixed-corpus/review.md`](worked/mixed-corpus/review.md) |
230
+
231
+ Each includes the full graph output and an honest evaluation of what the skill got right and wrong.
232
+
233
+ ## Tech stack
234
+
235
+ | Layer | Library | Why |
236
+ |-------|---------|-----|
237
+ | Graph | NetworkX | Pure Python, same internals as MS GraphRAG |
238
+ | Community detection | Leiden via graspologic | Better than K-means for sparse graphs |
239
+ | Code parsing | tree-sitter | Multi-language AST, deterministic, zero hallucination |
240
+ | Extraction | Claude (parallel subagents) | Reads anything, outputs structured graph data |
241
+ | Visualization | Obsidian vault | Native graph view, wikilinks, no server needed |
242
+
243
+ No Neo4j required. No dashboards. No server. Runs entirely locally.
244
+
245
+ ## Files
246
+
247
+ ```
248
+ graphify/
249
+ ├── detect.py detect file types, auto-exclude venvs/caches/node_modules; scan .graphify/memory/
250
+ ├── extract.py AST extraction (13 languages via tree-sitter) + call-graph pass (INFERRED edges)
251
+ ├── build.py assemble NetworkX graph from extraction JSON; schema-validates before assembly
252
+ ├── cluster.py Leiden community detection, cohesion scoring
253
+ ├── analyze.py god nodes, bridge nodes, surprising connections, suggested questions, graph diff
254
+ ├── report.py render GRAPH_REPORT.md
255
+ ├── export.py Obsidian vault, graph.json, graph.html, graph.svg, Neo4j Cypher, Canvas
256
+ ├── ingest.py fetch URLs (arXiv, Twitter/X, PDF, any webpage); save Q&A to .graphify/memory/
257
+ ├── cache.py SHA256-based per-file extraction cache; check_semantic_cache / save_semantic_cache
258
+ ├── security.py URL validation (http/https only), safe fetch with size cap, path guards, label sanitisation
259
+ ├── validate.py JSON schema checks on extraction output
260
+ ├── serve.py MCP stdio server — query_graph, get_node, get_neighbors, shortest_path, god_nodes
261
+ └── watch.py fs watcher, writes flag file when new files appear
262
+
263
+ skills/graphify/
264
+ └── skill.md the Claude Code skill — the full pipeline the agent runs step by step
265
+
266
+ ARCHITECTURE.md module responsibilities, extraction schema, how to add a language
267
+ SECURITY.md threat model, mitigations, vulnerability reporting
268
+ worked/ eval reports from real corpora (karpathy-repos, httpx, mixed-corpus)
269
+ tests/ 212 tests, one file per module
270
+ pyproject.toml pip install graphifyy | pip install graphifyy[mcp,neo4j,pdf,watch]
271
+ ```
@@ -0,0 +1,231 @@
1
+ # graphify
2
+
3
+ [![CI](https://github.com/safishamsi/graphify/actions/workflows/ci.yml/badge.svg?branch=v1)](https://github.com/safishamsi/graphify/actions/workflows/ci.yml)
4
+
5
+ **A Claude Code skill.** Type `/graphify` in Claude Code — it reads your files, builds a knowledge graph, and gives you back structure you didn't know was there.
6
+
7
+ > Andrej Karpathy keeps a `/raw` folder where he drops papers, tweets, screenshots, and notes. The problem: that folder becomes opaque. You forget what's in it. You can't see what connects. graphify is the answer to that problem.
8
+
9
+ ```
10
+ /graphify ./raw
11
+ ```
12
+
13
+ ```
14
+ .graphify/
15
+ ├── obsidian/ open as Obsidian vault — visual graph, wikilinks, filter by community
16
+ ├── GRAPH_REPORT.md what the graph found: god nodes, surprising connections, suggested questions
17
+ ├── graph.json persistent graph — query it weeks later without re-reading anything
18
+ ├── cache/ per-file SHA256 cache — re-runs only process changed files
19
+ └── memory/ Q&A results filed back in — what you ask grows the graph on next --update
20
+ ```
21
+
22
+ ## Why this exists
23
+
24
+ graphify takes that observation and builds the missing infrastructure:
25
+
26
+ | His problem | What graphify adds |
27
+ |---|---|
28
+ | Folder becomes opaque | Community detection surfaces structure automatically |
29
+ | Forget what's in it | Persistent `graph.json` — query weeks later without re-reading |
30
+ | Can't see connections | Cross-community surprising connections as a first-class output |
31
+ | Claude hallucinates missing links | `EXTRACTED` / `INFERRED` / `AMBIGUOUS` — honest about what was found vs guessed |
32
+ | Context resets every session | Memory feedback loop — what you ask grows the graph on `--update` |
33
+ | Only works on text | PDFs, images, screenshots, tweets, any language via vision |
34
+
35
+ **What LLMs get wrong without it:** Naive summarization fills every gap confidently. You get output that sounds complete but you can't tell what was actually in the files vs invented. And next session, it's all gone.
36
+
37
+ **What graphify does differently:**
38
+
39
+ - **Persistent graph** — relationships stored in `.graphify/graph.json`, survive across sessions. Query weeks later without re-reading anything.
40
+ - **Honest audit trail** — every edge tagged `EXTRACTED` (explicitly stated), `INFERRED` (call-graph or reasonable deduction), or `AMBIGUOUS` (flagged for review). You always know what was found vs invented.
41
+ - **Cross-document surprise** — Leiden community detection finds clusters, then surfaces cross-community connections: the things you would never think to ask about directly.
42
+ - **Feedback loop** — every query answer saved to `.graphify/memory/`. On next `--update`, that Q&A becomes a node. The graph grows from what you ask, not just what you add.
43
+
44
+ The result: a navigable map of your corpus that is honest about what it knows and what it guessed.
45
+
46
+ ## Install
47
+
48
+ ```bash
49
+ pip install graphifyy && graphify install
50
+ ```
51
+
52
+ This copies the skill file into `~/.claude/skills/graphify/` and registers it in `~/.claude/CLAUDE.md`. The Python package and all dependencies install automatically on first `/graphify` run — you never touch pip again.
53
+
54
+ Then open Claude Code in any directory and type:
55
+
56
+ ```
57
+ /graphify .
58
+ ```
59
+
60
+ <details>
61
+ <summary>Manual install (curl)</summary>
62
+
63
+ **Step 1 — copy the skill file**
64
+
65
+ ```bash
66
+ mkdir -p ~/.claude/skills/graphify
67
+ curl -fsSL https://raw.githubusercontent.com/safishamsi/graphify/v1/skills/graphify/skill.md \
68
+ > ~/.claude/skills/graphify/SKILL.md
69
+ ```
70
+
71
+ **Step 2 — register it in Claude Code**
72
+
73
+ Add this to `~/.claude/CLAUDE.md` (create the file if it doesn't exist):
74
+
75
+ ```
76
+ - **graphify** (`~/.claude/skills/graphify/SKILL.md`) — any input to knowledge graph. Trigger: `/graphify`
77
+ When the user types `/graphify`, invoke the Skill tool with `skill: "graphify"` before doing anything else.
78
+ ```
79
+
80
+ </details>
81
+
82
+ ## Usage
83
+
84
+ All commands are typed inside Claude Code:
85
+
86
+ ```
87
+ /graphify # run on current directory
88
+ /graphify ./raw # run on a specific folder
89
+ /graphify ./raw --mode deep # more aggressive INFERRED edge extraction
90
+ /graphify ./raw --update # re-extract only changed files, merge into existing graph
91
+ /graphify ./raw --watch # notify when new files appear
92
+
93
+ /graphify add https://arxiv.org/abs/1706.03762 # fetch a paper, save, update graph
94
+ /graphify add https://x.com/karpathy/status/... # fetch a tweet
95
+ /graphify add <url> --author "Karpathy" --contributor "safi"
96
+
97
+ /graphify query "what connects attention to the optimizer?" # BFS — broad context
98
+ /graphify query "how does the encoder reach the loss?" --dfs # DFS — trace a path
99
+ /graphify query "..." --budget 1500 # cap at N tokens
100
+
101
+ /graphify path "DigestAuth" "Response" # shortest path between two concepts
102
+ /graphify explain "SwinTransformer" # plain-language node explanation
103
+
104
+ /graphify ./raw --html # also export graph.html (browser, no Obsidian needed)
105
+ /graphify ./raw --svg # also export graph.svg (embeds in Notion, GitHub)
106
+ /graphify ./raw --neo4j # generate cypher.txt for Neo4j import
107
+ /graphify ./raw --mcp # start MCP stdio server for agent access
108
+ ```
109
+
110
+ Works with any mix of file types in the same folder:
111
+
112
+ | Type | Extensions | How it's extracted |
113
+ |------|-----------|-------------------|
114
+ | Code | `.py .ts .tsx .js .go .rs .java .c .cpp .rb .cs .kt .scala .php` | AST via tree-sitter (deterministic) + call-graph pass (INFERRED) |
115
+ | Documents | `.md .txt .rst` | Concepts + relationships via Claude |
116
+ | Papers | `.pdf` | Citation mining + concept extraction |
117
+ | Images | `.png .jpg .webp .gif .svg` | Claude vision — screenshots, charts, whiteboards, any language |
118
+
119
+ ## What you get
120
+
121
+ After running, Claude outputs three things directly in chat:
122
+
123
+ **God nodes** — highest-degree concepts (what everything connects through)
124
+
125
+ **Surprising connections** — cross-community edges; relationships between concepts in different clusters that you didn't know to look for
126
+
127
+ **Suggested questions** — 4-5 questions the graph is uniquely positioned to answer, with the reason why (which bridge node makes it interesting, which community boundary it crosses)
128
+
129
+ The full GRAPH_REPORT.md adds community summaries with cohesion scores and a list of ambiguous edges for review.
130
+
131
+ ## Key files explained
132
+
133
+ | File | Purpose |
134
+ |------|---------|
135
+ | `GRAPH_REPORT.md` | The audit report. God nodes, surprising connections, community cohesion scores, ambiguous edge list, suggested questions. |
136
+ | `graph.json` | Persistent graph in node-link format. Load it with NetworkX or push to Neo4j. Survives sessions. |
137
+ | `obsidian/` | Wikilink vault. Open in Obsidian → enable graph view → see communities as clusters. Filter by tag, search across everything. |
138
+ | `.graphify/cache/` | SHA256-based per-file cache. A re-run on an unchanged corpus takes seconds. |
139
+ | `.graphify/memory/` | Q&A feedback loop. Every `/graphify query` answer is saved here. Next `--update` extracts it into the graph. |
140
+
141
+ ## What this skill will NOT do
142
+
143
+ - **Won't invent edges** — `AMBIGUOUS` exists so uncertain relationships are flagged, not hidden. If the connection isn't clear, it's tagged, not fabricated.
144
+ - **Won't claim the graph is useful when it isn't** — a corpus over 2M words or 200 files gets a cost warning before proceeding.
145
+ - **Won't re-extract unchanged files** — SHA256 cache ensures warm re-runs skip everything that hasn't changed.
146
+ - **Won't visualize graphs over 5,000 nodes** — use `--no-viz` or query instead.
147
+ - **Won't download datasets or set up infrastructure** — graphify reads your files. What you put in the folder is what it works with.
148
+ - **Won't implement baselines or run experiments** — it reads and maps. Analysis is yours.
149
+
150
+ ## Design principles
151
+
152
+ 1. **Extraction quality is everything** — clustering is downstream of it. A bad graph clusters into bad communities. The AST + call-graph pass exists because deterministic beats probabilistic for code.
153
+ 2. **Show the numbers** — cohesion is `0.91`, not "good". Token cost is always printed. You know what you spent.
154
+ 3. **The best output is what you didn't know** — Surprising Connections is not optional. God nodes you probably already suspected. Cross-community edges are what you came for.
155
+ 4. **The graph earns its complexity** — below a certain density, just use Claude directly. The graph adds value when you have more than you can hold in context across sessions.
156
+ 5. **What you ask grows the graph** — query results are filed back in automatically. The corpus is not static.
157
+ 6. **Honest uncertainty** — `EXTRACTED`, `INFERRED`, `AMBIGUOUS` are not cosmetic labels. They are the difference between trusting the graph and being misled by it.
158
+
159
+ ## Contributing
160
+
161
+ **Adding worked examples**
162
+
163
+ Worked examples are the most trust-building part of this project. To add one:
164
+
165
+ 1. Pick a real corpus (people should be able to verify the output)
166
+ 2. Run the skill: `/graphify <path>`
167
+ 3. Save the full output to `worked/{corpus_slug}/`
168
+ 4. Write a `review.md` that honestly evaluates:
169
+ - What the graph got right
170
+ - What edges it correctly flagged AMBIGUOUS
171
+ - Any mistakes or missed connections
172
+ - Any surprising connections that were genuinely surprising
173
+ 5. Submit a PR with all of the above
174
+
175
+ **Improving extraction**
176
+
177
+ If you find a file type or language where extraction is poor, open an issue with a minimal reproduction case. The best bug reports include: the input file, the extraction output (`.graphify/cache/` entry), and what was missed or invented.
178
+
179
+ **Adding domain knowledge**
180
+
181
+ If corpora in your domain consistently contain structures graphify doesn't extract well (e.g., legal documents, lab notebooks, musical scores), open a discussion with examples.
182
+
183
+ ## Worked examples
184
+
185
+ | Corpus | Type | Reduction | Eval report |
186
+ |--------|------|-----------|-------------|
187
+ | Karpathy repos + 5 research papers + 4 images | Mixed (code + papers + images) | **71.5x** | [`worked/karpathy-repos/review.md`](worked/karpathy-repos/review.md) |
188
+ | httpx (Python HTTP client) | Codebase | — | [`worked/httpx/review.md`](worked/httpx/review.md) + [`GRAPH_REPORT.md`](worked/httpx/GRAPH_REPORT.md) |
189
+ | Mixed corpus (code + paper + Arabic image) | Multi-type | — | [`worked/mixed-corpus/review.md`](worked/mixed-corpus/review.md) |
190
+
191
+ Each includes the full graph output and an honest evaluation of what the skill got right and wrong.
192
+
193
+ ## Tech stack
194
+
195
+ | Layer | Library | Why |
196
+ |-------|---------|-----|
197
+ | Graph | NetworkX | Pure Python, same internals as MS GraphRAG |
198
+ | Community detection | Leiden via graspologic | Better than K-means for sparse graphs |
199
+ | Code parsing | tree-sitter | Multi-language AST, deterministic, zero hallucination |
200
+ | Extraction | Claude (parallel subagents) | Reads anything, outputs structured graph data |
201
+ | Visualization | Obsidian vault | Native graph view, wikilinks, no server needed |
202
+
203
+ No Neo4j required. No dashboards. No server. Runs entirely locally.
204
+
205
+ ## Files
206
+
207
+ ```
208
+ graphify/
209
+ ├── detect.py detect file types, auto-exclude venvs/caches/node_modules; scan .graphify/memory/
210
+ ├── extract.py AST extraction (13 languages via tree-sitter) + call-graph pass (INFERRED edges)
211
+ ├── build.py assemble NetworkX graph from extraction JSON; schema-validates before assembly
212
+ ├── cluster.py Leiden community detection, cohesion scoring
213
+ ├── analyze.py god nodes, bridge nodes, surprising connections, suggested questions, graph diff
214
+ ├── report.py render GRAPH_REPORT.md
215
+ ├── export.py Obsidian vault, graph.json, graph.html, graph.svg, Neo4j Cypher, Canvas
216
+ ├── ingest.py fetch URLs (arXiv, Twitter/X, PDF, any webpage); save Q&A to .graphify/memory/
217
+ ├── cache.py SHA256-based per-file extraction cache; check_semantic_cache / save_semantic_cache
218
+ ├── security.py URL validation (http/https only), safe fetch with size cap, path guards, label sanitisation
219
+ ├── validate.py JSON schema checks on extraction output
220
+ ├── serve.py MCP stdio server — query_graph, get_node, get_neighbors, shortest_path, god_nodes
221
+ └── watch.py fs watcher, writes flag file when new files appear
222
+
223
+ skills/graphify/
224
+ └── skill.md the Claude Code skill — the full pipeline the agent runs step by step
225
+
226
+ ARCHITECTURE.md module responsibilities, extraction schema, how to add a language
227
+ SECURITY.md threat model, mitigations, vulnerability reporting
228
+ worked/ eval reports from real corpora (karpathy-repos, httpx, mixed-corpus)
229
+ tests/ 212 tests, one file per module
230
+ pyproject.toml pip install graphifyy | pip install graphifyy[mcp,neo4j,pdf,watch]
231
+ ```
@@ -0,0 +1,27 @@
1
+ """graphify — extract · build · cluster · analyze · report."""
2
+
3
+
4
# Name -> (module, attribute) table for PEP 562 lazy loading.  Built once at
# import time instead of on every attribute access (the original rebuilt the
# dict inside __getattr__ on each call).
_LAZY_ATTRS = {
    "extract": ("graphify.extract", "extract"),
    "collect_files": ("graphify.extract", "collect_files"),
    "build_from_json": ("graphify.build", "build_from_json"),
    "cluster": ("graphify.cluster", "cluster"),
    "score_all": ("graphify.cluster", "score_all"),
    "cohesion_score": ("graphify.cluster", "cohesion_score"),
    "god_nodes": ("graphify.analyze", "god_nodes"),
    "surprising_connections": ("graphify.analyze", "surprising_connections"),
    "suggest_questions": ("graphify.analyze", "suggest_questions"),
    "generate": ("graphify.report", "generate"),
    "to_json": ("graphify.export", "to_json"),
    "to_html": ("graphify.export", "to_html"),
    "to_svg": ("graphify.export", "to_svg"),
    "to_canvas": ("graphify.export", "to_canvas"),
}


def __getattr__(name):
    """Resolve public package attributes lazily (PEP 562).

    Submodules are imported only on first access so that `graphify install`
    works before the heavy dependencies are in place.

    Raises:
        AttributeError: if *name* is not one of the lazily exported names.
    """
    try:
        mod_name, attr = _LAZY_ATTRS[name]
    except KeyError:
        # Preserve the standard module-attribute error message.
        raise AttributeError(f"module 'graphify' has no attribute {name!r}") from None
    import importlib
    mod = importlib.import_module(mod_name)
    return getattr(mod, attr)
@@ -0,0 +1,89 @@
1
+ """graphify CLI — `graphify install` sets up the Claude Code skill."""
2
+ from __future__ import annotations
3
+ import json
4
+ import shutil
5
+ import sys
6
+ from pathlib import Path
7
+
8
# Snippet appended to ~/.claude/CLAUDE.md by `install()` so Claude Code
# routes `/graphify` to the installed skill file.  The leading "\n" keeps a
# separator when the snippet is appended to an existing (rstripped) file.
_SKILL_REGISTRATION = (
    "\n# graphify\n"
    "- **graphify** (`~/.claude/skills/graphify/SKILL.md`) "
    "— any input to knowledge graph. Trigger: `/graphify`\n"
    "When the user types `/graphify`, invoke the Skill tool "
    "with `skill: \"graphify\"` before doing anything else.\n"
)
15
+
16
+
17
def _bundled_skill() -> Path:
    """Return the path of the skill.md shipped alongside this module."""
    return Path(__file__).with_name("skill.md")
20
+
21
+
22
def install() -> None:
    """Copy the bundled skill into ~/.claude and register it in CLAUDE.md.

    Side effects:
        * writes ``~/.claude/skills/graphify/SKILL.md``
        * appends the registration snippet to ``~/.claude/CLAUDE.md``
          (creating the file if it does not exist)
        * exits with status 1 if the bundled skill.md is missing
    """
    skill_src = _bundled_skill()
    if not skill_src.exists():
        print("error: skill.md not found in package — reinstall graphify", file=sys.stderr)
        sys.exit(1)

    # Copy skill to ~/.claude/skills/graphify/SKILL.md
    skill_dst = Path.home() / ".claude" / "skills" / "graphify" / "SKILL.md"
    skill_dst.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(skill_src, skill_dst)
    print(f" skill installed → {skill_dst}")

    # Register in ~/.claude/CLAUDE.md.  Read/write explicitly as UTF-8: the
    # snippet contains non-ASCII characters (→, —) that would corrupt under
    # a non-UTF-8 platform default encoding.
    claude_md = Path.home() / ".claude" / "CLAUDE.md"
    if claude_md.exists():
        content = claude_md.read_text(encoding="utf-8")
        # NOTE(review): broad substring check — any unrelated mention of
        # "graphify" in CLAUDE.md also suppresses registration; confirm this
        # is intended before tightening.
        if "graphify" in content:
            print(" CLAUDE.md → already registered (no change)")
        else:
            claude_md.write_text(content.rstrip() + _SKILL_REGISTRATION, encoding="utf-8")
            print(f" CLAUDE.md → skill registered in {claude_md}")
    else:
        claude_md.parent.mkdir(parents=True, exist_ok=True)
        claude_md.write_text(_SKILL_REGISTRATION.lstrip(), encoding="utf-8")
        print(f" CLAUDE.md → created at {claude_md}")

    print()
    print("Done. Open Claude Code in any directory and type:")
    print()
    print(" /graphify .")
    print()
53
+
54
+
55
def main() -> None:
    """CLI entry point: dispatch ``graphify <command>``."""
    args = sys.argv[1:]
    if not args or args[0] in ("-h", "--help"):
        print("Usage: graphify <command>")
        print()
        print("Commands:")
        print(" install copy skill to ~/.claude/skills/ and register in CLAUDE.md")
        print(" benchmark [graph.json] measure token reduction vs naive full-corpus approach")
        print()
        return

    command = args[0]
    if command == "install":
        install()
        return

    if command == "benchmark":
        from graphify.benchmark import run_benchmark, print_benchmark
        graph_path = args[1] if len(args) > 1 else ".graphify/graph.json"
        # Pick up the corpus word count when a prior detect pass left it behind.
        corpus_words = None
        detect_path = Path(".graphify_detect.json")
        if detect_path.exists():
            try:
                corpus_words = json.loads(detect_path.read_text()).get("total_words")
            except Exception:
                corpus_words = None  # best effort; benchmark runs without it
        print_benchmark(run_benchmark(graph_path, corpus_words=corpus_words))
        return

    print(f"error: unknown command '{command}'", file=sys.stderr)
    print("Run 'graphify --help' for usage.", file=sys.stderr)
    sys.exit(1)
86
+
87
+
88
# Support `python -m graphify` in addition to the console-script entry point.
if __name__ == "__main__":
    main()