malimgraph 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. malimgraph-0.1.1/.claude/settings.local.json +17 -0
  2. malimgraph-0.1.1/.github/workflows/ci.yml +42 -0
  3. malimgraph-0.1.1/.github/workflows/publish.yml +78 -0
  4. malimgraph-0.1.1/.gitignore +13 -0
  5. malimgraph-0.1.1/LICENSE +21 -0
  6. malimgraph-0.1.1/Makefile +45 -0
  7. malimgraph-0.1.1/PKG-INFO +374 -0
  8. malimgraph-0.1.1/README.md +324 -0
  9. malimgraph-0.1.1/docs/database-setup.md +60 -0
  10. malimgraph-0.1.1/docs/getting-started.md +63 -0
  11. malimgraph-0.1.1/docs/mcp-server.md +51 -0
  12. malimgraph-0.1.1/examples/example_output/knowledge_graph.json +189 -0
  13. malimgraph-0.1.1/pyproject.toml +71 -0
  14. malimgraph-0.1.1/skills/chunks-to-pgvector/SKILL.md +200 -0
  15. malimgraph-0.1.1/skills/chunks-to-pgvector/scripts/embed_chunks.py +240 -0
  16. malimgraph-0.1.1/skills/chunks-to-pgvector/scripts/manage_vectors.py +117 -0
  17. malimgraph-0.1.1/skills/chunks-to-pgvector/scripts/search_vectors.py +172 -0
  18. malimgraph-0.1.1/skills/document-to-html/SKILL.md +119 -0
  19. malimgraph-0.1.1/skills/document-to-html/scripts/extract_text.py +125 -0
  20. malimgraph-0.1.1/skills/document-to-html/scripts/render_html.py +230 -0
  21. malimgraph-0.1.1/skills/graph-db-admin/SKILL.md +181 -0
  22. malimgraph-0.1.1/skills/graph-db-admin/scripts/load_graph.py +155 -0
  23. malimgraph-0.1.1/skills/graph-db-admin/scripts/manage_graph.py +144 -0
  24. malimgraph-0.1.1/skills/graph-db-admin/scripts/query_graph.py +100 -0
  25. malimgraph-0.1.1/skills/pdf-to-chunks/SKILL.md +134 -0
  26. malimgraph-0.1.1/skills/pdf-to-chunks/scripts/chunk_document.py +218 -0
  27. malimgraph-0.1.1/skills/pdf-to-chunks/scripts/extract_text.py +125 -0
  28. malimgraph-0.1.1/skills/pdf-to-knowledge-graph/SKILL.md +166 -0
  29. malimgraph-0.1.1/skills/pdf-to-knowledge-graph/scripts/build_knowledge_graph.py +262 -0
  30. malimgraph-0.1.1/skills/pdf-to-knowledge-graph/scripts/extract_text.py +126 -0
  31. malimgraph-0.1.1/skills/pdf-to-knowledge-graph/scripts/generate_graph_files.py +142 -0
  32. malimgraph-0.1.1/src/malimgraph/__init__.py +29 -0
  33. malimgraph-0.1.1/src/malimgraph/cli.py +516 -0
  34. malimgraph-0.1.1/src/malimgraph/core/__init__.py +15 -0
  35. malimgraph-0.1.1/src/malimgraph/core/chunker.py +185 -0
  36. malimgraph-0.1.1/src/malimgraph/core/db_client.py +240 -0
  37. malimgraph-0.1.1/src/malimgraph/core/embedder.py +138 -0
  38. malimgraph-0.1.1/src/malimgraph/core/graph_builder.py +162 -0
  39. malimgraph-0.1.1/src/malimgraph/core/html_renderer.py +327 -0
  40. malimgraph-0.1.1/src/malimgraph/core/llm_extractor.py +274 -0
  41. malimgraph-0.1.1/src/malimgraph/core/pdf_reader.py +131 -0
  42. malimgraph-0.1.1/src/malimgraph/core/rule_extractor.py +175 -0
  43. malimgraph-0.1.1/src/malimgraph/core/vector_client.py +275 -0
  44. malimgraph-0.1.1/src/malimgraph/generators/__init__.py +4 -0
  45. malimgraph-0.1.1/src/malimgraph/generators/age_sql.py +96 -0
  46. malimgraph-0.1.1/src/malimgraph/generators/cypher.py +109 -0
  47. malimgraph-0.1.1/src/malimgraph/schemas/__init__.py +28 -0
  48. malimgraph-0.1.1/src/malimgraph/schemas/chunks.py +38 -0
  49. malimgraph-0.1.1/src/malimgraph/schemas/config.py +26 -0
  50. malimgraph-0.1.1/src/malimgraph/schemas/entities.py +72 -0
  51. malimgraph-0.1.1/src/malimgraph/server.py +387 -0
  52. malimgraph-0.1.1/src/malimgraph/utils/__init__.py +4 -0
  53. malimgraph-0.1.1/src/malimgraph/utils/hashing.py +15 -0
  54. malimgraph-0.1.1/src/malimgraph/utils/text.py +41 -0
  55. malimgraph-0.1.1/tests/conftest.py +121 -0
  56. malimgraph-0.1.1/tests/test_chunker.py +78 -0
  57. malimgraph-0.1.1/tests/test_db_client.py +42 -0
  58. malimgraph-0.1.1/tests/test_generators.py +65 -0
  59. malimgraph-0.1.1/tests/test_graph_builder.py +142 -0
  60. malimgraph-0.1.1/tests/test_html_renderer.py +87 -0
  61. malimgraph-0.1.1/tests/test_pdf_reader.py +115 -0
  62. malimgraph-0.1.1/tests/test_rule_extractor.py +115 -0
  63. malimgraph-0.1.1/tests/test_vector_client.py +83 -0
@@ -0,0 +1,17 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(git config *)",
5
+ "Bash(git commit -m ' *)",
6
+ "Bash(git remote *)",
7
+ "Bash(git branch *)",
8
+ "Bash(git push *)",
9
+ "Bash(git add *)",
10
+ "Bash(gh run *)",
11
+ "mcp__Claude_in_Chrome__tabs_context_mcp",
12
+ "Bash(pip install *)",
13
+ "PowerShell(python --version)",
14
+ "PowerShell(python -m pip install ruff 2>&1)"
15
+ ]
16
+ }
17
+ }
@@ -0,0 +1,42 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ lint:
11
+ name: Lint
12
+ runs-on: ubuntu-latest
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+ - uses: actions/setup-python@v5
16
+ with:
17
+ python-version: "3.12"
18
+ - name: Install ruff
19
+ run: pip install ruff
20
+ - name: Run ruff check
21
+ run: ruff check src/ tests/
22
+ - name: Run ruff format check
23
+ run: ruff format --check src/ tests/
24
+
25
+ test:
26
+ name: Test (Python ${{ matrix.python-version }})
27
+ runs-on: ubuntu-latest
28
+ strategy:
29
+ fail-fast: false
30
+ matrix:
31
+ python-version: ["3.10", "3.11", "3.12"]
32
+ steps:
33
+ - uses: actions/checkout@v4
34
+ - uses: actions/setup-python@v5
35
+ with:
36
+ python-version: ${{ matrix.python-version }}
37
+ - name: Install dependencies
38
+ run: pip install -e ".[dev]"
39
+ - name: Run tests
40
+ run: pytest tests/ -v --tb=short
41
+ env:
42
+ ANTHROPIC_API_KEY: "" # Tests that need this are mocked
@@ -0,0 +1,78 @@
1
+ name: Publish
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ jobs:
9
+ release-build:
10
+ name: Build distributions
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+
15
+ - uses: actions/setup-python@v5
16
+ with:
17
+ python-version: "3.12"
18
+
19
+ - name: Build release distributions
20
+ run: |
21
+ python -m pip install build hatchling
22
+ python -m build
23
+
24
+ - name: Upload distributions
25
+ uses: actions/upload-artifact@v4
26
+ with:
27
+ name: release-dists
28
+ path: dist/
29
+
30
+ pypi-publish:
31
+ name: Publish to PyPI
32
+ runs-on: ubuntu-latest
33
+ needs: release-build
34
+ permissions:
35
+ id-token: write
36
+ environment:
37
+ name: pypi
38
+ url: https://pypi.org/project/malimgraph/${{ github.ref_name }}
39
+ steps:
40
+ - name: Retrieve release distributions
41
+ uses: actions/download-artifact@v4
42
+ with:
43
+ name: release-dists
44
+ path: dist/
45
+
46
+ - name: Publish to PyPI
47
+ uses: pypa/gh-action-pypi-publish@release/v1
48
+ with:
49
+ packages-dir: dist/
50
+
51
+ package-skills:
52
+ name: Package Claude Skills
53
+ runs-on: ubuntu-latest
54
+ permissions:
55
+ contents: write
56
+ steps:
57
+ - uses: actions/checkout@v4
58
+
59
+ - name: Package all skills
60
+ run: |
61
+ mkdir -p dist
62
+ cd skills/pdf-to-knowledge-graph && zip -r ../../dist/pdf-to-knowledge-graph.skill SKILL.md scripts/ && cd ../..
63
+ cd skills/pdf-to-chunks && zip -r ../../dist/pdf-to-chunks.skill SKILL.md scripts/ && cd ../..
64
+ cd skills/document-to-html && zip -r ../../dist/document-to-html.skill SKILL.md scripts/ && cd ../..
65
+ cd skills/graph-db-admin && zip -r ../../dist/graph-db-admin.skill SKILL.md scripts/ && cd ../..
66
+ cd skills/chunks-to-pgvector && zip -r ../../dist/chunks-to-pgvector.skill SKILL.md scripts/ && cd ../..
67
+
68
+ - name: Upload skills to GitHub Release
69
+ uses: softprops/action-gh-release@v2
70
+ with:
71
+ files: |
72
+ dist/pdf-to-knowledge-graph.skill
73
+ dist/pdf-to-chunks.skill
74
+ dist/document-to-html.skill
75
+ dist/graph-db-admin.skill
76
+ dist/chunks-to-pgvector.skill
77
+ env:
78
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -0,0 +1,13 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.pyo
4
+ .Python
5
+ *.egg-info/
6
+ dist/
7
+ build/
8
+ .env
9
+ .venv
10
+ venv/
11
+ .pytest_cache/
12
+ .ruff_cache/
13
+ *.egg
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Malim AI Labs Social Enterprise (003827047-U)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,45 @@
1
+ .PHONY: install dev lint format test test-cov build skills clean serve serve-http
2
+
3
+ install:
4
+ pip install -e .
5
+
6
+ dev:
7
+ pip install -e ".[dev,all]"
8
+
9
+ lint:
10
+ ruff check src/ tests/
11
+ ruff format --check src/ tests/
12
+
13
+ format:
14
+ ruff format src/ tests/
15
+ ruff check --fix src/ tests/
16
+
17
+ test:
18
+ pytest tests/ -v
19
+
20
+ test-cov:
21
+ pytest tests/ -v --tb=short --cov=malimgraph --cov-report=term-missing
22
+
23
+ build:
24
+ python -m build
25
+
26
+ skills:
27
+ @echo "Packaging skills..."
28
+ @mkdir -p dist
29
+ cd skills/pdf-to-knowledge-graph && zip -r ../../dist/pdf-to-knowledge-graph.skill SKILL.md scripts/
30
+ cd skills/pdf-to-chunks && zip -r ../../dist/pdf-to-chunks.skill SKILL.md scripts/
31
+ cd skills/document-to-html && zip -r ../../dist/document-to-html.skill SKILL.md scripts/
32
+ cd skills/graph-db-admin && zip -r ../../dist/graph-db-admin.skill SKILL.md scripts/
33
+ cd skills/chunks-to-pgvector && zip -r ../../dist/chunks-to-pgvector.skill SKILL.md scripts/
34
+ @echo "Skills packaged in dist/"
35
+
36
+ clean:
37
+ rm -rf dist/ build/ *.egg-info src/*.egg-info __pycache__
38
+ find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
39
+ find . -name "*.pyc" -delete
40
+
41
+ serve:
42
+ malimgraph serve
43
+
44
+ serve-http:
45
+ malimgraph serve --transport http --port 8080
@@ -0,0 +1,374 @@
1
+ Metadata-Version: 2.4
2
+ Name: malimgraph
3
+ Version: 0.1.1
4
+ Summary: Transform PDF documents into structured knowledge graphs with citation provenance
5
+ Project-URL: Homepage, https://github.com/AiMalim/malimgraph
6
+ Project-URL: Documentation, https://ailabs.malim.my/malimgraph
7
+ Project-URL: Repository, https://github.com/AiMalim/malimgraph
8
+ Project-URL: Issues, https://github.com/AiMalim/malimgraph/issues
9
+ Author-email: Malim AI Labs <hello@malim.my>
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: apache-age,cypher,graphrag,knowledge-graph,mcp,neo4j,nlp,pdf
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Topic :: Database :: Database Engines/Servers
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Requires-Python: >=3.10
19
+ Requires-Dist: anthropic>=0.40
20
+ Requires-Dist: click>=8.0
21
+ Requires-Dist: mcp>=1.0
22
+ Requires-Dist: pydantic-settings>=2.0
23
+ Requires-Dist: pydantic>=2.0
24
+ Requires-Dist: pymupdf>=1.24
25
+ Provides-Extra: age
26
+ Requires-Dist: psycopg2-binary>=2.9; extra == 'age'
27
+ Provides-Extra: all
28
+ Requires-Dist: neo4j>=5.0; extra == 'all'
29
+ Requires-Dist: openai>=1.0; extra == 'all'
30
+ Requires-Dist: pgvector>=0.2; extra == 'all'
31
+ Requires-Dist: psycopg2-binary>=2.9; extra == 'all'
32
+ Requires-Dist: sentence-transformers>=3.0; extra == 'all'
33
+ Requires-Dist: voyageai>=0.2; extra == 'all'
34
+ Provides-Extra: dev
35
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
36
+ Requires-Dist: pytest>=8.0; extra == 'dev'
37
+ Requires-Dist: ruff>=0.5; extra == 'dev'
38
+ Provides-Extra: local
39
+ Requires-Dist: sentence-transformers>=3.0; extra == 'local'
40
+ Provides-Extra: neo4j
41
+ Requires-Dist: neo4j>=5.0; extra == 'neo4j'
42
+ Provides-Extra: openai
43
+ Requires-Dist: openai>=1.0; extra == 'openai'
44
+ Provides-Extra: pgvector
45
+ Requires-Dist: pgvector>=0.2; extra == 'pgvector'
46
+ Requires-Dist: psycopg2-binary>=2.9; extra == 'pgvector'
47
+ Provides-Extra: voyage
48
+ Requires-Dist: voyageai>=0.2; extra == 'voyage'
49
+ Description-Content-Type: text/markdown
50
+
51
+ # MalimGraph
52
+
53
+ ```
54
+ ███╗ ███╗ █████╗ ██╗ ██╗███╗ ███╗ ██████╗ ██████╗ █████╗ ██████╗ ██╗ ██╗
55
+ ████╗ ████║██╔══██╗██║ ██║████╗ ████║██╔════╝ ██╔══██╗██╔══██╗██╔══██╗██║ ██║
56
+ ██╔████╔██║███████║██║ ██║██╔████╔██║██║ ███╗██████╔╝███████║██████╔╝███████║
57
+ ██║╚██╔╝██║██╔══██║██║ ██║██║╚██╔╝██║██║ ██║██╔══██╗██╔══██║██╔═══╝ ██╔══██║
58
+ ██║ ╚═╝ ██║██║ ██║███████╗██║██║ ╚═╝ ██║╚██████╔╝██║ ██║██║ ██║██║ ██║ ██║
59
+ ╚═╝ ╚═╝╚═╝ ╚═╝╚══════╝╚═╝╚═╝ ╚═╝ ╚═════╝ ╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝
60
+ ```
61
+
62
+ [![PyPI version](https://badge.fury.io/py/malimgraph.svg)](https://badge.fury.io/py/malimgraph)
63
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
64
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
65
+ [![MCP Compatible](https://img.shields.io/badge/MCP-Compatible-purple.svg)](https://modelcontextprotocol.io)
66
+ [![CI](https://github.com/AiMalim/malimgraph/actions/workflows/ci.yml/badge.svg)](https://github.com/AiMalim/malimgraph/actions/workflows/ci.yml)
67
+
68
+ **From documents to knowledge graphs.**
69
+
70
+ Transform PDF documents into structured knowledge graphs with full citation provenance. Every entity and relationship traces back to the exact PDF page and verbatim text that supports it.
71
+
72
+ ---
73
+
74
+ ## Features
75
+
76
+ | Tool | Description |
77
+ |------|-------------|
78
+ | `extract_knowledge_graph` | Hybrid rule + LLM extraction → entities, relationships, citations |
79
+ | `chunk_document` | Token-aware overlapping chunks with heading context for RAG |
80
+ | `render_document_html` | Structured HTML with page anchors, entity annotations, TOC + search |
81
+ | `manage_graph_db` | Load, query, and manage graphs in Neo4j or PostgreSQL (Apache AGE) |
82
+ | `embed_and_store_chunks` | Embed chunks into PostgreSQL pgvector (OpenAI / Voyage / local) |
83
+
84
+ **Three ways to use:**
85
+ - **MCP Server** — connect to Claude Desktop, Claude Code, or claude.ai
86
+ - **CLI** — `malimgraph extract`, `chunk`, `render`, `db`, `vector`
87
+ - **Claude Skills** — 5 installable `.skill` packages for claude.ai
88
+
89
+ ---
90
+
91
+ ## Quick Start
92
+
93
+ ```bash
94
+ pip install malimgraph
95
+ export ANTHROPIC_API_KEY=sk-ant-...
96
+
97
+ # Extract knowledge graph
98
+ malimgraph extract --input report.pdf --output ./output/ --format all
99
+
100
+ # Chunk for RAG
101
+ malimgraph chunk --input report.pdf --output ./chunks/
102
+
103
+ # Embed chunks into pgvector
104
+ export PGVECTOR_URI="postgresql://user:pass@localhost:5432/mydb"
105
+ export OPENAI_API_KEY=sk-...
106
+ malimgraph vector load --input ./chunks/chunks.json
107
+
108
+ # Render as browsable HTML
109
+ malimgraph render --input report.pdf --output document.html
110
+ ```
111
+
112
+ ---
113
+
114
+ ## How It Works
115
+
116
+ ```
117
+ PDF
118
+ │
119
+ ▼
120
+ pdf_reader.py ──────────────────────────────────────────────┐
121
+ │ (PyMuPDF: text, headings, tables, page structure) │
122
+ ├──────────────────────────────────┐ │
123
+ ▼ ▼ ▼
124
+ rule_extractor.py llm_extractor.py chunker.py
125
+ │ (regex: dates, amounts, │ (Anthropic API: │ (sliding window
126
+ │ emails, legal refs, │ semantic entities, │ with heading
127
+ │ section numbers) │ relationships, │ context)
128
+ │ │ source_text required) │
129
+ └──────────────┬───────────────┘ │
130
+ ▼ ▼
131
+ graph_builder.py embedder.py
132
+ │ (merge + dedup: │ (OpenAI / Voyage /
133
+ │ hybrid method, │ local sentence-
134
+ │ citation accumulation, │ transformers)
135
+ │ stable IDs) │
136
+ ▼ ▼
137
+ knowledge_graph.json vector_client.py
138
+ │ (pgvector: HNSW index,
139
+ ┌─────┴──────┐ cosine similarity search)
140
+ ▼ ▼
141
+ cypher.py age_sql.py
142
+ (.cypher) (.sql)
143
+ ```
144
+
145
+ ---
146
+
147
+ ## Three Ways to Use
148
+
149
+ ### MCP Server
150
+
151
+ ```bash
152
+ # stdio (for Claude Desktop / Claude Code)
153
+ malimgraph serve
154
+
155
+ # HTTP (for remote connections / claude.ai)
156
+ malimgraph serve --transport http --port 8080
157
+ ```
158
+
159
+ **Claude Desktop config** (`claude_desktop_config.json`):
160
+ ```json
161
+ {
162
+ "mcpServers": {
163
+ "malimgraph": {
164
+ "command": "malimgraph",
165
+ "args": ["serve"],
166
+ "env": { "ANTHROPIC_API_KEY": "sk-ant-..." }
167
+ }
168
+ }
169
+ }
170
+ ```
171
+
172
+ **Claude Code:**
173
+ ```bash
174
+ claude mcp add malimgraph -- malimgraph serve
175
+ ```
176
+
177
+ ### CLI
178
+
179
+ ```bash
180
+ # Extract knowledge graph from PDF
181
+ malimgraph extract \
182
+ --input report.pdf \
183
+ --output ./output/ \
184
+ --entity-types auto \
185
+ --format all \
186
+ --graph-name my_graph
187
+
188
+ # Chunk for embeddings
189
+ malimgraph chunk \
190
+ --input report.pdf \
191
+ --output ./chunks/ \
192
+ --chunk-size 512 \
193
+ --overlap 64 \
194
+ --format json
195
+
196
+ # Embed chunks into PostgreSQL pgvector
197
+ malimgraph vector load \
198
+ --input ./chunks/chunks.json \
199
+ --uri "postgresql://user:pass@localhost:5432/mydb" \
200
+ --provider openai \
201
+ --table document_chunks
202
+
203
+ # Semantic search over embedded chunks
204
+ malimgraph vector search \
205
+ --query "What are the financial risks?" \
206
+ --uri "postgresql://user:pass@localhost:5432/mydb" \
207
+ --top-k 5
208
+
209
+ # Render as browsable HTML
210
+ malimgraph render \
211
+ --input report.pdf \
212
+ --output document.html \
213
+ --knowledge-graph ./output/knowledge_graph.json
214
+
215
+ # Load into Neo4j
216
+ malimgraph db load \
217
+ --input ./output/knowledge_graph.json \
218
+ --target neo4j \
219
+ --uri bolt://localhost:7687 \
220
+ --user neo4j \
221
+ --password secret
222
+
223
+ # Query the graph
224
+ malimgraph db query \
225
+ --target neo4j \
226
+ --uri bolt://localhost:7687 \
227
+ --query "MATCH (n:Organization) RETURN n.label, n.source_pages LIMIT 10"
228
+ ```
229
+
230
+ ### Claude Skills
231
+
232
+ Download the `.skill` files from [GitHub Releases](https://github.com/AiMalim/malimgraph/releases) and install them in claude.ai → Settings → Skills.
233
+
234
+ | Skill | Trigger phrases |
235
+ |-------|----------------|
236
+ | `pdf-to-knowledge-graph` | "knowledge graph", "extract entities", "PDF to Cypher" |
237
+ | `pdf-to-chunks` | "chunk document", "split for embeddings", "RAG chunks" |
238
+ | `document-to-html` | "convert PDF to HTML", "render document", "make PDF browsable" |
239
+ | `graph-db-admin` | "load into Neo4j", "Cypher query", "graph statistics" |
240
+ | `chunks-to-pgvector` | "store in pgvector", "embed into PostgreSQL", "semantic search", "RAG with PostgreSQL" |
241
+
242
+ ---
243
+
244
+ ## Installation
245
+
246
+ ```bash
247
+ # Core (knowledge graph + chunking + HTML)
248
+ pip install malimgraph
249
+
250
+ # With Neo4j support
251
+ pip install "malimgraph[neo4j]"
252
+
253
+ # With Apache AGE support
254
+ pip install "malimgraph[age]"
255
+
256
+ # With pgvector + OpenAI embeddings
257
+ pip install "malimgraph[pgvector,openai]"
258
+
259
+ # With pgvector + Voyage AI embeddings
260
+ pip install "malimgraph[pgvector,voyage]"
261
+
262
+ # With local embeddings (no API key needed)
263
+ pip install "malimgraph[pgvector,local]"
264
+
265
+ # Everything
266
+ pip install "malimgraph[all]"
267
+ ```
268
+
269
+ ### Environment Variables
270
+
271
+ ```bash
272
+ ANTHROPIC_API_KEY=sk-ant-... # Required for LLM extraction
273
+ OPENAI_API_KEY=sk-... # Required for OpenAI embeddings
274
+ VOYAGE_API_KEY=pa-... # Required for Voyage AI embeddings
275
+ PGVECTOR_URI=postgresql://... # PostgreSQL connection for pgvector
276
+ NEO4J_URI=bolt://localhost:7687 # Neo4j connection
277
+ NEO4J_USER=neo4j
278
+ NEO4J_PASSWORD=yourpassword
279
+ AGE_CONNECTION_URI=host=... # Apache AGE connection
280
+ ```
281
+
282
+ ---
283
+
284
+ ## Output Schema — `knowledge_graph.json`
285
+
286
+ Every entity and relationship carries full citation provenance:
287
+
288
+ | Field | Type | Description |
289
+ |-------|------|-------------|
290
+ | `id` | string | Stable hash ID: `e_` + MD5(type:label)[:8] |
291
+ | `label` | string | Canonical entity name |
292
+ | `type` | string | Entity type (Organization, Person, Date, …) |
293
+ | `extraction_method` | enum | `rule` / `llm` / `hybrid` |
294
+ | `confidence` | enum | `high` / `medium` / `low` |
295
+ | `source_pages` | int[] | PDF page numbers where found |
296
+ | `source_text` | string | Primary verbatim supporting quote |
297
+ | `source_chunk_id` | string | Processing chunk ID |
298
+ | `citations[]` | object[] | All supporting quotes with page refs |
299
+ | `citation_count` | int | Stored as property in graph DBs |
300
+
301
+ ---
302
+
303
+ ## pgvector — Semantic Search Schema
304
+
305
+ Chunks are stored with embeddings in PostgreSQL, enabling semantic search:
306
+
307
+ ```sql
308
+ -- Find chunks most similar to a query
309
+ SELECT chunk_text, source_file, page_numbers, heading_context,
310
+ 1 - (embedding <=> '[...]'::vector) AS score
311
+ FROM document_chunks
312
+ ORDER BY embedding <=> '[...]'::vector
313
+ LIMIT 10;
314
+
315
+ -- Filter by document
316
+ SELECT * FROM document_chunks
317
+ WHERE document_id = 'annual_report_2024'
318
+ ORDER BY embedding <=> '[...]'::vector LIMIT 5;
319
+ ```
320
+
321
+ **Supported embedding providers:**
322
+
323
+ | Provider | Default model | Dimension | API key |
324
+ |----------|--------------|-----------|---------|
325
+ | `openai` | `text-embedding-3-small` | 1536-d | `OPENAI_API_KEY` |
326
+ | `voyage` | `voyage-3-large` | 1024-d | `VOYAGE_API_KEY` |
327
+ | `local` | `all-MiniLM-L6-v2` | 384-d | none (CPU) |
328
+
329
+ ---
330
+
331
+ ## Database Setup
332
+
333
+ ### Neo4j
334
+ ```bash
335
+ docker run -p 7474:7474 -p 7687:7687 \
336
+ -e NEO4J_AUTH=neo4j/yourpassword neo4j:latest
337
+ ```
338
+
339
+ ### Apache AGE (PostgreSQL)
340
+ ```bash
341
+ docker run -p 5432:5432 -e POSTGRES_PASSWORD=secret apache/age:latest
342
+ ```
343
+
344
+ ### pgvector (PostgreSQL)
345
+ ```bash
346
+ docker run -p 5432:5432 -e POSTGRES_PASSWORD=secret pgvector/pgvector:pg17
347
+ ```
348
+
349
+ See [docs/database-setup.md](docs/database-setup.md) for full guides.
350
+
351
+ ---
352
+
353
+ ## Contributing
354
+
355
+ 1. Fork the repo
356
+ 2. Create a feature branch: `git checkout -b feature/my-feature`
357
+ 3. Install dev deps: `pip install -e ".[dev]"`
358
+ 4. Run tests: `make test`
359
+ 5. Lint: `make lint`
360
+ 6. Submit a PR
361
+
362
+ ---
363
+
364
+ ## Credits
365
+
366
+ Built by **[Malim AI Labs](https://ailabs.malim.my)** — AI-powered knowledge infrastructure for Southeast Asia.
367
+
368
+ Malim AI Labs Social Enterprise (003827047-U) · Kuala Lumpur, Malaysia
369
+
370
+ ---
371
+
372
+ ## License
373
+
374
+ MIT — see [LICENSE](LICENSE)