malimgraph 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- malimgraph-0.1.1/.claude/settings.local.json +17 -0
- malimgraph-0.1.1/.github/workflows/ci.yml +42 -0
- malimgraph-0.1.1/.github/workflows/publish.yml +78 -0
- malimgraph-0.1.1/.gitignore +13 -0
- malimgraph-0.1.1/LICENSE +21 -0
- malimgraph-0.1.1/Makefile +45 -0
- malimgraph-0.1.1/PKG-INFO +374 -0
- malimgraph-0.1.1/README.md +324 -0
- malimgraph-0.1.1/docs/database-setup.md +60 -0
- malimgraph-0.1.1/docs/getting-started.md +63 -0
- malimgraph-0.1.1/docs/mcp-server.md +51 -0
- malimgraph-0.1.1/examples/example_output/knowledge_graph.json +189 -0
- malimgraph-0.1.1/pyproject.toml +71 -0
- malimgraph-0.1.1/skills/chunks-to-pgvector/SKILL.md +200 -0
- malimgraph-0.1.1/skills/chunks-to-pgvector/scripts/embed_chunks.py +240 -0
- malimgraph-0.1.1/skills/chunks-to-pgvector/scripts/manage_vectors.py +117 -0
- malimgraph-0.1.1/skills/chunks-to-pgvector/scripts/search_vectors.py +172 -0
- malimgraph-0.1.1/skills/document-to-html/SKILL.md +119 -0
- malimgraph-0.1.1/skills/document-to-html/scripts/extract_text.py +125 -0
- malimgraph-0.1.1/skills/document-to-html/scripts/render_html.py +230 -0
- malimgraph-0.1.1/skills/graph-db-admin/SKILL.md +181 -0
- malimgraph-0.1.1/skills/graph-db-admin/scripts/load_graph.py +155 -0
- malimgraph-0.1.1/skills/graph-db-admin/scripts/manage_graph.py +144 -0
- malimgraph-0.1.1/skills/graph-db-admin/scripts/query_graph.py +100 -0
- malimgraph-0.1.1/skills/pdf-to-chunks/SKILL.md +134 -0
- malimgraph-0.1.1/skills/pdf-to-chunks/scripts/chunk_document.py +218 -0
- malimgraph-0.1.1/skills/pdf-to-chunks/scripts/extract_text.py +125 -0
- malimgraph-0.1.1/skills/pdf-to-knowledge-graph/SKILL.md +166 -0
- malimgraph-0.1.1/skills/pdf-to-knowledge-graph/scripts/build_knowledge_graph.py +262 -0
- malimgraph-0.1.1/skills/pdf-to-knowledge-graph/scripts/extract_text.py +126 -0
- malimgraph-0.1.1/skills/pdf-to-knowledge-graph/scripts/generate_graph_files.py +142 -0
- malimgraph-0.1.1/src/malimgraph/__init__.py +29 -0
- malimgraph-0.1.1/src/malimgraph/cli.py +516 -0
- malimgraph-0.1.1/src/malimgraph/core/__init__.py +15 -0
- malimgraph-0.1.1/src/malimgraph/core/chunker.py +185 -0
- malimgraph-0.1.1/src/malimgraph/core/db_client.py +240 -0
- malimgraph-0.1.1/src/malimgraph/core/embedder.py +138 -0
- malimgraph-0.1.1/src/malimgraph/core/graph_builder.py +162 -0
- malimgraph-0.1.1/src/malimgraph/core/html_renderer.py +327 -0
- malimgraph-0.1.1/src/malimgraph/core/llm_extractor.py +274 -0
- malimgraph-0.1.1/src/malimgraph/core/pdf_reader.py +131 -0
- malimgraph-0.1.1/src/malimgraph/core/rule_extractor.py +175 -0
- malimgraph-0.1.1/src/malimgraph/core/vector_client.py +275 -0
- malimgraph-0.1.1/src/malimgraph/generators/__init__.py +4 -0
- malimgraph-0.1.1/src/malimgraph/generators/age_sql.py +96 -0
- malimgraph-0.1.1/src/malimgraph/generators/cypher.py +109 -0
- malimgraph-0.1.1/src/malimgraph/schemas/__init__.py +28 -0
- malimgraph-0.1.1/src/malimgraph/schemas/chunks.py +38 -0
- malimgraph-0.1.1/src/malimgraph/schemas/config.py +26 -0
- malimgraph-0.1.1/src/malimgraph/schemas/entities.py +72 -0
- malimgraph-0.1.1/src/malimgraph/server.py +387 -0
- malimgraph-0.1.1/src/malimgraph/utils/__init__.py +4 -0
- malimgraph-0.1.1/src/malimgraph/utils/hashing.py +15 -0
- malimgraph-0.1.1/src/malimgraph/utils/text.py +41 -0
- malimgraph-0.1.1/tests/conftest.py +121 -0
- malimgraph-0.1.1/tests/test_chunker.py +78 -0
- malimgraph-0.1.1/tests/test_db_client.py +42 -0
- malimgraph-0.1.1/tests/test_generators.py +65 -0
- malimgraph-0.1.1/tests/test_graph_builder.py +142 -0
- malimgraph-0.1.1/tests/test_html_renderer.py +87 -0
- malimgraph-0.1.1/tests/test_pdf_reader.py +115 -0
- malimgraph-0.1.1/tests/test_rule_extractor.py +115 -0
- malimgraph-0.1.1/tests/test_vector_client.py +83 -0
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"Bash(git config *)",
|
|
5
|
+
"Bash(git commit -m ' *)",
|
|
6
|
+
"Bash(git remote *)",
|
|
7
|
+
"Bash(git branch *)",
|
|
8
|
+
"Bash(git push *)",
|
|
9
|
+
"Bash(git add *)",
|
|
10
|
+
"Bash(gh run *)",
|
|
11
|
+
"mcp__Claude_in_Chrome__tabs_context_mcp",
|
|
12
|
+
"Bash(pip install *)",
|
|
13
|
+
"PowerShell(python --version)",
|
|
14
|
+
"PowerShell(python -m pip install ruff 2>&1)"
|
|
15
|
+
]
|
|
16
|
+
}
|
|
17
|
+
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
lint:
|
|
11
|
+
name: Lint
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
- uses: actions/setup-python@v5
|
|
16
|
+
with:
|
|
17
|
+
python-version: "3.12"
|
|
18
|
+
- name: Install ruff
|
|
19
|
+
run: pip install ruff
|
|
20
|
+
- name: Run ruff check
|
|
21
|
+
run: ruff check src/ tests/
|
|
22
|
+
- name: Run ruff format check
|
|
23
|
+
run: ruff format --check src/ tests/
|
|
24
|
+
|
|
25
|
+
test:
|
|
26
|
+
name: Test (Python ${{ matrix.python-version }})
|
|
27
|
+
runs-on: ubuntu-latest
|
|
28
|
+
strategy:
|
|
29
|
+
fail-fast: false
|
|
30
|
+
matrix:
|
|
31
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
32
|
+
steps:
|
|
33
|
+
- uses: actions/checkout@v4
|
|
34
|
+
- uses: actions/setup-python@v5
|
|
35
|
+
with:
|
|
36
|
+
python-version: ${{ matrix.python-version }}
|
|
37
|
+
- name: Install dependencies
|
|
38
|
+
run: pip install -e ".[dev]"
|
|
39
|
+
- name: Run tests
|
|
40
|
+
run: pytest tests/ -v --tb=short
|
|
41
|
+
env:
|
|
42
|
+
ANTHROPIC_API_KEY: "" # Tests that need this are mocked
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
name: Publish
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
release-build:
|
|
10
|
+
name: Build distributions
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
|
|
15
|
+
- uses: actions/setup-python@v5
|
|
16
|
+
with:
|
|
17
|
+
python-version: "3.12"
|
|
18
|
+
|
|
19
|
+
- name: Build release distributions
|
|
20
|
+
run: |
|
|
21
|
+
python -m pip install build hatchling
|
|
22
|
+
python -m build
|
|
23
|
+
|
|
24
|
+
- name: Upload distributions
|
|
25
|
+
uses: actions/upload-artifact@v4
|
|
26
|
+
with:
|
|
27
|
+
name: release-dists
|
|
28
|
+
path: dist/
|
|
29
|
+
|
|
30
|
+
pypi-publish:
|
|
31
|
+
name: Publish to PyPI
|
|
32
|
+
runs-on: ubuntu-latest
|
|
33
|
+
needs: release-build
|
|
34
|
+
permissions:
|
|
35
|
+
id-token: write
|
|
36
|
+
environment:
|
|
37
|
+
name: pypi
|
|
38
|
+
url: https://pypi.org/project/malimgraph/${{ github.ref_name }}
|
|
39
|
+
steps:
|
|
40
|
+
- name: Retrieve release distributions
|
|
41
|
+
uses: actions/download-artifact@v4
|
|
42
|
+
with:
|
|
43
|
+
name: release-dists
|
|
44
|
+
path: dist/
|
|
45
|
+
|
|
46
|
+
- name: Publish to PyPI
|
|
47
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
48
|
+
with:
|
|
49
|
+
packages-dir: dist/
|
|
50
|
+
|
|
51
|
+
package-skills:
|
|
52
|
+
name: Package Claude Skills
|
|
53
|
+
runs-on: ubuntu-latest
|
|
54
|
+
permissions:
|
|
55
|
+
contents: write
|
|
56
|
+
steps:
|
|
57
|
+
- uses: actions/checkout@v4
|
|
58
|
+
|
|
59
|
+
- name: Package all skills
|
|
60
|
+
run: |
|
|
61
|
+
mkdir -p dist
|
|
62
|
+
cd skills/pdf-to-knowledge-graph && zip -r ../../dist/pdf-to-knowledge-graph.skill SKILL.md scripts/ && cd ../..
|
|
63
|
+
cd skills/pdf-to-chunks && zip -r ../../dist/pdf-to-chunks.skill SKILL.md scripts/ && cd ../..
|
|
64
|
+
cd skills/document-to-html && zip -r ../../dist/document-to-html.skill SKILL.md scripts/ && cd ../..
|
|
65
|
+
cd skills/graph-db-admin && zip -r ../../dist/graph-db-admin.skill SKILL.md scripts/ && cd ../..
|
|
66
|
+
cd skills/chunks-to-pgvector && zip -r ../../dist/chunks-to-pgvector.skill SKILL.md scripts/ && cd ../..
|
|
67
|
+
|
|
68
|
+
- name: Upload skills to GitHub Release
|
|
69
|
+
uses: softprops/action-gh-release@v2
|
|
70
|
+
with:
|
|
71
|
+
files: |
|
|
72
|
+
dist/pdf-to-knowledge-graph.skill
|
|
73
|
+
dist/pdf-to-chunks.skill
|
|
74
|
+
dist/document-to-html.skill
|
|
75
|
+
dist/graph-db-admin.skill
|
|
76
|
+
dist/chunks-to-pgvector.skill
|
|
77
|
+
env:
|
|
78
|
+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
malimgraph-0.1.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Malim AI Labs Social Enterprise (003827047-U)
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
.PHONY: install dev lint test build skills clean
|
|
2
|
+
|
|
3
|
+
install:
|
|
4
|
+
pip install -e .
|
|
5
|
+
|
|
6
|
+
dev:
|
|
7
|
+
pip install -e ".[dev,all]"
|
|
8
|
+
|
|
9
|
+
lint:
|
|
10
|
+
ruff check src/ tests/
|
|
11
|
+
ruff format --check src/ tests/
|
|
12
|
+
|
|
13
|
+
format:
|
|
14
|
+
ruff format src/ tests/
|
|
15
|
+
ruff check --fix src/ tests/
|
|
16
|
+
|
|
17
|
+
test:
|
|
18
|
+
pytest tests/ -v
|
|
19
|
+
|
|
20
|
+
test-cov:
|
|
21
|
+
pytest tests/ -v --tb=short --cov=malimgraph --cov-report=term-missing
|
|
22
|
+
|
|
23
|
+
build:
|
|
24
|
+
python -m build
|
|
25
|
+
|
|
26
|
+
skills:
|
|
27
|
+
@echo "Packaging skills..."
|
|
28
|
+
@mkdir -p dist
|
|
29
|
+
cd skills/pdf-to-knowledge-graph && zip -r ../../dist/pdf-to-knowledge-graph.skill SKILL.md scripts/
|
|
30
|
+
cd skills/pdf-to-chunks && zip -r ../../dist/pdf-to-chunks.skill SKILL.md scripts/
|
|
31
|
+
cd skills/document-to-html && zip -r ../../dist/document-to-html.skill SKILL.md scripts/
|
|
32
|
+
cd skills/graph-db-admin && zip -r ../../dist/graph-db-admin.skill SKILL.md scripts/
|
|
33
|
+
cd skills/chunks-to-pgvector && zip -r ../../dist/chunks-to-pgvector.skill SKILL.md scripts/
|
|
34
|
+
@echo "Skills packaged in dist/"
|
|
35
|
+
|
|
36
|
+
clean:
|
|
37
|
+
rm -rf dist/ build/ *.egg-info src/*.egg-info __pycache__
|
|
38
|
+
find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
|
|
39
|
+
find . -name "*.pyc" -delete
|
|
40
|
+
|
|
41
|
+
serve:
|
|
42
|
+
malimgraph serve
|
|
43
|
+
|
|
44
|
+
serve-http:
|
|
45
|
+
malimgraph serve --transport http --port 8080
|
|
@@ -0,0 +1,374 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: malimgraph
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Transform PDF documents into structured knowledge graphs with citation provenance
|
|
5
|
+
Project-URL: Homepage, https://github.com/AiMalim/malimgraph
|
|
6
|
+
Project-URL: Documentation, https://ailabs.malim.my/malimgraph
|
|
7
|
+
Project-URL: Repository, https://github.com/AiMalim/malimgraph
|
|
8
|
+
Project-URL: Issues, https://github.com/AiMalim/malimgraph/issues
|
|
9
|
+
Author-email: Malim AI Labs <hello@malim.my>
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: apache-age,cypher,graphrag,knowledge-graph,mcp,neo4j,nlp,pdf
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Topic :: Database :: Database Engines/Servers
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Requires-Dist: anthropic>=0.40
|
|
20
|
+
Requires-Dist: click>=8.0
|
|
21
|
+
Requires-Dist: mcp>=1.0
|
|
22
|
+
Requires-Dist: pydantic-settings>=2.0
|
|
23
|
+
Requires-Dist: pydantic>=2.0
|
|
24
|
+
Requires-Dist: pymupdf>=1.24
|
|
25
|
+
Provides-Extra: age
|
|
26
|
+
Requires-Dist: psycopg2-binary>=2.9; extra == 'age'
|
|
27
|
+
Provides-Extra: all
|
|
28
|
+
Requires-Dist: neo4j>=5.0; extra == 'all'
|
|
29
|
+
Requires-Dist: openai>=1.0; extra == 'all'
|
|
30
|
+
Requires-Dist: pgvector>=0.2; extra == 'all'
|
|
31
|
+
Requires-Dist: psycopg2-binary>=2.9; extra == 'all'
|
|
32
|
+
Requires-Dist: sentence-transformers>=3.0; extra == 'all'
|
|
33
|
+
Requires-Dist: voyageai>=0.2; extra == 'all'
|
|
34
|
+
Provides-Extra: dev
|
|
35
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
36
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
37
|
+
Requires-Dist: ruff>=0.5; extra == 'dev'
|
|
38
|
+
Provides-Extra: local
|
|
39
|
+
Requires-Dist: sentence-transformers>=3.0; extra == 'local'
|
|
40
|
+
Provides-Extra: neo4j
|
|
41
|
+
Requires-Dist: neo4j>=5.0; extra == 'neo4j'
|
|
42
|
+
Provides-Extra: openai
|
|
43
|
+
Requires-Dist: openai>=1.0; extra == 'openai'
|
|
44
|
+
Provides-Extra: pgvector
|
|
45
|
+
Requires-Dist: pgvector>=0.2; extra == 'pgvector'
|
|
46
|
+
Requires-Dist: psycopg2-binary>=2.9; extra == 'pgvector'
|
|
47
|
+
Provides-Extra: voyage
|
|
48
|
+
Requires-Dist: voyageai>=0.2; extra == 'voyage'
|
|
49
|
+
Description-Content-Type: text/markdown
|
|
50
|
+
|
|
51
|
+
# MalimGraph
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
███╗ ███╗ █████╗ ██╗ ██╗███╗ ███╗ ██████╗ ██████╗ █████╗ ██████╗ ██╗ ██╗
|
|
55
|
+
████╗ ████║██╔══██╗██║ ██║████╗ ████║██╔════╝ ██╔══██╗██╔══██╗██╔══██╗██║ ██║
|
|
56
|
+
██╔████╔██║███████║██║ ██║██╔████╔██║██║ ███╗██████╔╝███████║██████╔╝███████║
|
|
57
|
+
██║╚██╔╝██║██╔══██║██║ ██║██║╚██╔╝██║██║ ██║██╔══██╗██╔══██║██╔═══╝ ██╔══██║
|
|
58
|
+
██║ ╚═╝ ██║██║ ██║███████╗██║██║ ╚═╝ ██║╚██████╔╝██║ ██║██║ ██║██║ ██║ ██║
|
|
59
|
+
╚═╝ ╚═╝╚═╝ ╚═╝╚══════╝╚═╝╚═╝ ╚═╝ ╚═════╝ ╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
[PyPI version](https://badge.fury.io/py/malimgraph)
|
|
63
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
64
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
65
|
+
[MCP compatible](https://modelcontextprotocol.io)
|
|
66
|
+
[CI status](https://github.com/malim-ai-labs/malim-graph-plugin/actions/workflows/ci.yml)
|
|
67
|
+
|
|
68
|
+
**From documents to knowledge graphs.**
|
|
69
|
+
|
|
70
|
+
Transform PDF documents into structured knowledge graphs with full citation provenance. Every entity and relationship traces back to the exact PDF page and verbatim text that supports it.
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## Features
|
|
75
|
+
|
|
76
|
+
| Tool | Description |
|
|
77
|
+
|------|-------------|
|
|
78
|
+
| `extract_knowledge_graph` | Hybrid rule + LLM extraction → entities, relationships, citations |
|
|
79
|
+
| `chunk_document` | Token-aware overlapping chunks with heading context for RAG |
|
|
80
|
+
| `render_document_html` | Structured HTML with page anchors, entity annotations, TOC + search |
|
|
81
|
+
| `manage_graph_db` | Load, query, and manage graphs in Neo4j or PostgreSQL (Apache AGE) |
|
|
82
|
+
| `embed_and_store_chunks` | Embed chunks into PostgreSQL pgvector (OpenAI / Voyage / local) |
|
|
83
|
+
|
|
84
|
+
**Three ways to use it:**
|
|
85
|
+
- **MCP Server** — connect to Claude Desktop, Claude Code, or claude.ai
|
|
86
|
+
- **CLI** — `malimgraph extract`, `chunk`, `render`, `db`, `vector`
|
|
87
|
+
- **Claude Skills** — 5 installable `.skill` packages for claude.ai
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## Quick Start
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
pip install malimgraph
|
|
95
|
+
export ANTHROPIC_API_KEY=sk-ant-...
|
|
96
|
+
|
|
97
|
+
# Extract knowledge graph
|
|
98
|
+
malimgraph extract --input report.pdf --output ./output/ --format all
|
|
99
|
+
|
|
100
|
+
# Chunk for RAG
|
|
101
|
+
malimgraph chunk --input report.pdf --output ./chunks/
|
|
102
|
+
|
|
103
|
+
# Embed chunks into pgvector
|
|
104
|
+
export PGVECTOR_URI="postgresql://user:pass@localhost:5432/mydb"
|
|
105
|
+
export OPENAI_API_KEY=sk-...
|
|
106
|
+
malimgraph vector load --input ./chunks/chunks.json
|
|
107
|
+
|
|
108
|
+
# Render as browsable HTML
|
|
109
|
+
malimgraph render --input report.pdf --output document.html
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## How It Works
|
|
115
|
+
|
|
116
|
+
```
|
|
117
|
+
PDF
|
|
118
|
+
│
|
|
119
|
+
▼
|
|
120
|
+
pdf_reader.py ──────────────────────────────────────────────┐
|
|
121
|
+
│ (PyMuPDF: text, headings, tables, page structure) │
|
|
122
|
+
├──────────────────────────────────┐ │
|
|
123
|
+
▼ ▼ ▼
|
|
124
|
+
rule_extractor.py llm_extractor.py chunker.py
|
|
125
|
+
│ (regex: dates, amounts, │ (Anthropic API: │ (sliding window
|
|
126
|
+
│ emails, legal refs, │ semantic entities, │ with heading
|
|
127
|
+
│ section numbers) │ relationships, │ context)
|
|
128
|
+
│ │ source_text required) │
|
|
129
|
+
└──────────────┬───────────────┘ │
|
|
130
|
+
▼ ▼
|
|
131
|
+
graph_builder.py embedder.py
|
|
132
|
+
│ (merge + dedup: │ (OpenAI / Voyage /
|
|
133
|
+
│ hybrid method, │ local sentence-
|
|
134
|
+
│ citation accumulation, │ transformers)
|
|
135
|
+
│ stable IDs) │
|
|
136
|
+
▼ ▼
|
|
137
|
+
knowledge_graph.json vector_client.py
|
|
138
|
+
│ (pgvector: HNSW index,
|
|
139
|
+
┌─────┴──────┐ cosine similarity search)
|
|
140
|
+
▼ ▼
|
|
141
|
+
cypher.py age_sql.py
|
|
142
|
+
(.cypher) (.sql)
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
## Three Ways to Use
|
|
148
|
+
|
|
149
|
+
### MCP Server
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
# stdio (for Claude Desktop / Claude Code)
|
|
153
|
+
malimgraph serve
|
|
154
|
+
|
|
155
|
+
# HTTP (for remote connections / claude.ai)
|
|
156
|
+
malimgraph serve --transport http --port 8080
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
**Claude Desktop config** (`claude_desktop_config.json`):
|
|
160
|
+
```json
|
|
161
|
+
{
|
|
162
|
+
"mcpServers": {
|
|
163
|
+
"malimgraph": {
|
|
164
|
+
"command": "malimgraph",
|
|
165
|
+
"args": ["serve"],
|
|
166
|
+
"env": { "ANTHROPIC_API_KEY": "sk-ant-..." }
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
**Claude Code:**
|
|
173
|
+
```bash
|
|
174
|
+
claude mcp add malimgraph -- malimgraph serve
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### CLI
|
|
178
|
+
|
|
179
|
+
```bash
|
|
180
|
+
# Extract knowledge graph from PDF
|
|
181
|
+
malimgraph extract \
|
|
182
|
+
--input report.pdf \
|
|
183
|
+
--output ./output/ \
|
|
184
|
+
--entity-types auto \
|
|
185
|
+
--format all \
|
|
186
|
+
--graph-name my_graph
|
|
187
|
+
|
|
188
|
+
# Chunk for embeddings
|
|
189
|
+
malimgraph chunk \
|
|
190
|
+
--input report.pdf \
|
|
191
|
+
--output ./chunks/ \
|
|
192
|
+
--chunk-size 512 \
|
|
193
|
+
--overlap 64 \
|
|
194
|
+
--format json
|
|
195
|
+
|
|
196
|
+
# Embed chunks into PostgreSQL pgvector
|
|
197
|
+
malimgraph vector load \
|
|
198
|
+
--input ./chunks/chunks.json \
|
|
199
|
+
--uri "postgresql://user:pass@localhost:5432/mydb" \
|
|
200
|
+
--provider openai \
|
|
201
|
+
--table document_chunks
|
|
202
|
+
|
|
203
|
+
# Semantic search over embedded chunks
|
|
204
|
+
malimgraph vector search \
|
|
205
|
+
--query "What are the financial risks?" \
|
|
206
|
+
--uri "postgresql://user:pass@localhost:5432/mydb" \
|
|
207
|
+
--top-k 5
|
|
208
|
+
|
|
209
|
+
# Render as browsable HTML
|
|
210
|
+
malimgraph render \
|
|
211
|
+
--input report.pdf \
|
|
212
|
+
--output document.html \
|
|
213
|
+
--knowledge-graph ./output/knowledge_graph.json
|
|
214
|
+
|
|
215
|
+
# Load into Neo4j
|
|
216
|
+
malimgraph db load \
|
|
217
|
+
--input ./output/knowledge_graph.json \
|
|
218
|
+
--target neo4j \
|
|
219
|
+
--uri bolt://localhost:7687 \
|
|
220
|
+
--user neo4j \
|
|
221
|
+
--password secret
|
|
222
|
+
|
|
223
|
+
# Query the graph
|
|
224
|
+
malimgraph db query \
|
|
225
|
+
--target neo4j \
|
|
226
|
+
--uri bolt://localhost:7687 \
|
|
227
|
+
--query "MATCH (n:Organization) RETURN n.label, n.source_pages LIMIT 10"
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
### Claude Skills
|
|
231
|
+
|
|
232
|
+
Download the `.skill` files from [GitHub Releases](https://github.com/malim-ai-labs/malim-graph-plugin/releases) and install them in claude.ai → Settings → Skills.
|
|
233
|
+
|
|
234
|
+
| Skill | Trigger phrases |
|
|
235
|
+
|-------|----------------|
|
|
236
|
+
| `pdf-to-knowledge-graph` | "knowledge graph", "extract entities", "PDF to Cypher" |
|
|
237
|
+
| `pdf-to-chunks` | "chunk document", "split for embeddings", "RAG chunks" |
|
|
238
|
+
| `document-to-html` | "convert PDF to HTML", "render document", "make PDF browsable" |
|
|
239
|
+
| `graph-db-admin` | "load into Neo4j", "Cypher query", "graph statistics" |
|
|
240
|
+
| `chunks-to-pgvector` | "store in pgvector", "embed into PostgreSQL", "semantic search", "RAG with PostgreSQL" |
|
|
241
|
+
|
|
242
|
+
---
|
|
243
|
+
|
|
244
|
+
## Installation
|
|
245
|
+
|
|
246
|
+
```bash
|
|
247
|
+
# Core (knowledge graph + chunking + HTML)
|
|
248
|
+
pip install malimgraph
|
|
249
|
+
|
|
250
|
+
# With Neo4j support
|
|
251
|
+
pip install "malimgraph[neo4j]"
|
|
252
|
+
|
|
253
|
+
# With Apache AGE support
|
|
254
|
+
pip install "malimgraph[age]"
|
|
255
|
+
|
|
256
|
+
# With pgvector + OpenAI embeddings
|
|
257
|
+
pip install "malimgraph[pgvector,openai]"
|
|
258
|
+
|
|
259
|
+
# With pgvector + Voyage AI embeddings
|
|
260
|
+
pip install "malimgraph[pgvector,voyage]"
|
|
261
|
+
|
|
262
|
+
# With local embeddings (no API key needed)
|
|
263
|
+
pip install "malimgraph[pgvector,local]"
|
|
264
|
+
|
|
265
|
+
# Everything
|
|
266
|
+
pip install "malimgraph[all]"
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
### Environment Variables
|
|
270
|
+
|
|
271
|
+
```bash
|
|
272
|
+
ANTHROPIC_API_KEY=sk-ant-... # Required for LLM extraction
|
|
273
|
+
OPENAI_API_KEY=sk-... # Required for OpenAI embeddings
|
|
274
|
+
VOYAGE_API_KEY=pa-... # Required for Voyage AI embeddings
|
|
275
|
+
PGVECTOR_URI=postgresql://... # PostgreSQL connection for pgvector
|
|
276
|
+
NEO4J_URI=bolt://localhost:7687 # Neo4j connection
|
|
277
|
+
NEO4J_USER=neo4j
|
|
278
|
+
NEO4J_PASSWORD=yourpassword
|
|
279
|
+
AGE_CONNECTION_URI=host=... # Apache AGE connection
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
---
|
|
283
|
+
|
|
284
|
+
## Output Schema — `knowledge_graph.json`
|
|
285
|
+
|
|
286
|
+
Every entity and relationship carries full citation provenance:
|
|
287
|
+
|
|
288
|
+
| Field | Type | Description |
|
|
289
|
+
|-------|------|-------------|
|
|
290
|
+
| `id` | string | Stable hash ID: `e_` + MD5(type:label)[:8] |
|
|
291
|
+
| `label` | string | Canonical entity name |
|
|
292
|
+
| `type` | string | Entity type (Organization, Person, Date, …) |
|
|
293
|
+
| `extraction_method` | enum | `rule` / `llm` / `hybrid` |
|
|
294
|
+
| `confidence` | enum | `high` / `medium` / `low` |
|
|
295
|
+
| `source_pages` | int[] | PDF page numbers where found |
|
|
296
|
+
| `source_text` | string | Primary verbatim supporting quote |
|
|
297
|
+
| `source_chunk_id` | string | Processing chunk ID |
|
|
298
|
+
| `citations[]` | object[] | All supporting quotes with page refs |
|
|
299
|
+
| `citation_count` | int | Stored as property in graph DBs |
|
|
300
|
+
|
|
301
|
+
---
|
|
302
|
+
|
|
303
|
+
## pgvector — Semantic Search Schema
|
|
304
|
+
|
|
305
|
+
Chunks are stored with embeddings in PostgreSQL, enabling semantic search:
|
|
306
|
+
|
|
307
|
+
```sql
|
|
308
|
+
-- Find chunks most similar to a query
|
|
309
|
+
SELECT chunk_text, source_file, page_numbers, heading_context,
|
|
310
|
+
1 - (embedding <=> '[...]'::vector) AS score
|
|
311
|
+
FROM document_chunks
|
|
312
|
+
ORDER BY embedding <=> '[...]'::vector
|
|
313
|
+
LIMIT 10;
|
|
314
|
+
|
|
315
|
+
-- Filter by document
|
|
316
|
+
SELECT * FROM document_chunks
|
|
317
|
+
WHERE document_id = 'annual_report_2024'
|
|
318
|
+
ORDER BY embedding <=> '[...]'::vector LIMIT 5;
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
**Supported embedding providers:**
|
|
322
|
+
|
|
323
|
+
| Provider | Default model | Dimension | API key |
|
|
324
|
+
|----------|--------------|-----------|---------|
|
|
325
|
+
| `openai` | `text-embedding-3-small` | 1536-d | `OPENAI_API_KEY` |
|
|
326
|
+
| `voyage` | `voyage-3-large` | 1024-d | `VOYAGE_API_KEY` |
|
|
327
|
+
| `local` | `all-MiniLM-L6-v2` | 384-d | none (CPU) |
|
|
328
|
+
|
|
329
|
+
---
|
|
330
|
+
|
|
331
|
+
## Database Setup
|
|
332
|
+
|
|
333
|
+
### Neo4j
|
|
334
|
+
```bash
|
|
335
|
+
docker run -p 7474:7474 -p 7687:7687 \
|
|
336
|
+
-e NEO4J_AUTH=neo4j/yourpassword neo4j:latest
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
### Apache AGE (PostgreSQL)
|
|
340
|
+
```bash
|
|
341
|
+
docker run -p 5432:5432 -e POSTGRES_PASSWORD=secret apache/age:latest
|
|
342
|
+
```
|
|
343
|
+
|
|
344
|
+
### pgvector (PostgreSQL)
|
|
345
|
+
```bash
|
|
346
|
+
docker run -p 5432:5432 -e POSTGRES_PASSWORD=secret pgvector/pgvector:pg17
|
|
347
|
+
```
|
|
348
|
+
|
|
349
|
+
See [docs/database-setup.md](docs/database-setup.md) for full guides.
|
|
350
|
+
|
|
351
|
+
---
|
|
352
|
+
|
|
353
|
+
## Contributing
|
|
354
|
+
|
|
355
|
+
1. Fork the repo
|
|
356
|
+
2. Create a feature branch: `git checkout -b feature/my-feature`
|
|
357
|
+
3. Install dev deps: `pip install -e ".[dev]"`
|
|
358
|
+
4. Run tests: `make test`
|
|
359
|
+
5. Lint: `make lint`
|
|
360
|
+
6. Submit a PR
|
|
361
|
+
|
|
362
|
+
---
|
|
363
|
+
|
|
364
|
+
## Credits
|
|
365
|
+
|
|
366
|
+
Built by **[Malim AI Labs](https://ailabs.malim.my)** — AI-powered knowledge infrastructure for Southeast Asia.
|
|
367
|
+
|
|
368
|
+
Malim AI Labs Social Enterprise (003827047-U) · Kuala Lumpur, Malaysia
|
|
369
|
+
|
|
370
|
+
---
|
|
371
|
+
|
|
372
|
+
## License
|
|
373
|
+
|
|
374
|
+
MIT — see [LICENSE](LICENSE)
|