kgnode 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kgnode-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,234 @@
1
+ Metadata-Version: 2.3
2
+ Name: kgnode
3
+ Version: 0.1.0
4
+ Summary: Knowledge Graph Agnostic Node for Knowledge-Aware LLM Applications
5
+ Author: afmjoaa
6
+ Author-email: afmjoaa <mohimenul.joaa@gmail.com>
7
+ License: MIT
8
+ Requires-Dist: chromadb>=1.1.1
9
+ Requires-Dist: datasets>=4.2.0
10
+ Requires-Dist: dspy>=3.0.4
11
+ Requires-Dist: numpy>=2.3.3
12
+ Requires-Dist: openai>=2.6.1
13
+ Requires-Dist: pandas>=2.3.3
14
+ Requires-Dist: rdflib>=7.2.1
15
+ Requires-Dist: sentence-transformers>=5.1.1
16
+ Requires-Dist: sparqlwrapper>=2.0.0
17
+ Requires-Python: >=3.11
18
+ Description-Content-Type: text/markdown
19
+
20
+ # kgnode
21
+
22
+ Knowledge Graph Agnostic Node for Knowledge-Aware LLM Applications
23
+
24
+ ## Overview
25
+
26
+ kgnode is a Python library that extracts relevant subgraphs from large knowledge graphs using a path-aware Markov chain algorithm for question answering tasks.
27
+
28
+ **Implementation Summary:**
29
+ 1. Kgnode - work in progress
30
+ 2. Initial Dataset: DBLP-QuAD
31
+ 3. Knowledge graph embedding ❌
32
+ 4. Simple text embedding with basic template ✅
33
+ 5. Initial Vector DB: ChromaDB
34
+ 6. Framework: LangGraph (planned, not yet integrated)
35
+ 7. Seed node identification strategy:
36
+ - SPARQL text search (1-hop nodes)
37
+ - High-frequency node (degree) semantic search (2-3 hop nodes)
38
+ - Compile VectorDB with top 1 million nodes
39
+ 8. Node pruning algorithm: Path-aware Markov chain (relevant subgraph identification)
40
+ - P(v→w) ∝ base_weight(v,w) × f(history,v,w)
41
+ - Initially using P(v→w) ∝ softmax(cos(path_embedding, template_embedding))
42
+ - path_embedding == f(a, r, b, r, v, r, w)
43
+ - Query → template → template_embedding
44
+ - Stops when p drops below its value at the previous step or the walk reaches 10 hops
45
+ 9. Generate SPARQL for answering the query, using the subgraph as context
46
+ 10. Generate the answer to the query by executing the SPARQL and using the subgraph
47
+
48
+ ## Installation
49
+
50
+ ```bash
51
+ pip install kgnode
52
+ ```
53
+
54
+ ## Quick Start
55
+
56
+ ```python
57
+ from kgnode import KGConfig, get_seed_nodes, get_subgraphs, generate_answer
58
+
59
+ # Configure for your knowledge graph
60
+ config = KGConfig(
61
+ sparql_endpoint="http://localhost:7878/query",
62
+ embedding_model="all-MiniLM-L6-v2"
63
+ )
64
+
65
+ # Find seed nodes for a query
66
+ seed_nodes = get_seed_nodes(query="What papers did John Smith publish?", config=config)
67
+
68
+ # Extract relevant subgraph
69
+ subgraphs = get_subgraphs(seed_node=seed_nodes[0], query="...", config=config)
70
+
71
+ # Generate answer
72
+ answer = generate_answer(query="...", config=config)
73
+ ```
74
+
75
+ ## Folder Structure
76
+
77
+ ```
78
+ kgnode/
79
+ ├── src/kgnode/
80
+ │ ├── __init__.py # Public API exports
81
+ │ ├── seed_finder.py # Seed node identification
82
+ │ ├── subgraph_extraction.py # Path-aware Markov chain algorithm
83
+ │ ├── generator.py # SPARQL generation and answer generation
84
+ │ ├── validator.py # Subgraph validation
85
+ │ ├── keyword_search.py # Keyword-based entity search
86
+ │ ├── chroma_db.py # Vector database operations
87
+ │ └── core/
88
+ │ ├── kg_config.py # Configuration class
89
+ │ ├── sparql_query.py # SPARQL endpoint communication
90
+ │ ├── schema_extractor.py # Schema extraction from ontology/SPARQL
91
+ │ ├── schema_chromadb.py # Schema ChromaDB collections
92
+ │ └── schema_selector.py # Query-aware schema selection
93
+ ├── tests/ # Unit tests
94
+ ├── docs/ # Documentation
95
+ └── _data/ # Data files (not in repo)
96
+ ```
97
+
98
+ ## Running Oxigraph SPARQL Server
99
+
100
+ kgnode requires a SPARQL endpoint. We recommend Oxigraph:
101
+
102
+ ```bash
103
+ # Start server (read-write)
104
+ oxigraph_server serve -l ./oxigraph_db --cors
105
+
106
+ # Start server (read-only)
107
+ oxigraph_server serve-read-only -l ./oxigraph_db --cors
108
+
109
+ # Load dataset (one-time setup)
110
+ oxigraph_server load -l ./oxigraph_db -f _data/dblp.nt
111
+
112
+ # Custom bind address
113
+ oxigraph_server serve -l ~/oxigraph_db --bind 127.0.0.1:7878
114
+ ```
115
+
116
+ **Default endpoint:** `http://localhost:7878/query`
117
+
118
+ ## Public API
119
+
120
+ ### Main Pipeline
121
+
122
+ ```python
123
+ from kgnode import (
124
+ citable, # Check seed node quality
125
+ get_seed_nodes, # Find seed nodes (keyword + semantic search)
126
+ get_subgraphs, # Extract subgraph using path-aware Markov chain
127
+ generate_sparql, # Generate SPARQL from subgraph
128
+ kg_retrieve, # Full pipeline: query → subgraph → SPARQL → results
129
+ generate_answer, # End-to-end answer generation
130
+ generate_answer_using_subgraph, # Answer generation from subgraph
131
+ )
132
+ ```
133
+
134
+ ### VectorDB Operations
135
+
136
+ ```python
137
+ from kgnode import (
138
+ compile_chromadb, # Build vector DB from knowledge graph
139
+ compile_chromadb_from_csv, # Build from existing CSV
140
+ semantic_search_entities, # Semantic search for entities
141
+ get_or_create_chromadb, # Load (or create) the ChromaDB collection
142
+ add_or_update_entities, # Add/update entity embeddings
143
+ delete_entities, # Remove entities from vector DB
144
+ )
145
+ ```
146
+
147
+ ### Search Operations
148
+
149
+ ```python
150
+ from kgnode import search_entities_by_keywords # SPARQL keyword search
151
+ ```
152
+
153
+ ### Validation
154
+
155
+ ```python
156
+ from kgnode import validate_subgraph # Validate extracted subgraph
157
+ ```
158
+
159
+ ### Core Configuration
160
+
161
+ ```python
162
+ from kgnode import KGConfig, execute_sparql_query
163
+
164
+ # Create configuration
165
+ config = KGConfig(
166
+ sparql_endpoint="http://localhost:7878/query",
167
+ embedding_model="all-MiniLM-L6-v2",
168
+ openai_model="gpt-4o-mini"
169
+ )
170
+
171
+ # Execute SPARQL queries
172
+ results = execute_sparql_query(query="SELECT * WHERE { ?s ?p ?o } LIMIT 10", config=config)
173
+ ```
174
+
175
+ ## TODOs
176
+
177
+ ### LangGraph Integration
178
+ - [ ] Orchestrate workflow with LangGraph
179
+ - [ ] Add visualization support
180
+
181
+ ## Documentation
182
+
183
+ For detailed usage, API reference, and examples, see [docs/USAGE.md](docs/USAGE.md) or visit the [online documentation](https://afmjoaa.github.io/kgnode/).
184
+
185
+ ## Dataset
186
+
187
+ **DBLP-QuAD** - Academic publications knowledge graph
188
+ - **Source:** https://dblp.org/rdf/
189
+ - **Download:** https://zenodo.org/records/7638511
190
+ - **Paper:** [DBLP-QuAD (ECIR 2023)](https://www.inf.uni-hamburg.de/en/inst/ab/lt/publications/2023-banerjee-bir-ecir-2023-dblpquad.pdf)
191
+ - **Stats:** 252M triples, 92M entities, 62 relations
192
+
193
+ ## Supported Technologies
194
+
195
+ ### Vector Databases
196
+ - **ChromaDB** ✅ (implemented)
197
+ - Pinecone (planned)
198
+ - Qdrant (planned)
199
+
200
+ ### Embedding Models
201
+ - **all-MiniLM-L6-v2** ✅ (default, 384 dimensions)
202
+ - google/embeddinggemma-300m (alternative)
203
+
204
+ ## License
205
+
206
+ MIT
207
+
208
+ ## Testing
209
+
210
+ ### Run All Tests
211
+ ```bash
212
+ python tests/test_runner.py
213
+ ```
214
+
215
+ ### Run Specific Tests
216
+ ```bash
217
+ # Run single test file
218
+ python tests/test_runner.py chromadb
219
+
220
+ # Run multiple test files
221
+ python tests/test_runner.py chromadb seed_finder subgraph_extraction
222
+
223
+ # List available tests
224
+ python tests/test_runner.py --list
225
+
226
+ # Run standalone test file
227
+ python tests/test_chromadb.py
228
+ ```
229
+
230
+ ### Prerequisites
231
+ - Oxigraph SPARQL server running at `http://localhost:7878/query`
232
+ - `OPENAI_API_KEY` environment variable set
233
+ - ChromaDB created (happens automatically on first run)
234
+
kgnode-0.1.0/README.md ADDED
@@ -0,0 +1,215 @@
1
+ # kgnode
2
+
3
+ Knowledge Graph Agnostic Node for Knowledge-Aware LLM Applications
4
+
5
+ ## Overview
6
+
7
+ kgnode is a Python library that extracts relevant subgraphs from large knowledge graphs using a path-aware Markov chain algorithm for question answering tasks.
8
+
9
+ **Implementation Summary:**
10
+ 1. Kgnode - work in progress
11
+ 2. Initial Dataset: DBLP-QuAD
12
+ 3. Knowledge graph embedding ❌
13
+ 4. Simple text embedding with basic template ✅
14
+ 5. Initial Vector DB: ChromaDB
15
+ 6. Framework: LangGraph (planned, not yet integrated)
16
+ 7. Seed node identification strategy:
17
+ - SPARQL text search (1-hop nodes)
18
+ - High-frequency node (degree) semantic search (2-3 hop nodes)
19
+ - Compile VectorDB with top 1 million nodes
20
+ 8. Node pruning algorithm: Path-aware Markov chain (relevant subgraph identification)
21
+ - P(v→w) ∝ base_weight(v,w) × f(history,v,w)
22
+ - Initially using P(v→w) ∝ softmax(cos(path_embedding, template_embedding))
23
+ - path_embedding == f(a, r, b, r, v, r, w)
24
+ - Query → template → template_embedding
25
+ - Stops when p drops below its value at the previous step or the walk reaches 10 hops
26
+ 9. Generate SPARQL for answering the query, using the subgraph as context
27
+ 10. Generate the answer to the query by executing the SPARQL and using the subgraph
28
+
29
+ ## Installation
30
+
31
+ ```bash
32
+ pip install kgnode
33
+ ```
34
+
35
+ ## Quick Start
36
+
37
+ ```python
38
+ from kgnode import KGConfig, get_seed_nodes, get_subgraphs, generate_answer
39
+
40
+ # Configure for your knowledge graph
41
+ config = KGConfig(
42
+ sparql_endpoint="http://localhost:7878/query",
43
+ embedding_model="all-MiniLM-L6-v2"
44
+ )
45
+
46
+ # Find seed nodes for a query
47
+ seed_nodes = get_seed_nodes(query="What papers did John Smith publish?", config=config)
48
+
49
+ # Extract relevant subgraph
50
+ subgraphs = get_subgraphs(seed_node=seed_nodes[0], query="...", config=config)
51
+
52
+ # Generate answer
53
+ answer = generate_answer(query="...", config=config)
54
+ ```
55
+
56
+ ## Folder Structure
57
+
58
+ ```
59
+ kgnode/
60
+ ├── src/kgnode/
61
+ │ ├── __init__.py # Public API exports
62
+ │ ├── seed_finder.py # Seed node identification
63
+ │ ├── subgraph_extraction.py # Path-aware Markov chain algorithm
64
+ │ ├── generator.py # SPARQL generation and answer generation
65
+ │ ├── validator.py # Subgraph validation
66
+ │ ├── keyword_search.py # Keyword-based entity search
67
+ │ ├── chroma_db.py # Vector database operations
68
+ │ └── core/
69
+ │ ├── kg_config.py # Configuration class
70
+ │ ├── sparql_query.py # SPARQL endpoint communication
71
+ │ ├── schema_extractor.py # Schema extraction from ontology/SPARQL
72
+ │ ├── schema_chromadb.py # Schema ChromaDB collections
73
+ │ └── schema_selector.py # Query-aware schema selection
74
+ ├── tests/ # Unit tests
75
+ ├── docs/ # Documentation
76
+ └── _data/ # Data files (not in repo)
77
+ ```
78
+
79
+ ## Running Oxigraph SPARQL Server
80
+
81
+ kgnode requires a SPARQL endpoint. We recommend Oxigraph:
82
+
83
+ ```bash
84
+ # Start server (read-write)
85
+ oxigraph_server serve -l ./oxigraph_db --cors
86
+
87
+ # Start server (read-only)
88
+ oxigraph_server serve-read-only -l ./oxigraph_db --cors
89
+
90
+ # Load dataset (one-time setup)
91
+ oxigraph_server load -l ./oxigraph_db -f _data/dblp.nt
92
+
93
+ # Custom bind address
94
+ oxigraph_server serve -l ~/oxigraph_db --bind 127.0.0.1:7878
95
+ ```
96
+
97
+ **Default endpoint:** `http://localhost:7878/query`
98
+
99
+ ## Public API
100
+
101
+ ### Main Pipeline
102
+
103
+ ```python
104
+ from kgnode import (
105
+ citable, # Check seed node quality
106
+ get_seed_nodes, # Find seed nodes (keyword + semantic search)
107
+ get_subgraphs, # Extract subgraph using path-aware Markov chain
108
+ generate_sparql, # Generate SPARQL from subgraph
109
+ kg_retrieve, # Full pipeline: query → subgraph → SPARQL → results
110
+ generate_answer, # End-to-end answer generation
111
+ generate_answer_using_subgraph, # Answer generation from subgraph
112
+ )
113
+ ```
114
+
115
+ ### VectorDB Operations
116
+
117
+ ```python
118
+ from kgnode import (
119
+ compile_chromadb, # Build vector DB from knowledge graph
120
+ compile_chromadb_from_csv, # Build from existing CSV
121
+ semantic_search_entities, # Semantic search for entities
122
+ get_or_create_chromadb, # Load (or create) the ChromaDB collection
123
+ add_or_update_entities, # Add/update entity embeddings
124
+ delete_entities, # Remove entities from vector DB
125
+ )
126
+ ```
127
+
128
+ ### Search Operations
129
+
130
+ ```python
131
+ from kgnode import search_entities_by_keywords # SPARQL keyword search
132
+ ```
133
+
134
+ ### Validation
135
+
136
+ ```python
137
+ from kgnode import validate_subgraph # Validate extracted subgraph
138
+ ```
139
+
140
+ ### Core Configuration
141
+
142
+ ```python
143
+ from kgnode import KGConfig, execute_sparql_query
144
+
145
+ # Create configuration
146
+ config = KGConfig(
147
+ sparql_endpoint="http://localhost:7878/query",
148
+ embedding_model="all-MiniLM-L6-v2",
149
+ openai_model="gpt-4o-mini"
150
+ )
151
+
152
+ # Execute SPARQL queries
153
+ results = execute_sparql_query(query="SELECT * WHERE { ?s ?p ?o } LIMIT 10", config=config)
154
+ ```
155
+
156
+ ## TODOs
157
+
158
+ ### LangGraph Integration
159
+ - [ ] Orchestrate workflow with LangGraph
160
+ - [ ] Add visualization support
161
+
162
+ ## Documentation
163
+
164
+ For detailed usage, API reference, and examples, see [docs/USAGE.md](docs/USAGE.md) or visit the [online documentation](https://afmjoaa.github.io/kgnode/).
165
+
166
+ ## Dataset
167
+
168
+ **DBLP-QuAD** - Academic publications knowledge graph
169
+ - **Source:** https://dblp.org/rdf/
170
+ - **Download:** https://zenodo.org/records/7638511
171
+ - **Paper:** [DBLP-QuAD (ECIR 2023)](https://www.inf.uni-hamburg.de/en/inst/ab/lt/publications/2023-banerjee-bir-ecir-2023-dblpquad.pdf)
172
+ - **Stats:** 252M triples, 92M entities, 62 relations
173
+
174
+ ## Supported Technologies
175
+
176
+ ### Vector Databases
177
+ - **ChromaDB** ✅ (implemented)
178
+ - Pinecone (planned)
179
+ - Qdrant (planned)
180
+
181
+ ### Embedding Models
182
+ - **all-MiniLM-L6-v2** ✅ (default, 384 dimensions)
183
+ - google/embeddinggemma-300m (alternative)
184
+
185
+ ## License
186
+
187
+ MIT
188
+
189
+ ## Testing
190
+
191
+ ### Run All Tests
192
+ ```bash
193
+ python tests/test_runner.py
194
+ ```
195
+
196
+ ### Run Specific Tests
197
+ ```bash
198
+ # Run single test file
199
+ python tests/test_runner.py chromadb
200
+
201
+ # Run multiple test files
202
+ python tests/test_runner.py chromadb seed_finder subgraph_extraction
203
+
204
+ # List available tests
205
+ python tests/test_runner.py --list
206
+
207
+ # Run standalone test file
208
+ python tests/test_chromadb.py
209
+ ```
210
+
211
+ ### Prerequisites
212
+ - Oxigraph SPARQL server running at `http://localhost:7878/query`
213
+ - `OPENAI_API_KEY` environment variable set
214
+ - ChromaDB created (happens automatically on first run)
215
+
@@ -0,0 +1,79 @@
1
+ [project]
2
+ name = "kgnode"
3
+ version = "0.1.0"
4
+ description = "Knowledge Graph Agnostic Node for Knowledge-Aware LLM Applications"
5
+ readme = "README.md"
6
+ license = { text = "MIT" }
7
+ authors = [
8
+ { name = "afmjoaa", email = "mohimenul.joaa@gmail.com" }
9
+ ]
10
+ requires-python = ">=3.11"
11
+ dependencies = [
12
+ "chromadb>=1.1.1",
13
+ "datasets>=4.2.0",
14
+ "dspy>=3.0.4",
15
+ "numpy>=2.3.3",
16
+ "openai>=2.6.1",
17
+ "pandas>=2.3.3",
18
+ "rdflib>=7.2.1",
19
+ "sentence-transformers>=5.1.1",
20
+ "sparqlwrapper>=2.0.0",
21
+ ]
22
+ [dependency-groups]
23
+ dev = [
24
+ "ruff<1.0.0,>=0.4.10",
25
+ "mypy<2.0.0,>=1.10.1",
26
+ "pytest>=8.2.2,<9.0.0",
27
+ "pytest-mock>=3.15.1",
28
+ ]
29
+
30
+ [tool.ruff]
31
+ lint.select = [
32
+ "E", # pycodestyle
33
+ "F", # pyflakes
34
+ "I", # isort
35
+ "D", # pydocstyle
36
+ "D401", # First line should be in imperative mood
37
+ ]
38
+
39
+ [tool.ruff.lint.per-file-ignores]
40
+ "tests/*" = ["D"]
41
+
42
+ [tool.ruff.lint.pydocstyle]
43
+ convention = "google"
44
+
45
+ [tool.ruff.format]
46
+ docstring-code-format = true
47
+ docstring-code-line-length = 80
48
+
49
+ [tool.uv.sources]
50
+ kgnode = { workspace = true }
51
+
52
+ #[tool.pytest-watcher]
53
+ #now = true
54
+ #delay = 3
55
+ #patterns = ["*.py"]
56
+
57
+ # Only src/ is included by default; uncomment the section below to change the root module location.
58
+ #[tool.uv.build-backend]
59
+ #module-name = "kgnode"
60
+ #module-root = ""
61
+
62
+ [tool.uv.build-backend]
63
+ source-exclude = [
64
+ "tests/",
65
+ "docs/",
66
+ "paper/",
67
+ ".github/",
68
+ "_data/",
69
+ "_temp/",
70
+ ".python-version",
71
+ ".venv*/**",
72
+ ".editorconfig",
73
+ ".langgraph_api",
74
+ "*.ipynb",
75
+ ]
76
+
77
+ [build-system]
78
+ requires = ["uv_build>=0.8.16,<0.9.0"]
79
+ build-backend = "uv_build"
@@ -0,0 +1,60 @@
1
+ """
2
+ kgnode - Knowledge Graph Agnostic Node for Knowledge-Aware LLM Applications.
3
+
4
+ Public API for knowledge graph retrieval and answer generation.
5
+ """
6
+
7
+ # Main Pipeline APIs
8
+ from kgnode.seed_finder import citable, get_seed_nodes
9
+ from kgnode.subgraph_extraction import get_subgraphs
10
+ from kgnode.generator import (
11
+ generate_sparql,
12
+ kg_retrieve,
13
+ generate_answer,
14
+ generate_answer_using_subgraph,
15
+ )
16
+
17
+ # Validation
18
+ from kgnode.validator import validate_subgraph
19
+
20
+ # Search Operations
21
+ from kgnode.keyword_search import search_entities_by_keywords
22
+
23
+ # VectorDB Operations
24
+ from kgnode.chroma_db import (
25
+ compile_chromadb,
26
+ compile_chromadb_from_csv,
27
+ semantic_search_entities,
28
+ get_or_create_chromadb,
29
+ add_or_update_entities,
30
+ delete_entities,
31
+ )
32
+
33
+ # Core Configuration
34
+ from kgnode.core.kg_config import KGConfig
35
+ from kgnode.core.sparql_query import execute_sparql_query
36
+
37
+ __all__ = [
38
+ # Main Pipeline APIs
39
+ "citable",
40
+ "get_seed_nodes",
41
+ "get_subgraphs",
42
+ "generate_sparql",
43
+ "kg_retrieve",
44
+ "generate_answer",
45
+ "generate_answer_using_subgraph",
46
+ # Validation
47
+ "validate_subgraph",
48
+ # Search Operations
49
+ "search_entities_by_keywords",
50
+ # VectorDB Operations
51
+ "compile_chromadb",
52
+ "compile_chromadb_from_csv",
53
+ "semantic_search_entities",
54
+ "get_or_create_chromadb",
55
+ "add_or_update_entities",
56
+ "delete_entities",
57
+ # Core Configuration
58
+ "KGConfig",
59
+ "execute_sparql_query",
60
+ ]