PyPI - kgnode - Versions diffs - 0.1.0__tar.gz - Mend

kgnode 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

kgnode-0.1.0/PKG-INFO +234 -0
kgnode-0.1.0/README.md +215 -0
kgnode-0.1.0/pyproject.toml +79 -0
kgnode-0.1.0/src/kgnode/__init__.py +60 -0
kgnode-0.1.0/src/kgnode/_entity_descriptor.py +474 -0
kgnode-0.1.0/src/kgnode/_node_ranker.py +138 -0
kgnode-0.1.0/src/kgnode/chroma_db.py +782 -0
kgnode-0.1.0/src/kgnode/core/__init__.py +3 -0
kgnode-0.1.0/src/kgnode/core/kg_config.py +496 -0
kgnode-0.1.0/src/kgnode/core/schema_chromadb.py +215 -0
kgnode-0.1.0/src/kgnode/core/schema_extractor.py +226 -0
kgnode-0.1.0/src/kgnode/core/schema_selector.py +127 -0
kgnode-0.1.0/src/kgnode/core/sparql_query.py +77 -0
kgnode-0.1.0/src/kgnode/generator.py +814 -0
kgnode-0.1.0/src/kgnode/keyword_search.py +55 -0
kgnode-0.1.0/src/kgnode/py.typed +0 -0
kgnode-0.1.0/src/kgnode/seed_finder.py +462 -0
kgnode-0.1.0/src/kgnode/subgraph_extraction.py +747 -0
kgnode-0.1.0/src/kgnode/validator.py +135 -0

kgnode-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,234 @@
+Metadata-Version: 2.3
+Name: kgnode
+Version: 0.1.0
+Summary: Knowledge Graph Agnostic Node for Knowledge-Aware LLM Applications
+Author: afmjoaa
+Author-email: afmjoaa <mohimenul.joaa@gmail.com>
+License: MIT
+Requires-Dist: chromadb>=1.1.1
+Requires-Dist: datasets>=4.2.0
+Requires-Dist: dspy>=3.0.4
+Requires-Dist: numpy>=2.3.3
+Requires-Dist: openai>=2.6.1
+Requires-Dist: pandas>=2.3.3
+Requires-Dist: rdflib>=7.2.1
+Requires-Dist: sentence-transformers>=5.1.1
+Requires-Dist: sparqlwrapper>=2.0.0
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+# kgnode
+Knowledge Graph Agnostic Node for Knowledge-Aware LLM Applications
+## Overview
+kgnode is a Python library that extracts relevant subgraphs from large knowledge graphs using a path-aware Markov chain algorithm for question answering tasks.
+**Implementation Summary:**
+1. Kgnode - work in progress
+2. Initial Dataset: DBLP-QuAD
+3. Knowledge graph embedding ❌
+4. Simple text embedding with basic template ✅
+5. Initial Vector DB: ChromaDB
+6. Framework: LangGraph (planned; see TODOs)
+7. Seed node identification strategy:
+   - SPARQL text search (1-hop nodes)
+   - High-frequency node (degree) semantic search (2-3 hop nodes)
+   - Compile VectorDB with top 1 million nodes
+8. Node pruning algorithm: Path-aware Markov chain (relevant subgraph identification)
+   - P(v→w) ∝ base_weight(v,w) × f(history,v,w)
+   - Initially using P(v→w) ∝ softmax(cos(path_embedding, template_embedding))
+   - path_embedding == f(a, r, b, r, v, r, w)
+   - Query → template → template_embedding
+   - Stops when the transition probability P(v→w) drops below that of the previous step, or after 10 hops
+9. Generate SPARQL for answering the query, using the subgraph as context
+10. Generate the answer to the query by executing the SPARQL and using the subgraph
+## Installation
+```bash
+pip install kgnode
+```
+## Quick Start
+```python
+from kgnode import KGConfig, get_seed_nodes, get_subgraphs, generate_answer
+# Configure for your knowledge graph
+config = KGConfig(
+    sparql_endpoint="http://localhost:7878/query",
+    embedding_model="all-MiniLM-L6-v2"
+)
+# Find seed nodes for a query
+seed_nodes = get_seed_nodes(query="What papers did John Smith publish?", config=config)
+# Extract relevant subgraph
+subgraphs = get_subgraphs(seed_node=seed_nodes[0], query="...", config=config)
+# Generate answer
+answer = generate_answer(query="...", config=config)
+```
+## Folder Structure
+```
+kgnode/
+├── src/kgnode/
+│   ├── __init__.py              # Public API exports
+│   ├── seed_finder.py           # Seed node identification
+│   ├── subgraph_extraction.py   # Path-aware Markov chain algorithm
+│   ├── generator.py             # SPARQL generation and answer generation
+│   ├── validator.py             # Subgraph validation
+│   ├── keyword_search.py        # Keyword-based entity search
+│   ├── chroma_db.py            # Vector database operations
+│   └── core/
+│       ├── kg_config.py        # Configuration class
+│       ├── sparql_query.py     # SPARQL endpoint communication
+│       ├── schema_extractor.py # Schema extraction from ontology/SPARQL
+│       ├── schema_chromadb.py  # Schema ChromaDB collections
+│       └── schema_selector.py  # Query-aware schema selection
+├── tests/                       # Unit tests
+├── docs/                        # Documentation
+└── _data/                       # Data files (not in repo)
+```
+## Running Oxigraph SPARQL Server
+kgnode requires a SPARQL endpoint. We recommend Oxigraph:
+```bash
+# Start server (read-write)
+oxigraph_server serve -l ./oxigraph_db --cors
+# Start server (read-only)
+oxigraph_server serve-read-only -l ./oxigraph_db --cors
+# Load dataset (one-time setup)
+oxigraph_server load -l ./oxigraph_db -f _data/dblp.nt
+# Custom bind address
+oxigraph_server serve -l ~/oxigraph_db --bind 127.0.0.1:7878
+```
+**Default endpoint:** `http://localhost:7878/query`
+## Public API
+### Main Pipeline
+```python
+from kgnode import (
+    citable,                    # Check seed node quality
+    get_seed_nodes,             # Find seed nodes (keyword + semantic search)
+    get_subgraphs,              # Extract subgraph using path-aware Markov chain
+    generate_sparql,            # Generate SPARQL from subgraph
+    kg_retrieve,                # Full pipeline: query → subgraph → SPARQL → results
+    generate_answer,            # End-to-end answer generation
+    generate_answer_using_subgraph,  # Answer generation from subgraph
+)
+```
+### VectorDB Operations
+```python
+from kgnode import (
+    compile_chromadb,           # Build vector DB from knowledge graph
+    compile_chromadb_from_csv,  # Build from existing CSV
+    semantic_search_entities,   # Semantic search for entities
+    get_or_create_chromadb,     # Get (or create) the ChromaDB collection
+    add_or_update_entities,     # Add/update entity embeddings
+    delete_entities,            # Remove entities from vector DB
+)
+```
+### Search Operations
+```python
+from kgnode import search_entities_by_keywords  # SPARQL keyword search
+```
+### Validation
+```python
+from kgnode import validate_subgraph  # Validate extracted subgraph
+```
+### Core Configuration
+```python
+from kgnode import KGConfig, execute_sparql_query
+# Create configuration
+config = KGConfig(
+    sparql_endpoint="http://localhost:7878/query",
+    embedding_model="all-MiniLM-L6-v2",
+    openai_model="gpt-4o-mini"
+)
+# Execute SPARQL queries
+results = execute_sparql_query(query="SELECT * WHERE { ?s ?p ?o } LIMIT 10", config=config)
+```
+## TODOs
+### LangGraph Integration
+- [ ] Orchestrate workflow with LangGraph
+- [ ] Add visualization support
+## Documentation
+For detailed usage, API reference, and examples, see [docs/USAGE.md](docs/USAGE.md) or visit the [online documentation](https://afmjoaa.github.io/kgnode/).
+## Dataset
+**DBLP-QuAD** - Academic publications knowledge graph
+- **Source:** https://dblp.org/rdf/
+- **Download:** https://zenodo.org/records/7638511
+- **Paper:** [DBLP-QuAD (ECIR 2023)](https://www.inf.uni-hamburg.de/en/inst/ab/lt/publications/2023-banerjee-bir-ecir-2023-dblpquad.pdf)
+- **Stats:** 252M triples, 92M entities, 62 relations
+## Supported Technologies
+### Vector Databases
+- **ChromaDB** ✅ (implemented)
+- Pinecone (planned)
+- Qdrant (planned)
+### Embedding Models
+- **all-MiniLM-L6-v2** ✅ (default, 384 dimensions)
+- google/embeddinggemma-300m (alternative)
+## License
+MIT
+## Testing
+### Run All Tests
+```bash
+python tests/test_runner.py
+```
+### Run Specific Tests
+```bash
+# Run single test file
+python tests/test_runner.py chromadb
+# Run multiple test files
+python tests/test_runner.py chromadb seed_finder subgraph_extraction
+# List available tests
+python tests/test_runner.py --list
+# Run standalone test file
+python tests/test_chromadb.py
+```
+### Prerequisites
+- Oxigraph SPARQL server running at `http://localhost:7878/query`
+- `OPENAI_API_KEY` environment variable set
+- ChromaDB created (happens automatically on first run)

kgnode-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,215 @@
+# kgnode
+Knowledge Graph Agnostic Node for Knowledge-Aware LLM Applications
+## Overview
+kgnode is a Python library that extracts relevant subgraphs from large knowledge graphs using a path-aware Markov chain algorithm for question answering tasks.
+**Implementation Summary:**
+1. Kgnode - work in progress
+2. Initial Dataset: DBLP-QuAD
+3. Knowledge graph embedding ❌
+4. Simple text embedding with basic template ✅
+5. Initial Vector DB: ChromaDB
+6. Framework: LangGraph (planned; see TODOs)
+7. Seed node identification strategy:
+   - SPARQL text search (1-hop nodes)
+   - High-frequency node (degree) semantic search (2-3 hop nodes)
+   - Compile VectorDB with top 1 million nodes
+8. Node pruning algorithm: Path-aware Markov chain (relevant subgraph identification)
+   - P(v→w) ∝ base_weight(v,w) × f(history,v,w)
+   - Initially using P(v→w) ∝ softmax(cos(path_embedding, template_embedding))
+   - path_embedding == f(a, r, b, r, v, r, w)
+   - Query → template → template_embedding
+   - Stops when the transition probability P(v→w) drops below that of the previous step, or after 10 hops
+9. Generate SPARQL for answering the query, using the subgraph as context
+10. Generate the answer to the query by executing the SPARQL and using the subgraph
+## Installation
+```bash
+pip install kgnode
+```
+## Quick Start
+```python
+from kgnode import KGConfig, get_seed_nodes, get_subgraphs, generate_answer
+# Configure for your knowledge graph
+config = KGConfig(
+    sparql_endpoint="http://localhost:7878/query",
+    embedding_model="all-MiniLM-L6-v2"
+)
+# Find seed nodes for a query
+seed_nodes = get_seed_nodes(query="What papers did John Smith publish?", config=config)
+# Extract relevant subgraph
+subgraphs = get_subgraphs(seed_node=seed_nodes[0], query="...", config=config)
+# Generate answer
+answer = generate_answer(query="...", config=config)
+```
+## Folder Structure
+```
+kgnode/
+├── src/kgnode/
+│   ├── __init__.py              # Public API exports
+│   ├── seed_finder.py           # Seed node identification
+│   ├── subgraph_extraction.py   # Path-aware Markov chain algorithm
+│   ├── generator.py             # SPARQL generation and answer generation
+│   ├── validator.py             # Subgraph validation
+│   ├── keyword_search.py        # Keyword-based entity search
+│   ├── chroma_db.py            # Vector database operations
+│   └── core/
+│       ├── kg_config.py        # Configuration class
+│       ├── sparql_query.py     # SPARQL endpoint communication
+│       ├── schema_extractor.py # Schema extraction from ontology/SPARQL
+│       ├── schema_chromadb.py  # Schema ChromaDB collections
+│       └── schema_selector.py  # Query-aware schema selection
+├── tests/                       # Unit tests
+├── docs/                        # Documentation
+└── _data/                       # Data files (not in repo)
+```
+## Running Oxigraph SPARQL Server
+kgnode requires a SPARQL endpoint. We recommend Oxigraph:
+```bash
+# Start server (read-write)
+oxigraph_server serve -l ./oxigraph_db --cors
+# Start server (read-only)
+oxigraph_server serve-read-only -l ./oxigraph_db --cors
+# Load dataset (one-time setup)
+oxigraph_server load -l ./oxigraph_db -f _data/dblp.nt
+# Custom bind address
+oxigraph_server serve -l ~/oxigraph_db --bind 127.0.0.1:7878
+```
+**Default endpoint:** `http://localhost:7878/query`
+## Public API
+### Main Pipeline
+```python
+from kgnode import (
+    citable,                    # Check seed node quality
+    get_seed_nodes,             # Find seed nodes (keyword + semantic search)
+    get_subgraphs,              # Extract subgraph using path-aware Markov chain
+    generate_sparql,            # Generate SPARQL from subgraph
+    kg_retrieve,                # Full pipeline: query → subgraph → SPARQL → results
+    generate_answer,            # End-to-end answer generation
+    generate_answer_using_subgraph,  # Answer generation from subgraph
+)
+```
+### VectorDB Operations
+```python
+from kgnode import (
+    compile_chromadb,           # Build vector DB from knowledge graph
+    compile_chromadb_from_csv,  # Build from existing CSV
+    semantic_search_entities,   # Semantic search for entities
+    get_or_create_chromadb,     # Get (or create) the ChromaDB collection
+    add_or_update_entities,     # Add/update entity embeddings
+    delete_entities,            # Remove entities from vector DB
+)
+```
+### Search Operations
+```python
+from kgnode import search_entities_by_keywords  # SPARQL keyword search
+```
+### Validation
+```python
+from kgnode import validate_subgraph  # Validate extracted subgraph
+```
+### Core Configuration
+```python
+from kgnode import KGConfig, execute_sparql_query
+# Create configuration
+config = KGConfig(
+    sparql_endpoint="http://localhost:7878/query",
+    embedding_model="all-MiniLM-L6-v2",
+    openai_model="gpt-4o-mini"
+)
+# Execute SPARQL queries
+results = execute_sparql_query(query="SELECT * WHERE { ?s ?p ?o } LIMIT 10", config=config)
+```
+## TODOs
+### LangGraph Integration
+- [ ] Orchestrate workflow with LangGraph
+- [ ] Add visualization support
+## Documentation
+For detailed usage, API reference, and examples, see [docs/USAGE.md](docs/USAGE.md) or visit the [online documentation](https://afmjoaa.github.io/kgnode/).
+## Dataset
+**DBLP-QuAD** - Academic publications knowledge graph
+- **Source:** https://dblp.org/rdf/
+- **Download:** https://zenodo.org/records/7638511
+- **Paper:** [DBLP-QuAD (ECIR 2023)](https://www.inf.uni-hamburg.de/en/inst/ab/lt/publications/2023-banerjee-bir-ecir-2023-dblpquad.pdf)
+- **Stats:** 252M triples, 92M entities, 62 relations
+## Supported Technologies
+### Vector Databases
+- **ChromaDB** ✅ (implemented)
+- Pinecone (planned)
+- Qdrant (planned)
+### Embedding Models
+- **all-MiniLM-L6-v2** ✅ (default, 384 dimensions)
+- google/embeddinggemma-300m (alternative)
+## License
+MIT
+## Testing
+### Run All Tests
+```bash
+python tests/test_runner.py
+```
+### Run Specific Tests
+```bash
+# Run single test file
+python tests/test_runner.py chromadb
+# Run multiple test files
+python tests/test_runner.py chromadb seed_finder subgraph_extraction
+# List available tests
+python tests/test_runner.py --list
+# Run standalone test file
+python tests/test_chromadb.py
+```
+### Prerequisites
+- Oxigraph SPARQL server running at `http://localhost:7878/query`
+- `OPENAI_API_KEY` environment variable set
+- ChromaDB created (happens automatically on first run)

kgnode-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,79 @@
+[project]
+name = "kgnode"
+version = "0.1.0"
+description = "Knowledge Graph Agnostic Node for Knowledge-Aware LLM Applications"
+readme = "README.md"
+license = { text = "MIT" }
+authors = [
+    { name = "afmjoaa", email = "mohimenul.joaa@gmail.com" }
+]
+requires-python = ">=3.11"
+dependencies = [
+    "chromadb>=1.1.1",
+    "datasets>=4.2.0",
+    "dspy>=3.0.4",
+    "numpy>=2.3.3",
+    "openai>=2.6.1",
+    "pandas>=2.3.3",
+    "rdflib>=7.2.1",
+    "sentence-transformers>=5.1.1",
+    "sparqlwrapper>=2.0.0",
+]
+[dependency-groups]
+dev = [
+    "ruff<1.0.0,>=0.4.10",
+    "mypy<2.0.0,>=1.10.1",
+    "pytest>=8.2.2,<9.0.0",
+    "pytest-mock>=3.15.1",
+]
+[tool.ruff]
+lint.select = [
+    "E",    # pycodestyle
+    "F",    # pyflakes
+    "I",    # isort
+    "D",    # pydocstyle
+    "D401", # First line should be in imperative mood
+]
+[tool.ruff.lint.per-file-ignores]
+"tests/*" = ["D"]
+[tool.ruff.lint.pydocstyle]
+convention = "google"
+[tool.ruff.format]
+docstring-code-format = true
+docstring-code-line-length = 80
+[tool.uv.sources]
+kgnode = { workspace = true }
+#[tool.pytest-watcher]
+#now = true
+#delay = 3
+#patterns = ["*.py"]
+# To change the root module location, only src is included by default.
+#[tool.uv.build-backend]
+#module-name = "kgnode"
+#module-root = ""
+[tool.uv.build-backend]
+source-exclude = [
+    "tests/",
+    "docs/",
+    "paper/",
+    ".github/",
+    "_data/",
+    "_temp/",
+    ".python-version",
+    ".venv*/**",
+    ".editorconfig",
+    ".langgraph_api",
+    "*.ipynb",
+]
+[build-system]
+requires = ["uv_build>=0.8.16,<0.9.0"]
+build-backend = "uv_build"

kgnode-0.1.0/src/kgnode/__init__.py ADDED Viewed

@@ -0,0 +1,60 @@
+"""
+kgnode - Knowledge Graph Agnostic Node for Knowledge-Aware LLM Applications.
+Public API for knowledge graph retrieval and answer generation.
+"""
+# Main Pipeline APIs
+from kgnode.seed_finder import citable, get_seed_nodes
+from kgnode.subgraph_extraction import get_subgraphs
+from kgnode.generator import (
+    generate_sparql,
+    kg_retrieve,
+    generate_answer,
+    generate_answer_using_subgraph,
+)
+# Validation
+from kgnode.validator import validate_subgraph
+# Search Operations
+from kgnode.keyword_search import search_entities_by_keywords
+# VectorDB Operations
+from kgnode.chroma_db import (
+    compile_chromadb,
+    compile_chromadb_from_csv,
+    semantic_search_entities,
+    get_or_create_chromadb,
+    add_or_update_entities,
+    delete_entities,
+)
+# Core Configuration
+from kgnode.core.kg_config import KGConfig
+from kgnode.core.sparql_query import execute_sparql_query
+__all__ = [
+    # Main Pipeline APIs
+    "citable",
+    "get_seed_nodes",
+    "get_subgraphs",
+    "generate_sparql",
+    "kg_retrieve",
+    "generate_answer",
+    "generate_answer_using_subgraph",
+    # Validation
+    "validate_subgraph",
+    # Search Operations
+    "search_entities_by_keywords",
+    # VectorDB Operations
+    "compile_chromadb",
+    "compile_chromadb_from_csv",
+    "semantic_search_entities",
+    "get_or_create_chromadb",
+    "add_or_update_entities",
+    "delete_entities",
+    # Core Configuration
+    "KGConfig",
+    "execute_sparql_query",
+]