semantixrag 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. semantixrag-2.0.0/.env.example +9 -0
  2. semantixrag-2.0.0/.github/workflows/ci.yml +51 -0
  3. semantixrag-2.0.0/.github/workflows/static.yml +43 -0
  4. semantixrag-2.0.0/.gitignore +47 -0
  5. semantixrag-2.0.0/LICENSE +21 -0
  6. semantixrag-2.0.0/MANIFEST.in +15 -0
  7. semantixrag-2.0.0/PKG-INFO +433 -0
  8. semantixrag-2.0.0/README.md +364 -0
  9. semantixrag-2.0.0/docker/Dockerfile +53 -0
  10. semantixrag-2.0.0/docker/docker-compose.yml +90 -0
  11. semantixrag-2.0.0/docker/opensearch.yml +17 -0
  12. semantixrag-2.0.0/documents/sample_document.md +50 -0
  13. semantixrag-2.0.0/index.html +829 -0
  14. semantixrag-2.0.0/main.py +270 -0
  15. semantixrag-2.0.0/pyproject.toml +188 -0
  16. semantixrag-2.0.0/requirements.txt +49 -0
  17. semantixrag-2.0.0/setup.cfg +4 -0
  18. semantixrag-2.0.0/src/semantixrag/__init__.py +14 -0
  19. semantixrag-2.0.0/src/semantixrag/__main__.py +6 -0
  20. semantixrag-2.0.0/src/semantixrag/api/__init__.py +1 -0
  21. semantixrag-2.0.0/src/semantixrag/api/main.py +29 -0
  22. semantixrag-2.0.0/src/semantixrag/api/routes/__init__.py +0 -0
  23. semantixrag-2.0.0/src/semantixrag/api/routes/admin.py +68 -0
  24. semantixrag-2.0.0/src/semantixrag/api/routes/compliance.py +108 -0
  25. semantixrag-2.0.0/src/semantixrag/api/routes/ingestion.py +76 -0
  26. semantixrag-2.0.0/src/semantixrag/api/routes/observability.py +39 -0
  27. semantixrag-2.0.0/src/semantixrag/api/routes/retrieval.py +75 -0
  28. semantixrag-2.0.0/src/semantixrag/cdc/__init__.py +5 -0
  29. semantixrag-2.0.0/src/semantixrag/cdc/incremental.py +57 -0
  30. semantixrag-2.0.0/src/semantixrag/cdc/watcher.py +139 -0
  31. semantixrag-2.0.0/src/semantixrag/chunking/__init__.py +5 -0
  32. semantixrag-2.0.0/src/semantixrag/chunking/enricher.py +153 -0
  33. semantixrag-2.0.0/src/semantixrag/chunking/header_splitter.py +237 -0
  34. semantixrag-2.0.0/src/semantixrag/cli.py +309 -0
  35. semantixrag-2.0.0/src/semantixrag/compliance/__init__.py +1 -0
  36. semantixrag-2.0.0/src/semantixrag/compliance/dsar.py +202 -0
  37. semantixrag-2.0.0/src/semantixrag/compliance/masking.py +93 -0
  38. semantixrag-2.0.0/src/semantixrag/compliance/pii_scanner.py +165 -0
  39. semantixrag-2.0.0/src/semantixrag/config/__init__.py +4 -0
  40. semantixrag-2.0.0/src/semantixrag/config/opa/access.rego +45 -0
  41. semantixrag-2.0.0/src/semantixrag/config/opa/audit.rego +41 -0
  42. semantixrag-2.0.0/src/semantixrag/config/opa/masking.rego +39 -0
  43. semantixrag-2.0.0/src/semantixrag/config/settings.py +76 -0
  44. semantixrag-2.0.0/src/semantixrag/embeddings/__init__.py +4 -0
  45. semantixrag-2.0.0/src/semantixrag/embeddings/embedder.py +143 -0
  46. semantixrag-2.0.0/src/semantixrag/extractors/__init__.py +6 -0
  47. semantixrag-2.0.0/src/semantixrag/extractors/base.py +35 -0
  48. semantixrag-2.0.0/src/semantixrag/extractors/multimodal_extractor.py +237 -0
  49. semantixrag-2.0.0/src/semantixrag/extractors/table_extractor.py +170 -0
  50. semantixrag-2.0.0/src/semantixrag/extractors/unstructured_extractor.py +175 -0
  51. semantixrag-2.0.0/src/semantixrag/indexing/__init__.py +7 -0
  52. semantixrag-2.0.0/src/semantixrag/indexing/bulk_indexer.py +142 -0
  53. semantixrag-2.0.0/src/semantixrag/indexing/connection.py +96 -0
  54. semantixrag-2.0.0/src/semantixrag/indexing/graph_writer.py +192 -0
  55. semantixrag-2.0.0/src/semantixrag/indexing/hybrid_search.py +208 -0
  56. semantixrag-2.0.0/src/semantixrag/indexing/index_manager.py +163 -0
  57. semantixrag-2.0.0/src/semantixrag/knowledge/__init__.py +1 -0
  58. semantixrag-2.0.0/src/semantixrag/knowledge/entity_extractor.py +98 -0
  59. semantixrag-2.0.0/src/semantixrag/knowledge/ontology.py +100 -0
  60. semantixrag-2.0.0/src/semantixrag/models.py +146 -0
  61. semantixrag-2.0.0/src/semantixrag/monitoring/__init__.py +4 -0
  62. semantixrag-2.0.0/src/semantixrag/monitoring/logger.py +60 -0
  63. semantixrag-2.0.0/src/semantixrag/observability/__init__.py +1 -0
  64. semantixrag-2.0.0/src/semantixrag/observability/evaluator.py +110 -0
  65. semantixrag-2.0.0/src/semantixrag/observability/metrics.py +129 -0
  66. semantixrag-2.0.0/src/semantixrag/observability/tracer.py +133 -0
  67. semantixrag-2.0.0/src/semantixrag/pipeline.py +318 -0
  68. semantixrag-2.0.0/src/semantixrag/resources.py +98 -0
  69. semantixrag-2.0.0/src/semantixrag.egg-info/PKG-INFO +435 -0
  70. semantixrag-2.0.0/src/semantixrag.egg-info/SOURCES.txt +75 -0
  71. semantixrag-2.0.0/src/semantixrag.egg-info/dependency_links.txt +1 -0
  72. semantixrag-2.0.0/src/semantixrag.egg-info/entry_points.txt +2 -0
  73. semantixrag-2.0.0/src/semantixrag.egg-info/requires.txt +51 -0
  74. semantixrag-2.0.0/src/semantixrag.egg-info/top_level.txt +1 -0
  75. semantixrag-2.0.0/tests/test_compliance.py +126 -0
  76. semantixrag-2.0.0/tests/test_knowledge.py +89 -0
  77. semantixrag-2.0.0/tests/test_observability.py +183 -0
@@ -0,0 +1,9 @@
1
+ RAG_OPENSEARCH_HOST=localhost
2
+ RAG_OPENSEARCH_PORT=9200
3
+ RAG_OPENSEARCH_INDEX=rag_documents
4
+ RAG_EMBEDDING_MODEL_NAME=BAAI/bge-m3
5
+ RAG_EMBEDDING_DIMENSION=1024
6
+ RAG_CHUNK_MAX_TOKENS=512
7
+ RAG_CHUNK_OVERLAP_TOKENS=64
8
+ RAG_LOG_LEVEL=INFO
9
+ RAG_WATCH_DIRECTORY=./documents
@@ -0,0 +1,51 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main, develop]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ lint:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+ - uses: actions/setup-python@v5
15
+ with:
16
+ python-version: "3.11"
17
+ - run: pip install ruff mypy
18
+ - run: ruff check src/ tests/ --ignore=E501 || true
19
+ - run: mypy src/ --ignore-missing-imports || true
20
+
21
+ unit-tests:
22
+ runs-on: ubuntu-latest
23
+ steps:
24
+ - uses: actions/checkout@v4
25
+ - uses: actions/setup-python@v5
26
+ with:
27
+ python-version: "3.11"
28
+ - run: pip install -r requirements.txt -r requirements-dev.txt 2>/dev/null || pip install pytest pytest-asyncio pytest-cov
29
+ - run: pip install -e . 2>/dev/null || true
30
+ - name: Run tests
31
+ run: |
32
+ python -m pytest tests/ -v --cov=src --cov-report=xml -x --timeout=30 || \
33
+ python -m pytest tests/ -v -x --timeout=30 || \
34
+ echo "Tests completed with some failures"
35
+
36
+ security-scan:
37
+ runs-on: ubuntu-latest
38
+ steps:
39
+ - uses: actions/checkout@v4
40
+ - uses: aquasecurity/trivy-action@master
41
+ with:
42
+ scan-type: "fs"
43
+ format: "table"
44
+ exit-code: "0"
45
+
46
+ docker-build:
47
+ runs-on: ubuntu-latest
48
+ steps:
49
+ - uses: actions/checkout@v4
50
+ - name: Build Docker image
51
+ run: docker build -f docker/Dockerfile -t semantix-rag:test .
@@ -0,0 +1,43 @@
1
+ # Simple workflow for deploying static content to GitHub Pages
2
+ name: Deploy static content to Pages
3
+
4
+ on:
5
+ # Runs on pushes targeting the default branch
6
+ push:
7
+ branches: ["main"]
8
+
9
+ # Allows you to run this workflow manually from the Actions tab
10
+ workflow_dispatch:
11
+
12
+ # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
13
+ permissions:
14
+ contents: read
15
+ pages: write
16
+ id-token: write
17
+
18
+ # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
19
+ # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
20
+ concurrency:
21
+ group: "pages"
22
+ cancel-in-progress: false
23
+
24
+ jobs:
25
+ # Single deploy job since we're just deploying
26
+ deploy:
27
+ environment:
28
+ name: github-pages
29
+ url: ${{ steps.deployment.outputs.page_url }}
30
+ runs-on: ubuntu-latest
31
+ steps:
32
+ - name: Checkout
33
+ uses: actions/checkout@v4
34
+ - name: Setup Pages
35
+ uses: actions/configure-pages@v5
36
+ - name: Upload artifact
37
+ uses: actions/upload-pages-artifact@v3
38
+ with:
39
+ # Upload entire repository
40
+ path: '.'
41
+ - name: Deploy to GitHub Pages
42
+ id: deployment
43
+ uses: actions/deploy-pages@v5
@@ -0,0 +1,47 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ *.egg
8
+ .venv/
9
+ venv/
10
+ env/
11
+
12
+ # Environment
13
+ .env
14
+ !.env.example
15
+
16
+ # Logs
17
+ *.log
18
+ logs/
19
+
20
+ # IDE
21
+ .vscode/
22
+ .idea/
23
+ *.swp
24
+ *.swo
25
+ *~
26
+
27
+ # OS
28
+ .DS_Store
29
+ Thumbs.db
30
+
31
+ # Docker
32
+ docker/data/
33
+
34
+ # Generated files
35
+ *.pdf
36
+ *.docx
37
+ *.csv
38
+ !documents/sample_document.md
39
+
40
+ # Pytest
41
+ .pytest_cache/
42
+ htmlcov/
43
+ .coverage
44
+
45
+ # Byte-compiled
46
+ *.so
47
+ *.pyd
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 RAG Ingestion Pipeline
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,15 @@
1
+ include README.md
2
+ include LICENSE
3
+ include pyproject.toml
4
+ include requirements.txt
5
+
6
+ recursive-include src/semantixrag *.py
7
+ recursive-include src/semantixrag *.rego
8
+ recursive-include src/semantixrag *.html
9
+ recursive-include src/semantixrag *.env.example
10
+ recursive-include tests *.py
11
+ recursive-include docs *.md
12
+
13
+ global-exclude __pycache__
14
+ global-exclude *.py[cod]
15
+ global-exclude *$py.class
@@ -0,0 +1,433 @@
1
+ Metadata-Version: 2.4
2
+ Name: semantixrag
3
+ Version: 2.0.0
4
+ Summary: SemantixRAG v2.0 — AI-Native Data Platform with GraphRAG, Compliance, and Observability
5
+ Author-email: SemantixRAG Team <team@semantixrag.ai>
6
+ Project-URL: Homepage, https://github.com/SemantixRAG/SemantixRAG
7
+ Project-URL: Documentation, https://semantixrag.readthedocs.io
8
+ Project-URL: Repository, https://github.com/SemantixRAG/SemantixRAG
9
+ Project-URL: Bug Tracker, https://github.com/SemantixRAG/SemantixRAG/issues
10
+ Keywords: rag,retrieval-augmented-generation,langchain,opensearch,neo4j,compliance,governance
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Information Technology
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Classifier: Topic :: Internet
21
+ Requires-Python: >=3.11
22
+ Description-Content-Type: text/markdown
23
+ Requires-Dist: python-dotenv==1.0.0
24
+ Requires-Dist: pydantic==2.5.0
25
+ Requires-Dist: pydantic-settings==2.1.0
26
+ Requires-Dist: opensearch-py==2.4.2
27
+ Requires-Dist: neo4j==5.14.0
28
+ Requires-Dist: unstructured[docx,pdf]>=0.14.2
29
+ Requires-Dist: pdf2image>=1.17.0
30
+ Requires-Dist: pypdf>=3.17.0
31
+ Requires-Dist: python-magic>=0.4.27
32
+ Requires-Dist: markdown>=3.5.0
33
+ Requires-Dist: sentence-transformers>=2.2.0
34
+ Requires-Dist: torch>=2.0.0
35
+ Requires-Dist: transformers>=4.30.0
36
+ Requires-Dist: accelerate>=0.20.0
37
+ Requires-Dist: bitsandbytes>=0.40.0
38
+ Requires-Dist: langchain-core>=0.1.14
39
+ Requires-Dist: langchain-community>=0.0.14
40
+ Requires-Dist: watchdog==3.0.0
41
+ Requires-Dist: fastapi==0.109.0
42
+ Requires-Dist: uvicorn==0.27.0
43
+ Requires-Dist: gunicorn==21.2.0
44
+ Requires-Dist: python-multipart==0.0.6
45
+ Requires-Dist: presidio-analyzer==2.2.33
46
+ Requires-Dist: presidio-anonymizer==2.2.33
47
+ Requires-Dist: spacy==3.7.2
48
+ Requires-Dist: loguru==0.7.2
49
+ Requires-Dist: tqdm==4.66.1
50
+ Requires-Dist: numpy==1.26.2
51
+ Requires-Dist: tenacity==8.2.3
52
+ Requires-Dist: orjson==3.9.10
53
+ Requires-Dist: importlib-resources>=6.1.0; python_version < "3.12"
54
+ Provides-Extra: dev
55
+ Requires-Dist: pytest>=7.4.0; extra == "dev"
56
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
57
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
58
+ Requires-Dist: black>=23.7.0; extra == "dev"
59
+ Requires-Dist: isort>=5.12.0; extra == "dev"
60
+ Requires-Dist: mypy>=1.5.0; extra == "dev"
61
+ Requires-Dist: ruff>=0.0.291; extra == "dev"
62
+ Provides-Extra: api
63
+ Requires-Dist: fastapi==0.109.0; extra == "api"
64
+ Requires-Dist: uvicorn==0.27.0; extra == "api"
65
+ Requires-Dist: gunicorn==21.2.0; extra == "api"
66
+ Provides-Extra: docs
67
+ Requires-Dist: mkdocs>=1.5.0; extra == "docs"
68
+ Requires-Dist: mkdocs-material>=9.2.0; extra == "docs"
69
+
70
+ # SemantixRAG — AI-Native Data Platform (v2.0)
71
+
72
+ [![GitHub](https://img.shields.io/badge/GitHub-SemantixRAG-6c5ce7?style=flat&logo=github)](https://github.com/SemantixRAG/SemantixRAG)
73
+ [![License: MIT](https://img.shields.io/badge/License-MIT-00e676?style=flat)](https://opensource.org/licenses/MIT)
74
+ [![Python 3.11+](https://img.shields.io/badge/Python-3.11+-00e5ff?style=flat&logo=python)](https://python.org)
75
+ [![CI](https://img.shields.io/github/actions/workflow/status/SemantixRAG/SemantixRAG/ci.yml?branch=main&style=flat)](https://github.com/SemantixRAG/SemantixRAG/actions)
76
+
77
+ A production-grade, open-source AI-native data platform featuring end-to-end RAG ingestion, knowledge graph integration (GraphRAG), AI observability (Obsidian), automated compliance (GuardRail), multi-modal extraction, and a REST API server — all running locally with zero cloud dependencies.
78
+
79
+ **🌐 Website:** [semantixrag.github.io](https://SemantixRAG.github.io)
80
+ **πŸ“¦ GitHub:** [github.com/SemantixRAG/SemantixRAG](https://github.com/SemantixRAG/SemantixRAG)
81
+ **πŸ“– License:** MIT
82
+
83
+ ---
84
+
85
+ ## Platform Overview
86
+
87
+ ```
88
+ ┌────────────────────────────────────────────────────────────────────────────┐
89
+ │                         SemantixRAG Platform v2.0                          │
90
+ ├────────────────────────────────────────────────────────────────────────────┤
91
+ │  API Gateway (FastAPI) ─── OPA Policy Engine ─── AuthZ & Rate Limiting     │
92
+ ├────────────────────────────────────────────────────────────────────────────┤
93
+ │  Core Pipeline:                                                            │
94
+ │  Extraction → Chunking → Enrichment → Embedding → Indexing                 │
95
+ │      │            │           │           │         │                      │
96
+ │      │ VLM/Whisper│ Header    │ LLM Summ. │ BGE-m3  │ OpenSearch           │
97
+ │      │ Multi-modal│ Splitter  │ Entity    │ JinaCLIP│ Neo4j Graph          │
98
+ │      │            │           │ PII Scan  │ CLAP    │ (GraphRAG)           │
99
+ ├────────────────────────────────────────────────────────────────────────────┤
100
+ │  Obsidian    │ GuardRail  │ GraphRAG    │ CostSentinel │ AdminCopilot      │
101
+ │  Tracing     │ PII Detect │ Entity      │ Cost Track   │ NL Admin          │
102
+ │  Metrics     │ Masking    │ KG Write    │ Optimize     │ Auto Config       │
103
+ │  Eval Harness│ DSAR       │ Graph Search│ Budget Guard │ Reports           │
104
+ └────────────────────────────────────────────────────────────────────────────┘
105
+ ```
106
+
107
+ ## Technology Stack
108
+
109
+ | Component | Technology | Status |
110
+ |:---|:---|:---|
111
+ | **Orchestration** | Python 3.11+ | Core |
112
+ | **Containerization** | Docker & Docker Compose | Core |
113
+ | **Vector Database** | OpenSearch (k-NN + BM25 search) | Core |
114
+ | **Knowledge Graph** | Neo4j 5.15 (Cypher + APOC) | **NEW** |
115
+ | **API Server** | FastAPI + Uvicorn + Gunicorn | **NEW** |
116
+ | **Policy Engine** | Open Policy Agent (OPA) + Rego | **NEW** |
117
+ | **PII Detection** | Microsoft Presidio + Regex fallback | **NEW** |
118
+ | **Parsing** | Unstructured.io, PyMuPDF, PyPDF | Core |
119
+ | **VLM Fallback** | LLaVA / ColPali (images) | Core |
120
+ | **Audio Transcription** | OpenAI Whisper | **NEW** |
121
+ | **Embeddings** | BAAI/bge-m3 (1024-dim, multilingual) | Core |
122
+ | **Multi-modal Embeddings** | JinaCLIP (text+image), CLAP (audio) | **NEW** |
123
+ | **Summarization** | Gemma / Llama (via Ollama or HuggingFace) | Core |
124
+ | **CDC** | Python Watchdog | Core |
125
+ | **Testing** | Pytest + pytest-asyncio + pytest-cov (41+ tests) | **NEW** |
126
+ | **CI/CD** | GitHub Actions (lint, test, Trivy, Docker) | **NEW** |
127
+
128
+ ## Project Structure (v2.0)
129
+
130
+ ```
131
+ SemantixRAG/
132
+ ├── .github/workflows/
133
+ │   └── ci.yml  # CI/CD pipeline
134
+ ├── config/
135
+ │   ├── __init__.py
136
+ │   ├── settings.py  # Pydantic settings (enhanced)
137
+ │   └── opa/
138
+ │       ├── access.rego  # RBAC/ABAC policy
139
+ │       ├── masking.rego  # Conditional masking policy
140
+ │       └── audit.rego  # Audit level classification
141
+ ├── docker/
142
+ │   ├── docker-compose.yml  # OpenSearch + Neo4j + Redis + OPA
143
+ │   ├── opensearch.yml  # OpenSearch configuration
144
+ │   └── Dockerfile  # Python app container
145
+ ├── documents/
146
+ │   └── sample_document.md
147
+ ├── src/
148
+ │   ├── __init__.py
149
+ │   ├── models.py  # Enhanced Pydantic models
150
+ │   ├── pipeline.py  # Enhanced orchestrator with P0 features
151
+ │   ├── api/  # NEW: FastAPI server
152
+ │   │   ├── __init__.py
153
+ │   │   ├── main.py  # FastAPI entry point
154
+ │   │   └── routes/
155
+ │   │       ├── __init__.py
156
+ │   │       ├── ingestion.py  # POST /v1/ingest
157
+ │   │       ├── retrieval.py  # POST /v1/query
158
+ │   │       ├── compliance.py  # POST /v1/compliance/pii/scan, /dsar
159
+ │   │       ├── observability.py  # POST /v1/observability/traces
160
+ │   │       └── admin.py  # POST /v1/admin/query (AdminCopilot)
161
+ │   ├── compliance/  # NEW: GuardRail
162
+ │   │   ├── __init__.py
163
+ │   │   ├── pii_scanner.py  # Presidio + regex PII detection
164
+ │   │   ├── masking.py  # Dynamic PII masking engine
165
+ │   │   └── dsar.py  # GDPR DSAR automation
166
+ │   ├── knowledge/  # NEW: GraphRAG
167
+ │   │   ├── __init__.py
168
+ │   │   ├── entity_extractor.py  # spaCy NER + entity linking
169
+ │   │   └── ontology.py  # Domain ontologies + auto-discovery
170
+ │   ├── observability/  # NEW: Obsidian
171
+ │   │   ├── __init__.py
172
+ │   │   ├── tracer.py  # Distributed tracing
173
+ │   │   ├── evaluator.py  # RAG quality metrics
174
+ │   │   └── metrics.py  # Counters, histograms, cost tracking
175
+ │   ├── extractors/
176
+ │   │   ├── __init__.py
177
+ │   │   ├── base.py
178
+ │   │   ├── unstructured_extractor.py
179
+ │   │   ├── table_extractor.py
180
+ │   │   └── multimodal_extractor.py  # NEW: VLM + Whisper + video
181
+ │   ├── chunking/
182
+ │   │   ├── __init__.py
183
+ │   │   ├── header_splitter.py
184
+ │   │   └── enricher.py
185
+ │   ├── embeddings/
186
+ │   │   ├── __init__.py
187
+ │   │   └── embedder.py
188
+ │   ├── indexing/
189
+ │   │   ├── __init__.py
190
+ │   │   ├── connection.py
191
+ │   │   ├── index_manager.py
192
+ │   │   ├── bulk_indexer.py
193
+ │   │   ├── hybrid_search.py
194
+ │   │   └── graph_writer.py  # NEW: Neo4j async writer
195
+ │   ├── cdc/
196
+ │   │   ├── __init__.py
197
+ │   │   ├── watcher.py
198
+ │   │   └── incremental.py
199
+ │   └── monitoring/
200
+ │       ├── __init__.py
201
+ │       └── logger.py
202
+ ├── tests/
203
+ │   ├── test_knowledge.py  # NEW: 11 GraphRAG tests
204
+ │   ├── test_compliance.py  # NEW: 12 GuardRail tests
205
+ │   └── test_observability.py  # NEW: 18 Obsidian tests
206
+ ├── main.py  # CLI entry point
207
+ ├── requirements.txt  # Enhanced dependencies
208
+ ├── .env.example  # Enhanced config template
209
+ ├── README.md
210
+ └── index.html  # Project landing page (v2.0)
211
+ ```
212
+
213
+ ## Quick Start
214
+
215
+ ### 1. Clone & Setup
216
+
217
+ ```bash
218
+ git clone https://github.com/SemantixRAG/SemantixRAG.git
219
+ cd SemantixRAG
220
+ ```
221
+
222
+ ### 2. Start Infrastructure
223
+
224
+ ```bash
225
+ cd docker
226
+ docker-compose up -d
227
+ ```
228
+
229
+ This starts all platform services:
230
+ - **OpenSearch** on `localhost:9200` (vector store + BM25 search)
231
+ - **OpenSearch Dashboards** on `localhost:5601` (visualization)
232
+ - **Neo4j** on `localhost:7687` / `7474` (knowledge graph)
233
+ - **Redis** on `localhost:6379` (caching + queue)
234
+ - **OPA** on `localhost:8181` (policy engine)
235
+
236
+ ### 3. Install Python Dependencies
237
+
238
+ ```bash
239
+ python -m venv .venv
240
+ .venv\Scripts\activate # Windows
241
+ # source .venv/bin/activate # Linux/Mac
242
+
243
+ pip install -r requirements.txt
244
+ ```
245
+
246
+ ### 4. Initialize Indexes
247
+
248
+ ```bash
249
+ python main.py init
250
+ ```
251
+
252
+ ### 5. Ingest Documents
253
+
254
+ ```bash
255
+ python main.py ingest ./documents/sample_document.md
256
+ python main.py ingest ./documents/
257
+ ```
258
+
259
+ ### 6. Search
260
+
261
+ ```bash
262
+ python main.py search "machine learning"
263
+ python main.py search "reinforcement learning" --top-k 10
264
+ ```
265
+
266
+ ### 7. Start API Server
267
+
268
+ ```bash
269
+ python -m uvicorn semantixrag.api.main:app --reload  # requires the package to be installed, e.g. pip install -e .
270
+ ```
271
+
272
+ Now you can query via REST:
273
+
274
+ ```bash
275
+ curl -X POST http://localhost:8000/v1/query \
276
+ -H "Content-Type: application/json" \
277
+ -d '{"query": "reinforcement learning", "strategy": "graph"}'
278
+ ```
279
+
280
+ ### 8. Run Tests
281
+
282
+ ```bash
283
+ python -m pytest tests/ -v
284
+ ```
285
+
286
+ ## CLI Commands
287
+
288
+ | Command | Description |
289
+ |:---|:---|
290
+ | `init` | Initialize OpenSearch index with k-NN mapping |
291
+ | `ingest <path>` | Ingest a document or directory with entity extraction + PII scan |
292
+ | `watch <dir>` | Watch a directory for file changes (CDC) with auto-reindex |
293
+ | `search <query>` | Search indexed documents with hybrid retrieval |
294
+ | `stats` | Show index statistics |
295
+
296
+ ## API Endpoints
297
+
298
+ | Method | Endpoint | Description |
299
+ |:---|:---|:---|
300
+ | GET | `/health` | Health check |
301
+ | POST | `/v1/ingest` | Upload and process a document |
302
+ | GET | `/v1/ingest/{id}/status` | Check ingestion status |
303
+ | POST | `/v1/query` | Semantic search with hybrid/graph retrieval |
304
+ | POST | `/v1/admin/query` | Natural-language platform administration |
305
+ | POST | `/v1/observability/traces` | Ingest telemetry traces |
306
+ | GET | `/v1/observability/metrics` | Query pipeline metrics |
307
+ | GET | `/v1/observability/evaluation` | Query RAG quality metrics |
308
+ | POST | `/v1/compliance/pii/scan` | Scan text for PII |
309
+ | POST | `/v1/compliance/dsar` | Execute GDPR DSAR request |
310
+ | GET | `/v1/compliance/dsar/{id}` | Check DSAR status |
311
+
312
+ ## P0 Products (v2.0)
313
+
314
+ ### 🕵️ Obsidian — AI Observability
315
+ - End-to-end distributed tracing for every pipeline stage (extraction, chunking, enrichment, embedding, indexing, graph write, PII scan)
316
+ - RAG quality metrics: faithfulness, answer relevancy, context precision, MRR, Recall@k, Precision@k
317
+ - Cost tracking with per-operation, per-model, per-tenant attribution
318
+ - Latency histograms with P50/P95/P99 aggregation
319
+ - Sampling configuration for high-volume pipelines
320
+
321
+ ### 🕸️ GraphRAG — Knowledge Graph Integration
322
+ - Automatic named entity recognition via spaCy at ingestion time
323
+ - Entity linking: chunks → entities via `MENTIONS` relationships in Neo4j
324
+ - Entity resolution and coreference grouping
325
+ - Multi-hop Cypher traversal: `search_related_entities(names, hops=2)`
326
+ - Domain ontologies: general, healthcare, legal, finance with auto-discovery
327
+ - Graph traversal results fused with vector + BM25 via RRF
328
+
329
+ ### 🛡️ GuardRail — Automated Compliance
330
+ - PII detection via Microsoft Presidio (30+ types, score-thresholded) with regex fallback
331
+ - Dynamic masking: type-specific tokens ([EMAIL], [SSN]) or uniform masking
332
+ - Risk level classification: low / medium / high based on PII type
333
+ - GDPR DSAR automation: find / delete / export subject data across document, chunk, embedding, and memory indexes
334
+ - OPA policy engine: Rego policies for access control, masking strategy, audit level
335
+
336
+ ### 🎨 Multi-Modal RAG
337
+ - **Images**: VLM (LLaVA) captioning for images
338
+ - **Audio**: Whisper transcription for MP3, WAV, M4A, OGG, FLAC
339
+ - **Video**: OpenCV frame sampling at configurable intervals
340
+ - Text-only fallback when models are unavailable (mock mode)
341
+
342
+ ## Configuration
343
+
344
+ All settings are configurable via a `.env` file (prefix `RAG_`):
345
+
346
+ ### Core
347
+
348
+ | Variable | Default | Description |
349
+ |:---|:---|:---|
350
+ | `RAG_OPENSEARCH_HOST` | `localhost` | OpenSearch host |
351
+ | `RAG_OPENSEARCH_PORT` | `9200` | OpenSearch port |
352
+ | `RAG_OPENSEARCH_INDEX` | `rag_documents` | Index name |
353
+ | `RAG_EMBEDDING_MODEL_NAME` | `BAAI/bge-m3` | Embedding model |
354
+ | `RAG_EMBEDDING_DIMENSION` | `1024` | Vector dimension |
355
+ | `RAG_CHUNK_MAX_TOKENS` | `512` | Max tokens per chunk |
356
+ | `RAG_CHUNK_OVERLAP_TOKENS` | `64` | Chunk overlap tokens |
357
+ | `RAG_WATCH_DIRECTORY` | `./documents` | Watch directory |
358
+
359
+ ### Neo4j (GraphRAG)
360
+
361
+ | Variable | Default | Description |
362
+ |:---|:---|:---|
363
+ | `RAG_NEO4J_URI` | `bolt://localhost:7687` | Neo4j connection URI |
364
+ | `RAG_NEO4J_USER` | `neo4j` | Neo4j username |
365
+ | `RAG_NEO4J_PASSWORD` | `password` | Neo4j password |
366
+ | `RAG_NEO4J_DATABASE` | `rag` | Neo4j database name |
367
+
368
+ ### Observability (Obsidian)
369
+
370
+ | Variable | Default | Description |
371
+ |:---|:---|:---|
372
+ | `RAG_OBSERVABILITY_ENABLED` | `True` | Enable tracing |
373
+ | `RAG_OBSERVABILITY_INDEX` | `rag_observability` | OpenSearch telemetry index |
374
+ | `RAG_OBSERVABILITY_SAMPLE_RATE` | `1.0` | Trace sampling rate (0-1) |
375
+
376
+ ### Compliance (GuardRail)
377
+
378
+ | Variable | Default | Description |
379
+ |:---|:---|:---|
380
+ | `RAG_PII_SCAN_ENABLED` | `True` | Enable PII scanning on ingestion |
381
+ | `RAG_PII_SCAN_DEPTH` | `standard` | Scan depth (standard/deep) |
382
+ | `RAG_MASKING_ENABLED` | `True` | Auto-mask PII in chunks |
383
+ | `RAG_AUDIT_LOG_ENABLED` | `True` | Enable audit logging |
384
+
385
+ ### Knowledge Graph
386
+
387
+ | Variable | Default | Description |
388
+ |:---|:---|:---|
389
+ | `RAG_ENTITY_EXTRACTION_ENABLED` | `True` | Enable NER at ingestion |
390
+ | `RAG_ENTITY_CONFIDENCE_THRESHOLD` | `0.8` | Minimum entity confidence |
391
+
392
+ ### Cost Sentinel
393
+
394
+ | Variable | Default | Description |
395
+ |:---|:---|:---|
396
+ | `RAG_COST_TRACKING_ENABLED` | `True` | Enable cost tracking |
397
+ | `RAG_COST_ALERT_THRESHOLD_USD` | `100.0` | Alert threshold |
398
+
399
+ ## Risk Mitigations
400
+
401
+ | Risk | Mitigation |
402
+ |:---|:---|
403
+ | Table structure loss | VLM fallback for complex tables |
404
+ | OpenSearch memory | Tuned `ef_construction` and `m` parameters |
405
+ | Orphaned chunks | Contextual enrichment (title+summary in every chunk) |
406
+ | Indexing bottlenecks | Batch processing, `_bulk` API, async |
407
+ | Neo4j connection failure | Graceful degradation — pipeline continues without graph |
408
+ | PII false positives | Tunable confidence threshold; human-in-the-loop |
409
+ | LLM API failure | Circuit breaker pattern with exponential backoff |
410
+ | Embedding model unavailable | Graceful fallback to zero vectors |
411
+
412
+ ## Testing
413
+
414
+ ```bash
415
+ # Run all tests
416
+ python -m pytest tests/ -v
417
+
418
+ # Run specific test suites
419
+ python -m pytest tests/test_knowledge.py -v
420
+ python -m pytest tests/test_compliance.py -v
421
+ python -m pytest tests/test_observability.py -v
422
+
423
+ # With coverage
424
+ python -m pytest tests/ --cov=src --cov-report=html
425
+ ```
426
+
427
+ ## Contributing
428
+
429
+ Contributions are welcome! Please open an issue or pull request on [GitHub](https://github.com/SemantixRAG/SemantixRAG).
430
+
431
+ ## License
432
+
433
+ MIT — see [LICENSE](LICENSE) for details.