deeplightrag 1.0.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deeplightrag-1.0.6/.env.example +13 -0
- deeplightrag-1.0.6/.gitignore +184 -0
- deeplightrag-1.0.6/LICENSE +21 -0
- deeplightrag-1.0.6/MANIFEST.in +18 -0
- deeplightrag-1.0.6/PKG-INFO +121 -0
- deeplightrag-1.0.6/README.md +33 -0
- deeplightrag-1.0.6/main.py +212 -0
- deeplightrag-1.0.6/pyproject.toml +353 -0
- deeplightrag-1.0.6/requirements-macos.txt +10 -0
- deeplightrag-1.0.6/requirements.txt +53 -0
- deeplightrag-1.0.6/setup.cfg +4 -0
- deeplightrag-1.0.6/src/deeplightrag/__init__.py +47 -0
- deeplightrag-1.0.6/src/deeplightrag/cli.py +177 -0
- deeplightrag-1.0.6/src/deeplightrag/core.py +778 -0
- deeplightrag-1.0.6/src/deeplightrag/graph/__init__.py +9 -0
- deeplightrag-1.0.6/src/deeplightrag/graph/dual_layer.py +480 -0
- deeplightrag-1.0.6/src/deeplightrag/graph/entity_relationship.py +1268 -0
- deeplightrag-1.0.6/src/deeplightrag/graph/visual_spatial.py +654 -0
- deeplightrag-1.0.6/src/deeplightrag/interfaces.py +74 -0
- deeplightrag-1.0.6/src/deeplightrag/ner/__init__.py +17 -0
- deeplightrag-1.0.6/src/deeplightrag/ner/enhanced_ner_pipeline.py +473 -0
- deeplightrag-1.0.6/src/deeplightrag/ner/entity_filter.py +276 -0
- deeplightrag-1.0.6/src/deeplightrag/ner/entity_processor.py +400 -0
- deeplightrag-1.0.6/src/deeplightrag/ner/entity_schema.py +189 -0
- deeplightrag-1.0.6/src/deeplightrag/ner/entity_type_mapper.py +345 -0
- deeplightrag-1.0.6/src/deeplightrag/ner/gliner_ner.py +1020 -0
- deeplightrag-1.0.6/src/deeplightrag/ner/text_classifier.py +304 -0
- deeplightrag-1.0.6/src/deeplightrag/ner/visual_aware_ner.py +743 -0
- deeplightrag-1.0.6/src/deeplightrag/ner/working_enhanced_ner.py +328 -0
- deeplightrag-1.0.6/src/deeplightrag/ner/working_gliner_ner.py +293 -0
- deeplightrag-1.0.6/src/deeplightrag/ocr/__init__.py +4 -0
- deeplightrag-1.0.6/src/deeplightrag/ocr/clean_ocr.py +360 -0
- deeplightrag-1.0.6/src/deeplightrag/ocr/deepseek_model.py +416 -0
- deeplightrag-1.0.6/src/deeplightrag/ocr/deepseek_ocr.py +11 -0
- deeplightrag-1.0.6/src/deeplightrag/ocr/deepseek_output_parser.py +301 -0
- deeplightrag-1.0.6/src/deeplightrag/ocr/enhanced_ocr.py +679 -0
- deeplightrag-1.0.6/src/deeplightrag/ocr/geometry.py +34 -0
- deeplightrag-1.0.6/src/deeplightrag/ocr/processor.py +434 -0
- deeplightrag-1.0.6/src/deeplightrag/ocr/visual_embedding_extractor.py +271 -0
- deeplightrag-1.0.6/src/deeplightrag/ocr/visual_feature_extractor.py +1616 -0
- deeplightrag-1.0.6/src/deeplightrag/ocr/visual_token.py +96 -0
- deeplightrag-1.0.6/src/deeplightrag/retrieval/__init__.py +5 -0
- deeplightrag-1.0.6/src/deeplightrag/retrieval/adaptive_retriever.py +991 -0
- deeplightrag-1.0.6/src/deeplightrag/retrieval/multimodal_visual_retriever.py +900 -0
- deeplightrag-1.0.6/src/deeplightrag/retrieval/query_classifier.py +255 -0
- deeplightrag-1.0.6/src/deeplightrag/retrieval/toon_formatter.py +65 -0
- deeplightrag-1.0.6/src/deeplightrag/retrieval/visual_aware_retriever.py +498 -0
- deeplightrag-1.0.6/src/deeplightrag/retrieval/visual_similarity.py +645 -0
- deeplightrag-1.0.6/src/deeplightrag/utils/__init__.py +3 -0
- deeplightrag-1.0.6/src/deeplightrag/utils/component_factory.py +115 -0
- deeplightrag-1.0.6/src/deeplightrag/utils/config_manager.py +154 -0
- deeplightrag-1.0.6/src/deeplightrag/utils/device.py +74 -0
- deeplightrag-1.0.6/src/deeplightrag/utils/helpers.py +127 -0
- deeplightrag-1.0.6/src/deeplightrag.egg-info/PKG-INFO +121 -0
- deeplightrag-1.0.6/src/deeplightrag.egg-info/SOURCES.txt +57 -0
- deeplightrag-1.0.6/src/deeplightrag.egg-info/dependency_links.txt +1 -0
- deeplightrag-1.0.6/src/deeplightrag.egg-info/entry_points.txt +2 -0
- deeplightrag-1.0.6/src/deeplightrag.egg-info/requires.txt +64 -0
- deeplightrag-1.0.6/src/deeplightrag.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Environment Variables Template
|
|
2
|
+
# Copy this file to .env and fill in your actual values
|
|
3
|
+
# NEVER commit .env to git!
|
|
4
|
+
|
|
5
|
+
# LLM API Keys (Optional - only if you want to use LLM features)
|
|
6
|
+
OPENAI_API_KEY=your_openai_key_here
|
|
7
|
+
ANTHROPIC_API_KEY=your_anthropic_key_here
|
|
8
|
+
GOOGLE_API_KEY=your_gemini_key_here
|
|
9
|
+
COHERE_API_KEY=your_cohere_key_here
|
|
10
|
+
|
|
11
|
+
# Other Configuration
|
|
12
|
+
DEEPLIGHTRAG_STORAGE_DIR=./deeplightrag_data
|
|
13
|
+
# Device selection: auto, cpu, cuda, or mps
DEEPLIGHTRAG_DEVICE=auto
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
pip-wheel-metadata/
|
|
24
|
+
share/python-wheels/
|
|
25
|
+
*.egg-info/
|
|
26
|
+
.installed.cfg
|
|
27
|
+
*.egg
|
|
28
|
+
MANIFEST
|
|
29
|
+
|
|
30
|
+
# PyInstaller
|
|
31
|
+
*.manifest
|
|
32
|
+
*.spec
|
|
33
|
+
|
|
34
|
+
# Installer logs
|
|
35
|
+
pip-log.txt
|
|
36
|
+
pip-delete-this-directory.txt
|
|
37
|
+
|
|
38
|
+
# Unit test / coverage reports
|
|
39
|
+
htmlcov/
|
|
40
|
+
.tox/
|
|
41
|
+
.nox/
|
|
42
|
+
.coverage
|
|
43
|
+
.coverage.*
|
|
44
|
+
.cache
|
|
45
|
+
nosetests.xml
|
|
46
|
+
coverage.xml
|
|
47
|
+
*.cover
|
|
48
|
+
*.py,cover
|
|
49
|
+
.hypothesis/
|
|
50
|
+
.pytest_cache/
|
|
51
|
+
|
|
52
|
+
# Translations
|
|
53
|
+
*.mo
|
|
54
|
+
*.pot
|
|
55
|
+
|
|
56
|
+
# Django stuff:
|
|
57
|
+
*.log
|
|
58
|
+
local_settings.py
|
|
59
|
+
db.sqlite3
|
|
60
|
+
db.sqlite3-journal
|
|
61
|
+
|
|
62
|
+
# Flask stuff:
|
|
63
|
+
instance/
|
|
64
|
+
.webassets-cache
|
|
65
|
+
|
|
66
|
+
# Scrapy stuff:
|
|
67
|
+
.scrapy
|
|
68
|
+
|
|
69
|
+
# Sphinx documentation
|
|
70
|
+
docs/_build/
|
|
71
|
+
|
|
72
|
+
# PyBuilder
|
|
73
|
+
target/
|
|
74
|
+
|
|
75
|
+
# Jupyter Notebook
|
|
76
|
+
.ipynb_checkpoints
|
|
77
|
+
|
|
78
|
+
# IPython
|
|
79
|
+
profile_default/
|
|
80
|
+
ipython_config.py
|
|
81
|
+
|
|
82
|
+
# pyenv
|
|
83
|
+
.python-version
|
|
84
|
+
|
|
85
|
+
# pipenv
|
|
86
|
+
Pipfile.lock
|
|
87
|
+
|
|
88
|
+
# PEP 582
|
|
89
|
+
__pypackages__/
|
|
90
|
+
|
|
91
|
+
# Celery stuff
|
|
92
|
+
celerybeat-schedule
|
|
93
|
+
celerybeat.pid
|
|
94
|
+
|
|
95
|
+
# SageMath parsed files
|
|
96
|
+
*.sage.py
|
|
97
|
+
|
|
98
|
+
# Environments
|
|
99
|
+
.env
|
|
100
|
+
.venv
|
|
101
|
+
env/
|
|
102
|
+
venv/
|
|
103
|
+
ENV/
|
|
104
|
+
env.bak/
|
|
105
|
+
venv.bak/
|
|
106
|
+
|
|
107
|
+
# Spyder project settings
|
|
108
|
+
.spyderproject
|
|
109
|
+
.spyproject
|
|
110
|
+
|
|
111
|
+
# Rope project settings
|
|
112
|
+
.ropeproject
|
|
113
|
+
|
|
114
|
+
# mkdocs documentation
|
|
115
|
+
/site
|
|
116
|
+
|
|
117
|
+
# mypy
|
|
118
|
+
.mypy_cache/
|
|
119
|
+
.dmypy.json
|
|
120
|
+
dmypy.json
|
|
121
|
+
|
|
122
|
+
# Pyre type checker
|
|
123
|
+
.pyre/
|
|
124
|
+
|
|
125
|
+
# IDE
|
|
126
|
+
.vscode/
|
|
127
|
+
.idea/
|
|
128
|
+
*.swp
|
|
129
|
+
*.swo
|
|
130
|
+
*~
|
|
131
|
+
.DS_Store
|
|
132
|
+
*.sublime-project
|
|
133
|
+
*.sublime-workspace
|
|
134
|
+
.vim/
|
|
135
|
+
|
|
136
|
+
# Project specific
|
|
137
|
+
deeplightrag_data/
|
|
138
|
+
config/
|
|
139
|
+
*.yaml.local
|
|
140
|
+
*.env.local
|
|
141
|
+
*.env.*.local
|
|
142
|
+
|
|
143
|
+
# Build artifacts
|
|
144
|
+
dist/
|
|
145
|
+
build/
|
|
146
|
+
*.egg-info/
|
|
147
|
+
.eggs/
|
|
148
|
+
|
|
149
|
+
# Coverage reports
|
|
150
|
+
htmlcov/
|
|
151
|
+
.coverage
|
|
152
|
+
coverage.xml
|
|
153
|
+
|
|
154
|
+
# Test artifacts
|
|
155
|
+
.pytest_cache/
|
|
156
|
+
.tox/
|
|
157
|
+
.nox/
|
|
158
|
+
|
|
159
|
+
# IDE and editor files
|
|
160
|
+
.vscode/
|
|
161
|
+
.idea/
|
|
162
|
+
*.code-workspace
|
|
163
|
+
*.iml
|
|
164
|
+
|
|
165
|
+
# OS
|
|
166
|
+
.DS_Store
|
|
167
|
+
Thumbs.db
|
|
168
|
+
|
|
169
|
+
# Temporary files
|
|
170
|
+
*.tmp
|
|
171
|
+
*.bak
|
|
172
|
+
*.swp
|
|
173
|
+
*.swo
|
|
174
|
+
*~
|
|
175
|
+
|
|
176
|
+
# Local test files
|
|
177
|
+
test_*.py
|
|
178
|
+
*_test.py
|
|
179
|
+
config.yaml
|
|
180
|
+
|
|
181
|
+
# Environment files with secrets
|
|
182
|
+
.env
|
|
183
|
+
*.env
|
|
184
|
+
config.yaml
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Visual-Graph RAG Team
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Include documentation
|
|
2
|
+
include README.md
|
|
3
|
+
include LICENSE
|
|
4
|
+
include CHANGELOG.md
|
|
5
|
+
|
|
6
|
+
# Include configuration files
|
|
7
|
+
include pyproject.toml
|
|
8
|
+
include requirements.txt
|
|
9
|
+
include requirements-macos.txt
|
|
10
|
+
|
|
11
|
+
# Include the docs directory (Markdown, reST and plain-text files)
|
|
12
|
+
recursive-include docs *.md *.rst *.txt
|
|
13
|
+
|
|
14
|
+
# Exclude unnecessary files
|
|
15
|
+
global-exclude *.py[cod] __pycache__ *.so *.dylib .DS_Store
|
|
16
|
+
prune test*
|
|
17
|
+
prune example*
|
|
18
|
+
prune .git*
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: deeplightrag
|
|
3
|
+
Version: 1.0.6
|
|
4
|
+
Summary: DeepLightRAG: High-performance Document Indexing and Retrieval System (use with any LLM)
|
|
5
|
+
Author-email: Phuong Nguyen <nhphuong.code@gmail.com>
|
|
6
|
+
Maintainer-email: Phuong Nguyen <nhphuong.code@gmail.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/png261/DeepLightRag
|
|
9
|
+
Project-URL: Repository, https://github.com/png261/DeepLightRag
|
|
10
|
+
Project-URL: Bug Tracker, https://github.com/png261/DeepLightRag/issues
|
|
11
|
+
Project-URL: Changelog, https://github.com/png261/DeepLightRag/releases
|
|
12
|
+
Keywords: rag,retrieval,augmented,generation,ocr,vision,graph,nlp,llm,deepseek,document-processing
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Environment :: Console
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
|
17
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
18
|
+
Classifier: Operating System :: OS Independent
|
|
19
|
+
Classifier: Programming Language :: Python
|
|
20
|
+
Classifier: Programming Language :: Python :: 3
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
25
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
26
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
27
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
28
|
+
Classifier: Topic :: Text Processing
|
|
29
|
+
Classifier: Typing :: Typed
|
|
30
|
+
Requires-Python: >=3.9
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
License-File: LICENSE
|
|
33
|
+
Requires-Dist: numpy>=1.24.0
|
|
34
|
+
Requires-Dist: networkx>=3.0
|
|
35
|
+
Requires-Dist: Pillow>=10.0.0
|
|
36
|
+
Requires-Dist: PyYAML>=6.0
|
|
37
|
+
Requires-Dist: tqdm>=4.65.0
|
|
38
|
+
Requires-Dist: typing-extensions>=4.0.0; python_version < "3.10"
|
|
39
|
+
Requires-Dist: pdf2image>=1.16.0
|
|
40
|
+
Requires-Dist: PyMuPDF>=1.23.0
|
|
41
|
+
Requires-Dist: easyocr>=1.7.0
|
|
42
|
+
Requires-Dist: torch>=2.0.0
|
|
43
|
+
Requires-Dist: torchvision>=0.15.0
|
|
44
|
+
Requires-Dist: transformers>=4.40.0
|
|
45
|
+
Requires-Dist: accelerate>=0.24.0
|
|
46
|
+
Requires-Dist: sentence-transformers>=2.2.0
|
|
47
|
+
Requires-Dist: setfit>=1.0.0
|
|
48
|
+
Requires-Dist: gliner>=0.1.12
|
|
49
|
+
Requires-Dist: faiss-cpu>=1.7.4
|
|
50
|
+
Requires-Dist: toon-python>=0.1.2
|
|
51
|
+
Provides-Extra: gpu
|
|
52
|
+
Requires-Dist: bitsandbytes>=0.41.0; extra == "gpu"
|
|
53
|
+
Provides-Extra: macos
|
|
54
|
+
Requires-Dist: mlx>=0.21.0; extra == "macos"
|
|
55
|
+
Requires-Dist: mlx-lm>=0.19.0; extra == "macos"
|
|
56
|
+
Requires-Dist: mlx-vlm>=0.0.3; extra == "macos"
|
|
57
|
+
Provides-Extra: llm
|
|
58
|
+
Requires-Dist: google-generativeai>=0.3.0; extra == "llm"
|
|
59
|
+
Requires-Dist: openai>=1.0.0; extra == "llm"
|
|
60
|
+
Requires-Dist: anthropic>=0.25.0; extra == "llm"
|
|
61
|
+
Provides-Extra: advanced-re
|
|
62
|
+
Requires-Dist: opennre>=1.1.0; extra == "advanced-re"
|
|
63
|
+
Provides-Extra: web
|
|
64
|
+
Requires-Dist: streamlit>=1.30.0; extra == "web"
|
|
65
|
+
Requires-Dist: plotly>=5.18.0; extra == "web"
|
|
66
|
+
Requires-Dist: pandas>=2.0.0; extra == "web"
|
|
67
|
+
Provides-Extra: dev
|
|
68
|
+
Requires-Dist: pytest>=7.4.0; extra == "dev"
|
|
69
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
70
|
+
Requires-Dist: pytest-xdist>=3.3.0; extra == "dev"
|
|
71
|
+
Requires-Dist: pytest-timeout>=2.1.0; extra == "dev"
|
|
72
|
+
Requires-Dist: pytest-mock>=3.11.0; extra == "dev"
|
|
73
|
+
Requires-Dist: black>=23.9.0; extra == "dev"
|
|
74
|
+
Requires-Dist: ruff>=0.0.290; extra == "dev"
|
|
75
|
+
Requires-Dist: mypy>=1.5.0; extra == "dev"
|
|
76
|
+
Requires-Dist: pre-commit>=3.3.0; extra == "dev"
|
|
77
|
+
Requires-Dist: build>=0.10.0; extra == "dev"
|
|
78
|
+
Requires-Dist: twine>=4.0.0; extra == "dev"
|
|
79
|
+
Requires-Dist: wheel>=0.41.0; extra == "dev"
|
|
80
|
+
Provides-Extra: docs
|
|
81
|
+
Requires-Dist: sphinx>=7.1.0; extra == "docs"
|
|
82
|
+
Requires-Dist: sphinx-rtd-theme>=1.3.0; extra == "docs"
|
|
83
|
+
Requires-Dist: sphinx-autodoc-typehints>=1.24.0; extra == "docs"
|
|
84
|
+
Requires-Dist: myst-parser>=1.0.0; extra == "docs"
|
|
85
|
+
Provides-Extra: all
|
|
86
|
+
Requires-Dist: deeplightrag[advanced-re,dev,docs,gpu,llm,web]; extra == "all"
|
|
87
|
+
Dynamic: license-file
|
|
88
|
+
|
|
89
|
+
# DeepLightRAG
|
|
90
|
+
|
|
91
|
+
DeepLightRAG is a high-performance document indexing and retrieval system designed to work with any Large Language Model (LLM). It features a dual-layer graph architecture (Visual-Spatial and Entity-Relationship) to provide context-aware and visually-grounded retrieval.
|
|
92
|
+
|
|
93
|
+
## Features
|
|
94
|
+
|
|
95
|
+
- **Dual-Layer Graph**: Combines visual layout awareness with semantic entity relationships.
|
|
96
|
+
- **Visual-Grounded Retrieval**: Retrieves not just text, but visual regions and their spatial context.
|
|
97
|
+
- **Robust OCR**: Integrates DeepSeek-OCR with an EasyOCR fallback for reliable text extraction.
|
|
98
|
+
- **Advanced NER**: Uses GLiNER for zero-shot entity recognition.
|
|
99
|
+
- **Flexible LLM Support**: Compatible with OpenAI, Google Gemini, Anthropic, and local LLMs via MLX/Ollama.
|
|
100
|
+
|
|
101
|
+
## Installation
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
pip install deeplightrag
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Usage
|
|
108
|
+
|
|
109
|
+
Index a document:
|
|
110
|
+
```bash
|
|
111
|
+
deeplightrag index document.pdf
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Query the index:
|
|
115
|
+
```bash
|
|
116
|
+
deeplightrag query "What is the main topic?"
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## License
|
|
120
|
+
|
|
121
|
+
MIT License
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# DeepLightRAG
|
|
2
|
+
|
|
3
|
+
DeepLightRAG is a high-performance document indexing and retrieval system designed to work with any Large Language Model (LLM). It features a dual-layer graph architecture (Visual-Spatial and Entity-Relationship) to provide context-aware and visually-grounded retrieval.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Dual-Layer Graph**: Combines visual layout awareness with semantic entity relationships.
|
|
8
|
+
- **Visual-Grounded Retrieval**: Retrieves not just text, but visual regions and their spatial context.
|
|
9
|
+
- **Robust OCR**: Integrates DeepSeek-OCR with an EasyOCR fallback for reliable text extraction.
|
|
10
|
+
- **Advanced NER**: Uses GLiNER for zero-shot entity recognition.
|
|
11
|
+
- **Flexible LLM Support**: Compatible with OpenAI, Google Gemini, Anthropic, and local LLMs via MLX/Ollama.
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install deeplightrag
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Usage
|
|
20
|
+
|
|
21
|
+
Index a document:
|
|
22
|
+
```bash
|
|
23
|
+
deeplightrag index document.pdf
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Query the index:
|
|
27
|
+
```bash
|
|
28
|
+
deeplightrag query "What is the main topic?"
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## License
|
|
32
|
+
|
|
33
|
+
MIT License
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
DeepLightRAG CLI Interface
|
|
4
|
+
Main entry point for the system
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import argparse
|
|
8
|
+
import sys
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
import yaml
|
|
11
|
+
import json
|
|
12
|
+
|
|
13
|
+
from src.deeplightrag import DeepLightRAG
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def load_config(config_path: str = "config.yaml") -> dict:
    """Load configuration from a YAML file, with automatic Kaggle detection.

    When the process appears to run on Kaggle (a ``/kaggle`` directory exists,
    ``KAGGLE_KERNEL_RUN_TYPE`` is set, or the working directory contains
    ``/kaggle/``) and ``config_kaggle.yaml`` exists in the working directory,
    that file is loaded instead of ``config_path``.

    If ``torch`` is importable and reports CUDA as available, the ``device``
    entry of the ``ocr``, ``ner`` and ``relation_extraction`` sections is set
    to ``"cuda"`` for every one of those sections that is present as a mapping.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        The parsed configuration, or an empty dict when the file does not
        exist or contains no YAML document.
    """
    import os

    # Auto-detect Kaggle environment and use the Kaggle-specific config
    is_kaggle = (
        os.path.exists("/kaggle")
        or "KAGGLE_KERNEL_RUN_TYPE" in os.environ
        or "/kaggle/" in os.getcwd()
    )

    if is_kaggle and Path("config_kaggle.yaml").exists():
        config_path = "config_kaggle.yaml"
        print(f"🔍 Detected Kaggle environment, using {config_path}")

    if not Path(config_path).exists():
        return {}

    with open(config_path, "r", encoding="utf-8") as f:
        # safe_load returns None for an empty file; normalise that to a dict
        # so callers always receive a mapping, as the return annotation says.
        config = yaml.safe_load(f) or {}

    # Auto-configure GPU settings (best effort: torch is optional here)
    try:
        import torch
    except ImportError:
        return config

    if torch.cuda.is_available():
        print(f"🎮 GPU detected: {torch.cuda.get_device_name(0)}")
        if isinstance(config, dict):
            for section in ("ocr", "ner", "relation_extraction"):
                # Only touch sections that are real mappings; a bare
                # "ocr:" key parses to None and cannot take a device entry.
                if isinstance(config.get(section), dict):
                    config[section]["device"] = "cuda"

    return config
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def cmd_index(args, rag_system):
    """Index a PDF document and optionally dump the results as JSON.

    Args:
        args: Parsed CLI arguments; reads ``pdf``, ``doc_id``, ``no_save``
            and ``output``.
        rag_system: Object exposing ``index_document(path, document_id=...,
            save_to_disk=...)``.

    Returns:
        Whatever ``rag_system.index_document`` returned.
    """
    print(f"Indexing document: {args.pdf}")
    results = rag_system.index_document(
        args.pdf, document_id=args.doc_id, save_to_disk=not args.no_save
    )

    if args.output:
        with open(args.output, "w") as f:
            # default=str matches cmd_query and the "!stats" output: values
            # that are not JSON-native are written as strings instead of
            # aborting the dump.
            json.dump(results, f, indent=2, default=str)
        print(f"\nResults saved to {args.output}")

    return results
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def cmd_query(args, rag_system):
    """Run a single retrieval and print the context with its metadata.

    Args:
        args: Parsed CLI arguments; reads ``load_doc``, ``question``,
            ``level`` and ``output``.
        rag_system: Object exposing ``load_document`` and
            ``retrieve(question, override_level=...)``.

    Returns:
        The mapping returned by ``rag_system.retrieve``.
    """
    if args.load_doc:
        rag_system.load_document(args.load_doc)

    result = rag_system.retrieve(args.question, override_level=args.level)

    rule = "=" * 60

    def field(key):
        # Metadata entries missing from the result are shown as "N/A".
        return result.get(key, "N/A")

    print("\n" + rule)
    print("RETRIEVED CONTEXT")
    print(rule)
    print("\n" + f"{result.get('context', 'No context')}" + "\n")
    print(rule)
    print(f"Query Level: {field('level_name')} (Level {field('query_level')})")
    print(f"Tokens Used: {field('tokens_used')}/{field('token_budget')}")
    print(f"Entities Found: {field('entities_found')}")
    print(f"Nodes Retrieved: {field('nodes_retrieved')}")
    print(f"Retrieval Time: {field('retrieval_time')}")
    print("\n💡 Use this context with your own LLM for generation!")

    destination = args.output
    if destination:
        with open(destination, "w") as handle:
            json.dump(result, handle, indent=2, default=str)
        print(f"\nResults saved to {destination}")

    return result
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def cmd_interactive(args, rag_system):
    """Run an interactive question loop against the retrieval system.

    Each non-empty line is sent to ``rag_system.retrieve`` and a preview of
    the returned context (first 500 characters) is printed together with
    summary metadata. Lines starting with ``!`` are commands:

    * ``!stats`` prints ``rag_system.get_statistics()`` as JSON.
    * ``!level N`` sets the query-level override to the integer ``N``;
      ``!level`` with any other number of arguments clears the override.

    The loop ends on ``quit``/``exit``/``q``, on Ctrl-C, or when standard
    input is exhausted (Ctrl-D or end of piped input).

    Args:
        args: Parsed CLI arguments; reads ``load_doc``.
        rag_system: Object exposing ``load_document``, ``retrieve`` and
            ``get_statistics``.
    """
    if args.load_doc:
        rag_system.load_document(args.load_doc)

    print("\n" + "=" * 60)
    print("DeepLightRAG Interactive Retrieval Mode")
    print("=" * 60)
    print("Type your questions (or 'quit' to exit)")
    print("Commands: !stats, !level N")
    print("💡 Context retrieved - use with your own LLM for generation")
    print()

    override_level = None

    while True:
        try:
            question = input("\nQuestion: ").strip()

            if not question:
                continue

            if question.lower() in ["quit", "exit", "q"]:
                print("Goodbye!")
                break

            # Handle commands
            if question.startswith("!"):
                if question == "!stats":
                    stats = rag_system.get_statistics()
                    print(json.dumps(stats, indent=2, default=str))
                elif question.startswith("!level"):
                    parts = question.split()
                    if len(parts) == 2:
                        try:
                            override_level = int(parts[1])
                        except ValueError:
                            # Keep the previous override on bad input.
                            print(f"Invalid level {parts[1]!r}: expected an integer")
                        else:
                            print(f"Query level override set to {override_level}")
                    else:
                        override_level = None
                        print("Query level override cleared")
                else:
                    print("Unknown command")
                continue

            # Regular retrieval
            result = rag_system.retrieve(question, override_level=override_level)

            print(f"\n{'='*60}")
            print("CONTEXT")
            print(f"{'='*60}")
            print(f"{result.get('context', 'No context')[:500]}...")
            print(f"{'='*60}")
            print(
                f"Level: {result.get('level_name', 'N/A')} ({result.get('query_level', 'N/A')}) | "
                f"Tokens: {result.get('tokens_used', 'N/A')}/{result.get('token_budget', 'N/A')} | "
                f"Entities: {result.get('entities_found', 'N/A')}"
            )

        except KeyboardInterrupt:
            print("\n\nInterrupted. Goodbye!")
            break
        except EOFError:
            # input() raises EOFError once stdin is exhausted. Without this
            # handler the generic one below would catch it and the loop
            # would spin forever printing the same error.
            print("\n\nEnd of input. Goodbye!")
            break
        except Exception as e:
            # Best effort: report the failure and keep the session alive.
            print(f"Error: {e}")
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def main(argv=None):
    """Parse command-line arguments and run the selected sub-command.

    Sub-commands are ``index``, ``query`` and ``interactive``. When no
    sub-command is given, the help text is printed and the process exits
    with status 1.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, so existing
            ``main()`` callers are unaffected.
    """
    parser = argparse.ArgumentParser(
        description="DeepLightRAG: Efficient Document-based RAG System"
    )
    parser.add_argument("--config", default="config.yaml", help="Path to configuration file")
    parser.add_argument(
        "--storage", default="./deeplightrag_data", help="Storage directory for graphs and indices"
    )

    subparsers = parser.add_subparsers(dest="command", help="Command to execute")

    # Index command
    index_parser = subparsers.add_parser("index", help="Index a PDF document")
    index_parser.add_argument("pdf", help="Path to PDF file")
    index_parser.add_argument("--doc-id", help="Document identifier")
    index_parser.add_argument("--no-save", action="store_true", help="Don't save to disk")
    index_parser.add_argument("--output", help="Save results to JSON file")

    # Query command
    query_parser = subparsers.add_parser("query", help="Retrieve context for a query")
    query_parser.add_argument("question", help="Question to ask")
    query_parser.add_argument("--load-doc", help="Load specific document")
    query_parser.add_argument("--level", type=int, help="Override query level (1-5)")
    query_parser.add_argument("--output", help="Save results to JSON file")

    # Interactive command
    interactive_parser = subparsers.add_parser("interactive", help="Interactive retrieval mode")
    interactive_parser.add_argument("--load-doc", help="Load specific document")

    args = parser.parse_args(argv)

    if not args.command:
        parser.print_help()
        sys.exit(1)

    # Load configuration
    config = load_config(args.config)

    # Initialize system
    rag_system = DeepLightRAG(config=config, storage_dir=args.storage)

    # Execute command; argparse only accepts the sub-commands registered
    # above, so the lookup cannot miss.
    handlers = {
        "index": cmd_index,
        "query": cmd_query,
        "interactive": cmd_interactive,
    }
    handlers[args.command](args, rag_system)


if __name__ == "__main__":
    main()
|