qdrant-hybrid-pipeline 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- qdrant_hybrid_pipeline-0.1.1/.gitignore +174 -0
- qdrant_hybrid_pipeline-0.1.1/LICENSE +21 -0
- qdrant_hybrid_pipeline-0.1.1/PKG-INFO +220 -0
- qdrant_hybrid_pipeline-0.1.1/README.md +162 -0
- qdrant_hybrid_pipeline-0.1.1/pyproject.toml +59 -0
- qdrant_hybrid_pipeline-0.1.1/src/hybrid_search/__init__.py +15 -0
- qdrant_hybrid_pipeline-0.1.1/src/hybrid_search/hybrid_pipeline.py +372 -0
- qdrant_hybrid_pipeline-0.1.1/src/hybrid_search/hybrid_pipeline_config.py +261 -0
@@ -0,0 +1,174 @@
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
2
|
+
__pycache__/
|
3
|
+
*.py[cod]
|
4
|
+
*$py.class
|
5
|
+
|
6
|
+
# C extensions
|
7
|
+
*.so
|
8
|
+
|
9
|
+
# Distribution / packaging
|
10
|
+
.Python
|
11
|
+
build/
|
12
|
+
develop-eggs/
|
13
|
+
dist/
|
14
|
+
downloads/
|
15
|
+
eggs/
|
16
|
+
.eggs/
|
17
|
+
lib/
|
18
|
+
lib64/
|
19
|
+
parts/
|
20
|
+
sdist/
|
21
|
+
var/
|
22
|
+
wheels/
|
23
|
+
share/python-wheels/
|
24
|
+
*.egg-info/
|
25
|
+
.installed.cfg
|
26
|
+
*.egg
|
27
|
+
MANIFEST
|
28
|
+
|
29
|
+
# PyInstaller
|
30
|
+
# Usually these files are written by a python script from a template
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
32
|
+
*.manifest
|
33
|
+
*.spec
|
34
|
+
|
35
|
+
# Installer logs
|
36
|
+
pip-log.txt
|
37
|
+
pip-delete-this-directory.txt
|
38
|
+
|
39
|
+
# Unit test / coverage reports
|
40
|
+
htmlcov/
|
41
|
+
.tox/
|
42
|
+
.nox/
|
43
|
+
.coverage
|
44
|
+
.coverage.*
|
45
|
+
.cache
|
46
|
+
nosetests.xml
|
47
|
+
coverage.xml
|
48
|
+
*.cover
|
49
|
+
*.py,cover
|
50
|
+
.hypothesis/
|
51
|
+
.pytest_cache/
|
52
|
+
cover/
|
53
|
+
|
54
|
+
# Translations
|
55
|
+
*.mo
|
56
|
+
*.pot
|
57
|
+
|
58
|
+
# Django stuff:
|
59
|
+
*.log
|
60
|
+
local_settings.py
|
61
|
+
db.sqlite3
|
62
|
+
db.sqlite3-journal
|
63
|
+
|
64
|
+
# Flask stuff:
|
65
|
+
instance/
|
66
|
+
.webassets-cache
|
67
|
+
|
68
|
+
# Scrapy stuff:
|
69
|
+
.scrapy
|
70
|
+
|
71
|
+
# Sphinx documentation
|
72
|
+
docs/_build/
|
73
|
+
|
74
|
+
# PyBuilder
|
75
|
+
.pybuilder/
|
76
|
+
target/
|
77
|
+
|
78
|
+
# Jupyter Notebook
|
79
|
+
.ipynb_checkpoints
|
80
|
+
|
81
|
+
# IPython
|
82
|
+
profile_default/
|
83
|
+
ipython_config.py
|
84
|
+
|
85
|
+
# pyenv
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
88
|
+
# .python-version
|
89
|
+
|
90
|
+
# pipenv
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94
|
+
# install all needed dependencies.
|
95
|
+
#Pipfile.lock
|
96
|
+
|
97
|
+
# UV
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
100
|
+
# commonly ignored for libraries.
|
101
|
+
#uv.lock
|
102
|
+
|
103
|
+
# poetry
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
106
|
+
# commonly ignored for libraries.
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
108
|
+
#poetry.lock
|
109
|
+
|
110
|
+
# pdm
|
111
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
112
|
+
#pdm.lock
|
113
|
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
114
|
+
# in version control.
|
115
|
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
116
|
+
.pdm.toml
|
117
|
+
.pdm-python
|
118
|
+
.pdm-build/
|
119
|
+
|
120
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
121
|
+
__pypackages__/
|
122
|
+
|
123
|
+
# Celery stuff
|
124
|
+
celerybeat-schedule
|
125
|
+
celerybeat.pid
|
126
|
+
|
127
|
+
# SageMath parsed files
|
128
|
+
*.sage.py
|
129
|
+
|
130
|
+
# Environments
|
131
|
+
.env
|
132
|
+
.venv
|
133
|
+
env/
|
134
|
+
venv/
|
135
|
+
ENV/
|
136
|
+
env.bak/
|
137
|
+
venv.bak/
|
138
|
+
|
139
|
+
# Spyder project settings
|
140
|
+
.spyderproject
|
141
|
+
.spyproject
|
142
|
+
|
143
|
+
# Rope project settings
|
144
|
+
.ropeproject
|
145
|
+
|
146
|
+
# mkdocs documentation
|
147
|
+
/site
|
148
|
+
|
149
|
+
# mypy
|
150
|
+
.mypy_cache/
|
151
|
+
.dmypy.json
|
152
|
+
dmypy.json
|
153
|
+
|
154
|
+
# Pyre type checker
|
155
|
+
.pyre/
|
156
|
+
|
157
|
+
# pytype static type analyzer
|
158
|
+
.pytype/
|
159
|
+
|
160
|
+
# Cython debug symbols
|
161
|
+
cython_debug/
|
162
|
+
|
163
|
+
# PyCharm
|
164
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
165
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
166
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
167
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
168
|
+
#.idea/
|
169
|
+
|
170
|
+
# Ruff stuff:
|
171
|
+
.ruff_cache/
|
172
|
+
|
173
|
+
# PyPI configuration file
|
174
|
+
.pypirc
|
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2025 Data Parthenon
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
@@ -0,0 +1,220 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: qdrant-hybrid-pipeline
|
3
|
+
Version: 0.1.1
|
4
|
+
Summary: Configurable Hybrid Search Pipeline with Qdrant and FastEmbed
|
5
|
+
Author-email: Brian O'Grady <genesysdatallc@gmail.com>
|
6
|
+
Maintainer-email: Brian O'Grady <genesysdatallc@gmail.com>
|
7
|
+
License: MIT License
|
8
|
+
|
9
|
+
Copyright (c) 2025 Data Parthenon
|
10
|
+
|
11
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
12
|
+
of this software and associated documentation files (the "Software"), to deal
|
13
|
+
in the Software without restriction, including without limitation the rights
|
14
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
15
|
+
copies of the Software, and to permit persons to whom the Software is
|
16
|
+
furnished to do so, subject to the following conditions:
|
17
|
+
|
18
|
+
The above copyright notice and this permission notice shall be included in all
|
19
|
+
copies or substantial portions of the Software.
|
20
|
+
|
21
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
22
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
23
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
24
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
25
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
26
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
27
|
+
SOFTWARE.
|
28
|
+
License-File: LICENSE
|
29
|
+
Keywords: colbert,database,embeddings,late-interaction,qdrant,similarity-search,vector
|
30
|
+
Classifier: Development Status :: 4 - Beta
|
31
|
+
Classifier: Intended Audience :: Developers
|
32
|
+
Classifier: License :: OSI Approved :: MIT License
|
33
|
+
Classifier: Programming Language :: Python :: 3
|
34
|
+
Classifier: Programming Language :: Python :: 3.11
|
35
|
+
Classifier: Topic :: Database
|
36
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
37
|
+
Classifier: Topic :: Software Development :: Libraries
|
38
|
+
Requires-Python: >=3.11
|
39
|
+
Requires-Dist: fastembed>=0.6.1
|
40
|
+
Requires-Dist: pydantic>=2.11.3
|
41
|
+
Requires-Dist: python-dotenv>=1.1.0
|
42
|
+
Requires-Dist: qdrant-client>=1.13.3
|
43
|
+
Requires-Dist: sentence-transformers>=4.1.0
|
44
|
+
Provides-Extra: dev
|
45
|
+
Requires-Dist: build>=1.0.3; extra == 'dev'
|
46
|
+
Requires-Dist: mypy>=1.11.0; extra == 'dev'
|
47
|
+
Requires-Dist: pre-commit>=3.7.0; extra == 'dev'
|
48
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
|
49
|
+
Requires-Dist: pytest-cov>=5.0.0; extra == 'dev'
|
50
|
+
Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
|
51
|
+
Requires-Dist: pytest-xdist>=3.6.0; extra == 'dev'
|
52
|
+
Requires-Dist: pytest>=8.2.0; extra == 'dev'
|
53
|
+
Requires-Dist: ruff<0.10,>=0.9.7; extra == 'dev'
|
54
|
+
Requires-Dist: twine>=4.0.2; extra == 'dev'
|
55
|
+
Provides-Extra: test
|
56
|
+
Requires-Dist: fastembed-hybrid-pipeline[dev]; extra == 'test'
|
57
|
+
Description-Content-Type: text/markdown
|
58
|
+
|
59
|
+
[](https://opensource.org/licenses/MIT)
|
60
|
+
[](https://www.python.org/downloads/)
|
61
|
+
[](https://pypi.org/project/fastembed-hybrid-pipeline/)
|
62
|
+
|
63
|
+
# FastEmbed Hybrid Pipeline
|
64
|
+
|
65
|
+
A configurable hybrid search pipeline for building semantic search applications with [FastEmbed](https://github.com/qdrant/fastembed) and [Qdrant](https://github.com/qdrant/qdrant).
|
66
|
+
|
67
|
+
## Features
|
68
|
+
|
69
|
+
- 🚀 **Hybrid Search**: Combines dense embeddings, sparse embeddings, and late interaction embeddings for superior search performance
|
70
|
+
- 🔧 **Configurable**: Customize embedding models, vector parameters, and multi-tenancy settings
|
71
|
+
- 🔄 **Batch Processing**: Efficiently process and index large document collections
|
72
|
+
- 🏢 **Multi-Tenant Support**: Optional partition-based multi-tenancy for SaaS applications
|
73
|
+
|
74
|
+
## Installation
|
75
|
+
|
76
|
+
```bash
|
77
|
+
pip install qdrant-hybrid-pipeline
|
78
|
+
```
|
79
|
+
|
80
|
+
*Requires Python 3.11+*
|
81
|
+
|
82
|
+
## Quick Start
|
83
|
+
|
84
|
+
```python
|
85
|
+
from qdrant_client import QdrantClient
|
86
|
+
from fastembed import TextEmbedding, SparseEmbedding, LateInteractionTextEmbedding
|
87
|
+
from qdrant_client.models import Distance, VectorParams, SparseVectorParams, KeywordIndexParams
|
88
|
+
from hybrid_search import HybridPipelineConfig, HybridPipeline
|
89
|
+
import uuid
|
90
|
+
|
91
|
+
# Initialize Qdrant client
|
92
|
+
client = QdrantClient(":memory:") # Use a local instance or Qdrant Cloud
|
93
|
+
|
94
|
+
# Configure embedding models
|
95
|
+
text_model = TextEmbedding("BAAI/bge-small-en-v1.5")
|
96
|
+
sparse_model = SparseEmbedding("sentence-transformers/all-MiniLM-L6-v2")
|
97
|
+
late_interaction_model = LateInteractionTextEmbedding("intfloat/e5-small-v2")
|
98
|
+
|
99
|
+
# Configure vector parameters
|
100
|
+
dense_params = VectorParams(size=text_model.dimensions, distance=Distance.COSINE)
|
101
|
+
sparse_params = SparseVectorParams()
|
102
|
+
late_interaction_params = VectorParams(size=late_interaction_model.dimensions, distance=Distance.COSINE)
|
103
|
+
|
104
|
+
# Optional: Configure multi-tenancy
|
105
|
+
partition_field = "tenant_id"
|
106
|
+
partition_index = KeywordIndexParams(minWordLength=1, maxWordLength=100)
|
107
|
+
partition_config = (partition_field, partition_index)
|
108
|
+
|
109
|
+
# Create pipeline configuration
|
110
|
+
pipeline_config = HybridPipelineConfig(
|
111
|
+
text_embedding_config=(text_model, dense_params),
|
112
|
+
sparse_embedding_config=(sparse_model, sparse_params),
|
113
|
+
late_interaction_text_embedding_config=(late_interaction_model, late_interaction_params),
|
114
|
+
partition_config=partition_config, # Optional, for multi-tenant setup
|
115
|
+
multi_tenant=True, # Set to False for single-tenant setup
|
116
|
+
replication_factor=1, # For production, use 2+
|
117
|
+
shard_number=1, # For production, use 3+
|
118
|
+
)
|
119
|
+
|
120
|
+
# Initialize the pipeline
|
121
|
+
pipeline = HybridPipeline(
|
122
|
+
qdrant_client=client,
|
123
|
+
collection_name="documents",
|
124
|
+
hybrid_pipeline_config=pipeline_config,
|
125
|
+
)
|
126
|
+
|
127
|
+
# Index documents
|
128
|
+
documents = [
|
129
|
+
"FastEmbed is a lightweight Python library for state-of-the-art text embeddings.",
|
130
|
+
"Qdrant is a vector database for production-ready vector search.",
|
131
|
+
"Hybrid search combines multiple search techniques for better results."
|
132
|
+
]
|
133
|
+
|
134
|
+
payloads = [
|
135
|
+
{"tenant_id": "acme_corp", "document_type": "library"},
|
136
|
+
{"tenant_id": "acme_corp", "document_type": "database"},
|
137
|
+
{"tenant_id": "acme_corp", "document_type": "technique"}
|
138
|
+
]
|
139
|
+
|
140
|
+
document_ids = [uuid.uuid4() for _ in range(len(documents))]
|
141
|
+
|
142
|
+
# Insert documents
|
143
|
+
pipeline.insert_documents(documents, payloads, document_ids)
|
144
|
+
|
145
|
+
# Search
|
146
|
+
results = pipeline.search(
|
147
|
+
query="Which embedding library should I use?",
|
148
|
+
top_k=3,
|
149
|
+
partition_filter="acme_corp", # Only needed for multi-tenant setups
|
150
|
+
)
|
151
|
+
|
152
|
+
# Process results
|
153
|
+
for result in results:
|
154
|
+
print(f"Score: {result.score}")
|
155
|
+
print(f"Document: {result.payload['document']}")
|
156
|
+
print("-" * 30)
|
157
|
+
```
|
158
|
+
|
159
|
+
## Configuration Options
|
160
|
+
|
161
|
+
### Embedding Models
|
162
|
+
|
163
|
+
The pipeline requires three types of embedding models from FastEmbed:
|
164
|
+
|
165
|
+
1. **Dense Embeddings**: Traditional vector embeddings (TextEmbedding)
|
166
|
+
2. **Sparse Embeddings**: Lexical-focused sparse embeddings (SparseEmbedding)
|
167
|
+
3. **Late Interaction**: Special embeddings for late interaction matching (LateInteractionTextEmbedding)
|
168
|
+
|
169
|
+
### Vector Parameters
|
170
|
+
|
171
|
+
Configure vector parameters for each embedding type:
|
172
|
+
|
173
|
+
- **Dense & Late Interaction**: Size, distance metric (cosine, dot, euclidean)
|
174
|
+
- **Sparse**: Uses default sparse vector parameters
|
175
|
+
|
176
|
+
### Multi-Tenant Configuration
|
177
|
+
|
178
|
+
For SaaS applications that need to separate data by tenant:
|
179
|
+
|
180
|
+
```python
|
181
|
+
# Enable multi-tenancy
|
182
|
+
pipeline_config = HybridPipelineConfig(
|
183
|
+
# ... other configs ...
|
184
|
+
partition_config=("tenant_id", KeywordIndexParams(minWordLength=1, maxWordLength=100)),
|
185
|
+
multi_tenant=True,
|
186
|
+
)
|
187
|
+
|
188
|
+
# When searching, specify the tenant
|
189
|
+
results = pipeline.search(query="my search", partition_filter="tenant_123")
|
190
|
+
```
|
191
|
+
|
192
|
+
### Performance Options
|
193
|
+
|
194
|
+
For production deployments:
|
195
|
+
|
196
|
+
```python
|
197
|
+
pipeline_config = HybridPipelineConfig(
|
198
|
+
# ... other configs ...
|
199
|
+
replication_factor=2, # Data redundancy for high availability
|
200
|
+
shard_number=3, # Data distribution for scalability
|
201
|
+
)
|
202
|
+
```
|
203
|
+
|
204
|
+
## Development
|
205
|
+
|
206
|
+
```bash
|
207
|
+
# Clone the repository
|
208
|
+
git clone https://github.com/your-username/fastembed-hybrid-pipeline.git
|
209
|
+
cd fastembed-hybrid-pipeline
|
210
|
+
|
211
|
+
# Install development dependencies
|
212
|
+
pip install -e ".[dev]"
|
213
|
+
|
214
|
+
# Run tests
|
215
|
+
pytest
|
216
|
+
```
|
217
|
+
|
218
|
+
## License
|
219
|
+
|
220
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
@@ -0,0 +1,162 @@
|
|
1
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
2
|
+
[![Python 3.11+](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
|
3
|
+
[![PyPI version](https://img.shields.io/pypi/v/qdrant-hybrid-pipeline.svg)](https://pypi.org/project/qdrant-hybrid-pipeline/)
|
4
|
+
|
5
|
+
# FastEmbed Hybrid Pipeline
|
6
|
+
|
7
|
+
A configurable hybrid search pipeline for building semantic search applications with [FastEmbed](https://github.com/qdrant/fastembed) and [Qdrant](https://github.com/qdrant/qdrant).
|
8
|
+
|
9
|
+
## Features
|
10
|
+
|
11
|
+
- 🚀 **Hybrid Search**: Combines dense embeddings, sparse embeddings, and late interaction embeddings for superior search performance
|
12
|
+
- 🔧 **Configurable**: Customize embedding models, vector parameters, and multi-tenancy settings
|
13
|
+
- 🔄 **Batch Processing**: Efficiently process and index large document collections
|
14
|
+
- 🏢 **Multi-Tenant Support**: Optional partition-based multi-tenancy for SaaS applications
|
15
|
+
|
16
|
+
## Installation
|
17
|
+
|
18
|
+
```bash
|
19
|
+
pip install qdrant-hybrid-pipeline
|
20
|
+
```
|
21
|
+
|
22
|
+
*Requires Python 3.11+*
|
23
|
+
|
24
|
+
## Quick Start
|
25
|
+
|
26
|
+
```python
|
27
|
+
from qdrant_client import QdrantClient
|
28
|
+
from fastembed import TextEmbedding, SparseEmbedding, LateInteractionTextEmbedding
|
29
|
+
from qdrant_client.models import Distance, VectorParams, SparseVectorParams, KeywordIndexParams
|
30
|
+
from hybrid_search import HybridPipelineConfig, HybridPipeline
|
31
|
+
import uuid
|
32
|
+
|
33
|
+
# Initialize Qdrant client
|
34
|
+
client = QdrantClient(":memory:") # Use a local instance or Qdrant Cloud
|
35
|
+
|
36
|
+
# Configure embedding models
|
37
|
+
text_model = TextEmbedding("BAAI/bge-small-en-v1.5")
|
38
|
+
sparse_model = SparseEmbedding("sentence-transformers/all-MiniLM-L6-v2")
|
39
|
+
late_interaction_model = LateInteractionTextEmbedding("intfloat/e5-small-v2")
|
40
|
+
|
41
|
+
# Configure vector parameters
|
42
|
+
dense_params = VectorParams(size=text_model.dimensions, distance=Distance.COSINE)
|
43
|
+
sparse_params = SparseVectorParams()
|
44
|
+
late_interaction_params = VectorParams(size=late_interaction_model.dimensions, distance=Distance.COSINE)
|
45
|
+
|
46
|
+
# Optional: Configure multi-tenancy
|
47
|
+
partition_field = "tenant_id"
|
48
|
+
partition_index = KeywordIndexParams(minWordLength=1, maxWordLength=100)
|
49
|
+
partition_config = (partition_field, partition_index)
|
50
|
+
|
51
|
+
# Create pipeline configuration
|
52
|
+
pipeline_config = HybridPipelineConfig(
|
53
|
+
text_embedding_config=(text_model, dense_params),
|
54
|
+
sparse_embedding_config=(sparse_model, sparse_params),
|
55
|
+
late_interaction_text_embedding_config=(late_interaction_model, late_interaction_params),
|
56
|
+
partition_config=partition_config, # Optional, for multi-tenant setup
|
57
|
+
multi_tenant=True, # Set to False for single-tenant setup
|
58
|
+
replication_factor=1, # For production, use 2+
|
59
|
+
shard_number=1, # For production, use 3+
|
60
|
+
)
|
61
|
+
|
62
|
+
# Initialize the pipeline
|
63
|
+
pipeline = HybridPipeline(
|
64
|
+
qdrant_client=client,
|
65
|
+
collection_name="documents",
|
66
|
+
hybrid_pipeline_config=pipeline_config,
|
67
|
+
)
|
68
|
+
|
69
|
+
# Index documents
|
70
|
+
documents = [
|
71
|
+
"FastEmbed is a lightweight Python library for state-of-the-art text embeddings.",
|
72
|
+
"Qdrant is a vector database for production-ready vector search.",
|
73
|
+
"Hybrid search combines multiple search techniques for better results."
|
74
|
+
]
|
75
|
+
|
76
|
+
payloads = [
|
77
|
+
{"tenant_id": "acme_corp", "document_type": "library"},
|
78
|
+
{"tenant_id": "acme_corp", "document_type": "database"},
|
79
|
+
{"tenant_id": "acme_corp", "document_type": "technique"}
|
80
|
+
]
|
81
|
+
|
82
|
+
document_ids = [uuid.uuid4() for _ in range(len(documents))]
|
83
|
+
|
84
|
+
# Insert documents
|
85
|
+
pipeline.insert_documents(documents, payloads, document_ids)
|
86
|
+
|
87
|
+
# Search
|
88
|
+
results = pipeline.search(
|
89
|
+
query="Which embedding library should I use?",
|
90
|
+
top_k=3,
|
91
|
+
partition_filter="acme_corp", # Only needed for multi-tenant setups
|
92
|
+
)
|
93
|
+
|
94
|
+
# Process results
|
95
|
+
for result in results:
|
96
|
+
print(f"Score: {result.score}")
|
97
|
+
print(f"Document: {result.payload['document']}")
|
98
|
+
print("-" * 30)
|
99
|
+
```
|
100
|
+
|
101
|
+
## Configuration Options
|
102
|
+
|
103
|
+
### Embedding Models
|
104
|
+
|
105
|
+
The pipeline requires three types of embedding models from FastEmbed:
|
106
|
+
|
107
|
+
1. **Dense Embeddings**: Traditional vector embeddings (TextEmbedding)
|
108
|
+
2. **Sparse Embeddings**: Lexical-focused sparse embeddings (SparseEmbedding)
|
109
|
+
3. **Late Interaction**: Special embeddings for late interaction matching (LateInteractionTextEmbedding)
|
110
|
+
|
111
|
+
### Vector Parameters
|
112
|
+
|
113
|
+
Configure vector parameters for each embedding type:
|
114
|
+
|
115
|
+
- **Dense & Late Interaction**: Size, distance metric (cosine, dot, euclidean)
|
116
|
+
- **Sparse**: Uses default sparse vector parameters
|
117
|
+
|
118
|
+
### Multi-Tenant Configuration
|
119
|
+
|
120
|
+
For SaaS applications that need to separate data by tenant:
|
121
|
+
|
122
|
+
```python
|
123
|
+
# Enable multi-tenancy
|
124
|
+
pipeline_config = HybridPipelineConfig(
|
125
|
+
# ... other configs ...
|
126
|
+
partition_config=("tenant_id", KeywordIndexParams(minWordLength=1, maxWordLength=100)),
|
127
|
+
multi_tenant=True,
|
128
|
+
)
|
129
|
+
|
130
|
+
# When searching, specify the tenant
|
131
|
+
results = pipeline.search(query="my search", partition_filter="tenant_123")
|
132
|
+
```
|
133
|
+
|
134
|
+
### Performance Options
|
135
|
+
|
136
|
+
For production deployments:
|
137
|
+
|
138
|
+
```python
|
139
|
+
pipeline_config = HybridPipelineConfig(
|
140
|
+
# ... other configs ...
|
141
|
+
replication_factor=2, # Data redundancy for high availability
|
142
|
+
shard_number=3, # Data distribution for scalability
|
143
|
+
)
|
144
|
+
```
|
145
|
+
|
146
|
+
## Development
|
147
|
+
|
148
|
+
```bash
|
149
|
+
# Clone the repository
|
150
|
+
git clone https://github.com/your-username/fastembed-hybrid-pipeline.git
|
151
|
+
cd fastembed-hybrid-pipeline
|
152
|
+
|
153
|
+
# Install development dependencies
|
154
|
+
pip install -e ".[dev]"
|
155
|
+
|
156
|
+
# Run tests
|
157
|
+
pytest
|
158
|
+
```
|
159
|
+
|
160
|
+
## License
|
161
|
+
|
162
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
@@ -0,0 +1,59 @@
|
|
1
|
+
[build-system]
|
2
|
+
requires = ["hatchling"]
|
3
|
+
build-backend = "hatchling.build"
|
4
|
+
|
5
|
+
[project]
|
6
|
+
name = "qdrant-hybrid-pipeline"
|
7
|
+
version = "0.1.1"
|
8
|
+
description = "Configurable Hybrid Search Pipeline with Qdrant and FastEmbed"
|
9
|
+
readme = "README.md"
|
10
|
+
requires-python = ">=3.11"
|
11
|
+
license = { file = "LICENSE" }
|
12
|
+
authors = [
|
13
|
+
{ name = "Brian O'Grady", email = "genesysdatallc@gmail.com" }
|
14
|
+
]
|
15
|
+
maintainers = [
|
16
|
+
{ name = "Brian O'Grady", email = "genesysdatallc@gmail.com" }
|
17
|
+
]
|
18
|
+
keywords = ["qdrant", "vector", "database", "embeddings", "similarity-search", "colbert", "late-interaction"]
|
19
|
+
classifiers = [
|
20
|
+
"Development Status :: 4 - Beta",
|
21
|
+
"Intended Audience :: Developers",
|
22
|
+
"License :: OSI Approved :: MIT License",
|
23
|
+
"Programming Language :: Python :: 3",
|
24
|
+
"Programming Language :: Python :: 3.11",
|
25
|
+
"Topic :: Database",
|
26
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
27
|
+
"Topic :: Software Development :: Libraries",
|
28
|
+
]
|
29
|
+
dependencies = [
|
30
|
+
"fastembed>=0.6.1",
|
31
|
+
"pydantic>=2.11.3",
|
32
|
+
"python-dotenv>=1.1.0",
|
33
|
+
"qdrant-client>=1.13.3",
|
34
|
+
"sentence-transformers>=4.1.0",
|
35
|
+
]
|
36
|
+
|
37
|
+
[project.optional-dependencies]
|
38
|
+
dev = [
|
39
|
+
"pytest>=8.2.0",
|
40
|
+
"pytest-cov>=5.0.0",
|
41
|
+
"pytest-asyncio>=0.23.0",
|
42
|
+
"pytest-mock>=3.14.0",
|
43
|
+
"pytest-xdist>=3.6.0",
|
44
|
+
"mypy>=1.11.0",
|
45
|
+
"ruff>=0.9.7,<0.10",
|
46
|
+
"pre-commit>=3.7.0",
|
47
|
+
"build>=1.0.3",
|
48
|
+
"twine>=4.0.2",
|
49
|
+
]
|
50
|
+
|
51
|
+
test = ["qdrant-hybrid-pipeline[dev]"]
|
52
|
+
|
53
|
+
[tool.hatch.build.targets.wheel]
|
54
|
+
packages = ["src/hybrid_search/"]
|
55
|
+
|
56
|
+
[tool.hatch.build]
|
57
|
+
include = [
|
58
|
+
"src/hybrid_search/**/*.py",
|
59
|
+
]
|
@@ -0,0 +1,15 @@
|
|
1
|
+
"""
|
2
|
+
Hybrid Search module for vector search combining dense, sparse, and late interaction embeddings.
|
3
|
+
|
4
|
+
This module provides components for creating and managing hybrid search pipelines
|
5
|
+
that leverage multiple embedding types for improved search performance.
|
6
|
+
"""
|
7
|
+
|
8
|
+
from .hybrid_pipeline import HybridPipeline
|
9
|
+
from .hybrid_pipeline_config import HybridPipelineConfig, SentenceTransformerEmbedding
|
10
|
+
|
11
|
+
__all__ = [
|
12
|
+
"HybridPipeline",
|
13
|
+
"HybridPipelineConfig",
|
14
|
+
"SentenceTransformerEmbedding",
|
15
|
+
]
|
@@ -0,0 +1,372 @@
|
|
1
|
+
"""
|
2
|
+
Hybrid Pipeline module for vector search combining dense, sparse, and late interaction embeddings.
|
3
|
+
|
4
|
+
This module provides the implementation of the hybrid search pipeline that leverages multiple
|
5
|
+
embedding types for improved search performance.
|
6
|
+
"""
|
7
|
+
|
8
|
+
import uuid
|
9
|
+
from typing import Any, Dict, List, Optional, Union
|
10
|
+
|
11
|
+
from fastembed import TextEmbedding
|
12
|
+
from qdrant_client import QdrantClient
|
13
|
+
from qdrant_client.conversions import common_types as types
|
14
|
+
from qdrant_client.models import (
|
15
|
+
Filter,
|
16
|
+
FieldCondition,
|
17
|
+
PointStruct,
|
18
|
+
Prefetch,
|
19
|
+
QuantizationSearchParams,
|
20
|
+
SearchParams,
|
21
|
+
MatchValue,
|
22
|
+
)
|
23
|
+
|
24
|
+
from .hybrid_pipeline_config import HybridPipelineConfig, SentenceTransformerEmbedding
|
25
|
+
|
26
|
+
|
27
|
+
class HybridPipeline:
|
28
|
+
"""
|
29
|
+
Pipeline for hybrid search using multiple embedding types.
|
30
|
+
|
31
|
+
This class implements a hybrid search pipeline that combines dense embeddings,
|
32
|
+
sparse embeddings, and late interaction embeddings for improved search performance.
|
33
|
+
It handles the creation and management of a Qdrant collection with the specified
|
34
|
+
configuration, as well as document insertion and search operations.
|
35
|
+
|
36
|
+
The hybrid approach combines the strengths of different embedding types:
|
37
|
+
- Dense embeddings: Good for semantic similarity
|
38
|
+
- Sparse embeddings: Good for keyword matching
|
39
|
+
- Late interaction embeddings: Good for retrieval with detailed token-level interactions
|
40
|
+
|
41
|
+
Attributes:
|
42
|
+
collection_name: Name of the Qdrant collection
|
43
|
+
qdrant_client: Client for interacting with the Qdrant vector database
|
44
|
+
config: Configuration for the hybrid pipeline
|
45
|
+
vectors_config_dict: Dictionary of vector configurations
|
46
|
+
sparse_vectors_config_dict: Dictionary of sparse vector configurations
|
47
|
+
multi_tenant: Flag indicating if the pipeline supports multiple tenants
|
48
|
+
replication_factor: Number of replicas for each shard
|
49
|
+
shard_number: Number of shards for the collection
|
50
|
+
partition_field_name: Field name used for partitioning in multi-tenant mode
|
51
|
+
partition_index_params: Index parameters for the partition field
|
52
|
+
"""
|
53
|
+
|
54
|
+
def __init__(
    self,
    qdrant_client: QdrantClient,
    collection_name: str,
    hybrid_pipeline_config: HybridPipelineConfig,
):
    """
    Set up the pipeline state and provision the backing Qdrant collection.

    The constructor copies the relevant settings out of the configuration
    object, creates the collection, and, when multi-tenancy is enabled,
    creates the payload index on the partition field.

    Args:
        qdrant_client: Client used for every call to the Qdrant database
        collection_name: Name of the collection this pipeline creates and owns
        hybrid_pipeline_config: Source of the vector, sharding and tenancy settings

    Raises:
        ValueError: If a collection named ``collection_name`` already exists
    """
    cfg = hybrid_pipeline_config

    self.collection_name = collection_name
    self.qdrant_client = qdrant_client
    self.config = cfg

    # Vector layouts are resolved once here and reused at collection creation.
    self.vectors_config_dict = cfg.get_vectors_config_dict()
    self.sparse_vectors_config_dict = cfg.get_sparse_vectors_config_dict()

    self.multi_tenant = cfg.multi_tenant
    self.replication_factor = cfg.replication_factor
    self.shard_number = cfg.shard_number

    partition_settings = cfg.get_partition_config()
    self.partition_field_name, self.partition_index_params = partition_settings

    # The collection has to exist before a payload index can be attached to it.
    self._create_collection()
    if self.multi_tenant:
        self._create_payload_index()
|
84
|
+
|
85
|
+
def _create_collection(self) -> bool:
|
86
|
+
"""
|
87
|
+
Create a new Qdrant collection with the configured parameters.
|
88
|
+
|
89
|
+
Returns:
|
90
|
+
bool: True if the collection was created successfully
|
91
|
+
|
92
|
+
Raises:
|
93
|
+
ValueError: If the collection already exists
|
94
|
+
"""
|
95
|
+
if self.qdrant_client.collection_exists(self.collection_name):
|
96
|
+
raise ValueError(
|
97
|
+
f"Collection {self.collection_name} already exists"
|
98
|
+
)
|
99
|
+
|
100
|
+
return self.qdrant_client.create_collection(
|
101
|
+
collection_name=self.collection_name,
|
102
|
+
vectors_config=self.vectors_config_dict,
|
103
|
+
sparse_vectors_config=self.sparse_vectors_config_dict,
|
104
|
+
replication_factor=self.replication_factor,
|
105
|
+
shard_number=self.shard_number,
|
106
|
+
)
|
107
|
+
|
108
|
+
def _create_payload_index(self) -> types.UpdateResult:
    """
    Index the partition field of the collection's payload.

    Called by the constructor only when the pipeline is multi-tenant.

    Returns:
        types.UpdateResult: Value returned by the client's
        ``create_payload_index`` call
    """
    index_kwargs = {
        "collection_name": self.collection_name,
        "field_name": self.partition_field_name,
        "field_schema": self.partition_index_params,
    }
    return self.qdrant_client.create_payload_index(**index_kwargs)
|
120
|
+
|
121
|
+
def _embed_documents(self, documents: Union[str, List[str]]) -> Dict[str, List[float]]:
    """
    Run every configured embedding model over the given documents.

    A bare string is treated as a one-document batch.

    Args:
        documents: One document, or a list of documents, to embed

    Returns:
        Dict[str, List[float]]: Mapping from the config's dense, sparse and
        late-interaction vector names to a list holding one embedding per
        document. NOTE(review): the sparse entries are ``types.SparseVector``
        objects, so the annotated value type is looser than what is built.
    """
    cfg = self.config
    texts = [documents] if isinstance(documents, str) else documents

    dense_model = cfg.dense_model
    if isinstance(dense_model, SentenceTransformerEmbedding):
        # Used as returned -- presumably already plain lists; confirm in the wrapper.
        dense_vectors = dense_model.embed(texts)
    else:
        raw_dense = list(dense_model.embed(texts))
        dense_vectors = [vec.tolist() for vec in raw_dense]

    # Wrap each sparse embedding's indices/values in Qdrant's SparseVector.
    raw_sparse = list(cfg.sparse_model.embed(texts))
    sparse_vectors = [
        types.SparseVector(
            indices=item.indices.tolist(),
            values=item.values.tolist(),
        )
        for item in raw_sparse
    ]

    raw_late = list(cfg.late_interaction_model.embed(texts))
    late_vectors = [matrix.tolist() for matrix in raw_late]

    embeddings = {}
    embeddings[self.config.DENSE_VECTOR_NAME] = dense_vectors
    embeddings[self.config.SPARSE_VECTOR_NAME] = sparse_vectors
    embeddings[self.config.LATE_INTERACTION_VECTOR_NAME] = late_vectors
    return embeddings
|
155
|
+
|
156
|
+
def _prepare_documents(
    self,
    documents: List[str],
    payloads: List[Dict[str, Any]],
    document_ids: List[uuid.UUID],
) -> List[types.PointStruct]:
    """
    Prepare documents for insertion into the Qdrant collection.

    Embeds the documents with the configured embedding models and builds one
    PointStruct per document. Each point's payload is a copy of the supplied
    payload extended with ``"document"`` (the raw text) and ``"document_id"``
    (the stringified id); the caller's payload dictionaries are not modified.

    Args:
        documents: List of document strings to embed and insert
        payloads: List of payload dictionaries containing metadata for each document
        document_ids: List of UUIDs to use as IDs for each document

    Returns:
        List[types.PointStruct]: List of prepared points ready for insertion
            (empty when no documents are given)

    Raises:
        ValueError: If the lengths of documents, payloads, and document_ids don't match,
            or if multi_tenant is True and a payload is missing the partition field
    """
    if not (len(documents) == len(payloads) == len(document_ids)):
        raise ValueError(
            "documents, payloads, and document_ids must be the same length"
        )

    # Nothing to embed: skip the model calls entirely.
    if not documents:
        return []

    # Check tenancy up front so a bad payload fails before any embedding work.
    if self.multi_tenant:
        for payload in payloads:
            if self.partition_field_name not in payload:
                raise ValueError(
                    f"payloads must contain {self.partition_field_name} if multi_tenant is True"
                )

    embeddings_dict = self._embed_documents(documents)

    points = []
    for i, (document, payload, raw_id) in enumerate(zip(documents, payloads, document_ids)):
        document_id = str(raw_id)
        # Build a new dict rather than writing into the caller's payload.
        point_payload = {**payload, "document": document, "document_id": document_id}
        points.append(
            PointStruct(
                id=document_id,
                vector={
                    vector_name: vectors[i]
                    for vector_name, vectors in embeddings_dict.items()
                },
                payload=point_payload,
            )
        )

    return points
|
207
|
+
|
208
|
+
def insert_documents(
    self,
    documents: List[str],
    payloads: List[Dict[str, Any]],
    document_ids: List[uuid.UUID],
    batch_size: int = 100,
):
    """
    Insert documents into the Qdrant collection.

    The inputs are split into consecutive batches of at most ``batch_size``
    items. Each batch is turned into points by ``_prepare_documents`` and
    written with one ``upsert`` call.

    Args:
        documents: List of document strings to embed and insert
        payloads: List of payload dictionaries containing metadata for each document
        document_ids: List of UUIDs to use as IDs for each document
        batch_size: Number of documents to process in each batch (default: 100,
            must be >= 1)

    Raises:
        ValueError: If the lengths of documents, payloads, and document_ids don't match,
            if batch_size is less than 1, or if multi_tenant is True and a payload
            is missing the partition field
    """
    if not (len(documents) == len(payloads) == len(document_ids)):
        raise ValueError(
            "documents, payloads, and document_ids must be the same length"
        )

    # A negative step would make range() empty and silently insert nothing.
    if batch_size < 1:
        raise ValueError("batch_size must be greater than or equal to 1")

    for start in range(0, len(documents), batch_size):
        stop = start + batch_size
        points = self._prepare_documents(
            documents=documents[start:stop],
            payloads=payloads[start:stop],
            document_ids=document_ids[start:stop],
        )
        self.qdrant_client.upsert(
            collection_name=self.collection_name,
            points=points,
        )
|
246
|
+
|
247
|
+
def _embed_query(self, query: str) -> Dict[str, List[float]]:
    """
    Produce the dense, sparse and late-interaction embeddings for one query.

    Args:
        query: Query string to embed

    Returns:
        Dict[str, List[float]]: Mapping keyed by the config's dense, sparse and
            late-interaction vector names, holding the query embedding from
            the corresponding model.
    """
    config = self.config

    dense_model = config.dense_model
    if isinstance(dense_model, SentenceTransformerEmbedding):
        # The SentenceTransformer wrapper's embed() already returns plain lists.
        dense_vector = dense_model.embed([query])[0]
    else:
        dense_batch = list(dense_model.embed([query]))
        dense_vector = dense_batch[0].tolist()

    raw_sparse = list(config.sparse_model.embed([query]))[0]
    sparse_vector = types.SparseVector(
        indices=raw_sparse.indices.tolist(),
        values=raw_sparse.values.tolist(),
    )

    # Convert each row of the late-interaction output to a plain list.
    token_matrix = list(config.late_interaction_model.embed([query]))[0]
    token_vectors = []
    for token_vector in token_matrix:
        token_vectors.append(token_vector.tolist())

    return {
        config.DENSE_VECTOR_NAME: dense_vector,
        config.SPARSE_VECTOR_NAME: sparse_vector,
        config.LATE_INTERACTION_VECTOR_NAME: token_vectors,
    }
|
277
|
+
|
278
|
+
def search(
    self,
    query: str,
    top_k: int = 10,
    partition_filter: Optional[str] = None,
    overquery_factor: float = 1.0,
) -> List[types.ScoredPoint]:
    """
    Search for documents similar to the query using the hybrid approach.

    The query is embedded with all configured models. Dense and sparse
    embeddings each drive a prefetch of up to ``top_k`` candidates, and the
    late interaction embedding is used as the final query over those
    prefetched candidates.

    Args:
        query: Query string to search for
        top_k: Number of results to return (default: 10)
        partition_filter: Value to match in the partition field for multi-tenant mode
            (must be None if multi_tenant is False). When None, the prefetches
            run without a partition filter.
        overquery_factor: Oversampling factor passed to the quantization search
            params of the dense prefetch (default: 1.0, must be >= 1.0)

    Returns:
        List[types.ScoredPoint]: The points of the query response, with payloads

    Raises:
        ValueError: If overquery_factor is less than 1.0 or if partition_filter is
            provided when multi_tenant is False
    """
    if overquery_factor < 1.0:
        raise ValueError("overquery_factor must be greater than or equal to 1.0")

    # Build the tenant filter only when a partition value is supplied; with no
    # value the prefetches run unfiltered.
    filter_condition = None
    if partition_filter is not None:
        if not self.multi_tenant:
            raise ValueError("partition_filter must be None if multi_tenant is False")
        filter_condition = Filter(
            must=[
                FieldCondition(
                    key=self.partition_field_name,
                    match=MatchValue(value=partition_filter),
                )
            ]
        )

    query_embeddings = self._embed_query(query)

    dense_prefetch = Prefetch(
        query=query_embeddings[self.config.DENSE_VECTOR_NAME],
        using=self.config.DENSE_VECTOR_NAME,
        limit=top_k,
        filter=filter_condition,
        params=SearchParams(
            quantization=QuantizationSearchParams(
                ignore=False,
                rescore=True,
                oversampling=overquery_factor,
            ),
        ),
    )

    sparse_prefetch = Prefetch(
        query=query_embeddings[self.config.SPARSE_VECTOR_NAME],
        using=self.config.SPARSE_VECTOR_NAME,
        limit=top_k,
        filter=filter_condition,
    )

    response = self.qdrant_client.query_points(
        collection_name=self.collection_name,
        prefetch=[
            dense_prefetch,
            sparse_prefetch,
        ],
        query=query_embeddings[self.config.LATE_INTERACTION_VECTOR_NAME],
        using=self.config.LATE_INTERACTION_VECTOR_NAME,
        limit=top_k,
        with_payload=True,
    )
    return response.points
|
357
|
+
|
358
|
+
def delete_document(self, document_id: str):
    """
    Delete a document from the collection by its ID.

    The ID is stringified before use, which is the same form in which
    document IDs are written as point IDs at insertion time.

    Args:
        document_id: ID of the document to delete
    """
    self.qdrant_client.delete(
        collection_name=self.collection_name,
        # A plain list of point IDs selects exactly those points.
        points_selector=[str(document_id)],
    )
|
372
|
+
|
@@ -0,0 +1,261 @@
|
|
1
|
+
"""
|
2
|
+
Configuration module for the hybrid search pipeline.
|
3
|
+
|
4
|
+
This module provides configuration classes and type definitions for setting up
|
5
|
+
hybrid search pipelines that combine dense, sparse, and late interaction embeddings.
|
6
|
+
"""
|
7
|
+
|
8
|
+
from typing import ClassVar, List, Mapping, Optional, Tuple, TypeVar, Union
|
9
|
+
|
10
|
+
from pydantic import BaseModel, model_validator
|
11
|
+
from fastembed.late_interaction import LateInteractionTextEmbedding
|
12
|
+
from fastembed.sparse import SparseTextEmbedding
|
13
|
+
from fastembed.text import TextEmbedding
|
14
|
+
from qdrant_client.conversions import common_types as types
|
15
|
+
from qdrant_client.models import KeywordIndexParams
|
16
|
+
from sentence_transformers import SentenceTransformer
|
17
|
+
|
18
|
+
|
19
|
+
class SentenceTransformerEmbedding(SentenceTransformer):
    """
    SentenceTransformer subclass that exposes a ``model_name`` property and an
    ``embed`` method returning plain Python lists.
    """

    def __init__(self, model_name_or_path: str, *args, **kwargs):
        """
        Record the model identifier, then initialise the base SentenceTransformer.

        Args:
            model_name_or_path: Model name or path, forwarded to the base class
            *args: Extra positional arguments forwarded to the base class
            **kwargs: Extra keyword arguments forwarded to the base class
        """
        self._model_name_or_path = model_name_or_path
        super().__init__(model_name_or_path, *args, **kwargs)

    @property
    def model_name(self) -> str:
        """Return the stored model name or path."""
        return self._model_name_or_path

    def embed(self, texts: List[str], **kwargs) -> List[List[float]]:
        """
        Encode the texts and convert the result to nested Python lists.

        Args:
            texts: Strings to encode
            **kwargs: Extra keyword arguments forwarded to ``encode``

        Returns:
            List[List[float]]: The ``tolist()`` form of the encoder output
        """
        encoded = self.encode(texts, **kwargs)
        return encoded.tolist()
|
33
|
+
|
34
|
+
|
35
|
+
# Constrained TypeVar over the embedding model classes handled by this module:
# the dense, late-interaction and sparse text embedders imported from fastembed,
# plus the SentenceTransformerEmbedding wrapper defined above.
Embedding = TypeVar(
    "Embedding",
    TextEmbedding,
    LateInteractionTextEmbedding,
    SparseTextEmbedding,
    SentenceTransformerEmbedding,
)
"""Type variable for the different types of embedding models supported."""


# Constrained TypeVar over the two vector parameter types taken from the Qdrant
# client's common types: VectorParams and SparseVectorParams.
BaseVectorParams = TypeVar(
    "BaseVectorParams",
    types.VectorParams,
    types.SparseVectorParams
)
"""Type variable for the different types of vector parameters supported."""
|
51
|
+
|
52
|
+
|
53
|
+
class HybridPipelineConfig(BaseModel):
    """
    Configuration for a hybrid search pipeline combining multiple embedding types.

    This class encapsulates the configuration for a hybrid search pipeline that combines
    dense embeddings, sparse embeddings, and late interaction embeddings for improved
    search performance. It also includes configuration for multi-tenancy and sharding.

    Attributes:
        text_embedding_config: Configuration for the dense text embedding model.
            A tuple containing a TextEmbedding or SentenceTransformerEmbedding model
            instance and its associated VectorParams.
        sparse_embedding_config: Configuration for the sparse embedding model.
            A tuple containing a SparseTextEmbedding model instance and its associated SparseVectorParams.
        late_interaction_text_embedding_config: Configuration for the late interaction embedding model.
            A tuple containing a LateInteractionTextEmbedding model instance and its associated VectorParams.
        partition_config: Configuration for multi-tenant partitioning.
            A tuple containing the field name to use for partitioning and the KeywordIndexParams
            for the partition field. Required if multi_tenant is True, and must be
            None otherwise.
        multi_tenant: Flag indicating whether the pipeline should support multiple tenants.
            If True, the pipeline will create a partitioned collection using the partition_config.
            Default is False.
        replication_factor: The number of replicas for each shard in the Qdrant collection.
            Increases redundancy and read performance. Default is 2.
        shard_number: The number of shards for the Qdrant collection.
            Affects write performance and horizontal scalability. Default is 3.
    """
    # Names under which the three embeddings are stored in the collection.
    DENSE_VECTOR_NAME: ClassVar[str] = "dense"
    SPARSE_VECTOR_NAME: ClassVar[str] = "sparse"
    LATE_INTERACTION_VECTOR_NAME: ClassVar[str] = "multivector"

    text_embedding_config: Tuple[Union[TextEmbedding, SentenceTransformerEmbedding], types.VectorParams]
    sparse_embedding_config: Tuple[SparseTextEmbedding, types.SparseVectorParams]
    late_interaction_text_embedding_config: Tuple[LateInteractionTextEmbedding, types.VectorParams]
    # TODO: Replace PartitionConfig with MultiTenantConfig -> allow user to specify global index or not during collection creation
    partition_config: Optional[Tuple[str, KeywordIndexParams]] = None
    multi_tenant: Optional[bool] = False
    replication_factor: Optional[int] = 2
    shard_number: Optional[int] = 3

    model_config = {
        "arbitrary_types_allowed": True,
    }

    @model_validator(mode='after')
    def _validate_config(self):
        """
        Validate the configuration after model initialization.

        Ensures that the configuration is valid by checking:
        - Multi-tenancy and partition configuration compatibility
        - Replication factor and shard number are valid
        - Embedding models are of the correct type and have required attributes

        Returns:
            self: The validated configuration instance

        Raises:
            ValueError: If any validation check fails
        """
        if self.multi_tenant and self.partition_config is None:
            raise ValueError("partition_config must be provided if multi_tenant is True")
        if not self.multi_tenant and self.partition_config is not None:
            raise ValueError("partition_config must be None if multi_tenant is False")

        if not isinstance(self.replication_factor, int) or self.replication_factor < 1:
            raise ValueError("replication_factor must be an integer greater than 0")

        if not isinstance(self.shard_number, int) or self.shard_number < 1:
            raise ValueError("shard_number must be an integer greater than 0")

        # (config name, config tuple, accepted model classes, label for the error text).
        # Accepted classes are plain tuples: isinstance() rejects typing.Union
        # on Python versions before 3.10.
        expected_models = (
            (
                "text_embedding_config",
                self.text_embedding_config,
                (TextEmbedding, SentenceTransformerEmbedding),
                "TextEmbedding or SentenceTransformerEmbedding",
            ),
            (
                "sparse_embedding_config",
                self.sparse_embedding_config,
                (SparseTextEmbedding,),
                "SparseTextEmbedding",
            ),
            (
                "late_interaction_text_embedding_config",
                self.late_interaction_text_embedding_config,
                (LateInteractionTextEmbedding,),
                "LateInteractionTextEmbedding",
            ),
        )
        for config_name, (model, _), accepted_types, type_label in expected_models:
            if not isinstance(model, accepted_types):
                raise ValueError(
                    f"Embedding model in {config_name} must be an instance of {type_label}"
                )

            if not hasattr(model, "model_name"):
                raise ValueError(f"Embedding model in {config_name} must have a 'model_name' attribute")

            if not callable(getattr(model, "embed", None)):
                raise ValueError(f"Embedding model in {config_name} must have an 'embed' method")
        return self

    @property
    def dense_model_config(
        self,
    ) -> Tuple[Union[TextEmbedding, SentenceTransformerEmbedding], types.VectorParams]:
        """Get the dense embedding model configuration."""
        return self.text_embedding_config

    @property
    def sparse_model_config(self) -> Tuple[SparseTextEmbedding, types.SparseVectorParams]:
        """Get the sparse embedding model configuration."""
        return self.sparse_embedding_config

    @property
    def late_interaction_model_config(self) -> Tuple[LateInteractionTextEmbedding, types.VectorParams]:
        """Get the late interaction embedding model configuration."""
        return self.late_interaction_text_embedding_config

    @property
    def dense_model(self) -> Union[TextEmbedding, SentenceTransformerEmbedding]:
        """Get the dense embedding model."""
        return self.dense_model_config[0]

    @property
    def sparse_model(self) -> SparseTextEmbedding:
        """Get the sparse embedding model."""
        return self.sparse_model_config[0]

    @property
    def late_interaction_model(self) -> LateInteractionTextEmbedding:
        """Get the late interaction embedding model."""
        return self.late_interaction_model_config[0]

    @property
    def dense_model_name(self) -> str:
        """Get the name of the dense embedding model."""
        return self.dense_model.model_name

    @property
    def sparse_model_name(self) -> str:
        """Get the name of the sparse embedding model."""
        return self.sparse_model.model_name

    @property
    def late_interaction_model_name(self) -> str:
        """Get the name of the late interaction embedding model."""
        return self.late_interaction_model.model_name

    def list_embedding_configs(self) -> List[Tuple[Embedding, BaseVectorParams]]:
        """
        Get a list of all embedding configurations.

        Returns:
            List[Tuple[Embedding, BaseVectorParams]]: The dense, sparse and late
                interaction (model, vector params) tuples, in that order
        """
        return [
            self.text_embedding_config,
            self.sparse_embedding_config,
            self.late_interaction_text_embedding_config,
        ]

    def list_embedding_model_names(self) -> List[str]:
        """
        Get a list of all embedding model names.

        Returns:
            List[str]: Model names in the same order as list_embedding_configs()
        """
        return [model.model_name for model, _ in self.list_embedding_configs()]

    def list_embedding_models(self) -> List[Embedding]:
        """
        Get a list of all embedding models.

        Returns:
            List[Embedding]: Model instances in the same order as list_embedding_configs()
        """
        return [model for model, _ in self.list_embedding_configs()]

    def get_vectors_config_dict(self) -> Mapping[str, types.VectorParams]:
        """
        Get a dictionary mapping the dense and late interaction vector names to
        their vector parameters.

        Returns:
            Mapping[str, types.VectorParams]: Dictionary keyed by DENSE_VECTOR_NAME
                and LATE_INTERACTION_VECTOR_NAME
        """
        return {
            self.DENSE_VECTOR_NAME: self.dense_model_config[1],
            self.LATE_INTERACTION_VECTOR_NAME: self.late_interaction_model_config[1],
        }

    def get_sparse_vectors_config_dict(self) -> Mapping[str, types.SparseVectorParams]:
        """
        Get a dictionary mapping the sparse vector name to its vector parameters.

        Returns:
            Mapping[str, types.SparseVectorParams]: Dictionary keyed by SPARSE_VECTOR_NAME
        """
        return {
            self.SPARSE_VECTOR_NAME: self.sparse_model_config[1],
        }

    def get_partition_config(self) -> Tuple[str, KeywordIndexParams]:
        """
        Get the partition configuration for multi-tenant setup.

        Returns:
            Tuple[str, KeywordIndexParams]: A tuple containing the partition field name
                and the KeywordIndexParams for that field

        Raises:
            ValueError: If partition_config is not set but this method is called
        """
        if self.partition_config is None:
            raise ValueError("partition_config must be specified during instantiation")
        return self.partition_config
|