qdrant-hybrid-pipeline 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,174 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # Ruff stuff:
171
+ .ruff_cache/
172
+
173
+ # PyPI configuration file
174
+ .pypirc
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Data Parthenon
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,220 @@
1
+ Metadata-Version: 2.4
2
+ Name: qdrant-hybrid-pipeline
3
+ Version: 0.1.1
4
+ Summary: Configurable Hybrid Search Pipeline with Qdrant and FastEmbed
5
+ Author-email: Brian O'Grady <genesysdatallc@gmail.com>
6
+ Maintainer-email: Brian O'Grady <genesysdatallc@gmail.com>
7
+ License: MIT License
8
+
9
+ Copyright (c) 2025 Data Parthenon
10
+
11
+ Permission is hereby granted, free of charge, to any person obtaining a copy
12
+ of this software and associated documentation files (the "Software"), to deal
13
+ in the Software without restriction, including without limitation the rights
14
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15
+ copies of the Software, and to permit persons to whom the Software is
16
+ furnished to do so, subject to the following conditions:
17
+
18
+ The above copyright notice and this permission notice shall be included in all
19
+ copies or substantial portions of the Software.
20
+
21
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27
+ SOFTWARE.
28
+ License-File: LICENSE
29
+ Keywords: colbert,database,embeddings,late-interaction,qdrant,similarity-search,vector
30
+ Classifier: Development Status :: 4 - Beta
31
+ Classifier: Intended Audience :: Developers
32
+ Classifier: License :: OSI Approved :: MIT License
33
+ Classifier: Programming Language :: Python :: 3
34
+ Classifier: Programming Language :: Python :: 3.11
35
+ Classifier: Topic :: Database
36
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
37
+ Classifier: Topic :: Software Development :: Libraries
38
+ Requires-Python: >=3.11
39
+ Requires-Dist: fastembed>=0.6.1
40
+ Requires-Dist: pydantic>=2.11.3
41
+ Requires-Dist: python-dotenv>=1.1.0
42
+ Requires-Dist: qdrant-client>=1.13.3
43
+ Requires-Dist: sentence-transformers>=4.1.0
44
+ Provides-Extra: dev
45
+ Requires-Dist: build>=1.0.3; extra == 'dev'
46
+ Requires-Dist: mypy>=1.11.0; extra == 'dev'
47
+ Requires-Dist: pre-commit>=3.7.0; extra == 'dev'
48
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
49
+ Requires-Dist: pytest-cov>=5.0.0; extra == 'dev'
50
+ Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
51
+ Requires-Dist: pytest-xdist>=3.6.0; extra == 'dev'
52
+ Requires-Dist: pytest>=8.2.0; extra == 'dev'
53
+ Requires-Dist: ruff<0.10,>=0.9.7; extra == 'dev'
54
+ Requires-Dist: twine>=4.0.2; extra == 'dev'
55
+ Provides-Extra: test
56
+ Requires-Dist: qdrant-hybrid-pipeline[dev]; extra == 'test'
57
+ Description-Content-Type: text/markdown
58
+
59
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
60
+ [![Python Version](https://img.shields.io/badge/python-3.11%2B-blue)](https://www.python.org/downloads/)
61
+ [![PyPI version](https://badge.fury.io/py/qdrant-hybrid-pipeline.svg)](https://pypi.org/project/qdrant-hybrid-pipeline/)
62
+
63
+ # FastEmbed Hybrid Pipeline
64
+
65
+ A configurable hybrid search pipeline for building semantic search applications with [FastEmbed](https://github.com/qdrant/fastembed) and [Qdrant](https://github.com/qdrant/qdrant).
66
+
67
+ ## Features
68
+
69
+ - 🚀 **Hybrid Search**: Combines dense embeddings, sparse embeddings, and late interaction embeddings for superior search performance
70
+ - 🔧 **Configurable**: Customize embedding models, vector parameters, and multi-tenancy settings
71
+ - 🔄 **Batch Processing**: Efficiently process and index large document collections
72
+ - 🏢 **Multi-Tenant Support**: Optional partition-based multi-tenancy for SaaS applications
73
+
74
+ ## Installation
75
+
76
+ ```bash
77
+ pip install qdrant-hybrid-pipeline
78
+ ```
79
+
80
+ *Requires Python 3.11+*
81
+
82
+ ## Quick Start
83
+
84
+ ```python
85
+ from qdrant_client import QdrantClient
86
+ from fastembed import TextEmbedding, SparseEmbedding, LateInteractionTextEmbedding
87
+ from qdrant_client.models import Distance, VectorParams, SparseVectorParams, KeywordIndexParams
88
+ from hybrid_search import HybridPipelineConfig, HybridPipeline
89
+ import uuid
90
+
91
+ # Initialize Qdrant client
92
+ client = QdrantClient(":memory:") # Use a local instance or Qdrant Cloud
93
+
94
+ # Configure embedding models
95
+ text_model = TextEmbedding("BAAI/bge-small-en-v1.5")
96
+ sparse_model = SparseEmbedding("sentence-transformers/all-MiniLM-L6-v2")
97
+ late_interaction_model = LateInteractionTextEmbedding("intfloat/e5-small-v2")
98
+
99
+ # Configure vector parameters
100
+ dense_params = VectorParams(size=text_model.dimensions, distance=Distance.COSINE)
101
+ sparse_params = SparseVectorParams()
102
+ late_interaction_params = VectorParams(size=late_interaction_model.dimensions, distance=Distance.COSINE)
103
+
104
+ # Optional: Configure multi-tenancy
105
+ partition_field = "tenant_id"
106
+ partition_index = KeywordIndexParams(minWordLength=1, maxWordLength=100)
107
+ partition_config = (partition_field, partition_index)
108
+
109
+ # Create pipeline configuration
110
+ pipeline_config = HybridPipelineConfig(
111
+ text_embedding_config=(text_model, dense_params),
112
+ sparse_embedding_config=(sparse_model, sparse_params),
113
+ late_interaction_text_embedding_config=(late_interaction_model, late_interaction_params),
114
+ partition_config=partition_config, # Optional, for multi-tenant setup
115
+ multi_tenant=True, # Set to False for single-tenant setup
116
+ replication_factor=1, # For production, use 2+
117
+ shard_number=1, # For production, use 3+
118
+ )
119
+
120
+ # Initialize the pipeline
121
+ pipeline = HybridPipeline(
122
+ qdrant_client=client,
123
+ collection_name="documents",
124
+ hybrid_pipeline_config=pipeline_config,
125
+ )
126
+
127
+ # Index documents
128
+ documents = [
129
+ "FastEmbed is a lightweight Python library for state-of-the-art text embeddings.",
130
+ "Qdrant is a vector database for production-ready vector search.",
131
+ "Hybrid search combines multiple search techniques for better results."
132
+ ]
133
+
134
+ payloads = [
135
+ {"tenant_id": "acme_corp", "document_type": "library"},
136
+ {"tenant_id": "acme_corp", "document_type": "database"},
137
+ {"tenant_id": "acme_corp", "document_type": "technique"}
138
+ ]
139
+
140
+ document_ids = [uuid.uuid4() for _ in range(len(documents))]
141
+
142
+ # Insert documents
143
+ pipeline.insert_documents(documents, payloads, document_ids)
144
+
145
+ # Search
146
+ results = pipeline.search(
147
+ query="Which embedding library should I use?",
148
+ top_k=3,
149
+ partition_filter="acme_corp", # Only needed for multi-tenant setups
150
+ )
151
+
152
+ # Process results
153
+ for result in results:
154
+ print(f"Score: {result.score}")
155
+ print(f"Document: {result.payload['document']}")
156
+ print("-" * 30)
157
+ ```
158
+
159
+ ## Configuration Options
160
+
161
+ ### Embedding Models
162
+
163
+ The pipeline requires three types of embedding models from FastEmbed:
164
+
165
+ 1. **Dense Embeddings**: Traditional vector embeddings (TextEmbedding)
166
+ 2. **Sparse Embeddings**: Lexical-focused sparse embeddings (SparseEmbedding)
167
+ 3. **Late Interaction**: Special embeddings for late interaction matching (LateInteractionTextEmbedding)
168
+
169
+ ### Vector Parameters
170
+
171
+ Configure vector parameters for each embedding type:
172
+
173
+ - **Dense & Late Interaction**: Size, distance metric (cosine, dot, euclidean)
174
+ - **Sparse**: Uses default sparse vector parameters
175
+
176
+ ### Multi-Tenant Configuration
177
+
178
+ For SaaS applications that need to separate data by tenant:
179
+
180
+ ```python
181
+ # Enable multi-tenancy
182
+ pipeline_config = HybridPipelineConfig(
183
+ # ... other configs ...
184
+ partition_config=("tenant_id", KeywordIndexParams(minWordLength=1, maxWordLength=100)),
185
+ multi_tenant=True,
186
+ )
187
+
188
+ # When searching, specify the tenant
189
+ results = pipeline.search(query="my search", partition_filter="tenant_123")
190
+ ```
191
+
192
+ ### Performance Options
193
+
194
+ For production deployments:
195
+
196
+ ```python
197
+ pipeline_config = HybridPipelineConfig(
198
+ # ... other configs ...
199
+ replication_factor=2, # Data redundancy for high availability
200
+ shard_number=3, # Data distribution for scalability
201
+ )
202
+ ```
203
+
204
+ ## Development
205
+
206
+ ```bash
207
+ # Clone the repository
208
+ git clone https://github.com/your-username/fastembed-hybrid-pipeline.git
209
+ cd fastembed-hybrid-pipeline
210
+
211
+ # Install development dependencies
212
+ pip install -e ".[dev]"
213
+
214
+ # Run tests
215
+ pytest
216
+ ```
217
+
218
+ ## License
219
+
220
+ This project is licensed under the MIT License - see the LICENSE file for details.
@@ -0,0 +1,162 @@
1
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
2
+ [![Python Version](https://img.shields.io/badge/python-3.11%2B-blue)](https://www.python.org/downloads/)
3
+ [![PyPI version](https://badge.fury.io/py/qdrant-hybrid-pipeline.svg)](https://pypi.org/project/qdrant-hybrid-pipeline/)
4
+
5
+ # FastEmbed Hybrid Pipeline
6
+
7
+ A configurable hybrid search pipeline for building semantic search applications with [FastEmbed](https://github.com/qdrant/fastembed) and [Qdrant](https://github.com/qdrant/qdrant).
8
+
9
+ ## Features
10
+
11
+ - 🚀 **Hybrid Search**: Combines dense embeddings, sparse embeddings, and late interaction embeddings for superior search performance
12
+ - 🔧 **Configurable**: Customize embedding models, vector parameters, and multi-tenancy settings
13
+ - 🔄 **Batch Processing**: Efficiently process and index large document collections
14
+ - 🏢 **Multi-Tenant Support**: Optional partition-based multi-tenancy for SaaS applications
15
+
16
+ ## Installation
17
+
18
+ ```bash
19
+ pip install qdrant-hybrid-pipeline
20
+ ```
21
+
22
+ *Requires Python 3.11+*
23
+
24
+ ## Quick Start
25
+
26
+ ```python
27
+ from qdrant_client import QdrantClient
28
+ from fastembed import TextEmbedding, SparseEmbedding, LateInteractionTextEmbedding
29
+ from qdrant_client.models import Distance, VectorParams, SparseVectorParams, KeywordIndexParams
30
+ from hybrid_search import HybridPipelineConfig, HybridPipeline
31
+ import uuid
32
+
33
+ # Initialize Qdrant client
34
+ client = QdrantClient(":memory:") # Use a local instance or Qdrant Cloud
35
+
36
+ # Configure embedding models
37
+ text_model = TextEmbedding("BAAI/bge-small-en-v1.5")
38
+ sparse_model = SparseEmbedding("sentence-transformers/all-MiniLM-L6-v2")
39
+ late_interaction_model = LateInteractionTextEmbedding("intfloat/e5-small-v2")
40
+
41
+ # Configure vector parameters
42
+ dense_params = VectorParams(size=text_model.dimensions, distance=Distance.COSINE)
43
+ sparse_params = SparseVectorParams()
44
+ late_interaction_params = VectorParams(size=late_interaction_model.dimensions, distance=Distance.COSINE)
45
+
46
+ # Optional: Configure multi-tenancy
47
+ partition_field = "tenant_id"
48
+ partition_index = KeywordIndexParams(minWordLength=1, maxWordLength=100)
49
+ partition_config = (partition_field, partition_index)
50
+
51
+ # Create pipeline configuration
52
+ pipeline_config = HybridPipelineConfig(
53
+ text_embedding_config=(text_model, dense_params),
54
+ sparse_embedding_config=(sparse_model, sparse_params),
55
+ late_interaction_text_embedding_config=(late_interaction_model, late_interaction_params),
56
+ partition_config=partition_config, # Optional, for multi-tenant setup
57
+ multi_tenant=True, # Set to False for single-tenant setup
58
+ replication_factor=1, # For production, use 2+
59
+ shard_number=1, # For production, use 3+
60
+ )
61
+
62
+ # Initialize the pipeline
63
+ pipeline = HybridPipeline(
64
+ qdrant_client=client,
65
+ collection_name="documents",
66
+ hybrid_pipeline_config=pipeline_config,
67
+ )
68
+
69
+ # Index documents
70
+ documents = [
71
+ "FastEmbed is a lightweight Python library for state-of-the-art text embeddings.",
72
+ "Qdrant is a vector database for production-ready vector search.",
73
+ "Hybrid search combines multiple search techniques for better results."
74
+ ]
75
+
76
+ payloads = [
77
+ {"tenant_id": "acme_corp", "document_type": "library"},
78
+ {"tenant_id": "acme_corp", "document_type": "database"},
79
+ {"tenant_id": "acme_corp", "document_type": "technique"}
80
+ ]
81
+
82
+ document_ids = [uuid.uuid4() for _ in range(len(documents))]
83
+
84
+ # Insert documents
85
+ pipeline.insert_documents(documents, payloads, document_ids)
86
+
87
+ # Search
88
+ results = pipeline.search(
89
+ query="Which embedding library should I use?",
90
+ top_k=3,
91
+ partition_filter="acme_corp", # Only needed for multi-tenant setups
92
+ )
93
+
94
+ # Process results
95
+ for result in results:
96
+ print(f"Score: {result.score}")
97
+ print(f"Document: {result.payload['document']}")
98
+ print("-" * 30)
99
+ ```
100
+
101
+ ## Configuration Options
102
+
103
+ ### Embedding Models
104
+
105
+ The pipeline requires three types of embedding models from FastEmbed:
106
+
107
+ 1. **Dense Embeddings**: Traditional vector embeddings (TextEmbedding)
108
+ 2. **Sparse Embeddings**: Lexical-focused sparse embeddings (SparseEmbedding)
109
+ 3. **Late Interaction**: Special embeddings for late interaction matching (LateInteractionTextEmbedding)
110
+
111
+ ### Vector Parameters
112
+
113
+ Configure vector parameters for each embedding type:
114
+
115
+ - **Dense & Late Interaction**: Size, distance metric (cosine, dot, euclidean)
116
+ - **Sparse**: Uses default sparse vector parameters
117
+
118
+ ### Multi-Tenant Configuration
119
+
120
+ For SaaS applications that need to separate data by tenant:
121
+
122
+ ```python
123
+ # Enable multi-tenancy
124
+ pipeline_config = HybridPipelineConfig(
125
+ # ... other configs ...
126
+ partition_config=("tenant_id", KeywordIndexParams(minWordLength=1, maxWordLength=100)),
127
+ multi_tenant=True,
128
+ )
129
+
130
+ # When searching, specify the tenant
131
+ results = pipeline.search(query="my search", partition_filter="tenant_123")
132
+ ```
133
+
134
+ ### Performance Options
135
+
136
+ For production deployments:
137
+
138
+ ```python
139
+ pipeline_config = HybridPipelineConfig(
140
+ # ... other configs ...
141
+ replication_factor=2, # Data redundancy for high availability
142
+ shard_number=3, # Data distribution for scalability
143
+ )
144
+ ```
145
+
146
+ ## Development
147
+
148
+ ```bash
149
+ # Clone the repository
150
+ git clone https://github.com/your-username/fastembed-hybrid-pipeline.git
151
+ cd fastembed-hybrid-pipeline
152
+
153
+ # Install development dependencies
154
+ pip install -e ".[dev]"
155
+
156
+ # Run tests
157
+ pytest
158
+ ```
159
+
160
+ ## License
161
+
162
+ This project is licensed under the MIT License - see the LICENSE file for details.
@@ -0,0 +1,59 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "qdrant-hybrid-pipeline"
7
+ version = "0.1.1"
8
+ description = "Configurable Hybrid Search Pipeline with Qdrant and FastEmbed"
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ license = { file = "LICENSE" }
12
+ authors = [
13
+ { name = "Brian O'Grady", email = "genesysdatallc@gmail.com" }
14
+ ]
15
+ maintainers = [
16
+ { name = "Brian O'Grady", email = "genesysdatallc@gmail.com" }
17
+ ]
18
+ keywords = ["qdrant", "vector", "database", "embeddings", "similarity-search", "colbert", "late-interaction"]
19
+ classifiers = [
20
+ "Development Status :: 4 - Beta",
21
+ "Intended Audience :: Developers",
22
+ "License :: OSI Approved :: MIT License",
23
+ "Programming Language :: Python :: 3",
24
+ "Programming Language :: Python :: 3.11",
25
+ "Topic :: Database",
26
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
27
+ "Topic :: Software Development :: Libraries",
28
+ ]
29
+ dependencies = [
30
+ "fastembed>=0.6.1",
31
+ "pydantic>=2.11.3",
32
+ "python-dotenv>=1.1.0",
33
+ "qdrant-client>=1.13.3",
34
+ "sentence-transformers>=4.1.0",
35
+ ]
36
+
37
+ [project.optional-dependencies]
38
+ dev = [
39
+ "pytest>=8.2.0",
40
+ "pytest-cov>=5.0.0",
41
+ "pytest-asyncio>=0.23.0",
42
+ "pytest-mock>=3.14.0",
43
+ "pytest-xdist>=3.6.0",
44
+ "mypy>=1.11.0",
45
+ "ruff>=0.9.7,<0.10",
46
+ "pre-commit>=3.7.0",
47
+ "build>=1.0.3",
48
+ "twine>=4.0.2",
49
+ ]
50
+
51
+ test = ["qdrant-hybrid-pipeline[dev]"]
52
+
53
+ [tool.hatch.build.targets.wheel]
54
+ packages = ["src/hybrid_search/"]
55
+
56
+ [tool.hatch.build]
57
+ include = [
58
+ "src/hybrid_search/**/*.py",
59
+ ]
@@ -0,0 +1,15 @@
1
+ """
2
+ Hybrid Search module for vector search combining dense, sparse, and late interaction embeddings.
3
+
4
+ This module provides components for creating and managing hybrid search pipelines
5
+ that leverage multiple embedding types for improved search performance.
6
+ """
7
+
8
+ from .hybrid_pipeline import HybridPipeline
9
+ from .hybrid_pipeline_config import HybridPipelineConfig, SentenceTransformerEmbedding
10
+
11
+ __all__ = [
12
+ "HybridPipeline",
13
+ "HybridPipelineConfig",
14
+ "SentenceTransformerEmbedding",
15
+ ]
@@ -0,0 +1,372 @@
1
+ """
2
+ Hybrid Pipeline module for vector search combining dense, sparse, and late interaction embeddings.
3
+
4
+ This module provides the implementation of the hybrid search pipeline that leverages multiple
5
+ embedding types for improved search performance.
6
+ """
7
+
8
+ import uuid
9
+ from typing import Any, Dict, List, Optional, Union
10
+
11
+ from fastembed import TextEmbedding
12
+ from qdrant_client import QdrantClient
13
+ from qdrant_client.conversions import common_types as types
14
+ from qdrant_client.models import (
15
+ Filter,
16
+ FieldCondition,
17
+ PointStruct,
18
+ Prefetch,
19
+ QuantizationSearchParams,
20
+ SearchParams,
21
+ MatchValue,
22
+ )
23
+
24
+ from .hybrid_pipeline_config import HybridPipelineConfig, SentenceTransformerEmbedding
25
+
26
+
27
+ class HybridPipeline:
28
+ """
29
+ Pipeline for hybrid search using multiple embedding types.
30
+
31
+ This class implements a hybrid search pipeline that combines dense embeddings,
32
+ sparse embeddings, and late interaction embeddings for improved search performance.
33
+ It handles the creation and management of a Qdrant collection with the specified
34
+ configuration, as well as document insertion and search operations.
35
+
36
+ The hybrid approach combines the strengths of different embedding types:
37
+ - Dense embeddings: Good for semantic similarity
38
+ - Sparse embeddings: Good for keyword matching
39
+ - Late interaction embeddings: Good for retrieval with detailed token-level interactions
40
+
41
+ Attributes:
42
+ collection_name: Name of the Qdrant collection
43
+ qdrant_client: Client for interacting with the Qdrant vector database
44
+ config: Configuration for the hybrid pipeline
45
+ vectors_config_dict: Dictionary of vector configurations
46
+ sparse_vectors_config_dict: Dictionary of sparse vector configurations
47
+ multi_tenant: Flag indicating if the pipeline supports multiple tenants
48
+ replication_factor: Number of replicas for each shard
49
+ shard_number: Number of shards for the collection
50
+ partition_field_name: Field name used for partitioning in multi-tenant mode
51
+ partition_index_params: Index parameters for the partition field
52
+ """
53
+
54
+ def __init__(
55
+ self,
56
+ qdrant_client: QdrantClient,
57
+ collection_name: str,
58
+ hybrid_pipeline_config: HybridPipelineConfig,
59
+ ):
60
+ """
61
+ Initialize a new HybridPipeline instance.
62
+
63
+ Args:
64
+ qdrant_client: Client for interacting with the Qdrant vector database
65
+ collection_name: Name of the Qdrant collection to create
66
+ hybrid_pipeline_config: Configuration for the hybrid pipeline
67
+
68
+ Raises:
69
+ ValueError: If the collection already exists
70
+ """
71
+ self.collection_name = collection_name
72
+ self.qdrant_client = qdrant_client
73
+ self.config = hybrid_pipeline_config
74
+ self.vectors_config_dict = self.config.get_vectors_config_dict()
75
+ self.sparse_vectors_config_dict = self.config.get_sparse_vectors_config_dict()
76
+ self.multi_tenant = self.config.multi_tenant
77
+ self.replication_factor = self.config.replication_factor
78
+ self.shard_number = self.config.shard_number
79
+ self.partition_field_name, self.partition_index_params = self.config.get_partition_config()
80
+ self._create_collection()
81
+
82
+ if self.multi_tenant:
83
+ self._create_payload_index()
84
+
85
+ def _create_collection(self) -> bool:
86
+ """
87
+ Create a new Qdrant collection with the configured parameters.
88
+
89
+ Returns:
90
+ bool: True if the collection was created successfully
91
+
92
+ Raises:
93
+ ValueError: If the collection already exists
94
+ """
95
+ if self.qdrant_client.collection_exists(self.collection_name):
96
+ raise ValueError(
97
+ f"Collection {self.collection_name} already exists"
98
+ )
99
+
100
+ return self.qdrant_client.create_collection(
101
+ collection_name=self.collection_name,
102
+ vectors_config=self.vectors_config_dict,
103
+ sparse_vectors_config=self.sparse_vectors_config_dict,
104
+ replication_factor=self.replication_factor,
105
+ shard_number=self.shard_number,
106
+ )
107
+
108
+ def _create_payload_index(self) -> types.UpdateResult:
109
+ """
110
+ Create a payload index for the partition field in multi-tenant mode.
111
+
112
+ Returns:
113
+ types.UpdateResult: Result of the index creation operation
114
+ """
115
+ return self.qdrant_client.create_payload_index(
116
+ collection_name=self.collection_name,
117
+ field_name=self.partition_field_name,
118
+ field_schema=self.partition_index_params,
119
+ )
120
+
121
+ def _embed_documents(self, documents: Union[str, List[str]]) -> Dict[str, List[float]]:
122
+ """
123
+ Embed documents using all configured embedding models.
124
+
125
+ Args:
126
+ documents: A single document string or a list of document strings to embed
127
+
128
+ Returns:
129
+ Dict[str, List[float]]: Dictionary mapping model names to lists of embeddings
130
+ """
131
+ if isinstance(documents, str):
132
+ documents = [documents]
133
+
134
+ if isinstance(self.config.dense_model, SentenceTransformerEmbedding):
135
+ dense_embeddings = self.config.dense_model.embed(documents)
136
+ else:
137
+ dense_embeddings = [emb.tolist() for emb in list(self.config.dense_model.embed(documents))]
138
+
139
+ sparse_embeddings = [
140
+ types.SparseVector(
141
+ indices=emb.indices.tolist(),
142
+ values=emb.values.tolist()
143
+ ) for emb in list(self.config.sparse_model.embed(documents))
144
+ ]
145
+
146
+
147
+ late_interaction_embeddings = list(self.config.late_interaction_model.embed(documents))
148
+ late_interaction_embeddings = [emb.tolist() for emb in late_interaction_embeddings]
149
+
150
+ return {
151
+ self.config.DENSE_VECTOR_NAME: dense_embeddings,
152
+ self.config.SPARSE_VECTOR_NAME: sparse_embeddings,
153
+ self.config.LATE_INTERACTION_VECTOR_NAME: late_interaction_embeddings,
154
+ }
155
+
156
+ def _prepare_documents(
157
+ self,
158
+ documents: List[str],
159
+ payloads: List[Dict[str, Any]],
160
+ document_ids: List[uuid.UUID],
161
+ ) -> List[types.PointStruct]:
162
+ """
163
+ Prepare documents for insertion into the Qdrant collection.
164
+
165
+ This method embeds the documents using the configured embedding models and
166
+ creates PointStruct objects that can be inserted into the Qdrant collection.
167
+
168
+ Args:
169
+ documents: List of document strings to embed and insert
170
+ payloads: List of payload dictionaries containing metadata for each document
171
+ document_ids: List of UUIDs to use as IDs for each document
172
+
173
+ Returns:
174
+ List[types.PointStruct]: List of prepared points ready for insertion
175
+
176
+ Raises:
177
+ ValueError: If the lengths of documents, payloads, and document_ids don't match,
178
+ or if multi_tenant is True and a payload is missing the partition field
179
+ """
180
+
181
+ if not (len(documents) == len(payloads) == len(document_ids)):
182
+ raise ValueError(
183
+ "documents, payloads, and document_ids must be the same length"
184
+ )
185
+
186
+ embeddings_dict = self._embed_documents(documents)
187
+
188
+ points = []
189
+ for i in range(len(documents)):
190
+ if self.multi_tenant and self.partition_field_name not in payloads[i]:
191
+ raise ValueError(
192
+ f"payloads must contain {self.partition_field_name} if multi_tenant is True"
193
+ )
194
+ document_id = str(document_ids[i])
195
+ payloads[i]["document"] = documents[i]
196
+ payloads[i]["document_id"] = document_id
197
+ point = PointStruct(
198
+ id=document_id,
199
+ vector={
200
+ vector_name: embeddings_dict[vector_name][i] for vector_name in embeddings_dict
201
+ },
202
+ payload=payloads[i],
203
+ )
204
+ points.append(point)
205
+
206
+ return points
207
+
208
def insert_documents(
    self,
    documents: List[str],
    payloads: List[Dict[str, Any]],
    document_ids: List[uuid.UUID],
    batch_size: int = 100,
):
    """
    Insert documents into the Qdrant collection.

    The three input lists are walked in consecutive slices of at most
    ``batch_size`` items. Each slice is converted into points by
    ``self._prepare_documents`` and written with one ``upsert`` call on
    ``self.qdrant_client``.

    Args:
        documents: List of document strings to embed and insert
        payloads: List of payload dictionaries containing metadata for each document
        document_ids: List of UUIDs to use as IDs for each document
        batch_size: Number of documents to process in each batch (default: 100,
            must be an integer greater than 0)

    Raises:
        ValueError: If the lengths of documents, payloads, and document_ids don't match,
            if batch_size is not an integer greater than 0,
            or if multi_tenant is True and a payload is missing the partition field
    """
    if not (len(documents) == len(payloads) == len(document_ids)):
        raise ValueError(
            "documents, payloads, and document_ids must be the same length"
        )

    # A zero step makes range() fail with an unrelated message, and a negative
    # step yields no batches at all, silently inserting nothing.
    if not isinstance(batch_size, int) or batch_size < 1:
        raise ValueError("batch_size must be an integer greater than 0")

    for start in range(0, len(documents), batch_size):
        stop = start + batch_size
        points = self._prepare_documents(
            documents=documents[start:stop],
            payloads=payloads[start:stop],
            document_ids=document_ids[start:stop],
        )
        self.qdrant_client.upsert(
            collection_name=self.collection_name,
            points=points,
        )
246
+
247
def _embed_query(self, query: str) -> Dict[str, List[float]]:
    """
    Embed a single query string with the dense, sparse and late interaction models.

    Args:
        query: Query string to embed

    Returns:
        Dict[str, List[float]]: Dictionary keyed by the config's vector names.
            The dense entry is the first embedding returned by the dense model,
            the sparse entry is a ``types.SparseVector`` built from the sparse
            model's indices and values, and the late interaction entry is a
            list holding one list per element of that model's first output.
    """
    # SentenceTransformerEmbedding.embed already returns plain lists; the
    # other dense model's first result is converted with .tolist().
    dense_model = self.config.dense_model
    if not isinstance(dense_model, SentenceTransformerEmbedding):
        dense_vector = list(self.config.dense_model.embed([query]))[0].tolist()
    else:
        dense_vector = self.config.dense_model.embed([query])[0]

    raw_sparse = list(self.config.sparse_model.embed([query]))[0]
    sparse_vector = types.SparseVector(
        indices=raw_sparse.indices.tolist(),
        values=raw_sparse.values.tolist(),
    )

    first_late_output = list(self.config.late_interaction_model.embed([query]))[0]
    multivector = []
    for row in first_late_output:
        multivector.append(row.tolist())

    result = {}
    result[self.config.DENSE_VECTOR_NAME] = dense_vector
    result[self.config.SPARSE_VECTOR_NAME] = sparse_vector
    result[self.config.LATE_INTERACTION_VECTOR_NAME] = multivector
    return result
277
+
278
def search(
    self,
    query: str,
    top_k: int = 10,
    partition_filter: Optional[str] = None,
    overquery_factor: float = 1.0,
) -> List[types.ScoredPoint]:
    """
    Search for documents similar to the query using the hybrid approach.

    The query is embedded with all configured models. The dense and sparse
    embeddings are each sent as a prefetch stage limited to ``top_k``, and
    the late interaction embedding is used as the main query of
    ``query_points`` over those prefetch stages.

    Args:
        query: Query string to search for
        top_k: Number of results to return (default: 10)
        partition_filter: Value to match in the partition field for multi-tenant mode
            (must be None if multi_tenant is False). When it is None no
            filter is applied to the prefetch stages.
        overquery_factor: Oversampling factor passed to the quantization search
            parameters of the dense prefetch (default: 1.0, must be >= 1.0)

    Returns:
        List[types.ScoredPoint]: The ``points`` of the query response

    Raises:
        ValueError: If overquery_factor is less than 1.0 or if partition_filter is
            provided when multi_tenant is False
    """
    if overquery_factor < 1.0:
        raise ValueError("overquery_factor must be greater than or equal to 1.0")

    # Compare against None so an empty-string filter is not mistaken for
    # "no filter supplied".
    if not self.multi_tenant and partition_filter is not None:
        raise ValueError("partition_filter must be None if multi_tenant is False")

    # Only build a filter when there is a value to match; constructing
    # MatchValue(value=None) would fail and break every unfiltered search.
    filter_condition = None
    if partition_filter is not None:
        filter_condition = Filter(
            must=[
                FieldCondition(
                    key=self.partition_field_name,
                    match=MatchValue(value=partition_filter),
                )
            ]
        )

    query_embeddings = self._embed_query(query)

    dense_prefetch = Prefetch(
        query=query_embeddings[self.config.DENSE_VECTOR_NAME],
        using=self.config.DENSE_VECTOR_NAME,
        limit=top_k,
        filter=filter_condition,
        params=SearchParams(
            quantization=QuantizationSearchParams(
                ignore=False,
                rescore=True,
                oversampling=overquery_factor,
            ),
        ),
    )

    sparse_prefetch = Prefetch(
        query=query_embeddings[self.config.SPARSE_VECTOR_NAME],
        using=self.config.SPARSE_VECTOR_NAME,
        limit=top_k,
        filter=filter_condition,
    )

    response = self.qdrant_client.query_points(
        collection_name=self.collection_name,
        prefetch=[
            dense_prefetch,
            sparse_prefetch,
        ],
        query=query_embeddings[self.config.LATE_INTERACTION_VECTOR_NAME],
        using=self.config.LATE_INTERACTION_VECTOR_NAME,
        limit=top_k,
        with_payload=True,
    )
    return response.points
357
+
358
def delete_document(self, document_id: str):
    """
    Delete a document from the collection by its ID.

    Points are stored under ``str(document_id)`` when they are prepared for
    insertion, so the same string conversion is applied here before the ID
    is handed to the Qdrant client's ``delete`` call.

    Args:
        document_id: ID of the document to delete
    """
    self.qdrant_client.delete(
        collection_name=self.collection_name,
        points_selector=[str(document_id)],
    )
372
+
@@ -0,0 +1,261 @@
1
+ """
2
+ Configuration module for the hybrid search pipeline.
3
+
4
+ This module provides configuration classes and type definitions for setting up
5
+ hybrid search pipelines that combine dense, sparse, and late interaction embeddings.
6
+ """
7
+
8
+ from typing import ClassVar, List, Mapping, Optional, Tuple, TypeVar, Union
9
+
10
+ from pydantic import BaseModel, model_validator
11
+ from fastembed.late_interaction import LateInteractionTextEmbedding
12
+ from fastembed.sparse import SparseTextEmbedding
13
+ from fastembed.text import TextEmbedding
14
+ from qdrant_client.conversions import common_types as types
15
+ from qdrant_client.models import KeywordIndexParams
16
+ from sentence_transformers import SentenceTransformer
17
+
18
+
19
class SentenceTransformerEmbedding(SentenceTransformer):
    """
    SentenceTransformer subclass exposing ``model_name`` and ``embed``.

    These are the two members the pipeline configuration requires of every
    embedding model it is given.
    """

    def __init__(self, model_name_or_path: str, *args, **kwargs):
        """Remember the model identifier, then initialise the parent class with it."""
        self._model_name_or_path = model_name_or_path
        super().__init__(model_name_or_path, *args, **kwargs)

    @property
    def model_name(self) -> str:
        """Return the identifier this instance was constructed with."""
        return self._model_name_or_path

    def embed(self, texts: List[str], **kwargs) -> List[List[float]]:
        """Encode ``texts`` via ``encode`` and return the result of its ``tolist()``."""
        encoded = self.encode(texts, **kwargs)
        return encoded.tolist()
33
+
34
+
35
# Constrained type variable listing the embedding model classes accepted by
# the pipeline configuration: the three fastembed model types plus the
# SentenceTransformer wrapper defined in this module.
Embedding = TypeVar(
    "Embedding",
    TextEmbedding,
    LateInteractionTextEmbedding,
    SparseTextEmbedding,
    SentenceTransformerEmbedding,
)
"""Type variable for the different types of embedding models supported."""
43
+
44
+
45
# Constrained type variable covering the two vector parameter types that can
# be paired with an embedding model: VectorParams and SparseVectorParams.
BaseVectorParams = TypeVar(
    "BaseVectorParams",
    types.VectorParams,
    types.SparseVectorParams
)
"""Type variable for the different types of vector parameters supported."""
51
+
52
+
53
class HybridPipelineConfig(BaseModel):
    """
    Configuration for a hybrid search pipeline combining multiple embedding types.

    This class encapsulates the configuration for a hybrid search pipeline that combines
    dense embeddings, sparse embeddings, and late interaction embeddings. It also includes
    configuration for multi-tenancy and sharding.

    Attributes:
        text_embedding_config: Configuration for the dense text embedding model.
            A tuple containing a TextEmbedding or SentenceTransformerEmbedding model
            instance and its associated VectorParams.
        sparse_embedding_config: Configuration for the sparse embedding model.
            A tuple containing a SparseTextEmbedding model instance and its associated SparseVectorParams.
        late_interaction_text_embedding_config: Configuration for the late interaction embedding model.
            A tuple containing a LateInteractionTextEmbedding model instance and its associated VectorParams.
        partition_config: Configuration for multi-tenant partitioning.
            A tuple containing the field name to use for partitioning and the KeywordIndexParams
            for the partition field. Required if multi_tenant is True, and must be
            None otherwise.
        multi_tenant: Flag indicating whether the pipeline should support multiple tenants.
            Default is False.
        replication_factor: The number of replicas for each shard in the Qdrant collection.
            Must be an integer greater than 0. Default is 2.
        shard_number: The number of shards for the Qdrant collection.
            Must be an integer greater than 0. Default is 3.
    """
    # Names under which the three vectors are stored on each point.
    DENSE_VECTOR_NAME: ClassVar[str] = "dense"
    SPARSE_VECTOR_NAME: ClassVar[str] = "sparse"
    LATE_INTERACTION_VECTOR_NAME: ClassVar[str] = "multivector"

    text_embedding_config: Tuple[Union[TextEmbedding, SentenceTransformerEmbedding], types.VectorParams]
    sparse_embedding_config: Tuple[SparseTextEmbedding, types.SparseVectorParams]
    late_interaction_text_embedding_config: Tuple[LateInteractionTextEmbedding, types.VectorParams]
    # TODO: Replace PartitionConfig with MultiTenantConfig -> allow user to specify global index or not during collection creation
    partition_config: Optional[Tuple[str, KeywordIndexParams]] = None
    multi_tenant: Optional[bool] = False
    replication_factor: Optional[int] = 2
    shard_number: Optional[int] = 3

    # The embedding models are not pydantic types, so arbitrary classes must be allowed.
    model_config = {
        "arbitrary_types_allowed": True,
    }

    @model_validator(mode='after')
    def _validate_config(self):
        """
        Validate the configuration after model initialization.

        Ensures that the configuration is valid by checking:
        - Multi-tenancy and partition configuration compatibility
        - Replication factor and shard number are valid
        - Embedding models are of the correct type and have required attributes

        Returns:
            self: The validated configuration instance

        Raises:
            ValueError: If any validation check fails
        """
        if self.multi_tenant and self.partition_config is None:
            raise ValueError("partition_config must be provided if multi_tenant is True")
        if not self.multi_tenant and self.partition_config is not None:
            raise ValueError("partition_config must be None if multi_tenant is False")

        if not isinstance(self.replication_factor, int) or self.replication_factor < 1:
            raise ValueError("replication_factor must be an integer greater than 0")

        if not isinstance(self.shard_number, int) or self.shard_number < 1:
            raise ValueError("shard_number must be an integer greater than 0")

        # Expected classes are given as tuples: isinstance() only accepts
        # typing.Union[...] from Python 3.10 onwards and raises TypeError before that.
        expectations = [
            (
                "text_embedding_config",
                self.text_embedding_config,
                (TextEmbedding, SentenceTransformerEmbedding),
                "TextEmbedding or SentenceTransformerEmbedding",
            ),
            (
                "sparse_embedding_config",
                self.sparse_embedding_config,
                (SparseTextEmbedding,),
                "SparseTextEmbedding",
            ),
            (
                "late_interaction_text_embedding_config",
                self.late_interaction_text_embedding_config,
                (LateInteractionTextEmbedding,),
                "LateInteractionTextEmbedding",
            ),
        ]
        for config_name, (model, _), expected_types, expected_label in expectations:
            if not isinstance(model, expected_types):
                raise ValueError(
                    f"Embedding model in {config_name} must be an instance of {expected_label}"
                )

            if not hasattr(model, "model_name"):
                raise ValueError(f"Embedding model in {config_name} must have a 'model_name' attribute")

            if not callable(getattr(model, "embed", None)):
                raise ValueError(f"Embedding model in {config_name} must have an 'embed' method")
        return self

    @property
    def dense_model_config(
        self,
    ) -> Tuple[Union[TextEmbedding, SentenceTransformerEmbedding], types.VectorParams]:
        """Get the dense embedding model configuration."""
        return self.text_embedding_config

    @property
    def sparse_model_config(self) -> Tuple[SparseTextEmbedding, types.SparseVectorParams]:
        """Get the sparse embedding model configuration."""
        return self.sparse_embedding_config

    @property
    def late_interaction_model_config(self) -> Tuple[LateInteractionTextEmbedding, types.VectorParams]:
        """Get the late interaction embedding model configuration."""
        return self.late_interaction_text_embedding_config

    @property
    def dense_model(self) -> Union[TextEmbedding, SentenceTransformerEmbedding]:
        """Get the dense embedding model."""
        return self.dense_model_config[0]

    @property
    def sparse_model(self) -> SparseTextEmbedding:
        """Get the sparse embedding model."""
        return self.sparse_model_config[0]

    @property
    def late_interaction_model(self) -> LateInteractionTextEmbedding:
        """Get the late interaction embedding model."""
        return self.late_interaction_model_config[0]

    @property
    def dense_model_name(self) -> str:
        """Get the name of the dense embedding model."""
        return self.dense_model.model_name

    @property
    def sparse_model_name(self) -> str:
        """Get the name of the sparse embedding model."""
        return self.sparse_model.model_name

    @property
    def late_interaction_model_name(self) -> str:
        """Get the name of the late interaction embedding model."""
        return self.late_interaction_model.model_name

    def list_embedding_configs(self) -> List[Tuple[Embedding, BaseVectorParams]]:
        """
        Get a list of all embedding configurations.

        Returns:
            List[Tuple[Embedding, BaseVectorParams]]: The dense, sparse and late interaction
                configurations, in that order, each a tuple of embedding model and
                its associated vector parameters
        """
        return [
            self.text_embedding_config,
            self.sparse_embedding_config,
            self.late_interaction_text_embedding_config,
        ]

    def list_embedding_model_names(self) -> List[str]:
        """
        Get a list of all embedding model names.

        Returns:
            List[str]: The model names in the same order as list_embedding_configs()
        """
        return [model.model_name for model, _ in self.list_embedding_configs()]

    def list_embedding_models(self) -> List[Embedding]:
        """
        Get a list of all embedding models.

        Returns:
            List[Embedding]: The model instances in the same order as list_embedding_configs()
        """
        return [model for model, _ in self.list_embedding_configs()]

    def get_vectors_config_dict(self) -> Mapping[str, types.VectorParams]:
        """
        Get a dictionary mapping the dense and late interaction vector names to their parameters.

        Returns:
            Mapping[str, types.VectorParams]: Dictionary keyed by DENSE_VECTOR_NAME and
                LATE_INTERACTION_VECTOR_NAME
        """
        return {
            self.DENSE_VECTOR_NAME: self.dense_model_config[1],
            self.LATE_INTERACTION_VECTOR_NAME: self.late_interaction_model_config[1],
        }

    def get_sparse_vectors_config_dict(self) -> Mapping[str, types.SparseVectorParams]:
        """
        Get a dictionary mapping the sparse vector name to its parameters.

        Returns:
            Mapping[str, types.SparseVectorParams]: Dictionary keyed by SPARSE_VECTOR_NAME
        """
        return {
            self.SPARSE_VECTOR_NAME: self.sparse_model_config[1],
        }

    def get_partition_config(self) -> Tuple[str, KeywordIndexParams]:
        """
        Get the partition configuration for multi-tenant setup.

        Returns:
            Tuple[str, KeywordIndexParams]: A tuple containing the partition field name
                and the KeywordIndexParams for that field

        Raises:
            ValueError: If partition_config is not set but this method is called
        """
        if not self.partition_config:
            raise ValueError("partition_config must be specified during instantiation")
        return self.partition_config