hammad-python 0.0.14__py3-none-any.whl → 0.0.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. hammad/__init__.py +177 -0
  2. hammad/{performance/imports.py → _internal.py} +7 -1
  3. hammad/cache/__init__.py +1 -1
  4. hammad/cli/__init__.py +3 -1
  5. hammad/cli/_runner.py +265 -0
  6. hammad/cli/animations.py +1 -1
  7. hammad/cli/plugins.py +133 -78
  8. hammad/cli/styles/__init__.py +1 -1
  9. hammad/cli/styles/utils.py +149 -3
  10. hammad/data/__init__.py +56 -29
  11. hammad/data/collections/__init__.py +27 -17
  12. hammad/data/collections/collection.py +205 -383
  13. hammad/data/collections/indexes/__init__.py +37 -0
  14. hammad/data/collections/indexes/qdrant/__init__.py +1 -0
  15. hammad/data/collections/indexes/qdrant/index.py +735 -0
  16. hammad/data/collections/indexes/qdrant/settings.py +94 -0
  17. hammad/data/collections/indexes/qdrant/utils.py +220 -0
  18. hammad/data/collections/indexes/tantivy/__init__.py +1 -0
  19. hammad/data/collections/indexes/tantivy/index.py +428 -0
  20. hammad/data/collections/indexes/tantivy/settings.py +51 -0
  21. hammad/data/collections/indexes/tantivy/utils.py +200 -0
  22. hammad/data/configurations/__init__.py +2 -2
  23. hammad/data/configurations/configuration.py +2 -2
  24. hammad/data/models/__init__.py +20 -9
  25. hammad/data/models/extensions/__init__.py +4 -0
  26. hammad/data/models/{pydantic → extensions/pydantic}/__init__.py +6 -19
  27. hammad/data/models/{pydantic → extensions/pydantic}/converters.py +143 -16
  28. hammad/data/models/{base/fields.py → fields.py} +1 -1
  29. hammad/data/models/{base/model.py → model.py} +1 -1
  30. hammad/data/models/{base/utils.py → utils.py} +1 -1
  31. hammad/data/sql/__init__.py +23 -0
  32. hammad/data/sql/database.py +578 -0
  33. hammad/data/sql/types.py +141 -0
  34. hammad/data/types/__init__.py +1 -3
  35. hammad/data/types/file.py +3 -3
  36. hammad/data/types/multimodal/__init__.py +2 -2
  37. hammad/data/types/multimodal/audio.py +2 -2
  38. hammad/data/types/multimodal/image.py +2 -2
  39. hammad/formatting/__init__.py +9 -27
  40. hammad/formatting/json/__init__.py +8 -2
  41. hammad/formatting/json/converters.py +7 -1
  42. hammad/formatting/text/__init__.py +1 -1
  43. hammad/formatting/yaml/__init__.py +1 -1
  44. hammad/genai/__init__.py +78 -0
  45. hammad/genai/agents/__init__.py +1 -0
  46. hammad/genai/agents/types/__init__.py +35 -0
  47. hammad/genai/agents/types/history.py +277 -0
  48. hammad/genai/agents/types/tool.py +490 -0
  49. hammad/genai/embedding_models/__init__.py +41 -0
  50. hammad/{ai/embeddings/client/litellm_embeddings_client.py → genai/embedding_models/embedding_model.py} +47 -142
  51. hammad/genai/embedding_models/embedding_model_name.py +77 -0
  52. hammad/genai/embedding_models/embedding_model_request.py +65 -0
  53. hammad/{ai/embeddings/types.py → genai/embedding_models/embedding_model_response.py} +3 -3
  54. hammad/genai/embedding_models/run.py +161 -0
  55. hammad/genai/language_models/__init__.py +35 -0
  56. hammad/genai/language_models/_streaming.py +622 -0
  57. hammad/genai/language_models/_types.py +276 -0
  58. hammad/genai/language_models/_utils/__init__.py +31 -0
  59. hammad/genai/language_models/_utils/_completions.py +131 -0
  60. hammad/genai/language_models/_utils/_messages.py +89 -0
  61. hammad/genai/language_models/_utils/_requests.py +202 -0
  62. hammad/genai/language_models/_utils/_structured_outputs.py +124 -0
  63. hammad/genai/language_models/language_model.py +734 -0
  64. hammad/genai/language_models/language_model_request.py +135 -0
  65. hammad/genai/language_models/language_model_response.py +219 -0
  66. hammad/genai/language_models/language_model_response_chunk.py +53 -0
  67. hammad/genai/language_models/run.py +530 -0
  68. hammad/genai/multimodal_models.py +48 -0
  69. hammad/genai/rerank_models.py +26 -0
  70. hammad/logging/__init__.py +1 -1
  71. hammad/logging/decorators.py +1 -1
  72. hammad/logging/logger.py +2 -2
  73. hammad/mcp/__init__.py +1 -1
  74. hammad/mcp/client/__init__.py +35 -0
  75. hammad/mcp/client/client.py +105 -4
  76. hammad/mcp/client/client_service.py +10 -3
  77. hammad/mcp/servers/__init__.py +24 -0
  78. hammad/{performance/runtime → runtime}/__init__.py +2 -2
  79. hammad/{performance/runtime → runtime}/decorators.py +1 -1
  80. hammad/{performance/runtime → runtime}/run.py +1 -1
  81. hammad/service/__init__.py +1 -1
  82. hammad/service/create.py +3 -8
  83. hammad/service/decorators.py +8 -8
  84. hammad/typing/__init__.py +28 -0
  85. hammad/web/__init__.py +3 -3
  86. hammad/web/http/client.py +1 -1
  87. hammad/web/models.py +53 -21
  88. hammad/web/search/client.py +99 -52
  89. hammad/web/utils.py +13 -13
  90. hammad_python-0.0.16.dist-info/METADATA +191 -0
  91. hammad_python-0.0.16.dist-info/RECORD +110 -0
  92. hammad/ai/__init__.py +0 -1
  93. hammad/ai/_utils.py +0 -142
  94. hammad/ai/completions/__init__.py +0 -45
  95. hammad/ai/completions/client.py +0 -684
  96. hammad/ai/completions/create.py +0 -710
  97. hammad/ai/completions/settings.py +0 -100
  98. hammad/ai/completions/types.py +0 -792
  99. hammad/ai/completions/utils.py +0 -486
  100. hammad/ai/embeddings/__init__.py +0 -35
  101. hammad/ai/embeddings/client/__init__.py +0 -1
  102. hammad/ai/embeddings/client/base_embeddings_client.py +0 -26
  103. hammad/ai/embeddings/client/fastembed_text_embeddings_client.py +0 -200
  104. hammad/ai/embeddings/create.py +0 -159
  105. hammad/data/collections/base_collection.py +0 -58
  106. hammad/data/collections/searchable_collection.py +0 -556
  107. hammad/data/collections/vector_collection.py +0 -596
  108. hammad/data/databases/__init__.py +0 -21
  109. hammad/data/databases/database.py +0 -902
  110. hammad/data/models/base/__init__.py +0 -35
  111. hammad/data/models/pydantic/models/__init__.py +0 -28
  112. hammad/data/models/pydantic/models/arbitrary_model.py +0 -46
  113. hammad/data/models/pydantic/models/cacheable_model.py +0 -79
  114. hammad/data/models/pydantic/models/fast_model.py +0 -318
  115. hammad/data/models/pydantic/models/function_model.py +0 -176
  116. hammad/data/models/pydantic/models/subscriptable_model.py +0 -63
  117. hammad/performance/__init__.py +0 -36
  118. hammad/py.typed +0 -0
  119. hammad_python-0.0.14.dist-info/METADATA +0 -70
  120. hammad_python-0.0.14.dist-info/RECORD +0 -99
  121. {hammad_python-0.0.14.dist-info → hammad_python-0.0.16.dist-info}/WHEEL +0 -0
  122. {hammad_python-0.0.14.dist-info → hammad_python-0.0.16.dist-info}/licenses/LICENSE +0 -0
@@ -1,452 +1,274 @@
1
1
  """hammad.data.collections.collection"""
2
2
 
3
3
  from typing import (
4
- TYPE_CHECKING,
5
- Literal,
6
- Optional,
7
- overload,
8
4
  Any,
9
- List,
10
5
  Callable,
6
+ Dict,
7
+ List,
8
+ Literal,
9
+ Optional,
10
+ Type,
11
+ TypeVar,
11
12
  Union,
13
+ overload,
14
+ TYPE_CHECKING,
12
15
  )
13
- from typing_extensions import TypedDict
16
+ from pathlib import Path
14
17
 
15
18
  if TYPE_CHECKING:
16
- from .base_collection import BaseCollection
17
- from .searchable_collection import SearchableCollection
18
- from .vector_collection import VectorCollection
19
-
20
-
21
- Distance = Literal[
22
- "cosine",
23
- "euclidean",
24
- "manhattan",
25
- "hamming",
26
- "dot",
27
- "l2",
28
- "l1",
29
- "l2_squared",
30
- "l1_squared",
31
- "cosine_sim",
32
- "euclidean_sim",
33
- "manhattan_sim",
34
- "hamming_sim",
35
- "dot_sim",
36
- ]
37
-
38
-
39
- class SearchableCollectionSettings(TypedDict, total=False):
40
- """Configuration settings for SearchableCollection using tantivy."""
41
-
42
- heap_size: int
43
- num_threads: Optional[int]
44
- index_path: Optional[str]
45
- schema_builder: Optional[Any]
46
- writer_memory: Optional[int]
47
- reload_policy: Optional[str]
48
-
19
+ from .indexes.tantivy.index import TantivyCollectionIndex
20
+ from .indexes.qdrant.index import QdrantCollectionIndex, VectorSearchResult
21
+ from .indexes.tantivy.settings import (
22
+ TantivyCollectionIndexSettings,
23
+ TantivyCollectionIndexQuerySettings,
24
+ )
25
+ from .indexes.qdrant.settings import (
26
+ QdrantCollectionIndexSettings,
27
+ QdrantCollectionIndexQuerySettings,
28
+ DistanceMetric,
29
+ )
30
+ from ..sql.types import DatabaseItemType
31
+ from ...genai.embedding_models.embedding_model_name import EmbeddingModelName
32
+ else:
33
+ from .indexes.tantivy.index import TantivyCollectionIndex
34
+ from .indexes.qdrant.index import QdrantCollectionIndex, VectorSearchResult
49
35
 
50
- class VectorCollectionSettings(TypedDict, total=False):
51
- """Configuration settings for VectorCollection using Qdrant."""
52
36
 
53
- path: Optional[str]
54
- host: Optional[str]
55
- port: Optional[int]
56
- grpc_port: Optional[int]
57
- prefer_grpc: Optional[bool]
58
- api_key: Optional[str]
59
- timeout: Optional[float]
37
+ __all__ = (
38
+ "Collection",
39
+ "VectorSearchResult",
40
+ )
60
41
 
61
42
 
62
43
  class Collection:
63
44
  """
64
- A unified collection factory that creates the appropriate collection type
45
+ A unified collection factory that creates the appropriate collection index type
65
46
  based on the provided parameters.
66
-
47
+
67
48
  This class acts as a factory and doesn't contain its own logic - it simply
68
- returns instances of SearchableCollection or VectorCollection based on the
69
- type parameter.
49
+ returns instances of TantivyCollectionIndex or QdrantCollectionIndex based on the
50
+ vector parameter.
51
+
52
+ The main difference from the old approach is that now collections are 'unified'
53
+ - there's no separate collections interface. Each collection directly uses either
54
+ a Tantivy or Qdrant index with SQL Database as the storage backend.
70
55
  """
71
56
 
72
57
  @overload
73
58
  def __new__(
74
59
  cls,
75
- type: Literal["searchable"],
76
- name: str,
60
+ name: str = "default",
77
61
  *,
78
- schema: Optional[Any] = None,
79
- default_ttl: Optional[int] = None,
80
- storage_backend: Optional[Any] = None,
81
- heap_size: Optional[int] = None,
82
- num_threads: Optional[int] = None,
83
- index_path: Optional[str] = None,
84
- schema_builder: Optional[Any] = None,
85
- writer_memory: Optional[int] = None,
86
- reload_policy: Optional[str] = None,
87
- ) -> "SearchableCollection": ...
62
+ schema: Optional[Type["DatabaseItemType"]] = None,
63
+ ttl: Optional[int] = None,
64
+ path: Optional[Union[Path, str]] = None,
65
+ vector: Literal[False] = False,
66
+ # Tantivy-specific parameters
67
+ fast: bool = True,
68
+ settings: Optional["TantivyCollectionIndexSettings"] = None,
69
+ query_settings: Optional["TantivyCollectionIndexQuerySettings"] = None,
70
+ ) -> "TantivyCollectionIndex": ...
88
71
 
89
72
  @overload
90
73
  def __new__(
91
74
  cls,
92
- type: Literal["vector"],
93
- name: str,
94
- vector_size: int,
75
+ name: str = "default",
95
76
  *,
96
- schema: Optional[Any] = None,
97
- default_ttl: Optional[int] = None,
98
- storage_backend: Optional[Any] = None,
99
- distance_metric: Optional[Any] = None,
100
- embedding_function: Optional[Callable[[Any], List[float]]] = None,
101
- model: Optional[str] = None,
102
- # Common embedding parameters
103
- format: bool = False,
104
- # LiteLLM parameters
105
- dimensions: Optional[int] = None,
106
- encoding_format: Optional[str] = None,
107
- timeout: Optional[int] = None,
108
- api_base: Optional[str] = None,
109
- api_version: Optional[str] = None,
110
- api_key: Optional[str] = None,
111
- api_type: Optional[str] = None,
112
- caching: bool = False,
113
- user: Optional[str] = None,
114
- # FastEmbed parameters
115
- parallel: Optional[int] = None,
116
- batch_size: Optional[int] = None,
117
- # Qdrant parameters
118
- path: Optional[str] = None,
119
- host: Optional[str] = None,
120
- port: Optional[int] = None,
121
- grpc_port: Optional[int] = None,
122
- prefer_grpc: Optional[bool] = None,
123
- qdrant_timeout: Optional[float] = None,
124
- ) -> "VectorCollection": ...
77
+ schema: Optional[Type["DatabaseItemType"]] = None,
78
+ ttl: Optional[int] = None,
79
+ path: Optional[Union[Path, str]] = None,
80
+ vector: Literal[True] = True,
81
+ vector_size: Optional[int] = None,
82
+ # Vector/Qdrant-specific parameters
83
+ distance_metric: "DistanceMetric" = "dot",
84
+ settings: Optional["QdrantCollectionIndexSettings"] = None,
85
+ query_settings: Optional["QdrantCollectionIndexQuerySettings"] = None,
86
+ embedding_model: Optional["EmbeddingModelName"] = "openai/text-embedding-3-small",
87
+ embedding_dimensions: Optional[int] = None,
88
+ embedding_api_key: Optional[str] = None,
89
+ embedding_base_url: Optional[str] = None,
90
+ # Rerank-specific parameters
91
+ rerank_model: Optional[str] = None,
92
+ rerank_api_key: Optional[str] = None,
93
+ rerank_base_url: Optional[str] = None,
94
+ ) -> "QdrantCollectionIndex": ...
125
95
 
126
96
  def __new__(
127
97
  cls,
128
- type: Literal["searchable", "vector"],
129
- name: str,
130
- vector_size: Optional[int] = None,
98
+ name: str = "default",
131
99
  *,
132
- schema: Optional[Any] = None,
133
- default_ttl: Optional[int] = None,
134
- storage_backend: Optional[Any] = None,
135
- distance_metric: Optional[Any] = None,
136
- embedding_function: Optional[Callable[[Any], List[float]]] = None,
137
- model: Optional[str] = None,
138
- # Common embedding parameters
139
- format: bool = False,
140
- # LiteLLM parameters
141
- dimensions: Optional[int] = None,
142
- encoding_format: Optional[str] = None,
143
- timeout: Optional[int] = None,
144
- api_base: Optional[str] = None,
145
- api_version: Optional[str] = None,
146
- api_key: Optional[str] = None,
147
- api_type: Optional[str] = None,
148
- caching: bool = False,
149
- user: Optional[str] = None,
150
- # FastEmbed parameters
151
- parallel: Optional[int] = None,
152
- batch_size: Optional[int] = None,
153
- # Tantivy parameters (searchable collections only)
154
- heap_size: Optional[int] = None,
155
- num_threads: Optional[int] = None,
156
- index_path: Optional[str] = None,
157
- schema_builder: Optional[Any] = None,
158
- writer_memory: Optional[int] = None,
159
- reload_policy: Optional[str] = None,
160
- # Qdrant parameters (vector collections only)
161
- path: Optional[str] = None,
162
- host: Optional[str] = None,
163
- port: Optional[int] = None,
164
- grpc_port: Optional[int] = None,
165
- prefer_grpc: Optional[bool] = None,
166
- qdrant_timeout: Optional[float] = None,
167
- ) -> "BaseCollection":
100
+ schema: Optional[Type["DatabaseItemType"]] = None,
101
+ ttl: Optional[int] = None,
102
+ path: Optional[Union[Path, str]] = None,
103
+ vector: bool = False,
104
+ vector_size: Optional[int] = None,
105
+ # Tantivy-specific parameters
106
+ fast: bool = True,
107
+ # Unified settings parameters
108
+ settings: Optional[Union["TantivyCollectionIndexSettings", "QdrantCollectionIndexSettings"]] = None,
109
+ query_settings: Optional[Union["TantivyCollectionIndexQuerySettings", "QdrantCollectionIndexQuerySettings"]] = None,
110
+ # Vector/Qdrant-specific parameters
111
+ distance_metric: "DistanceMetric" = "dot",
112
+ embedding_model: Optional["EmbeddingModelName"] = "openai/text-embedding-3-small",
113
+ embedding_dimensions: Optional[int] = None,
114
+ embedding_api_key: Optional[str] = None,
115
+ embedding_base_url: Optional[str] = None,
116
+ # Rerank-specific parameters
117
+ rerank_model: Optional[str] = None,
118
+ rerank_api_key: Optional[str] = None,
119
+ rerank_base_url: Optional[str] = None,
120
+ ) -> Union["TantivyCollectionIndex", "QdrantCollectionIndex"]:
168
121
  """
169
122
  Create a collection of the specified type.
170
-
123
+
171
124
  Args:
172
- type: Type of collection to create ("searchable" or "vector")
173
125
  name: Name of the collection
126
+ schema: Optional schema type for validation
127
+ ttl: Default TTL for items in seconds
128
+ path: File path for storage (None = in-memory)
129
+ vector: Whether this is a vector collection (True) or text search collection (False)
174
130
  vector_size: Size of vectors (required for vector collections)
175
- schema: Optional schema for type validation
176
- default_ttl: Default TTL for items in seconds
177
- storage_backend: Optional storage backend
178
- distance_metric: Distance metric for similarity search (vector collections only)
179
- embedding_function: Function to convert objects to vectors (vector collections only)
180
-
181
- Tantivy parameters (searchable collections only):
182
- heap_size: Memory allocation for tantivy heap
183
- num_threads: Number of threads for tantivy operations
184
- index_path: Path to store tantivy index files
185
- schema_builder: Custom schema builder for tantivy
186
- writer_memory: Memory allocation for tantivy writer
187
- reload_policy: Policy for reloading tantivy index
188
-
189
- Qdrant parameters (vector collections only):
190
- path: Path for local Qdrant storage
191
- host: Qdrant server host
192
- port: Qdrant server port
193
- grpc_port: Qdrant gRPC port
194
- prefer_grpc: Whether to prefer gRPC over HTTP
195
- api_key: API key for Qdrant authentication
196
- timeout: Request timeout for Qdrant operations
197
-
131
+
132
+ # Tantivy parameters (for non-vector collections):
133
+ fast: Whether to use fast schema building & indexing
134
+
135
+ # Unified parameters:
136
+ settings: Collection settings (TantivyCollectionIndexSettings or QdrantCollectionIndexSettings)
137
+ query_settings: Query behavior settings (TantivyCollectionIndexQuerySettings or QdrantCollectionIndexQuerySettings)
138
+
139
+ # Qdrant parameters (for vector collections):
140
+ distance_metric: Distance metric for similarity search
141
+ embedding_model: The embedding model to use (e.g., 'openai/text-embedding-3-small')
142
+ embedding_dimensions: Number of dimensions for embeddings
143
+ embedding_api_key: API key for the embedding service
144
+ embedding_base_url: Base URL for the embedding service
145
+
146
+ # Rerank parameters (for vector collections):
147
+ rerank_model: The rerank model to use (e.g., 'cohere/rerank-english-v3.0')
148
+ rerank_api_key: API key for the rerank service
149
+ rerank_base_url: Base URL for the rerank service
150
+
198
151
  Returns:
199
- A SearchableCollection or VectorCollection instance
152
+ A TantivyCollectionIndex or QdrantCollectionIndex instance
200
153
  """
201
- if type == "searchable":
202
- from .searchable_collection import SearchableCollection
203
-
204
- # Build tantivy config from individual parameters
205
- tantivy_config = {}
206
- if heap_size is not None:
207
- tantivy_config["heap_size"] = heap_size
208
- if num_threads is not None:
209
- tantivy_config["num_threads"] = num_threads
210
- if index_path is not None:
211
- tantivy_config["index_path"] = index_path
212
- if schema_builder is not None:
213
- tantivy_config["schema_builder"] = schema_builder
214
- if writer_memory is not None:
215
- tantivy_config["writer_memory"] = writer_memory
216
- if reload_policy is not None:
217
- tantivy_config["reload_policy"] = reload_policy
218
-
219
- return SearchableCollection(
220
- name=name,
221
- schema=schema,
222
- default_ttl=default_ttl,
223
- storage_backend=storage_backend,
224
- tantivy_config=tantivy_config if tantivy_config else None,
225
- )
226
- elif type == "vector":
227
- if vector_size is None:
228
- raise ValueError("vector_size is required for vector collections")
229
-
230
- try:
231
- from .vector_collection import VectorCollection, Distance
232
- except ImportError:
233
- raise ImportError(
234
- "qdrant-client is required for vector collections. "
235
- "Please install it with 'pip install qdrant-client'."
236
- )
237
-
238
- # Set default distance metric if not provided and Distance is available
239
- if distance_metric is None and Distance is not None:
240
- distance_metric = Distance.DOT
241
-
242
- # Build qdrant config from individual parameters
243
- qdrant_config = {}
244
- if path is not None:
245
- qdrant_config["path"] = path
246
- if host is not None:
247
- qdrant_config["host"] = host
248
- if port is not None:
249
- qdrant_config["port"] = port
250
- if grpc_port is not None:
251
- qdrant_config["grpc_port"] = grpc_port
252
- if prefer_grpc is not None:
253
- qdrant_config["prefer_grpc"] = prefer_grpc
254
- if qdrant_timeout is not None:
255
- qdrant_config["timeout"] = qdrant_timeout
256
-
257
- return VectorCollection(
154
+ if vector:
155
+ # Vector collection using Qdrant
156
+ return QdrantCollectionIndex(
258
157
  name=name,
259
158
  vector_size=vector_size,
260
159
  schema=schema,
261
- default_ttl=default_ttl,
262
- storage_backend=storage_backend,
160
+ ttl=ttl,
161
+ path=path,
263
162
  distance_metric=distance_metric,
264
- qdrant_config=qdrant_config if qdrant_config else None,
265
- embedding_function=embedding_function,
266
- model=model,
267
- # Common embedding parameters
268
- format=format,
269
- # LiteLLM parameters
270
- dimensions=dimensions,
271
- encoding_format=encoding_format,
272
- timeout=timeout,
273
- api_base=api_base,
274
- api_version=api_version,
275
- api_key=api_key,
276
- api_type=api_type,
277
- caching=caching,
278
- user=user,
279
- # FastEmbed parameters
280
- parallel=parallel,
281
- batch_size=batch_size,
163
+ settings=settings,
164
+ query_settings=query_settings,
165
+ embedding_model=embedding_model,
166
+ embedding_dimensions=embedding_dimensions,
167
+ embedding_api_key=embedding_api_key,
168
+ embedding_base_url=embedding_base_url,
169
+ rerank_model=rerank_model,
170
+ rerank_api_key=rerank_api_key,
171
+ rerank_base_url=rerank_base_url,
282
172
  )
283
173
  else:
284
- raise ValueError(f"Unsupported collection type: {type}")
285
-
174
+ # Text search collection using Tantivy
175
+ return TantivyCollectionIndex(
176
+ name=name,
177
+ schema=schema,
178
+ ttl=ttl,
179
+ path=path,
180
+ fast=fast,
181
+ settings=settings,
182
+ query_settings=query_settings,
183
+ )
184
+
286
185
 
287
186
  @overload
288
187
  def create_collection(
289
- type: Literal["searchable"],
290
- name: str,
188
+ name: str = "default",
291
189
  *,
292
- schema: Optional[Any] = None,
293
- default_ttl: Optional[int] = None,
294
- storage_backend: Optional[Any] = None,
295
- heap_size: Optional[int] = None,
296
- num_threads: Optional[int] = None,
297
- index_path: Optional[str] = None,
298
- schema_builder: Optional[Any] = None,
299
- writer_memory: Optional[int] = None,
300
- reload_policy: Optional[str] = None,
301
- ) -> "SearchableCollection": ...
302
-
190
+ schema: Optional[Type["DatabaseItemType"]] = None,
191
+ ttl: Optional[int] = None,
192
+ path: Optional[Union[Path, str]] = None,
193
+ vector: Literal[False] = False,
194
+ # Tantivy-specific parameters
195
+ fast: bool = True,
196
+ settings: Optional["TantivyCollectionIndexSettings"] = None,
197
+ query_settings: Optional["TantivyCollectionIndexQuerySettings"] = None,
198
+ ) -> "TantivyCollectionIndex": ...
303
199
 
304
200
  @overload
305
201
  def create_collection(
306
- type: Literal["vector"],
307
- name: str,
308
- vector_size: int,
202
+ name: str = "default",
309
203
  *,
310
- schema: Optional[Any] = None,
311
- default_ttl: Optional[int] = None,
312
- storage_backend: Optional[Any] = None,
313
- distance_metric: Optional[Any] = None,
204
+ schema: Optional[Type["DatabaseItemType"]] = None,
205
+ ttl: Optional[int] = None,
206
+ path: Optional[Union[Path, str]] = None,
207
+ vector: Literal[True],
208
+ vector_size: Optional[int] = None,
209
+ # Vector/Qdrant-specific parameters
210
+ distance_metric: "DistanceMetric" = "dot",
211
+ settings: Optional["QdrantCollectionIndexSettings"] = None,
212
+ query_settings: Optional["QdrantCollectionIndexQuerySettings"] = None,
314
213
  embedding_function: Optional[Callable[[Any], List[float]]] = None,
315
- model: Optional[str] = None,
316
- # Common embedding parameters
317
- format: bool = False,
318
- # LiteLLM parameters
319
- dimensions: Optional[int] = None,
320
- encoding_format: Optional[str] = None,
321
- timeout: Optional[int] = None,
322
- api_base: Optional[str] = None,
323
- api_version: Optional[str] = None,
324
- api_key: Optional[str] = None,
325
- api_type: Optional[str] = None,
326
- caching: bool = False,
327
- user: Optional[str] = None,
328
- # FastEmbed parameters
329
- parallel: Optional[int] = None,
330
- batch_size: Optional[int] = None,
331
- # Qdrant parameters
332
- path: Optional[str] = None,
333
- host: Optional[str] = None,
334
- port: Optional[int] = None,
335
- grpc_port: Optional[int] = None,
336
- prefer_grpc: Optional[bool] = None,
337
- qdrant_timeout: Optional[float] = None,
338
- ) -> "VectorCollection": ...
339
-
214
+ ) -> "QdrantCollectionIndex": ...
340
215
 
341
216
  def create_collection(
342
- type: Literal["searchable", "vector"],
343
- name: str,
344
- vector_size: Optional[int] = None,
217
+ name: str = "default",
345
218
  *,
346
- schema: Optional[Any] = None,
347
- default_ttl: Optional[int] = None,
348
- storage_backend: Optional[Any] = None,
349
- distance_metric: Optional[Any] = None,
219
+ schema: Optional[Type["DatabaseItemType"]] = None,
220
+ ttl: Optional[int] = None,
221
+ path: Optional[Union[Path, str]] = None,
222
+ vector: bool = False,
223
+ vector_size: Optional[int] = None,
224
+ # Tantivy-specific parameters
225
+ fast: bool = True,
226
+ # Unified settings parameters
227
+ settings: Optional[Union["TantivyCollectionIndexSettings", "QdrantCollectionIndexSettings"]] = None,
228
+ query_settings: Optional[Union["TantivyCollectionIndexQuerySettings", "QdrantCollectionIndexQuerySettings"]] = None,
229
+ # Vector/Qdrant-specific parameters
230
+ distance_metric: "DistanceMetric" = "dot",
350
231
  embedding_function: Optional[Callable[[Any], List[float]]] = None,
351
- model: Optional[str] = None,
352
- # Common embedding parameters
353
- format: bool = False,
354
- # LiteLLM parameters
355
- dimensions: Optional[int] = None,
356
- encoding_format: Optional[str] = None,
357
- timeout: Optional[int] = None,
358
- api_base: Optional[str] = None,
359
- api_version: Optional[str] = None,
360
- api_key: Optional[str] = None,
361
- api_type: Optional[str] = None,
362
- caching: bool = False,
363
- user: Optional[str] = None,
364
- # FastEmbed parameters
365
- parallel: Optional[int] = None,
366
- batch_size: Optional[int] = None,
367
- # Tantivy parameters (searchable collections only)
368
- heap_size: Optional[int] = None,
369
- num_threads: Optional[int] = None,
370
- index_path: Optional[str] = None,
371
- schema_builder: Optional[Any] = None,
372
- writer_memory: Optional[int] = None,
373
- reload_policy: Optional[str] = None,
374
- # Qdrant parameters (vector collections only)
375
- path: Optional[str] = None,
376
- host: Optional[str] = None,
377
- port: Optional[int] = None,
378
- grpc_port: Optional[int] = None,
379
- prefer_grpc: Optional[bool] = None,
380
- qdrant_timeout: Optional[float] = None,
381
- ) -> "BaseCollection":
232
+ ) -> Union["TantivyCollectionIndex", "QdrantCollectionIndex"]:
382
233
  """
383
- Create a collection of the specified type.
384
-
385
- This function provides a factory pattern for creating collections.
386
- Use the Collection class for a more object-oriented approach.
387
-
234
+ Create a data collection of the specified type. Collections are a unified
235
+ interface for creating searchable, vectorizable data stores.
236
+
388
237
  Args:
389
- type: Type of collection to create ("searchable" or "vector")
390
238
  name: Name of the collection
239
+ schema: Optional schema type for validation
240
+ ttl: Default TTL for items in seconds
241
+ path: File path for storage (None = in-memory)
242
+ vector: Whether this is a vector collection (True) or text search collection (False)
391
243
  vector_size: Size of vectors (required for vector collections)
392
- schema: Optional schema for type validation
393
- default_ttl: Default TTL for items in seconds
394
- storage_backend: Optional storage backend
395
- distance_metric: Distance metric for similarity search (vector collections only)
396
- embedding_function: Function to convert objects to vectors (vector collections only)
397
-
398
- Tantivy parameters (searchable collections only):
399
- heap_size: Memory allocation for tantivy heap
400
- num_threads: Number of threads for tantivy operations
401
- index_path: Path to store tantivy index files
402
- schema_builder: Custom schema builder for tantivy
403
- writer_memory: Memory allocation for tantivy writer
404
- reload_policy: Policy for reloading tantivy index
405
-
406
- Qdrant parameters (vector collections only):
407
- path: Path for local Qdrant storage
408
- host: Qdrant server host
409
- port: Qdrant server port
410
- grpc_port: Qdrant gRPC port
411
- prefer_grpc: Whether to prefer gRPC over HTTP
412
- api_key: API key for Qdrant authentication
413
- timeout: Request timeout for Qdrant operations
414
-
244
+
245
+ # Tantivy parameters (for non-vector collections):
246
+ fast: Whether to use fast schema building & indexing
247
+
248
+ # Unified parameters:
249
+ settings: Collection settings (TantivyCollectionIndexSettings or QdrantCollectionIndexSettings)
250
+ query_settings: Query behavior settings (TantivyCollectionIndexQuerySettings or QdrantCollectionIndexQuerySettings)
251
+
252
+ # Qdrant parameters (for vector collections):
253
+ distance_metric: Distance metric for similarity search
254
+ embedding_model: The embedding model to use (e.g., 'openai/text-embedding-3-small')
255
+ embedding_dimensions: Number of dimensions for embeddings
256
+ embedding_api_key: API key for the embedding service
257
+ embedding_base_url: Base URL for the embedding service
258
+
415
259
  Returns:
416
- A SearchableCollection or VectorCollection instance
260
+ A TantivyCollectionIndex or QdrantCollectionIndex instance
417
261
  """
418
262
  return Collection(
419
- type=type,
420
263
  name=name,
421
- vector_size=vector_size,
422
264
  schema=schema,
423
- default_ttl=default_ttl,
424
- storage_backend=storage_backend,
265
+ ttl=ttl,
266
+ path=path,
267
+ vector=vector,
268
+ vector_size=vector_size,
269
+ fast=fast,
270
+ settings=settings,
271
+ query_settings=query_settings,
425
272
  distance_metric=distance_metric,
426
273
  embedding_function=embedding_function,
427
- model=model,
428
- format=format,
429
- dimensions=dimensions,
430
- encoding_format=encoding_format,
431
- timeout=timeout,
432
- api_base=api_base,
433
- api_version=api_version,
434
- api_key=api_key,
435
- api_type=api_type,
436
- caching=caching,
437
- user=user,
438
- parallel=parallel,
439
- batch_size=batch_size,
440
- heap_size=heap_size,
441
- num_threads=num_threads,
442
- index_path=index_path,
443
- schema_builder=schema_builder,
444
- writer_memory=writer_memory,
445
- reload_policy=reload_policy,
446
- path=path,
447
- host=host,
448
- port=port,
449
- grpc_port=grpc_port,
450
- prefer_grpc=prefer_grpc,
451
- qdrant_timeout=qdrant_timeout,
452
- )
274
+ )