hammad-python 0.0.14__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hammad_python-0.0.15.dist-info/METADATA +184 -0
- hammad_python-0.0.15.dist-info/RECORD +4 -0
- hammad/__init__.py +0 -1
- hammad/ai/__init__.py +0 -1
- hammad/ai/_utils.py +0 -142
- hammad/ai/completions/__init__.py +0 -45
- hammad/ai/completions/client.py +0 -684
- hammad/ai/completions/create.py +0 -710
- hammad/ai/completions/settings.py +0 -100
- hammad/ai/completions/types.py +0 -792
- hammad/ai/completions/utils.py +0 -486
- hammad/ai/embeddings/__init__.py +0 -35
- hammad/ai/embeddings/client/__init__.py +0 -1
- hammad/ai/embeddings/client/base_embeddings_client.py +0 -26
- hammad/ai/embeddings/client/fastembed_text_embeddings_client.py +0 -200
- hammad/ai/embeddings/client/litellm_embeddings_client.py +0 -288
- hammad/ai/embeddings/create.py +0 -159
- hammad/ai/embeddings/types.py +0 -69
- hammad/cache/__init__.py +0 -40
- hammad/cache/base_cache.py +0 -181
- hammad/cache/cache.py +0 -169
- hammad/cache/decorators.py +0 -261
- hammad/cache/file_cache.py +0 -80
- hammad/cache/ttl_cache.py +0 -74
- hammad/cli/__init__.py +0 -33
- hammad/cli/animations.py +0 -573
- hammad/cli/plugins.py +0 -781
- hammad/cli/styles/__init__.py +0 -55
- hammad/cli/styles/settings.py +0 -139
- hammad/cli/styles/types.py +0 -358
- hammad/cli/styles/utils.py +0 -480
- hammad/data/__init__.py +0 -56
- hammad/data/collections/__init__.py +0 -34
- hammad/data/collections/base_collection.py +0 -58
- hammad/data/collections/collection.py +0 -452
- hammad/data/collections/searchable_collection.py +0 -556
- hammad/data/collections/vector_collection.py +0 -596
- hammad/data/configurations/__init__.py +0 -35
- hammad/data/configurations/configuration.py +0 -564
- hammad/data/databases/__init__.py +0 -21
- hammad/data/databases/database.py +0 -902
- hammad/data/models/__init__.py +0 -44
- hammad/data/models/base/__init__.py +0 -35
- hammad/data/models/base/fields.py +0 -546
- hammad/data/models/base/model.py +0 -1078
- hammad/data/models/base/utils.py +0 -280
- hammad/data/models/pydantic/__init__.py +0 -55
- hammad/data/models/pydantic/converters.py +0 -632
- hammad/data/models/pydantic/models/__init__.py +0 -28
- hammad/data/models/pydantic/models/arbitrary_model.py +0 -46
- hammad/data/models/pydantic/models/cacheable_model.py +0 -79
- hammad/data/models/pydantic/models/fast_model.py +0 -318
- hammad/data/models/pydantic/models/function_model.py +0 -176
- hammad/data/models/pydantic/models/subscriptable_model.py +0 -63
- hammad/data/types/__init__.py +0 -41
- hammad/data/types/file.py +0 -358
- hammad/data/types/multimodal/__init__.py +0 -24
- hammad/data/types/multimodal/audio.py +0 -96
- hammad/data/types/multimodal/image.py +0 -80
- hammad/data/types/text.py +0 -1066
- hammad/formatting/__init__.py +0 -38
- hammad/formatting/json/__init__.py +0 -21
- hammad/formatting/json/converters.py +0 -152
- hammad/formatting/text/__init__.py +0 -63
- hammad/formatting/text/converters.py +0 -723
- hammad/formatting/text/markdown.py +0 -131
- hammad/formatting/yaml/__init__.py +0 -26
- hammad/formatting/yaml/converters.py +0 -5
- hammad/logging/__init__.py +0 -35
- hammad/logging/decorators.py +0 -834
- hammad/logging/logger.py +0 -954
- hammad/mcp/__init__.py +0 -50
- hammad/mcp/client/__init__.py +0 -1
- hammad/mcp/client/client.py +0 -523
- hammad/mcp/client/client_service.py +0 -393
- hammad/mcp/client/settings.py +0 -178
- hammad/mcp/servers/__init__.py +0 -1
- hammad/mcp/servers/launcher.py +0 -1161
- hammad/performance/__init__.py +0 -36
- hammad/performance/imports.py +0 -231
- hammad/performance/runtime/__init__.py +0 -32
- hammad/performance/runtime/decorators.py +0 -142
- hammad/performance/runtime/run.py +0 -299
- hammad/py.typed +0 -0
- hammad/service/__init__.py +0 -49
- hammad/service/create.py +0 -532
- hammad/service/decorators.py +0 -285
- hammad/typing/__init__.py +0 -407
- hammad/web/__init__.py +0 -43
- hammad/web/http/__init__.py +0 -1
- hammad/web/http/client.py +0 -944
- hammad/web/models.py +0 -245
- hammad/web/openapi/__init__.py +0 -1
- hammad/web/openapi/client.py +0 -740
- hammad/web/search/__init__.py +0 -1
- hammad/web/search/client.py +0 -988
- hammad/web/utils.py +0 -472
- hammad_python-0.0.14.dist-info/METADATA +0 -70
- hammad_python-0.0.14.dist-info/RECORD +0 -99
- {hammad_python-0.0.14.dist-info → hammad_python-0.0.15.dist-info}/WHEEL +0 -0
- {hammad_python-0.0.14.dist-info → hammad_python-0.0.15.dist-info}/licenses/LICENSE +0 -0
@@ -1,556 +0,0 @@
|
|
1
|
-
"""hammad.data.collections.searchable_collection"""
|
2
|
-
|
3
|
-
import uuid
|
4
|
-
import json
|
5
|
-
from typing import Any, Dict, Optional, List, Generic
|
6
|
-
from datetime import datetime, timezone, timedelta
|
7
|
-
from dataclasses import asdict, is_dataclass
|
8
|
-
import tantivy
|
9
|
-
|
10
|
-
from .base_collection import BaseCollection, Object, Filters, Schema
|
11
|
-
|
12
|
-
__all__ = ("SearchableCollection",)
|
13
|
-
|
14
|
-
|
15
|
-
class SearchableCollection(BaseCollection, Generic[Object]):
|
16
|
-
"""
|
17
|
-
Base collection class that can be used independently or with a database.
|
18
|
-
|
19
|
-
This provides the core collection functionality that can work standalone
|
20
|
-
or be integrated with the main Database class.
|
21
|
-
"""
|
22
|
-
|
23
|
-
def __init__(
|
24
|
-
self,
|
25
|
-
name: str,
|
26
|
-
schema: Optional[Schema] = None,
|
27
|
-
default_ttl: Optional[int] = None,
|
28
|
-
storage_backend: Optional[Any] = None,
|
29
|
-
tantivy_config: Optional[Dict[str, Any]] = None,
|
30
|
-
):
|
31
|
-
"""
|
32
|
-
Initialize a collection.
|
33
|
-
|
34
|
-
Args:
|
35
|
-
name: The name of the collection
|
36
|
-
schema: Optional schema for type validation
|
37
|
-
default_ttl: Default TTL for items in seconds
|
38
|
-
storage_backend: Optional storage backend (Database instance or custom)
|
39
|
-
tantivy_config: Optional tantivy configuration for field properties and index settings
|
40
|
-
Example: {
|
41
|
-
"text_fields": {"fast": True, "stored": True},
|
42
|
-
"numeric_fields": {"fast": True, "indexed": True},
|
43
|
-
"writer_heap_size": 256_000_000,
|
44
|
-
"writer_num_threads": 2
|
45
|
-
}
|
46
|
-
"""
|
47
|
-
self.name = name
|
48
|
-
self.schema = schema
|
49
|
-
self.default_ttl = default_ttl
|
50
|
-
self._storage_backend = storage_backend
|
51
|
-
|
52
|
-
# Store tantivy configuration
|
53
|
-
self._tantivy_config = tantivy_config or {}
|
54
|
-
|
55
|
-
# In-memory storage when used independently
|
56
|
-
self._items: Dict[str, Dict[str, Any]] = {}
|
57
|
-
|
58
|
-
# Initialize tantivy index
|
59
|
-
self._init_tantivy_index()
|
60
|
-
|
61
|
-
def _init_tantivy_index(self):
    """Build the tantivy schema, index and writer for this collection.

    Field options come from ``self._tantivy_config`` and fall back to
    built-in defaults when a key is absent. The schema holds the fields
    ``id``, ``content``, ``title``, ``data`` (JSON), ``filters`` (facet),
    ``created_at``, ``expires_at`` and ``score``.

    Sets ``self._tantivy_schema``, ``self._index`` and
    ``self._index_writer``.
    """
    config = self._tantivy_config
    builder = tantivy.SchemaBuilder()

    # Per-type field options, overridable through the config.
    text_opts = config.get("text_fields", {"stored": True, "fast": True})
    numeric_opts = config.get(
        "numeric_fields", {"stored": True, "indexed": True, "fast": True}
    )
    date_opts = config.get(
        "date_fields", {"stored": True, "indexed": True, "fast": True}
    )
    json_opts = config.get("json_fields", {"stored": True})

    # "content" and "title" use the text options plus a fixed tokenizer
    # and index option; these two keys override any value given in
    # the text options.
    searchable_text_opts = dict(
        text_opts, tokenizer_name="default", index_option="position"
    )

    builder.add_text_field("id", **text_opts)
    builder.add_text_field("content", **searchable_text_opts)
    builder.add_text_field("title", **searchable_text_opts)
    builder.add_json_field("data", **json_opts)
    builder.add_facet_field("filters")
    for date_field in ("created_at", "expires_at"):
        builder.add_date_field(date_field, **date_opts)
    builder.add_integer_field("score", **numeric_opts)

    self._tantivy_schema = builder.build()

    # No path is passed, so the index is created in memory.
    self._index = tantivy.Index(self._tantivy_schema)

    # Only forward writer settings that were explicitly configured.
    writer_kwargs = {
        kwarg: config[config_key]
        for config_key, kwarg in (
            ("writer_heap_size", "heap_size"),
            ("writer_num_threads", "num_threads"),
        )
        if config_key in config
    }
    self._index_writer = self._index.writer(**writer_kwargs)

    # The reader is only reconfigured when a non-empty reader_config
    # was supplied.
    reader_opts = config.get("reader_config", {})
    if reader_opts:
        self._index.config_reader(
            reload_policy=reader_opts.get("reload_policy", "commit"),
            num_warmers=reader_opts.get("num_warmers", 0),
        )
|
133
|
-
|
134
|
-
def __repr__(self) -> str:
|
135
|
-
item_count = len(self._items) if self._storage_backend is None else "managed"
|
136
|
-
return f"<{self.__class__.__name__} name='{self.name}' items={item_count}>"
|
137
|
-
|
138
|
-
def _calculate_expires_at(self, ttl: Optional[int]) -> Optional[datetime]:
|
139
|
-
"""Calculate expiry time based on TTL."""
|
140
|
-
if ttl is None:
|
141
|
-
ttl = self.default_ttl
|
142
|
-
if ttl and ttl > 0:
|
143
|
-
return datetime.now(timezone.utc) + timedelta(seconds=ttl)
|
144
|
-
return None
|
145
|
-
|
146
|
-
def _is_expired(self, expires_at: Optional[datetime]) -> bool:
|
147
|
-
"""Check if an item has expired."""
|
148
|
-
if expires_at is None:
|
149
|
-
return False
|
150
|
-
now = datetime.now(timezone.utc)
|
151
|
-
if expires_at.tzinfo is None:
|
152
|
-
expires_at = expires_at.replace(tzinfo=timezone.utc)
|
153
|
-
return now >= expires_at
|
154
|
-
|
155
|
-
def _match_filters(
|
156
|
-
self, stored: Optional[Filters], query: Optional[Filters]
|
157
|
-
) -> bool:
|
158
|
-
"""Check if stored filters match query filters."""
|
159
|
-
if query is None:
|
160
|
-
return True
|
161
|
-
if stored is None:
|
162
|
-
return False
|
163
|
-
return all(stored.get(k) == v for k, v in query.items())
|
164
|
-
|
165
|
-
def get(self, id: str, *, filters: Optional[Filters] = None) -> Optional[Object]:
    """Fetch a single item by its ID.

    Args:
        id: Identifier of the item.
        filters: Optional filters the stored item must match.

    Returns:
        The stored value, or ``None`` when the ID is unknown, the item has
        expired, or its filters do not match. With a storage backend
        attached, the backend's ``get`` result is returned unchanged.
    """
    backend = self._storage_backend
    if backend is not None:
        return backend.get(id, collection=self.name, filters=filters)

    # In-memory path.
    record = self._items.get(id)
    if not record:
        return None

    if self._is_expired(record.get("expires_at")):
        # Expired records are dropped from the in-memory store on access.
        del self._items[id]
        return None

    matches = self._match_filters(record.get("filters"), filters)
    return record["value"] if matches else None
|
184
|
-
|
185
|
-
def _serialize_for_json(self, obj: Any) -> Any:
|
186
|
-
"""Serialize object for JSON storage."""
|
187
|
-
if isinstance(obj, (str, int, float, bool, type(None))):
|
188
|
-
return obj
|
189
|
-
elif isinstance(obj, (list, tuple)):
|
190
|
-
return [self._serialize_for_json(item) for item in obj]
|
191
|
-
elif isinstance(obj, dict):
|
192
|
-
return {k: self._serialize_for_json(v) for k, v in obj.items()}
|
193
|
-
elif is_dataclass(obj):
|
194
|
-
return self._serialize_for_json(asdict(obj))
|
195
|
-
elif hasattr(obj, "__dict__"):
|
196
|
-
return self._serialize_for_json(obj.__dict__)
|
197
|
-
else:
|
198
|
-
return str(obj)
|
199
|
-
|
200
|
-
def add(
    self,
    entry: Object,
    *,
    id: Optional[str] = None,
    filters: Optional[Filters] = None,
    ttl: Optional[int] = None,
) -> None:
    """Store an item and index it for search.

    With a storage backend attached, the call is forwarded to the
    backend's ``add`` and nothing else happens. Otherwise the item is
    kept in ``self._items`` and a document is written to the tantivy
    index and committed.

    Args:
        entry: The value to store.
        id: Optional identifier; a random UUID4 string is generated when
            it is missing or empty.
        filters: Optional key/value pairs stored with the item and indexed
            as facets of the form ``/<key>/<value>``.
        ttl: Optional lifetime in seconds, resolved through
            ``_calculate_expires_at``.
    """
    backend = self._storage_backend
    if backend is not None:
        backend.add(entry, id=id, collection=self.name, filters=filters, ttl=ttl)
        return

    # In-memory path.
    item_id = id or str(uuid.uuid4())
    expires_at = self._calculate_expires_at(ttl)
    created_at = datetime.now(timezone.utc)

    self._items[item_id] = dict(
        value=entry,
        filters=filters or {},
        created_at=created_at,
        updated_at=created_at,
        expires_at=expires_at,
    )

    # Mirror the item into the search index.
    document = tantivy.Document()
    document.add_text("id", item_id)
    document.add_text("content", self._extract_content_for_indexing(entry))

    entry_is_dict = isinstance(entry, dict)
    if entry_is_dict and "title" in entry:
        document.add_text("title", str(entry["title"]))

    # The payload is wrapped under a "value" key before being stored in
    # the JSON field; query() unwraps it again.
    payload = {"value": self._serialize_for_json(entry)}
    document.add_json("data", json.dumps(payload))

    for key, value in (filters or {}).items():
        document.add_facet(
            "filters", tantivy.Facet.from_string(f"/{key}/{value}")
        )

    document.add_date("created_at", created_at)
    if expires_at:
        document.add_date("expires_at", expires_at)

    # Only numeric scores are indexed; they are truncated to an integer.
    if (
        entry_is_dict
        and "score" in entry
        and isinstance(entry["score"], (int, float))
    ):
        document.add_integer("score", int(entry["score"]))

    writer = self._index_writer
    writer.add_document(document)
    writer.commit()
|
269
|
-
|
270
|
-
def _extract_content_for_indexing(self, value: Any) -> str:
|
271
|
-
"""Extract searchable text content from value for indexing."""
|
272
|
-
if isinstance(value, str):
|
273
|
-
return value
|
274
|
-
elif isinstance(value, dict):
|
275
|
-
# Concatenate all string values
|
276
|
-
content_parts = []
|
277
|
-
for v in value.values():
|
278
|
-
if isinstance(v, str):
|
279
|
-
content_parts.append(v)
|
280
|
-
elif isinstance(v, (list, dict)):
|
281
|
-
content_parts.append(json.dumps(v))
|
282
|
-
else:
|
283
|
-
content_parts.append(str(v))
|
284
|
-
return " ".join(content_parts)
|
285
|
-
else:
|
286
|
-
return str(value)
|
287
|
-
|
288
|
-
def query(
    self,
    *,
    filters: Optional[Filters] = None,
    search: Optional[str] = None,
    limit: Optional[int] = None,
    offset: int = 0,
    fields: Optional[List[str]] = None,
    fuzzy: bool = False,
    fuzzy_distance: int = 2,
    fuzzy_transposition_cost_one: bool = True,
    fuzzy_prefix: bool = False,
    phrase: bool = False,
    phrase_slop: int = 0,
    boost_fields: Optional[Dict[str, float]] = None,
    min_score: Optional[float] = None,
    sort_by: Optional[str] = None,
    ascending: bool = True,
    count: bool = True,
    regex_search: Optional[str] = None,
) -> List[Object]:
    """
    Query items from the collection using tantivy search.

    With a storage backend attached, every argument is forwarded to the
    backend's ``query`` and its result is returned unchanged.

    Args:
        filters: Dictionary of filters to apply to results
        search: Search query string supporting boolean operators (AND, OR, NOT, +, -)
        limit: Maximum number of results to return (100 when omitted or 0)
        offset: Number of results to skip (for pagination)
        fields: Specific fields to search in. Used by the parsed-query
            path (defaults to ``content`` and ``title``) and, first entry
            only, by ``regex_search`` (defaults to ``content``). Phrase
            and fuzzy searches always target ``content``.
        fuzzy: Enable fuzzy matching for approximate string matching
        fuzzy_distance: Maximum edit distance for fuzzy matching (default: 2)
        fuzzy_transposition_cost_one: Whether transpositions have cost 1 in fuzzy matching
        fuzzy_prefix: Whether to match only as prefix in fuzzy search
        phrase: Treat search query as exact phrase match
        phrase_slop: Maximum number of words that can appear between phrase terms
        boost_fields: Field-specific score boosting weights (field_name -> boost_factor)
        min_score: Minimum relevance score threshold for results
        sort_by: Field name to sort results by (defaults to relevance score).
            ``score``, ``created_at`` and ``expires_at`` are sorted by
            tantivy; any other field is sorted in Python on the stored
            values.
        ascending: Sort order direction (True for ascending, False for descending)
        count: Whether to count total matches (performance optimization)
        regex_search: Regular expression pattern to search for in specified fields;
            takes precedence over ``search``.

    Returns:
        List of matching objects sorted by relevance or specified field
    """
    if self._storage_backend is not None:
        # Delegate to storage backend with enhanced parameters
        return self._storage_backend.query(
            collection=self.name,
            filters=filters,
            search=search,
            limit=limit,
            offset=offset,
            fields=fields,
            fuzzy=fuzzy,
            fuzzy_distance=fuzzy_distance,
            fuzzy_transposition_cost_one=fuzzy_transposition_cost_one,
            fuzzy_prefix=fuzzy_prefix,
            phrase=phrase,
            phrase_slop=phrase_slop,
            boost_fields=boost_fields,
            min_score=min_score,
            sort_by=sort_by,
            ascending=ascending,
            count=count,
            regex_search=regex_search,
        )

    # Refresh index and get searcher
    self._index.reload()
    searcher = self._index.searcher()

    # Every clause collected here must match (Occur.Must).
    query_parts = []

    if filters:
        for key, value in filters.items():
            facet_query = tantivy.Query.term_query(
                self._tantivy_schema,
                "filters",
                tantivy.Facet.from_string(f"/{key}/{value}"),
            )
            query_parts.append((tantivy.Occur.Must, facet_query))

    if regex_search:
        search_query = tantivy.Query.regex_query(
            self._tantivy_schema, fields[0] if fields else "content", regex_search
        )
        query_parts.append((tantivy.Occur.Must, search_query))
    elif search:
        if phrase:
            search_query = tantivy.Query.phrase_query(
                self._tantivy_schema, "content", search.split(), slop=phrase_slop
            )
        elif fuzzy:
            # One fuzzy clause per whitespace-separated term; a document
            # matching any of them qualifies (Occur.Should).
            fuzzy_queries = [
                (
                    tantivy.Occur.Should,
                    tantivy.Query.fuzzy_term_query(
                        self._tantivy_schema,
                        "content",
                        term,
                        distance=fuzzy_distance,
                        transposition_cost_one=fuzzy_transposition_cost_one,
                        prefix=fuzzy_prefix,
                    ),
                )
                for term in search.split()
            ]
            search_query = tantivy.Query.boolean_query(fuzzy_queries)
        else:
            # Use tantivy's query parser for boolean operators. The
            # field_boosts argument is only passed when boosts were given.
            parse_kwargs = {"default_field_names": fields or ["content", "title"]}
            if boost_fields:
                parse_kwargs["field_boosts"] = boost_fields
            search_query = self._index.parse_query(search, **parse_kwargs)

        query_parts.append((tantivy.Occur.Must, search_query))

    if query_parts:
        final_query = tantivy.Query.boolean_query(query_parts)
    else:
        final_query = tantivy.Query.all_query()

    limit = limit or 100

    # Fields tantivy is asked to sort on directly.
    native_sort = sort_by in ("score", "created_at", "expires_at")
    search_result = None

    if native_sort:
        # NOTE(review): when tantivy orders by a field, the first element
        # of each hit may be that field's value rather than a relevance
        # score, in which case min_score is compared against the field
        # value - confirm against the tantivy-py version in use.
        try:
            search_result = searcher.search(
                final_query,
                limit=limit,
                offset=offset,
                count=count,
                order_by_field=sort_by,
                order=tantivy.Order.Asc if ascending else tantivy.Order.Desc,
            )
        except Exception:
            # Best effort: if tantivy cannot sort on the field, fall back
            # to sorting in Python below.
            native_sort = False

    manual_sort = bool(sort_by) and not native_sort
    # A negative offset must not turn into a from-the-end slice.
    skip = max(offset, 0)

    if search_result is None:
        if manual_sort:
            # Pagination has to happen AFTER the Python sort, so fetch the
            # candidates from the start and slice once they are ordered.
            search_result = searcher.search(
                final_query,
                limit=max(1000, skip + limit),
                offset=0,
                count=count,
            )
        else:
            search_result = searcher.search(
                final_query, limit=limit, offset=offset, count=count
            )

    scored_values = self._collect_hits(searcher, search_result.hits, min_score)

    if manual_sort:

        def sort_key(pair):
            sort_value = self._get_sort_value(pair[1], sort_by)
            # Numbers and strings cannot be compared with each other;
            # grouping by kind first keeps sort() from raising TypeError
            # when some items hold a number and others a string/missing.
            return (isinstance(sort_value, str), sort_value)

        scored_values.sort(key=sort_key, reverse=not ascending)
        scored_values = scored_values[skip : skip + limit]

    return [value for _, value in scored_values]

def _collect_hits(self, searcher: Any, hits: Any, min_score: Optional[float]) -> List[Any]:
    """Load the stored values for search hits, skipping unusable ones.

    Args:
        searcher: The tantivy searcher that produced ``hits``.
        hits: Iterable of ``(score, doc_address)`` pairs.
        min_score: Hits whose score is below this value are skipped;
            ``None`` (or 0) disables the check.

    Returns:
        A list of ``(score, value)`` pairs in hit order, without expired
        documents and without documents that have no stored ``data``.
    """
    collected = []
    for score, doc_address in hits:
        if min_score and score < min_score:
            continue

        doc = searcher.doc(doc_address)

        expires_at = doc.get_first("expires_at")
        if expires_at and self._is_expired(expires_at):
            continue

        data = doc.get_first("data")
        if not data:
            continue

        # The JSON field may come back as text or as an already-parsed
        # object; normalise to the parsed form.
        if isinstance(data, str):
            data = json.loads(data)
        # add() wraps the payload as {"value": ...}; unwrap when present.
        value = data.get("value", data) if isinstance(data, dict) else data
        collected.append((score, value))
    return collected
|
528
|
-
|
529
|
-
def _get_sort_value(self, value: Any, sort_field: str) -> Any:
|
530
|
-
"""Extract sort value from object for specified field."""
|
531
|
-
if isinstance(value, dict):
|
532
|
-
# For dictionaries, return the value or a default that sorts appropriately
|
533
|
-
if sort_field in value:
|
534
|
-
val = value[sort_field]
|
535
|
-
# Handle numeric values properly
|
536
|
-
if isinstance(val, (int, float)):
|
537
|
-
return val
|
538
|
-
return str(val)
|
539
|
-
# Return a value that sorts to the end for missing fields
|
540
|
-
return float("inf") if sort_field == "score" else ""
|
541
|
-
elif hasattr(value, sort_field):
|
542
|
-
val = getattr(value, sort_field)
|
543
|
-
if isinstance(val, (int, float)):
|
544
|
-
return val
|
545
|
-
return str(val)
|
546
|
-
else:
|
547
|
-
# Return a value that sorts to the end for missing fields
|
548
|
-
return float("inf") if sort_field == "score" else ""
|
549
|
-
|
550
|
-
def attach_to_database(self, database: Any) -> None:
    """Use ``database`` as this collection's storage backend.

    After this call ``get``, ``add`` and ``query`` delegate to
    ``database``. The database is also asked to create a collection with
    this collection's name, schema and default TTL.

    Args:
        database: Object providing ``create_collection`` (and the
            ``get``/``add``/``query`` methods used for delegation).
    """
    self._storage_backend = database

    # Ensure the collection exists in the database.
    collection_options = {
        "schema": self.schema,
        "default_ttl": self.default_ttl,
    }
    database.create_collection(self.name, **collection_options)
|