hammad-python 0.0.30__py3-none-any.whl → 0.0.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ham/__init__.py +200 -0
- {hammad_python-0.0.30.dist-info → hammad_python-0.0.32.dist-info}/METADATA +6 -32
- hammad_python-0.0.32.dist-info/RECORD +6 -0
- hammad/__init__.py +0 -84
- hammad/_internal.py +0 -256
- hammad/_main.py +0 -226
- hammad/cache/__init__.py +0 -40
- hammad/cache/base_cache.py +0 -181
- hammad/cache/cache.py +0 -169
- hammad/cache/decorators.py +0 -261
- hammad/cache/file_cache.py +0 -80
- hammad/cache/ttl_cache.py +0 -74
- hammad/cli/__init__.py +0 -33
- hammad/cli/animations.py +0 -573
- hammad/cli/plugins.py +0 -867
- hammad/cli/styles/__init__.py +0 -55
- hammad/cli/styles/settings.py +0 -139
- hammad/cli/styles/types.py +0 -358
- hammad/cli/styles/utils.py +0 -634
- hammad/data/__init__.py +0 -90
- hammad/data/collections/__init__.py +0 -49
- hammad/data/collections/collection.py +0 -326
- hammad/data/collections/indexes/__init__.py +0 -37
- hammad/data/collections/indexes/qdrant/__init__.py +0 -1
- hammad/data/collections/indexes/qdrant/index.py +0 -723
- hammad/data/collections/indexes/qdrant/settings.py +0 -94
- hammad/data/collections/indexes/qdrant/utils.py +0 -210
- hammad/data/collections/indexes/tantivy/__init__.py +0 -1
- hammad/data/collections/indexes/tantivy/index.py +0 -426
- hammad/data/collections/indexes/tantivy/settings.py +0 -40
- hammad/data/collections/indexes/tantivy/utils.py +0 -176
- hammad/data/configurations/__init__.py +0 -35
- hammad/data/configurations/configuration.py +0 -564
- hammad/data/models/__init__.py +0 -50
- hammad/data/models/extensions/__init__.py +0 -4
- hammad/data/models/extensions/pydantic/__init__.py +0 -42
- hammad/data/models/extensions/pydantic/converters.py +0 -759
- hammad/data/models/fields.py +0 -546
- hammad/data/models/model.py +0 -1078
- hammad/data/models/utils.py +0 -280
- hammad/data/sql/__init__.py +0 -24
- hammad/data/sql/database.py +0 -576
- hammad/data/sql/types.py +0 -127
- hammad/data/types/__init__.py +0 -75
- hammad/data/types/file.py +0 -431
- hammad/data/types/multimodal/__init__.py +0 -36
- hammad/data/types/multimodal/audio.py +0 -200
- hammad/data/types/multimodal/image.py +0 -182
- hammad/data/types/text.py +0 -1308
- hammad/formatting/__init__.py +0 -33
- hammad/formatting/json/__init__.py +0 -27
- hammad/formatting/json/converters.py +0 -158
- hammad/formatting/text/__init__.py +0 -63
- hammad/formatting/text/converters.py +0 -723
- hammad/formatting/text/markdown.py +0 -131
- hammad/formatting/yaml/__init__.py +0 -26
- hammad/formatting/yaml/converters.py +0 -5
- hammad/genai/__init__.py +0 -217
- hammad/genai/a2a/__init__.py +0 -32
- hammad/genai/a2a/workers.py +0 -552
- hammad/genai/agents/__init__.py +0 -59
- hammad/genai/agents/agent.py +0 -1973
- hammad/genai/agents/run.py +0 -1024
- hammad/genai/agents/types/__init__.py +0 -42
- hammad/genai/agents/types/agent_context.py +0 -13
- hammad/genai/agents/types/agent_event.py +0 -128
- hammad/genai/agents/types/agent_hooks.py +0 -220
- hammad/genai/agents/types/agent_messages.py +0 -31
- hammad/genai/agents/types/agent_response.py +0 -125
- hammad/genai/agents/types/agent_stream.py +0 -327
- hammad/genai/graphs/__init__.py +0 -125
- hammad/genai/graphs/_utils.py +0 -190
- hammad/genai/graphs/base.py +0 -1828
- hammad/genai/graphs/plugins.py +0 -316
- hammad/genai/graphs/types.py +0 -638
- hammad/genai/models/__init__.py +0 -1
- hammad/genai/models/embeddings/__init__.py +0 -43
- hammad/genai/models/embeddings/model.py +0 -226
- hammad/genai/models/embeddings/run.py +0 -163
- hammad/genai/models/embeddings/types/__init__.py +0 -37
- hammad/genai/models/embeddings/types/embedding_model_name.py +0 -75
- hammad/genai/models/embeddings/types/embedding_model_response.py +0 -76
- hammad/genai/models/embeddings/types/embedding_model_run_params.py +0 -66
- hammad/genai/models/embeddings/types/embedding_model_settings.py +0 -47
- hammad/genai/models/language/__init__.py +0 -57
- hammad/genai/models/language/model.py +0 -1098
- hammad/genai/models/language/run.py +0 -878
- hammad/genai/models/language/types/__init__.py +0 -40
- hammad/genai/models/language/types/language_model_instructor_mode.py +0 -47
- hammad/genai/models/language/types/language_model_messages.py +0 -28
- hammad/genai/models/language/types/language_model_name.py +0 -239
- hammad/genai/models/language/types/language_model_request.py +0 -127
- hammad/genai/models/language/types/language_model_response.py +0 -217
- hammad/genai/models/language/types/language_model_response_chunk.py +0 -56
- hammad/genai/models/language/types/language_model_settings.py +0 -89
- hammad/genai/models/language/types/language_model_stream.py +0 -600
- hammad/genai/models/language/utils/__init__.py +0 -28
- hammad/genai/models/language/utils/requests.py +0 -421
- hammad/genai/models/language/utils/structured_outputs.py +0 -135
- hammad/genai/models/model_provider.py +0 -4
- hammad/genai/models/multimodal.py +0 -47
- hammad/genai/models/reranking.py +0 -26
- hammad/genai/types/__init__.py +0 -1
- hammad/genai/types/base.py +0 -215
- hammad/genai/types/history.py +0 -290
- hammad/genai/types/tools.py +0 -507
- hammad/logging/__init__.py +0 -35
- hammad/logging/decorators.py +0 -834
- hammad/logging/logger.py +0 -1018
- hammad/mcp/__init__.py +0 -53
- hammad/mcp/client/__init__.py +0 -35
- hammad/mcp/client/client.py +0 -624
- hammad/mcp/client/client_service.py +0 -400
- hammad/mcp/client/settings.py +0 -178
- hammad/mcp/servers/__init__.py +0 -26
- hammad/mcp/servers/launcher.py +0 -1161
- hammad/runtime/__init__.py +0 -32
- hammad/runtime/decorators.py +0 -142
- hammad/runtime/run.py +0 -299
- hammad/service/__init__.py +0 -49
- hammad/service/create.py +0 -527
- hammad/service/decorators.py +0 -283
- hammad/types.py +0 -288
- hammad/typing/__init__.py +0 -435
- hammad/web/__init__.py +0 -43
- hammad/web/http/__init__.py +0 -1
- hammad/web/http/client.py +0 -944
- hammad/web/models.py +0 -275
- hammad/web/openapi/__init__.py +0 -1
- hammad/web/openapi/client.py +0 -740
- hammad/web/search/__init__.py +0 -1
- hammad/web/search/client.py +0 -1023
- hammad/web/utils.py +0 -472
- hammad_python-0.0.30.dist-info/RECORD +0 -135
- {hammad → ham}/py.typed +0 -0
- {hammad_python-0.0.30.dist-info → hammad_python-0.0.32.dist-info}/WHEEL +0 -0
- {hammad_python-0.0.30.dist-info → hammad_python-0.0.32.dist-info}/licenses/LICENSE +0 -0
@@ -1,426 +0,0 @@
|
|
1
|
-
"""hammad.data.collections.indexes.tantivy.index"""
|
2
|
-
|
3
|
-
from datetime import datetime, timezone, timedelta
|
4
|
-
from typing import Any, Dict, Generic, List, Optional, Type, final
|
5
|
-
import uuid
|
6
|
-
from pathlib import Path
|
7
|
-
import json
|
8
|
-
|
9
|
-
import tantivy
|
10
|
-
|
11
|
-
from ....sql.types import (
|
12
|
-
DatabaseItemType,
|
13
|
-
DatabaseItemFilters,
|
14
|
-
DatabaseItem,
|
15
|
-
)
|
16
|
-
from ....sql.database import Database
|
17
|
-
from . import utils
|
18
|
-
from .settings import (
|
19
|
-
TantivyCollectionIndexSettings,
|
20
|
-
TantivyCollectionIndexQuerySettings,
|
21
|
-
)
|
22
|
-
|
23
|
-
|
24
|
-
@final
class TantivyCollectionIndex(Generic[DatabaseItemType]):
    """A standalone (simplified) index that can be used as the storage /
    search engine for a collection, built on the `tantivy` package.

    Items are persisted through a `Database`. For every added item a
    `tantivy` document (id, extracted text content, optional title / score
    and filter facets) is written as well, so the item can be found with a
    full-text query. Search hits are resolved back to database rows by id,
    which makes the database the source of truth for what is returned.

    This collection index is built into the core dependencies of the
    `hammad-python` package, and is the default index used by the
    `Collection` class.
    """

    def __init__(
        self,
        *,
        name: str = "default",
        schema: Optional[Type[DatabaseItemType]] = None,
        ttl: Optional[int] = None,
        path: Optional[Path | str] = None,
        fast: bool = True,
        settings: Optional[TantivyCollectionIndexSettings] = None,
        query_settings: Optional[TantivyCollectionIndexQuerySettings] = None,
    ) -> None:
        """Initialize a new `TantivyCollectionIndex`.

        Args:
            name: The name of the index.
            schema: The schema of the items that can be stored
                within this index.
            ttl: The time to live for the items within this index.
            path: The path to the directory where the index will be stored.
                (If not provided, the collection will be built in memory.)
            fast: Whether to use fast schema building & indexing
                from `tantivy`'s builtin implementation. Only used when
                `settings` is not provided.
            settings: Default settings to use for indexing & schema
                building.
            query_settings: Default settings to use for the query
                engine.

        Raises:
            utils.TantivyCollectionIndexError: If the tantivy index cannot
                be built from the settings.
        """
        self.name = name
        self.schema = schema
        self.ttl = ttl

        if path is not None and not isinstance(path, Path):
            path = Path(path)

        # Directory for the backing database file (`<path>/<name>.db`);
        # `None` means no file path is handed to the database.
        self.path = path

        if not settings:
            settings = TantivyCollectionIndexSettings(
                fast=fast,
            )

        if not query_settings:
            query_settings = TantivyCollectionIndexQuerySettings()

        # Default settings used for indexing and schema building.
        self.settings = settings

        # Default settings used when querying this index.
        self.query_settings = query_settings

        # Initialize SQL Database as storage backend
        database_path = None
        if self.path is not None:
            database_path = self.path / f"{name}.db"

        self._database = Database[DatabaseItemType](
            name=name,
            schema=schema,
            ttl=ttl,
            path=database_path,
            table_name=f"tantivy_{name}",
        )

        try:
            self._tantivy_wrapper = utils.build_tantivy_index_from_settings(
                settings=settings
            )
            self._index = self._tantivy_wrapper.index
            self._schema = self._tantivy_wrapper.schema
            self._writer = self._tantivy_wrapper.index_writer
        except Exception as e:
            raise utils.TantivyCollectionIndexError(
                f"Failed to build tantivy index from settings: {e}"
            ) from e

    def add(
        self,
        item: DatabaseItemType,
        *,
        id: Optional[str] = None,
        filters: Optional[DatabaseItemFilters] = None,
        ttl: Optional[int] = None,
    ) -> str:
        """Add a new item to the index.

        Args:
            item: The item to add to the index.
            id: The id of the item.
            filters: The filters to apply to the item.
            ttl: The time to live for the item.

        Returns:
            The ID of the added item.
        """
        # The database assigns / confirms the id, so it is written first.
        item_id = self._database.add(
            item=item,
            id=id,
            filters=filters,
            ttl=ttl,
        )

        # Mirror the item into the tantivy index so it becomes searchable.
        self._add_to_tantivy_index(item_id, item, filters)

        return item_id

    def _add_to_tantivy_index(
        self,
        item_id: str,
        item: DatabaseItemType,
        filters: Optional[DatabaseItemFilters] = None,
    ) -> None:
        """Write a search document for `item` and commit it.

        Args:
            item_id: The id the item is stored under in the database.
            item: The item whose content is indexed.
            filters: Optional filters, stored as `/<key>/<value>` facets.
        """
        doc = tantivy.Document()

        doc.add_text("id", item_id)
        doc.add_text("content", utils.extract_content_for_indexing(item))

        # A title is only indexed for dict items that carry one.
        if isinstance(item, dict) and "title" in item:
            doc.add_text("title", str(item["title"]))

        # Store the full data as JSON in tantivy
        json_data = {"value": utils.serialize(item)}
        doc.add_json("data", json.dumps(json_data))

        # Add filters as facets
        if filters:
            for key, value in filters.items():
                facet_value = f"/{key}/{value}"
                doc.add_facet("filters", tantivy.Facet.from_string(facet_value))

        doc.add_date("created_at", datetime.now(timezone.utc))

        # A numeric `score` on dict items is truncated to an integer field.
        if (
            isinstance(item, dict)
            and "score" in item
            and isinstance(item["score"], (int, float))
        ):
            doc.add_integer("score", int(item["score"]))

        self._writer.add_document(doc)
        self._writer.commit()

    def get(
        self,
        id: str,
        *,
        filters: Optional[DatabaseItemFilters] = None,
    ) -> Optional[DatabaseItem[DatabaseItemType]]:
        """Get an item by ID.

        Args:
            id: The item ID.
            filters: Optional filters to match.

        Returns:
            The database item or None if not found.
        """
        return self._database.get(id, filters=filters)

    def _build_text_query(
        self,
        query: str,
        *,
        fuzzy: bool,
        fuzzy_distance: int,
        phrase: bool,
        phrase_slop: int,
        boost_fields: Optional[Dict[str, float]],
    ) -> Any:
        """Build the tantivy query for the text part of a search.

        `phrase` takes precedence over `fuzzy`; when neither is set the
        query string is handed to tantivy's query parser.
        """
        if phrase:
            return tantivy.Query.phrase_query(
                self._schema, "content", query.split(), slop=phrase_slop
            )

        if fuzzy:
            # One optional (Should) fuzzy clause per whitespace-separated term.
            fuzzy_queries = [
                (
                    tantivy.Occur.Should,
                    tantivy.Query.fuzzy_term_query(
                        self._schema,
                        "content",
                        term,
                        distance=fuzzy_distance,
                    ),
                )
                for term in query.split()
            ]
            return tantivy.Query.boolean_query(fuzzy_queries)

        # `field_boosts` is only passed when boosts were actually requested.
        if boost_fields:
            return self._index.parse_query(
                query,
                default_field_names=["content", "title"],
                field_boosts=boost_fields,
            )
        return self._index.parse_query(
            query, default_field_names=["content", "title"]
        )

    def query(
        self,
        query: Optional[str] = None,
        *,
        filters: Optional[DatabaseItemFilters] = None,
        limit: Optional[int] = None,
        offset: int = 0,
        fuzzy: bool = False,
        fuzzy_distance: int = 2,
        phrase: bool = False,
        phrase_slop: int = 0,
        boost_fields: Optional[Dict[str, float]] = None,
        min_score: Optional[float] = None,
        sort_by: Optional[str] = None,
        ascending: bool = True,
    ) -> List[DatabaseItem[DatabaseItemType]]:
        """Query items using tantivy search.

        Args:
            query: Search query string. When empty, the database is
                queried directly instead of tantivy.
            filters: Dictionary of filters to apply.
            limit: Maximum number of results. Falls back to
                `query_settings.limit` for text searches.
            offset: Number of results to skip.
            fuzzy: Enable fuzzy matching.
            fuzzy_distance: Maximum edit distance for fuzzy matching.
            phrase: Treat query as exact phrase match.
            phrase_slop: Max words between phrase terms.
            boost_fields: Field-specific score boosting.
            min_score: Minimum relevance score threshold.
            sort_by: Field to sort by (only used when `query` is empty).
            ascending: Sort direction (only used when `query` is empty).

        Returns:
            List of matching database items. Each item appears at most once.
        """
        if not query:
            # No search query - use database query directly.
            # NOTE(review): `filters` is not forwarded on this path, so the
            # result is unfiltered - confirm whether `Database.query` can
            # take a filter and whether it should be passed here.
            return self._database.query(
                limit=limit,
                offset=offset,
                order_by=sort_by,
                ascending=ascending,
            )

        # Make committed documents visible before searching.
        self._index.reload()
        searcher = self._index.searcher()

        # Every filter becomes a mandatory facet clause.
        clauses = [
            (
                tantivy.Occur.Must,
                tantivy.Query.term_query(
                    self._schema,
                    "filters",
                    tantivy.Facet.from_string(f"/{key}/{value}"),
                ),
            )
            for key, value in (filters or {}).items()
        ]
        clauses.append(
            (
                tantivy.Occur.Must,
                self._build_text_query(
                    query,
                    fuzzy=fuzzy,
                    fuzzy_distance=fuzzy_distance,
                    phrase=phrase,
                    phrase_slop=phrase_slop,
                    boost_fields=boost_fields,
                ),
            )
        )
        final_query = tantivy.Query.boolean_query(clauses)

        search_result = searcher.search(
            final_query,
            limit=limit or self.query_settings.limit,
            offset=offset,
        )

        # Resolve hits to database rows. Ids are de-duplicated because the
        # search index may hold more than one document for the same id
        # (documents are never removed from it on delete / re-add).
        seen_ids = set()
        results = []
        for score, doc_address in search_result.hits:
            # `is not None` so that an explicit threshold of 0 is honoured.
            if min_score is not None and score < min_score:
                continue

            item_id = searcher.doc(doc_address).get_first("id")
            if not item_id or item_id in seen_ids:
                continue
            seen_ids.add(item_id)

            db_item = self._database.get(item_id, filters=filters)
            if db_item:
                results.append(db_item)

        return results

    def delete(self, id: str) -> bool:
        """Delete an item by ID.

        Only the database row is removed; the matching tantivy document is
        left in place. `query` looks every hit up in the database, so stale
        documents never produce results.

        Args:
            id: The item ID.

        Returns:
            True if item was deleted, False if not found.
        """
        return self._database.delete(id)

    def count(
        self,
        query: Optional[str] = None,
        *,
        filters: Optional[DatabaseItemFilters] = None,
    ) -> int:
        """Count items matching the query and filters.

        Args:
            query: Search query string.
            filters: Dictionary of filters to apply.

        Returns:
            Number of matching items.
        """
        if not query:
            # Simple count from database
            from ....sql.types import QueryFilter, QueryCondition

            query_filter = None
            if filters:
                conditions = [
                    QueryCondition(
                        field="filters", operator="contains", value=json.dumps(filters)
                    )
                ]
                query_filter = QueryFilter(conditions=conditions)

            return self._database.count(query_filter)

        # Count via search results. The number of indexed documents is used
        # as the limit: passing `limit=None` would fall back to the default
        # page size in `query` and cap the count at that value.
        self._index.reload()
        total_documents = self._index.searcher().num_docs
        if not total_documents:
            return 0
        return len(self.query(query, filters=filters, limit=total_documents))

    def clear(self) -> int:
        """Clear all items from the index.

        The database is cleared first, then the tantivy index is replaced by
        a freshly built one. A failed rebuild is reported as a
        `RuntimeWarning` rather than raised, because the database (the
        source of truth) has already been cleared at that point.

        Returns:
            Number of items deleted.
        """
        import warnings

        count = self._database.clear()

        try:
            wrapper = utils.build_tantivy_index_from_settings(
                settings=self.settings
            )
        except Exception as e:
            warnings.warn(
                f"Failed to rebuild tantivy index '{self.name}' after clear; "
                f"stale search documents remain: {e}",
                RuntimeWarning,
                stacklevel=2,
            )
        else:
            self._tantivy_wrapper = wrapper
            self._index = wrapper.index
            self._schema = wrapper.schema
            self._writer = wrapper.index_writer

        return count

    def __repr__(self) -> str:
        """String representation of the index."""
        location = str(self.path) if self.path else "memory"
        return f"<TantivyCollectionIndex name='{self.name}' location='{location}'>"
|
@@ -1,40 +0,0 @@
|
|
1
|
-
"""hammad.data.collections.indexes.tantivy.settings"""
|
2
|
-
|
3
|
-
from dataclasses import dataclass
|
4
|
-
from typing import (
|
5
|
-
Any,
|
6
|
-
Dict,
|
7
|
-
)
|
8
|
-
|
9
|
-
__all__ = ("TantivyCollectionIndexSettings", "TantivyCollectionIndexQuerySettings")
|
10
|
-
|
11
|
-
|
12
|
-
@dataclass
class TantivyCollectionIndexSettings:
    """User configurable settings for building a `TantivyCollectionIndex`.

    Attributes:
        fast: Whether to use fast schema building & indexing from
            `tantivy`'s builtin implementation.
    """

    fast: bool = True

    def get_tantivy_config(self) -> Dict[str, Any]:
        """Return the per-field-kind option dictionaries used to configure
        the tantivy index internally.

        Returns:
            A dictionary with the keys `text_fields`, `numeric_fields`,
            `date_fields` and `json_fields`, each mapping to the keyword
            options for that kind of field.
        """
        use_fast = self.fast

        text_options = {"stored": True, "fast": use_fast}
        numeric_options = {"stored": True, "indexed": True, "fast": use_fast}
        date_options = {"stored": True, "indexed": True, "fast": use_fast}
        json_options = {"stored": True}

        return {
            "text_fields": text_options,
            "numeric_fields": numeric_options,
            "date_fields": date_options,
            "json_fields": json_options,
        }
|
31
|
-
|
32
|
-
|
33
|
-
@dataclass
class TantivyCollectionIndexQuerySettings:
    """User configurable settings for the query engine of a
    `TantivyCollectionIndex`.

    Attributes:
        limit: The maximum number of results to return.
    """

    limit: int = 10
|
@@ -1,176 +0,0 @@
|
|
1
|
-
"""hammad.data.collections.indexes.tantivy.utils"""
|
2
|
-
|
3
|
-
from dataclasses import dataclass, is_dataclass, asdict
|
4
|
-
from msgspec import json
|
5
|
-
from typing import Any, Dict, List, Optional, final
|
6
|
-
|
7
|
-
import tantivy
|
8
|
-
|
9
|
-
from .....cache import cached
|
10
|
-
from .settings import (
|
11
|
-
TantivyCollectionIndexSettings,
|
12
|
-
TantivyCollectionIndexQuerySettings,
|
13
|
-
)
|
14
|
-
|
15
|
-
|
16
|
-
__all__ = (
|
17
|
-
"TantivyCollectionIndexError",
|
18
|
-
"extract_content_for_indexing",
|
19
|
-
)
|
20
|
-
|
21
|
-
|
22
|
-
class TantivyCollectionIndexError(Exception):
    """Error raised when an operation of the `TantivyCollectionIndex` fails."""
|
24
|
-
|
25
|
-
|
26
|
-
@dataclass
class TantivyIndexWrapper:
    """Bundle of the objects that make up one `tantivy` index.

    Attributes:
        index: The `tantivy` index object.
        schema: The `tantivy` schema object.
        index_writer: The `tantivy` index writer object.
    """

    index: tantivy.Index
    schema: tantivy.Schema
    index_writer: Any
|
38
|
-
|
39
|
-
|
40
|
-
@cached
def match_filters_for_query(
    stored_filters: Dict[str, Any] | None = None,
    query_filters: Dict[str, Any] | None = None,
) -> bool:
    """Check whether stored filters satisfy the query filters.

    Args:
        stored_filters: Filters saved alongside an item, if any.
        query_filters: Filters requested by a query, if any.

    Returns:
        True when no query filters are given, or when every query filter
        key maps to an equal value in the stored filters. False when query
        filters are given but nothing is stored, or any value differs.
    """
    # No constraints requested: everything matches.
    if query_filters is None:
        return True

    # Constraints requested but nothing stored to compare against.
    if stored_filters is None:
        return False

    for key, expected in query_filters.items():
        if not (stored_filters.get(key) == expected):
            return False
    return True
|
51
|
-
|
52
|
-
|
53
|
-
@cached
def serialize(obj: Any) -> Any:
    """Convert an object into JSON-compatible builtin values.

    The object is first round-tripped through the `json` encoder / decoder
    imported by this module. If that fails for any reason, a manual
    conversion is used instead: scalars are returned unchanged, lists /
    tuples / dicts are converted element by element, dataclass instances go
    through `asdict`, other objects through their `__dict__`, and everything
    else is turned into its `str()` form.

    Args:
        obj: The object to serialize.

    Returns:
        A JSON-compatible representation of `obj`.
    """
    try:
        return json.decode(json.encode(obj))
    except Exception:
        # Fallback to manual serialization if the encoder fails
        if isinstance(obj, (str, int, float, bool, type(None))):
            return obj
        elif isinstance(obj, (list, tuple)):
            return [serialize(item) for item in obj]
        elif isinstance(obj, dict):
            return {k: serialize(v) for k, v in obj.items()}
        elif is_dataclass(obj) and not isinstance(obj, type):
            # `is_dataclass` is also true for dataclass *types*, but
            # `asdict` only accepts instances and raises TypeError for a
            # class; classes are handled by the branches below instead.
            return serialize(asdict(obj))
        elif hasattr(obj, "__dict__"):
            return serialize(obj.__dict__)
        else:
            return str(obj)
|
72
|
-
|
73
|
-
|
74
|
-
def build_tantivy_index_from_settings(
    settings: TantivyCollectionIndexSettings,
) -> TantivyIndexWrapper:
    """Build a new in-memory `tantivy` index from the given settings.

    This function is intentionally not memoised: it returns a mutable index
    together with its writer, and every caller needs its own fresh instance
    (sharing one would mix documents between indexes and make "rebuild to
    clear" hand back the old, still populated index).

    Args:
        settings: Settings providing the per-field-kind options through
            `get_tantivy_config()`.

    Returns:
        A `TantivyIndexWrapper` holding the schema, the index and an index
        writer.
    """
    # Compute the configuration once instead of once per field.
    config = settings.get_tantivy_config()
    text_options = config["text_fields"]
    # Searchable text fields additionally record token positions.
    searchable_text_options = {
        **text_options,
        "tokenizer_name": "default",
        "index_option": "position",
    }

    schema_builder = tantivy.SchemaBuilder()

    # ID (stored and indexed)
    schema_builder.add_text_field("id", **text_options)
    # Content (stored and indexed) Contains entry content
    schema_builder.add_text_field("content", **searchable_text_options)
    # Title (stored and indexed) Contains entry title
    schema_builder.add_text_field("title", **searchable_text_options)
    # JSON (stored) Contains actual entry data
    schema_builder.add_json_field("data", **config["json_fields"])

    # Timestamps
    schema_builder.add_date_field("created_at", **config["date_fields"])
    schema_builder.add_date_field("expires_at", **config["date_fields"])

    # Sorting / Scoring
    schema_builder.add_integer_field("score", **config["numeric_fields"])

    # Facet for Optional filters
    schema_builder.add_facet_field("filters")

    schema = schema_builder.build()

    # Create index in memory (no path means in-memory)
    index = tantivy.Index(schema)

    # Writer options are only passed when the configuration provides them.
    writer_config = {}
    if "writer_heap_size" in config:
        writer_config["heap_size"] = config["writer_heap_size"]
    if "writer_num_threads" in config:
        writer_config["num_threads"] = config["writer_num_threads"]

    index_writer = index.writer(**writer_config)

    # Configure index reader if settings provided
    reader_config = config.get("reader_config", {})
    if reader_config:
        index.config_reader(
            reload_policy=reader_config.get("reload_policy", "commit"),
            num_warmers=reader_config.get("num_warmers", 0),
        )

    return TantivyIndexWrapper(schema=schema, index=index, index_writer=index_writer)
|
149
|
-
|
150
|
-
|
151
|
-
@cached
def extract_content_for_indexing(value: Any) -> str:
    """Flatten a value into a single searchable text string.

    Args:
        value: The value to extract text from.

    Returns:
        The value itself for strings; the space-joined values for dicts
        (nested lists / dicts are JSON-encoded, other values stringified);
        the space-joined stringified items for lists / tuples; `str(value)`
        for anything else.
    """
    if isinstance(value, str):
        return value

    if isinstance(value, dict):
        # Only the values contribute to the text, keys are ignored.
        return " ".join(
            entry
            if isinstance(entry, str)
            else json.encode(entry).decode()
            if isinstance(entry, (list, dict))
            else str(entry)
            for entry in value.values()
        )

    if isinstance(value, (list, tuple)):
        return " ".join(
            element if isinstance(element, str) else str(element)
            for element in value
        )

    return str(value)
|
@@ -1,35 +0,0 @@
|
|
1
|
-
"""hammad.data.configurations
|
2
|
-
|
3
|
-
Contains the `Configuration` class and related functions for parsing configurations
|
4
|
-
from various sources.
|
5
|
-
"""
|
6
|
-
|
7
|
-
from typing import TYPE_CHECKING
|
8
|
-
from ..._internal import create_getattr_importer
|
9
|
-
|
10
|
-
# Imported for static type checkers only; at runtime the names listed in
# `__all__` are resolved through the module-level `__getattr__` below.
if TYPE_CHECKING:
    from .configuration import (
        Configuration,
        read_configuration_from_file,
        read_configuration_from_url,
        read_configuration_from_os_vars,
        read_configuration_from_os_prefix,
        read_configuration_from_dotenv,
    )


# Public names of this package.
__all__ = (
    "Configuration",
    "read_configuration_from_file",
    "read_configuration_from_url",
    "read_configuration_from_os_vars",
    "read_configuration_from_os_prefix",
    "read_configuration_from_dotenv",
)


# Module-level attribute hook built from the public names.
__getattr__ = create_getattr_importer(__all__)


def __dir__() -> list[str]:
    """Return the public names of this module as a list."""
    return [*__all__]
|