sqlitesearch 0.0.1__tar.gz → 0.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sqlitesearch-0.0.1 → sqlitesearch-0.0.2}/PKG-INFO +1 -1
- {sqlitesearch-0.0.1 → sqlitesearch-0.0.2}/pyproject.toml +2 -0
- sqlitesearch-0.0.2/sqlitesearch/__version__.py +1 -0
- sqlitesearch-0.0.2/sqlitesearch/operators.py +32 -0
- {sqlitesearch-0.0.1 → sqlitesearch-0.0.2}/sqlitesearch/text/fts.py +191 -10
- {sqlitesearch-0.0.1 → sqlitesearch-0.0.2}/sqlitesearch/vector/lsh.py +224 -22
- sqlitesearch-0.0.2/tests/test_range_filters.py +458 -0
- sqlitesearch-0.0.2/tests/test_vector_range_filters.py +333 -0
- {sqlitesearch-0.0.1 → sqlitesearch-0.0.2}/tests/test_vector_search.py +2 -2
- {sqlitesearch-0.0.1 → sqlitesearch-0.0.2}/uv.lock +159 -31
- sqlitesearch-0.0.1/sqlitesearch/__version__.py +0 -1
- {sqlitesearch-0.0.1 → sqlitesearch-0.0.2}/.github/workflows/tests.yml +0 -0
- {sqlitesearch-0.0.1 → sqlitesearch-0.0.2}/.gitignore +0 -0
- {sqlitesearch-0.0.1 → sqlitesearch-0.0.2}/CLAUDE.md +0 -0
- {sqlitesearch-0.0.1 → sqlitesearch-0.0.2}/Makefile +0 -0
- {sqlitesearch-0.0.1 → sqlitesearch-0.0.2}/README.md +0 -0
- {sqlitesearch-0.0.1 → sqlitesearch-0.0.2}/notebooks/faq_search.ipynb +0 -0
- {sqlitesearch-0.0.1 → sqlitesearch-0.0.2}/notebooks/faq_search.py +0 -0
- {sqlitesearch-0.0.1 → sqlitesearch-0.0.2}/plan.md +0 -0
- {sqlitesearch-0.0.1 → sqlitesearch-0.0.2}/sqlitesearch/__init__.py +0 -0
- {sqlitesearch-0.0.1 → sqlitesearch-0.0.2}/sqlitesearch/text/__init__.py +0 -0
- {sqlitesearch-0.0.1 → sqlitesearch-0.0.2}/sqlitesearch/vector/__init__.py +0 -0
- {sqlitesearch-0.0.1 → sqlitesearch-0.0.2}/tests/__init__.py +0 -0
- {sqlitesearch-0.0.1 → sqlitesearch-0.0.2}/tests/test_integration.py +0 -0
- {sqlitesearch-0.0.1 → sqlitesearch-0.0.2}/tests/test_performance.py +0 -0
- {sqlitesearch-0.0.1 → sqlitesearch-0.0.2}/tests/test_text_search.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Version string of the sqlitesearch package for this release.
__version__ = "0.0.2"
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Operators for range filtering in sqlitesearch.
|
|
3
|
+
|
|
4
|
+
This module provides operator functions used for numeric and date range filtering.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
# Comparison operators recognised in range filters.
# Keys are the operator symbols; each value is a two-argument callable that
# applies the matching Python comparison to (lhs, rhs) and returns the result.
OPERATORS = {
    ">=": lambda lhs, rhs: lhs >= rhs,
    ">": lambda lhs, rhs: lhs > rhs,
    "<=": lambda lhs, rhs: lhs <= rhs,
    "<": lambda lhs, rhs: lhs < rhs,
    "==": lambda lhs, rhs: lhs == rhs,
    "!=": lambda lhs, rhs: lhs != rhs,
}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def is_range_filter(value: object) -> bool:
    """
    Tell whether a filter value has the shape of a range filter.

    A range filter is a list whose every element is a 2-tuple, e.g.
    ``[('>=', 100), ('<', 200)]``. Only the container shape is inspected:
    the contents of each tuple are not validated here.

    Args:
        value: The filter value to inspect.

    Returns:
        True if value is a list and each of its items is a tuple of length 2
        (an empty list therefore also qualifies); False otherwise.
    """
    if not isinstance(value, list):
        return False

    for item in value:
        # Any element that is not a 2-tuple disqualifies the whole list.
        if not isinstance(item, tuple) or len(item) != 2:
            return False
    return True
|
|
@@ -9,8 +9,11 @@ import json
|
|
|
9
9
|
import re
|
|
10
10
|
import sqlite3
|
|
11
11
|
import threading
|
|
12
|
+
from datetime import date, datetime
|
|
12
13
|
from typing import Any, Optional
|
|
13
14
|
|
|
15
|
+
from sqlitesearch.operators import OPERATORS, is_range_filter
|
|
16
|
+
|
|
14
17
|
|
|
15
18
|
class TextSearchIndex:
|
|
16
19
|
"""
|
|
@@ -20,7 +23,7 @@ class TextSearchIndex:
|
|
|
20
23
|
full-text search with BM25 ranking.
|
|
21
24
|
|
|
22
25
|
API matches minsearch.Index for easy migration:
|
|
23
|
-
- __init__(text_fields, keyword_fields=None, id_field=None)
|
|
26
|
+
- __init__(text_fields, keyword_fields=None, numeric_fields=None, date_fields=None, id_field=None)
|
|
24
27
|
- fit(docs) - Index documents (only if index is empty)
|
|
25
28
|
- add(doc) - Add a single document to existing index
|
|
26
29
|
- search(query, filter_dict=None, boost_dict=None, num_results=10, output_ids=False)
|
|
@@ -29,17 +32,21 @@ class TextSearchIndex:
|
|
|
29
32
|
>>> index = TextSearchIndex(
|
|
30
33
|
... text_fields=["title", "description"],
|
|
31
34
|
... keyword_fields=["category"],
|
|
35
|
+
... numeric_fields=["price", "rating"],
|
|
36
|
+
... date_fields=["created_at"],
|
|
32
37
|
... id_field="id",
|
|
33
38
|
... db_path="search.db"
|
|
34
39
|
... )
|
|
35
|
-
>>> index.fit([{"id": 1, "title": "Hello", "description": "World"}])
|
|
36
|
-
>>> results = index.search("hello
|
|
40
|
+
>>> index.fit([{"id": 1, "title": "Hello", "description": "World", "price": 100}])
|
|
41
|
+
>>> results = index.search("hello", filter_dict={"price": [('>=', 50), ('<', 200)]})
|
|
37
42
|
"""
|
|
38
43
|
|
|
39
44
|
def __init__(
|
|
40
45
|
self,
|
|
41
46
|
text_fields: list[str],
|
|
42
47
|
keyword_fields: Optional[list[str]] = None,
|
|
48
|
+
numeric_fields: Optional[list[str]] = None,
|
|
49
|
+
date_fields: Optional[list[str]] = None,
|
|
43
50
|
id_field: Optional[str] = None,
|
|
44
51
|
db_path: str = "sqlitesearch.db",
|
|
45
52
|
stemming: bool = False,
|
|
@@ -50,12 +57,16 @@ class TextSearchIndex:
|
|
|
50
57
|
Args:
|
|
51
58
|
text_fields: List of field names to index with FTS5.
|
|
52
59
|
keyword_fields: List of field names for exact filtering (not full-text searched).
|
|
60
|
+
numeric_fields: List of field names for numeric range filtering.
|
|
61
|
+
date_fields: List of field names for date range filtering.
|
|
53
62
|
id_field: Field name to use as document ID. If None, auto-generates IDs.
|
|
54
63
|
db_path: Path to the SQLite database file.
|
|
55
64
|
stemming: If True, use Porter stemmer for better matching (e.g., "running" matches "run").
|
|
56
65
|
"""
|
|
57
66
|
self.text_fields = text_fields
|
|
58
67
|
self.keyword_fields = list(keyword_fields) if keyword_fields is not None else []
|
|
68
|
+
self.numeric_fields = list(numeric_fields) if numeric_fields is not None else []
|
|
69
|
+
self.date_fields = list(date_fields) if date_fields is not None else []
|
|
59
70
|
self.id_field = id_field
|
|
60
71
|
self.db_path = db_path
|
|
61
72
|
self.stemming = stemming
|
|
@@ -85,11 +96,23 @@ class TextSearchIndex:
|
|
|
85
96
|
keyword_cols.append(f', "{field}" TEXT')
|
|
86
97
|
keyword_sql = "\n".join(keyword_cols)
|
|
87
98
|
|
|
99
|
+
# Build numeric column definitions
|
|
100
|
+
numeric_cols = []
|
|
101
|
+
for field in self.numeric_fields:
|
|
102
|
+
numeric_cols.append(f', "{field}" REAL')
|
|
103
|
+
numeric_sql = "\n".join(numeric_cols)
|
|
104
|
+
|
|
105
|
+
# Build date column definitions (store as ISO 8601 strings for comparison)
|
|
106
|
+
date_cols = []
|
|
107
|
+
for field in self.date_fields:
|
|
108
|
+
date_cols.append(f', "{field}" TEXT')
|
|
109
|
+
date_sql = "\n".join(date_cols)
|
|
110
|
+
|
|
88
111
|
# Create main documents table
|
|
89
112
|
cursor.execute(f"""
|
|
90
113
|
CREATE TABLE IF NOT EXISTS docs (
|
|
91
114
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
92
|
-
doc_json TEXT NOT NULL{keyword_sql}
|
|
115
|
+
doc_json TEXT NOT NULL{keyword_sql}{numeric_sql}{date_sql}
|
|
93
116
|
)
|
|
94
117
|
""")
|
|
95
118
|
|
|
@@ -114,6 +137,14 @@ class TextSearchIndex:
|
|
|
114
137
|
for field in self.keyword_fields:
|
|
115
138
|
cursor.execute(f'CREATE INDEX IF NOT EXISTS idx_{field} ON docs ("{field}")')
|
|
116
139
|
|
|
140
|
+
# Create indexes on numeric fields for faster filtering
|
|
141
|
+
for field in self.numeric_fields:
|
|
142
|
+
cursor.execute(f'CREATE INDEX IF NOT EXISTS idx_num_{field} ON docs ("{field}")')
|
|
143
|
+
|
|
144
|
+
# Create indexes on date fields for faster filtering
|
|
145
|
+
for field in self.date_fields:
|
|
146
|
+
cursor.execute(f'CREATE INDEX IF NOT EXISTS idx_date_{field} ON docs ("{field}")')
|
|
147
|
+
|
|
117
148
|
conn.commit()
|
|
118
149
|
|
|
119
150
|
def _is_empty(self) -> bool:
|
|
@@ -164,19 +195,44 @@ class TextSearchIndex:
|
|
|
164
195
|
conn = self._get_conn()
|
|
165
196
|
cursor = conn.cursor()
|
|
166
197
|
|
|
167
|
-
# Build column lists
|
|
168
|
-
|
|
198
|
+
# Build column lists including keyword, numeric, and date fields
|
|
199
|
+
filter_cols = (
|
|
200
|
+
[f'"{field}"' for field in self.keyword_fields] +
|
|
201
|
+
[f'"{field}"' for field in self.numeric_fields] +
|
|
202
|
+
[f'"{field}"' for field in self.date_fields]
|
|
203
|
+
)
|
|
204
|
+
all_cols = ["doc_json"] + filter_cols
|
|
169
205
|
col_names = ", ".join(all_cols)
|
|
170
206
|
placeholders = ", ".join(["?"] * len(all_cols))
|
|
171
207
|
|
|
172
208
|
for doc in docs:
|
|
173
|
-
|
|
209
|
+
# Convert date/datetime objects to ISO format for JSON serialization
|
|
210
|
+
doc_for_json = {}
|
|
211
|
+
for key, value in doc.items():
|
|
212
|
+
if isinstance(value, (date, datetime)):
|
|
213
|
+
doc_for_json[key] = value.isoformat()
|
|
214
|
+
else:
|
|
215
|
+
doc_for_json[key] = value
|
|
216
|
+
doc_json = json.dumps(doc_for_json)
|
|
217
|
+
|
|
174
218
|
keyword_vals = [doc.get(field) for field in self.keyword_fields]
|
|
175
219
|
|
|
220
|
+
# Extract numeric values
|
|
221
|
+
numeric_vals = [doc.get(field) for field in self.numeric_fields]
|
|
222
|
+
|
|
223
|
+
# Extract date values and convert to ISO format
|
|
224
|
+
date_vals = []
|
|
225
|
+
for field in self.date_fields:
|
|
226
|
+
value = doc.get(field)
|
|
227
|
+
if isinstance(value, (date, datetime)):
|
|
228
|
+
date_vals.append(value.isoformat())
|
|
229
|
+
else:
|
|
230
|
+
date_vals.append(value)
|
|
231
|
+
|
|
176
232
|
# Insert into main table
|
|
177
233
|
cursor.execute(
|
|
178
234
|
f"INSERT INTO docs ({col_names}) VALUES ({placeholders})",
|
|
179
|
-
[doc_json] + keyword_vals
|
|
235
|
+
[doc_json] + keyword_vals + numeric_vals + date_vals
|
|
180
236
|
)
|
|
181
237
|
doc_id = cursor.lastrowid
|
|
182
238
|
|
|
@@ -221,7 +277,12 @@ class TextSearchIndex:
|
|
|
221
277
|
|
|
222
278
|
Args:
|
|
223
279
|
query: The search query string. Supports FTS5 query syntax.
|
|
224
|
-
filter_dict: Dictionary of
|
|
280
|
+
filter_dict: Dictionary of filters. Can include:
|
|
281
|
+
- Keyword fields: {"field": "value"} for exact match
|
|
282
|
+
- Numeric fields: {"field": [('>=', 100), ('<', 200)]} for range filters
|
|
283
|
+
- Numeric fields: {"field": 100} for exact match
|
|
284
|
+
- Date fields: {"field": [('>=', date(...)), ('<', date(...))]} for range filters
|
|
285
|
+
- Any field: {"field": None} for null/missing values
|
|
225
286
|
boost_dict: Dictionary of boost scores for text fields.
|
|
226
287
|
num_results: Maximum number of results to return.
|
|
227
288
|
output_ids: If True, adds an 'id' field with the document ID.
|
|
@@ -244,18 +305,31 @@ class TextSearchIndex:
|
|
|
244
305
|
# Build FTS5 query with boosts
|
|
245
306
|
fts_query = self._build_fts_query(query, boost_dict)
|
|
246
307
|
|
|
247
|
-
# Build WHERE clause for keyword
|
|
308
|
+
# Build WHERE clause for filters (keyword, numeric, date)
|
|
248
309
|
where_clauses = []
|
|
249
310
|
where_params = []
|
|
250
311
|
|
|
251
312
|
for field, value in filter_dict.items():
|
|
252
313
|
if field in self.keyword_fields:
|
|
314
|
+
# Keyword field filters (exact match)
|
|
253
315
|
if value is None:
|
|
254
316
|
where_clauses.append(f'd."{field}" IS NULL')
|
|
255
317
|
else:
|
|
256
318
|
where_clauses.append(f'd."{field}" = ?')
|
|
257
319
|
where_params.append(value)
|
|
258
320
|
|
|
321
|
+
elif field in self.numeric_fields:
|
|
322
|
+
# Numeric field filters (exact match or range)
|
|
323
|
+
where_clauses, where_params = self._add_numeric_filter(
|
|
324
|
+
where_clauses, where_params, field, value
|
|
325
|
+
)
|
|
326
|
+
|
|
327
|
+
elif field in self.date_fields:
|
|
328
|
+
# Date field filters (exact match or range)
|
|
329
|
+
where_clauses, where_params = self._add_date_filter(
|
|
330
|
+
where_clauses, where_params, field, value
|
|
331
|
+
)
|
|
332
|
+
|
|
259
333
|
where_sql = " AND " + " AND ".join(where_clauses) if where_clauses else ""
|
|
260
334
|
|
|
261
335
|
# Execute search query - simpler without content table
|
|
@@ -277,6 +351,8 @@ class TextSearchIndex:
|
|
|
277
351
|
results = []
|
|
278
352
|
for row in rows:
|
|
279
353
|
doc = json.loads(row["doc_json"])
|
|
354
|
+
# Convert ISO date strings back to date/datetime objects
|
|
355
|
+
doc = self._convert_dates(doc)
|
|
280
356
|
if output_ids:
|
|
281
357
|
# Use id_field value if available, otherwise use database id
|
|
282
358
|
if self.id_field:
|
|
@@ -291,6 +367,111 @@ class TextSearchIndex:
|
|
|
291
367
|
|
|
292
368
|
return results
|
|
293
369
|
|
|
370
|
+
def _convert_dates(self, doc: dict[str, Any]) -> dict[str, Any]:
    """
    Turn ISO-formatted strings in the configured date fields back into objects.

    For every name in ``self.date_fields`` whose value in ``doc`` is a string,
    the string is parsed with ``datetime.fromisoformat`` when it contains a
    ``'T'`` or a space (i.e. it appears to carry a time component) and with
    ``date.fromisoformat`` otherwise. Strings that fail to parse are left
    untouched, as are missing fields, ``None`` and non-string values.

    Args:
        doc: Document whose date fields may hold ISO formatted strings.
            It is modified in place.

    Returns:
        The same ``doc`` object, with parseable date fields replaced by
        ``date``/``datetime`` instances.
    """
    for name in self.date_fields or ():
        raw = doc.get(name)
        if not isinstance(raw, str):
            # Covers absent keys, None and values that are already non-strings.
            continue

        # A 'T' or space separator signals a datetime rather than a plain date.
        looks_like_datetime = 'T' in raw or ' ' in raw
        parse = datetime.fromisoformat if looks_like_datetime else date.fromisoformat

        try:
            doc[name] = parse(raw)
        except ValueError:
            # Best effort: keep the original string when it is not valid ISO.
            pass

    return doc
|
|
403
|
+
|
|
404
|
+
def _add_numeric_filter(
    self,
    where_clauses: list[str],
    where_params: list[Any],
    field: str,
    value: Any,
) -> tuple[list[str], list[Any]]:
    """
    Append the SQL condition(s) for one numeric field filter.

    Three filter shapes are handled:
        - ``None``: matches rows where the column IS NULL.
        - Range filter, e.g. ``[('>=', 100), ('<', 200)]``: one condition per
          (operator, bound) pair. Pairs whose operator is not a key of
          OPERATORS, or whose bound is None, are skipped.
        - Anything else: equality against the given value.

    Args:
        where_clauses: SQL condition fragments collected so far (appended to in place).
        where_params: Bound parameters collected so far (appended to in place).
        field: Name of the numeric column on the ``d`` table alias.
        value: The filter value for this field.

    Returns:
        Tuple of (where_clauses, where_params), the same list objects passed in.
    """
    if value is None:
        where_clauses.append(f'd."{field}" IS NULL')
        return where_clauses, where_params

    if not is_range_filter(value):
        # Plain value: exact match.
        where_clauses.append(f'd."{field}" = ?')
        where_params.append(value)
        return where_clauses, where_params

    for symbol, bound in value:
        # Only whitelisted operator symbols are ever interpolated into SQL.
        if symbol not in OPERATORS or bound is None:
            continue
        where_clauses.append(f'd."{field}" {symbol} ?')
        where_params.append(bound)

    return where_clauses, where_params
|
|
436
|
+
|
|
437
|
+
def _add_date_filter(
    self,
    where_clauses: list[str],
    where_params: list[Any],
    field: str,
    value: Any,
) -> tuple[list[str], list[Any]]:
    """
    Append the SQL condition(s) for one date field filter.

    Three filter shapes are handled:
        - ``None``: matches rows where the column IS NULL.
        - Range filter, e.g. ``[('>=', date(...)), ('<', date(...))]``: one
          condition per (operator, bound) pair. Pairs whose operator is not a
          key of OPERATORS, or whose bound is None, are skipped.
        - Anything else: equality, e.g. ``date(...)`` or ``"2024-01-15"``.

    ``date``/``datetime`` bounds are converted with ``isoformat()`` before
    being bound as parameters; other values are passed through unchanged.

    Args:
        where_clauses: SQL condition fragments collected so far (appended to in place).
        where_params: Bound parameters collected so far (appended to in place).
        field: Name of the date column on the ``d`` table alias.
        value: The filter value for this field.

    Returns:
        Tuple of (where_clauses, where_params), the same list objects passed in.
    """
    if value is None:
        where_clauses.append(f'd."{field}" IS NULL')
        return where_clauses, where_params

    if not is_range_filter(value):
        # Plain value: exact match, comparing against the ISO string form.
        exact = value.isoformat() if isinstance(value, (date, datetime)) else value
        where_clauses.append(f'd."{field}" = ?')
        where_params.append(exact)
        return where_clauses, where_params

    for symbol, bound in value:
        # Only whitelisted operator symbols are ever interpolated into SQL.
        if symbol not in OPERATORS or bound is None:
            continue
        if isinstance(bound, (date, datetime)):
            bound = bound.isoformat()
        where_clauses.append(f'd."{field}" {symbol} ?')
        where_params.append(bound)

    return where_clauses, where_params
|
|
474
|
+
|
|
294
475
|
def _build_fts_query(self, query: str, boost_dict: dict[str, float]) -> str:
|
|
295
476
|
"""
|
|
296
477
|
Build an FTS5 query with boost weights.
|