haystack-pixeltable 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- haystack_pixeltable/__init__.py +8 -0
- haystack_pixeltable/document_store.py +294 -0
- haystack_pixeltable/retriever.py +117 -0
- haystack_pixeltable-0.1.0.dist-info/METADATA +191 -0
- haystack_pixeltable-0.1.0.dist-info/RECORD +8 -0
- haystack_pixeltable-0.1.0.dist-info/WHEEL +5 -0
- haystack_pixeltable-0.1.0.dist-info/licenses/LICENSE +112 -0
- haystack_pixeltable-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Haystack Document Store and Retriever backed by Pixeltable."""
|
|
2
|
+
|
|
3
|
+
from haystack_pixeltable.document_store import PixeltableDocumentStore
|
|
4
|
+
from haystack_pixeltable.retriever import PixeltableRetriever
|
|
5
|
+
|
|
6
|
+
__all__ = ["PixeltableDocumentStore", "PixeltableRetriever"]
|
|
7
|
+
|
|
8
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
"""Pixeltable Document Store for Haystack.
|
|
2
|
+
|
|
3
|
+
Implements the Haystack ``DocumentStore`` protocol, mapping CRUD operations
|
|
4
|
+
to a Pixeltable table with an optional embedding index for similarity search.
|
|
5
|
+
|
|
6
|
+
Key Pixeltable advantages:
|
|
7
|
+
- ``.table`` property exposes the underlying Pixeltable table for computed
|
|
8
|
+
columns, version history, multimodal joins, and arbitrary predicates.
|
|
9
|
+
- Metadata is stored as a JSON column, supporting Haystack's nested filter spec.
|
|
10
|
+
- Embedding vectors are stored in a dedicated array column with an HNSW index.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
import uuid
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
import numpy as np
|
|
20
|
+
import pixeltable as pxt
|
|
21
|
+
from haystack import Document, default_from_dict, default_to_dict
|
|
22
|
+
from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
|
|
23
|
+
from haystack.document_stores.types import DuplicatePolicy
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
_TEXT_COL = "content"
|
|
28
|
+
_META_COL = "meta"
|
|
29
|
+
_ID_COL = "doc_id"
|
|
30
|
+
_EMBEDDING_COL = "embedding"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class PixeltableDocumentStore:
    """Haystack Document Store backed by a Pixeltable table.

    Stores documents with content, metadata, and optional embedding vectors.
    Supports the four mandatory ``DocumentStore`` protocol methods plus
    serialization via ``to_dict`` / ``from_dict``.

    The backing table is opened (or created) lazily on first use, so
    constructing the store performs no I/O.

    Args:
        table_name: Pixeltable table path (e.g. ``'haystack.docs'``).
        embedding_dimension: Dimension of embedding vectors. Required if
            embeddings will be stored. If ``None``, the embedding column
            and index are not created.
        metric: Distance metric for the embedding index
            (``'cosine'``, ``'ip'``, or ``'l2'``).
    """

    # Haystack comparison operator -> function building the Pixeltable
    # predicate from (column expression, value).
    _COMPARISON_OPS = {
        "==": lambda c, v: c == v,
        "!=": lambda c, v: c != v,
        ">": lambda c, v: c > v,
        ">=": lambda c, v: c >= v,
        "<": lambda c, v: c < v,
        "<=": lambda c, v: c <= v,
        "in": lambda c, v: c.isin(v),
        "not in": lambda c, v: ~c.isin(v),
    }

    def __init__(
        self,
        table_name: str = "haystack.documents",
        *,
        embedding_dimension: int | None = None,
        metric: str = "cosine",
    ):
        self._table_name = table_name
        self._embedding_dimension = embedding_dimension
        self._metric = metric
        # Cached table handle; populated by ``_get_or_create_table``.
        self._table: pxt.Table | None = None

    @property
    def table(self) -> pxt.Table:
        """Direct access to the underlying Pixeltable table.

        Use for Pixeltable-native operations beyond the Haystack interface:
        computed columns, ``.where()`` with arbitrary predicates, version
        history, joins, and multimodal column types.

        Accessing the property opens the table, creating it if needed.
        """
        return self._get_or_create_table()

    def _get_or_create_table(self) -> pxt.Table:
        """Return the cached table handle, opening or creating the table first.

        An existing table is reused as-is. Otherwise the parent directory
        (everything before the last ``.`` in the table name) and the table
        are created, plus an embedding index when ``embedding_dimension``
        is set.
        """
        if self._table is not None:
            return self._table

        try:
            self._table = pxt.get_table(self._table_name)
        except Exception as exc:
            # Best effort: any failure to open the table falls through to
            # creation. Log the reason instead of discarding it so that
            # genuine problems remain diagnosable.
            logger.debug(
                "Could not open Pixeltable table %r (%s); attempting to create it.",
                self._table_name,
                exc,
            )
        else:
            return self._table

        parts = self._table_name.rsplit(".", 1)
        if len(parts) == 2:
            pxt.create_dir(parts[0], if_exists="ignore")

        schema: dict[str, Any] = {
            _ID_COL: pxt.String,
            _TEXT_COL: pxt.String,
            _META_COL: pxt.Json,
        }
        if self._embedding_dimension is not None:
            schema[_EMBEDDING_COL] = pxt.Array[(self._embedding_dimension,), pxt.Float]

        self._table = pxt.create_table(self._table_name, schema, if_exists="ignore")

        if self._embedding_dimension is not None:
            self._table.add_embedding_index(
                _EMBEDDING_COL,
                metric=self._metric,
                if_exists="ignore",
            )
        return self._table

    def count_documents(self) -> int:
        """Return the number of documents stored."""
        return self._get_or_create_table().count()

    def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Document]:
        """Return documents matching the provided filters.

        Supports the Haystack filter specification with ``field``,
        ``operator``, and ``value`` keys, plus compound ``AND`` / ``OR`` /
        ``NOT`` conditions.

        Args:
            filters: Haystack filter dict or ``None`` for all documents.

        Returns:
            List of matching Haystack ``Document`` objects.

        Raises:
            DocumentStoreError: If a filter uses an unsupported field or
                operator.
        """
        t = self._get_or_create_table()

        cols = [getattr(t, _ID_COL), getattr(t, _TEXT_COL), getattr(t, _META_COL)]
        if self._embedding_dimension is not None:
            cols.append(getattr(t, _EMBEDDING_COL))

        # Build the query once: restrict first (if needed), then project.
        predicate = self._build_predicate(filters)
        source = t if predicate is None else t.where(predicate)
        rows = source.select(*cols).collect()
        return [self._row_to_document(row) for row in rows]

    def write_documents(
        self,
        documents: list[Document],
        policy: DuplicatePolicy = DuplicatePolicy.NONE,
    ) -> int:
        """Write documents to the store.

        Documents are written one at a time, so documents preceding a
        duplicate that triggers an error have already been stored when the
        error is raised. When the store has an embedding column, a document
        without an embedding is stored with an all-zero vector.

        Args:
            documents: List of Haystack ``Document`` objects to write.
            policy: How to handle duplicates: ``OVERWRITE`` (replace the
                stored document), ``SKIP`` (keep the stored document),
                ``FAIL`` (raise on duplicate), or ``NONE`` (store default,
                which is to raise on duplicate).

        Returns:
            Number of documents written (skipped duplicates are not counted).

        Raises:
            DuplicateDocumentError: If a document ID already exists and the
                policy is ``FAIL`` or ``NONE``.
        """
        if not documents:
            return 0

        # NONE means "use the store's default", which is documented as
        # raising on duplicates, i.e. the same as FAIL.
        effective_policy = DuplicatePolicy.FAIL if policy == DuplicatePolicy.NONE else policy

        t = self._get_or_create_table()
        id_col = getattr(t, _ID_COL)
        written = 0

        for doc in documents:
            doc_id = doc.id or str(uuid.uuid4())

            existing = t.where(id_col == doc_id).select(id_col).collect()
            if len(existing) > 0:
                if effective_policy == DuplicatePolicy.FAIL:
                    raise DuplicateDocumentError(f"Document with id {doc_id!r} already exists.")
                if effective_policy == DuplicatePolicy.SKIP:
                    continue
                # OVERWRITE: delete then re-insert
                t.delete(where=(id_col == doc_id))

            row: dict[str, Any] = {
                _ID_COL: doc_id,
                _TEXT_COL: doc.content or "",
                _META_COL: doc.meta or {},
            }
            if self._embedding_dimension is not None:
                if doc.embedding is not None:
                    row[_EMBEDDING_COL] = doc.embedding
                else:
                    # Placeholder so that every row has a value in the
                    # embedding column.
                    row[_EMBEDDING_COL] = [0.0] * self._embedding_dimension

            t.insert([row])
            written += 1

        return written

    def delete_documents(self, document_ids: list[str]) -> None:
        """Delete documents by ID.

        Args:
            document_ids: List of document IDs to delete. An empty list is
                a no-op and does not touch the table.
        """
        if not document_ids:
            return
        t = self._get_or_create_table()
        id_col = getattr(t, _ID_COL)
        for doc_id in document_ids:
            t.delete(where=(id_col == doc_id))

    def to_dict(self) -> dict[str, Any]:
        """Serialize this store's constructor arguments to a dictionary."""
        return default_to_dict(
            self,
            table_name=self._table_name,
            embedding_dimension=self._embedding_dimension,
            metric=self._metric,
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> PixeltableDocumentStore:
        """Deserialize a store from a dictionary produced by ``to_dict``."""
        return default_from_dict(cls, data)

    def _row_to_document(self, row: dict[str, Any]) -> Document:
        """Convert one result row into a Haystack ``Document``.

        The embedding is only populated when the store was configured with
        an embedding dimension and the row carries a non-null vector.
        """
        embedding = None
        if self._embedding_dimension is not None and _EMBEDDING_COL in row:
            val = row[_EMBEDDING_COL]
            if val is not None:
                # ``tolist()`` yields plain Python floats, keeping the
                # document JSON-serializable (NumPy scalars are not).
                embedding = np.asarray(val, dtype=np.float32).tolist()

        return Document(
            id=row[_ID_COL],
            content=row[_TEXT_COL],
            meta=row[_META_COL] or {},
            embedding=embedding,
        )

    def _build_predicate(self, filters: dict[str, Any] | None) -> Any | None:
        """Translate Haystack filter spec to a Pixeltable predicate.

        Returns ``None`` when ``filters`` is empty or contains neither a
        ``conditions`` nor a ``field`` key.
        """
        if not filters:
            return None

        t = self._get_or_create_table()

        if "conditions" in filters:
            return self._compound_predicate(t, filters)

        if "field" in filters:
            return self._comparison_predicate(t, filters)

        return None

    def _comparison_predicate(self, t: pxt.Table, f: dict[str, Any]) -> Any:
        """Build the predicate for a single ``field``/``operator``/``value`` filter.

        Supported fields are ``content``, ``id`` and ``meta.<key>``.

        Raises:
            DocumentStoreError: If the field or the operator is unsupported.
        """
        field = f["field"]
        op = f["operator"]
        value = f["value"]

        if field.startswith("meta."):
            # Everything after the "meta." prefix is used as the JSON key.
            col = getattr(t, _META_COL)[field[5:]]
        elif field == "content":
            col = getattr(t, _TEXT_COL)
        elif field == "id":
            col = getattr(t, _ID_COL)
        else:
            raise DocumentStoreError(f"Unsupported filter field: {field!r}")

        if op not in self._COMPARISON_OPS:
            raise DocumentStoreError(f"Unsupported filter operator: {op!r}")

        return self._COMPARISON_OPS[op](col, value)

    def _compound_predicate(self, t: pxt.Table, f: dict[str, Any]) -> Any | None:
        """Build the predicate for an ``AND`` / ``OR`` / ``NOT`` filter.

        Sub-conditions that translate to no predicate are ignored; if none
        remain, ``None`` is returned. ``NOT`` negates the conjunction of all
        its conditions, following the Haystack filter semantics. The
        operator defaults to ``AND`` when missing.

        Raises:
            DocumentStoreError: If the compound operator is unsupported.
        """
        op = f.get("operator", "AND")

        # Translate each condition exactly once.
        preds = []
        for condition in f["conditions"]:
            pred = self._build_predicate(condition)
            if pred is not None:
                preds.append(pred)
        if not preds:
            return None

        if op == "OR":
            combined = preds[0]
            for pred in preds[1:]:
                combined = combined | pred
            return combined

        if op in ("AND", "NOT"):
            combined = preds[0]
            for pred in preds[1:]:
                combined = combined & pred
            return ~combined if op == "NOT" else combined

        raise DocumentStoreError(f"Unsupported compound operator: {op!r}")
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""Pixeltable Retriever for Haystack.
|
|
2
|
+
|
|
3
|
+
A Haystack component that performs embedding-based similarity search on a
|
|
4
|
+
``PixeltableDocumentStore``. Designed to be used in Haystack pipelines
|
|
5
|
+
alongside ``DocumentWriter``, embedders, and generators.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
from haystack import Document, component, default_from_dict, default_to_dict
|
|
14
|
+
|
|
15
|
+
from haystack_pixeltable.document_store import PixeltableDocumentStore
|
|
16
|
+
|
|
17
|
+
_EMBEDDING_COL = "embedding"
|
|
18
|
+
_TEXT_COL = "content"
|
|
19
|
+
_META_COL = "meta"
|
|
20
|
+
_ID_COL = "doc_id"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@component
class PixeltableRetriever:
    """Retriever for ``PixeltableDocumentStore`` using embedding similarity.

    Runs vector similarity search against the Pixeltable embedding index
    and returns the top-k most relevant documents.

    Args:
        document_store: A ``PixeltableDocumentStore`` instance.
        top_k: Maximum number of documents to return.
        filters: Optional default filters applied to every query.
    """

    def __init__(
        self,
        document_store: PixeltableDocumentStore,
        *,
        top_k: int = 10,
        filters: dict[str, Any] | None = None,
    ):
        self._document_store = document_store
        self._top_k = top_k
        self._filters = filters

    @component.output_types(documents=list[Document])
    def run(
        self,
        query_embedding: list[float],
        *,
        top_k: int | None = None,
        filters: dict[str, Any] | None = None,
    ) -> dict[str, list[Document]]:
        """Retrieve documents by embedding similarity.

        Args:
            query_embedding: Query vector to search against.
            top_k: Override the default top_k for this query. ``None`` (or
                ``0``) falls back to the value given at construction.
            filters: Override the default filters for this query. ``None``
                (or an empty dict) falls back to the default filters.

        Returns:
            Dictionary with key ``"documents"`` containing the results,
            ordered by descending similarity. Each document's ``score`` is
            its similarity to the query vector.
        """
        k = top_k or self._top_k
        active_filters = filters or self._filters

        t = self._document_store.table
        embed_col = getattr(t, _EMBEDDING_COL)
        text_col = getattr(t, _TEXT_COL)
        meta_col = getattr(t, _META_COL)
        id_col = getattr(t, _ID_COL)

        query_vec = np.array(query_embedding, dtype=np.float32)
        sim = embed_col.similarity(vector=query_vec)

        # Apply the metadata filter (if any) before ranking by similarity.
        chain = t
        predicate = self._document_store._build_predicate(active_filters)
        if predicate is not None:
            chain = chain.where(predicate)

        rows = (
            chain.order_by(sim, asc=False)
            .limit(k)
            .select(id_col, text_col, meta_col, embed_col, sim=sim)
            .collect()
        )

        documents = []
        for row in rows:
            val = row.get(_EMBEDDING_COL)
            # ``tolist()`` yields plain Python floats, keeping the document
            # JSON-serializable (NumPy scalars are not).
            embedding = None if val is None else np.asarray(val, dtype=np.float32).tolist()

            documents.append(
                Document(
                    id=row[_ID_COL],
                    content=row[_TEXT_COL],
                    meta=row[_META_COL] or {},
                    embedding=embedding,
                    score=float(row["sim"]),
                )
            )

        return {"documents": documents}

    def to_dict(self) -> dict[str, Any]:
        """Serialize this retriever, including its document store, to a dictionary."""
        return default_to_dict(
            self,
            document_store=self._document_store.to_dict(),
            top_k=self._top_k,
            filters=self._filters,
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> PixeltableRetriever:
        """Deserialize a retriever from a dictionary.

        The serialized ``document_store`` entry inside
        ``data["init_parameters"]`` is replaced in place by the
        reconstructed ``PixeltableDocumentStore`` before the retriever
        itself is built.
        """
        data["init_parameters"]["document_store"] = PixeltableDocumentStore.from_dict(
            data["init_parameters"]["document_store"]
        )
        return default_from_dict(cls, data)
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: haystack-pixeltable
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Haystack Document Store and Retriever backed by Pixeltable multimodal data infrastructure.
|
|
5
|
+
Author-email: Pixeltable <contact@pixeltable.com>
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/pixeltable/haystack-pixeltable
|
|
8
|
+
Project-URL: Repository, https://github.com/pixeltable/haystack-pixeltable
|
|
9
|
+
Project-URL: Documentation, https://docs.pixeltable.com/
|
|
10
|
+
Project-URL: Issues, https://github.com/pixeltable/haystack-pixeltable/issues
|
|
11
|
+
Project-URL: Discord, https://discord.gg/QPyqFYx2UN
|
|
12
|
+
Keywords: haystack,pixeltable,document-store,retriever,multimodal,embeddings,rag
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: haystack-ai>=2.6.0
|
|
24
|
+
Requires-Dist: pixeltable>=0.2.28
|
|
25
|
+
Requires-Dist: numpy
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
28
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
|
|
31
|
+
# haystack-pixeltable
|
|
32
|
+
|
|
33
|
+
[PyPI](https://pypi.org/project/haystack-pixeltable/)
|
|
34
|
+
[CI](https://github.com/pixeltable/haystack-pixeltable/actions/workflows/ci.yml)
|
|
35
|
+
[License](LICENSE)
|
|
36
|
+
|
|
37
|
+
Haystack Document Store and Retriever backed by [Pixeltable](https://pixeltable.com/) — persistent, versioned, multimodal data infrastructure for AI applications.
|
|
38
|
+
|
|
39
|
+
## Installation
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install haystack-pixeltable
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Quick Start
|
|
46
|
+
|
|
47
|
+
### Document Store
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from haystack import Document
|
|
51
|
+
from haystack_pixeltable import PixeltableDocumentStore
|
|
52
|
+
|
|
53
|
+
store = PixeltableDocumentStore(
|
|
54
|
+
table_name="myproject.docs",
|
|
55
|
+
embedding_dimension=1536,
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
# Write documents
|
|
59
|
+
store.write_documents([
|
|
60
|
+
Document(content="Pixeltable is multimodal data infrastructure.", embedding=[...]),
|
|
61
|
+
Document(content="Haystack is a framework for building RAG pipelines.", embedding=[...]),
|
|
62
|
+
])
|
|
63
|
+
|
|
64
|
+
# Filter documents
|
|
65
|
+
results = store.filter_documents(
|
|
66
|
+
filters={"field": "meta.category", "operator": "==", "value": "docs"}
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
# Count
|
|
70
|
+
print(store.count_documents())
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Retriever (Similarity Search)
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
from haystack_pixeltable import PixeltableDocumentStore, PixeltableRetriever
|
|
77
|
+
|
|
78
|
+
store = PixeltableDocumentStore(
|
|
79
|
+
table_name="myproject.docs",
|
|
80
|
+
embedding_dimension=1536,
|
|
81
|
+
)
|
|
82
|
+
retriever = PixeltableRetriever(document_store=store, top_k=5)
|
|
83
|
+
|
|
84
|
+
# Search by embedding vector
|
|
85
|
+
result = retriever.run(query_embedding=[0.1, 0.2, ...])
|
|
86
|
+
for doc in result["documents"]:
|
|
87
|
+
print(f"{doc.content} (score: {doc.score:.3f})")
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### In a Haystack Pipeline
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from haystack import Pipeline
|
|
94
|
+
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
|
|
95
|
+
from haystack.components.writers import DocumentWriter
|
|
96
|
+
from haystack_pixeltable import PixeltableDocumentStore, PixeltableRetriever
|
|
97
|
+
|
|
98
|
+
store = PixeltableDocumentStore(
|
|
99
|
+
table_name="rag.knowledge",
|
|
100
|
+
embedding_dimension=384,
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
# Indexing pipeline
|
|
104
|
+
indexing = Pipeline()
|
|
105
|
+
indexing.add_component("embedder", SentenceTransformersDocumentEmbedder())
|
|
106
|
+
indexing.add_component("writer", DocumentWriter(document_store=store))
|
|
107
|
+
indexing.connect("embedder", "writer")
|
|
108
|
+
|
|
109
|
+
# Query pipeline
|
|
110
|
+
query = Pipeline()
|
|
111
|
+
query.add_component("embedder", SentenceTransformersTextEmbedder())
|
|
112
|
+
query.add_component("retriever", PixeltableRetriever(document_store=store, top_k=5))
|
|
113
|
+
query.connect("embedder.embedding", "retriever.query_embedding")
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## Filtering
|
|
117
|
+
|
|
118
|
+
The Document Store supports the [Haystack filter specification](https://docs.haystack.deepset.ai/docs/metadata-filtering):
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
# Simple equality
|
|
122
|
+
store.filter_documents(filters={"field": "meta.category", "operator": "==", "value": "science"})
|
|
123
|
+
|
|
124
|
+
# Comparison operators: ==, !=, >, >=, <, <=
|
|
125
|
+
store.filter_documents(filters={"field": "meta.score", "operator": ">", "value": 0.8})
|
|
126
|
+
|
|
127
|
+
# Compound AND
|
|
128
|
+
store.filter_documents(filters={
|
|
129
|
+
"operator": "AND",
|
|
130
|
+
"conditions": [
|
|
131
|
+
{"field": "meta.category", "operator": "==", "value": "science"},
|
|
132
|
+
{"field": "meta.score", "operator": ">", "value": 0.5},
|
|
133
|
+
],
|
|
134
|
+
})
|
|
135
|
+
|
|
136
|
+
# Compound OR
|
|
137
|
+
store.filter_documents(filters={
|
|
138
|
+
"operator": "OR",
|
|
139
|
+
"conditions": [
|
|
140
|
+
{"field": "meta.source", "operator": "==", "value": "arxiv"},
|
|
141
|
+
{"field": "meta.source", "operator": "==", "value": "pubmed"},
|
|
142
|
+
],
|
|
143
|
+
})
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
## Pixeltable Escape Hatch: `.table`
|
|
147
|
+
|
|
148
|
+
The `.table` property gives direct access to the underlying Pixeltable table for operations beyond the Haystack interface:
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
store = PixeltableDocumentStore(table_name="myproject.docs", embedding_dimension=1536)
|
|
152
|
+
t = store.table
|
|
153
|
+
|
|
154
|
+
# Add a computed column
|
|
155
|
+
import pixeltable.functions.openai as openai
|
|
156
|
+
t.add_computed_column(
|
|
157
|
+
summary=openai.chat_completions(
|
|
158
|
+
messages=[{"role": "user", "content": t.content}],
|
|
159
|
+
model="gpt-4o-mini",
|
|
160
|
+
)
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
# Use arbitrary Pixeltable queries
|
|
164
|
+
results = t.where(t.meta["category"] == "science").select(t.content, t.summary).collect()
|
|
165
|
+
|
|
166
|
+
# Version history
|
|
167
|
+
print(t.count(version=-1)) # row count at previous version
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
## Why Pixeltable?
|
|
171
|
+
|
|
172
|
+
| Feature | Pixeltable | Chroma | Qdrant | pgvector |
|
|
173
|
+
|---------|-----------|--------|--------|----------|
|
|
174
|
+
| Persistent storage | Built-in | Opt-in | Opt-in | Built-in |
|
|
175
|
+
| Computed columns | Native | No | No | No |
|
|
176
|
+
| Version history | Native | No | No | No |
|
|
177
|
+
| Multimodal types | Image, Video, Audio, Document | Text only | Text only | Text only |
|
|
178
|
+
| Metadata filtering | JSON + SQL predicates | Limited | Rich | SQL |
|
|
179
|
+
| Embedding auto-compute | Via computed columns | Manual | Manual | Manual |
|
|
180
|
+
|
|
181
|
+
## Development
|
|
182
|
+
|
|
183
|
+
```bash
|
|
184
|
+
pip install -e ".[dev]"
|
|
185
|
+
pytest tests/ -v
|
|
186
|
+
ruff check . && ruff format --check .
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## License
|
|
190
|
+
|
|
191
|
+
Apache 2.0
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
haystack_pixeltable/__init__.py,sha256=ozoF_MAB4W7M_-0lkqje5CnYveETF2fzFKQc2r8xwio,285
|
|
2
|
+
haystack_pixeltable/document_store.py,sha256=lfNQBHnZaFuLmhKhcPEheKRvvAstOGJlfetXOigBAtU,9818
|
|
3
|
+
haystack_pixeltable/retriever.py,sha256=U8N8miK0zWMF-i1TOKjlLRp0Kap2Jf_8p91kLHk5Q5g,3749
|
|
4
|
+
haystack_pixeltable-0.1.0.dist-info/licenses/LICENSE,sha256=Lv1r3wd8fcY9xjdHLrJDE2yt7_JzPko2E-F6HTK_c1g,5346
|
|
5
|
+
haystack_pixeltable-0.1.0.dist-info/METADATA,sha256=lyWvM85YzDUhW0ey7HXxw8dW0QkSE_adGavmROogES0,6165
|
|
6
|
+
haystack_pixeltable-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
7
|
+
haystack_pixeltable-0.1.0.dist-info/top_level.txt,sha256=dh7ri7i02q4wye4hDD25fEFi38zpgCbCOOlerrEotVI,20
|
|
8
|
+
haystack_pixeltable-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
Apache License
|
|
2
|
+
Version 2.0, January 2004
|
|
3
|
+
http://www.apache.org/licenses/
|
|
4
|
+
|
|
5
|
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
|
6
|
+
|
|
7
|
+
1. Definitions.
|
|
8
|
+
|
|
9
|
+
"License" shall mean the terms and conditions for use, reproduction,
|
|
10
|
+
and distribution as defined by Sections 1 through 9 of this document.
|
|
11
|
+
|
|
12
|
+
"Licensor" shall mean the copyright owner or entity authorized by
|
|
13
|
+
the copyright owner that is granting the License.
|
|
14
|
+
|
|
15
|
+
"Legal Entity" shall mean the union of the acting entity and all
|
|
16
|
+
other entities that control, are controlled by, or are under common
|
|
17
|
+
control with that entity. For the purposes of this definition,
|
|
18
|
+
"control" means (i) the power, direct or indirect, to cause the
|
|
19
|
+
direction or management of such entity, whether by contract or
|
|
20
|
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
|
21
|
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
|
22
|
+
|
|
23
|
+
"You" (or "Your") shall mean an individual or Legal Entity
|
|
24
|
+
exercising permissions granted by this License.
|
|
25
|
+
|
|
26
|
+
"Source" form shall mean the preferred form for making modifications,
|
|
27
|
+
including but not limited to software source code, documentation
|
|
28
|
+
source, and configuration files.
|
|
29
|
+
|
|
30
|
+
"Object" form shall mean any form resulting from mechanical
|
|
31
|
+
transformation or translation of a Source form, including but
|
|
32
|
+
not limited to compiled object code, generated documentation,
|
|
33
|
+
and conversions to other media types.
|
|
34
|
+
|
|
35
|
+
"Work" shall mean the work of authorship, whether in Source or
|
|
36
|
+
Object form, made available under the License, as indicated by a
|
|
37
|
+
copyright notice that is included in or attached to the work.
|
|
38
|
+
|
|
39
|
+
"Derivative Works" shall mean any work, whether in Source or Object
|
|
40
|
+
form, that is based on (or derived from) the Work and for which the
|
|
41
|
+
editorial revisions, annotations, elaborations, or other modifications
|
|
42
|
+
represent, as a whole, an original work of authorship.
|
|
43
|
+
|
|
44
|
+
"Contribution" shall mean any work of authorship, including
|
|
45
|
+
the original version of the Work and any modifications or additions
|
|
46
|
+
to that Work, that is intentionally submitted to the Licensor for
|
|
47
|
+
inclusion in the Work by the copyright owner or by an individual or
|
|
48
|
+
Legal Entity authorized to submit on behalf of the copyright owner.
|
|
49
|
+
|
|
50
|
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
|
51
|
+
on behalf of whom a Contribution has been received by the Licensor and
|
|
52
|
+
subsequently incorporated within the Work.
|
|
53
|
+
|
|
54
|
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
|
55
|
+
this License, each Contributor hereby grants to You a perpetual,
|
|
56
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
57
|
+
copyright license to reproduce, prepare Derivative Works of,
|
|
58
|
+
publicly display, publicly perform, sublicense, and distribute the
|
|
59
|
+
Work and such Derivative Works in Source or Object form.
|
|
60
|
+
|
|
61
|
+
3. Grant of Patent License. Subject to the terms and conditions of
|
|
62
|
+
this License, each Contributor hereby grants to You a perpetual,
|
|
63
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
64
|
+
(except as stated in this section) patent license to make, have made,
|
|
65
|
+
use, offer to sell, sell, import, and otherwise transfer the Work.
|
|
66
|
+
|
|
67
|
+
4. Redistribution. You may reproduce and distribute copies of the
|
|
68
|
+
Work or Derivative Works thereof in any medium, with or without
|
|
69
|
+
modifications, and in Source or Object form, provided that You
|
|
70
|
+
meet the following conditions:
|
|
71
|
+
|
|
72
|
+
(a) You must give any other recipients of the Work or
|
|
73
|
+
Derivative Works a copy of this License; and
|
|
74
|
+
|
|
75
|
+
(b) You must cause any modified files to carry prominent notices
|
|
76
|
+
stating that You changed the files; and
|
|
77
|
+
|
|
78
|
+
(c) You must retain, in the Source form of any Derivative Works
|
|
79
|
+
that You distribute, all copyright, patent, trademark, and
|
|
80
|
+
attribution notices from the Source form of the Work,
|
|
81
|
+
excluding those notices that do not pertain to any part of
|
|
82
|
+
the Derivative Works; and
|
|
83
|
+
|
|
84
|
+
(d) If the Work includes a "NOTICE" text file as part of its
|
|
85
|
+
distribution, then any Derivative Works that You distribute must
|
|
86
|
+
include a readable copy of the attribution notices contained
|
|
87
|
+
within such NOTICE file.
|
|
88
|
+
|
|
89
|
+
5. Submission of Contributions.
|
|
90
|
+
|
|
91
|
+
6. Trademarks. This License does not grant permission to use the trade
|
|
92
|
+
names, trademarks, service marks, or product names of the Licensor.
|
|
93
|
+
|
|
94
|
+
7. Disclaimer of Warranty.
|
|
95
|
+
|
|
96
|
+
8. Limitation of Liability.
|
|
97
|
+
|
|
98
|
+
9. Accepting Warranty or Additional Liability.
|
|
99
|
+
|
|
100
|
+
Copyright 2024-2026 Pixeltable, Inc.
|
|
101
|
+
|
|
102
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
103
|
+
you may not use this file except in compliance with the License.
|
|
104
|
+
You may obtain a copy of the License at
|
|
105
|
+
|
|
106
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
107
|
+
|
|
108
|
+
Unless required by applicable law or agreed to in writing, software
|
|
109
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
110
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
111
|
+
See the License for the specific language governing permissions and
|
|
112
|
+
limitations under the License.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
haystack_pixeltable
|