hyperstreamdb 0.1.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,468 @@
1
+ from .hyperstreamdb import *
2
+ from .hyperstreamdb import Table as _RustTable
3
+ from .embeddings import registry, EmbeddingFunction
4
+ import pandas as pd
5
+ try:
6
+ import pyarrow as pa
7
+ except ImportError:
8
+ pa = None
9
+ try:
10
+ import polars as pl
11
+ except ImportError:
12
+ pl = None
13
+ from typing import List, Optional, Union, Dict, Any
14
+ import os
15
+
16
+ def _resolve_uri(uri: str) -> str:
17
+ """Resolve a URI to an absolute path if it's a local relative path."""
18
+ if "://" not in uri and not uri.startswith("/"):
19
+ return os.path.abspath(uri)
20
+ return uri
21
+
22
class Query:
    """
    Fluent Query interface for HyperStreamDB.

    A ``Query`` only accumulates state (scalar filter, vector-search
    parameters, column projection). Nothing is sent to the table until one of
    ``to_pandas`` / ``to_arrow`` / ``execute`` is called, and executing a
    query does not alter the accumulated state, so it can be run repeatedly.
    """

    def __init__(self, table, filter_expr: Optional[str] = None):
        """
        Args:
            table: Object exposing ``to_pandas`` and ``to_arrow`` methods that
                accept ``filter``, ``vector_filter``, ``columns`` and
                ``context`` keyword arguments.
            filter_expr: Optional initial filter expression.
        """
        self._table = table
        self._filter = filter_expr
        self._vector_filter = None
        self._columns = None

    def filter(self, expr: str) -> 'Query':
        """AND a SQL-like filter expression onto the current filter.

        An empty or ``None`` expression is ignored so that it cannot produce
        a malformed clause such as ``(a > 1) AND ()``.
        """
        if not expr:
            return self
        if self._filter:
            self._filter = f"({self._filter}) AND ({expr})"
        else:
            self._filter = expr
        return self

    def vector_search(self, query: Union[List[float], str], column: Optional[str] = None, k: int = 10, **kwargs) -> 'Query':
        """
        Apply a vector search filter (replaces any previous one).

        Args:
            query: The query vector (list of floats) or a string to be vectorized.
            column: The vector column to search against.
            k: Number of nearest neighbors to return.
            **kwargs: Additional parameters (e.g., n_probe), stored alongside
                ``column``/``query``/``k`` in the vector filter dict.
        """
        self._vector_filter = {"column": column, "query": query, "k": k, **kwargs}
        return self

    def select(self, columns: List[str]) -> 'Query':
        """Select specific columns to return.

        A single column name given as a plain string is wrapped in a list;
        ``None`` clears the projection. The list is copied so later changes
        to the caller's list do not leak into the query.
        """
        if columns is None:
            self._columns = None
        elif isinstance(columns, str):
            self._columns = [columns]
        else:
            self._columns = list(columns)
        return self

    def _request(self, context: Optional[Any]) -> Dict[str, Any]:
        """Build the keyword arguments passed to the table's read methods.

        The vector filter and column list are shallow-copied: the table may
        rewrite the dict it receives (filling defaults, vectorizing a text
        query), and that must not change this builder's state.
        """
        vector_filter = None if self._vector_filter is None else dict(self._vector_filter)
        columns = None if self._columns is None else list(self._columns)
        return {
            "filter": self._filter,
            "vector_filter": vector_filter,
            "columns": columns,
            "context": context,
        }

    def to_pandas(self, context: Optional[Any] = None):
        """Execute the query and return whatever the table's ``to_pandas`` returns."""
        return self._table.to_pandas(**self._request(context))

    def to_arrow(self, context: Optional[Any] = None):
        """Execute the query and return whatever the table's ``to_arrow`` returns."""
        return self._table.to_arrow(**self._request(context))

    def execute(self, context: Optional[Any] = None):
        """Execute the query (alias for ``to_pandas``)."""
        return self.to_pandas(context)
84
+
85
class Table:
    """
    Enhanced HyperStreamDB Table with auto-vectorization and embedding registry support.

    The class wraps the native table object (kept in ``self._inner``), adds
    embedding-aware write and search helpers in Python, and forwards every
    attribute it does not define itself to the native object.
    """

    # Modes understood by ``write``.
    _WRITE_MODES = ("append", "overwrite")
    # Vector column assumed when none is given and no embedding is configured.
    _DEFAULT_VECTOR_COLUMN = "embedding"

    def __init__(self, uri: str, inner_table: Optional["_RustTable"] = None, context: Optional[Any] = None, index_all: bool = True, primary_key: Optional[Union[str, List[str]]] = None, explain: bool = False):
        """
        Open the table at ``uri`` or wrap an already constructed native table.

        Args:
            uri: Table location; relative local paths are made absolute.
            inner_table: Existing native table to wrap instead of opening one.
            context: Optional compute context forwarded to the native constructor.
            index_all: Passed to ``set_index_all`` on a newly opened table.
            primary_key: Column name or list of names passed to
                ``set_primary_key`` on a newly opened table.
            explain: When true, read methods print the native explain output.
        """
        # Plain attributes first, so the object is usable for repr/debugging
        # even if opening the native table raises.
        self._embedding_configs = {}
        self.explain = explain
        uri = _resolve_uri(uri)
        # Identity check: a wrapped native object must never be discarded
        # just because it happens to evaluate as falsy.
        if inner_table is not None:
            self._inner = inner_table
        else:
            self._inner = _RustTable(uri, context=context)
            # NOTE(review): the reviewed text had lost its indentation, so it
            # is not certain these settings belonged to this branch only;
            # confirm against the upstream source.
            self._inner.set_index_all(index_all)
            if primary_key:
                self._inner.set_primary_key(self._as_key_list(primary_key))

    @staticmethod
    def _as_key_list(columns: Union[str, List[str]]) -> List[str]:
        """Return key column(s) as a list, wrapping a single name."""
        return [columns] if isinstance(columns, str) else list(columns)

    @classmethod
    def create(cls, uri: str, schema, context: Optional[Any] = None) -> 'Table':
        """Create a new table with an explicit schema."""
        uri = _resolve_uri(uri)
        return cls(uri, inner_table=_RustTable.create(uri, schema, context=context))

    @classmethod
    def register_external(cls, uri: str, iceberg_metadata_uri: str, context: Optional[Any] = None) -> 'Table':
        """Register an existing Iceberg table."""
        uri = _resolve_uri(uri)
        return cls(uri, inner_table=_RustTable.register_external(uri, iceberg_metadata_uri, context=context))

    def define_embedding(self, column: str, function: Union[str, "EmbeddingFunction"], vector_column: Optional[str] = None):
        """
        Link a source column to an embedding function for automatic vectorization.

        Args:
            column: The source text column.
            function: Registered function name or EmbeddingFunction instance.
            vector_column: Target vector column name (defaults to {column}_vector).
        """
        self._embedding_configs[column] = {
            "function": function,
            "vector_column": vector_column or f"{column}_vector",
        }

    def write(self, data: Any, context: Optional[Any] = None, mode: str = "append"):
        """
        Write data to the table, automatically generating embeddings for configured columns.

        Args:
            data: pandas.DataFrame, pyarrow.Table, polars.DataFrame, List[Dict],
                numpy.ndarray or torch.Tensor.
            context: Optional ComputeContext for GPU acceleration.
            mode: 'append' (default) or 'overwrite' (clears table first).

        Raises:
            ValueError: If ``mode`` is neither 'append' nor 'overwrite'.
            TypeError: If ``data`` is of an unsupported type. This is raised
                before any truncation happens.
        """
        if mode not in self._WRITE_MODES:
            raise ValueError(
                f"Unsupported write mode: {mode!r} (expected one of {self._WRITE_MODES})"
            )
        # Pick the writer before truncating: an unsupported payload must not
        # leave the caller with an emptied table and nothing written.
        writer, payload = self._select_writer(data)
        if mode == "overwrite":
            self.truncate()
        return writer(payload, context=context)

    def _select_writer(self, data: Any):
        """Return ``(writer, payload)`` for ``data`` or raise ``TypeError``."""
        if isinstance(data, pd.DataFrame):
            return self._write_pandas, data
        if pa is not None and isinstance(data, pa.Table):
            return self._write_arrow, data
        if pl is not None and isinstance(data, pl.DataFrame):
            return self._write_polars, data
        if isinstance(data, list):
            return self._write_list, data

        import sys

        # An ndarray or Tensor instance can only exist if its module has
        # already been imported, so looking in sys.modules is sufficient and
        # avoids importing a heavy package just to reject the payload.
        np = sys.modules.get("numpy")
        if np is not None and isinstance(data, np.ndarray):
            return self._write_pandas, pd.DataFrame(data)
        torch = sys.modules.get("torch")
        if torch is not None and isinstance(data, torch.Tensor):
            return self._write_pandas, pd.DataFrame(data.detach().cpu().numpy())
        raise TypeError(f"Unsupported data type for write: {type(data)}")

    def write_pandas(self, df: pd.DataFrame, context: Optional[Any] = None):
        """High-level Pandas ingestion with auto-vectorization."""
        return self._write_pandas(df, context=context)

    def write_arrow(self, table: 'pa.Table', context: Optional[Any] = None):
        """High-level Arrow ingestion with auto-vectorization."""
        return self._write_arrow(table, context=context)

    def upsert(self, data: Any, key_column: Union[str, List[str]], mode: str = "merge_on_read", context: Optional[Any] = None):
        """
        Update or insert data using a key column (or list of columns) to avoid duplicates.

        Args:
            data: pandas.DataFrame, pyarrow.Table, polars.DataFrame, or anything
                ``pandas.DataFrame(...)`` accepts.
            key_column: Key column name, or several names (joined with commas
                before being handed to the native layer).
            mode: 'merge_on_read' (default) or 'merge_on_write', case-insensitive.
            context: Optional compute context.

        Raises:
            ValueError: If ``mode`` is not one of the two supported values.
        """
        from .hyperstreamdb import PyMergeMode

        merge_modes = {
            "merge_on_read": PyMergeMode.MergeOnRead,
            "merge_on_write": PyMergeMode.MergeOnWrite,
        }
        try:
            enum_mode = merge_modes[mode.lower()]
        except KeyError:
            raise ValueError(
                f"Unsupported upsert mode: {mode!r} (expected one of {sorted(merge_modes)})"
            ) from None

        processed_df = self._auto_vectorize(self._to_pandas_frame(data))
        key_str = key_column if isinstance(key_column, str) else ",".join(key_column)
        return self._inner.merge_pandas(processed_df, key_str, enum_mode, context=context)

    @staticmethod
    def _to_pandas_frame(data: Any) -> pd.DataFrame:
        """Convert a supported payload into a pandas DataFrame."""
        if isinstance(data, pd.DataFrame):
            return data
        # Arrow and Polars tables carry their own conversion; wrapping them
        # with pd.DataFrame(...) would not yield the intended columns.
        if pa is not None and isinstance(data, pa.Table):
            return data.to_pandas()
        if pl is not None and isinstance(data, pl.DataFrame):
            return data.to_pandas()
        return pd.DataFrame(data)

    def commit(self):
        """Commit temporary segments to the table."""
        return self._inner.commit()

    def truncate(self):
        """Clear all data from the table while keeping the schema."""
        return self._inner.truncate()

    def vacuum(self, retention_versions: int = 1):
        """
        Physically delete unreferenced data and manifest files to reclaim space.

        Args:
            retention_versions: Number of snapshots to keep (default 1).
        """
        return self._inner.vacuum(retention_versions)

    @property
    def autocommit(self) -> bool:
        """Get or set the autocommit state of the table."""
        return self._inner.autocommit

    @autocommit.setter
    def autocommit(self, value: bool):
        self._inner.autocommit = value

    def wait_for_background_tasks(self):
        """Wait for all background tasks (like index building) to complete."""
        return self._inner.wait_for_background_tasks()

    def delete(self, filter: str):
        """Delete rows matching the filter expression."""
        return self._inner.delete(filter)

    def _write_pandas(self, df: pd.DataFrame, context: Optional[Any] = None):
        """Vectorize configured columns, then hand the frame to the native writer."""
        processed_df = self._auto_vectorize(df)
        return self._inner.write_pandas(processed_df, context=context)

    def _write_arrow(self, table: 'pa.Table', context: Optional[Any] = None):
        """Write an Arrow table, detouring through pandas only when embeddings are configured."""
        if self._embedding_configs:
            return self._write_pandas(table.to_pandas(), context=context)
        return self._inner.write_arrow(table, context=context)

    def _write_polars(self, df: 'pl.DataFrame', context: Optional[Any] = None):
        """Write a Polars frame, detouring through pandas only when embeddings are configured."""
        if self._embedding_configs:
            return self._write_pandas(df.to_pandas(), context=context)
        return self._inner.write_arrow(df.to_arrow(), context=context)

    def _write_list(self, data: List[Dict[str, Any]], context: Optional[Any] = None):
        """Write a list of row dicts."""
        # Convert to pandas first to handle vectorization and type enforcement
        return self._write_pandas(pd.DataFrame(data), context=context)

    @staticmethod
    def _resolve_embedding_function(function: Any) -> Any:
        """Look a function name up in the registry; pass callables through."""
        if isinstance(function, str):
            return registry.get(function)
        return function

    def _auto_vectorize(self, data: Union[pd.DataFrame, List[Dict[str, Any]]]):
        """
        Return a copy of ``data`` with a vector column added for every
        configured source column that is present.

        Non-DataFrame input, or a table without embedding configs, is returned
        unchanged. Embeddings are converted to float32 before being stored,
        whatever sequence type the embedding function returned.
        """
        if not self._embedding_configs or not isinstance(data, pd.DataFrame):
            return data

        import warnings

        import numpy as np

        df = data.copy()
        for col, config in self._embedding_configs.items():
            if col not in df.columns:
                continue
            func = self._resolve_embedding_function(config["function"])
            if func is None:
                # Still best-effort (the rows are written), but no longer
                # silent: otherwise the missing vector column only shows up
                # at search time.
                warnings.warn(
                    f"No embedding function available for column {col!r}; "
                    f"vector column {config['vector_column']!r} was not generated",
                    RuntimeWarning,
                    stacklevel=2,
                )
                continue
            # Enforce Float32 for vector compatibility
            embeddings = np.asarray(func(df[col].tolist()), dtype=np.float32)
            df[config["vector_column"]] = list(embeddings)
        return df

    def _default_vector_column(self) -> str:
        """Vector column of the first configured embedding, else ``"embedding"``."""
        for config in self._embedding_configs.values():
            return config["vector_column"]
        return self._DEFAULT_VECTOR_COLUMN

    def _prepare_vector_filter(self, vector_filter: Optional[Union[Dict[str, Any], List[float]]], **kwargs) -> Optional[Dict[str, Any]]:
        """
        Normalise a vector filter into the dict form passed to the native layer.

        Args:
            vector_filter: ``None``, a dict of search parameters, or a bare
                query (vector or text).
            **kwargs: Extra search parameters merged over the dict.

        Returns:
            ``None`` when no vector filter was given; otherwise a new dict
            with ``column`` and ``k`` filled in (``k`` defaults to 10) and a
            text ``query`` replaced by its embedding when an embedding
            function can be found. The caller's dict is never modified.
        """
        if vector_filter is None:
            return None

        if isinstance(vector_filter, dict):
            # Work on a copy so defaults and the vectorized query do not leak
            # back into the caller's dict.
            prepared = dict(vector_filter)
        else:
            prepared = {"column": self._default_vector_column(), "query": vector_filter}

        prepared.update(kwargs)
        prepared.setdefault("k", 10)

        # Ensure column is set (e.g. if fluent API sent column=None)
        if prepared.get("column") is None:
            prepared["column"] = self._default_vector_column()

        query = prepared.get("query")
        if isinstance(query, str):
            target_col = prepared["column"]

            # 1. Prefer the function explicitly configured for this vector column.
            func = None
            for config in self._embedding_configs.values():
                if config["vector_column"] == target_col:
                    func = config["function"]
                    break
            # 2. Otherwise try a registered function named after the column.
            if func is None:
                func = registry.get(target_col)
            if isinstance(func, str):
                func = registry.get(func)

            if func is not None:
                import numpy as np

                # asarray lets the function return an ndarray or plain
                # nested lists; float64 keeps float32/float64 values exact.
                prepared["query"] = np.asarray(func([query])[0], dtype=np.float64).tolist()
                if self.explain:
                    print(f"[Explain] Vectorized query using context: {target_col}")

        return prepared

    def to_pandas(self, filter: Optional[str] = None, vector_filter: Optional[Union[Dict[str, Any], List[float]]] = None, columns: Optional[List[str]] = None, context: Optional[Any] = None, **kwargs):
        """
        Read table to Pandas with auto-vectorization of search queries and flexible parameters.

        Parameters:
            filter: Optional scalar WHERE clause (e.g., "category = 'news'")
            vector_filter: Dict with vector search params:
                - column: str - vector column name (defaulted when missing)
                - query: list or str - query vector, or text to vectorize
                - k: int - number of results (defaults to 10)
                - metric: str (optional) - 'l2'|'cosine'|'innerproduct'|'l1'|'hamming'|'jaccard' (default: l2)
                - ef_search: int (optional) - HNSW ef parameter for tuning
                - probes: int (optional) - IVF probes parameter for tuning
            columns: Optional list of column names to select
            context: Optional compute context (GPU/CPU)
            **kwargs: Extra params (merged into vector_filter if present)

        Example:
            # Vector search with cosine metric
            df = table.to_pandas(vector_filter={
                "column": "embedding",
                "query": [1.0, 2.0, 3.0],
                "k": 5,
                "metric": "cosine",
                "ef_search": 200  # Tune HNSW search quality
            })
        """
        vf = self._prepare_vector_filter(vector_filter, **kwargs)
        if self.explain:
            # Call native Rust explain logic
            print(self._inner.explain(filter, vf))

        # These keys only parameterize the vector filter; everything else is
        # also forwarded to the native reader.
        passthrough = {k: v for k, v in kwargs.items() if k not in ("k", "n_probe", "column")}
        return self._inner.to_pandas(filter, vf, columns, context=context, **passthrough)

    def to_arrow(self, filter: Optional[str] = None, vector_filter: Optional[Union[Dict[str, Any], List[float]]] = None, columns: Optional[List[str]] = None, context: Optional[Any] = None, **kwargs):
        """
        Read table to Arrow Table with auto-vectorization of search queries and flexible parameters.

        Parameters:
            filter: Optional scalar WHERE clause (e.g., "category = 'news'")
            vector_filter: Dict with vector search params:
                - column: str - vector column name (defaulted when missing)
                - query: list or str - query vector, or text to vectorize
                - k: int - number of results (defaults to 10)
                - metric: str (optional) - 'l2'|'cosine'|'innerproduct'|'l1'|'hamming'|'jaccard' (default: l2)
                - ef_search: int (optional) - HNSW ef parameter for tuning
                - probes: int (optional) - IVF probes parameter for tuning
            columns: Optional list of column names to select
            context: Optional compute context (GPU/CPU)
            **kwargs: Extra params (merged into vector_filter if present)
        """
        # ``filter`` is a named parameter, so a ``filter=`` keyword always
        # binds to it and can never show up in ``kwargs``.
        vf = self._prepare_vector_filter(vector_filter, **kwargs)
        if self.explain:
            # Same explain output as to_pandas, so the flag behaves the same
            # on both read paths.
            print(self._inner.explain(filter, vf))
        # to_arrow in Rust doesn't currently take **kwargs
        return self._inner.to_arrow(filter, vf, columns, context=context)

    def query(self) -> "Query":
        """Start a fluent query."""
        return Query(self)

    def filter(self, expr: Optional[str] = None, vector_filter: Optional[Union[Dict[str, Any], List[float]]] = None, **kwargs) -> 'Query':
        """
        Start a fluent query, optionally seeded with a scalar and/or vector filter.

        Args:
            expr: Scalar filter expression; ``filter=...`` is accepted as an
                alias. If both are given they are combined with AND.
            vector_filter: Dict of vector search parameters, or a bare query
                (vector or text).
            **kwargs: Extra vector search parameters (``column``, ``k``, ...).
                They only take effect when ``vector_filter`` is given.
        """
        # Always take the alias out of kwargs so it can never end up among
        # the vector search parameters.
        alias_expr = kwargs.pop("filter", None)
        q = Query(self, expr if expr is not None else alias_expr)
        if expr is not None and alias_expr:
            q.filter(alias_expr)

        if vector_filter is not None:
            if isinstance(vector_filter, dict):
                # Merge into a new dict; the caller's dict stays untouched.
                q._vector_filter = {**vector_filter, **kwargs}
            else:
                # Any non-dict value (list, tuple, array, text) is the query.
                q.vector_search(vector_filter, **kwargs)
        return q

    @property
    def primary_key(self):
        """Get the current primary key column."""
        return self._inner.get_primary_key()

    @primary_key.setter
    def primary_key(self, columns: Union[str, List[str]]):
        """Set the primary key column(s)."""
        self._inner.set_primary_key(self._as_key_list(columns))

    @property
    def index_all(self):
        """Whether to index all compatible columns by default."""
        return self._inner.get_index_all()

    @index_all.setter
    def index_all(self, value):
        self._inner.set_index_all(value)

    @property
    def row_count(self) -> int:
        """Get total row count in the table."""
        return self._inner.get_table_statistics().row_count

    @property
    def statistics(self):
        """Get full table statistics."""
        return self._inner.get_table_statistics()

    def add_index_columns(self, columns: List[str], tokenizer: Optional[str] = None):
        """
        Add columns to the indexing configuration.

        Args:
            columns: List of column names to index.
            tokenizer: Optional tokenizer name from the registry.
        """
        return self._inner.add_index_columns(columns, tokenizer)

    def set_index_config(self, column: str, enabled: bool = True, tokenizer: Optional[str] = None, device: Optional[str] = None):
        """
        Set indexing configuration for a specific column.

        Args:
            column: Name of the column to configure.
            enabled: Whether to enable indexing for this column (default: True).
            tokenizer: Tokenizer name from the registry ('identity', 'whitespace', 'standard').
            device: Compute device ('cpu', 'cuda', 'mps') if specific processing is needed.
        """
        self._inner.set_index_config(column, enabled, tokenizer, device)

    def __getattr__(self, name):
        """Delegate other calls to the Rust implementation."""
        # __getattr__ only runs when normal lookup fails. If ``_inner`` itself
        # is missing (failed __init__, copy, unpickle), looking it up below
        # would re-enter this method without end, so fail cleanly instead.
        if name == "_inner":
            raise AttributeError(name)
        return getattr(self._inner, name)

    def __repr__(self):
        return f"HyperStreamTable(uri={self._inner.table_uri()})"
@@ -0,0 +1,98 @@
1
+ import abc
2
+ from typing import List, Optional, Union, Dict, Any
3
+ import numpy as np
4
+
5
class EmbeddingFunction(abc.ABC):
    """Interface that every embedding function implements.

    Instances are callables taking a list of strings; the class cannot be
    instantiated until ``__call__`` is overridden.
    """

    @abc.abstractmethod
    def __call__(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` and return the vectors as a numpy array."""
11
+
12
class HuggingFaceFunction(EmbeddingFunction):
    """
    Local embedding function using Sentence Transformers (supports all Hugging Face models).
    Examples: 'all-MiniLM-L6-v2', 'BAAI/bge-large-en-v1.5', 'Qwen/Qwen-7B-Chat' (if supported by ST)
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2", device: str = "cpu", **kwargs):
        """
        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.
            device: Device string passed to ``SentenceTransformer``.
            **kwargs: Forwarded unchanged to ``SentenceTransformer``.

        Raises:
            ImportError: If ``sentence-transformers`` is not installed.
        """
        # Imported lazily so the package stays optional.
        try:
            from sentence_transformers import SentenceTransformer
        except ImportError as err:
            # Chain the original error so the real cause (for example a
            # broken transitive dependency) stays visible in the traceback.
            raise ImportError(
                "Please install sentence-transformers: pip install sentence-transformers"
            ) from err
        self.model = SentenceTransformer(model_name, device=device, **kwargs)

    def __call__(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the loaded model, requesting numpy output."""
        return self.model.encode(texts, convert_to_numpy=True)
26
+
27
class OpenAIEmbeddingFunction(EmbeddingFunction):
    """Embedding function using the OpenAI API."""

    def __init__(self, model_name: str = "text-embedding-3-small", api_key: Optional[str] = None, **kwargs):
        """
        Args:
            model_name: Embedding model requested on every call.
            api_key: API key passed to ``openai.OpenAI``.
            **kwargs: Forwarded unchanged to ``openai.OpenAI``.

        Raises:
            ImportError: If the ``openai`` package is not installed.
        """
        # Imported lazily so the package stays optional.
        try:
            import openai
        except ImportError as err:
            # Keep the underlying import failure attached to the traceback.
            raise ImportError("Please install openai: pip install openai") from err
        self.client = openai.OpenAI(api_key=api_key, **kwargs)
        self.model = model_name

    def __call__(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` and return one row per item of the response.

        An empty ``texts`` returns an empty array without contacting the API.
        """
        if not texts:
            # Nothing to embed: skip the remote call entirely.
            return np.empty((0, 0))
        response = self.client.embeddings.create(input=texts, model=self.model)
        return np.array([item.embedding for item in response.data])
40
+
41
class AnthropicEmbeddingFunction(EmbeddingFunction):
    """
    Embedding function using Anthropic/Claude (placeholder as Anthropic doesn't have a direct embedding API yet).
    Often used in conjunction with Voyage AI or similar.

    The implementation talks to Voyage AI through the ``voyageai`` client.
    """

    def __init__(self, model_name: str = "voyage-2", api_key: Optional[str] = None, **kwargs):
        """
        Args:
            model_name: Embedding model requested on every call.
            api_key: API key passed to ``voyageai.Client``.
            **kwargs: Accepted for signature parity with the other embedding
                functions; currently not forwarded to the client.

        Raises:
            ImportError: If the ``voyageai`` package is not installed.
        """
        # Imported lazily so the package stays optional.
        try:
            import voyageai
        except ImportError as err:
            # Keep the underlying import failure attached to the traceback.
            raise ImportError(
                "Anthropic often uses Voyage AI for embeddings. Please install: pip install voyageai"
            ) from err
        self.client = voyageai.Client(api_key=api_key)
        self.model = model_name

    def __call__(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` and return the client's embeddings as an array.

        An empty ``texts`` returns an empty array without contacting the API.
        """
        if not texts:
            # Nothing to embed: skip the remote call entirely.
            return np.empty((0, 0))
        result = self.client.embed(texts, model=self.model)
        return np.array(result.embeddings)
57
+
58
class GeminiEmbeddingFunction(EmbeddingFunction):
    """Embedding function using Google's Gemini API."""

    def __init__(self, model_name: str = "models/embedding-001", api_key: Optional[str] = None, **kwargs):
        """
        Args:
            model_name: Embedding model requested on every call.
            api_key: When given, passed to ``genai.configure``; otherwise the
                library's existing configuration is left as it is.
            **kwargs: Stored and forwarded to ``genai.embed_content`` on
                every call.

        Raises:
            ImportError: If ``google-generativeai`` is not installed.
        """
        # Imported lazily so the package stays optional.
        try:
            import google.generativeai as genai
        except ImportError as err:
            # Keep the underlying import failure attached to the traceback.
            raise ImportError(
                "Please install google-generativeai: pip install google-generativeai"
            ) from err
        if api_key:
            genai.configure(api_key=api_key)
        self.model = model_name
        self.kwargs = kwargs

    def __call__(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` and return the ``'embedding'`` entry of the response as an array.

        An empty ``texts`` returns an empty array without contacting the API.
        """
        if not texts:
            # Nothing to embed: skip the remote call entirely.
            return np.empty((0, 0))
        # Re-imported here rather than stored on ``self`` so the instance
        # holds no module object.
        import google.generativeai as genai

        result = genai.embed_content(model=self.model, content=texts, **self.kwargs)
        return np.array(result['embedding'])
74
+
75
class EmbeddingRegistry:
    """Name-to-function lookup table for embedding functions.

    Every call to ``EmbeddingRegistry()`` returns the same object, so all
    registrations share the one ``functions`` dict.
    """

    _instance = None

    def __new__(cls):
        existing = cls._instance
        if existing is not None:
            return existing
        # First construction: create the instance and its (empty) table.
        created = super().__new__(cls)
        created.functions = {}
        cls._instance = created
        return created

    def register(self, name: str, func: "EmbeddingFunction"):
        """Store ``func`` under ``name``, replacing any earlier entry."""
        self.functions[name] = func

    def get(self, name: str) -> Optional["EmbeddingFunction"]:
        """Return the function stored under ``name``, or ``None`` if absent."""
        return self.functions.get(name)
92
+
93
# Module-level registry instance shared by everything that imports this module.
registry = EmbeddingRegistry()


def get_registry() -> "EmbeddingRegistry":
    """Return the module-level ``registry`` object."""
    return registry