zeusdb-vector-database 0.2.0__cp312-cp312-win32.whl → 0.2.1__cp312-cp312-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  """
2
2
  ZeusDB Vector Database Module
3
3
  """
4
- __version__ = "0.2.0"
4
+ __version__ = "0.2.1"
5
5
 
6
6
  from .vector_database import VectorDatabase # imports the VectorDatabase class from the vector_database.py file
7
7
 
@@ -56,7 +56,8 @@ class VectorDatabase:
56
56
  'subvectors': 8, # Number of subvectors (must divide dim evenly, default: 8)
57
57
  'bits': 8, # Bits per subvector (1-8, controls centroids, default: 8)
58
58
  'training_size': None, # Auto-calculated based on subvectors & bits (or specify manually)
59
- 'max_training_vectors': None # Optional limit on training vectors used
59
+ 'max_training_vectors': None, # Optional limit on training vectors used
60
+ 'storage_mode': 'quantized_only' # Storage mode for quantized vectors (or 'quantized_with_raw')
60
61
  }
61
62
 
62
63
  Note: Quantization reduces memory usage (typically 4-32x compression) but may
@@ -88,7 +89,8 @@ class VectorDatabase:
88
89
  'type': 'pq',
89
90
  'subvectors': 16, # More subvectors = better compression
90
91
  'bits': 6, # Fewer bits = less memory per centroid
91
- 'training_size': 75000 # Override auto-calculation
92
+ 'training_size': 75000, # Override auto-calculation
93
+ 'storage_mode': 'quantized_only' # Only store quantized vectors
92
94
  }
93
95
  index = vdb.create(
94
96
  index_type="hnsw",
@@ -126,11 +128,12 @@ class VectorDatabase:
126
128
 
127
129
  try:
128
130
  # Always pass quantization_config parameter
129
- clean_config = None
130
131
  if quantization_config is not None:
131
- # Clean quantization_config before passing to Rust (remove internal keys)
132
- clean_config = {k: v for k, v in quantization_config.items() if not k.startswith('_')}
133
-
132
+ # Remove keys with None values and internal keys
133
+ clean_config = {k: v for k, v in quantization_config.items() if not k.startswith('_') and v is not None}
134
+ else:
135
+ clean_config = None
136
+
134
137
  return constructor(quantization_config=clean_config, **kwargs)
135
138
  except Exception as e:
136
139
  raise RuntimeError(f"Failed to create {index_type.upper()} index: {e}") from e
@@ -172,7 +175,7 @@ class VectorDatabase:
172
175
  if dim % subvectors != 0:
173
176
  raise ValueError(
174
177
  f"subvectors ({subvectors}) must divide dimension ({dim}) evenly. "
175
- f"Consider using subvectors: {self._suggest_subvector_divisors(dim)}"
178
+ f"Consider using subvectors: {', '.join(map(str, self._suggest_subvector_divisors(dim)))}"
176
179
  )
177
180
 
178
181
  if subvectors > dim:
@@ -206,9 +209,38 @@ class VectorDatabase:
206
209
  )
207
210
  validated_config['max_training_vectors'] = max_training_vectors
208
211
 
212
+ # Validate storage mode
213
+ storage_mode = str(validated_config.get('storage_mode', 'quantized_only')).lower()
214
+ valid_modes = {'quantized_only', 'quantized_with_raw'}
215
+ if storage_mode not in valid_modes:
216
+ raise ValueError(
217
+ f"Invalid storage_mode: '{storage_mode}'. Supported modes: {', '.join(sorted(valid_modes))}"
218
+ )
219
+
220
+ validated_config['storage_mode'] = storage_mode
221
+
209
222
  # Calculate and warn about memory usage
210
223
  self._check_memory_usage(validated_config, dim)
224
+
225
+ # Add helpful warnings about storage mode
226
+ if storage_mode == 'quantized_with_raw':
227
+ import warnings
228
+ compression_ratio = validated_config.get('__memory_info__', {}).get('compression_ratio', 1.0)
229
+ warnings.warn(
230
+ f"storage_mode='quantized_with_raw' will use ~{compression_ratio:.1f}x more memory "
231
+ f"than 'quantized_only' but enables exact vector reconstruction.",
232
+ UserWarning,
233
+ stacklevel=2
234
+ )
211
235
 
236
+ # Final safety check: ensure all expected keys are present
237
+ # Purely defensive: all of these keys should already have been set above; the defaults are applied just in case
238
+ validated_config.setdefault('type', 'pq')
239
+ validated_config.setdefault('subvectors', 8)
240
+ validated_config.setdefault('bits', 8)
241
+ validated_config.setdefault('max_training_vectors', None)
242
+ validated_config.setdefault('storage_mode', 'quantized_only')
243
+
212
244
  return validated_config
213
245
 
214
246
  def _calculate_smart_training_size(self, subvectors: int, bits: int) -> int:
@@ -236,13 +268,14 @@ class VectorDatabase:
236
268
 
237
269
  return min(max(statistical_minimum, reasonable_minimum), reasonable_maximum)
238
270
 
239
- def _suggest_subvector_divisors(self, dim: int) -> str:
240
- """Suggest valid subvector counts that divide the dimension evenly."""
241
- divisors = []
242
- for i in range(1, min(33, dim + 1)): # Common subvector counts up to 32
243
- if dim % i == 0:
244
- divisors.append(str(i))
245
- return ', '.join(divisors[:8]) # Show first 8 suggestions
271
+
272
+ def _suggest_subvector_divisors(self, dim: int) -> list[int]:
273
+ """Return valid subvector counts that divide the dimension evenly (up to 32)."""
274
+ return [i for i in range(1, min(33, dim + 1)) if dim % i == 0]
275
+
276
+
277
+
278
+
246
279
 
247
280
  def _check_memory_usage(self, config: Dict[str, Any], dim: int) -> None:
248
281
  """
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: zeusdb-vector-database
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Classifier: Programming Language :: Rust
5
5
  Classifier: Programming Language :: Python :: Implementation :: CPython
6
6
  Requires-Dist: numpy>=2.2.6,<3.0.0
@@ -594,12 +594,13 @@ To enable PQ, pass a `quantization_config` dictionary to the `.create()` index m
594
594
  | `bits` | `int` | Bits per quantized code (controls centroids per subvector) | 1-8 | `8` |
595
595
  | `training_size` | `int` | Minimum vectors needed for stable k-means clustering | ≥ 1000 | 1000 |
596
596
  | `max_training_vectors` | `int` | Maximum vectors used during training (optional limit) | ≥ training_size | `None` |
597
+ | `storage_mode` | `str` | Storage strategy: "quantized_only" (memory optimized) or "quantized_with_raw" (keep raw vectors for exact reconstruction) | "quantized_only", "quantized_with_raw" | `"quantized_only"` |
597
598
 
598
599
 
599
600
  <br/>
600
601
 
601
602
 
602
- ### 🔧 Usage Example
603
+ ### 🔧 Usage Example 1
603
604
 
604
605
  ```python
605
606
  from zeusdb_vector_database import VectorDatabase
@@ -665,6 +666,36 @@ Results
665
666
  {'id': 'doc_8148', 'score': 0.5139288306236267, 'metadata': {'category': 'tech', 'year': 2026}},
666
667
  {'id': 'doc_7822', 'score': 0.5151920914649963, 'metadata': {'category': 'tech', 'year': 2026}},
667
668
  ]
669
+ ```
670
+ <br />
671
+
672
+ ### 🔧 Usage Example 2 - with explicit storage mode
673
+
674
+ ```python
675
+ from zeusdb_vector_database import VectorDatabase
676
+ import numpy as np
677
+
678
+ # Create index with product quantization
679
+ vdb = VectorDatabase()
680
+
681
+ # Configure quantization for memory efficiency
682
+ quantization_config = {
683
+ 'type': 'pq', # `pq` for Product Quantization
684
+ 'subvectors': 8, # Divide 3072-dim vectors into 8 subvectors of 384 dims each
685
+ 'bits': 8, # 256 centroids per subvector (2^8)
686
+ 'training_size': 10000, # Train when 10k vectors are collected
687
+ 'max_training_vectors': 50000, # Use max 50k vectors for training
688
+ 'storage_mode': 'quantized_only' # Explicitly set storage mode to only keep quantized values
689
+ }
690
+
691
+ # Create index with quantization
692
+ # This will automatically handle training when enough vectors are added
693
+ index = vdb.create(
694
+ index_type="hnsw",
695
+ dim=3072, # OpenAI `text-embedding-3-large` dimension
696
+ quantization_config=quantization_config # Add the compression configuration
697
+ )
698
+
668
699
  ```
669
700
 
670
701
  <br />
@@ -677,7 +708,8 @@ quantization_config = {
677
708
  'type': 'pq',
678
709
  'subvectors': 8, # Balanced: moderate compression, good accuracy
679
710
  'bits': 8, # 256 centroids per subvector (high precision)
680
- 'training_size': 10000 # Or higher for large datasets
711
+ 'training_size': 10000, # Or higher for large datasets
712
+ 'storage_mode': 'quantized_only' # Default, memory efficient
681
713
  }
682
714
  # Achieves ~16x–32x compression with strong recall for most applications
683
715
  ```
@@ -689,7 +721,8 @@ quantization_config = {
689
721
  'type': 'pq',
690
722
  'subvectors': 16, # More subvectors = better compression
691
723
  'bits': 6, # Fewer bits = less memory per centroid
692
- 'training_size': 20000
724
+ 'training_size': 20000,
725
+ 'storage_mode': 'quantized_only'
693
726
  }
694
727
  # Achieves ~32x compression ratio
695
728
  ```
@@ -701,6 +734,7 @@ quantization_config = {
701
734
  'subvectors': 4, # Fewer subvectors = better accuracy
702
735
  'bits': 8, # More bits = more precise quantization
703
736
  'training_size': 50000, # More training data = better centroids
737
+ 'storage_mode': 'quantized_with_raw' # Keep raw vectors for exact recall
704
738
  }
705
739
  # Achieves ~4x compression ratio with minimal accuracy loss
706
740
  ```
@@ -714,6 +748,10 @@ quantization_config = {
714
748
 
715
749
  Quantization is ideal for production deployments with large vector datasets (100k+ vectors) where memory efficiency is critical.
716
750
 
751
+ `"quantized_only"` is recommended for most use cases and maximizes memory savings.
752
+
753
+ `"quantized_with_raw"` keeps both quantized and raw vectors for exact reconstruction, but uses more memory.
754
+
717
755
 
718
756
  <br/>
719
757
 
@@ -0,0 +1,9 @@
1
+ zeusdb_vector_database-0.2.1.dist-info/METADATA,sha256=hZ-IhvlzS7pzuZPw7wbPPpYeRk5pA03KjwgionXIZSA,32236
2
+ zeusdb_vector_database-0.2.1.dist-info/WHEEL,sha256=h3IxzUxEnskQ3JOfe5J-jcBmCp-pER4ojq713ksH704,92
3
+ zeusdb_vector_database-0.2.1.dist-info/licenses/LICENSE,sha256=Tl1woVXzEZSnuYktSg3zeEBiFwe-4cIdbIfaKq6i5OA,11537
4
+ zeusdb_vector_database-0.2.1.dist-info/licenses/NOTICE,sha256=TOiyYyRlAPD1DDHTNZh3JxMeDxAAdobwk3MGhLE9he0,994
5
+ zeusdb_vector_database/__init__.py,sha256=wF8mUMbZWrTyynCmHh2bjlmAmU7zZKuSqogloqDYX3c,209
6
+ zeusdb_vector_database/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ zeusdb_vector_database/vector_database.py,sha256=6XvlAA17d81dLNFI15ayIJvBpCq77ktbQJuH9UFI_0A,15446
8
+ zeusdb_vector_database/zeusdb_vector_database.cp312-win32.pyd,sha256=aAWORoOyqUw3y_GUnneK_uWv_fN3Vq-nTY0HNXMlbqM,3644416
9
+ zeusdb_vector_database-0.2.1.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- zeusdb_vector_database-0.2.0.dist-info/METADATA,sha256=0xpE4OQ7t2Au9_sKDIr5srscdSOjxQ_KMyheUif_rdI,30519
2
- zeusdb_vector_database-0.2.0.dist-info/WHEEL,sha256=h3IxzUxEnskQ3JOfe5J-jcBmCp-pER4ojq713ksH704,92
3
- zeusdb_vector_database-0.2.0.dist-info/licenses/LICENSE,sha256=Tl1woVXzEZSnuYktSg3zeEBiFwe-4cIdbIfaKq6i5OA,11537
4
- zeusdb_vector_database-0.2.0.dist-info/licenses/NOTICE,sha256=TOiyYyRlAPD1DDHTNZh3JxMeDxAAdobwk3MGhLE9he0,994
5
- zeusdb_vector_database/__init__.py,sha256=V-yh5PavxK2apUz1eKiwE_0RK2xJg5-t2y0NaZKTWJQ,209
6
- zeusdb_vector_database/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- zeusdb_vector_database/vector_database.py,sha256=3Wvt1_u1u1hxKLV3xyAgILv-ZY51uJsqau5ydC5mCqI,13892
8
- zeusdb_vector_database/zeusdb_vector_database.cp312-win32.pyd,sha256=QpZ2lDgPSP8C_blUANnu3o_102ZAKcnK1UdZ5UbLSro,3638784
9
- zeusdb_vector_database-0.2.0.dist-info/RECORD,,