arrowspace 0.24.5__tar.gz → 0.24.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. {arrowspace-0.24.5 → arrowspace-0.24.8}/Cargo.lock +21 -3
  2. {arrowspace-0.24.5 → arrowspace-0.24.8}/Cargo.toml +5 -3
  3. {arrowspace-0.24.5 → arrowspace-0.24.8}/PKG-INFO +3 -3
  4. {arrowspace-0.24.5 → arrowspace-0.24.8}/README.md +2 -2
  5. {arrowspace-0.24.5 → arrowspace-0.24.8}/src/lib.rs +87 -18
  6. {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_10_CVE_sorted_set_fixed_eta.py +1 -1
  7. arrowspace-0.24.8/tests/test_12_BEIR.py +870 -0
  8. arrowspace-0.24.8/tests/test_13_msmarco_topology.py +554 -0
  9. arrowspace-0.24.8/tests/test_14_arxiv_open_search.py +668 -0
  10. arrowspace-0.24.8/tests/test_2_CVE_db.py +875 -0
  11. {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_9_CVE_db_fixed_eta.py +2 -2
  12. arrowspace-0.24.8/tests/topolog-embeddings/test_parametric_umap.py +404 -0
  13. arrowspace-0.24.8/tests/topolog-embeddings/test_whitening.py +252 -0
  14. arrowspace-0.24.5/tests/test_2_CVE_db.py +0 -1400
  15. {arrowspace-0.24.5 → arrowspace-0.24.8}/LICENSE +0 -0
  16. {arrowspace-0.24.5 → arrowspace-0.24.8}/pyproject.toml +0 -0
  17. {arrowspace-0.24.5 → arrowspace-0.24.8}/src/energyparams.rs +0 -0
  18. {arrowspace-0.24.5 → arrowspace-0.24.8}/src/helpers.rs +0 -0
  19. {arrowspace-0.24.5 → arrowspace-0.24.8}/src/sorted_index.rs +0 -0
  20. {arrowspace-0.24.5 → arrowspace-0.24.8}/src/subgraphs.rs +0 -0
  21. {arrowspace-0.24.5 → arrowspace-0.24.8}/src/tests.rs +0 -0
  22. {arrowspace-0.24.5 → arrowspace-0.24.8}/src/tests_python.rs +0 -0
  23. {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/__init__.py +0 -0
  24. {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/eigen_maps_params_learning.py +0 -0
  25. {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/embeddings_model.py +0 -0
  26. {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/energy_maps_params_learning.py +0 -0
  27. {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/requirements.txt +0 -0
  28. {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_0_0.py +0 -0
  29. {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_0_1.py +0 -0
  30. {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_0_2_motives.py +0 -0
  31. {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_0_3_subgraphs.py +0 -0
  32. {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_11_CVE_eigenmaps_sorted_set.py +0 -0
  33. {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_1_quora_questions.py +0 -0
  34. {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_3_beir.py +0 -0
  35. {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_4_msmarco_tau_sweep.py +0 -0
  36. {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_5_msmarco_eps_sweep.py +0 -0
  37. {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_6_beir_new_score.py +0 -0
  38. {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_7_beir_new_score_sweep.py +0 -0
  39. {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_8_CVE_db_sweep.py +0 -0
@@ -134,6 +134,12 @@ dependencies = [
134
134
  "num-traits",
135
135
  ]
136
136
 
137
+ [[package]]
138
+ name = "arc-swap"
139
+ version = "1.7.1"
140
+ source = "registry+https://github.com/rust-lang/crates.io-index"
141
+ checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457"
142
+
137
143
  [[package]]
138
144
  name = "arrow"
139
145
  version = "57.1.0"
@@ -351,9 +357,9 @@ dependencies = [
351
357
 
352
358
  [[package]]
353
359
  name = "arrowspace"
354
- version = "0.24.5"
360
+ version = "0.24.6"
355
361
  source = "registry+https://github.com/rust-lang/crates.io-index"
356
- checksum = "a0771fca361d870c2109f3602ec5916fd6c5c5924738a3d4e6478309ee0c5658"
362
+ checksum = "c5a852f7bbaee37066b280df74b048e33935ef26f9161f63ef0281cda26cb2ab"
357
363
  dependencies = [
358
364
  "approx 0.5.1",
359
365
  "arrow",
@@ -1181,11 +1187,12 @@ dependencies = [
1181
1187
 
1182
1188
  [[package]]
1183
1189
  name = "pyarrowspace"
1184
- version = "0.24.5"
1190
+ version = "0.24.8"
1185
1191
  dependencies = [
1186
1192
  "arrowspace",
1187
1193
  "numpy",
1188
1194
  "pyo3",
1195
+ "pyo3-log",
1189
1196
  "rayon",
1190
1197
  "smartcore",
1191
1198
  ]
@@ -1226,6 +1233,17 @@ dependencies = [
1226
1233
  "pyo3-build-config",
1227
1234
  ]
1228
1235
 
1236
+ [[package]]
1237
+ name = "pyo3-log"
1238
+ version = "0.13.2"
1239
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1240
+ checksum = "2f8bae9ad5ba08b0b0ed2bb9c2bdbaeccc69cafca96d78cf0fbcea0d45d122bb"
1241
+ dependencies = [
1242
+ "arc-swap",
1243
+ "log",
1244
+ "pyo3",
1245
+ ]
1246
+
1229
1247
  [[package]]
1230
1248
  name = "pyo3-macros"
1231
1249
  version = "0.27.1"
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "pyarrowspace"
3
- version = "0.24.5"
3
+ version = "0.24.8"
4
4
  edition = "2024"
5
5
  description = "Spectral vector search with taumode (λτ) indexing"
6
6
  authors = ["Lorenzo <tunedconsulting@gmail.com>"]
@@ -25,6 +25,7 @@ exclude = [
25
25
  "tests/output",
26
26
  "tests/small_datasets",
27
27
  "visualization_output",
28
+ "storage/"
28
29
  ]
29
30
 
30
31
 
@@ -38,8 +39,9 @@ name = "arrowspace"
38
39
  crate-type = ["cdylib"]
39
40
 
40
41
  [dependencies]
41
- arrowspace = { version = "0.24.5", features = ["storage"] } #, path = "../arrowspace-rs"}
42
+ arrowspace = { version = "0.24.6", features = ["storage"] } #, path = "../arrowspace-rs"}
42
43
  pyo3 = { version = "0.27.1", features = ["extension-module"] }
44
+ pyo3-log = "0.13"
43
45
  numpy = "0.27.0"
44
46
  rayon = "1.8"
45
- smartcore = "0.4.6"
47
+ smartcore = "0.4.8"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: arrowspace
3
- Version: 0.24.5
3
+ Version: 0.24.8
4
4
  Classifier: Programming Language :: Rust
5
5
  Classifier: Programming Language :: Python :: Implementation :: CPython
6
6
  Classifier: Programming Language :: Python :: Implementation :: PyPy
@@ -34,9 +34,9 @@ Project-URL: Repository, https://github.com/tuned-org-uk/pyarrowspace.git
34
34
 
35
35
  # pyarrowspace
36
36
 
37
- Python bindings for [`arrowspace-rs`](https://github.com/Mec-iS/arrowspace-rs). This is experimental software meant for research at current state.
37
+ Python bindings for [`arrowspace-rs`](https://github.com/Mec-iS/arrowspace-rs).
38
38
 
39
- This is the starting repository for `arrowspace`, it is made public as a showcase for the Python interface, to collect feedback and make public some results of the tests run. To run needs the `arrowspace-rs` Rust module in a sibling directory.
39
+ `arrowspace` is a database for vectors, supported by a graph representation and a key-value store. The main use cases targeted are: AI search capabilities such as advanced vector similarity, graph characterisation analysis and search, and indexing of high-dimensional vectors. The design principles are described in [this article](https://www.tuned.org.uk/posts/010_game_changer_unifying_vectors_and_features_graphs).
40
40
 
41
41
  For labs and tests please see [tests/](https://github.com/tuned-org-uk/pyarrowspace/tree/main/tests)
42
42
 
@@ -1,8 +1,8 @@
1
1
  # pyarrowspace
2
2
 
3
- Python bindings for [`arrowspace-rs`](https://github.com/Mec-iS/arrowspace-rs). This is experimental software meant for research at current state.
3
+ Python bindings for [`arrowspace-rs`](https://github.com/Mec-iS/arrowspace-rs).
4
4
 
5
- This is the starting repository for `arrowspace`, it is made public as a showcase for the Python interface, to collect feedback and make public some results of the tests run. To run needs the `arrowspace-rs` Rust module in a sibling directory.
5
+ `arrowspace` is a database for vectors, supported by a graph representation and a key-value store. The main use cases targeted are: AI search capabilities such as advanced vector similarity, graph characterisation analysis and search, and indexing of high-dimensional vectors. The design principles are described in [this article](https://www.tuned.org.uk/posts/010_game_changer_unifying_vectors_and_features_graphs).
6
6
 
7
7
  For labs and tests please see [tests/](https://github.com/tuned-org-uk/pyarrowspace/tree/main/tests)
8
8
 
@@ -28,6 +28,16 @@ mod tests;
28
28
  #[cfg(test)]
29
29
  mod tests_python;
30
30
 
31
+ use std::sync::Once;
32
+ static INIT: Once = Once::new();
33
+
34
+ /// Initialize logging (via `pyo3_log`) at most once; safe to call repeatedly, e.g. from tests.
35
+ pub fn init() {
36
+ INIT.call_once(|| {
37
+ pyo3_log::init();
38
+ });
39
+ }
40
+
31
41
  // ------------ Py wrappers ------------
32
42
  #[pyclass(name = "GraphLaplacian")]
33
43
  pub struct PyGraphLaplacian {
@@ -165,7 +175,7 @@ impl PyArrowSpace {
165
175
 
166
176
  dbg_println(format!("search: qlen={}, lambda_q={:.6}", v.len(), lambda_q));
167
177
 
168
- let query = ArrowItem::new(v.to_vec(), lambda_q);
178
+ let query = ArrowItem::new(v, lambda_q);
169
179
  let k = graph_laplacian.graph_params.topk;
170
180
 
171
181
  Ok(self.inner.search_lambda_aware(&query, k, tau))
@@ -203,7 +213,7 @@ impl PyArrowSpace {
203
213
  )));
204
214
  }
205
215
 
206
- let query = ArrowItem::new(v.to_vec(), lambda_q);
216
+ let query = ArrowItem::new(v, lambda_q);
207
217
  results.push(self.inner.search_lambda_aware(&query, k, tau));
208
218
  }
209
219
 
@@ -232,7 +242,7 @@ impl PyArrowSpace {
232
242
 
233
243
  dbg_println(format!("search_hybrid: qlen={}, lambda_q={:.6}", v.len(), lambda_q));
234
244
 
235
- let query = ArrowItem::new(v.to_vec(), lambda_q);
245
+ let query = ArrowItem::new(v, lambda_q);
236
246
  let k = graph_laplacian.graph_params.topk;
237
247
 
238
248
  Ok(self.inner.search_lambda_aware_hybrid(&query, k, tau))
@@ -303,8 +313,8 @@ impl PyArrowSpace {
303
313
  Ok(motifs)
304
314
  }
305
315
 
306
- /// spot_subg_motives(gl: GraphLaplacian, cfg: dict) -> List[dict]
307
- ///
316
+ /// spot_subg_motives(gl: GraphLaplacian, cfg: dict) -> List[dict]
317
+
308
318
  /// Runs energy-mode motif-based subgraph extraction and returns a list of
309
319
  /// subgraph dictionaries with:
310
320
  /// - "node_indices": List[int] centroid indices
@@ -439,16 +449,73 @@ impl PyArrowSpaceBuilder {
439
449
  .with_lambda_graph(eps, k, topk, p, sigma)
440
450
  .with_dims_reduction(true, Some(eps))
441
451
  .with_seed(42)
442
- .with_sparsity_check(false);
452
+ .with_sparsity_check(false)
443
453
  }
444
454
 
445
455
  dbg_println(format!("build: Processing {} rows × {} cols", nrows, ncols));
446
- let (aspace, gl) = builder.build(rows);
456
+ let (aspace, gl) = py.detach(|| {
457
+ let (aspace, gl) = builder.build(rows);
458
+
459
+ dbg_println(format!(
460
+ "build complete: nitems={}, nfeatures={}, lambdas={}",
461
+ aspace.nitems, aspace.nfeatures, aspace.lambdas().len()
462
+ ));
463
+
464
+ (aspace, gl)
465
+ });
466
+
467
+ Ok((
468
+ Py::new(py, PyArrowSpace { inner: aspace })?,
469
+ Py::new(py, PyGraphLaplacian { inner: gl })?,
470
+ ))
471
+ }
472
+
473
+ /// Same as `build(...)`, but also saves the computations to parquet files.
474
+ #[staticmethod]
475
+ pub fn build_and_store(
476
+ py: Python<'_>,
477
+ graph_params: Option<&Bound<'_, PyDict>>,
478
+ items: PyReadonlyArray2<f64>,
479
+ ) -> PyResult<(Py<PyArrowSpace>, Py<PyGraphLaplacian>)> {
480
+ dbg_println("build: Converting numpy array to internal format");
447
481
 
448
- dbg_println(format!(
449
- "build complete: nitems={}, nfeatures={}, lambdas={}",
450
- aspace.nitems, aspace.nfeatures, aspace.lambdas().len()
451
- ));
482
+ let arr = items.as_array();
483
+ let (nrows, ncols) = (arr.shape()[0], arr.shape()[1]);
484
+
485
+ let rows: Vec<Vec<f64>> = if nrows > 1000 {
486
+ use rayon::prelude::*;
487
+ (0..nrows)
488
+ .into_par_iter()
489
+ .map(|i| arr.row(i).to_owned().to_vec())
490
+ .collect()
491
+ } else {
492
+ (0..nrows)
493
+ .map(|i| arr.row(i).to_owned().to_vec())
494
+ .collect()
495
+ };
496
+
497
+ let mut builder = RustBuilder::new();
498
+
499
+ if let Some((eps, k, topk, p, sigma)) = parse_graph_params(graph_params)? {
500
+ builder = builder
501
+ .with_lambda_graph(eps, k, topk, p, sigma)
502
+ .with_dims_reduction(true, Some(eps))
503
+ .with_seed(42)
504
+ .with_sparsity_check(false)
505
+ .with_persistence("./storage", "dataset".to_string());
506
+ }
507
+
508
+ dbg_println(format!("build: Processing {} rows × {} cols", nrows, ncols));
509
+ let (aspace, gl) = py.detach(|| {
510
+ let (aspace, gl) = builder.build(rows);
511
+
512
+ dbg_println(format!(
513
+ "build complete: nitems={}, nfeatures={}, lambdas={}",
514
+ aspace.nitems, aspace.nfeatures, aspace.lambdas().len()
515
+ ));
516
+
517
+ (aspace, gl)
518
+ });
452
519
 
453
520
  Ok((
454
521
  Py::new(py, PyArrowSpace { inner: aspace })?,
@@ -494,20 +561,22 @@ impl PyArrowSpaceBuilder {
494
561
  .with_dims_reduction(true, Some(eps))
495
562
  .with_extra_dims_reduction(true)
496
563
  .with_seed(999)
497
- .with_inline_sampling(Some(SamplerType::Simple(0.6)))
564
+ .with_inline_sampling(Some(SamplerType::Simple(0.99)))
498
565
  .with_spectral(false)
499
566
  .with_sparsity_check(false);
500
567
  }
501
568
 
502
569
  dbg_println(format!("build_energy: Processing {} rows × {} cols", nrows, ncols));
503
570
  let (aspace, gl_energy) = py.detach(|| {
504
- builder.build_energy(rows, e_params)
571
+ let (aspace, gl_energy) = builder.build_energy(rows, e_params);
572
+
573
+ dbg_println(format!(
574
+ "build_energy complete: nitems={}, nfeatures={}, graph_nodes={}, lambdas={}",
575
+ aspace.nitems, aspace.nfeatures, gl_energy.nnodes, aspace.lambdas().len()
576
+ ));
577
+
578
+ (aspace, gl_energy)
505
579
  });
506
-
507
- dbg_println(format!(
508
- "build_energy complete: nitems={}, nfeatures={}, graph_nodes={}, lambdas={}",
509
- aspace.nitems, aspace.nfeatures, gl_energy.nnodes, aspace.lambdas().len()
510
- ));
511
580
 
512
581
  Ok((
513
582
  Py::new(py, PyArrowSpace { inner: aspace })?,
@@ -36,7 +36,7 @@ STEPS_VALUES = [4, 6]
36
36
 
37
37
  # Standard graph params (used for both standard and energy builds)
38
38
  graph_params = {
39
- "eps": 1.31, # eps for 100K
39
+ "eps": 0.8, # eps for 100K
40
40
  "k": 25,
41
41
  "topk": 15,
42
42
  "p": 2.0,