arrowspace 0.24.5__tar.gz → 0.24.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arrowspace-0.24.5 → arrowspace-0.24.8}/Cargo.lock +21 -3
- {arrowspace-0.24.5 → arrowspace-0.24.8}/Cargo.toml +5 -3
- {arrowspace-0.24.5 → arrowspace-0.24.8}/PKG-INFO +3 -3
- {arrowspace-0.24.5 → arrowspace-0.24.8}/README.md +2 -2
- {arrowspace-0.24.5 → arrowspace-0.24.8}/src/lib.rs +87 -18
- {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_10_CVE_sorted_set_fixed_eta.py +1 -1
- arrowspace-0.24.8/tests/test_12_BEIR.py +870 -0
- arrowspace-0.24.8/tests/test_13_msmarco_topology.py +554 -0
- arrowspace-0.24.8/tests/test_14_arxiv_open_search.py +668 -0
- arrowspace-0.24.8/tests/test_2_CVE_db.py +875 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_9_CVE_db_fixed_eta.py +2 -2
- arrowspace-0.24.8/tests/topolog-embeddings/test_parametric_umap.py +404 -0
- arrowspace-0.24.8/tests/topolog-embeddings/test_whitening.py +252 -0
- arrowspace-0.24.5/tests/test_2_CVE_db.py +0 -1400
- {arrowspace-0.24.5 → arrowspace-0.24.8}/LICENSE +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/pyproject.toml +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/src/energyparams.rs +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/src/helpers.rs +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/src/sorted_index.rs +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/src/subgraphs.rs +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/src/tests.rs +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/src/tests_python.rs +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/__init__.py +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/eigen_maps_params_learning.py +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/embeddings_model.py +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/energy_maps_params_learning.py +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/requirements.txt +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_0_0.py +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_0_1.py +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_0_2_motives.py +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_0_3_subgraphs.py +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_11_CVE_eigenmaps_sorted_set.py +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_1_quora_questions.py +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_3_beir.py +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_4_msmarco_tau_sweep.py +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_5_msmarco_eps_sweep.py +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_6_beir_new_score.py +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_7_beir_new_score_sweep.py +0 -0
- {arrowspace-0.24.5 → arrowspace-0.24.8}/tests/test_8_CVE_db_sweep.py +0 -0
|
@@ -134,6 +134,12 @@ dependencies = [
|
|
|
134
134
|
"num-traits",
|
|
135
135
|
]
|
|
136
136
|
|
|
137
|
+
[[package]]
|
|
138
|
+
name = "arc-swap"
|
|
139
|
+
version = "1.7.1"
|
|
140
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
141
|
+
checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457"
|
|
142
|
+
|
|
137
143
|
[[package]]
|
|
138
144
|
name = "arrow"
|
|
139
145
|
version = "57.1.0"
|
|
@@ -351,9 +357,9 @@ dependencies = [
|
|
|
351
357
|
|
|
352
358
|
[[package]]
|
|
353
359
|
name = "arrowspace"
|
|
354
|
-
version = "0.24.
|
|
360
|
+
version = "0.24.6"
|
|
355
361
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
356
|
-
checksum = "
|
|
362
|
+
checksum = "c5a852f7bbaee37066b280df74b048e33935ef26f9161f63ef0281cda26cb2ab"
|
|
357
363
|
dependencies = [
|
|
358
364
|
"approx 0.5.1",
|
|
359
365
|
"arrow",
|
|
@@ -1181,11 +1187,12 @@ dependencies = [
|
|
|
1181
1187
|
|
|
1182
1188
|
[[package]]
|
|
1183
1189
|
name = "pyarrowspace"
|
|
1184
|
-
version = "0.24.5"
|
|
1190
|
+
version = "0.24.8"
|
|
1185
1191
|
dependencies = [
|
|
1186
1192
|
"arrowspace",
|
|
1187
1193
|
"numpy",
|
|
1188
1194
|
"pyo3",
|
|
1195
|
+
"pyo3-log",
|
|
1189
1196
|
"rayon",
|
|
1190
1197
|
"smartcore",
|
|
1191
1198
|
]
|
|
@@ -1226,6 +1233,17 @@ dependencies = [
|
|
|
1226
1233
|
"pyo3-build-config",
|
|
1227
1234
|
]
|
|
1228
1235
|
|
|
1236
|
+
[[package]]
|
|
1237
|
+
name = "pyo3-log"
|
|
1238
|
+
version = "0.13.2"
|
|
1239
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1240
|
+
checksum = "2f8bae9ad5ba08b0b0ed2bb9c2bdbaeccc69cafca96d78cf0fbcea0d45d122bb"
|
|
1241
|
+
dependencies = [
|
|
1242
|
+
"arc-swap",
|
|
1243
|
+
"log",
|
|
1244
|
+
"pyo3",
|
|
1245
|
+
]
|
|
1246
|
+
|
|
1229
1247
|
[[package]]
|
|
1230
1248
|
name = "pyo3-macros"
|
|
1231
1249
|
version = "0.27.1"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "pyarrowspace"
|
|
3
|
-
version = "0.24.5"
|
|
3
|
+
version = "0.24.8"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
description = "Spectral vector search with taumode (λτ) indexing"
|
|
6
6
|
authors = ["Lorenzo <tunedconsulting@gmail.com>"]
|
|
@@ -25,6 +25,7 @@ exclude = [
|
|
|
25
25
|
"tests/output",
|
|
26
26
|
"tests/small_datasets",
|
|
27
27
|
"visualization_output",
|
|
28
|
+
"storage/"
|
|
28
29
|
]
|
|
29
30
|
|
|
30
31
|
|
|
@@ -38,8 +39,9 @@ name = "arrowspace"
|
|
|
38
39
|
crate-type = ["cdylib"]
|
|
39
40
|
|
|
40
41
|
[dependencies]
|
|
41
|
-
arrowspace = { version = "0.24.
|
|
42
|
+
arrowspace = { version = "0.24.6", features = ["storage"] } #, path = "../arrowspace-rs"}
|
|
42
43
|
pyo3 = { version = "0.27.1", features = ["extension-module"] }
|
|
44
|
+
pyo3-log = "0.13"
|
|
43
45
|
numpy = "0.27.0"
|
|
44
46
|
rayon = "1.8"
|
|
45
|
-
smartcore = "0.4.
|
|
47
|
+
smartcore = "0.4.8"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: arrowspace
|
|
3
|
-
Version: 0.24.5
|
|
3
|
+
Version: 0.24.8
|
|
4
4
|
Classifier: Programming Language :: Rust
|
|
5
5
|
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
6
6
|
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
@@ -34,9 +34,9 @@ Project-URL: Repository, https://github.com/tuned-org-uk/pyarrowspace.git
|
|
|
34
34
|
|
|
35
35
|
# pyarrowspace
|
|
36
36
|
|
|
37
|
-
Python bindings for [`arrowspace-rs`](https://github.com/Mec-iS/arrowspace-rs).
|
|
37
|
+
Python bindings for [`arrowspace-rs`](https://github.com/Mec-iS/arrowspace-rs).
|
|
38
38
|
|
|
39
|
-
|
|
39
|
+
`arrowspace` is a database for vectors supported by a graph representation and a key-value store. The main use-cases targeted are: AI search capabilities as advanced vector similarity, graph characterisation analysis and search, indexing of high-dimensional vectors. Design principles described in [this article](https://www.tuned.org.uk/posts/010_game_changer_unifying_vectors_and_features_graphs).
|
|
40
40
|
|
|
41
41
|
For labs and tests please see [tests/](https://github.com/tuned-org-uk/pyarrowspace/tree/main/tests)
|
|
42
42
|
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# pyarrowspace
|
|
2
2
|
|
|
3
|
-
Python bindings for [`arrowspace-rs`](https://github.com/Mec-iS/arrowspace-rs).
|
|
3
|
+
Python bindings for [`arrowspace-rs`](https://github.com/Mec-iS/arrowspace-rs).
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
`arrowspace` is a database for vectors supported by a graph representation and a key-value store. The main use-cases targeted are: AI search capabilities as advanced vector similarity, graph characterisation analysis and search, indexing of high-dimensional vectors. Design principles described in [this article](https://www.tuned.org.uk/posts/010_game_changer_unifying_vectors_and_features_graphs).
|
|
6
6
|
|
|
7
7
|
For labs and tests please see [tests/](https://github.com/tuned-org-uk/pyarrowspace/tree/main/tests)
|
|
8
8
|
|
|
@@ -28,6 +28,16 @@ mod tests;
|
|
|
28
28
|
#[cfg(test)]
|
|
29
29
|
mod tests_python;
|
|
30
30
|
|
|
31
|
+
use std::sync::Once;
|
|
32
|
+
static INIT: Once = Once::new();
|
|
33
|
+
|
|
34
|
+
/// Initialize logging for tests
|
|
35
|
+
pub fn init() {
|
|
36
|
+
INIT.call_once(|| {
|
|
37
|
+
pyo3_log::init();
|
|
38
|
+
});
|
|
39
|
+
}
|
|
40
|
+
|
|
31
41
|
// ------------ Py wrappers ------------
|
|
32
42
|
#[pyclass(name = "GraphLaplacian")]
|
|
33
43
|
pub struct PyGraphLaplacian {
|
|
@@ -165,7 +175,7 @@ impl PyArrowSpace {
|
|
|
165
175
|
|
|
166
176
|
dbg_println(format!("search: qlen={}, lambda_q={:.6}", v.len(), lambda_q));
|
|
167
177
|
|
|
168
|
-
let query = ArrowItem::new(v
|
|
178
|
+
let query = ArrowItem::new(v, lambda_q);
|
|
169
179
|
let k = graph_laplacian.graph_params.topk;
|
|
170
180
|
|
|
171
181
|
Ok(self.inner.search_lambda_aware(&query, k, tau))
|
|
@@ -203,7 +213,7 @@ impl PyArrowSpace {
|
|
|
203
213
|
)));
|
|
204
214
|
}
|
|
205
215
|
|
|
206
|
-
let query = ArrowItem::new(v
|
|
216
|
+
let query = ArrowItem::new(v, lambda_q);
|
|
207
217
|
results.push(self.inner.search_lambda_aware(&query, k, tau));
|
|
208
218
|
}
|
|
209
219
|
|
|
@@ -232,7 +242,7 @@ impl PyArrowSpace {
|
|
|
232
242
|
|
|
233
243
|
dbg_println(format!("search_hybrid: qlen={}, lambda_q={:.6}", v.len(), lambda_q));
|
|
234
244
|
|
|
235
|
-
let query = ArrowItem::new(v
|
|
245
|
+
let query = ArrowItem::new(v, lambda_q);
|
|
236
246
|
let k = graph_laplacian.graph_params.topk;
|
|
237
247
|
|
|
238
248
|
Ok(self.inner.search_lambda_aware_hybrid(&query, k, tau))
|
|
@@ -303,8 +313,8 @@ impl PyArrowSpace {
|
|
|
303
313
|
Ok(motifs)
|
|
304
314
|
}
|
|
305
315
|
|
|
306
|
-
|
|
307
|
-
|
|
316
|
+
/// spot_subg_motives(gl: GraphLaplacian, cfg: dict) -> List[dict]
|
|
317
|
+
|
|
308
318
|
/// Runs energy-mode motif-based subgraph extraction and returns a list of
|
|
309
319
|
/// subgraph dictionaries with:
|
|
310
320
|
/// - "node_indices": List[int] centroid indices
|
|
@@ -439,16 +449,73 @@ impl PyArrowSpaceBuilder {
|
|
|
439
449
|
.with_lambda_graph(eps, k, topk, p, sigma)
|
|
440
450
|
.with_dims_reduction(true, Some(eps))
|
|
441
451
|
.with_seed(42)
|
|
442
|
-
.with_sparsity_check(false)
|
|
452
|
+
.with_sparsity_check(false)
|
|
443
453
|
}
|
|
444
454
|
|
|
445
455
|
dbg_println(format!("build: Processing {} rows × {} cols", nrows, ncols));
|
|
446
|
-
let (aspace, gl) =
|
|
456
|
+
let (aspace, gl) = py.detach(|| {
|
|
457
|
+
let (aspace, gl) = builder.build(rows);
|
|
458
|
+
|
|
459
|
+
dbg_println(format!(
|
|
460
|
+
"build complete: nitems={}, nfeatures={}, lambdas={}",
|
|
461
|
+
aspace.nitems, aspace.nfeatures, aspace.lambdas().len()
|
|
462
|
+
));
|
|
463
|
+
|
|
464
|
+
(aspace, gl)
|
|
465
|
+
});
|
|
466
|
+
|
|
467
|
+
Ok((
|
|
468
|
+
Py::new(py, PyArrowSpace { inner: aspace })?,
|
|
469
|
+
Py::new(py, PyGraphLaplacian { inner: gl })?,
|
|
470
|
+
))
|
|
471
|
+
}
|
|
472
|
+
|
|
473
|
+
/// Same as `build(...)` but save computations on parquet files
|
|
474
|
+
#[staticmethod]
|
|
475
|
+
pub fn build_and_store(
|
|
476
|
+
py: Python<'_>,
|
|
477
|
+
graph_params: Option<&Bound<'_, PyDict>>,
|
|
478
|
+
items: PyReadonlyArray2<f64>,
|
|
479
|
+
) -> PyResult<(Py<PyArrowSpace>, Py<PyGraphLaplacian>)> {
|
|
480
|
+
dbg_println("build: Converting numpy array to internal format");
|
|
447
481
|
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
482
|
+
let arr = items.as_array();
|
|
483
|
+
let (nrows, ncols) = (arr.shape()[0], arr.shape()[1]);
|
|
484
|
+
|
|
485
|
+
let rows: Vec<Vec<f64>> = if nrows > 1000 {
|
|
486
|
+
use rayon::prelude::*;
|
|
487
|
+
(0..nrows)
|
|
488
|
+
.into_par_iter()
|
|
489
|
+
.map(|i| arr.row(i).to_owned().to_vec())
|
|
490
|
+
.collect()
|
|
491
|
+
} else {
|
|
492
|
+
(0..nrows)
|
|
493
|
+
.map(|i| arr.row(i).to_owned().to_vec())
|
|
494
|
+
.collect()
|
|
495
|
+
};
|
|
496
|
+
|
|
497
|
+
let mut builder = RustBuilder::new();
|
|
498
|
+
|
|
499
|
+
if let Some((eps, k, topk, p, sigma)) = parse_graph_params(graph_params)? {
|
|
500
|
+
builder = builder
|
|
501
|
+
.with_lambda_graph(eps, k, topk, p, sigma)
|
|
502
|
+
.with_dims_reduction(true, Some(eps))
|
|
503
|
+
.with_seed(42)
|
|
504
|
+
.with_sparsity_check(false)
|
|
505
|
+
.with_persistence("./storage", "dataset".to_string());
|
|
506
|
+
}
|
|
507
|
+
|
|
508
|
+
dbg_println(format!("build: Processing {} rows × {} cols", nrows, ncols));
|
|
509
|
+
let (aspace, gl) = py.detach(|| {
|
|
510
|
+
let (aspace, gl) = builder.build(rows);
|
|
511
|
+
|
|
512
|
+
dbg_println(format!(
|
|
513
|
+
"build complete: nitems={}, nfeatures={}, lambdas={}",
|
|
514
|
+
aspace.nitems, aspace.nfeatures, aspace.lambdas().len()
|
|
515
|
+
));
|
|
516
|
+
|
|
517
|
+
(aspace, gl)
|
|
518
|
+
});
|
|
452
519
|
|
|
453
520
|
Ok((
|
|
454
521
|
Py::new(py, PyArrowSpace { inner: aspace })?,
|
|
@@ -494,20 +561,22 @@ impl PyArrowSpaceBuilder {
|
|
|
494
561
|
.with_dims_reduction(true, Some(eps))
|
|
495
562
|
.with_extra_dims_reduction(true)
|
|
496
563
|
.with_seed(999)
|
|
497
|
-
.with_inline_sampling(Some(SamplerType::Simple(0.
|
|
564
|
+
.with_inline_sampling(Some(SamplerType::Simple(0.99)))
|
|
498
565
|
.with_spectral(false)
|
|
499
566
|
.with_sparsity_check(false);
|
|
500
567
|
}
|
|
501
568
|
|
|
502
569
|
dbg_println(format!("build_energy: Processing {} rows × {} cols", nrows, ncols));
|
|
503
570
|
let (aspace, gl_energy) = py.detach(|| {
|
|
504
|
-
builder.build_energy(rows, e_params)
|
|
571
|
+
let (aspace, gl_energy) = builder.build_energy(rows, e_params);
|
|
572
|
+
|
|
573
|
+
dbg_println(format!(
|
|
574
|
+
"build_energy complete: nitems={}, nfeatures={}, graph_nodes={}, lambdas={}",
|
|
575
|
+
aspace.nitems, aspace.nfeatures, gl_energy.nnodes, aspace.lambdas().len()
|
|
576
|
+
));
|
|
577
|
+
|
|
578
|
+
(aspace, gl_energy)
|
|
505
579
|
});
|
|
506
|
-
|
|
507
|
-
dbg_println(format!(
|
|
508
|
-
"build_energy complete: nitems={}, nfeatures={}, graph_nodes={}, lambdas={}",
|
|
509
|
-
aspace.nitems, aspace.nfeatures, gl_energy.nnodes, aspace.lambdas().len()
|
|
510
|
-
));
|
|
511
580
|
|
|
512
581
|
Ok((
|
|
513
582
|
Py::new(py, PyArrowSpace { inner: aspace })?,
|