graphembed-rs 0.1.0__cp310-cp310-macosx_11_0_arm64.whl → 0.1.2__cp310-cp310-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graphembed_rs/__init__.py +7 -0
- graphembed_rs/graphembed_rs.cpython-310-darwin.so +0 -0
- graphembed_rs/load_utils.py +71 -0
- {graphembed_rs-0.1.0.dist-info → graphembed_rs-0.1.2.dist-info}/METADATA +91 -39
- graphembed_rs-0.1.2.dist-info/RECORD +7 -0
- graphembed_rs-0.1.2.dist-info/licenses/LICENSE-MIT +25 -0
- graphembed/__init__.py +0 -5
- graphembed/__init__.pyi +0 -79
- graphembed/graphembed.cpython-310-darwin.so +0 -0
- graphembed/py.typed +0 -0
- graphembed_rs-0.1.0.dist-info/RECORD +0 -7
- {graphembed_rs-0.1.0.dist-info → graphembed_rs-0.1.2.dist-info}/WHEEL +0 -0
Binary file
|
@@ -0,0 +1,71 @@
|
|
1
|
+
import numpy as np
|
2
|
+
from pathlib import Path
|
3
|
+
### install pymongo with pip install pymongo
|
4
|
+
# -- only the BSON decoder from the `bson` package (shipped with pymongo) is needed here --
|
5
|
+
from bson import decode_file_iter
|
6
|
+
|
7
|
+
|
8
|
+
# Rust scalar type names (as written in the BSON header) -> NumPy dtypes.
_RUST_TO_NUMPY_DTYPE = {
    "f32": np.float32,
    "f64": np.float64,
    "usize": np.int64,  # stored as i64 in BSON
}


def _dtype_from_type_name(tname: str):
    """Map the Rust type-name string to a NumPy dtype.

    Parameters
    ----------
    tname : str
        Value of the ``type_name`` field of the BSON header. The supported
        names are ``"f32"``, ``"f64"`` and ``"usize"``.

    Returns
    -------
    type
        ``np.float32``, ``np.float64`` or ``np.int64`` respectively.

    Raises
    ------
    ValueError
        If *tname* is not one of the supported names.
    """
    try:
        return _RUST_TO_NUMPY_DTYPE[tname]
    except (KeyError, TypeError):
        # TypeError: an unhashable header value (e.g. a list) is just as
        # "unknown" as an unrecognised string, so report it the same way.
        raise ValueError(f"Unknown type_name {tname!r} in BSON header") from None
|
17
|
+
|
18
|
+
|
19
|
+
def _store_rows(docs, count, expected_tag, target):
    """Consume *count* row documents from *docs* and copy them into *target*.

    Each row document is expected to hold a single ``"idx,tag"`` key whose
    value is the embedding vector of node ``idx``. When *target* is ``None``
    the rows are still consumed and their tag checked, but the vectors are
    dropped.

    Raises
    ------
    ValueError
        If the stream ends early, a row carries the wrong tag, a row index
        falls outside ``[0, count)`` or a vector has the wrong length.
    """
    for _ in range(count):
        try:
            doc = next(docs)
        except StopIteration:
            raise ValueError(
                f"BSON dump ended early: expected {count} rows with tag {expected_tag}"
            ) from None
        key, vec = next(iter(doc.items()))  # only 1 (key, val) per row document
        idx, tag = map(int, key.split(","))
        if tag != expected_tag:
            raise ValueError(f"expected tag {expected_tag}, got {tag}")
        if target is None:
            continue  # caller does not want this part of the dump
        # A negative index would silently wrap around in NumPy and overwrite
        # another node's row, so reject anything outside the declared range.
        if not 0 <= idx < count:
            raise ValueError(f"row index {idx} out of range for {count} rows")
        row = np.asarray(vec, dtype=target.dtype)
        # Guard against silent broadcasting of a too-short vector over the row.
        if row.shape != target.shape[1:]:
            raise ValueError(
                f"row {idx} has shape {row.shape}, expected {target.shape[1:]}"
            )
        target[idx] = row


def load_embedding_bson(
    path: "str | Path", *, want_in: bool = False
) -> "np.ndarray | tuple[np.ndarray, np.ndarray | None]":
    """
    Load an embedding from a BSON dump into NumPy arrays.

    The file starts with a header document giving the number of rows
    (``nbdata``), the embedding dimension, the symmetry flag (``symetric``)
    and the scalar ``type_name``. It is followed by ``nbdata`` OUT rows
    (tag 0) and, for asymmetric dumps, ``nbdata`` IN rows (tag 1). Any
    documents after those rows are ignored.

    Parameters
    ----------
    path : str | pathlib.Path
        File written by graphembed::io::bson_dump(...)
    want_in : bool, default False
        • False … return only the OUT/source embedding (shape = (n, d))
        • True  … return the tuple (out, in_). ``in_`` is the IN/target
                  embedding for an *asymmetric* dump and None for a
                  symmetric one.

    Returns
    -------
    np.ndarray                       (want_in=False)
    OR  (out_emb, in_emb or None)    (want_in=True)

    Raises
    ------
    ValueError
        If the file has no header document, the header names an unknown
        scalar type, the file ends before all announced rows were read, or
        a row is malformed (wrong tag, index out of range, wrong length).
    """
    path = Path(path)
    with path.open("rb") as fh:
        docs = decode_file_iter(fh)

        # -- header ----------------------------------------------------
        try:
            first = next(docs)
        except StopIteration:
            raise ValueError(f"{path} contains no BSON header document") from None
        header = first["header"]
        n = int(header["nbdata"])
        d = int(header["dimension"])
        sym = bool(header["symetric"])
        dtype = _dtype_from_type_name(header["type_name"])

        out_emb = np.empty((n, d), dtype=dtype)
        # The IN array is only allocated when it exists and was asked for.
        in_emb = np.empty((n, d), dtype=dtype) if want_in and not sym else None

        # -- OUT part --------------------------------------------------
        _store_rows(docs, n, 0, out_emb)

        # -- IN part (if any) ------------------------------------------
        if not sym:
            # With want_in=False, in_emb is None and the rows are only skipped.
            _store_rows(docs, n, 1, in_emb)

        # -- optional indexation doc – skipped for now -----------------

    return (out_emb, in_emb) if want_in else out_emb
|
@@ -1,30 +1,101 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: graphembed_rs
|
3
|
-
Version: 0.1.0
|
3
|
+
Version: 0.1.2
|
4
|
+
Requires-Dist: numpy>=2.2.5
|
5
|
+
Requires-Dist: pymongo>=4.12.1
|
6
|
+
License-File: LICENSE-MIT
|
4
7
|
Summary: Python bindings for the high‑performance Rust graph/network embedding library graphembed
|
5
8
|
Keywords: graph,embedding,hash
|
6
9
|
Author: Jianshu Zhao
|
7
|
-
Author-email: jeanpierre.both@gmail.com
|
10
|
+
Author-email: jeanpierre.both@gmail.com
|
8
11
|
License: MIT OR Apache-2.0
|
9
12
|
Requires-Python: >=3.8
|
10
13
|
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
11
14
|
Project-URL: Source Code, https://github.com/jean-pierreBoth/graphembed
|
12
15
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
16
|
+
[](http://bioconda.github.io/recipes/graphembed/README.html)
|
17
|
+

|
18
|
+

|
19
|
+

|
20
|
+

|
21
|
+
[](https://anaconda.org/bioconda/graphembed)
|
22
|
+
|
23
|
+
<div align="center">
|
24
|
+
<img width="35%" src ="GraphEmbed_log.jpg">
|
25
|
+
</div>
|
26
|
+
|
27
|
+
# GraphEmbed: Efficient and Robust Network Embedding via High-Order Proximity Preservation or Recursive Sketching
|
28
|
+
|
29
|
+
This crate provides an executable and a library for embedding directed or undirected graphs with positively weighted edges. We engineered and optimized existing network embedding algorithms for large-scale networks, especially biological networks. This crate was developed by [Jianshu Zhao](https://gitlab.com/Jianshu_Zhao) and Jean-Pierre Both ([jpboth](https://gitlab.com/jpboth)). A copy is also hosted on [GitHub](https://github.com/jianshu93/graphembed).
|
30
|
+
|
31
|
+
|
32
|
+
- For simple graphs, without data attached to nodes, there are 2 modules **nodesketch** and **atp**. A simple executable with a validation option based on link prediction is also provided.
|
33
|
+
|
34
|
+
## Quick Install
|
35
|
+
|
36
|
+
### Pre-built binaries on Linux
|
37
|
+
```bash
|
38
|
+
wget https://gitlab.com/-/project/64961144/uploads/9d7d0b038140cb67c584f01cd6dafac9/graphembed_Linux_x86-64_v0.1.6.zip
|
39
|
+
unzip graphembed_Linux_x86-64_v0.1.6.zip
|
40
|
+
chmod a+x ./graphembed
|
41
|
+
./graphembed -h
|
42
|
+
```
|
43
|
+
|
44
|
+
### Bioconda on Linux/MacOS
|
45
|
+
```bash
|
46
|
+
conda install -c conda-forge -c bioconda graphembed
|
47
|
+
```
|
48
|
+
|
49
|
+
### Homebrew on MacOS
|
50
|
+
```bash
|
51
|
+
brew tap jianshu93/graphembed
|
52
|
+
brew update
|
53
|
+
brew install graphembed
|
54
|
+
```
|
55
|
+
|
56
|
+
|
57
|
+
### In Python (Please install python>=3.9 first)
|
58
|
+
```bash
|
59
|
+
pip install graphembed_rs
|
60
|
+
|
61
|
+
### or you can build from source (Linux) after installing maturin
|
62
|
+
git clone https://gitlab.com/Jianshu_Zhao/graphembed
|
63
|
+
cd graphembed
|
64
|
+
pip install maturin
|
65
|
+
### note: for macOS, you need to change the line "features = ["pyo3/extension-module", "intel-mkl-static", "simdeez_f"]" in pyproject.toml to "features = ["pyo3/extension-module","openblas-system","stdsimd"]"; you also need to install OpenBLAS via Homebrew and add it to the system library path
|
66
|
+
maturin develop --release
|
67
|
+
|
68
|
+
#### Prepare some data
|
69
|
+
wget https://gitlab.com/-/project/64961144/uploads/4e341383d62d86d1dd66e668e91b2c07/BlogCatalog.txt
|
70
|
+
```
|
71
|
+
|
72
|
+
```python
|
73
|
+
import os
|
74
|
+
os.environ["RUST_LOG"] = "info"
|
75
|
+
import graphembed_rs.graphembed_rs as ge
|
76
|
+
import graphembed_rs.load_utils as ge_utils
|
77
|
+
help(ge)
|
78
|
+
help(ge_utils)
|
79
|
+
### HOPE
|
80
|
+
ge.embed_hope_rank("BlogCatalog.txt", target_rank=128, nbiter=4,output="embedding_output")
|
81
|
+
out_vectors=ge_utils.load_embedding_bson("embedding_output.bson")
|
82
|
+
print("OUT embedding shape :", out_vectors.shape)
|
83
|
+
print("first OUT vector :", out_vectors[0])
|
84
|
+
|
85
|
+
### Sketching
|
86
|
+
### sketching only
|
87
|
+
ge.embed_sketching("BlogCatalog.txt", decay=0.3, dim=128, nbiter=5, symetric=True, output="embedding_output")
|
88
|
+
out_vectors=ge_utils.load_embedding_bson("embedding_output.bson")
|
89
|
+
print("OUT embedding shape :", out_vectors.shape)
|
90
|
+
print("first OUT vector :", out_vectors[0])
|
91
|
+
|
92
|
+
|
93
|
+
### validate accuracy
|
94
|
+
auc_scores = ge.validate_sketching("BlogCatalog.txt",decay=0.3, dim=128, nbiter=3, nbpass=1, skip_frac=0.2,symetric=True, centric=True)
|
95
|
+
print("Standard AUC per pass:", auc_scores)
|
96
|
+
```
|
25
97
|
|
26
98
|
## Methods
|
27
|
-
|
28
99
|
### The embedding algorithms used in this crate are based on the following papers
|
29
100
|
|
30
101
|
- **nodesketch**
|
@@ -57,29 +128,12 @@ Source node are related to left singular vectors and target nodes to the right o
|
|
57
128
|
The similarity measure is the dot product, so it is not a norm.
|
58
129
|
The svd is approximated by randomization as described in Halko-Tropp 2011 as implemented in the [annembed crate](https://crates.io/crates/annembed).
|
59
130
|
|
60
|
-
### The core decomposition algorithms
|
61
|
-
|
62
|
-
- **Density-friendly decomposition**
|
63
|
-
|
64
|
-
*Large Scale decomposition via convex programming 2017*
|
65
|
-
M.Danisch T.H Hubert Chan and M.Sozio
|
66
|
-
|
67
|
-
The decomposition of the graph in maximally dense groups of nodes is implemented and used to assess the quality of the embeddings in a structural way. See module *validation* and the comments on the embedding of the *Orkut* graph where we can use the community data provided with the graph to analyze the behaviour of embedded edge lengths.
|
68
|
-
|
69
|
-
In particular it is shown that :
|
70
|
-
- embedding of edges internal to a community are consistently smaller than embedded edges crossing a block frontier.
|
71
|
-
- The transition probabilities of edge from one block to another are similar (low kullback divergence) in the original graph and in the embedded graph.
|
72
|
-
|
73
|
-
See results in [orkut.md](./orkut.md) and examples directory together with a small Rust notebook in directory [Notebooks](./Notebooks/orkutrs.ipynb)
|
74
|
-
|
75
131
|
## Validation
|
76
132
|
|
77
133
|
Validation of embeddings is assessed via standard AUC with random deletion of edges. See documentation in the *link* module and *embed* binary.
|
78
134
|
We also give a variation based on centric quality assessment, as explained at [cauc](http://github.com/jean-pierreBoth/linkauc)
|
79
135
|
## Some data sets
|
80
136
|
|
81
|
-
### Without labels
|
82
|
-
|
83
137
|
Small datasets are given in the Data subdirectory (with 7z compression) to run tests.
|
84
138
|
Larger datasets can be downloaded from the SNAP data collections <https://snap.stanford.edu/data>
|
85
139
|
|
@@ -132,13 +186,11 @@ A preliminary of node centric quality estimation is provided in the validation m
|
|
132
186
|
- The munmun_twitter_social graph shows that treating a directed graph as an undirected graph gives significantly different results in terms of link prediction AUC.
|
133
187
|
|
134
188
|
|
135
|
-
|
136
|
-
|
137
189
|
## Generalized Svd
|
138
190
|
|
139
191
|
An implementation of Generalized Svd comes as a by-product in module [gsvd](./src/atp/gsvd.rs).
|
140
192
|
|
141
|
-
## Installation and Usage
|
193
|
+
## Detailed Installation and Usage
|
142
194
|
|
143
195
|
### Installation
|
144
196
|
|
@@ -161,21 +213,21 @@ so to the required dimension to get a valid embedding in $R^{n}$.
|
|
161
213
|
- The *embed* module takes embedding and possibly validation commands (link prediction task) in one directive.
|
162
214
|
The general syntax is :
|
163
215
|
|
164
|
-
|
216
|
+
graphembed file_description [validation_command --validation_arguments] sketching mode --embedding_arguments
|
165
217
|
for example:
|
166
218
|
|
167
219
|
For a symmetric graph we get:
|
168
220
|
|
169
221
|
- just embedding:
|
170
|
-
|
222
|
+
graphembed --csv ./Data/Graphs/Orkut/com-orkut.ungraph.txt --symetric sketching --decay 0.2 --dim 200 --nbiter 5
|
171
223
|
|
172
224
|
- embedding and validation:
|
173
225
|
|
174
|
-
|
226
|
+
graphembed --csv ./Data/Graphs/Orkut/com-orkut.ungraph.txt --symetric validation --nbpass 5 --skip 0.15 sketching --decay 0.2 --dim 200 --nbiter 5
|
175
227
|
|
176
228
|
For an asymmetric graph we get:
|
177
229
|
|
178
|
-
|
230
|
+
graphembed --csv ./Data/Graphs/asymetric.csv validation --nbpass 5 --skip 0.15 sketching --decay 0.2 --dim 200 --nbiter 5
|
179
231
|
|
180
232
|
|
181
233
|
More details can be found in the docs of the embed module. Use `cargo doc --no-deps --bin embed` (and `cargo doc --no-deps`) as usual.
|
@@ -0,0 +1,7 @@
|
|
1
|
+
graphembed_rs-0.1.2.dist-info/METADATA,sha256=WEvYJP6qVNPFlFxYzzq-n3XX6L__a2ukxgYkVt_WnYo,11532
|
2
|
+
graphembed_rs-0.1.2.dist-info/WHEEL,sha256=pKyTkFbTEakJa6xy_GKYIZO6TnrTnUxcBEc6JAhr_7o,104
|
3
|
+
graphembed_rs-0.1.2.dist-info/licenses/LICENSE-MIT,sha256=ndZ12D28O4UkfOeoa6HP9E7IKyYG4iH79iQ6WiLs9bc,1077
|
4
|
+
graphembed_rs/load_utils.py,sha256=fXd4-5OxdymSkb4kx39XeW_P3nEDdAuQ7Oe1D6PKDWE,2719
|
5
|
+
graphembed_rs/__init__.py,sha256=s8WorNcMnn15CfjctNaLnXGnwQ9RlPyCR0WJQxepTTc,179
|
6
|
+
graphembed_rs/graphembed_rs.cpython-310-darwin.so,sha256=tsLRSTexzC6Y6ZXPPOr640KrEbGEAt3O_2nznzQ1R-k,5266032
|
7
|
+
graphembed_rs-0.1.2.dist-info/RECORD,,
|
@@ -0,0 +1,25 @@
|
|
1
|
+
Copyright (c) 2022 jean-pierre.both and Jianshu Zhao
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any
|
4
|
+
person obtaining a copy of this software and associated
|
5
|
+
documentation files (the "Software"), to deal in the
|
6
|
+
Software without restriction, including without
|
7
|
+
limitation the rights to use, copy, modify, merge,
|
8
|
+
publish, distribute, sublicense, and/or sell copies of
|
9
|
+
the Software, and to permit persons to whom the Software
|
10
|
+
is furnished to do so, subject to the following
|
11
|
+
conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice
|
14
|
+
shall be included in all copies or substantial portions
|
15
|
+
of the Software.
|
16
|
+
|
17
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
|
18
|
+
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
19
|
+
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
20
|
+
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
21
|
+
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
22
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
23
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
24
|
+
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
25
|
+
DEALINGS IN THE SOFTWARE.
|
graphembed/__init__.py
DELETED
graphembed/__init__.pyi
DELETED
@@ -1,79 +0,0 @@
|
|
1
|
-
from typing import Optional
|
2
|
-
|
3
|
-
# ---------- Embedding ----------
|
4
|
-
def embed_hope_rank(
|
5
|
-
csv: str,
|
6
|
-
symetric: bool,
|
7
|
-
target_rank: int,
|
8
|
-
nbiter: int,
|
9
|
-
output: Optional[str] = None,
|
10
|
-
) -> None: ...
|
11
|
-
def embed_hope_precision(
|
12
|
-
csv: str,
|
13
|
-
symetric: bool,
|
14
|
-
epsil: float,
|
15
|
-
maxrank: int,
|
16
|
-
blockiter: int,
|
17
|
-
output: Optional[str] = None,
|
18
|
-
) -> None: ...
|
19
|
-
def embed_sketching(
|
20
|
-
csv: str,
|
21
|
-
symetric: bool,
|
22
|
-
decay: float,
|
23
|
-
dim: int,
|
24
|
-
nbiter: int,
|
25
|
-
output: Optional[str] = None,
|
26
|
-
) -> None: ...
|
27
|
-
|
28
|
-
# ---------- Validation (returns mean AUC) ----------
|
29
|
-
def validate_hope_rank(
|
30
|
-
csv: str,
|
31
|
-
symetric: bool,
|
32
|
-
target_rank: int,
|
33
|
-
nbiter: int,
|
34
|
-
nbpass: int = 10,
|
35
|
-
skip_frac: float = 0.1,
|
36
|
-
centric: bool = False,
|
37
|
-
) -> float: ...
|
38
|
-
def validate_hope_precision(
|
39
|
-
csv: str,
|
40
|
-
symetric: bool,
|
41
|
-
epsil: float,
|
42
|
-
maxrank: int,
|
43
|
-
blockiter: int,
|
44
|
-
nbpass: int = 10,
|
45
|
-
skip_frac: float = 0.1,
|
46
|
-
centric: bool = False,
|
47
|
-
) -> float: ...
|
48
|
-
def validate_sketching(
|
49
|
-
csv: str,
|
50
|
-
symetric: bool,
|
51
|
-
decay: float,
|
52
|
-
dim: int,
|
53
|
-
nbiter: int,
|
54
|
-
nbpass: int = 10,
|
55
|
-
skip_frac: float = 0.1,
|
56
|
-
centric: bool = False,
|
57
|
-
) -> float: ...
|
58
|
-
|
59
|
-
# ---------- VCMPR (precision/recall curves) ----------
|
60
|
-
def estimate_vcmpr_hope_rank(
|
61
|
-
csv: str,
|
62
|
-
symetric: bool,
|
63
|
-
target_rank: int,
|
64
|
-
nbiter: int,
|
65
|
-
nbpass: int = 2,
|
66
|
-
topk: int = 10,
|
67
|
-
skip_frac: float = 0.1,
|
68
|
-
) -> None: ...
|
69
|
-
def estimate_vcmpr_sketching(
|
70
|
-
csv: str,
|
71
|
-
symetric: bool,
|
72
|
-
decay: float,
|
73
|
-
dim: int,
|
74
|
-
nbiter: int,
|
75
|
-
nbpass: int = 2,
|
76
|
-
topk: int = 10,
|
77
|
-
skip_frac: float = 0.1,
|
78
|
-
) -> None: ...
|
79
|
-
|
Binary file
|
graphembed/py.typed
DELETED
File without changes
|
@@ -1,7 +0,0 @@
|
|
1
|
-
graphembed_rs-0.1.0.dist-info/METADATA,sha256=1bBA8fy75z8I6YGAsPAMoB-67zpAHW66cHywSb-hPj8,9901
|
2
|
-
graphembed_rs-0.1.0.dist-info/WHEEL,sha256=pKyTkFbTEakJa6xy_GKYIZO6TnrTnUxcBEc6JAhr_7o,104
|
3
|
-
graphembed/__init__.py,sha256=RCcLraveWf-myTsDQGePMYq-scNNfz-3Mv1baSbgAmM,123
|
4
|
-
graphembed/__init__.pyi,sha256=3_KBFG4g9akylo32CHlm9bZStcLwxIY2X4si21ilD3w,1626
|
5
|
-
graphembed/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
graphembed/graphembed.cpython-310-darwin.so,sha256=rpwaeaP3bH7t93yRz2edb4HD8CDEgu7bkEMFO3waTGU,5158592
|
7
|
-
graphembed_rs-0.1.0.dist-info/RECORD,,
|
File without changes
|