graphembed-rs 0.1.2__cp38-cp38-macosx_10_12_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
from importlib import import_module as _imp

# Re-export the public API of the compiled extension at package level.
# The extension is the submodule ``graphembed_rs.graphembed_rs``; an absolute
# ``from graphembed_rs import *`` written here would re-import this very
# (still half-initialised) package and therefore export nothing.
from .graphembed_rs import *

# add the helper into the same namespace
from .load_utils import *

# ``_imp`` is not used by this module; remove it so it does not linger in the
# package namespace.
del _imp
@@ -0,0 +1,71 @@
1
+ import numpy as np
2
+ from pathlib import Path
3
+ ### the `bson` module imported below is provided by pymongo: pip install pymongo
4
+ # -- pymongo is a BSON decoder, not a BSON encoder --
5
+ from bson import decode_file_iter
6
+
7
+
8
def _dtype_from_type_name(tname: str):
    """Translate a Rust type name from the BSON header into a NumPy dtype.

    ``"f32"`` and ``"f64"`` map to the matching NumPy float types and
    ``"usize"`` maps to ``np.int64``.  Any other value raises ``ValueError``.
    """
    known_types = (
        ("f32", np.float32),
        ("f64", np.float64),
        ("usize", np.int64),  # stored as i64 in BSON
    )
    for rust_name, numpy_type in known_types:
        if tname == rust_name:
            return numpy_type
    raise ValueError(f"Unknown type_name {tname!r} in BSON header")
17
+
18
+
19
def _next_doc(docs, what: str):
    """Return the next decoded BSON document or raise ValueError at end of file."""
    try:
        return next(docs)
    except StopIteration:
        # A bare StopIteration escaping from here would be confusing for the
        # caller; report the real problem instead.
        raise ValueError(f"truncated BSON dump: missing {what}") from None


def _read_embedding_part(docs, n: int, expected_tag: int, dest, dtype, label: str):
    """Consume ``n`` vector documents tagged ``expected_tag``; store them in ``dest`` if given."""
    for _ in range(n):
        doc = _next_doc(docs, f"{label} vector")
        key, vec = next(iter(doc.items()))  # only 1 (key,val)
        # The key has the form "<row index>,<tag>".
        idx, tag = map(int, key.split(","))
        if tag != expected_tag:
            # Explicit raise instead of ``assert`` so the check survives ``python -O``.
            raise ValueError(f"expected tag {expected_tag}, got {tag}")
        if dest is not None:
            dest[idx] = np.asarray(vec, dtype=dtype)
        # else: the vector is read and dropped


def load_embedding_bson(path: "str | Path", *, want_in: bool = False):
    """
    Load an embedding from a BSON dump into NumPy arrays.

    The first document must hold a ``"header"`` entry with the fields
    ``nbdata`` (number of vectors n), ``dimension`` (d), ``symetric`` and
    ``type_name``.  It is followed by n documents tagged 0 (OUT/source part)
    and, for an asymmetric dump, n documents tagged 1 (IN/target part).

    Parameters
    ----------
    path : str | pathlib.Path
        File written by graphembed::io::bson_dump(...)
    want_in : bool, default False
        • False … return only the OUT/source embedding (shape = (n, d))
        • True  … return the tuple (out_emb, in_emb); in_emb is the
                  IN/target embedding for an *asymmetric* dump and None
                  for a symmetric dump.

    Returns
    -------
    np.ndarray              (want_in=False)
    OR (out_emb, in_emb)    (want_in=True; in_emb is None for a symmetric dump)

    Raises
    ------
    ValueError
        If the header names an unknown ``type_name``, if a vector document
        carries an unexpected tag, or if the file ends before all announced
        documents were read.
    """
    # NOTE: the annotation on ``path`` is a string on purpose: an unquoted
    # ``str | Path`` is evaluated at definition time and fails on Python < 3.10.
    path = Path(path)
    with path.open("rb") as fh:
        docs = decode_file_iter(fh)

        # -- header ----------------------------------------------------
        header = _next_doc(docs, "header document")["header"]
        n = int(header["nbdata"])
        d = int(header["dimension"])
        sym = bool(header["symetric"])
        dtype = _dtype_from_type_name(header["type_name"])
        out_emb = np.empty((n, d), dtype=dtype)
        in_emb = None if sym or not want_in else np.empty((n, d), dtype=dtype)

        # -- OUT part --------------------------------------------------
        _read_embedding_part(docs, n, 0, out_emb, dtype, "OUT")

        # -- IN part (if any) -----------------------------------------
        if not sym:
            # Still read (and validate) when want_in is False; in_emb is then
            # None and the vectors are dropped.
            _read_embedding_part(docs, n, 1, in_emb, dtype, "IN")

        # -- optional indexation doc – skip for now --------------------
        # (decode_file_iter stops automatically at EOF)

    return (out_emb, in_emb) if want_in else out_emb
@@ -0,0 +1,245 @@
1
+ Metadata-Version: 2.4
2
+ Name: graphembed_rs
3
+ Version: 0.1.2
4
+ Requires-Dist: numpy>=2.2.5
5
+ Requires-Dist: pymongo>=4.12.1
6
+ License-File: LICENSE-MIT
7
+ Summary: Python bindings for the high‑performance Rust graph/network embedding library graphembed
8
+ Keywords: graph,embedding,hash
9
+ Author: Jianshu Zhao
10
+ Author-email: jeanpierre.both@gmail.com
11
+ License: MIT OR Apache-2.0
12
+ Requires-Python: >=3.8
13
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
14
+ Project-URL: Source Code, https://github.com/jean-pierreBoth/graphembed
15
+
16
+ [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/graphembed/README.html)
17
+ ![](https://anaconda.org/bioconda/graphembed/badges/license.svg)
18
+ ![](https://anaconda.org/bioconda/graphembed/badges/version.svg)
19
+ ![](https://anaconda.org/bioconda/graphembed/badges/latest_release_relative_date.svg)
20
+ ![](https://anaconda.org/bioconda/graphembed/badges/platforms.svg)
21
+ [![install with conda](https://anaconda.org/bioconda/graphembed/badges/downloads.svg)](https://anaconda.org/bioconda/graphembed)
22
+
23
+ <div align="center">
24
+ <img width="35%" src ="GraphEmbed_log.jpg">
25
+ </div>
26
+
27
+ # GraphEmbed: Efficient and Robust Network Embedding via High-Order Proximity Preservation or Recursive Sketching
28
+
29
+ This crate provides an executable and a library for embedding directed or undirected graphs with positively weighted edges. We engineered and optimized current network embedding algorithms for large-scale network embedding, especially for biological networks. This crate was developed by [Jianshu Zhao](https://gitlab.com/Jianshu_Zhao) and Jean-Pierre Both [jpboth](https://gitlab.com/jpboth). A copy is also available on [GitHub](https://github.com/jianshu93/graphembed).
30
+
31
+
32
+ - For simple graphs, without data attached to nodes, there are 2 modules **nodesketch** and **atp**. A simple executable with a validation option based on link prediction is also provided.
33
+
34
+ ## Quick Install
35
+
36
+ ### Pre-built binaries on Linux
37
+ ```bash
38
+ wget https://gitlab.com/-/project/64961144/uploads/9d7d0b038140cb67c584f01cd6dafac9/graphembed_Linux_x86-64_v0.1.6.zip
39
+ unzip graphembed_Linux_x86-64_v0.1.6.zip
40
+ chmod a+x ./graphembed
41
+ ./graphembed -h
42
+ ```
43
+
44
+ ### Bioconda on Linux/MacOS
45
+ ```bash
46
+ conda install -c conda-forge -c bioconda graphembed
47
+ ```
48
+
49
+ ### Homebrew on MacOS
50
+ ```bash
51
+ brew tap jianshu93/graphembed
52
+ brew update
53
+ brew install graphembed
54
+ ```
55
+
56
+
57
+ ### In Python (Please install python>=3.9 first)
58
+ ```bash
59
+ pip install graphembed_rs
60
+
61
+ ### or you can build from source (Linux) after installing maturin
62
+ git clone https://gitlab.com/Jianshu_Zhao/graphembed
63
+ cd graphembed
64
+ pip install maturin
65
+ ### note: for macOS, you need to change the line "features = ["pyo3/extension-module", "intel-mkl-static", "simdeez_f"]" in pyproject.toml to "features = ["pyo3/extension-module","openblas-system","stdsimd"]"; you also need to install OpenBLAS via Homebrew and add it to the system library path
66
+ maturin develop --release
67
+
68
+ #### Prepare some data
69
+ wget https://gitlab.com/-/project/64961144/uploads/4e341383d62d86d1dd66e668e91b2c07/BlogCatalog.txt
70
+ ```
71
+
72
+ ```python
73
+ import os
74
+ os.environ["RUST_LOG"] = "info"
75
+ import graphembed_rs.graphembed_rs as ge
76
+ import graphembed_rs.load_utils as ge_utils
77
+ help(ge)
78
+ help(ge_utils)
79
+ ### HOPE
80
+ ge.embed_hope_rank("BlogCatalog.txt", target_rank=128, nbiter=4,output="embedding_output")
81
+ out_vectors=ge_utils.load_embedding_bson("embedding_output.bson")
82
+ print("OUT embedding shape :", out_vectors.shape)
83
+ print("first OUT vector :", out_vectors[0])
84
+
85
+ ### Sketching
86
+ ### sketching only
87
+ ge.embed_sketching("BlogCatalog.txt", decay=0.3, dim=128, nbiter=5, symetric=True, output="embedding_output")
88
+ out_vectors=ge_utils.load_embedding_bson("embedding_output.bson")
89
+ print("OUT embedding shape :", out_vectors.shape)
90
+ print("first OUT vector :", out_vectors[0])
91
+
92
+
93
+ ### validate accuracy
94
+ auc_scores = ge.validate_sketching("BlogCatalog.txt",decay=0.3, dim=128, nbiter=3, nbpass=1, skip_frac=0.2,symetric=True, centric=True)
95
+ print("Standard AUC per pass:", auc_scores)
96
+ ```
97
+
98
+ ## Methods
99
+ ### The embedding algorithms used in this crate are based on the following papers
100
+
101
+ - **nodesketch**
102
+
103
+ *NodeSketch : Highly-Efficient Graph Embeddings via Recursive Sketching KDD 2019*. see [nodesketch](https://dl.acm.org/doi/10.1145/3292500.3330951)
104
+ D. Yang,P. Rosso,Bin-Li, P. Cudre-Mauroux.
105
+
106
+ It is based on multi hop neighbourhood identification via sensitive hashing based on the recent algorithm **probminhash**. See [arxiv](https://arxiv.org/abs/1911.00675) or [ieee-2022](https://ieeexplore.ieee.org/document/9185081).
107
+
108
+ The algorithm associates a probability distribution on neighbours of each point depending on edge weights and distance to the point.
109
+ Then this distribution is hashed to build a (discrete) embedding vector consisting of node identifiers.
110
+ The distance between embedded vectors is the Jaccard distance so we get
111
+ a real distance on the embedding space for the symmetric embedding.
112
+
113
+ An extension of the paper is also implemented to get an asymmetric embedding for directed graphs. The similarity is also based on the hash of sets (nodes going to or from) a given node, but then the dissimilarity is no longer a distance (no symmetry and some discrepancy with the triangle inequality).
114
+
115
+ **The orkut graph with 3 million nodes and 100 million edges is embedded in 5 minutes on a 24-core i9 laptop with this algorithm, giving an AUC of 0.95**.
116
+
117
+
118
+ - **atp**
119
+
120
+ *Asymmetric Transitivity Preserving Graph Embedding 2016*.
121
+ M. Ou, P Cui, J. Pei, Z. Zhang and W. Zhu. See [hope](https://dl.acm.org/doi/10.1145/2939672.2939751).
122
+
123
+ The objective is to provide an asymmetric graph embedding and to get an estimate of the precision of the embedding as a function of its dimension.
124
+
125
+ We use the Adamic-Adar matrix representation of the graph. (It must be noted that the weighting of a node by the number of couples joined by it is called Resource Allocation in the Graph Kernel literature).
126
+ The asymmetric embedding is obtained from the left and right singular vectors of the Adamic-Adar representation of the graph.
127
+ Source nodes are related to left singular vectors and target nodes to the right ones.
128
+ The similarity measure is the dot product, so it is not a norm.
129
+ The svd is approximated by randomization as described in Halko-Tropp 2011 as implemented in the [annembed crate](https://crates.io/crates/annembed).
130
+
131
+ ## Validation
132
+
133
+ Validation of embeddings is assessed via standard Auc with random deletion of edges. See documentation in the *link* module and *embed* binary.
134
+ We also give a variation based on centric quality assessment as explained at [cauc](http://github.com/jean-pierreBoth/linkauc)
135
+ ## Some data sets
136
+
137
+ Small datasets are given in the Data subdirectory (with 7z compression) to run tests.
138
+ Larger datasets can be downloaded from the SNAP data collections <https://snap.stanford.edu/data>
139
+
140
+ #### Some small test graphs are provided in a Data subdirectory
141
+
142
+ - Symmetric graphs
143
+
144
+ - Les miserables <http://konect.cc/networks/moreno_lesmis>.
145
+ This is the graph of co-occurrence of characters in Victor Hugo's novel 'Les Misérables'.
146
+
147
+ - Asymmetric graphs
148
+
149
+ - wiki-vote <https://snap.stanford.edu/data/wiki-Vote.html>
150
+ 7115 nodes 103689 edges
151
+
152
+ - Cora : <http://konect.cc/networks/subelj_cora>
153
+ citation network 23166 nodes 91500 edges
154
+
155
+ #### Some larger data tests for user to download
156
+
157
+ These graphs were used to produce the results given below.
158
+
159
+ Beware of the possible need to convert from Windows to Linux End Of Line, see the dos2unix utility.
160
+ Some data may need to be converted from TSV to CSV format before being read by the program.
161
+
162
+ - Symmetric
163
+
164
+ - Amazon. Nodes: 334 863 Edges: 925 872 <https://snap.stanford.edu/data/amazon0601.html>
165
+ - youtube. Nodes: 1 134 890 Edges: 2 987 624 <https://snap.stanford.edu/data/com-Youtube.html>
166
+ - orkut. Nodes: 3 072 441 Edges: 117 185 083 <https://snap.stanford.edu/data/com-Orkut.html>
167
+
168
+ - Asymmetric
169
+
170
+ - twitter as tested in Hope <http://konect.cc/networks/munmun_twitter_social>
171
+ 465017 nodes 834797 edges
172
+
173
+ ## Some results
174
+
175
+ ### results for the *atp* and *nodesketch* modules
176
+
177
+ Embedding and link prediction evaluation for the above data sets are given in file [resultats.md](./resultats.md)
178
+ A more global analysis of the embedding with the nodesketch module is done for the orkut graph in file [orkut.md](./orkut.md)
179
+
180
+ A preliminary node-centric quality estimation is provided in the validation module (see documentation in validation::link).
181
+
182
+ ### Some qualitative comments
183
+
184
+ - For the embedding using the randomized svd, increasing the embedding dimension is interesting as far as the corresponding eigenvalues continue to decrease significantly.
185
+
186
+ - The munmun_twitter_social graph shows that treating a directed graph as an undirected graph gives significantly different results in terms of link prediction AUC.
187
+
188
+
189
+ ## Generalized Svd
190
+
191
+ An implementation of Generalized Svd comes as a by-product in module [gsvd](./src/atp/gsvd.rs).
192
+
193
+ ## Detailed Installation and Usage
194
+
195
+ ### Installation
196
+
197
+ The crate provides features (with a default configuration), required by the *annembed* dependency, to specify which version of lapack you want to use or the choice of simd implementation.
198
+ - For example compilation is done by :
199
+ *cargo build --release --features="openblas-system"* to use a dynamic link with openblas.
200
+ Choosing one feature is mandatory in order to provide the required linear algebra library.
201
+ - On Intel the simdeez_f feature can be used. On other cpus the stdsimd feature can be chosen but it requires compiler >= 1.79
202
+
203
+ ### Usage
204
+
205
+ The documentation of the embed module can be generated with the standard command: cargo doc --no-deps --bin embed.
206
+
207
+ - The Hope embedding, relying on matrix computations, limits the size of the graph to a few hundred thousand nodes.
208
+ It is intrinsically asymmetric in nature. It nevertheless gives access to the spectrum of the Adamic-Adar matrix representing the graph and
209
+ so to the required dimension to get a valid embedding in $R^{n}$.
210
+
211
+ - The Sketching embedding is much faster for large graphs but embeds in a space consisting of sequences of node ids equipped with the Jaccard distance. It is particularly efficient for low-degree graphs.
212
+
213
+ - The *embed* module takes embedding and possibly validation commands (link prediction task) in one directive.
214
+ The general syntax is :
215
+
216
+ graphembed file_description [validation_command --validation_arguments] sketching mode --embedding_arguments
217
+ for example:
218
+
219
+ For a symmetric graph we get:
220
+
221
+ - just embedding:
222
+ graphembed --csv ./Data/Graphs/Orkut/com-orkut.ungraph.txt --symetric sketching --decay 0.2 --dim 200 --nbiter
223
+
224
+ - embedding and validation:
225
+
226
+ graphembed --csv ./Data/Graphs/Orkut/com-orkut.ungraph.txt --symetric validation --nbpass 5 --skip 0.15 sketching --decay 0.2 --dim 200 --nbiter 5
227
+
228
+ For an asymmetric graph we get
229
+
230
+ graphembed --csv ./Data/Graphs/asymetric.csv validation --nbpass 5 --skip 0.15 sketching --decay 0.2 --dim 200 --nbiter 5
231
+
232
+
233
+ More details can be found in the docs of the embed module. Use cargo doc --no-deps --bin embed (and cargo doc --no-deps) as usual.
234
+
235
+ - Setting the environment variable RUST_LOG gives access to information at various levels (debug, info, error) via the **log** and **env_logger** crates.
236
+
237
+ ## License
238
+
239
+ Licensed under either of
240
+
241
+ * Apache License, Version 2.0, [LICENSE-APACHE](LICENSE-APACHE) or <http://www.apache.org/licenses/LICENSE-2.0>
242
+ * MIT license [LICENSE-MIT](LICENSE-MIT) or <http://opensource.org/licenses/MIT>
243
+
244
+ at your option.
245
+
@@ -0,0 +1,7 @@
1
+ graphembed_rs-0.1.2.dist-info/METADATA,sha256=WEvYJP6qVNPFlFxYzzq-n3XX6L__a2ukxgYkVt_WnYo,11532
2
+ graphembed_rs-0.1.2.dist-info/WHEEL,sha256=S2d1ZXRHI-kQ6Otx65tlQQmTEYlQToPfICysVxMKI7Y,104
3
+ graphembed_rs-0.1.2.dist-info/licenses/LICENSE-MIT,sha256=ndZ12D28O4UkfOeoa6HP9E7IKyYG4iH79iQ6WiLs9bc,1077
4
+ graphembed_rs/load_utils.py,sha256=fXd4-5OxdymSkb4kx39XeW_P3nEDdAuQ7Oe1D6PKDWE,2719
5
+ graphembed_rs/__init__.py,sha256=s8WorNcMnn15CfjctNaLnXGnwQ9RlPyCR0WJQxepTTc,179
6
+ graphembed_rs/graphembed_rs.cpython-38-darwin.so,sha256=eQyD0tM6mq0NEb0LfKtkKaKjlOi1V3cqtLzK73tt-rY,5568544
7
+ graphembed_rs-0.1.2.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: maturin (1.8.3)
3
+ Root-Is-Purelib: false
4
+ Tag: cp38-cp38-macosx_10_12_x86_64
@@ -0,0 +1,25 @@
1
+ Copyright (c) 2022 jean-pierre.both and Jianshu Zhao
2
+
3
+ Permission is hereby granted, free of charge, to any
4
+ person obtaining a copy of this software and associated
5
+ documentation files (the "Software"), to deal in the
6
+ Software without restriction, including without
7
+ limitation the rights to use, copy, modify, merge,
8
+ publish, distribute, sublicense, and/or sell copies of
9
+ the Software, and to permit persons to whom the Software
10
+ is furnished to do so, subject to the following
11
+ conditions:
12
+
13
+ The above copyright notice and this permission notice
14
+ shall be included in all copies or substantial portions
15
+ of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
18
+ ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
19
+ TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
20
+ PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
21
+ SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
22
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
24
+ IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25
+ DEALINGS IN THE SOFTWARE.