graphembed-rs 0.1.0__cp310-cp310-macosx_11_0_arm64.whl → 0.1.2__cp310-cp310-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ from importlib import import_module as _imp
2
+ from graphembed_rs import *
3
+
4
+ # add the helper into the same namespace
5
+ from .load_utils import *
6
+
7
+ del _imp
@@ -0,0 +1,71 @@
1
+ import numpy as np
2
+ from pathlib import Path
3
+ ### install pymongo with pip install pymongo
4
+ # -- pymongo is a BSON decoder, not a BSON encoder --
5
+ from bson import decode_file_iter
6
+
7
+
8
+ def _dtype_from_type_name(tname: str):
9
+ """Map the Rust type-name string to a NumPy dtype."""
10
+ if tname == "f32":
11
+ return np.float32
12
+ if tname == "f64":
13
+ return np.float64
14
+ if tname == "usize": # stored as i64 in BSON
15
+ return np.int64
16
+ raise ValueError(f"Unknown type_name {tname!r} in BSON header")
17
+
18
+
19
+ def load_embedding_bson(path: str | Path, *, want_in: bool = False):
20
+ """
21
+ Parameters
22
+ ----------
23
+ path : str | pathlib.Path
24
+ File written by graphembed::io::bson_dump(...)
25
+ want_in : bool, default False
26
+ • False … always return the OUT/source embedding (shape = (n, d))
27
+ • True … additionally return the IN/target embedding when
28
+ the dump is *asymmetric* (tuple(out, in_)). For symmetric
29
+ dumps the second item is None.
30
+
31
+ Returns
32
+ -------
33
+ np.ndarray (sym. dump or want_in=False)
34
+ OR (out_emb, in_emb) (want_in=True)
35
+ """
36
+ path = Path(path)
37
+ with path.open("rb") as fh:
38
+ docs = decode_file_iter(fh)
39
+
40
+ # -- header ----------------------------------------------------
41
+ header = next(docs)["header"]
42
+ n = int(header["nbdata"])
43
+ d = int(header["dimension"])
44
+ sym = bool(header["symetric"])
45
+ dtype = _dtype_from_type_name(header["type_name"])
46
+ out_emb = np.empty((n, d), dtype=dtype)
47
+ in_emb = None if sym or not want_in else np.empty((n, d), dtype=dtype)
48
+
49
+ # --OUT part --------------------------------------------------
50
+ for _ in range(n):
51
+ doc = next(docs)
52
+ key, vec = next(iter(doc.items())) # only 1 (key,val)
53
+ idx, tag = map(int, key.split(","))
54
+ assert tag == 0, f"expected tag 0, got {tag}"
55
+ out_emb[idx] = np.asarray(vec, dtype=dtype)
56
+
57
+ # -- IN part (if any) -----------------------------------------
58
+ if not sym:
59
+ for _ in range(n):
60
+ doc = next(docs)
61
+ key, vec = next(iter(doc.items()))
62
+ idx, tag = map(int, key.split(","))
63
+ assert tag == 1, f"expected tag 1, got {tag}"
64
+ if in_emb is not None: # want_in == True
65
+ in_emb[idx] = np.asarray(vec, dtype=dtype)
66
+ # else: silently drop it
67
+
68
+ # -- optional indexation doc – skip for now --------------------
69
+ # (decode_file_iter stops automatically at EOF)
70
+
71
+ return (out_emb, in_emb) if want_in else out_emb
@@ -1,30 +1,101 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: graphembed_rs
3
- Version: 0.1.0
3
+ Version: 0.1.2
4
+ Requires-Dist: numpy>=2.2.5
5
+ Requires-Dist: pymongo>=4.12.1
6
+ License-File: LICENSE-MIT
4
7
  Summary: Python bindings for the high‑performance Rust graph/network embedding library graphembed
5
8
  Keywords: graph,embedding,hash
6
9
  Author: Jianshu Zhao
7
- Author-email: jeanpierre.both@gmail.com, jianshuzhao@yahoo.com
10
+ Author-email: jeanpierre.both@gmail.com
8
11
  License: MIT OR Apache-2.0
9
12
  Requires-Python: >=3.8
10
13
  Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
11
14
  Project-URL: Source Code, https://github.com/jean-pierreBoth/graphembed
12
15
 
13
- # Graphembed
14
-
15
- This crate provides ,as a library and an executable,embedding of directed or undirected graphs with positively weighted edges.
16
-
17
-
18
- - For simple graphs, without data attached to nodes/labels, there are 2 (rust) modules **nodesketch** and **atp**. A simple executable with a validation option based on link prediction is also provided.
19
-
20
- - To complement the embeddings we provide also core decomposition of graphs (see the module **structure**). We give try to analyze how Orkut communities are preserved through an embedding. (See Notebooks directory).
21
-
22
- - The module **gkernel** is dedicated to graphs with discrete labels attached to nodes/edges. We use the *petgraph* crate for graph description.
23
- The algorithm is based on an extension of the hashing strategy used in the module **nodesketch**.
24
- In the undirected case, this module also computes a global embedding vector for the whole graph. **It is still in an early version**.
16
+ [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/graphembed/README.html)
17
+ ![](https://anaconda.org/bioconda/graphembed/badges/license.svg)
18
+ ![](https://anaconda.org/bioconda/graphembed/badges/version.svg)
19
+ ![](https://anaconda.org/bioconda/graphembed/badges/latest_release_relative_date.svg)
20
+ ![](https://anaconda.org/bioconda/graphembed/badges/platforms.svg)
21
+ [![install with conda](https://anaconda.org/bioconda/graphembed/badges/downloads.svg)](https://anaconda.org/bioconda/graphembed)
22
+
23
+ <div align="center">
24
+ <img width="35%" src ="GraphEmbed_log.jpg">
25
+ </div>
26
+
27
+ # GraphEmbed: Efficient and Robust Network Embedding via High-Order Proximity Preservation or Recursive Sketching
28
+
29
+ This crate provides an executable and a library for embedding directed or undirected graphs with positively weighted edges. We engineered and optimized existing network embedding algorithms for large-scale networks, especially biological networks. This crate was developed by [Jianshu Zhao](https://gitlab.com/Jianshu_Zhao) and Jean-Pierre Both ([jpboth](https://gitlab.com/jpboth)). A copy is also available on [GitHub](https://github.com/jianshu93/graphembed).
30
+
31
+
32
+ - For simple graphs, without data attached to nodes, there are 2 modules **nodesketch** and **atp**. A simple executable with a validation option based on link prediction is also provided.
33
+
34
+ ## Quick Install
35
+
36
+ ### Pre-built binaries on Linux
37
+ ```bash
38
+ wget https://gitlab.com/-/project/64961144/uploads/9d7d0b038140cb67c584f01cd6dafac9/graphembed_Linux_x86-64_v0.1.6.zip
39
+ unzip graphembed_Linux_x86-64_v0.1.6.zip
40
+ chmod a+x ./graphembed
41
+ ./graphembed -h
42
+ ```
43
+
44
+ ### Bioconda on Linux/MacOS
45
+ ```bash
46
+ conda install -c conda-forge -c bioconda graphembed
47
+ ```
48
+
49
+ ### Homebrew on MacOS
50
+ ```bash
51
+ brew tap jianshu93/graphembed
52
+ brew update
53
+ brew install graphembed
54
+ ```
55
+
56
+
57
+ ### In Python (Please install python>=3.9 first)
58
+ ```bash
59
+ pip install graphembed_rs
60
+
61
+ ### or you can build from source (Linux) after installing maturin
62
+ git clone https://gitlab.com/Jianshu_Zhao/graphembed
63
+ cd graphembed
64
+ pip install maturin
65
+ ### note: for macOS, change the line "features = ["pyo3/extension-module", "intel-mkl-static", "simdeez_f"]" in pyproject.toml to "features = ["pyo3/extension-module","openblas-system","stdsimd"]"; you also need to install OpenBLAS via Homebrew and add it to the system library path
66
+ maturin develop --release
67
+
68
+ #### Prepare some data
69
+ wget https://gitlab.com/-/project/64961144/uploads/4e341383d62d86d1dd66e668e91b2c07/BlogCatalog.txt
70
+ ```
71
+
72
+ ```python
73
+ import os
74
+ os.environ["RUST_LOG"] = "info"
75
+ import graphembed_rs.graphembed_rs as ge
76
+ import graphembed_rs.load_utils as ge_utils
77
+ help(ge)
78
+ help(ge_utils)
79
+ ### HOPE
80
+ ge.embed_hope_rank("BlogCatalog.txt", target_rank=128, nbiter=4,output="embedding_output")
81
+ out_vectors=ge_utils.load_embedding_bson("embedding_output.bson")
82
+ print("OUT embedding shape :", out_vectors.shape)
83
+ print("first OUT vector :", out_vectors[0])
84
+
85
+ ### Sketching
86
+ ### sketching only
87
+ ge.embed_sketching("BlogCatalog.txt", decay=0.3, dim=128, nbiter=5, symetric=True, output="embedding_output")
88
+ out_vectors=ge_utils.load_embedding_bson("embedding_output.bson")
89
+ print("OUT embedding shape :", out_vectors.shape)
90
+ print("first OUT vector :", out_vectors[0])
91
+
92
+
93
+ ### validate accuracy
94
+ auc_scores = ge.validate_sketching("BlogCatalog.txt",decay=0.3, dim=128, nbiter=3, nbpass=1, skip_frac=0.2,symetric=True, centric=True)
95
+ print("Standard AUC per pass:", auc_scores)
96
+ ```
25
97
 
26
98
  ## Methods
27
-
28
99
  ### The embedding algorithms used in this crate are based on the following papers
29
100
 
30
101
  - **nodesketch**
@@ -57,29 +128,12 @@ Source node are related to left singular vectors and target nodes to the right o
57
128
  The similarity measure is the dot product, so it is not a norm.
58
129
  The svd is approximated by randomization as described in Halko-Tropp 2011 as implemented in the [annembed crate](https://crates.io/crates/annembed).
59
130
 
60
- ### The core decomposition algorithms
61
-
62
- - **Density-friendly decomposition**
63
-
64
- *Large Scale decomposition via convex programming 2017*
65
- M.Danisch T.H Hubert Chan and M.Sozio
66
-
67
- The decomposition of the graph in maximally dense groups of nodes is implemented and used to assess the quality of the embeddings in a structural way. See module *validation* and the comments on the embedding of the *Orkut* graph where we can use the community data provided with the graph to analyze the behaviour of embedded edge lengths.
68
-
69
- In particular it is shown that :
70
- - embedding of edges internal to a community are consistently smaller than embedded edges crossing a block frontier.
71
- - The transition probabilities of edge from one block to another are similar (low kullback divergence) in the original graph and in the embedded graph.
72
-
73
- See results in [orkut.md](./orkut.md) and examples directory together with a small Rust notebook in directory [Notebooks](./Notebooks/orkutrs.ipynb)
74
-
75
131
  ## Validation
76
132
 
77
133
  Validation of embeddings is assessed via standard AUC with random deletion of edges. See documentation in the *link* module and *embed* binary.
78
134
  We also give a variation based on centric quality assessment, as explained at [cauc](http://github.com/jean-pierreBoth/linkauc)
79
135
  ## Some data sets
80
136
 
81
- ### Without labels
82
-
83
137
  Small datasets are given in the Data subdirectory (with 7z compression) to run tests.
84
138
  Larger datasets can be downloaded from the SNAP data collections <https://snap.stanford.edu/data>
85
139
 
@@ -132,13 +186,11 @@ A preliminary of node centric quality estimation is provided in the validation m
132
186
  - The munmun_twitter_social graph shows that treating a directed graph as an undirected graph gives significantly different results in terms of link prediction AUC.
133
187
 
134
188
 
135
-
136
-
137
189
  ## Generalized Svd
138
190
 
139
191
  An implementation of Generalized Svd comes as a by-product in module [gsvd](./src/atp/gsvd.rs).
140
192
 
141
- ## Installation and Usage
193
+ ## Detailed Installation and Usage
142
194
 
143
195
  ### Installation
144
196
 
@@ -161,21 +213,21 @@ so to the required dimension to get a valid embedding in $R^{n}$.
161
213
  - The *embed* module takes embedding and possibly validation commands (link prediction task) in one directive.
162
214
  The general syntax is :
163
215
 
164
- embed file_description [validation_command --validation_arguments] sketching mode --embedding_arguments
216
+ graphembed file_description [validation_command --validation_arguments] sketching mode --embedding_arguments
165
217
  for example:
166
218
 
167
219
  For a symmetric graph we get:
168
220
 
169
221
  - just embedding:
170
- embed --csv ./Data/Graphs/Orkut/com-orkut.ungraph.txt --symetric sketching --decay 0.2 --dim 200 --nbiter
222
+ graphembed --csv ./Data/Graphs/Orkut/com-orkut.ungraph.txt --symetric sketching --decay 0.2 --dim 200 --nbiter
171
223
 
172
224
  - embedding and validation:
173
225
 
174
- embed --csv ./Data/Graphs/Orkut/com-orkut.ungraph.txt --symetric validation --nbpass 5 --skip 0.15 sketching --decay 0.2 --dim 200 --nbiter 5
226
+ graphembed --csv ./Data/Graphs/Orkut/com-orkut.ungraph.txt --symetric validation --nbpass 5 --skip 0.15 sketching --decay 0.2 --dim 200 --nbiter 5
175
227
 
176
228
  For an asymmetric graph we get:
177
229
 
178
- embed --csv ./Data/Graphs/asymetric.csv validation --nbpass 5 --skip 0.15 sketching --decay 0.2 --dim 200 --nbiter 5
230
+ graphembed --csv ./Data/Graphs/asymetric.csv validation --nbpass 5 --skip 0.15 sketching --decay 0.2 --dim 200 --nbiter 5
179
231
 
180
232
 
181
233
  More details can be found in the docs of the embed module. Use `cargo doc --no-deps --bin embed` (and `cargo doc --no-deps`) as usual.
@@ -0,0 +1,7 @@
1
+ graphembed_rs-0.1.2.dist-info/METADATA,sha256=WEvYJP6qVNPFlFxYzzq-n3XX6L__a2ukxgYkVt_WnYo,11532
2
+ graphembed_rs-0.1.2.dist-info/WHEEL,sha256=pKyTkFbTEakJa6xy_GKYIZO6TnrTnUxcBEc6JAhr_7o,104
3
+ graphembed_rs-0.1.2.dist-info/licenses/LICENSE-MIT,sha256=ndZ12D28O4UkfOeoa6HP9E7IKyYG4iH79iQ6WiLs9bc,1077
4
+ graphembed_rs/load_utils.py,sha256=fXd4-5OxdymSkb4kx39XeW_P3nEDdAuQ7Oe1D6PKDWE,2719
5
+ graphembed_rs/__init__.py,sha256=s8WorNcMnn15CfjctNaLnXGnwQ9RlPyCR0WJQxepTTc,179
6
+ graphembed_rs/graphembed_rs.cpython-310-darwin.so,sha256=tsLRSTexzC6Y6ZXPPOr640KrEbGEAt3O_2nznzQ1R-k,5266032
7
+ graphembed_rs-0.1.2.dist-info/RECORD,,
@@ -0,0 +1,25 @@
1
+ Copyright (c) 2022 jean-pierre.both and Jianshu Zhao
2
+
3
+ Permission is hereby granted, free of charge, to any
4
+ person obtaining a copy of this software and associated
5
+ documentation files (the "Software"), to deal in the
6
+ Software without restriction, including without
7
+ limitation the rights to use, copy, modify, merge,
8
+ publish, distribute, sublicense, and/or sell copies of
9
+ the Software, and to permit persons to whom the Software
10
+ is furnished to do so, subject to the following
11
+ conditions:
12
+
13
+ The above copyright notice and this permission notice
14
+ shall be included in all copies or substantial portions
15
+ of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
18
+ ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
19
+ TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
20
+ PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
21
+ SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
22
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
24
+ IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25
+ DEALINGS IN THE SOFTWARE.
graphembed/__init__.py DELETED
@@ -1,5 +0,0 @@
1
- from .graphembed import *
2
-
3
- __doc__ = graphembed.__doc__
4
- if hasattr(graphembed, "__all__"):
5
- __all__ = graphembed.__all__
graphembed/__init__.pyi DELETED
@@ -1,79 +0,0 @@
1
- from typing import Optional
2
-
3
- # ---------- Embedding ----------
4
- def embed_hope_rank(
5
- csv: str,
6
- symetric: bool,
7
- target_rank: int,
8
- nbiter: int,
9
- output: Optional[str] = None,
10
- ) -> None: ...
11
- def embed_hope_precision(
12
- csv: str,
13
- symetric: bool,
14
- epsil: float,
15
- maxrank: int,
16
- blockiter: int,
17
- output: Optional[str] = None,
18
- ) -> None: ...
19
- def embed_sketching(
20
- csv: str,
21
- symetric: bool,
22
- decay: float,
23
- dim: int,
24
- nbiter: int,
25
- output: Optional[str] = None,
26
- ) -> None: ...
27
-
28
- # ---------- Validation (returns mean AUC) ----------
29
- def validate_hope_rank(
30
- csv: str,
31
- symetric: bool,
32
- target_rank: int,
33
- nbiter: int,
34
- nbpass: int = 10,
35
- skip_frac: float = 0.1,
36
- centric: bool = False,
37
- ) -> float: ...
38
- def validate_hope_precision(
39
- csv: str,
40
- symetric: bool,
41
- epsil: float,
42
- maxrank: int,
43
- blockiter: int,
44
- nbpass: int = 10,
45
- skip_frac: float = 0.1,
46
- centric: bool = False,
47
- ) -> float: ...
48
- def validate_sketching(
49
- csv: str,
50
- symetric: bool,
51
- decay: float,
52
- dim: int,
53
- nbiter: int,
54
- nbpass: int = 10,
55
- skip_frac: float = 0.1,
56
- centric: bool = False,
57
- ) -> float: ...
58
-
59
- # ---------- VCMPR (precision/recall curves) ----------
60
- def estimate_vcmpr_hope_rank(
61
- csv: str,
62
- symetric: bool,
63
- target_rank: int,
64
- nbiter: int,
65
- nbpass: int = 2,
66
- topk: int = 10,
67
- skip_frac: float = 0.1,
68
- ) -> None: ...
69
- def estimate_vcmpr_sketching(
70
- csv: str,
71
- symetric: bool,
72
- decay: float,
73
- dim: int,
74
- nbiter: int,
75
- nbpass: int = 2,
76
- topk: int = 10,
77
- skip_frac: float = 0.1,
78
- ) -> None: ...
79
-
Binary file
graphembed/py.typed DELETED
File without changes
@@ -1,7 +0,0 @@
1
- graphembed_rs-0.1.0.dist-info/METADATA,sha256=1bBA8fy75z8I6YGAsPAMoB-67zpAHW66cHywSb-hPj8,9901
2
- graphembed_rs-0.1.0.dist-info/WHEEL,sha256=pKyTkFbTEakJa6xy_GKYIZO6TnrTnUxcBEc6JAhr_7o,104
3
- graphembed/__init__.py,sha256=RCcLraveWf-myTsDQGePMYq-scNNfz-3Mv1baSbgAmM,123
4
- graphembed/__init__.pyi,sha256=3_KBFG4g9akylo32CHlm9bZStcLwxIY2X4si21ilD3w,1626
5
- graphembed/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- graphembed/graphembed.cpython-310-darwin.so,sha256=rpwaeaP3bH7t93yRz2edb4HD8CDEgu7bkEMFO3waTGU,5158592
7
- graphembed_rs-0.1.0.dist-info/RECORD,,