fast-gspan 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of fast-gspan might be problematic. Click here for more details.

Files changed (37) hide show
  1. fast_gspan-0.1.1/.gitignore +19 -0
  2. fast_gspan-0.1.1/PKG-INFO +195 -0
  3. fast_gspan-0.1.1/README.md +167 -0
  4. fast_gspan-0.1.1/fast_gspan/__init__.py +6 -0
  5. fast_gspan-0.1.1/fast_gspan/__main__.py +106 -0
  6. fast_gspan-0.1.1/fast_gspan/gbolt_wrapper.py +509 -0
  7. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/.gitignore +130 -0
  8. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/CMakeLists.txt +81 -0
  9. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/LICENSE +25 -0
  10. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/README.md +124 -0
  11. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/config.h.in +2 -0
  12. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/extern/cxxopts/.gitignore +8 -0
  13. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/extern/cxxopts/CHANGELOG.md +68 -0
  14. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/extern/cxxopts/CMakeLists.txt +94 -0
  15. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/extern/cxxopts/INSTALL +10 -0
  16. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/extern/cxxopts/LICENSE +19 -0
  17. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/extern/cxxopts/README.md +134 -0
  18. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/extern/cxxopts/cxxopts-config.cmake.in +4 -0
  19. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/extern/cxxopts/include/cxxopts.hpp +2005 -0
  20. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/include/common.h +76 -0
  21. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/include/database.h +50 -0
  22. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/include/gbolt.h +199 -0
  23. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/include/graph.h +203 -0
  24. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/include/history.h +55 -0
  25. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/include/output.h +35 -0
  26. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/include/path.h +92 -0
  27. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/src/database.cc +139 -0
  28. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/src/gbolt.cc +82 -0
  29. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/src/gbolt_count.cc +279 -0
  30. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/src/gbolt_execute.cc +139 -0
  31. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/src/gbolt_extend.cc +144 -0
  32. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/src/gbolt_mine.cc +230 -0
  33. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/src/history.cc +86 -0
  34. fast_gspan-0.1.1/fast_gspan/vendor/gbolt/src/output.cc +38 -0
  35. fast_gspan-0.1.1/hatch_build.py +158 -0
  36. fast_gspan-0.1.1/pyproject.toml +56 -0
  37. fast_gspan-0.1.1/tests/test_wrapper.py +178 -0
@@ -0,0 +1,19 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ *.so
7
+
8
+ # gBolt build artifacts
9
+ fast_gspan/vendor/gbolt/build/
10
+ fast_gspan/vendor/gbolt/include/config.h
11
+
12
+ # IDE
13
+ .idea/
14
+ .vscode/
15
+ *.swp
16
+
17
+ # test / misc
18
+ .pytest_cache/
19
+ .ruff_cache/
@@ -0,0 +1,195 @@
1
+ Metadata-Version: 2.4
2
+ Name: fast-gspan
3
+ Version: 0.1.1
4
+ Summary: Fast gSpan (frequent subgraph mining) powered by gBolt C++ backend
5
+ Project-URL: Homepage, https://github.com/Masatsugar/fast-gspan
6
+ Project-URL: Repository, https://github.com/Masatsugar/fast-gspan
7
+ Project-URL: Issues, https://github.com/Masatsugar/fast-gspan/issues
8
+ Author-email: Masatsugar <20656329+Masatsugar@users.noreply.github.com>
9
+ License-Expression: BSD-2-Clause
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: BSD License
13
+ Classifier: Operating System :: MacOS
14
+ Classifier: Operating System :: POSIX :: Linux
15
+ Classifier: Programming Language :: C++
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Scientific/Engineering
21
+ Requires-Python: >=3.10
22
+ Requires-Dist: networkx>=3.0
23
+ Requires-Dist: pandas>=2.0
24
+ Provides-Extra: dev
25
+ Requires-Dist: build; extra == 'dev'
26
+ Requires-Dist: pytest>=7.0; extra == 'dev'
27
+ Description-Content-Type: text/markdown
28
+
29
+ # fast-gspan
30
+
31
+ [![CI](https://github.com/Masatsugar/fast-gspan/actions/workflows/ci.yml/badge.svg)](https://github.com/Masatsugar/fast-gspan/actions)
32
+ [![PyPI](https://img.shields.io/pypi/v/fast-gspan.svg)](https://pypi.org/project/fast-gspan/)
33
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
34
+ [![License: BSD-2-Clause](https://img.shields.io/badge/license-BSD--2--Clause-green.svg)](https://opensource.org/licenses/BSD-2-Clause)
35
+
36
+ A Python wrapper for frequent subgraph mining powered by the [gBolt](https://github.com/Jokeren/gBolt) C++ backend.
37
+
38
+ Provides a simple API to mine frequent subgraph patterns from NetworkX graphs, with significant speedups over pure-Python gSpan implementations.
39
+
40
+ ## Installation
41
+
42
+ Pre-built wheels are available for Linux (x86_64) and macOS (arm64, x86_64):
43
+
44
+ ```bash
45
+ pip install fast-gspan
46
+ ```
47
+
48
+ ### Building from source
49
+
50
+ If a pre-built wheel is not available for your platform, install from source:
51
+
52
+ ```bash
53
+ pip install git+https://github.com/Masatsugar/fast-gspan.git
54
+ python -m fast_gspan build # compile the C++ backend
55
+ ```
56
+
57
+ Source builds require:
58
+
59
+ - CMake >= 3.10
60
+ - C++ compiler with C++11 support (GCC, Clang)
61
+ - OpenMP (optional, for parallel mining)
62
+
63
+ ```bash
64
+ # Ubuntu/Debian
65
+ sudo apt-get install cmake g++ make
66
+
67
+ # macOS
68
+ brew install cmake libomp
69
+ ```
70
+
71
+ ## Quick start
72
+
73
+ ```python
74
+ import networkx as nx
75
+ from fast_gspan import FastgSpan
76
+
77
+ # Prepare your graph database
78
+ graphs = [...] # list of NetworkX graphs with 'label' attributes on nodes/edges
79
+
80
+ # Mine frequent subgraphs
81
+ fgs = FastgSpan(min_support=10, max_num_vertices=8)
82
+ df = fgs.run_from_graphs(graphs)
83
+
84
+ print(df[["support", "num_vert", "description"]])
85
+ ```
86
+
87
+ ### From a gSpan-format file
88
+
89
+ ```python
90
+ from fast_gspan import FastgSpan
91
+
92
+ df = FastgSpan(min_support=10, max_num_vertices=8).run_from_file("graphs.txt")
93
+ ```
94
+
95
+ ### Parallel mining & progress
96
+
97
+ ```python
98
+ fgs = FastgSpan(
99
+ min_support=10,
100
+ max_num_vertices=8,
101
+ num_threads=4, # 0 = all cores (default)
102
+ show_progress=True, # show real-time pattern count
103
+ )
104
+ df = fgs.run_from_graphs(graphs)
105
+ ```
106
+
107
+ ## API
108
+
109
+ ### `FastgSpan`
110
+
111
+ High-level interface. Constructor parameters:
112
+
113
+ | Parameter | Type | Default | Description |
114
+ |---|---|---|---|
115
+ | `gbolt_path` | `str \| None` | `None` | Path to gBolt executable. Auto-detected if `None`. |
116
+ | `min_support` | `int` | `2` | Minimum absolute support threshold. |
117
+ | `min_num_vertices` | `int` | `1` | Minimum vertices in a pattern. |
118
+ | `max_num_vertices` | `int` | `10` | Maximum vertices in a pattern. |
119
+ | `num_threads` | `int` | `0` | Number of OpenMP threads (0 = all cores). |
120
+ | `show_progress` | `bool` | `False` | Show progress during mining. |
121
+ | `verbose` | `bool` | `False` | Print debug information. |
122
+
123
+ Methods:
124
+
125
+ - **`run_from_graphs(graphs)`** -- Mine from a list of `nx.Graph`. Returns `pd.DataFrame`.
126
+ - **`run_from_file(filepath)`** -- Read a gSpan-format file and mine. Returns `pd.DataFrame`.
127
+
128
+ ### `GBoltWrapper`
129
+
130
+ Low-level wrapper around the gBolt binary. Use this if you need direct access to raw pattern dicts.
131
+
132
+ - **`mine_frequent_subgraphs(graphs)`** -- Returns `list[dict]` with keys: `pattern_id`, `support`, `vertices`, `edges`, `dfs_codes`, `graph_data`.
133
+
134
+ ## Output format
135
+
136
+ The returned DataFrame has the following columns:
137
+
138
+ | Column | Description |
139
+ |---|---|
140
+ | `support` | Number of graphs containing this pattern |
141
+ | `description` | DFS-code representation: `(from, to, from_label, edge_label, to_label)` per edge |
142
+ | `num_vert` | Number of vertices in the pattern |
143
+ | `pattern_id` | Pattern ID assigned by gBolt |
144
+ | `vertices` | List of `(vertex_id, label)` tuples |
145
+ | `edges` | List of `(from, to, edge_label)` tuples |
146
+
147
+ ## gSpan-format file
148
+
149
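+ Input files follow the standard gSpan text format: each graph starts with a `t # <graph_id>` line, followed by `v <vertex_id> <label>` lines for its vertices and `e <from> <to> <label>` lines for its edges; a final `t # -1` line conventionally marks the end of the input: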
150
+
151
+ ```
152
+ t # 0
153
+ v 0 1
154
+ v 1 2
155
+ e 0 1 3
156
+ t # 1
157
+ v 0 1
158
+ v 1 1
159
+ v 2 2
160
+ e 0 1 3
161
+ e 1 2 4
162
+ t # -1
163
+ ```
164
+
165
+ ## Changes from upstream gBolt
166
+
167
+ This package bundles a modified fork of [gBolt](https://github.com/Jokeren/gBolt) with the following changes:
168
+
169
+ ### New features
170
+
171
+ - **`-x, --max-vertices` option** -- Limits the maximum number of vertices in mined patterns. Allows early pruning during DFS exploration, reducing both runtime and memory usage.
172
+ - **Projection size guard (`MAX_PROJECTION_SIZE`)** -- Skips projections exceeding 500,000 entries to prevent memory explosion on dense graphs.
173
+
174
+ ### Output format change
175
+
176
+ - The DFS-code output (`-d` flag) now emits the full tuple:
177
+ ```
178
+ e <from> <to> <from_label> <edge_label> <to_label>
179
+ ```
180
+ The upstream format only emitted `e <from> <to> <edge_label>`. The extended format enables exact reconstruction of canonical DFS codes in the Python wrapper.
181
+
182
+ ### Build system
183
+
184
+ - CMake minimum version raised from 2.6 to 3.10.
185
+ - Added macOS (Apple Clang) fallback for OpenMP via Homebrew `libomp`.
186
+
187
+ ### Bug fixes
188
+
189
+ - Added `const` qualifier to three `operator()` methods in `include/graph.h` to fix compiler warnings and ensure correctness with modern C++ standards.
190
+
191
+ ## License
192
+
193
+ The bundled gBolt C++ source is licensed under the **BSD 2-Clause License** (Copyright (c) 2017, Keren Zhou). See [`fast_gspan/vendor/gbolt/LICENSE`](fast_gspan/vendor/gbolt/LICENSE) for details.
194
+
195
+ The Python wrapper code in this repository is also released under the BSD 2-Clause License.
@@ -0,0 +1,167 @@
1
+ # fast-gspan
2
+
3
+ [![CI](https://github.com/Masatsugar/fast-gspan/actions/workflows/ci.yml/badge.svg)](https://github.com/Masatsugar/fast-gspan/actions)
4
+ [![PyPI](https://img.shields.io/pypi/v/fast-gspan.svg)](https://pypi.org/project/fast-gspan/)
5
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
6
+ [![License: BSD-2-Clause](https://img.shields.io/badge/license-BSD--2--Clause-green.svg)](https://opensource.org/licenses/BSD-2-Clause)
7
+
8
+ A Python wrapper for frequent subgraph mining powered by the [gBolt](https://github.com/Jokeren/gBolt) C++ backend.
9
+
10
+ Provides a simple API to mine frequent subgraph patterns from NetworkX graphs, with significant speedups over pure-Python gSpan implementations.
11
+
12
+ ## Installation
13
+
14
+ Pre-built wheels are available for Linux (x86_64) and macOS (arm64, x86_64):
15
+
16
+ ```bash
17
+ pip install fast-gspan
18
+ ```
19
+
20
+ ### Building from source
21
+
22
+ If a pre-built wheel is not available for your platform, install from source:
23
+
24
+ ```bash
25
+ pip install git+https://github.com/Masatsugar/fast-gspan.git
26
+ python -m fast_gspan build # compile the C++ backend
27
+ ```
28
+
29
+ Source builds require:
30
+
31
+ - CMake >= 3.10
32
+ - C++ compiler with C++11 support (GCC, Clang)
33
+ - OpenMP (optional, for parallel mining)
34
+
35
+ ```bash
36
+ # Ubuntu/Debian
37
+ sudo apt-get install cmake g++ make
38
+
39
+ # macOS
40
+ brew install cmake libomp
41
+ ```
42
+
43
+ ## Quick start
44
+
45
+ ```python
46
+ import networkx as nx
47
+ from fast_gspan import FastgSpan
48
+
49
+ # Prepare your graph database
50
+ graphs = [...] # list of NetworkX graphs with 'label' attributes on nodes/edges
51
+
52
+ # Mine frequent subgraphs
53
+ fgs = FastgSpan(min_support=10, max_num_vertices=8)
54
+ df = fgs.run_from_graphs(graphs)
55
+
56
+ print(df[["support", "num_vert", "description"]])
57
+ ```
58
+
59
+ ### From a gSpan-format file
60
+
61
+ ```python
62
+ from fast_gspan import FastgSpan
63
+
64
+ df = FastgSpan(min_support=10, max_num_vertices=8).run_from_file("graphs.txt")
65
+ ```
66
+
67
+ ### Parallel mining & progress
68
+
69
+ ```python
70
+ fgs = FastgSpan(
71
+ min_support=10,
72
+ max_num_vertices=8,
73
+ num_threads=4, # 0 = all cores (default)
74
+ show_progress=True, # show real-time pattern count
75
+ )
76
+ df = fgs.run_from_graphs(graphs)
77
+ ```
78
+
79
+ ## API
80
+
81
+ ### `FastgSpan`
82
+
83
+ High-level interface. Constructor parameters:
84
+
85
+ | Parameter | Type | Default | Description |
86
+ |---|---|---|---|
87
+ | `gbolt_path` | `str \| None` | `None` | Path to gBolt executable. Auto-detected if `None`. |
88
+ | `min_support` | `int` | `2` | Minimum absolute support threshold. |
89
+ | `min_num_vertices` | `int` | `1` | Minimum vertices in a pattern. |
90
+ | `max_num_vertices` | `int` | `10` | Maximum vertices in a pattern. |
91
+ | `num_threads` | `int` | `0` | Number of OpenMP threads (0 = all cores). |
92
+ | `show_progress` | `bool` | `False` | Show progress during mining. |
93
+ | `verbose` | `bool` | `False` | Print debug information. |
94
+
95
+ Methods:
96
+
97
+ - **`run_from_graphs(graphs)`** -- Mine from a list of `nx.Graph`. Returns `pd.DataFrame`.
98
+ - **`run_from_file(filepath)`** -- Read a gSpan-format file and mine. Returns `pd.DataFrame`.
99
+
100
+ ### `GBoltWrapper`
101
+
102
+ Low-level wrapper around the gBolt binary. Use this if you need direct access to raw pattern dicts.
103
+
104
+ - **`mine_frequent_subgraphs(graphs)`** -- Returns `list[dict]` with keys: `pattern_id`, `support`, `vertices`, `edges`, `dfs_codes`, `graph_data`.
105
+
106
+ ## Output format
107
+
108
+ The returned DataFrame has the following columns:
109
+
110
+ | Column | Description |
111
+ |---|---|
112
+ | `support` | Number of graphs containing this pattern |
113
+ | `description` | DFS-code representation: `(from, to, from_label, edge_label, to_label)` per edge |
114
+ | `num_vert` | Number of vertices in the pattern |
115
+ | `pattern_id` | Pattern ID assigned by gBolt |
116
+ | `vertices` | List of `(vertex_id, label)` tuples |
117
+ | `edges` | List of `(from, to, edge_label)` tuples |
118
+
119
+ ## gSpan-format file
120
+
121
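+ Input files follow the standard gSpan text format: each graph starts with a `t # <graph_id>` line, followed by `v <vertex_id> <label>` lines for its vertices and `e <from> <to> <label>` lines for its edges; a final `t # -1` line conventionally marks the end of the input: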
122
+
123
+ ```
124
+ t # 0
125
+ v 0 1
126
+ v 1 2
127
+ e 0 1 3
128
+ t # 1
129
+ v 0 1
130
+ v 1 1
131
+ v 2 2
132
+ e 0 1 3
133
+ e 1 2 4
134
+ t # -1
135
+ ```
136
+
137
+ ## Changes from upstream gBolt
138
+
139
+ This package bundles a modified fork of [gBolt](https://github.com/Jokeren/gBolt) with the following changes:
140
+
141
+ ### New features
142
+
143
+ - **`-x, --max-vertices` option** -- Limits the maximum number of vertices in mined patterns. Allows early pruning during DFS exploration, reducing both runtime and memory usage.
144
+ - **Projection size guard (`MAX_PROJECTION_SIZE`)** -- Skips projections exceeding 500,000 entries to prevent memory explosion on dense graphs.
145
+
146
+ ### Output format change
147
+
148
+ - The DFS-code output (`-d` flag) now emits the full tuple:
149
+ ```
150
+ e <from> <to> <from_label> <edge_label> <to_label>
151
+ ```
152
+ The upstream format only emitted `e <from> <to> <edge_label>`. The extended format enables exact reconstruction of canonical DFS codes in the Python wrapper.
153
+
154
+ ### Build system
155
+
156
+ - CMake minimum version raised from 2.6 to 3.10.
157
+ - Added macOS (Apple Clang) fallback for OpenMP via Homebrew `libomp`.
158
+
159
+ ### Bug fixes
160
+
161
+ - Added `const` qualifier to three `operator()` methods in `include/graph.h` to fix compiler warnings and ensure correctness with modern C++ standards.
162
+
163
+ ## License
164
+
165
+ The bundled gBolt C++ source is licensed under the **BSD 2-Clause License** (Copyright (c) 2017, Keren Zhou). See [`fast_gspan/vendor/gbolt/LICENSE`](fast_gspan/vendor/gbolt/LICENSE) for details.
166
+
167
+ The Python wrapper code in this repository is also released under the BSD 2-Clause License.
@@ -0,0 +1,6 @@
1
+ """Fast gSpan implementation using gBolt C++ backend."""
2
+
3
+ from .gbolt_wrapper import FastgSpan, GBoltWrapper
4
+
5
+ __all__ = ["GBoltWrapper", "FastgSpan"]
6
# Keep in sync with the distribution version declared in the package
# metadata (PKG-INFO "Version: 0.1.1"); the previous value "0.1.0" was stale.
__version__ = "0.1.1"
@@ -0,0 +1,106 @@
1
+ """CLI entry point: python -m fast_gspan build"""
2
+
3
+ import shutil
4
+ import subprocess
5
+ import sys
6
+ from pathlib import Path
7
+
8
# Location of the bundled gBolt C++ sources, resolved relative to this module.
VENDOR_GBOLT_DIR = Path(__file__).parent.joinpath("vendor", "gbolt")
9
+
10
+
11
def _check_tool(name: str, cmd: list[str]) -> bool:
    """Report whether a build tool can be run successfully.

    Runs ``cmd`` with its output captured.

    Args:
        name: Human-readable tool name used in the diagnostic message.
        cmd: Command line (argument list) used to probe the tool.

    Returns:
        ``True`` if the command ran and exited with status 0. ``False`` if
        the executable could not be found or exited with a non-zero status;
        in that case a "<name>: not found" line is printed.
    """
    try:
        subprocess.run(cmd, capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        print(f" {name}: not found")
        return False
    else:
        return True
18
+
19
+
20
def build():
    """Configure and compile the vendored gBolt sources.

    Steps, in order:

    1. Verify that the vendor source directory exists.
    2. Check that ``cmake`` and ``make`` can be run.
    3. Recreate an empty ``build`` directory under the vendor directory.
    4. Run ``cmake ..`` and then ``make -j4 -k`` inside it.

    Success is judged by the presence of the ``gbolt`` file in the build
    directory, not by make's exit status. Any failure prints a message and
    terminates the process with exit status 1.
    """
    print("Building gBolt from vendored source...")

    if not VENDOR_GBOLT_DIR.exists():
        print(f"Error: vendor source not found at {VENDOR_GBOLT_DIR}")
        sys.exit(1)

    # Probe every prerequisite (no short-circuit) so that each missing tool
    # is reported, not just the first one.
    required_tools = (
        ("cmake", ["cmake", "--version"]),
        ("make", ["make", "--version"]),
    )
    probe_results = [_check_tool(tool, probe) for tool, probe in required_tools]
    if not all(probe_results):
        print("\nPlease install the missing build tools.")
        sys.exit(1)

    # Always start from a fresh build tree.
    out_dir = VENDOR_GBOLT_DIR / "build"
    if out_dir.exists():
        shutil.rmtree(out_dir)
    out_dir.mkdir()

    print(" Configuring (cmake) ...")
    configure = subprocess.run(
        ["cmake", ".."],
        cwd=out_dir,
        capture_output=True,
        text=True,
    )
    if configure.returncode != 0:
        print(f"cmake failed:\n{configure.stderr}")
        sys.exit(1)

    print(" Compiling (make) ...")
    compilation = subprocess.run(
        ["make", "-j4", "-k"],
        cwd=out_dir,
        capture_output=True,
        text=True,
    )

    # make runs with -k (keep going), so the produced binary is the success signal.
    binary = out_dir / "gbolt"
    if not binary.exists():
        print(f" Build failed:\n{compilation.stderr}")
        sys.exit(1)
    print(f" gBolt built successfully: {binary}")
70
+
71
+
72
def clean():
    """Delete the gBolt build directory, reporting what was done."""
    target = VENDOR_GBOLT_DIR / "build"
    if not target.exists():
        print("Nothing to clean.")
        return
    shutil.rmtree(target)
    print("Build directory removed.")
80
+
81
+
82
def usage():
    """Print the command-line help text to standard output."""
    help_lines = (
        "Usage: python -m fast_gspan <command>",
        "",
        "Commands:",
        " build Build the gBolt C++ backend",
        " clean Remove the gBolt build directory",
    )
    print("\n".join(help_lines))
88
+
89
+
90
def main():
    """Dispatch the first command-line argument to ``build`` or ``clean``.

    When no argument is given, or the command is not recognised, the usage
    text is printed and the process exits with status 1.
    """
    arguments = sys.argv[1:]
    if not arguments:
        usage()
        sys.exit(1)

    command = arguments[0]
    if command == "build":
        build()
        return
    if command == "clean":
        clean()
        return

    # Unknown command: show help and signal failure.
    usage()
    sys.exit(1)


if __name__ == "__main__":
    main()