path-boost 2.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. path_boost-2.1.0/.Rhistory +0 -0
  2. path_boost-2.1.0/.coveragerc +6 -0
  3. path_boost-2.1.0/.github/dependabot.yml +18 -0
  4. path_boost-2.1.0/.github/workflows/deploy-gh-pages.yml +34 -0
  5. path_boost-2.1.0/.github/workflows/lint.yml +23 -0
  6. path_boost-2.1.0/.github/workflows/python-app.yml +33 -0
  7. path_boost-2.1.0/.gitignore +78 -0
  8. path_boost-2.1.0/.pre-commit-config.yaml +16 -0
  9. path_boost-2.1.0/LICENSE +21 -0
  10. path_boost-2.1.0/PKG-INFO +174 -0
  11. path_boost-2.1.0/README.md +141 -0
  12. path_boost-2.1.0/doc/Makefile +184 -0
  13. path_boost-2.1.0/doc/_static/css/project-template.css +59 -0
  14. path_boost-2.1.0/doc/_static/img/index_api.svg +97 -0
  15. path_boost-2.1.0/doc/_static/img/index_examples.svg +76 -0
  16. path_boost-2.1.0/doc/_static/img/index_getting_started.svg +66 -0
  17. path_boost-2.1.0/doc/_static/img/index_user_guide.svg +67 -0
  18. path_boost-2.1.0/doc/_static/img/logo.png +0 -0
  19. path_boost-2.1.0/doc/_static/js/copybutton.js +63 -0
  20. path_boost-2.1.0/doc/_templates/class.rst +26 -0
  21. path_boost-2.1.0/doc/_templates/function.rst +12 -0
  22. path_boost-2.1.0/doc/_templates/numpydoc_docstring.py +16 -0
  23. path_boost-2.1.0/doc/_templates/sidebar-search-bs.html +14 -0
  24. path_boost-2.1.0/doc/api.rst +48 -0
  25. path_boost-2.1.0/doc/conf.py +114 -0
  26. path_boost-2.1.0/doc/index.rst +108 -0
  27. path_boost-2.1.0/doc/make.bat +242 -0
  28. path_boost-2.1.0/doc/quick_start.rst +181 -0
  29. path_boost-2.1.0/doc/user_guide.rst +180 -0
  30. path_boost-2.1.0/examples/README.txt +6 -0
  31. path_boost-2.1.0/examples/example_cross_validation_path_boost.py +103 -0
  32. path_boost-2.1.0/examples/example_on_TUD_dataset.py +57 -0
  33. path_boost-2.1.0/examples/example_run_EPB_on_uNatQ_nx_graphs.py +127 -0
  34. path_boost-2.1.0/examples/example_run_extended_path_boost_on_subsampled_dataset.py +54 -0
  35. path_boost-2.1.0/examples/examples_utils.py +24 -0
  36. path_boost-2.1.0/examples/plot_classifier.py +41 -0
  37. path_boost-2.1.0/examples/plot_template.py +18 -0
  38. path_boost-2.1.0/examples/plot_transformer.py +27 -0
  39. path_boost-2.1.0/path_boost/__init__.py +18 -0
  40. path_boost-2.1.0/path_boost/_path_boost.py +1096 -0
  41. path_boost-2.1.0/path_boost/_version.py +24 -0
  42. path_boost-2.1.0/path_boost/utils/__init__.py +2 -0
  43. path_boost-2.1.0/path_boost/utils/classes/__init__.py +0 -0
  44. path_boost-2.1.0/path_boost/utils/classes/additive_model_wrapper.py +301 -0
  45. path_boost-2.1.0/path_boost/utils/classes/additive_model_wrapper_classifier.py +394 -0
  46. path_boost-2.1.0/path_boost/utils/classes/extended_boosting_matrix.py +596 -0
  47. path_boost-2.1.0/path_boost/utils/classes/interfaces/__init__.py +0 -0
  48. path_boost-2.1.0/path_boost/utils/classes/interfaces/interface_base_learner.py +30 -0
  49. path_boost-2.1.0/path_boost/utils/classes/interfaces/interface_selector.py +27 -0
  50. path_boost-2.1.0/path_boost/utils/classes/sequential_path_boost.py +1023 -0
  51. path_boost-2.1.0/path_boost/utils/classes/sequential_path_boost_classifier.py +840 -0
  52. path_boost-2.1.0/path_boost/utils/cross_validation.py +49 -0
  53. path_boost-2.1.0/path_boost/utils/cyclic_path_boost_utils.py +76 -0
  54. path_boost-2.1.0/path_boost/utils/datasets_for_examples/__init__.py +2 -0
  55. path_boost-2.1.0/path_boost/utils/datasets_for_examples/generate_example_dataset.py +304 -0
  56. path_boost-2.1.0/path_boost/utils/discovery.py +217 -0
  57. path_boost-2.1.0/path_boost/utils/plots_functions.py +153 -0
  58. path_boost-2.1.0/path_boost/utils/validate_data.py +223 -0
  59. path_boost-2.1.0/path_boost/utils/variable_importance_according_to_path_boost.py +341 -0
  60. path_boost-2.1.0/path_boost.egg-info/PKG-INFO +174 -0
  61. path_boost-2.1.0/path_boost.egg-info/SOURCES.txt +94 -0
  62. path_boost-2.1.0/path_boost.egg-info/dependency_links.txt +1 -0
  63. path_boost-2.1.0/path_boost.egg-info/requires.txt +13 -0
  64. path_boost-2.1.0/path_boost.egg-info/top_level.txt +1 -0
  65. path_boost-2.1.0/pixi.lock +11605 -0
  66. path_boost-2.1.0/pyproject.toml +148 -0
  67. path_boost-2.1.0/requirements.txt +23 -0
  68. path_boost-2.1.0/setup.cfg +4 -0
  69. path_boost-2.1.0/tests/__init__.py +2 -0
  70. path_boost-2.1.0/tests/datasets_used_for_tests/__init__.py +2 -0
  71. path_boost-2.1.0/tests/datasets_used_for_tests/load_test_dataset.py +16 -0
  72. path_boost-2.1.0/tests/test_algorithm_correctness/__init__.py +0 -0
  73. path_boost-2.1.0/tests/test_algorithm_correctness/test_algorithm_matches_specification.py +308 -0
  74. path_boost-2.1.0/tests/test_discovery.py +31 -0
  75. path_boost-2.1.0/tests/test_extended_path_boost/__init__.py +2 -0
  76. path_boost-2.1.0/tests/test_extended_path_boost/test_extended_path_boost.py +36 -0
  77. path_boost-2.1.0/tests/test_extended_path_boost/test_extended_path_boost_base_learners_and_selector_classes.py +88 -0
  78. path_boost-2.1.0/tests/test_extended_path_boost/test_extended_path_boost_fit.py +89 -0
  79. path_boost-2.1.0/tests/test_extended_path_boost/test_extended_path_boost_predict.py +173 -0
  80. path_boost-2.1.0/tests/test_extended_path_boost/test_split_extended_path_boost.py +84 -0
  81. path_boost-2.1.0/tests/test_sequential_path_boost/__init__.py +0 -0
  82. path_boost-2.1.0/tests/test_sequential_path_boost/test_fit_single_metal_center_path_boost.py +371 -0
  83. path_boost-2.1.0/tests/test_sequential_path_boost/test_single_metal_center_path_boost.py +62 -0
  84. path_boost-2.1.0/tests/test_sequential_path_boost/test_variable_importance.py +55 -0
  85. path_boost-2.1.0/tests/tests_extended_boosting_matrix/__init__.py +0 -0
  86. path_boost-2.1.0/tests/tests_extended_boosting_matrix/test_extended_boosting_matrix_finding_attributes.py +95 -0
  87. path_boost-2.1.0/tests/tests_extended_boosting_matrix/test_extended_boosting_matrix_get_frequency_matrix.py +60 -0
  88. path_boost-2.1.0/tests/tests_extended_boosting_matrix/test_extended_boosting_matrix_label_path_finding_in_single_graph.py +100 -0
  89. path_boost-2.1.0/tests/tests_extended_boosting_matrix/test_extended_boosting_matrix_label_path_finding_in_single_graph_with_starting_node.py +123 -0
  90. path_boost-2.1.0/tests/tests_extended_boosting_matrix/test_extended_boosting_matrix_path_finding_in_dataset.py +200 -0
  91. path_boost-2.1.0/tests/tests_extended_boosting_matrix/test_initialization_extended_boosting_matrix.py +263 -0
  92. path_boost-2.1.0/tests/tests_extended_boosting_matrix/test_new_columns_extend_extended_boosting_matrix.py +136 -0
  93. path_boost-2.1.0/tests/tests_for_cyclyc_path_boost.py +63 -0
  94. path_boost-2.1.0/tests/tests_sequential_path_boost_classifier/__init__.py +0 -0
  95. path_boost-2.1.0/tests/tests_sequential_path_boost_classifier/test_fit_single_metal_center_path_boost_classifier.py +195 -0
  96. path_boost-2.1.0/tests/tests_sequential_path_boost_classifier/test_variable_importance_classifier.py +58 -0
File without changes
@@ -0,0 +1,6 @@
1
+ # Configuration for coverage.py
2
+
3
+ [run]
4
+ branch = True
5
+ source = path_boost
6
+ include = */path_boost/*
@@ -0,0 +1,18 @@
1
+ version: 2
2
+ updates:
3
+ # Maintain dependencies for GitHub Actions as recommended in SPEC8:
4
+ # https://github.com/scientific-python/specs/pull/325
5
+ # At the time of writing, release critical workflows such as
6
+ # pypa/gh-action-pypi-publish should use hash-based versioning for security
7
+ # reasons. This strategy may be generalized to all other github actions
8
+ # in the future.
9
+ - package-ecosystem: "github-actions"
10
+ directory: "/"
11
+ schedule:
12
+ interval: "weekly"
13
+ groups:
14
+ actions:
15
+ patterns:
16
+ - "*"
17
+ reviewers:
18
+ - "glemaitre"
@@ -0,0 +1,34 @@
1
+ name: Documentation
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ pull_request:
8
+ branches:
9
+ - main
10
+
11
+ jobs:
12
+ deploy-gh-pages:
13
+ runs-on: ubuntu-latest
14
+
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+ with:
18
+ fetch-depth: 0
19
+ - uses: prefix-dev/setup-pixi@v0.8.1
20
+ with:
21
+ pixi-version: v0.23.0
22
+ environments: doc
23
+ frozen: true
24
+
25
+ - name: Build documentation
26
+ run: pixi run -e doc build-doc
27
+
28
+ - name: Update the main gh-page website
29
+ if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
30
+ uses: peaceiris/actions-gh-pages@v4.0.0
31
+ with:
32
+ github_token: ${{ secrets.GITHUB_TOKEN }}
33
+ publish_dir: ./doc/_build/html
34
+ commit_message: "[ci skip] ${{ github.event.head_commit.message }}"
@@ -0,0 +1,23 @@
1
+ name: Linter
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ pull_request:
8
+ branches:
9
+ - main
10
+
11
+ jobs:
12
+ build:
13
+ runs-on: ubuntu-latest
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+ - uses: prefix-dev/setup-pixi@v0.8.1
17
+ with:
18
+ pixi-version: v0.23.0
19
+ environments: lint
20
+ frozen: true
21
+
22
+ - name: Run linter
23
+ run: pixi run -e lint lint
@@ -0,0 +1,33 @@
1
+ name: Unit Tests
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ pull_request:
8
+ branches:
9
+ - main
10
+
11
+ jobs:
12
+ build:
13
+ strategy:
14
+ matrix:
15
+ os: [windows-latest, ubuntu-latest, macos-latest, macos-12]
16
+ environment: [test]
17
+ runs-on: ${{ matrix.os }}
18
+ steps:
19
+ - uses: actions/checkout@v4
20
+ - uses: prefix-dev/setup-pixi@v0.8.1
21
+ with:
22
+ pixi-version: v0.23.0
23
+ environments: ${{ matrix.environment }}
24
+ frozen: true
25
+
26
+ - name: Run tests
27
+ run: pixi run -e ${{ matrix.environment }} test
28
+
29
+ - name: Upload coverage reports to Codecov
30
+ uses: codecov/codecov-action@v4.6.0
31
+ with:
32
+ token: ${{ secrets.CODECOV_TOKEN }}
33
+ slug: scikit-learn-contrib/path_boost
@@ -0,0 +1,78 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # scikit-learn specific
10
+ doc/_build/
11
+ doc/auto_examples/
12
+ doc/modules/generated/
13
+ doc/datasets/generated/
14
+
15
+ # Distribution / packaging
16
+
17
+ .Python
18
+ env/
19
+ build/
20
+ develop-eggs/
21
+ dist/
22
+ downloads/
23
+ eggs/
24
+ .eggs/
25
+ lib/
26
+ lib64/
27
+ parts/
28
+ sdist/
29
+ var/
30
+ *.egg-info/
31
+ .installed.cfg
32
+ *.egg
33
+
34
+ # PyInstaller
35
+ # Usually these files are written by a python script from a template
36
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
37
+ *.manifest
38
+ *.spec
39
+
40
+ # Installer logs
41
+ pip-log.txt
42
+ pip-delete-this-directory.txt
43
+
44
+ # Unit test / coverage reports
45
+ htmlcov/
46
+ .tox/
47
+ .coverage
48
+ .coverage.*
49
+ .cache
50
+ nosetests.xml
51
+ coverage.xml
52
+ *,cover
53
+ .hypothesis/
54
+
55
+ # Translations
56
+ *.mo
57
+ *.pot
58
+
59
+ # Django stuff:
60
+ *.log
61
+
62
+ # Sphinx documentation
63
+ doc/_build/
64
+ doc/generated/
65
+ doc/sg_execution_times.rst
66
+
67
+ # PyBuilder
68
+ target/
69
+
70
+ .pixi
71
+
72
+ # General
73
+ .DS_Store
74
+ .AppleDouble
75
+ .LSOverride
76
+
77
+ # auto-generated files
78
+ path_boost/_version.py
@@ -0,0 +1,16 @@
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v4.3.0
4
+ hooks:
5
+ - id: check-yaml
6
+ - id: end-of-file-fixer
7
+ - id: trailing-whitespace
8
+ - repo: https://github.com/psf/black
9
+ rev: 23.3.0
10
+ hooks:
11
+ - id: black
12
+ - repo: https://github.com/astral-sh/ruff-pre-commit
13
+ rev: v0.0.272
14
+ hooks:
15
+ - id: ruff
16
+ args: ["--fix", "--show-source"]
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Claudio Meggio
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,174 @@
1
+ Metadata-Version: 2.4
2
+ Name: path_boost
3
+ Version: 2.1.0
4
+ Summary: Interpretable machine learning on graph-structured data using path-based boosting.
5
+ Author-email: Vighnesh Birodkar <vighneshbirodkar@nyu.edu>, Guillaume Lemaitre <g.lemaitre58@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/Claudio-Me/extended_path_boost
8
+ Project-URL: Issues, https://github.com/Claudio-Me/extended_path_boost/issues
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.9
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Operating System :: POSIX
15
+ Classifier: Operating System :: Unix
16
+ Classifier: Operating System :: MacOS
17
+ Classifier: Operating System :: Microsoft :: Windows
18
+ Requires-Python: >=3.9
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: scikit-learn>=1.4.2
22
+ Requires-Dist: networkx>=3.0
23
+ Requires-Dist: pandas>=2.0
24
+ Requires-Dist: numpy>=1.24
25
+ Requires-Dist: matplotlib>=3.5
26
+ Requires-Dist: joblib>=1.2
27
+ Requires-Dist: scipy>=1.10
28
+ Provides-Extra: progress
29
+ Requires-Dist: tqdm>=4.64; extra == "progress"
30
+ Provides-Extra: all
31
+ Requires-Dist: tqdm>=4.64; extra == "all"
32
+ Dynamic: license-file
33
+
34
+ # Path Boost
35
+
36
+ Path Boost is a Python library for interpretable machine learning on graph-structured data. It implements the PathBoost and SequentialPathBoost algorithms, which iteratively construct features based on paths in graphs and use boosting to build predictive models. The library is designed for tasks where input data consists of collections of graphs (e.g., molecules, social networks) and supports variable importance analysis for interpretability.
37
+
38
+ ## Features
39
+
40
+ - **PathBoost**: Ensemble learning over graph paths, partitioned by anchor nodes.
41
+ - **SequentialPathBoost**: Boosting with path-based features, iteratively expanding the feature space.
42
+ - **Variable Importance**: Quantifies the importance of paths/features in prediction.
43
+ - **Parallel Training**: Supports multi-core training for large datasets.
44
+ - **Evaluation and Visualization**: Built-in tools for error tracking and variable importance plotting.
45
+
46
+ ## Installation
47
+
48
+ Install from PyPI:
49
+
50
+ ```bash
51
+ pip install path_boost
52
+ ```
53
+
54
+ ## Usage Example
55
+
56
+ Below is a minimal example using the `PathBoost` model:
57
+
58
+ ```python
59
+ import numpy as np
60
+ import networkx as nx
61
+ from sklearn.model_selection import train_test_split
62
+ from path_boost import PathBoost
63
+ from path_boost.utils.datasets_for_examples.generate_example_dataset import generate_synthetic_graph_dataset
64
+
65
+
66
+
67
+
68
+ if __name__ == "__main__":
69
+
70
+ # Generate synthetic dataset
71
+ nx_graphs, y, true_paths, true_weights = generate_synthetic_graph_dataset()
72
+
73
+
74
+ list_anchor_nodes_labels = [0, 1, 2]
75
+
76
+ parameters_variable_importance: dict = {
77
+ 'criterion': 'absolute',
78
+ 'error_used': 'mse',
79
+ 'use_correlation': False,
80
+ 'normalize': True,
81
+ }
82
+
83
+ X_train, X_test, y_train, y_test = train_test_split(nx_graphs, y, test_size=0.25, random_state=42)
84
+
85
+ eval_set = [(X_test, y_test)]
86
+
87
+ path_boost = PathBoost(
88
+ n_iter=50, # Reduced for quicker example run
89
+ max_path_length=5,
90
+ learning_rate=0.1,
91
+ n_of_cores=1, # Set to >1 for parallel processing if desired
92
+ verbose=True,
93
+ parameters_variable_importance=parameters_variable_importance
94
+ )
95
+
96
+ # Fit the model
97
+ # anchor_nodes_label_name must correspond to the feature storing node types ('feature_0')
98
+ path_boost.fit(
99
+ X=X_train,
100
+ y=y_train,
101
+ eval_set=eval_set,
102
+ list_anchor_nodes_labels=list_anchor_nodes_labels,
103
+ anchor_nodes_label_name="feature_0" # Node types are in 'feature_0'
104
+ )
105
+
106
+ print(f"Generated {len(nx_graphs)} graphs.")
107
+ print(f"Example y values: {y[:5]}")
108
+ print(f"True paths definitions: {true_paths}")
109
+ print(f"True path weights: {true_weights}")
110
+
111
+ path_boost.plot_training_and_eval_errors(skip_first_n_iterations=0, plot_eval_sets_error=True)
112
+ if path_boost.parameters_variable_importance is not None and hasattr(path_boost, 'variable_importance_'):
113
+ path_boost.plot_variable_importance(top_n_features=10)
114
+ else:
115
+ print("Variable importance not computed or available.")
116
+
117
+ print("Example run finished.")
118
+
119
+ ```
120
+
121
+ ## API Overview
122
+
123
+ ### PathBoost
124
+
125
+ - `fit(X, y, anchor_nodes_label_name, list_anchor_nodes_labels, eval_set=None)`
126
+ - `predict(X)`
127
+ - `predict_step_by_step(X)`
128
+ - `evaluate(X, y)`
129
+ - `plot_training_and_eval_errors(skip_first_n_iterations=True)`
130
+ - `plot_variable_importance()`
131
+ - **Attributes:**
132
+ - `train_mse_`: Training error (MSE) at each iteration
133
+ - `mse_eval_set_`: Evaluation set error (MSE) at each iteration (if `eval_set` is provided)
134
+ - `variable_importance_`: Variable/path importance scores (if enabled)
135
+ - `is_fitted_`: Whether the model is fitted
136
+ - `models_list_`: List of fitted SequentialPathBoost models (one per anchor node)
137
+ - (Each SequentialPathBoost in `models_list_` exposes the attributes below)
138
+
139
+ ### SequentialPathBoost
140
+
141
+ - `fit(X, y, list_anchor_nodes_labels, name_of_label_attribute, eval_set=None)`
142
+ - `predict(X)`
143
+ - `predict_step_by_step(X)`
144
+ - `evaluate(X, y)`
145
+ - `plot_training_and_eval_errors(skip_first_n_iterations=True)`
146
+ - `plot_variable_importance()`
147
+ - **Attributes:**
148
+ - `train_mse_`: Training error (MSE) at each iteration
149
+ - `train_mae_`: Training MAE at each iteration
150
+ - `eval_sets_mse_`: Evaluation set error (MSE) at each iteration (if `eval_set` is provided)
151
+ - `eval_sets_mae_`: Evaluation set MAE at each iteration (if `eval_set` is provided)
152
+ - `variable_importance_`: Variable/path importance scores (if enabled)
153
+ - `paths_selected_by_epb_`: Set of selected paths during boosting
154
+ - `columns_names_`: Names of EBM columns/features used
155
+ - `is_fitted_`: Whether the model is fitted
156
+
157
+ ## Requirements
158
+
159
+ - Python 3.9+
160
+ - numpy
161
+ - pandas
162
+ - scikit-learn
163
+ - networkx
164
+ - matplotlib
165
+
166
+ (See `requirements.txt` for the full list.)
167
+
168
+ ## Citation
169
+
170
+ If you use this library in your research, please cite the corresponding paper (add citation here).
171
+
172
+ ## License
173
+
174
+ MIT License
@@ -0,0 +1,141 @@
1
+ # Path Boost
2
+
3
+ Path Boost is a Python library for interpretable machine learning on graph-structured data. It implements the PathBoost and SequentialPathBoost algorithms, which iteratively construct features based on paths in graphs and use boosting to build predictive models. The library is designed for tasks where input data consists of collections of graphs (e.g., molecules, social networks) and supports variable importance analysis for interpretability.
4
+
5
+ ## Features
6
+
7
+ - **PathBoost**: Ensemble learning over graph paths, partitioned by anchor nodes.
8
+ - **SequentialPathBoost**: Boosting with path-based features, iteratively expanding the feature space.
9
+ - **Variable Importance**: Quantifies the importance of paths/features in prediction.
10
+ - **Parallel Training**: Supports multi-core training for large datasets.
11
+ - **Evaluation and Visualization**: Built-in tools for error tracking and variable importance plotting.
12
+
13
+ ## Installation
14
+
15
+ Install from PyPI:
16
+
17
+ ```bash
18
+ pip install path_boost
19
+ ```
20
+
21
+ ## Usage Example
22
+
23
+ Below is a minimal example using the `PathBoost` model:
24
+
25
+ ```python
26
+ import numpy as np
27
+ import networkx as nx
28
+ from sklearn.model_selection import train_test_split
29
+ from path_boost import PathBoost
30
+ from path_boost.utils.datasets_for_examples.generate_example_dataset import generate_synthetic_graph_dataset
31
+
32
+
33
+
34
+
35
+ if __name__ == "__main__":
36
+
37
+ # Generate synthetic dataset
38
+ nx_graphs, y, true_paths, true_weights = generate_synthetic_graph_dataset()
39
+
40
+
41
+ list_anchor_nodes_labels = [0, 1, 2]
42
+
43
+ parameters_variable_importance: dict = {
44
+ 'criterion': 'absolute',
45
+ 'error_used': 'mse',
46
+ 'use_correlation': False,
47
+ 'normalize': True,
48
+ }
49
+
50
+ X_train, X_test, y_train, y_test = train_test_split(nx_graphs, y, test_size=0.25, random_state=42)
51
+
52
+ eval_set = [(X_test, y_test)]
53
+
54
+ path_boost = PathBoost(
55
+ n_iter=50, # Reduced for quicker example run
56
+ max_path_length=5,
57
+ learning_rate=0.1,
58
+ n_of_cores=1, # Set to >1 for parallel processing if desired
59
+ verbose=True,
60
+ parameters_variable_importance=parameters_variable_importance
61
+ )
62
+
63
+ # Fit the model
64
+ # anchor_nodes_label_name must correspond to the feature storing node types ('feature_0')
65
+ path_boost.fit(
66
+ X=X_train,
67
+ y=y_train,
68
+ eval_set=eval_set,
69
+ list_anchor_nodes_labels=list_anchor_nodes_labels,
70
+ anchor_nodes_label_name="feature_0" # Node types are in 'feature_0'
71
+ )
72
+
73
+ print(f"Generated {len(nx_graphs)} graphs.")
74
+ print(f"Example y values: {y[:5]}")
75
+ print(f"True paths definitions: {true_paths}")
76
+ print(f"True path weights: {true_weights}")
77
+
78
+ path_boost.plot_training_and_eval_errors(skip_first_n_iterations=0, plot_eval_sets_error=True)
79
+ if path_boost.parameters_variable_importance is not None and hasattr(path_boost, 'variable_importance_'):
80
+ path_boost.plot_variable_importance(top_n_features=10)
81
+ else:
82
+ print("Variable importance not computed or available.")
83
+
84
+ print("Example run finished.")
85
+
86
+ ```
87
+
88
+ ## API Overview
89
+
90
+ ### PathBoost
91
+
92
+ - `fit(X, y, anchor_nodes_label_name, list_anchor_nodes_labels, eval_set=None)`
93
+ - `predict(X)`
94
+ - `predict_step_by_step(X)`
95
+ - `evaluate(X, y)`
96
+ - `plot_training_and_eval_errors(skip_first_n_iterations=True)`
97
+ - `plot_variable_importance()`
98
+ - **Attributes:**
99
+ - `train_mse_`: Training error (MSE) at each iteration
100
+ - `mse_eval_set_`: Evaluation set error (MSE) at each iteration (if `eval_set` is provided)
101
+ - `variable_importance_`: Variable/path importance scores (if enabled)
102
+ - `is_fitted_`: Whether the model is fitted
103
+ - `models_list_`: List of fitted SequentialPathBoost models (one per anchor node)
104
+ - (Each SequentialPathBoost in `models_list_` exposes the attributes below)
105
+
106
+ ### SequentialPathBoost
107
+
108
+ - `fit(X, y, list_anchor_nodes_labels, name_of_label_attribute, eval_set=None)`
109
+ - `predict(X)`
110
+ - `predict_step_by_step(X)`
111
+ - `evaluate(X, y)`
112
+ - `plot_training_and_eval_errors(skip_first_n_iterations=True)`
113
+ - `plot_variable_importance()`
114
+ - **Attributes:**
115
+ - `train_mse_`: Training error (MSE) at each iteration
116
+ - `train_mae_`: Training MAE at each iteration
117
+ - `eval_sets_mse_`: Evaluation set error (MSE) at each iteration (if `eval_set` is provided)
118
+ - `eval_sets_mae_`: Evaluation set MAE at each iteration (if `eval_set` is provided)
119
+ - `variable_importance_`: Variable/path importance scores (if enabled)
120
+ - `paths_selected_by_epb_`: Set of selected paths during boosting
121
+ - `columns_names_`: Names of EBM columns/features used
122
+ - `is_fitted_`: Whether the model is fitted
123
+
124
+ ## Requirements
125
+
126
+ - Python 3.9+
127
+ - numpy
128
+ - pandas
129
+ - scikit-learn
130
+ - networkx
131
+ - matplotlib
132
+
133
+ (See `requirements.txt` for the full list.)
134
+
135
+ ## Citation
136
+
137
+ If you use this library in your research, please cite the corresponding paper (add citation here).
138
+
139
+ ## License
140
+
141
+ MIT License