nomad-hpc 1.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nomad_hpc-1.2.4/LICENSE +25 -0
- nomad_hpc-1.2.4/PKG-INFO +232 -0
- nomad_hpc-1.2.4/README.md +181 -0
- nomad_hpc-1.2.4/nomad/__init__.py +2 -0
- nomad_hpc-1.2.4/nomad/alerts/__init__.py +25 -0
- nomad_hpc-1.2.4/nomad/alerts/backends.py +264 -0
- nomad_hpc-1.2.4/nomad/alerts/dispatcher.py +252 -0
- nomad_hpc-1.2.4/nomad/alerts/thresholds.py +437 -0
- nomad_hpc-1.2.4/nomad/analysis/__init__.py +31 -0
- nomad_hpc-1.2.4/nomad/analysis/derivatives.py +531 -0
- nomad_hpc-1.2.4/nomad/analysis/similarity.py +571 -0
- nomad_hpc-1.2.4/nomad/cli.py +3519 -0
- nomad_hpc-1.2.4/nomad/collectors/__init__.py +49 -0
- nomad_hpc-1.2.4/nomad/collectors/base.py +331 -0
- nomad_hpc-1.2.4/nomad/collectors/disk.py +428 -0
- nomad_hpc-1.2.4/nomad/collectors/gpu.py +264 -0
- nomad_hpc-1.2.4/nomad/collectors/groups.py +470 -0
- nomad_hpc-1.2.4/nomad/collectors/interactive.py +345 -0
- nomad_hpc-1.2.4/nomad/collectors/iostat.py +347 -0
- nomad_hpc-1.2.4/nomad/collectors/job_metrics.py +565 -0
- nomad_hpc-1.2.4/nomad/collectors/mpstat.py +420 -0
- nomad_hpc-1.2.4/nomad/collectors/network_perf.py +628 -0
- nomad_hpc-1.2.4/nomad/collectors/nfs.py +274 -0
- nomad_hpc-1.2.4/nomad/collectors/node_state.py +321 -0
- nomad_hpc-1.2.4/nomad/collectors/slurm.py +755 -0
- nomad_hpc-1.2.4/nomad/collectors/slurm_legacy.py +492 -0
- nomad_hpc-1.2.4/nomad/collectors/storage.py +553 -0
- nomad_hpc-1.2.4/nomad/collectors/vmstat.py +260 -0
- nomad_hpc-1.2.4/nomad/collectors/workstation.py +475 -0
- nomad_hpc-1.2.4/nomad/community.py +473 -0
- nomad_hpc-1.2.4/nomad/config/__init__.py +21 -0
- nomad_hpc-1.2.4/nomad/config/default.toml +146 -0
- nomad_hpc-1.2.4/nomad/db/__init__.py +13 -0
- nomad_hpc-1.2.4/nomad/db/migrations.py +154 -0
- nomad_hpc-1.2.4/nomad/db/queries.py +325 -0
- nomad_hpc-1.2.4/nomad/db/schema.sql +700 -0
- nomad_hpc-1.2.4/nomad/demo.py +881 -0
- nomad_hpc-1.2.4/nomad/diag/__init__.py +25 -0
- nomad_hpc-1.2.4/nomad/diag/base.py +289 -0
- nomad_hpc-1.2.4/nomad/diag/network.py +593 -0
- nomad_hpc-1.2.4/nomad/diag/node.py +413 -0
- nomad_hpc-1.2.4/nomad/diag/storage.py +551 -0
- nomad_hpc-1.2.4/nomad/diag/workstation.py +516 -0
- nomad_hpc-1.2.4/nomad/edu/__init__.py +29 -0
- nomad_hpc-1.2.4/nomad/edu/explain.py +422 -0
- nomad_hpc-1.2.4/nomad/edu/progress.py +527 -0
- nomad_hpc-1.2.4/nomad/edu/scoring.py +528 -0
- nomad_hpc-1.2.4/nomad/edu/storage.py +254 -0
- nomad_hpc-1.2.4/nomad/hooks/__init__.py +3 -0
- nomad_hpc-1.2.4/nomad/hooks/prolog.py +236 -0
- nomad_hpc-1.2.4/nomad/install.py +520 -0
- nomad_hpc-1.2.4/nomad/ml/__init__.py +58 -0
- nomad_hpc-1.2.4/nomad/ml/autoencoder.py +394 -0
- nomad_hpc-1.2.4/nomad/ml/continuous.py +333 -0
- nomad_hpc-1.2.4/nomad/ml/ensemble.py +454 -0
- nomad_hpc-1.2.4/nomad/ml/estimator.py +521 -0
- nomad_hpc-1.2.4/nomad/ml/gnn.py +420 -0
- nomad_hpc-1.2.4/nomad/ml/gnn_torch.py +377 -0
- nomad_hpc-1.2.4/nomad/ml/lstm.py +424 -0
- nomad_hpc-1.2.4/nomad/ml/persistence.py +245 -0
- nomad_hpc-1.2.4/nomad/monitors/__init__.py +15 -0
- nomad_hpc-1.2.4/nomad/monitors/job_monitor.py +562 -0
- nomad_hpc-1.2.4/nomad/patching/__init__.py +315 -0
- nomad_hpc-1.2.4/nomad/prediction/__init__.py +2 -0
- nomad_hpc-1.2.4/nomad/testing/__init__.py +313 -0
- nomad_hpc-1.2.4/nomad/viz/__init__.py +8 -0
- nomad_hpc-1.2.4/nomad/viz/dashboard.py +4552 -0
- nomad_hpc-1.2.4/nomad/viz/server.py +6086 -0
- nomad_hpc-1.2.4/nomad/viz/static/logo.svg +39 -0
- nomad_hpc-1.2.4/nomad_hpc.egg-info/PKG-INFO +232 -0
- nomad_hpc-1.2.4/nomad_hpc.egg-info/SOURCES.txt +77 -0
- nomad_hpc-1.2.4/nomad_hpc.egg-info/dependency_links.txt +1 -0
- nomad_hpc-1.2.4/nomad_hpc.egg-info/entry_points.txt +2 -0
- nomad_hpc-1.2.4/nomad_hpc.egg-info/requires.txt +26 -0
- nomad_hpc-1.2.4/nomad_hpc.egg-info/top_level.txt +1 -0
- nomad_hpc-1.2.4/pyproject.toml +140 -0
- nomad_hpc-1.2.4/setup.cfg +4 -0
- nomad_hpc-1.2.4/tests/test_disk_and_derivatives.py +316 -0
- nomad_hpc-1.2.4/tests/test_edu.py +289 -0
nomad_hpc-1.2.4/LICENSE
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
GNU AFFERO GENERAL PUBLIC LICENSE
|
|
2
|
+
Version 3, 19 November 2007
|
|
3
|
+
|
|
4
|
+
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
|
|
5
|
+
Everyone is permitted to copy and distribute verbatim copies
|
|
6
|
+
of this license document, but changing it is not allowed.
|
|
7
|
+
|
|
8
|
+
Preamble
|
|
9
|
+
|
|
10
|
+
The GNU Affero General Public License is a free, copyleft license for
|
|
11
|
+
software and other kinds of works, specifically designed to ensure
|
|
12
|
+
cooperation with the community in the case of network server software.
|
|
13
|
+
|
|
14
|
+
[Full AGPL v3 text would go here - abbreviated for file size]
|
|
15
|
+
|
|
16
|
+
For the complete license text, see: https://www.gnu.org/licenses/agpl-3.0.txt
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
COMMERCIAL LICENSE
|
|
21
|
+
|
|
22
|
+
For proprietary/commercial use of NØMAD without the AGPL v3 requirements,
|
|
23
|
+
a commercial license is available. Contact jtonini@richmond.edu for details.
|
|
24
|
+
|
|
25
|
+
Academic and educational use is free under the AGPL v3.
|
nomad_hpc-1.2.4/PKG-INFO
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nomad-hpc
|
|
3
|
+
Version: 1.2.4
|
|
4
|
+
Summary: A lightweight HPC monitoring and predictive analytics tool
|
|
5
|
+
Author-email: Joao Tonini <jtonini@richmond.edu>
|
|
6
|
+
Maintainer-email: Joao Tonini <jtonini@richmond.edu>
|
|
7
|
+
License-Expression: AGPL-3.0-or-later
|
|
8
|
+
Project-URL: Homepage, https://nomad-hpc.com
|
|
9
|
+
Project-URL: Documentation, https://jtonini.github.io/nomad-hpc/
|
|
10
|
+
Project-URL: Repository, https://github.com/jtonini/nomad-hpc
|
|
11
|
+
Project-URL: Issues, https://github.com/jtonini/nomad-hpc/issues
|
|
12
|
+
Keywords: hpc,monitoring,slurm,cluster,predictive-analytics,machine-learning,anomaly-detection,graph-neural-network
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Environment :: Console
|
|
15
|
+
Classifier: Intended Audience :: System Administrators
|
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
|
17
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Topic :: System :: Monitoring
|
|
24
|
+
Classifier: Topic :: System :: Systems Administration
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering
|
|
26
|
+
Requires-Python: >=3.9
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
License-File: LICENSE
|
|
29
|
+
Requires-Dist: click>=8.0
|
|
30
|
+
Requires-Dist: toml>=0.10
|
|
31
|
+
Requires-Dist: numpy>=1.21
|
|
32
|
+
Requires-Dist: pandas>=1.3
|
|
33
|
+
Requires-Dist: scipy>=1.7
|
|
34
|
+
Provides-Extra: ml
|
|
35
|
+
Requires-Dist: scikit-learn>=1.0; extra == "ml"
|
|
36
|
+
Requires-Dist: torch>=2.0; extra == "ml"
|
|
37
|
+
Requires-Dist: torch-geometric>=2.0; extra == "ml"
|
|
38
|
+
Provides-Extra: dashboard
|
|
39
|
+
Requires-Dist: jinja2>=3.0; extra == "dashboard"
|
|
40
|
+
Provides-Extra: alerts
|
|
41
|
+
Provides-Extra: all
|
|
42
|
+
Requires-Dist: nomad-hpc[dashboard,ml]; extra == "all"
|
|
43
|
+
Provides-Extra: dev
|
|
44
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
45
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
46
|
+
Requires-Dist: ruff>=0.1; extra == "dev"
|
|
47
|
+
Requires-Dist: black>=23.0; extra == "dev"
|
|
48
|
+
Requires-Dist: mypy>=1.0; extra == "dev"
|
|
49
|
+
Requires-Dist: pre-commit>=3.0; extra == "dev"
|
|
50
|
+
Dynamic: license-file
|
|
51
|
+
|
|
52
|
+
# NØMAD-HPC
|
|
53
|
+
|
|
54
|
+
**NØde Monitoring And Diagnostics** — Lightweight HPC monitoring, visualization, and predictive analytics.
|
|
55
|
+
|
|
56
|
+
> *"Travels light, adapts to its environment, and doesn't need permanent infrastructure."*
|
|
57
|
+
|
|
58
|
+
[PyPI](https://pypi.org/project/nomad-hpc/)
|
|
59
|
+
[License: AGPL v3](https://www.gnu.org/licenses/agpl-3.0)
|
|
60
|
+
[Python 3.9+](https://www.python.org/downloads/)
|
|
61
|
+
[DOI: 10.5281/zenodo.18614517](https://doi.org/10.5281/zenodo.18614517)
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
📖 **[Full Documentation](https://jtonini.github.io/nomad-hpc/)** — Installation guides, configuration, CLI reference, network methodology, ML framework, and more.
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## Quick Start
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
pip install nomad-hpc
|
|
73
|
+
nomad demo # Try with synthetic data
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
For production:
|
|
77
|
+
```bash
|
|
78
|
+
nomad init # Configure for your cluster
|
|
79
|
+
nomad collect # Start data collection
|
|
80
|
+
nomad dashboard # Launch web interface
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## Features
|
|
86
|
+
|
|
87
|
+
| Feature | Description | Command |
|
|
88
|
+
|---------|-------------|---------|
|
|
89
|
+
| **Dashboard** | Real-time multi-cluster monitoring with partition views | `nomad dashboard` |
|
|
90
|
+
| **Educational Analytics** | Track computational proficiency development | `nomad edu explain <job>` |
|
|
91
|
+
| **Alerts** | Threshold + predictive alerts (email, Slack, webhook) | `nomad alerts` |
|
|
92
|
+
| **ML Prediction** | Job failure prediction using similarity networks | `nomad predict` |
|
|
93
|
+
| **Community Export** | Anonymized datasets for cross-institutional research | `nomad community export` |
|
|
94
|
+
| **Interactive Sessions** | Monitor RStudio/Jupyter sessions | `nomad report-interactive` |
|
|
95
|
+
| **Derivative Analysis** | Detect accelerating trends before thresholds | Built into alerts |
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## Architecture
|
|
100
|
+
|
|
101
|
+
```
|
|
102
|
+
┌────────────────────────────────────────────────────────────┐
|
|
103
|
+
│ NØMAD │
|
|
104
|
+
├──────────────┬──────────────┬──────────────┬───────────────┤
|
|
105
|
+
│ Collectors │ Analysis │ Viz │ Alerts │
|
|
106
|
+
├──────────────┼──────────────┼──────────────┼───────────────┤
|
|
107
|
+
│ disk │ derivatives │ dashboard │ thresholds │
|
|
108
|
+
│ iostat │ similarity │ network 3D │ predictive │
|
|
109
|
+
│ slurm │ ML ensemble │ partitions │ email/slack │
|
|
110
|
+
│ gpu │ edu scoring │ edu views │ webhooks │
|
|
111
|
+
│ nfs │ │ │ │
|
|
112
|
+
└──────────────┴──────────────┴──────────────┴───────────────┘
|
|
113
|
+
│
|
|
114
|
+
┌─────────┴─────────┐
|
|
115
|
+
│ SQLite Database │
|
|
116
|
+
└───────────────────┘
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## CLI Reference
|
|
122
|
+
|
|
123
|
+
### Core Commands
|
|
124
|
+
```bash
|
|
125
|
+
nomad init # Setup wizard
|
|
126
|
+
nomad collect # Start collectors
|
|
127
|
+
nomad dashboard # Web interface
|
|
128
|
+
nomad demo # Demo mode
|
|
129
|
+
nomad status # System status
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### Educational Analytics
|
|
133
|
+
```bash
|
|
134
|
+
nomad edu explain <job_id> # Job analysis with recommendations
|
|
135
|
+
nomad edu trajectory <user> # User proficiency over time
|
|
136
|
+
nomad edu report <group> # Course/group report
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### Analysis & Prediction
|
|
140
|
+
```bash
|
|
141
|
+
nomad disk /path # Filesystem trends
|
|
142
|
+
nomad jobs --user <user> # Job history
|
|
143
|
+
nomad similarity # Network analysis
|
|
144
|
+
nomad train # Train ML models
|
|
145
|
+
nomad predict # Run predictions
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### Community & Alerts
|
|
149
|
+
```bash
|
|
150
|
+
nomad community export # Export anonymized data
|
|
151
|
+
nomad community preview # Preview export
|
|
152
|
+
nomad alerts # View alerts
|
|
153
|
+
nomad alerts --unresolved # Unresolved only
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## Installation
|
|
159
|
+
|
|
160
|
+
### From PyPI
|
|
161
|
+
```bash
|
|
162
|
+
pip install nomad-hpc
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
### From Source
|
|
166
|
+
```bash
|
|
167
|
+
git clone https://github.com/jtonini/nomad-hpc
|
|
168
|
+
cd nomad-hpc && pip install -e .
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### Requirements
|
|
172
|
+
- Python 3.9+
|
|
173
|
+
- SQLite 3.35+
|
|
174
|
+
- sysstat package (`iostat`, `mpstat`)
|
|
175
|
+
- Optional: SLURM, nvidia-smi, nfsiostat
|
|
176
|
+
|
|
177
|
+
### System Check
|
|
178
|
+
```bash
|
|
179
|
+
nomad syscheck
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## Documentation
|
|
185
|
+
|
|
186
|
+
📖 **[jtonini.github.io/nomad-hpc](https://jtonini.github.io/nomad-hpc/)**
|
|
187
|
+
|
|
188
|
+
- [Installation & Configuration](https://jtonini.github.io/nomad-hpc/installation/)
|
|
189
|
+
- [System Install (`--system`)](https://jtonini.github.io/nomad-hpc/system-install/)
|
|
190
|
+
- [Dashboard Guide](https://jtonini.github.io/nomad-hpc/dashboard/)
|
|
191
|
+
- [Educational Analytics](https://jtonini.github.io/nomad-hpc/edu/)
|
|
192
|
+
- [Network Methodology](https://jtonini.github.io/nomad-hpc/network/)
|
|
193
|
+
- [ML Framework](https://jtonini.github.io/nomad-hpc/ml/)
|
|
194
|
+
- [Proficiency Scoring](https://jtonini.github.io/nomad-hpc/proficiency/)
|
|
195
|
+
- [CLI Reference](https://jtonini.github.io/nomad-hpc/cli/)
|
|
196
|
+
- [Configuration Options](https://jtonini.github.io/nomad-hpc/config/)
|
|
197
|
+
|
|
198
|
+
---
|
|
199
|
+
|
|
200
|
+
## License
|
|
201
|
+
|
|
202
|
+
Dual-licensed:
|
|
203
|
+
- **AGPL v3** — Free for academic, educational, and open-source use
|
|
204
|
+
- **Commercial License** — Available for proprietary deployments
|
|
205
|
+
|
|
206
|
+
---
|
|
207
|
+
|
|
208
|
+
## Citation
|
|
209
|
+
|
|
210
|
+
```bibtex
|
|
211
|
+
@software{nomad2026,
|
|
212
|
+
author = {Tonini, João Filipe Riva},
|
|
213
|
+
title = {NØMAD: Lightweight HPC Monitoring with Machine Learning-Based Failure Prediction},
|
|
214
|
+
year = {2026},
|
|
215
|
+
url = {https://github.com/jtonini/nomad-hpc},
|
|
216
|
+
doi = {10.5281/zenodo.18614517}
|
|
217
|
+
}
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
---
|
|
221
|
+
|
|
222
|
+
## Contributing
|
|
223
|
+
|
|
224
|
+
See [CONTRIBUTING.md](docs/CONTRIBUTING.md) for guidelines.
|
|
225
|
+
|
|
226
|
+
---
|
|
227
|
+
|
|
228
|
+
## Contact
|
|
229
|
+
|
|
230
|
+
- **Author**: João Tonini
|
|
231
|
+
- **Email**: jtonini@richmond.edu
|
|
232
|
+
- **Issues**: [GitHub Issues](https://github.com/jtonini/nomad-hpc/issues)
|
nomad_hpc-1.2.4/README.md
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# NØMAD-HPC
|
|
2
|
+
|
|
3
|
+
**NØde Monitoring And Diagnostics** — Lightweight HPC monitoring, visualization, and predictive analytics.
|
|
4
|
+
|
|
5
|
+
> *"Travels light, adapts to its environment, and doesn't need permanent infrastructure."*
|
|
6
|
+
|
|
7
|
+
[PyPI](https://pypi.org/project/nomad-hpc/)
|
|
8
|
+
[License: AGPL v3](https://www.gnu.org/licenses/agpl-3.0)
|
|
9
|
+
[Python 3.9+](https://www.python.org/downloads/)
|
|
10
|
+
[DOI: 10.5281/zenodo.18614517](https://doi.org/10.5281/zenodo.18614517)
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
📖 **[Full Documentation](https://jtonini.github.io/nomad-hpc/)** — Installation guides, configuration, CLI reference, network methodology, ML framework, and more.
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## Quick Start
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install nomad-hpc
|
|
22
|
+
nomad demo # Try with synthetic data
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
For production:
|
|
26
|
+
```bash
|
|
27
|
+
nomad init # Configure for your cluster
|
|
28
|
+
nomad collect # Start data collection
|
|
29
|
+
nomad dashboard # Launch web interface
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## Features
|
|
35
|
+
|
|
36
|
+
| Feature | Description | Command |
|
|
37
|
+
|---------|-------------|---------|
|
|
38
|
+
| **Dashboard** | Real-time multi-cluster monitoring with partition views | `nomad dashboard` |
|
|
39
|
+
| **Educational Analytics** | Track computational proficiency development | `nomad edu explain <job>` |
|
|
40
|
+
| **Alerts** | Threshold + predictive alerts (email, Slack, webhook) | `nomad alerts` |
|
|
41
|
+
| **ML Prediction** | Job failure prediction using similarity networks | `nomad predict` |
|
|
42
|
+
| **Community Export** | Anonymized datasets for cross-institutional research | `nomad community export` |
|
|
43
|
+
| **Interactive Sessions** | Monitor RStudio/Jupyter sessions | `nomad report-interactive` |
|
|
44
|
+
| **Derivative Analysis** | Detect accelerating trends before thresholds | Built into alerts |
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## Architecture
|
|
49
|
+
|
|
50
|
+
```
|
|
51
|
+
┌────────────────────────────────────────────────────────────┐
|
|
52
|
+
│ NØMAD │
|
|
53
|
+
├──────────────┬──────────────┬──────────────┬───────────────┤
|
|
54
|
+
│ Collectors │ Analysis │ Viz │ Alerts │
|
|
55
|
+
├──────────────┼──────────────┼──────────────┼───────────────┤
|
|
56
|
+
│ disk │ derivatives │ dashboard │ thresholds │
|
|
57
|
+
│ iostat │ similarity │ network 3D │ predictive │
|
|
58
|
+
│ slurm │ ML ensemble │ partitions │ email/slack │
|
|
59
|
+
│ gpu │ edu scoring │ edu views │ webhooks │
|
|
60
|
+
│ nfs │ │ │ │
|
|
61
|
+
└──────────────┴──────────────┴──────────────┴───────────────┘
|
|
62
|
+
│
|
|
63
|
+
┌─────────┴─────────┐
|
|
64
|
+
│ SQLite Database │
|
|
65
|
+
└───────────────────┘
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## CLI Reference
|
|
71
|
+
|
|
72
|
+
### Core Commands
|
|
73
|
+
```bash
|
|
74
|
+
nomad init # Setup wizard
|
|
75
|
+
nomad collect # Start collectors
|
|
76
|
+
nomad dashboard # Web interface
|
|
77
|
+
nomad demo # Demo mode
|
|
78
|
+
nomad status # System status
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Educational Analytics
|
|
82
|
+
```bash
|
|
83
|
+
nomad edu explain <job_id> # Job analysis with recommendations
|
|
84
|
+
nomad edu trajectory <user> # User proficiency over time
|
|
85
|
+
nomad edu report <group> # Course/group report
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Analysis & Prediction
|
|
89
|
+
```bash
|
|
90
|
+
nomad disk /path # Filesystem trends
|
|
91
|
+
nomad jobs --user <user> # Job history
|
|
92
|
+
nomad similarity # Network analysis
|
|
93
|
+
nomad train # Train ML models
|
|
94
|
+
nomad predict # Run predictions
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Community & Alerts
|
|
98
|
+
```bash
|
|
99
|
+
nomad community export # Export anonymized data
|
|
100
|
+
nomad community preview # Preview export
|
|
101
|
+
nomad alerts # View alerts
|
|
102
|
+
nomad alerts --unresolved # Unresolved only
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
|
|
107
|
+
## Installation
|
|
108
|
+
|
|
109
|
+
### From PyPI
|
|
110
|
+
```bash
|
|
111
|
+
pip install nomad-hpc
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### From Source
|
|
115
|
+
```bash
|
|
116
|
+
git clone https://github.com/jtonini/nomad-hpc
|
|
117
|
+
cd nomad-hpc && pip install -e .
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### Requirements
|
|
121
|
+
- Python 3.9+
|
|
122
|
+
- SQLite 3.35+
|
|
123
|
+
- sysstat package (`iostat`, `mpstat`)
|
|
124
|
+
- Optional: SLURM, nvidia-smi, nfsiostat
|
|
125
|
+
|
|
126
|
+
### System Check
|
|
127
|
+
```bash
|
|
128
|
+
nomad syscheck
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
---
|
|
132
|
+
|
|
133
|
+
## Documentation
|
|
134
|
+
|
|
135
|
+
📖 **[jtonini.github.io/nomad-hpc](https://jtonini.github.io/nomad-hpc/)**
|
|
136
|
+
|
|
137
|
+
- [Installation & Configuration](https://jtonini.github.io/nomad-hpc/installation/)
|
|
138
|
+
- [System Install (`--system`)](https://jtonini.github.io/nomad-hpc/system-install/)
|
|
139
|
+
- [Dashboard Guide](https://jtonini.github.io/nomad-hpc/dashboard/)
|
|
140
|
+
- [Educational Analytics](https://jtonini.github.io/nomad-hpc/edu/)
|
|
141
|
+
- [Network Methodology](https://jtonini.github.io/nomad-hpc/network/)
|
|
142
|
+
- [ML Framework](https://jtonini.github.io/nomad-hpc/ml/)
|
|
143
|
+
- [Proficiency Scoring](https://jtonini.github.io/nomad-hpc/proficiency/)
|
|
144
|
+
- [CLI Reference](https://jtonini.github.io/nomad-hpc/cli/)
|
|
145
|
+
- [Configuration Options](https://jtonini.github.io/nomad-hpc/config/)
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## License
|
|
150
|
+
|
|
151
|
+
Dual-licensed:
|
|
152
|
+
- **AGPL v3** — Free for academic, educational, and open-source use
|
|
153
|
+
- **Commercial License** — Available for proprietary deployments
|
|
154
|
+
|
|
155
|
+
---
|
|
156
|
+
|
|
157
|
+
## Citation
|
|
158
|
+
|
|
159
|
+
```bibtex
|
|
160
|
+
@software{nomad2026,
|
|
161
|
+
author = {Tonini, João Filipe Riva},
|
|
162
|
+
title = {NØMAD: Lightweight HPC Monitoring with Machine Learning-Based Failure Prediction},
|
|
163
|
+
year = {2026},
|
|
164
|
+
url = {https://github.com/jtonini/nomad-hpc},
|
|
165
|
+
doi = {10.5281/zenodo.18614517}
|
|
166
|
+
}
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
---
|
|
170
|
+
|
|
171
|
+
## Contributing
|
|
172
|
+
|
|
173
|
+
See [CONTRIBUTING.md](docs/CONTRIBUTING.md) for guidelines.
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## Contact
|
|
178
|
+
|
|
179
|
+
- **Author**: João Tonini
|
|
180
|
+
- **Email**: jtonini@richmond.edu
|
|
181
|
+
- **Issues**: [GitHub Issues](https://github.com/jtonini/nomad-hpc/issues)
|
nomad_hpc-1.2.4/nomad/alerts/__init__.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 João Tonini
|
|
3
|
+
"""NOMADE Alert System - Detection, Storage, and Dispatch."""
|
|
4
|
+
|
|
5
|
+
from .dispatcher import AlertDispatcher, send_alert, init_dispatcher, get_dispatcher
|
|
6
|
+
from .backends import EmailBackend, SlackBackend, WebhookBackend
|
|
7
|
+
from .thresholds import (
|
|
8
|
+
ThresholdChecker, check_and_alert, DEFAULT_THRESHOLDS,
|
|
9
|
+
PredictiveChecker, check_disk_prediction
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
'AlertDispatcher',
|
|
14
|
+
'send_alert',
|
|
15
|
+
'init_dispatcher',
|
|
16
|
+
'get_dispatcher',
|
|
17
|
+
'EmailBackend',
|
|
18
|
+
'SlackBackend',
|
|
19
|
+
'WebhookBackend',
|
|
20
|
+
'ThresholdChecker',
|
|
21
|
+
'check_and_alert',
|
|
22
|
+
'DEFAULT_THRESHOLDS',
|
|
23
|
+
'PredictiveChecker',
|
|
24
|
+
'check_disk_prediction'
|
|
25
|
+
]
|