fournex 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fournex-0.1.1/PKG-INFO +91 -0
- fournex-0.1.1/README.md +65 -0
- fournex-0.1.1/fournex/__init__.py +168 -0
- fournex-0.1.1/fournex/__main__.py +7 -0
- fournex-0.1.1/fournex/_native.py +9 -0
- fournex-0.1.1/fournex/analysis.py +825 -0
- fournex-0.1.1/fournex/autopilot/__init__.py +4 -0
- fournex-0.1.1/fournex/autopilot/actions.py +134 -0
- fournex-0.1.1/fournex/autopilot/benchmark.py +164 -0
- fournex-0.1.1/fournex/autopilot/comparison.py +196 -0
- fournex-0.1.1/fournex/autopilot/guards.py +63 -0
- fournex-0.1.1/fournex/autopilot/local_executor.py +356 -0
- fournex-0.1.1/fournex/autopilot/quality.py +122 -0
- fournex-0.1.1/fournex/autopilot/report.py +154 -0
- fournex-0.1.1/fournex/autopilot/runner.py +519 -0
- fournex-0.1.1/fournex/autopilot/safety.py +122 -0
- fournex-0.1.1/fournex/autopilot/tuners/__init__.py +165 -0
- fournex-0.1.1/fournex/autopilot/tuners/batch_size.py +63 -0
- fournex-0.1.1/fournex/autopilot/tuners/dataloader.py +113 -0
- fournex-0.1.1/fournex/autopilot/tuners/memory.py +43 -0
- fournex-0.1.1/fournex/autopilot/tuners/mixed_precision.py +86 -0
- fournex-0.1.1/fournex/autopilot/tuners/runtime.py +82 -0
- fournex-0.1.1/fournex/cli.py +1273 -0
- fournex-0.1.1/fournex/common_ir.py +254 -0
- fournex-0.1.1/fournex/common_ir_analysis.py +241 -0
- fournex-0.1.1/fournex/common_ir_validators.py +44 -0
- fournex-0.1.1/fournex/cuda_timers.py +191 -0
- fournex-0.1.1/fournex/data_pipeline_ir.py +144 -0
- fournex-0.1.1/fournex/dataloader.py +115 -0
- fournex-0.1.1/fournex/distributed_ir.py +165 -0
- fournex-0.1.1/fournex/nvml_ir.py +157 -0
- fournex-0.1.1/fournex/profiler.py +199 -0
- fournex-0.1.1/fournex/pytorch_profiler_ir.py +256 -0
- fournex-0.1.1/fournex/recommendations/__init__.py +4 -0
- fournex-0.1.1/fournex/recommendations/engine.py +342 -0
- fournex-0.1.1/fournex/recommendations/signals.py +90 -0
- fournex-0.1.1/fournex/sdk.py +242 -0
- fournex-0.1.1/fournex/shapes.py +91 -0
- fournex-0.1.1/fournex/step_context.py +145 -0
- fournex-0.1.1/fournex/storage.py +82 -0
- fournex-0.1.1/fournex.egg-info/PKG-INFO +91 -0
- fournex-0.1.1/fournex.egg-info/SOURCES.txt +47 -0
- fournex-0.1.1/fournex.egg-info/dependency_links.txt +1 -0
- fournex-0.1.1/fournex.egg-info/entry_points.txt +3 -0
- fournex-0.1.1/fournex.egg-info/requires.txt +1 -0
- fournex-0.1.1/fournex.egg-info/top_level.txt +2 -0
- fournex-0.1.1/pyproject.toml +50 -0
- fournex-0.1.1/setup.cfg +4 -0
- fournex-0.1.1/setup.py +45 -0
fournex-0.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fournex
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Open-source GPU performance profiler and bottleneck analyzer for PyTorch.
|
|
5
|
+
Author-email: Fournex <hello@fournex.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://fournex.com
|
|
8
|
+
Project-URL: Repository, https://github.com/jorgevee/fournex
|
|
9
|
+
Project-URL: Documentation, https://fournex.com/docs
|
|
10
|
+
Project-URL: Bug Tracker, https://github.com/jorgevee/fournex/issues
|
|
11
|
+
Keywords: pytorch,gpu,profiling,cuda,performance,mlops,bottleneck,optimization,training
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Classifier: Topic :: System :: Monitoring
|
|
22
|
+
Classifier: Environment :: GPU :: NVIDIA CUDA
|
|
23
|
+
Requires-Python: >=3.10
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
Requires-Dist: PyYAML>=6.0
|
|
26
|
+
|
|
27
|
+
# Fournex
|
|
28
|
+
|
|
29
|
+
**Open-source GPU performance profiler and bottleneck analyzer for PyTorch.**
|
|
30
|
+
|
|
31
|
+
[License: MIT](https://github.com/jorgevee/fournex/blob/main/LICENSE)
|
|
32
|
+
[Python 3.10+](https://www.python.org/)
|
|
33
|
+
|
|
34
|
+
Fournex wraps your training script, collects GPU telemetry, and tells you exactly what is slowing it down — with ranked, actionable recommendations.
|
|
35
|
+
|
|
36
|
+
## Install
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install fournex
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Quick start
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
# Profile your workload
|
|
46
|
+
frx collect --name my-run -- python train.py
|
|
47
|
+
|
|
48
|
+
# Analyze and get recommendations
|
|
49
|
+
frx analyze runs/run-<id>
|
|
50
|
+
|
|
51
|
+
# Check your environment
|
|
52
|
+
frx doctor
|
|
53
|
+
|
|
54
|
+
# Validate the pipeline end-to-end
|
|
55
|
+
frx smoke-test
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Detected bottleneck types
|
|
59
|
+
|
|
60
|
+
| Label | Signal |
|
|
61
|
+
|---|---|
|
|
62
|
+
| `input_bound` | DataLoader wait ≥ 20% of step time |
|
|
63
|
+
| `copy_bound` | H2D transfer ≥ 15% of step time |
|
|
64
|
+
| `sync_bound` | Sync wait ≥ 10% of step time |
|
|
65
|
+
| `underutilized_gpu` | GPU utilization < 35% |
|
|
66
|
+
| `memory_pressure` | Peak memory ratio ≥ 90% |
|
|
67
|
+
| `shape_instability` | Shape volatility ≥ 30% |
|
|
68
|
+
| `launch_bound` | Low utilization + profiler windows, no dominant stall |
|
|
69
|
+
| `insufficient_telemetry` | No timing or GPU utilization data |
|
|
70
|
+
|
|
71
|
+
## Safe config benchmarking
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
frx tune --safe --max-trials 12 -- python train.py
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Fournex sweeps DataLoader and runtime configs, benchmarks each one, and recommends the fastest safe candidate — without changing your code.
|
|
78
|
+
|
|
79
|
+
Interrupted or repeated tune runs can reuse completed trial artifacts:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
frx tune --resume runs/tune-<id> -- python train.py
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
`--resume` reuses a trial only when the saved `config.yaml`, `benchmark_window.json`, and `metrics.json` match the current workload command and benchmark settings.
|
|
86
|
+
|
|
87
|
+
## Links
|
|
88
|
+
|
|
89
|
+
- [GitHub](https://github.com/jorgevee/fournex)
|
|
90
|
+
- [Documentation](https://fournex.com/docs)
|
|
91
|
+
- [Website](https://fournex.com)
|
fournex-0.1.1/README.md
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# Fournex
|
|
2
|
+
|
|
3
|
+
**Open-source GPU performance profiler and bottleneck analyzer for PyTorch.**
|
|
4
|
+
|
|
5
|
+
[License: MIT](https://github.com/jorgevee/fournex/blob/main/LICENSE)
|
|
6
|
+
[Python 3.10+](https://www.python.org/)
|
|
7
|
+
|
|
8
|
+
Fournex wraps your training script, collects GPU telemetry, and tells you exactly what is slowing it down — with ranked, actionable recommendations.
|
|
9
|
+
|
|
10
|
+
## Install
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install fournex
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Quick start
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
# Profile your workload
|
|
20
|
+
frx collect --name my-run -- python train.py
|
|
21
|
+
|
|
22
|
+
# Analyze and get recommendations
|
|
23
|
+
frx analyze runs/run-<id>
|
|
24
|
+
|
|
25
|
+
# Check your environment
|
|
26
|
+
frx doctor
|
|
27
|
+
|
|
28
|
+
# Validate the pipeline end-to-end
|
|
29
|
+
frx smoke-test
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Detected bottleneck types
|
|
33
|
+
|
|
34
|
+
| Label | Signal |
|
|
35
|
+
|---|---|
|
|
36
|
+
| `input_bound` | DataLoader wait ≥ 20% of step time |
|
|
37
|
+
| `copy_bound` | H2D transfer ≥ 15% of step time |
|
|
38
|
+
| `sync_bound` | Sync wait ≥ 10% of step time |
|
|
39
|
+
| `underutilized_gpu` | GPU utilization < 35% |
|
|
40
|
+
| `memory_pressure` | Peak memory ratio ≥ 90% |
|
|
41
|
+
| `shape_instability` | Shape volatility ≥ 30% |
|
|
42
|
+
| `launch_bound` | Low utilization + profiler windows, no dominant stall |
|
|
43
|
+
| `insufficient_telemetry` | No timing or GPU utilization data |
|
|
44
|
+
|
|
45
|
+
## Safe config benchmarking
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
frx tune --safe --max-trials 12 -- python train.py
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Fournex sweeps DataLoader and runtime configs, benchmarks each one, and recommends the fastest safe candidate — without changing your code.
|
|
52
|
+
|
|
53
|
+
Interrupted or repeated tune runs can reuse completed trial artifacts:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
frx tune --resume runs/tune-<id> -- python train.py
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
`--resume` reuses a trial only when the saved `config.yaml`, `benchmark_window.json`, and `metrics.json` match the current workload command and benchmark settings.
|
|
60
|
+
|
|
61
|
+
## Links
|
|
62
|
+
|
|
63
|
+
- [GitHub](https://github.com/jorgevee/fournex)
|
|
64
|
+
- [Documentation](https://fournex.com/docs)
|
|
65
|
+
- [Website](https://fournex.com)
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
from .sdk import (
|
|
2
|
+
EVENT_LEVELS,
|
|
3
|
+
EVENT_SOURCES,
|
|
4
|
+
EVENT_TYPES,
|
|
5
|
+
SCHEMA_VERSION,
|
|
6
|
+
begin_span,
|
|
7
|
+
build_runtime_event,
|
|
8
|
+
clear_local_events,
|
|
9
|
+
emit_event,
|
|
10
|
+
end_span,
|
|
11
|
+
flush,
|
|
12
|
+
get_local_events,
|
|
13
|
+
get_runtime_config,
|
|
14
|
+
init,
|
|
15
|
+
make_event,
|
|
16
|
+
shutdown,
|
|
17
|
+
)
|
|
18
|
+
from .dataloader import InstrumentedDataLoader, instrument_dataloader
|
|
19
|
+
from .analysis import (
|
|
20
|
+
build_diagnosis_result,
|
|
21
|
+
classify_bottlenecks,
|
|
22
|
+
derive_run_summary,
|
|
23
|
+
derive_step_metrics,
|
|
24
|
+
select_steady_state_step_ids,
|
|
25
|
+
summarize_run,
|
|
26
|
+
summarize_run_with_steady_state,
|
|
27
|
+
summarize_steady_state,
|
|
28
|
+
summarize_step_scope,
|
|
29
|
+
)
|
|
30
|
+
from .common_ir_analysis import (
|
|
31
|
+
derive_ir_bottleneck_annotations,
|
|
32
|
+
derive_ir_run_summary,
|
|
33
|
+
derive_ir_step_summaries,
|
|
34
|
+
summarize_ir_run,
|
|
35
|
+
)
|
|
36
|
+
from .common_ir import (
|
|
37
|
+
AnnotationRecord,
|
|
38
|
+
BOTTLENECK_CLASSES,
|
|
39
|
+
EVENT_FAMILIES,
|
|
40
|
+
EventRecord,
|
|
41
|
+
JobInfo,
|
|
42
|
+
MEMORY_OPS,
|
|
43
|
+
MODEL_FAMILIES,
|
|
44
|
+
MetricRecord,
|
|
45
|
+
RunRecord,
|
|
46
|
+
WORKLOAD_CLASSES,
|
|
47
|
+
WorkloadInfo,
|
|
48
|
+
validate_run_dict,
|
|
49
|
+
)
|
|
50
|
+
from .common_ir_validators import (
|
|
51
|
+
semantic_warnings_for_run,
|
|
52
|
+
validate_annotation_record,
|
|
53
|
+
validate_event_record,
|
|
54
|
+
validate_metric_record,
|
|
55
|
+
validate_run_payload,
|
|
56
|
+
validate_run_record,
|
|
57
|
+
)
|
|
58
|
+
from .cuda_timers import time_memcpy, time_phase, time_region
|
|
59
|
+
from .storage import (
|
|
60
|
+
persist_local_trace,
|
|
61
|
+
persist_run_artifacts,
|
|
62
|
+
persist_run_summary,
|
|
63
|
+
persist_run_with_steady_state_summary,
|
|
64
|
+
)
|
|
65
|
+
from .profiler import (
|
|
66
|
+
ProfilerSchedule,
|
|
67
|
+
configure_sampled_profiler,
|
|
68
|
+
get_profiler_controller,
|
|
69
|
+
profiler_step_end,
|
|
70
|
+
profiler_step_start,
|
|
71
|
+
profiler_window,
|
|
72
|
+
)
|
|
73
|
+
from .pytorch_profiler_ir import (
|
|
74
|
+
PytorchProfilerTrace,
|
|
75
|
+
PytorchProfilerTraceEvent,
|
|
76
|
+
map_pytorch_profiler_to_ir,
|
|
77
|
+
)
|
|
78
|
+
from .nvml_ir import NvmlSampleRecord, map_nvml_sample_to_ir
|
|
79
|
+
from .distributed_ir import DistributedCommRecord, map_distributed_record_to_ir
|
|
80
|
+
from .data_pipeline_ir import DataPipelineRecord, map_data_pipeline_record_to_ir
|
|
81
|
+
from .step_context import phase, step_context
|
|
82
|
+
from .shapes import (
|
|
83
|
+
describe_batch,
|
|
84
|
+
extract_dtypes,
|
|
85
|
+
extract_shapes,
|
|
86
|
+
infer_batch_size,
|
|
87
|
+
infer_sequence_length,
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
# Public API of the fournex package.
# Kept strictly alphabetical (ASCII order: classes/constants before
# lowercase names) so additions and removals are easy to review; the
# exported-name set is unchanged from the original declaration.
__all__ = [
    "AnnotationRecord",
    "BOTTLENECK_CLASSES",
    "DataPipelineRecord",
    "DistributedCommRecord",
    "EVENT_FAMILIES",
    "EVENT_LEVELS",
    "EVENT_SOURCES",
    "EVENT_TYPES",
    "EventRecord",
    "InstrumentedDataLoader",
    "JobInfo",
    "MEMORY_OPS",
    "MODEL_FAMILIES",
    "MetricRecord",
    "NvmlSampleRecord",
    "ProfilerSchedule",
    "PytorchProfilerTrace",
    "PytorchProfilerTraceEvent",
    "RunRecord",
    "SCHEMA_VERSION",
    "WORKLOAD_CLASSES",
    "WorkloadInfo",
    "begin_span",
    "build_diagnosis_result",
    "build_runtime_event",
    "classify_bottlenecks",
    "clear_local_events",
    "configure_sampled_profiler",
    "derive_ir_bottleneck_annotations",
    "derive_ir_run_summary",
    "derive_ir_step_summaries",
    "derive_run_summary",
    "derive_step_metrics",
    "describe_batch",
    "emit_event",
    "end_span",
    "extract_dtypes",
    "extract_shapes",
    "flush",
    "get_local_events",
    "get_profiler_controller",
    "get_runtime_config",
    "infer_batch_size",
    "infer_sequence_length",
    "init",
    "instrument_dataloader",
    "make_event",
    "map_data_pipeline_record_to_ir",
    "map_distributed_record_to_ir",
    "map_nvml_sample_to_ir",
    "map_pytorch_profiler_to_ir",
    "persist_local_trace",
    "persist_run_artifacts",
    "persist_run_summary",
    "persist_run_with_steady_state_summary",
    "phase",
    "profiler_step_end",
    "profiler_step_start",
    "profiler_window",
    "select_steady_state_step_ids",
    "semantic_warnings_for_run",
    "shutdown",
    "step_context",
    "summarize_ir_run",
    "summarize_run",
    "summarize_run_with_steady_state",
    "summarize_steady_state",
    "summarize_step_scope",
    "time_memcpy",
    "time_phase",
    "time_region",
    "validate_annotation_record",
    "validate_event_record",
    "validate_metric_record",
    "validate_run_dict",
    "validate_run_payload",
    "validate_run_record",
]
|