particleflow 3.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- habana/gaudi-pod-python-v19-1hpu.yaml +101 -0
- habana/gaudi-pod-python-v19-2hpu.yaml +101 -0
- habana/gaudi-pod-python-v19-3hpu.yaml +101 -0
- habana/gaudi-pod-python-v19-4hpu.yaml +101 -0
- habana/gaudi-pod-python-v19-5hpu.yaml +101 -0
- habana/gaudi-pod-python-v19-6hpu.yaml +101 -0
- habana/gaudi-pod-python-v19-7hpu.yaml +101 -0
- habana/gaudi-pod-python-v19-8hpu.yaml +101 -0
- mlpf/__init__.py +0 -0
- mlpf/conf.py +720 -0
- mlpf/customizations.py +45 -0
- mlpf/data/__init__.py +0 -0
- mlpf/data/cms/__init__.py +0 -0
- mlpf/data/cms/plot_cms.py +359 -0
- mlpf/data/cms/postprocessing2.py +1000 -0
- mlpf/data/cms/prepare_args.py +48 -0
- mlpf/data/cms/prepare_args_val.py +29 -0
- mlpf/data/key4hep/__init__.py +0 -0
- mlpf/data/key4hep/plot_postprocessing.py +380 -0
- mlpf/data/key4hep/postprocessing.py +1480 -0
- mlpf/heptfds/__init__.py +0 -0
- mlpf/heptfds/cld_pf_edm4hep/__init__.py +0 -0
- mlpf/heptfds/cld_pf_edm4hep/qq.py +84 -0
- mlpf/heptfds/cld_pf_edm4hep/ttbar.py +86 -0
- mlpf/heptfds/cld_pf_edm4hep/ww_fullhad.py +84 -0
- mlpf/heptfds/cld_pf_edm4hep/zz.py +86 -0
- mlpf/heptfds/cld_pf_edm4hep_hits/__init__.py +0 -0
- mlpf/heptfds/cld_pf_edm4hep_hits/qq.py +80 -0
- mlpf/heptfds/cld_pf_edm4hep_hits/ttbar.py +80 -0
- mlpf/heptfds/cld_pf_edm4hep_hits/ww_fullhad.py +80 -0
- mlpf/heptfds/cld_pf_edm4hep_hits/zz.py +80 -0
- mlpf/heptfds/clic_pf_edm4hep/__init__.py +0 -0
- mlpf/heptfds/clic_pf_edm4hep/qq.py +92 -0
- mlpf/heptfds/clic_pf_edm4hep/ttbar.py +91 -0
- mlpf/heptfds/clic_pf_edm4hep/ww_fullhad.py +87 -0
- mlpf/heptfds/clic_pf_edm4hep_hits/__init__.py +0 -0
- mlpf/heptfds/clic_pf_edm4hep_hits/qq.py +80 -0
- mlpf/heptfds/clic_pf_edm4hep_hits/ttbar.py +80 -0
- mlpf/heptfds/clic_pf_edm4hep_hits/ww_fullhad.py +80 -0
- mlpf/heptfds/cms_pf/__init__.py +0 -0
- mlpf/heptfds/cms_pf/cms_utils.py +281 -0
- mlpf/heptfds/cms_pf/qcd.py +87 -0
- mlpf/heptfds/cms_pf/qcd_nopu.py +80 -0
- mlpf/heptfds/cms_pf/ttbar.py +91 -0
- mlpf/heptfds/cms_pf/ttbar_nopu.py +82 -0
- mlpf/heptfds/cms_pf/ztt.py +75 -0
- mlpf/heptfds/cms_pf/ztt_nopu.py +75 -0
- mlpf/heptfds/edm4hep_utils/__init__.py +0 -0
- mlpf/heptfds/edm4hep_utils/utils_hits.py +144 -0
- mlpf/heptfds/edm4hep_utils/utils_pf.py +201 -0
- mlpf/jet_utils.py +133 -0
- mlpf/logger.py +107 -0
- mlpf/model/PFDataset.py +571 -0
- mlpf/model/__init__.py +0 -0
- mlpf/model/distributed_ray.py +377 -0
- mlpf/model/gnn_lsh.py +320 -0
- mlpf/model/inference.py +295 -0
- mlpf/model/losses.py +222 -0
- mlpf/model/mlpf.py +715 -0
- mlpf/model/monitoring.py +118 -0
- mlpf/model/plots.py +203 -0
- mlpf/model/training.py +946 -0
- mlpf/model/utils.py +220 -0
- mlpf/optimizers/__init__.py +30 -0
- mlpf/optimizers/lamb.py +207 -0
- mlpf/pipeline.py +254 -0
- mlpf/plotting/__init__.py +0 -0
- mlpf/plotting/cms_fwlite.py +148 -0
- mlpf/plotting/cmssw_validation_data.py +521 -0
- mlpf/plotting/corrections.py +361 -0
- mlpf/plotting/data_preparation.py +135 -0
- mlpf/plotting/draw_graphs.py +154 -0
- mlpf/plotting/plot_jet_response_comparison_v1.py +219 -0
- mlpf/plotting/plot_jet_response_comparison_v2.py +270 -0
- mlpf/plotting/plot_loss_curves.py +258 -0
- mlpf/plotting/plot_met_validation.py +554 -0
- mlpf/plotting/plot_utils.py +2168 -0
- mlpf/plotting/plot_validation.py +1090 -0
- mlpf/plotting/utils.py +177 -0
- mlpf/raytune/__init__.py +0 -0
- mlpf/raytune/search_space.py +38 -0
- mlpf/raytune/utils.py +111 -0
- mlpf/snakemake/produce_cms_validation_snakemake.py +322 -0
- mlpf/snakemake/produce_snakemake.py +732 -0
- mlpf/snakemake/produce_validation_snakemake.py +294 -0
- mlpf/standalone/__init__.py +0 -0
- mlpf/standalone/dsl.py +269 -0
- mlpf/standalone/eval.py +415 -0
- mlpf/standalone/plot_evolution.py +222 -0
- mlpf/standalone/run_evolution.py +551 -0
- mlpf/standalone/train.py +1220 -0
- mlpf/standalone_eval/key4hep/evaluator.py +298 -0
- mlpf/standalone_eval/key4hep/plots.py +262 -0
- mlpf/timing.py +124 -0
- mlpf/utils.py +164 -0
- particleflow-3.1.0.dist-info/METADATA +220 -0
- particleflow-3.1.0.dist-info/RECORD +101 -0
- particleflow-3.1.0.dist-info/WHEEL +5 -0
- particleflow-3.1.0.dist-info/entry_points.txt +2 -0
- particleflow-3.1.0.dist-info/licenses/LICENSE +201 -0
- particleflow-3.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,101 @@
# Kubernetes batch Job: one-shot MLPF training benchmark on a Gaudi node,
# launched with `mpirun -n 1`. The pod is never retried
# (restartPolicy: Never, backoffLimit: 0).
# NOTE(review): the sibling 2..6-HPU jobs omit the "-hvd" infix in the job
# name -- confirm the difference is intended.
apiVersion: batch/v1
kind: Job
metadata:
  name: mlpf-hpu-strategy-v19-1hpu-hvd-constbatch-bm2
spec:
  completions: 1
  parallelism: 1
  backoffLimit: 0
  template:
    spec:
      restartPolicy: Never
      serviceAccountName: jduarte
      nodeSelector:
        brightcomputing.com/node-category: "gaudi"
      hostNetwork: false
      volumes:
        # hostPath volumes with type Directory require the path to already
        # exist on the node.
        - name: home
          hostPath:
            path: /home/jduarte
            type: Directory
        - name: ceph
          hostPath:
            path: /voyager/ceph/users/jduarte
            type: Directory
        - name: scratch
          emptyDir: {}
      imagePullSecrets:
        - name: registry-credentials
      containers:
        - name: htf2110-190-580-20230327-ubuntu2004
          image: jmduarte/particleflow:habana_v19
          imagePullPolicy: Always
          resources:
            # NOTE(review): 8 Gaudi devices are reserved although mpirun
            # starts 1 rank -- presumably to keep the node exclusive for
            # benchmarking; confirm.
            requests:
              cpu: 48
              memory: 384Gi
              habana.ai/gaudi: 8
              hugepages-2Mi: 96000Mi
              ephemeral-storage: 256Gi
            limits:
              cpu: 96
              memory: 396Gi
              habana.ai/gaudi: 8
              hugepages-2Mi: 96000Mi
              ephemeral-storage: 512Gi
          volumeMounts:
            - name: home
              mountPath: /home/jduarte
            - name: ceph
              mountPath: /voyager/ceph/users/jduarte
            - name: scratch
              mountPath: /scratch
          env:
            # Downward-API values used by the script to build VGR_POD_ID.
            - name: POD_NAME_ID
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: POD_NODE_HOSTNAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: HOME
              value: "/home/jduarte"
            - name: CEPH
              value: "/voyager/ceph/users/jduarte"
            - name: LOCAL_SCRATCH_DIR
              value: "/scratch"
            - name: MPI_ROOT
              value: "/opt/amazon/openmpi"
            - name: TFDS_DATA_DIR
              value: "/voyager/ceph/users/jduarte/tensorflow_datasets"
          workingDir: /home/jduarte/particleflow
          command: ["/bin/bash", "-c"]
          # The script is a folded scalar (>-): adjacent lines are joined with
          # a space, so every command must be terminated with ';'.
          # Steps: record timestamps and a pod id; print the md5sum and the
          # contents of the job YAML; dump environment and host/device
          # diagnostics; run the timed training under mpirun.
          # mpirun's -x takes a variable NAME (or NAME=value), so the exported
          # VGR_POD_ID is forwarded by name rather than by its expanded value.
          # NOTE(review): K8S_JOB_YAML_FILE points at
          # ${PWD}/gaudi-pod-python-v19.yaml, which does not match the packaged
          # habana/gaudi-pod-python-v19-<N>hpu.yaml names -- confirm the file
          # exists in workingDir, otherwise md5sum/cat only print errors.
          args:
            - >-
              declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')";
              declare -xir UNIX_TIME="$(date +'%s')";

              declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}";

              declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml";
              declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum "${K8S_JOB_YAML_FILE}")";

              echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}";
              echo "";

              cat "${K8S_JOB_YAML_FILE}";

              printenv;

              cat /etc/os-release;
              lscpu;
              free -h;
              cat /proc/meminfo;
              lsblk --output-all;
              cat /etc/fstab;
              lspci -vvv;
              hl-smi;
              hl-smi -q;

              time -p mpirun -n 1 --allow-run-as-root --prefix "${MPI_ROOT}" -x VGR_POD_ID python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir;
@@ -0,0 +1,101 @@
# Kubernetes batch Job: one-shot MLPF training benchmark on a Gaudi node,
# launched with `mpirun -n 2`. The pod is never retried
# (restartPolicy: Never, backoffLimit: 0).
apiVersion: batch/v1
kind: Job
metadata:
  name: mlpf-hpu-strategy-v19-2hpu-constbatch-bm2
spec:
  completions: 1
  parallelism: 1
  backoffLimit: 0
  template:
    spec:
      restartPolicy: Never
      serviceAccountName: jduarte
      nodeSelector:
        brightcomputing.com/node-category: "gaudi"
      hostNetwork: false
      volumes:
        # hostPath volumes with type Directory require the path to already
        # exist on the node.
        - name: home
          hostPath:
            path: /home/jduarte
            type: Directory
        - name: ceph
          hostPath:
            path: /voyager/ceph/users/jduarte
            type: Directory
        - name: scratch
          emptyDir: {}
      imagePullSecrets:
        - name: registry-credentials
      containers:
        - name: htf2110-190-580-20230327-ubuntu2004
          image: jmduarte/particleflow:habana_v19
          imagePullPolicy: Always
          resources:
            # NOTE(review): 8 Gaudi devices are reserved although mpirun
            # starts 2 ranks -- presumably to keep the node exclusive for
            # benchmarking; confirm.
            requests:
              cpu: 48
              memory: 384Gi
              habana.ai/gaudi: 8
              hugepages-2Mi: 96000Mi
              ephemeral-storage: 256Gi
            limits:
              cpu: 96
              memory: 396Gi
              habana.ai/gaudi: 8
              hugepages-2Mi: 96000Mi
              ephemeral-storage: 512Gi
          volumeMounts:
            - name: home
              mountPath: /home/jduarte
            - name: ceph
              mountPath: /voyager/ceph/users/jduarte
            - name: scratch
              mountPath: /scratch
          env:
            # Downward-API values used by the script to build VGR_POD_ID.
            - name: POD_NAME_ID
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: POD_NODE_HOSTNAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: HOME
              value: "/home/jduarte"
            - name: CEPH
              value: "/voyager/ceph/users/jduarte"
            - name: LOCAL_SCRATCH_DIR
              value: "/scratch"
            - name: MPI_ROOT
              value: "/opt/amazon/openmpi"
            - name: TFDS_DATA_DIR
              value: "/voyager/ceph/users/jduarte/tensorflow_datasets"
          workingDir: /home/jduarte/particleflow
          command: ["/bin/bash", "-c"]
          # The script is a folded scalar (>-): adjacent lines are joined with
          # a space, so every command must be terminated with ';'.
          # Steps: record timestamps and a pod id; print the md5sum and the
          # contents of the job YAML; dump environment and host/device
          # diagnostics; run the timed training under mpirun.
          # mpirun's -x takes a variable NAME (or NAME=value), so the exported
          # VGR_POD_ID is forwarded by name rather than by its expanded value.
          # NOTE(review): K8S_JOB_YAML_FILE points at
          # ${PWD}/gaudi-pod-python-v19.yaml, which does not match the packaged
          # habana/gaudi-pod-python-v19-<N>hpu.yaml names -- confirm the file
          # exists in workingDir, otherwise md5sum/cat only print errors.
          args:
            - >-
              declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')";
              declare -xir UNIX_TIME="$(date +'%s')";

              declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}";

              declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml";
              declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum "${K8S_JOB_YAML_FILE}")";

              echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}";
              echo "";

              cat "${K8S_JOB_YAML_FILE}";

              printenv;

              cat /etc/os-release;
              lscpu;
              free -h;
              cat /proc/meminfo;
              lsblk --output-all;
              cat /etc/fstab;
              lspci -vvv;
              hl-smi;
              hl-smi -q;

              time -p mpirun -n 2 --allow-run-as-root --prefix "${MPI_ROOT}" -x VGR_POD_ID python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir;
@@ -0,0 +1,101 @@
# Kubernetes batch Job: one-shot MLPF training benchmark on a Gaudi node,
# launched with `mpirun -n 3`. The pod is never retried
# (restartPolicy: Never, backoffLimit: 0).
apiVersion: batch/v1
kind: Job
metadata:
  name: mlpf-hpu-strategy-v19-3hpu-constbatch-bm2
spec:
  completions: 1
  parallelism: 1
  backoffLimit: 0
  template:
    spec:
      restartPolicy: Never
      serviceAccountName: jduarte
      nodeSelector:
        brightcomputing.com/node-category: "gaudi"
      hostNetwork: false
      volumes:
        # hostPath volumes with type Directory require the path to already
        # exist on the node.
        - name: home
          hostPath:
            path: /home/jduarte
            type: Directory
        - name: ceph
          hostPath:
            path: /voyager/ceph/users/jduarte
            type: Directory
        - name: scratch
          emptyDir: {}
      imagePullSecrets:
        - name: registry-credentials
      containers:
        - name: htf2110-190-580-20230327-ubuntu2004
          image: jmduarte/particleflow:habana_v19
          imagePullPolicy: Always
          resources:
            # NOTE(review): 8 Gaudi devices are reserved although mpirun
            # starts 3 ranks -- presumably to keep the node exclusive for
            # benchmarking; confirm.
            requests:
              cpu: 48
              memory: 384Gi
              habana.ai/gaudi: 8
              hugepages-2Mi: 96000Mi
              ephemeral-storage: 256Gi
            limits:
              cpu: 96
              memory: 396Gi
              habana.ai/gaudi: 8
              hugepages-2Mi: 96000Mi
              ephemeral-storage: 512Gi
          volumeMounts:
            - name: home
              mountPath: /home/jduarte
            - name: ceph
              mountPath: /voyager/ceph/users/jduarte
            - name: scratch
              mountPath: /scratch
          env:
            # Downward-API values used by the script to build VGR_POD_ID.
            - name: POD_NAME_ID
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: POD_NODE_HOSTNAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: HOME
              value: "/home/jduarte"
            - name: CEPH
              value: "/voyager/ceph/users/jduarte"
            - name: LOCAL_SCRATCH_DIR
              value: "/scratch"
            - name: MPI_ROOT
              value: "/opt/amazon/openmpi"
            - name: TFDS_DATA_DIR
              value: "/voyager/ceph/users/jduarte/tensorflow_datasets"
          workingDir: /home/jduarte/particleflow
          command: ["/bin/bash", "-c"]
          # The script is a folded scalar (>-): adjacent lines are joined with
          # a space, so every command must be terminated with ';'.
          # Steps: record timestamps and a pod id; print the md5sum and the
          # contents of the job YAML; dump environment and host/device
          # diagnostics; run the timed training under mpirun.
          # mpirun's -x takes a variable NAME (or NAME=value), so the exported
          # VGR_POD_ID is forwarded by name rather than by its expanded value.
          # NOTE(review): K8S_JOB_YAML_FILE points at
          # ${PWD}/gaudi-pod-python-v19.yaml, which does not match the packaged
          # habana/gaudi-pod-python-v19-<N>hpu.yaml names -- confirm the file
          # exists in workingDir, otherwise md5sum/cat only print errors.
          args:
            - >-
              declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')";
              declare -xir UNIX_TIME="$(date +'%s')";

              declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}";

              declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml";
              declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum "${K8S_JOB_YAML_FILE}")";

              echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}";
              echo "";

              cat "${K8S_JOB_YAML_FILE}";

              printenv;

              cat /etc/os-release;
              lscpu;
              free -h;
              cat /proc/meminfo;
              lsblk --output-all;
              cat /etc/fstab;
              lspci -vvv;
              hl-smi;
              hl-smi -q;

              time -p mpirun -n 3 --allow-run-as-root --prefix "${MPI_ROOT}" -x VGR_POD_ID python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir;
@@ -0,0 +1,101 @@
# Kubernetes batch Job: one-shot MLPF training benchmark on a Gaudi node,
# launched with `mpirun -n 4`. The pod is never retried
# (restartPolicy: Never, backoffLimit: 0).
apiVersion: batch/v1
kind: Job
metadata:
  name: mlpf-hpu-strategy-v19-4hpu-constbatch-bm2
spec:
  completions: 1
  parallelism: 1
  backoffLimit: 0
  template:
    spec:
      restartPolicy: Never
      serviceAccountName: jduarte
      nodeSelector:
        brightcomputing.com/node-category: "gaudi"
      hostNetwork: false
      volumes:
        # hostPath volumes with type Directory require the path to already
        # exist on the node.
        - name: home
          hostPath:
            path: /home/jduarte
            type: Directory
        - name: ceph
          hostPath:
            path: /voyager/ceph/users/jduarte
            type: Directory
        - name: scratch
          emptyDir: {}
      imagePullSecrets:
        - name: registry-credentials
      containers:
        - name: htf2110-190-580-20230327-ubuntu2004
          image: jmduarte/particleflow:habana_v19
          imagePullPolicy: Always
          resources:
            # NOTE(review): 8 Gaudi devices are reserved although mpirun
            # starts 4 ranks -- presumably to keep the node exclusive for
            # benchmarking; confirm.
            requests:
              cpu: 48
              memory: 384Gi
              habana.ai/gaudi: 8
              hugepages-2Mi: 96000Mi
              ephemeral-storage: 256Gi
            limits:
              cpu: 96
              memory: 396Gi
              habana.ai/gaudi: 8
              hugepages-2Mi: 96000Mi
              ephemeral-storage: 512Gi
          volumeMounts:
            - name: home
              mountPath: /home/jduarte
            - name: ceph
              mountPath: /voyager/ceph/users/jduarte
            - name: scratch
              mountPath: /scratch
          env:
            # Downward-API values used by the script to build VGR_POD_ID.
            - name: POD_NAME_ID
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: POD_NODE_HOSTNAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: HOME
              value: "/home/jduarte"
            - name: CEPH
              value: "/voyager/ceph/users/jduarte"
            - name: LOCAL_SCRATCH_DIR
              value: "/scratch"
            - name: MPI_ROOT
              value: "/opt/amazon/openmpi"
            - name: TFDS_DATA_DIR
              value: "/voyager/ceph/users/jduarte/tensorflow_datasets"
          workingDir: /home/jduarte/particleflow
          command: ["/bin/bash", "-c"]
          # The script is a folded scalar (>-): adjacent lines are joined with
          # a space, so every command must be terminated with ';'.
          # Steps: record timestamps and a pod id; print the md5sum and the
          # contents of the job YAML; dump environment and host/device
          # diagnostics; run the timed training under mpirun.
          # mpirun's -x takes a variable NAME (or NAME=value), so the exported
          # VGR_POD_ID is forwarded by name rather than by its expanded value.
          # NOTE(review): K8S_JOB_YAML_FILE points at
          # ${PWD}/gaudi-pod-python-v19.yaml, which does not match the packaged
          # habana/gaudi-pod-python-v19-<N>hpu.yaml names -- confirm the file
          # exists in workingDir, otherwise md5sum/cat only print errors.
          args:
            - >-
              declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')";
              declare -xir UNIX_TIME="$(date +'%s')";

              declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}";

              declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml";
              declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum "${K8S_JOB_YAML_FILE}")";

              echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}";
              echo "";

              cat "${K8S_JOB_YAML_FILE}";

              printenv;

              cat /etc/os-release;
              lscpu;
              free -h;
              cat /proc/meminfo;
              lsblk --output-all;
              cat /etc/fstab;
              lspci -vvv;
              hl-smi;
              hl-smi -q;

              time -p mpirun -n 4 --allow-run-as-root --prefix "${MPI_ROOT}" -x VGR_POD_ID python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir;
@@ -0,0 +1,101 @@
# Kubernetes batch Job: one-shot MLPF training benchmark on a Gaudi node,
# launched with `mpirun -n 5`. The pod is never retried
# (restartPolicy: Never, backoffLimit: 0).
apiVersion: batch/v1
kind: Job
metadata:
  name: mlpf-hpu-strategy-v19-5hpu-constbatch-bm2
spec:
  completions: 1
  parallelism: 1
  backoffLimit: 0
  template:
    spec:
      restartPolicy: Never
      serviceAccountName: jduarte
      nodeSelector:
        brightcomputing.com/node-category: "gaudi"
      hostNetwork: false
      volumes:
        # hostPath volumes with type Directory require the path to already
        # exist on the node.
        - name: home
          hostPath:
            path: /home/jduarte
            type: Directory
        - name: ceph
          hostPath:
            path: /voyager/ceph/users/jduarte
            type: Directory
        - name: scratch
          emptyDir: {}
      imagePullSecrets:
        - name: registry-credentials
      containers:
        - name: htf2110-190-580-20230327-ubuntu2004
          image: jmduarte/particleflow:habana_v19
          imagePullPolicy: Always
          resources:
            # NOTE(review): 8 Gaudi devices are reserved although mpirun
            # starts 5 ranks -- presumably to keep the node exclusive for
            # benchmarking; confirm.
            requests:
              cpu: 48
              memory: 384Gi
              habana.ai/gaudi: 8
              hugepages-2Mi: 96000Mi
              ephemeral-storage: 256Gi
            limits:
              cpu: 96
              memory: 396Gi
              habana.ai/gaudi: 8
              hugepages-2Mi: 96000Mi
              ephemeral-storage: 512Gi
          volumeMounts:
            - name: home
              mountPath: /home/jduarte
            - name: ceph
              mountPath: /voyager/ceph/users/jduarte
            - name: scratch
              mountPath: /scratch
          env:
            # Downward-API values used by the script to build VGR_POD_ID.
            - name: POD_NAME_ID
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: POD_NODE_HOSTNAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: HOME
              value: "/home/jduarte"
            - name: CEPH
              value: "/voyager/ceph/users/jduarte"
            - name: LOCAL_SCRATCH_DIR
              value: "/scratch"
            - name: MPI_ROOT
              value: "/opt/amazon/openmpi"
            - name: TFDS_DATA_DIR
              value: "/voyager/ceph/users/jduarte/tensorflow_datasets"
          workingDir: /home/jduarte/particleflow
          command: ["/bin/bash", "-c"]
          # The script is a folded scalar (>-): adjacent lines are joined with
          # a space, so every command must be terminated with ';'.
          # Steps: record timestamps and a pod id; print the md5sum and the
          # contents of the job YAML; dump environment and host/device
          # diagnostics; run the timed training under mpirun.
          # mpirun's -x takes a variable NAME (or NAME=value), so the exported
          # VGR_POD_ID is forwarded by name rather than by its expanded value.
          # NOTE(review): K8S_JOB_YAML_FILE points at
          # ${PWD}/gaudi-pod-python-v19.yaml, which does not match the packaged
          # habana/gaudi-pod-python-v19-<N>hpu.yaml names -- confirm the file
          # exists in workingDir, otherwise md5sum/cat only print errors.
          args:
            - >-
              declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')";
              declare -xir UNIX_TIME="$(date +'%s')";

              declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}";

              declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml";
              declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum "${K8S_JOB_YAML_FILE}")";

              echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}";
              echo "";

              cat "${K8S_JOB_YAML_FILE}";

              printenv;

              cat /etc/os-release;
              lscpu;
              free -h;
              cat /proc/meminfo;
              lsblk --output-all;
              cat /etc/fstab;
              lspci -vvv;
              hl-smi;
              hl-smi -q;

              time -p mpirun -n 5 --allow-run-as-root --prefix "${MPI_ROOT}" -x VGR_POD_ID python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir;
@@ -0,0 +1,101 @@
# Kubernetes batch Job: one-shot MLPF training benchmark on a Gaudi node,
# launched with `mpirun -n 6`. The pod is never retried
# (restartPolicy: Never, backoffLimit: 0).
apiVersion: batch/v1
kind: Job
metadata:
  name: mlpf-hpu-strategy-v19-6hpu-constbatch-bm2
spec:
  completions: 1
  parallelism: 1
  backoffLimit: 0
  template:
    spec:
      restartPolicy: Never
      serviceAccountName: jduarte
      nodeSelector:
        brightcomputing.com/node-category: "gaudi"
      hostNetwork: false
      volumes:
        # hostPath volumes with type Directory require the path to already
        # exist on the node.
        - name: home
          hostPath:
            path: /home/jduarte
            type: Directory
        - name: ceph
          hostPath:
            path: /voyager/ceph/users/jduarte
            type: Directory
        - name: scratch
          emptyDir: {}
      imagePullSecrets:
        - name: registry-credentials
      containers:
        - name: htf2110-190-580-20230327-ubuntu2004
          image: jmduarte/particleflow:habana_v19
          imagePullPolicy: Always
          resources:
            # NOTE(review): 8 Gaudi devices are reserved although mpirun
            # starts 6 ranks -- presumably to keep the node exclusive for
            # benchmarking; confirm.
            requests:
              cpu: 48
              memory: 384Gi
              habana.ai/gaudi: 8
              hugepages-2Mi: 96000Mi
              ephemeral-storage: 256Gi
            limits:
              cpu: 96
              memory: 396Gi
              habana.ai/gaudi: 8
              hugepages-2Mi: 96000Mi
              ephemeral-storage: 512Gi
          volumeMounts:
            - name: home
              mountPath: /home/jduarte
            - name: ceph
              mountPath: /voyager/ceph/users/jduarte
            - name: scratch
              mountPath: /scratch
          env:
            # Downward-API values used by the script to build VGR_POD_ID.
            - name: POD_NAME_ID
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: POD_NODE_HOSTNAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: HOME
              value: "/home/jduarte"
            - name: CEPH
              value: "/voyager/ceph/users/jduarte"
            - name: LOCAL_SCRATCH_DIR
              value: "/scratch"
            - name: MPI_ROOT
              value: "/opt/amazon/openmpi"
            - name: TFDS_DATA_DIR
              value: "/voyager/ceph/users/jduarte/tensorflow_datasets"
          workingDir: /home/jduarte/particleflow
          command: ["/bin/bash", "-c"]
          # The script is a folded scalar (>-): adjacent lines are joined with
          # a space, so every command must be terminated with ';'.
          # Steps: record timestamps and a pod id; print the md5sum and the
          # contents of the job YAML; dump environment and host/device
          # diagnostics; run the timed training under mpirun.
          # mpirun's -x takes a variable NAME (or NAME=value), so the exported
          # VGR_POD_ID is forwarded by name rather than by its expanded value.
          # NOTE(review): K8S_JOB_YAML_FILE points at
          # ${PWD}/gaudi-pod-python-v19.yaml, which does not match the packaged
          # habana/gaudi-pod-python-v19-<N>hpu.yaml names -- confirm the file
          # exists in workingDir, otherwise md5sum/cat only print errors.
          args:
            - >-
              declare -xr LOCAL_TIME="$(date +'%Y%m%dT%H%M%S%z')";
              declare -xir UNIX_TIME="$(date +'%s')";

              declare -xr VGR_POD_ID="${POD_NAME_ID}.$(date +'%s').${RANDOM}.${POD_NODE_HOSTNAME}";

              declare -xr K8S_JOB_YAML_FILE="${PWD}/gaudi-pod-python-v19.yaml";
              declare -xr K8S_JOB_YAML_MD5SUM="$(md5sum "${K8S_JOB_YAML_FILE}")";

              echo "${UNIX_TIME} ${VGR_POD_ID} ${K8S_JOB_YAML_MD5SUM}";
              echo "";

              cat "${K8S_JOB_YAML_FILE}";

              printenv;

              cat /etc/os-release;
              lscpu;
              free -h;
              cat /proc/meminfo;
              lsblk --output-all;
              cat /etc/fstab;
              lspci -vvv;
              hl-smi;
              hl-smi -q;

              time -p mpirun -n 6 --allow-run-as-root --prefix "${MPI_ROOT}" -x VGR_POD_ID python3 -u mlpf/pipeline.py train -g -m -c parameters/clic-test.yaml --plot-freq 0 --batch-multiplier 2 --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir;