accelforge-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- accelforge/__init__.py +21 -0
- accelforge/_accelerated_imports.py +16 -0
- accelforge/_deprecate/_simanneal/evalmapping.py +271 -0
- accelforge/_deprecate/_simanneal/mapspaceglobals.py +298 -0
- accelforge/_deprecate/_simanneal/simanneal.py +666 -0
- accelforge/_deprecate/_simanneal/tracking.py +105 -0
- accelforge/_deprecate/_simanneal/wrappers.py +218 -0
- accelforge/_deprecate/_simanneal2/__init__.py +7 -0
- accelforge/_deprecate/_simanneal2/simanneal.py +493 -0
- accelforge/_deprecate/_simanneal2/tracking.py +116 -0
- accelforge/_deprecate/compatibility_util.py +181 -0
- accelforge/_deprecate/layerdeduplication/__init__.py +2 -0
- accelforge/_deprecate/layerdeduplication/group_similar_einsums.py +160 -0
- accelforge/_deprecate/layerdeduplication/grouped_einsums.py +84 -0
- accelforge/_deprecate/mapping_filter_tags/__init__.py +2 -0
- accelforge/_deprecate/mapping_filter_tags/ffmt.py +212 -0
- accelforge/_deprecate/mapping_filter_tags/onesplit.py +24 -0
- accelforge/_deprecate/mapping_filter_tags/util.py +24 -0
- accelforge/_deprecate/tags.py +69 -0
- accelforge/_deprecate/viz/__init__.py +0 -0
- accelforge/_deprecate/viz/interactive.py +159 -0
- accelforge/_deprecate/viz/reservationtree.py +307 -0
- accelforge/_deprecate/viz/ski_slope.py +88 -0
- accelforge/_version.py +15 -0
- accelforge/examples.py +39 -0
- accelforge/frontend/__init__.py +10 -0
- accelforge/frontend/_binding.py +129 -0
- accelforge/frontend/_workload_isl/__init__.py +2 -0
- accelforge/frontend/_workload_isl/_isl.py +149 -0
- accelforge/frontend/_workload_isl/_symbolic.py +141 -0
- accelforge/frontend/arch copy.py +1544 -0
- accelforge/frontend/arch.py +1642 -0
- accelforge/frontend/config.py +63 -0
- accelforge/frontend/mapper/__init__.py +5 -0
- accelforge/frontend/mapper/ffm.py +126 -0
- accelforge/frontend/mapper/mapper.py +7 -0
- accelforge/frontend/mapper/metrics.py +30 -0
- accelforge/frontend/mapping/__init__.py +1 -0
- accelforge/frontend/mapping/mapping.py +1736 -0
- accelforge/frontend/model.py +14 -0
- accelforge/frontend/renames.py +150 -0
- accelforge/frontend/spec copy.py +230 -0
- accelforge/frontend/spec.py +301 -0
- accelforge/frontend/variables.py +12 -0
- accelforge/frontend/workload.py +952 -0
- accelforge/mapper/FFM/__init__.py +9 -0
- accelforge/mapper/FFM/_join_pmappings/__init__.py +0 -0
- accelforge/mapper/FFM/_join_pmappings/compatibility.py +653 -0
- accelforge/mapper/FFM/_join_pmappings/compress_pmappings.py +140 -0
- accelforge/mapper/FFM/_join_pmappings/join_pmappings.py +703 -0
- accelforge/mapper/FFM/_join_pmappings/pmapping_dataframe.py +901 -0
- accelforge/mapper/FFM/_join_pmappings/pmapping_group.py +337 -0
- accelforge/mapper/FFM/_make_pmappings/contraints/__init__.py +0 -0
- accelforge/mapper/FFM/_make_pmappings/contraints/constraints.py +360 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmapping_templates/__init__.py +1 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmapping_templates/make_loops.py +373 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmapping_templates/make_pmapping_templates.py +463 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmapping_templates/make_reservations.py +95 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmapping_templates/make_storage_order.py +382 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmapping_templates/make_storages.py +155 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmappings.py +411 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmappings_from_templates/__init__.py +1 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmappings_from_templates/make_pmappings_from_templates.py +407 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmappings_from_templates/make_tile_shapes.py +1681 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmappings_from_templates/run_model.py +170 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmappings_from_templates/symbol_relations.py +174 -0
- accelforge/mapper/FFM/_make_pmappings/pmapper_job.py +282 -0
- accelforge/mapper/FFM/_pareto_df/df_convention.py +273 -0
- accelforge/mapper/FFM/_pareto_df/pareto copy.py +836 -0
- accelforge/mapper/FFM/_pareto_df/pareto.py +508 -0
- accelforge/mapper/FFM/data.py +61 -0
- accelforge/mapper/FFM/main copy.py +236 -0
- accelforge/mapper/FFM/main.py +208 -0
- accelforge/mapper/FFM/mappings.py +510 -0
- accelforge/mapper/FFM/pmappings.py +310 -0
- accelforge/mapper/__init__.py +4 -0
- accelforge/mapper.py +0 -0
- accelforge/model/__init__.py +1 -0
- accelforge/model/_looptree/__init__.py +0 -0
- accelforge/model/_looptree/accesses.py +335 -0
- accelforge/model/_looptree/capacity/__init__.py +1 -0
- accelforge/model/_looptree/capacity/aggregators.py +36 -0
- accelforge/model/_looptree/capacity/capacity.py +47 -0
- accelforge/model/_looptree/energy.py +150 -0
- accelforge/model/_looptree/equivalent_ranks.py +29 -0
- accelforge/model/_looptree/latency/__init__.py +1 -0
- accelforge/model/_looptree/latency/latency.py +98 -0
- accelforge/model/_looptree/latency/memory.py +120 -0
- accelforge/model/_looptree/latency/processors.py +92 -0
- accelforge/model/_looptree/mapping_utilities.py +71 -0
- accelforge/model/_looptree/reuse/__init__.py +4 -0
- accelforge/model/_looptree/reuse/isl/__init__.py +1 -0
- accelforge/model/_looptree/reuse/isl/des.py +59 -0
- accelforge/model/_looptree/reuse/isl/isl_functions.py +374 -0
- accelforge/model/_looptree/reuse/isl/mapping_to_isl/__init__.py +4 -0
- accelforge/model/_looptree/reuse/isl/mapping_to_isl/analyze_mapping.py +297 -0
- accelforge/model/_looptree/reuse/isl/mapping_to_isl/skews_from_mapping.py +236 -0
- accelforge/model/_looptree/reuse/isl/mapping_to_isl/tiling.py +685 -0
- accelforge/model/_looptree/reuse/isl/mapping_to_isl/types.py +188 -0
- accelforge/model/_looptree/reuse/isl/spatial.py +260 -0
- accelforge/model/_looptree/reuse/isl/temporal.py +182 -0
- accelforge/model/_looptree/reuse/symbolic/__init__.py +1 -0
- accelforge/model/_looptree/reuse/symbolic/symbolic copy 2.py +1346 -0
- accelforge/model/_looptree/reuse/symbolic/symbolic copy.py +1408 -0
- accelforge/model/_looptree/reuse/symbolic/symbolic.py +1396 -0
- accelforge/model/_looptree/run.py +122 -0
- accelforge/model/_looptree/types.py +26 -0
- accelforge/model/_looptree/visualization/__init__.py +0 -0
- accelforge/model/_looptree/visualization/occupancy.py +11 -0
- accelforge/model/main.py +222 -0
- accelforge/plotting/__init__.py +2 -0
- accelforge/plotting/mappings.py +219 -0
- accelforge/plotting/specs.py +57 -0
- accelforge/util/__init__.py +4 -0
- accelforge/util/_base_analysis_types.py +24 -0
- accelforge/util/_basetypes.py +1089 -0
- accelforge/util/_frozenset.py +36 -0
- accelforge/util/_isl.py +29 -0
- accelforge/util/_itertools.py +14 -0
- accelforge/util/_mathfuncs.py +57 -0
- accelforge/util/_parse_expressions.py +339 -0
- accelforge/util/_picklecache.py +32 -0
- accelforge/util/_setexpressions.py +268 -0
- accelforge/util/_sympy/__init__.py +0 -0
- accelforge/util/_sympy/broadcast_max.py +18 -0
- accelforge/util/_visualization.py +112 -0
- accelforge/util/_yaml.py +579 -0
- accelforge/util/parallel.py +193 -0
- accelforge-0.0.1.dist-info/METADATA +64 -0
- accelforge-0.0.1.dist-info/RECORD +258 -0
- accelforge-0.0.1.dist-info/WHEEL +5 -0
- accelforge-0.0.1.dist-info/licenses/LICENSE +19 -0
- accelforge-0.0.1.dist-info/top_level.txt +5 -0
- docs/_build/html/_sources/fastfusion.frontend.mapper.rst.txt +37 -0
- docs/_build/html/_sources/fastfusion.frontend.rst.txt +70 -0
- docs/_build/html/_sources/fastfusion.frontend.workload.rst.txt +21 -0
- docs/_build/html/_sources/fastfusion.mapper.FFM.rst.txt +37 -0
- docs/_build/html/_sources/fastfusion.mapper.rst.txt +18 -0
- docs/_build/html/_sources/fastfusion.rst.txt +20 -0
- docs/_build/html/_sources/fastfusion.util.rst.txt +21 -0
- docs/_build/html/_sources/index.rst.txt +87 -0
- docs/_build/html/_sources/modules.rst.txt +7 -0
- docs/_build/html/_sources/notes/citation.rst.txt +45 -0
- docs/_build/html/_sources/notes/definitions.rst.txt +43 -0
- docs/_build/html/_sources/notes/faqs.rst.txt +39 -0
- docs/_build/html/_sources/notes/modeling/accelerator_energy_latency.rst.txt +72 -0
- docs/_build/html/_sources/notes/modeling/component_energy_area.rst.txt +96 -0
- docs/_build/html/_sources/notes/modeling/mapping.rst.txt +100 -0
- docs/_build/html/_sources/notes/modeling.rst.txt +33 -0
- docs/_build/html/_sources/notes/parsing/arithmetic_parsing.rst.txt +136 -0
- docs/_build/html/_sources/notes/parsing/setexpressions.rst.txt +63 -0
- docs/_build/html/_sources/notes/parsing/yaml_parsing.rst.txt +176 -0
- docs/_build/html/_sources/notes/quickstart_and_installation.rst.txt +9 -0
- docs/_build/html/_sources/notes/spec/architecture.rst.txt +133 -0
- docs/_build/html/_sources/notes/spec/mapping.rst.txt +12 -0
- docs/_build/html/_sources/notes/spec/workload.rst.txt +83 -0
- docs/_build/html/_sources/notes/spec.rst.txt +36 -0
- docs/source/_ext/include_attrs.py +213 -0
- docs/source/_ext/include_docstring.py +364 -0
- docs/source/_ext/include_functions.py +154 -0
- docs/source/_ext/include_notebook.py +131 -0
- docs/source/_ext/include_yaml.py +119 -0
- docs/source/_ext/inherited_attributes.py +222 -0
- docs/source/_ext/paths.py +4 -0
- docs/source/conf.py +79 -0
- examples/arches/compute_in_memory/_include.yaml +74 -0
- examples/arches/compute_in_memory/_include_functions.py +229 -0
- examples/arches/compute_in_memory/_load_spec.py +57 -0
- examples/arches/compute_in_memory/components/c2c_multiplier.py +181 -0
- examples/arches/compute_in_memory/components/dac_c2c_r2r.py +605 -0
- examples/arches/compute_in_memory/components/misc.py +195 -0
- examples/arches/compute_in_memory/components/util/bit_functions.py +51 -0
- examples/arches/compute_in_memory/components/zero_comparator.py +92 -0
- examples/arches/compute_in_memory/isaac.yaml +233 -0
- examples/arches/compute_in_memory/memory_cells/ecram_demo.yaml +63 -0
- examples/arches/compute_in_memory/memory_cells/rram_example.yaml +63 -0
- examples/arches/compute_in_memory/memory_cells/rram_isaac_isca_2016.yaml +64 -0
- examples/arches/compute_in_memory/memory_cells/rram_neurosim_default.yaml +63 -0
- examples/arches/compute_in_memory/memory_cells/rram_raella_isca_2023.yaml +70 -0
- examples/arches/compute_in_memory/memory_cells/rram_wan_nature_2022.yaml +63 -0
- examples/arches/compute_in_memory/memory_cells/sram_colonnade_jssc_2021.yaml +63 -0
- examples/arches/compute_in_memory/memory_cells/sram_example.yaml +63 -0
- examples/arches/compute_in_memory/memory_cells/sram_jia_jssc_2020.yaml +63 -0
- examples/arches/compute_in_memory/memory_cells/sram_sinangil_jssc_2021.yaml +63 -0
- examples/arches/compute_in_memory/memory_cells/sram_wang_vlsi_2022.yaml +63 -0
- examples/arches/compute_in_memory/wang_vlsi_2022.yaml +289 -0
- examples/arches/eyeriss.yaml +68 -0
- examples/arches/fanout_variations/at_glb.yaml +31 -0
- examples/arches/fanout_variations/at_glb_with_fanout_node.yaml +34 -0
- examples/arches/fanout_variations/at_mac.yaml +31 -0
- examples/arches/fanout_variations/at_mac_with_constraints.yaml +38 -0
- examples/arches/fanout_variations/at_mac_with_fanout_node.yaml +34 -0
- examples/arches/nvdla.yaml +47 -0
- examples/arches/simple.yaml +28 -0
- examples/arches/tpu_v4i.yaml +67 -0
- examples/mappings/unfused_matmuls_to_simple.yaml +33 -0
- examples/misc/component_annotated.yaml +33 -0
- examples/workloads/gpt3_6.7B.yaml +124 -0
- examples/workloads/matmuls.yaml +20 -0
- examples/workloads/mobilenet_28.yaml +81 -0
- examples/workloads/mobilenet_various_separate.yaml +106 -0
- examples/workloads/three_matmuls_annotated.yaml +59 -0
- notebooks/.ipynb_checkpoints/fastfusion_arch_study_michael-checkpoint.ipynb +359 -0
- notebooks/compute_in_memory/_scripts.py +339 -0
- notebooks/compute_in_memory/isaac.guide.ipynb +270 -0
- notebooks/compute_in_memory/wang_vlsi_2022.ipynb +602 -0
- notebooks/paths.py +4 -0
- notebooks/tutorials/.ipynb_checkpoints/1_FFM-checkpoint.ipynb +3110 -0
- notebooks/tutorials/FFM.ipynb +3498 -0
- notebooks/tutorials/_include.py +48 -0
- notebooks/tutorials/component_energy_area.ipynb +363 -0
- tests/Q_mapping.yaml +38 -0
- tests/__init__.py +0 -0
- tests/conv.mapping.yaml +27 -0
- tests/conv.workload.yaml +13 -0
- tests/conv_sym.mapping.yaml +43 -0
- tests/copy.mapping.yaml +35 -0
- tests/copy.workload.yaml +15 -0
- tests/distribuffers/__init__.py +0 -0
- tests/distribuffers/multicast/test_cases.yaml +482 -0
- tests/distribuffers/spec/binding/valid_bindings.yaml +97 -0
- tests/distribuffers/spec/distributed.yaml +100 -0
- tests/distribuffers/spec/logical_arch.yaml +32 -0
- tests/distribuffers/spec/physical_arch.yaml +69 -0
- tests/distribuffers/test_binding.py +48 -0
- tests/frontend/__init__.py +0 -0
- tests/frontend/test_mapping_viz.py +52 -0
- tests/mapper/__init__.py +0 -0
- tests/mapper/configs/conv1d/conv1d.mapping.yaml +31 -0
- tests/mapper/configs/conv1d/conv1d.workload.yaml +11 -0
- tests/mapper/configs/two_conv1d/two_conv1d.expected.yaml +38 -0
- tests/mapper/configs/two_conv1d/two_conv1d.mapping.yaml +54 -0
- tests/mapper/configs/two_conv1d/two_conv1d.workload.yaml +19 -0
- tests/mapper/test_mapping_to_isl.py +90 -0
- tests/mapper/test_spatial_reuse_analysis.py +67 -0
- tests/mapper/test_temporal_reuse_analysis.py +56 -0
- tests/mapper/util.py +58 -0
- tests/matmul.mapping.yaml +29 -0
- tests/matmul.workload.yaml +12 -0
- tests/matmul_spatial.mapping.yaml +44 -0
- tests/mha.renames.yaml +65 -0
- tests/mha.workload.yaml +67 -0
- tests/mha.yaml +59 -0
- tests/mha_full.workload.yaml +67 -0
- tests/mobilenet.workload.yaml +35 -0
- tests/mobilenet_long.workload.yaml +64 -0
- tests/pmappingcache.py +24 -0
- tests/processing_stage.arch.yaml +40 -0
- tests/snowcat.arch.yaml +36 -0
- tests/test_ffm_join_pmappings.py +106 -0
- tests/test_ffm_make_pmappings.py +82 -0
- tests/test_ffm_make_tile_shapes.py +49 -0
- tests/test_mapper.py +100 -0
- tests/test_model.py +37 -0
- tests/test_plotting.py +72 -0
- tests/test_processing_stage.py +46 -0
- tests/test_symbolic_model.py +248 -0
- tests/test_workload.py +141 -0

accelforge/mapper/FFM/_make_pmappings/make_pmappings.py
@@ -0,0 +1,411 @@
import logging
from math import prod

from typing import Callable, Optional
import uuid
import copy

from joblib import delayed
from tqdm import tqdm


from accelforge.frontend import arch
from accelforge.frontend.spec import Spec
from accelforge.frontend.mapping import Loop, Mapping, TensorHolder
from accelforge.frontend._workload_isl._isl import (
    get_rank_variable_bounds,
    get_tensor_size,
    get_operation_space_size,
)
from accelforge.frontend.workload import EinsumName, SymbolTable, TensorName

from accelforge.mapper.FFM._make_pmappings.make_pmapping_templates import (
    make_pmapping_templates,
)
from accelforge.frontend.mapper.metrics import Metrics
from accelforge.mapper.FFM._make_pmappings.make_pmappings_from_templates import (
    make_pmappings_from_templates,
)
from accelforge.mapper.FFM._join_pmappings.compatibility import Compatibility
from accelforge.mapper.FFM._join_pmappings.pmapping_group import PmappingGroup
from accelforge.util.parallel import (
    parallel,
    _memmap_read,
    get_n_parallel_jobs,
    is_using_parallel_processing,
)
from accelforge.mapper.FFM._make_pmappings.pmapper_job import (
    Job,
    SameCompatibilityJobs,
)

def get_rank_variable_bounds_for_all_einsums(spec: Spec):
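    # Illustration (hypothetical rank variables): bounds of {"M": 128, "K": 64}
    # for one Einsum and {"K": 64, "N": 256} for another merge into
    # {"M": 128, "K": 64, "N": 256}. If the second Einsum instead reported
    # "K": 32, the ValueError below would be raised.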
    rank_variable_bounds = {
        einsum_name: get_rank_variable_bounds(spec.workload, einsum_name)
        for einsum_name in spec.workload.einsum_names
    }
    result = {}
    for e1, rv1 in rank_variable_bounds.items():
        result.update(rv1)
        for e2, rv2 in rank_variable_bounds.items():
            for r in set(rv1.keys()) & set(rv2.keys()):
                if rv1[r] != rv2[r]:
                    raise ValueError(
                        f"Rank variable {r} has different bounds for "
                        f"einsum {e1} and {e2}: {rv1[r]} and {rv2[r]}"
                    )
    return result


def get_num_computes(spec: Spec, einsum_name: EinsumName | None = None) -> int:
    einsums = [einsum_name] if einsum_name is not None else spec.workload.einsum_names
    return sum(get_operation_space_size(spec.workload, e) for e in einsums)


def get_per_tensor_size(spec: Spec) -> dict[TensorName, int]:
    return {
        tensor: get_tensor_size(spec.workload, tensor)
        for tensor in spec.workload.tensor_names
    }


def get_jobs(
    spec: Spec,
    metrics: Metrics,
    einsum_names: list[EinsumName],
    fail_if_no_pmappings_for_einsum: bool,
) -> dict[EinsumName, dict[Compatibility, SameCompatibilityJobs]]:
    einsum2jobs = {}
    fusable_tensors = spec.workload.tensor_names_used_in_multiple_einsums
    rank_variable_bounds = get_rank_variable_bounds_for_all_einsums(spec)

    einsum2spec: dict[EinsumName, Spec] = {}
    s = "Getting energy, latency, and leak power for components running "
    pbar = tqdm(einsum_names, desc=s)
    for einsum_name in pbar:
        pbar.set_description(s + einsum_name)
        einsum2spec[einsum_name] = spec._spec_parse_expressions(
            einsum_name=einsum_name,
            _parse_arch=True,
            _parse_non_arch=False,
        ).calculate_component_area_energy_latency_leak(
            einsum_name=einsum_name,
            area=False,
        )
        einsum2spec[einsum_name] = _memmap_read(einsum2spec[einsum_name])

    def make_jobs_for_einsum(einsum_name: EinsumName, spec: Spec):
        jobs = {}
        workload_einsum = spec.workload.einsums[einsum_name]
        for flattened_arch in spec._get_flattened_architecture():
            # Create a job for this Einsum on each flattened architecture
            job = Job(
                spec=spec,
                einsum_name=einsum_name,
                metrics=metrics,
                rank_variable_bounds=rank_variable_bounds,
                flattened_arch=_memmap_read(flattened_arch),
                job_id=uuid.uuid4(),
                fusable_tensors=fusable_tensors & workload_einsum.tensor_names,
            )
            for j in make_pmapping_templates(job):
                jobs.setdefault(j.compatibility, SameCompatibilityJobs()).append(j)

        return einsum_name, jobs

    for einsum_name, jobs in parallel(
        [
            delayed(make_jobs_for_einsum)(einsum_name, spec)
            for einsum_name, spec in einsum2spec.items()
        ],
        pbar="Generating jobs",
        return_as="generator",
    ):
        einsum2jobs.setdefault(einsum_name, {})
        for compatibility, job_list in jobs.items():
            einsum2jobs[einsum_name].setdefault(
                compatibility, SameCompatibilityJobs()
            ).extend(job_list)

    if fail_if_no_pmappings_for_einsum:
        for einsum_name, jobs in einsum2jobs.items():
            if len(jobs) == 0:
                raise ValueError(
                    f"No pmappings for {einsum_name}. Was the mapspace overconstrained?"
                )

    total_jobs = sum(len(jobs) for jobs in einsum2jobs.values())
    n_procs = get_n_parallel_jobs()
    memory_limit = min(
        spec.mapper.ffm.memory_limit / n_procs,
        spec.mapper.ffm.memory_limit_per_process,
    )
    time_limit = min(
        spec.mapper.ffm.time_limit * n_procs / max(total_jobs, 1),
        spec.mapper.ffm.time_limit_per_pmapping_template,
    )
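    # E.g. (hypothetical numbers): a 3600 s total time budget, 16 parallel
    # workers, and 1200 templates allow 3600 * 16 / 1200 = 48 s per template,
    # capped by time_limit_per_pmapping_template.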
    for einsum_name, compatibility_jobs in einsum2jobs.items():
        total_jobs = sum(len(j) for j in compatibility_jobs.values())
        logging.warning(f"Einsum {einsum_name} has {total_jobs} pmapping templates:")
        for job_list in compatibility_jobs.values():
            for job in job_list:
                logging.warning(f"\t{job.mapping.compact_str()}")
                job.memory_limit = memory_limit
                job.time_limit = time_limit

    return einsum2jobs


def get_memories_to_track(
    spec: Spec,
    einsum2jobs: dict[EinsumName, list[Job]],
    metrics: Metrics,
    can_combine_multiple_runs: bool,
) -> tuple[set[str], list[str], set[str]]:
    memories_track_all = set()
    for einsum, jobs in einsum2jobs.items():
        for job in jobs:
            memories_track_all.update(
                m.name for m in job.flattened_arch if isinstance(m, arch.Memory)
            )

    memories_track_pmappings_only = []
    ignored_resources = set()

    # If we're combining the pmappings from multiple runs, we can't conclude anything
    # about which memories can be dropped from tracking
    if can_combine_multiple_runs:
        ignored_resources = memories_track_all
        return (
            memories_track_all,
            memories_track_pmappings_only,
            ignored_resources,
        )

    if Metrics.RESOURCE_USAGE in metrics:
        ignored_resources = memories_track_all
        return (
            memories_track_all,
            memories_track_pmappings_only,
            ignored_resources,
        )
    tensor_sizes = {}
    for tensor, size in get_per_tensor_size(spec).items():
        scale = 1
        for einsum in spec.workload.einsums_with_tensor(tensor):
            if einsum.tensor_accesses[tensor].persistent:
                scale = max(scale, spec.workload.n_instances * einsum.n_instances)
        tensor_sizes[tensor] = size * scale

    # If the memory is big enough to hold all the tensors, then we don't need to
    # consider it
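    # Worked example (hypothetical sizes): a memory of size 2**20 bits holding
    # tensors of 2**18 and 2**19 bits with bits_per_value_scale == 1 gives
    # usage = (2**18 + 2**19) / 2**20 = 0.75 <= 1, so it is dropped below.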
    for memory in list(memories_track_all):
        usage = 0
        for einsum in einsum2jobs.keys():
            job = einsum2jobs[einsum][0]
            try:
                mem: arch.Memory = job.spec.arch.find(memory)
            except ValueError:
                continue
            for tensor in spec.workload.einsums[einsum].tensor_names:
                if mem.size == 0:
                    usage = 2  # FAIL: can never fit; any usage > 1 keeps the memory tracked
                else:
                    scale = mem.bits_per_value_scale[tensor] / mem.size
                    usage += tensor_sizes[tensor] * scale

        if usage <= 1:
            ignored_resources.add(memory)
            print(
                f"Not tracking memory {memory}. It is big enough to hold "
                f"every workload tensor that may be stored in it. Max possible "
                f"usage: {usage * 100:.2f}%"
            )
            memories_track_all.remove(memory)

    # If the memory is below every backing tensor holder node, then we need it for
    # the pmapping exploration but can drop it immediately afterward
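    # (For example, a buffer that only ever holds tiles within a single
    # pmapping, never across fused loop iterations, is moved to
    # memories_track_pmappings_only below.)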
    for m in list(memories_track_all):
        must_track = False
        for jobs in einsum2jobs.values():
            for job in jobs:
                seen = False
                for node in job.mapping.nodes:
                    if isinstance(node, TensorHolder) and node.component == m:
                        seen = True
                        if node.persistent:
                            ignored_resources.add(m)
                        if node._backing:
                            must_track = True
                    if isinstance(node, Loop) and node._fused and seen:
                        must_track = True

        if not must_track:
            memories_track_all.remove(m)
            memories_track_pmappings_only.append(m)
            print(
                f"Not tracking memory {m} across joining stages. It is never "
                f"reserved across fused loop iterations."
            )

    return memories_track_all, memories_track_pmappings_only, ignored_resources


def make_pmappings(
    spec: Spec,
    can_combine_multiple_runs: bool,
    metrics: Metrics = Metrics.ENERGY | Metrics.LATENCY,
    einsum_names: Optional[list[EinsumName]] = None,
    fail_if_no_pmappings_for_einsum: bool | None = None,
) -> tuple[
    dict[EinsumName, list[PmappingGroup]],
    dict[EinsumName, dict[uuid.UUID, Mapping]],
    dict[EinsumName, list[Job]],
]:
    """
    Explores the pmapspace of `einsum_names` (default: all Einsums in the workload).

    Returns three per-Einsum dicts: the PmappingGroups found, the pmapping
    Mapping objects keyed by UUID, and the jobs that produced them.
    """
    spec = copy.deepcopy(spec)

    if einsum_names is None:
        einsum_names = spec.workload.einsum_names

    if fail_if_no_pmappings_for_einsum is None:
        fail_if_no_pmappings_for_einsum = not can_combine_multiple_runs

    spec = spec._spec_parse_expressions(
        _parse_arch=False,
        _parse_non_arch=True,
    )

    einsum2jobs = {}
    new_einsum2jobs = get_jobs(
        spec,
        metrics,
        einsum_names,
        fail_if_no_pmappings_for_einsum,
    )
    _fill_jobs_with_memories_to_track(
        new_einsum2jobs, spec, metrics, can_combine_multiple_runs
    )
    for einsum_name, jobs in new_einsum2jobs.items():
        einsum2jobs.setdefault(einsum_name, {})
        for compatibility, job_list in jobs.items():
            einsum2jobs[einsum_name].setdefault(
                compatibility, SameCompatibilityJobs()
            ).extend(job_list)

    calls = _allocate_jobs(einsum2jobs)

    # Sort the calls by the length of the longest mapping in each job. We get long
    # poles with the long mappings, so we want to get them done early so we don't
    # have one or two procs slowing us down at the end.
    def get_longest_mapping_length(call):
        j: SameCompatibilityJobs = call[2]["jobs_with_similar_compatibilities"]
        return max([len(j2.mapping.nodes) for j2 in j])

    calls = sorted(calls, key=get_longest_mapping_length, reverse=True)
    # # Randomly permute the calls
    # import random
    # random.shuffle(calls)

    pmapping_objects = {}
    pmapping_groups = {einsum_name: [] for einsum_name in spec.workload.einsum_names}
    return_jobs = {}
    for (
        einsum_name,
        new_pmapping_groups,
        pmappings,
        jobs_with_similar_compatibilities,
    ) in parallel(
        calls,
        pbar="Generating pmappings",
        return_as="generator_unordered",
    ):
        pmapping_groups[einsum_name].extend(new_pmapping_groups)
        pmapping_objects.setdefault(einsum_name, {}).update(pmappings)
        return_jobs.setdefault(einsum_name, []).extend(
            jobs_with_similar_compatibilities
        )

    for einsum_name in list(pmapping_groups.keys()):
        pmapping_groups[einsum_name] = PmappingGroup.combine_combineable(
            pmapping_groups[einsum_name],
            "All",
            pbar_postfix=f" for {einsum_name}",
        )

    return pmapping_groups, pmapping_objects, return_jobs


def _raise_error_if_no_pmappings(einsum2jobs):
    for einsum_name, jobs in einsum2jobs.items():
        if len(jobs) == 0:
            raise ValueError(
                f"No pmappings for {einsum_name}. Was the mapspace overconstrained?"
            )


def _allocate_jobs(einsum2jobs):
    calls = []
    for einsum_name, jobs in einsum2jobs.items():
        calls.extend(
            delayed(make_pmappings_from_templates)(
                jobs_with_similar_compatibilities=job_list,
            )
            for job_list in jobs.values()
        )

    split = False
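    # Heuristic: keep at least 4 calls per worker so a few long-running calls
    # don't leave workers idle (e.g., hypothetically, 16 workers split any
    # batch of fewer than 64 calls).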
    if (
        not split
        and is_using_parallel_processing()
        and len(calls) < get_n_parallel_jobs() * 4
    ):
        logging.warning(
            "Insufficient jobs available to utilize available threads. "
            "Splitting jobs into smaller chunks."
        )
        split = True

    if split:
        calls = []
        for einsum_name, jobs in einsum2jobs.items():
            for job_list in jobs.values():
                calls.extend(
                    delayed(make_pmappings_from_templates)(
                        jobs_with_similar_compatibilities=job,
                    )
                    for job in job_list.split()
                )
    return calls


def _fill_jobs_with_memories_to_track(
    einsum2jobs: dict[EinsumName, dict[Compatibility, SameCompatibilityJobs]],
    spec,
    metrics,
    can_combine_multiple_runs,
):
    einsum2jobs_flattened = {
        e: [j for jobs in v.values() for j in jobs] for e, v in einsum2jobs.items()
    }

    memories_track_all, memories_track_pmappings_only, ignored_resources = (
        get_memories_to_track(
            spec,
            einsum2jobs_flattened,
            metrics,
            can_combine_multiple_runs,
        )
    )
    for jobs in einsum2jobs_flattened.values():
        for j in jobs:
            j.memories_track_all = memories_track_all
            j.memories_track_pmappings_only = memories_track_pmappings_only
            j.ignored_resources = ignored_resources

accelforge/mapper/FFM/_make_pmappings/make_pmappings_from_templates/__init__.py
@@ -0,0 +1 @@
from .make_pmappings_from_templates import make_pmappings_from_templates
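
Usage sketch: a minimal driver for the new `make_pmappings` entry point,
assuming only the interfaces visible in this diff. Constructing a `Spec` is
not covered by this file, so that step is left as a placeholder; the import
path below mirrors the file's location in the wheel, and the package may
re-export it elsewhere.

    from accelforge.frontend.spec import Spec
    from accelforge.frontend.mapper.metrics import Metrics
    from accelforge.mapper.FFM._make_pmappings.make_pmappings import make_pmappings

    spec: Spec = ...  # build or load a Spec (architecture + workload); not shown in this diff

    # Per Einsum: PmappingGroups, pmapping Mapping objects keyed by UUID, and
    # the jobs that produced them.
    pmapping_groups, pmapping_objects, jobs = make_pmappings(
        spec,
        can_combine_multiple_runs=False,
        metrics=Metrics.ENERGY | Metrics.LATENCY,
    )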