accelforge 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- accelforge/__init__.py +21 -0
- accelforge/_accelerated_imports.py +16 -0
- accelforge/_deprecate/_simanneal/evalmapping.py +271 -0
- accelforge/_deprecate/_simanneal/mapspaceglobals.py +298 -0
- accelforge/_deprecate/_simanneal/simanneal.py +666 -0
- accelforge/_deprecate/_simanneal/tracking.py +105 -0
- accelforge/_deprecate/_simanneal/wrappers.py +218 -0
- accelforge/_deprecate/_simanneal2/__init__.py +7 -0
- accelforge/_deprecate/_simanneal2/simanneal.py +493 -0
- accelforge/_deprecate/_simanneal2/tracking.py +116 -0
- accelforge/_deprecate/compatibility_util.py +181 -0
- accelforge/_deprecate/layerdeduplication/__init__.py +2 -0
- accelforge/_deprecate/layerdeduplication/group_similar_einsums.py +160 -0
- accelforge/_deprecate/layerdeduplication/grouped_einsums.py +84 -0
- accelforge/_deprecate/mapping_filter_tags/__init__.py +2 -0
- accelforge/_deprecate/mapping_filter_tags/ffmt.py +212 -0
- accelforge/_deprecate/mapping_filter_tags/onesplit.py +24 -0
- accelforge/_deprecate/mapping_filter_tags/util.py +24 -0
- accelforge/_deprecate/tags.py +69 -0
- accelforge/_deprecate/viz/__init__.py +0 -0
- accelforge/_deprecate/viz/interactive.py +159 -0
- accelforge/_deprecate/viz/reservationtree.py +307 -0
- accelforge/_deprecate/viz/ski_slope.py +88 -0
- accelforge/_version.py +15 -0
- accelforge/examples.py +39 -0
- accelforge/frontend/__init__.py +10 -0
- accelforge/frontend/_binding.py +129 -0
- accelforge/frontend/_workload_isl/__init__.py +2 -0
- accelforge/frontend/_workload_isl/_isl.py +149 -0
- accelforge/frontend/_workload_isl/_symbolic.py +141 -0
- accelforge/frontend/arch copy.py +1544 -0
- accelforge/frontend/arch.py +1642 -0
- accelforge/frontend/config.py +63 -0
- accelforge/frontend/mapper/__init__.py +5 -0
- accelforge/frontend/mapper/ffm.py +126 -0
- accelforge/frontend/mapper/mapper.py +7 -0
- accelforge/frontend/mapper/metrics.py +30 -0
- accelforge/frontend/mapping/__init__.py +1 -0
- accelforge/frontend/mapping/mapping.py +1736 -0
- accelforge/frontend/model.py +14 -0
- accelforge/frontend/renames.py +150 -0
- accelforge/frontend/spec copy.py +230 -0
- accelforge/frontend/spec.py +301 -0
- accelforge/frontend/variables.py +12 -0
- accelforge/frontend/workload.py +952 -0
- accelforge/mapper/FFM/__init__.py +9 -0
- accelforge/mapper/FFM/_join_pmappings/__init__.py +0 -0
- accelforge/mapper/FFM/_join_pmappings/compatibility.py +653 -0
- accelforge/mapper/FFM/_join_pmappings/compress_pmappings.py +140 -0
- accelforge/mapper/FFM/_join_pmappings/join_pmappings.py +703 -0
- accelforge/mapper/FFM/_join_pmappings/pmapping_dataframe.py +901 -0
- accelforge/mapper/FFM/_join_pmappings/pmapping_group.py +337 -0
- accelforge/mapper/FFM/_make_pmappings/contraints/__init__.py +0 -0
- accelforge/mapper/FFM/_make_pmappings/contraints/constraints.py +360 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmapping_templates/__init__.py +1 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmapping_templates/make_loops.py +373 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmapping_templates/make_pmapping_templates.py +463 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmapping_templates/make_reservations.py +95 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmapping_templates/make_storage_order.py +382 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmapping_templates/make_storages.py +155 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmappings.py +411 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmappings_from_templates/__init__.py +1 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmappings_from_templates/make_pmappings_from_templates.py +407 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmappings_from_templates/make_tile_shapes.py +1681 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmappings_from_templates/run_model.py +170 -0
- accelforge/mapper/FFM/_make_pmappings/make_pmappings_from_templates/symbol_relations.py +174 -0
- accelforge/mapper/FFM/_make_pmappings/pmapper_job.py +282 -0
- accelforge/mapper/FFM/_pareto_df/df_convention.py +273 -0
- accelforge/mapper/FFM/_pareto_df/pareto copy.py +836 -0
- accelforge/mapper/FFM/_pareto_df/pareto.py +508 -0
- accelforge/mapper/FFM/data.py +61 -0
- accelforge/mapper/FFM/main copy.py +236 -0
- accelforge/mapper/FFM/main.py +208 -0
- accelforge/mapper/FFM/mappings.py +510 -0
- accelforge/mapper/FFM/pmappings.py +310 -0
- accelforge/mapper/__init__.py +4 -0
- accelforge/mapper.py +0 -0
- accelforge/model/__init__.py +1 -0
- accelforge/model/_looptree/__init__.py +0 -0
- accelforge/model/_looptree/accesses.py +335 -0
- accelforge/model/_looptree/capacity/__init__.py +1 -0
- accelforge/model/_looptree/capacity/aggregators.py +36 -0
- accelforge/model/_looptree/capacity/capacity.py +47 -0
- accelforge/model/_looptree/energy.py +150 -0
- accelforge/model/_looptree/equivalent_ranks.py +29 -0
- accelforge/model/_looptree/latency/__init__.py +1 -0
- accelforge/model/_looptree/latency/latency.py +98 -0
- accelforge/model/_looptree/latency/memory.py +120 -0
- accelforge/model/_looptree/latency/processors.py +92 -0
- accelforge/model/_looptree/mapping_utilities.py +71 -0
- accelforge/model/_looptree/reuse/__init__.py +4 -0
- accelforge/model/_looptree/reuse/isl/__init__.py +1 -0
- accelforge/model/_looptree/reuse/isl/des.py +59 -0
- accelforge/model/_looptree/reuse/isl/isl_functions.py +374 -0
- accelforge/model/_looptree/reuse/isl/mapping_to_isl/__init__.py +4 -0
- accelforge/model/_looptree/reuse/isl/mapping_to_isl/analyze_mapping.py +297 -0
- accelforge/model/_looptree/reuse/isl/mapping_to_isl/skews_from_mapping.py +236 -0
- accelforge/model/_looptree/reuse/isl/mapping_to_isl/tiling.py +685 -0
- accelforge/model/_looptree/reuse/isl/mapping_to_isl/types.py +188 -0
- accelforge/model/_looptree/reuse/isl/spatial.py +260 -0
- accelforge/model/_looptree/reuse/isl/temporal.py +182 -0
- accelforge/model/_looptree/reuse/symbolic/__init__.py +1 -0
- accelforge/model/_looptree/reuse/symbolic/symbolic copy 2.py +1346 -0
- accelforge/model/_looptree/reuse/symbolic/symbolic copy.py +1408 -0
- accelforge/model/_looptree/reuse/symbolic/symbolic.py +1396 -0
- accelforge/model/_looptree/run.py +122 -0
- accelforge/model/_looptree/types.py +26 -0
- accelforge/model/_looptree/visualization/__init__.py +0 -0
- accelforge/model/_looptree/visualization/occupancy.py +11 -0
- accelforge/model/main.py +222 -0
- accelforge/plotting/__init__.py +2 -0
- accelforge/plotting/mappings.py +219 -0
- accelforge/plotting/specs.py +57 -0
- accelforge/util/__init__.py +4 -0
- accelforge/util/_base_analysis_types.py +24 -0
- accelforge/util/_basetypes.py +1089 -0
- accelforge/util/_frozenset.py +36 -0
- accelforge/util/_isl.py +29 -0
- accelforge/util/_itertools.py +14 -0
- accelforge/util/_mathfuncs.py +57 -0
- accelforge/util/_parse_expressions.py +339 -0
- accelforge/util/_picklecache.py +32 -0
- accelforge/util/_setexpressions.py +268 -0
- accelforge/util/_sympy/__init__.py +0 -0
- accelforge/util/_sympy/broadcast_max.py +18 -0
- accelforge/util/_visualization.py +112 -0
- accelforge/util/_yaml.py +579 -0
- accelforge/util/parallel.py +193 -0
- accelforge-0.0.1.dist-info/METADATA +64 -0
- accelforge-0.0.1.dist-info/RECORD +258 -0
- accelforge-0.0.1.dist-info/WHEEL +5 -0
- accelforge-0.0.1.dist-info/licenses/LICENSE +19 -0
- accelforge-0.0.1.dist-info/top_level.txt +5 -0
- docs/_build/html/_sources/fastfusion.frontend.mapper.rst.txt +37 -0
- docs/_build/html/_sources/fastfusion.frontend.rst.txt +70 -0
- docs/_build/html/_sources/fastfusion.frontend.workload.rst.txt +21 -0
- docs/_build/html/_sources/fastfusion.mapper.FFM.rst.txt +37 -0
- docs/_build/html/_sources/fastfusion.mapper.rst.txt +18 -0
- docs/_build/html/_sources/fastfusion.rst.txt +20 -0
- docs/_build/html/_sources/fastfusion.util.rst.txt +21 -0
- docs/_build/html/_sources/index.rst.txt +87 -0
- docs/_build/html/_sources/modules.rst.txt +7 -0
- docs/_build/html/_sources/notes/citation.rst.txt +45 -0
- docs/_build/html/_sources/notes/definitions.rst.txt +43 -0
- docs/_build/html/_sources/notes/faqs.rst.txt +39 -0
- docs/_build/html/_sources/notes/modeling/accelerator_energy_latency.rst.txt +72 -0
- docs/_build/html/_sources/notes/modeling/component_energy_area.rst.txt +96 -0
- docs/_build/html/_sources/notes/modeling/mapping.rst.txt +100 -0
- docs/_build/html/_sources/notes/modeling.rst.txt +33 -0
- docs/_build/html/_sources/notes/parsing/arithmetic_parsing.rst.txt +136 -0
- docs/_build/html/_sources/notes/parsing/setexpressions.rst.txt +63 -0
- docs/_build/html/_sources/notes/parsing/yaml_parsing.rst.txt +176 -0
- docs/_build/html/_sources/notes/quickstart_and_installation.rst.txt +9 -0
- docs/_build/html/_sources/notes/spec/architecture.rst.txt +133 -0
- docs/_build/html/_sources/notes/spec/mapping.rst.txt +12 -0
- docs/_build/html/_sources/notes/spec/workload.rst.txt +83 -0
- docs/_build/html/_sources/notes/spec.rst.txt +36 -0
- docs/source/_ext/include_attrs.py +213 -0
- docs/source/_ext/include_docstring.py +364 -0
- docs/source/_ext/include_functions.py +154 -0
- docs/source/_ext/include_notebook.py +131 -0
- docs/source/_ext/include_yaml.py +119 -0
- docs/source/_ext/inherited_attributes.py +222 -0
- docs/source/_ext/paths.py +4 -0
- docs/source/conf.py +79 -0
- examples/arches/compute_in_memory/_include.yaml +74 -0
- examples/arches/compute_in_memory/_include_functions.py +229 -0
- examples/arches/compute_in_memory/_load_spec.py +57 -0
- examples/arches/compute_in_memory/components/c2c_multiplier.py +181 -0
- examples/arches/compute_in_memory/components/dac_c2c_r2r.py +605 -0
- examples/arches/compute_in_memory/components/misc.py +195 -0
- examples/arches/compute_in_memory/components/util/bit_functions.py +51 -0
- examples/arches/compute_in_memory/components/zero_comparator.py +92 -0
- examples/arches/compute_in_memory/isaac.yaml +233 -0
- examples/arches/compute_in_memory/memory_cells/ecram_demo.yaml +63 -0
- examples/arches/compute_in_memory/memory_cells/rram_example.yaml +63 -0
- examples/arches/compute_in_memory/memory_cells/rram_isaac_isca_2016.yaml +64 -0
- examples/arches/compute_in_memory/memory_cells/rram_neurosim_default.yaml +63 -0
- examples/arches/compute_in_memory/memory_cells/rram_raella_isca_2023.yaml +70 -0
- examples/arches/compute_in_memory/memory_cells/rram_wan_nature_2022.yaml +63 -0
- examples/arches/compute_in_memory/memory_cells/sram_colonnade_jssc_2021.yaml +63 -0
- examples/arches/compute_in_memory/memory_cells/sram_example.yaml +63 -0
- examples/arches/compute_in_memory/memory_cells/sram_jia_jssc_2020.yaml +63 -0
- examples/arches/compute_in_memory/memory_cells/sram_sinangil_jssc_2021.yaml +63 -0
- examples/arches/compute_in_memory/memory_cells/sram_wang_vlsi_2022.yaml +63 -0
- examples/arches/compute_in_memory/wang_vlsi_2022.yaml +289 -0
- examples/arches/eyeriss.yaml +68 -0
- examples/arches/fanout_variations/at_glb.yaml +31 -0
- examples/arches/fanout_variations/at_glb_with_fanout_node.yaml +34 -0
- examples/arches/fanout_variations/at_mac.yaml +31 -0
- examples/arches/fanout_variations/at_mac_with_constraints.yaml +38 -0
- examples/arches/fanout_variations/at_mac_with_fanout_node.yaml +34 -0
- examples/arches/nvdla.yaml +47 -0
- examples/arches/simple.yaml +28 -0
- examples/arches/tpu_v4i.yaml +67 -0
- examples/mappings/unfused_matmuls_to_simple.yaml +33 -0
- examples/misc/component_annotated.yaml +33 -0
- examples/workloads/gpt3_6.7B.yaml +124 -0
- examples/workloads/matmuls.yaml +20 -0
- examples/workloads/mobilenet_28.yaml +81 -0
- examples/workloads/mobilenet_various_separate.yaml +106 -0
- examples/workloads/three_matmuls_annotated.yaml +59 -0
- notebooks/.ipynb_checkpoints/fastfusion_arch_study_michael-checkpoint.ipynb +359 -0
- notebooks/compute_in_memory/_scripts.py +339 -0
- notebooks/compute_in_memory/isaac.guide.ipynb +270 -0
- notebooks/compute_in_memory/wang_vlsi_2022.ipynb +602 -0
- notebooks/paths.py +4 -0
- notebooks/tutorials/.ipynb_checkpoints/1_FFM-checkpoint.ipynb +3110 -0
- notebooks/tutorials/FFM.ipynb +3498 -0
- notebooks/tutorials/_include.py +48 -0
- notebooks/tutorials/component_energy_area.ipynb +363 -0
- tests/Q_mapping.yaml +38 -0
- tests/__init__.py +0 -0
- tests/conv.mapping.yaml +27 -0
- tests/conv.workload.yaml +13 -0
- tests/conv_sym.mapping.yaml +43 -0
- tests/copy.mapping.yaml +35 -0
- tests/copy.workload.yaml +15 -0
- tests/distribuffers/__init__.py +0 -0
- tests/distribuffers/multicast/test_cases.yaml +482 -0
- tests/distribuffers/spec/binding/valid_bindings.yaml +97 -0
- tests/distribuffers/spec/distributed.yaml +100 -0
- tests/distribuffers/spec/logical_arch.yaml +32 -0
- tests/distribuffers/spec/physical_arch.yaml +69 -0
- tests/distribuffers/test_binding.py +48 -0
- tests/frontend/__init__.py +0 -0
- tests/frontend/test_mapping_viz.py +52 -0
- tests/mapper/__init__.py +0 -0
- tests/mapper/configs/conv1d/conv1d.mapping.yaml +31 -0
- tests/mapper/configs/conv1d/conv1d.workload.yaml +11 -0
- tests/mapper/configs/two_conv1d/two_conv1d.expected.yaml +38 -0
- tests/mapper/configs/two_conv1d/two_conv1d.mapping.yaml +54 -0
- tests/mapper/configs/two_conv1d/two_conv1d.workload.yaml +19 -0
- tests/mapper/test_mapping_to_isl.py +90 -0
- tests/mapper/test_spatial_reuse_analysis.py +67 -0
- tests/mapper/test_temporal_reuse_analysis.py +56 -0
- tests/mapper/util.py +58 -0
- tests/matmul.mapping.yaml +29 -0
- tests/matmul.workload.yaml +12 -0
- tests/matmul_spatial.mapping.yaml +44 -0
- tests/mha.renames.yaml +65 -0
- tests/mha.workload.yaml +67 -0
- tests/mha.yaml +59 -0
- tests/mha_full.workload.yaml +67 -0
- tests/mobilenet.workload.yaml +35 -0
- tests/mobilenet_long.workload.yaml +64 -0
- tests/pmappingcache.py +24 -0
- tests/processing_stage.arch.yaml +40 -0
- tests/snowcat.arch.yaml +36 -0
- tests/test_ffm_join_pmappings.py +106 -0
- tests/test_ffm_make_pmappings.py +82 -0
- tests/test_ffm_make_tile_shapes.py +49 -0
- tests/test_mapper.py +100 -0
- tests/test_model.py +37 -0
- tests/test_plotting.py +72 -0
- tests/test_processing_stage.py +46 -0
- tests/test_symbolic_model.py +248 -0
- tests/test_workload.py +141 -0
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
# @INPROCEEDINGS{9830322,
|
|
2
|
+
# author={Wang, Hechen and Liu, Renzhi and Dorrance, Richard and Dasalukunte, Deepak and Liu, Xiaosen and Lake, Dan and Carlton, Brent and Wu, May},
|
|
3
|
+
# booktitle={2022 IEEE Symposium on VLSI Technology and Circuits (VLSI Technology and Circuits)},
|
|
4
|
+
# title={A 32.2 TOPS/W SRAM Compute-in-Memory Macro Employing a Linear 8b C-2C Ladder for Charge Domain Computation in 22nm for Edge Inference},
|
|
5
|
+
# year={2022},
|
|
6
|
+
# volume={},
|
|
7
|
+
# number={},
|
|
8
|
+
# pages={36-37},
|
|
9
|
+
# doi={10.1109/VLSITechnologyandCir46769.2022.9830322}}
|
|
10
|
+
#
|
|
11
|
+
# @ARTICLE{10008405,
|
|
12
|
+
# author={Wang, Hechen and Liu, Renzhi and Dorrance, Richard and Dasalukunte, Deepak and Lake, Dan and Carlton, Brent},
|
|
13
|
+
# journal={IEEE Journal of Solid-State Circuits},
|
|
14
|
+
# title={A Charge Domain SRAM Compute-in-Memory Macro With C-2C Ladder-Based 8b MAC Unit in 22-nm FinFET Process for Edge Inference},
|
|
15
|
+
# year={2023},
|
|
16
|
+
# volume={58},
|
|
17
|
+
# number={4},
|
|
18
|
+
# pages={1037-1050},
|
|
19
|
+
# doi={10.1109/JSSC.2022.3232601}}
|
|
20
|
+
|
|
21
|
+
{{include_text('_include.yaml')}}
|
|
22
|
+
{{add_to_path('./memory_cells')}}
|
|
23
|
+
|
|
24
|
+
arch:
|
|
25
|
+
arch_globals_dependent_on_workload:
|
|
26
|
+
<<: *variables_global
|
|
27
|
+
|
|
28
|
+
# ===========================================================================
|
|
29
|
+
# Encoding-dependent parameters
|
|
30
|
+
# ===========================================================================
|
|
31
|
+
encoded_input_bits: input_bits
|
|
32
|
+
encoded_weight_bits: weight_bits
|
|
33
|
+
encoded_output_bits: output_bits
|
|
34
|
+
|
|
35
|
+
input_encoding_func: offset_encode_hist
|
|
36
|
+
weight_encoding_func: offset_encode_hist
|
|
37
|
+
|
|
38
|
+
# For accuracy model. Can in-array accumulation include signed values?
|
|
39
|
+
# Signed accumulation not compatible with offset encoding (since offset
|
|
40
|
+
# encoding makes values non-negative).
|
|
41
|
+
signed_sum_across_inputs: True
|
|
42
|
+
signed_sum_across_weights: False
|
|
43
|
+
|
|
44
|
+
# ===========================================================================
|
|
45
|
+
# Architecture & CiM Array Structure
|
|
46
|
+
# ===========================================================================
|
|
47
|
+
# DEFINITIONS:
|
|
48
|
+
# - Cell: Smallest structure capable of storing memory. Note that a cell may
|
|
49
|
+
# store more than one bit. For example, a cell consisting of a RRAM
|
|
50
|
+
# device may store >1 bits, while a cell consisting of an SRAM
|
|
51
|
+
# bitcell may store only 1 bit.
|
|
52
|
+
# - CiM Unit: Smallest structure capable of computing an analog MAC.
|
|
53
|
+
# - CiM Unit Width Cells:
|
|
54
|
+
# Number of CiM unit cells that are accessed as one. These cells receive
|
|
55
|
+
# one analog input and compute one analog MAC per timestep.
|
|
56
|
+
# - CiM Unit Depth Cells:
|
|
57
|
+
# Number of independent groups of "CiM Unit Width" cells that form a CiM
|
|
58
|
+
# unit. Each of these groups is indepently addressible and operates in
|
|
59
|
+
# must be activated in a different timestep than the others.
|
|
60
|
+
|
|
61
|
+
cim_unit_width_cells: supported_weight_bits
|
|
62
|
+
cim_unit_depth_cells: 8
|
|
63
|
+
bits_per_cell: 1
|
|
64
|
+
|
|
65
|
+
# ===========================================================================
|
|
66
|
+
# Data Converters
|
|
67
|
+
# ===========================================================================
|
|
68
|
+
adc_resolution: 8
|
|
69
|
+
voltage_dac_resolution: 8
|
|
70
|
+
temporal_dac_resolution: 1
|
|
71
|
+
dac_unit_resistance: 5000
|
|
72
|
+
|
|
73
|
+
n_adc_per_bank: 16
|
|
74
|
+
|
|
75
|
+
# ===========================================================================
|
|
76
|
+
# Hardware
|
|
77
|
+
# ===========================================================================
|
|
78
|
+
base_latency: 6.4e-9
|
|
79
|
+
latency_columns_scale: dac_unit_resistance / 5000 * array_bitlines / 128
|
|
80
|
+
latency_dac_resolution_scale: voltage_dac_resolution / 8
|
|
81
|
+
# Digital clock runs at 2x analog clock speed. Don't let analog clock go
|
|
82
|
+
# faster than that
|
|
83
|
+
no_faster_than_digital: max(0.5, latency_columns_scale * latency_dac_resolution_scale)
|
|
84
|
+
# Assume temporal DAC runs no faster than 0.05ns/step
|
|
85
|
+
limited_by_temporal_dac: 0.05e-9 * (2 ** temporal_dac_resolution - 1)
|
|
86
|
+
cycle_period: max(base_latency * no_faster_than_digital * voltage_latency_scale, limited_by_temporal_dac, 2e-9)
|
|
87
|
+
read_pulse_width: cycle_period
|
|
88
|
+
|
|
89
|
+
extra_attributes_for_all_component_models:
|
|
90
|
+
<<: *cim_component_attributes
|
|
91
|
+
tech_node: tech_node
|
|
92
|
+
cycle_period: cycle_period
|
|
93
|
+
|
|
94
|
+
nodes:
|
|
95
|
+
- !ProcessingStage # DAC converts digital inputs to analog voltages
|
|
96
|
+
name: DAC
|
|
97
|
+
tensors: {keep: input}
|
|
98
|
+
direction: down
|
|
99
|
+
bits_per_action: input_bits / n_input_slices # n_input_slices reads to send an input
|
|
100
|
+
component_class: DualSidedR2RLadderDAC
|
|
101
|
+
n_parallel_instances: array_parallel_inputs # One DAC for each row
|
|
102
|
+
extra_attributes_for_component_model:
|
|
103
|
+
resolution: dac_resolution
|
|
104
|
+
unit_resistance: dac_unit_resistance
|
|
105
|
+
zero_between_values: 0
|
|
106
|
+
bit_distribution: input_bit_distribution
|
|
107
|
+
hist: hist_to_magnitude(inputs_hist)
|
|
108
|
+
|
|
109
|
+
- !ProcessingStage # Row drivers feed inputs onto the rows of the array
|
|
110
|
+
name: RowDrivers
|
|
111
|
+
tensors: {keep: input}
|
|
112
|
+
direction: down
|
|
113
|
+
bits_per_action: input_bits / n_input_slices # n_input_slices reads to send an input
|
|
114
|
+
component_class: ArrayRowDrivers
|
|
115
|
+
|
|
116
|
+
- !ProcessingStage # Weight drivers write weights to the array
|
|
117
|
+
name: WeightDrivers
|
|
118
|
+
tensors: {keep: weight & Above} # Only program the weights if they're not backed in the CiM units
|
|
119
|
+
direction: down
|
|
120
|
+
component_class: ArrayRowDrivers
|
|
121
|
+
bits_per_action: weight_bits / n_weight_slices # n_weight_slices writes to send a weight
|
|
122
|
+
extra_attributes_for_component_model: {<<: [*weight_drivers_attributes]}
|
|
123
|
+
|
|
124
|
+
- !ProcessingStage # Select different sets of weights each timestep
|
|
125
|
+
name: SelectWordlineDrivers
|
|
126
|
+
tensors: {keep: input}
|
|
127
|
+
direction: down
|
|
128
|
+
bits_per_action: input_bits / n_input_slices # n_input_slices reads to send an input
|
|
129
|
+
component_class: ArrayRowDrivers
|
|
130
|
+
extra_attributes_for_component_model:
|
|
131
|
+
# Unlike normal row drivers, this is always fully asserted to select a row
|
|
132
|
+
average_input_value: 1
|
|
133
|
+
|
|
134
|
+
- !ProcessingStage # Column readout (ADC)
|
|
135
|
+
name: ADC
|
|
136
|
+
tensors: {keep: output}
|
|
137
|
+
direction: up
|
|
138
|
+
component_class: ADC
|
|
139
|
+
bits_per_action: output_bits / n_sliced_psums
|
|
140
|
+
energy_scale: adc_energy_scale
|
|
141
|
+
area_scale: adc_area_scale
|
|
142
|
+
extra_attributes_for_component_model:
|
|
143
|
+
n_bits: adc_resolution
|
|
144
|
+
throughput_scale: 1 # 1 cycle to process all outputs
|
|
145
|
+
throughput: 1 / cycle_period * cols_active_at_once * throughput_scale
|
|
146
|
+
|
|
147
|
+
- !ProcessingStage # Column drivers
|
|
148
|
+
name: ColumnDrivers
|
|
149
|
+
tensors: {keep: output}
|
|
150
|
+
direction: up
|
|
151
|
+
component_class: ArrayColumnDrivers
|
|
152
|
+
bits_per_action: output_bits / n_sliced_psums
|
|
153
|
+
actions: [{name: read, latency: cycle_period / cols_active_at_once}]
|
|
154
|
+
|
|
155
|
+
- !Fanout
|
|
156
|
+
name: ColumnOfSubBanks
|
|
157
|
+
spatial:
|
|
158
|
+
- name: array_reuse_input # Special name that determines array size
|
|
159
|
+
fanout: 16
|
|
160
|
+
usage_scale: n_weight_slices
|
|
161
|
+
reuse: input
|
|
162
|
+
min_usage: 1
|
|
163
|
+
|
|
164
|
+
# Column bandwidth limiter to limit write speed (only one value can be written per
|
|
165
|
+
# column per cycle)
|
|
166
|
+
- !ProcessingStage
|
|
167
|
+
name: ColumnBandwidthLimiter
|
|
168
|
+
# Keep weight and output tensors. Don't keep anything if it doesn't leave the array.
|
|
169
|
+
tensors: {keep: (weight | output) & Above}
|
|
170
|
+
direction: down
|
|
171
|
+
component_class: Dummy
|
|
172
|
+
|
|
173
|
+
# Each time a weight slice is written to the array, consume 0.5 "bits". 0.5 because
|
|
174
|
+
# the digital clock (writing weights) runs at 2x the speed of the analog clock. Each
|
|
175
|
+
# time a sliced psum is read from the array, consume 1 "bit"
|
|
176
|
+
bits_per_value_scale:
|
|
177
|
+
weight: n_weight_slices / weight_bits / 2
|
|
178
|
+
output: n_sliced_psums / output_bits
|
|
179
|
+
All - (weight | output): 0 # Don't care
|
|
180
|
+
|
|
181
|
+
# One cycle period per "bit"
|
|
182
|
+
actions: [{name: read, latency: cycle_period}]
|
|
183
|
+
|
|
184
|
+
- !Fanout # Each sub-bank receives a different input slice. Sub-banks share outputs.
|
|
185
|
+
name: SubBank
|
|
186
|
+
spatial:
|
|
187
|
+
- name: array_reuse_output # Special name that determines array size
|
|
188
|
+
fanout: 64
|
|
189
|
+
reuse: output
|
|
190
|
+
min_usage: 1
|
|
191
|
+
|
|
192
|
+
# This is the CiM unit that stores weights and computes MACs. Each CiM unit stores a
|
|
193
|
+
# different weight slice of up to cim_unit_width_cells bits. It may also store up to
|
|
194
|
+
# cim_unit_depth_cells independently-addressable weight slices, but may only compute
|
|
195
|
+
# MACs on one slice at a time. One of these components represents a collection of CiM
|
|
196
|
+
# units, that together hold one weight.
|
|
197
|
+
- !Memory
|
|
198
|
+
name: CimUnit
|
|
199
|
+
component_class: MemoryCell
|
|
200
|
+
size: cim_unit_width_cells * cim_unit_depth_cells * bits_per_cell * n_weight_slices
|
|
201
|
+
# Requires (n_weight_slices * n_input_slices) computes to fully use one weight
|
|
202
|
+
bits_per_action: weight.bits_per_value / n_weight_slices / n_input_slices
|
|
203
|
+
# Bind together n_weight_slices instances to hold one weight
|
|
204
|
+
n_parallel_instances: n_weight_slices
|
|
205
|
+
extra_attributes_for_component_model:
|
|
206
|
+
n_instances: cim_unit_width_cells * cim_unit_depth_cells
|
|
207
|
+
tensors: {keep: weight, no_refetch_from_above: weight, force_memory_hierarchy_order: False}
|
|
208
|
+
# NeuroSim-returned results are too high for this component, so override the latency
|
|
209
|
+
actions: [{name: read, latency: cycle_period}]
|
|
210
|
+
|
|
211
|
+
- !ProcessingStage # Digital port of the C-2C multiplier. Weights enter here.
|
|
212
|
+
name: C2CMultiplier
|
|
213
|
+
tensors: {keep: input}
|
|
214
|
+
direction: down
|
|
215
|
+
component_class: C2CMultiplier
|
|
216
|
+
# Requires (n_weight_slices * n_input_slices) computes to fully use one weight
|
|
217
|
+
bits_per_action: input.bits_per_value / n_weight_slices / n_input_slices
|
|
218
|
+
extra_attributes_for_component_model: &c2c_params
|
|
219
|
+
resolution: cim_unit_width_cells
|
|
220
|
+
a_hist: inputs_hist
|
|
221
|
+
b_bit_distribution: weight_bit_distribution
|
|
222
|
+
unit_capacitance: 2e-15
|
|
223
|
+
|
|
224
|
+
- !ProcessingStage # Analog port of the C-2C multiplier. Inputs enter here.
|
|
225
|
+
name: C2CMultiplierPortB
|
|
226
|
+
tensors: {keep: weight}
|
|
227
|
+
direction: down
|
|
228
|
+
component_class: C2CMultiplierPortB
|
|
229
|
+
# Accessed n_sliced_psums times to fully read out a weight and create a psum with it
|
|
230
|
+
bits_per_action: weight.bits_per_value / n_sliced_psums
|
|
231
|
+
extra_attributes_for_component_model: *c2c_params
|
|
232
|
+
|
|
233
|
+
# We account for compute energy in the CimUnit reads
|
|
234
|
+
- !Compute
|
|
235
|
+
name: FreeCompute
|
|
236
|
+
component_class: Dummy
|
|
237
|
+
enabled: len(All) == 3
|
|
238
|
+
|
|
239
|
+
# These variables pertain to the workload, microarch, and circuits. They should
|
|
240
|
+
# be matched between architectures when comparing for a fair comparison.
|
|
241
|
+
# Furthermore, this file should follow the same format for all architectures
|
|
242
|
+
# such that we can mix and match architectures with different iso files.
|
|
243
|
+
variables:
|
|
244
|
+
# ===========================================================================
|
|
245
|
+
# Workload, microarch, circuits. Things that should be matched
|
|
246
|
+
# between architectures when comparing.
|
|
247
|
+
# ===========================================================================
|
|
248
|
+
# Set by CiM processor if these values are available in the workload.
|
|
249
|
+
# Otherwise, use the defaults here.
|
|
250
|
+
inputs_hist: [1, 2, 3, 4, 3, 2, 1]
|
|
251
|
+
weights_hist: [1, 1, 1, 1, 1, 1, 1]
|
|
252
|
+
outputs_hist: inputs_hist
|
|
253
|
+
|
|
254
|
+
## Microarch ----------------------------------------------------------------
|
|
255
|
+
supported_input_bits: 8 # Maximum input bits supported by the arch.
|
|
256
|
+
supported_weight_bits: 8 # Maximum weight bits supported by the arch.
|
|
257
|
+
supported_output_bits: 8 # Maximum output bits supported by the arch.
|
|
258
|
+
min_supported_input_bits: 8 # Minimum input bits supported by the arch.
|
|
259
|
+
min_supported_weight_bits: 8 # Minimum weight bits supported by the arch.
|
|
260
|
+
min_supported_output_bits: 8 # Minimum output bits supported by the arch.
|
|
261
|
+
|
|
262
|
+
# Circuits ------------------------------------------------------------------
|
|
263
|
+
voltage: 1
|
|
264
|
+
tech_node: 22e-9 # nm
|
|
265
|
+
cell_config: "{{find_path('sram_wang_vlsi_2022.yaml')}}"
|
|
266
|
+
voltage_energy_scale: (voltage / 1) ** 2
|
|
267
|
+
voltage_latency_scale: (0.7 / voltage) ** 1.1
|
|
268
|
+
|
|
269
|
+
# Calibration ---------------------------------------------------------------
|
|
270
|
+
adc_energy_scale: 3.6 * voltage_energy_scale
|
|
271
|
+
adc_area_scale: 0.4
|
|
272
|
+
row_col_drivers_area_scale: 1
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
# This workload is sized to get peak throughput & energy efficiency.
|
|
276
|
+
workload:
|
|
277
|
+
rank_sizes:
|
|
278
|
+
M: 1
|
|
279
|
+
N: 16
|
|
280
|
+
K: 64
|
|
281
|
+
B: 1
|
|
282
|
+
|
|
283
|
+
einsums:
|
|
284
|
+
- name: Matmul
|
|
285
|
+
tensor_accesses:
|
|
286
|
+
- {name: input, projection: [b, m, k], bits_per_value: 8}
|
|
287
|
+
- {name: weight, projection: [b, k, n], bits_per_value: 8}
|
|
288
|
+
- {name: output, projection: [b, m, n], output: True, bits_per_value: 8}
|
|
289
|
+
renames: {} # Not needed for this workload
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
arch:
|
|
2
|
+
extra_attributes_for_all_component_models: {tech_node: 65e-9}
|
|
3
|
+
# ============================================================
|
|
4
|
+
# Architecture Description
|
|
5
|
+
# ============================================================
|
|
6
|
+
nodes: # Top-level is hierarchical
|
|
7
|
+
- !Memory # DRAM main memory
|
|
8
|
+
name: MainMemory
|
|
9
|
+
component_class: lpddr4
|
|
10
|
+
size: inf
|
|
11
|
+
|
|
12
|
+
- !Memory
|
|
13
|
+
name: GlobalBuffer
|
|
14
|
+
component_class: SmartBufferSRAM
|
|
15
|
+
size: 1024 * 1024 # 1Mb
|
|
16
|
+
# 32 reads and writes per cycle, 200MHz. Note that the bits per read/write is the
|
|
17
|
+
# bits per action set below.
|
|
18
|
+
total_latency: (read_actions + write_actions) / 32 / 200e6
|
|
19
|
+
extra_attributes_for_component_model: {n_banks: 32}
|
|
20
|
+
actions:
|
|
21
|
+
- {name: read, bits_per_action: 64}
|
|
22
|
+
- {name: write, bits_per_action: 64}
|
|
23
|
+
tensors: {keep: ~MainMemory, may_keep: All}
|
|
24
|
+
|
|
25
|
+
- !Fanout
|
|
26
|
+
name: ArrayFanout
|
|
27
|
+
spatial:
|
|
28
|
+
- {name: reuse_weight, fanout: 14, may_reuse: weight, reuse: weight, min_usage: 1}
|
|
29
|
+
- {name: reuse_output, fanout: 12, may_reuse: output, reuse: output, min_usage: 1}
|
|
30
|
+
|
|
31
|
+
- !Memory # Input scratchpad
|
|
32
|
+
name: InputScratchpad
|
|
33
|
+
component_class: SmartBufferSRAM
|
|
34
|
+
size: 12 * 16 # 12 16b entries
|
|
35
|
+
# One read, one write per cycle, 200MHz. Note bits per action is set below.
|
|
36
|
+
total_latency: max(read_actions / 200e6 + write_actions / 200e6)
|
|
37
|
+
tensors: {keep: input}
|
|
38
|
+
actions:
|
|
39
|
+
- {name: read, bits_per_action: 16}
|
|
40
|
+
- {name: write, bits_per_action: 16}
|
|
41
|
+
|
|
42
|
+
- !Memory # Weight scratchpad
|
|
43
|
+
name: WeightScratchpad
|
|
44
|
+
component_class: SmartBufferSRAM
|
|
45
|
+
size: 192 * 16 # 12 16b entries
|
|
46
|
+
# One read, one write per cycle, 200MHz. Note bits per action is set below.
|
|
47
|
+
total_latency: max(read_actions / 200e6 + write_actions / 200e6)
|
|
48
|
+
tensors: {keep: weight}
|
|
49
|
+
actions:
|
|
50
|
+
- {name: read, bits_per_action: 16}
|
|
51
|
+
- {name: write, bits_per_action: 16}
|
|
52
|
+
|
|
53
|
+
- !Memory # Output scratchpad
|
|
54
|
+
name: OutputScratchpad
|
|
55
|
+
component_class: SmartBufferSRAM
|
|
56
|
+
size: 16 * 16 # 16 16b entries
|
|
57
|
+
# One read, one write per cycle, 200MHz. Note bits per action is set below.
|
|
58
|
+
total_latency: max(read_actions / 200e6 + write_actions / 200e6)
|
|
59
|
+
tensors: {keep: output}
|
|
60
|
+
actions:
|
|
61
|
+
- {name: read, bits_per_action: 16}
|
|
62
|
+
- {name: write, bits_per_action: 16}
|
|
63
|
+
|
|
64
|
+
- !Compute # MAC unit
|
|
65
|
+
name: MAC
|
|
66
|
+
component_class: IntMAC
|
|
67
|
+
total_latency: compute_actions / 200e6
|
|
68
|
+
extra_attributes_for_component_model: {multiplier_width: 8, adder_width: 16}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
arch:
|
|
2
|
+
nodes:
|
|
3
|
+
- !Memory
|
|
4
|
+
name: MainMemory
|
|
5
|
+
size: inf
|
|
6
|
+
leak_power: 0
|
|
7
|
+
area: 0
|
|
8
|
+
tensors: {keep: ~Intermediates, may_keep: All}
|
|
9
|
+
actions:
|
|
10
|
+
- {name: read, energy: 1, latency: 0}
|
|
11
|
+
- {name: write, energy: 1, latency: 0}
|
|
12
|
+
|
|
13
|
+
- !Memory
|
|
14
|
+
name: GlobalBuffer
|
|
15
|
+
size: inf #100e6
|
|
16
|
+
leak_power: 0
|
|
17
|
+
area: 0
|
|
18
|
+
tensors: {keep: All}
|
|
19
|
+
spatial:
|
|
20
|
+
- name: X
|
|
21
|
+
fanout: 4
|
|
22
|
+
actions:
|
|
23
|
+
- {name: read, energy: 0, latency: 0}
|
|
24
|
+
- {name: write, energy: 0, latency: 0}
|
|
25
|
+
|
|
26
|
+
- !Compute
|
|
27
|
+
name: MAC
|
|
28
|
+
leak_power: 0
|
|
29
|
+
area: 0
|
|
30
|
+
actions:
|
|
31
|
+
- {name: compute, energy: 0, latency: 1}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
arch:
|
|
2
|
+
nodes:
|
|
3
|
+
- !Memory
|
|
4
|
+
name: MainMemory
|
|
5
|
+
size: inf
|
|
6
|
+
leak_power: 0
|
|
7
|
+
area: 0
|
|
8
|
+
tensors: {keep: ~Intermediates, may_keep: All}
|
|
9
|
+
actions:
|
|
10
|
+
- {name: read, energy: 1, latency: 0}
|
|
11
|
+
- {name: write, energy: 1, latency: 0}
|
|
12
|
+
|
|
13
|
+
- !Fanout
|
|
14
|
+
name: GlobalBufferArray
|
|
15
|
+
spatial:
|
|
16
|
+
- name: X
|
|
17
|
+
fanout: 4
|
|
18
|
+
|
|
19
|
+
- !Memory
|
|
20
|
+
name: GlobalBuffer
|
|
21
|
+
size: inf #100e6
|
|
22
|
+
leak_power: 0
|
|
23
|
+
area: 0
|
|
24
|
+
tensors: {keep: All}
|
|
25
|
+
actions:
|
|
26
|
+
- {name: read, energy: 0, latency: 0}
|
|
27
|
+
- {name: write, energy: 0, latency: 0}
|
|
28
|
+
|
|
29
|
+
- !Compute
|
|
30
|
+
name: MAC
|
|
31
|
+
leak_power: 0
|
|
32
|
+
area: 0
|
|
33
|
+
actions:
|
|
34
|
+
- {name: compute, energy: 0, latency: 1}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
arch:
|
|
2
|
+
nodes:
|
|
3
|
+
- !Memory
|
|
4
|
+
name: MainMemory
|
|
5
|
+
size: inf
|
|
6
|
+
leak_power: 0
|
|
7
|
+
area: 0
|
|
8
|
+
tensors: {keep: ~Intermediates, may_keep: All}
|
|
9
|
+
actions:
|
|
10
|
+
- {name: read, energy: 1, latency: 0}
|
|
11
|
+
- {name: write, energy: 1, latency: 0}
|
|
12
|
+
|
|
13
|
+
- !Memory
|
|
14
|
+
name: GlobalBuffer
|
|
15
|
+
size: inf #100e6
|
|
16
|
+
leak_power: 0
|
|
17
|
+
area: 0
|
|
18
|
+
tensors: {keep: All}
|
|
19
|
+
actions:
|
|
20
|
+
- {name: read, energy: 0, latency: 0}
|
|
21
|
+
- {name: write, energy: 0, latency: 0}
|
|
22
|
+
|
|
23
|
+
- !Compute
|
|
24
|
+
name: MAC
|
|
25
|
+
leak_power: 0
|
|
26
|
+
area: 0
|
|
27
|
+
spatial:
|
|
28
|
+
- name: X
|
|
29
|
+
fanout: 4
|
|
30
|
+
actions:
|
|
31
|
+
- {name: compute, energy: 0, latency: 1}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
arch:
|
|
2
|
+
nodes:
|
|
3
|
+
- !Memory
|
|
4
|
+
name: MainMemory
|
|
5
|
+
size: inf
|
|
6
|
+
leak_power: 0
|
|
7
|
+
area: 0
|
|
8
|
+
tensors: {keep: ~Intermediates, may_keep: All}
|
|
9
|
+
actions:
|
|
10
|
+
- {name: read, energy: 1, latency: 0}
|
|
11
|
+
- {name: write, energy: 1, latency: 0}
|
|
12
|
+
|
|
13
|
+
- !Memory
|
|
14
|
+
name: GlobalBuffer
|
|
15
|
+
size: inf #100e6
|
|
16
|
+
leak_power: 0
|
|
17
|
+
area: 0
|
|
18
|
+
tensors: {keep: All}
|
|
19
|
+
actions:
|
|
20
|
+
- {name: read, energy: 0, latency: 0}
|
|
21
|
+
- {name: write, energy: 0, latency: 0}
|
|
22
|
+
|
|
23
|
+
- !Fanout
|
|
24
|
+
name: MACArray
|
|
25
|
+
spatial:
|
|
26
|
+
- name: X
|
|
27
|
+
fanout: 4
|
|
28
|
+
loop_bounds:
|
|
29
|
+
- expression: ~m
|
|
30
|
+
operator: ==
|
|
31
|
+
value: 1
|
|
32
|
+
|
|
33
|
+
- !Compute
|
|
34
|
+
name: MAC
|
|
35
|
+
leak_power: 0
|
|
36
|
+
area: 0
|
|
37
|
+
actions:
|
|
38
|
+
- {name: compute, energy: 0, latency: 1}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
arch:
|
|
2
|
+
nodes:
|
|
3
|
+
- !Memory
|
|
4
|
+
name: MainMemory
|
|
5
|
+
size: inf
|
|
6
|
+
leak_power: 0
|
|
7
|
+
area: 0
|
|
8
|
+
tensors: {keep: ~Intermediates, may_keep: All}
|
|
9
|
+
actions:
|
|
10
|
+
- {name: read, energy: 1, latency: 0}
|
|
11
|
+
- {name: write, energy: 1, latency: 0}
|
|
12
|
+
|
|
13
|
+
- !Memory
|
|
14
|
+
name: GlobalBuffer
|
|
15
|
+
size: inf #100e6
|
|
16
|
+
leak_power: 0
|
|
17
|
+
area: 0
|
|
18
|
+
tensors: {keep: All}
|
|
19
|
+
actions:
|
|
20
|
+
- {name: read, energy: 0, latency: 0}
|
|
21
|
+
- {name: write, energy: 0, latency: 0}
|
|
22
|
+
|
|
23
|
+
- !Fanout
|
|
24
|
+
name: MACArray
|
|
25
|
+
spatial:
|
|
26
|
+
- name: X
|
|
27
|
+
fanout: 4
|
|
28
|
+
|
|
29
|
+
- !Compute
|
|
30
|
+
name: MAC
|
|
31
|
+
leak_power: 0
|
|
32
|
+
area: 0
|
|
33
|
+
actions:
|
|
34
|
+
- {name: compute, energy: 0, latency: 1}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
arch:
|
|
2
|
+
nodes:
|
|
3
|
+
- !Memory
|
|
4
|
+
name: MainMemory
|
|
5
|
+
size: inf
|
|
6
|
+
leak_power: 0
|
|
7
|
+
actions:
|
|
8
|
+
# Energy is upper end of the range from the TPU paper. The lower end came from
|
|
9
|
+
# their reference, and they said it left out some things. Latency is 38.4 GB/s.
|
|
10
|
+
# DDR5-4800. Chip runs at 1 GHz, so divide to get per-cycle bandwidth.
|
|
11
|
+
# https://www.jedec.org/news/pressreleases/jedec-updates-standard-low-power-memory-devices-lpddr5
|
|
12
|
+
- {name: read, energy: 7.03e-12, latency: 1 / (8 * 38.4e9)}
|
|
13
|
+
- {name: write, energy: 7.03e-12, latency: 1 / (8 * 38.4e9)}
|
|
14
|
+
tensors: {keep: ~Intermediates, may_keep: All}
|
|
15
|
+
|
|
16
|
+
- !Memory
|
|
17
|
+
name: GlobalBuffer
|
|
18
|
+
size: 1024*64*8 # 64 kB
|
|
19
|
+
total_latency: max(read_latency, write_latency) # Separate ports
|
|
20
|
+
leak_power: 0
|
|
21
|
+
actions:
|
|
22
|
+
# 512 GB/s read, 128 GB/s write
|
|
23
|
+
- {name: read, energy: 0.249e-12, latency: 1 / 512e9 / 8}
|
|
24
|
+
- {name: write, energy: 0.293e-12, latency: 1 / 128e9 / 8}
|
|
25
|
+
tensors: {keep: All}
|
|
26
|
+
|
|
27
|
+
- !Fanout
|
|
28
|
+
name: ArrayFanout
|
|
29
|
+
spatial:
|
|
30
|
+
- {name: reuse_input, fanout: 32, may_reuse: input, reuse: input, min_usage: 1}
|
|
31
|
+
- {name: reuse_output, fanout: 192, may_reuse: output, reuse: output, min_usage: 1}
|
|
32
|
+
|
|
33
|
+
- !Memory
|
|
34
|
+
name: Register
|
|
35
|
+
size: weight.bits_per_value
|
|
36
|
+
area: 0
|
|
37
|
+
leak_power: 0
|
|
38
|
+
actions:
|
|
39
|
+
- {name: read, energy: 0, latency: 0}
|
|
40
|
+
- {name: write, energy: 0, latency: 0}
|
|
41
|
+
tensors: {keep: weight}
|
|
42
|
+
|
|
43
|
+
- !Compute
|
|
44
|
+
name: MAC
|
|
45
|
+
leak_power: 0
|
|
46
|
+
actions:
|
|
47
|
+
- {name: compute, energy: 0.084e-12, latency: 1 / 1e9}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
arch:
|
|
2
|
+
nodes:
|
|
3
|
+
- !Memory
|
|
4
|
+
name: MainMemory
|
|
5
|
+
size: inf
|
|
6
|
+
leak_power: 0
|
|
7
|
+
area: 0
|
|
8
|
+
tensors: {keep: ~Intermediates, may_keep: All}
|
|
9
|
+
actions:
|
|
10
|
+
- {name: read, energy: 1, latency: 0}
|
|
11
|
+
- {name: write, energy: 1, latency: 0}
|
|
12
|
+
|
|
13
|
+
- !Memory
|
|
14
|
+
name: GlobalBuffer
|
|
15
|
+
size: inf #100e6
|
|
16
|
+
leak_power: 0
|
|
17
|
+
area: 0
|
|
18
|
+
tensors: {keep: All}
|
|
19
|
+
actions:
|
|
20
|
+
- {name: read, energy: 0, latency: 0}
|
|
21
|
+
- {name: write, energy: 0, latency: 0}
|
|
22
|
+
|
|
23
|
+
- !Compute
|
|
24
|
+
name: MAC
|
|
25
|
+
leak_power: 0
|
|
26
|
+
area: 0
|
|
27
|
+
actions:
|
|
28
|
+
- {name: compute, energy: 0, latency: 1}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
arch:
|
|
2
|
+
nodes:
|
|
3
|
+
- !Memory
|
|
4
|
+
name: MainMemory
|
|
5
|
+
size: inf
|
|
6
|
+
leak_power: 0
|
|
7
|
+
area: 0 # Don't include off-chip DRAM area
|
|
8
|
+
actions:
|
|
9
|
+
# Upper end of the range from the TPU paper. The lower end came from their
|
|
10
|
+
# reference, and they said it left out some things.
|
|
11
|
+
- {name: read, energy: 7.03e-12, latency: 1 / (8 * 614e9)}
|
|
12
|
+
- {name: write, energy: 7.03e-12, latency: 1 / (8 * 614e9)}
|
|
13
|
+
tensors: {keep: ~Intermediates, may_keep: All}
|
|
14
|
+
|
|
15
|
+
- !Memory
|
|
16
|
+
name: GlobalBuffer
|
|
17
|
+
size: 1024*1024*128*8 # 128MB
|
|
18
|
+
total_latency: max(read_latency, write_latency) # Separate ports
|
|
19
|
+
leak_power: 0
|
|
20
|
+
area: 112e-6 # 112 mm^2
|
|
21
|
+
actions:
|
|
22
|
+
- {name: read, energy: 1.88e-12, latency: 1 / (8 * 2048e9)}
|
|
23
|
+
- {name: write, energy: 2.36e-12, latency: 1 / (8 * 1024e9)}
|
|
24
|
+
tensors: {keep: ~MainMemory.tensors, may_keep: All}
|
|
25
|
+
|
|
26
|
+
- !Memory
|
|
27
|
+
name: LocalBuffer
|
|
28
|
+
spatial: [{name: Z, fanout: 4, may_reuse: Nothing, min_usage: 1}]
|
|
29
|
+
size: 1024*1024*4*8 # 4MB
|
|
30
|
+
leak_power: 0
|
|
31
|
+
area: 50e-6 # 50 mm^2. Very rough estimate based on die photo.
|
|
32
|
+
actions:
|
|
33
|
+
- {name: read, energy: 0.249e-12, latency: 0}
|
|
34
|
+
- {name: write, energy: 0.293e-12, latency: 0}
|
|
35
|
+
tensors: {keep: input | output}
|
|
36
|
+
|
|
37
|
+
- !Compute
|
|
38
|
+
name: ScalarUnit
|
|
39
|
+
area: 10e-6 # NOTE(review): comment says 10 um^2, but 10e-6 m^2 = 10 mm^2; elsewhere 10 um^2 is written as 1e-11 (see Register) — confirm whether the value or the unit is wrong. Very rough estimate based on die photo.
|
|
40
|
+
leak_power: 0
|
|
41
|
+
actions:
|
|
42
|
+
- {name: compute, energy: 0, latency: 1 / 1.05e9 / 128}
|
|
43
|
+
enabled: len(All) == 2
|
|
44
|
+
|
|
45
|
+
- !Fanout
|
|
46
|
+
name: ArrayFanout
|
|
47
|
+
spatial:
|
|
48
|
+
- {name: reuse_input, fanout: 128, may_reuse: input, reuse: input, min_usage: 1}
|
|
49
|
+
- {name: reuse_output, fanout: 128, may_reuse: output, reuse: output, min_usage: 1}
|
|
50
|
+
|
|
51
|
+
- !Memory
|
|
52
|
+
name: Register
|
|
53
|
+
size: weight.bits_per_value if weight else 0
|
|
54
|
+
area: 1e-11 # 10 um^2. Very rough estimate based on die photo.
|
|
55
|
+
leak_power: 0
|
|
56
|
+
actions:
|
|
57
|
+
- {name: read, energy: 0, latency: 0}
|
|
58
|
+
- {name: write, energy: 0, latency: 0}
|
|
59
|
+
tensors: {keep: weight}
|
|
60
|
+
|
|
61
|
+
- !Compute
|
|
62
|
+
name: MAC
|
|
63
|
+
leak_power: 0
|
|
64
|
+
area: 9e-11 # 90 um^2. Very rough estimate based on die photo.
|
|
65
|
+
actions:
|
|
66
|
+
- {name: compute, energy: 0.084e-12, latency: 1 / 1.05e9}
|
|
67
|
+
enabled: len(All) == 3
|