accelforge 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (258)
  1. accelforge/__init__.py +21 -0
  2. accelforge/_accelerated_imports.py +16 -0
  3. accelforge/_deprecate/_simanneal/evalmapping.py +271 -0
  4. accelforge/_deprecate/_simanneal/mapspaceglobals.py +298 -0
  5. accelforge/_deprecate/_simanneal/simanneal.py +666 -0
  6. accelforge/_deprecate/_simanneal/tracking.py +105 -0
  7. accelforge/_deprecate/_simanneal/wrappers.py +218 -0
  8. accelforge/_deprecate/_simanneal2/__init__.py +7 -0
  9. accelforge/_deprecate/_simanneal2/simanneal.py +493 -0
  10. accelforge/_deprecate/_simanneal2/tracking.py +116 -0
  11. accelforge/_deprecate/compatibility_util.py +181 -0
  12. accelforge/_deprecate/layerdeduplication/__init__.py +2 -0
  13. accelforge/_deprecate/layerdeduplication/group_similar_einsums.py +160 -0
  14. accelforge/_deprecate/layerdeduplication/grouped_einsums.py +84 -0
  15. accelforge/_deprecate/mapping_filter_tags/__init__.py +2 -0
  16. accelforge/_deprecate/mapping_filter_tags/ffmt.py +212 -0
  17. accelforge/_deprecate/mapping_filter_tags/onesplit.py +24 -0
  18. accelforge/_deprecate/mapping_filter_tags/util.py +24 -0
  19. accelforge/_deprecate/tags.py +69 -0
  20. accelforge/_deprecate/viz/__init__.py +0 -0
  21. accelforge/_deprecate/viz/interactive.py +159 -0
  22. accelforge/_deprecate/viz/reservationtree.py +307 -0
  23. accelforge/_deprecate/viz/ski_slope.py +88 -0
  24. accelforge/_version.py +15 -0
  25. accelforge/examples.py +39 -0
  26. accelforge/frontend/__init__.py +10 -0
  27. accelforge/frontend/_binding.py +129 -0
  28. accelforge/frontend/_workload_isl/__init__.py +2 -0
  29. accelforge/frontend/_workload_isl/_isl.py +149 -0
  30. accelforge/frontend/_workload_isl/_symbolic.py +141 -0
  31. accelforge/frontend/arch copy.py +1544 -0
  32. accelforge/frontend/arch.py +1642 -0
  33. accelforge/frontend/config.py +63 -0
  34. accelforge/frontend/mapper/__init__.py +5 -0
  35. accelforge/frontend/mapper/ffm.py +126 -0
  36. accelforge/frontend/mapper/mapper.py +7 -0
  37. accelforge/frontend/mapper/metrics.py +30 -0
  38. accelforge/frontend/mapping/__init__.py +1 -0
  39. accelforge/frontend/mapping/mapping.py +1736 -0
  40. accelforge/frontend/model.py +14 -0
  41. accelforge/frontend/renames.py +150 -0
  42. accelforge/frontend/spec copy.py +230 -0
  43. accelforge/frontend/spec.py +301 -0
  44. accelforge/frontend/variables.py +12 -0
  45. accelforge/frontend/workload.py +952 -0
  46. accelforge/mapper/FFM/__init__.py +9 -0
  47. accelforge/mapper/FFM/_join_pmappings/__init__.py +0 -0
  48. accelforge/mapper/FFM/_join_pmappings/compatibility.py +653 -0
  49. accelforge/mapper/FFM/_join_pmappings/compress_pmappings.py +140 -0
  50. accelforge/mapper/FFM/_join_pmappings/join_pmappings.py +703 -0
  51. accelforge/mapper/FFM/_join_pmappings/pmapping_dataframe.py +901 -0
  52. accelforge/mapper/FFM/_join_pmappings/pmapping_group.py +337 -0
  53. accelforge/mapper/FFM/_make_pmappings/contraints/__init__.py +0 -0
  54. accelforge/mapper/FFM/_make_pmappings/contraints/constraints.py +360 -0
  55. accelforge/mapper/FFM/_make_pmappings/make_pmapping_templates/__init__.py +1 -0
  56. accelforge/mapper/FFM/_make_pmappings/make_pmapping_templates/make_loops.py +373 -0
  57. accelforge/mapper/FFM/_make_pmappings/make_pmapping_templates/make_pmapping_templates.py +463 -0
  58. accelforge/mapper/FFM/_make_pmappings/make_pmapping_templates/make_reservations.py +95 -0
  59. accelforge/mapper/FFM/_make_pmappings/make_pmapping_templates/make_storage_order.py +382 -0
  60. accelforge/mapper/FFM/_make_pmappings/make_pmapping_templates/make_storages.py +155 -0
  61. accelforge/mapper/FFM/_make_pmappings/make_pmappings.py +411 -0
  62. accelforge/mapper/FFM/_make_pmappings/make_pmappings_from_templates/__init__.py +1 -0
  63. accelforge/mapper/FFM/_make_pmappings/make_pmappings_from_templates/make_pmappings_from_templates.py +407 -0
  64. accelforge/mapper/FFM/_make_pmappings/make_pmappings_from_templates/make_tile_shapes.py +1681 -0
  65. accelforge/mapper/FFM/_make_pmappings/make_pmappings_from_templates/run_model.py +170 -0
  66. accelforge/mapper/FFM/_make_pmappings/make_pmappings_from_templates/symbol_relations.py +174 -0
  67. accelforge/mapper/FFM/_make_pmappings/pmapper_job.py +282 -0
  68. accelforge/mapper/FFM/_pareto_df/df_convention.py +273 -0
  69. accelforge/mapper/FFM/_pareto_df/pareto copy.py +836 -0
  70. accelforge/mapper/FFM/_pareto_df/pareto.py +508 -0
  71. accelforge/mapper/FFM/data.py +61 -0
  72. accelforge/mapper/FFM/main copy.py +236 -0
  73. accelforge/mapper/FFM/main.py +208 -0
  74. accelforge/mapper/FFM/mappings.py +510 -0
  75. accelforge/mapper/FFM/pmappings.py +310 -0
  76. accelforge/mapper/__init__.py +4 -0
  77. accelforge/mapper.py +0 -0
  78. accelforge/model/__init__.py +1 -0
  79. accelforge/model/_looptree/__init__.py +0 -0
  80. accelforge/model/_looptree/accesses.py +335 -0
  81. accelforge/model/_looptree/capacity/__init__.py +1 -0
  82. accelforge/model/_looptree/capacity/aggregators.py +36 -0
  83. accelforge/model/_looptree/capacity/capacity.py +47 -0
  84. accelforge/model/_looptree/energy.py +150 -0
  85. accelforge/model/_looptree/equivalent_ranks.py +29 -0
  86. accelforge/model/_looptree/latency/__init__.py +1 -0
  87. accelforge/model/_looptree/latency/latency.py +98 -0
  88. accelforge/model/_looptree/latency/memory.py +120 -0
  89. accelforge/model/_looptree/latency/processors.py +92 -0
  90. accelforge/model/_looptree/mapping_utilities.py +71 -0
  91. accelforge/model/_looptree/reuse/__init__.py +4 -0
  92. accelforge/model/_looptree/reuse/isl/__init__.py +1 -0
  93. accelforge/model/_looptree/reuse/isl/des.py +59 -0
  94. accelforge/model/_looptree/reuse/isl/isl_functions.py +374 -0
  95. accelforge/model/_looptree/reuse/isl/mapping_to_isl/__init__.py +4 -0
  96. accelforge/model/_looptree/reuse/isl/mapping_to_isl/analyze_mapping.py +297 -0
  97. accelforge/model/_looptree/reuse/isl/mapping_to_isl/skews_from_mapping.py +236 -0
  98. accelforge/model/_looptree/reuse/isl/mapping_to_isl/tiling.py +685 -0
  99. accelforge/model/_looptree/reuse/isl/mapping_to_isl/types.py +188 -0
  100. accelforge/model/_looptree/reuse/isl/spatial.py +260 -0
  101. accelforge/model/_looptree/reuse/isl/temporal.py +182 -0
  102. accelforge/model/_looptree/reuse/symbolic/__init__.py +1 -0
  103. accelforge/model/_looptree/reuse/symbolic/symbolic copy 2.py +1346 -0
  104. accelforge/model/_looptree/reuse/symbolic/symbolic copy.py +1408 -0
  105. accelforge/model/_looptree/reuse/symbolic/symbolic.py +1396 -0
  106. accelforge/model/_looptree/run.py +122 -0
  107. accelforge/model/_looptree/types.py +26 -0
  108. accelforge/model/_looptree/visualization/__init__.py +0 -0
  109. accelforge/model/_looptree/visualization/occupancy.py +11 -0
  110. accelforge/model/main.py +222 -0
  111. accelforge/plotting/__init__.py +2 -0
  112. accelforge/plotting/mappings.py +219 -0
  113. accelforge/plotting/specs.py +57 -0
  114. accelforge/util/__init__.py +4 -0
  115. accelforge/util/_base_analysis_types.py +24 -0
  116. accelforge/util/_basetypes.py +1089 -0
  117. accelforge/util/_frozenset.py +36 -0
  118. accelforge/util/_isl.py +29 -0
  119. accelforge/util/_itertools.py +14 -0
  120. accelforge/util/_mathfuncs.py +57 -0
  121. accelforge/util/_parse_expressions.py +339 -0
  122. accelforge/util/_picklecache.py +32 -0
  123. accelforge/util/_setexpressions.py +268 -0
  124. accelforge/util/_sympy/__init__.py +0 -0
  125. accelforge/util/_sympy/broadcast_max.py +18 -0
  126. accelforge/util/_visualization.py +112 -0
  127. accelforge/util/_yaml.py +579 -0
  128. accelforge/util/parallel.py +193 -0
  129. accelforge-0.0.1.dist-info/METADATA +64 -0
  130. accelforge-0.0.1.dist-info/RECORD +258 -0
  131. accelforge-0.0.1.dist-info/WHEEL +5 -0
  132. accelforge-0.0.1.dist-info/licenses/LICENSE +19 -0
  133. accelforge-0.0.1.dist-info/top_level.txt +5 -0
  134. docs/_build/html/_sources/fastfusion.frontend.mapper.rst.txt +37 -0
  135. docs/_build/html/_sources/fastfusion.frontend.rst.txt +70 -0
  136. docs/_build/html/_sources/fastfusion.frontend.workload.rst.txt +21 -0
  137. docs/_build/html/_sources/fastfusion.mapper.FFM.rst.txt +37 -0
  138. docs/_build/html/_sources/fastfusion.mapper.rst.txt +18 -0
  139. docs/_build/html/_sources/fastfusion.rst.txt +20 -0
  140. docs/_build/html/_sources/fastfusion.util.rst.txt +21 -0
  141. docs/_build/html/_sources/index.rst.txt +87 -0
  142. docs/_build/html/_sources/modules.rst.txt +7 -0
  143. docs/_build/html/_sources/notes/citation.rst.txt +45 -0
  144. docs/_build/html/_sources/notes/definitions.rst.txt +43 -0
  145. docs/_build/html/_sources/notes/faqs.rst.txt +39 -0
  146. docs/_build/html/_sources/notes/modeling/accelerator_energy_latency.rst.txt +72 -0
  147. docs/_build/html/_sources/notes/modeling/component_energy_area.rst.txt +96 -0
  148. docs/_build/html/_sources/notes/modeling/mapping.rst.txt +100 -0
  149. docs/_build/html/_sources/notes/modeling.rst.txt +33 -0
  150. docs/_build/html/_sources/notes/parsing/arithmetic_parsing.rst.txt +136 -0
  151. docs/_build/html/_sources/notes/parsing/setexpressions.rst.txt +63 -0
  152. docs/_build/html/_sources/notes/parsing/yaml_parsing.rst.txt +176 -0
  153. docs/_build/html/_sources/notes/quickstart_and_installation.rst.txt +9 -0
  154. docs/_build/html/_sources/notes/spec/architecture.rst.txt +133 -0
  155. docs/_build/html/_sources/notes/spec/mapping.rst.txt +12 -0
  156. docs/_build/html/_sources/notes/spec/workload.rst.txt +83 -0
  157. docs/_build/html/_sources/notes/spec.rst.txt +36 -0
  158. docs/source/_ext/include_attrs.py +213 -0
  159. docs/source/_ext/include_docstring.py +364 -0
  160. docs/source/_ext/include_functions.py +154 -0
  161. docs/source/_ext/include_notebook.py +131 -0
  162. docs/source/_ext/include_yaml.py +119 -0
  163. docs/source/_ext/inherited_attributes.py +222 -0
  164. docs/source/_ext/paths.py +4 -0
  165. docs/source/conf.py +79 -0
  166. examples/arches/compute_in_memory/_include.yaml +74 -0
  167. examples/arches/compute_in_memory/_include_functions.py +229 -0
  168. examples/arches/compute_in_memory/_load_spec.py +57 -0
  169. examples/arches/compute_in_memory/components/c2c_multiplier.py +181 -0
  170. examples/arches/compute_in_memory/components/dac_c2c_r2r.py +605 -0
  171. examples/arches/compute_in_memory/components/misc.py +195 -0
  172. examples/arches/compute_in_memory/components/util/bit_functions.py +51 -0
  173. examples/arches/compute_in_memory/components/zero_comparator.py +92 -0
  174. examples/arches/compute_in_memory/isaac.yaml +233 -0
  175. examples/arches/compute_in_memory/memory_cells/ecram_demo.yaml +63 -0
  176. examples/arches/compute_in_memory/memory_cells/rram_example.yaml +63 -0
  177. examples/arches/compute_in_memory/memory_cells/rram_isaac_isca_2016.yaml +64 -0
  178. examples/arches/compute_in_memory/memory_cells/rram_neurosim_default.yaml +63 -0
  179. examples/arches/compute_in_memory/memory_cells/rram_raella_isca_2023.yaml +70 -0
  180. examples/arches/compute_in_memory/memory_cells/rram_wan_nature_2022.yaml +63 -0
  181. examples/arches/compute_in_memory/memory_cells/sram_colonnade_jssc_2021.yaml +63 -0
  182. examples/arches/compute_in_memory/memory_cells/sram_example.yaml +63 -0
  183. examples/arches/compute_in_memory/memory_cells/sram_jia_jssc_2020.yaml +63 -0
  184. examples/arches/compute_in_memory/memory_cells/sram_sinangil_jssc_2021.yaml +63 -0
  185. examples/arches/compute_in_memory/memory_cells/sram_wang_vlsi_2022.yaml +63 -0
  186. examples/arches/compute_in_memory/wang_vlsi_2022.yaml +289 -0
  187. examples/arches/eyeriss.yaml +68 -0
  188. examples/arches/fanout_variations/at_glb.yaml +31 -0
  189. examples/arches/fanout_variations/at_glb_with_fanout_node.yaml +34 -0
  190. examples/arches/fanout_variations/at_mac.yaml +31 -0
  191. examples/arches/fanout_variations/at_mac_with_constraints.yaml +38 -0
  192. examples/arches/fanout_variations/at_mac_with_fanout_node.yaml +34 -0
  193. examples/arches/nvdla.yaml +47 -0
  194. examples/arches/simple.yaml +28 -0
  195. examples/arches/tpu_v4i.yaml +67 -0
  196. examples/mappings/unfused_matmuls_to_simple.yaml +33 -0
  197. examples/misc/component_annotated.yaml +33 -0
  198. examples/workloads/gpt3_6.7B.yaml +124 -0
  199. examples/workloads/matmuls.yaml +20 -0
  200. examples/workloads/mobilenet_28.yaml +81 -0
  201. examples/workloads/mobilenet_various_separate.yaml +106 -0
  202. examples/workloads/three_matmuls_annotated.yaml +59 -0
  203. notebooks/.ipynb_checkpoints/fastfusion_arch_study_michael-checkpoint.ipynb +359 -0
  204. notebooks/compute_in_memory/_scripts.py +339 -0
  205. notebooks/compute_in_memory/isaac.guide.ipynb +270 -0
  206. notebooks/compute_in_memory/wang_vlsi_2022.ipynb +602 -0
  207. notebooks/paths.py +4 -0
  208. notebooks/tutorials/.ipynb_checkpoints/1_FFM-checkpoint.ipynb +3110 -0
  209. notebooks/tutorials/FFM.ipynb +3498 -0
  210. notebooks/tutorials/_include.py +48 -0
  211. notebooks/tutorials/component_energy_area.ipynb +363 -0
  212. tests/Q_mapping.yaml +38 -0
  213. tests/__init__.py +0 -0
  214. tests/conv.mapping.yaml +27 -0
  215. tests/conv.workload.yaml +13 -0
  216. tests/conv_sym.mapping.yaml +43 -0
  217. tests/copy.mapping.yaml +35 -0
  218. tests/copy.workload.yaml +15 -0
  219. tests/distribuffers/__init__.py +0 -0
  220. tests/distribuffers/multicast/test_cases.yaml +482 -0
  221. tests/distribuffers/spec/binding/valid_bindings.yaml +97 -0
  222. tests/distribuffers/spec/distributed.yaml +100 -0
  223. tests/distribuffers/spec/logical_arch.yaml +32 -0
  224. tests/distribuffers/spec/physical_arch.yaml +69 -0
  225. tests/distribuffers/test_binding.py +48 -0
  226. tests/frontend/__init__.py +0 -0
  227. tests/frontend/test_mapping_viz.py +52 -0
  228. tests/mapper/__init__.py +0 -0
  229. tests/mapper/configs/conv1d/conv1d.mapping.yaml +31 -0
  230. tests/mapper/configs/conv1d/conv1d.workload.yaml +11 -0
  231. tests/mapper/configs/two_conv1d/two_conv1d.expected.yaml +38 -0
  232. tests/mapper/configs/two_conv1d/two_conv1d.mapping.yaml +54 -0
  233. tests/mapper/configs/two_conv1d/two_conv1d.workload.yaml +19 -0
  234. tests/mapper/test_mapping_to_isl.py +90 -0
  235. tests/mapper/test_spatial_reuse_analysis.py +67 -0
  236. tests/mapper/test_temporal_reuse_analysis.py +56 -0
  237. tests/mapper/util.py +58 -0
  238. tests/matmul.mapping.yaml +29 -0
  239. tests/matmul.workload.yaml +12 -0
  240. tests/matmul_spatial.mapping.yaml +44 -0
  241. tests/mha.renames.yaml +65 -0
  242. tests/mha.workload.yaml +67 -0
  243. tests/mha.yaml +59 -0
  244. tests/mha_full.workload.yaml +67 -0
  245. tests/mobilenet.workload.yaml +35 -0
  246. tests/mobilenet_long.workload.yaml +64 -0
  247. tests/pmappingcache.py +24 -0
  248. tests/processing_stage.arch.yaml +40 -0
  249. tests/snowcat.arch.yaml +36 -0
  250. tests/test_ffm_join_pmappings.py +106 -0
  251. tests/test_ffm_make_pmappings.py +82 -0
  252. tests/test_ffm_make_tile_shapes.py +49 -0
  253. tests/test_mapper.py +100 -0
  254. tests/test_model.py +37 -0
  255. tests/test_plotting.py +72 -0
  256. tests/test_processing_stage.py +46 -0
  257. tests/test_symbolic_model.py +248 -0
  258. tests/test_workload.py +141 -0
examples/arches/compute_in_memory/wang_vlsi_2022.yaml
@@ -0,0 +1,289 @@
+ # @INPROCEEDINGS{9830322,
+ #   author={Wang, Hechen and Liu, Renzhi and Dorrance, Richard and Dasalukunte, Deepak and Liu, Xiaosen and Lake, Dan and Carlton, Brent and Wu, May},
+ #   booktitle={2022 IEEE Symposium on VLSI Technology and Circuits (VLSI Technology and Circuits)},
+ #   title={A 32.2 TOPS/W SRAM Compute-in-Memory Macro Employing a Linear 8b C-2C Ladder for Charge Domain Computation in 22nm for Edge Inference},
+ #   year={2022},
+ #   volume={},
+ #   number={},
+ #   pages={36-37},
+ #   doi={10.1109/VLSITechnologyandCir46769.2022.9830322}}
+ #
+ # @ARTICLE{10008405,
+ #   author={Wang, Hechen and Liu, Renzhi and Dorrance, Richard and Dasalukunte, Deepak and Lake, Dan and Carlton, Brent},
+ #   journal={IEEE Journal of Solid-State Circuits},
+ #   title={A Charge Domain SRAM Compute-in-Memory Macro With C-2C Ladder-Based 8b MAC Unit in 22-nm FinFET Process for Edge Inference},
+ #   year={2023},
+ #   volume={58},
+ #   number={4},
+ #   pages={1037-1050},
+ #   doi={10.1109/JSSC.2022.3232601}}
+
+ {{include_text('_include.yaml')}}
+ {{add_to_path('./memory_cells')}}
+
+ arch:
+   arch_globals_dependent_on_workload:
+     <<: *variables_global
+
+     # =========================================================================
+     # Encoding-dependent parameters
+     # =========================================================================
+     encoded_input_bits: input_bits
+     encoded_weight_bits: weight_bits
+     encoded_output_bits: output_bits
+
+     input_encoding_func: offset_encode_hist
+     weight_encoding_func: offset_encode_hist
+
+     # For the accuracy model. Can in-array accumulation include signed values?
+     # Signed accumulation is not compatible with offset encoding (since offset
+     # encoding makes values non-negative).
+     signed_sum_across_inputs: True
+     signed_sum_across_weights: False
+
+     # =========================================================================
+     # Architecture & CiM Array Structure
+     # =========================================================================
+     # DEFINITIONS:
+     # - Cell: Smallest structure capable of storing memory. Note that a cell
+     #   may store more than one bit. For example, a cell consisting of an RRAM
+     #   device may store >1 bits, while a cell consisting of an SRAM bitcell
+     #   may store only 1 bit.
+     # - CiM Unit: Smallest structure capable of computing an analog MAC.
+     # - CiM Unit Width Cells:
+     #   Number of CiM unit cells that are accessed as one. These cells receive
+     #   one analog input and compute one analog MAC per timestep.
+     # - CiM Unit Depth Cells:
+     #   Number of independent groups of "CiM Unit Width" cells that form a CiM
+     #   unit. Each of these groups is independently addressable and must be
+     #   activated in a different timestep than the others.
+
+     cim_unit_width_cells: supported_weight_bits
+     cim_unit_depth_cells: 8
+     bits_per_cell: 1
+
+     # =========================================================================
+     # Data Converters
+     # =========================================================================
+     adc_resolution: 8
+     voltage_dac_resolution: 8
+     temporal_dac_resolution: 1
+     dac_unit_resistance: 5000
+
+     n_adc_per_bank: 16
+
+     # =========================================================================
+     # Hardware
+     # =========================================================================
+     base_latency: 6.4e-9
+     latency_columns_scale: dac_unit_resistance / 5000 * array_bitlines / 128
+     latency_dac_resolution_scale: voltage_dac_resolution / 8
+     # The digital clock runs at 2x the analog clock speed. Don't let the
+     # analog clock go faster than that.
+     no_faster_than_digital: max(0.5, latency_columns_scale * latency_dac_resolution_scale)
+     # Assume the temporal DAC runs no faster than 0.05ns/step
+     limited_by_temporal_dac: 0.05e-9 * (2 ** temporal_dac_resolution - 1)
+     cycle_period: max(base_latency * no_faster_than_digital * voltage_latency_scale, limited_by_temporal_dac, 2e-9)
+     read_pulse_width: cycle_period
+
+   extra_attributes_for_all_component_models:
+     <<: *cim_component_attributes
+     tech_node: tech_node
+     cycle_period: cycle_period
+
+   nodes:
+   - !ProcessingStage # DAC converts digital inputs to analog voltages
+     name: DAC
+     tensors: {keep: input}
+     direction: down
+     bits_per_action: input_bits / n_input_slices # n_input_slices reads to send an input
+     component_class: DualSidedR2RLadderDAC
+     n_parallel_instances: array_parallel_inputs # One DAC for each row
+     extra_attributes_for_component_model:
+       resolution: dac_resolution
+       unit_resistance: dac_unit_resistance
+       zero_between_values: 0
+       bit_distribution: input_bit_distribution
+       hist: hist_to_magnitude(inputs_hist)
+
+   - !ProcessingStage # Row drivers feed inputs onto the rows of the array
+     name: RowDrivers
+     tensors: {keep: input}
+     direction: down
+     bits_per_action: input_bits / n_input_slices # n_input_slices reads to send an input
+     component_class: ArrayRowDrivers
+
+   - !ProcessingStage # Weight drivers write weights to the array
+     name: WeightDrivers
+     tensors: {keep: weight & Above} # Only program weights that aren't already resident in the CiM units
+     direction: down
+     component_class: ArrayRowDrivers
+     bits_per_action: weight_bits / n_weight_slices # n_weight_slices writes to send a weight
+     extra_attributes_for_component_model: {<<: [*weight_drivers_attributes]}
+
+   - !ProcessingStage # Select different sets of weights each timestep
+     name: SelectWordlineDrivers
+     tensors: {keep: input}
+     direction: down
+     bits_per_action: input_bits / n_input_slices # n_input_slices reads to send an input
+     component_class: ArrayRowDrivers
+     extra_attributes_for_component_model:
+       # Unlike normal row drivers, this is always fully asserted to select a row
+       average_input_value: 1
+
+   - !ProcessingStage # Column readout (ADC)
+     name: ADC
+     tensors: {keep: output}
+     direction: up
+     component_class: ADC
+     bits_per_action: output_bits / n_sliced_psums
+     energy_scale: adc_energy_scale
+     area_scale: adc_area_scale
+     extra_attributes_for_component_model:
+       n_bits: adc_resolution
+       throughput_scale: 1 # 1 cycle to process all outputs
+       throughput: 1 / cycle_period * cols_active_at_once * throughput_scale
+
+   - !ProcessingStage # Column drivers
+     name: ColumnDrivers
+     tensors: {keep: output}
+     direction: up
+     component_class: ArrayColumnDrivers
+     bits_per_action: output_bits / n_sliced_psums
+     actions: [{name: read, latency: cycle_period / cols_active_at_once}]
+
+   - !Fanout
+     name: ColumnOfSubBanks
+     spatial:
+     - name: array_reuse_input # Special name that determines array size
+       fanout: 16
+       usage_scale: n_weight_slices
+       reuse: input
+       min_usage: 1
+
+   # Column bandwidth limiter to limit write speed (only one value can be
+   # written per column per cycle)
+   - !ProcessingStage
+     name: ColumnBandwidthLimiter
+     # Keep weight and output tensors. Don't keep anything if it doesn't leave the array.
+     tensors: {keep: (weight | output) & Above}
+     direction: down
+     component_class: Dummy
+
+     # Each time a weight slice is written to the array, consume 0.5 "bits". 0.5 because
+     # the digital clock (writing weights) runs at 2x the speed of the analog clock. Each
+     # time a sliced psum is read from the array, consume 1 "bit".
+     bits_per_value_scale:
+       weight: n_weight_slices / weight_bits / 2
+       output: n_sliced_psums / output_bits
+       All - (weight | output): 0 # Don't care
+
+     # One cycle period per "bit"
+     actions: [{name: read, latency: cycle_period}]
+
+   - !Fanout # Each sub-bank receives a different input slice. Sub-banks share outputs.
+     name: SubBank
+     spatial:
+     - name: array_reuse_output # Special name that determines array size
+       fanout: 64
+       reuse: output
+       min_usage: 1
+
+   # This is the CiM unit that stores weights and computes MACs. Each CiM unit stores a
+   # different weight slice of up to cim_unit_width_cells bits. It may also store up to
+   # cim_unit_depth_cells independently-addressable weight slices, but may only compute
+   # MACs on one slice at a time. One of these components represents a collection of CiM
+   # units that together hold one weight.
+   - !Memory
+     name: CimUnit
+     component_class: MemoryCell
+     size: cim_unit_width_cells * cim_unit_depth_cells * bits_per_cell * n_weight_slices
+     # Requires (n_weight_slices * n_input_slices) computes to fully use one weight
+     bits_per_action: weight.bits_per_value / n_weight_slices / n_input_slices
+     # Bind together n_weight_slices instances to hold one weight
+     n_parallel_instances: n_weight_slices
+     extra_attributes_for_component_model:
+       n_instances: cim_unit_width_cells * cim_unit_depth_cells
+     tensors: {keep: weight, no_refetch_from_above: weight, force_memory_hierarchy_order: False}
+     # NeuroSim-returned results are too high for this component, so override the latency
+     actions: [{name: read, latency: cycle_period}]
+
+   - !ProcessingStage # Digital port of the C-2C multiplier. Weights enter here.
+     name: C2CMultiplier
+     tensors: {keep: input}
+     direction: down
+     component_class: C2CMultiplier
+     # Requires (n_weight_slices * n_input_slices) computes to fully use one weight
+     bits_per_action: input.bits_per_value / n_weight_slices / n_input_slices
+     extra_attributes_for_component_model: &c2c_params
+       resolution: cim_unit_width_cells
+       a_hist: inputs_hist
+       b_bit_distribution: weight_bit_distribution
+       unit_capacitance: 2e-15
+
+   - !ProcessingStage # Analog port of the C-2C multiplier. Inputs enter here.
+     name: C2CMultiplierPortB
+     tensors: {keep: weight}
+     direction: down
+     component_class: C2CMultiplierPortB
+     # Accessed n_sliced_psums times to fully read out a weight and create a psum with it
+     bits_per_action: weight.bits_per_value / n_sliced_psums
+     extra_attributes_for_component_model: *c2c_params
+
+   # We account for compute energy in the CimUnit reads
+   - !Compute
+     name: FreeCompute
+     component_class: Dummy
+     enabled: len(All) == 3
+
+ # These variables pertain to the workload, microarch, and circuits. They
+ # should be matched across architectures for a fair comparison. Furthermore,
+ # this file should follow the same format for all architectures so that we
+ # can mix and match architectures with different iso files.
+ variables:
+   # ===========================================================================
+   # Workload, microarch, circuits. Things that should be matched
+   # between architectures when comparing.
+   # ===========================================================================
+   # Set by the CiM processor if these values are available in the workload.
+   # Otherwise, use the defaults here.
+   inputs_hist: [1, 2, 3, 4, 3, 2, 1]
+   weights_hist: [1, 1, 1, 1, 1, 1, 1]
+   outputs_hist: inputs_hist
+
+   # Microarch -----------------------------------------------------------------
+   supported_input_bits: 8      # Maximum input bits supported by the arch.
+   supported_weight_bits: 8     # Maximum weight bits supported by the arch.
+   supported_output_bits: 8     # Maximum output bits supported by the arch.
+   min_supported_input_bits: 8  # Minimum input bits supported by the arch.
+   min_supported_weight_bits: 8 # Minimum weight bits supported by the arch.
+   min_supported_output_bits: 8 # Minimum output bits supported by the arch.
+
+   # Circuits ------------------------------------------------------------------
+   voltage: 1
+   tech_node: 22e-9 # 22 nm
+   cell_config: "{{find_path('sram_wang_vlsi_2022.yaml')}}"
+   voltage_energy_scale: (voltage / 1) ** 2
+   voltage_latency_scale: (0.7 / voltage) ** 1.1
+
+   # Calibration ---------------------------------------------------------------
+   adc_energy_scale: 3.6 * voltage_energy_scale
+   adc_area_scale: 0.4
+   row_col_drivers_area_scale: 1
+
+ # This workload is sized to get peak throughput & energy efficiency.
+ workload:
+   rank_sizes:
+     M: 1
+     N: 16
+     K: 64
+     B: 1
+
+   einsums:
+   - name: Matmul
+     tensor_accesses:
+     - {name: input, projection: [b, m, k], bits_per_value: 8}
+     - {name: weight, projection: [b, k, n], bits_per_value: 8}
+     - {name: output, projection: [b, m, n], output: True, bits_per_value: 8}
+     renames: {} # Not needed for this workload
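
Editor's note: a quick way to sanity-check the Hardware section's timing math is to evaluate cycle_period by hand. The sketch below is a minimal Python rendering of the expressions above, using this file's defaults; array_bitlines is not defined in this excerpt, so the value 128 (which makes latency_columns_scale equal 1) is an assumption.

    # Minimal sketch, not accelforge code: hand-evaluate cycle_period using the
    # defaults in wang_vlsi_2022.yaml.
    base_latency = 6.4e-9
    dac_unit_resistance = 5000
    voltage_dac_resolution = 8
    temporal_dac_resolution = 1
    voltage = 1
    array_bitlines = 128  # assumed; not defined in this excerpt

    voltage_latency_scale = (0.7 / voltage) ** 1.1  # ~0.675
    latency_columns_scale = dac_unit_resistance / 5000 * array_bitlines / 128
    latency_dac_resolution_scale = voltage_dac_resolution / 8
    no_faster_than_digital = max(0.5, latency_columns_scale * latency_dac_resolution_scale)
    limited_by_temporal_dac = 0.05e-9 * (2 ** temporal_dac_resolution - 1)
    cycle_period = max(
        base_latency * no_faster_than_digital * voltage_latency_scale,
        limited_by_temporal_dac,
        2e-9,
    )
    print(cycle_period)  # ~4.32e-9: the ~4.3 ns analog cycle dominates the 2 ns floor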
examples/arches/eyeriss.yaml
@@ -0,0 +1,68 @@
+ arch:
+   extra_attributes_for_all_component_models: {tech_node: 65e-9}
+   # ============================================================
+   # Architecture Description
+   # ============================================================
+   nodes: # Top-level is hierarchical
+   - !Memory # DRAM main memory
+     name: MainMemory
+     component_class: lpddr4
+     size: inf
+
+   - !Memory
+     name: GlobalBuffer
+     component_class: SmartBufferSRAM
+     size: 1024 * 1024 # 1 Mib
+     # 32 reads and writes per cycle, 200MHz. Note that the bits per read/write is the
+     # bits per action set below.
+     total_latency: (read_actions + write_actions) / 32 / 200e6
+     extra_attributes_for_component_model: {n_banks: 32}
+     actions:
+     - {name: read, bits_per_action: 64}
+     - {name: write, bits_per_action: 64}
+     tensors: {keep: ~MainMemory, may_keep: All}
+
+   - !Fanout
+     name: ArrayFanout
+     spatial:
+     - {name: reuse_weight, fanout: 14, may_reuse: weight, reuse: weight, min_usage: 1}
+     - {name: reuse_output, fanout: 12, may_reuse: output, reuse: output, min_usage: 1}
+
+   - !Memory # Input scratchpad
+     name: InputScratchpad
+     component_class: SmartBufferSRAM
+     size: 12 * 16 # 12 16b entries
+     # One read and one write per cycle, 200MHz. Note bits per action is set below.
+     total_latency: max(read_actions / 200e6, write_actions / 200e6)
+     tensors: {keep: input}
+     actions:
+     - {name: read, bits_per_action: 16}
+     - {name: write, bits_per_action: 16}
+
+   - !Memory # Weight scratchpad
+     name: WeightScratchpad
+     component_class: SmartBufferSRAM
+     size: 192 * 16 # 192 16b entries
+     # One read and one write per cycle, 200MHz. Note bits per action is set below.
+     total_latency: max(read_actions / 200e6, write_actions / 200e6)
+     tensors: {keep: weight}
+     actions:
+     - {name: read, bits_per_action: 16}
+     - {name: write, bits_per_action: 16}
+
+   - !Memory # Output scratchpad
+     name: OutputScratchpad
+     component_class: SmartBufferSRAM
+     size: 16 * 16 # 16 16b entries
+     # One read and one write per cycle, 200MHz. Note bits per action is set below.
+     total_latency: max(read_actions / 200e6, write_actions / 200e6)
+     tensors: {keep: output}
+     actions:
+     - {name: read, bits_per_action: 16}
+     - {name: write, bits_per_action: 16}
+
+   - !Compute # MAC unit
+     name: MAC
+     component_class: IntMAC
+     total_latency: compute_actions / 200e6
+     extra_attributes_for_component_model: {multiplier_width: 8, adder_width: 16}
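
Editor's note: the two total_latency forms in this file model different port arrangements. The GlobalBuffer shares 32 banks between reads and writes, while each scratchpad has one read port and one write port, so the busier stream sets the latency. A minimal sketch of the difference, with hypothetical access counts:

    # Hypothetical access counts, for illustration only.
    read_actions, write_actions = 10_000, 2_000
    f = 200e6  # 200 MHz clock

    # GlobalBuffer: reads and writes share 32 banks, so traffic serializes across banks.
    glb_latency = (read_actions + write_actions) / 32 / f

    # Scratchpads: separate read and write ports; the busier port dominates.
    spad_latency = max(read_actions / f, write_actions / f)

    print(glb_latency, spad_latency)  # 1.875e-06 s vs 5e-05 s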
examples/arches/fanout_variations/at_glb.yaml
@@ -0,0 +1,31 @@
+ arch:
+   nodes:
+   - !Memory
+     name: MainMemory
+     size: inf
+     leak_power: 0
+     area: 0
+     tensors: {keep: ~Intermediates, may_keep: All}
+     actions:
+     - {name: read, energy: 1, latency: 0}
+     - {name: write, energy: 1, latency: 0}
+
+   - !Memory
+     name: GlobalBuffer
+     size: inf # 100e6
+     leak_power: 0
+     area: 0
+     tensors: {keep: All}
+     spatial:
+     - name: X
+       fanout: 4
+     actions:
+     - {name: read, energy: 0, latency: 0}
+     - {name: write, energy: 0, latency: 0}
+
+   - !Compute
+     name: MAC
+     leak_power: 0
+     area: 0
+     actions:
+     - {name: compute, energy: 0, latency: 1}
examples/arches/fanout_variations/at_glb_with_fanout_node.yaml
@@ -0,0 +1,34 @@
+ arch:
+   nodes:
+   - !Memory
+     name: MainMemory
+     size: inf
+     leak_power: 0
+     area: 0
+     tensors: {keep: ~Intermediates, may_keep: All}
+     actions:
+     - {name: read, energy: 1, latency: 0}
+     - {name: write, energy: 1, latency: 0}
+
+   - !Fanout
+     name: GlobalBufferArray
+     spatial:
+     - name: X
+       fanout: 4
+
+   - !Memory
+     name: GlobalBuffer
+     size: inf # 100e6
+     leak_power: 0
+     area: 0
+     tensors: {keep: All}
+     actions:
+     - {name: read, energy: 0, latency: 0}
+     - {name: write, energy: 0, latency: 0}
+
+   - !Compute
+     name: MAC
+     leak_power: 0
+     area: 0
+     actions:
+     - {name: compute, energy: 0, latency: 1}
examples/arches/fanout_variations/at_mac.yaml
@@ -0,0 +1,31 @@
+ arch:
+   nodes:
+   - !Memory
+     name: MainMemory
+     size: inf
+     leak_power: 0
+     area: 0
+     tensors: {keep: ~Intermediates, may_keep: All}
+     actions:
+     - {name: read, energy: 1, latency: 0}
+     - {name: write, energy: 1, latency: 0}
+
+   - !Memory
+     name: GlobalBuffer
+     size: inf # 100e6
+     leak_power: 0
+     area: 0
+     tensors: {keep: All}
+     actions:
+     - {name: read, energy: 0, latency: 0}
+     - {name: write, energy: 0, latency: 0}
+
+   - !Compute
+     name: MAC
+     leak_power: 0
+     area: 0
+     spatial:
+     - name: X
+       fanout: 4
+     actions:
+     - {name: compute, energy: 0, latency: 1}
examples/arches/fanout_variations/at_mac_with_constraints.yaml
@@ -0,0 +1,38 @@
+ arch:
+   nodes:
+   - !Memory
+     name: MainMemory
+     size: inf
+     leak_power: 0
+     area: 0
+     tensors: {keep: ~Intermediates, may_keep: All}
+     actions:
+     - {name: read, energy: 1, latency: 0}
+     - {name: write, energy: 1, latency: 0}
+
+   - !Memory
+     name: GlobalBuffer
+     size: inf # 100e6
+     leak_power: 0
+     area: 0
+     tensors: {keep: All}
+     actions:
+     - {name: read, energy: 0, latency: 0}
+     - {name: write, energy: 0, latency: 0}
+
+   - !Fanout
+     name: MACArray
+     spatial:
+     - name: X
+       fanout: 4
+       loop_bounds:
+       - expression: ~m
+         operator: ==
+         value: 1
+
+   - !Compute
+     name: MAC
+     leak_power: 0
+     area: 0
+     actions:
+     - {name: compute, energy: 0, latency: 1}
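
Editor's note: the loop_bounds entry above constrains the mapspace rather than the hardware. As written, it appears to require the product of spatial loop bounds over rank m at this fanout to equal 1, i.e. no parallelism over m. The sketch below shows one way such a record could be evaluated; reading `~m` as "product of bounds over rank m" is our assumption, and the checker is illustrative, not accelforge's API.

    import operator
    from math import prod

    # Hypothetical constraint record mirroring the YAML above.
    constraint = {"expression": "~m", "operator": "==", "value": 1}

    OPS = {"==": operator.eq, "<=": operator.le, ">=": operator.ge}

    def satisfied(constraint, m_loop_bounds):
        # Assumed semantics: "~m" evaluates to the product of the spatial loop
        # bounds over rank m at this fanout node.
        return OPS[constraint["operator"]](prod(m_loop_bounds), constraint["value"])

    print(satisfied(constraint, [1]))     # True: m is not parallelized
    print(satisfied(constraint, [2, 2]))  # False: m is spread across the fanout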
examples/arches/fanout_variations/at_mac_with_fanout_node.yaml
@@ -0,0 +1,34 @@
+ arch:
+   nodes:
+   - !Memory
+     name: MainMemory
+     size: inf
+     leak_power: 0
+     area: 0
+     tensors: {keep: ~Intermediates, may_keep: All}
+     actions:
+     - {name: read, energy: 1, latency: 0}
+     - {name: write, energy: 1, latency: 0}
+
+   - !Memory
+     name: GlobalBuffer
+     size: inf # 100e6
+     leak_power: 0
+     area: 0
+     tensors: {keep: All}
+     actions:
+     - {name: read, energy: 0, latency: 0}
+     - {name: write, energy: 0, latency: 0}
+
+   - !Fanout
+     name: MACArray
+     spatial:
+     - name: X
+       fanout: 4
+
+   - !Compute
+     name: MAC
+     leak_power: 0
+     area: 0
+     actions:
+     - {name: compute, energy: 0, latency: 1}
examples/arches/nvdla.yaml
@@ -0,0 +1,47 @@
+ arch:
+   nodes:
+   - !Memory
+     name: MainMemory
+     size: inf
+     leak_power: 0
+     actions:
+     # Energy is the upper end of the range from the TPU paper. The lower end came
+     # from their reference, and they said it left out some things. Latency models
+     # 38.4 GB/s DDR5-4800 bandwidth. The chip runs at 1GHz, so divide to get
+     # per-cycle bandwidth.
+     # https://www.jedec.org/news/pressreleases/jedec-updates-standard-low-power-memory-devices-lpddr5
+     - {name: read, energy: 7.03e-12, latency: 1 / (8 * 38.4e9)}
+     - {name: write, energy: 7.03e-12, latency: 1 / (8 * 38.4e9)}
+     tensors: {keep: ~Intermediates, may_keep: All}
+
+   - !Memory
+     name: GlobalBuffer
+     size: 1024*64*8 # 64 kB
+     total_latency: max(read_latency, write_latency) # Separate ports
+     leak_power: 0
+     actions:
+     # 512 GB/s read, 128 GB/s write
+     - {name: read, energy: 0.249e-12, latency: 1 / 512e9 / 8}
+     - {name: write, energy: 0.293e-12, latency: 1 / 128e9 / 8}
+     tensors: {keep: All}
+
+   - !Fanout
+     name: ArrayFanout
+     spatial:
+     - {name: reuse_input, fanout: 32, may_reuse: input, reuse: input, min_usage: 1}
+     - {name: reuse_output, fanout: 192, may_reuse: output, reuse: output, min_usage: 1}
+
+   - !Memory
+     name: Register
+     size: weight.bits_per_value
+     area: 0
+     leak_power: 0
+     actions:
+     - {name: read, energy: 0, latency: 0}
+     - {name: write, energy: 0, latency: 0}
+     tensors: {keep: weight}
+
+   - !Compute
+     name: MAC
+     leak_power: 0
+     actions:
+     - {name: compute, energy: 0.084e-12, latency: 1 / 1e9}
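
Editor's note: MainMemory's latencies here are per-bit values derived from bandwidth, which is why they look so small. A quick check of the arithmetic, assuming the 8-bit values used in the example workloads:

    # Per-bit latency from 38.4 GB/s = 8 * 38.4e9 bits/s.
    latency_per_bit = 1 / (8 * 38.4e9)        # ~3.26e-12 s/bit
    bits_per_value = 8                        # assumption: 8b values as in the example workloads
    print(bits_per_value * latency_per_bit)   # ~2.6e-11 s to stream one value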
examples/arches/simple.yaml
@@ -0,0 +1,28 @@
+ arch:
+   nodes:
+   - !Memory
+     name: MainMemory
+     size: inf
+     leak_power: 0
+     area: 0
+     tensors: {keep: ~Intermediates, may_keep: All}
+     actions:
+     - {name: read, energy: 1, latency: 0}
+     - {name: write, energy: 1, latency: 0}
+
+   - !Memory
+     name: GlobalBuffer
+     size: inf # 100e6
+     leak_power: 0
+     area: 0
+     tensors: {keep: All}
+     actions:
+     - {name: read, energy: 0, latency: 0}
+     - {name: write, energy: 0, latency: 0}
+
+   - !Compute
+     name: MAC
+     leak_power: 0
+     area: 0
+     actions:
+     - {name: compute, energy: 0, latency: 1}
examples/arches/tpu_v4i.yaml
@@ -0,0 +1,67 @@
+ arch:
+   nodes:
+   - !Memory
+     name: MainMemory
+     size: inf
+     leak_power: 0
+     area: 0 # Don't include off-chip DRAM area
+     actions:
+     # Upper end of the range from the TPU paper. The lower end came from their
+     # reference, and they said it left out some things.
+     - {name: read, energy: 7.03e-12, latency: 1 / (8 * 614e9)}
+     - {name: write, energy: 7.03e-12, latency: 1 / (8 * 614e9)}
+     tensors: {keep: ~Intermediates, may_keep: All}
+
+   - !Memory
+     name: GlobalBuffer
+     size: 1024*1024*128*8 # 128 MB
+     total_latency: max(read_latency, write_latency) # Separate ports
+     leak_power: 0
+     area: 112e-6 # 112 mm^2
+     actions:
+     - {name: read, energy: 1.88e-12, latency: 1 / (8 * 2048e9)}
+     - {name: write, energy: 2.36e-12, latency: 1 / (8 * 1024e9)}
+     tensors: {keep: ~MainMemory.tensors, may_keep: All}
+
+   - !Memory
+     name: LocalBuffer
+     spatial: [{name: Z, fanout: 4, may_reuse: Nothing, min_usage: 1}]
+     size: 1024*1024*4*8 # 4 MB
+     leak_power: 0
+     area: 50e-6 # 50 mm^2. Very rough estimate based on die photo.
+     actions:
+     - {name: read, energy: 0.249e-12, latency: 0}
+     - {name: write, energy: 0.293e-12, latency: 0}
+     tensors: {keep: input | output}
+
+   - !Compute
+     name: ScalarUnit
+     area: 10e-6 # 10 mm^2. Very rough estimate based on die photo.
+     leak_power: 0
+     actions:
+     - {name: compute, energy: 0, latency: 1 / 1.05e9 / 128}
+     enabled: len(All) == 2
+
+   - !Fanout
+     name: ArrayFanout
+     spatial:
+     - {name: reuse_input, fanout: 128, may_reuse: input, reuse: input, min_usage: 1}
+     - {name: reuse_output, fanout: 128, may_reuse: output, reuse: output, min_usage: 1}
+
+   - !Memory
+     name: Register
+     size: weight.bits_per_value if weight else 0
+     area: 1e-11 # 10 um^2. Very rough estimate based on die photo.
+     leak_power: 0
+     actions:
+     - {name: read, energy: 0, latency: 0}
+     - {name: write, energy: 0, latency: 0}
+     tensors: {keep: weight}
+
+   - !Compute
+     name: MAC
+     leak_power: 0
+     area: 9e-11 # 90 um^2. Very rough estimate based on die photo.
+     actions:
+     - {name: compute, energy: 0.084e-12, latency: 1 / 1.05e9}
+     enabled: len(All) == 3
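
Editor's note: as in the other architecture files, sizes are in bits and action latencies are per bit; total_latency: max(read_latency, write_latency) then models separate read and write ports. A short sketch checking the GlobalBuffer numbers above, with hypothetical traffic:

    # GlobalBuffer size: 128 MB expressed in bits.
    size_bits = 1024 * 1024 * 128 * 8
    assert size_bits == 128 * 2**20 * 8

    # Per-bit latencies from the 2048 GB/s read and 1024 GB/s write bandwidths.
    read_latency_per_bit = 1 / (8 * 2048e9)
    write_latency_per_bit = 1 / (8 * 1024e9)

    def total_latency(read_bits, write_bits):
        # Separate ports: the slower of the two streams sets the latency.
        return max(read_bits * read_latency_per_bit, write_bits * write_latency_per_bit)

    print(total_latency(1e9, 1e9))  # hypothetical traffic: 1 Gbit each way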