trme 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {trme-0.1.0 → trme-0.2.0}/Cargo.toml +1 -1
- {trme-0.1.0 → trme-0.2.0}/Dockerfile +0 -1
- {trme-0.1.0 → trme-0.2.0}/PKG-INFO +1 -1
- trme-0.2.0/build.rs +40 -0
- {trme-0.1.0 → trme-0.2.0}/pyproject.toml +1 -1
- {trme-0.1.0 → trme-0.2.0}/src/lib.rs +1 -1
- trme-0.2.0/trme/__init__.py +67 -0
- trme-0.1.0/build.rs +0 -40
- trme-0.1.0/trme_torch.py +0 -95
- {trme-0.1.0 → trme-0.2.0}/Cargo.lock +0 -0
- {trme-0.1.0 → trme-0.2.0}/Makefile +0 -0
- {trme-0.1.0 → trme-0.2.0}/README.md +0 -0
- {trme-0.1.0 → trme-0.2.0}/RESEARCH_REPORT.md +0 -0
- {trme-0.1.0 → trme-0.2.0}/RESEARCH_REPORT_FINAL.md +0 -0
- {trme-0.1.0 → trme-0.2.0}/RESEARCH_REPORT_V4.md +0 -0
- {trme-0.1.0 → trme-0.2.0}/TRME_ISA.md +0 -0
- {trme-0.1.0 → trme-0.2.0}/benchmark_rsr.py +0 -0
- {trme-0.1.0 → trme-0.2.0}/cosim_verilator.cpp +0 -0
- {trme-0.1.0 → trme-0.2.0}/energy_estimator.py +0 -0
- {trme-0.1.0 → trme-0.2.0}/fmm_core.py +0 -0
- {trme-0.1.0 → trme-0.2.0}/fmm_cpp_binding.py +0 -0
- {trme-0.1.0 → trme-0.2.0}/fmm_octree.cpp +0 -0
- {trme-0.1.0 → trme-0.2.0}/optical_fdtd.py +0 -0
- {trme-0.1.0 → trme-0.2.0}/optical_noise.py +0 -0
- {trme-0.1.0 → trme-0.2.0}/quantize.cpp +0 -0
- {trme-0.1.0 → trme-0.2.0}/rsr_fused.cpp +0 -0
- {trme-0.1.0 → trme-0.2.0}/rsr_gemm.cpp +0 -0
- {trme-0.1.0 → trme-0.2.0}/rtl/axi_stream_wrapper.v +0 -0
- {trme-0.1.0 → trme-0.2.0}/rtl/clifford_alu.v +0 -0
- {trme-0.1.0 → trme-0.2.0}/rtl/hbm_axi_stub.v +0 -0
- {trme-0.1.0 → trme-0.2.0}/rtl/lns_adder.v +0 -0
- {trme-0.1.0 → trme-0.2.0}/rtl/rns_core.v +0 -0
- {trme-0.1.0 → trme-0.2.0}/rtl/rns_pipeline.v +0 -0
- {trme-0.1.0 → trme-0.2.0}/rtl/rsr_unit.v +0 -0
- {trme-0.1.0 → trme-0.2.0}/rtl/systolic_array_4x4.v +0 -0
- {trme-0.1.0 → trme-0.2.0}/rtl/systolic_array_NxN.v +0 -0
- {trme-0.1.0 → trme-0.2.0}/rtl/tb_lns_adder.v +0 -0
- {trme-0.1.0 → trme-0.2.0}/rtl/tb_systolic.v +0 -0
- {trme-0.1.0 → trme-0.2.0}/rtl/trme_top.v +0 -0
- {trme-0.1.0 → trme-0.2.0}/rtl/tropical_alu.v +0 -0
- {trme-0.1.0 → trme-0.2.0}/test_compiler.py +0 -0
- {trme-0.1.0 → trme-0.2.0}/test_torch_integration.py +0 -0
- {trme-0.1.0 → trme-0.2.0}/triton_rsr.py +0 -0
- {trme-0.1.0 → trme-0.2.0}/trme_autotune.py +0 -0
- {trme-0.1.0 → trme-0.2.0}/trme_compiler.py +0 -0
- {trme-0.1.0 → trme-0.2.0}/trme_cosim.py +0 -0
- {trme-0.1.0 → trme-0.2.0}/trme_sim.py +0 -0
- {trme-0.1.0 → trme-0.2.0}/verilator/sim_main.cpp +0 -0
trme-0.2.0/build.rs
ADDED
@@ -0,0 +1,40 @@
+fn main() {
+    // 1. Instantiate the C++ compiler
+    let mut build = cc::Build::new();
+
+    // 2. Add the C++ source file
+    // MAKE SURE that "rsr_gemm.cpp" sits in the same directory as Cargo.toml
+    build.file("rsr_gemm.cpp");
+
+    // Mark the source as C++
+    build.cpp(true);
+
+    // 3. Configure flags based on the target operating system
+    let target_os = std::env::var("CARGO_CFG_TARGET_OS").unwrap_or_default();
+
+    if target_os == "windows" {
+        // --- WINDOWS (MSVC) configuration ---
+        build.flag("/O2");   // Maximum optimization
+        build.flag("/EHsc"); // Standard exception handling
+        // build.flag("/openmp"); // Uncomment if your C++ uses OpenMP (#include <omp.h>)
+    } else {
+        // --- LINUX/MAC configuration ---
+        build.flag("-O3");
+        build.flag("-march=native");
+        build.flag("-fopenmp");
+
+        // Only link these libs when NOT targeting Windows
+        println!("cargo:rustc-link-lib=gomp");
+        println!("cargo:rustc-link-lib=stdc++");
+    }
+
+    // 4. Compile the static library "trme_core"
+    // This produces the .lib file the linker previously complained was missing
+    build.compile("trme_core");
+
+    // 5. Tell Cargo to rebuild whenever the C++ changes
+    println!("cargo:rerun-if-changed=rsr_gemm.cpp");
+
+    // Tell the linker where to look for the lib (the current build directory)
+    println!("cargo:rustc-link-search=native=.");
+}
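As a quick sanity check that the native chain built by this script (rsr_gemm.cpp → static lib trme_core → the Rust cdylib from src/lib.rs) links correctly, one can probe the compiled extension from Python. A minimal sketch, assuming the extension is built under the _trme_backend name used by trme/__init__.py below:

# Hypothetical smoke test; not part of the package.
import _trme_backend

# trme/__init__.py calls rsr_gemm(a_bytes, b_bytes, N, K, M, block_size)
assert hasattr(_trme_backend, "rsr_gemm"), "backend did not export rsr_gemm"
print("rsr_gemm symbol found")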
trme-0.2.0/trme/__init__.py
ADDED
@@ -0,0 +1,67 @@
+import torch
+import numpy as np
+import _trme_backend
+
+class TRMEMatmul(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, input, weight, bias=None, block_size=4):
+        """
+        input:  (N, K)
+        weight: (M, K)
+        bias:   (M,) optional
+        """
+        if isinstance(bias, int):
+            block_size = bias
+            bias = None
+
+        # Quantization (A -> [0..3], B -> int8)
+        # Note: we do simple casting here for the prototype;
+        # ideally this would call the C++ quantization logic.
+        A_cpu = input.detach().cpu().numpy()
+        A_q = np.clip(np.abs(A_cpu), 0, 3).astype(np.int8)
+
+        B_cpu = weight.detach().cpu().numpy().T
+        B_q = np.ascontiguousarray(np.clip(B_cpu, -127, 127).astype(np.int8))
+
+        N, K = A_q.shape
+        _, M = B_q.shape
+
+        # Flatten for the Rust FFI
+        a_bytes = A_q.tobytes()
+        b_bytes = B_q.tobytes()
+
+        # Call the Rust backend
+        # rsr_gemm returns list[int] (flattened C)
+        c_flat = _trme_backend.rsr_gemm(a_bytes, b_bytes, N, K, M, int(block_size))
+
+        # Reshape and convert to a tensor
+        C_q = np.array(c_flat, dtype=np.int32).reshape(N, M)
+        output = torch.from_numpy(C_q.astype(np.float32))
+
+        if bias is not None:
+            output += bias.cpu()
+
+        ctx.save_for_backward(input, weight, bias)
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        input, weight, bias = ctx.saved_tensors
+        grad_input = grad_weight = grad_bias = None
+
+        if ctx.needs_input_grad[0]:
+            grad_input = grad_output.matmul(weight)
+        if ctx.needs_input_grad[1]:
+            grad_weight = grad_output.t().matmul(input)
+        if bias is not None and ctx.needs_input_grad[2]:
+            grad_bias = grad_output.sum(0)
+
+        return grad_input, grad_weight, grad_bias, None
+
+def matmul(input, weight):
+    # Basic matmul interface
+    return TRMEMatmul.apply(input, weight)
+
+def linear(input, weight, bias=None, block_size=4):
+    # nn.Linear-style interface
+    return TRMEMatmul.apply(input, weight, bias, block_size)
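The new 0.2.0 public interface reduces to the two module-level functions above. A minimal usage sketch, assuming the package installs as trme with the compiled _trme_backend extension on the import path:

# Hypothetical usage example; shapes follow the forward() docstring.
import torch
import trme  # trme/__init__.py exposes matmul() and linear()

x = torch.randn(8, 16)   # input:  (N, K)
w = torch.randn(32, 16)  # weight: (M, K)
b = torch.randn(32)      # bias:   (M,)

y = trme.matmul(x, w)                   # (N, M), via the quantized Rust kernel
z = trme.linear(x, w, b, block_size=4)  # nn.Linear-style call

Note that backward() ignores the forward quantization and returns ordinary full-precision linear-layer gradients, a straight-through-style estimator.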
trme-0.1.0/build.rs
DELETED
@@ -1,40 +0,0 @@
-fn main() {
-    // 1. Instantiate the C++ compiler
-    let mut build = cc::Build::new();
-
-    // 2. Add the C++ source file
-    // IMPORTANT: check that the file name in your directory is exactly this.
-    // Per your README it should be "rsr_gemm.cpp" or "rsr_fused.cpp".
-    // If you have both, add another line: build.file("rsr_fused.cpp");
-    build.file("rsr_gemm.cpp");
-
-    // Mark the source as C++ (not plain C)
-    build.cpp(true);
-
-    // 3. Configure OS-specific flags
-    let target_os = std::env::var("CARGO_CFG_TARGET_OS").unwrap_or_default();
-
-    if target_os == "windows" {
-        // --- WINDOWS (MSVC) configuration ---
-        build.flag("/O2");     // Maximum optimization
-        build.flag("/EHsc");   // Standard C++ exception handling
-        build.flag("/openmp"); // Visual Studio's native OpenMP
-        // On Windows there is no need to link stdc++ or gomp manually
-    } else {
-        // --- LINUX/MAC configuration ---
-        build.flag("-O3");
-        build.flag("-march=native");
-        build.flag("-fopenmp");
-
-        // On Linux we must ask for these libs to be linked explicitly at the end
-        println!("cargo:rustc-link-lib=gomp");
-        println!("cargo:rustc-link-lib=stdc++");
-    }
-
-    // 4. Compile the static library "trme_core.lib"
-    // This generates the file that was missing in the earlier error
-    build.compile("trme_core");
-
-    // 5. Instruct Cargo to recompile if the C++ changes
-    println!("cargo:rerun-if-changed=rsr_gemm.cpp");
-}
trme-0.1.0/trme_torch.py
DELETED
@@ -1,95 +0,0 @@
-import torch
-import ctypes
-import numpy as np
-import os
-
-# Load library
-_lib_path = os.path.abspath('./librsr.so')
-if os.path.exists(_lib_path):
-    _lib = ctypes.CDLL(_lib_path)
-
-    _lib.rsr_gemm_dispatch.argtypes = [
-        np.ctypeslib.ndpointer(dtype=np.int8, ndim=2, flags='C_CONTIGUOUS'),
-        np.ctypeslib.ndpointer(dtype=np.int8, ndim=2, flags='C_CONTIGUOUS'),
-        np.ctypeslib.ndpointer(dtype=np.int32, ndim=2, flags='C_CONTIGUOUS'),
-        ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int
-    ]
-else:
-    print("Warning: librsr.so not found. TRME Ops will fail.")
-    _lib = None
-
-class TRMEMatmul(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, input, weight, bias=None, block_size=4):
-        """
-        input:  (N, K)
-        weight: (M, K)
-        bias:   (M,) optional
-        """
-        # Check if block_size was passed as a positional arg
-        if isinstance(bias, int):
-            block_size = bias
-            bias = None
-
-        # Quantization
-        A_cpu = input.detach().cpu().numpy()
-        A_q = np.clip(np.abs(A_cpu), 0, 3).astype(np.int8)
-
-        B_cpu = weight.detach().cpu().numpy().T
-        B_q = np.ascontiguousarray(np.clip(B_cpu, -127, 127).astype(np.int8))
-
-        N, K = A_q.shape
-        _, M = B_q.shape
-
-        C_q = np.zeros((N, M), dtype=np.int32)
-
-        # Handle bias (quantized int32)
-        bias_ptr = None
-        if bias is not None:
-            bias_q = bias.detach().cpu().numpy().astype(np.int32)
-            # Check contiguity
-            if not bias_q.flags['C_CONTIGUOUS']:
-                bias_q = np.ascontiguousarray(bias_q)
-            bias_ptr = bias_q.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
-        else:
-            bias_ptr = ctypes.POINTER(ctypes.c_int32)()
-
-        # Call kernel
-        if _lib:
-            # Update the signature dynamically if needed
-            # (for V5 we enable the real call)
-            if not hasattr(_lib, 'rsr_gemm_dispatch_configured'):
-                _lib.rsr_gemm_dispatch.argtypes = [
-                    np.ctypeslib.ndpointer(dtype=np.int8, ndim=2, flags='C_CONTIGUOUS'),
-                    np.ctypeslib.ndpointer(dtype=np.int8, ndim=2, flags='C_CONTIGUOUS'),
-                    np.ctypeslib.ndpointer(dtype=np.int32, ndim=2, flags='C_CONTIGUOUS'),
-                    ctypes.POINTER(ctypes.c_int32),
-                    ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int
-                ]
-                _lib.rsr_gemm_dispatch_configured = True
-
-            _lib.rsr_gemm_dispatch(A_q, B_q, C_q, bias_ptr, N, K, M, int(block_size))
-
-        output = torch.from_numpy(C_q.astype(np.float32))
-        # Note: bias is added in C++ now; no need to add it in Python unless falling back
-
-        ctx.save_for_backward(input, weight, bias)
-        return output
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        # Backward pass is a standard GEMM (simulated here)
-        input, weight, bias = ctx.saved_tensors
-        grad_input = grad_weight = grad_bias = None
-
-        if ctx.needs_input_grad[0]:
-            grad_input = grad_output.matmul(weight)
-        if ctx.needs_input_grad[1]:
-            grad_weight = grad_output.t().matmul(input)
-        if bias is not None and ctx.needs_input_grad[2]:
-            grad_bias = grad_output.sum(0)
-
-        return grad_input, grad_weight, grad_bias, None
-
-def matmul(input, weight, block_size=4):
-    return TRMEMatmul.apply(input, weight, block_size)
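Both the removed 0.1.0 wrapper and the new trme/__init__.py quantize operands identically, so a pure-NumPy reference is useful when comparing outputs of the two backends. A sketch under a stated assumption: that rsr_gemm reduces to a plain integer GEMM on the quantized operands (the wrappers' reshape to (N, M) suggests this, but the kernel internals in rsr_gemm.cpp are not shown in this diff):

# Reference stand-in; the real kernel semantics live in rsr_gemm.cpp.
import numpy as np

def rsr_gemm_reference(A, B):
    # Mirror the wrappers' quantization: activations -> [0..3], weights -> int8.
    A_q = np.clip(np.abs(A), 0, 3).astype(np.int8)      # (N, K)
    B_q = np.clip(B.T, -127, 127).astype(np.int8)       # (K, M), B given as (M, K)
    # Assumed kernel: ordinary GEMM in an int32 accumulator.
    return A_q.astype(np.int32) @ B_q.astype(np.int32)  # (N, M)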