PyPI - foreblocks - Versions diffs - 0.1.0__py3-none-any.whl - Mend

foreblocks 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (371) hide show

examples/rodrigo.py ADDED Viewed

@@ -0,0 +1,351 @@
+import os
+import pickle
+import sys
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+import torch.nn as nn
+# Get the current working directory of the notebook
+notebook_dir = os.getcwd()
+# Make that directory importable. NOTE: joining with "." resolves to
+# notebook_dir itself, not its parent, despite the variable name.
+parent_dir = os.path.abspath(os.path.join(notebook_dir, "."))
+# Only append when missing so re-running the cell does not duplicate entries.
+if parent_dir not in sys.path:
+    sys.path.append(parent_dir)
+from foreblocks.att import AttentionLayer
+from foreblocks.blocks import GRU
+from foreblocks.blocks.fourier import FNO1DLayer, FourierFeatures
+from foreblocks.blocks.graph import LatentGraphNetwork
+from foreblocks.tf.embeddings import LearnablePositionalEncoding
+from foreblocks.tf.transformer import TransformerDecoder, TransformerEncoder
+from torch.jit import script
+from torch.utils.data import DataLoader, TensorDataset
+from foreblocks import ForecastingModel, LSTMDecoder, LSTMEncoder, Trainer
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: hydrogen
+#       format_version: '1.3'
+#       jupytext_version: 1.17.1
+#   kernelspec:
+#     display_name: .venv
+#     language: python
+#     name: python3
+# ---
+# %%
+total_epochs = 500
+# Teacher-forcing schedule used for scheduled sampling.
+def scheduled_sampling_fn(epoch):
+    """Return the teacher-forcing ratio for ``epoch``.
+    The ratio starts at 0.8, drops by ``1 / total_epochs`` per epoch and is
+    clamped at 0.0 once the line crosses zero.
+    """
+    decayed = 0.8 - (epoch / total_epochs)
+    return decayed if decayed > 0.0 else 0.0
+# Get the current working directory of the notebook
+notebook_dir = os.getcwd()
+# Add the parent directory to sys.path (".." is resolved against the cwd,
+# so the result depends on where the script is launched from).
+parent_dir = os.path.abspath(os.path.join(notebook_dir, ".."))
+# Only append when missing so re-running the cell does not duplicate entries.
+if parent_dir not in sys.path:
+    sys.path.append(parent_dir)
+# df = pd.read_csv('df_demmand_without_category_2025_05_13.csv')
+# %%
+# # read df_demmand_without_category_2025_05_13.csv
+# import pandas as pd
+# df = pd.read_csv('df_demmand_without_category_2025_05_13.csv')
+# from foreblocks import TimeSeriesPreprocessor
+# import numpy as np
+# import pandas as pd
+# import matplotlib.pyplot as plt
+# # Generate synthetic time series data
+# np.random.seed(42)
+# n_samples = 200
+# timestamps = df.date
+# # convert df to numpy
+# data = df.drop(columns=['date']).values
+# # Create preprocessor with various techniques enabled
+# preprocessor = TimeSeriesPreprocessor(
+#     normalize=True,
+#     differencing=False,
+#     detrend=True,
+#     apply_ewt=True,
+#     window_size=24,
+#     horizon=12,
+#     remove_outliers=True,
+#     outlier_threshold=2.5,
+#     outlier_method="iqr",
+#     impute_method="iterative",
+#     ewt_bands=5,
+#     trend_imf_idx=0,
+#     log_transform=False,
+#     filter_window=5,
+#     filter_polyorder=2,
+#     apply_filter=True,
+#     self_tune=True,
+#     apply_imputation=True,
+#     generate_time_features=False,
+# )
+# # Fit and transform the data
+# X, y, processed_data = preprocessor.fit_transform(data, time_stamps=timestamps)
+# # Visualize the results
+# plt.figure(figsize=(15, 10))
+# plt.subplot(3, 1, 1)
+# plt.title('Original Data with Outliers and Missing Values')
+# plt.plot(data)
+# plt.subplot(3, 1, 2)
+# plt.title('Processed Data')
+# print("Processed data shape:", processed_data.shape)
+# plt.plot(processed_data)
+# plt.subplot(3, 1, 3)
+# plt.title('EWT Components')
+# ewt_components = preprocessor.get_ewt_components()
+# if ewt_components:
+#     for i, imf in enumerate(ewt_components[0].T):
+#         plt.plot(imf, label=f'IMF {i}')
+#     plt.legend()
+# plt.tight_layout()
+# plt.show()
+# print(f"Input sequence shape: {X.shape}")
+# print(f"Target sequence shape: {y.shape}")
+# %%
+# # load the processed data
+# # save X and y to pickle
+# import pickle
+# with open('X.pkl', 'wb') as f:
+#     pickle.dump(X, f)
+# with open('y.pkl', 'wb') as f:
+#     pickle.dump(y, f)
+# %%
+# load X and y from pickle
+# Load the pre-computed arrays. The paths are relative to the current working
+# directory, so the script must be launched from the folder containing
+# "examples/".
+# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
+# only load .pkl files from a trusted source.
+with open("examples/X.pkl", "rb") as f:
+    X = pickle.load(f)
+with open("examples/y.pkl", "rb") as f:
+    y = pickle.load(f)
+with open("examples/time.pkl", "rb") as f:
+    time_feat = pickle.load(f)
+# %%
+# Parameters
+# X is indexed on axis 2 for the feature count — presumably laid out as
+# (samples, window, features); TODO confirm against the preprocessing step.
+input_size = X.shape[2]  # Number of features
+# NOTE(review): hidden_size, num_layers, seq_len and total_len are only
+# referenced by commented-out code in this script.
+hidden_size = 64
+num_layers = 2
+output_size = X.shape[2]  # Number of features
+target_len = 12
+seq_len = 24
+total_len = 300  # Total synthetic time series length
+# fourier_preprocessor = LatentCorrelationGraphLayer(
+#     #conv_type='sgconv',
+#     input_size=input_size,          # Input dimension
+#     output_size=hidden_size,        # Output dimension (same as hidden_size)
+# )
+# NOTE(review): `preprocessor` is constructed here, but the ForecastingModel
+# built later in this script receives `times_wrapper` as its input
+# preprocessor instead.
+preprocessor = LatentGraphNetwork(
+    input_size=input_size,
+    output_size=input_size,
+    hidden_size=input_size,
+    strategy="vanilla",
+    aggregation="mean",
+)
+# preprocessor = FourierFeatures(
+#     input_size=input_size,
+#     output_size=input_size,
+#     num_frequencies=8,
+# )
+# 1. Create encoder and decoder
+# encoder = LSTMEncoder(input_size, hidden_size, num_layers)
+# decoder = LSTMDecoder(output_size, hidden_size, output_size, num_layers)
+# Hyper-parameters read via model_params.get(...) when the transformer
+# encoder/decoder are built. Several values repeat the literals above
+# (hidden_size, seq_len, target_len); "total_len" here is 1000 while the
+# module-level total_len is 300.
+model_params = {
+    "input_processor_output_size": input_size,
+    "hidden_size": 64,
+    "nhead": 4,
+    "num_encoder_layers": 1,
+    "num_decoder_layers": 1,
+    "dropout": 0.1,
+    "dim_feedforward": 2048,
+    "seq_len": 24,
+    "target_len": 12,
+    "total_len": 1000,
+    "input_size": input_size,
+    "output_size": output_size,
+}
+from foreblocks.third_party.flash_softpick_attn import parallel_softpick_attn
+def warmup_softpick(device, d_model=512, n_heads=4, seq_len=16):
+    """Run one throw-away ``parallel_softpick_attn`` call on ``device``.
+    A random float16 tensor of shape ``(1, seq_len, n_heads, d_model // n_heads)``
+    is used as query, and independent clones of it as key and value. The
+    attention output is discarded; nothing is returned.
+    """
+    head_dim = d_model // n_heads
+    q = torch.randn(1, seq_len, n_heads, head_dim, device=device, dtype=torch.float16)
+    k, v = q.clone(), q.clone()
+    parallel_softpick_attn(q, k, v, head_first=False)
+from foreblocks.blocks.nha import NHA
+embedding_size = 12  # Size of the output embeddings
+# 1. Create the NHA input preprocessor
+# NOTE(review): nha_preprocessor is built but not passed to the
+# ForecastingModel constructed later in this script.
+nha_preprocessor = NHA(
+    input_dim=input_size,  # Input dimension
+    embedding_dim=embedding_size,  # Output embedding dimension
+    hidden_dim=12,  # Hidden dimension for processing
+    num_blocks=2,  # Number of hierarchical blocks
+    num_levels_per_block=3,  # Number of hierarchical levels per block
+    kernel_size=3,  # Kernel size for convolutions
+    attention_heads=4,  # Number of attention heads
+    dropout=0.1,  # Dropout probability
+)
+from foreblocks.blocks.famous import TimesBlock, TimesBlockPreprocessor
+# This wrapper is the object handed to ForecastingModel as input_preprocessor.
+times_wrapper = TimesBlockPreprocessor(d_model=input_size)
+# warmup_softpick(device=torch.device("cuda"))
+# Separate positional encodings for the encoder and decoder.
+# NOTE(review): the meaning of the 512 argument is not visible here —
+# presumably the model/embedding width; confirm against the class signature.
+pos_encoder = LearnablePositionalEncoding(512)
+pos_decoder = LearnablePositionalEncoding(512)
+# Every key looked up below exists in model_params, so the .get() fallback
+# values are never used.
+encoder = TransformerEncoder(
+    input_size=model_params.get("input_processor_output_size", 1),
+    nhead=model_params.get("nhead", 4),
+    num_layers=model_params.get("num_encoder_layers", 1),
+    dropout=model_params.get("dropout", 0.1),
+    dim_feedforward=model_params.get("dim_feedforward", 2048),
+    use_moe=True,
+    pos_encoder=pos_encoder,
+    att_type="autocor",
+)
+# Create transformer decoder
+decoder = TransformerDecoder(
+    input_size=model_params.get("input_processor_output_size", 1),
+    output_size=output_size,
+    nhead=model_params.get("nhead", 4),
+    num_layers=model_params.get("num_decoder_layers", 1),
+    dropout=model_params.get("dropout", 0.1),
+    dim_feedforward=model_params.get("dim_feedforward", 2048),
+    informer_like=True,
+    use_moe=True,
+    # att_type="prob_sparse",
+    pos_encoder=pos_decoder,
+)
+# from foreblocks.blocks.mamba import MambaDecoder, MambaEncoder
+# encoder = MambaEncoder(
+#     input_size=input_size, hidden_size=hidden_size, num_layers=num_layers
+# )
+# decoder = MambaDecoder(
+#     input_size=output_size,
+#     hidden_size=hidden_size,
+#     num_layers=num_layers,
+#     output_size=output_size,
+# )
+# attention_module = AttentionLayer(
+#     method="mha",
+#     attention_backend="flash",
+#     encoder_hidden_size=hidden_size,
+#     decoder_hidden_size=hidden_size,
+#     nhead=16,
+# )
+total_epochs = 500
+# Teacher-forcing schedule handed to ForecastingModel.
+def scheduled_sampling_fn(epoch):
+    """Return the teacher-forcing ratio for ``epoch``.
+    Linear decay starting at 0.95 and decreasing by ``1 / total_epochs`` per
+    epoch; the result is clamped so it never drops below 0.0.
+    """
+    progress = epoch / total_epochs
+    return max(0.0, 0.95 - progress)
+# Output head applied through ForecastingModel's output_block argument.
+outprocessor = nn.Sequential(
+    GRU(input_size=output_size, hidden_size=32, output_size=output_size),
+)
+print("Using timewrapper")
+# NOTE(review): outnorm is only referenced by the commented-out
+# output_normalization argument below.
+outnorm = nn.LayerNorm(output_size)
+model = ForecastingModel(
+    encoder=encoder,
+    decoder=decoder,
+    target_len=target_len,
+    forecasting_strategy="seq2seq",
+    model_type="informer-like",
+    scheduled_sampling_fn=scheduled_sampling_fn,
+    output_size=output_size,
+    # attention_module=attention_module,
+    input_preprocessor=times_wrapper,
+    output_block=outprocessor,
+    # output_normalization=outnorm,
+    input_skip_connection=False,
+)
+# model = script(model)  # Convert to TorchScript for optimization
+trainer = Trainer(
+    model,
+    optimizer=torch.optim.Adam(model.parameters(), lr=0.001),
+    criterion=nn.MSELoss(),
+)
+# Split by position: first 80% of the samples for training, the remainder
+# for validation (no shuffling before the split).
+train_size = int(0.8 * len(X))
+X_train, Y_train = X[:train_size], y[:train_size]
+X_val, Y_val = X[train_size:], y[train_size:]
+X_train = torch.tensor(X_train, dtype=torch.float32)
+Y_train = torch.tensor(Y_train, dtype=torch.float32)
+Y_val = torch.tensor(Y_val, dtype=torch.float32)
+X_val = torch.tensor(X_val, dtype=torch.float32)
+# Time features are sliced for the training portion only; no validation
+# counterpart is created in this script.
+time_train = torch.tensor(time_feat[:train_size], dtype=torch.float32)
+# create DataLoader
+train_dataset = TensorDataset(X_train, Y_train, time_train)
+print(time_train.shape)
+train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
+# NOTE(review): epochs is a literal 500 here; it matches total_epochs used by
+# scheduled_sampling_fn but is not tied to it.
+data = trainer.train(train_loader, epochs=500)
+metrics = trainer.metrics(X_val, Y_val)
+# %%
+# Convert the full series to a tensor so it can be plotted alongside the
+# validation predictions, offset by the training length.
+X = torch.tensor(X, dtype=torch.float32)
+fig = trainer.plot_prediction(X_val, Y_val, full_series=X, offset=train_size)
+# %%

flash-attention/benchmarks/benchmark_alibi.py ADDED Viewed

@@ -0,0 +1,275 @@
+# Copyright (c) 2024, Sanghun Cho, Tri Dao.
+import pickle
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from flash_attn.layers.rotary import apply_rotary_emb
+from flash_attn.utils.benchmark import benchmark_all, benchmark_forward, benchmark_backward
+from flash_attn.utils.benchmark import benchmark_fwd_bwd, benchmark_combined
+from flash_attn import flash_attn_qkvpacked_func, flash_attn_func
+try:
+    import xformers.ops as xops
+except ImportError:
+    xops = None
+def generate_cos_sin(seqlen, rotary_dim, device, dtype):
+    """Build random cos/sin tables for the rotary-embedding benchmark.
+    Angles are drawn uniformly from [0, 2*pi) with shape
+    ``(seqlen * 2, rotary_dim // 2)``; their cosine and sine are returned as a
+    ``(cos, sin)`` pair cast to ``dtype``. ``rotary_dim`` must be even.
+    """
+    assert rotary_dim % 2 == 0
+    n_rows, n_cols = seqlen * 2, rotary_dim // 2
+    angle = torch.rand(n_rows, n_cols, device=device) * 2 * math.pi
+    return angle.cos().to(dtype=dtype), angle.sin().to(dtype=dtype)
+def flash_rotary(q, k, v, cos, sin, causal=False):
+    """Rotate q and k with ``apply_rotary_emb`` and run ``flash_attn_func``.
+    Both rotary calls use ``seqlen_offsets=0``, ``interleaved=False`` and
+    ``inplace=True``; ``v`` is passed to the attention kernel unchanged.
+    """
+    # corrected by @tridao comments
+    rotary_opts = dict(seqlen_offsets=0, interleaved=False, inplace=True)
+    q = apply_rotary_emb(q, cos, sin, **rotary_opts)
+    k = apply_rotary_emb(k, cos, sin, **rotary_opts)
+    return flash_attn_func(q, k, v, causal=causal)
+def attn_bias_from_alibi_slopes(
+    slopes, seqlen_q, seqlen_k, query_padding_mask=None, key_padding_mask=None, causal=False
+):
+    """Materialise an additive ALiBi attention bias from per-head slopes.
+    ``slopes`` must be 2-D ``(batch, nheads)``. In the causal case the bias is
+    ``slope * (j - (seqlen_k - 1))`` for key index ``j`` and broadcasts to
+    ``(batch, nheads, 1, seqlen_k)``; the padding masks are not consulted.
+    Otherwise the bias is ``-slope * |i + sk - sq - j|`` with shape
+    ``(batch, nheads, seqlen_q, seqlen_k)``, where ``sk``/``sq`` are the plain
+    lengths or, when a padding mask is given, its per-batch row sums.
+    """
+    _batch, _nheads = slopes.shape  # unpacking enforces a 2-D slopes tensor
+    device = slopes.device
+    slopes = rearrange(slopes, "b h -> b h 1 1")
+    if causal:
+        key_offsets = torch.arange(-seqlen_k + 1, 1, device=device, dtype=torch.float32)
+        return key_offsets * slopes
+    rows = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1")
+    cols = torch.arange(seqlen_k, device=device, dtype=torch.long)
+    if key_padding_mask is None:
+        sk = seqlen_k
+    else:
+        sk = rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1")
+    if query_padding_mask is None:
+        sq = seqlen_q
+    else:
+        sq = rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1")
+    relative_pos = torch.abs(rows + sk - sq - cols)
+    return -slopes * relative_pos.to(dtype=slopes.dtype)
+def flops(batch, seqlen, headdim, nheads, causal, mode="fwd"):
+    """Return the nominal attention FLOP count used for the TFLOPs/s figures.
+    The forward count is ``4 * batch * seqlen**2 * nheads * headdim``,
+    floor-halved when ``causal``. ``mode`` selects the multiplier: 1 for
+    "fwd", 2.5 for "bwd" and 3.5 for "fwd_bwd"; any other mode fails the
+    assertion.
+    """
+    assert mode in ["fwd", "bwd", "fwd_bwd"]
+    divisor = 2 if causal else 1
+    fwd_flops = 4 * batch * seqlen**2 * nheads * headdim // divisor
+    if mode == "fwd":
+        return fwd_flops
+    if mode == "bwd":
+        return 2.5 * fwd_flops
+    return 3.5 * fwd_flops
+def efficiency(flop, time):
+    """Convert a FLOP count and a duration into ``flop / time / 10**12``.
+    A NaN ``time`` (used by the benchmark loop for a skipped run) yields 0.0.
+    """
+    if math.isnan(time):
+        return 0.0
+    return flop / time / 10**12
+def attention_pytorch(q, k, v, dropout_p=0.0, causal=True, attn_bias=None):
+    """
+    Reference (unfused) scaled-dot-product attention used as the "torch" baseline.
+    Arguments:
+        q, k, v: (batch_size, seqlen, nheads, head_dim)
+        dropout_p: float
+        causal: bool, add a large negative value above the diagonal before softmax
+        attn_bias: (batch_size, nheads, seqlen, seqlen) or (1, nheads, seqlen, seqlen),
+            added to the scaled scores; None means no bias.
+            NOTE(review): the rearrange below folds the leading dim into the head
+            axis, so a leading dim of 1 appears to need expanding to batch_size by
+            the caller (the benchmark loop does this) — confirm.
+    Output:
+        output: (batch_size, seqlen, nheads, head_dim)
+    """
+    batch_size, seqlen, nheads, d = q.shape
+    q = rearrange(q, 'b t h d -> (b h) t d')
+    k = rearrange(k, 'b s h d -> (b h) d s')
+    softmax_scale = 1.0 / math.sqrt(d)
+    # Preallocate attn_weights for `baddbmm`
+    if attn_bias is not None:
+        scores = rearrange(attn_bias, 'b h t s -> (b h) t s')
+        # The bias is a real addend: scores = 1.0 * bias + scale * (q @ k)
+        beta = 1.0
+    else:
+        scores = torch.empty(batch_size * nheads, seqlen, seqlen, dtype=q.dtype, device=q.device)
+        # The buffer is uninitialised memory; beta=0 makes baddbmm ignore its
+        # contents instead of adding garbage (possibly NaN/inf) to the logits.
+        beta = 0.0
+    scores = rearrange(torch.baddbmm(scores, q, k, beta=beta, alpha=softmax_scale),
+                       '(b h) t s -> b h t s', h=nheads)
+    if causal:
+        # "triu_tril_cuda_template" not implemented for 'BFloat16'
+        # So we have to construct the mask in float
+        causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1)
+        # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess)
+        scores = scores + causal_mask.to(dtype=scores.dtype)
+    attention = torch.softmax(scores, dim=-1)
+    # F.dropout defaults to training=True, so dropout is active whenever dropout_p > 0.
+    attention_drop = F.dropout(attention, dropout_p)
+    output = torch.einsum('bhts,bshd->bthd', attention_drop , v)
+    return output.to(dtype=q.dtype)
+def time_fwd_bwd(func, *args, **kwargs):
+    """Benchmark ``func`` with ``benchmark_fwd_bwd`` and return two mean times.
+    All positional and keyword arguments are forwarded unchanged. The
+    ``.mean`` attribute of element 1 of the forward and backward results is
+    returned as ``(forward_mean, backward_mean)``.
+    """
+    fwd_result, bwd_result = benchmark_fwd_bwd(func, *args, **kwargs)
+    fwd_mean = fwd_result[1].mean
+    bwd_mean = bwd_result[1].mean
+    return fwd_mean, bwd_mean
+# ---- Benchmark configuration (read by the sweep loop below) ----
+# Forwarded as `repeats=` to every time_fwd_bwd call.
+repeats = 30
+device = 'cuda'
+dtype = torch.float16
+# (batch_size, seqlen) pairs; each pair multiplies to 16384 tokens.
+bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4096), (2, 8192), (1, 16384)]
+causal_vals = [False, True]
+headdim_vals = [64, 128]
+# Total model width; the loop derives nheads as dim // headdim.
+dim = 2048
+dropout_p = 0.0
+# Order in which results are printed and written to the CSV line.
+# "xformers" is only listed when the optional import succeeded.
+methods = (["fa2_alibi", "torch"]
+           + (["xformers"] if xops is not None else [])
+           + ["sdpa"]
+           + ["fa2_baseline"]
+           + ["fa2_rotary"])
+# Result tables keyed by (config, method), where config is
+# (causal, headdim, batch_size, seqlen): times first, then derived speeds.
+time_f = {}
+time_b = {}
+time_f_b = {}
+speed_f = {}
+speed_b = {}
+speed_f_b = {}
+# Sweep every (causal, headdim, batch_size, seqlen) configuration, time each
+# attention implementation's forward and backward pass, then print the derived
+# TFLOPs/s figures followed by one CSV line per configuration.
+for causal in causal_vals:
+    for headdim in headdim_vals:
+        for batch_size, seqlen in bs_seqlen_vals:
+            config = (causal, headdim, batch_size, seqlen)
+            nheads = dim // headdim
+            q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype,
+                                    requires_grad=True) for _ in range(3)]
+            # alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3
+            alibi_slopes = torch.rand(1, nheads, device=device, dtype=torch.float32) * 0.3
+            # Dense bias for the implementations without native ALiBi support,
+            # repeated along the batch dimension.
+            attn_bias = attn_bias_from_alibi_slopes(alibi_slopes, seqlen, seqlen, causal=causal).to(dtype)
+            attn_bias = repeat(attn_bias, "1 ... -> b ...", b=batch_size)
+            # --- FlashAttention-2 without ALiBi (baseline) ---
+            f, b = time_fwd_bwd(
+                flash_attn_func,
+                q, k, v,
+                dropout_p,
+                causal=causal,
+                # alibi_slopes=alibi_slopes,
+                alibi_slopes=None,
+                repeats=repeats,
+                verbose=False
+            )
+            time_f[config, "fa2_baseline"] = f
+            time_b[config, "fa2_baseline"] = b
+            # Fresh leaf tensors before each method so gradients do not
+            # accumulate across benchmarks.
+            q = q.detach().requires_grad_(True)
+            k = k.detach().requires_grad_(True)
+            v = v.detach().requires_grad_(True)
+            # --- FlashAttention-2 with ALiBi slopes passed as a 1-D (nheads,) tensor ---
+            f, b = time_fwd_bwd(
+                flash_attn_func,
+                q, k, v,
+                dropout_p,
+                causal=causal,
+                alibi_slopes=rearrange(alibi_slopes, "1 h -> h"),
+                # alibi_slopes=None,
+                repeats=repeats,
+                verbose=False
+            )
+            time_f[config, "fa2_alibi"] = f
+            time_b[config, "fa2_alibi"] = b
+            # --- Plain PyTorch reference; may run out of memory at long seqlen ---
+            try:
+                q = q.detach().requires_grad_(True)
+                k = k.detach().requires_grad_(True)
+                v = v.detach().requires_grad_(True)
+                f, b = time_fwd_bwd(
+                    attention_pytorch,
+                    q, k, v,
+                    dropout_p,
+                    causal=causal,
+                    attn_bias=attn_bias,
+                    repeats=repeats,
+                    verbose=False
+                )
+            # CUDA out-of-memory errors are RuntimeError subclasses. A bare
+            # `except:` would also swallow KeyboardInterrupt/SystemExit and make
+            # the sweep impossible to abort cleanly.
+            except RuntimeError:  # Skip if OOM
+                f, b = float('nan'), float('nan')
+            time_f[config, "torch"] = f
+            time_b[config, "torch"] = b
+            # --- torch scaled_dot_product_attention with the flash backend disabled ---
+            # F.sdpa doesn't currently (torch 2.1) dispatch to flash-attn but just to be safe
+            with torch.backends.cuda.sdp_kernel(enable_flash=False):
+                q_pt = q.detach().requires_grad_(True).transpose(1, 2)
+                k_pt = k.detach().requires_grad_(True).transpose(1, 2)
+                v_pt = v.detach().requires_grad_(True).transpose(1, 2)
+                f, b = time_fwd_bwd(
+                    F.scaled_dot_product_attention,
+                    q_pt, k_pt, v_pt,
+                    attn_mask=attn_bias,
+                    dropout_p=dropout_p,
+                    is_causal=causal,
+                    repeats=repeats,
+                    verbose=False
+                )
+                time_f[config, "sdpa"] = f
+                time_b[config, "sdpa"] = b
+            # --- xformers memory-efficient attention (only when installed) ---
+            if xops is not None:
+                q = q.detach().requires_grad_(True)
+                k = k.detach().requires_grad_(True)
+                v = v.detach().requires_grad_(True)
+                if causal:
+                    attn_bias_xops = xops.LowerTriangularMask().add_bias(attn_bias.expand(-1, -1, seqlen, -1).to(dtype=q.dtype))
+                    # NotImplementedError: No operator found for `memory_efficient_attention_backward` with inputs:
+                    # `flshattB@v2.3.6` is not supported because:
+                    #     attn_bias type is <class 'xformers.ops.fmha.attn_bias.LowerTriangularMaskWithTensorBias'>
+                    # `cutlassB` is not supported because:
+                    #     attn_bias type is <class 'xformers.ops.fmha.attn_bias.LowerTriangularMaskWithTensorBias'>
+                    attn_bias_xops = attn_bias_xops.materialize((batch_size, nheads, seqlen, seqlen), dtype=q.dtype, device=device)
+                else:
+                    attn_bias_xops = attn_bias.to(dtype=q.dtype)
+                f, b = time_fwd_bwd(
+                    xops.memory_efficient_attention,
+                    q, k, v,
+                    attn_bias_xops,
+                    dropout_p,
+                    repeats=repeats,
+                    verbose=False
+                )
+                time_f[config, "xformers"] = f
+                time_b[config, "xformers"] = b
+            # --- FlashAttention-2 with rotary embeddings applied to q and k ---
+            q = q.detach().requires_grad_(True)
+            k = k.detach().requires_grad_(True)
+            v = v.detach().requires_grad_(True)
+            cos, sin = generate_cos_sin(seqlen, headdim, device, dtype)
+            f, b = time_fwd_bwd(
+                flash_rotary,
+                q, k, v,
+                cos, sin,
+                causal,
+                repeats=repeats,
+                verbose=False
+            )
+            time_f[config, "fa2_rotary"] = f
+            time_b[config, "fa2_rotary"] = b
+            print(f"### causal={causal}, headdim={headdim}, batch_size={batch_size}, seqlen={seqlen} ###")
+            # Collect CSV fragments and join once instead of repeated `+=`.
+            csv_fields = [f"{causal},{headdim},{batch_size},{seqlen},"]
+            for method in methods:
+                time_f_b[config, method] = time_f[config, method] + time_b[config, method]
+                speed_f[config, method] = efficiency(
+                    flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd"),
+                    time_f[config, method]
+                )
+                speed_b[config, method] = efficiency(
+                    flops(batch_size, seqlen, headdim, nheads, causal, mode="bwd"),
+                    time_b[config, method]
+                )
+                speed_f_b[config, method] = efficiency(
+                    flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd_bwd"),
+                    time_f_b[config, method]
+                )
+                print(
+                    f"{method} fwd: {speed_f[config, method]:.2f} TFLOPs/s, "
+                    f"bwd: {speed_b[config, method]:.2f} TFLOPs/s, "
+                    f"fwd + bwd: {speed_f_b[config, method]:.2f} TFLOPs/s"
+                )
+                csv_fields.append(
+                    f"{speed_f[config, method]:.2f},{speed_b[config, method]:.2f},{speed_f_b[config, method]:.2f},"
+                )
+            csv_output = "".join(csv_fields)
+            print(csv_output)