psaiops-0.4.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of psaiops has been flagged.

@@ -0,0 +1,507 @@
+ import functools
+
+ import gradio
+ import numpy
+ import torch
+ import torch.cuda
+ import matplotlib.pyplot
+
+ import psaiops.common.model
+ import psaiops.common.tokenizer
+ import psaiops.score.residual.lib
+
+ # META #########################################################################
+
+ STYLE = '''.white-text span { color: white; }'''
+ TITLE = '''Visualization Of Residuals'''
+ INTRO = '''Plot the hidden states for a given prompt.\nUnder construction, only "openai/gpt-oss-20b" is available for now.'''
+
+ MODEL = 'openai/gpt-oss-20b'
+
+ # COLORS #######################################################################
+
+ def create_selection_cmap() -> dict:
+     return {
+         '0': '#000000',
+         '1': '#004444',
+         '2': '#444400',
+         '3': '#440044',}
+
+ def create_score_cmap() -> dict:
+     return {str(__i): '#{:02x}0000'.format(int(2.55 * __i)) for __i in range(101)}
+
+ # INTRO ########################################################################
+
+ def create_intro_block(intro: str) -> dict:
+     __intro = gradio.Markdown(intro, line_breaks=True)
+     return {'intro_block': __intro}
+
+ # MODEL ########################################################################
+
+ def create_model_block() -> dict:
+     __model = gradio.Dropdown(label='Model', value='openai/gpt-oss-20b', choices=['openai/gpt-oss-20b'], scale=1, allow_custom_value=False, multiselect=False, interactive=True) # 'openai/gpt-oss-120b'
+     return {'model_block': __model,}
+
+ # SAMPLING #####################################################################
+
+ def create_sampling_block() -> dict:
+     __tokens = gradio.Slider(label='Tokens', value=16, minimum=1, maximum=128, step=1, scale=1, interactive=True)
+     __topk = gradio.Slider(label='Top K', value=4, minimum=1, maximum=8, step=1, scale=1, interactive=True)
+     __topp = gradio.Slider(label='Top P', value=0.9, minimum=0.0, maximum=1.0, step=0.1, scale=1, interactive=True)
+     return {
+         'tokens_block': __tokens,
+         'topk_block': __topk,
+         'topp_block': __topp,}
+
+ # DATAVIZ ######################################################################
+
+ def create_visualization_block() -> dict:
+     __3d = gradio.Slider(label='3D', value=1, minimum=0, maximum=1, step=1, scale=1, interactive=True)
+     __points = gradio.Slider(label='Points', value=128, minimum=32, maximum=2880, step=32, scale=1, interactive=True)
+     return {
+         'axes_block': __3d,
+         'points_block': __points,}
+
+ # INPUTS #######################################################################
+
+ def create_inputs_block(label: str='Prompt') -> dict:
+     __input = gradio.Textbox(label=label, value='', placeholder='A string of tokens to score.', lines=4, scale=1, interactive=True)
+     return {'input_block': __input}
+
+ # PLOTS ########################################################################
+
+ def create_plot_block(label: str='Residuals', prefix: str='') -> dict:
+     __plot = gradio.Plot(label=label, scale=1)
+     return {prefix + 'plot_block': __plot,}
+
+ # OUTPUTS ######################################################################
+
+ def create_highlight_block(label: str='Output', prefix: str='', cmap: dict=create_selection_cmap()) -> dict:
+     __output = gradio.HighlightedText(label=label, value='', scale=1, interactive=False, show_legend=False, show_inline_category=False, combine_adjacent=False, color_map=cmap, elem_classes='white-text')
+     return {prefix + 'highlight_block': __output}
+
+ # SELECT #######################################################################
+
+ def create_token_selection_block(label: str='Token', prefix: str='') -> dict:
+     # __play = gradio.Button('>', variant='primary', size='lg', scale=1, interactive=True)
+     __position = gradio.Slider(label=label, value=-1, minimum=-1, maximum=15, step=1, scale=1, interactive=True) # info='-1 to average on all tokens'
+     return {prefix + 'position_block': __position,}
+
+ def create_layer_selection_block(label: str='Layer', prefix: str='') -> dict:
+     __layer = gradio.Slider(label=label, value=-1, minimum=-1, maximum=23, step=1, scale=1, interactive=True) # info='-1 to average on all layers'
+     return {prefix + 'layer_block': __layer,}
+
+ # ACTIONS ######################################################################
+
+ def create_actions_block() -> dict:
+     __process = gradio.Button('Process', variant='primary', size='lg', scale=1, interactive=True)
+     return {'process_block': __process,}
+
+ # STATE ########################################################################
+
+ def create_state() -> dict:
+     return {
+         'output_state': gradio.State(None),
+         'hidden_state': gradio.State(None),}
+
+ # LAYOUT #######################################################################
+
+ def create_layout(intro: str=INTRO) -> dict:
+     __fields = {}
+     __fields.update(create_intro_block(intro=intro))
+     with gradio.Tabs():
+         with gradio.Tab('Residuals') as __main_tab:
+             __fields.update({'main_tab': __main_tab})
+             with gradio.Row(equal_height=True):
+                 __fields.update(create_inputs_block())
+             with gradio.Row(equal_height=True):
+                 __fields.update(create_highlight_block())
+             with gradio.Row(equal_height=True):
+                 __fields.update(create_plot_block(label='Left', prefix='left_'))
+                 __fields.update(create_plot_block(label='Right', prefix='right_'))
+             with gradio.Row(equal_height=True):
+                 __fields.update(create_highlight_block(label='Score', prefix='left_', cmap=create_score_cmap()))
+                 __fields.update(create_highlight_block(label='Score', prefix='right_', cmap=create_score_cmap()))
+             with gradio.Row(equal_height=True):
+                 __fields.update(create_token_selection_block(label='Token', prefix='left_'))
+                 __fields.update(create_token_selection_block(label='Token', prefix='right_'))
+             with gradio.Row(equal_height=True):
+                 __fields.update(create_layer_selection_block(label='Layer', prefix='left_'))
+                 __fields.update(create_layer_selection_block(label='Layer', prefix='right_'))
+             with gradio.Row(equal_height=True):
+                 __fields.update(create_actions_block())
+         with gradio.Tab('Settings') as __settings_tab:
+             __fields.update({'settings_tab': __settings_tab})
+             with gradio.Row(equal_height=True):
+                 __fields.update(create_model_block())
+             with gradio.Row(equal_height=True):
+                 __fields.update(create_sampling_block())
+             with gradio.Row(equal_height=True):
+                 __fields.update(create_visualization_block())
+     return __fields
+
+ # EVENTS #######################################################################
+
+ def update_position_range(
+         current_val: float,
+         token_num: float,
+         output_data: torch.Tensor,
+ ) -> dict:
+     # take the generated tokens into account
+     __max = int(token_num) - 1 if (output_data is None) else int(output_data.shape[-1])
+     # keep the previous value if possible
+     __val = min(int(current_val), __max)
+     # return a gradio update dictionary
+     return gradio.update(maximum=__max, value=__val)
+
+ # GENERATE #####################################################################
+
+ def update_computation_state(
+         token_num: float,
+         topk_num: float,
+         topp_num: float,
+         prompt_str: str,
+         device_str: str,
+         model_obj: object,
+         tokenizer_obj: object,
+ ) -> tuple:
+     # sanitize the inputs
+     __token_num = max(1, min(128, int(token_num)))
+     __topk_num = max(1, min(8, int(topk_num)))
+     __topp_num = max(0.0, min(1.0, float(topp_num)))
+     __prompt_str = prompt_str.strip()
+     __device_str = device_str if (device_str in ['cpu', 'cuda']) else 'cpu'
+     # exit if some values are missing
+     if (not __prompt_str) or (model_obj is None) or (tokenizer_obj is None):
+         return (torch.empty(0), torch.empty(0))
+     # dictionary {'input_ids': _, 'attention_mask': _}
+     __input_data = psaiops.common.tokenizer.preprocess_token_ids(
+         tokenizer_obj=tokenizer_obj,
+         prompt_str=__prompt_str,
+         device_str=__device_str)
+     # tensor (1, T) and O * L * (1, I, H)
+     __output_data, __hidden_data = psaiops.score.residual.lib.generate_token_ids(
+         model_obj=model_obj,
+         input_ids=__input_data['input_ids'],
+         attention_mask=__input_data['attention_mask'],
+         token_num=__token_num,
+         topk_num=__topk_num,
+         topp_num=__topp_num)
+     # tensor (1, L, I + O, H)
+     __hidden_data = psaiops.score.residual.lib.merge_hidden_states(
+         hidden_data=__hidden_data)
+     # update each component => (highlight, plot) states
+     return (
+         __output_data.cpu().float(),
+         __hidden_data.cpu().float(),)
+
+ # HIGHLIGHT ####################################################################
+
+ def update_token_focus(
+         left_idx: float,
+         right_idx: float,
+         output_data: torch.Tensor,
+         tokenizer_obj: object,
+ ) -> list:
+     # exit if some values are missing
+     if (output_data is None) or (len(output_data) == 0):
+         return None
+     # detokenize the IDs
+     __token_str = psaiops.common.tokenizer.postprocess_token_ids(
+         tokenizer_obj=tokenizer_obj,
+         token_data=output_data)
+     # list of string classes
+     __token_cls = psaiops.score.residual.lib.postprocess_focus_cls(
+         left_idx=int(left_idx),
+         right_idx=int(right_idx),
+         token_dim=len(__token_str))
+     # pairs of token and class
+     return list(zip(__token_str, __token_cls))
+
+ # SCORES #######################################################################
+
+ def update_token_scores(
+         layer_idx: float,
+         output_data: torch.Tensor,
+         hidden_data: torch.Tensor,
+         tokenizer_obj: object,
+         model_obj: object,
+ ) -> list:
+     # exit if some values are missing
+     if (output_data is None) or (len(output_data) == 0) or (hidden_data is None) or (len(hidden_data) == 0):
+         return None
+     # parse the model meta
+     __device_str = model_obj.lm_head.weight.device
+     __dtype_obj = model_obj.lm_head.weight.dtype
+     # detokenize the IDs
+     __token_str = psaiops.common.tokenizer.postprocess_token_ids(
+         tokenizer_obj=tokenizer_obj,
+         token_data=output_data)
+     # select the relevant hidden states
+     __final_states = hidden_data[0, -1, :, :].to(device=__device_str, dtype=__dtype_obj)
+     __layer_states = hidden_data[0, int(layer_idx), :, :].to(device=__device_str, dtype=__dtype_obj)
+     # compute the logits
+     __final_logits = model_obj.lm_head(__final_states).detach().cpu() # already normalized
+     __layer_logits = model_obj.lm_head(model_obj.model.norm(__layer_states)).detach().cpu()
+     # compute the JSD metric
+     __token_jsd = psaiops.score.residual.lib.jsd_from_logits(final_logits=__final_logits, prefix_logits=__layer_logits)
+     # scale into a [0; 100] label
+     __token_cls = psaiops.score.residual.lib.postprocess_score_cls(score_data=__token_jsd)
+     # color each token according to the distance between the distribution at layer L and the final distribution
+     return list(zip(__token_str, __token_cls))
+
+ # PLOT #########################################################################
+
+ def update_2d_plot(
+         token_idx: float,
+         layer_idx: float,
+         hidden_data: torch.Tensor,
+ ) -> tuple:
+     # reduce the layer and token axes (B, L, T, E) => (B, E)
+     __plot_data = psaiops.score.residual.lib.reduce_hidden_states(
+         hidden_data=hidden_data,
+         layer_idx=int(layer_idx),
+         token_idx=int(token_idx),
+         axes_idx=(1, 2))
+     # rescale the data to [-1; 1] (B, E)
+     __plot_data = psaiops.score.residual.lib.rescale_hidden_states(
+         hidden_data=__plot_data)
+     # reshape into a 3D tensor by folding E (B, E) => (B, W, H)
+     __plot_data = psaiops.score.residual.lib.reshape_hidden_states(
+         hidden_data=__plot_data,
+         layer_idx=-1) # there is no layer axis
+     # map the [-1; 1] activations to RGBA colors
+     __plot_data = psaiops.score.residual.lib.color_hidden_states(
+         hidden_data=__plot_data.numpy())
+     # plot the first sample
+     __figure = matplotlib.pyplot.figure()
+     __axes = __figure.add_subplot(1, 1, 1)
+     __axes.imshow(__plot_data[0], vmin=0.0, vmax=1.0, cmap='viridis')
+     __figure.tight_layout()
+     # remove the figure from the pyplot registry so it can be garbage collected
+     matplotlib.pyplot.close(__figure)
+     # return the matplotlib figure to the gradio plot component
+     return __figure
+
+ def update_3d_plot(
+         token_idx: float,
+         layer_idx: float,
+         points_num: float,
+         hidden_data: torch.Tensor,
+ ) -> tuple:
+     # reduce the token axis (B, L, T, E) => (B, L, E)
+     __plot_data = psaiops.score.residual.lib.reduce_hidden_states(
+         hidden_data=hidden_data,
+         token_idx=int(token_idx),
+         layer_idx=int(layer_idx),
+         axes_idx=2)
+     # rescale the data to [-1; 1] (B, L, E)
+     __plot_data = psaiops.score.residual.lib.rescale_hidden_states(
+         hidden_data=__plot_data)
+     # mask the small activations to improve the plot readability
+     __mask_data = psaiops.score.residual.lib.mask_hidden_states(
+         hidden_data=__plot_data,
+         topk_num=int(points_num) if int(layer_idx) == -1 else 2 * int(points_num))
+     # reshape into a 3D tensor by folding E (B, L, E) => (B, W, H, L)
+     __plot_data = psaiops.score.residual.lib.reshape_hidden_states(
+         hidden_data=__plot_data,
+         layer_idx=1)
+     __mask_data = psaiops.score.residual.lib.reshape_hidden_states(
+         hidden_data=__mask_data,
+         layer_idx=1)
+     # convert to numpy ndarrays
+     __plot_data = __plot_data.numpy()
+     __mask_data = __mask_data.numpy()
+     # map the [-1; 1] activations to RGBA colors
+     __rgb_data = psaiops.score.residual.lib.color_hidden_states(
+         hidden_data=__plot_data)
+     # map the [-1; 1] activations to point areas
+     __area_data = psaiops.score.residual.lib.size_hidden_states(
+         hidden_data=__plot_data,
+         area_min=0.01,
+         area_max=16.0,
+         gamma_val=1.6)
+     # format the first sample for a scatter plot
+     __x, __y, __z = numpy.nonzero(__mask_data[0])
+     __c = __rgb_data[0, __x, __y, __z]
+     __s = __area_data[0, __x, __y, __z]
+     # plot the first sample
+     __figure = matplotlib.pyplot.figure()
+     __axes = __figure.add_subplot(1, 1, 1, projection='3d')
+     __axes.scatter(__x, __y, __z, c=__c, s=__s, marker='o', linewidths=0)
+     __figure.tight_layout()
+     # remove the figure from the pyplot registry so it can be garbage collected
+     matplotlib.pyplot.close(__figure)
+     # return the matplotlib figure to the gradio plot component
+     return __figure
+
+ def update_hidden_plot(
+         token_idx: float,
+         layer_idx: float,
+         axes_num: float,
+         points_num: float,
+         hidden_data: torch.Tensor,
+ ) -> tuple:
+     # exit if some values are missing
+     if (hidden_data is None) or (len(hidden_data) == 0):
+         return None
+     # plot the residuals of a given layer on a 2D heatmap
+     if not axes_num: # 0.0 or 0
+         return update_2d_plot(
+             token_idx=token_idx,
+             layer_idx=layer_idx,
+             hidden_data=hidden_data)
+     # by default, plot the residuals for all the layers in 3D
+     return update_3d_plot(
+         token_idx=token_idx,
+         layer_idx=layer_idx,
+         points_num=points_num,
+         hidden_data=hidden_data)
+
+ # APP ##########################################################################
+
+ def create_app(title: str=TITLE, intro: str=INTRO, style: str=STYLE, model: str=MODEL) -> gradio.Blocks:
+     __fields = {}
+     with gradio.Blocks(theme=gradio.themes.Soft(), title=title, css=style) as __app:
+         # load the model
+         __device = 'cuda' if torch.cuda.is_available() else 'cpu'
+         __model = psaiops.common.model.get_model(name=model, device=__device)
+         __tokenizer = psaiops.common.tokenizer.get_tokenizer(name=model, device=__device)
+         # adapt the event handlers
+         __compute = functools.partial(update_computation_state, model_obj=__model, tokenizer_obj=__tokenizer, device_str=__device)
+         __highlight = functools.partial(update_token_focus, tokenizer_obj=__tokenizer)
+         __score = functools.partial(update_token_scores, tokenizer_obj=__tokenizer, model_obj=__model)
+         # create the UI
+         __fields.update(create_layout(intro=intro))
+         # init the state
+         __fields.update(create_state())
+         # update the data after clicking process
+         __fields['process_block'].click(
+             fn=__compute,
+             inputs=[__fields[__k] for __k in ['tokens_block', 'topk_block', 'topp_block', 'input_block']],
+             outputs=[__fields[__k] for __k in ['output_state', 'hidden_state']],
+             queue=False,
+             show_progress='full'
+         ).then(
+             # update the range of the position sliders when the output changes
+             fn=update_position_range,
+             inputs=[__fields[__k] for __k in ['left_position_block', 'tokens_block', 'output_state']],
+             outputs=__fields['left_position_block'],
+             queue=False,
+             show_progress='hidden'
+         ).then(
+             fn=update_position_range,
+             inputs=[__fields[__k] for __k in ['right_position_block', 'tokens_block', 'output_state']],
+             outputs=__fields['right_position_block'],
+             queue=False,
+             show_progress='hidden'
+         ).then(
+             # update the token highlight when the output data changes
+             fn=__highlight,
+             inputs=[__fields[__k] for __k in ['left_position_block', 'right_position_block', 'output_state']],
+             outputs=__fields['highlight_block'],
+             queue=False,
+             show_progress='hidden'
+         ).then(
+             # update the left token scores when the output data changes
+             fn=__score,
+             inputs=[__fields[__k] for __k in ['left_layer_block', 'output_state', 'hidden_state']],
+             outputs=__fields['left_highlight_block'],
+             queue=False,
+             show_progress='hidden'
+         ).then(
+             # update the right token scores when the output data changes
+             fn=__score,
+             inputs=[__fields[__k] for __k in ['right_layer_block', 'output_state', 'hidden_state']],
+             outputs=__fields['right_highlight_block'],
+             queue=False,
+             show_progress='hidden'
+         ).then(
+             # update the plot when the hidden data changes
+             fn=update_hidden_plot,
+             inputs=[__fields[__k] for __k in ['left_position_block', 'left_layer_block', 'axes_block', 'points_block', 'hidden_state']],
+             outputs=__fields['left_plot_block'],
+             queue=False,
+             show_progress='hidden'
+         ).then(
+             fn=update_hidden_plot,
+             inputs=[__fields[__k] for __k in ['right_position_block', 'right_layer_block', 'axes_block', 'points_block', 'hidden_state']],
+             outputs=__fields['right_plot_block'],
+             queue=False,
+             show_progress='hidden')
+         # update the range of the position slider when the settings change
+         __fields['tokens_block'].change(
+             fn=update_position_range,
+             inputs=[__fields[__k] for __k in ['left_position_block', 'tokens_block', 'output_state']],
+             outputs=__fields['left_position_block'],
+             queue=False,
+             show_progress='hidden'
+         ).then(
+             fn=update_position_range,
+             inputs=[__fields[__k] for __k in ['right_position_block', 'tokens_block', 'output_state']],
+             outputs=__fields['right_position_block'],
+             queue=False,
+             show_progress='hidden')
+         # update the left plot when the focus changes
+         __fields['left_position_block'].change(
+             fn=update_hidden_plot,
+             inputs=[__fields[__k] for __k in ['left_position_block', 'left_layer_block', 'axes_block', 'points_block', 'hidden_state']],
+             outputs=__fields['left_plot_block'],
+             queue=False,
+             show_progress='hidden')
+         __fields['left_layer_block'].change(
+             fn=update_hidden_plot,
+             inputs=[__fields[__k] for __k in ['left_position_block', 'left_layer_block', 'axes_block', 'points_block', 'hidden_state']],
+             outputs=__fields['left_plot_block'],
+             queue=False,
+             show_progress='hidden'
+         ).then(
+             # update the left token scores when the focus changes
+             fn=__score,
+             inputs=[__fields[__k] for __k in ['left_layer_block', 'output_state', 'hidden_state']],
+             outputs=__fields['left_highlight_block'],
+             queue=False,
+             show_progress='hidden'
+         )
+         # update the right plot when the focus changes
+         __fields['right_position_block'].change(
+             fn=update_hidden_plot,
+             inputs=[__fields[__k] for __k in ['right_position_block', 'right_layer_block', 'axes_block', 'points_block', 'hidden_state']],
+             outputs=__fields['right_plot_block'],
+             queue=False,
+             show_progress='hidden')
+         __fields['right_layer_block'].change(
+             fn=update_hidden_plot,
+             inputs=[__fields[__k] for __k in ['right_position_block', 'right_layer_block', 'axes_block', 'points_block', 'hidden_state']],
+             outputs=__fields['right_plot_block'],
+             queue=False,
+             show_progress='hidden'
+         ).then(
+             # update the right token scores when the focus changes
+             fn=__score,
+             inputs=[__fields[__k] for __k in ['right_layer_block', 'output_state', 'hidden_state']],
+             outputs=__fields['right_highlight_block'],
+             queue=False,
+             show_progress='hidden'
+         )
+         # update the token highlight when the token focus changes
+         __fields['left_position_block'].change(
+             fn=__highlight,
+             inputs=[__fields[__k] for __k in ['left_position_block', 'right_position_block', 'output_state']],
+             outputs=__fields['highlight_block'],
+             queue=False,
+             show_progress='hidden')
+         __fields['right_position_block'].change(
+             fn=__highlight,
+             inputs=[__fields[__k] for __k in ['left_position_block', 'right_position_block', 'output_state']],
+             outputs=__fields['highlight_block'],
+             queue=False,
+             show_progress='hidden')
+     # gradio application
+     return __app
+
+ # MAIN #########################################################################
+
+ if __name__ == '__main__':
+     __app = create_app()
+     __app.launch(share=True, debug=True)
@@ -0,0 +1,187 @@
+ import functools
+ import math
+
+ import matplotlib
+ import numpy
+ import torch
+
+ import mlable.shapes
+
+ # GENERATE #####################################################################
+
+ @functools.lru_cache(maxsize=32)
+ def generate_token_ids(
+         model_obj: object,
+         input_ids: torch.Tensor,
+         token_num: int,
+         topk_num: int=4,
+         topp_num: float=0.9,
+         attention_mask: torch.Tensor=None,
+ ) -> tuple:
+     # generate completion
+     with torch.no_grad():
+         __outputs = model_obj.generate(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             max_new_tokens=token_num,
+             do_sample=(0.0 < topp_num < 1.0) or (topk_num > 0),
+             top_k=topk_num if (topk_num > 0) else None,
+             top_p=topp_num if (0.0 < topp_num < 1.0) else None,
+             return_dict_in_generate=True,
+             output_hidden_states=True,
+             output_attentions=False,
+             output_scores=False,
+             # early_stopping=True,
+             use_cache=True)
+     # ((B, T), O * L * (B, I, E))
+     return __outputs.sequences, __outputs.hidden_states
+
+ # MERGE ########################################################################
+
+ def merge_hidden_states(
+         hidden_data: tuple, # O * L * (B, I, E), as returned by generate_token_ids
+ ) -> torch.Tensor:
+     # parse the inputs
+     __token_dim = len(hidden_data)
+     __layer_dim = len(hidden_data[0])
+     # stack the data for each layer => (B, L, I + O, E)
+     return torch.stack(
+         [
+             # concatenate the data for all the tokens => (B, I + O, E)
+             torch.concatenate([hidden_data[__t][__l] for __t in range(__token_dim)], dim=1)
+             for __l in range(__layer_dim)],
+         dim=1)
+
+ # REDUCE #######################################################################
+
+ def reduce_hidden_states(
+         hidden_data: torch.Tensor, # (B, L, T, E)
+         layer_idx: int, # -1 => select all layers
+         token_idx: int, # -1 => select all tokens
+         axes_idx: int=2, # token sequence axis
+ ) -> torch.Tensor:
+     # parse the hidden states (B, L, T, E)
+     __batch_dim, __layer_dim, __token_dim, __hidden_dim = tuple(hidden_data.shape)
+     __layer_idx = min(layer_idx, __layer_dim - 1)
+     __token_idx = min(token_idx, __token_dim - 1)
+     # select the relevant data along each axis
+     __layer_slice = slice(0, __layer_dim) if (__layer_idx < 0) else slice(__layer_idx, __layer_idx + 1)
+     __token_slice = slice(0, __token_dim) if (__token_idx < 0) else slice(__token_idx, __token_idx + 1)
+     # filter the data
+     __data = hidden_data[slice(None), __layer_slice, __token_slice, slice(None)]
+     # reduce the token axis => (B, L, E)
+     return __data.mean(dim=axes_idx, keepdim=False)
+
+ # RESCALE ######################################################################
+
+ def rescale_hidden_states(
+         hidden_data: torch.Tensor, # (B, L, E) or (B, E)
+ ) -> torch.Tensor:
+     # compute the scale of the data, layer by layer
+     __s = torch.quantile(hidden_data.abs(), q=0.9, dim=-1, keepdim=True)
+     # log scaling on large values and linear near 0
+     __a = torch.asinh(hidden_data / (__s + torch.finfo().eps))
+     # clip and map to [-1; 1]
+     return 0.33 * __a.clamp(min=-3, max=3)
+
+ # RESHAPE ######################################################################
+
+ def reshape_hidden_states(
+         hidden_data: torch.Tensor, # (B, L, E) or (B, E)
+         layer_idx: int=1,
+ ) -> torch.Tensor:
+     # parse the shape
+     __shape = tuple(hidden_data.shape)
+     # factor the hidden dimension
+     __factor = 2 ** round(0.5 * math.log2(__shape[-1]))
+     # compute the shape with the last axis split
+     __shape = mlable.shapes.divide(shape=__shape, axis=-1, factor=__factor, insert=True, right=True)
+     # move the layer axis at the end
+     __perm = mlable.shapes.move(shape=range(len(__shape)), before=layer_idx, after=-1)
+     # reshape into (B, W, H, L) or (B, W, H)
+     return hidden_data.reshape(__shape).permute(*__perm)
+
+ # MASK #########################################################################
+
+ def mask_hidden_states(
+         hidden_data: torch.Tensor, # (B, L, E)
+         topk_num: int=128,
+ ) -> torch.Tensor:
+     # sanitize
+     __k = min(topk_num, int(hidden_data.shape[-1]))
+     # indices of the topk values
+     __indices = hidden_data.abs().topk(__k, dim=-1, largest=True, sorted=False).indices
+     # initialize the mask with False
+     __mask = torch.zeros_like(hidden_data, dtype=torch.bool)
+     # (B, L, E) mask of the topk values
+     return __mask.scatter_(dim=-1, index=__indices, value=True)
+
+ # FORMAT #######################################################################
+
+ def color_hidden_states(
+         hidden_data: numpy.ndarray, # (B, W, H, L)
+         color_map: callable=matplotlib.colormaps['coolwarm'],
+ ) -> numpy.ndarray:
+     # [-1; 1] => [0; 1]
+     __data = 0.5 * (hidden_data + 1.0)
+     # (B, W, H, L) => (B, W, H, L, 4)
+     __rgba = color_map(__data)
+     # (B, W, H, L, 3) in [0; 1]
+     return __rgba[..., :3]
+
+ def size_hidden_states(
+         hidden_data: numpy.ndarray, # (B, W, H, L)
+         area_min: float=0.01,
+         area_max: float=16.0,
+         gamma_val: float=1.6,
+ ) -> numpy.ndarray:
+     # [-1; 1] => [0; 1]
+     __data = numpy.abs(hidden_data)
+     # gamma < 1 boosts small values, gamma > 1 emphasizes larger ones
+     __data = (__data + numpy.finfo(numpy.float32).eps) ** gamma_val
+     # map to point area
+     return area_min + (area_max - area_min) * __data
+
+ # KL SCORES ####################################################################
+
+ def kl_from_logprobs(
+         p_log: torch.Tensor,
+         q_log: torch.Tensor,
+ ) -> torch.Tensor:
+     # compute the KL div from log probabilities (B, T, E) or (T, E)
+     return (p_log.exp() * (p_log - q_log)).sum(dim=-1)
+
+ def jsd_from_logits(
+         final_logits: torch.Tensor,
+         prefix_logits: torch.Tensor,
+ ) -> torch.Tensor:
+     # compute the log probs from logits (B, T, E) or (T, E)
+     __p = torch.log_softmax(final_logits.float(), dim=-1)
+     __q = torch.log_softmax(prefix_logits.float(), dim=-1)
+     # m = 0.5 * (p + q) in log-space (logsumexp trick)
+     __m = torch.logsumexp(torch.stack([__p, __q], dim=0), dim=0) - math.log(2.0)
+     # compute the JSD metric
+     __jsd = 0.5 * kl_from_logprobs(__p, __m) + 0.5 * kl_from_logprobs(__q, __m)
+     # scale to [0; 1]
+     return (__jsd / math.log(2.0)).clamp(0.0, 1.0)
+
+ # POSTPROCESS ##################################################################
+
+ def postprocess_focus_cls(
+         left_idx: int,
+         right_idx: int,
+         token_dim: int,
+ ) -> list:
+     __left_idx = max(-1, min(token_dim, left_idx))
+     __right_idx = max(-1, min(token_dim, right_idx))
+     # class 1 for the token(s) focused on the left, 0 for the rest
+     __left_cls = token_dim * [1] if (__left_idx < 0) else [int(__i == __left_idx) for __i in range(token_dim)]
+     # class 2 for the token(s) focused on the right, 0 for the rest
+     __right_cls = token_dim * [2] if (__right_idx < 0) else [2 * int(__i == __right_idx) for __i in range(token_dim)]
+     # sum the classes so that the overlap has class 3
+     return [str(__l + __r) for __l, __r in zip(__left_cls, __right_cls)]
+
+ def postprocess_score_cls(
+         score_data: torch.Tensor,
+ ) -> list:
+     # map each score in [0; 1] to an integer label in [0; 100], matching the keys of the score cmap
+     return [str(round(100.0 * float(__s))) for __s in score_data.numpy().tolist()]