PyPI - fusion-bench - Versions diffs - 0.2.9__py3-none-any.whl - Mend

fusion-bench 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (727) hide show

fusion_bench/dataset/arc_agi/np_cache.py ADDED Viewed

@@ -0,0 +1,168 @@
+from collections import OrderedDict, namedtuple
+from functools import wraps
+from itertools import chain
+from typing import Callable, Optional, TypeVar, cast
+import numpy as np
+from xxhash import xxh3_64_hexdigest
+# Public API of this module: only the decorator itself is exported.
+__all__ = ["np_lru_cache"]
+# Type variable bound to Callable; used to annotate the decorator's argument and
+# return value.
+TCallable = TypeVar("TCallable", bound=Callable)
+# Record returned by the wrapped function's ``cache_info()``: hit and miss counters,
+# the configured maxsize and the current number of cached entries.
+_NpCacheInfo = namedtuple("NpCacheInfo", ["hits", "misses", "maxsize", "currsize"])
+def np_lru_cache(
+    user_function: TCallable = None, *, maxsize: Optional[int] = 16
+) -> TCallable:
+    """Wrapper similar to functool's lru_cache, but can handle caching numpy arrays.
+    Uses xxhash to hash the raw bytes of the argument array(s) + shape information
+    to prevent collisions on arrays with identical data but different dimensions.
+    Intentionally has a smaller default maxsize than lru_cache - if you're using this
+    wrapper, you are likely trying to avoid some slow computations on large arrays.
+    There's no reason to hold onto 128 of these in memory unless you have to.
+    Exposes .cache_info and .cache_clear methods, much like lru_cache.
+    Does not have the thread-safety features of lru_cache.
+    Parameters
+    ----------
+    user_function : TCallable, optional
+    maxsize : int, optional
+        Max number of entries in the cache. None for no limit, by default 16
+    Returns
+    -------
+    TCallable
+        Wrapped function. Should be mypy-compliant.
+    Notes
+    ------
+    This is implemented similarly to the old lru_cache implementation that
+    was discarded for performance upgrades. Generating a hash is by far the
+    slowest step of this wrapper, however, so optimizing getting and setting
+    the cache is not really going to yield much benefit.
+    Does NOT look inside of collections to generate a hash for a number of
+    performance-related reasons. For example, this function can be cached:
+    >>> fun(np.array([1, 2, 3]), 3, kind="type")
+    This one cannot:
+    >>> fun(arrays = [np.array([1, 2, 3]), np.array([4, 5, 6])])
+    """
+    if isinstance(maxsize, int):
+        if maxsize < 0:
+            maxsize = 0
+    def actual_np_cache(user_function):
+        # OrderedDict is not threadsafe for updates, but is for reads.
+        # The use case for this wrapper is CPU-bound tasks so
+        # worrying about thread-safety adds unnecessary overhead
+        cache = OrderedDict()
+        hits = misses = 0
+        cache_len = cache.__len__
+        cache_del = cache.popitem
+        cache_move_to_end = cache.move_to_end
+        full = False
+        if maxsize is None:
+            @wraps(user_function)
+            def _np_cache_wrapper(*args, **kwargs):
+                nonlocal hits, misses
+                key = _make_hash_key(*args, **kwargs)
+                if key not in cache:
+                    misses += 1
+                    cache[key] = user_function(*args, **kwargs)
+                else:
+                    hits += 1
+                return cache[key]
+        elif maxsize == 0:
+            @wraps(user_function)
+            def _np_cache_wrapper(*args, **kwargs):
+                nonlocal misses
+                misses += 1
+                return user_function(*args, **kwargs)
+        else:
+            @wraps(user_function)
+            def _np_cache_wrapper(*args, **kwargs):
+                nonlocal hits, misses, full
+                key = _make_hash_key(*args, **kwargs)
+                if key not in cache:
+                    misses += 1
+                    cache[key] = user_function(*args, **kwargs)
+                    if full:
+                        cache_del(last=False)
+                    else:
+                        full = cache_len() >= maxsize
+                else:
+                    hits += 1
+                    cache_move_to_end(key)
+                return cache[key]
+        def cache_info():
+            return _NpCacheInfo(hits, misses, maxsize, cache_len())
+        def cache_clear():
+            nonlocal hits, misses, full
+            cache.clear()
+            hits = misses = 0
+            full = False
+        _np_cache_wrapper.cache_info = cache_info
+        _np_cache_wrapper.cache_clear = cache_clear
+        _np_cache_wrapper.cache = cache
+        return _np_cache_wrapper
+    if user_function:
+        return cast(TCallable, actual_np_cache)(user_function)
+    return cast(TCallable, actual_np_cache)
+HASH_FUNCTIONS = {np.ndarray: xxh3_64_hexdigest}
+def _hasher(obj):
+    return HASH_FUNCTIONS.get(type(obj), hash)(obj)
+def _make_hash_key(*args, **kwargs):
+    """This approach cares about the order of keyword arguments (that is,
+    f(arr=a, order="c") will be cached separately from f(order="c", arr=a)
+    This is much slower than the equivalent function in functools.lru_cache
+    because every element must be inspected to determine if is an array.
+    Monkeypatching np.ndarray.__hash__ is unfortunately not possible, but
+    would fix this issue."""
+    key = tuple(map(_hasher, args))
+    if kwargs:
+        key += tuple(map(_hasher, chain.from_iterable(kwargs.items())))
+    return _HashedArrSeq(key)
+class _HashedArrSeq(list):
+    """Analogous to _HashedSeq in functools. Essentially caches the __hash__ call
+    so that the hash is not recomputed each time the OrderedDict cache interacts
+    with this object."""
+    __slots__ = "hashvalue"
+    def __init__(self, tup):
+        self.hashvalue = hash(tup)
+        self[:] = [self.hashvalue]
+    def __hash__(self):
+        return self.hashvalue

fusion_bench/dataset/arc_agi/preprocess.py ADDED Viewed

@@ -0,0 +1,298 @@
+import itertools
+from typing import Any, Dict, List, Mapping
+import numpy as np
+from typing_extensions import TYPE_CHECKING
+from .arc import Task
+from .augmenters import (
+    Augmenter,
+    Chain,
+    Concat,
+    Flip,
+    IdentityAugmenter,
+    IncreaseHeight,
+    IncreaseResolution,
+    IncreaseWidth,
+    PermuteColors,
+    PermuteExamples,
+    RandomTranslateXY,
+    Reflect,
+    Repeat,
+    Rotate,
+    Transpose,
+)
+from .messagers import MessageRepresenter
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizer
+def get_augmenters(
+    include_basic: bool = True,
+    include_size: bool = True,
+    include_chain: bool = True,
+    include_repeat: bool = True,
+    include_concat: bool = False,
+) -> List[Augmenter]:
+    basic_augmenters_to_apply = (
+        [
+            Rotate(90),
+            Rotate(270),
+            Rotate(180),
+            Flip(0),
+            Flip(1),
+            Reflect(0, reverse=True),
+            Reflect(1, reverse=True),
+            Reflect(0, reverse=False),
+            Reflect(1, reverse=False),
+            RandomTranslateXY(),
+            Transpose(),
+        ]
+        if include_basic
+        else []
+    )
+    size_augmenters_to_apply = (
+        [
+            IncreaseResolution(2),
+            IncreaseHeight(2),
+            IncreaseWidth(2),
+        ]
+        if include_size
+        else []
+    )
+    concat_augmenters_to_apply = (
+        [
+            Concat((IdentityAugmenter(), Rotate(180)), axis=0),
+            Concat((IdentityAugmenter(), Rotate(180)), axis=1),
+        ]
+        if include_concat
+        else []
+    )
+    chain_augmenters_to_apply = (
+        [
+            Chain([Rotate(90), IncreaseResolution(2)]),
+            Chain([Rotate(270), IncreaseResolution(2)]),
+            Chain([Rotate(180), IncreaseResolution(2)]),
+            Chain([Flip(0), IncreaseResolution(2)]),
+            Chain([Flip(1), IncreaseResolution(2)]),
+            Chain([Transpose(), IncreaseResolution(2)]),
+        ]
+        if include_chain
+        else []
+    )
+    repeat_augmenters_to_apply = (
+        [
+            Repeat(0, 2),
+            Repeat(1, 2),
+            Repeat(2, 2),
+        ]
+        if include_repeat
+        else []
+    )
+    augmenters_to_apply = (
+        basic_augmenters_to_apply
+        + size_augmenters_to_apply
+        + concat_augmenters_to_apply
+        + chain_augmenters_to_apply
+        + repeat_augmenters_to_apply
+    )
+    print(
+        "Augmenters to apply: ", augmenters_to_apply, "len: ", len(augmenters_to_apply)
+    )
+    return augmenters_to_apply
+def format_and_filter(
+    formatter: MessageRepresenter,
+    tokenizer: "PreTrainedTokenizer",
+    task: Task,
+):
+    """
+    Formats and filters a task for model input.
+    Args:
+        formatter (MessageRepresenter): The formatter to encode the task.
+        tokenizer (PreTrainedTokenizer): The tokenizer to tokenize the conversation.
+        task: The task to be formatted and filtered.
+    Returns:
+        Dict[str, Any]: A dictionary containing the formatted data with keys:
+            - "input_ids": The tokenized input IDs.
+            - "attention_mask": The attention mask for the input IDs.
+            - "labels": The labels for the input IDs.
+            - "task_id": The task ID.
+            - "num_prompt_tokens": The number of prompt tokens.
+            - "num_output_tokens": The number of output tokens.
+    """
+    task_id = task.name
+    task = formatter.encode(task)
+    conversation = task[0] + [task[1]]
+    assert conversation[-1]["role"] == "assistant", "Last message should be assistant"
+    prompt_tokens = tokenizer.apply_chat_template(
+        conversation[:-1], tokenize=True, add_generation_prompt=True
+    )
+    generation_tokens = tokenizer.apply_chat_template(conversation, tokenize=True)
+    output_tokens = generation_tokens[len(prompt_tokens) :]
+    data = {
+        "input_ids": prompt_tokens + output_tokens,
+        "attention_mask": [1] * len(prompt_tokens) + [1] * len(output_tokens),
+        "labels": prompt_tokens + output_tokens,
+        "task_id": task_id,
+        "num_prompt_tokens": len(prompt_tokens),
+        "num_output_tokens": len(output_tokens),
+    }
+    return data
+def get_test_time_train_data(
+    original_task: Task,
+    augmenters: List[Augmenter],
+    n: int = 1,
+    permute_n: int = 1,
+    seed: int = 0,
+) -> List[Task]:
+    """
+    Generates augmented training data for test-time training.
+    Args:
+        original_task (Task): The original task containing training examples.
+        augmenters (List[Augmenter]): A list of augmenters to apply to the tasks.
+        n (int, optional): The number of examples to leave out for testing. Defaults to 1.
+        permute_n (int, optional): The number of times to permute the augmented tasks. Defaults to 1.
+        seed (int, optional): The random seed for reproducibility. Defaults to 0.
+    Returns:
+        List[Task]: A list of augmented tasks.
+    """
+    rng = np.random.RandomState(seed)
+    train_examples = original_task.train_examples.copy()
+    initial_tasks = []
+    N = len(train_examples)
+    for i in range(len(train_examples)):
+        examples = train_examples.copy()
+        indices = set(range(N)) - {i}
+        # we already remove i, so we need to remove n-1 more
+        combs = list(itertools.combinations(indices, n - 1))
+        combs = [indices - set(comb) for comb in combs]
+        for comb in combs:
+            initial_tasks.append(
+                Task(
+                    name=original_task.name,
+                    train_examples=[examples[j] for j in comb],
+                    test_example=examples[i],
+                )
+            )
+    augmented_tasks = []
+    for augmenter in augmenters:
+        for task in initial_tasks:
+            task = augmenter.apply_to_task(task, to_input=True, to_output=True, rng=rng)
+            # some augmentations increase shapes
+            if not (task.max_height() <= 30 and task.max_width() <= 30):
+                continue
+            augmented_tasks.append(task)
+    augmented_tasks = list(set(augmented_tasks + initial_tasks))
+    color_and_permute_augmented_tasks = []
+    for _ in range(permute_n):
+        for task in augmented_tasks:
+            if len(augmenters) != 0:
+                new_task = PermuteColors().apply_to_task(
+                    task, to_input=True, to_output=True, rng=rng
+                )
+            else:
+                new_task = task
+            new_task = PermuteExamples().apply_to_task(
+                new_task, rng=rng, to_input=True, to_output=True
+            )
+            color_and_permute_augmented_tasks.append(new_task)
+    augmented_tasks = color_and_permute_augmented_tasks + augmented_tasks
+    augmented_tasks = list(set(augmented_tasks))
+    return augmented_tasks
+def get_formatted_data(
+    task: Task,
+    augmenters: List[Augmenter],
+    formatter: MessageRepresenter,
+    tokenizer: "PreTrainedTokenizer",
+    leave_n: int = 1,
+    permute_n: int = 1,
+    seed: int = 0,
+    max_tokens: int = 8192,
+):
+    train_data = get_test_time_train_data(
+        task, augmenters, n=leave_n, permute_n=permute_n, seed=seed
+    )
+    formatted_data = []
+    for task in train_data:
+        formatted = format_and_filter(formatter, tokenizer, task)
+        if len(formatted["input_ids"]) < max_tokens:
+            formatted_data.append(formatted)
+    return formatted_data
+def process_task_for_ttt(
+    task: Task,
+    augmenters: List[Augmenter],
+    formatter: MessageRepresenter,
+    tokenizer: "PreTrainedTokenizer",
+    permute_n: int = 1,
+    Nmax: int = 250,
+    seed: int = 0,
+):
+    rng = np.random.RandomState(seed)
+    leave_1_train_data = get_formatted_data(
+        task,
+        augmenters,
+        formatter,
+        tokenizer,
+        leave_n=1,
+        permute_n=permute_n,
+        seed=seed,
+    )
+    leave_2_train_data = get_formatted_data(
+        task,
+        augmenters,
+        formatter,
+        tokenizer,
+        leave_n=2,
+        permute_n=permute_n,
+        seed=seed,
+    )
+    train = leave_1_train_data
+    if len(train) == 0:
+        train = leave_2_train_data
+    elif len(train) < Nmax:
+        train += leave_2_train_data[: Nmax - len(train)]
+    elif len(train) > Nmax:
+        rng.shuffle(train)
+        train = train[:Nmax]
+    return train
+def process_task(
+    task: Task,
+    formatter: MessageRepresenter,
+    tokenizer: "PreTrainedTokenizer",
+):
+    formatted = format_and_filter(formatter, tokenizer, task)
+    return [formatted]