PyPI - elastic-kernel - Versions diffs - 0.0.2__py3-none-any.whl - Mend

elastic-kernel 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

elastic_kernel/__init__.py +0 -0
elastic_kernel/command.py +43 -0
elastic_kernel/kernel.json +5 -0
elastic_kernel/kernel.py +258 -0
elastic_kernel-0.0.2.dist-info/METADATA +291 -0
elastic_kernel-0.0.2.dist-info/RECORD +42 -0
elastic_kernel-0.0.2.dist-info/WHEEL +5 -0
elastic_kernel-0.0.2.dist-info/entry_points.txt +2 -0
elastic_kernel-0.0.2.dist-info/licenses/LICENSE +201 -0
elastic_kernel-0.0.2.dist-info/top_level.txt +2 -0
elastic_notebook/__init__.py +0 -0
elastic_notebook/algorithm/__init__.py +0 -0
elastic_notebook/algorithm/baseline.py +31 -0
elastic_notebook/algorithm/optimizer_exact.py +121 -0
elastic_notebook/algorithm/selector.py +41 -0
elastic_notebook/core/__init__.py +0 -0
elastic_notebook/core/common/__init__.py +0 -0
elastic_notebook/core/common/checkpoint_file.py +129 -0
elastic_notebook/core/common/profile_graph_size.py +39 -0
elastic_notebook/core/common/profile_migration_speed.py +69 -0
elastic_notebook/core/common/profile_variable_size.py +66 -0
elastic_notebook/core/graph/__init__.py +0 -0
elastic_notebook/core/graph/cell_execution.py +39 -0
elastic_notebook/core/graph/graph.py +75 -0
elastic_notebook/core/graph/variable_snapshot.py +31 -0
elastic_notebook/core/io/__init__.py +0 -0
elastic_notebook/core/io/adapter.py +18 -0
elastic_notebook/core/io/filesystem_adapter.py +30 -0
elastic_notebook/core/io/migrate.py +98 -0
elastic_notebook/core/io/pickle.py +71 -0
elastic_notebook/core/io/recover.py +51 -0
elastic_notebook/core/mutation/__init__.py +0 -0
elastic_notebook/core/mutation/fingerprint.py +184 -0
elastic_notebook/core/mutation/id_graph.py +147 -0
elastic_notebook/core/mutation/object_hash.py +204 -0
elastic_notebook/core/notebook/__init__.py +0 -0
elastic_notebook/core/notebook/checkpoint.py +222 -0
elastic_notebook/core/notebook/find_input_vars.py +117 -0
elastic_notebook/core/notebook/find_output_vars.py +18 -0
elastic_notebook/core/notebook/restore_notebook.py +91 -0
elastic_notebook/core/notebook/update_graph.py +46 -0
elastic_notebook/elastic_notebook.py +336 -0

elastic_notebook/core/mutation/id_graph.py ADDED Viewed

@@ -0,0 +1,147 @@
+from inspect import isclass
+from types import ModuleType
# Types treated as leaf values: objects of these types never get an ID graph node.
PRIMITIVES = {type(None), int, float, bool, str}
# Container types whose members are traversed directly.
ITERABLES = {tuple, set, list}


def OBJECT_FILTER_FUNC(x):
    """
    Predicate over ``(attribute_name, attribute_value)`` pairs.

    Returns True for attributes whose name does not start with an underscore and whose value is not
    a primitive.
    """
    return not x[0].startswith("_") and type(x[1]) not in PRIMITIVES


def ITERABLE_FILTER_FUNC(x):
    """
    Predicate over container members: True when the member is not a primitive.

    The check is made on ``type(x)``. Testing the member itself against PRIMITIVES (a set of types)
    never matches a value and raises TypeError for unhashable members such as lists.
    """
    return type(x) not in PRIMITIVES
class IdGraphNode:
    """
    A single node of an ID graph, which models the objects reachable from a given object.

    Attributes:
        obj_id: id (memory address) of the object this node stands for.
        obj_type: type of the object (the graph builder in this module passes the type's name).
        child_nodes: nodes for the objects reachable from this object (i.e. instance attributes,
            list/set/tuple members, dict keys and values).
    """

    def __init__(self, obj_id, obj_type, child_nodes):
        # The list is stored by reference; the builder keeps appending to it after construction.
        self.child_nodes = child_nodes
        self.obj_type = obj_type
        self.obj_id = obj_id
def is_primitive(obj_type):
    """
    Tell whether ``obj_type`` is one of the types listed in PRIMITIVES.

    Args:
        obj_type: a type object (callers pass ``type(value)``, not the value itself).
    """
    found = obj_type in PRIMITIVES
    return found
def is_root_equals(node1, node2):
    """
    Compare only the root nodes of 2 ID graphs (object id and type); children are ignored.

    Two missing graphs (both None) count as equal; a missing graph never equals a present one.
    """
    if node1 is None or node2 is None:
        # Equal only when both sides are missing.
        return node1 is None and node2 is None
    if node1.obj_id != node2.obj_id:
        return False
    return node1.obj_type == node2.obj_type
def is_structure_equals_helper(node1, node2, visited):
    """
    Depth-first helper for comparing two ID graphs node by node.

    Every pair of corresponding nodes must agree on object id, object type and number of children.
    A node of the first graph that was already expanded is not expanded again (this is what makes
    cyclic graphs terminate), but its counterpart in the second graph is still checked, so a
    repeated reference on one side cannot be matched against a different object on the other side.

    The traversal uses an explicit stack instead of recursion so that deeply nested structures do
    not hit the interpreter's recursion limit.

    Args:
        node1: root node of the first graph.
        node2: root node of the second graph.
        visited (set): nodes of the first graph whose children were already scheduled; updated
            in place.

    Returns:
        bool: True if the two graphs have the same structure.
    """
    pending = [(node1, node2)]
    while pending:
        left, right = pending.pop()
        # A missing node only matches another missing node.
        if left is None or right is None:
            if left is not right:
                return False
            continue
        if (
            left.obj_id != right.obj_id
            or left.obj_type != right.obj_type
            or len(left.child_nodes) != len(right.child_nodes)
        ):
            return False
        if left in visited:
            continue
        visited.add(left)
        pending.extend(zip(left.child_nodes, right.child_nodes))
    return True
def is_structure_equals(node1, node2):
    """
    Compare 2 ID graphs for structural equality.

    Two missing graphs (both None) are equal, a missing graph never equals a present one, and two
    present graphs are compared node by node starting from a fresh visited set.
    """
    either_missing = node1 is None or node2 is None
    if either_missing:
        # With at least one side missing, the graphs match only if both are missing.
        return node1 is None and node2 is None
    seen = set()
    return is_structure_equals_helper(node1, node2, seen)
def _id_graph_child(child_obj, visited):
    """Return the node already built for ``child_obj``, or build (and register) a new one."""
    known = visited.get(id(child_obj))
    if known is not None:
        return known
    return construct_id_graph_node(child_obj, visited)


def construct_id_graph_node(obj, visited):
    """
    Helper function for constructing an ID graph. Builds the node for ``obj`` and recurses
    (depth-first) into the non-primitive objects reachable from it.

    Reachable objects are:
      * the members of a tuple, set or list;
      * the public (no leading underscore) instance attributes of an object that has a
        ``__dict__`` and is neither a module nor a class;
      * the keys and the values of a dict. Keys and values are considered independently, so a
        value stored under a primitive key (e.g. a string) is still traversed.

    Args:
        obj: object.
        visited (dict): maps id(object) -> IdGraphNode for every object seen so far. Used both for
            handling cyclic references and for sharing one node per object. Updated in place.

    Returns:
        IdGraphNode for ``obj``, or None if ``obj`` is None.
    """
    if obj is None:
        return None

    obj_type = type(obj)
    child_list = []
    id_graph_node = IdGraphNode(id(obj), obj_type.__name__, child_list)
    # Register before recursing so that cycles resolve to this node.
    visited[id(obj)] = id_graph_node

    if obj_type in ITERABLES:
        # Note for sets: the original author relies on set iteration order being deterministic
        # within the same session.
        members = obj
    elif (
        hasattr(obj, "__dict__")
        and not isinstance(obj, ModuleType)
        and not isclass(obj)
    ):
        members = (value for _, value in filter(OBJECT_FILTER_FUNC, vars(obj).items()))
    elif obj_type is dict:
        # Flatten to key, value, key, value, ... so both sides of each item are candidates.
        members = (part for item in obj.items() for part in item)
    else:
        members = ()

    for member in members:
        if is_primitive(type(member)):
            continue
        child_list.append(_id_graph_child(member, visited))
    return id_graph_node
def construct_id_graph(obj):
    """
    Construct the ID graph for an object.

    Returns:
        graph: the root node of the ID graph, as produced by construct_id_graph_node.
        ids (set): the ids of all objects that received a node in the graph. For checking object
            overlaps.
    """
    nodes_by_id = {}
    root = construct_id_graph_node(obj, nodes_by_id)
    reachable_ids = set(nodes_by_id)
    return root, reachable_ids

elastic_notebook/core/mutation/object_hash.py ADDED Viewed

@@ -0,0 +1,204 @@
+import copy
+import io
+from inspect import isclass
+from types import FunctionType, ModuleType
+import dill
+# import polars as pl
+import lightgbm
+import numpy as np
+import pandas as pd
+import scipy
+import torch
+import xxhash
# Types whose instances construct_object_hash maps straight to an ImmutableObj marker.
BASE_TYPES = [
    type(None),
    FunctionType,
]
class ImmutableObj:
    """
    Marker returned by construct_object_hash for objects whose type is in BASE_TYPES.

    Carries no state: every instance equals every other instance and nothing else.
    """

    def __eq__(self, other):
        return isinstance(other, ImmutableObj)
class NoneObj:
    """
    Object representing none.

    Carries no state: every instance equals every other instance and nothing else.
    """

    def __eq__(self, other):
        return isinstance(other, NoneObj)
class DataframeObj:
    """
    Object representing a dataframe (construct_object_hash returns it for pandas DataFrame and
    Series objects).

    Carries no state: every instance equals every other instance and nothing else.
    """

    def __eq__(self, other):
        return isinstance(other, DataframeObj)
class NxGraphObj:
    """
    Wrapper around a networkx graph that compares by graph content.

    Two wrappers are equal when their graphs have equal adjacency (``adj``), node data (``nodes``)
    and graph-level attributes (``graph``). This is the same comparison networkx's
    ``graphs_equal`` utility performs; it is spelled out here because this module does not import
    networkx, so referring to ``nx`` would raise NameError.
    """

    def __init__(self, graph):
        self.graph = graph

    def __eq__(self, other):
        if not isinstance(other, NxGraphObj):
            return False
        mine, theirs = self.graph, other.graph
        return (
            mine.adj == theirs.adj
            and mine.nodes == theirs.nodes
            and mine.graph == theirs.graph
        )
class NpArrayObj:
    """
    Fingerprint of a numpy array.

    Attributes:
        arraystr: digest of the array contents (construct_object_hash stores an integer xxhash
            digest here). Two instances are equal when their digests are equal.
    """

    def __init__(self, arraystr):
        self.arraystr = arraystr

    def __eq__(self, other):
        return isinstance(other, NpArrayObj) and self.arraystr == other.arraystr
class ScipyArrayObj:
    """
    Fingerprint of a scipy sparse matrix.

    Attributes:
        arraystr: digest of the matrix contents (construct_object_hash stores an integer xxhash
            digest here). Two instances are equal when their digests are equal.
    """

    def __init__(self, arraystr):
        self.arraystr = arraystr

    def __eq__(self, other):
        return isinstance(other, ScipyArrayObj) and self.arraystr == other.arraystr
class TorchTensorObj:
    """
    Fingerprint of a torch tensor.

    Attributes:
        arraystr: digest of the tensor contents (construct_object_hash stores an integer xxhash
            digest here). Two instances are equal when their digests are equal.
    """

    def __init__(self, arraystr):
        self.arraystr = arraystr

    def __eq__(self, other):
        return isinstance(other, TorchTensorObj) and self.arraystr == other.arraystr
class ModuleObj:
    """
    Marker returned by construct_object_hash for module objects.

    Carries no state: every instance equals every other instance and nothing else.
    """

    def __eq__(self, other):
        return isinstance(other, ModuleObj)
class UnserializableObj:
    """
    Object representing general unserializable class (construct_object_hash returns it when an
    object can be neither hashed nor deep-copied).

    Carries no state: every instance equals every other instance and nothing else.
    """

    def __eq__(self, other):
        return isinstance(other, UnserializableObj)
class UncomparableObj:
    """
    Marker returned by construct_object_hash for objects it does not attempt to fingerprint
    (objects from certain libraries, and file handles).

    Carries no state: every instance equals every other instance and nothing else.
    """

    def __eq__(self, other):
        return isinstance(other, UncomparableObj)
# Objects whose ``__module__`` contains one of these substrings are not fingerprinted.
_UNCOMPARABLE_MODULE_HINTS = (
    "matplotlib",
    "transformers",
    "networkx",
    "keras",
    "tensorflow",
)


def _xxh3_digest(*buffers):
    """Return the 128-bit xxh3 integer digest of ``buffers``, fed to the hasher in order."""
    hasher = xxhash.xxh3_128()
    for buffer in buffers:
        hasher.update(np.ascontiguousarray(buffer))
    return hasher.intdigest()


def construct_object_hash(obj, deepcopy=False):
    """
    Construct an object hash (a comparable fingerprint) for the object.

    The result depends on the kind of object:
      * type in BASE_TYPES -> ImmutableObj marker;
      * class -> ``type(obj)``;
      * pandas DataFrame / Series -> DataframeObj marker (see side effect below);
      * object from a library listed in _UNCOMPARABLE_MODULE_HINTS, or a file handle
        -> UncomparableObj marker;
      * numpy array, scipy CSR matrix, torch tensor -> wrapper around an xxhash digest of the data;
      * module -> ModuleObj marker;
      * lightgbm Dataset -> ``type(obj)``;
      * anything xxhash can consume directly -> the integer digest;
      * otherwise the object itself, or a deep copy of it when ``deepcopy`` is True; if the deep
        copy fails, an UnserializableObj marker.

    Side effect: for pandas objects the ``writeable`` flag of the underlying numpy arrays is set to
    False. If after cell execution any of these flags is reset to True, the dataframe is assumed
    to have been modified.

    Args:
        obj: object to fingerprint.
        deepcopy (bool): whether the last-resort fallback returns a deep copy instead of the
            object itself.
    """
    if type(obj) in BASE_TYPES:
        return ImmutableObj()

    if isclass(obj):
        return type(obj)

    # Flag hack for Pandas dataframes: each dataframe column is a numpy array.
    if isinstance(obj, pd.DataFrame):
        for _, col in obj.items():
            col.__array__().flags.writeable = False
        return DataframeObj()

    if isinstance(obj, pd.Series):
        obj.__array__().flags.writeable = False
        return DataframeObj()

    attr_str = getattr(obj, "__module__", None)
    if attr_str and any(hint in attr_str for hint in _UNCOMPARABLE_MODULE_HINTS):
        return UncomparableObj()

    # Object is file handle
    if isinstance(obj, io.IOBase):
        return UncomparableObj()

    if isinstance(obj, np.ndarray):
        return NpArrayObj(_xxh3_digest(obj.data))

    if isinstance(obj, scipy.sparse.csr_matrix):
        # Hash the arrays that define the matrix. Converting the sparse matrix itself with numpy
        # wraps the matrix object instead of exposing its contents.
        return ScipyArrayObj(
            _xxh3_digest(obj.shape, obj.data, obj.indices, obj.indptr)
        )

    if isinstance(obj, torch.Tensor):
        # detach() and cpu() make tensors that require grad or live on another device convertible.
        return TorchTensorObj(_xxh3_digest(obj.detach().cpu().numpy()))

    # Classes were already handled above, so only modules remain for this marker.
    if isinstance(obj, ModuleType):
        return ModuleObj()

    # LightGBM dataframes are immutable.
    if isinstance(obj, lightgbm.Dataset):
        return type(obj)

    # Try to hash the object directly; xxhash rejects objects that do not expose a byte buffer.
    try:
        hasher = xxhash.xxh3_128()
        hasher.update(obj)
        return hasher.intdigest()
    except Exception:
        pass

    if not deepcopy:
        return obj
    try:
        return copy.deepcopy(obj)
    except Exception:
        # If object is not even deepcopy-able, mark it as unserializable and assume
        # modified-on-write.
        return UnserializableObj()

elastic_notebook/core/notebook/__init__.py ADDED Viewed

File without changes

elastic_notebook/core/notebook/checkpoint.py ADDED Viewed

@@ -0,0 +1,222 @@
+import time
+from typing import Dict
+import numpy as np
+from ipykernel.zmqshell import ZMQInteractiveShell
+from elastic_notebook.algorithm.selector import Selector
+from elastic_notebook.core.common.profile_variable_size import profile_variable_size
+from elastic_notebook.core.graph.graph import DependencyGraph
+from elastic_notebook.core.io.migrate import migrate
+from elastic_notebook.core.mutation.object_hash import UnserializableObj
def _append_log_lines(write_log_location, notebook_name, optimizer_name, lines):
    """Append each entry of ``lines`` (plus a newline) to the experiment log file."""
    log_path = (
        write_log_location + "/output_" + notebook_name + "_" + optimizer_name + ".txt"
    )
    with open(log_path, "a") as f:
        f.writelines(line + "\n" for line in lines)


def checkpoint(
    graph: DependencyGraph,
    shell: ZMQInteractiveShell,
    fingerprint_dict: Dict,
    selector: Selector,
    udfs: set,
    filename: str,
    profile_dict,
    write_log_location=None,
    notebook_name=None,
    optimizer_name=None,
    elastic_notebook=None,
):
    """
    Checkpoints the notebook. The optimizer selects the VSs to migrate and recompute and the OEs to recompute, then
    writes the checkpoint as the specified filename.

    Args:
        graph (DependencyGraph): dependency graph representation of the notebook.
        shell (ZMQInteractiveShell): interactive Jupyter shell storing the state of the current session.
        fingerprint_dict (Dict): per-variable fingerprints keyed by variable name. Entry [1] of a
            fingerprint is intersected between variables to detect shared data and entry [2] is
            checked for UnserializableObj. Variables missing from this dict are skipped with a warning.
        selector (Selector): optimizer for computing the checkpointing configuration.
        udfs (set): set of user-declared functions.
        filename (str): location to write the file to.
        profile_dict: timings of earlier stages; the keys "idgraph" and "representation" are read
            only when write_log_location is set.
        write_log_location (str): location to write component runtimes to. For experimentation only.
        notebook_name (str): notebook name. For experimentation only.
        optimizer_name (str): optimizer name. For experimentation only.
        elastic_notebook (ElasticNotebook, optional): ElasticNotebook instance to update migration lists.

    Returns:
        bool: always True once migrate() has returned.
    """
    profile_start = time.time()

    # Retrieve active VSs from the graph. Active VSs correspond to the latest instances/versions of each variable.
    active_vss = set()
    print("---------------------------")
    print("all variables:")
    for vs_list in graph.variable_snapshots.values():
        latest_vs = vs_list[-1]
        if not latest_vs.deleted:
            print(f"name: {latest_vs.name}")
            active_vss.add(latest_vs)

    # Profile the size of each variable defined in the current session.
    for active_vs in active_vss:
        # Check whether the variable exists in fingerprint_dict.
        if active_vs.name not in fingerprint_dict:
            print(f"Warning: Variable '{active_vs.name}' not found in fingerprint_dict")
            continue

        attr_str = getattr(shell.user_ns[active_vs.name], "__module__", None)
        # Object is unserializable
        if isinstance(fingerprint_dict[active_vs.name][2], UnserializableObj):
            active_vs.size = np.inf
        # Blacklisted object
        elif attr_str and ("dataprep.eda" in attr_str or "bokeh" in attr_str):
            active_vs.size = np.inf
        # Profile size of object.
        else:
            active_vs.size = profile_variable_size(shell.user_ns[active_vs.name])

    # Check for pairwise variable intersections. Variables sharing underlying data must be migrated or recomputed
    # together. Every overlapping pair is recorded in both orders.
    overlapping_vss = []
    for active_vs1 in active_vss:
        for active_vs2 in active_vss:
            if active_vs1 == active_vs2:
                continue
            # Check that both variables exist in fingerprint_dict.
            if (
                active_vs1.name not in fingerprint_dict
                or active_vs2.name not in fingerprint_dict
            ):
                continue
            if fingerprint_dict[active_vs1.name][1].intersection(
                fingerprint_dict[active_vs2.name][1]
            ):
                overlapping_vss.append((active_vs1, active_vs2))
    profile_end = time.time()

    if write_log_location:
        _append_log_lines(
            write_log_location,
            notebook_name,
            optimizer_name,
            [
                "overlappings - " + repr(len(overlapping_vss)),
                # Elapsed time is end minus start.
                "Profile stage took - "
                + repr(profile_end - profile_start)
                + " seconds",
                "Idgraph stage took - " + repr(profile_dict["idgraph"]) + " seconds",
                "Representation stage took - "
                + repr(profile_dict["representation"])
                + " seconds",
            ],
        )

    optimize_start = time.time()

    # Initialize the optimizer.
    add_start = time.time()
    selector.dependency_graph = graph
    selector.active_vss = active_vss
    selector.overlapping_vss = overlapping_vss
    add_end = time.time()

    # Use the optimizer to compute the checkpointing configuration.
    opt_start = time.time()
    vss_to_migrate, ces_to_recompute = selector.select_vss(
        write_log_location, notebook_name, optimizer_name
    )
    opt_end = time.time()

    print("---------------------------")
    print("variables to migrate:")
    for vs in vss_to_migrate:
        print(f"name: {vs.name}, size: {vs.size}")

    difference_start = time.time()
    vss_to_recompute = active_vss - vss_to_migrate
    difference_end = time.time()

    print("---------------------------")
    print("variables to recompute:")
    for vs in vss_to_recompute:
        print(f"name: {vs.name}, size: {vs.size}")
    print([vs.name for vs in vss_to_recompute])

    print("---------------------------")
    print("cells to recompute:")
    for ce in ces_to_recompute:
        print(f"cell num: {ce.cell_num}, cell runtime: {ce.cell_runtime}")
    print(sorted([ce.cell_num + 1 for ce in ces_to_recompute]))

    optimize_end = time.time()

    if write_log_location:
        _append_log_lines(
            write_log_location,
            notebook_name,
            optimizer_name,
            [
                "Optimize stage took - "
                + repr(optimize_end - optimize_start)
                + " seconds",
                "  Add stage took - " + repr(add_end - add_start) + " seconds",
                "  Opt stage took - " + repr(opt_end - opt_start) + " seconds",
                "  Diff stage took - "
                + repr(difference_end - difference_start)
                + " seconds",
            ],
        )

    # Update the variable lists.
    if elastic_notebook is not None:
        elastic_notebook.update_migration_lists(vss_to_migrate, vss_to_recompute)

    # Store the notebook checkpoint to the specified location.
    migrate_start = time.time()
    migrate_success = True
    migrate(
        graph,
        shell,
        vss_to_migrate,
        vss_to_recompute,
        ces_to_recompute,
        udfs,
        selector.recomputation_ces,
        selector.overlapping_vss,
        filename,
    )
    migrate_end = time.time()

    if write_log_location:
        _append_log_lines(
            write_log_location,
            notebook_name,
            optimizer_name,
            [
                "Migrate stage took - "
                + repr(migrate_end - migrate_start)
                + " seconds",
            ],
        )

    return migrate_success

elastic_notebook/core/notebook/find_input_vars.py ADDED Viewed

@@ -0,0 +1,117 @@
+import ast
+import inspect
+from collections import deque
+from typing import Tuple
+from ipykernel.zmqshell import ZMQInteractiveShell
# Types whose values are treated as primitives when deciding whether a name read inside a
# function body counts as an input variable.
PRIMITIVES = {int, bool, str, float}


class Visitor(ast.NodeVisitor):
    """
    Node visitor for finding input variables.

    After ``visit`` has walked a tree, the results are available as:
        loads: names that were read (or augmented-assigned) and count as inputs.
        udfcalls: names that were called directly as ``name(...)``.
        functiondefs: names of the ``def`` statements encountered.
        globals: names declared with ``global``.

    A name is left out of ``loads`` only when all of these hold: it is used inside a function body,
    it was not declared ``global``, it exists in ``shell.user_ns``, and its current value there has
    a type listed in PRIMITIVES.
    """

    def __init__(self, shell, shell_udfs):
        # Whether we are currently in local scope (inside a function body).
        self.is_local = False

        # Functions declared in the visited code.
        self.functiondefs = set()
        self.udfcalls = set()
        self.loads = set()
        self.globals = set()
        self.shell = shell
        self.udfs = shell_udfs

    def _is_local_primitive(self, name):
        """Return True if ``name`` must be left out of the inputs (see the class docstring)."""
        return (
            self.is_local
            and name not in self.globals
            and name in self.shell.user_ns
            and type(self.shell.user_ns[name]) in PRIMITIVES
        )

    def visit_Name(self, node):
        if isinstance(node.ctx, ast.Load) and not self._is_local_primitive(node.id):
            self.loads.add(node.id)
        self.generic_visit(node)

    def visit_AugAssign(self, node):
        # ``x += 1`` reads x even though the Name node carries a Store context.
        target = node.target
        if isinstance(target, ast.Name) and not self._is_local_primitive(target.id):
            self.loads.add(target.id)
        self.generic_visit(node)

    def visit_Global(self, node):
        self.globals.update(node.names)
        self.generic_visit(node)

    def visit_Call(self, node):
        if isinstance(node.func, ast.Name):
            self.udfcalls.add(node.func.id)
        self.generic_visit(node)

    def visit_FunctionDef(self, node):
        self.functiondefs.add(node.name)
        # Restore the enclosing scope afterwards instead of forcing False, so that the remainder of
        # an outer function is still treated as local after a nested def.
        was_local = self.is_local
        self.is_local = True
        try:
            self.generic_visit(node)
        finally:
            self.is_local = was_local
def find_input_vars(
    cell: str, existing_variables: set, shell: ZMQInteractiveShell, shell_udfs: set
) -> Tuple[set, set]:
    """
    Capture the input variables of the cell via AST analysis.

    The cell is analysed first; then every user-declared function it calls by name is analysed as
    well, transitively, using the source of the function object found in ``shell.user_ns``. A
    function whose source cannot be retrieved or parsed is skipped rather than aborting the analysis.

    Args:
        cell (str): Raw cell code.
        existing_variables (set): Set of user-defined variables in the current session.
        shell (ZMQInteractiveShell): Shell of current session. For inferring variable types.
        shell_udfs (set): Set of user-declared functions in the shell.

    Returns:
        Tuple[set, set]: the input variable names (restricted to ``existing_variables``) and the
        names of the functions declared in the analysed code.

    Raises:
        SyntaxError: if ``cell`` itself cannot be parsed.
    """
    import textwrap

    # Analyse the cell code itself.
    cell_visitor = Visitor(shell=shell, shell_udfs=shell_udfs)
    cell_visitor.visit(ast.parse(cell))
    input_variables = set(cell_visitor.loads)
    function_defs = set(cell_visitor.functiondefs)

    # Breadth-first walk over the UDFs reachable through direct calls.
    udf_calls = deque()
    udf_calls_visited = set()

    def enqueue(called_names):
        for name in called_names:
            if name in shell_udfs and name not in udf_calls_visited:
                udf_calls_visited.add(name)
                udf_calls.append(name)

    enqueue(cell_visitor.udfcalls)
    while udf_calls:
        udf = udf_calls.popleft()
        try:
            # dedent so that functions defined inside an indented block still parse.
            source = textwrap.dedent(inspect.getsource(shell.user_ns[udf]))
            tree = ast.parse(source)
        except (KeyError, OSError, TypeError, SyntaxError):
            # The name is gone from the namespace, or its source is unavailable or unparsable.
            continue

        udf_visitor = Visitor(shell=shell, shell_udfs=shell_udfs)
        udf_visitor.visit(tree)
        input_variables |= udf_visitor.loads
        function_defs |= udf_visitor.functiondefs
        enqueue(udf_visitor.udfcalls)

    # A variable is an input only if it is in the shell before cell execution.
    return input_variables.intersection(existing_variables), function_defs

elastic_notebook/core/notebook/find_output_vars.py ADDED Viewed

@@ -0,0 +1,18 @@
def find_created_deleted_vars(pre_execution, post_execution):
    """
    Diff the user namespace before and after execution.

    Args:
        pre_execution (set): variable names present before the cell ran.
        post_execution (set): variable names present after the cell ran.

    Returns:
        (created, deleted): names that appeared and names that disappeared. Names starting with an
        underscore are ignored in both sets.
    """

    def public_only(names):
        return {name for name in names if not name.startswith("_")}

    created_variables = public_only(post_execution.difference(pre_execution))
    deleted_variables = public_only(pre_execution.difference(post_execution))
    return created_variables, deleted_variables