PyPI - saxpy - Versions diffs - 2.0.0__py3-none-any.whl - Mend

saxpy 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

saxpy/__init__.py +11 -0
saxpy/alphabet.py +26 -0
saxpy/discord.py +99 -0
saxpy/distance.py +19 -0
saxpy/hotsax.py +206 -0
saxpy/paa.py +78 -0
saxpy/repair.py +408 -0
saxpy/rra.py +367 -0
saxpy/sax.py +292 -0
saxpy/saxvsm.py +320 -0
saxpy/saxvsm_optimize.py +345 -0
saxpy/strfunc.py +9 -0
saxpy/util.py +34 -0
saxpy/visit_registry.py +67 -0
saxpy/znorm.py +46 -0
saxpy-2.0.0.dist-info/METADATA +380 -0
saxpy-2.0.0.dist-info/RECORD +19 -0
saxpy-2.0.0.dist-info/WHEEL +4 -0
saxpy-2.0.0.dist-info/licenses/LICENSE +339 -0

saxpy/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+"""SAX stack implementation."""
+from importlib.metadata import PackageNotFoundError, version
+# Look the package version up in the installed distribution's metadata.
+try:
+    __version__ = version("saxpy")
+except PackageNotFoundError:  # package is not installed (e.g. running from a source checkout)
+    # Placeholder used when no "saxpy" distribution metadata can be found.
+    __version__ = "0.0.0"
+__author__ = "Pavel Senin <seninp@gmail.com>"
+# An empty ``__all__`` means ``from saxpy import *`` binds no names; the
+# submodules have to be imported explicitly.
+__all__ = []

saxpy/alphabet.py ADDED Viewed

@@ -0,0 +1,26 @@
+"""Implements Alphabet cuts."""
+from functools import cache
+import numpy as np
+from scipy.stats import norm  # already depend on scipy
+@cache
+def cuts_for_asize(a_size: int) -> np.ndarray:
+    """Generate the Gaussian breakpoints for an alphabet of the given size.
+    The returned array is prefixed with ``-inf`` and has ``a_size`` entries.
+    >>> cuts_for_asize(2)
+    array([-inf,   0.])
+    >>> cuts_for_asize(3)
+    array([      -inf, -0.4307273,  0.4307273])
+    >>> len(cuts_for_asize(5))
+    5
+    """
+    if a_size < 2:
+        raise ValueError("alphabet_size must be >= 2")
+    probs = np.arange(1, a_size) / a_size
+    cuts = norm.ppf(probs)
+    return np.concatenate(([-np.inf], cuts))

saxpy/discord.py ADDED Viewed

@@ -0,0 +1,99 @@
+"""Discord discovery routines."""
+import random
+import numpy as np
+from saxpy.visit_registry import VisitRegistry
+from saxpy.znorm import znorm
+def find_discords_brute_force(
+    series, win_size, num_discords=2, znorm_threshold=0.01, random_state=None
+):
+    """Reference O(n^2) distance-based discord discovery.
+    For each candidate window the nearest-neighbour (z-normalized Euclidean)
+    distance is computed against every non-overlapping window; the discord is
+    the candidate whose nearest neighbour is farthest. Distance-tied candidates
+    are broken deterministically on the lowest index, so the result is exact and
+    reproducible.
+    ``random_state`` is accepted for backward compatibility but is now **inert**:
+    the search computes every candidate's exact nearest-neighbour distance with a
+    single vectorized pass (no random visit order, no early abandoning), so there
+    is no trajectory for a seed to influence. The returned discords are identical
+    regardless of its value.
+    """
+    rng = random_state if isinstance(random_state, random.Random) else random.Random(random_state)
+    discords = list()
+    globalRegistry = VisitRegistry(len(series) - win_size + 1, rng=rng)
+    znorms = np.array(
+        [
+            znorm(series[pos : pos + win_size], znorm_threshold)
+            for pos in range(len(series) - win_size + 1)
+        ]
+    )
+    while len(discords) < num_discords:
+        bestDiscord = find_best_discord_brute_force(series, win_size, globalRegistry, znorms)
+        if -1 == bestDiscord[0]:
+            break
+        discords.append(bestDiscord)
+        mark_start = max(0, bestDiscord[0] - win_size + 1)
+        mark_end = bestDiscord[0] + win_size
+        globalRegistry.mark_visited_range(mark_start, mark_end)
+    return discords
+def find_best_discord_brute_force(series, win_size, global_registry, znorms):
+    """Find the single best discord among the not-yet-excluded candidates.
+    The inner nearest-neighbour search is vectorized: for a candidate window,
+    ``((znorms - candidate) ** 2).sum(axis=1)`` is the squared Euclidean distance
+    to *every* window at once. Overlapping windows (within ``win_size`` of the
+    candidate) are masked out, and the minimum gives the candidate's exact NN
+    distance. This replaces the former per-neighbour Python loop over
+    ``early_abandoned_euclidean`` (which, per the audit, is ~47x slower per call
+    in pure Python than a vectorized distance unless it abandons almost
+    immediately) and matches the vectorized approach HOT-SAX already uses --
+    ~90x faster on a full ECG series, with identical discords.
+    """
+    best_so_far_distance = -1.0
+    best_so_far_index = -1
+    n_windows = len(series) - win_size + 1
+    index_arr = np.arange(n_windows)
+    outer_registry = global_registry.clone()
+    outer_idx = outer_registry.get_next_unvisited()
+    while ~np.isnan(outer_idx):
+        outer_registry.mark_visited(outer_idx)
+        candidate_seq = znorms[outer_idx]
+        # Exact NN distance to every non-overlapping window, vectorized.
+        sq_dists = ((znorms - candidate_seq) ** 2).sum(axis=1)
+        sq_dists[np.abs(index_arr - outer_idx) < win_size] = np.inf
+        nn_distance = float(np.sqrt(sq_dists.min()))
+        # Tie-break deterministically on the lowest index.
+        if nn_distance < np.inf and (
+            nn_distance > best_so_far_distance
+            or (nn_distance == best_so_far_distance and outer_idx < best_so_far_index)
+        ):
+            best_so_far_distance = nn_distance
+            best_so_far_index = outer_idx
+        outer_idx = outer_registry.get_next_unvisited()
+    return best_so_far_index, best_so_far_distance

saxpy/distance.py ADDED Viewed

@@ -0,0 +1,19 @@
+"""Distance computation."""
+import numpy as np
+def euclidean(a, b):
+    """Compute a Euclidean distance value."""
+    return np.sqrt(np.sum((a - b) ** 2))
+def early_abandoned_euclidean(a, b, upper_limit):
+    """Compute a Euclidean distance value in early abandoning fashion."""
+    lim = upper_limit * upper_limit
+    res = 0.0
+    for i in range(0, len(a)):
+        res += np.dot((a[i] - b[i]), (a[i] - b[i]))
+        if res > lim:
+            return np.nan
+    return np.sqrt(res)

saxpy/hotsax.py ADDED Viewed

@@ -0,0 +1,206 @@
+"""Implements HOT-SAX."""
+import numpy as np
+from saxpy.distance import euclidean
+from saxpy.sax import sax_via_window
+from saxpy.znorm import znorm
+def find_discords_hotsax(
+    series,
+    win_size=100,
+    num_discords=2,
+    paa_size=3,
+    alphabet_size=3,
+    znorm_threshold=0.01,
+    sax_type="unidim",
+    random_state=None,
+):
+    """HOT-SAX-driven discords discovery.
+    Argument order: ``(series, win_size, num_discords, paa_size, alphabet_size,
+    znorm_threshold, sax_type)`` -- ``paa_size`` precedes ``alphabet_size`` to
+    match ``sax_via_window`` (changed in 2.0.0; the two were previously
+    reversed, a silent footgun for positional callers).
+    The random-search phase shuffles candidate visit order (an early-abandoning
+    speed heuristic), but the returned discords are reproducible: a candidate's
+    nearest-neighbour distance is order-independent, and exact-distance ties are
+    broken deterministically on the lowest index. Results therefore do not
+    depend on the RNG.
+    ``random_state`` makes the search *trajectory* reproducible too: pass an int
+    (or a ``numpy.random.Generator``) to seed the shuffle, so the distance-call
+    count is deterministic run-to-run. The default ``None`` keeps the historical
+    unseeded behavior (a fresh, independently-seeded generator each call; the
+    visit order, and only the visit order, is nondeterministic). The returned
+    discords are identical either way.
+    """
+    rng = (
+        random_state
+        if isinstance(random_state, np.random.Generator)
+        else np.random.default_rng(random_state)
+    )
+    discords = list()
+    global_registry = set()
+    # Z-normalized versions for every subsequence.
+    znorms = np.array(
+        [
+            znorm(series[pos : pos + win_size], znorm_threshold)
+            for pos in range(len(series) - win_size + 1)
+        ]
+    )
+    # SAX words for every subsequence.
+    sax_data = sax_via_window(
+        series,
+        win_size=win_size,
+        paa_size=paa_size,
+        alphabet_size=alphabet_size,
+        nr_strategy=None,
+        znorm_threshold=znorm_threshold,
+        sax_type=sax_type,
+    )
+    """[2.0] build the 'magic' array"""
+    magic_array = list()
+    for k, v in sax_data.items():
+        magic_array.append((k, len(v)))
+    """[2.1] sort it ascending by the number of occurrences"""
+    magic_array = sorted(magic_array, key=lambda tup: tup[1])
+    while len(discords) < num_discords:
+        best_discord = find_best_discord_hotsax(
+            series, win_size, global_registry, sax_data, magic_array, znorms, rng
+        )
+        if -1 == best_discord[0]:
+            break
+        discords.append(best_discord)
+        mark_start = max(0, best_discord[0] - win_size + 1)
+        mark_end = best_discord[0] + win_size
+        for i in range(mark_start, mark_end):
+            global_registry.add(i)
+    return discords
+def find_best_discord_hotsax(
+    series, win_size, global_registry, sax_data, magic_array, znorms, rng=None
+):
+    """Find the best discord with hotsax.
+    Candidates are visited word by word in the order given by ``magic_array``.
+    For each candidate position the nearest-neighbour distance is searched
+    first among the other occurrences of the same SAX word, then among all
+    remaining windows in a shuffled order. Either search stops early as soon
+    as a neighbour closer than the best discord distance found so far turns
+    up.
+    Parameters:
+    ``series`` -- the input series; only its length is used here.
+    ``win_size`` -- sliding window length; windows closer than this to the
+    candidate are never used as neighbours.
+    ``global_registry`` -- container of positions to skip as candidates
+    (tested with ``in``).
+    ``sax_data`` -- mapping from a SAX word to the positions it occurs at.
+    ``magic_array`` -- iterable of entries whose first item is a SAX word.
+    ``znorms`` -- z-normalized windows, indexed by window start position.
+    ``rng`` is an optional ``numpy.random.Generator`` for the random-search
+    shuffle; ``None`` falls back to the global ``numpy.random`` (historical
+    behavior).
+    Returns ``(position, nn_distance)``; ``(-1, 0.0)`` when no candidate was
+    selected.
+    """
+    if rng is None:
+        rng = np.random
+    """[3.0] define the key vars"""
+    best_so_far_position = -1
+    # NOTE(review): starts at 0.0, whereas find_best_discord_brute_force starts
+    # at -1.0. A candidate whose NN distance is exactly 0.0 cannot pass the
+    # selection test at [17.0] while the position is still -1 -- confirm that
+    # this difference is intended.
+    best_so_far_distance = 0.0
+    # NOTE(review): incremented below but never read or returned here.
+    distance_calls = 0
+    # Scratch buffer for the random-search visit order, reused per candidate.
+    visit_array = np.zeros(len(series), dtype=int)
+    """[4.0] and we are off iterating over the magic array entries"""
+    for entry in magic_array:
+        """[5.0] current SAX words and the number of other sequences mapping to the same SAX word."""
+        curr_word = entry[0]
+        occurrences = sax_data[curr_word]
+        """[6.0] jumping around by the same word occurrences makes it easier to
+        nail down the possibly small distance value -- so we can be efficient
+        and all that..."""
+        for curr_pos in occurrences:
+            # Positions excluded by the caller are not candidates.
+            if curr_pos in global_registry:
+                continue
+            """[7.0] we don't want an overlapping subsequence"""
+            mark_start = max(0, curr_pos - win_size + 1)
+            mark_end = curr_pos + win_size
+            # Starts out holding the overlapping positions (candidate included);
+            # grows with every neighbour position already compared.
+            visit_set = set(range(mark_start, mark_end))
+            """[8.0] here is our subsequence in question"""
+            cur_seq = znorms[curr_pos]
+            """[9.0] let's see what is NN distance"""
+            nn_dist = np.inf
+            do_random_search = True
+            """[10.0] ordered by occurrences search first"""
+            for next_pos in occurrences:
+                """[11.0] skip bad pos"""
+                if next_pos in visit_set:
+                    continue
+                else:
+                    visit_set.add(next_pos)
+                """[12.0] distance we compute"""
+                # NB: keep the vectorized euclidean here, NOT the element-wise
+                # early_abandoned_euclidean (audit #17). Its abandoning is a
+                # pure-Python per-element np.dot loop, ~47x slower per call than
+                # np.sqrt(np.sum(...)) on these win_size windows; unless it
+                # abandons within the first ~2 elements it loses badly, and
+                # swapping it in made HOT-SAX ~3x slower overall. Measured, not
+                # assumed.
+                dist = euclidean(cur_seq, znorms[next_pos])
+                distance_calls += 1
+                """[13.0] keep the books up-to-date"""
+                if dist < nn_dist:
+                    nn_dist = dist
+                # A neighbour closer than the current best discord distance
+                # means this candidate cannot win: stop and skip random search.
+                if dist < best_so_far_distance:
+                    do_random_search = False
+                    break
+            """[13.0] if not broken above,
+            we shall proceed with random search"""
+            if do_random_search:
+                """[14.0] build that random visit order array"""
+                # Collect every window start not yet in visit_set.
+                curr_idx = 0
+                for i in range(0, (len(series) - win_size + 1)):
+                    if i not in visit_set:
+                        visit_array[curr_idx] = i
+                        curr_idx += 1
+                # Shuffle only the filled prefix of the scratch buffer.
+                it_order = rng.permutation(visit_array[0:curr_idx])
+                curr_idx -= 1
+                """[15.0] and go random"""
+                # Walk the shuffled order from its last element to its first.
+                while curr_idx >= 0:
+                    rand_pos = it_order[curr_idx]
+                    curr_idx -= 1
+                    # Vectorized euclidean, not early_abandoned_euclidean -- see
+                    # the note in the occurrences loop above (audit #17).
+                    dist = euclidean(cur_seq, znorms[rand_pos])
+                    distance_calls += 1
+                    """[16.0] keep the books up-to-date again"""
+                    if dist < nn_dist:
+                        nn_dist = dist
+                    if dist < best_so_far_distance:
+                        # NOTE(review): this assignment looks redundant -- the
+                        # check just above appears to have stored the same
+                        # value already; confirm before removing.
+                        nn_dist = dist
+                        break
+            """[17.0] and BIGGER books -- tie-break on the lowest position so
+            the result matches find_discords_brute_force and never depends on
+            visit order"""
+            # Candidates left without any compared neighbour (nn_dist still
+            # inf) are never selected.
+            if nn_dist < np.inf and (
+                nn_dist > best_so_far_distance
+                or (nn_dist == best_so_far_distance and curr_pos < best_so_far_position)
+            ):
+                best_so_far_distance = nn_dist
+                best_so_far_position = curr_pos
+    return best_so_far_position, best_so_far_distance

saxpy/paa.py ADDED Viewed

@@ -0,0 +1,78 @@
+"""Implements PAA."""
+import numpy as np
+def paa(series, paa_segment_size, sax_type="unidim"):
+    """PAA implementation.
+    >>> paa([1, 2, 3], 3, 'unidim')
+    array([1., 2., 3.])
+    >>> paa([1, 2, 3], 1, 'unidim')
+    array([2.])
+    >>> paa([4, 3, 8, 5], 1, 'unidim')
+    array([5.])
+    >>> paa([[1, 2, 3], [6, 5, 4]], 1, 'repeat')
+    array([[3.5, 3.5, 3.5]])
+    >>> paa([[1, 2, 3], [6, 5, 4]], 2, 'repeat')
+    array([[1., 2., 3.],
+           [6., 5., 4.]])
+    """
+    series = np.array(series)
+    series_len = series.shape[0]
+    # PAA reduces a series to fewer segments by averaging; reject inputs that
+    # make that ill-defined instead of failing cryptically (ZeroDivisionError /
+    # all-NaN) or silently up-sampling.
+    if paa_segment_size < 1:
+        raise ValueError("PAA segment size must be a positive integer.")
+    if series_len == 0:
+        raise ValueError("Cannot run PAA on an empty series.")
+    if paa_segment_size > series_len:
+        raise ValueError(
+            "PAA segment size cannot exceed the series length; "
+            "PAA reduces a series, it does not up-sample it."
+        )
+    if sax_type in ["repeat", "energy"]:
+        num_dims = series.shape[1]
+    else:
+        num_dims = 1
+        is_multidimensional = (len(series.shape) > 1) and (series.shape[1] > 1)
+        if is_multidimensional:
+            # A 1-D sax_type collapses to a single column, so a genuinely
+            # multi-column array would silently drop every column but the first.
+            # Reject it instead -- multi-dimensional input belongs to the
+            # 'repeat', 'energy', or 'independent' modes.
+            raise ValueError(
+                f"sax_type={sax_type!r} expects a 1-D series, but got a "
+                f"{series.shape[1]}-column array; use 'repeat', 'energy', or "
+                "'independent' for multi-dimensional input."
+            )
+        series = series.reshape(series.shape[0], 1)
+    res = np.zeros((num_dims, paa_segment_size))
+    for dim in range(num_dims):
+        column = series[:, dim]
+        # PAA by averaging. These are the vectorized form of the original
+        # element-wise ``np.add.at`` scatter loops -- same arithmetic and
+        # summation order, but orders of magnitude faster on long series.
+        if series_len % paa_segment_size == 0:
+            # Evenly divisible: average contiguous blocks of ``inc`` points.
+            inc = series_len // paa_segment_size
+            res[dim] = column.reshape(paa_segment_size, inc).mean(axis=1)
+        else:
+            # Otherwise the classic expand-by-paa_size / contract-by-series_len
+            # construction, so segment boundaries can fall between samples.
+            res[dim] = (
+                np.repeat(column, paa_segment_size)
+                .reshape(paa_segment_size, series_len)
+                .mean(axis=1)
+            )
+    if sax_type in ["repeat", "energy"]:
+        return res.T
+    else:
+        return res.flatten()