saxpy 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
saxpy/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ """SAX stack implementation."""
2
+
3
+ from importlib.metadata import PackageNotFoundError, version
4
+
5
# Static package metadata.
__author__ = "Pavel Senin <seninp@gmail.com>"
__all__ = []

# The version is read from the installed distribution's metadata. When that
# metadata is missing (e.g. running straight from a source checkout) the lookup
# raises PackageNotFoundError, and a placeholder version is used instead.
try:
    __version__ = version("saxpy")
except PackageNotFoundError:
    __version__ = "0.0.0"
saxpy/alphabet.py ADDED
@@ -0,0 +1,26 @@
1
+ """Implements Alphabet cuts."""
2
+
3
+ from functools import cache
4
+
5
+ import numpy as np
6
+ from scipy.stats import norm # already depend on scipy
7
+
8
+
9
@cache
def _gaussian_cuts(a_size: int) -> np.ndarray:
    """Compute the breakpoint array for ``a_size`` once and memoize it.

    The memoized array is private to this helper; callers receive copies via
    :func:`cuts_for_asize` so the cached value can never be modified.
    """
    # Equiprobable regions under N(0, 1): quantiles at 1/a, 2/a, ..., (a-1)/a.
    probs = np.arange(1, a_size) / a_size
    return np.concatenate(([-np.inf], norm.ppf(probs)))


def cuts_for_asize(a_size: int) -> np.ndarray:
    """Generate the Gaussian breakpoints for an alphabet of the given size.

    The returned array is prefixed with ``-inf`` and has ``a_size`` entries.
    The quantile computation is memoized per ``a_size``, but every call returns
    a fresh copy: handing out the cached array itself would let one caller's
    in-place edit silently change the breakpoints seen by all later callers.

    Args:
        a_size: Alphabet size; must be at least 2.

    Returns:
        A new 1-D array ``[-inf, c_1, ..., c_{a_size - 1}]`` of breakpoints.

    Raises:
        ValueError: If ``a_size`` is smaller than 2.

    >>> cuts_for_asize(2).tolist()
    [-inf, 0.0]
    >>> [round(c, 7) for c in cuts_for_asize(3).tolist()]
    [-inf, -0.4307273, 0.4307273]
    >>> len(cuts_for_asize(5))
    5
    """
    if a_size < 2:
        raise ValueError("alphabet_size must be >= 2")
    return _gaussian_cuts(a_size).copy()
saxpy/discord.py ADDED
@@ -0,0 +1,99 @@
1
+ """Discord discovery routines."""
2
+
3
+ import random
4
+
5
+ import numpy as np
6
+
7
+ from saxpy.visit_registry import VisitRegistry
8
+ from saxpy.znorm import znorm
9
+
10
+
11
def find_discords_brute_force(
    series, win_size, num_discords=2, znorm_threshold=0.01, random_state=None
):
    """Exhaustive, distance-based discord discovery (reference implementation).

    Every sliding window of length ``win_size`` is z-normalized up front. The
    best remaining discord is then requested repeatedly from
    ``find_best_discord_brute_force`` until ``num_discords`` have been found or
    no candidate is left. After each hit, all window positions overlapping the
    discord are marked visited in the registry so they are not reported again.

    Args:
        series: The input sequence; sliced as ``series[pos : pos + win_size]``.
        win_size: Sliding window length.
        num_discords: Maximum number of discords to return.
        znorm_threshold: Forwarded to ``znorm`` for every window.
        random_state: A ``random.Random`` instance, or a seed used to build
            one. It is only forwarded to the ``VisitRegistry``. The selection
            in ``find_best_discord_brute_force`` computes each candidate's exact
            nearest-neighbour distance and breaks ties on the lowest index, so
            the returned discords are not expected to depend on this value.

    Returns:
        A list of ``(position, nn_distance)`` tuples in discovery order; it may
        hold fewer than ``num_discords`` entries.
    """
    if isinstance(random_state, random.Random):
        rng = random_state
    else:
        rng = random.Random(random_state)

    n_windows = len(series) - win_size + 1

    # Registry of window positions that may no longer be reported.
    excluded = VisitRegistry(n_windows, rng=rng)

    # One z-normalized row per sliding window.
    windows = np.array(
        [
            znorm(series[start : start + win_size], znorm_threshold)
            for start in range(n_windows)
        ]
    )

    found = []
    while len(found) < num_discords:
        candidate = find_best_discord_brute_force(series, win_size, excluded, windows)
        position = candidate[0]

        # A position of -1 signals that no further discord could be found.
        if position == -1:
            break

        found.append(candidate)

        # Exclude every window that overlaps the discord just reported.
        excluded.mark_visited_range(max(0, position - win_size + 1), position + win_size)

    return found
54
+
55
+
56
def find_best_discord_brute_force(series, win_size, global_registry, znorms):
    """Pick the single best discord among the candidates not yet excluded.

    The registry is cloned, so ``global_registry`` itself is left untouched.
    Each unvisited position is taken in turn; its squared Euclidean distance to
    every row of ``znorms`` is computed in one vectorized expression, windows
    closer than ``win_size`` positions are masked with ``inf``, and the minimum
    of what remains is the candidate's nearest-neighbour distance. The
    candidate with the largest such distance wins; equal distances are resolved
    in favour of the lower index.

    Args:
        series: The original sequence; only its length is used here.
        win_size: Sliding window length (also the overlap exclusion radius).
        global_registry: Object providing ``clone()``; the clone must provide
            ``get_next_unvisited()`` (returning NaN when exhausted) and
            ``mark_visited(idx)``.
        znorms: 2-D array with one z-normalized window per row.

    Returns:
        ``(index, nn_distance)``, or ``(-1, -1.0)`` when no candidate has a
        finite nearest-neighbour distance.
    """
    window_ids = np.arange(len(series) - win_size + 1)

    top_index = -1
    top_distance = -1.0

    pending = global_registry.clone()
    candidate = pending.get_next_unvisited()

    while not np.isnan(candidate):
        pending.mark_visited(candidate)

        # Squared distance from the candidate to every window at once.
        deltas = znorms - znorms[candidate]
        squared = (deltas ** 2).sum(axis=1)

        # Windows overlapping the candidate are not valid neighbours.
        squared[np.abs(window_ids - candidate) < win_size] = np.inf
        nearest = float(np.sqrt(squared.min()))

        if nearest < np.inf:
            strictly_better = nearest > top_distance
            # Deterministic tie-break: prefer the lowest index.
            tie_with_lower_index = nearest == top_distance and candidate < top_index
            if strictly_better or tie_with_lower_index:
                top_distance = nearest
                top_index = candidate

        candidate = pending.get_next_unvisited()

    return top_index, top_distance
saxpy/distance.py ADDED
@@ -0,0 +1,19 @@
1
+ """Distance computation."""
2
+
3
+ import numpy as np
4
+
5
+
6
def euclidean(a, b):
    """Return the Euclidean distance between ``a`` and ``b``.

    The operands must support element-wise subtraction (``a - b``); the result
    is the square root of the sum of the squared differences.
    """
    delta = a - b
    sum_of_squares = np.sum(delta ** 2)
    return np.sqrt(sum_of_squares)
9
+
10
+
11
def early_abandoned_euclidean(a, b, upper_limit):
    """Return the Euclidean distance, giving up once it exceeds ``upper_limit``.

    Squared differences are accumulated position by position over ``len(a)``
    elements. As soon as the running total is strictly greater than
    ``upper_limit ** 2`` the computation stops and ``np.nan`` is returned;
    otherwise the square root of the total is returned.
    """
    squared_budget = upper_limit * upper_limit
    running_total = 0.0
    for i in range(len(a)):
        gap = a[i] - b[i]
        running_total += np.dot(gap, gap)
        if running_total > squared_budget:
            # The distance already exceeds the limit; no need to finish.
            return np.nan
    return np.sqrt(running_total)
saxpy/hotsax.py ADDED
@@ -0,0 +1,206 @@
1
+ """Implements HOT-SAX."""
2
+
3
+ import numpy as np
4
+
5
+ from saxpy.distance import euclidean
6
+ from saxpy.sax import sax_via_window
7
+ from saxpy.znorm import znorm
8
+
9
+
10
def find_discords_hotsax(
    series,
    win_size=100,
    num_discords=2,
    paa_size=3,
    alphabet_size=3,
    znorm_threshold=0.01,
    sax_type="unidim",
    random_state=None,
):
    """Discover discords with the HOT-SAX heuristic.

    Argument order is ``(series, win_size, num_discords, paa_size,
    alphabet_size, znorm_threshold, sax_type)``: ``paa_size`` comes before
    ``alphabet_size``, matching ``sax_via_window`` (changed in 2.0.0; the two
    used to be reversed, which silently bit positional callers).

    Every sliding window is z-normalized and converted to a SAX word. Words are
    ordered from rarest to most frequent, and ``find_best_discord_hotsax`` is
    called repeatedly until ``num_discords`` discords are found or no candidate
    remains. After each hit, all positions overlapping the discord are added to
    the exclusion set.

    Args:
        series: The input sequence.
        win_size: Sliding window length.
        num_discords: Maximum number of discords to return.
        paa_size: PAA size forwarded to ``sax_via_window``.
        alphabet_size: Alphabet size forwarded to ``sax_via_window``.
        znorm_threshold: Forwarded to ``znorm`` and ``sax_via_window``.
        sax_type: Forwarded to ``sax_via_window``.
        random_state: A ``numpy.random.Generator``, or a seed for
            ``numpy.random.default_rng``. It drives the shuffle of the
            random-search visit order; ``None`` yields a freshly seeded
            generator on every call. Nearest-neighbour distances do not depend
            on visit order and ties are broken on the lowest index, so the
            returned discords are the same either way.

    Returns:
        A list of ``(position, nn_distance)`` tuples in discovery order; it may
        hold fewer than ``num_discords`` entries.
    """
    if isinstance(random_state, np.random.Generator):
        rng = random_state
    else:
        rng = np.random.default_rng(random_state)

    # Z-normalized versions of every subsequence.
    n_windows = len(series) - win_size + 1
    znorms = np.array(
        [
            znorm(series[start : start + win_size], znorm_threshold)
            for start in range(n_windows)
        ]
    )

    # SAX words of every subsequence (no numerosity reduction).
    sax_data = sax_via_window(
        series,
        win_size=win_size,
        paa_size=paa_size,
        alphabet_size=alphabet_size,
        nr_strategy=None,
        znorm_threshold=znorm_threshold,
        sax_type=sax_type,
    )

    # The 'magic' array: (word, occurrence count) pairs, rarest words first.
    magic_array = sorted(
        ((word, len(positions)) for word, positions in sax_data.items()),
        key=lambda pair: pair[1],
    )

    excluded_positions = set()
    discords = []

    while len(discords) < num_discords:
        best_discord = find_best_discord_hotsax(
            series, win_size, excluded_positions, sax_data, magic_array, znorms, rng
        )
        position = best_discord[0]

        # A position of -1 signals that no further discord could be found.
        if position == -1:
            break

        discords.append(best_discord)

        # Exclude every window that overlaps the discord just reported.
        excluded_positions.update(range(max(0, position - win_size + 1), position + win_size))

    return discords
93
+
94
+
95
def find_best_discord_hotsax(
    series, win_size, global_registry, sax_data, magic_array, znorms, rng=None
):
    """Find the best discord with the HOT-SAX search order.

    Candidates are visited word by word in the order given by ``magic_array``.
    For each candidate the nearest-neighbour (NN) distance is searched first
    among positions sharing its SAX word, then among all remaining
    non-overlapping positions in shuffled order. Either search stops early as
    soon as a neighbour closer than the best discord distance so far is found,
    because such a candidate can no longer win.

    Args:
        series: The original sequence; only its length is used here.
        win_size: Sliding window length (also the overlap exclusion radius).
        global_registry: Container of positions that must not be reported;
            queried with ``in``.
        sax_data: Mapping from SAX word to the positions where it occurs.
        magic_array: Sequence of entries whose first element is a SAX word.
        znorms: Array with one z-normalized window per row.
        rng: Optional ``numpy.random.Generator`` used for the random-search
            shuffle; ``None`` falls back to the global ``numpy.random``
            (historical behavior).

    Returns:
        ``(position, nn_distance)``, or ``(-1, 0.0)`` when no candidate
        qualifies.
    """
    shuffler = np.random if rng is None else rng

    # [3.0] the key variables
    top_position = -1
    top_distance = 0.0

    # Number of distance evaluations; bookkeeping only, it is never returned.
    distance_calls = 0

    n_windows = len(series) - win_size + 1

    # [4.0] iterate over the magic array entries
    for entry in magic_array:
        # [5.0] positions of all sequences mapping to the current SAX word
        same_word_positions = sax_data[entry[0]]

        # [6.0] neighbours sharing the word are likely to be close, so they are
        # the quickest way to pin down a small distance
        for curr_pos in same_word_positions:
            if curr_pos in global_registry:
                continue

            # [7.0] overlapping subsequences are not valid neighbours
            skip = set(range(max(0, curr_pos - win_size + 1), curr_pos + win_size))

            # [8.0] the subsequence in question
            cur_seq = znorms[curr_pos]

            # [9.0] its nearest-neighbour distance, unknown so far
            nn_dist = np.inf

            # [10.0] same-word neighbours first
            for next_pos in same_word_positions:
                # [11.0] skip overlapping / already compared positions
                if next_pos in skip:
                    continue
                skip.add(next_pos)

                # [12.0] NB: keep the vectorized euclidean here, NOT the
                # element-wise early_abandoned_euclidean (audit #17). Its
                # abandoning is a pure-Python per-element np.dot loop, ~47x
                # slower per call than np.sqrt(np.sum(...)) on these win_size
                # windows; unless it abandons within the first ~2 elements it
                # loses badly, and swapping it in made HOT-SAX ~3x slower
                # overall. Measured, not assumed.
                dist = euclidean(cur_seq, znorms[next_pos])
                distance_calls += 1

                # [13.0] keep the books up-to-date
                if dist < nn_dist:
                    nn_dist = dist
                if dist < top_distance:
                    # This candidate cannot beat the current best; abandon it
                    # and skip the random search (the else clause below).
                    break
            else:
                # [14.0] not abandoned: shuffle every position that has not
                # been compared yet
                remaining = np.fromiter(
                    (pos for pos in range(n_windows) if pos not in skip), dtype=int
                )
                visit_order = shuffler.permutation(remaining)

                # [15.0] walk the shuffled order from its last element to its
                # first
                for rand_pos in visit_order[::-1]:
                    # Vectorized euclidean for the same reason as in the
                    # same-word loop (audit #17).
                    dist = euclidean(cur_seq, znorms[rand_pos])
                    distance_calls += 1

                    # [16.0] keep the books up-to-date again
                    if dist < nn_dist:
                        nn_dist = dist
                    if dist < top_distance:
                        break

            # [17.0] update the overall best; ties go to the lowest position so
            # the outcome never depends on visit order
            if nn_dist < np.inf:
                strictly_better = nn_dist > top_distance
                tie_with_lower_position = nn_dist == top_distance and curr_pos < top_position
                if strictly_better or tie_with_lower_position:
                    top_distance = nn_dist
                    top_position = curr_pos

    return top_position, top_distance
saxpy/paa.py ADDED
@@ -0,0 +1,78 @@
1
+ """Implements PAA."""
2
+
3
+ import numpy as np
4
+
5
+
6
def paa(series, paa_segment_size, sax_type="unidim"):
    """Reduce a series to ``paa_segment_size`` segment means (PAA).

    For ``sax_type`` ``'repeat'`` or ``'energy'`` the input is treated as a
    2-D array with one column per dimension and the result has shape
    ``(paa_segment_size, n_columns)``. For any other ``sax_type`` the input must
    be 1-D (or a single column) and a flat array of length
    ``paa_segment_size`` is returned.

    Raises:
        ValueError: If ``paa_segment_size`` is below 1, the series is empty,
            ``paa_segment_size`` exceeds the series length, or a multi-column
            array is passed with a 1-D ``sax_type``.

    >>> paa([1, 2, 3], 3, 'unidim')
    array([1., 2., 3.])
    >>> paa([1, 2, 3], 1, 'unidim')
    array([2.])
    >>> paa([4, 3, 8, 5], 1, 'unidim')
    array([5.])
    >>> paa([[1, 2, 3], [6, 5, 4]], 1, 'repeat')
    array([[3.5, 3.5, 3.5]])
    >>> paa([[1, 2, 3], [6, 5, 4]], 2, 'repeat')
    array([[1., 2., 3.],
           [6., 5., 4.]])
    """
    series = np.array(series)
    series_len = series.shape[0]

    # Refuse inputs for which averaging is ill-defined, rather than failing
    # cryptically later (ZeroDivisionError / all-NaN) or silently up-sampling.
    if paa_segment_size < 1:
        raise ValueError("PAA segment size must be a positive integer.")
    if series_len == 0:
        raise ValueError("Cannot run PAA on an empty series.")
    if paa_segment_size > series_len:
        raise ValueError(
            "PAA segment size cannot exceed the series length; "
            "PAA reduces a series, it does not up-sample it."
        )

    multi_column_mode = sax_type in ("repeat", "energy")
    if multi_column_mode:
        num_dims = series.shape[1]
    else:
        # A 1-D sax_type works on a single column. Silently keeping only the
        # first column of a genuinely multi-column array would lose data, so
        # such input is rejected.
        if len(series.shape) > 1 and series.shape[1] > 1:
            raise ValueError(
                f"sax_type={sax_type!r} expects a 1-D series, but got a "
                f"{series.shape[1]}-column array; use 'repeat', 'energy', or "
                "'independent' for multi-dimensional input."
            )
        num_dims = 1
        series = series.reshape(series.shape[0], 1)

    divides_evenly = series_len % paa_segment_size == 0
    res = np.empty((num_dims, paa_segment_size))

    for dim in range(num_dims):
        column = series[:, dim]
        if divides_evenly:
            # Average contiguous blocks of equal length.
            block_len = series_len // paa_segment_size
            segments = column.reshape(paa_segment_size, block_len)
        else:
            # Repeat every sample paa_segment_size times, then cut the result
            # into paa_segment_size rows of series_len values each, so segment
            # boundaries may fall between the original samples.
            segments = np.repeat(column, paa_segment_size).reshape(
                paa_segment_size, series_len
            )
        res[dim] = segments.mean(axis=1)

    return res.T if multi_column_mode else res.flatten()