exonware-xwnode 0.0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- exonware/__init__.py +14 -0
- exonware/xwnode/__init__.py +127 -0
- exonware/xwnode/base.py +676 -0
- exonware/xwnode/config.py +178 -0
- exonware/xwnode/contracts.py +730 -0
- exonware/xwnode/errors.py +503 -0
- exonware/xwnode/facade.py +460 -0
- exonware/xwnode/strategies/__init__.py +158 -0
- exonware/xwnode/strategies/advisor.py +463 -0
- exonware/xwnode/strategies/edges/__init__.py +32 -0
- exonware/xwnode/strategies/edges/adj_list.py +227 -0
- exonware/xwnode/strategies/edges/adj_matrix.py +391 -0
- exonware/xwnode/strategies/edges/base.py +169 -0
- exonware/xwnode/strategies/flyweight.py +328 -0
- exonware/xwnode/strategies/impls/__init__.py +13 -0
- exonware/xwnode/strategies/impls/_base_edge.py +403 -0
- exonware/xwnode/strategies/impls/_base_node.py +307 -0
- exonware/xwnode/strategies/impls/edge_adj_list.py +353 -0
- exonware/xwnode/strategies/impls/edge_adj_matrix.py +445 -0
- exonware/xwnode/strategies/impls/edge_bidir_wrapper.py +455 -0
- exonware/xwnode/strategies/impls/edge_block_adj_matrix.py +539 -0
- exonware/xwnode/strategies/impls/edge_coo.py +533 -0
- exonware/xwnode/strategies/impls/edge_csc.py +447 -0
- exonware/xwnode/strategies/impls/edge_csr.py +492 -0
- exonware/xwnode/strategies/impls/edge_dynamic_adj_list.py +503 -0
- exonware/xwnode/strategies/impls/edge_flow_network.py +555 -0
- exonware/xwnode/strategies/impls/edge_hyperedge_set.py +516 -0
- exonware/xwnode/strategies/impls/edge_neural_graph.py +650 -0
- exonware/xwnode/strategies/impls/edge_octree.py +574 -0
- exonware/xwnode/strategies/impls/edge_property_store.py +655 -0
- exonware/xwnode/strategies/impls/edge_quadtree.py +519 -0
- exonware/xwnode/strategies/impls/edge_rtree.py +820 -0
- exonware/xwnode/strategies/impls/edge_temporal_edgeset.py +558 -0
- exonware/xwnode/strategies/impls/edge_tree_graph_basic.py +271 -0
- exonware/xwnode/strategies/impls/edge_weighted_graph.py +411 -0
- exonware/xwnode/strategies/manager.py +775 -0
- exonware/xwnode/strategies/metrics.py +538 -0
- exonware/xwnode/strategies/migration.py +432 -0
- exonware/xwnode/strategies/nodes/__init__.py +50 -0
- exonware/xwnode/strategies/nodes/_base_node.py +307 -0
- exonware/xwnode/strategies/nodes/adjacency_list.py +267 -0
- exonware/xwnode/strategies/nodes/aho_corasick.py +345 -0
- exonware/xwnode/strategies/nodes/array_list.py +209 -0
- exonware/xwnode/strategies/nodes/base.py +247 -0
- exonware/xwnode/strategies/nodes/deque.py +200 -0
- exonware/xwnode/strategies/nodes/hash_map.py +135 -0
- exonware/xwnode/strategies/nodes/heap.py +307 -0
- exonware/xwnode/strategies/nodes/linked_list.py +232 -0
- exonware/xwnode/strategies/nodes/node_aho_corasick.py +520 -0
- exonware/xwnode/strategies/nodes/node_array_list.py +175 -0
- exonware/xwnode/strategies/nodes/node_avl_tree.py +371 -0
- exonware/xwnode/strategies/nodes/node_b_plus_tree.py +542 -0
- exonware/xwnode/strategies/nodes/node_bitmap.py +420 -0
- exonware/xwnode/strategies/nodes/node_bitset_dynamic.py +513 -0
- exonware/xwnode/strategies/nodes/node_bloom_filter.py +347 -0
- exonware/xwnode/strategies/nodes/node_btree.py +357 -0
- exonware/xwnode/strategies/nodes/node_count_min_sketch.py +470 -0
- exonware/xwnode/strategies/nodes/node_cow_tree.py +473 -0
- exonware/xwnode/strategies/nodes/node_cuckoo_hash.py +392 -0
- exonware/xwnode/strategies/nodes/node_fenwick_tree.py +301 -0
- exonware/xwnode/strategies/nodes/node_hash_map.py +269 -0
- exonware/xwnode/strategies/nodes/node_heap.py +191 -0
- exonware/xwnode/strategies/nodes/node_hyperloglog.py +407 -0
- exonware/xwnode/strategies/nodes/node_linked_list.py +409 -0
- exonware/xwnode/strategies/nodes/node_lsm_tree.py +400 -0
- exonware/xwnode/strategies/nodes/node_ordered_map.py +390 -0
- exonware/xwnode/strategies/nodes/node_ordered_map_balanced.py +565 -0
- exonware/xwnode/strategies/nodes/node_patricia.py +512 -0
- exonware/xwnode/strategies/nodes/node_persistent_tree.py +378 -0
- exonware/xwnode/strategies/nodes/node_radix_trie.py +452 -0
- exonware/xwnode/strategies/nodes/node_red_black_tree.py +497 -0
- exonware/xwnode/strategies/nodes/node_roaring_bitmap.py +570 -0
- exonware/xwnode/strategies/nodes/node_segment_tree.py +289 -0
- exonware/xwnode/strategies/nodes/node_set_hash.py +354 -0
- exonware/xwnode/strategies/nodes/node_set_tree.py +480 -0
- exonware/xwnode/strategies/nodes/node_skip_list.py +316 -0
- exonware/xwnode/strategies/nodes/node_splay_tree.py +393 -0
- exonware/xwnode/strategies/nodes/node_suffix_array.py +487 -0
- exonware/xwnode/strategies/nodes/node_treap.py +387 -0
- exonware/xwnode/strategies/nodes/node_tree_graph_hybrid.py +1434 -0
- exonware/xwnode/strategies/nodes/node_trie.py +252 -0
- exonware/xwnode/strategies/nodes/node_union_find.py +187 -0
- exonware/xwnode/strategies/nodes/node_xdata_optimized.py +369 -0
- exonware/xwnode/strategies/nodes/priority_queue.py +209 -0
- exonware/xwnode/strategies/nodes/queue.py +161 -0
- exonware/xwnode/strategies/nodes/sparse_matrix.py +206 -0
- exonware/xwnode/strategies/nodes/stack.py +152 -0
- exonware/xwnode/strategies/nodes/trie.py +274 -0
- exonware/xwnode/strategies/nodes/union_find.py +283 -0
- exonware/xwnode/strategies/pattern_detector.py +603 -0
- exonware/xwnode/strategies/performance_monitor.py +487 -0
- exonware/xwnode/strategies/queries/__init__.py +24 -0
- exonware/xwnode/strategies/queries/base.py +236 -0
- exonware/xwnode/strategies/queries/cql.py +201 -0
- exonware/xwnode/strategies/queries/cypher.py +181 -0
- exonware/xwnode/strategies/queries/datalog.py +70 -0
- exonware/xwnode/strategies/queries/elastic_dsl.py +70 -0
- exonware/xwnode/strategies/queries/eql.py +70 -0
- exonware/xwnode/strategies/queries/flux.py +70 -0
- exonware/xwnode/strategies/queries/gql.py +70 -0
- exonware/xwnode/strategies/queries/graphql.py +240 -0
- exonware/xwnode/strategies/queries/gremlin.py +181 -0
- exonware/xwnode/strategies/queries/hiveql.py +214 -0
- exonware/xwnode/strategies/queries/hql.py +70 -0
- exonware/xwnode/strategies/queries/jmespath.py +219 -0
- exonware/xwnode/strategies/queries/jq.py +66 -0
- exonware/xwnode/strategies/queries/json_query.py +66 -0
- exonware/xwnode/strategies/queries/jsoniq.py +248 -0
- exonware/xwnode/strategies/queries/kql.py +70 -0
- exonware/xwnode/strategies/queries/linq.py +238 -0
- exonware/xwnode/strategies/queries/logql.py +70 -0
- exonware/xwnode/strategies/queries/mql.py +68 -0
- exonware/xwnode/strategies/queries/n1ql.py +210 -0
- exonware/xwnode/strategies/queries/partiql.py +70 -0
- exonware/xwnode/strategies/queries/pig.py +215 -0
- exonware/xwnode/strategies/queries/promql.py +70 -0
- exonware/xwnode/strategies/queries/sparql.py +220 -0
- exonware/xwnode/strategies/queries/sql.py +275 -0
- exonware/xwnode/strategies/queries/xml_query.py +66 -0
- exonware/xwnode/strategies/queries/xpath.py +223 -0
- exonware/xwnode/strategies/queries/xquery.py +258 -0
- exonware/xwnode/strategies/queries/xwnode_executor.py +332 -0
- exonware/xwnode/strategies/queries/xwquery_strategy.py +424 -0
- exonware/xwnode/strategies/registry.py +604 -0
- exonware/xwnode/strategies/simple.py +273 -0
- exonware/xwnode/strategies/utils.py +532 -0
- exonware/xwnode/types.py +912 -0
- exonware/xwnode/version.py +78 -0
- exonware_xwnode-0.0.1.12.dist-info/METADATA +169 -0
- exonware_xwnode-0.0.1.12.dist-info/RECORD +132 -0
- exonware_xwnode-0.0.1.12.dist-info/WHEEL +4 -0
- exonware_xwnode-0.0.1.12.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,487 @@
|
|
1
|
+
"""
|
2
|
+
Suffix Array Node Strategy Implementation
|
3
|
+
|
4
|
+
This module implements the SUFFIX_ARRAY strategy for efficient substring
|
5
|
+
searches and string pattern matching. Construction currently uses a naive comparison sort of all suffixes, not a linear-time algorithm.
|
6
|
+
"""
|
7
|
+
|
8
|
+
from typing import Any, Iterator, List, Dict, Optional, Tuple
|
9
|
+
import bisect
|
10
|
+
from .base import ANodeTreeStrategy
|
11
|
+
from ...types import NodeMode, NodeTrait
|
12
|
+
|
13
|
+
|
14
|
+
class SuffixArrayStrategy(ANodeTreeStrategy):
    """
    Suffix Array node strategy for efficient string operations.

    Stores one text (terminated by ``separator``), lazily builds a suffix
    array over it on first query, and answers substring searches by binary
    search over that array. An LCP (longest common prefix) array is built
    alongside it when ``enable_lcp`` is true.

    Options (read from ``**options``):
        enable_lcp: build the LCP array together with the suffix array
            (default ``True``).
        case_sensitive: when ``False`` the text and every search pattern are
            lower-cased before use (default ``True``).
        separator: end-of-text marker appended to the text (default ``'$'``).
    """

    # Keys that get()/has() answer from internal state rather than _values.
    _RESERVED_KEYS = ("text", "suffix_array", "lcp_array")

    def __init__(self, traits: NodeTrait = NodeTrait.NONE, **options):
        """Initialize the Suffix Array strategy."""
        super().__init__(NodeMode.SUFFIX_ARRAY, traits, **options)

        self.enable_lcp = options.get('enable_lcp', True)  # Longest Common Prefix array
        self.case_sensitive = options.get('case_sensitive', True)
        self.separator = options.get('separator', '$')  # End-of-string marker

        # Core storage
        self._text = ""
        self._suffix_array: List[int] = []
        self._lcp_array: List[int] = []  # lcp[r] = LCP of suffixes ranked r and r + 1
        self._rank: List[int] = []  # Inverse suffix array

        # Key-value mapping for compatibility
        self._key_to_pos: Dict[str, List[int]] = {}
        self._values: Dict[str, Any] = {}
        self._size = 0

        # Performance optimizations
        self._is_built = False
        self._pattern_cache: Dict[str, List[int]] = {}

    def get_supported_traits(self) -> NodeTrait:
        """Get the traits supported by the suffix array strategy."""
        return (NodeTrait.ORDERED | NodeTrait.INDEXED | NodeTrait.STREAMING)

    def _preprocess_text(self, text: str) -> str:
        """Return ``text`` lower-cased (if case-insensitive) and separator-terminated."""
        if not self.case_sensitive:
            text = text.lower()

        # Ensure text ends with separator
        if not text.endswith(self.separator):
            text += self.separator

        return text

    def _build_suffix_array_naive(self) -> None:
        """Build the suffix array by comparison-sorting every suffix.

        This is the naive O(n^2 log n) construction. It also refreshes the
        rank array and, when enabled, the LCP array.
        """
        text = self._text
        # All suffixes have distinct lengths, so the sort has no ties.
        self._suffix_array = sorted(range(len(text)), key=lambda start: text[start:])
        self._build_rank_array()

        if self.enable_lcp:
            self._build_lcp_array()

    def _build_suffix_array_optimized(self) -> None:
        """Build the suffix array (currently delegates to the naive build)."""
        # For simplicity, using naive approach - can be optimized with DC3/SA-IS algorithms
        self._build_suffix_array_naive()

    def _build_rank_array(self) -> None:
        """Build rank array (inverse of suffix array)."""
        self._rank = [0] * len(self._suffix_array)
        for rank, start in enumerate(self._suffix_array):
            self._rank[start] = rank

    @staticmethod
    def _kasai_lcp(text: str, suffix_array: List[int], rank: List[int]) -> List[int]:
        """Return the LCP array for ``text`` using Kasai's algorithm.

        ``lcp[r]`` is the length of the common prefix of the suffixes ranked
        ``r`` and ``r + 1``; the entry for the last rank stays 0.
        """
        n = len(text)
        lcp = [0] * n
        matched = 0
        for start in range(n):
            r = rank[start]
            if r == n - 1:
                # No successor in sorted order: nothing to compare against.
                matched = 0
                continue

            neighbour = suffix_array[r + 1]
            while (start + matched < n and neighbour + matched < n and
                   text[start + matched] == text[neighbour + matched]):
                matched += 1

            lcp[r] = matched

            # The next suffix (start + 1) shares at least matched - 1 characters.
            if matched > 0:
                matched -= 1
        return lcp

    def _build_lcp_array(self) -> None:
        """Build Longest Common Prefix array using Kasai's algorithm."""
        self._lcp_array = self._kasai_lcp(self._text, self._suffix_array, self._rank)

    def _rebuild_if_needed(self) -> None:
        """Rebuild suffix array if text has changed."""
        if not self._is_built and self._text:
            self._build_suffix_array_optimized()
            self._is_built = True

    # ============================================================================
    # CORE OPERATIONS (Key-based interface for compatibility)
    # ============================================================================

    def put(self, key: Any, value: Any = None) -> None:
        """Add a string to the indexed text and record ``value`` under ``key``.

        The string that gets indexed is ``str(value)``, or ``str(key)`` when
        ``value`` is ``None``. If no text is stored yet, or ``key`` is
        ``"text"``, the stored text is replaced; otherwise the string is
        appended before the end marker. Either way the suffix array is
        invalidated and rebuilt lazily on the next query.
        """
        key_str = str(key)
        # Without an explicit value the key itself is the content to index
        # (previously the append path indexed the literal string "None").
        content = key_str if value is None else str(value)

        if not self._text or key_str == "text":
            # Replace entire text
            self._text = self._preprocess_text(content)
            self._key_to_pos.clear()
            self._values[key_str] = value
            self._size = 1
        else:
            # Append to text (less efficient, requires rebuild)
            body = self._text
            if self.separator and body.endswith(self.separator):
                # Strip the whole separator, which may be longer than one char.
                body = body[:-len(self.separator)]
            # Re-run preprocessing so appended text is case-folded like the rest.
            self._text = self._preprocess_text(body + content)

            # Overwriting an existing key must not inflate the item count.
            if key_str not in self._values:
                self._size += 1
            self._values[key_str] = value

        self._is_built = False
        self._pattern_cache.clear()

    def get(self, key: Any, default: Any = None) -> Any:
        """Get value by key.

        The keys ``"text"``, ``"suffix_array"`` and ``"lcp_array"`` return the
        stored text and copies of the corresponding arrays; any other key is
        looked up among the values recorded by :meth:`put`.
        """
        key_str = str(key)

        if key_str == "text":
            return self._text
        if key_str == "suffix_array":
            self._rebuild_if_needed()
            return self._suffix_array.copy()
        if key_str == "lcp_array":
            self._rebuild_if_needed()
            return self._lcp_array.copy()

        return self._values.get(key_str, default)

    def has(self, key: Any) -> bool:
        """Check if key exists (the three reserved keys always do)."""
        key_str = str(key)
        return key_str in self._values or key_str in self._RESERVED_KEYS

    def remove(self, key: Any) -> bool:
        """Remove key (limited support).

        Only the recorded value is dropped; the indexed text is left as is.
        Returns ``True`` when the key was present.
        """
        key_str = str(key)

        if key_str in self._values:
            del self._values[key_str]
            self._size -= 1
            return True

        return False

    def delete(self, key: Any) -> bool:
        """Remove key (alias for remove)."""
        return self.remove(key)

    def clear(self) -> None:
        """Clear all data."""
        self._text = ""
        self._suffix_array.clear()
        self._lcp_array.clear()
        self._rank.clear()
        self._key_to_pos.clear()
        self._values.clear()
        self._pattern_cache.clear()
        self._size = 0
        self._is_built = False

    def keys(self) -> Iterator[str]:
        """Get all keys: the reserved keys first, then the recorded ones."""
        yield "text"
        yield "suffix_array"
        if self.enable_lcp:
            yield "lcp_array"
        yield from self._values.keys()

    def values(self) -> Iterator[Any]:
        """Get all values, in the same order as :meth:`keys`."""
        yield self._text
        self._rebuild_if_needed()
        yield self._suffix_array.copy()
        if self.enable_lcp:
            yield self._lcp_array.copy()
        yield from self._values.values()

    def items(self) -> Iterator[Tuple[str, Any]]:
        """Get all key-value pairs, in the same order as :meth:`keys`."""
        yield ("text", self._text)
        self._rebuild_if_needed()
        yield ("suffix_array", self._suffix_array.copy())
        if self.enable_lcp:
            yield ("lcp_array", self._lcp_array.copy())
        yield from self._values.items()

    def __len__(self) -> int:
        """Get number of stored items."""
        return self._size

    def to_native(self) -> Dict[str, Any]:
        """Convert to native Python dict."""
        result = {"text": self._text}
        self._rebuild_if_needed()
        result["suffix_array"] = self._suffix_array.copy()
        if self.enable_lcp:
            result["lcp_array"] = self._lcp_array.copy()
        result.update(self._values)
        return result

    @property
    def is_list(self) -> bool:
        """This can behave like a list for suffix access."""
        return True

    @property
    def is_dict(self) -> bool:
        """This behaves like a dict."""
        return True

    # ============================================================================
    # SUFFIX ARRAY SPECIFIC OPERATIONS
    # ============================================================================

    def set_text(self, text: str) -> None:
        """Set the text for suffix array operations."""
        self._text = self._preprocess_text(text)
        self._is_built = False
        self._pattern_cache.clear()
        self._size = 1

    def search_pattern(self, pattern: str) -> List[int]:
        """Return the sorted start positions of every occurrence of ``pattern``.

        An empty pattern, or an empty index, yields an empty list. Results
        are cached per (case-normalised) pattern until the text changes; the
        returned list is always a fresh copy.
        """
        if not pattern:
            return []

        # Normalise before consulting the cache so lookups and stores agree
        # on the key in case-insensitive mode.
        if not self.case_sensitive:
            pattern = pattern.lower()

        cached = self._pattern_cache.get(pattern)
        if cached is not None:
            # Copy so callers cannot corrupt the cached result.
            return list(cached)

        self._rebuild_if_needed()

        if not self._suffix_array:
            return []

        # Binary search for leftmost occurrence
        first = self._binary_search_left(pattern)
        if first == -1:
            self._pattern_cache[pattern] = []
            return []

        # Binary search for rightmost occurrence
        last = self._binary_search_right(pattern)

        # Every suffix ranked between the two bounds starts with the pattern.
        positions = sorted(self._suffix_array[first:last + 1])
        self._pattern_cache[pattern] = positions
        return list(positions)

    def _compare_suffix(self, suffix_pos: int, pattern: str) -> int:
        """Compare the suffix at ``suffix_pos`` with ``pattern``.

        Returns 0 when the suffix starts with ``pattern``, -1 when it sorts
        before ``pattern`` and 1 when it sorts after. Only the first
        ``len(pattern)`` characters are inspected, which orders the same way
        as comparing the whole suffix but avoids copying it.
        """
        text = self._text
        if text.startswith(pattern, suffix_pos):
            return 0
        window = text[suffix_pos:suffix_pos + len(pattern)]
        return -1 if window < pattern else 1

    def _binary_search_left(self, pattern: str) -> int:
        """Find the lowest rank whose suffix starts with ``pattern`` (-1 if none)."""
        low, high = 0, len(self._suffix_array) - 1
        found = -1

        while low <= high:
            mid = (low + high) // 2
            order = self._compare_suffix(self._suffix_array[mid], pattern)

            if order == 0:
                found = mid
                high = mid - 1  # Continue searching left
            elif order < 0:
                low = mid + 1
            else:
                high = mid - 1

        return found

    def _binary_search_right(self, pattern: str) -> int:
        """Find the highest rank whose suffix starts with ``pattern`` (-1 if none)."""
        low, high = 0, len(self._suffix_array) - 1
        found = -1

        while low <= high:
            mid = (low + high) // 2
            order = self._compare_suffix(self._suffix_array[mid], pattern)

            if order == 0:
                found = mid
                low = mid + 1  # Continue searching right
            elif order < 0:
                low = mid + 1
            else:
                high = mid - 1

        return found

    def count_occurrences(self, pattern: str) -> int:
        """Count occurrences of pattern."""
        return len(self.search_pattern(pattern))

    def find_longest_common_substring(self, other_text: str) -> Tuple[str, int, int]:
        """Find longest common substring with another text.

        Returns ``(substring, position, length)`` where ``position`` is the
        start of the match inside the stored text, or ``("", 0, 0)`` when
        there is no common substring or either text is empty.

        The search runs on local arrays built over the two texts joined by
        ``"#"``, so the strategy's own text, suffix array, cache and size are
        left untouched, and it works whether or not ``enable_lcp`` is set.
        Matches are clamped so they never include the end marker.
        """
        if not self._text or not other_text:
            return "", 0, 0

        if not self.case_sensitive:
            other_text = other_text.lower()

        own_text = self._text
        sep = self.separator
        # Matchable part of the stored text excludes its trailing end marker.
        own_len = len(own_text)
        if sep and own_text.endswith(sep):
            own_len -= len(sep)

        combined = own_text + "#" + other_text + sep
        boundary = len(own_text)            # index of the "#" joiner
        other_start = boundary + 1
        other_end = other_start + len(other_text)

        total = len(combined)
        order = sorted(range(total), key=lambda start: combined[start:])
        rank = [0] * total
        for r, start in enumerate(order):
            rank[start] = r
        lcp = self._kasai_lcp(combined, order, rank)

        best_len = 0
        best_pos = 0
        for r in range(total - 1):
            first, second = order[r], order[r + 1]

            # Only adjacent suffixes that come from different texts count.
            if (first < boundary) == (second < boundary):
                continue
            own, other = (first, second) if first < boundary else (second, first)
            if other < other_start:
                continue  # the suffix starting at the "#" joiner itself

            # Stop the match at the end of each text's real content.
            length = min(lcp[r], own_len - own, other_end - other)
            if length > best_len:
                best_len = length
                best_pos = own

        if best_len > 0:
            return combined[best_pos:best_pos + best_len], best_pos, best_len

        return "", 0, 0

    def get_suffix(self, index: int) -> str:
        """Get suffix starting at given index ("" when out of range)."""
        if 0 <= index < len(self._text):
            return self._text[index:]
        return ""

    def get_sorted_suffixes(self) -> List[str]:
        """Get all suffixes in sorted order."""
        self._rebuild_if_needed()
        return [self._text[start:] for start in self._suffix_array]

    def find_repeated_substrings(self, min_length: int = 2) -> List[Tuple[str, int, List[int]]]:
        """Find repeated substrings using LCP array.

        Returns ``(substring, length, positions)`` tuples, longest first, for
        every distinct LCP-derived substring of at least ``min_length``
        characters that occurs more than once. Returns ``[]`` when the LCP
        array is disabled.
        """
        self._rebuild_if_needed()

        if not self.enable_lcp:
            return []

        found: Dict[str, Tuple[int, List[int]]] = {}

        for r, lcp_len in enumerate(self._lcp_array):
            # A zero-length prefix is never a repeat; skipping it also keeps
            # min_length <= 0 from reaching past the last rank.
            if lcp_len <= 0 or lcp_len < min_length:
                continue

            start = self._suffix_array[r]
            substring = self._text[start:start + lcp_len]
            if substring in found:
                continue

            # Find all occurrences of this substring
            positions = self.search_pattern(substring)
            if len(positions) > 1:
                found[substring] = (lcp_len, positions)

        result = [(substr, data[0], data[1]) for substr, data in found.items()]
        result.sort(key=lambda entry: entry[1], reverse=True)

        return result

    def get_statistics(self) -> Dict[str, Any]:
        """Get comprehensive suffix array statistics.

        The same keys are returned for an empty text (all counts zero), so
        callers such as :attr:`metrics` can index the result unconditionally.
        """
        self._rebuild_if_needed()

        lcp = self._lcp_array
        avg_lcp = sum(lcp) / len(lcp) if lcp else 0
        max_lcp = max(lcp) if lcp else 0

        return {
            'text_length': len(self._text),
            'unique_characters': len(set(self._text)),
            'suffixes': len(self._suffix_array),
            'avg_lcp': avg_lcp,
            'max_lcp': max_lcp,
            'case_sensitive': self.case_sensitive,
            'pattern_cache_size': len(self._pattern_cache),
            # Rough estimate: one unit per character, four per array entry.
            'memory_usage': len(self._text) + len(self._suffix_array) * 4 + len(lcp) * 4
        }

    # ============================================================================
    # PERFORMANCE CHARACTERISTICS
    # ============================================================================

    @property
    def backend_info(self) -> Dict[str, Any]:
        """Get backend implementation info."""
        return {
            'strategy': 'SUFFIX_ARRAY',
            'backend': 'Suffix array with LCP array and binary search',
            'enable_lcp': self.enable_lcp,
            'case_sensitive': self.case_sensitive,
            'separator': self.separator,
            'complexity': {
                # Naive suffix sort; DC3/SA-IS would bring this down to O(n).
                'construction': 'O(n^2 log n)',
                'pattern_search': 'O(m log n + occ)',  # m = pattern length, occ = occurrences
                'space': 'O(n)',
                'lcp_construction': 'O(n)',
                'substring_queries': 'O(log n + occ)'
            }
        }

    @property
    def metrics(self) -> Dict[str, Any]:
        """Get performance metrics."""
        stats = self.get_statistics()

        return {
            'text_length': stats['text_length'],
            'suffixes': stats['suffixes'],
            'unique_chars': stats['unique_characters'],
            'avg_lcp': f"{stats['avg_lcp']:.2f}",
            'max_lcp': stats['max_lcp'],
            'cache_entries': stats['pattern_cache_size'],
            'memory_usage': f"{stats['memory_usage']} bytes"
        }
|