exonware-xwnode 0.0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- exonware/__init__.py +14 -0
- exonware/xwnode/__init__.py +127 -0
- exonware/xwnode/base.py +676 -0
- exonware/xwnode/config.py +178 -0
- exonware/xwnode/contracts.py +730 -0
- exonware/xwnode/errors.py +503 -0
- exonware/xwnode/facade.py +460 -0
- exonware/xwnode/strategies/__init__.py +158 -0
- exonware/xwnode/strategies/advisor.py +463 -0
- exonware/xwnode/strategies/edges/__init__.py +32 -0
- exonware/xwnode/strategies/edges/adj_list.py +227 -0
- exonware/xwnode/strategies/edges/adj_matrix.py +391 -0
- exonware/xwnode/strategies/edges/base.py +169 -0
- exonware/xwnode/strategies/flyweight.py +328 -0
- exonware/xwnode/strategies/impls/__init__.py +13 -0
- exonware/xwnode/strategies/impls/_base_edge.py +403 -0
- exonware/xwnode/strategies/impls/_base_node.py +307 -0
- exonware/xwnode/strategies/impls/edge_adj_list.py +353 -0
- exonware/xwnode/strategies/impls/edge_adj_matrix.py +445 -0
- exonware/xwnode/strategies/impls/edge_bidir_wrapper.py +455 -0
- exonware/xwnode/strategies/impls/edge_block_adj_matrix.py +539 -0
- exonware/xwnode/strategies/impls/edge_coo.py +533 -0
- exonware/xwnode/strategies/impls/edge_csc.py +447 -0
- exonware/xwnode/strategies/impls/edge_csr.py +492 -0
- exonware/xwnode/strategies/impls/edge_dynamic_adj_list.py +503 -0
- exonware/xwnode/strategies/impls/edge_flow_network.py +555 -0
- exonware/xwnode/strategies/impls/edge_hyperedge_set.py +516 -0
- exonware/xwnode/strategies/impls/edge_neural_graph.py +650 -0
- exonware/xwnode/strategies/impls/edge_octree.py +574 -0
- exonware/xwnode/strategies/impls/edge_property_store.py +655 -0
- exonware/xwnode/strategies/impls/edge_quadtree.py +519 -0
- exonware/xwnode/strategies/impls/edge_rtree.py +820 -0
- exonware/xwnode/strategies/impls/edge_temporal_edgeset.py +558 -0
- exonware/xwnode/strategies/impls/edge_tree_graph_basic.py +271 -0
- exonware/xwnode/strategies/impls/edge_weighted_graph.py +411 -0
- exonware/xwnode/strategies/manager.py +775 -0
- exonware/xwnode/strategies/metrics.py +538 -0
- exonware/xwnode/strategies/migration.py +432 -0
- exonware/xwnode/strategies/nodes/__init__.py +50 -0
- exonware/xwnode/strategies/nodes/_base_node.py +307 -0
- exonware/xwnode/strategies/nodes/adjacency_list.py +267 -0
- exonware/xwnode/strategies/nodes/aho_corasick.py +345 -0
- exonware/xwnode/strategies/nodes/array_list.py +209 -0
- exonware/xwnode/strategies/nodes/base.py +247 -0
- exonware/xwnode/strategies/nodes/deque.py +200 -0
- exonware/xwnode/strategies/nodes/hash_map.py +135 -0
- exonware/xwnode/strategies/nodes/heap.py +307 -0
- exonware/xwnode/strategies/nodes/linked_list.py +232 -0
- exonware/xwnode/strategies/nodes/node_aho_corasick.py +520 -0
- exonware/xwnode/strategies/nodes/node_array_list.py +175 -0
- exonware/xwnode/strategies/nodes/node_avl_tree.py +371 -0
- exonware/xwnode/strategies/nodes/node_b_plus_tree.py +542 -0
- exonware/xwnode/strategies/nodes/node_bitmap.py +420 -0
- exonware/xwnode/strategies/nodes/node_bitset_dynamic.py +513 -0
- exonware/xwnode/strategies/nodes/node_bloom_filter.py +347 -0
- exonware/xwnode/strategies/nodes/node_btree.py +357 -0
- exonware/xwnode/strategies/nodes/node_count_min_sketch.py +470 -0
- exonware/xwnode/strategies/nodes/node_cow_tree.py +473 -0
- exonware/xwnode/strategies/nodes/node_cuckoo_hash.py +392 -0
- exonware/xwnode/strategies/nodes/node_fenwick_tree.py +301 -0
- exonware/xwnode/strategies/nodes/node_hash_map.py +269 -0
- exonware/xwnode/strategies/nodes/node_heap.py +191 -0
- exonware/xwnode/strategies/nodes/node_hyperloglog.py +407 -0
- exonware/xwnode/strategies/nodes/node_linked_list.py +409 -0
- exonware/xwnode/strategies/nodes/node_lsm_tree.py +400 -0
- exonware/xwnode/strategies/nodes/node_ordered_map.py +390 -0
- exonware/xwnode/strategies/nodes/node_ordered_map_balanced.py +565 -0
- exonware/xwnode/strategies/nodes/node_patricia.py +512 -0
- exonware/xwnode/strategies/nodes/node_persistent_tree.py +378 -0
- exonware/xwnode/strategies/nodes/node_radix_trie.py +452 -0
- exonware/xwnode/strategies/nodes/node_red_black_tree.py +497 -0
- exonware/xwnode/strategies/nodes/node_roaring_bitmap.py +570 -0
- exonware/xwnode/strategies/nodes/node_segment_tree.py +289 -0
- exonware/xwnode/strategies/nodes/node_set_hash.py +354 -0
- exonware/xwnode/strategies/nodes/node_set_tree.py +480 -0
- exonware/xwnode/strategies/nodes/node_skip_list.py +316 -0
- exonware/xwnode/strategies/nodes/node_splay_tree.py +393 -0
- exonware/xwnode/strategies/nodes/node_suffix_array.py +487 -0
- exonware/xwnode/strategies/nodes/node_treap.py +387 -0
- exonware/xwnode/strategies/nodes/node_tree_graph_hybrid.py +1434 -0
- exonware/xwnode/strategies/nodes/node_trie.py +252 -0
- exonware/xwnode/strategies/nodes/node_union_find.py +187 -0
- exonware/xwnode/strategies/nodes/node_xdata_optimized.py +369 -0
- exonware/xwnode/strategies/nodes/priority_queue.py +209 -0
- exonware/xwnode/strategies/nodes/queue.py +161 -0
- exonware/xwnode/strategies/nodes/sparse_matrix.py +206 -0
- exonware/xwnode/strategies/nodes/stack.py +152 -0
- exonware/xwnode/strategies/nodes/trie.py +274 -0
- exonware/xwnode/strategies/nodes/union_find.py +283 -0
- exonware/xwnode/strategies/pattern_detector.py +603 -0
- exonware/xwnode/strategies/performance_monitor.py +487 -0
- exonware/xwnode/strategies/queries/__init__.py +24 -0
- exonware/xwnode/strategies/queries/base.py +236 -0
- exonware/xwnode/strategies/queries/cql.py +201 -0
- exonware/xwnode/strategies/queries/cypher.py +181 -0
- exonware/xwnode/strategies/queries/datalog.py +70 -0
- exonware/xwnode/strategies/queries/elastic_dsl.py +70 -0
- exonware/xwnode/strategies/queries/eql.py +70 -0
- exonware/xwnode/strategies/queries/flux.py +70 -0
- exonware/xwnode/strategies/queries/gql.py +70 -0
- exonware/xwnode/strategies/queries/graphql.py +240 -0
- exonware/xwnode/strategies/queries/gremlin.py +181 -0
- exonware/xwnode/strategies/queries/hiveql.py +214 -0
- exonware/xwnode/strategies/queries/hql.py +70 -0
- exonware/xwnode/strategies/queries/jmespath.py +219 -0
- exonware/xwnode/strategies/queries/jq.py +66 -0
- exonware/xwnode/strategies/queries/json_query.py +66 -0
- exonware/xwnode/strategies/queries/jsoniq.py +248 -0
- exonware/xwnode/strategies/queries/kql.py +70 -0
- exonware/xwnode/strategies/queries/linq.py +238 -0
- exonware/xwnode/strategies/queries/logql.py +70 -0
- exonware/xwnode/strategies/queries/mql.py +68 -0
- exonware/xwnode/strategies/queries/n1ql.py +210 -0
- exonware/xwnode/strategies/queries/partiql.py +70 -0
- exonware/xwnode/strategies/queries/pig.py +215 -0
- exonware/xwnode/strategies/queries/promql.py +70 -0
- exonware/xwnode/strategies/queries/sparql.py +220 -0
- exonware/xwnode/strategies/queries/sql.py +275 -0
- exonware/xwnode/strategies/queries/xml_query.py +66 -0
- exonware/xwnode/strategies/queries/xpath.py +223 -0
- exonware/xwnode/strategies/queries/xquery.py +258 -0
- exonware/xwnode/strategies/queries/xwnode_executor.py +332 -0
- exonware/xwnode/strategies/queries/xwquery_strategy.py +424 -0
- exonware/xwnode/strategies/registry.py +604 -0
- exonware/xwnode/strategies/simple.py +273 -0
- exonware/xwnode/strategies/utils.py +532 -0
- exonware/xwnode/types.py +912 -0
- exonware/xwnode/version.py +78 -0
- exonware_xwnode-0.0.1.12.dist-info/METADATA +169 -0
- exonware_xwnode-0.0.1.12.dist-info/RECORD +132 -0
- exonware_xwnode-0.0.1.12.dist-info/WHEEL +4 -0
- exonware_xwnode-0.0.1.12.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,470 @@
|
|
1
|
+
"""
|
2
|
+
Count-Min Sketch Node Strategy Implementation
|
3
|
+
|
4
|
+
This module implements the COUNT_MIN_SKETCH strategy for probabilistic
|
5
|
+
frequency estimation in data streams with bounded error guarantees.
|
6
|
+
"""
|
7
|
+
|
8
|
+
from typing import Any, Iterator, List, Dict, Optional, Tuple
|
9
|
+
import hashlib
|
10
|
+
import math
|
11
|
+
from ._base_node import aNodeStrategy
|
12
|
+
from ...types import NodeMode, NodeTrait
|
13
|
+
|
14
|
+
|
15
|
+
class xCountMinSketchStrategy(aNodeStrategy):
    """
    Count-Min Sketch node strategy for streaming frequency estimation.

    The sketch is a ``depth`` x ``width`` matrix of counters.  Every item is
    hashed once per row and the matching counter is incremented; the estimate
    for an item is the minimum of its counters.  Because ``put`` only ever
    adds to counters, an estimate never undercounts an item as long as
    ``remove`` has not been used (``remove`` decrements counters that may be
    shared with other items).

    Besides the sketch itself the strategy keeps exact bookkeeping for the
    keys it has seen (``_unique_items``), the last value stored per key
    (``_values``) and an optional heavy-hitters table.

    Options (all optional keyword arguments):
        epsilon: Error factor used to size the width (default 0.01).
        delta: Failure probability used to size the depth (default 0.01).
        track_heavy_hitters: Maintain the heavy-hitters table (default True).
        heavy_hitter_threshold: Fraction of the total count an item must
            reach to be recorded as a heavy hitter (default 0.01).
    """

    # Keys that get()/has() answer with sketch metadata instead of an item
    # count.  keys()/values()/items() append them in exactly this order.
    _RESERVED_KEYS = ("total_count", "unique_items", "heavy_hitters",
                      "sketch_info", "estimated_count")

    def __init__(self, traits: NodeTrait = NodeTrait.NONE, **options):
        """Initialize the Count-Min Sketch strategy.

        Raises:
            ValueError: If ``epsilon`` or ``delta`` is not a positive number.
        """
        super().__init__(NodeMode.COUNT_MIN_SKETCH, traits, **options)

        # Sketch parameters
        self.epsilon = options.get('epsilon', 0.01)  # Error bound (1%)
        self.delta = options.get('delta', 0.01)  # Confidence (99%)

        # Calculate dimensions
        self.width = self._calculate_width()
        self.depth = self._calculate_depth()

        # Core sketch matrix: one row of `width` counters per hash function
        self._sketch: List[List[int]] = self._empty_matrix()

        # One seed per row, so every row uses a different hash function
        self._hash_seeds = self._generate_hash_seeds()

        # Key-value mapping for compatibility
        self._values: Dict[str, Any] = {}
        self._total_count = 0
        self._unique_items = set()
        self._size = 0  # kept equal to len(self._unique_items)

        # Heavy hitters tracking
        self.track_heavy_hitters = options.get('track_heavy_hitters', True)
        self.heavy_hitter_threshold = options.get('heavy_hitter_threshold', 0.01)  # 1% of total
        self._heavy_hitters: Dict[str, int] = {}

    def get_supported_traits(self) -> NodeTrait:
        """Get the traits supported by the count-min sketch strategy."""
        return (NodeTrait.PROBABILISTIC | NodeTrait.COMPRESSED | NodeTrait.STREAMING)

    def _empty_matrix(self) -> List[List[int]]:
        """Return a zero-filled ``depth`` x ``width`` counter matrix."""
        return [[0] * self.width for _ in range(self.depth)]

    def _calculate_width(self) -> int:
        """Calculate sketch width as ``ceil(e / epsilon)`` (at least 1).

        Raises:
            ValueError: If ``epsilon`` is not greater than zero.
        """
        # `not x > 0` (rather than `x <= 0`) also rejects NaN.
        if not self.epsilon > 0:
            raise ValueError(f"epsilon must be a positive number, got {self.epsilon!r}")
        return max(1, int(math.ceil(math.e / self.epsilon)))

    def _calculate_depth(self) -> int:
        """Calculate sketch depth as ``ceil(ln(1 / delta))`` (at least 1).

        Raises:
            ValueError: If ``delta`` is not greater than zero.
        """
        if not self.delta > 0:
            raise ValueError(f"delta must be a positive number, got {self.delta!r}")
        return max(1, int(math.ceil(math.log(1.0 / self.delta))))

    def _generate_hash_seeds(self) -> List[int]:
        """Generate one deterministic seed per sketch row.

        The formula must stay stable: two sketches can only be merged, and an
        exported sketch can only be re-imported meaningfully, when their
        seeds match.
        """
        primes = (2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71)
        return [
            primes[i % len(primes)] * (i + 1) * 1000 + i
            for i in range(self.depth)
        ]

    def _hash_item(self, item: str, seed: int) -> int:
        """Hash item to a bucket index in ``[0, width)`` using the given seed."""
        # MD5 is used purely for bucket distribution; flagging it as
        # non-security keeps it usable on FIPS-restricted builds without
        # changing the digest.
        digest = hashlib.md5(f"{item}{seed}".encode(), usedforsecurity=False).hexdigest()
        return int(digest, 16) % self.width

    def _update_heavy_hitters(self, item: str, estimated_count: int) -> None:
        """Record or drop ``item`` in the heavy-hitters table.

        The item is kept when ``estimated_count`` is at least
        ``heavy_hitter_threshold`` times the current total count.  Only the
        given item is re-evaluated; entries for other items are left as they
        were when last updated.
        """
        if not self.track_heavy_hitters:
            return

        threshold = self._total_count * self.heavy_hitter_threshold

        if estimated_count >= threshold:
            self._heavy_hitters[item] = estimated_count
        else:
            # Remove from heavy hitters if below threshold
            self._heavy_hitters.pop(item, None)

    # ============================================================================
    # CORE OPERATIONS (Key-based interface for compatibility)
    # ============================================================================

    def put(self, key: Any, value: Any = None) -> None:
        """Add item to count-min sketch.

        Args:
            key: Item to count; it is tracked under ``str(key)``.
            value: If a positive ``int``/``float``, it is truncated with
                ``int()`` and used as the increment; otherwise the increment
                is 1.  The value (or the increment when ``value`` is None) is
                also remembered and returned by ``get``.
        """
        item = str(key)
        count = 1

        # If value is a number, treat it as count
        if isinstance(value, (int, float)) and value > 0:
            count = int(value)

        # Update sketch: one counter per row
        for row, seed in zip(self._sketch, self._hash_seeds):
            row[self._hash_item(item, seed)] += count

        # Update tracking.  Newness must be decided BEFORE the item is
        # recorded, otherwise the size counter never advances.
        self._total_count += count
        if item not in self._unique_items:
            self._unique_items.add(item)
            self._size += 1

        # Store value
        self._values[item] = value if value is not None else count

        # Update heavy hitters
        self._update_heavy_hitters(item, self.estimate_count(item))

    def get(self, key: Any, default: Any = None) -> Any:
        """Get sketch metadata, the stored value, or the estimated count.

        Reserved keys (``total_count``, ``unique_items``, ``heavy_hitters``,
        ``sketch_info``, ``estimated_count``) return sketch metadata;
        ``estimated_count`` returns a callable that estimates any item.
        For any other key the value last stored by ``put`` is returned, or
        the estimated count when nothing is stored.

        Note: ``default`` is accepted for interface compatibility but is not
        used; unknown items yield their estimated count (normally 0).
        """
        item = str(key)

        if key == "total_count":
            return self._total_count
        if key == "unique_items":
            return len(self._unique_items)
        if key == "heavy_hitters":
            return dict(self._heavy_hitters)
        if key == "sketch_info":
            return {
                'width': self.width,
                'depth': self.depth,
                'epsilon': self.epsilon,
                'delta': self.delta,
                'total_count': self._total_count
            }
        if key == "estimated_count":
            # Return function to estimate any item
            return self.estimate_count
        if item in self._values:
            return self._values[item]
        # Return estimated count
        return self.estimate_count(item)

    def has(self, key: Any) -> bool:
        """Check if item might exist (probabilistic).

        Reserved metadata keys always exist; any other key exists when its
        estimated count is greater than zero.
        """
        if key in self._RESERVED_KEYS:
            return True

        # Item exists if estimated count > 0
        return self.estimate_count(str(key)) > 0

    def remove(self, key: Any) -> bool:
        """Remove item (limited support - decrements count by one).

        Each of the item's counters that is above zero is decremented by 1.
        When the item's estimate reaches 0 it is dropped from all
        bookkeeping; otherwise its heavy-hitter status is refreshed.

        Returns:
            True if the item was tracked, False otherwise.
        """
        item = str(key)

        if item not in self._unique_items:
            return False

        # Decrement count in sketch
        for row, seed in zip(self._sketch, self._hash_seeds):
            bucket = self._hash_item(item, seed)
            if row[bucket] > 0:
                row[bucket] -= 1

        self._total_count = max(0, self._total_count - 1)

        remaining = self.estimate_count(item)
        if remaining == 0:
            # Forget the item entirely once its count becomes 0
            self._values.pop(item, None)
            self._unique_items.discard(item)
            self._size -= 1
            self._heavy_hitters.pop(item, None)
        else:
            self._update_heavy_hitters(item, remaining)

        return True

    def delete(self, key: Any) -> bool:
        """Remove item (alias for remove)."""
        return self.remove(key)

    def clear(self) -> None:
        """Clear all data; sketch dimensions and seeds are kept."""
        self._sketch = self._empty_matrix()
        self._values.clear()
        self._unique_items.clear()
        self._heavy_hitters.clear()
        self._total_count = 0
        self._size = 0

    def keys(self) -> Iterator[str]:
        """Get all tracked items, followed by the reserved metadata keys."""
        yield from self._unique_items
        yield from self._RESERVED_KEYS

    def values(self) -> Iterator[Any]:
        """Get estimated counts of tracked items, then the metadata values."""
        for item in self._unique_items:
            yield self.estimate_count(item)
        for key in self._RESERVED_KEYS:
            yield self.get(key)

    def items(self) -> Iterator[Tuple[str, Any]]:
        """Get all item-count pairs, then the metadata key-value pairs."""
        for item in self._unique_items:
            yield (item, self.estimate_count(item))
        for key in self._RESERVED_KEYS:
            yield (key, self.get(key))

    def __len__(self) -> int:
        """Get number of unique items tracked."""
        return self._size

    def to_native(self) -> Dict[str, Any]:
        """Convert to native Python dict.

        Contains the estimated count of every tracked item plus the
        ``total_count``, ``unique_items``, ``heavy_hitters`` and
        ``sketch_info`` metadata entries (the ``estimated_count`` callable is
        left out).
        """
        result = {item: self.estimate_count(item) for item in self._unique_items}

        result.update({
            "total_count": self._total_count,
            "unique_items": len(self._unique_items),
            "heavy_hitters": dict(self._heavy_hitters),
            "sketch_info": self.get("sketch_info")
        })

        return result

    @property
    def is_list(self) -> bool:
        """This is not a list strategy."""
        return False

    @property
    def is_dict(self) -> bool:
        """This behaves like a dict with probabilistic semantics."""
        return True

    # ============================================================================
    # COUNT-MIN SKETCH SPECIFIC OPERATIONS
    # ============================================================================

    def estimate_count(self, item: str) -> int:
        """Estimate count of item.

        The item is normalised with ``str()`` exactly as ``put`` does, so the
        empty string and non-string keys are looked up under the same name
        they were counted under.

        Returns:
            The minimum of the item's counters across all rows (0 when the
            sketch has no rows).
        """
        item = str(item)
        return int(min(
            (row[self._hash_item(item, seed)]
             for row, seed in zip(self._sketch, self._hash_seeds)),
            default=0,
        ))

    def increment(self, item: str, count: int = 1) -> None:
        """Increment count for item (delegates to ``put``)."""
        self.put(item, count)

    def get_frequent_items(self, threshold: Optional[int] = None) -> List[Tuple[str, int]]:
        """Get items above frequency threshold.

        Args:
            threshold: Minimum estimated count.  Defaults to
                ``heavy_hitter_threshold`` times the total count, but at
                least 1.

        Returns:
            ``(item, estimated_count)`` pairs sorted by count, descending.
        """
        if threshold is None:
            threshold = max(1, int(self._total_count * self.heavy_hitter_threshold))

        estimates = ((item, self.estimate_count(item)) for item in self._unique_items)
        frequent = [pair for pair in estimates if pair[1] >= threshold]

        # Sort by frequency (descending)
        frequent.sort(key=lambda pair: pair[1], reverse=True)
        return frequent

    def get_top_k(self, k: int) -> List[Tuple[str, int]]:
        """Get top-k most frequent items (empty list when ``k`` <= 0)."""
        if k <= 0:
            # A negative k would otherwise slice from the tail.
            return []
        ranked = sorted(
            ((item, self.estimate_count(item)) for item in self._unique_items),
            key=lambda pair: pair[1],
            reverse=True,
        )
        return ranked[:k]

    def merge(self, other: 'xCountMinSketchStrategy') -> 'xCountMinSketchStrategy':
        """Merge with another Count-Min Sketch.

        Returns a new sketch whose counters are the cell-wise sums of both
        inputs; neither input is modified.  Stored values from this sketch
        win over those of ``other`` for keys present in both.

        Raises:
            ValueError: If width, depth or hash seeds differ.
        """
        if (self.width != other.width or self.depth != other.depth or
                self._hash_seeds != other._hash_seeds):
            raise ValueError("Cannot merge sketches with different parameters")

        # Create new sketch
        merged = xCountMinSketchStrategy(
            traits=self._traits,
            epsilon=self.epsilon,
            delta=self.delta,
            track_heavy_hitters=self.track_heavy_hitters,
            heavy_hitter_threshold=self.heavy_hitter_threshold
        )

        # Pin the geometry to the inputs: after import_sketch() the actual
        # width/depth need not equal what epsilon/delta would produce.
        merged.width = self.width
        merged.depth = self.depth
        merged._hash_seeds = list(self._hash_seeds)

        # Merge sketch matrices
        merged._sketch = [
            [mine + theirs for mine, theirs in zip(own_row, other_row)]
            for own_row, other_row in zip(self._sketch, other._sketch)
        ]

        # Merge metadata
        merged._total_count = self._total_count + other._total_count
        merged._unique_items = self._unique_items | other._unique_items
        merged._size = len(merged._unique_items)

        # Merge values (prefer this sketch's values)
        merged._values.update(other._values)
        merged._values.update(self._values)

        # Recompute heavy hitters
        for item in merged._unique_items:
            merged._update_heavy_hitters(item, merged.estimate_count(item))

        return merged

    def get_error_bounds(self, item: str) -> Tuple[int, int, float]:
        """Get error bounds for item count estimate.

        A Count-Min Sketch can only over-estimate, so the bounds are
        ``estimate - epsilon * total_count <= true_count <= estimate``.
        The upper bound always holds for insert-only use; the lower bound
        holds with probability ``1 - delta``.  (``remove`` decrements shared
        counters and can weaken these guarantees.)

        Returns:
            ``(lower_bound, upper_bound, confidence)`` with the lower bound
            clamped at 0.
        """
        estimate = self.estimate_count(item)
        max_error = int(self.epsilon * self._total_count)
        confidence = 1.0 - self.delta

        return max(0, estimate - max_error), estimate, confidence

    def point_query(self, item: str) -> Dict[str, Any]:
        """Comprehensive point query with error analysis."""
        lower_bound, upper_bound, confidence = self.get_error_bounds(item)
        estimate = self.estimate_count(item)

        return {
            'item': item,
            'estimated_count': estimate,
            'lower_bound': lower_bound,
            'upper_bound': upper_bound,
            'confidence': confidence,
            'relative_frequency': estimate / max(1, self._total_count),
            'is_heavy_hitter': item in self._heavy_hitters
        }

    def range_query(self, items: List[str]) -> int:
        """Estimate total count for a range of items."""
        # Simple sum - can lead to overestimation due to hash collisions
        return sum(self.estimate_count(item) for item in items)

    def get_sketch_statistics(self) -> Dict[str, Any]:
        """Get comprehensive sketch statistics.

        ``memory_usage`` is a nominal figure that assumes 4 bytes per
        counter; it is not a measurement of the actual Python objects.
        """
        # Calculate sketch density
        total_cells = self.width * self.depth
        non_zero_cells = sum(1 for row in self._sketch for cell in row if cell > 0)
        density = non_zero_cells / total_cells if total_cells > 0 else 0

        # Calculate hash distribution
        max_bucket_count = max((cell for row in self._sketch for cell in row), default=0)
        avg_bucket_count = self._total_count / total_cells if total_cells > 0 else 0

        return {
            'width': self.width,
            'depth': self.depth,
            'total_cells': total_cells,
            'non_zero_cells': non_zero_cells,
            'density': density,
            'total_count': self._total_count,
            'unique_items': len(self._unique_items),
            'heavy_hitters': len(self._heavy_hitters),
            'max_bucket_count': max_bucket_count,
            'avg_bucket_count': avg_bucket_count,
            'theoretical_error_bound': self.epsilon,
            'theoretical_confidence': 1.0 - self.delta,
            'memory_usage': total_cells * 4  # 4 bytes per int
        }

    def export_sketch(self) -> Dict[str, Any]:
        """Export sketch for analysis or persistence.

        Returns copies of the counter matrix, the parameters (including hash
        seeds) and the metadata.  Values stored via ``put`` are not exported.
        """
        return {
            'sketch_matrix': [list(row) for row in self._sketch],
            'parameters': {
                'width': self.width,
                'depth': self.depth,
                'epsilon': self.epsilon,
                'delta': self.delta,
                'hash_seeds': list(self._hash_seeds)
            },
            'metadata': {
                'total_count': self._total_count,
                'unique_items': list(self._unique_items),
                'heavy_hitters': dict(self._heavy_hitters)
            }
        }

    def import_sketch(self, sketch_data: Dict[str, Any]) -> None:
        """Import sketch from exported data, replacing the current state.

        Everything is read and validated before any attribute is changed, so
        a failed import leaves the sketch untouched.

        Raises:
            KeyError: If a required entry is missing from ``sketch_data``.
            ValueError: If the matrix shape or the number of hash seeds does
                not match the declared width/depth.
        """
        matrix = [list(row) for row in sketch_data['sketch_matrix']]

        params = sketch_data['parameters']
        width = params['width']
        depth = params['depth']
        epsilon = params['epsilon']
        delta = params['delta']
        seeds = list(params['hash_seeds'])

        metadata = sketch_data['metadata']
        total_count = metadata['total_count']
        unique_items = set(metadata['unique_items'])
        heavy_hitters = dict(metadata['heavy_hitters'])

        if width < 1 or depth < 1:
            raise ValueError("Sketch width and depth must be at least 1")
        if (len(matrix) != depth or len(seeds) != depth
                or any(len(row) != width for row in matrix)):
            raise ValueError("Sketch matrix or hash seeds do not match the declared width/depth")

        self._sketch = matrix
        self.width = width
        self.depth = depth
        self.epsilon = epsilon
        self.delta = delta
        self._hash_seeds = seeds

        self._total_count = total_count
        self._unique_items = unique_items
        self._heavy_hitters = heavy_hitters
        self._size = len(unique_items)

        # Exports carry no stored values; drop leftovers from the previous
        # state so get() falls back to the imported estimates.
        self._values.clear()

    # ============================================================================
    # PERFORMANCE CHARACTERISTICS
    # ============================================================================

    @property
    def backend_info(self) -> Dict[str, Any]:
        """Get backend implementation info."""
        return {
            'strategy': 'COUNT_MIN_SKETCH',
            'backend': 'Probabilistic frequency counter with hash matrix',
            'width': self.width,
            'depth': self.depth,
            'epsilon': self.epsilon,
            'delta': self.delta,
            'track_heavy_hitters': self.track_heavy_hitters,
            'complexity': {
                'update': 'O(d)',  # d = depth
                'query': 'O(d)',
                'space': 'O(w * d)',  # w = width, d = depth
                'merge': 'O(w * d)',
                'error_bound': f'ε * ||f||₁ with probability ≥ {1.0 - self.delta}'
            }
        }

    @property
    def metrics(self) -> Dict[str, Any]:
        """Get performance metrics as display-ready values."""
        stats = self.get_sketch_statistics()

        return {
            'total_count': stats['total_count'],
            'unique_items': stats['unique_items'],
            'sketch_density': f"{stats['density'] * 100:.1f}%",
            'heavy_hitters': stats['heavy_hitters'],
            'error_bound': f"{self.epsilon * 100:.2f}%",
            'confidence': f"{(1.0 - self.delta) * 100:.1f}%",
            'memory_usage': f"{stats['memory_usage']} bytes"
        }
|