exonware-xwnode 0.0.1.22__py3-none-any.whl → 0.0.1.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- exonware/__init__.py +1 -1
- exonware/xwnode/__init__.py +18 -5
- exonware/xwnode/add_strategy_types.py +165 -0
- exonware/xwnode/common/__init__.py +1 -1
- exonware/xwnode/common/graph/__init__.py +30 -0
- exonware/xwnode/common/graph/caching.py +131 -0
- exonware/xwnode/common/graph/contracts.py +100 -0
- exonware/xwnode/common/graph/errors.py +44 -0
- exonware/xwnode/common/graph/indexing.py +260 -0
- exonware/xwnode/common/graph/manager.py +568 -0
- exonware/xwnode/common/management/__init__.py +3 -5
- exonware/xwnode/common/management/manager.py +2 -2
- exonware/xwnode/common/management/migration.py +3 -3
- exonware/xwnode/common/monitoring/__init__.py +3 -5
- exonware/xwnode/common/monitoring/metrics.py +6 -2
- exonware/xwnode/common/monitoring/pattern_detector.py +1 -1
- exonware/xwnode/common/monitoring/performance_monitor.py +5 -1
- exonware/xwnode/common/patterns/__init__.py +3 -5
- exonware/xwnode/common/patterns/flyweight.py +5 -1
- exonware/xwnode/common/patterns/registry.py +202 -183
- exonware/xwnode/common/utils/__init__.py +25 -11
- exonware/xwnode/common/utils/simple.py +1 -1
- exonware/xwnode/config.py +3 -8
- exonware/xwnode/contracts.py +4 -105
- exonware/xwnode/defs.py +413 -159
- exonware/xwnode/edges/strategies/__init__.py +86 -4
- exonware/xwnode/edges/strategies/_base_edge.py +2 -2
- exonware/xwnode/edges/strategies/adj_list.py +287 -121
- exonware/xwnode/edges/strategies/adj_matrix.py +316 -222
- exonware/xwnode/edges/strategies/base.py +1 -1
- exonware/xwnode/edges/strategies/{edge_bidir_wrapper.py → bidir_wrapper.py} +45 -4
- exonware/xwnode/edges/strategies/bitemporal.py +520 -0
- exonware/xwnode/edges/strategies/{edge_block_adj_matrix.py → block_adj_matrix.py} +77 -6
- exonware/xwnode/edges/strategies/bv_graph.py +664 -0
- exonware/xwnode/edges/strategies/compressed_graph.py +217 -0
- exonware/xwnode/edges/strategies/{edge_coo.py → coo.py} +46 -4
- exonware/xwnode/edges/strategies/{edge_csc.py → csc.py} +45 -4
- exonware/xwnode/edges/strategies/{edge_csr.py → csr.py} +94 -12
- exonware/xwnode/edges/strategies/{edge_dynamic_adj_list.py → dynamic_adj_list.py} +46 -4
- exonware/xwnode/edges/strategies/edge_list.py +168 -0
- exonware/xwnode/edges/strategies/edge_property_store.py +2 -2
- exonware/xwnode/edges/strategies/euler_tour.py +560 -0
- exonware/xwnode/edges/strategies/{edge_flow_network.py → flow_network.py} +2 -2
- exonware/xwnode/edges/strategies/graphblas.py +449 -0
- exonware/xwnode/edges/strategies/hnsw.py +637 -0
- exonware/xwnode/edges/strategies/hop2_labels.py +467 -0
- exonware/xwnode/edges/strategies/{edge_hyperedge_set.py → hyperedge_set.py} +2 -2
- exonware/xwnode/edges/strategies/incidence_matrix.py +250 -0
- exonware/xwnode/edges/strategies/k2_tree.py +613 -0
- exonware/xwnode/edges/strategies/link_cut.py +626 -0
- exonware/xwnode/edges/strategies/multiplex.py +532 -0
- exonware/xwnode/edges/strategies/{edge_neural_graph.py → neural_graph.py} +2 -2
- exonware/xwnode/edges/strategies/{edge_octree.py → octree.py} +69 -11
- exonware/xwnode/edges/strategies/{edge_quadtree.py → quadtree.py} +66 -10
- exonware/xwnode/edges/strategies/roaring_adj.py +438 -0
- exonware/xwnode/edges/strategies/{edge_rtree.py → rtree.py} +43 -5
- exonware/xwnode/edges/strategies/{edge_temporal_edgeset.py → temporal_edgeset.py} +24 -5
- exonware/xwnode/edges/strategies/{edge_tree_graph_basic.py → tree_graph_basic.py} +78 -7
- exonware/xwnode/edges/strategies/{edge_weighted_graph.py → weighted_graph.py} +188 -10
- exonware/xwnode/errors.py +3 -6
- exonware/xwnode/facade.py +20 -20
- exonware/xwnode/nodes/strategies/__init__.py +29 -9
- exonware/xwnode/nodes/strategies/adjacency_list.py +650 -177
- exonware/xwnode/nodes/strategies/aho_corasick.py +358 -183
- exonware/xwnode/nodes/strategies/array_list.py +36 -3
- exonware/xwnode/nodes/strategies/art.py +581 -0
- exonware/xwnode/nodes/strategies/{node_avl_tree.py → avl_tree.py} +77 -6
- exonware/xwnode/nodes/strategies/{node_b_plus_tree.py → b_plus_tree.py} +81 -40
- exonware/xwnode/nodes/strategies/{node_btree.py → b_tree.py} +79 -9
- exonware/xwnode/nodes/strategies/base.py +469 -98
- exonware/xwnode/nodes/strategies/{node_bitmap.py → bitmap.py} +12 -12
- exonware/xwnode/nodes/strategies/{node_bitset_dynamic.py → bitset_dynamic.py} +11 -11
- exonware/xwnode/nodes/strategies/{node_bloom_filter.py → bloom_filter.py} +15 -2
- exonware/xwnode/nodes/strategies/bloomier_filter.py +519 -0
- exonware/xwnode/nodes/strategies/bw_tree.py +531 -0
- exonware/xwnode/nodes/strategies/contracts.py +1 -1
- exonware/xwnode/nodes/strategies/{node_count_min_sketch.py → count_min_sketch.py} +3 -2
- exonware/xwnode/nodes/strategies/{node_cow_tree.py → cow_tree.py} +135 -13
- exonware/xwnode/nodes/strategies/crdt_map.py +629 -0
- exonware/xwnode/nodes/strategies/{node_cuckoo_hash.py → cuckoo_hash.py} +2 -2
- exonware/xwnode/nodes/strategies/{node_xdata_optimized.py → data_interchange_optimized.py} +21 -4
- exonware/xwnode/nodes/strategies/dawg.py +876 -0
- exonware/xwnode/nodes/strategies/deque.py +321 -153
- exonware/xwnode/nodes/strategies/extendible_hash.py +93 -0
- exonware/xwnode/nodes/strategies/{node_fenwick_tree.py → fenwick_tree.py} +111 -19
- exonware/xwnode/nodes/strategies/hamt.py +403 -0
- exonware/xwnode/nodes/strategies/hash_map.py +354 -67
- exonware/xwnode/nodes/strategies/heap.py +105 -5
- exonware/xwnode/nodes/strategies/hopscotch_hash.py +525 -0
- exonware/xwnode/nodes/strategies/{node_hyperloglog.py → hyperloglog.py} +6 -5
- exonware/xwnode/nodes/strategies/interval_tree.py +742 -0
- exonware/xwnode/nodes/strategies/kd_tree.py +703 -0
- exonware/xwnode/nodes/strategies/learned_index.py +533 -0
- exonware/xwnode/nodes/strategies/linear_hash.py +93 -0
- exonware/xwnode/nodes/strategies/linked_list.py +316 -119
- exonware/xwnode/nodes/strategies/{node_lsm_tree.py → lsm_tree.py} +219 -15
- exonware/xwnode/nodes/strategies/masstree.py +130 -0
- exonware/xwnode/nodes/strategies/{node_persistent_tree.py → persistent_tree.py} +149 -9
- exonware/xwnode/nodes/strategies/priority_queue.py +544 -132
- exonware/xwnode/nodes/strategies/queue.py +249 -120
- exonware/xwnode/nodes/strategies/{node_red_black_tree.py → red_black_tree.py} +183 -72
- exonware/xwnode/nodes/strategies/{node_roaring_bitmap.py → roaring_bitmap.py} +19 -6
- exonware/xwnode/nodes/strategies/rope.py +717 -0
- exonware/xwnode/nodes/strategies/{node_segment_tree.py → segment_tree.py} +106 -106
- exonware/xwnode/nodes/strategies/{node_set_hash.py → set_hash.py} +30 -29
- exonware/xwnode/nodes/strategies/{node_skip_list.py → skip_list.py} +74 -6
- exonware/xwnode/nodes/strategies/sparse_matrix.py +427 -131
- exonware/xwnode/nodes/strategies/{node_splay_tree.py → splay_tree.py} +55 -6
- exonware/xwnode/nodes/strategies/stack.py +244 -112
- exonware/xwnode/nodes/strategies/{node_suffix_array.py → suffix_array.py} +5 -1
- exonware/xwnode/nodes/strategies/t_tree.py +94 -0
- exonware/xwnode/nodes/strategies/{node_treap.py → treap.py} +75 -6
- exonware/xwnode/nodes/strategies/{node_tree_graph_hybrid.py → tree_graph_hybrid.py} +46 -5
- exonware/xwnode/nodes/strategies/trie.py +153 -9
- exonware/xwnode/nodes/strategies/union_find.py +111 -5
- exonware/xwnode/nodes/strategies/veb_tree.py +856 -0
- exonware/xwnode/strategies/__init__.py +5 -51
- exonware/xwnode/version.py +3 -3
- exonware_xwnode-0.0.1.24.dist-info/METADATA +900 -0
- exonware_xwnode-0.0.1.24.dist-info/RECORD +130 -0
- exonware/xwnode/edges/strategies/edge_adj_list.py +0 -353
- exonware/xwnode/edges/strategies/edge_adj_matrix.py +0 -445
- exonware/xwnode/nodes/strategies/_base_node.py +0 -307
- exonware/xwnode/nodes/strategies/node_aho_corasick.py +0 -525
- exonware/xwnode/nodes/strategies/node_array_list.py +0 -179
- exonware/xwnode/nodes/strategies/node_hash_map.py +0 -273
- exonware/xwnode/nodes/strategies/node_heap.py +0 -196
- exonware/xwnode/nodes/strategies/node_linked_list.py +0 -413
- exonware/xwnode/nodes/strategies/node_trie.py +0 -257
- exonware/xwnode/nodes/strategies/node_union_find.py +0 -192
- exonware/xwnode/queries/executors/__init__.py +0 -47
- exonware/xwnode/queries/executors/advanced/__init__.py +0 -37
- exonware/xwnode/queries/executors/advanced/aggregate_executor.py +0 -50
- exonware/xwnode/queries/executors/advanced/ask_executor.py +0 -50
- exonware/xwnode/queries/executors/advanced/construct_executor.py +0 -50
- exonware/xwnode/queries/executors/advanced/describe_executor.py +0 -50
- exonware/xwnode/queries/executors/advanced/for_loop_executor.py +0 -50
- exonware/xwnode/queries/executors/advanced/foreach_executor.py +0 -50
- exonware/xwnode/queries/executors/advanced/join_executor.py +0 -50
- exonware/xwnode/queries/executors/advanced/let_executor.py +0 -50
- exonware/xwnode/queries/executors/advanced/mutation_executor.py +0 -50
- exonware/xwnode/queries/executors/advanced/options_executor.py +0 -50
- exonware/xwnode/queries/executors/advanced/pipe_executor.py +0 -50
- exonware/xwnode/queries/executors/advanced/subscribe_executor.py +0 -50
- exonware/xwnode/queries/executors/advanced/subscription_executor.py +0 -50
- exonware/xwnode/queries/executors/advanced/union_executor.py +0 -50
- exonware/xwnode/queries/executors/advanced/window_executor.py +0 -51
- exonware/xwnode/queries/executors/advanced/with_cte_executor.py +0 -50
- exonware/xwnode/queries/executors/aggregation/__init__.py +0 -21
- exonware/xwnode/queries/executors/aggregation/avg_executor.py +0 -50
- exonware/xwnode/queries/executors/aggregation/count_executor.py +0 -38
- exonware/xwnode/queries/executors/aggregation/distinct_executor.py +0 -50
- exonware/xwnode/queries/executors/aggregation/group_executor.py +0 -50
- exonware/xwnode/queries/executors/aggregation/having_executor.py +0 -50
- exonware/xwnode/queries/executors/aggregation/max_executor.py +0 -50
- exonware/xwnode/queries/executors/aggregation/min_executor.py +0 -50
- exonware/xwnode/queries/executors/aggregation/sum_executor.py +0 -50
- exonware/xwnode/queries/executors/aggregation/summarize_executor.py +0 -50
- exonware/xwnode/queries/executors/array/__init__.py +0 -9
- exonware/xwnode/queries/executors/array/indexing_executor.py +0 -51
- exonware/xwnode/queries/executors/array/slicing_executor.py +0 -51
- exonware/xwnode/queries/executors/base.py +0 -257
- exonware/xwnode/queries/executors/capability_checker.py +0 -204
- exonware/xwnode/queries/executors/contracts.py +0 -166
- exonware/xwnode/queries/executors/core/__init__.py +0 -17
- exonware/xwnode/queries/executors/core/create_executor.py +0 -96
- exonware/xwnode/queries/executors/core/delete_executor.py +0 -99
- exonware/xwnode/queries/executors/core/drop_executor.py +0 -100
- exonware/xwnode/queries/executors/core/insert_executor.py +0 -39
- exonware/xwnode/queries/executors/core/select_executor.py +0 -152
- exonware/xwnode/queries/executors/core/update_executor.py +0 -102
- exonware/xwnode/queries/executors/data/__init__.py +0 -13
- exonware/xwnode/queries/executors/data/alter_executor.py +0 -50
- exonware/xwnode/queries/executors/data/load_executor.py +0 -50
- exonware/xwnode/queries/executors/data/merge_executor.py +0 -50
- exonware/xwnode/queries/executors/data/store_executor.py +0 -50
- exonware/xwnode/queries/executors/defs.py +0 -93
- exonware/xwnode/queries/executors/engine.py +0 -221
- exonware/xwnode/queries/executors/errors.py +0 -68
- exonware/xwnode/queries/executors/filtering/__init__.py +0 -25
- exonware/xwnode/queries/executors/filtering/between_executor.py +0 -80
- exonware/xwnode/queries/executors/filtering/filter_executor.py +0 -79
- exonware/xwnode/queries/executors/filtering/has_executor.py +0 -70
- exonware/xwnode/queries/executors/filtering/in_executor.py +0 -70
- exonware/xwnode/queries/executors/filtering/like_executor.py +0 -76
- exonware/xwnode/queries/executors/filtering/optional_executor.py +0 -76
- exonware/xwnode/queries/executors/filtering/range_executor.py +0 -80
- exonware/xwnode/queries/executors/filtering/term_executor.py +0 -77
- exonware/xwnode/queries/executors/filtering/values_executor.py +0 -71
- exonware/xwnode/queries/executors/filtering/where_executor.py +0 -44
- exonware/xwnode/queries/executors/graph/__init__.py +0 -15
- exonware/xwnode/queries/executors/graph/in_traverse_executor.py +0 -51
- exonware/xwnode/queries/executors/graph/match_executor.py +0 -51
- exonware/xwnode/queries/executors/graph/out_executor.py +0 -51
- exonware/xwnode/queries/executors/graph/path_executor.py +0 -51
- exonware/xwnode/queries/executors/graph/return_executor.py +0 -51
- exonware/xwnode/queries/executors/ordering/__init__.py +0 -9
- exonware/xwnode/queries/executors/ordering/by_executor.py +0 -50
- exonware/xwnode/queries/executors/ordering/order_executor.py +0 -51
- exonware/xwnode/queries/executors/projection/__init__.py +0 -9
- exonware/xwnode/queries/executors/projection/extend_executor.py +0 -50
- exonware/xwnode/queries/executors/projection/project_executor.py +0 -50
- exonware/xwnode/queries/executors/registry.py +0 -173
- exonware/xwnode/queries/parsers/__init__.py +0 -26
- exonware/xwnode/queries/parsers/base.py +0 -86
- exonware/xwnode/queries/parsers/contracts.py +0 -46
- exonware/xwnode/queries/parsers/errors.py +0 -53
- exonware/xwnode/queries/parsers/sql_param_extractor.py +0 -318
- exonware/xwnode/queries/strategies/__init__.py +0 -24
- exonware/xwnode/queries/strategies/base.py +0 -236
- exonware/xwnode/queries/strategies/cql.py +0 -201
- exonware/xwnode/queries/strategies/cypher.py +0 -181
- exonware/xwnode/queries/strategies/datalog.py +0 -70
- exonware/xwnode/queries/strategies/elastic_dsl.py +0 -70
- exonware/xwnode/queries/strategies/eql.py +0 -70
- exonware/xwnode/queries/strategies/flux.py +0 -70
- exonware/xwnode/queries/strategies/gql.py +0 -70
- exonware/xwnode/queries/strategies/graphql.py +0 -240
- exonware/xwnode/queries/strategies/gremlin.py +0 -181
- exonware/xwnode/queries/strategies/hiveql.py +0 -214
- exonware/xwnode/queries/strategies/hql.py +0 -70
- exonware/xwnode/queries/strategies/jmespath.py +0 -219
- exonware/xwnode/queries/strategies/jq.py +0 -66
- exonware/xwnode/queries/strategies/json_query.py +0 -66
- exonware/xwnode/queries/strategies/jsoniq.py +0 -248
- exonware/xwnode/queries/strategies/kql.py +0 -70
- exonware/xwnode/queries/strategies/linq.py +0 -238
- exonware/xwnode/queries/strategies/logql.py +0 -70
- exonware/xwnode/queries/strategies/mql.py +0 -68
- exonware/xwnode/queries/strategies/n1ql.py +0 -210
- exonware/xwnode/queries/strategies/partiql.py +0 -70
- exonware/xwnode/queries/strategies/pig.py +0 -215
- exonware/xwnode/queries/strategies/promql.py +0 -70
- exonware/xwnode/queries/strategies/sparql.py +0 -220
- exonware/xwnode/queries/strategies/sql.py +0 -275
- exonware/xwnode/queries/strategies/xml_query.py +0 -66
- exonware/xwnode/queries/strategies/xpath.py +0 -223
- exonware/xwnode/queries/strategies/xquery.py +0 -258
- exonware/xwnode/queries/strategies/xwnode_executor.py +0 -332
- exonware/xwnode/queries/strategies/xwquery.py +0 -456
- exonware_xwnode-0.0.1.22.dist-info/METADATA +0 -168
- exonware_xwnode-0.0.1.22.dist-info/RECORD +0 -214
- /exonware/xwnode/nodes/strategies/{node_ordered_map.py → ordered_map.py} +0 -0
- /exonware/xwnode/nodes/strategies/{node_ordered_map_balanced.py → ordered_map_balanced.py} +0 -0
- /exonware/xwnode/nodes/strategies/{node_patricia.py → patricia.py} +0 -0
- /exonware/xwnode/nodes/strategies/{node_radix_trie.py → radix_trie.py} +0 -0
- /exonware/xwnode/nodes/strategies/{node_set_tree.py → set_tree.py} +0 -0
- {exonware_xwnode-0.0.1.22.dist-info → exonware_xwnode-0.0.1.24.dist-info}/WHEEL +0 -0
- {exonware_xwnode-0.0.1.22.dist-info → exonware_xwnode-0.0.1.24.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,876 @@
|
|
1
|
+
"""
|
2
|
+
#exonware/xwnode/src/exonware/xwnode/nodes/strategies/dawg.py
|
3
|
+
|
4
|
+
DAWG (Directed Acyclic Word Graph) Node Strategy Implementation
|
5
|
+
|
6
|
+
This module implements the DAWG strategy for minimal automaton representation
|
7
|
+
of string sets with massive memory savings over standard tries.
|
8
|
+
|
9
|
+
Company: eXonware.com
|
10
|
+
Author: Eng. Muhammad AlShehri
|
11
|
+
Email: connect@exonware.com
|
12
|
+
Version: 0.0.1.24
|
13
|
+
Generation Date: 12-Oct-2025
|
14
|
+
"""
|
15
|
+
|
16
|
+
from typing import Any, Iterator, List, Dict, Optional, Set, Tuple
|
17
|
+
from collections import defaultdict
|
18
|
+
from .base import ANodeTreeStrategy
|
19
|
+
from .contracts import NodeType
|
20
|
+
from ...defs import NodeMode, NodeTrait
|
21
|
+
from ...errors import XWNodeError, XWNodeValueError
|
22
|
+
|
23
|
+
|
24
|
+
class DawgNode:
    """
    Node (state) in the DAWG structure.

    WHY suffix sharing:
    - Multiple words can share common suffixes
    - Structurally identical sub-graphs can be stored once and referenced
      from several parents, which is what makes a DAWG smaller than a trie

    Hash/equality contract:
    - ``__eq__`` and ``__hash__`` are both structural and both recurse into
      the children, so two nodes that compare equal always hash equal.
    - The hash is cached in ``_hash``. Call ``invalidate_hash()`` after
      changing ``edges``, ``is_final`` or ``value``. Cached hashes of
      ancestors are NOT refreshed automatically.
    """

    def __init__(self):
        """Initialize DAWG node."""
        # Outgoing transitions, keyed by a single character
        self.edges: Dict[str, 'DawgNode'] = {}
        # True when a stored word ends at this node
        self.is_final = False
        # Payload associated with the word ending here (None when unset)
        self.value: Any = None
        # Cached structural hash; None means "not computed / invalidated"
        self._hash: Optional[int] = None
        self._id = id(self)

    def __hash__(self) -> int:
        """
        Hash based on structure for suffix sharing.

        WHY structural hashing:
        - Identifies identical subtrees for merging
        - Must agree with ``__eq__``: children contribute their own
          structural hash rather than ``id()``, otherwise two equal nodes
          with distinct-but-equal children would hash differently
        - Unhashable payloads (list, dict, ...) must not make the node
          itself unhashable

        Returns:
            Cached structural hash of this node
        """
        if self._hash is None:
            edge_tuple = tuple(sorted(
                (char, hash(node)) for char, node in self.edges.items()
            ))
            try:
                value_hash = hash(self.value)
            except TypeError:
                # Unhashable value: fall back to a constant. Equality still
                # compares the real values, so this only costs collisions.
                value_hash = 0
            self._hash = hash((edge_tuple, self.is_final, value_hash))
        return self._hash

    def __eq__(self, other: Any) -> bool:
        """
        Structural equality for suffix sharing.

        WHY structural equality:
        - Two nodes with same structure can be merged
        - Enables automatic suffix compression

        Args:
            other: Object to compare with

        Returns:
            True if other is a DawgNode with the same final flag, value and
            (recursively) equal outgoing edges
        """
        if self is other:
            # Shared nodes are common in a DAWG; skip the recursive walk
            return True

        if not isinstance(other, DawgNode):
            return False

        if self.is_final != other.is_final:
            return False

        if self.value != other.value:
            return False

        if self.edges.keys() != other.edges.keys():
            return False

        return all(
            node == other.edges[char] for char, node in self.edges.items()
        )

    def __repr__(self) -> str:
        """Return a short debugging representation (does not recurse)."""
        return (
            f"{type(self).__name__}(edges={list(self.edges)!r}, "
            f"is_final={self.is_final!r}, value={self.value!r})"
        )

    def invalidate_hash(self) -> None:
        """Invalidate cached hash after modification."""
        self._hash = None
|
90
|
+
|
91
|
+
|
92
|
+
class DawgStrategy(ANodeTreeStrategy):
|
93
|
+
"""
|
94
|
+
DAWG (Directed Acyclic Word Graph) strategy for minimal string storage.
|
95
|
+
|
96
|
+
WHY DAWG:
|
97
|
+
- 10-100x memory reduction vs standard trie through suffix sharing
|
98
|
+
- Perfect for large dictionaries, lexicons, spell checkers
|
99
|
+
- Fast prefix queries while using minimal space
|
100
|
+
- Deterministic automaton enables efficient string matching
|
101
|
+
- Excellent for autocomplete with memory constraints
|
102
|
+
|
103
|
+
WHY this implementation:
|
104
|
+
- Incremental construction allows online updates
|
105
|
+
- Structural hashing enables automatic suffix detection
|
106
|
+
- Final state markers support both sets and maps
|
107
|
+
- Value storage enables key-value DAWG variant
|
108
|
+
- Lazy minimization balances construction time and space
|
109
|
+
|
110
|
+
Time Complexity:
|
111
|
+
- Insert: O(k) where k is string length (amortized with minimization)
|
112
|
+
- Search: O(k) where k is string length
|
113
|
+
- Prefix query: O(k + m) where m is result size
|
114
|
+
- Delete: O(k) with lazy minimization
|
115
|
+
- Minimization: O(n log n) where n is total nodes
|
116
|
+
|
117
|
+
Space Complexity: O(c) where c is total unique characters across all suffixes
|
118
|
+
(10-100x smaller than trie which is O(alphabet_size × total_chars))
|
119
|
+
|
120
|
+
Trade-offs:
|
121
|
+
- Advantage: Massive space savings (10-100x vs trie)
|
122
|
+
- Advantage: Still O(k) lookups like trie
|
123
|
+
- Advantage: Perfect for read-heavy dictionary workloads
|
124
|
+
- Limitation: Construction more complex than trie
|
125
|
+
- Limitation: Minimization step adds overhead
|
126
|
+
- Limitation: Best for static or slowly-changing dictionaries
|
127
|
+
- Compared to Trie: Much smaller, same lookup speed
|
128
|
+
- Compared to HashMap: Supports prefix queries, more memory efficient
|
129
|
+
|
130
|
+
Best for:
|
131
|
+
- Large dictionaries and lexicons (>100k words)
|
132
|
+
- Spell checkers and autocomplete systems
|
133
|
+
- Natural language processing applications
|
134
|
+
- Genomics sequence storage
|
135
|
+
- Memory-constrained environments
|
136
|
+
- Read-heavy string matching workloads
|
137
|
+
|
138
|
+
Not recommended for:
|
139
|
+
- Small string sets (<1000 words) - overhead not worth it
|
140
|
+
- Frequently updated dictionaries - minimization expensive
|
141
|
+
- Non-string keys
|
142
|
+
- Random access by index (use array instead)
|
143
|
+
- When trie memory usage is acceptable
|
144
|
+
- Real-time insertion requirements
|
145
|
+
|
146
|
+
Following eXonware Priorities:
|
147
|
+
1. Security: Validates string inputs, prevents malicious data
|
148
|
+
2. Usability: Simple API for dictionary operations, clear errors
|
149
|
+
3. Maintainability: Clean automaton structure, well-documented
|
150
|
+
4. Performance: O(k) operations with minimal memory
|
151
|
+
5. Extensibility: Easy to add pattern matching, fuzzy search
|
152
|
+
|
153
|
+
Industry Best Practices:
|
154
|
+
- Follows Daciuk et al. incremental construction algorithm
|
155
|
+
- Implements structural hashing for suffix detection
|
156
|
+
- Supports both DAWG (set) and DAFSA (map) variants
|
157
|
+
- Provides lazy minimization for performance
|
158
|
+
- Compatible with Aho-Corasick for multi-pattern matching
|
159
|
+
"""
|
160
|
+
|
161
|
+
# Tree node type for classification
|
162
|
+
STRATEGY_TYPE: NodeType = NodeType.TREE
|
163
|
+
|
164
|
+
def __init__(self, mode: NodeMode = NodeMode.DAWG,
             traits: NodeTrait = NodeTrait.NONE, **options):
    """
    Set up an empty DAWG.

    Args:
        mode: Node mode, forwarded to the base strategy (defaults to DAWG)
        traits: Node traits, forwarded to the base strategy
        **options: Extra keyword options, forwarded to the base strategy
    """
    super().__init__(mode, traits, **options)

    # Bookkeeping for incremental minimization:
    #   _previous_word   - last word handed to the insert routine
    #   _minimized_nodes - register mapping a node to its canonical twin
    #   _unchecked_nodes - (parent, char, child) triples not yet minimized
    self._previous_word = ""
    self._minimized_nodes: Dict[DawgNode, DawgNode] = {}
    self._unchecked_nodes: List[Tuple[DawgNode, str, DawgNode]] = []

    # Counters start at zero for an empty structure
    self._word_count = 0
    self._size = 0

    # The automaton itself starts as a single, non-final root state
    self._root = DawgNode()
|
184
|
+
|
185
|
+
def get_supported_traits(self) -> NodeTrait:
    """Return the combined trait flags this strategy advertises."""
    supported = NodeTrait.HIERARCHICAL
    supported = supported | NodeTrait.INDEXED
    supported = supported | NodeTrait.MEMORY_EFFICIENT
    supported = supported | NodeTrait.PREFIX_TREE
    return supported
|
189
|
+
|
190
|
+
# ============================================================================
|
191
|
+
# CORE OPERATIONS
|
192
|
+
# ============================================================================
|
193
|
+
|
194
|
+
def put(self, key: Any, value: Any = None) -> None:
    """
    Insert word into DAWG, or update the value of an existing word.

    Args:
        key: String key (word)
        value: Associated value

    Raises:
        XWNodeValueError: If key is not a string or is the empty string
    """
    # Security: Type validation
    if not isinstance(key, str):
        raise XWNodeValueError(
            f"DAWG requires string keys, got {type(key).__name__}"
        )

    # Security: Empty string validation
    if not key:
        raise XWNodeValueError("DAWG does not support empty string keys")

    # Check membership BEFORE inserting: overwriting an existing word must
    # not grow the counters, otherwise len() drifts above the real count.
    is_new_key = not self.has(key)

    # Incremental insertion with minimization
    self._insert_with_minimization(key, value)

    if is_new_key:
        self._size += 1
        self._word_count += 1
|
219
|
+
|
220
|
+
def _insert_with_minimization(self, word: str, value: Any) -> None:
    """
    Insert word using incremental minimization algorithm.

    WHY incremental minimization:
    - Maintains DAWG property during construction
    - Avoids full reconstruction after each insert
    - Balances construction time and space efficiency

    The unchecked path (``_unchecked_nodes``) is treated as the source of
    truth for "the previous word": the common prefix is measured against
    the characters stored on that path, not against ``_previous_word``.
    This keeps insertion correct after ``finish_construction()`` has
    emptied the path, and after nodes on it have been pruned.

    Words may arrive in any order. When the walk meets an edge that
    already exists, the target node is copied before being modified,
    because a node reached through an existing edge may be shared by
    other words after minimization.

    Args:
        word: Non-empty string to insert
        value: Value to store on the word's final node
    """
    unchecked = self._unchecked_nodes

    # Drop trailing entries that no longer describe a live path from the
    # root (their edge was removed); extending them would attach the new
    # word to a detached node.
    live = 0
    expected_parent = self._root
    for parent, char, child in unchecked:
        if parent is not expected_parent or parent.edges.get(char) is not child:
            break
        expected_parent = child
        live += 1
    del unchecked[live:]

    # Find common prefix with the word described by the unchecked path
    common_prefix_len = 0
    for (_, path_char, _), word_char in zip(unchecked, word):
        if path_char != word_char:
            break
        common_prefix_len += 1

    # Minimize nodes from previous word (pops from the same list object)
    self._minimize(common_prefix_len)

    # Resume from the last node both words share; it is either the root or
    # a node still on the unchecked path, so it is safe to modify in place.
    if common_prefix_len:
        current_node = unchecked[common_prefix_len - 1][2]
    else:
        current_node = self._root

    # Add suffix for current word
    for char in word[common_prefix_len:]:
        existing = current_node.edges.get(char)
        next_node = DawgNode()
        if existing is not None:
            # Copy-on-write: keep everything already reachable through
            # this edge, but never modify the (possibly shared) original.
            next_node.edges = dict(existing.edges)
            next_node.is_final = existing.is_final
            next_node.value = existing.value
        current_node.edges[char] = next_node
        current_node.invalidate_hash()
        unchecked.append((current_node, char, next_node))
        current_node = next_node

    # Mark as final and store value
    current_node.is_final = True
    current_node.value = value
    current_node.invalidate_hash()
    self._previous_word = word
|
256
|
+
|
257
|
+
def _minimize(self, down_to: int) -> None:
    """
    Shrink the unchecked path until it holds at most ``down_to`` entries.

    Args:
        down_to: Number of leading unchecked entries to keep

    WHY minimization:
    - Each popped child is looked up in the register of already
      minimized nodes (lookup uses the node's own hash/equality)
    - On a hit the parent's edge is redirected to the registered node
    - On a miss the child becomes the registered representative
    """
    pending = self._unchecked_nodes
    registry = self._minimized_nodes

    while len(pending) > down_to:
        parent, char, child = pending.pop()

        try:
            canonical = registry[child]
        except KeyError:
            # First node of this shape: it becomes the representative
            registry[child] = child
        else:
            # An equivalent node is already registered: share it
            parent.edges[char] = canonical

        # The parent's cached hash is dropped in both cases
        parent.invalidate_hash()
|
282
|
+
|
283
|
+
def finish_construction(self) -> None:
    """
    Flush the whole unchecked path through minimization.

    WHY explicit finish:
    - Nodes of the most recently inserted word stay unchecked until
      another insert or this call processes them
    - Intended to be called once a batch of inserts is complete

    Only ``_minimize`` is invoked here; ``_previous_word`` is left as is.
    """
    keep_none = 0
    self._minimize(keep_none)
|
293
|
+
|
294
|
+
def get(self, key: Any, default: Any = None) -> Any:
    """
    Look up the value stored for a word.

    Args:
        key: String key
        default: Returned when the key is not a string, the path does not
            exist, or the path ends on a non-final node

    Returns:
        Stored value for the word, or default
    """
    if isinstance(key, str):
        node = self._root

        # Follow one edge per character; a missing edge means "not stored"
        for symbol in key:
            node = node.edges.get(symbol)
            if node is None:
                return default

        # Only final nodes represent complete words
        if node.is_final:
            return node.value

    return default
|
321
|
+
|
322
|
+
def has(self, key: Any) -> bool:
    """
    Tell whether a word is stored.

    Args:
        key: String key

    Returns:
        The final flag of the node reached by the key, or False when the
        key is not a string or its path does not exist
    """
    if not isinstance(key, str):
        return False

    node = self._root
    try:
        # Walk the automaton; a missing edge surfaces as KeyError
        for symbol in key:
            node = node.edges[symbol]
    except KeyError:
        return False

    return node.is_final
|
344
|
+
|
345
|
+
def delete(self, key: Any) -> bool:
    """
    Remove key from DAWG.

    Args:
        key: String key

    Returns:
        True if deleted, False if not found

    Note: This is a simplified deletion. After minimization a node can be
    reachable from several parents, so the nodes on the key's path are
    copied before being changed (copy-on-write). Only nodes that are
    still on the unchecked path are modified in place, because those have
    not been registered for sharing yet. Copies are not re-minimized, so
    compression may be sub-optimal after deletions.
    """
    if not isinstance(key, str):
        return False

    # Locate the word first so that a miss leaves the structure untouched
    node = self._root
    for char in key:
        node = node.edges.get(char)
        if node is None:
            return False

    # Check if it's a final node
    if not node.is_final:
        return False

    unchecked = self._unchecked_nodes
    # Nodes still awaiting minimization are private to one parent
    private_ids = {id(child) for _, _, child in unchecked}

    # Re-walk the path, giving this key its own private copy of every
    # node that might be shared with other words.
    path = []
    parent = self._root
    for char in key:
        child = parent.edges[char]
        if id(child) not in private_ids:
            clone = type(child)()
            clone.edges = dict(child.edges)
            clone.is_final = child.is_final
            clone.value = child.value
            parent.edges[char] = clone
            parent.invalidate_hash()
            child = clone
        path.append((parent, char, child))
        parent = child

    # Unmark as final (parent is now the private terminal node)
    parent.is_final = False
    parent.value = None
    parent.invalidate_hash()

    # Remove nodes if they have no children and aren't final
    for owner, char, child in reversed(path):
        if child.edges or child.is_final:
            break
        del owner.edges[char]
        owner.invalidate_hash()

    # Keep the unchecked path consistent with the tree: entries whose edge
    # was just pruned must go, or a later insert would extend dead nodes.
    live = 0
    expected_parent = self._root
    for owner, char, child in unchecked:
        if owner is not expected_parent or owner.edges.get(char) is not child:
            break
        expected_parent = child
        live += 1
    if live < len(unchecked):
        del unchecked[live:]
        self._previous_word = self._previous_word[:live]

    self._size -= 1
    self._word_count -= 1
    return True
|
394
|
+
|
395
|
+
def keys(self) -> Iterator[Any]:
    """
    Iterate over every stored word.

    Returns:
        Iterator of string keys, in the order produced by
        ``_collect_words`` started at the root with an empty prefix
    """
    for word in self._collect_words(self._root, ""):
        yield word
|
403
|
+
|
404
|
+
def _collect_words(self, node: "DawgNode", prefix: str) -> Iterator[str]:
    """
    Collect all words reachable from node.

    Uses an explicit stack instead of recursion so that the traversal
    depth is not limited by the interpreter's recursion limit (a key of a
    few thousand characters would otherwise raise RecursionError).

    Args:
        node: Current DAWG node
        prefix: Current prefix string

    Yields:
        Complete words in lexicographic order (a word is yielded before
        any longer word that extends it)
    """
    stack = [(node, prefix)]
    while stack:
        current, word = stack.pop()

        if current.is_final:
            yield word

        # Push children in descending order so the smallest character is
        # popped, and therefore visited, first.
        edges = current.edges
        for char in sorted(edges, reverse=True):
            stack.append((edges[char], word + char))
|
421
|
+
|
422
|
+
def values(self) -> Iterator[Any]:
    """
    Iterate over the stored values, following the order of ``keys()``.

    Returns:
        Iterator of values, one ``get`` lookup per key
    """
    yield from (self.get(word) for word in self.keys())
|
431
|
+
|
432
|
+
def items(self) -> Iterator[tuple[Any, Any]]:
    """
    Iterate over (key, value) pairs, following the order of ``keys()``.

    Returns:
        Iterator of (key, value) tuples, one ``get`` lookup per key
    """
    for word in self.keys():
        payload = self.get(word)
        yield word, payload
|
441
|
+
|
442
|
+
def __len__(self) -> int:
    """Return the current value of the word counter."""
    word_total = self._word_count
    return word_total
|
445
|
+
|
446
|
+
def to_native(self) -> Any:
    """
    Export the contents as a plain Python dict.

    Returns:
        Dictionary built from the pairs produced by ``items()``
    """
    return {word: payload for word, payload in self.items()}
|
454
|
+
|
455
|
+
# ============================================================================
|
456
|
+
# DAWG-SPECIFIC OPERATIONS
|
457
|
+
# ============================================================================
|
458
|
+
|
459
|
+
def has_prefix(self, prefix: str) -> bool:
    """
    Tell whether the DAWG contains a path spelling out prefix.

    An empty prefix always matches because no edge has to be followed.

    Args:
        prefix: Prefix string to check

    Returns:
        True if prefix exists, False otherwise

    Raises:
        XWNodeValueError: If prefix is not a string
    """
    if not isinstance(prefix, str):
        raise XWNodeValueError(
            f"Prefix must be string, got {type(prefix).__name__}"
        )

    cursor = self._root
    for symbol in prefix:
        if symbol in cursor.edges:
            cursor = cursor.edges[symbol]
        else:
            # The path breaks here, so no stored word has this prefix.
            return False

    return True
|
485
|
+
|
486
|
+
def get_with_prefix(self, prefix: str) -> List[str]:
    """
    List every stored word that begins with prefix.

    Args:
        prefix: Prefix string

    Returns:
        List of words starting with prefix

    Raises:
        XWNodeValueError: If prefix is not a string
    """
    if not isinstance(prefix, str):
        raise XWNodeValueError(
            f"Prefix must be string, got {type(prefix).__name__}"
        )

    # Walk down to the node that the prefix leads to.
    cursor = self._root
    for symbol in prefix:
        if symbol in cursor.edges:
            cursor = cursor.edges[symbol]
        else:
            return []

    # Everything below that node is a word carrying the prefix.
    matches = self._collect_words(cursor, prefix)
    return list(matches)
|
513
|
+
|
514
|
+
def longest_prefix(self, text: str) -> Optional[str]:
    """
    Find the longest stored word that is a prefix of text.

    Only positions reached after consuming at least one character are
    considered, so the result is None when no such word exists.

    Args:
        text: Text to search

    Returns:
        Longest matching prefix or None

    Raises:
        XWNodeValueError: If text is not a string
    """
    if not isinstance(text, str):
        raise XWNodeValueError(
            f"Text must be string, got {type(text).__name__}"
        )

    cursor = self._root
    best = None

    for end, symbol in enumerate(text, start=1):
        if symbol not in cursor.edges:
            break
        cursor = cursor.edges[symbol]

        # Remember the most recent position that ends a complete word.
        if cursor.is_final:
            best = text[:end]

    return best
|
546
|
+
|
547
|
+
def count_words_with_prefix(self, prefix: str) -> int:
    """
    Count the stored words that begin with prefix.

    Delegates to get_with_prefix() and measures the resulting list.

    Args:
        prefix: Prefix string

    Returns:
        Number of words with prefix
    """
    matches = self.get_with_prefix(prefix)
    return len(matches)
|
558
|
+
|
559
|
+
# ============================================================================
|
560
|
+
# COMPRESSION STATISTICS
|
561
|
+
# ============================================================================
|
562
|
+
|
563
|
+
def get_node_count(self) -> int:
    """
    Count the distinct nodes reachable from the root.

    Returns:
        Number of nodes
    """
    # A fresh id set makes sure nodes shared between paths count once.
    seen_ids: Set[int] = set()
    total = self._count_nodes(self._root, seen_ids)
    return total
|
572
|
+
|
573
|
+
def _count_nodes(self, node: DawgNode, visited: Set[int]) -> int:
|
574
|
+
"""
|
575
|
+
Recursively count unique nodes.
|
576
|
+
|
577
|
+
Args:
|
578
|
+
node: Current node
|
579
|
+
visited: Set of visited node IDs
|
580
|
+
|
581
|
+
Returns:
|
582
|
+
Node count
|
583
|
+
"""
|
584
|
+
if node._id in visited:
|
585
|
+
return 0
|
586
|
+
|
587
|
+
visited.add(node._id)
|
588
|
+
count = 1
|
589
|
+
|
590
|
+
for child in node.edges.values():
|
591
|
+
count += self._count_nodes(child, visited)
|
592
|
+
|
593
|
+
return count
|
594
|
+
|
595
|
+
def get_compression_ratio(self) -> float:
    """
    Estimate how much smaller the DAWG is than a standard trie.

    The trie size is approximated by the summed length of all stored
    words; that figure is divided by the actual DAWG node count.

    Returns:
        Estimated compression ratio (1.0 when there are no words or
        no nodes)

    WHY this matters:
    - Quantifies space savings
    - Validates DAWG effectiveness
    - Helps choose between DAWG and trie
    """
    if self._word_count == 0:
        return 1.0

    # Estimate trie nodes (sum of word lengths)
    trie_nodes = 0
    for word in self.keys():
        trie_nodes += len(word)

    # Actual DAWG nodes
    dawg_nodes = self.get_node_count()

    return trie_nodes / dawg_nodes if dawg_nodes != 0 else 1.0
|
620
|
+
|
621
|
+
def get_statistics(self) -> Dict[str, Any]:
    """
    Get comprehensive DAWG statistics.

    The compression ratio is computed once and reused for both the
    'compression_ratio' and 'memory_saved_percent' entries.

    Returns:
        Statistics dictionary with the keys 'word_count', 'node_count',
        'compression_ratio', 'minimized_nodes', 'unchecked_nodes' and
        'memory_saved_percent'
    """
    ratio = self.get_compression_ratio()

    # The ratio is 0 when every stored word has length 0; dividing by it
    # would raise ZeroDivisionError, so report no savings in that case.
    if ratio > 0:
        saved_percent = (1 - 1 / ratio) * 100
    else:
        saved_percent = 0.0

    return {
        'word_count': self._word_count,
        'node_count': self.get_node_count(),
        'compression_ratio': ratio,
        'minimized_nodes': len(self._minimized_nodes),
        'unchecked_nodes': len(self._unchecked_nodes),
        'memory_saved_percent': saved_percent
    }
|
636
|
+
|
637
|
+
# ============================================================================
|
638
|
+
# BULK OPERATIONS
|
639
|
+
# ============================================================================
|
640
|
+
|
641
|
+
def build_from_sorted_words(self, words: List[str], values: Optional[List[Any]] = None) -> None:
    """
    Rebuild the DAWG from an already sorted word list.

    All input is validated before the existing content is cleared, so
    a rejected call leaves the current data untouched.

    Args:
        words: Sorted list of words
        values: Optional list of values (must match words length)

    Raises:
        XWNodeValueError: If words not sorted or values length mismatch

    WHY sorted requirement:
    - Enables incremental minimization algorithm
    - Ensures optimal compression
    - O(n) construction vs O(n log n) for unsorted
    """
    # Security: Validation
    if any(not isinstance(w, str) for w in words):
        raise XWNodeValueError("All words must be strings")

    # Check sorted: locate the first adjacent pair that is out of order.
    first_unsorted = next(
        (pos for pos in range(len(words) - 1) if words[pos] > words[pos + 1]),
        None,
    )
    if first_unsorted is not None:
        i = first_unsorted
        raise XWNodeValueError(
            f"Words must be sorted, but '{words[i]}' > '{words[i+1]}'"
        )

    if values is not None and len(values) != len(words):
        raise XWNodeValueError(
            f"Values length ({len(values)}) must match words length ({len(words)})"
        )

    # Clear existing data
    self.clear()

    # Insert all words; without values every word is stored with None.
    payloads = values if values else [None] * len(words)
    for word, value in zip(words, payloads):
        self.put(word, value)

    # Final minimization
    self.finish_construction()
|
683
|
+
|
684
|
+
def get(self, key: Any, default: Any = None) -> Any:
    """
    Look up the value stored for key.

    The default is returned for non-string keys, for keys that are not
    stored as complete words, and for words whose stored value is None.

    Args:
        key: String key
        default: Default value

    Returns:
        Value or default
    """
    if not isinstance(key, str):
        return default

    cursor = self._root
    for symbol in key:
        if symbol not in cursor.edges:
            return default
        cursor = cursor.edges[symbol]

    if not cursor.is_final:
        return default

    stored = cursor.value
    return default if stored is None else stored
|
709
|
+
|
710
|
+
def has(self, key: Any) -> bool:
    """Report whether key is stored as a complete word (non-strings never are)."""
    if isinstance(key, str):
        cursor = self._root
        for symbol in key:
            if symbol in cursor.edges:
                cursor = cursor.edges[symbol]
            else:
                return False
        # A reachable path only counts when it ends on a word boundary.
        return cursor.is_final

    return False
|
723
|
+
|
724
|
+
# ============================================================================
|
725
|
+
# PATTERN MATCHING
|
726
|
+
# ============================================================================
|
727
|
+
|
728
|
+
def fuzzy_search(self, word: str, max_distance: int = 1) -> List[str]:
    """
    Find words within edit distance.

    A stored word can be reached through several different edit paths
    (for example as an exact match and again via a substitution); each
    word is reported only once, in order of first discovery.

    Args:
        word: Search word
        max_distance: Maximum Levenshtein distance

    Returns:
        List of matching words without duplicates

    WHY fuzzy search:
    - Essential for spell checkers
    - Handles typos in autocomplete
    - Improves usability
    """
    # dict keys act as an insertion-ordered set of matches.
    found = {}

    def _fuzzy_helper(node: "DawgNode", prefix: str,
                      remaining: str, distance: int) -> None:
        """Recursive fuzzy matching."""
        # Search word fully consumed
        if not remaining:
            if node.is_final and distance <= max_distance:
                found.setdefault(prefix, None)
            # Keep descending: extra trailing characters in the stored
            # word each cost one edit.
            if distance < max_distance:
                for char, child in node.edges.items():
                    _fuzzy_helper(child, prefix + char, "", distance + 1)
            return

        # Exact match of the next character costs nothing
        if remaining[0] in node.edges:
            _fuzzy_helper(
                node.edges[remaining[0]],
                prefix + remaining[0],
                remaining[1:],
                distance
            )

        # Try edits if distance allows
        if distance < max_distance:
            # Skip a character of the search word
            _fuzzy_helper(node, prefix, remaining[1:], distance + 1)

            for char, child in node.edges.items():
                # Replace the next search character with this edge label
                _fuzzy_helper(child, prefix + char, remaining[1:], distance + 1)
                # Follow this edge without consuming a search character
                _fuzzy_helper(child, prefix + char, remaining, distance + 1)

    _fuzzy_helper(self._root, "", word, 0)
    return list(found)
|
782
|
+
|
783
|
+
# ============================================================================
|
784
|
+
# UTILITY METHODS
|
785
|
+
# ============================================================================
|
786
|
+
|
787
|
+
def clear(self) -> None:
    """Reset the structure to its empty state."""
    # Drop the construction bookkeeping first ...
    self._minimized_nodes.clear()
    self._unchecked_nodes.clear()
    self._previous_word = ""

    # ... then zero the counters and install a fresh root node.
    self._word_count = 0
    self._size = 0
    self._root = DawgNode()
|
795
|
+
|
796
|
+
def is_empty(self) -> bool:
    """Return True when no words are stored."""
    return not self._word_count
|
799
|
+
|
800
|
+
def size(self) -> int:
    """Return the number of stored words."""
    word_total = self._word_count
    return word_total
|
803
|
+
|
804
|
+
def get_mode(self) -> NodeMode:
    """Return the strategy mode held in the mode attribute."""
    strategy_mode = self.mode
    return strategy_mode
|
807
|
+
|
808
|
+
def get_traits(self) -> NodeTrait:
    """Return the strategy traits held in the traits attribute."""
    strategy_traits = self.traits
    return strategy_traits
|
811
|
+
|
812
|
+
# ============================================================================
|
813
|
+
# COMPATIBILITY METHODS
|
814
|
+
# ============================================================================
|
815
|
+
|
816
|
+
def find(self, key: Any) -> Optional[Any]:
    """Look up key via get(); compatibility alias."""
    found = self.get(key)
    return found
|
819
|
+
|
820
|
+
def insert(self, key: Any, value: Any = None) -> None:
    """Store a key-value pair via put(); compatibility alias."""
    store = self.put
    store(key, value)
|
823
|
+
|
824
|
+
def __str__(self) -> str:
|
825
|
+
"""String representation."""
|
826
|
+
stats = self.get_statistics()
|
827
|
+
return (f"DawgStrategy(words={stats['word_count']}, "
|
828
|
+
f"nodes={stats['node_count']}, "
|
829
|
+
f"compression={stats['compression_ratio']:.1f}x)")
|
830
|
+
|
831
|
+
def __repr__(self) -> str:
|
832
|
+
"""Detailed representation."""
|
833
|
+
return f"DawgStrategy(mode={self.mode.name}, words={self._word_count}, traits={self.traits})"
|
834
|
+
|
835
|
+
# ============================================================================
|
836
|
+
# FACTORY METHOD
|
837
|
+
# ============================================================================
|
838
|
+
|
839
|
+
@classmethod
def create_from_data(cls, data: Any) -> 'DawgStrategy':
    """
    Create DAWG from data.

    Dict keys are validated before they are sorted, so a dict with
    non-string keys is rejected with XWNodeValueError rather than
    with the TypeError that sorting mixed key types would raise.

    Args:
        data: Dictionary with string keys or list of strings; list and
            tuple items are converted with str(); any other value is
            stored as a single word under str(data)

    Returns:
        New DawgStrategy instance

    Raises:
        XWNodeValueError: If data contains non-string keys
    """
    instance = cls()

    if isinstance(data, dict):
        # Validate first: sorted() cannot compare str with other types.
        for key in data:
            if not isinstance(key, str):
                raise XWNodeValueError(
                    f"DAWG requires string keys, found {type(key).__name__}"
                )

        # Sort keys for optimal compression
        for key in sorted(data):
            instance.put(key, data[key])
        instance.finish_construction()
    elif isinstance(data, (list, tuple)):
        # Treat as list of strings (set variant)
        for word in sorted(str(item) for item in data):
            instance.put(word, None)
        instance.finish_construction()
    else:
        # Store scalar as single word
        instance.put(str(data), data)

    return instance
|
876
|
+
|