maque-0.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. maque/__init__.py +30 -0
  2. maque/__main__.py +926 -0
  3. maque/ai_platform/__init__.py +0 -0
  4. maque/ai_platform/crawl.py +45 -0
  5. maque/ai_platform/metrics.py +258 -0
  6. maque/ai_platform/nlp_preprocess.py +67 -0
  7. maque/ai_platform/webpage_screen_shot.py +195 -0
  8. maque/algorithms/__init__.py +78 -0
  9. maque/algorithms/bezier.py +15 -0
  10. maque/algorithms/bktree.py +117 -0
  11. maque/algorithms/core.py +104 -0
  12. maque/algorithms/hilbert.py +16 -0
  13. maque/algorithms/rate_function.py +92 -0
  14. maque/algorithms/transform.py +27 -0
  15. maque/algorithms/trie.py +272 -0
  16. maque/algorithms/utils.py +63 -0
  17. maque/algorithms/video.py +587 -0
  18. maque/api/__init__.py +1 -0
  19. maque/api/common.py +110 -0
  20. maque/api/fetch.py +26 -0
  21. maque/api/static/icon.png +0 -0
  22. maque/api/static/redoc.standalone.js +1782 -0
  23. maque/api/static/swagger-ui-bundle.js +3 -0
  24. maque/api/static/swagger-ui.css +3 -0
  25. maque/cli/__init__.py +1 -0
  26. maque/cli/clean_invisible_chars.py +324 -0
  27. maque/cli/core.py +34 -0
  28. maque/cli/groups/__init__.py +26 -0
  29. maque/cli/groups/config.py +205 -0
  30. maque/cli/groups/data.py +615 -0
  31. maque/cli/groups/doctor.py +259 -0
  32. maque/cli/groups/embedding.py +222 -0
  33. maque/cli/groups/git.py +29 -0
  34. maque/cli/groups/help.py +410 -0
  35. maque/cli/groups/llm.py +223 -0
  36. maque/cli/groups/mcp.py +241 -0
  37. maque/cli/groups/mllm.py +1795 -0
  38. maque/cli/groups/mllm_simple.py +60 -0
  39. maque/cli/groups/quant.py +210 -0
  40. maque/cli/groups/service.py +490 -0
  41. maque/cli/groups/system.py +570 -0
  42. maque/cli/mllm_run.py +1451 -0
  43. maque/cli/script.py +52 -0
  44. maque/cli/tree.py +49 -0
  45. maque/clustering/__init__.py +52 -0
  46. maque/clustering/analyzer.py +347 -0
  47. maque/clustering/clusterers.py +464 -0
  48. maque/clustering/sampler.py +134 -0
  49. maque/clustering/visualizer.py +205 -0
  50. maque/constant.py +13 -0
  51. maque/core.py +133 -0
  52. maque/cv/__init__.py +1 -0
  53. maque/cv/image.py +219 -0
  54. maque/cv/utils.py +68 -0
  55. maque/cv/video/__init__.py +3 -0
  56. maque/cv/video/keyframe_extractor.py +368 -0
  57. maque/embedding/__init__.py +43 -0
  58. maque/embedding/base.py +56 -0
  59. maque/embedding/multimodal.py +308 -0
  60. maque/embedding/server.py +523 -0
  61. maque/embedding/text.py +311 -0
  62. maque/git/__init__.py +24 -0
  63. maque/git/pure_git.py +912 -0
  64. maque/io/__init__.py +29 -0
  65. maque/io/core.py +38 -0
  66. maque/io/ops.py +194 -0
  67. maque/llm/__init__.py +111 -0
  68. maque/llm/backend.py +416 -0
  69. maque/llm/base.py +411 -0
  70. maque/llm/server.py +366 -0
  71. maque/mcp_server.py +1096 -0
  72. maque/mllm_data_processor_pipeline/__init__.py +17 -0
  73. maque/mllm_data_processor_pipeline/core.py +341 -0
  74. maque/mllm_data_processor_pipeline/example.py +291 -0
  75. maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
  76. maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
  77. maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
  78. maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
  79. maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
  80. maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
  81. maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
  82. maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
  83. maque/mllm_data_processor_pipeline/web_app.py +317 -0
  84. maque/nlp/__init__.py +14 -0
  85. maque/nlp/ngram.py +9 -0
  86. maque/nlp/parser.py +63 -0
  87. maque/nlp/risk_matcher.py +543 -0
  88. maque/nlp/sentence_splitter.py +202 -0
  89. maque/nlp/simple_tradition_cvt.py +31 -0
  90. maque/performance/__init__.py +21 -0
  91. maque/performance/_measure_time.py +70 -0
  92. maque/performance/_profiler.py +367 -0
  93. maque/performance/_stat_memory.py +51 -0
  94. maque/pipelines/__init__.py +15 -0
  95. maque/pipelines/clustering.py +252 -0
  96. maque/quantization/__init__.py +42 -0
  97. maque/quantization/auto_round.py +120 -0
  98. maque/quantization/base.py +145 -0
  99. maque/quantization/bitsandbytes.py +127 -0
  100. maque/quantization/llm_compressor.py +102 -0
  101. maque/retriever/__init__.py +35 -0
  102. maque/retriever/chroma.py +654 -0
  103. maque/retriever/document.py +140 -0
  104. maque/retriever/milvus.py +1140 -0
  105. maque/table_ops/__init__.py +1 -0
  106. maque/table_ops/core.py +133 -0
  107. maque/table_viewer/__init__.py +4 -0
  108. maque/table_viewer/download_assets.py +57 -0
  109. maque/table_viewer/server.py +698 -0
  110. maque/table_viewer/static/element-plus-icons.js +5791 -0
  111. maque/table_viewer/static/element-plus.css +1 -0
  112. maque/table_viewer/static/element-plus.js +65236 -0
  113. maque/table_viewer/static/main.css +268 -0
  114. maque/table_viewer/static/main.js +669 -0
  115. maque/table_viewer/static/vue.global.js +18227 -0
  116. maque/table_viewer/templates/index.html +401 -0
  117. maque/utils/__init__.py +56 -0
  118. maque/utils/color.py +68 -0
  119. maque/utils/color_string.py +45 -0
  120. maque/utils/compress.py +66 -0
  121. maque/utils/constant.py +183 -0
  122. maque/utils/core.py +261 -0
  123. maque/utils/cursor.py +143 -0
  124. maque/utils/distance.py +58 -0
  125. maque/utils/docker.py +96 -0
  126. maque/utils/downloads.py +51 -0
  127. maque/utils/excel_helper.py +542 -0
  128. maque/utils/helper_metrics.py +121 -0
  129. maque/utils/helper_parser.py +168 -0
  130. maque/utils/net.py +64 -0
  131. maque/utils/nvidia_stat.py +140 -0
  132. maque/utils/ops.py +53 -0
  133. maque/utils/packages.py +31 -0
  134. maque/utils/path.py +57 -0
  135. maque/utils/tar.py +260 -0
  136. maque/utils/untar.py +129 -0
  137. maque/web/__init__.py +0 -0
  138. maque/web/image_downloader.py +1410 -0
  139. maque-0.2.1.dist-info/METADATA +450 -0
  140. maque-0.2.1.dist-info/RECORD +143 -0
  141. maque-0.2.1.dist-info/WHEEL +4 -0
  142. maque-0.2.1.dist-info/entry_points.txt +3 -0
  143. maque-0.2.1.dist-info/licenses/LICENSE +21 -0
maque/algorithms/bktree.py
@@ -0,0 +1,117 @@
+ imap = lambda *args, **kwargs: list(map(*args, **kwargs))
+ ifilter = lambda *args, **kwargs: list(filter(*args, **kwargs))
+
+
+ class BKTree:
+     def __init__(self, distfn, words):
+         """
+         Create a new BK-tree from the given distance function and words.
+
+         Arguments:
+
+         distfn: a binary function that returns the distance between
+             two words. The return value is a non-negative integer; the
+             distance function must satisfy the metric space axioms.
+
+         words: an iterable producing values that can be passed to distfn.
+         """
+         self.distfn = distfn
+
+         it = iter(words)
+         root = next(it)
+         self.tree = (root, {})
+
+         for i in it:
+             self._add_word(self.tree, i)
+
+     def _add_word(self, parent, word):
+         pword, children = parent
+         d = self.distfn(word, pword)
+         if d in children:
+             self._add_word(children[d], word)
+         else:
+             children[d] = (word, {})
+
+     def query(self, word, n):
+         """
+         Return all words in the tree that are within a distance of `n`
+         from `word`.
+
+         Arguments:
+
+         word: a word to query on
+
+         n: a non-negative integer that specifies the allowed distance
+             from the query word.
+
+         The return value is a list of tuples (distance, word), sorted
+         in ascending order of distance.
+         """
+
+         def rec(parent):
+             pword, children = parent
+             d = self.distfn(word, pword)
+             results = []
+             if d <= n:
+                 results.append((d, pword))
+
+             # By the triangle inequality, only children whose edge
+             # distance lies in [d - n, d + n] can contain matches.
+             for i in range(d - n, d + n + 1):
+                 child = children.get(i)
+                 if child is not None:
+                     results.extend(rec(child))
+             return results
+
+         # sort by distance
+         return sorted(rec(self.tree))
+
+
+ def brute_query(word, words, distfn, n):
+     """A brute-force distance query.
+
+     Arguments:
+
+     word: the word to query for
+
+     words: an iterable that produces words to test
+
+     distfn: a binary function that returns the distance between
+         `word` and an item in `words`.
+
+     n: an integer that specifies the maximum distance of a matching word
+     """
+     return [i for i in words
+             if distfn(i, word) <= n]
+
+
+ # def maxdepth(tree, count=0):
+ #     _, children = tree
+ #     if len(children):
+ #         return max(maxdepth(i, count + 1) for i in children.values())
+ #     else:
+ #         return count
+
+
+ # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance#Python
+ def levenshtein(s, t):
+     m, n = len(s), len(t)
+     d = [list(range(n + 1))]
+     d += [[i] for i in range(1, m + 1)]
+     for i in range(0, m):
+         for j in range(0, n):
+             cost = 0 if s[i] == t[j] else 1
+             d[i + 1].append(min(d[i][j + 1] + 1,  # deletion
+                                 d[i + 1][j] + 1,  # insertion
+                                 d[i][j] + cost))  # substitution
+     return d[m][n]
+
+
+ if __name__ == "__main__":
+     ...
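A minimal usage sketch for the BK-tree above, assuming the module imports as maque.algorithms.bktree (the word list and query are illustrative):

from maque.algorithms.bktree import BKTree, brute_query, levenshtein

words = ["book", "books", "cake", "boo", "cape", "cart"]
tree = BKTree(levenshtein, words)

# All words within edit distance 1 of "book", as (distance, word) pairs
print(tree.query("book", 1))   # [(0, 'book'), (1, 'boo'), (1, 'books')]

# The brute-force query returns the same words, without distances or ordering
print(brute_query("book", words, levenshtein, 1))

The BK-tree answers the same query while visiting only subtrees allowed by the triangle inequality, which is the point of the structure for large dictionaries.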
maque/algorithms/core.py
@@ -0,0 +1,104 @@
+ from functools import partial, wraps, reduce
+ import inspect
+ import numpy as np
+ import operator as op
+ import random
+ import time
+ from pandas import DataFrame
+ from .utils import exists
+
+ __all__ = ["topk", "dict_topk", "random_idx", "clamp", "get_num_args", "get_parameters"]
+
+
+ def dict_topk(a: dict, k: int, reverse=False):
+     df = DataFrame({'key': list(a.keys()), 'value': list(a.values())})
+     if not reverse:
+         return df.nlargest(k, 'value')
+     else:
+         return df.nsmallest(k, 'value')
+
+
+ def topk(a, k, axis=-1, largest=True, sort=True):
+     """Top-k values (and their indices) of an array along an axis."""
+     a = np.asanyarray(a)
+     if axis is None:
+         axis_size = a.size
+     else:
+         axis_size = a.shape[axis]
+     assert 1 <= k <= axis_size
+
+     if largest:
+         index_array = np.argpartition(a, axis_size - k, axis=axis)
+         topk_indices = np.take(index_array, -np.arange(k) - 1, axis=axis)
+     else:
+         index_array = np.argpartition(a, k - 1, axis=axis)
+         topk_indices = np.take(index_array, np.arange(k), axis=axis)
+     topk_values = np.take_along_axis(a, topk_indices, axis=axis)
+     if sort:
+         sorted_indices_in_topk = np.argsort(topk_values, axis=axis)
+         if largest:
+             sorted_indices_in_topk = np.flip(sorted_indices_in_topk, axis=axis)
+         sorted_topk_values = np.take_along_axis(
+             topk_values, sorted_indices_in_topk, axis=axis
+         )
+         sorted_topk_indices = np.take_along_axis(
+             topk_indices, sorted_indices_in_topk, axis=axis
+         )
+         return sorted_topk_values, sorted_topk_indices
+     return topk_values, topk_indices
+
+
+ def random_idx(idx_range, exclude_idx=None):
+     random.seed(time.time())
+     rand_idx = random.randint(*idx_range)
+     if rand_idx == exclude_idx:
+         return random_idx(idx_range, exclude_idx)
+     else:
+         return rand_idx
+
+
+ def clamp(x, x_min=None, x_max=None):
+     """Clamp a number (or array) to a given range.
+     Examples:
+         >>> clamp(-1, 0, 1)
+         0
+         >>> clamp([-1, 2, 3], [0, 0, 0], [1, 1, 1])
+         array([0, 1, 1])
+     """
+     assert exists(x_min) or exists(x_max)
+     if exists(x_min):
+         x = np.maximum(x, x_min)
+     if exists(x_max):
+         x = np.minimum(x, x_max)
+     return x
+
+
+ CHOOSE_CACHE = {}
+
+
+ def choose_using_cache(n, r):
+     if n not in CHOOSE_CACHE:
+         CHOOSE_CACHE[n] = {}
+     if r not in CHOOSE_CACHE[n]:
+         CHOOSE_CACHE[n][r] = choose(n, r, use_cache=False)
+     return CHOOSE_CACHE[n][r]
+
+
+ def choose(n, r, use_cache=True):
+     """Binomial coefficient C(n, r), optionally memoized."""
+     if use_cache:
+         return choose_using_cache(n, r)
+     if n < r:
+         return 0
+     if r == 0:
+         return 1
+     denom = reduce(op.mul, range(1, r + 1), 1)
+     numer = reduce(op.mul, range(n, n - r, -1), 1)
+     return numer // denom
+
+
+ def get_num_args(function):
+     return len(get_parameters(function))
+
+
+ def get_parameters(function):
+     return inspect.signature(function).parameters
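A short sketch of how topk, dict_topk and clamp behave, assuming the module imports as maque.algorithms.core (values are illustrative):

import numpy as np
from maque.algorithms.core import topk, dict_topk, clamp

scores = np.array([0.1, 0.9, 0.4, 0.7])
values, indices = topk(scores, k=2)       # two largest, sorted descending
print(values, indices)                    # [0.9 0.7] [1 3]

# DataFrame with the two largest entries ('a' and 'c')
print(dict_topk({"a": 3, "b": 1, "c": 2}, k=2))

print(clamp(np.array([-1, 2, 3]), 0, 1))  # [0 1 1]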
maque/algorithms/hilbert.py
@@ -0,0 +1,16 @@
+ import numpy as np
+
+
+ def get_hilbert_1d_array(image: np.ndarray):
+     from hilbert import decode, encode  # $ pip install numpy-hilbert-curve
+     num_dims = 2
+     num_bits = np.log2(image.shape[0] * image.shape[1]) / num_dims
+     num_bits = int(num_bits)
+     max_hil = 2 ** (num_bits * num_dims)
+     hilberts = np.arange(max_hil)
+     locs = decode(hilberts, num_dims, num_bits)
+
+     image1d = []
+     for coord in locs:
+         image1d.append(image[coord[0], coord[1]])
+     return np.array(image1d)[None, ...]
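A quick sketch of calling get_hilbert_1d_array; it assumes a square image whose side length is a power of two (so num_bits comes out an integer) and that the numpy-hilbert-curve package is installed:

import numpy as np
from maque.algorithms.hilbert import get_hilbert_1d_array

image = np.arange(16 * 16).reshape(16, 16)  # 16x16 -> 4 bits per dimension
flat = get_hilbert_1d_array(image)
print(flat.shape)                           # (1, 256): pixels ordered along the Hilbert curve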
maque/algorithms/rate_function.py
@@ -0,0 +1,92 @@
+ import numpy as np
+ from .bezier import bezier
+
+
+ def linear(t):
+     return t
+
+
+ def smooth(t):
+     # Zero first and second derivatives at t=0 and t=1.
+     # Equivalent to bezier([0, 0, 0, 1, 1, 1])
+     s = 1 - t
+     return (t**3) * (10 * s * s + 5 * s * t + t * t)
+
+
+ def rush_into(t):
+     return 2 * smooth(0.5 * t)
+
+
+ def rush_from(t):
+     return 2 * smooth(0.5 * (t + 1)) - 1
+
+
+ def slow_into(t):
+     return np.sqrt(1 - (1 - t) * (1 - t))
+
+
+ def double_smooth(t):
+     if t < 0.5:
+         return 0.5 * smooth(2 * t)
+     else:
+         return 0.5 * (1 + smooth(2 * t - 1))
+
+
+ def there_and_back(t):
+     new_t = 2 * t if t < 0.5 else 2 * (1 - t)
+     return smooth(new_t)
+
+
+ def there_and_back_with_pause(t, pause_ratio=1.0 / 3):
+     a = 1.0 / pause_ratio
+     if t < 0.5 - pause_ratio / 2:
+         return smooth(a * t)
+     elif t < 0.5 + pause_ratio / 2:
+         return 1
+     else:
+         return smooth(a - a * t)
+
+
+ def running_start(t, pull_factor=-0.5):
+     return bezier([0, 0, pull_factor, pull_factor, 1, 1, 1])(t)
+
+
+ def not_quite_there(func=smooth, proportion=0.7):
+     def result(t):
+         return proportion * func(t)
+
+     return result
+
+
+ def wiggle(t, wiggles=2):
+     return there_and_back(t) * np.sin(wiggles * np.pi * t)
+
+
+ def squish_rate_func(func, a=0.4, b=0.6):
+     def result(t):
+         if a == b:
+             return a
+         elif t < a:
+             return func(0)
+         elif t > b:
+             return func(1)
+         else:
+             return func((t - a) / (b - a))
+
+     return result
+
+
+ # Stylistically, should this take parameters (with default values)?
+ # Ultimately, the functionality is entirely subsumed by squish_rate_func,
+ # but it may be useful to have a nice name with nice default params for
+ # "lingering", different from squish_rate_func's default params
+
+
+ def lingering(t):
+     return squish_rate_func(lambda t: t, 0, 0.8)(t)
+
+
+ def exponential_decay(t, half_life=0.1):
+     # The half-life should be rather small to minimize
+     # the cut-off error at the end
+     return 1 - np.exp(-t / half_life)
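These rate functions map a normalized time t in [0, 1] to an eased progress value. An illustrative sampling sketch (the sampled points are arbitrary):

import numpy as np
from maque.algorithms.rate_function import smooth, there_and_back, squish_rate_func

for t in np.linspace(0, 1, 5):
    print(f"{t:.2f}  smooth={smooth(t):.3f}  there_and_back={there_and_back(t):.3f}")

# Confine an easing curve to the middle of the timeline: flat before a, flat after b
eased = squish_rate_func(smooth, a=0.25, b=0.75)
print(eased(0.1), eased(0.5), eased(0.9))  # 0.0, 0.5, 1.0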
maque/algorithms/transform.py
@@ -0,0 +1,27 @@
+ # from einops import rearrange, reduce
+ import einops
+
+
+ def repeat(tensor, n, axis=-1):
+     shape = tensor.shape
+     dims = len(shape)
+     if dims == 1:
+         if axis == 0:
+             res = einops.repeat(tensor, "w -> h w", h=n)
+         else:
+             res = einops.repeat(tensor, "h -> h w", w=n)
+     elif dims == 2:
+         if axis == 0:
+             res = einops.repeat(tensor, "h w -> c h w", c=n)
+         else:
+             res = einops.repeat(tensor, "h w -> h w c", c=n)
+     elif dims == 3:
+         if axis == 0:
+             res = einops.repeat(tensor, "h w c -> b h w c", b=n)
+         elif axis == 1:
+             res = einops.repeat(tensor, "h w c -> h b w c", b=n)
+         else:
+             res = einops.repeat(tensor, "h w c -> h w c b", b=n)
+     else:
+         raise ValueError(f"repeat only supports 1-3 dimensional tensors, got {dims}")
+     return res
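A sketch of the repeat wrapper on NumPy arrays (einops must be installed; resulting shapes are noted in comments):

import numpy as np
from maque.algorithms.transform import repeat

vec = np.ones(4)
print(repeat(vec, 3, axis=0).shape)   # (3, 4): the vector stacked as 3 rows
print(repeat(vec, 3, axis=-1).shape)  # (4, 3): repeated along a new trailing axis

img = np.zeros((32, 32))
print(repeat(img, 3, axis=-1).shape)  # (32, 32, 3): grayscale -> 3-channel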
maque/algorithms/trie.py
@@ -0,0 +1,272 @@
+ from __future__ import annotations
+ from typing import List, Dict, Union
+ from abc import ABCMeta, abstractmethod
+ from pathlib import Path
+
+
+ class Trie(metaclass=ABCMeta):
+     trie = None
+     rtrie = None
+
+     def matches(self, word: str) -> list:
+         matched_list = self.startwith(word)
+         matched_list.extend(self.prefixes(word))
+         return matched_list
+
+     def __contains__(self, item):
+         return item in self.trie
+
+     @abstractmethod
+     def prefixes(self, word: str) -> list:
+         """Match all keys in the trie that are prefixes of `word`.
+
+         Parameters
+         ----------
+         word : str
+             The word to match against.
+
+         Returns
+         -------
+         list
+             All matching keys in the trie.
+
+         Example
+         -------
+         >>> trie = Trie(["ab", "bc", "bcd", "bcde"])
+         >>> trie.prefixes("bcdf")
+         ["bc", "bcd"]
+         """
+
+     def full_prefixes(self, word: str) -> list:
+         full_matches = []
+         for idx in range(len(word)):
+             cur_word = word[idx:]
+             lst = self.prefixes(cur_word)
+             lst.sort(key=lambda x: len(x), reverse=True)
+             full_matches.extend(lst)
+         return full_matches
+
+     @abstractmethod
+     def startwith(self, word: str) -> list:
+         """Match all keys in the trie that start with `word`.
+
+         Parameters
+         ----------
+         word : str
+             The word to match against.
+
+         Returns
+         -------
+         list
+             All matching keys in the trie.
+
+         Example
+         -------
+         >>> trie = Trie(["ab", "bc", "bcd", "bcde"])
+         >>> trie.startwith("bc")
+         ["bc", "bcd", "bcde"]
+         """
+
+     @abstractmethod
+     def endwith(self, word: str) -> list:
+         """Match all keys in the trie that end with `word`.
+
+         Parameters
+         ----------
+         word : str
+             The word to match against.
+
+         Returns
+         -------
+         list
+             All matching keys in the trie.
+
+         Example
+         -------
+         >>> trie = Trie(["bc", "abc", "bcd", "edbc"])
+         >>> trie.endwith("bc")
+         ["bc", "abc", "edbc"]
+         """
+
+     def has_keys_with_prefix(self, word):
+         ...
+
+     def save(self):
+         ...
+
+     def load(self, file_name: Union[str, Path]):
+         ...
+
+
+ class PyTrie:
+     def __init__(self, words: List[str]):
+         self.trie = {}
+         self._end = "eos"
+         for word in words:
+             self.add(word)
+
+     def add(self, word):
+         word = word.strip()
+         assert word
+         cur_node = self.trie
+         for char in word:
+             cur_node = cur_node.setdefault(char, {})
+         # cur_node[self._end] = ""
+         cur_node[self._end] = word
+
+     def find(self, word: str, full=True) -> bool:
+         """
+         Parameters
+         ----------
+         full: bool
+             True: return True only on an exact match
+             False: return True as soon as the word matches along a key's path
+         """
+         trie = self.trie
+         for c in word:
+             if c not in trie:
+                 return False
+             trie = trie[c]
+         if full:
+             return self._end in trie
+         else:
+             return True
+
+     def extract_longest_item(self, word: str):
+         """Match the longest dictionary word starting at the beginning of the text;
+         return that word and its length."""
+         curr_dict, longest, offset = self.trie, None, 0
+
+         if not word:
+             return longest, offset
+
+         for i, c in enumerate(word):
+             if c not in curr_dict:
+                 return longest, offset
+             curr_dict = curr_dict[c]
+             if self._end in curr_dict:
+                 longest, offset = curr_dict[self._end], i + 1
+         return longest, offset
+
+     def __str__(self):
+         return str(self.trie)
+
+
+ class HatTrie(Trie):
+     ...
+
+
+ class MarisaTrie(Trie):
+     """Static, memory-efficient trie-like structure for Python,
+     based on the marisa-trie C++ library.
+     String data in a MARISA trie may take 50x-100x less memory than in a standard Python dict;
+     raw lookup speed is comparable, and the trie also provides fast advanced
+     methods such as prefix search.
+     """
+
+     def __init__(self, key_list: list):
+         import marisa_trie
+         self.trie = marisa_trie.Trie(key_list)
+         rkey_list = [i[::-1] for i in key_list]
+         self.rtrie = marisa_trie.Trie(rkey_list)
+
+     def prefixes(self, word: str):
+         return self.trie.prefixes(word)
+
+     def rprefixes(self, word: str):
+         rword = word[::-1]
+         return self.rtrie.prefixes(rword)
+
+     def startwith(self, word: str):
+         return self.trie.keys(word)
+
+     def endwith(self, word: str):
+         rword = word[::-1]
+         return [i[::-1] for i in self.rtrie.keys(rword)]
+
+     def get_longest(self, text):
+         """Return the longest dictionary word matched anywhere in the text."""
+         if not text:
+             return None
+
+         longest = ''
+         for idx, _ in enumerate(text):
+             items = self.trie.prefixes(text[idx:])
+
+             for item in items:
+                 if len(item) > len(longest):
+                     longest = item
+         if longest == '':
+             return None
+         return longest
+
+     def has_keys_with_prefix(self, word):
+         return self.trie.has_keys_with_prefix(word)
+
+
+ class DaTrie(Trie):
+     def __init__(self, word_list: list):
+         import datrie
+         # datrie tries are built over a fixed alphabet and filled key by key
+         alphabet = "".join(sorted(set("".join(word_list))))
+         self.trie = datrie.BaseTrie(alphabet)
+         self.rtrie = datrie.BaseTrie(alphabet)
+         for idx, word in enumerate(word_list):
+             self.trie[word] = idx
+             self.rtrie[word[::-1]] = idx
+
+     def prefixes(self, name: str):
+         return self.trie.prefixes(name)
+
+     def startwith(self, name: str):
+         return self.trie.keys(name)
+
+     def endwith(self, name: str):
+         return [i[::-1] for i in self.rtrie.keys(name[::-1])]
+
+     def longest_prefix(self, word: str):
+         return self.trie.longest_prefix(word)
+
+     def has_keys_with_prefix(self, word):
+         return self.trie.has_keys_with_prefix(word)
+
+
+ class AutomatonTrie:
+     def __init__(self, key_list: list):
+         import ahocorasick
+         self.trie = ahocorasick.Automaton()
+         for idx, key in enumerate(key_list):
+             self.trie.add_word(key, (idx, key))
+
+     def __contains__(self, item):
+         return item in self.trie
+
+
+ class Benchmark:
+     def __init__(self):
+         from maque.performance import MeasureTime
+         self.ms = MeasureTime()
+         n = 2000000
+         word_list = [str(i) for i in range(n)]
+         self.ms.start()
+         pytrie = PyTrie(word_list)
+         self.ms.show_interval("pytrie build")
+         mtrie = MarisaTrie(word_list)
+         self.ms.show_interval("marisa trie build")
+
+     def run(self):
+         self.ms.start()
+
+
+ if __name__ == "__main__":
+     from rich import print
+
+     key_list = ['as', 'asdf', "basdfg", 'casd']
+     t = MarisaTrie(key_list)
+     print(t.startwith("as"))
+     print(t.endwith("sd"))
+     print('asd' in t)
+     t2 = AutomatonTrie(key_list)
+     print('asd' in t2)
+
+     # print(t.rkeys("asd"))
+     # t = AutomatonTrie(['as', '1asdf'])
+     # print("a" in t.trie)
+     # t = PyTrie(["a", "ab", "abc", "bc",
+     #             "abcd"
+     #             ])
+     # print(t.find("abc"))
+     # print(t.find("bcd", ))
+     # Benchmark()
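A brief sketch of the pure-Python PyTrie for longest-prefix extraction (no third-party trie library needed; the sample keys are illustrative):

from maque.algorithms.trie import PyTrie

trie = PyTrie(["new", "new york", "york"])
print(trie.find("new york"))                       # True: exact key
print(trie.find("new y", full=False))              # True: still on a key's path
print(trie.extract_longest_item("new york city"))  # ('new york', 8)

MarisaTrie offers the same prefix/suffix queries with a much smaller memory footprint; the __main__ block above shows its startwith/endwith usage.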
maque/algorithms/utils.py
@@ -0,0 +1,63 @@
+ from functools import partial, wraps, reduce
+ from contextlib import contextmanager
+
+
+ def exists(val):
+     return val is not None
+
+
+ def default(val, d):
+     return val if exists(val) else d
+
+
+ def cast_tuple(val, length=1):
+     return val if isinstance(val, tuple) else ((val,) * length)
+
+
+ @contextmanager
+ def null_context(*args, **kwargs):
+     yield
+
+
+ def pick_and_pop(keys, d):
+     values = list(map(lambda key: d.pop(key), keys))
+     return dict(zip(keys, values))
+
+
+ def group_dict_by_key(cond, d):
+     return_val = [dict(), dict()]
+     for key in d.keys():
+         match = bool(cond(key))
+         ind = int(not match)
+         return_val[ind][key] = d[key]
+     return (*return_val,)
+
+
+ def string_begins_with(prefix, str):
+     return str.startswith(prefix)
+
+
+ def group_by_key_prefix(prefix, d):
+     return group_dict_by_key(partial(string_begins_with, prefix), d)
+
+
+ def groupby_prefix_and_trim(prefix, d):
+     kwargs_with_prefix, kwargs = group_dict_by_key(partial(string_begins_with, prefix), d)
+     kwargs_without_prefix = dict(map(lambda x: (x[0][len(prefix):], x[1]), tuple(kwargs_with_prefix.items())))
+     return kwargs_without_prefix, kwargs
+
+
+ def num_to_groups(num, divisor):
+     groups = num // divisor
+     remainder = num % divisor
+     arr = [divisor] * groups
+     if remainder > 0:
+         arr.append(remainder)
+     return arr
+
+
+ def find_first(cond, arr):
+     for el in arr:
+         if cond(el):
+             return el
+     return None
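A short sketch of the kwargs-splitting and grouping helpers above (the keyword names are illustrative):

from maque.algorithms.utils import groupby_prefix_and_trim, num_to_groups, default

kwargs = {"vae_dim": 64, "vae_depth": 4, "lr": 1e-4}
vae_kwargs, rest = groupby_prefix_and_trim("vae_", kwargs)
print(vae_kwargs)   # {'dim': 64, 'depth': 4}
print(rest)         # {'lr': 0.0001}

print(num_to_groups(10, 4))       # [4, 4, 2]: split a count into divisor-sized chunks
print(default(None, "fallback"))  # 'fallback'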