maque 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maque/__init__.py +30 -0
- maque/__main__.py +926 -0
- maque/ai_platform/__init__.py +0 -0
- maque/ai_platform/crawl.py +45 -0
- maque/ai_platform/metrics.py +258 -0
- maque/ai_platform/nlp_preprocess.py +67 -0
- maque/ai_platform/webpage_screen_shot.py +195 -0
- maque/algorithms/__init__.py +78 -0
- maque/algorithms/bezier.py +15 -0
- maque/algorithms/bktree.py +117 -0
- maque/algorithms/core.py +104 -0
- maque/algorithms/hilbert.py +16 -0
- maque/algorithms/rate_function.py +92 -0
- maque/algorithms/transform.py +27 -0
- maque/algorithms/trie.py +272 -0
- maque/algorithms/utils.py +63 -0
- maque/algorithms/video.py +587 -0
- maque/api/__init__.py +1 -0
- maque/api/common.py +110 -0
- maque/api/fetch.py +26 -0
- maque/api/static/icon.png +0 -0
- maque/api/static/redoc.standalone.js +1782 -0
- maque/api/static/swagger-ui-bundle.js +3 -0
- maque/api/static/swagger-ui.css +3 -0
- maque/cli/__init__.py +1 -0
- maque/cli/clean_invisible_chars.py +324 -0
- maque/cli/core.py +34 -0
- maque/cli/groups/__init__.py +26 -0
- maque/cli/groups/config.py +205 -0
- maque/cli/groups/data.py +615 -0
- maque/cli/groups/doctor.py +259 -0
- maque/cli/groups/embedding.py +222 -0
- maque/cli/groups/git.py +29 -0
- maque/cli/groups/help.py +410 -0
- maque/cli/groups/llm.py +223 -0
- maque/cli/groups/mcp.py +241 -0
- maque/cli/groups/mllm.py +1795 -0
- maque/cli/groups/mllm_simple.py +60 -0
- maque/cli/groups/quant.py +210 -0
- maque/cli/groups/service.py +490 -0
- maque/cli/groups/system.py +570 -0
- maque/cli/mllm_run.py +1451 -0
- maque/cli/script.py +52 -0
- maque/cli/tree.py +49 -0
- maque/clustering/__init__.py +52 -0
- maque/clustering/analyzer.py +347 -0
- maque/clustering/clusterers.py +464 -0
- maque/clustering/sampler.py +134 -0
- maque/clustering/visualizer.py +205 -0
- maque/constant.py +13 -0
- maque/core.py +133 -0
- maque/cv/__init__.py +1 -0
- maque/cv/image.py +219 -0
- maque/cv/utils.py +68 -0
- maque/cv/video/__init__.py +3 -0
- maque/cv/video/keyframe_extractor.py +368 -0
- maque/embedding/__init__.py +43 -0
- maque/embedding/base.py +56 -0
- maque/embedding/multimodal.py +308 -0
- maque/embedding/server.py +523 -0
- maque/embedding/text.py +311 -0
- maque/git/__init__.py +24 -0
- maque/git/pure_git.py +912 -0
- maque/io/__init__.py +29 -0
- maque/io/core.py +38 -0
- maque/io/ops.py +194 -0
- maque/llm/__init__.py +111 -0
- maque/llm/backend.py +416 -0
- maque/llm/base.py +411 -0
- maque/llm/server.py +366 -0
- maque/mcp_server.py +1096 -0
- maque/mllm_data_processor_pipeline/__init__.py +17 -0
- maque/mllm_data_processor_pipeline/core.py +341 -0
- maque/mllm_data_processor_pipeline/example.py +291 -0
- maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
- maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
- maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
- maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
- maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
- maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
- maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
- maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
- maque/mllm_data_processor_pipeline/web_app.py +317 -0
- maque/nlp/__init__.py +14 -0
- maque/nlp/ngram.py +9 -0
- maque/nlp/parser.py +63 -0
- maque/nlp/risk_matcher.py +543 -0
- maque/nlp/sentence_splitter.py +202 -0
- maque/nlp/simple_tradition_cvt.py +31 -0
- maque/performance/__init__.py +21 -0
- maque/performance/_measure_time.py +70 -0
- maque/performance/_profiler.py +367 -0
- maque/performance/_stat_memory.py +51 -0
- maque/pipelines/__init__.py +15 -0
- maque/pipelines/clustering.py +252 -0
- maque/quantization/__init__.py +42 -0
- maque/quantization/auto_round.py +120 -0
- maque/quantization/base.py +145 -0
- maque/quantization/bitsandbytes.py +127 -0
- maque/quantization/llm_compressor.py +102 -0
- maque/retriever/__init__.py +35 -0
- maque/retriever/chroma.py +654 -0
- maque/retriever/document.py +140 -0
- maque/retriever/milvus.py +1140 -0
- maque/table_ops/__init__.py +1 -0
- maque/table_ops/core.py +133 -0
- maque/table_viewer/__init__.py +4 -0
- maque/table_viewer/download_assets.py +57 -0
- maque/table_viewer/server.py +698 -0
- maque/table_viewer/static/element-plus-icons.js +5791 -0
- maque/table_viewer/static/element-plus.css +1 -0
- maque/table_viewer/static/element-plus.js +65236 -0
- maque/table_viewer/static/main.css +268 -0
- maque/table_viewer/static/main.js +669 -0
- maque/table_viewer/static/vue.global.js +18227 -0
- maque/table_viewer/templates/index.html +401 -0
- maque/utils/__init__.py +56 -0
- maque/utils/color.py +68 -0
- maque/utils/color_string.py +45 -0
- maque/utils/compress.py +66 -0
- maque/utils/constant.py +183 -0
- maque/utils/core.py +261 -0
- maque/utils/cursor.py +143 -0
- maque/utils/distance.py +58 -0
- maque/utils/docker.py +96 -0
- maque/utils/downloads.py +51 -0
- maque/utils/excel_helper.py +542 -0
- maque/utils/helper_metrics.py +121 -0
- maque/utils/helper_parser.py +168 -0
- maque/utils/net.py +64 -0
- maque/utils/nvidia_stat.py +140 -0
- maque/utils/ops.py +53 -0
- maque/utils/packages.py +31 -0
- maque/utils/path.py +57 -0
- maque/utils/tar.py +260 -0
- maque/utils/untar.py +129 -0
- maque/web/__init__.py +0 -0
- maque/web/image_downloader.py +1410 -0
- maque-0.2.1.dist-info/METADATA +450 -0
- maque-0.2.1.dist-info/RECORD +143 -0
- maque-0.2.1.dist-info/WHEEL +4 -0
- maque-0.2.1.dist-info/entry_points.txt +3 -0
- maque-0.2.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
def imap(*args, **kwargs):
    """Eager ``map``: forward all arguments to ``map`` and return a list."""
    return list(map(*args, **kwargs))


def ifilter(*args, **kwargs):
    """Eager ``filter``: forward all arguments to ``filter`` and return a list."""
    return list(filter(*args, **kwargs))
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class BKTree:
    """BK-tree for nearest-neighbour queries under an integer metric.

    Every node is a ``(word, children)`` tuple, where ``children`` maps a
    distance ``d`` to the subtree whose root is at distance ``d`` from
    ``word``.
    """

    def __init__(self, distfn, words):
        """
        Create a new BK-tree from the given distance function and words.

        Arguments:

        distfn: a binary function that returns the distance between
        two words. The return value is a non-negative integer, and the
        function must define a metric space.

        words: an iterable producing values that can be passed to
        distfn. It must yield at least one item.

        Raises:

        ValueError: if ``words`` is empty.
        """
        self.distfn = distfn

        it = iter(words)
        try:
            # ``next(it)`` works on Python 3; ``it.next()`` is Python 2 only.
            root = next(it)
        except StopIteration:
            raise ValueError("BKTree requires at least one word") from None
        self.tree = (root, {})

        for word in it:
            self._add_word(self.tree, word)

    def _add_word(self, parent, word):
        """Insert ``word`` below ``parent``.

        Walks down iteratively so that very deep trees cannot hit the
        interpreter recursion limit.
        """
        while True:
            pword, children = parent
            d = self.distfn(word, pword)
            child = children.get(d)
            if child is None:
                children[d] = (word, {})
                return
            parent = child

    def query(self, word, n):
        """
        Return all words in the tree that are within a distance of `n`
        from `word`.

        Arguments:

        word: a word to query on

        n: a non-negative integer that specifies the allowed distance
        from the query word.

        Return value is a list of tuples (distance, word), sorted in
        ascending order of distance.
        """
        results = []
        pending = [self.tree]
        while pending:
            pword, children = pending.pop()
            d = self.distfn(word, pword)
            if d <= n:
                results.append((d, pword))

            # Only subtrees keyed by a distance in [d - n, d + n] are visited.
            for i in range(d - n, d + n + 1):
                child = children.get(i)
                if child is not None:
                    pending.append(child)

        # sort by distance
        return sorted(results)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def brute_query(word, words, distfn, n):
    """Linear-scan distance query, usable as a reference for ``BKTree.query``.

    Arguments:

    word: the word to query for

    words: an iterable that produces words to test

    distfn: a binary function that returns the distance between an item
    of `words` and `word` (called as ``distfn(item, word)``).

    n: the largest distance at which an item still counts as a match

    Returns the matching items as a list, in iteration order.
    """
    matches = []
    for candidate in words:
        if distfn(candidate, word) <= n:
            matches.append(candidate)
    return matches
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
# def maxdepth(tree, count=0):
|
|
92
|
+
# _, children = t
|
|
93
|
+
# if len(children):
|
|
94
|
+
# return max(maxdepth(i, c + 1) for i in children.values())
|
|
95
|
+
# else:
|
|
96
|
+
# return c
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance#Python
|
|
100
|
+
def levenshtein(s, t):
    """Return the Levenshtein (edit) distance between sequences ``s`` and ``t``.

    Insertions, deletions and substitutions each cost 1. Only two rows of
    the dynamic-programming table are kept, so extra memory is
    proportional to ``len(t)`` rather than ``len(s) * len(t)``.
    """
    m, n = len(s), len(t)
    # Row 0: turning the empty prefix of ``s`` into t[:j] takes j insertions.
    previous = list(range(n + 1))
    for i in range(m):
        # Column 0: turning s[:i + 1] into the empty string takes i + 1 deletions.
        current = [i + 1]
        for j in range(n):
            cost = 0 if s[i] == t[j] else 1
            current.append(min(previous[j + 1] + 1,  # deletion
                               current[j] + 1,  # insertion
                               previous[j] + cost))  # substitution
        previous = current
    return previous[n]
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
if __name__ == "__main__":
|
|
117
|
+
...
|
maque/algorithms/core.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
from functools import partial, wraps, reduce
|
|
2
|
+
import inspect
|
|
3
|
+
import numpy as np
|
|
4
|
+
import operator as op
|
|
5
|
+
import random
|
|
6
|
+
import time
|
|
7
|
+
from pandas import DataFrame
|
|
8
|
+
from .utils import exists
|
|
9
|
+
|
|
10
|
+
__all__ = ["topk", "dict_topk", "random_idx", "clamp", "get_num_args", "get_parameters"]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def dict_topk(a: dict, k: int, reverse=False):
    """Return the ``k`` entries of ``a`` with the largest (or smallest) values.

    Args:
        a: mapping whose values can be ranked by pandas.
        k: number of rows to return.
        reverse: when False (default) the largest values are returned;
            when True the smallest values are returned.

    Returns:
        A ``DataFrame`` with columns ``'key'`` and ``'value'`` holding the
        selected entries, as produced by ``nlargest`` / ``nsmallest``.
    """
    # Materialise the dict views: a ``dict_keys`` object is a ``Set`` and
    # some pandas versions refuse set-like column data.
    df = DataFrame({'key': list(a.keys()), 'value': list(a.values())})
    if not reverse:
        return df.nlargest(k, 'value')
    return df.nsmallest(k, 'value')
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def topk(a, k, axis=-1, largest=True, sort=True):
    """Return the top ``k`` values of ``a`` along ``axis`` and their indices.

    Args:
        a: array-like input (converted with ``np.asanyarray``).
        k: number of elements to select; must satisfy
            ``1 <= k <= size of the axis`` (checked with ``assert``).
        axis: axis to select along; ``None`` uses the total element count
            as the axis size.
        largest: select the largest values when True, the smallest otherwise.
        sort: when True the result is ordered along ``axis`` (descending
            for ``largest``, ascending otherwise); when False the order
            is whatever ``np.argpartition`` produced.

    Returns:
        Tuple ``(values, indices)``.
    """
    arr = np.asanyarray(a)
    axis_size = arr.size if axis is None else arr.shape[axis]
    assert 1 <= k <= axis_size

    # argpartition gathers the wanted k elements at one end of the axis:
    # the tail when looking for the largest, the head for the smallest.
    if largest:
        kth, picks = axis_size - k, -np.arange(k) - 1
    else:
        kth, picks = k - 1, np.arange(k)
    partitioned = np.argpartition(arr, kth, axis=axis)
    indices = np.take(partitioned, picks, axis=axis)
    values = np.take_along_axis(arr, indices, axis=axis)
    if not sort:
        return values, indices

    order = np.argsort(values, axis=axis)
    if largest:
        order = np.flip(order, axis=axis)
    return (
        np.take_along_axis(values, order, axis=axis),
        np.take_along_axis(indices, order, axis=axis),
    )
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def random_idx(idx_range, exclude_idx=None):
    """Draw a random integer from an inclusive range, optionally skipping one value.

    Args:
        idx_range: pair ``(low, high)``; both ends are inclusive, as in
            ``random.randint``.
        exclude_idx: value that must not be returned, or None to allow all.

    Returns:
        A random integer ``r`` with ``low <= r <= high`` and ``r != exclude_idx``.

    Raises:
        ValueError: if the range holds a single value and it is excluded.
    """
    # The global RNG is intentionally NOT reseeded here: reseeding from the
    # clock on every call would override any seed the caller set.
    low, high = idx_range
    if low == high and low == exclude_idx:
        # Nothing left to draw from; the previous recursive retry would
        # only end in a RecursionError.
        raise ValueError("idx_range contains only the excluded index")
    while True:
        rand_idx = random.randint(low, high)
        if rand_idx != exclude_idx:
            return rand_idx
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def clamp(x, x_min=None, x_max=None):
    """Clamp a number or array to a range.

    At least one of ``x_min`` / ``x_max`` must be given (checked with
    ``assert``). The lower bound is applied with ``np.maximum`` and the
    upper bound with ``np.minimum``, so array-like arguments are clamped
    element-wise and the result is a NumPy scalar or array.

    Examples:
        ``clamp(-1, 0, 1)`` gives ``0``.
        ``clamp([-1, 2, 3], [0, 0, 0], [1, 1, 1])`` gives ``[0, 1, 1]``.
    """
    has_lower = x_min is not None
    has_upper = x_max is not None
    assert has_lower or has_upper

    clamped = x
    if has_lower:
        clamped = np.maximum(clamped, x_min)
    if has_upper:
        clamped = np.minimum(clamped, x_max)
    return clamped
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# Memo table for ``choose``: CHOOSE_CACHE[n][r] holds the value of C(n, r).
CHOOSE_CACHE = {}


def choose_using_cache(n, r):
    """Return C(n, r), computing it at most once per ``(n, r)`` pair.

    Results are stored in the module-level ``CHOOSE_CACHE``.
    """
    by_r = CHOOSE_CACHE.setdefault(n, {})
    if r not in by_r:
        by_r[r] = choose(n, r, use_cache=False)
    return by_r[r]


def choose(n, r, use_cache=True):
    """Return the binomial coefficient C(n, r).

    Args:
        n: size of the set.
        r: number of elements chosen.
        use_cache: when True (default) the result is looked up in / stored
            into ``CHOOSE_CACHE`` via ``choose_using_cache``.

    Returns:
        The number of ways to choose ``r`` items out of ``n``; 0 when
        ``r`` is negative or larger than ``n``.
    """
    if use_cache:
        return choose_using_cache(n, r)
    # A negative r used to fall through to two empty products and return 1.
    if r < 0 or n < r:
        return 0
    if r == 0:
        return 1
    denom = reduce(op.mul, range(1, r + 1), 1)
    numer = reduce(op.mul, range(n, n - r, -1), 1)
    return numer // denom
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def get_num_args(function):
    """Return how many parameters appear in the signature of ``function``."""
    signature = inspect.signature(function)
    return len(signature.parameters)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def get_parameters(function):
    """Return the ordered name-to-``Parameter`` mapping of ``function``'s signature."""
    signature = inspect.signature(function)
    return signature.parameters
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def get_hilbert_1d_array(image: np.ndarray):
    """Read ``image`` in the order given by ``hilbert.decode`` and return it with a leading axis.

    The bit count per dimension is derived from the total number of
    pixels in the first two axes; ``decode`` turns each curve position
    into a coordinate pair used to index ``image``.

    NOTE(review): this looks like it expects a square image whose side is
    a power of two; confirm against callers.

    Returns:
        An array of the sampled pixels with an extra leading axis of length 1.
    """
    from hilbert import decode, encode  # $ pip install numpy-hilbert-curve

    num_dims = 2
    pixel_count = image.shape[0] * image.shape[1]
    num_bits = int(np.log2(pixel_count) / num_dims)
    curve_positions = np.arange(2 ** (num_bits * num_dims))
    coords = decode(curve_positions, num_dims, num_bits)

    samples = [image[point[0], point[1]] for point in coords]
    return np.array(samples)[None, ...]
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from .bezier import bezier
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def linear(t):
    """Identity rate function: progress equals ``t``."""
    progress = t
    return progress
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def smooth(t):
    """Quintic ease-in/ease-out curve going from 0 at ``t=0`` to 1 at ``t=1``.

    Zero first and second derivatives at both ends.
    Equivalent to bezier([0, 0, 0, 1, 1, 1]).
    """
    remaining = 1 - t
    cubic = t**3
    blend = 10 * remaining * remaining + 5 * remaining * t + t * t
    return cubic * blend
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def rush_into(t):
    """Return twice ``smooth`` evaluated at half of ``t`` (first half of the smooth curve, rescaled)."""
    half_t = 0.5 * t
    return 2 * smooth(half_t)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def rush_from(t):
    """Return the second half of the ``smooth`` curve, rescaled to start at 0."""
    shifted = 0.5 * (t + 1)
    return 2 * smooth(shifted) - 1
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def slow_into(t):
    """Quarter-circle easing: ``sqrt(1 - (1 - t)^2)``."""
    gap = 1 - t
    return np.sqrt(1 - gap * gap)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def double_smooth(t):
    """Apply ``smooth`` twice in a row: once over [0, 0.5) and once over [0.5, 1]."""
    if t < 0.5:
        return 0.5 * smooth(2 * t)
    # Second half: same curve, lifted so that it continues from 0.5.
    return 0.5 * (1 + smooth(2 * t - 1))
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def there_and_back(t):
    """Smoothly rise during the first half of ``t`` and fall back during the second."""
    if t < 0.5:
        folded = 2 * t
    else:
        folded = 2 * (1 - t)
    return smooth(folded)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def there_and_back_with_pause(t, pause_ratio=1.0 / 3):
    """Like ``there_and_back`` but holding the value 1 for a stretch around ``t = 0.5``.

    ``pause_ratio`` is the width of the plateau; outside it the input is
    scaled by ``1 / pause_ratio`` before being passed to ``smooth``.
    """
    scale = 1.0 / pause_ratio
    half_pause = pause_ratio / 2
    if t < 0.5 - half_pause:
        return smooth(scale * t)
    if t < 0.5 + half_pause:
        return 1
    return smooth(scale - scale * t)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def running_start(t, pull_factor=-0.5):
    """Evaluate a bezier curve whose two middle control points are ``pull_factor``.

    The control points are ``[0, 0, pull_factor, pull_factor, 1, 1, 1]``.
    """
    control_points = [0, 0, pull_factor, pull_factor, 1, 1, 1]
    curve = bezier(control_points)
    return curve(t)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def not_quite_there(func=smooth, proportion=0.7):
    """Build a rate function that returns ``proportion`` times ``func(t)``."""

    def scaled(t):
        return proportion * func(t)

    return scaled
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def wiggle(t, wiggles=2):
    """Sine oscillation (``wiggles`` half-periods over [0, 1]) scaled by ``there_and_back``."""
    envelope = there_and_back(t)
    oscillation = np.sin(wiggles * np.pi * t)
    return envelope * oscillation
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def squish_rate_func(func, a=0.4, b=0.6):
    """Compress ``func`` into the interval ``[a, b]``.

    The returned function yields ``func(0)`` before ``a``, ``func(1)``
    after ``b``, and ``func`` of the linearly rescaled input in between.
    When ``a == b`` it returns ``a`` for every input.
    """

    def squished(t):
        if a == b:
            return a
        if t < a:
            return func(0)
        if t > b:
            return func(1)
        return func((t - a) / (b - a))

    return squished
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# Stylistically, should this take parameters (with default values)?
|
|
80
|
+
# Ultimately, the functionality is entirely subsumed by squish_rate_func,
|
|
81
|
+
# but it may be useful to have a nice name with nice default params for
|
|
82
|
+
# "lingering", different from squish_rate_func's default params
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def lingering(t):
    """Linear progress squished into [0, 0.8], so the value stays at 1 for the rest."""
    squished = squish_rate_func(lambda value: value, 0, 0.8)
    return squished(t)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def exponential_decay(t, half_life=0.1):
    """Return ``1 - exp(-t / half_life)``.

    The half-life should be rather small to minimize the cut-off error
    at the end.
    """
    exponent = -t / half_life
    return 1 - np.exp(exponent)
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# from einops import rearrange, reduce
|
|
2
|
+
import einops
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def repeat(tensor, n, axis=-1):
    """Add a new axis of length ``n`` to a 1-, 2- or 3-dimensional tensor via ``einops.repeat``.

    ``axis == 0`` puts the new axis first. For 3-D input ``axis == 1``
    puts it second. Every other ``axis`` value puts it last.

    Raises:
        ValueError: if ``tensor`` has 0 or more than 3 dimensions.
    """
    rank = len(tensor.shape)
    if rank == 1:
        pattern, new_axis = ("w -> h w", "h") if axis == 0 else ("h -> h w", "w")
    elif rank == 2:
        pattern, new_axis = ("h w -> c h w", "c") if axis == 0 else ("h w -> h w c", "c")
    elif rank == 3:
        if axis == 0:
            pattern, new_axis = "h w c -> b h w c", "b"
        elif axis == 1:
            pattern, new_axis = "h w c -> h b w c", "b"
        else:
            pattern, new_axis = "h w c -> h w c b", "b"
    else:
        raise ValueError
    return einops.repeat(tensor, pattern, **{new_axis: n})
|
maque/algorithms/trie.py
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import List, Dict, Union
|
|
3
|
+
from abc import ABCMeta, abstractmethod
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Trie(metaclass=ABCMeta):
    """Abstract interface for the trie back-ends defined in this module.

    ``trie`` holds the forward structure and ``rtrie`` the structure built
    over reversed keys; both default to None and are set by subclasses.
    """

    trie = None
    rtrie = None

    def matches(self, word: str) -> list:
        """Return the ``startwith(word)`` hits followed by the ``prefixes(word)`` hits."""
        found = self.startwith(word)
        found.extend(self.prefixes(word))
        return found

    def __contains__(self, item):
        return item in self.trie

    @abstractmethod
    def prefixes(self, word: str) -> list:
        """Find every key in the trie that is a prefix of ``word``.

        Parameters
        ----------
        word : str
            The word to match.

        Returns
        -------
        list
            All matching keys stored in the trie.

        Example
        -------
        >>> trie = Trie(["ab", "bc", "bcd", "bcde"])
        >>> trie.prefixes("bcdf")
        ["bc", "bcd"]
        """

    def full_prefixes(self, word: str) -> list:
        """Collect ``prefixes`` hits for every suffix of ``word``.

        Suffixes are taken from left to right; the hits of each suffix
        are ordered longest first before being appended.
        """
        collected = []
        for start in range(len(word)):
            hits = self.prefixes(word[start:])
            hits.sort(key=len, reverse=True)
            collected.extend(hits)
        return collected

    @abstractmethod
    def startwith(self, word: str) -> list:
        """Find every key in the trie that starts with ``word``.

        Parameters
        ----------
        word : str
            The word to match.

        Returns
        -------
        list
            All matching keys stored in the trie.

        Example
        -------
        >>> trie = Trie(["ab", "bc", "bcd", "bcde"])
        >>> trie.startwith("bc")
        ["bc", "bcd", "bcde"]
        """

    @abstractmethod
    def endwith(self, word: str) -> list:
        """Find every key in the trie that ends with ``word``.

        Parameters
        ----------
        word : str
            The word to match.

        Returns
        -------
        list
            All matching keys stored in the trie.

        Example
        -------
        >>> trie = Trie(["bc", "abc", "bcd", "edbc"])
        >>> trie.endwith("bc")
        ["bc", "abc", "edbc"]
        """

    def has_keys_with_prefix(self, word):
        """Placeholder; the base implementation does nothing and returns None."""

    def save(self):
        """Placeholder; the base implementation does nothing and returns None."""

    def load(self, file_name: Union[str, Path]):
        """Placeholder; the base implementation does nothing and returns None."""
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class PyTrie:
    """Pure-Python trie built from nested dicts.

    Each character maps to a child dict; a node that terminates a stored
    word carries the key ``self._end`` whose value is the word itself.
    """

    def __init__(self, words: List[str]):
        """Build the trie from ``words`` (each is stripped before insertion)."""
        self.trie = {}
        # Terminal marker; it is longer than one character, so it cannot
        # clash with the single-character child keys.
        self._end = "eos"
        for word in words:
            self.add(word)

    def add(self, word):
        """Insert ``word`` after stripping whitespace; the stripped word must be non-empty."""
        word = word.strip()
        assert word
        cur_node = self.trie
        for char in word:
            cur_node = cur_node.setdefault(char, {})
        cur_node[self._end] = word

    def find(self, word: str, full=True) -> bool:
        """Check whether ``word`` is present in the trie.

        Parameters
        ----------
        word : str
            The word to look up.
        full : bool
            True: return True only when ``word`` is a stored word.
            False: return True as soon as ``word`` is a prefix of a stored word.
        """
        node = self.trie
        for c in word:
            if c not in node:
                return False
            node = node[c]
        if full:
            return self._end in node
        return True

    def extract_longest_item(self, word: str):
        """Match the longest stored word at the start of ``word``.

        Returns a tuple ``(longest, offset)``: the longest stored word
        that is a prefix of ``word`` and its length, or ``(None, 0)``
        when nothing matches.
        """
        curr_dict, longest, offset = self.trie, None, 0

        if not word:
            return longest, offset

        for i, c in enumerate(word):
            if c not in curr_dict:
                return longest, offset
            curr_dict = curr_dict[c]
            # Look up the marker that ``add`` actually writes; a hard-coded
            # 'end' key never exists, so nothing would ever match.
            if self._end in curr_dict:
                longest, offset = curr_dict[self._end], i + 1
        return longest, offset

    def __str__(self):
        # ``__str__`` must return a str; returning the dict raises TypeError.
        return str(self.trie)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
class HatTrie(Trie):
    """Placeholder subclass; it currently adds nothing to ``Trie``."""
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
class MarisaTrie(Trie):
    """Static memory-efficient Trie-like structures for Python
    based on marisa-trie C++ library.
    String data in a MARISA-trie may take up to 50x-100x less memory than in a standard Python dict;
    the raw lookup speed is comparable;
    trie also provides fast advanced methods like prefix search.
    """

    def __init__(self, key_list: list):
        """Build a forward trie over ``key_list`` and a second trie over the reversed keys."""
        import marisa_trie

        reversed_keys = [key[::-1] for key in key_list]
        self.trie = marisa_trie.Trie(key_list)
        self.rtrie = marisa_trie.Trie(reversed_keys)

    def prefixes(self, word: str):
        """Return the stored keys that are prefixes of ``word``."""
        return self.trie.prefixes(word)

    def rprefixes(self, word: str):
        """Query the reversed trie for prefixes of the reversed ``word``.

        NOTE(review): the hits are returned as stored in ``rtrie``, i.e.
        still reversed, unlike ``endwith`` which flips them back. Confirm
        this is intended.
        """
        backwards = word[::-1]
        return self.rtrie.prefixes(backwards)

    def startwith(self, word: str):
        """Return the stored keys that start with ``word``."""
        return self.trie.keys(word)

    def endwith(self, word: str):
        """Return the stored keys that end with ``word`` (looked up in the reversed trie)."""
        backwards = word[::-1]
        return [hit[::-1] for hit in self.rtrie.keys(backwards)]

    def get_longest(self, text):
        """Return the longest stored key occurring anywhere in ``text``, or None.

        Every suffix of ``text`` is checked with ``prefixes``; the first
        key of maximal length wins.
        """
        if not text:
            return None

        best = ''
        for start in range(len(text)):
            for candidate in self.trie.prefixes(text[start:]):
                if len(candidate) > len(best):
                    best = candidate
        return best or None

    def has_keys_with_prefix(self, word):
        """Delegate to the underlying trie's ``has_keys_with_prefix``."""
        return self.trie.has_keys_with_prefix(word)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
class DaTrie(Trie):
    """``Trie`` backed by the ``datrie`` package (double-array trie).

    NOTE(review): the ``datrie`` calls below follow its documented API
    (alphabet-based constructor, item assignment, ``keys(prefix)``);
    verify against the installed version.
    """

    def __init__(self, word_list: list):
        """Build a forward trie over ``word_list`` and a second trie over the reversed words."""
        import datrie

        words = list(word_list)
        # datrie needs the alphabet up front, so derive it from the words.
        alphabet = "".join(sorted(set("".join(words))))
        self.trie = datrie.BaseTrie(alphabet)
        self.rtrie = datrie.BaseTrie(alphabet)
        for idx, word in enumerate(words):
            # BaseTrie stores integer values; the insertion index is used.
            self.trie[word] = idx
            self.rtrie[word[::-1]] = idx

    def prefixes(self, name: str):
        """Return the stored keys that are prefixes of ``name``."""
        return self.trie.prefixes(name)

    def startwith(self, name: str):
        """Return the stored keys that start with ``name``."""
        return self.trie.keys(name)

    def endwith(self, word: str):
        """Return the stored keys that end with ``word``.

        Implementing this abstract method is what makes the class
        instantiable at all.
        """
        backwards = word[::-1]
        return [hit[::-1] for hit in self.rtrie.keys(backwards)]

    def longest_prefix(self, word: str):
        """Delegate to the underlying trie's ``longest_prefix``."""
        return self.trie.longest_prefix(word)

    def has_keys_with_prefix(self, word):
        """Delegate to the underlying trie's ``has_keys_with_prefix``."""
        return self.trie.has_keys_with_prefix(word)
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
class AutomatonTrie:
    """Key set stored in an ``ahocorasick.Automaton``; supports ``in`` checks."""

    def __init__(self, key_list: list):
        """Add every key with the payload ``(position, key)``."""
        import ahocorasick

        automaton = ahocorasick.Automaton()
        for position, key in enumerate(key_list):
            automaton.add_word(key, (position, key))
        self.trie = automaton

    def __contains__(self, item):
        return item in self.trie
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
class Benchmark:
    """Times the construction of ``PyTrie`` and ``MarisaTrie`` over two million numeric strings."""

    def __init__(self):
        """Run the build benchmark immediately, reporting through ``MeasureTime``."""
        from maque.performance import MeasureTime

        self.ms = MeasureTime()
        word_count = 2000000
        word_list = list(map(str, range(word_count)))

        self.ms.start()
        # The tries stay bound to locals so they live until the method returns.
        built_pytrie = PyTrie(word_list)
        self.ms.show_interval("pytrie build")
        built_marisa = MarisaTrie(word_list)
        self.ms.show_interval("marisa trie build")

    def run(self):
        """Restart the timer; no measurement is taken here yet."""
        self.ms.start()
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
if __name__ == "__main__":
|
|
254
|
+
from rich import print
|
|
255
|
+
|
|
256
|
+
key_list = ['as', 'asdf', "basdfg", 'casd']
|
|
257
|
+
t = MarisaTrie(key_list)
|
|
258
|
+
print(t.startwith("as"))
|
|
259
|
+
print(t.endwith("sd"))
|
|
260
|
+
print('asd' in t)
|
|
261
|
+
t2 = AutomatonTrie(key_list)
|
|
262
|
+
print('asd' in t2)
|
|
263
|
+
|
|
264
|
+
# print(t.rkeys("asd"))
|
|
265
|
+
# t = AutomatonTrie(['as', '1asdf'])
|
|
266
|
+
# print("a" in t.trie)
|
|
267
|
+
# t = PyTrie(["a", "ab", "abc", "bc",
|
|
268
|
+
# "abcd"
|
|
269
|
+
# ])
|
|
270
|
+
# print(t.find("abc"))
|
|
271
|
+
# print(t.find("bcd", ))
|
|
272
|
+
# Benchmark()
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
from functools import partial, wraps, reduce
|
|
2
|
+
from contextlib import contextmanager
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def exists(val):
    """Return True unless ``val`` is None."""
    return not (val is None)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def default(val, d):
    """Return ``val``, or ``d`` when ``val`` is None."""
    if val is None:
        return d
    return val
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def cast_tuple(val, length=1):
    """Return ``val`` unchanged if it is a tuple, else a tuple repeating it ``length`` times."""
    if isinstance(val, tuple):
        return val
    return (val,) * length
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@contextmanager
def null_context(*args, **kwargs):
    """Context manager that does nothing; all arguments are ignored and it yields None."""
    yield None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def pick_and_pop(keys, d):
    """Remove ``keys`` from ``d`` and return them with their values as a new dict.

    ``d`` is modified in place; a missing key raises ``KeyError``.
    """
    popped = [d.pop(key) for key in keys]
    return dict(zip(keys, popped))
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def group_dict_by_key(cond, d):
    """Split ``d`` into two dicts according to ``cond(key)``.

    Returns a tuple ``(matching, rest)``: entries whose key satisfies
    ``cond`` first, all other entries second.
    """
    matching, rest = dict(), dict()
    for key, value in d.items():
        target = matching if cond(key) else rest
        target[key] = value
    return (matching, rest)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def string_begins_with(prefix, str):
    """Return True when the string ``str`` starts with ``prefix``.

    The prefix comes first so the function can be partially applied.
    """
    # The parameter is named ``str`` in the public signature; alias it
    # locally so the builtin shadowing is less confusing.
    text = str
    return text.startswith(prefix)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def group_by_key_prefix(prefix, d):
    """Split ``d`` into (entries whose key starts with ``prefix``, all other entries)."""
    has_prefix = partial(string_begins_with, prefix)
    return group_dict_by_key(has_prefix, d)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def groupby_prefix_and_trim(prefix, d):
    """Split ``d`` by key prefix and strip the prefix from the matching keys.

    Returns a tuple ``(trimmed, rest)``: the prefixed entries with
    ``prefix`` removed from their keys, and the remaining entries untouched.
    """
    prefixed, remaining = group_dict_by_key(partial(string_begins_with, prefix), d)
    cut = len(prefix)
    trimmed = {key[cut:]: value for key, value in prefixed.items()}
    return trimmed, remaining
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def num_to_groups(num, divisor):
    """Break ``num`` into chunks of size ``divisor`` plus a final smaller remainder chunk.

    Example: ``num_to_groups(10, 4)`` gives ``[4, 4, 2]``.
    """
    full_groups, leftover = divmod(num, divisor)
    sizes = [divisor] * full_groups
    if leftover > 0:
        sizes.append(leftover)
    return sizes
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def find_first(cond, arr):
    """Return the first element of ``arr`` for which ``cond`` is truthy, or None if there is none."""
    return next((el for el in arr if cond(el)), None)
|