nlpertools 1.0.5__py3-none-any.whl → 1.0.6.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nlpertools/__init__.py +24 -20
- nlpertools/algo/ac.py +18 -0
- nlpertools/algo/bit_ops.py +28 -0
- nlpertools/algo/kmp.py +94 -55
- nlpertools/algo/num_ops.py +12 -0
- nlpertools/algo/template.py +116 -0
- nlpertools/algo/union.py +13 -0
- nlpertools/data_client.py +387 -257
- nlpertools/data_structure/base_structure.py +109 -13
- nlpertools/dataprocess.py +611 -3
- nlpertools/default_db_config.yml +41 -0
- nlpertools/io/__init__.py +3 -3
- nlpertools/io/dir.py +54 -36
- nlpertools/io/file.py +277 -222
- nlpertools/ml.py +483 -460
- nlpertools/monitor/__init__.py +0 -0
- nlpertools/monitor/gpu.py +18 -0
- nlpertools/monitor/memory.py +24 -0
- nlpertools/movie.py +36 -0
- nlpertools/nlpertools_config.yml +1 -0
- nlpertools/{openApi.py → open_api.py} +65 -65
- nlpertools/other.py +364 -249
- nlpertools/pic.py +288 -0
- nlpertools/plugin.py +43 -43
- nlpertools/reminder.py +98 -87
- nlpertools/utils/__init__.py +3 -3
- nlpertools/utils/lazy.py +727 -0
- nlpertools/utils/log_util.py +20 -0
- nlpertools/utils/package.py +89 -76
- nlpertools/utils/package_v1.py +94 -0
- nlpertools/utils/package_v2.py +117 -0
- nlpertools/utils_for_nlpertools.py +93 -93
- nlpertools/vector_index_demo.py +108 -0
- nlpertools/wrapper.py +161 -96
- {nlpertools-1.0.5.dist-info → nlpertools-1.0.6.dev0.dist-info}/LICENSE +200 -200
- nlpertools-1.0.6.dev0.dist-info/METADATA +111 -0
- nlpertools-1.0.6.dev0.dist-info/RECORD +43 -0
- {nlpertools-1.0.5.dist-info → nlpertools-1.0.6.dev0.dist-info}/WHEEL +1 -1
- nlpertools-1.0.6.dev0.dist-info/top_level.txt +2 -0
- nlpertools_helper/__init__.py +10 -0
- nlpertools-1.0.5.dist-info/METADATA +0 -85
- nlpertools-1.0.5.dist-info/RECORD +0 -25
- nlpertools-1.0.5.dist-info/top_level.txt +0 -1
nlpertools/__init__.py
CHANGED
@@ -1,20 +1,24 @@
|
|
1
|
-
#!/usr/bin/python3.8
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
# @Author : youshu.Ji
|
4
|
-
from .algo.kmp import *
|
5
|
-
from .data_structure.base_structure import *
|
6
|
-
from .dataprocess import *
|
7
|
-
from .io.dir import *
|
8
|
-
from .io.file import *
|
9
|
-
from .ml import *
|
10
|
-
from .
|
11
|
-
from .
|
12
|
-
from .
|
13
|
-
from .plugin import *
|
14
|
-
from .reminder import *
|
15
|
-
from .utils_for_nlpertools import *
|
16
|
-
from .wrapper import *
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
1
|
+
#!/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
from .algo.kmp import *
|
5
|
+
from .data_structure.base_structure import *
|
6
|
+
from .dataprocess import *
|
7
|
+
from .io.dir import *
|
8
|
+
from .io.file import *
|
9
|
+
from .ml import *
|
10
|
+
from .open_api import *
|
11
|
+
from .other import *
|
12
|
+
from .pic import *
|
13
|
+
from .plugin import *
|
14
|
+
from .reminder import *
|
15
|
+
from .utils_for_nlpertools import *
|
16
|
+
from .wrapper import *
|
17
|
+
from .monitor import *
|
18
|
+
|
19
|
+
import os
|
20
|
+
|
21
|
+
|
22
|
+
# Path of the default database configuration file that sits next to this module.
DB_CONFIG_FILE = os.path.join(os.path.dirname(__file__), "default_db_config.yml")

# Must match the version of the published distribution (this wheel is
# 1.0.6.dev0); the previous value was left at the old release number.
__version__ = '1.0.6.dev0'
|
nlpertools/algo/ac.py
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
#!/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
from ..io.file import readtxt_list_all_strip
|
5
|
+
|
6
|
+
|
7
|
+
def find_sentence_covered_vocab(vocab, sentences):
    """
    Find which vocabulary words occur in each sentence (Aho-Corasick matching).

    This is a reference implementation: the automaton is rebuilt on every
    call, so when the same vocabulary is matched repeatedly, copy the
    construction out of this function and build it only once.

    :param vocab: iterable of words (str) to look for
    :param sentences: iterable of sentences (str) to scan
    :return: one list per sentence, containing every vocabulary word found in
        that sentence in the order the automaton reports the hits
        (overlapping and repeated occurrences are all included)
    """
    from ahocorasick import Automaton

    atm = Automaton()
    for word in vocab:
        # Store the word itself as the payload so that each hit yields the word.
        atm.add_word(word, word)

    if len(atm) == 0:
        # Nothing to search for; an empty automaton cannot be finalized and
        # iterated, so report "no hits" for every sentence directly.
        return [[] for _ in sentences]

    atm.make_automaton()
    # Automaton.iter yields (end_index, payload) pairs; only the word is kept.
    return [[word for _, word in atm.iter(sentence)] for sentence in sentences]
|
18
|
+
|
@@ -0,0 +1,28 @@
|
|
1
|
+
#!/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
def foo(num):
    """Return the lowest set bit of ``num`` (0 when ``num`` is 0)."""
    negated = -num
    return negated & num
|
6
|
+
|
7
|
+
|
8
|
+
def foo2(num):
    """
    Clear the lowest set bit of ``num``.

    raw: 0 1 2 3 4 5 6 7 8 9
    res: 0 0 0 2 0 4 4 6 0 8
    """
    predecessor = num - 1
    return predecessor & num
|
14
|
+
|
15
|
+
|
16
|
+
def _lowbit(index: int) -> int:
    """
    Isolate the lowest set bit of ``index``.

    raw: 0 1 2 3 4 5 6 7 8 9
    res: 0 1 2 1 4 1 2 1 8 1
    """
    return -index & index
|
22
|
+
|
23
|
+
if __name__ == '__main__':
    # First row: the inputs 0..9; second row: foo2 applied to each of them.
    # Every value is followed by a single space, matching the table in the
    # foo2 docstring.
    print(*range(10), end=" ")
    print()
    print(*map(foo2, range(10)), end=" ")
|
nlpertools/algo/kmp.py
CHANGED
@@ -1,55 +1,94 @@
|
|
1
|
-
#!/usr/bin/python3.8
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
# @Author : youshu.Ji
|
4
|
-
|
5
|
-
def build(pattern_string):
|
6
|
-
"""
|
7
|
-
构建模式串的PMT
|
8
|
-
[zhihu](https://www.zhihu.com/question/21923021/answer/281346746)
|
9
|
-
|
10
|
-
"""
|
11
|
-
# 构建pattern需要回溯的位置,
|
12
|
-
backtrace_points = [0] * len(pattern_string)
|
13
|
-
main_pointer, pattern_pointer = 0, -1
|
14
|
-
backtrace_points[0] = -1
|
15
|
-
while main_pointer < len(pattern_string) - 1:
|
16
|
-
if pattern_pointer == -1 or pattern_string[pattern_pointer] == pattern_string[main_pointer]:
|
17
|
-
main_pointer += 1
|
18
|
-
pattern_pointer += 1
|
19
|
-
backtrace_points[main_pointer] = pattern_pointer
|
20
|
-
else:
|
21
|
-
pattern_pointer = backtrace_points[pattern_pointer]
|
22
|
-
return backtrace_points
|
23
|
-
|
24
|
-
|
25
|
-
def
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
main_pointer
|
35
|
-
|
36
|
-
|
37
|
-
if
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
if
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
1
|
+
#!/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
|
5
|
+
def build(pattern_string):
    """
    Build the backtrack table of the pattern used by KMP matching.

    Entry ``i`` is the pattern index to resume comparing from after a
    mismatch at pattern index ``i``. Entry 0 is the sentinel ``-1``, meaning
    "no prefix left to reuse: advance in the main string".
    [zhihu](https://www.zhihu.com/question/21923021/answer/281346746)

    :param pattern_string: the pattern to preprocess
    :return: list of ints with one entry per pattern character; an empty
        list for an empty pattern
    """
    pattern_length = len(pattern_string)
    if pattern_length == 0:
        # There is no slot to hold the -1 sentinel; return the empty table
        # instead of failing on backtrace_points[0].
        return []

    # Position the pattern has to fall back to for every index.
    backtrace_points = [0] * pattern_length
    backtrace_points[0] = -1
    main_pointer, pattern_pointer = 0, -1
    while main_pointer < pattern_length - 1:
        if pattern_pointer == -1 or pattern_string[pattern_pointer] == pattern_string[main_pointer]:
            # Prefix and suffix grow by one character (or restart from scratch).
            main_pointer += 1
            pattern_pointer += 1
            backtrace_points[main_pointer] = pattern_pointer
        else:
            # Reuse the table built so far to try a shorter prefix.
            pattern_pointer = backtrace_points[pattern_pointer]
    return backtrace_points
|
23
|
+
|
24
|
+
|
25
|
+
def build_2(needle: str):
    """
    Build the partial match table (PMT) of ``needle``.

    ``pmt[i]`` is the length of the longest proper prefix of
    ``needle[:i + 1]`` that is also a suffix of it. This is a more compact
    variant of ``build``: the fallback loop stops at 0 rather than at a -1
    sentinel.

    :param needle: the pattern to preprocess
    :return: list of ints with one entry per character; an empty list for an
        empty needle, so the return type is always a list
    """
    pmt = [0] * len(needle)
    pattern_pointer = 0
    for main_pointer in range(1, len(needle)):
        # Shrink the candidate prefix until it can be extended or is empty.
        while pattern_pointer > 0 and needle[main_pointer] != needle[pattern_pointer]:
            pattern_pointer = pmt[pattern_pointer - 1]
        if needle[main_pointer] == needle[pattern_pointer]:
            pattern_pointer += 1
        pmt[main_pointer] = pattern_pointer
    return pmt
|
41
|
+
|
42
|
+
|
43
|
+
def find_after_build(main_string, pattern_string):
    """
    Locate the first occurrence of ``pattern_string`` in ``main_string`` (KMP).

    The backtrack table is computed up front with ``build``.

    :param main_string: text to search in
    :param pattern_string: pattern to search for
    :return: index in ``main_string`` where the first match starts, or -1
        when there is no match; an empty pattern matches at index 0
    """
    if len(pattern_string) == 0:
        # Handle the empty pattern here: this does not rely on build()
        # accepting an empty pattern.
        return 0

    backtracker = build(pattern_string)
    last_pattern_index = len(pattern_string) - 1
    # Both pointers start at -1, so the first iteration only advances them to
    # 0 without comparing any characters.
    main_pointer, pattern_pointer = -1, -1
    while main_pointer < len(main_string):
        if pattern_pointer == -1 or pattern_string[pattern_pointer] == main_string[main_pointer]:
            if pattern_pointer == last_pattern_index:
                # Whole pattern matched; report where the match starts.
                return main_pointer - last_pattern_index
            pattern_pointer += 1
            main_pointer += 1
        else:
            # Mismatch: fall back in the pattern, keep the main position.
            pattern_pointer = backtracker[pattern_pointer]
    return -1
|
57
|
+
|
58
|
+
|
59
|
+
def find(main_string, pattern_string):
    """
    Report whether ``pattern_string`` occurs in ``main_string`` (KMP).

    The fallback positions are derived from the pattern itself. Deriving them
    from how far the pattern matched at earlier main-string positions is not
    equivalent and reports matches that do not exist (for example "abc"
    inside "abbc").

    :param main_string: text to search in
    :param pattern_string: pattern to search for
    :return: True if the pattern occurs in the text, otherwise False; an
        empty pattern is always found
    """
    pattern_length = len(pattern_string)
    if len(main_string) < pattern_length:
        return False
    if pattern_length == 0:
        return True

    # fallback[i]: length of the longest proper prefix of pattern[:i + 1]
    # that is also a suffix of it.
    fallback = [0] * pattern_length
    matched = 0
    for index in range(1, pattern_length):
        while matched > 0 and pattern_string[index] != pattern_string[matched]:
            matched = fallback[matched - 1]
        if pattern_string[index] == pattern_string[matched]:
            matched += 1
        fallback[index] = matched

    # Scan the text once; `matched` is the number of pattern characters that
    # currently match the text ending at the character being examined.
    matched = 0
    for char in main_string:
        while matched > 0 and char != pattern_string[matched]:
            matched = fallback[matched - 1]
        if char == pattern_string[matched]:
            matched += 1
            if matched == pattern_length:
                return True
    return False
|
80
|
+
|
81
|
+
|
82
|
+
if __name__ == '__main__':
    # Small manual check: print the tables produced by both builders for the
    # same pattern so they can be compared by eye.
    test_main_string = test_pattern_string = "abababc"

    for table_builder in (build, build_2):
        print(table_builder(test_pattern_string))
    # res = find(test_main_string, test_pattern_string)
    # print(res)
    #
    # res = find_after_build(test_main_string, test_pattern_string)
    # print(res)
|
@@ -0,0 +1,116 @@
|
|
1
|
+
#!/usr/bin/python3.8
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# @Author : youshu.Ji
|
4
|
+
from collections import defaultdict
|
5
|
+
|
6
|
+
|
7
|
+
# from sortedcontainers import SortedDict, SortedList
|
8
|
+
|
9
|
+
# 树状数组只能维护前缀“操作和”(前缀和,前缀积,前缀最大最小),而线段树可以维护区间操作和。
|
10
|
+
|
11
|
+
# 线段树
|
12
|
+
class SegmentTree:
    """
    Segment tree (placeholder, nothing is implemented yet).

    https://www.zhihu.com/question/346961479/answer/2274087021
    Property: every node of a segment tree stores the information of one
    interval (segment).
    Original note: values are meant to be inserted through ``add``.
    """
|
20
|
+
|
21
|
+
|
22
|
+
# 树状数组(二进制下标树) 模板
|
23
|
+
class BIT:
    """
    Binary indexed tree (Fenwick tree) template over positions 1..n.

    Nodes are kept in a dict, so only the nodes that were actually updated
    take up memory.

    TODO: notes on this were written in logseq earlier; move them to the web.
    Code comes from a comment under
    https://leetcode.cn/problems/number-of-recent-calls/solutions/1472043/by-ac_oier-evqe/
    """

    def __init__(self, n: int):
        """Create a tree covering the 1-based positions ``1..n``."""
        self.size = n
        self.tree = defaultdict(int)

    @staticmethod
    def _lowbit(index: int) -> int:
        """Return the lowest set bit of ``index`` (0 for 0)."""
        # TODO: also move the notes about this to the web
        return index & -index

    def add(self, index: int, delta: int) -> None:
        """
        Add ``delta`` to the value stored at position ``index``.

        Positions greater than ``size`` are ignored.

        :raises ValueError: if ``index`` is smaller than 1; the lowbit walk
            would otherwise never terminate for such positions
        """
        if index < 1:
            raise ValueError(f"BIT positions are 1-based, got {index}")
        while index <= self.size:
            self.tree[index] += delta
            index += self._lowbit(index)

    def query(self, index: int) -> int:
        """Return the sum of positions ``1..index`` (clamped to ``size``)."""
        index = min(index, self.size)
        res = 0
        while index > 0:
            # .get() keeps reads from creating empty entries in the dict.
            res += self.tree.get(index, 0)
            index -= self._lowbit(index)
        return res

    def sumRange(self, left: int, right: int) -> int:
        """Return the sum of positions ``left..right``, both inclusive."""
        return self.query(right) - self.query(left - 1)
|
57
|
+
|
58
|
+
|
59
|
+
class BITUsageDemo:
    """
    Usage example for ``BIT``: count how many added values lie in a range.

    Each added value bumps the counter at its own position by one, so a range
    sum is the number of occurrences of the values in that range.
    """

    def __init__(self):
        # The demo tree covers positions 1..10.
        self.bit = BIT(10)

    def add(self, x: int):
        """Record one occurrence of the value ``x``."""
        self.bit.add(index=x, delta=1)

    def query(self, x, y):
        """Return how many recorded values lie in ``x..y`` (inclusive)."""
        return self.bit.sumRange(left=x, right=y)
|
72
|
+
|
73
|
+
|
74
|
+
class Trie:
    """
    Prefix tree over the lowercase ASCII letters ``'a'``..``'z'``.

    Every node has 26 child slots, one per letter, and an ``isEnd`` flag
    marking that a stored word ends at this node.
    """

    def __init__(self):
        self.children = [None] * 26
        self.isEnd = False

    @staticmethod
    def _slot(ch: str) -> int:
        """Return the child slot (0..25) of ``ch``, or -1 if it is not a-z."""
        slot = ord(ch) - ord("a")
        # A negative offset must not be used as a list index: it would wrap
        # around and address the slot of a different letter.
        return slot if 0 <= slot < 26 else -1

    def insert(self, word: str) -> None:
        """
        Store ``word`` in the trie.

        :raises ValueError: if ``word`` contains a character outside a-z;
            the trie is left unchanged in that case
        """
        # Validate the whole word first so a rejected word leaves no
        # half-inserted path behind.
        slots = [self._slot(ch) for ch in word]
        if -1 in slots:
            raise ValueError(f"Trie only supports lowercase a-z, got {word!r}")
        node = self
        for slot in slots:
            if not node.children[slot]:
                node.children[slot] = Trie()
            node = node.children[slot]
        node.isEnd = True

    def search_prefix(self, prefix: str):
        """
        Walk down the trie along ``prefix``.

        :return: the node reached after the last character, or None when the
            path does not exist (including when ``prefix`` contains a
            character outside a-z)
        """
        node = self
        for ch in prefix:
            slot = self._slot(ch)
            if slot < 0 or not node.children[slot]:
                return None
            node = node.children[slot]

        return node

    def search(self, word: str) -> bool:
        """Return True if exactly ``word`` was inserted before."""
        node = self.search_prefix(word)
        return node is not None and node.isEnd

    def starts_with(self, prefix: str) -> bool:
        """Return True if some inserted word starts with ``prefix``."""
        return self.search_prefix(prefix) is not None
|
104
|
+
|
105
|
+
|
106
|
+
if __name__ == "__main__":
    # Fenwick tree demo: record the values 1 and 2, then count how many
    # recorded values lie in the range 1..2.
    bit_usage_demo = BITUsageDemo()
    for value in (1, 2):
        bit_usage_demo.add(value)
    print(bit_usage_demo.query(1, 2))
    # Prefix tree usage:
    # obj = Trie()
    # obj.insert(word)
    # param_2 = obj.search(word)
    # param_3 = obj.starts_with(prefix)
|
nlpertools/algo/union.py
ADDED