nlpertools 1.0.4__py3-none-any.whl → 1.0.6.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. nlpertools/__init__.py +24 -11
  2. nlpertools/algo/__init__.py +0 -0
  3. nlpertools/algo/ac.py +18 -0
  4. nlpertools/algo/bit_ops.py +28 -0
  5. nlpertools/algo/kmp.py +94 -0
  6. nlpertools/algo/num_ops.py +12 -0
  7. nlpertools/algo/template.py +116 -0
  8. nlpertools/algo/union.py +13 -0
  9. nlpertools/data_client.py +387 -0
  10. nlpertools/data_structure/__init__.py +0 -0
  11. nlpertools/data_structure/base_structure.py +109 -0
  12. nlpertools/dataprocess.py +611 -3
  13. nlpertools/default_db_config.yml +41 -0
  14. nlpertools/io/__init__.py +3 -3
  15. nlpertools/io/dir.py +54 -47
  16. nlpertools/io/file.py +277 -205
  17. nlpertools/ml.py +483 -317
  18. nlpertools/monitor/__init__.py +0 -0
  19. nlpertools/monitor/gpu.py +18 -0
  20. nlpertools/monitor/memory.py +24 -0
  21. nlpertools/movie.py +36 -0
  22. nlpertools/nlpertools_config.yml +1 -0
  23. nlpertools/{openApi.py → open_api.py} +65 -62
  24. nlpertools/other.py +364 -188
  25. nlpertools/pic.py +288 -0
  26. nlpertools/plugin.py +43 -34
  27. nlpertools/reminder.py +98 -15
  28. nlpertools/template/__init__.py +0 -0
  29. nlpertools/utils/__init__.py +3 -0
  30. nlpertools/utils/lazy.py +727 -0
  31. nlpertools/utils/log_util.py +20 -0
  32. nlpertools/utils/package.py +89 -0
  33. nlpertools/utils/package_v1.py +94 -0
  34. nlpertools/utils/package_v2.py +117 -0
  35. nlpertools/utils_for_nlpertools.py +93 -0
  36. nlpertools/vector_index_demo.py +108 -0
  37. nlpertools/wrapper.py +161 -0
  38. {nlpertools-1.0.4.dist-info → nlpertools-1.0.6.dev0.dist-info}/LICENSE +200 -200
  39. nlpertools-1.0.6.dev0.dist-info/METADATA +111 -0
  40. nlpertools-1.0.6.dev0.dist-info/RECORD +43 -0
  41. {nlpertools-1.0.4.dist-info → nlpertools-1.0.6.dev0.dist-info}/WHEEL +1 -1
  42. nlpertools-1.0.6.dev0.dist-info/top_level.txt +2 -0
  43. nlpertools_helper/__init__.py +10 -0
  44. nlpertools-1.0.4.dist-info/METADATA +0 -42
  45. nlpertools-1.0.4.dist-info/RECORD +0 -15
  46. nlpertools-1.0.4.dist-info/top_level.txt +0 -1
nlpertools/data_client.py
@@ -0,0 +1,387 @@
+ #!/usr/bin/python3.8
+ # -*- coding: utf-8 -*-
+ # @Author : youshu.Ji
+ import datetime
+ import json
+ import logging
+
+ from . import DB_CONFIG_FILE
+ from .io.file import read_yaml
+ from .utils.package import *
+
+ # import aioredis
+ # import happybase
+ # import pandas as pd
+ # import pymysql
+ # from elasticsearch import Elasticsearch, helpers
+ # from kafka import KafkaProducer, KafkaConsumer
+ # from pymongo import MongoClient
+
+ logger = logging.getLogger(__name__)
+
+ global_db_config = read_yaml(DB_CONFIG_FILE)
+
+
+ class Neo4jOps(object):
+     # neo4j connection timeout, in seconds
+     # (py2neo itself retries 3 times internally...)
+     NEO4J_TIMEOUT = 0.3
+
+
+ class SqliteOps(object):
+     @staticmethod
+     def demo():
+         # demo of raw sqlite3 usage against a local database file
+         import sqlite3
+         database_path = r'xx.db'
+         conn = sqlite3.connect(database_path)
+         c = conn.cursor()
+         sql = "select name from sqlite_master where type='table' order by name"
+         c.execute(sql)
+         print(c.fetchall())
+         sql = "select * from typecho_contents"
+         c.execute(sql)
+         res = c.fetchall()
+         print(res[3])
+
+         conn.commit()
+         conn.close()
+
+
+ class MysqlOps(object):
+     # pymysql and pandas (pd) are expected to be provided lazily by
+     # `from .utils.package import *` (see the commented imports above)
+     def __init__(self, config=global_db_config["mysql"]):
+         self.db = pymysql.connect(host=config["host"],
+                                   port=config["port"],
+                                   user=config["user"],
+                                   password=config["password"],
+                                   database=config["database"])
+
+     def query(self, sql):
+         df = pd.read_sql(sql, self.db)
+         return df
+
+
+ class EsOps(object):
+     # Elasticsearch and helpers are expected to be provided lazily by
+     # `from .utils.package import *` (see the commented imports above)
+     def __init__(self, config=global_db_config["es"]):
+         self.es = Elasticsearch(
+             host=config["host"], timeout=config["timeout"])
+
+     def search_roll(self, index, body):
+         all_data = []
+         data = self.es.search(index=index, body=body, scroll="5m")
+         all_data.extend(data["hits"]["hits"])
+         scroll_id = data["_scroll_id"]
+         while data["hits"]["hits"]:
+             print(scroll_id[:5])
+             data = self.es.scroll(scroll_id=scroll_id, scroll="5m")
+             scroll_id = data["_scroll_id"]
+             all_data.extend(data["hits"]["hits"])
+         all_data = [i["_source"] for i in all_data]
+         return all_data
+
+     def search_roll_iter(self, index, body):
+         data = self.es.search(index=index, body=body, scroll="5m")
+         scroll_id = data["_scroll_id"]
+         while data["hits"]["hits"]:
+             yield data["hits"]["hits"]
+             data = self.es.scroll(scroll_id=scroll_id, scroll="5m")
+             scroll_id = data["_scroll_id"]
+
+     def search(self, index, body):
+         return self.es.search(index=index, body=body)
+
+     def delete(self, index, body):
+         self.es.delete_by_query(index=index, body=body)
+
+     def save(self, data):
+         # each record in data already carries its _index
+         helpers.bulk(self.es, data)
+
+     def delete_data_by_query(self, index, _project_id, _source_ids):
+         _query = {
+             "query": {
+                 "bool": {
+                     "must": [
+                         {"terms": {"source_id": _source_ids}},
+                         {"term": {"project_id": _project_id}},
+                     ]
+                 }
+             }
+         }
+         _res = self.es.delete_by_query(index=index, body=_query)
+         print(f"delete_data_by_query: {_res}")
+
+     def batch_re_save(self, index, _data, _project_id, _source_ids):
+         self.delete_data_by_query(index, _project_id, _source_ids)
+         _action = [{"_index": index, "_source": i} for i in _data]
+         _res = helpers.bulk(self.es, _action)
+         print(f"bulk save result: {_res}")
+
+
+ class MongoOps(object):
+     def __init__(self, config=global_db_config["mongo"]):
+         from pymongo import MongoClient
+         mongo_client = MongoClient(config["uri"])
+         db = mongo_client[config["db"]]
+         self.collection = db[config["col"]]
+
+     def fetch_all(self):
+         """
+         Fetch all documents.
+         :return:
+         """
+         ans = []
+         print('Fetching all documents.')
+         for record in self.collection.find({}):
+             record['_id'] = str(record['_id'])
+             ans.append(record)
+         return ans
+
+     def load_from_mongo(self, special_value):
+         """
+         Fetch the documents whose field named special_value equals special_value;
+         among the matches, return the one with the shortest "another_value" list.
+         :param
+         :return:
+         """
+         record = self.collection.find({"{}".format(special_value): special_value})
+         record = list(record)
+         if not record:
+             return None
+         else:
+             record = sorted(record, key=lambda x: len(x.get("another_value", [])))[0]
+             return record
+
+     def delete_all(self):
+         query = {}
+         deleted = self.collection.delete_many(query)
+         return deleted
+
+     def delete_by_time(self, time):
+         # NOTE: the time parameter is unused; this deletes documents
+         # whose name starts with "F"
+         query = {"name": {"$regex": "^F"}}
+         deleted = self.collection.delete_many(query)
+
+     def fetch_by_time(self, year=2022, month=7, day=7, hour=7, minute=7, second=7):
+         query = {"query_time": {"$gte": datetime.datetime(year, month, day, hour, minute, second)}}
+         sort_sql = [("query_time", -1)]
+         ans = []
+         print('Fetching all documents.')
+         for record in self.collection.find(query).sort(sort_sql):
+             record['_id'] = str(record['_id'])
+             ans.append(record)
+         return ans
+
+     def save_to_mongo(self, special_value, each_item):
+         """
+         Save data to mongo: push each_item onto the matching document,
+         inserting a new document if none matches.
+         :param special_value:
+         :param each_item:
+         :return:
+         """
+         query = self.collection.find({"{}".format(special_value): special_value})
+         if list(query):
+             self.collection.update_one({"{}".format(special_value): special_value},
+                                        {"$push": {'each_item': each_item}})
+         else:
+             insert_item = {
+                 "special_value": special_value,
+                 "each_item": [each_item]
+             }
+             self.collection.insert_one(insert_item)
+         print("update success")
+
+     def insert_one(self, data):
+         self.collection.insert_one(data)
+
+     def update_to_mongo(self, condition_term, condition_value, new_value):
+         """
+         Find documents by the given field and value, then update them;
+         behaves like an upsert.
+         :param condition_term: name of the condition field
+         :param condition_value: value of the condition field
+         :param new_value: the new value; ideally a dict (non-dict values are untested)
+         :return:
+         """
+         query = self.collection.find({condition_term: condition_value})
+         if list(query):
+             self.collection.update_one({condition_term: condition_value},
+                                        {"$push": new_value})
+         else:
+             insert_item = {
+                 condition_term: condition_value,
+                 "processed_data": new_value
+             }
+             self.collection.insert_one(insert_item)
+         print("update success")
+
+
+ class RedisOps(object):
+     # aioredis is expected to be provided lazily by
+     # `from .utils.package import *` (see the commented imports above)
+     def __init__(self, config=global_db_config["redis"]):
+         redis_max_connections = 1024
+         REDIS_GET_TIMEOUT = 0.1  # currently unused
+         self.redis = aioredis.from_url(config["uri"], max_connections=redis_max_connections)
+
+
+ class HBaseOps(object):
+     """
+     demo:
+     key = 'test'
+     db = HBaseOps()
+     data = db.query_single_line(table='table', row_key=key)
+     print(data)
+     """
+     # happybase is expected to be provided lazily by
+     # `from .utils.package import *` (see the commented imports above)
+
+     def __init__(self, config=global_db_config["hbase"]):
+         self.host = config["DEFAULT_HOST"]
+         self.port = config["DEFAULT_PORT"]
+         self.compat = config["DEFAULT_COMPAT"]
+         self.table_prefix = None  # namespace
+         self.transport = config["DEFAULT_TRANSPORT"]
+         self.protocol = config["DEFAULT_PROTOCOL"]
+         self.conn = self.connect()
+
+     def connect(self):
+         conn = happybase.Connection(host=self.host, port=self.port, timeout=None, autoconnect=True,
+                                     table_prefix=self.table_prefix, compat=self.compat,
+                                     transport=self.transport, protocol=self.protocol)
+         return conn
+
+     def create_hb_table(self, table_name, **families):
+         self.conn.create_table(table_name, families)
+
+     def single_put(self, table_name, row_key, column, data):
+         hb = happybase.Table(table_name, self.conn)
+         hb.put(row_key,
+                data={'{column}:{k}'.format(column=column, k=k): str(v).encode("utf-8") for k, v in data.items()})
+
+     def batch_put(self, table, row_key_name, column, datas, batch_size=1):
+         hb = happybase.Table(table, self.conn)
+         datas_new = [datas[i:i + batch_size] for i in range(0, len(datas), batch_size)]
+         for x in datas_new:
+             with hb.batch(batch_size=batch_size) as batch:
+                 for da in x:
+                     da_nw = {'{column}:{k}'.format(column=column, k=k): v for k, v in da.items()}
+                     row_key = da_nw.pop('{column}:{k}'.format(column=column, k=row_key_name))
+                     batch.put(row_key, da_nw)
+         return batch
+
+     def single_put_self(self, table_name, row_keys, datas):
+         hb = happybase.Table(table_name, self.conn)
+         for row_key, (_, val) in zip(row_keys, datas.items()):
+             hb.put(row_key, {'maybe_table_name:maybe_column_name': "%s" % val[0],
+                              'maybe_table_name:maybe_column_name2': "%s" % val[1]})
+
+     def scan_table(self, table, row_start=None, row_stop=None, include_timestamp=False, limit=None, timestamps=None,
+                    filter=None):
+         hb = happybase.Table(table, self.conn)
+         scan = hb.scan(row_start=row_start, row_stop=row_stop, limit=limit, timestamp=timestamps, filter=filter)
+         hb_dict = dict(scan)
+         if hb_dict:
+             return {k1.decode('utf-8'): {k2.decode('utf-8'): v2.decode('utf-8') for k2, v2 in v1.items()}
+                     for k1, v1 in
+                     hb_dict.items()}
+         else:
+             return {}
+
+     def query_single_line(self, table, row_key):
+         conn = self.connect()
+         hb = happybase.Table(table, conn)
+         hb_dict = hb.row(row_key)
+         if hb_dict:
+             return {k.decode('utf-8'): v.decode('utf-8') for k, v in hb_dict.items()}
+         else:
+             return {}
+
+     def query_multi_lines(self, table, row_keys):
+         hb = happybase.Table(table, self.conn)
+         hb_dict = dict(hb.rows(row_keys))
+         if hb_dict:
+             return {k1.decode('utf-8'): {k2.decode('utf-8'): v2.decode('utf-8') for k2, v2 in v1.items()} for k1, v1 in
+                     hb_dict.items()}
+         else:
+             return {}
+
+     def single_delete(self, table, row_key):
+         hb = happybase.Table(table, self.conn)
+         hb.delete(row_key)
+
+     def test_scan(self, table):
+         hb = happybase.Table(table, self.conn)
+         filter = "SingleColumnValueFilter ('maybe_column_name', 'lang', =, 'regexstring:[regex_string]')"
+         scan = hb.scan(limit=1000, filter=filter)
+
+         hb_dict = dict(scan)
+         if hb_dict:
+             return {k1.decode('utf-8'): {k2.decode('utf-8'): v2.decode('utf-8') for k2, v2 in v1.items()}
+                     for k1, v1 in
+                     hb_dict.items()}
+         else:
+             return {}
+
+     def close(self):
+         self.conn.close()
+
+
+ class KafkaConfig:
+     pass
+
+
+ class KafkaOps(object):
+     # KafkaProducer/KafkaConsumer are expected to be provided lazily by
+     # `from .utils.package import *` (see the commented imports above)
+     def __init__(self, config=global_db_config["kafka"]):
+         self.bootstrap_server = config["bootstrap_server"]
+         self.topic = config["topic"]
+         # raise the request timeout from the default 30s to 60s
+         self.producer = KafkaProducer(value_serializer=lambda v: json.dumps(v).encode('utf-8'),
+                                       bootstrap_servers=self.bootstrap_server,
+                                       acks='all',
+                                       request_timeout_ms=60000)
+
+     def send_data_to_kafka(self, data):
+         try:
+             self.producer.send(self.topic, data)
+             logger.info(f"data send successful! ---- {data}")
+         except Exception as e:
+             logger.exception(f'kafka error occurred ---- {e}')
+
+     def consumer_msg(self):
+         consumer = KafkaConsumer(self.topic, group_id='test-group_id', bootstrap_servers=self.bootstrap_server)
+         for msg in consumer:
+             recv = "%s:%d:%d: key=%s value=%s" % (msg.topic, msg.partition, msg.offset, msg.key, msg.value)
+             print(recv)
+
+
+ class MilvusOps(object):
+     def __init__(self, config=global_db_config["milvus"]):
+         from pymilvus import connections, Collection
+
+         connections.connect("default", host=config["host"], port=config["port"])
+         self.collection = Collection(config["collection"])
+         self.collection.load()
+
+     def get_similarity(self, embedding):
+         search_params = {
+             "metric_type": "L2",
+             "params": {"nprobe": 1},
+         }
+         logger.debug(embedding)
+         result = self.collection.search(
+             [list(embedding)],
+             "vec",
+             search_params,
+             limit=3,
+             output_fields=["pk", "entity_name", "standard_entity_name"],
+         )
+         hits = result[0]
+         entities = []
+         for hit in hits:
+             entities.append(
+                 {
+                     "name": hit.entity.get("entity_name"),
+                     "standard_name": hit.entity.get("standard_entity_name"),
+                 }
+             )
+         return entities
+
+     # def insert(self, collection, entities):
+     #     collection.insert(entities)
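
A quick usage sketch for the new data_client module (not part of the diff): it assumes the package's default_db_config.yml holds valid connection settings and that reachable MongoDB/Elasticsearch instances exist; the index name and query below are illustrative.

    from nlpertools.data_client import EsOps, MongoOps

    mongo = MongoOps()  # connection settings come from global_db_config["mongo"]
    docs = mongo.fetch_all()  # all documents, with _id stringified

    es = EsOps()  # connection settings come from global_db_config["es"]
    hits = es.search(index="my_index", body={"query": {"match_all": {}}})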
nlpertools/data_structure/__init__.py (file without changes)
nlpertools/data_structure/base_structure.py
@@ -0,0 +1,109 @@
+ #!/usr/bin/python3.8
+ # -*- coding: utf-8 -*-
+ # @Author : youshu.Ji
+
+
+ class ListNode:
+     def __init__(self, x):
+         self.length = 1
+         if isinstance(x, int):
+             self.val = x
+             self.next = None
+         else:
+             # build the chain from a list; recursion might be cleaner here
+             pre = ListNode(x.pop(0))
+             head = pre
+             while x:
+                 self.length += 1
+                 pre.next = ListNode(x.pop(0))
+                 pre = pre.next
+             self.val = head.val
+             self.next = head.next
+
+     def add(self):
+         pass
+
+     def __str__(self):
+         # TODO: mark cyclic linked lists in the output
+         print_string = [self.val]
+         tmp = self.next
+         # guard against cyclic linked lists
+         recurrent_num = 0
+         while tmp and recurrent_num <= self.length + 10:
+             recurrent_num += 1
+             print_string.append(tmp.val)
+             tmp = tmp.next
+         return str(print_string)
+
+
+ class TreeNode:
+     def __init__(self, val=0, left=None, right=None):
+         if isinstance(val, list):
+             pass
+         else:
+             self.val = val
+             self.left = left
+             self.right = right
+
+     def build_from_list(self):
+         pass
+
+     def __str__(self):
+         pass
+
+     @staticmethod
+     def pre_order(node):
+         # iterative pre-order traversal (root, left, right), collecting values
+         stack = []
+         res = []
+         while stack or node:
+             while node:
+                 res.append(node.val)
+                 stack.append(node)
+                 node = node.left
+             node = stack.pop(-1)
+             node = node.right
+         return res
+
+     def level_order(self, node):
+         # level-order traversal
+         # recursion is a poor fit here, so iterate
+         # cur_level holds the level currently being traversed
+         cur_level = [node]
+         nxt_level = []
+         res = []
+         while cur_level:
+             while cur_level:
+                 node = cur_level.pop(0)
+                 res.append(node.val)
+                 if node.left:
+                     nxt_level.append(node.left)
+                 if node.right:
+                     nxt_level.append(node.right)
+             cur_level, nxt_level = nxt_level, []
+         return res
+
+     def bfs(self):
+         # how to write an iterative BFS depends on the use case;
+         # keep (node, depth) pairs in the queue
+         pass
+
+     def mid_order(self, node):
+         # recursive in-order traversal; prints each value
+         if node.left:
+             self.mid_order(node.left)
+         print(node.val)
+         if node.right:
+             self.mid_order(node.right)
+
+     def post_order(self, node=None):
+         pass
+
+     def in_order(self, node):
+         # the natural reading order of a BST
+         pass
+
+
+ if __name__ == '__main__':
+     a = ListNode([1, 2, 3, 4])
+     print(a)
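
A minimal sanity check for the traversal helpers above (illustrative, not part of the package): it builds a three-node tree by hand and assumes the module path nlpertools/data_structure/base_structure.py from the file list.

    from nlpertools.data_structure.base_structure import TreeNode

    root = TreeNode(1, left=TreeNode(2), right=TreeNode(3))
    print(TreeNode.pre_order(root))  # [1, 2, 3]
    print(root.level_order(root))    # [1, 2, 3]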