nlpertools 1.0.5__py3-none-any.whl → 1.0.8__py3-none-any.whl

Files changed (49)
  1. nlpertools/__init__.py +23 -20
  2. nlpertools/algo/ac.py +18 -0
  3. nlpertools/algo/bit_ops.py +28 -0
  4. nlpertools/algo/kmp.py +94 -55
  5. nlpertools/algo/num_ops.py +12 -0
  6. nlpertools/algo/template.py +116 -0
  7. nlpertools/algo/union.py +13 -0
  8. nlpertools/cli.py +87 -0
  9. nlpertools/data_client.py +426 -257
  10. nlpertools/data_structure/base_structure.py +109 -13
  11. nlpertools/dataprocess.py +627 -3
  12. nlpertools/default_db_config.yml +41 -0
  13. nlpertools/draw/__init__.py +0 -0
  14. nlpertools/draw/draw.py +83 -0
  15. nlpertools/draw/math_func.py +33 -0
  16. nlpertools/get_2fa.py +0 -0
  17. nlpertools/io/__init__.py +3 -3
  18. nlpertools/io/dir.py +86 -36
  19. nlpertools/io/file.py +283 -222
  20. nlpertools/ml.py +511 -460
  21. nlpertools/monitor/__init__.py +0 -0
  22. nlpertools/monitor/gpu.py +18 -0
  23. nlpertools/monitor/memory.py +24 -0
  24. nlpertools/movie.py +36 -0
  25. nlpertools/nlpertools_config.yml +1 -0
  26. nlpertools/{openApi.py → open_api.py} +65 -65
  27. nlpertools/other.py +475 -249
  28. nlpertools/pic.py +288 -0
  29. nlpertools/plugin.py +43 -43
  30. nlpertools/reminder.py +98 -87
  31. nlpertools/utils/__init__.py +3 -3
  32. nlpertools/utils/lazy.py +727 -0
  33. nlpertools/utils/log_util.py +20 -0
  34. nlpertools/utils/package.py +89 -76
  35. nlpertools/utils/package_v1.py +94 -0
  36. nlpertools/utils/package_v2.py +117 -0
  37. nlpertools/utils_for_nlpertools.py +93 -93
  38. nlpertools/vector_index_demo.py +108 -0
  39. nlpertools/wrapper.py +161 -96
  40. {nlpertools-1.0.5.dist-info → nlpertools-1.0.8.dist-info}/LICENSE +200 -200
  41. nlpertools-1.0.8.dist-info/METADATA +132 -0
  42. nlpertools-1.0.8.dist-info/RECORD +49 -0
  43. {nlpertools-1.0.5.dist-info → nlpertools-1.0.8.dist-info}/WHEEL +1 -1
  44. nlpertools-1.0.8.dist-info/entry_points.txt +2 -0
  45. nlpertools-1.0.8.dist-info/top_level.txt +2 -0
  46. nlpertools_helper/__init__.py +10 -0
  47. nlpertools-1.0.5.dist-info/METADATA +0 -85
  48. nlpertools-1.0.5.dist-info/RECORD +0 -25
  49. nlpertools-1.0.5.dist-info/top_level.txt +0 -1
nlpertools/data_client.py CHANGED
@@ -1,257 +1,426 @@
- # !/usr/bin/python3.8
- # -*- coding: utf-8 -*-
- # @Author : youshu.Ji
- import json
- import logging
-
- from . import DB_CONFIG_FILE
- from .io.file import read_yaml
- from .utils.package import *
-
- # import aioredis
- # import happybase
- # import pandas as pd
- # import pymysql
- # from elasticsearch import Elasticsearch, helpers
- # from kafka import KafkaProducer, KafkaConsumer
- # from pymongo import MongoClient
-
- logger = logging.getLogger(__name__)
-
- global_db_config = read_yaml(DB_CONFIG_FILE)
-
-
- class Neo4jOps(object):
-     # timeout in seconds for the neo4j connection
-     # py2neo retries 3 times internally...
-     NEO4J_TIMEOUT = 0.3
-     pass
-
-
- class MysqlOps(object):
-     def __init__(self, config=global_db_config["mysql"]):
-         self.db = pymysql.connect(host=config["host"],
-                                   port=config["port"],
-                                   user=config["user"],
-                                   password=config["password"],
-                                   database=config["database"])
-
-     def query(self, sql):
-         df = pd.read_sql(sql, self.db)
-         return df
-
-
- class EsOps(object):
-     def __init__(self, config=global_db_config["es"]):
-         self.es = Elasticsearch(
-             host=config["host"], timeout=config["timeout"])
-
-     def search_roll(self, index, body):
-         all_data = []
-         data = self.es.search(index=index, body=body, scroll="5m")
-         all_data.extend(data["hits"]["hits"])
-         scroll_id = data["_scroll_id"]
-         while data["hits"]["hits"]:
-             print(scroll_id[:5])
-             data = self.es.scroll(scroll_id=scroll_id, scroll="5m")
-             scroll_id = data["_scroll_id"]
-             all_data.extend(data["hits"]["hits"])
-         all_data = [i["_source"] for i in all_data]
-         return all_data
-
-     def search(self, index, body):
-         return self.es.search(index=index, body=body)
-
-     def delete(self, index, body):
-         self.es.delete_by_query(index=index, body=body)
-
-     def save(self, data):
-         # the index is carried inside data
-         helpers.bulk(self.es, data)
-
-
- class MongoOps(object):
-     def __init__(self, config=global_db_config["mongo"]):
-         mongo_client = MongoClient(config["uri"])
-         db = mongo_client[config["db"]]
-         self.collection = db[config["col"]]
-
-     def fetch_all(self):
-         """
-         Read all records.
-         :return:
-         """
-         ans = []
-         print('提取所有数据.')
-         for record in self.collection.find({}):
-             record['_id'] = str(record['_id'])
-             ans.append(record)
-         return ans
-
-     def load_from_mongo(self, special_value):
-         """
-         Read from MongoDB every record whose special_value field equals special_value.
-         :param
-         :return:
-         """
-         record = self.collection.find({"{}".format(special_value): special_value})
-         record = list(record)
-         if not record:
-             return None
-         else:
-             record = sorted(record, key=lambda x: len(x.get("another_value", [])))[0]
-             return record
-
-     def delete_by_time(self, time):
-         query = {"name": {"$regex": "^F"}}
-         deleted = self.collection.delete_many(query)
-
-     def save_to_mongo(self, special_value, each_item):
-         """
-         Save data into MongoDB.
-         :param special_value:
-         :param each_item:
-         :return:
-         """
-         query = self.collection.find({"{}".format(special_value): special_value})
-         if list(query):
-             self.collection.update_one({"{}".format(special_value): special_value},
-                                        {"$push": {'each_item': each_item}})
-         else:
-             insert_item = {
-                 "special_value": special_value,
-                 "each_item": [each_item]
-             }
-             self.collection.insert_one(insert_item)
-         print("update success")
-
-
- class RedisOps(object):
-     def __init__(self, config=global_db_config["redis"]):
-         REDIS_MAX_CONNECTIONS = 1024
-         REDIS_GET_TIMEOUT = 0.1
-         self.redis = aioredis.from_url(config["uri"], max_connections=REDIS_MAX_CONNECTIONS)
-
-
- class HBaseOps(object):
-     """
-     demo
-     key = 'test'
-     db = HBaseHelper(host=hbase_host)
-     data = db.query_single_line(table='table', row_key=key)
-     print(data)
-     """
-
-     def __init__(self, config=global_db_config["hbase"]):
-         self.host = config["DEFAULT_HOST"]
-         self.port = config["DEFAULT_PORT"]
-         self.compat = config["DEFAULT_COMPAT"]
-         self.table_prefix = None  # namespace
-         self.transport = config["DEFAULT_TRANSPORT"]
-         self.protocol = config["DEFAULT_PROTOCOL"]
-         self.conn = self.connect()
-
-     def connect(self):
-         conn = happybase.Connection(host=self.host, port=self.port, timeout=None, autoconnect=True,
-                                     table_prefix=self.table_prefix, compat=self.compat,
-                                     transport=self.transport, protocol=self.protocol)
-         return conn
-
-     def create_hb_table(self, table_name, **families):
-         self.conn.create_table(table_name, families)
-
-     def single_put(self, table_name, row_key, column, data):
-         hb = happybase.Table(table_name, self.conn)
-         hb.put(row_key,
-                data={'{column}:{k}'.format(column=column, k=k): str(v).encode("utf-8") for k, v in data.items()})
-
-     def batch_put(self, table, row_key_name, column, datas, batch_size=1):
-         hb = happybase.Table(table, self.conn)
-         datas_new = [datas[i:i + batch_size] for i in range(0, len(datas), batch_size)]
-         for x in datas_new:
-             with hb.batch(batch_size=batch_size) as batch:
-                 for da in x:
-                     da_nw = {'{column}:{k}'.format(column=column, k=k): v for k, v in da.items()}
-                     row_key = da_nw.pop('{column}:{k}'.format(column=column, k=row_key_name))
-                     batch.put(row_key, da_nw)
-         return batch
-
-     def single_put_self(self, table_name, row_keys, datas):
-         hb = happybase.Table(table_name, self.conn)
-         for row_key, (_, val) in zip(row_keys, datas.items()):
-             hb.put(row_key, {'maybe_table_name:maybe_column_name': "%s" % val[0],
-                              'maybe_table_name:maybe_column_name2': "%s" % val[1]})
-
-     def scan_table(self, table, row_start=None, row_stop=None, include_timestamp=False, limit=None, timestamps=None,
-                    filter=None):
-         hb = happybase.Table(table, self.conn)
-         scan = hb.scan(row_start=row_start, row_stop=row_stop, limit=limit, timestamp=timestamps, filter=filter)
-         hb_dict = dict(scan)
-         if hb_dict:
-             return {str(k1).decode('utf-8'): {str(k2).decode('utf-8'): str(v2).decode('utf-8') for k2, v2 in v1.items()}
-                     for k1, v1 in
-                     hb_dict.items()}
-         else:
-             return {}
-
-     def query_single_line(self, table, row_key):
-         conn = self.connect()
-         hb = happybase.Table(table, conn)
-         hb_dict = hb.row(row_key)
-         if hb_dict:
-             return {k.decode('utf-8'): v.decode('utf-8') for k, v in hb_dict.items()}
-         else:
-             return {}
-
-     def query_multi_lines(self, table, row_keys):
-         hb = happybase.Table(table, self.conn)
-         hb_dict = dict(hb.rows(row_keys))
-         if hb_dict:
-             return {k1.decode('utf-8'): {k2.decode('utf-8'): v2.decode('utf-8') for k2, v2 in v1.items()} for k1, v1 in
-                     hb_dict.items()}
-         else:
-             return {}
-
-     def single_delete(self, table, row_key):
-         hb = happybase.Table(table, self.conn)
-         hb.delete(row_key)
-
-     def test_scan(self, table):
-         hb = happybase.Table(table, self.conn)
-         filter = "SingleColumnValueFilter ('maybe_column_name', 'lang', =, 'regexstring:[regex_string]')"
-         scan = hb.scan(limit=1000, filter=filter)
-
-         hb_dict = dict(scan)
-         if hb_dict:
-             return {str(k1).decode('utf-8'): {str(k2).decode('utf-8'): str(v2).decode('utf-8') for k2, v2 in v1.items()}
-                     for k1, v1 in
-                     hb_dict.items()}
-         else:
-             return {}
-
-     def close(self):
-         self.conn.close()
-
-
- class KafkaOps():
-     def __init__(self, config=global_db_config["kafka"]):
-         self.bootstrap_server = config["bootstrap_server"]
-         self.topic = config["topic"]
-         # default timeout is 30s; raised to 60s
-         self.producer = KafkaProducer(value_serializer=lambda v: json.dumps(v).encode('utf-8'),
-                                       bootstrap_servers=self.bootstrap_server,
-                                       acks='all',
-                                       request_timeout_ms=60000)
-
-     def send_data_to_kafka(self, data):
-         try:
-             self.producer.send(self.topic, data)
-             logger.info(f"data send successful! ---- {data}")
-         except Exception as e:
-             logger.exception(f'kafka occur error ---- {e}')
-
-     def consumer_msg(self):
-         consumer = KafkaConsumer(self.topic, group_id='test-group_id', bootstrap_servers=self.bootstrap_server)
-         for msg in consumer:
-             recv = "%s:%d:%d: key=%s value=%s" % (msg.topic, msg.partition, msg.offset, msg.key, msg.value)
-             print(recv)
+ #encoding=utf-8
+ # !/usr/bin/python3.8
+ # -*- coding: utf-8 -*-
+ # @Author : youshu.Ji
+ import datetime
+ import json
+ import logging
+
+ from .io.file import read_yaml
+ from .utils.package import *
+ import os
+
+ DB_CONFIG_FILE = os.path.join(os.path.dirname(__file__), "default_db_config.yml")
+
+ # import aioredis
+ # import happybase
+ # import pandas as pd
+ # import pymysql
+ # from elasticsearch import Elasticsearch, helpers
+ # from kafka import KafkaProducer, KafkaConsumer
+ # from pymongo import MongoClient
+
+ logger = logging.getLogger(__name__)
+
+ global_db_config = read_yaml(DB_CONFIG_FILE)
+
+
+ class Neo4jOps(object):
+     # timeout in seconds for the neo4j connection
+     # py2neo retries 3 times internally...
+     NEO4J_TIMEOUT = 0.3
+     pass
+
+
+ class SqliteOps(object):
+     pass
+     # import sqlite3
+     # database_path = r'xx.db'
+     # conn = sqlite3.connect(database_path)
+     # c = conn.cursor()
+     # sql = "select name from sqlite_master where type='table' order by name"
+     # c.execute(sql)
+     # print(c.fetchall())
+     # sql = "select * from typecho_contents"
+     # c.execute(sql)
+     # res = c.fetchall()
+     # print(res[3])
+     #
+     # conn.commit()
+     # conn.close()
+
+
+ class MysqlOps(object):
+     import pandas as pd
+     def __init__(self, config=global_db_config["mysql"]):
+         self.db = pymysql.connect(host=config["host"],
+                                   port=config["port"],
+                                   user=config["user"],
+                                   password=config["password"],
+                                   database=config["database"])
+
+     def query(self, sql):
+         df = pd.read_sql(sql, self.db)
+         return df
+
+
+ class EsOps(object):
+     from elasticsearch import Elasticsearch, helpers
+     def __init__(self, config=global_db_config["es"]):
+         self.es = Elasticsearch(
+             host=config["host"], timeout=config["timeout"])
+
+     def search_roll(self, index, body):
+         all_data = []
+         data = self.es.search(index=index, body=body, scroll="5m")
+         all_data.extend(data["hits"]["hits"])
+         scroll_id = data["_scroll_id"]
+         while data["hits"]["hits"]:
+             print(scroll_id[:5])
+             data = self.es.scroll(scroll_id=scroll_id, scroll="5m")
+             scroll_id = data["_scroll_id"]
+             all_data.extend(data["hits"]["hits"])
+         all_data = [i["_source"] for i in all_data]
+         return all_data
+
+     def search_roll_iter(self, index, body):
+         data = self.es.search(index=index, body=body, scroll="5m")
+         scroll_id = data["_scroll_id"]
+         while data["hits"]["hits"]:
+             yield data["hits"]["hits"]
+             data = self.es.scroll(scroll_id=scroll_id, scroll="5m")
+             scroll_id = data["_scroll_id"]
+
+     def search(self, index, body):
+         return self.es.search(index=index, body=body)
+
+     def delete(self, index, body):
+         self.es.delete_by_query(index=index, body=body)
+
+     def save(self, data):
+         # the index is carried inside data
+         helpers.bulk(self.es, data)
+
+     def delete_data_by_query(self, index, _project_id, _source_ids):
+         _query = {
+             "query": {
+                 "bool": {
+                     "must": [
+                         {"terms": {"source_id": _source_ids}},
+                         {"term": {"project_id": _project_id}},
+                     ]
+                 }
+             }
+         }
+         _res = self.es.delete_by_query(index=index, body=_query)
+         print(f"delete_data_by_query: {_res}")
+
+     def batch_re_save(self, index, _data, _project_id, _source_ids):
+         self.delete_data_by_query(_project_id, _source_ids)
+         _action = [{"_index": index, "_source": i} for i in _data]
+         _res = helpers.bulk(self.es, _action)
+         print(f"批量保存数据: {_res}")
+
+
+ class MongoDB_BETA:
+     def __init__(self, host='localhost', port=27017, db_name=None, collection_name=None):
+         self.host = host
+         self.port = port
+         self.db_name = db_name
+         self.collection_name = collection_name
+         self.client = None
+         self.db = None
+         self.collection = None
+
+     def connect(self):
+         self.client = MongoClient(self.host, self.port)
+         self.db = self.client[self.db_name]
+         self.collection = self.db[self.collection_name]
+
+     def close(self):
+         if self.client:
+             self.client.close()
+
+     def insert_data(self, data):
+         if isinstance(data, list):
+             self.collection.insert_many(data)
+         else:
+             self.collection.insert_one(data)
+
+     def check_data_exists(self, query):
+         """
+         Check whether a record exists in the database.
+         :param query: query condition
+         :return: bool indicating whether the data exists
+         """
+         return self.collection.count_documents(query) > 0
+
+
+
+ class MongoOps(object):
+     from pymongo import MongoClient
+     def __init__(self, config=global_db_config["mongo"]):
+         mongo_client = MongoClient(config["uri"])
+         db = mongo_client[config["db"]]
+         self.collection = db[config["col"]]
+
+     def fetch_all(self):
+         """
+         Read all records.
+         :return:
+         """
+         ans = []
+         print('提取所有数据.')
+         for record in self.collection.find({}):
+             record['_id'] = str(record['_id'])
+             ans.append(record)
+         return ans
+
+     def load_from_mongo(self, special_value):
+         """
+         Read from MongoDB every record whose special_value field equals special_value.
+         :param
+         :return:
+         """
+         record = self.collection.find({"{}".format(special_value): special_value})
+         record = list(record)
+         if not record:
+             return None
+         else:
+             record = sorted(record, key=lambda x: len(x.get("another_value", [])))[0]
+             return record
+
+     def delete_all(self):
+         query = {}
+         deleted = self.collection.delete_many(query)
+         return deleted
+
+     def delete_by_time(self, time):
+         query = {"name": {"$regex": "^F"}}
+         deleted = self.collection.delete_many(query)
+
+     def fetch_by_time(self, year=2022, month=7, day=7, hour=7, minute=7, second=7):
+         query = {"query_time": {"$gte": datetime.datetime(year, month, day, hour, minute, second)}}
+         sort_sql = [("query_time", -1)]
+         ans = []
+         print('提取所有数据.')
+         for record in self.collection.find(query).sort(sort_sql):
+             record['_id'] = str(record['_id'])
+             ans.append(record)
+         return ans
+
+     def save_to_mongo(self, special_value, each_item):
+         """
+         Save data into MongoDB.
+         :param special_value:
+         :param each_item:
+         :return:
+         """
+         query = self.collection.find({"{}".format(special_value): special_value})
+         if list(query):
+             self.collection.update_one({"{}".format(special_value): special_value},
+                                        {"$push": {'each_item': each_item}})
+         else:
+             insert_item = {
+                 "special_value": special_value,
+                 "each_item": [each_item]
+             }
+             self.collection.insert_one(insert_item)
+         print("update success")
+
+     def insert_one(self, data):
+         self.collection.insert_one(data)
+
+     def update_to_mongo(self, condition_term, condition_value, new_value):
+         """
+         Query the matching record by the given field and value, then update it in MongoDB
+         (similar to an update).
+         :param condition_term: condition field name
+         :param condition_value: condition field value
+         :param new_value: the new value; preferably a dict (non-dict values are untested)
+         :return:
+         """
+         query = self.collection.find({condition_term: condition_value})
+         if list(query):
+             self.collection.update_one({condition_term: condition_value},
+                                        {"$push": new_value})
+         else:
+             insert_item = {
+                 condition_term: condition_value,
+                 "processed_data": new_value
+             }
+             self.collection.insert_one(insert_item)
+         print("update success")
+
+
+ class RedisOps(object):
+     def __init__(self, config=global_db_config["redis"]):
+         redis_max_connections = 1024
+         REDIS_GET_TIMEOUT = 0.1
+         self.redis = aioredis.from_url(config["uri"], max_connections=redis_max_connections)
+
+
+ class HBaseOps(object):
+     import happybase
+     """
+     demo
+     key = 'test'
+     db = HBaseHelper(host=hbase_host)
+     data = db.query_single_line(table='table', row_key=key)
+     print(data)
+     """
+
+     def __init__(self, config=global_db_config["hbase"]):
+         self.host = config["DEFAULT_HOST"]
+         self.port = config["DEFAULT_PORT"]
+         self.compat = config["DEFAULT_COMPAT"]
+         self.table_prefix = None  # namespace
+         self.transport = config["DEFAULT_TRANSPORT"]
+         self.protocol = config["DEFAULT_PROTOCOL"]
+         self.conn = self.connect()
+
+     def connect(self):
+         conn = happybase.Connection(host=self.host, port=self.port, timeout=None, autoconnect=True,
+                                     table_prefix=self.table_prefix, compat=self.compat,
+                                     transport=self.transport, protocol=self.protocol)
+         return conn
+
+     def create_hb_table(self, table_name, **families):
+         self.conn.create_table(table_name, families)
+
+     def single_put(self, table_name, row_key, column, data):
+         hb = happybase.Table(table_name, self.conn)
+         hb.put(row_key,
+                data={'{column}:{k}'.format(column=column, k=k): str(v).encode("utf-8") for k, v in data.items()})
+
+     def batch_put(self, table, row_key_name, column, datas, batch_size=1):
+         hb = happybase.Table(table, self.conn)
+         datas_new = [datas[i:i + batch_size] for i in range(0, len(datas), batch_size)]
+         for x in datas_new:
+             with hb.batch(batch_size=batch_size) as batch:
+                 for da in x:
+                     da_nw = {'{column}:{k}'.format(column=column, k=k): v for k, v in da.items()}
+                     row_key = da_nw.pop('{column}:{k}'.format(column=column, k=row_key_name))
+                     batch.put(row_key, da_nw)
+         return batch
+
+     def single_put_self(self, table_name, row_keys, datas):
+         hb = happybase.Table(table_name, self.conn)
+         for row_key, (_, val) in zip(row_keys, datas.items()):
+             hb.put(row_key, {'maybe_table_name:maybe_column_name': "%s" % val[0],
+                              'maybe_table_name:maybe_column_name2': "%s" % val[1]})
+
+     def scan_table(self, table, row_start=None, row_stop=None, include_timestamp=False, limit=None, timestamps=None,
+                    filter=None):
+         hb = happybase.Table(table, self.conn)
+         scan = hb.scan(row_start=row_start, row_stop=row_stop, limit=limit, timestamp=timestamps, filter=filter)
+         hb_dict = dict(scan)
+         if hb_dict:
+             return {str(k1).decode('utf-8'): {str(k2).decode('utf-8'): str(v2).decode('utf-8') for k2, v2 in v1.items()}
+                     for k1, v1 in
+                     hb_dict.items()}
+         else:
+             return {}
+
+     def query_single_line(self, table, row_key):
+         conn = self.connect()
+         hb = happybase.Table(table, conn)
+         hb_dict = hb.row(row_key)
+         if hb_dict:
+             return {k.decode('utf-8'): v.decode('utf-8') for k, v in hb_dict.items()}
+         else:
+             return {}
+
+     def query_multi_lines(self, table, row_keys):
+         hb = happybase.Table(table, self.conn)
+         hb_dict = dict(hb.rows(row_keys))
+         if hb_dict:
+             return {k1.decode('utf-8'): {k2.decode('utf-8'): v2.decode('utf-8') for k2, v2 in v1.items()} for k1, v1 in
+                     hb_dict.items()}
+         else:
+             return {}
+
+     def single_delete(self, table, row_key):
+         hb = happybase.Table(table, self.conn)
+         hb.delete(row_key)
+
+     def test_scan(self, table):
+         hb = happybase.Table(table, self.conn)
+         filter = "SingleColumnValueFilter ('maybe_column_name', 'lang', =, 'regexstring:[regex_string]')"
+         scan = hb.scan(limit=1000, filter=filter)
+
+         hb_dict = dict(scan)
+         if hb_dict:
+             return {str(k1).decode('utf-8'): {str(k2).decode('utf-8'): str(v2).decode('utf-8') for k2, v2 in v1.items()}
+                     for k1, v1 in
+                     hb_dict.items()}
+         else:
+             return {}
+
+     def close(self):
+         self.conn.close()
+
+
+ class KafkaConfig():
+     pass
+
+
+ class KafkaOps(object):
+     def __init__(self, config=global_db_config["kafka"]):
+         self.bootstrap_server = config["bootstrap_server"]
+         self.topic = config["topic"]
+         # default timeout is 30s; raised to 60s
+         self.producer = KafkaProducer(value_serializer=lambda v: json.dumps(v).encode('utf-8'),
+                                       bootstrap_servers=self.bootstrap_server,
+                                       acks='all',
+                                       request_timeout_ms=60000)
+
+     def send_data_to_kafka(self, data):
+         try:
+             self.producer.send(self.topic, data)
+             logger.info(f"data send successful! ---- {data}")
+         except Exception as e:
+             logger.exception(f'kafka occur error ---- {e}')
+
+     def consumer_msg(self):
+         consumer = KafkaConsumer(self.topic, group_id='test-group_id', bootstrap_servers=self.bootstrap_server)
+         for msg in consumer:
+             recv = "%s:%d:%d: key=%s value=%s" % (msg.topic, msg.partition, msg.offset, msg.key, msg.value)
+             print(recv)
+
+
+ class MilvusOps(object):
+     def __init__(self, config=global_db_config.milvus):
+         from pymilvus import connections, Collection
+
+         connections.connect("default", host=config.host, port=config.port)
+         self.collection = Collection(config.collection)
+         self.collection.load()
+
+     def get_similarity(self, embedding):
+         search_params = {
+             "metric_type": "L2",
+             "params": {"nprobe": 1},
+         }
+         # # %%
+         logger.debug(embedding)
+         result = self.collection.search(
+             [list(embedding)],
+             "vec",
+             search_params,
+             limit=3,
+             output_fields=["pk", "entity_name", "standard_entity_name"],
+         )
+         hits = result[0]
+         entities = []
+         for hit in hits:
+             entities.append(
+                 {
+                     "name": hit.entity.get("entity_name"),
+                     "standard_name": hit.entity.get("standard_entity_name"),
+                 }
+             )
+         return entities
+
+     # def insert(self, collection, entities):
+     #     collection.insert(entities)
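
A minimal usage sketch for the new EsOps.search_roll_iter generator introduced in 1.0.8; the index name and query body below are illustrative assumptions, and the connection falls back to the bundled default_db_config.yml unless a config dict is passed:

from nlpertools.data_client import EsOps

es = EsOps()  # reads the "es" section of default_db_config.yml by default
body = {"query": {"match_all": {}}, "size": 1000}
for batch in es.search_roll_iter(index="my_index", body=body):  # "my_index" is a placeholder
    for hit in batch:
        print(hit["_source"])  # each scroll page is yielded as a list of hits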
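
A short sketch of the new MongoDB_BETA helper, assuming a local MongoDB instance and hypothetical database/collection names:

from nlpertools.data_client import MongoDB_BETA

store = MongoDB_BETA(host="localhost", port=27017,
                     db_name="demo_db", collection_name="demo_col")  # names are placeholders
store.connect()
store.insert_data({"doc_id": 1, "text": "hello"})   # a single dict goes through insert_one
store.insert_data([{"doc_id": 2}, {"doc_id": 3}])   # a list goes through insert_many
print(store.check_data_exists({"doc_id": 1}))       # True if a matching record exists
store.close()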
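
MilvusOps reads host, port and collection as attributes of its config, so this sketch passes a simple namespace object; the host, port, collection name and vector dimension are assumptions for the example:

from types import SimpleNamespace
from nlpertools.data_client import MilvusOps

cfg = SimpleNamespace(host="127.0.0.1", port="19530", collection="entities")  # placeholder values
ops = MilvusOps(config=cfg)
matches = ops.get_similarity([0.1] * 128)  # 128-dim query vector, dimension assumed
print(matches)  # [{"name": ..., "standard_name": ...}, ...] for the top-3 hits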