nlpertools 1.0.5__py3-none-any.whl → 1.0.8__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (49)
  1. nlpertools/__init__.py +23 -20
  2. nlpertools/algo/ac.py +18 -0
  3. nlpertools/algo/bit_ops.py +28 -0
  4. nlpertools/algo/kmp.py +94 -55
  5. nlpertools/algo/num_ops.py +12 -0
  6. nlpertools/algo/template.py +116 -0
  7. nlpertools/algo/union.py +13 -0
  8. nlpertools/cli.py +87 -0
  9. nlpertools/data_client.py +426 -257
  10. nlpertools/data_structure/base_structure.py +109 -13
  11. nlpertools/dataprocess.py +627 -3
  12. nlpertools/default_db_config.yml +41 -0
  13. nlpertools/draw/__init__.py +0 -0
  14. nlpertools/draw/draw.py +83 -0
  15. nlpertools/draw/math_func.py +33 -0
  16. nlpertools/get_2fa.py +0 -0
  17. nlpertools/io/__init__.py +3 -3
  18. nlpertools/io/dir.py +86 -36
  19. nlpertools/io/file.py +283 -222
  20. nlpertools/ml.py +511 -460
  21. nlpertools/monitor/__init__.py +0 -0
  22. nlpertools/monitor/gpu.py +18 -0
  23. nlpertools/monitor/memory.py +24 -0
  24. nlpertools/movie.py +36 -0
  25. nlpertools/nlpertools_config.yml +1 -0
  26. nlpertools/{openApi.py → open_api.py} +65 -65
  27. nlpertools/other.py +475 -249
  28. nlpertools/pic.py +288 -0
  29. nlpertools/plugin.py +43 -43
  30. nlpertools/reminder.py +98 -87
  31. nlpertools/utils/__init__.py +3 -3
  32. nlpertools/utils/lazy.py +727 -0
  33. nlpertools/utils/log_util.py +20 -0
  34. nlpertools/utils/package.py +89 -76
  35. nlpertools/utils/package_v1.py +94 -0
  36. nlpertools/utils/package_v2.py +117 -0
  37. nlpertools/utils_for_nlpertools.py +93 -93
  38. nlpertools/vector_index_demo.py +108 -0
  39. nlpertools/wrapper.py +161 -96
  40. {nlpertools-1.0.5.dist-info → nlpertools-1.0.8.dist-info}/LICENSE +200 -200
  41. nlpertools-1.0.8.dist-info/METADATA +132 -0
  42. nlpertools-1.0.8.dist-info/RECORD +49 -0
  43. {nlpertools-1.0.5.dist-info → nlpertools-1.0.8.dist-info}/WHEEL +1 -1
  44. nlpertools-1.0.8.dist-info/entry_points.txt +2 -0
  45. nlpertools-1.0.8.dist-info/top_level.txt +2 -0
  46. nlpertools_helper/__init__.py +10 -0
  47. nlpertools-1.0.5.dist-info/METADATA +0 -85
  48. nlpertools-1.0.5.dist-info/RECORD +0 -25
  49. nlpertools-1.0.5.dist-info/top_level.txt +0 -1
nlpertools/data_client.py CHANGED
@@ -1,257 +1,426 @@
- # !/usr/bin/python3.8
- # -*- coding: utf-8 -*-
- # @Author : youshu.Ji
- import json
- import logging
-
- from . import DB_CONFIG_FILE
- from .io.file import read_yaml
- from .utils.package import *
-
- # import aioredis
- # import happybase
- # import pandas as pd
- # import pymysql
- # from elasticsearch import Elasticsearch, helpers
- # from kafka import KafkaProducer, KafkaConsumer
- # from pymongo import MongoClient
-
- logger = logging.getLogger(__name__)
-
- global_db_config = read_yaml(DB_CONFIG_FILE)
-
-
- class Neo4jOps(object):
-     # neo4j connection timeout in seconds
-     # py2neo retries 3 times internally...
-     NEO4J_TIMEOUT = 0.3
-     pass
-
-
- class MysqlOps(object):
-     def __init__(self, config=global_db_config["mysql"]):
-         self.db = pymysql.connect(host=config["host"],
-                                   port=config["port"],
-                                   user=config["user"],
-                                   password=config["password"],
-                                   database=config["database"])
-
-     def query(self, sql):
-         df = pd.read_sql(sql, self.db)
-         return df
-
-
- class EsOps(object):
-     def __init__(self, config=global_db_config["es"]):
-         self.es = Elasticsearch(
-             host=config["host"], timeout=config["timeout"])
-
-     def search_roll(self, index, body):
-         all_data = []
-         data = self.es.search(index=index, body=body, scroll="5m")
-         all_data.extend(data["hits"]["hits"])
-         scroll_id = data["_scroll_id"]
-         while data["hits"]["hits"]:
-             print(scroll_id[:5])
-             data = self.es.scroll(scroll_id=scroll_id, scroll="5m")
-             scroll_id = data["_scroll_id"]
-             all_data.extend(data["hits"]["hits"])
-         all_data = [i["_source"] for i in all_data]
-         return all_data
-
-     def search(self, index, body):
-         return self.es.search(index=index, body=body)
-
-     def delete(self, index, body):
-         self.es.delete_by_query(index=index, body=body)
-
-     def save(self, data):
-         # data already contains the index
-         helpers.bulk(self.es, data)
-
-
- class MongoOps(object):
-     def __init__(self, config=global_db_config["mongo"]):
-         mongo_client = MongoClient(config["uri"])
-         db = mongo_client[config["db"]]
-         self.collection = db[config["col"]]
-
-     def fetch_all(self):
-         """
-         Fetch all records.
-         :return:
-         """
-         ans = []
-         print('Fetching all data.')
-         for record in self.collection.find({}):
-             record['_id'] = str(record['_id'])
-             ans.append(record)
-         return ans
-
-     def load_from_mongo(self, special_value):
-         """
-         Fetch the mongodb records whose special_value field equals special_value.
-         :param
-         :return:
-         """
-         record = self.collection.find({"{}".format(special_value): special_value})
-         record = list(record)
-         if not record:
-             return None
-         else:
-             record = sorted(record, key=lambda x: len(x.get("another_value", [])))[0]
-             return record
-
-     def delete_by_time(self, time):
-         query = {"name": {"$regex": "^F"}}
-         deleted = self.collection.delete_many(query)
-
-     def save_to_mongo(self, special_value, each_item):
-         """
-         Save data to mongo.
-         :param special_value:
-         :param each_item:
-         :return:
-         """
-         query = self.collection.find({"{}".format(special_value): special_value})
-         if list(query):
-             self.collection.update_one({"{}".format(special_value): special_value},
-                                        {"$push": {'each_item': each_item}})
-         else:
-             insert_item = {
-                 "special_value": special_value,
-                 "each_item": [each_item]
-             }
-             self.collection.insert_one(insert_item)
-         print("update success")
-
-
- class RedisOps(object):
-     def __init__(self, config=global_db_config["redis"]):
-         REDIS_MAX_CONNECTIONS = 1024
-         REDIS_GET_TIMEOUT = 0.1
-         self.redis = aioredis.from_url(config["uri"], max_connections=REDIS_MAX_CONNECTIONS)
-
-
- class HBaseOps(object):
-     """
-     demo
-     key = 'test'
-     db = HBaseHelper(host=hbase_host)
-     data = db.query_single_line(table='table', row_key=key)
-     print(data)
-     """
-
-     def __init__(self, config=global_db_config["hbase"]):
-         self.host = config["DEFAULT_HOST"]
-         self.port = config["DEFAULT_PORT"]
-         self.compat = config["DEFAULT_COMPAT"]
-         self.table_prefix = None  # namespace
-         self.transport = config["DEFAULT_TRANSPORT"]
-         self.protocol = config["DEFAULT_PROTOCOL"]
-         self.conn = self.connect()
-
-     def connect(self):
-         conn = happybase.Connection(host=self.host, port=self.port, timeout=None, autoconnect=True,
-                                     table_prefix=self.table_prefix, compat=self.compat,
-                                     transport=self.transport, protocol=self.protocol)
-         return conn
-
-     def create_hb_table(self, table_name, **families):
-         self.conn.create_table(table_name, families)
-
-     def single_put(self, table_name, row_key, column, data):
-         hb = happybase.Table(table_name, self.conn)
-         hb.put(row_key,
-                data={'{column}:{k}'.format(column=column, k=k): str(v).encode("utf-8") for k, v in data.items()})
-
-     def batch_put(self, table, row_key_name, column, datas, batch_size=1):
-         hb = happybase.Table(table, self.conn)
-         datas_new = [datas[i:i + batch_size] for i in range(0, len(datas), batch_size)]
-         for x in datas_new:
-             with hb.batch(batch_size=batch_size) as batch:
-                 for da in x:
-                     da_nw = {'{column}:{k}'.format(column=column, k=k): v for k, v in da.items()}
-                     row_key = da_nw.pop('{column}:{k}'.format(column=column, k=row_key_name))
-                     batch.put(row_key, da_nw)
-         return batch
-
-     def single_put_self(self, table_name, row_keys, datas):
-         hb = happybase.Table(table_name, self.conn)
-         for row_key, (_, val) in zip(row_keys, datas.items()):
-             hb.put(row_key, {'maybe_table_name:maybe_column_name': "%s" % val[0],
-                              'maybe_table_name:maybe_column_name2': "%s" % val[1]})
-
-     def scan_table(self, table, row_start=None, row_stop=None, include_timestamp=False, limit=None, timestamps=None,
-                    filter=None):
-         hb = happybase.Table(table, self.conn)
-         scan = hb.scan(row_start=row_start, row_stop=row_stop, limit=limit, timestamp=timestamps, filter=filter)
-         hb_dict = dict(scan)
-         if hb_dict:
-             return {str(k1).decode('utf-8'): {str(k2).decode('utf-8'): str(v2).decode('utf-8') for k2, v2 in v1.items()}
-                     for k1, v1 in
-                     hb_dict.items()}
-         else:
-             return {}
-
-     def query_single_line(self, table, row_key):
-         conn = self.connect()
-         hb = happybase.Table(table, conn)
-         hb_dict = hb.row(row_key)
-         if hb_dict:
-             return {k.decode('utf-8'): v.decode('utf-8') for k, v in hb_dict.items()}
-         else:
-             return {}
-
-     def query_multi_lines(self, table, row_keys):
-         hb = happybase.Table(table, self.conn)
-         hb_dict = dict(hb.rows(row_keys))
-         if hb_dict:
-             return {k1.decode('utf-8'): {k2.decode('utf-8'): v2.decode('utf-8') for k2, v2 in v1.items()} for k1, v1 in
-                     hb_dict.items()}
-         else:
-             return {}
-
-     def single_delete(self, table, row_key):
-         hb = happybase.Table(table, self.conn)
-         hb.delete(row_key)
-
-     def test_scan(self, table):
-         hb = happybase.Table(table, self.conn)
-         filter = "SingleColumnValueFilter ('maybe_column_name', 'lang', =, 'regexstring:[regex_string]')"
-         scan = hb.scan(limit=1000, filter=filter)
-
-         hb_dict = dict(scan)
-         if hb_dict:
-             return {str(k1).decode('utf-8'): {str(k2).decode('utf-8'): str(v2).decode('utf-8') for k2, v2 in v1.items()}
-                     for k1, v1 in
-                     hb_dict.items()}
-         else:
-             return {}
-
-     def close(self):
-         self.conn.close()
-
-
- class KafkaOps():
-     def __init__(self, config=global_db_config["kafka"]):
-         self.bootstrap_server = config["bootstrap_server"]
-         self.topic = config["topic"]
-         # default timeout is 30s; raised to 60s
-         self.producer = KafkaProducer(value_serializer=lambda v: json.dumps(v).encode('utf-8'),
-                                       bootstrap_servers=self.bootstrap_server,
-                                       acks='all',
-                                       request_timeout_ms=60000)
-
-     def send_data_to_kafka(self, data):
-         try:
-             self.producer.send(self.topic, data)
-             logger.info(f"data send successful! ---- {data}")
-         except Exception as e:
-             logger.exception(f'kafka occur error ---- {e}')
-
-     def consumer_msg(self):
-         consumer = KafkaConsumer(self.topic, group_id='test-group_id', bootstrap_servers=self.bootstrap_server)
-         for msg in consumer:
-             recv = "%s:%d:%d: key=%s value=%s" % (msg.topic, msg.partition, msg.offset, msg.key, msg.value)
-             print(recv)
+ #encoding=utf-8
+ # !/usr/bin/python3.8
+ # -*- coding: utf-8 -*-
+ # @Author : youshu.Ji
+ import datetime
+ import json
+ import logging
+
+ from .io.file import read_yaml
+ from .utils.package import *
+ import os
+
+ DB_CONFIG_FILE = os.path.join(os.path.dirname(__file__), "default_db_config.yml")
+
+ # import aioredis
+ # import happybase
+ # import pandas as pd
+ # import pymysql
+ # from elasticsearch import Elasticsearch, helpers
+ # from kafka import KafkaProducer, KafkaConsumer
+ # from pymongo import MongoClient
+
+ logger = logging.getLogger(__name__)
+
+ global_db_config = read_yaml(DB_CONFIG_FILE)
+
+
+ class Neo4jOps(object):
+     # neo4j connection timeout in seconds
+     # py2neo retries 3 times internally...
+     NEO4J_TIMEOUT = 0.3
+     pass
+
+
+ class SqliteOps(object):
+     pass
+     # import sqlite3
+     # database_path = r'xx.db'
+     # conn = sqlite3.connect(database_path)
+     # c = conn.cursor()
+     # sql = "select name from sqlite_master where type='table' order by name"
+     # c.execute(sql)
+     # print(c.fetchall())
+     # sql = "select * from typecho_contents"
+     # c.execute(sql)
+     # res = c.fetchall()
+     # print(res[3])
+     #
+     # conn.commit()
+     # conn.close()
+
+
+ class MysqlOps(object):
+     import pandas as pd
+     def __init__(self, config=global_db_config["mysql"]):
+         self.db = pymysql.connect(host=config["host"],
+                                   port=config["port"],
+                                   user=config["user"],
+                                   password=config["password"],
+                                   database=config["database"])
+
+     def query(self, sql):
+         df = pd.read_sql(sql, self.db)
+         return df
+
+
+ class EsOps(object):
+     from elasticsearch import Elasticsearch, helpers
+     def __init__(self, config=global_db_config["es"]):
+         self.es = Elasticsearch(
+             host=config["host"], timeout=config["timeout"])
+
+     def search_roll(self, index, body):
+         all_data = []
+         data = self.es.search(index=index, body=body, scroll="5m")
+         all_data.extend(data["hits"]["hits"])
+         scroll_id = data["_scroll_id"]
+         while data["hits"]["hits"]:
+             print(scroll_id[:5])
+             data = self.es.scroll(scroll_id=scroll_id, scroll="5m")
+             scroll_id = data["_scroll_id"]
+             all_data.extend(data["hits"]["hits"])
+         all_data = [i["_source"] for i in all_data]
+         return all_data
+
+     def search_roll_iter(self, index, body):
+         data = self.es.search(index=index, body=body, scroll="5m")
+         scroll_id = data["_scroll_id"]
+         while data["hits"]["hits"]:
+             yield data["hits"]["hits"]
+             data = self.es.scroll(scroll_id=scroll_id, scroll="5m")
+             scroll_id = data["_scroll_id"]
+
+     def search(self, index, body):
+         return self.es.search(index=index, body=body)
+
+     def delete(self, index, body):
+         self.es.delete_by_query(index=index, body=body)
+
+     def save(self, data):
+         # data already contains the index
+         helpers.bulk(self.es, data)
+
+     def delete_data_by_query(self, index, _project_id, _source_ids):
+         _query = {
+             "query": {
+                 "bool": {
+                     "must": [
+                         {"terms": {"source_id": _source_ids}},
+                         {"term": {"project_id": _project_id}},
+                     ]
+                 }
+             }
+         }
+         _res = self.es.delete_by_query(index=index, body=_query)
+         print(f"delete_data_by_query: {_res}")
+
+     def batch_re_save(self, index, _data, _project_id, _source_ids):
+         self.delete_data_by_query(index, _project_id, _source_ids)
+         _action = [{"_index": index, "_source": i} for i in _data]
+         _res = helpers.bulk(self.es, _action)
+         print(f"bulk save result: {_res}")
+
+
+ class MongoDB_BETA:
+     def __init__(self, host='localhost', port=27017, db_name=None, collection_name=None):
+         self.host = host
+         self.port = port
+         self.db_name = db_name
+         self.collection_name = collection_name
+         self.client = None
+         self.db = None
+         self.collection = None
+
+     def connect(self):
+         self.client = MongoClient(self.host, self.port)
+         self.db = self.client[self.db_name]
+         self.collection = self.db[self.collection_name]
+
+     def close(self):
+         if self.client:
+             self.client.close()
+
+     def insert_data(self, data):
+         if isinstance(data, list):
+             self.collection.insert_many(data)
+         else:
+             self.collection.insert_one(data)
+
+     def check_data_exists(self, query):
+         """
+         Check whether a record exists in the database.
+         :param query: query condition
+         :return: bool indicating whether the data exists
+         """
+         return self.collection.count_documents(query) > 0
+
+
+
+ class MongoOps(object):
+     from pymongo import MongoClient
+     def __init__(self, config=global_db_config["mongo"]):
+         mongo_client = MongoClient(config["uri"])
+         db = mongo_client[config["db"]]
+         self.collection = db[config["col"]]
+
+     def fetch_all(self):
+         """
+         Fetch all records.
+         :return:
+         """
+         ans = []
+         print('Fetching all data.')
+         for record in self.collection.find({}):
+             record['_id'] = str(record['_id'])
+             ans.append(record)
+         return ans
+
+     def load_from_mongo(self, special_value):
+         """
+         Fetch the mongodb records whose special_value field equals special_value.
+         :param
+         :return:
+         """
+         record = self.collection.find({"{}".format(special_value): special_value})
+         record = list(record)
+         if not record:
+             return None
+         else:
+             record = sorted(record, key=lambda x: len(x.get("another_value", [])))[0]
+             return record
+
+     def delete_all(self):
+         query = {}
+         deleted = self.collection.delete_many(query)
+         return deleted
+
+     def delete_by_time(self, time):
+         query = {"name": {"$regex": "^F"}}
+         deleted = self.collection.delete_many(query)
+
+     def fetch_by_time(self, year=2022, month=7, day=7, hour=7, minute=7, second=7):
+         query = {"query_time": {"$gte": datetime.datetime(year, month, day, hour, minute, second)}}
+         sort_sql = [("query_time", -1)]
+         ans = []
+         print('Fetching all data.')
+         for record in self.collection.find(query).sort(sort_sql):
+             record['_id'] = str(record['_id'])
+             ans.append(record)
+         return ans
+
+     def save_to_mongo(self, special_value, each_item):
+         """
+         Save data to mongo.
+         :param special_value:
+         :param each_item:
+         :return:
+         """
+         query = self.collection.find({"{}".format(special_value): special_value})
+         if list(query):
+             self.collection.update_one({"{}".format(special_value): special_value},
+                                        {"$push": {'each_item': each_item}})
+         else:
+             insert_item = {
+                 "special_value": special_value,
+                 "each_item": [each_item]
+             }
+             self.collection.insert_one(insert_item)
+         print("update success")
+
+     def insert_one(self, data):
+         self.collection.insert_one(data)
+
+     def update_to_mongo(self, condition_term, condition_value, new_value):
+         """
+         Query by the given field and value, then update the matching record in mongo,
+         similar to an update.
+         :param condition_term: condition field name
+         :param condition_value: condition field value
+         :param new_value: new value; preferably a dict (unclear whether non-dict values work)
+         :return:
+         """
+         query = self.collection.find({condition_term: condition_value})
+         if list(query):
+             self.collection.update_one({condition_term: condition_value},
+                                        {"$push": new_value})
+         else:
+             insert_item = {
+                 condition_term: condition_value,
+                 "processed_data": new_value
+             }
+             self.collection.insert_one(insert_item)
+         print("update success")
+
+
+ class RedisOps(object):
+     def __init__(self, config=global_db_config["redis"]):
+         redis_max_connections = 1024
+         REDIS_GET_TIMEOUT = 0.1
+         self.redis = aioredis.from_url(config["uri"], max_connections=redis_max_connections)
+
+
+ class HBaseOps(object):
+     import happybase
+     """
+     demo
+     key = 'test'
+     db = HBaseHelper(host=hbase_host)
+     data = db.query_single_line(table='table', row_key=key)
+     print(data)
+     """
+
+     def __init__(self, config=global_db_config["hbase"]):
+         self.host = config["DEFAULT_HOST"]
+         self.port = config["DEFAULT_PORT"]
+         self.compat = config["DEFAULT_COMPAT"]
+         self.table_prefix = None  # namespace
+         self.transport = config["DEFAULT_TRANSPORT"]
+         self.protocol = config["DEFAULT_PROTOCOL"]
+         self.conn = self.connect()
+
+     def connect(self):
+         conn = happybase.Connection(host=self.host, port=self.port, timeout=None, autoconnect=True,
+                                     table_prefix=self.table_prefix, compat=self.compat,
+                                     transport=self.transport, protocol=self.protocol)
+         return conn
+
+     def create_hb_table(self, table_name, **families):
+         self.conn.create_table(table_name, families)
+
+     def single_put(self, table_name, row_key, column, data):
+         hb = happybase.Table(table_name, self.conn)
+         hb.put(row_key,
+                data={'{column}:{k}'.format(column=column, k=k): str(v).encode("utf-8") for k, v in data.items()})
+
+     def batch_put(self, table, row_key_name, column, datas, batch_size=1):
+         hb = happybase.Table(table, self.conn)
+         datas_new = [datas[i:i + batch_size] for i in range(0, len(datas), batch_size)]
+         for x in datas_new:
+             with hb.batch(batch_size=batch_size) as batch:
+                 for da in x:
+                     da_nw = {'{column}:{k}'.format(column=column, k=k): v for k, v in da.items()}
+                     row_key = da_nw.pop('{column}:{k}'.format(column=column, k=row_key_name))
+                     batch.put(row_key, da_nw)
+         return batch
+
+     def single_put_self(self, table_name, row_keys, datas):
+         hb = happybase.Table(table_name, self.conn)
+         for row_key, (_, val) in zip(row_keys, datas.items()):
+             hb.put(row_key, {'maybe_table_name:maybe_column_name': "%s" % val[0],
+                              'maybe_table_name:maybe_column_name2': "%s" % val[1]})
+
+     def scan_table(self, table, row_start=None, row_stop=None, include_timestamp=False, limit=None, timestamps=None,
+                    filter=None):
+         hb = happybase.Table(table, self.conn)
+         scan = hb.scan(row_start=row_start, row_stop=row_stop, limit=limit, timestamp=timestamps, filter=filter)
+         hb_dict = dict(scan)
+         if hb_dict:
+             return {str(k1).decode('utf-8'): {str(k2).decode('utf-8'): str(v2).decode('utf-8') for k2, v2 in v1.items()}
+                     for k1, v1 in
+                     hb_dict.items()}
+         else:
+             return {}
+
+     def query_single_line(self, table, row_key):
+         conn = self.connect()
+         hb = happybase.Table(table, conn)
+         hb_dict = hb.row(row_key)
+         if hb_dict:
+             return {k.decode('utf-8'): v.decode('utf-8') for k, v in hb_dict.items()}
+         else:
+             return {}
+
+     def query_multi_lines(self, table, row_keys):
+         hb = happybase.Table(table, self.conn)
+         hb_dict = dict(hb.rows(row_keys))
+         if hb_dict:
+             return {k1.decode('utf-8'): {k2.decode('utf-8'): v2.decode('utf-8') for k2, v2 in v1.items()} for k1, v1 in
+                     hb_dict.items()}
+         else:
+             return {}
+
+     def single_delete(self, table, row_key):
+         hb = happybase.Table(table, self.conn)
+         hb.delete(row_key)
+
+     def test_scan(self, table):
+         hb = happybase.Table(table, self.conn)
+         filter = "SingleColumnValueFilter ('maybe_column_name', 'lang', =, 'regexstring:[regex_string]')"
+         scan = hb.scan(limit=1000, filter=filter)
+
+         hb_dict = dict(scan)
+         if hb_dict:
+             return {str(k1).decode('utf-8'): {str(k2).decode('utf-8'): str(v2).decode('utf-8') for k2, v2 in v1.items()}
+                     for k1, v1 in
+                     hb_dict.items()}
+         else:
+             return {}
+
+     def close(self):
+         self.conn.close()
+
+
+ class KafkaConfig():
+     pass
+
+
+ class KafkaOps(object):
+     def __init__(self, config=global_db_config["kafka"]):
+         self.bootstrap_server = config["bootstrap_server"]
+         self.topic = config["topic"]
+         # default timeout is 30s; raised to 60s
+         self.producer = KafkaProducer(value_serializer=lambda v: json.dumps(v).encode('utf-8'),
+                                       bootstrap_servers=self.bootstrap_server,
+                                       acks='all',
+                                       request_timeout_ms=60000)
+
+     def send_data_to_kafka(self, data):
+         try:
+             self.producer.send(self.topic, data)
+             logger.info(f"data send successful! ---- {data}")
+         except Exception as e:
+             logger.exception(f'kafka occur error ---- {e}')
+
+     def consumer_msg(self):
+         consumer = KafkaConsumer(self.topic, group_id='test-group_id', bootstrap_servers=self.bootstrap_server)
+         for msg in consumer:
+             recv = "%s:%d:%d: key=%s value=%s" % (msg.topic, msg.partition, msg.offset, msg.key, msg.value)
+             print(recv)
+
+
+ class MilvusOps(object):
+     def __init__(self, config=global_db_config.milvus):
+         from pymilvus import connections, Collection
+
+         connections.connect("default", host=config.host, port=config.port)
+         self.collection = Collection(config.collection)
+         self.collection.load()
+
+     def get_similarity(self, embedding):
+         search_params = {
+             "metric_type": "L2",
+             "params": {"nprobe": 1},
+         }
+         # # %%
+         logger.debug(embedding)
+         result = self.collection.search(
+             [list(embedding)],
+             "vec",
+             search_params,
+             limit=3,
+             output_fields=["pk", "entity_name", "standard_entity_name"],
+         )
+         hits = result[0]
+         entities = []
+         for hit in hits:
+             entities.append(
+                 {
+                     "name": hit.entity.get("entity_name"),
+                     "standard_name": hit.entity.get("standard_entity_name"),
+                 }
+             )
+         return entities
+
+     # def insert(self, collection, entities):
+     #     collection.insert(entities)
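
Usage note. Two of the additions in 1.0.8, the MongoDB_BETA helper and the EsOps.search_roll_iter scroll generator, could be exercised roughly as in the sketch below. This snippet is not shipped by the package: it assumes pymongo and elasticsearch are installed, a local mongod and Elasticsearch node are reachable, MongoClient resolves through the module's star import of nlpertools.utils.package, and the database, collection, and index names ("demo_db", "docs", "demo_index") are invented for illustration.

    from nlpertools.data_client import MongoDB_BETA, EsOps

    # MongoDB_BETA separates construction from connection.
    mongo = MongoDB_BETA(host="localhost", port=27017,
                         db_name="demo_db", collection_name="docs")  # hypothetical names
    mongo.connect()
    mongo.insert_data({"title": "hello"})                  # dict -> insert_one
    mongo.insert_data([{"title": "a"}, {"title": "b"}])    # list -> insert_many
    print(mongo.check_data_exists({"title": "hello"}))     # True if at least one match
    mongo.close()

    # search_roll_iter yields one scroll page of hits at a time instead of
    # accumulating everything in memory like search_roll.
    es = EsOps()  # reads the "es" section of the bundled default_db_config.yml
    body = {"query": {"match_all": {}}, "size": 1000}
    for page in es.search_roll_iter("demo_index", body):   # hypothetical index name
        for hit in page:
            print(hit["_source"])                          # replace with your own handling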