nlpertools 1.0.5__py3-none-any.whl → 1.0.6.dev0__py3-none-any.whl

Files changed (43)
  1. nlpertools/__init__.py +24 -20
  2. nlpertools/algo/ac.py +18 -0
  3. nlpertools/algo/bit_ops.py +28 -0
  4. nlpertools/algo/kmp.py +94 -55
  5. nlpertools/algo/num_ops.py +12 -0
  6. nlpertools/algo/template.py +116 -0
  7. nlpertools/algo/union.py +13 -0
  8. nlpertools/data_client.py +387 -257
  9. nlpertools/data_structure/base_structure.py +109 -13
  10. nlpertools/dataprocess.py +611 -3
  11. nlpertools/default_db_config.yml +41 -0
  12. nlpertools/io/__init__.py +3 -3
  13. nlpertools/io/dir.py +54 -36
  14. nlpertools/io/file.py +277 -222
  15. nlpertools/ml.py +483 -460
  16. nlpertools/monitor/__init__.py +0 -0
  17. nlpertools/monitor/gpu.py +18 -0
  18. nlpertools/monitor/memory.py +24 -0
  19. nlpertools/movie.py +36 -0
  20. nlpertools/nlpertools_config.yml +1 -0
  21. nlpertools/{openApi.py → open_api.py} +65 -65
  22. nlpertools/other.py +364 -249
  23. nlpertools/pic.py +288 -0
  24. nlpertools/plugin.py +43 -43
  25. nlpertools/reminder.py +98 -87
  26. nlpertools/utils/__init__.py +3 -3
  27. nlpertools/utils/lazy.py +727 -0
  28. nlpertools/utils/log_util.py +20 -0
  29. nlpertools/utils/package.py +89 -76
  30. nlpertools/utils/package_v1.py +94 -0
  31. nlpertools/utils/package_v2.py +117 -0
  32. nlpertools/utils_for_nlpertools.py +93 -93
  33. nlpertools/vector_index_demo.py +108 -0
  34. nlpertools/wrapper.py +161 -96
  35. {nlpertools-1.0.5.dist-info → nlpertools-1.0.6.dev0.dist-info}/LICENSE +200 -200
  36. nlpertools-1.0.6.dev0.dist-info/METADATA +111 -0
  37. nlpertools-1.0.6.dev0.dist-info/RECORD +43 -0
  38. {nlpertools-1.0.5.dist-info → nlpertools-1.0.6.dev0.dist-info}/WHEEL +1 -1
  39. nlpertools-1.0.6.dev0.dist-info/top_level.txt +2 -0
  40. nlpertools_helper/__init__.py +10 -0
  41. nlpertools-1.0.5.dist-info/METADATA +0 -85
  42. nlpertools-1.0.5.dist-info/RECORD +0 -25
  43. nlpertools-1.0.5.dist-info/top_level.txt +0 -1
nlpertools/data_client.py CHANGED
@@ -1,257 +1,387 @@
- # !/usr/bin/python3.8
- # -*- coding: utf-8 -*-
- # @Author : youshu.Ji
- import json
- import logging
-
- from . import DB_CONFIG_FILE
- from .io.file import read_yaml
- from .utils.package import *
-
- # import aioredis
- # import happybase
- # import pandas as pd
- # import pymysql
- # from elasticsearch import Elasticsearch, helpers
- # from kafka import KafkaProducer, KafkaConsumer
- # from pymongo import MongoClient
-
- logger = logging.getLogger(__name__)
-
- global_db_config = read_yaml(DB_CONFIG_FILE)
-
-
- class Neo4jOps(object):
-     # timeout, in seconds, for the neo4j connection
-     # py2neo internally retries 3 times...
-     NEO4J_TIMEOUT = 0.3
-     pass
-
-
- class MysqlOps(object):
-     def __init__(self, config=global_db_config["mysql"]):
-         self.db = pymysql.connect(host=config["host"],
-                                   port=config["port"],
-                                   user=config["user"],
-                                   password=config["password"],
-                                   database=config["database"])
-
-     def query(self, sql):
-         df = pd.read_sql(sql, self.db)
-         return df
-
-
- class EsOps(object):
-     def __init__(self, config=global_db_config["es"]):
-         self.es = Elasticsearch(
-             host=config["host"], timeout=config["timeout"])
-
-     def search_roll(self, index, body):
-         all_data = []
-         data = self.es.search(index=index, body=body, scroll="5m")
-         all_data.extend(data["hits"]["hits"])
-         scroll_id = data["_scroll_id"]
-         while data["hits"]["hits"]:
-             print(scroll_id[:5])
-             data = self.es.scroll(scroll_id=scroll_id, scroll="5m")
-             scroll_id = data["_scroll_id"]
-             all_data.extend(data["hits"]["hits"])
-         all_data = [i["_source"] for i in all_data]
-         return all_data
-
-     def search(self, index, body):
-         return self.es.search(index=index, body=body)
-
-     def delete(self, index, body):
-         self.es.delete_by_query(index=index, body=body)
-
-     def save(self, data):
-         # the index is already included in data
-         helpers.bulk(self.es, data)
-
-
- class MongoOps(object):
-     def __init__(self, config=global_db_config["mongo"]):
-         mongo_client = MongoClient(config["uri"])
-         db = mongo_client[config["db"]]
-         self.collection = db[config["col"]]
-
-     def fetch_all(self):
-         """
-         Fetch all records.
-         :return:
-         """
-         ans = []
-         print('提取所有数据.')
-         for record in self.collection.find({}):
-             record['_id'] = str(record['_id'])
-             ans.append(record)
-         return ans
-
-     def load_from_mongo(self, special_value):
-         """
-         Fetch from MongoDB all documents whose special_value field equals special_value.
-         :param
-         :return:
-         """
-         record = self.collection.find({"{}".format(special_value): special_value})
-         record = list(record)
-         if not record:
-             return None
-         else:
-             record = sorted(record, key=lambda x: len(x.get("another_value", [])))[0]
-             return record
-
-     def delete_by_time(self, time):
-         query = {"name": {"$regex": "^F"}}
-         deleted = self.collection.delete_many(query)
-
-     def save_to_mongo(self, special_value, each_item):
-         """
-         Save data into mongo.
-         :param special_value:
-         :param each_item:
-         :return:
-         """
-         query = self.collection.find({"{}".format(special_value): special_value})
-         if list(query):
-             self.collection.update_one({"{}".format(special_value): special_value},
-                                        {"$push": {'each_item': each_item}})
-         else:
-             insert_item = {
-                 "special_value": special_value,
-                 "each_item": [each_item]
-             }
-             self.collection.insert_one(insert_item)
-         print("update success")
-
-
- class RedisOps(object):
-     def __init__(self, config=global_db_config["redis"]):
-         REDIS_MAX_CONNECTIONS = 1024
-         REDIS_GET_TIMEOUT = 0.1
-         self.redis = aioredis.from_url(config["uri"], max_connections=REDIS_MAX_CONNECTIONS)
-
-
- class HBaseOps(object):
-     """
-     demo
-     key = 'test'
-     db = HBaseHelper(host=hbase_host)
-     data = db.query_single_line(table='table', row_key=key)
-     print(data)
-     """
-
-     def __init__(self, config=global_db_config["hbase"]):
-         self.host = config["DEFAULT_HOST"]
-         self.port = config["DEFAULT_PORT"]
-         self.compat = config["DEFAULT_COMPAT"]
-         self.table_prefix = None  # namespace
-         self.transport = config["DEFAULT_TRANSPORT"]
-         self.protocol = config["DEFAULT_PROTOCOL"]
-         self.conn = self.connect()
-
-     def connect(self):
-         conn = happybase.Connection(host=self.host, port=self.port, timeout=None, autoconnect=True,
-                                     table_prefix=self.table_prefix, compat=self.compat,
-                                     transport=self.transport, protocol=self.protocol)
-         return conn
-
-     def create_hb_table(self, table_name, **families):
-         self.conn.create_table(table_name, families)
-
-     def single_put(self, table_name, row_key, column, data):
-         hb = happybase.Table(table_name, self.conn)
-         hb.put(row_key,
-                data={'{column}:{k}'.format(column=column, k=k): str(v).encode("utf-8") for k, v in data.items()})
-
-     def batch_put(self, table, row_key_name, column, datas, batch_size=1):
-         hb = happybase.Table(table, self.conn)
-         datas_new = [datas[i:i + batch_size] for i in range(0, len(datas), batch_size)]
-         for x in datas_new:
-             with hb.batch(batch_size=batch_size) as batch:
-                 for da in x:
-                     da_nw = {'{column}:{k}'.format(column=column, k=k): v for k, v in da.items()}
-                     row_key = da_nw.pop('{column}:{k}'.format(column=column, k=row_key_name))
-                     batch.put(row_key, da_nw)
-         return batch
-
-     def single_put_self(self, table_name, row_keys, datas):
-         hb = happybase.Table(table_name, self.conn)
-         for row_key, (_, val) in zip(row_keys, datas.items()):
-             hb.put(row_key, {'maybe_table_name:maybe_column_name': "%s" % val[0],
-                              'maybe_table_name:maybe_column_name2': "%s" % val[1]})
-
-     def scan_table(self, table, row_start=None, row_stop=None, include_timestamp=False, limit=None, timestamps=None,
-                    filter=None):
-         hb = happybase.Table(table, self.conn)
-         scan = hb.scan(row_start=row_start, row_stop=row_stop, limit=limit, timestamp=timestamps, filter=filter)
-         hb_dict = dict(scan)
-         if hb_dict:
-             return {str(k1).decode('utf-8'): {str(k2).decode('utf-8'): str(v2).decode('utf-8') for k2, v2 in v1.items()}
-                     for k1, v1 in
-                     hb_dict.items()}
-         else:
-             return {}
-
-     def query_single_line(self, table, row_key):
-         conn = self.connect()
-         hb = happybase.Table(table, conn)
-         hb_dict = hb.row(row_key)
-         if hb_dict:
-             return {k.decode('utf-8'): v.decode('utf-8') for k, v in hb_dict.items()}
-         else:
-             return {}
-
-     def query_multi_lines(self, table, row_keys):
-         hb = happybase.Table(table, self.conn)
-         hb_dict = dict(hb.rows(row_keys))
-         if hb_dict:
-             return {k1.decode('utf-8'): {k2.decode('utf-8'): v2.decode('utf-8') for k2, v2 in v1.items()} for k1, v1 in
-                     hb_dict.items()}
-         else:
-             return {}
-
-     def single_delete(self, table, row_key):
-         hb = happybase.Table(table, self.conn)
-         hb.delete(row_key)
-
-     def test_scan(self, table):
-         hb = happybase.Table(table, self.conn)
-         filter = "SingleColumnValueFilter ('maybe_column_name', 'lang', =, 'regexstring:[regex_string]')"
-         scan = hb.scan(limit=1000, filter=filter)
-
-         hb_dict = dict(scan)
-         if hb_dict:
-             return {str(k1).decode('utf-8'): {str(k2).decode('utf-8'): str(v2).decode('utf-8') for k2, v2 in v1.items()}
-                     for k1, v1 in
-                     hb_dict.items()}
-         else:
-             return {}
-
-     def close(self):
-         self.conn.close()
-
-
- class KafkaOps():
-     def __init__(self, config=global_db_config["kafka"]):
-         self.bootstrap_server = config["bootstrap_server"]
-         self.topic = config["topic"]
-         # the default timeout is 30s; changed to 60s
-         self.producer = KafkaProducer(value_serializer=lambda v: json.dumps(v).encode('utf-8'),
-                                       bootstrap_servers=self.bootstrap_server,
-                                       acks='all',
-                                       request_timeout_ms=60000)
-
-     def send_data_to_kafka(self, data):
-         try:
-             self.producer.send(self.topic, data)
-             logger.info(f"data send successful! ---- {data}")
-         except Exception as e:
-             logger.exception(f'kafka occur error ---- {e}')
-
-     def consumer_msg(self):
-         consumer = KafkaConsumer(self.topic, group_id='test-group_id', bootstrap_servers=self.bootstrap_server)
-         for msg in consumer:
-             recv = "%s:%d:%d: key=%s value=%s" % (msg.topic, msg.partition, msg.offset, msg.key, msg.value)
-             print(recv)
+ # !/usr/bin/python3.8
+ # -*- coding: utf-8 -*-
+ # @Author : youshu.Ji
+ import datetime
+ import json
+ import logging
+
+ from . import DB_CONFIG_FILE
+ from .io.file import read_yaml
+ from .utils.package import *
+
+ # import aioredis
+ # import happybase
+ # import pandas as pd
+ # import pymysql
+ # from elasticsearch import Elasticsearch, helpers
+ # from kafka import KafkaProducer, KafkaConsumer
+ # from pymongo import MongoClient
+
+ logger = logging.getLogger(__name__)
+
+ global_db_config = read_yaml(DB_CONFIG_FILE)
+
+
+ class Neo4jOps(object):
+     # timeout, in seconds, for the neo4j connection
+     # py2neo internally retries 3 times...
+     NEO4J_TIMEOUT = 0.3
+     pass
+
+ class SqliteOps(object):
+     import sqlite3
+     database_path = r'xx.db'
+     conn = sqlite3.connect(database_path)
+     c = conn.cursor()
+     sql = "select name from sqlite_master where type='table' order by name"
+     c.execute(sql)
+     print(c.fetchall())
+     sql = "select * from typecho_contents"
+     c.execute(sql)
+     res = c.fetchall()
+     print(res[3])
+
+     conn.commit()
+     conn.close()
+
+ class MysqlOps(object):
+     import pandas as pd
+     def __init__(self, config=global_db_config["mysql"]):
+         self.db = pymysql.connect(host=config["host"],
+                                   port=config["port"],
+                                   user=config["user"],
+                                   password=config["password"],
+                                   database=config["database"])
+
+     def query(self, sql):
+         df = pd.read_sql(sql, self.db)
+         return df
+
+
+ class EsOps(object):
+     from elasticsearch import Elasticsearch, helpers
+     def __init__(self, config=global_db_config["es"]):
+         self.es = Elasticsearch(
+             host=config["host"], timeout=config["timeout"])
+
+     def search_roll(self, index, body):
+         all_data = []
+         data = self.es.search(index=index, body=body, scroll="5m")
+         all_data.extend(data["hits"]["hits"])
+         scroll_id = data["_scroll_id"]
+         while data["hits"]["hits"]:
+             print(scroll_id[:5])
+             data = self.es.scroll(scroll_id=scroll_id, scroll="5m")
+             scroll_id = data["_scroll_id"]
+             all_data.extend(data["hits"]["hits"])
+         all_data = [i["_source"] for i in all_data]
+         return all_data
+
+     def search_roll_iter(self, index, body):
+         data = self.es.search(index=index, body=body, scroll="5m")
+         scroll_id = data["_scroll_id"]
+         while data["hits"]["hits"]:
+             yield data["hits"]["hits"]
+             data = self.es.scroll(scroll_id=scroll_id, scroll="5m")
+             scroll_id = data["_scroll_id"]
+
+     def search(self, index, body):
+         return self.es.search(index=index, body=body)
+
+     def delete(self, index, body):
+         self.es.delete_by_query(index=index, body=body)
+
+     def save(self, data):
+         # the index is already included in data
+         helpers.bulk(self.es, data)
+
+     def delete_data_by_query(self, index, _project_id, _source_ids):
+         _query = {
+             "query": {
+                 "bool": {
+                     "must": [
+                         {"terms": {"source_id": _source_ids}},
+                         {"term": {"project_id": _project_id}},
+                     ]
+                 }
+             }
+         }
+         _res = self.es.delete_by_query(index=index, body=_query)
+         print(f"delete_data_by_query: {_res}")
+
+     def batch_re_save(self, index, _data, _project_id, _source_ids):
+         self.delete_data_by_query(_project_id, _source_ids)
+         _action = [{"_index": index, "_source": i} for i in _data]
+         _res = helpers.bulk(self.es, _action)
+         print(f"批量保存数据: {_res}")
+
+
+ class MongoOps(object):
+     from pymongo import MongoClient
+     def __init__(self, config=global_db_config["mongo"]):
+         mongo_client = MongoClient(config["uri"])
+         db = mongo_client[config["db"]]
+         self.collection = db[config["col"]]
+
+     def fetch_all(self):
+         """
+         Fetch all records.
+         :return:
+         """
+         ans = []
+         print('提取所有数据.')
+         for record in self.collection.find({}):
+             record['_id'] = str(record['_id'])
+             ans.append(record)
+         return ans
+
+     def load_from_mongo(self, special_value):
+         """
+         Fetch from MongoDB all documents whose special_value field equals special_value.
+         :param
+         :return:
+         """
+         record = self.collection.find({"{}".format(special_value): special_value})
+         record = list(record)
+         if not record:
+             return None
+         else:
+             record = sorted(record, key=lambda x: len(x.get("another_value", [])))[0]
+             return record
+
+     def delete_all(self):
+         query = {}
+         deleted = self.collection.delete_many(query)
+         return deleted
+
+     def delete_by_time(self, time):
+         query = {"name": {"$regex": "^F"}}
+         deleted = self.collection.delete_many(query)
+
+     def fetch_by_time(self, year=2022, month=7, day=7, hour=7, minute=7, second=7):
+         query = {"query_time": {"$gte": datetime.datetime(year, month, day, hour, minute, second)}}
+         sort_sql = [("query_time", -1)]
+         ans = []
+         print('提取所有数据.')
+         for record in self.collection.find(query).sort(sort_sql):
+             record['_id'] = str(record['_id'])
+             ans.append(record)
+         return ans
+
+     def save_to_mongo(self, special_value, each_item):
+         """
+         Save data into mongo.
+         :param special_value:
+         :param each_item:
+         :return:
+         """
+         query = self.collection.find({"{}".format(special_value): special_value})
+         if list(query):
+             self.collection.update_one({"{}".format(special_value): special_value},
+                                        {"$push": {'each_item': each_item}})
+         else:
+             insert_item = {
+                 "special_value": special_value,
+                 "each_item": [each_item]
+             }
+             self.collection.insert_one(insert_item)
+         print("update success")
+
+     def insert_one(self, data):
+         self.collection.insert_one(data)
+
+     def update_to_mongo(self, condition_term, condition_value, new_value):
+         """
+         Look up documents by the given field and value, then update them in mongo.
+         Similar to an update.
+         :param condition_term: condition field name
+         :param condition_value: condition field value
+         :param new_value: the new value; preferably a dict (unclear whether non-dict values work)
+         :return:
+         """
+         query = self.collection.find({condition_term: condition_value})
+         if list(query):
+             self.collection.update_one({condition_term: condition_value},
+                                        {"$push": new_value})
+         else:
+             insert_item = {
+                 condition_term: condition_value,
+                 "processed_data": new_value
+             }
+             self.collection.insert_one(insert_item)
+         print("update success")
+
+
+ class RedisOps(object):
+     def __init__(self, config=global_db_config["redis"]):
+         redis_max_connections = 1024
+         REDIS_GET_TIMEOUT = 0.1
+         self.redis = aioredis.from_url(config["uri"], max_connections=redis_max_connections)
+
+
+ class HBaseOps(object):
+     import happybase
+     """
+     demo
+     key = 'test'
+     db = HBaseHelper(host=hbase_host)
+     data = db.query_single_line(table='table', row_key=key)
+     print(data)
+     """
+
+     def __init__(self, config=global_db_config["hbase"]):
+         self.host = config["DEFAULT_HOST"]
+         self.port = config["DEFAULT_PORT"]
+         self.compat = config["DEFAULT_COMPAT"]
+         self.table_prefix = None  # namespace
+         self.transport = config["DEFAULT_TRANSPORT"]
+         self.protocol = config["DEFAULT_PROTOCOL"]
+         self.conn = self.connect()
+
+     def connect(self):
+         conn = happybase.Connection(host=self.host, port=self.port, timeout=None, autoconnect=True,
+                                     table_prefix=self.table_prefix, compat=self.compat,
+                                     transport=self.transport, protocol=self.protocol)
+         return conn
+
+     def create_hb_table(self, table_name, **families):
+         self.conn.create_table(table_name, families)
+
+     def single_put(self, table_name, row_key, column, data):
+         hb = happybase.Table(table_name, self.conn)
+         hb.put(row_key,
+                data={'{column}:{k}'.format(column=column, k=k): str(v).encode("utf-8") for k, v in data.items()})
+
+     def batch_put(self, table, row_key_name, column, datas, batch_size=1):
+         hb = happybase.Table(table, self.conn)
+         datas_new = [datas[i:i + batch_size] for i in range(0, len(datas), batch_size)]
+         for x in datas_new:
+             with hb.batch(batch_size=batch_size) as batch:
+                 for da in x:
+                     da_nw = {'{column}:{k}'.format(column=column, k=k): v for k, v in da.items()}
+                     row_key = da_nw.pop('{column}:{k}'.format(column=column, k=row_key_name))
+                     batch.put(row_key, da_nw)
+         return batch
+
+     def single_put_self(self, table_name, row_keys, datas):
+         hb = happybase.Table(table_name, self.conn)
+         for row_key, (_, val) in zip(row_keys, datas.items()):
+             hb.put(row_key, {'maybe_table_name:maybe_column_name': "%s" % val[0],
+                              'maybe_table_name:maybe_column_name2': "%s" % val[1]})
+
+     def scan_table(self, table, row_start=None, row_stop=None, include_timestamp=False, limit=None, timestamps=None,
+                    filter=None):
+         hb = happybase.Table(table, self.conn)
+         scan = hb.scan(row_start=row_start, row_stop=row_stop, limit=limit, timestamp=timestamps, filter=filter)
+         hb_dict = dict(scan)
+         if hb_dict:
+             return {str(k1).decode('utf-8'): {str(k2).decode('utf-8'): str(v2).decode('utf-8') for k2, v2 in v1.items()}
+                     for k1, v1 in
+                     hb_dict.items()}
+         else:
+             return {}
+
+     def query_single_line(self, table, row_key):
+         conn = self.connect()
+         hb = happybase.Table(table, conn)
+         hb_dict = hb.row(row_key)
+         if hb_dict:
+             return {k.decode('utf-8'): v.decode('utf-8') for k, v in hb_dict.items()}
+         else:
+             return {}
+
+     def query_multi_lines(self, table, row_keys):
+         hb = happybase.Table(table, self.conn)
+         hb_dict = dict(hb.rows(row_keys))
+         if hb_dict:
+             return {k1.decode('utf-8'): {k2.decode('utf-8'): v2.decode('utf-8') for k2, v2 in v1.items()} for k1, v1 in
+                     hb_dict.items()}
+         else:
+             return {}
+
+     def single_delete(self, table, row_key):
+         hb = happybase.Table(table, self.conn)
+         hb.delete(row_key)
+
+     def test_scan(self, table):
+         hb = happybase.Table(table, self.conn)
+         filter = "SingleColumnValueFilter ('maybe_column_name', 'lang', =, 'regexstring:[regex_string]')"
+         scan = hb.scan(limit=1000, filter=filter)
+
+         hb_dict = dict(scan)
+         if hb_dict:
+             return {str(k1).decode('utf-8'): {str(k2).decode('utf-8'): str(v2).decode('utf-8') for k2, v2 in v1.items()}
+                     for k1, v1 in
+                     hb_dict.items()}
+         else:
+             return {}
+
+     def close(self):
+         self.conn.close()
+
+
+ class KafkaConfig():
+     pass
+
+
+ class KafkaOps(object):
+     def __init__(self, config=global_db_config["kafka"]):
+         self.bootstrap_server = config["bootstrap_server"]
+         self.topic = config["topic"]
+         # the default timeout is 30s; changed to 60s
+         self.producer = KafkaProducer(value_serializer=lambda v: json.dumps(v).encode('utf-8'),
+                                       bootstrap_servers=self.bootstrap_server,
+                                       acks='all',
+                                       request_timeout_ms=60000)
+
+     def send_data_to_kafka(self, data):
+         try:
+             self.producer.send(self.topic, data)
+             logger.info(f"data send successful! ---- {data}")
+         except Exception as e:
+             logger.exception(f'kafka occur error ---- {e}')
+
+     def consumer_msg(self):
+         consumer = KafkaConsumer(self.topic, group_id='test-group_id', bootstrap_servers=self.bootstrap_server)
+         for msg in consumer:
+             recv = "%s:%d:%d: key=%s value=%s" % (msg.topic, msg.partition, msg.offset, msg.key, msg.value)
+             print(recv)
+
+
+
+
+ class MilvusOps(object):
+     def __init__(self, config=global_db_config.milvus):
+         from pymilvus import connections, Collection
+
+         connections.connect("default", host=config.host, port=config.port)
+         self.collection = Collection(config.collection)
+         self.collection.load()
+
+     def get_similarity(self, embedding):
+         search_params = {
+             "metric_type": "L2",
+             "params": {"nprobe": 1},
+         }
+         # # %%
+         logger.debug(embedding)
+         result = self.collection.search(
+             [list(embedding)],
+             "vec",
+             search_params,
+             limit=3,
+             output_fields=["pk", "entity_name", "standard_entity_name"],
+         )
+         hits = result[0]
+         entities = []
+         for hit in hits:
+             entities.append(
+                 {
+                     "name": hit.entity.get("entity_name"),
+                     "standard_name": hit.entity.get("standard_entity_name"),
+                 }
+             )
+         return entities
+
+     # def insert(self, collection, entities):
+     #     collection.insert(entities)
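As a quick illustration of the newly added EsOps.search_roll_iter: unlike search_roll, it pages through an Elasticsearch scroll lazily, yielding one batch of hits per round instead of accumulating everything in memory. A minimal usage sketch follows, assuming an Elasticsearch cluster reachable through the "es" section of the bundled default_db_config.yml; the index name and query body here are hypothetical.

# Sketch only: requires a running Elasticsearch cluster configured in the
# package's db config yml; "my_index" and the query body are made up.
from nlpertools.data_client import EsOps

es = EsOps()  # reads the "es" section of the default db config
query = {"query": {"match_all": {}}, "size": 1000}

# Each iteration yields one scroll page (a list of hit dicts).
for page in es.search_roll_iter(index="my_index", body=query):
    for hit in page:
        print(hit["_source"])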