mnemosynecore 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mnemosynecore/__init__.py +28 -0
- mnemosynecore/config.py +0 -0
- mnemosynecore/db/vertica.py +316 -0
- mnemosynecore/mattermost.py +141 -0
- mnemosynecore/superset.py +25 -0
- mnemosynecore/vault.py +62 -0
- mnemosynecore/warnings.py +8 -0
- mnemosynecore-0.1.7.dist-info/METADATA +278 -0
- mnemosynecore-0.1.7.dist-info/RECORD +11 -0
- mnemosynecore-0.1.7.dist-info/WHEEL +5 -0
- mnemosynecore-0.1.7.dist-info/top_level.txt +1 -0
mnemosynecore/__init__.py
ADDED
@@ -0,0 +1,28 @@
+from mnemosynecore.db.vertica import (
+    vertica_conn,
+    vertica_sql,
+    vertica_select,
+    load_sql_tasks_from_dir,
+    read_sql_file,
+    vertica_dedupe,
+    vertica_upsert
+)
+from .mattermost import send_message, send_message_test
+from .superset import superset_request
+from .vault import get_secret, get_secret_test
+
+
+__all__ = [
+    "vertica_conn",
+    "load_sql_tasks_from_dir",
+    "read_sql_file",
+    "vertica_dedupe",
+    "vertica_upsert",
+    "vertica_sql",
+    "vertica_select",
+    "send_message",
+    "send_message_test",
+    "superset_request",
+    "get_secret",
+    "get_secret_test",
+]
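
Everything re-exported above is importable straight from the package root. A minimal sketch (the connection ID, channel ID and message below are placeholders, not values shipped with the package):

```python
import mnemosynecore as mc

# Query Vertica through the root-level re-exports.
df = mc.vertica_select(conn_id="VERTICA_CONN_ID", sql="SELECT 1 AS ok")

# Notify a Mattermost channel about the result.
mc.send_message(
    channel_id="some-channel-id",
    bot_id="MATTERMOST_BOT_ID",
    text=f"Rows fetched: {len(df)}",
)
```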
mnemosynecore/config.py
ADDED
File without changes
mnemosynecore/db/vertica.py
ADDED
@@ -0,0 +1,316 @@
+import json
+import logging
+from os import listdir, path
+from typing import Any, Iterable, List, Optional, Union
+
+import pandas as pd
+import vertica_python
+from sqlalchemy import create_engine
+from mnemosynecore.vault import get_secret
+
+# Conditional Airflow import
+HAS_AIRFLOW = False
+try:
+    from airflow.operators.python import get_current_context
+    from airflow import DAG
+    from airflow.utils.task_group import TaskGroup
+    from airflow.operators.dummy import DummyOperator
+    HAS_AIRFLOW = True
+except ImportError:
+    pass
+
+
+def load_sql_tasks_from_dir(dir_sql: str, vertica_conn_id: str):
+    """
+    Builds a dict of VerticaOperator tasks from every SQL file in a folder,
+    automatically using the current DAG from the Airflow context.
+    """
+    if not HAS_AIRFLOW:
+        raise ImportError("Airflow must be installed to use load_sql_tasks_from_dir")
+
+    from airflow.providers.vertica.operators.vertica import VerticaOperator
+
+    context = get_current_context()
+    dag = context['dag']  # the current DAG
+
+    tasks = {}
+    files_list = listdir(dir_sql)
+    for filename in files_list:
+        if not filename.endswith(".sql"):
+            continue
+        path_file = path.join(dir_sql, filename)
+        sql = read_sql_file(path_file)
+        if sql:
+            sql_statements = sql.split(';')
+            task_name = 'task_' + filename.replace('.sql', '_vertica')
+            tasks[task_name] = VerticaOperator(
+                sql=sql_statements,
+                task_id=task_name,
+                vertica_conn_id=vertica_conn_id,
+                dag=dag,
+            )
+    return tasks
+
+
+def read_sql_file(file_path: str):
+    if not path.exists(file_path):
+        print('Error: no file ' + file_path)
+        return None
+    with open(file_path, 'r', encoding='utf-8') as f:
+        return f.read()
+
+
+def vertica_dedupe(
+    table_name: str,
+    unique_keys: Union[str, List[str]],
+    conn: Optional[vertica_python.Connection] = None,
+    conn_id: Optional[str] = None,
+    date_col: Optional[str] = None,
+    keep: str = "last",
+    commit: bool = True
+):
+    """
+    Removes duplicates from a Vertica table, keeping only unique rows.
+
+    Args:
+        table_name: fully qualified table name (schema.table)
+        unique_keys: column or list of columns that define uniqueness
+        conn: an existing Vertica connection
+        conn_id: connection/secret ID (used when conn is not given)
+        date_col: date column used to decide which duplicate is the latest
+        keep: 'last' or 'first' — which record survives among duplicates
+        commit: commit after the delete
+    """
+
+    if isinstance(unique_keys, str):
+        unique_keys = [unique_keys]
+
+    close_conn = False
+    if conn is None:
+        if not conn_id:
+            raise ValueError("Either conn or conn_id must be provided")
+        conn = vertica_conn(conn_id)
+        close_conn = True
+
+    # Decide which row survives in each group of duplicates
+    order_clause = ""
+    if date_col:
+        direction = "DESC" if keep == "last" else "ASC"
+        order_clause = f"ORDER BY {date_col} {direction}"
+    unique_cols = ", ".join(unique_keys)
+    # Join on every unique key so multi-column keys are handled correctly
+    join_condition = " AND ".join(f"t.{k} = x.{k}" for k in unique_keys)
+
+    dedupe_sql = f"""
+        DELETE FROM {table_name} t
+        USING (
+            SELECT {unique_cols}, ROW_NUMBER() OVER (PARTITION BY {unique_cols} {order_clause}) AS rn
+            FROM {table_name}
+        ) x
+        WHERE {join_condition}
+          AND x.rn > 1
+    """
+
+    vertica_sql(conn=conn, sql=dedupe_sql, commit=commit)
+
+    if close_conn:
+        conn.close()
+
+    print(f"Deduplication of {table_name} finished on keys: {unique_keys}")
+
+
+def get_connection_as_json(conn_name):
+    from airflow.hooks.base import BaseHook
+
+    conn = BaseHook.get_connection(conn_name)
+    ci = {'host': conn.host, 'password': conn.password, 'login': conn.login, 'port': conn.port, 'schema': conn.schema,
+          'extra': conn.extra}
+    return json.dumps(ci)
+
+
+def vertica_upsert(
+    df: pd.DataFrame,
+    table_name: str,
+    unique_keys: Union[str, List[str]],
+    conn=None,
+    conn_id: Optional[str] = None,
+    date_col: Optional[str] = None,
+    days_back: Optional[int] = None,
+    commit: bool = True
+):
+    """
+    Efficiently refreshes data in Vertica:
+    - inserts new rows,
+    - updates existing ones,
+    - removes duplicates.
+
+    Args:
+        df: DataFrame with the new data
+        table_name: fully qualified table name (schema.table)
+        unique_keys: column or list of columns that define uniqueness
+        conn: Vertica connection (one is created from conn_id if None)
+        conn_id: connection/secret ID (used when conn is not given)
+        date_col: if set, rows from the last `days_back` days are reloaded
+        days_back: number of days to reload (used together with date_col)
+        commit: commit after the insert/merge
+    """
+    if df.empty:
+        print("No data to update.")
+        return
+
+    if isinstance(unique_keys, str):
+        unique_keys = [unique_keys]
+
+    # Create a connection if one was not passed in
+    close_conn = False
+    if conn is None:
+        if not conn_id:
+            raise ValueError("Either conn or conn_id must be provided")
+        conn = vertica_conn(conn_id)
+        close_conn = True
+
+    # Create a temporary staging table
+    temp_table = f"{table_name}_tmp"
+    create_temp_sql = f"""
+        DROP TABLE IF EXISTS {temp_table};
+        CREATE LOCAL TEMP TABLE {temp_table} AS
+        SELECT * FROM {table_name} WHERE 1=0;
+    """
+    vertica_sql(conn=conn, sql=create_temp_sql, commit=False)
+
+    # Load the DataFrame into the staging table
+    rows = [tuple(x) for x in df.to_numpy()]
+    cols = ", ".join(df.columns)
+    placeholders = ", ".join(["%s"] * len(df.columns))
+
+    insert_sql = f"INSERT INTO {temp_table} ({cols}) VALUES ({placeholders})"
+    vertica_sql(conn=conn, sql=insert_sql, params=rows, many=True, commit=False)
+
+    # Delete recent rows by date_col, if requested
+    if date_col and days_back:
+        delete_sql = f"""
+            DELETE FROM {table_name}
+            WHERE {date_col} >= CURRENT_DATE - INTERVAL '{days_back} day';
+        """
+        vertica_sql(conn=conn, sql=delete_sql, commit=False)
+
+    # MERGE into the target table on the unique keys
+    merge_conditions = " AND ".join([f"t.{k} = s.{k}" for k in unique_keys])
+    update_assignments = ", ".join([f"{c} = s.{c}" for c in df.columns if c not in unique_keys])
+    insert_values = ", ".join([f"s.{c}" for c in df.columns])
+
+    merge_sql = f"""
+        MERGE INTO {table_name} t
+        USING {temp_table} s
+        ON {merge_conditions}
+        WHEN MATCHED THEN UPDATE SET {update_assignments}
+        WHEN NOT MATCHED THEN INSERT ({cols}) VALUES ({insert_values});
+    """
+    vertica_sql(conn=conn, sql=merge_sql, commit=False)
+
+    # Final commit
+    if commit:
+        conn.commit()
+
+    if close_conn:
+        conn.close()
+
+    print(f"Upsert into {table_name} finished; {len(df)} rows processed.")
+
+
+def vertica_conn(conn_id: str) -> vertica_python.Connection:
+    """
+    Creates a Vertica connection via a Vault / Airflow connection secret.
+
+    :param conn_id: secret ID (Vault / Airflow)
+    """
+    cfg = get_secret(conn_id)
+
+    return vertica_python.connect(
+        host=cfg["host"],
+        port=int(cfg["port"]),
+        user=cfg["login"],
+        password=cfg["password"],
+        database=cfg.get("schema") or cfg.get("database"),
+        autocommit=False,
+    )
+
+
+def get_vertica_engine(conn_id: str):
+    """
+    Creates a SQLAlchemy engine for Vertica from an Airflow connection.
+    """
+    cfg = json.loads(get_connection_as_json(conn_id))
+
+    vertica_url = (
+        f"vertica+vertica_python://{cfg['login']}:{cfg['password']}@"
+        f"{cfg['host']}:{cfg['port']}/{cfg['schema']}"
+    )
+
+    return create_engine(vertica_url, pool_pre_ping=True)
+
+
+def vertica_sql(
+    *,
+    conn_id: Optional[str] = None,
+    conn: Optional[vertica_python.Connection] = None,
+    sql: str,
+    params: Optional[Iterable[Any]] = None,
+    many: bool = False,
+    commit: bool = True,
+) -> None:
+    """
+    Executes a SQL statement in Vertica.
+
+    Either conn_id or an existing connection can be passed.
+    """
+
+    if not conn and not conn_id:
+        raise ValueError("Either conn or conn_id must be provided")
+
+    close_conn = False
+    if not conn:
+        conn = vertica_conn(conn_id)
+        close_conn = True
+
+    try:
+        with conn.cursor() as cur:
+            if many:
+                cur.executemany(sql, params)
+            else:
+                cur.execute(sql, params)
+
+        if commit:
+            conn.commit()
+
+    except Exception:
+        conn.rollback()
+        logging.exception("Error while executing SQL in Vertica")
+        raise
+
+    finally:
+        if close_conn:
+            conn.close()
+
+
+def vertica_select(
+    *,
+    conn_id: Optional[str] = None,
+    conn: Optional[vertica_python.Connection] = None,
+    sql: str,
+    params: Optional[Iterable[Any]] = None,
+) -> pd.DataFrame:
+    """
+    Executes a SELECT query and returns a pandas DataFrame.
+    """
+
+    if not conn and not conn_id:
+        raise ValueError("Either conn or conn_id must be provided")
+
+    close_conn = False
+    if not conn:
+        conn = vertica_conn(conn_id)
+        close_conn = True
+
+    try:
+        with conn.cursor() as cur:
+            cur.execute(sql, params)
+            columns = [desc[0] for desc in cur.description]
+            rows = cur.fetchall()
+
+        return pd.DataFrame(rows, columns=columns)
+
+    finally:
+        if close_conn:
+            conn.close()
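
Since every helper above accepts either a `conn_id` or an open connection, several statements can share one connection and one transaction. A minimal sketch, with a placeholder secret ID and illustrative table names:

```python
from mnemosynecore.db.vertica import vertica_conn, vertica_select, vertica_sql

conn = vertica_conn("VERTICA_CONN_ID")  # placeholder secret ID
try:
    # Stage without committing, then commit both statements together.
    vertica_sql(conn=conn, sql="INSERT INTO stage.events SELECT * FROM raw.events", commit=False)
    vertica_sql(conn=conn, sql="DELETE FROM raw.events", commit=True)
    df = vertica_select(conn=conn, sql="SELECT COUNT(*) AS n FROM stage.events")
    print(df)
finally:
    conn.close()
```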
mnemosynecore/mattermost.py
ADDED
@@ -0,0 +1,141 @@
+import logging
+import json
+from mattermostdriver import Driver
+from .vault import get_secret, get_secret_test
+
+
+def get_mattermost_driver_test(bot_id: str, dir_path: str = "./secrets") -> Driver:
+    """
+    Returns a Mattermost Driver for tests.
+    Uses a local JSON file instead of Vault.
+    """
+    cfg = get_secret_test(bot_id, dir_path)
+
+    driver = Driver({
+        "url": cfg["host"],
+        "token": cfg["password"],
+        "scheme": cfg.get("scheme", "https"),
+        "port": int(cfg.get("port", 443)),
+        "basepath": cfg.get("basepath", "/api/v4"),
+    })
+
+    driver.login()
+    return driver
+
+
+def send_message_test(
+    *,
+    channel_id: str,
+    bot_id: str,
+    text: str,
+    dir_path: str = "./secrets",
+    silent: bool = False,
+) -> None:
+    """
+    Sends a Mattermost message on behalf of a bot, for tests.
+    Uses a local JSON file instead of Vault.
+    """
+    driver = get_mattermost_driver_test(bot_id, dir_path)
+
+    try:
+        driver.posts.create_post(
+            options={
+                "channel_id": channel_id,
+                "message": text.strip(),
+            }
+        )
+        if not silent:
+            print(f"[TEST] Message sent to Mattermost: {channel_id}")
+    except Exception as exc:
+        print(f"[TEST] Failed to send message: {exc}")
+        raise
+
+
+def get_mattermost_driver(bot_id: str) -> Driver:
+    """
+    Creates and logs in a Mattermost Driver via Vault.
+    """
+    # Check whether bot_id is itself a JSON string
+    if bot_id.startswith('{'):
+        try:
+            cfg = json.loads(bot_id)
+        except json.JSONDecodeError:
+            raise ValueError("Invalid JSON for the Mattermost configuration")
+    else:
+        # Fall back to the old way: fetch from Vault
+        cfg = get_secret(bot_id)
+
+    driver = Driver({
+        "url": cfg["host"],
+        "token": cfg["password"],
+        "scheme": cfg.get("scheme", "https"),
+        "port": int(cfg.get("port", 443)),
+        "basepath": cfg.get("basepath", "/api/v4"),
+    })
+
+    driver.login()
+    return driver
+
+
+def get_mattermost_conn(conn_id: str) -> dict:
+    """
+    Reads Mattermost connection parameters from an Airflow connection.
+    """
+    cfg_json = json.loads(get_connection_as_json(conn_id))
+
+    return {
+        "host": cfg_json.get("host"),
+        "password": cfg_json.get("password"),
+        "schema": cfg_json.get("schema", "https"),
+        "port": int(cfg_json.get("port") or 443),
+        "basepath": json.loads(cfg_json.get("extra", "{}")).get(
+            "basepath", "/api/v4"
+        ),
+    }
+
+
+def get_connection_as_json(conn_name):
+    from airflow.hooks.base import BaseHook
+
+    conn = BaseHook.get_connection(conn_name)
+    ci = {'host': conn.host, 'password': conn.password, 'login': conn.login, 'port': conn.port, 'schema': conn.schema,
+          'extra': conn.extra}
+    return json.dumps(ci)
+
+
+def send_message(
+    *,
+    channel_id: str,
+    bot_id: str,
+    text: str,
+    silent: bool = False,
+) -> None:
+    """
+    Sends a Mattermost message on behalf of a bot.
+
+    :param channel_id: channel ID
+    :param bot_id: secret ID in Vault, or a JSON configuration string
+    :param text: Markdown message text
+    :param silent: do not log successful sends
+    """
+    driver = get_mattermost_driver(bot_id)
+
+    try:
+        driver.posts.create_post(
+            options={
+                "channel_id": channel_id,
+                "message": text.strip(),
+            }
+        )
+        if not silent:
+            logging.info("Message sent to Mattermost: %s", channel_id)
+
+    except Exception:
+        logging.exception(
+            "Failed to send Mattermost message (channel_id=%s)",
+            channel_id
+        )
+        raise
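
Note that `get_mattermost_driver` accepts an inline JSON configuration instead of a Vault secret ID: any `bot_id` starting with `{` is parsed as JSON. A sketch with placeholder values:

```python
import json
from mnemosynecore.mattermost import send_message

# Inline JSON config instead of a Vault secret ID; all values are placeholders.
bot_cfg = json.dumps({
    "host": "mattermost.example.com",
    "password": "bot-token",
    "scheme": "https",
    "port": 443,
})

send_message(channel_id="some-channel-id", bot_id=bot_cfg, text="ping")
```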
mnemosynecore/superset.py
ADDED
@@ -0,0 +1,25 @@
+import requests
+from .vault import get_secret
+
+
+def superset_request(
+    *,
+    endpoint: str,
+    method: str = "GET",
+    payload: dict | None = None,
+    vault_conn_id: str
+):
+    """
+    Performs an authenticated request to the Superset API, using the host
+    and bearer token stored in the secret behind vault_conn_id.
+    """
+    cfg = get_secret(vault_conn_id)
+    base_url = cfg["host"]
+
+    headers = {
+        "Authorization": f"Bearer {cfg['password']}",
+        "Content-Type": "application/json",
+    }
+
+    url = f"{base_url.rstrip('/')}/{endpoint.lstrip('/')}"
+
+    resp = requests.request(method, url, json=payload, headers=headers)
+    resp.raise_for_status()
+    return resp.json()
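
Because `superset_request` calls `raise_for_status()`, HTTP errors surface as `requests.HTTPError`; callers that want to degrade gracefully can catch it. A sketch, assuming a placeholder secret ID:

```python
import requests
from mnemosynecore.superset import superset_request

try:
    charts = superset_request(
        endpoint="/api/v1/chart/",
        vault_conn_id="SUPERSET_CONN_ID",  # placeholder secret ID
    )
except requests.HTTPError as exc:
    # raise_for_status() turns 4xx/5xx responses into HTTPError.
    print(f"Superset API error: {exc}")
```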
mnemosynecore/vault.py
ADDED
@@ -0,0 +1,62 @@
+import json
+import os
+from typing import Dict
+
+
+def get_connection_as_json(conn_id: str) -> str:
+    # First try to read the secret from environment variables
+    secret = os.environ.get(conn_id)
+
+    if not secret:
+        # If it is not in the environment, try Vault
+        try:
+            # Import the vault client, if available
+            from mnemosyne.vault.client import VaultClient
+
+            # Create a Vault client
+            vault_client = VaultClient()
+
+            # Fetch the secret from Vault
+            secret = vault_client.get_secret(conn_id)
+
+            if not secret:
+                raise ValueError(f"Secret {conn_id} was found neither in the environment nor in Vault")
+
+        except ImportError:
+            # The Vault client is unavailable
+            raise ValueError(f"Secret {conn_id} is not in the environment and Vault could not be reached")
+        except Exception as e:
+            raise ValueError(f"Failed to fetch secret {conn_id} from Vault: {str(e)}")
+
+    return secret
+
+
+def get_secret(conn_id: str) -> Dict:
+    raw = get_connection_as_json(conn_id)
+    try:
+        return json.loads(raw)
+    except json.JSONDecodeError as e:
+        raise ValueError(f"Secret {conn_id} is not valid JSON: {e}")
+
+
+def get_connection_as_json_test(conn_id: str, dir_path: str = "./secrets") -> str:
+    """
+    Returns a connection config as JSON, for tests.
+    Instead of Vault it reads a local JSON file from dir_path.
+    The file must be named <conn_id>.json
+    """
+    file_path = os.path.join(dir_path, f"{conn_id}.json")
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"Test secret {file_path} was not found")
+    with open(file_path, "r", encoding="utf-8") as f:
+        return f.read()
+
+
+def get_secret_test(conn_id: str, dir_path: str = "./secrets") -> Dict:
+    """
+    Returns a configuration dict for tests.
+    Reads a local JSON file instead of Vault.
+    """
+    raw = get_connection_as_json_test(conn_id, dir_path)
+    try:
+        return json.loads(raw)
+    except json.JSONDecodeError as e:
+        raise ValueError(f"Test secret {conn_id} is not valid JSON: {e}")
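
Since `get_connection_as_json` checks the process environment before Vault, a secret can be supplied with no Vault at all, and `get_secret_test` reads `./secrets/<conn_id>.json` for local runs. A minimal sketch with placeholder values (the test variant assumes the JSON file exists):

```python
import json
import os
from mnemosynecore.vault import get_secret, get_secret_test

# Option 1: environment variable named after the conn_id.
os.environ["VERTICA_CONN_ID"] = json.dumps({
    "host": "vertica.example.com",
    "port": "5433",
    "login": "user",
    "password": "secret",
    "schema": "analytics",
})
cfg = get_secret("VERTICA_CONN_ID")

# Option 2: a local ./secrets/VERTICA_CONN_ID.json file, for tests.
cfg_test = get_secret_test("VERTICA_CONN_ID", dir_path="./secrets")
```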
mnemosynecore-0.1.7.dist-info/METADATA
ADDED
@@ -0,0 +1,278 @@
+Metadata-Version: 2.4
+Name: mnemosynecore
+Version: 0.1.7
+Summary: Internal analytics toolkit for data pipelines
+Author-email: rostilin <rostilin@ozon.ru>
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+Requires-Dist: vertica-python>=1.3
+Requires-Dist: pandas>=2.0
+Requires-Dist: mattermostdriver>=2.0
+Requires-Dist: requests>=2.30
+Requires-Dist: python-dotenv>=1.2.1
+Requires-Dist: typing-extensions>=4.0.0
+Provides-Extra: airflow
+Requires-Dist: apache-airflow<3.1,>=3.0; extra == "airflow"
+Requires-Dist: apache-airflow-providers-postgres>=5.0; extra == "airflow"
+Requires-Dist: apache-airflow-providers-vertica>=2.0; extra == "airflow"
+Requires-Dist: sqlalchemy<2.0; extra == "airflow"
+
+# mnemosynecore Documentation
+
+## Overview
+The mnemosynecore library provides a set of tools for working with the Vertica
+database, Airflow, Mattermost and other analytics systems. It is intended to
+automate data workflows and to integrate these services with each other.
+
+## Installation
+```bash
+pip install mnemosynecore
+
+# For Airflow:
+pip install mnemosynecore[airflow]
+```
+
+## Core modules
+
+### 1. Working with Vertica (mnemosynecore.db.vertica)
+
+**vertica_conn(conn_id)** — create a connection to Vertica
+
+```python
+from mnemosynecore.db.vertica import vertica_conn
+
+# Connect to Vertica
+conn = vertica_conn("VERTICA_CONN_ID")
+# Use the connection
+result = vertica_sql(conn=conn, sql="SELECT * FROM table")
+conn.close()
+```
+
+**vertica_sql(...)** — execute SQL statements
+
+```python
+from mnemosynecore.db.vertica import vertica_sql
+
+# Run a statement by connection ID
+vertica_sql(
+    conn_id="VERTICA_CONN_ID",
+    sql="INSERT INTO table VALUES (%s, %s)",
+    params=[1, "test"]
+)
+
+# Run a statement on an existing connection
+vertica_sql(conn=conn, sql="UPDATE table SET col = %s WHERE id = %s", params=[100, 1])
+```
+
+**vertica_select(...)** — run SELECT queries
+
+```python
+from mnemosynecore.db.vertica import vertica_select
+
+# Fetch data into a DataFrame
+df = vertica_select(
+    conn_id="VERTICA_CONN_ID",
+    sql="SELECT * FROM table WHERE id = %s",
+    params=[123]
+)
+print(df.head())
+```
+
+**vertica_dedupe(table_name, unique_keys, ...)** — remove duplicates
+
+```python
+from mnemosynecore.db.vertica import vertica_dedupe
+
+# Dedupe on a single key
+vertica_dedupe(
+    table_name="schema.table",
+    unique_keys="id",
+    conn_id="VERTICA_CONN_ID"
+)
+
+# Dedupe on several keys, keeping the latest row by date
+vertica_dedupe(
+    table_name="schema.table",
+    unique_keys=["id", "date"],
+    conn_id="VERTICA_CONN_ID",
+    date_col="created_at",
+    keep="last"
+)
+```
+
+**vertica_upsert(df, table_name, unique_keys, ...)** — upsert operations
+
+```python
+from mnemosynecore.db.vertica import vertica_upsert
+import pandas as pd
+
+# Prepare the data
+df = pd.DataFrame({
+    'id': [1, 2, 3],
+    'name': ['Alice', 'Bob', 'Charlie'],
+    'value': [100, 200, 300]
+})
+
+# Upsert the data
+vertica_upsert(
+    df=df,
+    table_name="schema.table",
+    unique_keys=["id"],
+    conn_id="VERTICA_CONN_ID"
+)
+```
+
+**load_sql_tasks_from_dir(dir_sql, vertica_conn_id)** — build tasks from SQL files
+
+```python
+from mnemosynecore.db.vertica import load_sql_tasks_from_dir
+
+# Build tasks from a directory of SQL files
+tasks = load_sql_tasks_from_dir("/path/to/sql/files", "VERTICA_CONN_ID")
+
+# Use them in an Airflow DAG
+from airflow import DAG
+from airflow.utils.task_group import TaskGroup
+
+with DAG("my_dag") as dag:
+    with TaskGroup("vertica_tasks") as vertica_group:
+        for task in tasks.values():
+            task
+```
+
+**read_sql_file(file_path)** — read SQL files
+
+```python
+from mnemosynecore.db.vertica import read_sql_file
+
+# Read a SQL file
+sql_content = read_sql_file("/path/to/query.sql")
+if sql_content:
+    print("SQL loaded successfully")
+```
+
+### 2. Working with secrets (mnemosynecore.vault)
+
+**get_secret(conn_id)** — fetch secrets
+
+```python
+from mnemosynecore.vault import get_secret
+
+# Fetch a secret from Vault
+secret = get_secret("SECRET_ID")
+print(secret["host"])      # database host
+print(secret["password"])  # password
+```
+
+### 3. Mattermost integration (mnemosynecore.mattermost)
+
+**send_message(channel_id, bot_id, text, silent=False)** — send messages
+
+```python
+from mnemosynecore.mattermost import send_message
+
+# Send a message to a channel
+send_message(
+    channel_id="s5c11srqkf8j3pbdwfbn9imrde",
+    bot_id="MATTERMOST_BOT_ID",
+    text="Hi! This is a test message"
+)
+
+# Send a Markdown message
+send_message(
+    channel_id="s5c11srqkf8j3pbdwfbn9imrde",
+    bot_id="MATTERMOST_BOT_ID",
+    text="""
+📊 **Data report** 📊
+
+- Data loaded successfully
+- Records processed: 1000
+- Run time: 2 minutes
+
+[Details](https://your-dashboard.com)
+"""
+)
+```
+
+### 4. Superset integration (mnemosynecore.superset)
+
+**superset_request(endpoint, method="GET", payload=None, vault_conn_id)** — call the Superset API
+
+```python
+from mnemosynecore.superset import superset_request
+
+# Fetch dashboard info
+response = superset_request(
+    endpoint="/api/v1/dashboard/123",
+    method="GET",
+    vault_conn_id="SUPERSET_CONN_ID"
+)
+
+# Create a new dashboard
+new_dashboard = superset_request(
+    endpoint="/api/v1/dashboard/",
+    method="POST",
+    payload={"name": "New Dashboard"},
+    vault_conn_id="SUPERSET_CONN_ID"
+)
+```
+
+## Secret configuration
+
+Secret format for Vault.
+
+For Vertica:
+```json
+{
+    "host": "vertica-host.com",
+    "port": "5433",
+    "login": "username",
+    "password": "password",
+    "schema": "database_schema"
+}
+```
+
+For Mattermost:
+```json
+{
+    "host": "https://mattermost.company.com",
+    "password": "bot_token_here",
+    "scheme": "https",
+    "port": 443,
+    "basepath": "/api/v4"
+}
+```
+
+For Superset:
+```json
+{
+    "host": "https://superset.company.com",
+    "password": "access_token_here",
+    "scheme": "https",
+    "port": 443,
+    "basepath": "/api/v1"
+}
+```
+
+## Usage examples
+
+### Example 1: a full Vertica pipeline
+
+```python
+from mnemosynecore.db.vertica import vertica_conn, vertica_sql, vertica_select, vertica_upsert
+from mnemosynecore.mattermost import send_message
+import pandas as pd
+
+def process_data_pipeline():
+    # Connect to Vertica
+    conn = vertica_conn("VERTICA_CONN_ID")
+
+    try:
+        # Run the query
+        df = vertica_select(
+            conn=conn,
+            sql="SELECT * FROM source_table WHERE date > %s",
+            params=['2023-01-01']
+        )
+
+        # Process the data
+        processed_df = df.groupby('category').sum()
+
+        # Upsert into the target table
+        vertica_upsert(
+            df=processed_df,
+            table_name="analytics.summary",
+            unique_keys=["category"],
+            conn=conn
+        )
+
+        # Send a notification
+        send_message(
+            channel_id="s5c11srqkf8j3pbdwfbn9imrde",
+            bot_id="MATTERMOST_BOT_ID",
+            text="✅ Pipeline finished successfully"
+        )
+
+    except Exception as e:
+        send_message(
+            channel_id="s5c11srqkf8j3pbdwfbn9imrde",
+            bot_id="MATTERMOST_BOT_ID",
+            text=f"❌ Pipeline error: {str(e)}"
+        )
+        raise
+    finally:
+        conn.close()
+```
+
+### Example 2: automatic duplicate cleanup
+
+```python
+from mnemosynecore.db.vertica import vertica_dedupe
+
+def remove_duplicates():
+    vertica_dedupe(
+        table_name="analytics.user_events",
+        unique_keys=["user_id", "event_time"],
+        conn_id="VERTICA_CONN_ID",
+        date_col="event_time",
+        keep="last"
+    )
+```
+
mnemosynecore-0.1.7.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
+mnemosynecore/__init__.py,sha256=ZekrYOivI7PuaUaTVXkaCn5xo09mjRAFBuf8v3mboJA,607
+mnemosynecore/config.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mnemosynecore/mattermost.py,sha256=Kx24iU7Ncltre2HgBA2GjX5JD78JJ3kj8NOKiWQanf4,4287
+mnemosynecore/superset.py,sha256=QtQ7ZXYD0evCQB_NQJiFWtvmCT1Sx3su1Msg2G1Uhxs,548
+mnemosynecore/vault.py,sha256=CuJCMIqtTxi_Wcl1FuZaILPf-YZaBU4p8BM07d4xqy4,2746
+mnemosynecore/warnings.py,sha256=CWKRY2VZrf-MijWjUTZbhA48wC0jIjSULpV31rRpKyc,184
+mnemosynecore/db/vertica.py,sha256=w_0nZ2uRuL27TxpmD_60frriCkG05uyaDzNEOqfCN1w,10442
+mnemosynecore-0.1.7.dist-info/METADATA,sha256=4Y3f31A35zgsKy7mz3QRPknOoVikxCt7XPRt4s4grPU,8534
+mnemosynecore-0.1.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mnemosynecore-0.1.7.dist-info/top_level.txt,sha256=v87precv4itopYOciTHITY3GSFqZsGkgenQmDFKWpFU,14
+mnemosynecore-0.1.7.dist-info/RECORD,,
mnemosynecore-0.1.7.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+mnemosynecore