morin 0.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- morin-0.2.6/LICENSE +21 -0
- morin-0.2.6/PKG-INFO +11 -0
- morin-0.2.6/README.md +1 -0
- morin-0.2.6/morin/__init__.py +3 -0
- morin-0.2.6/morin/clickhouse.py +183 -0
- morin-0.2.6/morin/common.py +247 -0
- morin-0.2.6/morin/ozon_by_date.py +584 -0
- morin-0.2.6/morin/ozon_reklama.py +578 -0
- morin-0.2.6/morin/wb_by_date.py +337 -0
- morin-0.2.6/morin/wb_reklama.py +366 -0
- morin-0.2.6/morin/yd_by_date.py +242 -0
- morin-0.2.6/morin.egg-info/PKG-INFO +11 -0
- morin-0.2.6/morin.egg-info/SOURCES.txt +16 -0
- morin-0.2.6/morin.egg-info/dependency_links.txt +1 -0
- morin-0.2.6/morin.egg-info/requires.txt +4 -0
- morin-0.2.6/morin.egg-info/top_level.txt +1 -0
- morin-0.2.6/setup.cfg +4 -0
- morin-0.2.6/setup.py +17 -0
morin-0.2.6/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Директ-ПРО
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
morin-0.2.6/PKG-INFO
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: morin
|
|
3
|
+
Version: 0.2.6
|
|
4
|
+
Summary: Помощь в подключениях и загрузке в БД
|
|
5
|
+
Home-page: https://github.com/morinad/morin
|
|
6
|
+
Author: Александр Морин
|
|
7
|
+
Author-email: y.director@yandex.ru
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
|
|
11
|
+
# morin
|
morin-0.2.6/README.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# morin
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
from .common import Common
|
|
2
|
+
import requests
|
|
3
|
+
from datetime import datetime,timedelta
|
|
4
|
+
import clickhouse_connect
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import os
|
|
7
|
+
from dateutil import parser
|
|
8
|
+
import time
|
|
9
|
+
import logging
|
|
10
|
+
import hashlib
|
|
11
|
+
from io import StringIO
|
|
12
|
+
import json
|
|
13
|
+
import math
|
|
14
|
+
|
|
15
|
+
class Clickhouse:
    """Loads marketplace report data into ClickHouse.

    Creates/alters target tables from sampled data, inserts dataframes,
    and tracks per-date collection status in a `{platform}_collection_{add_name}`
    ReplacingMergeTree table so historical dates are not fetched twice.
    """

    def __init__(self, logging_path: str, host: str, port: str, username: str, password: str,
                 database: str, start: str, add_name: str, err429: bool, backfill_days: int, platform: str):
        # ClickHouse connection settings.
        self.logging_path = logging_path
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self.database = database
        self.now = datetime.now()
        self.start = start                  # first date (YYYY-MM-DD) to backfill from
        self.add_name = add_name            # client suffix appended to every table name
        self.err429 = err429                # True once the API returned HTTP 429; blocks further uploads
        self.backfill_days = backfill_days  # dates within this window are re-collected each run
        self.today = datetime.now().date()
        self.platform = platform
        self.common = Common(logging_path)
        logging.basicConfig(filename=self.logging_path, level=logging.INFO,
                            format='%(asctime)s - %(levelname)s - %(message)s')

    # dataframe + target table name -> insert the rows, then OPTIMIZE ... FINAL
    def ch_insert(self, df, to_table):
        """Insert `df` into `to_table` and force a merge; re-raises on failure."""
        client = None  # pre-bind so `finally` cannot raise NameError when get_client itself fails
        try:
            data_tuples = [tuple(x) for x in df.to_numpy()]
            client = clickhouse_connect.get_client(host=self.host, port=self.port, username=self.username,
                                                   password=self.password, database=self.database)
            client.insert(to_table, data_tuples, column_names=df.columns.tolist())
            print(f'Данные вставлены в CH, таблица {to_table}')
            logging.info(f'Данные вставлены в CH, таблица {to_table}')
            # Force a merge so ReplacingMergeTree deduplication takes effect immediately.
            optimize_table = f"OPTIMIZE TABLE {to_table} FINAL"
            client.command(optimize_table)
        except Exception as e:
            print(f'Ошибка вставки в CH: {e}')
            logging.info(f'Ошибка вставки в CH: {e}')
            raise
        finally:
            if client:
                client.close()

    # run an arbitrary SQL statement; errors are logged, not re-raised
    def ch_execute(self, expression):
        """Execute one SQL statement, logging success/failure (best-effort)."""
        client = None  # pre-bind for the `finally` block
        # Build the short display form before the try: the original computed it only on
        # success, so a connection failure raised NameError inside the except branch.
        disp_exp = expression.strip()[:17] + '...'
        try:
            client = clickhouse_connect.get_client(host=self.host, port=self.port, username=self.username,
                                                   password=self.password, database=self.database)
            client.command(expression)
            print(f'Выражение {disp_exp} выполнено')
            logging.info(f'Выражение {disp_exp} выполнено')
        except Exception as e:
            print(f'Ошибка выражения {disp_exp}: {e}')
            logging.info(f'Ошибка выражения {disp_exp}: {e}')
        finally:
            if client:
                client.close()

    # list of dicts (data) + uniqueness + table name -> create/alter the ClickHouse table
    def create_alter_ch(self, data, table_name, uniq_columns, partitions, mergetree):
        """Create `table_name` from inferred column types; ADD COLUMN for any new fields.

        Column types come from Common.analyze_column_types; a `timeStamp DateTime`
        column is always appended. Errors are logged, not re-raised.
        """
        client = None  # pre-bind so `finally` is safe even if column analysis fails
        try:
            upload_list = self.common.analyze_column_types(data, uniq_columns, partitions)
            upload_set = set(upload_list)
            uploads = ''
            for i in upload_list:
                uploads += i + ',\n'
            if partitions == '':
                part_part = ''
            else:
                part_part = f'PARTITION BY {partitions}'
            create_table_query_campaigns = f'CREATE TABLE IF NOT EXISTS {table_name} (' + uploads + f'timeStamp DateTime ) ENGINE = {mergetree} ORDER BY ({uniq_columns}) {part_part}'
            client = clickhouse_connect.get_client(host=self.host, port=self.port, username=self.username,
                                                   password=self.password, database=self.database)
            client.query(create_table_query_campaigns)
            # Compare expected columns with the live schema; add anything missing.
            query = f"DESCRIBE TABLE {table_name};"
            result = client.query(query)
            columns_info = result.result_rows
            current_set = set([f"{col[0]} {col[1]}" for col in columns_info])
            diff = list(upload_set - current_set)
            if len(diff) > 0:
                start_alter_exp = f'ALTER TABLE {table_name} '
                for d in diff:
                    alter_exp = start_alter_exp + 'ADD COLUMN IF NOT EXISTS ' + d + ' AFTER timeStamp;'
                    print(f'Попытка изменения {table_name}, Формула: {alter_exp}')
                    logging.info(f'Попытка изменения {table_name}, Формула: {alter_exp}')
                    client.query(alter_exp)
                    print(f'Успешное изменение {table_name}, Формула: {alter_exp}')
                    logging.info(f'Успешное изменение {table_name}, Формула: {alter_exp}')
                    time.sleep(2)
            else:
                print(f'Данные готовы для вставки в {table_name}')
                logging.info(f'Данные готовы для вставки в {table_name}')
        except Exception as e:
            print(f'Ошибка подготовки данных: {e}')
            logging.info(f'Ошибка подготовки данных: {e}')
        finally:
            # The original leaked this connection; always close it.
            if client:
                client.close()

    def get_missing_dates(self, table_name, report_name, start_date_str):
        """Return dates (YYYY-MM-DD strings) from `start_date_str` up to yesterday
        that have no collect=True row for `report_name` in the collection table.

        On any failure an empty list is returned (the original raised NameError
        here, masking the real error).
        """
        missing_dates_str = []  # pre-bind so the return below is always valid
        client = None
        try:
            start_date = datetime.strptime(start_date_str, '%Y-%m-%d').date()
            query = f"""
            SELECT date
            FROM {table_name}
            WHERE report = '{report_name}' and collect = True"""
            client = clickhouse_connect.get_client(host=self.host, port=self.port, username=self.username,
                                                   password=self.password, database=self.database)
            result = client.query(query)
            existing_dates = {row[0] for row in result.result_rows}
            current_date = start_date
            all_dates = set()
            # Every date strictly before today is a candidate.
            while current_date < self.today:
                all_dates.add(current_date)
                current_date += timedelta(days=1)
            missing_dates = sorted(all_dates - existing_dates)
            missing_dates_str = [date.strftime('%Y-%m-%d') for date in missing_dates]
            print(f'Успешное получение дат. Таблица: {table_name}, Старт: {start_date}')
            logging.info(f'Успешное получение дат. Таблица: {table_name}, Старт: {start_date}')
        except Exception as e:
            print(f'Ошибка получения дат: {e}')
            logging.info(f'Ошибка получения дат: {e}')
        finally:
            # The original never closed this connection.
            if client:
                client.close()
        return missing_dates_str

    def upload_data(self, platform, report_name, upload_table, func_name, uniq_columns, partitions, merge_type, refresh_type, history, delay, date):
        """Fetch one date of report data via `func_name(date)` and load it into
        `{platform}_{upload_table}_{add_name}`, then record collection status.

        Raises ValueError if an HTTP 429 was previously detected (self.err429).
        """
        if self.err429 == False:
            try:
                n_days_ago = self.today - timedelta(days=self.backfill_days)
                table_name = f'{platform}_{upload_table}_{self.add_name}'
                # Pick how to refresh the target before inserting new rows.
                if refresh_type == 'delete_date':
                    refresh = f"ALTER TABLE {table_name} DROP PARTITION '{date}';"
                elif refresh_type == 'delete_all':
                    refresh = f"TRUNCATE TABLE {table_name};"
                else:
                    refresh = f"OPTIMIZE TABLE {table_name};"
                data = func_name(date)
                collect = True
                # Dates inside the backfill window are marked collect=False so that
                # get_missing_dates offers them again on the next run.
                if history and datetime.strptime(date, '%Y-%m-%d').date() >= n_days_ago:
                    collect = False
                collection_data = pd.DataFrame({'date': pd.to_datetime([date], format='%Y-%m-%d'), 'report': [report_name], 'collect': [collect]})
                self.create_alter_ch(data, table_name, uniq_columns, partitions, merge_type)
                df = self.common.check_and_convert_types(data, uniq_columns, partitions)
                self.ch_execute(refresh)
                self.ch_insert(df, table_name)
                self.ch_insert(collection_data, f'{platform}_collection_{self.add_name}')
                print(f'Данные добавлены. Репорт: {report_name}. Дата: {date}')
                logging.info(f'Данные добавлены. Репорт: {report_name}. Дата: {date}')
                time.sleep(delay)
            except Exception as e:
                print(f'Ошибка вставки: {e}. Репорт: {report_name}. Дата: {date}')
                logging.info(f'Ошибка вставки: {e}. Репорт: {report_name}. Дата: {date}')
                time.sleep(delay)
        else:
            raise ValueError("Обнаружена ошибка 429")

    def collecting_report(self, platform, report_name, upload_table, func_name, uniq_columns, partitions, merge_type, refresh_type, history, frequency, delay):
        """Entry point: ensure the collection-status table exists, then upload either
        every missing historical date (history=True) or just today's data,
        honoring the `frequency` schedule (see Common.to_collect)."""
        logging.info(f"Начинаем сбор {report_name} для клиента: {self.add_name}")
        print(f"Начинаем сбор {report_name} для клиента: {self.add_name}")
        create_table_query_collect = f"""
        CREATE TABLE IF NOT EXISTS {platform}_collection_{self.add_name} (
        date Date, report String, collect Bool ) ENGINE = ReplacingMergeTree(collect) ORDER BY (report, date)"""
        optimize_collection = f"OPTIMIZE TABLE {platform}_collection_{self.add_name} FINAL"
        self.ch_execute(create_table_query_collect)
        self.ch_execute(optimize_collection)
        time.sleep(3)
        if history:
            date_list = self.get_missing_dates(f'{platform}_collection_{self.add_name}', report_name, self.start)
            for date in date_list:
                if self.err429 == False and self.common.to_collect(frequency, date):
                    print(f'Начинаем сбор. Репорт: {report_name}, Дата: {date}')
                    logging.info(f'Начинаем сбор. Репорт: {report_name}, Дата: {date}')
                    self.upload_data(platform, report_name, upload_table, func_name, uniq_columns, partitions, merge_type, refresh_type, history, delay, date)
        else:
            date = self.today.strftime('%Y-%m-%d')
            if self.err429 == False and self.common.to_collect(frequency, date):
                print(f'Начинаем сбор. Репорт: {report_name}, Дата: {date}')
                logging.info(f'Начинаем сбор. Репорт: {report_name}, Дата: {date}')
                self.upload_data(platform, report_name, upload_table, func_name, uniq_columns, partitions, merge_type, refresh_type, history, delay, date)
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
from datetime import datetime,timedelta
|
|
3
|
+
import clickhouse_connect
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import os
|
|
6
|
+
from dateutil import parser
|
|
7
|
+
import time
|
|
8
|
+
import logging
|
|
9
|
+
import hashlib
|
|
10
|
+
from io import StringIO
|
|
11
|
+
import json
|
|
12
|
+
import math
|
|
13
|
+
|
|
14
|
+
class Common:
|
|
15
|
+
def __init__(self, logging_path:str):
|
|
16
|
+
self.logging_path = logging_path
|
|
17
|
+
self.now = datetime.now()
|
|
18
|
+
self.today = datetime.now().date()
|
|
19
|
+
logging.basicConfig(filename=self.logging_path, level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
20
|
+
|
|
21
|
+
def shorten_text(self, text):
|
|
22
|
+
# Используем хеш-функцию md5 для сокращения строки
|
|
23
|
+
hash_object = hashlib.md5(text.encode()) # Можно также использовать sha256
|
|
24
|
+
return hash_object.hexdigest()[:10] # Возвращаем первые 10 символов хеша
|
|
25
|
+
|
|
26
|
+
def shift_date(self, date_str, days=7):
|
|
27
|
+
# Преобразуем строку в объект datetime
|
|
28
|
+
date_obj = datetime.strptime(date_str, '%Y-%m-%d')
|
|
29
|
+
# Сдвигаем дату на указанное количество дней назад
|
|
30
|
+
new_date = date_obj - timedelta(days=days)
|
|
31
|
+
# Преобразуем дату обратно в строку
|
|
32
|
+
return new_date.strftime('%Y-%m-%d')
|
|
33
|
+
|
|
34
|
+
# значение -> тип значения для clickhouse
|
|
35
|
+
def get_data_type(self, column, value, partitions):
|
|
36
|
+
if value == None: return 'None'
|
|
37
|
+
part_list = partitions.replace(' ', '').split(',')
|
|
38
|
+
if isinstance(value, str):
|
|
39
|
+
if value.lower() == 'false' or value.lower() == 'true':
|
|
40
|
+
return 'UInt8'
|
|
41
|
+
date_formats = [
|
|
42
|
+
'%Y-%m-%dT%H:%M:%S', # ISO формат DateTime: 2024-09-01T21:20:10
|
|
43
|
+
'%Y-%m-%d %H:%M:%S', # DateTime с пробелом: 2024-09-01 21:20:10
|
|
44
|
+
'%Y-%m-%d', # Формат Date: 2021-09-08
|
|
45
|
+
'%d-%m-%Y', # Формат Date с днем в начале: 08-09-2021
|
|
46
|
+
'%Y/%m/%d', # Формат Date через слэш: 2024/09/01
|
|
47
|
+
'%H:%M:%S', # Формат Time: 21:20:10
|
|
48
|
+
]
|
|
49
|
+
for date_format in date_formats:
|
|
50
|
+
try:
|
|
51
|
+
parsed_date = datetime.strptime(value, date_format)
|
|
52
|
+
# Если дата меньше 1970 года — это не допустимая дата для ClickHouse
|
|
53
|
+
if parsed_date.year < 1970:
|
|
54
|
+
return 'String'
|
|
55
|
+
# Определяем тип на основе формата
|
|
56
|
+
if date_format in ['%Y-%m-%d', '%d-%m-%Y', '%Y/%m/%d']:
|
|
57
|
+
return 'Date' # Это формат Date
|
|
58
|
+
elif date_format == '%H:%M:%S':
|
|
59
|
+
return 'Time' # Это формат Time
|
|
60
|
+
else:
|
|
61
|
+
return 'DateTime' # Форматы с датой и временем
|
|
62
|
+
except ValueError:
|
|
63
|
+
continue # Если строка не соответствует формату, продолжаем проверку
|
|
64
|
+
try:
|
|
65
|
+
parsed_date = parser.isoparse(value)
|
|
66
|
+
return 'DateTime' # Это формат DateTime с временной зоной
|
|
67
|
+
except (ValueError, TypeError):
|
|
68
|
+
pass
|
|
69
|
+
try:
|
|
70
|
+
if value.endswith('Z'):
|
|
71
|
+
value = value[:-1] + '+00:00' # Заменяем 'Z' на '+00:00' для UTC
|
|
72
|
+
if '+' in value and value[-3] == ':':
|
|
73
|
+
value = value[:-3] + value[-2:] # Убираем двоеточие в зоне (+00:00 -> +0000)
|
|
74
|
+
parsed_date = datetime.fromisoformat(value)
|
|
75
|
+
return 'DateTime' # Формат с датой, временем и временной зоной
|
|
76
|
+
except ValueError:
|
|
77
|
+
pass
|
|
78
|
+
return 'String' # Если ничего не подошло, возвращаем String
|
|
79
|
+
|
|
80
|
+
# Если значение булевое
|
|
81
|
+
elif isinstance(value, bool):
|
|
82
|
+
return 'UInt8'
|
|
83
|
+
|
|
84
|
+
# Если значение целое число
|
|
85
|
+
elif isinstance(value, int):
|
|
86
|
+
if len(str(abs(value))) > 10 or column in part_list:
|
|
87
|
+
return 'String'
|
|
88
|
+
return 'Float64'
|
|
89
|
+
|
|
90
|
+
# Если значение с плавающей запятой
|
|
91
|
+
elif isinstance(value, float):
|
|
92
|
+
if math.isnan(value):
|
|
93
|
+
return 'Float64'
|
|
94
|
+
if len(str(int(abs(value)))) > 10 or column in part_list:
|
|
95
|
+
return 'String'
|
|
96
|
+
return 'Float64'
|
|
97
|
+
|
|
98
|
+
# Для всех остальных типов
|
|
99
|
+
else:
|
|
100
|
+
return 'String'
|
|
101
|
+
|
|
102
|
+
def column_to_datetime(self, date_str):
|
|
103
|
+
if pd.isna(date_str):
|
|
104
|
+
return None
|
|
105
|
+
date_str = date_str.strip()
|
|
106
|
+
|
|
107
|
+
# Обрабатываем таймзону 'Z' (UTC) и заменяем на '+0000'
|
|
108
|
+
if date_str.endswith('Z'):
|
|
109
|
+
date_str = date_str[:-1] + '+0000'
|
|
110
|
+
# Обрабатываем таймзоны вида +00:00 и заменяем на +0000
|
|
111
|
+
elif '+' in date_str and date_str.endswith(':00'):
|
|
112
|
+
date_str = date_str[:-3] + date_str[-2:]
|
|
113
|
+
|
|
114
|
+
date_formats = [
|
|
115
|
+
"%Y-%m-%dT%H:%M:%S.%f%z", # 2023-10-22T16:36:15.507+0000
|
|
116
|
+
"%Y-%m-%d %H:%M:%S.%f%z", # 2023-10-22 16:36:15.507+0000
|
|
117
|
+
"%Y-%m-%dT%H:%M:%S%z", # 2023-10-22T16:36:15+0000
|
|
118
|
+
"%Y-%m-%d %H:%M:%S%z", # 2023-10-22 16:36:15+0000
|
|
119
|
+
"%Y-%m-%dT%H:%M:%S.%f", # 2023-10-22T16:36:15.507 (без таймзоны)
|
|
120
|
+
"%Y-%m-%d %H:%M:%S.%f", # 2023-10-22 16:36:15.507 (без 'T')
|
|
121
|
+
"%Y-%m-%dT%H:%M:%S", # 2023-10-22T16:36:15 (без миллисекунд и таймзоны)
|
|
122
|
+
"%Y-%m-%d %H:%M:%S", # 2023-10-22 16:36:15 (без 'T', без миллисекунд)
|
|
123
|
+
"%Y-%m-%d", # 2023-10-22 (только дата)
|
|
124
|
+
"%d-%m-%Y" # 22-10-2023 (европейский формат)
|
|
125
|
+
]
|
|
126
|
+
|
|
127
|
+
for fmt in date_formats:
|
|
128
|
+
try:
|
|
129
|
+
dt = datetime.strptime(date_str, fmt)
|
|
130
|
+
return dt.strftime("%Y-%m-%d %H:%M:%S.%f")
|
|
131
|
+
except ValueError:
|
|
132
|
+
continue
|
|
133
|
+
return None
|
|
134
|
+
# список словарей (данные) -> список поле_типданных
|
|
135
|
+
|
|
136
|
+
def analyze_column_types(self, data, uniq_columns, partitions):
|
|
137
|
+
try:
|
|
138
|
+
column_types = {}
|
|
139
|
+
# Проходим по всем строкам в данных
|
|
140
|
+
for row in data:
|
|
141
|
+
for column, value in row.items():
|
|
142
|
+
value_type = self.get_data_type(column, value, partitions) # Определяем тип данных
|
|
143
|
+
if column not in column_types:
|
|
144
|
+
column_types[column] = set() # Создаем множество для уникальных типов
|
|
145
|
+
column_types[column].add(value_type)
|
|
146
|
+
# Приводим типы столбцов к общему типу
|
|
147
|
+
final_column_types = {}
|
|
148
|
+
for column, types in column_types.items():
|
|
149
|
+
try: types.remove('None')
|
|
150
|
+
except: pass
|
|
151
|
+
if len(types) == 1:
|
|
152
|
+
final_column_types[column] = next(iter(types))
|
|
153
|
+
else:
|
|
154
|
+
final_column_types[column] = 'String' # Если разные типы, делаем строкой
|
|
155
|
+
create_table_query = []
|
|
156
|
+
non_nullable_list = uniq_columns.replace(' ','').split(',')+[partitions.strip()]
|
|
157
|
+
for field, data_type in final_column_types.items():
|
|
158
|
+
field_type = f'Nullable({data_type})'
|
|
159
|
+
for non in non_nullable_list:
|
|
160
|
+
if field == non:
|
|
161
|
+
field_type = f'{data_type}'
|
|
162
|
+
create_table_query.append(f"{field} {field_type}")
|
|
163
|
+
except Exception as e:
|
|
164
|
+
print(f'Ошибка анализа: {e}')
|
|
165
|
+
logging.info(f'Ошибка анализа: {e}')
|
|
166
|
+
return create_table_query
|
|
167
|
+
|
|
168
|
+
# список словарей (данные) -> датафрейм с нужными типами
|
|
169
|
+
def check_and_convert_types(self, data, uniq_columns, partitions):
|
|
170
|
+
try:
|
|
171
|
+
columns_list=self.analyze_column_types(data, uniq_columns, partitions)
|
|
172
|
+
df=pd.DataFrame(data,dtype=str)
|
|
173
|
+
type_mapping = {
|
|
174
|
+
'UInt8': 'bool',
|
|
175
|
+
'Nullable(UInt8)': 'bool',
|
|
176
|
+
'Date': 'datetime64[ns]', # pandas формат для дат
|
|
177
|
+
'DateTime': 'datetime64[ns]', # pandas формат для дат с временем
|
|
178
|
+
'String': 'object', # Строковый формат в pandas
|
|
179
|
+
'Float64': 'float64', # float64 тип в pandas
|
|
180
|
+
'Nullable(Date)': 'datetime64[ns]', # pandas формат для дат
|
|
181
|
+
'Nullable(DateTime)': 'datetime64[ns]', # pandas формат для дат с временем
|
|
182
|
+
'Nullable(String)': 'object', # Строковый формат в pandas
|
|
183
|
+
'Nullable(Float64)': 'float64' # float64 тип в pandas
|
|
184
|
+
}
|
|
185
|
+
for item in columns_list:
|
|
186
|
+
column_name, expected_type = item.split() # Разделяем по пробелу: 'column_name expected_type'
|
|
187
|
+
if column_name in df.columns:
|
|
188
|
+
expected_type = expected_type.strip()
|
|
189
|
+
try:
|
|
190
|
+
if expected_type in ['Date', 'Nullable(Date)']:
|
|
191
|
+
df[column_name] = df[column_name].apply(self.column_to_datetime)
|
|
192
|
+
df[column_name] = pd.to_datetime(df[column_name], errors='raise')
|
|
193
|
+
df[column_name] = df[column_name].fillna(pd.to_datetime('1970-01-01').date())
|
|
194
|
+
if expected_type in ['DateTime', 'Nullable(DateTime)']:
|
|
195
|
+
df[column_name] = df[column_name].apply(self.column_to_datetime)
|
|
196
|
+
df[column_name] = pd.to_datetime(df[column_name], errors='raise')
|
|
197
|
+
df[column_name] = df[column_name].fillna(pd.Timestamp('1970-01-01'))
|
|
198
|
+
elif expected_type in ['UInt8','Nullable(UInt8)']:
|
|
199
|
+
df[column_name] = df[column_name].replace({'True': True, 'False': False, 'true': True, 'false': False, })
|
|
200
|
+
df[column_name] = df[column_name].fillna(False)
|
|
201
|
+
df[column_name] = df[column_name].astype('bool')
|
|
202
|
+
elif expected_type in ['Float64','Nullable(Float64)']:
|
|
203
|
+
df[column_name] = pd.to_numeric(df[column_name], errors='raise').astype('float64')
|
|
204
|
+
df[column_name] = df[column_name].fillna(0)
|
|
205
|
+
elif expected_type in ['String','Nullable(String)']:
|
|
206
|
+
df[column_name] = df[column_name].astype(str)
|
|
207
|
+
df[column_name] = df[column_name].fillna("")
|
|
208
|
+
except Exception as e:
|
|
209
|
+
print(f"Ошибка при преобразовании столбца '{column_name}': {e}")
|
|
210
|
+
logging.info(f"Ошибка при преобразовании столбца '{column_name}': {e}")
|
|
211
|
+
df['timeStamp'] = self.now
|
|
212
|
+
print(f'Датафрейм успешно преобразован')
|
|
213
|
+
logging.info(f'Датафрейм успешно преобразован')
|
|
214
|
+
except Exception as e:
|
|
215
|
+
print(f'Ошибка преобразования df: {e}')
|
|
216
|
+
logging.info(f'Ошибка преобразования df: {e}')
|
|
217
|
+
return df
|
|
218
|
+
|
|
219
|
+
def to_collect(self, schedule_str, date_str):
|
|
220
|
+
try:
|
|
221
|
+
today = datetime.strptime(date_str, '%Y-%m-%d').date()
|
|
222
|
+
except ValueError:
|
|
223
|
+
raise ValueError("Дата должна быть в формате 'YYYY-MM-DD'")
|
|
224
|
+
day_of_week = today.strftime('%A').lower() # День недели (например, 'friday')
|
|
225
|
+
day_of_month = today.day # Число месяца (например, 22)
|
|
226
|
+
schedule_list = [s.strip().lower() for s in schedule_str.split(',')]
|
|
227
|
+
for schedule in schedule_list:
|
|
228
|
+
if schedule == 'daily': # Если указано "daily", всегда возвращаем True
|
|
229
|
+
return True
|
|
230
|
+
if schedule == day_of_week: # Проверка дня недели (например, 'friday')
|
|
231
|
+
return True
|
|
232
|
+
if schedule.isdigit() and int(schedule) == day_of_month: # Проверка числа месяца
|
|
233
|
+
return True
|
|
234
|
+
return False
|
|
235
|
+
|
|
236
|
+
def spread_table(self, source_list):
|
|
237
|
+
result_list = []
|
|
238
|
+
for row in source_list:
|
|
239
|
+
row_dict = {}
|
|
240
|
+
for key, value in row.items():
|
|
241
|
+
if isinstance(value, dict):
|
|
242
|
+
for name, inner_value in dict(value).items():
|
|
243
|
+
row_dict[f'{key}_{name}'] = inner_value
|
|
244
|
+
else:
|
|
245
|
+
row_dict[f'{key}'] = value
|
|
246
|
+
result_list.append(row_dict)
|
|
247
|
+
return result_list
|