connector-for-bb 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- connector_for_bb-0.0.1/MANIFEST.in +10 -0
- connector_for_bb-0.0.1/PKG-INFO +19 -0
- connector_for_bb-0.0.1/README.md +1 -0
- connector_for_bb-0.0.1/VERSION +1 -0
- connector_for_bb-0.0.1/setup.cfg +128 -0
- connector_for_bb-0.0.1/setup.py +70 -0
- connector_for_bb-0.0.1/src/connector_for_bb.egg-info/PKG-INFO +19 -0
- connector_for_bb-0.0.1/src/connector_for_bb.egg-info/SOURCES.txt +23 -0
- connector_for_bb-0.0.1/src/connector_for_bb.egg-info/dependency_links.txt +1 -0
- connector_for_bb-0.0.1/src/connector_for_bb.egg-info/requires.txt +19 -0
- connector_for_bb-0.0.1/src/connector_for_bb.egg-info/top_level.txt +1 -0
- connector_for_bb-0.0.1/src/dump/__init__.py +0 -0
- connector_for_bb-0.0.1/src/dump/click_house/__init__.py +0 -0
- connector_for_bb-0.0.1/src/dump/click_house/connectors.py +32 -0
- connector_for_bb-0.0.1/src/dump/config_utils.py +19 -0
- connector_for_bb-0.0.1/src/dump/files/__init__.py +0 -0
- connector_for_bb-0.0.1/src/dump/files/tmp_util.py +44 -0
- connector_for_bb-0.0.1/src/dump/postgres/__init__.py +0 -0
- connector_for_bb-0.0.1/src/dump/postgres/connectors.py +186 -0
- connector_for_bb-0.0.1/src/dump/postgres/manipulators.py +138 -0
- connector_for_bb-0.0.1/src/dump/postgres/table.py +167 -0
- connector_for_bb-0.0.1/src/dump/postgres/types_util.py +32 -0
- connector_for_bb-0.0.1/src/dump/s3/__init__.py +0 -0
- connector_for_bb-0.0.1/src/dump/s3/connectors.py +116 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
global-exclude *.py[co]
|
|
2
|
+
global-exclude __pycache__
|
|
3
|
+
graft tests
|
|
4
|
+
include README.md
|
|
5
|
+
include VERSION
|
|
6
|
+
recursive-include src/dump *.py
|
|
7
|
+
recursive-include src/dump/click_house *.py
|
|
8
|
+
recursive-include src/dump/files *.py
|
|
9
|
+
recursive-include src/dump/postgres *.py
|
|
10
|
+
recursive-include src/dump/s3 *.py
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: connector_for_bb
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: connector
|
|
5
|
+
Home-page: https://github.com/your_username/your_repository
|
|
6
|
+
Author: Ivan
|
|
7
|
+
Author-email: example@example.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Platform: UNKNOWN
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Requires-Python: >=3.10
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
Provides-Extra: develop
|
|
16
|
+
|
|
17
|
+
Dump project
|
|
18
|
+
|
|
19
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
Dump project
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.0.1
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
[metadata]
|
|
2
|
+
name = connector_for_bb
|
|
3
|
+
version = 0.0.1
|
|
4
|
+
description = connector
|
|
5
|
+
long_description = file: README.md
|
|
6
|
+
long_description_content_type = text/markdown
|
|
7
|
+
author = Ivan
|
|
8
|
+
author_email = example@example.com
|
|
9
|
+
license = MIT
|
|
10
|
+
url = https://github.com/your_username/your_repository
|
|
11
|
+
classifiers =
|
|
12
|
+
Programming Language :: Python :: 3
|
|
13
|
+
License :: OSI Approved :: MIT License
|
|
14
|
+
Operating System :: OS Independent
|
|
15
|
+
|
|
16
|
+
[aliases]
|
|
17
|
+
test = pytest
|
|
18
|
+
|
|
19
|
+
[coverage:report]
|
|
20
|
+
exclude_lines =
|
|
21
|
+
@abc.abstractmethod
|
|
22
|
+
@abc.abstractproperty
|
|
23
|
+
CancelledError
|
|
24
|
+
NotImplementedError
|
|
25
|
+
pragma: no cover
|
|
26
|
+
__repr__
|
|
27
|
+
__str__
|
|
28
|
+
fail_under = 81
|
|
29
|
+
precision = 2
|
|
30
|
+
show_missing = True
|
|
31
|
+
|
|
32
|
+
[coverage:run]
|
|
33
|
+
branch = True
|
|
34
|
+
source =
|
|
35
|
+
src
|
|
36
|
+
tests
|
|
37
|
+
|
|
38
|
+
[flake8]
|
|
39
|
+
ignore = E203,W503
|
|
40
|
+
max-line-length = 95
|
|
41
|
+
|
|
42
|
+
[isort]
|
|
43
|
+
atomic = true
|
|
44
|
+
default_section = THIRDPARTY
|
|
45
|
+
force_grid_wrap = 0
|
|
46
|
+
include_trailing_comma = true
|
|
47
|
+
indent = ' '
|
|
48
|
+
known_first_party = dsp_dags
|
|
49
|
+
known_third_party =
|
|
50
|
+
line_length = 95
|
|
51
|
+
lines_after_imports = 2
|
|
52
|
+
multi_line_output = 3
|
|
53
|
+
not_skip = __init__.py
|
|
54
|
+
order_by_type = true
|
|
55
|
+
sections = FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER
|
|
56
|
+
use_parentheses = True
|
|
57
|
+
|
|
58
|
+
[mypy]
|
|
59
|
+
check_untyped_defs = True
|
|
60
|
+
|
|
61
|
+
[mypy-lightfm.*]
|
|
62
|
+
ignore_missing_imports = True
|
|
63
|
+
|
|
64
|
+
[mypy-numpy.*]
|
|
65
|
+
ignore_missing_imports = True
|
|
66
|
+
|
|
67
|
+
[mypy-pandas.*]
|
|
68
|
+
ignore_missing_imports = True
|
|
69
|
+
|
|
70
|
+
[mypy-pytest.*]
|
|
71
|
+
ignore_missing_imports = True
|
|
72
|
+
|
|
73
|
+
[mypy-scipy.*]
|
|
74
|
+
ignore_missing_imports = True
|
|
75
|
+
|
|
76
|
+
[mypy-sklearn.*]
|
|
77
|
+
ignore_missing_imports = True
|
|
78
|
+
|
|
79
|
+
[mypy-xgboost.*]
|
|
80
|
+
ignore_missing_imports = True
|
|
81
|
+
|
|
82
|
+
[mypy-psycopg2.*]
|
|
83
|
+
ignore_missing_imports = True
|
|
84
|
+
|
|
85
|
+
[mypy-clickhouse_connect.*]
|
|
86
|
+
ignore_missing_imports = True
|
|
87
|
+
|
|
88
|
+
[options]
|
|
89
|
+
include_package_data = True
|
|
90
|
+
install_requires =
|
|
91
|
+
tqdm==4.66.5
|
|
92
|
+
catboost==1.2.7
|
|
93
|
+
numpy==1.26.0
|
|
94
|
+
pandas==2.2.2
|
|
95
|
+
psycopg2-binary==2.9.9
|
|
96
|
+
boto3==1.35.30
|
|
97
|
+
scikit-learn==1.5.2
|
|
98
|
+
clickhouse-connect==0.8.6
|
|
99
|
+
package_dir =
|
|
100
|
+
= src
|
|
101
|
+
packages = find:
|
|
102
|
+
python_requires = >=3.10
|
|
103
|
+
|
|
104
|
+
[options.extras_require]
|
|
105
|
+
develop =
|
|
106
|
+
black==24.8.0
|
|
107
|
+
coverage==5.5
|
|
108
|
+
flake8==6.0.0
|
|
109
|
+
hypothesis==6.9.2
|
|
110
|
+
isort==5.8.0
|
|
111
|
+
mypy==0.812
|
|
112
|
+
pylint==2.17.4
|
|
113
|
+
pytest==7.3.1
|
|
114
|
+
responses==0.13.2
|
|
115
|
+
|
|
116
|
+
[options.packages.find]
|
|
117
|
+
where = src
|
|
118
|
+
|
|
119
|
+
[tool:pytest]
|
|
120
|
+
addopts =
|
|
121
|
+
--junitxml=junit.xml
|
|
122
|
+
--showlocals
|
|
123
|
+
--verbose
|
|
124
|
+
|
|
125
|
+
[egg_info]
|
|
126
|
+
tag_build =
|
|
127
|
+
tag_date = 0
|
|
128
|
+
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import subprocess
|
|
3
|
+
from setuptools import setup, find_packages
|
|
4
|
+
from setuptools.command.sdist import sdist as sdist_orig
|
|
5
|
+
|
|
6
|
+
ROOT = os.path.dirname(__file__)
|
|
7
|
+
VERSION = os.path.join(ROOT, "VERSION")
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def format_version(raw_version):
    """
    Convert a raw version string into a PEP 440 style version.

    * A leading ``v`` is removed.
    * ``<tag>-<distance>-<hash>`` (the long form printed by ``git describe``)
      becomes ``<tag>.dev<distance>+<hash>``.
    * A dash-free string that already looks like a version (``0.0.1``,
      ``0.0.1.dev3+gabc1234``) is returned unchanged.
    * Anything else, e.g. the bare commit hash printed by
      ``git describe --always`` when there are no tags, is attached as local
      metadata to a placeholder release: ``0.1.0+<raw_version>``.
    * An empty string is returned as-is so callers can treat it as
      "no version found".
    """
    import re  # local import: only this function needs it

    if raw_version.startswith("v"):  # drop the "v" prefix, if present
        raw_version = raw_version[1:]

    if not raw_version:
        return raw_version

    parts = raw_version.split("-")
    if len(parts) == 3:
        base, distance, commit_hash = parts
        return f"{base}.dev{distance}+{commit_hash}"

    # NOTE: an all-digit commit hash is indistinguishable from a release
    # number and is still returned verbatim.
    release_like = re.fullmatch(
        r"\d+(?:\.\d+)*(?:(?:a|b|rc)\d+)?(?:\.post\d+)?(?:\.dev\d+)?"
        r"(?:\+[0-9A-Za-z.]+)?",
        raw_version,
    )
    if len(parts) == 1 and release_like:
        return raw_version  # already a usable version

    # Unexpected format: keep the raw text as local version metadata
    return f"0.1.0+{raw_version}"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def project_version():
    """
    Return the project version, taken from git or from the VERSION file.

    ``git describe --tags --always`` is tried first. If git is missing, the
    command fails, or it yields an empty version, the VERSION file next to
    this script is read instead. Both sources go through ``format_version``.

    Raises:
        RuntimeError: if neither source produces a non-empty version.
    """
    version = None

    # First choice: ask git
    try:
        output = subprocess.check_output(
            ["git", "describe", "--tags", "--always"],
            # DEVNULL instead of open(os.devnull): no file handle to leak
            stderr=subprocess.DEVNULL,
        ).strip().decode()
        version = format_version(output)
    except (FileNotFoundError, subprocess.CalledProcessError) as e:
        print(f"Warning: Unable to retrieve version from git: {e}")

    # Fallback: read the VERSION file
    if not version and os.path.exists(VERSION):
        with open(VERSION) as verfile:
            raw_version = verfile.read().strip()
        version = format_version(raw_version)

    # Still nothing: give up
    if not version:
        raise RuntimeError("Cannot detect project version")

    return version
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class sdist(sdist_orig):
    """``sdist`` command that writes the resolved version into VERSION first."""

    def run(self):
        """Write ``project_version()`` to the VERSION file, then run the stock sdist."""
        resolved = project_version()
        with open(VERSION, "w") as version_file:
            version_file.write(resolved)
        super().run()
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
if __name__ == "__main__":
    setup_kwargs = {
        "version": project_version(),  # version from git or the VERSION file
        "cmdclass": {"sdist": sdist},  # use the custom sdist command
    }
    setup(**setup_kwargs)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: connector-for-bb
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: connector
|
|
5
|
+
Home-page: https://github.com/your_username/your_repository
|
|
6
|
+
Author: Ivan
|
|
7
|
+
Author-email: example@example.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Platform: UNKNOWN
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Requires-Python: >=3.10
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
Provides-Extra: develop
|
|
16
|
+
|
|
17
|
+
Dump project
|
|
18
|
+
|
|
19
|
+
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
MANIFEST.in
|
|
2
|
+
README.md
|
|
3
|
+
VERSION
|
|
4
|
+
setup.cfg
|
|
5
|
+
setup.py
|
|
6
|
+
src/connector_for_bb.egg-info/PKG-INFO
|
|
7
|
+
src/connector_for_bb.egg-info/SOURCES.txt
|
|
8
|
+
src/connector_for_bb.egg-info/dependency_links.txt
|
|
9
|
+
src/connector_for_bb.egg-info/requires.txt
|
|
10
|
+
src/connector_for_bb.egg-info/top_level.txt
|
|
11
|
+
src/dump/__init__.py
|
|
12
|
+
src/dump/config_utils.py
|
|
13
|
+
src/dump/click_house/__init__.py
|
|
14
|
+
src/dump/click_house/connectors.py
|
|
15
|
+
src/dump/files/__init__.py
|
|
16
|
+
src/dump/files/tmp_util.py
|
|
17
|
+
src/dump/postgres/__init__.py
|
|
18
|
+
src/dump/postgres/connectors.py
|
|
19
|
+
src/dump/postgres/manipulators.py
|
|
20
|
+
src/dump/postgres/table.py
|
|
21
|
+
src/dump/postgres/types_util.py
|
|
22
|
+
src/dump/s3/__init__.py
|
|
23
|
+
src/dump/s3/connectors.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
boto3==1.35.30
|
|
2
|
+
catboost==1.2.7
|
|
3
|
+
clickhouse-connect==0.8.6
|
|
4
|
+
numpy==1.26.0
|
|
5
|
+
pandas==2.2.2
|
|
6
|
+
psycopg2-binary==2.9.9
|
|
7
|
+
scikit-learn==1.5.2
|
|
8
|
+
tqdm==4.66.5
|
|
9
|
+
|
|
10
|
+
[develop]
|
|
11
|
+
black==24.8.0
|
|
12
|
+
coverage==5.5
|
|
13
|
+
flake8==6.0.0
|
|
14
|
+
hypothesis==6.9.2
|
|
15
|
+
isort==5.8.0
|
|
16
|
+
mypy==0.812
|
|
17
|
+
pylint==2.17.4
|
|
18
|
+
pytest==7.3.1
|
|
19
|
+
responses==0.13.2
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
dump
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import clickhouse_connect
from dump.config_utils import load_config
from pandas import DataFrame as PandasDataFrame


class ConnectorCH:
    """
    Lazily created ClickHouse client configured from a config-file section.

    The section named ``db_config_name`` is loaded with ``load_config`` when
    the object is constructed; its key/value pairs are passed as keyword
    arguments to ``clickhouse_connect.get_client`` on first use of ``client``.
    """

    def __init__(
        self,
        db_config_name: str = "click_house",
    ) -> None:
        self.db_config_name = db_config_name
        self.__config = load_config(section=self.db_config_name)

        self._client = None

    @property
    def client(self):
        """
        Return the cached client, creating it on first access.

        A connection failure is printed rather than raised, so this returns
        ``None`` while no client could be created; the next access retries.
        """
        if self._client is None:
            try:
                self._client = clickhouse_connect.get_client(**self.__config)
            except Exception as error:
                print(error)
        return self._client


class TableCH(ConnectorCH):
    """ClickHouse connector that returns query results as DataFrames."""

    def __init__(self, db_config_name: str = "click_house") -> None:
        super().__init__(db_config_name)

    def get_df(self, query: str) -> PandasDataFrame:
        """Run ``query`` through the client's ``query_df`` and return the result."""
        return self.client.query_df(query)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from configparser import ConfigParser
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def load_config(filename="database.ini", section="postgresql"):
    """
    Read one section of an INI file into a plain dict.

    Args:
        filename: path of the INI file to read.
        section: name of the section to return.

    Returns:
        dict mapping every option in ``section`` to its string value.

    Raises:
        Exception: if the file cannot be read, or the section is missing.
    """
    parser = ConfigParser()

    # ConfigParser.read() skips unreadable files silently and returns the list
    # of files it parsed; report that case instead of "Section not found".
    if not parser.read(filename):
        raise Exception(
            "Config file {0} not found or unreadable".format(filename)
        )

    if not parser.has_section(section):
        raise Exception(
            "Section {0} not found in the {1} file".format(section, filename)
        )

    return dict(parser.items(section))
|
|
File without changes
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from tempfile import TemporaryDirectory
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class TemporaryFileSystem:
    """
    Temporary directory helper used by the data manipulators.

    Wraps a ``tempfile.TemporaryDirectory`` and hands out sequential
    ``data_<n>.csv`` paths inside it.
    """

    def __init__(self) -> None:
        self._tmp_dir = TemporaryDirectory()
        # Declared only; it is assigned by the first save_path() call.
        self._last_path: str

    @property
    def tmp_dir(self) -> str:
        """Path of the temporary directory."""
        return self._tmp_dir.name

    @property
    def files(self) -> list:
        """Names of the entries currently in the temporary directory."""
        return os.listdir(str(self.tmp_dir))

    @property
    def next_filename(self) -> str:
        """
        Name for the next data file: ``data_<max existing number + 1>.csv``.

        The number of an existing file is the integer after the last ``_``
        in the part of its name before the first ``.``. Files whose name does
        not end in such a number are ignored.
        """
        taken = []
        for name in self.files:
            stem = name.split(".")[0]
            try:
                taken.append(int(stem.split("_")[-1]))
            except ValueError:
                # not one of our numbered files; it must not block new names
                continue

        number = max(taken) + 1 if taken else 0
        return f"data_{number}.csv"

    def save_path(self) -> str:
        """Return (and remember as ``_last_path``) the full path for the next file."""
        self._last_path = os.path.join(str(self.tmp_dir), self.next_filename)
        return self._last_path

    @property
    def dir_info(self) -> None:
        """
        Print each file in the temporary directory with its size in mb.
        """
        for file in self.files:
            check_file = os.path.join(self.tmp_dir, file)
            size = os.path.getsize(check_file) / 1024**2
            print(f"{check_file}: {size} mb")
|
|
File without changes
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
import psycopg2
|
|
2
|
+
from dump.config_utils import load_config
|
|
3
|
+
from psycopg2 import connect
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Maps information_schema ``data_type`` names to the dtype names used when
# casting DataFrame columns. Types missing here fall back to "str" in
# DBUtilsPG.get_table_schema.
TYPE_MAPPER = {
    "bigint": "Int64",
    "character varying": "str",
    "integer": "Int64",
    "double precision": "float",
    "timestamp without time zone": "timestamp",
    # "timestamp with time zone": "timestamp",
    "USER-DEFINED": "str",
    "date": "timestamp",
    "character": "str",
    "boolean": "bool",
    # PostgreSQL "real" is a 4-byte float, so it maps to float, not Int64
    "real": "float",
    "text": "str",
}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ConnectorPG:
    """
    Lazily connected PostgreSQL helper built on psycopg2.

    Connection parameters come from the ``db_config_name`` section loaded by
    ``load_config``. The connection and the cursor are created on first use
    and cached until ``close`` is accessed.

    Note that ``close`` and ``commit`` are properties with side effects:
    they act when the attribute is read (``obj.close``, no parentheses).
    """

    def __init__(
        self,
        db_config_name: str = "postgres",
    ) -> None:
        self.db_config_name = db_config_name
        self.__config = load_config(section=self.db_config_name)

        self._conn = None
        self._cursor = None

    @property
    def conn(self):
        """
        Return the cached connection, opening it on first access.

        A connection failure is printed rather than raised, so this returns
        ``None`` while no connection could be opened.
        """
        if self._conn is None:
            try:
                # No ``with`` block: psycopg2's connection context manager only
                # ends a transaction, it never closes the connection.
                self._conn = psycopg2.connect(**self.__config)
            except Exception as error:
                print(error)
        return self._conn

    @property
    def cursor(self):
        """Return the cached cursor, creating it from ``conn`` on first access."""
        if self._cursor is None:
            self._cursor = self.conn.cursor()
        return self._cursor

    @property
    def close(self):
        """Close the connection, if any, and forget the cached connection and cursor."""
        if self._conn is not None:  # nothing to close before the first connect
            self._conn.close()

        self._conn = None
        self._cursor = None

    @property
    def commit(self):
        """Commit the current transaction on the connection."""
        self.conn.commit()

    def execute(self, query: str):
        """Execute ``query`` on the cached cursor and commit."""
        self.cursor.execute(query)
        self.conn.commit()

    def fetchall(self, queru: str) -> list:
        """Execute the query, commit, and return all result rows."""
        self.execute(queru)
        values = self.cursor.fetchall()
        self.conn.commit()

        return values
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class DBUtilsPG(ConnectorPG):
    """Metadata queries against ``information_schema``."""

    @staticmethod
    def _validate_output(inp: list):
        """Return the first element of every row in ``inp``."""
        return [x[0] for x in inp]

    def _fetchall_params(self, query: str, params: tuple) -> list:
        """Run ``query`` with bound ``params``, commit, and return all rows."""
        self.cursor.execute(query, params)
        rows = self.cursor.fetchall()
        self.conn.commit()
        return rows

    @property
    def _schemas(self) -> list:
        """Names of all schemas listed in ``information_schema.schemata``."""
        query = """
        SELECT schema_name
        FROM information_schema.schemata;
        """
        return self._validate_output(self.fetchall(query))

    def get_tables_in_schema(self, schema: str) -> list:
        """Return the names of the tables in ``schema``."""
        # Values are bound by the driver rather than formatted into the SQL,
        # so names containing quotes cannot break or alter the query.
        query = """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = %s
        """
        return self._validate_output(self._fetchall_params(query, (schema,)))

    def get_table_schema(self, table_name: str, db_schema: str) -> dict:
        """
        Describe the columns of ``db_schema.table_name``.

        Returns:
            dict keyed by column name; each value holds ``data_type`` (mapped
            through ``TYPE_MAPPER``, defaulting to ``"str"``), ``is_nullable``
            and ``position`` (the column's ``ordinal_position``).
        """
        query = """
        SELECT column_name, data_type, is_nullable, ordinal_position
        FROM information_schema.columns
        WHERE table_name = %s and
        table_schema = %s;
        """
        rows = self._fetchall_params(query, (table_name, db_schema))

        schema = {}
        for column_name, data_type, is_nullable, position in rows:
            schema[column_name] = {
                "data_type": TYPE_MAPPER.get(data_type, "str"),
                "is_nullable": is_nullable,
                "position": position,
            }
        return schema

    def get_schema_by_table(self, table_name: str) -> str:
        """
        Return the schema of the first table named ``table_name``.

        Raises:
            IndexError: if no table with that name is found.
        """
        query = """
        SELECT table_schema
        FROM information_schema.tables
        WHERE table_name = %s;
        """
        schemas = self._validate_output(
            self._fetchall_params(query, (table_name,))
        )
        if not schemas:
            # same exception type as the former schema[0] lookup, with a message
            raise IndexError(
                f"Table {table_name} not found in information_schema.tables"
            )
        return schemas[0]
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class SequenceUtils:
    """
    Accessors for one PostgreSQL sequence, queried through its own ConnectorPG.

    Both accessors are properties; reading ``next_sequence`` runs ``nextval``.
    """

    def __init__(
        self,
        sequence_name: str = "seq_general_job_id",
        sequence_schema: str = "public",
        db_config_name: str = "postgres",
    ) -> None:
        self.connector = ConnectorPG(db_config_name)

        self.sequence_schema = sequence_schema
        self.sequence_name = sequence_name

    def _first_value(self, query: str):
        """Return the first column of the first row produced by ``query``."""
        rows = self.connector.fetchall(query)
        return rows[0][0]

    @property
    def next_sequence(self):
        """Result of ``nextval`` on the sequence."""
        return self._first_value(
            f"""
        SELECT nextval('{self.sequence_schema}.{self.sequence_name}');
        """
        )

    @property
    def current_sequence(self):
        """The sequence's ``last_value``."""
        return self._first_value(
            f"""
        SELECT last_value FROM {self.sequence_schema}.{self.sequence_name}
        """
        )
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# class TemporaryTablePG(ConnectorPG):
|
|
153
|
+
# def __init__(self, name: str, db_config_name: str = "postgres") -> None:
|
|
154
|
+
# super().__init__(db_config_name)
|
|
155
|
+
# self.name = name
|
|
156
|
+
# self.data_manipulator = DataManupulationPG(db_config_name=self.db_config_name)
|
|
157
|
+
|
|
158
|
+
# # mapping dtypes to PG types
|
|
159
|
+
# self._KIND_MAPPER = {
|
|
160
|
+
# "M": "timestamp",
|
|
161
|
+
# "f": "real",
|
|
162
|
+
# "i": "bigint",
|
|
163
|
+
# }
|
|
164
|
+
|
|
165
|
+
# def _pd_types_to_pg_types(self, df: PandasDataFrame) -> dict:
|
|
166
|
+
# df_types = df.dtypes.to_dict()
|
|
167
|
+
# some = {
|
|
168
|
+
# col: self._KIND_MAPPER.get(col_type.kind, "varchar")
|
|
169
|
+
# for col, col_type in df_types.items()
|
|
170
|
+
# }
|
|
171
|
+
# return some
|
|
172
|
+
|
|
173
|
+
# def _types_to_str_statments(self, types: dict) -> str:
|
|
174
|
+
# types_statments = ", \n ".join([" ".join(x) for x in types.items()])
|
|
175
|
+
# return types_statments
|
|
176
|
+
|
|
177
|
+
# def create_table(self, df: PandasDataFrame):
|
|
178
|
+
# types = self._pd_types_to_pg_types(df)
|
|
179
|
+
# types_statments = self._types_to_str_statments(types)
|
|
180
|
+
# query = f"""
|
|
181
|
+
# CREATE TEMP TABLE {self.name}(
|
|
182
|
+
# {types_statments}
|
|
183
|
+
# );
|
|
184
|
+
# """
|
|
185
|
+
# self.execute(query)
|
|
186
|
+
# self.commit
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
from typing import Dict, Optional
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from dump.files.tmp_util import TemporaryFileSystem
|
|
5
|
+
from pandas import DataFrame as PandasDataFrame
|
|
6
|
+
|
|
7
|
+
from .connectors import ConnectorPG, DBUtilsPG
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class GetDataManipulatorPG(ConnectorPG):
    """
    Export query results from PostgreSQL into CSV files via ``COPY ... TO STDOUT``.

    Each distinct query text is exported once per instance; later requests
    for the same query return the path of the file written the first time.
    """

    def __init__(
        self,
        encoding="utf-16",
        size: int = 262144,
        db_config_name: str = "postgres",
    ) -> None:
        super().__init__(db_config_name=db_config_name)
        self.encoding = encoding
        self.size = size

        # query text -> path of the CSV it was exported to, so the same
        # query is not executed more than once
        self.__query_history: Dict[str, str] = {}
        self.file_system = TemporaryFileSystem()

    def get_data(self, query: str, path: Optional[str] = None) -> str:
        """
        Export the result of ``query`` as CSV (with header) and return the file path.

        Args:
            query: SELECT statement to export.
            path: destination file; a new file in the temporary directory is
                used when omitted. Ignored if ``query`` was already exported
                by this instance.

        Returns:
            Path of the CSV file holding the result of ``query``.
        """
        to_execute = f"""
        copy ( {query} )
        to stdout
        WITH
        CSV HEADER
        """

        if query not in self.__query_history:
            if path is None:
                path = self.file_system.save_path()

            try:
                with open(path, "w", encoding=self.encoding) as fp:
                    self.cursor.copy_expert(to_execute, fp, size=self.size)
            finally:
                # ``close`` is a property: reading it closes the connection.
                # Guarded because it cannot handle a connection that never opened.
                if self._conn is not None:
                    self.close

            # Remember the file actually written, which is ``path`` even when
            # the caller supplied it.
            self.__query_history[query] = path

        return self.__query_history[query]

    def get_df(self, query: str) -> PandasDataFrame:
        """Export ``query`` with ``get_data`` and load the CSV into a DataFrame."""
        path = self.get_data(query)
        return pd.read_csv(path, encoding=self.encoding, low_memory=False)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class SaveDataManipulatorPG(ConnectorPG):
    """
    Load DataFrames / CSV files into one PostgreSQL table via ``COPY ... FROM STDIN``.

    When ``db_schema`` is not given it is looked up from the table name with
    ``DBUtilsPG.get_schema_by_table``.
    """

    def __init__(
        self,
        table_name: str,
        db_schema: Optional[str] = None,
        encoding="utf-16",
        size: int = 262144,
        db_config_name: str = "postgres",
    ) -> None:
        super().__init__(db_config_name=db_config_name)
        self.table_name = table_name
        self.db_schema = db_schema
        self.encoding = encoding
        self.size = size

        self.file_system = TemporaryFileSystem()

        self.db_utils = DBUtilsPG(db_config_name)
        if self.db_schema is None:
            self.db_schema = self.db_utils.get_schema_by_table(self.table_name)

    @staticmethod
    def __get_columns_right_order(table_schema: Dict[str, dict]) -> list:
        """
        Return the column names sorted by their position in the destination table.
        """
        column_positions = {
            column: meta_info["position"] for column, meta_info in table_schema.items()
        }
        return sorted(column_positions, key=column_positions.get)

    def _validate_df_to_save(self, df) -> PandasDataFrame:
        """
        Check that the DataFrame has every column of the destination table.

        Returns:
            ``df`` restricted to the table's columns, in table order.

        Raises:
            Exception: if any table column is missing from ``df``.
        """
        table_schema = self.db_utils.get_table_schema(self.table_name, self.db_schema)
        columns_intersection = set(table_schema).intersection(df.columns)
        columns_not_found = set(table_schema) - columns_intersection

        if len(columns_not_found) != 0:
            raise Exception(f"Columns not found: {columns_not_found}")

        column_to_write = self.__get_columns_right_order(table_schema)
        return df[column_to_write]

    def init_df(self, df: PandasDataFrame) -> str:
        """
        Validate df against the pg table and save it as csv in the temporary directory.

        Returns:
            Path of the written CSV file.
        """
        df = self._validate_df_to_save(df)

        path = self.file_system.save_path()
        df.to_csv(path, encoding=self.encoding, index=False)
        return path

    def save_data(
        self,
        path: str,
    ):
        """
        Copy the CSV file at ``path`` into ``self.db_schema``.``self.table_name``.

        path: str - path to the file which should be copied into PG

        Use in combination with ``self.init_df``, or pass a path manually.
        The connection is committed on success and closed in either case.
        """

        to_execute = f"""
        copy {self.db_schema}.{self.table_name}
        from STDIN
        WITH
        CSV header
        """

        with open(path, "r", encoding=self.encoding) as fp:
            try:
                self.cursor.copy_expert(to_execute, fp, size=self.size)
                # ``commit`` / ``close`` are properties: reading them acts.
                self.commit
            finally:
                # Release the connection even if the COPY failed. Guarded
                # because ``close`` cannot handle a connection that never opened.
                if self._conn is not None:
                    self.close

    def save_df(self, df: PandasDataFrame):
        """Validate ``df``, write it to a temporary CSV and copy it into the table."""
        path = self.init_df(df)
        self.save_data(path)
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from pandas import DataFrame as PandasDataFrame
|
|
5
|
+
|
|
6
|
+
from .connectors import ConnectorPG, DBUtilsPG, SequenceUtils
|
|
7
|
+
from .manipulators import GetDataManipulatorPG, SaveDataManipulatorPG
|
|
8
|
+
from .types_util import cast_df_by_schema
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Technical columns dropped from query results when ``clear_tech_cols`` is set
TECH_COLUMNS = {"_inserted_dttm", "_job_id"}


class TablePG(ConnectorPG):
    """
    Small query builder and reader for a single PostgreSQL table.

    ``select`` / ``filter`` set the column list and the WHERE predicates,
    ``query`` renders the SQL text, and ``get_df`` runs it through a
    ``GetDataManipulatorPG`` and returns a DataFrame.
    """

    def __init__(
        self,
        table_name: str,
        db_schema: Optional[str] = None,
        limit: Optional[int] = None,
        auto_cast: bool = True,
        clear_tech_cols: bool = True,
        db_config_name: str = "postgres",
    ) -> None:
        super().__init__(db_config_name=db_config_name)
        self.table_name = table_name
        self.db_schema = db_schema
        self._limit = limit
        self.auto_cast = auto_cast
        self.clear_tech_cols = clear_tech_cols

        self.data_manipulator = GetDataManipulatorPG(db_config_name=self.db_config_name)
        self.db_utils = DBUtilsPG(db_config_name=db_config_name)

        if self.db_schema is None:
            self.db_schema = self.db_utils.get_schema_by_table(self.table_name)

        if self.auto_cast:
            self.table_schema = self.db_utils.get_table_schema(
                self.table_name,
                self.db_schema,
            )

        # Lists rather than sets: duplicates are still removed, but the order
        # given by the caller is kept, so the generated SQL is deterministic.
        self._filters_args: list = []
        self._select_args: list = []

    def filter(self, *args, operation: str = "and"):
        """
        Replace the current predicates with ``args``, each joined by ``operation``.

        Returns ``self`` so calls can be chained.
        """
        self._filters_args = list(
            dict.fromkeys(f" {operation} {arg}" for arg in args)
        )
        return self

    @property
    def _filters(self) -> str:
        """WHERE clause: ``WHERE 1=1`` followed by the stored predicates."""
        filters = "WHERE 1=1"
        filters += " ".join(self._filters_args)
        return filters

    def select(self, *args):
        """
        Replace the current column list with ``args`` (duplicates removed).

        Returns ``self`` so calls can be chained.
        """
        self._select_args = list(dict.fromkeys(args))
        return self

    @property
    def _select(self) -> str:
        """SELECT clause: the stored columns, or ``*`` when none are set."""
        select = "select "
        if len(self._select_args) == 0:
            select += "*"
        select += ", ".join(self._select_args)
        return select

    @property
    def query(self) -> str:
        """SQL text built from the select list, table, predicates and limit."""
        limit = ""
        if self._limit is not None:
            limit = f"LIMIT {self._limit}"

        query = f"""
        {self._select}
        FROM {self.db_schema}.{self.table_name}
        {self._filters}
        {limit}
        """
        return query

    @property
    def _clear_statments(self):
        """Reset the select list and predicates; returns ``self``."""
        self._filters_args = []
        self._select_args = []
        return self

    def get_df(self, query: Optional[str] = None) -> PandasDataFrame:
        """
        Run ``query`` (default: ``self.query``) and return the result.

        With ``auto_cast`` the columns are cast using the table schema; with
        ``clear_tech_cols`` the columns listed in ``TECH_COLUMNS`` are dropped.
        """
        if query is None:
            query = self.query

        df = self.data_manipulator.get_df(query)

        if self.auto_cast:
            df = cast_df_by_schema(df, self.table_schema)

        if self.clear_tech_cols:
            to_drop = TECH_COLUMNS.intersection(df.columns)
            df = df.drop(columns=to_drop)
        return df

    def count(self) -> "pd.Series":
        """Return the ``cnt`` column of ``select count(*)`` over the current query."""
        query = f"""select count(*) as cnt from ({self.query}) s"""
        return self.get_df(query)["cnt"]
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class JobIDSequence(SequenceUtils):
    """``SequenceUtils`` preset for the ``meta.seq_general_job_id`` sequence."""

    def __init__(
        self,
        db_config_name: str = "postgres",
    ):
        preset = {
            "sequence_name": "seq_general_job_id",
            "sequence_schema": "meta",
        }
        super().__init__(db_config_name=db_config_name, **preset)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class SaveTablePG:
    """
    Save DataFrames into a PostgreSQL table, adding technical columns first.

    Before saving, the insert-timestamp column is set to the current time
    and, if the table has the job-id column, it is filled with the next
    value of the sequence provided by ``sequence_util``.
    """

    def __init__(
        self,
        table_name: str,
        db_schema: Optional[str] = None,
        insert_dttm_col_name: str = "_inserted_dttm",
        job_id_col_name: str = "_job_id",
        auto_cast: bool = True,
        size: int = 131072,
        db_config_name: str = "postgres",
        sequence_util: type[SequenceUtils] = JobIDSequence,
    ) -> None:
        self.table_name = table_name
        self.db_schema = db_schema
        self.insert_dttm_col_name = insert_dttm_col_name
        self.job_id_col_name = job_id_col_name
        self.auto_cast = auto_cast
        self.db_config_name = db_config_name

        self.data_manipulator = SaveDataManipulatorPG(
            size=size,
            table_name=table_name,
            db_schema=db_schema,
            db_config_name=self.db_config_name,
        )
        # ``sequence_util`` is a class; it is instantiated here
        self.sequence_utils = sequence_util(db_config_name=self.db_config_name)
        self._job_id = None

    def _create_tech_cols(self, df: PandasDataFrame) -> PandasDataFrame:
        """
        Return a frame with the technical columns added (and cast if ``auto_cast``).

        The caller's DataFrame is left without the added columns.
        """
        table_schema = self.data_manipulator.db_utils.get_table_schema(
            self.table_name, db_schema=self.db_schema
        )

        # Shallow copy: new columns are attached to the copy only, and the
        # underlying data is not duplicated.
        df = df.copy(deep=False)
        df[self.insert_dttm_col_name] = pd.Timestamp.now()

        if self.job_id_col_name in table_schema:
            # reading ``next_sequence`` advances the sequence
            self._job_id = self.sequence_utils.next_sequence
            df[self.job_id_col_name] = self._job_id
            print("current sequence in that job", self._job_id)

        if self.auto_cast:
            df = cast_df_by_schema(df, table_schema, cast_timestamp=False)
        return df

    def save(self, df: PandasDataFrame) -> Optional[int]:
        """
        Add technical columns to ``df`` and copy it into the table.

        Returns:
            The job id most recently drawn from the sequence by this object,
            or ``None`` if none has been drawn.
        """
        df = self._create_tech_cols(df)
        self.data_manipulator.save_df(df)
        return self._job_id
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from pandas import DataFrame as PandasDataFrame
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
BOOL_MAPPER = {"f": False, "t": True}


def _map_bool_value(value):
    """Translate one cell to a bool; values that are not recognised become NaN."""
    if isinstance(value, (bool, np.bool_)):
        # Already a boolean: keep it instead of losing it in the "t"/"f" lookup.
        return bool(value)
    return BOOL_MAPPER.get(value, np.nan)


def cast_df_by_schema(
    df: PandasDataFrame, schema: Dict[str, Dict[str, str]], cast_timestamp: bool = True
) -> PandasDataFrame:
    """
    Cast the columns of ``df`` to the types described in ``schema``.

    Only columns present in ``schema`` are processed; the target type is read
    from ``schema[column]["data_type"]``. Casting is best effort: a column that
    fails to cast is reported with ``print`` and left as it is.

    * ``"timestamp"`` columns are converted with ``pd.to_datetime`` (invalid
      values are coerced) only when ``cast_timestamp`` is true, otherwise skipped.
    * ``"bool"`` columns are mapped through ``BOOL_MAPPER`` ("t"/"f"); existing
      booleans are kept. If some values cannot be mapped they stay NaN and the
      column is not cast to plain ``bool``, because that cast turns NaN into True.
    * any other type is passed to ``DataFrame.astype``.

    Finally the string values "nan" and "None" are replaced with NaN.

    Note: timestamp and bool columns are assigned on the passed frame, so the
    input ``df`` may be modified in place.

    :param df: frame to cast
    :param schema: mapping of column name to its description (``data_type`` key is used)
    :param cast_timestamp: whether ``"timestamp"`` columns should be converted
    :return: frame with cast columns
    """
    for column in df.columns:
        if column not in schema:
            continue

        cast_to = schema[column]["data_type"]
        try:
            if cast_to == "timestamp":
                if cast_timestamp:
                    df[column] = pd.to_datetime(df[column], errors="coerce")
                continue

            if cast_to == "bool":
                mapped = df[column].apply(_map_bool_value)
                df[column] = mapped
                if mapped.isna().any():
                    # NaN is truthy, a plain bool cast would store True for
                    # missing values, so the column is left with NaN instead.
                    continue

            df = df.astype({column: cast_to})
        except Exception as e:
            print(f"unable to cast column: {column} to {cast_to} error: {e}")

    return df.replace("nan", np.nan).replace("None", np.nan)
|
|
File without changes
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Dict, Optional
|
|
3
|
+
|
|
4
|
+
import boto3
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from botocore.config import Config
|
|
7
|
+
from dump.config_utils import load_config
|
|
8
|
+
from dump.files.tmp_util import TemporaryFileSystem
|
|
9
|
+
from pandas import DataFrame as PandasDataFrame
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class S3Client:
    """
    Base class that creates a boto3 S3 client from a section of a config file.
    """

    def __init__(
        self, creds_section: str, config_filiname: str = "s3_config.ini"
    ) -> None:
        """
        :param creds_section: section of the config file to load
        :param config_filiname: name of the config file (the parameter name is
            kept as is so that keyword callers keep working)
        """
        self.creds_section = creds_section

        # The loaded section is unpacked as keyword arguments of boto3.client.
        self.config = load_config(filename=config_filiname, section=creds_section)
        self.conn_config = Config(
            s3={
                "addressing_style": "virtual",
            },
            retries={"max_attempts": 10, "mode": "standard"},
            region_name="us-east-1",
        )

        self.client = boto3.client(
            "s3",
            **self.config,
            config=self.conn_config,
        )

    def get_files_info(
        self, bucket_name: str, object_folder: Optional[str] = ""
    ) -> Dict[str, Dict[str, float]]:
        """
        Collect sizes of the objects whose key starts with ``object_folder``.

        All result pages are read: a single ``list_objects_v2`` call returns
        a limited number of keys, so the continuation token is followed while
        the response is marked as truncated.

        :param bucket_name: bucket to list
        :param object_folder: key prefix; ``None`` is treated as an empty prefix
        :return: mapping ``key -> {"size_bytes": ..., "size_gb": ...}``
        """
        files_dict = {}
        request = {"Bucket": bucket_name, "Prefix": object_folder or ""}

        while True:
            response = self.client.list_objects_v2(**request)
            for obj in response.get("Contents", []):
                size = obj["Size"]
                files_dict[obj["Key"]] = {
                    "size_bytes": size,
                    "size_gb": size / 1024**3,
                }

            next_token = response.get("NextContinuationToken")
            if not response.get("IsTruncated") or not next_token:
                break
            request["ContinuationToken"] = next_token

        return files_dict
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class S3Upload(S3Client):
    """
    S3 client bound to one bucket and folder, used for uploading local files.
    """

    def __init__(
        self,
        bucket_name: str,
        object_folder: str = "",
        creds_section: str = "test",
    ) -> None:
        """
        :param bucket_name: bucket the files are uploaded to
        :param object_folder: key prefix (folder) inside the bucket
        :param creds_section: config section with the credentials
        """
        super().__init__(creds_section)
        self.bucket_name = bucket_name
        self.object_folder = object_folder

    def _get_path(self, file_name) -> str:
        """
        creating path where to save file in s3
        """
        # S3 keys always use "/" as a separator; os.path.join would produce
        # backslashes on Windows, so the POSIX flavour is used explicitly.
        import posixpath

        return posixpath.join(self.object_folder, file_name)

    def upload_to_s3(self, tmp_path_file, file_name: str):
        """
        upload local file into s3 bucket

        :param tmp_path_file: path to the local file
        :param file_name: name of the object inside ``self.object_folder``
        """
        path = self._get_path(file_name)
        self.client.upload_file(tmp_path_file, self.bucket_name, path)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class S3Download(S3Client):
    """
    S3 client bound to one bucket and folder, used for downloading files.

    Paths of already downloaded files are remembered, so the same object is
    downloaded only once per instance.
    """

    def __init__(
        self,
        bucket_name: str,
        object_folder: str = "",
        creds_section: str = "test",
    ) -> None:
        """
        :param bucket_name: bucket the files are downloaded from
        :param object_folder: key prefix (folder) inside the bucket
        :param creds_section: config section with the credentials
        """
        super().__init__(creds_section)
        self.bucket_name = bucket_name
        self.object_folder = object_folder

        self.file_system = TemporaryFileSystem()

        # required in order not to download the same file more than once
        self.__downloads_history: Dict[str, str] = {}

    def _check_path_exist(self, path: str) -> bool:
        """
        Check that an object with the exact key ``path`` exists in the bucket.
        """
        # Listing with the key itself as the prefix avoids fetching the whole
        # folder just to test a single key.
        existing_files = self.get_files_info(self.bucket_name, path)
        return path in existing_files

    def _get_object_path(self, file_name) -> str:
        """
        Build the object key of ``file_name`` inside ``self.object_folder``.
        """
        # S3 keys always use "/" as a separator; os.path.join would produce
        # backslashes on Windows, so the POSIX flavour is used explicitly.
        import posixpath

        return posixpath.join(self.object_folder, file_name)

    def download_from_s3(self, download_file_name: str) -> str:
        """
        downloading file from s3 and put the file into temporary directory
        returns path to file in temporary directory

        :raises ValueError: when the object is not found in the bucket
        """
        object_path = self._get_object_path(download_file_name)
        if not self._check_path_exist(object_path):
            raise ValueError(f"The file {object_path} was not found")

        if object_path not in self.__downloads_history:
            save_path = self.file_system.save_path()
            self.client.download_file(self.bucket_name, object_path, save_path)
            self.__downloads_history[object_path] = save_path

        return self.__downloads_history[object_path]

    def download_from_s3_df(
        self, download_file_name: str, encoding="utf-8"
    ) -> PandasDataFrame:
        """
        Download the file and read it as csv into a pandas frame.

        :param download_file_name: name of the object inside ``self.object_folder``
        :param encoding: encoding passed to ``pd.read_csv``
        """
        df_path = self.download_from_s3(download_file_name)
        return pd.read_csv(df_path, encoding=encoding, low_memory=False)
|