connector-for-bb 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ global-exclude *.py[co]
2
+ global-exclude __pycache__
3
+ graft tests
4
+ include README.md
5
+ include VERSION
6
+ recursive-include src/dump *.py
7
+ recursive-include src/dump/click_house *.py
8
+ recursive-include src/dump/files *.py
9
+ recursive-include src/dump/postgres *.py
10
+ recursive-include src/dump/s3 *.py
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.1
2
+ Name: connector_for_bb
3
+ Version: 0.0.1
4
+ Summary: connector
5
+ Home-page: https://github.com/your_username/your_repository
6
+ Author: Ivan
7
+ Author-email: example@example.com
8
+ License: MIT
9
+ Platform: UNKNOWN
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Requires-Python: >=3.10
14
+ Description-Content-Type: text/markdown
15
+ Provides-Extra: develop
16
+
17
+ Dump project
18
+
19
+
@@ -0,0 +1 @@
1
+ Dump project
@@ -0,0 +1 @@
1
+ 0.0.1
@@ -0,0 +1,128 @@
1
+ [metadata]
2
+ name = connector_for_bb
3
+ version = 0.0.1
4
+ description = connector
5
+ long_description = file: README.md
6
+ long_description_content_type = text/markdown
7
+ author = Ivan
8
+ author_email = example@example.com
9
+ license = MIT
10
+ url = https://github.com/your_username/your_repository
11
+ classifiers =
12
+ Programming Language :: Python :: 3
13
+ License :: OSI Approved :: MIT License
14
+ Operating System :: OS Independent
15
+
16
+ [aliases]
17
+ test = pytest
18
+
19
+ [coverage:report]
20
+ exclude_lines =
21
+ @abc.abstractmethod
22
+ @abc.abstractproperty
23
+ CancelledError
24
+ NotImplementedError
25
+ pragma: no cover
26
+ __repr__
27
+ __str__
28
+ fail_under = 81
29
+ precision = 2
30
+ show_missing = True
31
+
32
+ [coverage:run]
33
+ branch = True
34
+ source =
35
+ src
36
+ tests
37
+
38
+ [flake8]
39
+ ignore = E203,W503
40
+ max-line-length = 95
41
+
42
+ [isort]
43
+ atomic = true
44
+ default_section = THIRDPARTY
45
+ force_grid_wrap = 0
46
+ include_trailing_comma = true
47
+ indent = ' '
48
+ known_first_party = dsp_dags
49
+ known_third_party =
50
+ line_length = 95
51
+ lines_after_imports = 2
52
+ multi_line_output = 3
53
+ not_skip = __init__.py
54
+ order_by_type = true
55
+ sections = FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER
56
+ use_parentheses = True
57
+
58
+ [mypy]
59
+ check_untyped_defs = True
60
+
61
+ [mypy-lightfm.*]
62
+ ignore_missing_imports = True
63
+
64
+ [mypy-numpy.*]
65
+ ignore_missing_imports = True
66
+
67
+ [mypy-pandas.*]
68
+ ignore_missing_imports = True
69
+
70
+ [mypy-pytest.*]
71
+ ignore_missing_imports = True
72
+
73
+ [mypy-scipy.*]
74
+ ignore_missing_imports = True
75
+
76
+ [mypy-sklearn.*]
77
+ ignore_missing_imports = True
78
+
79
+ [mypy-xgboost.*]
80
+ ignore_missing_imports = True
81
+
82
+ [mypy-psycopg2.*]
83
+ ignore_missing_imports = True
84
+
85
+ [mypy-clickhouse_connect.*]
86
+ ignore_missing_imports = True
87
+
88
+ [options]
89
+ include_package_data = True
90
+ install_requires =
91
+ tqdm==4.66.5
92
+ catboost==1.2.7
93
+ numpy==1.26.0
94
+ pandas==2.2.2
95
+ psycopg2-binary==2.9.9
96
+ boto3==1.35.30
97
+ scikit-learn==1.5.2
98
+ clickhouse-connect==0.8.6
99
+ package_dir =
100
+ = src
101
+ packages = find:
102
+ python_requires = >=3.10
103
+
104
+ [options.extras_require]
105
+ develop =
106
+ black==24.8.0
107
+ coverage==5.5
108
+ flake8==6.0.0
109
+ hypothesis==6.9.2
110
+ isort==5.8.0
111
+ mypy==0.812
112
+ pylint==2.17.4
113
+ pytest==7.3.1
114
+ responses==0.13.2
115
+
116
+ [options.packages.find]
117
+ where = src
118
+
119
+ [tool:pytest]
120
+ addopts =
121
+ --junitxml=junit.xml
122
+ --showlocals
123
+ --verbose
124
+
125
+ [egg_info]
126
+ tag_build =
127
+ tag_date = 0
128
+
@@ -0,0 +1,70 @@
1
+ import os
2
+ import subprocess
3
+ from setuptools import setup, find_packages
4
+ from setuptools.command.sdist import sdist as sdist_orig
5
+
6
# Absolute project root, so VERSION resolves correctly no matter which
# directory setup.py is invoked from (``__file__`` may be a bare relative name,
# in which case ``os.path.dirname`` alone would yield an empty string).
ROOT = os.path.dirname(os.path.abspath(__file__))

# Plain-text file holding the resolved version: read as a fallback when git is
# unavailable and rewritten by the custom ``sdist`` command.
VERSION = os.path.join(ROOT, "VERSION")
8
+
9
+
10
def format_version(raw_version):
    """
    Convert a raw version string into a PEP 440 compatible form.

    Accepted inputs:

    * a plain version (``"1.2.3"`` or ``"v1.2.3"``) - returned without the
      leading ``v``;
    * ``git describe`` output ``<tag>-<distance>-g<hash>`` - converted to
      ``<tag>.dev<distance>+g<hash>`` (the tag itself may contain hyphens);
    * anything else, including a bare abbreviated commit hash as printed by
      ``git describe --always`` when no tag exists - embedded as the local
      segment of the placeholder version ``0.1.0``.

    The fallback output contains no hyphens, so feeding the result back into
    this function (as happens when the VERSION file is re-read) returns it
    unchanged.
    """
    if raw_version.startswith("v"):  # drop the "v" prefix if present
        raw_version = raw_version[1:]

    def _fallback(text):
        # Local version segments may only hold ASCII alphanumerics and dots.
        cleaned = "".join(
            ch if (ch.isascii() and ch.isalnum()) else "." for ch in text
        )
        local = ".".join(piece for piece in cleaned.split(".") if piece)
        return f"0.1.0+{local or 'unknown'}"

    if "-" not in raw_version:
        # A hex-only token with at least one letter is a commit hash, not a
        # version; purely numeric strings are kept as versions.
        looks_like_hash = (
            7 <= len(raw_version) <= 40
            and set(raw_version) <= set("0123456789abcdef")
            and not raw_version.isdigit()
        )
        if looks_like_hash:
            return _fallback(raw_version)
        return raw_version  # already a usable version

    # Split from the right so that a tag such as "1.0.0-rc1" stays intact.
    parts = raw_version.rsplit("-", 2)
    if len(parts) == 3:
        base, distance, commit_hash = parts
        if base[:1].isdigit() and distance.isdigit() and commit_hash.isalnum():
            return f"{base.replace('-', '.')}.dev{distance}+{commit_hash}"

    # Unexpected format: keep the raw text as local version metadata.
    return _fallback(raw_version)
28
+
29
+
30
def project_version():
    """
    Return the project version, taken from git or from the VERSION file.

    ``git describe --tags --always`` is tried first; if git is missing or the
    command fails, a warning is printed and the VERSION file is read instead.
    Both sources are passed through ``format_version``.

    Raises:
        RuntimeError: if neither source yields a non-empty version.
    """
    version = None

    # Try git first.
    try:
        output = subprocess.check_output(
            ["git", "describe", "--tags", "--always"],
            # DEVNULL silences git's stderr without leaving an open file
            # object behind (the previous open(os.devnull) was never closed).
            stderr=subprocess.DEVNULL,
        )
        version = format_version(output.strip().decode())
    except (OSError, subprocess.CalledProcessError) as e:
        # OSError covers FileNotFoundError (git not installed) as well as
        # other launch failures such as missing execute permission.
        print(f"Warning: Unable to retrieve version from git: {e}")

    # Fall back to the VERSION file when git gave nothing.
    if not version and os.path.exists(VERSION):
        with open(VERSION) as verfile:
            version = format_version(verfile.read().strip())

    # Still nothing: refuse to build with an unknown version.
    if not version:
        raise RuntimeError("Cannot detect project version")

    return version
55
+
56
+
57
class sdist(sdist_orig):
    """``sdist`` command that records the resolved version before building."""

    def run(self):
        """Write the current project version to VERSION, then run the stock sdist."""
        resolved = project_version()
        with open(VERSION, "w") as handle:
            handle.write(resolved)
        super().run()
64
+
65
+
66
if __name__ == "__main__":
    setup_kwargs = {
        # Version resolved from git or from the VERSION file.
        "version": project_version(),
        # Route "sdist" through the custom command class.
        "cmdclass": {"sdist": sdist},
    }
    setup(**setup_kwargs)
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.1
2
+ Name: connector-for-bb
3
+ Version: 0.0.1
4
+ Summary: connector
5
+ Home-page: https://github.com/your_username/your_repository
6
+ Author: Ivan
7
+ Author-email: example@example.com
8
+ License: MIT
9
+ Platform: UNKNOWN
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Requires-Python: >=3.10
14
+ Description-Content-Type: text/markdown
15
+ Provides-Extra: develop
16
+
17
+ Dump project
18
+
19
+
@@ -0,0 +1,23 @@
1
+ MANIFEST.in
2
+ README.md
3
+ VERSION
4
+ setup.cfg
5
+ setup.py
6
+ src/connector_for_bb.egg-info/PKG-INFO
7
+ src/connector_for_bb.egg-info/SOURCES.txt
8
+ src/connector_for_bb.egg-info/dependency_links.txt
9
+ src/connector_for_bb.egg-info/requires.txt
10
+ src/connector_for_bb.egg-info/top_level.txt
11
+ src/dump/__init__.py
12
+ src/dump/config_utils.py
13
+ src/dump/click_house/__init__.py
14
+ src/dump/click_house/connectors.py
15
+ src/dump/files/__init__.py
16
+ src/dump/files/tmp_util.py
17
+ src/dump/postgres/__init__.py
18
+ src/dump/postgres/connectors.py
19
+ src/dump/postgres/manipulators.py
20
+ src/dump/postgres/table.py
21
+ src/dump/postgres/types_util.py
22
+ src/dump/s3/__init__.py
23
+ src/dump/s3/connectors.py
@@ -0,0 +1,19 @@
1
+ boto3==1.35.30
2
+ catboost==1.2.7
3
+ clickhouse-connect==0.8.6
4
+ numpy==1.26.0
5
+ pandas==2.2.2
6
+ psycopg2-binary==2.9.9
7
+ scikit-learn==1.5.2
8
+ tqdm==4.66.5
9
+
10
+ [develop]
11
+ black==24.8.0
12
+ coverage==5.5
13
+ flake8==6.0.0
14
+ hypothesis==6.9.2
15
+ isort==5.8.0
16
+ mypy==0.812
17
+ pylint==2.17.4
18
+ pytest==7.3.1
19
+ responses==0.13.2
File without changes
@@ -0,0 +1,32 @@
1
import clickhouse_connect
from pandas import DataFrame as PandasDataFrame

try:
    from dump.config_utils import load_config
except ImportError:  # legacy flat layout, kept as a fallback
    from config_util import load_config
4
+
5
+
6
class ConnectorCH:
    """Holds ClickHouse connection settings and creates the client lazily."""

    def __init__(self, db_config_name: str = "click_house") -> None:
        """Load the connection settings stored under section ``db_config_name``."""
        self.db_config_name = db_config_name
        self.__config = load_config(section=self.db_config_name)
        self._client = None

    @property
    def client(self):
        """
        Return the ClickHouse client, creating it on first access.

        If the client cannot be created the error is printed and ``None`` is
        returned; the next access tries again.
        """
        if self._client is not None:
            return self._client

        try:
            self._client = clickhouse_connect.get_client(**self.__config)
        except Exception as error:
            print(error)
        return self._client
25
+
26
+
27
class TableCH(ConnectorCH):
    """ClickHouse reader that returns query results as pandas DataFrames."""

    def __init__(self, db_config_name: str = "click_house") -> None:
        """Initialise the underlying connector with the given config section."""
        ConnectorCH.__init__(self, db_config_name)

    def get_df(self, query: str) -> PandasDataFrame:
        """Run ``query`` through the client's ``query_df`` and return its result."""
        ch_client = self.client
        return ch_client.query_df(query)
@@ -0,0 +1,19 @@
1
+ from configparser import ConfigParser
2
+
3
+
4
def load_config(filename="database.ini", section="postgresql"):
    """
    Read one section of an ini file and return it as a ``dict`` of strings.

    Args:
        filename: path of the ini file to parse.
        section: name of the section to return.

    Returns:
        Mapping of option name to value for ``section``.

    Raises:
        FileNotFoundError: the file is missing or could not be read.
        Exception: the file was read but does not contain ``section``.
    """
    parser = ConfigParser()

    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it actually parsed; an empty list means nothing was loaded.
    if not parser.read(filename):
        raise FileNotFoundError(
            "Config file {0} not found or not readable".format(filename)
        )

    if not parser.has_section(section):
        raise Exception(
            "Section {0} not found in the {1} file".format(section, filename)
        )

    return dict(parser.items(section))
File without changes
@@ -0,0 +1,44 @@
1
+ import os
2
+ from tempfile import TemporaryDirectory
3
+
4
+
5
class TemporaryFileSystem:
    """
    Temporary directory helper used by the data manipulators.

    Owns a ``tempfile.TemporaryDirectory`` and hands out sequential file
    paths of the form ``data_<N>.csv`` inside it.
    """

    def __init__(self) -> None:
        self._tmp_dir = TemporaryDirectory()
        # Declared only: the attribute does not exist until save_path() runs.
        self._last_path: str

    @property
    def tmp_dir(self) -> str:
        """Path of the temporary directory."""
        return self._tmp_dir.name

    @property
    def files(self) -> list:
        """Names of the entries currently present in the temporary directory."""
        return os.listdir(str(self.tmp_dir))

    @property
    def next_filename(self) -> str:
        """
        Next unused ``data_<N>.csv`` name.

        ``N`` is one more than the highest numeric suffix among existing
        entries, or 0 when there is none. Entries whose name does not end in
        ``_<number>`` before the first dot are ignored instead of raising.
        """
        used_numbers = []
        for name in self.files:
            suffix = name.split(".")[0].split("_")[-1]
            try:
                used_numbers.append(int(suffix))
            except ValueError:
                # Not one of our numbered files; it must not break naming.
                continue

        number = max(used_numbers) + 1 if used_numbers else 0
        return f"data_{number}.csv"

    def save_path(self) -> str:
        """
        Return a full path for the next file and remember it as the last path.

        The file itself is not created, so calling this twice without writing
        anything returns the same path.
        """
        self._last_path = os.path.join(str(self.tmp_dir), self.next_filename)
        return self._last_path

    @property
    def dir_info(self) -> None:
        """
        Print every entry of the temporary directory with its size in MB.
        """
        for file in self.files:
            check_file = os.path.join(self.tmp_dir, file)
            size = os.path.getsize(check_file) / 1024**2
            print(f"{check_file}: {size} mb")
File without changes
@@ -0,0 +1,186 @@
1
+ import psycopg2
2
+ from dump.config_utils import load_config
3
+ from psycopg2 import connect
4
+
5
+
6
# Maps PostgreSQL ``information_schema.columns.data_type`` values to the dtype
# names used when casting DataFrame columns. Types missing here are treated as
# "str" by DBUtilsPG.get_table_schema.
TYPE_MAPPER = {
    "bigint": "Int64",
    "character varying": "str",
    "integer": "Int64",
    "double precision": "float",
    "timestamp without time zone": "timestamp",
    # "timestamp with time zone": "timestamp",
    "USER-DEFINED": "str",
    "date": "timestamp",
    "character": "str",
    "boolean": "bool",
    # "real" is a 4-byte floating point type; mapping it to Int64 made the
    # cast fail for any fractional value.
    "real": "float",
    "text": "str",
}
20
+
21
+
22
class ConnectorPG:
    """
    Lazy PostgreSQL connection and cursor holder.

    Connection settings are loaded from the ini section ``db_config_name``.
    The connection and cursor are created on first use and dropped by
    ``close``. Note that ``close`` and ``commit`` are properties: merely
    accessing them performs the action.
    """

    def __init__(
        self,
        db_config_name: str = "postgres",
    ) -> None:
        self.db_config_name = db_config_name
        self.__config = load_config(section=self.db_config_name)

        self._conn = None
        self._cursor = None

    @property
    def conn(self):
        """
        Return the open connection, connecting on first access.

        A failed connection attempt is printed and ``None`` is returned.
        """
        if self._conn is None:
            try:
                # psycopg2's ``with connection`` block only scopes a
                # transaction and does not close the connection, so a plain
                # assignment expresses the intent more clearly.
                self._conn = psycopg2.connect(**self.__config)
            except Exception as error:
                print(error)
        return self._conn

    @property
    def cursor(self):
        """Return the cursor of the current connection, creating it if needed."""
        if self._cursor is None:
            self._cursor = self.conn.cursor()
        return self._cursor

    @property
    def close(self):
        """
        Close the connection (if any) and forget the connection and cursor.

        Safe to access when no connection was ever opened or after a previous
        close; earlier this raised AttributeError on ``None``.
        """
        if self._conn is not None:
            self._conn.close()

        self._conn = None
        self._cursor = None

    @property
    def commit(self):
        """Commit the current transaction."""
        self.conn.commit()

    def execute(self, query: str):
        """Execute ``query`` on the shared cursor and commit."""
        self.cursor.execute(query)
        self.conn.commit()

    def fetchall(self, queru: str) -> list:
        """Execute the query ``queru`` and return all result rows."""
        self.execute(queru)
        values = self.cursor.fetchall()
        self.conn.commit()

        return values
72
+
73
+
74
class DBUtilsPG(ConnectorPG):
    """
    Metadata helpers built on ``information_schema``.

    Schema and table names are passed to the driver as bound parameters
    instead of being formatted into the SQL text, so quotes or other special
    characters in a name cannot alter the statement.
    """

    @staticmethod
    def _validate_output(inp: list):
        """Return the first element of every row in ``inp``."""
        return [x[0] for x in inp]

    def _fetch_with_params(self, query: str, params: tuple) -> list:
        """Execute ``query`` with bound ``params`` and return all rows."""
        self.cursor.execute(query, params)
        rows = self.cursor.fetchall()
        self.conn.commit()
        return rows

    @property
    def _schemas(self) -> list:
        """Names of all schemas listed in ``information_schema.schemata``."""
        query = """
        SELECT schema_name
        FROM information_schema.schemata;
        """
        return self._validate_output(self.fetchall(query))

    def get_tables_in_schema(self, schema: str) -> list:
        """Return the names of the tables that belong to ``schema``."""
        query = """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = %s
        """
        return self._validate_output(self._fetch_with_params(query, (schema,)))

    def get_table_schema(self, table_name: str, db_schema: str) -> dict:
        """
        Describe the columns of ``db_schema.table_name``.

        Returns:
            ``{column_name: {"data_type", "is_nullable", "position"}}`` where
            ``data_type`` is looked up in ``TYPE_MAPPER`` (``"str"`` when the
            PostgreSQL type is not listed there). The dict is empty when no
            column matches.
        """
        query = """
        SELECT column_name, data_type, is_nullable, ordinal_position
        FROM information_schema.columns
        WHERE table_name = %s and
        table_schema = %s;
        """
        rows = self._fetch_with_params(query, (table_name, db_schema))

        return {
            column_name: {
                "data_type": TYPE_MAPPER.get(data_type, "str"),
                "is_nullable": is_nullable,
                "position": position,
            }
            for column_name, data_type, is_nullable, position in rows
        }

    def get_schema_by_table(self, table_name: str) -> str:
        """
        Return the schema that contains ``table_name``.

        When several schemas contain a table with that name, the first row
        returned by the database is used.

        Raises:
            IndexError: no table with that name exists.
        """
        query = """
        SELECT table_schema
        FROM information_schema.tables
        WHERE table_name = %s;
        """
        schemas = self._validate_output(
            self._fetch_with_params(query, (table_name,))
        )
        if not schemas:
            # Same exception type as the former bare ``schema[0]``, but with
            # a message that names the missing table.
            raise IndexError(f"Table {table_name} was not found in any schema")
        return schemas[0]
121
+
122
+
123
class SequenceUtils:
    """
    Reads values of a PostgreSQL sequence through its own ``ConnectorPG``.

    Both accessors are properties; reading ``next_sequence`` runs ``nextval``
    on every access.
    """

    def __init__(
        self,
        sequence_name: str = "seq_general_job_id",
        sequence_schema: str = "public",
        db_config_name: str = "postgres",
    ) -> None:
        self.sequence_name = sequence_name
        self.sequence_schema = sequence_schema

        self.connector = ConnectorPG(db_config_name)

    def _first_value(self, statement: str):
        """Run ``statement`` and return the first column of its first row."""
        rows = self.connector.fetchall(statement)
        return rows[0][0]

    @property
    def next_sequence(self):
        """Result of ``nextval`` for the configured sequence."""
        statement = f"""
        SELECT nextval('{self.sequence_schema}.{self.sequence_name}');
        """
        return self._first_value(statement)

    @property
    def current_sequence(self):
        """``last_value`` of the configured sequence."""
        statement = f"""
        SELECT last_value FROM {self.sequence_schema}.{self.sequence_name}
        """
        return self._first_value(statement)
150
+
151
+
152
+ # class TemporaryTablePG(ConnectorPG):
153
+ # def __init__(self, name: str, db_config_name: str = "postgres") -> None:
154
+ # super().__init__(db_config_name)
155
+ # self.name = name
156
+ # self.data_manipulator = DataManupulationPG(db_config_name=self.db_config_name)
157
+
158
+ # # mapping dtypes to PG types
159
+ # self._KIND_MAPPER = {
160
+ # "M": "timestamp",
161
+ # "f": "real",
162
+ # "i": "bigint",
163
+ # }
164
+
165
+ # def _pd_types_to_pg_types(self, df: PandasDataFrame) -> dict:
166
+ # df_types = df.dtypes.to_dict()
167
+ # some = {
168
+ # col: self._KIND_MAPPER.get(col_type.kind, "varchar")
169
+ # for col, col_type in df_types.items()
170
+ # }
171
+ # return some
172
+
173
+ # def _types_to_str_statments(self, types: dict) -> str:
174
+ # types_statments = ", \n ".join([" ".join(x) for x in types.items()])
175
+ # return types_statments
176
+
177
+ # def create_table(self, df: PandasDataFrame):
178
+ # types = self._pd_types_to_pg_types(df)
179
+ # types_statments = self._types_to_str_statments(types)
180
+ # query = f"""
181
+ # CREATE TEMP TABLE {self.name}(
182
+ # {types_statments}
183
+ # );
184
+ # """
185
+ # self.execute(query)
186
+ # self.commit
@@ -0,0 +1,138 @@
1
+ from typing import Dict, Optional
2
+
3
+ import pandas as pd
4
+ from dump.files.tmp_util import TemporaryFileSystem
5
+ from pandas import DataFrame as PandasDataFrame
6
+
7
+ from .connectors import ConnectorPG, DBUtilsPG
8
+
9
+
10
class GetDataManipulatorPG(ConnectorPG):
    """
    Exports query results from PostgreSQL into CSV files via ``COPY ... TO STDOUT``.

    Each distinct query text is exported once per instance; later requests for
    the same text return the path of the file written the first time.
    """

    def __init__(
        self,
        encoding="utf-16",
        size: int = 262144,
        db_config_name: str = "postgres",
    ) -> None:
        """
        Args:
            encoding: encoding used to write and read the CSV files.
            size: buffer size passed to ``copy_expert``.
            db_config_name: ini section with the connection settings.
        """
        super().__init__(db_config_name=db_config_name)
        self.encoding = encoding
        self.size = size

        # query text -> path of the CSV already exported for it
        self.__query_history: Dict[str, str] = {}
        self.file_system = TemporaryFileSystem()

    def get_data(self, query: str, path: Optional[str] = None) -> str:
        """
        Export the result of ``query`` to a CSV file and return the file path.

        Args:
            query: SELECT statement to export.
            path: destination file; a new file in the temporary directory is
                used when omitted. Ignored if ``query`` was already exported.

        Returns:
            Path of the CSV file holding the result.
        """
        cached_path = self.__query_history.get(query)
        if cached_path is not None:
            return cached_path

        if path is None:
            path = self.file_system.save_path()

        to_execute = f"""
        copy ( {query} )
        to stdout
        WITH
            CSV HEADER
        """
        try:
            with open(path, "w", encoding=self.encoding) as fp:
                self.cursor.copy_expert(to_execute, fp, size=self.size)
        finally:
            # Release the connection even when the export fails, so a failed
            # COPY does not leave a connection behind for the next call.
            if self._conn is not None:
                self.close

        # Remember the file that was actually written. Previously the last
        # temporary path was stored, which was wrong (or missing) whenever the
        # caller supplied ``path``.
        self.__query_history[query] = path
        return path

    def get_df(self, query: str) -> PandasDataFrame:
        """Export ``query`` with ``get_data`` and load the CSV into a DataFrame."""
        path = self.get_data(query)
        return pd.read_csv(path, encoding=self.encoding, low_memory=False)
51
+
52
+
53
class SaveDataManipulatorPG(ConnectorPG):
    """
    Loads DataFrames / CSV files into one PostgreSQL table via ``COPY ... FROM STDIN``.
    """

    def __init__(
        self,
        table_name: str,
        db_schema: Optional[str] = None,
        encoding="utf-16",
        size: int = 262144,
        db_config_name: str = "postgres",
    ) -> None:
        """
        Args:
            table_name: destination table.
            db_schema: schema of the table; looked up by table name when omitted.
            encoding: encoding used to write and read the intermediate CSV.
            size: buffer size passed to ``copy_expert``.
            db_config_name: ini section with the connection settings.
        """
        super().__init__(db_config_name=db_config_name)
        self.table_name = table_name
        self.db_schema = db_schema
        self.encoding = encoding
        self.size = size

        self.file_system = TemporaryFileSystem()

        self.db_utils = DBUtilsPG(db_config_name)
        if self.db_schema is None:
            self.db_schema = self.db_utils.get_schema_by_table(self.table_name)

    @staticmethod
    def __get_columns_right_order(table_schema: Dict[str, dict]) -> list:
        """
        Return the column names sorted by their position in the destination table.
        """
        return sorted(
            table_schema,
            key=lambda column: table_schema[column]["position"],
        )

    def _validate_df_to_save(self, df) -> PandasDataFrame:
        """
        Check that ``df`` has every column of the destination table and return
        those columns in table order (extra columns are dropped).

        Raises:
            Exception: one or more table columns are missing from ``df``.
        """
        table_schema = self.db_utils.get_table_schema(self.table_name, self.db_schema)
        columns_not_found = set(table_schema) - set(df.columns)

        if columns_not_found:
            raise Exception(f"Columns not found: {columns_not_found}")

        column_to_write = self.__get_columns_right_order(table_schema)
        return df[column_to_write]

    def init_df(self, df: PandasDataFrame) -> str:
        """
        Validate ``df`` against the table and write it as CSV into the
        temporary directory.

        Returns:
            Path of the written CSV file.
        """
        df = self._validate_df_to_save(df)

        path = self.file_system.save_path()
        df.to_csv(path, encoding=self.encoding, index=False)
        return path

    def save_data(
        self,
        path: str,
    ):
        """
        Copy the CSV file at ``path`` into ``db_schema.table_name``.

        The file must have a header row and its columns must follow the table's
        column order. Use it together with ``init_df`` or pass a path manually.
        The connection is committed on success and closed in either case.
        """

        to_execute = f"""
        copy {self.db_schema}.{self.table_name}
        from STDIN
        WITH
            CSV header
        """

        try:
            with open(path, "r", encoding=self.encoding) as fp:
                self.cursor.copy_expert(to_execute, fp, size=self.size)
            self.commit
        finally:
            # Close even when COPY fails so that the connection is not left
            # open; nothing is committed in that case.
            if self._conn is not None:
                self.close

    def save_df(self, df: PandasDataFrame):
        """Write ``df`` to a temporary CSV and copy it into the table."""
        path = self.init_df(df)
        self.save_data(path)
@@ -0,0 +1,167 @@
1
+ from typing import Optional
2
+
3
+ import pandas as pd
4
+ from pandas import DataFrame as PandasDataFrame
5
+
6
+ from .connectors import ConnectorPG, DBUtilsPG, SequenceUtils
7
+ from .manipulators import GetDataManipulatorPG, SaveDataManipulatorPG
8
+ from .types_util import cast_df_by_schema
9
+
10
+
11
# Technical bookkeeping columns that TablePG.get_df drops on request.
TECH_COLUMNS = {"_inserted_dttm", "_job_id"}


class TablePG(ConnectorPG):
    """
    Small query builder and reader for a single PostgreSQL table.

    ``select`` and ``filter`` return ``self`` so calls can be chained; the
    assembled statement is available as ``query`` and executed by ``get_df``.
    Arguments are kept in the order given (duplicates removed), so the
    generated SQL and the column order of the result are deterministic.
    """

    def __init__(
        self,
        table_name: str,
        db_schema: Optional[str] = None,
        limit: Optional[int] = None,
        auto_cast: bool = True,
        clear_tech_cols: bool = True,
        db_config_name: str = "postgres",
    ) -> None:
        """
        Args:
            table_name: table to read from.
            db_schema: schema of the table; looked up by table name when omitted.
            limit: optional ``LIMIT`` applied to the generated query.
            auto_cast: cast result columns according to the table schema.
            clear_tech_cols: drop the columns listed in ``TECH_COLUMNS``.
            db_config_name: ini section with the connection settings.
        """
        super().__init__(db_config_name=db_config_name)
        self.table_name = table_name
        self.db_schema = db_schema
        self._limit = limit
        self.auto_cast = auto_cast
        self.clear_tech_cols = clear_tech_cols

        self.data_manipulator = GetDataManipulatorPG(db_config_name=self.db_config_name)
        self.db_utils = DBUtilsPG(db_config_name=db_config_name)

        if self.db_schema is None:
            self.db_schema = self.db_utils.get_schema_by_table(self.table_name)

        # Only needed (and only set) when results are cast.
        if self.auto_cast:
            self.table_schema = self.db_utils.get_table_schema(
                self.table_name,
                self.db_schema,
            )

        self._filters_args: list = []
        self._select_args: list = []

    def filter(self, *args, operation: str = "and"):
        """
        Replace the current filters with ``args``, each prefixed by ``operation``.

        NOTE(review): the clause always starts with ``WHERE 1=1``, so with
        ``operation="or"`` the condition is true for every row.
        """
        # dict.fromkeys removes duplicates while keeping the caller's order.
        self._filters_args = list(
            dict.fromkeys(f" {operation} {arg}" for arg in args)
        )
        return self

    @property
    def _filters(self) -> str:
        """``WHERE`` clause built from the current filters."""
        filters = "WHERE 1=1"
        filters += " ".join(self._filters_args)
        return filters

    def select(self, *args):
        """Replace the current column list with ``args``."""
        self._select_args = list(dict.fromkeys(args))
        return self

    @property
    def _select(self) -> str:
        """``select`` clause; ``*`` when no columns were chosen."""
        if not self._select_args:
            return "select *"
        return "select " + ", ".join(self._select_args)

    @property
    def query(self) -> str:
        """Full statement assembled from the select list, filters and limit."""
        limit = ""
        if self._limit is not None:
            limit = f"LIMIT {self._limit}"

        query = f"""
        {self._select}
        FROM {self.db_schema}.{self.table_name}
        {self._filters}
        {limit}
        """
        return query

    @property
    def _clear_statments(self):
        """Reset the select list and filters; returns ``self``."""
        self._filters_args = []
        self._select_args = []
        return self

    def get_df(self, query: Optional[str] = None) -> PandasDataFrame:
        """
        Run ``query`` (the built ``self.query`` when omitted) and return the result.

        The frame is cast by the table schema when ``auto_cast`` is set, and
        the ``TECH_COLUMNS`` present in it are dropped when ``clear_tech_cols``
        is set.
        """
        if query is None:
            query = self.query

        df = self.data_manipulator.get_df(query)

        if self.auto_cast:
            df = cast_df_by_schema(df, self.table_schema)

        if self.clear_tech_cols:
            to_drop = TECH_COLUMNS.intersection(df.columns)
            df = df.drop(columns=to_drop)
        return df

    def count(self) -> "pd.Series":
        """Return the ``cnt`` column of ``select count(*)`` over the current query."""
        query = f"""select count(*) as cnt from ({self.query}) s"""
        return self.get_df(query)["cnt"]
107
+
108
+
109
class JobIDSequence(SequenceUtils):
    """``SequenceUtils`` preconfigured for the ``meta.seq_general_job_id`` sequence."""

    def __init__(self, db_config_name: str = "postgres"):
        """Bind to ``seq_general_job_id`` in schema ``meta`` using ``db_config_name``."""
        sequence_settings = {
            "sequence_name": "seq_general_job_id",
            "sequence_schema": "meta",
        }
        super().__init__(db_config_name=db_config_name, **sequence_settings)
119
+
120
+
121
class SaveTablePG:
    """
    Saves DataFrames into a PostgreSQL table, stamping each batch with
    technical columns (insert timestamp and, if the table has it, a job id).
    """

    def __init__(
        self,
        table_name: str,
        db_schema: Optional[str] = None,
        insert_dttm_col_name: str = "_inserted_dttm",
        job_id_col_name: str = "_job_id",
        auto_cast: bool = True,
        size: int = 131072,
        db_config_name: str = "postgres",
        sequence_util: type = JobIDSequence,
    ) -> None:
        """
        Args:
            table_name: destination table.
            db_schema: schema of the table; resolved by table name when omitted.
            insert_dttm_col_name: column that receives the insert timestamp.
            job_id_col_name: column that receives the job id.
            auto_cast: cast the frame by the table schema before saving.
            size: buffer size used for the COPY operation.
            db_config_name: ini section with the connection settings.
            sequence_util: class (not instance) that is called with
                ``db_config_name`` and provides ``next_sequence``.
        """
        self.table_name = table_name
        self.insert_dttm_col_name = insert_dttm_col_name
        self.job_id_col_name = job_id_col_name
        self.auto_cast = auto_cast
        self.db_config_name = db_config_name

        self.data_manipulator = SaveDataManipulatorPG(
            size=size,
            table_name=table_name,
            db_schema=db_schema,
            db_config_name=self.db_config_name,
        )
        # Use the schema resolved by the manipulator. Keeping a ``None`` here
        # made the later column lookup match nothing, which silently skipped
        # both the job id and the type casting.
        self.db_schema = self.data_manipulator.db_schema

        self.sequence_utils = sequence_util(db_config_name=self.db_config_name)
        self._job_id = None

    def _create_tech_cols(self, df: PandasDataFrame) -> PandasDataFrame:
        """
        Return a copy of ``df`` with the technical columns filled in.

        The insert timestamp column is always set. The job id column is set
        from the next sequence value only when the destination table has such
        a column. The caller's frame is left untouched.
        """
        table_schema = self.data_manipulator.db_utils.get_table_schema(
            self.table_name, db_schema=self.db_schema
        )
        # Work on a copy so the caller's DataFrame does not gain extra columns.
        df = df.copy()
        df[self.insert_dttm_col_name] = pd.Timestamp.now()

        if self.job_id_col_name in table_schema:
            self._job_id = self.sequence_utils.next_sequence
            df[self.job_id_col_name] = self._job_id
            print("current sequence in that job", self._job_id)

        if self.auto_cast:
            df = cast_df_by_schema(df, table_schema, cast_timestamp=False)
        return df

    def save(self, df: PandasDataFrame) -> Optional[int]:
        """
        Save ``df`` into the table.

        Returns:
            The job id assigned to the batch, or ``None`` if no job id has
            been assigned (the table has no job id column).
        """
        df = self._create_tech_cols(df)
        self.data_manipulator.save_df(df)
        return self._job_id
@@ -0,0 +1,32 @@
1
+ from typing import Dict
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from pandas import DataFrame as PandasDataFrame
6
+
7
+
8
# Text form of booleans as found in the CSV export ("t" / "f").
BOOL_MAPPER = {"f": False, "t": True}


def cast_df_by_schema(
    df: PandasDataFrame, schema: Dict[str, dict], cast_timestamp: bool = True
) -> PandasDataFrame:
    """
    Cast the columns of ``df`` to the dtypes described by ``schema``.

    Args:
        df: frame to cast; columns absent from ``schema`` are left as they are.
        schema: ``{column: {"data_type": <dtype name>, ...}}``.
        cast_timestamp: when False, ``"timestamp"`` columns are left untouched.

    Returns:
        The cast frame. A column that cannot be cast is reported with
        ``print`` and kept in its current state.

    Boolean columns accept ``"t"``/``"f"`` as well as real booleans. Anything
    else is treated as missing, and a column with missing values is cast to
    the nullable ``"boolean"`` dtype. A plain ``bool`` cast would turn every
    missing value into ``True``.
    """
    for column in df.columns:
        if column not in schema:
            continue

        cast_to = schema[column]["data_type"]
        try:
            if cast_to == "timestamp":
                if cast_timestamp:
                    df[column] = pd.to_datetime(df[column], errors="coerce")
                continue

            if cast_to == "bool":
                mapped = df[column].apply(
                    lambda x: bool(x)
                    if isinstance(x, (bool, np.bool_))
                    else BOOL_MAPPER.get(x, np.nan)
                )
                df[column] = mapped
                if mapped.isna().any():
                    # Keep missing values missing instead of coercing to True.
                    cast_to = "boolean"

            df = df.astype({column: cast_to})
        except Exception as e:
            print(f"unable to cast column: {column} to {cast_to} error: {e}")

        # Casting to "str" turns missing values into the literal strings
        # "nan"/"None"; turn those back into real missing values.
        df = df.replace("nan", np.nan)
        df = df.replace("None", np.nan)
    return df
File without changes
@@ -0,0 +1,116 @@
1
+ import os
2
+ from typing import Dict, Optional
3
+
4
+ import boto3
5
+ import pandas as pd
6
+ from botocore.config import Config
7
+ from dump.config_utils import load_config
8
+ from dump.files.tmp_util import TemporaryFileSystem
9
+ from pandas import DataFrame as PandasDataFrame
10
+
11
+
12
class S3Client:
    """
    Thin wrapper around a boto3 S3 client whose credentials come from an ini file.
    """

    def __init__(
        self, creds_section: str, config_filiname: str = "s3_config.ini"
    ) -> None:
        """
        Args:
            creds_section: section of the ini file whose options are passed
                as keyword arguments to ``boto3.client``.
            config_filiname: path of the ini file.
        """
        self.creds_section = creds_section

        self.config = load_config(filename=config_filiname, section=creds_section)
        self.conn_config = Config(
            s3={
                "addressing_style": "virtual",
            },
            retries={"max_attempts": 10, "mode": "standard"},
            region_name="us-east-1",
        )

        self.client = boto3.client(
            "s3",
            **self.config,
            config=self.conn_config,
        )

    def get_files_info(self, bucket_name: str, object_folder: Optional[str] = ""):
        """
        List the objects of ``bucket_name`` whose key starts with ``object_folder``.

        All result pages are read, so listings longer than a single
        ``list_objects_v2`` response are returned completely.

        Returns:
            ``{key: {"size_bytes": int, "size_gb": float}}``; empty when
            nothing matches.
        """
        files_dict = {}
        paginator = self.client.get_paginator("list_objects_v2")
        # ``Prefix`` must be a string; treat None like "no prefix".
        pages = paginator.paginate(Bucket=bucket_name, Prefix=object_folder or "")
        for page in pages:
            for obj in page.get("Contents", []):
                size = obj["Size"]
                files_dict[obj["Key"]] = {
                    "size_bytes": size,
                    "size_gb": size / 1024**3,
                }
        return files_dict
44
+
45
+
46
class S3Upload(S3Client):
    """Uploads local files into one folder (key prefix) of an S3 bucket."""

    def __init__(
        self,
        bucket_name: str,
        object_folder: str = "",
        creds_section: str = "test",
    ) -> None:
        """
        Args:
            bucket_name: destination bucket.
            object_folder: key prefix under which files are stored.
            creds_section: ini section with the S3 credentials.
        """
        super().__init__(creds_section)
        self.bucket_name = bucket_name
        self.object_folder = object_folder

    def _get_path(self, file_name) -> str:
        """
        Build the object key for ``file_name`` inside ``object_folder``.
        """
        # S3 keys always use "/" as separator; os.path.join would produce
        # backslashes on Windows.
        import posixpath

        return posixpath.join(self.object_folder, file_name)

    def upload_to_s3(self, tmp_path_file, file_name: str):
        """
        Upload the local file ``tmp_path_file`` to the bucket under ``file_name``.
        """
        path = self._get_path(file_name)
        self.client.upload_file(tmp_path_file, self.bucket_name, path)
70
+
71
+
72
class S3Download(S3Client):
    """
    Downloads objects from one folder (key prefix) of an S3 bucket into a
    temporary directory, fetching each object at most once per instance.
    """

    def __init__(
        self,
        bucket_name: str,
        object_folder: str = "",
        creds_section: str = "test",
    ) -> None:
        """
        Args:
            bucket_name: source bucket.
            object_folder: key prefix under which the files live.
            creds_section: ini section with the S3 credentials.
        """
        super().__init__(creds_section)
        self.bucket_name = bucket_name
        self.object_folder = object_folder

        self.file_system = TemporaryFileSystem()

        # object key -> local path of the copy already downloaded
        self.__downloads_history: Dict[str, str] = {}

    def _check_path_exist(self, path: str) -> bool:
        """Return True when an object with exactly the key ``path`` exists."""
        # List by the full key rather than the whole folder: less data, and
        # the exact key cannot fall outside a truncated listing.
        matching = self.get_files_info(self.bucket_name, path)
        return path in matching

    def _get_object_path(self, file_name) -> str:
        """Build the object key for ``file_name`` inside ``object_folder``."""
        # S3 keys always use "/" as separator; os.path.join would produce
        # backslashes on Windows.
        import posixpath

        return posixpath.join(self.object_folder, file_name)

    def download_from_s3(self, download_file_name: str) -> str:
        """
        Download ``download_file_name`` into the temporary directory.

        Returns:
            Path of the local copy (reused if the object was downloaded before).

        Raises:
            ValueError: the object does not exist in the bucket.
        """
        object_path = self._get_object_path(download_file_name)
        if not self._check_path_exist(object_path):
            raise ValueError(f"The file {object_path} was not found")

        if object_path not in self.__downloads_history:
            save_path = self.file_system.save_path()
            self.client.download_file(self.bucket_name, object_path, save_path)
            self.__downloads_history[object_path] = save_path

        return self.__downloads_history[object_path]

    def download_from_s3_df(
        self, download_file_name: str, encoding="utf-8"
    ) -> PandasDataFrame:
        """Download the object and read it as CSV into a DataFrame."""
        df_path = self.download_from_s3(download_file_name)
        return pd.read_csv(df_path, encoding=encoding, low_memory=False)