tablemaster 2.1.0__tar.gz → 2.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tablemaster-2.1.0 → tablemaster-2.1.2}/PKG-INFO +1 -1
- {tablemaster-2.1.0 → tablemaster-2.1.2}/pyproject.toml +1 -1
- tablemaster-2.1.2/tablemaster/database.py +473 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/feishu.py +14 -9
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/gspread.py +11 -7
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/schema/pull.py +18 -1
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster.egg-info/PKG-INFO +1 -1
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster.egg-info/SOURCES.txt +1 -0
- tablemaster-2.1.2/tests/test_error_visibility.py +54 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tests/test_schema_core.py +29 -2
- tablemaster-2.1.0/tablemaster/database.py +0 -286
- {tablemaster-2.1.0 → tablemaster-2.1.2}/LICENSE +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/README.md +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/setup.cfg +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/__init__.py +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/__main__.py +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/cli.py +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/config.py +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/local.py +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/schema/__init__.py +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/schema/apply.py +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/schema/dialects/__init__.py +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/schema/dialects/base.py +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/schema/dialects/mysql.py +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/schema/dialects/postgresql.py +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/schema/dialects/tidb.py +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/schema/diff.py +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/schema/init.py +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/schema/introspect.py +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/schema/loader.py +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/schema/models.py +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/schema/plan.py +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/sync.py +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster/utils.py +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster.egg-info/dependency_links.txt +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster.egg-info/entry_points.txt +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster.egg-info/requires.txt +0 -0
- {tablemaster-2.1.0 → tablemaster-2.1.2}/tablemaster.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tablemaster
|
|
3
|
-
Version: 2.1.0
|
|
3
|
+
Version: 2.1.2
|
|
4
4
|
Summary: tablemaster is a Python toolkit for moving and managing tabular data across databases, Feishu/Lark, Google Sheets, and local files with one consistent API.
|
|
5
5
|
Author-email: Livid <livid.su@gmail.com>
|
|
6
6
|
Project-URL: Homepage, https://github.com/ilivid/tablemaster
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "tablemaster"
|
|
7
|
-
version = "2.1.0"
|
|
7
|
+
version = "2.1.2"
|
|
8
8
|
description = "tablemaster is a Python toolkit for moving and managing tabular data across databases, Feishu/Lark, Google Sheets, and local files with one consistent API."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -0,0 +1,473 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
import re
|
|
4
|
+
import warnings
|
|
5
|
+
from typing import Union, List, Tuple, Dict, Any, Optional
|
|
6
|
+
from functools import lru_cache
|
|
7
|
+
|
|
8
|
+
from sqlalchemy import create_engine, inspect, pool, text
|
|
9
|
+
from sqlalchemy.engine import Engine
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
from tqdm import tqdm
|
|
13
|
+
from urllib.parse import quote_plus
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def get_connect_args(configs: Any) -> Dict[str, Any]:
|
|
19
|
+
"""
|
|
20
|
+
Get database connection arguments, supporting SSL and other common configurations.
|
|
21
|
+
|
|
22
|
+
Args:
|
|
23
|
+
configs (Any): Configuration object that may contain use_ssl, ssl_ca, connect_args, db_type.
|
|
24
|
+
|
|
25
|
+
Returns:
|
|
26
|
+
Dict[str, Any]: A dictionary of connection arguments.
|
|
27
|
+
"""
|
|
28
|
+
connect_args: Dict[str, Any] = {}
|
|
29
|
+
|
|
30
|
+
if hasattr(configs, 'connect_args') and configs.connect_args:
|
|
31
|
+
connect_args = configs.connect_args.copy()
|
|
32
|
+
else:
|
|
33
|
+
use_ssl: bool = getattr(configs, 'use_ssl', False)
|
|
34
|
+
db_type: str = getattr(configs, 'db_type', 'mysql').lower()
|
|
35
|
+
|
|
36
|
+
if db_type == 'tidb' or use_ssl:
|
|
37
|
+
ssl_ca: str = getattr(configs, 'ssl_ca', '/etc/ssl/cert.pem')
|
|
38
|
+
connect_args = {
|
|
39
|
+
'ssl': {
|
|
40
|
+
'ca': ssl_ca,
|
|
41
|
+
'check_hostname': False,
|
|
42
|
+
'verify_identity': False
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
return connect_args
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _build_conn_str(configs: Any) -> str:
|
|
50
|
+
"""
|
|
51
|
+
Build the SQLAlchemy connection string based on configuration.
|
|
52
|
+
|
|
53
|
+
Args:
|
|
54
|
+
configs (Any): Configuration object containing host, port, user, password, database, etc.
|
|
55
|
+
|
|
56
|
+
Returns:
|
|
57
|
+
str: The SQLAlchemy connection string.
|
|
58
|
+
"""
|
|
59
|
+
db_type: str = getattr(configs, 'db_type', 'mysql').lower()
|
|
60
|
+
password_encoded: str = quote_plus(configs.password)
|
|
61
|
+
match db_type:
|
|
62
|
+
case 'mysql' | 'tidb':
|
|
63
|
+
cf_port: int = getattr(configs, 'port', 3306)
|
|
64
|
+
return f'mysql+pymysql://{configs.user}:{password_encoded}@{configs.host}:{cf_port}/{configs.database}'
|
|
65
|
+
case 'postgresql':
|
|
66
|
+
cf_port: int = getattr(configs, 'port', 5432)
|
|
67
|
+
return f'postgresql+psycopg2://{configs.user}:{password_encoded}@{configs.host}:{cf_port}/{configs.database}'
|
|
68
|
+
case _:
|
|
69
|
+
raise ValueError(f'Unsupported db_type: {configs.db_type}')
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@lru_cache(maxsize=16)
|
|
73
|
+
def _get_engine(conn_str: str, connect_args_json: str = '{}', autocommit: bool = False) -> Engine:
|
|
74
|
+
"""
|
|
75
|
+
Get or create a cached SQLAlchemy Engine instance.
|
|
76
|
+
|
|
77
|
+
Args:
|
|
78
|
+
conn_str (str): The database connection string.
|
|
79
|
+
connect_args_json (str, optional): JSON string representation of connection arguments. Defaults to '{}'.
|
|
80
|
+
autocommit (bool, optional): Whether the engine should be in autocommit mode. Defaults to False.
|
|
81
|
+
|
|
82
|
+
Returns:
|
|
83
|
+
Engine: The created SQLAlchemy Engine instance.
|
|
84
|
+
"""
|
|
85
|
+
connect_args: Dict[str, Any] = json.loads(connect_args_json) if connect_args_json else {}
|
|
86
|
+
engine_kwargs: Dict[str, Any] = {
|
|
87
|
+
'connect_args': connect_args,
|
|
88
|
+
'poolclass': pool.QueuePool,
|
|
89
|
+
'pool_size': 5,
|
|
90
|
+
'max_overflow': 10,
|
|
91
|
+
'pool_pre_ping': True,
|
|
92
|
+
}
|
|
93
|
+
if autocommit:
|
|
94
|
+
engine_kwargs['isolation_level'] = 'AUTOCOMMIT'
|
|
95
|
+
return create_engine(conn_str, **engine_kwargs)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _resolve_engine(configs: Any, autocommit: bool = False) -> Engine:
|
|
99
|
+
"""
|
|
100
|
+
Resolve and return an Engine based on configuration.
|
|
101
|
+
|
|
102
|
+
Args:
|
|
103
|
+
configs (Any): Configuration object.
|
|
104
|
+
autocommit (bool, optional): Whether to use autocommit mode. Defaults to False.
|
|
105
|
+
|
|
106
|
+
Returns:
|
|
107
|
+
Engine: The SQLAlchemy Engine instance.
|
|
108
|
+
"""
|
|
109
|
+
connection_string: str = _build_conn_str(configs)
|
|
110
|
+
connect_args: Dict[str, Any] = get_connect_args(configs)
|
|
111
|
+
connect_args_json: str = json.dumps(connect_args, sort_keys=True, default=str)
|
|
112
|
+
return _get_engine(connection_string, connect_args_json, autocommit)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _safe_identifier(identifier: str) -> str:
|
|
116
|
+
"""
|
|
117
|
+
Ensure an identifier is safe from SQL injection.
|
|
118
|
+
|
|
119
|
+
Args:
|
|
120
|
+
identifier (str): The SQL identifier to validate.
|
|
121
|
+
|
|
122
|
+
Returns:
|
|
123
|
+
str: The safe identifier.
|
|
124
|
+
|
|
125
|
+
Raises:
|
|
126
|
+
ValueError: If the identifier contains invalid characters.
|
|
127
|
+
"""
|
|
128
|
+
if not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', identifier):
|
|
129
|
+
raise ValueError(f'Invalid identifier: {identifier}')
|
|
130
|
+
return identifier
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _safe_mysql_type(data_type: str) -> str:
|
|
134
|
+
"""
|
|
135
|
+
Ensure a MySQL data type expression is safe from SQL injection.
|
|
136
|
+
|
|
137
|
+
Args:
|
|
138
|
+
data_type (str): The MySQL data type to validate.
|
|
139
|
+
|
|
140
|
+
Returns:
|
|
141
|
+
str: The safe data type string.
|
|
142
|
+
|
|
143
|
+
Raises:
|
|
144
|
+
ValueError: If the data type expression contains invalid characters.
|
|
145
|
+
"""
|
|
146
|
+
normalized: str = data_type.strip()
|
|
147
|
+
if not re.match(r'^[A-Za-z0-9_,()\s]+$', normalized):
|
|
148
|
+
raise ValueError(f'Invalid data type expression: {data_type}')
|
|
149
|
+
return normalized
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def query(sql: Union[str, text], configs: Any, params: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
|
|
153
|
+
"""
|
|
154
|
+
Execute a query and return results as a pandas DataFrame.
|
|
155
|
+
|
|
156
|
+
Args:
|
|
157
|
+
sql (Union[str, text]): The SQL query to execute.
|
|
158
|
+
configs (Any): Configuration object.
|
|
159
|
+
params (Optional[Dict[str, Any]], optional): Query parameters. Defaults to None.
|
|
160
|
+
|
|
161
|
+
Returns:
|
|
162
|
+
pd.DataFrame: Query results.
|
|
163
|
+
"""
|
|
164
|
+
logger.info('try to connect to %s...', getattr(configs, 'name', 'database'))
|
|
165
|
+
engine: Engine = _resolve_engine(configs, autocommit=False)
|
|
166
|
+
with engine.connect() as conn:
|
|
167
|
+
statement = text(sql) if isinstance(sql, str) else sql
|
|
168
|
+
df: pd.DataFrame = pd.read_sql(statement, conn, params=params)
|
|
169
|
+
logger.debug('query preview: %s', df.head())
|
|
170
|
+
return df
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def opt(sql: Union[str, text], configs: Any, params: Optional[Dict[str, Any]] = None) -> None:
|
|
174
|
+
"""
|
|
175
|
+
Execute a SQL statement that modifies the database (e.g., INSERT, UPDATE, DELETE).
|
|
176
|
+
|
|
177
|
+
Args:
|
|
178
|
+
sql (Union[str, text]): The SQL statement to execute.
|
|
179
|
+
configs (Any): Configuration object.
|
|
180
|
+
params (Optional[Dict[str, Any]], optional): Query parameters. Defaults to None.
|
|
181
|
+
"""
|
|
182
|
+
logger.info('try to connect to %s...', getattr(configs, 'name', 'database'))
|
|
183
|
+
engine: Engine = _resolve_engine(configs, autocommit=True)
|
|
184
|
+
with engine.connect() as conn:
|
|
185
|
+
statement = text(sql) if isinstance(sql, str) else sql
|
|
186
|
+
conn.execute(statement, params or {})
|
|
187
|
+
logger.info('database execute success')
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
class ManageTable:
|
|
191
|
+
"""
|
|
192
|
+
A class to manage a specific database table's operations.
|
|
193
|
+
"""
|
|
194
|
+
def __init__(self, table: str, configs: Any, verify: bool = False) -> None:
|
|
195
|
+
"""
|
|
196
|
+
Initialize a ManageTable instance.
|
|
197
|
+
|
|
198
|
+
Args:
|
|
199
|
+
table (str): The name of the table.
|
|
200
|
+
configs (Any): Configuration object for the database.
|
|
201
|
+
verify (bool, optional): Whether to verify if the table exists upon initialization. Defaults to False.
|
|
202
|
+
"""
|
|
203
|
+
self.port: int = getattr(configs, 'port', 3306)
|
|
204
|
+
self.table: str = table
|
|
205
|
+
self.name: str = configs.name
|
|
206
|
+
self.user: str = configs.user
|
|
207
|
+
self.password: str = configs.password
|
|
208
|
+
self.host: str = configs.host
|
|
209
|
+
self.database: str = configs.database
|
|
210
|
+
self.configs: Any = configs
|
|
211
|
+
if verify:
|
|
212
|
+
self._check_exists()
|
|
213
|
+
|
|
214
|
+
def _check_exists(self) -> None:
|
|
215
|
+
"""
|
|
216
|
+
Check if the table exists and raise an error if not.
|
|
217
|
+
|
|
218
|
+
Raises:
|
|
219
|
+
ValueError: If the table does not exist.
|
|
220
|
+
"""
|
|
221
|
+
if not self.exists():
|
|
222
|
+
raise ValueError(f'table not found: {self.table}')
|
|
223
|
+
logger.info('table exists: %s', self.table)
|
|
224
|
+
|
|
225
|
+
def exists(self) -> bool:
|
|
226
|
+
"""
|
|
227
|
+
Check if the table exists in the database.
|
|
228
|
+
|
|
229
|
+
Returns:
|
|
230
|
+
bool: True if table exists, False otherwise.
|
|
231
|
+
"""
|
|
232
|
+
safe_table: str = _safe_identifier(self.table)
|
|
233
|
+
try:
|
|
234
|
+
engine: Engine = _resolve_engine(self.configs if hasattr(self, 'configs') else self, autocommit=False)
|
|
235
|
+
inspector = inspect(engine)
|
|
236
|
+
return inspector.has_table(safe_table)
|
|
237
|
+
except Exception as e:
|
|
238
|
+
logger.exception('failed to check if table exists: %s', e)
|
|
239
|
+
raise
|
|
240
|
+
|
|
241
|
+
def delete_table(self) -> None:
|
|
242
|
+
"""
|
|
243
|
+
Drop the table from the database.
|
|
244
|
+
"""
|
|
245
|
+
safe_table: str = _safe_identifier(self.table)
|
|
246
|
+
try:
|
|
247
|
+
opt(text(f'DROP TABLE `{safe_table}`'), self)
|
|
248
|
+
logger.info('%s deleted', self.table)
|
|
249
|
+
except Exception as e:
|
|
250
|
+
logger.exception('table was not deleted: %s', e)
|
|
251
|
+
raise
|
|
252
|
+
|
|
253
|
+
def par_del(self, clause: str, params: Optional[Dict[str, Any]] = None) -> None:
|
|
254
|
+
"""
|
|
255
|
+
Delete specific records from the table based on a WHERE clause.
|
|
256
|
+
|
|
257
|
+
Args:
|
|
258
|
+
clause (str): The WHERE clause conditions.
|
|
259
|
+
params (Optional[Dict[str, Any]], optional): Parameters for the WHERE clause. Defaults to None.
|
|
260
|
+
"""
|
|
261
|
+
safe_table: str = _safe_identifier(self.table)
|
|
262
|
+
del_clause = text(f'DELETE FROM `{safe_table}` WHERE {clause}')
|
|
263
|
+
opt(del_clause, self, params=params)
|
|
264
|
+
logger.info('records deleted by clause: %s', clause)
|
|
265
|
+
|
|
266
|
+
def change_data_type(self, cols_name: str, data_type: str) -> None:
|
|
267
|
+
"""
|
|
268
|
+
Change the data type of a specific column in the table.
|
|
269
|
+
|
|
270
|
+
Args:
|
|
271
|
+
cols_name (str): The name of the column to alter.
|
|
272
|
+
data_type (str): The new data type expression.
|
|
273
|
+
"""
|
|
274
|
+
safe_table: str = _safe_identifier(self.table)
|
|
275
|
+
safe_col: str = _safe_identifier(cols_name)
|
|
276
|
+
safe_type: str = _safe_mysql_type(data_type)
|
|
277
|
+
change_clause = text(f'ALTER TABLE `{safe_table}` MODIFY COLUMN `{safe_col}` {safe_type}')
|
|
278
|
+
opt(change_clause, self)
|
|
279
|
+
logger.info('%s changed to %s successfully', cols_name, data_type)
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def upload_data(self, df: pd.DataFrame, chunk_size: int = 10000, add_date: bool = False) -> None:
|
|
283
|
+
"""
|
|
284
|
+
Upload data from a pandas DataFrame to the database table.
|
|
285
|
+
|
|
286
|
+
Args:
|
|
287
|
+
df (pd.DataFrame): The DataFrame containing data to upload.
|
|
288
|
+
chunk_size (int, optional): Number of rows to upload per chunk. Defaults to 10000.
|
|
289
|
+
add_date (bool, optional): Whether to append the current date to the DataFrame before uploading. Defaults to False.
|
|
290
|
+
"""
|
|
291
|
+
engine: Engine = _resolve_engine(self.configs if hasattr(self, 'configs') else self, autocommit=False)
|
|
292
|
+
|
|
293
|
+
with engine.begin() as connection:
|
|
294
|
+
if add_date:
|
|
295
|
+
df_copy: pd.DataFrame = df.copy()
|
|
296
|
+
df_copy['rundate'] = datetime.now().strftime('%Y-%m-%d')
|
|
297
|
+
else:
|
|
298
|
+
df_copy: pd.DataFrame = df
|
|
299
|
+
total_chunks: int = (len(df_copy) // chunk_size) + (0 if len(df_copy) % chunk_size == 0 else 1)
|
|
300
|
+
logger.info('try to upload data now, chunk_size is %s', chunk_size)
|
|
301
|
+
with tqdm(total=total_chunks, desc="Uploading Chunks", unit="chunk") as pbar:
|
|
302
|
+
try:
|
|
303
|
+
for start in range(0, len(df_copy), chunk_size):
|
|
304
|
+
end: int = min(start + chunk_size, len(df_copy))
|
|
305
|
+
chunk: pd.DataFrame = df_copy.iloc[start:end]
|
|
306
|
+
chunk.to_sql(name=self.table, con=connection, if_exists='append', index=False)
|
|
307
|
+
pbar.update(1)
|
|
308
|
+
except Exception as e:
|
|
309
|
+
logger.exception('an error occurred during upload: %s', e)
|
|
310
|
+
raise
|
|
311
|
+
|
|
312
|
+
def upsert_data(self, df: pd.DataFrame, chunk_size: int = 10000, add_date: bool = False, ignore: bool = False, key: Union[str, List[str], Tuple[str, ...], None] = None) -> None:
|
|
313
|
+
"""
|
|
314
|
+
Upsert data from a pandas DataFrame into the database table.
|
|
315
|
+
|
|
316
|
+
This method will perform an "insert or update" (upsert) operation based on the target database type.
|
|
317
|
+
If the record already exists (based on the specified primary key or unique index), it updates the existing record.
|
|
318
|
+
Otherwise, it inserts a new record.
|
|
319
|
+
|
|
320
|
+
Args:
|
|
321
|
+
df (pd.DataFrame): The pandas DataFrame containing the data to be upserted.
|
|
322
|
+
chunk_size (int, optional): The number of rows to insert per batch. Defaults to 10000.
|
|
323
|
+
add_date (bool, optional): Whether to add a 'rundate' column with the current date to the dataframe. Defaults to False.
|
|
324
|
+
ignore (bool, optional): If True, it performs an 'INSERT IGNORE' or 'ON CONFLICT DO NOTHING' operation, skipping existing records instead of updating them. Defaults to False.
|
|
325
|
+
key (Union[str, List[str], Tuple[str, ...], None], optional): The primary key or unique index column(s) used to detect conflicts.
|
|
326
|
+
Required for PostgreSQL. For MySQL/TiDB, this is used to exclude primary key columns from being updated.
|
|
327
|
+
Can be a comma-separated string or a list/tuple of strings. Defaults to None.
|
|
328
|
+
|
|
329
|
+
Raises:
|
|
330
|
+
ValueError: If 'key' is not provided when 'db_type' is 'postgresql', or if an unsupported 'db_type' is used.
|
|
331
|
+
"""
|
|
332
|
+
engine = _resolve_engine(self.configs if hasattr(self, 'configs') else self, autocommit=False)
|
|
333
|
+
db_type: str = getattr(self.configs if hasattr(self, 'configs') else self, 'db_type', 'mysql').lower()
|
|
334
|
+
|
|
335
|
+
with engine.begin() as connection:
|
|
336
|
+
if add_date:
|
|
337
|
+
df_copy: pd.DataFrame = df.copy()
|
|
338
|
+
df_copy['rundate'] = datetime.now().strftime('%Y-%m-%d')
|
|
339
|
+
else:
|
|
340
|
+
df_copy: pd.DataFrame = df
|
|
341
|
+
|
|
342
|
+
total_chunks: int = (len(df_copy) // chunk_size) + (0 if len(df_copy) % chunk_size == 0 else 1)
|
|
343
|
+
logger.info('trying to upload data now, chunk_size is %s', chunk_size)
|
|
344
|
+
|
|
345
|
+
with tqdm(total=total_chunks, desc="Uploading Chunks", unit="chunk") as pbar:
|
|
346
|
+
for start in range(0, len(df_copy), chunk_size):
|
|
347
|
+
end: int = min(start + chunk_size, len(df_copy))
|
|
348
|
+
chunk: pd.DataFrame = df_copy.iloc[start:end]
|
|
349
|
+
columns: List[str] = chunk.columns.tolist()
|
|
350
|
+
value_placeholders: str = ', '.join([f':{col}' for col in columns])
|
|
351
|
+
|
|
352
|
+
try:
|
|
353
|
+
if ignore == False:
|
|
354
|
+
keys: List[str] = []
|
|
355
|
+
if key:
|
|
356
|
+
if isinstance(key, str):
|
|
357
|
+
keys = [k.strip() for k in key.split(',')]
|
|
358
|
+
elif isinstance(key, (list, tuple)):
|
|
359
|
+
keys = [str(k).strip() for k in key]
|
|
360
|
+
else:
|
|
361
|
+
raise ValueError('key must be a string or a list of strings')
|
|
362
|
+
|
|
363
|
+
if db_type in ('mysql', 'tidb'):
|
|
364
|
+
if keys:
|
|
365
|
+
update_columns = ', '.join([f"`{col}`=VALUES(`{col}`)" for col in columns if col not in keys])
|
|
366
|
+
else:
|
|
367
|
+
update_columns = ', '.join([f"`{col}`=VALUES(`{col}`)" for col in columns])
|
|
368
|
+
|
|
369
|
+
if update_columns:
|
|
370
|
+
insert_sql = f"""
|
|
371
|
+
INSERT INTO {self.table} ({', '.join([f'`{col}`' for col in columns])})
|
|
372
|
+
VALUES ({value_placeholders})
|
|
373
|
+
ON DUPLICATE KEY UPDATE {update_columns}
|
|
374
|
+
"""
|
|
375
|
+
else:
|
|
376
|
+
insert_sql = f"""
|
|
377
|
+
INSERT IGNORE INTO {self.table} ({', '.join([f'`{col}`' for col in columns])})
|
|
378
|
+
VALUES ({value_placeholders})
|
|
379
|
+
"""
|
|
380
|
+
elif db_type == 'postgresql':
|
|
381
|
+
if not keys:
|
|
382
|
+
raise ValueError('key is required for postgresql upsert')
|
|
383
|
+
|
|
384
|
+
safe_keys = [_safe_identifier(k) for k in keys]
|
|
385
|
+
safe_columns = [_safe_identifier(col) for col in columns]
|
|
386
|
+
quoted_columns = ', '.join([f'"{col}"' for col in safe_columns])
|
|
387
|
+
update_columns = ', '.join(
|
|
388
|
+
[f'"{col}"=EXCLUDED."{col}"' for col in safe_columns if col not in safe_keys]
|
|
389
|
+
)
|
|
390
|
+
conflict_keys_str = ', '.join([f'"{k}"' for k in safe_keys])
|
|
391
|
+
|
|
392
|
+
if update_columns:
|
|
393
|
+
insert_sql = f"""
|
|
394
|
+
INSERT INTO {self.table} ({quoted_columns})
|
|
395
|
+
VALUES ({value_placeholders})
|
|
396
|
+
ON CONFLICT ({conflict_keys_str}) DO UPDATE SET {update_columns}
|
|
397
|
+
"""
|
|
398
|
+
else:
|
|
399
|
+
insert_sql = f"""
|
|
400
|
+
INSERT INTO {self.table} ({quoted_columns})
|
|
401
|
+
VALUES ({value_placeholders})
|
|
402
|
+
ON CONFLICT ({conflict_keys_str}) DO NOTHING
|
|
403
|
+
"""
|
|
404
|
+
else:
|
|
405
|
+
raise ValueError(f'Unsupported db_type for upsert: {db_type}')
|
|
406
|
+
else:
|
|
407
|
+
insert_sql = f"""
|
|
408
|
+
INSERT IGNORE INTO {self.table} ({', '.join([f'`{col}`' for col in columns])})
|
|
409
|
+
VALUES ({value_placeholders})
|
|
410
|
+
"""
|
|
411
|
+
|
|
412
|
+
data = chunk.where(pd.notna(chunk), None).to_dict(orient='records')
|
|
413
|
+
connection.execute(text(insert_sql), data)
|
|
414
|
+
pbar.update(1)
|
|
415
|
+
except Exception as e:
|
|
416
|
+
logger.exception('an error occurred during upsert: %s', e)
|
|
417
|
+
raise
|
|
418
|
+
|
|
419
|
+
class Manage_table(ManageTable):
|
|
420
|
+
"""
|
|
421
|
+
Deprecated class for managing database tables. Use ManageTable instead.
|
|
422
|
+
"""
|
|
423
|
+
def __init__(self, table: str, configs: Any, verify: bool = False) -> None:
|
|
424
|
+
"""
|
|
425
|
+
Initialize the Manage_table instance. Issues a deprecation warning.
|
|
426
|
+
|
|
427
|
+
Args:
|
|
428
|
+
table (str): The name of the table to manage.
|
|
429
|
+
configs (Any): Configuration object containing database connection details.
|
|
430
|
+
verify (bool, optional): Whether to verify the table configuration. Defaults to False.
|
|
431
|
+
"""
|
|
432
|
+
warnings.warn(
|
|
433
|
+
'Manage_table is deprecated and will be removed in v2.0.0; use ManageTable instead.',
|
|
434
|
+
DeprecationWarning,
|
|
435
|
+
stacklevel=2,
|
|
436
|
+
)
|
|
437
|
+
super().__init__(table, configs, verify=verify)
|
|
438
|
+
|
|
439
|
+
def delete_table(self) -> None:
|
|
440
|
+
"""
|
|
441
|
+
Drop the table from the database.
|
|
442
|
+
"""
|
|
443
|
+
super().delete_table()
|
|
444
|
+
|
|
445
|
+
def upload_data(self, df: pd.DataFrame, chunk_size: int = 10000, add_date: bool = True) -> None:
|
|
446
|
+
"""
|
|
447
|
+
Upload data from a pandas DataFrame to the database table.
|
|
448
|
+
|
|
449
|
+
Args:
|
|
450
|
+
df (pd.DataFrame): The pandas DataFrame containing the data to upload.
|
|
451
|
+
chunk_size (int, optional): The number of rows to insert per batch. Defaults to 10000.
|
|
452
|
+
add_date (bool, optional): Whether to add a 'rundate' column with the current date to the dataframe. Defaults to True.
|
|
453
|
+
"""
|
|
454
|
+
engine: Engine = _resolve_engine(self.configs if hasattr(self, 'configs') else self, autocommit=False)
|
|
455
|
+
|
|
456
|
+
with engine.begin() as connection:
|
|
457
|
+
if add_date:
|
|
458
|
+
df_copy: pd.DataFrame = df.copy()
|
|
459
|
+
df_copy['rundate'] = datetime.now().strftime('%Y-%m-%d')
|
|
460
|
+
else:
|
|
461
|
+
df_copy: pd.DataFrame = df
|
|
462
|
+
total_chunks: int = (len(df_copy) // chunk_size) + (0 if len(df_copy) % chunk_size == 0 else 1)
|
|
463
|
+
logger.info('try to upload data now, chunk_size is %s', chunk_size)
|
|
464
|
+
with tqdm(total=total_chunks, desc="Uploading Chunks", unit="chunk") as pbar:
|
|
465
|
+
try:
|
|
466
|
+
for start in range(0, len(df_copy), chunk_size):
|
|
467
|
+
end: int = min(start + chunk_size, len(df_copy))
|
|
468
|
+
chunk: pd.DataFrame = df_copy.iloc[start:end]
|
|
469
|
+
chunk.to_sql(name=self.table, con=connection, if_exists='append', index=False)
|
|
470
|
+
pbar.update(1)
|
|
471
|
+
except Exception as e:
|
|
472
|
+
logger.exception('an error occurred during upload: %s', e)
|
|
473
|
+
raise
|
|
@@ -178,9 +178,10 @@ def fs_write_df(sheet_address, df, feishu_cfg, loc='A1', clear_sheet=True):
|
|
|
178
178
|
if clear_resp.json().get('code') == 0:
|
|
179
179
|
logger.info('sheet cleared')
|
|
180
180
|
else:
|
|
181
|
-
|
|
181
|
+
raise RuntimeError(f"failed to clear sheet: {clear_resp.json().get('msg')}")
|
|
182
182
|
except Exception as e:
|
|
183
|
-
logger.
|
|
183
|
+
logger.exception('failed to clear sheet: %s', e)
|
|
184
|
+
raise
|
|
184
185
|
|
|
185
186
|
# 处理 DataFrame 数据类型
|
|
186
187
|
df_copy = df.copy()
|
|
@@ -305,8 +306,7 @@ def fs_write_base(sheet_address, df, feishu_cfg, clear_table=False):
|
|
|
305
306
|
existing_fields = _get_bitable_fields(app_token, table_id, header)
|
|
306
307
|
|
|
307
308
|
if not existing_fields:
|
|
308
|
-
|
|
309
|
-
return None
|
|
309
|
+
raise ValueError('could not fetch table fields or table has no fields')
|
|
310
310
|
|
|
311
311
|
logger.info('table has %s fields', len(existing_fields))
|
|
312
312
|
|
|
@@ -323,8 +323,7 @@ def fs_write_base(sheet_address, df, feishu_cfg, clear_table=False):
|
|
|
323
323
|
logger.warning('skip column: %s', field)
|
|
324
324
|
|
|
325
325
|
if not valid_fields:
|
|
326
|
-
|
|
327
|
-
return None
|
|
326
|
+
raise ValueError('no valid fields to write, all dataframe columns are missing in bitable')
|
|
328
327
|
|
|
329
328
|
logger.info('will write %s valid fields', len(valid_fields))
|
|
330
329
|
|
|
@@ -360,8 +359,9 @@ def fs_write_base(sheet_address, df, feishu_cfg, clear_table=False):
|
|
|
360
359
|
_request_with_retry("post", delete_url, headers=header, json_data=delete_data)
|
|
361
360
|
logger.info('deleted %s records', len(record_ids))
|
|
362
361
|
|
|
363
|
-
except Exception as e:
|
|
364
|
-
logger.
|
|
362
|
+
except Exception as e:
|
|
363
|
+
logger.exception('failed to clear table: %s', e)
|
|
364
|
+
raise
|
|
365
365
|
|
|
366
366
|
# 处理 DataFrame - 只保留有效字段
|
|
367
367
|
df_copy = df[list(valid_fields)].copy()
|
|
@@ -444,7 +444,7 @@ def fs_write_base(sheet_address, df, feishu_cfg, clear_table=False):
|
|
|
444
444
|
str_val = str(value)
|
|
445
445
|
if str_val and str_val != 'None' and str_val != 'nan':
|
|
446
446
|
fields[col] = str_val
|
|
447
|
-
except:
|
|
447
|
+
except Exception:
|
|
448
448
|
if col not in skipped_cols:
|
|
449
449
|
skipped_cols.add(col)
|
|
450
450
|
continue
|
|
@@ -457,6 +457,7 @@ def fs_write_base(sheet_address, df, feishu_cfg, clear_table=False):
|
|
|
457
457
|
# 批量写入(每次最多500条)
|
|
458
458
|
batch_size = 500
|
|
459
459
|
all_responses = []
|
|
460
|
+
failed_batches = []
|
|
460
461
|
|
|
461
462
|
for i in range(0, len(records), batch_size):
|
|
462
463
|
batch = records[i:i + batch_size]
|
|
@@ -473,9 +474,11 @@ def fs_write_base(sheet_address, df, feishu_cfg, clear_table=False):
|
|
|
473
474
|
logger.info('batch %s wrote %s records', i // batch_size + 1, len(batch))
|
|
474
475
|
else:
|
|
475
476
|
logger.error('failed to write batch: %s', response.get('msg', 'Unknown error'))
|
|
477
|
+
failed_batches.append((i // batch_size + 1, response.get('msg', 'Unknown error')))
|
|
476
478
|
|
|
477
479
|
except Exception as e:
|
|
478
480
|
logger.exception('failed to write batch: %s', e)
|
|
481
|
+
failed_batches.append((i // batch_size + 1, str(e)))
|
|
479
482
|
|
|
480
483
|
logger.info('write summary total records: %s', len(records))
|
|
481
484
|
logger.info('write summary fields written: %s', len(valid_fields))
|
|
@@ -483,6 +486,8 @@ def fs_write_base(sheet_address, df, feishu_cfg, clear_table=False):
|
|
|
483
486
|
logger.info('write summary fields skipped: %s', len(missing_fields))
|
|
484
487
|
for field in sorted(missing_fields):
|
|
485
488
|
logger.info('skip field: %s', field)
|
|
489
|
+
if failed_batches:
|
|
490
|
+
raise RuntimeError(f'bitable write failed for {len(failed_batches)} batch(es): {failed_batches}')
|
|
486
491
|
logger.info('data is written')
|
|
487
492
|
|
|
488
493
|
return all_responses
|
|
@@ -68,14 +68,16 @@ def gs_read_df(address, cfg=None, service_account_path=None):
|
|
|
68
68
|
return df
|
|
69
69
|
|
|
70
70
|
except gspread.exceptions.SpreadsheetNotFound:
|
|
71
|
-
|
|
72
|
-
|
|
71
|
+
message = f"spreadsheet '{spreadsheet_identifier}' not found"
|
|
72
|
+
logger.error(message)
|
|
73
|
+
raise ValueError(message)
|
|
73
74
|
except gspread.exceptions.WorksheetNotFound:
|
|
74
|
-
|
|
75
|
-
|
|
75
|
+
message = f"worksheet '{worksheet_name}' not found in spreadsheet"
|
|
76
|
+
logger.error(message)
|
|
77
|
+
raise ValueError(message)
|
|
76
78
|
except Exception as e:
|
|
77
79
|
logger.exception('an unexpected error occurred: %s', e)
|
|
78
|
-
|
|
80
|
+
raise
|
|
79
81
|
|
|
80
82
|
|
|
81
83
|
def gs_write_df(address, df, cfg=None, loc='A1', service_account_path=None):
|
|
@@ -105,8 +107,9 @@ def gs_write_df(address, df, cfg=None, loc='A1', service_account_path=None):
|
|
|
105
107
|
|
|
106
108
|
except gspread.exceptions.SpreadsheetNotFound:
|
|
107
109
|
if is_id:
|
|
108
|
-
|
|
109
|
-
|
|
110
|
+
message = f"spreadsheet ID '{spreadsheet_identifier}' not found, cannot create with specific ID"
|
|
111
|
+
logger.error(message)
|
|
112
|
+
raise ValueError(message)
|
|
110
113
|
else:
|
|
111
114
|
logger.info("spreadsheet '%s' not found, creating one", spreadsheet_identifier)
|
|
112
115
|
sh = gc.create(spreadsheet_identifier)
|
|
@@ -128,3 +131,4 @@ def gs_write_df(address, df, cfg=None, loc='A1', service_account_path=None):
|
|
|
128
131
|
logger.info('data is written')
|
|
129
132
|
except Exception as e:
|
|
130
133
|
logger.exception('failed to update worksheet: %s', e)
|
|
134
|
+
raise
|
|
@@ -8,6 +8,17 @@ import yaml
|
|
|
8
8
|
from .models import ActualTable
|
|
9
9
|
|
|
10
10
|
|
|
11
|
+
class _QuotedStringDumper(yaml.SafeDumper):
|
|
12
|
+
pass
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _quoted_string_representer(dumper, value):
|
|
16
|
+
return dumper.represent_scalar('tag:yaml.org,2002:str', value, style='"')
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
_QuotedStringDumper.add_representer(str, _quoted_string_representer)
|
|
20
|
+
|
|
21
|
+
|
|
11
22
|
def _table_to_payload(table: ActualTable) -> dict:
|
|
12
23
|
payload: dict = {
|
|
13
24
|
'table': table.table,
|
|
@@ -51,7 +62,13 @@ def write_pulled_schema(
|
|
|
51
62
|
target = out / f'{table.table}.yaml'
|
|
52
63
|
payload = _table_to_payload(table)
|
|
53
64
|
with target.open('w', encoding='utf-8') as f:
|
|
54
|
-
yaml.
|
|
65
|
+
yaml.dump(
|
|
66
|
+
payload,
|
|
67
|
+
f,
|
|
68
|
+
Dumper=_QuotedStringDumper,
|
|
69
|
+
sort_keys=False,
|
|
70
|
+
allow_unicode=True,
|
|
71
|
+
)
|
|
55
72
|
written.append(target)
|
|
56
73
|
return written
|
|
57
74
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tablemaster
|
|
3
|
-
Version: 2.1.0
|
|
3
|
+
Version: 2.1.2
|
|
4
4
|
Summary: tablemaster is a Python toolkit for moving and managing tabular data across databases, Feishu/Lark, Google Sheets, and local files with one consistent API.
|
|
5
5
|
Author-email: Livid <livid.su@gmail.com>
|
|
6
6
|
Project-URL: Homepage, https://github.com/ilivid/tablemaster
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
from types import SimpleNamespace
|
|
2
|
+
from unittest import TestCase
|
|
3
|
+
from unittest.mock import patch
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from tablemaster.database import ManageTable
|
|
8
|
+
from tablemaster.feishu import fs_write_base
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class _DummyResponse:
|
|
12
|
+
def __init__(self, body, status_code=200):
|
|
13
|
+
self._body = body
|
|
14
|
+
self.status_code = status_code
|
|
15
|
+
|
|
16
|
+
def json(self):
|
|
17
|
+
return self._body
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ErrorVisibilityTests(TestCase):
|
|
21
|
+
def setUp(self):
|
|
22
|
+
self.db_cfg = SimpleNamespace(
|
|
23
|
+
name='test_db',
|
|
24
|
+
user='u',
|
|
25
|
+
password='p',
|
|
26
|
+
host='127.0.0.1',
|
|
27
|
+
database='d',
|
|
28
|
+
db_type='mysql',
|
|
29
|
+
)
|
|
30
|
+
self.feishu_cfg = SimpleNamespace(feishu_app_id='id', feishu_app_secret='secret')
|
|
31
|
+
|
|
32
|
+
def test_manage_table_exists_propagates_errors(self):
|
|
33
|
+
table = ManageTable('orders', self.db_cfg)
|
|
34
|
+
with patch('tablemaster.database._resolve_engine', side_effect=RuntimeError('db unavailable')):
|
|
35
|
+
with self.assertRaises(RuntimeError):
|
|
36
|
+
table.exists()
|
|
37
|
+
|
|
38
|
+
def test_delete_table_propagates_errors(self):
|
|
39
|
+
table = ManageTable('orders', self.db_cfg)
|
|
40
|
+
with patch('tablemaster.database.opt', side_effect=RuntimeError('drop failed')):
|
|
41
|
+
with self.assertRaises(RuntimeError):
|
|
42
|
+
table.delete_table()
|
|
43
|
+
|
|
44
|
+
def test_fs_write_base_raises_when_batch_write_failed(self):
|
|
45
|
+
df = pd.DataFrame({'a': [1]})
|
|
46
|
+
|
|
47
|
+
with patch('tablemaster.feishu._get_tenant_access_token', return_value='token'):
|
|
48
|
+
with patch('tablemaster.feishu._get_bitable_fields', return_value={'a'}):
|
|
49
|
+
with patch(
|
|
50
|
+
'tablemaster.feishu._request_with_retry',
|
|
51
|
+
return_value=_DummyResponse({'code': 1001, 'msg': 'bad request'}),
|
|
52
|
+
):
|
|
53
|
+
with self.assertRaises(RuntimeError):
|
|
54
|
+
fs_write_base(['app_token', 'table_id'], df, self.feishu_cfg)
|
|
@@ -102,8 +102,35 @@ class SchemaCoreTests(unittest.TestCase):
|
|
|
102
102
|
paths = write_pulled_schema(tables, root / 'schema' / 'mydb')
|
|
103
103
|
self.assertEqual(1, len(paths))
|
|
104
104
|
content = paths[0].read_text(encoding='utf-8')
|
|
105
|
-
self.assertIn('table: orders', content)
|
|
106
|
-
self.assertIn('primary_key: true', content)
|
|
105
|
+
self.assertIn('"table": "orders"', content)
|
|
106
|
+
self.assertIn('"primary_key": true', content)
|
|
107
|
+
|
|
108
|
+
def test_pull_quotes_comment_with_colon(self):
|
|
109
|
+
with TemporaryDirectory() as td:
|
|
110
|
+
root = Path(td)
|
|
111
|
+
tables = [
|
|
112
|
+
ActualTable(
|
|
113
|
+
table='orders',
|
|
114
|
+
columns=[
|
|
115
|
+
ActualColumn(
|
|
116
|
+
name='id',
|
|
117
|
+
type='BIGINT',
|
|
118
|
+
nullable=False,
|
|
119
|
+
default=None,
|
|
120
|
+
comment='主键:业务单号',
|
|
121
|
+
primary_key=True,
|
|
122
|
+
)
|
|
123
|
+
],
|
|
124
|
+
indexes=[],
|
|
125
|
+
comment='订单:主表',
|
|
126
|
+
)
|
|
127
|
+
]
|
|
128
|
+
paths = write_pulled_schema(tables, root / 'schema' / 'mydb')
|
|
129
|
+
content = paths[0].read_text(encoding='utf-8')
|
|
130
|
+
self.assertIn('"comment": "订单:主表"', content)
|
|
131
|
+
loaded = load_schema_definitions(connection='mydb', root_dir=root / 'schema')
|
|
132
|
+
self.assertEqual('订单:主表', loaded[0].comment)
|
|
133
|
+
self.assertEqual('主键:业务单号', loaded[0].columns[0].comment)
|
|
107
134
|
|
|
108
135
|
|
|
109
136
|
if __name__ == '__main__':
|
|
@@ -1,286 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import logging
|
|
3
|
-
import re
|
|
4
|
-
import warnings
|
|
5
|
-
from functools import lru_cache
|
|
6
|
-
|
|
7
|
-
from sqlalchemy import create_engine, pool, text
|
|
8
|
-
import pandas as pd
|
|
9
|
-
from datetime import datetime
|
|
10
|
-
from tqdm import tqdm
|
|
11
|
-
from urllib.parse import quote_plus
|
|
12
|
-
|
|
13
|
-
logger = logging.getLogger(__name__)
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
def get_connect_args(configs):
|
|
17
|
-
"""
|
|
18
|
-
获取数据库连接参数,支持SSL和其他通用配置
|
|
19
|
-
|
|
20
|
-
Args:
|
|
21
|
-
configs: 配置对象,可以包含以下属性:
|
|
22
|
-
- use_ssl: 是否使用SSL (bool)
|
|
23
|
-
- ssl_ca: SSL证书路径 (str)
|
|
24
|
-
- connect_args: 自定义连接参数 (dict)
|
|
25
|
-
- db_type: 数据库类型 ('tidb', 'mysql' 等)
|
|
26
|
-
|
|
27
|
-
Returns:
|
|
28
|
-
dict: 连接参数字典
|
|
29
|
-
"""
|
|
30
|
-
connect_args = {}
|
|
31
|
-
|
|
32
|
-
if hasattr(configs, 'connect_args') and configs.connect_args:
|
|
33
|
-
connect_args = configs.connect_args.copy()
|
|
34
|
-
else:
|
|
35
|
-
use_ssl = getattr(configs, 'use_ssl', False)
|
|
36
|
-
db_type = getattr(configs, 'db_type', 'mysql').lower()
|
|
37
|
-
|
|
38
|
-
if db_type == 'tidb' or use_ssl:
|
|
39
|
-
ssl_ca = getattr(configs, 'ssl_ca', '/etc/ssl/cert.pem')
|
|
40
|
-
connect_args = {
|
|
41
|
-
'ssl': {
|
|
42
|
-
'ca': ssl_ca,
|
|
43
|
-
'check_hostname': False,
|
|
44
|
-
'verify_identity': False
|
|
45
|
-
}
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
return connect_args
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
def _build_conn_str(configs):
|
|
52
|
-
db_type = getattr(configs, 'db_type', 'mysql').lower()
|
|
53
|
-
password_encoded = quote_plus(configs.password)
|
|
54
|
-
match db_type:
|
|
55
|
-
case 'mysql' | 'tidb':
|
|
56
|
-
cf_port = getattr(configs, 'port', 3306)
|
|
57
|
-
return f'mysql+pymysql://{configs.user}:{password_encoded}@{configs.host}:{cf_port}/{configs.database}'
|
|
58
|
-
case 'postgresql':
|
|
59
|
-
cf_port = getattr(configs, 'port', 5432)
|
|
60
|
-
return f'postgresql+psycopg2://{configs.user}:{password_encoded}@{configs.host}:{cf_port}/{configs.database}'
|
|
61
|
-
case _:
|
|
62
|
-
raise ValueError(f'Unsupported db_type: {configs.db_type}')
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
@lru_cache(maxsize=16)
|
|
66
|
-
def _get_engine(conn_str, connect_args_json='{}', autocommit=False):
|
|
67
|
-
connect_args = json.loads(connect_args_json) if connect_args_json else {}
|
|
68
|
-
engine_kwargs = {
|
|
69
|
-
'connect_args': connect_args,
|
|
70
|
-
'poolclass': pool.QueuePool,
|
|
71
|
-
'pool_size': 5,
|
|
72
|
-
'max_overflow': 10,
|
|
73
|
-
'pool_pre_ping': True,
|
|
74
|
-
}
|
|
75
|
-
if autocommit:
|
|
76
|
-
engine_kwargs['isolation_level'] = 'AUTOCOMMIT'
|
|
77
|
-
return create_engine(conn_str, **engine_kwargs)
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
def _resolve_engine(configs, autocommit=False):
|
|
81
|
-
connection_string = _build_conn_str(configs)
|
|
82
|
-
connect_args = get_connect_args(configs)
|
|
83
|
-
connect_args_json = json.dumps(connect_args, sort_keys=True, default=str)
|
|
84
|
-
return _get_engine(connection_string, connect_args_json, autocommit)
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
def _safe_identifier(identifier):
|
|
88
|
-
if not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', identifier):
|
|
89
|
-
raise ValueError(f'Invalid identifier: {identifier}')
|
|
90
|
-
return identifier
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
def _safe_mysql_type(data_type):
|
|
94
|
-
normalized = data_type.strip()
|
|
95
|
-
if not re.match(r'^[A-Za-z0-9_,()\s]+$', normalized):
|
|
96
|
-
raise ValueError(f'Invalid data type expression: {data_type}')
|
|
97
|
-
return normalized
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
def query(sql, configs, params=None):
|
|
101
|
-
logger.info('try to connect to %s...', getattr(configs, 'name', 'database'))
|
|
102
|
-
engine = _resolve_engine(configs, autocommit=False)
|
|
103
|
-
with engine.connect() as conn:
|
|
104
|
-
statement = text(sql) if isinstance(sql, str) else sql
|
|
105
|
-
df = pd.read_sql(statement, conn, params=params)
|
|
106
|
-
logger.debug('query preview: %s', df.head())
|
|
107
|
-
return df
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
def opt(sql, configs, params=None):
|
|
111
|
-
logger.info('try to connect to %s...', getattr(configs, 'name', 'database'))
|
|
112
|
-
engine = _resolve_engine(configs, autocommit=True)
|
|
113
|
-
with engine.connect() as conn:
|
|
114
|
-
statement = text(sql) if isinstance(sql, str) else sql
|
|
115
|
-
conn.execute(statement, params or {})
|
|
116
|
-
logger.info('database execute success')
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
class ManageTable:
|
|
120
|
-
def __init__(self, table, configs, verify=False):
|
|
121
|
-
self.port = getattr(configs, 'port', 3306)
|
|
122
|
-
self.table = table
|
|
123
|
-
self.name = configs.name
|
|
124
|
-
self.user = configs.user
|
|
125
|
-
self.password = configs.password
|
|
126
|
-
self.host = configs.host
|
|
127
|
-
self.database = configs.database
|
|
128
|
-
self.configs = configs
|
|
129
|
-
if verify:
|
|
130
|
-
self._check_exists()
|
|
131
|
-
|
|
132
|
-
def _check_exists(self):
|
|
133
|
-
if not self.exists():
|
|
134
|
-
raise ValueError(f'table not found: {self.table}')
|
|
135
|
-
logger.info('table exists: %s', self.table)
|
|
136
|
-
|
|
137
|
-
def exists(self):
|
|
138
|
-
safe_table = _safe_identifier(self.table)
|
|
139
|
-
check_sql = text(f'SELECT 1 FROM `{safe_table}` LIMIT 1')
|
|
140
|
-
try:
|
|
141
|
-
opt(check_sql, self)
|
|
142
|
-
return True
|
|
143
|
-
except Exception:
|
|
144
|
-
return False
|
|
145
|
-
|
|
146
|
-
def delete_table(self):
|
|
147
|
-
safe_table = _safe_identifier(self.table)
|
|
148
|
-
try:
|
|
149
|
-
opt(text(f'DROP TABLE `{safe_table}`'), self)
|
|
150
|
-
logger.info('%s deleted', self.table)
|
|
151
|
-
except Exception:
|
|
152
|
-
logger.exception('table was not deleted')
|
|
153
|
-
|
|
154
|
-
def par_del(self, clause, params=None):
|
|
155
|
-
safe_table = _safe_identifier(self.table)
|
|
156
|
-
del_clause = text(f'DELETE FROM `{safe_table}` WHERE {clause}')
|
|
157
|
-
opt(del_clause, self, params=params)
|
|
158
|
-
logger.info('records deleted by clause: %s', clause)
|
|
159
|
-
|
|
160
|
-
def change_data_type(self, cols_name, data_type):
|
|
161
|
-
safe_table = _safe_identifier(self.table)
|
|
162
|
-
safe_col = _safe_identifier(cols_name)
|
|
163
|
-
safe_type = _safe_mysql_type(data_type)
|
|
164
|
-
change_clause = text(f'ALTER TABLE `{safe_table}` MODIFY COLUMN `{safe_col}` {safe_type}')
|
|
165
|
-
opt(change_clause, self)
|
|
166
|
-
logger.info('%s changed to %s successfully', cols_name, data_type)
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
def upload_data(self, df, chunk_size=10000, add_date=False):
|
|
170
|
-
engine = _resolve_engine(self.configs if hasattr(self, 'configs') else self, autocommit=False)
|
|
171
|
-
|
|
172
|
-
with engine.begin() as connection:
|
|
173
|
-
if add_date:
|
|
174
|
-
df_copy = df.copy()
|
|
175
|
-
df_copy['rundate'] = datetime.now().strftime('%Y-%m-%d')
|
|
176
|
-
else:
|
|
177
|
-
df_copy = df
|
|
178
|
-
total_chunks = (len(df_copy) // chunk_size) + (0 if len(df_copy) % chunk_size == 0 else 1)
|
|
179
|
-
logger.info('try to upload data now, chunk_size is %s', chunk_size)
|
|
180
|
-
with tqdm(total=total_chunks, desc="Uploading Chunks", unit="chunk") as pbar:
|
|
181
|
-
try:
|
|
182
|
-
for start in range(0, len(df_copy), chunk_size):
|
|
183
|
-
end = min(start + chunk_size, len(df_copy))
|
|
184
|
-
chunk = df_copy.iloc[start:end]
|
|
185
|
-
chunk.to_sql(name=self.table, con=connection, if_exists='append', index=False)
|
|
186
|
-
pbar.update(1)
|
|
187
|
-
except Exception as e:
|
|
188
|
-
logger.exception('an error occurred during upload: %s', e)
|
|
189
|
-
|
|
190
|
-
def upsert_data(self, df, chunk_size=10000, add_date=False, ignore=False, key=None):
|
|
191
|
-
engine = _resolve_engine(self.configs if hasattr(self, 'configs') else self, autocommit=False)
|
|
192
|
-
db_type = getattr(self.configs if hasattr(self, 'configs') else self, 'db_type', 'mysql').lower()
|
|
193
|
-
|
|
194
|
-
with engine.begin() as connection:
|
|
195
|
-
if add_date:
|
|
196
|
-
df_copy = df.copy()
|
|
197
|
-
df_copy['rundate'] = datetime.now().strftime('%Y-%m-%d')
|
|
198
|
-
else:
|
|
199
|
-
df_copy = df
|
|
200
|
-
|
|
201
|
-
total_chunks = (len(df_copy) // chunk_size) + (0 if len(df_copy) % chunk_size == 0 else 1)
|
|
202
|
-
logger.info('trying to upload data now, chunk_size is %s', chunk_size)
|
|
203
|
-
|
|
204
|
-
with tqdm(total=total_chunks, desc="Uploading Chunks", unit="chunk") as pbar:
|
|
205
|
-
for start in range(0, len(df_copy), chunk_size):
|
|
206
|
-
end = min(start + chunk_size, len(df_copy))
|
|
207
|
-
chunk = df_copy.iloc[start:end]
|
|
208
|
-
columns = chunk.columns.tolist()
|
|
209
|
-
value_placeholders = ', '.join([f':{col}' for col in columns])
|
|
210
|
-
|
|
211
|
-
try:
|
|
212
|
-
if ignore == False:
|
|
213
|
-
if db_type in ('mysql', 'tidb'):
|
|
214
|
-
update_columns = ', '.join([f"`{col}`=VALUES(`{col}`)" for col in columns])
|
|
215
|
-
insert_sql = f"""
|
|
216
|
-
INSERT INTO {self.table} ({', '.join([f'`{col}`' for col in columns])})
|
|
217
|
-
VALUES ({value_placeholders})
|
|
218
|
-
ON DUPLICATE KEY UPDATE {update_columns}
|
|
219
|
-
"""
|
|
220
|
-
elif db_type == 'postgresql':
|
|
221
|
-
if not key:
|
|
222
|
-
raise ValueError('key is required for postgresql upsert')
|
|
223
|
-
safe_key = _safe_identifier(key)
|
|
224
|
-
safe_columns = [_safe_identifier(col) for col in columns]
|
|
225
|
-
quoted_columns = ', '.join([f'"{col}"' for col in safe_columns])
|
|
226
|
-
update_columns = ', '.join(
|
|
227
|
-
[f'"{col}"=EXCLUDED."{col}"' for col in safe_columns if col != safe_key]
|
|
228
|
-
)
|
|
229
|
-
if update_columns:
|
|
230
|
-
insert_sql = f"""
|
|
231
|
-
INSERT INTO {self.table} ({quoted_columns})
|
|
232
|
-
VALUES ({value_placeholders})
|
|
233
|
-
ON CONFLICT ("{safe_key}") DO UPDATE SET {update_columns}
|
|
234
|
-
"""
|
|
235
|
-
else:
|
|
236
|
-
insert_sql = f"""
|
|
237
|
-
INSERT INTO {self.table} ({quoted_columns})
|
|
238
|
-
VALUES ({value_placeholders})
|
|
239
|
-
ON CONFLICT ("{safe_key}") DO NOTHING
|
|
240
|
-
"""
|
|
241
|
-
else:
|
|
242
|
-
raise ValueError(f'Unsupported db_type for upsert: {db_type}')
|
|
243
|
-
else:
|
|
244
|
-
insert_sql = f"""
|
|
245
|
-
INSERT IGNORE INTO {self.table} ({', '.join([f'`{col}`' for col in columns])})
|
|
246
|
-
VALUES ({value_placeholders})
|
|
247
|
-
"""
|
|
248
|
-
|
|
249
|
-
data = chunk.where(pd.notna(chunk), None).to_dict(orient='records')
|
|
250
|
-
connection.execute(text(insert_sql), data)
|
|
251
|
-
pbar.update(1)
|
|
252
|
-
except Exception as e:
|
|
253
|
-
logger.exception('an error occurred during upsert: %s', e)
|
|
254
|
-
|
|
255
|
-
class Manage_table(ManageTable):
|
|
256
|
-
def __init__(self, table, configs, verify=False):
|
|
257
|
-
warnings.warn(
|
|
258
|
-
'Manage_table is deprecated and will be removed in v2.0.0; use ManageTable instead.',
|
|
259
|
-
DeprecationWarning,
|
|
260
|
-
stacklevel=2,
|
|
261
|
-
)
|
|
262
|
-
super().__init__(table, configs, verify=verify)
|
|
263
|
-
|
|
264
|
-
def delete_table(self):
|
|
265
|
-
super().delete_table()
|
|
266
|
-
|
|
267
|
-
def upload_data(self, df, chunk_size=10000, add_date=True):
|
|
268
|
-
engine = _resolve_engine(self.configs if hasattr(self, 'configs') else self, autocommit=False)
|
|
269
|
-
|
|
270
|
-
with engine.begin() as connection:
|
|
271
|
-
if add_date:
|
|
272
|
-
df_copy = df.copy()
|
|
273
|
-
df_copy['rundate'] = datetime.now().strftime('%Y-%m-%d')
|
|
274
|
-
else:
|
|
275
|
-
df_copy = df
|
|
276
|
-
total_chunks = (len(df_copy) // chunk_size) + (0 if len(df_copy) % chunk_size == 0 else 1)
|
|
277
|
-
logger.info('try to upload data now, chunk_size is %s', chunk_size)
|
|
278
|
-
with tqdm(total=total_chunks, desc="Uploading Chunks", unit="chunk") as pbar:
|
|
279
|
-
try:
|
|
280
|
-
for start in range(0, len(df_copy), chunk_size):
|
|
281
|
-
end = min(start + chunk_size, len(df_copy))
|
|
282
|
-
chunk = df_copy.iloc[start:end]
|
|
283
|
-
chunk.to_sql(name=self.table, con=connection, if_exists='append', index=False)
|
|
284
|
-
pbar.update(1)
|
|
285
|
-
except Exception as e:
|
|
286
|
-
logger.exception('an error occurred during upload: %s', e)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|