pyposconnector 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyposconnector-0.1.0/LICENSE +21 -0
- pyposconnector-0.1.0/PKG-INFO +122 -0
- pyposconnector-0.1.0/README.md +101 -0
- pyposconnector-0.1.0/pyproject.toml +34 -0
- pyposconnector-0.1.0/src/postgres_connector.py +232 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 johnnyb1509
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pyposconnector
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Ultimate PostgreSQL Connector with TimescaleDB and pgvector support for Python 3.12+
|
|
5
|
+
License-File: LICENSE
|
|
6
|
+
Author: MinhSonCQF
|
|
7
|
+
Author-email: nguyen.minhson1511@gmail.com
|
|
8
|
+
Requires-Python: >=3.12,<4.0
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
13
|
+
Requires-Dist: loguru (>=0.7.3,<0.8.0)
|
|
14
|
+
Requires-Dist: pandas (>=3.0.1,<4.0.0)
|
|
15
|
+
Requires-Dist: pgvector (>=0.4.2,<0.5.0)
|
|
16
|
+
Requires-Dist: psycopg2-binary (>=2.9.11,<3.0.0)
|
|
17
|
+
Requires-Dist: sqlalchemy (>=2.0.48,<3.0.0)
|
|
18
|
+
Project-URL: Homepage, https://github.com/johnnyb1509/PostgreSQLConnector
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
# 🚀 PostgresConnector (Ultimate Edition)
|
|
22
|
+
|
|
23
|
+
## 📖 Introduction
|
|
24
|
+
Welcome to **PostgresConnector**, the ultimate database connection package built for our team's data engineering and AI workflows.
|
|
25
|
+
|
|
26
|
+
This package simplifies interactions with PostgreSQL databases by automating tedious tasks like schema evolution, data type mapping, and bulk upserts. It goes beyond standard SQL by providing native, out-of-the-box support for **TimescaleDB** (for time-series data) and **pgvector** (for AI embeddings).
|
|
27
|
+
|
|
28
|
+
### ✨ Key Features
|
|
29
|
+
* **Smart Upsert (`ON CONFLICT DO UPDATE`):** Blazing fast data ingestion with conflict resolution strategies (`last`, `sum`, `skip`).
|
|
30
|
+
* **Auto Schema Evolution:** Automatically adds missing columns to your database tables based on your Pandas DataFrames.
|
|
31
|
+
* **Native JSONB Support:** Automatically detects nested Python dictionaries/lists and maps them to PostgreSQL `JSONB` format.
|
|
32
|
+
* **TimescaleDB Integration:** Easily convert standard tables into hypertables for optimized time-series data storage.
|
|
33
|
+
* **pgvector for AI:** Automatically detects lists of floats (embeddings) and creates Vector columns with HNSW/IVFFlat indexing for fast similarity searches.
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## 📂 Directory Structure
|
|
38
|
+
|
|
39
|
+
This project is managed using [Poetry](https://python-poetry.org/). The standard structure looks like this:
|
|
40
|
+
|
|
41
|
+
```text
|
|
42
|
+
PostgreSQLConnector/
|
|
43
|
+
│
|
|
44
|
+
├── pyproject.toml # Poetry configuration, metadata, and dependencies
|
|
45
|
+
├── README.md # This documentation file
|
|
46
|
+
├── src/ # The actual Python module
|
|
47
|
+
│ ├── __init__.py
|
|
48
|
+
│ └── postgres_connector.py
|
|
49
|
+
└── notebooks/ # (Optional) Tutorials and examples
|
|
50
|
+
└── Tutorial.ipynb
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## 💻 Installation
|
|
55
|
+
This package is published on PyPI. You can easily install it into your project using your preferred package manager.
|
|
56
|
+
|
|
57
|
+
Using Poetry (Recommended):
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
poetry add pyposconnector
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Using pip:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install pyposconnector
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## 🛠️ Dependencies
|
|
70
|
+
This package relies on several powerful Python libraries to function properly.
|
|
71
|
+
|
|
72
|
+
```pandas``` - For data manipulation and structures.
|
|
73
|
+
|
|
74
|
+
```SQLAlchemy``` - For database connection and ORM capabilities.
|
|
75
|
+
|
|
76
|
+
```psycopg2-binary``` - The most popular PostgreSQL adapter for Python.
|
|
77
|
+
|
|
78
|
+
```pgvector``` - For handling vector data types and AI embeddings in SQLAlchemy.
|
|
79
|
+
|
|
80
|
+
```loguru``` - For beautiful, easy-to-read logging.
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
## 🚀 Quick Start
|
|
84
|
+
Here is a quick example of how to connect and upsert data using the connector:
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
import pandas as pd
|
|
88
|
+
from postgres_connector import PostgresConnector
|
|
89
|
+
|
|
90
|
+
# 1. Initialize the connection
|
|
91
|
+
pg = PostgresConnector(
|
|
92
|
+
host='localhost',
|
|
93
|
+
database='my_database',
|
|
94
|
+
username='my_user',
|
|
95
|
+
password='my_password'
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
# 2. Prepare your data
|
|
99
|
+
data = {
|
|
100
|
+
'id': [1, 2],
|
|
101
|
+
'name': ['Alice', 'Bob'],
|
|
102
|
+
'role': ['Admin', 'User']
|
|
103
|
+
}
|
|
104
|
+
df = pd.DataFrame(data)
|
|
105
|
+
|
|
106
|
+
# 3. Upsert into the database (Creates table if it doesn't exist!)
|
|
107
|
+
pg.upsert_data(
|
|
108
|
+
df=df,
|
|
109
|
+
target_table='team_members',
|
|
110
|
+
primary_key='id'
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
# 4. Close the connection
|
|
114
|
+
pg.dispose()
|
|
115
|
+
```
|
|
116
|
+
For more advanced use cases, including **TimescaleDB** and **pgvector** for AI embeddings, please refer to the Tutorial.ipynb file included in this repository.
|
|
117
|
+
|
|
118
|
+
## 👨‍💻 Creator
|
|
119
|
+
Created by: Nguyen Minh Son, CQF (MinhSonCQF)
|
|
120
|
+
|
|
121
|
+
Contact / Support: nguyen.minhson1511@gmail.com
|
|
122
|
+
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# 🚀 PostgresConnector (Ultimate Edition)
|
|
2
|
+
|
|
3
|
+
## 📖 Introduction
|
|
4
|
+
Welcome to **PostgresConnector**, the ultimate database connection package built for our team's data engineering and AI workflows.
|
|
5
|
+
|
|
6
|
+
This package simplifies interactions with PostgreSQL databases by automating tedious tasks like schema evolution, data type mapping, and bulk upserts. It goes beyond standard SQL by providing native, out-of-the-box support for **TimescaleDB** (for time-series data) and **pgvector** (for AI embeddings).
|
|
7
|
+
|
|
8
|
+
### ✨ Key Features
|
|
9
|
+
* **Smart Upsert (`ON CONFLICT DO UPDATE`):** Blazing fast data ingestion with conflict resolution strategies (`last`, `sum`, `skip`).
|
|
10
|
+
* **Auto Schema Evolution:** Automatically adds missing columns to your database tables based on your Pandas DataFrames.
|
|
11
|
+
* **Native JSONB Support:** Automatically detects nested Python dictionaries/lists and maps them to PostgreSQL `JSONB` format.
|
|
12
|
+
* **TimescaleDB Integration:** Easily convert standard tables into hypertables for optimized time-series data storage.
|
|
13
|
+
* **pgvector for AI:** Automatically detects lists of floats (embeddings) and creates Vector columns with HNSW/IVFFlat indexing for fast similarity searches.
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## 📂 Directory Structure
|
|
18
|
+
|
|
19
|
+
This project is managed using [Poetry](https://python-poetry.org/). The standard structure looks like this:
|
|
20
|
+
|
|
21
|
+
```text
|
|
22
|
+
PostgreSQLConnector/
|
|
23
|
+
│
|
|
24
|
+
├── pyproject.toml # Poetry configuration, metadata, and dependencies
|
|
25
|
+
├── README.md # This documentation file
|
|
26
|
+
├── src/ # The actual Python module
|
|
27
|
+
│ ├── __init__.py
|
|
28
|
+
│ └── postgres_connector.py
|
|
29
|
+
└── notebooks/ # (Optional) Tutorials and examples
|
|
30
|
+
└── Tutorial.ipynb
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## 💻 Installation
|
|
35
|
+
This package is published on PyPI. You can easily install it into your project using your preferred package manager.
|
|
36
|
+
|
|
37
|
+
Using Poetry (Recommended):
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
poetry add pyposconnector
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Using pip:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install pyposconnector
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## 🛠️ Dependencies
|
|
50
|
+
This package relies on several powerful Python libraries to function properly.
|
|
51
|
+
|
|
52
|
+
```pandas``` - For data manipulation and structures.
|
|
53
|
+
|
|
54
|
+
```SQLAlchemy``` - For database connection and ORM capabilities.
|
|
55
|
+
|
|
56
|
+
```psycopg2-binary``` - The most popular PostgreSQL adapter for Python.
|
|
57
|
+
|
|
58
|
+
```pgvector``` - For handling vector data types and AI embeddings in SQLAlchemy.
|
|
59
|
+
|
|
60
|
+
```loguru``` - For beautiful, easy-to-read logging.
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
## 🚀 Quick Start
|
|
64
|
+
Here is a quick example of how to connect and upsert data using the connector:
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
import pandas as pd
|
|
68
|
+
from postgres_connector import PostgresConnector
|
|
69
|
+
|
|
70
|
+
# 1. Initialize the connection
|
|
71
|
+
pg = PostgresConnector(
|
|
72
|
+
host='localhost',
|
|
73
|
+
database='my_database',
|
|
74
|
+
username='my_user',
|
|
75
|
+
password='my_password'
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
# 2. Prepare your data
|
|
79
|
+
data = {
|
|
80
|
+
'id': [1, 2],
|
|
81
|
+
'name': ['Alice', 'Bob'],
|
|
82
|
+
'role': ['Admin', 'User']
|
|
83
|
+
}
|
|
84
|
+
df = pd.DataFrame(data)
|
|
85
|
+
|
|
86
|
+
# 3. Upsert into the database (Creates table if it doesn't exist!)
|
|
87
|
+
pg.upsert_data(
|
|
88
|
+
df=df,
|
|
89
|
+
target_table='team_members',
|
|
90
|
+
primary_key='id'
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
# 4. Close the connection
|
|
94
|
+
pg.dispose()
|
|
95
|
+
```
|
|
96
|
+
For more advanced use cases, including **TimescaleDB** and **pgvector** for AI embeddings, please refer to the Tutorial.ipynb file included in this repository.
|
|
97
|
+
|
|
98
|
+
## 👨‍💻 Creator
|
|
99
|
+
Created by: Nguyen Minh Son, CQF (MinhSonCQF)
|
|
100
|
+
|
|
101
|
+
Contact / Support: nguyen.minhson1511@gmail.com
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "pyposconnector"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Ultimate PostgreSQL Connector with TimescaleDB and pgvector support for Python 3.12+"
|
|
5
|
+
authors = [
|
|
6
|
+
{name = "MinhSonCQF", email = "nguyen.minhson1511@gmail.com"}
|
|
7
|
+
]
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
requires-python = ">=3.12,<4.0"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"pandas (>=3.0.1,<4.0.0)",
|
|
12
|
+
"sqlalchemy (>=2.0.48,<3.0.0)",
|
|
13
|
+
"psycopg2-binary (>=2.9.11,<3.0.0)",
|
|
14
|
+
"pgvector (>=0.4.2,<0.5.0)",
|
|
15
|
+
"loguru (>=0.7.3,<0.8.0)"
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[tool.poetry]
|
|
19
|
+
# The package declaration lives here and names the single .py module explicitly
|
|
20
|
+
packages = [
|
|
21
|
+
{ include = "postgres_connector.py", from = "src" }
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[build-system]
|
|
25
|
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
|
26
|
+
build-backend = "poetry.core.masonry.api"
|
|
27
|
+
|
|
28
|
+
[dependency-groups]
|
|
29
|
+
dev = [
|
|
30
|
+
"pytest (>=9.0.2,<10.0.0)"
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.urls]
|
|
34
|
+
"Homepage" = "https://github.com/johnnyb1509/PostgreSQLConnector"
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import numpy as np
|
|
4
|
+
import uuid
|
|
5
|
+
from typing import List, Optional, Dict, Union, Literal
|
|
6
|
+
from loguru import logger
|
|
7
|
+
from sqlalchemy import create_engine, text, URL, inspect, MetaData, Table
|
|
8
|
+
from sqlalchemy.types import BIGINT, DATE
|
|
9
|
+
from sqlalchemy.dialects.postgresql import JSONB, DOUBLE_PRECISION, TEXT, TIMESTAMP, insert
|
|
10
|
+
from pgvector.sqlalchemy import Vector # Cần cài đặt thư viện pgvector
|
|
11
|
+
|
|
12
|
+
class PostgresConnector:
    """
    Standardized PostgreSQL connector (Ultimate Version).

    Features:
    - Core: Fast Execute, Schema Evolution, Native PostgreSQL types (TEXT, JSONB).
    - Upsert: uses PostgreSQL's native ON CONFLICT DO UPDATE.
    - Extensions: TimescaleDB (hypertables) for time-series and pgvector for AI.
    - Strategies: 'last' (Update), 'skip' (Ignore), 'sum' (Aggregate numeric).
    """

    def __init__(self, host: str, database: str,
                 username: str, password: str, port: int = 5432,
                 **kwargs):
        """Build the connection URL and create the SQLAlchemy engine.

        Args:
            host: Database server host name.
            database: Name of the database to connect to.
            username: Login role.
            password: Password for ``username``.
            port: Server port, 5432 by default.
            **kwargs: Accepted but not used by this constructor.
        """
        self.host = host
        self.database = database
        self.username = username
        self.password = password
        self.port = port

        # URL.create escapes special characters in the credentials.
        self.connection_url = URL.create(
            "postgresql+psycopg2",
            username=self.username,
            password=self.password,
            host=self.host,
            port=self.port,
            database=self.database
        )

        self.engine = create_engine(
            self.connection_url,
            pool_pre_ping=True,
            insertmanyvalues_page_size=10000  # larger pages for bulk inserts
        )

    @staticmethod
    def _quote_ident(name: str) -> str:
        """Return ``name`` as a double-quoted SQL identifier, doubling embedded quotes."""
        return '"' + str(name).replace('"', '""') + '"'

    @staticmethod
    def _coerce_date_like_columns(df: pd.DataFrame) -> pd.DataFrame:
        """Convert, in place, text columns that look like ISO dates to datetime.

        A column is tried when at least one of its values starts with
        ``YYYY-MM-DD``. If the column as a whole cannot be parsed it is left
        untouched. Returns the same frame for convenience.
        """
        for col in df.columns:
            series = df[col]
            is_text = (pd.api.types.is_object_dtype(series)
                       or pd.api.types.is_string_dtype(series))
            if not is_text:
                continue
            if not series.astype(str).str.match(r'^\d{4}-\d{2}-\d{2}').any():
                continue
            try:
                # errors='ignore' is not available in the pandas versions this
                # package requires, so the fallback is done by hand.
                df[col] = pd.to_datetime(series)
            except (ValueError, TypeError, OverflowError):
                pass  # not a clean date column: keep the original values
        return df

    def execute_query(self, query: str, params: Optional[Dict] = None):
        """Run a statement that returns no rows (e.g. CREATE EXTENSION, DROP TABLE).

        The statement runs inside ``engine.begin()``, so it is committed on
        success. Errors are logged and re-raised.
        """
        try:
            with self.engine.begin() as conn:
                conn.execute(text(query), params or {})
        except Exception as e:
            logger.error(f"Execute query error: {e}")
            raise

    def get_data(self, query: str, params: Optional[Dict] = None) -> pd.DataFrame:
        """Run ``query`` and return the result as a DataFrame.

        Errors are logged and re-raised.
        """
        try:
            with self.engine.connect() as conn:
                return pd.read_sql(text(query), conn, params=params)
        except Exception as e:
            logger.error(f"Get data error: {e}")
            raise

    def _generate_dtype_mapping(self, df: pd.DataFrame) -> Dict:
        """Map DataFrame columns to SQL types, detecting JSONB and VECTOR columns.

        Detection of list/dict columns looks only at the first non-null value
        of each column. Columns matching no rule are omitted from the result.
        """
        number_types = (int, float, np.integer, np.floating)
        dtype_map = {}
        for col in df.columns:
            series = df[col]
            non_null = series.dropna()
            sample_val = non_null.iloc[0] if not non_null.empty else None

            # 1. Vector: a non-empty list of real numbers. An empty list would
            #    give an invalid zero-dimension vector, and bool is a subclass
            #    of int, so both are excluded and fall through to JSONB.
            is_vector = (
                isinstance(sample_val, list)
                and len(sample_val) > 0
                and all(
                    isinstance(x, number_types) and not isinstance(x, (bool, np.bool_))
                    for x in sample_val
                )
            )
            if is_vector:
                dtype_map[col] = Vector(len(sample_val))
            # 2. JSONB: dicts, or any other list
            elif isinstance(sample_val, (dict, list)):
                dtype_map[col] = JSONB()
            # 3. Basic types
            elif pd.api.types.is_string_dtype(series) or series.dtype == 'object':
                dtype_map[col] = TEXT()
            elif pd.api.types.is_datetime64_any_dtype(series):
                # Keep the offset of tz-aware columns instead of dropping it.
                tz_aware = isinstance(series.dtype, pd.DatetimeTZDtype)
                dtype_map[col] = TIMESTAMP(timezone=True) if tz_aware else TIMESTAMP()
            elif pd.api.types.is_float_dtype(series):
                dtype_map[col] = DOUBLE_PRECISION()
            elif pd.api.types.is_integer_dtype(series):
                dtype_map[col] = BIGINT()

        return dtype_map

    def _get_table_columns(self, table_name: str, conn) -> List[str]:
        """Return the column names of ``table_name`` as reported by the inspector."""
        inspector = inspect(conn)
        return [col['name'] for col in inspector.get_columns(table_name)]

    def _add_missing_columns(self, table_name: str, missing_cols: List[str], dtype_map: Dict, conn):
        """Issue one ``ALTER TABLE ... ADD COLUMN`` per entry in ``missing_cols``.

        The column type comes from ``dtype_map``; columns without an entry
        are added as TEXT.
        """
        quote = self._quote_ident
        for col in missing_cols:
            col_type = dtype_map.get(col, TEXT())
            type_str = "TEXT"

            if isinstance(col_type, DOUBLE_PRECISION):
                type_str = "DOUBLE PRECISION"
            elif isinstance(col_type, BIGINT):
                type_str = "BIGINT"
            elif isinstance(col_type, TIMESTAMP):
                type_str = "TIMESTAMP WITH TIME ZONE" if col_type.timezone else "TIMESTAMP"
            elif isinstance(col_type, JSONB):
                type_str = "JSONB"
            elif isinstance(col_type, Vector):
                type_str = f"VECTOR({col_type.dim})"

            conn.execute(text(f'ALTER TABLE {quote(table_name)} ADD COLUMN {quote(col)} {type_str}'))
            logger.info(f"Auto-evolve: Added column '{col}' to '{table_name}'")

    # ==========================================
    # DATABASE EXTENSIONS (Timescale & pgvector)
    # ==========================================

    def setup_extensions(self):
        """Create the ``vector`` and ``timescaledb`` extensions if they do not exist."""
        self.execute_query("CREATE EXTENSION IF NOT EXISTS vector;")
        # Requires the TimescaleDB plugin to be installed on the server.
        self.execute_query("CREATE EXTENSION IF NOT EXISTS timescaledb CASCADE;")
        logger.info("Checked and created 'vector' & 'timescaledb' extensions if applicable.")

    def enable_timescaledb(self, table_name: str, time_column: str, chunk_time_interval: str = '1 day'):
        """
        Convert a table into a TimescaleDB hypertable.

        Suited to daily crawl data, logs, IoT and market data.
        Failures are logged, not raised.
        """
        # Values are bound as parameters rather than formatted into the SQL text.
        sql = (
            "SELECT create_hypertable(CAST(:table_name AS regclass), :time_column, "
            "chunk_time_interval => CAST(:chunk_time_interval AS INTERVAL), "
            "if_not_exists => TRUE);"
        )
        try:
            self.execute_query(sql, {
                "table_name": table_name,
                "time_column": time_column,
                "chunk_time_interval": chunk_time_interval,
            })
            logger.success(f"Converted {table_name} to TimescaleDB Hypertable on column {time_column}.")
        except Exception as e:
            logger.error(f"Could not convert to Hypertable (is TimescaleDB installed?): {e}")

    def create_vector_index(self, table_name: str, vector_column: str, index_type: Literal['hnsw', 'ivfflat'] = 'hnsw'):
        """Create an L2 index on a vector column to speed up similarity search.

        Any ``index_type`` other than ``'hnsw'`` builds an ivfflat index with
        ``lists = 100``. Failures are logged, not raised.
        """
        quote = self._quote_ident
        # Lower-cased before quoting so the name equals what the previously
        # unquoted identifier folded to.
        index_name = quote(f"idx_{table_name}_{vector_column}_{index_type}".lower())
        try:
            if index_type == 'hnsw':
                sql = f'CREATE INDEX IF NOT EXISTS {index_name} ON {quote(table_name)} USING hnsw ({quote(vector_column)} vector_l2_ops);'
            else:
                sql = f'CREATE INDEX IF NOT EXISTS {index_name} ON {quote(table_name)} USING ivfflat ({quote(vector_column)} vector_l2_ops) WITH (lists = 100);'
            self.execute_query(sql)
            logger.success(f"Created {index_type} index on {table_name}.{vector_column}")
        except Exception as e:
            logger.error(f"Failed to create vector index: {e}")

    # ==========================================
    # CORE OPERATIONS (Upsert, Replace)
    # ==========================================

    def upsert_data(self,
                    df: pd.DataFrame,
                    target_table: str,
                    primary_key: Optional[Union[str, List[str]]] = None,
                    auto_evolve_schema: bool = True,
                    conflict_strategy: Literal['sum', 'last', 'skip'] = 'last'):
        """
        Upsert ``df`` into ``target_table`` with PostgreSQL's ON CONFLICT clause.

        Args:
            df: Rows to write. An empty frame is a no-op.
            target_table: Destination table; created (with a primary key on
                ``primary_key``) when it does not exist.
            primary_key: Conflict column(s). Without keys the rows are
                appended with ``DataFrame.to_sql``.
            auto_evolve_schema: Add columns missing from the table when True;
                otherwise drop them from the frame before writing.
            conflict_strategy: 'skip' keeps the existing row; 'sum' adds
                numeric columns to the stored value and overwrites the rest;
                any other value overwrites non-key columns.

        Raises:
            Exception: any error from the upsert transaction is logged and
                re-raised.
        """
        if df.empty:
            return
        df = df.copy()

        join_keys = [primary_key] if isinstance(primary_key, str) else list(primary_key or [])
        if not join_keys:
            logger.warning("No keys provided. Switch to APPEND mode.")
            df.to_sql(target_table, self.engine, if_exists='append', index=False)
            return

        # Convert date-like text columns to datetime
        self._coerce_date_like_columns(df)

        dtype_mapping = self._generate_dtype_mapping(df)
        quote = self._quote_ident

        try:
            with self.engine.begin() as conn:
                # 1. Create the table if it does not exist yet
                inspector = inspect(conn)
                if not inspector.has_table(target_table):
                    df.head(0).to_sql(target_table, conn, index=False, dtype=dtype_mapping)
                    pk_str = ", ".join(quote(c) for c in join_keys)
                    conn.execute(text(f'ALTER TABLE {quote(target_table)} ADD PRIMARY KEY ({pk_str})'))
                    logger.info(f"Created new table {target_table} with PK {join_keys}")

                # 2. Schema evolution (column names compared case-insensitively)
                # NOTE(review): a frame column that differs from a table column
                # only by case is treated as existing but is not renamed —
                # confirm the insert below accepts it.
                db_cols = self._get_table_columns(target_table, conn)
                known_cols = {dc.lower() for dc in db_cols}
                new_cols = [c for c in df.columns if c.lower() not in known_cols]
                if new_cols and auto_evolve_schema:
                    self._add_missing_columns(target_table, new_cols, dtype_mapping, conn)
                elif new_cols:
                    df = df.drop(columns=new_cols)

                # 3. Build the upsert statement (SQLAlchemy postgresql.insert)
                table_cols = df.columns.tolist()
                # Missing values (NaN/NaT/NA) are sent as NULL.
                records = df.astype(object).where(df.notna(), None).to_dict(orient='records')

                # Reflect the actual table structure from the database
                metadata_obj = MetaData()
                target_table_obj = Table(target_table, metadata_obj, autoload_with=conn)

                insert_stmt = insert(target_table_obj).values(records)

                update_dict = {}
                if conflict_strategy != 'skip':
                    update_dict = {
                        col: insert_stmt.excluded[col]
                        for col in table_cols if col not in join_keys
                    }

                    if conflict_strategy == 'sum':
                        for col in update_dict:
                            # Booleans count as numeric in pandas but cannot be added in SQL.
                            summable = (pd.api.types.is_numeric_dtype(df[col])
                                        and not pd.api.types.is_bool_dtype(df[col]))
                            if summable:
                                update_dict[col] = text(
                                    f'{quote(target_table)}.{quote(col)} + EXCLUDED.{quote(col)}'
                                )

                if update_dict:
                    upsert_stmt = insert_stmt.on_conflict_do_update(
                        index_elements=join_keys,
                        set_=update_dict
                    )
                else:
                    # 'skip', or nothing but key columns to update: an empty
                    # SET clause is rejected, so ignore conflicting rows.
                    upsert_stmt = insert_stmt.on_conflict_do_nothing(index_elements=join_keys)

                conn.execute(upsert_stmt)
                logger.success(f"Upserted {len(df)} rows to {target_table} (Strategy: {conflict_strategy})")

        except Exception as e:
            logger.error(f"Upsert failed for {target_table}: {e}")
            raise

    def dispose(self):
        """Dispose of the engine and its connection pool."""
        self.engine.dispose()
|