sqlServerConnector 0.1.1__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sqlserverconnector-0.1.1 → sqlserverconnector-0.1.2}/PKG-INFO +1 -1
- {sqlserverconnector-0.1.1 → sqlserverconnector-0.1.2}/pyproject.toml +1 -1
- {sqlserverconnector-0.1.1 → sqlserverconnector-0.1.2}/src/connector.py +48 -47
- {sqlserverconnector-0.1.1 → sqlserverconnector-0.1.2}/src/sqlServerConnector.egg-info/PKG-INFO +1 -1
- {sqlserverconnector-0.1.1 → sqlserverconnector-0.1.2}/README.md +0 -0
- {sqlserverconnector-0.1.1 → sqlserverconnector-0.1.2}/setup.cfg +0 -0
- {sqlserverconnector-0.1.1 → sqlserverconnector-0.1.2}/src/__init__.py +0 -0
- {sqlserverconnector-0.1.1 → sqlserverconnector-0.1.2}/src/sqlServerConnector.egg-info/SOURCES.txt +0 -0
- {sqlserverconnector-0.1.1 → sqlserverconnector-0.1.2}/src/sqlServerConnector.egg-info/dependency_links.txt +0 -0
- {sqlserverconnector-0.1.1 → sqlserverconnector-0.1.2}/src/sqlServerConnector.egg-info/requires.txt +0 -0
- {sqlserverconnector-0.1.1 → sqlserverconnector-0.1.2}/src/sqlServerConnector.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sqlServerConnector
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: A custom SQL Server Connector for ETL processes with Pandas
|
|
5
5
|
Author-email: Nguyen Minh Son <nguyen.minhson1511@gmail.com>
|
|
6
6
|
Project-URL: Homepage, https://github.com/johnnyb1509/sqlServerConnector
|
|
@@ -23,6 +23,8 @@ class SQLServerConnector:
|
|
|
23
23
|
- Full Unicode/Vietnamese Support (NVARCHAR + UTF8).
|
|
24
24
|
- Automatic Schema Evolution (adds missing columns).
|
|
25
25
|
- Automatic Primary Key detection and creation.
|
|
26
|
+
- [NEW] Automatic Deduplication of source data before Upsert.
|
|
27
|
+
- [NEW] Prevents numeric conversion on Key columns.
|
|
26
28
|
"""
|
|
27
29
|
|
|
28
30
|
def __init__(self, server: str, database: str, username: str, password: str, driver: str = 'ODBC Driver 17 for SQL Server'):
|
|
@@ -146,7 +148,7 @@ class SQLServerConnector:
|
|
|
146
148
|
match_columns: Optional[List[str]] = None,
|
|
147
149
|
auto_evolve_schema: bool = True):
|
|
148
150
|
"""
|
|
149
|
-
Main ETL Function with Unicode Support.
|
|
151
|
+
Main ETL Function with Unicode Support and Auto-Deduplication.
|
|
150
152
|
|
|
151
153
|
Args:
|
|
152
154
|
df: The new data to push.
|
|
@@ -159,8 +161,19 @@ class SQLServerConnector:
|
|
|
159
161
|
logger.warning(f"Dataframe for {target_table} is empty. Skipping.")
|
|
160
162
|
return
|
|
161
163
|
|
|
162
|
-
#
|
|
163
|
-
|
|
164
|
+
# 0. DETERMINE JOIN KEYS FIRST
|
|
165
|
+
# We need this early to protect these keys from being converted to floats during sanitization
|
|
166
|
+
if match_columns:
|
|
167
|
+
join_keys = match_columns
|
|
168
|
+
elif primary_key in df.columns:
|
|
169
|
+
join_keys = [primary_key]
|
|
170
|
+
else:
|
|
171
|
+
# Fallback if table doesn't exist yet or columns missing, handled later but setup empty here
|
|
172
|
+
join_keys = []
|
|
173
|
+
|
|
174
|
+
# 1. PRE-PROCESS DATA (With Key Protection)
|
|
175
|
+
# Pass join_keys to exclude them from numeric conversion (prevents "123" -> 123.0)
|
|
176
|
+
df_clean = self._sanitize_dataframe(df, exclude_cols=join_keys)
|
|
164
177
|
|
|
165
178
|
# 2. CHECK TARGET TABLE
|
|
166
179
|
if not self.check_table_exists(target_table):
|
|
@@ -172,17 +185,26 @@ class SQLServerConnector:
|
|
|
172
185
|
if auto_evolve_schema:
|
|
173
186
|
self._sync_columns(df_clean, target_table)
|
|
174
187
|
|
|
175
|
-
# 4.
|
|
176
|
-
if
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
188
|
+
# 4. RE-VALIDATE JOIN KEYS
|
|
189
|
+
if not join_keys:
|
|
190
|
+
# Try to infer if not provided
|
|
191
|
+
if primary_key in df_clean.columns:
|
|
192
|
+
join_keys = [primary_key]
|
|
193
|
+
else:
|
|
194
|
+
logger.error(f"CRITICAL: Primary Key '{primary_key}' is missing from DataFrame.")
|
|
195
|
+
logger.error("You MUST provide 'match_columns' to identify which rows to update.")
|
|
196
|
+
raise ValueError("Missing match keys for Identity Column Upsert.")
|
|
197
|
+
|
|
198
|
+
# 5. AUTO DEDUPLICATE SOURCE (CRITICAL FIX)
|
|
199
|
+
# SQL MERGE fails if source has duplicates. We enforce uniqueness on join keys here.
|
|
200
|
+
initial_count = len(df_clean)
|
|
201
|
+
df_clean = df_clean.drop_duplicates(subset=join_keys, keep='last')
|
|
202
|
+
final_count = len(df_clean)
|
|
203
|
+
|
|
204
|
+
if initial_count != final_count:
|
|
205
|
+
logger.warning(f"Upsert Safety: Automatically dropped {initial_count - final_count} duplicate rows in source based on keys {join_keys}.")
|
|
184
206
|
|
|
185
|
-
#
|
|
207
|
+
# 6. EXECUTE UPSERT VIA STAGING
|
|
186
208
|
self._execute_merge_upsert(df_clean, target_table, join_keys)
|
|
187
209
|
|
|
188
210
|
def _execute_merge_upsert(self, df: pd.DataFrame, target_table: str, join_keys: List[str]):
|
|
@@ -322,22 +344,25 @@ class SQLServerConnector:
|
|
|
322
344
|
# HELPER: DATA CLEANING
|
|
323
345
|
# ========================================================
|
|
324
346
|
|
|
325
|
-
def _sanitize_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
326
|
-
"""
|
|
347
|
+
def _sanitize_dataframe(self, df: pd.DataFrame, exclude_cols: List[str] = []) -> pd.DataFrame:
|
|
348
|
+
"""
|
|
349
|
+
Cleans numeric strings, NaT, and NaN values.
|
|
350
|
+
Args:
|
|
351
|
+
df: Input dataframe.
|
|
352
|
+
exclude_cols: Columns to skip numeric conversion (e.g. IDs).
|
|
353
|
+
"""
|
|
327
354
|
df = df.copy()
|
|
328
355
|
|
|
329
|
-
# 1. Clean Numeric Strings
|
|
356
|
+
# 1. Clean Numeric Strings (Skip excluded columns)
|
|
330
357
|
for col in df.select_dtypes(include=['object']).columns:
|
|
331
|
-
|
|
332
|
-
|
|
358
|
+
if col in exclude_cols:
|
|
359
|
+
continue # PROTECT ID COLUMNS FROM BEING CONVERTED TO FLOATS
|
|
360
|
+
|
|
361
|
+
# Only try to convert to float if it looks like a number
|
|
333
362
|
sample = df[col].dropna().head(10).astype(str).tolist()
|
|
334
363
|
if any(any(char.isdigit() for char in str(x)) for x in sample):
|
|
335
364
|
try:
|
|
336
|
-
# Attempt conversion, but strictly ignore errors so we don't break text columns
|
|
337
|
-
# We use a temp series to check if conversion is successful for majority
|
|
338
365
|
temp = df[col].apply(self._clean_numeric_string)
|
|
339
|
-
# If the column was actually text (e.g., "Hà Nội"), _clean_numeric_string returns the original string.
|
|
340
|
-
# We trust _clean_numeric_string to be safe.
|
|
341
366
|
df[col] = temp
|
|
342
367
|
except:
|
|
343
368
|
pass
|
|
@@ -362,7 +387,6 @@ class SQLServerConnector:
|
|
|
362
387
|
if not s: return None
|
|
363
388
|
|
|
364
389
|
# Heuristic: If it contains many letters (excluding K,M,B,T for billions), it's probably text
|
|
365
|
-
# Count letters
|
|
366
390
|
alpha_count = sum(c.isalpha() for c in s)
|
|
367
391
|
if alpha_count > 1 and s[-1] not in ['K', 'M', 'B', 'T']:
|
|
368
392
|
return value # It's likely text (e.g. "Cổ phiếu")
|
|
@@ -380,27 +404,4 @@ class SQLServerConnector:
|
|
|
380
404
|
try:
|
|
381
405
|
return float(s_clean)
|
|
382
406
|
except ValueError:
|
|
383
|
-
return value
|
|
384
|
-
|
|
385
|
-
# ========================================================
|
|
386
|
-
# ENTRY POINT
|
|
387
|
-
# ========================================================
|
|
388
|
-
|
|
389
|
-
# def get_db_connector(yaml_path: Optional[str] = None, env_prefix: str = "DB") -> SQLServerConnector:
|
|
390
|
-
# """Factory function to initialize connector."""
|
|
391
|
-
# if yaml_path and os.path.exists(yaml_path):
|
|
392
|
-
# with open(yaml_path, 'r') as f:
|
|
393
|
-
# config = yaml.safe_load(f).get('db_info', {})
|
|
394
|
-
# return SQLServerConnector(
|
|
395
|
-
# config.get('server'),
|
|
396
|
-
# config.get('database'),
|
|
397
|
-
# config.get('username'),
|
|
398
|
-
# config.get('password')
|
|
399
|
-
# )
|
|
400
|
-
# else:
|
|
401
|
-
# return SQLServerConnector(
|
|
402
|
-
# os.environ.get(f'{env_prefix}_SERVER'),
|
|
403
|
-
# os.environ.get(f'{env_prefix}_NAME'),
|
|
404
|
-
# os.environ.get(f'{env_prefix}_USER'),
|
|
405
|
-
# os.environ.get(f'{env_prefix}_PASS')
|
|
406
|
-
# )
|
|
407
|
+
return value
|
{sqlserverconnector-0.1.1 → sqlserverconnector-0.1.2}/src/sqlServerConnector.egg-info/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sqlServerConnector
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: A custom SQL Server Connector for ETL processes with Pandas
|
|
5
5
|
Author-email: Nguyen Minh Son <nguyen.minhson1511@gmail.com>
|
|
6
6
|
Project-URL: Homepage, https://github.com/johnnyb1509/sqlServerConnector
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{sqlserverconnector-0.1.1 → sqlserverconnector-0.1.2}/src/sqlServerConnector.egg-info/SOURCES.txt
RENAMED
|
File without changes
|
|
File without changes
|
{sqlserverconnector-0.1.1 → sqlserverconnector-0.1.2}/src/sqlServerConnector.egg-info/requires.txt
RENAMED
|
File without changes
|
{sqlserverconnector-0.1.1 → sqlserverconnector-0.1.2}/src/sqlServerConnector.egg-info/top_level.txt
RENAMED
|
File without changes
|