sqlServerConnector 0.1.1__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sqlServerConnector
3
- Version: 0.1.1
3
+ Version: 0.1.2
4
4
  Summary: A custom SQL Server Connector for ETL processes with Pandas
5
5
  Author-email: Nguyen Minh Son <nguyen.minhson1511@gmail.com>
6
6
  Project-URL: Homepage, https://github.com/johnnyb1509/sqlServerConnector
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
6
6
 
7
7
  [project]
8
8
  name = "sqlServerConnector"
9
- version = "0.1.1"
9
+ version = "0.1.2"
10
10
  description = "A custom SQL Server Connector for ETL processes with Pandas"
11
11
  readme = "README.md"
12
12
  requires-python = ">=3.8"
@@ -23,6 +23,8 @@ class SQLServerConnector:
23
23
  - Full Unicode/Vietnamese Support (NVARCHAR + UTF8).
24
24
  - Automatic Schema Evolution (adds missing columns).
25
25
  - Automatic Primary Key detection and creation.
26
+ - [NEW] Automatic Deduplication of source data before Upsert.
27
+ - [NEW] Prevents numeric conversion on Key columns.
26
28
  """
27
29
 
28
30
  def __init__(self, server: str, database: str, username: str, password: str, driver: str = 'ODBC Driver 17 for SQL Server'):
@@ -146,7 +148,7 @@ class SQLServerConnector:
146
148
  match_columns: Optional[List[str]] = None,
147
149
  auto_evolve_schema: bool = True):
148
150
  """
149
- Main ETL Function with Unicode Support.
151
+ Main ETL Function with Unicode Support and Auto-Deduplication.
150
152
 
151
153
  Args:
152
154
  df: The new data to push.
@@ -159,8 +161,19 @@ class SQLServerConnector:
159
161
  logger.warning(f"Dataframe for {target_table} is empty. Skipping.")
160
162
  return
161
163
 
162
- # 1. PRE-PROCESS DATA
163
- df_clean = self._sanitize_dataframe(df)
164
+ # 0. DETERMINE JOIN KEYS FIRST
165
+ # We need this early to protect these keys from being converted to floats during sanitization
166
+ if match_columns:
167
+ join_keys = match_columns
168
+ elif primary_key in df.columns:
169
+ join_keys = [primary_key]
170
+ else:
171
+ # Fallback if table doesn't exist yet or columns missing, handled later but setup empty here
172
+ join_keys = []
173
+
174
+ # 1. PRE-PROCESS DATA (With Key Protection)
175
+ # Pass join_keys to exclude them from numeric conversion (prevents "123" -> 123.0)
176
+ df_clean = self._sanitize_dataframe(df, exclude_cols=join_keys)
164
177
 
165
178
  # 2. CHECK TARGET TABLE
166
179
  if not self.check_table_exists(target_table):
@@ -172,17 +185,26 @@ class SQLServerConnector:
172
185
  if auto_evolve_schema:
173
186
  self._sync_columns(df_clean, target_table)
174
187
 
175
- # 4. DETERMINE MATCHING LOGIC
176
- if match_columns:
177
- join_keys = match_columns
178
- elif primary_key in df_clean.columns:
179
- join_keys = [primary_key]
180
- else:
181
- logger.error(f"CRITICAL: Primary Key '{primary_key}' is missing from DataFrame (likely Auto-Increment).")
182
- logger.error("You MUST provide 'match_columns' to identify which rows to update.")
183
- raise ValueError("Missing match keys for Identity Column Upsert.")
188
+ # 4. RE-VALIDATE JOIN KEYS
189
+ if not join_keys:
190
+ # Try to infer if not provided
191
+ if primary_key in df_clean.columns:
192
+ join_keys = [primary_key]
193
+ else:
194
+ logger.error(f"CRITICAL: Primary Key '{primary_key}' is missing from DataFrame.")
195
+ logger.error("You MUST provide 'match_columns' to identify which rows to update.")
196
+ raise ValueError("Missing match keys for Identity Column Upsert.")
197
+
198
+ # 5. AUTO DEDUPLICATE SOURCE (CRITICAL FIX)
199
+ # SQL MERGE fails if source has duplicates. We enforce uniqueness on join keys here.
200
+ initial_count = len(df_clean)
201
+ df_clean = df_clean.drop_duplicates(subset=join_keys, keep='last')
202
+ final_count = len(df_clean)
203
+
204
+ if initial_count != final_count:
205
+ logger.warning(f"Upsert Safety: Automatically dropped {initial_count - final_count} duplicate rows in source based on keys {join_keys}.")
184
206
 
185
- # 5. EXECUTE UPSERT VIA STAGING
207
+ # 6. EXECUTE UPSERT VIA STAGING
186
208
  self._execute_merge_upsert(df_clean, target_table, join_keys)
187
209
 
188
210
  def _execute_merge_upsert(self, df: pd.DataFrame, target_table: str, join_keys: List[str]):
@@ -322,22 +344,25 @@ class SQLServerConnector:
322
344
  # HELPER: DATA CLEANING
323
345
  # ========================================================
324
346
 
325
- def _sanitize_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
326
- """Cleans numeric strings, NaT, and NaN values."""
347
+ def _sanitize_dataframe(self, df: pd.DataFrame, exclude_cols: List[str] = []) -> pd.DataFrame:
348
+ """
349
+ Cleans numeric strings, NaT, and NaN values.
350
+ Args:
351
+ df: Input dataframe.
352
+ exclude_cols: Columns to skip numeric conversion (e.g. IDs).
353
+ """
327
354
  df = df.copy()
328
355
 
329
- # 1. Clean Numeric Strings
356
+ # 1. Clean Numeric Strings (Skip excluded columns)
330
357
  for col in df.select_dtypes(include=['object']).columns:
331
- # Only try to convert to float if it looks like a number (digits or currency symbols)
332
- # Avoid trying to convert Vietnamese text columns
358
+ if col in exclude_cols:
359
+ continue # PROTECT ID COLUMNS FROM BEING CONVERTED TO FLOATS
360
+
361
+ # Only try to convert to float if it looks like a number
333
362
  sample = df[col].dropna().head(10).astype(str).tolist()
334
363
  if any(any(char.isdigit() for char in str(x)) for x in sample):
335
364
  try:
336
- # Attempt conversion, but strictly ignore errors so we don't break text columns
337
- # We use a temp series to check if conversion is successful for majority
338
365
  temp = df[col].apply(self._clean_numeric_string)
339
- # If the column was actually text (e.g., "Hà Nội"), _clean_numeric_string returns the original string.
340
- # We trust _clean_numeric_string to be safe.
341
366
  df[col] = temp
342
367
  except:
343
368
  pass
@@ -362,7 +387,6 @@ class SQLServerConnector:
362
387
  if not s: return None
363
388
 
364
389
  # Heuristic: If it contains many letters (excluding K,M,B,T for billions), it's probably text
365
- # Count letters
366
390
  alpha_count = sum(c.isalpha() for c in s)
367
391
  if alpha_count > 1 and s[-1] not in ['K', 'M', 'B', 'T']:
368
392
  return value # It's likely text (e.g. "Cổ phiếu")
@@ -380,27 +404,4 @@ class SQLServerConnector:
380
404
  try:
381
405
  return float(s_clean)
382
406
  except ValueError:
383
- return value
384
-
385
- # ========================================================
386
- # ENTRY POINT
387
- # ========================================================
388
-
389
- # def get_db_connector(yaml_path: Optional[str] = None, env_prefix: str = "DB") -> SQLServerConnector:
390
- # """Factory function to initialize connector."""
391
- # if yaml_path and os.path.exists(yaml_path):
392
- # with open(yaml_path, 'r') as f:
393
- # config = yaml.safe_load(f).get('db_info', {})
394
- # return SQLServerConnector(
395
- # config.get('server'),
396
- # config.get('database'),
397
- # config.get('username'),
398
- # config.get('password')
399
- # )
400
- # else:
401
- # return SQLServerConnector(
402
- # os.environ.get(f'{env_prefix}_SERVER'),
403
- # os.environ.get(f'{env_prefix}_NAME'),
404
- # os.environ.get(f'{env_prefix}_USER'),
405
- # os.environ.get(f'{env_prefix}_PASS')
406
- # )
407
+ return value
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sqlServerConnector
3
- Version: 0.1.1
3
+ Version: 0.1.2
4
4
  Summary: A custom SQL Server Connector for ETL processes with Pandas
5
5
  Author-email: Nguyen Minh Son <nguyen.minhson1511@gmail.com>
6
6
  Project-URL: Homepage, https://github.com/johnnyb1509/sqlServerConnector