sqlServerConnector 0.1.11__tar.gz → 0.1.12__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sqlServerConnector
3
- Version: 0.1.11
3
+ Version: 0.1.12
4
4
  Summary: A custom SQL Server Connector for ETL processes with Pandas
5
5
  Author-email: Nguyen Minh Son <nguyen.minhson1511@gmail.com>
6
6
  Project-URL: Homepage, https://github.com/johnnyb1509/sqlServerConnector
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
6
6
 
7
7
  [project]
8
8
  name = "sqlServerConnector"
9
- version = "0.1.11"
9
+ version = "0.1.12"
10
10
  description = "A custom SQL Server Connector for ETL processes with Pandas"
11
11
  readme = "README.md"
12
12
  requires-python = ">=3.8"
@@ -220,7 +220,7 @@ class SQLServerConnector:
220
220
  conn.execute(text(f"ALTER TABLE [{table_name}] ADD [{col}] {type_str}"))
221
221
  logger.info(f"Auto-evolve: Added column '{col}' to table '{table_name}'")
222
222
 
223
- def upsert_data(self,
223
+ def upsert_data(self,
224
224
  df: pd.DataFrame,
225
225
  target_table: str,
226
226
  primary_key: Union[str, List[str]] = None,
@@ -228,66 +228,42 @@ class SQLServerConnector:
228
228
  auto_evolve_schema: bool = True,
229
229
  conflict_strategy: Literal['sum', 'last', 'skip'] = 'last'):
230
230
  """
231
- Hàm Upsert đa năng (Hợp nhất Logic cũ và mới).
232
-
233
- Args:
234
- df: DataFrame đầu vào.
235
- target_table: Tên bảng đích.
236
- primary_key: Tên cột khóa chính (str hoặc list) - Param cũ.
237
- match_columns: Danh sách cột khóa chính - Param mới.
238
- auto_evolve_schema: Tự động thêm cột mới vào DB.
239
- conflict_strategy:
240
- - 'sum': Cộng dồn các cột số, lấy giá trị đầu tiên các cột khác (Logic tài chính).
241
- - 'last': Update ghi đè (Logic thông tin mới nhất).
242
- - 'skip': Bỏ qua nếu trùng khóa.
231
+ Hàm Upsert đa năng (Production Grade).
232
+ Fixes: Schema mirroring, NULL-safe joins, Concurrency Locks, and Aggregation bugs.
243
233
  """
244
234
  if df.empty:
245
235
  return
246
236
 
247
- # 1. Xác định Join Keys (Hợp nhất 2 param)
248
- join_keys = []
249
- if match_columns:
250
- join_keys = match_columns
251
- elif primary_key:
252
- join_keys = [primary_key] if isinstance(primary_key, str) else primary_key
237
+ # 1. Xác định Join Keys
238
+ join_keys = match_columns or (
239
+ [primary_key] if isinstance(primary_key, str) else primary_key
240
+ ) or []
253
241
 
254
242
  if not join_keys:
255
243
  logger.warning(f"No keys provided for {target_table}. Switching to APPEND mode.")
256
244
  else:
257
- # --- CRITICAL FIX FOR UNIQUE KEY VIOLATION ---
258
- # Remove duplicates in Python before sending to Staging Table
259
- # This prevents MERGE from crashing when Source has duplicate keys
245
+ # --- CRITICAL FIX 1: Correct Aggregation Logic ---
246
+ # Drop duplicates ONLY if we are not summing. If summing, groupby handles it!
260
247
  initial_count = len(df)
261
- df = df.drop_duplicates(subset=join_keys, keep='last').copy()
262
- if len(df) < initial_count:
263
- logger.warning(f"Dropped {initial_count - len(df)} duplicate rows based on keys {join_keys} before upsert.")
264
-
265
- # 2. Xử lý Logic 'SUM' (Aggregation tại Python trước khi đẩy DB)
266
- # Đây là logic quý giá từ code cũ của bạn
267
- if conflict_strategy == 'sum' and join_keys:
268
- # Xác định cột số
269
- num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
270
- num_cols = [c for c in num_cols if c not in join_keys]
271
-
272
- # Logic Aggregation
273
- agg_logic = {col: 'sum' for col in num_cols}
274
-
275
- # Các cột còn lại (text, date...) lấy dòng đầu tiên
276
- other_cols = [c for c in df.columns if c not in join_keys and c not in num_cols]
277
- for c in other_cols:
278
- agg_logic[c] = 'first'
248
+ if conflict_strategy == 'sum':
249
+ num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
250
+ num_cols = [c for c in num_cols if c not in join_keys]
279
251
 
280
- # Thực hiện GroupBy
281
- initial_len = len(df)
282
- df = df.groupby(join_keys, as_index=False).agg(agg_logic)
283
-
284
- if len(df) < initial_len:
285
- logger.info(f"Strategy 'sum': Aggregated {initial_len} -> {len(df)} rows.")
252
+ agg_logic = {col: 'sum' for col in num_cols}
253
+ other_cols = [c for c in df.columns if c not in join_keys and c not in num_cols]
254
+ for c in other_cols:
255
+ agg_logic[c] = 'last'
256
+
257
+ df = df.groupby(join_keys, as_index=False).agg(agg_logic)
258
+ if len(df) < initial_count:
259
+ logger.info(f"Strategy 'sum': Aggregated {initial_count} -> {len(df)} rows.")
260
+ else:
261
+ keep_val = 'last' if conflict_strategy == 'last' else 'first'
262
+ df = df.drop_duplicates(subset=join_keys, keep=keep_val).copy()
263
+ if len(df) < initial_count:
264
+ logger.warning(f"Dropped {initial_count - len(df)} duplicate rows based on keys {join_keys}.")
286
265
 
287
- # 3. Map Unicode Types
288
266
  dtype_mapping = self._generate_dtype_mapping(df)
289
-
290
- # 4. Staging Table Name (Dùng bảng vật lý unique)
291
267
  staging_table = f"Staging_{uuid.uuid4().hex[:10]}"
292
268
 
293
269
  try:
@@ -297,16 +273,15 @@ class SQLServerConnector:
297
273
  if not inspector.has_table(target_table):
298
274
  logger.info(f"Table {target_table} not found. Creating new...")
299
275
  df.to_sql(target_table, conn, index=False, dtype=dtype_mapping)
300
-
301
276
  if join_keys:
302
277
  pk_str = ", ".join([f"[{c}]" for c in join_keys])
303
278
  try:
304
279
  conn.execute(text(f"ALTER TABLE [{target_table}] ADD CONSTRAINT PK_{target_table.replace('.','_')}_{uuid.uuid4().hex[:4]} PRIMARY KEY ({pk_str})"))
305
280
  except Exception as e:
306
281
  logger.warning(f"Could not create PK: {e}")
307
- return # Đã insert xong do tạo bảng mới
282
+ return
308
283
 
309
- # --- B. Schema Evolution (Code cũ gọi là _sync_columns) ---
284
+ # --- B. Schema Evolution ---
310
285
  db_cols = self._get_table_columns(target_table, conn)
311
286
  df_cols = list(df.columns)
312
287
  new_cols = [c for c in df_cols if c not in db_cols]
@@ -316,54 +291,60 @@ class SQLServerConnector:
316
291
  self._add_missing_columns(target_table, new_cols, dtype_mapping, conn)
317
292
  db_cols.extend(new_cols)
318
293
  else:
319
- # Nếu không evolve, bỏ cột thừa đi
320
294
  valid_cols = [c for c in df_cols if c in db_cols]
321
295
  df = df[valid_cols]
322
296
 
323
- # --- C. Đẩy vào Staging ---
297
+ # --- C. CRITICAL FIX 2: Exact Schema Mirroring ---
298
+ # Clone the exact schema of the target table to avoid Type Mismatches
299
+ conn.execute(text(f"SELECT TOP 0 * INTO [{staging_table}] FROM [{target_table}]"))
300
+
301
+ # Append to the cloned table. This forces Python data to cast
302
+ # precisely to the DB's native types (e.g., Datetime vs Date).
324
303
  df.to_sql(
325
304
  name=staging_table,
326
305
  con=conn,
327
- if_exists='replace',
328
- index=False,
329
- dtype=dtype_mapping
306
+ if_exists='append',
307
+ index=False
330
308
  )
331
309
 
332
- # --- D. MERGE Logic ---
333
- # Nếu strategy là 'sum', sau khi aggregate ở Python, nó trở thành 'update' (ghi đè kết quả tổng vào DB)
334
- # Hoặc nếu DB đã có số liệu, logic Merge chuẩn là update lại số mới.
335
-
310
+ # --- D. Dynamic MERGE Logic ---
336
311
  if join_keys:
337
312
  common_cols = [c for c in df.columns if c in db_cols]
338
- on_clause = " AND ".join([f"Target.[{col}] = Source.[{col}]" for col in join_keys])
313
+
314
+ # --- CRITICAL FIX 3: NULL-Safe Joins ---
315
+ on_conditions = []
316
+ for col in join_keys:
317
+ on_conditions.append(f"(Target.[{col}] = Source.[{col}] OR (Target.[{col}] IS NULL AND Source.[{col}] IS NULL))")
318
+ on_clause = " AND ".join(on_conditions)
339
319
 
340
320
  insert_cols = ", ".join([f"[{col}]" for col in common_cols])
341
321
  insert_vals = ", ".join([f"Source.[{col}]" for col in common_cols])
322
+
323
+ # --- CRITICAL FIX 4: HOLDLOCK ---
324
+ # Prevents concurrency race conditions during upsert
325
+ merge_target = f"[{target_table}] WITH (HOLDLOCK)"
342
326
 
343
327
  merge_sql = ""
344
-
345
- # Logic: 'last' hoặc 'sum' (sau khi agg) đều là UPDATE
346
328
  if conflict_strategy in ['last', 'sum']:
347
329
  update_cols = [c for c in common_cols if c not in join_keys]
348
330
  if update_cols:
349
331
  update_set = ", ".join([f"Target.[{col}] = Source.[{col}]" for col in update_cols])
350
332
  merge_sql = f"""
351
- MERGE [{target_table}] AS Target USING [{staging_table}] AS Source
333
+ MERGE {merge_target} AS Target USING [{staging_table}] AS Source
352
334
  ON {on_clause}
353
335
  WHEN MATCHED THEN UPDATE SET {update_set}
354
336
  WHEN NOT MATCHED BY TARGET THEN INSERT ({insert_cols}) VALUES ({insert_vals});
355
337
  """
356
338
  else:
357
- # Chỉ có Key, Insert if not exists
358
339
  merge_sql = f"""
359
- MERGE [{target_table}] AS Target USING [{staging_table}] AS Source
340
+ MERGE {merge_target} AS Target USING [{staging_table}] AS Source
360
341
  ON {on_clause}
361
342
  WHEN NOT MATCHED BY TARGET THEN INSERT ({insert_cols}) VALUES ({insert_vals});
362
343
  """
363
344
 
364
345
  elif conflict_strategy == 'skip':
365
346
  merge_sql = f"""
366
- MERGE [{target_table}] AS Target USING [{staging_table}] AS Source
347
+ MERGE {merge_target} AS Target USING [{staging_table}] AS Source
367
348
  ON {on_clause}
368
349
  WHEN NOT MATCHED BY TARGET THEN INSERT ({insert_cols}) VALUES ({insert_vals});
369
350
  """
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sqlServerConnector
3
- Version: 0.1.11
3
+ Version: 0.1.12
4
4
  Summary: A custom SQL Server Connector for ETL processes with Pandas
5
5
  Author-email: Nguyen Minh Son <nguyen.minhson1511@gmail.com>
6
6
  Project-URL: Homepage, https://github.com/johnnyb1509/sqlServerConnector