sqlServerConnector 0.1.11__tar.gz → 0.1.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sqlserverconnector-0.1.11 → sqlserverconnector-0.1.12}/PKG-INFO +1 -1
- {sqlserverconnector-0.1.11 → sqlserverconnector-0.1.12}/pyproject.toml +1 -1
- {sqlserverconnector-0.1.11 → sqlserverconnector-0.1.12}/src/connector.py +49 -68
- {sqlserverconnector-0.1.11 → sqlserverconnector-0.1.12}/src/sqlServerConnector.egg-info/PKG-INFO +1 -1
- {sqlserverconnector-0.1.11 → sqlserverconnector-0.1.12}/README.md +0 -0
- {sqlserverconnector-0.1.11 → sqlserverconnector-0.1.12}/setup.cfg +0 -0
- {sqlserverconnector-0.1.11 → sqlserverconnector-0.1.12}/src/__init__.py +0 -0
- {sqlserverconnector-0.1.11 → sqlserverconnector-0.1.12}/src/sqlServerConnector.egg-info/SOURCES.txt +0 -0
- {sqlserverconnector-0.1.11 → sqlserverconnector-0.1.12}/src/sqlServerConnector.egg-info/dependency_links.txt +0 -0
- {sqlserverconnector-0.1.11 → sqlserverconnector-0.1.12}/src/sqlServerConnector.egg-info/requires.txt +0 -0
- {sqlserverconnector-0.1.11 → sqlserverconnector-0.1.12}/src/sqlServerConnector.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sqlServerConnector
|
|
3
|
-
Version: 0.1.11
|
|
3
|
+
Version: 0.1.12
|
|
4
4
|
Summary: A custom SQL Server Connector for ETL processes with Pandas
|
|
5
5
|
Author-email: Nguyen Minh Son <nguyen.minhson1511@gmail.com>
|
|
6
6
|
Project-URL: Homepage, https://github.com/johnnyb1509/sqlServerConnector
|
|
@@ -220,7 +220,7 @@ class SQLServerConnector:
|
|
|
220
220
|
conn.execute(text(f"ALTER TABLE [{table_name}] ADD [{col}] {type_str}"))
|
|
221
221
|
logger.info(f"Auto-evolve: Added column '{col}' to table '{table_name}'")
|
|
222
222
|
|
|
223
|
-
|
|
223
|
+
def upsert_data(self,
|
|
224
224
|
df: pd.DataFrame,
|
|
225
225
|
target_table: str,
|
|
226
226
|
primary_key: Union[str, List[str]] = None,
|
|
@@ -228,66 +228,42 @@ class SQLServerConnector:
|
|
|
228
228
|
auto_evolve_schema: bool = True,
|
|
229
229
|
conflict_strategy: Literal['sum', 'last', 'skip'] = 'last'):
|
|
230
230
|
"""
|
|
231
|
-
Hàm Upsert đa năng (
|
|
232
|
-
|
|
233
|
-
Args:
|
|
234
|
-
df: DataFrame đầu vào.
|
|
235
|
-
target_table: Tên bảng đích.
|
|
236
|
-
primary_key: Tên cột khóa chính (str hoặc list) - Param cũ.
|
|
237
|
-
match_columns: Danh sách cột khóa chính - Param mới.
|
|
238
|
-
auto_evolve_schema: Tự động thêm cột mới vào DB.
|
|
239
|
-
conflict_strategy:
|
|
240
|
-
- 'sum': Cộng dồn các cột số, lấy giá trị đầu tiên các cột khác (Logic tài chính).
|
|
241
|
-
- 'last': Update ghi đè (Logic thông tin mới nhất).
|
|
242
|
-
- 'skip': Bỏ qua nếu trùng khóa.
|
|
231
|
+
Hàm Upsert đa năng (Production Grade).
|
|
232
|
+
Fixes: Schema mirroring, NULL-safe joins, Concurrency Locks, and Aggregation bugs.
|
|
243
233
|
"""
|
|
244
234
|
if df.empty:
|
|
245
235
|
return
|
|
246
236
|
|
|
247
|
-
# 1. Xác định Join Keys
|
|
248
|
-
join_keys =
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
elif primary_key:
|
|
252
|
-
join_keys = [primary_key] if isinstance(primary_key, str) else primary_key
|
|
237
|
+
# 1. Xác định Join Keys
|
|
238
|
+
join_keys = match_columns or (
|
|
239
|
+
[primary_key] if isinstance(primary_key, str) else primary_key
|
|
240
|
+
) or []
|
|
253
241
|
|
|
254
242
|
if not join_keys:
|
|
255
243
|
logger.warning(f"No keys provided for {target_table}. Switching to APPEND mode.")
|
|
256
244
|
else:
|
|
257
|
-
# --- CRITICAL FIX
|
|
258
|
-
#
|
|
259
|
-
# This prevents MERGE from crashing when Source has duplicate keys
|
|
245
|
+
# --- CRITICAL FIX 1: Correct Aggregation Logic ---
|
|
246
|
+
# Drop duplicates ONLY if we are not summing. If summing, groupby handles it!
|
|
260
247
|
initial_count = len(df)
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
# 2. Xử lý Logic 'SUM' (Aggregation tại Python trước khi đẩy DB)
|
|
266
|
-
# Đây là logic quý giá từ code cũ của bạn
|
|
267
|
-
if conflict_strategy == 'sum' and join_keys:
|
|
268
|
-
# Xác định cột số
|
|
269
|
-
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
|
270
|
-
num_cols = [c for c in num_cols if c not in join_keys]
|
|
271
|
-
|
|
272
|
-
# Logic Aggregation
|
|
273
|
-
agg_logic = {col: 'sum' for col in num_cols}
|
|
274
|
-
|
|
275
|
-
# Các cột còn lại (text, date...) lấy dòng đầu tiên
|
|
276
|
-
other_cols = [c for c in df.columns if c not in join_keys and c not in num_cols]
|
|
277
|
-
for c in other_cols:
|
|
278
|
-
agg_logic[c] = 'first'
|
|
248
|
+
if conflict_strategy == 'sum':
|
|
249
|
+
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
|
250
|
+
num_cols = [c for c in num_cols if c not in join_keys]
|
|
279
251
|
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
252
|
+
agg_logic = {col: 'sum' for col in num_cols}
|
|
253
|
+
other_cols = [c for c in df.columns if c not in join_keys and c not in num_cols]
|
|
254
|
+
for c in other_cols:
|
|
255
|
+
agg_logic[c] = 'last'
|
|
256
|
+
|
|
257
|
+
df = df.groupby(join_keys, as_index=False).agg(agg_logic)
|
|
258
|
+
if len(df) < initial_count:
|
|
259
|
+
logger.info(f"Strategy 'sum': Aggregated {initial_count} -> {len(df)} rows.")
|
|
260
|
+
else:
|
|
261
|
+
keep_val = 'last' if conflict_strategy == 'last' else 'first'
|
|
262
|
+
df = df.drop_duplicates(subset=join_keys, keep=keep_val).copy()
|
|
263
|
+
if len(df) < initial_count:
|
|
264
|
+
logger.warning(f"Dropped {initial_count - len(df)} duplicate rows based on keys {join_keys}.")
|
|
286
265
|
|
|
287
|
-
# 3. Map Unicode Types
|
|
288
266
|
dtype_mapping = self._generate_dtype_mapping(df)
|
|
289
|
-
|
|
290
|
-
# 4. Staging Table Name (Dùng bảng vật lý unique)
|
|
291
267
|
staging_table = f"Staging_{uuid.uuid4().hex[:10]}"
|
|
292
268
|
|
|
293
269
|
try:
|
|
@@ -297,16 +273,15 @@ class SQLServerConnector:
|
|
|
297
273
|
if not inspector.has_table(target_table):
|
|
298
274
|
logger.info(f"Table {target_table} not found. Creating new...")
|
|
299
275
|
df.to_sql(target_table, conn, index=False, dtype=dtype_mapping)
|
|
300
|
-
|
|
301
276
|
if join_keys:
|
|
302
277
|
pk_str = ", ".join([f"[{c}]" for c in join_keys])
|
|
303
278
|
try:
|
|
304
279
|
conn.execute(text(f"ALTER TABLE [{target_table}] ADD CONSTRAINT PK_{target_table.replace('.','_')}_{uuid.uuid4().hex[:4]} PRIMARY KEY ({pk_str})"))
|
|
305
280
|
except Exception as e:
|
|
306
281
|
logger.warning(f"Could not create PK: {e}")
|
|
307
|
-
return
|
|
282
|
+
return
|
|
308
283
|
|
|
309
|
-
# --- B. Schema Evolution
|
|
284
|
+
# --- B. Schema Evolution ---
|
|
310
285
|
db_cols = self._get_table_columns(target_table, conn)
|
|
311
286
|
df_cols = list(df.columns)
|
|
312
287
|
new_cols = [c for c in df_cols if c not in db_cols]
|
|
@@ -316,54 +291,60 @@ class SQLServerConnector:
|
|
|
316
291
|
self._add_missing_columns(target_table, new_cols, dtype_mapping, conn)
|
|
317
292
|
db_cols.extend(new_cols)
|
|
318
293
|
else:
|
|
319
|
-
# Nếu không evolve, bỏ cột thừa đi
|
|
320
294
|
valid_cols = [c for c in df_cols if c in db_cols]
|
|
321
295
|
df = df[valid_cols]
|
|
322
296
|
|
|
323
|
-
# --- C.
|
|
297
|
+
# --- C. CRITICAL FIX 2: Exact Schema Mirroring ---
|
|
298
|
+
# Clone the exact schema of the target table to avoid Type Mismatches
|
|
299
|
+
conn.execute(text(f"SELECT TOP 0 * INTO [{staging_table}] FROM [{target_table}]"))
|
|
300
|
+
|
|
301
|
+
# Append to the cloned table. This forces Python data to cast
|
|
302
|
+
# precisely to the DB's native types (e.g., Datetime vs Date).
|
|
324
303
|
df.to_sql(
|
|
325
304
|
name=staging_table,
|
|
326
305
|
con=conn,
|
|
327
|
-
if_exists='
|
|
328
|
-
index=False
|
|
329
|
-
dtype=dtype_mapping
|
|
306
|
+
if_exists='append',
|
|
307
|
+
index=False
|
|
330
308
|
)
|
|
331
309
|
|
|
332
|
-
# --- D. MERGE Logic ---
|
|
333
|
-
# Nếu strategy là 'sum', sau khi aggregate ở Python, nó trở thành 'update' (ghi đè kết quả tổng vào DB)
|
|
334
|
-
# Hoặc nếu DB đã có số liệu, logic Merge chuẩn là update lại số mới.
|
|
335
|
-
|
|
310
|
+
# --- D. Dynamic MERGE Logic ---
|
|
336
311
|
if join_keys:
|
|
337
312
|
common_cols = [c for c in df.columns if c in db_cols]
|
|
338
|
-
|
|
313
|
+
|
|
314
|
+
# --- CRITICAL FIX 3: NULL-Safe Joins ---
|
|
315
|
+
on_conditions = []
|
|
316
|
+
for col in join_keys:
|
|
317
|
+
on_conditions.append(f"(Target.[{col}] = Source.[{col}] OR (Target.[{col}] IS NULL AND Source.[{col}] IS NULL))")
|
|
318
|
+
on_clause = " AND ".join(on_conditions)
|
|
339
319
|
|
|
340
320
|
insert_cols = ", ".join([f"[{col}]" for col in common_cols])
|
|
341
321
|
insert_vals = ", ".join([f"Source.[{col}]" for col in common_cols])
|
|
322
|
+
|
|
323
|
+
# --- CRITICAL FIX 4: HOLDLOCK ---
|
|
324
|
+
# Prevents concurrency race conditions during upsert
|
|
325
|
+
merge_target = f"[{target_table}] WITH (HOLDLOCK)"
|
|
342
326
|
|
|
343
327
|
merge_sql = ""
|
|
344
|
-
|
|
345
|
-
# Logic: 'last' hoặc 'sum' (sau khi agg) đều là UPDATE
|
|
346
328
|
if conflict_strategy in ['last', 'sum']:
|
|
347
329
|
update_cols = [c for c in common_cols if c not in join_keys]
|
|
348
330
|
if update_cols:
|
|
349
331
|
update_set = ", ".join([f"Target.[{col}] = Source.[{col}]" for col in update_cols])
|
|
350
332
|
merge_sql = f"""
|
|
351
|
-
MERGE
|
|
333
|
+
MERGE {merge_target} AS Target USING [{staging_table}] AS Source
|
|
352
334
|
ON {on_clause}
|
|
353
335
|
WHEN MATCHED THEN UPDATE SET {update_set}
|
|
354
336
|
WHEN NOT MATCHED BY TARGET THEN INSERT ({insert_cols}) VALUES ({insert_vals});
|
|
355
337
|
"""
|
|
356
338
|
else:
|
|
357
|
-
# Chỉ có Key, Insert if not exists
|
|
358
339
|
merge_sql = f"""
|
|
359
|
-
MERGE
|
|
340
|
+
MERGE {merge_target} AS Target USING [{staging_table}] AS Source
|
|
360
341
|
ON {on_clause}
|
|
361
342
|
WHEN NOT MATCHED BY TARGET THEN INSERT ({insert_cols}) VALUES ({insert_vals});
|
|
362
343
|
"""
|
|
363
344
|
|
|
364
345
|
elif conflict_strategy == 'skip':
|
|
365
346
|
merge_sql = f"""
|
|
366
|
-
MERGE
|
|
347
|
+
MERGE {merge_target} AS Target USING [{staging_table}] AS Source
|
|
367
348
|
ON {on_clause}
|
|
368
349
|
WHEN NOT MATCHED BY TARGET THEN INSERT ({insert_cols}) VALUES ({insert_vals});
|
|
369
350
|
"""
|
{sqlserverconnector-0.1.11 → sqlserverconnector-0.1.12}/src/sqlServerConnector.egg-info/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sqlServerConnector
|
|
3
|
-
Version: 0.1.11
|
|
3
|
+
Version: 0.1.12
|
|
4
4
|
Summary: A custom SQL Server Connector for ETL processes with Pandas
|
|
5
5
|
Author-email: Nguyen Minh Son <nguyen.minhson1511@gmail.com>
|
|
6
6
|
Project-URL: Homepage, https://github.com/johnnyb1509/sqlServerConnector
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{sqlserverconnector-0.1.11 → sqlserverconnector-0.1.12}/src/sqlServerConnector.egg-info/SOURCES.txt
RENAMED
|
File without changes
|
|
File without changes
|
{sqlserverconnector-0.1.11 → sqlserverconnector-0.1.12}/src/sqlServerConnector.egg-info/requires.txt
RENAMED
|
File without changes
|
|
File without changes
|