r3-test 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
r3_test-0.0.1/LICENSE ADDED
File without changes
r3_test-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,26 @@
1
+ Metadata-Version: 2.4
2
+ Name: r3_test
3
+ Version: 0.0.1
4
+ Summary: Just for test
5
+ Author: Ranjeet Aloriya
6
+ License: MIT
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.12
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: numpy
14
+ Requires-Dist: pandas
15
+ Requires-Dist: polars
16
+ Requires-Dist: pyarrow
17
+ Requires-Dist: sqlalchemy
18
+ Requires-Dist: networkx
19
+ Requires-Dist: pyodbc
20
+ Requires-Dist: fastexcel
21
+ Requires-Dist: rapidfuzz
22
+ Requires-Dist: tqdm
23
+ Requires-Dist: openpyxl
24
+ Requires-Dist: xlrd
25
+ Requires-Dist: xlsxwriter
26
+ Dynamic: license-file
File without changes
@@ -0,0 +1,36 @@
1
+ [project]
2
+ name = "r3_test"
3
+ version = "0.0.1"
4
+ description = "Just for test"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ license = {text = "MIT"}
8
+ authors = [
9
+ { name="Ranjeet Aloriya" }
10
+ ]
11
+
12
+ dependencies = [
13
+ "numpy",
14
+ "pandas",
15
+ "polars",
16
+ "pyarrow",
17
+ "sqlalchemy",
18
+ "networkx",
19
+ "pyodbc",
20
+ "fastexcel",
21
+ "rapidfuzz",
22
+ "tqdm",
23
+ "openpyxl",
24
+ "xlrd",
25
+ "xlsxwriter"
26
+ ]
27
+
28
+ classifiers = [
29
+ "Programming Language :: Python :: 3",
30
+ "License :: OSI Approved :: MIT License",
31
+ "Operating System :: OS Independent",
32
+ ]
33
+
34
+ [build-system]
35
+ requires = ["setuptools>=61.0", "wheel"]
36
+ build-backend = "setuptools.build_meta"
@@ -0,0 +1,33 @@
1
+ from .main import (
2
+ help,
3
+ csvtsv_to_excel,
4
+ copy_files,
5
+ move_files,
6
+ copy_files_without_ext,
7
+ df_info,
8
+ get_segment,
9
+ arrange_segment,
10
+ split_columns,
11
+ split_full_name,
12
+ ra_replace_chars,
13
+ excel_compile_without_header,
14
+ csv_compile,
15
+ parquet_compile,
16
+ batch_processing,
17
+ table_from_sql,
18
+ table_to_sql,
19
+ dense_id,
20
+ name_id,
21
+ dob_normalize,
22
+ demerge,
23
+ initial_names,
24
+ merging_on_ssntin,
25
+ merging_on_address,
26
+ merging_on_dob,
27
+ merging_on_others,
28
+ name_checks,
29
+ name_final,
30
+ address_final,
31
+ final_cel,
32
+ y_columns
33
+ )
@@ -0,0 +1,712 @@
1
+ import re, os, sys, csv, shutil, networkx, warnings, pyodbc, urllib,sqlalchemy, pyarrow, fastexcel
2
+ import numpy as np
3
+ import pandas as pd
4
+ import polars as pl
5
+ from time import time
6
+ from tqdm import tqdm
7
+ from io import StringIO
8
+ from rapidfuzz import fuzz
9
+ import multiprocessing as mp
10
+ from openpyxl import workbook
11
+ from types import CoroutineType
12
+ from datetime import timedelta, datetime
13
+ from sqlalchemy import create_engine, event
14
+
15
def help():
    """Print a greeting and the author's support-contact details.

    The greeting uses the current login name. ``os.getlogin()`` raises
    ``OSError`` when the process has no controlling terminal (services,
    notebooks, CI), so the function falls back to ``getpass.getuser()`` and
    finally to a generic name instead of crashing.
    """
    import getpass

    try:
        user = os.getlogin()
    except OSError:
        try:
            user = getpass.getuser()
        except (OSError, KeyError, ImportError):
            user = "User"

    print(f"""Hello {user.title()}!\U0001F60A,

Thank you for choosing the Notification_List package. We sincerely appreciate your support.

Should you require any assistance or have any questions, please do not hesitate to reach out to -
Ranjeet Aloriya at +91 940.660.6239 or ranjeet.aloriya@gmail.com.
We are here to help!

Cheers!
Ranjeet Aloriya""")
26
+
27
def csvtsv_to_excel(folder_path):
    """Convert every CSV/TSV file directly inside *folder_path* to ``.xlsx``.

    Files are matched on their extension case-insensitively (``.csv``,
    ``.CSV``, ``.tsv`` ...) and processed in sorted order so that runs are
    reproducible. Each workbook is written next to its source file with the
    same base name. Sub-directories and other file types are skipped.

    Args:
        folder_path: directory whose files should be converted.
    """
    # Extension -> field delimiter understood by pandas.
    delimiters = (('.csv', ','), ('.tsv', '\t'))
    converted = 0
    for file in sorted(os.listdir(folder_path)):
        filename = os.path.join(folder_path, file)
        if not os.path.isfile(filename):
            continue
        lowered = file.lower()
        delimiter = next((d for ext, d in delimiters if lowered.endswith(ext)), None)
        if delimiter is None:
            continue  # skip non-csv/tsv files

        # Everything is read as text so values such as leading-zero ids survive.
        df = pd.read_csv(filename, dtype=str, delimiter=delimiter, encoding='latin')
        converted += 1
        output_file = os.path.splitext(filename)[0] + ".xlsx"
        df.to_excel(output_file, index=False)
        sys.stdout.write(f"\rFile No. {converted} - {file} Processing")
        sys.stdout.flush()

    print(f"\nConversion completed. {converted} files processed.")
47
+
48
def copy_files(file):
    """Copy the files listed in a CSV manifest.

    The manifest is read positionally: column 0 is the file name, column 1
    the source folder and column 2 the destination folder. Destination
    folders are created on demand. The operation is best-effort: a file that
    cannot be copied is skipped, but it is no longer counted as copied and is
    reported on stderr once the run finishes.

    Args:
        file: path of the CSV manifest.
    """
    df = pl.read_csv(file)
    copied = 0
    failures = []
    for row in df.iter_rows():
        file_name, source_folder, destination_folder = row[0], row[1], row[2]
        source_path = os.path.join(source_folder, file_name)
        destination_path = os.path.join(destination_folder, file_name)
        try:
            os.makedirs(destination_folder, exist_ok=True)
            shutil.copy2(source_path, destination_path)
        except OSError as exc:  # includes shutil.Error / SameFileError
            failures.append((source_path, exc))
        else:
            copied += 1
        sys.stdout.write(f"\rFiles Copied - {copied}/{df.height} ")
        sys.stdout.flush()

    if failures:
        sys.stderr.write(f"\n{len(failures)} file(s) could not be copied:\n")
        for source_path, exc in failures:
            sys.stderr.write(f"  {source_path}: {exc}\n")
65
+
66
def move_files(file):
    """Move the files listed in a CSV manifest.

    The manifest is read positionally: column 0 is the file name, column 1
    the source folder and column 2 the destination folder. Destination
    folders are created on demand. The operation is best-effort: a file that
    cannot be moved is skipped, but it is no longer counted as moved and is
    reported on stderr once the run finishes.

    Args:
        file: path of the CSV manifest.
    """
    df = pl.read_csv(file)
    moved = 0
    failures = []
    for row in df.iter_rows():
        file_name, source_folder, destination_folder = row[0], row[1], row[2]
        source_path = os.path.join(source_folder, file_name)
        destination_path = os.path.join(destination_folder, file_name)
        try:
            os.makedirs(destination_folder, exist_ok=True)
            shutil.move(source_path, destination_path)
        except OSError as exc:  # includes shutil.Error
            failures.append((source_path, exc))
        else:
            moved += 1
        sys.stdout.write(f"\rFiles Moved - {moved}/{df.height} ")
        sys.stdout.flush()

    if failures:
        sys.stderr.write(f"\n{len(failures)} file(s) could not be moved:\n")
        for source_path, exc in failures:
            sys.stderr.write(f"  {source_path}: {exc}\n")
83
+
84
def copy_files_without_ext(csv_file):
    """Copy files identified by base name (no extension) from a CSV manifest.

    Each manifest row must hold exactly three values: file name without
    extension, source folder, destination folder. The source folder is walked
    recursively and the first file whose base name matches
    (case-insensitively) is copied. Names that are never found are written to
    ``Not_Found_Copying_<timestamp>.csv`` in the current working directory.

    Args:
        csv_file: path of the CSV manifest.
    """
    df = pl.read_csv(csv_file)
    not_found = []
    total = len(df)
    copied_count = 0

    for i, row in enumerate(df.iter_rows(), start=1):
        file_name, source_folder, destination_folder = map(str, row)
        wanted = file_name.lower()

        # First match anywhere under source_folder, or None.
        match = next(
            (
                (root, f)
                for root, _dirs, files in os.walk(source_folder)
                for f in files
                if os.path.splitext(f)[0].lower() == wanted
            ),
            None,
        )
        if match is None:
            not_found.append(file_name)
        else:
            root, f = match
            os.makedirs(destination_folder, exist_ok=True)
            shutil.copy2(os.path.join(root, f), os.path.join(destination_folder, f))
            copied_count += 1

        sys.stdout.write(f"\rProgress: {i}/{total} processed, {copied_count} copied")
        sys.stdout.flush()

    if not_found:
        ts = datetime.now().strftime("%m%d%y%H%M%S")
        nf_file = f"Not_Found_Copying_{ts}.csv"
        # An explicit column avoids polars having to guess row/column
        # orientation from a list of one-element lists.
        pl.DataFrame({"FileName": not_found}).write_csv(nf_file)
        print(f"\nSummary: {copied_count}/{total} copied, {len(not_found)} not found (saved in {nf_file})")
    else:
        print(f"\nSummary: All {total} files copied successfully ✅")
118
+
119
def df_info(file):
    """Summarise the columns of a CSV file.

    Args:
        file: path of the CSV file to inspect.

    Returns:
        A polars DataFrame with one row per column of the input and the
        columns ``Column Name``, ``Data Type``, ``Non-Null Count`` and
        ``Unique Count``. (The original built this frame but never returned
        it, so callers always received ``None``.)
    """
    df = pl.read_csv(file)
    data = [
        {
            "Column Name": col,
            "Data Type": str(df.schema[col]),
            "Non-Null Count": len(df[col].drop_nulls()),
            "Unique Count": df[col].n_unique(),
        }
        for col in df.columns
    ]
    return pl.DataFrame(data)
133
+
134
def get_segment(f, sep = '~'):
    """Read a text file and return its ``sep``-delimited segments.

    The file is decoded as ASCII (undecodable bytes are dropped), all newline
    characters are removed, and the text is split on *sep*. Only segments
    containing a literal ``*`` are kept.

    Args:
        f: path of the file to read.
        sep: segment terminator; defaults to ``~``.

    Returns:
        A polars DataFrame with a single ``Segment`` column.
    """
    with open(f, 'r', encoding='ascii', errors='ignore') as handle:
        raw = handle.read()
    # Removing every "\n" also removes every "\n\n" pair.
    flattened = raw.replace("\n", "")
    frame = pl.DataFrame({'Segment': flattened.split(sep)})
    has_star = pl.col("Segment").str.contains("*", literal=True)
    return frame.filter(has_star)
143
+
144
def arrange_segment(df, column_name = "Segment", sep = "*"):
    """Pivot a column of ``PREFIX<sep>data`` segments into one row per record.

    Segments are consumed in order. Each is split once on *sep* into a prefix
    and its data. A new record starts as soon as a prefix is seen that the
    current record already holds. Prefixes become column names and missing
    values are filled with empty strings.

    Args:
        df: polars DataFrame holding the segments.
        column_name: name of the segment column.
        sep: separator between prefix and data.

    Returns:
        A polars DataFrame with one row per record.
    """
    records = []
    record = {}
    for segment in df[column_name]:
        tag, payload = segment.split(sep, 1)
        if tag in record:
            # A repeated prefix marks the start of the next record.
            records.append(record)
            record = {}
        record[tag] = payload
    if record:
        records.append(record)
    return pl.DataFrame(records).fill_null("")
157
+
158
def split_columns(df, sep='\\*'):
    """Split every column of *df* on *sep* into numbered sub-columns.

    The frame is converted to pandas for the split and converted back to
    polars afterwards. A column ``X`` is replaced by ``X_1``, ``X_2``, ...,
    one per piece of the row with the most separators.

    Args:
        df: polars DataFrame whose columns should be split.
        sep: separator pattern; the default matches a literal ``*``.

    Returns:
        A polars DataFrame containing only the split columns.
    """
    frame = df.to_pandas()
    # frame.columns is evaluated once, so the loop walks the original columns
    # even though `frame` is rebound on every pass.
    for column in frame.columns:
        frame[column] = frame[column].fillna('')
        values = frame[column]
        part_count = int(values.str.count(sep).max() + 1)
        pieces = values.str.split(sep, expand=True)
        pieces.columns = [f"{column}_{n}" for n in range(1, part_count + 1)]
        frame = frame.drop(column, axis=1).join(pieces)
    return pl.from_pandas(frame)
170
+
171
def split_full_name(df, full_name, suffixes):
    """Split a full-name column into first / middle / last / suffix columns.

    The name is split on its first comma. With a comma, the text before it is
    the last name and the text after it holds first and middle names
    ("LAST, FIRST MIDDLE"). Without a comma, the words are read as
    "FIRST [MIDDLE ...] LAST". The first word whose upper-cased form is in
    *suffixes* is removed and reported as the suffix.

    Null names are treated as empty strings, so they yield four empty strings
    instead of raising inside the row-wise parser.

    Args:
        df: polars DataFrame containing *full_name*.
        full_name: name of the column holding the full name.
        suffixes: container of upper-case suffixes (e.g. ``{"JR", "SR"}``).

    Returns:
        *df* plus ``split_first_name``, ``split_middle_name``,
        ``split_last_name`` and ``split_suffix``.
    """
    def clean_and_split(text):
        return text.replace(",", "").split()

    def extract_suffix(words):
        # Return (suffix, remaining words); only the first suffix is removed.
        for i, word in enumerate(words):
            if word.upper() in suffixes:
                return word.upper(), words[:i] + words[i+1:]
        return "", words

    def parse_name(part1, part2):
        first = middle = last = suffix = ""
        if part2:
            # "LAST, FIRST MIDDLE" layout.
            last_words = clean_and_split(part1)
            suffix, last_words = extract_suffix(last_words)
            last = " ".join(last_words)
            name_words = clean_and_split(part2)
            sfx2, name_words = extract_suffix(name_words)
            suffix = suffix or sfx2
            if name_words:
                first = name_words[0]
                if len(name_words) > 1:
                    middle = " ".join(name_words[1:])
        else:
            # "FIRST MIDDLE LAST" layout.
            words = clean_and_split(part1)
            suffix, words = extract_suffix(words)
            if len(words) == 1:
                first = words[0]
            elif len(words) == 2:
                first, last = words
            elif len(words) > 2:
                first = words[0]
                last = words[-1]
                middle = " ".join(words[1:-1])
        return [first, middle, last, suffix]

    return (
        df
        .with_columns(pl.col(full_name).str.split_exact(",", 1).alias("_split"))
        .with_columns([
            # fill_null on both parts: a null name would otherwise reach
            # parse_name as None and fail on .replace().
            pl.col("_split").struct.field("field_0").str.strip_chars().fill_null("").alias("_part1"),
            pl.col("_split").struct.field("field_1").str.strip_chars().fill_null("").alias("_part2")
        ])
        .with_columns(pl.struct(["_part1", "_part2"]).map_elements(
            lambda row: parse_name(row["_part1"], row["_part2"]),
            return_dtype=pl.List(pl.Utf8)
        ).alias("_parsed"))
        .with_columns([
            pl.col("_parsed").list.get(0).alias("split_first_name"),
            pl.col("_parsed").list.get(1).alias("split_middle_name"),
            pl.col("_parsed").list.get(2).alias("split_last_name"),
            pl.col("_parsed").list.get(3).alias("split_suffix"),
        ])
        .drop(["_split", "_part1", "_part2", "_parsed"])
    )
224
+
225
def ra_replace_chars(df, column, cleaning_dict):
    """Apply a sequence of pattern replacements to one string column.

    Args:
        df: polars DataFrame containing *column*.
        column: name of the string column to clean.
        cleaning_dict: mapping of pattern -> replacement, applied in the
            mapping's iteration order via ``str.replace_all``.

    Returns:
        *df* with *column* replaced by the cleaned values, stripped of
        leading and trailing whitespace.
    """
    expr = pl.col(column)
    for pattern in cleaning_dict:
        expr = expr.str.replace_all(pattern, cleaning_dict[pattern])
    cleaned = expr.str.strip_chars().alias(column)
    return df.with_columns(cleaned)
230
+
231
def excel_compile_without_header(path, f):
    """Stack every sheet of one Excel workbook into a single frame.

    All sheets are read without a header row and as text. Each sheet's rows
    are tagged with ``FileName`` and ``SheetName`` as the two leading
    columns, and the sheets are concatenated diagonally so sheets of
    different widths can be combined.

    The workbook is read once: ``sheet_id=0`` already returns every sheet as
    a ``{sheet name: DataFrame}`` mapping, so the sheets are not re-read.

    Args:
        path: directory containing the workbook.
        f: workbook file name; also stored in ``FileName``.

    Returns:
        A polars DataFrame with the rows of all sheets.
    """
    file = os.path.join(path, f)
    sheets = pl.read_excel(file, has_header=False, sheet_id=0, raise_if_empty=False, infer_schema_length=0)
    # Seeding with an empty frame keeps the "no sheets" result an empty frame.
    frames = [pl.DataFrame()]
    for sheet, df in sheets.items():
        df = df.with_columns(pl.lit(f).alias('FileName'))
        df = df.with_columns(pl.lit(sheet).alias('SheetName'))
        df = df.select(['FileName', 'SheetName']+[col for col in df.columns if col not in ['FileName', 'SheetName']])
        frames.append(df)
    return pl.concat(frames, how='diagonal')
242
+
243
def csv_compile(path, f):
    """Read one CSV file as text and tag its rows with the file name.

    Intended as a ``processing_function`` for ``batch_processing``.

    Args:
        path: directory containing the file.
        f: CSV file name; also stored in the ``FileName`` column.

    Returns:
        A polars DataFrame with all columns read as strings
        (``infer_schema_length=0``) plus a ``FileName`` column.
    """
    file = os.path.join(path, f)
    df = pl.read_csv(file, raise_if_empty=False, infer_schema_length=0)
    # The former diagonal concat with an empty frame was a no-op, so it is dropped.
    return df.with_columns(pl.lit(f).alias('FileName'))
250
+
251
def parquet_compile(path, f):
    """Read one Parquet file and tag its rows with the file name.

    Intended as a ``processing_function`` for ``batch_processing``.

    Args:
        path: directory containing the file.
        f: Parquet file name; also stored in the ``FileName`` column.

    Returns:
        A polars DataFrame with the file's data plus a ``FileName`` column.
    """
    file = os.path.join(path, f)
    df = pl.read_parquet(file)
    # The former diagonal concat with an empty frame was a no-op, so it is dropped.
    return df.with_columns(pl.lit(f).alias('FileName'))
258
+
259
def batch_processing(path, processing_function, b):
    """Process the files in *path* in batches and write one Parquet per batch.

    Files directly inside *path* are taken in sorted order and grouped into
    batches of *b*. Every file is passed to ``processing_function(path, f)``,
    which must return a polars DataFrame. The results of a batch are
    concatenated diagonally and written to
    ``<path>/output1/<function name>_Batch_<NNN>.parquet``. A file that raises
    is skipped and reported on its own line.

    Args:
        path: directory holding the input files.
        processing_function: callable ``(path, file_name) -> pl.DataFrame``.
        b: number of files per batch; must be a positive integer.

    Raises:
        ValueError: if *b* is less than 1.
    """
    if b < 1:
        raise ValueError(f"batch size must be a positive integer, got {b!r}")

    files = sorted(f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)))
    # functools.partial objects and some other callables have no __name__.
    func_name = getattr(processing_function, "__name__", "batch")
    output_folder = os.path.join(path, 'output1')

    for j, start in enumerate(range(0, len(files), b), start=1):
        batch = files[start:start+b]
        my_df = pl.DataFrame()
        k = 0
        for f in batch:
            try:
                df = processing_function(path, f)
                # Concat stays inside the try so a file with an incompatible
                # schema is skipped rather than failing the whole batch.
                my_df = pl.concat([my_df, df], how='diagonal')
                k += 1
                sys.stdout.write(f"\rFile No. {k} - Processed of Batch No. {j} ")
                sys.stdout.flush()
            except Exception as e:
                # Ends with a newline so the next "\r" progress write does
                # not overwrite the error.
                sys.stdout.write(f"\r⚠️ Skipping file due to error: {f}: {e} \n")
                sys.stdout.flush()
                continue
        sys.stdout.write(f"\rBatch No. {j} - Processed ")
        sys.stdout.flush()
        os.makedirs(output_folder, exist_ok=True)
        output_file_path = os.path.join(output_folder, f'{func_name}_Batch_{j:03d}.parquet')
        my_df.write_parquet(output_file_path)
    sys.stdout.write(f"\rAll Batches are Processed ")
    sys.stdout.flush()
288
+
289
def table_from_sql(server, database, table):
    """Download a whole SQL Server table into a polars DataFrame.

    Connects with "ODBC Driver 17 for SQL Server" using Windows integrated
    authentication, reads the table in batches of 10,000 rows and
    concatenates them. The connection is closed even if the download fails.

    Args:
        server: SQL Server host / instance name.
        database: database name.
        table: table name. It is interpolated into the query text, so it
            must come from a trusted source.

    Returns:
        A polars DataFrame with all rows of the table.
    """
    my_df = pl.DataFrame()
    connection = pyodbc.connect(
        f'DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={server};DATABASE={database};Trusted_Connection=yes;'
    )
    try:
        sys.stdout.write(f"\rConnection successful! ")
        sys.stdout.flush()
        # Identifiers cannot be bound as parameters, hence the interpolation.
        query = f"select * from {table}"
        for df in pl.read_database(query, connection=connection, iter_batches=True, batch_size=10000, infer_schema_length=0):
            my_df = pl.concat([my_df, df], how = 'diagonal')
            sys.stdout.write(f"\rDownloading Raws: {my_df.shape[0]} ")
            sys.stdout.flush()
    finally:
        connection.close()
    sys.stdout.write(f"\rSaved as polars DataFrame - Total Raws: {my_df.shape[0]} ")
    sys.stdout.flush()
    return my_df
305
+
306
def table_to_sql(server, database, table, df):
    """Write a polars DataFrame to a SQL Server table, replacing it if present.

    Connects through SQLAlchemy/pyodbc with "ODBC Driver 17 for SQL Server"
    and Windows integrated authentication, converts *df* to pandas and writes
    it to ``dbo.<table>`` with ``if_exists="replace"``.

    Args:
        server: SQL Server host / instance name.
        database: database name.
        table: destination table name (schema ``dbo``).
        df: polars DataFrame to upload.
    """
    # Imported explicitly: a bare ``import urllib`` does not guarantee that
    # the ``urllib.parse`` submodule is loaded.
    from urllib.parse import quote_plus

    params = quote_plus(
        f"DRIVER={{ODBC Driver 17 for SQL Server}};"
        f"SERVER={server};"
        f"DATABASE={database};"
        "Trusted_Connection=yes;"
    )

    engine = create_engine(f"mssql+pyodbc:///?odbc_connect={params}")
    try:
        df = df.to_pandas()
        df.to_sql(table, con=engine, index=False, if_exists="replace", schema="dbo")
    finally:
        # Release pooled connections instead of leaving them to the GC.
        engine.dispose()
    sys.stdout.write(f"\rSaved {table} in {database} - Total Raws: {len(df)} ")
    sys.stdout.flush()
319
+
320
def dense_id(df, cols):
    """Assign a dense rank id to each distinct combination of *cols*.

    The values of *cols* are concatenated without a separator and
    dense-ranked, so rows with an identical concatenation share a
    ``dense_id``. All columns, including ``dense_id``, are then cast to
    strings. The number of distinct ids is printed.

    NOTE(review): with an empty separator, ("ab", "c") and ("a", "bc")
    produce the same key. Confirm this is intended.

    Args:
        df: polars DataFrame.
        cols: column names that define a record's identity.

    Returns:
        *df* (nulls filled with ``''``) with an added ``dense_id`` column.
    """
    filled = df.fill_null('')
    key = pl.concat_str([pl.col(name) for name in cols], separator="", ignore_nulls=True)
    keyed = filled.with_columns(key.alias('AllData'))
    ranked = keyed.with_columns(pl.col("AllData").rank(method="dense").alias("dense_id"))
    ranked = ranked.drop('AllData')
    as_text = ranked.with_columns([pl.col(name).cast(pl.Utf8) for name in ranked.columns])
    distinct_ids = len(as_text.select('dense_id').unique())
    print(f"Max Records for CEL V1 - {distinct_ids}")
    return as_text
328
+
329
def name_id(df, names):
    """Assign a ``name_id`` that groups rows appearing to hold the same name.

    Steps, in order:

    1. Build a lower-cased, whitespace-normalised full name and tokenise it.
       A row joins the first earlier entity whose token set is a subset or
       superset of its own; otherwise it starts a new entity.
    2. Rows whose name parts concatenate to the same text (whitespace
       removed) share the first entity id seen for that text.
    3. Rows whose de-duplicated, sorted name tokens (special characters
       replaced by spaces) match share an id; this becomes ``name_id``.
    4. Rows whose name contains "unknown" get a fresh id above the maximum.
    5. Ids whose first + last name hold the same set of tokens are merged
       onto the smallest id of the group.

    Args:
        df: polars DataFrame containing the name columns.
        names: three column names, ``[first, middle, last]``.

    Returns:
        *df* with an added ``name_id`` column; helper columns are dropped.
    """
    df = df.with_columns(
        (
            (pl.col(names[0]).fill_null("").str.strip_chars().str.to_lowercase() + " " +
             pl.col(names[1]).fill_null("").str.strip_chars().str.to_lowercase() + " " +
             pl.col(names[2]).fill_null("").str.strip_chars().str.to_lowercase())
            .str.replace_all(r"\s+", " ")
            .str.strip_chars()
            .alias("full_name_norm")
        )
    )
    df = df.with_columns(
        pl.col("full_name_norm").str.split(" ").alias("tokens")
    )
    unique_entities = []
    entity_ids = []

    # Step 1: greedy subset/superset matching against the entities seen so far.
    for tokens in df["tokens"]:
        token_set = set(tokens)
        found = False
        for idx, u_tokens in enumerate(unique_entities):
            if token_set <= u_tokens or u_tokens <= token_set:
                entity_ids.append(idx)
                unique_entities[idx] |= token_set
                found = True
                break
        if not found:
            unique_entities.append(token_set)
            entity_ids.append(len(unique_entities)-1)

    df = df.with_columns(pl.Series("entity_id", entity_ids))

    # Step 2: identical concatenated names share one entity id.
    df = df.with_columns(pl.concat_str([pl.col(c) for c in names], separator="", ignore_nulls=True).alias('FullName'))
    df = df.with_columns(pl.col("FullName").str.replace_all(r"\s+", ""))
    name_to_entity = df.group_by("FullName").agg(
        pl.first("entity_id").alias("entity_id1")
    )
    df = df.join(name_to_entity, on="FullName", how="left")

    # Step 3: order-insensitive token key with special characters removed.
    df = df.with_columns(pl.concat_str([pl.col(c) for c in names], separator=" ", ignore_nulls=True).alias('FullName'))
    df = df.with_columns(
        pl.col("FullName")
        .str.extract_all(r"[^a-zA-Z ]")
        .alias("sp_chars")
    )
    sp_chars = set(char for sublist in df["sp_chars"].to_list() for char in sublist)
    print(f"Special chars in name: {sp_chars}")
    if sp_chars:
        # Guarded: with no special characters the pattern would be "[]",
        # which is not a valid regular expression.
        pattern = "[" + re.escape("".join(sp_chars)) + "]"
        df = df.with_columns(pl.col("FullName").str.replace_all(pattern, " ").alias("FullName"))
    df = df.with_columns(pl.arange(0, df.height, 1).alias("name_index"))
    df3 = df.select(['name_index', 'FullName'])
    df3 = df3.with_columns(pl.col('FullName').str.split(' ')).explode('FullName')
    df3 = df3.unique()
    df3 = df3.group_by('name_index').agg(
        pl.col('FullName').unique().sort().str.join('').alias('FullName1')
    )
    df = df.join(df3, on='name_index', how='left')
    name_to_entity = df.group_by("FullName1").agg(
        pl.first("entity_id1").alias("name_id")
    )
    df = df.join(name_to_entity, on="FullName1", how="left")
    df = df.drop(['full_name_norm', 'tokens', 'entity_id', 'FullName', 'entity_id1', 'sp_chars', 'name_index', 'FullName1'])

    # Step 4: rows with an "unknown" name never merge with each other.
    unknown_expr = (
        pl.concat_str([pl.col(c) for c in names], separator=" ", ignore_nulls=True)
        .str.to_lowercase()
        .str.contains("unknown")
    )
    max_id = df.select(pl.col("name_id").max()).item()
    df = df.with_columns(
        pl.when(unknown_expr)
        .then(pl.arange(max_id + 1, max_id + 1 + df.height))
        .otherwise(pl.col("name_id"))
        .alias("name_id")
    )

    # Step 5: merge ids whose first+last name contain the same token set.
    full_name = [names[0], names[2]]
    df1 = df.with_columns(pl.concat_str([pl.col(c) for c in full_name], separator=" ", ignore_nulls=True).alias('FullName'))
    df2 = df1.with_columns(pl.col('FullName').alias('Reverse_Name'))
    df2 = df2.select('FullName', 'Reverse_Name').unique()
    df2 = df2.with_columns(pl.col('Reverse_Name').str.split(' ')).explode('Reverse_Name')
    df2 = df2.sort('Reverse_Name')
    df2 = df2.group_by('FullName').agg(pl.col('Reverse_Name').unique().str.join('').alias('Reverse_Name'))
    df1 = df1.join(df2, on ='FullName', how = 'left')
    df2 = df1.select('name_id', 'Reverse_Name').unique()
    df2 = df2.with_columns(pl.col('Reverse_Name').count().over('Reverse_Name').alias('Count'))
    df2 = df2.filter(pl.col('Count')>1).sort('Reverse_Name')
    df2 = df2.filter(pl.col('Reverse_Name')!='unknown')
    df2 = df2.join(df2.group_by("Reverse_Name").agg(pl.col("name_id").min().alias("New_name_id")),on = 'Reverse_Name')
    df2 = df2.with_columns((pl.col("name_id") == pl.col("New_name_id")).alias("match"))
    df2 = df2.filter(pl.col('match')==False).select('name_id', 'New_name_id').unique('name_id')
    df1 = df1.join(df2, on = 'name_id', how = 'left')
    df1 = df1.with_columns(pl.when(pl.col('New_name_id').is_not_null()).then(pl.col('New_name_id')).otherwise(pl.col('name_id')).alias('name_id'))
    df = df1.drop('FullName', 'Reverse_Name', 'New_name_id')
    return df
422
+
423
def dob_normalize(df, dates):
    """Replace each date column with an order-insensitive digit key.

    For every column in *dates* the value is cast to text, ``-``, ``.`` and
    spaces are turned into ``/``, and the result is parsed as ``%m/%d/%Y`` or,
    failing that, ``%m/%d/%y``. The parsed date is re-formatted as
    ``MM/DD/YYYY``; its three parts are then sorted and joined without a
    separator, and that key replaces the original column.

    Args:
        df: polars DataFrame containing the date columns.
        dates: names of the date columns to normalise.

    Returns:
        The DataFrame with each column in *dates* replaced by its key.
    """
    for DOB in dates:
        formats = ["%m/%d/%Y", "%m/%d/%y"]
        # coalesce keeps the first format that parses; strict=False makes a
        # failed parse null instead of an error.
        df = df.with_columns(pl.coalesce([pl.col(DOB).cast(pl.Utf8).str.strip_chars()
                                          .str.replace_all(r"[-. ]", "/")
                                          .str.to_date(fmt, strict=False)
                                          for fmt in formats
                                          ])
                             .dt.strftime("%m/%d/%Y")
                             .alias(DOB)
                             )
        # Cleaned_DOB is a temporary join key holding the MM/DD/YYYY text.
        df = df.with_columns(pl.col(DOB).alias('Cleaned_DOB'))
        df1 = df.select('Cleaned_DOB', DOB).unique().filter(pl.col(DOB).is_not_null())
        # Split into month/day/year parts, sort them and glue them together.
        df1 = df1.with_columns(pl.col(DOB).str.split('/')).explode(DOB)
        df1 = df1.group_by('Cleaned_DOB').agg(pl.col(DOB).sort().str.join('').alias(DOB))
        df = df.drop(DOB).join(df1, on = 'Cleaned_DOB', how = 'left').drop('Cleaned_DOB')
    return df
440
+
441
def demerge(df, hard_cols):
    """Return rows of ``name_id`` groups whose values in *hard_cols* conflict.

    For each column in *hard_cols*, a ``name_id`` is flagged when it carries
    more than one distinct non-empty value and some value differs from its
    closest sibling value in more than two character positions.

    Args:
        df: polars DataFrame containing ``name_id`` and *hard_cols*.
        hard_cols: columns to check; the first one is lower-cased, stripped
            and passed through ``suffix_map``.

    Returns:
        The rows of *df* (nulls filled with ``''``) that belong to a flagged
        ``name_id``, sorted by ``name_id``.
    """
    df = df.fill_null('')
    # NOTE(review): "seenior" looks like a typo, but these replacement values
    # feed the character-difference count below, so correcting it would
    # change which ids are flagged. Confirm before editing.
    suffix_map = {"sr": "seenior", "jr": "junior", "ii": "second", "iii": "third", "iv": "four", "v": "five", "vi": "six", "vii": "seven",}
    df = df.with_columns(pl.col(hard_cols[0]).cast(pl.Utf8).str.strip_chars().str.to_lowercase().replace(suffix_map))
    om_ids = []
    for id in hard_cols:
        df1 = df.select('name_id', id)
        df1 = df1.filter(pl.col(id)!="").unique()
        # Keep only name_ids that have more than one distinct value.
        df1 = df1.with_columns(pl.col('name_id').count().over('name_id').alias('count'))
        df1 = df1.filter(pl.col('count')>1).sort('name_id')
        # For each value x, "count" becomes the smallest number of differing
        # character positions (compared pairwise via zip) between x and the
        # other values stored under the first name_id that holds x.
        df1 = (df1.with_columns(pl.col(id).map_elements(lambda x, s=df1: min(sum(a != b for a, b in zip(x, y)) for y in s.filter(pl.col("name_id") == s.filter(pl.col(id) == x)["name_id"][0])[id] if y != x), return_dtype=pl.Int64).alias("count")))
        df1 = df1.filter(pl.col('count')>2).sort('name_id')
        ids = df1.select(pl.col("name_id").unique())
        ids = ids.to_series().to_list()
        # Accumulate flagged ids across all hard columns.
        om_ids = list(set(om_ids).union(ids))
    om_df = df.filter(pl.col('name_id').is_in(om_ids)).sort('name_id')
    return om_df
458
+
459
def initial_names(df, names, merge_cols):
    """Build an ``inames`` key from the first three letters of first/last name.

    Args:
        df: polars DataFrame containing ``dense_id``, ``name_id``, the name
            columns and *merge_cols*.
        names: name columns; ``names[0]`` and ``names[2]`` are used.
        merge_cols: extra columns to keep in the result.

    Returns:
        The unique rows of ``dense_id``, ``name_id``, ``inames`` and
        *merge_cols*.
    """
    df = df.fill_null('')
    df1 = df.with_columns(pl.col(names[0]).str.slice(0, 3).alias("FN3"))
    df1 = df1.with_columns(pl.col(names[2]).str.slice(0, 3).alias("LN3"))
    # The two columns just appended: FN3 and LN3.
    inames = df1.columns[-2:]
    df1 = df1.with_columns(pl.concat_str([pl.col(c) for c in inames], separator=" ", ignore_nulls=True).alias('inames'))
    # Explode into tokens, sort, then join the unique tokens per dense_id.
    df1 = df1.with_columns(pl.col('inames').str.split(' ')).explode('inames')
    df1 = df1.sort('inames')
    df1 = df1.group_by('dense_id').agg(pl.col('inames').unique().str.join('').alias('inames'))
    df = df.join(df1, on = 'dense_id', how = 'left')
    cols = ['dense_id'] + ['name_id'] + ['inames'] + merge_cols
    df = df.select(cols).unique()
    return df
472
+
473
def merging_on_ssntin(df, cols):
    """Merge ``name_id`` values that share a value in each column of *cols*.

    For every column, distinct ``name_id`` values that occur with the same
    non-blank value are remapped to the smallest ``name_id`` of that group.
    The column is dropped once it has been processed.

    Args:
        df: polars DataFrame containing ``name_id`` and every column in *cols*.
        cols: column names to merge on, processed in order.

    Returns:
        The DataFrame with updated ``name_id`` and without the *cols* columns.
    """
    for col in cols:
        df = df.fill_null('')
        df1 = df.filter(pl.col(col).is_not_null() &(pl.col(col).str.strip_chars()!=''))
        df2 = df1.select('name_id', col).unique()
        # Count = number of distinct name_ids sharing this value.
        df2 = df2.with_columns(pl.col(col).count().over(col).alias('Count'))
        df2 = df2.filter(pl.col('Count')>1).sort(col)
        # Smallest name_id per shared value becomes the merge target.
        df2 = df2.join(df2.group_by(col).agg(pl.col("name_id").min().alias("New_name_id")),on = col)
        df2 = df2.with_columns((pl.col("name_id") == pl.col("New_name_id")).alias("match"))
        # Keep one old -> new mapping per name_id that actually changes.
        df2 = df2.filter(pl.col('match')==False).select('name_id', 'New_name_id').sort('name_id').unique('name_id')
        df = df.join(df2, on = 'name_id', how = 'left')
        df = df.with_columns(pl.when(pl.col('New_name_id').is_not_null()).then(pl.col('New_name_id')).otherwise(pl.col('name_id')).alias('name_id'))
        df = df.drop(col, 'New_name_id')
    return df
487
+
488
def merging_on_address(df, cols):
    """Merge ``name_id`` values that share name initials and address prefix.

    For every column in *cols*, a key is built per ``dense_id`` from
    ``inames`` and the first ten characters of the address. Distinct
    ``name_id`` values with the same key are remapped to the smallest
    ``name_id`` of that group. The column is dropped once processed.

    Args:
        df: polars DataFrame containing ``dense_id``, ``name_id``,
            ``inames`` and every column in *cols*.
        cols: address column names, processed in order.

    Returns:
        The DataFrame with updated ``name_id`` and without the *cols* columns.
    """
    for col in cols:
        df = df.fill_null('')
        # Only the first 10 characters of the address take part in the key.
        df1 = df.with_columns(pl.col(col).str.slice(0, 10).alias(col))
        df1 = df1.select('dense_id', 'inames', col).unique().filter(pl.col(col).is_not_null() &(pl.col(col).str.strip_chars()!=''))
        inames = ['inames', col]
        df1 = df1.with_columns(pl.concat_str([pl.col(c) for c in inames], separator=" ", ignore_nulls=True).alias(col))
        # Tokenise on spaces, sort, and join the unique tokens per dense_id.
        df1 = df1.with_columns(pl.col(col).str.split(' ')).explode(col)
        df1 = df1.sort(col)
        df1 = df1.group_by('dense_id').agg(pl.col(col).unique().str.join('').alias(col))
        # Swap the raw column for the key.
        df = df.drop(col)
        df = df.join(df1, on = 'dense_id', how = 'left')
        df1 = df.filter(pl.col(col).is_not_null() &(pl.col(col).str.strip_chars()!=''))
        df2 = df1.select('name_id', col).unique()
        # Count = number of distinct name_ids sharing this key.
        df2 = df2.with_columns(pl.col(col).count().over(col).alias('Count'))
        df2 = df2.filter(pl.col('Count')>1).sort(col)
        df2 = df2.join(df2.group_by(col).agg(pl.col("name_id").min().alias("New_name_id")),on = col)
        df2 = df2.with_columns((pl.col("name_id") == pl.col("New_name_id")).alias("match"))
        df2 = df2.filter(pl.col('match')==False).select('name_id', 'New_name_id').sort('name_id').unique('name_id')
        df = df.join(df2, on = 'name_id', how = 'left')
        df = df.with_columns(pl.when(pl.col('New_name_id').is_not_null()).then(pl.col('New_name_id')).otherwise(pl.col('name_id')).alias('name_id'))
        df = df.drop(col, 'New_name_id')
    return df
511
+
512
def merging_on_dob(df, cols):
    """Merge ``name_id`` values that share name initials and a date value.

    For every column in *cols*, a key is built per ``dense_id`` from
    ``inames`` and the column value. Distinct ``name_id`` values with the
    same key are remapped to the smallest ``name_id`` of that group. The
    column is dropped once processed.

    Args:
        df: polars DataFrame containing ``dense_id``, ``name_id``,
            ``inames`` and every column in *cols*.
        cols: date column names, processed in order.

    Returns:
        The DataFrame with updated ``name_id`` and without the *cols* columns.
    """
    for col in cols:
        df = df.fill_null('')
        df1 = df.select('dense_id', 'inames', col).unique().filter(pl.col(col).is_not_null() &(pl.col(col).str.strip_chars()!=''))
        inames = ['inames', col]
        df1 = df1.with_columns(pl.concat_str([pl.col(c) for c in inames], separator=" ", ignore_nulls=True).alias(col))
        # Tokenise on spaces, sort, and join the unique tokens per dense_id.
        df1 = df1.with_columns(pl.col(col).str.split(' ')).explode(col)
        df1 = df1.sort(col)
        df1 = df1.group_by('dense_id').agg(pl.col(col).unique().str.join('').alias(col))
        # Swap the raw column for the key.
        df = df.drop(col)
        df = df.join(df1, on = 'dense_id', how = 'left')
        df1 = df.filter(pl.col(col).is_not_null() &(pl.col(col).str.strip_chars()!=''))
        df2 = df1.select('name_id', col).unique()
        # Count = number of distinct name_ids sharing this key.
        df2 = df2.with_columns(pl.col(col).count().over(col).alias('Count'))
        df2 = df2.filter(pl.col('Count')>1).sort(col)
        df2 = df2.join(df2.group_by(col).agg(pl.col("name_id").min().alias("New_name_id")),on = col)
        df2 = df2.with_columns((pl.col("name_id") == pl.col("New_name_id")).alias("match"))
        df2 = df2.filter(pl.col('match')==False).select('name_id', 'New_name_id').sort('name_id').unique('name_id')
        df = df.join(df2, on = 'name_id', how = 'left')
        df = df.with_columns(pl.when(pl.col('New_name_id').is_not_null()).then(pl.col('New_name_id')).otherwise(pl.col('name_id')).alias('name_id'))
        df = df.drop(col, 'New_name_id')
    return df
534
+
535
def merging_on_others(df, cols):
    """Merge ``name_id`` values on ``;``-separated columns, then finalise.

    For every column in *cols*, the value is split on ``;`` and each piece is
    combined with ``inames`` into a key per ``dense_id``. Distinct
    ``name_id`` values with the same key are remapped to the smallest
    ``name_id`` of that group. After all columns are processed, ``inames`` is
    dropped and one row per ``dense_id`` is kept.

    Args:
        df: polars DataFrame containing ``dense_id``, ``name_id``,
            ``inames`` and every column in *cols*.
        cols: column names to merge on, processed in order.

    Returns:
        One row per ``dense_id`` with the updated ``name_id``; *cols* and
        ``inames`` are removed.
    """
    for col in cols:
        df = df.fill_null('')
        df1 = df.select('dense_id', 'inames', col).unique().filter(pl.col(col).is_not_null() &(pl.col(col).str.strip_chars()!=''))
        inames = ['inames', col]
        # A cell may hold several ';'-separated values; handle each one.
        df1 = df1.with_columns(pl.col(col).str.split(';')).explode(col)
        df1 = df1.with_columns(pl.concat_str([pl.col(c) for c in inames], separator=" ", ignore_nulls=True).alias(col))
        # Tokenise on spaces, sort, and join the unique tokens per dense_id.
        df1 = df1.with_columns(pl.col(col).str.split(' ')).explode(col)
        df1 = df1.sort(col)
        df1 = df1.group_by('dense_id').agg(pl.col(col).unique().str.join('').alias(col))
        # Swap the raw column for the key.
        df = df.drop(col)
        df = df.join(df1, on = 'dense_id', how = 'left')
        df1 = df.filter(pl.col(col).is_not_null() &(pl.col(col).str.strip_chars()!=''))
        df2 = df1.select('name_id', col).unique()
        # Count = number of distinct name_ids sharing this key.
        df2 = df2.with_columns(pl.col(col).count().over(col).alias('Count'))
        df2 = df2.filter(pl.col('Count')>1).sort(col)
        df2 = df2.join(df2.group_by(col).agg(pl.col("name_id").min().alias("New_name_id")),on = col)
        df2 = df2.with_columns((pl.col("name_id") == pl.col("New_name_id")).alias("match"))
        df2 = df2.filter(pl.col('match')==False).select('name_id', 'New_name_id').sort('name_id').unique('name_id')
        df = df.join(df2, on = 'name_id', how = 'left')
        df = df.with_columns(pl.when(pl.col('New_name_id').is_not_null()).then(pl.col('New_name_id')).otherwise(pl.col('name_id')).alias('name_id'))
        df = df.drop(col, 'New_name_id')
    # inames is needed by every pass, so it is dropped only after the loop.
    df = df.drop('inames')
    df = df.unique('dense_id')
    return df
560
+
561
def name_checks(df1, names):
    """Flag ``name_id`` groups whose name variants differ too much.

    A canonical name is chosen per ``name_id``: the most frequent normalised
    variant, with ties broken by the longest. Every variant is compared with
    the canonical name using ``rapidfuzz.fuzz.token_sort_ratio``. A
    ``name_id`` with any variant scoring below 75 receives the comment
    ``NEEDS_MANUAL_INTERVENTION``.

    Args:
        df1: polars DataFrame containing ``name_id`` and the *names* columns
            as strings.
        names: three column names, ``[first, middle, last]``.

    Returns:
        *df1* with an added ``comment`` column that is null for groups
        needing no review.
    """
    df = df1.select(['name_id'] + names).unique()
    df = df.select([pl.col(col).str.to_uppercase().str.strip_chars().alias(col) for col in df.columns])
    df = df.fill_null("")
    df = df.with_columns(
        pl.concat_str(
            names,
            separator=" "
        )
        .str.to_uppercase()
        .str.replace_all(r"[^A-Z0-9 ]", "")
        .str.replace_all(r"\s+", " ")
        .str.strip_chars()
        .alias("norm_name")
    )
    canonical = (
        df
        .group_by(["name_id", "norm_name"])
        .agg([
            pl.len().alias("freq"),
            pl.col("norm_name").str.len_chars().max().alias("len"),
            pl.first(names[0]).alias("canon_first"),
            pl.first(names[1]).alias("canon_middle"),
            pl.first(names[2]).alias("canon_last"),
        ])
        # Most frequent, then longest, variant first within each name_id.
        .sort(
            by=["name_id", "freq", "len"],
            descending=[False, True, True]
        )
        .group_by("name_id")
        .first()
        .select([
            "name_id",
            "canon_first",
            "canon_middle",
            "canon_last",
        ])
    )
    df = df.join(canonical, on="name_id", how="left")
    df = df.with_columns([
        pl.struct(["norm_name", "canon_first", "canon_middle", "canon_last"])
        .map_elements(
            lambda x: fuzz.token_sort_ratio(
                x["norm_name"],
                f"{x['canon_first']} {x['canon_middle']} {x['canon_last']}".strip()
            ),
            # token_sort_ratio returns a float; declaring it avoids dtype
            # inference (and the associated warning) in map_elements.
            return_dtype=pl.Float64,
        )
        .alias("name_similarity")
    ])
    df = df.with_columns(
        pl.when(pl.col("name_similarity") >= 75)
        .then(pl.lit("AUTO_STANDARDIZED"))
        .otherwise(pl.lit("NEEDS_MANUAL_INTERVENTION"))
        .alias("comment")
    )
    df = df.filter(pl.col('comment')=='NEEDS_MANUAL_INTERVENTION').select('name_id', 'comment').unique()
    df1 = df1.join(df, on = 'name_id', how = 'left')
    return df1
617
+
618
def name_final(df, unique_id, names):
    """Pick one representative name per *unique_id*.

    Names are upper-cased and stripped, and the text ``UNKNOWN`` is removed
    from the first and last name. Variants are ranked by the number of
    non-empty name parts, then frequency, then normalised length (all
    descending), and the top variant per *unique_id* is kept.

    Args:
        df: polars DataFrame containing *unique_id* and *names* as strings.
        unique_id: name of the id column to group by.
        names: three column names, ``[first, middle, last]``.

    Returns:
        A DataFrame with *unique_id* and the three name columns, one row per
        id.
    """
    df = df.select([unique_id] + names).unique()
    df = df.select([pl.col(col).str.to_uppercase().str.strip_chars().alias(col) for col in df.columns])
    df = df.with_columns(pl.col(names[0]).str.replace_all(r"UNKNOWN", ""))
    df = df.with_columns(pl.col(names[2]).str.replace_all(r"UNKNOWN", ""))
    df = df.fill_null("")
    df = df.with_columns(
        pl.concat_str(
            names,
            separator=" "
        )
        .str.to_uppercase()
        .str.replace_all(r"[^A-Z0-9 ]", "")
        .str.replace_all(r"\s+", " ")
        .str.strip_chars()
        .alias("norm_name")
    )
    df = (
        df
        .group_by([unique_id, "norm_name"])
        .agg([
            pl.len().alias("freq"),
            pl.col("norm_name").str.len_chars().max().alias("len"),
            pl.first(names[0]).alias(names[0]),
            pl.first(names[1]).alias(names[1]),
            pl.first(names[2]).alias(names[2]),
        ])
        # countA = how many of the name parts are non-empty.
        .with_columns(pl.sum_horizontal([pl.when(pl.col(c).is_not_null() & (pl.col(c) != "")).then(1).otherwise(0) for c in names]).alias("countA"))
        .sort(["countA", "freq", "len"], descending=[True, True, True])
        .select([unique_id, names[0], names[1], names[2],])
        # After the sort, the first row of each group is the best variant.
        .group_by(unique_id).head(1)
    )
    return df
651
+
652
def address_final(df, unique_id, addresses):
    """Pick one representative address per *unique_id*.

    Candidate rows are ranked by how often the same id + address combination
    occurs, then by the number of non-empty address parts, then by the length
    of the combined text (all descending). The top row per id is kept.

    Args:
        df: polars DataFrame containing *unique_id* and *addresses*.
        unique_id: name of the id column to group by.
        addresses: list of address column names.

    Returns:
        A DataFrame with *unique_id* and the address columns, one row per id.
    """
    key_parts = [pl.col(c).fill_null("") for c in [unique_id]+ addresses]
    filled_flags = [
        pl.when(pl.col(c).is_not_null() & (pl.col(c) != "")).then(1).otherwise(0)
        for c in addresses
    ]

    scored = df.with_columns(pl.concat_str(key_parts, separator="|").alias("addr_concat"))
    scored = scored.with_columns(pl.sum_horizontal(filled_flags).alias("countA"))
    scored = scored.with_columns(pl.col("addr_concat").str.len_chars().alias("addr_len"))
    scored = scored.with_columns(pl.col("addr_concat").count().over("addr_concat").alias("countif"))

    ranked = scored.sort(["countif", "countA", "addr_len"], descending=[True, True, True])
    best = ranked.group_by(unique_id).head(1)
    return best.select([unique_id] + addresses)
664
+
665
def final_cel(df, unique_id, summary, names, addresses):
    """Collapse *df* to one row per *unique_id* and write a summary CSV.

    Name columns come from ``name_final`` and address columns from
    ``address_final``. Every other column is split on ``;``, de-duplicated,
    sorted and re-joined with ``;`` per id. A file ``summary_<MMDD_HHMM>.csv``
    is written to the current working directory. It pairs each value of the
    *summary* column with its count before (``TOTAL COUNT``) and after
    (``UNIQUE COUNT``) the collapse.

    Args:
        df: polars DataFrame with the full record set.
        unique_id: name of the id column to collapse on.
        summary: name of the ``;``-separated column to summarise.
        names: name columns passed to ``name_final``.
        addresses: address columns passed to ``address_final``.

    Returns:
        The collapsed DataFrame with the original column order, sorted by
        *unique_id*.
    """
    # Counts of each summary value before collapsing.
    raw_summary = df.select(summary)
    raw_summary = raw_summary.with_columns(pl.col(summary).str.split(';')).explode(summary)
    raw_summary = raw_summary.with_columns(pl.col(summary).count().over(summary).alias('TOTAL COUNT')).unique()
    final_names = name_final(df, unique_id, names)
    final_addresses = address_final(df, unique_id, addresses)
    # Remember the input column order so it can be restored at the end.
    final_cols = df.columns
    df1 = df
    df1 = df1.fill_null("")
    df = df.select(unique_id).unique()
    cols = [c for c in df1.columns if c not in names + addresses + [unique_id]]
    for col in cols:
        # Merge all distinct ';'-separated values of this column per id.
        df3 = df1.select([unique_id, col]).filter(pl.col(col) != '')
        df3 = df3.with_columns(pl.col(col).str.split(';')).explode(col)
        df3 = df3.unique()
        df3 = df3.group_by(unique_id).agg(
            pl.col(col).unique().sort().str.join(';').alias(col)
        )
        df = df.join(df3, on=unique_id, how='left')
    df = df.join(final_names, on = unique_id, how = 'left')
    df = df.join(final_addresses, on = unique_id, how = 'left')
    df = df.select(final_cols).sort(unique_id)
    # Counts of each summary value after collapsing.
    cel_summary = df.select(summary)
    cel_summary = cel_summary.with_columns(pl.col(summary).str.split(';')).explode(summary)
    cel_summary = cel_summary.with_columns(pl.col(summary).count().over(summary).alias('UNIQUE COUNT')).unique()
    final_summary = raw_summary.join(cel_summary, on = summary, how = 'left')
    now = datetime.now().strftime("%m%d_%H%M")
    final_summary.write_csv(f'summary_{now}.csv')
    return df
694
+
695
def y_columns(df, summary, y_cols):
    """Add one ``Y`` / blank indicator column per value in *y_cols*.

    The *summary* column is split on ``;`` (nulls are treated as empty). For
    each value in *y_cols*, a column of that name is added holding ``"Y"``
    when the value is one of the split items and ``""`` otherwise.

    Args:
        df: polars DataFrame containing *summary*.
        summary: name of the ``;``-separated column.
        y_cols: values to flag; each becomes a new column name.

    Returns:
        *df* with the indicator columns added.
    """
    items = pl.col(summary).fill_null("").str.split(";").alias("split_vals")
    indicators = [
        pl.when(pl.col("split_vals").list.contains(v))
        .then(pl.lit("Y"))
        .otherwise(pl.lit(""))
        .alias(v)
        for v in y_cols
    ]
    flagged = df.with_columns(items).with_columns(indicators)
    return flagged.drop("split_vals")
@@ -0,0 +1,26 @@
1
+ Metadata-Version: 2.4
2
+ Name: r3_test
3
+ Version: 0.0.1
4
+ Summary: Just for test
5
+ Author: Ranjeet Aloriya
6
+ License: MIT
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.12
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: numpy
14
+ Requires-Dist: pandas
15
+ Requires-Dist: polars
16
+ Requires-Dist: pyarrow
17
+ Requires-Dist: sqlalchemy
18
+ Requires-Dist: networkx
19
+ Requires-Dist: pyodbc
20
+ Requires-Dist: fastexcel
21
+ Requires-Dist: rapidfuzz
22
+ Requires-Dist: tqdm
23
+ Requires-Dist: openpyxl
24
+ Requires-Dist: xlrd
25
+ Requires-Dist: xlsxwriter
26
+ Dynamic: license-file
@@ -0,0 +1,10 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ r3_test/__init__.py
5
+ r3_test/main.py
6
+ r3_test.egg-info/PKG-INFO
7
+ r3_test.egg-info/SOURCES.txt
8
+ r3_test.egg-info/dependency_links.txt
9
+ r3_test.egg-info/requires.txt
10
+ r3_test.egg-info/top_level.txt
@@ -0,0 +1,13 @@
1
+ numpy
2
+ pandas
3
+ polars
4
+ pyarrow
5
+ sqlalchemy
6
+ networkx
7
+ pyodbc
8
+ fastexcel
9
+ rapidfuzz
10
+ tqdm
11
+ openpyxl
12
+ xlrd
13
+ xlsxwriter
@@ -0,0 +1 @@
1
+ r3_test
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+