r3-test 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- r3_test-0.0.1/LICENSE +0 -0
- r3_test-0.0.1/PKG-INFO +26 -0
- r3_test-0.0.1/README.md +0 -0
- r3_test-0.0.1/pyproject.toml +36 -0
- r3_test-0.0.1/r3_test/__init__.py +33 -0
- r3_test-0.0.1/r3_test/main.py +712 -0
- r3_test-0.0.1/r3_test.egg-info/PKG-INFO +26 -0
- r3_test-0.0.1/r3_test.egg-info/SOURCES.txt +10 -0
- r3_test-0.0.1/r3_test.egg-info/dependency_links.txt +1 -0
- r3_test-0.0.1/r3_test.egg-info/requires.txt +13 -0
- r3_test-0.0.1/r3_test.egg-info/top_level.txt +1 -0
- r3_test-0.0.1/setup.cfg +4 -0
r3_test-0.0.1/LICENSE
ADDED
|
File without changes
|
r3_test-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: r3_test
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Just for test
|
|
5
|
+
Author: Ranjeet Aloriya
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.12
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: numpy
|
|
14
|
+
Requires-Dist: pandas
|
|
15
|
+
Requires-Dist: polars
|
|
16
|
+
Requires-Dist: pyarrow
|
|
17
|
+
Requires-Dist: sqlalchemy
|
|
18
|
+
Requires-Dist: networkx
|
|
19
|
+
Requires-Dist: pyodbc
|
|
20
|
+
Requires-Dist: fastexcel
|
|
21
|
+
Requires-Dist: rapidfuzz
|
|
22
|
+
Requires-Dist: tqdm
|
|
23
|
+
Requires-Dist: openpyxl
|
|
24
|
+
Requires-Dist: xlrd
|
|
25
|
+
Requires-Dist: xlsxwriter
|
|
26
|
+
Dynamic: license-file
|
r3_test-0.0.1/README.md
ADDED
|
File without changes
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "r3_test"
|
|
3
|
+
version = "0.0.1"
|
|
4
|
+
description = "Just for test"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.12"
|
|
7
|
+
license = {text = "MIT"}
|
|
8
|
+
authors = [
|
|
9
|
+
{ name="Ranjeet Aloriya" }
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
dependencies = [
|
|
13
|
+
"numpy",
|
|
14
|
+
"pandas",
|
|
15
|
+
"polars",
|
|
16
|
+
"pyarrow",
|
|
17
|
+
"sqlalchemy",
|
|
18
|
+
"networkx",
|
|
19
|
+
"pyodbc",
|
|
20
|
+
"fastexcel",
|
|
21
|
+
"rapidfuzz",
|
|
22
|
+
"tqdm",
|
|
23
|
+
"openpyxl",
|
|
24
|
+
"xlrd",
|
|
25
|
+
"xlsxwriter"
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
classifiers = [
|
|
29
|
+
"Programming Language :: Python :: 3",
|
|
30
|
+
"License :: OSI Approved :: MIT License",
|
|
31
|
+
"Operating System :: OS Independent",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[build-system]
|
|
35
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
36
|
+
build-backend = "setuptools.build_meta"
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from .main import (
|
|
2
|
+
help,
|
|
3
|
+
csvtsv_to_excel,
|
|
4
|
+
copy_files,
|
|
5
|
+
move_files,
|
|
6
|
+
copy_files_without_ext,
|
|
7
|
+
df_info,
|
|
8
|
+
get_segment,
|
|
9
|
+
arrange_segment,
|
|
10
|
+
split_columns,
|
|
11
|
+
split_full_name,
|
|
12
|
+
ra_replace_chars,
|
|
13
|
+
excel_compile_without_header,
|
|
14
|
+
csv_compile,
|
|
15
|
+
parquet_compile,
|
|
16
|
+
batch_processing,
|
|
17
|
+
table_from_sql,
|
|
18
|
+
table_to_sql,
|
|
19
|
+
dense_id,
|
|
20
|
+
name_id,
|
|
21
|
+
dob_normalize,
|
|
22
|
+
demerge,
|
|
23
|
+
initial_names,
|
|
24
|
+
merging_on_ssntin,
|
|
25
|
+
merging_on_address,
|
|
26
|
+
merging_on_dob,
|
|
27
|
+
merging_on_others,
|
|
28
|
+
name_checks,
|
|
29
|
+
name_final,
|
|
30
|
+
address_final,
|
|
31
|
+
final_cel,
|
|
32
|
+
y_columns
|
|
33
|
+
)
|
|
@@ -0,0 +1,712 @@
|
|
|
1
|
+
import re, os, sys, csv, shutil, networkx, warnings, pyodbc, urllib,sqlalchemy, pyarrow, fastexcel
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import polars as pl
|
|
5
|
+
from time import time
|
|
6
|
+
from tqdm import tqdm
|
|
7
|
+
from io import StringIO
|
|
8
|
+
from rapidfuzz import fuzz
|
|
9
|
+
import multiprocessing as mp
|
|
10
|
+
from openpyxl import workbook
|
|
11
|
+
from types import CoroutineType
|
|
12
|
+
from datetime import timedelta, datetime
|
|
13
|
+
from sqlalchemy import create_engine, event
|
|
14
|
+
|
|
15
|
+
def help():
    """Print the package greeting and support-contact message.

    The greeting addresses the current user. ``os.getlogin()`` only works
    when the process has a controlling terminal, so it raises ``OSError`` in
    notebooks, services and scheduled jobs; in that case the name is taken
    from ``getpass.getuser()``, and a neutral word is used if that fails too.

    Note: this function intentionally keeps its public name even though it
    shadows the built-in ``help`` inside this module.
    """
    import getpass

    try:
        user = os.getlogin()
    except OSError:
        try:
            user = getpass.getuser()
        except (KeyError, OSError, ImportError):
            # No resolvable account name at all: still print the message.
            user = "there"

    print(f"""Hello {user.title()}!\U0001F60A,

Thank you for choosing the Notification_List package. We sincerely appreciate your support.

Should you require any assistance or have any questions, please do not hesitate to reach out to -
Ranjeet Aloriya at +91 940.660.6239 or ranjeet.aloriya@gmail.com.
We are here to help!

Cheers!
Ranjeet Aloriya""")
|
|
26
|
+
|
|
27
|
+
def csvtsv_to_excel(folder_path):
    """Convert each CSV/TSV file directly inside *folder_path* to ``.xlsx``.

    Files are read with every column as text (``dtype=str``) using the
    ``latin`` encoding and written next to the source file with the same
    base name. Sub-directories and files with other extensions are skipped.
    The extension test is case-insensitive so ``DATA.CSV`` is converted too,
    and files are handled in sorted order so progress output is repeatable.

    Args:
        folder_path: Directory whose immediate files are converted.
    """
    converted = 0
    for file in sorted(os.listdir(folder_path)):
        filename = os.path.join(folder_path, file)
        if not os.path.isfile(filename):
            continue

        lowered = file.lower()
        if lowered.endswith('.csv'):
            delimiter = ','
        elif lowered.endswith('.tsv'):
            delimiter = '\t'
        else:
            continue  # skip non-csv/tsv files

        df = pd.read_csv(filename, dtype=str, delimiter=delimiter, encoding='latin')
        converted += 1
        output_file = os.path.splitext(filename)[0] + ".xlsx"
        df.to_excel(output_file, index=False)
        sys.stdout.write(f"\rFile No. {converted} - {file} Processing")
        sys.stdout.flush()

    print(f"\nConversion completed. {converted} files processed.")
|
|
47
|
+
|
|
48
|
+
def copy_files(file):
    """Copy the files listed in a CSV manifest.

    The manifest is read positionally: column 0 is the file name, column 1
    the source folder and column 2 the destination folder. All columns are
    read as text so that numeric-looking names are not turned into numbers.
    Destination folders are created on demand.

    A file that cannot be copied (missing source, permission problem, ...)
    is skipped and reported after the loop; the progress counter only counts
    files that were actually copied.

    Args:
        file: Path of the CSV manifest.
    """
    df = pl.read_csv(file, infer_schema_length=0)
    copied = 0
    failures = []
    for row in df.iter_rows():
        file_name, source_folder, destination_folder = row[0], row[1], row[2]
        source_path = os.path.join(source_folder, file_name)
        destination_path = os.path.join(destination_folder, file_name)
        try:
            os.makedirs(destination_folder, exist_ok=True)
            shutil.copy2(source_path, destination_path)
        except OSError as exc:
            # Best effort: remember the failure and keep going.
            failures.append((source_path, exc))
        else:
            copied += 1
        sys.stdout.write(f"\rFiles Copied - {copied}/{df.height} ")
        sys.stdout.flush()

    for source_path, exc in failures:
        sys.stdout.write(f"\nNot copied: {source_path} ({exc})")
    sys.stdout.flush()
|
|
65
|
+
|
|
66
|
+
def move_files(file):
    """Move the files listed in a CSV manifest.

    The manifest is read positionally: column 0 is the file name, column 1
    the source folder and column 2 the destination folder. All columns are
    read as text so that numeric-looking names are not turned into numbers.
    Destination folders are created on demand.

    A file that cannot be moved (missing source, name clash, permission
    problem, ...) is skipped and reported after the loop; the progress
    counter only counts files that were actually moved.

    Args:
        file: Path of the CSV manifest.
    """
    df = pl.read_csv(file, infer_schema_length=0)
    moved = 0
    failures = []
    for row in df.iter_rows():
        file_name, source_folder, destination_folder = row[0], row[1], row[2]
        source_path = os.path.join(source_folder, file_name)
        destination_path = os.path.join(destination_folder, file_name)
        try:
            os.makedirs(destination_folder, exist_ok=True)
            shutil.move(source_path, destination_path)
        except OSError as exc:
            # shutil.Error derives from OSError, so clashes are caught here too.
            failures.append((source_path, exc))
        else:
            moved += 1
        sys.stdout.write(f"\rFiles Moved - {moved}/{df.height} ")
        sys.stdout.flush()

    for source_path, exc in failures:
        sys.stdout.write(f"\nNot moved: {source_path} ({exc})")
    sys.stdout.flush()
|
|
83
|
+
|
|
84
|
+
def copy_files_without_ext(csv_file):
    """Copy files named in a CSV manifest, matching on the name without extension.

    Each manifest row is unpacked as (file name, source folder, destination
    folder) after converting the values to ``str``. The source folder is
    walked recursively and the first file whose stem equals the requested
    name (case-insensitively) is copied into the destination folder, which is
    created if needed. Names with no match are written to a timestamped
    ``Not_Found_Copying_*.csv`` file in the current working directory.

    Args:
        csv_file: Path of the CSV manifest.
    """
    manifest = pl.read_csv(csv_file)
    total = len(manifest)
    not_found = []
    copied_count = 0

    def first_match(stem, folder):
        """Return (directory, file) of the first walked file with this stem, or None."""
        for root, _dirs, files in os.walk(folder):
            for candidate in files:
                name, _ext = os.path.splitext(candidate)
                if name.lower() == stem.lower():
                    return root, candidate
        return None

    for i, row in enumerate(manifest.iter_rows(), start=1):
        file_name, source_folder, destination_folder = map(str, row)
        hit = first_match(file_name, source_folder)
        if hit is None:
            not_found.append([file_name])
        else:
            root, matched = hit
            os.makedirs(destination_folder, exist_ok=True)
            shutil.copy2(os.path.join(root, matched), os.path.join(destination_folder, matched))
            copied_count += 1
        sys.stdout.write(f"\rProgress: {i}/{total} processed, {copied_count} copied")
        sys.stdout.flush()

    if not not_found:
        print(f"\nSummary: All {total} files copied successfully ✅")
        return

    ts = datetime.now().strftime("%m%d%y%H%M%S")
    nf_file = f"Not_Found_Copying_{ts}.csv"
    pl.DataFrame(not_found, schema=["FileName"]).write_csv(nf_file)
    print(f"\nSummary: {copied_count}/{total} copied, {len(not_found)} not found (saved in {nf_file})")
|
|
118
|
+
|
|
119
|
+
def df_info(file):
    """Summarise the columns of a CSV file.

    Args:
        file: Path of the CSV file to inspect.

    Returns:
        A polars DataFrame with one row per column of the file and the
        columns ``Column Name``, ``Data Type`` (the polars dtype as text),
        ``Non-Null Count`` and ``Unique Count``. The original built this
        frame but dropped it, so callers always received ``None``.
    """
    df = pl.read_csv(file)
    summary = [
        {
            "Column Name": col,
            "Data Type": str(df.schema[col]),
            "Non-Null Count": len(df[col].drop_nulls()),
            "Unique Count": df[col].n_unique(),
        }
        for col in df.columns
    ]
    return pl.DataFrame(summary)
|
|
133
|
+
|
|
134
|
+
def get_segment(f, sep = '~'):
    """Read a text file and return its ``*``-bearing segments as a one-column frame.

    The file is decoded as ASCII with undecodable bytes dropped, all newlines
    are removed, and the remaining text is split on *sep*. Only pieces that
    contain a literal ``*`` are kept.

    Args:
        f: Path of the file to read.
        sep: Segment terminator to split on.

    Returns:
        A polars DataFrame with a single ``Segment`` column.
    """
    with open(f, 'r', encoding='ascii', errors='ignore') as handle:
        text = handle.read()
    # Removing every "\n" also covers the doubled "\n\n" case.
    pieces = text.replace("\n", "").split(sep)
    frame = pl.DataFrame({'Segment': pieces})
    return frame.filter(pl.col("Segment").str.contains("*", literal=True))
|
|
143
|
+
|
|
144
|
+
def arrange_segment(df, column_name = "Segment", sep = "*"):
    """Pivot a column of ``PREFIX<sep>data`` strings into one row per record.

    Values are consumed in order. Each value is split at the first *sep*
    into a prefix and its data, and stored under the prefix in the current
    record. When a prefix repeats, the current record is closed and a new one
    started. Missing prefixes in a record end up as empty strings.

    Null values are skipped, and a value without *sep* is stored with empty
    data instead of raising on tuple unpacking.

    Args:
        df: Frame holding the segment column.
        column_name: Name of the column to read.
        sep: Separator between the prefix and the rest of the segment.

    Returns:
        A polars DataFrame with one column per distinct prefix.
    """
    rows = []
    current = {}
    for value in df[column_name]:
        if value is None:
            continue
        prefix, _, data = value.partition(sep)
        if prefix in current:
            # Same prefix seen again: the previous record is complete.
            rows.append(current)
            current = {}
        current[prefix] = data
    if current:
        rows.append(current)
    return pl.DataFrame(rows).fill_null("")
|
|
157
|
+
|
|
158
|
+
def split_columns(df, sep='\\*'):
    """Split every column on *sep* into numbered sub-columns.

    The frame is converted to pandas, and each column ``X`` is replaced by
    ``X_1 .. X_k`` where ``k`` is one more than the largest number of *sep*
    matches found in that column. Missing values are treated as empty strings
    before splitting.

    Args:
        df: polars DataFrame to split.
        sep: Pattern passed to the pandas string ``count`` and ``split`` calls.

    Returns:
        A polars DataFrame containing only the generated sub-columns.
    """
    frame = df.to_pandas()
    for column in frame.columns:
        frame[column] = frame[column].fillna('')
        values = frame[column]
        piece_count = int(values.str.count(sep).max() + 1)
        pieces = values.str.split(sep, expand=True)
        pieces.columns = [f"{column}_{n + 1}" for n in range(piece_count)]
        frame = frame.drop(column, axis=1).join(pieces)
    return pl.from_pandas(frame)
|
|
170
|
+
|
|
171
|
+
def split_full_name(df, full_name, suffixes):
    """Split a full-name column into first / middle / last / suffix columns.

    The name is cut at its first comma. With a non-empty part after the
    comma the text before it is the last name and the text after it holds
    first and middle names ("Last, First Middle"). Otherwise the words are
    read as "First [Middle ...] Last". A word whose upper-cased form is found
    in *suffixes* is removed from the name and reported as the suffix.

    Fixes over the previous version:
      * null names produce four empty strings instead of raising;
      * everything after the first comma is kept (``splitn``), so a suffix
        written after a second comma ("Smith, John, Jr") is no longer lost.

    Args:
        df: polars DataFrame containing the name column.
        full_name: Name of the column to split.
        suffixes: Container of upper-case suffixes to recognise.

    Returns:
        *df* with ``split_first_name``, ``split_middle_name``,
        ``split_last_name`` and ``split_suffix`` added.
    """
    def clean_and_split(text):
        return text.replace(",", "").split()

    def extract_suffix(words):
        for i, word in enumerate(words):
            if word.upper() in suffixes:
                return word.upper(), words[:i] + words[i+1:]
        return "", words

    def parse_name(part1, part2):
        first = middle = last = suffix = ""
        # Null names arrive as None; treat them as empty text.
        part1 = part1 or ""
        part2 = part2 or ""
        if part2:
            last_words = clean_and_split(part1)
            suffix, last_words = extract_suffix(last_words)
            last = " ".join(last_words)
            name_words = clean_and_split(part2)
            sfx2, name_words = extract_suffix(name_words)
            suffix = suffix or sfx2
            if name_words:
                first = name_words[0]
                if len(name_words) > 1:
                    middle = " ".join(name_words[1:])
        else:
            words = clean_and_split(part1)
            suffix, words = extract_suffix(words)
            if len(words) == 1:
                first = words[0]
            elif len(words) == 2:
                first, last = words
            elif len(words) > 2:
                first = words[0]
                last = words[-1]
                middle = " ".join(words[1:-1])
        return [first, middle, last, suffix]

    return (
        df
        # splitn keeps the remainder of the string in the last field.
        .with_columns(pl.col(full_name).str.splitn(",", 2).alias("_split"))
        .with_columns([
            pl.col("_split").struct.field("field_0").str.strip_chars().alias("_part1"),
            pl.col("_split").struct.field("field_1").str.strip_chars().fill_null("").alias("_part2")
        ])
        .with_columns(pl.struct(["_part1", "_part2"]).map_elements(
            lambda row: parse_name(row["_part1"], row["_part2"]),
            return_dtype=pl.List(pl.Utf8)
        ).alias("_parsed"))
        .with_columns([
            pl.col("_parsed").list.get(0).alias("split_first_name"),
            pl.col("_parsed").list.get(1).alias("split_middle_name"),
            pl.col("_parsed").list.get(2).alias("split_last_name"),
            pl.col("_parsed").list.get(3).alias("split_suffix"),
        ])
        .drop(["_split", "_part1", "_part2", "_parsed"])
    )
|
|
224
|
+
|
|
225
|
+
def ra_replace_chars(df, column, cleaning_dict):
    """Apply a sequence of pattern replacements to one column, then trim it.

    Args:
        df: polars DataFrame to transform.
        column: Name of the string column to clean.
        cleaning_dict: Mapping of pattern -> replacement, applied in the
            mapping's iteration order via ``str.replace_all``.

    Returns:
        *df* with *column* replaced by its cleaned, stripped version.
    """
    cleaned = pl.col(column)
    for pattern in cleaning_dict:
        cleaned = cleaned.str.replace_all(pattern, cleaning_dict[pattern])
    cleaned = cleaned.str.strip_chars().alias(column)
    return df.with_columns(cleaned)
|
|
230
|
+
|
|
231
|
+
def excel_compile_without_header(path, f):
    """Stack every sheet of an Excel workbook into one frame, without headers.

    All sheets are loaded in a single ``read_excel`` call (``sheet_id=0``),
    with every cell read as text. Each sheet's rows are tagged with the
    workbook file name and the sheet name, and the sheets are concatenated
    diagonally so that sheets of different widths can be combined.

    The previous version loaded all sheets and then read the workbook again
    once per sheet with the same options; the frames from the first call are
    now used directly.

    Args:
        path: Folder containing the workbook.
        f: Workbook file name; also written to the ``FileName`` column.

    Returns:
        A polars DataFrame with ``FileName``, ``SheetName`` and the sheet
        columns; an empty frame if the workbook yields no sheets.
    """
    file = os.path.join(path, f)
    sheets = pl.read_excel(file, has_header=False, sheet_id=0, raise_if_empty=False, infer_schema_length=0)

    tag_columns = ['FileName', 'SheetName']
    frames = []
    for sheet, df in sheets.items():
        df = df.with_columns(pl.lit(f).alias('FileName'))
        df = df.with_columns(pl.lit(sheet).alias('SheetName'))
        df = df.select(tag_columns + [col for col in df.columns if col not in tag_columns])
        frames.append(df)

    if not frames:
        return pl.DataFrame()
    return pl.concat(frames, how='diagonal')
|
|
242
|
+
|
|
243
|
+
def csv_compile(path, f):
    """Load one CSV file as text columns and tag its rows with the file name.

    Args:
        path: Folder containing the file.
        f: File name; also written to the ``FileName`` column.

    Returns:
        A polars DataFrame with the file's columns plus ``FileName``.
    """
    source = os.path.join(path, f)
    loaded = pl.read_csv(source, raise_if_empty=False, infer_schema_length=0)
    tagged = loaded.with_columns(pl.lit(f).alias('FileName'))
    return pl.concat([pl.DataFrame(), tagged], how='diagonal')
|
|
250
|
+
|
|
251
|
+
def parquet_compile(path, f):
    """Load one Parquet file and tag its rows with the file name.

    Args:
        path: Folder containing the file.
        f: File name; also written to the ``FileName`` column.

    Returns:
        A polars DataFrame with the file's columns plus ``FileName``.
    """
    source = os.path.join(path, f)
    tagged = pl.read_parquet(source).with_columns(pl.lit(f).alias('FileName'))
    return pl.concat([pl.DataFrame(), tagged], how='diagonal')
|
|
258
|
+
|
|
259
|
+
def batch_processing(path, processing_function, b):
    """Run a per-file loader over a folder in batches and save each batch.

    The files directly inside *path* are processed in sorted order, *b* at a
    time. Every file is passed to ``processing_function(path, file_name)``
    and the returned frames are concatenated diagonally. Each batch is
    written to ``<path>/output1/<function name>_Batch_<NNN>.parquet``.

    A file whose processing (or concatenation) fails is skipped. The error
    is printed on its own line so that the next progress update does not
    overwrite it.

    Args:
        path: Folder containing the input files.
        processing_function: Callable taking ``(path, file_name)`` and
            returning a polars DataFrame.
        b: Number of files per batch; must be at least 1.

    Raises:
        ValueError: If *b* is smaller than 1.
    """
    if b < 1:
        raise ValueError("b (files per batch) must be at least 1")

    files = sorted(f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)))
    # functools.partial objects and some callables have no __name__.
    func_name = getattr(processing_function, '__name__', 'processing_function')
    output_folder = os.path.join(path, 'output1')

    for j, start in enumerate(range(0, len(files), b), start=1):
        my_df = pl.DataFrame()
        k = 0
        for f in files[start:start + b]:
            try:
                df = processing_function(path, f)
                my_df = pl.concat([my_df, df], how='diagonal')
            except Exception as e:
                # Per-file boundary: report and continue with the next file.
                sys.stdout.write(f"\r⚠️ Skipping file due to error: {f}: {e} \n")
                sys.stdout.flush()
                continue
            k += 1
            sys.stdout.write(f"\rFile No. {k} - Processed of Batch No. {j} ")
            sys.stdout.flush()

        sys.stdout.write(f"\rBatch No. {j} - Processed ")
        sys.stdout.flush()
        os.makedirs(output_folder, exist_ok=True)
        output_file_path = os.path.join(output_folder, f'{func_name}_Batch_{j:03d}.parquet')
        my_df.write_parquet(output_file_path)

    sys.stdout.write("\rAll Batches are Processed ")
    sys.stdout.flush()
|
|
288
|
+
|
|
289
|
+
def table_from_sql(server, database, table):
    """Download a whole SQL Server table into a polars DataFrame.

    Connects with ODBC Driver 17 using a trusted (Windows) connection and
    reads ``select * from <table>`` in batches of 10000 rows.

    The connection is now always closed, including when the download fails.
    Batches are collected and concatenated once at the end instead of
    re-copying the accumulated frame for every batch.

    Args:
        server: SQL Server host or instance name.
        database: Database name.
        table: Table name. It is interpolated into the query text because
            identifiers cannot be bound as parameters, so it must come from
            a trusted source.

    Returns:
        A polars DataFrame with the table's rows; empty if the table is.
    """
    connection = pyodbc.connect(
        f'DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={server};DATABASE={database};Trusted_Connection=yes;'
    )
    batches = []
    row_count = 0
    try:
        sys.stdout.write(f"\rConnection successful! ")
        sys.stdout.flush()
        query = f"select * from {table}"
        for df in pl.read_database(query, connection=connection, iter_batches=True, batch_size=10000, infer_schema_length=0):
            batches.append(df)
            row_count += df.height
            sys.stdout.write(f"\rDownloading Raws: {row_count} ")
            sys.stdout.flush()
    finally:
        connection.close()

    my_df = pl.concat(batches, how='diagonal') if batches else pl.DataFrame()
    sys.stdout.write(f"\rSaved as polars DataFrame - Total Raws: {my_df.shape[0]} ")
    sys.stdout.flush()
    return my_df
|
|
305
|
+
|
|
306
|
+
def table_to_sql(server, database, table, df):
    """Write a polars DataFrame to a SQL Server table in the ``dbo`` schema.

    Connects with ODBC Driver 17 using a trusted (Windows) connection. The
    frame is converted to pandas and written with ``if_exists="replace"``,
    so an existing table with the same name is dropped and recreated.

    ``quote_plus`` is imported explicitly: the module only imports
    ``urllib``, and ``urllib.parse`` is a submodule that is not guaranteed to
    be loaded by that import. The engine's connections are released when the
    write finishes or fails.

    Args:
        server: SQL Server host or instance name.
        database: Database name.
        table: Destination table name.
        df: polars DataFrame to write.
    """
    from urllib.parse import quote_plus

    params = quote_plus(
        f"DRIVER={{ODBC Driver 17 for SQL Server}};"
        f"SERVER={server};"
        f"DATABASE={database};"
        "Trusted_Connection=yes;"
    )

    engine = create_engine(f"mssql+pyodbc:///?odbc_connect={params}")
    try:
        df = df.to_pandas()
        df.to_sql(table, con=engine, index=False, if_exists="replace", schema="dbo")
    finally:
        engine.dispose()
    sys.stdout.write(f"\rSaved {table} in {database} - Total Raws: {len(df)} ")
    sys.stdout.flush()
|
|
319
|
+
|
|
320
|
+
def dense_id(df, cols):
    """Add a ``dense_id`` column that is equal for rows equal on *cols*.

    The id is the dense rank of a key built from the *cols* values. The key
    parts are joined with the ASCII unit separator, so adjacent values cannot
    run together: with an empty separator ("ab", "c") and ("a", "bc") both
    produced "abc" and wrongly shared an id. Nulls count as empty strings.
    The key is an expression, not a temporary column, so an input column
    named ``AllData`` is no longer overwritten and dropped.

    Every column of the result, including ``dense_id``, is cast to text. The
    number of distinct ids is printed.

    Args:
        df: polars DataFrame to label.
        cols: Names of the columns that define row identity.

    Returns:
        *df* with nulls in string columns filled with "", all columns cast to
        text, and ``dense_id`` added.
    """
    df = df.fill_null('')
    key = pl.concat_str(
        [pl.col(c).cast(pl.Utf8).fill_null('') for c in cols],
        separator="\x1f",
    )
    df = df.with_columns(key.rank(method="dense").alias("dense_id"))
    df = df.with_columns([pl.col(c).cast(pl.Utf8) for c in df.columns])
    print(f"Max Records for CEL V1 - {len(df.select('dense_id').unique())}")
    return df
|
|
328
|
+
|
|
329
|
+
def name_id(df, names):
    """Assign a ``name_id`` that groups rows whose names look like the same person.

    ``names`` must hold at least three column names; they are used as
    ``names[0]``, ``names[1]`` and ``names[2]`` (presumably first, middle and
    last name — confirm against callers). The id is built in stages:

    1. rows are grouped greedily when one row's set of name tokens is a
       subset or superset of a group's tokens;
    2. rows with the same names once all whitespace is removed share an id;
    3. rows with the same set of name tokens, regardless of order, share an id;
    4. rows whose names contain "unknown" get ids above the current maximum;
    5. ids whose ``names[0]`` + ``names[2]`` token sets match are merged onto
       the smallest id.

    The set of non-letter characters found in the names is printed.

    Returns:
        *df* with a ``name_id`` column added; helper columns are dropped.
    """
    # Lower-cased, trimmed names joined by single spaces.
    df = df.with_columns(
        (
            (pl.col(names[0]).fill_null("").str.strip_chars().str.to_lowercase() + " " +
             pl.col(names[1]).fill_null("").str.strip_chars().str.to_lowercase() + " " +
             pl.col(names[2]).fill_null("").str.strip_chars().str.to_lowercase())
            .str.replace_all(r"\s+", " ")
            .str.strip_chars()
            .alias("full_name_norm")
        )
    )
    df = df.with_columns(
        pl.col("full_name_norm").str.split(" ").alias("tokens")
    )
    unique_entities = []
    entity_ids = []

    # Greedy pass in row order: a row joins the FIRST existing entity whose
    # token set contains, or is contained in, the row's token set; that
    # entity's token set is then widened with the row's tokens. Every row is
    # compared against every entity seen so far.
    for tokens in df["tokens"]:
        token_set = set(tokens)
        found = False
        for idx, u_tokens in enumerate(unique_entities):
            if token_set <= u_tokens or u_tokens <= token_set:
                entity_ids.append(idx)
                unique_entities[idx] |= token_set
                found = True
                break
        if not found:
            unique_entities.append(token_set)
            entity_ids.append(len(unique_entities)-1)

    df = df.with_columns(pl.Series("entity_id", entity_ids))

    # Names concatenated with no separator and all whitespace removed; rows
    # sharing that string take the first entity_id seen for it.
    df = df.with_columns(pl.concat_str([pl.col(c) for c in names], separator="", ignore_nulls=True).alias('FullName'))
    df = df.with_columns(pl.col("FullName").str.replace_all(r"\s+", ""))
    name_to_entity = df.group_by("FullName").agg(
        pl.first("entity_id").alias("entity_id1")
    )
    df = df.join(name_to_entity, on="FullName", how="left")

    # Rebuild FullName with spaces and collect every character that is not
    # an ASCII letter or a space.
    df = df.with_columns(pl.concat_str([pl.col(c) for c in names], separator=" ", ignore_nulls=True).alias('FullName'))
    df = df.with_columns(
        pl.col("FullName")
        .str.extract_all(r"[^a-zA-Z ]")
        .alias("sp_chars")
    )
    sp_chars = set(char for sublist in df["sp_chars"].to_list() for char in sublist)
    print(f"Special chars in name: {sp_chars}")
    # NOTE(review): when no special characters are found the pattern is "[]";
    # confirm the regex engine accepts an empty character class.
    pattern = "[" + re.escape("".join(sp_chars)) + "]"
    df = df.with_columns(pl.col("FullName").str.replace_all(pattern, " ").alias("FullName"))
    # Per-row key made of the row's distinct name tokens, sorted and joined,
    # so that word order does not matter.
    df = df.with_columns(pl.arange(0, df.height, 1).alias("name_index"))
    df3 = df.select(['name_index', 'FullName'])
    df3 = df3.with_columns(pl.col('FullName').str.split(' ')).explode('FullName')
    df3 = df3.unique()
    df3 = df3.group_by('name_index').agg(
        pl.col('FullName').unique().sort().str.join('').alias('FullName1')
    )
    df = df.join(df3, on='name_index', how='left')
    name_to_entity = df.group_by("FullName1").agg(
        pl.first("entity_id1").alias("name_id")
    )
    df = df.join(name_to_entity, on="FullName1", how="left")
    df = df.drop(['full_name_norm', 'tokens', 'entity_id', 'FullName', 'entity_id1', 'sp_chars', 'name_index', 'FullName1'])
    # Rows whose combined names contain "unknown" must not be grouped with
    # each other: each gets a value from a range that starts above max_id.
    unknown_expr = (
        pl.concat_str([pl.col(c) for c in names], separator=" ", ignore_nulls=True)
        .str.to_lowercase()
        .str.contains("unknown")
    )
    max_id = df.select(pl.col("name_id").max()).item()
    df = df.with_columns(
        pl.when(unknown_expr)
        .then(pl.arange(max_id + 1, max_id + 1 + df.height))
        .otherwise(pl.col("name_id"))
        .alias("name_id")
    )
    # Merge ids whose names[0] + names[2] values contain the same tokens.
    full_name = [names[0], names[2]]
    df1 = df.with_columns(pl.concat_str([pl.col(c) for c in full_name], separator=" ", ignore_nulls=True).alias('FullName'))
    df2 = df1.with_columns(pl.col('FullName').alias('Reverse_Name'))
    df2 = df2.select('FullName', 'Reverse_Name').unique()
    df2 = df2.with_columns(pl.col('Reverse_Name').str.split(' ')).explode('Reverse_Name')
    df2 = df2.sort('Reverse_Name')
    # NOTE(review): unlike FullName1 above, unique() is not followed by
    # sort() here, so the token order in Reverse_Name relies on unique()
    # keeping the sorted order — confirm.
    df2 = df2.group_by('FullName').agg(pl.col('Reverse_Name').unique().str.join('').alias('Reverse_Name'))
    df1 = df1.join(df2, on ='FullName', how = 'left')
    df2 = df1.select('name_id', 'Reverse_Name').unique()
    # Keep only Reverse_Name values shared by more than one name_id.
    df2 = df2.with_columns(pl.col('Reverse_Name').count().over('Reverse_Name').alias('Count'))
    df2 = df2.filter(pl.col('Count')>1).sort('Reverse_Name')
    df2 = df2.filter(pl.col('Reverse_Name')!='unknown')
    # The smallest name_id of each shared Reverse_Name becomes the target id.
    df2 = df2.join(df2.group_by("Reverse_Name").agg(pl.col("name_id").min().alias("New_name_id")),on = 'Reverse_Name')
    df2 = df2.with_columns((pl.col("name_id") == pl.col("New_name_id")).alias("match"))
    df2 = df2.filter(pl.col('match')==False).select('name_id', 'New_name_id').unique('name_id')
    df1 = df1.join(df2, on = 'name_id', how = 'left')
    df1 = df1.with_columns(pl.when(pl.col('New_name_id').is_not_null()).then(pl.col('New_name_id')).otherwise(pl.col('name_id')).alias('name_id'))
    df = df1.drop('FullName', 'Reverse_Name', 'New_name_id')
    return df
|
|
422
|
+
|
|
423
|
+
def dob_normalize(df, dates):
    """Replace each date column with an order-insensitive digit key.

    For every column in *dates* the text is trimmed, ``-``, ``.`` and spaces
    are turned into ``/``, and the value is parsed as ``%m/%d/%Y`` or, failing
    that, ``%m/%d/%y``. The parsed date is formatted as ``MM/DD/YYYY``, split
    on ``/``, and its parts are sorted and joined without a separator. Values
    that cannot be parsed become null. The rebuilt column is placed at the
    end of the frame.

    Args:
        df: polars DataFrame to transform.
        dates: Names of the date columns to normalise.

    Returns:
        The transformed DataFrame.
    """
    formats = ["%m/%d/%Y", "%m/%d/%y"]
    for column in dates:
        slashed = (
            pl.col(column).cast(pl.Utf8).str.strip_chars()
            .str.replace_all(r"[-. ]", "/")
        )
        parsed = pl.coalesce([slashed.str.to_date(fmt, strict=False) for fmt in formats])
        df = df.with_columns(parsed.dt.strftime("%m/%d/%Y").alias(column))
        df = df.with_columns(pl.col(column).alias('Cleaned_DOB'))

        # One row per distinct formatted date, then rebuild it from its parts.
        lookup = df.select('Cleaned_DOB', column).unique()
        lookup = lookup.filter(pl.col(column).is_not_null())
        lookup = lookup.with_columns(pl.col(column).str.split('/')).explode(column)
        lookup = lookup.group_by('Cleaned_DOB').agg(
            pl.col(column).sort().str.join('').alias(column)
        )

        df = df.drop(column)
        df = df.join(lookup, on = 'Cleaned_DOB', how = 'left')
        df = df.drop('Cleaned_DOB')
    return df
|
|
440
|
+
|
|
441
|
+
def demerge(df, hard_cols):
    """Return the rows of every ``name_id`` that looks wrongly merged.

    For each column in *hard_cols*, a ``name_id`` is flagged when it holds
    more than one distinct non-empty value in that column and at least one of
    those values differs from its closest sibling value in more than two
    character positions.

    Args:
        df: polars DataFrame with a ``name_id`` column and the *hard_cols*.
        hard_cols: Column names to check. The first one is lower-cased,
            trimmed and mapped through ``suffix_map`` before the check.

    Returns:
        The rows of *df* (nulls filled with "", first hard column normalised)
        whose ``name_id`` was flagged, sorted by ``name_id``.
    """
    df = df.fill_null('')
    # NOTE(review): "seenior" looks like a typo, but these strings are only
    # compared character by character below, so the spelling affects the
    # distance between suffixes — confirm before changing it.
    suffix_map = {"sr": "seenior", "jr": "junior", "ii": "second", "iii": "third", "iv": "four", "v": "five", "vi": "six", "vii": "seven",}
    df = df.with_columns(pl.col(hard_cols[0]).cast(pl.Utf8).str.strip_chars().str.to_lowercase().replace(suffix_map))
    om_ids = []
    for id in hard_cols:
        # Distinct non-empty (name_id, value) pairs ...
        df1 = df.select('name_id', id)
        df1 = df1.filter(pl.col(id)!="").unique()
        # ... restricted to name_ids that have more than one distinct value.
        df1 = df1.with_columns(pl.col('name_id').count().over('name_id').alias('count'))
        df1 = df1.filter(pl.col('count')>1).sort('name_id')
        # For each value x: take the name_id of the first row holding x, then
        # the smallest number of mismatching positions between x and any
        # other value y of that name_id. zip() stops at the shorter string,
        # so extra trailing characters are not counted.
        df1 = (df1.with_columns(pl.col(id).map_elements(lambda x, s=df1: min(sum(a != b for a, b in zip(x, y)) for y in s.filter(pl.col("name_id") == s.filter(pl.col(id) == x)["name_id"][0])[id] if y != x), return_dtype=pl.Int64).alias("count")))
        # More than two differing positions counts as a real conflict.
        df1 = df1.filter(pl.col('count')>2).sort('name_id')
        ids = df1.select(pl.col("name_id").unique())
        ids = ids.to_series().to_list()
        om_ids = list(set(om_ids).union(ids))
    om_df = df.filter(pl.col('name_id').is_in(om_ids)).sort('name_id')
    return om_df
|
|
458
|
+
|
|
459
|
+
def initial_names(df, names, merge_cols):
    """Build the ``inames`` key from the first three letters of two name columns.

    The first three characters of ``names[0]`` and ``names[2]`` are taken,
    and for every ``dense_id`` the distinct prefixes are sorted and joined
    without a separator, so the key does not depend on which column a prefix
    came from.

    Fixes over the previous version:
      * the distinct prefixes are sorted inside the aggregation, as is done
        for the token key in ``name_id``; ``unique()`` alone does not promise
        any order, so equal names could get different keys;
      * the prefix columns are referenced by name instead of by position
        (``columns[-2:]``), which picked the wrong columns when ``FN3`` or
        ``LN3`` already existed in the input.

    Args:
        df: polars DataFrame with ``dense_id``, ``name_id``, the *names*
            columns and the *merge_cols*.
        names: Name columns; ``names[0]`` and ``names[2]`` are used.
        merge_cols: Extra columns to keep in the result.

    Returns:
        The distinct rows of ``dense_id``, ``name_id``, ``inames`` and
        *merge_cols*.
    """
    df = df.fill_null('')
    prefix_cols = ['FN3', 'LN3']
    df1 = df.with_columns(
        pl.col(names[0]).str.slice(0, 3).alias(prefix_cols[0]),
        pl.col(names[2]).str.slice(0, 3).alias(prefix_cols[1]),
    )
    df1 = df1.with_columns(
        pl.concat_str([pl.col(c) for c in prefix_cols], separator=" ", ignore_nulls=True).alias('inames')
    )
    df1 = df1.with_columns(pl.col('inames').str.split(' ')).explode('inames')
    df1 = df1.group_by('dense_id').agg(
        pl.col('inames').unique().sort().str.join('').alias('inames')
    )
    df = df.join(df1, on = 'dense_id', how = 'left')
    cols = ['dense_id'] + ['name_id'] + ['inames'] + merge_cols
    return df.select(cols).unique()
|
|
472
|
+
|
|
473
|
+
def _collapse_name_ids(df, col):
    """Merge name_ids that share a non-blank value of *col*, then drop *col*.

    Every ``name_id`` that shares a value with a smaller ``name_id`` is
    replaced by the smallest id of that value. When an id could be sent to
    several targets the smallest target is chosen, so the result does not
    depend on row order.
    """
    linked = (
        df.filter(pl.col(col).is_not_null() & (pl.col(col).str.strip_chars() != ''))
        .select('name_id', col)
        .unique()
        .with_columns(pl.col(col).count().over(col).alias('Count'))
        .filter(pl.col('Count') > 1)
    )
    targets = linked.group_by(col).agg(pl.col('name_id').min().alias('New_name_id'))
    remap = (
        linked.join(targets, on=col)
        .filter(pl.col('name_id') != pl.col('New_name_id'))
        .select('name_id', 'New_name_id')
        .sort('name_id', 'New_name_id')
        .unique('name_id', keep='first', maintain_order=True)
    )
    df = df.join(remap, on='name_id', how='left')
    df = df.with_columns(
        pl.when(pl.col('New_name_id').is_not_null())
        .then(pl.col('New_name_id'))
        .otherwise(pl.col('name_id'))
        .alias('name_id')
    )
    return df.drop(col, 'New_name_id')


def _attach_match_key(df, col, prefix_len=None, split_on=None):
    """Replace *col* by a per-dense_id key combining ``inames`` with *col*.

    Optionally only the first *prefix_len* characters of *col* are used, and
    optionally *col* is first split on *split_on* into several values. The
    key is the sorted, de-duplicated set of space-separated tokens of
    ``inames`` and *col*, joined without a separator. Rows whose *col* is
    blank get a null key.
    """
    keys = df
    if prefix_len is not None:
        keys = keys.with_columns(pl.col(col).str.slice(0, prefix_len).alias(col))
    keys = keys.select('dense_id', 'inames', col).unique()
    keys = keys.filter(pl.col(col).is_not_null() & (pl.col(col).str.strip_chars() != ''))
    if split_on is not None:
        keys = keys.with_columns(pl.col(col).str.split(split_on)).explode(col)
    keys = keys.with_columns(
        pl.concat_str([pl.col('inames'), pl.col(col)], separator=" ", ignore_nulls=True).alias(col)
    )
    keys = keys.with_columns(pl.col(col).str.split(' ')).explode(col)
    # Sort inside the aggregation: unique() alone does not promise an order.
    keys = keys.group_by('dense_id').agg(pl.col(col).unique().sort().str.join('').alias(col))
    return df.drop(col).join(keys, on='dense_id', how='left')


def merging_on_ssntin(df, cols):
    """Merge name_ids that share the same value in any of *cols*.

    Each column is processed in turn and dropped afterwards.

    Args:
        df: polars DataFrame with ``name_id`` and the *cols*.
        cols: Identifier columns to merge on.

    Returns:
        *df* with updated ``name_id`` and without *cols*.
    """
    for col in cols:
        df = df.fill_null('')
        df = _collapse_name_ids(df, col)
    return df


def merging_on_address(df, cols):
    """Merge name_ids sharing name initials and the first 10 characters of an address.

    Each column is processed in turn and dropped afterwards.

    Args:
        df: polars DataFrame with ``dense_id``, ``name_id``, ``inames`` and
            the *cols*.
        cols: Address columns to merge on.

    Returns:
        *df* with updated ``name_id`` and without *cols*.
    """
    for col in cols:
        df = df.fill_null('')
        df = _attach_match_key(df, col, prefix_len=10)
        df = _collapse_name_ids(df, col)
    return df


def merging_on_dob(df, cols):
    """Merge name_ids sharing name initials and the same value in a date column.

    Each column is processed in turn and dropped afterwards.

    Args:
        df: polars DataFrame with ``dense_id``, ``name_id``, ``inames`` and
            the *cols*.
        cols: Date columns to merge on.

    Returns:
        *df* with updated ``name_id`` and without *cols*.
    """
    for col in cols:
        df = df.fill_null('')
        df = _attach_match_key(df, col)
        df = _collapse_name_ids(df, col)
    return df


def merging_on_others(df, cols):
    """Merge name_ids sharing name initials and any ``;``-separated value of a column.

    Each column is processed in turn and dropped afterwards. Finally
    ``inames`` is dropped and one row per ``dense_id`` is kept.

    Args:
        df: polars DataFrame with ``dense_id``, ``name_id``, ``inames`` and
            the *cols*.
        cols: Columns holding ``;``-separated values to merge on.

    Returns:
        One row per ``dense_id`` with the updated ``name_id``.
    """
    for col in cols:
        df = df.fill_null('')
        df = _attach_match_key(df, col, split_on=';')
        df = _collapse_name_ids(df, col)
    df = df.drop('inames')
    df = df.unique('dense_id')
    return df
|
|
560
|
+
|
|
561
|
+
def name_checks(df1, names):
    """Flag ``name_id`` groups whose spellings stray from the canonical one.

    The distinct (``name_id``, name parts) rows are upper-cased and trimmed,
    and a normalised full name (``norm_name``) is built from them. For each
    ``name_id`` the canonical spelling is the ``norm_name`` group ranked
    first by frequency and then by length. Every row is scored against that
    canonical spelling with ``fuzz.token_sort_ratio``; a score below 75
    marks the ``name_id`` as ``NEEDS_MANUAL_INTERVENTION``.

    Args:
        df1: polars DataFrame with a ``name_id`` column and the columns in
            ``names``; all of them are processed with string operations.
        names: name-part column names. Indexes 0, 1 and 2 are read as the
            first, middle and last parts of the canonical name.

    Returns:
        ``df1`` left-joined with a ``comment`` column that holds
        ``NEEDS_MANUAL_INTERVENTION`` for flagged ids and null otherwise.
    """
    # NOTE(review): `name_id` is upper-cased and trimmed along with the name
    # parts, and that transformed value is the key joined back onto `df1`.
    work = df1.select(['name_id'] + names).unique()
    work = work.select(
        [pl.col(c).str.to_uppercase().str.strip_chars().alias(c) for c in work.columns]
    )
    work = work.fill_null("")

    norm_expr = (
        pl.concat_str(names, separator=" ")
        .str.to_uppercase()
        .str.replace_all(r"[^A-Z0-9 ]", "")
        .str.replace_all(r"\s+", " ")
        .str.strip_chars()
    )
    work = work.with_columns(norm_expr.alias("norm_name"))

    # Rank every spelling inside a name_id: most frequent first, then longest.
    spellings = work.group_by(["name_id", "norm_name"]).agg([
        pl.len().alias("freq"),
        pl.col("norm_name").str.len_chars().max().alias("len"),
        pl.first(names[0]).alias("canon_first"),
        pl.first(names[1]).alias("canon_middle"),
        pl.first(names[2]).alias("canon_last"),
    ])
    ranked = spellings.sort(
        by=["name_id", "freq", "len"],
        descending=[False, True, True],
    )
    canonical = ranked.group_by("name_id").first().select([
        "name_id",
        "canon_first",
        "canon_middle",
        "canon_last",
    ])

    work = work.join(canonical, on="name_id", how="left")

    def _similarity(row):
        # Compare the row's normalised name with the canonical full name.
        canon = f"{row['canon_first']} {row['canon_middle']} {row['canon_last']}".strip()
        return fuzz.token_sort_ratio(row["norm_name"], canon)

    work = work.with_columns([
        pl.struct(["norm_name", "canon_first", "canon_middle", "canon_last"])
        .map_elements(_similarity)
        .alias("name_similarity")
    ])

    verdict = (
        pl.when(pl.col("name_similarity") >= 75)
        .then(pl.lit("AUTO_STANDARDIZED"))
        .otherwise(pl.lit("NEEDS_MANUAL_INTERVENTION"))
    )
    work = work.with_columns(verdict.alias("comment"))

    flagged = (
        work.filter(pl.col('comment') == 'NEEDS_MANUAL_INTERVENTION')
        .select('name_id', 'comment')
        .unique()
    )
    return df1.join(flagged, on='name_id', how='left')
|
|
617
|
+
|
|
618
|
+
def name_final(df, unique_id, names):
    """Choose one representative set of name parts per ``unique_id``.

    The distinct (``unique_id``, name parts) rows are upper-cased and
    trimmed, the text ``UNKNOWN`` is removed from ``names[0]`` and
    ``names[2]``, and a normalised full name is built. Candidates are
    grouped by (``unique_id``, normalised name) and ranked by the number of
    non-empty name parts, then by frequency, then by normalised-name length,
    all descending. The top candidate of each ``unique_id`` is kept.

    Args:
        df: polars DataFrame holding ``unique_id`` and the ``names`` columns;
            every selected column is processed with string operations.
        unique_id: name of the identifier column.
        names: name-part column names; indexes 0, 1 and 2 are read.

    Returns:
        A frame with ``unique_id``, ``names[0]``, ``names[1]`` and
        ``names[2]``, one row per ``unique_id``.
    """
    cleaned = df.select([unique_id] + names).unique()
    cleaned = cleaned.select(
        [pl.col(c).str.to_uppercase().str.strip_chars().alias(c) for c in cleaned.columns]
    )
    # Strip the placeholder text from the first and third name parts only.
    for part in (names[0], names[2]):
        cleaned = cleaned.with_columns(pl.col(part).str.replace_all(r"UNKNOWN", ""))
    cleaned = cleaned.fill_null("")

    norm_expr = (
        pl.concat_str(names, separator=" ")
        .str.to_uppercase()
        .str.replace_all(r"[^A-Z0-9 ]", "")
        .str.replace_all(r"\s+", " ")
        .str.strip_chars()
    )
    cleaned = cleaned.with_columns(norm_expr.alias("norm_name"))

    candidates = cleaned.group_by([unique_id, "norm_name"]).agg([
        pl.len().alias("freq"),
        pl.col("norm_name").str.len_chars().max().alias("len"),
        pl.first(names[0]).alias(names[0]),
        pl.first(names[1]).alias(names[1]),
        pl.first(names[2]).alias(names[2]),
    ])

    # countA = how many of the name parts are populated on the candidate.
    populated = [
        pl.when(pl.col(c).is_not_null() & (pl.col(c) != "")).then(1).otherwise(0)
        for c in names
    ]
    candidates = candidates.with_columns(pl.sum_horizontal(populated).alias("countA"))

    ranked = candidates.sort(
        ["countA", "freq", "len"], descending=[True, True, True]
    )
    ranked = ranked.select([unique_id, names[0], names[1], names[2]])
    return ranked.group_by(unique_id).head(1)
|
|
651
|
+
|
|
652
|
+
def address_final(df, unique_id, addresses):
    """Choose one representative address row per ``unique_id``.

    Each row gets a ``|``-joined signature of ``unique_id`` plus the address
    columns (nulls treated as empty). Rows are ranked by how often the
    signature occurs, then by the number of non-empty address columns, then
    by signature length, all descending. The top row of each ``unique_id``
    is kept.

    Args:
        df: polars DataFrame holding ``unique_id`` and the ``addresses``
            columns.
        unique_id: name of the identifier column.
        addresses: list of address column names.

    Returns:
        A frame with ``unique_id`` and the ``addresses`` columns, one row
        per ``unique_id``.
    """
    signature = pl.concat_str(
        [pl.col(c).fill_null("") for c in [unique_id] + addresses],
        separator="|",
    )
    populated = [
        pl.when(pl.col(c).is_not_null() & (pl.col(c) != "")).then(1).otherwise(0)
        for c in addresses
    ]

    scored = df.with_columns(signature.alias("addr_concat"))
    scored = scored.with_columns(pl.sum_horizontal(populated).alias("countA"))
    scored = scored.with_columns(
        pl.col("addr_concat").str.len_chars().alias("addr_len")
    )
    # countif = number of rows sharing exactly the same signature.
    scored = scored.with_columns(
        pl.col("addr_concat").count().over("addr_concat").alias("countif")
    )

    ranked = scored.sort(
        ["countif", "countA", "addr_len"], descending=[True, True, True]
    )
    best = ranked.group_by(unique_id).head(1)
    return best.select([unique_id] + addresses)
|
|
664
|
+
|
|
665
|
+
def final_cel(df, unique_id, summary, names, addresses):
    """Consolidate ``df`` to one row per ``unique_id`` and write a summary CSV.

    Name columns come from ``name_final`` and address columns from
    ``address_final``. Every other column is rebuilt per ``unique_id`` as
    the sorted, distinct, ``;``-joined set of its non-empty ``;``-separated
    values. Token counts of the ``summary`` column before (``TOTAL COUNT``)
    and after (``UNIQUE COUNT``) consolidation are written to
    ``summary_<MMDD_HHMM>.csv`` in the current working directory.

    Args:
        df: polars DataFrame to consolidate.
        unique_id: name of the identifier column.
        summary: name of the ``;``-separated column to summarise.
        names: name-part columns, passed to ``name_final``.
        addresses: address columns, passed to ``address_final``.

    Returns:
        The consolidated frame with the original column order, sorted by
        ``unique_id``.
    """

    def _token_counts(frame, label):
        # Explode the `;`-separated summary values and count each token.
        tokens = frame.select(summary)
        tokens = tokens.with_columns(pl.col(summary).str.split(';')).explode(summary)
        return tokens.with_columns(
            pl.col(summary).count().over(summary).alias(label)
        ).unique()

    raw_summary = _token_counts(df, 'TOTAL COUNT')
    final_names = name_final(df, unique_id, names)
    final_addresses = address_final(df, unique_id, addresses)

    final_cols = df.columns
    filled = df.fill_null("")
    merged = df.select(unique_id).unique()

    excluded = names + addresses + [unique_id]
    other_cols = [c for c in filled.columns if c not in excluded]

    # NOTE(review): the loop extent is reconstructed; only the per-column
    # aggregation and its join are repeated for each remaining column.
    for col in other_cols:
        values = filled.select([unique_id, col]).filter(pl.col(col) != '')
        values = values.with_columns(pl.col(col).str.split(';')).explode(col)
        values = values.unique()
        values = values.group_by(unique_id).agg(
            pl.col(col).unique().sort().str.join(';').alias(col)
        )
        merged = merged.join(values, on=unique_id, how='left')

    merged = merged.join(final_names, on=unique_id, how='left')
    merged = merged.join(final_addresses, on=unique_id, how='left')
    merged = merged.select(final_cols).sort(unique_id)

    cel_summary = _token_counts(merged, 'UNIQUE COUNT')
    final_summary = raw_summary.join(cel_summary, on=summary, how='left')

    now = datetime.now().strftime("%m%d_%H%M")
    final_summary.write_csv(f'summary_{now}.csv')
    return merged
|
|
694
|
+
|
|
695
|
+
def y_columns(df, summary, y_cols):
    """Add one ``"Y"``/``""`` indicator column per value in ``y_cols``.

    The ``summary`` column is split on ``;`` (nulls treated as an empty
    string). For each value ``v`` in ``y_cols`` a column named ``v`` is
    added, holding ``"Y"`` when ``v`` is one of the split tokens and ``""``
    otherwise.

    Args:
        df: polars DataFrame containing the ``summary`` column.
        summary: name of the ``;``-separated string column.
        y_cols: token values to turn into indicator columns.

    Returns:
        ``df`` with the indicator columns added; the temporary
        ``split_vals`` column is removed again.
    """
    split_expr = pl.col(summary).fill_null("").str.split(";").alias("split_vals")
    indicator_exprs = [
        pl.when(pl.col("split_vals").list.contains(v))
        .then(pl.lit("Y"))
        .otherwise(pl.lit(""))
        .alias(v)
        for v in y_cols
    ]

    with_tokens = df.with_columns(split_expr)
    flagged = with_tokens.with_columns(indicator_exprs)
    return flagged.drop("split_vals")
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: r3_test
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Just for test
|
|
5
|
+
Author: Ranjeet Aloriya
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.12
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: numpy
|
|
14
|
+
Requires-Dist: pandas
|
|
15
|
+
Requires-Dist: polars
|
|
16
|
+
Requires-Dist: pyarrow
|
|
17
|
+
Requires-Dist: sqlalchemy
|
|
18
|
+
Requires-Dist: networkx
|
|
19
|
+
Requires-Dist: pyodbc
|
|
20
|
+
Requires-Dist: fastexcel
|
|
21
|
+
Requires-Dist: rapidfuzz
|
|
22
|
+
Requires-Dist: tqdm
|
|
23
|
+
Requires-Dist: openpyxl
|
|
24
|
+
Requires-Dist: xlrd
|
|
25
|
+
Requires-Dist: xlsxwriter
|
|
26
|
+
Dynamic: license-file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
r3_test
|
r3_test-0.0.1/setup.cfg
ADDED