PyPI - r3-test - Versions diffs - 0.0.1__tar.gz - Mend

r3-test 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

r3_test-0.0.1/LICENSE +0 -0
r3_test-0.0.1/PKG-INFO +26 -0
r3_test-0.0.1/README.md +0 -0
r3_test-0.0.1/pyproject.toml +36 -0
r3_test-0.0.1/r3_test/__init__.py +33 -0
r3_test-0.0.1/r3_test/main.py +712 -0
r3_test-0.0.1/r3_test.egg-info/PKG-INFO +26 -0
r3_test-0.0.1/r3_test.egg-info/SOURCES.txt +10 -0
r3_test-0.0.1/r3_test.egg-info/dependency_links.txt +1 -0
r3_test-0.0.1/r3_test.egg-info/requires.txt +13 -0
r3_test-0.0.1/r3_test.egg-info/top_level.txt +1 -0
r3_test-0.0.1/setup.cfg +4 -0

r3_test-0.0.1/LICENSE ADDED Viewed

File without changes

r3_test-0.0.1/PKG-INFO ADDED Viewed

@@ -0,0 +1,26 @@
+Metadata-Version: 2.4
+Name: r3_test
+Version: 0.0.1
+Summary: Just for test
+Author: Ranjeet Aloriya
+License: MIT
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.12
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy
+Requires-Dist: pandas
+Requires-Dist: polars
+Requires-Dist: pyarrow
+Requires-Dist: sqlalchemy
+Requires-Dist: networkx
+Requires-Dist: pyodbc
+Requires-Dist: fastexcel
+Requires-Dist: rapidfuzz
+Requires-Dist: tqdm
+Requires-Dist: openpyxl
+Requires-Dist: xlrd
+Requires-Dist: xlsxwriter
+Dynamic: license-file

r3_test-0.0.1/README.md ADDED Viewed

File without changes

r3_test-0.0.1/pyproject.toml ADDED Viewed

@@ -0,0 +1,36 @@
+[project]
+name = "r3_test"
+version = "0.0.1"
+description = "Just for test"
+readme = "README.md"
+requires-python = ">=3.12"
+license = {text = "MIT"}
+authors = [
+  { name="Ranjeet Aloriya" }
+]
+dependencies = [
+    "numpy",
+    "pandas",
+    "polars",
+    "pyarrow",
+    "sqlalchemy",
+    "networkx",
+    "pyodbc",
+    "fastexcel",
+    "rapidfuzz",
+    "tqdm",
+    "openpyxl",
+    "xlrd",
+    "xlsxwriter"
+]
+classifiers = [
+  "Programming Language :: Python :: 3",
+  "License :: OSI Approved :: MIT License",
+  "Operating System :: OS Independent",
+]
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"

r3_test-0.0.1/r3_test/__init__.py ADDED Viewed

@@ -0,0 +1,33 @@
+from .main import (
+    help,
+    csvtsv_to_excel,
+    copy_files,
+    move_files,
+    copy_files_without_ext,
+    df_info,
+    get_segment,
+    arrange_segment,
+    split_columns,
+    split_full_name,
+    ra_replace_chars,
+    excel_compile_without_header,
+    csv_compile,
+    parquet_compile,
+    batch_processing,
+    table_from_sql,
+    table_to_sql,
+    dense_id,
+    name_id,
+    dob_normalize,
+    demerge,
+    initial_names,
+    merging_on_ssntin,
+    merging_on_address,
+    merging_on_dob,
+    merging_on_others,
+    name_checks,
+    name_final,
+    address_final,
+    final_cel,
+    y_columns
+)

r3_test-0.0.1/r3_test/main.py ADDED Viewed

@@ -0,0 +1,712 @@
+import re, os, sys, csv, shutil, networkx, warnings, pyodbc, urllib,sqlalchemy, pyarrow, fastexcel
+import numpy as np
+import pandas as pd
+import polars as pl
+from time import time
+from tqdm import tqdm
+from io import StringIO
+from rapidfuzz import fuzz
+import multiprocessing as mp
+from openpyxl import workbook
+from types import CoroutineType
+from datetime import timedelta, datetime
+from sqlalchemy import create_engine, event
+def help():
+    print(f"""Hello {os.getlogin().title()}!\U0001F60A,
+Thank you for choosing the Notification_List package. We sincerely appreciate your support.
+Should you require any assistance or have any questions, please do not hesitate to reach out to -
+Ranjeet Aloriya at +91 940.660.6239 or ranjeet.aloriya@gmail.com.
+We are here to help!
+Cheers!
+Ranjeet Aloriya""")
+def csvtsv_to_excel(folder_path):
+    files = os.listdir(folder_path)
+    i = 0
+    for file in files:
+        filename = os.path.join(folder_path, file)
+        if os.path.isfile(filename):
+            if file.endswith('.csv'):
+                df = pd.read_csv(filename, dtype=str, encoding='latin')
+            elif file.endswith('.tsv'):
+                df = pd.read_csv(filename, dtype=str, delimiter='\t', encoding='latin')
+            else:
+                continue  # skip non-csv/tsv files
+            i += 1
+            output_file = os.path.splitext(filename)[0] + ".xlsx"
+            df.to_excel(output_file, index=False)
+            sys.stdout.write(f"\rFile No. {i} - {file} Processing")
+            sys.stdout.flush()
+    print(f"\nConversion completed. {i} files processed.")
+def copy_files(file):
+    df = pl.read_csv(file)
+    i = 0
+    for row in df.iter_rows():
+        source_folder = row[1]
+        file_name = row[0]
+        destination_folder = row[2]
+        source_path = os.path.join(source_folder, file_name)
+        destination_path = os.path.join(destination_folder, file_name)
+        os.makedirs(destination_folder, exist_ok=True)
+        try:
+            shutil.copy2(source_path, destination_path)
+        except:
+           pass
+        i +=1
+        sys.stdout.write(f"\rFiles Copied - {i}/{df.height}         ")
+        sys.stdout.flush()
+def move_files(file):
+    df = pl.read_csv(file)
+    i = 0
+    for row in df.iter_rows():
+        source_folder = row[1]
+        file_name = row[0]
+        destination_folder = row[2]
+        source_path = os.path.join(source_folder, file_name)
+        destination_path = os.path.join(destination_folder, file_name)
+        os.makedirs(destination_folder, exist_ok=True)
+        try:
+            shutil.move(source_path, destination_path)
+        except:
+           pass
+        i +=1
+        sys.stdout.write(f"\rFiles Moved - {i}/{df.height}         ")
+        sys.stdout.flush()
+def copy_files_without_ext(csv_file):
+    df = pl.read_csv(csv_file)
+    not_found = []
+    total = len(df)
+    copied_count = 0
+    for i, row in enumerate(df.iter_rows(), start=1):
+        file_name, source_folder, destination_folder = map(str, row)
+        found = False
+        for root, dirs, files in os.walk(source_folder):
+            for f in files:
+                name, ext = os.path.splitext(f)
+                if name.lower() == file_name.lower():
+                    os.makedirs(destination_folder, exist_ok=True)
+                    shutil.copy2(os.path.join(root, f), os.path.join(destination_folder, f))
+                    copied_count += 1
+                    found = True
+                    break
+            if found:
+                break
+        if not found:
+            not_found.append([file_name])
+        sys.stdout.write(f"\rProgress: {i}/{total} processed, {copied_count} copied")
+        sys.stdout.flush()
+    if not_found:
+        ts = datetime.now().strftime("%m%d%y%H%M%S")
+        nf_file = f"Not_Found_Copying_{ts}.csv"
+        pl.DataFrame(not_found, schema=["FileName"]).write_csv(nf_file)
+        print(f"\nSummary: {copied_count}/{total} copied, {len(not_found)} not found (saved in {nf_file})")
+    else:
+        print(f"\nSummary: All {total} files copied successfully ✅")
+def df_info(file):
+    df = pl.read_csv(file)
+    data = []
+    for col in df.columns:
+        dtype = df.schema[col]
+        non_null_count = len(df[col].drop_nulls())
+        unique_count = df[col].n_unique()
+        data.append({
+            "Column Name": col,
+            "Data Type": str(dtype),
+            "Non-Null Count": non_null_count,
+            "Unique Count": unique_count
+        })
+    df = pl.DataFrame(data)
+def get_segment(f, sep = '~'):
+    with open(f, 'r', encoding='ascii', errors='ignore') as file:
+        data = file.read()
+        data = data.replace("\n\n", "")
+        data = data.replace("\n", "")
+        segments = data.split(sep)
+        df = pl.DataFrame({'Segment': segments})
+        df = df.filter(pl.col("Segment").str.contains("*", literal=True))
+    return df
+def arrange_segment(df, column_name = "Segment", sep = "*"):
+    rows = []
+    current = {}
+    for value in df[column_name]:
+        prefix, data = value.split(sep, 1)
+        if prefix in current:
+            rows.append(current)
+            current = {}
+        current[prefix] = data
+    if current:
+        rows.append(current)
+    df = pl.DataFrame(rows).fill_null("")
+    return df
+def split_columns(df, sep='\\*'):
+    df = df.to_pandas()
+    for column in df.columns:
+        df[column] = df[column].fillna('')
+        max_splits = df[column].str.count(sep).max() + 1
+        max_splits = int(max_splits)
+        new_columns = df[column].str.split(sep, expand=True)
+        new_column_names = [f"{column}_{i+1}" for i in range(max_splits)]
+        new_columns.columns = new_column_names
+        df = df.drop(column, axis=1).join(new_columns)
+    df = pl.from_pandas(df)
+    return df
+def split_full_name(df, full_name, suffixes):
+    suffixes = suffixes
+    def clean_and_split(text):
+        return text.replace(",", "").split()
+    def extract_suffix(words):
+        for i, word in enumerate(words):
+            if word.upper() in suffixes:
+                return word.upper(), words[:i] + words[i+1:]
+        return "", words
+    def parse_name(part1, part2):
+        first = middle = last = suffix = ""
+        if part2:
+            last_words = clean_and_split(part1)
+            suffix, last_words = extract_suffix(last_words)
+            last = " ".join(last_words)
+            name_words = clean_and_split(part2)
+            sfx2, name_words = extract_suffix(name_words)
+            suffix = suffix or sfx2
+            if name_words:
+                first = name_words[0]
+                if len(name_words) > 1:
+                    middle = " ".join(name_words[1:])
+        else:
+            words = clean_and_split(part1)
+            suffix, words = extract_suffix(words)
+            if len(words) == 1:
+                first = words[0]
+            elif len(words) == 2:
+                first, last = words
+            elif len(words) > 2:
+                first = words[0]
+                last = words[-1]
+                middle = " ".join(words[1:-1])
+        return [first, middle, last, suffix]
+    return (
+        df
+        .with_columns(pl.col(full_name).str.split_exact(",", 1).alias("_split"))
+        .with_columns([
+            pl.col("_split").struct.field("field_0").str.strip_chars().alias("_part1"),
+            pl.col("_split").struct.field("field_1").str.strip_chars().fill_null("").alias("_part2")
+        ])
+        .with_columns(pl.struct(["_part1", "_part2"]).map_elements(
+            lambda row: parse_name(row["_part1"], row["_part2"]),
+            return_dtype=pl.List(pl.Utf8)
+        ).alias("_parsed"))
+        .with_columns([
+            pl.col("_parsed").list.get(0).alias("split_first_name"),
+            pl.col("_parsed").list.get(1).alias("split_middle_name"),
+            pl.col("_parsed").list.get(2).alias("split_last_name"),
+            pl.col("_parsed").list.get(3).alias("split_suffix"),
+        ])
+        .drop(["_split", "_part1", "_part2", "_parsed"])
+    )
+def ra_replace_chars(df, column, cleaning_dict):
+    col_expr = pl.col(column)
+    for pattern, replacement in cleaning_dict.items():
+        col_expr = col_expr.str.replace_all(pattern, replacement)
+    return df.with_columns(col_expr.str.strip_chars().alias(column))
+def excel_compile_without_header(path, f):
+    my_df = pl.DataFrame()
+    file = os.path.join(path, f)
+    sheets = pl.read_excel(file, has_header=False, sheet_id=0, raise_if_empty=False, infer_schema_length=0)
+    for sheet in sheets.keys():
+        df = pl.read_excel(file, has_header=False, sheet_name = sheet, raise_if_empty=False, infer_schema_length=0)
+        df = df.with_columns(pl.lit(f).alias('FileName'))
+        df = df.with_columns(pl.lit(sheet).alias('SheetName'))
+        df = df.select(['FileName', 'SheetName']+[col for col in df.columns if col not in ['FileName', 'SheetName']])
+        my_df = pl.concat([my_df, df], how='diagonal')
+    return my_df
+def csv_compile(path, f):
+    my_df = pl.DataFrame()
+    file = os.path.join(path, f)
+    df = pl.read_csv(file, raise_if_empty=False, infer_schema_length=0)
+    df = df.with_columns(pl.lit(f).alias('FileName'))
+    my_df = pl.concat([my_df, df], how='diagonal')
+    return my_df
+def parquet_compile(path, f):
+    my_df = pl.DataFrame()
+    file = os.path.join(path, f)
+    df = pl.read_parquet(file)
+    df = df.with_columns(pl.lit(f).alias('FileName'))
+    my_df = pl.concat([my_df, df], how='diagonal')
+    return my_df
+def batch_processing(path, processing_function, b):
+    files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
+    j = 0
+    for i in range(0, len(files), b):
+        batch = files[i:i+b]
+        my_df = pl.DataFrame()
+        j += 1
+        k = 0
+        for f in batch:
+            try:
+                df = processing_function(path, f)
+                my_df = pl.concat([my_df, df], how='diagonal')
+                k += 1
+                sys.stdout.write(f"\rFile No. {k} - Processed of Batch No. {j}          ")
+                sys.stdout.flush()
+            except Exception as e:
+                sys.stdout.write(f"\r⚠️ Skipping file due to error: {f}{e}          ")
+                sys.stdout.flush()
+                continue
+        sys.stdout.write(f"\rBatch No. {j} - Processed                                                 ")
+        sys.stdout.flush()
+        batch_number = f"{j:03d}"
+        func_name = processing_function.__name__
+        output_folder = os.path.join(path, 'output1')
+        os.makedirs(output_folder, exist_ok=True)
+        output_file_path = os.path.join(output_folder, f'{func_name}_Batch_{batch_number}.parquet')
+        my_df.write_parquet(output_file_path)
+    sys.stdout.write(f"\rAll Batches are Processed                                                 ")
+    sys.stdout.flush()
+def table_from_sql(server, database, table):
+    my_df = pl.DataFrame()
+    connection = pyodbc.connect(
+        f'DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={server};DATABASE={database};Trusted_Connection=yes;'
+    )
+    sys.stdout.write(f"\rConnection successful!      ")
+    sys.stdout.flush()
+    query = f"select * from {table}"
+    for df in pl.read_database(query, connection=connection, iter_batches=True, batch_size=10000, infer_schema_length=0):
+        my_df = pl.concat([my_df, df], how = 'diagonal')
+        sys.stdout.write(f"\rDownloading Raws: {my_df.shape[0]}      ")
+        sys.stdout.flush()
+    sys.stdout.write(f"\rSaved as polars DataFrame - Total Raws: {my_df.shape[0]}      ")
+    sys.stdout.flush()
+    return my_df
+def table_to_sql(server, database, table, df):
+    params = urllib.parse.quote_plus(
+        f"DRIVER={{ODBC Driver 17 for SQL Server}};"
+        f"SERVER={server};"
+        f"DATABASE={database};"
+        "Trusted_Connection=yes;"
+    )
+    engine = create_engine(f"mssql+pyodbc:///?odbc_connect={params}")
+    df = df.to_pandas()
+    df.to_sql(table, con=engine, index=False, if_exists="replace", schema="dbo")
+    sys.stdout.write(f"\rSaved {table} in {database} - Total Raws: {len(df)}      ")
+    sys.stdout.flush()
+def dense_id(df, cols):
+    df = df.fill_null('')
+    df = df.with_columns(pl.concat_str([pl.col(c) for c in cols], separator="", ignore_nulls=True).alias('AllData'))
+    df = df.with_columns(pl.col("AllData").rank(method="dense").alias("dense_id"))
+    df = df.drop('AllData')
+    df = df.with_columns([pl.col(c).cast(pl.Utf8) for c in df.columns])
+    print(f"Max Records for CEL V1 - {len(df.select('dense_id').unique())}")
+    return df
+def name_id(df, names):
+    df = df.with_columns(
+        (
+            (pl.col(names[0]).fill_null("").str.strip_chars().str.to_lowercase() + " " +
+            pl.col(names[1]).fill_null("").str.strip_chars().str.to_lowercase() + " " +
+            pl.col(names[2]).fill_null("").str.strip_chars().str.to_lowercase())
+            .str.replace_all(r"\s+", " ")
+            .str.strip_chars()
+            .alias("full_name_norm")
+        )
+    )
+    df = df.with_columns(
+        pl.col("full_name_norm").str.split(" ").alias("tokens")
+    )
+    unique_entities = []
+    entity_ids = []
+    for tokens in df["tokens"]:
+        token_set = set(tokens)
+        found = False
+        for idx, u_tokens in enumerate(unique_entities):
+            if token_set <= u_tokens or u_tokens <= token_set:
+                entity_ids.append(idx)
+                unique_entities[idx] |= token_set
+                found = True
+                break
+        if not found:
+            unique_entities.append(token_set)
+            entity_ids.append(len(unique_entities)-1)
+    df = df.with_columns(pl.Series("entity_id", entity_ids))
+    df = df.with_columns(pl.concat_str([pl.col(c) for c in names], separator="", ignore_nulls=True).alias('FullName'))
+    df = df.with_columns(pl.col("FullName").str.replace_all(r"\s+", ""))
+    name_to_entity = df.group_by("FullName").agg(
+        pl.first("entity_id").alias("entity_id1")
+    )
+    df = df.join(name_to_entity, on="FullName", how="left")
+    df = df.with_columns(pl.concat_str([pl.col(c) for c in names], separator=" ", ignore_nulls=True).alias('FullName'))
+    df = df.with_columns(
+        pl.col("FullName")
+        .str.extract_all(r"[^a-zA-Z ]")
+        .alias("sp_chars")
+    )
+    sp_chars = set(char for sublist in df["sp_chars"].to_list() for char in sublist)
+    print(f"Special chars in name: {sp_chars}")
+    pattern = "[" + re.escape("".join(sp_chars)) + "]"
+    df = df.with_columns(pl.col("FullName").str.replace_all(pattern, " ").alias("FullName"))
+    df = df.with_columns(pl.arange(0, df.height, 1).alias("name_index"))
+    df3 = df.select(['name_index', 'FullName'])
+    df3 = df3.with_columns(pl.col('FullName').str.split(' ')).explode('FullName')
+    df3 = df3.unique()
+    df3 = df3.group_by('name_index').agg(
+        pl.col('FullName').unique().sort().str.join('').alias('FullName1')
+    )
+    df = df.join(df3, on='name_index', how='left')
+    name_to_entity = df.group_by("FullName1").agg(
+        pl.first("entity_id1").alias("name_id")
+    )
+    df = df.join(name_to_entity, on="FullName1", how="left")
+    df = df.drop(['full_name_norm', 'tokens', 'entity_id', 'FullName', 'entity_id1', 'sp_chars', 'name_index', 'FullName1'])
+    unknown_expr = (
+        pl.concat_str([pl.col(c) for c in names], separator=" ", ignore_nulls=True)
+        .str.to_lowercase()
+        .str.contains("unknown")
+    )
+    max_id = df.select(pl.col("name_id").max()).item()
+    df = df.with_columns(
+        pl.when(unknown_expr)
+        .then(pl.arange(max_id + 1, max_id + 1 + df.height))
+        .otherwise(pl.col("name_id"))
+        .alias("name_id")
+    )
+    full_name = [names[0], names[2]]
+    df1 = df.with_columns(pl.concat_str([pl.col(c) for c in full_name], separator=" ", ignore_nulls=True).alias('FullName'))
+    df2 = df1.with_columns(pl.col('FullName').alias('Reverse_Name'))
+    df2 = df2.select('FullName', 'Reverse_Name').unique()
+    df2 = df2.with_columns(pl.col('Reverse_Name').str.split(' ')).explode('Reverse_Name')
+    df2 = df2.sort('Reverse_Name')
+    df2 = df2.group_by('FullName').agg(pl.col('Reverse_Name').unique().str.join('').alias('Reverse_Name'))
+    df1 = df1.join(df2, on ='FullName', how = 'left')
+    df2 = df1.select('name_id', 'Reverse_Name').unique()
+    df2 = df2.with_columns(pl.col('Reverse_Name').count().over('Reverse_Name').alias('Count'))
+    df2 = df2.filter(pl.col('Count')>1).sort('Reverse_Name')
+    df2 = df2.filter(pl.col('Reverse_Name')!='unknown')
+    df2 = df2.join(df2.group_by("Reverse_Name").agg(pl.col("name_id").min().alias("New_name_id")),on = 'Reverse_Name')
+    df2 = df2.with_columns((pl.col("name_id") == pl.col("New_name_id")).alias("match"))
+    df2 = df2.filter(pl.col('match')==False).select('name_id', 'New_name_id').unique('name_id')
+    df1 = df1.join(df2, on = 'name_id', how = 'left')
+    df1 = df1.with_columns(pl.when(pl.col('New_name_id').is_not_null()).then(pl.col('New_name_id')).otherwise(pl.col('name_id')).alias('name_id'))
+    df = df1.drop('FullName', 'Reverse_Name', 'New_name_id')
+    return df
+def dob_normalize(df, dates):
+    """Replace each date column in *dates* with an order-insensitive date key.
+
+    For every column: the value is cast to string, stripped, ``-``/``.``/space
+    are turned into ``/``, and it is parsed as ``%m/%d/%Y`` or, failing that,
+    ``%m/%d/%y`` (unparseable values become null).  The parsed date is
+    formatted as ``MM/DD/YYYY``, its three parts are sorted as strings and
+    joined without a separator, and that key overwrites the column.
+    """
+    for DOB in dates:
+        formats = ["%m/%d/%Y", "%m/%d/%y"]
+        # coalesce keeps the first format that parses successfully.
+        df = df.with_columns(pl.coalesce([pl.col(DOB).cast(pl.Utf8).str.strip_chars()
+        .str.replace_all(r"[-. ]", "/")
+                .str.to_date(fmt, strict=False)
+                for fmt in formats
+            ])
+            .dt.strftime("%m/%d/%Y")
+            .alias(DOB)
+        )
+        # Cleaned_DOB is a temporary join key carrying the formatted date.
+        df = df.with_columns(pl.col(DOB).alias('Cleaned_DOB'))
+        df1 = df.select('Cleaned_DOB', DOB).unique().filter(pl.col(DOB).is_not_null())
+        df1 = df1.with_columns(pl.col(DOB).str.split('/')).explode(DOB)
+        df1 = df1.group_by('Cleaned_DOB').agg(pl.col(DOB).sort().str.join('').alias(DOB))
+        # Rows with a null date find no match in the left join and stay null.
+        df = df.drop(DOB).join(df1, on = 'Cleaned_DOB', how = 'left').drop('Cleaned_DOB')
+    return df
+def demerge(df, hard_cols):
+    """Return the rows of name_ids whose values in *hard_cols* look too different.
+
+    For every column in *hard_cols*, name_ids having more than one distinct
+    non-empty value are inspected.  Each value is compared position by
+    position with the other values of a name_id and the smallest number of
+    differing characters is kept.  A name_id is flagged when that number is
+    greater than 2 for at least one of its values.  The result contains all
+    rows of *df* (nulls filled with ``""``) for flagged name_ids, sorted by
+    ``name_id``.
+    """
+    df = df.fill_null('')
+    # Whole-value replacement applied to the first hard column only
+    # (presumably a name-suffix column - confirm with callers).
+    suffix_map = {"sr": "seenior", "jr": "junior", "ii": "second", "iii": "third", "iv": "four", "v": "five", "vi": "six", "vii": "seven",}
+    df = df.with_columns(pl.col(hard_cols[0]).cast(pl.Utf8).str.strip_chars().str.to_lowercase().replace(suffix_map))
+    om_ids = []
+    for id in hard_cols:
+        df1 = df.select('name_id', id)
+        df1 = df1.filter(pl.col(id)!="").unique()
+        # Keep only name_ids that carry more than one distinct value.
+        df1 = df1.with_columns(pl.col('name_id').count().over('name_id').alias('count'))
+        df1 = df1.filter(pl.col('count')>1).sort('name_id')
+        # For each value x: look up the name_id of the first row holding x, then
+        # take the minimum count of differing characters against that name_id's
+        # other values (zip stops at the shorter string).
+        df1 = (df1.with_columns(pl.col(id).map_elements(lambda x, s=df1: min(sum(a != b for a, b in zip(x, y)) for y in s.filter(pl.col("name_id") == s.filter(pl.col(id) == x)["name_id"][0])[id] if y != x), return_dtype=pl.Int64).alias("count")))
+        df1 = df1.filter(pl.col('count')>2).sort('name_id')
+        ids = df1.select(pl.col("name_id").unique())
+        ids = ids.to_series().to_list()
+        om_ids = list(set(om_ids).union(ids))
+    om_df = df.filter(pl.col('name_id').is_in(om_ids)).sort('name_id')
+    return om_df
+def initial_names(df, names, merge_cols):
+    """Build an ``inames`` key per ``dense_id`` from name prefixes.
+
+    The key is made from the first three characters of ``names[0]`` and of
+    ``names[2]``: both prefixes are split on spaces and the unique tokens of
+    each ``dense_id`` are joined without a separator.  Returns the unique rows
+    of ``dense_id``, ``name_id``, ``inames`` and *merge_cols*.
+    """
+    df = df.fill_null('')
+    df1 = df.with_columns(pl.col(names[0]).str.slice(0, 3).alias("FN3"))
+    df1 = df1.with_columns(pl.col(names[2]).str.slice(0, 3).alias("LN3"))
+    # Picks the last two columns, expected to be the FN3 and LN3 just added.
+    inames = df1.columns[-2:]
+    df1 = df1.with_columns(pl.concat_str([pl.col(c) for c in inames], separator=" ", ignore_nulls=True).alias('inames'))
+    df1 = df1.with_columns(pl.col('inames').str.split(' ')).explode('inames')
+    df1 = df1.sort('inames')
+    df1 = df1.group_by('dense_id').agg(pl.col('inames').unique().str.join('').alias('inames'))
+    df = df.join(df1, on = 'dense_id', how = 'left')
+    cols = ['dense_id'] + ['name_id'] + ['inames'] + merge_cols
+    df = df.select(cols).unique()
+    return df
+def merging_on_ssntin(df, cols):
+    """Merge name_ids that share a value in any of the identifier columns *cols*.
+
+    For each column, every name_id in a group of name_ids sharing the same
+    non-empty value is remapped to the smallest name_id of that group.  The
+    column is dropped from the frame once it has been processed.
+    """
+    for col in cols:
+        df = df.fill_null('')
+        df1 = df.filter(pl.col(col).is_not_null() &(pl.col(col).str.strip_chars()!=''))
+        # Distinct (name_id, value) pairs; Count > 1 means the value is shared.
+        df2 = df1.select('name_id', col).unique()
+        df2 = df2.with_columns(pl.col(col).count().over(col).alias('Count'))
+        df2 = df2.filter(pl.col('Count')>1).sort(col)
+        df2 = df2.join(df2.group_by(col).agg(pl.col("name_id").min().alias("New_name_id")),on = col)
+        df2 = df2.with_columns((pl.col("name_id") == pl.col("New_name_id")).alias("match"))
+        # Keep one remapping per name_id that actually changes.
+        df2 = df2.filter(pl.col('match')==False).select('name_id', 'New_name_id').sort('name_id').unique('name_id')
+        df = df.join(df2, on = 'name_id', how = 'left')
+        df = df.with_columns(pl.when(pl.col('New_name_id').is_not_null()).then(pl.col('New_name_id')).otherwise(pl.col('name_id')).alias('name_id'))
+        df = df.drop(col, 'New_name_id')
+    return df
+def merging_on_address(df, cols):
+    """Merge name_ids that share the same initials-plus-address key.
+
+    For each address column in *cols*, a key is built per ``dense_id`` from the
+    ``inames`` value and the first 10 characters of the address (split on
+    spaces, unique tokens joined).  name_ids sharing a key are remapped to the
+    smallest name_id of the group.  The column is dropped afterwards.
+    """
+    for col in cols:
+        df = df.fill_null('')
+        # Only the first 10 characters of the address take part in the key.
+        df1 = df.with_columns(pl.col(col).str.slice(0, 10).alias(col))
+        df1 = df1.select('dense_id', 'inames', col).unique().filter(pl.col(col).is_not_null() &(pl.col(col).str.strip_chars()!=''))
+        inames = ['inames', col]
+        df1 = df1.with_columns(pl.concat_str([pl.col(c) for c in inames], separator=" ", ignore_nulls=True).alias(col))
+        df1 = df1.with_columns(pl.col(col).str.split(' ')).explode(col)
+        df1 = df1.sort(col)
+        df1 = df1.group_by('dense_id').agg(pl.col(col).unique().str.join('').alias(col))
+        # Replace the raw column by the key; dense_ids without an address get null.
+        df = df.drop(col)
+        df = df.join(df1, on = 'dense_id', how = 'left')
+        df1 = df.filter(pl.col(col).is_not_null() &(pl.col(col).str.strip_chars()!=''))
+        # Distinct (name_id, key) pairs; Count > 1 means the key is shared.
+        df2 = df1.select('name_id', col).unique()
+        df2 = df2.with_columns(pl.col(col).count().over(col).alias('Count'))
+        df2 = df2.filter(pl.col('Count')>1).sort(col)
+        df2 = df2.join(df2.group_by(col).agg(pl.col("name_id").min().alias("New_name_id")),on = col)
+        df2 = df2.with_columns((pl.col("name_id") == pl.col("New_name_id")).alias("match"))
+        df2 = df2.filter(pl.col('match')==False).select('name_id', 'New_name_id').sort('name_id').unique('name_id')
+        df = df.join(df2, on = 'name_id', how = 'left')
+        df = df.with_columns(pl.when(pl.col('New_name_id').is_not_null()).then(pl.col('New_name_id')).otherwise(pl.col('name_id')).alias('name_id'))
+        df = df.drop(col, 'New_name_id')
+    return df
+def merging_on_dob(df, cols):
+    """Merge name_ids that share the same initials-plus-date key.
+
+    For each column in *cols*, a key is built per ``dense_id`` from the
+    ``inames`` value and the column value (split on spaces, unique tokens
+    joined).  name_ids sharing a key are remapped to the smallest name_id of
+    the group.  The column is dropped afterwards.
+    """
+    for col in cols:
+        df = df.fill_null('')
+        df1 = df.select('dense_id', 'inames', col).unique().filter(pl.col(col).is_not_null() &(pl.col(col).str.strip_chars()!=''))
+        inames = ['inames', col]
+        df1 = df1.with_columns(pl.concat_str([pl.col(c) for c in inames], separator=" ", ignore_nulls=True).alias(col))
+        df1 = df1.with_columns(pl.col(col).str.split(' ')).explode(col)
+        df1 = df1.sort(col)
+        df1 = df1.group_by('dense_id').agg(pl.col(col).unique().str.join('').alias(col))
+        # Replace the raw column by the key; dense_ids without a value get null.
+        df = df.drop(col)
+        df = df.join(df1, on = 'dense_id', how = 'left')
+        df1 = df.filter(pl.col(col).is_not_null() &(pl.col(col).str.strip_chars()!=''))
+        # Distinct (name_id, key) pairs; Count > 1 means the key is shared.
+        df2 = df1.select('name_id', col).unique()
+        df2 = df2.with_columns(pl.col(col).count().over(col).alias('Count'))
+        df2 = df2.filter(pl.col('Count')>1).sort(col)
+        df2 = df2.join(df2.group_by(col).agg(pl.col("name_id").min().alias("New_name_id")),on = col)
+        df2 = df2.with_columns((pl.col("name_id") == pl.col("New_name_id")).alias("match"))
+        df2 = df2.filter(pl.col('match')==False).select('name_id', 'New_name_id').sort('name_id').unique('name_id')
+        df = df.join(df2, on = 'name_id', how = 'left')
+        df = df.with_columns(pl.when(pl.col('New_name_id').is_not_null()).then(pl.col('New_name_id')).otherwise(pl.col('name_id')).alias('name_id'))
+        df = df.drop(col, 'New_name_id')
+    return df
+def merging_on_others(df, cols):
+    """Merge name_ids on the remaining columns, then reduce to one row per dense_id.
+
+    For each column in *cols*, values are first split on ``;``; a key is then
+    built per ``dense_id`` from the ``inames`` value and each piece (split on
+    spaces, unique tokens joined).  name_ids sharing a key are remapped to the
+    smallest name_id of the group and the column is dropped.  Finally
+    ``inames`` is dropped and one row per ``dense_id`` is kept.
+    """
+    for col in cols:
+        df = df.fill_null('')
+        df1 = df.select('dense_id', 'inames', col).unique().filter(pl.col(col).is_not_null() &(pl.col(col).str.strip_chars()!=''))
+        inames = ['inames', col]
+        # Multi-valued cells: one row per ';'-separated piece before building the key.
+        df1 = df1.with_columns(pl.col(col).str.split(';')).explode(col)
+        df1 = df1.with_columns(pl.concat_str([pl.col(c) for c in inames], separator=" ", ignore_nulls=True).alias(col))
+        df1 = df1.with_columns(pl.col(col).str.split(' ')).explode(col)
+        df1 = df1.sort(col)
+        df1 = df1.group_by('dense_id').agg(pl.col(col).unique().str.join('').alias(col))
+        df = df.drop(col)
+        df = df.join(df1, on = 'dense_id', how = 'left')
+        df1 = df.filter(pl.col(col).is_not_null() &(pl.col(col).str.strip_chars()!=''))
+        # Distinct (name_id, key) pairs; Count > 1 means the key is shared.
+        df2 = df1.select('name_id', col).unique()
+        df2 = df2.with_columns(pl.col(col).count().over(col).alias('Count'))
+        df2 = df2.filter(pl.col('Count')>1).sort(col)
+        df2 = df2.join(df2.group_by(col).agg(pl.col("name_id").min().alias("New_name_id")),on = col)
+        df2 = df2.with_columns((pl.col("name_id") == pl.col("New_name_id")).alias("match"))
+        df2 = df2.filter(pl.col('match')==False).select('name_id', 'New_name_id').sort('name_id').unique('name_id')
+        df = df.join(df2, on = 'name_id', how = 'left')
+        df = df.with_columns(pl.when(pl.col('New_name_id').is_not_null()).then(pl.col('New_name_id')).otherwise(pl.col('name_id')).alias('name_id'))
+        df = df.drop(col, 'New_name_id')
+    df = df.drop('inames')
+    df = df.unique('dense_id')
+    return df
+def name_checks(df1, names):
+    df = df1.select(['name_id'] + names).unique()
+    df = df.select([pl.col(col).str.to_uppercase().str.strip_chars().alias(col) for col in df.columns])
+    df = df.fill_null("")
+    df = df.with_columns(
+        pl.concat_str(
+            names,
+            separator=" "
+        )
+        .str.to_uppercase()
+        .str.replace_all(r"[^A-Z0-9 ]", "")
+        .str.replace_all(r"\s+", " ")
+        .str.strip_chars()
+        .alias("norm_name")
+    )
+    canonical = (
+        df
+        .group_by(["name_id", "norm_name"])
+        .agg([
+            pl.len().alias("freq"),
+            pl.col("norm_name").str.len_chars().max().alias("len"),
+            pl.first(names[0]).alias("canon_first"),
+            pl.first(names[1]).alias("canon_middle"),
+            pl.first(names[2]).alias("canon_last"),
+        ])
+        .sort(
+            by=["name_id", "freq", "len"],
+            descending=[False, True, True]
+        )
+        .group_by("name_id")
+        .first()
+        .select([
+            "name_id",
+            "canon_first",
+            "canon_middle",
+            "canon_last",
+        ])
+    )
+    df = df.join(canonical, on="name_id", how="left")
+    df = df.with_columns([
+        pl.struct(["norm_name", "canon_first", "canon_middle", "canon_last"])
+        .map_elements(lambda x: fuzz.token_sort_ratio(
+            x["norm_name"],
+            f"{x['canon_first']} {x['canon_middle']} {x['canon_last']}".strip()
+        ))
+        .alias("name_similarity")
+    ])
+    df = df.with_columns(
+        pl.when(pl.col("name_similarity") >= 75)
+        .then(pl.lit("AUTO_STANDARDIZED"))
+        .otherwise(pl.lit("NEEDS_MANUAL_INTERVENTION"))
+        .alias("comment")
+    )
+    df = df.filter(pl.col('comment')=='NEEDS_MANUAL_INTERVENTION').select('name_id', 'comment').unique()
+    df1 = df1.join(df, on = 'name_id', how = 'left')
+    return df1
+def name_final(df, unique_id, names):
+    """Pick one representative name per *unique_id*.
+
+    Names are upper-cased and stripped, ``UNKNOWN`` is removed from
+    ``names[0]`` and ``names[2]``, and variants are grouped by their
+    normalised form.  For each *unique_id* the variant with the most
+    non-empty name parts wins, then the most frequent, then the longest.
+    Returns a frame with *unique_id* and the three name columns.
+    """
+    df = df.select([unique_id] + names).unique()
+    df = df.select([pl.col(col).str.to_uppercase().str.strip_chars().alias(col) for col in df.columns])
+    df = df.with_columns(pl.col(names[0]).str.replace_all(r"UNKNOWN", ""))
+    df = df.with_columns(pl.col(names[2]).str.replace_all(r"UNKNOWN", ""))
+    df = df.fill_null("")
+    # Normalised name: upper case, only A-Z / 0-9 / space, single spaces.
+    df = df.with_columns(
+        pl.concat_str(
+            names,
+            separator=" "
+        )
+        .str.to_uppercase()
+        .str.replace_all(r"[^A-Z0-9 ]", "")
+        .str.replace_all(r"\s+", " ")
+        .str.strip_chars()
+        .alias("norm_name")
+    )
+    df = (
+        df
+        .group_by([unique_id, "norm_name"])
+        .agg([
+            pl.len().alias("freq"),
+            pl.col("norm_name").str.len_chars().max().alias("len"),
+            pl.first(names[0]).alias(names[0]),
+            pl.first(names[1]).alias(names[1]),
+            pl.first(names[2]).alias(names[2]),
+        ])
+        # countA = number of non-empty name parts in the variant.
+        .with_columns(pl.sum_horizontal([pl.when(pl.col(c).is_not_null() & (pl.col(c) != "")).then(1).otherwise(0) for c in names]).alias("countA"))
+        .sort(["countA", "freq", "len"], descending=[True, True, True])
+        .select([unique_id, names[0], names[1], names[2],])
+        # After the sort, the first row of each group is the preferred variant.
+        .group_by(unique_id).head(1)
+    )
+    return df
+def address_final(df, unique_id, addresses):
+    df = (
+        df
+        .with_columns(pl.concat_str([pl.col(c).fill_null("") for c in [unique_id]+ addresses], separator="|").alias("addr_concat"))
+        .with_columns(pl.sum_horizontal([pl.when(pl.col(c).is_not_null() & (pl.col(c) != "")).then(1).otherwise(0) for c in addresses]).alias("countA"))
+        .with_columns(pl.col("addr_concat").str.len_chars().alias("addr_len"))
+        .with_columns(pl.col("addr_concat").count().over("addr_concat").alias("countif"))
+        .sort(["countif", "countA", "addr_len"], descending=[True, True, True])
+        .group_by(unique_id).head(1)
+        .select([unique_id] + addresses)
+    )
+    return df
+def final_cel(df, unique_id, summary, names, addresses):
+    """Consolidate *df* to one row per *unique_id* and write a summary CSV.
+
+    Name and address columns come from ``name_final`` / ``address_final``.
+    Every other column is merged per *unique_id* by splitting on ``;`` and
+    joining the sorted unique values with ``;``.  The result keeps the
+    original column order and is sorted by *unique_id*.
+
+    Side effect: writes ``summary_<MMDD_HHMM>.csv`` to the current working
+    directory with, per value of the *summary* column, its ``TOTAL COUNT``
+    (before consolidation) and ``UNIQUE COUNT`` (after consolidation).
+    """
+    # Counts per summary value in the raw input.
+    raw_summary = df.select(summary)
+    raw_summary = raw_summary.with_columns(pl.col(summary).str.split(';')).explode(summary)
+    raw_summary = raw_summary.with_columns(pl.col(summary).count().over(summary).alias('TOTAL COUNT')).unique()
+    final_names = name_final(df, unique_id, names)
+    final_addresses = address_final(df, unique_id, addresses)
+    final_cols = df.columns
+    df1 = df
+    df1 = df1.fill_null("")
+    # Start from the distinct ids and left-join one merged column at a time.
+    df = df.select(unique_id).unique()
+    cols = [c for c in df1.columns if c not in names + addresses + [unique_id]]
+    for col in cols:
+        df3 = df1.select([unique_id, col]).filter(pl.col(col) != '')
+        df3 = df3.with_columns(pl.col(col).str.split(';')).explode(col)
+        df3 = df3.unique()
+        df3 = df3.group_by(unique_id).agg(
+             pl.col(col).unique().sort().str.join(';').alias(col)
+        )
+        df = df.join(df3, on=unique_id, how='left')
+    df = df.join(final_names, on = unique_id, how = 'left')
+    df = df.join(final_addresses, on = unique_id, how = 'left')
+    df = df.select(final_cols).sort(unique_id)
+    # Counts per summary value after consolidation.
+    cel_summary = df.select(summary)
+    cel_summary = cel_summary.with_columns(pl.col(summary).str.split(';')).explode(summary)
+    cel_summary = cel_summary.with_columns(pl.col(summary).count().over(summary).alias('UNIQUE COUNT')).unique()
+    final_summary = raw_summary.join(cel_summary, on = summary, how = 'left')
+    now = datetime.now().strftime("%m%d_%H%M")
+    final_summary.write_csv(f'summary_{now}.csv')
+    return df
+def y_columns(df, summary, y_cols):
+    df = (
+        df.with_columns(
+            pl.col(summary)
+            .fill_null("")
+            .str.split(";")
+            .alias("split_vals")
+        )
+        .with_columns([
+            pl.when(pl.col("split_vals").list.contains(v))
+              .then(pl.lit("Y"))
+              .otherwise(pl.lit(""))
+              .alias(v)
+            for v in y_cols
+        ])
+        .drop("split_vals")
+    )
+    return df

r3_test-0.0.1/r3_test.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,26 @@
+Metadata-Version: 2.4
+Name: r3_test
+Version: 0.0.1
+Summary: Just for test
+Author: Ranjeet Aloriya
+License: MIT
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.12
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy
+Requires-Dist: pandas
+Requires-Dist: polars
+Requires-Dist: pyarrow
+Requires-Dist: sqlalchemy
+Requires-Dist: networkx
+Requires-Dist: pyodbc
+Requires-Dist: fastexcel
+Requires-Dist: rapidfuzz
+Requires-Dist: tqdm
+Requires-Dist: openpyxl
+Requires-Dist: xlrd
+Requires-Dist: xlsxwriter
+Dynamic: license-file

r3_test-0.0.1/r3_test.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,10 @@
+LICENSE
+README.md
+pyproject.toml
+r3_test/__init__.py
+r3_test/main.py
+r3_test.egg-info/PKG-INFO
+r3_test.egg-info/SOURCES.txt
+r3_test.egg-info/dependency_links.txt
+r3_test.egg-info/requires.txt
+r3_test.egg-info/top_level.txt

r3_test-0.0.1/r3_test.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

r3_test-0.0.1/r3_test.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,13 @@
+numpy
+pandas
+polars
+pyarrow
+sqlalchemy
+networkx
+pyodbc
+fastexcel
+rapidfuzz
+tqdm
+openpyxl
+xlrd
+xlsxwriter

r3_test-0.0.1/r3_test.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ r3_test

r3_test-0.0.1/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0