PyPI - itertoolkit - Versions diffs - 1.5.0__py3-none-any.whl - Mend

itertoolkit 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

bm_preprocessing/__init__.py +14 -0
bm_preprocessing/importer/DM/__init__.py +7 -0
bm_preprocessing/importer/DM/agg.py +6 -0
bm_preprocessing/importer/DM/dbscan.py +6 -0
bm_preprocessing/importer/DM/finals.py +6 -0
bm_preprocessing/importer/DM/gsp.py +6 -0
bm_preprocessing/importer/DM/test.py +6 -0
bm_preprocessing/importer/Finals/__init__.py +7 -0
bm_preprocessing/importer/Finals/kaadhal.py +6 -0
bm_preprocessing/importer/Finals/raaka.py +6 -0
bm_preprocessing/importer/Finals/seedan.py +6 -0
bm_preprocessing/importer/Finals/vikram.py +6 -0
bm_preprocessing/importer/IR/__init__.py +6 -0
bm_preprocessing/importer/IR/finals.py +6 -0
bm_preprocessing/importer/IR/pagerank.py +6 -0
bm_preprocessing/importer/IR/recommenders_pca.py +8 -0
bm_preprocessing/importer/IR/test.py +6 -0
bm_preprocessing/importer/PY/__init__.py +4 -0
bm_preprocessing/importer/PY/lib_doc.py +6 -0
bm_preprocessing/importer/PY/python_doc.py +6 -0
bm_preprocessing/importer/__init__.py +8 -0
bm_preprocessing/importer/_module_printer.py +23 -0
bm_preprocessing/src/DM/__init__.py +1 -0
bm_preprocessing/src/DM/agg.py +267 -0
bm_preprocessing/src/DM/dbscan.py +218 -0
bm_preprocessing/src/DM/finals.py +19 -0
bm_preprocessing/src/DM/gsp.py +378 -0
bm_preprocessing/src/DM/test.py +19 -0
bm_preprocessing/src/Finals/__init__.py +1 -0
bm_preprocessing/src/Finals/kaadhal.py +1453 -0
bm_preprocessing/src/Finals/raaka.py +1338 -0
bm_preprocessing/src/Finals/seedan.py +1173 -0
bm_preprocessing/src/Finals/vikram.py +520 -0
bm_preprocessing/src/IR/__init__.py +1 -0
bm_preprocessing/src/IR/finals.py +14 -0
bm_preprocessing/src/IR/pagerank.py +109 -0
bm_preprocessing/src/IR/recommenders_pca.py +487 -0
bm_preprocessing/src/IR/test.py +14 -0
bm_preprocessing/src/PY/__init__.py +1 -0
bm_preprocessing/src/PY/lib_doc.py +295 -0
bm_preprocessing/src/PY/python_doc.py +177 -0
bm_preprocessing/src/__init__.py +1 -0
itertoolkit-1.5.0.dist-info/METADATA +120 -0
itertoolkit-1.5.0.dist-info/RECORD +45 -0
itertoolkit-1.5.0.dist-info/WHEEL +4 -0

bm_preprocessing/src/Finals/raaka.py ADDED Viewed

@@ -0,0 +1,1338 @@
+#dm1
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import re
+from collections import defaultdict
+from sklearn.preprocessing import StandardScaler
+from sklearn.cluster import AgglomerativeClustering, DBSCAN
+from scipy.cluster.hierarchy import dendrogram, linkage
+# 🔹 1. PREPROCESSING
+def load_and_clean(filepath, remove_cols=None):
+    data = pd.read_csv(filepath)
+    if remove_cols:
+        data.drop(columns=remove_cols, inplace=True)
+    # Fill missing values instead of dropping
+    for col in data.columns:
+        if data[col].dtype == 'object':
+            data[col].fillna(data[col].mode()[0], inplace=True)
+        else:
+            data[col].fillna(data[col].mean(), inplace=True)
+    # Manual label encoding using category codes
+    cat_cols = data.select_dtypes(include='object').columns
+    for col in cat_cols:
+        data[col] = pd.Categorical(data[col]).codes
+    return data
+def normalize(matrix):
+    scaler = StandardScaler()
+    return scaler.fit_transform(matrix)
+# 🔹 2. VISUALIZATION
+def scatter_clusters(features, cluster_labels, chart_title):
+    unique_labels = np.unique(cluster_labels)
+    colors = plt.cm.tab10(np.linspace(0, 1, len(unique_labels)))
+    for lbl, col in zip(unique_labels, colors):
+        mask = np.array(cluster_labels) == lbl
+        plt.scatter(features[mask, 0], features[mask, 1], color=col, label=f'Cluster {lbl}', s=20)
+    plt.title(chart_title)
+    plt.xlabel("Feature 1")
+    plt.ylabel("Feature 2")
+    plt.legend(fontsize=6)
+    plt.show()
+def show_dendrogram(data_matrix):
+    Z = linkage(data_matrix, method='ward')
+    dendrogram(Z)
+    plt.title("Dendrogram")
+    plt.show()
+def bar_chart(x_vals, y_vals, x_label, y_label, title):
+    plt.bar(range(len(x_vals)), y_vals, tick_label=x_vals if len(x_vals) < 30 else None)
+    plt.xlabel(x_label)
+    plt.ylabel(y_label)
+    plt.title(title)
+    plt.show()
+def pie_chart(dataframe, col_name):
+    freq = dataframe[col_name].value_counts()
+    plt.pie(freq.values, labels=freq.index.tolist(), autopct='%1.1f%%')
+    plt.title(f"{col_name} Distribution")
+    plt.show()
+# 🔹 3. AGGLOMERATIVE CLUSTERING
+def run_agglomerative(file):
+    data = load_and_clean(file, remove_cols=["CustomerID"])
+    feature_cols = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
+    X = data[feature_cols].values
+    X_norm = normalize(X)
+    pie_chart(data, "Gender")
+    bar_chart(data['Age'].values, data['Annual Income (k$)'].values, "Age", "Income", "Age vs Income")
+    # Ward linkage agglomerative clustering with 5 clusters
+    clusterer = AgglomerativeClustering(n_clusters=5, linkage='ward')
+    cluster_labels = clusterer.fit_predict(X_norm)
+    scatter_clusters(X_norm, cluster_labels, "Agglomerative Clustering")
+    show_dendrogram(X_norm)
+# 🔹 4. DBSCAN (CUSTOM)
+def compute_distance_matrix(X):
+    # Vectorized pairwise Euclidean distances using broadcasting
+    diff = X[:, np.newaxis, :] - X[np.newaxis, :, :]
+    return np.sqrt((diff ** 2).sum(axis=2))
+def custom_dbscan(X, eps=0.5, min_pts=5):
+    dist_matrix = compute_distance_matrix(X)
+    n = len(X)
+    # -1 = noise, 0 = unvisited
+    labels = [0] * n
+    current_cluster = 0
+    for point_idx in range(n):
+        if labels[point_idx] != 0:
+            continue
+        # Get neighbors using precomputed distances
+        neighbor_indices = list(np.where(dist_matrix[point_idx] <= eps)[0])
+        if len(neighbor_indices) < min_pts:
+            labels[point_idx] = -1  # noise
+            continue
+        current_cluster += 1
+        labels[point_idx] = current_cluster
+        seed_set = set(neighbor_indices) - {point_idx}
+        while seed_set:
+            q = seed_set.pop()
+            if labels[q] == -1:
+                labels[q] = current_cluster
+            if labels[q] != 0:
+                continue
+            labels[q] = current_cluster
+            q_neighbors = list(np.where(dist_matrix[q] <= eps)[0])
+            if len(q_neighbors) >= min_pts:
+                seed_set.update(q_neighbors)
+    return labels
+def run_dbscan(file):
+    data = load_and_clean(file, remove_cols=["Channel", "Region"])
+    # Display first few records
+    print("First 5 records:\n", data.head())
+    features = data[['Grocery', 'Milk']].values
+    features_scaled = normalize(features)
+    plt.scatter(features_scaled[:, 0], features_scaled[:, 1])
+    plt.title("Normalized Data")
+    plt.show()
+    # Custom DBSCAN
+    my_labels = custom_dbscan(features_scaled, eps=0.5, min_pts=15)
+    scatter_clusters(features_scaled, my_labels, "Custom DBSCAN")
+    # Sklearn DBSCAN (for comparison)
+    sk_labels = DBSCAN(eps=0.5, min_samples=15).fit_predict(features_scaled)
+    scatter_clusters(features_scaled, sk_labels, "Sklearn DBSCAN")
+# 🔹 5. MS-GSP (FULL)
+def parse_sequences(filepath):
+    all_seqs = []
+    with open(filepath) as f:
+        for line in f:
+            itemsets = re.findall(r'\{(.*?)\}', line.strip())
+            seq = [frozenset(map(int, s.split(','))) for s in itemsets]
+            if seq:
+                all_seqs.append(seq)
+    return all_seqs
+def parse_params(filepath):
+    mis_vals = {}
+    sdc_val = 0.0
+    with open(filepath) as f:
+        for line in f:
+            line = line.strip()
+            if 'MIS' in line:
+                item_id = int(re.search(r'\((\d+)\)', line).group(1))
+                mis_vals[item_id] = float(line.split('=')[-1].strip())
+            elif 'SDC' in line:
+                sdc_val = float(line.split('=')[-1].strip())
+    return mis_vals, sdc_val
+def item_support(sequences):
+    total = len(sequences)
+    freq = defaultdict(set)
+    for sid, seq in enumerate(sequences):
+        seen = set()
+        for itemset in seq:
+            seen |= itemset
+        for item in seen:
+            freq[item].add(sid)
+    return {item: len(sids) / total for item, sids in freq.items()}
+def pattern_contains(pattern, sequence):
+    # Check if pattern is a subsequence of sequence
+    pos = 0
+    for itemset in sequence:
+        if pos < len(pattern) and pattern[pos].issubset(itemset):
+            pos += 1
+    return pos == len(pattern)
+def support_of(pattern, sequences):
+    return sum(1 for seq in sequences if pattern_contains(pattern, seq))
+def all_items_in(pattern):
+    return set().union(*pattern)
+def sdc_ok(pattern, sup_map, sdc):
+    items = list(all_items_in(pattern))
+    for a in range(len(items)):
+        for b in range(a + 1, len(items)):
+            if abs(sup_map[items[a]] - sup_map[items[b]]) > sdc:
+                return False
+    return True
+def MSGSP(file_data, file_para):
+    """Mine sequential patterns from ``file_data`` with the MIS/SDC values in ``file_para``.
+
+    Both files are parsed, a bar chart of per-item support is shown, and the
+    pattern levels F1, F2, F3, ... are built and printed as they are found.
+    Finally every retained pattern is printed with the number of sequences
+    that contain it.  The function returns nothing.
+
+    Args:
+        file_data: Path of the sequence file read by ``parse_sequences``.
+        file_para: Path of the parameter file read by ``parse_params``.
+    """
+    sequences = parse_sequences(file_data)
+    MIS, SDC = parse_params(file_para)
+    sup_map = item_support(sequences)
+    total = len(sequences)
+    # Plot item support
+    plt.bar(list(sup_map.keys()), list(sup_map.values()))
+    plt.title("Support Distribution")
+    plt.show()
+    # --- F1: items sorted by MIS that meet their own threshold ---
+    sorted_items = sorted(MIS.keys(), key=lambda x: MIS[x])
+    # Items absent from the data count as support 0.
+    freq_items = [i for i in sorted_items if sup_map.get(i, 0) >= MIS[i]]
+    print("L:", freq_items)
+    F1 = [[frozenset([i])] for i in freq_items]
+    print("\nF1:", F1)
+    F = [F1]
+    # --- F2: candidate 2-sequences and 2-itemsets ---
+    C2 = []
+    for i in range(len(freq_items)):
+        for j in range(i + 1, len(freq_items)):
+            a, b = freq_items[i], freq_items[j]
+            # Only pair items whose supports differ by at most SDC.
+            if abs(sup_map[a] - sup_map[b]) <= SDC:
+                C2.append([frozenset([a, b])])       # itemset
+                C2.append([frozenset([a]), frozenset([b])])  # sequence
+    F2 = []
+    for cand in C2:
+        items = all_items_in(cand)
+        # A candidate is judged against the smallest MIS among its items.
+        min_mis = min(MIS[i] for i in items)
+        s = support_of(cand, sequences) / total
+        if s >= min_mis and sdc_ok(cand, sup_map, SDC) and cand not in F2:
+            F2.append(cand)
+    print("\nF2:", F2)
+    F.append(F2)
+    # --- Higher-order levels ---
+    level = 3
+    while True:
+        # F[level - 2] is the most recently appended level.
+        prev = F[level - 2]
+        candidates = []
+        for p in prev:
+            for q in prev:
+                # Join p and q when p without its first element equals q
+                # without its last element; the result is p plus q's last element.
+                if p[1:] == q[:-1]:
+                    extended = p + [q[-1]]
+                    if extended not in candidates:
+                        candidates.append(extended)
+        if not candidates:
+            break
+        Fk = []
+        for cand in candidates:
+            items = all_items_in(cand)
+            min_mis = min(MIS[i] for i in items)
+            s = support_of(cand, sequences) / total
+            if s >= min_mis and sdc_ok(cand, sup_map, SDC) and cand not in Fk:
+                Fk.append(cand)
+        # Stop as soon as a level yields no frequent pattern.
+        if not Fk:
+            break
+        print(f"\nF{level}:", Fk)
+        F.append(Fk)
+        level += 1
+    # --- Final output ---
+    print("\nFinal Patterns:\n")
+    for level_patterns in F:
+        for pat in level_patterns:
+            cnt = support_of(pat, sequences)
+            # Render as <{a,b}{c}>; cnt is an absolute sequence count, not a fraction.
+            pat_str = "<" + "".join("{" + ",".join(map(str, s)) + "}" for s in pat) + ">"
+            print(f"Pattern: {pat_str}  count: {cnt}")
+# 🔥 RUN EVERYTHING
+# NOTE: these calls run at module level, and the path arguments are
+# placeholder strings that must be replaced with real file paths.
+# Part 1: agglomerative clustering pipeline
+run_agglomerative("filepath")
+# Part 2: custom vs. sklearn DBSCAN
+run_dbscan("/filepath")
+# Part 3: MS-GSP (sequence data file, then parameter file)
+MSGSP("/file1path", "/file2path")
+#dm2
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.preprocessing import LabelEncoder
+from sklearn.cluster import AgglomerativeClustering
+from scipy.cluster.hierarchy import dendrogram, linkage
+# Load dataset
+df = pd.read_csv("FinalDM/Dataset/Mall_Customers.csv")
+# 1. Remove CustomerID
+df = df.drop("CustomerID", axis=1)
+# 2. Check missing values
+print("Missing values:\n", df.isnull().sum())
+# 3. Encode Gender (Male=1, Female=0)
+df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
+# 4. Pie chart (Male vs Female)
+gender_counts = df['Gender'].value_counts()
+labels = ['Female', 'Male']
+plt.pie(gender_counts, labels=labels, autopct='%1.1f%%')
+plt.title("Gender Distribution")
+plt.show()
+# 5. Bar graph (Age & Income)
+plt.bar(df['Age'], df['Annual Income (k$)'])
+plt.xlabel("Age")
+plt.ylabel("Income")
+plt.title("Age vs Income")
+plt.show()
+# 6. Agglomerative Clustering
+X = df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]
+model = AgglomerativeClustering(n_clusters=5, linkage='ward')
+labels = model.fit_predict(X)
+# Plot clusters
+plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels)
+plt.xlabel("Age")
+plt.ylabel("Income")
+plt.title("Agglomerative Clustering")
+plt.show()
+# 7. Dendrogram
+linked = linkage(X, method='ward')
+plt.figure(figsize=(10, 5))
+dendrogram(linked)
+plt.title("Dendrogram")
+plt.show()
+# =========================================================
+# problem 1 - using manual implementation
+# =========================================================
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.preprocessing import StandardScaler
+# Load dataset
+df = pd.read_csv("FinalDM/Dataset/Wholesale customers data.csv")
+# Drop the Channel and Region columns
+df = df.drop(['Channel', 'Region'], axis=1)
+# 1. Display first few records
+print("First 5 records:\n", df.head())
+# Select Grocery & Milk (column 0 = Grocery, column 1 = Milk)
+X = df[['Grocery', 'Milk']]
+# Normalize: X_scaled is the StandardScaler output used by the DBSCAN code below
+scaler = StandardScaler()
+X_scaled = scaler.fit_transform(X)
+# 2. Visualize normalized dataset
+plt.scatter(X_scaled[:, 0], X_scaled[:, 1])
+plt.xlabel("Grocery (Normalized)")
+plt.ylabel("Milk (Normalized)")
+plt.title("Normalized Dataset")
+plt.show()
+# -------- CUSTOM DBSCAN --------
+def euclidean(p1, p2):
+    return np.sqrt(np.sum((p1 - p2) ** 2))
+def get_neighbors(X, point_idx, eps):
+    return [i for i in range(len(X)) if euclidean(X[point_idx], X[i]) <= eps]
+def dbscan(X, eps, min_pts):
+    labels = [-1] * len(X)
+    cluster_id = 0
+    for i in range(len(X)):
+        if labels[i] != -1:
+            continue
+        neighbors = get_neighbors(X, i, eps)
+        if len(neighbors) < min_pts:
+            labels[i] = 0  # noise
+        else:
+            cluster_id += 1
+            labels[i] = cluster_id
+            j = 0
+            while j < len(neighbors):
+                n = neighbors[j]
+                if labels[n] == 0:
+                    labels[n] = cluster_id
+                if labels[n] == -1:
+                    labels[n] = cluster_id
+                    new_neighbors = get_neighbors(X, n, eps)
+                    if len(new_neighbors) >= min_pts:
+                        neighbors += new_neighbors
+                j += 1
+    return labels
+# Apply the custom DBSCAN to the scaled Grocery/Milk data
+labels = dbscan(X_scaled, eps=0.5, min_pts=15)
+# 3. Plot cluster results (points coloured by label; this dbscan labels noise 0)
+plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=labels)
+plt.xlabel("Grocery")
+plt.ylabel("Milk")
+plt.title("Custom DBSCAN Clustering")
+plt.show()
+# problem 2 -- Using Built-in
+from sklearn.datasets import make_moons
+from sklearn.cluster import DBSCAN
+# Generate moons dataset (no random_state is passed)
+X, _ = make_moons(n_samples=2000, noise=0.05)
+# Apply DBSCAN
+db = DBSCAN(eps=0.2, min_samples=5)
+labels = db.fit_predict(X)
+# Plot
+plt.scatter(X[:, 0], X[:, 1], c=labels)
+plt.title("DBSCAN on Moons Dataset")
+plt.show()
+# Add noise: 200 extra points drawn uniformly from [-1.5, 2.5) on both axes
+noise = np.random.uniform(low=-1.5, high=2.5, size=(200, 2))
+X_noisy = np.vstack([X, noise])
+# Re-fit the same estimator on the augmented data
+labels_noisy = db.fit_predict(X_noisy)
+plt.scatter(X_noisy[:, 0], X_noisy[:, 1], c=labels_noisy)
+plt.title("DBSCAN with Noise")
+plt.show()
+# =========================================================
+# FULL MS-GSP
+# =========================================================
+import re
+import matplotlib.pyplot as plt
+from collections import defaultdict
+# -------------------------------
+# 1. READ DATA
+# -------------------------------
+def read_data(file):
+    sequences = []
+    with open(file, 'r') as f:
+        for line in f:
+            sets = re.findall(r'\{(.*?)\}', line)
+            seq = [set(map(int, s.split(','))) for s in sets]
+            sequences.append(seq)
+    return sequences
+# -------------------------------
+# 2. READ PARAMETERS
+# -------------------------------
+def read_params(file):
+    MIS = {}
+    SDC = 0
+    with open(file, 'r') as f:
+        for line in f:
+            if "MIS" in line:
+                item = int(re.findall(r'\((\d+)\)', line)[0])
+                MIS[item] = float(line.split('=')[1])
+            elif "SDC" in line:
+                SDC = float(line.split('=')[1])
+    return MIS, SDC
+# -------------------------------
+# 3. SUPPORT
+# -------------------------------
+def get_support(sequences):
+    count = defaultdict(int)
+    total = len(sequences)
+    for seq in sequences:
+        items = set()
+        for s in seq:
+            items |= s
+        for item in items:
+            count[item] += 1
+    support = {k: v / total for k, v in count.items()}
+    return support, count
+# -------------------------------
+# 4. INIT PASS (L)
+# -------------------------------
+def init_pass(MIS, support):
+    sorted_items = sorted(MIS.keys(), key=lambda x: MIS[x])
+    L = []
+    for item in sorted_items:
+        if support.get(item, 0) >= MIS[item]:
+            L.append(item)
+    return L
+# -------------------------------
+# 5. F1
+# -------------------------------
+def generate_F1(L):
+    return [[{item}] for item in L]
+# -------------------------------
+# 6. SUBSEQUENCE CHECK
+# -------------------------------
+def is_subsequence(pattern, sequence):
+    i = 0
+    for s in sequence:
+        if i < len(pattern) and pattern[i].issubset(s):
+            i += 1
+    return i == len(pattern)
+# -------------------------------
+# 7. COUNT SUPPORT
+# -------------------------------
+def count_support(pattern, sequences):
+    return sum(is_subsequence(pattern, seq) for seq in sequences)
+# -------------------------------
+# 8. GET ITEMS
+# -------------------------------
+def get_items(pattern):
+    items = set()
+    for s in pattern:
+        items |= s
+    return items
+# -------------------------------
+# 9. SDC CHECK
+# -------------------------------
+def check_SDC(pattern, support, SDC):
+    items = list(get_items(pattern))
+    for i in range(len(items)):
+        for j in range(i + 1, len(items)):
+            if abs(support[items[i]] - support[items[j]]) > SDC:
+                return False
+    return True
+# -------------------------------
+# 10. C2 GENERATION
+# -------------------------------
+def generate_C2(L, support, MIS, SDC):
+    C2 = []
+    for i in range(len(L)):
+        for j in range(i + 1, len(L)):
+            i1, i2 = L[i], L[j]
+            if support[i2] >= MIS[i1] and abs(support[i2] - support[i1]) <= SDC:
+                C2.append([{i1, i2}])     # same itemset
+                C2.append([{i1}, {i2}])   # sequence
+    return C2
+# -------------------------------
+# 11. JOIN STEP
+# -------------------------------
+def join_step(Fk_1):
+    Ck = []
+    for p in Fk_1:
+        for q in Fk_1:
+            if p[1:] == q[:-1]:
+                candidate = p + [q[-1]]
+                if candidate not in Ck:
+                    Ck.append(candidate)
+    return Ck
+# -------------------------------
+# 12. PRUNE STEP
+# -------------------------------
+def prune(Ck, Fk_1):
+    pruned = []
+    for c in Ck:
+        valid = True
+        for i in range(len(c)):
+            sub = c[:i] + c[i+1:]
+            if sub not in Fk_1:
+                valid = False
+                break
+        if valid:
+            pruned.append(c)
+    return pruned
+# -------------------------------
+# 13. MS-GSP MAIN
+# -------------------------------
+def MSGSP(sequences, MIS, SDC):
+    support, raw_count = get_support(sequences)
+    # Plot support
+    plt.bar(list(support.keys()), list(support.values()))
+    plt.title("Support Distribution")
+    plt.show()
+    L = init_pass(MIS, support)
+    print("L:", L)
+    F = []
+    # F1
+    F1 = generate_F1(L)
+    print("\nF1:", F1)
+    F.append(F1)
+    # F2
+    C2 = generate_C2(L, support, MIS, SDC)
+    F2 = []
+    for c in C2:
+        count = count_support(c, sequences)
+        sup = count / len(sequences)
+        min_mis = min(MIS[item] for item in get_items(c))
+        if sup >= min_mis and check_SDC(c, support, SDC):
+            if c not in F2:
+                F2.append(c)
+    print("\nF2:", F2)
+    F.append(F2)
+    # Fk
+    k = 3
+    while True:
+        Ck = join_step(F[k-2])
+        Ck = prune(Ck, F[k-2])
+        if not Ck:
+            break
+        Fk = []
+        for c in Ck:
+            count = count_support(c, sequences)
+            sup = count / len(sequences)
+            min_mis = min(MIS[item] for item in get_items(c))
+            if sup >= min_mis and check_SDC(c, support, SDC):
+                if c not in Fk:
+                    Fk.append(c)
+        if not Fk:
+            break
+        print(f"\nF{k}:", Fk)
+        F.append(Fk)
+        k += 1
+    return F
+# -------------------------------
+# 14. PRINT OUTPUT
+# -------------------------------
+def print_patterns(F, sequences):
+    print("\nFinal Patterns:\n")
+    for level in F:
+        for pattern in level:
+            count = count_support(pattern, sequences)
+            pattern_str = "<"
+            for s in pattern:
+                pattern_str += "{" + ",".join(map(str, s)) + "}"
+            pattern_str += ">"
+            print(f"Pattern: {pattern_str}  count: {count}")
+# -------------------------------
+# RUN
+# -------------------------------
+# Parse the sequence database and the MIS/SDC parameter file
+sequences = read_data("FinalDM/Dataset/data (1).txt")
+MIS, SDC = read_params("FinalDM/Dataset/para.txt")
+# Mine all levels (plots and prints as it goes), then print each pattern with its count
+F = MSGSP(sequences, MIS, SDC)
+print_patterns(F, sequences)
+# =========================================================
+# GSP
+# =========================================================
+import re
+import matplotlib.pyplot as plt
+from collections import defaultdict
+from itertools import combinations
+# -------------------------------
+# 1. READ DATA
+# -------------------------------
+def read_data(file):
+    sequences = []
+    with open(file, 'r') as f:
+        for line in f:
+            sets = re.findall(r'\{(.*?)\}', line)
+            seq = [set(map(int, s.split(','))) for s in sets]
+            sequences.append(seq)
+    return sequences
+# -------------------------------
+# 2. READ PARAMETERS (MIS + SDC)
+# -------------------------------
+def read_params(file):
+    MIS = {}
+    SDC = 0
+    with open(file, 'r') as f:
+        for line in f:
+            if "MIS" in line:
+                item = int(re.findall(r'\((\d+)\)', line)[0])
+                val = float(line.split('=')[1])
+                MIS[item] = val
+            elif "SDC" in line:
+                SDC = float(line.split('=')[1])
+    return MIS, SDC
+# -------------------------------
+# 3. SUPPORT CALCULATION
+# -------------------------------
+def get_support(sequences):
+    count = defaultdict(int)
+    total = len(sequences)
+    for seq in sequences:
+        unique_items = set()
+        for s in seq:
+            unique_items |= s
+        for item in unique_items:
+            count[item] += 1
+    support = {k: v / total for k, v in count.items()}
+    return support, count
+# -------------------------------
+# 4. PLOT SUPPORT GRAPH
+# -------------------------------
+def plot_support(support):
+    items = list(support.keys())
+    values = list(support.values())
+    plt.bar(items, values)
+    plt.xlabel("Items")
+    plt.ylabel("Support")
+    plt.title("Support Distribution")
+    plt.show()
+# -------------------------------
+# 5. FREQUENT 1-ITEMSETS
+# -------------------------------
+def frequent_1_itemsets(support, MIS):
+    return {item for item in support if support[item] >= MIS[item]}
+# -------------------------------
+# 6. CHECK SUBSEQUENCE
+# -------------------------------
+def is_subsequence(pattern, sequence):
+    i = 0
+    for s in sequence:
+        if pattern[i].issubset(s):
+            i += 1
+            if i == len(pattern):
+                return True
+    return False
+# -------------------------------
+# 7. COUNT PATTERN SUPPORT
+# -------------------------------
+def count_pattern(pattern, sequences):
+    count = 0
+    for seq in sequences:
+        if is_subsequence(pattern, seq):
+            count += 1
+    return count
+# -------------------------------
+# 8. GENERATE CANDIDATES (GSP STYLE)
+# -------------------------------
+def generate_candidates(prev_patterns):
+    candidates = []
+    for p1 in prev_patterns:
+        for p2 in prev_patterns:
+            if p1[1:] == p2[:-1]:
+                new_pattern = p1 + [p2[-1]]
+                candidates.append(new_pattern)
+    return candidates
+# -------------------------------
+# 9. FILTER USING MIS + SDC
+# -------------------------------
+def filter_patterns(candidates, sequences, MIS, SDC):
+    total = len(sequences)
+    valid_patterns = []
+    for pattern in candidates:
+        count = count_pattern(pattern, sequences)
+        support = count / total
+        items = set()
+        for p in pattern:
+            items |= p
+        # MIS condition
+        if all(support >= MIS[item] for item in items):
+            # SDC condition (approx)
+            supports = [support]
+            if max(supports) - min(supports) <= SDC:
+                valid_patterns.append((pattern, count))
+    return valid_patterns
+# -------------------------------
+# 10. PRINT PATTERNS
+# -------------------------------
+def print_patterns(patterns):
+    for pattern, count in patterns:
+        pattern_str = "<"
+        for p in pattern:
+            pattern_str += "{" + ",".join(map(str, p)) + "}"
+        pattern_str += ">"
+        print(f"Pattern: {pattern_str}  count: {count}")
+# -------------------------------
+# MAIN EXECUTION
+# -------------------------------
+# Parse the sequence database and the MIS/SDC parameter file
+sequences = read_data("FinalDM/Dataset/data (1).txt")
+MIS, SDC = read_params("FinalDM/Dataset/para.txt")
+# Support
+support, raw_count = get_support(sequences)
+print("SUPPORT VALUES:")
+for k, v in support.items():
+    print(f"Item {k}: {v:.2f}")
+# Plot graph
+plot_support(support)
+# Frequent 1-itemsets
+F1 = frequent_1_itemsets(support, MIS)
+print("\nFrequent 1-itemsets:", F1)
+# Convert F1 to sequence format (each item becomes the one-element pattern [{item}])
+patterns = [[{item}] for item in F1]
+# k is the length of the patterns generated in the current round
+k = 2
+all_patterns = []
+while patterns:
+    print(f"\nGenerating patterns of length {k}...")
+    candidates = generate_candidates(patterns)
+    valid_patterns = filter_patterns(candidates, sequences, MIS, SDC)
+    # Stop when a round yields no valid pattern
+    if not valid_patterns:
+        break
+    print_patterns(valid_patterns)
+    all_patterns.extend(valid_patterns)
+    # Prepare next iteration: only the patterns, without their counts
+    patterns = [pattern for pattern, _ in valid_patterns]
+    k += 1
+# Final Output (patterns of length 2 and above; F1 is not included in all_patterns)
+print("\nFINAL FREQUENT SEQUENTIAL PATTERNS:")
+print_patterns(all_patterns)
+#IR1
+import pandas as pd
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.decomposition import PCA
+import networkx as nx
+import matplotlib.pyplot as plt
+import seaborn as sns
+# -------------------- STOPWORDS --------------------
+# Lower-case words that preprocess() removes from every document.
+stop_words = {
+    "is","am","are","was","were","be","been","being",
+    "a","an","the","and","or","not","in","on","at","to",
+    "for","with","by","of","that","this","it","as","from",
+    "but","about","into","over","after","before","between",
+    "out","up","down","so","than","too","very","can","will"
+}
+def simple_stem(word):
+    suffixes = ["ing","ed","ly","es","s","ment"]
+    for suf in suffixes:
+        if word.endswith(suf) and len(word) > len(suf)+2:
+            return word[:-len(suf)]
+    return word
+def preprocess(text):
+    tokens = text.lower().split()
+    tokens = [t.strip(".,!?:;()[]{}\"'") for t in tokens]
+    tokens = [t for t in tokens if t and t not in stop_words]
+    tokens = [simple_stem(t) for t in tokens]
+    return " ".join(tokens)
+# -------------------- LOAD DATASET --------------------
+# Read the corpus; the code below uses its "text" and "id" columns
+df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IRdata.csv")
+# Cleaned copy of each document (values are cast to str first)
+df["clean_text"] = df["text"].astype(str).apply(preprocess)
+docs = df["clean_text"].tolist()
+print("\n--- Preprocessed Documents ---")
+print(df[["id","clean_text"]])
+# ===============================================================
+# 1️⃣ CONTENT-BASED RECOMMENDATION
+# ===============================================================
+# Term-count vectors for the cleaned documents
+vectorizer = CountVectorizer()
+X = vectorizer.fit_transform(docs)
+# similarity_matrix[i][j] is the cosine similarity between documents i and j
+similarity_matrix = cosine_similarity(X)
+def content_recommend(doc_id, top_n=3):
+    scores = list(enumerate(similarity_matrix[doc_id]))
+    scores = sorted(scores, key=lambda x: x[1], reverse=True)
+    scores = scores[1:top_n+1]
+    print(f"\n--- Content-Based Recommendations for Document {doc_id+1} ---")
+    for idx, score in scores:
+        print(f"Doc {idx+1} (Score={score:.3f}): {df.iloc[idx]['text']}")
+# Print the recommendations for the first document (index 0)
+content_recommend(0)
+# ---------------- VISUALIZATION: Similarity Heatmap ----------------
+plt.figure(figsize=(8,6))
+# annot=True writes each similarity value into its cell
+sns.heatmap(similarity_matrix, annot=True, cmap="Blues")
+plt.title("Document Similarity Matrix (Content-Based)")
+plt.xlabel("Document ID")
+plt.ylabel("Document ID")
+plt.show()
+# ===============================================================
+# 2️⃣ COLLABORATIVE FILTERING (USER–ITEM)
+# ===============================================================
+# Hard-coded sample ratings: one row per (user, item, rating)
+ratings = pd.DataFrame({
+    "user": ["u1","u1","u2","u2","u3","u3","u4","u4"],
+    "item": [1,2,2,3,3,4,4,5],
+    "rating": [5,4,4,5,3,4,5,4]
+})
+# Users as rows, items as columns; a missing rating becomes 0
+user_item_matrix = ratings.pivot_table(index="user", columns="item", values="rating")
+user_item_matrix = user_item_matrix.fillna(0)
+# Cosine similarity between user rows, labelled by user on both axes
+user_sim = cosine_similarity(user_item_matrix)
+user_sim_df = pd.DataFrame(user_sim, index=user_item_matrix.index, columns=user_item_matrix.index)
+def recommend_item(user, top_n=3):
+    similar_users = user_sim_df[user].sort_values(ascending=False).index[1:top_n+1]
+    print(f"\n--- Collaborative Filtering Recommendation for {user} ---")
+    print("Similar Users:", list(similar_users))
+recommend_item("u1")
+#rating recommendation - given user and item:
+# ===============================================================
+# 🔹 PREDICT MISSING RATING (USER-BASED CF)
+# ===============================================================
+def predict_rating(user, item):
+    numerator = 0
+    denominator = 0
+    for other_user in user_item_matrix.index:
+        # consider only users who rated this item
+        if user_item_matrix.loc[other_user, item] > 0:
+            sim = user_sim_df.loc[user, other_user]
+            rating = user_item_matrix.loc[other_user, item]
+            numerator += sim * rating
+            denominator += abs(sim)
+    if denominator == 0:
+        return 0  # no similar users found
+    return numerator / denominator
+print("\n--- Predicted Rating ---")
+print("u1 rating for item 3:", predict_rating("u1", 3))
+# Optional - rating for all missing values:
+def predict_rating(user, item):
+    numerator = 0
+    denominator = 0
+    for other_user in user_item_matrix.index:
+        if user_item_matrix.loc[other_user, item] > 0:
+            sim = user_sim_df.loc[user, other_user]
+            rating = user_item_matrix.loc[other_user, item]
+            numerator += sim * rating
+            denominator += abs(sim)
+    if denominator == 0:
+        # fallback: average rating of the item
+        item_ratings = user_item_matrix[item]
+        non_zero_ratings = item_ratings[item_ratings > 0]
+        if len(non_zero_ratings) == 0:
+            return 0
+        return non_zero_ratings.mean()
+    return numerator / denominator
+def fill_missing_ratings():
+    filled = user_item_matrix.copy()
+    for user in user_item_matrix.index:
+        for item in user_item_matrix.columns:
+            if user_item_matrix.loc[user, item] == 0:
+                filled.loc[user, item] = predict_rating(user, item)
+    return filled
+filled_matrix = fill_missing_ratings()
+print("\n--- Filled User-Item Matrix ---")
+print(filled_matrix)
+# ---------------- VISUALIZATION: User Similarity Heatmap ----------------
+plt.figure(figsize=(6,4))
+sns.heatmap(user_sim_df, annot=True, cmap="Greens")
+plt.title("User Similarity Matrix (Collaborative Filtering)")
+plt.show()
+# ===============================================================
+# 3️⃣ PAGE RANK ALGORITHM
+# ===============================================================
+G = nx.Graph()
+for i in range(len(docs)):
+    for j in range(i+1, len(docs)):
+        if similarity_matrix[i][j] > 0.2:
+            G.add_edge(i, j, weight=similarity_matrix[i][j])
+pr = nx.pagerank(G)
+print("\n--- PageRank Scores for Documents ---")
+for i, score in pr.items():
+    print(f"Doc {i+1}: Score = {score:.4f}")
+# ---------------- VISUALIZATION: PageRank Graph ----------------
+plt.figure(figsize=(8,6))
+pos = nx.spring_layout(G, seed=7)
+sizes = [5000 * pr[i] for i in G.nodes()]
+nx.draw(G, pos, with_labels=True, node_size=sizes, node_color='skyblue', edge_color='gray')
+plt.title("Document Graph Based on Similarity (PageRank Size = Score)")
+plt.show()
+# ===============================================================
+# 4️⃣ DIMENSION REDUCTION (PCA)
+# ===============================================================
+pca = PCA(n_components=2)
+X_reduced = pca.fit_transform(X.toarray())
+print("\n--- PCA Dimensionality Reduction (2D) ---")
+for i, vec in enumerate(X_reduced):
+    print(f"Doc {i+1}: {vec}")
+# ---------------- VISUALIZATION: PCA Scatter Plot ----------------
+plt.figure(figsize=(8,6))
+plt.scatter(X_reduced[:,0], X_reduced[:,1], s=120, color='purple')
+for i in range(len(X_reduced)):
+    plt.text(X_reduced[i,0]+0.02, X_reduced[i,1]+0.02, f"Doc {i+1}")
+plt.title("PCA - 2D Document Vector Visualization")
+plt.xlabel("PC1")
+plt.ylabel("PC2")
+plt.grid(True)
+plt.show()
+#IR2
+import pandas as pd
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.decomposition import PCA
+import networkx as nx
+import matplotlib.pyplot as plt
+import seaborn as sns
+# -------------------- STOPWORDS --------------------
+stop_words = {
+    "is","am","are","was","were","be","been","being",
+    "a","an","the","and","or","not","in","on","at","to",
+    "for","with","by","of","that","this","it","as","from",
+    "but","about","into","over","after","before","between",
+    "out","up","down","so","than","too","very","can","will"
+}
+# Different style: uses a loop with early return instead of checking all suffixes
+def simple_stem(word):
+    for suf in ["ing","ed","ly","es","s","ment"]:
+        if word.endswith(suf) and len(word) > len(suf) + 2:
+            return word[:-len(suf)]
+    return word
+# Different style: uses list comprehension chain instead of step-by-step
+def preprocess(text):
+    raw_tokens = text.lower().split()
+    cleaned    = [t.strip(".,!?:;()[]{}\"'") for t in raw_tokens]
+    filtered   = [t for t in cleaned if t and t not in stop_words]
+    stemmed    = [simple_stem(t) for t in filtered]
+    return " ".join(stemmed)
+# -------------------- LOAD DATASET --------------------
+df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IRdata.csv")
+df["clean_text"] = df["text"].astype(str).apply(preprocess)
+docs = df["clean_text"].tolist()
+print("\n--- Preprocessed Documents ---")
+print(df[["id", "clean_text"]])
+# ===============================================================
+# 1️⃣ CONTENT-BASED RECOMMENDATION
+# ===============================================================
+# Same CountVectorizer — different style: fit and transform separated
+cv = CountVectorizer()
+cv.fit(docs)
+X = cv.transform(docs)
+# Manual cosine similarity — same result as sklearn's cosine_similarity
+# dot(A, B) / (||A|| * ||B||) — done via matrix multiplication on normalized vectors
+X_dense  = X.toarray().astype(float)
+norms    = np.linalg.norm(X_dense, axis=1, keepdims=True)
+norms[norms == 0] = 1e-10
+X_normed = X_dense / norms
+similarity_matrix = X_normed @ X_normed.T   # identical to cosine_similarity(X)
+def content_recommend(doc_id, top_n=3):
+    # Different style: uses dictionary then sorts, instead of list of tuples
+    score_dict = {i: similarity_matrix[doc_id][i]
+                  for i in range(len(docs)) if i != doc_id}
+    top_docs   = sorted(score_dict, key=score_dict.get, reverse=True)[:top_n]
+    print(f"\n--- Content-Based Recommendations for Document {doc_id+1} ---")
+    for idx in top_docs:
+        print(f"Doc {idx+1} (Score={score_dict[idx]:.3f}): {df.iloc[idx]['text']}")
+content_recommend(0)
+# ---------------- VISUALIZATION: Similarity Heatmap ----------------
+plt.figure(figsize=(8, 6))
+sns.heatmap(similarity_matrix, annot=True, cmap="Blues")
+plt.title("Document Similarity Matrix (Content-Based)")
+plt.xlabel("Document ID")
+plt.ylabel("Document ID")
+plt.show()
+# ===============================================================
+# 2️⃣ COLLABORATIVE FILTERING (USER–ITEM)
+# ===============================================================
+# Different style: build as list of tuples first, then DataFrame
+user_item_data = [
+    ("u1", 1, 5), ("u1", 2, 4),
+    ("u2", 2, 4), ("u2", 3, 5),
+    ("u3", 3, 3), ("u3", 4, 4),
+    ("u4", 4, 5), ("u4", 5, 4)
+]
+ratings = pd.DataFrame(user_item_data, columns=["user", "item", "rating"])
+# Build user-item matrix — same pivot logic, different variable names
+user_item_matrix = ratings.pivot_table(index="user", columns="item", values="rating").fillna(0)
+# User similarity — same cosine_similarity call, wrapped differently
+ui_array    = user_item_matrix.values.astype(float)
+raw_sim     = cosine_similarity(ui_array)
+user_sim_df = pd.DataFrame(raw_sim,
+                            index=user_item_matrix.index,
+                            columns=user_item_matrix.index)
+def recommend_item(user, top_n=3):
+    # Different style: drops the user itself first, then takes top_n
+    others        = user_sim_df[user].drop(labels=[user])
+    similar_users = others.sort_values(ascending=False).head(top_n).index.tolist()
+    print(f"\n--- Collaborative Filtering Recommendation for {user} ---")
+    print("Similar Users:", similar_users)
+recommend_item("u1")
+# ===============================================================
+# 🔹 PREDICT MISSING RATING (USER-BASED CF)
+# ===============================================================
+def predict_rating(user, item):
+    num = 0.0
+    den = 0.0
+    for other in user_item_matrix.index:
+        r = user_item_matrix.loc[other, item]
+        if r > 0:
+            s    = user_sim_df.loc[user, other]
+            num += s * r
+            den += abs(s)
+    return num / den if den != 0 else 0
+print("\n--- Predicted Rating ---")
+print("u1 rating for item 3:", predict_rating("u1", 3))
+# Redefine with fallback
+def predict_rating(user, item):
+    num = 0.0
+    den = 0.0
+    for other in user_item_matrix.index:
+        r = user_item_matrix.loc[other, item]
+        if r > 0:
+            s    = user_sim_df.loc[user, other]
+            num += s * r
+            den += abs(s)
+    if den == 0:
+        col      = user_item_matrix[item]
+        non_zero = col[col > 0]
+        return non_zero.mean() if len(non_zero) > 0 else 0
+    return num / den
+def fill_missing_ratings():
+    filled = user_item_matrix.copy()
+    for u in user_item_matrix.index:
+        for it in user_item_matrix.columns:
+            if user_item_matrix.loc[u, it] == 0:
+                filled.loc[u, it] = predict_rating(u, it)
+    return filled
+filled_matrix = fill_missing_ratings()
+print("\n--- Filled User-Item Matrix ---")
+print(filled_matrix)
+# ---------------- VISUALIZATION: User Similarity Heatmap ----------------
+plt.figure(figsize=(6, 4))
+sns.heatmap(user_sim_df, annot=True, cmap="Greens")
+plt.title("User Similarity Matrix (Collaborative Filtering)")
+plt.show()
+# ===============================================================
+# 3️⃣ PAGE RANK ALGORITHM
+# ===============================================================
+# Different style: builds edge list first, then adds all at once
+edge_list = [
+    (i, j, similarity_matrix[i][j])
+    for i in range(len(docs))
+    for j in range(i + 1, len(docs))
+    if similarity_matrix[i][j] > 0.2
+]
+G = nx.Graph()
+G.add_weighted_edges_from(edge_list)
+pr = nx.pagerank(G)
+print("\n--- PageRank Scores for Documents ---")
+for node, score in pr.items():
+    print(f"Doc {node+1}: Score = {score:.4f}")
+# ---------------- VISUALIZATION: PageRank Graph ----------------
+plt.figure(figsize=(8, 6))
+pos   = nx.spring_layout(G, seed=7)
+sizes = [5000 * pr[n] for n in G.nodes()]
+nx.draw(G, pos, with_labels=True, node_size=sizes, node_color='skyblue', edge_color='gray')
+plt.title("Document Graph Based on Similarity (PageRank Size = Score)")
+plt.show()
+# ===============================================================
+# 4️⃣ DIMENSION REDUCTION (PCA)
+# ===============================================================
+# Different style: PCA fit and transform on separate lines
+reducer   = PCA(n_components=2)
+reducer.fit(X.toarray())
+X_reduced = reducer.transform(X.toarray())
+print("\n--- PCA Dimensionality Reduction (2D) ---")
+for i, vec in enumerate(X_reduced):
+    print(f"Doc {i+1}: {vec}")
+# ---------------- VISUALIZATION: PCA Scatter Plot ----------------
+plt.figure(figsize=(8, 6))
+plt.scatter(X_reduced[:, 0], X_reduced[:, 1], s=120, color='purple')
+for i in range(len(X_reduced)):
+    plt.text(X_reduced[i, 0] + 0.02, X_reduced[i, 1] + 0.02, f"Doc {i+1}")
+plt.title("PCA - 2D Document Vector Visualization")
+plt.xlabel("PC1")
+plt.ylabel("PC2")
+plt.grid(True)
+plt.show()