itertoolkit 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bm_preprocessing/__init__.py +14 -0
- bm_preprocessing/importer/DM/__init__.py +7 -0
- bm_preprocessing/importer/DM/agg.py +6 -0
- bm_preprocessing/importer/DM/dbscan.py +6 -0
- bm_preprocessing/importer/DM/finals.py +6 -0
- bm_preprocessing/importer/DM/gsp.py +6 -0
- bm_preprocessing/importer/DM/test.py +6 -0
- bm_preprocessing/importer/Finals/__init__.py +7 -0
- bm_preprocessing/importer/Finals/kaadhal.py +6 -0
- bm_preprocessing/importer/Finals/raaka.py +6 -0
- bm_preprocessing/importer/Finals/seedan.py +6 -0
- bm_preprocessing/importer/Finals/vikram.py +6 -0
- bm_preprocessing/importer/IR/__init__.py +6 -0
- bm_preprocessing/importer/IR/finals.py +6 -0
- bm_preprocessing/importer/IR/pagerank.py +6 -0
- bm_preprocessing/importer/IR/recommenders_pca.py +8 -0
- bm_preprocessing/importer/IR/test.py +6 -0
- bm_preprocessing/importer/PY/__init__.py +4 -0
- bm_preprocessing/importer/PY/lib_doc.py +6 -0
- bm_preprocessing/importer/PY/python_doc.py +6 -0
- bm_preprocessing/importer/__init__.py +8 -0
- bm_preprocessing/importer/_module_printer.py +23 -0
- bm_preprocessing/src/DM/__init__.py +1 -0
- bm_preprocessing/src/DM/agg.py +267 -0
- bm_preprocessing/src/DM/dbscan.py +218 -0
- bm_preprocessing/src/DM/finals.py +19 -0
- bm_preprocessing/src/DM/gsp.py +378 -0
- bm_preprocessing/src/DM/test.py +19 -0
- bm_preprocessing/src/Finals/__init__.py +1 -0
- bm_preprocessing/src/Finals/kaadhal.py +1453 -0
- bm_preprocessing/src/Finals/raaka.py +1338 -0
- bm_preprocessing/src/Finals/seedan.py +1173 -0
- bm_preprocessing/src/Finals/vikram.py +520 -0
- bm_preprocessing/src/IR/__init__.py +1 -0
- bm_preprocessing/src/IR/finals.py +14 -0
- bm_preprocessing/src/IR/pagerank.py +109 -0
- bm_preprocessing/src/IR/recommenders_pca.py +487 -0
- bm_preprocessing/src/IR/test.py +14 -0
- bm_preprocessing/src/PY/__init__.py +1 -0
- bm_preprocessing/src/PY/lib_doc.py +295 -0
- bm_preprocessing/src/PY/python_doc.py +177 -0
- bm_preprocessing/src/__init__.py +1 -0
- itertoolkit-1.5.0.dist-info/METADATA +120 -0
- itertoolkit-1.5.0.dist-info/RECORD +45 -0
- itertoolkit-1.5.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
# ===========================================================
|
|
5
|
+
# 📂 FILE READING & INITIAL INSPECTION
|
|
6
|
+
# ===========================================================
|
|
7
|
+
|
|
8
|
+
# Load data (CSV, Excel, JSON)
|
|
9
|
+
df = pd.read_csv("data.csv") # Load CSV
|
|
10
|
+
df = pd.read_excel("data.xlsx") # Load Excel
|
|
11
|
+
df = pd.read_json("data.json") # Load JSON
|
|
12
|
+
|
|
13
|
+
# Quick Inspection
|
|
14
|
+
print(df.head())
|
|
15
|
+
print(df.info())
|
|
16
|
+
print(df.describe()) # Basic stats & info
|
|
17
|
+
print(df.shape) # (rows, columns)
|
|
18
|
+
print(df.columns.tolist()) # List all column names
|
|
19
|
+
print(df.dtypes) # Data types of each column
|
|
20
|
+
print(df.isnull().sum()) # Count NaNs per column
|
|
21
|
+
print(df.nunique()) # Unique values per column
|
|
22
|
+
print(df.value_counts("col")) # Frequency of each value
|
|
23
|
+
|
|
24
|
+
# ===========================================================
|
|
25
|
+
# 🔍 SELECTION & MANIPULATION
|
|
26
|
+
# ===========================================================
|
|
27
|
+
|
|
28
|
+
# Selecting rows and columns
|
|
29
|
+
cols = df[["col1", "col2"]] # Select multiple columns
|
|
30
|
+
rows = df.iloc[0:10] # Select first 10 rows by index
|
|
31
|
+
filtered = df[df["age"] > 25] # Select rows based on condition
|
|
32
|
+
cell = df.loc[0, "col1"] # Select specific cell (label-based)
|
|
33
|
+
cell = df.iloc[0, 0] # Select specific cell (index-based)
|
|
34
|
+
filtered = df.query("age > 25 and city == 'NYC'") # Query-based filtering
|
|
35
|
+
sampled = df.sample(n=5, random_state=42) # Random sample of rows
|
|
36
|
+
|
|
37
|
+
# Sorting
|
|
38
|
+
sorted_df = df.sort_values("col", ascending=False) # Sort by column descending
|
|
39
|
+
sorted_df = df.sort_values(
|
|
40
|
+
["col1", "col2"], ascending=[True, False]
|
|
41
|
+
) # Multi-column sort
|
|
42
|
+
|
|
43
|
+
# Stacking & Merging
|
|
44
|
+
df1 = pd.DataFrame({"id": [1, 2], "val": ["A", "B"]})
|
|
45
|
+
df2 = pd.DataFrame({"id": [2, 3], "val": ["C", "D"]})
|
|
46
|
+
df_stacked = pd.concat([df1, df2], axis=0) # Stack vertically (rows)
|
|
47
|
+
df_wide = pd.concat([df1, df2], axis=1) # Stack horizontally (columns)
|
|
48
|
+
df_merged = pd.merge(
|
|
49
|
+
df1, df2, on="id", how="inner"
|
|
50
|
+
) # SQL-like join (inner/left/right/outer)
|
|
51
|
+
|
|
52
|
+
# ===========================================================
|
|
53
|
+
# 🛠️ DATA CLEANING & PROCESSING
|
|
54
|
+
# ===========================================================
|
|
55
|
+
|
|
56
|
+
# Handling Missing Values
|
|
57
|
+
# Assign the result back instead of using inplace=True: under pandas
# Copy-on-Write (the default in pandas 3.x) df["col"] is a temporary Series,
# so an inplace fill on it never reaches df.
df["col"] = df["col"].fillna(df["col"].mean())  # Impute by mean (one-liner)
|
|
58
|
+
# Assign the result back instead of using inplace=True: under pandas
# Copy-on-Write (the default in pandas 3.x) df["col"] is a temporary Series,
# so an inplace fill on it never reaches df.
df["col"] = df["col"].fillna(df["col"].median())  # Impute by median
|
|
59
|
+
# Assign the result back instead of using inplace=True: under pandas
# Copy-on-Write (the default in pandas 3.x) df["col"] is a temporary Series,
# so an inplace fill on it never reaches df.
df["col"] = df["col"].fillna(df["col"].mode()[0])  # Impute by mode (categorical)
|
|
60
|
+
df.dropna(axis=0, inplace=True) # Drop rows with any NaN values
|
|
61
|
+
df.dropna(
|
|
62
|
+
subset=["col1", "col2"], inplace=True
|
|
63
|
+
) # Drop rows with NaN in specific columns
|
|
64
|
+
df.ffill(inplace=True) # Forward fill NaN values
|
|
65
|
+
df.bfill(inplace=True) # Backward fill NaN values
|
|
66
|
+
|
|
67
|
+
# Transformations
|
|
68
|
+
df["new_col"] = df["col"].apply(lambda x: x**2) # Apply custom function
|
|
69
|
+
grouped = df.groupby("category")["sales"].sum() # Groupby & aggregate
|
|
70
|
+
grouped = df.groupby("cat").agg(
|
|
71
|
+
{"sales": "sum", "qty": "mean"}
|
|
72
|
+
) # Multiple aggregations
|
|
73
|
+
df["cat_code"] = df["category"].astype("category").cat.codes # Quick label encoding
|
|
74
|
+
df["col"] = df["col"].str.lower() # String lowercase
|
|
75
|
+
df["col"] = df["col"].str.replace("old", "new") # String replace
|
|
76
|
+
df["col"] = df["col"].str.strip() # Strip whitespace
|
|
77
|
+
df["binned"] = pd.cut(df["age"], bins=[0, 18, 35, 60, 100]) # Binning/discretization
|
|
78
|
+
df = pd.get_dummies(df, columns=["city"], drop_first=True) # One-hot encode columns
|
|
79
|
+
df.rename(columns={"old_name": "new_name"}, inplace=True) # Rename columns
|
|
80
|
+
df.drop(columns=["col1", "col2"], inplace=True) # Drop columns
|
|
81
|
+
df.drop_duplicates(inplace=True) # Remove duplicate rows
|
|
82
|
+
|
|
83
|
+
# Type Conversions
|
|
84
|
+
df["col"] = df["col"].astype(int) # Convert column type
|
|
85
|
+
df["date"] = pd.to_datetime(df["date_str"]) # Parse dates
|
|
86
|
+
|
|
87
|
+
# ===========================================================
|
|
88
|
+
# 🔢 NUMPY ESSENTIALS
|
|
89
|
+
# ===========================================================
|
|
90
|
+
|
|
91
|
+
arr = np.array([1, 2, 3]) # Create array
|
|
92
|
+
reshaped = arr.reshape(1, -1) # Reshape for sklearn (2D)
|
|
93
|
+
mean_val = np.mean(arr)
|
|
94
|
+
std_val = np.std(arr) # Basic stats
|
|
95
|
+
mask = arr[arr > 2] # Boolean indexing/filtering
|
|
96
|
+
zeros = np.zeros((3, 3))
|
|
97
|
+
ones = np.ones((3, 3)) # Zero/One matrices
|
|
98
|
+
eye = np.eye(3) # Identity matrix
|
|
99
|
+
rand = np.random.rand(3, 3) # Random matrix [0,1)
|
|
100
|
+
dot = np.dot(arr, arr) # Dot product
|
|
101
|
+
norm = np.linalg.norm(arr) # Vector norm (L2)
|
|
102
|
+
log = np.log2(arr) # Log base 2 (entropy)
|
|
103
|
+
unique, counts = np.unique(arr, return_counts=True) # Unique values & counts
|
|
104
|
+
|
|
105
|
+
# ===========================================================
|
|
106
|
+
# 🤖 SCIKIT-LEARN PREPROCESSING
|
|
107
|
+
# ===========================================================
|
|
108
|
+
|
|
109
|
+
from sklearn.impute import SimpleImputer
|
|
110
|
+
from sklearn.model_selection import train_test_split
|
|
111
|
+
from sklearn.preprocessing import (
|
|
112
|
+
LabelEncoder,
|
|
113
|
+
MinMaxScaler,
|
|
114
|
+
OneHotEncoder,
|
|
115
|
+
StandardScaler,
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
# Splitting Data
|
|
119
|
+
X = df.drop("target", axis=1)
|
|
120
|
+
y = df["target"]
|
|
121
|
+
X_train, X_test, y_train, y_test = train_test_split(
|
|
122
|
+
X, y, test_size=0.2, random_state=42
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
# Imputing (Standard approach)
|
|
126
|
+
imputed = SimpleImputer(strategy="mean").fit_transform(
|
|
127
|
+
df[["num_col"]]
|
|
128
|
+
) # Impute missing with mean
|
|
129
|
+
imputed = SimpleImputer(strategy="most_frequent").fit_transform(
|
|
130
|
+
df[["cat"]]
|
|
131
|
+
) # Impute categorical
|
|
132
|
+
|
|
133
|
+
# Scaling & Encoding
|
|
134
|
+
scaled = StandardScaler().fit_transform(
|
|
135
|
+
df[["age", "salary"]]
|
|
136
|
+
) # Standardize (mean=0, std=1)
|
|
137
|
+
scaled = MinMaxScaler().fit_transform(df[["age", "salary"]]) # Normalize to [0, 1]
|
|
138
|
+
encoded = OneHotEncoder().fit_transform(df[["gender"]]).toarray() # One-hot encode
|
|
139
|
+
le = LabelEncoder().fit_transform(df["target"]) # Encode target labels
|
|
140
|
+
|
|
141
|
+
# Pipeline (all-in-one)
|
|
142
|
+
from sklearn.compose import ColumnTransformer
|
|
143
|
+
from sklearn.pipeline import Pipeline
|
|
144
|
+
|
|
145
|
+
processor = ColumnTransformer(
|
|
146
|
+
[("num", StandardScaler(), ["age"]), ("cat", OneHotEncoder(), ["city"])]
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
# ===========================================================
|
|
150
|
+
# ⛏️ DATA MINING (DM) ESSENTIALS
|
|
151
|
+
# ===========================================================
|
|
152
|
+
|
|
153
|
+
from sklearn.cluster import KMeans
|
|
154
|
+
from sklearn.ensemble import (
|
|
155
|
+
AdaBoostClassifier,
|
|
156
|
+
BaggingClassifier,
|
|
157
|
+
RandomForestClassifier,
|
|
158
|
+
)
|
|
159
|
+
from sklearn.metrics import (
|
|
160
|
+
accuracy_score,
|
|
161
|
+
auc,
|
|
162
|
+
classification_report,
|
|
163
|
+
confusion_matrix,
|
|
164
|
+
f1_score,
|
|
165
|
+
precision_recall_curve,
|
|
166
|
+
precision_score,
|
|
167
|
+
recall_score,
|
|
168
|
+
roc_curve,
|
|
169
|
+
)
|
|
170
|
+
from sklearn.preprocessing import label_binarize
|
|
171
|
+
from sklearn.tree import DecisionTreeClassifier
|
|
172
|
+
|
|
173
|
+
# --- Decision Trees ---
|
|
174
|
+
dt = DecisionTreeClassifier(random_state=42).fit(
|
|
175
|
+
X_train, y_train
|
|
176
|
+
) # Train decision tree
|
|
177
|
+
y_pred = dt.predict(X_test) # Predict
|
|
178
|
+
print(accuracy_score(y_test, y_pred)) # Accuracy
|
|
179
|
+
|
|
180
|
+
# --- Ensemble Methods ---
|
|
181
|
+
bag = BaggingClassifier(n_estimators=10, random_state=42).fit(
|
|
182
|
+
X_train, y_train
|
|
183
|
+
) # Bagging
|
|
184
|
+
ada = AdaBoostClassifier(n_estimators=50, random_state=42).fit(
|
|
185
|
+
X_train, y_train
|
|
186
|
+
) # AdaBoost
|
|
187
|
+
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
|
|
188
|
+
X_train, y_train
|
|
189
|
+
) # Random Forest
|
|
190
|
+
|
|
191
|
+
# --- Clustering ---
|
|
192
|
+
kmeans = KMeans(n_clusters=3, random_state=42).fit(X) # K-Means clustering
|
|
193
|
+
labels = kmeans.labels_ # Cluster labels
|
|
194
|
+
centers = kmeans.cluster_centers_ # Cluster centroids
|
|
195
|
+
|
|
196
|
+
# --- Classification Metrics ---
|
|
197
|
+
print(accuracy_score(y_test, y_pred)) # Accuracy = (TP+TN)/(TP+TN+FP+FN)
|
|
198
|
+
print(precision_score(y_test, y_pred, average="weighted")) # Precision = TP/(TP+FP)
|
|
199
|
+
print(recall_score(y_test, y_pred, average="weighted")) # Recall = TP/(TP+FN)
|
|
200
|
+
print(f1_score(y_test, y_pred, average="weighted")) # F1 = 2*(P*R)/(P+R)
|
|
201
|
+
print(confusion_matrix(y_test, y_pred)) # Confusion matrix
|
|
202
|
+
print(classification_report(y_test, y_pred)) # Full report
|
|
203
|
+
|
|
204
|
+
# --- ROC & AUC ---
|
|
205
|
+
y_bin = label_binarize(y_test, classes=[0, 1, 2]) # Binarize for multiclass ROC
|
|
206
|
+
y_proba = rf.predict_proba(X_test)
|
|
207
|
+
fpr, tpr, _ = roc_curve(y_bin[:, 0], y_proba[:, 0]) # ROC curve (per class)
|
|
208
|
+
roc_auc = auc(fpr, tpr) # AUC score
|
|
209
|
+
|
|
210
|
+
# --- Apriori / Association Rules (manual) ---
|
|
211
|
+
from itertools import combinations
|
|
212
|
+
|
|
213
|
+
def support(itemset, txns):
    """Return the fraction of transactions in ``txns`` containing all of ``itemset``.

    ``itemset`` must provide ``issubset`` (e.g. a ``set`` or ``frozenset``);
    each transaction may be any iterable of items.
    """
    matching = [t for t in txns if itemset.issubset(t)]
    return len(matching) / len(txns)
|
|
214
|
+
transactions = [
|
|
215
|
+
["milk", "bread"],
|
|
216
|
+
["milk", "diaper", "beer", "bread"],
|
|
217
|
+
["milk", "bread"],
|
|
218
|
+
]
|
|
219
|
+
items = set(item for t in transactions for item in t)
|
|
220
|
+
freq_items = {frozenset([i]) for t in transactions for i in t} # C1 candidates
|
|
221
|
+
pairs = [
|
|
222
|
+
frozenset(c) for c in combinations(sorted(items), 2)
|
|
223
|
+
] # Generate candidate pairs
|
|
224
|
+
|
|
225
|
+
# --- Entropy & Information Gain (ID3) ---
|
|
226
|
+
def entropy(probs):
    """Return the Shannon entropy (base 2) of the probabilities in ``probs``.

    Non-positive probabilities are skipped, so they contribute nothing.
    """
    weighted_logs = 0
    for p in probs:
        if p > 0:
            weighted_logs += p * np.log2(p)
    return -weighted_logs
|
|
227
|
+
info_gain = lambda parent_ent, children: parent_ent - sum(
|
|
228
|
+
w * entropy(c) for w, c in children
|
|
229
|
+
)
|
|
230
|
+
def gini(probs):
    """Return the Gini impurity: one minus the sum of squared probabilities."""
    squared_total = 0
    for p in probs:
        squared_total += p**2
    return 1 - squared_total
|
|
231
|
+
|
|
232
|
+
# ===========================================================
|
|
233
|
+
# 🔎 INFORMATION RETRIEVAL (IR) ESSENTIALS
|
|
234
|
+
# ===========================================================
|
|
235
|
+
|
|
236
|
+
import math
|
|
237
|
+
from collections import Counter
|
|
238
|
+
|
|
239
|
+
# --- Term Frequency (TF) ---
|
|
240
|
+
# TF(t, d) = count(t in d) / total_terms_in_d
|
|
241
|
+
def tf(term, doc):
    """Return the term frequency of ``term`` in ``doc``.

    Args:
        term: Token to count.
        doc: Tokenized document; must support ``count`` and ``len``.

    Returns:
        ``doc.count(term) / len(doc)``, or ``0.0`` for an empty document
        (the bare division raised ``ZeroDivisionError`` there).
    """
    total_terms = len(doc)
    if total_terms == 0:  # An empty document contains no terms at all.
        return 0.0
    return doc.count(term) / total_terms
|
|
242
|
+
|
|
243
|
+
# --- Inverse Document Frequency (IDF) ---
|
|
244
|
+
# IDF(t) = log(N / df(t)) where N = total docs, df(t) = docs containing t
|
|
245
|
+
def idf(term, docs):
    """Return the inverse document frequency ``log(N / df(term))``.

    Args:
        term: Token to look up.
        docs: Collection of documents, each supporting ``in`` tests.

    Returns:
        Natural log of (number of docs / number of docs containing ``term``),
        or ``0.0`` when no document contains ``term`` (the bare division
        raised ``ZeroDivisionError`` in that case).
    """
    containing = sum(1 for d in docs if term in d)
    if containing == 0:  # Unseen term (or empty corpus): avoid dividing by zero.
        return 0.0
    return math.log(len(docs) / containing)
|
|
246
|
+
|
|
247
|
+
# --- TF-IDF ---
|
|
248
|
+
# TF-IDF(t, d) = TF(t, d) * IDF(t)
|
|
249
|
+
def tfidf(term, doc, docs):
    """Return the TF-IDF score: ``tf(term, doc)`` multiplied by ``idf(term, docs)``."""
    frequency = tf(term, doc)
    rarity = idf(term, docs)
    return frequency * rarity
|
|
250
|
+
|
|
251
|
+
# --- BM25 ---
|
|
252
|
+
# BM25(t, d) = IDF(t) * (TF * (k1 + 1)) / (TF + k1 * (1 - b + b * |d| / avgdl))
|
|
253
|
+
k1 = 1.5
|
|
254
|
+
b = 0.75 # BM25 parameters
|
|
255
|
+
docs = [["term1", "term2"], ["term2", "term3"], ["term1", "term3", "term4"]]
|
|
256
|
+
avgdl = np.mean([len(d) for d in docs]) # Average document length
|
|
257
|
+
|
|
258
|
+
# --- Boolean Retrieval ---
|
|
259
|
+
# AND: set(doc1_terms) & set(doc2_terms)
|
|
260
|
+
# OR: set(doc1_terms) | set(doc2_terms)
|
|
261
|
+
# NOT: set(all_terms) - set(doc_terms)
|
|
262
|
+
def bool_and(q_terms, doc):
    """Boolean AND query: True when every term of ``q_terms`` occurs in ``doc``."""
    for t in q_terms:
        if t not in doc:
            return False
    return True
|
|
263
|
+
def bool_or(q_terms, doc):
    """Boolean OR query: True when at least one term of ``q_terms`` occurs in ``doc``."""
    for t in q_terms:
        if t in doc:
            return True
    return False
|
|
264
|
+
|
|
265
|
+
# --- Cosine Similarity ---
|
|
266
|
+
# cos(A, B) = (A · B) / (||A|| * ||B||)
|
|
267
|
+
def cosine_sim(a, b):
    """Return the cosine similarity ``(a . b) / (||a|| * ||b||)``.

    Args:
        a: First vector (array-like).
        b: Second vector (array-like).

    Returns:
        The cosine of the angle between ``a`` and ``b``, or ``0.0`` when
        either vector has zero norm (the bare division produced NaN and a
        NumPy RuntimeWarning there).
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:  # A zero vector has no direction; treat it as dissimilar.
        return 0.0
    return np.dot(a, b) / denom
|
|
268
|
+
|
|
269
|
+
# --- Jaccard Similarity ---
|
|
270
|
+
# J(A, B) = |A ∩ B| / |A ∪ B|
|
|
271
|
+
def jaccard(a, b):
    """Return the Jaccard similarity |a & b| / |a | b| of two sets."""
    shared = a & b
    combined = a | b
    return len(shared) / len(combined)
|
|
272
|
+
|
|
273
|
+
# --- Tokenization & Text Processing ---
|
|
274
|
+
tokens = "hello world foo bar".lower().split() # Basic tokenization
|
|
275
|
+
vocab = set(tokens) # Vocabulary
|
|
276
|
+
bow = Counter(tokens) # Bag of words
|
|
277
|
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
278
|
+
|
|
279
|
+
tfidf_matrix = TfidfVectorizer().fit_transform(
|
|
280
|
+
["doc1 text", "doc2 text"]
|
|
281
|
+
) # TF-IDF vectorizer
|
|
282
|
+
|
|
283
|
+
# --- Inverted Index ---
|
|
284
|
+
inv_index = {} # Build inverted index
|
|
285
|
+
for doc_id, doc in enumerate(docs):
|
|
286
|
+
for term in doc:
|
|
287
|
+
inv_index.setdefault(term, set()).add(doc_id)
|
|
288
|
+
|
|
289
|
+
# --- Precision & Recall (IR) ---
|
|
290
|
+
# Precision@k = relevant_in_top_k / k
|
|
291
|
+
# Recall@k = relevant_in_top_k / total_relevant
|
|
292
|
+
def precision_at_k(retrieved, relevant, k):
    """Return Precision@k: relevant items among the first ``k`` retrieved, over ``k``.

    ``retrieved`` is a ranked, sliceable sequence; ``relevant`` is a set.
    """
    top_k = set(retrieved[:k])
    hits = top_k & relevant
    return len(hits) / k
|
|
293
|
+
recall_at_k = lambda retrieved, relevant, k: len(set(retrieved[:k]) & relevant) / len(
|
|
294
|
+
relevant
|
|
295
|
+
)
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""
|
|
2
|
+
=============================================================================
|
|
3
|
+
PYTHON BASICS CHEAT SHEET
|
|
4
|
+
=============================================================================
|
|
5
|
+
A quick reference guide for core Python concepts, data structures, and features.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
# =============================================================================
|
|
9
|
+
# 1. LISTS (Mutable, Ordered)
|
|
10
|
+
# =============================================================================
|
|
11
|
+
my_list = [1, 2, 3, "a", "b"]
|
|
12
|
+
|
|
13
|
+
# Operations
|
|
14
|
+
my_list.append(4) # Add to end: [1, 2, 3, 'a', 'b', 4]
|
|
15
|
+
my_list.insert(0, 0) # Insert at index: [0, 1, 2, 3, 'a', 'b', 4]
|
|
16
|
+
my_list.extend([5, 6]) # Append multiple: [0, 1, 2, 3, 'a', 'b', 4, 5, 6]
|
|
17
|
+
my_list.pop() # Remove & return last item (6)
|
|
18
|
+
my_list.pop(1) # Remove & return item at index 1 (1)
|
|
19
|
+
my_list.remove("a") # Remove first occurrence of 'a'
|
|
20
|
+
my_list.reverse() # Reverse in place
|
|
21
|
+
# my_list.sort() # Sort in place (requires same types)
|
|
22
|
+
# sorted(my_list) # Return new sorted list
|
|
23
|
+
my_list.clear() # Empty the list
|
|
24
|
+
count = my_list.count(2) # Count occurrences
|
|
25
|
+
# list.index() raises ValueError when the value is absent, and my_list was
# emptied by clear() just above; guard the lookup so the script keeps running.
idx = my_list.index(3) if 3 in my_list else None  # Find index of first occurrence
|
|
26
|
+
|
|
27
|
+
# Slicing: list[start:stop:step]
|
|
28
|
+
# my_list[1:4] (index 1 to 3), my_list[::-1] (reverse)
|
|
29
|
+
|
|
30
|
+
# =============================================================================
|
|
31
|
+
# 2. SETS (Mutable, Unordered, Unique Elements)
|
|
32
|
+
# =============================================================================
|
|
33
|
+
my_set = {1, 2, 3}
|
|
34
|
+
empty_set = set() # Note: {} creates an empty dict, not a set
|
|
35
|
+
|
|
36
|
+
# Operations
|
|
37
|
+
my_set.add(4) # Add element
|
|
38
|
+
my_set.update([5, 6]) # Add multiple elements
|
|
39
|
+
my_set.remove(6) # Remove element (raises KeyError if not found)
|
|
40
|
+
my_set.discard(10) # Remove element (safe, no error if not found)
|
|
41
|
+
my_set.pop() # Remove & return arbitrary element
|
|
42
|
+
my_set.clear() # Empty the set
|
|
43
|
+
|
|
44
|
+
set_a, set_b = {1, 2}, {2, 3}
|
|
45
|
+
union = set_a | set_b # {1, 2, 3} (or set_a.union(set_b))
|
|
46
|
+
intersection = set_a & set_b # {2} (or set_a.intersection(set_b))
|
|
47
|
+
diff = set_a - set_b # {1} (or set_a.difference(set_b))
|
|
48
|
+
sym_diff = set_a ^ set_b # {1, 3} (or set_a.symmetric_difference(set_b))
|
|
49
|
+
|
|
50
|
+
# =============================================================================
|
|
51
|
+
# 3. TUPLES (Immutable, Ordered)
|
|
52
|
+
# =============================================================================
|
|
53
|
+
my_tuple = (1, 2, 3, 2)
|
|
54
|
+
single_tuple = (1,) # Comma needed for single-element tuple
|
|
55
|
+
|
|
56
|
+
# Operations (Very limited since immutable)
|
|
57
|
+
count = my_tuple.count(2) # Count occurrences (2)
|
|
58
|
+
idx = my_tuple.index(3) # Find index of first occurrence (2)
|
|
59
|
+
# Tuples support unpacking: a, b, c, d = my_tuple
|
|
60
|
+
|
|
61
|
+
# =============================================================================
|
|
62
|
+
# 4. DICTIONARIES (Mutable, Key-Value Pairs, Unordered before Python 3.7)
|
|
63
|
+
# =============================================================================
|
|
64
|
+
my_dict = {"name": "Alice", "age": 25}
|
|
65
|
+
|
|
66
|
+
# Operations
|
|
67
|
+
my_dict["city"] = "NYC" # Add or update key
|
|
68
|
+
val = my_dict.get("age") # Safe get (returns None if not found, instead of KeyError)
|
|
69
|
+
val = my_dict.get("x", 0) # Safe get with default value
|
|
70
|
+
keys = my_dict.keys() # dict_keys(['name', 'age', 'city'])
|
|
71
|
+
values = my_dict.values() # dict_values(['Alice', 25, 'NYC'])
|
|
72
|
+
items = my_dict.items() # dict_items([('name', 'Alice'), ...])
|
|
73
|
+
|
|
74
|
+
# Removal
|
|
75
|
+
popped_val = my_dict.pop("age") # Remove key 'age' and return value
|
|
76
|
+
popped_item = my_dict.popitem() # Remove & return last key-value pair as tuple
|
|
77
|
+
# del my_dict['name'] # Delete key
|
|
78
|
+
my_dict.clear() # Empty dict
|
|
79
|
+
my_dict.update({"a": 1, "b": 2}) # Merge / Update with another dict
|
|
80
|
+
|
|
81
|
+
# =============================================================================
|
|
82
|
+
# 5. LIST / DICT / SET COMPREHENSIONS
|
|
83
|
+
# =============================================================================
|
|
84
|
+
# List Comprehension: [expression for item in iterable if condition]
|
|
85
|
+
squares = [x**2 for x in range(10) if x % 2 == 0] # [0, 4, 16, 36, 64]
|
|
86
|
+
|
|
87
|
+
# Dict Comprehension: {key_expr: val_expr for item in iterable if condition}
|
|
88
|
+
sq_dict = {x: x**2 for x in range(5)} # {0: 0, 1: 1, 2: 4, 3: 9, 4: 16}
|
|
89
|
+
|
|
90
|
+
# Set Comprehension: {expression for item in iterable if condition}
|
|
91
|
+
sq_set = {x**2 for x in [-1, 1, 2]} # {1, 4}
|
|
92
|
+
|
|
93
|
+
# Generator Expression: (expression for item in iterable if condition)
|
|
94
|
+
gen = (x**2 for x in range(10)) # Lazy evaluation
|
|
95
|
+
|
|
96
|
+
# =============================================================================
|
|
97
|
+
# 6. LAMBDA FUNCTIONS, MAP, FILTER, REDUCE
|
|
98
|
+
# =============================================================================
|
|
99
|
+
# lambda arguments: expression
|
|
100
|
+
def add(x, y):
    """Return ``x + y``."""
    return x + y
|
|
101
|
+
print(add(2, 3)) # 5
|
|
102
|
+
|
|
103
|
+
nums = [1, 2, 3, 4]
|
|
104
|
+
# map: apply function to all items
|
|
105
|
+
mapped = list(map(lambda x: x * 2, nums)) # [2, 4, 6, 8]
|
|
106
|
+
|
|
107
|
+
# filter: keep items where function returns True
|
|
108
|
+
filtered = list(filter(lambda x: x % 2 == 0, nums)) # [2, 4]
|
|
109
|
+
|
|
110
|
+
# reduce (requires functools): cumulative application
|
|
111
|
+
from functools import reduce
|
|
112
|
+
|
|
113
|
+
product = reduce(lambda x, y: x * y, nums) # 24
|
|
114
|
+
|
|
115
|
+
# Sort with lambda key
|
|
116
|
+
words = ["apple", "banana", "cherry"]
|
|
117
|
+
words.sort(key=lambda w: len(w)) # Sort by length
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
# =============================================================================
|
|
121
|
+
# 7. CLASSES AND OBJECTS (OOP)
|
|
122
|
+
# =============================================================================
|
|
123
|
+
class Animal:
|
|
124
|
+
"""Base class for animals."""
|
|
125
|
+
|
|
126
|
+
species_count = 0 # Class attribute
|
|
127
|
+
|
|
128
|
+
def __init__(self, name):
|
|
129
|
+
self.name = name # Instance attribute
|
|
130
|
+
Animal.species_count += 1
|
|
131
|
+
|
|
132
|
+
def speak(self):
|
|
133
|
+
"""Instance method"""
|
|
134
|
+
return "Some sound"
|
|
135
|
+
|
|
136
|
+
@classmethod
|
|
137
|
+
def get_count(cls):
|
|
138
|
+
"""Class method: takes class as first arg"""
|
|
139
|
+
return cls.species_count
|
|
140
|
+
|
|
141
|
+
@staticmethod
|
|
142
|
+
def is_alive():
|
|
143
|
+
"""Static method: no implicit self or cls args"""
|
|
144
|
+
return True
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# Inheritance
|
|
148
|
+
class Dog(Animal):
|
|
149
|
+
def __init__(self, name, breed):
|
|
150
|
+
super().__init__(name) # Call parent constructor
|
|
151
|
+
self.breed = breed
|
|
152
|
+
|
|
153
|
+
def speak(self): # Method Overriding
|
|
154
|
+
return "Woof!"
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
dog = Dog("Buddy", "Golden Retriever")
|
|
158
|
+
print(dog.speak()) # "Woof!"
|
|
159
|
+
print(Animal.get_count()) # 1
|
|
160
|
+
|
|
161
|
+
# =============================================================================
|
|
162
|
+
# 8. FILE HANDLING
|
|
163
|
+
# =============================================================================
|
|
164
|
+
# Using 'with' is a best practice, as it automatically closes the file
|
|
165
|
+
# Modes: 'r' (read), 'w' (write, truncates), 'a' (append), 'r+' (read & write), 'b' (binary)
|
|
166
|
+
|
|
167
|
+
# Write to file
|
|
168
|
+
with open("example.txt", "w", encoding="utf-8") as file:
|
|
169
|
+
file.write("Hello World\nLine 2")
|
|
170
|
+
|
|
171
|
+
# Read from file
|
|
172
|
+
with open("example.txt", "r", encoding="utf-8") as file:
|
|
173
|
+
content = file.read() # Read entire file
|
|
174
|
+
# file.seek(0) # Reset cursor to start
|
|
175
|
+
# lines = file.readlines() # Read lines into a list
|
|
176
|
+
# for line in file: # Iterate line by line (memory efficient)
|
|
177
|
+
# print(line.strip())
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Raw source snippets shipped with the package."""
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: itertoolkit
|
|
3
|
+
Version: 1.5.0
|
|
4
|
+
Summary: An itertools-inspired toolkit for cached iterator and data-structure processing
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: gsppy>=5.3.0
|
|
7
|
+
Requires-Dist: matplotlib>=3.10.8
|
|
8
|
+
Requires-Dist: networkx>=3.6.1
|
|
9
|
+
Requires-Dist: numpy>=2.4.4
|
|
10
|
+
Requires-Dist: pandas>=3.0.2
|
|
11
|
+
Requires-Dist: plotly>=6.6.0
|
|
12
|
+
Requires-Dist: scikit-learn>=1.8.0
|
|
13
|
+
Requires-Dist: scipy>=1.17.1
|
|
14
|
+
Requires-Dist: seaborn>=0.13.2
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
17
|
+
# itertoolkit
|
|
18
|
+
|
|
19
|
+
Functions creating iterators and cached data pipelines for efficient looping.
|
|
20
|
+
|
|
21
|
+
`itertoolkit` is an `itertools`-inspired wrapper focused on practical data processing. It keeps the lazy, composable style of iterator algebra, then adds cache-aware helpers so repeated list and data-structure transformations run faster.
|
|
22
|
+
|
|
23
|
+
The goal is simple:
|
|
24
|
+
|
|
25
|
+
- Keep memory usage low with lazy iterators.
|
|
26
|
+
- Speed up repeated workloads with caching.
|
|
27
|
+
- Make iterator pipelines readable and reusable.
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install itertoolkit
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Quick Start
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from itertools import count, islice
|
|
39
|
+
|
|
40
|
+
# Install name: itertoolkit
|
|
41
|
+
# Current import path in this repo remains bm_preprocessing
|
|
42
|
+
from bm_preprocessing import IR, DM
|
|
43
|
+
|
|
44
|
+
# Example: base itertools stream
|
|
45
|
+
stream = (x * x for x in count(1))
|
|
46
|
+
print(list(islice(stream, 5))) # [1, 4, 9, 16, 25]
|
|
47
|
+
|
|
48
|
+
# Example: cached computation workflow (concept)
|
|
49
|
+
# result = itertoolkit.cached_map(expensive_fn, dataset, cache_key="v1")
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Why It Is Faster
|
|
53
|
+
|
|
54
|
+
`itertoolkit` performance comes from combining:
|
|
55
|
+
|
|
56
|
+
- Lazy iteration, so intermediate materialization is avoided.
|
|
57
|
+
- Cache-first wrappers, so repeated transformations are reused.
|
|
58
|
+
- Composable pipelines, so complex loops stay compact and optimized.
|
|
59
|
+
|
|
60
|
+
In repeated analytics or feature-building jobs, the first pass computes and stores results, and later passes can fetch from cache instead of recomputing every step.
|
|
61
|
+
|
|
62
|
+
## Core Iterator Families
|
|
63
|
+
|
|
64
|
+
### General iterators
|
|
65
|
+
|
|
66
|
+
| Iterator concept | Input | Output shape | Typical use |
|
|
67
|
+
| --- | --- | --- | --- |
|
|
68
|
+
| Running reduction | iterable, func | incremental totals | rolling stats |
|
|
69
|
+
| Batching | iterable, n | tuples of size n | chunk processing |
|
|
70
|
+
| Chaining | multiple iterables | one continuous stream | merging sources |
|
|
71
|
+
| Selection | data + selectors | filtered stream | mask-based filtering |
|
|
72
|
+
| Windowing | iterable | adjacent pairs/windows | transition analysis |
|
|
73
|
+
| Truncation | predicate/slice | bounded output | safe handling of infinite streams |
|
|
74
|
+
|
|
75
|
+
### Combinatoric iterators
|
|
76
|
+
|
|
77
|
+
| Iterator concept | Output |
|
|
78
|
+
| --- | --- |
|
|
79
|
+
| Cartesian products | all pairings across inputs |
|
|
80
|
+
| Permutations | order-sensitive tuples |
|
|
81
|
+
| Combinations | order-insensitive unique tuples |
|
|
82
|
+
| Combinations with replacement | tuples allowing repeated values |
|
|
83
|
+
|
|
84
|
+
## Pipeline Pattern
|
|
85
|
+
|
|
86
|
+
Use this pattern when processing large lists, tables, graphs, or text records:
|
|
87
|
+
|
|
88
|
+
1. Start from one or more iterables.
|
|
89
|
+
2. Chain filtering, mapping, grouping, and batching.
|
|
90
|
+
3. Add cache boundaries around expensive stages.
|
|
91
|
+
4. Materialize only where needed (`list`, `tuple`, `DataFrame`, model input).
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from itertools import chain
|
|
95
|
+
|
|
96
|
+
sources = [[1, 2, 3], [4, 5], [6]]
|
|
97
|
+
pipeline = (x * 10 for x in chain.from_iterable(sources) if x % 2 == 0)
|
|
98
|
+
print(list(pipeline)) # [20, 40, 60]
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Caching Strategy
|
|
102
|
+
|
|
103
|
+
Recommended caching behavior for data-heavy workloads:
|
|
104
|
+
|
|
105
|
+
- Key by transformation signature and input fingerprint.
|
|
106
|
+
- Keep deterministic steps cacheable.
|
|
107
|
+
- Invalidate cache on function/version changes.
|
|
108
|
+
- Persist long-running results between sessions.
|
|
109
|
+
|
|
110
|
+
This makes repeated preprocessing and feature extraction significantly cheaper.
|
|
111
|
+
|
|
112
|
+
## Compatibility Note
|
|
113
|
+
|
|
114
|
+
Package distribution name is `itertoolkit`.
|
|
115
|
+
|
|
116
|
+
Current code in this repository still exposes the import path `bm_preprocessing` for compatibility with existing users. If needed, a follow-up release can add a top-level `itertoolkit` import alias as well.
|
|
117
|
+
|
|
118
|
+
## License
|
|
119
|
+
|
|
120
|
+
MIT
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
bm_preprocessing/__init__.py,sha256=780DeJ5tt1xkg572jEcYXd1z8Jj7QteRv9tfRcwqamo,353
|
|
2
|
+
bm_preprocessing/importer/__init__.py,sha256=3144LTa5i9lr6vJZRdvDyqNmRPRM7FWMOj0aOS5NHM8,186
|
|
3
|
+
bm_preprocessing/importer/_module_printer.py,sha256=rWR9jyfr7ACt5qVeZ4uRHfaZLOb4oKzSIP319rzp4Po,649
|
|
4
|
+
bm_preprocessing/importer/DM/__init__.py,sha256=NytK_IN5e7Lxdfh_T1tjLQ0JFrcLoupjdGOJpU5FwWw,180
|
|
5
|
+
bm_preprocessing/importer/DM/agg.py,sha256=QE1GMhYzSDQ9fs1Ymjy80Bwfsfv7mOlm9tWcgCDljx4,211
|
|
6
|
+
bm_preprocessing/importer/DM/dbscan.py,sha256=P-pnTG7PX2E4Z6yCBVgYhodVSTrrYvWNbJEFHW8744M,220
|
|
7
|
+
bm_preprocessing/importer/DM/finals.py,sha256=ldHMMjmRaG8ToFoc0ALzpLQ2nqaqIe-gjIg7_Odrw7g,220
|
|
8
|
+
bm_preprocessing/importer/DM/gsp.py,sha256=tSb0uOc_eistbdpgeykdjLQoOCZrhaT9rVHrFrpsoWs,211
|
|
9
|
+
bm_preprocessing/importer/DM/test.py,sha256=tS-NUS0fIxmEyc7B_YhLh6duT8k87x0wCfq0O1oG1VM,214
|
|
10
|
+
bm_preprocessing/importer/Finals/__init__.py,sha256=chN1hYPGdRM1GX4q54APyDgOSBWNKcSXaexnxnVYdUA,195
|
|
11
|
+
bm_preprocessing/importer/Finals/kaadhal.py,sha256=7gaY85G_sRr3XlbLmbPMUHUjz2S0vkBHfTkF-xjruiE,231
|
|
12
|
+
bm_preprocessing/importer/Finals/raaka.py,sha256=OdmLlgq4u-OpvdZe8ruRlD0g4jxpg8USqIdN2YJ_6lo,225
|
|
13
|
+
bm_preprocessing/importer/Finals/seedan.py,sha256=fEXKRMA_rnDyM3iN5Ka76pv9SaDfeVht_tHK-oHOS2U,228
|
|
14
|
+
bm_preprocessing/importer/Finals/vikram.py,sha256=cijp6K4snmdD_o4vbUceWvN-w7Z1qDmQkOWa1PzHBlM,228
|
|
15
|
+
bm_preprocessing/importer/IR/__init__.py,sha256=ibuzqcqquie8vom3Yj_J4pnjlosUvEwVW2tyFz5d8wo,196
|
|
16
|
+
bm_preprocessing/importer/IR/finals.py,sha256=aTI2-mVhDBInWYjNMdpPbwId_bK9jrXMTML22F2RzNk,220
|
|
17
|
+
bm_preprocessing/importer/IR/pagerank.py,sha256=XH8NxjKm1lczJ0PrQV1QZujogNn5gMJN6pUlth9itxw,226
|
|
18
|
+
bm_preprocessing/importer/IR/recommenders_pca.py,sha256=5Ui3ZK6iVXEAVDlC72LgPKjB0uyW1OSl7F0kz4QYfek,258
|
|
19
|
+
bm_preprocessing/importer/IR/test.py,sha256=RlRBb6usbeKuWUtrs6QKiahK5FZ4p4A7Y-Qoy4eQjMU,214
|
|
20
|
+
bm_preprocessing/importer/PY/__init__.py,sha256=0Dsh94cSFgb1N4We7zwYzQDR3YHC-Y7pcDvHR-9FKAs,105
|
|
21
|
+
bm_preprocessing/importer/PY/lib_doc.py,sha256=z3H3JO5GSVrDJZy8PEzSGlM_tq_dH5E-4sCjzbPHvzg,223
|
|
22
|
+
bm_preprocessing/importer/PY/python_doc.py,sha256=5HCNqn4Y3jEYTt97koY34LAKODQX7HcB93aApbQL54w,232
|
|
23
|
+
bm_preprocessing/src/__init__.py,sha256=Y4g952oNndkD4fX7aRnbrOVTObZOyhfM7yYewiZaR-U,53
|
|
24
|
+
bm_preprocessing/src/DM/__init__.py,sha256=tGoE9Z5I3cdOxyosBmA5fp6Keh2sO2o0hbhf3SGeBEM,36
|
|
25
|
+
bm_preprocessing/src/DM/agg.py,sha256=IptAnnInn37qkADRGLWL41XqMz99kksIwBPAPVCLeaA,6816
|
|
26
|
+
bm_preprocessing/src/DM/dbscan.py,sha256=j06WbIN-nws9D5BKnEQW_oUl5QZPqPpZdOzKWqhXz5A,6337
|
|
27
|
+
bm_preprocessing/src/DM/finals.py,sha256=gIj5e37rvfvsotomjWSTBFd8gX6ax9XA7loadMuyu0g,337
|
|
28
|
+
bm_preprocessing/src/DM/gsp.py,sha256=EYjEe245MMExdyQQxPrsVnkr_g9-sTOGyAoBZVAyXVs,10944
|
|
29
|
+
bm_preprocessing/src/DM/test.py,sha256=sgNQvC72u767xjqeBHsJxHvAaULv0s7d8frLr7vacvg,342
|
|
30
|
+
bm_preprocessing/src/Finals/__init__.py,sha256=SQpE3o4m6Dkcrm8bv-Qaf29wo7a-sDIOwsB-nN-sPHo,31
|
|
31
|
+
bm_preprocessing/src/Finals/kaadhal.py,sha256=XsnzDLcnK6zrGFwDoFYhIsVb2jfLlfYuX-c7Dh_sJX0,38558
|
|
32
|
+
bm_preprocessing/src/Finals/raaka.py,sha256=PzgfzO_FigPZ9J-PTgWO2KcReOKQFjQtJxIGsQC5bZk,40431
|
|
33
|
+
bm_preprocessing/src/Finals/seedan.py,sha256=6g7wDwpsBv0BavLsR-U4AmjlDRrcelTZaWGlAP7_Yo8,31711
|
|
34
|
+
bm_preprocessing/src/Finals/vikram.py,sha256=GbqoPpa4dg4Q9gGDZF4leUfGzrVwMbId1f73-ooU7cI,12841
|
|
35
|
+
bm_preprocessing/src/IR/__init__.py,sha256=ngrkbmeRSLuVoNAztpNYt6WxYhftH0PSNP72sb7TbaE,27
|
|
36
|
+
bm_preprocessing/src/IR/finals.py,sha256=XvPbcxUo5ib-KkhClTKHgAcNYX3w9KSgMrgdCXStVCQ,217
|
|
37
|
+
bm_preprocessing/src/IR/pagerank.py,sha256=UdRvgkezmYrgG_RK_h3kC9-qnZDcoE7tAw_VYqkuRvo,2989
|
|
38
|
+
bm_preprocessing/src/IR/recommenders_pca.py,sha256=pBz5HDwtyGavITDkvBVJtGgHcyl1sGgmcvkCOt9VNuw,14181
|
|
39
|
+
bm_preprocessing/src/IR/test.py,sha256=tdS9WKDBl7-zMu-ETnVSYgAUkqR-nSnB7Sb7AlnJNO4,222
|
|
40
|
+
bm_preprocessing/src/PY/__init__.py,sha256=v-U6e8H9-BNFcb24QEDYMPKiTaFT_q_NB7WGBFpA_jc,39
|
|
41
|
+
bm_preprocessing/src/PY/lib_doc.py,sha256=7zPuq5Ez_NPmtaeBRkKq2vA9mU6Tgf4bFuDqgZ9TnjI,11302
|
|
42
|
+
bm_preprocessing/src/PY/python_doc.py,sha256=iPKngoIgDRtrXxao5WVlx1ZaLmQIs5cf3x39aHLG34o,7088
|
|
43
|
+
itertoolkit-1.5.0.dist-info/METADATA,sha256=gJLZSTVSGPE8mwJ_2ayQM2_VMJe4zArD_go8KbbUALA,3918
|
|
44
|
+
itertoolkit-1.5.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
45
|
+
itertoolkit-1.5.0.dist-info/RECORD,,
|