bm-preprocessing 1.3.5.tar.gz → 1.3.8.tar.gz
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- bm_preprocessing-1.3.8/PKG-INFO +10 -0
- bm_preprocessing-1.3.8/PY/__init__.py +0 -0
- bm_preprocessing-1.3.8/PY/_module_printer.py +11 -0
- bm_preprocessing-1.3.8/PY/lib_doc.py +297 -0
- {bm_preprocessing-1.3.5/src/bm_preprocessing/DM/sources → bm_preprocessing-1.3.8/PY}/python_doc.py +54 -49
- bm_preprocessing-1.3.8/README.md +0 -0
- bm_preprocessing-1.3.8/bm_preprocessing.egg-info/PKG-INFO +10 -0
- bm_preprocessing-1.3.8/bm_preprocessing.egg-info/SOURCES.txt +11 -0
- bm_preprocessing-1.3.8/bm_preprocessing.egg-info/dependency_links.txt +1 -0
- bm_preprocessing-1.3.8/bm_preprocessing.egg-info/requires.txt +4 -0
- bm_preprocessing-1.3.8/bm_preprocessing.egg-info/top_level.txt +1 -0
- bm_preprocessing-1.3.8/pyproject.toml +12 -0
- bm_preprocessing-1.3.8/setup.cfg +4 -0
- bm_preprocessing-1.3.5/.gitignore +0 -221
- bm_preprocessing-1.3.5/INSTALLATION.md +0 -35
- bm_preprocessing-1.3.5/PKG-INFO +0 -257
- bm_preprocessing-1.3.5/README.md +0 -243
- bm_preprocessing-1.3.5/USAGE.md +0 -127
- bm_preprocessing-1.3.5/pyproject.toml +0 -22
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/__init__.py +0 -21
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/adaboost.py +0 -30
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/all.py +0 -30
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/all_vis.py +0 -30
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/apriori.py +0 -30
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/bagging.py +0 -30
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/hash.py +0 -30
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/hunts.py +0 -30
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/hunts_test.py +0 -30
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/id3.py +0 -30
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/id3_test.py +0 -30
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/lib_doc.py +0 -30
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/metrics.py +0 -30
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/preprocessing.py +0 -30
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/python_doc.py +0 -30
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/sources/adaboost.py +0 -69
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/sources/all.py +0 -308
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/sources/all_vis.py +0 -368
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/sources/apriori.py +0 -113
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/sources/bagging.py +0 -173
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/sources/data.csv +0 -11
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/sources/hash.py +0 -161
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/sources/heart.csv +0 -304
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/sources/hunts.py +0 -96
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/sources/hunts_test.py +0 -101
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/sources/id3.py +0 -134
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/sources/id3_test.py +0 -148
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/sources/lib_doc.py +0 -261
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/sources/metrics.py +0 -240
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/sources/preprocessing.py +0 -42
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/sources/tennis.csv +0 -15
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/sources/test_all.py +0 -400
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/sources/worksheet.py +0 -305
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/test_all.py +0 -30
- bm_preprocessing-1.3.5/src/bm_preprocessing/DM/worksheet.py +0 -30
- bm_preprocessing-1.3.5/src/bm_preprocessing/IR/__init__.py +0 -9
- bm_preprocessing-1.3.5/src/bm_preprocessing/IR/all.py +0 -30
- bm_preprocessing-1.3.5/src/bm_preprocessing/IR/all_vis.py +0 -30
- bm_preprocessing-1.3.5/src/bm_preprocessing/IR/eval_metrics.py +0 -26
- bm_preprocessing-1.3.5/src/bm_preprocessing/IR/ndd.py +0 -26
- bm_preprocessing-1.3.5/src/bm_preprocessing/IR/rel.py +0 -26
- bm_preprocessing-1.3.5/src/bm_preprocessing/IR/sources/all.py +0 -255
- bm_preprocessing-1.3.5/src/bm_preprocessing/IR/sources/all_vis.py +0 -294
- bm_preprocessing-1.3.5/src/bm_preprocessing/IR/sources/eval_metrics.py +0 -224
- bm_preprocessing-1.3.5/src/bm_preprocessing/IR/sources/ndd.py +0 -105
- bm_preprocessing-1.3.5/src/bm_preprocessing/IR/sources/rel.py +0 -116
- bm_preprocessing-1.3.5/src/bm_preprocessing/__init__.py +0 -5
bm_preprocessing-1.3.8/PKG-INFO
ADDED
@@ -0,0 +1,10 @@
+Metadata-Version: 2.4
+Name: bm-preprocessing
+Version: 1.3.8
+Summary: Add your description here
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+Requires-Dist: matplotlib>=3.10.8
+Requires-Dist: numpy>=2.4.4
+Requires-Dist: pandas>=3.0.2
+Requires-Dist: seaborn>=0.13.2
bm_preprocessing-1.3.8/PY/__init__.py
File without changes
bm_preprocessing-1.3.8/PY/lib_doc.py
ADDED
@@ -0,0 +1,297 @@
+import sys
+
+import numpy as np
+import pandas as pd
+
+from bm_preprocessing.PY._module_printer import PrintableModule
+
+# ===========================================================
+# 📂 FILE READING & INITIAL INSPECTION
+# ===========================================================
+
+# Load data (CSV, Excel, JSON)
+df = pd.read_csv("data.csv")  # Load CSV
+df = pd.read_excel("data.xlsx")  # Load Excel
+df = pd.read_json("data.json")  # Load JSON
+
+# Quick Inspection
+print(df.head())
+print(df.info())
+print(df.describe())  # Basic stats & info
+print(df.shape)  # (rows, columns)
+print(df.columns.tolist())  # List all column names
+print(df.dtypes)  # Data types of each column
+print(df.isnull().sum())  # Count NaNs per column
+print(df.nunique())  # Unique values per column
+print(df.value_counts("col"))  # Frequency of each value
+
+# ===========================================================
+# 🔍 SELECTION & MANIPULATION
+# ===========================================================
+
+# Selecting rows and columns
+cols = df[["col1", "col2"]]  # Select multiple columns
+rows = df.iloc[0:10]  # Select first 10 rows by index
+filtered = df[df["age"] > 25]  # Select rows based on condition
+cell = df.loc[0, "col1"]  # Select specific cell (label-based)
+cell = df.iloc[0, 0]  # Select specific cell (index-based)
+filtered = df.query("age > 25 and city == 'NYC'")  # Query-based filtering
+sampled = df.sample(n=5, random_state=42)  # Random sample of rows
+
+# Sorting
+sorted_df = df.sort_values("col", ascending=False)  # Sort by column descending
+sorted_df = df.sort_values(
+    ["col1", "col2"], ascending=[True, False]
+)  # Multi-column sort
+
+# Stacking & Merging
+df1 = pd.DataFrame({"id": [1, 2], "val1": ["A", "B"]})
+df2 = pd.DataFrame({"id": [1, 2], "val2": ["C", "D"]})
+df_stacked = pd.concat([df1, df2], axis=0)  # Stack vertically (rows)
+df_wide = pd.concat([df1, df2], axis=1)  # Stack horizontally (columns)
+df_merged = pd.merge(
+    df1, df2, on="id", how="inner"
+)  # SQL-like join (inner/left/right/outer)
+
+# ===========================================================
+# 🛠️ DATA CLEANING & PROCESSING
+# ===========================================================
+
+# Handling Missing Values
+df["col"].fillna(df["col"].mean(), inplace=True)  # Impute by mean (one-liner)
+df["col"].fillna(df["col"].median(), inplace=True)  # Impute by median
+df["col"].fillna(df["col"].mode()[0], inplace=True)  # Impute by mode (categorical)
+df.dropna(axis=0, inplace=True)  # Drop rows with any NaN values
+df.dropna(
+    subset=["col1", "col2"], inplace=True
+)  # Drop rows with NaN in specific columns
+df.ffill(inplace=True)  # Forward fill NaN values
+df.bfill(inplace=True)  # Backward fill NaN values
+
+# Transformations
+df["new_col"] = df["col"].apply(lambda x: x**2)  # Apply custom function
+grouped = df.groupby("category")["sales"].sum()  # Groupby & aggregate
+grouped = df.groupby("cat").agg(
+    {"sales": "sum", "qty": "mean"}
+)  # Multiple aggregations
+df["cat_code"] = df["category"].astype("category").cat.codes  # Quick label encoding
+df["col"] = df["col"].str.lower()  # String lowercase
+df["col"] = df["col"].str.replace("old", "new")  # String replace
+df["col"] = df["col"].str.strip()  # Strip whitespace
+df["binned"] = pd.cut(df["age"], bins=[0, 18, 35, 60, 100])  # Binning/discretization
+df = pd.get_dummies(df, columns=["city"], drop_first=True)  # One-hot encode columns
+df.rename(columns={"old_name": "new_name"}, inplace=True)  # Rename columns
+df.drop(columns=["col1", "col2"], inplace=True)  # Drop columns
+df.drop_duplicates(inplace=True)  # Remove duplicate rows
+
+# Type Conversions
+df["col"] = df["col"].astype(int)  # Convert column type
+df["date"] = pd.to_datetime(df["date_str"])  # Parse dates
+
+# ===========================================================
+# 🔢 NUMPY ESSENTIALS
+# ===========================================================
+
+arr = np.array([1, 2, 3])  # Create array
+reshaped = arr.reshape(1, -1)  # Reshape for sklearn (2D)
+mean_val = np.mean(arr)
+std_val = np.std(arr)  # Basic stats
+mask = arr[arr > 2]  # Boolean indexing/filtering
+zeros = np.zeros((3, 3))
+ones = np.ones((3, 3))  # Zero/One matrices
+eye = np.eye(3)  # Identity matrix
+rand = np.random.rand(3, 3)  # Random matrix [0,1)
+dot = np.dot(arr, arr)  # Dot product
+norm = np.linalg.norm(arr)  # Vector norm (L2)
+log = np.log2(arr)  # Log base 2 (entropy)
+unique, counts = np.unique(arr, return_counts=True)  # Unique values & counts
+
+# ===========================================================
+# 🤖 SCIKIT-LEARN PREPROCESSING
+# ===========================================================
+
+from sklearn.impute import SimpleImputer
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import (LabelEncoder, MinMaxScaler, OneHotEncoder,
+                                   StandardScaler)
+
+# Splitting Data
+X = df.drop("target", axis=1)
+y = df["target"]
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=42
+)
+
+# Imputing (Standard approach)
+imputed = SimpleImputer(strategy="mean").fit_transform(
+    df[["num_col"]]
+)  # Impute missing with mean
+imputed = SimpleImputer(strategy="most_frequent").fit_transform(
+    df[["cat"]]
+)  # Impute categorical
+
+# Scaling & Encoding
+scaled = StandardScaler().fit_transform(
+    df[["age", "salary"]]
+)  # Standardize (mean=0, std=1)
+scaled = MinMaxScaler().fit_transform(df[["age", "salary"]])  # Normalize to [0, 1]
+encoded = OneHotEncoder().fit_transform(df[["gender"]]).toarray()  # One-hot encode
+le = LabelEncoder().fit_transform(df["target"])  # Encode target labels
+
+# Pipeline (all-in-one)
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+
+processor = ColumnTransformer(
+    [("num", StandardScaler(), ["age"]), ("cat", OneHotEncoder(), ["city"])]
+)
+
+# ===========================================================
+# ⛏️ DATA MINING (DM) ESSENTIALS
+# ===========================================================
+
+from sklearn.cluster import KMeans
+from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
+                              RandomForestClassifier)
+from sklearn.metrics import (accuracy_score, auc, classification_report,
+                             confusion_matrix, f1_score,
+                             precision_recall_curve, precision_score,
+                             recall_score, roc_curve)
+from sklearn.preprocessing import label_binarize
+from sklearn.tree import DecisionTreeClassifier
+
+# --- Decision Trees ---
+dt = DecisionTreeClassifier(random_state=42).fit(
+    X_train, y_train
+)  # Train decision tree
+y_pred = dt.predict(X_test)  # Predict
+print(accuracy_score(y_test, y_pred))  # Accuracy
+
+# --- Ensemble Methods ---
+bag = BaggingClassifier(n_estimators=10, random_state=42).fit(
+    X_train, y_train
+)  # Bagging
+ada = AdaBoostClassifier(n_estimators=50, random_state=42).fit(
+    X_train, y_train
+)  # AdaBoost
+rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
+    X_train, y_train
+)  # Random Forest
+
+# --- Clustering ---
+kmeans = KMeans(n_clusters=3, random_state=42).fit(X)  # K-Means clustering
+labels = kmeans.labels_  # Cluster labels
+centers = kmeans.cluster_centers_  # Cluster centroids
+
+# --- Classification Metrics ---
+print(accuracy_score(y_test, y_pred))  # Accuracy = (TP+TN)/(TP+TN+FP+FN)
+print(precision_score(y_test, y_pred, average="weighted"))  # Precision = TP/(TP+FP)
+print(recall_score(y_test, y_pred, average="weighted"))  # Recall = TP/(TP+FN)
+print(f1_score(y_test, y_pred, average="weighted"))  # F1 = 2*(P*R)/(P+R)
+print(confusion_matrix(y_test, y_pred))  # Confusion matrix
+print(classification_report(y_test, y_pred))  # Full report
+
+# --- ROC & AUC ---
+y_bin = label_binarize(y_test, classes=[0, 1, 2])  # Binarize for multiclass ROC
+y_proba = dt.predict_proba(X_test)  # Predicted probabilities
+fpr, tpr, _ = roc_curve(y_bin[:, 0], y_proba[:, 0])  # ROC curve (per class)
+roc_auc = auc(fpr, tpr)  # AUC score
+
+# --- Apriori / Association Rules (manual) ---
+from itertools import combinations
+
+support = lambda itemset, txns: sum(1 for t in txns if itemset.issubset(t)) / len(txns)
+transactions = [
+    ["milk", "bread"],
+    ["milk", "diaper", "beer", "bread"],
+    ["milk", "bread", "diaper"],
+    ["bread", "diaper"],
+]
+items = set(item for t in transactions for item in t)
+freq_items = {frozenset([i]) for t in transactions for i in t}  # C1 candidates
+pairs = [
+    frozenset(c) for c in combinations(sorted(items), 2)
+]  # Generate candidate pairs
+
+# --- Entropy & Information Gain (ID3) ---
+entropy = lambda probs: -sum(p * np.log2(p) for p in probs if p > 0)  # Shannon entropy
+info_gain = lambda parent_ent, children: parent_ent - sum(
+    w * entropy(c) for w, c in children
+)
+gini = lambda probs: 1 - sum(p**2 for p in probs)  # Gini impurity
+
+# ===========================================================
+# 🔎 INFORMATION RETRIEVAL (IR) ESSENTIALS
+# ===========================================================
+
+import math
+from collections import Counter
+
+# --- Term Frequency (TF) ---
+# TF(t, d) = count(t in d) / total_terms_in_d
+tf = lambda term, doc: doc.count(term) / len(doc)  # Term frequency
+
+# --- Inverse Document Frequency (IDF) ---
+# IDF(t) = log(N / df(t)) where N = total docs, df(t) = docs containing t
+idf = lambda term, docs: math.log(len(docs) / sum(1 for d in docs if term in d))
+
+# --- TF-IDF ---
+# TF-IDF(t, d) = TF(t, d) * IDF(t)
+tfidf = lambda term, doc, docs: tf(term, doc) * idf(term, docs)  # TF-IDF score
+
+# --- BM25 ---
+# BM25(t, d) = IDF(t) * (TF * (k1 + 1)) / (TF + k1 * (1 - b + b * |d| / avgdl))
+k1 = 1.5
+b = 0.75  # BM25 parameters
+docs = [
+    ["term1", "term2"],
+    ["term2", "term3"],
+    ["term1", "term3", "term4"],
+]  # Sample documents
+avgdl = np.mean([len(d) for d in docs])  # Average document length
+
+# --- Boolean Retrieval ---
+# AND: set(doc1_terms) & set(doc2_terms)
+# OR: set(doc1_terms) | set(doc2_terms)
+# NOT: set(all_terms) - set(doc_terms)
+bool_and = lambda q_terms, doc: all(t in doc for t in q_terms)  # Boolean AND query
+bool_or = lambda q_terms, doc: any(t in doc for t in q_terms)  # Boolean OR query
+
+# --- Cosine Similarity ---
+# cos(A, B) = (A · B) / (||A|| * ||B||)
+cosine_sim = lambda a, b: np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
+
+# --- Jaccard Similarity ---
+# J(A, B) = |A ∩ B| / |A ∪ B|
+jaccard = lambda a, b: len(a & b) / len(a | b)  # Jaccard similarity (sets)
+
+# --- Tokenization & Text Processing ---
+tokens = "hello world foo bar".lower().split()  # Basic tokenization
+vocab = set(tokens)  # Vocabulary
+bow = Counter(tokens)  # Bag of words
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+tfidf_matrix = TfidfVectorizer().fit_transform(
+    ["doc1 text", "doc2 text"]
+)  # TF-IDF vectorizer
+
+# --- Inverted Index ---
+inv_index = {}  # Build inverted index
+for doc_id, doc in enumerate(docs):
+    for term in doc:
+        inv_index.setdefault(term, set()).add(doc_id)
+
+# --- Precision & Recall (IR) ---
+# Precision@k = relevant_in_top_k / k
+# Recall@k = relevant_in_top_k / total_relevant
+precision_at_k = lambda retrieved, relevant, k: len(set(retrieved[:k]) & relevant) / k
+recall_at_k = lambda retrieved, relevant, k: len(set(retrieved[:k]) & relevant) / len(
+    relevant
+)
+
+
+with open(__file__, "r", encoding="utf-8") as _f:
+    _source_content = _f.read()
+
+sys.modules[__name__].__class__ = PrintableModule
+sys.modules[__name__]._source_content = _source_content
{bm_preprocessing-1.3.5/src/bm_preprocessing/DM/sources → bm_preprocessing-1.3.8/PY}/python_doc.py
RENAMED
@@ -8,21 +8,21 @@ A quick reference guide for core Python concepts, data structures, and features.
 # =============================================================================
 # 1. LISTS (Mutable, Ordered)
 # =============================================================================
-my_list = [1, 2, 3,
+my_list = [1, 2, 3, "a", "b"]
 
 # Operations
-my_list.append(4)
-my_list.insert(0, 0)
-my_list.extend([5, 6])
-my_list.pop()
-my_list.pop(1)
-my_list.remove(
-my_list.reverse()
+my_list.append(4)  # Add to end: [1, 2, 3, 'a', 'b', 4]
+my_list.insert(0, 0)  # Insert at index: [0, 1, 2, 3, 'a', 'b', 4]
+my_list.extend([5, 6])  # Append multiple: [0, 1, 2, 3, 'a', 'b', 4, 5, 6]
+my_list.pop()  # Remove & return last item (6)
+my_list.pop(1)  # Remove & return item at index 1 (1)
+my_list.remove("a")  # Remove first occurrence of 'a'
+my_list.reverse()  # Reverse in place
 # my_list.sort() # Sort in place (requires same types)
 # sorted(my_list) # Return new sorted list
-my_list.clear()
-count = my_list.count(2)
-idx = my_list.index(3)
+my_list.clear()  # Empty the list
+count = my_list.count(2)  # Count occurrences
+idx = my_list.index(3)  # Find index of first occurrence
 
 # Slicing: list[start:stop:step]
 # my_list[1:4] (index 1 to 3), my_list[::-1] (reverse)

@@ -31,67 +31,67 @@ idx = my_list.index(3)  # Find index of first occurrence
 # 2. SETS (Mutable, Unordered, Unique Elements)
 # =============================================================================
 my_set = {1, 2, 3}
-empty_set = set()
+empty_set = set()  # Note: {} creates an empty dict, not a set
 
 # Operations
-my_set.add(4)
-my_set.update([5, 6])
-my_set.remove(6)
-my_set.discard(10)
-my_set.pop()
-my_set.clear()
+my_set.add(4)  # Add element
+my_set.update([5, 6])  # Add multiple elements
+my_set.remove(6)  # Remove element (raises KeyError if not found)
+my_set.discard(10)  # Remove element (safe, no error if not found)
+my_set.pop()  # Remove & return arbitrary element
+my_set.clear()  # Empty the set
 
 set_a, set_b = {1, 2}, {2, 3}
-union = set_a | set_b
-intersection = set_a & set_b
-diff = set_a - set_b
-sym_diff = set_a ^ set_b
+union = set_a | set_b  # {1, 2, 3} (or set_a.union(set_b))
+intersection = set_a & set_b  # {2} (or set_a.intersection(set_b))
+diff = set_a - set_b  # {1} (or set_a.difference(set_b))
+sym_diff = set_a ^ set_b  # {1, 3} (or set_a.symmetric_difference(set_b))
 
 # =============================================================================
 # 3. TUPLES (Immutable, Ordered)
 # =============================================================================
 my_tuple = (1, 2, 3, 2)
-single_tuple = (1,)
+single_tuple = (1,)  # Comma needed for single-element tuple
 
 # Operations (Very limited since immutable)
-count = my_tuple.count(2)
-idx = my_tuple.index(3)
+count = my_tuple.count(2)  # Count occurrences (2)
+idx = my_tuple.index(3)  # Find index of first occurrence (2)
 # Tuples support unpacking: a, b, c, d = my_tuple
 
 # =============================================================================
 # 4. DICTIONARIES (Mutable, Key-Value Pairs, Unordered before Python 3.7)
 # =============================================================================
-my_dict = {
+my_dict = {"name": "Alice", "age": 25}
 
 # Operations
-my_dict[
-val = my_dict.get(
-val = my_dict.get(
-keys = my_dict.keys()
-values = my_dict.values()
-items = my_dict.items()
+my_dict["city"] = "NYC"  # Add or update key
+val = my_dict.get("age")  # Safe get (returns None if not found, instead of KeyError)
+val = my_dict.get("x", 0)  # Safe get with default value
+keys = my_dict.keys()  # dict_keys(['name', 'age', 'city'])
+values = my_dict.values()  # dict_values(['Alice', 25, 'NYC'])
+items = my_dict.items()  # dict_items([('name', 'Alice'), ...])
 
 # Removal
-popped_val = my_dict.pop(
-popped_item = my_dict.popitem()
+popped_val = my_dict.pop("age")  # Remove key 'age' and return value
+popped_item = my_dict.popitem()  # Remove & return last key-value pair as tuple
 # del my_dict['name'] # Delete key
-my_dict.clear()
-my_dict.update({
+my_dict.clear()  # Empty dict
+my_dict.update({"a": 1, "b": 2})  # Merge / Update with another dict
 
 # =============================================================================
 # 5. LIST / DICT / SET COMPREHENSIONS
 # =============================================================================
 # List Comprehension: [expression for item in iterable if condition]
-squares = [x**2 for x in range(10) if x % 2 == 0]
+squares = [x**2 for x in range(10) if x % 2 == 0]  # [0, 4, 16, 36, 64]
 
 # Dict Comprehension: {key_expr: val_expr for item in iterable if condition}
-sq_dict = {x: x**2 for x in range(5)}
+sq_dict = {x: x**2 for x in range(5)}  # {0: 0, 1: 1, 2: 4, 3: 9, 4: 16}
 
 # Set Comprehension: {expression for item in iterable if condition}
-sq_set = {x**2 for x in [-1, 1, 2]}
+sq_set = {x**2 for x in [-1, 1, 2]}  # {1, 4}
 
 # Generator Expression: (expression for item in iterable if condition)
-gen = (x**2 for x in range(10))
+gen = (x**2 for x in range(10))  # Lazy evaluation
 
 # =============================================================================
 # 6. LAMBDA FUNCTIONS, MAP, FILTER, REDUCE

@@ -102,24 +102,27 @@ print(add(2, 3))  # 5
 
 nums = [1, 2, 3, 4]
 # map: apply function to all items
-mapped = list(map(lambda x: x*2, nums))
+mapped = list(map(lambda x: x * 2, nums))  # [2, 4, 6, 8]
 
 # filter: keep items where function returns True
-filtered = list(filter(lambda x: x % 2 == 0, nums))
+filtered = list(filter(lambda x: x % 2 == 0, nums))  # [2, 4]
 
 # reduce (requires functools): cumulative application
 from functools import reduce
-
+
+product = reduce(lambda x, y: x * y, nums)  # 24
 
 # Sort with lambda key
 words = ["apple", "banana", "cherry"]
-words.sort(key=lambda w: len(w))
+words.sort(key=lambda w: len(w))  # Sort by length
+
 
 # =============================================================================
 # 7. CLASSES AND OBJECTS (OOP)
 # =============================================================================
 class Animal:
     """Base class for animals."""
+
     species_count = 0  # Class attribute
 
     def __init__(self, name):

@@ -140,18 +143,20 @@ class Animal:
         """Static method: no implicit self or cls args"""
         return True
 
+
 # Inheritance
 class Dog(Animal):
     def __init__(self, name, breed):
         super().__init__(name)  # Call parent constructor
         self.breed = breed
-
-    def speak(self):
+
+    def speak(self):  # Method Overriding
         return "Woof!"
 
+
 dog = Dog("Buddy", "Golden Retriever")
-print(dog.speak())
-print(Animal.get_count())
+print(dog.speak())  # "Woof!"
+print(Animal.get_count())  # 1
 
 # =============================================================================
 # 8. FILE HANDLING

@@ -165,7 +170,7 @@ with open("example.txt", "w", encoding="utf-8") as file:
 
 # Read from file
 with open("example.txt", "r", encoding="utf-8") as file:
-    content = file.read()
+    content = file.read()  # Read entire file
 # file.seek(0) # Reset cursor to start
 # lines = file.readlines() # Read lines into a list
 # for line in file: # Iterate line by line (memory efficient)
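The class hunks above show python_doc.py's OOP section only in fragments. For orientation, here is a minimal self-contained version consistent with the visible lines; the bodies of __init__, get_count, the static method (here named is_valid), and the base speak are not shown in the diff, so those parts are assumptions:

class Animal:
    """Base class for animals."""

    species_count = 0  # Class attribute

    def __init__(self, name):
        self.name = name  # Assumed body: store name,
        Animal.species_count += 1  # and count instances so get_count() prints 1 below

    def speak(self):
        return "..."  # Assumed generic sound; overridden in Dog

    @classmethod
    def get_count(cls):
        return cls.species_count  # Assumed body, consistent with the printed 1

    @staticmethod
    def is_valid():
        """Static method: no implicit self or cls args"""
        return True


class Dog(Animal):
    def __init__(self, name, breed):
        super().__init__(name)  # Call parent constructor
        self.breed = breed

    def speak(self):  # Method Overriding
        return "Woof!"


dog = Dog("Buddy", "Golden Retriever")
print(dog.speak())  # "Woof!"
print(Animal.get_count())  # 1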
bm_preprocessing-1.3.8/README.md
File without changes
bm_preprocessing-1.3.8/bm_preprocessing.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,10 @@
+Metadata-Version: 2.4
+Name: bm-preprocessing
+Version: 1.3.8
+Summary: Add your description here
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+Requires-Dist: matplotlib>=3.10.8
+Requires-Dist: numpy>=2.4.4
+Requires-Dist: pandas>=3.0.2
+Requires-Dist: seaborn>=0.13.2
bm_preprocessing-1.3.8/bm_preprocessing.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,11 @@
+README.md
+pyproject.toml
+PY/__init__.py
+PY/_module_printer.py
+PY/lib_doc.py
+PY/python_doc.py
+bm_preprocessing.egg-info/PKG-INFO
+bm_preprocessing.egg-info/SOURCES.txt
+bm_preprocessing.egg-info/dependency_links.txt
+bm_preprocessing.egg-info/requires.txt
+bm_preprocessing.egg-info/top_level.txt
bm_preprocessing-1.3.8/bm_preprocessing.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@
+
bm_preprocessing-1.3.8/bm_preprocessing.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+PY
bm_preprocessing-1.3.8/pyproject.toml
ADDED
@@ -0,0 +1,12 @@
+[project]
+name = "bm-preprocessing"
+version = "1.3.8"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "matplotlib>=3.10.8",
+    "numpy>=2.4.4",
+    "pandas>=3.0.2",
+    "seaborn>=0.13.2",
+]
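A closing note on the self-printing trick: PY/lib_doc.py ends by reading its own source into _source_content and swapping the module's class to PrintableModule, but the +11 lines of PY/_module_printer.py do not appear in this diff. The class presumably makes the module's repr return that source text; a plausible sketch of the pattern, offered as an assumption rather than the package's actual code:

import types


class PrintableModule(types.ModuleType):
    """Module subclass whose repr/str is the module's own source text."""

    _source_content = ""  # Overwritten by the importing module itself

    def __repr__(self):
        return self._source_content

    __str__ = __repr__

If that guess holds, then after `from bm_preprocessing.PY import lib_doc`, a plain `print(lib_doc)` would dump the entire cheat sheet to stdout, which would explain why the module ends by assigning both __class__ and _source_content on its own sys.modules entry.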