bm-preprocessing 0.7.0__tar.gz → 0.8.0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/PKG-INFO +1 -1
  2. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/USAGE.md +3 -1
  3. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/pyproject.toml +1 -1
  4. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/__init__.py +2 -1
  5. bm_preprocessing-0.8.0/src/bm_preprocessing/DM/lib_doc.py +30 -0
  6. bm_preprocessing-0.8.0/src/bm_preprocessing/DM/sources/lib_doc.py +223 -0
  7. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/.gitignore +0 -0
  8. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/README.md +0 -0
  9. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/adaboost.py +0 -0
  10. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/all.py +0 -0
  11. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/apriori.py +0 -0
  12. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/bagging.py +0 -0
  13. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/hash.py +0 -0
  14. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/hunts.py +0 -0
  15. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/hunts_test.py +0 -0
  16. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/id3.py +0 -0
  17. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/id3_test.py +0 -0
  18. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/metrics.py +0 -0
  19. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/preprocessing.py +0 -0
  20. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/sources/adaboost.py +0 -0
  21. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/sources/all.py +0 -0
  22. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/sources/apriori.py +0 -0
  23. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/sources/bagging.py +0 -0
  24. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/sources/data.csv +0 -0
  25. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/sources/hash.py +0 -0
  26. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/sources/hunts.py +0 -0
  27. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/sources/hunts_test.py +0 -0
  28. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/sources/id3.py +0 -0
  29. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/sources/id3_test.py +0 -0
  30. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/sources/metrics.py +0 -0
  31. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/sources/preprocessing.py +0 -0
  32. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/DM/sources/tennis.csv +0 -0
  33. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/IR/__init__.py +0 -0
  34. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/IR/all.py +0 -0
  35. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/IR/sources/all.py +0 -0
  36. {bm_preprocessing-0.7.0 → bm_preprocessing-0.8.0}/src/bm_preprocessing/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bm-preprocessing
3
- Version: 0.7.0
3
+ Version: 0.8.0
4
4
  Summary: A package to preprocess text data
5
5
  Requires-Python: >=3.8
6
6
  Requires-Dist: build>=1.2.2.post1
@@ -15,7 +15,7 @@ Create a file `example.py`:
15
15
  ```python
16
16
  # Import modules
17
17
  from bm_preprocessing.IR import all
18
- from bm_preprocessing.DM import adaboost, apriori, bagging, hash, hunts, hunts_test, id3, id3_test, metrics, preprocessing
18
+ from bm_preprocessing.DM import adaboost, apriori, bagging, hash, hunts, hunts_test, id3, id3_test, lib_doc, metrics, preprocessing
19
19
 
20
20
  # Print the source code
21
21
  print("=== IR All Module ===")
@@ -116,6 +116,7 @@ python -c "from bm_preprocessing.DM import hunts_test; print(hunts_test)"
116
116
  python -c "from bm_preprocessing.DM import id3; print(id3)"
117
117
  python -c "from bm_preprocessing.DM import id3_test; print(id3_test)"
118
118
  python -c "from bm_preprocessing.DM import metrics; print(metrics)"
119
+ python -c "from bm_preprocessing.DM import lib_doc; print(lib_doc)"
119
120
  python -c "from bm_preprocessing.DM import preprocessing; print(preprocessing)"
120
121
  ```
121
122
 
@@ -136,4 +137,5 @@ python -c "from bm_preprocessing.DM import preprocessing; print(preprocessing)"
136
137
  | `from bm_preprocessing.DM import id3` | ID3 decision tree algorithm |
137
138
  | `from bm_preprocessing.DM import id3_test` | ID3 decision tree with visualization |
138
139
  | `from bm_preprocessing.DM import metrics` | Classification metrics & curves |
140
+ | `from bm_preprocessing.DM import lib_doc` | Pandas/NumPy/Sklearn/DM/IR cheat sheet |
139
141
  | `from bm_preprocessing.DM import preprocessing` | Data preprocessing utilities |
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "bm-preprocessing"
7
- version = "0.7.0"
7
+ version = "0.8.0"
8
8
  description = "A package to preprocess text data"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -9,8 +9,9 @@ from .hunts import hunts
9
9
  from .hunts_test import hunts_test
10
10
  from .id3 import id3
11
11
  from .id3_test import id3_test
12
+ from .lib_doc import lib_doc
12
13
  from .metrics import metrics
13
14
  from .preprocessing import preprocessing
14
15
 
15
- __all__ = ["adaboost", "all", "apriori", "bagging", "hash", "hunts", "hunts_test", "id3", "id3_test", "metrics", "preprocessing"]
16
+ __all__ = ["adaboost", "all", "apriori", "bagging", "hash", "hunts", "hunts_test", "id3", "id3_test", "lib_doc", "metrics", "preprocessing"]
16
17
 
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/lib_doc.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class SourceCodeModule:
7
+ """A class that displays source code when printed."""
8
+
9
+ def __init__(self, name: str, source_path: Path):
10
+ self.name = name
11
+ self._source_path = source_path
12
+ self._source_code = None
13
+
14
+ @property
15
+ def source_code(self) -> str:
16
+ """Lazily load source code."""
17
+ if self._source_code is None:
18
+ self._source_code = self._source_path.read_text(encoding="utf-8")
19
+ return self._source_code
20
+
21
+ def __repr__(self) -> str:
22
+ return self.source_code
23
+
24
+ def __str__(self) -> str:
25
+ return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "lib_doc.py"
30
+ lib_doc = SourceCodeModule("DM.lib_doc", _source_file)
@@ -0,0 +1,223 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
+ # ===========================================================
5
+ # 📂 FILE READING & INITIAL INSPECTION
6
+ # ===========================================================
7
+
8
+ # Load data (CSV, Excel, JSON)
9
+ df = pd.read_csv('data.csv') # Load CSV
10
+ df = pd.read_excel('data.xlsx') # Load Excel
11
+ df = pd.read_json('data.json') # Load JSON
12
+
13
+ # Quick Inspection
14
+ print(df.head()); print(df.info()); print(df.describe()) # Basic stats & info
15
+ print(df.shape) # (rows, columns)
16
+ print(df.columns.tolist()) # List all column names
17
+ print(df.dtypes) # Data types of each column
18
+ print(df.isnull().sum()) # Count NaNs per column
19
+ print(df.nunique()) # Unique values per column
20
+ print(df.value_counts('col')) # Frequency of each value
21
+
22
+ # ===========================================================
23
+ # 🔍 SELECTION & MANIPULATION
24
+ # ===========================================================
25
+
26
+ # Selecting rows and columns
27
+ cols = df[['col1', 'col2']] # Select multiple columns
28
+ rows = df.iloc[0:10] # Select first 10 rows by index
29
+ filtered = df[df['age'] > 25] # Select rows based on condition
30
+ cell = df.loc[0, 'col1'] # Select specific cell (label-based)
31
+ cell = df.iloc[0, 0] # Select specific cell (index-based)
32
+ filtered = df.query("age > 25 and city == 'NYC'") # Query-based filtering
33
+ sampled = df.sample(n=5, random_state=42) # Random sample of rows
34
+
35
+ # Sorting
36
+ sorted_df = df.sort_values('col', ascending=False) # Sort by column descending
37
+ sorted_df = df.sort_values(['col1', 'col2'], ascending=[True, False]) # Multi-column sort
38
+
39
+ # Stacking & Merging
40
+ df_stacked = pd.concat([df1, df2], axis=0) # Stack vertically (rows)
41
+ df_wide = pd.concat([df1, df2], axis=1) # Stack horizontally (columns)
42
+ df_merged = pd.merge(df1, df2, on='id', how='inner') # SQL-like join (inner/left/right/outer)
43
+
44
+ # ===========================================================
45
+ # 🛠️ DATA CLEANING & PROCESSING
46
+ # ===========================================================
47
+
48
+ # Handling Missing Values
49
+ df['col'].fillna(df['col'].mean(), inplace=True) # Impute by mean (one-liner)
50
+ df['col'].fillna(df['col'].median(), inplace=True) # Impute by median
51
+ df['col'].fillna(df['col'].mode()[0], inplace=True) # Impute by mode (categorical)
52
+ df.dropna(axis=0, inplace=True) # Drop rows with any NaN values
53
+ df.dropna(subset=['col1', 'col2'], inplace=True) # Drop rows with NaN in specific columns
54
+ df.ffill(inplace=True) # Forward fill NaN values
55
+ df.bfill(inplace=True) # Backward fill NaN values
56
+
57
+ # Transformations
58
+ df['new_col'] = df['col'].apply(lambda x: x**2) # Apply custom function
59
+ grouped = df.groupby('category')['sales'].sum() # Groupby & aggregate
60
+ grouped = df.groupby('cat').agg({'sales': 'sum', 'qty': 'mean'}) # Multiple aggregations
61
+ df['cat_code'] = df['category'].astype('category').cat.codes # Quick label encoding
62
+ df['col'] = df['col'].str.lower() # String lowercase
63
+ df['col'] = df['col'].str.replace('old', 'new') # String replace
64
+ df['col'] = df['col'].str.strip() # Strip whitespace
65
+ df['binned'] = pd.cut(df['age'], bins=[0, 18, 35, 60, 100]) # Binning/discretization
66
+ df = pd.get_dummies(df, columns=['city'], drop_first=True) # One-hot encode columns
67
+ df.rename(columns={'old_name': 'new_name'}, inplace=True) # Rename columns
68
+ df.drop(columns=['col1', 'col2'], inplace=True) # Drop columns
69
+ df.drop_duplicates(inplace=True) # Remove duplicate rows
70
+
71
+ # Type Conversions
72
+ df['col'] = df['col'].astype(int) # Convert column type
73
+ df['date'] = pd.to_datetime(df['date_str']) # Parse dates
74
+
75
+ # ===========================================================
76
+ # 🔢 NUMPY ESSENTIALS
77
+ # ===========================================================
78
+
79
+ arr = np.array([1, 2, 3]) # Create array
80
+ reshaped = arr.reshape(1, -1) # Reshape for sklearn (2D)
81
+ mean_val = np.mean(arr); std_val = np.std(arr) # Basic stats
82
+ mask = arr[arr > 2] # Boolean indexing/filtering
83
+ zeros = np.zeros((3, 3)); ones = np.ones((3, 3)) # Zero/One matrices
84
+ eye = np.eye(3) # Identity matrix
85
+ rand = np.random.rand(3, 3) # Random matrix [0,1)
86
+ dot = np.dot(arr, arr) # Dot product
87
+ norm = np.linalg.norm(arr) # Vector norm (L2)
88
+ log = np.log2(arr) # Log base 2 (entropy)
89
+ unique, counts = np.unique(arr, return_counts=True) # Unique values & counts
90
+
91
+ # ===========================================================
92
+ # 🤖 SCIKIT-LEARN PREPROCESSING
93
+ # ===========================================================
94
+
95
+ from sklearn.impute import SimpleImputer
96
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
97
+ from sklearn.model_selection import train_test_split
98
+
99
+ # Splitting Data
100
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
101
+
102
+ # Imputing (Standard approach)
103
+ imputed = SimpleImputer(strategy='mean').fit_transform(df[['num_col']]) # Impute missing with mean
104
+ imputed = SimpleImputer(strategy='most_frequent').fit_transform(df[['cat']])# Impute categorical
105
+
106
+ # Scaling & Encoding
107
+ scaled = StandardScaler().fit_transform(df[['age', 'salary']]) # Standardize (mean=0, std=1)
108
+ scaled = MinMaxScaler().fit_transform(df[['age', 'salary']]) # Normalize to [0, 1]
109
+ encoded = OneHotEncoder().fit_transform(df[['gender']]).toarray() # One-hot encode
110
+ le = LabelEncoder().fit_transform(df['target']) # Encode target labels
111
+
112
+ # Pipeline (all-in-one)
113
+ from sklearn.compose import ColumnTransformer
114
+ from sklearn.pipeline import Pipeline
115
+ processor = ColumnTransformer([('num', StandardScaler(), ['age']), ('cat', OneHotEncoder(), ['city'])])
116
+
117
+ # ===========================================================
118
+ # ⛏️ DATA MINING (DM) ESSENTIALS
119
+ # ===========================================================
120
+
121
+ from sklearn.tree import DecisionTreeClassifier
122
+ from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
123
+ from sklearn.cluster import KMeans
124
+ from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
125
+ from sklearn.metrics import roc_curve, auc, precision_recall_curve, classification_report
126
+ from sklearn.preprocessing import label_binarize
127
+
128
+ # --- Decision Trees ---
129
+ dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train) # Train decision tree
130
+ y_pred = dt.predict(X_test) # Predict
131
+ print(accuracy_score(y_test, y_pred)) # Accuracy
132
+
133
+ # --- Ensemble Methods ---
134
+ bag = BaggingClassifier(n_estimators=10, random_state=42).fit(X_train, y_train) # Bagging
135
+ ada = AdaBoostClassifier(n_estimators=50, random_state=42).fit(X_train, y_train) # AdaBoost
136
+ rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)# Random Forest
137
+
138
+ # --- Clustering ---
139
+ kmeans = KMeans(n_clusters=3, random_state=42).fit(X) # K-Means clustering
140
+ labels = kmeans.labels_ # Cluster labels
141
+ centers = kmeans.cluster_centers_ # Cluster centroids
142
+
143
+ # --- Classification Metrics ---
144
+ print(accuracy_score(y_test, y_pred)) # Accuracy = (TP+TN)/(TP+TN+FP+FN)
145
+ print(precision_score(y_test, y_pred, average='weighted')) # Precision = TP/(TP+FP)
146
+ print(recall_score(y_test, y_pred, average='weighted')) # Recall = TP/(TP+FN)
147
+ print(f1_score(y_test, y_pred, average='weighted')) # F1 = 2*(P*R)/(P+R)
148
+ print(confusion_matrix(y_test, y_pred)) # Confusion matrix
149
+ print(classification_report(y_test, y_pred)) # Full report
150
+
151
+ # --- ROC & AUC ---
152
+ y_bin = label_binarize(y_test, classes=[0, 1, 2]) # Binarize for multiclass ROC
153
+ fpr, tpr, _ = roc_curve(y_bin[:, 0], y_proba[:, 0]) # ROC curve (per class)
154
+ roc_auc = auc(fpr, tpr) # AUC score
155
+
156
+ # --- Apriori / Association Rules (manual) ---
157
+ from itertools import combinations
158
+ support = lambda itemset, txns: sum(1 for t in txns if itemset.issubset(t)) / len(txns)
159
+ freq_items = {frozenset([i]) for t in transactions for i in t} # C1 candidates
160
+ pairs = [frozenset(c) for c in combinations(sorted(items), 2)] # Generate candidate pairs
161
+
162
+ # --- Entropy & Information Gain (ID3) ---
163
+ entropy = lambda probs: -sum(p * np.log2(p) for p in probs if p > 0) # Shannon entropy
164
+ info_gain = lambda parent_ent, children: parent_ent - sum(w * entropy(c) for w, c in children)
165
+ gini = lambda probs: 1 - sum(p**2 for p in probs) # Gini impurity
166
+
167
+ # ===========================================================
168
+ # 🔎 INFORMATION RETRIEVAL (IR) ESSENTIALS
169
+ # ===========================================================
170
+
171
+ from collections import Counter
172
+ import math
173
+
174
+ # --- Term Frequency (TF) ---
175
+ # TF(t, d) = count(t in d) / total_terms_in_d
176
+ tf = lambda term, doc: doc.count(term) / len(doc) # Term frequency
177
+
178
+ # --- Inverse Document Frequency (IDF) ---
179
+ # IDF(t) = log(N / df(t)) where N = total docs, df(t) = docs containing t
180
+ idf = lambda term, docs: math.log(len(docs) / sum(1 for d in docs if term in d))
181
+
182
+ # --- TF-IDF ---
183
+ # TF-IDF(t, d) = TF(t, d) * IDF(t)
184
+ tfidf = lambda term, doc, docs: tf(term, doc) * idf(term, docs) # TF-IDF score
185
+
186
+ # --- BM25 ---
187
+ # BM25(t, d) = IDF(t) * (TF * (k1 + 1)) / (TF + k1 * (1 - b + b * |d| / avgdl))
188
+ k1 = 1.5; b = 0.75 # BM25 parameters
189
+ avgdl = np.mean([len(d) for d in docs]) # Average document length
190
+
191
+ # --- Boolean Retrieval ---
192
+ # AND: set(doc1_terms) & set(doc2_terms)
193
+ # OR: set(doc1_terms) | set(doc2_terms)
194
+ # NOT: set(all_terms) - set(doc_terms)
195
+ bool_and = lambda q_terms, doc: all(t in doc for t in q_terms) # Boolean AND query
196
+ bool_or = lambda q_terms, doc: any(t in doc for t in q_terms) # Boolean OR query
197
+
198
+ # --- Cosine Similarity ---
199
+ # cos(A, B) = (A · B) / (||A|| * ||B||)
200
+ cosine_sim = lambda a, b: np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
201
+
202
+ # --- Jaccard Similarity ---
203
+ # J(A, B) = |A ∩ B| / |A ∪ B|
204
+ jaccard = lambda a, b: len(a & b) / len(a | b) # Jaccard similarity (sets)
205
+
206
+ # --- Tokenization & Text Processing ---
207
+ tokens = "hello world foo bar".lower().split() # Basic tokenization
208
+ vocab = set(tokens) # Vocabulary
209
+ bow = Counter(tokens) # Bag of words
210
+ from sklearn.feature_extraction.text import TfidfVectorizer
211
+ tfidf_matrix = TfidfVectorizer().fit_transform(["doc1 text", "doc2 text"]) # TF-IDF vectorizer
212
+
213
+ # --- Inverted Index ---
214
+ inv_index = {} # Build inverted index
215
+ for doc_id, doc in enumerate(docs):
216
+ for term in doc:
217
+ inv_index.setdefault(term, set()).add(doc_id)
218
+
219
+ # --- Precision & Recall (IR) ---
220
+ # Precision@k = relevant_in_top_k / k
221
+ # Recall@k = relevant_in_top_k / total_relevant
222
+ precision_at_k = lambda retrieved, relevant, k: len(set(retrieved[:k]) & relevant) / k
223
+ recall_at_k = lambda retrieved, relevant, k: len(set(retrieved[:k]) & relevant) / len(relevant)