my-python-lib-tarik 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- my_python_lib/__init__.py +82 -0
- my_python_lib/core.py +132 -0
- my_python_lib/data.py +174 -0
- my_python_lib/utils.py +116 -0
- my_python_lib_tarik-0.1.0.dist-info/METADATA +75 -0
- my_python_lib_tarik-0.1.0.dist-info/RECORD +8 -0
- my_python_lib_tarik-0.1.0.dist-info/WHEEL +5 -0
- my_python_lib_tarik-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""
|
|
2
|
+
my_python_lib: A custom Machine Learning and Data Preprocessing library.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .core import (
|
|
6
|
+
LinearRegression,
|
|
7
|
+
LeastSquaresStrategy,
|
|
8
|
+
GradientDescentStrategy,
|
|
9
|
+
KNNClassifier,
|
|
10
|
+
EuclideanDistance,
|
|
11
|
+
ManhattanDistance,
|
|
12
|
+
evaluate_model,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
from .data import (
|
|
16
|
+
DataPipeline,
|
|
17
|
+
DataProcessor,
|
|
18
|
+
CSVCreator,
|
|
19
|
+
JSONCreator,
|
|
20
|
+
MeanImputer,
|
|
21
|
+
MedianImputer,
|
|
22
|
+
ModeImputer,
|
|
23
|
+
UnknownImputer,
|
|
24
|
+
LabelEncoder,
|
|
25
|
+
OneHotEncoder,
|
|
26
|
+
TargetEncoder,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
from .utils import (
|
|
30
|
+
calculate_missing_values,
|
|
31
|
+
predict_stats,
|
|
32
|
+
predict_success_summary,
|
|
33
|
+
calculate_accuracy,
|
|
34
|
+
shuffle_data,
|
|
35
|
+
series_to_ndarray,
|
|
36
|
+
list_to_ndarray,
|
|
37
|
+
dimension_control,
|
|
38
|
+
data_summary,
|
|
39
|
+
log_operation,
|
|
40
|
+
apply_pipeline,
|
|
41
|
+
is_categorical,
|
|
42
|
+
NotCategoricalError,
|
|
43
|
+
is_multiple_mode,
|
|
44
|
+
ColumnNotFoundError,
|
|
45
|
+
numeric_check
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
# Names exported by ``from my_python_lib import *``.
# Every name imported at the top of this module is listed, so the wildcard
# import exposes the same helpers that are importable by name.
__all__ = [
    # Core Algorithms & Metrics
    "LinearRegression",
    "LeastSquaresStrategy",
    "GradientDescentStrategy",
    "KNNClassifier",
    "EuclideanDistance",
    "ManhattanDistance",
    "evaluate_model",

    # Data Pipeline & Processing
    "DataPipeline",
    "DataProcessor",
    "CSVCreator",
    "JSONCreator",
    "MeanImputer",
    "MedianImputer",
    "ModeImputer",
    "UnknownImputer",
    "LabelEncoder",
    "OneHotEncoder",
    "TargetEncoder",

    # Utilities
    "calculate_missing_values",
    "predict_stats",
    "predict_success_summary",
    "calculate_accuracy",
    "shuffle_data",
    "series_to_ndarray",
    "list_to_ndarray",
    "dimension_control",
    "data_summary",
    "log_operation",
    "apply_pipeline",
    "is_categorical",
    "NotCategoricalError",
    "is_multiple_mode",
    "ColumnNotFoundError",
    "numeric_check",
]
|
my_python_lib/core.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
import threading
|
|
3
|
+
import numpy as np
|
|
4
|
+
from functools import reduce
|
|
5
|
+
from my_python_lib.utils import dimension_control
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class BaseAlgorithm(ABC):
    """Abstract base for estimators that expose ``fit`` and ``predict``."""

    def __init__(self):
        # Free-form per-model storage; this class itself never reads it.
        self.model_data = {}

    @abstractmethod
    def fit(self, X, y):
        """Train the model on features ``X`` and targets ``y``.

        The signature mirrors the concrete estimators in this module so that
        subclasses can be used interchangeably through the base type.
        """

    @abstractmethod
    def predict(self, X):
        """Return predictions for the samples in ``X``."""
|
|
19
|
+
|
|
20
|
+
class RegressionStrategy(ABC):
    """Interface for algorithms that fit linear-regression weights."""

    @abstractmethod
    def train(self, X, y):
        """Return the weights fitted to design matrix ``X`` and targets ``y``."""


class LeastSquaresStrategy(RegressionStrategy):
    """Closed-form least-squares fit."""

    def train(self, X, y):
        """Return the least-squares weights for ``X @ w ~= y``.

        Args:
            X: 2-D design matrix (one row per sample).
            y: Targets with one entry (or row) per sample.

        Returns:
            The weight array produced by ``numpy.linalg.lstsq``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        # lstsq minimises ||X w - y|| directly.  Solving the normal equations
        # (X^T X) w = X^T y with np.linalg.solve raises LinAlgError as soon as
        # X^T X is singular (e.g. duplicated / collinear features); lstsq
        # returns the minimum-norm solution in that case instead.
        weights, *_ = np.linalg.lstsq(X, y, rcond=None)
        return weights


class GradientDescentStrategy(RegressionStrategy):
    """Batch gradient descent on the mean-squared-error objective."""

    def __init__(self, learning_rate=0.01, iterations=1000):
        self.lr = learning_rate      # step size applied to each gradient
        self.iters = iterations      # number of full-batch updates

    def train(self, X, y):
        """Run ``self.iters`` gradient steps starting from zero weights.

        Args:
            X: 2-D design matrix of shape ``(m, n)``.
            y: ``m`` targets; a column vector is flattened to 1-D.

        Returns:
            1-D array of ``n`` weights.

        Raises:
            ValueError: If ``X`` has no rows.
        """
        X = np.asarray(X, dtype=float)
        # Flatten so an (m, 1) target does not broadcast against the (m,)
        # prediction into an (m, m) error matrix.
        y = np.asarray(y, dtype=float).ravel()
        m, n = X.shape
        if m == 0:
            raise ValueError("Cannot train on an empty design matrix")

        weights = np.zeros(n)
        for _ in range(self.iters):
            errors = np.dot(X, weights) - y
            gradient = np.dot(X.T, errors) / m
            weights -= self.lr * gradient

        return weights
|
|
50
|
+
|
|
51
|
+
class DistanceMetric(ABC):
    """Interface for a distance function between two vectors."""

    @abstractmethod
    def calculate(self, v1, v2):
        """Return the distance between ``v1`` and ``v2``."""


class EuclideanDistance(DistanceMetric):
    """Straight-line (L2) distance."""

    def calculate(self, v1, v2):
        """Return ``sqrt(sum((v1 - v2) ** 2))``.

        The sum is computed with NumPy rather than one Python call per
        element, so the vector length is not bounded by the interpreter's
        recursion limit.
        """
        difference = np.asarray(v1) - np.asarray(v2)
        return np.sqrt(np.sum(difference ** 2))


class ManhattanDistance(DistanceMetric):
    """City-block (L1) distance."""

    def calculate(self, v1, v2):
        """Return ``sum(|v1 - v2|)``; plain sequences are accepted."""
        return np.sum(np.abs(np.asarray(v1) - np.asarray(v2)))
|
|
68
|
+
|
|
69
|
+
class LinearRegression(BaseAlgorithm):
    """Linear regression whose fitting algorithm is supplied as a strategy."""

    def __init__(self, strategy: "RegressionStrategy" = None):
        """Store the training strategy.

        Args:
            strategy: Object exposing ``train(X, y)``. When omitted a fresh
                ``LeastSquaresStrategy`` is created per instance instead of
                sharing one object built at function-definition time.
        """
        super().__init__()
        self.strategy = LeastSquaresStrategy() if strategy is None else strategy
        self.weights = None

    def fit(self, X, y):
        """Fit the weights (intercept first) on ``X`` and ``y``.

        Raises:
            ValueError: If ``X`` and ``y`` have different numbers of rows.
                Previously the mismatch was only printed and the model was
                left untrained, which surfaced later as an unrelated error
                inside ``predict``.
        """
        X = np.asarray(X)
        if not dimension_control(X, y):
            raise ValueError(
                f"X has {len(X)} rows but y has {len(y)} rows"
            )
        # Prepend a column of ones so the first weight acts as the intercept.
        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        self.weights = self.strategy.train(X_b, y)

    def predict(self, X):
        """Return ``[1, X] @ weights``.

        Raises:
            RuntimeError: If called before ``fit`` has produced weights.
        """
        if self.weights is None:
            raise RuntimeError("LinearRegression.predict called before fit")
        X = np.asarray(X)
        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        return np.dot(X_b, self.weights)
|
|
84
|
+
|
|
85
|
+
class KNNClassifier(BaseAlgorithm):
    """k-nearest-neighbours classifier with a pluggable distance metric."""

    def __init__(self, k=3, metric: "DistanceMetric" = None):
        """Store the neighbour count and the distance metric.

        Args:
            k: Number of neighbours that vote on each prediction.
            metric: Object exposing ``calculate(v1, v2)``. When omitted a
                fresh ``EuclideanDistance`` is created per instance.
        """
        super().__init__()
        self.k = k
        self.metric = EuclideanDistance() if metric is None else metric
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Memorise the training samples and their labels.

        Both are converted to arrays so that row iteration and the fancy
        indexing in ``_predict_single`` also work for lists.
        """
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)

    def _predict_single(self, x_test, results, index):
        """Write the majority label of the ``k`` nearest rows to ``results[index]``."""
        distances = [self.metric.calculate(x_test, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = self.y_train[k_indices]

        # bincount needs non-negative integers, hence the cast to int.
        counts = np.bincount(k_nearest_labels.astype(int))
        results[index] = np.argmax(counts)

    def predict(self, X):
        """Predict a label for every row of ``X`` using a bounded thread pool.

        Returns:
            1-D float array with one predicted label per row.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        # Local import: the module header does not import concurrent.futures.
        from concurrent.futures import ThreadPoolExecutor

        if self.X_train is None or self.y_train is None:
            raise RuntimeError("KNNClassifier.predict called before fit")

        X = np.asarray(X)
        predictions = np.zeros(X.shape[0])

        # A pool caps the number of live threads; starting one thread per
        # sample can exhaust the OS thread limit on large inputs.
        with ThreadPoolExecutor() as pool:
            futures = [
                pool.submit(self._predict_single, x_test, predictions, i)
                for i, x_test in enumerate(X)
            ]
            for future in futures:
                # result() re-raises any worker exception; a bare Thread would
                # swallow it and leave a silent 0 in ``predictions``.
                future.result()

        return predictions
|
|
118
|
+
|
|
119
|
+
def evaluate_model(y_true, y_pred):
    """Return the mean squared error between ``y_true`` and ``y_pred``.

    Args:
        y_true: Observed values (array or plain sequence).
        y_pred: Predicted values with the same length.

    Returns:
        Sum of squared differences divided by ``len(y_true)``.

    Raises:
        ValueError: If ``y_true`` is empty.
    """
    # Convert first so plain lists support element-wise subtraction.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if len(y_true) == 0:
        raise ValueError("evaluate_model requires at least one sample")

    errors = (y_true - y_pred) ** 2
    # The explicit initial value keeps reduce well-defined for any length.
    total_error = reduce(lambda x, y: x + y, errors, 0.0)
    return total_error / len(y_true)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
|
my_python_lib/data.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import List
|
|
5
|
+
from my_python_lib.utils import is_categorical, NotCategoricalError, is_multiple_mode, ColumnNotFoundError
|
|
6
|
+
|
|
7
|
+
class DataLoader(ABC):
    """Interface for objects that read a source into a DataFrame."""

    @abstractmethod
    def load(self, source : str) -> pd.DataFrame:
        """Read ``source`` and return its content as a DataFrame."""


class CSVLoader(DataLoader):
    """Loader backed by ``pandas.read_csv``."""

    def load(self, source : str) -> pd.DataFrame:
        """Lift pandas' display column/width limits, then read the CSV."""
        for option_name in ('display.max_columns', 'display.width'):
            pd.set_option(option_name, None)
        frame = pd.read_csv(source)
        return frame


class JSONLoader(DataLoader):
    """Loader backed by ``pandas.read_json``."""

    def load(self, source : str) -> pd.DataFrame:
        """Lift pandas' display column/width limits, then read the JSON."""
        for option_name in ('display.max_columns', 'display.width'):
            pd.set_option(option_name, None)
        frame = pd.read_json(source)
        return frame
|
|
23
|
+
|
|
24
|
+
class DataCreator(ABC):
    """Factory-method base: subclasses choose which loader is created."""

    @abstractmethod
    def create_document(self) -> DataLoader:
        """Return the loader this creator works with."""

    def generate_document(self, source : str):
        """Create the loader and return whatever it loads from ``source``."""
        loader = self.create_document()
        return loader.load(source)


class CSVCreator(DataCreator):
    """Creator that hands out ``CSVLoader`` instances."""

    def create_document(self) -> DataLoader:
        loader = CSVLoader()
        return loader


class JSONCreator(DataCreator):
    """Creator that hands out ``JSONLoader`` instances."""

    def create_document(self) -> DataLoader:
        loader = JSONLoader()
        return loader
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class ImputeStrategy(ABC):
    """Interface for strategies that replace missing values in a column."""

    @abstractmethod
    def fill_nulls(self, column : pd.Series) -> pd.Series:
        """Return ``column`` with its missing values filled."""


class MeanImputer(ImputeStrategy):
    """Fill missing values with the column mean."""

    def fill_nulls(self, column : pd.Series) -> pd.Series:
        return column.fillna(column.mean())


class MedianImputer(ImputeStrategy):
    """Fill missing values with the column median."""

    def fill_nulls(self, column : pd.Series) -> pd.Series:
        return column.fillna(column.median())


class ModeImputer(ImputeStrategy):
    """Fill missing values with the column mode."""

    def fill_nulls(self, column : pd.Series) -> pd.Series:
        """Fill with the mode; ties use the mean of the modes when numeric.

        For non-numeric columns with several modes the first one returned by
        ``Series.mode`` is used, because a mean is undefined there. A column
        without any non-null value is returned unchanged (as a copy).
        """
        modes = column.mode()
        if modes.empty:
            return column.copy()

        if len(modes) > 1 and pd.api.types.is_numeric_dtype(modes):
            fill_value = modes.mean()
        else:
            # Take the scalar: passing the mode *Series* to fillna aligns on
            # index labels and only touches rows whose label is 0, 1, ...
            fill_value = modes.iloc[0]
        return column.fillna(fill_value)


class UnknownImputer(ImputeStrategy):
    """Fill missing values of a categorical column with ``"Unknown"``."""

    def fill_nulls(self, column : pd.Series) -> pd.Series:
        """Return the column as strings with missing entries set to "Unknown".

        Raises:
            NotCategoricalError: If ``is_categorical(column)`` is false.
        """
        if not is_categorical(column):
            # The exception formats a column *name*, so pass the name rather
            # than the whole Series.
            raise NotCategoricalError(column.name)
        # Replace the missing entries before converting to str: on pandas
        # versions where astype(str) renders NaN as the text "nan", a later
        # fillna has nothing left to fill. Going through object dtype also
        # avoids the "new category" error on categorical columns.
        filled = column.astype(object).where(column.notna(), "Unknown")
        return filled.astype(str)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class EncodingStrategy(ABC):
    """Interface for strategies that encode one categorical column."""

    @abstractmethod
    def encode(self,df : pd.DataFrame, column : str, target : pd.Series = None) -> pd.DataFrame:
        """Return a DataFrame in which ``column`` has been encoded."""


class LabelEncoder(EncodingStrategy):
    """Replace each category by its integer category code."""

    def encode(self,df : pd.DataFrame, column : str, target : pd.Series = None) -> pd.DataFrame:
        encoded = df.copy()
        as_category = encoded[column].astype('category')
        encoded[column] = as_category.cat.codes
        return encoded


class OneHotEncoder(EncodingStrategy):
    """Expand the column into integer indicator columns."""

    def encode(self,df : pd.DataFrame, column : str, target : pd.Series = None) -> pd.DataFrame:
        return pd.get_dummies(df, columns=[column], dtype=int)


class TargetEncoder(EncodingStrategy):
    """Replace each category by the mean target value of that category."""

    def encode(self,df : pd.DataFrame, column : str, target : pd.Series = None) -> pd.DataFrame:
        if target is None:
            raise ValueError("You need a target column if you want to apply Target Encoder")

        helper_column = '__target_tmp__'
        working = df.copy()
        # The target is attached under a temporary name so it can be grouped
        # by the categorical column; the helper column is removed afterwards.
        working[helper_column] = target
        means_per_category = working.groupby(column)[helper_column].mean()
        working[column] = working[column].map(means_per_category)
        return working.drop(columns=[helper_column])
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class DataProcessor:
    """Wraps a DataFrame and applies preprocessing steps to it.

    Every mutating method returns ``self`` so calls can be chained.
    """

    def __init__(self,data : pd.DataFrame):
        self._data = data

    @property
    def data(self) -> pd.DataFrame:
        """The current DataFrame."""
        return self._data

    def handle_missing_values(self,column : str,strategy : "ImputeStrategy"):
        """Replace ``column`` with ``strategy.fill_nulls(column)``."""
        self._data[column] = strategy.fill_nulls(self._data[column])
        return self

    def scale_features(self,columns : List[str]):
        """Min-max scale each listed column into ``[0, 1]``.

        A column whose minimum equals its maximum is mapped to ``0.0`` rather
        than divided by zero, which would turn the whole column into NaN.
        """
        for column in columns:
            values = self._data[column]
            min_value = values.min()
            value_range = values.max() - min_value
            if value_range == 0:
                self._data[column] = (values - min_value).astype(float)
            else:
                self._data[column] = (values - min_value) / value_range
        return self

    def transform_categorical(self,column : str, strategy : "EncodingStrategy", target_column : str = None):
        """Encode ``column`` with ``strategy`` and keep the resulting frame.

        Args:
            column: Name of the column to encode.
            strategy: Object exposing ``encode(df, column, target)``.
            target_column: Optional column whose values are passed to the
                strategy as the target series.
        """
        target_series = self._data[target_column] if target_column is not None else None
        # Rebind to the encoder's result instead of calling the private
        # DataFrame._update_inplace; read the frame back through ``data``.
        self._data = strategy.encode(self._data,column,target_series)
        return self

    def split_features_target(self,target_column : str):
        """Return ``(X, y)`` arrays: all other columns, and ``target_column``."""
        X = self._data.drop(columns=[target_column]).values
        y = self._data[target_column].values
        return X,y
|
|
128
|
+
|
|
129
|
+
class DataPipeline:
    """Loads a CSV/JSON file and runs a default preprocessing sequence."""

    def __init__(self,file_path : str):
        """Pick a loader from the file extension and load the data.

        Raises:
            ValueError: If the path ends in neither ``.csv`` nor ``.json``
                (the comparison ignores letter case).
        """
        extension_source = file_path.lower()
        if extension_source.endswith(".csv"):
            creator = CSVCreator()
        elif extension_source.endswith(".json"):
            creator = JSONCreator()
        else:
            raise ValueError(f"Invalid file type : {file_path}")

        raw_data = creator.generate_document(file_path)
        self.processor = DataProcessor(raw_data)

    def run_default_preprocessing(self, target: str, categorical_columns: List[str] = None,encoding_type: str = 'label', imputer : ImputeStrategy = None):
        """Encode, impute and scale the data, then split off the target.

        Args:
            target: Name of the target column.
            categorical_columns: Columns to encode; ``None`` means none.
            encoding_type: ``'label'``, ``'onehot'`` or ``'target'``.
            imputer: Strategy for missing numeric values; a new
                ``MeanImputer`` is used when omitted.

        Returns:
            The ``(X, y)`` pair from ``DataProcessor.split_features_target``.

        Raises:
            ColumnNotFoundError: If a categorical column is not in the data.
            ValueError: If ``encoding_type`` is not one of the supported names.
        """
        if categorical_columns is None:
            categorical_columns = []
        if imputer is None:
            imputer = MeanImputer()

        for col in categorical_columns:
            if col not in self.processor.data.columns:
                raise ColumnNotFoundError(col)

            if encoding_type == 'label':
                self.processor.transform_categorical(col, LabelEncoder())
            elif encoding_type == 'onehot':
                self.processor.transform_categorical(col, OneHotEncoder())
            elif encoding_type == 'target':
                self.processor.transform_categorical(col, TargetEncoder(), target_column=target)
            else:
                # An unrecognised name used to be skipped silently, leaving
                # the column unencoded.
                raise ValueError(
                    f"Unknown encoding type : {encoding_type!r} "
                    "(expected 'label', 'onehot' or 'target')"
                )

        # Numeric columns are collected after encoding; the target is excluded
        # so it is neither imputed nor scaled.
        numeric_cols = self.processor.data.select_dtypes(include=[np.number]).columns
        numeric_features = [col for col in numeric_cols if col != target]

        for col in numeric_features:
            self.processor.handle_missing_values(col, imputer)

        self.processor.scale_features(numeric_features)

        return self.processor.split_features_target(target)
|
|
173
|
+
|
|
174
|
+
|
my_python_lib/utils.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
from functools import wraps, reduce
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import numpy as np
|
|
4
|
+
import time
|
|
5
|
+
from typing import Callable, Any
|
|
6
|
+
|
|
7
|
+
def calculate_missing_values(df : pd.DataFrame):
    """Print, for every column of ``df``, how many of its values are null."""
    print(" MISSING VALUE CALCULATION")
    for name in df.columns:
        null_count = df[name].isnull().sum()
        if null_count <= 0:
            print(f" --> {name} : Clean (No missing values)")
            continue
        print(f" --> {name} : {null_count} missing values")
|
|
16
|
+
|
|
17
|
+
def predict_stats(y_predict : np.ndarray):
    """Print min, max, mean and standard deviation of the predictions."""
    print(" == [PREDICT DISTRIBUTION STATISTICS] ==")
    # Each row pairs the printed label with the reducer that produces its value.
    rows = (
        (" Min Prediction : ", np.min),
        (" Max Prediction : ", np.max),
        (" Average Prediction : ", np.mean),
        (" Standart Deviation of the Predictions: ", np.std),
    )
    for label, reducer in rows:
        print(f"{label}{reducer(y_predict):.2f}")
|
|
23
|
+
|
|
24
|
+
def predict_success_summary(y_real : np.ndarray, y_predict : np.ndarray, threshold_error : float):
    """Print and return how many predictions fall within a relative error.

    Args:
        y_real: Observed values.
        y_predict: Predicted values, same length as ``y_real``.
        threshold_error: Maximum accepted error in percent (0 to 100).

    Returns:
        ``(successful, failed)`` counts; together they cover every sample.

    Raises:
        ValueError: If ``threshold_error`` is outside ``[0, 100]``.
    """
    if threshold_error < 0 or threshold_error > 100:
        raise ValueError("Threshold Value must be between 0 and 100")

    y_real = np.asarray(y_real, dtype=float)
    y_predict = np.asarray(y_predict, dtype=float)
    absolute_errors = np.abs(y_real - y_predict)

    # Divide by |y_real|: with a signed denominator every negative target
    # produced a negative percentage and was always counted as a success.
    with np.errstate(divide="ignore", invalid="ignore"):
        error_percentages = absolute_errors / np.abs(y_real) * 100
    # 0 / 0 (a zero target predicted exactly) is a perfect hit, not NaN.
    error_percentages = np.where(absolute_errors == 0, 0.0, error_percentages)

    success_predict_num = int(np.sum(error_percentages <= threshold_error))
    # Complement, so an error exactly on the threshold (or NaN) is not lost.
    failed_predict_num = int(error_percentages.size) - success_predict_num
    print(f"Number of Successful Predictions: {success_predict_num}")
    print(f"Number of Failed Predictions: {failed_predict_num}")
    return success_predict_num, failed_predict_num
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def series_to_ndarray(series : pd.Series) -> np.ndarray:
    """Convert every element of ``series`` to ``float`` and return an array."""
    as_floats = [float(value) for value in series]
    return np.array(as_floats)
|
|
37
|
+
|
|
38
|
+
def list_to_ndarray(list : list) -> np.ndarray:
    """Return ``np.array(list)``.

    The parameter keeps its original name (which shadows the builtin) so that
    keyword callers continue to work.
    """
    converted = np.array(list)
    return converted
|
|
40
|
+
|
|
41
|
+
def dimension_control(X : np.ndarray, y : np.ndarray):
    """Return True when ``X`` and ``y`` have equal length; else print and return False."""
    rows_in_x, rows_in_y = len(X), len(y)
    if rows_in_x == rows_in_y:
        return True

    print(f"Error : X row number ({rows_in_x}) and y row number ({rows_in_y}) are not equal")
    return False
|
|
47
|
+
|
|
48
|
+
def data_summary(X : np.ndarray):
    """Print the number of rows and columns of the 2-D array ``X``."""
    n_rows, n_columns = X.shape
    print(f"Total Rows : {n_rows}", f"Number of Parameters : {n_columns}", sep="\n")
|
|
52
|
+
|
|
53
|
+
def shuffle_data(X : np.ndarray, y : np.ndarray, seed=None):
    """Return ``X`` and ``y`` reordered by one shared random permutation.

    Args:
        X: Samples (array or nested sequence), indexed along the first axis.
        y: Labels with the same length as ``X``.
        seed: Optional seed for a reproducible shuffle. When omitted NumPy's
            global random state is used, as before.

    Returns:
        ``(X_shuffled, y_shuffled)`` as NumPy arrays.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length.
    """
    # Fancy indexing below needs arrays; plain lists would raise TypeError.
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        # Without this check a longer ``y`` was silently truncated.
        raise ValueError(
            f"X has {len(X)} rows but y has {len(y)} rows"
        )

    if seed is None:
        random_orders = np.random.permutation(len(X))
    else:
        random_orders = np.random.default_rng(seed).permutation(len(X))
    return X[random_orders], y[random_orders]
|
|
56
|
+
|
|
57
|
+
def log_operation(func : Callable) -> Callable:
    """Decorator that prints start, end, duration and failure of ``func``.

    The wrapped function's return value is passed through; an exception is
    reported and then re-raised unchanged.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        print(f"[LOG] {func.__name__} process started")
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by wall-clock adjustments.
        start_time = time.perf_counter()

        try:
            result = func(*args, **kwargs)
        except Exception as e:
            print(f"[LOG] {func.__name__} failed : {e}")
            # Bare raise re-raises the active exception as-is.
            raise

        elapsed = time.perf_counter() - start_time
        print(f"[LOG] {func.__name__} ended successfully")
        print(f"Time Taken : {elapsed:.4f}s")
        return result

    return wrapper
|
|
75
|
+
|
|
76
|
+
def apply_pipeline(data : np.ndarray, transformations : list[Callable]) -> np.ndarray:
    """Feed ``data`` through each callable in order and return the final result."""
    current = data
    for transform in transformations:
        current = transform(current)
    return current
|
|
78
|
+
|
|
79
|
+
def calculate_accuracy(real, prediction):
    """Print and return the fraction of predictions equal to the real labels.

    Args:
        real: True labels (array or plain sequence).
        prediction: Predicted labels with the same number of elements.

    Returns:
        Accuracy as a float between 0 and 1.

    Raises:
        ValueError: If the two inputs hold a different number of elements.
    """
    # Convert and flatten first: comparing two lists with == yields a single
    # bool, and an (m,) vs (m, 1) pair would broadcast to an (m, m) matrix.
    real = np.asarray(real).ravel()
    prediction = np.asarray(prediction).ravel()
    if real.size != prediction.size:
        raise ValueError(
            f"real has {real.size} elements but prediction has {prediction.size}"
        )

    accuracy = float(np.mean(real == prediction))
    print(f"Accuracy rate : %{accuracy * 100:.2f}")
    return accuracy
|
|
82
|
+
|
|
83
|
+
def is_categorical(column : pd.Series) -> bool:
    """Tell whether ``column`` has an object, category or string dtype."""
    dtype_names = ("object", "category")
    named_match = any(column.dtype == name for name in dtype_names)
    return named_match or pd.api.types.is_string_dtype(column)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class NotCategoricalError(Exception):
    """Raised when a column is required to be categorical but is not."""

    def __init__(self, column : str):
        message = f"Column '{column}' is not categorical"
        super().__init__(message)
        # Kept on the instance so handlers can inspect the offending column.
        self.column = column
|
|
94
|
+
|
|
95
|
+
def is_multiple_mode(column : pd.Series) -> bool:
    """Return True when ``column.mode()`` yields more than one value."""
    mode_count = len(column.mode())
    return mode_count > 1
|
|
99
|
+
|
|
100
|
+
class ColumnNotFoundError(Exception):
    """Raised when a requested column does not exist in the data."""

    def __init__(self, column : str):
        message = f"Column '{column}' is not found in the data"
        super().__init__(message)
        # Kept on the instance so handlers can inspect the missing column.
        self.column = column
|
|
104
|
+
|
|
105
|
+
def numeric_check(data : pd.DataFrame):
    """Print whether each column is numeric, then the count of non-numeric ones."""
    non_numeric_total = 0
    for name in data.columns:
        if not pd.api.types.is_numeric_dtype(data[name]):
            print(f"--> {name} : Not Numeric Column")
            non_numeric_total += 1
            continue
        print(f"--> {name} : Numeric Column")

    print(f"The Number of Non-Numeric Columns : {non_numeric_total}")
|
|
115
|
+
|
|
116
|
+
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: my_python_lib-tarik
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A modular, object-oriented framework for machine learning and data preprocessing
|
|
5
|
+
Author-email: Mustafa Tarık Kocabıyık <mtarikkb@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
11
|
+
Requires-Python: >=3.8
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: numpy>=2.4.4
|
|
14
|
+
Requires-Dist: pandas>=3.0.2
|
|
15
|
+
|
|
16
|
+
# Machine Learning & Data Preprocessing Library
|
|
17
|
+
|
|
18
|
+
## Introduction
|
|
19
|
+
|
|
20
|
+
This is a simple machine learning library consisting of Linear Regression, a KNN Classifier, and several data preprocessing algorithms, all implemented from scratch on top of the numpy and pandas libraries.
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Mapping Core Learning Outcomes
|
|
25
|
+
|
|
26
|
+
The 6 required patterns were applied appropriately in the project. Every single one is explained below.
|
|
27
|
+
|
|
28
|
+
### 1. Object-Oriented Programming (OOP)
|
|
29
|
+
- **Where**: In `core.py` and `data.py`.
|
|
30
|
+
- **How**:
|
|
31
|
+
- **Inheritance & Abstraction**: Employs abstract base classes (`BaseAlgorithm`, `RegressionStrategy`, `DistanceMetric`, `DataLoader`, `DataCreator`, `ImputeStrategy`, `EncodingStrategy`) to enforce blueprints.
|
|
32
|
+
- **Polymorphism**: Concrete implementations dynamically substitute base behavior. For example, `LinearRegression` executes `.train()` polymorphic actions via different assigned regression strategies without altering its own structure.
|
|
33
|
+
- **Encapsulation**: State variables are protected internally. In `data.py`, the raw dataframe is hidden behind a protected attribute `self._data` and managed safely using the `@property` getter.
|
|
34
|
+
|
|
35
|
+
### 2. Functional Programming
|
|
36
|
+
- **Where**: In `core.py` and `utils.py`.
|
|
37
|
+
- **How**:
|
|
38
|
+
- **Pure Functions & Lambda**: `evaluate_model` avoids modifying external states and relies entirely on input arguments, calculating mean squared errors via a clean pure lambda routine.
|
|
39
|
+
- **Higher-Order Functions & Map/Reduce**:
|
|
40
|
+
- `reduce` combined with `lambda` is used inside `evaluate_model` to sum squared errors.
|
|
41
|
+
- `map` is used inside `series_to_ndarray` to cast each value of a pandas Series to `float`.
|
|
42
|
+
- `apply_pipeline` utilizes `reduce` to sequentially compose list-based transformation callables across data boundaries (`reduce(lambda d, func: func(d), transformations, data)`).
|
|
43
|
+
|
|
44
|
+
### 3. Concurrency (Multi-threading)
|
|
45
|
+
- **Where**: Implemented in `core.py` inside the `KNNClassifier` class.
|
|
46
|
+
- **How**:
|
|
47
|
+
- Predicting classes for massive feature maps sequentially is computationally bound. The `predict` method generates individual `threading.Thread` operations for every distinct evaluation sample.
|
|
48
|
+
- The `_predict_single` worker calculates specific row-by-row matrix operations concurrently, storing structural outputs inside a shared pre-allocated numpy results matrix (`results[index]`).
|
|
49
|
+
- Thread control structures utilize `t.start()` loops followed by systematic `t.join()` barriers to synchronize and block primary execution until parallel estimations conclude safely.
|
|
50
|
+
|
|
51
|
+
### 4. Recursion / Dynamic Programming
|
|
52
|
+
- **Where**: In `core.py` inside the `EuclideanDistance` class.
|
|
53
|
+
- **How**:
|
|
54
|
+
- Distance metrics typically resolve dimensions via nested iterative syntax or high-level library functions. This implementation achieves element-wise vector difference accumulations via a custom recursive function `recursive_sum_sq(a, b, idx)`.
|
|
55
|
+
- It recursively accumulates the squared element-wise differences index by index until it reaches the base case (`idx == len(a)`, which contributes 0); `calculate` then returns the square root of that accumulated sum.
|
|
56
|
+
|
|
57
|
+
### 5. SOLID Principles
|
|
58
|
+
- **Where**: In `core.py` and `data.py`.
|
|
59
|
+
- **How**:
|
|
60
|
+
- **Single Responsibility Principle (SRP)**: Classes do exactly one thing. `CSVLoader` only ingests data streams; `MeanImputer` strictly provides missing value fillings; `DataProcessor` focuses on data manipulation.
|
|
61
|
+
- **Open/Closed Principle (OCP)**: The system is open for extension but closed for modification. Introducing a new distance metric (e.g., Cosine Distance) requires subclassing `DistanceMetric` without touching `KNNClassifier`.
|
|
62
|
+
- **Liskov Substitution Principle (LSP)**: Derived classes are completely interchangeable with their abstractions. Any encoder (`LabelEncoder`, `OneHotEncoder`, `TargetEncoder`) fulfills the signature constraints expected by `DataProcessor`.
|
|
63
|
+
- **Interface Segregation Principle (ISP)**: Interfaces remain lean and decoupled. `RegressionStrategy` enforces a single clear contractual point (`train`), avoiding bulky, unrelated structural configurations.
|
|
64
|
+
- **Dependency Inversion Principle (DIP)**: High-level objects depend on abstractions rather than low-level concrete modules. `LinearRegression` binds entirely against the `RegressionStrategy` interface, decoupling model training mechanisms from specific analytical algorithms.
|
|
65
|
+
|
|
66
|
+
### 6. Architectural & Design Patterns
|
|
67
|
+
- **Where**: Full design of `data.py` and `core.py`.
|
|
68
|
+
- **How**:
|
|
69
|
+
- **Pipeline Architecture**: Managed by `DataPipeline` which neatly bridges file checking, concrete factory creation, loading, and structured feature preparation routines into a uniform linear API stream (`run_default_preprocessing`).
|
|
70
|
+
- **Strategy Pattern**: Implemented multiple times to provide interchangeable components:
|
|
71
|
+
- Optimization algorithms in `LinearRegression` via `LeastSquaresStrategy` and `GradientDescentStrategy`.
|
|
72
|
+
- Distance formulations in `KNNClassifier` via `EuclideanDistance` and `ManhattanDistance`.
|
|
73
|
+
- Data imputation in `DataProcessor` via `MeanImputer`, `MedianImputer`, and `ModeImputer`.
|
|
74
|
+
- Variable transformations via `LabelEncoder`, `OneHotEncoder`, and `TargetEncoder`.
|
|
75
|
+
- **Factory Method Pattern**: Used to create appropriate data loaders without binding to concrete files. `DataCreator` acts as the creator interface, declaring `create_document()`. Concrete implementations `CSVCreator` and `JSONCreator` override this method to instantiate and return `CSVLoader` or `JSONLoader` respectively, abstracting the instantiation process away from the main pipeline.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
my_python_lib/__init__.py,sha256=ZpKvwB1WJ-Ob6EN5MNebKX6ynJToZ7K_lnMm9uOPfOc,1693
|
|
2
|
+
my_python_lib/core.py,sha256=pBw6adxZkLto-XtIXK85tht_XaS9PRp1H01HaiYWJBY,3420
|
|
3
|
+
my_python_lib/data.py,sha256=quE83V4RmhzFafCHhpxw-OV6CGnkf7Bzevgpg59kfn4,6173
|
|
4
|
+
my_python_lib/utils.py,sha256=BKaajwmJQbPMAnRxC5HPf761x9zv6TY0xXa3r6MVtGs,4057
|
|
5
|
+
my_python_lib_tarik-0.1.0.dist-info/METADATA,sha256=zD8LMyaZZff_yprgTw-F_Emq2YsW6Xye5kkJdKgPn90,6213
|
|
6
|
+
my_python_lib_tarik-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
7
|
+
my_python_lib_tarik-0.1.0.dist-info/top_level.txt,sha256=EQh17lpvMzIpLwWuAX-UrXqCvw3vHOjBMq-MvXl4pvM,14
|
|
8
|
+
my_python_lib_tarik-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
my_python_lib
|