mak-mini-ml 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mak_mini_ml-0.1.1/PKG-INFO +182 -0
- mak_mini_ml-0.1.1/README.md +169 -0
- mak_mini_ml-0.1.1/pyproject.toml +36 -0
- mak_mini_ml-0.1.1/setup.cfg +4 -0
- mak_mini_ml-0.1.1/src/Custom_ML_Suite/__init__.py +21 -0
- mak_mini_ml-0.1.1/src/Custom_ML_Suite/activations.py +115 -0
- mak_mini_ml-0.1.1/src/Custom_ML_Suite/distances.py +127 -0
- mak_mini_ml-0.1.1/src/Custom_ML_Suite/linear_model.py +162 -0
- mak_mini_ml-0.1.1/src/Custom_ML_Suite/metrics.py +340 -0
- mak_mini_ml-0.1.1/src/Custom_ML_Suite/model_selection.py +176 -0
- mak_mini_ml-0.1.1/src/Custom_ML_Suite/neighbors.py +89 -0
- mak_mini_ml-0.1.1/src/Custom_ML_Suite/preprocessing.py +173 -0
- mak_mini_ml-0.1.1/src/Custom_ML_Suite/stats.py +216 -0
- mak_mini_ml-0.1.1/src/Custom_ML_Suite/tree.py +502 -0
- mak_mini_ml-0.1.1/src/mak_mini_ml.egg-info/PKG-INFO +182 -0
- mak_mini_ml-0.1.1/src/mak_mini_ml.egg-info/SOURCES.txt +17 -0
- mak_mini_ml-0.1.1/src/mak_mini_ml.egg-info/dependency_links.txt +1 -0
- mak_mini_ml-0.1.1/src/mak_mini_ml.egg-info/top_level.txt +1 -0
- mak_mini_ml-0.1.1/tests/test_all.py +38 -0
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mak-mini-ml
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Beginner-friendly Machine Learning utility library built from scratch using pure Python
|
|
5
|
+
Author: Aryan Kakade, Kishor Handge
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: machine-learning,python,ml,statistics,data-science,educational
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.8
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# Custom_ML_Suite
|
|
15
|
+
|
|
16
|
+
Custom_ML_Suite is a beginner-friendly Machine Learning utility library built completely from scratch using pure Python.
|
|
17
|
+
|
|
18
|
+
This project focuses on understanding the mathematical foundations of Machine Learning by manually implementing core ML algorithms, preprocessing methods, distance metrics, activation functions, statistical operations, and evaluation metrics without using external ML libraries like scikit-learn.
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
# Features
|
|
23
|
+
|
|
24
|
+
- Pure Python implementation
|
|
25
|
+
- Beginner-friendly code structure
|
|
26
|
+
- Mathematical formulas included
|
|
27
|
+
- Well-commented educational code
|
|
28
|
+
- Modular package structure
|
|
29
|
+
- ML utilities from scratch
|
|
30
|
+
- Edge-case handling
|
|
31
|
+
- Easy to understand and extend
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
# Modules Included
|
|
36
|
+
|
|
37
|
+
## 1. activations.py
|
|
38
|
+
Activation functions used in neural networks.
|
|
39
|
+
|
|
40
|
+
### Functions
|
|
41
|
+
- sigmoid()
|
|
42
|
+
- relu()
|
|
43
|
+
- tanh()
|
|
44
|
+
- softmax()
|
|
45
|
+
- log_loss()
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## 2. distances.py
|
|
50
|
+
Distance and similarity metrics.
|
|
51
|
+
|
|
52
|
+
### Functions
|
|
53
|
+
- euclidean_distance()
|
|
54
|
+
- manhattan_distance()
|
|
55
|
+
- minkowski_distance()
|
|
56
|
+
- cosine_similarity()
|
|
57
|
+
- hamming_distance()
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## 3. preprocessing.py
|
|
62
|
+
Data preprocessing and scaling methods.
|
|
63
|
+
|
|
64
|
+
### Functions
|
|
65
|
+
- standardization()
|
|
66
|
+
- mean()
|
|
67
|
+
- min_max_scaling()
|
|
68
|
+
- range()
|
|
69
|
+
- normalization()
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## 4. stats.py
|
|
74
|
+
Basic statistical operations.
|
|
75
|
+
|
|
76
|
+
### Functions
|
|
77
|
+
- mean()
|
|
78
|
+
- variance()
|
|
79
|
+
- std_dev()
|
|
80
|
+
- covariance()
|
|
81
|
+
- correlation()
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## 5. metrics.py
|
|
86
|
+
Machine Learning evaluation metrics.
|
|
87
|
+
|
|
88
|
+
### Functions
|
|
89
|
+
- accuracy_score()
|
|
90
|
+
- precision_score()
|
|
91
|
+
- recall_score()
|
|
92
|
+
- f1_score()
|
|
93
|
+
- confusion_matrix()
|
|
94
|
+
- mean_absolute_error()
|
|
95
|
+
- mean_squared_error()
|
|
96
|
+
- root_mean_squared_error()
|
|
97
|
+
- r2_score()
|
|
98
|
+
- binary_crossentropy()
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## 6. model_selection.py
|
|
103
|
+
Dataset splitting and validation utilities.
|
|
104
|
+
|
|
105
|
+
### Functions
|
|
106
|
+
- train_test_split()
|
|
107
|
+
- shuffle_data()
|
|
108
|
+
- batch_iterator()
|
|
109
|
+
- k_fold_split()
|
|
110
|
+
- stratified_split()
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## 7. linear_model.py
|
|
115
|
+
Basic regression models and optimization.
|
|
116
|
+
|
|
117
|
+
### Functions
|
|
118
|
+
- linear_regression()
|
|
119
|
+
- predict()
|
|
120
|
+
- gradient_descent()
|
|
121
|
+
- logistic_regression()
|
|
122
|
+
- logistic_update()
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## 8. neighbors.py
|
|
127
|
+
K-Nearest Neighbors utilities.
|
|
128
|
+
|
|
129
|
+
### Functions
|
|
130
|
+
- knn_distance()
|
|
131
|
+
- knn_predict()
|
|
132
|
+
- probability()
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## 9. tree.py
|
|
137
|
+
Basic Decision Tree utilities.
|
|
138
|
+
|
|
139
|
+
### Functions
|
|
140
|
+
- gini_impurity()
|
|
141
|
+
- entropy()
|
|
142
|
+
- information_gain()
|
|
143
|
+
- best_split()
|
|
144
|
+
- build_tree()
|
|
145
|
+
- predict_tree()
|
|
146
|
+
- majority_vote()
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
# Project Structure
|
|
151
|
+
|
|
152
|
+
```bash
|
|
153
|
+
Custom_ML_Suite/
|
|
154
|
+
│
|
|
155
|
+
├── examples/
|
|
156
|
+
│ ├── demo.py
|
|
157
|
+
│ └── Formula.py
|
|
158
|
+
│
|
|
159
|
+
├── src/
|
|
160
|
+
│ └── Custom_ML_Suite/
|
|
161
|
+
│ ├── __init__.py
|
|
162
|
+
│ ├── activations.py
|
|
163
|
+
│ ├── distances.py
|
|
164
|
+
│ ├── linear_model.py
|
|
165
|
+
│ ├── metrics.py
|
|
166
|
+
│ ├── model_selection.py
|
|
167
|
+
│ ├── neighbors.py
|
|
168
|
+
│ ├── preprocessing.py
|
|
169
|
+
│ ├── stats.py
|
|
170
|
+
│ └── tree.py
|
|
171
|
+
│
|
|
172
|
+
├── tests/
|
|
173
|
+
│ └── test_all.py
|
|
174
|
+
│
|
|
175
|
+
├── README.md
|
|
176
|
+
├── pyproject.toml
|
|
177
|
+
└── .gitignore
```
|
|
178
|
+
|
|
179
|
+
# Authors
|
|
180
|
+
|
|
181
|
+
Aryan Kakade
|
|
182
|
+
Kishor Handge
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
# Custom_ML_Suite
|
|
2
|
+
|
|
3
|
+
Custom_ML_Suite is a beginner-friendly Machine Learning utility library built completely from scratch using pure Python.
|
|
4
|
+
|
|
5
|
+
This project focuses on understanding the mathematical foundations of Machine Learning by manually implementing core ML algorithms, preprocessing methods, distance metrics, activation functions, statistical operations, and evaluation metrics without using external ML libraries like scikit-learn.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Features
|
|
10
|
+
|
|
11
|
+
- Pure Python implementation
|
|
12
|
+
- Beginner-friendly code structure
|
|
13
|
+
- Mathematical formulas included
|
|
14
|
+
- Well-commented educational code
|
|
15
|
+
- Modular package structure
|
|
16
|
+
- ML utilities from scratch
|
|
17
|
+
- Edge-case handling
|
|
18
|
+
- Easy to understand and extend
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
# Modules Included
|
|
23
|
+
|
|
24
|
+
## 1. activations.py
|
|
25
|
+
Activation functions used in neural networks.
|
|
26
|
+
|
|
27
|
+
### Functions
|
|
28
|
+
- sigmoid()
|
|
29
|
+
- relu()
|
|
30
|
+
- tanh()
|
|
31
|
+
- softmax()
|
|
32
|
+
- log_loss()
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## 2. distances.py
|
|
37
|
+
Distance and similarity metrics.
|
|
38
|
+
|
|
39
|
+
### Functions
|
|
40
|
+
- euclidean_distance()
|
|
41
|
+
- manhattan_distance()
|
|
42
|
+
- minkowski_distance()
|
|
43
|
+
- cosine_similarity()
|
|
44
|
+
- hamming_distance()
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## 3. preprocessing.py
|
|
49
|
+
Data preprocessing and scaling methods.
|
|
50
|
+
|
|
51
|
+
### Functions
|
|
52
|
+
- standardization()
|
|
53
|
+
- mean()
|
|
54
|
+
- min_max_scaling()
|
|
55
|
+
- range()
|
|
56
|
+
- normalization()
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## 4. stats.py
|
|
61
|
+
Basic statistical operations.
|
|
62
|
+
|
|
63
|
+
### Functions
|
|
64
|
+
- mean()
|
|
65
|
+
- variance()
|
|
66
|
+
- std_dev()
|
|
67
|
+
- covariance()
|
|
68
|
+
- correlation()
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
## 5. metrics.py
|
|
73
|
+
Machine Learning evaluation metrics.
|
|
74
|
+
|
|
75
|
+
### Functions
|
|
76
|
+
- accuracy_score()
|
|
77
|
+
- precision_score()
|
|
78
|
+
- recall_score()
|
|
79
|
+
- f1_score()
|
|
80
|
+
- confusion_matrix()
|
|
81
|
+
- mean_absolute_error()
|
|
82
|
+
- mean_squared_error()
|
|
83
|
+
- root_mean_squared_error()
|
|
84
|
+
- r2_score()
|
|
85
|
+
- binary_crossentropy()
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
## 6. model_selection.py
|
|
90
|
+
Dataset splitting and validation utilities.
|
|
91
|
+
|
|
92
|
+
### Functions
|
|
93
|
+
- train_test_split()
|
|
94
|
+
- shuffle_data()
|
|
95
|
+
- batch_iterator()
|
|
96
|
+
- k_fold_split()
|
|
97
|
+
- stratified_split()
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
## 7. linear_model.py
|
|
102
|
+
Basic regression models and optimization.
|
|
103
|
+
|
|
104
|
+
### Functions
|
|
105
|
+
- linear_regression()
|
|
106
|
+
- predict()
|
|
107
|
+
- gradient_descent()
|
|
108
|
+
- logistic_regression()
|
|
109
|
+
- logistic_update()
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## 8. neighbors.py
|
|
114
|
+
K-Nearest Neighbors utilities.
|
|
115
|
+
|
|
116
|
+
### Functions
|
|
117
|
+
- knn_distance()
|
|
118
|
+
- knn_predict()
|
|
119
|
+
- probability()
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## 9. tree.py
|
|
124
|
+
Basic Decision Tree utilities.
|
|
125
|
+
|
|
126
|
+
### Functions
|
|
127
|
+
- gini_impurity()
|
|
128
|
+
- entropy()
|
|
129
|
+
- information_gain()
|
|
130
|
+
- best_split()
|
|
131
|
+
- build_tree()
|
|
132
|
+
- predict_tree()
|
|
133
|
+
- majority_vote()
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
# Project Structure
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
Custom_ML_Suite/
|
|
141
|
+
│
|
|
142
|
+
├── examples/
|
|
143
|
+
│ ├── demo.py
|
|
144
|
+
│ └── Formula.py
|
|
145
|
+
│
|
|
146
|
+
├── src/
|
|
147
|
+
│ └── Custom_ML_Suite/
|
|
148
|
+
│ ├── __init__.py
|
|
149
|
+
│ ├── activations.py
|
|
150
|
+
│ ├── distances.py
|
|
151
|
+
│ ├── linear_model.py
|
|
152
|
+
│ ├── metrics.py
|
|
153
|
+
│ ├── model_selection.py
|
|
154
|
+
│ ├── neighbors.py
|
|
155
|
+
│ ├── preprocessing.py
|
|
156
|
+
│ ├── stats.py
|
|
157
|
+
│ └── tree.py
|
|
158
|
+
│
|
|
159
|
+
├── tests/
|
|
160
|
+
│ └── test_all.py
|
|
161
|
+
│
|
|
162
|
+
├── README.md
|
|
163
|
+
├── pyproject.toml
|
|
164
|
+
└── .gitignore
```
|
|
165
|
+
|
|
166
|
+
# Authors
|
|
167
|
+
|
|
168
|
+
Aryan Kakade
|
|
169
|
+
Kishor Handge
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "mak-mini-ml"
|
|
7
|
+
version = "0.1.1"
|
|
8
|
+
description = "Beginner-friendly Machine Learning utility library built from scratch using pure Python"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
authors = [
|
|
12
|
+
{name = "Aryan Kakade"},
|
|
13
|
+
{name = "Kishor Handge"}
|
|
14
|
+
]
|
|
15
|
+
license = {text = "MIT"}
|
|
16
|
+
|
|
17
|
+
keywords = [
|
|
18
|
+
"machine-learning",
|
|
19
|
+
"python",
|
|
20
|
+
"ml",
|
|
21
|
+
"statistics",
|
|
22
|
+
"data-science",
|
|
23
|
+
"educational"
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
classifiers = [
|
|
27
|
+
"Programming Language :: Python :: 3",
|
|
28
|
+
"License :: OSI Approved :: MIT License",
|
|
29
|
+
"Operating System :: OS Independent",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[tool.setuptools]
|
|
33
|
+
package-dir = {"" = "src"}
|
|
34
|
+
|
|
35
|
+
[tool.setuptools.packages.find]
|
|
36
|
+
where = ["src"]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# src/Custom_ML_Suite/__init__.py
|
|
2
|
+
|
|
3
|
+
# metrics.py → evaluation
|
|
4
|
+
# model_selection.py → data splitting
|
|
5
|
+
# distances.py → math for KNN
|
|
6
|
+
# activations.py → ML math functions
|
|
7
|
+
# preprocessing.py → scaling
|
|
8
|
+
# stats.py → statistics
|
|
9
|
+
# linear_model.py → regression models
|
|
10
|
+
# neighbors.py → KNN
|
|
11
|
+
# tree.py → decision tree
|
|
12
|
+
|
|
13
|
+
from .metrics import *
|
|
14
|
+
from .model_selection import *
|
|
15
|
+
from .distances import *
|
|
16
|
+
from .activations import *
|
|
17
|
+
from .preprocessing import *
|
|
18
|
+
from .stats import *
|
|
19
|
+
from .linear_model import *
|
|
20
|
+
from .neighbors import *
|
|
21
|
+
from .tree import *
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# sigmoid
|
|
2
|
+
# relu
|
|
3
|
+
# tanh
|
|
4
|
+
# softmax
|
|
5
|
+
# log_loss
|
|
6
|
+
|
|
7
|
+
import math
|
|
8
|
+
|
|
9
|
+
# --------------------------------------------------
|
|
10
|
+
# ReLU activation
|
|
11
|
+
# Used in neural networks
|
|
12
|
+
# Returns 0 if input is negative, else returns same value
|
|
13
|
+
# Range: [0, ∞)
|
|
14
|
+
# --------------------------------------------------
|
|
15
|
+
def relu(x):
    """Rectified Linear Unit: keep positive inputs, clamp everything else to 0.

    Args:
        x: A single numeric value.

    Returns:
        ``x`` unchanged when ``x > 0``; otherwise the integer ``0``.
    """
    return x if x > 0 else 0
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# --------------------------------------------------
|
|
25
|
+
# Sigmoid activation
|
|
26
|
+
# Converts input into probability
|
|
27
|
+
# Range: (0, 1)
|
|
28
|
+
# Used in binary classification
|
|
29
|
+
# --------------------------------------------------
|
|
30
|
+
def sigmoid(x):
    """Logistic sigmoid, 1 / (1 + e^-x), evaluated in an overflow-safe way.

    The exponential is always taken of a non-positive number, so it stays
    within [0, 1] and ``math.exp`` cannot overflow for large ``|x|``.

    Args:
        x: A single numeric value.

    Returns:
        A float between 0 and 1.
    """
    if x < 0:
        # Negative input: rewrite as e^x / (1 + e^x) so the exponent is negative.
        exp_x = math.exp(x)
        return exp_x / (1 + exp_x)

    # Non-negative input: the textbook form already uses a negative exponent.
    exp_neg_x = math.exp(-x)
    return 1 / (1 + exp_neg_x)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# --------------------------------------------------
|
|
46
|
+
# Tanh activation
|
|
47
|
+
# Similar to sigmoid but centered at 0
|
|
48
|
+
# Range: (-1, 1)
|
|
49
|
+
# Better for hidden layers
|
|
50
|
+
# --------------------------------------------------
|
|
51
|
+
def tanh(x):
    """Hyperbolic tangent of ``x``; a thin wrapper around ``math.tanh``.

    Args:
        x: A single numeric value.

    Returns:
        The value ``math.tanh(x)``.
    """
    squashed = math.tanh(x)
    return squashed
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# --------------------------------------------------
|
|
57
|
+
# Softmax function
|
|
58
|
+
# Converts list of values into probabilities
|
|
59
|
+
# Output sum = 1
|
|
60
|
+
# Used in multi-class classification
|
|
61
|
+
# --------------------------------------------------
|
|
62
|
+
def softmax(x_list):
    """Turn a sequence of scores into probabilities that sum to 1.

    Args:
        x_list: Non-empty sequence of numeric scores.

    Returns:
        A list of floats, one per input score, in the same order.

    Raises:
        ValueError: If ``x_list`` is empty.
    """
    if len(x_list) == 0:
        raise ValueError("Empty list")

    # Subtracting the largest score keeps every exponent <= 0, so math.exp
    # cannot overflow; the shift cancels out in the ratio below.
    peak = max(x_list)
    shifted_exps = [math.exp(score - peak) for score in x_list]
    normalizer = sum(shifted_exps)

    return [term / normalizer for term in shifted_exps]
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# --------------------------------------------------
|
|
87
|
+
# Log Loss (Binary Cross Entropy)
|
|
88
|
+
# Measures error between true and predicted values
|
|
89
|
+
# Lower value = better model
|
|
90
|
+
# Used in classification problems
|
|
91
|
+
# --------------------------------------------------
|
|
92
|
+
def log_loss(y_true, y_pred):
    """Mean binary cross-entropy between labels and predicted probabilities.

    Args:
        y_true: Sequence of true labels (the formula treats them as 0/1 weights).
        y_pred: Sequence of predicted probabilities, same length as ``y_true``.

    Returns:
        The negated average of
        ``y * log(p) + (1 - y) * log(1 - p)`` over all samples.

    Raises:
        ValueError: If the two sequences differ in length, or are empty.
    """
    n_samples = len(y_true)

    if n_samples != len(y_pred):
        raise ValueError("Length mismatch")
    if n_samples == 0:
        raise ValueError("Empty array")

    # Predictions are clipped into [tiny, 1 - tiny] so neither log() call
    # ever receives 0.
    tiny = 1e-15
    log_likelihood = 0

    for truth, raw_pred in zip(y_true, y_pred):
        clipped = max(min(raw_pred, 1 - tiny), tiny)
        log_likelihood += (
            truth * math.log(clipped) +
            (1 - truth) * math.log(1 - clipped)
        )

    return -log_likelihood / n_samples
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
import math
|
|
2
|
+
|
|
3
|
+
# euclidean_distance
|
|
4
|
+
# manhattan_distance
|
|
5
|
+
# minkowski_distance
|
|
6
|
+
# cosine_similarity
|
|
7
|
+
# hamming_distance
|
|
8
|
+
|
|
9
|
+
# | p | Distance Type |
|
|
10
|
+
# | - | ------------- |
|
|
11
|
+
# | 1 | Manhattan |
|
|
12
|
+
# | 2 | Euclidean |
|
|
13
|
+
# | ∞ | Chebyshev |
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Hamming distance: number of positions where values differ
|
|
18
|
+
# d = Σ (xi != yi)
|
|
19
|
+
def hamming_distance(x, y):
    """Count the positions at which two equal-length sequences differ.

    Args:
        x: First sequence.
        y: Second sequence, same length as ``x``.

    Returns:
        The number of indices ``i`` where ``x[i] != y[i]`` (an int).

    Raises:
        ValueError: If the sequences differ in length.
    """
    if len(x) != len(y):
        raise ValueError("Length mismatch")

    return sum(1 for left, right in zip(x, y) if left != right)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# Euclidean Distance: straight-line distance between two points in space
|
|
41
|
+
# d = sqrt(Σ (xi - yi)^2)
|
|
42
|
+
def euclidean_distance(x, y):
    """Straight-line distance between two points: sqrt(sum((xi - yi)^2)).

    Args:
        x: First point, a sequence of numbers.
        y: Second point, same length as ``x``.

    Returns:
        The distance as a float (``0.0`` for two empty sequences).

    Raises:
        ValueError: If the sequences differ in length.
    """
    if len(x) != len(y):
        raise ValueError("Length mismatch")

    squared_gaps = ((a - b) ** 2 for a, b in zip(x, y))
    return math.sqrt(sum(squared_gaps))
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# Manhattan Distance: sum of absolute differences between coordinates (grid-like path)
|
|
61
|
+
# d = Σ |xi - yi|
|
|
62
|
+
def manhattan_distance(x, y):
    """Grid-path distance between two points: sum(|xi - yi|).

    Args:
        x: First point, a sequence of numbers.
        y: Second point, same length as ``x``.

    Returns:
        The sum of absolute coordinate differences (``0`` for empty input).

    Raises:
        ValueError: If the sequences differ in length.
    """
    if len(x) != len(y):
        raise ValueError("Length mismatch")

    return sum(abs(a - b) for a, b in zip(x, y))
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# Cosine Similarity: measures how similar two vectors are based on the angle between them
|
|
79
|
+
# cos(θ) = (Σ xi*yi) / (sqrt(Σ xi^2) * sqrt(Σ yi^2))
|
|
80
|
+
|
|
81
|
+
# Minimum angle → Maximum similarity
|
|
82
|
+
# Maximum angle → Minimum similarity
|
|
83
|
+
def cosine_similarity(x, y):
    """Cosine of the angle between two vectors.

    Computed as ``dot(x, y) / (|x| * |y|)``.

    Args:
        x: First vector, a sequence of numbers.
        y: Second vector, same length as ``x``.

    Returns:
        The similarity as a float, or the integer ``0`` when the product of
        the two magnitudes is zero (e.g. either vector is all zeros).

    Raises:
        ValueError: If the sequences differ in length.
    """
    if len(x) != len(y):
        raise ValueError("Length mismatch")

    inner = sum(a * b for a, b in zip(x, y))
    norm_x = math.sqrt(sum(a ** 2 for a in x))
    norm_y = math.sqrt(sum(b ** 2 for b in y))

    scale = norm_x * norm_y
    if scale == 0:
        # The angle is undefined for a zero-length vector; report no similarity.
        return 0

    return inner / scale
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
# Minkowski Distance: generalized distance formula that includes Euclidean and Manhattan as special cases
|
|
110
|
+
# d = (Σ |xi - yi|^p)^(1/p)
|
|
111
|
+
|
|
112
|
+
def minkowski_distance(x, y, p):
    """Minkowski distance of order ``p``: (sum(|xi - yi|^p))^(1/p).

    ``p = 1`` gives the Manhattan distance and ``p = 2`` the Euclidean
    distance. ``p = float('inf')`` gives the Chebyshev distance
    ``max(|xi - yi|)``, which is the limit of the formula as p grows.

    Args:
        x: First point, a sequence of numbers.
        y: Second point, same length as ``x``.
        p: Order of the distance; must be greater than 0. May be ``math.inf``.

    Returns:
        The distance as a float (``0.0`` for two empty sequences).

    Raises:
        ValueError: If the sequences differ in length, or if ``p <= 0``.
    """
    if len(x) != len(y):
        raise ValueError("Length mismatch")

    if p <= 0:
        raise ValueError("p must be greater than 0")

    gaps = [abs(a - b) for a, b in zip(x, y)]

    if p == math.inf:
        # The general formula breaks down here: 1/p becomes 0.0, and anything
        # raised to the power 0.0 is 1.0. Use the limit (largest gap) instead.
        return float(max(gaps, default=0))

    return sum(gap ** p for gap in gaps) ** (1 / p)
|