matplotliblib 0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matplotliblib-0.1/PKG-INFO +12 -0
- matplotliblib-0.1/README.md +0 -0
- matplotliblib-0.1/matplotliblib/__init__.py +14 -0
- matplotliblib-0.1/matplotliblib/main.py +785 -0
- matplotliblib-0.1/matplotliblib.egg-info/PKG-INFO +12 -0
- matplotliblib-0.1/matplotliblib.egg-info/SOURCES.txt +8 -0
- matplotliblib-0.1/matplotliblib.egg-info/dependency_links.txt +1 -0
- matplotliblib-0.1/matplotliblib.egg-info/top_level.txt +1 -0
- matplotliblib-0.1/setup.cfg +4 -0
- matplotliblib-0.1/setup.py +17 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: matplotliblib
|
|
3
|
+
Version: 0.1
|
|
4
|
+
Summary: Collection of machine learning tools
|
|
5
|
+
Author: .
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: Operating System :: OS Independent
|
|
8
|
+
Requires-Python: >=3.7
|
|
9
|
+
Dynamic: author
|
|
10
|
+
Dynamic: classifier
|
|
11
|
+
Dynamic: requires-python
|
|
12
|
+
Dynamic: summary
|
|
File without changes
|
|
@@ -0,0 +1,785 @@
|
|
|
1
|
+
def data():
|
|
2
|
+
print('''
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import seaborn as sns
|
|
5
|
+
import matplotlib.pyplot as plt
|
|
6
|
+
|
|
7
|
+
df = sns.load_dataset('iris')
|
|
8
|
+
|
|
9
|
+
print(df.head())
|
|
10
|
+
print(df.tail())
|
|
11
|
+
|
|
12
|
+
print(help(df))
|
|
13
|
+
print("Information:")
|
|
14
|
+
df.info()
|
|
15
|
+
|
|
16
|
+
print("Description")
|
|
17
|
+
df.describe()
|
|
18
|
+
|
|
19
|
+
print("Null values: \n", df.isnull().sum())
|
|
20
|
+
df.hist(figsize=(10,6))
|
|
21
|
+
plt.show()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
#Hisogram:
|
|
25
|
+
df.hist(figsize=(10,6))
|
|
26
|
+
plt.show()
|
|
27
|
+
|
|
28
|
+
#Boxplot
|
|
29
|
+
sns.boxplot(data=df)
|
|
30
|
+
plt.show()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
#Countplot
|
|
34
|
+
sns.countplot(x='species', data=df)
|
|
35
|
+
plt.show()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
#Scatter Plot
|
|
39
|
+
plt.scatter(df['sepal_length'], df['sepal_width'])
|
|
40
|
+
plt.xlabel('Sepal Length')
|
|
41
|
+
plt.ylabel('Sepal Width')
|
|
42
|
+
plt.show()
|
|
43
|
+
|
|
44
|
+
#Pairplot
|
|
45
|
+
sns.pairplot(df, hue='species')
|
|
46
|
+
plt.show()
|
|
47
|
+
|
|
48
|
+
#Scatter Multiple
|
|
49
|
+
plt.figure(figsize=(10,6))
|
|
50
|
+
plt.scatter(df['petal_length'], df['sepal_length'], label='Sepal Length')
|
|
51
|
+
plt.scatter(df['petal_length'], df['sepal_width'], label='Sepal Width')
|
|
52
|
+
plt.scatter(df['petal_length'], df['petal_width'], label='Petal Width')
|
|
53
|
+
|
|
54
|
+
plt.xlabel('Petal Length')
|
|
55
|
+
plt.ylabel('Values of Other Attributes')
|
|
56
|
+
plt.title('Scatter Multiple Plot (Iris Dataset)')
|
|
57
|
+
plt.legend()
|
|
58
|
+
plt.show()
|
|
59
|
+
|
|
60
|
+
#Scatter Matrix
|
|
61
|
+
from pandas.plotting import scatter_matrix
|
|
62
|
+
plt.figure(figsize=(10,8))
|
|
63
|
+
scatter_matrix(df.iloc[:, :4], figsize=(10,8), diagonal='hist')
|
|
64
|
+
plt.show()
|
|
65
|
+
|
|
66
|
+
#Parallel Coordinates plot
|
|
67
|
+
from pandas.plotting import parallel_coordinates
|
|
68
|
+
|
|
69
|
+
plt.figure(figsize=(10,6))
|
|
70
|
+
parallel_coordinates(df, 'species')
|
|
71
|
+
plt.title('Parallel Coordinates Plot - Iris Dataset')
|
|
72
|
+
plt.show()
|
|
73
|
+
|
|
74
|
+
#Deviation Chart (from mean)
|
|
75
|
+
mean_vals = df.iloc[:, :4].mean()
|
|
76
|
+
|
|
77
|
+
plt.figure(figsize=(10,6))
|
|
78
|
+
plt.plot(df.iloc[:, :4] - mean_vals)
|
|
79
|
+
plt.title('Deviation Chart - Deviation from Mean')
|
|
80
|
+
plt.xlabel('Record Index')
|
|
81
|
+
plt.ylabel('Deviation')
|
|
82
|
+
plt.show()
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
#Andrews Curve
|
|
86
|
+
from pandas.plotting import andrews_curves
|
|
87
|
+
|
|
88
|
+
plt.figure(figsize=(10,6))
|
|
89
|
+
andrews_curves(df, 'species')
|
|
90
|
+
plt.title('Andrews Curves - Iris Dataset')
|
|
91
|
+
plt.show()
|
|
92
|
+
''')
|
|
93
|
+
|
|
94
|
+
def decisiontree():
|
|
95
|
+
print('''
|
|
96
|
+
import pandas as pd
|
|
97
|
+
import numpy as np
|
|
98
|
+
from sklearn.tree import DecisionTreeClassifier
|
|
99
|
+
from sklearn.model_selection import train_test_split
|
|
100
|
+
from sklearn.metrics import accuracy_score, classification_report
|
|
101
|
+
|
|
102
|
+
from sklearn.datasets import load_iris
|
|
103
|
+
|
|
104
|
+
iris = load_iris()
|
|
105
|
+
X = iris.data
|
|
106
|
+
y = iris.target
|
|
107
|
+
|
|
108
|
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
|
|
109
|
+
dt_model = DecisionTreeClassifier(criterion="gini", max_depth=3)
|
|
110
|
+
dt_model.fit(X_train, y_train)
|
|
111
|
+
y_pred = dt_model.predict(X_test)
|
|
112
|
+
print(y_pred)
|
|
113
|
+
|
|
114
|
+
accuracy = accuracy_score(y_test, y_pred)
|
|
115
|
+
print("Accuracy:", accuracy)
|
|
116
|
+
|
|
117
|
+
print(classification_report(y_test, y_pred))
|
|
118
|
+
|
|
119
|
+
from sklearn.tree import plot_tree
|
|
120
|
+
import matplotlib.pyplot as plt
|
|
121
|
+
|
|
122
|
+
plt.figure(figsize=(12,8))
|
|
123
|
+
plot_tree(dt_model, feature_names=iris.feature_names,
|
|
124
|
+
class_names=iris.target_names, filled=True)
|
|
125
|
+
plt.show()
|
|
126
|
+
|
|
127
|
+
# Create decision tree model on mtcars dataset
|
|
128
|
+
import pandas as pd
|
|
129
|
+
import numpy as np
|
|
130
|
+
from sklearn.tree import DecisionTreeClassifier
|
|
131
|
+
from sklearn.model_selection import train_test_split
|
|
132
|
+
from sklearn.metrics import accuracy_score, classification_report
|
|
133
|
+
|
|
134
|
+
mtcars = pd.read_csv("mtcars.csv")
|
|
135
|
+
mtcars.head()
|
|
136
|
+
|
|
137
|
+
mtcars = mtcars.drop(columns=['model'])
|
|
138
|
+
X = mtcars.drop('am', axis=1)
|
|
139
|
+
y = mtcars['am']
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
|
|
143
|
+
dt_model = DecisionTreeClassifier(criterion="gini",max_depth=3)
|
|
144
|
+
dt_model.fit(X_train, y_train)
|
|
145
|
+
y_pred = dt_model.predict(X_test)
|
|
146
|
+
|
|
147
|
+
accuracy = accuracy_score(y_test, y_pred)
|
|
148
|
+
print("Accuracy:", accuracy)
|
|
149
|
+
|
|
150
|
+
print(classification_report(y_test, y_pred))
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
from sklearn.tree import plot_tree
|
|
154
|
+
import matplotlib.pyplot as plt
|
|
155
|
+
|
|
156
|
+
plt.figure(figsize=(14,8))
|
|
157
|
+
plot_tree(
|
|
158
|
+
dt_model,
|
|
159
|
+
feature_names=X.columns,
|
|
160
|
+
class_names=["Automatic", "Manual"],
|
|
161
|
+
filled=True
|
|
162
|
+
)
|
|
163
|
+
plt.show()
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
''')
|
|
167
|
+
|
|
168
|
+
def kmeans():
|
|
169
|
+
print('''
|
|
170
|
+
import pandas as pd
|
|
171
|
+
import matplotlib.pyplot as plt
|
|
172
|
+
from sklearn.datasets import load_iris
|
|
173
|
+
from sklearn.cluster import KMeans
|
|
174
|
+
|
|
175
|
+
iris = load_iris()
|
|
176
|
+
X = iris.data
|
|
177
|
+
|
|
178
|
+
k = 3
|
|
179
|
+
|
|
180
|
+
kmeans = KMeans(n_clusters=3, random_state=0)
|
|
181
|
+
kmeans.fit(X)
|
|
182
|
+
|
|
183
|
+
labels = kmeans.labels_
|
|
184
|
+
centroids = kmeans.cluster_centers_
|
|
185
|
+
print(labels)
|
|
186
|
+
print(centroids)
|
|
187
|
+
|
|
188
|
+
plt.scatter(X[:,2], X[:,3], c=labels)
|
|
189
|
+
plt.scatter(centroids[:,2], centroids[:,3], marker='X')
|
|
190
|
+
plt.xlabel("Petal Length")
|
|
191
|
+
plt.ylabel("Petal Width")
|
|
192
|
+
plt.title("K-Means Clustering on Iris Dataset")
|
|
193
|
+
plt.show()
|
|
194
|
+
|
|
195
|
+
# DBSCAN using Iris Dataset
|
|
196
|
+
import numpy as np
|
|
197
|
+
import matplotlib.pyplot as plt
|
|
198
|
+
from sklearn.datasets import load_iris
|
|
199
|
+
from sklearn.cluster import DBSCAN
|
|
200
|
+
from sklearn.preprocessing import StandardScaler
|
|
201
|
+
|
|
202
|
+
iris = load_iris()
|
|
203
|
+
X = iris.data
|
|
204
|
+
|
|
205
|
+
scaler = StandardScaler()
|
|
206
|
+
X_scaled = scaler.fit_transform(X)
|
|
207
|
+
|
|
208
|
+
dbscan = DBSCAN(eps=0.6, min_samples=5)
|
|
209
|
+
|
|
210
|
+
labels = dbscan.fit_predict(X_scaled)
|
|
211
|
+
|
|
212
|
+
print(np.unique(labels))
|
|
213
|
+
|
|
214
|
+
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=labels)
|
|
215
|
+
plt.xlabel("Sepal Length")
|
|
216
|
+
plt.ylabel("Sepal Width")
|
|
217
|
+
plt.title("DBSCAN Clustering on Iris Dataset")
|
|
218
|
+
plt.show()
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
''')
|
|
224
|
+
|
|
225
|
+
def regression():
|
|
226
|
+
print('''
|
|
227
|
+
#Simple Linear Regression
|
|
228
|
+
|
|
229
|
+
# Basic operations
|
|
230
|
+
import numpy as np
|
|
231
|
+
import pandas as pd
|
|
232
|
+
|
|
233
|
+
# Plotting
|
|
234
|
+
import matplotlib.pyplot as plt
|
|
235
|
+
import seaborn as sns
|
|
236
|
+
|
|
237
|
+
# sklearn for model building & evaluation
|
|
238
|
+
from sklearn.linear_model import LinearRegression
|
|
239
|
+
from sklearn.model_selection import train_test_split, cross_val_score, KFold
|
|
240
|
+
from sklearn.metrics import mean_squared_error, r2_score
|
|
241
|
+
|
|
242
|
+
# statsmodels for detailed regression output (p-values, std err, etc.)
|
|
243
|
+
import statsmodels.api as sm
|
|
244
|
+
|
|
245
|
+
# For VIF (multicollinearity)
|
|
246
|
+
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
|
247
|
+
|
|
248
|
+
# For qqplot
|
|
249
|
+
import scipy.stats as stats
|
|
250
|
+
|
|
251
|
+
hours = np.array([1, 2, 4, 5, 5, 6])
|
|
252
|
+
score = np.array([64, 66, 76, 73, 74, 81])
|
|
253
|
+
|
|
254
|
+
# Put into a DataFrame for convenience
|
|
255
|
+
df = pd.DataFrame({"hours": hours, "score": score})
|
|
256
|
+
|
|
257
|
+
# Scatter plot with regression line visually
|
|
258
|
+
sns.scatterplot(x="hours", y="score", data=df, s=80)
|
|
259
|
+
sns.regplot(x="hours", y="score", data=df, ci=None, scatter=False) # fitted line
|
|
260
|
+
plt.title("Hours studied vs Exam Score")
|
|
261
|
+
plt.xlabel("Hours studied")
|
|
262
|
+
plt.ylabel("Exam Score")
|
|
263
|
+
plt.show()
|
|
264
|
+
|
|
265
|
+
# Boxplot to check distribution & outliers (response variable)
|
|
266
|
+
plt.figure()
|
|
267
|
+
sns.boxplot(y=df["score"])
|
|
268
|
+
plt.title("Boxplot of Exam Scores")
|
|
269
|
+
plt.show()
|
|
270
|
+
|
|
271
|
+
# Prepare X and y
|
|
272
|
+
X = df[["hours"]] # 2D array (n_samples, n_features)
|
|
273
|
+
y = df["score"]
|
|
274
|
+
|
|
275
|
+
# Fit model (sklearn)
|
|
276
|
+
lr = LinearRegression()
|
|
277
|
+
lr.fit(X, y)
|
|
278
|
+
|
|
279
|
+
# Coefficients
|
|
280
|
+
b0 = lr.intercept_
|
|
281
|
+
b1 = lr.coef_[0]
|
|
282
|
+
print(f"Intercept (b0): {b0:.4f}")
|
|
283
|
+
print(f"Slope (b1): {b1:.4f}")
|
|
284
|
+
|
|
285
|
+
# Add constant for intercept term
|
|
286
|
+
X_sm = sm.add_constant(X) # adds column of 1s for intercept
|
|
287
|
+
model_sm = sm.OLS(y, X_sm).fit()
|
|
288
|
+
print(model_sm.summary())
|
|
289
|
+
|
|
290
|
+
# Predict with sklearn model (single value)
|
|
291
|
+
new_hours = pd.DataFrame({"hours": [3]})
|
|
292
|
+
predicted_score = lr.predict(new_hours)
|
|
293
|
+
print("Predicted score for 3 hours:", predicted_score[0])
|
|
294
|
+
|
|
295
|
+
# For presentation: create a DataFrame with a range and show predictions
|
|
296
|
+
grid = pd.DataFrame({"hours": np.linspace(df.hours.min(), df.hours.max(), 50)})
|
|
297
|
+
grid["predicted_score"] = lr.predict(grid)
|
|
298
|
+
grid.head()
|
|
299
|
+
|
|
300
|
+
df["predicted"] = lr.predict(X)
|
|
301
|
+
df["residuals"] = df["score"] - df["predicted"]
|
|
302
|
+
|
|
303
|
+
# Residual vs Fitted
|
|
304
|
+
plt.figure()
|
|
305
|
+
plt.scatter(df["predicted"], df["residuals"], s=60)
|
|
306
|
+
plt.axhline(0, linestyle='--')
|
|
307
|
+
plt.xlabel("Fitted values")
|
|
308
|
+
plt.ylabel("Residuals")
|
|
309
|
+
plt.title("Residuals vs Fitted")
|
|
310
|
+
plt.show()
|
|
311
|
+
|
|
312
|
+
# QQ-plot of residuals
|
|
313
|
+
plt.figure()
|
|
314
|
+
sm.qqplot(df["residuals"], line='45', fit=True)
|
|
315
|
+
plt.title("Q-Q plot of residuals")
|
|
316
|
+
plt.show()
|
|
317
|
+
|
|
318
|
+
# Histogram of residuals (optional)
|
|
319
|
+
plt.figure()
|
|
320
|
+
sns.histplot(df["residuals"], kde=True)
|
|
321
|
+
plt.title("Histogram of residuals")
|
|
322
|
+
plt.show()
|
|
323
|
+
|
|
324
|
+
# Multiple Linear Regression
|
|
325
|
+
import numpy as np
|
|
326
|
+
import pandas as pd
|
|
327
|
+
import matplotlib.pyplot as plt
|
|
328
|
+
import seaborn as sns
|
|
329
|
+
|
|
330
|
+
from sklearn.linear_model import LinearRegression
|
|
331
|
+
from sklearn.metrics import mean_squared_error, r2_score
|
|
332
|
+
from sklearn.model_selection import train_test_split, KFold, cross_val_score
|
|
333
|
+
|
|
334
|
+
import statsmodels.api as sm
|
|
335
|
+
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
|
336
|
+
from statsmodels.stats.diagnostic import het_breuschpagan
|
|
337
|
+
from statsmodels.stats.stattools import durbin_watson
|
|
338
|
+
import scipy.stats as stats
|
|
339
|
+
|
|
340
|
+
house = pd.read_csv("index.csv")
|
|
341
|
+
# Quick peek
|
|
342
|
+
house.head()
|
|
343
|
+
house.shape
|
|
344
|
+
|
|
345
|
+
house.info() # data types and non-null counts
|
|
346
|
+
house.describe().T # mean, std, min, max, quartiles
|
|
347
|
+
house.columns # column names
|
|
348
|
+
|
|
349
|
+
# Check missing values
|
|
350
|
+
house.isnull().sum()
|
|
351
|
+
|
|
352
|
+
# Pairwise relationships
|
|
353
|
+
sns.pairplot(house[['death_rate','doctor_avail','hosp_avail','annual_income','density_per_capita']])
|
|
354
|
+
plt.suptitle("Pairwise plots", y=1.02)
|
|
355
|
+
plt.show()
|
|
356
|
+
|
|
357
|
+
# Correlation matrix (helpful for collinearity)
|
|
358
|
+
corr = house[['death_rate','doctor_avail','hosp_avail','annual_income','density_per_capita']].corr()
|
|
359
|
+
print(corr)
|
|
360
|
+
sns.heatmap(corr, annot=True, fmt=".2f")
|
|
361
|
+
plt.title("Correlation matrix")
|
|
362
|
+
plt.show()
|
|
363
|
+
|
|
364
|
+
X = house[['death_rate','doctor_avail','hosp_avail','annual_income']]
|
|
365
|
+
y = house['density_per_capita']
|
|
366
|
+
|
|
367
|
+
# (Optional) Train-test split for validation
|
|
368
|
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
|
369
|
+
|
|
370
|
+
# Fit with sklearn (fast, gives coefficients)
|
|
371
|
+
lr = LinearRegression()
|
|
372
|
+
lr.fit(X_train, y_train)
|
|
373
|
+
|
|
374
|
+
print("Intercept (sklearn):", lr.intercept_)
|
|
375
|
+
print("Coefficients (sklearn):")
|
|
376
|
+
for name, coef in zip(X.columns, lr.coef_):
|
|
377
|
+
print(f" {name}: {coef:.6f}")
|
|
378
|
+
|
|
379
|
+
# Fit with statsmodels to get p-values, std errors, R-squared etc.
|
|
380
|
+
X_sm = sm.add_constant(X) # adds intercept column
|
|
381
|
+
model_sm = sm.OLS(y, X_sm).fit()
|
|
382
|
+
|
|
383
|
+
# Use statsmodels' summary (detailed regression table)
|
|
384
|
+
print(model_sm.summary())
|
|
385
|
+
|
|
386
|
+
# Predict on training data or new data
|
|
387
|
+
house['predicted'] = model_sm.predict(X_sm) # statsmodels predict expects same X with constant
|
|
388
|
+
|
|
389
|
+
# If using sklearn model (trained on X_train)
|
|
390
|
+
y_test_pred = lr.predict(X_test)
|
|
391
|
+
|
|
392
|
+
# Example: predict for a new observation (replace with real numbers)
|
|
393
|
+
new_obs = pd.DataFrame({'death_rate':[2.0], 'doctor_avail':[1.5], 'hosp_avail':[0.7], 'annual_income':[35000]})
|
|
394
|
+
pred_new = lr.predict(new_obs) # sklearn
|
|
395
|
+
print("Predicted density_per_capita for new_obs:", pred_new[0])
|
|
396
|
+
|
|
397
|
+
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
|
|
398
|
+
r2 = r2_score(y_test, y_test_pred)
|
|
399
|
+
print("Test RMSE:", rmse)
|
|
400
|
+
print("Test R2:", r2)
|
|
401
|
+
|
|
402
|
+
# Step 6 - Visualize the model
|
|
403
|
+
for col in X.columns:
|
|
404
|
+
plt.figure()
|
|
405
|
+
sns.scatterplot(x=house[col], y=house['density_per_capita'])
|
|
406
|
+
sns.regplot(x=house[col], y=house['density_per_capita'], ci=None, scatter=False)
|
|
407
|
+
plt.xlabel(col)
|
|
408
|
+
plt.ylabel('density_per_capita')
|
|
409
|
+
plt.title(f'density_per_capita vs {col}')
|
|
410
|
+
plt.show()
|
|
411
|
+
|
|
412
|
+
# Actual Vs Predicted
|
|
413
|
+
plt.figure()
|
|
414
|
+
sns.scatterplot(x=house['predicted'], y=house['density_per_capita'])
|
|
415
|
+
plt.plot([house['predicted'].min(), house['predicted'].max()],
|
|
416
|
+
[house['predicted'].min(), house['predicted'].max()], color='red', linestyle='--')
|
|
417
|
+
plt.xlabel('Predicted')
|
|
418
|
+
plt.ylabel('Actual')
|
|
419
|
+
plt.title('Actual vs Predicted')
|
|
420
|
+
plt.show()
|
|
421
|
+
|
|
422
|
+
# Residuals vs Fitted (diagnose heteroscedasticity / non-linearity)
|
|
423
|
+
residuals = house['density_per_capita'] - house['predicted']
|
|
424
|
+
plt.figure()
|
|
425
|
+
sns.scatterplot(x=house['predicted'], y=residuals)
|
|
426
|
+
plt.axhline(0, color='red', linestyle='--')
|
|
427
|
+
plt.xlabel('Fitted values')
|
|
428
|
+
plt.ylabel('Residuals')
|
|
429
|
+
plt.title('Residuals vs Fitted')
|
|
430
|
+
plt.show()
|
|
431
|
+
|
|
432
|
+
# Q-Q Plot for residuals (normality)
|
|
433
|
+
sm.qqplot(residuals, line='45', fit=True)
|
|
434
|
+
plt.title('Q-Q plot of residuals')
|
|
435
|
+
plt.show()
|
|
436
|
+
|
|
437
|
+
# Shapiro test (numeric)
|
|
438
|
+
stat, pval = stats.shapiro(residuals)
|
|
439
|
+
print("Shapiro-Wilk: stat=%.4f, p=%.4f" % (stat, pval))
|
|
440
|
+
|
|
441
|
+
# Influence / Leverage plot (outliers & influential points)
|
|
442
|
+
sm.graphics.influence_plot(model_sm, criterion="cooks")
|
|
443
|
+
plt.show()
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
''')
|
|
449
|
+
|
|
450
|
+
def association():
|
|
451
|
+
print('''
|
|
452
|
+
import pandas as pd
|
|
453
|
+
from mlxtend.preprocessing import TransactionEncoder
|
|
454
|
+
from mlxtend.frequent_patterns import apriori, association_rules
|
|
455
|
+
|
|
456
|
+
transactions = [
|
|
457
|
+
['Milk', 'Bread'],
|
|
458
|
+
['Bread', 'Diaper', 'Beer', 'Eggs'],
|
|
459
|
+
['Milk', 'Diaper', 'Beer', 'Cola'],
|
|
460
|
+
['Bread', 'Milk', 'Diaper', 'Beer'],
|
|
461
|
+
['Bread', 'Milk', 'Diaper', 'Cola']
|
|
462
|
+
]
|
|
463
|
+
|
|
464
|
+
te = TransactionEncoder()
|
|
465
|
+
te_array = te.fit(transactions).transform(transactions)
|
|
466
|
+
df = pd.DataFrame(te_array, columns=te.columns_)
|
|
467
|
+
print(df)
|
|
468
|
+
|
|
469
|
+
frequent_itemsets = apriori(df, min_support=0.4, use_colnames=True)
|
|
470
|
+
print(frequent_itemsets)
|
|
471
|
+
|
|
472
|
+
rules = association_rules(
|
|
473
|
+
frequent_itemsets,
|
|
474
|
+
metric="confidence",
|
|
475
|
+
min_threshold=0.6
|
|
476
|
+
)
|
|
477
|
+
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
# USING FP-Growth ALgorithm
|
|
481
|
+
from mlxtend.frequent_patterns import fpgrowth
|
|
482
|
+
print(df)
|
|
483
|
+
frequent_itemsets_fp = fpgrowth(df, min_support=0.4, use_colnames=True)
|
|
484
|
+
print(frequent_itemsets_fp)
|
|
485
|
+
rules_fp = association_rules(
|
|
486
|
+
frequent_itemsets_fp,
|
|
487
|
+
metric="confidence",
|
|
488
|
+
min_threshold=0.6
|
|
489
|
+
)
|
|
490
|
+
print(rules_fp[['antecedents', 'consequents', 'support', 'confidence', 'lift']])
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
''')
|
|
494
|
+
|
|
495
|
+
def cnn():
|
|
496
|
+
print('''
|
|
497
|
+
import tensorflow as tf
|
|
498
|
+
from tensorflow import keras
|
|
499
|
+
from tensorflow.keras import layers
|
|
500
|
+
import matplotlib.pyplot as plt
|
|
501
|
+
|
|
502
|
+
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
|
|
503
|
+
|
|
504
|
+
x_train = x_train / 255.0
|
|
505
|
+
x_test = x_test / 255.0
|
|
506
|
+
|
|
507
|
+
x_train = x_train.reshape(-1,28,28,1)
|
|
508
|
+
x_test = x_test.reshape(-1,28,28,1)
|
|
509
|
+
|
|
510
|
+
model = keras.Sequential([
|
|
511
|
+
layers.Conv2D(32, (3,3), activation='relu', input_shape=(28,28,1)),
|
|
512
|
+
layers.MaxPooling2D((2,2)),
|
|
513
|
+
layers.Flatten(),
|
|
514
|
+
layers.Dense(64, activation='relu'),
|
|
515
|
+
layers.Dense(10, activation='softmax')
|
|
516
|
+
])
|
|
517
|
+
|
|
518
|
+
model.compile(optimizer='adam',
|
|
519
|
+
loss='sparse_categorical_crossentropy',
|
|
520
|
+
metrics=['accuracy'])
|
|
521
|
+
|
|
522
|
+
model.fit(x_train, y_train, epochs=5, validation_split=0.1)
|
|
523
|
+
|
|
524
|
+
test_loss, test_acc = model.evaluate(x_test, y_test)
|
|
525
|
+
print("Test Accuracy:", test_acc)
|
|
526
|
+
|
|
527
|
+
plt.imshow(x_test[0].reshape(28,28), cmap='gray')
|
|
528
|
+
plt.show()
|
|
529
|
+
|
|
530
|
+
prediction = model.predict(x_test[0].reshape(1,28,28,1))
|
|
531
|
+
print("Predicted Digit:", prediction.argmax())
|
|
532
|
+
|
|
533
|
+
|
|
534
|
+
''')
|
|
535
|
+
|
|
536
|
+
def recommendation():
|
|
537
|
+
print('''
|
|
538
|
+
import pandas as pd
|
|
539
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
|
540
|
+
|
|
541
|
+
data = {
|
|
542
|
+
'User': ['U1','U1','U2','U2','U3','U3'],
|
|
543
|
+
'Item': ['I1','I2','I1','I3','I2','I3'],
|
|
544
|
+
'Rating': [5,4,4,5,3,4]
|
|
545
|
+
}
|
|
546
|
+
df = pd.DataFrame(data)
|
|
547
|
+
|
|
548
|
+
matrix = df.pivot_table(index='User', columns='Item', values='Rating').fillna(0)
|
|
549
|
+
print(matrix)
|
|
550
|
+
|
|
551
|
+
similarity = cosine_similarity(matrix.T)
|
|
552
|
+
similarity_df = pd.DataFrame(similarity, index=matrix.columns, columns=matrix.columns)
|
|
553
|
+
print(similarity_df)
|
|
554
|
+
|
|
555
|
+
def recommend(item):
|
|
556
|
+
return similarity_df[item].sort_values(ascending=False)
|
|
557
|
+
|
|
558
|
+
print(recommend('I1'))
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
''')
|
|
562
|
+
|
|
563
|
+
def timeseries():
|
|
564
|
+
print('''
|
|
565
|
+
import pandas as pd
|
|
566
|
+
import numpy as np
|
|
567
|
+
import matplotlib.pyplot as plt
|
|
568
|
+
|
|
569
|
+
from statsmodels.tsa.stattools import adfuller
|
|
570
|
+
from statsmodels.tsa.arima.model import ARIMA
|
|
571
|
+
from statsmodels.tsa.statespace.sarimax import SARIMAX
|
|
572
|
+
|
|
573
|
+
data = pd.read_csv("AirPassengers.csv")
|
|
574
|
+
data['Month'] = pd.to_datetime(data['Month'])
|
|
575
|
+
data.set_index('Month', inplace=True)
|
|
576
|
+
data.head()
|
|
577
|
+
|
|
578
|
+
|
|
579
|
+
plt.figure()
|
|
580
|
+
plt.plot(data['Passengers'])
|
|
581
|
+
plt.title('Monthly Air Passengers')
|
|
582
|
+
plt.xlabel('Year')
|
|
583
|
+
plt.ylabel('Number of Passengers')
|
|
584
|
+
plt.show()
|
|
585
|
+
|
|
586
|
+
result = adfuller(data['Passengers'])
|
|
587
|
+
|
|
588
|
+
print("ADF Statistic:", result[0])
|
|
589
|
+
print("p-value:", result[1])
|
|
590
|
+
|
|
591
|
+
data_diff = data['Passengers'].diff().dropna()
|
|
592
|
+
|
|
593
|
+
plt.figure()
|
|
594
|
+
plt.plot(data_diff)
|
|
595
|
+
plt.title('First Differenced Series')
|
|
596
|
+
plt.show()
|
|
597
|
+
|
|
598
|
+
adfuller(data_diff)
|
|
599
|
+
|
|
600
|
+
from statsmodels.graphics.tsaplots import plot_pacf
|
|
601
|
+
import matplotlib.pyplot as plt
|
|
602
|
+
|
|
603
|
+
plot_pacf(data_diff, lags=20)
|
|
604
|
+
plt.show()
|
|
605
|
+
|
|
606
|
+
from statsmodels.graphics.tsaplots import plot_acf
|
|
607
|
+
|
|
608
|
+
plot_acf(data_diff, lags=20)
|
|
609
|
+
plt.show()
|
|
610
|
+
|
|
611
|
+
model = ARIMA(data['Passengers'], order=(1,1,1))
|
|
612
|
+
model_fit = model.fit()
|
|
613
|
+
print(model_fit.summary())
|
|
614
|
+
|
|
615
|
+
forecast = model_fit.forecast(steps=12)
|
|
616
|
+
print(forecast)
|
|
617
|
+
|
|
618
|
+
plt.figure()
|
|
619
|
+
plt.plot(data['Passengers'], label='Actual')
|
|
620
|
+
plt.plot(forecast, label='Forecast', color='red')
|
|
621
|
+
plt.legend()
|
|
622
|
+
plt.show()
|
|
623
|
+
|
|
624
|
+
|
|
625
|
+
|
|
626
|
+
|
|
627
|
+
|
|
628
|
+
''')
|
|
629
|
+
|
|
630
|
+
def anomaly():
|
|
631
|
+
print('''
|
|
632
|
+
import pandas as pd
|
|
633
|
+
from scipy.stats import zscore
|
|
634
|
+
|
|
635
|
+
# Salary data
|
|
636
|
+
df = pd.DataFrame({
|
|
637
|
+
'Employee_ID': ['E01','E02','E03','E04','E05','E06','E07'],
|
|
638
|
+
'Salary': [48000, 52000, 50500, 49200, 51000, 120000, 18000]
|
|
639
|
+
})
|
|
640
|
+
|
|
641
|
+
# Z-score calculation
|
|
642
|
+
df['Z_Score'] = zscore(df['Salary'])
|
|
643
|
+
print(df['Z_Score'])
|
|
644
|
+
|
|
645
|
+
# Detect outliers
|
|
646
|
+
outliers = df[abs(df['Z_Score']) > 2]
|
|
647
|
+
print(outliers)
|
|
648
|
+
|
|
649
|
+
# IQR Method
|
|
650
|
+
# Employee salary data
|
|
651
|
+
data = {
|
|
652
|
+
'Employee_ID': ['E01','E02','E03','E04','E05','E06','E07'],
|
|
653
|
+
'Salary': [48000, 52000, 50500, 49200, 51000, 120000, 18000]
|
|
654
|
+
}
|
|
655
|
+
|
|
656
|
+
df = pd.DataFrame(data)
|
|
657
|
+
|
|
658
|
+
# Calculate Q1 and Q3
|
|
659
|
+
Q1 = df['Salary'].quantile(0.25)
|
|
660
|
+
Q3 = df['Salary'].quantile(0.75)
|
|
661
|
+
|
|
662
|
+
# Calculate IQR
|
|
663
|
+
IQR = Q3 - Q1
|
|
664
|
+
|
|
665
|
+
# Define lower and upper bounds
|
|
666
|
+
lower_bound = Q1 - 1.5 * IQR
|
|
667
|
+
upper_bound = Q3 + 1.5 * IQR
|
|
668
|
+
|
|
669
|
+
# Detect outliers
|
|
670
|
+
outliers = df[
|
|
671
|
+
(df['Salary'] < lower_bound) |
|
|
672
|
+
(df['Salary'] > upper_bound)
|
|
673
|
+
]
|
|
674
|
+
|
|
675
|
+
print("Lower Bound:", lower_bound)
|
|
676
|
+
print("Upper Bound:", upper_bound)
|
|
677
|
+
print("\nDetected Outliers:")
|
|
678
|
+
print(outliers)
|
|
679
|
+
|
|
680
|
+
|
|
681
|
+
|
|
682
|
+
''')
|
|
683
|
+
|
|
684
|
+
def text():
|
|
685
|
+
print('''
|
|
686
|
+
#Document Classification
|
|
687
|
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
688
|
+
from sklearn.naive_bayes import MultinomialNB
|
|
689
|
+
|
|
690
|
+
texts = [
|
|
691
|
+
"Meeting scheduled with project team",
|
|
692
|
+
"Family dinner this weekend",
|
|
693
|
+
"Project deadline extended",
|
|
694
|
+
"Birthday party invitation"
|
|
695
|
+
]
|
|
696
|
+
|
|
697
|
+
labels = ["Work", "Personal", "Work", "Personal"]
|
|
698
|
+
|
|
699
|
+
vectorizer = TfidfVectorizer()
|
|
700
|
+
X = vectorizer.fit_transform(texts)
|
|
701
|
+
|
|
702
|
+
model = MultinomialNB()
|
|
703
|
+
model.fit(X, labels)
|
|
704
|
+
|
|
705
|
+
prediction = model.predict(vectorizer.transform(["Project meeting tomorrow"]))
|
|
706
|
+
print(prediction)
|
|
707
|
+
|
|
708
|
+
# Sentiment Analysis
|
|
709
|
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
710
|
+
from sklearn.linear_model import LogisticRegression
|
|
711
|
+
|
|
712
|
+
reviews = [
|
|
713
|
+
"The product is amazing",
|
|
714
|
+
"Very bad quality",
|
|
715
|
+
"I am happy with the purchase",
|
|
716
|
+
"Worst experience ever"
|
|
717
|
+
]
|
|
718
|
+
|
|
719
|
+
sentiment = ["Positive", "Negative", "Positive", "Negative"]
|
|
720
|
+
|
|
721
|
+
vectorizer = TfidfVectorizer()
|
|
722
|
+
X = vectorizer.fit_transform(reviews)
|
|
723
|
+
|
|
724
|
+
model = LogisticRegression()
|
|
725
|
+
model.fit(X, sentiment)
|
|
726
|
+
|
|
727
|
+
print(model.predict(vectorizer.transform(["The product quality is good"])))
|
|
728
|
+
|
|
729
|
+
# Search Engines
|
|
730
|
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
731
|
+
import numpy as np
|
|
732
|
+
|
|
733
|
+
documents = [
|
|
734
|
+
"Data science and machine learning",
|
|
735
|
+
"Introduction to text mining",
|
|
736
|
+
"Python for data analysis"
|
|
737
|
+
]
|
|
738
|
+
|
|
739
|
+
query = ["text mining"]
|
|
740
|
+
|
|
741
|
+
vectorizer = TfidfVectorizer()
|
|
742
|
+
tfidf = vectorizer.fit_transform(documents + query)
|
|
743
|
+
|
|
744
|
+
similarity = (tfidf * tfidf.T).toarray()
|
|
745
|
+
print("Most relevant document index:", np.argmax(similarity[-1][:-1]))
|
|
746
|
+
|
|
747
|
+
# Spam Detection
|
|
748
|
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
749
|
+
from sklearn.naive_bayes import MultinomialNB
|
|
750
|
+
|
|
751
|
+
messages = [
|
|
752
|
+
"Win a free lottery now",
|
|
753
|
+
"Meeting scheduled tomorrow",
|
|
754
|
+
"Urgent offer claim now",
|
|
755
|
+
"Project discussion today"
|
|
756
|
+
]
|
|
757
|
+
|
|
758
|
+
labels = ["Spam", "Not Spam", "Spam", "Not Spam"]
|
|
759
|
+
|
|
760
|
+
vectorizer = TfidfVectorizer()
|
|
761
|
+
X = vectorizer.fit_transform(messages)
|
|
762
|
+
|
|
763
|
+
model = MultinomialNB()
|
|
764
|
+
model.fit(X, labels)
|
|
765
|
+
|
|
766
|
+
print(model.predict(vectorizer.transform(["Free offer just for you"])))
|
|
767
|
+
|
|
768
|
+
# Recommendation Systems
|
|
769
|
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
770
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
|
771
|
+
|
|
772
|
+
movies = [
|
|
773
|
+
"Action adventure and hero story",
|
|
774
|
+
"Romantic love story",
|
|
775
|
+
"Adventure and fantasy world"
|
|
776
|
+
]
|
|
777
|
+
|
|
778
|
+
vectorizer = TfidfVectorizer()
|
|
779
|
+
tfidf = vectorizer.fit_transform(movies)
|
|
780
|
+
|
|
781
|
+
similarity = cosine_similarity(tfidf)
|
|
782
|
+
print(similarity)
|
|
783
|
+
|
|
784
|
+
''')
|
|
785
|
+
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: matplotliblib
|
|
3
|
+
Version: 0.1
|
|
4
|
+
Summary: Collection of machine learning tools
|
|
5
|
+
Author: .
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: Operating System :: OS Independent
|
|
8
|
+
Requires-Python: >=3.7
|
|
9
|
+
Dynamic: author
|
|
10
|
+
Dynamic: classifier
|
|
11
|
+
Dynamic: requires-python
|
|
12
|
+
Dynamic: summary
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
matplotliblib
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from setuptools import setup, find_packages
|
|
2
|
+
|
|
3
|
+
setup(
|
|
4
|
+
name="matplotliblib",
|
|
5
|
+
version="0.1",
|
|
6
|
+
packages=find_packages(),
|
|
7
|
+
|
|
8
|
+
author=".",
|
|
9
|
+
description="Collection of machine learning tools",
|
|
10
|
+
|
|
11
|
+
python_requires=">=3.7",
|
|
12
|
+
|
|
13
|
+
classifiers=[
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"Operating System :: OS Independent",
|
|
16
|
+
],
|
|
17
|
+
)
|