bigdata-helper 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bigdata-helper might be problematic. Click here for more details.
- bigdata_helper-0.1.0/LICENSE +21 -0
- bigdata_helper-0.1.0/PKG-INFO +49 -0
- bigdata_helper-0.1.0/README.md +33 -0
- bigdata_helper-0.1.0/bigdata_helper/__init__.py +4 -0
- bigdata_helper-0.1.0/bigdata_helper/codes.py +170 -0
- bigdata_helper-0.1.0/bigdata_helper.egg-info/PKG-INFO +49 -0
- bigdata_helper-0.1.0/bigdata_helper.egg-info/SOURCES.txt +9 -0
- bigdata_helper-0.1.0/bigdata_helper.egg-info/dependency_links.txt +1 -0
- bigdata_helper-0.1.0/bigdata_helper.egg-info/top_level.txt +1 -0
- bigdata_helper-0.1.0/pyproject.toml +21 -0
- bigdata_helper-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Satyam Kale
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: bigdata-helper
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A Python library containing Big Data practical codes for quick access
|
|
5
|
+
Author: Satyam Kale
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://pypi.org/project/bigdata-helper/
|
|
8
|
+
Keywords: big data,exam,codes,snippets,education
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Requires-Python: >=3.8
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Dynamic: license-file
|
|
16
|
+
|
|
17
|
+
# bigdata-helper
|
|
18
|
+
|
|
19
|
+
**Version:** 0.1.0
|
|
20
|
+
**Author:** Satyam Kale
|
|
21
|
+
|
|
22
|
+
A lightweight library that stores ready-to-use Big Data practical codes and lets you retrieve them instantly as strings.
|
|
23
|
+
|
|
24
|
+
> ⚠️ This package is intended for learning, practice, and revision. Use responsibly and follow your institution's academic policies.
|
|
25
|
+
|
|
26
|
+
## Install
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install bigdata-helper
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Quick Start
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
from bigdata_helper import get_code, list_codes
|
|
36
|
+
|
|
37
|
+
print(list_codes())
|
|
38
|
+
print(get_code("mini"))
|
|
39
|
+
print(get_code("forestfire"))
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Available Codes
|
|
43
|
+
|
|
44
|
+
- `mini` — Multiple regressors benchmark on a synthetic Graduate Admissions dataset (prints model performance and a sample prediction).
|
|
45
|
+
- `forestfire` — MapReduce-like analysis pipeline for forest fire dataset with SQLite, correlation, and monthly summaries.
|
|
46
|
+
|
|
47
|
+
## Add More
|
|
48
|
+
|
|
49
|
+
You can contribute more codes by adding functions to `bigdata_helper/codes.py` and mapping them in `get_code_map()`.
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# bigdata-helper
|
|
2
|
+
|
|
3
|
+
**Version:** 0.1.0
|
|
4
|
+
**Author:** Satyam Kale
|
|
5
|
+
|
|
6
|
+
A lightweight library that stores ready-to-use Big Data practical codes and lets you retrieve them instantly as strings.
|
|
7
|
+
|
|
8
|
+
> ⚠️ This package is intended for learning, practice, and revision. Use responsibly and follow your institution's academic policies.
|
|
9
|
+
|
|
10
|
+
## Install
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install bigdata-helper
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Quick Start
|
|
17
|
+
|
|
18
|
+
```python
|
|
19
|
+
from bigdata_helper import get_code, list_codes
|
|
20
|
+
|
|
21
|
+
print(list_codes())
|
|
22
|
+
print(get_code("mini"))
|
|
23
|
+
print(get_code("forestfire"))
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Available Codes
|
|
27
|
+
|
|
28
|
+
- `mini` — Multiple regressors benchmark on a synthetic Graduate Admissions dataset (prints model performance and a sample prediction).
|
|
29
|
+
- `forestfire` — MapReduce-like analysis pipeline for forest fire dataset with SQLite, correlation, and monthly summaries.
|
|
30
|
+
|
|
31
|
+
## Add More
|
|
32
|
+
|
|
33
|
+
You can contribute more codes by adding functions to `bigdata_helper/codes.py` and mapping them in `get_code_map()`.
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
|
|
2
|
+
from typing import Dict, List
|
|
3
|
+
|
|
4
|
+
def mini_code() -> str:
    """Return the "mini" practical as Python source text.

    Nothing is executed here; the caller receives the script as a string.
    The script reads ``Synthetic_Graduate_Admissions.csv``, fits five
    regressors (each behind a ``StandardScaler`` pipeline), prints an
    R2/RMSE table, then refits a random forest on the full data and prints
    the predicted admission chance for one hard-coded sample.

    Returns:
        The script text, beginning and ending with a newline.
    """
    # Backslashes are doubled so the returned script contains the two-character
    # escape sequence (backslash + n) inside its own string literals rather
    # than a raw line break.
    return """
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error

def load_data(path):
    data = pd.read_csv(path)
    X = data[['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA', 'Research']]
    y = data['Chance of Admit']
    return X, y

def build_pipeline(model):
    return Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    return r2, rmse

def main():
    X, y = load_data("Synthetic_Graduate_Admissions.csv")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    models = {
        "Linear Regression": LinearRegression(),
        "Decision Tree": DecisionTreeRegressor(random_state=42),
        "Random Forest": RandomForestRegressor(random_state=42),
        "Support Vector Regressor": SVR(kernel='rbf'),
        "KNN Regressor": KNeighborsRegressor(n_neighbors=5)
    }

    results = []
    for name, model in models.items():
        pipe = build_pipeline(model)
        pipe.fit(X_train, y_train)
        r2, rmse = evaluate_model(pipe, X_test, y_test)
        results.append({"Model": name, "R2 Score": round(r2, 3), "RMSE": round(rmse, 3)})

    results_df = pd.DataFrame(results)
    print("\\nModel Performance Summary:")
    print(results_df.to_string(index=False))

    sample = pd.DataFrame([[320, 110, 4, 4.5, 4.0, 9.0, 1]],
                          columns=['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA', 'Research'])
    best_model = build_pipeline(RandomForestRegressor(random_state=42))
    best_model.fit(X, y)
    pred = best_model.predict(sample)
    print(f"\\nPredicted Chance of Admission: {pred[0]*100:.2f}%")

if __name__ == "__main__":
    main()
"""
|
|
69
|
+
|
|
70
|
+
def forestfire_code() -> str:
    """Return the "forestfire" practical as Python source text.

    Nothing is executed here; the caller receives the script as a string.
    The script loads ``forestfires.csv`` with pandas, copies it into the
    ``forestfires`` table of a ``forestfires.db`` SQLite file, averages
    temperature per month through a ``multiprocessing.Pool`` map step plus
    an in-process reduce step, prints the months with the largest mean
    burned area, prints the temperature/area correlation, and runs a SQL
    aggregate over the table.

    Returns:
        The script text, beginning and ending with a newline.
    """
    # Backslashes are doubled so the returned script contains the two-character
    # escape sequence (backslash + n) inside its own string literals rather
    # than a raw line break.
    return """
from multiprocessing import Pool
import pandas as pd
import sqlite3


def mapper(row):
    return (row["Month"], row["Temperature_Celsius"])


def reducer(mapped_data):
    result = {}
    for month, temp in mapped_data:
        result.setdefault(month, []).append(temp)
    return {m: sum(v) / len(v) for m, v in result.items()}


def run_mapreduce(df):
    with Pool() as p:
        mapped = p.map(mapper, [row for _, row in df.iterrows()])
    reduced = reducer(mapped)

    print("\\nAverage Temperature per Month:")
    for m, t in reduced.items():
        print(f"{m}: {t:.2f}")
    return reduced


def top_fire_months(df, top_n=5):
    top = df.groupby("Month")["Burned_Area_hectares"].mean().sort_values(ascending=False).head(top_n)
    print(f"\\nTop {top_n} Months with Largest Fire Area:\\n{top}\\n")
    return top


def temperature_area_correlation(df):
    corr = df["Temperature_Celsius"].corr(df["Burned_Area_hectares"])
    print(f"Correlation between Temperature and Fire Area: {corr:.2f}")
    return corr


def query_avg_area_by_month(conn):
    query = '''
    SELECT Month, AVG(Burned_Area_hectares) AS avg_area
    FROM forestfires
    GROUP BY Month
    ORDER BY avg_area DESC;
    '''
    result = pd.read_sql_query(query, conn)
    print("\\nAverage Burned Area by Month (from SQL):")
    print(result)
    return result


def run_pipeline():
    print("=== Forest Fire Analysis Pipeline Started ===\\n")

    df = pd.read_csv("forestfires.csv")
    print(f"Loaded dataset with {len(df)} rows and {len(df.columns)} columns.")

    conn = sqlite3.connect("forestfires.db")
    df.to_sql("forestfires", conn, if_exists="replace", index=False)
    print("Data saved to SQLite database.\\n")

    run_mapreduce(df)
    top_fire_months(df)
    temperature_area_correlation(df)
    query_avg_area_by_month(conn)

    print("\\n=== Pipeline Completed Successfully ===")


if __name__ == "__main__":
    run_pipeline()
"""
|
|
145
|
+
|
|
146
|
+
def placeholder_code() -> str:
    """Return the stub snippet: a single comment line and no executable code."""
    return "# Add your next practical code here..."
|
|
148
|
+
|
|
149
|
+
def get_code_map() -> Dict[str, str]:
    """Build the mapping from snippet name to snippet source text.

    A fresh dict is built on every call by invoking each snippet function,
    so callers may mutate the result without affecting later lookups.

    Returns:
        A dict with the keys ``"mini"``, ``"forestfire"`` and
        ``"placeholder"``, each mapped to that snippet's source text.
    """
    return {
        "mini": mini_code(),
        "forestfire": forestfire_code(),
        "placeholder": placeholder_code(),
    }
|
|
155
|
+
|
|
156
|
+
def get_code(name: str) -> str:
    """Retrieve a stored code snippet by name.

    The name is matched leniently: surrounding whitespace is stripped and the
    text is lower-cased before the lookup, and a falsy ``name`` (``None`` or
    ``""``) is treated as the empty string.

    Args:
        name: Snippet name; see ``list_codes()`` for the available names.

    Returns:
        The source text of the requested snippet.

    Raises:
        KeyError: If no snippet is registered under the normalised name. The
            message echoes ``name`` as given and lists the available names.
    """
    key = (name or "").strip().lower()
    mapping = get_code_map()
    if key not in mapping:
        available = ", ".join(sorted(mapping))
        raise KeyError(f"Code '{name}' not found. Try one of: {available}")
    return mapping[key]
|
|
167
|
+
|
|
168
|
+
def list_codes() -> List[str]:
    """Return a list of available code names, sorted alphabetically."""
    # The previous docstring was written with backslash-escaped quotes outside
    # any string literal, which is a SyntaxError and stopped the whole module
    # from importing.
    return sorted(get_code_map())
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: bigdata-helper
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A Python library containing Big Data practical codes for quick access
|
|
5
|
+
Author: Satyam Kale
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://pypi.org/project/bigdata-helper/
|
|
8
|
+
Keywords: big data,exam,codes,snippets,education
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Requires-Python: >=3.8
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Dynamic: license-file
|
|
16
|
+
|
|
17
|
+
# bigdata-helper
|
|
18
|
+
|
|
19
|
+
**Version:** 0.1.0
|
|
20
|
+
**Author:** Satyam Kale
|
|
21
|
+
|
|
22
|
+
A lightweight library that stores ready-to-use Big Data practical codes and lets you retrieve them instantly as strings.
|
|
23
|
+
|
|
24
|
+
> ⚠️ This package is intended for learning, practice, and revision. Use responsibly and follow your institution's academic policies.
|
|
25
|
+
|
|
26
|
+
## Install
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install bigdata-helper
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Quick Start
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
from bigdata_helper import get_code, list_codes
|
|
36
|
+
|
|
37
|
+
print(list_codes())
|
|
38
|
+
print(get_code("mini"))
|
|
39
|
+
print(get_code("forestfire"))
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Available Codes
|
|
43
|
+
|
|
44
|
+
- `mini` — Multiple regressors benchmark on a synthetic Graduate Admissions dataset (prints model performance and a sample prediction).
|
|
45
|
+
- `forestfire` — MapReduce-like analysis pipeline for forest fire dataset with SQLite, correlation, and monthly summaries.
|
|
46
|
+
|
|
47
|
+
## Add More
|
|
48
|
+
|
|
49
|
+
You can contribute more codes by adding functions to `bigdata_helper/codes.py` and mapping them in `get_code_map()`.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
bigdata_helper
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "bigdata-helper"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A Python library containing Big Data practical codes for quick access"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
authors = [{ name = "Satyam Kale" }]
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
requires-python = ">=3.8"
|
|
13
|
+
keywords = ["big data", "exam", "codes", "snippets", "education"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Operating System :: OS Independent",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
[project.urls]
|
|
21
|
+
Homepage = "https://pypi.org/project/bigdata-helper/"
|