angels 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- angels-0.1.0/LICENSE +21 -0
- angels-0.1.0/PKG-INFO +19 -0
- angels-0.1.0/README.md +96 -0
- angels-0.1.0/pyproject.toml +29 -0
- angels-0.1.0/setup.cfg +4 -0
- angels-0.1.0/src/angels/__init__.py +7 -0
- angels-0.1.0/src/angels/analysis_algorithms/__init__.py +1 -0
- angels-0.1.0/src/angels/analysis_algorithms/analysis_algorithms.py +48 -0
- angels-0.1.0/src/angels/data_acquisition/__init__.py +1 -0
- angels-0.1.0/src/angels/data_acquisition/data_acquisition.py +85 -0
- angels-0.1.0/src/angels/data_cleaning/__init__.py +1 -0
- angels-0.1.0/src/angels/data_cleaning/data_cleaning.py +50 -0
- angels-0.1.0/src/angels/logic_processing/__init__.py +1 -0
- angels-0.1.0/src/angels/logic_processing/logic_processing.py +38 -0
- angels-0.1.0/src/angels/visualization/__init__.py +1 -0
- angels-0.1.0/src/angels/visualization/visualization.py +62 -0
- angels-0.1.0/src/angels.egg-info/PKG-INFO +19 -0
- angels-0.1.0/src/angels.egg-info/SOURCES.txt +19 -0
- angels-0.1.0/src/angels.egg-info/dependency_links.txt +1 -0
- angels-0.1.0/src/angels.egg-info/requires.txt +8 -0
- angels-0.1.0/src/angels.egg-info/top_level.txt +1 -0
angels-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 aa-ds-kits
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
angels-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: angels
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: 数据分析过程工具整合包
|
|
5
|
+
Author: Author
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Requires-Python: >=3.6
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Requires-Dist: pandas
|
|
12
|
+
Requires-Dist: numpy
|
|
13
|
+
Requires-Dist: matplotlib
|
|
14
|
+
Requires-Dist: seaborn
|
|
15
|
+
Requires-Dist: scikit-learn
|
|
16
|
+
Requires-Dist: requests
|
|
17
|
+
Requires-Dist: beautifulsoup4
|
|
18
|
+
Requires-Dist: pymysql
|
|
19
|
+
Dynamic: license-file
|
angels-0.1.0/README.md
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# angels
|
|
2
|
+
|
|
3
|
+
数据分析过程工具整合包,包含数据获取、数据清洗、逻辑加工、分析算法、可视化等五个核心模块。
|
|
4
|
+
|
|
5
|
+
## 安装
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install angels
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## 打包说明
|
|
12
|
+
|
|
13
|
+
本项目使用现代的 `pyproject.toml` 打包方式,替代了传统的 `setup.py`。
|
|
14
|
+
|
|
15
|
+
## 核心模块
|
|
16
|
+
|
|
17
|
+
### 1. 数据获取 (data_acquisition)
|
|
18
|
+
|
|
19
|
+
- `load_csv(file_path)`: 加载CSV文件
|
|
20
|
+
- `load_excel(file_path, sheet_name=0)`: 加载Excel文件
|
|
21
|
+
- `load_json(file_path)`: 加载JSON文件
|
|
22
|
+
- `load_from_api(url, params=None)`: 从API获取数据
|
|
23
|
+
- `load_from_web(url, table_selector)`: 从网页表格获取数据
|
|
24
|
+
|
|
25
|
+
### 2. 数据清洗 (data_cleaning)
|
|
26
|
+
|
|
27
|
+
- `remove_duplicates(df)`: 移除重复行
|
|
28
|
+
- `handle_missing_values(df, strategy='drop', fill_value=None)`: 处理缺失值
|
|
29
|
+
- `convert_data_types(df, dtypes=None)`: 转换数据类型
|
|
30
|
+
- `remove_outliers(df, columns=None, method='iqr', threshold=1.5)`: 移除异常值
|
|
31
|
+
- `standardize_columns(df)`: 标准化列名
|
|
32
|
+
|
|
33
|
+
### 3. 逻辑加工 (logic_processing)
|
|
34
|
+
|
|
35
|
+
- `group_by_aggregate(df, group_by, aggregations)`: 按列分组并聚合
|
|
36
|
+
- `pivot_table(df, index, columns, values, aggfunc='mean')`: 创建透视表
|
|
37
|
+
- `calculate_rolling_stats(df, column, window, stats=['mean', 'std'])`: 计算滚动统计量
|
|
38
|
+
- `calculate_diff(df, column, periods=1)`: 计算差值
|
|
39
|
+
- `merge_dataframes(df1, df2, on=None, how='inner')`: 合并数据框
|
|
40
|
+
- `create_features(df)`: 创建特征
|
|
41
|
+
|
|
42
|
+
### 4. 分析算法 (analysis_algorithms)
|
|
43
|
+
|
|
44
|
+
- `descriptive_statistics(df)`: 计算描述性统计量
|
|
45
|
+
- `correlation_analysis(df, method='pearson')`: 计算相关性矩阵
|
|
46
|
+
- `kmeans_clustering(df, n_clusters=3, random_state=42)`: K均值聚类
|
|
47
|
+
- `linear_regression(X, y)`: 线性回归
|
|
48
|
+
- `time_series_analysis(df, time_column, value_column)`: 时间序列分析
|
|
49
|
+
- `hypothesis_testing(sample1, sample2)`: 假设检验
|
|
50
|
+
|
|
51
|
+
### 5. 可视化 (visualization)
|
|
52
|
+
|
|
53
|
+
- `plot_histogram(df, column, bins=30, title=None)`: 绘制直方图
|
|
54
|
+
- `plot_scatter(df, x, y, hue=None, title=None)`: 绘制散点图
|
|
55
|
+
- `plot_bar(df, x, y, title=None)`: 绘制柱状图
|
|
56
|
+
- `plot_box(df, x, y=None, title=None)`: 绘制箱线图
|
|
57
|
+
- `plot_correlation_heatmap(df, title=None)`: 绘制相关性热图
|
|
58
|
+
- `plot_time_series(df, x, y, title=None)`: 绘制时间序列图
|
|
59
|
+
|
|
60
|
+
## 使用示例
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from angels import *
|
|
64
|
+
|
|
65
|
+
# 1. 数据获取
|
|
66
|
+
df = load_csv('data.csv')
|
|
67
|
+
|
|
68
|
+
# 2. 数据清洗
|
|
69
|
+
df = remove_duplicates(df)
|
|
70
|
+
df = handle_missing_values(df, strategy='mean')
|
|
71
|
+
df = standardize_columns(df)
|
|
72
|
+
|
|
73
|
+
# 3. 逻辑加工
|
|
74
|
+
df = group_by_aggregate(df, 'category', {'value': 'sum'})
|
|
75
|
+
|
|
76
|
+
# 4. 分析算法
|
|
77
|
+
stats = descriptive_statistics(df)
|
|
78
|
+
corr = correlation_analysis(df)
|
|
79
|
+
|
|
80
|
+
# 5. 可视化
|
|
81
|
+
plt = plot_histogram(df, 'value')
|
|
82
|
+
plt.show()
|
|
83
|
+
|
|
84
|
+
plt = plot_correlation_heatmap(df)
|
|
85
|
+
plt.show()
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## 依赖
|
|
89
|
+
|
|
90
|
+
- pandas
|
|
91
|
+
- numpy
|
|
92
|
+
- matplotlib
|
|
93
|
+
- seaborn
|
|
94
|
+
- scikit-learn
|
|
95
|
+
- requests
|
|
96
|
+
- beautifulsoup4
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "angels"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "数据分析过程工具整合包"
|
|
9
|
+
authors = [
|
|
10
|
+
{ name = "Author" }
|
|
11
|
+
]
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Programming Language :: Python :: 3",
|
|
14
|
+
"License :: OSI Approved :: MIT License",
|
|
15
|
+
"Operating System :: OS Independent",
|
|
16
|
+
]
|
|
17
|
+
requires-python = ">=3.6"
|
|
18
|
+
dependencies = [
|
|
19
|
+
"pandas",
|
|
20
|
+
"numpy",
|
|
21
|
+
"matplotlib",
|
|
22
|
+
"seaborn",
|
|
23
|
+
"scikit-learn",
|
|
24
|
+
"requests",
|
|
25
|
+
"beautifulsoup4",
|
|
26
|
+
"pymysql"
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
|
angels-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .analysis_algorithms import *
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
from sklearn.cluster import KMeans
|
|
4
|
+
from sklearn.linear_model import LinearRegression
|
|
5
|
+
from sklearn.preprocessing import StandardScaler
|
|
6
|
+
|
|
7
|
+
def descriptive_statistics(df):
    """Return summary statistics (count, mean, std, min/max, quartiles) for *df*.

    Thin wrapper around ``pandas.DataFrame.describe``.
    """
    summary = df.describe()
    return summary
|
|
10
|
+
|
|
11
|
+
def correlation_analysis(df, method='pearson'):
    """Return the pairwise correlation matrix of *df*'s numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
    method : str
        'pearson' (default), 'kendall', or 'spearman'.

    Non-numeric columns are excluded explicitly: under pandas >= 2.0 a bare
    ``df.corr()`` raises on mixed-dtype frames instead of dropping them.
    """
    numeric = df.select_dtypes(include=[np.number])
    return numeric.corr(method=method)
|
|
14
|
+
|
|
15
|
+
def kmeans_clustering(df, n_clusters=3, random_state=42):
    """Cluster the numeric columns of *df* with K-means.

    Features are standardized (zero mean, unit variance) before fitting.
    Adds a ``cluster`` label column to *df* (mutating the caller's frame)
    and returns ``(df, fitted_kmeans_model)``.
    """
    numeric_part = df.select_dtypes(include=[np.number])
    features = StandardScaler().fit_transform(numeric_part)
    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    df['cluster'] = model.fit_predict(features)
    return df, model
|
|
23
|
+
|
|
24
|
+
def linear_regression(X, y):
    """Fit an ordinary-least-squares model of *y* on *X*.

    Returns ``(fitted_model, in_sample_predictions)``.
    """
    estimator = LinearRegression().fit(X, y)
    fitted = estimator.predict(X)
    return estimator, fitted
|
|
30
|
+
|
|
31
|
+
def time_series_analysis(df, time_column, value_column, window=7, yoy_periods=365):
    """Index *df* by *time_column* and append rolling/growth columns.

    Parameters
    ----------
    df : pandas.DataFrame
    time_column : str
        Column parseable by ``pd.to_datetime``; becomes the index.
    value_column : str
        Numeric column to analyse.
    window : int
        Rolling-mean window in rows (default 7, the original hard-coded value).
    yoy_periods : int
        Shift used for the growth rate (default 365 rows — year-over-year
        only when the data is daily and gap-free; adjust otherwise).

    Works on a copy so the caller's frame is untouched (the original
    converted the time column in place). Returns the time-indexed copy with
    ``moving_average`` and ``year_over_year`` columns appended.
    """
    df = df.copy()
    df[time_column] = pd.to_datetime(df[time_column])
    df = df.sort_values(time_column)
    df.set_index(time_column, inplace=True)

    # Trailing rolling mean over `window` rows.
    df['moving_average'] = df[value_column].rolling(window=window).mean()
    # Percentage change versus `yoy_periods` rows earlier.
    df['year_over_year'] = df[value_column].pct_change(periods=yoy_periods)

    return df
|
|
43
|
+
|
|
44
|
+
def hypothesis_testing(sample1, sample2):
    """Two-sided independent-samples t-test.

    Returns ``(t_statistic, p_value)`` for the null hypothesis that the
    two samples have equal means.
    """
    from scipy import stats
    result = stats.ttest_ind(sample1, sample2)
    return result[0], result[1]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .data_acquisition import *
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import requests
|
|
3
|
+
from bs4 import BeautifulSoup
|
|
4
|
+
import json
|
|
5
|
+
import pymysql
|
|
6
|
+
from pymysql.cursors import DictCursor
|
|
7
|
+
import os
|
|
8
|
+
|
|
9
|
+
def load_csv(file_path):
    """Read a CSV file (path or file-like object) into a DataFrame."""
    frame = pd.read_csv(file_path)
    return frame
|
|
12
|
+
|
|
13
|
+
def load_excel(file_path, sheet_name=0):
    """Read one sheet of an Excel workbook (first sheet by default)
    into a DataFrame."""
    frame = pd.read_excel(file_path, sheet_name=sheet_name)
    return frame
|
|
16
|
+
|
|
17
|
+
def load_json(file_path):
    """Read a UTF-8 JSON file and return its contents as a DataFrame."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)
    return pd.DataFrame(payload)
|
|
22
|
+
|
|
23
|
+
def load_from_api(url, params=None):
    """GET *url* (optionally with query *params*) and frame the JSON body.

    Raises ``requests.HTTPError`` on a non-2xx response.
    """
    resp = requests.get(url, params=params)
    resp.raise_for_status()
    payload = resp.json()
    return pd.DataFrame(payload)
|
|
28
|
+
|
|
29
|
+
def load_from_web(url, table_selector):
    """Scrape the first HTML table matching *table_selector* from *url*.

    Raises
    ------
    requests.HTTPError
        On a non-2xx response.
    ValueError
        When the selector matches nothing — the original code passed
        ``str(None)`` to pandas, which failed with an obscure parse error.
    """
    import io  # local import keeps this fix self-contained

    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.select_one(table_selector)
    if table is None:
        raise ValueError(f"No element matches selector: {table_selector}")
    # Wrap in StringIO: pandas deprecated passing literal HTML to read_html.
    return pd.read_html(io.StringIO(str(table)))[0]
|
|
36
|
+
|
|
37
|
+
def get_db_config():
    """Load the database connection config from the JSON file whose path is
    given by the ``DB_INFO`` environment variable.

    Returns the parsed JSON object — presumably a mapping of database
    identifiers to pymysql connection kwargs (see ``fetch_dataframe``);
    the schema is not validated here.

    Raises ``ValueError`` when DB_INFO is unset, ``FileNotFoundError`` when
    the file is missing, ``PermissionError`` when it cannot be read.
    """
    # Read the config-file path from the environment; deliberately no default.
    CONFIG_PATH = os.getenv('DB_INFO')
    if not CONFIG_PATH:
        # Help text (user-facing, Chinese) shown when the variable is unset.
        error_msg = "环境变量 DB_INFO 未设置。请按照以下方式配置:\n"
        error_msg += "1. 环境变量配置:\n"
        error_msg += " - Windows (图形界面): 设置环境变量 DB_INFO 为配置文件路径\n"
        error_msg += " - Windows (PowerShell): 执行 $env:DB_INFO='配置文件路径'\n"
        error_msg += " - Linux/Mac: 执行 export DB_INFO=配置文件路径\n"
        raise ValueError(error_msg)

    try:
        # Attempt to read and parse the config file.
        with open(CONFIG_PATH) as f:
            return json.load(f)
    except FileNotFoundError:
        # Message for a missing config file.
        error_msg = f"配置文件 {CONFIG_PATH} 不存在。请确保文件路径正确。\n"
        raise FileNotFoundError(error_msg)
    except PermissionError:
        # Message for insufficient read permissions.
        raise PermissionError(f"没有权限读取配置文件 {CONFIG_PATH}")
|
|
61
|
+
|
|
62
|
+
# Originally loaded eagerly at import time, which made the whole package
# unimportable whenever DB_INFO was unset. Now loaded lazily on first use.
DB_CONFIG = None


def _ensure_db_config():
    """Load and cache the DB config on first use; return the mapping."""
    global DB_CONFIG
    if DB_CONFIG is None:
        DB_CONFIG = get_db_config()
    return DB_CONFIG


def fetch_dataframe(sql, db_name='rpt'):
    """Run *sql* against the configured database and return a DataFrame.

    Parameters
    ----------
    sql : str
        Query executed verbatim — do not build it from untrusted input;
        prefer parameterized SQL at the call site.
    db_name : str
        Key into the DB_INFO config mapping (default 'rpt').

    Returns an empty DataFrame when the query yields no rows.
    Raises ``ValueError`` for an unknown *db_name*; the connection is
    always closed, even on error.
    """
    config = _ensure_db_config()
    if db_name not in config:
        raise ValueError(f"未知数据库标识: {db_name}")
    # Connection parameters come straight from the config file entry.
    conn = pymysql.connect(**config[db_name])
    try:
        with conn.cursor(DictCursor) as cursor:
            cursor.execute(sql)
            rows = cursor.fetchall()
        return pd.DataFrame(rows) if rows else pd.DataFrame()
    finally:
        conn.close()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .data_cleaning import *
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
def remove_duplicates(df):
    """Return *df* with exact duplicate rows dropped (original index kept)."""
    deduplicated = df.drop_duplicates()
    return deduplicated
|
|
7
|
+
|
|
8
|
+
def handle_missing_values(df, strategy='drop', fill_value=None):
    """Handle NaNs in *df* according to *strategy*.

    strategy:
        'drop'   -- drop rows containing any NaN
        'fill'   -- fill NaNs with *fill_value*
        'mean'   -- fill NaNs in numeric columns with the column mean
        'median' -- fill NaNs in numeric columns with the column median

    'mean'/'median' pass ``numeric_only=True``: the original's bare
    ``df.mean()`` raises on mixed-type frames under pandas >= 2.0.

    Raises ValueError for an unknown strategy.
    """
    if strategy == 'drop':
        return df.dropna()
    if strategy == 'fill':
        return df.fillna(fill_value)
    if strategy == 'mean':
        return df.fillna(df.mean(numeric_only=True))
    if strategy == 'median':
        return df.fillna(df.median(numeric_only=True))
    raise ValueError("Invalid strategy. Use 'drop', 'fill', 'mean', or 'median'")
|
|
20
|
+
|
|
21
|
+
def convert_data_types(df, dtypes=None):
    """Cast columns to *dtypes* (a column -> dtype mapping) when given;
    otherwise let pandas infer better dtypes for object columns."""
    if not dtypes:
        return df.infer_objects()
    return df.astype(dtypes)
|
|
27
|
+
|
|
28
|
+
def remove_outliers(df, columns=None, method='iqr', threshold=1.5):
    """Drop rows whose values fall outside the chosen outlier bounds.

    method='iqr'    : keep rows within [Q1 - t*IQR, Q3 + t*IQR] per column
    method='zscore' : keep rows with |z| < threshold per column

    Defaults to all numeric columns. Columns are filtered one after
    another, so each pass shrinks the frame the next column sees.
    Returns the filtered frame; the input is not mutated.
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns

    for name in columns:
        series = df[name]
        if method == 'iqr':
            q1, q3 = series.quantile(0.25), series.quantile(0.75)
            spread = q3 - q1
            low, high = q1 - threshold * spread, q3 + threshold * spread
            df = df[(df[name] >= low) & (df[name] <= high)]
        elif method == 'zscore':
            z = np.abs((series - series.mean()) / series.std())
            df = df[z < threshold]

    return df
|
|
46
|
+
|
|
47
|
+
def standardize_columns(df):
    """Normalize column names: strip whitespace, lowercase, and replace
    spaces with underscores.

    Mutates *df* in place and also returns it (the original contract).
    Non-string labels (e.g. integer columns) are converted with ``str()``
    first; the original crashed on them with AttributeError.
    """
    df.columns = [str(col).strip().lower().replace(' ', '_') for col in df.columns]
    return df
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .logic_processing import *
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
def group_by_aggregate(df, group_by, aggregations):
    """Group *df* by *group_by*, apply *aggregations* (a column ->
    function mapping), and return a frame with a flat integer index."""
    grouped = df.groupby(group_by)
    return grouped.agg(aggregations).reset_index()
|
|
7
|
+
|
|
8
|
+
def pivot_table(df, index, columns, values, aggfunc='mean'):
    """Build a pivot of *values* over *index* x *columns*, aggregated
    with *aggfunc* (default 'mean'); wraps ``pd.pivot_table``."""
    table = pd.pivot_table(
        df, index=index, columns=columns, values=values, aggfunc=aggfunc
    )
    return table
|
|
11
|
+
|
|
12
|
+
def calculate_rolling_stats(df, column, window, stats=('mean', 'std')):
    """Append rolling statistics of *column* computed over *window* rows.

    *stats* may contain 'mean', 'std', and/or 'sum'; unknown entries are
    silently ignored (original behavior). New columns are named
    ``{column}_rolling_{window}_{stat}``. Mutates and returns *df*.

    The original used a mutable list default; an equivalent immutable
    tuple default avoids the shared-mutable-default pitfall.
    """
    roller = df[column].rolling(window=window)
    # Map stat name -> bound rolling aggregation.
    available = {'mean': roller.mean, 'std': roller.std, 'sum': roller.sum}
    for stat in stats:
        if stat in available:
            df[f'{column}_rolling_{window}_{stat}'] = available[stat]()
    return df
|
|
22
|
+
|
|
23
|
+
def calculate_diff(df, column, periods=1):
    """Append ``{column}_diff_{periods}``: the difference of *column*
    versus the value *periods* rows earlier. Mutates and returns *df*."""
    result_name = f'{column}_diff_{periods}'
    df[result_name] = df[column].diff(periods=periods)
    return df
|
|
27
|
+
|
|
28
|
+
def merge_dataframes(df1, df2, on=None, how='inner'):
    """Join *df1* and *df2* on key(s) *on* using join type *how*
    ('inner' by default); thin wrapper over ``pd.merge``."""
    merged = pd.merge(df1, df2, on=on, how=how)
    return merged
|
|
31
|
+
|
|
32
|
+
def create_features(df):
    """Append example engineered features for every numeric column:
    ``{col}_log`` (log1p transform) and ``{col}_squared``.

    Mutates and returns *df*. Uses ``np.log1p(x)`` instead of the
    original ``np.log(x + 1)``: mathematically identical but numerically
    accurate for small x. Values below -1 still produce NaN.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        df[f'{col}_log'] = np.log1p(df[col])   # log transform
        df[f'{col}_squared'] = df[col] ** 2    # square
    return df
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .visualization import *
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import matplotlib.pyplot as plt
|
|
2
|
+
import seaborn as sns
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
def plot_histogram(df, column, bins=30, title=None):
    """Draw a histogram (with KDE curve) of ``df[column]``.

    Returns the pyplot module so callers can ``.show()`` / ``.savefig()``.
    """
    heading = title or f'{column} 分布'
    plt.figure(figsize=(10, 6))
    sns.histplot(df[column], bins=bins, kde=True)
    plt.xlabel(column)
    plt.ylabel('频率')
    plt.title(heading)
    plt.tight_layout()
    return plt
|
|
15
|
+
|
|
16
|
+
def plot_scatter(df, x, y, hue=None, title=None):
    """Scatter plot of *y* against *x*, optionally colored by *hue*.

    Returns the pyplot module.
    """
    heading = title or f'{x} vs {y}'
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=df, x=x, y=y, hue=hue)
    plt.title(heading)
    plt.tight_layout()
    return plt
|
|
23
|
+
|
|
24
|
+
def plot_bar(df, x, y, title=None):
    """Bar chart of *y* per *x*, with x-labels rotated 45 degrees.

    Returns the pyplot module.
    """
    heading = title or f'{y} by {x}'
    plt.figure(figsize=(10, 6))
    sns.barplot(data=df, x=x, y=y)
    plt.title(heading)
    plt.xticks(rotation=45)
    plt.tight_layout()
    return plt
|
|
32
|
+
|
|
33
|
+
def plot_box(df, x, y=None, title=None):
    """Box plot: grouped as x/y when *y* is given, otherwise the single
    column ``df[x]``.

    Returns the pyplot module.
    """
    plt.figure(figsize=(10, 6))
    if not y:
        sns.boxplot(data=df[x])
    else:
        sns.boxplot(data=df, x=x, y=y)
    plt.title(title or f'{y or x} 分布')
    plt.xticks(rotation=45)
    plt.tight_layout()
    return plt
|
|
44
|
+
|
|
45
|
+
def plot_correlation_heatmap(df, title=None):
    """Annotated heatmap of ``df.corr()`` on a fixed [-1, 1] color scale.

    Returns the pyplot module.
    """
    plt.figure(figsize=(12, 10))
    matrix = df.corr()
    sns.heatmap(matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
    plt.title(title or '相关性矩阵')
    plt.tight_layout()
    return plt
|
|
53
|
+
|
|
54
|
+
def plot_time_series(df, x, y, title=None):
    """Line plot of ``df[y]`` over ``df[x]``.

    Returns the pyplot module.
    """
    heading = title or f'{y} 时间序列'
    plt.figure(figsize=(12, 6))
    plt.plot(df[x], df[y])
    plt.xlabel('时间')
    plt.ylabel(y)
    plt.title(heading)
    plt.tight_layout()
    return plt
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: angels
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: 数据分析过程工具整合包
|
|
5
|
+
Author: Author
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Requires-Python: >=3.6
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Requires-Dist: pandas
|
|
12
|
+
Requires-Dist: numpy
|
|
13
|
+
Requires-Dist: matplotlib
|
|
14
|
+
Requires-Dist: seaborn
|
|
15
|
+
Requires-Dist: scikit-learn
|
|
16
|
+
Requires-Dist: requests
|
|
17
|
+
Requires-Dist: beautifulsoup4
|
|
18
|
+
Requires-Dist: pymysql
|
|
19
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/angels/__init__.py
|
|
5
|
+
src/angels.egg-info/PKG-INFO
|
|
6
|
+
src/angels.egg-info/SOURCES.txt
|
|
7
|
+
src/angels.egg-info/dependency_links.txt
|
|
8
|
+
src/angels.egg-info/requires.txt
|
|
9
|
+
src/angels.egg-info/top_level.txt
|
|
10
|
+
src/angels/analysis_algorithms/__init__.py
|
|
11
|
+
src/angels/analysis_algorithms/analysis_algorithms.py
|
|
12
|
+
src/angels/data_acquisition/__init__.py
|
|
13
|
+
src/angels/data_acquisition/data_acquisition.py
|
|
14
|
+
src/angels/data_cleaning/__init__.py
|
|
15
|
+
src/angels/data_cleaning/data_cleaning.py
|
|
16
|
+
src/angels/logic_processing/__init__.py
|
|
17
|
+
src/angels/logic_processing/logic_processing.py
|
|
18
|
+
src/angels/visualization/__init__.py
|
|
19
|
+
src/angels/visualization/visualization.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
angels
|