ai-data-science-team 0.0.0.9005__py3-none-any.whl → 0.0.0.9007__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in the public registry.
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/agents/__init__.py +3 -1
- ai_data_science_team/agents/data_cleaning_agent.py +213 -20
- ai_data_science_team/agents/data_visualization_agent.py +331 -0
- ai_data_science_team/agents/data_wrangling_agent.py +66 -24
- ai_data_science_team/agents/feature_engineering_agent.py +50 -13
- ai_data_science_team/agents/sql_database_agent.py +397 -0
- ai_data_science_team/templates/__init__.py +8 -0
- ai_data_science_team/templates/agent_templates.py +154 -37
- ai_data_science_team/tools/logging.py +1 -1
- ai_data_science_team/tools/metadata.py +230 -0
- ai_data_science_team/tools/regex.py +7 -1
- {ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/METADATA +43 -22
- ai_data_science_team-0.0.0.9007.dist-info/RECORD +21 -0
- {ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/WHEEL +1 -1
- ai_data_science_team/tools/data_analysis.py +0 -116
- ai_data_science_team-0.0.0.9005.dist-info/RECORD +0 -19
- {ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/top_level.txt +0 -0
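The per-file counts above come from comparing the contents of the two published wheels entry by entry. As a rough illustration only (not the tooling used to generate this page), a standard-library sketch of such a comparison might look like the following; the wheel filenames are assumed to follow the usual naming convention, and everything else is illustrative.

# Rough sketch: compare member file lists and text contents of two wheels and
# print per-file added/removed line counts. Assumes both wheels were downloaded
# beforehand, e.g. `pip download ai-data-science-team==0.0.0.9005 --no-deps`.
# Renamed paths (such as the dist-info directory) appear as separate
# removals/additions in this naive version.
import difflib
import zipfile

OLD = "ai_data_science_team-0.0.0.9005-py3-none-any.whl"  # assumed filenames
NEW = "ai_data_science_team-0.0.0.9007-py3-none-any.whl"

def read_wheel(path: str) -> dict:
    """Map each archive member name to its decoded text content."""
    with zipfile.ZipFile(path) as zf:
        return {name: zf.read(name).decode("utf-8", errors="replace")
                for name in zf.namelist()}

old_files, new_files = read_wheel(OLD), read_wheel(NEW)
for name in sorted(set(old_files) | set(new_files)):
    old_lines = old_files.get(name, "").splitlines(keepends=True)
    new_lines = new_files.get(name, "").splitlines(keepends=True)
    diff = list(difflib.unified_diff(old_lines, new_lines, fromfile=name, tofile=name))
    if diff:
        added = sum(l.startswith("+") and not l.startswith("+++") for l in diff)
        removed = sum(l.startswith("-") and not l.startswith("---") for l in diff)
        print(f"{name} +{added} -{removed}")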
ai_data_science_team/tools/data_analysis.py
DELETED
@@ -1,116 +0,0 @@
-import io
-import pandas as pd
-from typing import Union, List, Dict
-
-def summarize_dataframes(
-    dataframes: Union[pd.DataFrame, List[pd.DataFrame], Dict[str, pd.DataFrame]]
-) -> List[str]:
-    """
-    Generate a summary for one or more DataFrames. Accepts a single DataFrame, a list of DataFrames,
-    or a dictionary mapping names to DataFrames.
-
-    Parameters
-    ----------
-    dataframes : pandas.DataFrame or list of pandas.DataFrame or dict of (str -> pandas.DataFrame)
-        - Single DataFrame: produce a single summary (returned within a one-element list).
-        - List of DataFrames: produce a summary for each DataFrame, using index-based names.
-        - Dictionary of DataFrames: produce a summary for each DataFrame, using dictionary keys as names.
-
-    Example:
-    --------
-    ``` python
-    import pandas as pd
-    from sklearn.datasets import load_iris
-    data = load_iris(as_frame=True)
-    dataframes = {
-        "iris": data.frame,
-        "iris_target": data.target,
-    }
-    summaries = summarize_dataframes(dataframes)
-    print(summaries[0])
-    ```
-
-    Returns
-    -------
-    list of str
-        A list of summaries, one for each provided DataFrame. Each summary includes:
-        - Shape of the DataFrame (rows, columns)
-        - Column data types
-        - Missing value percentage
-        - Unique value counts
-        - First 30 rows
-        - Descriptive statistics
-        - DataFrame info output
-    """
-
-    summaries = []
-
-    # --- Dictionary Case ---
-    if isinstance(dataframes, dict):
-        for dataset_name, df in dataframes.items():
-            summaries.append(_summarize_dataframe(df, dataset_name))
-
-    # --- Single DataFrame Case ---
-    elif isinstance(dataframes, pd.DataFrame):
-        summaries.append(_summarize_dataframe(dataframes, "Single_Dataset"))
-
-    # --- List of DataFrames Case ---
-    elif isinstance(dataframes, list):
-        for idx, df in enumerate(dataframes):
-            dataset_name = f"Dataset_{idx}"
-            summaries.append(_summarize_dataframe(df, dataset_name))
-
-    else:
-        raise TypeError(
-            "Input must be a single DataFrame, a list of DataFrames, or a dictionary of DataFrames."
-        )
-
-    return summaries
-
-
-def _summarize_dataframe(df: pd.DataFrame, dataset_name: str) -> str:
-    """Generate a summary string for a single DataFrame."""
-    # 1. Convert dictionary-type cells to strings.
-    #    This prevents unhashable dict errors during df.nunique().
-    df = df.apply(lambda col: col.map(lambda x: str(x) if isinstance(x, dict) else x))
-
-    # 2. Capture df.info() output
-    buffer = io.StringIO()
-    df.info(buf=buffer)
-    info_text = buffer.getvalue()
-
-    # 3. Calculate missing value stats
-    missing_stats = (df.isna().sum() / len(df) * 100).sort_values(ascending=False)
-    missing_summary = "\n".join([f"{col}: {val:.2f}%" for col, val in missing_stats.items()])
-
-    # 4. Get column data types
-    column_types = "\n".join([f"{col}: {dtype}" for col, dtype in df.dtypes.items()])
-
-    # 5. Get unique value counts
-    unique_counts = df.nunique()  # Will no longer fail on unhashable dict
-    unique_counts_summary = "\n".join([f"{col}: {count}" for col, count in unique_counts.items()])
-
-    summary_text = f"""
-    Dataset Name: {dataset_name}
-    ----------------------------
-    Shape: {df.shape[0]} rows x {df.shape[1]} columns
-
-    Column Data Types:
-    {column_types}
-
-    Missing Value Percentage:
-    {missing_summary}
-
-    Unique Value Counts:
-    {unique_counts_summary}
-
-    Data (first 30 rows):
-    {df.head(30).to_string()}
-
-    Data Description:
-    {df.describe().to_string()}
-
-    Data Info:
-    {info_text}
-    """
-    return summary_text.strip()
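Downstream code that imported summarize_dataframes from the removed tools/data_analysis.py module will break on upgrade; the file list above shows a new tools/metadata.py (+230 lines), but its API is not visible in this section of the diff. A hedged sketch of a compatibility shim, where the fallback is a trimmed-down reimplementation of the removed helper covering only the single-DataFrame case, might look like this.

# Hypothetical compatibility shim for callers upgrading past 0.0.0.9005.
# The old import path below is taken from this diff; the fallback is a minimal
# reimplementation of the removed helper (single-DataFrame case only), not the
# replacement API shipped in 0.0.0.9007.
try:
    from ai_data_science_team.tools.data_analysis import summarize_dataframes
except ImportError:
    import io
    from typing import List

    import pandas as pd

    def summarize_dataframes(df: pd.DataFrame) -> List[str]:
        """Return a one-element list with a plain-text summary of `df`."""
        buffer = io.StringIO()
        df.info(buf=buffer)
        missing = (df.isna().sum() / len(df) * 100).sort_values(ascending=False)
        return [
            f"Shape: {df.shape[0]} rows x {df.shape[1]} columns\n\n"
            f"Missing Value Percentage:\n{missing.round(2).to_string()}\n\n"
            f"Data (first 30 rows):\n{df.head(30).to_string()}\n\n"
            f"Data Info:\n{buffer.getvalue()}"
        ]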
ai_data_science_team-0.0.0.9005.dist-info/RECORD
DELETED
@@ -1,19 +0,0 @@
-ai_data_science_team/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ai_data_science_team/_version.py,sha256=7tA8TocqCCzLkcB4ptV6bn3k5ni-0TGZvGnVBzmbeIc,26
-ai_data_science_team/orchestration.py,sha256=xiIFOsrLwPdkSmtme7wNCCGv8XopnMTNElNzlZokL-4,303
-ai_data_science_team/agents/__init__.py,sha256=DtwQgyeG3Q4rQ-NrMbva-jshVQyULaWW1RrnETQGZOY,270
-ai_data_science_team/agents/data_cleaning_agent.py,sha256=0K-CgngGjamRk_QzMqNkplrI-ddCbtruQ7kjGrsRIN8,14390
-ai_data_science_team/agents/data_wrangling_agent.py,sha256=uQBJ8vQwrXubQgaI9_UoNZnVQjIEBUOh3dTmNdg326k,14581
-ai_data_science_team/agents/feature_engineering_agent.py,sha256=QEqXTsfjllUj4Wgsw4nNGUT6r9Y6q629ZNgqGy3Dbbk,15921
-ai_data_science_team/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ai_data_science_team/templates/agent_templates.py,sha256=gT48Pq9KlrrrF0yigodGl_BdptmowTJ2rEWUqh7g5E0,15410
-ai_data_science_team/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ai_data_science_team/tools/data_analysis.py,sha256=V7e6_fZA01mosFf5VcLwBcpiMVf7fClZMjTrj-egK-o,3715
-ai_data_science_team/tools/logging.py,sha256=EU5EMg4Y0-Yhqf1vAEFg0eRvSTx8uF0LTOAKss8-T2M,2073
-ai_data_science_team/tools/parsers.py,sha256=BAi-fJT7BBt9nRS3w5n9LDTsu7JAJsH8CAI9-Qf7jCs,2086
-ai_data_science_team/tools/regex.py,sha256=KTH2SXPJT8Tzmj7CufyeET-FbA9BMhRzFlPKr4Tan3g,2320
-ai_data_science_team-0.0.0.9005.dist-info/LICENSE,sha256=Xif0IRLdd2HGLATxV2EVp91aSY6KOuacRr_6BorKGzA,1084
-ai_data_science_team-0.0.0.9005.dist-info/METADATA,sha256=PC6rJR965hPu02LtZrzHICkd3QeWzh2A35axTLjE9hM,5840
-ai_data_science_team-0.0.0.9005.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-ai_data_science_team-0.0.0.9005.dist-info/top_level.txt,sha256=CnoMgOphCoAdGTLueWdCVByVyjwOubaGiTB1lchdy4M,21
-ai_data_science_team-0.0.0.9005.dist-info/RECORD,,
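Each entry in the RECORD above follows the standard wheel format: path,sha256=<hash>,<size>, where the hash is the urlsafe base64 encoding of the file's SHA-256 digest with trailing "=" padding stripped (which is why the empty __init__.py files all share the same 47DEQpj8... value). A short sketch of how such a value can be recomputed:

# Recompute a RECORD-style hash for a file extracted from the wheel.
# The path passed in is up to the caller; the encoding rule itself is the
# standard wheel RECORD convention (urlsafe base64 of the SHA-256 digest,
# without padding).
import base64
import hashlib

def record_hash(path: str) -> str:
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# An empty file hashes to "47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU",
# matching the zero-byte __init__.py entries listed above.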
{ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/LICENSE
RENAMED
File without changes