dsr-data-tools 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Scott Roberts
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,66 @@
1
+ Metadata-Version: 2.4
2
+ Name: dsr-data-tools
3
+ Version: 0.0.1
4
+ Summary: Generic data handling utilities including data splitting and analysis.
5
+ Author-email: Scott Roberts <scottrdeveloper@gmail.com>
6
+ License: MIT
7
+ Keywords: data,splitting,analysis,ml-data
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Requires-Python: >=3.10
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: dsr-utils>=0.0.1
20
+ Requires-Dist: numpy>=2.0.0
21
+ Requires-Dist: pandas>=2.0.0
22
+ Provides-Extra: dev
23
+ Requires-Dist: pytest>=7.0; extra == "dev"
24
+ Requires-Dist: black>=23.0; extra == "dev"
25
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
26
+ Dynamic: license-file
27
+
28
+ # dsr-data-tools
29
+
30
+ Data analysis and exploration tools for exploratory data analysis (EDA).
31
+
32
+ ## Features
33
+
34
+ - **Dataset Analysis**: Comprehensive statistical summaries and data quality assessment
35
+ - **Data Exploration**: Tools for understanding data distributions, correlations, and patterns
36
+ - **Quality Metrics**: Missing value detection, data type analysis, and anomaly identification
37
+
38
+ ## Installation
39
+
40
+ ```bash
41
+ pip install dsr-data-tools
42
+ ```
43
+
44
+ ## Usage
45
+
46
+ ```python
47
+ import pandas as pd
48
+ from dsr_data_tools import analyze_dataset
49
+
50
+ # Load your data
51
+ df = pd.read_csv('data.csv')
52
+
53
+ # Perform comprehensive analysis
54
+ analyze_dataset(df)
55
+ ```
56
+
57
+ ## Requirements
58
+
59
+ - Python >= 3.10
60
+ - pandas
61
+ - numpy
62
+ - dsr-utils
63
+
64
+ ## License
65
+
66
+ MIT License - see LICENSE file for details
@@ -0,0 +1,39 @@
1
+ # dsr-data-tools
2
+
3
+ Data analysis and exploration tools for exploratory data analysis (EDA).
4
+
5
+ ## Features
6
+
7
+ - **Dataset Analysis**: Comprehensive statistical summaries and data quality assessment
8
+ - **Data Exploration**: Tools for understanding data distributions, correlations, and patterns
9
+ - **Quality Metrics**: Missing value detection, data type analysis, and anomaly identification
10
+
11
+ ## Installation
12
+
13
+ ```bash
14
+ pip install dsr-data-tools
15
+ ```
16
+
17
+ ## Usage
18
+
19
+ ```python
20
+ import pandas as pd
21
+ from dsr_data_tools import analyze_dataset
22
+
23
+ # Load your data
24
+ df = pd.read_csv('data.csv')
25
+
26
+ # Perform comprehensive analysis
27
+ analyze_dataset(df)
28
+ ```
29
+
30
+ ## Requirements
31
+
32
+ - Python >= 3.10
33
+ - pandas
34
+ - numpy
35
+ - dsr-utils
36
+
37
+ ## License
38
+
39
+ MIT License - see LICENSE file for details
@@ -0,0 +1,43 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "dsr-data-tools"
7
+ version = "0.0.1"
8
+ description = "Generic data handling utilities including data splitting and analysis."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = {text = "MIT"}
12
+ authors = [
13
+ {name = "Scott Roberts", email="scottrdeveloper@gmail.com"}
14
+ ]
15
+ keywords = ["data", "splitting", "analysis", "ml-data"]
16
+ classifiers = [
17
+ "Development Status :: 3 - Alpha",
18
+ "Intended Audience :: Developers",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Programming Language :: Python :: 3.13",
25
+ ]
26
+ dependencies = [
27
+ "dsr-utils>=0.0.1",
28
+ "numpy>=2.0.0",
29
+ "pandas>=2.0.0",
30
+ ]
31
+
32
+ [project.optional-dependencies]
33
+ dev = [
34
+ "pytest>=7.0",
35
+ "black>=23.0",
36
+ "ruff>=0.1.0",
37
+ ]
38
+
39
+ [tool.setuptools]
40
+ packages = ["dsr_data_tools"]
41
+
42
+ [tool.setuptools.package-dir]
43
+ "" = "src"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,33 @@
1
+ """
2
+ dsr_data_tools: Generic data handling utilities for data splitting and analysis.
3
+ """
4
+
5
+ from dsr_data_tools.analysis import (
6
+ DataframeColumn,
7
+ DataframeInfo,
8
+ analyze_column_data,
9
+ analyze_dataset,
10
+ )
11
+ from dsr_data_tools.recommendations import apply_recommendations
12
+ from dsr_data_tools.enums import (
13
+ RecommendationType,
14
+ EncodingStrategy,
15
+ MissingValueStrategy,
16
+ OutlierStrategy,
17
+ ImbalanceStrategy,
18
+ )
19
+
20
+ __all__ = [
21
+ "DataframeColumn",
22
+ "DataframeInfo",
23
+ "analyze_column_data",
24
+ "analyze_dataset",
25
+ "apply_recommendations",
26
+ "RecommendationType",
27
+ "EncodingStrategy",
28
+ "MissingValueStrategy",
29
+ "OutlierStrategy",
30
+ "ImbalanceStrategy",
31
+ ]
32
+
33
+ __version__ = "0.0.1"
@@ -0,0 +1,467 @@
1
+ from __future__ import annotations
2
+ import pandas as pd
3
+ from typing import Type
4
+ from dsr_utils import strings
5
+ from dsr_data_tools.enums import (
6
+ RecommendationType,
7
+ EncodingStrategy,
8
+ MissingValueStrategy,
9
+ OutlierStrategy,
10
+ ImbalanceStrategy,
11
+ )
12
+ from dsr_data_tools.recommendations import (
13
+ Recommendation,
14
+ NonInformativeRecommendation,
15
+ MissingValuesRecommendation,
16
+ EncodingRecommendation,
17
+ ClassImbalanceRecommendation,
18
+ OutlierDetectionRecommendation,
19
+ BooleanClassificationRecommendation,
20
+ BinningRecommendation,
21
+ )
22
+
23
+
24
class DataframeColumn:
    """Represents metadata for a single DataFrame column.

    Stores column name, non-null count, and data type information for analysis
    and display purposes.

    Attributes:
        name (str): The column name.
        non_null_count (int): Number of non-null values in the column.
        data_type (Type): The pandas data type of the column.

    Example:
        >>> df = pd.DataFrame({'age': [25, 30, None, 35]})
        >>> col = DataframeColumn('age', 3, float)
        >>> col.name
        'age'
        >>> col.non_null_count
        3
    """

    @staticmethod
    def dfc_list_from_df(df: pd.DataFrame) -> list[DataframeColumn]:
        """Create a list of DataframeColumn objects from a DataFrame.

        Extracts column names, non-null counts, and data types from the DataFrame
        and creates a DataframeColumn object for each column.

        Args:
            df (pd.DataFrame): The DataFrame to extract column information from.

        Returns:
            list[DataframeColumn]: List of DataframeColumn objects, one per column.

        Example:
            >>> df = pd.DataFrame({'name': ['Alice', 'Bob'], 'age': [25, 30]})
            >>> columns = DataframeColumn.dfc_list_from_df(df)
            >>> len(columns)
            2
        """
        # zip pairs each column name with its non-null count and dtype directly,
        # replacing the previous index-based loop over three parallel lists.
        return [
            DataframeColumn(name, non_null, dtype)
            for name, non_null, dtype in zip(
                df.columns.tolist(), df.count().tolist(), df.dtypes.tolist()
            )
        ]

    def __init__(
        self,
        name: str,
        non_null_count: int,
        data_type: Type
    ):
        self.__name = name
        self.__non_null_count = non_null_count
        self.__data_type = data_type

    @property
    def name(self) -> str:
        return self.__name

    @property
    def non_null_count(self) -> int:
        return self.__non_null_count

    @property
    def data_type(self) -> Type:
        return self.__data_type
+ return self.__data_type
97
+
98
+
99
class DataframeInfo:
    """Stores comprehensive information about a DataFrame's structure and content.

    Provides a summary of DataFrame characteristics including row counts, duplicate
    detection, and detailed column information. Used for data exploration and
    quality assessment.

    Attributes:
        row_count (int): Total number of rows in the DataFrame.
        duplicate_row_count (int): Number of duplicate rows detected.
        columns (list[DataframeColumn]): List of column metadata objects.

    Example:
        >>> df = pd.DataFrame({
        ...     'name': ['Alice', 'Bob', 'Alice'],
        ...     'age': [25, 30, 25]
        ... })
        >>> df_info = DataframeInfo(df)
        >>> df_info.row_count
        3
        >>> df_info.duplicate_row_count
        1
        >>> len(df_info.columns)
        2
    """

    def __init__(
        self,
        df: pd.DataFrame
    ):
        self.__row_count = len(df)
        # Cast to a plain int: pandas' .sum() returns a numpy integer, which
        # would otherwise leak through the int-annotated property below.
        self.__duplicate_row_count = int(df.duplicated().sum())
        self.__columns = DataframeColumn.dfc_list_from_df(df)

    @property
    def row_count(self) -> int:
        return self.__row_count

    @property
    def duplicate_row_count(self) -> int:
        return self.__duplicate_row_count

    @property
    def columns(self) -> list[DataframeColumn]:
        return self.__columns

    def info(self):
        """Display formatted summary of DataFrame information.

        Prints row count, duplicate count, and a table showing column names,
        non-null counts, and data types for all columns.

        Example:
            >>> df = pd.DataFrame({'name': ['Alice', 'Bob'], 'age': [25, 30]})
            >>> df_info = DataframeInfo(df)
            >>> df_info.info()
            Rows: 2
            Duplicate rows: 0

            Column Non-null Data type
            name 2 object
            age 2 int64
        """
        print(f'Rows: {self.row_count}')
        print(f'Duplicate rows: {self.duplicate_row_count}')
        print()
        col_headers = ['Column', 'Non-null', 'Data type']
        col_width = [15, 10, 12]
        print(
            f'{col_headers[0]:<{col_width[0]}}{col_headers[1]:>{col_width[1]}} {col_headers[2]:<{col_width[2]}}')

        for c in self.columns:
            # c.data_type is a pandas dtype object here, so .name is its
            # string form (e.g. 'int64', 'object').
            print(
                f'{c.name:<{col_width[0]}}{c.non_null_count:>{col_width[1]}} {c.data_type.name:<{col_width[2]}}')
173
+
174
+
175
def generate_recommendations(
    df: pd.DataFrame,
    target_column: str | None = None
) -> dict[str, dict[str, Recommendation]]:
    """Generate data preparation recommendations for each column in a DataFrame.

    Analyzes each column and generates appropriate recommendations based on
    data characteristics (missing values, cardinality, data type, etc.).

    Args:
        df (pd.DataFrame): The DataFrame to analyze.
        target_column (str | None): Name of the target column (for imbalance detection).
            If provided, class imbalance will be analyzed for this column.

    Returns:
        dict[str, dict[str, Recommendation]]: Nested dictionary mapping column names
            to recommendation types to Recommendation instances. Columns with no
            recommendations are omitted from the outer dict.

    Example:
        >>> df = pd.DataFrame({
        ...     'id': range(100),
        ...     'name': ['Alice'] * 100,
        ...     'age': [25] * 60 + [30] * 40,
        ...     'salary': [50000] * 50 + [100000] * 50
        ... })
        >>> recs = generate_recommendations(df, target_column='name')
        >>> recs['id']  # Non-informative (unique count == row count)
    """
    recommendations: dict[str, dict[str, Recommendation]] = {}

    for col_name in df.columns:
        col_recommendations: dict[str, Recommendation] = {}
        series = df[col_name]

        # 1. Check for non-informative columns
        unique_count = series.nunique()
        total_rows = len(df)
        is_numeric = pd.api.types.is_numeric_dtype(series)

        # Non-informative: unique count equals total rows (e.g., ID column).
        # Such columns receive only this recommendation; all further checks
        # are skipped via `continue`.
        if unique_count == total_rows:
            rec = NonInformativeRecommendation(
                type=RecommendationType.NON_INFORMATIVE,
                column_name=col_name,
                description=f"Column '{col_name}' has unique value for each row.",
                reason="Unique count equals row count"
            )
            col_recommendations['non_informative'] = rec
            recommendations[col_name] = col_recommendations
            continue

        # Non-informative: high cardinality object type (> 25% unique values)
        if not is_numeric and unique_count > total_rows * 0.25:
            rec = NonInformativeRecommendation(
                type=RecommendationType.NON_INFORMATIVE,
                column_name=col_name,
                description=f"Column '{col_name}' has high cardinality ({unique_count} unique values).",
                reason="High cardinality object type"
            )
            col_recommendations['non_informative'] = rec
            recommendations[col_name] = col_recommendations
            continue

        # 2. Check for missing values
        missing_count = series.isna().sum()
        if missing_count > 0:
            missing_percentage = (missing_count / total_rows) * 100

            # Determine strategy based on percentage:
            # <10% -> drop rows, >50% -> drop column, otherwise impute.
            if missing_percentage < 10:
                strategy = MissingValueStrategy.DROP_ROWS
            elif missing_percentage > 50:
                strategy = MissingValueStrategy.DROP_COLUMN
            else:
                strategy = MissingValueStrategy.IMPUTE

            rec = MissingValuesRecommendation(
                type=RecommendationType.MISSING_VALUES,
                column_name=col_name,
                description=f"Column '{col_name}' has {missing_count} missing values ({missing_percentage:.1f}%).",
                missing_count=missing_count,
                missing_percentage=missing_percentage,
                strategy=strategy
            )
            col_recommendations['missing_values'] = rec

        # 3. Check for boolean classification (exactly 2 unique numeric values)
        # Skip target column as it should remain numeric for classifiers
        if is_numeric and unique_count == 2 and col_name != target_column:
            values = sorted(series.dropna().unique().tolist())
            # NOTE(review): 0 == 0.0 in Python, so the second comparison is
            # redundant with the first; harmless but could be simplified.
            if values == [0.0, 1.0] or values == [0, 1]:
                rec = BooleanClassificationRecommendation(
                    type=RecommendationType.BOOLEAN_CLASSIFICATION,
                    column_name=col_name,
                    description=f"Column '{col_name}' should be treated as boolean.",
                    values=values
                )
                col_recommendations['boolean_classification'] = rec

        # 4. Check for encoding recommendations (categorical columns)
        if not is_numeric and col_name != target_column:
            # Binary categorical: 2 unique values
            if unique_count == 2:
                rec = EncodingRecommendation(
                    type=RecommendationType.ENCODING,
                    column_name=col_name,
                    description=f"Column '{col_name}' is binary categorical; recommend LabelEncoder.",
                    encoder_type=EncodingStrategy.LABEL,
                    unique_values=unique_count
                )
                col_recommendations['encoding'] = rec

            # Multi-class categorical: 3-10 unique values
            elif 3 <= unique_count <= 10:
                rec = EncodingRecommendation(
                    type=RecommendationType.ENCODING,
                    column_name=col_name,
                    description=f"Column '{col_name}' is multi-class categorical; recommend OneHotEncoder.",
                    encoder_type=EncodingStrategy.ONEHOT,
                    unique_values=unique_count
                )
                col_recommendations['encoding'] = rec

        # 5. Check for outliers (numeric columns)
        if is_numeric:
            mean_value = series.mean()
            max_value = series.max()
            # NOTE(review): min_value is computed but never used below.
            min_value = series.min()

            # Check if max value significantly exceeds mean (potential outliers).
            # NOTE(review): this heuristic misbehaves when the mean is zero or
            # negative (max > 2*mean is then almost always true) — confirm the
            # intended behavior for such columns.
            if max_value > mean_value * 2:  # Max is more than 2x the mean
                rec = OutlierDetectionRecommendation(
                    type=RecommendationType.OUTLIER_DETECTION,
                    column_name=col_name,
                    description=f"Column '{col_name}' has potential outliers (max={max_value:.2f}, mean={mean_value:.2f}).",
                    strategy=OutlierStrategy.SCALING,
                    max_value=max_value,
                    mean_value=mean_value
                )
                col_recommendations['outlier_detection'] = rec

        # 6. Check for class imbalance (target column)
        if col_name == target_column and unique_count <= 2:
            class_counts = series.value_counts()
            max_class_percentage = (class_counts.max() / total_rows) * 100

            if max_class_percentage > 70:
                rec = ClassImbalanceRecommendation(
                    type=RecommendationType.CLASS_IMBALANCE,
                    column_name=col_name,
                    description=f"Target variable '{col_name}' shows class imbalance ({max_class_percentage:.1f}% majority class).",
                    majority_percentage=max_class_percentage,
                    strategy=ImbalanceStrategy.CLASS_WEIGHT
                )
                col_recommendations['class_imbalance'] = rec

        # 7. Suggest binning for numeric columns (e.g., Age)
        if is_numeric and col_name.lower() in ['age', 'years']:
            # Use describe() percentiles to suggest bins.
            # NOTE(review): if any percentiles coincide (low-cardinality data)
            # these bin edges are not strictly increasing and pd.cut on them
            # would raise — TODO confirm downstream handling.
            desc = series.describe()
            bins = [series.min() - 1, desc['25%'], desc['50%'],
                    desc['75%'], series.max()]
            labels = ['Low', 'Medium_Low', 'Medium_High', 'High']

            rec = BinningRecommendation(
                type=RecommendationType.BINNING,
                column_name=col_name,
                description=f"Column '{col_name}' could be binned into {len(labels)} categories.",
                bins=bins,
                labels=labels
            )
            col_recommendations['binning'] = rec

        if col_recommendations:
            recommendations[col_name] = col_recommendations

    return recommendations
353
+
354
+
355
def analyze_column_data(
    series: pd.Series,
    dataframe_column: DataframeColumn
):
    """Analyze and print detailed statistics for a single DataFrame column.

    Displays column name, data type, null counts, unique values, min/max values.
    For float columns, shows integer vs non-integer value counts. For object
    columns, shows numeric vs non-numeric value counts.

    Args:
        series (pd.Series): The data series to analyze.
        dataframe_column (DataframeColumn): Metadata about the column; its
            data_type is expected to be a pandas dtype object exposing `.name`.

    Example:
        >>> df = pd.DataFrame({'price': [10.5, 20.0, 30.99]})
        >>> col = DataframeColumn('price', 3, float)
        >>> analyze_column_data(df['price'], col)
        # Prints detailed statistics
    """
    series_length = len(series)
    # Only the exact 'float64' dtype triggers the integer breakdown.
    # NOTE(review): other float dtypes (e.g. float32) are skipped — confirm intended.
    is_float_type = (dataframe_column.data_type.name == 'float64')
    integer_analysis = ''

    if is_float_type:
        # float.is_integer() is False for NaN, so missing values are counted
        # among the non-integer values here.
        integer_value_count = series.apply(lambda x: x.is_integer()).sum()
        non_integer_value_count = series_length - integer_value_count
        integer_analysis = f"""Integer values: {integer_value_count}
Non-integer values: {non_integer_value_count}"""

    is_object_data_type = (dataframe_column.data_type.name == 'object')
    object_analysis = ''

    if is_object_data_type:
        # NOTE(review): the two counts use different definitions of "numeric":
        # str.isnumeric (digit-only strings; yields NaN for non-str elements,
        # which can make the sum NaN) vs strings.is_float_string — so the two
        # printed numbers need not be complementary. Confirm intended.
        numeric_value_count = series.str.isnumeric().sum()
        non_numeric_value_count = series_length - \
            series.apply(strings.is_float_string).sum()

        object_analysis = f"""Numeric values: {numeric_value_count}
Non-numeric values: {non_numeric_value_count}"""

    analysis = f"""
Column: {dataframe_column.name}
Data type: {dataframe_column.data_type.name}
Non-null: {dataframe_column.non_null_count}
N/A count: {series.isna().sum()}
Unique values: {series.nunique()}
Min value: {series.min()}
Max value: {series.max()}"""

    print(analysis)

    if is_float_type:
        print(integer_analysis)

    if is_object_data_type:
        print(object_analysis)
412
+
413
+
414
def analyze_dataset(
    df: pd.DataFrame,
    target_column: str | None = None,
    generate_recs: bool = False
) -> tuple[DataframeInfo, dict[str, dict[str, Recommendation]] | None]:
    """Perform comprehensive analysis of all columns in a DataFrame.

    Displays overall DataFrame information (row count, duplicates) followed by
    detailed analysis of each column including data types, null counts, unique
    values, and type-specific statistics. Optionally generates data preparation
    recommendations.

    Args:
        df (pd.DataFrame): The DataFrame to analyze.
        target_column (str | None): Name of the target column (for recommendation generation).
        generate_recs (bool): Whether to generate recommendations. Default is False.

    Returns:
        tuple[DataframeInfo, dict | None]: A tuple containing:
            - DataframeInfo object with structured DataFrame information
            - Recommendations dict (or None if generate_recs is False)

    Example:
        >>> df = pd.DataFrame({
        ...     'name': ['Alice', 'Bob', 'Charlie'],
        ...     'age': [25, 30, 35],
        ...     'salary': [50000.0, 60000.5, 75000.0]
        ... })
        >>> info, recs = analyze_dataset(df, generate_recs=True)
        # Prints comprehensive analysis of all columns and returns recommendations
    """
    df_info = DataframeInfo(df)
    df_info.info()

    recommendations = None
    if generate_recs:
        recommendations = generate_recommendations(df, target_column)

    # Iterate the column metadata directly rather than indexing by position.
    for col in df_info.columns:
        analyze_column_data(df[col.name], col)

        # Display recommendations for this column if available
        if recommendations and col.name in recommendations:
            col_recs = recommendations[col.name]
            if col_recs:
                print("\n  Recommendations:")
                # Only the Recommendation objects are needed, not their keys.
                for recommendation in col_recs.values():
                    recommendation.info()
                print()

    return df_info, recommendations