qdesc 0.1.1__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of qdesc might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: qdesc
3
- Version: 0.1.1
3
+ Version: 0.1.3
4
4
  Summary: Quick and Easy way to do descriptive analysis.
5
5
  Author: Paolo Hilado
6
6
  Author-email: datasciencepgh@proton.me
@@ -0,0 +1,52 @@
1
def desc(df):
    """Build a one-row-per-variable descriptive summary of the numeric columns.

    For every numeric column of ``df`` the result holds, in this order:
    ``count``, ``mean``, ``std``, ``median``, ``MAD`` (median absolute
    deviation around the median), ``min``, ``max``, ``AD_stat`` (the
    Anderson-Darling statistic returned by ``scipy.stats.anderson`` with its
    default settings) and the 5% and 1% critical values of that test.

    Missing values are ignored for every statistic, so the robust columns
    (``MAD``, ``AD_stat``) are computed on the same observations as the
    ``describe()``-based ones. A column with no non-missing observations
    gets NaN in the MAD and Anderson-Darling columns.

    Args:
        df: pandas DataFrame to summarise. Non-numeric columns are ignored.

    Returns:
        pandas DataFrame indexed by column name, rounded to 2 decimals.

    Raises:
        ValueError: if ``df`` contains no numeric columns.
    """
    import numpy as np
    import pandas as pd
    from scipy.stats import anderson

    # Restrict to numeric columns once so that describe() and the robust
    # statistics below are guaranteed to cover exactly the same variables.
    numeric = df.select_dtypes(include=[np.number])
    if numeric.shape[1] == 0:
        raise ValueError("DataFrame has no numeric columns to describe.")

    # Select by label rather than by position so the layout of describe()
    # cannot silently shift which statistic ends up in which column.
    summary = (
        numeric.describe()
        .T[['count', 'mean', 'std', '50%', 'min', 'max']]
        .rename(columns={'50%': 'median'})
    )

    robust_columns = ['MAD', 'AD_stat', '5% crit_value', '1% crit_value']
    robust_rows = {}
    for column in numeric.columns:
        # Drop missing values first: np.median and anderson would otherwise
        # propagate a single NaN into the whole result for this column.
        values = numeric[column].dropna().to_numpy(dtype=float)
        if values.size == 0:
            robust_rows[column] = [np.nan] * len(robust_columns)
            continue

        # Manual MAD: more robust to outliers and non-normal distributions
        # than the standard deviation.
        mad = np.median(np.abs(values - np.median(values)))

        result = anderson(values)
        # For the default (normal) test the critical values are reported for
        # the 15%, 10%, 5%, 2.5% and 1% levels, hence positions 2 and 4.
        robust_rows[column] = [
            mad,
            result.statistic,
            result.critical_values[2],
            result.critical_values[4],
        ]

    robust = pd.DataFrame.from_dict(
        robust_rows, orient='index', columns=robust_columns
    )

    column_order = [
        'count', 'mean', 'std', 'median', 'MAD', 'min', 'max',
        'AD_stat', '5% crit_value', '1% crit_value',
    ]
    return pd.concat([summary, robust], axis=1)[column_order].round(2)
40
+
41
def freqdist(df, column_name):
    """Return the frequency distribution of one categorical column.

    Args:
        df: pandas DataFrame containing the column.
        column_name: label of the column to tabulate. Its dtype must be
            object, a pandas string dtype, or categorical.

    Returns:
        pandas DataFrame with three columns: ``column_name`` (the distinct
        values, most frequent first), ``Count`` and ``Percentage``.
        ``Percentage`` is computed against ``len(df)``, i.e. all rows of the
        frame; missing values are not counted by ``value_counts()``, so the
        percentages sum to less than 100 when the column has missing values.

    Raises:
        ValueError: if the column does not exist or is not categorical.
    """
    import pandas as pd

    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame.")

    # Use pandas' dtype predicates instead of comparing the dtype with the
    # strings 'object'/'category': this also accepts the dedicated pandas
    # string dtypes, which hold categorical text just like object columns.
    dtype = df[column_name].dtype
    is_categorical = (
        isinstance(dtype, pd.CategoricalDtype)
        or pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
    )
    if not is_categorical:
        raise ValueError(f"Column '{column_name}' is not a categorical column.")

    counts = df[column_name].value_counts()
    # Assemble the table explicitly rather than via reset_index(), whose
    # generated column names differ between pandas versions.
    freq_dist = pd.DataFrame({
        column_name: counts.index,
        'Count': counts.to_numpy(),
    })
    freq_dist['Percentage'] = (freq_dist['Count'] / len(df)) * 100
    return freq_dist
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: qdesc
3
- Version: 0.1.1
3
+ Version: 0.1.3
4
4
  Summary: Quick and Easy way to do descriptive analysis.
5
5
  Author: Paolo Hilado
6
6
  Author-email: datasciencepgh@proton.me
@@ -1,6 +1,7 @@
1
1
  LICENCE.txt
2
2
  README.txt
3
3
  setup.py
4
+ qdesc/__init__.py
4
5
  qdesc.egg-info/PKG-INFO
5
6
  qdesc.egg-info/SOURCES.txt
6
7
  qdesc.egg-info/dependency_links.txt
@@ -0,0 +1 @@
1
+ qdesc
@@ -7,7 +7,7 @@ long_description = (this_directory / "README.md").read_text()
7
7
 
8
8
  setup(
9
9
  name='qdesc',
10
- version='0.1.1',
10
+ version='0.1.3',
11
11
  packages=find_packages(),
12
12
  install_requires=[
13
13
  # List your dependencies here, e.g., pandas if your function requires it
@@ -1 +0,0 @@
1
-
File without changes
File without changes
File without changes