qdesc 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of qdesc might be problematic; consult the registry's advisory page for more details.

qdesc/__init__.py ADDED
@@ -0,0 +1,70 @@
def desc(df):
    """Quick descriptive summary of the numeric columns of *df*.

    Returns a DataFrame with one row per numeric column and the columns:
    count, mean, std, median, MAD (median absolute deviation), min, max,
    AD_stat (Anderson-Darling statistic) and its 5% / 1% critical values.
    All values are rounded to 2 decimals.
    """
    import pandas as pd
    import numpy as np
    from scipy.stats import anderson

    x = np.round(df.describe().T, 2)
    # describe().T columns are positional: count, mean, std, min, 25%, 50%, 75%, max.
    # Keep count, mean, std, 50% (median), min, max — in that order.
    x = x.iloc[:, [0, 1, 2, 5, 3, 7]]
    x.rename(columns={'50%': 'median'}, inplace=True)

    mad_values = {}
    results = {}
    for column in df.select_dtypes(include=[np.number]):
        # Drop missing values so NaNs do not poison the median/MAD or the
        # Anderson-Darling test (describe() already skips NaN internally).
        values = df[column].dropna().to_numpy()
        # Manual MAD: more robust to outliers and non-normal distributions.
        median = np.median(values)
        mad_values[column] = np.median(np.abs(values - median))
        # Anderson-Darling normality test; keep only the 5% and 1% levels
        # (critical_values is ordered [15%, 10%, 5%, 2.5%, 1%]).
        result = anderson(values)
        results[column] = {
            'AD_stat': result.statistic,
            '5% crit_value': result.critical_values[2],
            '1% crit_value': result.critical_values[4],
        }

    mad_df = pd.DataFrame(list(mad_values.items()), columns=['Variable', 'MAD'])
    mad_df.set_index('Variable', inplace=True)
    anderson_df = pd.DataFrame.from_dict(results, orient='index')

    # Splice MAD in after the median column, then append the normality results.
    xl = x.iloc[:, :4]
    xr = x.iloc[:, 4:]
    return np.round(pd.concat([xl, mad_df, xr, anderson_df], axis=1), 2)
def freqdist(df, column_name):
    """Build a frequency-distribution table for one categorical column.

    Returns a DataFrame with the column's levels, their counts, and each
    count as a percentage of all rows in *df*.

    Raises ValueError if the column is missing or not categorical.
    """
    import pandas as pd

    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame.")

    if df[column_name].dtype not in ['object', 'category']:
        raise ValueError(f"Column '{column_name}' is not a categorical column.")

    table = df[column_name].value_counts().reset_index()
    table.columns = [column_name, 'Count']
    table['Percentage'] = table['Count'] / len(df) * 100
    return table
def freqdist_a(df):
    """Build frequency-distribution tables for ALL categorical columns of *df*.

    Returns one DataFrame with columns ['Column', 'Value', 'Count',
    'Percentage'], stacking the distribution of every object/category
    column. Returns an empty DataFrame with that schema when *df* has no
    categorical columns.
    """
    # pandas must be imported here: the sibling functions in this module
    # also import it function-locally, and there is no module-level import,
    # so the original body raised NameError on `pd`.
    import pandas as pd

    results = []  # one distribution frame per categorical column
    for column in df.select_dtypes(include=['object', 'category']).columns:
        frequency_table = df[column].value_counts()
        percentage_table = df[column].value_counts(normalize=True) * 100
        results.append(pd.DataFrame({
            'Column': column,
            'Value': frequency_table.index,
            'Count': frequency_table.values,
            'Percentage': percentage_table.values,
        }))

    # pd.concat([]) raises ValueError; return an empty frame with the
    # expected schema instead when there is nothing to stack.
    if not results:
        return pd.DataFrame(columns=['Column', 'Value', 'Count', 'Percentage'])

    return pd.concat(results, ignore_index=True)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: qdesc
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: Quick and Easy way to do descriptive analysis.
5
5
  Author: Paolo Hilado
6
6
  Author-email: datasciencepgh@proton.me
@@ -38,6 +38,12 @@ Run the function qd.freqdist(df, "Variable Name") to easily create a frequency d
38
38
  * Counts - the number of observations
39
39
  * Percentage - percentage of observations from total.
40
40
 
41
+ Run the function qd.freqdist_a(df) to easily create frequency distribution tables for all the categorical variables in your data frame. The resulting
42
+ table will include columns such as:
43
+ * Variable levels (i.e., for Satisfaction: Very Low, Low, Moderate, High, Very High)
44
+ * Counts - the number of observations
45
+ * Percentage - percentage of observations from total.
46
+
41
47
  Later versions will include data visualizations handy for exploring the distribution of the data set.
42
48
 
43
49
  ## Installation
@@ -0,0 +1,6 @@
1
+ qdesc/__init__.py,sha256=4DVLq35LmNvg5Tq1Mz43h_YIWvS-MbwscbRpuqGKFUY,2933
2
+ qdesc-0.1.4.dist-info/LICENCE.txt,sha256=xdFo-Rt6I7EP7C_qrVeIBIcH_7mRGUh8sciJs2R8VmY,9684
3
+ qdesc-0.1.4.dist-info/METADATA,sha256=MHz5mWEIWOTp47oSPV3BBw_FZX5L4UgGIm1ROYQbRsI,3061
4
+ qdesc-0.1.4.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
5
+ qdesc-0.1.4.dist-info/top_level.txt,sha256=JuSs1wWRGN77DVuq-SX-5P7m_mIZF0ikEVgPTBOrHb0,6
6
+ qdesc-0.1.4.dist-info/RECORD,,
@@ -0,0 +1 @@
1
+ qdesc
@@ -1,5 +0,0 @@
1
- qdesc-0.1.2.dist-info/LICENCE.txt,sha256=xdFo-Rt6I7EP7C_qrVeIBIcH_7mRGUh8sciJs2R8VmY,9684
2
- qdesc-0.1.2.dist-info/METADATA,sha256=ot6kwgtQqyQRmCu9XlkbzEvNIE6eY9wl6BiQMQAccBM,2690
3
- qdesc-0.1.2.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
4
- qdesc-0.1.2.dist-info/top_level.txt,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
5
- qdesc-0.1.2.dist-info/RECORD,,
@@ -1 +0,0 @@
1
-
File without changes