JacobTools 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. jacobtools-0.0.1/LICENSE +21 -0
  2. jacobtools-0.0.1/MANIFEST.in +2 -0
  3. jacobtools-0.0.1/PKG-INFO +17 -0
  4. jacobtools-0.0.1/README.md +68 -0
  5. jacobtools-0.0.1/pyproject.toml +7 -0
  6. jacobtools-0.0.1/setup.cfg +123 -0
  7. jacobtools-0.0.1/src/JacobTools.egg-info/PKG-INFO +17 -0
  8. jacobtools-0.0.1/src/JacobTools.egg-info/SOURCES.txt +47 -0
  9. jacobtools-0.0.1/src/JacobTools.egg-info/dependency_links.txt +1 -0
  10. jacobtools-0.0.1/src/JacobTools.egg-info/entry_points.txt +2 -0
  11. jacobtools-0.0.1/src/JacobTools.egg-info/requires.txt +2 -0
  12. jacobtools-0.0.1/src/JacobTools.egg-info/top_level.txt +2 -0
  13. jacobtools-0.0.1/src/jacobtools/__init__.py +14 -0
  14. jacobtools-0.0.1/src/jacobtools/__pycache__/__init__.cpython-312.pyc +0 -0
  15. jacobtools-0.0.1/src/jacobtools/__pycache__/config.cpython-312.pyc +0 -0
  16. jacobtools-0.0.1/src/jacobtools/config.py +10 -0
  17. jacobtools-0.0.1/src/jacobtools/gbq_queries/__init__.py +15 -0
  18. jacobtools-0.0.1/src/jacobtools/gbq_queries/__pycache__/__init__.cpython-312.pyc +0 -0
  19. jacobtools-0.0.1/src/jacobtools/gbq_queries/__pycache__/fillrate_analysis.cpython-312.pyc +0 -0
  20. jacobtools-0.0.1/src/jacobtools/gbq_queries/__pycache__/query_gbq.cpython-312.pyc +0 -0
  21. jacobtools-0.0.1/src/jacobtools/gbq_queries/fillrate_analysis.py +99 -0
  22. jacobtools-0.0.1/src/jacobtools/gbq_queries/query_gbq.py +31 -0
  23. jacobtools-0.0.1/src/jacobtools/preprocessing/__init__.py +13 -0
  24. jacobtools-0.0.1/src/jacobtools/preprocessing/__pycache__/__init__.cpython-312.pyc +0 -0
  25. jacobtools-0.0.1/src/jacobtools/preprocessing/__pycache__/drop_columns_by_schema.cpython-312.pyc +0 -0
  26. jacobtools-0.0.1/src/jacobtools/preprocessing/drop_columns_by_schema.py +48 -0
  27. jacobtools-0.0.1/src/jacobtools/preprocessing/enforce_schema.py +0 -0
  28. jacobtools-0.0.1/src/jacobtools/sql_generation/__init__.py +5 -0
  29. jacobtools-0.0.1/src/jacobtools/sql_generation/__pycache__/__init__.cpython-312.pyc +0 -0
  30. jacobtools-0.0.1/src/jacobtools/sql_generation/postgres/__init__.py +6 -0
  31. jacobtools-0.0.1/src/jacobtools/sql_generation/postgres/__pycache__/__init__.cpython-312.pyc +0 -0
  32. jacobtools-0.0.1/src/jacobtools/sql_generation/postgres/__pycache__/generate_create_table_sql.cpython-312.pyc +0 -0
  33. jacobtools-0.0.1/src/jacobtools/sql_generation/postgres/generate_create_table_sql.py +95 -0
  34. jacobtools-0.0.1/src/jacobtools/temp/chart.py +63 -0
  35. jacobtools-0.0.1/src/jacobtools/temp/database.py +160 -0
  36. jacobtools-0.0.1/src/jacobtools/temp/decorators.py +7 -0
  37. jacobtools-0.0.1/src/jacobtools/temp/it.py +36 -0
  38. jacobtools-0.0.1/src/jacobtools/temp/pa.py +34 -0
  39. jacobtools-0.0.1/src/jacobtools/temp/reason_for_absence.csv +29 -0
  40. jacobtools-0.0.1/src/jacobtools_test/__init__.py +5 -0
  41. jacobtools-0.0.1/src/jacobtools_test/harmonic_mean.py +2 -0
  42. jacobtools-0.0.1/src/jacobtools_test/harmony.py +19 -0
  43. jacobtools-0.0.1/src/jacobtools_test.egg-info/PKG-INFO +17 -0
  44. jacobtools-0.0.1/src/jacobtools_test.egg-info/SOURCES.txt +41 -0
  45. jacobtools-0.0.1/src/jacobtools_test.egg-info/dependency_links.txt +1 -0
  46. jacobtools-0.0.1/src/jacobtools_test.egg-info/entry_points.txt +2 -0
  47. jacobtools-0.0.1/src/jacobtools_test.egg-info/requires.txt +2 -0
  48. jacobtools-0.0.1/src/jacobtools_test.egg-info/top_level.txt +2 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2021 Abisheak Jacob J
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,2 @@
1
+ graft src
2
+ recursive-exclude __pycache__ *.py[cod]
@@ -0,0 +1,17 @@
1
+ Metadata-Version: 2.4
2
+ Name: JacobTools
3
+ Version: 0.0.1
4
+ Summary: This package contains functions that help me tackle my day to day data analytics requirements
5
+ Home-page: https://github.com/AbisheakJacob/JacobTools
6
+ Author: Abisheak Jacob J
7
+ Author-email: "abisheakjacob0032@gmail.com"
8
+ License: MIT
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Requires-Python: >=3.9
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: numpy>=2.1.0
14
+ Requires-Dist: termcolor>=1.1.0
15
+ Dynamic: license-file
16
+
17
+ README.md
@@ -0,0 +1,68 @@
1
+ # JacobTools
2
+
3
+ **JacobTools** is my personal Python library that contains code to automate day-to-day activities.
4
+
5
+ ## Installation and updating
6
+
7
+ Installing the package from the github repository
8
+
9
+ ```bash
10
+ pip install git+https://github.com/AbisheakJacob/JacobTools
11
+ ```
12
+
13
+ Installing the local repository so that it updates automatically when a change is made.
14
+
15
+ ```bash
16
+ pip install .
17
+ ```
18
+
19
+ ### Creating C extension
20
+
21
+ ```python
22
+ from setuptools import setup
23
+ from Cython.Build import cythonize
24
+
25
+ setup(
26
+ ext_modules=cythonize("src/jacobtools/harmonic_mean.pyx")
27
+ )
28
+ ```
29
+
30
+ ## Structure
31
+
32
+ ### gbq_queries
33
+
34
+ This module contains codes to connect and work with data in Google BigQuery.
35
+
36
+ | Function Name | Description |
37
+ | ----------------- | -------------------------------------------------- |
38
+ | query_gbq | Read a table from GBQ as a DataFrame |
39
+ | fillrate_analysis | Perform Fill Rate Analysis on GBQ dataset or table |
40
+
41
+ ### preprocessing
42
+
43
+ This module contains preprocessing steps to be performed on the data before actual analysis.
44
+
45
+ | Function Name | Description |
46
+ | ---------------------- | --------------------------------------------------------------------------- |
47
+ | drop_columns_by_schema | Drop the columns that are marked in the Schema Document |
48
+ | enforce_schema | Apply the datatypes provided in schema document to the Columns in the Table |
49
+
50
+ ### sql_generation
51
+
52
+ This module automatically generates SQL code from a DataFrame for ease of use.
53
+
54
+ #### postgres
55
+
56
+ | Function Name | Description |
57
+ | ------------------------- | ------------------------------------------------------------------ |
58
+ | generate_create_table_sql | Generates SQL code to create the schema to upload data to postgres |
59
+
60
+ ## Next Steps
61
+
62
+ 1. Custom function to perform basic eda on a given dataframe (info, null values, shape, size)
63
+ 2. Function to perform match% analysis and draw a Venn diagram for easier visualization
64
+
65
+ ## License
66
+
67
+ **_The Reference to this library can be found here:_**
68
+ The base construct of this library is referenced from [this article](https://mikehuls.medium.com/create-your-custom-python-package-that-you-can-pip-install-from-your-git-repository-f90465867893)
@@ -0,0 +1,7 @@
1
+ [build-system]
2
+ requires = ["setuptools", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [tool.black]
6
+ line-length = 100
7
+ target-version = ["py313"]
@@ -0,0 +1,123 @@
1
+ [metadata]
2
+ name = JacobTools
3
+ version = 0.0.1
4
+ url = https://github.com/AbisheakJacob/JacobTools
5
+ author = Abisheak Jacob J
6
+ author_email = "abisheakjacob0032@gmail.com"
7
+ description = This package contains functions that help me tackle my day to day data analytics requirements
8
+ long_description = README.md
9
+ long_description_content_type = text/markdown
10
+ license = MIT
11
+ license_file = LICENSE
12
+ classifiers =
13
+ License :: OSI Approved :: MIT License
14
+
15
+ [options]
16
+ package_dir =
17
+ =src
18
+ packages = find:
19
+ include_package_data = True
20
+ python_requires = >=3.9
21
+ install_requires =
22
+ numpy>=2.1.0
23
+ termcolor>=1.1.0
24
+
25
+ [options.packages.find]
26
+ where = src
27
+ exclude =
28
+ test*
29
+
30
+ [options.entry_points]
31
+ console_scripts =
32
+ harmony = jacobtools_test.harmony:main
33
+
34
+ [flake8]
35
+ max-line-length = 100
36
+
37
+ [tool:pytest]
38
+ testpaths =
39
+ test
40
+ addopts =
41
+ --cov
42
+ --strict-markers
43
+ --disable-warnings
44
+ xfail_strict = True
45
+
46
+ [coverage:run]
47
+ branch = True
48
+ source = jacobtools_test
49
+
50
+ [coverage:report]
51
+ show_missing = True
52
+ skip_covered = True
53
+
54
+ [coverage:paths]
55
+ source =
56
+ src/jacobtools_test/
57
+ */site-packages/jacobtools_test/
58
+
59
+ [tox:tox]
60
+ isolated_build = True
61
+ envlist = py312
62
+
63
+ [testimports]
64
+ deps =
65
+ pytest
66
+ requests
67
+
68
+ [testenv]
69
+ deps =
70
+ {[testimports]deps}
71
+ pytest-cov
72
+ commands =
73
+ pytest {posargs}
74
+
75
+ [testenv:get_my_ip]
76
+ skip_install = True
77
+ deps =
78
+ requests
79
+ commands =
80
+ python -c "import requests; print(requests.get('https://canhazip.com').text)"
81
+
82
+ [testenv:check-imports]
83
+ deps =
84
+ {[testimports]deps}
85
+ shipyard
86
+ commands =
87
+ python -m shipyard verify
88
+
89
+ [testenv:typecheck]
90
+ deps =
91
+ mypy
92
+ pytest
93
+ types-termcolor
94
+ commands =
95
+ mypy --ignore-missing-imports {posargs:src test}
96
+
97
+ [mypy]
98
+ python_version = 3.13
99
+ warn_unused_configs = True
100
+ show_error_context = True
101
+ pretty = True
102
+ namespace_packages = True
103
+ check_untyped_defs = True
104
+
105
+ [testenv:format]
106
+ skip_install = True
107
+ deps =
108
+ black
109
+ commands =
110
+ black {posargs:--check --diff src/jacobtools_test test}
111
+
112
+ [testenv:lint]
113
+ skip_install = True
114
+ deps =
115
+ flake8
116
+ flake8-bugbear
117
+ commands =
118
+ flake8 {posargs:src/jacobtools_test test}
119
+
120
+ [egg_info]
121
+ tag_build =
122
+ tag_date = 0
123
+
@@ -0,0 +1,17 @@
1
+ Metadata-Version: 2.4
2
+ Name: JacobTools
3
+ Version: 0.0.1
4
+ Summary: This package contains functions that help me tackle my day to day data analytics requirements
5
+ Home-page: https://github.com/AbisheakJacob/JacobTools
6
+ Author: Abisheak Jacob J
7
+ Author-email: "abisheakjacob0032@gmail.com"
8
+ License: MIT
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Requires-Python: >=3.9
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: numpy>=2.1.0
14
+ Requires-Dist: termcolor>=1.1.0
15
+ Dynamic: license-file
16
+
17
+ README.md
@@ -0,0 +1,47 @@
1
+ LICENSE
2
+ MANIFEST.in
3
+ README.md
4
+ pyproject.toml
5
+ setup.cfg
6
+ src/JacobTools.egg-info/PKG-INFO
7
+ src/JacobTools.egg-info/SOURCES.txt
8
+ src/JacobTools.egg-info/dependency_links.txt
9
+ src/JacobTools.egg-info/entry_points.txt
10
+ src/JacobTools.egg-info/requires.txt
11
+ src/JacobTools.egg-info/top_level.txt
12
+ src/jacobtools/__init__.py
13
+ src/jacobtools/config.py
14
+ src/jacobtools/__pycache__/__init__.cpython-312.pyc
15
+ src/jacobtools/__pycache__/config.cpython-312.pyc
16
+ src/jacobtools/gbq_queries/__init__.py
17
+ src/jacobtools/gbq_queries/fillrate_analysis.py
18
+ src/jacobtools/gbq_queries/query_gbq.py
19
+ src/jacobtools/gbq_queries/__pycache__/__init__.cpython-312.pyc
20
+ src/jacobtools/gbq_queries/__pycache__/fillrate_analysis.cpython-312.pyc
21
+ src/jacobtools/gbq_queries/__pycache__/query_gbq.cpython-312.pyc
22
+ src/jacobtools/preprocessing/__init__.py
23
+ src/jacobtools/preprocessing/drop_columns_by_schema.py
24
+ src/jacobtools/preprocessing/enforce_schema.py
25
+ src/jacobtools/preprocessing/__pycache__/__init__.cpython-312.pyc
26
+ src/jacobtools/preprocessing/__pycache__/drop_columns_by_schema.cpython-312.pyc
27
+ src/jacobtools/sql_generation/__init__.py
28
+ src/jacobtools/sql_generation/__pycache__/__init__.cpython-312.pyc
29
+ src/jacobtools/sql_generation/postgres/__init__.py
30
+ src/jacobtools/sql_generation/postgres/generate_create_table_sql.py
31
+ src/jacobtools/sql_generation/postgres/__pycache__/__init__.cpython-312.pyc
32
+ src/jacobtools/sql_generation/postgres/__pycache__/generate_create_table_sql.cpython-312.pyc
33
+ src/jacobtools/temp/chart.py
34
+ src/jacobtools/temp/database.py
35
+ src/jacobtools/temp/decorators.py
36
+ src/jacobtools/temp/it.py
37
+ src/jacobtools/temp/pa.py
38
+ src/jacobtools/temp/reason_for_absence.csv
39
+ src/jacobtools_test/__init__.py
40
+ src/jacobtools_test/harmonic_mean.py
41
+ src/jacobtools_test/harmony.py
42
+ src/jacobtools_test.egg-info/PKG-INFO
43
+ src/jacobtools_test.egg-info/SOURCES.txt
44
+ src/jacobtools_test.egg-info/dependency_links.txt
45
+ src/jacobtools_test.egg-info/entry_points.txt
46
+ src/jacobtools_test.egg-info/requires.txt
47
+ src/jacobtools_test.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ harmony = jacobtools_test.harmony:main
@@ -0,0 +1,2 @@
1
+ numpy>=2.1.0
2
+ termcolor>=1.1.0
@@ -0,0 +1,2 @@
1
+ jacobtools
2
+ jacobtools_test
@@ -0,0 +1,14 @@
1
# Expose submodules at top level

from . import config
from .preprocessing import drop_columns_by_schema
from .sql_generation import generate_create_table_sql
from .gbq_queries import fillrate_analysis
from .gbq_queries import query_gbq

# BUGFIX: __all__ entries must be strings (names), not the objects
# themselves; otherwise `from jacobtools import *` raises
# "TypeError: attribute name must be string".
__all__ = [
    "drop_columns_by_schema",
    "generate_create_table_sql",
    "fillrate_analysis",
    "query_gbq",
]
@@ -0,0 +1,10 @@
1
+ # jacobtools global configurations
2
+
3
+ # Class to map columns in schema table to column numbers
4
class SchemaStructure:
    """Zero-based positional indices of the fields in a schema-document row.

    Used e.g. by preprocessing.drop_columns_by_schema, which reads the
    drop flag via ``schema_df.iloc[:, SchemaStructure.DROP_FLAG]``.
    """
    COLUMN_NAME = 0
    DATA_TYPE = 1
    DESCRIPTION = 2
    # truthy value in this column marks the corresponding DataFrame column for removal
    DROP_FLAG = 3
    REASON_FOR_DROPPING = 4
    FILL_RATE = 5
@@ -0,0 +1,15 @@
1
# Re-export the public helpers so callers can write
# `from jacobtools.gbq_queries import query_gbq`
# instead of the full module path.

from .fillrate_analysis import fillrate_analysis
from .query_gbq import query_gbq

# BUGFIX: __all__ entries must be strings (names), not the objects
# themselves; otherwise a star-import of this package raises TypeError.
__all__ = [
    "fillrate_analysis",
    "query_gbq",
]
@@ -0,0 +1,99 @@
1
+ import pandas as pd
2
+ from .query_gbq import query_gbq
3
+ import bigframes.pandas as bpd
4
+ from google.cloud import bigquery
5
+
6
+ """
7
+ AUTHOR: Abisheak Jacob J
8
+ LAST MODIFIED: 04-09-2025
9
+ TITLE: Fill Rate Analysis in Google BigQuery
10
+ DESCRIPTION: Compute the fill rate (share of non-null values) for the columns of GBQ tables.
11
+ """
12
+
13
def fillrate_analysis(
    str_gcp_project_id: str,
    str_gbq_project_id: str,
    str_dataset: str,
    bool_check_all_tables: bool = True,
    lst_tables: list = None,
    bool_save_to_csv: bool = False
) -> pd.DataFrame:

    """
    Calculates the fill rate (non-null value percentage) for columns in specified BigQuery tables.

    Args:
        str_gcp_project_id (str): The GCP project name used for querying BigQuery.
        str_gbq_project_id (str): The project ID where the tables are located.
        str_dataset (str): The dataset name containing the tables.
        bool_check_all_tables (bool): If True, analyze all tables; otherwise, use lst_tables.
        lst_tables (list): List of table names to include in the analysis.
            Defaults to None, which is treated as an empty list.
        bool_save_to_csv (bool): If True, save the result DataFrame to a CSV file.
    Returns:
        pd.DataFrame: A DataFrame containing table name, column name, non-null count,
                      null-like count, total count, and fill rate for each column.
    """

    # None sentinel instead of a mutable `[]` default (a list default is
    # shared across calls and can be mutated accidentally).
    if lst_tables is None:
        lst_tables = []

    # Get table and column info from the dataset's INFORMATION_SCHEMA
    query_columns = f"""
        SELECT table_name, column_name
        FROM `{str_gbq_project_id}.{str_dataset}.INFORMATION_SCHEMA.COLUMNS`
    """

    columns_df = query_gbq(query=query_columns, str_gcp_project_id=str_gcp_project_id)
    tables = columns_df.groupby('table_name')['column_name'].apply(list).to_dict()

    # list of tables to be included in the fill rate calculation
    if bool_check_all_tables:
        # BUGFIX: was `tables.heys()`, which raised AttributeError.
        lst_included_tables = list(tables.keys())
    else:
        lst_included_tables = lst_tables

    lst_results = []

    # Loop over included tables and calculate each column's fill rate in SQL
    for table_name, column_list in tables.items():
        if table_name not in lst_included_tables:
            continue

        full_table_id = f"{str_gbq_project_id}.{str_dataset}.{table_name}"
        print(f" Processing table: {full_table_id}")

        for col in column_list:
            try:
                # NULL, empty string and the literal 'null' all count as "not filled".
                query_fillrate = f"""
                    SELECT
                        COUNT(1) AS total_count,
                        SUM(CASE WHEN {col} IS NULL OR LOWER(TRIM(CAST({col} AS STRING))) IN ('null', '') THEN 1
                                 ELSE 0 END) AS null_like_count,
                        ROUND(1 - SUM(CASE WHEN {col} IS NULL OR LOWER(TRIM(CAST({col} AS STRING))) IN ('null', '') THEN 1
                                          ELSE 0 END) / COUNT(1), 4) AS fill_rate
                    FROM `{full_table_id}`
                """

                df_stats = query_gbq(query=query_fillrate, str_gcp_project_id=str_gcp_project_id)
                stats = df_stats.iloc[0]

                lst_results.append({
                    'table_name': table_name,
                    'column_name': col,
                    'non_null_count': stats['total_count'] - stats['null_like_count'],
                    'null_like_count': stats['null_like_count'],
                    'total_count': stats['total_count'],
                    'fill_rate': stats['fill_rate']
                })

            except Exception as e:
                # Best effort: keep going when a single column fails
                # (e.g. a type that cannot be CAST to STRING).
                print(f" Failed for column '{col}' in table '{table_name}': {e}")

    # Format Output
    result_columns = ['Table Name', 'Column Name', 'Non Null Count', 'Null Like Count', 'Total Count', 'Fill Rate']

    if not lst_results:
        # EDGE CASE: nothing was analysed (no tables matched, or every
        # column failed). Return an empty frame with the expected headers
        # instead of raising a KeyError inside sort_values.
        return pd.DataFrame(columns=result_columns)

    df_result = pd.DataFrame(lst_results)
    df_result.sort_values(by='fill_rate', inplace=True)
    df_result.columns = result_columns

    # Save to CSV
    if bool_save_to_csv:
        df_result.to_csv("fill_rate_summary_{}_{}.csv".format(str_gbq_project_id, str_dataset), index=False)

    return df_result
99
+
@@ -0,0 +1,31 @@
1
+ import pandas as pd
2
+ import pandas_gbq
3
+ import bigframes.pandas as bpd
4
+ from google.cloud import bigquery
5
+
6
+ """
7
+ AUTHOR: Abisheak Jacob J
8
+ LAST MODIFIED: 04-09-2025
9
+ TITLE: Run SQL Query in GBQ
10
+ DESCRIPTION: Function to connect to GBQ and run SQL Queries
11
+ """
12
def query_gbq(
    query: str,
    str_gcp_project_id: str
) -> pd.DataFrame:

    """
    Connect to Google BigQuery (GBQ) and run a SQL query.

    Args:
        query (str): the SQL query that needs to be run in GBQ
        str_gcp_project_id (str): GCP project id under which the query is run;
            BigQuery usage is billed to this project

    Returns:
        pd.DataFrame: the output of the query is returned as a dataframe
    """

    # execute the query in the respective project; use_bqstorage_api downloads
    # large result sets via the faster BigQuery Storage API
    df_table = pandas_gbq.read_gbq(query, project_id=str_gcp_project_id, dialect="standard", use_bqstorage_api=True)

    return df_table
@@ -0,0 +1,13 @@
1
# Re-export the public preprocessing helpers so callers can write
# `from jacobtools.preprocessing import drop_columns_by_schema`.

from .drop_columns_by_schema import drop_columns_by_schema

# BUGFIX: __all__ entries must be strings (names), not the objects
# themselves; otherwise a star-import of this package raises TypeError.
__all__ = [
    "drop_columns_by_schema",
]
@@ -0,0 +1,48 @@
1
+ # import libraries
2
+ from jacobtools.config import SchemaStructure
3
+ import pandas as pd
4
+
5
+ """
6
+ AUTHOR: Abisheak Jacob J
7
+ LAST MODIFIED: 01-08-2025
8
+ TITLE: Drop Columns by Schema Document
9
+ DESCRIPTION: Drop columns that are not significant in providing any insight into the data using the DropFlag in the schema document
10
+ """
11
+
12
def drop_columns_by_schema(
    df: pd.DataFrame,
    schema_df: pd.DataFrame,
    drop_flag_column: int = SchemaStructure.DROP_FLAG
) -> pd.DataFrame:

    # DOCSTRING
    """
    Drop columns flagged in the schema document (in place).

    Row i of `schema_df` is assumed to describe column i of `df`; a truthy
    value in the drop-flag column marks that column for removal.

    PARAMETERS
    -------------------
    df: pd.DataFrame
        Input DataFrame whose flagged columns are dropped.
        NOTE: it is modified in place and also returned.
    schema_df: pd.DataFrame
        Schema DataFrame containing the DropFlag, one row per column of `df`
    drop_flag_column: int
        Positional index of the DropFlag column inside `schema_df`

    RETURNS
    -------------------
    pd.DataFrame
        The same DataFrame after the flagged columns are dropped

    RAISES
    --------------------
    ValueError
        If `schema_df` does not have exactly one row per column of `df`
    """

    # BUGFIX: a row/column count mismatch previously mis-aligned flags with
    # columns silently (zip truncates); fail loudly as the docstring promises.
    if len(schema_df) != len(df.columns):
        raise ValueError(
            f"schema_df has {len(schema_df)} rows but df has {len(df.columns)} "
            f"columns; one schema row per DataFrame column is required"
        )

    # drop the columns whose schema row carries a truthy drop flag
    df.drop(
        columns=[col for col, flag in zip(df.columns.to_list(), schema_df.iloc[:, drop_flag_column]) if flag],
        inplace=True
    )

    return df
@@ -0,0 +1,5 @@
1
from .postgres import generate_create_table_sql

# BUGFIX: __all__ entries must be strings (names), not the objects
# themselves; otherwise a star-import of this package raises TypeError.
__all__ = [
    "generate_create_table_sql",
]
@@ -0,0 +1,6 @@
1
from .generate_create_table_sql import generate_create_table_sql, infer_pg_type

# BUGFIX: __all__ entries must be strings (names), not the objects
# themselves; otherwise a star-import of this package raises TypeError.
__all__ = [
    "generate_create_table_sql",
    "infer_pg_type",
]
@@ -0,0 +1,95 @@
1
+ # import libraries
2
+ import pandas as pd
3
+ import numpy as np
4
+
5
+ """
6
+ AUTHOR: Abisheak Jacob J
7
+ LAST MODIFIED: 04-09-2025
8
+ TITLE: Generate sql code to create a table in postgres
9
+ DESCRIPTION: Generate SQL code to automate the process of writing "CREATE TABLE" sql code.
10
+ """
11
+
12
def infer_pg_type(
    series: pd.Series
) -> str:

    # DOCSTRING
    """
    Infer an efficient Postgres data type from a pandas Series.

    PARAMETERS
    -------------------
    series: pd.Series
        A column of the dataframe for which we are generating SQL code
        (ideally with nulls already dropped).

    RETURNS
    -------------------
    string: str
        Returns a string that contains the data type of the column

    """
    # integer datatype: pick the smallest integer width that fits the range
    if pd.api.types.is_integer_dtype(series):
        min_val, max_val = series.min(), series.max()
        if min_val >= -32768 and max_val <= 32767:
            return "SMALLINT"  # 2 bytes
        elif min_val >= -2147483648 and max_val <= 2147483647:
            return "INTEGER"  # 4 bytes
        else:
            return "BIGINT"  # 8 bytes
    elif pd.api.types.is_float_dtype(series):
        return "DOUBLE PRECISION"  # PostgreSQL's default 8-byte float
    elif pd.api.types.is_bool_dtype(series):
        return "BOOLEAN"
    elif pd.api.types.is_datetime64_any_dtype(series):
        return "TIMESTAMP"
    else:
        # Size VARCHAR from the longest observed value, with some headroom.
        max_len = series.astype(str).map(len).max()
        # BUGFIX: max() of an empty series is NaN, which previously produced
        # the invalid type "VARCHAR(nan)". Fall back to TEXT instead.
        if pd.isna(max_len):
            return "TEXT"
        if max_len <= 255:
            return f"VARCHAR({max_len + 10})"
        else:
            return "TEXT"
52
+
53
def generate_create_table_sql(
    csv_file: str,
    table_name: str = "my_table"
) -> str:

    # DOCSTRING
    """
    Generate CREATE TABLE sql for postgres.

    Column types are inferred from a sample of the CSV via `infer_pg_type`.
    The generated SQL is printed (for easy copy/paste) and also returned.

    PARAMETERS
    -------------------
    csv_file: str
        Path to csv file for which table schema is to be created
    table_name: str
        Name of the table in postgres database

    RETURNS
    -------------------
    str
        SQL Code

    RAISES
    --------------------
    ValueError
        If the column is not in the DataFrame or any other errors
    """
    # Load only the first 10,000 rows for type inference
    # (comment previously said 100000, which did not match the code)
    df = pd.read_csv(csv_file, nrows=10000)

    # infer efficient data type for each column, ignoring nulls
    column_types = {col: infer_pg_type(df[col].dropna()) for col in df.columns}

    # generate sql code
    sql_lines = [f'DROP TABLE IF EXISTS "{table_name}";\n\nCREATE TABLE "{table_name}" (']

    for col, dtype in column_types.items():
        sql_lines.append(f'    "{col}" {dtype},')

    sql_lines[-1] = sql_lines[-1].rstrip(",")  # remove last comma

    sql_lines.append(");")

    sql = "\n".join(sql_lines)

    # BUGFIX: the original did `return print(...)`, which always returned
    # None despite the `-> str` annotation. Keep the print for interactive
    # use, but return the SQL string as documented.
    print(sql)
    return sql
@@ -0,0 +1,63 @@
1
+ import matplotlib.pyplot as plt
2
+ import seaborn as sns
3
+
4
def set_global_style():
    """Apply the shared look-and-feel used by every chart in this module."""
    # Base seaborn theme (alternatives: 'white', 'darkgrid', ...).
    sns.set_theme(style='whitegrid')

    # Matplotlib overrides layered on top of the seaborn theme.
    style_overrides = {
        'figure.figsize': (10, 6),
        'axes.titlesize': 16,
        'axes.labelsize': 14,
        'axes.edgecolor': '#333333',
        'axes.linewidth': 1.2,
        'axes.grid': True,
        'grid.color': '#dddddd',
        'grid.linestyle': '--',
        'grid.linewidth': 0.5,
        'xtick.labelsize': 12,
        'ytick.labelsize': 12,
        'legend.fontsize': 12,
        'legend.frameon': True,
        'legend.framealpha': 0.9,
        'legend.edgecolor': '#444444',
        'font.family': 'sans-serif',
        'font.sans-serif': 'DejaVu Sans'
    }
    plt.rcParams.update(style_overrides)

    # Default palette (a custom sns.color_palette([...]) also works here).
    sns.set_palette("Set2")
27
+
28
+
29
+
30
def plot_line(data, x, y, hue=None, title='', xlabel='', ylabel=''):
    """Draw a styled line chart; axis labels default to the column names."""
    set_global_style()
    plt.figure()
    axis = sns.lineplot(data=data, x=x, y=y, hue=hue)
    axis.set_title(title)
    axis.set_xlabel(xlabel or x)
    axis.set_ylabel(ylabel or y)
    plt.tight_layout()
    plt.show()
39
+
40
+
41
def plot_bar(data, x, y, hue=None, title='', xlabel='', ylabel='', orient='v'):
    """Draw a styled bar chart; axis labels default to the column names."""
    set_global_style()
    plt.figure()
    axis = sns.barplot(data=data, x=x, y=y, hue=hue, orient=orient)
    axis.set_title(title)
    axis.set_xlabel(xlabel or x)
    axis.set_ylabel(ylabel or y)
    plt.tight_layout()
    plt.show()
50
+
51
+
52
def plot_heatmap(data, title='', xlabel='', ylabel='', annot=True, fmt=".2f", cmap='coolwarm'):
    """Draw a styled, annotated heatmap with a colour bar."""
    set_global_style()
    plt.figure()
    axis = sns.heatmap(data, annot=annot, fmt=fmt, cmap=cmap, cbar=True)
    axis.set_title(title)
    axis.set_xlabel(xlabel)
    axis.set_ylabel(ylabel)
    plt.tight_layout()
    plt.show()
61
+
62
def save_plot(filename, dpi=300):
    """Persist the current matplotlib figure to disk with tight margins."""
    plt.savefig(filename, dpi=dpi, bbox_inches='tight', transparent=False)
@@ -0,0 +1,160 @@
1
+ # %%
2
+ # import the libraries
3
+ from sqlalchemy import create_engine, inspect, MetaData, Table, text
4
+ import pandas as pd
5
+
6
+
7
+ # %%
8
+ # defining the parameters for the database connection as global variables
9
def infodb(host, user, passw, dbname, show_url=False):
    """
    Store the MySQL connection parameters as module-level globals and
    build the database URL used by the other helpers in this module.

    :param host: object
    :param user: object
    :param passw: object
    :param dbname: object
    :param show_url: when True, print the assembled URL (it contains the
        password, so only enable this for local debugging)
    :return:
    """

    global hostname
    global username
    global password
    global databasename
    global databaseurl

    # Normalise every credential to a string before storing it.
    hostname = str(host)
    username = str(user)
    password = str(passw)
    databasename = str(dbname)
    databaseurl = "mysql://" + user + ":" + password + "@" + host + "/" + databasename

    if show_url:
        print(f"The database url is: {databaseurl}")
39
+
40
+
41
+ # %%
42
+ # defining the list all tables in the database function
43
def listtb():
    """
    List all the tables in the configured database.

    Requires `infodb` to have been called first so `databaseurl` is set.

    :return: a one-column DataFrame of table names, or the error message
        as a string if anything goes wrong
    """
    try:
        # Connect and introspect the schema.
        engine = create_engine(databaseurl)
        inspector = inspect(engine)
        table_names = inspector.get_table_names()
        return pd.DataFrame(table_names)

    except Exception as e:
        # Deliberate best-effort contract: report the failure as a string.
        return str(e)
65
+
66
+
67
+ # %%
68
+ # upload a dataframe to the database
69
def uploadtb(df, tbname):
    """
    Replace the contents of a database table with a dataframe.

    Existing rows are deleted first, then the dataframe is appended.

    :param df: dataframe
    :param tbname: object
    :return: a success message, or the error message as a string on failure
    """

    tbname = str(tbname)

    try:
        engine = create_engine(databaseurl)

        # Clear the table first so the upload behaves as a full replace.
        with engine.connect() as conn:
            conn.execute(text(f"DELETE FROM {tbname}"))
            conn.commit()

        df.to_sql(tbname, engine, if_exists="append", index=False)

        return "Dataframe uploaded successfully"

    except Exception as e:
        # Deliberate best-effort contract: report the failure as a string.
        return str(e)
99
+
100
+
101
+ # %%
102
+ # download a table from the database
103
def downloadtb(tbname):
    """
    Download a table from the database as a dataframe.

    :param tbname: object
    :return: the table as a DataFrame, or None on failure (the error
        message is printed)
    """

    tbname = str(tbname)

    try:
        engine = create_engine(databaseurl)
        return pd.read_sql(tbname, engine)

    except Exception as e:
        # NOTE: unlike the sibling helpers, this one prints the error and
        # implicitly returns None.
        print(str(e))
127
+
128
+
129
+ # %%
130
+ # delete a table from the database
131
def deletetb(tbname):
    """
    Drop a table from the database.

    :param tbname: object
    :return: a success message, or the error message as a string on failure
    """

    tbname = str(tbname)

    try:
        engine = create_engine(databaseurl)
        metadata = MetaData()

        # Reflect the existing table definition, then drop it.
        existing_table = Table(tbname, metadata, autoload_with=engine)
        existing_table.drop(engine, checkfirst=True)

        return "Table deleted successfully"

    except Exception as e:
        # Deliberate best-effort contract: report the failure as a string.
        return str(e)
@@ -0,0 +1,7 @@
1
def singleton(class_):
    """Class decorator: constructor arguments of the first call win; every
    later call returns the same cached instance."""
    _cache = {}

    def getinstance(*args, **kwargs):
        try:
            return _cache[class_]
        except KeyError:
            obj = class_(*args, **kwargs)
            _cache[class_] = obj
            return obj

    return getinstance
@@ -0,0 +1,36 @@
1
+ # importing the packages
2
+
3
+ from itertools import combinations
4
+ from collections import Counter
5
+ import pandas as pd
6
+ # creating a function to calculate the count of all the possible combinations
7
+
8
def combo(df_data, column_name, groupby_column, combo_no):
    """
    Count how often each combination of `combo_no` items from `column_name`
    co-occurs within the same `groupby_column` group.

    Returns the combinations as (tuple, count) pairs, most frequent first.
    Member order inside each tuple is normalised by sorting, so e.g.
    ("a", "b") and ("b", "a") are counted together.
    """
    # Collect the items of each group into a list, one row per pivot value.
    grouped = df_data.groupby(groupby_column)[column_name].agg(list).reset_index()

    # Expand every group into its size-`combo_no` combinations.
    grouped['all_combinations'] = grouped[column_name].apply(
        lambda items: list(combinations(items, combo_no))
    )

    # Tally each combination across all groups.
    tallies = Counter(
        tuple(sorted(combination))
        for row in grouped['all_combinations']
        for combination in row
    )

    # Most frequent combinations first.
    return tallies.most_common()
@@ -0,0 +1,34 @@
1
+ # importing the packages
2
+ import pandas as pd
3
+
4
+
5
+ # defining a fuction to convert the function to lowercase
6
def fheader(df_name):
    """Normalise a DataFrame's column headers in place.

    Each header is lower-cased, stripped of surrounding whitespace, and has
    its remaining spaces replaced with underscores.

    Parameters
    ----------
    df_name : pandas.DataFrame
        DataFrame whose columns are renamed (mutated in place).

    Returns
    -------
    pandas.DataFrame
        The same DataFrame, with cleaned headers.
    """
    # lowercase -> trim -> spaces to underscores, in one pass per header
    df_name.columns = [
        header.lower().strip().replace(" ", "_") for header in df_name.columns
    ]
    return df_name
@@ -0,0 +1,29 @@
1
+ reason_no,reason_details
2
+ 1,Certain infectious and parasitic diseases
3
+ 2,Neoplasms
4
+ 3,Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism
5
+ 4,"Endocrine, nutritional and metablolic diseases"
6
+ 5,Mental and behavioural disorders
7
+ 6,Diseases of the nervous system
8
+ 7,Diseases of the eye and adnexa
9
+ 8,Diseases of the ear and mastoid process
10
+ 9,Diseases of the circulatory system
11
+ 10,Diseases of the respiratory system
12
+ 11,Diseases of the digestive system
13
+ 12,Diseases of the skin and subcutaneous tissue
14
+ 13,Diseases of the musculoskeletal system and connective tissue
15
+ 14,Diseases of the genitourinary system
16
+ 15,"Pregnancy, childbirth and the puerperium"
17
+ 16,Certain conditions originating in the perinatal period
18
+ 17,"Congenital malformations, deformations and chromosal abnormalities"
19
+ 18,"Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified"
20
+ 19,"Injury, poisoning and certain other consequences of external causes"
21
+ 20,External causes of morbidity and mortality
22
+ 21,Factors influencing health status and contact with health services
23
+ 22,Patient follow-up
24
+ 23,Medical consultation
25
+ 24,Blood donation
26
+ 25,Laboratory examination
27
+ 26,Unjustified absence
28
+ 27,Physiotherapy
29
+ 28,Dental Consultation
@@ -0,0 +1,5 @@
1
# Expose submodules at top level

from . import harmonic_mean

# __all__ must contain attribute *names* (strings); listing the module
# object itself makes `from jacobtools_test import *` raise TypeError.
__all__ = ["harmonic_mean"]
@@ -0,0 +1,2 @@
1
def harmonic_mean(nums):
    """Return the harmonic mean of the numbers in *nums*.

    Raises ZeroDivisionError when *nums* is empty or contains a zero
    (callers rely on this exception for their error handling).
    """
    reciprocal_total = sum(1 / value for value in nums)
    return len(nums) / reciprocal_total
@@ -0,0 +1,19 @@
1
+ import sys
2
+ from jacobtools_test.harmonic_mean import harmonic_mean
3
+ from termcolor import cprint
4
+
5
+
6
+ def main():
7
+ result = 0.0
8
+
9
+ try:
10
+ nums = [float(num) for num in sys.argv[1:]]
11
+ except ValueError:
12
+ nums = []
13
+
14
+ try:
15
+ result = harmonic_mean(nums)
16
+ except ZeroDivisionError:
17
+ pass
18
+
19
+ cprint(result, "red", "on_cyan", attrs=["bold"])
@@ -0,0 +1,17 @@
1
+ Metadata-Version: 2.4
2
+ Name: jacobtools_test
3
+ Version: 0.0.1
4
+ Summary: This package contains functions that help me tackle my day to day data analytics requirements
5
+ Home-page: https://github.com/AbisheakJacob/jacobtools
6
+ Author: Abisheak Jacob J
7
+ Author-email: "AbisheakJacob0032" abisheakjacob0032@gmail.com
8
+ License: MIT
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Requires-Python: >=3.9
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: numpy>=2.1.0
14
+ Requires-Dist: termcolor>=1.1.0
15
+ Dynamic: license-file
16
+
17
+ README.md
@@ -0,0 +1,41 @@
1
+ LICENSE
2
+ MANIFEST.in
3
+ README.md
4
+ pyproject.toml
5
+ setup.cfg
6
+ src/jacobtools/__init__.py
7
+ src/jacobtools/config.py
8
+ src/jacobtools/__pycache__/__init__.cpython-312.pyc
9
+ src/jacobtools/__pycache__/config.cpython-312.pyc
10
+ src/jacobtools/gbq_queries/__init__.py
11
+ src/jacobtools/gbq_queries/fillrate_analysis.py
12
+ src/jacobtools/gbq_queries/query_gbq.py
13
+ src/jacobtools/gbq_queries/__pycache__/__init__.cpython-312.pyc
14
+ src/jacobtools/gbq_queries/__pycache__/fillrate_analysis.cpython-312.pyc
15
+ src/jacobtools/gbq_queries/__pycache__/query_gbq.cpython-312.pyc
16
+ src/jacobtools/preprocessing/__init__.py
17
+ src/jacobtools/preprocessing/drop_columns_by_schema.py
18
+ src/jacobtools/preprocessing/enforce_schema.py
19
+ src/jacobtools/preprocessing/__pycache__/__init__.cpython-312.pyc
20
+ src/jacobtools/preprocessing/__pycache__/drop_columns_by_schema.cpython-312.pyc
21
+ src/jacobtools/sql_generation/__init__.py
22
+ src/jacobtools/sql_generation/__pycache__/__init__.cpython-312.pyc
23
+ src/jacobtools/sql_generation/postgres/__init__.py
24
+ src/jacobtools/sql_generation/postgres/generate_create_table_sql.py
25
+ src/jacobtools/sql_generation/postgres/__pycache__/__init__.cpython-312.pyc
26
+ src/jacobtools/sql_generation/postgres/__pycache__/generate_create_table_sql.cpython-312.pyc
27
+ src/jacobtools/temp/chart.py
28
+ src/jacobtools/temp/database.py
29
+ src/jacobtools/temp/decorators.py
30
+ src/jacobtools/temp/it.py
31
+ src/jacobtools/temp/pa.py
32
+ src/jacobtools/temp/reason_for_absence.csv
33
+ src/jacobtools_test/__init__.py
34
+ src/jacobtools_test/harmonic_mean.py
35
+ src/jacobtools_test/harmony.py
36
+ src/jacobtools_test.egg-info/PKG-INFO
37
+ src/jacobtools_test.egg-info/SOURCES.txt
38
+ src/jacobtools_test.egg-info/dependency_links.txt
39
+ src/jacobtools_test.egg-info/entry_points.txt
40
+ src/jacobtools_test.egg-info/requires.txt
41
+ src/jacobtools_test.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ harmony = jacobtools_test.harmony:main
@@ -0,0 +1,2 @@
1
+ numpy>=2.1.0
2
+ termcolor>=1.1.0
@@ -0,0 +1,2 @@
1
+ jacobtools
2
+ jacobtools_test