JacobTools 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jacobtools-0.0.1/LICENSE +21 -0
- jacobtools-0.0.1/MANIFEST.in +2 -0
- jacobtools-0.0.1/PKG-INFO +17 -0
- jacobtools-0.0.1/README.md +68 -0
- jacobtools-0.0.1/pyproject.toml +7 -0
- jacobtools-0.0.1/setup.cfg +123 -0
- jacobtools-0.0.1/src/JacobTools.egg-info/PKG-INFO +17 -0
- jacobtools-0.0.1/src/JacobTools.egg-info/SOURCES.txt +47 -0
- jacobtools-0.0.1/src/JacobTools.egg-info/dependency_links.txt +1 -0
- jacobtools-0.0.1/src/JacobTools.egg-info/entry_points.txt +2 -0
- jacobtools-0.0.1/src/JacobTools.egg-info/requires.txt +2 -0
- jacobtools-0.0.1/src/JacobTools.egg-info/top_level.txt +2 -0
- jacobtools-0.0.1/src/jacobtools/__init__.py +14 -0
- jacobtools-0.0.1/src/jacobtools/__pycache__/__init__.cpython-312.pyc +0 -0
- jacobtools-0.0.1/src/jacobtools/__pycache__/config.cpython-312.pyc +0 -0
- jacobtools-0.0.1/src/jacobtools/config.py +10 -0
- jacobtools-0.0.1/src/jacobtools/gbq_queries/__init__.py +15 -0
- jacobtools-0.0.1/src/jacobtools/gbq_queries/__pycache__/__init__.cpython-312.pyc +0 -0
- jacobtools-0.0.1/src/jacobtools/gbq_queries/__pycache__/fillrate_analysis.cpython-312.pyc +0 -0
- jacobtools-0.0.1/src/jacobtools/gbq_queries/__pycache__/query_gbq.cpython-312.pyc +0 -0
- jacobtools-0.0.1/src/jacobtools/gbq_queries/fillrate_analysis.py +99 -0
- jacobtools-0.0.1/src/jacobtools/gbq_queries/query_gbq.py +31 -0
- jacobtools-0.0.1/src/jacobtools/preprocessing/__init__.py +13 -0
- jacobtools-0.0.1/src/jacobtools/preprocessing/__pycache__/__init__.cpython-312.pyc +0 -0
- jacobtools-0.0.1/src/jacobtools/preprocessing/__pycache__/drop_columns_by_schema.cpython-312.pyc +0 -0
- jacobtools-0.0.1/src/jacobtools/preprocessing/drop_columns_by_schema.py +48 -0
- jacobtools-0.0.1/src/jacobtools/preprocessing/enforce_schema.py +0 -0
- jacobtools-0.0.1/src/jacobtools/sql_generation/__init__.py +5 -0
- jacobtools-0.0.1/src/jacobtools/sql_generation/__pycache__/__init__.cpython-312.pyc +0 -0
- jacobtools-0.0.1/src/jacobtools/sql_generation/postgres/__init__.py +6 -0
- jacobtools-0.0.1/src/jacobtools/sql_generation/postgres/__pycache__/__init__.cpython-312.pyc +0 -0
- jacobtools-0.0.1/src/jacobtools/sql_generation/postgres/__pycache__/generate_create_table_sql.cpython-312.pyc +0 -0
- jacobtools-0.0.1/src/jacobtools/sql_generation/postgres/generate_create_table_sql.py +95 -0
- jacobtools-0.0.1/src/jacobtools/temp/chart.py +63 -0
- jacobtools-0.0.1/src/jacobtools/temp/database.py +160 -0
- jacobtools-0.0.1/src/jacobtools/temp/decorators.py +7 -0
- jacobtools-0.0.1/src/jacobtools/temp/it.py +36 -0
- jacobtools-0.0.1/src/jacobtools/temp/pa.py +34 -0
- jacobtools-0.0.1/src/jacobtools/temp/reason_for_absence.csv +29 -0
- jacobtools-0.0.1/src/jacobtools_test/__init__.py +5 -0
- jacobtools-0.0.1/src/jacobtools_test/harmonic_mean.py +2 -0
- jacobtools-0.0.1/src/jacobtools_test/harmony.py +19 -0
- jacobtools-0.0.1/src/jacobtools_test.egg-info/PKG-INFO +17 -0
- jacobtools-0.0.1/src/jacobtools_test.egg-info/SOURCES.txt +41 -0
- jacobtools-0.0.1/src/jacobtools_test.egg-info/dependency_links.txt +1 -0
- jacobtools-0.0.1/src/jacobtools_test.egg-info/entry_points.txt +2 -0
- jacobtools-0.0.1/src/jacobtools_test.egg-info/requires.txt +2 -0
- jacobtools-0.0.1/src/jacobtools_test.egg-info/top_level.txt +2 -0
jacobtools-0.0.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2021 Abisheak Jacob J
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: JacobTools
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: This package contains functions that help me tackle my day to day data analytics requirements
|
|
5
|
+
Home-page: https://github.com/AbisheakJacob/JacobTools
|
|
6
|
+
Author: Abisheak Jacob J
|
|
7
|
+
Author-email: "abisheakjacob0032@gmail.com"
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Requires-Python: >=3.9
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: numpy>=2.1.0
|
|
14
|
+
Requires-Dist: termcolor>=1.1.0
|
|
15
|
+
Dynamic: license-file
|
|
16
|
+
|
|
17
|
+
README.md
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# JacobTools
|
|
2
|
+
|
|
3
|
+
**JacobTools** is my personal Python library that contains code to automate day to day activities.
|
|
4
|
+
|
|
5
|
+
## Installation and updating
|
|
6
|
+
|
|
7
|
+
Installing the package from the github repository
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install git+https://github.com/AbisheakJacob/JacobTools
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Installing the local repository so that it updates automatically when a change is made.
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install .
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
### Creating C extension
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
from setuptools import setup
|
|
23
|
+
from Cython.Build import cythonize
|
|
24
|
+
|
|
25
|
+
setup(
|
|
26
|
+
ext_modules=cythonize("src/jacobtools/harmonic_mean.pyx")
|
|
27
|
+
)
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Structure
|
|
31
|
+
|
|
32
|
+
### gbq_queries
|
|
33
|
+
|
|
34
|
+
This module contains codes to connect and work with data in Google BigQuery.
|
|
35
|
+
|
|
36
|
+
| Function Name | Description |
|
|
37
|
+
| ----------------- | -------------------------------------------------- |
|
|
38
|
+
| query_gbq | Read a table from GBQ as a DataFrame |
|
|
39
|
+
| fillrate_analysis | Perform Fill Rate Analysis on GBQ dataset or table |
|
|
40
|
+
|
|
41
|
+
### preprocessing
|
|
42
|
+
|
|
43
|
+
This module contains preprocessing steps to be performed on the data before actual analysis.
|
|
44
|
+
|
|
45
|
+
| Function Name | Description |
|
|
46
|
+
| ---------------------- | --------------------------------------------------------------------------- |
|
|
47
|
+
| drop_columns_by_schema | Drop the columns that are marked in the Schema Document |
|
|
48
|
+
| enforce_schema | Apply the datatypes provided in schema document to the Columns in the Table |
|
|
49
|
+
|
|
50
|
+
### sql_generation
|
|
51
|
+
|
|
52
|
+
This module automatically generate SQL codes from DataFrame for ease of use.
|
|
53
|
+
|
|
54
|
+
#### postgres
|
|
55
|
+
|
|
56
|
+
| Function Name | Description |
|
|
57
|
+
| ------------------------- | ------------------------------------------------------------------ |
|
|
58
|
+
| generate_create_table_sql | Generates SQL code to create the schema to upload data to postgres |
|
|
59
|
+
|
|
60
|
+
## Next Steps
|
|
61
|
+
|
|
62
|
+
1. Custom function to perform basic eda on a given dataframe (info, null values, shape, size)
|
|
63
|
+
2. Function to perform match% analysis and render a Venn diagram for easier visualization
|
|
64
|
+
|
|
65
|
+
## License
|
|
66
|
+
|
|
67
|
+
**_The Reference to this library can be found here:_**
|
|
68
|
+
The base construct of this library is referenced from [this article](https://mikehuls.medium.com/create-your-custom-python-package-that-you-can-pip-install-from-your-git-repository-f90465867893)
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
[metadata]
|
|
2
|
+
name = JacobTools
|
|
3
|
+
version = 0.0.1
|
|
4
|
+
url = https://github.com/AbisheakJacob/JacobTools
|
|
5
|
+
author = Abisheak Jacob J
|
|
6
|
+
author_email = "abisheakjacob0032@gmail.com"
|
|
7
|
+
description = This package contains functions that help me tackle my day to day data analytics requirements
|
|
8
|
+
long_description = README.md
|
|
9
|
+
long_description_content_type = text/markdown
|
|
10
|
+
license = MIT
|
|
11
|
+
license_file = LICENSE
|
|
12
|
+
classifiers =
|
|
13
|
+
License :: OSI Approved :: MIT License
|
|
14
|
+
|
|
15
|
+
[options]
|
|
16
|
+
package_dir =
|
|
17
|
+
=src
|
|
18
|
+
packages = find:
|
|
19
|
+
include_package_data = True
|
|
20
|
+
python_requires = >=3.9
|
|
21
|
+
install_requires =
|
|
22
|
+
numpy>=2.1.0
|
|
23
|
+
termcolor>=1.1.0
|
|
24
|
+
|
|
25
|
+
[options.packages.find]
|
|
26
|
+
where = src
|
|
27
|
+
exclude =
|
|
28
|
+
test*
|
|
29
|
+
|
|
30
|
+
[options.entry_points]
|
|
31
|
+
console_scripts =
|
|
32
|
+
harmony = jacobtools_test.harmony:main
|
|
33
|
+
|
|
34
|
+
[flake8]
|
|
35
|
+
max-line-length = 100
|
|
36
|
+
|
|
37
|
+
[tool:pytest]
|
|
38
|
+
testpaths =
|
|
39
|
+
test
|
|
40
|
+
addopts =
|
|
41
|
+
--cov
|
|
42
|
+
--strict-markers
|
|
43
|
+
--disable-warnings
|
|
44
|
+
xfail_strict = True
|
|
45
|
+
|
|
46
|
+
[coverage:run]
|
|
47
|
+
branch = True
|
|
48
|
+
source = jacobtools_test
|
|
49
|
+
|
|
50
|
+
[coverage:report]
|
|
51
|
+
show_missing = True
|
|
52
|
+
skip_covered = True
|
|
53
|
+
|
|
54
|
+
[coverage:paths]
|
|
55
|
+
source =
|
|
56
|
+
src/jacobtools_test/
|
|
57
|
+
*/site-packages/jacobtools_test/
|
|
58
|
+
|
|
59
|
+
[tox:tox]
|
|
60
|
+
isolated_build = True
|
|
61
|
+
envlist = py312
|
|
62
|
+
|
|
63
|
+
[testimports]
|
|
64
|
+
deps =
|
|
65
|
+
pytest
|
|
66
|
+
requests
|
|
67
|
+
|
|
68
|
+
[testenv]
|
|
69
|
+
deps =
|
|
70
|
+
{[testimports]deps}
|
|
71
|
+
pytest-cov
|
|
72
|
+
commands =
|
|
73
|
+
pytest {posargs}
|
|
74
|
+
|
|
75
|
+
[testenv:get_my_ip]
|
|
76
|
+
skip_install = True
|
|
77
|
+
deps =
|
|
78
|
+
requests
|
|
79
|
+
commands =
|
|
80
|
+
python -c "import requests; print(requests.get('https://canhazip.com').text)"
|
|
81
|
+
|
|
82
|
+
[testenv:check-imports]
|
|
83
|
+
deps =
|
|
84
|
+
{[testimports]deps}
|
|
85
|
+
shipyard
|
|
86
|
+
commands =
|
|
87
|
+
python -m shipyard verify
|
|
88
|
+
|
|
89
|
+
[testenv:typecheck]
|
|
90
|
+
deps =
|
|
91
|
+
mypy
|
|
92
|
+
pytest
|
|
93
|
+
types-termcolor
|
|
94
|
+
commands =
|
|
95
|
+
mypy --ignore-missing-imports {posargs:src test}
|
|
96
|
+
|
|
97
|
+
[mypy]
|
|
98
|
+
python_version = 3.13
|
|
99
|
+
warn_unused_configs = True
|
|
100
|
+
show_error_context = True
|
|
101
|
+
pretty = True
|
|
102
|
+
namespace_packages = True
|
|
103
|
+
check_untyped_defs = True
|
|
104
|
+
|
|
105
|
+
[testenv:format]
|
|
106
|
+
skip_install = True
|
|
107
|
+
deps =
|
|
108
|
+
black
|
|
109
|
+
commands =
|
|
110
|
+
black {posargs:--check --diff src/jacobtools_test test}
|
|
111
|
+
|
|
112
|
+
[testenv:lint]
|
|
113
|
+
skip_install = True
|
|
114
|
+
deps =
|
|
115
|
+
flake8
|
|
116
|
+
flake8-bugbear
|
|
117
|
+
commands =
|
|
118
|
+
flake8 {posargs:src/jacobtools_test test}
|
|
119
|
+
|
|
120
|
+
[egg_info]
|
|
121
|
+
tag_build =
|
|
122
|
+
tag_date = 0
|
|
123
|
+
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: JacobTools
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: This package contains functions that help me tackle my day to day data analytics requirements
|
|
5
|
+
Home-page: https://github.com/AbisheakJacob/JacobTools
|
|
6
|
+
Author: Abisheak Jacob J
|
|
7
|
+
Author-email: "abisheakjacob0032@gmail.com"
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Requires-Python: >=3.9
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: numpy>=2.1.0
|
|
14
|
+
Requires-Dist: termcolor>=1.1.0
|
|
15
|
+
Dynamic: license-file
|
|
16
|
+
|
|
17
|
+
README.md
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
MANIFEST.in
|
|
3
|
+
README.md
|
|
4
|
+
pyproject.toml
|
|
5
|
+
setup.cfg
|
|
6
|
+
src/JacobTools.egg-info/PKG-INFO
|
|
7
|
+
src/JacobTools.egg-info/SOURCES.txt
|
|
8
|
+
src/JacobTools.egg-info/dependency_links.txt
|
|
9
|
+
src/JacobTools.egg-info/entry_points.txt
|
|
10
|
+
src/JacobTools.egg-info/requires.txt
|
|
11
|
+
src/JacobTools.egg-info/top_level.txt
|
|
12
|
+
src/jacobtools/__init__.py
|
|
13
|
+
src/jacobtools/config.py
|
|
14
|
+
src/jacobtools/__pycache__/__init__.cpython-312.pyc
|
|
15
|
+
src/jacobtools/__pycache__/config.cpython-312.pyc
|
|
16
|
+
src/jacobtools/gbq_queries/__init__.py
|
|
17
|
+
src/jacobtools/gbq_queries/fillrate_analysis.py
|
|
18
|
+
src/jacobtools/gbq_queries/query_gbq.py
|
|
19
|
+
src/jacobtools/gbq_queries/__pycache__/__init__.cpython-312.pyc
|
|
20
|
+
src/jacobtools/gbq_queries/__pycache__/fillrate_analysis.cpython-312.pyc
|
|
21
|
+
src/jacobtools/gbq_queries/__pycache__/query_gbq.cpython-312.pyc
|
|
22
|
+
src/jacobtools/preprocessing/__init__.py
|
|
23
|
+
src/jacobtools/preprocessing/drop_columns_by_schema.py
|
|
24
|
+
src/jacobtools/preprocessing/enforce_schema.py
|
|
25
|
+
src/jacobtools/preprocessing/__pycache__/__init__.cpython-312.pyc
|
|
26
|
+
src/jacobtools/preprocessing/__pycache__/drop_columns_by_schema.cpython-312.pyc
|
|
27
|
+
src/jacobtools/sql_generation/__init__.py
|
|
28
|
+
src/jacobtools/sql_generation/__pycache__/__init__.cpython-312.pyc
|
|
29
|
+
src/jacobtools/sql_generation/postgres/__init__.py
|
|
30
|
+
src/jacobtools/sql_generation/postgres/generate_create_table_sql.py
|
|
31
|
+
src/jacobtools/sql_generation/postgres/__pycache__/__init__.cpython-312.pyc
|
|
32
|
+
src/jacobtools/sql_generation/postgres/__pycache__/generate_create_table_sql.cpython-312.pyc
|
|
33
|
+
src/jacobtools/temp/chart.py
|
|
34
|
+
src/jacobtools/temp/database.py
|
|
35
|
+
src/jacobtools/temp/decorators.py
|
|
36
|
+
src/jacobtools/temp/it.py
|
|
37
|
+
src/jacobtools/temp/pa.py
|
|
38
|
+
src/jacobtools/temp/reason_for_absence.csv
|
|
39
|
+
src/jacobtools_test/__init__.py
|
|
40
|
+
src/jacobtools_test/harmonic_mean.py
|
|
41
|
+
src/jacobtools_test/harmony.py
|
|
42
|
+
src/jacobtools_test.egg-info/PKG-INFO
|
|
43
|
+
src/jacobtools_test.egg-info/SOURCES.txt
|
|
44
|
+
src/jacobtools_test.egg-info/dependency_links.txt
|
|
45
|
+
src/jacobtools_test.egg-info/entry_points.txt
|
|
46
|
+
src/jacobtools_test.egg-info/requires.txt
|
|
47
|
+
src/jacobtools_test.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# Expose the package's main entry points at the top level so callers can
# write e.g. `jacobtools.fillrate_analysis(...)` directly.

from . import config
from .preprocessing import drop_columns_by_schema
from .sql_generation import generate_create_table_sql
from .gbq_queries import fillrate_analysis
from .gbq_queries import query_gbq

# Public API for `from jacobtools import *`.
# BUGFIX: __all__ entries must be strings naming the exports; listing the
# function objects themselves makes star-imports raise a TypeError.
__all__ = [
    "drop_columns_by_schema",
    "generate_create_table_sql",
    "fillrate_analysis",
    "query_gbq",
]
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# This is so that you can import ppack or import average from ppack
|
|
2
|
+
# instead of from ppack.functions import average
|
|
3
|
+
|
|
4
|
+
# from .decorators import singleton
|
|
5
|
+
# from .database import infodb, listtb, uploadtb, downloadtb, deletetb
|
|
6
|
+
# from .it import combo
|
|
7
|
+
# from .pa import fheader
|
|
8
|
+
|
|
9
|
+
from .fillrate_analysis import fillrate_analysis
|
|
10
|
+
from .query_gbq import query_gbq
|
|
11
|
+
|
|
12
|
+
# Public API for `from jacobtools.gbq_queries import *`.
# BUGFIX: __all__ entries must be strings naming the exports; listing the
# function objects themselves makes star-imports raise a TypeError.
__all__ = [
    "fillrate_analysis",
    "query_gbq",
]
|
|
Binary file
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import pandas as pd
from .query_gbq import query_gbq
import bigframes.pandas as bpd
from google.cloud import bigquery

"""
AUTHOR: Abisheak Jacob J
LAST MODIFIED: 04-09-2025
TITLE: Fill rate analysis on Google BigQuery tables
DESCRIPTION: Compute the fill rate (share of non-null / non-empty values) for the
columns of selected tables in a BigQuery dataset.
"""
# NOTE: the previous header was copy-pasted from the postgres SQL-generation
# module and described "CREATE TABLE" code generation, which this module
# does not do.

def fillrate_analysis(
    str_gcp_project_id: str,
    str_gbq_project_id: str,
    str_dataset: str,
    bool_check_all_tables: bool = True,
    lst_tables=None,
    bool_save_to_csv: bool = False
) -> pd.DataFrame:

    """
    Calculates the fill rate (non-null value percentage) for columns in specified BigQuery tables.

    Args:
        str_gcp_project_id (str): The GCP project name used (and billed) for querying BigQuery.
        str_gbq_project_id (str): The project ID where the tables are located.
        str_dataset (str): The dataset name containing the tables.
        bool_check_all_tables (bool): If True, analyze all tables; otherwise, use lst_tables.
        lst_tables (list | None): List of table names to include in the analysis.
            Defaults to None, which is treated as an empty list.
        bool_save_to_csv (bool): If True, save the result DataFrame to a CSV file.
    Returns:
        pd.DataFrame: A DataFrame containing table name, column name, non-null count,
        null-like count, total count, and fill rate for each column, sorted by fill rate.
    """

    # BUGFIX: avoid a mutable default argument; normalize None to an empty list.
    if lst_tables is None:
        lst_tables = []

    # Get table and column info for the whole dataset.
    query_columns = f"""
    SELECT table_name, column_name
    FROM `{str_gbq_project_id}.{str_dataset}.INFORMATION_SCHEMA.COLUMNS`
    """

    columns_df = query_gbq(query=query_columns, str_gcp_project_id=str_gcp_project_id)
    tables = columns_df.groupby('table_name')['column_name'].apply(list).to_dict()

    # List of tables to be included in the fill rate calculation.
    if bool_check_all_tables:
        # BUGFIX: was `tables.heys()`, which raised AttributeError at runtime.
        lst_included_tables = list(tables.keys())
    else:
        lst_included_tables = lst_tables

    lst_results = []

    # Step 2: Loop over included tables and calculate fill rate using SQL.
    for table_name, column_list in tables.items():
        if table_name not in lst_included_tables:
            continue

        full_table_id = f"{str_gbq_project_id}.{str_dataset}.{table_name}"
        print(f" Processing table: {full_table_id}")

        for col in column_list:
            try:
                # A value counts as "null-like" when it is NULL, the string
                # 'null' (any case), or blank after trimming.
                query_fillrate = f"""
                SELECT
                    COUNT(1) AS total_count,
                    SUM(CASE WHEN {col} IS NULL OR LOWER(TRIM(CAST({col} AS STRING))) IN ('null', '') THEN 1
                        ELSE 0 END) AS null_like_count,
                    ROUND(1 - SUM(CASE WHEN {col} IS NULL OR LOWER(TRIM(CAST({col} AS STRING))) IN ('null', '') THEN 1
                        ELSE 0 END) / COUNT(1), 4) AS fill_rate
                FROM `{full_table_id}`
                """

                df_stats = query_gbq(query=query_fillrate, str_gcp_project_id=str_gcp_project_id)
                stats = df_stats.iloc[0]

                lst_results.append({
                    'table_name': table_name,
                    'column_name': col,
                    'non_null_count': stats['total_count'] - stats['null_like_count'],
                    'null_like_count': stats['null_like_count'],
                    'total_count': stats['total_count'],
                    'fill_rate': stats['fill_rate']
                })

            except Exception as e:
                # Best-effort: report the failing column and keep going.
                print(f" Failed for column '{col}' in table '{table_name}': {e}")

    # Format output, worst-filled columns first.
    df_result = pd.DataFrame(lst_results)
    df_result.sort_values(by='fill_rate', inplace=True)
    df_result.columns = ['Table Name', 'Column Name', 'Non Null Count', 'Null Like Count', 'Total Count', 'Fill Rate']

    # Save to CSV in the current working directory if requested.
    if bool_save_to_csv:
        df_result.to_csv("fill_rate_summary_{}_{}.csv".format(str_gbq_project_id, str_dataset), index=False)

    return df_result
|
|
99
|
+
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import pandas_gbq
|
|
3
|
+
import bigframes.pandas as bpd
|
|
4
|
+
from google.cloud import bigquery
|
|
5
|
+
|
|
6
|
+
"""
|
|
7
|
+
AUTHOR: Abisheak Jacob J
|
|
8
|
+
LAST MODIFIED: 04-09-2025
|
|
9
|
+
TITLE: Run SQL Query in GBQ
|
|
10
|
+
DESCRIPTION: Function to connect to GBQ and run SQL Queries
|
|
11
|
+
"""
|
|
12
|
+
def query_gbq(
    query: str,
    str_gcp_project_id: str
) -> pd.DataFrame:

    """
    Connect to Google BigQuery and run a SQL query.

    Args:
        query (str): the query that needs to be run in GBQ
        str_gcp_project_id (str): GCP project id under which the query runs;
            BigQuery bills the query to this project

    Returns:
        pd.DataFrame: the output of the query as a DataFrame
    """

    # Execute the query in the given project; the BigQuery Storage API speeds
    # up downloads of large result sets.
    return pandas_gbq.read_gbq(
        query,
        project_id=str_gcp_project_id,
        dialect="standard",
        use_bqstorage_api=True,
    )
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# This is so that you can import ppack or import average from ppack
|
|
2
|
+
# instead of from ppack.functions import average
|
|
3
|
+
|
|
4
|
+
# from .decorators import singleton
|
|
5
|
+
# from .database import infodb, listtb, uploadtb, downloadtb, deletetb
|
|
6
|
+
# from .it import combo
|
|
7
|
+
# from .pa import fheader
|
|
8
|
+
|
|
9
|
+
from .drop_columns_by_schema import drop_columns_by_schema
|
|
10
|
+
|
|
11
|
+
# Public API for `from jacobtools.preprocessing import *`.
# BUGFIX: __all__ entries must be strings naming the exports; listing the
# function object itself makes star-imports raise a TypeError.
__all__ = [
    "drop_columns_by_schema",
]
|
|
Binary file
|
jacobtools-0.0.1/src/jacobtools/preprocessing/__pycache__/drop_columns_by_schema.cpython-312.pyc
ADDED
|
Binary file
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# import libraries
|
|
2
|
+
from jacobtools.config import SchemaStructure
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
AUTHOR: Abisheak Jacob J
|
|
7
|
+
LAST MODIFIED: 01-08-2025
|
|
8
|
+
TITLE: Drop Columns by Schema Document
|
|
9
|
+
DESCRIPTION: Drop columns that are not significant in providing any insight into the data using the DropFlag in the schema document
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
def drop_columns_by_schema(
    df: pd.DataFrame,
    schema_df: pd.DataFrame,
    drop_flag_column: int = SchemaStructure.DROP_FLAG
) -> pd.DataFrame:

    # DOCSTRING
    """
    Drop columns that are not significant in providing any insight into the data using the DropFlag in the schema document.

    PARAMETERS
    -------------------
    df: pd.DataFrame
        Input DataFrame whose flagged columns are dropped (modified in place)
    schema_df: pd.DataFrame
        Schema DataFrame; row i is assumed to describe column i of df, and the
        DropFlag column holds a truthy value for every column to drop
    drop_flag_column: int
        Positional index of the DropFlag column inside schema_df

    RETURNS
    -------------------
    pd.DataFrame
        The same DataFrame object after the flagged columns are dropped

    RAISES
    --------------------
    ValueError
        If the schema document has fewer rows than df has columns
    """

    flags = schema_df.iloc[:, drop_flag_column]

    # BUGFIX: zip() used to silently truncate when the schema document had
    # fewer rows than df has columns, so trailing columns escaped the check
    # even though the docstring promised a ValueError. Fail loudly instead.
    if len(flags) < len(df.columns):
        raise ValueError(
            f"Schema document has {len(flags)} rows but the DataFrame has "
            f"{len(df.columns)} columns; every column needs a DropFlag entry."
        )

    # Drop the columns whose DropFlag is truthy (in place, as before).
    df.drop(
        columns=[col for col, flag in zip(df.columns.to_list(), flags) if flag],
        inplace=True
    )

    return df
|
|
File without changes
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# import libraries
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
AUTHOR: Abisheak Jacob J
|
|
7
|
+
LAST MODIFIED: 04-09-2025
|
|
8
|
+
TITLE: Generate sql code to create a table in postgres
|
|
9
|
+
DESCRIPTION: Generate SQL code to automate the process of writing "CREATE TABLE" sql code.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
def infer_pg_type(
    series: pd.Series
) -> str:

    # DOCSTRING
    """
    Infer an efficient PostgreSQL data type from a pandas Series.

    PARAMETERS
    -------------------
    series: pd.Series
        A column of the DataFrame for which SQL code is being generated.

    RETURNS
    -------------------
    str
        The PostgreSQL type name for the column.
    """
    # Pick the smallest integer type that can hold the observed value range.
    if pd.api.types.is_integer_dtype(series):
        min_val, max_val = series.min(), series.max()
        if min_val >= -32768 and max_val <= 32767:
            return "SMALLINT"  # 2 bytes
        elif min_val >= -2147483648 and max_val <= 2147483647:
            return "INTEGER"  # 4 bytes
        else:
            return "BIGINT"  # 8 bytes
    elif pd.api.types.is_float_dtype(series):
        return "DOUBLE PRECISION"  # PostgreSQL's default 8-byte float
    elif pd.api.types.is_bool_dtype(series):
        return "BOOLEAN"
    elif pd.api.types.is_datetime64_any_dtype(series):
        return "TIMESTAMP"
    else:
        # Fall back to text; size VARCHAR from the longest observed value
        # plus a little headroom, switching to TEXT for long values.
        max_len = series.astype(str).map(len).max()
        if max_len <= 255:
            return f"VARCHAR({max_len + 10})"
        else:
            return "TEXT"

def generate_create_table_sql(
    csv_file: str,
    table_name: str = "my_table"
) -> str:

    # DOCSTRING
    """
    Generate CREATE TABLE sql for postgres.

    PARAMETERS
    -------------------
    csv_file: str
        Path to csv file for which table schema is to be created
    table_name: str
        Name of the table in postgres database

    RETURNS
    -------------------
    str
        The generated SQL code (also echoed to stdout)

    RAISES
    --------------------
    ValueError
        If the column is not in the DataFrame or any other errors
    """
    # Sample the first 10,000 rows for type inference (keeps large files fast).
    # BUGFIX: the old comment claimed 100000 rows; the code reads 10000.
    df = pd.read_csv(csv_file, nrows=10000)

    # Infer an efficient data type for each column, ignoring missing values.
    column_types = {col: infer_pg_type(df[col].dropna()) for col in df.columns}

    # Assemble the SQL statement line by line.
    sql_lines = [f'DROP TABLE IF EXISTS "{table_name}";\n\nCREATE TABLE "{table_name}" (']

    for col, dtype in column_types.items():
        sql_lines.append(f'    "{col}" {dtype},')

    sql_lines[-1] = sql_lines[-1].rstrip(",")  # remove last comma

    sql_lines.append(");")

    sql = "\n".join(sql_lines)

    # Keep the original behaviour of echoing the SQL to stdout...
    print(sql)
    # ...but BUGFIX: actually return the string as the `-> str` annotation
    # promises; the original did `return print(...)`, which returns None.
    return sql
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import matplotlib.pyplot as plt
|
|
2
|
+
import seaborn as sns
|
|
3
|
+
|
|
4
|
+
def set_global_style():
    """Apply the shared seaborn/matplotlib look used by all chart helpers."""
    # Base theme: light background with a subtle grid
    # (alternatives: 'white', 'darkgrid', ...).
    sns.set_theme(style='whitegrid')

    rc_overrides = {
        'figure.figsize': (10, 6),
        'axes.titlesize': 16,
        'axes.labelsize': 14,
        'axes.edgecolor': '#333333',
        'axes.linewidth': 1.2,
        'axes.grid': True,
        'grid.color': '#dddddd',
        'grid.linestyle': '--',
        'grid.linewidth': 0.5,
        'xtick.labelsize': 12,
        'ytick.labelsize': 12,
        'legend.fontsize': 12,
        'legend.frameon': True,
        'legend.framealpha': 0.9,
        'legend.edgecolor': '#444444',
        'font.family': 'sans-serif',
        'font.sans-serif': 'DejaVu Sans',
    }
    plt.rcParams.update(rc_overrides)

    # Default qualitative palette; swap in sns.color_palette([...]) to customize.
    sns.set_palette("Set2")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def plot_line(data, x, y, hue=None, title='', xlabel='', ylabel=''):
    """Draw a styled line chart; axis labels default to the column names."""
    set_global_style()
    plt.figure()
    axes = sns.lineplot(data=data, x=x, y=y, hue=hue)
    axes.set_title(title)
    axes.set_xlabel(xlabel or x)
    axes.set_ylabel(ylabel or y)
    plt.tight_layout()
    plt.show()


def plot_bar(data, x, y, hue=None, title='', xlabel='', ylabel='', orient='v'):
    """Draw a styled bar chart; axis labels default to the column names."""
    set_global_style()
    plt.figure()
    axes = sns.barplot(data=data, x=x, y=y, hue=hue, orient=orient)
    axes.set_title(title)
    axes.set_xlabel(xlabel or x)
    axes.set_ylabel(ylabel or y)
    plt.tight_layout()
    plt.show()


def plot_heatmap(data, title='', xlabel='', ylabel='', annot=True, fmt=".2f", cmap='coolwarm'):
    """Draw a styled annotated heatmap with a colour bar."""
    set_global_style()
    plt.figure()
    axes = sns.heatmap(data, annot=annot, fmt=fmt, cmap=cmap, cbar=True)
    axes.set_title(title)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    plt.tight_layout()
    plt.show()


def save_plot(filename, dpi=300):
    """Save the current figure to *filename*, tightly cropped."""
    plt.savefig(filename, dpi=dpi, bbox_inches='tight', transparent=False)
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
# %%
|
|
2
|
+
# import the libraries
|
|
3
|
+
from sqlalchemy import create_engine, inspect, MetaData, Table, text
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# %%
|
|
8
|
+
# defining the parameters for the database connection as global variables
|
|
9
|
+
def infodb(host, user, passw, dbname, show_url=False):
    """
    Store the database connection parameters as module globals and build the
    MySQL database url for the connection.

    :param host: database host name
    :param user: database user name
    :param passw: database password
    :param dbname: database name
    :param show_url: if True, print the resulting connection url
    :return: None
    """

    # define the global variables used by the other helpers in this module
    global hostname
    global username
    global password
    global databasename
    global databaseurl

    # assign the values of the database credentials to the global variables;
    # str() keeps non-string inputs (e.g. numbers) working
    hostname = str(host)
    username = str(user)
    password = str(passw)
    databasename = str(dbname)

    # BUGFIX: the url was built from the raw `user`/`host` arguments instead
    # of the str()-converted globals, which raised TypeError for non-string
    # inputs; use the converted values consistently.
    databaseurl = "mysql://" + username + ":" + password + "@" + hostname + "/" + databasename

    # if show_url is True then print the database url
    if show_url:
        print(f"The database url is: {databaseurl}")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# %%
|
|
42
|
+
# defining the list all tables in the database function
|
|
43
|
+
def listtb():
    """
    List all the tables in the database.

    :return: a one-column DataFrame of table names, or the error message as a
        string if the connection fails
    """
    try:
        # Connect to the database and introspect its schema.
        engine = create_engine(databaseurl)
        inspector = inspect(engine)

        # Fetch and return the table names.
        names = inspector.get_table_names()
        return pd.DataFrame(names)

    except Exception as e:
        # Module convention: report failures as the error string.
        return str(e)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# %%
|
|
68
|
+
# upload a dataframe to the database
|
|
69
|
+
def uploadtb(df, tbname):
    """
    Replace the contents of a database table with the rows of a dataframe.

    All existing rows are deleted first, then the dataframe is appended,
    so the table ends up holding exactly the dataframe's data.

    :param df: pandas DataFrame to upload
    :param tbname: name of the target table
    :return: success message, or the error text on failure
    """

    # convert the table name to string
    tbname = str(tbname)

    try:
        # SECURITY: the table name is interpolated into raw SQL below and
        # cannot be bound as a query parameter, so reject anything that is
        # not a plain identifier to prevent SQL injection
        if not tbname.isidentifier():
            raise ValueError(f"invalid table name: {tbname!r}")

        # Create a SQLAlchemy engine to connect to the database
        engine = create_engine(databaseurl)

        # clear the existing rows before appending the new data
        with engine.connect() as conn:
            conn.execute(text(f"DELETE FROM {tbname}"))
            conn.commit()

        # upload the dataframe to the database
        df.to_sql(tbname, engine, if_exists="append", index=False)

        # return a success message
        return "Dataframe uploaded successfully"

    except Exception as e:
        # on any failure, hand the error text back (matches the siblings)
        return str(e)
# %%
|
|
102
|
+
# download a table from the database
|
|
103
|
+
def downloadtb(tbname):
    """
    Fetch an entire table from the database as a dataframe.

    :param tbname: name of the table to download
    :return: pandas DataFrame on success; None after printing the error
    """

    # normalise the table name
    tbname = str(tbname)

    try:
        # connect and read the whole table into a dataframe
        engine = create_engine(databaseurl)
        return pd.read_sql(tbname, engine)
    except Exception as e:
        # NOTE: unlike the sibling helpers, this one prints the error
        # and implicitly returns None
        print(str(e))
# %%
|
|
130
|
+
# delete a table from the database
|
|
131
|
+
def deletetb(tbname):
    """
    Drop a table from the database.

    :param tbname: name of the table to delete
    :return: success message, or the error text on failure
    """

    # normalise the table name
    tbname = str(tbname)

    try:
        # connect to the database
        engine = create_engine(databaseurl)

        # reflect the table definition from the live database
        target = Table(tbname, MetaData(), autoload_with=engine)

        # drop it (checkfirst avoids an error if it vanished meanwhile)
        target.drop(engine, checkfirst=True)

        return "Table deleted successfully"

    except Exception as e:
        # hand the error text back to the caller
        return str(e)
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# importing the packages
|
|
2
|
+
|
|
3
|
+
from itertools import combinations
|
|
4
|
+
from collections import Counter
|
|
5
|
+
import pandas as pd
|
|
6
|
+
# creating a function to calculate the count of all the possible combinations
|
|
7
|
+
|
|
8
|
+
def combo(df_data, column_name, groupby_column, combo_no):
    """
    Count how often each combination of items occurs together.

    The dataframe is pivoted on ``groupby_column``; within each group all
    ``combo_no``-sized combinations of the values in ``column_name`` are
    generated, and identical combinations (order-insensitive) are counted
    across every group.

    :param df_data: source pandas DataFrame
    :param column_name: column whose values are combined along the pivot
    :param groupby_column: column used as the pivot for the whole operation
    :param combo_no: size of each combination
    :return: list of ((item, ...), count) tuples, most frequent first
    """

    # collect the items of each pivot group into a list
    grouped = df_data.groupby(groupby_column)[column_name].agg(list).reset_index()

    # enumerate every combo_no-sized combination within each group
    grouped["all_combinations"] = grouped[column_name].apply(
        lambda items: list(combinations(items, combo_no))
    )

    # tally identical combinations, ignoring the order inside each tuple
    counts = Counter(
        tuple(sorted(c))
        for group_combos in grouped["all_combinations"]
        for c in group_combos
    )

    # most frequent combinations first
    return counts.most_common()
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# importing the packages
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
# defining a function to normalise a dataframe's column headers to lowercase
|
|
6
|
+
def fheader(df_name):
    """
    Normalise the column headers of a dataframe in place.

    Each header is converted to a string, lowercased, stripped of leading
    and trailing whitespace, and any remaining spaces are replaced with
    underscores.

    :param df_name: pandas DataFrame whose columns are renamed (mutated)
    :return: the same DataFrame, for convenient chaining
    """

    # str(title) generalises the original, which crashed with
    # AttributeError on non-string headers (e.g. integer column labels)
    df_name.columns = [
        str(title).lower().strip().replace(" ", "_") for title in df_name.columns
    ]

    return df_name
@@ -0,0 +1,29 @@
|
|
|
1
|
+
reason_no,reason_details
|
|
2
|
+
1,Certain infectious and parasitic diseases
|
|
3
|
+
2,Neoplasms
|
|
4
|
+
3,Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism
|
|
5
|
+
4,"Endocrine, nutritional and metabolic diseases"
|
|
6
|
+
5,Mental and behavioural disorders
|
|
7
|
+
6,Diseases of the nervous system
|
|
8
|
+
7,Diseases of the eye and adnexa
|
|
9
|
+
8,Diseases of the ear and mastoid process
|
|
10
|
+
9,Diseases of the circulatory system
|
|
11
|
+
10,Diseases of the respiratory system
|
|
12
|
+
11,Diseases of the digestive system
|
|
13
|
+
12,Diseases of the skin and subcutaneous tissue
|
|
14
|
+
13,Diseases of the musculoskeletal system and connective tissue
|
|
15
|
+
14,Diseases of the genitourinary system
|
|
16
|
+
15,"Pregnancy, childbirth and the puerperium"
|
|
17
|
+
16,Certain conditions originating in the perinatal period
|
|
18
|
+
17,"Congenital malformations, deformations and chromosomal abnormalities"
|
|
19
|
+
18,"Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified"
|
|
20
|
+
19,"Injury, poisoning and certain other consequences of external causes"
|
|
21
|
+
20,External causes of morbidity and mortality
|
|
22
|
+
21,Factors influencing health status and contact with health services
|
|
23
|
+
22,Patient follow-up
|
|
24
|
+
23,Medical consultation
|
|
25
|
+
24,Blood donation
|
|
26
|
+
25,Laboratory examination
|
|
27
|
+
26,Unjustified absence
|
|
28
|
+
27,Physiotherapy
|
|
29
|
+
28,Dental Consultation
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from jacobtools_test.harmonic_mean import harmonic_mean
|
|
3
|
+
from termcolor import cprint
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def main():
    """
    Command-line entry point: print the harmonic mean of the numeric
    command-line arguments, bold red on a cyan background.

    If any argument is not numeric, or the harmonic mean cannot be
    computed (empty list / zero value), 0.0 is printed instead.
    """
    result = 0.0

    # parse every CLI argument as a float; fall back to an empty list
    # when any argument is not numeric
    try:
        values = [float(arg) for arg in sys.argv[1:]]
    except ValueError:
        values = []

    # harmonic_mean raises ZeroDivisionError for an empty list (and for
    # zero values); keep the default 0.0 in that case
    try:
        result = harmonic_mean(values)
    except ZeroDivisionError:
        pass

    cprint(result, "red", "on_cyan", attrs=["bold"])
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: jacobtools_test
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: This package contains functions that help me tackle my day-to-day data analytics requirements
|
|
5
|
+
Home-page: 'https://github.com/AbisheakJacob/jacobtools'
|
|
6
|
+
Author: Abisheak Jacob J
|
|
7
|
+
Author-email: "AbisheakJacob0032" abisheakjacob0032@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Requires-Python: >=3.9
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: numpy>=2.1.0
|
|
14
|
+
Requires-Dist: termcolor>=1.1.0
|
|
15
|
+
Dynamic: license-file
|
|
16
|
+
|
|
17
|
+
README.md
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
MANIFEST.in
|
|
3
|
+
README.md
|
|
4
|
+
pyproject.toml
|
|
5
|
+
setup.cfg
|
|
6
|
+
src/jacobtools/__init__.py
|
|
7
|
+
src/jacobtools/config.py
|
|
8
|
+
src/jacobtools/__pycache__/__init__.cpython-312.pyc
|
|
9
|
+
src/jacobtools/__pycache__/config.cpython-312.pyc
|
|
10
|
+
src/jacobtools/gbq_queries/__init__.py
|
|
11
|
+
src/jacobtools/gbq_queries/fillrate_analysis.py
|
|
12
|
+
src/jacobtools/gbq_queries/query_gbq.py
|
|
13
|
+
src/jacobtools/gbq_queries/__pycache__/__init__.cpython-312.pyc
|
|
14
|
+
src/jacobtools/gbq_queries/__pycache__/fillrate_analysis.cpython-312.pyc
|
|
15
|
+
src/jacobtools/gbq_queries/__pycache__/query_gbq.cpython-312.pyc
|
|
16
|
+
src/jacobtools/preprocessing/__init__.py
|
|
17
|
+
src/jacobtools/preprocessing/drop_columns_by_schema.py
|
|
18
|
+
src/jacobtools/preprocessing/enforce_schema.py
|
|
19
|
+
src/jacobtools/preprocessing/__pycache__/__init__.cpython-312.pyc
|
|
20
|
+
src/jacobtools/preprocessing/__pycache__/drop_columns_by_schema.cpython-312.pyc
|
|
21
|
+
src/jacobtools/sql_generation/__init__.py
|
|
22
|
+
src/jacobtools/sql_generation/__pycache__/__init__.cpython-312.pyc
|
|
23
|
+
src/jacobtools/sql_generation/postgres/__init__.py
|
|
24
|
+
src/jacobtools/sql_generation/postgres/generate_create_table_sql.py
|
|
25
|
+
src/jacobtools/sql_generation/postgres/__pycache__/__init__.cpython-312.pyc
|
|
26
|
+
src/jacobtools/sql_generation/postgres/__pycache__/generate_create_table_sql.cpython-312.pyc
|
|
27
|
+
src/jacobtools/temp/chart.py
|
|
28
|
+
src/jacobtools/temp/database.py
|
|
29
|
+
src/jacobtools/temp/decorators.py
|
|
30
|
+
src/jacobtools/temp/it.py
|
|
31
|
+
src/jacobtools/temp/pa.py
|
|
32
|
+
src/jacobtools/temp/reason_for_absence.csv
|
|
33
|
+
src/jacobtools_test/__init__.py
|
|
34
|
+
src/jacobtools_test/harmonic_mean.py
|
|
35
|
+
src/jacobtools_test/harmony.py
|
|
36
|
+
src/jacobtools_test.egg-info/PKG-INFO
|
|
37
|
+
src/jacobtools_test.egg-info/SOURCES.txt
|
|
38
|
+
src/jacobtools_test.egg-info/dependency_links.txt
|
|
39
|
+
src/jacobtools_test.egg-info/entry_points.txt
|
|
40
|
+
src/jacobtools_test.egg-info/requires.txt
|
|
41
|
+
src/jacobtools_test.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|