penwings 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- penwings-0.1.0/.gitignore +10 -0
- penwings-0.1.0/LICENSE +9 -0
- penwings-0.1.0/PKG-INFO +177 -0
- penwings-0.1.0/README.md +151 -0
- penwings-0.1.0/pyproject.toml +59 -0
- penwings-0.1.0/setup.cfg +4 -0
- penwings-0.1.0/src/penwings/__init__.py +9 -0
- penwings-0.1.0/src/penwings/_utils/__init__.py +0 -0
- penwings-0.1.0/src/penwings/_utils/_decorators.py +38 -0
- penwings-0.1.0/src/penwings/_utils/_typing.py +7 -0
- penwings-0.1.0/src/penwings/io/__init__.py +0 -0
- penwings-0.1.0/src/penwings/io/cache.py +93 -0
- penwings-0.1.0/src/penwings/paths.py +16 -0
- penwings-0.1.0/src/penwings.egg-info/PKG-INFO +177 -0
- penwings-0.1.0/src/penwings.egg-info/SOURCES.txt +17 -0
- penwings-0.1.0/src/penwings.egg-info/dependency_links.txt +1 -0
- penwings-0.1.0/src/penwings.egg-info/requires.txt +16 -0
- penwings-0.1.0/src/penwings.egg-info/top_level.txt +1 -0
- penwings-0.1.0/uv.lock +766 -0
penwings-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright 2026 Raf Blanckaert
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
6
|
+
|
|
7
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
8
|
+
|
|
9
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
penwings-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: penwings
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Lightweight library to handle data and reproduce workflows
|
|
5
|
+
Author-email: Raf Blanckaert <R.Blanckaert@outlook.com>
|
|
6
|
+
License: LICENSE
|
|
7
|
+
Project-URL: Homepage, https://github.com/Frissie/penwings
|
|
8
|
+
Project-URL: Repository, https://github.com/Frissie/penwings
|
|
9
|
+
Project-URL: Issues, https://github.com/Frissie/penwings/issues
|
|
10
|
+
Requires-Python: >=3.11
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: sqlalchemy<3.0.0,>=2.0.46
|
|
14
|
+
Requires-Dist: pyodbc<6.0.0,>=5.3.0
|
|
15
|
+
Requires-Dist: pandas<4.0.0,>=3.0.0
|
|
16
|
+
Requires-Dist: numpy<3.0.0,>=2.4.1
|
|
17
|
+
Provides-Extra: scipy
|
|
18
|
+
Requires-Dist: scipy<2.0.0,>=1.17.0; extra == "scipy"
|
|
19
|
+
Provides-Extra: sklearn
|
|
20
|
+
Requires-Dist: scikit-learn<2.0.0,>=1.8.0; extra == "sklearn"
|
|
21
|
+
Provides-Extra: optuna
|
|
22
|
+
Requires-Dist: optuna<5.0.0,>=4.7.0; extra == "optuna"
|
|
23
|
+
Provides-Extra: all
|
|
24
|
+
Requires-Dist: openpyxl<4.0.0,>=3.1.5; extra == "all"
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# Penwings
|
|
28
|
+
|
|
29
|
+
**Penwings** is a lightweight Python library designed to simplify SQL data workflows by automatically importing data from SQL and caching it as Parquet files. This ensures faster subsequent access and reproducible pipelines, while reducing database load.
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## Table of Contents
|
|
34
|
+
|
|
35
|
+
1. [Features](#features)
|
|
36
|
+
2. [Installation](#installation)
|
|
37
|
+
3. [Getting Started](#getting-started)
|
|
38
|
+
4. [Usage](#usage)
|
|
39
|
+
5. [Versioning](#versioning)
|
|
40
|
+
6. [Contributing](#contributing)
|
|
41
|
+
7. [License](#license)
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Features
|
|
46
|
+
|
|
47
|
+
- Get data from SQL queries or SQL files
|
|
48
|
+
- Automatically save query results as Parquet files
|
|
49
|
+
- Reuse Parquet files to avoid redundant queries
|
|
50
|
+
- Simple, stable API for reproducible workflows
|
|
51
|
+
- Optimized for performance and ease of integration
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Installation
|
|
56
|
+
|
|
57
|
+
Install via pip:
|
|
58
|
+
|
|
59
|
+
pip install penwings
|
|
60
|
+
|
|
61
|
+
> Make sure you have Python 3.11+ installed.
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## Getting Started
|
|
66
|
+
|
|
67
|
+
### Importing the Library
|
|
68
|
+
|
|
69
|
+
from penwings import SQLParquetCache
|
|
70
|
+
|
|
71
|
+
### Initialize the Cache
|
|
72
|
+
|
|
73
|
+
You can initialize the cache by providing either a SQL directory or a query string, along with a Parquet directory:
|
|
74
|
+
|
|
75
|
+
```
|
|
76
|
+
from sqlalchemy import create_engine
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
# SQL connection
|
|
80
|
+
```
|
|
81
|
+
engine = create_engine("postgresql://user:password@localhost/dbname")
|
|
82
|
+
|
|
83
|
+
# Initialize the cache
|
|
84
|
+
loader = SQLParquetCache(
|
|
85
|
+
sql_dir="sql_files", # Optional if using query string
|
|
86
|
+
parquet_dir="parquet_cache",
|
|
87
|
+
conn=engine
|
|
88
|
+
)
|
|
89
|
+
```
|
|
90
|
+
---
|
|
91
|
+
|
|
92
|
+
## Usage
|
|
93
|
+
|
|
94
|
+
### 1. Using SQL Files
|
|
95
|
+
|
|
96
|
+
If you have SQL files stored in a directory:
|
|
97
|
+
|
|
98
|
+
```
|
|
99
|
+
# Run a SQL file and cache the result
|
|
100
|
+
df = loader.get("monthly_sales.sql")
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
- `penwings` will automatically check if a Parquet version exists.
|
|
104
|
+
- If it exists, the cached Parquet is loaded.
|
|
105
|
+
- If not, the SQL query runs and the result is saved as a Parquet file.
|
|
106
|
+
|
|
107
|
+
### 2. Using SQL Query Strings
|
|
108
|
+
|
|
109
|
+
You can also pass queries directly:
|
|
110
|
+
|
|
111
|
+
```
|
|
112
|
+
query = "SELECT * FROM sales WHERE month='2026-02'"
|
|
113
|
+
df = loader.get(sql=query, parquet_name="sales_feb2026")
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
- `parquet_name` determines the Parquet file name.
|
|
117
|
+
- Works similarly to SQL file mode for caching.
|
|
118
|
+
|
|
119
|
+
### 3. Automatic Parquet Management
|
|
120
|
+
|
|
121
|
+
- All results are cached in the specified `parquet_dir`.
|
|
122
|
+
- This reduces repeated database queries and ensures reproducibility.
|
|
123
|
+
- Cached files can be reloaded for faster access.
|
|
124
|
+
|
|
125
|
+
---
|
|
126
|
+
|
|
127
|
+
## Versioning
|
|
128
|
+
|
|
129
|
+
Penwings follows **semantic versioning**:
|
|
130
|
+
|
|
131
|
+
- **MAJOR**: Breaking changes to API
|
|
132
|
+
- **MINOR**: New features, backward-compatible
|
|
133
|
+
- **PATCH**: Bug fixes
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## Contributing
|
|
138
|
+
|
|
139
|
+
We welcome contributions!
|
|
140
|
+
|
|
141
|
+
1. Fork the repository
|
|
142
|
+
2. Create a feature branch (`git checkout -b feature/my-feature`)
|
|
143
|
+
3. Commit your changes (`git commit -m 'Add new feature'`)
|
|
144
|
+
4. Push to branch (`git push origin feature/my-feature`)
|
|
145
|
+
5. Open a pull request
|
|
146
|
+
|
|
147
|
+
Please ensure your code follows PEP8 standards.
|
|
148
|
+
|
|
149
|
+
---
|
|
150
|
+
|
|
151
|
+
## License
|
|
152
|
+
|
|
153
|
+
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
|
|
154
|
+
|
|
155
|
+
---
|
|
156
|
+
|
|
157
|
+
## Example Workflow
|
|
158
|
+
|
|
159
|
+
```
|
|
160
|
+
from sqlalchemy import create_engine
|
|
161
|
+
from penwings import SQLParquetCache
|
|
162
|
+
|
|
163
|
+
engine = create_engine("sqlite:///example.db")
|
|
164
|
+
|
|
165
|
+
loader = SQLParquetCache(
|
|
166
|
+
sql_dir="sql_queries",
|
|
167
|
+
parquet_dir="parquet_cache",
|
|
168
|
+
conn=engine
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
# get data
|
|
172
|
+
df_jan = loader.get("sales_january.sql")
|
|
173
|
+
df_feb = loader.get(sql="SELECT * FROM sales WHERE month='2026-02'", parquet_name="sales_feb")
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
- SQL files are automatically cached as Parquet
|
|
177
|
+
- Subsequent loads are fast and do not hit the database
|
penwings-0.1.0/README.md
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
# Penwings
|
|
2
|
+
|
|
3
|
+
**Penwings** is a lightweight Python library designed to simplify SQL data workflows by automatically importing data from SQL and caching it as Parquet files. This ensures faster subsequent access and reproducible pipelines, while reducing database load.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Table of Contents
|
|
8
|
+
|
|
9
|
+
1. [Features](#features)
|
|
10
|
+
2. [Installation](#installation)
|
|
11
|
+
3. [Getting Started](#getting-started)
|
|
12
|
+
4. [Usage](#usage)
|
|
13
|
+
5. [Versioning](#versioning)
|
|
14
|
+
6. [Contributing](#contributing)
|
|
15
|
+
7. [License](#license)
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## Features
|
|
20
|
+
|
|
21
|
+
- Get data from SQL queries or SQL files
|
|
22
|
+
- Automatically save query results as Parquet files
|
|
23
|
+
- Reuse Parquet files to avoid redundant queries
|
|
24
|
+
- Simple, stable API for reproducible workflows
|
|
25
|
+
- Optimized for performance and ease of integration
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
Install via pip:
|
|
32
|
+
|
|
33
|
+
pip install penwings
|
|
34
|
+
|
|
35
|
+
> Make sure you have Python 3.11+ installed.
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## Getting Started
|
|
40
|
+
|
|
41
|
+
### Importing the Library
|
|
42
|
+
|
|
43
|
+
from penwings import SQLParquetCache
|
|
44
|
+
|
|
45
|
+
### Initialize the Cache
|
|
46
|
+
|
|
47
|
+
You can initialize the cache by providing either a SQL directory or a query string, along with a Parquet directory:
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
from sqlalchemy import create_engine
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
# SQL connection
|
|
54
|
+
```
|
|
55
|
+
engine = create_engine("postgresql://user:password@localhost/dbname")
|
|
56
|
+
|
|
57
|
+
# Initialize the cache
|
|
58
|
+
loader = SQLParquetCache(
|
|
59
|
+
sql_dir="sql_files", # Optional if using query string
|
|
60
|
+
parquet_dir="parquet_cache",
|
|
61
|
+
conn=engine
|
|
62
|
+
)
|
|
63
|
+
```
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## Usage
|
|
67
|
+
|
|
68
|
+
### 1. Using SQL Files
|
|
69
|
+
|
|
70
|
+
If you have SQL files stored in a directory:
|
|
71
|
+
|
|
72
|
+
```
|
|
73
|
+
# Run a SQL file and cache the result
|
|
74
|
+
df = loader.get("monthly_sales.sql")
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
- `penwings` will automatically check if a Parquet version exists.
|
|
78
|
+
- If it exists, the cached Parquet is loaded.
|
|
79
|
+
- If not, the SQL query runs and the result is saved as a Parquet file.
|
|
80
|
+
|
|
81
|
+
### 2. Using SQL Query Strings
|
|
82
|
+
|
|
83
|
+
You can also pass queries directly:
|
|
84
|
+
|
|
85
|
+
```
|
|
86
|
+
query = "SELECT * FROM sales WHERE month='2026-02'"
|
|
87
|
+
df = loader.get(sql=query, parquet_name="sales_feb2026")
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
- `parquet_name` determines the Parquet file name.
|
|
91
|
+
- Works similarly to SQL file mode for caching.
|
|
92
|
+
|
|
93
|
+
### 3. Automatic Parquet Management
|
|
94
|
+
|
|
95
|
+
- All results are cached in the specified `parquet_dir`.
|
|
96
|
+
- This reduces repeated database queries and ensures reproducibility.
|
|
97
|
+
- Cached files can be reloaded for faster access.
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
## Versioning
|
|
102
|
+
|
|
103
|
+
Penwings follows **semantic versioning**:
|
|
104
|
+
|
|
105
|
+
- **MAJOR**: Breaking changes to API
|
|
106
|
+
- **MINOR**: New features, backward-compatible
|
|
107
|
+
- **PATCH**: Bug fixes
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## Contributing
|
|
112
|
+
|
|
113
|
+
We welcome contributions!
|
|
114
|
+
|
|
115
|
+
1. Fork the repository
|
|
116
|
+
2. Create a feature branch (`git checkout -b feature/my-feature`)
|
|
117
|
+
3. Commit your changes (`git commit -m 'Add new feature'`)
|
|
118
|
+
4. Push to branch (`git push origin feature/my-feature`)
|
|
119
|
+
5. Open a pull request
|
|
120
|
+
|
|
121
|
+
Please ensure your code follows PEP8 standards.
|
|
122
|
+
|
|
123
|
+
---
|
|
124
|
+
|
|
125
|
+
## License
|
|
126
|
+
|
|
127
|
+
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## Example Workflow
|
|
132
|
+
|
|
133
|
+
```
|
|
134
|
+
from sqlalchemy import create_engine
|
|
135
|
+
from penwings import SQLParquetCache
|
|
136
|
+
|
|
137
|
+
engine = create_engine("sqlite:///example.db")
|
|
138
|
+
|
|
139
|
+
loader = SQLParquetCache(
|
|
140
|
+
sql_dir="sql_queries",
|
|
141
|
+
parquet_dir="parquet_cache",
|
|
142
|
+
conn=engine
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
# get data
|
|
146
|
+
df_jan = loader.get("sales_january.sql")
|
|
147
|
+
df_feb = loader.get(sql="SELECT * FROM sales WHERE month='2026-02'", parquet_name="sales_feb")
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
- SQL files are automatically cached as Parquet
|
|
151
|
+
- Subsequent loads are fast and do not hit the database
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "penwings"
|
|
3
|
+
dynamic = ["version"]
|
|
4
|
+
description = "Lightweight library to handle data and reproduce workflows"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = {text = "LICENSE"}
|
|
7
|
+
authors = [
|
|
8
|
+
{name = "Raf Blanckaert",email = "R.Blanckaert@outlook.com"}
|
|
9
|
+
]
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"sqlalchemy (>=2.0.46,<3.0.0)",
|
|
13
|
+
"pyodbc (>=5.3.0,<6.0.0)",
|
|
14
|
+
"pandas (>=3.0.0,<4.0.0)",
|
|
15
|
+
"numpy (>=2.4.1,<3.0.0)"
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.urls]
|
|
19
|
+
Homepage = "https://github.com/Frissie/penwings"
|
|
20
|
+
Repository = "https://github.com/Frissie/penwings"
|
|
21
|
+
Issues = "https://github.com/Frissie/penwings/issues"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
[tool.setuptools_scm]
|
|
25
|
+
version_scheme = "guess-next-dev"
|
|
26
|
+
local_scheme = "no-local-version"
|
|
27
|
+
tag_regex = "^v(?P<version>.*)$"
|
|
28
|
+
|
|
29
|
+
[tool.setuptools]
|
|
30
|
+
package-dir = {"" = "src"}
|
|
31
|
+
|
|
32
|
+
[tool.setuptools.packages.find]
|
|
33
|
+
where = ["src"]
|
|
34
|
+
include = ["penwings*"]
|
|
35
|
+
exclude = ["penwings._*"]
|
|
36
|
+
|
|
37
|
+
[build-system]
|
|
38
|
+
requires = ["setuptools>=68", "wheel", "setuptools-scm"]
|
|
39
|
+
build-backend = "setuptools.build_meta"
|
|
40
|
+
|
|
41
|
+
[dependency-groups]
|
|
42
|
+
dev = [
|
|
43
|
+
"openpyxl>=3.1.5",
|
|
44
|
+
"optuna>=4.7.0",
|
|
45
|
+
"scikit-learn>=1.8.0",
|
|
46
|
+
"scipy>=1.17.0",
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
[project.optional-dependencies]
|
|
50
|
+
scipy = ["scipy (>=1.17.0,<2.0.0)"]
|
|
51
|
+
sklearn = ["scikit-learn (>=1.8.0,<2.0.0)"]
|
|
52
|
+
optuna= ["optuna (>=4.7.0,<5.0.0)"]
|
|
53
|
+
all = ["openpyxl (>=3.1.5,<4.0.0)"]
|
|
54
|
+
|
|
55
|
+
[[tool.uv.index]]
|
|
56
|
+
name = "testpypi"
|
|
57
|
+
url = "https://test.pypi.org/simple/"
|
|
58
|
+
publish-url = "https://test.pypi.org/legacy/"
|
|
59
|
+
explicit = true
|
penwings-0.1.0/setup.cfg
ADDED
|
File without changes
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import time as t
|
|
2
|
+
|
|
3
|
+
from functools import wraps
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def timing(func):
    """Decorator that reports how long a call to *func* took.

    Prints the wrapped function's name and elapsed wall-clock time
    (via ``time.perf_counter``) after every call, then returns the
    wrapped function's result unchanged.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = t.perf_counter()
        result = func(*args, **kwargs)
        end = t.perf_counter()
        # Include the unit so the output matches timing_sql's message style.
        print(f"{func.__name__} took {end - start: .2f} seconds")
        return result

    return wrapper
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def timing_sql(func):
|
|
20
|
+
@wraps(func)
|
|
21
|
+
def wrapper(*args, **kwargs):
|
|
22
|
+
sql_file = kwargs.get("sql_file", None)
|
|
23
|
+
verbose = getattr(args[0], "verbose", True)
|
|
24
|
+
|
|
25
|
+
if sql_file is None and len(args) > 1:
|
|
26
|
+
sql_file = args[1]
|
|
27
|
+
|
|
28
|
+
sql_file = Path(sql_file)
|
|
29
|
+
|
|
30
|
+
start = t.perf_counter()
|
|
31
|
+
result, source = func(*args, **kwargs)
|
|
32
|
+
end = t.perf_counter()
|
|
33
|
+
|
|
34
|
+
if verbose:
|
|
35
|
+
print(f"{sql_file.stem} -> {source} took {end - start: .2f} seconds to load")
|
|
36
|
+
return result
|
|
37
|
+
|
|
38
|
+
return wrapper
|
|
File without changes
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
from sqlalchemy import Engine
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from datetime import datetime, timedelta
|
|
6
|
+
from typing import Unpack, Union, Optional
|
|
7
|
+
from .._utils._typing import SQLParquetKwargs
|
|
8
|
+
from .._utils._decorators import timing_sql
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SQLParquetCache:
    """Load SQL query results through a Parquet file cache.

    Queries come either from ``.sql`` files under ``sql_dir`` or as raw
    query strings. Results are persisted under ``parquet_dir`` and reused
    on later calls until the cached file is older than ``refresh_days``
    (0 means the cache never expires unless ``force=True``).
    """

    def __init__(
        self,
        parquet_dir: Union[Path, str],
        conn: Engine,
        sql_dir: Optional[Union[Path, str]] = None,
        refresh_days: int = 0,  # zero disables refresh when force == false
        verbose: bool = True,
        **kwargs: Unpack[SQLParquetKwargs],
    ):
        # Bugfix: always define sql_dir (None when not configured) so later
        # access raises a clear ValueError instead of an AttributeError.
        self.sql_dir: Optional[Path] = Path(sql_dir) if sql_dir is not None else None
        self.parquet_dir: Path = Path(parquet_dir)
        self.refresh_days = refresh_days
        self.conn = conn
        # Default read_sql kwargs applied to every query; per-call kwargs win.
        self.global_kwargs = kwargs

        self.verbose = verbose
        self.source = "SQL"

    def set_params(self, **params):
        """Update existing attributes in place; returns self for chaining."""
        for key, value in params.items():
            if not hasattr(self, key):
                raise ValueError(f"Invalid parameter: {key}")
            setattr(self, key, value)
        return self

    def _sql_path(self, sql_file: str) -> Path:
        """Resolve *sql_file* against the configured SQL directory."""
        if self.sql_dir is None:
            raise ValueError("sql_dir was not configured; cannot resolve SQL files")
        return self.sql_dir / sql_file

    def _parquet_path(self, sql_file: str, parquet_name: str | None = None) -> Path:
        """Cache-file path: explicit parquet_name, else the SQL file's stem."""
        name = parquet_name or Path(sql_file).stem
        return self.parquet_dir / f"{name}.parquet"

    def _is_new(self, path: Path, refresh_window: int) -> bool:
        """Return True when *path* exists and is still considered fresh."""
        if not path.exists():
            return False
        # Bugfix: honour the per-call refresh window, not self.refresh_days.
        if refresh_window == 0:
            return True
        last_modified = datetime.fromtimestamp(path.stat().st_mtime)
        return datetime.now() - last_modified < timedelta(days=refresh_window)

    def _read_sql(self, sql_file: str):
        """Read the text of a .sql file from sql_dir."""
        return self._sql_path(sql_file).read_text()

    def _return_sql(self, query: str, conn, **kwargs: Unpack[SQLParquetKwargs]) -> pd.DataFrame:
        """Execute *query* against *conn* and return the result frame."""
        return pd.read_sql(query, conn, **kwargs)

    @timing_sql
    def get(
        self,
        sql: str,
        parquet_name: Union[str, None] = None,
        conn: Engine | None = None,
        refresh_days: int | None = None,
        force: bool = False,
        **kwargs: Unpack[SQLParquetKwargs],
    ) -> tuple[pd.DataFrame, str]:
        """Return (DataFrame, source) for *sql*, serving from cache when fresh.

        *sql* is either the name of a ``.sql`` file in ``sql_dir`` or a raw
        query string (then ``parquet_name`` is required). ``force=True``
        bypasses the cache. NOTE: the ``timing_sql`` decorator unwraps the
        tuple, so callers receive only the DataFrame.
        """
        if isinstance(sql, str) and Path(sql).suffix == ".sql":
            query = self._read_sql(sql)
        elif isinstance(sql, str):
            if parquet_name is None:
                raise ValueError("parquet_name must be provided if query is passed directly")
            query = sql
        else:
            raise ValueError("sql must be a SQL string or a path to a .sql file")

        connection = conn or self.conn
        # "is not None" so an explicit refresh_days=0 override is honoured.
        refresh_window = refresh_days if refresh_days is not None else self.refresh_days
        # Bugfix: key the cache on the SQL file name / parquet_name. The
        # original passed the full query text here, so parquet_name was
        # silently ignored and file-based queries produced bogus cache paths.
        parquet_path = self._parquet_path(sql, parquet_name)
        sql_kwargs = self.global_kwargs | kwargs

        if not force and self._is_new(parquet_path, refresh_window):
            return pd.read_parquet(parquet_path), "Parquet"

        df = self._return_sql(query, connection, **sql_kwargs)
        self.parquet_dir.mkdir(parents=True, exist_ok=True)
        df.to_parquet(parquet_path, index=False)

        return df, "SQL"
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import pathlib
|
|
2
|
+
|
|
3
|
+
# Project directory layout, anchored at the current working directory.
home_dir = pathlib.Path.cwd()
proj_dir = home_dir.parent

# Input tree: raw SQL files and the Parquet cache live under input/.
input_dir = home_dir / "input"
sql_dir = input_dir / "sql"
parquet_dir = input_dir / "parquet"
output_dir = home_dir / "output"

if __name__ == "__main__":
    # When run as a script, list every Path defined above, numbered from 1.
    path_vars = [(k, v) for k, v in dict(locals()).items() if isinstance(v, pathlib.Path)]
    for idx, (name, value) in enumerate(path_vars, start=1):
        print(f"{idx} - {name}: {value}")
|