penwings 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
penwings-0.1.0/LICENSE ADDED
@@ -0,0 +1,9 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright 2026 Raf Blanckaert
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,177 @@
1
+ Metadata-Version: 2.4
2
+ Name: penwings
3
+ Version: 0.1.0
4
+ Summary: Lightweight library to handle data and reproduce workflows
5
+ Author-email: Raf Blanckaert <R.Blanckaert@outlook.com>
6
+ License: LICENSE
7
+ Project-URL: Homepage, https://github.com/Frissie/penwings
8
+ Project-URL: Repository, https://github.com/Frissie/penwings
9
+ Project-URL: Issues, https://github.com/Frissie/penwings/issues
10
+ Requires-Python: >=3.11
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: sqlalchemy<3.0.0,>=2.0.46
14
+ Requires-Dist: pyodbc<6.0.0,>=5.3.0
15
+ Requires-Dist: pandas<4.0.0,>=3.0.0
16
+ Requires-Dist: numpy<3.0.0,>=2.4.1
17
+ Provides-Extra: scipy
18
+ Requires-Dist: scipy<2.0.0,>=1.17.0; extra == "scipy"
19
+ Provides-Extra: sklearn
20
+ Requires-Dist: scikit-learn<2.0.0,>=1.8.0; extra == "sklearn"
21
+ Provides-Extra: optuna
22
+ Requires-Dist: optuna<5.0.0,>=4.7.0; extra == "optuna"
23
+ Provides-Extra: all
24
+ Requires-Dist: openpyxl<4.0.0,>=3.1.5; extra == "all"
25
+ Dynamic: license-file
26
+
27
+ # Penwings
28
+
29
+ **Penwings** is a lightweight Python library designed to simplify SQL data workflows by automatically importing data from SQL and caching it as Parquet files. This ensures faster subsequent access and reproducible pipelines, while reducing database load.
30
+
31
+ ---
32
+
33
+ ## Table of Contents
34
+
35
+ 1. [Features](#features)
36
+ 2. [Installation](#installation)
37
+ 3. [Getting Started](#getting-started)
38
+ 4. [Usage](#usage)
39
+ 5. [Versioning](#versioning)
40
+ 6. [Contributing](#contributing)
41
+ 7. [License](#license)
42
+
43
+ ---
44
+
45
+ ## Features
46
+
47
+ - Load data from SQL queries or SQL files
48
+ - Automatically save query results as Parquet files
49
+ - Reuse Parquet files to avoid redundant queries
50
+ - Simple, stable API for reproducible workflows
51
+ - Optimized for performance and ease of integration
52
+
53
+ ---
54
+
55
+ ## Installation
56
+
57
+ Install via pip:
58
+
59
+ pip install penwings
60
+
61
+ > Make sure you have Python 3.11+ installed.
62
+
63
+ ---
64
+
65
+ ## Getting Started
66
+
67
+ ### Importing the Library
68
+
69
+ from penwings import SQLParquetCache
70
+
71
+ ### Initialize the Cache
72
+
73
+ You can initialize the cache by providing either a SQL directory or a query string, along with a Parquet directory:
74
+
75
+ ```
76
+ from sqlalchemy import create_engine
77
+ ```
78
+
79
+ # SQL connection
80
+ ```
81
+ engine = create_engine("postgresql://user:password@localhost/dbname")
82
+
83
+ # Initialize the cache
84
+ loader = SQLParquetCache(
85
+ sql_dir="sql_files", # Optional if using query string
86
+ parquet_dir="parquet_cache",
87
+ conn=engine
88
+ )
89
+ ```
90
+ ---
91
+
92
+ ## Usage
93
+
94
+ ### 1. Using SQL Files
95
+
96
+ If you have SQL files stored in a directory:
97
+
98
+ ```
99
+ # Run a SQL file and cache the result
100
+ df = loader.get("monthly_sales.sql")
101
+ ```
102
+
103
+ - `penwings` will automatically check if a Parquet version exists.
104
+ - If it exists, the cached Parquet is loaded.
105
+ - If not, the SQL query runs and the result is saved as a Parquet file.
106
+
107
+ ### 2. Using SQL Query Strings
108
+
109
+ You can also pass queries directly:
110
+
111
+ ```
112
+ query = "SELECT * FROM sales WHERE month='2026-02'"
113
+ df = loader.get(sql=query, parquet_name="sales_feb2026")
114
+ ```
115
+
116
+ - `parquet_name` determines the Parquet file name.
117
+ - Works similarly to SQL file mode for caching.
118
+
119
+ ### 3. Automatic Parquet Management
120
+
121
+ - All results are cached in the specified `parquet_dir`.
122
+ - This reduces repeated database queries and ensures reproducibility.
123
+ - Cached files can be reloaded for faster access.
124
+
125
+ ---
126
+
127
+ ## Versioning
128
+
129
+ Penwings follows **semantic versioning**:
130
+
131
+ - **MAJOR**: Breaking changes to API
132
+ - **MINOR**: New features, backward-compatible
133
+ - **PATCH**: Bug fixes
134
+
135
+ ---
136
+
137
+ ## Contributing
138
+
139
+ We welcome contributions!
140
+
141
+ 1. Fork the repository
142
+ 2. Create a feature branch (`git checkout -b feature/my-feature`)
143
+ 3. Commit your changes (`git commit -m 'Add new feature'`)
144
+ 4. Push to branch (`git push origin feature/my-feature`)
145
+ 5. Open a pull request
146
+
147
+ Please ensure your code follows PEP8 standards.
148
+
149
+ ---
150
+
151
+ ## License
152
+
153
+ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
154
+
155
+ ---
156
+
157
+ ## Example Workflow
158
+
159
+ ```
160
+ from sqlalchemy import create_engine
161
+ from penwings import SQLParquetCache
162
+
163
+ engine = create_engine("sqlite:///example.db")
164
+
165
+ loader = SQLParquetCache(
166
+ sql_dir="sql_queries",
167
+ parquet_dir="parquet_cache",
168
+ conn=engine
169
+ )
170
+
171
+ # Load data
172
+ df_jan = loader.get("sales_january.sql")
173
+ df_feb = loader.get(sql="SELECT * FROM sales WHERE month='2026-02'", parquet_name="sales_feb")
174
+ ```
175
+
176
+ - SQL files are automatically cached as Parquet
177
+ - Subsequent loads are fast and do not hit the database
@@ -0,0 +1,151 @@
1
+ # Penwings
2
+
3
+ **Penwings** is a lightweight Python library designed to simplify SQL data workflows by automatically importing data from SQL and caching it as Parquet files. This ensures faster subsequent access and reproducible pipelines, while reducing database load.
4
+
5
+ ---
6
+
7
+ ## Table of Contents
8
+
9
+ 1. [Features](#features)
10
+ 2. [Installation](#installation)
11
+ 3. [Getting Started](#getting-started)
12
+ 4. [Usage](#usage)
13
+ 5. [Versioning](#versioning)
14
+ 6. [Contributing](#contributing)
15
+ 7. [License](#license)
16
+
17
+ ---
18
+
19
+ ## Features
20
+
21
+ - Load data from SQL queries or SQL files
22
+ - Automatically save query results as Parquet files
23
+ - Reuse Parquet files to avoid redundant queries
24
+ - Simple, stable API for reproducible workflows
25
+ - Optimized for performance and ease of integration
26
+
27
+ ---
28
+
29
+ ## Installation
30
+
31
+ Install via pip:
32
+
33
+ pip install penwings
34
+
35
+ > Make sure you have Python 3.11+ installed.
36
+
37
+ ---
38
+
39
+ ## Getting Started
40
+
41
+ ### Importing the Library
42
+
43
+ from penwings import SQLParquetCache
44
+
45
+ ### Initialize the Cache
46
+
47
+ You can initialize the cache by providing either a SQL directory or a query string, along with a Parquet directory:
48
+
49
+ ```
50
+ from sqlalchemy import create_engine
51
+ ```
52
+
53
+ # SQL connection
54
+ ```
55
+ engine = create_engine("postgresql://user:password@localhost/dbname")
56
+
57
+ # Initialize the cache
58
+ loader = SQLParquetCache(
59
+ sql_dir="sql_files", # Optional if using query string
60
+ parquet_dir="parquet_cache",
61
+ conn=engine
62
+ )
63
+ ```
64
+ ---
65
+
66
+ ## Usage
67
+
68
+ ### 1. Using SQL Files
69
+
70
+ If you have SQL files stored in a directory:
71
+
72
+ ```
73
+ # Run a SQL file and cache the result
74
+ df = loader.get("monthly_sales.sql")
75
+ ```
76
+
77
+ - `penwings` will automatically check if a Parquet version exists.
78
+ - If it exists, the cached Parquet is loaded.
79
+ - If not, the SQL query runs and the result is saved as a Parquet file.
80
+
81
+ ### 2. Using SQL Query Strings
82
+
83
+ You can also pass queries directly:
84
+
85
+ ```
86
+ query = "SELECT * FROM sales WHERE month='2026-02'"
87
+ df = loader.get(sql=query, parquet_name="sales_feb2026")
88
+ ```
89
+
90
+ - `parquet_name` determines the Parquet file name.
91
+ - Works similarly to SQL file mode for caching.
92
+
93
+ ### 3. Automatic Parquet Management
94
+
95
+ - All results are cached in the specified `parquet_dir`.
96
+ - This reduces repeated database queries and ensures reproducibility.
97
+ - Cached files can be reloaded for faster access.
98
+
99
+ ---
100
+
101
+ ## Versioning
102
+
103
+ Penwings follows **semantic versioning**:
104
+
105
+ - **MAJOR**: Breaking changes to API
106
+ - **MINOR**: New features, backward-compatible
107
+ - **PATCH**: Bug fixes
108
+
109
+ ---
110
+
111
+ ## Contributing
112
+
113
+ We welcome contributions!
114
+
115
+ 1. Fork the repository
116
+ 2. Create a feature branch (`git checkout -b feature/my-feature`)
117
+ 3. Commit your changes (`git commit -m 'Add new feature'`)
118
+ 4. Push to branch (`git push origin feature/my-feature`)
119
+ 5. Open a pull request
120
+
121
+ Please ensure your code follows PEP8 standards.
122
+
123
+ ---
124
+
125
+ ## License
126
+
127
+ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
128
+
129
+ ---
130
+
131
+ ## Example Workflow
132
+
133
+ ```
134
+ from sqlalchemy import create_engine
135
+ from penwings import SQLParquetCache
136
+
137
+ engine = create_engine("sqlite:///example.db")
138
+
139
+ loader = SQLParquetCache(
140
+ sql_dir="sql_queries",
141
+ parquet_dir="parquet_cache",
142
+ conn=engine
143
+ )
144
+
145
+ # Load data
146
+ df_jan = loader.get("sales_january.sql")
147
+ df_feb = loader.get(sql="SELECT * FROM sales WHERE month='2026-02'", parquet_name="sales_feb")
148
+ ```
149
+
150
+ - SQL files are automatically cached as Parquet
151
+ - Subsequent loads are fast and do not hit the database
@@ -0,0 +1,59 @@
1
+ [project]
2
+ name = "penwings"
3
+ dynamic = ["version"]
4
+ description = "Lightweight library to handle data and reproduce workflows"
5
+ readme = "README.md"
6
+ license = {text = "MIT"}
7
+ authors = [
8
+ {name = "Raf Blanckaert",email = "R.Blanckaert@outlook.com"}
9
+ ]
10
+ requires-python = ">=3.11"
11
+ dependencies = [
12
+ "sqlalchemy (>=2.0.46,<3.0.0)",
13
+ "pyodbc (>=5.3.0,<6.0.0)",
14
+ "pandas (>=3.0.0,<4.0.0)",
15
+ "numpy (>=2.4.1,<3.0.0)"
16
+ ]
17
+
18
+ [project.urls]
19
+ Homepage = "https://github.com/Frissie/penwings"
20
+ Repository = "https://github.com/Frissie/penwings"
21
+ Issues = "https://github.com/Frissie/penwings/issues"
22
+
23
+
24
+ [tool.setuptools_scm]
25
+ version_scheme = "guess-next-dev"
26
+ local_scheme = "no-local-version"
27
+ tag_regex = "^v(?P<version>.*)$"
28
+
29
+ [tool.setuptools]
30
+ package-dir = {"" = "src"}
31
+
32
+ [tool.setuptools.packages.find]
33
+ where = ["src"]
34
+ include = ["penwings*"]
35
+ exclude = ["penwings._*"]
36
+
37
+ [build-system]
38
+ requires = ["setuptools>=68", "wheel", "setuptools-scm"]
39
+ build-backend = "setuptools.build_meta"
40
+
41
+ [dependency-groups]
42
+ dev = [
43
+ "openpyxl>=3.1.5",
44
+ "optuna>=4.7.0",
45
+ "scikit-learn>=1.8.0",
46
+ "scipy>=1.17.0",
47
+ ]
48
+
49
+ [project.optional-dependencies]
50
+ scipy = ["scipy (>=1.17.0,<2.0.0)"]
51
+ sklearn = ["scikit-learn (>=1.8.0,<2.0.0)"]
52
+ optuna = ["optuna (>=4.7.0,<5.0.0)"]
53
+ all = ["openpyxl (>=3.1.5,<4.0.0)"]
54
+
55
+ [[tool.uv.index]]
56
+ name = "testpypi"
57
+ url = "https://test.pypi.org/simple/"
58
+ publish-url = "https://test.pypi.org/legacy/"
59
+ explicit = true
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,9 @@
1
"""Public package surface for penwings.

Re-exports the SQL-to-Parquet cache class and the project directory
constants so callers can write ``from penwings import SQLParquetCache``.
"""

from .io.cache import SQLParquetCache
from .paths import input_dir, output_dir, model_dir

# Explicit public API for `from penwings import *`.
__all__ = [
    "SQLParquetCache",
    "input_dir",
    "output_dir",
    "model_dir",
]
File without changes
@@ -0,0 +1,38 @@
1
+ import time as t
2
+
3
+ from functools import wraps
4
+ from pathlib import Path
5
+
6
+
7
def timing(func):
    """Decorator that prints the wall-clock duration of each call.

    Args:
        func: Any callable; its return value is passed through unchanged.

    Returns:
        The wrapped callable.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high-resolution — suitable for timing.
        start = t.perf_counter()
        result = func(*args, **kwargs)
        end = t.perf_counter()
        # Fix: include the unit, matching timing_sql's "… seconds" output.
        print(f"{func.__name__} took {end - start: .2f} seconds")
        return result

    return wrapper
17
+
18
+
19
def timing_sql(func):
    """Decorator for cache loaders that return ``(result, source)``.

    Times the call, optionally prints "<name> -> <source> took X.XX
    seconds to load" (controlled by the bound instance's ``verbose``
    attribute), then unwraps the tuple and returns only the result.

    Args:
        func: A method returning a ``(result, source)`` pair, typically
            ``SQLParquetCache.get``.

    Returns:
        The wrapped callable, which returns only ``result``.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        # Fix: SQLParquetCache.get's keyword is named "sql", not
        # "sql_file" — the old lookup missed it and then crashed on
        # Path(None). Accept both spellings plus the positional form.
        sql_file = kwargs.get("sql_file") or kwargs.get("sql")
        # args[0] is the bound instance when decorating a method; guard
        # against plain-function use.
        verbose = getattr(args[0], "verbose", True) if args else True

        if sql_file is None and len(args) > 1:
            sql_file = args[1]

        start = t.perf_counter()
        result, source = func(*args, **kwargs)
        end = t.perf_counter()

        if verbose:
            # A raw query string still gets a (rough) stem-based label;
            # fall back to a placeholder when nothing identifiable exists.
            label = Path(sql_file).stem if sql_file is not None else "<query>"
            print(f"{label} -> {source} took {end - start: .2f} seconds to load")
        return result

    return wrapper
@@ -0,0 +1,7 @@
1
+ from typing import TypedDict, List, Union
2
+
3
+
4
+ class SQLParquetKwargs(TypedDict, total=False):
5
+ index_col: Union[str, List[str], None]
6
+ parse_dates: Union[List[str], None]
7
+ dtype: Union[dict, None]
File without changes
@@ -0,0 +1,93 @@
1
+ import pandas as pd
2
+
3
+ from sqlalchemy import Engine
4
+ from pathlib import Path
5
+ from datetime import datetime, timedelta
6
+ from typing import Unpack, Union, Optional
7
+ from .._utils._typing import SQLParquetKwargs
8
+ from .._utils._decorators import timing_sql
9
+
10
+
11
class SQLParquetCache:
    """Load SQL query results with transparent Parquet caching.

    ``get`` first looks in ``parquet_dir`` for a fresh cached file; on a
    cache miss (or ``force=True``) the query runs against the database
    and the result is written back to the cache.
    """

    def __init__(
        self,
        parquet_dir: Union[Path, str],
        conn: Engine,
        sql_dir: Optional[Union[Path, str]] = None,
        refresh_days: int = 0,  # zero disables refresh when force == False
        verbose: bool = True,
        **kwargs: Unpack[SQLParquetKwargs],
    ):
        """Store the connection, directories and default read_sql kwargs.

        Args:
            parquet_dir: Directory where cached Parquet files live.
            conn: SQLAlchemy engine used to execute queries.
            sql_dir: Optional directory containing ``.sql`` files.
            refresh_days: Cache files older than this many days are
                re-queried; 0 means the cache never expires.
            verbose: Print timing information on each ``get``.
            **kwargs: Defaults forwarded to ``pandas.read_sql``.
        """
        # Fix: always bind sql_dir so attribute access (and set_params)
        # cannot fail with AttributeError when it was omitted.
        self.sql_dir: Optional[Path] = Path(sql_dir) if sql_dir is not None else None
        self.parquet_dir: Path = Path(parquet_dir)
        self.refresh_days = refresh_days
        self.conn = conn
        self.global_kwargs = kwargs

        self.verbose = verbose
        self.source = "SQL"

    def set_params(self, **params):
        """Update existing attributes in bulk; unknown names raise ValueError."""
        for key, value in params.items():
            if not hasattr(self, key):
                raise ValueError(f"Invalid parameter: {key}")
            setattr(self, key, value)
        return self

    def _sql_path(self, sql_file: str) -> Path:
        """Resolve a ``.sql`` file name against the configured sql_dir."""
        if self.sql_dir is None:
            # Explicit error instead of a confusing TypeError on None / str.
            raise ValueError("sql_dir was not configured; cannot resolve SQL files")
        return self.sql_dir / sql_file

    def _parquet_path(self, sql_file: str, parquet_name: str | None = None) -> Path:
        """Cache file path: the explicit name if given, else the SQL file stem."""
        name = parquet_name or Path(sql_file).stem
        return self.parquet_dir / f"{name}.parquet"

    def _is_new(self, path: Path, refresh_window: int) -> bool:
        """Return True when the cached file exists and is still fresh."""
        if not path.exists():
            return False
        # A window of 0 disables expiry entirely.  Bug fix: compare the
        # per-call refresh_window, not self.refresh_days, so a
        # refresh_days override passed to get() is honoured.
        if refresh_window == 0:
            return True
        last_modified = datetime.fromtimestamp(path.stat().st_mtime)
        return datetime.now() - last_modified < timedelta(days=refresh_window)

    def _read_sql(self, sql_file: str):
        """Read the text of a SQL file from sql_dir."""
        return self._sql_path(sql_file).read_text()

    def _return_sql(self, query: str, conn, **kwargs: Unpack[SQLParquetKwargs]) -> pd.DataFrame:
        """Execute the query against conn and return the result as a DataFrame."""
        return pd.read_sql(query, conn, **kwargs)

    @timing_sql
    def get(
        self,
        sql: str,
        parquet_name: Union[str, None] = None,
        conn: Engine | None = None,
        refresh_days: int | None = None,
        force: bool = False,
        **kwargs: Unpack[SQLParquetKwargs],
    ) -> tuple[pd.DataFrame, str]:
        """Return query results, preferring the Parquet cache.

        Args:
            sql: Either a ``.sql`` file name (resolved in ``sql_dir``) or
                a raw SQL query string.
            parquet_name: Cache file name; required for raw query strings.
            conn: Optional engine overriding the instance connection.
            refresh_days: Optional per-call cache-window override.
            force: When True, always re-query and rewrite the cache.
            **kwargs: Per-call ``pandas.read_sql`` overrides.

        Returns:
            ``(DataFrame, source)`` where source is "Parquet" or "SQL".
            The ``timing_sql`` decorator unwraps the tuple, so callers
            receive only the DataFrame.

        Raises:
            ValueError: A raw query is given without ``parquet_name``, or
                ``sql`` is not a string.
        """
        if isinstance(sql, str) and Path(sql).suffix == ".sql":
            query = self._read_sql(sql)
        elif isinstance(sql, str):
            if parquet_name is None:
                raise ValueError("parquet_name must be provided if query is passed directly")
            query = sql
        else:
            raise ValueError("sql must be a SQL string or a path to a .sql file")

        connection = conn or self.conn
        # `is not None` (not `or`) so an explicit 0 can disable expiry.
        refresh_window = refresh_days if refresh_days is not None else self.refresh_days
        # Bug fix: the cache path must be derived from the file name /
        # parquet_name — previously the query TEXT was passed, which
        # produced garbage paths and ignored parquet_name entirely.
        parquet_path = self._parquet_path(sql, parquet_name)
        sql_kwargs = self.global_kwargs | kwargs

        if not force and self._is_new(parquet_path, refresh_window):
            return pd.read_parquet(parquet_path), "Parquet"

        df = self._return_sql(query, connection, **sql_kwargs)
        self.parquet_dir.mkdir(parents=True, exist_ok=True)
        df.to_parquet(parquet_path, index=False)

        return df, "SQL"
@@ -0,0 +1,16 @@
1
import pathlib

# Directory layout anchored at the current working directory.
# NOTE(review): cwd-based anchoring assumes callers run from the project
# root — confirm this is the intended contract.
home_dir = pathlib.Path.cwd()
proj_dir = pathlib.Path.cwd().parent

input_dir = home_dir / "input"
sql_dir = input_dir / "sql"
parquet_dir = input_dir / "parquet"
output_dir = home_dir / "output"
# Bug fix: the package __init__ re-exports model_dir, but it was never
# defined here, so `import penwings` failed with ImportError.
# TODO(review): confirm "model" (vs "models") is the intended folder name.
model_dir = home_dir / "model"

if __name__ == "__main__":
    # Quick inspection helper: enumerate every Path defined above.
    i = 1
    for name, value in dict(locals()).items():
        if isinstance(value, pathlib.Path):
            print(f"{i} - {name}: {value}")
            i += 1