penwings 0.1.3.dev1__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {penwings-0.1.3.dev1/src/penwings.egg-info → penwings-0.2.0}/PKG-INFO +28 -12
- penwings-0.2.0/pyproject.toml +70 -0
- {penwings-0.1.3.dev1 → penwings-0.2.0}/src/penwings/__init__.py +1 -2
- penwings-0.2.0/src/penwings/_version.py +34 -0
- penwings-0.2.0/src/penwings/io/cache.py +198 -0
- {penwings-0.1.3.dev1 → penwings-0.2.0}/src/penwings/paths.py +3 -1
- {penwings-0.1.3.dev1 → penwings-0.2.0/src/penwings.egg-info}/PKG-INFO +28 -12
- {penwings-0.1.3.dev1 → penwings-0.2.0}/src/penwings.egg-info/SOURCES.txt +1 -2
- penwings-0.2.0/src/penwings.egg-info/requires.txt +27 -0
- penwings-0.1.3.dev1/pyproject.toml +0 -59
- penwings-0.1.3.dev1/src/penwings/io/cache.py +0 -93
- penwings-0.1.3.dev1/src/penwings/tuner.py +0 -42
- penwings-0.1.3.dev1/src/penwings/views.py +0 -79
- penwings-0.1.3.dev1/src/penwings.egg-info/requires.txt +0 -16
- {penwings-0.1.3.dev1 → penwings-0.2.0}/.gitignore +0 -0
- {penwings-0.1.3.dev1 → penwings-0.2.0}/LICENSE +0 -0
- {penwings-0.1.3.dev1 → penwings-0.2.0}/README.md +0 -0
- {penwings-0.1.3.dev1 → penwings-0.2.0}/setup.cfg +0 -0
- {penwings-0.1.3.dev1 → penwings-0.2.0}/src/penwings/_utils/__init__.py +0 -0
- {penwings-0.1.3.dev1 → penwings-0.2.0}/src/penwings/_utils/_decorators.py +0 -0
- {penwings-0.1.3.dev1 → penwings-0.2.0}/src/penwings/_utils/_typing.py +0 -0
- {penwings-0.1.3.dev1 → penwings-0.2.0}/src/penwings/io/__init__.py +0 -0
- {penwings-0.1.3.dev1 → penwings-0.2.0}/src/penwings.egg-info/dependency_links.txt +0 -0
- {penwings-0.1.3.dev1 → penwings-0.2.0}/src/penwings.egg-info/top_level.txt +0 -0
- {penwings-0.1.3.dev1 → penwings-0.2.0}/uv.lock +0 -0
|
@@ -1,27 +1,43 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: penwings
|
|
3
|
-
Version: 0.1.3.dev1
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Lightweight library to handle data and reproduce workflows
|
|
5
|
-
Author-email: Raf Blanckaert <R.Blanckaert@outlook.com>
|
|
6
|
-
License: LICENSE
|
|
5
|
+
Author-email: Raf Blanckaert <r.blanckaert@outlook.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
7
|
Project-URL: Homepage, https://github.com/Frissie/penwings
|
|
8
8
|
Project-URL: Repository, https://github.com/Frissie/penwings
|
|
9
9
|
Project-URL: Issues, https://github.com/Frissie/penwings/issues
|
|
10
|
+
Keywords: data,workflow,reproducibility,sql,analytics
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
10
17
|
Requires-Python: >=3.11
|
|
11
18
|
Description-Content-Type: text/markdown
|
|
12
19
|
License-File: LICENSE
|
|
13
|
-
Requires-Dist: sqlalchemy<3.0.0,>=2.0.46
|
|
14
|
-
Requires-Dist: pyodbc<6.0.0,>=5.3.0
|
|
15
|
-
Requires-Dist: pandas<4.0.0,>=3.0.0
|
|
16
|
-
Requires-Dist: numpy<3.0.0,>=2.4.1
|
|
20
|
+
Requires-Dist: sqlalchemy<3.0,>=2.0
|
|
21
|
+
Requires-Dist: pyodbc<6.0,>=5.0
|
|
22
|
+
Requires-Dist: pandas<4.0,>=2.2
|
|
23
|
+
Requires-Dist: numpy<3.0,>=1.26
|
|
24
|
+
Provides-Extra: excel
|
|
25
|
+
Requires-Dist: openpyxl<4.0,>=3.1; extra == "excel"
|
|
26
|
+
Provides-Extra: ml
|
|
27
|
+
Requires-Dist: scikit-learn<2.0,>=1.4; extra == "ml"
|
|
17
28
|
Provides-Extra: scipy
|
|
18
|
-
Requires-Dist: scipy<2.0.0,>=1.17.0; extra == "scipy"
|
|
19
|
-
Provides-Extra: sklearn
|
|
20
|
-
Requires-Dist: scikit-learn<2.0.0,>=1.8.0; extra == "sklearn"
|
|
29
|
+
Requires-Dist: scipy<2.0,>=1.11; extra == "scipy"
|
|
21
30
|
Provides-Extra: optuna
|
|
22
|
-
Requires-Dist: optuna<5.0.0,>=4.7.0; extra == "optuna"
|
|
31
|
+
Requires-Dist: optuna<5.0,>=3.5; extra == "optuna"
|
|
23
32
|
Provides-Extra: all
|
|
24
|
-
Requires-Dist: openpyxl<4.0.0,>=3.1.5; extra == "all"
|
|
33
|
+
Requires-Dist: openpyxl<4.0,>=3.1; extra == "all"
|
|
34
|
+
Requires-Dist: scikit-learn<2.0,>=1.4; extra == "all"
|
|
35
|
+
Requires-Dist: scipy<2.0,>=1.11; extra == "all"
|
|
36
|
+
Requires-Dist: optuna<5.0,>=3.5; extra == "all"
|
|
37
|
+
Provides-Extra: dev
|
|
38
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
39
|
+
Requires-Dist: ruff>=0.3; extra == "dev"
|
|
40
|
+
Requires-Dist: mypy>=1.8; extra == "dev"
|
|
25
41
|
Dynamic: license-file
|
|
26
42
|
|
|
27
43
|
# Penwings
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "setuptools-scm>=8"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "penwings"
|
|
7
|
+
version = "v0.2.0"
|
|
8
|
+
description = "Lightweight library to handle data and reproduce workflows"
|
|
9
|
+
readme = { file = "README.md", content-type = "text/markdown" }
|
|
10
|
+
license = "MIT"
|
|
11
|
+
authors = [
|
|
12
|
+
{ name = "Raf Blanckaert", email = "r.blanckaert@outlook.com" }
|
|
13
|
+
]
|
|
14
|
+
requires-python = ">=3.11"
|
|
15
|
+
|
|
16
|
+
keywords = ["data", "workflow", "reproducibility", "sql", "analytics"]
|
|
17
|
+
|
|
18
|
+
classifiers = [
|
|
19
|
+
"Development Status :: 3 - Alpha",
|
|
20
|
+
"Intended Audience :: Developers",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Operating System :: OS Independent",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
dependencies = [
|
|
28
|
+
"sqlalchemy>=2.0,<3.0",
|
|
29
|
+
"pyodbc>=5.0,<6.0",
|
|
30
|
+
"pandas>=2.2,<4.0",
|
|
31
|
+
"numpy>=1.26,<3.0",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[project.optional-dependencies]
|
|
35
|
+
excel = ["openpyxl>=3.1,<4.0"]
|
|
36
|
+
ml = ["scikit-learn>=1.4,<2.0"]
|
|
37
|
+
scipy = ["scipy>=1.11,<2.0"]
|
|
38
|
+
optuna = ["optuna>=3.5,<5.0"]
|
|
39
|
+
all = [
|
|
40
|
+
"openpyxl>=3.1,<4.0",
|
|
41
|
+
"scikit-learn>=1.4,<2.0",
|
|
42
|
+
"scipy>=1.11,<2.0",
|
|
43
|
+
"optuna>=3.5,<5.0",
|
|
44
|
+
]
|
|
45
|
+
|
|
46
|
+
dev = [
|
|
47
|
+
"pytest>=8.0",
|
|
48
|
+
"ruff>=0.3",
|
|
49
|
+
"mypy>=1.8",
|
|
50
|
+
]
|
|
51
|
+
|
|
52
|
+
[project.urls]
|
|
53
|
+
Homepage = "https://github.com/Frissie/penwings"
|
|
54
|
+
Repository = "https://github.com/Frissie/penwings"
|
|
55
|
+
Issues = "https://github.com/Frissie/penwings/issues"
|
|
56
|
+
|
|
57
|
+
# setuptools config
|
|
58
|
+
[tool.setuptools]
|
|
59
|
+
package-dir = { "" = "src" }
|
|
60
|
+
|
|
61
|
+
[tool.setuptools.packages.find]
|
|
62
|
+
where = ["src"]
|
|
63
|
+
include = ["penwings*"]
|
|
64
|
+
|
|
65
|
+
# Versioning via SCM
|
|
66
|
+
[tool.setuptools_scm]
|
|
67
|
+
version_scheme = "guess-next-dev"
|
|
68
|
+
local_scheme = "no-local-version"
|
|
69
|
+
tag_regex = "^v(?P<version>.*)$"
|
|
70
|
+
write_to = "src/penwings/_version.py"
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# file generated by setuptools-scm
|
|
2
|
+
# don't change, don't track in version control
|
|
3
|
+
|
|
4
|
+
__all__ = [
|
|
5
|
+
"__version__",
|
|
6
|
+
"__version_tuple__",
|
|
7
|
+
"version",
|
|
8
|
+
"version_tuple",
|
|
9
|
+
"__commit_id__",
|
|
10
|
+
"commit_id",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
TYPE_CHECKING = False
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from typing import Tuple
|
|
16
|
+
from typing import Union
|
|
17
|
+
|
|
18
|
+
VERSION_TUPLE = Tuple[Union[int, str], ...]
|
|
19
|
+
COMMIT_ID = Union[str, None]
|
|
20
|
+
else:
|
|
21
|
+
VERSION_TUPLE = object
|
|
22
|
+
COMMIT_ID = object
|
|
23
|
+
|
|
24
|
+
version: str
|
|
25
|
+
__version__: str
|
|
26
|
+
__version_tuple__: VERSION_TUPLE
|
|
27
|
+
version_tuple: VERSION_TUPLE
|
|
28
|
+
commit_id: COMMIT_ID
|
|
29
|
+
__commit_id__: COMMIT_ID
|
|
30
|
+
|
|
31
|
+
__version__ = version = '0.2.1.dev0'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 2, 1, 'dev0')
|
|
33
|
+
|
|
34
|
+
__commit_id__ = commit_id = 'g96cf35c66'
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
from sqlalchemy import Engine
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from datetime import datetime, timedelta
|
|
6
|
+
from typing import Unpack, Optional
|
|
7
|
+
from .._utils._typing import SQLParquetKwargs
|
|
8
|
+
from .._utils._decorators import timing_sql
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SQLParquetCache:
|
|
12
|
+
"""
|
|
13
|
+
Cache SQL query results locally as Parquet files.
|
|
14
|
+
|
|
15
|
+
This class executes SQL queries using a SQLAlchemy engine and caches
|
|
16
|
+
the results as Parquet files in a specified directory. On subsequent
|
|
17
|
+
calls, the cached Parquet file is returned if it is considered "fresh"
|
|
18
|
+
according to a configurable refresh window.
|
|
19
|
+
|
|
20
|
+
The query can either be provided directly as a SQL string or as a path
|
|
21
|
+
to a ``.sql`` file. If a Parquet file already exists and is within the
|
|
22
|
+
refresh window, it will be loaded instead of re-executing the query,
|
|
23
|
+
unless ``force=True`` is specified.
|
|
24
|
+
|
|
25
|
+
Parameters
|
|
26
|
+
----------
|
|
27
|
+
parquet_dir : str or pathlib.Path
|
|
28
|
+
Directory where Parquet cache files will be stored.
|
|
29
|
+
conn : sqlalchemy.engine.Engine
|
|
30
|
+
SQLAlchemy engine used to execute SQL queries.
|
|
31
|
+
sql_dir : str or pathlib.Path, optional
|
|
32
|
+
Directory containing ``.sql`` files. Required when passing a
|
|
33
|
+
filename instead of a raw SQL string.
|
|
34
|
+
refresh_days : int, default 0
|
|
35
|
+
Number of days for which a cached Parquet file is considered fresh.
|
|
36
|
+
If 0, refresh checking is disabled and existing Parquet files are
|
|
37
|
+
always used unless ``force=True``.
|
|
38
|
+
verbose : bool, default True
|
|
39
|
+
Whether to enable verbose output (if used by decorators or
|
|
40
|
+
extended implementations).
|
|
41
|
+
**kwargs : dict
|
|
42
|
+
Additional keyword arguments passed to ``pandas.read_sql``.
|
|
43
|
+
These are stored globally and merged with per-call arguments
|
|
44
|
+
in :meth:`get`.
|
|
45
|
+
|
|
46
|
+
Notes
|
|
47
|
+
-----
|
|
48
|
+
- Cached Parquet filenames are derived from the SQL filename stem
|
|
49
|
+
or from the provided ``parquet_name``.
|
|
50
|
+
- If a raw SQL string is provided, ``parquet_name`` must be specified.
|
|
51
|
+
- The cache directory is created automatically if it does not exist.
|
|
52
|
+
- The method :meth:`get` returns both the DataFrame and the source
|
|
53
|
+
("SQL" or "Parquet") used to obtain the data.
|
|
54
|
+
"""
|
|
55
|
+
|
|
56
|
+
def __init__(
|
|
57
|
+
self,
|
|
58
|
+
parquet_dir: Path | str,
|
|
59
|
+
conn: Engine,
|
|
60
|
+
sql_dir: Optional[Path | str] = None,
|
|
61
|
+
refresh_days: int = 0, # zero disables refresh when force == false
|
|
62
|
+
verbose: bool = True,
|
|
63
|
+
**kwargs: Unpack[SQLParquetKwargs],
|
|
64
|
+
):
|
|
65
|
+
|
|
66
|
+
if sql_dir is not None:
|
|
67
|
+
self.sql_dir: Path = Path(sql_dir)
|
|
68
|
+
self.parquet_dir: Path = Path(parquet_dir)
|
|
69
|
+
self.refresh_days = refresh_days
|
|
70
|
+
self.conn = conn
|
|
71
|
+
self.global_kwargs = kwargs
|
|
72
|
+
|
|
73
|
+
self.verbose = verbose
|
|
74
|
+
|
|
75
|
+
def set_params(self, **params):
|
|
76
|
+
for key, value in params.items():
|
|
77
|
+
if not hasattr(self, key):
|
|
78
|
+
raise ValueError(f"Invalid parameter: {key}")
|
|
79
|
+
setattr(self, key, value)
|
|
80
|
+
return self
|
|
81
|
+
|
|
82
|
+
def _sql_path(self, sql_file: str) -> Path:
|
|
83
|
+
if not hasattr(self, "sql_dir"):
|
|
84
|
+
raise ValueError("sql_dir must be set when passing a .sql filename.")
|
|
85
|
+
return self.sql_dir / sql_file
|
|
86
|
+
|
|
87
|
+
def _parquet_path(self, parquet_name: str) -> Path:
|
|
88
|
+
return self.parquet_dir / f"{parquet_name}.parquet"
|
|
89
|
+
|
|
90
|
+
def _is_fresh(self, path: Path, refresh_window: int) -> bool:
|
|
91
|
+
"""
|
|
92
|
+
Determine whether a cached Parquet file is fresh enough to use.
|
|
93
|
+
|
|
94
|
+
Parameters
|
|
95
|
+
----------
|
|
96
|
+
path : Path
|
|
97
|
+
Path to the Parquet file.
|
|
98
|
+
refresh_window : int
|
|
99
|
+
Number of days the file is considered valid.
|
|
100
|
+
If 0, the file is always considered fresh (if it exists).
|
|
101
|
+
|
|
102
|
+
Returns
|
|
103
|
+
-------
|
|
104
|
+
bool
|
|
105
|
+
True if the file exists and is within the refresh window.
|
|
106
|
+
"""
|
|
107
|
+
if not path.exists():
|
|
108
|
+
return False
|
|
109
|
+
|
|
110
|
+
# 0 means: never refresh (always use cache if it exists)
|
|
111
|
+
if refresh_window == 0:
|
|
112
|
+
return True
|
|
113
|
+
|
|
114
|
+
last_modified = datetime.fromtimestamp(path.stat().st_mtime)
|
|
115
|
+
age = datetime.now() - last_modified
|
|
116
|
+
return age < timedelta(days=refresh_window)
|
|
117
|
+
|
|
118
|
+
def _read_sql(self, sql_file: str):
|
|
119
|
+
return self._sql_path(sql_file).read_text()
|
|
120
|
+
|
|
121
|
+
def _return_sql(self, query: str, conn, **kwargs: Unpack[SQLParquetKwargs]) -> pd.DataFrame:
|
|
122
|
+
return pd.read_sql(query, conn, **kwargs)
|
|
123
|
+
|
|
124
|
+
@timing_sql
|
|
125
|
+
def get(
|
|
126
|
+
self,
|
|
127
|
+
sql: str,
|
|
128
|
+
parquet_name: str | None = None,
|
|
129
|
+
conn: Engine | None = None,
|
|
130
|
+
refresh_days: int | None = None,
|
|
131
|
+
force: bool = False,
|
|
132
|
+
**kwargs: Unpack[SQLParquetKwargs],
|
|
133
|
+
) -> tuple[pd.DataFrame, str]:
|
|
134
|
+
"""
|
|
135
|
+
Retrieve a DataFrame from cache or execute the SQL query.
|
|
136
|
+
|
|
137
|
+
Parameters
|
|
138
|
+
----------
|
|
139
|
+
sql : str
|
|
140
|
+
Either a raw SQL query string or the filename of a ``.sql`` file.
|
|
141
|
+
parquet_name : str, optional
|
|
142
|
+
Name of the Parquet file (without extension) when passing a raw
|
|
143
|
+
SQL string. Ignored if a ``.sql`` filename is provided.
|
|
144
|
+
conn : sqlalchemy.engine.Engine, optional
|
|
145
|
+
Alternative SQLAlchemy engine to use instead of the default.
|
|
146
|
+
refresh_days : int, optional
|
|
147
|
+
Override the instance-level refresh window (in days).
|
|
148
|
+
force : bool, default False
|
|
149
|
+
If True, bypass the cache and force re-execution of the query.
|
|
150
|
+
**kwargs : dict
|
|
151
|
+
Additional keyword arguments passed to ``pandas.read_sql``.
|
|
152
|
+
These override any global keyword arguments defined at
|
|
153
|
+
initialization.
|
|
154
|
+
|
|
155
|
+
Returns
|
|
156
|
+
-------
|
|
157
|
+
DataFrame
|
|
158
|
+
The resulting query output.
|
|
159
|
+
str
|
|
160
|
+
The data source used: either ``"SQL"`` or ``"Parquet"``.
|
|
161
|
+
|
|
162
|
+
Raises
|
|
163
|
+
------
|
|
164
|
+
ValueError
|
|
165
|
+
If ``sql`` is a raw SQL string and ``parquet_name`` is not provided.
|
|
166
|
+
If ``sql`` is neither a SQL string nor a ``.sql`` file path.
|
|
167
|
+
|
|
168
|
+
Notes
|
|
169
|
+
-----
|
|
170
|
+
If the Parquet file exists and is within the refresh window,
|
|
171
|
+
it is loaded directly using ``pandas.read_parquet``. Otherwise,
|
|
172
|
+
the SQL query is executed and the result is written to Parquet.
|
|
173
|
+
"""
|
|
174
|
+
if isinstance(sql, str) and Path(sql).suffix == ".sql":
|
|
175
|
+
parquet_name = parquet_name or Path(sql).stem
|
|
176
|
+
query = self._read_sql(sql)
|
|
177
|
+
elif isinstance(sql, str):
|
|
178
|
+
if parquet_name is None:
|
|
179
|
+
raise ValueError("parquet_name must be provided if query is passed directly")
|
|
180
|
+
query = sql
|
|
181
|
+
else:
|
|
182
|
+
raise ValueError("sql must be a SQL string or a path to a .sql file")
|
|
183
|
+
|
|
184
|
+
connection = self.conn if conn is None else conn
|
|
185
|
+
refresh_window = self.refresh_days if refresh_days is None else refresh_days
|
|
186
|
+
parquet_path = self._parquet_path(parquet_name)
|
|
187
|
+
self.parquet_dir.mkdir(parents=True, exist_ok=True)
|
|
188
|
+
sql_kwargs = self.global_kwargs | kwargs
|
|
189
|
+
|
|
190
|
+
if not force and self._is_fresh(parquet_path, refresh_window):
|
|
191
|
+
source = "Parquet"
|
|
192
|
+
return pd.read_parquet(parquet_path), source
|
|
193
|
+
|
|
194
|
+
source = "SQL"
|
|
195
|
+
df = self._return_sql(query, connection, **sql_kwargs)
|
|
196
|
+
df.to_parquet(parquet_path, index=False)
|
|
197
|
+
|
|
198
|
+
return df, source
|
|
@@ -4,9 +4,11 @@ home_dir = pathlib.Path.cwd()
|
|
|
4
4
|
proj_dir = pathlib.Path.cwd().parent
|
|
5
5
|
|
|
6
6
|
input_dir = home_dir / "input"
|
|
7
|
-
model_dir = home_dir / "model"
|
|
8
7
|
output_dir = home_dir / "output"
|
|
9
8
|
|
|
9
|
+
sql_dir = input_dir / "sql"
|
|
10
|
+
parquet_dir = input_dir / "parquet"
|
|
11
|
+
|
|
10
12
|
if __name__ == "__main__":
|
|
11
13
|
i = 1
|
|
12
14
|
for name, value in dict(locals()).items():
|
|
@@ -1,27 +1,43 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: penwings
|
|
3
|
-
Version: 0.1.3.dev1
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Lightweight library to handle data and reproduce workflows
|
|
5
|
-
Author-email: Raf Blanckaert <R.Blanckaert@outlook.com>
|
|
6
|
-
License: LICENSE
|
|
5
|
+
Author-email: Raf Blanckaert <r.blanckaert@outlook.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
7
|
Project-URL: Homepage, https://github.com/Frissie/penwings
|
|
8
8
|
Project-URL: Repository, https://github.com/Frissie/penwings
|
|
9
9
|
Project-URL: Issues, https://github.com/Frissie/penwings/issues
|
|
10
|
+
Keywords: data,workflow,reproducibility,sql,analytics
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
10
17
|
Requires-Python: >=3.11
|
|
11
18
|
Description-Content-Type: text/markdown
|
|
12
19
|
License-File: LICENSE
|
|
13
|
-
Requires-Dist: sqlalchemy<3.0.0,>=2.0.46
|
|
14
|
-
Requires-Dist: pyodbc<6.0.0,>=5.3.0
|
|
15
|
-
Requires-Dist: pandas<4.0.0,>=3.0.0
|
|
16
|
-
Requires-Dist: numpy<3.0.0,>=2.4.1
|
|
20
|
+
Requires-Dist: sqlalchemy<3.0,>=2.0
|
|
21
|
+
Requires-Dist: pyodbc<6.0,>=5.0
|
|
22
|
+
Requires-Dist: pandas<4.0,>=2.2
|
|
23
|
+
Requires-Dist: numpy<3.0,>=1.26
|
|
24
|
+
Provides-Extra: excel
|
|
25
|
+
Requires-Dist: openpyxl<4.0,>=3.1; extra == "excel"
|
|
26
|
+
Provides-Extra: ml
|
|
27
|
+
Requires-Dist: scikit-learn<2.0,>=1.4; extra == "ml"
|
|
17
28
|
Provides-Extra: scipy
|
|
18
|
-
Requires-Dist: scipy<2.0.0,>=1.17.0; extra == "scipy"
|
|
19
|
-
Provides-Extra: sklearn
|
|
20
|
-
Requires-Dist: scikit-learn<2.0.0,>=1.8.0; extra == "sklearn"
|
|
29
|
+
Requires-Dist: scipy<2.0,>=1.11; extra == "scipy"
|
|
21
30
|
Provides-Extra: optuna
|
|
22
|
-
Requires-Dist: optuna<5.0.0,>=4.7.0; extra == "optuna"
|
|
31
|
+
Requires-Dist: optuna<5.0,>=3.5; extra == "optuna"
|
|
23
32
|
Provides-Extra: all
|
|
24
|
-
Requires-Dist: openpyxl<4.0.0,>=3.1.5; extra == "all"
|
|
33
|
+
Requires-Dist: openpyxl<4.0,>=3.1; extra == "all"
|
|
34
|
+
Requires-Dist: scikit-learn<2.0,>=1.4; extra == "all"
|
|
35
|
+
Requires-Dist: scipy<2.0,>=1.11; extra == "all"
|
|
36
|
+
Requires-Dist: optuna<5.0,>=3.5; extra == "all"
|
|
37
|
+
Provides-Extra: dev
|
|
38
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
39
|
+
Requires-Dist: ruff>=0.3; extra == "dev"
|
|
40
|
+
Requires-Dist: mypy>=1.8; extra == "dev"
|
|
25
41
|
Dynamic: license-file
|
|
26
42
|
|
|
27
43
|
# Penwings
|
|
@@ -4,9 +4,8 @@ README.md
|
|
|
4
4
|
pyproject.toml
|
|
5
5
|
uv.lock
|
|
6
6
|
src/penwings/__init__.py
|
|
7
|
+
src/penwings/_version.py
|
|
7
8
|
src/penwings/paths.py
|
|
8
|
-
src/penwings/tuner.py
|
|
9
|
-
src/penwings/views.py
|
|
10
9
|
src/penwings.egg-info/PKG-INFO
|
|
11
10
|
src/penwings.egg-info/SOURCES.txt
|
|
12
11
|
src/penwings.egg-info/dependency_links.txt
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
sqlalchemy<3.0,>=2.0
|
|
2
|
+
pyodbc<6.0,>=5.0
|
|
3
|
+
pandas<4.0,>=2.2
|
|
4
|
+
numpy<3.0,>=1.26
|
|
5
|
+
|
|
6
|
+
[all]
|
|
7
|
+
openpyxl<4.0,>=3.1
|
|
8
|
+
scikit-learn<2.0,>=1.4
|
|
9
|
+
scipy<2.0,>=1.11
|
|
10
|
+
optuna<5.0,>=3.5
|
|
11
|
+
|
|
12
|
+
[dev]
|
|
13
|
+
pytest>=8.0
|
|
14
|
+
ruff>=0.3
|
|
15
|
+
mypy>=1.8
|
|
16
|
+
|
|
17
|
+
[excel]
|
|
18
|
+
openpyxl<4.0,>=3.1
|
|
19
|
+
|
|
20
|
+
[ml]
|
|
21
|
+
scikit-learn<2.0,>=1.4
|
|
22
|
+
|
|
23
|
+
[optuna]
|
|
24
|
+
optuna<5.0,>=3.5
|
|
25
|
+
|
|
26
|
+
[scipy]
|
|
27
|
+
scipy<2.0,>=1.11
|
|
@@ -1,59 +0,0 @@
|
|
|
1
|
-
[project]
|
|
2
|
-
name = "penwings"
|
|
3
|
-
dynamic = ["version"]
|
|
4
|
-
description = "Lightweight library to handle data and reproduce workflows"
|
|
5
|
-
readme = "README.md"
|
|
6
|
-
license = {text = "LICENSE"}
|
|
7
|
-
authors = [
|
|
8
|
-
{name = "Raf Blanckaert",email = "R.Blanckaert@outlook.com"}
|
|
9
|
-
]
|
|
10
|
-
requires-python = ">=3.11"
|
|
11
|
-
dependencies = [
|
|
12
|
-
"sqlalchemy (>=2.0.46,<3.0.0)",
|
|
13
|
-
"pyodbc (>=5.3.0,<6.0.0)",
|
|
14
|
-
"pandas (>=3.0.0,<4.0.0)",
|
|
15
|
-
"numpy (>=2.4.1,<3.0.0)"
|
|
16
|
-
]
|
|
17
|
-
|
|
18
|
-
[project.urls]
|
|
19
|
-
Homepage = "https://github.com/Frissie/penwings"
|
|
20
|
-
Repository = "https://github.com/Frissie/penwings"
|
|
21
|
-
Issues = "https://github.com/Frissie/penwings/issues"
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
[tool.setuptools_scm]
|
|
25
|
-
version_scheme = "guess-next-dev"
|
|
26
|
-
local_scheme = "no-local-version"
|
|
27
|
-
tag_regex = "^v(?P<version>.*)$"
|
|
28
|
-
|
|
29
|
-
[tool.setuptools]
|
|
30
|
-
package-dir = {"" = "src"}
|
|
31
|
-
|
|
32
|
-
[tool.setuptools.packages.find]
|
|
33
|
-
where = ["src"]
|
|
34
|
-
include = ["penwings*"]
|
|
35
|
-
exclude = ["penwings._*"]
|
|
36
|
-
|
|
37
|
-
[build-system]
|
|
38
|
-
requires = ["setuptools>=68", "wheel", "setuptools-scm"]
|
|
39
|
-
build-backend = "setuptools.build_meta"
|
|
40
|
-
|
|
41
|
-
[dependency-groups]
|
|
42
|
-
dev = [
|
|
43
|
-
"openpyxl>=3.1.5",
|
|
44
|
-
"optuna>=4.7.0",
|
|
45
|
-
"scikit-learn>=1.8.0",
|
|
46
|
-
"scipy>=1.17.0",
|
|
47
|
-
]
|
|
48
|
-
|
|
49
|
-
[project.optional-dependencies]
|
|
50
|
-
scipy = ["scipy (>=1.17.0,<2.0.0)"]
|
|
51
|
-
sklearn = ["scikit-learn (>=1.8.0,<2.0.0)"]
|
|
52
|
-
optuna= ["optuna (>=4.7.0,<5.0.0)"]
|
|
53
|
-
all = ["openpyxl (>=3.1.5,<4.0.0)"]
|
|
54
|
-
|
|
55
|
-
[[tool.uv.index]]
|
|
56
|
-
name = "testpypi"
|
|
57
|
-
url = "https://test.pypi.org/simple/"
|
|
58
|
-
publish-url = "https://test.pypi.org/legacy/"
|
|
59
|
-
explicit = true
|
|
@@ -1,93 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
|
|
3
|
-
from sqlalchemy import Engine
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
from datetime import datetime, timedelta
|
|
6
|
-
from typing import Unpack, Union, Optional
|
|
7
|
-
from .._utils._typing import SQLParquetKwargs
|
|
8
|
-
from .._utils._decorators import timing_sql
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class SQLParquetCache:
|
|
12
|
-
def __init__(
|
|
13
|
-
self,
|
|
14
|
-
parquet_dir: Union[Path, str],
|
|
15
|
-
conn: Engine,
|
|
16
|
-
sql_dir: Optional[Union[Path, str]] = None,
|
|
17
|
-
refresh_days: int = 0, # zero disables refresh when force == false
|
|
18
|
-
verbose: bool = True,
|
|
19
|
-
**kwargs: Unpack[SQLParquetKwargs],
|
|
20
|
-
):
|
|
21
|
-
|
|
22
|
-
if sql_dir is not None:
|
|
23
|
-
self.sql_dir: Path = Path(sql_dir)
|
|
24
|
-
self.parquet_dir: Path = Path(parquet_dir)
|
|
25
|
-
self.refresh_days = refresh_days
|
|
26
|
-
self.conn = conn
|
|
27
|
-
self.global_kwargs = kwargs
|
|
28
|
-
|
|
29
|
-
self.verbose = verbose
|
|
30
|
-
self.source = "SQL"
|
|
31
|
-
|
|
32
|
-
def set_params(self, **params):
|
|
33
|
-
for key, value in params.items():
|
|
34
|
-
if not hasattr(self, key):
|
|
35
|
-
raise ValueError(f"Invalid parameter: {key}")
|
|
36
|
-
setattr(self, key, value)
|
|
37
|
-
return self
|
|
38
|
-
|
|
39
|
-
def _sql_path(self, sql_file: str) -> Path:
|
|
40
|
-
return self.sql_dir / sql_file
|
|
41
|
-
|
|
42
|
-
def _parquet_path(self, sql_file: str, parquet_name: str | None = None) -> Path:
|
|
43
|
-
name = parquet_name or Path(sql_file).stem
|
|
44
|
-
return self.parquet_dir / f"{name}.parquet"
|
|
45
|
-
|
|
46
|
-
def _is_new(self, path: Path, refresh_window: int) -> bool:
|
|
47
|
-
if not path.exists():
|
|
48
|
-
return False
|
|
49
|
-
if self.refresh_days == 0:
|
|
50
|
-
return True
|
|
51
|
-
last_modified = datetime.fromtimestamp(path.stat().st_mtime)
|
|
52
|
-
return datetime.now() - last_modified < timedelta(days=refresh_window)
|
|
53
|
-
|
|
54
|
-
def _read_sql(self, sql_file: str):
|
|
55
|
-
return self._sql_path(sql_file).read_text()
|
|
56
|
-
|
|
57
|
-
def _return_sql(self, query: str, conn, **kwargs: Unpack[SQLParquetKwargs]) -> pd.DataFrame:
|
|
58
|
-
return pd.read_sql(query, conn, **kwargs)
|
|
59
|
-
|
|
60
|
-
@timing_sql
|
|
61
|
-
def get(
|
|
62
|
-
self,
|
|
63
|
-
sql: str,
|
|
64
|
-
parquet_name: Union[str, None] = None,
|
|
65
|
-
conn: Engine | None = None,
|
|
66
|
-
refresh_days: int | None = None,
|
|
67
|
-
force: bool = False,
|
|
68
|
-
**kwargs: Unpack[SQLParquetKwargs],
|
|
69
|
-
) -> tuple[pd.DataFrame, str]:
|
|
70
|
-
if isinstance(sql, str) and Path(sql).suffix == ".sql":
|
|
71
|
-
query = self._read_sql(sql)
|
|
72
|
-
elif isinstance(sql, str):
|
|
73
|
-
if parquet_name is None:
|
|
74
|
-
raise ValueError("parquet_name must be provided if query is passed directly")
|
|
75
|
-
query = sql
|
|
76
|
-
else:
|
|
77
|
-
raise ValueError("sql must be a SQL string or a path to a .sql file")
|
|
78
|
-
|
|
79
|
-
connection = conn or self.conn
|
|
80
|
-
refresh_window = refresh_days or self.refresh_days
|
|
81
|
-
parquet_path = self._parquet_path(query)
|
|
82
|
-
sql_kwargs = self.global_kwargs | kwargs
|
|
83
|
-
|
|
84
|
-
if not force and self._is_new(parquet_path, refresh_window):
|
|
85
|
-
source = "Parquet"
|
|
86
|
-
return pd.read_parquet(parquet_path), source
|
|
87
|
-
|
|
88
|
-
source = "SQL"
|
|
89
|
-
df = self._return_sql(query, connection, **sql_kwargs)
|
|
90
|
-
self.parquet_dir.mkdir(parents=True, exist_ok=True)
|
|
91
|
-
df.to_parquet(parquet_path, index=False)
|
|
92
|
-
|
|
93
|
-
return df, source
|
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
from optuna.trial import Trial
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def tune_lgbm_params(trial: Trial, model="classifier"):
|
|
5
|
-
if model == "classifier":
|
|
6
|
-
metrics = {
|
|
7
|
-
"objective": "binary",
|
|
8
|
-
"metric": "auc",
|
|
9
|
-
}
|
|
10
|
-
|
|
11
|
-
params = {
|
|
12
|
-
# Core
|
|
13
|
-
"verbosity": -1,
|
|
14
|
-
"boosting_type": "gbdt",
|
|
15
|
-
# GPU
|
|
16
|
-
"device": "gpu",
|
|
17
|
-
"gpu_platform_id": 0,
|
|
18
|
-
"gpu_device_id": 0,
|
|
19
|
-
# Learning
|
|
20
|
-
"learning_rate": trial.suggest_float("learning_rate", 0.005, 0.05, log=True),
|
|
21
|
-
"n_estimators": trial.suggest_int("n_estimators", 1500, 5000),
|
|
22
|
-
# Tree structure (GPU-safe)
|
|
23
|
-
"num_leaves": trial.suggest_int("num_leaves", 31, 128),
|
|
24
|
-
"max_depth": trial.suggest_int("max_depth", 4, 10),
|
|
25
|
-
# Regularization / stability
|
|
26
|
-
"min_child_samples": trial.suggest_int("min_child_samples", 10, 80),
|
|
27
|
-
"min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 10.0, log=True),
|
|
28
|
-
"min_split_gain": trial.suggest_float("min_split_gain", 0.0, 1.0),
|
|
29
|
-
# Sampling
|
|
30
|
-
"subsample": trial.suggest_float("subsample", 0.6, 1.0),
|
|
31
|
-
"subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
|
|
32
|
-
"feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
|
|
33
|
-
# Regularization
|
|
34
|
-
"reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
|
|
35
|
-
"reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 5.0, log=True),
|
|
36
|
-
# Histogram
|
|
37
|
-
"max_bin": trial.suggest_int("max_bin", 64, 255),
|
|
38
|
-
# Class imbalance (keep only if needed)
|
|
39
|
-
"scale_pos_weight": trial.suggest_float("scale_pos_weight", 0.8, 3.0),
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
return metrics | params
|
|
@@ -1,79 +0,0 @@
|
|
|
1
|
-
from sklearn.pipeline import Pipeline
|
|
2
|
-
from sklearn.compose import ColumnTransformer, make_column_selector
|
|
3
|
-
from sklearn.preprocessing import (
|
|
4
|
-
TargetEncoder,
|
|
5
|
-
OneHotEncoder,
|
|
6
|
-
RobustScaler,
|
|
7
|
-
KBinsDiscretizer,
|
|
8
|
-
FunctionTransformer,
|
|
9
|
-
PolynomialFeatures,
|
|
10
|
-
OrdinalEncoder,
|
|
11
|
-
StandardScaler,
|
|
12
|
-
)
|
|
13
|
-
|
|
14
|
-
LinearView = ColumnTransformer(
|
|
15
|
-
[
|
|
16
|
-
("numerical", RobustScaler(), make_column_selector(dtype_exclude="category")),
|
|
17
|
-
("category", TargetEncoder(shuffle=True, smooth=10, random_state=42), make_column_selector(dtype_include="category")),
|
|
18
|
-
],
|
|
19
|
-
remainder="drop",
|
|
20
|
-
verbose_feature_names_out=False,
|
|
21
|
-
).set_output(transform="pandas")
|
|
22
|
-
|
|
23
|
-
DenseView = ColumnTransformer(
|
|
24
|
-
[
|
|
25
|
-
("numerical", StandardScaler(), make_column_selector(dtype_exclude="category")),
|
|
26
|
-
("category", TargetEncoder(shuffle=True, smooth=10, random_state=42), make_column_selector(dtype_include="category")),
|
|
27
|
-
],
|
|
28
|
-
remainder="drop",
|
|
29
|
-
verbose_feature_names_out=False,
|
|
30
|
-
).set_output(transform="pandas")
|
|
31
|
-
|
|
32
|
-
CategoricalView = Pipeline(
|
|
33
|
-
[
|
|
34
|
-
(
|
|
35
|
-
"bins",
|
|
36
|
-
ColumnTransformer(
|
|
37
|
-
[
|
|
38
|
-
(
|
|
39
|
-
"numerical",
|
|
40
|
-
KBinsDiscretizer(n_bins=4, strategy="quantile", quantile_method="averaged_inverted_cdf", encode="ordinal"),
|
|
41
|
-
make_column_selector(dtype_exclude="category"),
|
|
42
|
-
),
|
|
43
|
-
(
|
|
44
|
-
"category",
|
|
45
|
-
OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
|
|
46
|
-
make_column_selector(dtype_include="category"),
|
|
47
|
-
),
|
|
48
|
-
],
|
|
49
|
-
remainder="drop",
|
|
50
|
-
verbose_feature_names_out=False,
|
|
51
|
-
).set_output(transform="pandas"),
|
|
52
|
-
),
|
|
53
|
-
("cats", FunctionTransformer(lambda df: df.astype(int).astype("category"), feature_names_out="one-to-one")),
|
|
54
|
-
]
|
|
55
|
-
)
|
|
56
|
-
|
|
57
|
-
PolynomialView = Pipeline(
|
|
58
|
-
[
|
|
59
|
-
("Linear", LinearView),
|
|
60
|
-
("poly", PolynomialFeatures(degree=2).set_output(transform="pandas")),
|
|
61
|
-
]
|
|
62
|
-
)
|
|
63
|
-
|
|
64
|
-
SparseView = ColumnTransformer(
|
|
65
|
-
[
|
|
66
|
-
(
|
|
67
|
-
"num_bins",
|
|
68
|
-
KBinsDiscretizer(n_bins=10, quantile_method="averaged_inverted_cdf", encode="onehot"),
|
|
69
|
-
make_column_selector(dtype_exclude="category"),
|
|
70
|
-
),
|
|
71
|
-
(
|
|
72
|
-
"cat_ohe",
|
|
73
|
-
OneHotEncoder(handle_unknown="ignore"),
|
|
74
|
-
make_column_selector(dtype_include="category"),
|
|
75
|
-
),
|
|
76
|
-
],
|
|
77
|
-
remainder="drop",
|
|
78
|
-
verbose_feature_names_out=False,
|
|
79
|
-
)
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
sqlalchemy<3.0.0,>=2.0.46
|
|
2
|
-
pyodbc<6.0.0,>=5.3.0
|
|
3
|
-
pandas<4.0.0,>=3.0.0
|
|
4
|
-
numpy<3.0.0,>=2.4.1
|
|
5
|
-
|
|
6
|
-
[all]
|
|
7
|
-
openpyxl<4.0.0,>=3.1.5
|
|
8
|
-
|
|
9
|
-
[optuna]
|
|
10
|
-
optuna<5.0.0,>=4.7.0
|
|
11
|
-
|
|
12
|
-
[scipy]
|
|
13
|
-
scipy<2.0.0,>=1.17.0
|
|
14
|
-
|
|
15
|
-
[sklearn]
|
|
16
|
-
scikit-learn<2.0.0,>=1.8.0
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|