penwings 0.1.1b0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- penwings-0.1.1b0/.gitignore +10 -0
- penwings-0.1.1b0/PKG-INFO +17 -0
- penwings-0.1.1b0/pyproject.toml +43 -0
- penwings-0.1.1b0/setup.cfg +4 -0
- penwings-0.1.1b0/src/penwings/__init__.py +9 -0
- penwings-0.1.1b0/src/penwings/_utils/__init__.py +0 -0
- penwings-0.1.1b0/src/penwings/_utils/_decorators.py +38 -0
- penwings-0.1.1b0/src/penwings/_utils/_typing.py +7 -0
- penwings-0.1.1b0/src/penwings/io/__init__.py +0 -0
- penwings-0.1.1b0/src/penwings/io/cache.py +93 -0
- penwings-0.1.1b0/src/penwings/paths.py +15 -0
- penwings-0.1.1b0/src/penwings/tuner.py +42 -0
- penwings-0.1.1b0/src/penwings/views.py +79 -0
- penwings-0.1.1b0/src/penwings.egg-info/PKG-INFO +17 -0
- penwings-0.1.1b0/src/penwings.egg-info/SOURCES.txt +17 -0
- penwings-0.1.1b0/src/penwings.egg-info/dependency_links.txt +1 -0
- penwings-0.1.1b0/src/penwings.egg-info/requires.txt +16 -0
- penwings-0.1.1b0/src/penwings.egg-info/top_level.txt +1 -0
- penwings-0.1.1b0/uv.lock +766 -0
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: penwings
|
|
3
|
+
Version: 0.1.1b0
|
|
4
|
+
Author-email: Frissie <R.Blanckaert@outlook.com>
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: sqlalchemy<3.0.0,>=2.0.46
|
|
7
|
+
Requires-Dist: pyodbc<6.0.0,>=5.3.0
|
|
8
|
+
Requires-Dist: pandas<4.0.0,>=3.0.0
|
|
9
|
+
Requires-Dist: numpy<3.0.0,>=2.4.1
|
|
10
|
+
Provides-Extra: scipy
|
|
11
|
+
Requires-Dist: scipy<2.0.0,>=1.17.0; extra == "scipy"
|
|
12
|
+
Provides-Extra: sklearn
|
|
13
|
+
Requires-Dist: scikit-learn<2.0.0,>=1.8.0; extra == "sklearn"
|
|
14
|
+
Provides-Extra: optuna
|
|
15
|
+
Requires-Dist: optuna<5.0.0,>=4.7.0; extra == "optuna"
|
|
16
|
+
Provides-Extra: all
|
|
17
|
+
Requires-Dist: openpyxl<4.0.0,>=3.1.5; extra == "all"
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "penwings"
|
|
3
|
+
dynamic = ["version"]
|
|
4
|
+
description = ""
|
|
5
|
+
authors = [
|
|
6
|
+
{name = "Frissie",email = "R.Blanckaert@outlook.com"}
|
|
7
|
+
]
|
|
8
|
+
requires-python = ">=3.11"
|
|
9
|
+
dependencies = [
|
|
10
|
+
"sqlalchemy (>=2.0.46,<3.0.0)",
|
|
11
|
+
"pyodbc (>=5.3.0,<6.0.0)",
|
|
12
|
+
"pandas (>=3.0.0,<4.0.0)",
|
|
13
|
+
"numpy (>=2.4.1,<3.0.0)"
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[tool.setuptools_scm]
|
|
17
|
+
tag_regex = "^v(?P<version>.*)$"
|
|
18
|
+
|
|
19
|
+
[tool.setuptools]
|
|
20
|
+
package-dir = {"" = "src"}
|
|
21
|
+
|
|
22
|
+
[tool.setuptools.packages.find]
|
|
23
|
+
where = ["src"]
|
|
24
|
+
include = ["penwings*"]
|
|
25
|
+
exclude = ["penwings._*"]
|
|
26
|
+
|
|
27
|
+
[build-system]
|
|
28
|
+
requires = ["setuptools>=68", "wheel", "setuptools-scm"]
|
|
29
|
+
build-backend = "setuptools.build_meta"
|
|
30
|
+
|
|
31
|
+
[dependency-groups]
|
|
32
|
+
dev = [
|
|
33
|
+
"openpyxl>=3.1.5",
|
|
34
|
+
"optuna>=4.7.0",
|
|
35
|
+
"scikit-learn>=1.8.0",
|
|
36
|
+
"scipy>=1.17.0",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[project.optional-dependencies]
|
|
40
|
+
scipy = ["scipy (>=1.17.0,<2.0.0)"]
|
|
41
|
+
sklearn = ["scikit-learn (>=1.8.0,<2.0.0)"]
|
|
42
|
+
optuna = ["optuna (>=4.7.0,<5.0.0)"]
|
|
43
|
+
all = ["openpyxl (>=3.1.5,<4.0.0)", "scipy (>=1.17.0,<2.0.0)", "scikit-learn (>=1.8.0,<2.0.0)", "optuna (>=4.7.0,<5.0.0)"]
|
|
File without changes
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import time as t
|
|
2
|
+
|
|
3
|
+
from functools import wraps
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def timing(func):
    """Decorator that prints how long each call to *func* took (2 decimals)."""

    @wraps(func)
    def wrapper(*args, **kwargs):
        started = t.perf_counter()
        outcome = func(*args, **kwargs)
        elapsed = t.perf_counter() - started
        print(f"{func.__name__} took {elapsed: .2f}")
        return outcome

    return wrapper
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def timing_sql(func):
    """Decorator for loaders that return a ``(result, source)`` tuple.

    Times the wrapped call and, unless the bound instance (``args[0]``)
    has a falsy ``verbose`` attribute, prints
    ``"<label> -> <source> took <secs> seconds to load"``. Only ``result``
    is returned to the caller; ``source`` is consumed by the log line.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        sql_file = kwargs.get("sql_file", None)
        # BUGFIX: plain functions have no bound instance; don't index args[0]
        # unconditionally (was an IndexError for zero-arg calls).
        verbose = getattr(args[0], "verbose", True) if args else True

        # Fall back to the first positional argument after self.
        if sql_file is None and len(args) > 1:
            sql_file = args[1]

        # BUGFIX: Path(None) raised TypeError when no sql_file was supplied;
        # use the function name as the label in that case.
        label = Path(sql_file).stem if sql_file is not None else func.__name__

        start = t.perf_counter()
        result, source = func(*args, **kwargs)
        end = t.perf_counter()

        if verbose:
            print(f"{label} -> {source} took {end - start: .2f} seconds to load")
        return result

    return wrapper
|
|
File without changes
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
from sqlalchemy import Engine
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from datetime import datetime, timedelta
|
|
6
|
+
from typing import Unpack, Union, Optional
|
|
7
|
+
from .._utils._typing import SQLParquetKwargs
|
|
8
|
+
from .._utils._decorators import timing_sql
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SQLParquetCache:
    """Cache SQL query results as parquet files.

    ``get`` serves the cached parquet when it is fresh within the refresh
    window; otherwise it runs the query against the SQLAlchemy engine and
    rewrites the cache file.
    """

    def __init__(
        self,
        parquet_dir: Union[Path, str],
        conn: Engine,
        sql_dir: Optional[Union[Path, str]] = None,
        refresh_days: int = 0,  # zero disables refresh when force == false
        verbose: bool = True,
        **kwargs: Unpack[SQLParquetKwargs],
    ):
        # sql_dir is only needed when queries are loaded from .sql files;
        # leaving it unset and then passing a .sql path raises AttributeError.
        if sql_dir is not None:
            self.sql_dir: Path = Path(sql_dir)
        self.parquet_dir: Path = Path(parquet_dir)
        self.refresh_days = refresh_days
        self.conn = conn
        # Default kwargs for pd.read_sql; per-call kwargs override these.
        self.global_kwargs = kwargs

        self.verbose = verbose
        self.source = "SQL"

    def set_params(self, **params):
        """Set existing attributes from keyword arguments; returns self (fluent)."""
        for key, value in params.items():
            if not hasattr(self, key):
                raise ValueError(f"Invalid parameter: {key}")
            setattr(self, key, value)
        return self

    def _sql_path(self, sql_file: str) -> Path:
        """Resolve *sql_file* against the configured sql_dir."""
        return self.sql_dir / sql_file

    def _parquet_path(self, sql_file: str, parquet_name: str | None = None) -> Path:
        """Cache file path: <parquet_dir>/<parquet_name or stem of sql_file>.parquet."""
        name = parquet_name or Path(sql_file).stem
        return self.parquet_dir / f"{name}.parquet"

    def _is_new(self, path: Path, refresh_window: int) -> bool:
        """Return True when *path* exists and is fresh within *refresh_window* days."""
        if not path.exists():
            return False
        # BUGFIX: honour the per-call window (previously checked
        # self.refresh_days, ignoring a refresh_days override from get()).
        if refresh_window == 0:
            return True
        last_modified = datetime.fromtimestamp(path.stat().st_mtime)
        return datetime.now() - last_modified < timedelta(days=refresh_window)

    def _read_sql(self, sql_file: str):
        """Return the text of the .sql file resolved against sql_dir."""
        return self._sql_path(sql_file).read_text()

    def _return_sql(self, query: str, conn, **kwargs: Unpack[SQLParquetKwargs]) -> pd.DataFrame:
        return pd.read_sql(query, conn, **kwargs)

    @timing_sql
    def get(
        self,
        sql: str,
        parquet_name: Union[str, None] = None,
        conn: Engine | None = None,
        refresh_days: int | None = None,
        force: bool = False,
        **kwargs: Unpack[SQLParquetKwargs],
    ) -> tuple[pd.DataFrame, str]:
        """Return ``(DataFrame, source)`` where source is "Parquet" or "SQL".

        *sql* is either the name of a ``.sql`` file (resolved against
        ``sql_dir``) or a raw SQL string; for raw strings *parquet_name*
        is required to name the cache file. ``force=True`` bypasses the
        cache and always re-runs the query.
        """
        if isinstance(sql, str) and Path(sql).suffix == ".sql":
            query = self._read_sql(sql)
        elif isinstance(sql, str):
            if parquet_name is None:
                raise ValueError("parquet_name must be provided if query is passed directly")
            query = sql
        else:
            raise ValueError("sql must be a SQL string or a path to a .sql file")

        connection = conn or self.conn
        # refresh_days=0 is a valid explicit override, so compare to None
        # instead of relying on truthiness.
        refresh_window = self.refresh_days if refresh_days is None else refresh_days
        # BUGFIX: derive the cache path from the .sql name / parquet_name;
        # previously the raw query TEXT was used as the file stem and
        # parquet_name was silently ignored.
        parquet_path = self._parquet_path(sql, parquet_name)
        sql_kwargs = self.global_kwargs | kwargs

        if not force and self._is_new(parquet_path, refresh_window):
            source = "Parquet"
            return pd.read_parquet(parquet_path), source

        source = "SQL"
        df = self._return_sql(query, connection, **sql_kwargs)
        self.parquet_dir.mkdir(parents=True, exist_ok=True)
        df.to_parquet(parquet_path, index=False)

        return df, source
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import pathlib

# Project layout anchors, all relative to the current working directory.
home_dir = pathlib.Path.cwd()
proj_dir = pathlib.Path.cwd().parent

input_dir = home_dir / "input"
model_dir = home_dir / "model"
output_dir = home_dir / "output"

if __name__ == "__main__":
    # Print every Path defined above, numbered for quick inspection.
    snapshot = dict(locals())
    path_vars = [(n, v) for n, v in snapshot.items() if isinstance(v, pathlib.Path)]
    for idx, (name, value) in enumerate(path_vars, start=1):
        print(f"{idx} - {name}: {value}")
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from optuna.trial import Trial
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def tune_lgbm_params(trial: "Trial", model="classifier"):
    """Sample a LightGBM hyper-parameter dict from an Optuna *trial*.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyper-parameters via ``suggest_*`` calls.
    model : str
        Kind of model to tune; only ``"classifier"`` is supported.

    Returns
    -------
    dict
        Objective/metric settings merged with the sampled parameters.

    Raises
    ------
    ValueError
        If *model* is not a supported kind. (Previously an unsupported
        kind left ``metrics`` unassigned and crashed with NameError at
        the final merge.)
    """
    if model == "classifier":
        metrics = {
            "objective": "binary",
            "metric": "auc",
        }
    else:
        # BUGFIX: fail fast with a clear message instead of NameError.
        raise ValueError(f"Unsupported model kind: {model!r}")

    params = {
        # Core
        "verbosity": -1,
        "boosting_type": "gbdt",
        # GPU
        "device": "gpu",
        "gpu_platform_id": 0,
        "gpu_device_id": 0,
        # Learning
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.05, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 1500, 5000),
        # Tree structure (GPU-safe)
        "num_leaves": trial.suggest_int("num_leaves", 31, 128),
        "max_depth": trial.suggest_int("max_depth", 4, 10),
        # Regularization / stability
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 80),
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 10.0, log=True),
        "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 1.0),
        # Sampling
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        # Regularization
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 5.0, log=True),
        # Histogram
        "max_bin": trial.suggest_int("max_bin", 64, 255),
        # Class imbalance (keep only if needed)
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 0.8, 3.0),
    }

    return metrics | params
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import (
    TargetEncoder,
    OneHotEncoder,
    RobustScaler,
    KBinsDiscretizer,
    FunctionTransformer,
    PolynomialFeatures,
    OrdinalEncoder,
    StandardScaler,
)

# Reusable preprocessing "views": each splits the input frame into numeric
# columns (dtype_exclude="category") and categorical columns
# (dtype_include="category"), drops everything else, and transforms the two
# groups separately. All but SparseView emit pandas DataFrames via
# set_output(transform="pandas").
# NOTE(review): the selectors assume categoricals use pandas "category"
# dtype — plain object/string columns would be routed to the numeric
# branch. Confirm upstream dtype handling.

# Outlier-robust (median/IQR) scaling for numerics; target encoding for
# categoricals.
LinearView = ColumnTransformer(
    [
        ("numerical", RobustScaler(), make_column_selector(dtype_exclude="category")),
        # smooth=10 shrinks per-category means toward the global mean;
        # fixed random_state keeps the internal CV shuffle reproducible.
        ("category", TargetEncoder(shuffle=True, smooth=10, random_state=42), make_column_selector(dtype_include="category")),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
).set_output(transform="pandas")

# Same layout as LinearView but with mean/std (StandardScaler) for numerics.
DenseView = ColumnTransformer(
    [
        ("numerical", StandardScaler(), make_column_selector(dtype_exclude="category")),
        ("category", TargetEncoder(shuffle=True, smooth=10, random_state=42), make_column_selector(dtype_include="category")),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
).set_output(transform="pandas")

# Everything-categorical view: numerics are quantile-binned into 4 ordinal
# bins, categoricals are ordinal-encoded (unknown categories -> -1), then
# the whole frame is cast to integer-coded pandas "category" dtype.
CategoricalView = Pipeline(
    [
        (
            "bins",
            ColumnTransformer(
                [
                    (
                        "numerical",
                        KBinsDiscretizer(n_bins=4, strategy="quantile", quantile_method="averaged_inverted_cdf", encode="ordinal"),
                        make_column_selector(dtype_exclude="category"),
                    ),
                    (
                        "category",
                        OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
                        make_column_selector(dtype_include="category"),
                    ),
                ],
                remainder="drop",
                verbose_feature_names_out=False,
            ).set_output(transform="pandas"),
        ),
        # Bin/ordinal codes come out as floats; cast to int, then to category.
        ("cats", FunctionTransformer(lambda df: df.astype(int).astype("category"), feature_names_out="one-to-one")),
    ]
)

# LinearView followed by a degree-2 polynomial feature expansion.
PolynomialView = Pipeline(
    [
        ("Linear", LinearView),
        ("poly", PolynomialFeatures(degree=2).set_output(transform="pandas")),
    ]
)

# Sparse one-hot view: numerics one-hot encoded into 10 quantile bins,
# categoricals one-hot encoded (unknown categories ignored). No pandas
# output here, so the result can stay a sparse matrix.
SparseView = ColumnTransformer(
    [
        (
            "num_bins",
            KBinsDiscretizer(n_bins=10, quantile_method="averaged_inverted_cdf", encode="onehot"),
            make_column_selector(dtype_exclude="category"),
        ),
        (
            "cat_ohe",
            OneHotEncoder(handle_unknown="ignore"),
            make_column_selector(dtype_include="category"),
        ),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: penwings
|
|
3
|
+
Version: 0.1.1b0
|
|
4
|
+
Author-email: Frissie <R.Blanckaert@outlook.com>
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: sqlalchemy<3.0.0,>=2.0.46
|
|
7
|
+
Requires-Dist: pyodbc<6.0.0,>=5.3.0
|
|
8
|
+
Requires-Dist: pandas<4.0.0,>=3.0.0
|
|
9
|
+
Requires-Dist: numpy<3.0.0,>=2.4.1
|
|
10
|
+
Provides-Extra: scipy
|
|
11
|
+
Requires-Dist: scipy<2.0.0,>=1.17.0; extra == "scipy"
|
|
12
|
+
Provides-Extra: sklearn
|
|
13
|
+
Requires-Dist: scikit-learn<2.0.0,>=1.8.0; extra == "sklearn"
|
|
14
|
+
Provides-Extra: optuna
|
|
15
|
+
Requires-Dist: optuna<5.0.0,>=4.7.0; extra == "optuna"
|
|
16
|
+
Provides-Extra: all
|
|
17
|
+
Requires-Dist: openpyxl<4.0.0,>=3.1.5; extra == "all"
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
.gitignore
|
|
2
|
+
pyproject.toml
|
|
3
|
+
uv.lock
|
|
4
|
+
src/penwings/__init__.py
|
|
5
|
+
src/penwings/paths.py
|
|
6
|
+
src/penwings/tuner.py
|
|
7
|
+
src/penwings/views.py
|
|
8
|
+
src/penwings.egg-info/PKG-INFO
|
|
9
|
+
src/penwings.egg-info/SOURCES.txt
|
|
10
|
+
src/penwings.egg-info/dependency_links.txt
|
|
11
|
+
src/penwings.egg-info/requires.txt
|
|
12
|
+
src/penwings.egg-info/top_level.txt
|
|
13
|
+
src/penwings/_utils/__init__.py
|
|
14
|
+
src/penwings/_utils/_decorators.py
|
|
15
|
+
src/penwings/_utils/_typing.py
|
|
16
|
+
src/penwings/io/__init__.py
|
|
17
|
+
src/penwings/io/cache.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
sqlalchemy<3.0.0,>=2.0.46
|
|
2
|
+
pyodbc<6.0.0,>=5.3.0
|
|
3
|
+
pandas<4.0.0,>=3.0.0
|
|
4
|
+
numpy<3.0.0,>=2.4.1
|
|
5
|
+
|
|
6
|
+
[all]
|
|
7
|
+
openpyxl<4.0.0,>=3.1.5
|
|
8
|
+
|
|
9
|
+
[optuna]
|
|
10
|
+
optuna<5.0.0,>=4.7.0
|
|
11
|
+
|
|
12
|
+
[scipy]
|
|
13
|
+
scipy<2.0.0,>=1.17.0
|
|
14
|
+
|
|
15
|
+
[sklearn]
|
|
16
|
+
scikit-learn<2.0.0,>=1.8.0
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
penwings
|