penwings-0.1.1b0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+
+ # Virtual environments
+ .venv
@@ -0,0 +1,17 @@
1
+ Metadata-Version: 2.4
2
+ Name: penwings
3
+ Version: 0.1.1b0
4
+ Author-email: Frissie <R.Blanckaert@outlook.com>
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: sqlalchemy<3.0.0,>=2.0.46
7
+ Requires-Dist: pyodbc<6.0.0,>=5.3.0
8
+ Requires-Dist: pandas<4.0.0,>=3.0.0
9
+ Requires-Dist: numpy<3.0.0,>=2.4.1
10
+ Provides-Extra: scipy
11
+ Requires-Dist: scipy<2.0.0,>=1.17.0; extra == "scipy"
12
+ Provides-Extra: sklearn
13
+ Requires-Dist: scikit-learn<2.0.0,>=1.8.0; extra == "sklearn"
14
+ Provides-Extra: optuna
15
+ Requires-Dist: optuna<5.0.0,>=4.7.0; extra == "optuna"
16
+ Provides-Extra: all
17
+ Requires-Dist: openpyxl<4.0.0,>=3.1.5; extra == "all"
@@ -0,0 +1,43 @@
1
+ [project]
2
+ name = "penwings"
3
+ dynamic = ["version"]
4
+ description = ""
5
+ authors = [
6
+ {name = "Frissie",email = "R.Blanckaert@outlook.com"}
7
+ ]
8
+ requires-python = ">=3.11"
9
+ dependencies = [
10
+ "sqlalchemy (>=2.0.46,<3.0.0)",
11
+ "pyodbc (>=5.3.0,<6.0.0)",
12
+ "pandas (>=3.0.0,<4.0.0)",
13
+ "numpy (>=2.4.1,<3.0.0)"
14
+ ]
15
+
16
+ [tool.setuptools_scm]
17
+ tag_regex = "^v(?P<version>.*)$"
18
+
19
+ [tool.setuptools]
20
+ package-dir = {"" = "src"}
21
+
22
+ [tool.setuptools.packages.find]
23
+ where = ["src"]
24
+ include = ["penwings*"]
25
+ exclude = ["penwings._*"]
26
+
27
+ [build-system]
28
+ requires = ["setuptools>=68", "wheel", "setuptools-scm"]
29
+ build-backend = "setuptools.build_meta"
30
+
31
+ [dependency-groups]
32
+ dev = [
33
+ "openpyxl>=3.1.5",
34
+ "optuna>=4.7.0",
35
+ "scikit-learn>=1.8.0",
36
+ "scipy>=1.17.0",
37
+ ]
38
+
39
+ [project.optional-dependencies]
40
+ scipy = ["scipy (>=1.17.0,<2.0.0)"]
41
+ sklearn = ["scikit-learn (>=1.8.0,<2.0.0)"]
42
+ optuna= ["optuna (>=4.7.0,<5.0.0)"]
43
+ all = ["openpyxl (>=3.1.5,<4.0.0)"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,9 @@
1
+ from .io.cache import SQLParquetCache
2
+ from .paths import input_dir, output_dir, model_dir
3
+
4
+ __all__ = [
5
+ "SQLParquetCache",
6
+ "input_dir",
7
+ "output_dir",
8
+ "model_dir",
9
+ ]
File without changes
@@ -0,0 +1,38 @@
1
+ import time as t
2
+
3
+ from functools import wraps
4
+ from pathlib import Path
5
+
6
+
7
+ def timing(func):
8
+ @wraps(func)
9
+ def wrapper(*args, **kwargs):
10
+ start = t.perf_counter()
11
+ result = func(*args, **kwargs)
12
+ end = t.perf_counter()
13
+ print(f"{func.__name__} took {end - start: .2f}")
14
+ return result
15
+
16
+ return wrapper
17
+
18
+
19
+ def timing_sql(func):
20
+ @wraps(func)
21
+ def wrapper(*args, **kwargs):
22
+ sql_file = kwargs.get("sql_file", None)
23
+ verbose = getattr(args[0], "verbose", True)
24
+
25
+ if sql_file is None and len(args) > 1:
26
+ sql_file = args[1]
27
+
28
+ sql_file = Path(sql_file)
29
+
30
+ start = t.perf_counter()
31
+ result, source = func(*args, **kwargs)
32
+ end = t.perf_counter()
33
+
34
+ if verbose:
35
+ print(f"{sql_file.stem} -> {source} took {end - start: .2f} seconds to load")
36
+ return result
37
+
38
+ return wrapper
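A minimal usage sketch (not part of the package) for the timing decorator; the function below and its sleep duration are purely illustrative, and the import reaches into the private _utils module only for demonstration:

    import time

    from penwings._utils._decorators import timing

    @timing
    def slow_add(a: int, b: int) -> int:
        time.sleep(0.2)  # stand-in for real work
        return a + b

    slow_add(1, 2)  # prints something like "slow_add took  0.20" (no units in the current format string)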
@@ -0,0 +1,7 @@
1
+ from typing import TypedDict, List, Union
2
+
3
+
4
+ class SQLParquetKwargs(TypedDict, total=False):
5
+ index_col: Union[str, List[str], None]
6
+ parse_dates: Union[List[str], None]
7
+ dtype: Union[dict, None]
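SQLParquetKwargs is consumed with typing.Unpack in cache.py so that type checkers restrict the keyword arguments forwarded to pandas.read_sql. A small sketch of that pattern, with describe_query as a hypothetical function:

    from typing import Unpack

    from penwings._utils._typing import SQLParquetKwargs


    def describe_query(label: str, **kwargs: Unpack[SQLParquetKwargs]) -> None:
        # a type checker will flag any keyword other than index_col, parse_dates or dtype
        print(label, kwargs)


    describe_query("orders", index_col="order_id", parse_dates=["order_date"])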
File without changes
@@ -0,0 +1,93 @@
1
+ import pandas as pd
2
+
3
+ from sqlalchemy import Engine
4
+ from pathlib import Path
5
+ from datetime import datetime, timedelta
6
+ from typing import Unpack, Union, Optional
7
+ from .._utils._typing import SQLParquetKwargs
8
+ from .._utils._decorators import timing_sql
9
+
10
+
11
+ class SQLParquetCache:
12
+ def __init__(
13
+ self,
14
+ parquet_dir: Union[Path, str],
15
+ conn: Engine,
16
+ sql_dir: Optional[Union[Path, str]] = None,
17
+ refresh_days: int = 0, # zero disables refresh when force == false
18
+ verbose: bool = True,
19
+ **kwargs: Unpack[SQLParquetKwargs],
20
+ ):
21
+
22
+ if sql_dir is not None:
23
+ self.sql_dir: Path = Path(sql_dir)
24
+ self.parquet_dir: Path = Path(parquet_dir)
25
+ self.refresh_days = refresh_days
26
+ self.conn = conn
27
+ self.global_kwargs = kwargs
28
+
29
+ self.verbose = verbose
30
+ self.source = "SQL"
31
+
32
+ def set_params(self, **params):
33
+ for key, value in params.items():
34
+ if not hasattr(self, key):
35
+ raise ValueError(f"Invalid parameter: {key}")
36
+ setattr(self, key, value)
37
+ return self
38
+
39
+ def _sql_path(self, sql_file: str) -> Path:
40
+ return self.sql_dir / sql_file
41
+
42
+ def _parquet_path(self, sql_file: str, parquet_name: str | None = None) -> Path:
43
+ name = parquet_name or Path(sql_file).stem
44
+ return self.parquet_dir / f"{name}.parquet"
45
+
46
+ def _is_new(self, path: Path, refresh_window: int) -> bool:
47
+ if not path.exists():
48
+ return False
49
+ if self.refresh_days == 0:
50
+ return True
51
+ last_modified = datetime.fromtimestamp(path.stat().st_mtime)
52
+ return datetime.now() - last_modified < timedelta(days=refresh_window)
53
+
54
+ def _read_sql(self, sql_file: str):
55
+ return self._sql_path(sql_file).read_text()
56
+
57
+ def _return_sql(self, query: str, conn, **kwargs: Unpack[SQLParquetKwargs]) -> pd.DataFrame:
58
+ return pd.read_sql(query, conn, **kwargs)
59
+
60
+ @timing_sql
61
+ def get(
62
+ self,
63
+ sql: str,
64
+ parquet_name: Union[str, None] = None,
65
+ conn: Engine | None = None,
66
+ refresh_days: int | None = None,
67
+ force: bool = False,
68
+ **kwargs: Unpack[SQLParquetKwargs],
69
+ ) -> tuple[pd.DataFrame, str]:
70
+ if isinstance(sql, str) and Path(sql).suffix == ".sql":
71
+ query = self._read_sql(sql)
72
+ elif isinstance(sql, str):
73
+ if parquet_name is None:
74
+ raise ValueError("parquet_name must be provided if query is passed directly")
75
+ query = sql
76
+ else:
77
+ raise ValueError("sql must be a SQL string or a path to a .sql file")
78
+
79
+ connection = conn or self.conn
80
+ refresh_window = refresh_days or self.refresh_days
81
+ parquet_path = self._parquet_path(query)
82
+ sql_kwargs = self.global_kwargs | kwargs
83
+
84
+ if not force and self._is_new(parquet_path, refresh_window):
85
+ source = "Parquet"
86
+ return pd.read_parquet(parquet_path), source
87
+
88
+ source = "SQL"
89
+ df = self._return_sql(query, connection, **sql_kwargs)
90
+ self.parquet_dir.mkdir(parents=True, exist_ok=True)
91
+ df.to_parquet(parquet_path, index=False)
92
+
93
+ return df, source
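A hedged usage sketch for SQLParquetCache based on the signatures above. The connection string, directories and query file are placeholders the sketch assumes exist; note that the timing_sql decorator unwraps the (DataFrame, source) tuple, so get() hands back only the DataFrame:

    from pathlib import Path

    from sqlalchemy import create_engine

    from penwings import SQLParquetCache

    engine = create_engine("sqlite:///example.db")  # placeholder; any SQLAlchemy engine works

    cache = SQLParquetCache(
        parquet_dir=Path("cache/parquet"),
        conn=engine,
        sql_dir=Path("sql"),
        refresh_days=7,  # re-query the database once the cached parquet is a week old
    )

    # first call runs sql/customers.sql and writes cache/parquet/customers.parquet;
    # later calls inside the refresh window read the parquet file instead
    customers = cache.get("customers.sql")

    # a raw query string requires an explicit parquet_name for the cache file
    ones = cache.get("SELECT 1 AS one", parquet_name="one")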
@@ -0,0 +1,15 @@
1
+ import pathlib
2
+
3
+ home_dir = pathlib.Path.cwd()
4
+ proj_dir = pathlib.Path.cwd().parent
5
+
6
+ input_dir = home_dir / "input"
7
+ model_dir = home_dir / "model"
8
+ output_dir = home_dir / "output"
9
+
10
+ if __name__ == "__main__":
11
+ i = 1
12
+ for name, value in dict(locals()).items():
13
+ if isinstance(value, pathlib.Path):
14
+ print(f"{i} - {name}: {value}")
15
+ i += 1
@@ -0,0 +1,42 @@
1
+ from optuna.trial import Trial
2
+
3
+
4
+ def tune_lgbm_params(trial: Trial, model="classifier"):
5
+ if model == "classifier":
6
+ metrics = {
7
+ "objective": "binary",
8
+ "metric": "auc",
9
+ }
10
+
11
+ params = {
12
+ # Core
13
+ "verbosity": -1,
14
+ "boosting_type": "gbdt",
15
+ # GPU
16
+ "device": "gpu",
17
+ "gpu_platform_id": 0,
18
+ "gpu_device_id": 0,
19
+ # Learning
20
+ "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.05, log=True),
21
+ "n_estimators": trial.suggest_int("n_estimators", 1500, 5000),
22
+ # Tree structure (GPU-safe)
23
+ "num_leaves": trial.suggest_int("num_leaves", 31, 128),
24
+ "max_depth": trial.suggest_int("max_depth", 4, 10),
25
+ # Regularization / stability
26
+ "min_child_samples": trial.suggest_int("min_child_samples", 10, 80),
27
+ "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 10.0, log=True),
28
+ "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 1.0),
29
+ # Sampling
30
+ "subsample": trial.suggest_float("subsample", 0.6, 1.0),
31
+ "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
32
+ "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
33
+ # Regularization
34
+ "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
35
+ "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 5.0, log=True),
36
+ # Histogram
37
+ "max_bin": trial.suggest_int("max_bin", 64, 255),
38
+ # Class imbalance (keep only if needed)
39
+ "scale_pos_weight": trial.suggest_float("scale_pos_weight", 0.8, 3.0),
40
+ }
41
+
42
+ return metrics | params
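A hedged sketch of driving tune_lgbm_params from an Optuna study. lightgbm is not declared as a dependency in the metadata above, and the search space hard-codes device="gpu", so this only runs on a GPU-enabled LightGBM build; treat it as an illustration of the intended call pattern:

    import optuna
    from lightgbm import LGBMClassifier
    from sklearn.datasets import make_classification
    from sklearn.model_selection import cross_val_score

    from penwings.tuner import tune_lgbm_params

    X, y = make_classification(n_samples=2000, n_features=20, random_state=42)


    def objective(trial: optuna.trial.Trial) -> float:
        params = tune_lgbm_params(trial)  # binary/auc defaults merged with the sampled hyperparameters
        model = LGBMClassifier(**params)
        return cross_val_score(model, X, y, cv=3, scoring="roc_auc").mean()


    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=5)
    print(study.best_params)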
@@ -0,0 +1,79 @@
1
+ from sklearn.pipeline import Pipeline
2
+ from sklearn.compose import ColumnTransformer, make_column_selector
3
+ from sklearn.preprocessing import (
4
+ TargetEncoder,
5
+ OneHotEncoder,
6
+ RobustScaler,
7
+ KBinsDiscretizer,
8
+ FunctionTransformer,
9
+ PolynomialFeatures,
10
+ OrdinalEncoder,
11
+ StandardScaler,
12
+ )
13
+
14
+ LinearView = ColumnTransformer(
15
+ [
16
+ ("numerical", RobustScaler(), make_column_selector(dtype_exclude="category")),
17
+ ("category", TargetEncoder(shuffle=True, smooth=10, random_state=42), make_column_selector(dtype_include="category")),
18
+ ],
19
+ remainder="drop",
20
+ verbose_feature_names_out=False,
21
+ ).set_output(transform="pandas")
22
+
23
+ DenseView = ColumnTransformer(
24
+ [
25
+ ("numerical", StandardScaler(), make_column_selector(dtype_exclude="category")),
26
+ ("category", TargetEncoder(shuffle=True, smooth=10, random_state=42), make_column_selector(dtype_include="category")),
27
+ ],
28
+ remainder="drop",
29
+ verbose_feature_names_out=False,
30
+ ).set_output(transform="pandas")
31
+
32
+ CategoricalView = Pipeline(
33
+ [
34
+ (
35
+ "bins",
36
+ ColumnTransformer(
37
+ [
38
+ (
39
+ "numerical",
40
+ KBinsDiscretizer(n_bins=4, strategy="quantile", quantile_method="averaged_inverted_cdf", encode="ordinal"),
41
+ make_column_selector(dtype_exclude="category"),
42
+ ),
43
+ (
44
+ "category",
45
+ OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
46
+ make_column_selector(dtype_include="category"),
47
+ ),
48
+ ],
49
+ remainder="drop",
50
+ verbose_feature_names_out=False,
51
+ ).set_output(transform="pandas"),
52
+ ),
53
+ ("cats", FunctionTransformer(lambda df: df.astype(int).astype("category"), feature_names_out="one-to-one")),
54
+ ]
55
+ )
56
+
57
+ PolynomialView = Pipeline(
58
+ [
59
+ ("Linear", LinearView),
60
+ ("poly", PolynomialFeatures(degree=2).set_output(transform="pandas")),
61
+ ]
62
+ )
63
+
64
+ SparseView = ColumnTransformer(
65
+ [
66
+ (
67
+ "num_bins",
68
+ KBinsDiscretizer(n_bins=10, quantile_method="averaged_inverted_cdf", encode="onehot"),
69
+ make_column_selector(dtype_exclude="category"),
70
+ ),
71
+ (
72
+ "cat_ohe",
73
+ OneHotEncoder(handle_unknown="ignore"),
74
+ make_column_selector(dtype_include="category"),
75
+ ),
76
+ ],
77
+ remainder="drop",
78
+ verbose_feature_names_out=False,
79
+ )
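A hedged example of fitting one of these views inside a supervised scikit-learn pipeline; the DataFrame is synthetic and the estimator choice is arbitrary. TargetEncoder needs the target at fit time, which is why the view is fitted as a pipeline step rather than on its own:

    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline

    from penwings.views import LinearView

    rng = np.random.default_rng(0)
    X = pd.DataFrame(
        {
            "amount": rng.normal(100, 25, size=500),
            "channel": pd.Categorical(rng.choice(["web", "store", "phone"], size=500)),
        }
    )
    y = (X["amount"] > 100).astype(int)

    clf = Pipeline(
        [
            ("view", LinearView),  # RobustScaler for numeric columns, TargetEncoder for categoricals
            ("model", LogisticRegression()),
        ]
    )
    clf.fit(X, y)
    print(clf.score(X, y))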
@@ -0,0 +1,17 @@
1
+ Metadata-Version: 2.4
2
+ Name: penwings
3
+ Version: 0.1.1b0
4
+ Author-email: Frissie <R.Blanckaert@outlook.com>
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: sqlalchemy<3.0.0,>=2.0.46
7
+ Requires-Dist: pyodbc<6.0.0,>=5.3.0
8
+ Requires-Dist: pandas<4.0.0,>=3.0.0
9
+ Requires-Dist: numpy<3.0.0,>=2.4.1
10
+ Provides-Extra: scipy
11
+ Requires-Dist: scipy<2.0.0,>=1.17.0; extra == "scipy"
12
+ Provides-Extra: sklearn
13
+ Requires-Dist: scikit-learn<2.0.0,>=1.8.0; extra == "sklearn"
14
+ Provides-Extra: optuna
15
+ Requires-Dist: optuna<5.0.0,>=4.7.0; extra == "optuna"
16
+ Provides-Extra: all
17
+ Requires-Dist: openpyxl<4.0.0,>=3.1.5; extra == "all"
@@ -0,0 +1,17 @@
1
+ .gitignore
2
+ pyproject.toml
3
+ uv.lock
4
+ src/penwings/__init__.py
5
+ src/penwings/paths.py
6
+ src/penwings/tuner.py
7
+ src/penwings/views.py
8
+ src/penwings.egg-info/PKG-INFO
9
+ src/penwings.egg-info/SOURCES.txt
10
+ src/penwings.egg-info/dependency_links.txt
11
+ src/penwings.egg-info/requires.txt
12
+ src/penwings.egg-info/top_level.txt
13
+ src/penwings/_utils/__init__.py
14
+ src/penwings/_utils/_decorators.py
15
+ src/penwings/_utils/_typing.py
16
+ src/penwings/io/__init__.py
17
+ src/penwings/io/cache.py
@@ -0,0 +1,16 @@
1
+ sqlalchemy<3.0.0,>=2.0.46
2
+ pyodbc<6.0.0,>=5.3.0
3
+ pandas<4.0.0,>=3.0.0
4
+ numpy<3.0.0,>=2.4.1
5
+
6
+ [all]
7
+ openpyxl<4.0.0,>=3.1.5
8
+
9
+ [optuna]
10
+ optuna<5.0.0,>=4.7.0
11
+
12
+ [scipy]
13
+ scipy<2.0.0,>=1.17.0
14
+
15
+ [sklearn]
16
+ scikit-learn<2.0.0,>=1.8.0
@@ -0,0 +1 @@
+ penwings