penwings-0.1.1b0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+
+ # Virtual environments
+ .venv
@@ -0,0 +1,17 @@
1
+ Metadata-Version: 2.4
2
+ Name: penwings
3
+ Version: 0.1.1b0
4
+ Author-email: Frissie <R.Blanckaert@outlook.com>
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: sqlalchemy<3.0.0,>=2.0.46
7
+ Requires-Dist: pyodbc<6.0.0,>=5.3.0
8
+ Requires-Dist: pandas<4.0.0,>=3.0.0
9
+ Requires-Dist: numpy<3.0.0,>=2.4.1
10
+ Provides-Extra: scipy
11
+ Requires-Dist: scipy<2.0.0,>=1.17.0; extra == "scipy"
12
+ Provides-Extra: sklearn
13
+ Requires-Dist: scikit-learn<2.0.0,>=1.8.0; extra == "sklearn"
14
+ Provides-Extra: optuna
15
+ Requires-Dist: optuna<5.0.0,>=4.7.0; extra == "optuna"
16
+ Provides-Extra: all
17
+ Requires-Dist: openpyxl<4.0.0,>=3.1.5; extra == "all"
@@ -0,0 +1,43 @@
1
+ [project]
2
+ name = "penwings"
3
+ dynamic = ["version"]
4
+ description = ""
5
+ authors = [
6
+ {name = "Frissie",email = "R.Blanckaert@outlook.com"}
7
+ ]
8
+ requires-python = ">=3.11"
9
+ dependencies = [
10
+ "sqlalchemy (>=2.0.46,<3.0.0)",
11
+ "pyodbc (>=5.3.0,<6.0.0)",
12
+ "pandas (>=3.0.0,<4.0.0)",
13
+ "numpy (>=2.4.1,<3.0.0)"
14
+ ]
15
+
16
+ [tool.setuptools_scm]
17
+ tag_regex = "^v(?P<version>.*)$"
18
+
19
+ [tool.setuptools]
20
+ package-dir = {"" = "src"}
21
+
22
+ [tool.setuptools.packages.find]
23
+ where = ["src"]
24
+ include = ["penwings*"]
25
+ exclude = ["penwings._*"]
26
+
27
+ [build-system]
28
+ requires = ["setuptools>=68", "wheel", "setuptools-scm"]
29
+ build-backend = "setuptools.build_meta"
30
+
31
+ [dependency-groups]
32
+ dev = [
33
+ "openpyxl>=3.1.5",
34
+ "optuna>=4.7.0",
35
+ "scikit-learn>=1.8.0",
36
+ "scipy>=1.17.0",
37
+ ]
38
+
39
+ [project.optional-dependencies]
40
+ scipy = ["scipy (>=1.17.0,<2.0.0)"]
41
+ sklearn = ["scikit-learn (>=1.8.0,<2.0.0)"]
42
+ optuna= ["optuna (>=4.7.0,<5.0.0)"]
43
+ all = ["openpyxl (>=3.1.5,<4.0.0)"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,9 @@
1
+ from .io.cache import SQLParquetCache
2
+ from .paths import input_dir, output_dir, model_dir
3
+
4
+ __all__ = [
5
+ "SQLParquetCache",
6
+ "input_dir",
7
+ "output_dir",
8
+ "model_dir",
9
+ ]
File without changes
@@ -0,0 +1,38 @@
1
+ import time as t
2
+
3
+ from functools import wraps
4
+ from pathlib import Path
5
+
6
+
7
+ def timing(func):
8
+ @wraps(func)
9
+ def wrapper(*args, **kwargs):
10
+ start = t.perf_counter()
11
+ result = func(*args, **kwargs)
12
+ end = t.perf_counter()
13
+ print(f"{func.__name__} took {end - start: .2f}")
14
+ return result
15
+
16
+ return wrapper
17
+
18
+
19
+ def timing_sql(func):
20
+ @wraps(func)
21
+ def wrapper(*args, **kwargs):
22
+ sql_file = kwargs.get("sql_file", None)
23
+ verbose = getattr(args[0], "verbose", True)
24
+
25
+ if sql_file is None and len(args) > 1:
26
+ sql_file = args[1]
27
+
28
+ sql_file = Path(sql_file)
29
+
30
+ start = t.perf_counter()
31
+ result, source = func(*args, **kwargs)
32
+ end = t.perf_counter()
33
+
34
+ if verbose:
35
+ print(f"{sql_file.stem} -> {source} took {end - start: .2f} seconds to load")
36
+ return result
37
+
38
+ return wrapper
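A minimal usage sketch (not part of the package) for the timing decorator; the function below and its sleep duration are purely illustrative, and the import reaches into the private _utils module only for demonstration:

    import time

    from penwings._utils._decorators import timing

    @timing
    def slow_add(a: int, b: int) -> int:
        time.sleep(0.2)  # stand-in for real work
        return a + b

    slow_add(1, 2)  # prints something like "slow_add took  0.20" (no units in the current format string)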
@@ -0,0 +1,7 @@
1
+ from typing import TypedDict, List, Union
2
+
3
+
4
+ class SQLParquetKwargs(TypedDict, total=False):
5
+ index_col: Union[str, List[str], None]
6
+ parse_dates: Union[List[str], None]
7
+ dtype: Union[dict, None]
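SQLParquetKwargs is consumed with typing.Unpack in cache.py so that type checkers restrict the keyword arguments forwarded to pandas.read_sql. A small sketch of that pattern, with describe_query as a hypothetical function:

    from typing import Unpack

    from penwings._utils._typing import SQLParquetKwargs


    def describe_query(label: str, **kwargs: Unpack[SQLParquetKwargs]) -> None:
        # a type checker will flag any keyword other than index_col, parse_dates or dtype
        print(label, kwargs)


    describe_query("orders", index_col="order_id", parse_dates=["order_date"])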
File without changes
@@ -0,0 +1,93 @@
1
+ import pandas as pd
2
+
3
+ from sqlalchemy import Engine
4
+ from pathlib import Path
5
+ from datetime import datetime, timedelta
6
+ from typing import Unpack, Union, Optional
7
+ from .._utils._typing import SQLParquetKwargs
8
+ from .._utils._decorators import timing_sql
9
+
10
+
11
+ class SQLParquetCache:
12
+ def __init__(
13
+ self,
14
+ parquet_dir: Union[Path, str],
15
+ conn: Engine,
16
+ sql_dir: Optional[Union[Path, str]] = None,
17
+ refresh_days: int = 0, # zero disables refresh when force == false
18
+ verbose: bool = True,
19
+ **kwargs: Unpack[SQLParquetKwargs],
20
+ ):
21
+
22
+ if sql_dir is not None:
23
+ self.sql_dir: Path = Path(sql_dir)
24
+ self.parquet_dir: Path = Path(parquet_dir)
25
+ self.refresh_days = refresh_days
26
+ self.conn = conn
27
+ self.global_kwargs = kwargs
28
+
29
+ self.verbose = verbose
30
+ self.source = "SQL"
31
+
32
+ def set_params(self, **params):
33
+ for key, value in params.items():
34
+ if not hasattr(self, key):
35
+ raise ValueError(f"Invalid parameter: {key}")
36
+ setattr(self, key, value)
37
+ return self
38
+
39
+ def _sql_path(self, sql_file: str) -> Path:
40
+ return self.sql_dir / sql_file
41
+
42
+ def _parquet_path(self, sql_file: str, parquet_name: str | None = None) -> Path:
43
+ name = parquet_name or Path(sql_file).stem
44
+ return self.parquet_dir / f"{name}.parquet"
45
+
46
+ def _is_new(self, path: Path, refresh_window: int) -> bool:
47
+ if not path.exists():
48
+ return False
49
+ if self.refresh_days == 0:
50
+ return True
51
+ last_modified = datetime.fromtimestamp(path.stat().st_mtime)
52
+ return datetime.now() - last_modified < timedelta(days=refresh_window)
53
+
54
+ def _read_sql(self, sql_file: str):
55
+ return self._sql_path(sql_file).read_text()
56
+
57
+ def _return_sql(self, query: str, conn, **kwargs: Unpack[SQLParquetKwargs]) -> pd.DataFrame:
58
+ return pd.read_sql(query, conn, **kwargs)
59
+
60
+ @timing_sql
61
+ def get(
62
+ self,
63
+ sql: str,
64
+ parquet_name: Union[str, None] = None,
65
+ conn: Engine | None = None,
66
+ refresh_days: int | None = None,
67
+ force: bool = False,
68
+ **kwargs: Unpack[SQLParquetKwargs],
69
+ ) -> tuple[pd.DataFrame, str]:
70
+ if isinstance(sql, str) and Path(sql).suffix == ".sql":
71
+ query = self._read_sql(sql)
72
+ elif isinstance(sql, str):
73
+ if parquet_name is None:
74
+ raise ValueError("parquet_name must be provided if query is passed directly")
75
+ query = sql
76
+ else:
77
+ raise ValueError("sql must be a SQL string or a path to a .sql file")
78
+
79
+ connection = conn or self.conn
80
+ refresh_window = refresh_days or self.refresh_days
81
+ parquet_path = self._parquet_path(query)
82
+ sql_kwargs = self.global_kwargs | kwargs
83
+
84
+ if not force and self._is_new(parquet_path, refresh_window):
85
+ source = "Parquet"
86
+ return pd.read_parquet(parquet_path), source
87
+
88
+ source = "SQL"
89
+ df = self._return_sql(query, connection, **sql_kwargs)
90
+ self.parquet_dir.mkdir(parents=True, exist_ok=True)
91
+ df.to_parquet(parquet_path, index=False)
92
+
93
+ return df, source
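A hedged usage sketch for SQLParquetCache based on the signatures above. The connection string, directories and query file are placeholders the sketch assumes exist; note that the timing_sql decorator unwraps the (DataFrame, source) tuple, so get() hands back only the DataFrame:

    from pathlib import Path

    from sqlalchemy import create_engine

    from penwings import SQLParquetCache

    engine = create_engine("sqlite:///example.db")  # placeholder; any SQLAlchemy engine works

    cache = SQLParquetCache(
        parquet_dir=Path("cache/parquet"),
        conn=engine,
        sql_dir=Path("sql"),
        refresh_days=7,  # re-query the database once the cached parquet is a week old
    )

    # first call runs sql/customers.sql and writes cache/parquet/customers.parquet;
    # later calls inside the refresh window read the parquet file instead
    customers = cache.get("customers.sql")

    # a raw query string requires an explicit parquet_name for the cache file
    ones = cache.get("SELECT 1 AS one", parquet_name="one")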
@@ -0,0 +1,15 @@
1
+ import pathlib
2
+
3
+ home_dir = pathlib.Path.cwd()
4
+ proj_dir = pathlib.Path.cwd().parent
5
+
6
+ input_dir = home_dir / "input"
7
+ model_dir = home_dir / "model"
8
+ output_dir = home_dir / "output"
9
+
10
+ if __name__ == "__main__":
11
+ i = 1
12
+ for name, value in dict(locals()).items():
13
+ if isinstance(value, pathlib.Path):
14
+ print(f"{i} - {name}: {value}")
15
+ i += 1
@@ -0,0 +1,42 @@
1
+ from optuna.trial import Trial
2
+
3
+
4
+ def tune_lgbm_params(trial: Trial, model="classifier"):
5
+ if model == "classifier":
6
+ metrics = {
7
+ "objective": "binary",
8
+ "metric": "auc",
9
+ }
10
+
11
+ params = {
12
+ # Core
13
+ "verbosity": -1,
14
+ "boosting_type": "gbdt",
15
+ # GPU
16
+ "device": "gpu",
17
+ "gpu_platform_id": 0,
18
+ "gpu_device_id": 0,
19
+ # Learning
20
+ "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.05, log=True),
21
+ "n_estimators": trial.suggest_int("n_estimators", 1500, 5000),
22
+ # Tree structure (GPU-safe)
23
+ "num_leaves": trial.suggest_int("num_leaves", 31, 128),
24
+ "max_depth": trial.suggest_int("max_depth", 4, 10),
25
+ # Regularization / stability
26
+ "min_child_samples": trial.suggest_int("min_child_samples", 10, 80),
27
+ "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 10.0, log=True),
28
+ "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 1.0),
29
+ # Sampling
30
+ "subsample": trial.suggest_float("subsample", 0.6, 1.0),
31
+ "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
32
+ "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
33
+ # Regularization
34
+ "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
35
+ "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 5.0, log=True),
36
+ # Histogram
37
+ "max_bin": trial.suggest_int("max_bin", 64, 255),
38
+ # Class imbalance (keep only if needed)
39
+ "scale_pos_weight": trial.suggest_float("scale_pos_weight", 0.8, 3.0),
40
+ }
41
+
42
+ return metrics | params
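A hedged sketch of driving tune_lgbm_params from an Optuna study. lightgbm is not declared as a dependency in the metadata above, and the search space hard-codes device="gpu", so this only runs on a GPU-enabled LightGBM build; treat it as an illustration of the intended call pattern:

    import optuna
    from lightgbm import LGBMClassifier
    from sklearn.datasets import make_classification
    from sklearn.model_selection import cross_val_score

    from penwings.tuner import tune_lgbm_params

    X, y = make_classification(n_samples=2000, n_features=20, random_state=42)


    def objective(trial: optuna.trial.Trial) -> float:
        params = tune_lgbm_params(trial)  # binary/auc defaults merged with the sampled hyperparameters
        model = LGBMClassifier(**params)
        return cross_val_score(model, X, y, cv=3, scoring="roc_auc").mean()


    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=5)
    print(study.best_params)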
@@ -0,0 +1,79 @@
1
+ from sklearn.pipeline import Pipeline
2
+ from sklearn.compose import ColumnTransformer, make_column_selector
3
+ from sklearn.preprocessing import (
4
+ TargetEncoder,
5
+ OneHotEncoder,
6
+ RobustScaler,
7
+ KBinsDiscretizer,
8
+ FunctionTransformer,
9
+ PolynomialFeatures,
10
+ OrdinalEncoder,
11
+ StandardScaler,
12
+ )
13
+
14
+ LinearView = ColumnTransformer(
15
+ [
16
+ ("numerical", RobustScaler(), make_column_selector(dtype_exclude="category")),
17
+ ("category", TargetEncoder(shuffle=True, smooth=10, random_state=42), make_column_selector(dtype_include="category")),
18
+ ],
19
+ remainder="drop",
20
+ verbose_feature_names_out=False,
21
+ ).set_output(transform="pandas")
22
+
23
+ DenseView = ColumnTransformer(
24
+ [
25
+ ("numerical", StandardScaler(), make_column_selector(dtype_exclude="category")),
26
+ ("category", TargetEncoder(shuffle=True, smooth=10, random_state=42), make_column_selector(dtype_include="category")),
27
+ ],
28
+ remainder="drop",
29
+ verbose_feature_names_out=False,
30
+ ).set_output(transform="pandas")
31
+
32
+ CategoricalView = Pipeline(
33
+ [
34
+ (
35
+ "bins",
36
+ ColumnTransformer(
37
+ [
38
+ (
39
+ "numerical",
40
+ KBinsDiscretizer(n_bins=4, strategy="quantile", quantile_method="averaged_inverted_cdf", encode="ordinal"),
41
+ make_column_selector(dtype_exclude="category"),
42
+ ),
43
+ (
44
+ "category",
45
+ OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
46
+ make_column_selector(dtype_include="category"),
47
+ ),
48
+ ],
49
+ remainder="drop",
50
+ verbose_feature_names_out=False,
51
+ ).set_output(transform="pandas"),
52
+ ),
53
+ ("cats", FunctionTransformer(lambda df: df.astype(int).astype("category"), feature_names_out="one-to-one")),
54
+ ]
55
+ )
56
+
57
+ PolynomialView = Pipeline(
58
+ [
59
+ ("Linear", LinearView),
60
+ ("poly", PolynomialFeatures(degree=2).set_output(transform="pandas")),
61
+ ]
62
+ )
63
+
64
+ SparseView = ColumnTransformer(
65
+ [
66
+ (
67
+ "num_bins",
68
+ KBinsDiscretizer(n_bins=10, quantile_method="averaged_inverted_cdf", encode="onehot"),
69
+ make_column_selector(dtype_exclude="category"),
70
+ ),
71
+ (
72
+ "cat_ohe",
73
+ OneHotEncoder(handle_unknown="ignore"),
74
+ make_column_selector(dtype_include="category"),
75
+ ),
76
+ ],
77
+ remainder="drop",
78
+ verbose_feature_names_out=False,
79
+ )
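A hedged example of fitting one of these views inside a supervised scikit-learn pipeline; the DataFrame is synthetic and the estimator choice is arbitrary. TargetEncoder needs the target at fit time, which is why the view is fitted as a pipeline step rather than on its own:

    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline

    from penwings.views import LinearView

    rng = np.random.default_rng(0)
    X = pd.DataFrame(
        {
            "amount": rng.normal(100, 25, size=500),
            "channel": pd.Categorical(rng.choice(["web", "store", "phone"], size=500)),
        }
    )
    y = (X["amount"] > 100).astype(int)

    clf = Pipeline(
        [
            ("view", LinearView),  # RobustScaler for numeric columns, TargetEncoder for categoricals
            ("model", LogisticRegression()),
        ]
    )
    clf.fit(X, y)
    print(clf.score(X, y))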
@@ -0,0 +1,17 @@
1
+ Metadata-Version: 2.4
2
+ Name: penwings
3
+ Version: 0.1.1b0
4
+ Author-email: Frissie <R.Blanckaert@outlook.com>
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: sqlalchemy<3.0.0,>=2.0.46
7
+ Requires-Dist: pyodbc<6.0.0,>=5.3.0
8
+ Requires-Dist: pandas<4.0.0,>=3.0.0
9
+ Requires-Dist: numpy<3.0.0,>=2.4.1
10
+ Provides-Extra: scipy
11
+ Requires-Dist: scipy<2.0.0,>=1.17.0; extra == "scipy"
12
+ Provides-Extra: sklearn
13
+ Requires-Dist: scikit-learn<2.0.0,>=1.8.0; extra == "sklearn"
14
+ Provides-Extra: optuna
15
+ Requires-Dist: optuna<5.0.0,>=4.7.0; extra == "optuna"
16
+ Provides-Extra: all
17
+ Requires-Dist: openpyxl<4.0.0,>=3.1.5; extra == "all"
@@ -0,0 +1,17 @@
1
+ .gitignore
2
+ pyproject.toml
3
+ uv.lock
4
+ src/penwings/__init__.py
5
+ src/penwings/paths.py
6
+ src/penwings/tuner.py
7
+ src/penwings/views.py
8
+ src/penwings.egg-info/PKG-INFO
9
+ src/penwings.egg-info/SOURCES.txt
10
+ src/penwings.egg-info/dependency_links.txt
11
+ src/penwings.egg-info/requires.txt
12
+ src/penwings.egg-info/top_level.txt
13
+ src/penwings/_utils/__init__.py
14
+ src/penwings/_utils/_decorators.py
15
+ src/penwings/_utils/_typing.py
16
+ src/penwings/io/__init__.py
17
+ src/penwings/io/cache.py
@@ -0,0 +1,16 @@
1
+ sqlalchemy<3.0.0,>=2.0.46
2
+ pyodbc<6.0.0,>=5.3.0
3
+ pandas<4.0.0,>=3.0.0
4
+ numpy<3.0.0,>=2.4.1
5
+
6
+ [all]
7
+ openpyxl<4.0.0,>=3.1.5
8
+
9
+ [optuna]
10
+ optuna<5.0.0,>=4.7.0
11
+
12
+ [scipy]
13
+ scipy<2.0.0,>=1.17.0
14
+
15
+ [sklearn]
16
+ scikit-learn<2.0.0,>=1.8.0
@@ -0,0 +1 @@
+ penwings