penwings 0.1.3.dev1__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {penwings-0.1.3.dev1/src/penwings.egg-info → penwings-0.2.0}/PKG-INFO +28 -12
  2. penwings-0.2.0/pyproject.toml +70 -0
  3. {penwings-0.1.3.dev1 → penwings-0.2.0}/src/penwings/__init__.py +1 -2
  4. penwings-0.2.0/src/penwings/_version.py +34 -0
  5. penwings-0.2.0/src/penwings/io/cache.py +198 -0
  6. {penwings-0.1.3.dev1 → penwings-0.2.0}/src/penwings/paths.py +3 -1
  7. {penwings-0.1.3.dev1 → penwings-0.2.0/src/penwings.egg-info}/PKG-INFO +28 -12
  8. {penwings-0.1.3.dev1 → penwings-0.2.0}/src/penwings.egg-info/SOURCES.txt +1 -2
  9. penwings-0.2.0/src/penwings.egg-info/requires.txt +27 -0
  10. penwings-0.1.3.dev1/pyproject.toml +0 -59
  11. penwings-0.1.3.dev1/src/penwings/io/cache.py +0 -93
  12. penwings-0.1.3.dev1/src/penwings/tuner.py +0 -42
  13. penwings-0.1.3.dev1/src/penwings/views.py +0 -79
  14. penwings-0.1.3.dev1/src/penwings.egg-info/requires.txt +0 -16
  15. {penwings-0.1.3.dev1 → penwings-0.2.0}/.gitignore +0 -0
  16. {penwings-0.1.3.dev1 → penwings-0.2.0}/LICENSE +0 -0
  17. {penwings-0.1.3.dev1 → penwings-0.2.0}/README.md +0 -0
  18. {penwings-0.1.3.dev1 → penwings-0.2.0}/setup.cfg +0 -0
  19. {penwings-0.1.3.dev1 → penwings-0.2.0}/src/penwings/_utils/__init__.py +0 -0
  20. {penwings-0.1.3.dev1 → penwings-0.2.0}/src/penwings/_utils/_decorators.py +0 -0
  21. {penwings-0.1.3.dev1 → penwings-0.2.0}/src/penwings/_utils/_typing.py +0 -0
  22. {penwings-0.1.3.dev1 → penwings-0.2.0}/src/penwings/io/__init__.py +0 -0
  23. {penwings-0.1.3.dev1 → penwings-0.2.0}/src/penwings.egg-info/dependency_links.txt +0 -0
  24. {penwings-0.1.3.dev1 → penwings-0.2.0}/src/penwings.egg-info/top_level.txt +0 -0
  25. {penwings-0.1.3.dev1 → penwings-0.2.0}/uv.lock +0 -0
@@ -1,27 +1,43 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: penwings
3
- Version: 0.1.3.dev1
3
+ Version: 0.2.0
4
4
  Summary: Lightweight library to handle data and reproduce workflows
5
- Author-email: Raf Blanckaert <R.Blanckaert@outlook.com>
6
- License: LICENSE
5
+ Author-email: Raf Blanckaert <r.blanckaert@outlook.com>
6
+ License-Expression: MIT
7
7
  Project-URL: Homepage, https://github.com/Frissie/penwings
8
8
  Project-URL: Repository, https://github.com/Frissie/penwings
9
9
  Project-URL: Issues, https://github.com/Frissie/penwings/issues
10
+ Keywords: data,workflow,reproducibility,sql,analytics
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Operating System :: OS Independent
10
17
  Requires-Python: >=3.11
11
18
  Description-Content-Type: text/markdown
12
19
  License-File: LICENSE
13
- Requires-Dist: sqlalchemy<3.0.0,>=2.0.46
14
- Requires-Dist: pyodbc<6.0.0,>=5.3.0
15
- Requires-Dist: pandas<4.0.0,>=3.0.0
16
- Requires-Dist: numpy<3.0.0,>=2.4.1
20
+ Requires-Dist: sqlalchemy<3.0,>=2.0
21
+ Requires-Dist: pyodbc<6.0,>=5.0
22
+ Requires-Dist: pandas<4.0,>=2.2
23
+ Requires-Dist: numpy<3.0,>=1.26
24
+ Provides-Extra: excel
25
+ Requires-Dist: openpyxl<4.0,>=3.1; extra == "excel"
26
+ Provides-Extra: ml
27
+ Requires-Dist: scikit-learn<2.0,>=1.4; extra == "ml"
17
28
  Provides-Extra: scipy
18
- Requires-Dist: scipy<2.0.0,>=1.17.0; extra == "scipy"
19
- Provides-Extra: sklearn
20
- Requires-Dist: scikit-learn<2.0.0,>=1.8.0; extra == "sklearn"
29
+ Requires-Dist: scipy<2.0,>=1.11; extra == "scipy"
21
30
  Provides-Extra: optuna
22
- Requires-Dist: optuna<5.0.0,>=4.7.0; extra == "optuna"
31
+ Requires-Dist: optuna<5.0,>=3.5; extra == "optuna"
23
32
  Provides-Extra: all
24
- Requires-Dist: openpyxl<4.0.0,>=3.1.5; extra == "all"
33
+ Requires-Dist: openpyxl<4.0,>=3.1; extra == "all"
34
+ Requires-Dist: scikit-learn<2.0,>=1.4; extra == "all"
35
+ Requires-Dist: scipy<2.0,>=1.11; extra == "all"
36
+ Requires-Dist: optuna<5.0,>=3.5; extra == "all"
37
+ Provides-Extra: dev
38
+ Requires-Dist: pytest>=8.0; extra == "dev"
39
+ Requires-Dist: ruff>=0.3; extra == "dev"
40
+ Requires-Dist: mypy>=1.8; extra == "dev"
25
41
  Dynamic: license-file
26
42
 
27
43
  # Penwings
@@ -0,0 +1,70 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "setuptools-scm>=8"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "penwings"
7
+ version = "v0.2.0"
8
+ description = "Lightweight library to handle data and reproduce workflows"
9
+ readme = { file = "README.md", content-type = "text/markdown" }
10
+ license = "MIT"
11
+ authors = [
12
+ { name = "Raf Blanckaert", email = "r.blanckaert@outlook.com" }
13
+ ]
14
+ requires-python = ">=3.11"
15
+
16
+ keywords = ["data", "workflow", "reproducibility", "sql", "analytics"]
17
+
18
+ classifiers = [
19
+ "Development Status :: 3 - Alpha",
20
+ "Intended Audience :: Developers",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Operating System :: OS Independent",
25
+ ]
26
+
27
+ dependencies = [
28
+ "sqlalchemy>=2.0,<3.0",
29
+ "pyodbc>=5.0,<6.0",
30
+ "pandas>=2.2,<4.0",
31
+ "numpy>=1.26,<3.0",
32
+ ]
33
+
34
+ [project.optional-dependencies]
35
+ excel = ["openpyxl>=3.1,<4.0"]
36
+ ml = ["scikit-learn>=1.4,<2.0"]
37
+ scipy = ["scipy>=1.11,<2.0"]
38
+ optuna = ["optuna>=3.5,<5.0"]
39
+ all = [
40
+ "openpyxl>=3.1,<4.0",
41
+ "scikit-learn>=1.4,<2.0",
42
+ "scipy>=1.11,<2.0",
43
+ "optuna>=3.5,<5.0",
44
+ ]
45
+
46
+ dev = [
47
+ "pytest>=8.0",
48
+ "ruff>=0.3",
49
+ "mypy>=1.8",
50
+ ]
51
+
52
+ [project.urls]
53
+ Homepage = "https://github.com/Frissie/penwings"
54
+ Repository = "https://github.com/Frissie/penwings"
55
+ Issues = "https://github.com/Frissie/penwings/issues"
56
+
57
+ # setuptools config
58
+ [tool.setuptools]
59
+ package-dir = { "" = "src" }
60
+
61
+ [tool.setuptools.packages.find]
62
+ where = ["src"]
63
+ include = ["penwings*"]
64
+
65
+ # Versioning via SCM
66
+ [tool.setuptools_scm]
67
+ version_scheme = "guess-next-dev"
68
+ local_scheme = "no-local-version"
69
+ tag_regex = "^v(?P<version>.*)$"
70
+ write_to = "src/penwings/_version.py"
@@ -1,9 +1,8 @@
1
1
  from .io.cache import SQLParquetCache
2
- from .paths import input_dir, output_dir, model_dir
2
+ from .paths import input_dir, output_dir
3
3
 
4
4
  __all__ = [
5
5
  "SQLParquetCache",
6
6
  "input_dir",
7
7
  "output_dir",
8
- "model_dir",
9
8
  ]
@@ -0,0 +1,34 @@
1
+ # file generated by setuptools-scm
2
+ # don't change, don't track in version control
3
+
4
+ __all__ = [
5
+ "__version__",
6
+ "__version_tuple__",
7
+ "version",
8
+ "version_tuple",
9
+ "__commit_id__",
10
+ "commit_id",
11
+ ]
12
+
13
+ TYPE_CHECKING = False
14
+ if TYPE_CHECKING:
15
+ from typing import Tuple
16
+ from typing import Union
17
+
18
+ VERSION_TUPLE = Tuple[Union[int, str], ...]
19
+ COMMIT_ID = Union[str, None]
20
+ else:
21
+ VERSION_TUPLE = object
22
+ COMMIT_ID = object
23
+
24
+ version: str
25
+ __version__: str
26
+ __version_tuple__: VERSION_TUPLE
27
+ version_tuple: VERSION_TUPLE
28
+ commit_id: COMMIT_ID
29
+ __commit_id__: COMMIT_ID
30
+
31
+ __version__ = version = '0.2.1.dev0'
32
+ __version_tuple__ = version_tuple = (0, 2, 1, 'dev0')
33
+
34
+ __commit_id__ = commit_id = 'g96cf35c66'
@@ -0,0 +1,198 @@
1
+ import pandas as pd
2
+
3
+ from sqlalchemy import Engine
4
+ from pathlib import Path
5
+ from datetime import datetime, timedelta
6
+ from typing import Unpack, Optional
7
+ from .._utils._typing import SQLParquetKwargs
8
+ from .._utils._decorators import timing_sql
9
+
10
+
11
+ class SQLParquetCache:
12
+ """
13
+ Cache SQL query results locally as Parquet files.
14
+
15
+ This class executes SQL queries using a SQLAlchemy engine and caches
16
+ the results as Parquet files in a specified directory. On subsequent
17
+ calls, the cached Parquet file is returned if it is considered "fresh"
18
+ according to a configurable refresh window.
19
+
20
+ The query can either be provided directly as a SQL string or as a path
21
+ to a ``.sql`` file. If a Parquet file already exists and is within the
22
+ refresh window, it will be loaded instead of re-executing the query,
23
+ unless ``force=True`` is specified.
24
+
25
+ Parameters
26
+ ----------
27
+ parquet_dir : str or pathlib.Path
28
+ Directory where Parquet cache files will be stored.
29
+ conn : sqlalchemy.engine.Engine
30
+ SQLAlchemy engine used to execute SQL queries.
31
+ sql_dir : str or pathlib.Path, optional
32
+ Directory containing ``.sql`` files. Required when passing a
33
+ filename instead of a raw SQL string.
34
+ refresh_days : int, default 0
35
+ Number of days for which a cached Parquet file is considered fresh.
36
+ If 0, refresh checking is disabled and existing Parquet files are
37
+ always used unless ``force=True``.
38
+ verbose : bool, default True
39
+ Whether to enable verbose output (if used by decorators or
40
+ extended implementations).
41
+ **kwargs : dict
42
+ Additional keyword arguments passed to ``pandas.read_sql``.
43
+ These are stored globally and merged with per-call arguments
44
+ in :meth:`get`.
45
+
46
+ Notes
47
+ -----
48
+ - Cached Parquet filenames are derived from the SQL filename stem
49
+ or from the provided ``parquet_name``.
50
+ - If a raw SQL string is provided, ``parquet_name`` must be specified.
51
+ - The cache directory is created automatically if it does not exist.
52
+ - The method :meth:`get` returns both the DataFrame and the source
53
+ ("SQL" or "Parquet") used to obtain the data.
54
+ """
55
+
56
+ def __init__(
57
+ self,
58
+ parquet_dir: Path | str,
59
+ conn: Engine,
60
+ sql_dir: Optional[Path | str] = None,
61
+ refresh_days: int = 0, # zero disables refresh when force == false
62
+ verbose: bool = True,
63
+ **kwargs: Unpack[SQLParquetKwargs],
64
+ ):
65
+
66
+ if sql_dir is not None:
67
+ self.sql_dir: Path = Path(sql_dir)
68
+ self.parquet_dir: Path = Path(parquet_dir)
69
+ self.refresh_days = refresh_days
70
+ self.conn = conn
71
+ self.global_kwargs = kwargs
72
+
73
+ self.verbose = verbose
74
+
75
+ def set_params(self, **params):
76
+ for key, value in params.items():
77
+ if not hasattr(self, key):
78
+ raise ValueError(f"Invalid parameter: {key}")
79
+ setattr(self, key, value)
80
+ return self
81
+
82
+ def _sql_path(self, sql_file: str) -> Path:
83
+ if not hasattr(self, "sql_dir"):
84
+ raise ValueError("sql_dir must be set when passing a .sql filename.")
85
+ return self.sql_dir / sql_file
86
+
87
+ def _parquet_path(self, parquet_name: str) -> Path:
88
+ return self.parquet_dir / f"{parquet_name}.parquet"
89
+
90
+ def _is_fresh(self, path: Path, refresh_window: int) -> bool:
91
+ """
92
+ Determine whether a cached Parquet file is fresh enough to use.
93
+
94
+ Parameters
95
+ ----------
96
+ path : Path
97
+ Path to the Parquet file.
98
+ refresh_window : int
99
+ Number of days the file is considered valid.
100
+ If 0, the file is always considered fresh (if it exists).
101
+
102
+ Returns
103
+ -------
104
+ bool
105
+ True if the file exists and is within the refresh window.
106
+ """
107
+ if not path.exists():
108
+ return False
109
+
110
+ # 0 means: never refresh (always use cache if it exists)
111
+ if refresh_window == 0:
112
+ return True
113
+
114
+ last_modified = datetime.fromtimestamp(path.stat().st_mtime)
115
+ age = datetime.now() - last_modified
116
+ return age < timedelta(days=refresh_window)
117
+
118
+ def _read_sql(self, sql_file: str):
119
+ return self._sql_path(sql_file).read_text()
120
+
121
+ def _return_sql(self, query: str, conn, **kwargs: Unpack[SQLParquetKwargs]) -> pd.DataFrame:
122
+ return pd.read_sql(query, conn, **kwargs)
123
+
124
+ @timing_sql
125
+ def get(
126
+ self,
127
+ sql: str,
128
+ parquet_name: str | None = None,
129
+ conn: Engine | None = None,
130
+ refresh_days: int | None = None,
131
+ force: bool = False,
132
+ **kwargs: Unpack[SQLParquetKwargs],
133
+ ) -> tuple[pd.DataFrame, str]:
134
+ """
135
+ Retrieve a DataFrame from cache or execute the SQL query.
136
+
137
+ Parameters
138
+ ----------
139
+ sql : str
140
+ Either a raw SQL query string or the filename of a ``.sql`` file.
141
+ parquet_name : str, optional
142
+ Name of the Parquet file (without extension) when passing a raw
143
+ SQL string. Ignored if a ``.sql`` filename is provided.
144
+ conn : sqlalchemy.engine.Engine, optional
145
+ Alternative SQLAlchemy engine to use instead of the default.
146
+ refresh_days : int, optional
147
+ Override the instance-level refresh window (in days).
148
+ force : bool, default False
149
+ If True, bypass the cache and force re-execution of the query.
150
+ **kwargs : dict
151
+ Additional keyword arguments passed to ``pandas.read_sql``.
152
+ These override any global keyword arguments defined at
153
+ initialization.
154
+
155
+ Returns
156
+ -------
157
+ DataFrame
158
+ The resulting query output.
159
+ str
160
+ The data source used: either ``"SQL"`` or ``"Parquet"``.
161
+
162
+ Raises
163
+ ------
164
+ ValueError
165
+ If ``sql`` is a raw SQL string and ``parquet_name`` is not provided.
166
+ If ``sql`` is neither a SQL string nor a ``.sql`` file path.
167
+
168
+ Notes
169
+ -----
170
+ If the Parquet file exists and is within the refresh window,
171
+ it is loaded directly using ``pandas.read_parquet``. Otherwise,
172
+ the SQL query is executed and the result is written to Parquet.
173
+ """
174
+ if isinstance(sql, str) and Path(sql).suffix == ".sql":
175
+ parquet_name = parquet_name or Path(sql).stem
176
+ query = self._read_sql(sql)
177
+ elif isinstance(sql, str):
178
+ if parquet_name is None:
179
+ raise ValueError("parquet_name must be provided if query is passed directly")
180
+ query = sql
181
+ else:
182
+ raise ValueError("sql must be a SQL string or a path to a .sql file")
183
+
184
+ connection = self.conn if conn is None else conn
185
+ refresh_window = self.refresh_days if refresh_days is None else refresh_days
186
+ parquet_path = self._parquet_path(parquet_name)
187
+ self.parquet_dir.mkdir(parents=True, exist_ok=True)
188
+ sql_kwargs = self.global_kwargs | kwargs
189
+
190
+ if not force and self._is_fresh(parquet_path, refresh_window):
191
+ source = "Parquet"
192
+ return pd.read_parquet(parquet_path), source
193
+
194
+ source = "SQL"
195
+ df = self._return_sql(query, connection, **sql_kwargs)
196
+ df.to_parquet(parquet_path, index=False)
197
+
198
+ return df, source
@@ -4,9 +4,11 @@ home_dir = pathlib.Path.cwd()
4
4
  proj_dir = pathlib.Path.cwd().parent
5
5
 
6
6
  input_dir = home_dir / "input"
7
- model_dir = home_dir / "model"
8
7
  output_dir = home_dir / "output"
9
8
 
9
+ sql_dir = input_dir / "sql"
10
+ parquet_dir = input_dir / "parquet"
11
+
10
12
  if __name__ == "__main__":
11
13
  i = 1
12
14
  for name, value in dict(locals()).items():
@@ -1,27 +1,43 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: penwings
3
- Version: 0.1.3.dev1
3
+ Version: 0.2.0
4
4
  Summary: Lightweight library to handle data and reproduce workflows
5
- Author-email: Raf Blanckaert <R.Blanckaert@outlook.com>
6
- License: LICENSE
5
+ Author-email: Raf Blanckaert <r.blanckaert@outlook.com>
6
+ License-Expression: MIT
7
7
  Project-URL: Homepage, https://github.com/Frissie/penwings
8
8
  Project-URL: Repository, https://github.com/Frissie/penwings
9
9
  Project-URL: Issues, https://github.com/Frissie/penwings/issues
10
+ Keywords: data,workflow,reproducibility,sql,analytics
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Operating System :: OS Independent
10
17
  Requires-Python: >=3.11
11
18
  Description-Content-Type: text/markdown
12
19
  License-File: LICENSE
13
- Requires-Dist: sqlalchemy<3.0.0,>=2.0.46
14
- Requires-Dist: pyodbc<6.0.0,>=5.3.0
15
- Requires-Dist: pandas<4.0.0,>=3.0.0
16
- Requires-Dist: numpy<3.0.0,>=2.4.1
20
+ Requires-Dist: sqlalchemy<3.0,>=2.0
21
+ Requires-Dist: pyodbc<6.0,>=5.0
22
+ Requires-Dist: pandas<4.0,>=2.2
23
+ Requires-Dist: numpy<3.0,>=1.26
24
+ Provides-Extra: excel
25
+ Requires-Dist: openpyxl<4.0,>=3.1; extra == "excel"
26
+ Provides-Extra: ml
27
+ Requires-Dist: scikit-learn<2.0,>=1.4; extra == "ml"
17
28
  Provides-Extra: scipy
18
- Requires-Dist: scipy<2.0.0,>=1.17.0; extra == "scipy"
19
- Provides-Extra: sklearn
20
- Requires-Dist: scikit-learn<2.0.0,>=1.8.0; extra == "sklearn"
29
+ Requires-Dist: scipy<2.0,>=1.11; extra == "scipy"
21
30
  Provides-Extra: optuna
22
- Requires-Dist: optuna<5.0.0,>=4.7.0; extra == "optuna"
31
+ Requires-Dist: optuna<5.0,>=3.5; extra == "optuna"
23
32
  Provides-Extra: all
24
- Requires-Dist: openpyxl<4.0.0,>=3.1.5; extra == "all"
33
+ Requires-Dist: openpyxl<4.0,>=3.1; extra == "all"
34
+ Requires-Dist: scikit-learn<2.0,>=1.4; extra == "all"
35
+ Requires-Dist: scipy<2.0,>=1.11; extra == "all"
36
+ Requires-Dist: optuna<5.0,>=3.5; extra == "all"
37
+ Provides-Extra: dev
38
+ Requires-Dist: pytest>=8.0; extra == "dev"
39
+ Requires-Dist: ruff>=0.3; extra == "dev"
40
+ Requires-Dist: mypy>=1.8; extra == "dev"
25
41
  Dynamic: license-file
26
42
 
27
43
  # Penwings
@@ -4,9 +4,8 @@ README.md
4
4
  pyproject.toml
5
5
  uv.lock
6
6
  src/penwings/__init__.py
7
+ src/penwings/_version.py
7
8
  src/penwings/paths.py
8
- src/penwings/tuner.py
9
- src/penwings/views.py
10
9
  src/penwings.egg-info/PKG-INFO
11
10
  src/penwings.egg-info/SOURCES.txt
12
11
  src/penwings.egg-info/dependency_links.txt
@@ -0,0 +1,27 @@
1
+ sqlalchemy<3.0,>=2.0
2
+ pyodbc<6.0,>=5.0
3
+ pandas<4.0,>=2.2
4
+ numpy<3.0,>=1.26
5
+
6
+ [all]
7
+ openpyxl<4.0,>=3.1
8
+ scikit-learn<2.0,>=1.4
9
+ scipy<2.0,>=1.11
10
+ optuna<5.0,>=3.5
11
+
12
+ [dev]
13
+ pytest>=8.0
14
+ ruff>=0.3
15
+ mypy>=1.8
16
+
17
+ [excel]
18
+ openpyxl<4.0,>=3.1
19
+
20
+ [ml]
21
+ scikit-learn<2.0,>=1.4
22
+
23
+ [optuna]
24
+ optuna<5.0,>=3.5
25
+
26
+ [scipy]
27
+ scipy<2.0,>=1.11
@@ -1,59 +0,0 @@
1
- [project]
2
- name = "penwings"
3
- dynamic = ["version"]
4
- description = "Lightweight library to handle data and reproduce workflows"
5
- readme = "README.md"
6
- license = {text = "LICENSE"}
7
- authors = [
8
- {name = "Raf Blanckaert",email = "R.Blanckaert@outlook.com"}
9
- ]
10
- requires-python = ">=3.11"
11
- dependencies = [
12
- "sqlalchemy (>=2.0.46,<3.0.0)",
13
- "pyodbc (>=5.3.0,<6.0.0)",
14
- "pandas (>=3.0.0,<4.0.0)",
15
- "numpy (>=2.4.1,<3.0.0)"
16
- ]
17
-
18
- [project.urls]
19
- Homepage = "https://github.com/Frissie/penwings"
20
- Repository = "https://github.com/Frissie/penwings"
21
- Issues = "https://github.com/Frissie/penwings/issues"
22
-
23
-
24
- [tool.setuptools_scm]
25
- version_scheme = "guess-next-dev"
26
- local_scheme = "no-local-version"
27
- tag_regex = "^v(?P<version>.*)$"
28
-
29
- [tool.setuptools]
30
- package-dir = {"" = "src"}
31
-
32
- [tool.setuptools.packages.find]
33
- where = ["src"]
34
- include = ["penwings*"]
35
- exclude = ["penwings._*"]
36
-
37
- [build-system]
38
- requires = ["setuptools>=68", "wheel", "setuptools-scm"]
39
- build-backend = "setuptools.build_meta"
40
-
41
- [dependency-groups]
42
- dev = [
43
- "openpyxl>=3.1.5",
44
- "optuna>=4.7.0",
45
- "scikit-learn>=1.8.0",
46
- "scipy>=1.17.0",
47
- ]
48
-
49
- [project.optional-dependencies]
50
- scipy = ["scipy (>=1.17.0,<2.0.0)"]
51
- sklearn = ["scikit-learn (>=1.8.0,<2.0.0)"]
52
- optuna= ["optuna (>=4.7.0,<5.0.0)"]
53
- all = ["openpyxl (>=3.1.5,<4.0.0)"]
54
-
55
- [[tool.uv.index]]
56
- name = "testpypi"
57
- url = "https://test.pypi.org/simple/"
58
- publish-url = "https://test.pypi.org/legacy/"
59
- explicit = true
@@ -1,93 +0,0 @@
1
- import pandas as pd
2
-
3
- from sqlalchemy import Engine
4
- from pathlib import Path
5
- from datetime import datetime, timedelta
6
- from typing import Unpack, Union, Optional
7
- from .._utils._typing import SQLParquetKwargs
8
- from .._utils._decorators import timing_sql
9
-
10
-
11
- class SQLParquetCache:
12
- def __init__(
13
- self,
14
- parquet_dir: Union[Path, str],
15
- conn: Engine,
16
- sql_dir: Optional[Union[Path, str]] = None,
17
- refresh_days: int = 0, # zero disables refresh when force == false
18
- verbose: bool = True,
19
- **kwargs: Unpack[SQLParquetKwargs],
20
- ):
21
-
22
- if sql_dir is not None:
23
- self.sql_dir: Path = Path(sql_dir)
24
- self.parquet_dir: Path = Path(parquet_dir)
25
- self.refresh_days = refresh_days
26
- self.conn = conn
27
- self.global_kwargs = kwargs
28
-
29
- self.verbose = verbose
30
- self.source = "SQL"
31
-
32
- def set_params(self, **params):
33
- for key, value in params.items():
34
- if not hasattr(self, key):
35
- raise ValueError(f"Invalid parameter: {key}")
36
- setattr(self, key, value)
37
- return self
38
-
39
- def _sql_path(self, sql_file: str) -> Path:
40
- return self.sql_dir / sql_file
41
-
42
- def _parquet_path(self, sql_file: str, parquet_name: str | None = None) -> Path:
43
- name = parquet_name or Path(sql_file).stem
44
- return self.parquet_dir / f"{name}.parquet"
45
-
46
- def _is_new(self, path: Path, refresh_window: int) -> bool:
47
- if not path.exists():
48
- return False
49
- if self.refresh_days == 0:
50
- return True
51
- last_modified = datetime.fromtimestamp(path.stat().st_mtime)
52
- return datetime.now() - last_modified < timedelta(days=refresh_window)
53
-
54
- def _read_sql(self, sql_file: str):
55
- return self._sql_path(sql_file).read_text()
56
-
57
- def _return_sql(self, query: str, conn, **kwargs: Unpack[SQLParquetKwargs]) -> pd.DataFrame:
58
- return pd.read_sql(query, conn, **kwargs)
59
-
60
- @timing_sql
61
- def get(
62
- self,
63
- sql: str,
64
- parquet_name: Union[str, None] = None,
65
- conn: Engine | None = None,
66
- refresh_days: int | None = None,
67
- force: bool = False,
68
- **kwargs: Unpack[SQLParquetKwargs],
69
- ) -> tuple[pd.DataFrame, str]:
70
- if isinstance(sql, str) and Path(sql).suffix == ".sql":
71
- query = self._read_sql(sql)
72
- elif isinstance(sql, str):
73
- if parquet_name is None:
74
- raise ValueError("parquet_name must be provided if query is passed directly")
75
- query = sql
76
- else:
77
- raise ValueError("sql must be a SQL string or a path to a .sql file")
78
-
79
- connection = conn or self.conn
80
- refresh_window = refresh_days or self.refresh_days
81
- parquet_path = self._parquet_path(query)
82
- sql_kwargs = self.global_kwargs | kwargs
83
-
84
- if not force and self._is_new(parquet_path, refresh_window):
85
- source = "Parquet"
86
- return pd.read_parquet(parquet_path), source
87
-
88
- source = "SQL"
89
- df = self._return_sql(query, connection, **sql_kwargs)
90
- self.parquet_dir.mkdir(parents=True, exist_ok=True)
91
- df.to_parquet(parquet_path, index=False)
92
-
93
- return df, source
@@ -1,42 +0,0 @@
1
- from optuna.trial import Trial
2
-
3
-
4
- def tune_lgbm_params(trial: Trial, model="classifier"):
5
- if model == "classifier":
6
- metrics = {
7
- "objective": "binary",
8
- "metric": "auc",
9
- }
10
-
11
- params = {
12
- # Core
13
- "verbosity": -1,
14
- "boosting_type": "gbdt",
15
- # GPU
16
- "device": "gpu",
17
- "gpu_platform_id": 0,
18
- "gpu_device_id": 0,
19
- # Learning
20
- "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.05, log=True),
21
- "n_estimators": trial.suggest_int("n_estimators", 1500, 5000),
22
- # Tree structure (GPU-safe)
23
- "num_leaves": trial.suggest_int("num_leaves", 31, 128),
24
- "max_depth": trial.suggest_int("max_depth", 4, 10),
25
- # Regularization / stability
26
- "min_child_samples": trial.suggest_int("min_child_samples", 10, 80),
27
- "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 10.0, log=True),
28
- "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 1.0),
29
- # Sampling
30
- "subsample": trial.suggest_float("subsample", 0.6, 1.0),
31
- "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
32
- "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
33
- # Regularization
34
- "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
35
- "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 5.0, log=True),
36
- # Histogram
37
- "max_bin": trial.suggest_int("max_bin", 64, 255),
38
- # Class imbalance (keep only if needed)
39
- "scale_pos_weight": trial.suggest_float("scale_pos_weight", 0.8, 3.0),
40
- }
41
-
42
- return metrics | params
@@ -1,79 +0,0 @@
1
- from sklearn.pipeline import Pipeline
2
- from sklearn.compose import ColumnTransformer, make_column_selector
3
- from sklearn.preprocessing import (
4
- TargetEncoder,
5
- OneHotEncoder,
6
- RobustScaler,
7
- KBinsDiscretizer,
8
- FunctionTransformer,
9
- PolynomialFeatures,
10
- OrdinalEncoder,
11
- StandardScaler,
12
- )
13
-
14
- LinearView = ColumnTransformer(
15
- [
16
- ("numerical", RobustScaler(), make_column_selector(dtype_exclude="category")),
17
- ("category", TargetEncoder(shuffle=True, smooth=10, random_state=42), make_column_selector(dtype_include="category")),
18
- ],
19
- remainder="drop",
20
- verbose_feature_names_out=False,
21
- ).set_output(transform="pandas")
22
-
23
- DenseView = ColumnTransformer(
24
- [
25
- ("numerical", StandardScaler(), make_column_selector(dtype_exclude="category")),
26
- ("category", TargetEncoder(shuffle=True, smooth=10, random_state=42), make_column_selector(dtype_include="category")),
27
- ],
28
- remainder="drop",
29
- verbose_feature_names_out=False,
30
- ).set_output(transform="pandas")
31
-
32
- CategoricalView = Pipeline(
33
- [
34
- (
35
- "bins",
36
- ColumnTransformer(
37
- [
38
- (
39
- "numerical",
40
- KBinsDiscretizer(n_bins=4, strategy="quantile", quantile_method="averaged_inverted_cdf", encode="ordinal"),
41
- make_column_selector(dtype_exclude="category"),
42
- ),
43
- (
44
- "category",
45
- OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
46
- make_column_selector(dtype_include="category"),
47
- ),
48
- ],
49
- remainder="drop",
50
- verbose_feature_names_out=False,
51
- ).set_output(transform="pandas"),
52
- ),
53
- ("cats", FunctionTransformer(lambda df: df.astype(int).astype("category"), feature_names_out="one-to-one")),
54
- ]
55
- )
56
-
57
- PolynomialView = Pipeline(
58
- [
59
- ("Linear", LinearView),
60
- ("poly", PolynomialFeatures(degree=2).set_output(transform="pandas")),
61
- ]
62
- )
63
-
64
- SparseView = ColumnTransformer(
65
- [
66
- (
67
- "num_bins",
68
- KBinsDiscretizer(n_bins=10, quantile_method="averaged_inverted_cdf", encode="onehot"),
69
- make_column_selector(dtype_exclude="category"),
70
- ),
71
- (
72
- "cat_ohe",
73
- OneHotEncoder(handle_unknown="ignore"),
74
- make_column_selector(dtype_include="category"),
75
- ),
76
- ],
77
- remainder="drop",
78
- verbose_feature_names_out=False,
79
- )
@@ -1,16 +0,0 @@
1
- sqlalchemy<3.0.0,>=2.0.46
2
- pyodbc<6.0.0,>=5.3.0
3
- pandas<4.0.0,>=3.0.0
4
- numpy<3.0.0,>=2.4.1
5
-
6
- [all]
7
- openpyxl<4.0.0,>=3.1.5
8
-
9
- [optuna]
10
- optuna<5.0.0,>=4.7.0
11
-
12
- [scipy]
13
- scipy<2.0.0,>=1.17.0
14
-
15
- [sklearn]
16
- scikit-learn<2.0.0,>=1.8.0
File without changes
File without changes
File without changes
File without changes
File without changes