penwings 0.2.4__tar.gz → 0.3.0.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. penwings-0.3.0.dev1/PKG-INFO +286 -0
  2. penwings-0.3.0.dev1/README.md +256 -0
  3. {penwings-0.2.4 → penwings-0.3.0.dev1}/pyproject.toml +2 -13
  4. penwings-0.3.0.dev1/src/penwings/io/__init__.py +5 -0
  5. {penwings-0.2.4 → penwings-0.3.0.dev1}/src/penwings/io/cache.py +10 -2
  6. penwings-0.3.0.dev1/src/penwings/paths/__init__.py +6 -0
  7. penwings-0.3.0.dev1/src/penwings/paths/project_paths.py +528 -0
  8. penwings-0.3.0.dev1/src/penwings.egg-info/PKG-INFO +286 -0
  9. penwings-0.3.0.dev1/src/penwings.egg-info/requires.txt +11 -0
  10. {penwings-0.2.4 → penwings-0.3.0.dev1}/uv.lock +6 -440
  11. penwings-0.2.4/PKG-INFO +0 -193
  12. penwings-0.2.4/README.md +0 -151
  13. penwings-0.2.4/src/penwings/paths/__init__.py +0 -0
  14. penwings-0.2.4/src/penwings/paths/project_paths.py +0 -301
  15. penwings-0.2.4/src/penwings/utils/__init__.py +0 -0
  16. penwings-0.2.4/src/penwings.egg-info/PKG-INFO +0 -193
  17. penwings-0.2.4/src/penwings.egg-info/requires.txt +0 -27
  18. {penwings-0.2.4 → penwings-0.3.0.dev1}/.gitignore +0 -0
  19. {penwings-0.2.4 → penwings-0.3.0.dev1}/LICENSE +0 -0
  20. {penwings-0.2.4 → penwings-0.3.0.dev1}/setup.cfg +0 -0
  21. {penwings-0.2.4 → penwings-0.3.0.dev1}/src/penwings/__init__.py +0 -0
  22. {penwings-0.2.4 → penwings-0.3.0.dev1}/src/penwings/exploration/__init__.py +0 -0
  23. {penwings-0.2.4/src/penwings/io → penwings-0.3.0.dev1/src/penwings/modeling}/__init__.py +0 -0
  24. {penwings-0.2.4/src/penwings/modeling → penwings-0.3.0.dev1/src/penwings/utils}/__init__.py +0 -0
  25. {penwings-0.2.4 → penwings-0.3.0.dev1}/src/penwings/utils/_decorators.py +0 -0
  26. {penwings-0.2.4 → penwings-0.3.0.dev1}/src/penwings/utils/_typing.py +0 -0
  27. {penwings-0.2.4 → penwings-0.3.0.dev1}/src/penwings.egg-info/SOURCES.txt +0 -0
  28. {penwings-0.2.4 → penwings-0.3.0.dev1}/src/penwings.egg-info/dependency_links.txt +0 -0
  29. {penwings-0.2.4 → penwings-0.3.0.dev1}/src/penwings.egg-info/top_level.txt +0 -0
@@ -0,0 +1,286 @@
1
+ Metadata-Version: 2.4
2
+ Name: penwings
3
+ Version: 0.3.0.dev1
4
+ Summary: Lightweight library to handle data and reproduce workflows
5
+ Author-email: Raf Blanckaert <r.blanckaert@outlook.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/Frissie/penwings
8
+ Project-URL: Repository, https://github.com/Frissie/penwings
9
+ Project-URL: Issues, https://github.com/Frissie/penwings/issues
10
+ Keywords: data,workflow,reproducibility,sql,analytics
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Operating System :: OS Independent
17
+ Requires-Python: >=3.11
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: pandas<4.0,>=2.2
21
+ Requires-Dist: numpy<3.0,>=1.26
22
+ Provides-Extra: sql
23
+ Requires-Dist: sqlalchemy<3.0,>=2.0; extra == "sql"
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest>=8.0; extra == "dev"
26
+ Requires-Dist: ruff>=0.3; extra == "dev"
27
+ Requires-Dist: mypy>=1.8; extra == "dev"
28
+ Requires-Dist: sqlalchemy<3.0,>=2.0; extra == "dev"
29
+ Dynamic: license-file
30
+
31
+ # Penwings
32
+
33
+ **Penwings** is a lightweight Python library for building **reproducible data workflows**.
34
+
35
+ It provides simple, composable tools to:
36
+ - manage project structure
37
+ - standardize data access
38
+ - cache SQL queries efficiently
39
+
40
+ The goal is to reduce boilerplate and make data pipelines **faster, cleaner, and reproducible by default**.
41
+
42
+ ---
43
+
44
+ ## ✨ Features
45
+
46
+ ### 🗂️ Project Structure Management
47
+ - Standardized folder setup for data science projects
48
+ - Automatic project root detection
49
+ - Flexible, extensible path system
50
+
51
+ ### 🧠 SQL → Parquet Caching
52
+ - Execute SQL queries via SQLAlchemy
53
+ - Automatically cache results as Parquet
54
+ - Reuse cached data to avoid unnecessary database hits
55
+ - Configurable refresh logic
56
+
57
+ ### ⚡ Lightweight & Modular
58
+ - Minimal core dependencies (`pandas`, `numpy`)
59
+ - Optional SQL support
60
+ - Designed to scale into larger workflows
61
+
62
+ ---
63
+
64
+ ## 📦 Installation
65
+
66
+ ```bash
67
+ pip install penwings
68
+ ```
69
+
70
+ ### Optional SQL support
71
+
72
+ ```bash
73
+ pip install "penwings[sql]"
74
+ ```
75
+
76
+ > Requires Python **3.11+**
77
+
78
+ ---
79
+
80
+ ## 🚀 Quick Start
81
+
82
+ ### 1. Project structure
83
+
84
+ ```python
85
+ from penwings import ProjectPaths
86
+
87
+ paths = ProjectPaths()
88
+
89
+ print(paths.data)
90
+ print(paths.models)
91
+ ```
92
+
93
+ Creates a standardized structure like:
94
+
95
+ ```
96
+ configs/
97
+ data/
98
+ raw/
99
+ processed/
100
+ external/
101
+ features/
102
+ logs/
103
+ models/
104
+ notebooks/
105
+ reports/
106
+ figures/
107
+ tables/
108
+ sql/
109
+ ```
110
+
111
+ ---
112
+
113
+ ### 2. SQL caching
114
+
115
+ ```python
116
+ from sqlalchemy import create_engine
117
+ from penwings import SQLParquetCache
118
+
119
+ engine = create_engine("sqlite:///example.db")
120
+
121
+ cache = SQLParquetCache(
122
+ sql_dir="sql",
123
+ parquet_dir="cache",
124
+ conn=engine,
125
+ refresh_days=1
126
+ )
127
+ ```
128
+
129
+ ---
130
+
131
+ ## 📊 Usage
132
+
133
+ ### Using SQL files
134
+
135
+ ```python
136
+ df = cache.get("sales.sql")
137
+ ```
138
+
139
+ * Loads from **Parquet** if cached
140
+ * Otherwise executes SQL and caches result
141
+
142
+ ---
143
+
144
+ ### Using raw SQL
145
+
146
+ ```python
147
+ query = "SELECT * FROM sales WHERE month = '2026-02'"
148
+
149
+ df = cache.get(
150
+ sql=query,
151
+ parquet_name="sales_feb"
152
+ )
153
+ ```
154
+
155
+ ---
156
+
157
+ ### Cache behavior
158
+
159
+ ```python
160
+ df = cache.get("sales.sql", force=True)
161
+ ```
162
+
163
+ * `force=True` → always re-run SQL
164
+ * `refresh_days=N` → cache expires after N days
165
+ * returns:
166
+
167
+ * `DataFrame`
168
+
169
+
170
+ ---
171
+
172
+ ## 🧩 ProjectPaths
173
+
174
+ Create only specific parts of a project:
175
+
176
+ ```python
177
+ paths = ProjectPaths(folders=["data", "ml"])
178
+ ```
179
+
180
+ Custom directories:
181
+
182
+ ```python
183
+ paths = ProjectPaths(
184
+ custom_dirs={
185
+ "modules": "src/modules",
186
+ "views": "src/views"
187
+ }
188
+ )
189
+ ```
190
+
191
+ Access paths:
192
+
193
+ ```python
194
+ paths.data
195
+ paths["models"]
196
+ paths.as_dict()
197
+ ```
198
+
199
+ ---
200
+
201
+ ## 🧠 Design Philosophy
202
+
203
+ Penwings is built around a few core ideas:
204
+
205
+ * **Reproducibility first** → deterministic data access via caching
206
+ * **Convention over configuration** → sensible defaults for structure
207
+ * **Composable building blocks** → small tools that work well together
208
+ * **Lightweight core** → no heavy framework overhead
209
+
210
+ ---
211
+
212
+ ## 🛣️ Roadmap
213
+
214
+ * Pipeline abstraction (data workflows as steps)
215
+ * Improved SQL utilities and query management
216
+ * Integration with feature engineering workflows
217
+ * Better caching strategies and metadata tracking
218
+
219
+ ---
220
+
221
+ ## 🔢 Versioning
222
+
223
+ Penwings follows **semantic versioning**:
224
+
225
+ * **MAJOR** → breaking changes
226
+ * **MINOR** → new features
227
+ * **PATCH** → bug fixes
228
+
229
+ ---
230
+
231
+ ## 🤝 Contributing
232
+
233
+ Contributions are welcome!
234
+
235
+ 1. Fork the repository
236
+ 2. Create a branch (`feature/my-feature`)
237
+ 3. Commit your changes
238
+ 4. Open a pull request
239
+
240
+ ---
241
+
242
+ ## 📄 License
243
+
244
+ MIT License — see [LICENSE](LICENSE)
245
+
246
+ ---
247
+
248
+ ## 💡 Example Workflow
249
+
250
+ ```python
251
+ from sqlalchemy import create_engine
252
+ from penwings import ProjectPaths, SQLParquetCache
253
+
254
+ # Setup project structure
255
+ paths = ProjectPaths()
256
+
257
+ # Setup SQL cache
258
+ engine = create_engine("sqlite:///example.db")
259
+
260
+ cache = SQLParquetCache(
261
+ sql_dir=paths.sql,
262
+ parquet_dir=paths.data,
263
+ conn=engine
264
+ )
265
+
266
+ # Load data
267
+ df_sales = cache.get("sales.sql")
268
+
269
+ ```
270
+
271
+ ---
272
+
273
+ ## ⭐ Why Penwings?
274
+
275
+ Penwings sits between:
276
+
277
+ * ad-hoc scripts ❌
278
+ * heavy frameworks ❌
279
+
280
+ It gives you just enough structure to:
281
+
282
+ * stay organized
283
+ * move fast
284
+ * keep workflows reproducible
285
+
286
+ without getting in your way.
@@ -0,0 +1,256 @@
1
+ # Penwings
2
+
3
+ **Penwings** is a lightweight Python library for building **reproducible data workflows**.
4
+
5
+ It provides simple, composable tools to:
6
+ - manage project structure
7
+ - standardize data access
8
+ - cache SQL queries efficiently
9
+
10
+ The goal is to reduce boilerplate and make data pipelines **faster, cleaner, and reproducible by default**.
11
+
12
+ ---
13
+
14
+ ## ✨ Features
15
+
16
+ ### 🗂️ Project Structure Management
17
+ - Standardized folder setup for data science projects
18
+ - Automatic project root detection
19
+ - Flexible, extensible path system
20
+
21
+ ### 🧠 SQL → Parquet Caching
22
+ - Execute SQL queries via SQLAlchemy
23
+ - Automatically cache results as Parquet
24
+ - Reuse cached data to avoid unnecessary database hits
25
+ - Configurable refresh logic
26
+
27
+ ### ⚡ Lightweight & Modular
28
+ - Minimal core dependencies (`pandas`, `numpy`)
29
+ - Optional SQL support
30
+ - Designed to scale into larger workflows
31
+
32
+ ---
33
+
34
+ ## 📦 Installation
35
+
36
+ ```bash
37
+ pip install penwings
38
+ ```
39
+
40
+ ### Optional SQL support
41
+
42
+ ```bash
43
+ pip install "penwings[sql]"
44
+ ```
45
+
46
+ > Requires Python **3.11+**
47
+
48
+ ---
49
+
50
+ ## 🚀 Quick Start
51
+
52
+ ### 1. Project structure
53
+
54
+ ```python
55
+ from penwings import ProjectPaths
56
+
57
+ paths = ProjectPaths()
58
+
59
+ print(paths.data)
60
+ print(paths.models)
61
+ ```
62
+
63
+ Creates a standardized structure like:
64
+
65
+ ```
66
+ configs/
67
+ data/
68
+ raw/
69
+ processed/
70
+ external/
71
+ features/
72
+ logs/
73
+ models/
74
+ notebooks/
75
+ reports/
76
+ figures/
77
+ tables/
78
+ sql/
79
+ ```
80
+
81
+ ---
82
+
83
+ ### 2. SQL caching
84
+
85
+ ```python
86
+ from sqlalchemy import create_engine
87
+ from penwings import SQLParquetCache
88
+
89
+ engine = create_engine("sqlite:///example.db")
90
+
91
+ cache = SQLParquetCache(
92
+ sql_dir="sql",
93
+ parquet_dir="cache",
94
+ conn=engine,
95
+ refresh_days=1
96
+ )
97
+ ```
98
+
99
+ ---
100
+
101
+ ## 📊 Usage
102
+
103
+ ### Using SQL files
104
+
105
+ ```python
106
+ df = cache.get("sales.sql")
107
+ ```
108
+
109
+ * Loads from **Parquet** if cached
110
+ * Otherwise executes SQL and caches result
111
+
112
+ ---
113
+
114
+ ### Using raw SQL
115
+
116
+ ```python
117
+ query = "SELECT * FROM sales WHERE month = '2026-02'"
118
+
119
+ df = cache.get(
120
+ sql=query,
121
+ parquet_name="sales_feb"
122
+ )
123
+ ```
124
+
125
+ ---
126
+
127
+ ### Cache behavior
128
+
129
+ ```python
130
+ df = cache.get("sales.sql", force=True)
131
+ ```
132
+
133
+ * `force=True` → always re-run SQL
134
+ * `refresh_days=N` → cache expires after N days
135
+ * returns:
136
+
137
+ * `DataFrame`
138
+
139
+
140
+ ---
141
+
142
+ ## 🧩 ProjectPaths
143
+
144
+ Create only specific parts of a project:
145
+
146
+ ```python
147
+ paths = ProjectPaths(folders=["data", "ml"])
148
+ ```
149
+
150
+ Custom directories:
151
+
152
+ ```python
153
+ paths = ProjectPaths(
154
+ custom_dirs={
155
+ "modules": "src/modules",
156
+ "views": "src/views"
157
+ }
158
+ )
159
+ ```
160
+
161
+ Access paths:
162
+
163
+ ```python
164
+ paths.data
165
+ paths["models"]
166
+ paths.as_dict()
167
+ ```
168
+
169
+ ---
170
+
171
+ ## 🧠 Design Philosophy
172
+
173
+ Penwings is built around a few core ideas:
174
+
175
+ * **Reproducibility first** → deterministic data access via caching
176
+ * **Convention over configuration** → sensible defaults for structure
177
+ * **Composable building blocks** → small tools that work well together
178
+ * **Lightweight core** → no heavy framework overhead
179
+
180
+ ---
181
+
182
+ ## 🛣️ Roadmap
183
+
184
+ * Pipeline abstraction (data workflows as steps)
185
+ * Improved SQL utilities and query management
186
+ * Integration with feature engineering workflows
187
+ * Better caching strategies and metadata tracking
188
+
189
+ ---
190
+
191
+ ## 🔢 Versioning
192
+
193
+ Penwings follows **semantic versioning**:
194
+
195
+ * **MAJOR** → breaking changes
196
+ * **MINOR** → new features
197
+ * **PATCH** → bug fixes
198
+
199
+ ---
200
+
201
+ ## 🤝 Contributing
202
+
203
+ Contributions are welcome!
204
+
205
+ 1. Fork the repository
206
+ 2. Create a branch (`feature/my-feature`)
207
+ 3. Commit your changes
208
+ 4. Open a pull request
209
+
210
+ ---
211
+
212
+ ## 📄 License
213
+
214
+ MIT License — see [LICENSE](LICENSE)
215
+
216
+ ---
217
+
218
+ ## 💡 Example Workflow
219
+
220
+ ```python
221
+ from sqlalchemy import create_engine
222
+ from penwings import ProjectPaths, SQLParquetCache
223
+
224
+ # Setup project structure
225
+ paths = ProjectPaths()
226
+
227
+ # Setup SQL cache
228
+ engine = create_engine("sqlite:///example.db")
229
+
230
+ cache = SQLParquetCache(
231
+ sql_dir=paths.sql,
232
+ parquet_dir=paths.data,
233
+ conn=engine
234
+ )
235
+
236
+ # Load data
237
+ df_sales = cache.get("sales.sql")
238
+
239
+ ```
240
+
241
+ ---
242
+
243
+ ## ⭐ Why Penwings?
244
+
245
+ Penwings sits between:
246
+
247
+ * ad-hoc scripts ❌
248
+ * heavy frameworks ❌
249
+
250
+ It gives you just enough structure to:
251
+
252
+ * stay organized
253
+ * move fast
254
+ * keep workflows reproducible
255
+
256
+ without getting in your way.
@@ -25,28 +25,17 @@ classifiers = [
25
25
  ]
26
26
 
27
27
  dependencies = [
28
- "sqlalchemy>=2.0,<3.0",
29
- "pyodbc>=5.0,<6.0",
30
28
  "pandas>=2.2,<4.0",
31
29
  "numpy>=1.26,<3.0",
32
30
  ]
33
31
 
34
32
  [project.optional-dependencies]
35
- excel = ["openpyxl>=3.1,<4.0"]
36
- ml = ["scikit-learn>=1.4,<2.0"]
37
- scipy = ["scipy>=1.11,<2.0"]
38
- optuna = ["optuna>=3.5,<5.0"]
39
- all = [
40
- "openpyxl>=3.1,<4.0",
41
- "scikit-learn>=1.4,<2.0",
42
- "scipy>=1.11,<2.0",
43
- "optuna>=3.5,<5.0",
44
- ]
45
-
33
+ sql = ["sqlalchemy>=2.0,<3.0"]
46
34
  dev = [
47
35
  "pytest>=8.0",
48
36
  "ruff>=0.3",
49
37
  "mypy>=1.8",
38
+ "sqlalchemy>=2.0,<3.0",
50
39
  ]
51
40
 
52
41
  [project.urls]
@@ -0,0 +1,5 @@
1
+ from .cache import SQLParquetCache
2
+
3
+ __all__ = [
4
+ "SQLParquetCache",
5
+ ]
@@ -1,12 +1,16 @@
1
+ from __future__ import annotations
2
+
1
3
  import pandas as pd
2
4
 
3
- from sqlalchemy import Engine
4
5
  from pathlib import Path
5
6
  from datetime import datetime, timedelta
6
- from typing import Unpack, Optional
7
+ from typing import Unpack, Optional, TYPE_CHECKING
7
8
  from ..utils._typing import SQLParquetKwargs
8
9
  from ..utils._decorators import timing_sql
9
10
 
11
+ if TYPE_CHECKING:
12
+ from sqlalchemy.engine import Engine
13
+
10
14
 
11
15
  class SQLParquetCache:
12
16
  """
@@ -62,6 +66,10 @@ class SQLParquetCache:
62
66
  verbose: bool = True,
63
67
  **kwargs: Unpack[SQLParquetKwargs],
64
68
  ):
69
+ try:
70
+ import sqlalchemy # noqa: F401
71
+ except ImportError:
72
+ raise ImportError("SQLParquetCache requires 'sqlalchemy'. Install it with: pip install penwings[sql]")
65
73
 
66
74
  if sql_dir is not None:
67
75
  self.sql_dir: Path = Path(sql_dir)
@@ -0,0 +1,6 @@
1
+ from .project_paths import ProjectPaths, ConfigPaths
2
+
3
+ __all__ = [
4
+ "ProjectPaths",
5
+ "ConfigPaths",
6
+ ]