dr-dasci 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dr_dasci-0.1.0/.gitattributes +2 -0
- dr_dasci-0.1.0/.github/workflows/ci.yml +30 -0
- dr_dasci-0.1.0/.github/workflows/publish.yml +27 -0
- dr_dasci-0.1.0/.gitignore +31 -0
- dr_dasci-0.1.0/LICENSE +21 -0
- dr_dasci-0.1.0/PKG-INFO +506 -0
- dr_dasci-0.1.0/README.md +441 -0
- dr_dasci-0.1.0/docs/FINDINGS.md +116 -0
- dr_dasci-0.1.0/drdasci.png +0 -0
- dr_dasci-0.1.0/pyproject.toml +88 -0
- dr_dasci-0.1.0/src/dr_dasci/__init__.py +30 -0
- dr_dasci-0.1.0/src/dr_dasci/cli.py +25 -0
- dr_dasci-0.1.0/src/dr_dasci/config.py +25 -0
- dr_dasci-0.1.0/src/dr_dasci/core.py +1157 -0
- dr_dasci-0.1.0/src/dr_dasci/py.typed +1 -0
- dr_dasci-0.1.0/src/dr_dasci/report.py +136 -0
- dr_dasci-0.1.0/tests/test_cli.py +10 -0
- dr_dasci-0.1.0/tests/test_diagnose.py +80 -0
- dr_dasci-0.1.0/tests/test_optional_integrations.py +52 -0
- dr_dasci-0.1.0/tests/test_report.py +83 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
test:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
strategy:
|
|
12
|
+
matrix:
|
|
13
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
- uses: actions/setup-python@v5
|
|
17
|
+
with:
|
|
18
|
+
python-version: ${{ matrix.python-version }}
|
|
19
|
+
- name: Install package
|
|
20
|
+
run: python -m pip install --upgrade pip && python -m pip install -e ".[dev]"
|
|
21
|
+
- name: Lint
|
|
22
|
+
run: ruff check .
|
|
23
|
+
- name: Type check
|
|
24
|
+
run: mypy src
|
|
25
|
+
- name: Test
|
|
26
|
+
run: pytest
|
|
27
|
+
- name: Build
|
|
28
|
+
run: python -m build
|
|
29
|
+
- name: Check distribution
|
|
30
|
+
run: twine check dist/*
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
contents: read
|
|
10
|
+
id-token: write
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
publish:
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
environment: pypi
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
- uses: actions/setup-python@v5
|
|
19
|
+
with:
|
|
20
|
+
python-version: "3.12"
|
|
21
|
+
- name: Build distribution
|
|
22
|
+
run: |
|
|
23
|
+
python -m pip install --upgrade pip build twine
|
|
24
|
+
python -m build
|
|
25
|
+
twine check dist/*
|
|
26
|
+
- name: Publish distribution
|
|
27
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Python caches
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# Test and coverage caches
|
|
7
|
+
.pytest_cache/
|
|
8
|
+
.coverage
|
|
9
|
+
.coverage.*
|
|
10
|
+
htmlcov/
|
|
11
|
+
|
|
12
|
+
# Build artifacts
|
|
13
|
+
build/
|
|
14
|
+
dist/
|
|
15
|
+
*.egg-info/
|
|
16
|
+
|
|
17
|
+
# Virtual environments
|
|
18
|
+
.venv/
|
|
19
|
+
venv/
|
|
20
|
+
env/
|
|
21
|
+
|
|
22
|
+
# Tool caches
|
|
23
|
+
.ruff_cache/
|
|
24
|
+
.mypy_cache/
|
|
25
|
+
.pyre/
|
|
26
|
+
|
|
27
|
+
# Local editor and OS files
|
|
28
|
+
.vscode/
|
|
29
|
+
.idea/
|
|
30
|
+
.DS_Store
|
|
31
|
+
Thumbs.db
|
dr_dasci-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 JadeyGraham96
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
dr_dasci-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,506 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dr-dasci
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Automatic diagnosis for pandas, Polars, and NumPy data pipelines.
|
|
5
|
+
Project-URL: Documentation, https://github.com/Arkay92/dr-dasci#readme
|
|
6
|
+
Project-URL: Homepage, https://github.com/Arkay92/dr-dasci
|
|
7
|
+
Project-URL: Issues, https://github.com/Arkay92/dr-dasci/issues
|
|
8
|
+
Project-URL: Repository, https://github.com/Arkay92/dr-dasci
|
|
9
|
+
Author: Arkay92
|
|
10
|
+
Maintainer: Arkay92
|
|
11
|
+
License: MIT License
|
|
12
|
+
|
|
13
|
+
Copyright (c) 2026 JadeyGraham96
|
|
14
|
+
|
|
15
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
16
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
17
|
+
in the Software without restriction, including without limitation the rights
|
|
18
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
19
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
20
|
+
furnished to do so, subject to the following conditions:
|
|
21
|
+
|
|
22
|
+
The above copyright notice and this permission notice shall be included in all
|
|
23
|
+
copies or substantial portions of the Software.
|
|
24
|
+
|
|
25
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
26
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
27
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
28
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
29
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
30
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
31
|
+
SOFTWARE.
|
|
32
|
+
License-File: LICENSE
|
|
33
|
+
Keywords: dataframe,diagnostics,memory,numpy,pandas,polars,profiling,pyarrow
|
|
34
|
+
Classifier: Development Status :: 3 - Alpha
|
|
35
|
+
Classifier: Intended Audience :: Developers
|
|
36
|
+
Classifier: Intended Audience :: Science/Research
|
|
37
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
38
|
+
Classifier: Programming Language :: Python :: 3
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
41
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
42
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
43
|
+
Classifier: Topic :: Scientific/Engineering
|
|
44
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
45
|
+
Classifier: Typing :: Typed
|
|
46
|
+
Requires-Python: >=3.10
|
|
47
|
+
Provides-Extra: all
|
|
48
|
+
Requires-Dist: numpy>=1.23; extra == 'all'
|
|
49
|
+
Requires-Dist: pandas>=1.5; extra == 'all'
|
|
50
|
+
Requires-Dist: polars>=0.20; extra == 'all'
|
|
51
|
+
Requires-Dist: pyarrow>=12; extra == 'all'
|
|
52
|
+
Provides-Extra: dev
|
|
53
|
+
Requires-Dist: build>=1.2; extra == 'dev'
|
|
54
|
+
Requires-Dist: mypy>=1.10; extra == 'dev'
|
|
55
|
+
Requires-Dist: pytest>=8; extra == 'dev'
|
|
56
|
+
Requires-Dist: ruff>=0.6; extra == 'dev'
|
|
57
|
+
Requires-Dist: twine>=5; extra == 'dev'
|
|
58
|
+
Provides-Extra: numpy
|
|
59
|
+
Requires-Dist: numpy>=1.23; extra == 'numpy'
|
|
60
|
+
Provides-Extra: pandas
|
|
61
|
+
Requires-Dist: pandas>=1.5; extra == 'pandas'
|
|
62
|
+
Provides-Extra: polars
|
|
63
|
+
Requires-Dist: polars>=0.20; extra == 'polars'
|
|
64
|
+
Description-Content-Type: text/markdown
|
|
65
|
+
|
|
66
|
+
# dr-dasci
|
|
67
|
+
|
|
68
|
+
<p align="center">
|
|
69
|
+
Automatic diagnostics for pandas, Polars, NumPy, and Parquet data pipelines.
|
|
70
|
+
</p>
|
|
71
|
+
|
|
72
|
+
<p align="center">
|
|
73
|
+
<img width="256" height="256" alt="dr-dasci Logo" src="https://raw.githubusercontent.com/Arkay92/dr-dasci/main/drdasci.png" />
|
|
74
|
+
</p>
|
|
75
|
+
|
|
76
|
+
<p align="center">
|
|
77
|
+
<a href="https://github.com/Arkay92/dr-dasci/actions/workflows/publish.yml"><img alt="Publish" src="https://github.com/Arkay92/dr-dasci/actions/workflows/publish.yml/badge.svg" /></a>
|
|
78
|
+
<a href="https://pypi.org/project/dr-dasci/"><img alt="PyPI" src="https://img.shields.io/pypi/v/dr-dasci.svg" /></a>
|
|
79
|
+
<img alt="Python" src="https://img.shields.io/pypi/pyversions/dr-dasci.svg" />
|
|
80
|
+
<img alt="Downloads" src="https://img.shields.io/pypi/dm/dr-dasci.svg" />
|
|
81
|
+
<img alt="License" src="https://img.shields.io/pypi/l/dr-dasci.svg" />
|
|
82
|
+
</p>
|
|
83
|
+
|
|
84
|
+
**dr-dasci** combines:
|
|
85
|
+
- **Dataframe diagnostics** for pandas-like and Polars-like objects.
|
|
86
|
+
- **Array diagnostics** for NumPy memory layout, dtype, and copy risks.
|
|
87
|
+
- **Operation preflight checks** for joins, groupbys, pivots, conversions, and Parquet reads.
|
|
88
|
+
- **Configurable thresholds** for laptop, CI, and server memory budgets.
|
|
89
|
+
- **Machine-readable reports** with stable finding codes, metadata, and JSON export.
|
|
90
|
+
- **Safe execution plans** for large tabular transformations.
|
|
91
|
+
- **Optional dependencies** so the base package stays lightweight.
|
|
92
|
+
|
|
93
|
+
---
|
|
94
|
+
|
|
95
|
+
## Why Diagnostics for Data Pipelines?
|
|
96
|
+
|
|
97
|
+
pandas, Polars, NumPy, and Arrow are powerful, but many expensive operations look
|
|
98
|
+
cheap at the call site. `dr-dasci` inspects the data up front, following this flow:
|
|
99
|
+
|
|
100
|
+
```
|
|
101
|
+
Input object or file
|
|
102
|
+
-> Detect runtime and shape
|
|
103
|
+
-> Inspect dtypes, indexes, memory, cardinality, and layout
|
|
104
|
+
-> Estimate operation-specific peak memory
|
|
105
|
+
|
|
106
|
+
-> Report findings with stable codes
|
|
107
|
+
-> Suggest safer dtypes and execution plans
|
|
108
|
+
-> Export text or JSON for notebooks, CI, and logs
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
`dr-dasci` is designed to catch common problems before they become production
|
|
112
|
+
failures:
|
|
113
|
+
- **Hidden copies** from pandas object blocks, index alignment, and NumPy views.
|
|
114
|
+
- **Memory blowups** in joins, groupbys, pivots, unstack, fillna, and conversions.
|
|
115
|
+
- **String dtype traps** where `object`, high-cardinality text, or repeated labels
|
|
116
|
+
need different treatment.
|
|
117
|
+
- **Parquet-to-pandas expansion** when encoded Arrow data becomes pandas blocks.
|
|
118
|
+
- **Join cardinality surprises** from duplicate keys, null keys, and many-to-many
|
|
119
|
+
merges.
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## Architecture
|
|
124
|
+
|
|
125
|
+
```
|
|
126
|
+
DataFrame / ndarray / file path
|
|
127
|
+
|
|
|
128
|
+
v
|
|
129
|
+
Adapter detection
|
|
130
|
+
- pandas DataFrame
|
|
131
|
+
- Polars DataFrame / LazyFrame
|
|
132
|
+
- NumPy ndarray
|
|
133
|
+
- dataframe-like fallback
|
|
134
|
+
- Parquet metadata reader
|
|
135
|
+
|
|
|
136
|
+
v
|
|
137
|
+
Diagnostics
|
|
138
|
+
- shape and memory estimates
|
|
139
|
+
- dtype and cardinality checks
|
|
140
|
+
- pandas index/copy-risk checks
|
|
141
|
+
- NumPy layout checks
|
|
142
|
+
- join/groupby/pivot/conversion preflight
|
|
143
|
+
|
|
|
144
|
+
v
|
|
145
|
+
DoctorReport
|
|
146
|
+
- human-readable show()
|
|
147
|
+
- suggestions via suggest()
|
|
148
|
+
- safe_execution_plan()
|
|
149
|
+
- machine-readable to_dict() / to_json()
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
---
|
|
153
|
+
|
|
154
|
+
## Install
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
pip install dr-dasci
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
For pandas support:
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
pip install "dr-dasci[pandas]"
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
For Polars support:
|
|
167
|
+
|
|
168
|
+
```bash
|
|
169
|
+
pip install "dr-dasci[polars]"
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
For all optional dataframe, array, and Parquet support:
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
pip install "dr-dasci[all]"
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
For development:
|
|
179
|
+
|
|
180
|
+
```bash
|
|
181
|
+
pip install -e ".[dev,all]"
|
|
182
|
+
pytest -q
|
|
183
|
+
python -m build
|
|
184
|
+
twine check dist/*
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## Quick Start
|
|
190
|
+
|
|
191
|
+
### Basic Diagnosis
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
from dr_dasci import diagnose
|
|
195
|
+
|
|
196
|
+
report = diagnose(df, name="orders")
|
|
197
|
+
|
|
198
|
+
report.show()
|
|
199
|
+
print(report.suggest())
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
### Machine-Readable Output
|
|
203
|
+
|
|
204
|
+
```python
|
|
205
|
+
from dr_dasci import diagnose
|
|
206
|
+
|
|
207
|
+
report = diagnose(df)
|
|
208
|
+
|
|
209
|
+
payload = report.to_dict()
|
|
210
|
+
json_text = report.to_json()
|
|
211
|
+
|
|
212
|
+
print(payload["findings"][0]["code"])
|
|
213
|
+
print(json_text)
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
### Safe Execution Plan
|
|
217
|
+
|
|
218
|
+
```python
|
|
219
|
+
report = diagnose(df, name="events")
|
|
220
|
+
|
|
221
|
+
for step in report.safe_execution_plan():
|
|
222
|
+
print(step)
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
### Configurable Thresholds
|
|
226
|
+
|
|
227
|
+
```python
|
|
228
|
+
from dr_dasci import DoctorConfig, diagnose
|
|
229
|
+
|
|
230
|
+
config = DoctorConfig(
|
|
231
|
+
available_memory_bytes=8_000_000_000,
|
|
232
|
+
large_memory_bytes=1_500_000_000,
|
|
233
|
+
expensive_column_bytes=150_000_000,
|
|
234
|
+
)
|
|
235
|
+
|
|
236
|
+
report = diagnose(df, config=config)
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
---
|
|
240
|
+
|
|
241
|
+
## CLI
|
|
242
|
+
|
|
243
|
+
Inspect a local data file:
|
|
244
|
+
|
|
245
|
+
```bash
|
|
246
|
+
dr-dasci inspect data.parquet
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
Emit JSON:
|
|
250
|
+
|
|
251
|
+
```bash
|
|
252
|
+
dr-dasci inspect data.parquet --json
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
---
|
|
256
|
+
|
|
257
|
+
## Main Features
|
|
258
|
+
|
|
259
|
+
### 1. **Dataframe Diagnosis**
|
|
260
|
+
|
|
261
|
+
Detect expensive object columns, large shapes, numeric downcast candidates,
|
|
262
|
+
nullable dtype candidates, and pandas index risks:
|
|
263
|
+
|
|
264
|
+
```python
|
|
265
|
+
from dr_dasci import diagnose
|
|
266
|
+
|
|
267
|
+
report = diagnose(df)
|
|
268
|
+
report.show()
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
Common finding codes include:
|
|
272
|
+
|
|
273
|
+
- `EXPENSIVE_OBJECT_COLUMN`
|
|
274
|
+
- `DOWNSIZE_NUMERIC_CANDIDATE`
|
|
275
|
+
- `DUPLICATE_INDEX`
|
|
276
|
+
- `NON_MONOTONIC_INDEX`
|
|
277
|
+
- `PANDAS_OBJECT_BLOCK_COPY_RISK`
|
|
278
|
+
- `PANDAS_ALIGNMENT_COPY_RISK`
|
|
279
|
+
|
|
280
|
+
### 2. **Join Preflight**
|
|
281
|
+
|
|
282
|
+
Estimate join cardinality, null-key risk, many-to-many risk, and peak memory:
|
|
283
|
+
|
|
284
|
+
```python
|
|
285
|
+
from dr_dasci import diagnose_join
|
|
286
|
+
|
|
287
|
+
report = diagnose_join(left, right, on="customer_id", how="left")
|
|
288
|
+
report.show()
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
### 3. **Groupby Preflight**
|
|
292
|
+
|
|
293
|
+
Check high-cardinality grouping keys and aggregation memory pressure:
|
|
294
|
+
|
|
295
|
+
```python
|
|
296
|
+
from dr_dasci import diagnose_groupby
|
|
297
|
+
|
|
298
|
+
report = diagnose_groupby(events, by=["account_id", "event_day"])
|
|
299
|
+
print(report.risky_operations(minimum="high"))
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
### 4. **Pivot and Unstack Preflight**
|
|
303
|
+
|
|
304
|
+
Estimate dense expansion before reshaping:
|
|
305
|
+
|
|
306
|
+
```python
|
|
307
|
+
from dr_dasci import diagnose_pivot
|
|
308
|
+
|
|
309
|
+
report = diagnose_pivot(df, index="user_id", columns="event_type")
|
|
310
|
+
report.show()
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
### 5. **Conversion Diagnostics**
|
|
314
|
+
|
|
315
|
+
Preflight conversion costs between pandas, Polars, NumPy, and Arrow-backed data:
|
|
316
|
+
|
|
317
|
+
```python
|
|
318
|
+
from dr_dasci import diagnose_conversion
|
|
319
|
+
|
|
320
|
+
report = diagnose_conversion(df, target="pandas")
|
|
321
|
+
print(report.to_json())
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
### 6. **Parquet Metadata Diagnostics**
|
|
325
|
+
|
|
326
|
+
Inspect Parquet row groups, column counts, and pandas conversion risk without
|
|
327
|
+
loading the full dataset:
|
|
328
|
+
|
|
329
|
+
```python
|
|
330
|
+
from dr_dasci import diagnose_parquet
|
|
331
|
+
|
|
332
|
+
report = diagnose_parquet("events.parquet")
|
|
333
|
+
report.show()
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
### 7. **NumPy Copy-Risk Checks**
|
|
337
|
+
|
|
338
|
+
Catch object arrays and non-contiguous views:
|
|
339
|
+
|
|
340
|
+
```python
|
|
341
|
+
from dr_dasci import diagnose
|
|
342
|
+
|
|
343
|
+
report = diagnose(array)
|
|
344
|
+
report.show()
|
|
345
|
+
```
|
|
346
|
+
|
|
347
|
+
### 8. **Stable Finding Codes**
|
|
348
|
+
|
|
349
|
+
Every finding includes a stable `code`, `severity`, `suggestion`, optional
|
|
350
|
+
`column`, documentation URL, and metadata:
|
|
351
|
+
|
|
352
|
+
```python
|
|
353
|
+
for finding in report.findings:
|
|
354
|
+
print(finding.code, finding.severity, finding.metadata)
|
|
355
|
+
```
|
|
356
|
+
|
|
357
|
+
See [docs/FINDINGS.md](https://github.com/Arkay92/dr-dasci/blob/main/docs/FINDINGS.md) for the finding catalog.
|
|
358
|
+
|
|
359
|
+
---
|
|
360
|
+
|
|
361
|
+
## Configuration
|
|
362
|
+
|
|
363
|
+
Tune behavior via `DoctorConfig`:
|
|
364
|
+
|
|
365
|
+
```python
|
|
366
|
+
from dr_dasci import DoctorConfig
|
|
367
|
+
|
|
368
|
+
config = DoctorConfig(
|
|
369
|
+
large_memory_bytes=1_000_000_000,
|
|
370
|
+
expensive_column_bytes=100_000_000,
|
|
371
|
+
large_cell_count=50_000_000,
|
|
372
|
+
large_rows=1_000_000,
|
|
373
|
+
very_large_rows=5_000_000,
|
|
374
|
+
pivot_row_warning=250_000,
|
|
375
|
+
pivot_width_warning=25,
|
|
376
|
+
join_high_memory_bytes=500_000_000,
|
|
377
|
+
low_cardinality_ratio=0.2,
|
|
378
|
+
low_cardinality_max_unique=50_000,
|
|
379
|
+
high_cardinality_ratio=0.8,
|
|
380
|
+
index_warning_rows=100_000,
|
|
381
|
+
available_memory_bytes=None,
|
|
382
|
+
)
|
|
383
|
+
```
|
|
384
|
+
|
|
385
|
+
---
|
|
386
|
+
|
|
387
|
+
## Examples
|
|
388
|
+
|
|
389
|
+
```python
|
|
390
|
+
from dr_dasci import diagnose, diagnose_join
|
|
391
|
+
|
|
392
|
+
orders_report = diagnose(orders, name="orders")
|
|
393
|
+
customers_join_report = diagnose_join(orders, customers, on="customer_id")
|
|
394
|
+
|
|
395
|
+
orders_report.show()
|
|
396
|
+
customers_join_report.show()
|
|
397
|
+
```
|
|
398
|
+
|
|
399
|
+
```bash
|
|
400
|
+
dr-dasci inspect warehouse/orders.parquet --json
|
|
401
|
+
```
|
|
402
|
+
|
|
403
|
+
---
|
|
404
|
+
|
|
405
|
+
## Project Structure
|
|
406
|
+
|
|
407
|
+
```
|
|
408
|
+
src/dr_dasci/
|
|
409
|
+
__init__.py # Public API
|
|
410
|
+
config.py # DoctorConfig thresholds
|
|
411
|
+
core.py # Diagnostics and operation preflight helpers
|
|
412
|
+
report.py # DoctorReport, findings, estimates, JSON export
|
|
413
|
+
cli.py # Command-line interface
|
|
414
|
+
py.typed # Typing marker
|
|
415
|
+
docs/
|
|
416
|
+
FINDINGS.md # Stable finding-code catalog
|
|
417
|
+
tests/
|
|
418
|
+
test_*.py # Unit and optional integration tests
|
|
419
|
+
.github/
|
|
420
|
+
workflows/
|
|
421
|
+
ci.yml # Lint, type check, tests, build, twine check
|
|
422
|
+
publish.yml # PyPI publishing workflow
|
|
423
|
+
pyproject.toml # Project metadata and dependencies
|
|
424
|
+
drdasci.png # Project logo
|
|
425
|
+
```
|
|
426
|
+
|
|
427
|
+
---
|
|
428
|
+
|
|
429
|
+
## Development
|
|
430
|
+
|
|
431
|
+
```bash
|
|
432
|
+
# Install with dev extras
|
|
433
|
+
pip install -e ".[dev,all]"
|
|
434
|
+
|
|
435
|
+
# Lint
|
|
436
|
+
ruff check .
|
|
437
|
+
|
|
438
|
+
# Type check
|
|
439
|
+
mypy src
|
|
440
|
+
|
|
441
|
+
# Run tests
|
|
442
|
+
pytest -q
|
|
443
|
+
|
|
444
|
+
# Build package
|
|
445
|
+
python -m build
|
|
446
|
+
|
|
447
|
+
# Check distributions
|
|
448
|
+
twine check dist/*
|
|
449
|
+
```
|
|
450
|
+
|
|
451
|
+
---
|
|
452
|
+
|
|
453
|
+
## Limitations and Future Work
|
|
454
|
+
|
|
455
|
+
- **Heuristic estimates**: Memory estimates are intentionally conservative, not
|
|
456
|
+
exact profiler measurements.
|
|
457
|
+
- **Optional integrations**: pandas, Polars, NumPy, and pyarrow are optional and
|
|
458
|
+
required only for their corresponding diagnostics.
|
|
459
|
+
- **File support**: Parquet is metadata-first; CSV diagnostics currently sample
|
|
460
|
+
with pandas when available.
|
|
461
|
+
- **Operation plans**: Preflight helpers do not execute transformations.
|
|
462
|
+
|
|
463
|
+
Future improvements:
|
|
464
|
+
- Runtime instrumentation for actual peak memory.
|
|
465
|
+
- Dask, DuckDB, Spark, and Arrow Dataset diagnostics.
|
|
466
|
+
- Notebook widgets for interactive report inspection.
|
|
467
|
+
- Suppression config for known accepted finding codes.
|
|
468
|
+
- Richer Parquet encoding and row-group recommendations.
|
|
469
|
+
|
|
470
|
+
---
|
|
471
|
+
|
|
472
|
+
## License
|
|
473
|
+
|
|
474
|
+
MIT
|
|
475
|
+
|
|
476
|
+
---
|
|
477
|
+
|
|
478
|
+
## Contributing
|
|
479
|
+
|
|
480
|
+
Contributions are welcome. Open an issue with a reproducible dataframe shape,
|
|
481
|
+
dtypes, operation, and observed memory or runtime behavior.
|
|
482
|
+
|
|
483
|
+
---
|
|
484
|
+
|
|
485
|
+
## Citation
|
|
486
|
+
|
|
487
|
+
If you use dr-dasci in research, please cite:
|
|
488
|
+
|
|
489
|
+
```bibtex
|
|
490
|
+
@software{drdasci2026,
|
|
491
|
+
title={dr-dasci: Automatic Diagnostics for Data Science Pipelines},
|
|
492
|
+
author={Arkay92},
|
|
493
|
+
url={https://github.com/Arkay92/dr-dasci},
|
|
494
|
+
year={2026},
|
|
495
|
+
version={0.1.0},
|
|
496
|
+
}
|
|
497
|
+
```
|
|
498
|
+
|
|
499
|
+
---
|
|
500
|
+
|
|
501
|
+
## Acknowledgments
|
|
502
|
+
|
|
503
|
+
- [pandas](https://pandas.pydata.org/) for dataframe analytics.
|
|
504
|
+
- [Polars](https://pola.rs/) for high-performance dataframe execution.
|
|
505
|
+
- [NumPy](https://numpy.org/) for array computing.
|
|
506
|
+
- [Apache Arrow](https://arrow.apache.org/) for columnar memory and Parquet tooling.
|