nsqip-tools 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nsqip_tools-0.2.0/LICENSE +21 -0
- nsqip_tools-0.2.0/PKG-INFO +252 -0
- nsqip_tools-0.2.0/README.md +222 -0
- nsqip_tools-0.2.0/pyproject.toml +43 -0
- nsqip_tools-0.2.0/setup.cfg +4 -0
- nsqip_tools-0.2.0/src/__init__.py +0 -0
- nsqip_tools-0.2.0/src/nsqip_tools/__init__.py +23 -0
- nsqip_tools-0.2.0/src/nsqip_tools/_internal/__init__.py +6 -0
- nsqip_tools-0.2.0/src/nsqip_tools/_internal/ingest.py +183 -0
- nsqip_tools-0.2.0/src/nsqip_tools/_internal/inspect.py +155 -0
- nsqip_tools-0.2.0/src/nsqip_tools/_internal/memory_utils.py +106 -0
- nsqip_tools-0.2.0/src/nsqip_tools/_internal/transform.py +287 -0
- nsqip_tools-0.2.0/src/nsqip_tools/builder.py +305 -0
- nsqip_tools-0.2.0/src/nsqip_tools/constants.py +118 -0
- nsqip_tools-0.2.0/src/nsqip_tools/data_dictionary.py +750 -0
- nsqip_tools-0.2.0/src/nsqip_tools/query.py +348 -0
- nsqip_tools-0.2.0/src/nsqip_tools.egg-info/PKG-INFO +252 -0
- nsqip_tools-0.2.0/src/nsqip_tools.egg-info/SOURCES.txt +21 -0
- nsqip_tools-0.2.0/src/nsqip_tools.egg-info/dependency_links.txt +1 -0
- nsqip_tools-0.2.0/src/nsqip_tools.egg-info/requires.txt +8 -0
- nsqip_tools-0.2.0/src/nsqip_tools.egg-info/top_level.txt +2 -0
- nsqip_tools-0.2.0/tests/test_builder.py +64 -0
- nsqip_tools-0.2.0/tests/test_query.py +144 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 brant01
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nsqip-tools
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Tools for working with NSQIP surgical quality data
|
|
5
|
+
Author: Jason Brant
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/brant01/nsqip_tools
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/brant01/nsqip_tools/issues
|
|
9
|
+
Project-URL: Source Code, https://github.com/brant01/nsqip_tools
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Healthcare Industry
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: polars>=1.0.0
|
|
23
|
+
Requires-Dist: pyarrow>=15.0.0
|
|
24
|
+
Requires-Dist: psutil>=5.9.0
|
|
25
|
+
Requires-Dist: tqdm>=4.67.1
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
|
|
31
|
+
# NSQIP Tools
|
|
32
|
+
|
|
33
|
+
A Python package for working with National Surgical Quality Improvement Program (NSQIP) data. This package provides tools to convert NSQIP text files into optimized parquet datasets, perform standard data transformations, and query the data efficiently using Polars.
|
|
34
|
+
|
|
35
|
+
## Features
|
|
36
|
+
|
|
37
|
+
- **Data Ingestion**: Convert NSQIP tab-delimited text files to parquet format
|
|
38
|
+
- **Automatic Transformations**: Standard data cleaning and derived variables
|
|
39
|
+
- **Data Verification**: Validate case counts against expected values
|
|
40
|
+
- **Efficient Querying**: Filter by CPT codes, diagnosis codes, years, and more
|
|
41
|
+
- **Data Dictionary**: Auto-generate comprehensive data dictionaries in CSV, JSON, and HTML formats
|
|
42
|
+
- **Memory Efficient**: Designed to work on regular computers with limited RAM
|
|
43
|
+
- **Network Drive Compatible**: Works seamlessly on local or network file systems
|
|
44
|
+
- **Type Safe**: Comprehensive type hints throughout
|
|
45
|
+
|
|
46
|
+
## Installation
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install nsqip-tools
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Quick Start
|
|
53
|
+
|
|
54
|
+
### Building a Dataset
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
import nsqip_tools
|
|
58
|
+
|
|
59
|
+
# Build parquet dataset from NSQIP text files
|
|
60
|
+
result = nsqip_tools.build_parquet_dataset(
|
|
61
|
+
data_dir="/path/to/nsqip/files",
|
|
62
|
+
dataset_type="adult" # or "pediatric"
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
print(f"Dataset created at: {result['parquet_dir']}")
|
|
66
|
+
print(f"Data dictionary at: {result['dictionary']}")
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Querying Data
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
import nsqip_tools
|
|
73
|
+
import polars as pl
|
|
74
|
+
|
|
75
|
+
# Load and filter data
|
|
76
|
+
df = (nsqip_tools.load_data("/path/to/parquet/dataset")
|
|
77
|
+
.filter_by_cpt(["44970", "44979"]) # Laparoscopic procedures
|
|
78
|
+
.filter_by_year([2020, 2021])
|
|
79
|
+
.collect())
|
|
80
|
+
|
|
81
|
+
# Chain with Polars operations
|
|
82
|
+
df = (nsqip_tools.load_data("/path/to/parquet/dataset")
|
|
83
|
+
.filter_by_diagnosis(["K80.20"]) # Gallstones
|
|
84
|
+
.lazy_frame # Access the Polars LazyFrame
|
|
85
|
+
.select(["CASEID", "AGE_AS_INT", "CPT", "OPERYR"])
|
|
86
|
+
.filter(pl.col("AGE_AS_INT") > 50)
|
|
87
|
+
.group_by("CPT")
|
|
88
|
+
.agg(pl.len())
|
|
89
|
+
.collect())
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## API Reference
|
|
93
|
+
|
|
94
|
+
### Building Datasets
|
|
95
|
+
|
|
96
|
+
#### `build_parquet_dataset()`
|
|
97
|
+
|
|
98
|
+
Build an NSQIP parquet dataset from text files with standard transformations.
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
result = nsqip_tools.build_parquet_dataset(
|
|
102
|
+
data_dir, # Path to NSQIP text files
|
|
103
|
+
output_dir=None, # Output directory (defaults to data_dir)
|
|
104
|
+
dataset_type="adult", # "adult" or "pediatric"
|
|
105
|
+
generate_dictionary=True, # Generate data dictionary
|
|
106
|
+
memory_limit="4GB", # Memory limit for operations
|
|
107
|
+
verify_case_counts=True, # Verify case counts match expected
|
|
108
|
+
apply_transforms=True # Apply standard transformations
|
|
109
|
+
)
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
**Returns:** Dictionary with paths to:
|
|
113
|
+
- `parquet_dir`: Parquet dataset directory
|
|
114
|
+
- `dictionary`: Data dictionary CSV file (if generated)
|
|
115
|
+
- `log`: Build log file
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
### Querying Data
|
|
119
|
+
|
|
120
|
+
#### `load_data()`
|
|
121
|
+
|
|
122
|
+
Load NSQIP data from a parquet dataset for querying.
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
query = nsqip_tools.load_data("/path/to/parquet/dataset")
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
#### Filter Methods
|
|
129
|
+
|
|
130
|
+
All filter methods return the query object for chaining:
|
|
131
|
+
|
|
132
|
+
- **`filter_by_cpt(cpt_codes)`**: Filter by CPT procedure codes
|
|
133
|
+
- **`filter_by_diagnosis(diagnosis_codes)`**: Filter by ICD diagnosis codes
|
|
134
|
+
- **`filter_by_year(years)`**: Filter by operation years
|
|
135
|
+
- **`filter_active_variables()`**: Keep only variables with data in most recent year
|
|
136
|
+
- **`select_demographics()`**: Select common demographic variables
|
|
137
|
+
- **`select_outcomes()`**: Select common outcome variables
|
|
138
|
+
|
|
139
|
+
#### Accessing Results
|
|
140
|
+
|
|
141
|
+
- **`.lazy_frame`**: Get the Polars LazyFrame for custom operations
|
|
142
|
+
- **`.collect()`**: Execute query and return Polars DataFrame
|
|
143
|
+
- **`.count()`**: Get count of rows without collecting full data
|
|
144
|
+
- **`.sample(n)`**: Get a random sample of n rows
|
|
145
|
+
- **`.describe()`**: Get summary statistics about the query
|
|
146
|
+
|
|
147
|
+
## Standard Transformations
|
|
148
|
+
|
|
149
|
+
The `build_parquet_dataset()` function automatically applies these transformations:
|
|
150
|
+
|
|
151
|
+
1. **Data Type Conversion**: Identifies and converts numeric columns while preserving categorical codes
|
|
152
|
+
2. **Age Processing**:
|
|
153
|
+
- Keeps original `AGE` column with "90+" values
|
|
154
|
+
- Creates `AGE_AS_INT` (numeric, with 90 for "90+")
|
|
155
|
+
- Creates `AGE_IS_90_PLUS` boolean flag
|
|
156
|
+
3. **CPT Array**: Combines all CPT columns into `ALL_CPT_CODES` array
|
|
157
|
+
4. **Diagnosis Array**: Combines all diagnosis columns into `ALL_DIAGNOSIS_CODES` array
|
|
158
|
+
5. **Race Combination**: Merges `RACE` and `RACE_NEW` into `RACE_COMBINED`
|
|
159
|
+
6. **Work RVU**: Calculates `WORK_RVU_TOTAL` from work RVU columns (adult only)
|
|
160
|
+
7. **Free Flap Indicators**: Derives boolean flags based on CPT codes
|
|
161
|
+
|
|
162
|
+
## Data Dictionary
|
|
163
|
+
|
|
164
|
+
Generated data dictionaries include:
|
|
165
|
+
|
|
166
|
+
- **Column name and data type**
|
|
167
|
+
- **Active status** (has data in most recent year)
|
|
168
|
+
- **Null counts and percentages**
|
|
169
|
+
- **Summary statistics** (numeric: min/max/mean/median, categorical: top values)
|
|
170
|
+
- **Null counts by year** (useful for identifying when variables were added/removed)
|
|
171
|
+
|
|
172
|
+
Available formats:
|
|
173
|
+
- **CSV**: For Excel/spreadsheet users
|
|
174
|
+
- **JSON**: For programmatic access
|
|
175
|
+
- **HTML**: For easy web viewing
|
|
176
|
+
|
|
177
|
+
## Memory Optimization
|
|
178
|
+
|
|
179
|
+
The package is designed for regular computers:
|
|
180
|
+
|
|
181
|
+
- **Automatic memory detection**: Recommends appropriate memory limits based on available RAM
|
|
182
|
+
- **Columnar storage**: Uses parquet format for efficient compression and access
|
|
183
|
+
- **Lazy evaluation**: Polars LazyFrames enable efficient query planning
|
|
184
|
+
- **Streaming support**: Can process datasets larger than available memory
|
|
185
|
+
|
|
186
|
+
```python
|
|
187
|
+
# Check system memory
|
|
188
|
+
mem_info = nsqip_tools.get_memory_info()
|
|
189
|
+
print(f"Total RAM: {mem_info['total']}")
|
|
190
|
+
print(f"Available: {mem_info['available']}")
|
|
191
|
+
print(f"Recommended limit: {mem_info['recommended_limit']}")
|
|
192
|
+
|
|
193
|
+
# Use automatic memory detection (default)
|
|
194
|
+
result = nsqip_tools.build_parquet_dataset(data_dir="/path/to/files")
|
|
195
|
+
|
|
196
|
+
# Or specify custom limit
|
|
197
|
+
result = nsqip_tools.build_parquet_dataset(
|
|
198
|
+
data_dir="/path/to/files",
|
|
199
|
+
memory_limit="8GB"
|
|
200
|
+
)
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
### Safe Data Collection
|
|
204
|
+
|
|
205
|
+
The package includes memory-safe collection to prevent out-of-memory errors:
|
|
206
|
+
|
|
207
|
+
```python
|
|
208
|
+
# Check size before collecting
|
|
209
|
+
query = nsqip_tools.load_data("/path/to/parquet/dataset").filter_by_year([2021])
|
|
210
|
+
info = query.describe()
|
|
211
|
+
print(f"Total rows: {info['total_rows']}")
|
|
212
|
+
print(f"Columns: {info['columns']}")
|
|
213
|
+
|
|
214
|
+
# Use streaming for large datasets
|
|
215
|
+
df = query.collect(streaming=True)
|
|
216
|
+
|
|
217
|
+
# Get a sample for exploration
|
|
218
|
+
sample_df = query.sample(n=10000)
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
## Network Drive Support
|
|
222
|
+
|
|
223
|
+
The package works seamlessly on network drives and file systems that don't support file locking:
|
|
224
|
+
|
|
225
|
+
```python
|
|
226
|
+
# Works on network drives, SMB shares, etc.
|
|
227
|
+
result = nsqip_tools.build_parquet_dataset(
|
|
228
|
+
data_dir="/Volumes/network_drive/nsqip_data",
|
|
229
|
+
output_dir="/Volumes/network_drive/processed"
|
|
230
|
+
)
|
|
231
|
+
|
|
232
|
+
# Query from network location
|
|
233
|
+
query = nsqip_tools.load_data("/Volumes/network_drive/processed/adult_nsqip_parquet")
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
## Data Requirements
|
|
237
|
+
|
|
238
|
+
- NSQIP data files must be tab-delimited text files
|
|
239
|
+
- Files should follow standard NSQIP naming conventions
|
|
240
|
+
- Expected case counts are validated based on official NSQIP documentation
|
|
241
|
+
|
|
242
|
+
## License
|
|
243
|
+
|
|
244
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
245
|
+
|
|
246
|
+
## Contributing
|
|
247
|
+
|
|
248
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
249
|
+
|
|
250
|
+
## Disclaimer
|
|
251
|
+
|
|
252
|
+
This package is not affiliated with or endorsed by the American College of Surgeons National Surgical Quality Improvement Program. Users must obtain NSQIP data through official channels.
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
# NSQIP Tools
|
|
2
|
+
|
|
3
|
+
A Python package for working with National Surgical Quality Improvement Program (NSQIP) data. This package provides tools to convert NSQIP text files into optimized parquet datasets, perform standard data transformations, and query the data efficiently using Polars.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Data Ingestion**: Convert NSQIP tab-delimited text files to parquet format
|
|
8
|
+
- **Automatic Transformations**: Standard data cleaning and derived variables
|
|
9
|
+
- **Data Verification**: Validate case counts against expected values
|
|
10
|
+
- **Efficient Querying**: Filter by CPT codes, diagnosis codes, years, and more
|
|
11
|
+
- **Data Dictionary**: Auto-generate comprehensive data dictionaries in CSV, JSON, and HTML formats
|
|
12
|
+
- **Memory Efficient**: Designed to work on regular computers with limited RAM
|
|
13
|
+
- **Network Drive Compatible**: Works seamlessly on local or network file systems
|
|
14
|
+
- **Type Safe**: Comprehensive type hints throughout
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install nsqip-tools
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Quick Start
|
|
23
|
+
|
|
24
|
+
### Building a Dataset
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
import nsqip_tools
|
|
28
|
+
|
|
29
|
+
# Build parquet dataset from NSQIP text files
|
|
30
|
+
result = nsqip_tools.build_parquet_dataset(
|
|
31
|
+
data_dir="/path/to/nsqip/files",
|
|
32
|
+
dataset_type="adult" # or "pediatric"
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
print(f"Dataset created at: {result['parquet_dir']}")
|
|
36
|
+
print(f"Data dictionary at: {result['dictionary']}")
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### Querying Data
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
import nsqip_tools
|
|
43
|
+
import polars as pl
|
|
44
|
+
|
|
45
|
+
# Load and filter data
|
|
46
|
+
df = (nsqip_tools.load_data("/path/to/parquet/dataset")
|
|
47
|
+
.filter_by_cpt(["44970", "44979"]) # Laparoscopic procedures
|
|
48
|
+
.filter_by_year([2020, 2021])
|
|
49
|
+
.collect())
|
|
50
|
+
|
|
51
|
+
# Chain with Polars operations
|
|
52
|
+
df = (nsqip_tools.load_data("/path/to/parquet/dataset")
|
|
53
|
+
.filter_by_diagnosis(["K80.20"]) # Gallstones
|
|
54
|
+
.lazy_frame # Access the Polars LazyFrame
|
|
55
|
+
.select(["CASEID", "AGE_AS_INT", "CPT", "OPERYR"])
|
|
56
|
+
.filter(pl.col("AGE_AS_INT") > 50)
|
|
57
|
+
.group_by("CPT")
|
|
58
|
+
.agg(pl.len())
|
|
59
|
+
.collect())
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## API Reference
|
|
63
|
+
|
|
64
|
+
### Building Datasets
|
|
65
|
+
|
|
66
|
+
#### `build_parquet_dataset()`
|
|
67
|
+
|
|
68
|
+
Build an NSQIP parquet dataset from text files with standard transformations.
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
result = nsqip_tools.build_parquet_dataset(
|
|
72
|
+
data_dir, # Path to NSQIP text files
|
|
73
|
+
output_dir=None, # Output directory (defaults to data_dir)
|
|
74
|
+
dataset_type="adult", # "adult" or "pediatric"
|
|
75
|
+
generate_dictionary=True, # Generate data dictionary
|
|
76
|
+
memory_limit="4GB", # Memory limit for operations
|
|
77
|
+
verify_case_counts=True, # Verify case counts match expected
|
|
78
|
+
apply_transforms=True # Apply standard transformations
|
|
79
|
+
)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
**Returns:** Dictionary with paths to:
|
|
83
|
+
- `parquet_dir`: Parquet dataset directory
|
|
84
|
+
- `dictionary`: Data dictionary CSV file (if generated)
|
|
85
|
+
- `log`: Build log file
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
### Querying Data
|
|
89
|
+
|
|
90
|
+
#### `load_data()`
|
|
91
|
+
|
|
92
|
+
Load NSQIP data from a parquet dataset for querying.
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
query = nsqip_tools.load_data("/path/to/parquet/dataset")
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
#### Filter Methods
|
|
99
|
+
|
|
100
|
+
All filter methods return the query object for chaining:
|
|
101
|
+
|
|
102
|
+
- **`filter_by_cpt(cpt_codes)`**: Filter by CPT procedure codes
|
|
103
|
+
- **`filter_by_diagnosis(diagnosis_codes)`**: Filter by ICD diagnosis codes
|
|
104
|
+
- **`filter_by_year(years)`**: Filter by operation years
|
|
105
|
+
- **`filter_active_variables()`**: Keep only variables with data in most recent year
|
|
106
|
+
- **`select_demographics()`**: Select common demographic variables
|
|
107
|
+
- **`select_outcomes()`**: Select common outcome variables
|
|
108
|
+
|
|
109
|
+
#### Accessing Results
|
|
110
|
+
|
|
111
|
+
- **`.lazy_frame`**: Get the Polars LazyFrame for custom operations
|
|
112
|
+
- **`.collect()`**: Execute query and return Polars DataFrame
|
|
113
|
+
- **`.count()`**: Get count of rows without collecting full data
|
|
114
|
+
- **`.sample(n)`**: Get a random sample of n rows
|
|
115
|
+
- **`.describe()`**: Get summary statistics about the query
|
|
116
|
+
|
|
117
|
+
## Standard Transformations
|
|
118
|
+
|
|
119
|
+
The `build_parquet_dataset()` function automatically applies these transformations:
|
|
120
|
+
|
|
121
|
+
1. **Data Type Conversion**: Identifies and converts numeric columns while preserving categorical codes
|
|
122
|
+
2. **Age Processing**:
|
|
123
|
+
- Keeps original `AGE` column with "90+" values
|
|
124
|
+
- Creates `AGE_AS_INT` (numeric, with 90 for "90+")
|
|
125
|
+
- Creates `AGE_IS_90_PLUS` boolean flag
|
|
126
|
+
3. **CPT Array**: Combines all CPT columns into `ALL_CPT_CODES` array
|
|
127
|
+
4. **Diagnosis Array**: Combines all diagnosis columns into `ALL_DIAGNOSIS_CODES` array
|
|
128
|
+
5. **Race Combination**: Merges `RACE` and `RACE_NEW` into `RACE_COMBINED`
|
|
129
|
+
6. **Work RVU**: Calculates `WORK_RVU_TOTAL` from work RVU columns (adult only)
|
|
130
|
+
7. **Free Flap Indicators**: Derives boolean flags based on CPT codes
|
|
131
|
+
|
|
132
|
+
## Data Dictionary
|
|
133
|
+
|
|
134
|
+
Generated data dictionaries include:
|
|
135
|
+
|
|
136
|
+
- **Column name and data type**
|
|
137
|
+
- **Active status** (has data in most recent year)
|
|
138
|
+
- **Null counts and percentages**
|
|
139
|
+
- **Summary statistics** (numeric: min/max/mean/median, categorical: top values)
|
|
140
|
+
- **Null counts by year** (useful for identifying when variables were added/removed)
|
|
141
|
+
|
|
142
|
+
Available formats:
|
|
143
|
+
- **CSV**: For Excel/spreadsheet users
|
|
144
|
+
- **JSON**: For programmatic access
|
|
145
|
+
- **HTML**: For easy web viewing
|
|
146
|
+
|
|
147
|
+
## Memory Optimization
|
|
148
|
+
|
|
149
|
+
The package is designed for regular computers:
|
|
150
|
+
|
|
151
|
+
- **Automatic memory detection**: Recommends appropriate memory limits based on available RAM
|
|
152
|
+
- **Columnar storage**: Uses parquet format for efficient compression and access
|
|
153
|
+
- **Lazy evaluation**: Polars LazyFrames enable efficient query planning
|
|
154
|
+
- **Streaming support**: Can process datasets larger than available memory
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
# Check system memory
|
|
158
|
+
mem_info = nsqip_tools.get_memory_info()
|
|
159
|
+
print(f"Total RAM: {mem_info['total']}")
|
|
160
|
+
print(f"Available: {mem_info['available']}")
|
|
161
|
+
print(f"Recommended limit: {mem_info['recommended_limit']}")
|
|
162
|
+
|
|
163
|
+
# Use automatic memory detection (default)
|
|
164
|
+
result = nsqip_tools.build_parquet_dataset(data_dir="/path/to/files")
|
|
165
|
+
|
|
166
|
+
# Or specify custom limit
|
|
167
|
+
result = nsqip_tools.build_parquet_dataset(
|
|
168
|
+
data_dir="/path/to/files",
|
|
169
|
+
memory_limit="8GB"
|
|
170
|
+
)
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
### Safe Data Collection
|
|
174
|
+
|
|
175
|
+
The package includes memory-safe collection to prevent out-of-memory errors:
|
|
176
|
+
|
|
177
|
+
```python
|
|
178
|
+
# Check size before collecting
|
|
179
|
+
query = nsqip_tools.load_data("/path/to/parquet/dataset").filter_by_year([2021])
|
|
180
|
+
info = query.describe()
|
|
181
|
+
print(f"Total rows: {info['total_rows']}")
|
|
182
|
+
print(f"Columns: {info['columns']}")
|
|
183
|
+
|
|
184
|
+
# Use streaming for large datasets
|
|
185
|
+
df = query.collect(streaming=True)
|
|
186
|
+
|
|
187
|
+
# Get a sample for exploration
|
|
188
|
+
sample_df = query.sample(n=10000)
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## Network Drive Support
|
|
192
|
+
|
|
193
|
+
The package works seamlessly on network drives and file systems that don't support file locking:
|
|
194
|
+
|
|
195
|
+
```python
|
|
196
|
+
# Works on network drives, SMB shares, etc.
|
|
197
|
+
result = nsqip_tools.build_parquet_dataset(
|
|
198
|
+
data_dir="/Volumes/network_drive/nsqip_data",
|
|
199
|
+
output_dir="/Volumes/network_drive/processed"
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
# Query from network location
|
|
203
|
+
query = nsqip_tools.load_data("/Volumes/network_drive/processed/adult_nsqip_parquet")
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
## Data Requirements
|
|
207
|
+
|
|
208
|
+
- NSQIP data files must be tab-delimited text files
|
|
209
|
+
- Files should follow standard NSQIP naming conventions
|
|
210
|
+
- Expected case counts are validated based on official NSQIP documentation
|
|
211
|
+
|
|
212
|
+
## License
|
|
213
|
+
|
|
214
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
215
|
+
|
|
216
|
+
## Contributing
|
|
217
|
+
|
|
218
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
219
|
+
|
|
220
|
+
## Disclaimer
|
|
221
|
+
|
|
222
|
+
This package is not affiliated with or endorsed by the American College of Surgeons National Surgical Quality Improvement Program. Users must obtain NSQIP data through official channels.
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "nsqip-tools"
|
|
3
|
+
version = "0.2.0"
|
|
4
|
+
description = "Tools for working with NSQIP surgical quality data"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.10"
|
|
7
|
+
license = {text = "MIT"}
|
|
8
|
+
authors = [
|
|
9
|
+
{name = "Jason Brant"},
|
|
10
|
+
]
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Development Status :: 4 - Beta",
|
|
13
|
+
"Intended Audience :: Healthcare Industry",
|
|
14
|
+
"Intended Audience :: Science/Research",
|
|
15
|
+
"License :: OSI Approved :: MIT License",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3.10",
|
|
18
|
+
"Programming Language :: Python :: 3.11",
|
|
19
|
+
"Programming Language :: Python :: 3.12",
|
|
20
|
+
"Topic :: Scientific/Engineering :: Medical Science Apps.",
|
|
21
|
+
]
|
|
22
|
+
dependencies = [
|
|
23
|
+
"polars>=1.0.0",
|
|
24
|
+
"pyarrow>=15.0.0",
|
|
25
|
+
"psutil>=5.9.0",
|
|
26
|
+
"tqdm>=4.67.1",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
[project.optional-dependencies]
|
|
30
|
+
dev = [
|
|
31
|
+
"pytest>=7.0.0",
|
|
32
|
+
"pytest-cov>=4.0.0",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
[project.urls]
|
|
36
|
+
"Homepage" = "https://github.com/brant01/nsqip_tools"
|
|
37
|
+
"Bug Tracker" = "https://github.com/brant01/nsqip_tools/issues"
|
|
38
|
+
"Source Code" = "https://github.com/brant01/nsqip_tools"
|
|
39
|
+
|
|
40
|
+
[dependency-groups]
|
|
41
|
+
dev = [
|
|
42
|
+
"pytest>=8.3.5",
|
|
43
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""NSQIP Tools: A Python package for working with NSQIP surgical data.
|
|
2
|
+
|
|
3
|
+
This package provides tools for ingesting, transforming, and querying
|
|
4
|
+
National Surgical Quality Improvement Program (NSQIP) data using Polars
|
|
5
|
+
and parquet datasets.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .query import load_data, NSQIPQuery
|
|
9
|
+
from .builder import build_parquet_dataset
|
|
10
|
+
from ._internal.memory_utils import get_memory_info, get_recommended_memory_limit
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"build_parquet_dataset",
|
|
14
|
+
"load_data",
|
|
15
|
+
"NSQIPQuery",
|
|
16
|
+
"get_memory_info",
|
|
17
|
+
"get_recommended_memory_limit",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
__version__ = "0.2.0"  # keep in sync with [project].version in pyproject.toml
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
|