nsqip-tools 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 brant01
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,252 @@
1
+ Metadata-Version: 2.4
2
+ Name: nsqip-tools
3
+ Version: 0.2.0
4
+ Summary: Tools for working with NSQIP surgical quality data
5
+ Author: Jason Brant
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/brant01/nsqip_tools
8
+ Project-URL: Bug Tracker, https://github.com/brant01/nsqip_tools/issues
9
+ Project-URL: Source Code, https://github.com/brant01/nsqip_tools
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Healthcare Industry
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
19
+ Requires-Python: >=3.10
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: polars>=1.0.0
23
+ Requires-Dist: pyarrow>=15.0.0
24
+ Requires-Dist: psutil>=5.9.0
25
+ Requires-Dist: tqdm>=4.67.1
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
28
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
29
+ Dynamic: license-file
30
+
31
+ # NSQIP Tools
32
+
33
+ A Python package for working with National Surgical Quality Improvement Program (NSQIP) data. This package provides tools to convert NSQIP text files into optimized parquet datasets, perform standard data transformations, and query the data efficiently using Polars.
34
+
35
+ ## Features
36
+
37
+ - **Data Ingestion**: Convert NSQIP tab-delimited text files to parquet format
38
+ - **Automatic Transformations**: Standard data cleaning and derived variables
39
+ - **Data Verification**: Validate case counts against expected values
40
+ - **Efficient Querying**: Filter by CPT codes, diagnosis codes, years, and more
41
+ - **Data Dictionary**: Auto-generate comprehensive data dictionaries in CSV, JSON, and HTML formats
42
+ - **Memory Efficient**: Designed to work on regular computers with limited RAM
43
+ - **Network Drive Compatible**: Works seamlessly on local or network file systems
44
+ - **Type Safe**: Comprehensive type hints throughout
45
+
46
+ ## Installation
47
+
48
+ ```bash
49
+ pip install nsqip-tools
50
+ ```
51
+
52
+ ## Quick Start
53
+
54
+ ### Building a Dataset
55
+
56
+ ```python
57
+ import nsqip_tools
58
+
59
+ # Build parquet dataset from NSQIP text files
60
+ result = nsqip_tools.build_parquet_dataset(
61
+ data_dir="/path/to/nsqip/files",
62
+ dataset_type="adult" # or "pediatric"
63
+ )
64
+
65
+ print(f"Dataset created at: {result['parquet_dir']}")
66
+ print(f"Data dictionary at: {result['dictionary']}")
67
+ ```
68
+
69
+ ### Querying Data
70
+
71
+ ```python
72
+ import nsqip_tools
73
+ import polars as pl
74
+
75
+ # Load and filter data
76
+ df = (nsqip_tools.load_data("/path/to/parquet/dataset")
77
+ .filter_by_cpt(["44970", "44979"]) # Laparoscopic procedures
78
+ .filter_by_year([2020, 2021])
79
+ .collect())
80
+
81
+ # Chain with Polars operations
82
+ df = (nsqip_tools.load_data("/path/to/parquet/dataset")
83
+ .filter_by_diagnosis(["K80.20"]) # Gallstones
84
+ .lazy_frame # Access the Polars LazyFrame
85
+ .select(["CASEID", "AGE_AS_INT", "CPT", "OPERYR"])
86
+ .filter(pl.col("AGE_AS_INT") > 50)
87
+ .group_by("CPT")
88
+ .agg(pl.len())
89
+ .collect())
90
+ ```
91
+
92
+ ## API Reference
93
+
94
+ ### Building Datasets
95
+
96
+ #### `build_parquet_dataset()`
97
+
98
+ Build an NSQIP parquet dataset from text files with standard transformations.
99
+
100
+ ```python
101
+ result = nsqip_tools.build_parquet_dataset(
102
+ data_dir, # Path to NSQIP text files
103
+ output_dir=None, # Output directory (defaults to data_dir)
104
+ dataset_type="adult", # "adult" or "pediatric"
105
+ generate_dictionary=True, # Generate data dictionary
106
+ memory_limit="4GB", # Memory limit for operations
107
+ verify_case_counts=True, # Verify case counts match expected
108
+ apply_transforms=True # Apply standard transformations
109
+ )
110
+ ```
111
+
112
+ **Returns:** Dictionary with paths to:
113
+ - `parquet_dir`: Parquet dataset directory
114
+ - `dictionary`: Data dictionary CSV file (if generated)
115
+ - `log`: Build log file
116
+
117
+
118
+ ### Querying Data
119
+
120
+ #### `load_data()`
121
+
122
+ Load NSQIP data from a parquet dataset for querying.
123
+
124
+ ```python
125
+ query = nsqip_tools.load_data("/path/to/parquet/dataset")
126
+ ```
127
+
128
+ #### Filter Methods
129
+
130
+ All filter methods return the query object for chaining:
131
+
132
+ - **`filter_by_cpt(cpt_codes)`**: Filter by CPT procedure codes
133
+ - **`filter_by_diagnosis(diagnosis_codes)`**: Filter by ICD diagnosis codes
134
+ - **`filter_by_year(years)`**: Filter by operation years
135
+ - **`filter_active_variables()`**: Keep only variables with data in most recent year
136
+ - **`select_demographics()`**: Select common demographic variables
137
+ - **`select_outcomes()`**: Select common outcome variables
138
+
139
+ #### Accessing Results
140
+
141
+ - **`.lazy_frame`**: Get the Polars LazyFrame for custom operations
142
+ - **`.collect()`**: Execute query and return Polars DataFrame
143
+ - **`.count()`**: Get count of rows without collecting full data
144
+ - **`.sample(n)`**: Get a random sample of n rows
145
+ - **`.describe()`**: Get summary statistics about the query
146
+
147
+ ## Standard Transformations
148
+
149
+ The `build_parquet_dataset()` function automatically applies these transformations:
150
+
151
+ 1. **Data Type Conversion**: Identifies and converts numeric columns while preserving categorical codes
152
+ 2. **Age Processing**:
153
+ - Keeps original `AGE` column with "90+" values
154
+ - Creates `AGE_AS_INT` (numeric, with 90 for "90+")
155
+ - Creates `AGE_IS_90_PLUS` boolean flag
156
+ 3. **CPT Array**: Combines all CPT columns into `ALL_CPT_CODES` array
157
+ 4. **Diagnosis Array**: Combines all diagnosis columns into `ALL_DIAGNOSIS_CODES` array
158
+ 5. **Race Combination**: Merges `RACE` and `RACE_NEW` into `RACE_COMBINED`
159
+ 6. **Work RVU**: Calculates `WORK_RVU_TOTAL` from work RVU columns (adult only)
160
+ 7. **Free Flap Indicators**: Derives boolean flags based on CPT codes
161
+
162
+ ## Data Dictionary
163
+
164
+ Generated data dictionaries include:
165
+
166
+ - **Column name and data type**
167
+ - **Active status** (has data in most recent year)
168
+ - **Null counts and percentages**
169
+ - **Summary statistics** (numeric: min/max/mean/median, categorical: top values)
170
+ - **Null counts by year** (useful for identifying when variables were added/removed)
171
+
172
+ Available formats:
173
+ - **CSV**: For Excel/spreadsheet users
174
+ - **JSON**: For programmatic access
175
+ - **HTML**: For easy web viewing
176
+
177
+ ## Memory Optimization
178
+
179
+ The package is designed for regular computers:
180
+
181
+ - **Automatic memory detection**: Recommends appropriate memory limits based on available RAM
182
+ - **Columnar storage**: Uses parquet format for efficient compression and access
183
+ - **Lazy evaluation**: Polars LazyFrames enable efficient query planning
184
+ - **Streaming support**: Can process datasets larger than available memory
185
+
186
+ ```python
187
+ # Check system memory
188
+ mem_info = nsqip_tools.get_memory_info()
189
+ print(f"Total RAM: {mem_info['total']}")
190
+ print(f"Available: {mem_info['available']}")
191
+ print(f"Recommended limit: {mem_info['recommended_limit']}")
192
+
193
+ # Use automatic memory detection (default)
194
+ result = nsqip_tools.build_parquet_dataset(data_dir="/path/to/files")
195
+
196
+ # Or specify custom limit
197
+ result = nsqip_tools.build_parquet_dataset(
198
+ data_dir="/path/to/files",
199
+ memory_limit="8GB"
200
+ )
201
+ ```
202
+
203
+ ### Safe Data Collection
204
+
205
+ The package includes memory-safe collection to prevent out-of-memory errors:
206
+
207
+ ```python
208
+ # Check size before collecting
209
+ query = nsqip_tools.load_data("/path/to/parquet/dataset").filter_by_year([2021])
210
+ info = query.describe()
211
+ print(f"Total rows: {info['total_rows']}")
212
+ print(f"Columns: {info['columns']}")
213
+
214
+ # Use streaming for large datasets
215
+ df = query.collect(streaming=True)
216
+
217
+ # Get a sample for exploration
218
+ sample_df = query.sample(n=10000)
219
+ ```
220
+
221
+ ## Network Drive Support
222
+
223
+ The package works seamlessly on network drives and file systems that don't support file locking:
224
+
225
+ ```python
226
+ # Works on network drives, SMB shares, etc.
227
+ result = nsqip_tools.build_parquet_dataset(
228
+ data_dir="/Volumes/network_drive/nsqip_data",
229
+ output_dir="/Volumes/network_drive/processed"
230
+ )
231
+
232
+ # Query from network location
233
+ query = nsqip_tools.load_data("/Volumes/network_drive/processed/adult_nsqip_parquet")
234
+ ```
235
+
236
+ ## Data Requirements
237
+
238
+ - NSQIP data files must be tab-delimited text files
239
+ - Files should follow standard NSQIP naming conventions
240
+ - Expected case counts are validated based on official NSQIP documentation
241
+
242
+ ## License
243
+
244
+ This project is licensed under the MIT License - see the LICENSE file for details.
245
+
246
+ ## Contributing
247
+
248
+ Contributions are welcome! Please feel free to submit a Pull Request.
249
+
250
+ ## Disclaimer
251
+
252
+ This package is not affiliated with or endorsed by the American College of Surgeons National Surgical Quality Improvement Program. Users must obtain NSQIP data through official channels.
@@ -0,0 +1,222 @@
1
+ # NSQIP Tools
2
+
3
+ A Python package for working with National Surgical Quality Improvement Program (NSQIP) data. This package provides tools to convert NSQIP text files into optimized parquet datasets, perform standard data transformations, and query the data efficiently using Polars.
4
+
5
+ ## Features
6
+
7
+ - **Data Ingestion**: Convert NSQIP tab-delimited text files to parquet format
8
+ - **Automatic Transformations**: Standard data cleaning and derived variables
9
+ - **Data Verification**: Validate case counts against expected values
10
+ - **Efficient Querying**: Filter by CPT codes, diagnosis codes, years, and more
11
+ - **Data Dictionary**: Auto-generate comprehensive data dictionaries in CSV, JSON, and HTML formats
12
+ - **Memory Efficient**: Designed to work on regular computers with limited RAM
13
+ - **Network Drive Compatible**: Works seamlessly on local or network file systems
14
+ - **Type Safe**: Comprehensive type hints throughout
15
+
16
+ ## Installation
17
+
18
+ ```bash
19
+ pip install nsqip-tools
20
+ ```
21
+
22
+ ## Quick Start
23
+
24
+ ### Building a Dataset
25
+
26
+ ```python
27
+ import nsqip_tools
28
+
29
+ # Build parquet dataset from NSQIP text files
30
+ result = nsqip_tools.build_parquet_dataset(
31
+ data_dir="/path/to/nsqip/files",
32
+ dataset_type="adult" # or "pediatric"
33
+ )
34
+
35
+ print(f"Dataset created at: {result['parquet_dir']}")
36
+ print(f"Data dictionary at: {result['dictionary']}")
37
+ ```
38
+
39
+ ### Querying Data
40
+
41
+ ```python
42
+ import nsqip_tools
43
+ import polars as pl
44
+
45
+ # Load and filter data
46
+ df = (nsqip_tools.load_data("/path/to/parquet/dataset")
47
+ .filter_by_cpt(["44970", "44979"]) # Laparoscopic procedures
48
+ .filter_by_year([2020, 2021])
49
+ .collect())
50
+
51
+ # Chain with Polars operations
52
+ df = (nsqip_tools.load_data("/path/to/parquet/dataset")
53
+ .filter_by_diagnosis(["K80.20"]) # Gallstones
54
+ .lazy_frame # Access the Polars LazyFrame
55
+ .select(["CASEID", "AGE_AS_INT", "CPT", "OPERYR"])
56
+ .filter(pl.col("AGE_AS_INT") > 50)
57
+ .group_by("CPT")
58
+ .agg(pl.len())
59
+ .collect())
60
+ ```
61
+
62
+ ## API Reference
63
+
64
+ ### Building Datasets
65
+
66
+ #### `build_parquet_dataset()`
67
+
68
+ Build an NSQIP parquet dataset from text files with standard transformations.
69
+
70
+ ```python
71
+ result = nsqip_tools.build_parquet_dataset(
72
+ data_dir, # Path to NSQIP text files
73
+ output_dir=None, # Output directory (defaults to data_dir)
74
+ dataset_type="adult", # "adult" or "pediatric"
75
+ generate_dictionary=True, # Generate data dictionary
76
+ memory_limit="4GB", # Memory limit for operations
77
+ verify_case_counts=True, # Verify case counts match expected
78
+ apply_transforms=True # Apply standard transformations
79
+ )
80
+ ```
81
+
82
+ **Returns:** Dictionary with paths to:
83
+ - `parquet_dir`: Parquet dataset directory
84
+ - `dictionary`: Data dictionary CSV file (if generated)
85
+ - `log`: Build log file
86
+
87
+
88
+ ### Querying Data
89
+
90
+ #### `load_data()`
91
+
92
+ Load NSQIP data from a parquet dataset for querying.
93
+
94
+ ```python
95
+ query = nsqip_tools.load_data("/path/to/parquet/dataset")
96
+ ```
97
+
98
+ #### Filter Methods
99
+
100
+ All filter methods return the query object for chaining:
101
+
102
+ - **`filter_by_cpt(cpt_codes)`**: Filter by CPT procedure codes
103
+ - **`filter_by_diagnosis(diagnosis_codes)`**: Filter by ICD diagnosis codes
104
+ - **`filter_by_year(years)`**: Filter by operation years
105
+ - **`filter_active_variables()`**: Keep only variables with data in most recent year
106
+ - **`select_demographics()`**: Select common demographic variables
107
+ - **`select_outcomes()`**: Select common outcome variables
108
+
109
+ #### Accessing Results
110
+
111
+ - **`.lazy_frame`**: Get the Polars LazyFrame for custom operations
112
+ - **`.collect()`**: Execute query and return Polars DataFrame
113
+ - **`.count()`**: Get count of rows without collecting full data
114
+ - **`.sample(n)`**: Get a random sample of n rows
115
+ - **`.describe()`**: Get summary statistics about the query
116
+
117
+ ## Standard Transformations
118
+
119
+ The `build_parquet_dataset()` function automatically applies these transformations:
120
+
121
+ 1. **Data Type Conversion**: Identifies and converts numeric columns while preserving categorical codes
122
+ 2. **Age Processing**:
123
+ - Keeps original `AGE` column with "90+" values
124
+ - Creates `AGE_AS_INT` (numeric, with 90 for "90+")
125
+ - Creates `AGE_IS_90_PLUS` boolean flag
126
+ 3. **CPT Array**: Combines all CPT columns into `ALL_CPT_CODES` array
127
+ 4. **Diagnosis Array**: Combines all diagnosis columns into `ALL_DIAGNOSIS_CODES` array
128
+ 5. **Race Combination**: Merges `RACE` and `RACE_NEW` into `RACE_COMBINED`
129
+ 6. **Work RVU**: Calculates `WORK_RVU_TOTAL` from work RVU columns (adult only)
130
+ 7. **Free Flap Indicators**: Derives boolean flags based on CPT codes
131
+
132
+ ## Data Dictionary
133
+
134
+ Generated data dictionaries include:
135
+
136
+ - **Column name and data type**
137
+ - **Active status** (has data in most recent year)
138
+ - **Null counts and percentages**
139
+ - **Summary statistics** (numeric: min/max/mean/median, categorical: top values)
140
+ - **Null counts by year** (useful for identifying when variables were added/removed)
141
+
142
+ Available formats:
143
+ - **CSV**: For Excel/spreadsheet users
144
+ - **JSON**: For programmatic access
145
+ - **HTML**: For easy web viewing
146
+
147
+ ## Memory Optimization
148
+
149
+ The package is designed for regular computers:
150
+
151
+ - **Automatic memory detection**: Recommends appropriate memory limits based on available RAM
152
+ - **Columnar storage**: Uses parquet format for efficient compression and access
153
+ - **Lazy evaluation**: Polars LazyFrames enable efficient query planning
154
+ - **Streaming support**: Can process datasets larger than available memory
155
+
156
+ ```python
157
+ # Check system memory
158
+ mem_info = nsqip_tools.get_memory_info()
159
+ print(f"Total RAM: {mem_info['total']}")
160
+ print(f"Available: {mem_info['available']}")
161
+ print(f"Recommended limit: {mem_info['recommended_limit']}")
162
+
163
+ # Use automatic memory detection (default)
164
+ result = nsqip_tools.build_parquet_dataset(data_dir="/path/to/files")
165
+
166
+ # Or specify custom limit
167
+ result = nsqip_tools.build_parquet_dataset(
168
+ data_dir="/path/to/files",
169
+ memory_limit="8GB"
170
+ )
171
+ ```
172
+
173
+ ### Safe Data Collection
174
+
175
+ The package includes memory-safe collection to prevent out-of-memory errors:
176
+
177
+ ```python
178
+ # Check size before collecting
179
+ query = nsqip_tools.load_data("/path/to/parquet/dataset").filter_by_year([2021])
180
+ info = query.describe()
181
+ print(f"Total rows: {info['total_rows']}")
182
+ print(f"Columns: {info['columns']}")
183
+
184
+ # Use streaming for large datasets
185
+ df = query.collect(streaming=True)
186
+
187
+ # Get a sample for exploration
188
+ sample_df = query.sample(n=10000)
189
+ ```
190
+
191
+ ## Network Drive Support
192
+
193
+ The package works seamlessly on network drives and file systems that don't support file locking:
194
+
195
+ ```python
196
+ # Works on network drives, SMB shares, etc.
197
+ result = nsqip_tools.build_parquet_dataset(
198
+ data_dir="/Volumes/network_drive/nsqip_data",
199
+ output_dir="/Volumes/network_drive/processed"
200
+ )
201
+
202
+ # Query from network location
203
+ query = nsqip_tools.load_data("/Volumes/network_drive/processed/adult_nsqip_parquet")
204
+ ```
205
+
206
+ ## Data Requirements
207
+
208
+ - NSQIP data files must be tab-delimited text files
209
+ - Files should follow standard NSQIP naming conventions
210
+ - Expected case counts are validated based on official NSQIP documentation
211
+
212
+ ## License
213
+
214
+ This project is licensed under the MIT License - see the LICENSE file for details.
215
+
216
+ ## Contributing
217
+
218
+ Contributions are welcome! Please feel free to submit a Pull Request.
219
+
220
+ ## Disclaimer
221
+
222
+ This package is not affiliated with or endorsed by the American College of Surgeons National Surgical Quality Improvement Program. Users must obtain NSQIP data through official channels.
@@ -0,0 +1,43 @@
1
+ [project]
2
+ name = "nsqip-tools"
3
+ version = "0.2.0"
4
+ description = "Tools for working with NSQIP surgical quality data"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ license = {text = "MIT"}
8
+ authors = [
9
+ {name = "Jason Brant"},
10
+ ]
11
+ classifiers = [
12
+ "Development Status :: 4 - Beta",
13
+ "Intended Audience :: Healthcare Industry",
14
+ "Intended Audience :: Science/Research",
15
+ "License :: OSI Approved :: MIT License",
16
+ "Programming Language :: Python :: 3",
17
+ "Programming Language :: Python :: 3.10",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Topic :: Scientific/Engineering :: Medical Science Apps.",
21
+ ]
22
+ dependencies = [
23
+ "polars>=1.0.0",
24
+ "pyarrow>=15.0.0",
25
+ "psutil>=5.9.0",
26
+ "tqdm>=4.67.1",
27
+ ]
28
+
29
+ [project.optional-dependencies]
30
+ dev = [
31
+ "pytest>=7.0.0",
32
+ "pytest-cov>=4.0.0",
33
+ ]
34
+
35
+ [project.urls]
36
+ "Homepage" = "https://github.com/brant01/nsqip_tools"
37
+ "Bug Tracker" = "https://github.com/brant01/nsqip_tools/issues"
38
+ "Source Code" = "https://github.com/brant01/nsqip_tools"
39
+
40
+ [dependency-groups]
41
+ dev = [
42
+ "pytest>=8.3.5",
43
+ ]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,23 @@
1
+ """NSQIP Tools: A Python package for working with NSQIP surgical data.
2
+
3
+ This package provides tools for ingesting, transforming, and querying
4
+ National Surgical Quality Improvement Program (NSQIP) data using Polars
5
+ and parquet datasets.
6
+ """
7
+
8
+ from .query import load_data, NSQIPQuery
9
+ from .builder import build_parquet_dataset
10
+ from ._internal.memory_utils import get_memory_info, get_recommended_memory_limit
11
+
12
+ __all__ = [
13
+ "build_parquet_dataset",
14
+ "load_data",
15
+ "NSQIPQuery",
16
+ "get_memory_info",
17
+ "get_recommended_memory_limit",
18
+ ]
19
+
20
+ __version__ = "0.2.0"
21
+
22
+
23
+
@@ -0,0 +1,6 @@
1
+ """Internal utilities for NSQIP Tools.
2
+
3
+ This module contains internal implementation details that should not be
4
+ imported directly by users. All public APIs are exposed through the main
5
+ nsqip_tools module.
6
+ """