ncdb-tools 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ncdb_tools-0.1.0/.gitignore +58 -0
- ncdb_tools-0.1.0/.python-version +1 -0
- ncdb_tools-0.1.0/CONTRIBUTING.md +26 -0
- ncdb_tools-0.1.0/LICENSE +21 -0
- ncdb_tools-0.1.0/MANIFEST.in +16 -0
- ncdb_tools-0.1.0/PKG-INFO +128 -0
- ncdb_tools-0.1.0/README.md +96 -0
- ncdb_tools-0.1.0/build_database.py +37 -0
- ncdb_tools-0.1.0/main.py +6 -0
- ncdb_tools-0.1.0/pyproject.toml +63 -0
- ncdb_tools-0.1.0/src/ncdb_tools/__init__.py +17 -0
- ncdb_tools-0.1.0/src/ncdb_tools/_internal/__init__.py +1 -0
- ncdb_tools-0.1.0/src/ncdb_tools/_internal/sas_parser.py +115 -0
- ncdb_tools-0.1.0/src/ncdb_tools/constants.py +87 -0
- ncdb_tools-0.1.0/src/ncdb_tools/data_dictionary.py +534 -0
- ncdb_tools-0.1.0/src/ncdb_tools/database_builder.py +198 -0
- ncdb_tools-0.1.0/src/ncdb_tools/dataset_builder.py +196 -0
- ncdb_tools-0.1.0/src/ncdb_tools/query.py +154 -0
- ncdb_tools-0.1.0/uv.lock +500 -0
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
env/
|
|
8
|
+
venv/
|
|
9
|
+
.venv/
|
|
10
|
+
*.egg-info/
|
|
11
|
+
dist/
|
|
12
|
+
build/
|
|
13
|
+
|
|
14
|
+
# Data - NEVER commit any data files
|
|
15
|
+
data/
|
|
16
|
+
docs/
|
|
17
|
+
scripts/
|
|
18
|
+
*.csv
|
|
19
|
+
*.txt
|
|
20
|
+
*.dat
|
|
21
|
+
*.parquet
|
|
22
|
+
*.feather
|
|
23
|
+
*.json
|
|
24
|
+
*.xml
|
|
25
|
+
*.sas
|
|
26
|
+
*.pdf
|
|
27
|
+
*.rtf
|
|
28
|
+
|
|
29
|
+
# Documentation/instruction files
|
|
30
|
+
development-guidelines.md
|
|
31
|
+
CLAUDE.md
|
|
32
|
+
claude.md
|
|
33
|
+
ncdb_instructions.md
|
|
34
|
+
|
|
35
|
+
# Results and outputs
|
|
36
|
+
results/
|
|
37
|
+
outputs/
|
|
38
|
+
figures/
|
|
39
|
+
*.log
|
|
40
|
+
|
|
41
|
+
# Environment
|
|
42
|
+
.env
|
|
43
|
+
.DS_Store
|
|
44
|
+
|
|
45
|
+
# IDE
|
|
46
|
+
.vscode/
|
|
47
|
+
.idea/
|
|
48
|
+
*.swp
|
|
49
|
+
*.swo
|
|
50
|
+
|
|
51
|
+
# Jupyter
|
|
52
|
+
.ipynb_checkpoints
|
|
53
|
+
*.ipynb
|
|
54
|
+
|
|
55
|
+
# Testing
|
|
56
|
+
.pytest_cache/
|
|
57
|
+
.coverage
|
|
58
|
+
htmlcov/
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.13
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Contributing to NCDB Tools
|
|
2
|
+
|
|
3
|
+
## Development Setup
|
|
4
|
+
|
|
5
|
+
1. Clone the repository
|
|
6
|
+
2. Install dependencies with `uv sync --all-extras`
|
|
7
|
+
3. Run tests with `uv run pytest`
|
|
8
|
+
|
|
9
|
+
## Code Style
|
|
10
|
+
|
|
11
|
+
- Use `ruff` for formatting and linting
|
|
12
|
+
- Follow Google-style docstrings
|
|
13
|
+
- Add type hints to all functions
|
|
14
|
+
|
|
15
|
+
## Security
|
|
16
|
+
|
|
17
|
+
- Never commit data files
|
|
18
|
+
- Never include PHI or patient information
|
|
19
|
+
- Check .gitignore before committing
|
|
20
|
+
|
|
21
|
+
## Pull Request Process
|
|
22
|
+
|
|
23
|
+
1. Create a feature branch
|
|
24
|
+
2. Make your changes with tests
|
|
25
|
+
3. Run `uv run ruff check --fix .`
|
|
26
|
+
4. Submit a PR with a clear description
|
ncdb_tools-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 The NCDB Tools Authors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
include README.md
|
|
2
|
+
include LICENSE
|
|
3
|
+
include CONTRIBUTING.md
|
|
4
|
+
recursive-exclude * __pycache__
|
|
5
|
+
recursive-exclude * *.py[co]
|
|
6
|
+
recursive-exclude * *.sas
|
|
7
|
+
recursive-exclude * *.dat
|
|
8
|
+
recursive-exclude * *.csv
|
|
9
|
+
recursive-exclude * *.parquet
|
|
10
|
+
recursive-exclude * *.txt
|
|
11
|
+
recursive-exclude * *.pdf
|
|
12
|
+
recursive-exclude * *.rtf
|
|
13
|
+
recursive-exclude docs *
|
|
14
|
+
recursive-exclude scripts *
|
|
15
|
+
recursive-exclude * CLAUDE.md
|
|
16
|
+
recursive-exclude * claude.md
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ncdb-tools
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Tools for processing and analyzing National Cancer Database (NCDB) data
|
|
5
|
+
Author: NCDB Tools Contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Keywords: analysis,cancer,database,medical-research,ncdb,parquet
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Intended Audience :: Healthcare Industry
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Requires-Dist: click>=8.0.0
|
|
23
|
+
Requires-Dist: polars>=0.19.0
|
|
24
|
+
Requires-Dist: pyarrow>=10.0.0
|
|
25
|
+
Requires-Dist: rich>=10.0.0
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: mypy>=1.0.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: ruff>=0.0.280; extra == 'dev'
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# ncdb-tools
|
|
34
|
+
|
|
35
|
+
A Python package for efficiently processing and analyzing National Cancer Database (NCDB) data files.
|
|
36
|
+
|
|
37
|
+
## Installation
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install ncdb-tools
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Important Notice
|
|
44
|
+
|
|
45
|
+
This package provides tools for processing NCDB data files. **You must obtain NCDB data through official channels** - this package does not include any patient data. The National Cancer Database (NCDB) is a clinical oncology database sourced from hospital registry data that are collected in more than 1,500 Commission on Cancer (CoC)-accredited facilities.
|
|
46
|
+
|
|
47
|
+
## Quick Start
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
import ncdb_tools
|
|
51
|
+
|
|
52
|
+
# Convert all NCDB data files in a directory to parquet format
|
|
53
|
+
paths = ncdb_tools.build_database("/path/to/NCDB_DATA/")
|
|
54
|
+
|
|
55
|
+
# The function will:
|
|
56
|
+
# 1. Find all .dat files
|
|
57
|
+
# 2. Find the SAS labels file
|
|
58
|
+
# 3. Create a new subdirectory with today's date
|
|
59
|
+
# 4. Convert all files to parquet format
|
|
60
|
+
# 5. Generate a comprehensive data dictionary
|
|
61
|
+
# 6. Create a summary report
|
|
62
|
+
|
|
63
|
+
print(f"Database created in: {paths['output_dir']}")
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Working with the Data
|
|
67
|
+
|
|
68
|
+
After building the database, you can query the parquet files using NCDB-specific filters and standard Polars operations:
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
import polars as pl
|
|
72
|
+
|
|
73
|
+
# Load data with NCDB-specific filters (assumes `import ncdb_tools`, as in Quick Start)
|
|
74
|
+
query = ncdb_tools.load_data("path/to/parquet_directory/")
|
|
75
|
+
|
|
76
|
+
# Chain NCDB filters, then use Polars for analysis
|
|
77
|
+
df = (
|
|
78
|
+
query
|
|
79
|
+
.filter_by_year(2021)
|
|
80
|
+
.filter_by_primary_site("C509") # Breast
|
|
81
|
+
.filter_by_histology([8140, 8500]) # Adenocarcinoma codes
|
|
82
|
+
.drop_missing_vital_status()
|
|
83
|
+
.lazy_frame() # Get Polars LazyFrame
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
# Use standard Polars operations
|
|
87
|
+
results = (
|
|
88
|
+
df
|
|
89
|
+
.filter(pl.col("AGE") >= 50)
|
|
90
|
+
.group_by(["SEX", "RACE"])
|
|
91
|
+
.agg([
|
|
92
|
+
pl.count().alias("count"),
|
|
93
|
+
pl.col("AGE").mean().alias("mean_age")
|
|
94
|
+
])
|
|
95
|
+
.collect()
|
|
96
|
+
)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
The query interface provides these NCDB-specific filters:
|
|
100
|
+
- `filter_by_year()` - Filter by year of diagnosis
|
|
101
|
+
- `filter_by_primary_site()` - Filter by ICD-O-3 primary site codes
|
|
102
|
+
- `filter_by_histology()` - Filter by histology codes (accepts integers or strings)
|
|
103
|
+
- `drop_missing_vital_status()` - Remove cases with missing vital status
|
|
104
|
+
|
|
105
|
+
After applying NCDB filters, use `.lazy_frame()` to access the Polars LazyFrame for further analysis.
|
|
106
|
+
|
|
107
|
+
## Features
|
|
108
|
+
|
|
109
|
+
- Efficiently converts NCDB fixed-width text files to parquet format
|
|
110
|
+
- Automatically parses SAS labels for meaningful column names
|
|
111
|
+
- Generates comprehensive data dictionaries in CSV, JSON, and HTML formats
|
|
112
|
+
- Memory-efficient processing using Polars
|
|
113
|
+
- Simple, high-level API for common tasks
|
|
114
|
+
- NCDB-specific data filters and transformations
|
|
115
|
+
- Compatible with all Python 3.9+ versions
|
|
116
|
+
|
|
117
|
+
## Requirements
|
|
118
|
+
|
|
119
|
+
- Python 3.9 or higher
|
|
120
|
+
- NCDB data files (obtained through official channels)
|
|
121
|
+
|
|
122
|
+
## License
|
|
123
|
+
|
|
124
|
+
MIT License - see [LICENSE](LICENSE) file for details.
|
|
125
|
+
|
|
126
|
+
## Disclaimer
|
|
127
|
+
|
|
128
|
+
This software is provided for research purposes. Users are responsible for ensuring compliance with all applicable data use agreements and privacy regulations when working with NCDB data.
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# ncdb-tools
|
|
2
|
+
|
|
3
|
+
A Python package for efficiently processing and analyzing National Cancer Database (NCDB) data files.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install ncdb-tools
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Important Notice
|
|
12
|
+
|
|
13
|
+
This package provides tools for processing NCDB data files. **You must obtain NCDB data through official channels** - this package does not include any patient data. The National Cancer Database (NCDB) is a clinical oncology database sourced from hospital registry data that are collected in more than 1,500 Commission on Cancer (CoC)-accredited facilities.
|
|
14
|
+
|
|
15
|
+
## Quick Start
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
import ncdb_tools
|
|
19
|
+
|
|
20
|
+
# Convert all NCDB data files in a directory to parquet format
|
|
21
|
+
paths = ncdb_tools.build_database("/path/to/NCDB_DATA/")
|
|
22
|
+
|
|
23
|
+
# The function will:
|
|
24
|
+
# 1. Find all .dat files
|
|
25
|
+
# 2. Find the SAS labels file
|
|
26
|
+
# 3. Create a new subdirectory with today's date
|
|
27
|
+
# 4. Convert all files to parquet format
|
|
28
|
+
# 5. Generate a comprehensive data dictionary
|
|
29
|
+
# 6. Create a summary report
|
|
30
|
+
|
|
31
|
+
print(f"Database created in: {paths['output_dir']}")
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Working with the Data
|
|
35
|
+
|
|
36
|
+
After building the database, you can query the parquet files using NCDB-specific filters and standard Polars operations:
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
import polars as pl
|
|
40
|
+
|
|
41
|
+
# Load data with NCDB-specific filters (assumes `import ncdb_tools`, as in Quick Start)
|
|
42
|
+
query = ncdb_tools.load_data("path/to/parquet_directory/")
|
|
43
|
+
|
|
44
|
+
# Chain NCDB filters, then use Polars for analysis
|
|
45
|
+
df = (
|
|
46
|
+
query
|
|
47
|
+
.filter_by_year(2021)
|
|
48
|
+
.filter_by_primary_site("C509") # Breast
|
|
49
|
+
.filter_by_histology([8140, 8500]) # Adenocarcinoma codes
|
|
50
|
+
.drop_missing_vital_status()
|
|
51
|
+
.lazy_frame() # Get Polars LazyFrame
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
# Use standard Polars operations
|
|
55
|
+
results = (
|
|
56
|
+
df
|
|
57
|
+
.filter(pl.col("AGE") >= 50)
|
|
58
|
+
.group_by(["SEX", "RACE"])
|
|
59
|
+
.agg([
|
|
60
|
+
pl.count().alias("count"),
|
|
61
|
+
pl.col("AGE").mean().alias("mean_age")
|
|
62
|
+
])
|
|
63
|
+
.collect()
|
|
64
|
+
)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
The query interface provides these NCDB-specific filters:
|
|
68
|
+
- `filter_by_year()` - Filter by year of diagnosis
|
|
69
|
+
- `filter_by_primary_site()` - Filter by ICD-O-3 primary site codes
|
|
70
|
+
- `filter_by_histology()` - Filter by histology codes (accepts integers or strings)
|
|
71
|
+
- `drop_missing_vital_status()` - Remove cases with missing vital status
|
|
72
|
+
|
|
73
|
+
After applying NCDB filters, use `.lazy_frame()` to access the Polars LazyFrame for further analysis.
|
|
74
|
+
|
|
75
|
+
## Features
|
|
76
|
+
|
|
77
|
+
- Efficiently converts NCDB fixed-width text files to parquet format
|
|
78
|
+
- Automatically parses SAS labels for meaningful column names
|
|
79
|
+
- Generates comprehensive data dictionaries in CSV, JSON, and HTML formats
|
|
80
|
+
- Memory-efficient processing using Polars
|
|
81
|
+
- Simple, high-level API for common tasks
|
|
82
|
+
- NCDB-specific data filters and transformations
|
|
83
|
+
- Compatible with all Python 3.9+ versions
|
|
84
|
+
|
|
85
|
+
## Requirements
|
|
86
|
+
|
|
87
|
+
- Python 3.9 or higher
|
|
88
|
+
- NCDB data files (obtained through official channels)
|
|
89
|
+
|
|
90
|
+
## License
|
|
91
|
+
|
|
92
|
+
MIT License - see [LICENSE](LICENSE) file for details.
|
|
93
|
+
|
|
94
|
+
## Disclaimer
|
|
95
|
+
|
|
96
|
+
This software is provided for research purposes. Users are responsible for ensuring compliance with all applicable data use agreements and privacy regulations when working with NCDB data.
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Command-line script to build NCDB database from .dat files.
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
uv run build_database.py <data_directory>
|
|
7
|
+
|
|
8
|
+
This script will:
|
|
9
|
+
1. Find all .dat files in the specified directory
|
|
10
|
+
2. Locate the SAS labels file
|
|
11
|
+
3. Create a timestamped output subdirectory
|
|
12
|
+
4. Convert all .dat files to parquet format
|
|
13
|
+
5. Generate comprehensive data dictionaries (CSV, JSON, HTML)
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import sys
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from ncdb_tools import build_database
|
|
19
|
+
|
|
20
|
+
def main() -> int:
    """Run ``build_database`` on the directory named on the command line.

    Exactly one positional argument (the data directory) is expected in
    ``sys.argv``. When the argument count is wrong, usage text is printed and
    nothing is built.

    Returns:
        Process exit status: 0 when ``build_database`` returns normally,
        1 on a usage error or when ``build_database`` raises.
    """
    if len(sys.argv) != 2:
        print("Usage: uv run build_database.py <data_directory>")
        print("\nExample:")
        print(" uv run build_database.py \"R:\\Jason\\NCDB\\NCDB_PUF_DATA_Sep-14-2024\"")
        return 1

    data_dir = sys.argv[1]

    try:
        # The value returned by build_database is not used here; only
        # success versus failure decides the exit status.
        build_database(data_dir=data_dir)
    except Exception as e:
        # Report failures on stderr so they are not mixed into stdout output.
        print(f"Error: {e}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    # sys.exit is always available; the bare exit() helper is only injected by
    # the site module and may be absent (e.g. under ``python -S``).
    sys.exit(main())
|
ncdb_tools-0.1.0/main.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "ncdb-tools"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Tools for processing and analyzing National Cancer Database (NCDB) data"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.9"
|
|
7
|
+
license = {text = "MIT"}
|
|
8
|
+
authors = [
|
|
9
|
+
{name = "NCDB Tools Contributors"}
|
|
10
|
+
]
|
|
11
|
+
keywords = ["ncdb", "cancer", "database", "analysis", "parquet", "medical-research"]
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Development Status :: 4 - Beta",
|
|
14
|
+
"Intended Audience :: Science/Research",
|
|
15
|
+
"Intended Audience :: Healthcare Industry",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3.9",
|
|
19
|
+
"Programming Language :: Python :: 3.10",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Topic :: Scientific/Engineering :: Medical Science Apps.",
|
|
23
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
24
|
+
"Operating System :: OS Independent",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"polars>=0.19.0",
|
|
28
|
+
"click>=8.0.0",
|
|
29
|
+
"rich>=10.0.0",
|
|
30
|
+
"pyarrow>=10.0.0",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.optional-dependencies]
|
|
34
|
+
dev = [
|
|
35
|
+
"pytest>=7.0.0",
|
|
36
|
+
"pytest-cov>=4.0.0",
|
|
37
|
+
"ruff>=0.0.280",
|
|
38
|
+
"mypy>=1.0.0",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
[build-system]
|
|
43
|
+
requires = ["hatchling"]
|
|
44
|
+
build-backend = "hatchling.build"
|
|
45
|
+
|
|
46
|
+
[tool.ruff]
|
|
47
|
+
line-length = 88
|
|
48
|
+
target-version = "py39"
|
|
49
|
+
select = ["E", "F", "I", "N", "W", "RUF"]
|
|
50
|
+
|
|
51
|
+
[tool.mypy]
|
|
52
|
+
python_version = "3.9"
|
|
53
|
+
warn_return_any = true
|
|
54
|
+
warn_unused_configs = true
|
|
55
|
+
disallow_untyped_defs = true
|
|
56
|
+
|
|
57
|
+
[dependency-groups]
|
|
58
|
+
dev = [
|
|
59
|
+
"mypy>=1.16.0",
|
|
60
|
+
"pytest>=8.4.0",
|
|
61
|
+
"pytest-cov>=6.1.1",
|
|
62
|
+
"ruff>=0.11.12",
|
|
63
|
+
]
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""NCDB Tools - Tools for managing and analyzing National Cancer Database data."""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.1.0"
|
|
4
|
+
|
|
5
|
+
# Core functionality
|
|
6
|
+
from .data_dictionary import generate_data_dictionary
|
|
7
|
+
from .database_builder import build_database
|
|
8
|
+
from .dataset_builder import build_dataset
|
|
9
|
+
from .query import NCDBQuery, load_data
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"NCDBQuery",
|
|
13
|
+
"build_database", # High-level function for most users
|
|
14
|
+
"build_dataset",
|
|
15
|
+
"generate_data_dictionary",
|
|
16
|
+
"load_data",
|
|
17
|
+
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Internal utilities for NCDB Tools."""
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""Parser for SAS label files."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Dict, List, Tuple
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def parse_sas_labels(sas_file_path: Path) -> Tuple[Dict[str, str], Dict[str, Dict[str, str]]]:
    """Parse a SAS file to extract variable labels and value formats.

    Args:
        sas_file_path: Path to the SAS labels file. The file is read as latin-1.

    Returns:
        Tuple of (variable_labels, value_formats)
        - variable_labels: Dict mapping variable names to descriptions, taken
          from ``NAME = 'text'`` pairs inside the first ``label ... ;``
          statement found (only single-quoted labels are captured).
        - value_formats: Dict mapping variable names to value->label mappings,
          built from ``value <format> ... ;`` blocks and every
          ``<variable> <format>.`` reference to that format in the file.
          Variables sharing a format share the same mapping object.
    """
    with open(sas_file_path, 'r', encoding='latin-1') as f:
        content = f.read()

    # Extract variable labels (e.g., AGE = 'Age at Diagnosis')
    variable_labels: Dict[str, str] = {}
    label_pattern = r"(\w+)\s*=\s*'([^']+)'"

    # Locate the label statement exactly as before (first "label", up to the
    # first ';') ...
    label_section = re.search(r'label\s+(.*?);', content, re.DOTALL | re.IGNORECASE)
    if label_section:
        # ... then re-scan from the same position treating quoted strings as
        # single units, so a ';' inside a label text no longer ends the
        # statement early. If that scan fails (e.g. unbalanced quotes) the
        # plain match above is used unchanged.
        quote_aware = re.compile(
            r"""label\s+((?:'[^']*'|"[^"]*"|[^;'"])*);""", re.IGNORECASE
        ).match(content, label_section.start())
        section_text = (quote_aware or label_section).group(1)

        for match in re.finditer(label_pattern, section_text):
            variable_labels[match.group(1)] = match.group(2)

    # Extract value formats (simplified - would need more robust parsing)
    value_formats: Dict[str, Dict[str, str]] = {}

    # Look for proc format value statements. The optional leading '$' lets
    # character formats ("value $fmt ...") be recognised as well.
    format_blocks = re.findall(
        r'value\s+(\$?\w+)(.*?);\s*(?=value|\s*run|$)',
        content,
        re.DOTALL | re.IGNORECASE,
    )

    # Pattern for numeric or string values: <value> = '<label>'
    value_pattern = r"(['\"]?)([^'\"=]+)\1\s*=\s*['\"]([^'\"]+)['\"]"

    for format_name, format_content in format_blocks:
        # Extract value mappings
        value_map: Dict[str, str] = {}
        for match in re.finditer(value_pattern, format_content):
            value = match.group(2).strip()
            label = match.group(3).strip()
            value_map[value] = label

        if value_map:
            # Find which variables use this format by looking for
            # "<variable> <format>." references. The name is escaped because
            # it may start with '$', which is a regex metacharacter.
            usage_pattern = rf'(\w+)\s+{re.escape(format_name)}\.'
            for var in re.findall(usage_pattern, content):
                value_formats[var] = value_map

    return variable_labels, value_formats
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def parse_column_positions(sas_file_path: Path) -> List[Dict[str, Any]]:
    """Parse column positions from the first SAS ``input`` statement.

    Two layouts are recognised inside the statement:

    * pointer form, ``@50 AGE $3.`` (start column, name, width);
    * range form, ``AGE $ 50-52`` (name, first-last column), which is tried
      only when the pointer form yields no columns.

    Args:
        sas_file_path: Path to the SAS file. The file is read as latin-1.

    Returns:
        List of column definitions in file order. Each entry is a dict with
        ``name`` (str), ``start`` (0-based offset), ``end`` (exclusive offset)
        and ``width`` (``end - start``). The list is empty when no ``input``
        statement is found or nothing in it matches either layout.
    """
    with open(sas_file_path, 'r', encoding='latin-1') as f:
        content = f.read()

    columns: List[Dict[str, Any]] = []

    # Find input statement
    input_section = re.search(r'input\s+(.*?);\s*', content, re.DOTALL | re.IGNORECASE)
    if not input_section:
        return columns

    input_text = input_section.group(1)

    # First try the position-name format: @50 AGE $3.
    at_pattern = r'@(\d+)\s+(\w+)\s+\$?(\d+)\.'
    for match in re.finditer(at_pattern, input_text):
        start_pos = int(match.group(1)) - 1  # Convert to 0-based
        width = int(match.group(3))
        columns.append({
            'name': match.group(2),
            'start': start_pos,
            'end': start_pos + width,
            'width': width
        })

    # If no columns found, try the range format: AGE $ 50-52
    if not columns:
        range_pattern = r'(\w+)\s*\$?\s*(\d+)-(\d+)'
        for match in re.finditer(range_pattern, input_text):
            start = int(match.group(2)) - 1  # Convert to 0-based
            end = int(match.group(3))  # 1-based inclusive == 0-based exclusive
            columns.append({
                'name': match.group(1),
                'start': start,
                'end': end,
                'width': end - start
            })

    return columns
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Constants for NCDB Tools."""
|
|
2
|
+
|
|
3
|
+
from typing import List, Set
|
|
4
|
+
|
|
5
|
+
# File patterns
# Glob-style pattern and file extension strings; how they are applied is
# decided by the modules that import them (not visible here).
DATA_FILE_PATTERN = "NCDBPUF_*.dat"
PARQUET_EXTENSION = ".parquet"

# Data specifications
# NOTE(review): presumably the fixed-width record length and the number of
# columns per record for the supported NCDB release - confirm against the
# SAS layout file before relying on them.
NCDB_RECORD_LENGTH = 1032
NCDB_COLUMN_COUNT = 338

# Columns that should never be converted to numeric
# (identifier / code columns; presumably kept as strings so that code values
# are not altered by numeric casting - TODO confirm with the converter).
NEVER_NUMERIC_COLUMNS: Set[str] = {
    "PUF_CASE_ID",
    "PUF_FACILITY_ID",
    "PRIMARY_SITE",
    "HISTOLOGY",
    "HISTOLOGY_ICDO3",
    "BEHAVIOR",
    "LATERALITY",
    "CLASS_OF_CASE",
    "YEAR_OF_DIAGNOSIS",
    "SEQUENCE_NUMBER",
    "FACILITY_TYPE_CD",
    "FACILITY_LOCATION_CD",
    "ZIP",
}

# Standard column groups for convenience
# Patient demographic and socioeconomic columns.
DEMOGRAPHIC_COLUMNS: List[str] = [
    "AGE",
    "SEX",
    "RACE",
    "SPANISH_HISPANIC_ORIGIN",
    "INSURANCE_STATUS",
    "CDCC_TOTAL_BEST",
    "MED_INC_QUAR_00",
    "NO_HSD_QUAR_00",
    "UR_CD_03",
]

# Treatment summary (RX_SUMM_*) columns.
TREATMENT_COLUMNS: List[str] = [
    "RX_SUMM_SURG_PRIM_SITE",
    "RX_SUMM_RADIATION",
    "RX_SUMM_CHEMO",
    "RX_SUMM_HORMONE",
    "RX_SUMM_IMMUNOTHERAPY",
    "RX_SUMM_SYSTEMIC_SUR_SEQ",
    "RX_SUMM_TREATMENT_STATUS",
]

# Outcome and follow-up columns.
OUTCOME_COLUMNS: List[str] = [
    "PUF_VITAL_STATUS",
    "DX_LASTCONTACT_DEATH_MONTHS",
    "READM_HOSP_30_DAYS",
    "REASON_FOR_NO_SURGERY",
]

# Tumor types found in the actual data
# NOTE(review): the abbreviated spellings (e.g. "BoneJont", "Orophary") are
# kept verbatim; presumably they mirror tokens used in the data file names, so
# verify before "correcting" any of them.
TUMOR_TYPES: List[str] = [
    "BoneJont",
    "Brain",
    "CNS",
    "EyeOrbit",
    "GumOtMth",
    "HodgExtr",
    "HodgNdal",
    "Hypophar",
    "Kaposi",
    "Langerhans",
    "Larynx",
    "Lip",
    "Melanoma",
    "MouthFlr",
    "Nasal",
    "Nasophar",
    "NHLExtr",
    "NHLNdal",
    "Orophary",
    "Pharynx",
    "SalivGld",
    "SoftTiss",
    "Thyroid",
    "Tongue",
    "Tonsil",
]
|