ncdb-tools 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,58 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ env/
8
+ venv/
9
+ .venv/
10
+ *.egg-info/
11
+ dist/
12
+ build/
13
+
14
+ # Data - NEVER commit any data files
15
+ data/
16
+ docs/
17
+ scripts/
18
+ *.csv
19
+ *.txt
20
+ *.dat
21
+ *.parquet
22
+ *.feather
23
+ *.json
24
+ *.xml
25
+ *.sas
26
+ *.pdf
27
+ *.rtf
28
+
29
+ # Documentation/instruction files
30
+ development-guidelines.md
31
+ CLAUDE.md
32
+ claude.md
33
+ ncdb_instructions.md
34
+
35
+ # Results and outputs
36
+ results/
37
+ outputs/
38
+ figures/
39
+ *.log
40
+
41
+ # Environment
42
+ .env
43
+ .DS_Store
44
+
45
+ # IDE
46
+ .vscode/
47
+ .idea/
48
+ *.swp
49
+ *.swo
50
+
51
+ # Jupyter
52
+ .ipynb_checkpoints
53
+ *.ipynb
54
+
55
+ # Testing
56
+ .pytest_cache/
57
+ .coverage
58
+ htmlcov/
@@ -0,0 +1 @@
1
+ 3.13
@@ -0,0 +1,26 @@
1
+ # Contributing to NCDB Tools
2
+
3
+ ## Development Setup
4
+
5
+ 1. Clone the repository
6
+ 2. Install dependencies with `uv sync --all-extras`
7
+ 3. Run tests with `uv run pytest`
8
+
9
+ ## Code Style
10
+
11
+ - Use `ruff` for formatting and linting
12
+ - Follow Google-style docstrings
13
+ - Add type hints to all functions
14
+
15
+ ## Security
16
+
17
+ - Never commit data files
18
+ - Never include PHI or patient information
19
+ - Check .gitignore before committing
20
+
21
+ ## Pull Request Process
22
+
23
+ 1. Create a feature branch
24
+ 2. Make your changes with tests
25
+ 3. Run `uv run ruff check --fix .`
26
+ 4. Submit a PR with a clear description
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 The NCDB Tools Authors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,16 @@
1
+ include README.md
2
+ include LICENSE
3
+ include CONTRIBUTING.md
4
+ recursive-exclude * __pycache__
5
+ recursive-exclude * *.py[co]
6
+ recursive-exclude * *.sas
7
+ recursive-exclude * *.dat
8
+ recursive-exclude * *.csv
9
+ recursive-exclude * *.parquet
10
+ recursive-exclude * *.txt
11
+ recursive-exclude * *.pdf
12
+ recursive-exclude * *.rtf
13
+ recursive-exclude docs *
14
+ recursive-exclude scripts *
15
+ recursive-exclude * CLAUDE.md
16
+ recursive-exclude * claude.md
@@ -0,0 +1,128 @@
1
+ Metadata-Version: 2.4
2
+ Name: ncdb-tools
3
+ Version: 0.1.0
4
+ Summary: Tools for processing and analyzing National Cancer Database (NCDB) data
5
+ Author: NCDB Tools Contributors
6
+ License: MIT
7
+ License-File: LICENSE
8
+ Keywords: analysis,cancer,database,medical-research,ncdb,parquet
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Healthcare Industry
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
20
+ Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
21
+ Requires-Python: >=3.9
22
+ Requires-Dist: click>=8.0.0
23
+ Requires-Dist: polars>=0.19.0
24
+ Requires-Dist: pyarrow>=10.0.0
25
+ Requires-Dist: rich>=10.0.0
26
+ Provides-Extra: dev
27
+ Requires-Dist: mypy>=1.0.0; extra == 'dev'
28
+ Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
29
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
30
+ Requires-Dist: ruff>=0.0.280; extra == 'dev'
31
+ Description-Content-Type: text/markdown
32
+
33
+ # ncdb-tools
34
+
35
+ A Python package for efficiently processing and analyzing National Cancer Database (NCDB) data files.
36
+
37
+ ## Installation
38
+
39
+ ```bash
40
+ pip install ncdb-tools
41
+ ```
42
+
43
+ ## Important Notice
44
+
45
+ This package provides tools for processing NCDB data files. **You must obtain NCDB data through official channels** - this package does not include any patient data. The National Cancer Database (NCDB) is a clinical oncology database sourced from hospital registry data that are collected in more than 1,500 Commission on Cancer (CoC)-accredited facilities.
46
+
47
+ ## Quick Start
48
+
49
+ ```python
50
+ import ncdb_tools
51
+
52
+ # Convert all NCDB data files in a directory to parquet format
53
+ paths = ncdb_tools.build_database("/path/to/NCDB_DATA/")
54
+
55
+ # The function will:
56
+ # 1. Find all .dat files
57
+ # 2. Find the SAS labels file
58
+ # 3. Create a new subdirectory with today's date
59
+ # 4. Convert all files to parquet format
60
+ # 5. Generate a comprehensive data dictionary
61
+ # 6. Create a summary report
62
+
63
+ print(f"Database created in: {paths['output_dir']}")
64
+ ```
65
+
66
+ ## Working with the Data
67
+
68
+ After building the database, you can query the parquet files using NCDB-specific filters and standard Polars operations:
69
+
70
+ ```python
71
+ import ncdb_tools
+ import polars as pl
72
+
73
+ # Load data with NCDB-specific filters
74
+ query = ncdb_tools.load_data("path/to/parquet_directory/")
75
+
76
+ # Chain NCDB filters, then use Polars for analysis
77
+ df = (
78
+ query
79
+ .filter_by_year(2021)
80
+ .filter_by_primary_site("C509") # Breast
81
+ .filter_by_histology([8140, 8500]) # Adenocarcinoma codes
82
+ .drop_missing_vital_status()
83
+ .lazy_frame() # Get Polars LazyFrame
84
+ )
85
+
86
+ # Use standard Polars operations
87
+ results = (
88
+ df
89
+ .filter(pl.col("AGE") >= 50)
90
+ .group_by(["SEX", "RACE"])
91
+ .agg([
92
+ pl.count().alias("count"),
93
+ pl.col("AGE").mean().alias("mean_age")
94
+ ])
95
+ .collect()
96
+ )
97
+ ```
98
+
99
+ The query interface provides these NCDB-specific filters:
100
+ - `filter_by_year()` - Filter by year of diagnosis
101
+ - `filter_by_primary_site()` - Filter by ICD-O-3 primary site codes
102
+ - `filter_by_histology()` - Filter by histology codes (accepts integers or strings)
103
+ - `drop_missing_vital_status()` - Remove cases with missing vital status
104
+
105
+ After applying NCDB filters, use `.lazy_frame()` to access the Polars LazyFrame for further analysis.
106
+
107
+ ## Features
108
+
109
+ - Efficiently converts NCDB fixed-width text files to parquet format
110
+ - Automatically parses SAS labels for meaningful column names
111
+ - Generates comprehensive data dictionaries in CSV, JSON, and HTML formats
112
+ - Memory-efficient processing using Polars
113
+ - Simple, high-level API for common tasks
114
+ - NCDB-specific data filters and transformations
115
+ - Compatible with all Python 3.9+ versions
116
+
117
+ ## Requirements
118
+
119
+ - Python 3.9 or higher
120
+ - NCDB data files (obtained through official channels)
121
+
122
+ ## License
123
+
124
+ MIT License - see [LICENSE](LICENSE) file for details.
125
+
126
+ ## Disclaimer
127
+
128
+ This software is provided for research purposes. Users are responsible for ensuring compliance with all applicable data use agreements and privacy regulations when working with NCDB data.
@@ -0,0 +1,96 @@
1
+ # ncdb-tools
2
+
3
+ A Python package for efficiently processing and analyzing National Cancer Database (NCDB) data files.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install ncdb-tools
9
+ ```
10
+
11
+ ## Important Notice
12
+
13
+ This package provides tools for processing NCDB data files. **You must obtain NCDB data through official channels** - this package does not include any patient data. The National Cancer Database (NCDB) is a clinical oncology database sourced from hospital registry data that are collected in more than 1,500 Commission on Cancer (CoC)-accredited facilities.
14
+
15
+ ## Quick Start
16
+
17
+ ```python
18
+ import ncdb_tools
19
+
20
+ # Convert all NCDB data files in a directory to parquet format
21
+ paths = ncdb_tools.build_database("/path/to/NCDB_DATA/")
22
+
23
+ # The function will:
24
+ # 1. Find all .dat files
25
+ # 2. Find the SAS labels file
26
+ # 3. Create a new subdirectory with today's date
27
+ # 4. Convert all files to parquet format
28
+ # 5. Generate a comprehensive data dictionary
29
+ # 6. Create a summary report
30
+
31
+ print(f"Database created in: {paths['output_dir']}")
32
+ ```
33
+
34
+ ## Working with the Data
35
+
36
+ After building the database, you can query the parquet files using NCDB-specific filters and standard Polars operations:
37
+
38
+ ```python
39
+ import ncdb_tools
+ import polars as pl
40
+
41
+ # Load data with NCDB-specific filters
42
+ query = ncdb_tools.load_data("path/to/parquet_directory/")
43
+
44
+ # Chain NCDB filters, then use Polars for analysis
45
+ df = (
46
+ query
47
+ .filter_by_year(2021)
48
+ .filter_by_primary_site("C509") # Breast
49
+ .filter_by_histology([8140, 8500]) # Adenocarcinoma codes
50
+ .drop_missing_vital_status()
51
+ .lazy_frame() # Get Polars LazyFrame
52
+ )
53
+
54
+ # Use standard Polars operations
55
+ results = (
56
+ df
57
+ .filter(pl.col("AGE") >= 50)
58
+ .group_by(["SEX", "RACE"])
59
+ .agg([
60
+ pl.count().alias("count"),
61
+ pl.col("AGE").mean().alias("mean_age")
62
+ ])
63
+ .collect()
64
+ )
65
+ ```
66
+
67
+ The query interface provides these NCDB-specific filters:
68
+ - `filter_by_year()` - Filter by year of diagnosis
69
+ - `filter_by_primary_site()` - Filter by ICD-O-3 primary site codes
70
+ - `filter_by_histology()` - Filter by histology codes (accepts integers or strings)
71
+ - `drop_missing_vital_status()` - Remove cases with missing vital status
72
+
73
+ After applying NCDB filters, use `.lazy_frame()` to access the Polars LazyFrame for further analysis.
74
+
75
+ ## Features
76
+
77
+ - Efficiently converts NCDB fixed-width text files to parquet format
78
+ - Automatically parses SAS labels for meaningful column names
79
+ - Generates comprehensive data dictionaries in CSV, JSON, and HTML formats
80
+ - Memory-efficient processing using Polars
81
+ - Simple, high-level API for common tasks
82
+ - NCDB-specific data filters and transformations
83
+ - Compatible with all Python 3.9+ versions
84
+
85
+ ## Requirements
86
+
87
+ - Python 3.9 or higher
88
+ - NCDB data files (obtained through official channels)
89
+
90
+ ## License
91
+
92
+ MIT License - see [LICENSE](LICENSE) file for details.
93
+
94
+ ## Disclaimer
95
+
96
+ This software is provided for research purposes. Users are responsible for ensuring compliance with all applicable data use agreements and privacy regulations when working with NCDB data.
@@ -0,0 +1,37 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Command-line script to build NCDB database from .dat files.
4
+
5
+ Usage:
6
+ uv run build_database.py <data_directory>
7
+
8
+ This script will:
9
+ 1. Find all .dat files in the specified directory
10
+ 2. Locate the SAS labels file
11
+ 3. Create a timestamped output subdirectory
12
+ 4. Convert all .dat files to parquet format
13
+ 5. Generate comprehensive data dictionaries (CSV, JSON, HTML)
14
+ """
15
+
16
+ import sys
17
+ from pathlib import Path
18
+ from ncdb_tools import build_database
19
+
20
+ def main():
21
+ if len(sys.argv) != 2:
22
+ print("Usage: uv run build_database.py <data_directory>")
23
+ print("\nExample:")
24
+ print(" uv run build_database.py \"R:\\Jason\\NCDB\\NCDB_PUF_DATA_Sep-14-2024\"")
25
+ return 1
26
+
27
+ data_dir = sys.argv[1]
28
+
29
+ try:
30
+ result = build_database(data_dir=data_dir)
31
+ return 0
32
+ except Exception as e:
33
+ print(f"Error: {e}")
34
+ return 1
35
+
36
+ if __name__ == "__main__":
37
+ exit(main())
@@ -0,0 +1,6 @@
1
def main() -> None:
    """Print the package greeting to standard output."""
    greeting = "Hello from ncdb-tools!"
    print(greeting)


if __name__ == "__main__":
    main()
@@ -0,0 +1,63 @@
1
+ [project]
2
+ name = "ncdb-tools"
3
+ version = "0.1.0"
4
+ description = "Tools for processing and analyzing National Cancer Database (NCDB) data"
5
+ readme = "README.md"
6
+ requires-python = ">=3.9"
7
+ license = {text = "MIT"}
8
+ authors = [
9
+ {name = "NCDB Tools Contributors"}
10
+ ]
11
+ keywords = ["ncdb", "cancer", "database", "analysis", "parquet", "medical-research"]
12
+ classifiers = [
13
+ "Development Status :: 4 - Beta",
14
+ "Intended Audience :: Science/Research",
15
+ "Intended Audience :: Healthcare Industry",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.9",
19
+ "Programming Language :: Python :: 3.10",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Topic :: Scientific/Engineering :: Medical Science Apps.",
23
+ "Topic :: Scientific/Engineering :: Information Analysis",
24
+ "Operating System :: OS Independent",
25
+ ]
26
+ dependencies = [
27
+ "polars>=0.19.0",
28
+ "click>=8.0.0",
29
+ "rich>=10.0.0",
30
+ "pyarrow>=10.0.0",
31
+ ]
32
+
33
+ [project.optional-dependencies]
34
+ dev = [
35
+ "pytest>=7.0.0",
36
+ "pytest-cov>=4.0.0",
37
+ "ruff>=0.0.280",
38
+ "mypy>=1.0.0",
39
+ ]
40
+
41
+
42
+ [build-system]
43
+ requires = ["hatchling"]
44
+ build-backend = "hatchling.build"
45
+
46
+ [tool.ruff]
47
+ line-length = 88
48
+ target-version = "py39"
49
+ select = ["E", "F", "I", "N", "W", "RUF"]
50
+
51
+ [tool.mypy]
52
+ python_version = "3.9"
53
+ warn_return_any = true
54
+ warn_unused_configs = true
55
+ disallow_untyped_defs = true
56
+
57
+ [dependency-groups]
58
+ dev = [
59
+ "mypy>=1.16.0",
60
+ "pytest>=8.4.0",
61
+ "pytest-cov>=6.1.1",
62
+ "ruff>=0.11.12",
63
+ ]
@@ -0,0 +1,17 @@
1
+ """NCDB Tools - Tools for managing and analyzing National Cancer Database data."""
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ # Core functionality
6
+ from .data_dictionary import generate_data_dictionary
7
+ from .database_builder import build_database
8
+ from .dataset_builder import build_dataset
9
+ from .query import NCDBQuery, load_data
10
+
11
+ __all__ = [
12
+ "NCDBQuery",
13
+ "build_database", # High-level function for most users
14
+ "build_dataset",
15
+ "generate_data_dictionary",
16
+ "load_data",
17
+ ]
@@ -0,0 +1 @@
1
+ """Internal utilities for NCDB Tools."""
@@ -0,0 +1,115 @@
1
+ """Parser for SAS label files."""
2
+
3
import re
from pathlib import Path
from typing import Any, Dict, List, Tuple
6
+
7
+
8
def parse_sas_labels(sas_file_path: Path) -> Tuple[Dict[str, str], Dict[str, Dict[str, str]]]:
    """Parse a SAS file to extract variable labels and value formats.

    The parsing is regex-based and deliberately simple: only the first
    ``label`` statement is read, only single-quoted labels are collected, and
    a format is linked to a variable when the text ``<variable> <format>.``
    appears anywhere in the file.

    Args:
        sas_file_path: Path to the SAS labels file (decoded as latin-1).

    Returns:
        Tuple of (variable_labels, value_formats)
        - variable_labels: Dict mapping variable names to descriptions
        - value_formats: Dict mapping variable names to value->label mappings
    """
    with open(sas_file_path, 'r', encoding='latin-1') as f:
        content = f.read()

    flags = re.DOTALL | re.IGNORECASE

    # Extract variable labels (e.g., AGE = 'Age at Diagnosis')
    variable_labels: Dict[str, str] = {}
    label_pattern = r"(\w+)\s*=\s*'([^']+)'"

    # Find the label statement. Quoted strings are consumed as a unit so that
    # a ';' inside a label text does not end the statement prematurely.
    label_section = re.search(
        r"\blabel\s+((?:'[^']*'|\"[^\"]*\"|[^;'\"])*);", content, flags
    )
    if label_section is None:
        # Unbalanced quotes defeat the quote-aware pattern; fall back to
        # "everything up to the first semicolon".
        label_section = re.search(r'label\s+(.*?);', content, flags)
    if label_section:
        for match in re.finditer(label_pattern, label_section.group(1)):
            variable_labels[match.group(1)] = match.group(2)

    # Extract value formats (simplified - would need more robust parsing)
    value_formats: Dict[str, Dict[str, str]] = {}

    # Look for proc format value statements
    format_blocks = re.findall(
        r'value\s+(\w+)(.*?);\s*(?=value|\s*run|$)', content, flags
    )

    # Pattern for numeric or string values mapped to a quoted label
    value_pattern = r"(['\"]?)([^'\"=]+)\1\s*=\s*['\"]([^'\"]+)['\"]"

    for format_name, format_content in format_blocks:
        value_map = {
            match.group(2).strip(): match.group(3).strip()
            for match in re.finditer(value_pattern, format_content)
        }
        if not value_map:
            continue

        # Find which variables use this format. SAS names are not case
        # sensitive, so "format SEX SEXF." must match "value sexf".
        usage_pattern = rf'(\w+)\s+{re.escape(format_name)}\.'
        for var in re.findall(usage_pattern, content, re.IGNORECASE):
            # Each variable gets its own copy so that callers mutating one
            # mapping do not silently change another variable's mapping.
            value_formats[var] = dict(value_map)

    return variable_labels, value_formats
60
+
61
+
62
def parse_column_positions(sas_file_path: Path) -> List[Dict[str, Any]]:
    """Parse column positions from the first SAS ``input`` statement.

    Two layouts are recognised:

    - pointer style: ``@50 AGE 3.``, ``@53 SEX $1.`` or ``@1 ID $CHAR37.``
    - range style (used only when no pointer-style column is found):
      ``AGE $ 50-52``

    Args:
        sas_file_path: Path to the SAS file (decoded as latin-1).

    Returns:
        List of column definitions, in file order. Each entry is a dict with
        ``name`` (str) and ``start``, ``end``, ``width`` (int), where
        ``start`` is 0-based and ``end`` is exclusive, so
        ``line[start:end]`` slices the field. The list is empty when no
        ``input`` statement is found.
    """
    with open(sas_file_path, 'r', encoding='latin-1') as f:
        content = f.read()

    # Find the input statement; the word boundary keeps "input" from matching
    # the tail of a longer identifier.
    input_section = re.search(r'\binput\s+(.*?);\s*', content, re.DOTALL | re.IGNORECASE)
    if not input_section:
        return []

    input_text = input_section.group(1)
    columns: List[Dict[str, Any]] = []

    # First try the position-name format: @50 AGE $3.
    # The optional CHAR accepts the $CHARw. informat, which would otherwise
    # cause that column to be dropped without notice.
    at_pattern = r'@(\d+)\s+(\w+)\s+\$?(?:CHAR)?(\d+)\.'
    for match in re.finditer(at_pattern, input_text, re.IGNORECASE):
        start_pos = int(match.group(1)) - 1  # Convert to 0-based
        width = int(match.group(3))
        columns.append({
            'name': match.group(2),
            'start': start_pos,
            'end': start_pos + width,
            'width': width,
        })

    if columns:
        return columns

    # If no columns found, try the range format: AGE $ 50-52
    range_pattern = r'(\w+)\s*\$?\s*(\d+)-(\d+)'
    for match in re.finditer(range_pattern, input_text):
        start = int(match.group(2)) - 1  # Convert to 0-based
        end = int(match.group(3))  # 1-based inclusive == 0-based exclusive
        columns.append({
            'name': match.group(1),
            'start': start,
            'end': end,
            'width': end - start,
        })

    return columns
@@ -0,0 +1,87 @@
1
+ """Constants for NCDB Tools."""
2
+
3
+ from typing import List, Set
4
+
5
# --- File patterns ---------------------------------------------------------
DATA_FILE_PATTERN = "NCDBPUF_*.dat"
PARQUET_EXTENSION = ".parquet"

# --- Data specifications ---------------------------------------------------
NCDB_RECORD_LENGTH = 1032
NCDB_COLUMN_COUNT = 338

# --- Columns that should never be converted to numeric ---------------------
NEVER_NUMERIC_COLUMNS: Set[str] = {
    "PUF_CASE_ID", "PUF_FACILITY_ID",
    "PRIMARY_SITE", "HISTOLOGY", "HISTOLOGY_ICDO3", "BEHAVIOR", "LATERALITY",
    "CLASS_OF_CASE", "YEAR_OF_DIAGNOSIS", "SEQUENCE_NUMBER",
    "FACILITY_TYPE_CD", "FACILITY_LOCATION_CD", "ZIP",
}

# --- Standard column groups for convenience --------------------------------
DEMOGRAPHIC_COLUMNS: List[str] = [
    "AGE", "SEX", "RACE", "SPANISH_HISPANIC_ORIGIN",
    "INSURANCE_STATUS", "CDCC_TOTAL_BEST",
    "MED_INC_QUAR_00", "NO_HSD_QUAR_00", "UR_CD_03",
]

TREATMENT_COLUMNS: List[str] = [
    "RX_SUMM_SURG_PRIM_SITE", "RX_SUMM_RADIATION", "RX_SUMM_CHEMO",
    "RX_SUMM_HORMONE", "RX_SUMM_IMMUNOTHERAPY",
    "RX_SUMM_SYSTEMIC_SUR_SEQ", "RX_SUMM_TREATMENT_STATUS",
]

OUTCOME_COLUMNS: List[str] = [
    "PUF_VITAL_STATUS", "DX_LASTCONTACT_DEATH_MONTHS",
    "READM_HOSP_30_DAYS", "REASON_FOR_NO_SURGERY",
]

# --- Tumor types found in the actual data ----------------------------------
# The spellings are kept exactly as listed (e.g. "BoneJont"); do not
# "correct" them.
TUMOR_TYPES: List[str] = [
    "BoneJont", "Brain", "CNS", "EyeOrbit", "GumOtMth",
    "HodgExtr", "HodgNdal", "Hypophar", "Kaposi", "Langerhans",
    "Larynx", "Lip", "Melanoma", "MouthFlr", "Nasal",
    "Nasophar", "NHLExtr", "NHLNdal", "Orophary", "Pharynx",
    "SalivGld", "SoftTiss", "Thyroid", "Tongue", "Tonsil",
]