hccinfhir 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/PKG-INFO +145 -3
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/README.md +143 -2
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/__init__.py +2 -1
- hccinfhir-0.2.2/hccinfhir/constants.py +240 -0
- hccinfhir-0.2.2/hccinfhir/data/ra_labels_2026.csv +784 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/datamodels.py +17 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/defaults.py +3 -1
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/extractor_834.py +52 -71
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/extractor_837.py +2 -2
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/hccinfhir.py +10 -10
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/model_calculate.py +18 -2
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/model_coefficients.py +2 -2
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/model_demographics.py +26 -29
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/model_interactions.py +7 -7
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/utils.py +68 -1
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/pyproject.toml +5 -2
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/.gitignore +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/LICENSE +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/data/__init__.py +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/data/hcc_is_chronic.csv +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/data/hcc_is_chronic_without_esrd_model.csv +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/data/ra_coefficients_2025.csv +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/data/ra_coefficients_2026.csv +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/data/ra_dx_to_cc_2025.csv +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/data/ra_dx_to_cc_2026.csv +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/data/ra_eligible_cpt_hcpcs_2023.csv +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/data/ra_eligible_cpt_hcpcs_2024.csv +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/data/ra_eligible_cpt_hcpcs_2025.csv +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/data/ra_eligible_cpt_hcpcs_2026.csv +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/data/ra_hierarchies_2025.csv +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/data/ra_hierarchies_2026.csv +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/extractor.py +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/extractor_fhir.py +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/filter.py +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/model_dx_to_cc.py +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/model_hierarchies.py +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/sample_files/__init__.py +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/sample_files/sample_834_01.txt +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/sample_files/sample_837_0.txt +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/sample_files/sample_837_1.txt +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/sample_files/sample_837_10.txt +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/sample_files/sample_837_11.txt +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/sample_files/sample_837_12.txt +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/sample_files/sample_837_2.txt +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/sample_files/sample_837_3.txt +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/sample_files/sample_837_4.txt +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/sample_files/sample_837_5.txt +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/sample_files/sample_837_6.txt +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/sample_files/sample_837_7.txt +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/sample_files/sample_837_8.txt +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/sample_files/sample_837_9.txt +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/sample_files/sample_eob_1.json +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/sample_files/sample_eob_2.json +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/sample_files/sample_eob_200.ndjson +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/sample_files/sample_eob_3.json +0 -0
- {hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/samples.py +0 -0
{hccinfhir-0.2.0 → hccinfhir-0.2.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: hccinfhir
-Version: 0.2.0
+Version: 0.2.2
 Summary: HCC Algorithm for FHIR Resources
 Project-URL: Homepage, https://github.com/mimilabs/hccinfhir
 Project-URL: Issues, https://github.com/mimilabs/hccinfhir/issues

@@ -10,6 +10,7 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.8
 Requires-Dist: pydantic>=2.10.3
+Requires-Dist: typing-extensions>=4.6.0
 Description-Content-Type: text/markdown

 # HCCInFHIR
PKG-INFO embeds the README as the package long description, so the remaining four PKG-INFO hunks (table of contents, test-count bullet, the new Databricks section, and the test-run comment) are identical to the README.md changes shown below.
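The metadata changes above (version bump plus the new `typing-extensions` pin) can be confirmed from an installed environment with the standard library alone. A minimal sketch, assuming `hccinfhir` 0.2.2 is installed; the expected strings come from the PKG-INFO hunks above:

```python
# Quick check of the installed package metadata (Python 3.8+).
from importlib.metadata import version, requires

print(version("hccinfhir"))  # expected: 0.2.2
print([r for r in (requires("hccinfhir") or [])
       if r.startswith(("pydantic", "typing-extensions"))])
# expected: ['pydantic>=2.10.3', 'typing-extensions>=4.6.0']
```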
{hccinfhir-0.2.0 → hccinfhir-0.2.2}/README.md

@@ -48,6 +48,7 @@ print(f"HCCs: {result.hcc_list}")
 - [Demographic Prefix Override](#demographic-prefix-override)
 - [Custom File Path Resolution](#custom-file-path-resolution)
 - [Batch Processing](#batch-processing)
+- [Large-Scale Processing with Databricks](#large-scale-processing-with-databricks)
 - [Converting to Dictionaries](#converting-to-dictionaries)
 - [Sample Data](#sample-data)
 - [Testing](#testing)

@@ -64,7 +65,7 @@ print(f"HCCs: {result.hcc_list}")
 - **Custom Data Files**: Full support for custom coefficients, mappings, and hierarchies
 - **Flexible File Resolution**: Absolute paths, relative paths, or bundled data files
 - **Type-Safe**: Built on Pydantic with full type hints
-- **Well-Tested**:
+- **Well-Tested**: 181 comprehensive tests covering all features

 ## 📊 Data Sources & Use Cases

@@ -820,6 +821,146 @@ with open("risk_scores.json", "w") as f:
     json.dump(results, f, indent=2)
 ```

+### Large-Scale Processing with Databricks
+
+For processing millions of beneficiaries, use PySpark's `pandas_udf` for distributed computation. The hccinfhir logic is well-suited for batch operations with clear, simple transformations.
+
+**Performance Benchmark**:
+
+
+
+*Tested with ACO data on Databricks Runtime 17.3 LTS, Worker: i3.4xlarge (122GB, 16 cores)*
+
+The chart shows execution time varies based on condition complexity - members with more diagnoses require additional internal processing loops. While the relationship isn't perfectly linear, **1 million members can be processed in under 2 minutes** with this configuration.
+
+```python
+from pyspark.sql import SparkSession
+from pyspark.sql.types import StructType, StructField, FloatType, ArrayType, StringType
+from pyspark.sql import functions as F
+from pyspark.sql.functions import pandas_udf
+import pandas as pd
+
+from hccinfhir import HCCInFHIR, Demographics
+
+# Define the return schema
+hcc_schema = StructType([
+    StructField("risk_score", FloatType(), True),
+    StructField("risk_score_demographics", FloatType(), True),
+    StructField("risk_score_chronic_only", FloatType(), True),
+    StructField("risk_score_hcc", FloatType(), True),
+    StructField("hcc_list", ArrayType(StringType()), True)
+])
+
+# Initialize processor (will be serialized to each executor)
+hcc_processor = HCCInFHIR(model_name="CMS-HCC Model V28")
+
+# Create the pandas UDF
+@pandas_udf(hcc_schema)
+def calculate_hcc(
+    age_series: pd.Series,
+    sex_series: pd.Series,
+    diagnosis_series: pd.Series
+) -> pd.DataFrame:
+    results = []
+
+    for age, sex, diagnosis_codes in zip(age_series, sex_series, diagnosis_series):
+        try:
+            demographics = Demographics(age=int(age), sex=sex)
+
+            # diagnosis_codes can be passed directly - accepts any iterable including numpy arrays
+            result = hcc_processor.calculate_from_diagnosis(diagnosis_codes, demographics)
+
+            results.append({
+                'risk_score': float(result.risk_score),
+                'risk_score_demographics': float(result.risk_score_demographics),
+                'risk_score_chronic_only': float(result.risk_score_chronic_only),
+                'risk_score_hcc': float(result.risk_score_hcc),
+                'hcc_list': result.hcc_list
+            })
+        except Exception as e:
+            # Log error and return nulls for failed rows
+            print(f"ERROR processing row: {e}")
+            results.append({
+                'risk_score': None,
+                'risk_score_demographics': None,
+                'risk_score_chronic_only': None,
+                'risk_score_hcc': None,
+                'hcc_list': None
+            })
+
+    return pd.DataFrame(results)
+
+# Apply the UDF to your DataFrame
+# Assumes df has columns: age, patient_gender, diagnosis_codes (array of strings)
+df = df.withColumn(
+    "hcc_results",
+    calculate_hcc(
+        F.col("age"),
+        F.col("patient_gender"),
+        F.col("diagnosis_codes")
+    )
+)
+
+# Expand the struct into separate columns
+df = df.select(
+    "*",
+    F.col("hcc_results.risk_score").alias("risk_score"),
+    F.col("hcc_results.risk_score_demographics").alias("risk_score_demographics"),
+    F.col("hcc_results.risk_score_chronic_only").alias("risk_score_chronic_only"),
+    F.col("hcc_results.risk_score_hcc").alias("risk_score_hcc"),
+    F.col("hcc_results.hcc_list").alias("hcc_list")
+).drop("hcc_results")
+```
+
+**Performance Tips**:
+- **Repartition** your DataFrame before applying the UDF to balance workload across executors
+- **Cache** the processor initialization by defining it at module level
+- **Batch size**: pandas_udf processes data in batches; Spark handles optimal batch sizing automatically
+- **Install hccinfhir** on all cluster nodes: `%pip install hccinfhir` in a notebook cell or add to cluster init script
+
+**Extended Schema with Demographics**:
+
+```python
+# Include additional demographic parameters
+@pandas_udf(hcc_schema)
+def calculate_hcc_full(
+    age_series: pd.Series,
+    sex_series: pd.Series,
+    dual_status_series: pd.Series,
+    diagnosis_series: pd.Series
+) -> pd.DataFrame:
+    results = []
+
+    for age, sex, dual_status, diagnosis_codes in zip(
+        age_series, sex_series, dual_status_series, diagnosis_series
+    ):
+        try:
+            demographics = Demographics(
+                age=int(age),
+                sex=sex,
+                dual_elgbl_cd=dual_status if dual_status else "00"
+            )
+            result = hcc_processor.calculate_from_diagnosis(diagnosis_codes, demographics)
+
+            results.append({
+                'risk_score': float(result.risk_score),
+                'risk_score_demographics': float(result.risk_score_demographics),
+                'risk_score_chronic_only': float(result.risk_score_chronic_only),
+                'risk_score_hcc': float(result.risk_score_hcc),
+                'hcc_list': result.hcc_list
+            })
+        except Exception as e:
+            results.append({
+                'risk_score': None,
+                'risk_score_demographics': None,
+                'risk_score_chronic_only': None,
+                'risk_score_hcc': None,
+                'hcc_list': None
+            })
+
+    return pd.DataFrame(results)
+```
+
 ### Converting to Dictionaries

 All Pydantic models support dictionary conversion for JSON serialization, database storage, or legacy code:

@@ -900,7 +1041,7 @@ hatch shell
 # Install in development mode
 pip install -e .

-# Run all tests (
+# Run all tests (181 tests)
 pytest tests/

 # Run specific test file
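Before scaling this out, the pipeline can be smoke-tested on a tiny local DataFrame. A minimal sketch, assuming a local Spark session and the `calculate_hcc` definition from the new README section above; the two sample members and their ICD-10 codes are made up for illustration:

```python
# Local smoke test for the pandas_udf pipeline sketched above.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[2]").appName("hcc-smoke-test").getOrCreate()

# Hypothetical members: (age, patient_gender, diagnosis_codes)
df = spark.createDataFrame(
    [(67, "F", ["E119", "I5030"]),
     (72, "M", ["J449"])],
    "age INT, patient_gender STRING, diagnosis_codes ARRAY<STRING>",
)

# Repartition before applying the UDF, per the performance tips above
df = df.repartition(2)

df = df.withColumn(
    "hcc_results",
    calculate_hcc(F.col("age"), F.col("patient_gender"), F.col("diagnosis_codes")),
)
df.select("age", "hcc_results.risk_score", "hcc_results.hcc_list").show(truncate=False)
```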
{hccinfhir-0.2.0 → hccinfhir-0.2.2}/hccinfhir/__init__.py

@@ -9,7 +9,7 @@ from .hccinfhir import HCCInFHIR
 from .extractor import extract_sld, extract_sld_list
 from .filter import apply_filter
 from .model_calculate import calculate_raf
-from .datamodels import Demographics, ServiceLevelData, RAFResult, ModelName
+from .datamodels import Demographics, ServiceLevelData, RAFResult, ModelName, HCCDetail

 # Sample data functions
 from .samples import (

@@ -37,6 +37,7 @@ __all__ = [
     "ServiceLevelData",
     "RAFResult",
     "ModelName",
+    "HCCDetail",

     # Sample data
     "SampleData",
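With `HCCDetail` now re-exported from the package root, downstream code can take everything from `hccinfhir` directly. A minimal sketch; the diff does not show `HCCDetail`'s fields, so only names visible in this diff are used, and the diagnosis codes are illustrative:

```python
# HCCDetail joins the existing top-level exports as of 0.2.2.
from hccinfhir import HCCInFHIR, Demographics, HCCDetail  # noqa: F401

processor = HCCInFHIR(model_name="CMS-HCC Model V28")
demographics = Demographics(age=70, sex="F")

# Example ICD-10 codes, for illustration only
result = processor.calculate_from_diagnosis(["E119", "I5030"], demographics)
print(result.risk_score, result.hcc_list)
```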
hccinfhir-0.2.2/hccinfhir/constants.py (new file)

@@ -0,0 +1,240 @@
+"""
+CMS Risk Adjustment Domain Constants
+
+This module contains constants used across the HCC risk adjustment system,
+including dual eligibility codes, OREC/CREC values, and state-specific mappings.
+
+References:
+- CMS Rate Announcement and Call Letter
+- Medicare Advantage Enrollment and Disenrollment Guidance
+- X12 834 Implementation Guides
+"""
+
+from typing import Set, Dict
+
+# =============================================================================
+# DUAL ELIGIBILITY CODES
+# =============================================================================
+# CMS Dual Eligibility Status Codes (Medicare + Medicaid)
+# Used in coefficient prefix selection (CNA_, CFA_, CPA_, etc.)
+
+VALID_DUAL_CODES: Set[str] = {'00', '01', '02', '03', '04', '05', '06', '08'}
+
+# Non-Dual Eligible
+NON_DUAL_CODE: str = '00'
+
+# Full Benefit Dual Eligible (receive both Medicare and full Medicaid benefits)
+# Uses CFA_ (Community, Full Benefit Dual, Aged) or CFD_ (Disabled) prefixes
+FULL_BENEFIT_DUAL_CODES: Set[str] = {
+    '02',  # QMB Plus (Qualified Medicare Beneficiary Plus)
+    '04',  # SLMB Plus (Specified Low-Income Medicare Beneficiary Plus)
+    '08',  # Other Full Benefit Dual Eligible
+}
+
+# Partial Benefit Dual Eligible (Medicare + limited Medicaid)
+# Uses CPA_ (Community, Partial Benefit Dual, Aged) or CPD_ (Disabled) prefixes
+PARTIAL_BENEFIT_DUAL_CODES: Set[str] = {
+    '01',  # QMB Only
+    '03',  # SLMB Only
+    '05',  # QDWI (Qualified Disabled and Working Individual)
+    '06',  # QI (Qualifying Individual)
+}
+
+# =============================================================================
+# OREC - Original Reason for Entitlement Code
+# =============================================================================
+# Determines if beneficiary has ESRD and affects coefficient prefix selection
+
+VALID_OREC_VALUES: Set[str] = {'0', '1', '2', '3'}
+
+OREC_DESCRIPTIONS: Dict[str, str] = {
+    '0': 'Old Age and Survivors Insurance (OASI)',
+    '1': 'Disability Insurance Benefits (DIB)',
+    '2': 'ESRD - End-Stage Renal Disease',
+    '3': 'DIB and ESRD',
+}
+
+# OREC codes indicating ESRD status (per CMS documentation)
+OREC_ESRD_CODES: Set[str] = {'2', '3'}
+
+# =============================================================================
+# CREC - Current Reason for Entitlement Code
+# =============================================================================
+# Current entitlement status (may differ from OREC)
+
+VALID_CREC_VALUES: Set[str] = {'0', '1', '2', '3'}
+
+CREC_DESCRIPTIONS: Dict[str, str] = {
+    '0': 'Old Age and Survivors Insurance (OASI)',
+    '1': 'Disability Insurance Benefits (DIB)',
+    '2': 'ESRD - End-Stage Renal Disease',
+    '3': 'DIB and ESRD',
+}
+
+# CREC codes indicating ESRD status
+CREC_ESRD_CODES: Set[str] = {'2', '3'}
+
+# =============================================================================
+# COEFFICIENT PREFIX GROUPS
+# =============================================================================
+# Used for prefix_override logic in model_demographics.py
+
+# ESRD model prefixes
+ESRD_PREFIXES: Set[str] = {'DI_', 'DNE_', 'GI_', 'GNE_', 'GFPA_', 'GFPN_', 'GNPA_', 'GNPN_'}
+
+# CMS-HCC new enrollee prefixes
+NEW_ENROLLEE_PREFIXES: Set[str] = {'NE_', 'SNPNE_', 'DNE_', 'GNE_'}
+
+# CMS-HCC community prefixes
+COMMUNITY_PREFIXES: Set[str] = {'CNA_', 'CND_', 'CFA_', 'CFD_', 'CPA_', 'CPD_'}
+
+# Institutionalized prefixes
+INSTITUTIONAL_PREFIXES: Set[str] = {'INS_', 'GI_'}
+
+# Full Benefit Dual prefixes
+FULL_BENEFIT_DUAL_PREFIXES: Set[str] = {'CFA_', 'CFD_', 'GFPA_', 'GFPN_'}
+
+# Partial Benefit Dual prefixes
+PARTIAL_BENEFIT_DUAL_PREFIXES: Set[str] = {'CPA_', 'CPD_'}
+
+# Non-Dual prefixes
+NON_DUAL_PREFIXES: Set[str] = {'CNA_', 'CND_', 'GNPA_', 'GNPN_'}
+
+# =============================================================================
+# DEMOGRAPHIC CODES
+# =============================================================================
+
+VALID_SEX_CODES: Set[str] = {'M', 'F'}
+
+# X12 834 Gender Code mappings
+X12_SEX_CODE_MAPPING: Dict[str, str] = {
+    'M': 'M',
+    'F': 'F',
+    '1': 'M',  # X12 numeric code
+    '2': 'F',  # X12 numeric code
+}
+
+# =============================================================================
+# X12 834 MAINTENANCE TYPE CODES
+# =============================================================================
+# INS03 - Maintenance Type Code
+
+MAINTENANCE_TYPE_CHANGE: str = '001'
+MAINTENANCE_TYPE_ADD: str = '021'
+MAINTENANCE_TYPE_CANCEL: str = '024'
+MAINTENANCE_TYPE_REINSTATE: str = '025'
+
+MAINTENANCE_TYPE_DESCRIPTIONS: Dict[str, str] = {
+    '001': 'Change',
+    '021': 'Addition',
+    '024': 'Cancellation/Termination',
+    '025': 'Reinstatement',
+}
+
+# =============================================================================
+# STATE-SPECIFIC MAPPINGS
+# =============================================================================
+
+# -----------------------------------------------------------------------------
+# California DHCS Medi-Cal Aid Codes
+# -----------------------------------------------------------------------------
+# Maps California-specific aid codes to CMS dual eligibility codes
+# Source: California DHCS 834 Implementation Guide
+
+MEDI_CAL_AID_CODES: Dict[str, str] = {
+    # Full Benefit Dual (QMB Plus, SLMB Plus)
+    '4N': '02',  # QMB Plus - Aged
+    '4P': '02',  # QMB Plus - Disabled
+    '5B': '04',  # SLMB Plus - Aged
+    '5D': '04',  # SLMB Plus - Disabled
+
+    # Partial Benefit Dual (QMB Only, SLMB Only, QI)
+    '4M': '01',  # QMB Only - Aged
+    '4O': '01',  # QMB Only - Disabled
+    '5A': '03',  # SLMB Only - Aged
+    '5C': '03',  # SLMB Only - Disabled
+    '5E': '06',  # QI - Aged
+    '5F': '06',  # QI - Disabled
+}
+
+# -----------------------------------------------------------------------------
+# Medicare Status Code Mappings
+# -----------------------------------------------------------------------------
+# Maps Medicare status codes (from various sources) to CMS dual eligibility codes
+# Used in X12 834 REF*ABB segment and other payer files
+
+MEDICARE_STATUS_CODE_MAPPING: Dict[str, str] = {
+    # QMB - Qualified Medicare Beneficiary
+    'QMB': '01',       # QMB Only (Partial)
+    'QMBONLY': '01',
+    'QMBPLUS': '02',   # QMB Plus (Full Benefit)
+    'QMB+': '02',
+
+    # SLMB - Specified Low-Income Medicare Beneficiary
+    'SLMB': '03',      # SLMB Only (Partial)
+    'SLMBONLY': '03',
+    'SLMBPLUS': '04',  # SLMB Plus (Full Benefit)
+    'SLMB+': '04',
+
+    # Other dual eligibility programs
+    'QDWI': '05',      # Qualified Disabled and Working Individual
+    'QI': '06',        # Qualifying Individual
+    'QI1': '06',
+    'FBDE': '08',      # Full Benefit Dual Eligible (Other)
+    'OTHERFULL': '08',
+}
+
+# =============================================================================
+# HELPER FUNCTIONS
+# =============================================================================
+
+def is_full_benefit_dual(dual_code: str) -> bool:
+    """Check if dual eligibility code is Full Benefit Dual"""
+    return dual_code in FULL_BENEFIT_DUAL_CODES
+
+def is_partial_benefit_dual(dual_code: str) -> bool:
+    """Check if dual eligibility code is Partial Benefit Dual"""
+    return dual_code in PARTIAL_BENEFIT_DUAL_CODES
+
+def is_esrd_by_orec(orec: str) -> bool:
+    """Check if OREC indicates ESRD status"""
+    return orec in OREC_ESRD_CODES
+
+def is_esrd_by_crec(crec: str) -> bool:
+    """Check if CREC indicates ESRD status"""
+    return crec in CREC_ESRD_CODES
+
+def normalize_medicare_status_code(status: str) -> str:
+    """Normalize Medicare status code (uppercase, no spaces/hyphens)"""
+    if not status:
+        return ''
+    return status.upper().replace(' ', '').replace('-', '')
+
+def map_medicare_status_to_dual_code(status: str) -> str:
+    """Map Medicare status code to dual eligibility code
+
+    Args:
+        status: Medicare status code (e.g., 'QMB Plus', 'SLMB', 'QI')
+
+    Returns:
+        Dual eligibility code ('01'-'08') or '00' if not found
+    """
+    if not status:
+        return NON_DUAL_CODE
+
+    normalized = normalize_medicare_status_code(status)
+    return MEDICARE_STATUS_CODE_MAPPING.get(normalized, NON_DUAL_CODE)
+
+def map_aid_code_to_dual_status(aid_code: str) -> str:
+    """Map California Medi-Cal aid code to dual eligibility code
+
+    Args:
+        aid_code: California aid code (e.g., '4N', '5B')
+
+    Returns:
+        Dual eligibility code ('01'-'08') or '00' if not found
+    """
+    if not aid_code:
+        return NON_DUAL_CODE
+
+    return MEDI_CAL_AID_CODES.get(aid_code, NON_DUAL_CODE)