patient-data-expander-tool 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- patient_data_expander_tool-0.1.0/PKG-INFO +19 -0
- patient_data_expander_tool-0.1.0/README.md +0 -0
- patient_data_expander_tool-0.1.0/patient_data_expander_tool.egg-info/PKG-INFO +19 -0
- patient_data_expander_tool-0.1.0/patient_data_expander_tool.egg-info/SOURCES.txt +11 -0
- patient_data_expander_tool-0.1.0/patient_data_expander_tool.egg-info/dependency_links.txt +1 -0
- patient_data_expander_tool-0.1.0/patient_data_expander_tool.egg-info/entry_points.txt +2 -0
- patient_data_expander_tool-0.1.0/patient_data_expander_tool.egg-info/requires.txt +13 -0
- patient_data_expander_tool-0.1.0/patient_data_expander_tool.egg-info/top_level.txt +1 -0
- patient_data_expander_tool-0.1.0/pdet/__init__.py +0 -0
- patient_data_expander_tool-0.1.0/pdet/gdc_index_code.py +164 -0
- patient_data_expander_tool-0.1.0/pdet/main.py +2215 -0
- patient_data_expander_tool-0.1.0/pyproject.toml +24 -0
- patient_data_expander_tool-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: patient-data-expander-tool
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: The Patient Data Expander Tool allows researchers to identify clusters within their patient data sets, select the features that best define those clusters, then query public databases for similar patients and export their data to expand their initial dataset.
|
|
5
|
+
Requires-Python: >=3.13
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: ehrapy>=0.14.0
|
|
8
|
+
Requires-Dist: ehrdata>=0.2.1
|
|
9
|
+
Requires-Dist: holoviews>=1.22.1
|
|
10
|
+
Requires-Dist: hvplot>=0.12.2
|
|
11
|
+
Requires-Dist: ipykernel>=7.3.0
|
|
12
|
+
Requires-Dist: matplotlib>=3.11.0
|
|
13
|
+
Requires-Dist: nbdev>=3.0.17
|
|
14
|
+
Requires-Dist: pandas>=3.0.3
|
|
15
|
+
Requires-Dist: panel>=1.9.3
|
|
16
|
+
Requires-Dist: rapidfuzz>=3.14.5
|
|
17
|
+
Requires-Dist: scikit-learn>=1.9.0
|
|
18
|
+
Requires-Dist: sentence-transformers>=5.5.1
|
|
19
|
+
Requires-Dist: wordninja>=2.0.0
|
|
File without changes
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: patient-data-expander-tool
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: The Patient Data Expander Tool allows researchers to identify clusters within their patient data sets, select the features that best define those clusters, then query public databases for similar patients and export their data to expand their initial dataset.
|
|
5
|
+
Requires-Python: >=3.13
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: ehrapy>=0.14.0
|
|
8
|
+
Requires-Dist: ehrdata>=0.2.1
|
|
9
|
+
Requires-Dist: holoviews>=1.22.1
|
|
10
|
+
Requires-Dist: hvplot>=0.12.2
|
|
11
|
+
Requires-Dist: ipykernel>=7.3.0
|
|
12
|
+
Requires-Dist: matplotlib>=3.11.0
|
|
13
|
+
Requires-Dist: nbdev>=3.0.17
|
|
14
|
+
Requires-Dist: pandas>=3.0.3
|
|
15
|
+
Requires-Dist: panel>=1.9.3
|
|
16
|
+
Requires-Dist: rapidfuzz>=3.14.5
|
|
17
|
+
Requires-Dist: scikit-learn>=1.9.0
|
|
18
|
+
Requires-Dist: sentence-transformers>=5.5.1
|
|
19
|
+
Requires-Dist: wordninja>=2.0.0
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
patient_data_expander_tool.egg-info/PKG-INFO
|
|
4
|
+
patient_data_expander_tool.egg-info/SOURCES.txt
|
|
5
|
+
patient_data_expander_tool.egg-info/dependency_links.txt
|
|
6
|
+
patient_data_expander_tool.egg-info/entry_points.txt
|
|
7
|
+
patient_data_expander_tool.egg-info/requires.txt
|
|
8
|
+
patient_data_expander_tool.egg-info/top_level.txt
|
|
9
|
+
pdet/__init__.py
|
|
10
|
+
pdet/gdc_index_code.py
|
|
11
|
+
pdet/main.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pdet
|
|
File without changes
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import yaml
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
# ── Config ────────────────────────────────────────────────────────────────────
|
|
7
|
+
# Upstream GDC data-dictionary repository and the branch that gets cloned.
REPO_URL = "https://github.com/NCI-GDC/gdcdictionary.git"
BRANCH = "develop"

# NOTE(review): the default locations below are absolute paths under one
# user's home directory; on any other machine callers presumably have to pass
# their own paths to build_gdc_index() -- confirm before relying on defaults.

# Local checkout of the repository, and the schema folder inside that checkout.
CLONE_DIR = Path("/home/joseph_cottrell_99/ICR/JosephCottrell_2026Q2/data/epi700/gdcdictionary")
SCHEMAS_DIR = CLONE_DIR.joinpath("src/gdcdictionary/schemas")

# Destination of the generated JSON field index.
OUTPUT_PATH = Path("/home/joseph_cottrell_99/ICR/JosephCottrell_2026Q2/data/epi700/gdc_field_index.json")

# File names that are never indexed as entity schemas.
SKIP_FILES = {"_terms_enum.yaml", "README.md"}
|
|
13
|
+
|
|
14
|
+
# ── Load _definitions.yaml ────────────────────────────────────────────────────
|
|
15
|
+
def load_definitions(schemas_dir: Path) -> dict:
    """Load the shared field definitions from ``_definitions.yaml``.

    Args:
        schemas_dir: Directory that contains ``_definitions.yaml``.

    Returns:
        A mapping from definition name to a dict with the keys ``"type"``,
        ``"description"`` and ``"enum_values"``. Top-level entries whose
        value is not a mapping are left out. An empty file, or one whose
        top level is not a mapping, yields ``{}``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding.
    with open(schemas_dir / "_definitions.yaml", encoding="utf-8") as f:
        raw = yaml.safe_load(f)

    # safe_load returns None for an empty document; anything that is not a
    # mapping has no named definitions to offer.
    if not isinstance(raw, dict):
        return {}

    definitions = {}
    for def_name, def_body in raw.items():
        if not isinstance(def_body, dict):
            continue
        # A definition without an explicit type is labelled "enum" when it
        # lists allowed values, and "unknown" otherwise.
        fallback_type = "enum" if "enum" in def_body else "unknown"
        definitions[def_name] = {
            "type": def_body.get("type", fallback_type),
            "description": def_body.get("description", ""),
            "enum_values": def_body.get("enum", []),
        }
    return definitions
|
|
28
|
+
|
|
29
|
+
# ── Load _terms.yaml ──────────────────────────────────────────────────────────
|
|
30
|
+
def load_terms(schemas_dir: Path) -> dict:
    """Load term descriptions from ``_terms.yaml``.

    Each top-level entry is expected to be a mapping that carries its
    description either under a ``common`` sub-mapping or directly at the top
    level; the ``common`` description wins when both are non-empty.

    Args:
        schemas_dir: Directory that contains ``_terms.yaml``.

    Returns:
        A mapping from term name to its description string. Terms without a
        non-empty description are left out. An empty file, or one whose top
        level is not a mapping, yields ``{}``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding.
    with open(schemas_dir / "_terms.yaml", encoding="utf-8") as f:
        raw = yaml.safe_load(f)

    # safe_load returns None for an empty document.
    if not isinstance(raw, dict):
        return {}

    terms = {}
    for term_name, term_body in raw.items():
        if not isinstance(term_body, dict):
            continue
        # `common` may be present but null or a scalar; only a mapping can
        # hold a description, so guard before calling .get on it.
        common = term_body.get("common")
        common_desc = common.get("description") if isinstance(common, dict) else None
        desc = common_desc or term_body.get("description", "")
        if desc:
            terms[term_name] = desc
    return terms
|
|
46
|
+
|
|
47
|
+
# ── Resolve a single $ref string ──────────────────────────────────────────────
|
|
48
|
+
def resolve_ref_str(ref_str: str, definitions: dict, terms: dict) -> dict:
    """Resolve one ``$ref`` string against the loaded terms and definitions.

    Args:
        ref_str: A reference such as ``"_terms.yaml#/gender/common"`` or
            ``"_definitions.yaml#/data_type"``. Only the first path segment
            after ``#/`` is used as the lookup key.
        definitions: Mapping of definition name to its parsed entry.
        terms: Mapping of term name to its description.

    Returns:
        For a ``_terms.yaml`` reference, ``{"description": ...}`` when the
        term has a description, otherwise ``{}``. For a ``_definitions.yaml``
        reference, a shallow copy of the matching definition entry, or ``{}``
        when there is none. Any other input, including a non-string
        ``ref_str``, yields ``{}``.
    """
    # A `$ref` list may contain entries that are not strings; those cannot
    # name a target, so they resolve to nothing instead of raising.
    if not isinstance(ref_str, str):
        return {}

    if "_terms.yaml#/" in ref_str:
        # e.g. "_terms.yaml#/experimental_strategy/common" -> "experimental_strategy"
        key = ref_str.split("_terms.yaml#/")[-1].split("/")[0]
        desc = terms.get(key, "")
        return {"description": desc} if desc else {}

    if "_definitions.yaml#/" in ref_str:
        # e.g. "_definitions.yaml#/data_type" -> "data_type"
        key = ref_str.split("_definitions.yaml#/")[-1].split("/")[0]
        entry = definitions.get(key)
        # Return a copy so that callers editing the result cannot alter the
        # shared definitions table.
        return dict(entry) if isinstance(entry, dict) else {}

    return {}
|
|
65
|
+
|
|
66
|
+
# ── Resolve $ref (handles both string and list forms) ─────────────────────────
|
|
67
|
+
def resolve_ref(field_def: dict, definitions: dict, terms: dict) -> dict:
    """Resolve the ``$ref`` entry of a field into one merged dict.

    ``$ref`` may hold a single reference or a list of references. Each one is
    resolved with ``resolve_ref_str`` and the results are combined in order:
    a key keeps the first truthy value found for it, and a later reference
    only fills keys that are still missing or falsy.

    Args:
        field_def: The field's schema mapping.
        definitions: Mapping of definition name to its parsed entry.
        terms: Mapping of term name to its description.

    Returns:
        The merged values, or ``{}`` when the field has no ``$ref``.
    """
    targets = field_def.get("$ref")
    if targets is None:
        return {}

    # Normalise the single-reference form to a one-element list.
    if not isinstance(targets, list):
        targets = [targets]

    combined = {}
    for pointer in targets:
        for key, value in resolve_ref_str(pointer, definitions, terms).items():
            # An already-found truthy value is never overwritten.
            if combined.get(key):
                continue
            combined[key] = value
    return combined
|
|
84
|
+
|
|
85
|
+
# ── Parse a single field ──────────────────────────────────────────────────────
|
|
86
|
+
def parse_field(field_def: dict, definitions: dict, terms: dict) -> dict:
    """Reduce one schema property to its type, description and enum values.

    Values stated directly on the field win; empty ones are filled from the
    field's resolved ``$ref``. If the type is still empty, the first
    ``oneOf`` / ``anyOf`` option whose ``type`` is neither missing nor
    ``"null"`` supplies the type (and its ``enum``, when it has one).

    Args:
        field_def: The property's schema mapping.
        definitions: Mapping of definition name to its parsed entry.
        terms: Mapping of term name to its description.

    Returns:
        A dict with the keys ``"type"``, ``"description"`` and
        ``"enum_values"``; ``"type"`` falls back to ``"unknown"``. Returns
        ``{}`` when ``field_def`` is not a dict.
    """
    if not isinstance(field_def, dict):
        return {}

    inherited = resolve_ref(field_def, definitions, terms)

    field_type = field_def.get("type", "") or inherited.get("type", "")
    description = field_def.get("description", "") or inherited.get("description", "")
    enum_values = field_def.get("enum", []) or inherited.get("enum_values", [])

    # Union keywords are consulted only while no type has been found yet.
    for union_key in ("oneOf", "anyOf"):
        if field_type or union_key not in field_def:
            continue
        for option in field_def[union_key]:
            if not isinstance(option, dict):
                continue
            if option.get("type") in (None, "null"):
                continue
            field_type = option.get("type", "")
            enum_values = option.get("enum", enum_values)
            break

    # NOTE(review): a field that declares only `enum` (no type, no typed
    # $ref, no union) ends up as "unknown" here, while load_definitions
    # labels such entries "enum" -- confirm whether that is intended.
    return {
        "type": field_type or "unknown",
        "description": description,
        "enum_values": enum_values,
    }
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def build_gdc_index(clone_dir: Path = CLONE_DIR,
                    schemas_dir: Path = SCHEMAS_DIR,
                    output_path: Path = OUTPUT_PATH):
    """Sync the GDC dictionary checkout and write a JSON index of its fields.

    Every ``*.yaml`` file in ``schemas_dir`` whose name is not in
    ``SKIP_FILES`` and does not start with ``_`` is treated as one entity.
    Each of its ``properties`` is parsed with ``parse_field`` and stored under
    the key ``"<entity>.<field>"``.

    Args:
        clone_dir: Local checkout of the dictionary repository; cloned when
            it does not exist, pulled otherwise.
        schemas_dir: Directory holding the schema YAML files. When left at
            its default while ``clone_dir`` is overridden, it is derived from
            ``clone_dir`` so the schemas come from the repo just synced.
        output_path: File the index is written to; parent directories are
            created as needed.

    Raises:
        subprocess.CalledProcessError: If the git command exits non-zero.
        OSError: If git cannot be started or a file cannot be read or written.
    """
    # Without this, overriding only clone_dir would sync one directory and
    # then read schemas from the default clone location. The relative path
    # mirrors how SCHEMAS_DIR is built from CLONE_DIR.
    if schemas_dir == SCHEMAS_DIR and clone_dir != CLONE_DIR:
        schemas_dir = clone_dir / "src/gdcdictionary/schemas"

    # ── Clone / update repo ───────────────────────────────────────────────────
    _sync_gdc_repo(clone_dir)

    # ── Build index ───────────────────────────────────────────────────────────
    print("Loading shared definition files...")
    definitions = load_definitions(schemas_dir)
    terms = load_terms(schemas_dir)

    index = {}
    for yaml_file in sorted(schemas_dir.glob("*.yaml")):
        if yaml_file.name in SKIP_FILES or yaml_file.name.startswith("_"):
            continue

        entity = yaml_file.stem
        with open(yaml_file, encoding="utf-8") as f:
            schema = yaml.safe_load(f)

        # An empty file loads as None and `properties:` may be null; neither
        # contributes any fields, so skip instead of failing on .get/.items.
        if not isinstance(schema, dict):
            continue
        properties = schema.get("properties")
        if not isinstance(properties, dict):
            continue

        for field_name, field_def in properties.items():
            parsed = parse_field(field_def, definitions, terms)
            if not parsed:
                # parse_field returns {} for property values that are not mappings.
                continue
            index[f"{entity}.{field_name}"] = {
                "entity": entity,
                "field": field_name,
                **parsed,
            }

    # ── Save ──────────────────────────────────────────────────────────────────
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(index, f, indent=2)

    entities = len({entry["entity"] for entry in index.values()})
    print(f"✓ Indexed {len(index)} fields across {entities} entities — saved to {output_path}")


def _sync_gdc_repo(clone_dir: Path) -> None:
    """Pull the checkout at *clone_dir*, or shallow-clone BRANCH of REPO_URL into it."""
    if clone_dir.exists():
        print("Repo already cloned — pulling latest...")
        command = ["git", "-C", str(clone_dir), "pull"]
    else:
        print("Cloning GDC dictionary repo (first run)...")
        command = [
            "git", "clone", "--depth=1", "--branch", BRANCH,
            REPO_URL, str(clone_dir),
        ]
    # check=True turns a non-zero git exit status into CalledProcessError;
    # git's own output is captured rather than echoed.
    subprocess.run(command, check=True, capture_output=True)
|