patient-data-expander-tool 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.4
2
+ Name: patient-data-expander-tool
3
+ Version: 0.1.0
4
+ Summary: The Patient Data Expander Tool allows researchers to identify clusters within their patient data sets, select the features that best define those clusters, then query public databases for similar patients and export their data to expand their initial dataset.
5
+ Requires-Python: >=3.13
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: ehrapy>=0.14.0
8
+ Requires-Dist: ehrdata>=0.2.1
9
+ Requires-Dist: holoviews>=1.22.1
10
+ Requires-Dist: hvplot>=0.12.2
11
+ Requires-Dist: ipykernel>=7.3.0
12
+ Requires-Dist: matplotlib>=3.11.0
13
+ Requires-Dist: nbdev>=3.0.17
14
+ Requires-Dist: pandas>=3.0.3
15
+ Requires-Dist: panel>=1.9.3
16
+ Requires-Dist: rapidfuzz>=3.14.5
17
+ Requires-Dist: scikit-learn>=1.9.0
18
+ Requires-Dist: sentence-transformers>=5.5.1
19
+ Requires-Dist: wordninja>=2.0.0
File without changes
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.4
2
+ Name: patient-data-expander-tool
3
+ Version: 0.1.0
4
+ Summary: The Patient Data Expander Tool allows researchers to identify clusters within their patient data sets, select the features that best define those clusters, then query public databases for similar patients and export their data to expand their initial dataset.
5
+ Requires-Python: >=3.13
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: ehrapy>=0.14.0
8
+ Requires-Dist: ehrdata>=0.2.1
9
+ Requires-Dist: holoviews>=1.22.1
10
+ Requires-Dist: hvplot>=0.12.2
11
+ Requires-Dist: ipykernel>=7.3.0
12
+ Requires-Dist: matplotlib>=3.11.0
13
+ Requires-Dist: nbdev>=3.0.17
14
+ Requires-Dist: pandas>=3.0.3
15
+ Requires-Dist: panel>=1.9.3
16
+ Requires-Dist: rapidfuzz>=3.14.5
17
+ Requires-Dist: scikit-learn>=1.9.0
18
+ Requires-Dist: sentence-transformers>=5.5.1
19
+ Requires-Dist: wordninja>=2.0.0
@@ -0,0 +1,11 @@
1
+ README.md
2
+ pyproject.toml
3
+ patient_data_expander_tool.egg-info/PKG-INFO
4
+ patient_data_expander_tool.egg-info/SOURCES.txt
5
+ patient_data_expander_tool.egg-info/dependency_links.txt
6
+ patient_data_expander_tool.egg-info/entry_points.txt
7
+ patient_data_expander_tool.egg-info/requires.txt
8
+ patient_data_expander_tool.egg-info/top_level.txt
9
+ pdet/__init__.py
10
+ pdet/gdc_index_code.py
11
+ pdet/main.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ run_pdet = pdet.main:main
@@ -0,0 +1,13 @@
1
+ ehrapy>=0.14.0
2
+ ehrdata>=0.2.1
3
+ holoviews>=1.22.1
4
+ hvplot>=0.12.2
5
+ ipykernel>=7.3.0
6
+ matplotlib>=3.11.0
7
+ nbdev>=3.0.17
8
+ pandas>=3.0.3
9
+ panel>=1.9.3
10
+ rapidfuzz>=3.14.5
11
+ scikit-learn>=1.9.0
12
+ sentence-transformers>=5.5.1
13
+ wordninja>=2.0.0
File without changes
@@ -0,0 +1,164 @@
1
import json
import os
import subprocess
from pathlib import Path

import yaml
5
+
6
+ # ── Config ────────────────────────────────────────────────────────────────────
7
# Upstream repository holding the GDC data dictionary schemas, and the branch to track.
REPO_URL = "https://github.com/NCI-GDC/gdcdictionary.git"
BRANCH = "develop"

# Local checkout location. The built-in default is the original author's
# workstation path, which does not exist on other machines; set the
# PDET_GDC_CLONE_DIR environment variable to relocate it. An empty value is
# treated as "not set" so it cannot silently resolve to the current directory.
CLONE_DIR = Path(
    os.environ.get("PDET_GDC_CLONE_DIR")
    or "/home/joseph_cottrell_99/ICR/JosephCottrell_2026Q2/data/epi700/gdcdictionary"
)

# Directory inside the checkout that contains the per-entity YAML schemas.
SCHEMAS_DIR = CLONE_DIR / "src/gdcdictionary/schemas"

# Destination of the generated JSON field index; override with PDET_GDC_INDEX_PATH.
OUTPUT_PATH = Path(
    os.environ.get("PDET_GDC_INDEX_PATH")
    or "/home/joseph_cottrell_99/ICR/JosephCottrell_2026Q2/data/epi700/gdc_field_index.json"
)

# File names in the schemas directory that are never indexed.
SKIP_FILES = {"README.md", "_terms_enum.yaml"}
13
+
14
+ # ── Load _definitions.yaml ────────────────────────────────────────────────────
15
def load_definitions(schemas_dir: Path) -> dict:
    """Load the shared ``_definitions.yaml`` file into a lookup table.

    Args:
        schemas_dir: Directory that contains ``_definitions.yaml``.

    Returns:
        A dict mapping each definition name to a dict with the keys
        ``"type"``, ``"description"`` and ``"enum_values"``. Top-level entries
        that are not mappings are skipped. An empty or non-mapping document
        yields an empty dict.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # locale encoding.
    with open(schemas_dir / "_definitions.yaml", encoding="utf-8") as f:
        raw = yaml.safe_load(f)

    # safe_load returns None for an empty document; treat anything that is not
    # a mapping as "no definitions" instead of failing on .items().
    if not isinstance(raw, dict):
        return {}

    definitions = {}
    for def_name, def_body in raw.items():
        if not isinstance(def_body, dict):
            continue
        # A definition without an explicit type is labelled "enum" when it
        # carries an enum list, otherwise "unknown".
        fallback_type = "enum" if "enum" in def_body else "unknown"
        definitions[def_name] = {
            "type": def_body.get("type", fallback_type),
            "description": def_body.get("description", ""),
            "enum_values": def_body.get("enum", []),
        }
    return definitions
28
+
29
+ # ── Load _terms.yaml ──────────────────────────────────────────────────────────
30
def load_terms(schemas_dir: Path) -> dict:
    """Load term descriptions from the shared ``_terms.yaml`` file.

    The expected structure is ``term_name -> {common: {description: "..."}, ...}``.
    The description under ``common`` is preferred; a top-level ``description``
    on the term is used as a fallback.

    Args:
        schemas_dir: Directory that contains ``_terms.yaml``.

    Returns:
        A dict mapping term name to its description string. Terms without a
        non-empty description are omitted. An empty or non-mapping document
        yields an empty dict.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # locale encoding.
    with open(schemas_dir / "_terms.yaml", encoding="utf-8") as f:
        raw = yaml.safe_load(f)

    # safe_load returns None for an empty document.
    if not isinstance(raw, dict):
        return {}

    terms = {}
    for term_name, term_body in raw.items():
        if not isinstance(term_body, dict):
            continue
        # "common" may be present but null or a non-mapping; only look inside
        # it when it really is a dict.
        common = term_body.get("common")
        common_desc = common.get("description") if isinstance(common, dict) else None
        desc = common_desc or term_body.get("description", "")
        if desc:
            terms[term_name] = desc
    return terms
46
+
47
+ # ── Resolve a single $ref string ──────────────────────────────────────────────
48
def resolve_ref_str(ref_str: str, definitions: dict, terms: dict) -> dict:
    """Resolve one ``$ref`` string against the shared terms/definitions tables.

    Two reference forms are recognised:

    * ``"_terms.yaml#/<name>/..."`` resolves to ``{"description": ...}`` when
      ``terms`` has a non-empty description for ``<name>``.
    * ``"_definitions.yaml#/<name>/..."`` resolves to the entry stored under
      ``<name>`` in ``definitions``.

    Only the first path segment after ``#/`` is used as the lookup key.

    Args:
        ref_str: The reference string to resolve.
        definitions: Mapping of definition name to its parsed entry.
        terms: Mapping of term name to description.

    Returns:
        A new dict with whatever could be resolved, or an empty dict when the
        reference is not a string, has an unrecognised form, or names an
        unknown key.
    """
    # A malformed $ref (e.g. a nested mapping) resolves to nothing rather than
    # raising from the substring checks below.
    if not isinstance(ref_str, str):
        return {}

    if "_terms.yaml#/" in ref_str:
        # e.g. "_terms.yaml#/experimental_strategy/common"
        key = ref_str.split("_terms.yaml#/")[-1].split("/")[0]
        desc = terms.get(key, "")
        return {"description": desc} if desc else {}

    if "_definitions.yaml#/" in ref_str:
        # e.g. "_definitions.yaml#/data_type"
        key = ref_str.split("_definitions.yaml#/")[-1].split("/")[0]
        # Shallow-copy so callers never hold (and cannot mutate) the entry
        # stored in the shared definitions table.
        return dict(definitions.get(key, {}))

    return {}
65
+
66
+ # ── Resolve $ref (handles both string and list forms) ─────────────────────────
67
def resolve_ref(field_def: dict, definitions: dict, terms: dict) -> dict:
    """Resolve the ``$ref`` entry of a field definition, if it has one.

    ``$ref`` may hold a single reference string or a list of them. Each
    reference is resolved with ``resolve_ref_str`` and the results are merged
    in order: a key keeps the first non-empty value found for it.

    Args:
        field_def: The field's schema mapping.
        definitions: Mapping of definition name to its parsed entry.
        terms: Mapping of term name to description.

    Returns:
        The merged dict of resolved values, or an empty dict when the field
        has no ``$ref``.
    """
    ref_val = field_def.get("$ref")
    if ref_val is None:
        return {}

    # Normalise the string form to a one-element list.
    ref_list = ref_val if isinstance(ref_val, list) else [ref_val]

    combined = {}
    for single_ref in ref_list:
        for key, value in resolve_ref_str(single_ref, definitions, terms).items():
            # Fill a key only while it is missing or still holds an empty value.
            if not combined.get(key):
                combined[key] = value
    return combined
84
+
85
+ # ── Parse a single field ──────────────────────────────────────────────────────
86
def parse_field(field_def: dict, definitions: dict, terms: dict) -> dict:
    """Extract type, description and enum values for one schema property.

    Values set directly on the field win. Anything still empty is filled from
    the field's resolved ``$ref`` and, for the type only, from the first
    ``oneOf``/``anyOf`` option that declares a non-null type.

    Args:
        field_def: The property's schema mapping.
        definitions: Mapping of definition name to its parsed entry.
        terms: Mapping of term name to description.

    Returns:
        A dict with the keys ``"type"``, ``"description"`` and
        ``"enum_values"``, or an empty dict when ``field_def`` is not a
        mapping. A field with no discoverable type is labelled ``"enum"`` when
        it has enum values (matching what ``load_definitions`` does for shared
        definitions) and ``"unknown"`` otherwise.
    """
    if not isinstance(field_def, dict):
        return {}

    result = {
        "type": field_def.get("type", ""),
        "description": field_def.get("description", ""),
        "enum_values": field_def.get("enum", []),
    }

    # Fill the gaps from $ref (string or list form, _terms.yaml or _definitions.yaml).
    resolved = resolve_ref(field_def, definitions, terms)
    result["type"] = result["type"] or resolved.get("type", "")
    result["description"] = result["description"] or resolved.get("description", "")
    result["enum_values"] = result["enum_values"] or resolved.get("enum_values", [])

    # Handle oneOf / anyOf: take the first option with a concrete (non-null) type.
    for union_key in ("oneOf", "anyOf"):
        if result["type"]:
            break
        # "or ()" tolerates a key that is present but null.
        for option in field_def.get(union_key) or ():
            if isinstance(option, dict) and option.get("type") not in (None, "null"):
                result["type"] = option.get("type", "")
                result["enum_values"] = option.get("enum", result["enum_values"])
                break

    # An enum-only schema has no "type" keyword; label it "enum" rather than
    # "unknown" so it agrees with how load_definitions classifies definitions.
    if not result["type"]:
        result["type"] = "enum" if result["enum_values"] else "unknown"
    return result
113
+
114
+
115
def build_gdc_index(clone_dir: Path = CLONE_DIR,
                    schemas_dir: Path | None = None,
                    output_path: Path = OUTPUT_PATH):
    """Clone or update the GDC dictionary repo and write a JSON field index.

    Every non-underscore ``*.yaml`` schema in ``schemas_dir`` (minus
    ``SKIP_FILES``) is read, each entry under its ``properties`` is parsed with
    ``parse_field``, and the result is stored under the key
    ``"<entity>.<field>"`` where ``<entity>`` is the schema file's stem.

    Args:
        clone_dir: Local checkout of the dictionary repository. It is pulled
            when it already exists and shallow-cloned from ``REPO_URL`` at
            ``BRANCH`` otherwise.
        schemas_dir: Directory holding the YAML schemas. When omitted it is
            derived from ``clone_dir`` so a custom checkout location is
            actually the one that gets read.
        output_path: Destination of the JSON index; parent directories are
            created as needed.

    Raises:
        subprocess.CalledProcessError: If the git command exits non-zero.
        OSError: If a schema file or the output file cannot be opened.
    """
    if schemas_dir is None:
        # Keep the schemas location relative to whichever checkout is in use.
        schemas_dir = clone_dir / SCHEMAS_DIR.relative_to(CLONE_DIR)

    # ── Clone / update repo ───────────────────────────────────────────────────
    if clone_dir.exists():
        print("Repo already cloned — pulling latest...")
        subprocess.run(["git", "-C", str(clone_dir), "pull"], check=True, capture_output=True)
    else:
        print("Cloning GDC dictionary repo (first run)...")
        subprocess.run([
            "git", "clone", "--depth=1", "--branch", BRANCH,
            REPO_URL, str(clone_dir)
        ], check=True, capture_output=True)

    # ── Build index ───────────────────────────────────────────────────────────
    print("Loading shared definition files...")
    definitions = load_definitions(schemas_dir)
    terms = load_terms(schemas_dir)

    index = {}

    for yaml_file in sorted(schemas_dir.glob("*.yaml")):
        # Underscore-prefixed files are shared helper documents, not entities.
        if yaml_file.name in SKIP_FILES or yaml_file.name.startswith("_"):
            continue

        entity = yaml_file.stem

        # Explicit UTF-8 keeps parsing independent of the platform locale.
        with open(yaml_file, encoding="utf-8") as f:
            schema = yaml.safe_load(f)

        # An empty file loads as None; skip anything that is not a mapping.
        if not isinstance(schema, dict):
            continue

        # "properties" may be absent or null; both mean "no fields".
        properties = schema.get("properties") or {}
        for field_name, field_def in properties.items():
            parsed = parse_field(field_def, definitions, terms)
            if not parsed:
                continue
            gdc_path = f"{entity}.{field_name}"
            index[gdc_path] = {
                "entity": entity,
                "field": field_name,
                **parsed
            }

    # ── Save ──────────────────────────────────────────────────────────────────
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(index, f, indent=2)

    entities = len({entry["entity"] for entry in index.values()})
    print(f"✓ Indexed {len(index)} fields across {entities} entities — saved to {output_path}")