celltype_cli-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- celltype_cli-0.1.0.dist-info/METADATA +267 -0
- celltype_cli-0.1.0.dist-info/RECORD +89 -0
- celltype_cli-0.1.0.dist-info/WHEEL +4 -0
- celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
- celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- ct/__init__.py +3 -0
- ct/agent/__init__.py +0 -0
- ct/agent/case_studies.py +426 -0
- ct/agent/config.py +523 -0
- ct/agent/doctor.py +544 -0
- ct/agent/knowledge.py +523 -0
- ct/agent/loop.py +99 -0
- ct/agent/mcp_server.py +478 -0
- ct/agent/orchestrator.py +733 -0
- ct/agent/runner.py +656 -0
- ct/agent/sandbox.py +481 -0
- ct/agent/session.py +145 -0
- ct/agent/system_prompt.py +186 -0
- ct/agent/trace_store.py +228 -0
- ct/agent/trajectory.py +169 -0
- ct/agent/types.py +182 -0
- ct/agent/workflows.py +462 -0
- ct/api/__init__.py +1 -0
- ct/api/app.py +211 -0
- ct/api/config.py +120 -0
- ct/api/engine.py +124 -0
- ct/cli.py +1448 -0
- ct/data/__init__.py +0 -0
- ct/data/compute_providers.json +59 -0
- ct/data/cro_database.json +395 -0
- ct/data/downloader.py +238 -0
- ct/data/loaders.py +252 -0
- ct/kb/__init__.py +5 -0
- ct/kb/benchmarks.py +147 -0
- ct/kb/governance.py +106 -0
- ct/kb/ingest.py +415 -0
- ct/kb/reasoning.py +129 -0
- ct/kb/schema_monitor.py +162 -0
- ct/kb/substrate.py +387 -0
- ct/models/__init__.py +0 -0
- ct/models/llm.py +370 -0
- ct/tools/__init__.py +195 -0
- ct/tools/_compound_resolver.py +297 -0
- ct/tools/biomarker.py +368 -0
- ct/tools/cellxgene.py +282 -0
- ct/tools/chemistry.py +1371 -0
- ct/tools/claude.py +390 -0
- ct/tools/clinical.py +1153 -0
- ct/tools/clue.py +249 -0
- ct/tools/code.py +1069 -0
- ct/tools/combination.py +397 -0
- ct/tools/compute.py +402 -0
- ct/tools/cro.py +413 -0
- ct/tools/data_api.py +2114 -0
- ct/tools/design.py +295 -0
- ct/tools/dna.py +575 -0
- ct/tools/experiment.py +604 -0
- ct/tools/expression.py +655 -0
- ct/tools/files.py +957 -0
- ct/tools/genomics.py +1387 -0
- ct/tools/http_client.py +146 -0
- ct/tools/imaging.py +319 -0
- ct/tools/intel.py +223 -0
- ct/tools/literature.py +743 -0
- ct/tools/network.py +422 -0
- ct/tools/notification.py +111 -0
- ct/tools/omics.py +3330 -0
- ct/tools/ops.py +1230 -0
- ct/tools/parity.py +649 -0
- ct/tools/pk.py +245 -0
- ct/tools/protein.py +678 -0
- ct/tools/regulatory.py +643 -0
- ct/tools/remote_data.py +179 -0
- ct/tools/report.py +181 -0
- ct/tools/repurposing.py +376 -0
- ct/tools/safety.py +1280 -0
- ct/tools/shell.py +178 -0
- ct/tools/singlecell.py +533 -0
- ct/tools/statistics.py +552 -0
- ct/tools/structure.py +882 -0
- ct/tools/target.py +901 -0
- ct/tools/translational.py +123 -0
- ct/tools/viability.py +218 -0
- ct/ui/__init__.py +0 -0
- ct/ui/markdown.py +31 -0
- ct/ui/status.py +258 -0
- ct/ui/suggestions.py +567 -0
- ct/ui/terminal.py +1456 -0
- ct/ui/traces.py +112 -0
ct/tools/regulatory.py
ADDED
@@ -0,0 +1,643 @@
"""
Regulatory tools for CDISC delivery quality checks.

Focused on pragmatic linting of SDTM-like tabular datasets and Define-XML files.
"""

from __future__ import annotations

import re
import xml.etree.ElementTree as ET
from pathlib import Path

import pandas as pd

from ct.tools import registry


_COLUMN_RE = re.compile(r"^[A-Z][A-Z0-9_]*$")
_ISO8601_PARTIAL_RE = re.compile(
    r"^\d{4}(-\d{2}){0,2}((T\d{2}(:\d{2}){0,2}(\.\d+)?)?(Z|[+-]\d{2}:\d{2})?)?$"
)
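For reference, a quick sketch of what the partial ISO 8601 pattern accepts; the probe values are illustrative, not from the package, and the import assumes the module is importable as ct.tools.regulatory per the RECORD listing above:

from ct.tools.regulatory import _ISO8601_PARTIAL_RE

for value in (
    "2023", "2023-06", "2023-06-15", "2023-06-15T10:30",
    "2023-06-15T10:30:00.5Z", "15-JUN-2023", "2023/06/15",
):
    print(f"{value:24} {bool(_ISO8601_PARTIAL_RE.match(value))}")
# The first five print True (year through fractional-second/timezone precision);
# the last two print False and would surface as invalid_datetime_format below.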
_DOMAIN_REQUIRED = {
    "DM": ["SUBJID", "SEX", "RFSTDTC"],
    "AE": ["AESEQ", "AETERM", "AESTDTC"],
    "LB": ["LBSEQ", "LBTEST", "LBORRES", "LBDTC"],
    "VS": ["VSSEQ", "VSTEST", "VSORRES", "VSDTC"],
    "CM": ["CMSEQ", "CMTRT", "CMSTDTC"],
    "EX": ["EXSEQ", "EXTRT", "EXSTDTC"],
    "MH": ["MHSEQ", "MHTERM"],
}


def _local_name(tag: str) -> str:
    return tag.rsplit("}", 1)[-1]


def _issue(
    issues: list[dict],
    severity: str,
    code: str,
    message: str,
    field: str | None = None,
) -> None:
    payload = {"severity": severity, "code": code, "message": message}
    if field:
        payload["field"] = field
    issues.append(payload)


def _score_quality(errors: int, warnings: int) -> int:
    return max(0, int(100 - errors * 12 - warnings * 4))
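Worked example of the scoring rule: 2 errors and 3 warnings give max(0, 100 - 2*12 - 3*4) = 64, so one error costs three times as much as a warning, and the score floors at 0 rather than going negative.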
def _read_tabular(path: Path) -> tuple[pd.DataFrame | None, str | None]:
    suffix = path.suffix.lower()
    try:
        if suffix == ".csv":
            return pd.read_csv(path), None
        if suffix in {".tsv", ".txt"}:
            return pd.read_csv(path, sep="\t"), None
        if suffix == ".parquet":
            return pd.read_parquet(path), None
        if suffix == ".xpt":
            return pd.read_sas(path, format="xport"), None
        return None, f"Unsupported dataset format '{suffix or '<none>'}'"
    except Exception as exc:
        return None, f"Failed to read dataset: {exc}"


def _infer_domain_from_path(path: Path) -> str:
    stem = path.stem.upper()
    tokens = [t for t in re.split(r"[^A-Z0-9]+", stem) if t]
    for token in tokens:
        if token in _DOMAIN_REQUIRED:
            return token
    alpha = "".join(ch for ch in stem if ch.isalpha())
    return alpha[:2] if len(alpha) >= 2 else ""
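A minimal sketch of how domain inference behaves on typical SDTM file names (the paths are hypothetical, for illustration only):

from pathlib import Path

from ct.tools.regulatory import _infer_domain_from_path

print(_infer_domain_from_path(Path("sdtm/ae.xpt")))     # AE: token matches a known domain
print(_infer_domain_from_path(Path("dm_2024.csv")))     # DM
print(_infer_domain_from_path(Path("labs_final.csv")))  # LA: fallback to the first two letters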
@registry.register(
    name="regulatory.cdisc_lint",
    description="Lint a CDISC-like tabular dataset for naming, key, required-variable, and date-format issues",
    category="regulatory",
    parameters={
        "dataset_path": "Path to SDTM/ADaM-like dataset file (.csv, .tsv, .parquet, .xpt)",
        "domain": "Optional expected domain (e.g., AE, DM, LB); auto-inferred when omitted",
        "required_columns": "Optional list of additional required column names",
        "strict": "If true, treat variable-name length violations as errors",
        "max_issues": "Maximum issues returned (default 100)",
    },
    usage_guide=(
        "Use before submission-package handoff to catch high-impact CDISC data quality issues early. "
        "Best for SDTM-like domains with STUDYID/USUBJID and --DTC fields."
    ),
)
def cdisc_lint(
    dataset_path: str,
    domain: str = "",
    required_columns: list[str] | None = None,
    strict: bool = False,
    max_issues: int = 100,
    **kwargs,
) -> dict:
    """Run pragmatic CDISC-style lint checks on a tabular dataset."""
    del kwargs

    if not dataset_path:
        return {"summary": "dataset_path is required.", "error": "missing_dataset_path"}

    path = Path(dataset_path).expanduser()
    if not path.exists():
        return {"summary": f"Dataset file not found: {path}", "error": "file_not_found"}
    if path.is_dir():
        return {"summary": f"dataset_path must be a file: {path}", "error": "path_is_directory"}

    frame, read_error = _read_tabular(path)
    if read_error:
        return {"summary": read_error, "error": "read_failed"}

    assert frame is not None  # For type checkers.

    issues: list[dict] = []
    n_rows = int(len(frame))
    n_cols = int(len(frame.columns))

    requested_domain = str(domain or "").strip().upper()
    inferred_domain = requested_domain or _infer_domain_from_path(path)

    columns = [str(c).strip() for c in frame.columns.tolist()]
    dup_cols = sorted({c for c in columns if columns.count(c) > 1})
    for col in dup_cols:
        _issue(issues, "error", "duplicate_column", f"Duplicate column name: {col}", field=col)

    for col in columns:
        if not _COLUMN_RE.match(col):
            _issue(
                issues,
                "error",
                "invalid_variable_name",
                "Variable names must be uppercase A-Z, 0-9, underscore, starting with a letter.",
                field=col,
            )
        if len(col) > 8:
            sev = "error" if strict else "warning"
            _issue(
                issues,
                sev,
                "variable_name_too_long",
                f"Variable name exceeds 8 characters ({len(col)}).",
                field=col,
            )

    required = ["STUDYID", "USUBJID"]
    if inferred_domain:
        required.append("DOMAIN")
        required.extend(_DOMAIN_REQUIRED.get(inferred_domain, []))
    if required_columns:
        required.extend([str(c).strip().upper() for c in required_columns if str(c).strip()])
    required = sorted(set(required))

    missing_required = [c for c in required if c not in columns]
    for col in missing_required:
        _issue(issues, "error", "missing_required_column", "Missing required column.", field=col)

    if "DOMAIN" in columns and inferred_domain:
        observed = sorted(
            {
                str(v).strip().upper()
                for v in frame["DOMAIN"].dropna().astype(str).tolist()
                if str(v).strip()
            }
        )
        if observed and observed != [inferred_domain]:
            _issue(
                issues,
                "error",
                "domain_mismatch",
                f"DOMAIN values {observed} do not match expected domain {inferred_domain}.",
                field="DOMAIN",
            )

    if n_rows == 0:
        _issue(issues, "warning", "empty_dataset", "Dataset has zero rows.")

    # Missingness on required columns present in data.
    for col in required:
        if col not in frame.columns:
            continue
        series = frame[col]
        missing_count = int(series.isna().sum())
        if series.dtype == object:
            missing_count += int(series.astype(str).str.strip().eq("").sum())
        if missing_count > 0:
            _issue(
                issues,
                "error",
                "required_column_missing_values",
                f"{missing_count} missing/blank values in required column.",
                field=col,
            )

    key_cols = [c for c in ["STUDYID", "USUBJID"] if c in frame.columns]
    seq_col = f"{inferred_domain}SEQ" if inferred_domain else ""
    if seq_col and seq_col in frame.columns:
        key_cols.append(seq_col)
    if len(key_cols) >= 2 and n_rows > 0:
        dup_rows = int(frame.duplicated(subset=key_cols).sum())
        if dup_rows > 0:
            _issue(
                issues,
                "error",
                "duplicate_keys",
                f"{dup_rows} duplicate rows detected for key {key_cols}.",
            )

    date_cols = [c for c in columns if c.endswith("DTC")]
    for col in date_cols:
        raw = frame[col].dropna().astype(str).str.strip()
        if len(raw) == 0:
            continue
        bad = raw[~raw.str.match(_ISO8601_PARTIAL_RE, na=False)]
        if len(bad) > 0:
            _issue(
                issues,
                "error",
                "invalid_datetime_format",
                f"{len(bad)} values are not ISO8601/partial ISO8601 compliant.",
                field=col,
            )

    n_errors = sum(1 for x in issues if x["severity"] == "error")
    n_warnings = sum(1 for x in issues if x["severity"] == "warning")
    score = _score_quality(n_errors, n_warnings)
    max_issues = max(1, int(max_issues or 100))

    summary = (
        f"CDISC lint for {path.name}: {n_errors} error(s), {n_warnings} warning(s), "
        f"quality score {score}/100."
    )

    return {
        "summary": summary,
        "dataset_path": str(path),
        "domain": inferred_domain or None,
        "n_rows": n_rows,
        "n_columns": n_cols,
        "required_columns_checked": required,
        "error_count": n_errors,
        "warning_count": n_warnings,
        "quality_score": score,
        "issues": issues[:max_issues],
    }
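A minimal end-to-end sketch of cdisc_lint; the toy AE table and /tmp path are illustrative, not from the package:

import pandas as pd

from ct.tools.regulatory import cdisc_lint

# Toy AE domain: the DOMAIN column is omitted and one AESTDTC value is
# deliberately non-ISO, so both checks should fire.
pd.DataFrame(
    {
        "STUDYID": ["S1", "S1"],
        "USUBJID": ["S1-001", "S1-002"],
        "AESEQ": [1, 1],
        "AETERM": ["HEADACHE", "NAUSEA"],
        "AESTDTC": ["2023-06-15", "15-JUN-2023"],
    }
).to_csv("/tmp/ae.csv", index=False)

report = cdisc_lint(dataset_path="/tmp/ae.csv", domain="AE")
print(report["summary"])  # expects 2 error(s): missing_required_column, invalid_datetime_format
for issue in report["issues"]:
    print(issue["severity"], issue["code"], issue.get("field"))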
def _duplicate_oids(elements: list[ET.Element]) -> set[str]:
    seen: set[str] = set()
    duplicates: set[str] = set()
    for elem in elements:
        oid = str(elem.attrib.get("OID", "")).strip()
        if not oid:
            continue
        if oid in seen:
            duplicates.add(oid)
        seen.add(oid)
    return duplicates


def _attr_by_local_name(elem: ET.Element, local_name: str) -> str:
    for key, value in elem.attrib.items():
        if _local_name(key) == local_name:
            return str(value).strip()
    return ""


@registry.register(
    name="regulatory.define_xml_lint",
    description="Lint a Define-XML file for structural integrity and common referential issues",
    category="regulatory",
    parameters={
        "define_xml_path": "Path to define.xml file",
        "strict": "If true, missing optional metadata is elevated to warning/error",
        "max_issues": "Maximum issues returned (default 100)",
    },
    usage_guide=(
        "Use to preflight Define-XML before package delivery. Checks parseability, "
        "core structure, and reference integrity (ItemRef/CodeListRef/ValueListRef)."
    ),
)
def define_xml_lint(
    define_xml_path: str,
    strict: bool = False,
    max_issues: int = 100,
    **kwargs,
) -> dict:
    """Run structural and referential lint checks for Define-XML."""
    del kwargs

    if not define_xml_path:
        return {"summary": "define_xml_path is required.", "error": "missing_define_xml_path"}

    path = Path(define_xml_path).expanduser()
    if not path.exists():
        return {"summary": f"Define-XML file not found: {path}", "error": "file_not_found"}
    if path.is_dir():
        return {"summary": f"define_xml_path must be a file: {path}", "error": "path_is_directory"}

    try:
        tree = ET.parse(path)
    except ET.ParseError as exc:
        return {"summary": f"Invalid XML: {exc}", "error": "xml_parse_error"}
    except Exception as exc:
        return {"summary": f"Failed to read Define-XML: {exc}", "error": "read_failed"}

    root = tree.getroot()
    issues: list[dict] = []

    if _local_name(root.tag) != "ODM":
        _issue(issues, "error", "root_not_odm", "Root element is not ODM.")

    studies = [x for x in root.iter() if _local_name(x.tag) == "Study"]
    mdvs = [x for x in root.iter() if _local_name(x.tag) == "MetaDataVersion"]
    item_defs = [x for x in root.iter() if _local_name(x.tag) == "ItemDef"]
    item_groups = [x for x in root.iter() if _local_name(x.tag) == "ItemGroupDef"]
    code_lists = [x for x in root.iter() if _local_name(x.tag) == "CodeList"]
    value_lists = [x for x in root.iter() if _local_name(x.tag) == "ValueListDef"]
    where_defs = [x for x in root.iter() if _local_name(x.tag) == "WhereClauseDef"]
    leaf_nodes = [x for x in root.iter() if _local_name(x.tag).lower() == "leaf"]

    if not studies:
        _issue(issues, "error", "missing_study", "No Study element found.")
    if not mdvs:
        _issue(issues, "error", "missing_metadataversion", "No MetaDataVersion element found.")
    if not item_defs:
        _issue(issues, "error", "missing_itemdefs", "No ItemDef elements found.")
    if not item_groups:
        _issue(issues, "error", "missing_itemgroups", "No ItemGroupDef elements found.")

    for oid in sorted(_duplicate_oids(item_defs)):
        _issue(issues, "error", "duplicate_itemdef_oid", f"Duplicate ItemDef OID: {oid}")
    for oid in sorted(_duplicate_oids(item_groups)):
        _issue(issues, "error", "duplicate_itemgroup_oid", f"Duplicate ItemGroupDef OID: {oid}")
    for oid in sorted(_duplicate_oids(code_lists)):
        _issue(issues, "error", "duplicate_codelist_oid", f"Duplicate CodeList OID: {oid}")

    item_oids = {str(x.attrib.get("OID", "")).strip() for x in item_defs if x.attrib.get("OID")}
    codelist_oids = {
        str(x.attrib.get("OID", "")).strip()
        for x in code_lists
        if x.attrib.get("OID")
    }
    valuelist_oids = {
        str(x.attrib.get("OID", "")).strip()
        for x in value_lists
        if x.attrib.get("OID")
    }
    where_oids = {
        str(x.attrib.get("OID", "")).strip()
        for x in where_defs
        if x.attrib.get("OID")
    }

    for item in item_defs:
        oid = str(item.attrib.get("OID", "")).strip() or "<missing>"
        if not str(item.attrib.get("Name", "")).strip():
            _issue(issues, "error", "itemdef_missing_name", "ItemDef missing Name.", field=oid)
        if not str(item.attrib.get("DataType", "")).strip():
            _issue(issues, "error", "itemdef_missing_datatype", "ItemDef missing DataType.", field=oid)

    for ref in [x for x in root.iter() if _local_name(x.tag) == "ItemRef"]:
        item_oid = str(ref.attrib.get("ItemOID", "")).strip()
        if not item_oid:
            _issue(issues, "error", "itemref_missing_itemoid", "ItemRef missing ItemOID.")
            continue
        if item_oid not in item_oids:
            _issue(
                issues,
                "error",
                "itemref_unknown_itemoid",
                f"ItemRef points to unknown ItemDef OID: {item_oid}",
            )

    for ref in [x for x in root.iter() if _local_name(x.tag) == "CodeListRef"]:
        code_oid = str(ref.attrib.get("CodeListOID", "")).strip()
        if not code_oid:
            _issue(issues, "error", "codelistref_missing_oid", "CodeListRef missing CodeListOID.")
            continue
        if code_oid not in codelist_oids:
            _issue(
                issues,
                "error",
                "codelistref_unknown_oid",
                f"CodeListRef points to unknown CodeList OID: {code_oid}",
            )

    for ref in [x for x in root.iter() if _local_name(x.tag) == "ValueListRef"]:
        value_oid = str(ref.attrib.get("ValueListOID", "")).strip()
        if not value_oid:
            _issue(issues, "error", "valuelistref_missing_oid", "ValueListRef missing ValueListOID.")
            continue
        if value_oid not in valuelist_oids:
            _issue(
                issues,
                "error",
                "valuelistref_unknown_oid",
                f"ValueListRef points to unknown ValueListDef OID: {value_oid}",
            )

    for ref in [x for x in root.iter() if _local_name(x.tag) == "WhereClauseRef"]:
        where_oid = str(ref.attrib.get("WhereClauseOID", "")).strip()
        if not where_oid:
            _issue(issues, "warning", "whereclauseref_missing_oid", "WhereClauseRef missing OID.")
            continue
        if where_oids and where_oid not in where_oids:
            _issue(
                issues,
                "error",
                "whereclauseref_unknown_oid",
                f"WhereClauseRef points to unknown WhereClauseDef OID: {where_oid}",
            )

    if not leaf_nodes and strict:
        _issue(issues, "warning", "missing_leaf", "No def:leaf nodes found for data/metadata files.")

    for leaf in leaf_nodes:
        href = _attr_by_local_name(leaf, "href")
        if not href:
            _issue(issues, "error", "leaf_missing_href", "def:leaf is missing xlink:href.")

    n_errors = sum(1 for x in issues if x["severity"] == "error")
    n_warnings = sum(1 for x in issues if x["severity"] == "warning")
    score = _score_quality(n_errors, n_warnings)
    max_issues = max(1, int(max_issues or 100))

    summary = (
        f"Define-XML lint for {path.name}: {n_errors} error(s), {n_warnings} warning(s), "
        f"quality score {score}/100."
    )

    return {
        "summary": summary,
        "define_xml_path": str(path),
        "error_count": n_errors,
        "warning_count": n_warnings,
        "quality_score": score,
        "counts": {
            "study": len(studies),
            "metadataversion": len(mdvs),
            "itemdef": len(item_defs),
            "itemgroupdef": len(item_groups),
            "codelist": len(code_lists),
            "valuelistdef": len(value_lists),
            "leaf": len(leaf_nodes),
        },
        "issues": issues[:max_issues],
    }
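A minimal usage sketch of define_xml_lint against a toy Define-XML fragment (the fragment and /tmp path are illustrative; real submissions carry far more metadata):

from pathlib import Path

from ct.tools.regulatory import define_xml_lint

Path("/tmp/define.xml").write_text(
    """<ODM xmlns="http://www.cdisc.org/ns/odm/v1.3">
  <Study OID="S1">
    <MetaDataVersion OID="MDV.1" Name="v1">
      <ItemGroupDef OID="IG.AE" Name="AE">
        <ItemRef ItemOID="IT.AE.AETERM"/>
      </ItemGroupDef>
      <ItemDef OID="IT.AE.AETERM" Name="AETERM" DataType="text"/>
    </MetaDataVersion>
  </Study>
</ODM>"""
)

report = define_xml_lint(define_xml_path="/tmp/define.xml")
print(report["summary"], report["counts"])
# This fragment parses and every ItemRef resolves, so no errors are expected.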
def _extract_leaf_hrefs(xml_path: Path) -> tuple[list[str], str | None]:
    try:
        root = ET.parse(xml_path).getroot()
    except Exception as exc:
        return [], str(exc)

    hrefs = []
    for leaf in [x for x in root.iter() if _local_name(x.tag).lower() == "leaf"]:
        href = _attr_by_local_name(leaf, "href")
        if href:
            hrefs.append(href.strip())
    return hrefs, None


@registry.register(
    name="regulatory.submission_package_check",
    description="Run cross-file submission package checks across datasets and Define-XML",
    category="regulatory",
    parameters={
        "package_dir": "Submission package root directory",
        "define_xml_path": "Optional explicit define.xml path (auto-discovered when omitted)",
        "max_datasets": "Maximum dataset files to lint (default 30)",
        "strict": "If true, tighten checks (warnings promoted where applicable)",
    },
    usage_guide=(
        "Use before external handoff to validate end-to-end package consistency: "
        "Define-XML integrity, dataset linting, and cross-file reference checks."
    ),
)
def submission_package_check(
    package_dir: str,
    define_xml_path: str = "",
    max_datasets: int = 30,
    strict: bool = False,
    **kwargs,
) -> dict:
    """Run cross-file validation for a submission package directory."""
    del kwargs
    if not package_dir:
        return {"summary": "package_dir is required.", "error": "missing_package_dir"}

    root = Path(package_dir).expanduser()
    if not root.exists():
        return {"summary": f"Package directory not found: {root}", "error": "package_not_found"}
    if not root.is_dir():
        return {"summary": f"package_dir must be a directory: {root}", "error": "not_a_directory"}

    max_datasets = max(1, min(int(max_datasets or 30), 200))
    issues: list[dict] = []

    # Resolve define.xml
    define_paths = []
    if define_xml_path:
        candidate = Path(define_xml_path).expanduser()
        if not candidate.exists():
            _issue(issues, "error", "define_xml_missing", f"define_xml_path not found: {candidate}")
        else:
            define_paths = [candidate]
    else:
        define_paths = sorted(
            [p for p in root.rglob("*.xml") if "define" in p.name.lower()],
            key=lambda p: p.name.lower(),
        )
    if not define_paths:
        _issue(issues, "warning", "define_xml_not_found", "No define.xml discovered in package directory.")

    define_results = []
    for p in define_paths[:2]:
        lint = define_xml_lint(define_xml_path=str(p), strict=strict, max_issues=200)
        define_results.append(lint)
        if "error" in lint:
            _issue(issues, "error", "define_xml_lint_failed", lint["summary"], field=str(p))

    # Dataset discovery
    dataset_paths = sorted(
        [
            p
            for p in root.rglob("*")
            if p.is_file() and p.suffix.lower() in {".xpt", ".csv", ".tsv", ".parquet"}
        ]
    )
    if not dataset_paths:
        _issue(issues, "error", "no_datasets_found", "No dataset files (*.xpt/*.csv/*.tsv/*.parquet) found.")

    dataset_lints = []
    for p in dataset_paths[:max_datasets]:
        result = cdisc_lint(
            dataset_path=str(p),
            strict=strict,
            max_issues=50,
        )
        dataset_lints.append(result)
        if result.get("error_count", 0) > 0:
            _issue(
                issues,
                "error",
                "dataset_errors",
                f"{p.name}: {result.get('error_count', 0)} error(s) from CDISC lint.",
                field=str(p),
            )

    # Cross-file leaf href resolution against package files.
    referenced_files = set()
    missing_leaf_files = []
    for d in define_results:
        define_path = d.get("define_xml_path")
        if not define_path:
            continue
        hrefs, href_error = _extract_leaf_hrefs(Path(define_path))
        if href_error:
            _issue(
                issues,
                "warning",
                "leaf_parse_failed",
                f"Could not parse leaf href values for {define_path}: {href_error}",
            )
            continue
        base_dir = Path(define_path).parent
        for href in hrefs:
            normalized = href.replace("\\", "/").strip()
            referenced_files.add(normalized)
            candidate = (base_dir / normalized).resolve()
            if not candidate.exists():
                # Fallback check relative to package root.
                candidate = (root / normalized).resolve()
                if not candidate.exists():
                    missing_leaf_files.append(normalized)

    for href in sorted(set(missing_leaf_files)):
        _issue(
            issues,
            "error",
            "missing_leaf_target",
            f"Define-XML leaf href does not resolve to a file in package: {href}",
        )

    # Orphan dataset warning: files that are present but not referenced in define.xml.
    if referenced_files:
        dataset_relpaths = {
            str(p.relative_to(root)).replace("\\", "/"): p
            for p in dataset_paths
            if p.is_file()
        }
        unreferenced = []
        ref_suffixes = {Path(x).name.lower() for x in referenced_files}
        for rel, path_obj in dataset_relpaths.items():
            if path_obj.name.lower() not in ref_suffixes:
                unreferenced.append(rel)
        for rel in sorted(unreferenced)[:20]:
            _issue(
                issues,
                "warning",
                "unreferenced_dataset",
                f"Dataset not referenced by define.xml leaf entries: {rel}",
            )

    n_errors = sum(1 for x in issues if x["severity"] == "error")
    n_warnings = sum(1 for x in issues if x["severity"] == "warning")
    readiness_score = _score_quality(n_errors, n_warnings)
    if n_errors == 0 and readiness_score >= 85:
        readiness = "ready"
    elif n_errors <= 3 and readiness_score >= 60:
        readiness = "needs_review"
    else:
        readiness = "not_ready"

    summary = (
        f"Submission package check for {root}: {readiness} "
        f"(score={readiness_score}/100, errors={n_errors}, warnings={n_warnings}). "
        f"Datasets linted: {len(dataset_lints)}/{len(dataset_paths)}."
    )

    return {
        "summary": summary,
        "package_dir": str(root),
        "readiness": readiness,
        "readiness_score": readiness_score,
        "error_count": n_errors,
        "warning_count": n_warnings,
        "define_xml_results": define_results,
        "dataset_lints": dataset_lints,
        "datasets_discovered": len(dataset_paths),
        "datasets_linted": len(dataset_lints),
        "issues": issues,
    }