omicsmeta 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
omicsmeta/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """Tools for harmonizing public omics metadata."""
2
+
3
+ from omicsmeta._version import __version__
4
+ from omicsmeta.core.harmonizer import HarmonizationResult, Harmonizer
5
+
6
# Names exported by ``from omicsmeta import *``; each one is imported above.
__all__ = ["HarmonizationResult", "Harmonizer", "__version__"]
7
+
omicsmeta/_version.py ADDED
@@ -0,0 +1,2 @@
1
# Package version string; omicsmeta/__init__.py imports it as ``omicsmeta.__version__``.
__version__ = "0.1.0"
2
+
omicsmeta/benchmark.py ADDED
@@ -0,0 +1,199 @@
1
+ """Benchmark helpers for known-answer metadata harmonization fixtures."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import csv
6
+ import json
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+
10
+ from omicsmeta.core.harmonizer import Harmonizer
11
+
12
+
13
@dataclass(frozen=True)
class BenchmarkMetrics:
    """Confusion counts and the precision/recall/F1 derived from them."""

    true_positive: int
    false_positive: int
    false_negative: int
    precision: float
    recall: float
    f1: float

    def as_dict(self) -> dict[str, object]:
        """Return the six metrics as a plain dictionary keyed by field name."""
        keys = (
            "true_positive",
            "false_positive",
            "false_negative",
            "precision",
            "recall",
            "f1",
        )
        return {key: getattr(self, key) for key in keys}
33
+
34
+
35
# One mapping as a (sample_id, field_type, ontology_id) triple; used as the set
# element when observed and expected mappings are compared.
MappingKey = tuple[str, str, str]
36
+
37
+
38
def benchmark_file(
    input_path: str | Path,
    truth_path: str | Path,
    *,
    input_type: str = "tabular",
    confidence_threshold: float = 0.70,
) -> dict[str, object]:
    """Harmonize one input file and score its accepted direct mappings against truth.

    Args:
        input_path: Metadata file handed to ``Harmonizer.from_file``.
        truth_path: Tab-separated truth file read by ``_truth_mappings``.
        input_type: Value forwarded as ``file_type`` to the harmonizer.
        confidence_threshold: Threshold used to construct the harmonizer.

    Returns:
        A dictionary with the two paths (as strings), the threshold, the
        ``overall`` metrics, a ``by_field`` breakdown, and the sizes of the
        observed and expected mapping sets.
    """

    harmonizer = Harmonizer(confidence_threshold=confidence_threshold)
    result = harmonizer.from_file(str(input_path), file_type=input_type)

    observed = _observed_mappings(result.harmonized)
    expected = _truth_mappings(truth_path)
    overall = _metrics(observed, expected)

    # Score every field type that appears on either side, in sorted order.
    by_field: dict[str, object] = {}
    for field in sorted({key[1] for key in observed | expected}):
        observed_for_field = {key for key in observed if key[1] == field}
        expected_for_field = {key for key in expected if key[1] == field}
        by_field[field] = _metrics(observed_for_field, expected_for_field).as_dict()

    return {
        "input_path": str(input_path),
        "truth_path": str(truth_path),
        "confidence_threshold": confidence_threshold,
        "overall": overall.as_dict(),
        "by_field": by_field,
        "observed_count": len(observed),
        "expected_count": len(expected),
    }
69
+
70
+
71
def benchmark_suite(
    manifest_path: str | Path,
    *,
    confidence_threshold: float = 0.70,
) -> dict[str, object]:
    """Run every case listed in a TSV manifest and pool the results.

    Each manifest row must provide ``name``, ``input_path`` and ``truth_path``;
    ``input_type`` and ``description`` are optional. Relative paths are resolved
    against the manifest's own directory.

    Args:
        manifest_path: Tab-separated manifest of benchmark cases.
        confidence_threshold: Threshold forwarded to every ``benchmark_file`` call.

    Returns:
        A dictionary with the manifest path, the threshold, the number of cases,
        pooled ``overall`` and ``by_field`` metrics (recomputed from summed
        counts), and the per-case summaries under ``cases``.
    """

    manifest = Path(manifest_path)
    base_dir = manifest.parent
    count_keys = ("true_positive", "false_positive", "false_negative")

    totals: dict[str, int] = dict.fromkeys(count_keys, 0)
    field_counts: dict[str, dict[str, int]] = {}
    cases: list[dict[str, object]] = []

    with manifest.open(newline="", encoding="utf-8") as handle:
        for row in csv.DictReader(handle, delimiter="\t"):
            name = row["name"].strip()
            input_path = _resolve_manifest_path(base_dir, row["input_path"])
            truth_path = _resolve_manifest_path(base_dir, row["truth_path"])
            summary = benchmark_file(
                input_path,
                truth_path,
                input_type=row.get("input_type", "tabular") or "tabular",
                confidence_threshold=confidence_threshold,
            )
            cases.append({"name": name, "description": row.get("description", ""), **summary})

            # Pool raw counts; ratios are recomputed once at the end.
            overall = summary["overall"]
            if isinstance(overall, dict):
                for key in count_keys:
                    totals[key] += int(overall[key])

            by_field = summary["by_field"]
            if not isinstance(by_field, dict):
                continue
            for field, metrics in by_field.items():
                if not isinstance(metrics, dict):
                    continue
                counts = field_counts.setdefault(str(field), dict.fromkeys(count_keys, 0))
                for key in count_keys:
                    counts[key] += int(metrics[key])

    overall_metrics = _metrics_from_counts(
        totals["true_positive"],
        totals["false_positive"],
        totals["false_negative"],
    )
    by_field_summary = {
        field: _metrics_from_counts(
            field_counts[field]["true_positive"],
            field_counts[field]["false_positive"],
            field_counts[field]["false_negative"],
        ).as_dict()
        for field in sorted(field_counts)
    }

    return {
        "manifest_path": str(manifest),
        "confidence_threshold": confidence_threshold,
        "case_count": len(cases),
        "overall": overall_metrics.as_dict(),
        "by_field": by_field_summary,
        "cases": cases,
    }
146
+
147
+
148
def write_benchmark_json(summary: dict[str, object], path: str | Path) -> None:
    """Serialize *summary* as indented, key-sorted JSON (newline-terminated) to *path*."""

    payload = json.dumps(summary, indent=2, sort_keys=True)
    Path(path).write_text(f"{payload}\n", encoding="utf-8")
152
+
153
+
154
+ def _observed_mappings(records: list[dict[str, object]]) -> set[MappingKey]:
155
+ return {
156
+ (str(record["sample_id"]), str(record["field_type"]), str(record["ontology_id"]))
157
+ for record in records
158
+ if record.get("accepted") and record.get("ontology_id") and record.get("backend") != "inference"
159
+ }
160
+
161
+
162
+ def _truth_mappings(path: str | Path) -> set[MappingKey]:
163
+ with Path(path).open(newline="", encoding="utf-8") as handle:
164
+ rows = csv.DictReader(handle, delimiter="\t")
165
+ return {
166
+ (row["sample_id"], row["field_type"], row["ontology_id"])
167
+ for row in rows
168
+ if row.get("sample_id") and row.get("field_type") and row.get("ontology_id")
169
+ }
170
+
171
+
172
def _metrics(observed: set[MappingKey], expected: set[MappingKey]) -> BenchmarkMetrics:
    """Score *observed* mappings against *expected* ones using set overlap."""
    hits = len(observed & expected)
    # Whatever is not shared is a false positive (observed only) or a false
    # negative (expected only).
    return _metrics_from_counts(hits, len(observed) - hits, len(expected) - hits)
177
+
178
+
179
def _metrics_from_counts(
    true_positive: int,
    false_positive: int,
    false_negative: int,
) -> BenchmarkMetrics:
    """Build a ``BenchmarkMetrics`` from raw counts.

    Precision, recall and F1 fall back to ``0.0`` whenever their denominator is
    zero. F1 is computed from the unrounded precision and recall; all three
    ratios are rounded to four decimals in the result.
    """
    predicted = true_positive + false_positive
    actual = true_positive + false_negative

    precision = true_positive / predicted if predicted else 0.0
    recall = true_positive / actual if actual else 0.0

    denominator = precision + recall
    f1 = 2 * precision * recall / denominator if denominator else 0.0

    return BenchmarkMetrics(
        true_positive=true_positive,
        false_positive=false_positive,
        false_negative=false_negative,
        precision=round(precision, 4),
        recall=round(recall, 4),
        f1=round(f1, 4),
    )
195
+
196
+
197
+ def _resolve_manifest_path(base_dir: Path, value: str) -> Path:
198
+ path = Path(value)
199
+ return path if path.is_absolute() else base_dir / path
omicsmeta/cli.py ADDED
@@ -0,0 +1,252 @@
1
+ """Command-line interface for omicsmeta."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ from pathlib import Path
7
+
8
+ from omicsmeta.core.harmonizer import HarmonizationResult, Harmonizer, merge_results
9
+ from omicsmeta.core.mapper import BuiltinMapper, Text2TermMapper, load_builtin_terms
10
+ from omicsmeta.io.writers import write_html_report, write_tabular
11
+ from omicsmeta.ontologies.resources import (
12
+ DEFAULT_CACHE_DIR,
13
+ all_resource_names,
14
+ build_ontology_index,
15
+ cached_resource_paths,
16
+ default_index_path,
17
+ download_resources,
18
+ resource_status,
19
+ )
20
+
21
# Accepted values for --input-type on the harmonize and batch sub-commands.
INPUT_TYPES = ["tabular", "geo_soft", "biosample_xml", "sra_xml"]
22
+
23
+
24
def build_parser() -> argparse.ArgumentParser:
    """Create the ``omicsmeta`` argument parser.

    Three sub-commands are registered: ``harmonize``, ``batch`` and
    ``ontologies`` (the latter with ``list``, ``download`` and ``index``
    sub-commands of its own).
    """
    resource_choices = [*all_resource_names(), "all"]
    mapper_choices = ["builtin", "text2term"]
    cache_default = str(DEFAULT_CACHE_DIR)

    parser = argparse.ArgumentParser(prog="omicsmeta")
    commands = parser.add_subparsers(dest="command", required=True)

    # --- harmonize: a single file or GEO accession -------------------------
    single = commands.add_parser("harmonize", help="harmonize metadata from a file or GEO accession")
    single.add_argument("input", nargs="?", help="input metadata file")
    single.add_argument("--input-type", choices=INPUT_TYPES, default="tabular")
    single.add_argument("--geo-accession", help="GEO accession to fetch directly, such as GSE123456")
    single.add_argument("--output", required=True, help="harmonized output TSV")
    single.add_argument("--unmapped", required=True, help="unmapped terms TSV")
    single.add_argument("--unmapped-summary-output", help="deduplicated unmapped-term review TSV")
    single.add_argument("--sample-output", help="sample-wide output TSV with one row per input sample")
    single.add_argument("--report", required=True, help="HTML QC report")
    single.add_argument("--confidence-threshold", type=float, default=0.70)
    single.add_argument("--mapper", choices=mapper_choices, default="builtin")
    single.add_argument(
        "--ontology-obo",
        action="append",
        default=[],
        help="local OBO file to load into the built-in mapper; may be repeated",
    )
    single.add_argument(
        "--no-default-terms",
        action="store_true",
        help="use only terms loaded from --ontology-obo with the built-in mapper",
    )
    single.add_argument(
        "--ontology-resource",
        action="append",
        choices=resource_choices,
        default=[],
        help="cached managed ontology resource to load into the built-in mapper; may be repeated",
    )
    single.add_argument(
        "--ontology-cache-dir",
        default=cache_default,
        help="directory containing managed ontology resources",
    )

    # --- batch: several files and/or accessions merged into one output -----
    multi = commands.add_parser("batch", help="harmonize multiple metadata files or GEO accessions")
    multi.add_argument("--input", action="append", default=[], help="input metadata file; may be repeated")
    multi.add_argument("--input-type", choices=INPUT_TYPES, default="tabular")
    multi.add_argument("--geo-accession", action="append", default=[], help="GEO accession to fetch; may be repeated")
    multi.add_argument("--output", required=True, help="combined harmonized output TSV")
    multi.add_argument("--unmapped", required=True, help="combined unmapped terms TSV")
    multi.add_argument("--unmapped-summary-output", help="deduplicated unmapped-term review TSV")
    multi.add_argument("--sample-output", help="combined sample-wide output TSV")
    multi.add_argument("--report", required=True, help="combined HTML QC report")
    multi.add_argument("--confidence-threshold", type=float, default=0.70)
    multi.add_argument("--mapper", choices=mapper_choices, default="builtin")
    multi.add_argument("--ontology-obo", action="append", default=[])
    multi.add_argument("--no-default-terms", action="store_true")
    multi.add_argument("--ontology-resource", action="append", choices=resource_choices, default=[])
    multi.add_argument("--ontology-cache-dir", default=cache_default)

    # --- ontologies: manage the local resource cache -----------------------
    ontology_parser = commands.add_parser("ontologies", help="manage local ontology resources")
    ontology_commands = ontology_parser.add_subparsers(dest="ontology_command", required=True)

    listing = ontology_commands.add_parser("list", help="list known ontology resources")
    listing.add_argument("--cache-dir", default=cache_default)

    download = ontology_commands.add_parser("download", help="download ontology resources")
    download.add_argument("resources", nargs="*", choices=resource_choices)
    download.add_argument("--cache-dir", default=cache_default)
    download.add_argument("--overwrite", action="store_true")

    index = ontology_commands.add_parser("index", help="build a SQLite synonym index")
    index.add_argument("--resource", action="append", choices=resource_choices, default=[])
    index.add_argument("--ontology-obo", action="append", default=[])
    index.add_argument("--cache-dir", default=cache_default)
    index.add_argument("--output", help="SQLite output path")

    return parser
97
+
98
+
99
def main(argv: list[str] | None = None) -> int:
    """Run the omicsmeta command-line interface.

    Args:
        argv: Argument list to parse; ``None`` is passed straight to
            ``ArgumentParser.parse_args``.

    Returns:
        ``0`` when the selected sub-command completes.

    Raises:
        SystemExit: Raised by argparse on invalid usage. This now includes a
            mapper/option combination rejected by ``_mapper``, which is reported
            as a usage error instead of an uncaught ``ValueError`` traceback.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    if args.command == "harmonize":
        if not args.input and not args.geo_accession:
            parser.error("harmonize requires an input file or --geo-accession")

        harmonizer = _harmonizer_from_args(parser, args)
        # --geo-accession wins when both it and a positional file are given.
        if args.geo_accession:
            result = harmonizer.from_geo(args.geo_accession)
        else:
            result = harmonizer.from_file(args.input, file_type=args.input_type)
        _write_result_outputs(result, args)
        return 0

    if args.command == "batch":
        if not args.input and not args.geo_accession:
            parser.error("batch requires --input or --geo-accession")

        harmonizer = _harmonizer_from_args(parser, args)
        # Files first, then accessions; each result is labelled with its source
        # (file name or upper-cased accession) before merging.
        sourced_results: list[tuple[str, HarmonizationResult]] = [
            (Path(input_path).name, harmonizer.from_file(input_path, file_type=args.input_type))
            for input_path in args.input
        ]
        sourced_results.extend(
            (str(accession).upper(), harmonizer.from_geo(accession)) for accession in args.geo_accession
        )

        _write_result_outputs(merge_results(sourced_results), args)
        return 0

    if args.command == "ontologies":
        if args.ontology_command == "list":
            _print_resource_status(args.cache_dir)
            return 0
        if args.ontology_command == "download":
            # No resource named on the command line means "download everything".
            names = _expand_resource_names(args.resources or ["all"])
            for path in download_resources(names, cache_dir=args.cache_dir, overwrite=args.overwrite):
                print(path)
            return 0
        if args.ontology_command == "index":
            names = _expand_resource_names(args.resource)
            ontology_paths = []
            if names:
                ontology_paths.extend(cached_resource_paths(names, cache_dir=args.cache_dir))
            ontology_paths.extend(Path(path) for path in args.ontology_obo)
            if not ontology_paths:
                parser.error("ontologies index requires --resource or --ontology-obo")
            output = Path(args.output) if args.output else default_index_path(args.cache_dir)
            output.parent.mkdir(parents=True, exist_ok=True)
            count = build_ontology_index(ontology_paths, output_path=output)
            print(f"Indexed {count} terms into {output}")
            return 0

    parser.error(f"Unhandled command: {args.command}")
    # parser.error() exits; the return only satisfies the declared return type.
    return 2


def _harmonizer_from_args(parser: argparse.ArgumentParser, args: argparse.Namespace) -> Harmonizer:
    """Build the Harmonizer shared by ``harmonize`` and ``batch`` from parsed options."""
    try:
        mapper = _mapper(
            args.mapper,
            args.confidence_threshold,
            ontology_paths=args.ontology_obo,
            ontology_resources=args.ontology_resource,
            ontology_cache_dir=args.ontology_cache_dir,
            include_defaults=not args.no_default_terms,
        )
    except ValueError as error:
        # _mapper rejects e.g. --mapper text2term combined with --ontology-obo;
        # surface that as a normal usage error rather than a traceback.
        parser.error(str(error))
    return Harmonizer(mapper=mapper, confidence_threshold=args.confidence_threshold)
171
+
172
+
173
def _mapper(
    name: str,
    confidence_threshold: float,
    *,
    ontology_paths: list[str],
    ontology_resources: list[str],
    ontology_cache_dir: str,
    include_defaults: bool,
) -> BuiltinMapper | Text2TermMapper:
    """Instantiate the mapper selected by ``--mapper``.

    Raises:
        ValueError: If *name* is not a known mapper, or if ``text2term`` is
            combined with any of the built-in-only ontology options.
    """
    if name == "text2term":
        builtin_only_option_used = bool(ontology_paths or ontology_resources or not include_defaults)
        if builtin_only_option_used:
            raise ValueError(
                "--ontology-obo, --ontology-resource, and --no-default-terms are only supported by --mapper builtin"
            )
        return Text2TermMapper(confidence_threshold=confidence_threshold)

    if name != "builtin":
        raise ValueError(f"Unsupported mapper: {name}")

    # Cached managed resources come first, followed by user-supplied OBO files.
    resource_names = _expand_resource_names(ontology_resources)
    resource_paths = []
    if resource_names:
        resource_paths = cached_resource_paths(resource_names, cache_dir=ontology_cache_dir)
    sources = [*resource_paths, *ontology_paths]
    terms = load_builtin_terms(sources, include_defaults=include_defaults)
    return BuiltinMapper(terms=terms, confidence_threshold=confidence_threshold)
194
+
195
+
196
def _write_result_outputs(result: HarmonizationResult, args: argparse.Namespace) -> None:
    """Write every output requested on the command line for *result*.

    The harmonized table, the unmapped table and the HTML report are always
    written; the unmapped summary and the sample table only when their output
    option is set on *args*.
    """
    write_tabular(result.harmonized, args.output)
    write_tabular(result.unmapped, args.unmapped)

    summary_path = getattr(args, "unmapped_summary_output", None)
    if summary_path:
        summary_columns = [
            "field_type",
            "normalized_term",
            "occurrence_count",
            "batch_sources",
            "sample_ids",
            "columns",
            "example_terms",
            "best_candidate_id",
            "best_candidate_label",
            "best_candidate_ontology",
            "best_candidate_confidence",
        ]
        write_tabular(result.unmapped_summary, summary_path, default_columns=summary_columns)

    sample_path = getattr(args, "sample_output", None)
    if sample_path:
        sample_columns = ["row_index", "sample_id", "batch_source"]
        write_tabular(result.sample_table, sample_path, default_columns=sample_columns)

    write_html_report(result.qc_summary, args.report)
224
+
225
+
226
+ def _expand_resource_names(names: list[str]) -> list[str]:
227
+ if not names:
228
+ return []
229
+ if "all" in names:
230
+ return all_resource_names()
231
+ return names
232
+
233
+
234
def _print_resource_status(cache_dir: str) -> None:
    """Print a tab-separated status table (header plus one line per resource)."""
    columns = ("name", "cached", "size_bytes", "path", "url", "description")
    print("name\tcached\tsize_bytes\tpath\turl\tdescription")
    for row in resource_status(cache_dir):
        cells = [str(row[column]) for column in columns]
        # The "cached" cell is emitted lower-cased.
        cells[1] = cells[1].lower()
        print("\t".join(cells))
249
+
250
+
251
# Script entry point: the integer returned by main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,26 @@
1
+ """Core harmonization components."""
2
+
3
+ from omicsmeta.core.detector import FieldDetection, detect_field, detect_fields
4
+ from omicsmeta.core.fetcher import fetch_geo_rows, fetch_geo_soft
5
+ from omicsmeta.core.harmonizer import HarmonizationResult, Harmonizer, merge_results
6
+ from omicsmeta.core.mapper import BuiltinMapper, MappingResult, load_builtin_terms
7
+ from omicsmeta.core.normalizer import normalize_text, split_terms
8
+ from omicsmeta.core.types import FieldType, OntologyTerm
9
+
10
# Public names re-exported from the core submodules; every entry corresponds to
# one of the imports above and the list is kept in sorted order.
__all__ = [
    "BuiltinMapper",
    "FieldDetection",
    "FieldType",
    "HarmonizationResult",
    "Harmonizer",
    "MappingResult",
    "OntologyTerm",
    "detect_field",
    "detect_fields",
    "fetch_geo_rows",
    "fetch_geo_soft",
    "load_builtin_terms",
    "merge_results",
    "normalize_text",
    "split_terms",
]
@@ -0,0 +1,205 @@
1
+ """Heuristic detection of metadata field types."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from dataclasses import dataclass
7
+ from collections.abc import Iterable, Mapping
8
+
9
+ from omicsmeta.core.normalizer import normalize_text
10
+ from omicsmeta.core.types import FieldType
11
+
12
+
13
@dataclass(frozen=True)
class NameHint:
    """A weighted column-name signal for field detection."""

    # Phrase looked for in the normalized column name.
    text: str
    # Score added to the field type when the phrase is found.
    weight: float
19
+
20
+
21
# Column-name hints per field type. detect_field walks each tuple in order and
# stops at the first hint found in the column name, so only one hint per field
# type contributes and longer phrases are listed before the shorter words they
# contain (e.g. "disease state" before "disease", "age at" before "age").
FIELD_NAME_HINTS: dict[FieldType, tuple[NameHint, ...]] = {
    FieldType.DISEASE: (
        NameHint("disease state", 0.82),
        NameHint("disease", 0.78),
        NameHint("diagnosis", 0.78),
        NameHint("pathology", 0.72),
        NameHint("tumor type", 0.72),
        NameHint("tumour type", 0.72),
        NameHint("cancer", 0.72),
        NameHint("condition", 0.48),
        NameHint("phenotype", 0.24),
    ),
    FieldType.TISSUE: (
        NameHint("body site", 0.82),
        NameHint("tissue", 0.78),
        NameHint("organ", 0.72),
        NameHint("anatomy", 0.72),
        NameHint("sample type", 0.42),
    ),
    FieldType.CELL_LINE: (
        NameHint("cell line", 0.84),
        NameHint("cellline", 0.84),
        NameHint("cell_line", 0.84),
        NameHint("cell", 0.35),
    ),
    FieldType.SEX: (
        NameHint("sex", 0.84),
        NameHint("gender", 0.84),
    ),
    FieldType.AGE: (
        NameHint("age at", 0.84),
        NameHint("age", 0.78),
        NameHint("developmental stage", 0.48),
    ),
    FieldType.TREATMENT: (
        NameHint("treatment", 0.84),
        NameHint("treated", 0.72),
        NameHint("drug", 0.72),
        NameHint("compound", 0.72),
        NameHint("stimulus", 0.72),
        NameHint("exposure", 0.72),
    ),
    FieldType.SPECIES: (
        NameHint("organism", 0.84),
        NameHint("species", 0.78),
        NameHint("taxon", 0.78),
        NameHint("taxid", 0.78),
    ),
}
70
+
71
# Vocabularies consulted by _matches_field when scoring a column's values.

# Whole-value matches for sex columns.
SEX_VALUES = {"male", "female", "m", "f", "man", "woman"}
# Whole-value matches for species columns (values starting with "ncbitaxon:"
# are accepted separately in _matches_field).
SPECIES_VALUES = {
    "homo sapiens",
    "human",
    "mus musculus",
    "mouse",
    "rattus norvegicus",
    "rat",
}
# Cell-line names matched against the whole value or any token of it.
KNOWN_CELL_LINES = {"a549", "hela", "hek293", "jurkat", "k562", "mcf-7", "mcf7", "u87"}
# Substrings whose presence anywhere in a value counts as disease evidence.
DISEASE_MARKERS = {
    "adenocarcinoma",
    "cancer",
    "carcinoma",
    "disease",
    "fibrosis",
    "glioblastoma",
    "leukemia",
    "lymphoma",
    "melanoma",
    "tumor",
    "tumour",
}
# Whole-value matches for tissue columns (values ending in " tissue" are
# accepted separately in _matches_field).
TISSUE_MARKERS = {
    "blood",
    "brain",
    "breast",
    "colon",
    "heart",
    "kidney",
    "liver",
    "lung",
    "spleen",
    "tissue",
}
# Substrings whose presence anywhere in a value counts as treatment evidence.
TREATMENT_MARKERS = {"treated", "untreated", "control", "vehicle", "drug", "stimulated"}
# A number with optional decimals, optionally followed by a time unit; group 2
# captures the unit and is None for a bare number. Case-insensitive.
AGE_RE = re.compile(r"^\d+(\.\d+)?\s*(day|days|week|weeks|month|months|year|years|yr|yrs|y)?$", re.I)
108
+
109
+
110
@dataclass(frozen=True)
class FieldDetection:
    """Detected semantic type for one metadata column."""

    # Winning field type, or FieldType.UNKNOWN when nothing scored high enough.
    field_type: FieldType
    # Combined name + value score as produced by detect_field (capped at 0.99
    # there; 0.0 for UNKNOWN).
    confidence: float
    # Evidence strings recorded by detect_field, of the form
    # "name:<hint>:<weight>" and "values:<score>".
    signals: tuple[str, ...] = ()
117
+
118
+
119
def detect_field(column_name: str, values: Iterable[object] = ()) -> FieldDetection:
    """Detect the semantic role of a metadata column.

    The score of each field type is the weight of the first name hint found in
    the column name plus a value-based score from ``_value_score``. The
    best-scoring type wins if it reaches 0.35.

    Name hints must match whole words: a hint only counts when it is not
    embedded in a longer alphanumeric run (an optional plural "s" is allowed).
    Plain substring matching classified columns such as "stage", "passage",
    "dosage" or "stage at diagnosis" as AGE because they contain "age".
    Column names that concatenate words without a separator (e.g.
    "tissuetype") consequently no longer match by name.

    Args:
        column_name: Raw column header.
        values: Cell values of the column; ``None`` and blank cells are ignored.

    Returns:
        The detection with its confidence (capped at 0.99) and the signals that
        produced it, or an UNKNOWN detection with confidence 0.0.
    """

    normalized_name = normalize_text(column_name, expand_abbreviations=False)
    # Skip None explicitly: str(None) is the non-blank text "None" and would
    # otherwise be counted as a real value.
    value_list = [
        normalize_text(value, expand_abbreviations=False)
        for value in values
        if value is not None and str(value).strip()
    ]

    scores: dict[FieldType, float] = {field_type: 0.0 for field_type in FieldType}
    signals: dict[FieldType, list[str]] = {field_type: [] for field_type in FieldType}

    for field_type, hints in FIELD_NAME_HINTS.items():
        for hint in hints:
            hint_norm = normalize_text(hint.text, expand_abbreviations=False)
            if hint_norm and _name_contains_hint(normalized_name, hint_norm):
                scores[field_type] += hint.weight
                signals[field_type].append(f"name:{hint.text}:{hint.weight:.2f}")
                # Only the first (most specific) hint per field type counts.
                break

    for field_type in (
        FieldType.DISEASE,
        FieldType.TISSUE,
        FieldType.CELL_LINE,
        FieldType.SEX,
        FieldType.AGE,
        FieldType.TREATMENT,
        FieldType.SPECIES,
    ):
        score = _value_score(field_type, value_list)
        if score:
            scores[field_type] += score
            signals[field_type].append(f"values:{score:.2f}")

    best_type = max(scores, key=scores.get)
    best_score = scores[best_type]
    if best_type == FieldType.UNKNOWN or best_score < 0.35:
        return FieldDetection(FieldType.UNKNOWN, 0.0, ())

    return FieldDetection(best_type, min(best_score, 0.99), tuple(signals[best_type]))


def _name_contains_hint(normalized_name: str, hint: str) -> bool:
    """Return True when *hint* occurs in *normalized_name* on word boundaries."""
    # [^\W_] is "letter or digit"; the lookarounds reject a hint that is glued
    # to further letters/digits on either side, while "_" and spaces count as
    # separators. The optional "s" keeps simple plurals matching.
    pattern = rf"(?<![^\W_]){re.escape(hint)}s?(?![^\W_])"
    return re.search(pattern, normalized_name) is not None
156
+
157
+
158
def detect_fields(rows: Iterable[Mapping[str, object]]) -> dict[str, FieldDetection]:
    """Run ``detect_field`` on every column of a row-oriented table.

    Columns are reported in order of first appearance across the rows; a row
    that lacks a column contributes an empty string for it.
    """

    row_list = list(rows)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    ordered_columns = dict.fromkeys(column for row in row_list for column in row)
    return {
        column: detect_field(column, (row.get(column, "") for row in row_list))
        for column in ordered_columns
    }
174
+
175
+
176
def _value_score(field_type: FieldType, values: list[str]) -> float:
    """Score how strongly the first 50 values look like *field_type*.

    Returns 0.0 when there are no values or none match; otherwise
    ``0.20 + 0.55 * matching_fraction``, capped at 0.70.
    """
    sample = values[:50]
    if not sample:
        return 0.0

    matches = sum(_matches_field(field_type, value) for value in sample)
    if not matches:
        return 0.0

    return min(0.70, 0.20 + (matches / len(sample)) * 0.55)
187
+
188
+
189
def _matches_field(field_type: FieldType, value: str) -> bool:
    """Return True when a single normalized *value* looks like *field_type*.

    Args:
        field_type: Field type being tested.
        value: One cell value, already normalized by the caller.

    Returns:
        True if the value matches the vocabulary or pattern for the field
        type; False otherwise, including for field types with no value rule.
    """
    if field_type == FieldType.SEX:
        return value in SEX_VALUES
    if field_type == FieldType.SPECIES:
        return value in SPECIES_VALUES or value.startswith("ncbitaxon:")
    if field_type == FieldType.CELL_LINE:
        # Hyphens stay inside tokens so names like "mcf-7" survive the split.
        tokens = set(re.split(r"[^a-z0-9-]+", value))
        return value in KNOWN_CELL_LINES or bool(tokens & KNOWN_CELL_LINES)
    if field_type == FieldType.AGE:
        # Require the time unit (group 2 of AGE_RE). A bare number is not
        # evidence of age by itself: without this check every purely numeric
        # column (counts, batch numbers, IDs) scored as AGE from values alone.
        match = AGE_RE.match(value)
        return bool(match and match.group(2))
    if field_type == FieldType.DISEASE:
        return any(marker in value for marker in DISEASE_MARKERS)
    if field_type == FieldType.TISSUE:
        return value in TISSUE_MARKERS or value.endswith(" tissue")
    if field_type == FieldType.TREATMENT:
        return any(marker in value for marker in TREATMENT_MARKERS)
    return False