chebilp 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
chebILP/__init__.py ADDED
File without changes
chebILP/__main__.py ADDED
@@ -0,0 +1,3 @@
1
# Package entry script: imports the CLI dispatcher from chebILP.cli and runs it
# immediately (this file is chebILP/__main__.py).
from chebILP.cli import main

main()
chebILP/cli.py ADDED
@@ -0,0 +1,607 @@
1
+ import os
2
+ from typing import Literal
3
+ import typing
4
+ import json
5
+ import time
6
+ import argparse
7
+
8
+ from chebILP.ilp_problem_builder import ILPProblemBuilder, AVAILABLE_PREDICATE_SETS
9
+ from chebILP.learn_fgs import FGILPProblemBuilder
10
+ from chebILP.ilp_classifier import run_ilp_training_subprocess
11
+ from chebILP.ilp_path_manager import get_exs_path, get_bk_path, get_bias_path
12
+ from chebILP.utils import tee_output
13
+
14
+ from chebILP.ilp_classifier import learn_chebi_classes
15
+
16
+
17
+
18
+ # ── Helpers ──────────────────────────────────────────────────────────────────
19
+
20
def _load_classes(labels_file: str) -> list[str]:
    """Read class labels from ``labels_file``, one label per line.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are dropped.
    """
    labels: list[str] = []
    with open(labels_file, "r") as handle:
        for raw in handle:
            label = raw.strip()
            if label:
                labels.append(label)
    return labels
23
+
24
+
25
def _make_ilp_builder(args) -> ILPProblemBuilder:
    """Create the problem builder described by ``args``.

    ``args`` is either a ``dict`` or an attribute-style object such as an
    ``argparse.Namespace``. ``fg_mode`` and ``predicate_set`` are mandatory in
    both forms; every other setting is optional.

    The search limits (``max_vars``, ``max_body``, ``max_clauses``) fall back
    to 6 / 8 / 2 when absent. The ``build_samples`` and ``build_bk``
    subcommands do not define these options, so reading them unconditionally
    from the namespace raised ``AttributeError``.

    Returns an ``FGILPProblemBuilder`` when ``fg_mode`` is truthy, otherwise
    an ``ILPProblemBuilder``.
    """
    if isinstance(args, dict):
        fg_mode = args["fg_mode"]
        chebi_split = args.get("chebi_split")
        predicate_set = args["predicate_set"]
        # Dict values are coerced with int(), e.g. when they come from text.
        max_vars = int(args.get("max_vars", 6))
        max_body = int(args.get("max_body", 8))
        max_clauses = int(args.get("max_clauses", 2))
        chebi_graph_path = args.get("chebi_graph_path")
        molecules_path = args.get("molecules_path")
    else:
        fg_mode = args.fg_mode
        chebi_split = getattr(args, "chebi_split", None)
        predicate_set = args.predicate_set
        # Same defaults as the dict branch; values that are present are
        # passed through unchanged.
        max_vars = getattr(args, "max_vars", 6)
        max_body = getattr(args, "max_body", 8)
        max_clauses = getattr(args, "max_clauses", 2)
        chebi_graph_path = getattr(args, "chebi_graph_path", None)
        molecules_path = getattr(args, "molecules_path", None)

    if fg_mode:
        return FGILPProblemBuilder(
            chebi_split=chebi_split,
            chebi_graph_path=chebi_graph_path,
            molecules_path=molecules_path,
            dataset_path=os.path.join("data", "chebi_fgs_dataset.pkl"),
            predicate_set=predicate_set,
            max_vars=max_vars,
            max_body=max_body,
            max_clauses=max_clauses,
        )
    return ILPProblemBuilder(
        chebi_split=chebi_split,
        chebi_graph_path=chebi_graph_path,
        molecules_path=molecules_path,
        muggleton=False,
        predicate_set=predicate_set,
        max_vars=max_vars,
        max_body=max_body,
        max_clauses=max_clauses,
    )
66
+
67
+
68
def _make_results_dir(fg_mode: bool) -> str:
    """Create a timestamped run directory under ``data/results`` and return it.

    The directory is named ``run_fgs_<timestamp>`` when ``fg_mode`` is truthy
    and ``run_<timestamp>`` otherwise. An empty ``results.json`` is written
    inside it.
    """
    stamp = time.strftime("%Y%m%d_%H%M%S")
    if fg_mode:
        run_name = f"run_fgs_{stamp}"
    else:
        run_name = f"run_{stamp}"
    results_dir = os.path.join("data", "results", run_name)
    os.makedirs(results_dir, exist_ok=True)

    # Opening with "w+" truncates, so results.json always starts out empty.
    results_file = os.path.join(results_dir, "results.json")
    with open(results_file, "w+") as handle:
        handle.write("")
    return results_dir
75
+
76
+
77
+ # ── Subcommand handlers ─────────────────────────────────────────────────────
78
+
79
+
80
def _handle_build_samples(args):
    """Handle ``build_samples``: build example files for the listed classes.

    Labels come from ``args.labels_file``; the four sample bounds are
    forwarded unchanged to the builder's ``build_examples``.
    """
    class_ids = _load_classes(args.labels_file)
    builder = _make_ilp_builder(args)
    sample_bounds = {
        "min_pos_samples": args.min_pos_samples,
        "max_pos_samples": args.max_pos_samples,
        "min_neg_samples": args.min_neg_samples,
        "max_neg_samples": args.max_neg_samples,
    }
    builder.build_examples(class_ids, **sample_bounds)
84
+
85
+
86
def _handle_build_bk(args):
    """Handle ``build_bk``: call the builder's ``build_bk`` for the listed classes."""
    class_ids = _load_classes(args.labels_file)
    _make_ilp_builder(args).build_bk(class_ids)
90
+
91
+
92
def _handle_learn(args):
    """Handle ``learn``: run ILP learning for the classes in ``args.labels_file``.

    Creates a fresh results directory, records every parsed argument in
    ``config.yml`` and calls ``learn_chebi_classes`` inside ``tee_output``
    pointed at ``run.log``.
    """
    class_ids = _load_classes(args.labels_file)
    results_dir = _make_results_dir(args.fg_mode)
    log_path = os.path.join(results_dir, "run.log")

    # Record the arguments of this run; the ``test`` subcommand reads them back.
    config_path = os.path.join(results_dir, "config.yml")
    with open(config_path, "w+") as config_file:
        config_file.write("args:\n")
        for name, value in vars(args).items():
            config_file.write(f" {name}: {value}\n")

    with tee_output(log_path):
        learner_options = {
            "timeout": args.timeout,
            "selection_mode": args.selection_mode,
            "selection_k": args.selection_k,
            "max_vars": args.max_vars,
            "max_body": args.max_body,
            "max_clauses": args.max_clauses,
            "mdl_weight_fn": args.mdl_weight_fn,
            "mdl_weight_fp": args.mdl_weight_fp,
            "mdl_weight_size": args.mdl_weight_size,
        }
        learn_chebi_classes(
            class_ids,
            # ``learn`` defines no --problem_dir option, so this is None
            # unless the namespace was built some other way.
            getattr(args, "problem_dir", None),
            args.predicate_set,
            results_dir,
            **learner_options,
        )
119
+
120
+
121
def _handle_select_predicates(args):
    """Handle ``select_predicates``: select predicates for the listed ChEBI IDs.

    IDs are read as integers, one per non-blank line of ``args.labels_file``.
    A summary of how many results are not ``None`` is printed at the end.
    """
    from chebILP.select_predicates import select_predicates_for_classes

    chebi_ids = []
    with open(args.labels_file, "r") as handle:
        for raw in handle:
            token = raw.strip()
            if token:
                chebi_ids.append(int(token))

    print(f"Processing {len(chebi_ids)} ChEBI classes...")
    results = select_predicates_for_classes(
        chebi_ids=chebi_ids,
        chebi_version=args.chebi_version,
        problem_dir=args.problem_dir,
        predicate_set=args.predicate_set,
        selection_mode=args.selection_mode,
        selection_k=args.selection_k,
    )

    # A ``None`` value is counted as a class that was not processed successfully.
    successful = len([outcome for outcome in results.values() if outcome is not None])
    print(f"\nCompleted: {successful}/{len(chebi_ids)} classes processed successfully")
138
+
139
+
140
def _load_label_stats(path: str) -> list[str]:
    """Load the label list from a ``.txt`` or CSV file.

    Blank lines are ignored. A path ending in ``.txt`` yields the stripped
    lines as they are. Any other path is treated as CSV: the first non-blank
    line is skipped as a header and the first comma-separated field of each
    remaining line is returned.
    """
    with open(path) as handle:
        stripped = (raw.strip() for raw in handle)
        rows = [row for row in stripped if row]

    if not path.endswith(".txt"):
        # CSV: drop the header row and keep only the class-ID column.
        rows = [row.partition(",")[0] for row in rows[1:]]
    return rows
147
+
148
+
149
+
150
def _handle_build_ilp_preds_for_ensemble(args):
    """Build a full ILP predictions tensor for a given split from a run's results.json.

    Steps:
      1. Load the run's results with ``load_ilp_results`` and keep the entries
         that have a truthy ``"program"``.
      2. Collect the molecule IDs whose split column in ``args.chebi_split``
         equals ``args.predict_on``.
      3. Load the molecules pickle (``args.molecules_path`` or
         ``data/chebi_v<version>/molecules.pkl``).
      4. Call ``build_ilp_preds_tensor`` with output paths
         ``full_<prefix>_preds.npy`` and ``full_<prefix>_preds_metadata.json``
         inside ``args.run_dir``, where the prefix is ``val`` for the
         validation split and the split name otherwise.
    """
    import pandas as pd
    from chebILP.test import build_ilp_preds_tensor
    from chebILP.ensemble_eval import load_ilp_results

    results = load_ilp_results(args.run_dir)
    programs = {cid: entry["program"] for cid, entry in results.items() if entry.get("program")}
    print(f"Loaded {len(programs)} ILP programs from {args.run_dir}")

    mol_ids = []
    with open(args.chebi_split) as f:
        next(f, None)  # first line is the CSV header
        for line in f:
            row = line.strip()
            if not row:
                # A blank line (e.g. trailing newline) would otherwise break
                # the two-value unpack below.
                continue
            mol_id, split = row.split(",")
            if split == args.predict_on:
                mol_ids.append(mol_id)
    print(f"Building predictions for {len(mol_ids)} '{args.predict_on}' molecules")

    molecules_pkl = getattr(args, "molecules_path", None) or os.path.join(
        "data", f"chebi_v{args.chebi_version}", "molecules.pkl"
    )
    print(f"Loading molecules from {molecules_pkl}...")
    # NOTE(review): unpickling executes arbitrary code; only load trusted files.
    molecules_df = pd.read_pickle(molecules_pkl)

    prefix = "val" if args.predict_on == "validation" else args.predict_on
    output_npy = os.path.join(args.run_dir, f"full_{prefix}_preds.npy")
    output_meta = os.path.join(args.run_dir, f"full_{prefix}_preds_metadata.json")

    build_ilp_preds_tensor(programs, molecules_df, mol_ids, output_npy, output_meta)
179
+
180
+
181
def _handle_ensemble_construct(args):
    """Perform model selection and generate the ILP predictions tensor.

    Builds an ``EnsembleConstructor`` from the DL validation predictions, the
    ILP validation run directories (keyed by directory base name), the label
    list and the pickled ChEBI graph. It then writes:

      * ``<output>_trusted_models.csv`` with one ``chebi_id,model`` row per
        entry of ``constructor.trusted_model``;
      * ``<output>_ilp_preds.npy`` and ``<output>_ilp_preds_metadata.json``
        via ``constructor.slice_ilp_preds`` for the molecules whose split
        equals ``args.predict_on``.
    """
    import pickle as _pickle
    from chebILP.ensemble_eval import EnsembleConstructor, load_dl_preds

    label_stats = _load_label_stats(args.label_stats)
    print(f"Label stats: {len(label_stats)} labels")

    dl_val_preds = load_dl_preds(args.dl_val_preds_npy, args.dl_val_preds_meta)
    print(f"DL val predictions: {dl_val_preds.shape[0]} molecules x {dl_val_preds.shape[1]} classes")

    # Strip trailing separators first so "runs/a/" is keyed as "a", not "".
    ilp_val_run_dirs = {os.path.basename(p.rstrip("/\\")): p for p in args.ilp_val_runs}
    data_dir = os.path.join("data", f"chebi_v{args.chebi_version}")
    # NOTE(review): unpickling executes arbitrary code; only load trusted files.
    with open(os.path.join(data_dir, "chebi_graph.pkl"), "rb") as _f:
        chebi_graph = _pickle.load(_f)

    constructor = EnsembleConstructor(
        ilp_val_runs=ilp_val_run_dirs,
        dl_val_preds=dl_val_preds,
        label_stats=label_stats,
        chebi_graph=chebi_graph,
        predict_on=args.predict_on,
    )

    output_base = args.output
    # dirname is "" for a bare file name; fall back to the current directory.
    os.makedirs(os.path.dirname(output_base) or ".", exist_ok=True)

    trusted_path = output_base + "_trusted_models.csv"
    with open(trusted_path, "w") as f:
        f.write("chebi_id,model\n")
        for chebi_id, model in constructor.trusted_model.items():
            f.write(f"{chebi_id},{model}\n")
    print(f"Saved trusted models: {trusted_path}")

    mol_ids = []
    with open(args.chebi_split) as f:
        next(f, None)  # first line is the CSV header
        for line in f:
            row = line.strip()
            if not row:
                # A blank line (e.g. trailing newline) would otherwise break
                # the two-value unpack below.
                continue
            mol_id, split = row.split(",")
            if split == args.predict_on:
                mol_ids.append(mol_id)
    print(f"Slicing ILP tensor for {len(mol_ids)} '{args.predict_on}' molecules...")

    constructor.slice_ilp_preds(
        mol_order=mol_ids,
        output_npy_path=output_base + "_ilp_preds.npy",
        output_meta_path=output_base + "_ilp_preds_metadata.json",
    )
228
+
229
+
230
def _handle_ensemble_aggregate(args):
    """Aggregate pre-computed DL and ILP prediction tensors into ensemble predictions.

    ``args.trusted_models`` is read in one of two formats:

      * header exactly ``chebi_id,model``: each row maps a class ID to the
        list of model names found in its remaining comma-separated fields
        (quotes and square brackets are removed);
      * anything else: parsed with pandas as a weights table indexed by
        ``chebi_id`` whose remaining columns are converted to ``float``.

    The predictions returned by ``EnsembleAggregator.predict_set`` are saved
    as a float32 ``.npy`` file. A ``_metadata.json`` file holding the row
    index (``mol_order``) and column labels (``class_labels``) is written
    next to it.
    """
    import json as _json
    import pickle as _pickle

    import numpy as np
    import pandas as pd

    from chebILP.ensemble_eval import EnsembleAggregator, load_dl_preds, load_ilp_preds

    label_stats = _load_label_stats(args.label_stats)

    dl_preds = load_dl_preds(args.dl_preds_npy, args.dl_preds_meta)
    print(f"DL predictions: {dl_preds.shape[0]} molecules x {dl_preds.shape[1]} labels")

    ilp_preds = load_ilp_preds(args.ilp_preds_npy, args.ilp_preds_meta)
    print(f"ILP predictions: {ilp_preds.shape[0]} molecules x {ilp_preds.shape[1]} classes")

    with open(args.trusted_models) as f:
        lines = [line.strip().split(",") for line in f if line.strip()]
    # ``lines and`` keeps an empty file from failing with IndexError here; it
    # then falls through to pandas, which reports the empty input itself.
    if lines and lines[0] == ["chebi_id", "model"]:
        # The model column may hold a Python list repr such as "['a', 'b']",
        # which the comma split spreads over several fields; delete the
        # quote/bracket characters from each field and trim it.
        strip_list_syntax = str.maketrans("", "", "'[]")
        trusted_model = {
            line[0]: [model.translate(strip_list_syntax).strip() for model in line[1:]]
            for line in lines[1:]
        }
        model_weights_dict = None
        print(f"Loaded trusted models: {len(trusted_model)} classes")
    else:
        tm_df = pd.read_csv(args.trusted_models, dtype=str)
        trusted_model = None
        tm_df = tm_df.set_index("chebi_id")
        model_weights_dict = {
            cls_id: {col: float(val) for col, val in row.items()}
            for cls_id, row in tm_df.iterrows()
        }
        print(f"Loaded model weights: {len(model_weights_dict)} classes")

    data_dir = os.path.join("data", f"chebi_v{args.chebi_version}")
    # NOTE(review): unpickling executes arbitrary code; only load trusted files.
    with open(os.path.join(data_dir, "chebi_graph.pkl"), "rb") as _f:
        chebi_graph = _pickle.load(_f)

    aggregator = EnsembleAggregator(
        dl_preds=dl_preds,
        ilp_preds=ilp_preds,
        label_stats=label_stats,
        chebi_graph=chebi_graph,
        trusted_model=trusted_model,
        model_weights=model_weights_dict,
    )

    mol_ids = dl_preds.index.tolist()
    print(f"Predicting on {len(mol_ids)} molecules...")

    predictions_df = aggregator.predict_set(mol_ids)

    arr = predictions_df.to_numpy().astype("float32")
    meta = {"mol_order": list(predictions_df.index), "class_labels": list(predictions_df.columns)}

    suffix = ".npy"
    npy_path = args.output if args.output.endswith(suffix) else args.output + suffix
    # Swap only the trailing suffix: str.replace would also rewrite a ".npy"
    # that appears earlier in the path (e.g. in a directory name).
    meta_path = npy_path[: -len(suffix)] + "_metadata.json"

    # dirname is "" for a bare file name; fall back to the current directory.
    os.makedirs(os.path.dirname(npy_path) or ".", exist_ok=True)
    np.save(npy_path, arr)
    with open(meta_path, "w") as f:
        _json.dump(meta, f, indent=2)

    n_pos = int(arr.sum())
    print(f"Saved predictions: {npy_path} (shape {arr.shape}, {n_pos} positive assignments)")
    print(f"Saved metadata: {meta_path}")
292
+
293
+
294
def _handle_test(args):
    """Handle ``test``: evaluate the programs of a previous run on a data split.

    Reads ``config.yml`` from ``args.run_to_evaluate`` (``key: value`` lines
    as written by ``learn``), creates a new timestamped results directory
    under ``data/results_<test_on>``, records this invocation's arguments
    there and calls ``test_chebi_classes`` inside ``tee_output`` pointed at
    ``run.log``.

    ``problem_dir`` is optional in the config. The ``learn`` subcommand has no
    such option and never writes that key, so requiring it rejected every
    config produced by this CLI. A missing key or the literal ``None`` is
    passed on as ``None``.
    NOTE(review): assumes ``test_chebi_classes`` accepts ``None`` for
    ``problem_dir`` just as ``learn_chebi_classes`` is called with it; confirm.

    Raises:
        ValueError: if ``config.yml`` lacks ``predicate_set``,
            ``selection_mode``, ``selection_k`` or ``fg_mode``.
    """
    from chebILP.test import test_chebi_classes

    # load config from the run to evaluate
    config = {}
    with open(os.path.join(args.run_to_evaluate, "config.yml"), "r") as f:
        for line in f:
            # partition never raises, unlike a two-value unpack of split() on
            # a line such as "key: " with an empty value.
            key, sep, value = line.strip().partition(": ")
            if sep:
                config[key] = value

    required_keys = ("predicate_set", "selection_mode", "selection_k", "fg_mode")
    missing = [key for key in required_keys if key not in config]
    if missing:
        # Explicit raise instead of assert: asserts are stripped under -O.
        raise ValueError(
            "Config file must contain predicate_set, selection_mode, selection_k, "
            f"and fg_mode (missing: {', '.join(missing)})"
        )

    # Values were written with str(), so booleans and None come back as text.
    config["fg_mode"] = config["fg_mode"] == "True"
    config["selection_mode"] = config["selection_mode"] if config["selection_mode"] != "None" else None
    config["selection_k"] = int(config["selection_k"]) if config["selection_k"] != "None" else None
    problem_dir = config.get("problem_dir")
    if problem_dir == "None":
        problem_dir = None

    timestamp = time.strftime("%Y%m%d_%H%M%S")
    results_dir = os.path.join(
        "data",
        f"results_{args.test_on}",
        f"run_fgs_{timestamp}" if config["fg_mode"] else f"run_{timestamp}",
    )
    os.makedirs(results_dir, exist_ok=True)
    with open(os.path.join(results_dir, "results.json"), "w+") as f:
        f.write("")

    log_path = os.path.join(results_dir, "run.log")

    with open(os.path.join(results_dir, "config.yml"), "w+") as f:
        f.write("args:\n")
        for arg in vars(args):
            f.write(f" {arg}: {getattr(args, arg)}\n")

    print(f"Config for test run saved to {os.path.join(results_dir, 'config.yml')}")

    with tee_output(log_path):
        test_chebi_classes(
            args.run_to_evaluate,
            problem_dir,
            config["predicate_set"],
            results_dir,
            selection_mode=config["selection_mode"],
            selection_k=config["selection_k"],
            test_on=args.test_on,
            verbose=args.verbose,
        )
327
+
328
+
329
+ # ── Argument parsing ─────────────────────────────────────────────────────────
330
+
331
+
332
def _add_common_args(parser: argparse.ArgumentParser):
    """Add arguments shared by all subcommands that build an ILPProblemBuilder.

    Registers ``--labels_file``, ``--chebi_split``, ``--fg_mode``,
    ``--chebi_graph_path``, ``--molecules_path`` and ``--predicate_set`` on
    ``parser``.
    """
    parser.add_argument("--labels_file", type=str, required=True, help="Path to the labels file (one ChEBI ID per line).")
    parser.add_argument("--chebi_split", type=str, required=True, help="Path to the ChEBI split file (mol_id,split CSV).")
    parser.add_argument("--fg_mode", action="store_true", help="Learn functional groups instead of ChEBI classes.")
    parser.add_argument("--chebi_graph_path", type=str, required=True,
                        help="Path to chebi_graph.pkl.")
    # The default must be None, not True: argparse does not run non-string
    # defaults through ``type``, so ``True`` reached the builder as the
    # molecules path whenever the option was omitted.
    parser.add_argument("--molecules_path", type=str, default=None,
                        help="Path to molecules.pkl.")
    parser.add_argument("--predicate_set", type=str, default="atoms", choices=typing.get_args(AVAILABLE_PREDICATE_SETS), help="Which predicate set to use for background knowledge.")
342
+
343
+
344
def _handle_prepare_dataset(args):
    """Handle ``prepare_dataset``: forward the parsed options to ``ChEBIDataset.prepare``.

    ``three_star_only`` is the negation of the ``--include_two_star`` flag;
    every other option is passed through unchanged.
    """
    from chebILP.data_preparation import ChEBIDataset

    options = {
        "chebi_version": args.chebi_version,
        "three_star_only": not args.include_two_star,
        "data_dir": args.data_dir,
        "min_pos_samples": args.min_pos_samples,
        "val_ratio": args.val_ratio,
        "test_ratio": args.test_ratio,
        "seed": args.seed,
        "labels_path": args.labels_path,
        "splits_path": args.splits_path,
    }
    ChEBIDataset.prepare(**options)
357
+
358
+
359
def build_parser() -> argparse.ArgumentParser:
    """Assemble the top-level CLI parser with one sub-parser per command.

    The chosen command name is stored in ``command`` and each sub-parser sets
    ``func`` to its handler, so the caller can dispatch with
    ``args.func(args)``. A subcommand is mandatory.
    """
    parser = argparse.ArgumentParser(
        description="ILP classification CLI for ChEBI classes using Popper.",
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    # Registration order is kept stable because it is the order shown in --help.
    for register in (
        _register_prepare_dataset,
        _register_build_samples,
        _register_build_bk,
        _register_learn,
        _register_select_predicates,
        _register_test,
        _register_build_ilp_preds_for_ensemble,
        _register_ensemble_construct,
        _register_ensemble_aggregate,
        _register_explain,
        _register_rule_to_nl,
    ):
        register(subparsers)

    return parser


def _register_prepare_dataset(subparsers):
    """Register the ``prepare_dataset`` subcommand."""
    cmd = subparsers.add_parser(
        "prepare_dataset",
        help="Download ChEBI data and build all dataset artefacts "
             "(graph cache, molecule cache, labels.txt, splits.csv).",
    )
    cmd.add_argument(
        "--chebi_version", type=int, required=True,
        help="ChEBI ontology version (e.g. 248).",
    )
    cmd.add_argument(
        "--include_two_star", action="store_true",
        help="Include classes with 2-star or 3-star annotation status (default: Only 3-star classes).",
    )
    cmd.add_argument(
        "--data_dir", type=str, default=None,
        help="Root directory for raw and cached files "
             "(default: data/chebi_v{version}).",
    )
    cmd.add_argument(
        "--min_pos_samples", type=int, default=50,
        help="Minimum descendant molecules per label class (default: 50).",
    )
    cmd.add_argument(
        "--val_ratio", type=float, default=0.1,
        help="Fraction of molecules for validation (default: 0.1).",
    )
    cmd.add_argument(
        "--test_ratio", type=float, default=0.1,
        help="Fraction of molecules for test (default: 0.1).",
    )
    cmd.add_argument(
        "--seed", type=int, default=42,
        help="Random seed for splits (default: 42).",
    )
    cmd.add_argument(
        "--labels_path", type=str, default=None,
        help="Output path for labels.txt "
             "(default: {data_dir}/min{n}/labels.txt).",
    )
    cmd.add_argument(
        "--splits_path", type=str, default=None,
        help="Output path for splits.csv "
             "(default: {data_dir}/min{n}/splits.csv).",
    )
    cmd.set_defaults(func=_handle_prepare_dataset)


def _register_build_samples(subparsers):
    """Register the ``build_samples`` subcommand."""
    cmd = subparsers.add_parser(
        "build_samples",
        help="Build positive/negative example files (exs.pl) for the given ChEBI classes.",
    )
    _add_common_args(cmd)
    cmd.add_argument("--min_pos_samples", type=int, default=25, help="Minimum positive samples per class.")
    cmd.add_argument("--max_pos_samples", type=int, default=200, help="Maximum positive samples per class.")
    cmd.add_argument("--min_neg_samples", type=int, default=25, help="Minimum negative samples per class.")
    cmd.add_argument("--max_neg_samples", type=int, default=200, help="Maximum negative samples per class.")
    cmd.set_defaults(func=_handle_build_samples)


def _register_build_bk(subparsers):
    """Register the ``build_bk`` subcommand."""
    cmd = subparsers.add_parser(
        "build_bk",
        help="Build background knowledge (bk.pl) and bias template (bias.pl) for the given ChEBI classes.",
    )
    _add_common_args(cmd)
    cmd.set_defaults(func=_handle_build_bk)


def _register_learn(subparsers):
    """Register the ``learn`` subcommand."""
    cmd = subparsers.add_parser(
        "learn",
        help="Run ILP learning (training + validation) for the given ChEBI classes.",
    )
    cmd.add_argument("--labels_file", type=str, required=True, help="Path to the labels file (one ChEBI ID per line).")
    cmd.add_argument("--timeout", type=int, default=20, help="Timeout for ILP solver in seconds.")
    cmd.add_argument(
        "--predicate_set", type=str, default="atoms",
        choices=typing.get_args(AVAILABLE_PREDICATE_SETS),
        help="Which predicate set to use.",
    )
    cmd.add_argument("--fg_mode", action="store_true", help="Learn functional groups instead of ChEBI classes.")
    cmd.add_argument("--max_pos_samples", type=int, default=200, help="Maximum positive samples per class.")
    cmd.add_argument("--max_neg_samples", type=int, default=200, help="Maximum negative samples per class.")
    cmd.add_argument(
        "--selection_mode", type=str, default=None,
        choices=["claude", "random", "top_k"],
        help="Mode for selecting body predicates in bias file.",
    )
    cmd.add_argument(
        "--selection_k", type=int, default=10,
        help="Number of predicates selection with selection_mode (required if selection_mode is set).",
    )
    cmd.add_argument("--max_vars", type=int, default=6, help="Maximum number of variables in learned rules.")
    cmd.add_argument("--max_body", type=int, default=8, help="Maximum number of body literals in learned rules.")
    cmd.add_argument("--max_clauses", type=int, default=2, help="Maximum number of clauses in the learned program.")
    cmd.add_argument("--mdl_weight_fn", type=int, default=1, help="Weight β for false negatives in MDL cost (default: 1).")
    cmd.add_argument("--mdl_weight_fp", type=int, default=1, help="Weight γ for false positives in MDL cost (default: 1).")
    cmd.add_argument("--mdl_weight_size", type=int, default=1, help="Weight α for program size in MDL cost (default: 1).")
    cmd.set_defaults(func=_handle_learn)


def _register_select_predicates(subparsers):
    """Register the ``select_predicates`` subcommand."""
    cmd = subparsers.add_parser(
        "select_predicates",
        help="Select predicates for ChEBI classes (via Claude, random, or top-k frequency).",
    )
    cmd.add_argument("--labels_file", type=str, required=True, help="Path to file with ChEBI IDs (one per line).")
    cmd.add_argument("--chebi_version", type=int, default=248, help="ChEBI version to use.")
    cmd.add_argument("--problem_dir", type=str, default=None, help="Base directory for ILP problems.")
    cmd.add_argument(
        "--predicate_set", type=str, default="atoms",
        choices=typing.get_args(AVAILABLE_PREDICATE_SETS),
        help="Which predicate set to use.",
    )
    cmd.add_argument(
        "--selection_mode", type=str, default="claude",
        choices=["claude", "random", "top_k"],
        help="How to select predicates.",
    )
    cmd.add_argument("--selection_k", type=int, default=10, help="Number of predicates to select.")
    cmd.set_defaults(func=_handle_select_predicates)


def _register_test(subparsers):
    """Register the ``test`` subcommand."""
    cmd = subparsers.add_parser(
        "test",
        help="Evaluate learned programs on the test set using results from a previous run.",
    )
    cmd.add_argument(
        "--run_to_evaluate", type=str, required=True,
        help="Path to a previous run directory (must contain results.json and config.yml).",
    )
    cmd.add_argument(
        "--test_on", type=str, default="test",
        choices=["validation", "test"],
        help="Split to evaluate on: 'test' (default) or 'validation' (validation).",
    )
    cmd.add_argument(
        "--verbose", action="store_true",
        help="Log classification result for up to 10 positive and negative samples per class.",
    )
    cmd.set_defaults(func=_handle_test)


def _register_build_ilp_preds_for_ensemble(subparsers):
    """Register the ``build_ilp_preds_for_ensemble`` subcommand."""
    cmd = subparsers.add_parser(
        "build_ilp_preds_for_ensemble",
        help="Build a full ILP predictions tensor for a split using programs from a run's results.json. "
             "Builds background knowledge from scratch (not from bk.pl files). "
             "Output is saved as full_val_preds.npy / full_test_preds.npy inside the run directory.",
    )
    cmd.add_argument(
        "--run_dir", type=str, required=True,
        help="Run directory containing results.json (output of 'learn' or 'test').",
    )
    cmd.add_argument(
        "--predict_on", type=str, default="validation",
        choices=["validation", "test"],
        help="Split to build predictions for (default: validation).",
    )
    cmd.add_argument(
        "--chebi_split", type=str, required=True,
        help="Path to the splits CSV (mol_id,split).",
    )
    cmd.add_argument(
        "--chebi_version", type=int, default=248,
        help="ChEBI version; used to derive default molecules_path.",
    )
    cmd.add_argument(
        "--molecules_path", type=str, default=None,
        help="Path to molecules.pkl (default: data/chebi_v{version}/molecules.pkl).",
    )
    cmd.set_defaults(func=_handle_build_ilp_preds_for_ensemble)


def _register_ensemble_construct(subparsers):
    """Register the ``ensemble_construct`` subcommand."""
    cmd = subparsers.add_parser(
        "ensemble_construct",
        help="(EXP-006) Perform model selection and generate the ILP predictions tensor.",
    )
    cmd.add_argument("--chebi_version", type=int, default=248)
    cmd.add_argument(
        "--chebi_split", type=str, required=True,
        help="Path to the splits CSV (mol_id,split); used to obtain the molecule list for the ILP tensor.",
    )
    cmd.add_argument(
        "--predict_on", type=str, default="validation",
        choices=["validation", "test"],
        help="Which split to build the ILP predictions tensor for (default: validation).",
    )
    cmd.add_argument(
        "--dl_val_preds_npy", type=str, required=True,
        help="DL validation predictions .npy (mol × class float scores).",
    )
    cmd.add_argument(
        "--dl_val_preds_meta", type=str, required=True,
        help="Metadata JSON for --dl_val_preds_npy.",
    )
    cmd.add_argument(
        "--ilp_val_runs", type=str, nargs="+", default=[],
        help="Validation-run directories (output of 'test --test_on validation'). "
             "Each must contain full_val_preds.npy + full_val_preds_metadata.json "
             "and results.json (for ILP programs).",
    )
    cmd.add_argument(
        "--label_stats", type=str,
        default=os.path.join("data", "chebi_v248", "ChEBI25_3_STAR", "processed", "class_stats.csv"),
        help="Class statistics CSV (label list + has_negatives flag).",
    )
    cmd.add_argument(
        "--output", type=str,
        default=os.path.join("data", "ensemble_predictions", "ensemble_f1"),
        help="Base output path. Suffixes _trusted_models.csv, _ilp_preds.npy, _ilp_preds_metadata.json are appended.",
    )
    cmd.set_defaults(func=_handle_ensemble_construct)


def _register_ensemble_aggregate(subparsers):
    """Register the ``ensemble_aggregate`` subcommand."""
    cmd = subparsers.add_parser(
        "ensemble_aggregate",
        help="(EXP-006) Aggregate pre-computed DL and ILP tensors into ensemble predictions.",
    )
    cmd.add_argument("--chebi_version", type=int, default=248)
    cmd.add_argument(
        "--dl_preds_npy", type=str, required=True,
        help="DL predictions .npy for the target split.",
    )
    cmd.add_argument(
        "--dl_preds_meta", type=str, required=True,
        help="Metadata JSON for --dl_preds_npy.",
    )
    cmd.add_argument(
        "--ilp_preds_npy", type=str, required=True,
        help="ILP predictions tensor .npy (from ensemble_construct).",
    )
    cmd.add_argument(
        "--ilp_preds_meta", type=str, required=True,
        help="Metadata JSON for --ilp_preds_npy.",
    )
    cmd.add_argument(
        "--trusted_models", type=str, required=True,
        help="Trusted models CSV (_trusted_models.csv) or model weights CSV (_model_weights.csv) from ensemble_construct.",
    )
    cmd.add_argument(
        "--label_stats", type=str,
        default=os.path.join("data", "chebi_v248", "ChEBI25_3_STAR", "processed", "class_stats.csv"),
        help="Class statistics CSV (label list + has_negatives flag).",
    )
    cmd.add_argument(
        "--output", type=str,
        default=os.path.join("data", "ensemble_predictions", "ensemble_predictions.npy"),
        help="Output .npy path; a matching _metadata.json is written alongside.",
    )
    cmd.set_defaults(func=_handle_ensemble_aggregate)


def _register_explain(subparsers):
    """Register the ``explain`` subcommand."""
    cmd = subparsers.add_parser(
        "explain",
        help="Explain why a molecule satisfies a learned ILP rule using xclingo.",
    )
    # Exactly one molecule source and exactly one rule source must be given.
    molecule_source = cmd.add_mutually_exclusive_group(required=True)
    molecule_source.add_argument("--smiles", type=str, help="SMILES string of the molecule to explain.")
    molecule_source.add_argument("--smiles_file", type=str, help="File containing a single SMILES string.")
    rule_source = cmd.add_mutually_exclusive_group(required=True)
    rule_source.add_argument("--rule", type=str, help="ILP rule clause(s) as a string.")
    rule_source.add_argument("--rule_file", type=str, help="File containing ILP rule clause(s).")
    cmd.add_argument(
        "--label_parents_json", type=str,
        default=os.path.join("data", "class_parents.json"),
        help="JSON file mapping class labels to their parent labels (for hierarchical explanations).",
    )
    cmd.add_argument(
        "--output", type=str, default=None,
        help="Path to save the molecule visualization image (PNG).",
    )
    cmd.add_argument(
        "--verbose", "-v", action="store_true",
        help="Print the assembled xclingo program before running.",
    )
    cmd.set_defaults(func=_handle_explain)


def _register_rule_to_nl(subparsers):
    """Register the ``rule_to_nl`` subcommand."""
    cmd = subparsers.add_parser(
        "rule_to_nl",
        help="Translate a learned ILP rule to a natural language description.",
    )
    rule_source = cmd.add_mutually_exclusive_group(required=True)
    rule_source.add_argument("--rule", type=str, help="ILP rule clause(s) as a string.")
    rule_source.add_argument("--rule_file", type=str, help="File containing ILP rule clause(s).")
    cmd.add_argument(
        "--class_parents", type=str,
        default=os.path.join("data", "class_parents.json"),
        help="Path to class_parents.json for name/parent lookup (default: data/class_parents.json).",
    )
    cmd.set_defaults(func=_handle_rule_to_nl)
560
+
561
+
562
def _handle_rule_to_nl(args):
    """Handle ``rule_to_nl``: print the natural-language translation of a rule.

    The rule text comes from ``args.rule`` or, when that is ``None``, from the
    file ``args.rule_file``. The class-parents mapping is loaded only if
    ``args.class_parents`` exists; otherwise ``None`` is passed on.
    """
    from chebILP.rule_to_nl import translate_rule, load_class_parents

    if args.rule is not None:
        rule_text = args.rule
    else:
        with open(args.rule_file, "r") as handle:
            rule_text = handle.read()

    if os.path.exists(args.class_parents):
        parents = load_class_parents(args.class_parents)
    else:
        parents = None

    print(translate_rule(rule_text, class_parents=parents))
575
+
576
+
577
def _handle_explain(args):
    """Handle ``explain``: print the explanation text for a molecule and rule.

    The SMILES string and the rule are each taken from the direct option when
    given, otherwise read from the corresponding file (the SMILES file content
    is stripped, the rule file content is used as read). Only the second
    element of the triple returned by ``explain_molecule`` is printed.
    """
    from chebILP.explain import explain_molecule

    if args.smiles is not None:
        smiles_text = args.smiles
    else:
        with open(args.smiles_file, "r") as handle:
            smiles_text = handle.read().strip()

    if args.rule is not None:
        rule_text = args.rule
    else:
        with open(args.rule_file, "r") as handle:
            rule_text = handle.read()

    _satisfied, explanation, _extra = explain_molecule(
        smiles=smiles_text,
        rule=rule_text,
        label_parents_json=args.label_parents_json,
        output_path=args.output,
        verbose=args.verbose,
    )

    print(explanation)
599
+
600
def main():
    """Parse the command line and run the handler of the chosen subcommand."""
    namespace = build_parser().parse_args()
    # Every sub-parser stores its handler under ``func`` via set_defaults.
    handler = namespace.func
    handler(namespace)


if __name__ == "__main__":
    main()