olmsted-cli 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@
1
+ """Olmsted CLI - Command-line interface for Olmsted data processing."""
2
+
3
+ # Re-export the package version, the main API class, and commonly used types.
4
+ from .api import OlmstedData
5
+ from .types import (
6
+ OlmstedClone,
7
+ OlmstedDataset,
8
+ OlmstedNode,
9
+ OlmstedOutput,
10
+ OlmstedTree,
11
+ )
12
+ from .version import __version__
13
+
14
# Names exported by ``from olmsted_cli import *``: the package version,
# the high-level API class, and the commonly used data types.
__all__ = [
    "__version__",
    "OlmstedData",
    "OlmstedNode", "OlmstedTree", "OlmstedClone",
    "OlmstedDataset", "OlmstedOutput",
]
olmsted_cli/api.py ADDED
@@ -0,0 +1,590 @@
1
+ """
2
+ High-level API for working with Olmsted data.
3
+
4
+ This module provides the OlmstedData class, which is the main interface for
5
+ loading, manipulating, and saving Olmsted data in various formats.
6
+
7
+ Usage:
8
+ from olmsted_cli.api import OlmstedData
9
+
10
+ # Load from different formats
11
+ data = OlmstedData.from_olmsted_json("output.json")
12
+ data = OlmstedData.from_pcp("pcp.csv", "trees.csv")
13
+ data = OlmstedData.from_airr_json("airr_data.json")
14
+
15
+ # Access typed data
16
+ for clone in data.clones["dataset-1"]:
17
+ print(clone["clone_id"])
18
+
19
+ # Save to different formats
20
+ data.to_olmsted_json("output.json")
21
+ data.to_pcp("pcp.csv", "trees.csv")
22
+ data.to_airr_json("airr_output.json")
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import csv
28
+ import gzip
29
+ import json
30
+ from argparse import Namespace
31
+ from pathlib import Path
32
+ from typing import Dict, List, Optional, Union
33
+
34
+ from .data_io import read_airr_json, read_olmsted_json
35
+ from .identifier import IdentMinter
36
+ from .types import (
37
+ OlmstedClone,
38
+ OlmstedDataset,
39
+ OlmstedOutput,
40
+ OlmstedTree,
41
+ OutputMetadata,
42
+ )
43
+
44
+
45
class OlmstedData:
    """
    Main class for working with Olmsted data.

    Provides methods for loading from and saving to different formats:
    - Olmsted JSON (native format)
    - AIRR JSON
    - PCP CSV + Trees CSV

    Attributes:
        datasets: List of dataset metadata
        clones: Dictionary mapping dataset_id to list of clones
        trees: List of phylogenetic trees
        metadata: Optional output metadata

    Example:
        # Load from PCP format
        data = OlmstedData.from_pcp("pcp.csv", "trees.csv")

        # Access data with type hints
        for dataset_id, clone_list in data.clones.items():
            for clone in clone_list:
                print(clone["clone_id"], clone["mean_mut_freq"])

        # Save to Olmsted JSON
        data.to_olmsted_json("output.json")
    """

    def __init__(
        self,
        datasets: "List[OlmstedDataset]",
        clones: "Dict[str, List[OlmstedClone]]",
        trees: "List[OlmstedTree]",
        metadata: "Optional[OutputMetadata]" = None,
    ):
        """
        Store the given containers on the instance.

        The arguments are kept by reference; no copy or validation is done.

        Args:
            datasets: List of dataset metadata
            clones: Dictionary mapping dataset_id to list of clones
            trees: List of phylogenetic trees
            metadata: Optional output metadata
        """
        self.datasets = datasets
        self.clones = clones
        self.trees = trees
        self.metadata = metadata

    # -------------------------------------------------------------------------
    # Factory methods for loading data
    # -------------------------------------------------------------------------

    @classmethod
    def from_olmsted_json(
        cls,
        filepath: Union[str, Path],
    ) -> "OlmstedData":
        """
        Load data from Olmsted JSON format.

        Missing top-level keys fall back to empty containers (``metadata``
        falls back to ``None``).

        Args:
            filepath: Path to consolidated Olmsted JSON file

        Returns:
            OlmstedData instance

        Example:
            data = OlmstedData.from_olmsted_json("output.json")
        """
        raw_data = read_olmsted_json(filepath)

        return cls(
            datasets=raw_data.get("datasets", []),
            clones=raw_data.get("clones", {}),
            trees=raw_data.get("trees", []),
            metadata=raw_data.get("metadata"),
        )

    @classmethod
    def from_pcp(
        cls,
        pcp_csv: Union[str, Path],
        trees_csv: Optional[Union[str, Path]] = None,
        *,
        compute_metrics: bool = False,
        seed: Optional[int] = None,
        name: Optional[str] = None,
        verbosity: int = 1,
    ) -> "OlmstedData":
        """
        Load data from PCP CSV format.

        Args:
            pcp_csv: Path to PCP CSV file (can be gzipped)
            trees_csv: Optional path to trees CSV file (can be gzipped)
            compute_metrics: Whether to compute LBI/LBR metrics
            seed: Random seed for deterministic UUID generation
            name: Optional dataset name
            verbosity: Verbosity level (0=quiet, 1=normal, 2=verbose, 3=debug)

        Returns:
            OlmstedData instance (``metadata`` is left as ``None``)

        Example:
            data = OlmstedData.from_pcp("pcp.csv", "trees.csv")
            data = OlmstedData.from_pcp("paired.csv.gz", "trees.csv.gz")
        """
        # Imported lazily so the PCP pipeline is only loaded when it is used.
        from .process_pcp_data import (
            parse_newick_csv,
            parse_pcp_csv,
            process_pcp_to_olmsted,
        )

        # Parse input files
        pcp_families = parse_pcp_csv(str(pcp_csv))
        newick_trees = parse_newick_csv(str(trees_csv)) if trees_csv else None

        # Process to Olmsted format
        datasets, clones, trees = process_pcp_to_olmsted(
            pcp_families,
            newick_trees,
            minter=IdentMinter(seed=seed),
            compute_metrics=compute_metrics,
            name=name,
            verbosity=verbosity,
        )

        return cls(datasets=datasets, clones=clones, trees=trees)

    @classmethod
    def from_airr_json(
        cls,
        filepath: Union[str, Path],
        *,
        seed: Optional[int] = None,
        name: Optional[str] = None,
        verbosity: int = 1,
    ) -> "OlmstedData":
        """
        Load data from AIRR JSON format.

        Args:
            filepath: Path to AIRR JSON file
            seed: Random seed for deterministic UUID generation
            name: Optional dataset name
            verbosity: Verbosity level (any value above 0 enables verbose
                processing)

        Returns:
            OlmstedData instance (``metadata`` is left as ``None``)

        Example:
            data = OlmstedData.from_airr_json("airr_data.json")

        Note:
            For full AIRR processing options, use the CLI:
            `olmsted process -f airr -i input.json -o output.json`
        """
        # Imported lazily so the AIRR pipeline is only loaded when it is used.
        from .process_airr_data import process_dataset

        # Load AIRR JSON
        airr_data = read_airr_json(filepath)

        # process_dataset takes CLI-style arguments, so build a Namespace
        # carrying the options it is given here.
        args = Namespace(
            minter=IdentMinter(seed=seed),
            verbose=verbosity > 0,
            name=name,
        )

        # Process datasets; clones_dict and trees are passed to
        # process_dataset for every dataset and returned on the instance.
        datasets: List[OlmstedDataset] = []
        clones_dict: Dict[str, List[OlmstedClone]] = {}
        trees: List[OlmstedTree] = []

        # AIRR format has datasets at the top level; a document without a
        # "datasets" key is treated as a single dataset.
        for dataset in airr_data.get("datasets", [airr_data]):
            processed_dataset = process_dataset(args, dataset, clones_dict, trees)
            if processed_dataset:
                datasets.append(processed_dataset)

        return cls(datasets=datasets, clones=clones_dict, trees=trees)

    # -------------------------------------------------------------------------
    # Private helpers shared by the save methods
    # -------------------------------------------------------------------------

    @staticmethod
    def _open_text(path: Union[str, Path], *, newline: Optional[str] = None):
        """Open ``path`` for UTF-8 text writing, gzip-compressed if it ends in .gz."""
        path = Path(path)
        opener = gzip.open if path.suffix == ".gz" else open
        return opener(path, "wt", encoding="utf-8", newline=newline)

    @classmethod
    def _write_csv(cls, path: Union[str, Path], columns: List[str], rows: List[dict]) -> None:
        """Write ``rows`` as a CSV file with a header made of ``columns``."""
        # newline="" lets the csv module control line endings itself.
        with cls._open_text(path, newline="") as f:
            writer = csv.DictWriter(f, fieldnames=columns)
            writer.writeheader()
            writer.writerows(rows)

    def _clones_by_id(self) -> dict:
        """Index clones by ``clone_id``; the first clone seen for an ID wins."""
        index: dict = {}
        for clone_list in self.clones.values():
            for clone in clone_list:
                index.setdefault(clone.get("clone_id"), clone)
        return index

    # -------------------------------------------------------------------------
    # Methods for saving data
    # -------------------------------------------------------------------------

    def to_olmsted_json(
        self,
        filepath: Union[str, Path],
        *,
        indent: Optional[int] = 2,
        include_metadata: bool = True,
    ) -> None:
        """
        Save data to Olmsted JSON format.

        Args:
            filepath: Output file path (use .gz extension for compression)
            indent: JSON indentation (None for compact output)
            include_metadata: Whether to include metadata in output

        Example:
            data.to_olmsted_json("output.json")
            data.to_olmsted_json("output.json.gz")  # Compressed
        """
        output = self.to_dict()
        if not include_metadata:
            # to_dict() only adds the key when metadata is set.
            output.pop("metadata", None)

        with self._open_text(filepath) as f:
            json.dump(output, f, indent=indent)

    def to_dict(self) -> "OlmstedOutput":
        """
        Convert to dictionary representation.

        The returned dictionary references the instance's own containers
        (no copies are made). The "metadata" key is present only when
        metadata is set and non-empty.

        Returns:
            OlmstedOutput dictionary
        """
        result: OlmstedOutput = {
            "datasets": self.datasets,
            "clones": self.clones,
            "trees": self.trees,
        }
        if self.metadata:
            result["metadata"] = self.metadata
        return result

    def to_pcp(
        self,
        pcp_csv: Union[str, Path],
        trees_csv: Optional[Union[str, Path]] = None,
        *,
        include_light_chain: bool = True,
    ) -> None:
        """
        Export data to PCP CSV format.

        One PCP row is written per tree node that has a parent; nodes whose
        "parent" is None (the root) are skipped. The sample ID of a row comes
        from the first clone whose clone_id matches the tree's clone_id.

        Args:
            pcp_csv: Output path for PCP CSV file (use .gz for compression)
            trees_csv: Optional output path for trees CSV file
            include_light_chain: Whether to include light chain columns for paired data

        Example:
            data.to_pcp("output_pcp.csv", "output_trees.csv")
            data.to_pcp("output.csv.gz", "trees.csv.gz")  # Compressed
        """
        is_paired = self.is_paired and include_light_chain

        # Build the clone lookup once instead of rescanning every clone for
        # every tree.
        clones_by_id = self._clones_by_id()

        pcp_columns = [
            "sample_id",
            "family",
            "parent_name",
            "child_name",
            "parent_heavy",
            "child_heavy",
            "branch_length",
            "distance",
            "parent_is_naive",
            "child_is_leaf",
        ]
        if is_paired:
            pcp_columns.extend(["parent_light", "child_light"])

        # Build PCP rows from trees
        pcp_rows = []
        for tree in self.trees:
            clone_id = tree.get("clone_id", "")
            sample_id = clones_by_id.get(clone_id, {}).get("sample_id", "")

            # Handle both list and dict node formats
            nodes = tree.get("nodes", [])
            nodes_list = list(nodes.values()) if isinstance(nodes, dict) else nodes
            nodes_by_id = {n.get("sequence_id", ""): n for n in nodes_list}

            for node in nodes_list:
                parent_id = node.get("parent")
                if parent_id is None:
                    continue  # Skip root node (no parent)

                # A parent that is not among the tree's nodes yields empty
                # parent fields rather than an error.
                parent_node = nodes_by_id.get(parent_id, {})

                row = {
                    "sample_id": sample_id,
                    "family": clone_id,
                    "parent_name": parent_id,
                    "child_name": node.get("sequence_id", ""),
                    "parent_heavy": parent_node.get("sequence_alignment", ""),
                    "child_heavy": node.get("sequence_alignment", ""),
                    "branch_length": node.get("length", 0.0),
                    "distance": node.get("distance", 0.0),
                    "parent_is_naive": str(parent_node.get("type") == "root"),
                    "child_is_leaf": str(node.get("type") == "leaf"),
                }

                # Add light chain columns for paired data
                if is_paired:
                    row["parent_light"] = parent_node.get("sequence_alignment_light", "")
                    row["child_light"] = node.get("sequence_alignment_light", "")

                pcp_rows.append(row)

        self._write_csv(pcp_csv, pcp_columns, pcp_rows)

        # Write trees CSV if requested
        if not trees_csv:
            return

        if is_paired:
            tree_columns = [
                "sample_id",
                "family",
                "rate_scale_heavy",
                "rate_scale_light",
                "newick",
            ]
        else:
            tree_columns = ["sample_id", "family", "newick"]

        tree_rows = []
        for tree in self.trees:
            clone_id = tree.get("clone_id", "")
            clone = clones_by_id.get(clone_id, {})

            row = {
                "sample_id": clone.get("sample_id", ""),
                "family": clone_id,
                "newick": tree.get("newick", ""),
            }
            if is_paired:
                # Rate scaling defaults to 1.0 when the clone does not carry it.
                row["rate_scale_heavy"] = clone.get("rate_scale_heavy", 1.0)
                row["rate_scale_light"] = clone.get("rate_scale_light", 1.0)

            tree_rows.append(row)

        self._write_csv(trees_csv, tree_columns, tree_rows)

    def to_airr_json(
        self,
        filepath: Union[str, Path],
        *,
        indent: Optional[int] = 2,
    ) -> None:
        """
        Export data to AIRR-compatible JSON format.

        This exports data in a format compatible with AIRR standards,
        suitable for use with other AIRR-compliant tools.

        Args:
            filepath: Output file path (use .gz extension for compression)
            indent: JSON indentation (None for compact output)

        Example:
            data.to_airr_json("airr_output.json")

        Note:
            This produces a simplified AIRR-compatible format. For full
            AIRR compliance, additional metadata may be required.
        """
        # Build AIRR-compatible structure; sections this export does not
        # populate are written as empty lists.
        airr_output = {
            "Info": {
                "title": "Olmsted Export",
                "version": "1.0",
                "description": "Data exported from Olmsted",
            },
            "DataProcessing": [],
            "Repertoire": [],
            "GermlineSet": [],
            "Clone": [],
            "Tree": [],
        }

        # Convert clones to AIRR Clone format
        for dataset_id, clone_list in self.clones.items():
            for clone in clone_list:
                airr_clone = {
                    "clone_id": clone.get("clone_id"),
                    "repertoire_id": dataset_id,
                    "data_processing_id": None,
                    "sequences": clone.get("unique_seqs_count", 0),
                    "v_call": clone.get("v_call"),
                    "d_call": clone.get("d_call"),
                    "j_call": clone.get("j_call"),
                    "junction": clone.get("cdr3_sequence"),
                    "junction_aa": None,
                    "junction_length": clone.get("junction_length"),
                }

                # Add trees inline if present
                clone_trees = clone.get("trees", [])
                if clone_trees:
                    airr_clone["trees"] = [
                        {
                            "tree_id": tree.get("tree_id"),
                            "clone_id": clone.get("clone_id"),
                            "newick": tree.get("newick"),
                        }
                        for tree in clone_trees
                    ]

                airr_output["Clone"].append(airr_clone)

        # Add standalone trees
        for tree in self.trees:
            airr_output["Tree"].append(
                {
                    "tree_id": tree.get("tree_id"),
                    "clone_id": tree.get("clone_id"),
                    "newick": tree.get("newick"),
                    "nodes": tree.get("nodes", []),
                }
            )

        # Write output
        with self._open_text(filepath) as f:
            json.dump(airr_output, f, indent=indent)

    # -------------------------------------------------------------------------
    # Convenience properties
    # -------------------------------------------------------------------------

    @property
    def dataset_ids(self) -> List[str]:
        """Get list of all dataset IDs (datasets without one are skipped)."""
        return [d["dataset_id"] for d in self.datasets if "dataset_id" in d]

    @property
    def clone_count(self) -> int:
        """Get total number of clones across all datasets."""
        return sum(len(clone_list) for clone_list in self.clones.values())

    @property
    def tree_count(self) -> int:
        """Get total number of trees."""
        return len(self.trees)

    @property
    def is_paired(self) -> bool:
        """Check if data contains paired heavy/light chain information."""
        # Paired as soon as any clone carries a truthy "is_paired" flag.
        return any(
            clone.get("is_paired")
            for clone_list in self.clones.values()
            for clone in clone_list
        )

    def get_clones(self, dataset_id: Optional[str] = None) -> "List[OlmstedClone]":
        """
        Get clones, optionally filtered by dataset.

        Args:
            dataset_id: Optional dataset ID to filter by. None or an empty
                string means "no filter".

        Returns:
            List of clones. With a filter this is the stored list for that
            dataset (or a new empty list if the dataset is unknown); without
            one it is a new flattened list over all datasets.
        """
        if dataset_id:
            return self.clones.get(dataset_id, [])
        # Return all clones flattened
        return [clone for clone_list in self.clones.values() for clone in clone_list]

    def get_trees(self, clone_id: Optional[str] = None) -> "List[OlmstedTree]":
        """
        Get trees, optionally filtered by clone.

        Args:
            clone_id: Optional clone ID to filter by. None or an empty
                string means "no filter".

        Returns:
            List of trees. With a filter this is a new list; without one it
            is the stored list itself.
        """
        if clone_id:
            return [t for t in self.trees if t.get("clone_id") == clone_id]
        return self.trees

    def __repr__(self) -> str:
        """Summarise dataset, clone and tree counts, flagging paired data."""
        paired_str = " (paired)" if self.is_paired else ""
        return (
            f"OlmstedData("
            f"datasets={len(self.datasets)}, "
            f"clones={self.clone_count}, "
            f"trees={self.tree_count}"
            f"{paired_str})"
        )
588
+
589
+
590
# Only the high-level API class is exported from this module.
__all__ = [
    "OlmstedData",
]