smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +7 -6
- smftools/_version.py +1 -1
- smftools/cli/cli_flows.py +94 -0
- smftools/cli/hmm_adata.py +338 -0
- smftools/cli/load_adata.py +577 -0
- smftools/cli/preprocess_adata.py +363 -0
- smftools/cli/spatial_adata.py +564 -0
- smftools/cli_entry.py +435 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +38 -0
- smftools/config/deaminase.yaml +61 -0
- smftools/config/default.yaml +264 -0
- smftools/config/direct.yaml +41 -0
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +1288 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/{tools → hmm}/display_hmm.py +3 -3
- smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
- smftools/{tools → hmm}/train_hmm.py +1 -1
- smftools/informatics/__init__.py +13 -9
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
- smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +812 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/binarize_converted_base_identities.py +172 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/{tools → machine_learning}/models/positional.py +3 -2
- smftools/{tools → machine_learning}/models/rnn.py +2 -1
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/plotting/__init__.py +4 -1
- smftools/plotting/autocorrelation_plotting.py +609 -0
- smftools/plotting/general_plotting.py +1292 -140
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +15 -8
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +2 -2
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_coverage.py +10 -1
- smftools/preprocessing/calculate_position_Youden.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +17 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1326 -124
- smftools/preprocessing/invert_adata.py +12 -5
- smftools/preprocessing/load_sample_sheet.py +19 -4
- smftools/readwrite.py +1021 -89
- smftools/tools/__init__.py +3 -32
- smftools/tools/calculate_umap.py +5 -5
- smftools/tools/general_tools.py +3 -3
- smftools/tools/position_stats.py +468 -106
- smftools/tools/read_stats.py +115 -1
- smftools/tools/spatial_autocorrelation.py +562 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
- smftools-0.2.3.dist-info/RECORD +173 -0
- smftools-0.2.3.dist-info/entry_points.txt +2 -0
- smftools/informatics/fast5_to_pod5.py +0 -21
- smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
- smftools/informatics/helpers/__init__.py +0 -74
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
- smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
- smftools/informatics/load_adata.py +0 -182
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/preprocessing/append_C_context.py +0 -82
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
- smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
- smftools/preprocessing/filter_reads_on_length.py +0 -51
- smftools/tools/call_hmm_peaks.py +0 -105
- smftools/tools/data/__init__.py +0 -2
- smftools/tools/data/anndata_data_module.py +0 -90
- smftools/tools/inference/__init__.py +0 -1
- smftools/tools/inference/lightning_inference.py +0 -41
- smftools/tools/models/base.py +0 -14
- smftools/tools/models/cnn.py +0 -34
- smftools/tools/models/lightning_base.py +0 -41
- smftools/tools/models/mlp.py +0 -17
- smftools/tools/models/sklearn_models.py +0 -40
- smftools/tools/models/transformer.py +0 -133
- smftools/tools/training/__init__.py +0 -1
- smftools/tools/training/train_lightning_model.py +0 -47
- smftools-0.1.7.dist-info/RECORD +0 -136
- /smftools/{tools/evaluation → cli}/__init__.py +0 -0
- /smftools/{tools → hmm}/calculate_distances.py +0 -0
- /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
- /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
- /smftools/{tools → machine_learning}/models/__init__.py +0 -0
- /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
- /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
- /smftools/{tools → machine_learning}/utils/device.py +0 -0
- /smftools/{tools → machine_learning}/utils/grl.py +0 -0
- /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
- /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
smftools/readwrite.py
CHANGED
@@ -1,4 +1,15 @@
 ## readwrite ##
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Union, Iterable
+
+from pathlib import Path
+from typing import Iterable, Sequence, Optional
+
+import warnings
+import pandas as pd
+import anndata as ad

 ######################################################################################################
 ## Datetime functionality
@@ -21,6 +32,101 @@ def time_string():
     return current_time.strftime("%H:%M:%S")
 ######################################################################################################

+######################################################################################################
+## General file and directory handling
+def make_dirs(directories: Union[str, Path, Iterable[Union[str, Path]]]) -> None:
+    """
+    Create one or multiple directories.
+
+    Parameters
+    ----------
+    directories : str | Path | list/iterable of str | Path
+        Paths of directories to create. If a file path is passed,
+        the parent directory is created.
+
+    Returns
+    -------
+    None
+    """
+
+    # allow user to pass a single string/Path
+    if isinstance(directories, (str, Path)):
+        directories = [directories]
+
+    for d in directories:
+        p = Path(d)
+
+        # If someone passes in a file path, make its parent
+        if p.suffix:  # p.suffix != "" means it's a file
+            p = p.parent
+
+        p.mkdir(parents=True, exist_ok=True)
+
+def add_or_update_column_in_csv(
+    csv_path: str | Path,
+    column_name: str,
+    values,
+    index: bool = False,
+):
+    """
+    Add (or overwrite) a column in a CSV file.
+    If the CSV does not exist, create it containing only that column.
+
+    Parameters
+    ----------
+    csv_path : str | Path
+        Path to the CSV file.
+    column_name : str
+        Name of the column to add or update.
+    values : list | scalar | callable
+        - If list/Series: must match the number of rows.
+        - If scalar: broadcast to all rows (or single-row CSV if new file).
+        - If callable(df): function should return the column values based on df.
+    index : bool
+        Whether to write the pandas index into the CSV. Default False.
+
+    Returns
+    -------
+    pd.DataFrame : the updated DataFrame.
+    """
+    csv_path = Path(csv_path)
+    csv_path.parent.mkdir(parents=True, exist_ok=True)
+
+    # Case 1 — CSV does not exist → create it
+    if not csv_path.exists():
+        if hasattr(values, "__len__") and not isinstance(values, str):
+            df = pd.DataFrame({column_name: list(values)})
+        else:
+            df = pd.DataFrame({column_name: [values]})
+        df.to_csv(csv_path, index=index)
+        return df
+
+    # Case 2 — CSV exists → load + modify
+    df = pd.read_csv(csv_path)
+
+    # If values is callable, call it with df
+    if callable(values):
+        values = values(df)
+
+    # Broadcast scalar
+    if not hasattr(values, "__len__") or isinstance(values, str):
+        df[column_name] = values
+        df.to_csv(csv_path, index=index)
+        return df
+
+    # Sequence case: lengths must match
+    if len(values) != len(df):
+        raise ValueError(
+            f"Length mismatch: CSV has {len(df)} rows "
+            f"but values has {len(values)} entries."
+        )
+
+    df[column_name] = list(values)
+    df.to_csv(csv_path, index=index)
+    return df
+
+######################################################################################################
+
 ######################################################################################################
 ## Numpy, Pandas, Anndata functionality

@@ -62,7 +168,6 @@ def adata_to_df(adata, layer=None):

     return df

-
 def save_matrix(matrix, save_name):
     """
     Input: A numpy matrix and a save_name
@@ -71,106 +176,913 @@ def save_matrix(matrix, save_name):
     import numpy as np
     np.savetxt(f'{save_name}.txt', matrix)

-def concatenate_h5ads(
+def concatenate_h5ads(
+    output_path: str | Path,
+    *,
+    input_dir: str | Path | None = None,
+    csv_path: str | Path | None = None,
+    csv_column: str = "h5ad_path",
+    file_suffixes: Sequence[str] = (".h5ad", ".h5ad.gz"),
+    delete_inputs: bool = False,
+    restore_backups: bool = True,
+) -> Path:
     """
-    Concatenate
-
+    Concatenate multiple .h5ad files into one AnnData and write it safely.
+
+    Two input modes (choose ONE):
+      1) Directory mode: use all *.h5ad / *.h5ad.gz in `input_dir`.
+      2) CSV mode: use file paths from column `csv_column` in `csv_path`.
+
+    Parameters
+    ----------
+    output_path
+        Path to the final concatenated .h5ad (can be .h5ad or .h5ad.gz).
+    input_dir
+        Directory containing .h5ad files to concatenate. If None and csv_path
+        is also None, defaults to the current working directory.
+    csv_path
+        Path to a CSV containing file paths to concatenate (in column `csv_column`).
+    csv_column
+        Name of the column in the CSV containing .h5ad paths.
+    file_suffixes
+        Tuple of allowed suffixes (default: (".h5ad", ".h5ad.gz")).
+    delete_inputs
+        If True, delete the input .h5ad files after successful write of output.
+    restore_backups
+        Passed through to `safe_read_h5ad(restore_backups=...)`.
+
+    Returns
+    -------
+    Path
+        The path to the written concatenated .h5ad file.
+
+    Raises
+    ------
+    ValueError
+        If both `input_dir` and `csv_path` are provided, or none contain files.
+    FileNotFoundError
+        If specified CSV or directory does not exist.
     """
-
-
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    # ------------------------------------------------------------------
+    # Setup and input resolution
+    # ------------------------------------------------------------------
+    output_path = Path(output_path)
+
+    if input_dir is not None and csv_path is not None:
+        raise ValueError("Provide either `input_dir` OR `csv_path`, not both.")
+
+    if csv_path is None:
+        # Directory mode
+        input_dir = Path(input_dir) if input_dir is not None else Path.cwd()
+        if not input_dir.exists():
+            raise FileNotFoundError(f"Input directory does not exist: {input_dir}")
+        if not input_dir.is_dir():
+            raise ValueError(f"input_dir is not a directory: {input_dir}")
+
+        # collect all *.h5ad / *.h5ad.gz (or whatever file_suffixes specify)
+        suffixes_lower = tuple(s.lower() for s in file_suffixes)
+        h5_paths = sorted(
+            p for p in input_dir.iterdir()
+            if p.is_file() and p.suffix.lower() in suffixes_lower
+        )
+
+    else:
+        # CSV mode
+        csv_path = Path(csv_path)
+        if not csv_path.exists():
+            raise FileNotFoundError(f"CSV path does not exist: {csv_path}")
+
+        df = pd.read_csv(csv_path, dtype=str)
+        if csv_column not in df.columns:
+            raise ValueError(
+                f"CSV {csv_path} must contain column '{csv_column}' with .h5ad paths."
+            )
+        paths = df[csv_column].dropna().astype(str).tolist()
+        if not paths:
+            raise ValueError(f"No non-empty paths in column '{csv_column}' of {csv_path}.")
+
+        h5_paths = [Path(p).expanduser() for p in paths]
+
+    if not h5_paths:
+        raise ValueError("No input .h5ad files found to concatenate.")
+
+    # Ensure directory for output exists
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    # ------------------------------------------------------------------
+    # Concatenate
+    # ------------------------------------------------------------------
+    warnings.filterwarnings("ignore", category=UserWarning, module="anndata")
+    warnings.filterwarnings("ignore", category=FutureWarning, module="anndata")
+
+    print(f"{time_string()}: Found {len(h5_paths)} input h5ad files:")
+    for p in h5_paths:
+        print(f"  - {p}")
+
+    final_adata: Optional[ad.AnnData] = None
+
+    for p in h5_paths:
+        print(f"{time_string()}: Reading {p}")
+        temp_adata, read_report = safe_read_h5ad(p, restore_backups=restore_backups)
+
+        if final_adata is None:
+            print(f"{time_string()}: Initializing final AnnData with {p}")
             final_adata = temp_adata
-
-
+        else:
+            print(f"{time_string()}: Concatenating {p} into final AnnData")
+            final_adata = ad.concat(
+                [final_adata, temp_adata],
+                join="outer",
+                merge='unique',
+                uns_merge='unique',
+                index_unique=None,
+            )
+
+    if final_adata is None:
+        raise RuntimeError("Unexpected: no AnnData objects loaded.")

-
+    print(f"{time_string()}: Writing concatenated AnnData to {output_path}")
+    safe_write_h5ad(final_adata, output_path, backup=restore_backups)
+
+    # ------------------------------------------------------------------
+    # Optional cleanup (delete inputs)
+    # ------------------------------------------------------------------
     if delete_inputs:
-
-
-
-
-
-
-
-
-                print(f"Deleted file: {
-
-
+        out_resolved = output_path.resolve()
+        for p in h5_paths:
+            try:
+                # Don't delete the output file if it happens to be in the list
+                if p.resolve() == out_resolved:
+                    continue
+                if p.exists():
+                    p.unlink()
+                    print(f"Deleted input file: {p}")
+            except OSError as e:
+                print(f"Error deleting file {p}: {e}")
     else:
-        print(
+        print("Keeping input files.")

-
+    return output_path
+
+def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=None, verbose=True):
     """
-
+    Save an AnnData safely by sanitizing .obs, .var, .uns, .layers, and .obsm.

-
-    adata (AnnData): The AnnData object to save.
-    path (str): Output .h5ad file path.
-    compression (str): Compression method for h5ad file.
-    backup (bool): If True, saves problematic columns to CSV files.
-    backup_dir (str): Directory to store backups if backup=True.
+    Returns a report dict and prints a summary of what was converted/backed up/skipped.
     """
-    import
+    import os, json, pickle
+    from pathlib import Path
+    import numpy as np
     import pandas as pd
-    import
+    import warnings
+    import anndata as _ad
+
+    path = Path(path)
+
+    if not backup_dir:
+        backup_dir = path.parent / str(path.name).split(".")[0]

     os.makedirs(backup_dir, exist_ok=True)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # report structure
+    report = {
+        "obs_converted_columns": [],
+        "obs_backed_up_columns": [],
+        "var_converted_columns": [],
+        "var_backed_up_columns": [],
+        "uns_backed_up_keys": [],
+        "uns_json_keys": [],
+        "layers_converted": [],
+        "layers_skipped": [],
+        "obsm_converted": [],
+        "obsm_skipped": [],
+        "X_replaced_or_converted": None,
+        "errors": [],
+    }
+
+    def _backup(obj, name):
+        """Pickle obj to backup_dir/name.pkl and return filename (or None)."""
+        fname = backup_dir / f"{name}.pkl"
+        try:
+            with open(fname, "wb") as fh:
+                pickle.dump(obj, fh, protocol=pickle.HIGHEST_PROTOCOL)
+            if verbose:
+                print(f"  backed up {name} -> {fname}")
+            return fname
+        except Exception as e:
+            msg = f"failed to back up {name}: {e}"
+            if verbose:
+                print("  " + msg)
+            report["errors"].append(msg)
+            return None
+
+    def _make_obs_var_safe(df: pd.DataFrame, which: str):
+        """
+        Return a sanitized copy of df where:
+          - object columns converted to strings (with backup)
+          - categorical columns' categories coerced to str (with backup)
+        """
+        df = df.copy()
+        for col in list(df.columns):
+            ser = df[col]
+            # categorical handling
+            try:
+                is_cat = pd.api.types.is_categorical_dtype(ser.dtype)
+            except Exception:
+                is_cat = False
+
+            if is_cat:
+                try:
+                    cats = ser.cat.categories
+                    cats_str = cats.astype(str)
+                    df[col] = pd.Categorical(ser.astype(str), categories=cats_str)
+                    if verbose:
+                        print(f"  coerced categorical column '{which}.{col}' -> string categories")
+                    if which == "obs":
+                        report["obs_converted_columns"].append(col)
+                    else:
+                        report["var_converted_columns"].append(col)
+                except Exception:
+                    # backup then coerce
+                    if backup:
+                        _backup(ser, f"{which}.{col}_categorical_backup")
+                        if which == "obs":
+                            report["obs_backed_up_columns"].append(col)
+                        else:
+                            report["var_backed_up_columns"].append(col)
+                    df[col] = ser.astype(str)
+                    if verbose:
+                        print(f"  coerced categorical column '{which}.{col}' -> strings (backup={backup})")
+                continue
+
+            # object dtype handling: try to coerce each element to string
+            try:
+                is_obj = ser.dtype == object or pd.api.types.is_object_dtype(ser.dtype)
+            except Exception:
+                is_obj = False
+
+            if is_obj:
+                # test whether converting to string succeeds for all elements
+                try:
+                    _ = np.array(ser.values.astype(str))
+                    if backup:
+                        _backup(ser.values, f"{which}.{col}_backup")
+                        if which == "obs":
+                            report["obs_backed_up_columns"].append(col)
+                        else:
+                            report["var_backed_up_columns"].append(col)
+                    df[col] = ser.values.astype(str)
+                    if verbose:
+                        print(f"  converted object column '{which}.{col}' -> strings (backup={backup})")
+                    if which == "obs":
+                        report["obs_converted_columns"].append(col)
+                    else:
+                        report["var_converted_columns"].append(col)
+                except Exception:
+                    # fallback: attempt per-element json.dumps; if fails mark as backed-up and coerce via str()
+                    convertible = True
+                    for val in ser.values:
+                        try:
+                            json.dumps(val, default=str)
+                        except Exception:
+                            convertible = False
+                            break
+                    if convertible:
+                        if backup:
+                            _backup(ser.values, f"{which}.{col}_backup")
+                            if which == "obs":
+                                report["obs_backed_up_columns"].append(col)
+                            else:
+                                report["var_backed_up_columns"].append(col)
+                        df[col] = [json.dumps(v, default=str) for v in ser.values]
+                        if verbose:
+                            print(f"  json-stringified object column '{which}.{col}' (backup={backup})")
+                        if which == "obs":
+                            report["obs_converted_columns"].append(col)
+                        else:
+                            report["var_converted_columns"].append(col)
+                    else:
+                        # fallback to string repr and backup
+                        if backup:
+                            _backup(ser.values, f"{which}.{col}_backup")
+                            if which == "obs":
+                                report["obs_backed_up_columns"].append(col)
+                            else:
+                                report["var_backed_up_columns"].append(col)
+                        df[col] = ser.astype(str)
+                        if verbose:
+                            print(f"  WARNING: column '{which}.{col}' was complex; coerced via str() (backed up).")
+                        if which == "obs":
+                            report["obs_converted_columns"].append(col)
+                        else:
+                            report["var_converted_columns"].append(col)
+        return df
+
+    def _sanitize_uns(uns: dict):
+        """
+        For each key/value in uns:
+          - if json.dumps(value) works: keep it
+          - else: pickle value to backup dir, and add a JSON-stringified representation under key+'_json'
+        """
+        clean = {}
+        backed_up = []
+        for k, v in uns.items():
+            try:
+                json.dumps(v)
+                clean[k] = v
+            except Exception:
+                try:
+                    s = json.dumps(v, default=str)
+                    clean[k + "_json"] = s
+                    if backup:
+                        _backup(v, f"uns_{k}_backup")
+                        backed_up.append(k)
+                    if verbose:
+                        print(f"  uns['{k}'] non-JSON -> stored '{k}_json' and backed up (backup={backup})")
+                    report["uns_json_keys"].append(k)
+                except Exception:
+                    try:
+                        if backup:
+                            _backup(v, f"uns_{k}_backup")
+                        clean[k + "_str"] = str(v)
+                        backed_up.append(k)
+                        if verbose:
+                            print(f"  uns['{k}'] stored as string under '{k}_str' (backed up).")
+                        report["uns_backed_up_keys"].append(k)
+                    except Exception as e:
+                        msg = f"uns['{k}'] could not be preserved: {e}"
+                        report["errors"].append(msg)
+                        if verbose:
+                            print("  " + msg)
+        if backed_up and verbose:
+            print(f"Sanitized .uns keys (backed up): {backed_up}")
+        return clean
+
+    def _sanitize_layers_obsm(src_dict, which: str):
+        """
+        Ensure arrays in layers/obsm are numeric and non-object dtype.
+        Returns a cleaned dict suitable to pass into AnnData(...)
+        If an entry is not convertible, it is backed up & skipped.
+        """
+        cleaned = {}
+        for k, v in src_dict.items():
+            try:
+                arr = np.asarray(v)
+                if arr.dtype == object:
+                    try:
+                        arr_f = arr.astype(float)
+                        cleaned[k] = arr_f
+                        report_key = f"{which}.{k}"
+                        report["layers_converted"].append(report_key) if which == "layers" else report["obsm_converted"].append(report_key)
+                        if verbose:
+                            print(f"  {which}.{k} object array coerced to float.")
+                    except Exception:
+                        try:
+                            arr_i = arr.astype(int)
+                            cleaned[k] = arr_i
+                            report_key = f"{which}.{k}"
+                            report["layers_converted"].append(report_key) if which == "layers" else report["obsm_converted"].append(report_key)
+                            if verbose:
+                                print(f"  {which}.{k} object array coerced to int.")
+                        except Exception:
+                            if backup:
+                                _backup(v, f"{which}_{k}_backup")
+                            if which == "layers":
+                                report["layers_skipped"].append(k)
+                            else:
+                                report["obsm_skipped"].append(k)
+                            if verbose:
+                                print(f"  SKIPPING {which}.{k} (object dtype not numeric). Backed up: {backup}")
+                            continue
+                else:
+                    cleaned[k] = arr
+            except Exception as e:
+                if backup:
+                    _backup(v, f"{which}_{k}_backup")
+                if which == "layers":
+                    report["layers_skipped"].append(k)
+                else:
+                    report["obsm_skipped"].append(k)
+                msg = f"  SKIPPING {which}.{k} due to conversion error: {e}"
+                report["errors"].append(msg)
+                if verbose:
+                    print(msg)
+                continue
+        return cleaned
+
+    # ---------- sanitize obs, var ----------
+    try:
+        obs_clean = _make_obs_var_safe(adata.obs, "obs")
+    except Exception as e:
+        msg = f"Failed to sanitize obs: {e}"
+        report["errors"].append(msg)
+        if verbose:
+            print(msg)
+        obs_clean = adata.obs.copy()
+
+    try:
+        var_clean = _make_obs_var_safe(adata.var, "var")
+    except Exception as e:
+        msg = f"Failed to sanitize var: {e}"
+        report["errors"].append(msg)
+        if verbose:
+            print(msg)
+        var_clean = adata.var.copy()
+
+    # ---------- sanitize uns ----------
+    try:
+        uns_clean = _sanitize_uns(adata.uns)
+    except Exception as e:
+        msg = f"Failed to sanitize uns: {e}"
+        report["errors"].append(msg)
+        if verbose:
+            print(msg)
+        uns_clean = {}
+
+    # ---------- sanitize layers and obsm ----------
+    layers_src = getattr(adata, "layers", {})
+    obsm_src = getattr(adata, "obsm", {})
+
+    try:
+        layers_clean = _sanitize_layers_obsm(layers_src, "layers")
+    except Exception as e:
+        msg = f"Failed to sanitize layers: {e}"
+        report["errors"].append(msg)
+        if verbose:
+            print(msg)
+        layers_clean = {}
+
+    try:
+        obsm_clean = _sanitize_layers_obsm(obsm_src, "obsm")
+    except Exception as e:
+        msg = f"Failed to sanitize obsm: {e}"
+        report["errors"].append(msg)
+        if verbose:
+            print(msg)
+        obsm_clean = {}
+
+    # ---------- handle X ----------
+    X_to_use = adata.X
+    try:
+        X_arr = np.asarray(adata.X)
+        if X_arr.dtype == object:
+            try:
+                X_to_use = X_arr.astype(float)
+                report["X_replaced_or_converted"] = "converted_to_float"
+                if verbose:
+                    print("Converted adata.X object-dtype -> float")
+            except Exception:
+                if backup:
+                    _backup(adata.X, "X_backup")
+                X_to_use = np.zeros_like(X_arr, dtype=float)
+                report["X_replaced_or_converted"] = "replaced_with_zeros_backup"
+                if verbose:
+                    print("adata.X had object dtype and couldn't be converted; replaced with zeros (backup set).")
+    except Exception as e:
+        msg = f"Error handling adata.X: {e}"
+        report["errors"].append(msg)
+        if verbose:
+            print(msg)
+        X_to_use = adata.X
+
+    # ---------- build lightweight AnnData copy ----------
+    try:
+        adata_copy = _ad.AnnData(
+            X=X_to_use,
+            obs=obs_clean,
+            var=var_clean,
+            layers=layers_clean,
+            uns=uns_clean,
+            obsm=obsm_clean,
+            varm=getattr(adata, "varm", None),
+        )
+
+        # preserve names (as strings)
+        try:
+            adata_copy.obs_names = adata.obs_names.astype(str)
+            adata_copy.var_names = adata.var_names.astype(str)
+        except Exception:
+            adata_copy.obs_names = adata.obs_names
+            adata_copy.var_names = adata.var_names
+
+        # --- write
+        adata_copy.write_h5ad(path, compression=compression)
+        if verbose:
+            print(f"Saved safely to {path}")
+    except Exception as e:
+        msg = f"Failed to write h5ad: {e}"
+        report["errors"].append(msg)
+        if verbose:
+            print(msg)
+        raise
+
+    # Print a concise interactive report
+    print("\n=== safe_write_h5ad REPORT ===")
+    print(f"Saved file: {path}")
+    print(f"Adata shape: {adata.shape}")
+    if report["obs_converted_columns"] or report["obs_backed_up_columns"]:
+        print(f"obs: converted columns -> {report['obs_converted_columns']}")
+        print(f"obs: backed-up columns -> {report['obs_backed_up_columns']}")
+    else:
+        print("obs: no problematic columns found.")
+
+    if report["var_converted_columns"] or report["var_backed_up_columns"]:
+        print(f"var: converted columns -> {report['var_converted_columns']}")
+        print(f"var: backed-up columns -> {report['var_backed_up_columns']}")
+    else:
+        print("var: no problematic columns found.")
+
+    if report["uns_json_keys"] or report["uns_backed_up_keys"]:
+        print(f".uns: jsonified keys -> {report['uns_json_keys']}")
+        print(f".uns: backed-up keys -> {report['uns_backed_up_keys']}")
+    else:
+        print(".uns: no problematic keys found.")
+
+    if report["layers_converted"] or report["layers_skipped"]:
+        print(f"layers: converted -> {report['layers_converted']}")
+        print(f"layers: skipped -> {report['layers_skipped']}")
+    else:
+        print("layers: no problematic entries found.")
+
+    if report["obsm_converted"] or report["obsm_skipped"]:
+        print(f"obsm: converted -> {report['obsm_converted']}")
+        print(f"obsm: skipped -> {report['obsm_skipped']}")
+    else:
+        print("obsm: no problematic entries found.")
+
+    if report["X_replaced_or_converted"]:
+        print(f"adata.X handled: {report['X_replaced_or_converted']}")
+    else:
+        print("adata.X: no changes.")
+
+    if report["errors"]:
+        print("\nWarnings / errors encountered:")
+        for e in report["errors"]:
+            print("  -", e)
+
+    print("=== end report ===\n")
+    return report
+
+def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=True, categorical_threshold=100, verbose=True):
+    """
+    Safely load an AnnData saved by safe_write_h5ad and attempt to restore complex objects
+    from the backup_dir produced during save.
+
+    Parameters
+    ----------
+    path : str
+        Path to the cleaned .h5ad produced by safe_write_h5ad.
+    backup_dir : str
+        Directory where safe_write_h5ad stored pickled backups (default "./uns_backups").
+    restore_backups : bool
+        If True, attempt to load pickled backups and restore original objects into adata.
+    re_categorize : bool
+        If True, try to coerce small unique-count string columns back into pandas.Categorical.
+    categorical_threshold : int
+        Max unique values for a column to be considered categorical for automatic recasting.
+    verbose : bool
+        Print progress/summary.
+
+    Returns
+    -------
+    (adata, report) :
+        adata : AnnData
+            The reloaded (and possibly restored) AnnData instance.
+        report : dict
+            A report describing restored items, parsed JSON keys, and any failures.
+    """
+    import os
+    from pathlib import Path
+    import json
+    import pickle
+    import numpy as np
+    import pandas as pd
+    import anndata as _ad
+
+    path = Path(path)
+
+    if not backup_dir:
+        backup_dir = path.parent / str(path.name).split(".")[0]
+
+    report = {
+        "restored_obs_columns": [],
+        "restored_var_columns": [],
+        "restored_uns_keys": [],
+        "parsed_uns_json_keys": [],
+        "restored_layers": [],
+        "restored_obsm": [],
+        "recategorized_obs": [],
+        "recategorized_var": [],
+        "missing_backups": [],
+        "errors": [],
+    }
+
+    if verbose:
+        print(f"[safe_read_h5ad] loading {path}")
+
+    # 1) load the cleaned h5ad
+    try:
+        adata = _ad.read_h5ad(path)
+    except Exception as e:
+        raise RuntimeError(f"Failed to read h5ad at {path}: {e}")
+
+    # Ensure backup_dir exists (may be relative to cwd)
+    if verbose:
+        print(f"[safe_read_h5ad] looking for backups in {backup_dir}")
+
+    def _load_pickle_if_exists(fname):
+        if os.path.exists(fname):
+            try:
+                with open(fname, "rb") as fh:
+                    val = pickle.load(fh)
+                return val
+            except Exception as e:
+                report["errors"].append(f"Failed to load pickle {fname}: {e}")
+                if verbose:
+                    print(f"  error loading {fname}: {e}")
+                return None
+        return None
+
+    # 2) Restore obs columns
+    for col in list(adata.obs.columns):
+        # Look for backup with exact naming from safe_write_h5ad: "obs.<col>_backup.pkl" or "obs.<col>_categorical_backup.pkl"
+        bname1 = backup_dir / f"obs.{col}_backup.pkl"
+        bname2 = backup_dir / f"obs.{col}_categorical_backup.pkl"
+        restored = False
+
+        if restore_backups:
+            val = _load_pickle_if_exists(bname2)
+            if val is not None:
+                # val may be the categorical series or categories
+                try:
+                    # If pickled numpy array or pandas Series, coerce to same index alignment
+                    if hasattr(val, "shape") and (len(val) == adata.shape[0]):
+                        adata.obs[col] = pd.Series(val, index=adata.obs.index)
+                    else:
+                        # fallback: place pickled object directly
+                        adata.obs[col] = pd.Series([val] * adata.shape[0], index=adata.obs.index)
+                    report["restored_obs_columns"].append((col, bname2))
+                    restored = True
+                    if verbose:
+                        print(f"[safe_read_h5ad] restored obs.{col} from {bname2}")
+                except Exception as e:
+                    report["errors"].append(f"Failed to restore obs.{col} from {bname2}: {e}")
+                    restored = False
+
+        if not restored:
+            val = _load_pickle_if_exists(bname1)
+            if val is not None:
+                try:
+                    if hasattr(val, "shape") and (len(val) == adata.shape[0]):
+                        adata.obs[col] = pd.Series(val, index=adata.obs.index)
+                    else:
+                        adata.obs[col] = pd.Series([val] * adata.shape[0], index=adata.obs.index)
+                    report["restored_obs_columns"].append((col, bname1))
+                    restored = True
+                    if verbose:
+                        print(f"[safe_read_h5ad] restored obs.{col} from {bname1}")
+                except Exception as e:
+                    report["errors"].append(f"Failed to restore obs.{col} from {bname1}: {e}")
+                    restored = False
+
+        # If not restored and column dtype is object but contains JSON-like strings, try json.loads per element
+        if (not restored) and (adata.obs[col].dtype == object):
+            sample_vals = adata.obs[col].dropna().astype(str).head(20).tolist()
+            looks_like_json = False
+            for sv in sample_vals:
+                svs = sv.strip()
+                if (svs.startswith("{") and svs.endswith("}")) or (svs.startswith("[") and svs.endswith("]")):
+                    looks_like_json = True
+                    break
+            if looks_like_json:
+                parsed = []
+                success_parse = True
+                for v in adata.obs[col].astype(str).values:
+                    try:
+                        parsed.append(json.loads(v))
+                    except Exception:
+                        # if any element fails, don't convert whole column
+                        success_parse = False
+                        break
+                if success_parse:
+                    adata.obs[col] = pd.Series(parsed, index=adata.obs.index)
+                    report["restored_obs_columns"].append((col, "parsed_json"))
+                    restored = True
+                    if verbose:
+                        print(f"[safe_read_h5ad] parsed obs.{col} JSON strings back to Python objects")
+
+        # If still not restored and re_categorize=True, try to convert small unique string columns back to categorical
+        if (not restored) and re_categorize and adata.obs[col].dtype == object:
+            try:
+                nunique = adata.obs[col].dropna().astype(str).nunique()
+                if nunique > 0 and nunique <= categorical_threshold:
+                    # cast to category
+                    adata.obs[col] = adata.obs[col].astype(str).astype("category")
+                    report["recategorized_obs"].append(col)
+                    if verbose:
+                        print(f"[safe_read_h5ad] recast obs.{col} -> categorical (n_unique={nunique})")
+            except Exception as e:
+                report["errors"].append(f"Failed to recategorize obs.{col}: {e}")
+
+    # 3) Restore var columns (same logic)
+    for col in list(adata.var.columns):
+        bname1 = os.path.join(backup_dir, f"var.{col}_backup.pkl")
+        bname2 = os.path.join(backup_dir, f"var.{col}_categorical_backup.pkl")
+        restored = False
+
+        if restore_backups:
+            val = _load_pickle_if_exists(bname2)
+            if val is not None:
+                try:
+                    if hasattr(val, "shape") and (len(val) == adata.shape[1]):
+                        adata.var[col] = pd.Series(val, index=adata.var.index)
+                    else:
+                        adata.var[col] = pd.Series([val] * adata.shape[1], index=adata.var.index)
+                    report["restored_var_columns"].append((col, bname2))
+                    restored = True
+                    if verbose:
+                        print(f"[safe_read_h5ad] restored var.{col} from {bname2}")
+                except Exception as e:
+                    report["errors"].append(f"Failed to restore var.{col} from {bname2}: {e}")
+
+        if not restored:
+            val = _load_pickle_if_exists(bname1)
+            if val is not None:
+                try:
+                    if hasattr(val, "shape") and (len(val) == adata.shape[1]):
+                        adata.var[col] = pd.Series(val, index=adata.var.index)
+                    else:
+                        adata.var[col] = pd.Series([val] * adata.shape[1], index=adata.var.index)
+                    report["restored_var_columns"].append((col, bname1))
+                    restored = True
+                    if verbose:
+                        print(f"[safe_read_h5ad] restored var.{col} from {bname1}")
+                except Exception as e:
+                    report["errors"].append(f"Failed to restore var.{col} from {bname1}: {e}")
+
+        if (not restored) and (adata.var[col].dtype == object):
+            # try JSON parsing
+            sample_vals = adata.var[col].dropna().astype(str).head(20).tolist()
+            looks_like_json = False
+            for sv in sample_vals:
+                svs = sv.strip()
+                if (svs.startswith("{") and svs.endswith("}")) or (svs.startswith("[") and svs.endswith("]")):
+                    looks_like_json = True
+                    break
+            if looks_like_json:
+                parsed = []
+                success_parse = True
+                for v in adata.var[col].astype(str).values:
+                    try:
+                        parsed.append(json.loads(v))
+                    except Exception:
+                        success_parse = False
+                        break
+                if success_parse:
+                    adata.var[col] = pd.Series(parsed, index=adata.var.index)
+                    report["restored_var_columns"].append((col, "parsed_json"))
+                    if verbose:
+                        print(f"[safe_read_h5ad] parsed var.{col} JSON strings back to Python objects")
+
+        if (not restored) and re_categorize and adata.var[col].dtype == object:
+            try:
+                nunique = adata.var[col].dropna().astype(str).nunique()
+                if nunique > 0 and nunique <= categorical_threshold:
+                    adata.var[col] = adata.var[col].astype(str).astype("category")
+                    report["recategorized_var"].append(col)
+                    if verbose:
+                        print(f"[safe_read_h5ad] recast var.{col} -> categorical (n_unique={nunique})")
+            except Exception as e:
+                report["errors"].append(f"Failed to recategorize var.{col}: {e}")
+
+    # 4) Restore uns: look for uns_{k}_backup.pkl, or keys like "<k>_json"
+    uns_keys = list(adata.uns.keys())
+    # First, if we have "<k>_json", convert back into k
+    for k in uns_keys:
+        if k.endswith("_json"):
+            base = k[:-5]
+            sval = adata.uns.get(k)
+            try:
+                parsed = json.loads(sval)
+                adata.uns[base] = parsed
+                report["parsed_uns_json_keys"].append(base)
+                if verbose:
+                    print(f"[safe_read_h5ad] parsed adata.uns['{k}'] -> adata.uns['{base}']")
+                # remove the _json entry
+                try:
+                    del adata.uns[k]
+                except KeyError:
+                    pass
+            except Exception as e:
+                report["errors"].append(f"Failed to json-parse uns['{k}']: {e}")
+
+    # Now try to restore pickled backups for uns keys
+    # Look for files named uns_<key>_backup.pkl
+    # We will attempt to restore into adata.uns[key] if backup exists
+    for fname in os.listdir(backup_dir) if os.path.isdir(backup_dir) else []:
+        if not fname.startswith("uns_") or not fname.endswith("_backup.pkl"):
+            continue
+        # fname example: "uns_clustermap_results_backup.pkl" -> key name between 'uns_' and '_backup.pkl'
+        key = fname[len("uns_"):-len("_backup.pkl")]
+        full = os.path.join(backup_dir, fname)
+        val = _load_pickle_if_exists(full)
+        if val is not None:
+            adata.uns[key] = val
+            report["restored_uns_keys"].append((key, full))
+            if verbose:
+                print(f"[safe_read_h5ad] restored adata.uns['{key}'] from {full}")
+
+    # 5) Restore layers and obsm from backups if present
+    # expected backup names: layers_<name>_backup.pkl, obsm_<name>_backup.pkl
+    if os.path.isdir(backup_dir):
+        for fname in os.listdir(backup_dir):
+            if fname.startswith("layers_") and fname.endswith("_backup.pkl"):
+                layer_name = fname[len("layers_"):-len("_backup.pkl")]
+                full = os.path.join(backup_dir, fname)
+                val = _load_pickle_if_exists(full)
+                if val is not None:
+                    try:
+                        adata.layers[layer_name] = np.asarray(val)
+                        report["restored_layers"].append((layer_name, full))
+                        if verbose:
+                            print(f"[safe_read_h5ad] restored layers['{layer_name}'] from {full}")
+                    except Exception as e:
+                        report["errors"].append(f"Failed to restore layers['{layer_name}'] from {full}: {e}")
+
+            if fname.startswith("obsm_") and fname.endswith("_backup.pkl"):
+                obsm_name = fname[len("obsm_"):-len("_backup.pkl")]
+                full = os.path.join(backup_dir, fname)
+                val = _load_pickle_if_exists(full)
+                if val is not None:
+                    try:
+                        adata.obsm[obsm_name] = np.asarray(val)
+                        report["restored_obsm"].append((obsm_name, full))
+                        if verbose:
+                            print(f"[safe_read_h5ad] restored obsm['{obsm_name}'] from {full}")
+                    except Exception as e:
+                        report["errors"].append(f"Failed to restore obsm['{obsm_name}'] from {full}: {e}")
+
+    # 6) If restore_backups True but some expected backups missing, note them
+    if restore_backups and os.path.isdir(backup_dir):
+        # detect common expected names from obs/var/uns/layers in adata
+        expected_missing = []
+        # obs/var columns
+        for col in list(adata.obs.columns):
+            p1 = os.path.join(backup_dir, f"obs.{col}_backup.pkl")
+            p2 = os.path.join(backup_dir, f"obs.{col}_categorical_backup.pkl")
+            if (not os.path.exists(p1)) and (not os.path.exists(p2)):
+                # we don't require backups for every column; only record if column still looks like placeholder strings
+                if adata.obs[col].dtype == object:
+                    expected_missing.append(("obs", col))
+        for col in list(adata.var.columns):
+            p1 = os.path.join(backup_dir, f"var.{col}_backup.pkl")
+            p2 = os.path.join(backup_dir, f"var.{col}_categorical_backup.pkl")
+            if (not os.path.exists(p1)) and (not os.path.exists(p2)):
+                if adata.var[col].dtype == object:
+                    expected_missing.append(("var", col))
+        # uns keys
+        for k in adata.uns.keys():
+            # if we have *_json or *_str variants we expect backups optionally
+            if k.endswith("_json") or k.endswith("_str"):
+                b = os.path.join(backup_dir, f"uns_{k[:-5]}_backup.pkl")
+                if not os.path.exists(b):
+                    report["missing_backups"].append(("uns", k))
+        if expected_missing and verbose:
+            n = len(expected_missing)
+            if verbose:
+                print(f"[safe_read_h5ad] note: {n} obs/var object columns may not have backups; check if their content is acceptable.")
+            # add to report
+            report["missing_backups"].extend(expected_missing)
+
+    # final summary print
+    if verbose:
+        print("\n=== safe_read_h5ad summary ===")
+        if report["restored_obs_columns"]:
+            print("Restored obs columns:", report["restored_obs_columns"])
+        if report["restored_var_columns"]:
+            print("Restored var columns:", report["restored_var_columns"])
+        if report["restored_uns_keys"]:
+            print("Restored uns keys:", report["restored_uns_keys"])
+        if report["parsed_uns_json_keys"]:
+            print("Parsed uns JSON keys:", report["parsed_uns_json_keys"])
+        if report["restored_layers"]:
+            print("Restored layers:", report["restored_layers"])
+        if report["restored_obsm"]:
+            print("Restored obsm:", report["restored_obsm"])
+        if report["recategorized_obs"] or report["recategorized_var"]:
+            print("Recategorized columns (obs/var):", report["recategorized_obs"], report["recategorized_var"])
+        if report["missing_backups"]:
+            print("Missing backups or object columns without backups (investigate):", report["missing_backups"])
+        if report["errors"]:
+            print("Errors encountered (see report['errors']):")
+            for e in report["errors"]:
+                print("  -", e)
+        print("=== end summary ===\n")
+
+    return adata, report
+
+def merge_barcoded_anndatas_core(adata_single, adata_double):
     import numpy as np
     import anndata as ad

@@ -194,5 +1106,25 @@ def merge_barcoded_anndatas(adata_single, adata_double):
     adata_merged.uns = {**adata_single.uns, **adata_double.uns}

     return adata_merged
-
-
+######################################################################################################
+
+### File conversion misc ###
+import argparse
+from Bio import SeqIO
+def genbank_to_gff(genbank_file, output_file, record_id):
+    with open(output_file, "w") as out:
+        for record in SeqIO.parse(genbank_file, "genbank"):
+            for feature in record.features:
+                # Skip features without location information
+                if feature.location is None:
+                    continue
+                # Extract feature information
+                start = feature.location.start + 1  # Convert to 1-based index
+                end = feature.location.end
+                strand = "+" if feature.location.strand == 1 else "-"
+                feature_type = feature.type
+                # Format attributes
+                attributes = ";".join(f"{k}={v}" for k, v in feature.qualifiers.items())
+                # Write GFF3 line
+                gff3_line = "\t".join(str(x) for x in [record_id, feature.type, feature_type, start, end, ".", strand, ".", attributes])
+                out.write(gff3_line + "\n")