smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +54 -0
- smftools/cli/hmm_adata.py +937 -256
- smftools/cli/load_adata.py +448 -268
- smftools/cli/preprocess_adata.py +469 -263
- smftools/cli/spatial_adata.py +536 -319
- smftools/cli_entry.py +97 -182
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +17 -6
- smftools/config/deaminase.yaml +12 -10
- smftools/config/default.yaml +142 -33
- smftools/config/direct.yaml +11 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +594 -264
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2128 -1418
- smftools/hmm/__init__.py +2 -9
- smftools/hmm/archived/call_hmm_peaks.py +121 -0
- smftools/hmm/call_hmm_peaks.py +299 -91
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +397 -175
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +196 -30
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +422 -197
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +147 -87
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +10 -12
- smftools/preprocessing/append_base_context.py +115 -80
- smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
- smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +129 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +50 -25
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +118 -54
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +689 -272
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +103 -0
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +331 -82
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.3.dist-info/RECORD +0 -173
- /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
smftools/readwrite.py
CHANGED
@@ -1,15 +1,14 @@
 ## readwrite ##
 from __future__ import annotations
 
+import warnings
 from pathlib import Path
-from typing import
-
-from pathlib import Path
-from typing import Iterable, Sequence, Optional
+from typing import Iterable, List, Sequence, Union
 
-import warnings
-import pandas as pd
 import anndata as ad
+import pandas as pd
+from Bio import SeqIO
+
 
 ######################################################################################################
 ## Datetime functionality
@@ -18,20 +17,26 @@ def date_string():
     Each time this is called, it returns the current date string
     """
     from datetime import datetime
+
     current_date = datetime.now()
     date_string = current_date.strftime("%Y%m%d")
     date_string = date_string[2:]
     return date_string
 
+
 def time_string():
     """
     Each time this is called, it returns the current time string
     """
     from datetime import datetime
+
     current_time = datetime.now()
     return current_time.strftime("%H:%M:%S")
+
+
 ######################################################################################################
 
+
 ######################################################################################################
 ## General file and directory handling
 def make_dirs(directories: Union[str, Path, Iterable[Union[str, Path]]]) -> None:
@@ -57,11 +62,12 @@ def make_dirs(directories: Union[str, Path, Iterable[Union[str, Path]]]) -> None
         p = Path(d)
 
         # If someone passes in a file path, make its parent
-        if p.suffix:
+        if p.suffix:  # p.suffix != "" means it's a file
            p = p.parent
 
         p.mkdir(parents=True, exist_ok=True)
 
+
 def add_or_update_column_in_csv(
     csv_path: str | Path,
     column_name: str,
@@ -117,19 +123,20 @@ def add_or_update_column_in_csv(
     # Sequence case: lengths must match
     if len(values) != len(df):
         raise ValueError(
-            f"Length mismatch: CSV has {len(df)} rows "
-            f"but values has {len(values)} entries."
+            f"Length mismatch: CSV has {len(df)} rows but values has {len(values)} entries."
         )
 
     df[column_name] = list(values)
     df.to_csv(csv_path, index=index)
     return df
 
+
 ######################################################################################################
 
 ######################################################################################################
 ## Numpy, Pandas, Anndata functionality
 
+
 def adata_to_df(adata, layer=None):
     """
     Convert an AnnData object into a Pandas DataFrame.
@@ -142,8 +149,6 @@ def adata_to_df(adata, layer=None):
         pd.DataFrame: A DataFrame where rows are observations and columns are positions.
     """
     import pandas as pd
-    import anndata as ad
-    import numpy as np
 
     # Validate that the requested layer exists
     if layer and layer not in adata.layers:
@@ -153,28 +158,83 @@ def adata_to_df(adata, layer=None):
     data_matrix = adata.layers.get(layer, adata.X)
 
     # Ensure matrix is dense (handle sparse formats)
-    if hasattr(data_matrix, "toarray"):
+    if hasattr(data_matrix, "toarray"):
         data_matrix = data_matrix.toarray()
 
     # Ensure obs and var have unique indices
     if adata.obs.index.duplicated().any():
-        raise ValueError(
-
+        raise ValueError(
+            "Duplicate values found in `adata.obs.index`. Ensure unique observation indices."
+        )
+
     if adata.var.index.duplicated().any():
-        raise ValueError(
+        raise ValueError(
+            "Duplicate values found in `adata.var.index`. Ensure unique variable indices."
+        )
 
     # Convert to DataFrame
     df = pd.DataFrame(data_matrix, index=adata.obs.index, columns=adata.var.index)
 
     return df
 
+
 def save_matrix(matrix, save_name):
     """
     Input: A numpy matrix and a save_name
     Output: A txt file representation of the data matrix
     """
     import numpy as np
-
+
+    np.savetxt(f"{save_name}.txt", matrix)
+
+
+def _harmonize_var_schema(adatas: List[ad.AnnData]) -> None:
+    """
+    In-place:
+      - Make every AnnData.var have the *union* of columns.
+      - Normalize dtypes so columns can hold NaN and round-trip via HDF5:
+          * ints -> float64 (to support NaN)
+          * objects -> try numeric->float64, else pandas 'string'
+    """
+    import numpy as np
+
+    # 1) Union of all .var columns
+    all_cols = set()
+    for a in adatas:
+        all_cols.update(a.var.columns)
+    all_cols = list(all_cols)
+
+    # 2) Add any missing columns as float64 NaN
+    for a in adatas:
+        missing = [c for c in all_cols if c not in a.var.columns]
+        for c in missing:
+            a.var[c] = np.nan  # becomes float64 by default
+
+    # 3) Normalize dtypes per AnnData so concat doesn't create mixed/object columns
+    for a in adatas:
+        for c in a.var.columns:
+            s = a.var[c]
+            dt = s.dtype
+
+            # Integer/unsigned -> float64 (so NaN fits)
+            if dt.kind in ("i", "u"):
+                a.var[c] = s.astype("float64")
+                continue
+
+            # Object -> numeric if possible; else pandas 'string'
+            if dt == "O":
+                try:
+                    s_num = pd.to_numeric(s, errors="raise")
+                    a.var[c] = s_num.astype("float64")
+                except Exception:
+                    a.var[c] = s.astype("string")
+
+    # Optional: ensure consistent column order (sorted + stable)
+    # Not required, but can make diffs easier to read:
+    all_cols_sorted = sorted(all_cols)
+    for a in adatas:
+        a.var = a.var.reindex(columns=all_cols_sorted)
+
 
 def concatenate_h5ads(
     output_path: str | Path,
@@ -243,8 +303,7 @@ def concatenate_h5ads(
         # collect all *.h5ad / *.h5ad.gz (or whatever file_suffixes specify)
         suffixes_lower = tuple(s.lower() for s in file_suffixes)
         h5_paths = sorted(
-            p for p in input_dir.iterdir()
-            if p.is_file() and p.suffix.lower() in suffixes_lower
+            p for p in input_dir.iterdir() if p.is_file() and p.suffix.lower() in suffixes_lower
         )
 
     else:
@@ -255,9 +314,7 @@ def concatenate_h5ads(
 
         df = pd.read_csv(csv_path, dtype=str)
         if csv_column not in df.columns:
-            raise ValueError(
-                f"CSV {csv_path} must contain column '{csv_column}' with .h5ad paths."
-            )
+            raise ValueError(f"CSV {csv_path} must contain column '{csv_column}' with .h5ad paths.")
        paths = df[csv_column].dropna().astype(str).tolist()
        if not paths:
            raise ValueError(f"No non-empty paths in column '{csv_column}' of {csv_path}.")
@@ -280,27 +337,41 @@ def concatenate_h5ads(
     for p in h5_paths:
         print(f"  - {p}")
 
-
-
+    # Load all first so we can harmonize schemas before concat
+    loaded: List[ad.AnnData] = []
     for p in h5_paths:
         print(f"{time_string()}: Reading {p}")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        a, _ = safe_read_h5ad(p, restore_backups=restore_backups)
+        loaded.append(a)
+
+    # Critical: make every .var share the same columns + safe dtypes
+    _harmonize_var_schema(loaded)
+
+    print(f"{time_string()}: Concatenating {len(loaded)} AnnData objects")
+    final_adata = ad.concat(
+        loaded,
+        axis=0,  # stack observations
+        join="outer",  # keep union of variables
+        merge="unique",
+        uns_merge="unique",
+        index_unique=None,
+    )
+
+    # Defensive pass: ensure final var dtypes are write-safe
+    for c in final_adata.var.columns:
+        s = final_adata.var[c]
+        dt = s.dtype
+        if dt.kind in ("i", "u"):
+            final_adata.var[c] = s.astype("float64")
+        elif dt == "O":
+            try:
+                s_num = pd.to_numeric(s, errors="raise")
+                final_adata.var[c] = s_num.astype("float64")
+            except Exception:
+                final_adata.var[c] = s.astype("string")
 
-
-
+    # Let anndata write pandas StringArray reliably
+    ad.settings.allow_write_nullable_strings = True
 
     print(f"{time_string()}: Writing concatenated AnnData to {output_path}")
     safe_write_h5ad(final_adata, output_path, backup=restore_backups)
@@ -325,18 +396,21 @@ def concatenate_h5ads(
 
     return output_path
 
+
 def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=None, verbose=True):
     """
     Save an AnnData safely by sanitizing .obs, .var, .uns, .layers, and .obsm.
 
     Returns a report dict and prints a summary of what was converted/backed up/skipped.
     """
-    import
+    import json
+    import os
+    import pickle
     from pathlib import Path
+
+    import anndata as _ad
     import numpy as np
     import pandas as pd
-    import warnings
-    import anndata as _ad
 
     path = Path(path)
 
@@ -413,7 +487,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                     report["var_backed_up_columns"].append(col)
                 df[col] = ser.astype(str)
                 if verbose:
-                    print(
+                    print(
+                        f"  coerced categorical column '{which}.{col}' -> strings (backup={backup})"
+                    )
                 continue
 
             # object dtype handling: try to coerce each element to string
@@ -434,7 +510,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                         report["var_backed_up_columns"].append(col)
                     df[col] = ser.values.astype(str)
                     if verbose:
-                        print(
+                        print(
+                            f"  converted object column '{which}.{col}' -> strings (backup={backup})"
+                        )
                     if which == "obs":
                         report["obs_converted_columns"].append(col)
                     else:
@@ -457,7 +535,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                         report["var_backed_up_columns"].append(col)
                     df[col] = [json.dumps(v, default=str) for v in ser.values]
                     if verbose:
-                        print(
+                        print(
+                            f"  json-stringified object column '{which}.{col}' (backup={backup})"
+                        )
                     if which == "obs":
                         report["obs_converted_columns"].append(col)
                     else:
@@ -472,7 +552,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                     report["var_backed_up_columns"].append(col)
                 df[col] = ser.astype(str)
                 if verbose:
-                    print(
+                    print(
+                        f"  WARNING: column '{which}.{col}' was complex; coerced via str() (backed up)."
+                    )
                 if which == "obs":
                     report["obs_converted_columns"].append(col)
                 else:
@@ -499,7 +581,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                 _backup(v, f"uns_{k}_backup")
                 backed_up.append(k)
                 if verbose:
-                    print(
+                    print(
+                        f"  uns['{k}'] non-JSON -> stored '{k}_json' and backed up (backup={backup})"
+                    )
                 report["uns_json_keys"].append(k)
         except Exception:
             try:
@@ -534,7 +618,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                     arr_f = arr.astype(float)
                     cleaned[k] = arr_f
                     report_key = f"{which}.{k}"
-                    report["layers_converted"].append(
+                    report["layers_converted"].append(
+                        report_key
+                    ) if which == "layers" else report["obsm_converted"].append(report_key)
                     if verbose:
                         print(f"  {which}.{k} object array coerced to float.")
                 except Exception:
@@ -542,7 +628,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                         arr_i = arr.astype(int)
                         cleaned[k] = arr_i
                         report_key = f"{which}.{k}"
-                        report["layers_converted"].append(
+                        report["layers_converted"].append(
+                            report_key
+                        ) if which == "layers" else report["obsm_converted"].append(report_key)
                         if verbose:
                             print(f"  {which}.{k} object array coerced to int.")
                     except Exception:
@@ -553,7 +641,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                 else:
                     report["obsm_skipped"].append(k)
                     if verbose:
-                        print(
+                        print(
+                            f"  SKIPPING {which}.{k} (object dtype not numeric). Backed up: {backup}"
+                        )
                     continue
             else:
                 cleaned[k] = arr
@@ -638,7 +728,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
            X_to_use = np.zeros_like(X_arr, dtype=float)
            report["X_replaced_or_converted"] = "replaced_with_zeros_backup"
            if verbose:
-                print(
+                print(
+                    "adata.X had object dtype and couldn't be converted; replaced with zeros (backup set)."
+                )
    except Exception as e:
        msg = f"Error handling adata.X: {e}"
        report["errors"].append(msg)
@@ -722,9 +814,121 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
            print("  -", e)
 
        print("=== end report ===\n")
+
+    # ---------- create CSV output directory ----------
+    try:
+        csv_dir = path.parent / "csvs"
+        csv_dir.mkdir(exist_ok=True)
+        if verbose:
+            print(f"CSV outputs will be written to: {csv_dir}")
+    except Exception as e:
+        msg = f"Failed to create CSV output directory: {e}"
+        report["errors"].append(msg)
+        if verbose:
+            print(msg)
+        csv_dir = path.parent  # fallback just in case
+
+    # ---------- write keys summary CSV ----------
+    try:
+        meta_rows = []
+
+        # obs columns
+        for col in adata_copy.obs.columns:
+            meta_rows.append(
+                {
+                    "kind": "obs",
+                    "name": col,
+                    "dtype": str(adata_copy.obs[col].dtype),
+                }
+            )
+
+        # var columns
+        for col in adata_copy.var.columns:
+            meta_rows.append(
+                {
+                    "kind": "var",
+                    "name": col,
+                    "dtype": str(adata_copy.var[col].dtype),
+                }
+            )
+
+        # layers
+        for k, v in adata_copy.layers.items():
+            meta_rows.append(
+                {
+                    "kind": "layer",
+                    "name": k,
+                    "dtype": str(np.asarray(v).dtype),
+                }
+            )
+
+        # obsm
+        for k, v in adata_copy.obsm.items():
+            meta_rows.append(
+                {
+                    "kind": "obsm",
+                    "name": k,
+                    "dtype": str(np.asarray(v).dtype),
+                }
+            )
+
+        # uns
+        for k, v in adata_copy.uns.items():
+            meta_rows.append(
+                {
+                    "kind": "uns",
+                    "name": k,
+                    "dtype": type(v).__name__,
+                }
+            )
+
+        meta_df = pd.DataFrame(meta_rows)
+
+        # same base name, inside csvs/
+        base = path.stem  # removes .h5ad
+        meta_path = csv_dir / f"{base}.keys.csv"
+
+        meta_df.to_csv(meta_path, index=False)
+        if verbose:
+            print(f"Wrote keys summary CSV to {meta_path}")
+
+    except Exception as e:
+        msg = f"Failed to write keys CSV: {e}"
+        report["errors"].append(msg)
+        if verbose:
+            print(msg)
+
+    # ---------- write full obs and var dataframes ----------
+    try:
+        base = path.stem
+
+        obs_path = csv_dir / f"{base}.obs.csv"
+        var_path = csv_dir / f"{base}.var.csv"
+
+        adata_copy.obs.to_csv(obs_path, index=True)
+        adata_copy.var.to_csv(var_path, index=True)
+
+        if verbose:
+            print(f"Wrote obs DataFrame to {obs_path}")
+            print(f"Wrote var DataFrame to {var_path}")
+
+    except Exception as e:
+        msg = f"Failed to write obs/var CSVs: {e}"
+        report["errors"].append(msg)
+        if verbose:
+            print(msg)
+
    return report
 
-
+
+def safe_read_h5ad(
+    path,
+    backup_dir=None,
+    restore_backups=True,
+    re_categorize=True,
+    categorical_threshold=100,
+    verbose=True,
+):
     """
     Safely load an AnnData saved by safe_write_h5ad and attempt to restore complex objects
     from the backup_dir produced during save.
@@ -752,13 +956,14 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
     report : dict
         A report describing restored items, parsed JSON keys, and any failures.
     """
-    import os
-    from pathlib import Path
     import json
+    import os
     import pickle
+    from pathlib import Path
+
+    import anndata as _ad
     import numpy as np
     import pandas as pd
-    import anndata as _ad
 
     path = Path(path)
 
@@ -837,7 +1042,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                 if hasattr(val, "shape") and (len(val) == adata.shape[0]):
                     adata.obs[col] = pd.Series(val, index=adata.obs.index)
                 else:
-                    adata.obs[col] = pd.Series(
+                    adata.obs[col] = pd.Series(
+                        [val] * adata.shape[0], index=adata.obs.index
+                    )
                 report["restored_obs_columns"].append((col, bname1))
                 restored = True
                 if verbose:
@@ -852,7 +1059,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                 looks_like_json = False
                 for sv in sample_vals:
                     svs = sv.strip()
-                    if (svs.startswith("{") and svs.endswith("}")) or (
+                    if (svs.startswith("{") and svs.endswith("}")) or (
+                        svs.startswith("[") and svs.endswith("]")
+                    ):
                         looks_like_json = True
                         break
                 if looks_like_json:
@@ -870,7 +1079,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                         report["restored_obs_columns"].append((col, "parsed_json"))
                         restored = True
                         if verbose:
-                            print(
+                            print(
+                                f"[safe_read_h5ad] parsed obs.{col} JSON strings back to Python objects"
+                            )
 
        # If still not restored and re_categorize=True, try to convert small unique string columns back to categorical
        if (not restored) and re_categorize and adata.obs[col].dtype == object:
@@ -881,7 +1092,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                    adata.obs[col] = adata.obs[col].astype(str).astype("category")
                    report["recategorized_obs"].append(col)
                    if verbose:
-                        print(
+                        print(
+                            f"[safe_read_h5ad] recast obs.{col} -> categorical (n_unique={nunique})"
+                        )
            except Exception as e:
                report["errors"].append(f"Failed to recategorize obs.{col}: {e}")
 
@@ -913,7 +1126,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                if hasattr(val, "shape") and (len(val) == adata.shape[1]):
                    adata.var[col] = pd.Series(val, index=adata.var.index)
                else:
-                    adata.var[col] = pd.Series(
+                    adata.var[col] = pd.Series(
+                        [val] * adata.shape[1], index=adata.var.index
+                    )
                report["restored_var_columns"].append((col, bname1))
                restored = True
                if verbose:
@@ -927,7 +1142,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                looks_like_json = False
                for sv in sample_vals:
                    svs = sv.strip()
-                    if (svs.startswith("{") and svs.endswith("}")) or (
+                    if (svs.startswith("{") and svs.endswith("}")) or (
+                        svs.startswith("[") and svs.endswith("]")
+                    ):
                        looks_like_json = True
                        break
                if looks_like_json:
@@ -943,7 +1160,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                        adata.var[col] = pd.Series(parsed, index=adata.var.index)
                        report["restored_var_columns"].append((col, "parsed_json"))
                        if verbose:
-                            print(
+                            print(
+                                f"[safe_read_h5ad] parsed var.{col} JSON strings back to Python objects"
+                            )
 
        if (not restored) and re_categorize and adata.var[col].dtype == object:
            try:
@@ -952,7 +1171,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                adata.var[col] = adata.var[col].astype(str).astype("category")
                report["recategorized_var"].append(col)
                if verbose:
-                    print(
+                    print(
+                        f"[safe_read_h5ad] recast var.{col} -> categorical (n_unique={nunique})"
+                    )
            except Exception as e:
                report["errors"].append(f"Failed to recategorize var.{col}: {e}")
 
@@ -984,7 +1205,7 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
            if not fname.startswith("uns_") or not fname.endswith("_backup.pkl"):
                continue
            # fname example: "uns_clustermap_results_backup.pkl" -> key name between 'uns_' and '_backup.pkl'
-            key = fname[len("uns_")
+            key = fname[len("uns_") : -len("_backup.pkl")]
            full = os.path.join(backup_dir, fname)
            val = _load_pickle_if_exists(full)
            if val is not None:
@@ -998,7 +1219,7 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
        if os.path.isdir(backup_dir):
            for fname in os.listdir(backup_dir):
                if fname.startswith("layers_") and fname.endswith("_backup.pkl"):
-                    layer_name = fname[len("layers_")
+                    layer_name = fname[len("layers_") : -len("_backup.pkl")]
                    full = os.path.join(backup_dir, fname)
                    val = _load_pickle_if_exists(full)
                    if val is not None:
@@ -1008,10 +1229,12 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                        if verbose:
                            print(f"[safe_read_h5ad] restored layers['{layer_name}'] from {full}")
                    except Exception as e:
-                        report["errors"].append(
+                        report["errors"].append(
+                            f"Failed to restore layers['{layer_name}'] from {full}: {e}"
+                        )
 
                if fname.startswith("obsm_") and fname.endswith("_backup.pkl"):
-                    obsm_name = fname[len("obsm_")
+                    obsm_name = fname[len("obsm_") : -len("_backup.pkl")]
                    full = os.path.join(backup_dir, fname)
                    val = _load_pickle_if_exists(full)
                    if val is not None:
@@ -1021,7 +1244,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                        if verbose:
                            print(f"[safe_read_h5ad] restored obsm['{obsm_name}'] from {full}")
                    except Exception as e:
-                        report["errors"].append(
+                        report["errors"].append(
+                            f"Failed to restore obsm['{obsm_name}'] from {full}: {e}"
+                        )
 
    # 6) If restore_backups True but some expected backups missing, note them
    if restore_backups and os.path.isdir(backup_dir):
@@ -1051,7 +1276,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
    if expected_missing and verbose:
        n = len(expected_missing)
        if verbose:
-            print(
+            print(
+                f"[safe_read_h5ad] note: {n} obs/var object columns may not have backups; check if their content is acceptable."
+            )
        # add to report
        report["missing_backups"].extend(expected_missing)
 
@@ -1071,9 +1298,16 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
        if report["restored_obsm"]:
            print("Restored obsm:", report["restored_obsm"])
        if report["recategorized_obs"] or report["recategorized_var"]:
-            print(
+            print(
+                "Recategorized columns (obs/var):",
+                report["recategorized_obs"],
+                report["recategorized_var"],
+            )
        if report["missing_backups"]:
-            print(
+            print(
+                "Missing backups or object columns without backups (investigate):",
+                report["missing_backups"],
+            )
        if report["errors"]:
            print("Errors encountered (see report['errors']):")
            for e in report["errors"]:
@@ -1082,9 +1316,10 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
 
    return adata, report
 
+
 def merge_barcoded_anndatas_core(adata_single, adata_double):
-    import numpy as np
    import anndata as ad
+    import numpy as np
 
    # Step 1: Identify overlap
    overlap = np.intersect1d(adata_single.obs_names, adata_double.obs_names)
@@ -1093,24 +1328,25 @@ def merge_barcoded_anndatas_core(adata_single, adata_double):
    adata_single_filtered = adata_single[~adata_single.obs_names.isin(overlap)].copy()
 
    # Step 3: Add source tag
-    adata_single_filtered.obs[
-    adata_double.obs[
+    adata_single_filtered.obs["source"] = "single_barcode"
+    adata_double.obs["source"] = "double_barcode"
 
    # Step 4: Concatenate all components
-    adata_merged = ad.concat(
-        adata_single_filtered,
-
-    ], join='outer', merge='same')  # merge='same' preserves matching layers, obsm, etc.
+    adata_merged = ad.concat(
+        [adata_single_filtered, adata_double], join="outer", merge="same"
+    )  # merge='same' preserves matching layers, obsm, etc.
 
    # Step 5: Merge `.uns`
    adata_merged.uns = {**adata_single.uns, **adata_double.uns}
 
    return adata_merged
+
+
 ######################################################################################################
 
 ### File conversion misc ###
-
-
+
+
 def genbank_to_gff(genbank_file, output_file, record_id):
    with open(output_file, "w") as out:
        for record in SeqIO.parse(genbank_file, "genbank"):
@@ -1126,5 +1362,18 @@ def genbank_to_gff(genbank_file, output_file, record_id):
                # Format attributes
                attributes = ";".join(f"{k}={v}" for k, v in feature.qualifiers.items())
                # Write GFF3 line
-                gff3_line = "\t".join(
-
+                gff3_line = "\t".join(
+                    str(x)
+                    for x in [
+                        record_id,
+                        feature.type,
+                        feature_type,
+                        start,
+                        end,
+                        ".",
+                        strand,
+                        ".",
+                        attributes,
+                    ]
+                )
+                out.write(gff3_line + "\n")