DAJIN2 0.4.3__zip → 0.4.4__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. {DAJIN2-0.4.3/src/DAJIN2.egg-info → dajin2-0.4.4}/PKG-INFO +15 -50
  2. {DAJIN2-0.4.3 → dajin2-0.4.4}/README.md +14 -49
  3. {DAJIN2-0.4.3 → dajin2-0.4.4}/setup.py +1 -1
  4. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/clustering/clustering.py +11 -10
  5. dajin2-0.4.4/src/DAJIN2/core/clustering/strand_bias_handler.py +115 -0
  6. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/mutation_extractor.py +7 -7
  7. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/main.py +20 -20
  8. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/utils/io.py +8 -6
  9. {DAJIN2-0.4.3 → dajin2-0.4.4/src/DAJIN2.egg-info}/PKG-INFO +15 -50
  10. DAJIN2-0.4.3/src/DAJIN2/core/clustering/strand_bias_handler.py +0 -113
  11. {DAJIN2-0.4.3 → dajin2-0.4.4}/LICENSE +0 -0
  12. {DAJIN2-0.4.3 → dajin2-0.4.4}/MANIFEST.in +0 -0
  13. {DAJIN2-0.4.3 → dajin2-0.4.4}/requirements.txt +0 -0
  14. {DAJIN2-0.4.3 → dajin2-0.4.4}/setup.cfg +0 -0
  15. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/__init__.py +0 -0
  16. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/__init__.py +0 -0
  17. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/classification/__init__.py +0 -0
  18. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/classification/allele_merger.py +0 -0
  19. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/classification/classifier.py +0 -0
  20. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/clustering/__init__.py +0 -0
  21. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/clustering/appender.py +0 -0
  22. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/clustering/kmer_generator.py +0 -0
  23. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/clustering/label_extractor.py +0 -0
  24. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/clustering/label_merger.py +0 -0
  25. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/clustering/label_updator.py +0 -0
  26. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/clustering/score_handler.py +0 -0
  27. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/consensus/__init__.py +0 -0
  28. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/consensus/clust_formatter.py +0 -0
  29. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/consensus/consensus.py +0 -0
  30. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/consensus/mutation_extractor.py +0 -0
  31. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/consensus/name_handler.py +0 -0
  32. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/consensus/similarity_searcher.py +0 -0
  33. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/core.py +0 -0
  34. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/__init__.py +0 -0
  35. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/cache_checker.py +0 -0
  36. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/directory_manager.py +0 -0
  37. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/genome_fetcher.py +0 -0
  38. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/homopolymer_handler.py +0 -0
  39. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/input_formatter.py +0 -0
  40. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/insertions_to_fasta.py +0 -0
  41. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/knockin_handler.py +0 -0
  42. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/mapping.py +0 -0
  43. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/preprocess/midsv_caller.py +0 -0
  44. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/report/__init__.py +0 -0
  45. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/report/bam_exporter.py +0 -0
  46. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/report/insertion_reflector.py +0 -0
  47. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/report/mutation_exporter.py +0 -0
  48. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/core/report/sequence_exporter.py +0 -0
  49. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/gui.py +0 -0
  50. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/static/css/style.css +0 -0
  51. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/template_igvjs.html +0 -0
  52. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/templates/index.html +0 -0
  53. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/utils/config.py +0 -0
  54. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/utils/cssplits_handler.py +0 -0
  55. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/utils/dna_handler.py +0 -0
  56. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/utils/fastx_handler.py +0 -0
  57. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/utils/input_validator.py +0 -0
  58. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/utils/multiprocess.py +0 -0
  59. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/utils/report_generator.py +0 -0
  60. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/utils/sam_handler.py +0 -0
  61. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2/view.py +0 -0
  62. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2.egg-info/SOURCES.txt +0 -0
  63. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2.egg-info/dependency_links.txt +0 -0
  64. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2.egg-info/entry_points.txt +0 -0
  65. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2.egg-info/requires.txt +0 -0
  66. {DAJIN2-0.4.3 → dajin2-0.4.4}/src/DAJIN2.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: DAJIN2
3
- Version: 0.4.3
3
+ Version: 0.4.4
4
4
  Summary: One-step genotyping tools for targeted long-read sequencing
5
5
  Home-page: https://github.com/akikuno/DAJIN2
6
6
  Author: Akihiro Kuno
@@ -166,7 +166,7 @@ Options:
166
166
 
167
167
  ```bash
168
168
  # Download example dataset
169
- wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_single.tar.gz
169
+ curl -LJO https://github.com/akikuno/DAJIN2/raw/main/examples/example_single.tar.gz
170
170
  tar -xf example_single.tar.gz
171
171
 
172
172
  # Run DAJIN2
@@ -230,48 +230,13 @@ options:
230
230
 
231
231
  ```bash
232
232
  # Donwload the example dataset
233
- wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
233
+ curl -LJO https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
234
234
  tar -xf example_batch.tar.gz
235
235
 
236
236
  # Run DAJIN2
237
237
  DAJIN2 batch --file example_batch/batch.csv --threads 4
238
238
  ```
239
239
 
240
- <!-- ```bash
241
- # Donwload the example dataset
242
- wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
243
- tar -xf example_batch.tar.gz
244
-
245
- # Run DAJIN2
246
- DAJIN2 batch --file example-batch/batch.csv --threads 3
247
-
248
- # 2023-07-31 17:01:10: example-batch/tyr_control.fq.gz is now processing...
249
- # 2023-07-31 17:01:16: Preprocess example-batch/tyr_control.fq.gz...
250
- # 2023-07-31 17:01:48: Output BAM files of example-batch/tyr_control.fq.gz...
251
- # 2023-07-31 17:01:52: 🍵 example-batch/tyr_control.fq.gz is finished!
252
- # 2023-07-31 17:01:52: example-batch/tyr_c230gt_50%.fq.gz is now processing...
253
- # 2023-07-31 17:01:52: example-batch/tyr_c230gt_10%.fq.gz is now processing...
254
- # 2023-07-31 17:01:52: example-batch/tyr_c230gt_01%.fq.gz is now processing...
255
- # 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_01%.fq.gz...
256
- # 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_50%.fq.gz...
257
- # 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_10%.fq.gz...
258
- # 2023-07-31 17:02:17: Classify example-batch/tyr_c230gt_50%.fq.gz...
259
- # 2023-07-31 17:02:19: Clustering example-batch/tyr_c230gt_50%.fq.gz...
260
- # 2023-07-31 17:02:34: Classify example-batch/tyr_c230gt_01%.fq.gz...
261
- # 2023-07-31 17:02:35: Classify example-batch/tyr_c230gt_10%.fq.gz...
262
- # 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_01%.fq.gz...
263
- # 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_10%.fq.gz...
264
- # 2023-07-31 17:02:53: Consensus calling of example-batch/tyr_c230gt_50%.fq.gz...
265
- # 2023-07-31 17:02:59: Output reports of example-batch/tyr_c230gt_50%.fq.gz...
266
- # 2023-07-31 17:03:04: 🍵 example-batch/tyr_c230gt_50%.fq.gz is finished!
267
- # 2023-07-31 17:03:39: Consensus calling of example-batch/tyr_c230gt_01%.fq.gz...
268
- # 2023-07-31 17:03:51: Output reports of example-batch/tyr_c230gt_01%.fq.gz...
269
- # 2023-07-31 17:04:03: 🍵 example-batch/tyr_c230gt_01%.fq.gz is finished!
270
- # 2023-07-31 17:04:08: Consensus calling of example-batch/tyr_c230gt_10%.fq.gz...
271
- # 2023-07-31 17:04:16: Output reports of example-batch/tyr_c230gt_10%.fq.gz...
272
- # 2023-07-31 17:04:24: 🍵 example-batch/tyr_c230gt_10%.fq.gz is finished!
273
- # 🎉 Finished! Open DAJIN_Results/tyr-substitution to see the report.
274
- ``` -->
275
240
 
276
241
  ## 📈 Report Contents
277
242
 
@@ -281,22 +246,22 @@ Inside the **DAJIN_Results** directory, the following files can be found:
281
246
  ```
282
247
  DAJIN_Results/tyr-substitution
283
248
  ├── BAM
284
- │ ├── tyr_c230gt_01%
285
- │ ├── tyr_c230gt_10%
286
- │ ├── tyr_c230gt_50%
249
+ │ ├── tyr_c230gt_01
250
+ │ ├── tyr_c230gt_10
251
+ │ ├── tyr_c230gt_50
287
252
  │ └── tyr_control
288
253
  ├── FASTA
289
- │ ├── tyr_c230gt_01%
290
- │ ├── tyr_c230gt_10%
291
- │ └── tyr_c230gt_50%
254
+ │ ├── tyr_c230gt_01
255
+ │ ├── tyr_c230gt_10
256
+ │ └── tyr_c230gt_50
292
257
  ├── HTML
293
- │ ├── tyr_c230gt_01%
294
- │ ├── tyr_c230gt_10%
295
- │ └── tyr_c230gt_50%
258
+ │ ├── tyr_c230gt_01
259
+ │ ├── tyr_c230gt_10
260
+ │ └── tyr_c230gt_50
296
261
  ├── MUTATION_INFO
297
- │ ├── tyr_c230gt_01%.csv
298
- │ ├── tyr_c230gt_10%.csv
299
- │ └── tyr_c230gt_50%.csv
262
+ │ ├── tyr_c230gt_01.csv
263
+ │ ├── tyr_c230gt_10.csv
264
+ │ └── tyr_c230gt_50.csv
300
265
  ├── read_plot.html
301
266
  ├── read_plot.pdf
302
267
  └── read_summary.xlsx
@@ -133,7 +133,7 @@ Options:
133
133
 
134
134
  ```bash
135
135
  # Download example dataset
136
- wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_single.tar.gz
136
+ curl -LJO https://github.com/akikuno/DAJIN2/raw/main/examples/example_single.tar.gz
137
137
  tar -xf example_single.tar.gz
138
138
 
139
139
  # Run DAJIN2
@@ -197,48 +197,13 @@ options:
197
197
 
198
198
  ```bash
199
199
  # Donwload the example dataset
200
- wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
200
+ curl -LJO https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
201
201
  tar -xf example_batch.tar.gz
202
202
 
203
203
  # Run DAJIN2
204
204
  DAJIN2 batch --file example_batch/batch.csv --threads 4
205
205
  ```
206
206
 
207
- <!-- ```bash
208
- # Donwload the example dataset
209
- wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
210
- tar -xf example_batch.tar.gz
211
-
212
- # Run DAJIN2
213
- DAJIN2 batch --file example-batch/batch.csv --threads 3
214
-
215
- # 2023-07-31 17:01:10: example-batch/tyr_control.fq.gz is now processing...
216
- # 2023-07-31 17:01:16: Preprocess example-batch/tyr_control.fq.gz...
217
- # 2023-07-31 17:01:48: Output BAM files of example-batch/tyr_control.fq.gz...
218
- # 2023-07-31 17:01:52: 🍵 example-batch/tyr_control.fq.gz is finished!
219
- # 2023-07-31 17:01:52: example-batch/tyr_c230gt_50%.fq.gz is now processing...
220
- # 2023-07-31 17:01:52: example-batch/tyr_c230gt_10%.fq.gz is now processing...
221
- # 2023-07-31 17:01:52: example-batch/tyr_c230gt_01%.fq.gz is now processing...
222
- # 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_01%.fq.gz...
223
- # 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_50%.fq.gz...
224
- # 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_10%.fq.gz...
225
- # 2023-07-31 17:02:17: Classify example-batch/tyr_c230gt_50%.fq.gz...
226
- # 2023-07-31 17:02:19: Clustering example-batch/tyr_c230gt_50%.fq.gz...
227
- # 2023-07-31 17:02:34: Classify example-batch/tyr_c230gt_01%.fq.gz...
228
- # 2023-07-31 17:02:35: Classify example-batch/tyr_c230gt_10%.fq.gz...
229
- # 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_01%.fq.gz...
230
- # 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_10%.fq.gz...
231
- # 2023-07-31 17:02:53: Consensus calling of example-batch/tyr_c230gt_50%.fq.gz...
232
- # 2023-07-31 17:02:59: Output reports of example-batch/tyr_c230gt_50%.fq.gz...
233
- # 2023-07-31 17:03:04: 🍵 example-batch/tyr_c230gt_50%.fq.gz is finished!
234
- # 2023-07-31 17:03:39: Consensus calling of example-batch/tyr_c230gt_01%.fq.gz...
235
- # 2023-07-31 17:03:51: Output reports of example-batch/tyr_c230gt_01%.fq.gz...
236
- # 2023-07-31 17:04:03: 🍵 example-batch/tyr_c230gt_01%.fq.gz is finished!
237
- # 2023-07-31 17:04:08: Consensus calling of example-batch/tyr_c230gt_10%.fq.gz...
238
- # 2023-07-31 17:04:16: Output reports of example-batch/tyr_c230gt_10%.fq.gz...
239
- # 2023-07-31 17:04:24: 🍵 example-batch/tyr_c230gt_10%.fq.gz is finished!
240
- # 🎉 Finished! Open DAJIN_Results/tyr-substitution to see the report.
241
- ``` -->
242
207
 
243
208
  ## 📈 Report Contents
244
209
 
@@ -248,22 +213,22 @@ Inside the **DAJIN_Results** directory, the following files can be found:
248
213
  ```
249
214
  DAJIN_Results/tyr-substitution
250
215
  ├── BAM
251
- │ ├── tyr_c230gt_01%
252
- │ ├── tyr_c230gt_10%
253
- │ ├── tyr_c230gt_50%
216
+ │ ├── tyr_c230gt_01
217
+ │ ├── tyr_c230gt_10
218
+ │ ├── tyr_c230gt_50
254
219
  │ └── tyr_control
255
220
  ├── FASTA
256
- │ ├── tyr_c230gt_01%
257
- │ ├── tyr_c230gt_10%
258
- │ └── tyr_c230gt_50%
221
+ │ ├── tyr_c230gt_01
222
+ │ ├── tyr_c230gt_10
223
+ │ └── tyr_c230gt_50
259
224
  ├── HTML
260
- │ ├── tyr_c230gt_01%
261
- │ ├── tyr_c230gt_10%
262
- │ └── tyr_c230gt_50%
225
+ │ ├── tyr_c230gt_01
226
+ │ ├── tyr_c230gt_10
227
+ │ └── tyr_c230gt_50
263
228
  ├── MUTATION_INFO
264
- │ ├── tyr_c230gt_01%.csv
265
- │ ├── tyr_c230gt_10%.csv
266
- │ └── tyr_c230gt_50%.csv
229
+ │ ├── tyr_c230gt_01.csv
230
+ │ ├── tyr_c230gt_10.csv
231
+ │ └── tyr_c230gt_50.csv
267
232
  ├── read_plot.html
268
233
  ├── read_plot.pdf
269
234
  └── read_summary.xlsx
@@ -9,7 +9,7 @@ with open("requirements.txt") as requirements_file:
9
9
 
10
10
  setuptools.setup(
11
11
  name="DAJIN2",
12
- version="0.4.3",
12
+ version="0.4.4",
13
13
  author="Akihiro Kuno",
14
14
  author_email="akuno@md.tsukuba.ac.jp",
15
15
  description="One-step genotyping tools for targeted long-read sequencing",
@@ -39,17 +39,16 @@ def optimize_labels(X: spmatrix, coverage_sample: int, coverage_control: int) ->
39
39
  # print(i, Counter(labels_sample), Counter(labels_control), Counter(labels_current)) # ! DEBUG
40
40
 
41
41
  num_labels_control = count_number_of_clusters(labels_control, coverage_control)
42
- mutual_info = metrics.adjusted_rand_score(labels_previous, labels_current)
42
+ rand_index = metrics.adjusted_rand_score(labels_previous, labels_current)
43
43
 
44
44
  """
45
45
  Return the number of clusters when:
46
- - the number of clusters in control is split into more than one.
47
- - the mutual information between the current and previous labels is high enough (= similar).
46
+ - the number of clusters in control is split into more than one.
47
+ - the mutual information between the current and previous labels is high enough (= similar).
48
+ To reduce the allele number, previous labels are returned.
48
49
  """
49
- if num_labels_control >= 2:
50
+ if num_labels_control >= 2 or rand_index >= 0.95:
50
51
  return labels_previous
51
- if 0.95 <= mutual_info <= 1.0:
52
- return labels_current
53
52
  labels_previous = labels_current
54
53
  return labels_previous
55
54
 
@@ -58,11 +57,13 @@ def get_label_most_common(labels: list[int]) -> int:
58
57
  return Counter(labels).most_common()[0][0]
59
58
 
60
59
 
61
- def return_labels(path_score_sample: Path, path_score_control: Path, path_sample: Path, strand_bias: bool) -> list[int]:
60
+ def return_labels(
61
+ path_score_sample: Path, path_score_control: Path, path_sample: Path, strand_bias_in_control: bool
62
+ ) -> list[int]:
62
63
  np.random.seed(seed=1)
63
64
  score_control = list(io.read_jsonl(path_score_control))
64
65
  X_control = csr_matrix(score_control)
65
- # subset to 1000 reads of controls in the most common cluster to remove outliers and reduce computation time
66
+ """Subset to 1000 reads of controls in the most common cluster to remove outliers and reduce computation time"""
66
67
  labels_control = BisectingKMeans(n_clusters=2, random_state=1).fit_predict(X_control)
67
68
  label_most_common = get_label_most_common(labels_control)
68
69
  scores_control_subset = subset_scores(labels_control, io.read_jsonl(path_score_control), label_most_common, 1000)
@@ -71,7 +72,7 @@ def return_labels(path_score_sample: Path, path_score_control: Path, path_sample
71
72
  coverage_sample = io.count_newlines(path_score_sample)
72
73
  coverage_control = len(scores_control_subset)
73
74
  labels = optimize_labels(X, coverage_sample, coverage_control)
74
- # correct clusters with strand bias
75
- if strand_bias is False:
75
+ """Re-allocate clusters with strand bias to clusters without strand bias"""
76
+ if strand_bias_in_control is False:
76
77
  labels = remove_biased_clusters(path_sample, path_score_sample, labels)
77
78
  return labels
@@ -0,0 +1,115 @@
1
+ from __future__ import annotations
2
+
3
+ """
4
+ Nanopore sequencing results often results in strand specific mutations even though the mutation is not strand specific, thus they are considered as sequencing errors and should be removed.
5
+
6
+ This module provides functions to determine whether each allele obtained after clustering is formed due to sequencing errors caused by strand bias.
7
+
8
+ Re-allocates reads belonging to clusters with strand bias to clusters without strand bias.
9
+ """
10
+
11
+ from pathlib import Path
12
+ from collections import defaultdict
13
+ from sklearn.tree import DecisionTreeClassifier
14
+
15
+ from DAJIN2.utils import io
16
+
17
+ # Constants
18
+ STRAND_BIAS_LOWER_LIMIT = 0.1
19
+ STRAND_BIAS_UPPER_LIMIT = 0.9
20
+
21
+
22
+ def is_strand_bias(path_control: Path) -> bool:
23
+ """
24
+ Determines whether there is a strand bias in sequencing data
25
+ based on the distribution of '+' and '-' strands.
26
+ """
27
+ count_strand = defaultdict(int)
28
+ for sample in io.read_jsonl(path_control):
29
+ count_strand[sample["STRAND"]] += 1
30
+
31
+ total = count_strand["+"] + count_strand["-"]
32
+ percentage_plus = count_strand["+"] / total if total > 0 else 0
33
+
34
+ return not (STRAND_BIAS_LOWER_LIMIT < percentage_plus < STRAND_BIAS_UPPER_LIMIT)
35
+
36
+
37
+ ###############################################################################
38
+ # Handle Strand bias
39
+ # # Clusters of reads with mutations with strand bias are merged into similar clusters without strand bias
40
+ ###############################################################################
41
+
42
+
43
+ def count_strand(labels: list[int], samples: list[dict[str, str]]) -> tuple[dict[str, int], dict[str, int]]:
44
+ """Count the occurrences of each strand type by label."""
45
+ positive_strand_counts_by_labels = defaultdict(int)
46
+ total_counts_by_labels = defaultdict(int)
47
+
48
+ for label, sample in zip(labels, samples):
49
+ total_counts_by_labels[label] += 1
50
+ if sample["STRAND"] == "+":
51
+ positive_strand_counts_by_labels[label] += 1
52
+
53
+ return dict(positive_strand_counts_by_labels), dict(total_counts_by_labels)
54
+
55
+
56
+ def determine_strand_biases(
57
+ positive_strand_counts_by_labels: defaultdict, total_counts_by_labels: defaultdict
58
+ ) -> dict[int, bool]:
59
+ """Determine strand biases based on positive strand counts."""
60
+ strand_biases = {}
61
+ for label, total in total_counts_by_labels.items():
62
+ positive_strand_count = positive_strand_counts_by_labels[label]
63
+ strand_ratio = positive_strand_count / total
64
+ strand_biases[label] = not (STRAND_BIAS_LOWER_LIMIT < strand_ratio < STRAND_BIAS_UPPER_LIMIT)
65
+
66
+ return strand_biases
67
+
68
+
69
+ def prepare_training_testing_sets(labels, scores, strand_biases) -> tuple[list, list, list]:
70
+ """Prepare training and testing datasets based on strand biases."""
71
+ train_data, train_labels, test_data = [], [], []
72
+ for label, score in zip(labels, scores):
73
+ if strand_biases[label]:
74
+ test_data.append(score)
75
+ else:
76
+ train_data.append(score)
77
+ train_labels.append(label)
78
+ return train_data, train_labels, test_data
79
+
80
+
81
+ def train_decision_tree(train_data, train_labels) -> DecisionTreeClassifier:
82
+ """Train a decision tree classifier using the provided features and labels."""
83
+ dtree = DecisionTreeClassifier(random_state=1)
84
+ dtree.fit(train_data, train_labels)
85
+ return dtree
86
+
87
+
88
+ def allocate_labels(labels: list[int], strand_biases: dict[str, bool], dtree, test_data) -> list[int]:
89
+ """Re-allocates reads belonging to clusters with strand bias to clusters without strand bias."""
90
+ label_predictions = iter(dtree.predict(test_data))
91
+ for i, label in enumerate(labels):
92
+ if strand_biases[label]:
93
+ labels[i] = next(label_predictions)
94
+ return labels
95
+
96
+
97
+ def remove_biased_clusters(path_sample: Path, path_score_sample: Path, labels: list[int]) -> list[int]:
98
+ """Remove clusters with strand bias by re-labeling based on decision tree predictions.
99
+ Continue until at least one of the samples exhibits strand bias (i.e., do not calculate if all samples exhibit strand bias, or conversely, if none of the samples exhibit strand bias) or
100
+ 1000 iterations are reached, which serves as a safeguard to prevent infinite loops.
101
+ """
102
+ samples = io.read_jsonl(path_sample)
103
+ positive_strand_counts_by_labels, total_counts_by_labels = count_strand(labels, samples)
104
+ strand_biases = determine_strand_biases(positive_strand_counts_by_labels, total_counts_by_labels)
105
+
106
+ iteration_count = 0
107
+ labels_corrected = labels
108
+ while len(set(strand_biases.values())) > 1 or iteration_count < 1000:
109
+ scores = io.read_jsonl(path_score_sample)
110
+ train_data, train_labels, test_data = prepare_training_testing_sets(labels, scores, strand_biases)
111
+ dtree = train_decision_tree(train_data, train_labels)
112
+ labels_corrected = allocate_labels(labels, strand_biases, dtree, test_data)
113
+ strand_biases = determine_strand_biases(labels_corrected, path_sample)
114
+ iteration_count += 1
115
+ return labels_corrected
@@ -89,13 +89,13 @@ def cosine_similarity(x, y):
89
89
 
90
90
 
91
91
  def identify_dissimilar_loci(values_sample, values_control, index: int, is_consensus: bool = False) -> int:
92
- # If 'sample' has more than X% variation compared to 'control', unconditionally set it to "dissimilar loci"
93
- threshold = 20 if is_consensus else 5
94
- if values_sample[index] - values_control[index] > threshold:
92
+ # If 'sample' has more than 20% variation compared to 'control' in consensus mode, unconditionally set it to 'dissimilar loci'. This is set to counteract cases where, when evaluating cosine similarity during significant deletions, values exceedingly close to 1 can occur even if not observed in the control (e.g., control = [1,1,1,1,1], sample = [100,100,100,100,100] -> cosine similarity = 1).
93
+ if is_consensus and values_sample[index] - values_control[index] > 20:
95
94
  return True
96
95
 
97
- x = values_sample[index - 5 : index + 6]
98
- y = values_control[index - 5 : index + 6]
96
+ # Subset 10 bases around index and add 1e-6 to avoid division by zero when calculating cosine similarity.
97
+ x = np.array(values_sample[index - 5 : index + 6]) + 1e-6
98
+ y = np.array(values_control[index - 5 : index + 6]) + 1e-6
99
99
 
100
100
  return cosine_similarity(x, y) < 0.95
101
101
 
@@ -109,8 +109,8 @@ def detect_anomalies(values_sample, values_control, threshold: float, is_consens
109
109
 
110
110
  values_subtract_reshaped = values_subtract.reshape(-1, 1)
111
111
  kmeans = MiniBatchKMeans(n_clusters=2, random_state=0, n_init="auto").fit(values_subtract_reshaped)
112
- threshold = kmeans.cluster_centers_.mean()
113
- candidate_loci = {i for i, v in enumerate(values_subtract_reshaped) if v > threshold}
112
+ threshold_kmeans = kmeans.cluster_centers_.mean()
113
+ candidate_loci = {i for i, v in enumerate(values_subtract_reshaped) if v > threshold_kmeans}
114
114
 
115
115
  return {i for i in candidate_loci if identify_dissimilar_loci(values_sample, values_control, i, is_consensus)}
116
116
 
@@ -20,7 +20,7 @@ from DAJIN2.core import core
20
20
  from DAJIN2.utils import io, config, report_generator, input_validator, multiprocess
21
21
 
22
22
 
23
- DAJIN_VERSION = "0.4.3"
23
+ DAJIN_VERSION = "0.4.4"
24
24
 
25
25
 
26
26
  def generate_report(name: str) -> None:
@@ -58,21 +58,21 @@ def execute_single_mode(arguments: dict[str]):
58
58
  ################################################################################
59
59
 
60
60
 
61
- def validate_columns_of_batch_file(columns: list, filepath: str) -> None:
62
- """Validate the columns of a batch file."""
63
- required_columns = ["sample", "control", "allele", "name"]
64
- accepted_columns = ["sample", "control", "allele", "name", "genome"]
61
+ def validate_headers_of_batch_file(headers: list, filepath: str) -> None:
62
+ """Validate the headers of a batch file."""
63
+ required_headers = ["sample", "control", "allele", "name"]
64
+ accepted_headers = ["sample", "control", "allele", "name", "genome"]
65
65
 
66
- if not set(required_columns).issubset(set(columns)):
67
- raise ValueError(f"{filepath} must contain {', '.join(required_columns)} in the header")
66
+ if not set(required_headers).issubset(set(headers)):
67
+ raise ValueError(f"{filepath} must contain {', '.join(required_headers)} in the header")
68
68
 
69
- if not set(columns).issubset(accepted_columns):
70
- raise ValueError(f"Accepted header names of {filepath} are {', '.join(accepted_columns)}.")
69
+ if not set(headers).issubset(accepted_headers):
70
+ raise ValueError(f"Accepted header names of {filepath} are {', '.join(accepted_headers)}.")
71
71
 
72
72
 
73
- def create_argument_dict(columns: list, group: list, cache_urls_genome: dict, is_control: bool) -> dict:
74
- """Create a dictionary of arguments from the given columns and group."""
75
- args = dict(zip(columns, group))
73
+ def create_argument_dict(headers: list, group: list, cache_urls_genome: dict, is_control: bool) -> dict:
74
+ """Create a dictionary of arguments from the given headers and group."""
75
+ args = dict(zip(headers, group))
76
76
  args["threads"] = 1 # Set the number of threads to 1 for batch mode
77
77
 
78
78
  # Assign the "sample" field depending on whether it's a control or not
@@ -89,11 +89,11 @@ def create_argument_dict(columns: list, group: list, cache_urls_genome: dict, is
89
89
 
90
90
 
91
91
  def run_DAJIN2(
92
- groups: list, columns: list, cache_urls_genome: dict, is_control: bool = True, num_workers: int = 1
92
+ groups: list, headers: list, cache_urls_genome: dict, is_control: bool = True, num_workers: int = 1
93
93
  ) -> None:
94
94
  contents = []
95
95
  for group in groups:
96
- args = create_argument_dict(columns, group, cache_urls_genome, is_control)
96
+ args = create_argument_dict(headers, group, cache_urls_genome, is_control)
97
97
  if args: # Add args to contents only if it's not an empty dict
98
98
  contents.append(args)
99
99
 
@@ -117,17 +117,17 @@ def execute_batch_mode(arguments: dict[str]):
117
117
  inputs = io.load_batchfile(path_batchfile)
118
118
 
119
119
  # Validate Column of the batch file
120
- columns = inputs[0]
121
- validate_columns_of_batch_file(columns, path_batchfile)
120
+ headers = inputs[0]
121
+ validate_headers_of_batch_file(headers, path_batchfile)
122
122
 
123
123
  # Validate contents and fetch genome urls
124
124
  contents = inputs[1:]
125
125
  cache_urls_genome = dict()
126
- index_of_name = columns.index("name")
126
+ index_of_name = headers.index("name")
127
127
  contents.sort(key=lambda x: x[index_of_name])
128
128
  for _, groups in groupby(contents, key=lambda x: x[index_of_name]):
129
129
  for group in groups:
130
- args = dict(zip(columns, group))
130
+ args = dict(zip(headers, group))
131
131
  # validate contents in the batch file
132
132
  input_validator.validate_files(args["sample"], args["control"], args["allele"])
133
133
  # validate genome and fetch urls
@@ -141,8 +141,8 @@ def execute_batch_mode(arguments: dict[str]):
141
141
  config.set_logging(path_logfile)
142
142
  groups = list(groups)
143
143
  # Run DAJIN2
144
- run_DAJIN2(groups, columns, cache_urls_genome, is_control=True, num_workers=arguments["threads"])
145
- run_DAJIN2(groups, columns, cache_urls_genome, is_control=False, num_workers=arguments["threads"])
144
+ run_DAJIN2(groups, headers, cache_urls_genome, is_control=True, num_workers=arguments["threads"])
145
+ run_DAJIN2(groups, headers, cache_urls_genome, is_control=False, num_workers=arguments["threads"])
146
146
  # Finish
147
147
  generate_report(name)
148
148
  shutil.move(path_logfile, Path("DAJIN_Results", name))
@@ -83,8 +83,8 @@ def write_xlsx(data: list[dict[str, str]], file_path: str | Path) -> None:
83
83
  ###########################################################
84
84
 
85
85
 
86
- def check_excel_or_csv(file_path: str) -> str | None:
87
- """Check if the file is an Excel or CSV file. Raise error for other types."""
86
+ def determine_file_type(file_path: str) -> str | None:
87
+ """Determine if the file is an Excel or CSV file. Raise error for other types."""
88
88
  file_extension = Path(file_path).suffix
89
89
  if file_extension in [".xlsx", ".xls"]:
90
90
  return "excel"
@@ -112,16 +112,18 @@ def read_xlsx(file_path: str | Path) -> list[dict[str, str]]:
112
112
  def read_csv(file_path: str) -> list[dict[str, str]]:
113
113
  """Load data from a CSV file and return as a list."""
114
114
  with open(file_path, "r") as csvfile:
115
- contents = []
115
+ inputs = []
116
116
  for row in csv.reader(csvfile):
117
+ if not row: # Skip empty rows
118
+ continue
117
119
  trimmed_row = [field.strip() for field in row]
118
- contents.append(trimmed_row)
119
- return contents
120
+ inputs.append(trimmed_row)
121
+ return inputs
120
122
 
121
123
 
122
124
  def load_batchfile(batchfile_path: str) -> list[dict[str, str]]:
123
125
  """Load data from either an Excel or CSV file."""
124
- file_type = check_excel_or_csv(batchfile_path)
126
+ file_type = determine_file_type(batchfile_path)
125
127
  if file_type == "excel":
126
128
  return read_xlsx(batchfile_path)
127
129
  elif file_type == "csv":
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: DAJIN2
3
- Version: 0.4.3
3
+ Version: 0.4.4
4
4
  Summary: One-step genotyping tools for targeted long-read sequencing
5
5
  Home-page: https://github.com/akikuno/DAJIN2
6
6
  Author: Akihiro Kuno
@@ -166,7 +166,7 @@ Options:
166
166
 
167
167
  ```bash
168
168
  # Download example dataset
169
- wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_single.tar.gz
169
+ curl -LJO https://github.com/akikuno/DAJIN2/raw/main/examples/example_single.tar.gz
170
170
  tar -xf example_single.tar.gz
171
171
 
172
172
  # Run DAJIN2
@@ -230,48 +230,13 @@ options:
230
230
 
231
231
  ```bash
232
232
  # Download the example dataset
233
- wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
233
+ curl -LJO https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
234
234
  tar -xf example_batch.tar.gz
235
235
 
236
236
  # Run DAJIN2
237
237
  DAJIN2 batch --file example_batch/batch.csv --threads 4
238
238
  ```
239
239
 
240
- <!-- ```bash
241
- # Download the example dataset
242
- wget https://github.com/akikuno/DAJIN2/raw/main/examples/example_batch.tar.gz
243
- tar -xf example_batch.tar.gz
244
-
245
- # Run DAJIN2
246
- DAJIN2 batch --file example-batch/batch.csv --threads 3
247
-
248
- # 2023-07-31 17:01:10: example-batch/tyr_control.fq.gz is now processing...
249
- # 2023-07-31 17:01:16: Preprocess example-batch/tyr_control.fq.gz...
250
- # 2023-07-31 17:01:48: Output BAM files of example-batch/tyr_control.fq.gz...
251
- # 2023-07-31 17:01:52: 🍵 example-batch/tyr_control.fq.gz is finished!
252
- # 2023-07-31 17:01:52: example-batch/tyr_c230gt_50%.fq.gz is now processing...
253
- # 2023-07-31 17:01:52: example-batch/tyr_c230gt_10%.fq.gz is now processing...
254
- # 2023-07-31 17:01:52: example-batch/tyr_c230gt_01%.fq.gz is now processing...
255
- # 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_01%.fq.gz...
256
- # 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_50%.fq.gz...
257
- # 2023-07-31 17:01:55: Preprocess example-batch/tyr_c230gt_10%.fq.gz...
258
- # 2023-07-31 17:02:17: Classify example-batch/tyr_c230gt_50%.fq.gz...
259
- # 2023-07-31 17:02:19: Clustering example-batch/tyr_c230gt_50%.fq.gz...
260
- # 2023-07-31 17:02:34: Classify example-batch/tyr_c230gt_01%.fq.gz...
261
- # 2023-07-31 17:02:35: Classify example-batch/tyr_c230gt_10%.fq.gz...
262
- # 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_01%.fq.gz...
263
- # 2023-07-31 17:02:39: Clustering example-batch/tyr_c230gt_10%.fq.gz...
264
- # 2023-07-31 17:02:53: Consensus calling of example-batch/tyr_c230gt_50%.fq.gz...
265
- # 2023-07-31 17:02:59: Output reports of example-batch/tyr_c230gt_50%.fq.gz...
266
- # 2023-07-31 17:03:04: 🍵 example-batch/tyr_c230gt_50%.fq.gz is finished!
267
- # 2023-07-31 17:03:39: Consensus calling of example-batch/tyr_c230gt_01%.fq.gz...
268
- # 2023-07-31 17:03:51: Output reports of example-batch/tyr_c230gt_01%.fq.gz...
269
- # 2023-07-31 17:04:03: 🍵 example-batch/tyr_c230gt_01%.fq.gz is finished!
270
- # 2023-07-31 17:04:08: Consensus calling of example-batch/tyr_c230gt_10%.fq.gz...
271
- # 2023-07-31 17:04:16: Output reports of example-batch/tyr_c230gt_10%.fq.gz...
272
- # 2023-07-31 17:04:24: 🍵 example-batch/tyr_c230gt_10%.fq.gz is finished!
273
- # 🎉 Finished! Open DAJIN_Results/tyr-substitution to see the report.
274
- ``` -->
275
240
 
276
241
  ## 📈 Report Contents
277
242
 
@@ -281,22 +246,22 @@ Inside the **DAJIN_Results** directory, the following files can be found:
281
246
  ```
282
247
  DAJIN_Results/tyr-substitution
283
248
  ├── BAM
284
- │ ├── tyr_c230gt_01%
285
- │ ├── tyr_c230gt_10%
286
- │ ├── tyr_c230gt_50%
249
+ │ ├── tyr_c230gt_01
250
+ │ ├── tyr_c230gt_10
251
+ │ ├── tyr_c230gt_50
287
252
  │ └── tyr_control
288
253
  ├── FASTA
289
- │ ├── tyr_c230gt_01%
290
- │ ├── tyr_c230gt_10%
291
- │ └── tyr_c230gt_50%
254
+ │ ├── tyr_c230gt_01
255
+ │ ├── tyr_c230gt_10
256
+ │ └── tyr_c230gt_50
292
257
  ├── HTML
293
- │ ├── tyr_c230gt_01%
294
- │ ├── tyr_c230gt_10%
295
- │ └── tyr_c230gt_50%
258
+ │ ├── tyr_c230gt_01
259
+ │ ├── tyr_c230gt_10
260
+ │ └── tyr_c230gt_50
296
261
  ├── MUTATION_INFO
297
- │ ├── tyr_c230gt_01%.csv
298
- │ ├── tyr_c230gt_10%.csv
299
- │ └── tyr_c230gt_50%.csv
262
+ │ ├── tyr_c230gt_01.csv
263
+ │ ├── tyr_c230gt_10.csv
264
+ │ └── tyr_c230gt_50.csv
300
265
  ├── read_plot.html
301
266
  ├── read_plot.pdf
302
267
  └── read_summary.xlsx
@@ -1,113 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from pathlib import Path
4
- from collections import defaultdict, Counter
5
- from sklearn.tree import DecisionTreeClassifier
6
-
7
- from DAJIN2.utils import io
8
-
9
- # Constants
10
- STRAND_BIAS_LOWER_LIMIT = 0.1
11
- STRAND_BIAS_UPPER_LIMIT = 0.9
12
-
13
-
14
- def is_strand_bias(path_control: Path) -> bool:
15
- count_strand = defaultdict(int)
16
- for m in io.read_jsonl(path_control):
17
- count_strand[m["STRAND"]] += 1
18
-
19
- total = count_strand["+"] + count_strand["-"]
20
- percentage_plus = count_strand["+"] / total if total else 0
21
-
22
- return not (STRAND_BIAS_LOWER_LIMIT < percentage_plus < STRAND_BIAS_UPPER_LIMIT)
23
-
24
-
25
- ###############################################################################
26
- # Handle Strand bias
27
- # # Clusters of reads with mutations with strand bias are merged into similar clusters without strand bias
28
- ###############################################################################
29
-
30
-
31
- def _count_strand(labels: list[int], samples: list[dict[str, str]]) -> tuple[defaultdict, defaultdict]:
32
- """Count the occurrences of each strand type by label."""
33
- count_strand_by_labels = defaultdict(int)
34
- total_count_by_labels = defaultdict(int)
35
-
36
- for label, sample in zip(labels, samples):
37
- total_count_by_labels[label] += 1
38
- if sample["STRAND"] == "+":
39
- count_strand_by_labels[label] += 1
40
-
41
- return count_strand_by_labels, total_count_by_labels
42
-
43
-
44
- def _calculate_strand_biases(
45
- count_strand_by_labels: defaultdict, total_count_by_labels: defaultdict
46
- ) -> dict[int, bool]:
47
- """Calculate strand biases based on strand counts."""
48
- strand_biases = {}
49
- for label, total in total_count_by_labels.items():
50
- strand_count = count_strand_by_labels[label]
51
- strand_ratio = strand_count / total
52
- strand_biases[label] = not (STRAND_BIAS_LOWER_LIMIT < strand_ratio < STRAND_BIAS_UPPER_LIMIT)
53
-
54
- return strand_biases
55
-
56
-
57
- def _get_strand_biases_on_each_label(labels: list[int], path_sample: Path | str) -> dict[int, bool]:
58
- """Get strand biases for given labels and samples.
59
- Args:
60
- labels: A list of integer labels.
61
- path_sample: The path to the sample file.
62
- Returns:
63
- A dictionary containing strand biases by label.
64
- """
65
- samples = io.read_jsonl(path_sample)
66
- count_strand_by_labels, total_count_by_labels = _count_strand(labels, samples)
67
- return _calculate_strand_biases(count_strand_by_labels, total_count_by_labels)
68
-
69
-
70
- def _prepare_training_testing_sets(labels, scores, strand_biases) -> tuple[list, list, list]:
71
- x_train, y_train, x_test = [], [], []
72
- for label, score in zip(labels, scores):
73
- if strand_biases[label]:
74
- x_test.append(score)
75
- else:
76
- x_train.append(score)
77
- y_train.append(label)
78
- return x_train, y_train, x_test
79
-
80
-
81
- def _train_decision_tree(x_train, y_train) -> DecisionTreeClassifier:
82
- dtree = DecisionTreeClassifier(random_state=1)
83
- dtree.fit(x_train, y_train)
84
- return dtree
85
-
86
-
87
- def _allocate_labels(labels, strand_biases, dtree, x_test) -> list[int]:
88
- label_predictions = dtree.predict(x_test)
89
- label_predict_iter = iter(label_predictions)
90
- for i, label in enumerate(labels):
91
- if strand_biases[label]:
92
- labels[i] = next(label_predict_iter)
93
- return labels
94
-
95
-
96
- def _correct_clusters_with_strand_bias(path_score_sample, labels, strand_biases) -> list[int]:
97
- scores = io.read_jsonl(path_score_sample)
98
- x_train, y_train, x_test = _prepare_training_testing_sets(labels, scores, strand_biases)
99
- dtree = _train_decision_tree(x_train, y_train)
100
- return _allocate_labels(labels, strand_biases, dtree, x_test)
101
-
102
-
103
- def remove_biased_clusters(path_sample, path_score_sample, labels) -> list[int]:
104
- strand_biases = _get_strand_biases_on_each_label(labels, path_sample)
105
- # Until there is at least one True and one False or
106
- # 1000 iterations (1000 is a suitable number to exit an infinite loop just in case)
107
- i = 0
108
- labels_corrected = labels
109
- while len(Counter(strand_biases.values())) > 1 and i < 1000:
110
- labels_corrected = _correct_clusters_with_strand_bias(path_score_sample, labels_corrected, strand_biases)
111
- strand_biases = _get_strand_biases_on_each_label(labels_corrected, path_sample)
112
- i += 1
113
- return labels_corrected
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes