REDItools3 3.1a0__py3-none-any.whl → 3.2a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: REDItools3
3
- Version: 3.1a0
3
+ Version: 3.2a0
4
4
  Author: Ernesto Picardi
5
5
  Author-email: Adam Handen <adam.handen@gmail.com>
6
6
  Project-URL: homepage, https://github.com/BioinfoUNIBA/REDItools3
@@ -13,7 +13,7 @@ Classifier: Intended Audience :: Science/Research
13
13
  Classifier: License :: OSI Approved :: GNU General Public License (GPL)
14
14
  Classifier: Operating System :: MacOS :: MacOS X
15
15
  Classifier: Operating System :: Unix
16
- Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.7
17
17
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
18
18
  Requires-Python: >=3.7
19
19
  Description-Content-Type: text/markdown
@@ -2,20 +2,20 @@ reditools/__init__.py,sha256=7nSB0hrQznxrn6l95cv_pSonJTG6jZCQdbn7aT1TtvY,46
2
2
  reditools/__main__.py,sha256=mWJ9O2LDiOpBWDBJJUN7OiM4SyltW-kVXXAGBe_JxgQ,842
3
3
  reditools/alignment_file.py,sha256=YFyCEhMek2t93DpmpwEst5v3gDZkmRotbd6Fy_mP0aE,4258
4
4
  reditools/alignment_manager.py,sha256=_FXwvqGWoXRdzVrwBxki2heaVZA2cQbGXqCopr-g1Hs,4138
5
- reditools/analyze.py,sha256=u38yN5DmXUCW8nQP_BMfsXuvb59rFO12di5cYT8Ye58,15280
5
+ reditools/analyze.py,sha256=tW9Rz-R_8R-mJ2uQP5fpGFTH7TEv-2pOsigtbKqwYDY,14649
6
6
  reditools/compiled_position.py,sha256=v540uUEie_HHUwsYQmBqeeOkUvtYlcnWj1v8gAhLUiE,3858
7
7
  reditools/compiled_reads.py,sha256=7Hm5f7g1T8q1zDOOxZUD7aZax9b7SdQ0PlmT93hmcaE,4154
8
8
  reditools/fasta_file.py,sha256=KBsJBs7OnBpew2PGWGp0mTxPLlpBmRrtXL4uvQw4t34,2212
9
- reditools/file_utils.py,sha256=AJjU9leOxSou5U_4RAgapR9PGQz0OYQlkCudvTcXGeQ,3284
10
- reditools/homopolymerics.py,sha256=BCYXBJa6YuouzccFisBFOtGfZAEOSqeqJsO-c37At84,2123
11
- reditools/index.py,sha256=K3JQTMx4ojUUiPQTDMDsoYoFQQ_o-ZNqTrh5dIVFVSQ,7398
9
+ reditools/file_utils.py,sha256=MfQPzJ4ogbwNvIiEu1oooS64EJH1CFdRS8eoqT9Zo4w,2763
10
+ reditools/homopolymerics.py,sha256=UsHTr0e_OP_dkGq5te-oTSe5u6kzi5UJOF9t9QAunUk,2269
11
+ reditools/index.py,sha256=jLgWwKXIA_e-bqVu74SDZXmrdWch_syDSmMnFZPbqz4,7537
12
12
  reditools/logger.py,sha256=u4L2SYxy4vJ4KDHEymd0b1sCa8BXXHchx8LR_wcFq1A,1210
13
- reditools/reditools.py,sha256=Rb5bllqjE1wHti98p-v2t4Vu-YEvZgNv-FXcUPgDVO0,12725
13
+ reditools/reditools.py,sha256=RNH7aKC2QnbafA7T9E6UpV5Llv3FjfDIabjPCuwDgW0,13111
14
14
  reditools/region.py,sha256=_BiKDc5lCl1snjkokRiUWOgzA57ME3yLydEIwK9ku7U,3780
15
- reditools/rtchecks.py,sha256=tkaosQDBc2XN_RlVMtNwrxZjCQoQo2bWfQISROXCmKA,8221
15
+ reditools/rtchecks.py,sha256=TmCow38fCRwSICvx3nlOxy6Q216BcDXESGhM7bB_ixo,8878
16
16
  reditools/utils.py,sha256=a2qfhMcrH2QlK-JoR-HHF6_bnlo5v3jihAqqknvVIjc,2733
17
- REDItools3-3.1a0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
18
- REDItools3-3.1a0.dist-info/METADATA,sha256=EPD47hLxZoozfc0Gd4uFPOaid9uz81DkWI4Pkv0STpo,1289
19
- REDItools3-3.1a0.dist-info/WHEEL,sha256=a7TGlA-5DaHMRrarXjVbQagU3Man_dCnGIWMJr5kRWo,91
20
- REDItools3-3.1a0.dist-info/top_level.txt,sha256=wrvvbFXhmNg7s6LQqjlV_fVQYUZOOpF93IcMu_hBCx4,10
21
- REDItools3-3.1a0.dist-info/RECORD,,
17
+ REDItools3-3.2a0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
18
+ REDItools3-3.2a0.dist-info/METADATA,sha256=NRwZTGGmlHBkP6XiQ4Sdql-XXRR2Ii27beCDf_jCt90,1288
19
+ REDItools3-3.2a0.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
20
+ REDItools3-3.2a0.dist-info/top_level.txt,sha256=wrvvbFXhmNg7s6LQqjlV_fVQYUZOOpF93IcMu_hBCx4,10
21
+ REDItools3-3.2a0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.4.0)
2
+ Generator: setuptools (75.5.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
reditools/analyze.py CHANGED
@@ -79,19 +79,11 @@ def setup_rtools(options): # noqa:WPS213,WPS231
79
79
  rtools.log_level = Logger.info_level
80
80
 
81
81
  if options.load_omopolymeric_file:
82
- regions = file_utils.load_omopolymeric_regions(
83
- options.load_omopolymeric_file,
84
- )
82
+ regions = file_utils.read_bed_file(options.load_omopolymeric_file)
85
83
  rtools.exclude(regions)
86
84
 
87
- if options.create_omopolymeric_file:
88
- rtools.create_omopolymeric_positions(
89
- options.create_omopolymeric_file,
90
- options.omopolymeric_span,
91
- )
92
-
93
85
  if options.splicing_file:
94
- rtools.load_splicing_file(
86
+ rtools.splice_positions = file_utils.load_splicing_file(
95
87
  options.splicing_file,
96
88
  options.splicing_span,
97
89
  )
@@ -109,10 +101,11 @@ def setup_rtools(options): # noqa:WPS213,WPS231
109
101
  rtools.max_base_position = options.max_base_position
110
102
  rtools.min_base_quality = options.min_base_quality
111
103
 
112
- rtools.min_column_length = options.min_column_length
104
+ rtools.min_column_length = options.min_read_depth
113
105
  rtools.min_edits = options.min_edits
114
106
  rtools.min_edits_per_nucleotide = options.min_edits_per_nucleotide
115
107
  rtools.strand = options.strand
108
+ rtools.max_alts = options.max_editing_nucleotides
116
109
 
117
110
  rtools.strand_confidence_threshold = options.strand_confidence_threshold
118
111
 
@@ -225,21 +218,26 @@ def parse_options(): # noqa:WPS213
225
218
  Returns:
226
219
  namespace: commandline args
227
220
  """
228
- parser = argparse.ArgumentParser(description='REDItools 2.0')
221
+ parser = argparse.ArgumentParser(
222
+ prog="reditools analyze",
223
+ description='REDItools3',
224
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
225
+ )
229
226
  parser.add_argument(
230
227
  'file',
231
228
  nargs='+',
232
- help='The bam file to be analyzed',
229
+ help='The bam file(s) to be analyzed.',
233
230
  )
234
231
  parser.add_argument(
235
232
  '-r',
236
233
  '--reference',
237
- help='The reference FASTA file',
234
+ help='Reference FASTA file.',
238
235
  )
239
236
  parser.add_argument(
240
237
  '-o',
241
238
  '--output-file',
242
- help='The output statistics file',
239
+ help='Path to write output to.',
240
+ default='/dev/stdout',
243
241
  )
244
242
  parser.add_argument(
245
243
  '-s',
@@ -248,96 +246,85 @@ def parse_options(): # noqa:WPS213
248
246
  type=int,
249
247
  default=0,
250
248
  help='Strand: this can be 0 (unstranded), ' +
251
- '1 (secondstrand oriented) or ' +
252
- '2 (firststrand oriented)',
249
+ '1 (second strand oriented) or ' +
250
+ '2 (first strand oriented).',
253
251
  )
254
252
  parser.add_argument(
255
253
  '-a',
256
254
  '--append-file',
257
255
  action='store_true',
258
- help='Appends results to file (and creates if not existing)',
256
+ help='Appends results to file (and creates if not existing).',
259
257
  )
260
258
  parser.add_argument(
261
259
  '-g',
262
260
  '--region',
263
- help='The self.region of the bam file to be analyzed',
261
+ help='Only analyzes the specified region.',
264
262
  )
265
263
  parser.add_argument(
266
264
  '-m',
267
265
  '--load-omopolymeric-file',
268
- help='The file containing the omopolymeric positions',
269
- )
270
- parser.add_argument(
271
- '-c',
272
- '--create-omopolymeric-file',
273
- default=False,
274
- help='Path to write omopolymeric positions to',
275
- action='store_true',
266
+ help='BED file of omopolymeric positions.',
276
267
  )
277
268
  parser.add_argument(
278
269
  '-os',
279
270
  '--omopolymeric-span',
280
271
  type=int,
281
272
  default=5,
282
- help='The omopolymeric span',
273
+ help='The omopolymeric span.',
283
274
  )
284
275
  parser.add_argument(
285
276
  '-sf',
286
277
  '--splicing-file',
287
- help='The file containing the splicing sites positions',
278
+ help='The file containing splicing site positions.',
288
279
  )
289
280
  parser.add_argument(
290
281
  '-ss',
291
282
  '--splicing-span',
292
283
  type=int,
293
284
  default=4,
294
- help='The splicing span',
285
+ help='The splicing span.',
295
286
  )
296
287
  parser.add_argument(
297
288
  '-mrl',
298
289
  '--min-read-length',
299
290
  type=int,
300
291
  default=30, # noqa:WPS432
301
- help='Reads whose length is below this value will be discarded.',
292
+ help='Reads with length below -mrl will be discarded.',
302
293
  )
303
294
  parser.add_argument(
304
295
  '-q',
305
296
  '--min-read-quality',
306
297
  type=int,
307
298
  default=20, # noqa:WPS432
308
- help='Reads with mapping quality below this value will be discarded.',
299
+ help='Reads with mapping quality below -q will be discarded.',
309
300
  )
310
301
  parser.add_argument(
311
302
  '-bq',
312
303
  '--min-base-quality',
313
304
  type=int,
314
305
  default=30, # noqa:WPS432
315
- help='Base quality below this value will not be included in ' +
316
- 'the analysis.',
306
+ help='Base quality below -bq will be discarded.',
317
307
  )
318
308
  parser.add_argument(
319
309
  '-mbp',
320
310
  '--min-base-position',
321
311
  type=int,
322
312
  default=0,
323
- help='Bases which reside in a previous position (in the read)' +
324
- 'will not be included in the analysis.',
313
+ help='Ignores the first -mbp bases in each read.',
325
314
  )
326
315
  parser.add_argument(
327
316
  '-Mbp',
328
317
  '--max-base-position',
329
318
  type=int,
330
319
  default=0,
331
- help='Bases which reside in a further position (in the read)' +
332
- 'will not be included in the analysis.',
320
+ help='Ignores the last -Mbp bases in each read.',
333
321
  )
334
322
  parser.add_argument(
335
323
  '-l',
336
- '--min-column-length',
324
+ '--min-read-depth',
337
325
  type=int,
338
326
  default=1,
339
- help='Positions whose columns have length below this value will' +
340
- 'not be included in the analysis.',
327
+ help='Only report on positions with at least -l read depth',
341
328
  )
342
329
  parser.add_argument(
343
330
  '-e',
@@ -351,8 +338,7 @@ def parse_options(): # noqa:WPS213
351
338
  '--min-edits-per-nucleotide',
352
339
  type=int,
353
340
  default=0,
354
- help='Positions whose columns have bases with less than' +
355
- 'min-edits-per-base edits will not be included in the analysis.',
341
+ help='Positions with fewer than -men edits will be discarded.',
356
342
  )
357
343
  parser.add_argument(
358
344
  '-me',
@@ -360,16 +346,14 @@ def parse_options(): # noqa:WPS213
360
346
  type=int,
361
347
  default=0, # noqa:WPS432
362
348
  help='The minimum number of editing events (per position). ' +
363
- 'Positions whose columns have bases with less than ' +
364
- '"min-edits-per-base edits" will not be included in the ' +
365
- 'analysis.',
349
+ 'Positions with fewer than -me edits will be discarded.',
366
350
  )
367
351
  parser.add_argument(
368
352
  '-Men',
369
353
  '--max-editing-nucleotides',
370
354
  type=int,
371
- default=100, # noqa:WPS432
372
- help='The maximum number of editing nucleotides, from 0 to 4 ' +
355
+ default=4, # noqa:WPS432
356
+ help='The maximum number of editing nucleotides, from 0 to 3 ' +
373
357
  '(per position). Positions whose columns have more than ' +
374
358
  '"max-editing-nucleotides" will not be included in the analysis.',
375
359
  )
@@ -378,8 +362,8 @@ def parse_options(): # noqa:WPS213
378
362
  '--strand-confidence-threshold',
379
363
  type=float,
380
364
  default=0.7, # noqa:WPS432
381
- help='Only report the strandedness if at least this proportion of ' +
382
- 'reads are of a given strand',
365
+ help='Only report the strandedness if at least -T proportion of ' +
366
+ 'reads are of a given strand.',
383
367
  )
384
368
  parser.add_argument(
385
369
  '-C',
@@ -393,25 +377,25 @@ def parse_options(): # noqa:WPS213
393
377
  '-V',
394
378
  '--verbose',
395
379
  default=False,
396
- help='Verbose information in stderr',
380
+ help='Run in verbose mode.',
397
381
  action='store_true',
398
382
  )
399
383
  parser.add_argument(
400
384
  '-N',
401
385
  '--dna',
402
386
  default=False,
403
- help='Run REDItools 2.0 on DNA-Seq data',
387
+ help='Run REDItools on DNA-Seq data.',
404
388
  action='store_true',
405
389
  )
406
390
  parser.add_argument(
407
391
  '-B',
408
392
  '--bed_file',
409
- help='Path of BED file containing target self.regions',
393
+ help='Only analyze regions in the provided BED file.',
410
394
  )
411
395
  parser.add_argument(
412
396
  '-t',
413
397
  '--threads',
414
- help='Number of threads to run',
398
+ help='Number of threads for parallel processing.',
415
399
  type=int,
416
400
  default=1,
417
401
  )
@@ -419,7 +403,7 @@ def parse_options(): # noqa:WPS213
419
403
  '-w',
420
404
  '--window',
421
405
  help='How many bp should be processed by each thread at a time. ' +
422
- 'Defaults to full contig.',
406
+ 'Zero uses the full contig.',
423
407
  type=int,
424
408
  default=0,
425
409
  )
@@ -427,18 +411,18 @@ def parse_options(): # noqa:WPS213
427
411
  '-k',
428
412
  '--exclude_regions',
429
413
  nargs='+',
430
- help='Path of BED file containing regions to exclude from analysis',
414
+ help='Skip regions in the provided BED file(s).',
431
415
  )
432
416
  parser.add_argument(
433
417
  '-E',
434
418
  '--exclude_reads',
435
- help='Path to a text file listing read names to exclude from analysis',
419
+ help='Text file listing read names to exclude from analysis.',
436
420
  )
437
421
  parser.add_argument(
438
422
  '-d',
439
423
  '--debug',
440
424
  default=False,
441
- help='REDItools is run in DEBUG mode.',
425
+ help='Run in debug mode.',
442
426
  action='store_true',
443
427
  )
444
428
 
reditools/file_utils.py CHANGED
@@ -2,11 +2,8 @@
2
2
 
3
3
  import csv
4
4
  import os
5
- from collections import defaultdict
6
5
  from gzip import open as gzip_open
7
6
 
8
- from sortedcontainers import SortedSet
9
-
10
7
  from reditools.region import Region
11
8
 
12
9
 
@@ -68,54 +65,36 @@ def concat(output, *fnames, clean_up=True, encoding='utf-8'):
68
65
  os.remove(fname)
69
66
 
70
67
 
71
- def load_poly_regions(fname):
72
- """
73
- Read omopolymeric positions from a file.
74
-
75
- Parameters:
76
- fname (str): File path
77
-
78
- Returns:
79
- (dict): Contigs and regions
80
- """
81
- poly_regions = defaultdict(set)
82
- with read_bed_file(fname) as reader:
83
- for row in reader:
84
- poly_regions[row[0]] = Region(
85
- contig=row[0],
86
- start=row[1],
87
- stop=row[2],
88
- )
89
- return poly_regions
90
-
91
-
92
- def load_splicing_file(splicing_file, span):
68
+ def load_splicing_file(splicing_file, splicing_span):
93
69
  """
94
70
  Read splicing positions from a file.
95
71
 
96
72
  Parameters:
97
73
  splicing_file (str): File path
98
- span(int): Width of splice sites
74
+ splicing_span(int): Width of splice sites
99
75
 
100
- Returns:
101
- (dict): Contig and positions
76
+ Yields:
77
+ Splicing file contents as Regions.
102
78
  """
103
- splice_positions = defaultdict(SortedSet)
104
79
  strand_map = {'-': 'D', '+': 'A'}
105
80
 
106
- with open_stream(splicing_file, 'r') as stream:
107
- for line in stream:
108
- fields = line.strip().split()
109
-
110
- chrom = fields[0]
111
- strand = fields[4]
112
- splice = fields[3]
113
- span = int(fields[1])
114
-
115
- coe = -1 if strand_map.get(strand, None) == splice else 1
116
- new_positions = [1 + span + coe * fctr for fctr in range(span)]
117
- splice_positions[chrom] |= new_positions
118
- return splice_positions
81
+ stream = open_stream(splicing_file)
82
+ reader = csv.reader(
83
+ filter(lambda row: row[0] != '#', stream),
84
+ delimiter=' ',
85
+ )
86
+ for row in reader:
87
+ contig = row[0]
88
+ span = int(row[1])
89
+ splice = row[3]
90
+ strand = row[4]
91
+
92
+ coe = -1 if strand_map.get(strand, None) == splice else 1
93
+ start = 1 + span
94
+ stop = start + splicing_span * coe
95
+ if start > stop:
96
+ start, stop = stop, start
97
+ yield Region(contig=contig, start=start, stop=stop)
119
98
 
120
99
 
121
100
  def load_text_file(file_name):
reditools/homopolymerics.py CHANGED
@@ -42,7 +42,11 @@ def parse_options():
42
42
  Returns:
43
43
  namespace
44
44
  """
45
- parser = argparse.ArgumentParser(description='REDItools 2.0')
45
+ parser = argparse.ArgumentParser(
46
+ prog="reditools find-repeats",
47
+ description='REDItools3',
48
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
49
+ )
46
50
  parser.add_argument(
47
51
  'file',
48
52
  help='The fasta file to be analyzed',
@@ -57,6 +61,7 @@ def parse_options():
57
61
  parser.add_argument(
58
62
  '-o',
59
63
  '--output',
64
+ default='/dev/stdout',
60
65
  help='Destination to write results. Default is to use STDOUT. ' +
61
66
  'If the filename ends in .gz, the contents will be gzipped.',
62
67
  )
reditools/index.py CHANGED
@@ -180,7 +180,11 @@ def parse_options(): # noqa:WPS213
180
180
  Returns:
181
181
  namespace: commandline args
182
182
  """
183
- parser = argparse.ArgumentParser(description='REDItools 2.0')
183
+ parser = argparse.ArgumentParser(
184
+ prog="reditools index",
185
+ description='REDItools3',
186
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
187
+ )
184
188
  parser.add_argument(
185
189
  'file',
186
190
  nargs='+',
@@ -189,6 +193,7 @@ def parse_options(): # noqa:WPS213
189
193
  parser.add_argument(
190
194
  '-o',
191
195
  '--output-file',
196
+ default='/dev/stdout',
192
197
  help='The output statistics file',
193
198
  )
194
199
  parser.add_argument(
@@ -239,7 +244,7 @@ def main():
239
244
  indexer.add_target_from_bed(trg_fname)
240
245
 
241
246
  if options.output_file:
242
- stream = open_stream(options.output_fipe, 'w')
247
+ stream = open_stream(options.output_file, 'w')
243
248
  else:
244
249
  stream = sys.stdout
245
250
 
reditools/reditools.py CHANGED
@@ -127,7 +127,6 @@ class REDItools(object):
127
127
  self._target_positions = False
128
128
  self._exclude_positions = {}
129
129
  self._splice_positions = []
130
-
131
130
  self._specific_edits = None
132
131
 
133
132
  self.reference = None
@@ -294,6 +293,20 @@ class REDItools(object):
294
293
  """
295
294
  return self._exclude_positions
296
295
 
296
+ @property
297
+ def max_alts(self):
298
+ """Maximum number of alternative bases for a position."""
299
+ return self._max_alts
300
+
301
+ @max_alts.setter
302
+ def max_alts(self, max_alts):
303
+ self._max_alts = max_alts
304
+ function = self._rtqc.check_max_alts
305
+ if max_alts < 3:
306
+ self._rtqc.add(function)
307
+ else:
308
+ self._rtqc.discard(function)
309
+
297
310
  def exclude(self, regions):
298
311
  """
299
312
  Explicitly skip specified genomic regions.
reditools/rtchecks.py CHANGED
@@ -272,3 +272,26 @@ class RTChecks(object):
272
272
  )
273
273
  return False
274
274
  return True
275
+
276
+ def check_max_alts(self, bases, rtools):
277
+ """
278
+ Check that there are no more than a max number of alts.
279
+
280
+ Parameters:
281
+ bases (CompiledPosition): Base position under analysis
282
+ rtools (REDItools): Object running the analysis
283
+
284
+ Returns:
285
+ (bool): True if there are n or fewer alts
286
+ """
287
+
288
+ alts = bases.get_variants()
289
+ if len(alts) > rtools.max_alts:
290
+ rtools.log(
291
+ Logger.debug_level,
292
+ 'DISCARD COLUMN alts={} > {}',
293
+ len(alts),
294
+ rtools.max_alts,
295
+ )
296
+ return False
297
+ return True