bscampp 1.0.3__py3-none-any.whl → 1.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bscampp/__init__.py CHANGED
@@ -12,7 +12,7 @@ import logging, os
12
12
  # not really needed for BSCAMPP but safe to update here
13
13
  os.sys.setrecursionlimit(1000000)
14
14
 
15
- __version__ = "1.0.3"
15
+ __version__ = "1.0.6"
16
16
  _INSTALL_PATH = __path__[0]
17
17
 
18
18
  # global variables to store all loggers
bscampp/configs.py CHANGED
@@ -6,6 +6,7 @@ except ImportError:
6
6
  from argparse import ArgumentParser, Namespace
7
7
  from bscampp.init_configs import init_config_file
8
8
  from bscampp import get_logger, log_exception
9
+ #from bscampp.utils import inferDataType
9
10
 
10
11
  # detect home.path or create if missing
11
12
  homepath = os.path.dirname(__file__) + '/home.path'
@@ -33,6 +34,9 @@ class Configs:
33
34
  keeptemp = False # whether to keep all temporary files
34
35
  verbose = 'INFO' # default verbose level to print
35
36
  num_cpus = 1 # number of cores to use for parallelization
37
+ cpus_per_job = 2 # number of cores to use per job
38
+ max_workers = 1 # max_workers for ProcessPoolExecutor
39
+ # ... = max(1, num_cpus // cpus_per_job)
36
40
 
37
41
  # binaries
38
42
  pplacer_path = None
@@ -42,7 +46,6 @@ class Configs:
42
46
 
43
47
  # placement settings
44
48
  placement_method = 'epa-ng'
45
- model = 'GTR'
46
49
  subtreesize = 2000
47
50
  votes = 5
48
51
  similarityflag = True
@@ -162,6 +165,8 @@ def buildConfigs(parser, cmdline_args, child_process=False, rerun=False):
162
165
  Configs.num_cpus = min(os.cpu_count(), Configs.num_cpus)
163
166
  else:
164
167
  Configs.num_cpus = os.cpu_count()
168
+ # compute max_workers based on num_cpus and cpus_per_job
169
+ Configs.max_workers = max(1, Configs.num_cpus // Configs.cpus_per_job)
165
170
 
166
171
  # sanity check for existence of base placement binary path
167
172
  if Configs.placement_method == 'epa-ng':
bscampp/functions.py CHANGED
@@ -1,15 +1,80 @@
1
1
  import json, time, os, sys
2
2
  import treeswift
3
3
  from collections import defaultdict, Counter
4
+ import subprocess
4
5
 
5
6
  from bscampp import get_logger, log_exception
6
7
  from bscampp.configs import Configs
7
- from bscampp.jobs import EPAngJob, TaxtasticJob, PplacerTaxtasticJob
8
+ from bscampp.jobs import GenericJob, EPAngJob, TaxtasticJob, PplacerTaxtasticJob
8
9
  from bscampp.utils import write_fasta
9
10
  import bscampp.utils as utils
10
11
 
12
+ import concurrent.futures
13
+
11
14
  _LOG = get_logger(__name__)
12
15
 
16
+ ############################# helper functions ################################
17
+ '''
18
+ Function to recompile binaries from the given directory.
19
+ Assumption: the directory contains a CMakeLists.txt file
20
+ '''
21
+ def recompileBinariesFromDir(dir):
22
+ _LOG.warning(f"Recompiling binaries with cmake/make at {dir}")
23
+
24
+ # need to recompile the binaries
25
+ cmake_p = subprocess.Popen(['cmake', dir],
26
+ cwd=dir, stdout=subprocess.PIPE,
27
+ stderr=subprocess.PIPE, text=True)
28
+ cmake_stdout, cmake_stderr = cmake_p.communicate()
29
+
30
+ if cmake_p.returncode != 0:
31
+ _LOG.error("cmake failed!")
32
+ exit(cmake_p.returncode)
33
+ else:
34
+ _LOG.warning("cmake succeeded!")
35
+
36
+ # run make
37
+ make_p = subprocess.Popen(['make'],
38
+ cwd=dir, stdout=subprocess.PIPE,
39
+ stderr=subprocess.PIPE, text=True)
40
+ make_stdout, make_stderr = make_p.communicate()
41
+
42
+ if make_p.returncode != 0:
43
+ _LOG.error(f"make failed!")
44
+ exit(make_p.returncode)
45
+ else:
46
+ _LOG.warning("make succeeded!")
47
+ _LOG.warning(f"Successfully recompiled binaries at {dir}!")
48
+
49
+ '''
50
+ Function to check hamming/fragment_hamming/homology binaries are executable,
51
+ since they were compiled using dynamic library
52
+ '''
53
+ def ensureBinaryExecutable(binpath):
54
+ dir = os.path.dirname(binpath)
55
+
56
+ # binpath does not exist
57
+ b_recompile = False
58
+ if not os.path.exists(binpath):
59
+ _LOG.warning(f"{binpath} does not exist!")
60
+ b_recompile = True
61
+ else:
62
+ p = subprocess.Popen([binpath], stdout=subprocess.PIPE,
63
+ stderr=subprocess.PIPE)
64
+ stdout, stderr = p.communicate()
65
+ # 255 or -1 indicates that the binaries work
66
+ if p.returncode == 255 or p.returncode == -1:
67
+ pass
68
+ else:
69
+ _LOG.warning(f"{binpath} return code is {p.returncode}!")
70
+ b_recompile = True
71
+
72
+ if b_recompile:
73
+ recompileBinariesFromDir(dir)
74
+ return
75
+
76
+ ########################## end of helper functions ############################
77
+
13
78
  '''
14
79
  Function to read in the placement tree and alignment.
15
80
  If query alignment is provided, will use the provided query instead of
@@ -20,7 +85,7 @@ def readData(workdir, dry_run=False):
20
85
  _LOG.info('Reading in input data...')
21
86
 
22
87
  if dry_run:
23
- return None, dict(), '', dict(), '', dict()
88
+ return None, dict(), '', dict(), '', dict(), dict(), dict()
24
89
 
25
90
  # (1) load reference tree
26
91
  tree = treeswift.read_tree_newick(Configs.tree_path)
@@ -40,22 +105,44 @@ def readData(workdir, dry_run=False):
40
105
  if Configs.qaln_path is not None:
41
106
  ref_dict = utils.read_data(Configs.aln_path)
42
107
  q_dict = utils.read_data(Configs.qaln_path)
43
- aln_path, qaln_path = Configs.aln_path, Configs.qaln_path
108
+ #aln_path, qaln_path = Configs.aln_path, Configs.qaln_path
44
109
  else:
45
110
  aln_dict = utils.read_data(Configs.aln_path)
46
111
  ref_dict, q_dict = utils.seperate(aln_dict, leaf_dict)
47
112
 
48
- # after separating queries from the reference alignment, write
49
- # them to to TEMP/
50
- qaln_path = os.path.join(workdir, 'qaln.fa')
51
- write_fasta(qaln_path, q_dict)
52
-
53
- aln_path = os.path.join(workdir, 'aln.fa')
54
- write_fasta(aln_path, ref_dict)
113
+ # Added on 3.8.2025 by Chengze Shen
114
+ # - to ensure that query sequence names are safe to use
115
+ # (e.g., having ":" can cause trouble), have a qname_map that maps
116
+ # each taxon name to an idx
117
+ qidx = 1
118
+ qname_map = dict()
119
+ qname_map_rev = dict()
120
+ for name in q_dict.keys():
121
+ cvt = str(qidx).zfill(16) # 16 digits
122
+ qname_map[name] = cvt
123
+ qname_map_rev[cvt] = name
124
+ qidx += 1
125
+ # modify q_dict as well
126
+ for name, cvt in qname_map.items():
127
+ q_dict[cvt] = q_dict[name]
128
+ q_dict.pop(name)
129
+
130
+ # after separating queries from the reference alignment, write
131
+ # them to TEMP/
132
+ # Updated on 3.5.2025 by Chengze Shen
133
+ # - regardless of the input choices, write a copy of both reference
134
+ # and query alignment to the workdir
135
+ qaln_path = os.path.join(workdir, 'qaln.fa')
136
+ write_fasta(qaln_path, q_dict)
137
+
138
+ aln_path = os.path.join(workdir, 'aln.fa')
139
+ write_fasta(aln_path, ref_dict)
140
+
55
141
 
56
142
  t1 = time.perf_counter()
57
143
  _LOG.info('Time to read in input data: {} seconds'.format(t1 - t0))
58
- return tree, leaf_dict, aln_path, ref_dict, qaln_path, q_dict
144
+ return tree, leaf_dict, aln_path, ref_dict, qaln_path, q_dict, \
145
+ qname_map, qname_map_rev
59
146
 
60
147
  '''
61
148
  Function to get the closest leaf for each query sequence based on Hamming
@@ -75,18 +162,29 @@ def getClosestLeaves(aln_path, qaln_path, aln, qaln, workdir, dry_run=False):
75
162
  if Configs.subtreetype == "h":
76
163
  Configs.votes = Configs.subtreesize
77
164
 
78
- cmd = []
79
165
  if Configs.similarityflag:
80
- cmd.append(os.path.join(Configs.hamming_distance_dir, 'homology'))
166
+ job_type = 'homology'
81
167
  else:
82
- if Configs.fragmentflag == False:
83
- cmd.append(os.path.join(Configs.hamming_distance_dir, 'hamming'))
168
+ if Configs.fragmentflag:
169
+ job_type = 'fragment_hamming'
84
170
  else:
85
- cmd.append(os.path.join(
86
- Configs.hamming_distance_dir, 'fragment_hamming'))
171
+ job_type = 'hamming'
172
+ binpath = os.path.join(Configs.hamming_distance_dir, job_type)
173
+ cmd = [binpath]
174
+
175
+ # Added @ 3.9.2025 by Chengze Shen
176
+ # - check if binpath is executable, since the compiled files use dynamic
177
+ # libraries.
178
+ # If works: should have return code 255
179
+ # If not: should have return code 1,
180
+ # recompile the binaries using cmake and make
181
+ ensureBinaryExecutable(binpath)
182
+
87
183
  cmd.extend([aln_path, str(len(aln)), qaln_path, str(len(qaln)),
88
184
  tmp_output, str(Configs.votes)])
89
- os.system(' '.join(cmd))
185
+ job = GenericJob(cmd=cmd, job_type=job_type)
186
+ _ = job.run()
187
+ #os.system(' '.join(cmd))
90
188
 
91
189
  # process closest leaves
92
190
  unusable_queries = set()
@@ -282,16 +380,30 @@ def buildQuerySubtrees(query_votes_dict, query_top_vote_dict,
282
380
  '''
283
381
  Helper function to run a single placement task. Designed to use with
284
382
  multiprocessing
383
+ Input: job object
384
+ Return: outpath from job.run()
285
385
  '''
286
- def placeOneSubtree():
287
- # TODO
288
- pass
386
+ def placeOneSubtree(*jobs,
387
+ subtree_id=0, num_assigned_queries=-1, outpath=None, logging=None):
388
+ job_type = None
389
+ # record the last job_type and _outpath, which will be for the placement
390
+ # job
391
+ for job in jobs:
392
+ job_type = job.job_type
393
+ # run the job
394
+ _outpath = job.run(logging=logging)
395
+
396
+ # move output file for EPA-ng output
397
+ if job_type == 'epa-ng':
398
+ os.system('mv {} {}'.format(_outpath, outpath))
399
+ return subtree_id, num_assigned_queries, outpath
289
400
 
290
401
  '''
291
402
  Function to perform placement of queries for each subtree
292
403
  '''
293
404
  def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
294
- aln, qaln, cmdline_args, workdir, pool, lock, dry_run=False):
405
+ aln, qaln, cmdline_args, workdir, qname_map, qname_map_rev,
406
+ pool, lock, dry_run=False):
295
407
  t0 = time.perf_counter()
296
408
  _LOG.info('Performing placement on each subtree...')
297
409
 
@@ -307,22 +419,21 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
307
419
  # go over the dictionary of subtrees and their assigned queries
308
420
  # perform placement using either EPA-ng or pplacer
309
421
  final_subtree_count, total_subtrees_examined = 0, 0
422
+ futures = []
423
+ _LOG.info("Submitting jobs for subtree placement...")
310
424
  for subtree, query_list in new_subtree_dict.items():
311
425
  total_subtrees_examined += 1
312
- _LOG.info('- Subtree {}/{} with {} queries'.format(
313
- total_subtrees_examined, len(new_subtree_dict), len(query_list)))
314
426
 
315
427
  # empty subtree, continue
316
428
  if len(query_list) == 0:
317
429
  continue
318
- final_subtree_count += 1
319
430
 
320
431
  subtree_dir = os.path.join(workdir, f'subtree_{final_subtree_count}')
321
432
  if not os.path.isdir(subtree_dir):
322
433
  os.makedirs(subtree_dir)
323
434
 
324
435
  # name all temporary output files
325
- tmp_tree = os.path.join(subtree_dir, 'tree')
436
+ tmp_tree = os.path.join(subtree_dir, f'subtree_{final_subtree_count}.tre')
326
437
  tmp_aln = os.path.join(subtree_dir, f'subtree_{final_subtree_count}_aln.fa')
327
438
  tmp_qaln = os.path.join(subtree_dir, f'subtree_{final_subtree_count}_qaln.fa')
328
439
  tmp_output = os.path.join(subtree_dir,
@@ -345,14 +456,16 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
345
456
 
346
457
  # 1.27.2025 - Chengze Shen
347
458
  # choose the placement method to run
459
+ jobs = []
348
460
  if Configs.placement_method == 'epa-ng':
349
461
  job = EPAngJob(path=Configs.epang_path,
350
462
  info_path=Configs.info_path, tree_path=tmp_tree,
351
463
  aln_path=tmp_aln, qaln_path=tmp_qaln,
352
464
  outdir=subtree_dir, num_cpus=Configs.num_cpus)
353
- # for EPA-ng, ensure that outpath name is changed to the one we want
354
- _outpath = job.run(logging=f'subtree_{final_subtree_count}')
355
- os.system('mv {} {}'.format(_outpath, tmp_output))
465
+ jobs.append(job)
466
+ ## for EPA-ng, ensure that outpath name is changed to the one we want
467
+ #_outpath = job.run(logging=f'subtree_{final_subtree_count}')
468
+ #os.system('mv {} {}'.format(_outpath, tmp_output))
356
469
  elif Configs.placement_method == 'pplacer':
357
470
  # build ref_pkg with info and tmp_tree and tmp_aln
358
471
  refpkg_dir = os.path.join(subtree_dir,
@@ -361,17 +474,33 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
361
474
  outdir=refpkg_dir, name=f'subtree_{final_subtree_count}',
362
475
  aln_path=tmp_aln, tree_path=tmp_tree,
363
476
  info_path=Configs.info_path)
364
- _ = taxit_job.run()
477
+ jobs.append(taxit_job)
478
+ #_ = taxit_job.run()
365
479
 
366
480
  # run pplacer-taxtastic
367
481
  job = PplacerTaxtasticJob(path=Configs.pplacer_path,
368
- refpkg_dir=refpkg_dir, model=Configs.model,
482
+ refpkg_dir=refpkg_dir,
483
+ #molecule=Configs.molecule, model=Configs.model,
369
484
  outpath=tmp_output, num_cpus=Configs.num_cpus,
370
485
  qaln_path=tmp_qaln)
371
- tmp_output = job.run(logging=f'subtree_{final_subtree_count}')
486
+ #tmp_output = job.run(logging=f'subtree_{final_subtree_count}')
487
+ jobs.append(job)
372
488
  else:
373
489
  raise ValueError(
374
490
  f"Placement method {Configs.placement_method} not recognized")
491
+ logging = f'subtree_{final_subtree_count}'
492
+ futures.append(pool.submit(placeOneSubtree, *jobs,
493
+ subtree_id=final_subtree_count,
494
+ num_assigned_queries=len(query_list),
495
+ outpath=tmp_output, logging=logging))
496
+ # increment final_subtree_count
497
+ final_subtree_count += 1
498
+
499
+ # deal with outputs
500
+ for future in concurrent.futures.as_completed(futures):
501
+ subtree_id, num_assigned_queries, tmp_output = future.result()
502
+ _LOG.info('- Subtree {}/{} with {} queries'.format(
503
+ subtree_id + 1, final_subtree_count, num_assigned_queries))
375
504
 
376
505
  # read in each placement result
377
506
  place_file = open(tmp_output, 'r')
@@ -391,8 +520,12 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
391
520
  field_to_idx = {field: i for i, field in enumerate(fields)}
392
521
 
393
522
  for tmp_place in place_json["placements"]:
394
- #print(tmp_place)
395
- placed_query_list.append(tmp_place[tgt][0])
523
+ # convert qname back using qname_map_rev
524
+ qname = qname_map_rev[tmp_place[tgt][0]]
525
+ tmp_place[tgt][0] = qname
526
+ placed_query_list.append(qname)
527
+
528
+ #placed_query_list.append(tmp_place[tgt][0])
396
529
  for i in range(len(tmp_place["p"])):
397
530
  edge_num = tmp_place["p"][i][
398
531
  field_to_idx['edge_num']]
@@ -434,6 +567,7 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
434
567
 
435
568
  placements.append(tmp_place.copy())
436
569
  place_file.close()
570
+
437
571
  _LOG.info(f'Final number of subtrees used: {final_subtree_count}')
438
572
 
439
573
  # prepare the output jplace to write
@@ -447,6 +581,7 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
447
581
  _LOG.info('Time to place queries to subtrees: {} seconds'.format(t1 - t0))
448
582
  return jplace
449
583
 
584
+
450
585
  '''
451
586
  Function to write a given jplace object to local output
452
587
  '''
bscampp/jobs.py CHANGED
@@ -112,7 +112,7 @@ class Job(object):
112
112
  else:
113
113
  _LOG.error(error_msg + '\nSTDOUT: ' + stdout +
114
114
  '\nSTDERR: ' + stderr + logpath)
115
- exit(1)
115
+ exit(self.returncode)
116
116
  except Exception:
117
117
  log_exception(_LOG)
118
118
 
@@ -123,6 +123,18 @@ class Job(object):
123
123
  raise NotImplementedError(
124
124
  'get_invocation() should be implemented by subclasses.')
125
125
 
126
+ '''
127
+ Generic job that runs the given command, represented as a list of strings
128
+ '''
129
+ class GenericJob(Job):
130
+ def __init__(self, cmd=[], job_type='external'):
131
+ Job.__init__(self)
132
+ self.job_type = job_type
133
+ self.cmd = cmd
134
+
135
+ def get_invocation(self):
136
+ return self.cmd, None
137
+
126
138
  '''
127
139
  A EPA-ng job that runs EPA-ng with given parameters
128
140
  '''
@@ -137,6 +149,8 @@ class EPAngJob(Job):
137
149
  self.aln_path = ''
138
150
  self.qaln_path = ''
139
151
  self.outdir = ''
152
+ #self.molecule = ''
153
+ #self.model = ''
140
154
  self.num_cpus = 1
141
155
 
142
156
  for k, v in kwargs.items():
@@ -194,7 +208,7 @@ class PplacerTaxtasticJob(Job):
194
208
  self.qaln_path = ''
195
209
  self.outdir = ''
196
210
  self.outpath = ''
197
- self.model = 'GTR'
211
+ #self.model = 'GTR'
198
212
  self.num_cpus = 1
199
213
 
200
214
  for k, v in kwargs.items():
@@ -202,7 +216,8 @@ class PplacerTaxtasticJob(Job):
202
216
 
203
217
  def get_invocation(self):
204
218
  # outpath defined
205
- cmd = [self.path, '-m', self.model,
219
+ cmd = [self.path,
220
+ #'-m', self.model,
206
221
  '-c', self.refpkg_dir, '-o', self.outpath,
207
222
  '-j', str(self.num_cpus), self.qaln_path]
208
223
  return cmd, self.outpath
bscampp/pipeline.py CHANGED
@@ -32,7 +32,9 @@ def bscampp_pipeline(*args, **kwargs):
32
32
 
33
33
  # initialize multiprocessing (if needed)
34
34
  _LOG.warning('Initializing ProcessPoolExecutor...')
35
- pool = ProcessPoolExecutor(Configs.num_cpus, initializer=initial_pool,
35
+ # concurrently run at most Configs.max_workers jobs, each job
36
+ # can use Configs.cpus_per_job threads
37
+ pool = ProcessPoolExecutor(Configs.max_workers, initializer=initial_pool,
36
38
  initargs=(parser, cmdline_args,))
37
39
 
38
40
  # (0) temporary files wrote to here
@@ -48,8 +50,8 @@ def bscampp_pipeline(*args, **kwargs):
48
50
 
49
51
  # (1) read in tree, alignment, and separate reference sequences from
50
52
  # query sequences
51
- tree, leaf_dict, aln_path, aln, qaln_path, qaln = readData(workdir,
52
- dry_run=dry_run)
53
+ tree, leaf_dict, aln_path, aln, qaln_path, qaln, qname_map, qname_map_rev \
54
+ = readData(workdir, dry_run=dry_run)
53
55
 
54
56
  # (2) compute closest leaves for all query sequences
55
57
  query_votes_dict, query_top_vote_dict = getClosestLeaves(
@@ -64,8 +66,9 @@ def bscampp_pipeline(*args, **kwargs):
64
66
 
65
67
  # (4) perform placement for each subtree
66
68
  output_jplace = placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict,
67
- placed_query_list, aln, qaln, cmdline_args, workdir, pool, lock,
68
- dry_run=dry_run)
69
+ placed_query_list, aln, qaln, cmdline_args, workdir,
70
+ qname_map, qname_map_rev,
71
+ pool, lock, dry_run=dry_run)
69
72
 
70
73
  # (5) write the output jplace to local
71
74
  writeOutputJplace(output_jplace, dry_run=dry_run)
@@ -121,8 +124,8 @@ def scampp_pipeline(*args, **kwargs):
121
124
 
122
125
  # (1) read in tree, alignment, and separate reference sequences from
123
126
  # query sequences
124
- tree, leaf_dict, aln_path, aln, qaln_path, qaln = readData(workdir,
125
- dry_run=dry_run)
127
+ tree, leaf_dict, aln_path, aln, qaln_path, qaln, qname_map, qname_map_rev \
128
+ = readData(workdir, dry_run=dry_run)
126
129
 
127
130
  # (2) compute closest leaves for all query sequences
128
131
  query_votes_dict, query_top_vote_dict = getClosestLeaves(
@@ -136,8 +139,9 @@ def scampp_pipeline(*args, **kwargs):
136
139
 
137
140
  # (4) perform placement for each subtree
138
141
  output_jplace = placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict,
139
- placed_query_list, aln, qaln, cmdline_args, workdir, pool, lock,
140
- dry_run=dry_run)
142
+ placed_query_list, aln, qaln, cmdline_args, workdir,
143
+ qname_map, qname_map_rev,
144
+ pool, lock, dry_run=dry_run)
141
145
 
142
146
  # (5) write the output jplace to local
143
147
  writeOutputJplace(output_jplace, dry_run=dry_run)
@@ -257,14 +261,20 @@ def _init_parser(default_outdir="bscampp_output",
257
261
  basic_group.add_argument("-a", "--alignment", "--aln-path", type=str,
258
262
  dest="aln_path",
259
263
  help=("Path for reference sequence alignment in "
260
- "FASTA format. Optionally with query sequences. "
264
+ "FASTA format (can be a .gz file). "
265
+ "Optionally with query sequences. "
261
266
  "Query alignment can be specified with --qaln-path"),
262
267
  required=required, default=None)
263
268
  basic_group.add_argument("-q", "--qalignment", "--qaln-path", type=str,
264
269
  dest="qaln_path",
265
270
  help=("Optionally provide path to query sequence alignment "
266
- "in FASTA format. Default: None"),
271
+ "in FASTA format (can be a .gz file). Default: None"),
267
272
  required=False, default=None)
273
+ #basic_group.add_argument("--molecule", type=str,
274
+ # choices=['nucl', 'nucleotide', 'prot', 'protein'],
275
+ # help=("Specify nucleotide or protein sequences. "
276
+ # "Default: infer datatype"),
277
+ # required=False, default=None)
268
278
  basic_group.add_argument("-d", "--outdir", type=str,
269
279
  help="Directory path for output. Default: bscampp_output/",
270
280
  required=False, default=default_outdir)
@@ -275,6 +285,9 @@ def _init_parser(default_outdir="bscampp_output",
275
285
  dest="num_cpus",
276
286
  help="Number of cores for parallelization, default: -1 (all)",
277
287
  required=False, default=-1)
288
+ basic_group.add_argument("--cpus-per-job", type=int,
289
+ help="Number of cores to use for each job, default: 2",
290
+ required=False, default=2)
278
291
 
279
292
  # advanced parameter settings
280
293
  advance_group = parser.add_argument_group(
@@ -284,16 +297,24 @@ def _init_parser(default_outdir="bscampp_output",
284
297
  ))
285
298
  parser.groups['advance_group'] = advance_group
286
299
 
287
- advance_group.add_argument("-m", "--model", type=str,
288
- help="Model used for edge distances. Default: GTR",
289
- required=False, default="GTR")
300
+ #advance_group.add_argument("-m", "--model", type=str,
301
+ # help=("Model used for edge distances. EPA-ng will use the "
302
+ # "provided info_path (*.bestModel) for model. "
303
+ # "Default: GTR for nucleotide, LG for protein"),
304
+ # required=False, default=None)
290
305
  advance_group.add_argument("-b", "--subtreesize", type=int,
291
306
  help="Integer size of the subtree. Default: 2000",
292
307
  required=False, default=2000)
293
308
  advance_group.add_argument("-V", "--votes", type=int,
294
- help="This is only used for BSCAMPP! Number of votes per "
295
- "query sequence. Default: 5",
309
+ help="(BSCAMPP only) Number of votes per query sequence. "
310
+ "Default: 5",
296
311
  required=False, default=5)
312
+ advance_group.add_argument("--subtreetype", type=str,
313
+ help="(SCAMPP only) Options for collecting "
314
+ "nodes for the subtree - d for edge weighted "
315
+ "distances, n for node distances, h for Hamming "
316
+ "distances. Default: d",
317
+ required=False, default='d')
297
318
  advance_group.add_argument("--similarityflag", type=str2bool,
298
319
  help="Boolean, True if maximizing sequence similarity "
299
320
  "instead of simple Hamming distance (ignoring gap "
@@ -306,17 +327,12 @@ def _init_parser(default_outdir="bscampp_output",
306
327
  parser.groups['misc_group'] = misc_group
307
328
 
308
329
  misc_group.add_argument("-n","--tmpfilenbr", type=int,
309
- help="Temporary file indexing. Default: 0",
330
+ help="Temporary file indexing (e.g., tmp0/). Default: 0",
310
331
  required=False, default=0)
311
332
  misc_group.add_argument("--fragmentflag", type=str2bool,
312
- help="If queries contains fragments. Default: True",
333
+ help="If queries contains fragments. Does not do anything "
334
+ "if similarity flag is set to True. Default: True",
313
335
  required=False, default=True)
314
- misc_group.add_argument("--subtreetype", type=str,
315
- help="(SCAMPP only) Options for collecting "
316
- "nodes for the subtree - d for edge weighted "
317
- "distances, n for node distances, h for Hamming "
318
- "distances. Default: d",
319
- required=False, default='d')
320
336
  misc_group.add_argument("--keeptemp", type=str2bool,
321
337
  help="Boolean, True to keep all temporary files. "
322
338
  "Default: False",
bscampp/utils.py CHANGED
@@ -9,8 +9,11 @@ import random
9
9
  import statistics
10
10
  import copy
11
11
  import gzip
12
-
13
12
  import argparse
13
+
14
+ from bscampp import get_logger, log_exception
15
+ _LOG = get_logger(__name__)
16
+
14
17
  # reformat argparse help text formatting
15
18
  class SmartHelpFormatter(argparse.RawDescriptionHelpFormatter):
16
19
  def add_text(self, text):
@@ -36,6 +39,34 @@ BRACKET = {
36
39
  }
37
40
 
38
41
 
42
+ # infer datatype from input file
43
+ def inferDataType(path):
44
+ sequences = read_data(path)
45
+ acg, t, u, total = 0, 0, 0, 0
46
+ for taxon, seq in sequences.items():
47
+ letters = seq.upper()
48
+ for letter in letters:
49
+ total = total + 1
50
+
51
+ if letter in ('A', 'C', 'G', 'N'):
52
+ acg += 1
53
+ elif letter == 'T':
54
+ t += 1
55
+ elif letter == 'U':
56
+ u += 1
57
+ # dna -> nucleotide
58
+ if u == 0 and (acg + t) / total > 0.9:
59
+ datatype = 'nucleotide'
60
+ # rna -> nucleotide
61
+ elif t == 0 and (acg + u) / total > 0.9:
62
+ datatype = 'nucleotide'
63
+ # amino acid -> protein
64
+ else:
65
+ datatype = 'protein'
66
+
67
+ _LOG.info(f"Inferred input data type: {datatype}")
68
+ return datatype
69
+
39
70
  def write_fasta(aln, aln_dict, aligned=True):
40
71
  """ Write given dictionary as FASTA file out
41
72
 
@@ -76,7 +107,12 @@ def read_data(aln):
76
107
 
77
108
  """
78
109
 
79
- f = open(aln)
110
+ # determine the file type, whether we have a .gz/.gzip file
111
+ suffix = aln.split('.')[-1]
112
+ if suffix in ['gz', 'gzip']:
113
+ f = gzip.open(aln, 'rt')
114
+ else:
115
+ f = open(aln)
80
116
  result = dict()
81
117
 
82
118
  taxa = ""
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: bscampp
3
- Version: 1.0.3
4
- Summary: BSCAMPP - A Scalable Phylogenetic Placement Tool
3
+ Version: 1.0.6
4
+ Summary: BSCAMPP and SCAMPP - Scalable Phylogenetic Placement Tools
5
5
  Author-email: Eleanor Wedell <ewedell2@illinois.edu>, Chengze Shen <chengze5@illinois.edu>
6
6
  License: MIT License
7
7
 
@@ -66,7 +66,7 @@ Requires-Dist: taxtastic>=0.9.3
66
66
  # Overview
67
67
  * **Inputs**
68
68
  1. Reference tree to place sequences into.
69
- 2. Alignment of reference sequences.
69
+ 2. Alignment of reference sequences (protein or nucleotide).
70
70
  3. Alignment of query sequences (can be combined with ii.).
71
71
  4. Tree info file.
72
72
  - (EPA-ng as base method), RAxML-ng info file, typically with suffix `.bestModel`.
@@ -230,16 +230,17 @@ run_bscampp.py -i [raxml best model] -t [reference tree] -a [reference alignment
230
230
  > Output file name. Default: bscampp_result.jplace
231
231
  > --threads NUM_CPUS, --num-cpus NUM_CPUS
232
232
  > Number of cores for parallelization, default: -1 (all)
233
+ > --cpus-per-job CPUS_PER_JOB
234
+ > Number of cores to use for each job, default: 2
233
235
  >
234
236
  > ADVANCE PARAMETERS:
235
- > These parameters control how BSCAMPP is run. The default values are set based on experiments.
237
+ > These parameters control how BSCAMPP and SCAMPP are run. The default values are set based on experiments.
236
238
  >
237
- > -m MODEL, --model MODEL
238
- > Model used for edge distances. Default: GTR
239
239
  > -b SUBTREESIZE, --subtreesize SUBTREESIZE
240
240
  > Integer size of the subtree. Default: 2000
241
241
  > -V VOTES, --votes VOTES
242
- > Number of votes per query sequence. Default: 5
242
+ > (BSCAMPP only) Number of votes per query sequence.
243
+ > Default: 5
243
244
  > --similarityflag SIMILARITYFLAG
244
245
  > Boolean, True if maximizing sequence similarity
245
246
  > instead of simple Hamming distance (ignoring gap sites
@@ -1,11 +1,11 @@
1
- bscampp/__init__.py,sha256=toGV8EzvMKviV7xHahhXs0K6fAmHw2cnWb6EDscpIOY,2289
2
- bscampp/configs.py,sha256=3HJHLN2fLV5Tv3TJL95NpOuSXUV6CvqxRqCOM6TpbJQ,5767
1
+ bscampp/__init__.py,sha256=eDIMYifzKrFdtA3Ac7OvPTyIHUO1ZLgVaM0pKFxxEHE,2289
2
+ bscampp/configs.py,sha256=perl6u5hto6J3JV1JMbsTQ6tqr2uGOk-Z9jfzflid0s,6122
3
3
  bscampp/default.config,sha256=CEfsUHBy--vwJhEcUuJ0btfuGQWb_lKMVWUIP9f5YGw,112
4
- bscampp/functions.py,sha256=QYI5RsUEMGc6jLPzFdInpmxA8wiYyN7785P3WxWYiTo,17839
4
+ bscampp/functions.py,sha256=DGHQJLLzXSghDKbha0LW0YPip_45M6MI4t3zdDpzULI,22448
5
5
  bscampp/init_configs.py,sha256=EA9sMN5jWj6zj2b-7tN19LhX2Ef61ByQLxQRLHAqLDM,3600
6
- bscampp/jobs.py,sha256=PrVMJBabi4cYlrxVLo37XPOY82fY0zZ8Iyp9CWCNWhU,7181
7
- bscampp/pipeline.py,sha256=C6I1vWeA6Rq_spPHy_il1FJA_DomWHUHYHLUUk9SnLk,13024
8
- bscampp/utils.py,sha256=ragaI14Lqb2fVp_uYDkFQnV7a50G9-sUOWdVM-sNhUE,29005
6
+ bscampp/jobs.py,sha256=v7buZJs1AnNoXiILwu-W8fo3QjxAh3i9Mp7xfmlJvAY,7569
7
+ bscampp/pipeline.py,sha256=IPZnXZmVxGGfbVUuGCQh5X9oBq48-6pA9QkuvMGPTag,14000
8
+ bscampp/utils.py,sha256=-wns6FaWMKD2wVqjxdBQvjTdagTjywBIaGfqb2mupe4,30039
9
9
  bscampp/tools/epa-ng,sha256=f3EVoZAAOXLN6l521qp-TrWDl5J2nqL3tGgjPaQE9WQ,3772096
10
10
  bscampp/tools/pplacer,sha256=p0H4eo9uuiYoWS_kJbPfauOV99i7BXJdZSiwXIuLxTw,7834576
11
11
  bscampp/tools/hamming_distance/CMakeLists.txt,sha256=yf9iq7Y61t3WObJHoR4RoGDEvUw_Q8JW3UnI4uh0cfU,389
@@ -17,9 +17,9 @@ bscampp/tools/hamming_distance/src/fragment_tree_hamming.cpp,sha256=xCmyAT-OZJOD
17
17
  bscampp/tools/hamming_distance/src/fragment_tree_hamming_new.cpp,sha256=eKxgODRlpf0hU84QjNhigvRhWCT9tiJZjA5oQFQ1bUk,7404
18
18
  bscampp/tools/hamming_distance/src/homology.cpp,sha256=ZE0uXZWQ-cN4U1Wk5kUr_KKHgzsgA6Sno-IViRa4tmI,6053
19
19
  bscampp/tools/hamming_distance/src/new_hamming.cpp,sha256=fBRm99RquBZgZjaLOn9xDI3cH9NchhrxKbL-11j8fmk,5342
20
- bscampp-1.0.3.dist-info/LICENSE,sha256=HEa4YQdOR0e2Gz-NiOwr9X6aJcZtY0AGmlJQDmfN0Iw,1064
21
- bscampp-1.0.3.dist-info/METADATA,sha256=01Vl-oCadCIiWFBLA564CLNErXILqEzdRrQNPpGy_mc,12507
22
- bscampp-1.0.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
23
- bscampp-1.0.3.dist-info/entry_points.txt,sha256=4Ft83qHc39tNNpMLgSgFXDHM-vuAB99JtmczCQj5pq8,204
24
- bscampp-1.0.3.dist-info/top_level.txt,sha256=1loGRUAft6Tcdq0f3lHbVwWN7W_SW1srfhAVSpg9DWE,8
25
- bscampp-1.0.3.dist-info/RECORD,,
20
+ bscampp-1.0.6.dist-info/LICENSE,sha256=HEa4YQdOR0e2Gz-NiOwr9X6aJcZtY0AGmlJQDmfN0Iw,1064
21
+ bscampp-1.0.6.dist-info/METADATA,sha256=0sWAKK30wlps8i0d1BdFqyv5MZVgefRnTn_-yMmO8lQ,12602
22
+ bscampp-1.0.6.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
23
+ bscampp-1.0.6.dist-info/entry_points.txt,sha256=4Ft83qHc39tNNpMLgSgFXDHM-vuAB99JtmczCQj5pq8,204
24
+ bscampp-1.0.6.dist-info/top_level.txt,sha256=1loGRUAft6Tcdq0f3lHbVwWN7W_SW1srfhAVSpg9DWE,8
25
+ bscampp-1.0.6.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.0)
2
+ Generator: setuptools (76.0.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5