bscampp 1.0.2b0__py3-none-any.whl → 1.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bscampp/__init__.py CHANGED
@@ -12,7 +12,7 @@ import logging, os
12
12
  # not really needed for BSCAMPP but safe to update here
13
13
  os.sys.setrecursionlimit(1000000)
14
14
 
15
- __version__ = "1.0.2b"
15
+ __version__ = "1.0.5"
16
16
  _INSTALL_PATH = __path__[0]
17
17
 
18
18
  # global variables to store all loggers
bscampp/configs.py CHANGED
@@ -6,6 +6,7 @@ except ImportError:
6
6
  from argparse import ArgumentParser, Namespace
7
7
  from bscampp.init_configs import init_config_file
8
8
  from bscampp import get_logger, log_exception
9
+ #from bscampp.utils import inferDataType
9
10
 
10
11
  # detect home.path or create if missing
11
12
  homepath = os.path.dirname(__file__) + '/home.path'
@@ -33,6 +34,9 @@ class Configs:
33
34
  keeptemp = False # whether to keep all temporary files
34
35
  verbose = 'INFO' # default verbose level to print
35
36
  num_cpus = 1 # number of cores to use for parallelization
37
+ cpus_per_job = 2 # number of cores to use per job
38
+ max_workers = 1 # max_workers for ProcessPoolExecutor
39
+ # ... = max(1, num_cpus // cpus_per_job)
36
40
 
37
41
  # binaries
38
42
  pplacer_path = None
@@ -42,7 +46,6 @@ class Configs:
42
46
 
43
47
  # placement settings
44
48
  placement_method = 'epa-ng'
45
- model = 'GTR'
46
49
  subtreesize = 2000
47
50
  votes = 5
48
51
  similarityflag = True
@@ -162,6 +165,8 @@ def buildConfigs(parser, cmdline_args, child_process=False, rerun=False):
162
165
  Configs.num_cpus = min(os.cpu_count(), Configs.num_cpus)
163
166
  else:
164
167
  Configs.num_cpus = os.cpu_count()
168
+ # compute max_workers based on num_cpus and cpus_per_job
169
+ Configs.max_workers = max(1, Configs.num_cpus // Configs.cpus_per_job)
165
170
 
166
171
  # sanity check for existence of base placement binary path
167
172
  if Configs.placement_method == 'epa-ng':
bscampp/functions.py CHANGED
@@ -8,6 +8,8 @@ from bscampp.jobs import EPAngJob, TaxtasticJob, PplacerTaxtasticJob
8
8
  from bscampp.utils import write_fasta
9
9
  import bscampp.utils as utils
10
10
 
11
+ import concurrent.futures
12
+
11
13
  _LOG = get_logger(__name__)
12
14
 
13
15
  '''
@@ -20,7 +22,7 @@ def readData(workdir, dry_run=False):
20
22
  _LOG.info('Reading in input data...')
21
23
 
22
24
  if dry_run:
23
- return None, dict(), '', dict(), '', dict()
25
+ return None, dict(), '', dict(), '', dict(), dict(), dict()
24
26
 
25
27
  # (1) load reference tree
26
28
  tree = treeswift.read_tree_newick(Configs.tree_path)
@@ -40,22 +42,39 @@ def readData(workdir, dry_run=False):
40
42
  if Configs.qaln_path is not None:
41
43
  ref_dict = utils.read_data(Configs.aln_path)
42
44
  q_dict = utils.read_data(Configs.qaln_path)
43
- aln_path, qaln_path = Configs.aln_path, Configs.qaln_path
45
+ #aln_path, qaln_path = Configs.aln_path, Configs.qaln_path
44
46
  else:
45
47
  aln_dict = utils.read_data(Configs.aln_path)
46
48
  ref_dict, q_dict = utils.seperate(aln_dict, leaf_dict)
47
49
 
48
- # after separating queries from the reference alignment, write
49
- # them to to TEMP/
50
- qaln_path = os.path.join(workdir, 'qaln.fa')
51
- write_fasta(qaln_path, q_dict)
52
-
53
- aln_path = os.path.join(workdir, 'aln.fa')
54
- write_fasta(aln_path, ref_dict)
50
+ # after separating queries from the reference alignment, write
51
+ # them to TEMP/
52
+ # Updated on 3.5.2025 by Chengze Shen
53
+ # - regardless of the input choices, write a copy of both reference
54
+ # and query alignment to the workdir
55
+ qaln_path = os.path.join(workdir, 'qaln.fa')
56
+ write_fasta(qaln_path, q_dict)
57
+
58
+ aln_path = os.path.join(workdir, 'aln.fa')
59
+ write_fasta(aln_path, ref_dict)
60
+
61
+ # Added on 3.8.2025 by Chengze Shen
62
+ # - to ensure that query names containing special characters are safe
63
+ # (e.g., having ":" can cause trouble), use a qname_map that maps
64
+ # each taxon name to an idx
65
+ qidx = 1
66
+ qname_map = dict()
67
+ qname_map_rev = dict()
68
+ for name in q_dict.keys():
69
+ cvt = str(qidx).zfill(16) # 16 digits
70
+ qname_map[name] = cvt
71
+ qname_map_rev[cvt] = name
72
+ qidx += 1
55
73
 
56
74
  t1 = time.perf_counter()
57
75
  _LOG.info('Time to read in input data: {} seconds'.format(t1 - t0))
58
- return tree, leaf_dict, aln_path, ref_dict, qaln_path, q_dict
76
+ return tree, leaf_dict, aln_path, ref_dict, qaln_path, q_dict, \
77
+ qname_map, qname_map_rev
59
78
 
60
79
  '''
61
80
  Function to get the closest leaf for each query sequence based on Hamming
@@ -282,16 +301,30 @@ def buildQuerySubtrees(query_votes_dict, query_top_vote_dict,
282
301
  '''
283
302
  Helper function to run a single placement task. Designed to use with
284
303
  multiprocessing
304
+ Input: job object
305
+ Return: outpath from job.run()
285
306
  '''
286
- def placeOneSubtree():
287
- # TODO
288
- pass
307
def placeOneSubtree(*jobs,
        subtree_id=0, num_assigned_queries=-1, outpath=None, logging=None):
    '''
    Run every job of one subtree, in the order given, and report back which
    subtree finished.

    Input:
        *jobs                   job objects exposing .job_type and
                                .run(logging=...); the last one is treated
                                as the placement job
        subtree_id              identifier echoed back to the caller
        num_assigned_queries    number of queries echoed back to the caller
        outpath                 path the placement output should end up at
        logging                 tag forwarded to each job.run() call
    Return: (subtree_id, num_assigned_queries, outpath)
    Raises: whatever job.run() raises, and OSError/shutil.Error if the
            EPA-ng output cannot be moved to outpath
    '''
    # local import so this block does not depend on the file-level imports
    import shutil

    # only the values of the LAST job are kept; that job is the placement job
    job_type, produced_path = None, None
    for job in jobs:
        job_type = job.job_type
        produced_path = job.run(logging=logging)

    # the EPA-ng job reports its own output path, so relocate that file to
    # the requested outpath. shutil.move is used instead of a shell "mv"
    # string so that paths with spaces or shell metacharacters work and a
    # failed move raises instead of being silently ignored.
    if job_type == 'epa-ng' and outpath is not None:
        shutil.move(produced_path, outpath)
    return subtree_id, num_assigned_queries, outpath
289
321
 
290
322
  '''
291
323
  Function to perform placement of queries for each subtree
292
324
  '''
293
325
  def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
294
- aln, qaln, cmdline_args, workdir, pool, lock, dry_run=False):
326
+ aln, qaln, cmdline_args, workdir, qname_map, qname_map_rev,
327
+ pool, lock, dry_run=False):
295
328
  t0 = time.perf_counter()
296
329
  _LOG.info('Performing placement on each subtree...')
297
330
 
@@ -307,22 +340,21 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
307
340
  # go over the dictionary of subtrees and their assigned queries
308
341
  # perform placement using either EPA-ng or pplacer
309
342
  final_subtree_count, total_subtrees_examined = 0, 0
343
+ futures = []
344
+ _LOG.info("Submitting jobs for subtree placement...")
310
345
  for subtree, query_list in new_subtree_dict.items():
311
346
  total_subtrees_examined += 1
312
- _LOG.info('- Subtree {}/{} with {} queries'.format(
313
- total_subtrees_examined, len(new_subtree_dict), len(query_list)))
314
347
 
315
348
  # empty subtree, continue
316
349
  if len(query_list) == 0:
317
350
  continue
318
- final_subtree_count += 1
319
351
 
320
352
  subtree_dir = os.path.join(workdir, f'subtree_{final_subtree_count}')
321
353
  if not os.path.isdir(subtree_dir):
322
354
  os.makedirs(subtree_dir)
323
355
 
324
356
  # name all temporary output files
325
- tmp_tree = os.path.join(subtree_dir, 'tree')
357
+ tmp_tree = os.path.join(subtree_dir, f'subtree_{final_subtree_count}.tre')
326
358
  tmp_aln = os.path.join(subtree_dir, f'subtree_{final_subtree_count}_aln.fa')
327
359
  tmp_qaln = os.path.join(subtree_dir, f'subtree_{final_subtree_count}_qaln.fa')
328
360
  tmp_output = os.path.join(subtree_dir,
@@ -334,7 +366,10 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
334
366
  if '' in tmp_leaf_dict:
335
367
  del tmp_leaf_dict['']
336
368
  tmp_ref_dict = {label : aln[label] for label in tmp_leaf_dict.keys()}
337
- tmp_q_dict = {name : qaln[name] for name in query_list}
369
+ # Changed @ 3.8.2025 by Chengze Shen
370
+ # - write converted names for the query sequences and convert them
371
+ # - back to the original names once placements are done
372
+ tmp_q_dict = {qname_map[name] : qaln[name] for name in query_list}
338
373
  write_fasta(tmp_aln, tmp_ref_dict)
339
374
  write_fasta(tmp_qaln, tmp_q_dict)
340
375
 
@@ -345,14 +380,16 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
345
380
 
346
381
  # 1.27.2025 - Chengze Shen
347
382
  # choose the placement method to run
383
+ jobs = []
348
384
  if Configs.placement_method == 'epa-ng':
349
385
  job = EPAngJob(path=Configs.epang_path,
350
386
  info_path=Configs.info_path, tree_path=tmp_tree,
351
387
  aln_path=tmp_aln, qaln_path=tmp_qaln,
352
388
  outdir=subtree_dir, num_cpus=Configs.num_cpus)
353
- # for EPA-ng, ensure that outpath name is changed to the one we want
354
- _outpath = job.run(logging=f'subtree_{final_subtree_count}')
355
- os.system('mv {} {}'.format(_outpath, tmp_output))
389
+ jobs.append(job)
390
+ ## for EPA-ng, ensure that outpath name is changed to the one we want
391
+ #_outpath = job.run(logging=f'subtree_{final_subtree_count}')
392
+ #os.system('mv {} {}'.format(_outpath, tmp_output))
356
393
  elif Configs.placement_method == 'pplacer':
357
394
  # build ref_pkg with info and tmp_tree and tmp_aln
358
395
  refpkg_dir = os.path.join(subtree_dir,
@@ -361,17 +398,33 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
361
398
  outdir=refpkg_dir, name=f'subtree_{final_subtree_count}',
362
399
  aln_path=tmp_aln, tree_path=tmp_tree,
363
400
  info_path=Configs.info_path)
364
- _ = taxit_job.run()
401
+ jobs.append(taxit_job)
402
+ #_ = taxit_job.run()
365
403
 
366
404
  # run pplacer-taxtastic
367
405
  job = PplacerTaxtasticJob(path=Configs.pplacer_path,
368
- refpkg_dir=refpkg_dir, model=Configs.model,
406
+ refpkg_dir=refpkg_dir,
407
+ #molecule=Configs.molecule, model=Configs.model,
369
408
  outpath=tmp_output, num_cpus=Configs.num_cpus,
370
409
  qaln_path=tmp_qaln)
371
- tmp_output = job.run(logging=f'subtree_{final_subtree_count}')
410
+ #tmp_output = job.run(logging=f'subtree_{final_subtree_count}')
411
+ jobs.append(job)
372
412
  else:
373
413
  raise ValueError(
374
414
  f"Placement method {Configs.placement_method} not recognized")
415
+ logging = f'subtree_{final_subtree_count}'
416
+ futures.append(pool.submit(placeOneSubtree, *jobs,
417
+ subtree_id=final_subtree_count,
418
+ num_assigned_queries=len(query_list),
419
+ outpath=tmp_output, logging=logging))
420
+ # increment final_subtree_count
421
+ final_subtree_count += 1
422
+
423
+ # deal with outputs
424
+ for future in concurrent.futures.as_completed(futures):
425
+ subtree_id, num_assigned_queries, tmp_output = future.result()
426
+ _LOG.info('- Subtree {}/{} with {} queries'.format(
427
+ subtree_id + 1, final_subtree_count, num_assigned_queries))
375
428
 
376
429
  # read in each placement result
377
430
  place_file = open(tmp_output, 'r')
@@ -391,8 +444,12 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
391
444
  field_to_idx = {field: i for i, field in enumerate(fields)}
392
445
 
393
446
  for tmp_place in place_json["placements"]:
394
- #print(tmp_place)
395
- placed_query_list.append(tmp_place[tgt][0])
447
+ # convert qname back using qname_map_rev
448
+ qname = qname_map_rev[tmp_place[tgt][0]]
449
+ tmp_place[tgt][0] = qname
450
+ placed_query_list.append(qname)
451
+
452
+ #placed_query_list.append(tmp_place[tgt][0])
396
453
  for i in range(len(tmp_place["p"])):
397
454
  edge_num = tmp_place["p"][i][
398
455
  field_to_idx['edge_num']]
@@ -434,6 +491,7 @@ def placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict, placed_query_list,
434
491
 
435
492
  placements.append(tmp_place.copy())
436
493
  place_file.close()
494
+
437
495
  _LOG.info(f'Final number of subtrees used: {final_subtree_count}')
438
496
 
439
497
  # prepare the output jplace to write
bscampp/jobs.py CHANGED
@@ -137,6 +137,8 @@ class EPAngJob(Job):
137
137
  self.aln_path = ''
138
138
  self.qaln_path = ''
139
139
  self.outdir = ''
140
+ #self.molecule = ''
141
+ #self.model = ''
140
142
  self.num_cpus = 1
141
143
 
142
144
  for k, v in kwargs.items():
@@ -194,7 +196,7 @@ class PplacerTaxtasticJob(Job):
194
196
  self.qaln_path = ''
195
197
  self.outdir = ''
196
198
  self.outpath = ''
197
- self.model = 'GTR'
199
+ #self.model = 'GTR'
198
200
  self.num_cpus = 1
199
201
 
200
202
  for k, v in kwargs.items():
@@ -202,7 +204,8 @@ class PplacerTaxtasticJob(Job):
202
204
 
203
205
  def get_invocation(self):
204
206
  # outpath defined
205
- cmd = [self.path, '-m', self.model,
207
+ cmd = [self.path,
208
+ #'-m', self.model,
206
209
  '-c', self.refpkg_dir, '-o', self.outpath,
207
210
  '-j', str(self.num_cpus), self.qaln_path]
208
211
  return cmd, self.outpath
bscampp/pipeline.py CHANGED
@@ -32,7 +32,9 @@ def bscampp_pipeline(*args, **kwargs):
32
32
 
33
33
  # initialize multiprocessing (if needed)
34
34
  _LOG.warning('Initializing ProcessPoolExecutor...')
35
- pool = ProcessPoolExecutor(Configs.num_cpus, initializer=initial_pool,
35
+ # run at most Configs.max_workers jobs concurrently, where
36
+ # max_workers = max(1, num_cpus // cpus_per_job)
37
+ pool = ProcessPoolExecutor(Configs.max_workers, initializer=initial_pool,
36
38
  initargs=(parser, cmdline_args,))
37
39
 
38
40
  # (0) temporary files wrote to here
@@ -48,8 +50,8 @@ def bscampp_pipeline(*args, **kwargs):
48
50
 
49
51
  # (1) read in tree, alignment, and separate reference sequences from
50
52
  # query sequences
51
- tree, leaf_dict, aln_path, aln, qaln_path, qaln = readData(workdir,
52
- dry_run=dry_run)
53
+ tree, leaf_dict, aln_path, aln, qaln_path, qaln, qname_map, qname_map_rev \
54
+ = readData(workdir, dry_run=dry_run)
53
55
 
54
56
  # (2) compute closest leaves for all query sequences
55
57
  query_votes_dict, query_top_vote_dict = getClosestLeaves(
@@ -64,8 +66,9 @@ def bscampp_pipeline(*args, **kwargs):
64
66
 
65
67
  # (4) perform placement for each subtree
66
68
  output_jplace = placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict,
67
- placed_query_list, aln, qaln, cmdline_args, workdir, pool, lock,
68
- dry_run=dry_run)
69
+ placed_query_list, aln, qaln, cmdline_args, workdir,
70
+ qname_map, qname_map_rev,
71
+ pool, lock, dry_run=dry_run)
69
72
 
70
73
  # (5) write the output jplace to local
71
74
  writeOutputJplace(output_jplace, dry_run=dry_run)
@@ -121,8 +124,8 @@ def scampp_pipeline(*args, **kwargs):
121
124
 
122
125
  # (1) read in tree, alignment, and separate reference sequences from
123
126
  # query sequences
124
- tree, leaf_dict, aln_path, aln, qaln_path, qaln = readData(workdir,
125
- dry_run=dry_run)
127
+ tree, leaf_dict, aln_path, aln, qaln_path, qaln, qname_map, qname_map_rev \
128
+ = readData(workdir, dry_run=dry_run)
126
129
 
127
130
  # (2) compute closest leaves for all query sequences
128
131
  query_votes_dict, query_top_vote_dict = getClosestLeaves(
@@ -136,8 +139,9 @@ def scampp_pipeline(*args, **kwargs):
136
139
 
137
140
  # (4) perform placement for each subtree
138
141
  output_jplace = placeQueriesToSubtrees(tree, leaf_dict, new_subtree_dict,
139
- placed_query_list, aln, qaln, cmdline_args, workdir, pool, lock,
140
- dry_run=dry_run)
142
+ placed_query_list, aln, qaln, cmdline_args, workdir,
143
+ qname_map, qname_map_rev,
144
+ pool, lock, dry_run=dry_run)
141
145
 
142
146
  # (5) write the output jplace to local
143
147
  writeOutputJplace(output_jplace, dry_run=dry_run)
@@ -257,14 +261,20 @@ def _init_parser(default_outdir="bscampp_output",
257
261
  basic_group.add_argument("-a", "--alignment", "--aln-path", type=str,
258
262
  dest="aln_path",
259
263
  help=("Path for reference sequence alignment in "
260
- "FASTA format. Optionally with query sequences. "
264
+ "FASTA format (can be a .gz file). "
265
+ "Optionally with query sequences. "
261
266
  "Query alignment can be specified with --qaln-path"),
262
267
  required=required, default=None)
263
268
  basic_group.add_argument("-q", "--qalignment", "--qaln-path", type=str,
264
269
  dest="qaln_path",
265
270
  help=("Optionally provide path to query sequence alignment "
266
- "in FASTA format. Default: None"),
271
+ "in FASTA format (can be a .gz file). Default: None"),
267
272
  required=False, default=None)
273
+ #basic_group.add_argument("--molecule", type=str,
274
+ # choices=['nucl', 'nucleotide', 'prot', 'protein'],
275
+ # help=("Specify nucleotide or protein sequences. "
276
+ # "Default: infer datatype"),
277
+ # required=False, default=None)
268
278
  basic_group.add_argument("-d", "--outdir", type=str,
269
279
  help="Directory path for output. Default: bscampp_output/",
270
280
  required=False, default=default_outdir)
@@ -275,6 +285,9 @@ def _init_parser(default_outdir="bscampp_output",
275
285
  dest="num_cpus",
276
286
  help="Number of cores for parallelization, default: -1 (all)",
277
287
  required=False, default=-1)
288
+ basic_group.add_argument("--cpus-per-job", type=int,
289
+ help="Number of cores to use for each job, default: 2",
290
+ required=False, default=2)
278
291
 
279
292
  # advanced parameter settings
280
293
  advance_group = parser.add_argument_group(
@@ -284,16 +297,24 @@ def _init_parser(default_outdir="bscampp_output",
284
297
  ))
285
298
  parser.groups['advance_group'] = advance_group
286
299
 
287
- advance_group.add_argument("-m", "--model", type=str,
288
- help="Model used for edge distances. Default: GTR",
289
- required=False, default="GTR")
300
+ #advance_group.add_argument("-m", "--model", type=str,
301
+ # help=("Model used for edge distances. EPA-ng will use the "
302
+ # "provided info_path (*.bestModel) for model. "
303
+ # "Default: GTR for nucleotide, LG for protein"),
304
+ # required=False, default=None)
290
305
  advance_group.add_argument("-b", "--subtreesize", type=int,
291
306
  help="Integer size of the subtree. Default: 2000",
292
307
  required=False, default=2000)
293
308
  advance_group.add_argument("-V", "--votes", type=int,
294
- help="This is only used for BSCAMPP! Number of votes per "
295
- "query sequence. Default: 5",
309
+ help="(BSCAMPP only) Number of votes per query sequence. "
310
+ "Default: 5",
296
311
  required=False, default=5)
312
+ advance_group.add_argument("--subtreetype", type=str,
313
+ help="(SCAMPP only) Options for collecting "
314
+ "nodes for the subtree - d for edge weighted "
315
+ "distances, n for node distances, h for Hamming "
316
+ "distances. Default: d",
317
+ required=False, default='d')
297
318
  advance_group.add_argument("--similarityflag", type=str2bool,
298
319
  help="Boolean, True if maximizing sequence similarity "
299
320
  "instead of simple Hamming distance (ignoring gap "
@@ -306,17 +327,12 @@ def _init_parser(default_outdir="bscampp_output",
306
327
  parser.groups['misc_group'] = misc_group
307
328
 
308
329
  misc_group.add_argument("-n","--tmpfilenbr", type=int,
309
- help="Temporary file indexing. Default: 0",
330
+ help="Temporary file indexing (e.g., tmp0/). Default: 0",
310
331
  required=False, default=0)
311
332
  misc_group.add_argument("--fragmentflag", type=str2bool,
312
- help="If queries contains fragments. Default: True",
333
+ help="If queries contains fragments. Does not do anything "
334
+ "if similarity flag is set to True. Default: True",
313
335
  required=False, default=True)
314
- misc_group.add_argument("--subtreetype", type=str,
315
- help="(SCAMPP only) Options for collecting "
316
- "nodes for the subtree - d for edge weighted "
317
- "distances, n for node distances, h for Hamming "
318
- "distances. Default: d",
319
- required=False, default='d')
320
336
  misc_group.add_argument("--keeptemp", type=str2bool,
321
337
  help="Boolean, True to keep all temporary files. "
322
338
  "Default: False",
bscampp/utils.py CHANGED
@@ -9,8 +9,11 @@ import random
9
9
  import statistics
10
10
  import copy
11
11
  import gzip
12
-
13
12
  import argparse
13
+
14
+ from bscampp import get_logger, log_exception
15
+ _LOG = get_logger(__name__)
16
+
14
17
  # reformat argparse help text formatting
15
18
  class SmartHelpFormatter(argparse.RawDescriptionHelpFormatter):
16
19
  def add_text(self, text):
@@ -36,6 +39,34 @@ BRACKET = {
36
39
  }
37
40
 
38
41
 
42
+ # infer datatype from input file
43
def inferDataType(path):
    """ Infer whether the sequences stored at the given path are nucleotide
    or protein sequences.

    The input is classified as 'nucleotide' when more than 90% of its
    residues are A/C/G/N plus T (and no U is present), or A/C/G/N plus U
    (and no T is present). Anything else is classified as 'protein'.

    Parameters
    ----------
    path : path to a sequence file readable by read_data()

    Returns
    -------
    datatype : either 'nucleotide' or 'protein'

    Raises
    ------
    ValueError : if the input contains no residues at all
    """
    # local import so this block does not depend on the file-level imports
    from collections import Counter

    sequences = read_data(path)

    # tally every (upper-cased) character over all sequences
    counts = Counter()
    for seq in sequences.values():
        counts.update(seq.upper())

    # gap characters are not residues; counting them in the denominator
    # would push a gappy nucleotide alignment below the 90% threshold
    # NOTE(review): '-' and '.' are assumed to be the only gap symbols used
    for gap in ('-', '.'):
        counts.pop(gap, None)

    total = sum(counts.values())
    if total == 0:
        # previously this surfaced as an opaque ZeroDivisionError
        raise ValueError(
            f"Cannot infer data type: no residues found in {path}")

    acg = sum(counts[letter] for letter in ('A', 'C', 'G', 'N'))
    t, u = counts['T'], counts['U']

    # dna -> nucleotide
    if u == 0 and (acg + t) / total > 0.9:
        datatype = 'nucleotide'
    # rna -> nucleotide
    elif t == 0 and (acg + u) / total > 0.9:
        datatype = 'nucleotide'
    # amino acid -> protein
    else:
        datatype = 'protein'

    _LOG.info(f"Inferred input data type: {datatype}")
    return datatype
69
+
39
70
  def write_fasta(aln, aln_dict, aligned=True):
40
71
  """ Write given dictionary as FASTA file out
41
72
 
@@ -76,7 +107,12 @@ def read_data(aln):
76
107
 
77
108
  """
78
109
 
79
- f = open(aln)
110
+ # determine the file type, whether we have a .gz/.gzip file
111
+ suffix = aln.split('.')[-1]
112
+ if suffix in ['gz', 'gzip']:
113
+ f = gzip.open(aln, 'rt')
114
+ else:
115
+ f = open(aln)
80
116
  result = dict()
81
117
 
82
118
  taxa = ""
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: bscampp
3
- Version: 1.0.2b0
4
- Summary: BSCAMPP - A Scalable Phylogenetic Placement Tool
3
+ Version: 1.0.5
4
+ Summary: BSCAMPP and SCAMPP - Scalable Phylogenetic Placement Tools
5
5
  Author-email: Eleanor Wedell <ewedell2@illinois.edu>, Chengze Shen <chengze5@illinois.edu>
6
6
  License: MIT License
7
7
 
@@ -66,7 +66,7 @@ Requires-Dist: taxtastic>=0.9.3
66
66
  # Overview
67
67
  * **Inputs**
68
68
  1. Reference tree to place sequences into.
69
- 2. Alignment of reference sequences.
69
+ 2. Alignment of reference sequences (protein or nucleotide).
70
70
  3. Alignment of query sequences (can be combined with ii.).
71
71
  4. Tree info file.
72
72
  - (EPA-ng as base method), RAxML-ng info file, typically with suffix `.bestModel`.
@@ -230,16 +230,17 @@ run_bscampp.py -i [raxml best model] -t [reference tree] -a [reference alignment
230
230
  > Output file name. Default: bscampp_result.jplace
231
231
  > --threads NUM_CPUS, --num-cpus NUM_CPUS
232
232
  > Number of cores for parallelization, default: -1 (all)
233
+ > --cpus-per-job CPUS_PER_JOB
234
+ > Number of cores to use for each job, default: 2
233
235
  >
234
236
  > ADVANCE PARAMETERS:
235
- > These parameters control how BSCAMPP is run. The default values are set based on experiments.
237
+ > These parameters control how BSCAMPP and SCAMPP are run. The default values are set based on experiments.
236
238
  >
237
- > -m MODEL, --model MODEL
238
- > Model used for edge distances. Default: GTR
239
239
  > -b SUBTREESIZE, --subtreesize SUBTREESIZE
240
240
  > Integer size of the subtree. Default: 2000
241
241
  > -V VOTES, --votes VOTES
242
- > Number of votes per query sequence. Default: 5
242
+ > (BSCAMPP only) Number of votes per query sequence.
243
+ > Default: 5
243
244
  > --similarityflag SIMILARITYFLAG
244
245
  > Boolean, True if maximizing sequence similarity
245
246
  > instead of simple Hamming distance (ignoring gap sites
@@ -1,11 +1,11 @@
1
- bscampp/__init__.py,sha256=mZGsa6XRWkYMo62gK_z5OFyFxRZHadW_SWHAirS1Dvg,2290
2
- bscampp/configs.py,sha256=3HJHLN2fLV5Tv3TJL95NpOuSXUV6CvqxRqCOM6TpbJQ,5767
1
+ bscampp/__init__.py,sha256=ER9JtHb4EYnE1qyPBUmkpbsMeRC_4JDHUla46QUoInw,2289
2
+ bscampp/configs.py,sha256=perl6u5hto6J3JV1JMbsTQ6tqr2uGOk-Z9jfzflid0s,6122
3
3
  bscampp/default.config,sha256=CEfsUHBy--vwJhEcUuJ0btfuGQWb_lKMVWUIP9f5YGw,112
4
- bscampp/functions.py,sha256=QYI5RsUEMGc6jLPzFdInpmxA8wiYyN7785P3WxWYiTo,17839
4
+ bscampp/functions.py,sha256=qzlxW-bIJi0woStCzraALPb6VEPlO3CdPfCdfQqT2fQ,20119
5
5
  bscampp/init_configs.py,sha256=EA9sMN5jWj6zj2b-7tN19LhX2Ef61ByQLxQRLHAqLDM,3600
6
- bscampp/jobs.py,sha256=PrVMJBabi4cYlrxVLo37XPOY82fY0zZ8Iyp9CWCNWhU,7181
7
- bscampp/pipeline.py,sha256=C6I1vWeA6Rq_spPHy_il1FJA_DomWHUHYHLUUk9SnLk,13024
8
- bscampp/utils.py,sha256=ragaI14Lqb2fVp_uYDkFQnV7a50G9-sUOWdVM-sNhUE,29005
6
+ bscampp/jobs.py,sha256=1FdvpSX_5VxNmJCCYAMdBKy8n68O1TjSET4XU1QULq0,7252
7
+ bscampp/pipeline.py,sha256=IPZnXZmVxGGfbVUuGCQh5X9oBq48-6pA9QkuvMGPTag,14000
8
+ bscampp/utils.py,sha256=-wns6FaWMKD2wVqjxdBQvjTdagTjywBIaGfqb2mupe4,30039
9
9
  bscampp/tools/epa-ng,sha256=f3EVoZAAOXLN6l521qp-TrWDl5J2nqL3tGgjPaQE9WQ,3772096
10
10
  bscampp/tools/pplacer,sha256=p0H4eo9uuiYoWS_kJbPfauOV99i7BXJdZSiwXIuLxTw,7834576
11
11
  bscampp/tools/hamming_distance/CMakeLists.txt,sha256=yf9iq7Y61t3WObJHoR4RoGDEvUw_Q8JW3UnI4uh0cfU,389
@@ -17,9 +17,9 @@ bscampp/tools/hamming_distance/src/fragment_tree_hamming.cpp,sha256=xCmyAT-OZJOD
17
17
  bscampp/tools/hamming_distance/src/fragment_tree_hamming_new.cpp,sha256=eKxgODRlpf0hU84QjNhigvRhWCT9tiJZjA5oQFQ1bUk,7404
18
18
  bscampp/tools/hamming_distance/src/homology.cpp,sha256=ZE0uXZWQ-cN4U1Wk5kUr_KKHgzsgA6Sno-IViRa4tmI,6053
19
19
  bscampp/tools/hamming_distance/src/new_hamming.cpp,sha256=fBRm99RquBZgZjaLOn9xDI3cH9NchhrxKbL-11j8fmk,5342
20
- bscampp-1.0.2b0.dist-info/LICENSE,sha256=HEa4YQdOR0e2Gz-NiOwr9X6aJcZtY0AGmlJQDmfN0Iw,1064
21
- bscampp-1.0.2b0.dist-info/METADATA,sha256=OWSIl8dFMrgzB9Xe8geqXQw2fBNd8hta3p40O5Q9T5Q,12509
22
- bscampp-1.0.2b0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
23
- bscampp-1.0.2b0.dist-info/entry_points.txt,sha256=4Ft83qHc39tNNpMLgSgFXDHM-vuAB99JtmczCQj5pq8,204
24
- bscampp-1.0.2b0.dist-info/top_level.txt,sha256=1loGRUAft6Tcdq0f3lHbVwWN7W_SW1srfhAVSpg9DWE,8
25
- bscampp-1.0.2b0.dist-info/RECORD,,
20
+ bscampp-1.0.5.dist-info/LICENSE,sha256=HEa4YQdOR0e2Gz-NiOwr9X6aJcZtY0AGmlJQDmfN0Iw,1064
21
+ bscampp-1.0.5.dist-info/METADATA,sha256=Nz0xmODp6N_e-u2VAgvegoVKW7134rM7UroUYqO7B0Q,12602
22
+ bscampp-1.0.5.dist-info/WHEEL,sha256=5U-5D1CS1IlCq2UZGreCPlzbpvhviDLR_iCQyI6CTvY,91
23
+ bscampp-1.0.5.dist-info/entry_points.txt,sha256=4Ft83qHc39tNNpMLgSgFXDHM-vuAB99JtmczCQj5pq8,204
24
+ bscampp-1.0.5.dist-info/top_level.txt,sha256=1loGRUAft6Tcdq0f3lHbVwWN7W_SW1srfhAVSpg9DWE,8
25
+ bscampp-1.0.5.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.0)
2
+ Generator: setuptools (75.9.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5