PyamilySeq 1.3.2-py3-none-any.whl → 1.3.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
PyamilySeq/PyamilySeq.py CHANGED
@@ -1,6 +1,3 @@
- import argparse
- #from config import config_params
-
  try:
      from .PyamilySeq_Species import cluster as species_cluster
      from .PyamilySeq_Genus import cluster as genus_cluster
@@ -12,10 +9,11 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
      from constants import *
      from utils import *

-
-
+ import traceback
+ import sys

  def run_cd_hit(options, input_file, clustering_output, clustering_mode):
+     logger = logging.getLogger("PyamilySeq.PyamilySeq")
      cdhit_command = [
          clustering_mode,
          '-i', input_file,
@@ -29,14 +27,25 @@ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
          '-sc', "1",
          '-sf', "1"
      ]
-     if options.verbose == True:
-         subprocess.run(cdhit_command)
-     else:
-         subprocess.run(cdhit_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     logger.debug("CD-HIT command: %s", " ".join(cdhit_command))
+     try:
+         if options.verbose:
+             ret = subprocess.run(cdhit_command)
+         else:
+             ret = subprocess.run(cdhit_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+         if ret.returncode != 0:
+             logger.error("cd-hit returned non-zero exit code %s", ret.returncode)
+         else:
+             logger.info("cd-hit completed successfully: %s", clustering_output)
+     except Exception as e:
+         logger.exception("Failed to run cd-hit: %s", e)


  def main():
-     parser = argparse.ArgumentParser(description=f"PyamilySeq {PyamilySeq_Version}: A tool for gene clustering and analysis.")
+     # Initial console-only logger so welcome and parser.description are logged before argparse outputs.
+     early_logger = configure_logger("PyamilySeq.PyamilySeq", enable_file=False, log_dir=None, verbose=False)
+     # Use LoggingArgumentParser so usage/errors are emitted via logger
+     parser = LoggingArgumentParser(logger_name="PyamilySeq.PyamilySeq")#, description="PyamilySeq entrypoint")

      # Add subparsers for Full and Partial modes
      subparsers = parser.add_subparsers(dest="run_mode", required=True, help="Choose a mode: 'Full' or 'Partial'.")
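Note on the new run_cd_hit wrapper above: subprocess.run returns a CompletedProcess and does not raise on a non-zero exit, so the returncode check is what surfaces CD-HIT failures. A minimal, self-contained sketch of the same pattern (the options object and file paths here are illustrative stand-ins, not package code):

    import logging
    import subprocess
    from types import SimpleNamespace

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("PyamilySeq.PyamilySeq")
    options = SimpleNamespace(verbose=False)                  # stand-in for parsed argparse options
    command = ["cd-hit-est", "-i", "in.fasta", "-o", "out"]   # illustrative arguments

    try:
        if options.verbose:
            ret = subprocess.run(command)
        else:
            ret = subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        if ret.returncode != 0:
            logger.error("cd-hit returned non-zero exit code %s", ret.returncode)
    except FileNotFoundError:
        # subprocess.run raises FileNotFoundError when the binary is not on PATH
        logger.exception("cd-hit executable not found")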
@@ -109,28 +118,28 @@ def main():
      subparser.add_argument("-T", type=int, default=8, dest="threads", required=False,
                             help="Number of threads for clustering/alignment - CD-HIT parameter '-T' | MAFFT parameter '--thread'.")

-     # Miscellaneous Arguments
-     subparser.add_argument("-verbose", action="store_true",
-                            help="Print verbose output.")
-     subparser.add_argument("-v", "--version", action="version",
-                            version=f"PyamilySeq {PyamilySeq_Version}: Exiting.")
+     # Miscellaneous Arguments
+     # Global logging options (user controls logfile creation)
+     parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
+     parser.add_argument("--log-dir", dest="log_dir", default=None,
+                         help="Directory for logfile (default: output dir or cwd).")
+     parser.add_argument("-verbose", action="store_true",
+                         help="Print verbose output.")
+     parser.add_argument("-v", "--version", action="version",
+                         version=f"PyamilySeq {PyamilySeq_Version}: Exiting.")

      # Parse Arguments
      options = parser.parse_args()
-     ## Configuration

+     # Setup logger once we know output paths/options
+     # after we resolve output_path / options.output_dir:
+     resolved_log_dir = options.log_dir if getattr(options, "log_dir", None) else (os.path.abspath(options.output_dir) if getattr(options, "output_dir", None) else os.getcwd())
+     logger = configure_logger("PyamilySeq.PyamilySeq", enable_file=getattr(options, "log", False), log_dir=resolved_log_dir, verbose=options.verbose)
+     logger.info("Running PyamilySeq %s in %s mode", PyamilySeq_Version, getattr(options, "run_mode", "N/A"))
+     if options.verbose:
+         logger.debug("Options: %s", vars(options))

-     if options.write_groups != None and options.write_individual_groups == False:
-         options.write_individual_groups = True

-     # Example of conditional logic based on selected mode
-     print(f"Running PyamilySeq {PyamilySeq_Version} in {options.run_mode} mode:")
-     if options.run_mode == "Full" and options.verbose == True:
-         print("Processing Full mode with options:", vars(options))
-     elif options.run_mode == "Partial" and options.verbose == True:
-         print("Processing Partial mode with options:", vars(options))
-
-     ### Checking all required parameters are provided by user #!!# Doesn't seem to work
      if options.run_mode == 'Full':
          options.clustering_format = 'CD-HIT'
          if getattr(options, 'reclustered', None) is not None:
@@ -145,6 +154,7 @@ def main():
          missing_options = [opt for opt in
                             ['input_type', 'input_dir', 'name_split_gff', 'clustering_format', 'pident', 'len_diff'] if
                             not options.__dict__.get(opt)]
+         logger.error("Missing required options for Full mode: %s", ', '.join(missing_options))
          sys.exit(f"Missing required options for Full mode: {', '.join(missing_options)}")
      if options.align_core:
          options.write_individual_groups = True
@@ -176,34 +186,28 @@ def main():
      ##MAFFT
      if options.align_core == True:
          if is_tool_installed('mafft'):
-             if options.verbose == True:
-                 print("mafft is installed. Proceeding with alignment.")
+             logger.info("mafft is installed. Proceeding with alignment.")
          else:
+             logger.error("mafft is not installed. Please install mafft to proceed.")
              exit("mafft is not installed. Please install mafft to proceed.")
      ##CD-HIT
      if options.run_mode == 'Full':
          if is_tool_installed('cd-hit'):
-             if options.verbose == True:
-                 print("cd-hit is installed. Proceeding with clustering.")
+             logger.info("cd-hit is installed. Proceeding with clustering.")
              if options.sequence_type == 'DNA':
                  clustering_mode = 'cd-hit-est'
              elif options.sequence_type == 'AA':
                  clustering_mode = 'cd-hit'
              if options.fast_mode == True:
                  options.fast_mode = 1
-                 if options.verbose == True:
-                     print("Running CD-HIT in fast mode.")
+                 logger.info("Running CD-HIT in fast mode.")
              else:
                  options.fast_mode = 0
-                 if options.verbose == True:
-                     print("Running CD-HIT in accurate mode.")
+                 logger.info("Running CD-HIT in accurate mode.")
          else:
+             logger.error("cd-hit is not installed. Please install cd-hit to proceed.")
              exit("cd-hit is not installed. Please install cd-hit to proceed.")

-
-     # if options.write_groups != None and options.original_fasta == False:
-     #     exit("-fasta must br provided if -w is used")
-
      if hasattr(options, 'cluster_file') and options.cluster_file:
          options.cluster_file = fix_path(options.cluster_file)
      if hasattr(options, 'reclustered') and options.reclustered:
@@ -308,10 +312,29 @@ def main():


      if options.group_mode == 'Species':
-         species_cluster(clustering_options)
+         try:
+             species_cluster(clustering_options)
+             logger.info("Invoked species clustering.")
+         except FileNotFoundError as e:
+             logger.error("File not found during species clustering: %s", e)
+             logger.debug("Traceback:\n%s", traceback.format_exc())
+             sys.exit(1)
+         except Exception as e:
+             logger.error("Unexpected error during species clustering: %s", e)
+             logger.debug("Traceback:\n%s", traceback.format_exc())
+             sys.exit(1)
      elif options.group_mode == 'Genus':
-         genus_cluster((clustering_options))
-
+         try:
+             genus_cluster(clustering_options)
+             logger.info("Invoked genus clustering.")
+         except FileNotFoundError as e:
+             logger.error("File not found during genus clustering: %s", e)
+             logger.debug("Traceback:\n%s", traceback.format_exc())
+             sys.exit(1)
+         except Exception as e:
+             logger.error("Unexpected error during genus clustering: %s", e)
+             logger.debug("Traceback:\n%s", traceback.format_exc())
+             sys.exit(1)

      # Save arguments to a text file
      from datetime import datetime
@@ -319,9 +342,9 @@ def main():
          outfile.write(f"Timestamp: {datetime.now().isoformat()}\n")
          for arg, value in vars(options).items():
              outfile.write(f"{arg}: {value}\n")
+     logger.info("Saved parameters to %s", os.path.join(output_path, "PyamilySeq_params.txt"))
+

-     print("Thank you for using PyamilySeq -- A detailed user manual can be found at https://github.com/NickJD/PyamilySeq\n"
-           "Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")

  if __name__ == "__main__":
      main()
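The configure_logger factory and LoggingArgumentParser used throughout this release are defined in constants.py, which this diff does not show. A minimal sketch consistent with the call sites above (an assumption for illustration, not the package's actual implementation):

    import argparse
    import logging
    import os
    import time

    def configure_logger(name, enable_file=False, log_dir=None, verbose=False):
        # Console handler always; optional timestamped logfile when enable_file is set.
        logger = logging.getLogger(name)
        logger.setLevel(logging.DEBUG if verbose else logging.INFO)
        if not logger.handlers:
            logger.addHandler(logging.StreamHandler())
            if enable_file and log_dir:
                os.makedirs(log_dir, exist_ok=True)
                logfile = os.path.join(log_dir, time.strftime("PyamilySeq_%Y%m%d_%H%M%S.log"))
                logger.addHandler(logging.FileHandler(logfile))
        return logger

    class LoggingArgumentParser(argparse.ArgumentParser):
        # Routes argparse error text through the named logger instead of bare stderr.
        def __init__(self, logger_name, **kwargs):
            super().__init__(**kwargs)
            self._logger = logging.getLogger(logger_name)

        def error(self, message):
            self._logger.error("%s", message)
            super().error(message)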
@@ -88,7 +88,7 @@ def calc_single_First_extended_Second_only_core(cluster, First_num, cores, Secon
      except KeyError:
          cores['extended_genera_>'].append(cluster)
  #@profile
- def calc_multi_First_extended_Second_only_core(cluster, First_num, cores, Second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
+ def calc_multi_First_extended_Second_only_core(cluster, First_num, cores, Second_num): # Count seperately those gene families extended with StORF-Reporter but combined >1 PEP
      group = First_num + Second_num
      try:
          cores['combined_genera_' + str(group)].append(cluster)
@@ -9,7 +9,7 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
      from utils import *


- def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted,
+ def gene_presence_absence_output(options, genome_dict,
                                   pangenome_clusters_First_sequences_sorted,
                                   combined_pangenome_clusters_First_Second_clustered=None,
                                   combined_pangenome_clusters_Second_sequences_sorted=None):
@@ -137,48 +137,6 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
      if options.reclustered is not None:
          print(f"Merged Second cluster IDs: {len(merged_second_cluster_ids)}")

- # def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
- #     print("Outputting gene_presence_absence file")
- #     output_dir = os.path.abspath(options.output_dir)
- #     #in_name = options.clusters.split('.')[0].split('/')[-1]
- #     gpa_outfile = os.path.join(output_dir, 'gene_presence_absence.csv')
- #     gpa_outfile = open(gpa_outfile, 'w')
- #     genome_dict = OrderedDict(sorted(genome_dict.items()))
- #     gpa_outfile.write('"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment",'
- #                       '"Accessory Fragment","Accessory Order with Fragment","QC","Min group size nuc","Max group size nuc","Avg group size nuc","')
- #     gpa_outfile.write('","'.join(genome_dict.keys()))
- #     gpa_outfile.write('"\n')
- #     for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
- #         average_sequences_per_genome = len(sequences) / len(pangenome_clusters_First_sorted[cluster])
- #         gpa_outfile.write('"group_'+str(cluster)+'","","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
- #                           '","","","","","","","",""')
- #
- #
- #         for genome in genome_dict.keys():
- #             full_out = ''
- #             tmp_list = []
- #             for value in sequences:
- #                 if value.split('|')[0] == genome:
- #                     tmp_list.append(value.split('|')[1])
- #             if tmp_list:
- #                 full_out += ',"'+' '.join(tmp_list)+'"'
- #             else:
- #                 full_out = ',""'
- #             gpa_outfile.write(full_out)
- #         gpa_outfile.write('\n')
-
- ### Below is some unfinished code
- # edge_list_outfile = open(in_name+'_edge_list.csv','w')
- # for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
- #     output = []
- #     for entry in sequences:
- #         # Split each entry at '|'
- #         genome, gene = entry.split('|')
- #         # Format the result as "gene genome"
- #         output.append(f"{gene}\t{genome}")
- #     for line in output:
- #         edge_list_outfile.write(line + '\n')
-


@@ -209,7 +167,7 @@ def get_cores(options,genome_dict):
          cores[only_second_core_group] = []
      return cores, groups

- #@profile
+
  def calc_First_only_core(cluster, First_num, groups, cores):
      groups_as_list = list(groups.values())
      for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= int(First_num) <= fir):
@@ -217,7 +175,7 @@ def calc_First_only_core(cluster, First_num, groups, cores):
          family_group = list(groups)[res]
          cores['First_core_'+family_group].append(cluster)

- #@profile
+
  def calc_single_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count gene families extended with StORFs
      groups_as_list = list(groups.values())
      for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= First_num+Second_num <= fir):
@@ -227,8 +185,8 @@ def calc_single_First_extended_Second_only_core(cluster, First_num, groups, core
          cores['extended_core_' + family_group].append(cluster)


- #@profile
- def calc_multi_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
+
+ def calc_multi_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count seperately those gene families extended with StORF-Reporter but combined >1 PEP
      groups_as_list = list(groups.values())
      # Looping through the list to find the matching condition
      for idx, (sec, fir) in enumerate(groups_as_list):
@@ -239,7 +197,7 @@ def calc_multi_First_extended_Second_only_core(cluster, First_num, groups, cores
              cores['combined_core_' + family_group].append(cluster)


- #@profile
+
  def calc_Second_only_core(cluster, Second_num, groups, cores):
      groups_as_list = list(groups.values())
      for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= Second_num <= fir):
@@ -247,7 +205,7 @@ def calc_Second_only_core(cluster, Second_num, groups, cores):
          family_group = list(groups)[res]
          cores['Second_core_' + family_group].append(cluster)

- #@profile
+
  def calc_only_Second_only_core(cluster, Second_num, groups, cores): # only count the true storf onlies
      try:
          groups_as_list = list(groups.values())
@@ -259,7 +217,7 @@ def calc_only_Second_only_core(cluster, Second_num, groups, cores): # only count
          sys.exit("Error in calc_only_Second_only_core")


- #@profile
+
  def cluster(options):

      if options.cluster_format == 'CD-HIT':
@@ -273,18 +231,17 @@ def cluster(options):
      cores, groups = get_cores(options, genome_dict)
      ###

-     if options.reclustered != None: #FIX
+     if options.reclustered != None: # Combined clustering
          if options.cluster_format == 'CD-HIT':
              combined_pangenome_clusters_First_Second_clustered, not_Second_only_cluster_ids, combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_CDHIT(options, genome_dict, '|')
          elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
-             #Fix
              combined_pangenome_clusters_First_Second_clustered, not_Second_only_cluster_ids, combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_Edge_List(options, '|')

          pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, pangenome_clusters_First_genomes, combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences, '|')

          # Sort First clusters
          sorted_First_keys = sort_keys_by_values(pangenome_clusters_First, pangenome_clusters_First_sequences)
-         pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_First_keys)
+         #pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_First_keys)
          pangenome_clusters_First_sequences_sorted = reorder_dict_by_keys(pangenome_clusters_First_sequences, sorted_First_keys)
          pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_First_keys)

@@ -296,7 +253,7 @@ def cluster(options):
      else:
          pangenome_clusters_Type = single_clustering_counting(pangenome_clusters_First, reps)
          sorted_First_keys = sort_keys_by_values(pangenome_clusters_First, pangenome_clusters_First_sequences)
-         pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_First_keys)
+         #pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_First_keys)
          pangenome_clusters_First_sequences_sorted = reorder_dict_by_keys(pangenome_clusters_First_sequences,
                                                                           sorted_First_keys)
          pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_First_keys)
@@ -375,17 +332,16 @@ def cluster(options):
      if options.gene_presence_absence_out != False:
          if options.reclustered != None:
              # Pass both First and Second clustering data
-             gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted,
+             gene_presence_absence_output(options, genome_dict,
                                           pangenome_clusters_First_sequences_sorted,
                                           combined_pangenome_clusters_First_Second_clustered,
                                           combined_pangenome_clusters_Second_sequences_sorted)
          else:
              # Only First clustering data available
-             gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted,
-                                          pangenome_clusters_First_sequences_sorted)
+             gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sequences_sorted)


-     ###Need to fix this below. If full/partial the ifs need to be different. If full we first need to output the gfs then align. if -wruite-groups not presented then it needs
+     ###Need to fix this below. If full/partial the ifs need to be different. If full we first need to output the gfs then align. if -write-groups not presented then it needs
      # to be done for alignment full anyway...

      genome_list = list(genome_dict.keys())
@@ -400,17 +356,24 @@ def cluster(options):
                  outfile.write('>group_'+str(cluster)+'\n')
                  wrapped_aa_seq = wrap_sequence(sequences[ids[0]], 60)
                  outfile.write(wrapped_aa_seq+'\n')
-         if options.write_groups != None:
+         if options.write_groups != False:
              print("Outputting gene group FASTA files")
              #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
              output_dir = os.path.join(options.output_dir, 'Gene_Groups_Output')
              write_groups_func(options,output_dir, key_order, cores, sequences,
                                pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)

-         if options.align_core != None:
+         if options.align_core != False:
              print("Processing gene group alignment")
              process_gene_groups(options, output_dir, None, None, genome_list, 'core_gene_alignment.aln')

+         if options.write_individual_groups == True:
+             output_dir = os.path.join(options.output_dir, 'Gene_Groups_Output')
+             write_individual_groups(options, output_dir, key_order, cores, sequences,
+                                     pangenome_clusters_First_sequences_sorted,
+                                     combined_pangenome_clusters_Second_sequences)
+
+
      elif options.run_mode == 'Partial':
          sequences = read_fasta(options.fasta)
          if options.reclustered == None:
@@ -432,16 +395,21 @@ def cluster(options):
                  outfile.write('>group_'+str(cluster)+'\n')
                  wrapped_aa_seq = wrap_sequence(sequences[ids[0]], 60)
                  outfile.write(wrapped_aa_seq+'\n')
-         if options.write_groups != None:
+         if options.write_groups != False:
              print("Outputting gene group FASTA files")
              output_dir = os.path.join(options.output_dir, 'Gene_Groups_Output')
              write_groups_func(options,output_dir, key_order, cores, sequences,
                                pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)

-         if options.align_core != None:
+         if options.align_core != False:
              print("Processing gene group alignment")
              process_gene_groups(options, output_dir, None, None, genome_list, 'core_gene_alignment.aln')

+         if options.write_individual_groups == True:
+             output_dir = os.path.join(options.output_dir, 'Gene_Groups_Output')
+             write_individual_groups(options, output_dir, key_order, cores, sequences,
+                                     pangenome_clusters_First_sequences_sorted,
+                                     combined_pangenome_clusters_Second_sequences)


  #
@@ -461,4 +429,3 @@ def cluster(options):
  #
  #
  #
-
@@ -1,6 +1,3 @@
- import argparse
-
-
  try:
      from .constants import *
      from .utils import *
@@ -8,10 +5,94 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
      from constants import *
      from utils import *

+ import threading
+ import time
+ import os
+ from typing import Optional
+ import re
+
+ def count_matching_files(input_dir: str, name_split: Optional[str], extensions):
+     """
+     Count input files in input_dir that match the provided extensions and, if name_split supplied,
+     contain the name_split substring in the filename. This is used to compute total work units (files).
+     """
+     if not input_dir or not os.path.isdir(input_dir):
+         return 0
+     total = 0
+     for fname in os.listdir(input_dir):
+         low = fname.lower()
+         if any(low.endswith(ext) for ext in extensions):
+             if name_split:
+                 if name_split in fname:
+                     total += 1
+             else:
+                 total += 1
+     return total
+
+ def count_files_present_in_combined(combined_file: str, name_split: Optional[str]) -> int:
+     """
+     Heuristic: count number of distinct input files (genomes) already present in the combined output.
+     Primary approach: parse headers and take the second '|' field (header.split('|')[1]) as genome/file id.
+     If that parsing fails, look for tokens containing name_split inside the header.
+     """
+     if not combined_file or not os.path.exists(combined_file):
+         return 0
+     seen = set()
+     try:
+         with open(combined_file, 'r') as fh:
+             for line in fh:
+                 if not line.startswith('>'):
+                     continue
+                 header = line[1:].strip()
+                 # 1) Prefer headers like ">id|genome|rest" -> take genome (second field)
+                 if '|' in header:
+                     parts = header.split('|')
+                     if len(parts) > 1 and parts[1]:
+                         seen.add(parts[1])
+                         continue
+                 # 2) If name_split provided, look for a filename-like token that includes it
+                 if name_split:
+                     match = re.search(r'([^\s/\\]*' + re.escape(name_split) + r'[^\s/\\]*)', header)
+                     if match:
+                         token = os.path.basename(match.group(1))
+                         seen.add(token)
+                         continue
+                 # 3) If nothing matched, skip this header (avoids per-sequence overcounting)
+     except Exception:
+         return 0
+     return len(seen)
+
+ # Helpers for progress reporting

+ def progress_reporter(stop_event, logger, total_files, combined_file, name_split=None, interval=10):
+     """
+     Periodically log progress. Preference: count headers in combined_file.
+     Falls back to simple heartbeat if combined_file isn't yet created.
+     """
+     start = time.time()
+     while not stop_event.is_set():
+         # Use number of distinct input files represented in the combined output for "processed"
+         processed = count_files_present_in_combined(combined_file, name_split) if combined_file else 0
+         # Cap processed to total_files (prevents >100%)
+         if total_files > 0 and processed > total_files:
+             processed = total_files
+         pct = (processed / total_files * 100) if total_files > 0 else 0.0
+         elapsed = time.time() - start
+         logger.info("Progress: %d/%d processed (%.1f%%). Elapsed: %.0fs", processed, total_files, pct, elapsed)
+         # Wait with early exit support
+         stop_event.wait(interval)
+     # Final log when exiting
+     processed = count_files_present_in_combined(combined_file, name_split) if combined_file else 0
+     if total_files > 0 and processed > total_files:
+         processed = total_files
+     pct = (processed / total_files * 100) if total_files > 0 else 0.0
+     elapsed = time.time() - start
+     logger.info("Final progress: %d/%d processed (%.1f%%). Total elapsed: %.0fs", processed, total_files, pct, elapsed)

  def main():
-     parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.')
+     # Early console-only logger so parser.description is logged before help/usage.
+     early_logger = configure_logger("PyamilySeq.Seq_Combiner", enable_file=False, log_dir=None, verbose=False)
+     parser = LoggingArgumentParser(logger_name="PyamilySeq.Seq_Combiner", description='Running Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.')
      ### Required Arguments
      required = parser.add_argument_group('Required Arguments')
      required.add_argument('-input_dir', action='store', dest='input_dir',
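The header-parsing heuristic above keys on the second '|' field of each FASTA header, so a genome is counted once no matter how many sequences it contributes. A small self-contained check of that behaviour, assuming count_files_present_in_combined as defined in the hunk above is in scope (file contents here are illustrative):

    import os
    import tempfile

    # Headers follow the ">id|genome|rest" layout the heuristic prefers.
    records = [">g1|GenomeA|cds", "ATGC", ">g2|GenomeA|cds", "ATGG", ">g3|GenomeB|cds", "ATGT"]
    with tempfile.NamedTemporaryFile("w", suffix=".fasta", delete=False) as fh:
        fh.write("\n".join(records) + "\n")
        path = fh.name

    # Three sequences but only two distinct genome ids -> prints 2
    print(count_files_present_in_combined(path, None))
    os.remove(path)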
@@ -47,37 +128,66 @@ def main():
      misc.add_argument("-v", "--version", action="version",
                        version=f"PyamilySeq: Seq-Combiner version {PyamilySeq_Version} - Exiting",
                        help="Print out version number and exit")
+     parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
+     parser.add_argument("--log-dir", dest="log_dir", default=None, help="Directory for logfile (default: output_dir).")

      options = parser.parse_args()

+     # Setup logger for Seq-Combiner
+     output_path = os.path.abspath(options.output_dir)
+     if not os.path.exists(output_path):
+         os.makedirs(output_path)
+     log_dir = options.log_dir if getattr(options, "log_dir", None) else output_path
+     logger = configure_logger("PyamilySeq.Seq_Combiner", enable_file=getattr(options, "log", False), log_dir=log_dir, verbose=False)
+
+     # --- Progress reporting setup ------------------------------------------------
+     combined_out_file = os.path.join(output_path, options.output_file)
+     # Determine name_split and extensions per mode and count matching input files as total work units
+     if options.input_type == 'fasta':
+         name_split = options.name_split_fasta
+         exts = ('.fasta', '.fa', '.fna')
+     else: # 'separate' or 'combined'
+         name_split = options.name_split_gff
+         exts = ('.gff', '.gff3', '.gff.gz', '.gff3.gz')
+
+     total_work = count_matching_files(options.input_dir, name_split, exts)
+     logger.info("Found %d input files (matching pattern) to process in %s", total_work, options.input_dir)
+
+     stop_event = threading.Event()
+     reporter_thread = threading.Thread(target=progress_reporter, args=(stop_event, logger, total_work, combined_out_file, name_split, 10), daemon=True)
+     reporter_thread.start()
+     # ---------------------------------------------------------------------------

      if options.input_type == 'separate' and options.name_split_gff is None:
+         logger.error("Please provide a substring to split the filename and extract the genome name.")
          print("Please provide a substring to split the filename and extract the genome name.")
          exit(1)
      if options.input_type == 'combined' and options.name_split_gff is None:
+         logger.error("Please provide a substring to split the filename and extract the genome name.")
          print("Please provide a substring to split the filename and extract the genome name.")
          exit(1)
      if options.input_type == 'fasta' and options.name_split_fasta is None:
+         logger.error("Please provide a substring to split the filename and extract the genome name.")
          print("Please provide a substring to split the filename and extract the genome name.")
          exit(1)

-     output_path = os.path.abspath(options.output_dir)
-     if not os.path.exists(output_path):
-         os.makedirs(output_path)
-
-     #output_file = options.output_file + '.fasta'
-     if os.path.exists(os.path.join(output_path, options.output_file)):
-         print(f"Output file {options.output_file} already exists in the output directory. Please delete or rename the file and try again.")
-         exit(1)
-
-     combined_out_file = os.path.join(output_path, options.output_file )
-
      if options.input_type == 'separate':
+         logger.info("Processing 'separate' input_type from %s", options.input_dir)
          read_separate_files(options.input_dir, options.name_split_gff, options.name_split_fasta, options.gene_ident, combined_out_file, options.translate, True)
      elif options.input_type == 'combined':
+         logger.info("Processing 'combined' input_type from %s", options.input_dir)
          read_combined_files(options.input_dir, options.name_split_gff, options.gene_ident, combined_out_file, options.translate, True)
      elif options.input_type == 'fasta':
+         logger.info("Processing 'fasta' input_type from %s", options.input_dir)
          read_fasta_files(options.input_dir, options.name_split_fasta, combined_out_file, options.translate, True)
+     logger.info("Seq-Combiner completed.")
+
+     # Stop reporter and wait for final log
+     stop_event.set()
+     reporter_thread.join(timeout=5)
+     # Final summary: count number of input files represented (heuristic)
+     final_files = count_files_present_in_combined(combined_out_file, name_split)
+     logger.info("Completed combining. Final combined file: %s (input files represented: %d)", combined_out_file, final_files)

  if __name__ == "__main__":
      main()
@@ -1,5 +1,12 @@
- import argparse
+
  import copy
+ import os
+
+ # Use centralised logger factory
+ try:
+     from .constants import configure_logger, LoggingArgumentParser
+ except Exception:
+     from constants import configure_logger, LoggingArgumentParser

  def find_gene_ids_in_csv(csv_file, group_name):
      """Find gene IDs associated with the specified group name in the CSV file, starting from column 14."""
@@ -39,7 +46,10 @@ def extract_sequences(fasta_file, gene_ids):
      return sequences

  def main():
-     parser = argparse.ArgumentParser(description="Extract sequences for specified group name from CSV file and corresponding FASTA file.")
+     # Early console-only logger so parser.description appears in logger output before argparse prints the menu.
+     early_logger = configure_logger("PyamilySeq.Seq_Extractor", enable_file=False, log_dir=None, verbose=False)
+     parser = LoggingArgumentParser(logger_name="PyamilySeq.Seq_Extractor", description="Running Seq-Extractor - A tool to extract sequences for specified group name from CSV file and corresponding FASTA file.")
+
      parser.add_argument("-csv", action='store', dest='csv_file',
                          help="CSV file containing group data", required=True)
      parser.add_argument("-group", action='store', dest='group_name',
48
58
  help="Input FASTA file containing sequences", required=True)
49
59
  parser.add_argument("-out", action='store', dest='output_file',
50
60
  help="Output FASTA file with extracted sequences", required=True)
61
+ parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
62
+ parser.add_argument("--log-dir", dest="log_dir", default=None, help="Directory for logfile (default: dir of output_file).")
51
63
 
52
64
  options = parser.parse_args()
53
65
 
66
+ # Setup logger
67
+ out_dir = os.path.abspath(os.path.dirname(options.output_file)) if options.output_file else os.getcwd()
68
+ log_dir = options.log_dir if getattr(options, "log_dir", None) else out_dir
69
+ logger = configure_logger("PyamilySeq.Seq_Extractor", enable_file=getattr(options, "log", False), log_dir=log_dir, verbose=False)
70
+
71
+ logger.info("Searching for gene IDs in CSV %s for group %s", options.csv_file, options.group_name)
72
+
54
73
  # Find gene IDs in CSV
55
74
  gene_ids = find_gene_ids_in_csv(options.csv_file, options.group_name)
56
75
  if not gene_ids:
76
+ logger.warning("No gene IDs found for group name '%s' in the CSV.", options.group_name)
57
77
  print(f"No gene IDs found for group name '{options.group_name}' in the CSV.")
58
78
  return
59
79
 
60
80
  # Extract sequences from the FASTA file
81
+ logger.info("Extracting sequences from FASTA: %s", options.fasta_file)
61
82
  sequences = extract_sequences(options.fasta_file, gene_ids)
62
83
 
63
84
  # Write matched sequences to the output FASTA file
64
85
  with open(options.output_file, 'w') as output:
65
86
  for gene_id, sequence_lines in sequences.items():
66
87
  output.write("\n".join(sequence_lines) + "\n")
88
+ logger.info("Wrote %d sequences to %s", len(sequences), options.output_file)
67
89
 
68
90
  if __name__ == "__main__":
69
91
  main()
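For reference, an invocation mirroring the flags defined above (the script path and file names are illustrative; only the flags come from this diff):

    import subprocess

    # Extract the sequences for one gene group and write a timestamped logfile.
    subprocess.run([
        "python", "Seq_Extractor.py",          # assumed entry point for this module
        "-csv", "gene_presence_absence.csv",
        "-group", "group_1",
        "-fasta", "combined.fasta",
        "-out", "group_1.fasta",
        "--log",
    ])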