miga-base 1.3.22.6 → 1.4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b7617c312b68875e84d349bc0613493cc5423da5930c7c4639646c968743f8c7
4
- data.tar.gz: 8cc072f053bf244889f928ce842e79311eff9deb5dc36fed9d42702536b5cdfb
3
+ metadata.gz: 5525ee26722f9be06bf3d8ddd8a94909457c55d8e2ac9d8f137bf44e15cf2035
4
+ data.tar.gz: ce4036f422bb2cc4e5da887d0dfaea6de89a73f152f63269c5894644f8a92127
5
5
  SHA512:
6
- metadata.gz: 200520c0b8a9a4a13d22db354e8f1d9cfe5876fbca2ab4f6ea454f8638d9dc10d6898b027002ed50f003ddf898311242b3321e5c77836546b45e807bc858f7f9
7
- data.tar.gz: 493c26112d46fa6f83783129be0d1a0832b147ac9476f00da6abf5678b67da08bd64ab2e6a92f56124d1bed85de2c9bd626ca60b0f96a809674ef58e4d964ac2
6
+ metadata.gz: 8b3393571b1f4425a243181ba50117829b264022c02ac251717721bb6ef092ac81f323b8ccb1f4ea1989fe77d61b9f224115f0758f81a0963503127938b59b3f
7
+ data.tar.gz: d6dd244f7ea5261900d4fc8329398e5bb07a44e28e0ddf8706c546798546814735142c0179831193a4a48d8b3c2bdc93745bd0a021ee31df7ec385f7ef58c4fe
@@ -170,7 +170,15 @@ class MiGA::Cli::Action::Add < MiGA::Cli::Action
170
170
  r_path = File.expand_path("data/#{r_dir}/#{d.name}", p.path)
171
171
  file_type[2].each_with_index do |ext, i|
172
172
  gz = file[i] =~ /\.gz/ ? '.gz' : ''
173
- FileUtils.cp(file[i], "#{r_path}#{ext}#{gz}")
173
+ fo = "#{r_path}#{ext}#{gz}"
174
+ if gz == ''
175
+ FileUtils.cp(file[i], fo)
176
+ else
177
+ MiGA::MiGA.run_cmd(
178
+ "gzip -cd #{file[i].shellescape} | gzip -c > #{fo.shellescape}",
179
+ source: :miga_env # <- To load pigz if available
180
+ )
181
+ end
174
182
  cli.say " file: #{File.basename(file[i])}"
175
183
  end
176
184
  File.open("#{r_path}.done", 'w') { |f| f.print Time.now.to_s }
@@ -83,8 +83,9 @@ module MiGA::Cli::Action::Doctor::Base
83
83
  data = MiGA::SQLite.new(db_file).run(sql) || []
84
84
  Hash[
85
85
  data.map do |row|
86
- k, v = row.shift(2)
87
- [k, row.all?(&:zero?) ? v : [v] + row]
86
+ r = row.dup
87
+ k, v = r.shift(2)
88
+ [k, r.all?(&:zero?) ? v : [v] + r]
88
89
  end
89
90
  ]
90
91
  end
@@ -18,6 +18,10 @@ class MiGA::Cli::Action::Env < MiGA::Cli::Action
18
18
  for util in enveomics/Scripts FastAAI/fastaai multitrim ; do
19
19
  export PATH="$MIGA/utils/$util:$PATH"
20
20
  done
21
+ # Override gzip with pigz (if available)
22
+ if command -v pigz &>/dev/null ; then
23
+ function gzip { pigz -p ${CORES:-2} "$@" ; }
24
+ fi
21
25
  BASH
22
26
  end
23
27
 
@@ -7,7 +7,7 @@ class MiGA::Cli::Action::Files < MiGA::Cli::Action
7
7
  def parse_cli
8
8
  cli.defaults = { details: false, json: true }
9
9
  cli.parse do |opt|
10
- cli.opt_object(opt, [:project, :dataset_opt])
10
+ cli.opt_object(opt, %i[project dataset_opt result_opt])
11
11
  opt.on(
12
12
  '-i', '--info',
13
13
  'Print additional details for each file'
@@ -21,7 +21,10 @@ class MiGA::Cli::Action::Files < MiGA::Cli::Action
21
21
  end
22
22
 
23
23
  def perform
24
- cli.load_project_or_dataset.each_result do |sym, res|
24
+ obj = cli.load_project_or_dataset
25
+ res = cli[:result] ? [cli.load_result] : cli.load_project_or_dataset.results
26
+ res.each do |res|
27
+ sym = res.key
25
28
  cli.puts "#{"#{sym}\tjson\t" if cli[:details]}#{res.path}" if cli[:json]
26
29
  res.each_file do |k, f|
27
30
  cli.puts "#{"#{sym}\t#{k}\t" if cli[:details]}#{res.dir}/#{f}"
@@ -140,7 +140,7 @@ class MiGA::Cli::Action::Init < MiGA::Cli::Action
140
140
  paths[r[1]] = cli[:"path_to_#{r[1]}"]
141
141
  cli.puts "user-provided: #{paths[r[1]]}"
142
142
  else
143
- path = find_software(r[1])
143
+ path = find_software(r[1], rc_fh)
144
144
  paths[r[1]] = File.expand_path(r[1], path).shellescape
145
145
  end
146
146
  end
@@ -169,7 +169,7 @@ class MiGA::Cli::Action::Init < MiGA::Cli::Action
169
169
  ) == 'yes'
170
170
  end
171
171
 
172
- def find_software(exec)
172
+ def find_software(exec, rc_fh)
173
173
  path = nil
174
174
  loop do
175
175
  d_path = File.dirname(run_cmd(cli, ['which', exec], raise: false))
@@ -14,6 +14,10 @@ class MiGA::Cli::Action::Option < MiGA::Cli::Action
14
14
  ) { |v| cli[:key] = v }
15
15
  opt.on(
16
16
  '--value STRING',
17
+ '::HIDE::' # Replaced by --set, but aliased for backward compatibility
18
+ ) { |v| cli[:value] = v }
19
+ opt.on(
20
+ '-s', '--set STRING',
17
21
  'Value of the option to set (by default, option value is not changed)',
18
22
  'Recognized tokens: nil, true, false'
19
23
  ) { |v| cli[:value] = v }
@@ -7,7 +7,7 @@ class MiGA::Cli::Action::Run < MiGA::Cli::Action
7
7
  def parse_cli
8
8
  cli.defaults = { try_load: false, thr: 1, env: false, check_first: false }
9
9
  cli.parse do |opt|
10
- cli.opt_object(opt, [:project, :dataset_opt, :result_opt])
10
+ cli.opt_object(opt, %i[project dataset_opt result_opt])
11
11
  opt.on(
12
12
  '-t', '--threads INT', Integer,
13
13
  "Threads to use in the local run (by default: #{cli[:thr]})"
@@ -92,22 +92,22 @@ module MiGA::Cli::OptHelper
92
92
  opt.on(
93
93
  '-r', '--result STRING',
94
94
  "#{'(Mandatory) ' if w == :result}Name of the result",
95
- 'Recognized names for dataset-specific results include:',
96
- *MiGA::Dataset.RESULT_DIRS.keys.map { |n| " ~ #{n}" },
97
- 'Recognized names for project-wide results include:',
98
- *MiGA::Project.RESULT_DIRS.keys.map { |n| " ~ #{n}" }
95
+ '~ Recognized names for dataset-specific results include:',
96
+ *list_to_paragraph(MiGA::Dataset.RESULT_DIRS.keys),
97
+ '~ Recognized names for project-wide results include:',
98
+ *list_to_paragraph(MiGA::Project.RESULT_DIRS.keys)
99
99
  ) { |v| self[:result] = v.downcase.to_sym }
100
100
  when :result_dataset
101
101
  opt.on(
102
102
  '-r', '--result STRING',
103
103
  '(Mandatory) Name of the result, one of:',
104
- *MiGA::Dataset.RESULT_DIRS.keys.map { |n| " ~ #{n}" }
104
+ *list_to_paragraph(MiGA::Dataset.RESULT_DIRS.keys, indent: 0)
105
105
  ) { |v| self[:result] = v.downcase.to_sym }
106
106
  when :result_project
107
107
  opt.on(
108
108
  '-r', '--result STRING',
109
109
  '(Mandatory) Name of the result, one of:',
110
- *MiGA::Project.RESULT_DIRS.keys.map { |n| " ~ #{n}" }
110
+ *list_to_paragraph(MiGA::Project.RESULT_DIRS.keys, indent: 0)
111
111
  ) { |v| self[:result] = v.downcase.to_sym }
112
112
  else
113
113
  raise "Internal error: Unrecognized option: #{w}"
@@ -174,4 +174,10 @@ module MiGA::Cli::OptHelper
174
174
  sym = flag.to_sym if sym.nil?
175
175
  opt.on("--#{flag.to_s.tr('_', '-')}", description) { |v| self[sym] = v }
176
176
  end
177
+
178
##
# Joins the items of +list+ with ', ' and wraps the resulting text into an
# Array of lines, each prefixed with +indent+ spaces.
#
# Lines break only at whitespace and hold at most +width+ characters (not
# counting the indentation). A single item longer than +width+ is never
# split: it is kept whole on its own line.
def list_to_paragraph(list, width: 50, indent: 2)
  # The pattern always consumes one leading and one trailing non-space
  # character around the flexible middle, so the middle is capped at
  # width - 2 to keep the whole line within +width+
  inner = [width - 2, 0].max
  pad = ' ' * indent
  list.map(&:to_s).join(', ')
      .scan(/\S.{0,#{inner}}\S(?=\s|$)|\S+/)
      .map { |line| "#{pad}#{line}" }
end
177
183
  end
@@ -158,6 +158,9 @@ module MiGA::Dataset::Base
158
158
  },
159
159
  dist_req: {
160
160
  desc: 'Run distances against these datasets', type: Array, default: []
161
+ },
162
+ keep_assembly_graphs: {
163
+ desc: 'Do not clean assembly graphs', in: [true, false], default: false
161
164
  }
162
165
  }
163
166
  end
@@ -58,10 +58,12 @@ module MiGA::Dataset::Result::Add
58
58
  return nil unless
59
59
  result_files_exist?(base, '.CoupledReads.fa') ||
60
60
  result_files_exist?(base, '.SingleReads.fa') ||
61
- result_files_exist?(base, %w[.1.fasta .2.fasta])
61
+ result_files_exist?(base, %w[.1.fasta .2.fasta]) ||
62
+ result_files_exist?(base, '.empty')
62
63
 
63
64
  add_files_to_ds_result(
64
65
  MiGA::Result.new("#{base}.json"), name,
66
+ empty: '.empty',
65
67
  coupled: '.CoupledReads.fa',
66
68
  single: '.SingleReads.fa',
67
69
  pair1: '.1.fasta',
data/lib/miga/dataset.rb CHANGED
@@ -199,7 +199,9 @@ class MiGA::Dataset < MiGA::MiGA
199
199
 
200
200
  ##
201
201
  # Retrieves the option with name +key+ from the dataset's metadata
202
- # extending support to relative paths in +:db_project+
202
+ # extending support to relative paths in +:db_project+, and for all
203
+ # other options looks for metadata defined in the project before
204
+ # returning the default
203
205
  def option_by_metadata(key)
204
206
  case key.to_sym
205
207
  when :db_project
@@ -209,6 +211,8 @@ class MiGA::Dataset < MiGA::MiGA
209
211
  return y
210
212
  end
211
213
 
214
+ return project.metadata[key] unless project.metadata[key].nil?
215
+
212
216
  super
213
217
  end
214
218
  end
@@ -133,6 +133,11 @@ module MiGA::Project::Base
133
133
  desc: 'Maximum p-value to transfer taxonomy', default: 0.1, type: Float,
134
134
  in: 0.0..1.0
135
135
  },
136
+ indexing: {
137
+ desc: 'Approach used to index the collection as database', type: String,
138
+ default: 'hierarchical',
139
+ in: %w[hierarchical gsearch no]
140
+ },
136
141
  haai_p: {
137
142
  desc: 'Value of aai.rb -p on hAAI', type: String,
138
143
  default: proc { |project|
@@ -146,7 +151,7 @@ module MiGA::Project::Base
146
151
  },
147
152
  ani_p: {
148
153
  desc: 'Value of ani.rb -p on ANI', default: 'fastani', type: String,
149
- in: %w[blast+ blast blat fastani]
154
+ in: %w[blast+ blast blat fastani no]
150
155
  },
151
156
  max_try: {
152
157
  desc: 'Maximum number of task attempts', default: 10, type: Integer,
@@ -42,6 +42,10 @@ module MiGA::Result::Stats
42
42
  stats
43
43
  end
44
44
 
45
+ def compute_stats_trimmed_reads
46
+ compute_stats_raw_reads
47
+ end
48
+
45
49
  def compute_stats_trimmed_fasta
46
50
  f = self[:files][:coupled].nil? ? file_path(:single) : file_path(:coupled)
47
51
  s = MiGA::MiGA.seqs_length(f, :fasta, gc: true, x: true, skew: true)
data/lib/miga/version.rb CHANGED
@@ -12,7 +12,7 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.3, 22, 6].freeze
15
+ VERSION = [1.4, 0, 0].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
20
20
 
21
21
  ##
22
22
  # Date of the current gem release.
23
- VERSION_DATE = Date.new(2025, 5, 19)
23
+ VERSION_DATE = Date.new(2025, 9, 23)
24
24
 
25
25
  ##
26
26
  # References of MiGA
@@ -11,6 +11,7 @@ miga date > "$DATASET.start"
11
11
 
12
12
  # Interpose (if needed)
13
13
  interpose=no
14
+ TR="../02.trimmed_reads"
14
15
  TF="../04.trimmed_fasta"
15
16
  b=$DATASET
16
17
  if [[ -s "$TF/${b}.2.fasta" || -s "$TF/${b}.2.fasta.gz" ]] ; then
@@ -38,25 +39,66 @@ for i in SingleReads CoupledReads ; do
38
39
  miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_fasta -f
39
40
  fi
40
41
  done
42
+ for i in 1 2 ; do
43
+ base="$TR/${DATASET}.${i}.clipped.fastq"
44
+ if [[ -e "$base" && ! -s "${base}.gz" ]] ; then
45
+ gzip -9f "$base"
46
+ miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
47
+ fi
48
+ done
41
49
 
42
50
  # Assemble
43
- FA="$TF/${DATASET}.CoupledReads.fa.gz"
44
- [[ -e "$FA" ]] || FA="$TF/${DATASET}.SingleReads.fa.gz"
45
- RD="r"
46
- [[ $FA == *.SingleReads.fa* ]] && RD="l"
47
- gzip -cd "$FA" \
48
- | idba_ud --pre_correction -$RD /dev/stdin \
49
- -o "$DATASET" --num_threads "$CORES" || true
50
- [[ -s "$DATASET/contig.fa" ]] || exit 1
51
# Base SPAdes invocation; input and mode flags are appended below
CMD="spades.py -o $DATASET -t $CORES"

# Translate the dataset type reported by MiGA into a SPAdes mode flag
# (left empty for any type not listed here)
TYPE_OPT=""
case "$(miga ls -P "$PROJECT" -D "$DATASET" -m type | cut -f 2)" in
  "metagenome")
    TYPE_OPT="--meta" ;;
  "plasmid")
    TYPE_OPT="--plasmid" ;;
  "scgenome")
    TYPE_OPT="--sc" ;;
  "genome")
    TYPE_OPT="--isolate" ;;
  "virome")
    TYPE_OPT="--metaviral" ;;
esac

# Preferred input: trimmed FastQ reads (paired if both mates exist)
F1="$TR/${DATASET}.1.clipped.fastq.gz"
F2="$TR/${DATASET}.2.clipped.fastq.gz"
if [[ -s "$F1" ]] ; then
  if [[ -s "$F2" ]] ; then
    CMD="$CMD -1 $F1 -2 $F2"
  else
    CMD="$CMD -s $F1"
    # --meta is not passed along with single-end input
    [[ "$TYPE_OPT" == "--meta" ]] && TYPE_OPT=""
  fi
else
  # Fallback input: trimmed FastA, interleaved pairs first, then singles
  F1="$TF/${DATASET}.CoupledReads.fa.gz"
  F2="$TF/${DATASET}.SingleReads.fa.gz"
  if [[ -s "$F1" ]] ; then
    CMD="$CMD --12 $F1"
  elif [[ -s "$F2" ]] ; then
    CMD="$CMD -s $F2"
    [[ "$TYPE_OPT" == "--meta" ]] && TYPE_OPT=""
  else
    echo "No input files found to assemble" >&2
    exit 1
  fi
fi
CMD="$CMD $TYPE_OPT"
echo "$CMD"
# Intentionally unquoted so the assembled string splits into arguments.
# A non-zero exit is tolerated here; success is judged by the output check
$CMD || true
# Same contigs file that the extraction step links when no scaffolds exist
[[ -s "$DATASET/contigs.fasta" ]] || exit 1
51
91
 
52
92
  # Clean
53
- ( cd "$DATASET" && rm kmer graph-*.fa align-* local-contig-*.fa contig-*.fa )
93
+ KEEP_GR=$(miga option -P "$PROJECT" -D "$DATASET" -k keep_assembly_graphs)
94
+ [[ "$KEEP_GR" == "true" ]] || ( cd "$DATASET" && rm -R *.gfa *.fastg *.paths )
95
+ ( cd "$DATASET" && rm -R K* corrected misc pipeline_state before_rr.fasta )
54
96
 
55
97
  # Extract
56
- if [[ -s "$DATASET/scaffold.fa" ]] ; then
57
- ln -s "$DATASET/scaffold.fa" "$DATASET.AllContigs.fna"
98
+ if [[ -s "$DATASET/scaffolds.fasta" ]] ; then
99
+ ln -s "$DATASET/scaffolds.fasta" "$DATASET.AllContigs.fna"
58
100
  else
59
- ln -s "$DATASET/contig.fa" "$DATASET.AllContigs.fna"
101
+ ln -s "$DATASET/contigs.fasta" "$DATASET.AllContigs.fna"
60
102
  fi
61
103
  FastA.length.pl "$DATASET.AllContigs.fna" | awk '$2 >= 1000 { print $1 }' \
62
104
  | FastA.filter.pl /dev/stdin "$DATASET.AllContigs.fna" \
@@ -64,6 +106,7 @@ FastA.length.pl "$DATASET.AllContigs.fna" | awk '$2 >= 1000 { print $1 }' \
64
106
 
65
107
  # Finalize
66
108
  miga date > "$DATASET.done"
109
# Mode label reported next to the SPAdes version below: keep a preset
# OPT_TYPE, otherwise use the mode flag chosen for the assembly (TYPE_OPT),
# or "default" when no mode flag was selected
[[ -n "$OPT_TYPE" ]] || OPT_TYPE="${TYPE_OPT:-default}"
67
110
  cat <<VERSIONS \
68
111
  | miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f --stdin-versions
69
112
  => MiGA
@@ -74,7 +117,6 @@ $(
74
117
  echo "version unknown"
75
118
  fi
76
119
  )
77
- => IDBA-UD
78
- version unknown
120
+ => SPADES
121
+ $(spades.py --version | perl -pe 's/.* //') [$OPT_TYPE]
79
122
  VERSIONS
80
-
data/scripts/miga.bash CHANGED
@@ -15,11 +15,6 @@ function exists { [[ -e "$1" ]] ; }
15
15
  # Evaluates if the first passed argument is a function
16
16
  function fx_exists { [[ $(type -t "$1") == "function" ]] ; }
17
17
 
18
- # Override gzip with pigz (if available)
19
- if command -v pigz &>/dev/null ; then
20
- function gzip { pigz -p ${CORES:-2} "$@" ; }
21
- fi
22
-
23
18
  # Initiate a project-wide run
24
19
  function miga_start_project_step {
25
20
  local dir="$1"
@@ -11,35 +11,18 @@ b=$DATASET
11
11
  # Initialize
12
12
  miga date > "$DATASET.start"
13
13
 
14
- # FastQ -> FastA
15
- for s in 1 2 ; do
16
- in="../02.trimmed_reads/${b}.${s}.clipped.fastq.gz"
17
- [[ -s "$in" ]] \
18
- && FastQ.maskQual.rb -i "$in" -o "${b}.${s}.fasta" --fasta --qual 18
19
- done
20
-
21
- # Interpose
22
- if [[ -e "${b}.2.fasta" ]] ; then
23
- FastA.interpose.pl "${b}.CoupledReads.fa" "$b".[12].fasta
24
- else
25
- mv "${b}.1.fasta" "${b}.SingleReads.fa"
26
- fi
27
-
28
- # Gzip
14
+ # Gzip (if needed)
29
15
  for x in 1.fasta 2.fasta SingleReads.fa CoupledReads.fa ; do
30
16
  in="${b}.${x}"
31
17
  [[ -e "$in" ]] && gzip -9f "$in"
32
18
  done
33
19
 
34
20
  # Finalize
21
+ echo 'Using FastQ directly' > "${DATASET}.empty"
35
22
  miga date > "${DATASET}.done"
36
23
  cat <<VERSIONS \
37
24
  | miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f --stdin-versions
38
25
  => MiGA
39
26
  $(miga --version)
40
- => Enveomics Collection: FastQ.maskQual.rb
41
- $(FastQ.maskQual.rb --version | perl -pe 's/.* //')
42
- => Enveomics Collection: FastA.interpose.pl
43
- version unknown
44
27
  VERSIONS
45
28
 
@@ -67,7 +67,7 @@ $(miga --version)
67
67
  => Enveomics Collection: FastQ.tag.rb
68
68
  $(FastQ.tag.rb --version | perl -pe 's/.* //')
69
69
  => Multitrim
70
- version unknown
70
+ $(multitrim.py --version | perl -pe 's/.* //')
71
71
  => FaQCs
72
72
  $(FaQCs --version 2>&1 | perl -pe 's/.*: //')
73
73
  => Seqtk
data/test/common_test.rb CHANGED
@@ -26,11 +26,13 @@ class CommonTest < Test::Unit::TestCase
26
26
  assert_respond_to(MiGA::MiGA, :DEBUG_ON)
27
27
  assert_respond_to(MiGA::MiGA, :DEBUG_OFF)
28
28
  MiGA::MiGA.DEBUG_TRACE_ON
29
+ assert(MiGA::MiGA.debug_trace?)
29
30
  err = capture_stderr do
30
31
  MiGA::MiGA.DEBUG 'Dandadi'
31
32
  end
32
- assert_match(/Dandadi\n .*block in test_debug_trace/, err.string)
33
+ assert_match(/Dandadi\n .*block in .*test_debug_trace/, err.string)
33
34
  MiGA::MiGA.DEBUG_TRACE_OFF
35
+ assert(!MiGA::MiGA.debug_trace?)
34
36
  err = capture_stderr do
35
37
  MiGA::MiGA.DEBUG 'Dandada'
36
38
  end
@@ -53,7 +53,8 @@ class ResultStatsTest < Test::Unit::TestCase
53
53
  r = dataset.add_result(:trimmed_reads)
54
54
  assert_equal({}, r.stats)
55
55
  r.compute_stats
56
- assert_equal({}, r.stats)
56
+ assert_equal([nil, '%'], r.stats[:a_t_skew])
57
+ assert_equal(0, r.stats[:reads])
57
58
  end
58
59
 
59
60
  def test_read_quality
data/test/test_helper.rb CHANGED
@@ -1,9 +1,10 @@
1
+ require_relative 'test_patch'
1
2
  require 'simplecov'
2
3
  SimpleCov.start
3
4
 
4
5
  require 'rubygems'
5
- require 'test/unit'
6
6
  require 'assertions'
7
+ require 'test/unit'
7
8
  require 'miga/common'
8
9
  require 'stringio'
9
10
 
@@ -0,0 +1,20 @@
1
+ # Narrowly filter known legacy-gem noise without changing behavior.
2
+
3
##
# Wraps Warning.warn to drop a short, explicit list of known warning
# messages; every other message is forwarded to the original implementation.
module Warning
  class << self
    alias_method :__warn_original, :warn

    ##
    # Discards +msg+ when it matches one of the known noisy warnings, and
    # otherwise hands it to the original Warning.warn.
    #
    # The +category+ keyword is accepted and forwarded so that callers
    # passing it (e.g. <tt>category: :deprecated</tt>) do not fail with an
    # ArgumentError and keep the original per-category handling.
    def warn(msg, category: nil)
      # 1) test-unit <-> assertions duplicate method warning
      return if msg.include?('method redefined; discarding old assert_raise_message')
      return if msg.include?('previous definition of assert_raise_message was here')

      # 2) simplecov 0.13 "literal string will be frozen in the future"
      # (emitted by simplecov/version.rb when assigning the VERSION constant)
      return if msg.include?('simplecov/version.rb') &&
                msg.include?('literal string will be frozen in the future')

      __warn_original(msg, category: category)
    end
  end
end
@@ -44,6 +44,7 @@ module MiGA::DistanceRunner::Commands
44
44
  $stderr.puts "[#{Time.now}] ANI: #{dataset.name} vs #{targets.size} targets"
45
45
  empty_vals = targets.map { |_i| nil }
46
46
  return empty_vals unless File.size?(tmp_file('largecontigs.fa'))
47
+ return empty_vals if opts[:ani_p] == 'no'
47
48
 
48
49
  # Launch comparisons
49
50
  sbj = pending_targets(targets, :ani)
@@ -106,7 +106,7 @@ module MiGA::DistanceRunner::Database
106
106
  [n1, n2]
107
107
  ).first
108
108
  end if File.size?(db)
109
- y
109
+ y.dup
110
110
  rescue SQLite3::CorruptException => e
111
111
  $stderr.puts "Corrupt database: #{db}"
112
112
  raise e
@@ -157,7 +157,7 @@ module MiGA::DistanceRunner::Database
157
157
  data = {}
158
158
  SQLite3::Database.new(db) do |conn|
159
159
  sql = "select seq2, #{table}, sd, n, omega from #{table}"
160
- conn.execute(sql).each { |row| data[row.shift] = row }
160
+ conn.execute(sql).each { |row| r = row.dup; data[r.shift] = r }
161
161
  end
162
162
  data
163
163
  rescue => e
@@ -187,7 +187,7 @@ module MiGA::DistanceRunner::Database
187
187
  # Iterates for each entry in +db+
188
188
  def foreach_in_db(db, metric, &blk)
189
189
  SQLite3::Database.new(db) do |conn|
190
- conn.execute("select * from #{metric}").each { |r| blk[r] }
190
+ conn.execute("select * from #{metric}").each { |r| blk[r.dup] }
191
191
  end
192
192
  end
193
193
 
@@ -25,9 +25,8 @@ class MiGA::DistanceRunner
25
25
  @ref_project = project
26
26
  end
27
27
  @opts[:thr] ||= ENV.fetch('CORES') { 1 }.to_i
28
- %i[haai_p aai_p ani_p distances_checkpoint aai_save_rbm].each do |m|
29
- @opts[m] ||= ref_project.option(m)
30
- end
28
+ %i[haai_p aai_p ani_p distances_checkpoint aai_save_rbm indexing]
29
+ .each { |m| @opts[m] ||= ref_project.option(m) }
31
30
  $stderr.puts "Options: #{opts}"
32
31
  end
33
32
 
@@ -46,6 +45,27 @@ class MiGA::DistanceRunner
46
45
  # Launch analysis for reference datasets
47
46
  def go_ref!
48
47
  $stderr.puts 'Launching analysis for reference dataset'
48
+
49
+ # Check if the project is non-hierarchical
50
+ case ref_project.option(:indexing)
51
+ when 'no'
52
+ # No index? No distance
53
+ out_base = File.expand_path(dataset.name, home)
54
+ File.open("#{out_base}.empty", 'w') { |fh| fh.puts 'No indexing' }
55
+ return
56
+ when 'gsearch'
57
+ if project == ref_project
58
+ # No need to pre-calculate any distances for GSearch indexes
59
+ out_base = File.expand_path(dataset.name, home)
60
+ File.open("#{out_base}.empty", 'w') { |fh| fh.puts 'GSearch indexing' }
61
+ return
62
+ else
63
+ # Just keep going, gsearch will override haai_p and aai_p
64
+ end
65
+ when 'hierarchical'
66
+ # Just keep going
67
+ end
68
+
49
69
  # Initialize databases
50
70
  initialize_dbs! true
51
71
 
@@ -12,8 +12,10 @@ db.execute 'create table metadata(' \
12
12
  '`name` varchar(256), `field` varchar(256), `value` text)'
13
13
 
14
14
  def searchable(db, d, k, v)
15
- db.execute 'insert into metadata values(?,?,?)',
16
- d.name, k.to_s, " #{v.to_s.downcase.gsub(/[^A-Za-z0-9\-]+/, ' ')} "
15
+ db.execute(
16
+ 'insert into metadata values(?,?,?)',
17
+ [d.name, k.to_s, " #{v.to_s.downcase.gsub(/[^A-Za-z0-9\-]+/, ' ')} "]
18
+ )
17
19
  end
18
20
 
19
21
  p.each_dataset do |d|
@@ -515,7 +515,7 @@ def adapter_identification_pe(artificial_artifacts, seqtk_binary, faqcs_binary,
515
515
  begin_assessment = True
516
516
  else:
517
517
  segment = line.strip().split()
518
- detected_adapters[segment[0]] = float(re.findall("\d+\.\d+", segment[3])[0])
518
+ detected_adapters[segment[0]] = float(re.findall(r"\d+\.\d+", segment[3])[0])
519
519
 
520
520
  detection_report.close()
521
521
 
@@ -576,7 +576,7 @@ def adapter_identification_se(artificial_artifacts, seqtk_binary, faqcs_binary,
576
576
  begin_assessment = True
577
577
  else:
578
578
  segment = line.strip().split()
579
- detected_adapters[segment[0]] = float(re.findall("\d+\.\d+", segment[3])[0])
579
+ detected_adapters[segment[0]] = float(re.findall(r"\d+\.\d+", segment[3])[0])
580
580
 
581
581
  detection_report.close()
582
582
 
@@ -1028,11 +1028,13 @@ def gather_opts():
1028
1028
 
1029
1029
  parser.add_argument("--resources", dest = "resource_list", action = 'store_true', help = "Print a list of resources used by Multitrim and quit.")
1030
1030
 
1031
+ parser.add_argument("--version", dest = "version", action = 'store_true', help = "Print the version of multitrim and exit")
1032
+
1031
1033
 
1032
1034
  return(parser, parser.parse_args())
1033
1035
 
1034
1036
  def print_resources():
1035
- print("Multitrim github: https://github.com/KGerhardt/multitrim")
1037
+ print("Multitrim github: https://github.com/bio-miga/multitrim")
1036
1038
  print("MiGA adapters available at: https://github.com/bio-miga/miga/blob/main/utils/adapters.fa")
1037
1039
  internal_adapters = faqcs_internal_adapters()
1038
1040
  print("FaQCs mandatory adapters are:")
@@ -1042,6 +1044,9 @@ def print_resources():
1042
1044
  print("fastp github: https://github.com/OpenGene/fastp")
1043
1045
  print("Falco github: https://github.com/smithlabcode/falco")
1044
1046
 
1047
+ def print_version():
1048
+ print("MiGA's Multitrim 1.0")
1049
+
1045
1050
  #Program Control
1046
1051
  def main():
1047
1052
  #Keep the parser on hand so I can print usage as needed.
@@ -1052,6 +1057,10 @@ def main():
1052
1057
  print_resources()
1053
1058
  quit()
1054
1059
 
1060
+ if options.version:
1061
+ print_version()
1062
+ quit()
1063
+
1055
1064
 
1056
1065
  #Allows for the script to take no inputs and print help/usage
1057
1066
  if len(sys.argv)==1:
@@ -12,11 +12,11 @@ Bedtools bedtools http://bedtools.readthedocs.org/en/latest/
12
12
  Prodigal prodigal http://prodigal.ornl.gov
13
13
  MCL mcl http://micans.org/mcl/
14
14
  Barrnap barrnap http://www.vicbioinformatics.com/software.barrnap.shtml
15
- IDBA (reads) idba_ud http://i.cs.hku.hk/~alse/hkubrg/projects/idba
16
15
  FaQCs (reads) FaQCs https://github.com/LANL-Bioinformatics/FaQCs
17
16
  Falco (reads) falco https://github.com/smithlabcode/falco
18
17
  Seqtk (reads) seqtk https://github.com/lh3/seqtk
19
18
  Fastp (reads) fastp https://github.com/OpenGene/fastp
19
+ SPADES (reads) spades.py https://ablab.github.io/spades/ Required version: 3+
20
20
  Temurin (rdp) java https://adoptium.net/ Any Java VM would work
21
21
  MyTaxa (mytaxa) MyTaxa http://enve-omics.ce.gatech.edu/mytaxa
22
22
  Krona (mytaxa) ktImportText https://github.com/marbl/Krona/wiki
@@ -2,6 +2,7 @@
2
2
  module MiGA::SubcladeRunner::Pipeline
3
3
  # Run species-level clusterings using ANI > 95% / AAI > 90%
4
4
  def cluster_species
5
+ return unless opts[:indexing] == 'hierarchical'
5
6
  tasks = {
6
7
  ani95: [:ani_distances, opts[:gsp_ani], :ani],
7
8
  aai90: [:aai_distances, opts[:gsp_aai], :aai]
@@ -69,6 +70,17 @@ module MiGA::SubcladeRunner::Pipeline
69
70
  end
70
71
 
71
72
  def subclades(metric)
73
+ case opts[:indexing]
74
+ when 'no'
75
+ # Do nothing
76
+ when 'gsearch'
77
+ subclades_gsearch(metric)
78
+ when 'hierarchical'
79
+ subclades_hierarchical(metric)
80
+ end
81
+ end
82
+
83
+ def subclades_hierarchical(metric)
72
84
  src = File.expand_path('utils/subclades.R', MiGA::MiGA.root_path)
73
85
  step = :"#{metric}_distances"
74
86
  metric_res = project.result(step) or raise "Incomplete step #{step}"
@@ -82,6 +94,34 @@ module MiGA::SubcladeRunner::Pipeline
82
94
  end
83
95
  end
84
96
 
97
##
# Builds a GSearch index for +metric+ by running `gsearch ... tohnsw` over
# the active reference datasets of the project.
#
# For +:ani+ the large-contigs file of each dataset's assembly result is
# used; for any other metric the proteins file of its cds result is used
# and `--aa` is passed. Datasets lacking the required file are skipped.
# The command is run from inside a newly created `gsearch.d` directory
# under the current working directory.
def subclades_gsearch(metric)
  # Scratch directory holding one symlink per input file
  tmp_dir = tmp_file('genomes')
  Dir.mkdir(tmp_dir)

  # Pick the input files and the metric-specific arguments
  if metric.to_sym == :ani
    result_key = :assembly
    file_key = :largecontigs
    metric_args = %w[-s 18000]
  else
    result_key = :cds
    file_key = :proteins
    metric_args = %w[-s 12000 --aa]
  end

  project.dataset_ref_active.each do |ds|
    f = ds&.result(result_key)&.file_path(file_key) or next
    FileUtils.ln_s(f, tmp_dir)
  end

  cmd = %w[gsearch --pio 2000 --nbthreads] + [opts[:thr].to_s]
  cmd += %w[tohnsw -k 16 -n 128 --ef 1600 --algo optdens]
  # tmp_dir is a String, so it is wrapped before being appended to the Array
  cmd += %w[--scale_modify_f 0.25 -d] + [tmp_dir]
  cmd += metric_args

  Dir.mkdir('gsearch.d')
  # Block form restores the previous working directory even if run_cmd raises
  Dir.chdir('gsearch.d') { run_cmd(cmd) }
end
124
+
85
125
  def compile
86
126
  src = File.expand_path('utils/subclades-compile.rb', MiGA::MiGA.root_path)
87
127
  run_cmd(['ruby', src, '.', 'miga-project.class'])
@@ -14,20 +14,21 @@ class MiGA::SubcladeRunner
14
14
  @step == :clade_finding ? '01.find.running' : '02.ani.running'
15
15
  )
16
16
  @opts[:thr] ||= ENV.fetch('CORES') { 2 }.to_i
17
- @opts[:run_clades] = @project.option(:run_clades)
18
- @opts[:gsp_ani] = @project.option(:gsp_ani)
19
- @opts[:gsp_aai] = @project.option(:gsp_aai)
20
- @opts[:gsp_metric] = @project.option(:gsp_metric)
17
+ %i[run_clades gsp_ani gsp_aai gsp_metric indexing].each do |m|
18
+ @opts[m] = @project.option(m)
19
+ end
21
20
  end
22
21
 
23
22
  # Launch the appropriate analysis
24
23
  def go!
25
24
  return if project.type == :metagenomes
26
25
 
27
- unless @project.dataset_names.any? { |i| @project.dataset(i).ref? }
26
+ if @opts[:indexing] == 'no' ||
27
+ !@project.dataset_names.any? { |i| @project.dataset(i).ref? }
28
28
  FileUtils.touch(File.join(@home, 'miga-project.empty'))
29
29
  return
30
30
  end
31
+
31
32
  Dir.chdir home
32
33
  Dir.mktmpdir do |tmp_dir|
33
34
  @tmp = tmp_dir
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.22.6
4
+ version: 1.4.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-05-19 00:00:00.000000000 Z
11
+ date: 2025-09-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons
@@ -44,14 +44,14 @@ dependencies:
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '1.3'
47
+ version: '2.7'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '1.3'
54
+ version: '2.7'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: net-http
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -100,14 +100,14 @@ dependencies:
100
100
  requirements:
101
101
  - - "~>"
102
102
  - !ruby/object:Gem::Version
103
- version: '12'
103
+ version: '13.0'
104
104
  type: :development
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
108
  - - "~>"
109
109
  - !ruby/object:Gem::Version
110
- version: '12'
110
+ version: '13.0'
111
111
  - !ruby/object:Gem::Dependency
112
112
  name: test-unit
113
113
  requirement: !ruby/object:Gem::Requirement
@@ -317,6 +317,7 @@ files:
317
317
  - test/tax_index_test.rb
318
318
  - test/taxonomy_test.rb
319
319
  - test/test_helper.rb
320
+ - test/test_patch.rb
320
321
  - test/with_daemon_test.rb
321
322
  - test/with_option_test.rb
322
323
  - utils/FastAAI/FastAAI-legacy/FastAAI
@@ -642,7 +643,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
642
643
  requirements:
643
644
  - - ">="
644
645
  - !ruby/object:Gem::Version
645
- version: '2.7'
646
+ version: '3.1'
646
647
  required_rubygems_version: !ruby/object:Gem::Requirement
647
648
  requirements:
648
649
  - - ">="