miga-base 1.3.22.6 → 1.4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/add.rb +9 -1
- data/lib/miga/cli/action/doctor/base.rb +3 -2
- data/lib/miga/cli/action/env.rb +4 -0
- data/lib/miga/cli/action/files.rb +5 -2
- data/lib/miga/cli/action/init.rb +2 -2
- data/lib/miga/cli/action/option.rb +4 -0
- data/lib/miga/cli/action/run.rb +1 -1
- data/lib/miga/cli/opt_helper.rb +12 -6
- data/lib/miga/dataset/base.rb +3 -0
- data/lib/miga/dataset/result/add.rb +3 -1
- data/lib/miga/dataset.rb +5 -1
- data/lib/miga/project/base.rb +6 -1
- data/lib/miga/result/stats.rb +4 -0
- data/lib/miga/version.rb +2 -2
- data/scripts/assembly.bash +57 -15
- data/scripts/miga.bash +0 -5
- data/scripts/trimmed_fasta.bash +2 -19
- data/scripts/trimmed_reads.bash +1 -1
- data/test/common_test.rb +3 -1
- data/test/result_stats_test.rb +2 -1
- data/test/test_helper.rb +2 -1
- data/test/test_patch.rb +20 -0
- data/utils/distance/commands.rb +1 -0
- data/utils/distance/database.rb +3 -3
- data/utils/distance/runner.rb +23 -3
- data/utils/index_metadata.rb +4 -2
- data/utils/multitrim/multitrim.py +12 -3
- data/utils/requirements.txt +1 -1
- data/utils/subclade/pipeline.rb +40 -0
- data/utils/subclade/runner.rb +6 -5
- metadata +8 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5525ee26722f9be06bf3d8ddd8a94909457c55d8e2ac9d8f137bf44e15cf2035
|
4
|
+
data.tar.gz: ce4036f422bb2cc4e5da887d0dfaea6de89a73f152f63269c5894644f8a92127
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8b3393571b1f4425a243181ba50117829b264022c02ac251717721bb6ef092ac81f323b8ccb1f4ea1989fe77d61b9f224115f0758f81a0963503127938b59b3f
|
7
|
+
data.tar.gz: d6dd244f7ea5261900d4fc8329398e5bb07a44e28e0ddf8706c546798546814735142c0179831193a4a48d8b3c2bdc93745bd0a021ee31df7ec385f7ef58c4fe
|
data/lib/miga/cli/action/add.rb
CHANGED
@@ -170,7 +170,15 @@ class MiGA::Cli::Action::Add < MiGA::Cli::Action
|
|
170
170
|
r_path = File.expand_path("data/#{r_dir}/#{d.name}", p.path)
|
171
171
|
file_type[2].each_with_index do |ext, i|
|
172
172
|
gz = file[i] =~ /\.gz/ ? '.gz' : ''
|
173
|
-
|
173
|
+
fo = "#{r_path}#{ext}#{gz}"
|
174
|
+
if gz == ''
|
175
|
+
FileUtils.cp(file[i], fo)
|
176
|
+
else
|
177
|
+
MiGA::MiGA.run_cmd(
|
178
|
+
"gzip -cd #{file[i].shellescape} | gzip -c > #{fo.shellescape}",
|
179
|
+
source: :miga_env # <- To load pigz if available
|
180
|
+
)
|
181
|
+
end
|
174
182
|
cli.say " file: #{File.basename(file[i])}"
|
175
183
|
end
|
176
184
|
File.open("#{r_path}.done", 'w') { |f| f.print Time.now.to_s }
|
@@ -83,8 +83,9 @@ module MiGA::Cli::Action::Doctor::Base
|
|
83
83
|
data = MiGA::SQLite.new(db_file).run(sql) || []
|
84
84
|
Hash[
|
85
85
|
data.map do |row|
|
86
|
-
|
87
|
-
|
86
|
+
r = row.dup
|
87
|
+
k, v = r.shift(2)
|
88
|
+
[k, r.all?(&:zero?) ? v : [v] + r]
|
88
89
|
end
|
89
90
|
]
|
90
91
|
end
|
data/lib/miga/cli/action/env.rb
CHANGED
@@ -18,6 +18,10 @@ class MiGA::Cli::Action::Env < MiGA::Cli::Action
|
|
18
18
|
for util in enveomics/Scripts FastAAI/fastaai multitrim ; do
|
19
19
|
export PATH="$MIGA/utils/$util:$PATH"
|
20
20
|
done
|
21
|
+
# Override gzip with pigz (if available)
|
22
|
+
if command -v pigz &>/dev/null ; then
|
23
|
+
function gzip { pigz -p ${CORES:-2} "$@" ; }
|
24
|
+
fi
|
21
25
|
BASH
|
22
26
|
end
|
23
27
|
|
@@ -7,7 +7,7 @@ class MiGA::Cli::Action::Files < MiGA::Cli::Action
|
|
7
7
|
def parse_cli
|
8
8
|
cli.defaults = { details: false, json: true }
|
9
9
|
cli.parse do |opt|
|
10
|
-
cli.opt_object(opt, [
|
10
|
+
cli.opt_object(opt, %i[project dataset_opt result_opt])
|
11
11
|
opt.on(
|
12
12
|
'-i', '--info',
|
13
13
|
'Print additional details for each file'
|
@@ -21,7 +21,10 @@ class MiGA::Cli::Action::Files < MiGA::Cli::Action
|
|
21
21
|
end
|
22
22
|
|
23
23
|
def perform
|
24
|
-
cli.load_project_or_dataset
|
24
|
+
obj = cli.load_project_or_dataset
|
25
|
+
res = cli[:result] ? [cli.load_result] : obj.results # reuse the object loaded above instead of loading it again
|
26
|
+
res.each do |res|
|
27
|
+
sym = res.key
|
25
28
|
cli.puts "#{"#{sym}\tjson\t" if cli[:details]}#{res.path}" if cli[:json]
|
26
29
|
res.each_file do |k, f|
|
27
30
|
cli.puts "#{"#{sym}\t#{k}\t" if cli[:details]}#{res.dir}/#{f}"
|
data/lib/miga/cli/action/init.rb
CHANGED
@@ -140,7 +140,7 @@ class MiGA::Cli::Action::Init < MiGA::Cli::Action
|
|
140
140
|
paths[r[1]] = cli[:"path_to_#{r[1]}"]
|
141
141
|
cli.puts "user-provided: #{paths[r[1]]}"
|
142
142
|
else
|
143
|
-
path = find_software(r[1])
|
143
|
+
path = find_software(r[1], rc_fh)
|
144
144
|
paths[r[1]] = File.expand_path(r[1], path).shellescape
|
145
145
|
end
|
146
146
|
end
|
@@ -169,7 +169,7 @@ class MiGA::Cli::Action::Init < MiGA::Cli::Action
|
|
169
169
|
) == 'yes'
|
170
170
|
end
|
171
171
|
|
172
|
-
def find_software(exec)
|
172
|
+
def find_software(exec, rc_fh)
|
173
173
|
path = nil
|
174
174
|
loop do
|
175
175
|
d_path = File.dirname(run_cmd(cli, ['which', exec], raise: false))
|
@@ -14,6 +14,10 @@ class MiGA::Cli::Action::Option < MiGA::Cli::Action
|
|
14
14
|
) { |v| cli[:key] = v }
|
15
15
|
opt.on(
|
16
16
|
'--value STRING',
|
17
|
+
'::HIDE::' # Replaced by --set, but aliased for backward compatibility
|
18
|
+
) { |v| cli[:value] = v }
|
19
|
+
opt.on(
|
20
|
+
'-s', '--set STRING',
|
17
21
|
'Value of the option to set (by default, option value is not changed)',
|
18
22
|
'Recognized tokens: nil, true, false'
|
19
23
|
) { |v| cli[:value] = v }
|
data/lib/miga/cli/action/run.rb
CHANGED
@@ -7,7 +7,7 @@ class MiGA::Cli::Action::Run < MiGA::Cli::Action
|
|
7
7
|
def parse_cli
|
8
8
|
cli.defaults = { try_load: false, thr: 1, env: false, check_first: false }
|
9
9
|
cli.parse do |opt|
|
10
|
-
cli.opt_object(opt, [
|
10
|
+
cli.opt_object(opt, %i[project dataset_opt result_opt])
|
11
11
|
opt.on(
|
12
12
|
'-t', '--threads INT', Integer,
|
13
13
|
"Threads to use in the local run (by default: #{cli[:thr]})"
|
data/lib/miga/cli/opt_helper.rb
CHANGED
@@ -92,22 +92,22 @@ module MiGA::Cli::OptHelper
|
|
92
92
|
opt.on(
|
93
93
|
'-r', '--result STRING',
|
94
94
|
"#{'(Mandatory) ' if w == :result}Name of the result",
|
95
|
-
'Recognized names for dataset-specific results include:',
|
96
|
-
*MiGA::Dataset.RESULT_DIRS.keys
|
97
|
-
'Recognized names for project-wide results include:',
|
98
|
-
*MiGA::Project.RESULT_DIRS.keys
|
95
|
+
'~ Recognized names for dataset-specific results include:',
|
96
|
+
*list_to_paragraph(MiGA::Dataset.RESULT_DIRS.keys),
|
97
|
+
'~ Recognized names for project-wide results include:',
|
98
|
+
*list_to_paragraph(MiGA::Project.RESULT_DIRS.keys)
|
99
99
|
) { |v| self[:result] = v.downcase.to_sym }
|
100
100
|
when :result_dataset
|
101
101
|
opt.on(
|
102
102
|
'-r', '--result STRING',
|
103
103
|
'(Mandatory) Name of the result, one of:',
|
104
|
-
*MiGA::Dataset.RESULT_DIRS.keys
|
104
|
+
*list_to_paragraph(MiGA::Dataset.RESULT_DIRS.keys, indent: 0)
|
105
105
|
) { |v| self[:result] = v.downcase.to_sym }
|
106
106
|
when :result_project
|
107
107
|
opt.on(
|
108
108
|
'-r', '--result STRING',
|
109
109
|
'(Mandatory) Name of the result, one of:',
|
110
|
-
*MiGA::Project.RESULT_DIRS.keys
|
110
|
+
*list_to_paragraph(MiGA::Project.RESULT_DIRS.keys, indent: 0)
|
111
111
|
) { |v| self[:result] = v.downcase.to_sym }
|
112
112
|
else
|
113
113
|
raise "Internal error: Unrecognized option: #{w}"
|
@@ -174,4 +174,10 @@ module MiGA::Cli::OptHelper
|
|
174
174
|
sym = flag.to_sym if sym.nil?
|
175
175
|
opt.on("--#{flag.to_s.tr('_', '-')}", description) { |v| self[sym] = v }
|
176
176
|
end
|
177
|
+
|
178
|
+
def list_to_paragraph(list, width: 50, indent: 2)
|
179
|
+
list.map(&:to_s).join(', ')
|
180
|
+
.scan(/\S.{0,#{width}}\S(?=\s|$)|\S+/).to_a
|
181
|
+
.map { |i| "#{' ' * indent}#{i}" }
|
182
|
+
end
|
177
183
|
end
|
data/lib/miga/dataset/base.rb
CHANGED
@@ -158,6 +158,9 @@ module MiGA::Dataset::Base
|
|
158
158
|
},
|
159
159
|
dist_req: {
|
160
160
|
desc: 'Run distances against these datasets', type: Array, default: []
|
161
|
+
},
|
162
|
+
keep_assembly_graphs: {
|
163
|
+
desc: 'Do not clean assembly graphs', in: [true, false], default: false
|
161
164
|
}
|
162
165
|
}
|
163
166
|
end
|
@@ -58,10 +58,12 @@ module MiGA::Dataset::Result::Add
|
|
58
58
|
return nil unless
|
59
59
|
result_files_exist?(base, '.CoupledReads.fa') ||
|
60
60
|
result_files_exist?(base, '.SingleReads.fa') ||
|
61
|
-
result_files_exist?(base, %w[.1.fasta .2.fasta])
|
61
|
+
result_files_exist?(base, %w[.1.fasta .2.fasta]) ||
|
62
|
+
result_files_exist?(base, '.empty')
|
62
63
|
|
63
64
|
add_files_to_ds_result(
|
64
65
|
MiGA::Result.new("#{base}.json"), name,
|
66
|
+
empty: '.empty',
|
65
67
|
coupled: '.CoupledReads.fa',
|
66
68
|
single: '.SingleReads.fa',
|
67
69
|
pair1: '.1.fasta',
|
data/lib/miga/dataset.rb
CHANGED
@@ -199,7 +199,9 @@ class MiGA::Dataset < MiGA::MiGA
|
|
199
199
|
|
200
200
|
##
|
201
201
|
# Retrieves the option with name +key+ from the dataset's metadata
|
202
|
-
# extending support to relative paths in +:db_project
|
202
|
+
# extending support to relative paths in +:db_project+, and for all
|
203
|
+
# other options looks for metadata defined in the project before
|
204
|
+
# returning the default
|
203
205
|
def option_by_metadata(key)
|
204
206
|
case key.to_sym
|
205
207
|
when :db_project
|
@@ -209,6 +211,8 @@ class MiGA::Dataset < MiGA::MiGA
|
|
209
211
|
return y
|
210
212
|
end
|
211
213
|
|
214
|
+
return project.metadata[key] unless project.metadata[key].nil?
|
215
|
+
|
212
216
|
super
|
213
217
|
end
|
214
218
|
end
|
data/lib/miga/project/base.rb
CHANGED
@@ -133,6 +133,11 @@ module MiGA::Project::Base
|
|
133
133
|
desc: 'Maximum p-value to transfer taxonomy', default: 0.1, type: Float,
|
134
134
|
in: 0.0..1.0
|
135
135
|
},
|
136
|
+
indexing: {
|
137
|
+
desc: 'Approach used to index the collection as database', type: String,
|
138
|
+
default: 'hierarchical',
|
139
|
+
in: %w[hierarchical gsearch no]
|
140
|
+
},
|
136
141
|
haai_p: {
|
137
142
|
desc: 'Value of aai.rb -p on hAAI', type: String,
|
138
143
|
default: proc { |project|
|
@@ -146,7 +151,7 @@ module MiGA::Project::Base
|
|
146
151
|
},
|
147
152
|
ani_p: {
|
148
153
|
desc: 'Value of ani.rb -p on ANI', default: 'fastani', type: String,
|
149
|
-
in: %w[blast+ blast blat fastani]
|
154
|
+
in: %w[blast+ blast blat fastani no]
|
150
155
|
},
|
151
156
|
max_try: {
|
152
157
|
desc: 'Maximum number of task attempts', default: 10, type: Integer,
|
data/lib/miga/result/stats.rb
CHANGED
@@ -42,6 +42,10 @@ module MiGA::Result::Stats
|
|
42
42
|
stats
|
43
43
|
end
|
44
44
|
|
45
|
+
def compute_stats_trimmed_reads
|
46
|
+
compute_stats_raw_reads
|
47
|
+
end
|
48
|
+
|
45
49
|
def compute_stats_trimmed_fasta
|
46
50
|
f = self[:files][:coupled].nil? ? file_path(:single) : file_path(:coupled)
|
47
51
|
s = MiGA::MiGA.seqs_length(f, :fasta, gc: true, x: true, skew: true)
|
data/lib/miga/version.rb
CHANGED
@@ -12,7 +12,7 @@ module MiGA
|
|
12
12
|
# - String indicating release status:
|
13
13
|
# - rc* release candidate, not released as gem
|
14
14
|
# - [0-9]+ stable release, released as gem
|
15
|
-
VERSION = [1.3, 22, 6].freeze
|
15
|
+
VERSION = [1.4, 0, 0].freeze
|
16
16
|
|
17
17
|
##
|
18
18
|
# Nickname for the current major.minor version.
|
@@ -20,7 +20,7 @@ module MiGA
|
|
20
20
|
|
21
21
|
##
|
22
22
|
# Date of the current gem release.
|
23
|
-
VERSION_DATE = Date.new(2025,
|
23
|
+
VERSION_DATE = Date.new(2025, 9, 23)
|
24
24
|
|
25
25
|
##
|
26
26
|
# References of MiGA
|
data/scripts/assembly.bash
CHANGED
@@ -11,6 +11,7 @@ miga date > "$DATASET.start"
|
|
11
11
|
|
12
12
|
# Interpose (if needed)
|
13
13
|
interpose=no
|
14
|
+
TR="../02.trimmed_reads"
|
14
15
|
TF="../04.trimmed_fasta"
|
15
16
|
b=$DATASET
|
16
17
|
if [[ -s "$TF/${b}.2.fasta" || -s "$TF/${b}.2.fasta.gz" ]] ; then
|
@@ -38,25 +39,66 @@ for i in SingleReads CoupledReads ; do
|
|
38
39
|
miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_fasta -f
|
39
40
|
fi
|
40
41
|
done
|
42
|
+
for i in 1 2 ; do
|
43
|
+
base="$TR/${DATASET}.${i}.clipped.fastq"
|
44
|
+
if [[ -e "$base" && ! -s "${base}.gz" ]] ; then
|
45
|
+
gzip -9f "$base"
|
46
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
|
47
|
+
fi
|
48
|
+
done
|
41
49
|
|
42
50
|
# Assemble
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
+
CMD="spades.py -o $DATASET -t $CORES"
|
52
|
+
TYPE_OPT=""
|
53
|
+
case "$(miga ls -P "$PROJECT" -D "$DATASET" -m type | cut -f 2)" in
|
54
|
+
"metagenome")
|
55
|
+
TYPE_OPT="--meta" ;;
|
56
|
+
"plasmid")
|
57
|
+
TYPE_OPT="--plasmid" ;;
|
58
|
+
"scgenome")
|
59
|
+
TYPE_OPT="--sc" ;;
|
60
|
+
"genome")
|
61
|
+
TYPE_OPT="--isolate" ;;
|
62
|
+
"virome")
|
63
|
+
TYPE_OPT="--metaviral" ;;
|
64
|
+
esac
|
65
|
+
F1="$TR/${DATASET}.1.clipped.fastq.gz"
|
66
|
+
F2="$TR/${DATASET}.2.clipped.fastq.gz"
|
67
|
+
if [[ -s "$F1" ]] ; then
|
68
|
+
if [[ -s "$F2" ]] ; then
|
69
|
+
CMD="$CMD -1 $F1 -2 $F2"
|
70
|
+
else
|
71
|
+
CMD="$CMD -s $F1"
|
72
|
+
[[ "$TYPE_OPT" == "--meta" ]] && TYPE_OPT=""
|
73
|
+
fi
|
74
|
+
else
|
75
|
+
F1="$TF/${DATASET}.CoupledReads.fa.gz"
|
76
|
+
F2="$TF/${DATASET}.SingleReads.fa.gz" # single reads go in F2 so F1 keeps the interleaved (coupled) file
|
77
|
+
if [[ -s "$F1" ]] ; then
|
78
|
+
CMD="$CMD --12 $F1"
|
79
|
+
elif [[ -s "$F2" ]] ; then
|
80
|
+
CMD="$CMD -s $F2"
|
81
|
+
[[ "$TYPE_OPT" == "--meta" ]] && TYPE_OPT=""
|
82
|
+
else
|
83
|
+
echo "No input files found to assemble" >&2
|
84
|
+
exit 1
|
85
|
+
fi
|
86
|
+
fi
|
87
|
+
CMD="$CMD $TYPE_OPT"
|
88
|
+
echo "$CMD"
|
89
|
+
$CMD || true
|
90
|
+
[[ -s "$DATASET/contigs.fasta" ]] || exit 1 # same contigs.fasta file that the Extract step links
|
51
91
|
|
52
92
|
# Clean
|
53
|
-
(
|
93
|
+
KEEP_GR=$(miga option -P "$PROJECT" -D "$DATASET" -k keep_assembly_graphs)
|
94
|
+
[[ "$KEEP_GR" == "true" ]] || ( cd "$DATASET" && rm -R *.gfa *.fastg *.paths )
|
95
|
+
( cd "$DATASET" && rm -R K* corrected misc pipeline_state before_rr.fasta )
|
54
96
|
|
55
97
|
# Extract
|
56
|
-
if [[ -s "$DATASET/
|
57
|
-
ln -s "$DATASET/
|
98
|
+
if [[ -s "$DATASET/scaffolds.fasta" ]] ; then
|
99
|
+
ln -s "$DATASET/scaffolds.fasta" "$DATASET.AllContigs.fna"
|
58
100
|
else
|
59
|
-
ln -s "$DATASET/
|
101
|
+
ln -s "$DATASET/contigs.fasta" "$DATASET.AllContigs.fna"
|
60
102
|
fi
|
61
103
|
FastA.length.pl "$DATASET.AllContigs.fna" | awk '$2 >= 1000 { print $1 }' \
|
62
104
|
| FastA.filter.pl /dev/stdin "$DATASET.AllContigs.fna" \
|
@@ -64,6 +106,7 @@ FastA.length.pl "$DATASET.AllContigs.fna" | awk '$2 >= 1000 { print $1 }' \
|
|
64
106
|
|
65
107
|
# Finalize
|
66
108
|
miga date > "$DATASET.done"
|
109
|
+
[[ -n "$OPT_TYPE" ]] || OPT_TYPE="${TYPE_OPT:-default}" # report the SPAdes mode chosen in TYPE_OPT, or "default" if none
|
67
110
|
cat <<VERSIONS \
|
68
111
|
| miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f --stdin-versions
|
69
112
|
=> MiGA
|
@@ -74,7 +117,6 @@ $(
|
|
74
117
|
echo "version unknown"
|
75
118
|
fi
|
76
119
|
)
|
77
|
-
=>
|
78
|
-
version
|
120
|
+
=> SPADES
|
121
|
+
$(spades.py --version | perl -pe 's/.* //') [$OPT_TYPE]
|
79
122
|
VERSIONS
|
80
|
-
|
data/scripts/miga.bash
CHANGED
@@ -15,11 +15,6 @@ function exists { [[ -e "$1" ]] ; }
|
|
15
15
|
# Evaluates if the first passed argument is a function
|
16
16
|
function fx_exists { [[ $(type -t "$1") == "function" ]] ; }
|
17
17
|
|
18
|
-
# Override gzip with pigz (if available)
|
19
|
-
if command -v pigz &>/dev/null ; then
|
20
|
-
function gzip { pigz -p ${CORES:-2} "$@" ; }
|
21
|
-
fi
|
22
|
-
|
23
18
|
# Initiate a project-wide run
|
24
19
|
function miga_start_project_step {
|
25
20
|
local dir="$1"
|
data/scripts/trimmed_fasta.bash
CHANGED
@@ -11,35 +11,18 @@ b=$DATASET
|
|
11
11
|
# Initialize
|
12
12
|
miga date > "$DATASET.start"
|
13
13
|
|
14
|
-
#
|
15
|
-
for s in 1 2 ; do
|
16
|
-
in="../02.trimmed_reads/${b}.${s}.clipped.fastq.gz"
|
17
|
-
[[ -s "$in" ]] \
|
18
|
-
&& FastQ.maskQual.rb -i "$in" -o "${b}.${s}.fasta" --fasta --qual 18
|
19
|
-
done
|
20
|
-
|
21
|
-
# Interpose
|
22
|
-
if [[ -e "${b}.2.fasta" ]] ; then
|
23
|
-
FastA.interpose.pl "${b}.CoupledReads.fa" "$b".[12].fasta
|
24
|
-
else
|
25
|
-
mv "${b}.1.fasta" "${b}.SingleReads.fa"
|
26
|
-
fi
|
27
|
-
|
28
|
-
# Gzip
|
14
|
+
# Gzip (if needed)
|
29
15
|
for x in 1.fasta 2.fasta SingleReads.fa CoupledReads.fa ; do
|
30
16
|
in="${b}.${x}"
|
31
17
|
[[ -e "$in" ]] && gzip -9f "$in"
|
32
18
|
done
|
33
19
|
|
34
20
|
# Finalize
|
21
|
+
echo 'Using FastQ directly' > "${DATASET}.empty"
|
35
22
|
miga date > "${DATASET}.done"
|
36
23
|
cat <<VERSIONS \
|
37
24
|
| miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f --stdin-versions
|
38
25
|
=> MiGA
|
39
26
|
$(miga --version)
|
40
|
-
=> Enveomics Collection: FastQ.maskQual.rb
|
41
|
-
$(FastQ.maskQual.rb --version | perl -pe 's/.* //')
|
42
|
-
=> Enveomics Collection: FastA.interpose.pl
|
43
|
-
version unknown
|
44
27
|
VERSIONS
|
45
28
|
|
data/scripts/trimmed_reads.bash
CHANGED
@@ -67,7 +67,7 @@ $(miga --version)
|
|
67
67
|
=> Enveomics Collection: FastQ.tag.rb
|
68
68
|
$(FastQ.tag.rb --version | perl -pe 's/.* //')
|
69
69
|
=> Multitrim
|
70
|
-
version
|
70
|
+
$(multitrim.py --version | perl -pe 's/.* //')
|
71
71
|
=> FaQCs
|
72
72
|
$(FaQCs --version 2>&1 | perl -pe 's/.*: //')
|
73
73
|
=> Seqtk
|
data/test/common_test.rb
CHANGED
@@ -26,11 +26,13 @@ class CommonTest < Test::Unit::TestCase
|
|
26
26
|
assert_respond_to(MiGA::MiGA, :DEBUG_ON)
|
27
27
|
assert_respond_to(MiGA::MiGA, :DEBUG_OFF)
|
28
28
|
MiGA::MiGA.DEBUG_TRACE_ON
|
29
|
+
assert(MiGA::MiGA.debug_trace?)
|
29
30
|
err = capture_stderr do
|
30
31
|
MiGA::MiGA.DEBUG 'Dandadi'
|
31
32
|
end
|
32
|
-
assert_match(/Dandadi\n .*block in test_debug_trace/, err.string)
|
33
|
+
assert_match(/Dandadi\n .*block in .*test_debug_trace/, err.string)
|
33
34
|
MiGA::MiGA.DEBUG_TRACE_OFF
|
35
|
+
assert(!MiGA::MiGA.debug_trace?)
|
34
36
|
err = capture_stderr do
|
35
37
|
MiGA::MiGA.DEBUG 'Dandada'
|
36
38
|
end
|
data/test/result_stats_test.rb
CHANGED
@@ -53,7 +53,8 @@ class ResultStatsTest < Test::Unit::TestCase
|
|
53
53
|
r = dataset.add_result(:trimmed_reads)
|
54
54
|
assert_equal({}, r.stats)
|
55
55
|
r.compute_stats
|
56
|
-
assert_equal(
|
56
|
+
assert_equal([nil, '%'], r.stats[:a_t_skew])
|
57
|
+
assert_equal(0, r.stats[:reads])
|
57
58
|
end
|
58
59
|
|
59
60
|
def test_read_quality
|
data/test/test_helper.rb
CHANGED
data/test/test_patch.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# Narrowly filter known legacy-gem noise without changing behavior.
|
2
|
+
|
3
|
+
module Warning
|
4
|
+
class << self
|
5
|
+
alias_method :__warn_original, :warn
|
6
|
+
|
7
|
+
def warn(msg)
|
8
|
+
# 1) test-unit <-> assertions duplicate method warning
|
9
|
+
return if msg.include?('method redefined; discarding old assert_raise_message')
|
10
|
+
return if msg.include?('previous definition of assert_raise_message was here')
|
11
|
+
|
12
|
+
# 2) simplecov 0.13 "literal string will be frozen in the future"
|
13
|
+
# (emitted by simplecov/version.rb when assigning the VERSION constant)
|
14
|
+
return if msg.include?('simplecov/version.rb') &&
|
15
|
+
msg.include?('literal string will be frozen in the future')
|
16
|
+
|
17
|
+
__warn_original(msg)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
data/utils/distance/commands.rb
CHANGED
@@ -44,6 +44,7 @@ module MiGA::DistanceRunner::Commands
|
|
44
44
|
$stderr.puts "[#{Time.now}] ANI: #{dataset.name} vs #{targets.size} targets"
|
45
45
|
empty_vals = targets.map { |_i| nil }
|
46
46
|
return empty_vals unless File.size?(tmp_file('largecontigs.fa'))
|
47
|
+
return empty_vals if opts[:ani_p] == 'no'
|
47
48
|
|
48
49
|
# Launch comparisons
|
49
50
|
sbj = pending_targets(targets, :ani)
|
data/utils/distance/database.rb
CHANGED
@@ -106,7 +106,7 @@ module MiGA::DistanceRunner::Database
|
|
106
106
|
[n1, n2]
|
107
107
|
).first
|
108
108
|
end if File.size?(db)
|
109
|
-
y
|
109
|
+
y.dup
|
110
110
|
rescue SQLite3::CorruptException => e
|
111
111
|
$stderr.puts "Corrupt database: #{db}"
|
112
112
|
raise e
|
@@ -157,7 +157,7 @@ module MiGA::DistanceRunner::Database
|
|
157
157
|
data = {}
|
158
158
|
SQLite3::Database.new(db) do |conn|
|
159
159
|
sql = "select seq2, #{table}, sd, n, omega from #{table}"
|
160
|
-
conn.execute(sql).each { |row| data[
|
160
|
+
conn.execute(sql).each { |row| r = row.dup; data[r.shift] = r }
|
161
161
|
end
|
162
162
|
data
|
163
163
|
rescue => e
|
@@ -187,7 +187,7 @@ module MiGA::DistanceRunner::Database
|
|
187
187
|
# Iterates for each entry in +db+
|
188
188
|
def foreach_in_db(db, metric, &blk)
|
189
189
|
SQLite3::Database.new(db) do |conn|
|
190
|
-
conn.execute("select * from #{metric}").each { |r| blk[r] }
|
190
|
+
conn.execute("select * from #{metric}").each { |r| blk[r.dup] }
|
191
191
|
end
|
192
192
|
end
|
193
193
|
|
data/utils/distance/runner.rb
CHANGED
@@ -25,9 +25,8 @@ class MiGA::DistanceRunner
|
|
25
25
|
@ref_project = project
|
26
26
|
end
|
27
27
|
@opts[:thr] ||= ENV.fetch('CORES') { 1 }.to_i
|
28
|
-
%i[haai_p aai_p ani_p distances_checkpoint aai_save_rbm]
|
29
|
-
@opts[m] ||= ref_project.option(m)
|
30
|
-
end
|
28
|
+
%i[haai_p aai_p ani_p distances_checkpoint aai_save_rbm indexing]
|
29
|
+
.each { |m| @opts[m] ||= ref_project.option(m) }
|
31
30
|
$stderr.puts "Options: #{opts}"
|
32
31
|
end
|
33
32
|
|
@@ -46,6 +45,27 @@ class MiGA::DistanceRunner
|
|
46
45
|
# Launch analysis for reference datasets
|
47
46
|
def go_ref!
|
48
47
|
$stderr.puts 'Launching analysis for reference dataset'
|
48
|
+
|
49
|
+
# Check if the project is non-hierarchical
|
50
|
+
case ref_project.option(:indexing)
|
51
|
+
when 'no'
|
52
|
+
# No index? No distance
|
53
|
+
out_base = File.expand_path(dataset.name, home)
|
54
|
+
File.open("#{out_base}.empty", 'w') { |fh| fh.puts 'No indexing' }
|
55
|
+
return
|
56
|
+
when 'gsearch'
|
57
|
+
if project == ref_project
|
58
|
+
# No need to pre-calculate any distances for GSearch indexes
|
59
|
+
out_base = File.expand_path(dataset.name, home)
|
60
|
+
File.open("#{out_base}.empty", 'w') { |fh| fh.puts 'GSearch indexing' }
|
61
|
+
return
|
62
|
+
else
|
63
|
+
# Just keep going, gsearch will override haai_p and aai_p
|
64
|
+
end
|
65
|
+
when 'hierarchical'
|
66
|
+
# Just keep going
|
67
|
+
end
|
68
|
+
|
49
69
|
# Initialize databases
|
50
70
|
initialize_dbs! true
|
51
71
|
|
data/utils/index_metadata.rb
CHANGED
@@ -12,8 +12,10 @@ db.execute 'create table metadata(' \
|
|
12
12
|
'`name` varchar(256), `field` varchar(256), `value` text)'
|
13
13
|
|
14
14
|
def searchable(db, d, k, v)
|
15
|
-
db.execute
|
16
|
-
|
15
|
+
db.execute(
|
16
|
+
'insert into metadata values(?,?,?)',
|
17
|
+
[d.name, k.to_s, " #{v.to_s.downcase.gsub(/[^A-Za-z0-9\-]+/, ' ')} "]
|
18
|
+
)
|
17
19
|
end
|
18
20
|
|
19
21
|
p.each_dataset do |d|
|
@@ -515,7 +515,7 @@ def adapter_identification_pe(artificial_artifacts, seqtk_binary, faqcs_binary,
|
|
515
515
|
begin_assessment = True
|
516
516
|
else:
|
517
517
|
segment = line.strip().split()
|
518
|
-
detected_adapters[segment[0]] = float(re.findall("\d+\.\d+", segment[3])[0])
|
518
|
+
detected_adapters[segment[0]] = float(re.findall(r"\d+\.\d+", segment[3])[0])
|
519
519
|
|
520
520
|
detection_report.close()
|
521
521
|
|
@@ -576,7 +576,7 @@ def adapter_identification_se(artificial_artifacts, seqtk_binary, faqcs_binary,
|
|
576
576
|
begin_assessment = True
|
577
577
|
else:
|
578
578
|
segment = line.strip().split()
|
579
|
-
detected_adapters[segment[0]] = float(re.findall("\d+\.\d+", segment[3])[0])
|
579
|
+
detected_adapters[segment[0]] = float(re.findall(r"\d+\.\d+", segment[3])[0])
|
580
580
|
|
581
581
|
detection_report.close()
|
582
582
|
|
@@ -1028,11 +1028,13 @@ def gather_opts():
|
|
1028
1028
|
|
1029
1029
|
parser.add_argument("--resources", dest = "resource_list", action = 'store_true', help = "Print a list of resources used by Multitrim and quit.")
|
1030
1030
|
|
1031
|
+
parser.add_argument("--version", dest = "version", action = 'store_true', help = "Print the version of multitrim and exit")
|
1032
|
+
|
1031
1033
|
|
1032
1034
|
return(parser, parser.parse_args())
|
1033
1035
|
|
1034
1036
|
def print_resources():
|
1035
|
-
print("Multitrim github: https://github.com/
|
1037
|
+
print("Multitrim github: https://github.com/bio-miga/multitrim")
|
1036
1038
|
print("MiGA adapters available at: https://github.com/bio-miga/miga/blob/main/utils/adapters.fa")
|
1037
1039
|
internal_adapters = faqcs_internal_adapters()
|
1038
1040
|
print("FaQCs mandatory adapters are:")
|
@@ -1042,6 +1044,9 @@ def print_resources():
|
|
1042
1044
|
print("fastp github: https://github.com/OpenGene/fastp")
|
1043
1045
|
print("Falco github: https://github.com/smithlabcode/falco")
|
1044
1046
|
|
1047
|
+
def print_version():
|
1048
|
+
print("MiGA's Multitrim 1.0")
|
1049
|
+
|
1045
1050
|
#Program Control
|
1046
1051
|
def main():
|
1047
1052
|
#Keep the parser on hand so I can print usage as needed.
|
@@ -1052,6 +1057,10 @@ def main():
|
|
1052
1057
|
print_resources()
|
1053
1058
|
quit()
|
1054
1059
|
|
1060
|
+
if options.version:
|
1061
|
+
print_version()
|
1062
|
+
quit()
|
1063
|
+
|
1055
1064
|
|
1056
1065
|
#Allows for the script to take no inputs and print help/usage
|
1057
1066
|
if len(sys.argv)==1:
|
data/utils/requirements.txt
CHANGED
@@ -12,11 +12,11 @@ Bedtools bedtools http://bedtools.readthedocs.org/en/latest/
|
|
12
12
|
Prodigal prodigal http://prodigal.ornl.gov
|
13
13
|
MCL mcl http://micans.org/mcl/
|
14
14
|
Barrnap barrnap http://www.vicbioinformatics.com/software.barrnap.shtml
|
15
|
-
IDBA (reads) idba_ud http://i.cs.hku.hk/~alse/hkubrg/projects/idba
|
16
15
|
FaQCs (reads) FaQCs https://github.com/LANL-Bioinformatics/FaQCs
|
17
16
|
Falco (reads) falco https://github.com/smithlabcode/falco
|
18
17
|
Seqtk (reads) seqtk https://github.com/lh3/seqtk
|
19
18
|
Fastp (reads) fastp https://github.com/OpenGene/fastp
|
19
|
+
SPADES (reads) spades.py https://ablab.github.io/spades/ Required version: 3+
|
20
20
|
Temurin (rdp) java https://adoptium.net/ Any Java VM would work
|
21
21
|
MyTaxa (mytaxa) MyTaxa http://enve-omics.ce.gatech.edu/mytaxa
|
22
22
|
Krona (mytaxa) ktImportText https://github.com/marbl/Krona/wiki
|
data/utils/subclade/pipeline.rb
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
module MiGA::SubcladeRunner::Pipeline
|
3
3
|
# Run species-level clusterings using ANI > 95% / AAI > 90%
|
4
4
|
def cluster_species
|
5
|
+
return unless opts[:indexing] == 'hierarchical'
|
5
6
|
tasks = {
|
6
7
|
ani95: [:ani_distances, opts[:gsp_ani], :ani],
|
7
8
|
aai90: [:aai_distances, opts[:gsp_aai], :aai]
|
@@ -69,6 +70,17 @@ module MiGA::SubcladeRunner::Pipeline
|
|
69
70
|
end
|
70
71
|
|
71
72
|
def subclades(metric)
|
73
|
+
case opts[:indexing]
|
74
|
+
when 'no'
|
75
|
+
# Do nothing
|
76
|
+
when 'gsearch'
|
77
|
+
subclades_gsearch(metric)
|
78
|
+
when 'hierarchical'
|
79
|
+
subclades_hierarchical(metric)
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
def subclades_hierarchical(metric)
|
72
84
|
src = File.expand_path('utils/subclades.R', MiGA::MiGA.root_path)
|
73
85
|
step = :"#{metric}_distances"
|
74
86
|
metric_res = project.result(step) or raise "Incomplete step #{step}"
|
@@ -82,6 +94,34 @@ module MiGA::SubcladeRunner::Pipeline
|
|
82
94
|
end
|
83
95
|
end
|
84
96
|
|
97
|
+
def subclades_gsearch(metric)
|
98
|
+
tmp_dir = tmp_file('genomes')
|
99
|
+
Dir.mkdir(tmp_dir)
|
100
|
+
|
101
|
+
cmd = %w[gsearch --pio 2000 --nbthreads] + [opts[:thr].to_s]
|
102
|
+
cmd += %w[tohnsw -k 16 -n 128 --ef 1600 --algo optdens]
|
103
|
+
cmd += %w[--scale_modify_f 0.25 -d] + [tmp_dir] # wrap the path: Array#+ only accepts another Array
|
104
|
+
|
105
|
+
if metric.to_sym == :ani
|
106
|
+
project.dataset_ref_active.each do |ds|
|
107
|
+
f = ds&.result(:assembly)&.file_path(:largecontigs) or next
|
108
|
+
FileUtils.ln_s(f, tmp_dir)
|
109
|
+
end
|
110
|
+
cmd += %w[-s 18000]
|
111
|
+
else
|
112
|
+
project.dataset_ref_active.each do |ds|
|
113
|
+
f = ds&.result(:cds)&.file_path(:proteins) or next
|
114
|
+
FileUtils.ln_s(f, tmp_dir)
|
115
|
+
end
|
116
|
+
cmd += %w[-s 12000 --aa]
|
117
|
+
end
|
118
|
+
|
119
|
+
Dir.mkdir('gsearch.d')
|
120
|
+
Dir.chdir('gsearch.d')
|
121
|
+
run_cmd(cmd)
|
122
|
+
Dir.chdir('..')
|
123
|
+
end
|
124
|
+
|
85
125
|
def compile
|
86
126
|
src = File.expand_path('utils/subclades-compile.rb', MiGA::MiGA.root_path)
|
87
127
|
run_cmd(['ruby', src, '.', 'miga-project.class'])
|
data/utils/subclade/runner.rb
CHANGED
@@ -14,20 +14,21 @@ class MiGA::SubcladeRunner
|
|
14
14
|
@step == :clade_finding ? '01.find.running' : '02.ani.running'
|
15
15
|
)
|
16
16
|
@opts[:thr] ||= ENV.fetch('CORES') { 2 }.to_i
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
@opts[:gsp_metric] = @project.option(:gsp_metric)
|
17
|
+
%i[run_clades gsp_ani gsp_aai gsp_metric indexing].each do |m|
|
18
|
+
@opts[m] = @project.option(m)
|
19
|
+
end
|
21
20
|
end
|
22
21
|
|
23
22
|
# Launch the appropriate analysis
|
24
23
|
def go!
|
25
24
|
return if project.type == :metagenomes
|
26
25
|
|
27
|
-
|
26
|
+
if @opts[:indexing] == 'no' ||
|
27
|
+
!@project.dataset_names.any? { |i| @project.dataset(i).ref? }
|
28
28
|
FileUtils.touch(File.join(@home, 'miga-project.empty'))
|
29
29
|
return
|
30
30
|
end
|
31
|
+
|
31
32
|
Dir.chdir home
|
32
33
|
Dir.mktmpdir do |tmp_dir|
|
33
34
|
@tmp = tmp_dir
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.22.6
|
4
|
+
version: 1.4.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-09-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: daemons
|
@@ -44,14 +44,14 @@ dependencies:
|
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
47
|
+
version: '2.7'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
54
|
+
version: '2.7'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: net-http
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -100,14 +100,14 @@ dependencies:
|
|
100
100
|
requirements:
|
101
101
|
- - "~>"
|
102
102
|
- !ruby/object:Gem::Version
|
103
|
-
version: '
|
103
|
+
version: '13.0'
|
104
104
|
type: :development
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
|
-
version: '
|
110
|
+
version: '13.0'
|
111
111
|
- !ruby/object:Gem::Dependency
|
112
112
|
name: test-unit
|
113
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -317,6 +317,7 @@ files:
|
|
317
317
|
- test/tax_index_test.rb
|
318
318
|
- test/taxonomy_test.rb
|
319
319
|
- test/test_helper.rb
|
320
|
+
- test/test_patch.rb
|
320
321
|
- test/with_daemon_test.rb
|
321
322
|
- test/with_option_test.rb
|
322
323
|
- utils/FastAAI/FastAAI-legacy/FastAAI
|
@@ -642,7 +643,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
642
643
|
requirements:
|
643
644
|
- - ">="
|
644
645
|
- !ruby/object:Gem::Version
|
645
|
-
version: '
|
646
|
+
version: '3.1'
|
646
647
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
647
648
|
requirements:
|
648
649
|
- - ">="
|