miga-base 0.7.26.3 → 1.0.0.sr1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
- data/lib/miga/cli/action/doctor.rb +50 -19
- data/lib/miga/cli/action/doctor/base.rb +20 -18
- data/lib/miga/cli/action/init.rb +11 -7
- data/lib/miga/cli/action/init/files_helper.rb +1 -0
- data/lib/miga/cli/action/ncbi_get.rb +3 -3
- data/lib/miga/cli/action/tax_dist.rb +2 -2
- data/lib/miga/cli/action/wf.rb +5 -4
- data/lib/miga/daemon.rb +11 -4
- data/lib/miga/dataset/result.rb +10 -6
- data/lib/miga/json.rb +1 -2
- data/lib/miga/metadata.rb +5 -1
- data/lib/miga/parallel.rb +11 -6
- data/lib/miga/project.rb +8 -8
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -2
- data/lib/miga/sqlite.rb +7 -0
- data/lib/miga/version.rb +23 -9
- data/scripts/aai_distances.bash +16 -18
- data/scripts/ani_distances.bash +16 -17
- data/scripts/assembly.bash +31 -16
- data/scripts/haai_distances.bash +3 -27
- data/scripts/miga.bash +6 -4
- data/scripts/p.bash +1 -1
- data/scripts/read_quality.bash +9 -18
- data/scripts/trimmed_fasta.bash +14 -30
- data/scripts/trimmed_reads.bash +36 -36
- data/test/parallel_test.rb +31 -0
- data/test/project_test.rb +2 -1
- data/utils/distance/commands.rb +1 -0
- data/utils/distance/runner.rb +2 -4
- data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
- data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
- data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
- data/utils/enveomics/Manifest/Tasks/other.json +77 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
- data/utils/enveomics/Manifest/categories.json +13 -4
- data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
- data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
- data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
- data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
- data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
- data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
- data/utils/enveomics/Scripts/SRA.download.bash +6 -8
- data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
- data/utils/enveomics/Scripts/aai.rb +3 -2
- data/utils/enveomics/Scripts/anir.rb +137 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
- data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
- data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
- data/utils/enveomics/Scripts/rbm.rb +87 -133
- data/utils/enveomics/Scripts/sam.filter.rb +148 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
- data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
- data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
- data/utils/enveomics/enveomics.R/R/utils.R +30 -0
- data/utils/enveomics/enveomics.R/README.md +1 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
- data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
- data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
- data/utils/multitrim/Multitrim How-To.pdf +0 -0
- data/utils/multitrim/README.md +67 -0
- data/utils/multitrim/multitrim.py +1555 -0
- data/utils/multitrim/multitrim.yml +13 -0
- data/utils/requirements.txt +4 -3
- metadata +33 -6
- data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
|
|
2
|
+
require 'enveomics_rb/enveomics'
|
|
3
|
+
require 'enveomics_rb/match'
|
|
4
|
+
use 'tmpdir'
|
|
5
|
+
use 'shellwords'
|
|
6
|
+
|
|
7
|
+
module Enveomics
  ##
  # Set of best matches between a query and a subject sequence file, keeping
  # at most one Enveomics::Match per query sequence. The search is executed
  # lazily with an external search engine the first time the set is accessed.
  class BMset
    # Column names requested from search engines that support custom tabular
    # output (blast+ and diamond)
    TABULAR_COLUMNS = %w[
      qseqid sseqid pident length mismatch gapopen qstart qend
      sstart send evalue bitscore qlen slen
    ].freeze

    # Paths to the query and subject sequence files.
    # #set and #opt are defined explicitly below.
    attr_reader :qry, :sbj

    ##
    # Initialize Enveomics::BMset object with sequence paths +qry+ and +sbj+,
    # and options Hash +opts+ (see #opt for supported options) with Symbol keys
    #
    # The +opts+ Hash is stored as passed (not copied), and #opt writes
    # defaults back into it
    def initialize(qry, sbj, opts = {})
      @qry = qry
      @sbj = sbj
      @set = nil
      @opt = opts
    end

    ##
    # Returns option with key +k+ as defined by #initialize or by default
    # Supported options include [defaults in brackets]:
    # - len [0]: Minimum alignment length in residues
    # - id [0.0]: Minimum alignment identity in percent
    # - fract [0.0]: Minimum alignment length as fraction of the query
    # - score [0.0]: Minimum alignment score in bits
    # - nucl [false]: The sequences are in nucleotides
    # - thr [1]: Number of threads to use
    # - bin ['']: Path to the directory containing binaries
    # - program [:blast+]: Search engine to use
    def opt(k)
      @defaults ||= {
        len: 0, id: 0.0, fract: 0.0, score: 0.0,
        nucl: false, thr: 1, bin: '', program: :'blast+'
      }
      k = k.to_sym
      # Only nil counts as "unset": an explicit +false+ must be preserved
      @opt[k] = @defaults[k] if @opt[k].nil?
      @opt[k]
    end

    ##
    # Hash of best matches with query sequence IDs as keys and
    # Enveomics::Match objects as values. Triggers the search on first use
    def set
      match_and_filter! if @set.nil?
      @set
    end

    ##
    # Returns the best match of query +qry+ as Enveomics::Match or nil if
    # no qualifying match was found
    def [](qry)
      set[qry]
    end

    ##
    # Number of matches found
    def count
      set.count
    end

    ##
    # Execute search and filter matches, keeping for each query only the
    # highest-scoring match that satisfies the +len+, +id+, +score+, and
    # +fract+ thresholds (see #opt)
    def match_and_filter!
      @set = {}
      match!.each do |match|
        # Already a better match?
        best = self[match.qry]
        next if best && best.score >= match.score

        # Is this a good enough match?
        good = %i[len id score fract].all? do |metric|
          match.send(metric) >= opt(metric)
        end
        next unless good

        # Save match
        @set[match.qry] = match
      end
    end

    ##
    # Find all matches and return as an array of Enveomics::Match objects
    #
    # Runs the search engine selected by +opt(:program)+ in a temporary
    # directory that is removed on return. Raises Enveomics::OptionError for
    # unsupported engines. If a command fails with Enveomics::CommandError,
    # the error and the captured error log are printed to STDERR and the
    # process exits with a non-zero status
    #
    # +say+ and +run_cmd+ are helpers not defined in this file (presumably
    # provided by enveomics_rb/utils -- verify)
    def match!
      y = []
      Dir.mktmpdir do |dir|
        # Determine commands
        say('Temporal directory: ', dir)
        out_path = File.join(dir, 'out.tsv')
        cmds = search_commands(dir, out_path)

        # Run commands
        say('Running comparison')
        say('Query: ', qry)
        say('Subject: ', sbj)
        cmd_err = File.join(dir, 'err')
        begin
          cmds.each do |cmd|
            cmd[0] = File.join(opt(:bin), cmd[0]) unless opt(:bin) == ''
            run_cmd(cmd, stderr: cmd_err)
          end
        rescue Enveomics::CommandError => e
          $stderr.puts e
          $stderr.puts ''
          $stderr.puts '[ Error log ]'
          # The log may not exist if the command failed before writing to it
          $stderr.puts File.read(cmd_err) if File.exist?(cmd_err)
          # A bare +exit+ would report success (status 0) to the caller
          exit 1
        end

        # Parse output
        File.open(out_path, 'r') do |fh|
          fh.each { |ln| y << Enveomics::Match.new(ln) }
        end
      end
      y
    end

    ##
    # Enumerate best matches and yield each Enveomics::Match to +blk+.
    # Returns an Enumerator if no block is given
    def each(&blk)
      return to_enum(:each) unless block_given?

      set.each { |_, bm| blk.call(bm) }
    end

    private

    ##
    # Returns the commands (each an Array of String) required to search #qry
    # against #sbj with the engine +opt(:program)+, using +dir+ for
    # intermediate files and saving the tabular output in +out_path+
    def search_commands(dir, out_path)
      db_path = File.join(dir, 'sbj.db')
      threads = opt(:thr).to_s

      case opt(:program)
      when :blast
        [
          [
            'formatdb', '-i', sbj, '-n', db_path, '-l', File.join(dir, 'log'),
            '-p', opt(:nucl) ? 'F' : 'T'
          ],
          [
            'blastall', '-p', opt(:nucl) ? 'blastn' : 'blastp', '-d', db_path,
            '-i', qry, '-v', '1', '-b', '1', '-a', threads, '-m', '8',
            '-o', out_path
          ]
        ]
      when :'blast+'
        [
          [
            'makeblastdb', '-in', sbj, '-out', db_path,
            '-dbtype', opt(:nucl) ? 'nucl' : 'prot'
          ],
          [
            opt(:nucl) ? 'blastn' : 'blastp', '-db', db_path, '-query', qry,
            '-num_threads', threads, '-out', out_path, '-outfmt',
            "6 #{TABULAR_COLUMNS.join(' ')}"
          ]
        ]
      when :diamond
        if opt(:nucl)
          raise Enveomics::OptionError,
                'Unsupported search engine diamond for nucleotides'
        end

        daa_path = "#{out_path}.daa"
        [
          ['diamond', 'makedb', '--in', sbj, '--db', db_path,
           '--threads', threads],
          ['diamond', 'blastp', '--threads', threads, '--db', db_path,
           '--query', qry, '--daa', daa_path, '--quiet', '--sensitive'],
          ['diamond', 'view', '--daa', daa_path, '--out', out_path,
           '--quiet', '--outfmt', '6', *TABULAR_COLUMNS]
        ]
      when :blat
        cmd = ['blat', sbj, qry, '-out=blast8', out_path]
        cmd << '-prot' unless opt(:nucl)
        [cmd]
      else
        raise Enveomics::OptionError,
              "Unsupported search engine: #{opt(:program)}"
      end
    end
  end
end
|
|
@@ -1,24 +1,24 @@
|
|
|
1
1
|
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
#
|
|
2
|
+
require 'enveomics_rb/utils'
|
|
3
|
+
use 'optparse'
|
|
4
|
+
ARGV << '-h' if ARGV.empty?
|
|
6
5
|
|
|
7
|
-
|
|
8
|
-
|
|
6
|
+
module Enveomics
|
|
7
|
+
class << self
|
|
8
|
+
def opt_banner(opt, banner, usage = nil)
|
|
9
|
+
opt.version ||= $VERSION
|
|
10
|
+
usage ||= "#{opt.program_name}.rb [options]"
|
|
11
|
+
opt.banner = <<~BANNER
|
|
9
12
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
13
|
+
[Enveomics Collection: #{opt.program_name} #{opt.version}]
|
|
14
|
+
|
|
15
|
+
#{banner}
|
|
16
|
+
|
|
17
|
+
Usage
|
|
18
|
+
#{usage}
|
|
19
|
+
|
|
20
|
+
BANNER
|
|
16
21
|
end
|
|
17
|
-
return true
|
|
18
|
-
rescue LoadError
|
|
19
|
-
abort "\nUnmet requirements, please install required gems:" +
|
|
20
|
-
gems.map{ |gem| "\n gem install #{gem}" }.join + "\n\n" if mandatory
|
|
21
|
-
return false
|
|
22
22
|
end
|
|
23
23
|
end
|
|
24
24
|
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
|
|
2
|
+
require 'enveomics_rb/stats/sample'
|
|
3
|
+
|
|
4
|
+
module Enveomics
  # Calculate Gaussian Mixture Models by Expectation Maximization
  #
  # Only the constructor is visible in this class: it stores the sample, the
  # number of components, and the options with defaults filled in
  class GmmEm
    # Enveomics::Stats::Sample built from the input values
    attr_reader :sample
    # Number of gaussian components requested
    attr_reader :components
    # Options Hash (see #initialize)
    attr_reader :opts

    # Initialize Enveomics::GmmEm object from numeric array +x+, +components+
    # gaussian components (an Integer), and options hash +opts+ with supported
    # Symbol keys:
    # - ll_delta_convergence: Maximum change in LL to consider convergence
    #   (by default: 1e-15)
    # - max_iter: Maximum number of EM iterations (by default: 1_000)
    # - init_mu: Initial components means as numeric array
    # - init_sigma: Initial components standard deviation as numeric array
    # - init_alpha: Initial components fractions as numeric array adding up to 1
    #
    # The +opts+ Hash is stored as passed (not copied) and the defaults above
    # are written into it
    def initialize(x, components = 2, opts = {})
      @sample = Enveomics::Stats::Sample.new(x)
      @components = components
      @opts = opts
      @opts[:ll_delta_convergence] ||= 1e-15
      @opts[:max_iter] ||= 1_000
    end
  end
end
|
|
30
|
+
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
|
|
2
|
+
module Enveomics
  ##
  # A simple object representing a sequence match from a search engine
  # supporting tabular BLAST output
  #
  # Column indexes follow the 0-based order: qseqid, sseqid, pident, length,
  # mismatch, gapopen, qstart, qend, sstart, send, evalue, bitscore, and
  # (when available) qlen, slen
  class Match
    # Array of String with the tab-delimited fields of the match
    attr_reader :row

    ##
    # Initialize Enveomics::Match object from a tabular blast line String +ln+
    def initialize(ln)
      @row = ln.chomp.split("\t")
    end

    ##
    # Query sequence ID
    def qry
      row[0]
    end

    ##
    # Subject sequence ID
    def sbj
      row[1]
    end

    ##
    # Alignment identity in percent (Float)
    def id
      @id ||= row[2].to_f
    end

    ##
    # Alignment length (Integer)
    def len
      @len ||= row[3].to_i
    end

    ##
    # E-value of the match (Float), 11th column
    def evalue
      @evalue ||= row[10].to_f
    end

    ##
    # Bit score of the match (Float), 12th column
    def score
      @score ||= row[11].to_f
    end

    ##
    # Length of the query sequence (Integer), or 0 if the column is missing
    def qry_len
      @qry_len ||= row[12].to_i
    end

    ##
    # Length of the subject sequence (Integer), or 0 if the column is missing
    def sbj_len
      @sbj_len ||= row[13].to_i
    end

    ##
    # Alignment length as a fraction of the query length (Float), or 0.0 if
    # the query length is unknown (zero)
    def qry_fract
      return 0.0 if qry_len.zero?

      @qry_fract ||= len.to_f / qry_len
    end

    alias fract qry_fract

    ##
    # Alignment length as a fraction of the subject length (Float), or 0.0 if
    # the subject length is unknown (zero)
    def sbj_fract
      return 0.0 if sbj_len.zero?

      # Cached separately from #qry_fract so the two values cannot collide
      @sbj_fract ||= len.to_f / sbj_len
    end

    ##
    # The match as a tab-delimited String
    def to_s
      row.join("\t")
    end
  end
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
require 'enveomics_rb/bm_set'
|
|
2
|
+
|
|
3
|
+
module Enveomics
  ##
  # Reciprocal best matches between two sequence files, built from the two
  # directional Enveomics::BMset searches
  class RBM
    # Paths to the two sequence files
    attr_reader :seq1, :seq2
    # Best-match sets: +bms1+ uses +seq1+ as query, +bms2+ uses +seq2+
    attr_reader :bms1, :bms2

    ##
    # Build the object from the sequence paths +seq1+ and +seq2+. The Hash
    # +bm_opts+ is handed to both Enveomics::BMset objects
    def initialize(seq1, seq2, bm_opts = {})
      @seq1, @seq2 = seq1, seq2
      @bms1 = Enveomics::BMset.new(seq1, seq2, bm_opts)
      @bms2 = Enveomics::BMset.new(seq2, seq1, bm_opts)
      @set = nil
    end

    ##
    # Array of reciprocal best Enveomics::Match objects, computed on first
    # access and cached afterwards
    def set
      @set ||= reciprocate!
    end

    ##
    # How many reciprocal best matches were found
    def count
      set.count
    end

    ##
    # Select the matches of +bms1+ whose subject has, in +bms2+, a best match
    # pointing back to the same query
    def reciprocate!
      bms1.each.select do |forward|
        backward = bms2[forward.sbj]
        backward && forward.qry == backward.sbj
      end
    end

    ##
    # Yield every reciprocal best match to +blk+, or return an Enumerator
    # when called without a block
    def each(&blk)
      return to_enum(:each) unless block_given?

      set.each { |rbm| blk.call(rbm) }
    end
  end
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
|
|
2
|
+
module Enveomics
  module Stats
    # Draws one random number from the distribution named +dist+, forwarding
    # +params+ to the matching r_* generator defined below (for example,
    # +:geom+ dispatches to r_geom). Defaults to the uniform distribution.
    def self.rand(dist = :unif, *params)
      __send__(:"r_#{dist}", *params)
    end

    # Draws one random number from the uniform distribution between +min+
    # and +max+ (0.0 and 1.0 unless specified).
    def self.r_unif(min = 0.0, max = 1.0)
      width = max - min
      min + width * Random.rand
    end

    # Draws one random number from the geometric distribution with support
    # {0, 1, 2, ...} and probability of success +p+.
    def self.r_geom(p)
      numerator = Math.log(1.0 - rand)
      denominator = Math.log(1.0 - p)
      (numerator / denominator - 1.0).ceil
    end

    # Draws one random number from the shifted geometric distribution with
    # support {1, 2, 3, ...} and probability of success +p+.
    def self.r_sgeom(p)
      numerator = Math.log(1.0 - rand)
      denominator = Math.log(1.0 - p)
      (numerator / denominator).ceil
    end
  end
end
|
|
31
|
+
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
|
|
2
|
+
module Enveomics
  module Stats
    # Descriptive statistics for a given sample
    class Sample
      # Sample values as an Array of Float
      attr_reader :x
      # Options Hash (see #initialize)
      attr_reader :opts

      # Initialize Enveomics::Stats::Sample with numeric vector +x+ and options
      # Hash +opts+ supporting the keys:
      # - +effective_range+: Range where values fall (by default: range of +x+)
      # - +histo_bin_size+: Width of histogram bins
      #   (by default: 1/50th of +effective_range+)
      #
      # Raises RuntimeError if +x+ is empty. The +opts+ Hash is stored as
      # passed (not copied) and defaults are written into it when first used
      def initialize(x, opts = {})
        raise 'Cannot initialize an empty sample' if x.empty?

        @x = x.map(&:to_f)
        @opts = opts
      end

      # Size of the sample
      def n
        x.size
      end

      # Estimates the sample mean
      def mean
        @mean ||= x.inject(:+) / n
      end

      # Estimates the mean of the square of the sample
      def square_mean
        @square_mean ||= x.map { |i| i**2 }.inject(:+) / n
      end

      # Estimates the unbiased sample variance
      def var
        @var ||= (square_mean - mean**2) * n / (n - 1)
      end

      # Estimates the unbiased sample standard deviation
      def sd
        @sd ||= var**0.5
      end

      # --- Higher moments ---

      # Estimate sample skewness, defined as 0.0 for a single observation
      def skewness
        return 0.0 if n == 1

        cubed_dev = x.inject(0.0) { |sum, i| sum + (i - mean)**3 }
        cubed_dev / ((n - 1) * (sd**3))
      end

      # Estimate sample kurtosis, defined as 0.0 for a single observation
      #
      # NOTE(review): the original comment called this the "excess" kurtosis,
      # but no 3 is subtracted below. Confirm which definition is intended,
      # in particular for its use in #sarle_bimodality
      def kurtosis
        return 0.0 if n == 1

        quart_dev = x.inject(0.0) { |sum, i| sum + (i - mean)**4 }
        quart_dev / ((n - 1) * (sd**4))
      end

      # --- Ranges ---

      # Range effectively considered, as a two-entry Array [min, max].
      # Missing entries are filled in with the minimum and maximum of the
      # sample and stored back in the options
      def effective_range
        @opts[:effective_range] ||= [nil, nil]
        @opts[:effective_range][0] ||= x.min
        @opts[:effective_range][1] ||= x.max
        @opts[:effective_range]
      end

      # Size of the effective range
      def effective_range_size
        effective_range[1] - effective_range[0]
      end

      # --- Histograms ---

      # Size of each histogram bin
      def histo_bin_size
        @opts[:histo_bin_size] ||= effective_range_size / 50.0
      end

      # Calculate histogram ranges without checking for cached value
      #
      # Use #histo_ranges instead
      def calculate_histo_ranges
        # Bins are built from the top of the effective range downwards until
        # the lower bound of the last bin reaches the bottom of the range
        rng = [[effective_range[1], effective_range[1] - histo_bin_size]]
        while rng.last[1] > effective_range[0]
          rng << [rng.last[1], rng.last[1] - histo_bin_size]
        end
        rng
      end

      # Histogram ranges as an array of two-entry arrays where the first entry
      # is the closed-ended maximum value (inclusive) of the range and the
      # second entry is the open-ended minimum value (non-inclusive) of the
      # range. The array is sorted from maximum to minimum
      #
      # Something like: +[[100.0, 99.0], [99.0, 98.0], ...]+, representing the
      # ranges: {[100, 99), [99, 98), ...}
      #
      # The bin width is determined by #histo_bin_size
      def histo_ranges
        @histo_ranges ||= calculate_histo_ranges
      end

      # Mid-points of the histogram ranges from #histo_ranges, returns
      # an array of Float
      def histo_mids
        @histo_mids ||= histo_ranges.map { |range| (range[0] + range[1]) / 2 }
      end

      # Calculate the histogram counts without checking cached value
      #
      # Use #histo_counts instead
      #
      # NOTE(review): because the lower bound of each bin is exclusive, values
      # exactly equal to the lower bound of the last bin are not counted in
      # any bin. Confirm whether the minimum should be included
      def calculate_histo_counts
        counts = []
        remaining = x.dup
        histo_ranges.each do |range|
          # Count (and remove) the values above the lower bound of this bin.
          # The size must be read before delete_if mutates the array
          before = remaining.size
          counts << before - remaining.delete_if { |j| j > range[1] }.size
        end
        counts
      end

      # Histogram counts in the ranges determined by #histo_ranges
      def histo_counts
        @histo_counts ||= calculate_histo_counts
      end

      # --- Bimodality coefficients ---

      # Sarle's sample bimodality coefficient b
      #
      # Raises ZeroDivisionError for samples of size 2 or 3, where the
      # finite-sample correction term is undefined
      def sarle_bimodality
        denominator = (n - 2) * (n - 3)
        if denominator.zero?
          raise ZeroDivisionError,
                "Sarle's bimodality is undefined for samples of size #{n}"
        end

        # The correction must be a Float: Integer division would truncate it
        correction = 3.0 * ((n - 1)**2) / denominator
        (skewness**2 + 1) / (kurtosis + correction)
      end

      # de Michele & Accantino (2014) B index
      # DOI: 10.1371%2Fjournal.pone.0091195
      def dma_bimodality
        (mean - dma_mu_M).abs
      end

      # µ_M index proposed by Michele & Accantino (2014)
      # DOI: 10.1371%2Fjournal.pone.0091195
      def dma_mu_M
        weighted = histo_counts.each_with_index.map { |m, k| m * histo_mids[k] }
        weighted.inject(:+) / n
      end
    end
  end
end
|
|
152
|
+
|