miga-base 0.7.26.3 → 1.0.0.sr1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
- data/lib/miga/cli/action/doctor.rb +50 -19
- data/lib/miga/cli/action/doctor/base.rb +20 -18
- data/lib/miga/cli/action/init.rb +11 -7
- data/lib/miga/cli/action/init/files_helper.rb +1 -0
- data/lib/miga/cli/action/ncbi_get.rb +3 -3
- data/lib/miga/cli/action/tax_dist.rb +2 -2
- data/lib/miga/cli/action/wf.rb +5 -4
- data/lib/miga/daemon.rb +11 -4
- data/lib/miga/dataset/result.rb +10 -6
- data/lib/miga/json.rb +1 -2
- data/lib/miga/metadata.rb +5 -1
- data/lib/miga/parallel.rb +11 -6
- data/lib/miga/project.rb +8 -8
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -2
- data/lib/miga/sqlite.rb +7 -0
- data/lib/miga/version.rb +23 -9
- data/scripts/aai_distances.bash +16 -18
- data/scripts/ani_distances.bash +16 -17
- data/scripts/assembly.bash +31 -16
- data/scripts/haai_distances.bash +3 -27
- data/scripts/miga.bash +6 -4
- data/scripts/p.bash +1 -1
- data/scripts/read_quality.bash +9 -18
- data/scripts/trimmed_fasta.bash +14 -30
- data/scripts/trimmed_reads.bash +36 -36
- data/test/parallel_test.rb +31 -0
- data/test/project_test.rb +2 -1
- data/utils/distance/commands.rb +1 -0
- data/utils/distance/runner.rb +2 -4
- data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
- data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
- data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
- data/utils/enveomics/Manifest/Tasks/other.json +77 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
- data/utils/enveomics/Manifest/categories.json +13 -4
- data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
- data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
- data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
- data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
- data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
- data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
- data/utils/enveomics/Scripts/SRA.download.bash +6 -8
- data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
- data/utils/enveomics/Scripts/aai.rb +3 -2
- data/utils/enveomics/Scripts/anir.rb +137 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
- data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
- data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
- data/utils/enveomics/Scripts/rbm.rb +87 -133
- data/utils/enveomics/Scripts/sam.filter.rb +148 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
- data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
- data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
- data/utils/enveomics/enveomics.R/R/utils.R +30 -0
- data/utils/enveomics/enveomics.R/README.md +1 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
- data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
- data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
- data/utils/multitrim/Multitrim How-To.pdf +0 -0
- data/utils/multitrim/README.md +67 -0
- data/utils/multitrim/multitrim.py +1555 -0
- data/utils/multitrim/multitrim.yml +13 -0
- data/utils/requirements.txt +4 -3
- metadata +33 -6
- data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30
@@ -0,0 +1,175 @@
|
|
1
|
+
|
2
|
+
require 'enveomics_rb/enveomics'
|
3
|
+
require 'enveomics_rb/match'
|
4
|
+
use 'tmpdir'
|
5
|
+
use 'shellwords'
|
6
|
+
|
7
|
+
module Enveomics
  ##
  # Set of best matches of the sequences in a query file against the
  # sequences in a subject file, obtained by running an external search engine
  class BMset
    attr_reader :qry, :sbj

    ##
    # Initialize Enveomics::BMset object with sequence paths +qry+ and +sbj+,
    # and options Hash +opts+ (see #opt for supported options) with Symbol keys
    def initialize(qry, sbj, opts = {})
      @qry = qry
      @sbj = sbj
      @set = nil
      @opt = opts
    end

    ##
    # Returns option with key +k+ as defined by #initialize or by default.
    # The default is stored in the options Hash the first time it is used.
    # Supported options include [defaults in brackets]:
    # - len [0]: Minimum alignment length in residues
    # - id [0.0]: Minimum alignment identity in percent
    # - fract [0.0]: Minimum alignment length as fraction of the query
    # - score [0.0]: Minimum alignment score in bits
    # - nucl [false]: The sequences are in nucleotides
    # - thr [1]: Number of threads to use
    # - bin ['']: Path to the directory containing binaries
    # - program [:blast+]: Search engine to use
    def opt(k)
      @defaults ||= {
        len: 0, id: 0.0, fract: 0.0, score: 0.0,
        nucl: false, thr: 1, bin: '', program: :'blast+'
      }
      k = k.to_sym
      @opt[k] = @defaults[k] if @opt[k].nil?
      @opt[k]
    end

    ##
    # Hash of Enveomics::Match objects keyed by query identifier; the search
    # is executed on first access
    def set
      match_and_filter! if @set.nil?
      @set
    end

    ##
    # Returns the best match of query +qry+ as Enveomics::Match or nil if
    # no qualifying match was found
    def [](qry)
      set[qry]
    end

    ##
    # Number of matches found
    def count
      set.count
    end

    ##
    # Execute search and filter matches, keeping for each query only the
    # highest-scoring match that satisfies the len, id, score, and fract
    # thresholds
    def match_and_filter!
      @set = {}
      match!.each do |match|
        # Already a better match?
        best = @set[match.qry]
        next if best && best.score >= match.score

        # Is this a good enough match?
        good_enough = %i[len id score fract].all? do |metric|
          match.send(metric) >= opt(metric)
        end
        next unless good_enough

        # Save match
        @set[match.qry] = match
      end
    end

    ##
    # Find all matches and return as an array of Enveomics::Match objects.
    # Raises Enveomics::OptionError if the search engine is not supported.
    def match!
      y = []
      Dir.mktmpdir do |dir|
        # Determine commands
        say('Temporal directory: ', dir)
        out_path = File.join(dir, 'out.tsv')
        cmds = search_commands(dir, out_path)

        # Run commands
        say('Running comparison')
        say('Query: ', qry)
        say('Subject: ', sbj)
        cmd_err = File.join(dir, 'err')
        begin
          cmds.each do |cmd|
            cmd[0] = File.join(opt(:bin), cmd[0]) unless opt(:bin) == ''
            run_cmd(cmd, stderr: cmd_err)
          end
        rescue Enveomics::CommandError => e
          $stderr.puts e
          $stderr.puts ''
          $stderr.puts '[ Error log ]'
          # The log may not exist if the command failed before writing to it
          $stderr.puts File.read(cmd_err) if File.exist?(cmd_err)
          # NOTE(review): +exit+ without arguments reports success to the
          # calling process -- confirm that this is intended
          exit
        end

        # Parse output
        File.open(out_path, 'r') do |fh|
          fh.each { |ln| y << Enveomics::Match.new(ln) }
        end
      end
      y
    end

    ##
    # Enumerate best matches and yield +blk+ with each Enveomics::Match, or
    # return an Enumerator if no block is given
    def each(&blk)
      return to_enum(:each) unless block_given?

      set.each_value { |bm| blk.call(bm) }
    end

    private

    ##
    # Returns the commands (each an Array of String arguments) that build the
    # subject database in +dir+ and run the search saving the tabular output
    # in +out_path+, for the search engine selected by the program option
    def search_commands(dir, out_path)
      db_path = File.join(dir, 'sbj.db')
      cmds = []
      case opt(:program).to_sym
      when :blast
        cmds << [
          'formatdb', '-i', sbj, '-n', db_path, '-l', File.join(dir, 'log'),
          '-p', opt(:nucl) ? 'F' : 'T'
        ]
        cmds << [
          'blastall', '-p', opt(:nucl) ? 'blastn' : 'blastp', '-d', db_path,
          '-i', qry, '-v', '1', '-b', '1', '-a', opt(:thr).to_s, '-m', '8',
          '-o', out_path
        ]
      when :'blast+'
        cmds << [
          'makeblastdb', '-in', sbj, '-out', db_path,
          '-dbtype', opt(:nucl) ? 'nucl' : 'prot'
        ]
        cmds << [
          opt(:nucl) ? 'blastn' : 'blastp', '-db', db_path, '-query', qry,
          '-num_threads', opt(:thr).to_s, '-out', out_path, '-outfmt',
          '6 qseqid sseqid pident length mismatch gapopen qstart qend ' \
            'sstart send evalue bitscore qlen slen'
        ]
      when :diamond
        raise Enveomics::OptionError.new(
          'Unsupported search engine diamond for nucleotides'
        ) if opt(:nucl)
        cmds << [
          'diamond', 'makedb', '--in', sbj, '--db', db_path,
          '--threads', opt(:thr).to_s
        ]
        cmds << [
          'diamond', 'blastp', '--threads', opt(:thr).to_s,
          '--db', db_path, '--query', qry, '--daa', "#{out_path}.daa",
          '--quiet', '--sensitive'
        ]
        cmds << [
          'diamond', 'view', '--daa', "#{out_path}.daa", '--out', out_path,
          '--quiet', '--outfmt'
        ] + %w[6 qseqid sseqid pident length mismatch gapopen qstart] +
          %w[qend sstart send evalue bitscore qlen slen]
      when :blat
        cmds << ['blat', sbj, qry, '-out=blast8', out_path]
        cmds[0] << '-prot' unless opt(:nucl)
      else
        raise Enveomics::OptionError.new(
          "Unsupported search engine: #{opt(:program)}"
        )
      end
      cmds
    end
  end
end
|
@@ -1,24 +1,24 @@
|
|
1
1
|
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
#
|
2
|
+
require 'enveomics_rb/utils'
|
3
|
+
use 'optparse'
|
4
|
+
# When the script is called without arguments, append '-h' to ARGV
# (presumably so the option parser prints the help message -- TODO confirm)
ARGV << '-h' if ARGV.empty?
|
6
5
|
|
7
|
-
|
8
|
-
|
6
|
+
module Enveomics
|
7
|
+
class << self
|
8
|
+
##
# Sets the banner of +opt+ to a text composed of a header line with
# +opt.program_name+ and +opt.version+, the description +banner+, and a
# "Usage" section showing +usage+
# - +opt.version+ is set to the global $VERSION unless it already has a value
# - +usage+ defaults to "<program_name>.rb [options]" when not provided
# NOTE(review): +opt+ is presumably an OptionParser object (this file loads
# 'optparse') -- confirm against callers
def opt_banner(opt, banner, usage = nil)
|
9
|
+
opt.version ||= $VERSION
|
10
|
+
usage ||= "#{opt.program_name}.rb [options]"
|
11
|
+
opt.banner = <<~BANNER
|
9
12
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
[Enveomics Collection: #{opt.program_name} #{opt.version}]
|
14
|
+
|
15
|
+
#{banner}
|
16
|
+
|
17
|
+
Usage
|
18
|
+
#{usage}
|
19
|
+
|
20
|
+
BANNER
|
16
21
|
end
|
17
|
-
return true
|
18
|
-
rescue LoadError
|
19
|
-
abort "\nUnmet requirements, please install required gems:" +
|
20
|
-
gems.map{ |gem| "\n gem install #{gem}" }.join + "\n\n" if mandatory
|
21
|
-
return false
|
22
22
|
end
|
23
23
|
end
|
24
24
|
|
@@ -0,0 +1,30 @@
|
|
1
|
+
|
2
|
+
require 'enveomics_rb/stats/sample'
|
3
|
+
|
4
|
+
module Enveomics
  # Calculate Gaussian Mixture Models by Expectation Maximization
  class GmmEm
    attr_reader :sample, :components, :opts

    # Initialize Enveomics::GmmEm object from numeric array +x+, +components+
    # gaussian components (an Integer), and options hash +opts+ with supported
    # Symbol keys:
    # - ll_delta_convergence: Maximum change in LL to consider convergence
    #   (by default: 1e-15). The spelling +ll_delta_converge+ is also
    #   accepted as a fallback, since it was the documented name
    # - max_iter: Maximum number of EM iterations (by default: 1_000)
    # - init_mu: Initial components means as numeric array
    # - init_sigma: Initial components standard deviation as numeric array
    # - init_alpha: Initial components fractions as numeric array adding up to 1
    #
    # Default values are stored in the +opts+ Hash received.
    def initialize(x, components = 2, opts = {})
      @sample = Enveomics::Stats::Sample.new(x)
      @components = components
      @opts = opts
      @opts[:ll_delta_convergence] ||= @opts[:ll_delta_converge] || 1e-15
      @opts[:max_iter] ||= 1_000
    end
  end
end
|
30
|
+
|
@@ -0,0 +1,63 @@
|
|
1
|
+
|
2
|
+
module Enveomics
  ##
  # A simple object representing a sequence match from a search engine
  # supporting tabular BLAST output. Columns are read by position in the
  # order: qseqid, sseqid, pident, length, mismatch, gapopen, qstart, qend,
  # sstart, send, evalue, bitscore, qlen, slen (the last two are optional)
  class Match
    attr_reader :row

    ##
    # Initialize Enveomics::Match object from a tabular blast line String +ln+
    def initialize(ln)
      @row = ln.chomp.split("\t")
    end

    ##
    # Identifier of the query sequence
    def qry
      row[0]
    end

    ##
    # Identifier of the subject sequence
    def sbj
      row[1]
    end

    ##
    # Alignment identity as Float, as reported in the third column
    def id
      @id ||= row[2].to_f
    end

    ##
    # Alignment length as Integer
    def len
      @len ||= row[3].to_i
    end

    ##
    # E-value of the match as Float (eleventh column)
    def evalue
      @evalue ||= row[10].to_f
    end

    ##
    # Bit score of the match as Float (twelfth column)
    def score
      @score ||= row[11].to_f
    end

    ##
    # Length of the query sequence as Integer, or 0 if the column is absent
    def qry_len
      @qry_len ||= row[12].to_i
    end

    ##
    # Length of the subject sequence as Integer, or 0 if the column is absent
    def sbj_len
      @sbj_len ||= row[13].to_i
    end

    ##
    # Alignment length as a fraction of the query length, or 0.0 if the query
    # length is zero or unavailable
    def qry_fract
      return 0.0 if qry_len.zero?

      @qry_fract ||= len.to_f / qry_len
    end

    alias fract qry_fract

    ##
    # Alignment length as a fraction of the subject length, or 0.0 if the
    # subject length is zero or unavailable
    def sbj_fract
      return 0.0 if sbj_len.zero?

      @sbj_fract ||= len.to_f / sbj_len
    end

    ##
    # Tab-delimited representation of the match
    def to_s
      row.join("\t")
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'enveomics_rb/bm_set'
|
2
|
+
|
3
|
+
module Enveomics
  ##
  # Reciprocal best matches between two sets of sequences, derived from the
  # best matches found in both directions
  class RBM
    attr_reader :seq1, :seq2, :bms1, :bms2

    ##
    # Create an RBM object for the sequence paths +seq1+ and +seq2+. The Hash
    # +bm_opts+ is passed as options to both Enveomics::BMset objects
    def initialize(seq1, seq2, bm_opts = {})
      @seq1, @seq2 = seq1, seq2
      @bms1 = Enveomics::BMset.new(seq1, seq2, bm_opts)
      @bms2 = Enveomics::BMset.new(seq2, seq1, bm_opts)
      @set = nil
    end

    ##
    # Array of Reciprocal Best Enveomics::Match objects, calculated on first
    # access
    def set
      @set = reciprocate! if @set.nil?
      @set
    end

    ##
    # How many reciprocal best matches were found
    def count
      set.length
    end

    ##
    # Select the matches in +bms1+ whose subject has a best match in +bms2+
    # pointing back to the same query, and return them as an Array
    def reciprocate!
      bms1.each.select do |forward|
        reverse = bms2[forward.sbj]
        reverse && forward.qry == reverse.sbj
      end
    end

    ##
    # Yield each reciprocal best match to +blk+, or return an Enumerator
    # if no block is given
    def each(&blk)
      return to_enum(:each) unless block_given?

      set.each(&blk)
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
|
2
|
+
module Enveomics
  module Stats
    class << self
      # Draws a random number from the distribution named +dist+ using the
      # parameters +params+, by dispatching to the matching r_* method below.
      def rand(dist = :unif, *params)
        generator = "r_#{dist}"
        send(generator, *params)
      end

      # Draws a random number from the uniform distribution between +min+
      # and +max+. Without arguments, the number is between 0.0 and 1.0.
      def r_unif(min = 0.0, max = 1.0)
        span = max - min
        min + span * Random.rand
      end

      # Draws a random number from the geometric distribution with support
      # {0, 1, 2, ...} and probability of success +p+.
      def r_geom(p)
        ratio = Math.log(1.0 - rand) / Math.log(1.0 - p)
        (ratio - 1.0).ceil
      end

      # Draws a random number from the shifted geometric distribution with
      # support {1, 2, 3, ...} and probability of success +p+.
      def r_sgeom(p)
        ratio = Math.log(1.0 - rand) / Math.log(1.0 - p)
        ratio.ceil
      end
    end
  end
end
|
31
|
+
|
@@ -0,0 +1,152 @@
|
|
1
|
+
|
2
|
+
module Enveomics
  module Stats
    # Descriptive statistics for a given sample
    class Sample
      attr_reader :x, :opts

      # Initialize Enveomics::Stats::Sample with numeric vector +x+ and options
      # Hash +opts+ supporting the keys:
      # - +effective_range+: Range where values fall (by default: range of +x+)
      # - +histo_bin_size+: Width of histogram bins
      #   (by default: 1/50th of +effective_range+)
      #
      # Raises a RuntimeError if +x+ is empty. Values are converted to Float.
      def initialize(x, opts = {})
        raise 'Cannot initialize an empty sample' if x.empty?

        @x = x.map(&:to_f)
        @opts = opts
      end

      # Size of the sample
      def n
        x.size
      end

      # Estimates the sample mean
      def mean
        @mean ||= x.inject(:+) / n
      end

      # Estimates the mean of the square of the sample
      def square_mean
        @square_mean ||= x.map { |i| i**2 }.inject(:+) / n
      end

      # Estimates the unbiased sample variance
      def var
        @var ||= (square_mean - mean**2) * n / (n - 1)
      end

      # Estimates the sample standard deviation as the square root of #var
      def sd
        @sd ||= var**0.5
      end

      # --- Higher moments ---

      # Estimate sample skewness, or 0.0 for samples with a single value
      def skewness
        return 0.0 if n == 1

        cubed_dev = x.inject(0.0) { |sum, i| sum + (i - mean)**3 }
        cubed_dev / ((n - 1) * (sd**3))
      end

      # Estimate sample kurtosis, or 0.0 for samples with a single value
      #
      # NOTE(review): this was documented as excess kurtosis, but no 3 is
      # subtracted from the standardized fourth moment -- confirm intent
      def kurtosis
        return 0.0 if n == 1

        quart_dev = x.inject(0.0) { |sum, i| sum + (i - mean)**4 }
        quart_dev / ((n - 1) * (sd**4))
      end

      # --- Ranges ---

      # Range effectively considered, as an Array with minimum and maximum.
      # Missing limits are set from the sample and saved in the options
      def effective_range
        @opts[:effective_range] ||= [nil, nil]
        @opts[:effective_range][0] ||= x.min
        @opts[:effective_range][1] ||= x.max
        @opts[:effective_range]
      end

      # Size of the effective range
      def effective_range_size
        effective_range[1] - effective_range[0]
      end

      # --- Histograms ---

      # Size of each histogram bin
      def histo_bin_size
        @opts[:histo_bin_size] ||= effective_range_size / 50.0
      end

      # Calculate histogram ranges without checking for cached value
      #
      # Use #histo_ranges instead
      def calculate_histo_ranges
        rng = [[effective_range[1], effective_range[1] - histo_bin_size]]
        # Add bins downwards until the lower limit of the range is covered
        while rng.last[1] > effective_range[0]
          rng << [rng.last[1], rng.last[1] - histo_bin_size]
        end
        rng
      end

      # Histogram ranges as an array of two-entry arrays where the first entry
      # is the closed-ended maximum value (inclusive) of the range and the
      # second entry is the open-ended minimum value (non-inclusive) of the
      # range. The array is sorted from maximum to minimum
      #
      # Something like: +[[100.0, 99.0], [99.0, 98.0], ...]+, representing the
      # ranges: {[100, 99), [99, 98), ...}
      #
      # The bin width is determined by #histo_bin_size
      def histo_ranges
        @histo_ranges ||= calculate_histo_ranges
      end

      # Mid-points of the histogram ranges from #histo_ranges, returns
      # an array of Float
      def histo_mids
        @histo_mids ||= histo_ranges.map { |rng| (rng[0] + rng[1]) / 2 }
      end

      # Calculate the histogram counts without checking cached value
      #
      # Use #histo_counts instead
      def calculate_histo_counts
        remaining = x.dup
        histo_ranges.map do |rng|
          # Values above the lower limit of this bin that were not counted
          # in a previous (higher) bin
          before = remaining.size
          remaining.delete_if { |value| value > rng[1] }
          before - remaining.size
        end
      end

      # Histogram counts in the ranges determined by #histo_ranges
      def histo_counts
        @histo_counts ||= calculate_histo_counts
      end

      # --- Bimodality coefficients ---

      # Sarle's sample bimodality coefficient b
      #
      # Raises ZeroDivisionError for samples of size 2 or 3, where the
      # finite-sample correction term is undefined
      def sarle_bimodality
        denominator = (n - 2) * (n - 3)
        raise ZeroDivisionError, 'divided by 0' if denominator.zero?

        # Float arithmetic prevents truncation of the correction term
        correction = 3.0 * ((n - 1)**2) / denominator
        (skewness**2 + 1) / (kurtosis + correction)
      end

      # de Michele & Accantino (2014) B index
      # DOI: 10.1371%2Fjournal.pone.0091195
      def dma_bimodality
        (mean - dma_mu_M).abs
      end

      # µ_M index proposed by Michele & Accantino (2014)
      # DOI: 10.1371%2Fjournal.pone.0091195
      def dma_mu_M
        histo_counts.each_with_index.map { |m, k| m * histo_mids[k] }
                    .inject(:+) / n
      end
    end
  end
end
|
152
|
+
|