statsample 0.17.0 → 0.18.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -2,6 +2,10 @@ require 'statsample/graph/scatterplot'
2
2
  require 'statsample/graph/boxplot'
3
3
  require 'statsample/graph/histogram'
4
4
  module Statsample
5
+ # Several graphs, based on Rubyvis[http://rubyvis.rubyforge.org/]
6
+ # * Statsample::Graph::Boxplot
7
+ # * Statsample::Graph::Histogram
8
+ # * Statsample::Graph::Scatterplot
5
9
  module Graph
6
10
  end
7
11
  end
@@ -45,6 +45,10 @@ module Statsample
45
45
  attr_accessor :maximum
46
46
  # Vectors to box-plot
47
47
  attr_accessor :vectors
48
+ # The rotation angle, in radians. Text is rotated clockwise relative
49
+ # to the anchor location. For example, with the default left alignment,
50
+ # an angle of Math::PI / 2 causes text to proceed downwards. The default angle is zero.
51
+ attr_accessor :label_angle
48
52
 
49
53
  attr_reader :x_scale, :y_scale
50
54
  # Create a new Boxplot.
@@ -66,7 +70,8 @@ module Statsample
66
70
  :margin_left=>20,
67
71
  :margin_right=>20,
68
72
  :minimum=>nil,
69
- :maximum=>nil
73
+ :maximum=>nil,
74
+ :label_angle=>0
70
75
  }
71
76
  @opts=opts_default.merge(opts)
72
77
  opts_default.keys.each {|k| send("#{k}=", @opts[k]) }
@@ -94,26 +99,24 @@ module Statsample
94
99
 
95
100
  data=@vectors.map {|v|
96
101
  out={:percentil_25=>v.percentil(25), :median=>v.median, :percentil_75=>v.percentil(75), :name=>v.name}
97
- out[:iqr]=out[:percentil_75]-out[:percentil_25]
102
+ out[:iqr]=out[:percentil_75] - out[:percentil_25]
98
103
 
99
- irq_max=out[:percentil_75]+out[:iqr]
100
- irq_min=out[:percentil_25]-out[:iqr]
104
+ irq_max=out[:percentil_75] + out[:iqr]
105
+ irq_min=out[:percentil_25] - out[:iqr]
101
106
 
102
107
  # Find the last data inside the margin
103
- min=out[:percentil_25]
104
- max=out[:percentil_75]
108
+ min = out[:percentil_25]
109
+ max = out[:percentil_75]
105
110
 
106
111
  v.each {|d|
107
- min=d if d<min and d>irq_min
108
- max=d if d>max and d<irq_max
112
+ min=d if d < min and d > irq_min
113
+ max=d if d > max and d < irq_max
109
114
  }
110
115
  # Whiskers!
111
116
  out[:low_whisker]=min
112
117
  out[:high_whisker]=max
113
118
  # And now, data outside whiskers
114
- out[:outliers]=v.data_with_nils.find_all {|d|
115
- d<min or d>max
116
- }
119
+ out[:outliers]=v.data_with_nils.find_all {|d| d < min or d > max }
117
120
  out
118
121
  }
119
122
 
@@ -132,7 +135,6 @@ module Statsample
132
135
  bottom y_scale
133
136
  stroke_style {|d| d!=0 ? "#eee" : "#000"}
134
137
  label(:anchor=>'left') do
135
- visible {|d| true}
136
138
  text y_scale.tick_format
137
139
  end
138
140
  end
@@ -140,23 +142,27 @@ module Statsample
140
142
  bottom 0
141
143
  stroke_style 'black'
142
144
  end
145
+
146
+ # Labels
147
+
143
148
  pan.label do |l|
144
149
  l.data data
145
- l.left {|v| x_scale.scale(index)}
146
- l.bottom -15
150
+ l.text_angle that.label_angle
151
+ l.left {|v| x_scale[index] }
152
+ l.bottom(-15)
147
153
  l.text {|v,x| v[:name]}
148
154
  end
149
155
 
150
156
  pan.panel do |bp|
151
157
  bp.data data
152
- bp.left {|v| x_scale.scale(index)}
158
+ bp.left {|v| x_scale[index]}
153
159
  bp.width x_scale.range_band
154
160
 
155
161
 
156
162
  # Bar
157
163
  bp.bar do |b|
158
- b.bottom {|v| y_scale.scale(v[:percentil_25])}
159
- b.height {|v| y_scale.scale(v[:percentil_75]) - y_scale.scale(v[:percentil_25]) }
164
+ b.bottom {|v| y_scale[v[:percentil_25]]}
165
+ b.height {|v| y_scale[v[:percentil_75]] - y_scale[v[:percentil_25]] }
160
166
  b.line_width 1
161
167
  b.stroke_style {|v|
162
168
  if that.groups
@@ -177,34 +183,39 @@ module Statsample
177
183
  end
178
184
  # Median
179
185
  bp.rule do |r|
180
- r.bottom {|v| y_scale.scale(v[:median])}
186
+ r.bottom {|v| y_scale[v[:median]]}
181
187
  r.width x_scale.range_band
182
188
  r.line_width 2
183
189
  end
184
-
190
+ ##
185
191
  # Whiskers
192
+ ##
193
+ # Low whisker
186
194
  bp.rule do |r|
187
- r.visible {|v| v[:percentil_25]>v[:low_whisker]}
188
- r.bottom {|v| y_scale.scale(v[:low_whisker])}
195
+ r.visible {|v| v[:percentil_25] > v[:low_whisker]}
196
+ r.bottom {|v| y_scale[v[:low_whisker]]}
189
197
  end
198
+
190
199
  bp.rule do |r|
191
- r.visible {|v| v[:percentil_25]>v[:low_whisker]}
192
- r.bottom {|v| y_scale.scale(v[:low_whisker])}
200
+ r.visible {|v| v[:percentil_25] > v[:low_whisker]}
201
+ r.bottom {|v| y_scale[v[:low_whisker]]}
193
202
  r.left {|v| x_scale.range_band / 2.0}
194
- r.height {|v| y_scale.scale(v[:percentil_25])-y_scale.scale(v[:low_whisker])}
203
+ r.height {|v| y_scale.scale(v[:percentil_25]) - y_scale.scale(v[:low_whisker])}
195
204
  end
205
+ # High whisker
206
+
196
207
  bp.rule do |r|
197
- r.visible {|v| v[:percentil_75]<v[:high_whisker]}
208
+ r.visible {|v| v[:percentil_75] < v[:high_whisker]}
198
209
  r.bottom {|v| y_scale.scale(v[:high_whisker])}
199
210
  end
200
211
 
201
212
  bp.rule do |r|
202
- r.visible {|v| v[:percentil_75]<v[:high_whisker]}
213
+ r.visible {|v| v[:percentil_75] < v[:high_whisker]}
203
214
  r.bottom {|v| y_scale.scale(v[:percentil_75])}
204
215
  r.left {|v| x_scale.range_band / 2.0}
205
- r.height {|v| y_scale.scale(v[:high_whisker])-y_scale.scale(v[:percentil_75])}
216
+ r.height {|v| y_scale.scale(v[:high_whisker]) - y_scale.scale(v[:percentil_75])}
206
217
  end
207
-
218
+ # Outliers
208
219
  bp.dot do |dot|
209
220
  dot.shape_size 4
210
221
  dot.data {|v| v[:outliers]}
@@ -1,12 +1,26 @@
1
1
  require 'rubyvis'
2
2
  module Statsample
3
3
  module Graph
4
+
5
+ # In statistics, a histogram is a graphical representation, showing a visual impression of the distribution of experimental data. It is an estimate of the probability distribution of a continuous variable and was first introduced by Karl Pearson [1]. A histogram consists of tabular frequencies, shown as adjacent rectangles, erected over discrete intervals (bins), with an area equal to the frequency of the observations in the interval. The height of a rectangle is also equal to the frequency density of the interval, i.e., the frequency divided by the width of the interval. The total area of the histogram is equal to the number of data.
6
+ #
7
+ # == Usage
8
+ # === Svg output
9
+ # a=[1,2,3,4].to_scale
10
+ # puts Statsample::Graph::Histogram.new(a).to_svg
11
+ # === Using ReportBuilder
12
+ # a=[1,2,3,4].to_scale
13
+ # rb=ReportBuilder.new
14
+ # rb.add(Statsample::Graph::Histogram.new(a))
15
+ # rb.save_html('histogram.html')
16
+
4
17
  class Histogram
5
18
  include Summarizable
19
+ # Histogram name
6
20
  attr_accessor :name
7
- # Total width of Boxplot
21
+ # Total width
8
22
  attr_accessor :width
9
- # Total height of Boxplot
23
+ # Total height
10
24
  attr_accessor :height
11
25
  # Top margin
12
26
  attr_accessor :margin_top
@@ -19,44 +33,91 @@ module Statsample
19
33
  attr_reader :hist
20
34
  # Could be an array of ranges or number of bins
21
35
  attr_accessor :bins
36
+ # Minimum value on x axis. Calculated automatically from data if not set
37
+ attr_accessor :minimum_x
38
+ # Maximum value on x axis. Calculated automatically from data if not set
39
+ attr_accessor :maximum_x
40
+ # Minimum value on y axis. Set to 0 if not set
41
+ attr_accessor :minimum_y
42
+ # Maximum value on y axis. Calculated automatically from data if not set.
43
+ attr_accessor :maximum_y
44
+ # Add a line showing normal distribution
45
+ attr_accessor :line_normal_distribution
22
46
  # data could be a vector or a histogram
23
47
  def initialize(data,opts=Hash.new)
24
- prov_name=data.respond_to? :name ? data.name : ""
48
+ prov_name=(data.respond_to?(:name)) ? data.name : ""
25
49
  opts_default={
26
50
  :name=>_("Histograma (%s)") % prov_name,
27
51
  :width=>400,
28
52
  :height=>300,
29
53
  :margin_top=>10,
30
54
  :margin_bottom=>20,
31
- :margin_left=>20,
55
+ :margin_left=>30,
32
56
  :margin_right=>20,
33
- :bins=>nil
57
+ :minimum_x=>nil,
58
+ :maximum_x=>nil,
59
+ :minimum_y=>nil,
60
+ :maximum_y=>nil,
61
+ :bins=>nil,
62
+ :line_normal_distribution=>false
34
63
  }
35
64
  @opts=opts_default.merge(opts)
36
65
  opts_default.keys.each {|k| send("#{k}=", @opts[k]) }
37
66
  @data=data
38
67
  end
39
- def pre_vis
68
+ def pre_vis # :nodoc:
40
69
  if @data.is_a? Statsample::Histogram
41
70
  @hist=@data
71
+ @mean=@hist.estimated_mean
72
+ @sd=@hist.estimated_standard_deviation
42
73
  elsif @data.is_a? Statsample::Vector
74
+ @mean=@data.mean
75
+ @sd=@data.sd
43
76
  @bins||=Math::sqrt(@data.size).floor
44
77
  @hist=@data.histogram(@bins)
45
78
  end
46
79
  end
80
+ def rubyvis_normal_distribution(pan)
81
+ x_scale=@x_scale
82
+ y_scale=@y_scale
83
+
84
+ wob = @hist.get_range(0)[1] - @hist.get_range(0)[0]
85
+
86
+ nob = ((@maximum_x-@minimum_x) / wob.to_f).floor
87
+ sum=@hist.sum
88
+
89
+ data=nob.times.map {|i|
90
+ l=@minimum_x+i*wob
91
+ r=@minimum_x+(i+1)*wob
92
+ middle=(l+r) / 2.0
93
+ pi=Distribution::Normal.cdf((r-@mean) / @sd) - Distribution::Normal.cdf((l-@mean) / @sd)
94
+ {:x=>middle, :y=>pi*sum}
95
+ }
96
+ pan.line do |l|
97
+ l.data data
98
+ l.interpolate "cardinal"
99
+ l.stroke_style "black"
100
+ l.bottom {|d| y_scale[d[:y]]}
101
+ l.left {|d| x_scale[d[:x]]}
102
+ end
103
+
104
+ end
47
105
  # Returns a Rubyvis panel with histogram
48
106
  def rubyvis_panel # :nodoc:
49
107
  pre_vis
50
108
  that=self
51
109
 
52
- max_bin = @hist.max_val
110
+ @minimum_x||=@hist.min
111
+ @maximum_x||=@hist.max
112
+ @minimum_y||=0
113
+ @maximum_y||=@hist.max_val
53
114
 
54
115
  margin_hor=margin_left + margin_right
55
116
  margin_vert=margin_top + margin_bottom
56
117
 
57
- x_scale = pv.Scale.linear(@hist.min, @hist.max).range(0,width-margin_hor)
118
+ x_scale = pv.Scale.linear(@minimum_x, @maximum_x).range(0, width - margin_hor)
58
119
 
59
- y_scale=Rubyvis::Scale.linear(0,max_bin).range(0, height-margin_vert)
120
+ y_scale=Rubyvis::Scale.linear(@minimum_y, @maximum_y).range(0, height - margin_vert)
60
121
 
61
122
  y_scale.nice
62
123
  max_range=@hist.max
@@ -67,6 +128,8 @@ module Statsample
67
128
  :value=>@hist.bin[i]
68
129
  }
69
130
  }
131
+ @x_scale=x_scale
132
+ @y_scale=y_scale
70
133
  # cache data
71
134
  vis=Rubyvis::Panel.new do |pan|
72
135
  pan.width width - margin_hor
@@ -90,21 +153,22 @@ module Statsample
90
153
  left x_scale
91
154
  stroke_style "black"
92
155
  height 5
93
- bottom -5
156
+ bottom(-5)
94
157
  label(:anchor=>'bottom') do
95
158
  text x_scale.tick_format
96
159
  end
97
160
  end
98
-
161
+
99
162
  pan.bar do |bar|
100
163
  bar.data(bins)
101
- bar.left {|v| x_scale.scale(v[:low])}
102
- bar.width {|v| x_scale.scale(v[:high]) - x_scale.scale(v[:low])}
164
+ bar.left {|v| x_scale[v[:low]]}
165
+ bar.width {|v| x_scale[v[:high]] - x_scale[v[:low]]}
103
166
  bar.bottom 0
104
- bar.height {|v| y_scale.scale(v[:value])}
167
+ bar.height {|v| y_scale[v[:value]]}
105
168
  bar.stroke_style "black"
106
169
  bar.line_width 1
107
170
  end
171
+ rubyvis_normal_distribution(pan) if @line_normal_distribution
108
172
  end
109
173
  end
110
174
  # Returns SVG with histogram
@@ -41,7 +41,25 @@ module Statsample
41
41
 
42
42
  attr_reader :data
43
43
  attr_reader :v1,:v2
44
+
45
+ # Array with assignation to groups of bars
46
+ # For example, for four vectors,
47
+ # boxplot.groups=[1,2,1,3]
48
+ # Assign same color to first and third element, and different to
49
+ # second and fourth
50
+ attr_accessor :groups
51
+
52
+
44
53
  attr_reader :x_scale, :y_scale
54
+ # Minimum value on x axis. Calculated automatically from data if not set
55
+ attr_accessor :minimum_x
56
+ # Maximum value on x axis. Calculated automatically from data if not set
57
+ attr_accessor :maximum_x
58
+ # Minimum value on y axis. Calculated automatically from data if not set
59
+ attr_accessor :minimum_y
60
+ # Maximum value on y axis. Calculated automatically from data if not set.
61
+ attr_accessor :maximum_y
62
+
45
63
  # Create a new Scatterplot.
46
64
  # Params:
47
65
  # * v1: Vector on X axis
@@ -59,14 +77,18 @@ module Statsample
59
77
  :margin_top=>10,
60
78
  :margin_bottom=>20,
61
79
  :margin_left=>20,
62
- :margin_right=>20
63
-
80
+ :margin_right=>20,
81
+ :minimum_x=>nil,
82
+ :maximum_x=>nil,
83
+ :minimum_y=>nil,
84
+ :maximum_y=>nil,
85
+ :groups=>nil
64
86
  }
65
87
  @opts=opts_default.merge(opts)
66
88
  opts_default.keys.each {|k| send("#{k}=", @opts[k]) }
67
89
  @data=[]
68
90
  @v1.each_with_index {|d1,i|
69
- @data.push({:x=>d1,:y=>@v2[i]})
91
+ @data.push({:x=>d1, :y=>@v2[i]})
70
92
  }
71
93
  end
72
94
  # Add a rule on median of X and Y axis
@@ -98,13 +120,24 @@ module Statsample
98
120
  def rubyvis_panel # :nodoc:
99
121
  that=self
100
122
  #p @v1.map {|v| v}
101
- x=Rubyvis::Scale.linear(@v1.to_a).range(0,width)
102
- y=Rubyvis::Scale.linear(@v2.to_a).range(0,height)
123
+
124
+ @minimum_x||=@v1.min
125
+ @maximum_x||=@v1.max
126
+ @minimum_y||=@v2.min
127
+ @maximum_y||=@v2.max
128
+
129
+ colors=Rubyvis::Colors.category10
130
+
131
+ margin_hor=margin_left + margin_right
132
+ margin_vert=margin_top + margin_bottom
133
+
134
+ x=Rubyvis::Scale.linear(@minimum_x, @maximum_x).range(0, width - margin_hor)
135
+ y=Rubyvis::Scale.linear(@minimum_y, @maximum_y).range(0, height - margin_vert)
103
136
  @x_scale=x
104
137
  @y_scale=y
105
138
  vis=Rubyvis::Panel.new do |pan|
106
- pan.width width - (margin_left + margin_right)
107
- pan.height height - (margin_top + margin_bottom)
139
+ pan.width width - margin_hor
140
+ pan.height height - margin_vert
108
141
  pan.bottom margin_bottom
109
142
  pan.left margin_left
110
143
  pan.right margin_right
@@ -115,7 +148,7 @@ module Statsample
115
148
  bottom y
116
149
  stroke_style {|d| d!=0 ? "#eee" : "#000"}
117
150
  label(:anchor=>'left') do
118
- visible {|d| d>0 and d<that.width}
151
+ visible {|d| d!=0 and d < that.width}
119
152
  text y.tick_format
120
153
  end
121
154
  end
@@ -136,9 +169,26 @@ module Statsample
136
169
  pan.panel do
137
170
  data(that.data)
138
171
  dot do
139
- left {|d| x.scale(d[:x])}
140
- bottom {|d| y.scale(d[:y])}
141
- stroke_style Rubyvis.color("red").alpha(that.dot_alpha)
172
+ left {|d| x[d[:x]]}
173
+ bottom {|d| y[d[:y]]}
174
+
175
+ fill_style {|v|
176
+ alpha=(that.dot_alpha-0.3<=0) ? 0.1 : that.dot_alpha-0.3
177
+ if that.groups
178
+
179
+ colors.scale(that.groups[index]).alpha(alpha)
180
+ else
181
+ colors.scale(0).alpha(alpha)
182
+ end
183
+ }
184
+
185
+ stroke_style {|v|
186
+ if that.groups
187
+ colors.scale(that.groups[parent.index]).alpha(that.dot_alpha)
188
+ else
189
+ colors.scale(0).alpha(that.dot_alpha)
190
+ end
191
+ }
142
192
  shape_radius 2
143
193
  end
144
194
  end
@@ -38,6 +38,7 @@ module Statsample
38
38
  # * http://www.gnu.org/software/gsl/manual/html_node/The-histogram-struct.html
39
39
 
40
40
  class Histogram
41
+ include Enumerable
41
42
  class << self
42
43
  # Alloc +n_bins+, using +range+ as ranges of bins
43
44
  def alloc(n_bins, range=nil, opts=Hash.new)
@@ -91,7 +92,7 @@ module Statsample
91
92
  end
92
93
  #
93
94
  def increment(x, w=1)
94
- if x.is_a? Array
95
+ if x.respond_to? :each
95
96
  x.each{|y| increment(y,w) }
96
97
  elsif x.is_a? Numeric
97
98
  (range.size-1).times do |i|
@@ -121,6 +122,41 @@ module Statsample
121
122
  def min_val
122
123
  @bin.min
123
124
  end
125
+ def each
126
+ bins.times.each do |i|
127
+ r=get_range(i)
128
+ arg={:i=>i, :low=>r[0],:high=>r[1], :middle=>(r[0]+r[1]) / 2.0, :value=>@bin[i]}
129
+ yield arg
130
+ end
131
+ end
132
+ def estimated_variance
133
+ sum,n=0,0
134
+ mean=estimated_mean
135
+ each do |v|
136
+ sum+=v[:value]*(v[:middle]-mean)**2
137
+ n+=v[:value]
138
+ end
139
+ sum / (n-1)
140
+ end
141
+ def estimated_standard_deviation
142
+ Math::sqrt(estimated_variance)
143
+ end
144
+ def estimated_mean
145
+ sum,n=0,0
146
+ each do |v|
147
+ sum+= v[:value]* v[:middle]
148
+ n+=v[:value]
149
+ end
150
+ sum / n
151
+ end
152
+ alias :mean :estimated_mean
153
+ alias :sigma :estimated_standard_deviation
154
+
155
+ def sum(start=nil,_end=nil)
156
+ start||=0
157
+ _end||=@n_bins-1
158
+ (start.._end).inject(0) {|ac,i| ac+@bin[i]}
159
+ end
124
160
  def report_building(generator)
125
161
  hg=Statsample::Graph::Histogram.new(self)
126
162
  generator.parse_element(hg)