RubyGems - statsample - Versions diffs - 0.3.0 - Mend

statsample 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

data/History.txt +79 -0
data/Manifest.txt +56 -0
data/README.txt +77 -0
data/Rakefile +22 -0
data/bin/statsample +2 -0
data/demo/benchmark.rb +52 -0
data/demo/chi-square.rb +44 -0
data/demo/dice.rb +13 -0
data/demo/distribution_t.rb +95 -0
data/demo/graph.rb +9 -0
data/demo/item_analysis.rb +30 -0
data/demo/mean.rb +81 -0
data/demo/proportion.rb +57 -0
data/demo/sample_test.csv +113 -0
data/demo/strata_proportion.rb +152 -0
data/demo/stratum.rb +141 -0
data/lib/spss.rb +131 -0
data/lib/statsample.rb +216 -0
data/lib/statsample/anova.rb +74 -0
data/lib/statsample/bivariate.rb +255 -0
data/lib/statsample/chidistribution.rb +39 -0
data/lib/statsample/codification.rb +120 -0
data/lib/statsample/converters.rb +338 -0
data/lib/statsample/crosstab.rb +122 -0
data/lib/statsample/dataset.rb +526 -0
data/lib/statsample/dominanceanalysis.rb +259 -0
data/lib/statsample/dominanceanalysis/bootstrap.rb +126 -0
data/lib/statsample/graph/gdchart.rb +45 -0
data/lib/statsample/graph/svgboxplot.rb +108 -0
data/lib/statsample/graph/svggraph.rb +181 -0
data/lib/statsample/graph/svghistogram.rb +208 -0
data/lib/statsample/graph/svgscatterplot.rb +111 -0
data/lib/statsample/htmlreport.rb +232 -0
data/lib/statsample/multiset.rb +281 -0
data/lib/statsample/regression.rb +522 -0
data/lib/statsample/reliability.rb +235 -0
data/lib/statsample/resample.rb +20 -0
data/lib/statsample/srs.rb +159 -0
data/lib/statsample/test.rb +25 -0
data/lib/statsample/vector.rb +759 -0
data/test/_test_chart.rb +58 -0
data/test/test_anova.rb +31 -0
data/test/test_codification.rb +59 -0
data/test/test_crosstab.rb +55 -0
data/test/test_csv.csv +7 -0
data/test/test_csv.rb +27 -0
data/test/test_dataset.rb +293 -0
data/test/test_ggobi.rb +42 -0
data/test/test_multiset.rb +98 -0
data/test/test_regression.rb +108 -0
data/test/test_reliability.rb +32 -0
data/test/test_resample.rb +23 -0
data/test/test_srs.rb +14 -0
data/test/test_statistics.rb +152 -0
data/test/test_stratified.rb +19 -0
data/test/test_svg_graph.rb +63 -0
data/test/test_vector.rb +265 -0
data/test/test_xls.rb +32 -0
metadata +158 -0

data/lib/statsample/htmlreport.rb ADDED Viewed

@@ -0,0 +1,232 @@
+require 'statsample/graph/svggraph'
+module Statsample
+    class HtmlReport
+    def initialize(name,dir=nil)
+        require 'fileutils'
+        @uniq=1
+        @uniq_file=0
+        @name=name
+        @partials=[]
+        @anchors=[]
+        dir||=@name+"/"
+        @dir=dir
+        @level=1
+        FileUtils.mkdir(@dir) if !File.exists? @dir
+    end
+    def add_summary(name,summary)
+        add_anchor(name)
+        @partials.push(summary)
+    end
+    def add_anchor(name)
+        @anchors.push([name,@level,@uniq])
+        @partials.push("<a name='#{@uniq}'> </a>")
+        @uniq+=1
+    end
+    def uniq_file(prepend="file")
+        @uniq_file+=1
+        "#{prepend}_#{@uniq_file}_#{Time.now.to_i}"
+    end
+    def add_correlation_matrix(ds)
+        add_anchor("Correlation Matrix")
+        html="<h2>Correlation Matrix</h2> <table><thead><th>-</th><th>"+ds.fields.join("</th><th>")+"</th> </thead> <tbody>"
+        matrix=Statsample::Bivariate.correlation_matrix(ds)
+        pmatrix=Statsample::Bivariate.correlation_probability_matrix(ds)
+        (0...(matrix.row_size)).each {|row|
+            html+="<tr><td>"+ds.fields[row]+"</td>"
+            (0...(matrix.column_size)).each {|col|
+                if matrix[row,col].nil?
+                    html+="<td>--</td>"
+                else
+                    sig=""
+                    prob_out=""
+                    if !pmatrix[row,col].nil?
+                        prob=pmatrix[row,col]
+                        prob_out=sprintf("%0.3f",prob)
+                        if prob<0.01
+                            sig="**"
+                        elsif prob<0.05
+                            sig="*"
+                        else
+                            sig=""
+                        end
+                    end
+                    if sig==""
+                        html+="<td>#{sprintf("%0.3f",matrix[row,col])} #{sig}<br /> #{prob_out}</td>"
+                    else
+                        html+="<td><strong>#{sprintf("%0.3f",matrix[row,col])} #{sig}<br /> #{prob_out}</strong></td>"
+                    end
+                end
+            }
+            html+="</tr>"
+        }
+        html+="</tbody></table>"
+        @partials.push(html)
+    end
+    # Add a scale
+    # First arg is the name of the scale
+    # Other are fields
+    def add_scale(ds,name, fields,icc=false)
+        raise "Fields are empty" if fields.size==0
+        add_anchor("Scale:#{name}")
+        ds_partial=ds.dup(fields)
+        ia=Statsample::Reliability::ItemAnalysis.new(ds_partial)
+        html="<h2>Scale: #{name}</h2>"
+        html << ia.html_summary
+        @partials.push(html)
+        @level+=1
+        v=ds_partial.vector_mean
+            add_histogram(name, v)
+            add_runsequence_plot(name, v)
+            add_normalprobability_plot(name,v)
+            add_icc(name,fields) if icc
+        @level-=1
+    end
+    def add_boxplot(name,vector,options={})
+        add_graph("Box Plot #{name}", name, vector.svggraph_boxplot(options))
+    end
+    def add_graph(name,id,graph)
+        add_anchor(name)
+        rs_file=@dir+"/#{uniq_file()}.svg"
+        html = "<h3>#{name}</h3> <p><embed src='#{rs_file}'  width='#{graph.width}' height='#{graph.height}' type='image/svg+xml' /></p>\n"
+        File.open(rs_file, "w") {|f|
+            f.puts(graph.burn)
+        }
+        @partials.push(html)
+    end
+    def add_runsequence_plot(name, vector,options={})
+        add_graph("Run-Sequence Plot #{name}", name, vector.svggraph_runsequence_plot(options))
+    end
+    def add_lag_plot(name,vector, options={})
+        add_graph("Lag Plot #{name}", name,vector.svggraph_lag_plot(options))
+    end
+    def add_normalprobability_plot(name,vector,options={})
+        add_graph("Normal Probability Plot #{name}", name, vector.svggraph_normalprobability_plot(options))
+    end
+    def add_scatterplot(name, ds,x_field=nil, y_fields=nil,config={})
+        add_anchor("Scatterplot: #{name}")
+        x_field||=ds.fields[0]
+        y_fields||=ds.fields-[x_field]
+        ds_partial=ds.dup([x_field]+y_fields)
+        sc=Statsample::Graph::SvgScatterplot.new(ds_partial, config)
+        sc.parse
+        sc_file=@dir+"/#{uniq_file("sc")}.svg"
+        html = "<h3>Scatterplot #{name}</h3> <p><embed src='#{sc_file}'  width='#{sc.width}' height='#{sc.height}' type='image/svg+xml' /></p>\n"
+        File.open(sc_file, "w") {|f|
+              f.puts(sc.burn)
+        }
+        @partials.push(html)
+    end
+    def add_boxplots(name, ds,options={})
+        add_anchor("Boxplots: #{name}")
+        options={:graph_title=>"Boxplots:#{name}", :show_graph_title=>true, :height=>500}.merge! options
+        graph = Statsample::Graph::SvgBoxplot.new(options)
+        ds.fields.each{|f|
+            graph.add_data(:title=>f,
+                :data=>ds[f].valid_data,
+                :vector=>ds[f]
+                )
+        }
+        add_graph(name,name,graph)
+        graph
+    end
+    def add_histogram(name,vector,bins=nil,options={})
+        bins||=vector.size / 15
+        bins=15 if bins>15
+        graph=vector.svggraph_histogram(bins,options)
+        add_graph("Histogram:#{name}",name,graph)
+        html = "<ul><li>Skewness=#{sprintf("%0.3f",vector.skew)}</li>
+        <li>Kurtosis=#{sprintf("%0.3f",vector.kurtosis)}</li></ul>"
+        @partials.push(html)
+    end
+    def add_icc(name,ds, fields)
+        require 'statsample/graph/svggraph'
+        raise "Fields are empty" if fields.size==0
+        add_anchor("ICC:#{name}")
+        ds_partial=ds.dup(fields)
+        ia=Statsample::Reliability::ItemAnalysis.new(ds_partial)
+        html="<h3>ICC for scale: #{name}</h3>"
+        ia.svggraph_item_characteristic_curve(@dir ,name, {:width=>400,:height=>300})
+        ds_partial.fields.sort.each{|f|
+            html << "<div><p><strong>#{f}</strong></p><embed src='#{@dir}/#{name}_#{f}.svg'  width='400' height='300' type='image/svg+xml' /></div>\n"
+        }
+        @partials.push(html)
+    end
+    # Returns the stylesheet that parse embeds in the report's <style>
+    # element. The text is a fixed heredoc with table styling rules.
+    # Note: the "tr.even, tr.odd" selector appears twice, once for the
+    # background/border and once for the padding.
+    def css
+<<HERE
+table {
+  border-collapse: collapse;
+}
+th {
+  text-align: left;
+  padding-right: 1em;
+  border-bottom: 3px solid #ccc;
+}
+th.active img {
+  display: inline;
+}
+tr.even, tr.odd {
+  background-color: #eee;
+  border-bottom: 1px solid #ccc;
+}
+tr.even, tr.odd {
+  padding: 0.1em 0.6em;
+}
+td.active {
+  background-color: #ddd;
+}
+table td {
+border:1px solid #aaa;
+}
+table tr.line td{
+border-top: 2px solid black;
+}
+HERE
+    end
+    def create_uls(level)
+        if @c_level!=level
+            if level>@c_level
+                "<ul>\n" * (level-@c_level)
+            else
+                "</ul>\n" * (@c_level-level)
+            end
+        else
+            ""
+        end
+    end
+    def parse
+        html="<html><head><title>#{@name}</title><style>#{css()}</style></head><body><h1>Report: #{@name}</h1>"
+        if @anchors.size>0
+            html << "<div class='index'>Index</div><ul>"
+            @c_level=1
+            @anchors.each{|name,level,uniq|
+                html << create_uls(level)
+                @c_level=level
+                html << "<li><a href='#"+uniq.to_s+"'>#{name}</a></li>"
+            }
+            html << create_uls(1)
+            html << "</ul></div>"
+        end
+        html+="<div class='section'>"+@partials.join("</div><div class='section'>")+"</div>"
+        html+="</body></html>"
+        html
+    end
+    def save(filename)
+        File.open(filename,"w") {|fp|
+            fp.write(parse)
+        }
+    end
+end
+end

data/lib/statsample/multiset.rb ADDED Viewed

@@ -0,0 +1,281 @@
+module Statsample
+    # Multiset joins multiple datasets that share the same fields and vectors
+    # but may have a different number of cases.
+    # This is the base class for stratified and cluster sampling estimation.
+    class Multiset
+        attr_reader :fields, :datasets
+        # To create a multiset
+        # * Multiset.new(%w{f1 f2 f3}) # define only fields
+        def initialize(fields)
+            @fields=fields
+            @datasets={}
+        end
+        def self.new_empty_vectors(fields,ds_names)
+            ms=Multiset.new(fields)
+            ds_names.each{|d|
+                ms.add_dataset(d,Dataset.new(fields))
+            }
+            ms
+        end
+        def datasets_names
+            @datasets.keys.sort
+        end
+        def n_datasets
+            @datasets.size
+        end
+        def add_dataset(key,ds)
+            if(ds.fields!=@fields)
+            raise ArgumentError, "Dataset(#{ds.fields.to_s})must have the same fields of the Multiset(#{@fields})"
+            else
+                @datasets[key]=ds
+            end
+        end
+		def sum_field(field)
+			@datasets.inject(0) {|a,da|
+				stratum_name=da[0]
+                vector=da[1][field]
+				val=yield stratum_name,vector
+				a+val
+			}
+		end
+        def collect_vector(field)
+            @datasets.collect {|k,v|
+                yield k, v[field]
+            }
+        end
+        def[](i)
+            @datasets[i]
+        end
+    end
+    class StratifiedSample
+		class << self
+			# mean for an array of vectors
+			def mean(*v)
+				n_total=0
+				a=v.inject(0){|a,v|
+					n_total+=v.size
+					a+v.sum
+				}
+				a.to_f/n_total
+			end
+            def standard_error_ksd_wr(es)
+                n_total=0
+                sum=es.inject(0){|a,h|
+                    n_total+=h['N']
+                    a+((h['N']**2 * h['s']**2) / h['n'].to_f)
+                }
+                (1.to_f / n_total)*Math::sqrt(sum)
+            end
+            def variance_ksd_wr(es)
+                standard_error_ksd_wr(es)**2
+            end
+            # Source : Cochran (1972)
+            def variance_ksd_wor(es)
+                n_total=es.inject(0) {|a,h|
+                    a+h['N']
+                }
+                es.inject(0){|a,h|
+                    val=((h['N'].to_f / n_total)**2) * (h['s']**2 / h['n'].to_f) * (1 - (h['n'].to_f / h['N']))
+                    a+val
+                }
+            end
+            def standard_error_ksd_wor(es)
+                Math::sqrt(variance_ksd_wor(es))
+            end
+            def variance_esd_wor(es)
+                n_total=es.inject(0) {|a,h|
+                    a+h['N']
+                }
+                sum=es.inject(0){|a,h|
+                    val=h['N']*(h['N']-h['n'])*(h['s']**2 / h['n'].to_f)
+                    a+val
+                }
+                (1.0/(n_total**2))*sum
+            end
+            def standard_error_esd_wor(es)
+                Math::sqrt(variance_ksd_wor(es))
+            end
+            # Based on http://stattrek.com/Lesson6/STRAnalysis.aspx
+            def variance_esd_wr(es)
+                n_total=es.inject(0) {|a,h|
+                    a+h['N']
+                }
+                sum=es.inject(0){|a,h|
+                    val= ((h['s']**2 * h['N']**2) / h['n'].to_f)
+                    a+val
+                }
+                (1.0/(n_total**2))*sum
+            end
+            def standard_error_esd_wr(es)
+                Math::sqrt(variance_esd_wr(es))
+            end
+            def proportion_variance_ksd_wor(es)
+                n_total=es.inject(0) {|a,h|
+                    a+h['N']
+                }
+                es.inject(0){|a,h|
+                    val= (((h['N'].to_f / n_total)**2 * h['p']*(1-h['p'])) / (h['n'])) * (1- (h['n'].to_f / h['N']))
+                    a+val
+                }
+            end
+            def proportion_sd_ksd_wor(es)
+                Math::sqrt(proportion_variance_ksd_wor(es))
+            end
+            def proportion_sd_ksd_wr(es)
+                n_total=es.inject(0) {|a,h|
+                    a+h['N']
+                }
+                sum=es.inject(0){|a,h|
+                    val= (h['N']**2 * h['p']*(1-h['p'])) / h['n'].to_f
+                    a+val
+                }
+                Math::sqrt(sum) * (1.0/n_total)
+            end
+            def proportion_variance_ksd_wr(es)
+                proportion_variance_ksd_wor(es)**2
+            end
+            def proportion_variance_esd_wor(es)
+                n_total=es.inject(0) {|a,h|
+                    a+h['N']
+                }
+                sum=es.inject(0){|a,h|
+                    a=(h['N']**2 * (h['N']-h['n']) * h['p']*(1.0-h['p'])) / ((h['n']-1)*(h['N']-1))
+                    a+val
+                }
+                Math::sqrt(sum) * (1.0/n_total**2)
+            end
+            def proportion_sd_esd_wor(es)
+                Math::sqrt(proportion_variance_ksd_wor(es))
+            end
+		end
+        def initialize(ms,strata_sizes)
+            raise TypeError,"ms should be a Multiset" unless ms.is_a? Statsample::Multiset
+            @ms=ms
+            raise ArgumentError,"You should put a strata size for each dataset" if strata_sizes.keys.sort!=ms.datasets_names
+            @strata_sizes=strata_sizes
+            @population_size=@strata_sizes.inject(0) {|a,x| a+x[1]}
+            @strata_number=@ms.n_datasets
+            @sample_size=@ms.datasets.inject(0) {|a,x| a+x[1].cases}
+        end
+        # Number of strata
+        def strata_number
+            @strata_number
+        end
+        # Population size. Equal to sum of strata sizes
+        # Symbol: N<sub>h</sub>
+        def population_size
+            @population_size
+        end
+        # Sample size. Equal to sum of sample of each stratum
+        def sample_size
+            @sample_size
+        end
+        # Size of stratum x
+        def stratum_size(h)
+            @strata_sizes[h]
+        end
+        def vectors_by_field(field)
+            @ms.datasets.collect{|k,ds|
+                ds[field]
+            }
+        end
+        # Population proportion based on strata
+        def proportion(field, v=1)
+			@ms.sum_field(field) {|s_name,vector|
+				stratum_ponderation(s_name)*vector.proportion(v)
+			}
+        end
+        # Stratum ponderation.
+        # Symbol: W\<sub>h\</sub>
+        def stratum_ponderation(h)
+            @strata_sizes[h].to_f / @population_size
+        end
+        alias_method :wh, :stratum_ponderation
+        # Population mean based on strata
+        def mean(field)
+			@ms.sum_field(field) {|s_name,vector|
+				stratum_ponderation(s_name)*vector.mean
+			}
+        end
+        # Standard error with estimated population variance and without replacement.
+        # Source: Cochran (1972)
+        def standard_error_wor(field)
+            es=@ms.collect_vector(field) {|s_n, vector|
+                {'N'=>@strata_sizes[s_n],'n'=>vector.size, 's'=>vector.sds}
+            }
+            StratifiedSample.standard_error_esd_wor(es)
+        end
+        # Standard error with estimated population variance and without replacement.
+        # Source: http://stattrek.com/Lesson6/STRAnalysis.aspx
+        def standard_error_wor_2(field)
+			sum=@ms.sum_field(field) {|s_name,vector|
+                s_size=@strata_sizes[s_name]
+				(s_size**2 * (1-(vector.size.to_f / s_size)) * vector.variance_sample / vector.size.to_f)
+			}
+            (1/@population_size.to_f)*Math::sqrt(sum)
+        end
+        def standard_error_wr(field)
+            es=@ms.collect_vector(field) {|s_n, vector|
+                {'N'=>@strata_sizes[s_n],'n'=>vector.size, 's'=>vector.sds}
+            }
+            StratifiedSample.standard_error_esd_wr(es)
+        end
+        def proportion_sd_esd_wor(field,v=1)
+            es=@ms.collect_vector(field) {|s_n, vector|
+                {'N'=>@strata_sizes[s_n],'n'=>vector.size, 'p'=>vector.proportion(v)}
+            }
+            StratifiedSample.proportion_sd_esd_wor(es)
+        end
+        def proportion_standard_error(field,v=1)
+            prop=proportion(field,v)
+            sum=@ms.sum_field(field) {|s_name,vector|
+                nh=vector.size
+                s_size=@strata_sizes[s_name]
+                (s_size**2 * (1-(nh/s_size)) * prop * (1-prop) / (nh -1 ))
+            }
+            (1/@population_size.to_f) * Math::sqrt(sum)
+        end
+        # Cochran(1971), p. 150
+        def variance_pst(field,v=1)
+            sum=@ms.datasets.inject(0) {|a,da|
+                stratum_name=da[0]
+                ds=da[1]
+                nh=ds.cases.to_f
+                s_size=@strata_sizes[stratum_name]
+                prop=ds[field].proportion(v)
+                a + (((s_size**2 * (s_size-nh)) / (s_size-1))*(prop*(1-prop) / (nh-1)))
+            }
+            (1/@population_size.to_f ** 2)*sum
+        end
+    end
+end