rails-data-explorer 0.2.3 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -3
- data/README.md +2 -0
- data/lib/rails-data-explorer-no-rails.rb +36 -32
- data/lib/rails-data-explorer.rb +38 -35
- data/lib/rails_data_explorer.rb +29 -10
- data/lib/{rails-data-explorer → rails_data_explorer}/action_view_extension.rb +39 -17
- data/lib/rails_data_explorer/active_record_extension.rb +19 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/chart.rb +10 -0
- data/lib/rails_data_explorer/chart/anova.rb +1 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/box_plot.rb +12 -3
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/box_plot_group.rb +49 -22
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/contingency_table.rb +19 -8
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/descriptive_statistics_table.rb +9 -0
- data/lib/rails_data_explorer/chart/descriptive_statistics_table_group.rb +1 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/histogram_categorical.rb +12 -8
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/histogram_quantitative.rb +12 -2
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/histogram_temporal.rb +11 -2
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/multi_dimensional_charts.rb +2 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/parallel_coordinates.rb +11 -1
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/parallel_set.rb +11 -2
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/pie_chart.rb +12 -8
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/scatterplot.rb +13 -1
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/scatterplot_matrix.rb +2 -0
- data/lib/{rails-data-explorer/chart/stacked_bar_chart_categorical_percent.rb → rails_data_explorer/chart/stacked_bar_chart_categorical.rb} +37 -14
- data/lib/rails_data_explorer/chart/stacked_bar_chart_categorical_percent.rb +28 -0
- data/lib/rails_data_explorer/chart/stacked_histogram_temporal.rb +199 -0
- data/lib/rails_data_explorer/data_series.rb +241 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/data_set.rb +13 -4
- data/lib/{rails-data-explorer → rails_data_explorer}/data_type.rb +13 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/data_type/categorical.rb +79 -18
- data/lib/{rails-data-explorer → rails_data_explorer}/data_type/geo.rb +2 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/data_type/quantitative.rb +14 -4
- data/lib/{rails-data-explorer → rails_data_explorer}/data_type/quantitative/decimal.rb +9 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/data_type/quantitative/integer.rb +9 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/data_type/quantitative/temporal.rb +9 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/engine.rb +12 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/exploration.rb +11 -0
- data/lib/rails_data_explorer/statistics/pearsons_chi_squared_independence_test.rb +72 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/statistics/rng_category.rb +13 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/statistics/rng_gaussian.rb +12 -1
- data/lib/{rails-data-explorer → rails_data_explorer}/statistics/rng_power_law.rb +11 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/utils/color_scale.rb +6 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/utils/data_binner.rb +13 -8
- data/lib/{rails-data-explorer → rails_data_explorer}/utils/data_encoder.rb +2 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/utils/data_quantizer.rb +8 -3
- data/lib/{rails-data-explorer → rails_data_explorer}/utils/rde_table.rb +14 -11
- data/lib/{rails-data-explorer → rails_data_explorer}/utils/value_formatter.rb +9 -4
- data/rails-data-explorer.gemspec +5 -6
- data/spec/rails_data_explorer/chart_spec.rb +11 -0
- data/spec/{rails-data-explorer → rails_data_explorer}/data_series_spec.rb +0 -0
- data/spec/rails_data_explorer/data_set_spec.rb +31 -0
- data/spec/rails_data_explorer/data_type/categorical_spec.rb +126 -0
- data/{lib/rails-data-explorer/chart/descriptive_statistics_table_group.rb → spec/rails_data_explorer/data_type/quantitative/decimal_spec.rb} +0 -0
- data/spec/rails_data_explorer/data_type/quantitative/integer_spec.rb +0 -0
- data/spec/rails_data_explorer/data_type/quantitative/temporal_spec.rb +34 -0
- data/spec/rails_data_explorer/data_type/quantitative_spec.rb +118 -0
- data/spec/rails_data_explorer/data_type_spec.rb +7 -0
- data/spec/{rails-data-explorer → rails_data_explorer}/exploration_spec.rb +5 -5
- data/spec/rails_data_explorer/statistics/pearsons_chi_squared_independence_test_spec.rb +0 -0
- data/spec/rails_data_explorer/utils/color_scale_spec.rb +13 -0
- data/spec/{rails-data-explorer → rails_data_explorer}/utils/data_binner_spec.rb +0 -0
- data/spec/{rails-data-explorer → rails_data_explorer}/utils/data_quantizer_spec.rb +0 -0
- data/spec/rails_data_explorer/utils/value_formatter_spec.rb +33 -0
- data/vendor/assets/stylesheets/sources/rde-default-style.css +5 -1
- metadata +91 -82
- data/lib/rails-data-explorer/active_record_extension.rb +0 -14
- data/lib/rails-data-explorer/constants.rb +0 -5
- data/lib/rails-data-explorer/data_series.rb +0 -156
- data/lib/rails-data-explorer/statistics/pearsons_chi_squared_independence_test.rb +0 -75
- data/spec/rails-data-explorer/data_type/categorical_spec.rb +0 -34
@@ -1,10 +1,19 @@
|
|
1
|
-
#
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
2
3
|
class RailsDataExplorer
|
4
|
+
|
5
|
+
# Responsibilities:
|
6
|
+
# * Container for DataSeries
|
7
|
+
#
|
8
|
+
# Collaborators:
|
9
|
+
# * DataSeries
|
10
|
+
# * Exploration
|
11
|
+
#
|
3
12
|
class DataSet
|
4
13
|
|
5
14
|
attr_reader :data_series
|
6
15
|
|
7
|
-
# @param[Array<Numeric, String, Symbol, Nil, Hash, DataSeries>]
|
16
|
+
# @param values_or_data_series [Array<Numeric, String, Symbol, Nil, Hash, DataSeries>]
|
8
17
|
# Array can contain the following:
|
9
18
|
# * Numeric, String, Symbol, Nil - for a single data series
|
10
19
|
# * Hash - for multiple data series with the following keys:
|
@@ -13,7 +22,7 @@ class RailsDataExplorer
|
|
13
22
|
# * :chart_roles [Array<Symbol>, optional] - what to use this series for. possible values: :x, :y, :color
|
14
23
|
# * :data_type (optional) - :quantitative, :categorical, :temporal
|
15
24
|
# * DataSeries
|
16
|
-
# @param[String]
|
25
|
+
# @param exploration_title [String] used as fallback for data series name
|
17
26
|
def initialize(values_or_data_series, exploration_title)
|
18
27
|
@data_series = initialize_data_series(values_or_data_series, exploration_title)
|
19
28
|
validate_data_series
|
@@ -39,7 +48,7 @@ class RailsDataExplorer
|
|
39
48
|
else
|
40
49
|
raise(
|
41
50
|
ArgumentError.new(
|
42
|
-
"Invalid datum. Only Hash, Numeric,
|
51
|
+
"Invalid datum. Only DataSeries, Hash, ActiveSupport::TimeWithZone, DateTime, Numeric, NilClass, String, or Symbol are allowed. " + \
|
43
52
|
"Found #{ values_or_data_series.first.class.to_s }."
|
44
53
|
)
|
45
54
|
)
|
@@ -1,4 +1,17 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
1
3
|
class RailsDataExplorer
|
4
|
+
|
5
|
+
# Responsibilities:
|
6
|
+
# * Represent a type of data
|
7
|
+
# * Determine available chart types
|
8
|
+
# * Compute descriptive statistics
|
9
|
+
# * Compute modified values
|
10
|
+
#
|
11
|
+
# Collaborators:
|
12
|
+
# * DataSeries
|
13
|
+
# * Chart
|
14
|
+
#
|
2
15
|
class DataType
|
3
16
|
|
4
17
|
# @param[Hash, optional] constraints
|
@@ -1,9 +1,17 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
1
3
|
class RailsDataExplorer
|
2
4
|
class DataType
|
5
|
+
|
6
|
+
# Responsibilities:
|
7
|
+
# * Provide available charts and statistics for categorical data type.
|
8
|
+
# * Provide methods for categorical data type.
|
9
|
+
#
|
10
|
+
# Collaborators:
|
11
|
+
# * DataSet
|
12
|
+
#
|
3
13
|
class Categorical < DataType
|
4
14
|
|
5
|
-
# TODO: when there are too many categories, only separate the N most
|
6
|
-
# significant ones and group all other values under "Other"
|
7
15
|
def all_available_chart_types
|
8
16
|
[
|
9
17
|
{
|
@@ -12,12 +20,12 @@ class RailsDataExplorer
|
|
12
20
|
dimensions_count_min: 1,
|
13
21
|
dimensions_count_max: 1,
|
14
22
|
},
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
23
|
+
{
|
24
|
+
chart_class: Chart::PieChart,
|
25
|
+
chart_roles: [:any],
|
26
|
+
dimensions_count_min: 1,
|
27
|
+
dimensions_count_max: 1,
|
28
|
+
},
|
21
29
|
{
|
22
30
|
chart_class: Chart::BoxPlotGroup,
|
23
31
|
chart_roles: [:y],
|
@@ -34,6 +42,12 @@ class RailsDataExplorer
|
|
34
42
|
chart_roles: [:dimension],
|
35
43
|
dimensions_count_min: 3,
|
36
44
|
},
|
45
|
+
{
|
46
|
+
chart_class: Chart::StackedBarChartCategorical,
|
47
|
+
chart_roles: [:x, :y],
|
48
|
+
dimensions_count_min: 2,
|
49
|
+
dimensions_count_max: 2,
|
50
|
+
},
|
37
51
|
{
|
38
52
|
chart_class: Chart::StackedBarChartCategoricalPercent,
|
39
53
|
chart_roles: [:x, :y],
|
@@ -67,7 +81,7 @@ class RailsDataExplorer
|
|
67
81
|
end
|
68
82
|
|
69
83
|
def descriptive_statistics(values)
|
70
|
-
frequencies = values
|
84
|
+
frequencies = compute_histogram(values)
|
71
85
|
labels_ds = DataSeries.new('_', values.uniq)
|
72
86
|
total_count = values.length
|
73
87
|
ruby_formatters = {
|
@@ -168,25 +182,43 @@ class RailsDataExplorer
|
|
168
182
|
%(function(d) { return d })
|
169
183
|
end
|
170
184
|
|
171
|
-
# @param[Symbol, nil]
|
172
|
-
# @param[DataSeries]
|
173
|
-
# @param[Proc]
|
174
|
-
# @return[Proc] a Proc that will be used by #sort
|
185
|
+
# @param label_val_key [Symbol, nil] the hash key to use to get the label value during sort (sent to a,b)
|
186
|
+
# @param data_series [DataSeries] the ds that contains the uniq vals
|
187
|
+
# @param value_sorter [Proc] the sorting proc to use if not sorted numerically
|
188
|
+
# @return [Proc] a Proc that will be used by #sort
|
175
189
|
def label_sorter(label_val_key, data_series, value_sorter)
|
176
190
|
if data_series.uniq_vals.any? { |e| e.to_s =~ /^[\+\-]?\d+/ }
|
177
191
|
# Sort numerical categories by key ASC
|
192
|
+
# This lambda can be used in conjunction with `#sort`.
|
193
|
+
# It returns -1, 0, or 1
|
178
194
|
lambda { |a,b|
|
179
195
|
number_and_full_string_extractor = lambda { |val|
|
180
196
|
str = label_val_key ? val[label_val_key] : val
|
181
197
|
number = str.gsub(/^[^\d\+\-]*/, '') # remove non-digit leading chars
|
182
198
|
.gsub(',', '') # remove delimiter commas, they throw off to_f parsing
|
183
|
-
|
184
|
-
|
199
|
+
if '' != number
|
200
|
+
# label contains digits
|
201
|
+
number = number.to_f
|
202
|
+
number += 1 if str =~ /^>/ # increase highest threshold by one for proper sorting
|
203
|
+
number -= 1 if str =~ /^</ # decrease lowest threshold by one for proper sorting
|
204
|
+
else
|
205
|
+
# label doesn't contain digits, set to nil to sort at end
|
206
|
+
number = nil
|
207
|
+
end
|
185
208
|
[number, str]
|
186
209
|
}
|
187
|
-
|
188
|
-
|
189
|
-
|
210
|
+
a_num, a_str = number_and_full_string_extractor.call(a)
|
211
|
+
b_num, b_str = number_and_full_string_extractor.call(b)
|
212
|
+
if a_num && b_num
|
213
|
+
# Both numbers are present, compare them
|
214
|
+
[a_num, a_str] <=> [b_num, b_str]
|
215
|
+
elsif a_num
|
216
|
+
# a_num is present, b_num isn't. Sort a before b
|
217
|
+
-1
|
218
|
+
else
|
219
|
+
# a_num is not present (b_num may or may not be). Sort a after b
|
220
|
+
1
|
221
|
+
end
|
190
222
|
}
|
191
223
|
else
|
192
224
|
# Use provided value sorter
|
@@ -194,6 +226,35 @@ class RailsDataExplorer
|
|
194
226
|
end
|
195
227
|
end
|
196
228
|
|
229
|
+
# Returns the top N most frequent distinct observations in values. Groups
|
230
|
+
# less frequent observations under val_for_others.
|
231
|
+
# @param values [Array]
|
232
|
+
# @param max_num_vals [Integer] the max number of distinct values to return (including val_for_others)
|
233
|
+
# @param val_for_others [String, optional] defaults to '[Other]'
|
234
|
+
def limit_distinct_values(values, max_num_vals, val_for_others = nil)
|
235
|
+
distinct_values = values.uniq
|
236
|
+
# Return values if they already have lte max_num_vals distinct observations
|
237
|
+
return values if distinct_values.length <= max_num_vals
|
238
|
+
|
239
|
+
val_for_others ||= '[Other]'
|
240
|
+
frequencies = compute_histogram(values)
|
241
|
+
top_vals = frequencies.to_a.sort { |a,b|
|
242
|
+
# a = [value, frequency]
|
243
|
+
# Sort by frequency DESC, value ASC
|
244
|
+
[b.last, a.first] <=> [a.last, b.first]
|
245
|
+
}.first(max_num_vals - 1).map { |e| e.first }
|
246
|
+
values.map { |e| top_vals.include?(e) ? e : val_for_others }
|
247
|
+
end
|
248
|
+
|
249
|
+
protected
|
250
|
+
|
251
|
+
# Computes a histogram for values
|
252
|
+
# @param values [Array]
|
253
|
+
# @return [Hash] distinct vals as keys and their frequency as value
|
254
|
+
def compute_histogram(values)
|
255
|
+
values.inject(Hash.new(0)) { |m,e| m[e] += 1; m }
|
256
|
+
end
|
257
|
+
|
197
258
|
end
|
198
259
|
end
|
199
260
|
end
|
@@ -1,8 +1,18 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
1
3
|
class RailsDataExplorer
|
2
4
|
class DataType
|
3
|
-
class Quantitative < DataType
|
4
5
|
|
5
|
-
|
6
|
+
# This is an abstract class. Use one of its subclasses.
|
7
|
+
#
|
8
|
+
# Responsibilities:
|
9
|
+
# * Provide available charts and statistics for quantitative data type.
|
10
|
+
# * Provide methods for quantitative data type.
|
11
|
+
#
|
12
|
+
# Collaborators:
|
13
|
+
# * DataSet
|
14
|
+
#
|
15
|
+
class Quantitative < DataType
|
6
16
|
|
7
17
|
def all_available_chart_types
|
8
18
|
[
|
@@ -122,9 +132,9 @@ class RailsDataExplorer
|
|
122
132
|
raise "Implement me in sub_class"
|
123
133
|
end
|
124
134
|
|
125
|
-
def axis_scale(data_series, d3_or_vega)
|
135
|
+
def axis_scale(data_series, modification, d3_or_vega)
|
126
136
|
# Log scales can't handle 0 values
|
127
|
-
if data_series.min_val > 0.0 && data_series.has_large_dynamic_range?
|
137
|
+
if data_series.min_val(modification) > 0.0 && data_series.has_large_dynamic_range?(modification)
|
128
138
|
{ d3: 'd3.scale.log', vega: 'log' }[d3_or_vega]
|
129
139
|
else
|
130
140
|
{ d3: 'd3.scale.linear', vega: 'linear' }[d3_or_vega]
|
@@ -1,6 +1,15 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
1
3
|
class RailsDataExplorer
|
2
4
|
class DataType
|
3
5
|
class Quantitative
|
6
|
+
|
7
|
+
# Responsibilities:
|
8
|
+
# * Provide methods for decimal quantitative data type.
|
9
|
+
#
|
10
|
+
# Collaborators:
|
11
|
+
# * DataSet
|
12
|
+
#
|
4
13
|
class Decimal < Quantitative
|
5
14
|
|
6
15
|
def axis_tick_format(values)
|
@@ -1,6 +1,15 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
1
3
|
class RailsDataExplorer
|
2
4
|
class DataType
|
3
5
|
class Quantitative
|
6
|
+
|
7
|
+
# Responsibilities:
|
8
|
+
# * Provide methods for integer quantitative data type.
|
9
|
+
#
|
10
|
+
# Collaborators:
|
11
|
+
# * DataSet
|
12
|
+
#
|
4
13
|
class Integer < Quantitative
|
5
14
|
|
6
15
|
def axis_tick_format(values)
|
@@ -1,6 +1,15 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
1
3
|
class RailsDataExplorer
|
2
4
|
class DataType
|
3
5
|
class Quantitative
|
6
|
+
|
7
|
+
# Responsibilities:
|
8
|
+
# * Provide methods for temporal quantitative data type.
|
9
|
+
#
|
10
|
+
# Collaborators:
|
11
|
+
# * DataSet
|
12
|
+
#
|
4
13
|
class Temporal < Quantitative
|
5
14
|
|
6
15
|
def all_available_chart_types
|
@@ -1,6 +1,18 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
1
3
|
require 'rails'
|
2
4
|
|
3
5
|
class RailsDataExplorer
|
6
|
+
|
7
|
+
# Responsibilities:
|
8
|
+
# * Tie RailsDataExplorer into a Rails app
|
9
|
+
# * Initialize ActionViewExtension
|
10
|
+
# * Tell rails which assets to precompile
|
11
|
+
#
|
12
|
+
# Collaborators:
|
13
|
+
# * ActiveSupport
|
14
|
+
# * RailsDataExplorer
|
15
|
+
#
|
4
16
|
class Engine < ::Rails::Engine
|
5
17
|
|
6
18
|
# It's an engine so that we can add javascript and image assets
|
@@ -1,4 +1,15 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
1
3
|
class RailsDataExplorer
|
4
|
+
|
5
|
+
# Responsibilities:
|
6
|
+
# * Represent and initialize a data exploration
|
7
|
+
# * Initialize and render self (including charts)
|
8
|
+
#
|
9
|
+
# Collaborators:
|
10
|
+
# * DataSet
|
11
|
+
# * Chart
|
12
|
+
#
|
2
13
|
class Exploration
|
3
14
|
|
4
15
|
attr_accessor :output_buffer # required for content_tag
|
@@ -0,0 +1,72 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
class RailsDataExplorer
|
4
|
+
module Statistics
|
5
|
+
|
6
|
+
# From http://en.wikipedia.org/wiki/Pearson's_chi-squared_test
|
7
|
+
|
8
|
+
# Pearson's chi-squared test is used to assess whether paired observations on two
|
9
|
+
# variables, expressed in a contingency table, are independent of each other.
|
10
|
+
|
11
|
+
# An "observation" consists of the values of two outcomes and the null hypothesis
|
12
|
+
# is that the occurrence of these outcomes is statistically independent. Each
|
13
|
+
# observation is allocated to one cell of a two-dimensional array of cells (called
|
14
|
+
# a contingency table) according to the values of the two outcomes.
|
15
|
+
|
16
|
+
# Assumptions
|
17
|
+
# -----------
|
18
|
+
|
19
|
+
# The chi-squared test, when used with the standard approximation that a chi-
|
20
|
+
# squared distribution is applicable, has the following assumptions:
|
21
|
+
|
22
|
+
# * Simple random sample – The sample data is a random sampling from a fixed
|
23
|
+
# distribution or population where every collection of members of the population
|
24
|
+
# of the given sample size has an equal probability of selection. Variants of
|
25
|
+
# the test have been developed for complex samples, such as where the data is
|
26
|
+
# weighted. Other forms can be used such as purposive sampling.
|
27
|
+
# * Sample size (whole table) – A sample with a sufficiently large size is assumed.
|
28
|
+
# If a chi squared test is conducted on a sample with a smaller size, then the
|
29
|
+
# chi squared test will yield an inaccurate inference. The researcher, by using
|
30
|
+
# chi squared test on small samples, might end up committing a Type II error.
|
31
|
+
# * Expected cell count – Adequate expected cell counts. Some require 5 or more,
|
32
|
+
# and others require 10 or more. A common rule is 5 or more in all cells of a
|
33
|
+
# 2-by-2 table, and 5 or more in 80% of cells in larger tables, but no cells
|
34
|
+
# with zero expected count. When this assumption is not met, Yates's Correction
|
35
|
+
# is applied.
|
36
|
+
# * Independence – The observations are always assumed to be independent of each
|
37
|
+
# other. This means chi-squared cannot be used to test correlated data
|
38
|
+
# (like matched pairs or panel data). In those cases you might want to turn to
|
39
|
+
# McNemar's test.
|
40
|
+
|
41
|
+
# Problems
|
42
|
+
# --------
|
43
|
+
|
44
|
+
# The approximation to the chi-squared distribution breaks down if expected
|
45
|
+
# frequencies are too low. It will normally be acceptable so long as no more than
|
46
|
+
# 20% of the events have expected frequencies below 5. Where there is only 1
|
47
|
+
# degree of freedom, the approximation is not reliable if expected frequencies are
|
48
|
+
# below 10. In this case, a better approximation can be obtained by reducing the
|
49
|
+
# absolute value of each difference between observed and expected frequencies by
|
50
|
+
# 0.5 before squaring; this is called Yates's correction for continuity.
|
51
|
+
|
52
|
+
# In cases where the expected value, E, is found to be small (indicating a small
|
53
|
+
# underlying population probability, and/or a small number of observations), the
|
54
|
+
# normal approximation of the multinomial distribution can fail, and in such cases
|
55
|
+
# it is found to be more appropriate to use the G-test, a likelihood ratio-based
|
56
|
+
# test statistic. Where the total sample size is small, it is necessary to use an
|
57
|
+
# appropriate exact test, typically either the binomial test or (for contingency
|
58
|
+
# tables) Fisher's exact test. This test uses the conditional distribution of the
|
59
|
+
# test statistic given the marginal totals; however, it does not assume that the
|
60
|
+
# data were generated from an experiment in which the marginal totals are fixed
|
61
|
+
# and is valid whether or not that is the case.
|
62
|
+
class PearsonsChiSquaredIndependenceTest
|
63
|
+
|
64
|
+
def initialize(data_matrix, min_probability = 0.05)
|
65
|
+
end
|
66
|
+
|
67
|
+
def compute
|
68
|
+
end
|
69
|
+
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
@@ -1,7 +1,17 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
1
3
|
class RailsDataExplorer
|
2
4
|
module Statistics
|
5
|
+
|
6
|
+
# Responsibilities:
|
7
|
+
# * Provide random categorical data. Useful for testing and demo data.
|
8
|
+
#
|
3
9
|
class RngCategory
|
4
10
|
|
11
|
+
# @param categories [Array<Object>] the pool of available categories.
|
12
|
+
# @param category_probabilities [Array, optional] probability of each category.
|
13
|
+
# @param rng [Proc, optional] lambda to generate random numbers which will
|
14
|
+
# be mapped to categories.
|
5
15
|
def initialize(categories, category_probabilities = nil, rng = lambda { Kernel.rand })
|
6
16
|
@categories, @category_probabilities, @rng = categories, category_probabilities, rng
|
7
17
|
@category_probabilities ||= @categories.map { |e| @rng.call }
|
@@ -9,6 +19,7 @@ class RailsDataExplorer
|
|
9
19
|
@category_order = compute_category_order
|
10
20
|
end
|
11
21
|
|
22
|
+
# Returns a random category
|
12
23
|
def rand
|
13
24
|
r_v = @rng.call
|
14
25
|
rnd = @category_order.detect { |e|
|
@@ -17,6 +28,8 @@ class RailsDataExplorer
|
|
17
28
|
rnd[:category]
|
18
29
|
end
|
19
30
|
|
31
|
+
protected
|
32
|
+
|
20
33
|
def normalize_category_probabilities
|
21
34
|
total = @category_probabilities.inject(0) { |m,e| m += e }
|
22
35
|
@category_probabilities.map { |e| e / total.to_f }
|
@@ -1,12 +1,23 @@
|
|
1
|
-
#
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
2
3
|
class RailsDataExplorer
|
3
4
|
module Statistics
|
5
|
+
|
6
|
+
# Responsibilities:
|
7
|
+
# * Provide random numeric data, following a gaussian distribution.
|
8
|
+
#
|
9
|
+
# From http://stackoverflow.com/a/9266488
|
4
10
|
class RngGaussian
|
11
|
+
|
12
|
+
# @param mean [Float] the expected mean
|
13
|
+
# @param sd [Float] the expected standard deviation
|
14
|
+
# @param rng [Proc, optional] a random number generator
|
5
15
|
def initialize(mean = 0.0, sd = 1.0, rng = lambda { Kernel.rand })
|
6
16
|
@mean, @sd, @rng = mean, sd, rng
|
7
17
|
@compute_next_pair = false
|
8
18
|
end
|
9
19
|
|
20
|
+
# Returns random numbers with a gaussian distribution.
|
10
21
|
def rand
|
11
22
|
if (@compute_next_pair = !@compute_next_pair)
|
12
23
|
# Compute a pair of random values with normal distribution.
|