rails-data-explorer 0.2.3 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -3
- data/README.md +2 -0
- data/lib/rails-data-explorer-no-rails.rb +36 -32
- data/lib/rails-data-explorer.rb +38 -35
- data/lib/rails_data_explorer.rb +29 -10
- data/lib/{rails-data-explorer → rails_data_explorer}/action_view_extension.rb +39 -17
- data/lib/rails_data_explorer/active_record_extension.rb +19 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/chart.rb +10 -0
- data/lib/rails_data_explorer/chart/anova.rb +1 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/box_plot.rb +12 -3
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/box_plot_group.rb +49 -22
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/contingency_table.rb +19 -8
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/descriptive_statistics_table.rb +9 -0
- data/lib/rails_data_explorer/chart/descriptive_statistics_table_group.rb +1 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/histogram_categorical.rb +12 -8
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/histogram_quantitative.rb +12 -2
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/histogram_temporal.rb +11 -2
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/multi_dimensional_charts.rb +2 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/parallel_coordinates.rb +11 -1
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/parallel_set.rb +11 -2
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/pie_chart.rb +12 -8
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/scatterplot.rb +13 -1
- data/lib/{rails-data-explorer → rails_data_explorer}/chart/scatterplot_matrix.rb +2 -0
- data/lib/{rails-data-explorer/chart/stacked_bar_chart_categorical_percent.rb → rails_data_explorer/chart/stacked_bar_chart_categorical.rb} +37 -14
- data/lib/rails_data_explorer/chart/stacked_bar_chart_categorical_percent.rb +28 -0
- data/lib/rails_data_explorer/chart/stacked_histogram_temporal.rb +199 -0
- data/lib/rails_data_explorer/data_series.rb +241 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/data_set.rb +13 -4
- data/lib/{rails-data-explorer → rails_data_explorer}/data_type.rb +13 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/data_type/categorical.rb +79 -18
- data/lib/{rails-data-explorer → rails_data_explorer}/data_type/geo.rb +2 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/data_type/quantitative.rb +14 -4
- data/lib/{rails-data-explorer → rails_data_explorer}/data_type/quantitative/decimal.rb +9 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/data_type/quantitative/integer.rb +9 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/data_type/quantitative/temporal.rb +9 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/engine.rb +12 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/exploration.rb +11 -0
- data/lib/rails_data_explorer/statistics/pearsons_chi_squared_independence_test.rb +72 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/statistics/rng_category.rb +13 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/statistics/rng_gaussian.rb +12 -1
- data/lib/{rails-data-explorer → rails_data_explorer}/statistics/rng_power_law.rb +11 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/utils/color_scale.rb +6 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/utils/data_binner.rb +13 -8
- data/lib/{rails-data-explorer → rails_data_explorer}/utils/data_encoder.rb +2 -0
- data/lib/{rails-data-explorer → rails_data_explorer}/utils/data_quantizer.rb +8 -3
- data/lib/{rails-data-explorer → rails_data_explorer}/utils/rde_table.rb +14 -11
- data/lib/{rails-data-explorer → rails_data_explorer}/utils/value_formatter.rb +9 -4
- data/rails-data-explorer.gemspec +5 -6
- data/spec/rails_data_explorer/chart_spec.rb +11 -0
- data/spec/{rails-data-explorer → rails_data_explorer}/data_series_spec.rb +0 -0
- data/spec/rails_data_explorer/data_set_spec.rb +31 -0
- data/spec/rails_data_explorer/data_type/categorical_spec.rb +126 -0
- data/{lib/rails-data-explorer/chart/descriptive_statistics_table_group.rb → spec/rails_data_explorer/data_type/quantitative/decimal_spec.rb} +0 -0
- data/spec/rails_data_explorer/data_type/quantitative/integer_spec.rb +0 -0
- data/spec/rails_data_explorer/data_type/quantitative/temporal_spec.rb +34 -0
- data/spec/rails_data_explorer/data_type/quantitative_spec.rb +118 -0
- data/spec/rails_data_explorer/data_type_spec.rb +7 -0
- data/spec/{rails-data-explorer → rails_data_explorer}/exploration_spec.rb +5 -5
- data/spec/rails_data_explorer/statistics/pearsons_chi_squared_independence_test_spec.rb +0 -0
- data/spec/rails_data_explorer/utils/color_scale_spec.rb +13 -0
- data/spec/{rails-data-explorer → rails_data_explorer}/utils/data_binner_spec.rb +0 -0
- data/spec/{rails-data-explorer → rails_data_explorer}/utils/data_quantizer_spec.rb +0 -0
- data/spec/rails_data_explorer/utils/value_formatter_spec.rb +33 -0
- data/vendor/assets/stylesheets/sources/rde-default-style.css +5 -1
- metadata +91 -82
- data/lib/rails-data-explorer/active_record_extension.rb +0 -14
- data/lib/rails-data-explorer/constants.rb +0 -5
- data/lib/rails-data-explorer/data_series.rb +0 -156
- data/lib/rails-data-explorer/statistics/pearsons_chi_squared_independence_test.rb +0 -75
- data/spec/rails-data-explorer/data_type/categorical_spec.rb +0 -34
|
@@ -1,10 +1,19 @@
|
|
|
1
|
-
#
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
2
3
|
class RailsDataExplorer
|
|
4
|
+
|
|
5
|
+
# Responsibilities:
|
|
6
|
+
# * Container for DataSeries
|
|
7
|
+
#
|
|
8
|
+
# Collaborators:
|
|
9
|
+
# * DataSeries
|
|
10
|
+
# * Exploration
|
|
11
|
+
#
|
|
3
12
|
class DataSet
|
|
4
13
|
|
|
5
14
|
attr_reader :data_series
|
|
6
15
|
|
|
7
|
-
# @param[Array<Numeric, String, Symbol, Nil, Hash, DataSeries>]
|
|
16
|
+
# @param values_or_data_series [Array<Numeric, String, Symbol, Nil, Hash, DataSeries>]
|
|
8
17
|
# Array can contain the following:
|
|
9
18
|
# * Numeric, String, Symbol, Nil - for a single data series
|
|
10
19
|
# * Hash - for multiple data series with the following keys:
|
|
@@ -13,7 +22,7 @@ class RailsDataExplorer
|
|
|
13
22
|
# * :chart_roles [Array<Symbol>, optional] - what to use this series for. possible values: :x, :y, :color
|
|
14
23
|
# * :data_type (optional) - :quantitative, :categorical, :temporal
|
|
15
24
|
# * DataSeries
|
|
16
|
-
# @param[String]
|
|
25
|
+
# @param exploration_title [String] used as a fallback for the data series name
|
|
17
26
|
def initialize(values_or_data_series, exploration_title)
|
|
18
27
|
@data_series = initialize_data_series(values_or_data_series, exploration_title)
|
|
19
28
|
validate_data_series
|
|
@@ -39,7 +48,7 @@ class RailsDataExplorer
|
|
|
39
48
|
else
|
|
40
49
|
raise(
|
|
41
50
|
ArgumentError.new(
|
|
42
|
-
"Invalid datum. Only Hash, Numeric,
|
|
51
|
+
"Invalid datum. Only DataSeries, Hash, ActiveSupport::TimeWithZone, DateTime, Numeric, NilClass, String, or Symbol are allowed. " + \
|
|
43
52
|
"Found #{ values_or_data_series.first.class.to_s }."
|
|
44
53
|
)
|
|
45
54
|
)
|
|
@@ -1,4 +1,17 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
1
3
|
class RailsDataExplorer
|
|
4
|
+
|
|
5
|
+
# Responsibilities:
|
|
6
|
+
# * Represent a type of data
|
|
7
|
+
# * Determine available chart types
|
|
8
|
+
# * Compute descriptive statistics
|
|
9
|
+
# * Compute modified values
|
|
10
|
+
#
|
|
11
|
+
# Collaborators:
|
|
12
|
+
# * DataSeries
|
|
13
|
+
# * Chart
|
|
14
|
+
#
|
|
2
15
|
class DataType
|
|
3
16
|
|
|
4
17
|
# @param[Hash, optional] constraints
|
|
@@ -1,9 +1,17 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
1
3
|
class RailsDataExplorer
|
|
2
4
|
class DataType
|
|
5
|
+
|
|
6
|
+
# Responsibilities:
|
|
7
|
+
# * Provide available charts and statistics for categorical data type.
|
|
8
|
+
# * Provide methods for categorical data type.
|
|
9
|
+
#
|
|
10
|
+
# Collaborators:
|
|
11
|
+
# * DataSet
|
|
12
|
+
#
|
|
3
13
|
class Categorical < DataType
|
|
4
14
|
|
|
5
|
-
# TODO: when there are too many categories, only separate the N most
|
|
6
|
-
# significant ones and group all other values under "Other"
|
|
7
15
|
def all_available_chart_types
|
|
8
16
|
[
|
|
9
17
|
{
|
|
@@ -12,12 +20,12 @@ class RailsDataExplorer
|
|
|
12
20
|
dimensions_count_min: 1,
|
|
13
21
|
dimensions_count_max: 1,
|
|
14
22
|
},
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
23
|
+
{
|
|
24
|
+
chart_class: Chart::PieChart,
|
|
25
|
+
chart_roles: [:any],
|
|
26
|
+
dimensions_count_min: 1,
|
|
27
|
+
dimensions_count_max: 1,
|
|
28
|
+
},
|
|
21
29
|
{
|
|
22
30
|
chart_class: Chart::BoxPlotGroup,
|
|
23
31
|
chart_roles: [:y],
|
|
@@ -34,6 +42,12 @@ class RailsDataExplorer
|
|
|
34
42
|
chart_roles: [:dimension],
|
|
35
43
|
dimensions_count_min: 3,
|
|
36
44
|
},
|
|
45
|
+
{
|
|
46
|
+
chart_class: Chart::StackedBarChartCategorical,
|
|
47
|
+
chart_roles: [:x, :y],
|
|
48
|
+
dimensions_count_min: 2,
|
|
49
|
+
dimensions_count_max: 2,
|
|
50
|
+
},
|
|
37
51
|
{
|
|
38
52
|
chart_class: Chart::StackedBarChartCategoricalPercent,
|
|
39
53
|
chart_roles: [:x, :y],
|
|
@@ -67,7 +81,7 @@ class RailsDataExplorer
|
|
|
67
81
|
end
|
|
68
82
|
|
|
69
83
|
def descriptive_statistics(values)
|
|
70
|
-
frequencies = values
|
|
84
|
+
frequencies = compute_histogram(values)
|
|
71
85
|
labels_ds = DataSeries.new('_', values.uniq)
|
|
72
86
|
total_count = values.length
|
|
73
87
|
ruby_formatters = {
|
|
@@ -168,25 +182,43 @@ class RailsDataExplorer
|
|
|
168
182
|
%(function(d) { return d })
|
|
169
183
|
end
|
|
170
184
|
|
|
171
|
-
# @param[Symbol, nil]
|
|
172
|
-
# @param[DataSeries]
|
|
173
|
-
# @param[Proc]
|
|
174
|
-
# @return[Proc] a Proc that will be used by #sort
|
|
185
|
+
# @param label_val_key [Symbol, nil] the hash key to use to get the label value during sort (sent to a,b)
|
|
186
|
+
# @param data_series [DataSeries] the ds that contains the uniq vals
|
|
187
|
+
# @param value_sorter [Proc] the sorting proc to use if not sorted numerically
|
|
188
|
+
# @return [Proc] a Proc that will be used by #sort
|
|
175
189
|
def label_sorter(label_val_key, data_series, value_sorter)
|
|
176
190
|
if data_series.uniq_vals.any? { |e| e.to_s =~ /^[\+\-]?\d+/ }
|
|
177
191
|
# Sort numerical categories by key ASC
|
|
192
|
+
# This lambda can be used in conjunction with `#sort`.
|
|
193
|
+
# It returns -1, 0, or 1
|
|
178
194
|
lambda { |a,b|
|
|
179
195
|
number_and_full_string_extractor = lambda { |val|
|
|
180
196
|
str = label_val_key ? val[label_val_key] : val
|
|
181
197
|
number = str.gsub(/^[^\d\+\-]*/, '') # remove non-digit leading chars
|
|
182
198
|
.gsub(',', '') # remove delimiter commas, they throw off to_f parsing
|
|
183
|
-
|
|
184
|
-
|
|
199
|
+
if '' != number
|
|
200
|
+
# label contains digits
|
|
201
|
+
number = number.to_f
|
|
202
|
+
number += 1 if str =~ /^>/ # increase highest threshold by one for proper sorting
|
|
203
|
+
number -= 1 if str =~ /^</ # decrease lowest threshold by one for proper sorting
|
|
204
|
+
else
|
|
205
|
+
# label doesn't contain digits, set to nil to sort at end
|
|
206
|
+
number = nil
|
|
207
|
+
end
|
|
185
208
|
[number, str]
|
|
186
209
|
}
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
210
|
+
a_num, a_str = number_and_full_string_extractor.call(a)
|
|
211
|
+
b_num, b_str = number_and_full_string_extractor.call(b)
|
|
212
|
+
if a_num && b_num
|
|
213
|
+
# Both numbers are present, compare them
|
|
214
|
+
[a_num, a_str] <=> [b_num, b_str]
|
|
215
|
+
elsif a_num
|
|
216
|
+
# a_num is present, b_num isn't. Sort a before b
|
|
217
|
+
-1
|
|
218
|
+
else
|
|
219
|
+
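# a_num is not present (b_num may or may not be). Sort a after b.
# NOTE(review): when neither label has a number this returns 1 for both
# (a, b) and (b, a), so their relative order is unspecified.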
|
|
220
|
+
1
|
|
221
|
+
end
|
|
190
222
|
}
|
|
191
223
|
else
|
|
192
224
|
# Use provided value sorter
|
|
@@ -194,6 +226,35 @@ class RailsDataExplorer
|
|
|
194
226
|
end
|
|
195
227
|
end
|
|
196
228
|
|
|
229
|
+
# Returns values, keeping only the N most frequent distinct observations. Groups
|
|
230
|
+
# less frequent observations under val_for_others.
|
|
231
|
+
# @param values [Array]
|
|
232
|
+
# @param max_num_vals [Integer] the max number of distinct values to return (including val_for_others)
|
|
233
|
+
# @param val_for_others [String, optional] defaults to '[Other]'
|
|
234
|
+
def limit_distinct_values(values, max_num_vals, val_for_others = nil)
  distinct_values = values.uniq
  # Return values if they already have lte max_num_vals distinct observations
  return values if distinct_values.length <= max_num_vals

  val_for_others ||= '[Other]'

  # Frequency ties are broken by value ASC. Categorical observations may mix
  # types that define no order between each other (e.g. nil, Symbols and
  # Strings); `<=>` then returns nil and #sort would raise ArgumentError.
  # In that case break ties on the string representation instead. When all
  # observations are mutually comparable the natural order is used.
  reference = distinct_values.first
  naturally_ordered = distinct_values.all? { |v| !(v <=> reference).nil? }
  tie_breaker = naturally_ordered ? lambda { |v| v } : lambda { |v| v.to_s }

  frequencies = compute_histogram(values)
  top_vals = frequencies.to_a.sort { |a, b|
    # a = [value, frequency]
    # Sort by frequency DESC, value ASC
    [b.last, tie_breaker.call(a.first)] <=> [a.last, tie_breaker.call(b.first)]
  }.first(max_num_vals - 1).map { |e| e.first }
  # One slot is reserved for val_for_others, hence max_num_vals - 1 above.
  values.map { |e| top_vals.include?(e) ? e : val_for_others }
end
|
|
248
|
+
|
|
249
|
+
protected
|
|
250
|
+
|
|
251
|
+
# Computes a histogram for values
|
|
252
|
+
# @param values [Array]
|
|
253
|
+
# @return a Hash with distinct vals as keys and their frequency as value
|
|
254
|
+
def compute_histogram(values)
  # The returned Hash keeps a default of 0, so looking up a value that was
  # never observed yields 0 rather than nil.
  values.each_with_object(Hash.new(0)) { |observation, histogram|
    histogram[observation] += 1
  }
end
|
|
257
|
+
|
|
197
258
|
end
|
|
198
259
|
end
|
|
199
260
|
end
|
|
@@ -1,8 +1,18 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
1
3
|
class RailsDataExplorer
|
|
2
4
|
class DataType
|
|
3
|
-
class Quantitative < DataType
|
|
4
5
|
|
|
5
|
-
|
|
6
|
+
# This is an abstract class. Use one of its subclasses instead.
|
|
7
|
+
#
|
|
8
|
+
# Responsibilities:
|
|
9
|
+
# * Provide available charts and statistics for quantitative data type.
|
|
10
|
+
# * Provide methods for quantitative data type.
|
|
11
|
+
#
|
|
12
|
+
# Collaborators:
|
|
13
|
+
# * DataSet
|
|
14
|
+
#
|
|
15
|
+
class Quantitative < DataType
|
|
6
16
|
|
|
7
17
|
def all_available_chart_types
|
|
8
18
|
[
|
|
@@ -122,9 +132,9 @@ class RailsDataExplorer
|
|
|
122
132
|
raise "Implement me in sub_class"
|
|
123
133
|
end
|
|
124
134
|
|
|
125
|
-
def axis_scale(data_series, d3_or_vega)
|
|
135
|
+
def axis_scale(data_series, modification, d3_or_vega)
|
|
126
136
|
# Log scales can't handle 0 values
|
|
127
|
-
if data_series.min_val > 0.0 && data_series.has_large_dynamic_range?
|
|
137
|
+
if data_series.min_val(modification) > 0.0 && data_series.has_large_dynamic_range?(modification)
|
|
128
138
|
{ d3: 'd3.scale.log', vega: 'log' }[d3_or_vega]
|
|
129
139
|
else
|
|
130
140
|
{ d3: 'd3.scale.linear', vega: 'linear' }[d3_or_vega]
|
|
@@ -1,6 +1,15 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
1
3
|
class RailsDataExplorer
|
|
2
4
|
class DataType
|
|
3
5
|
class Quantitative
|
|
6
|
+
|
|
7
|
+
# Responsibilities:
|
|
8
|
+
# * Provide methods for decimal quantitative data type.
|
|
9
|
+
#
|
|
10
|
+
# Collaborators:
|
|
11
|
+
# * DataSet
|
|
12
|
+
#
|
|
4
13
|
class Decimal < Quantitative
|
|
5
14
|
|
|
6
15
|
def axis_tick_format(values)
|
|
@@ -1,6 +1,15 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
1
3
|
class RailsDataExplorer
|
|
2
4
|
class DataType
|
|
3
5
|
class Quantitative
|
|
6
|
+
|
|
7
|
+
# Responsibilities:
|
|
8
|
+
# * Provide methods for integer quantitative data type.
|
|
9
|
+
#
|
|
10
|
+
# Collaborators:
|
|
11
|
+
# * DataSet
|
|
12
|
+
#
|
|
4
13
|
class Integer < Quantitative
|
|
5
14
|
|
|
6
15
|
def axis_tick_format(values)
|
|
@@ -1,6 +1,15 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
1
3
|
class RailsDataExplorer
|
|
2
4
|
class DataType
|
|
3
5
|
class Quantitative
|
|
6
|
+
|
|
7
|
+
# Responsibilities:
|
|
8
|
+
# * Provide methods for temporal quantitative data type.
|
|
9
|
+
#
|
|
10
|
+
# Collaborators:
|
|
11
|
+
# * DataSet
|
|
12
|
+
#
|
|
4
13
|
class Temporal < Quantitative
|
|
5
14
|
|
|
6
15
|
def all_available_chart_types
|
|
@@ -1,6 +1,18 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
1
3
|
require 'rails'
|
|
2
4
|
|
|
3
5
|
class RailsDataExplorer
|
|
6
|
+
|
|
7
|
+
# Responsibilities:
|
|
8
|
+
# * Tie RailsDataExplorer into a Rails app
|
|
9
|
+
# * Initialize ActionViewExtension
|
|
10
|
+
# * Tell rails which assets to precompile
|
|
11
|
+
#
|
|
12
|
+
# Collaborators:
|
|
13
|
+
# * ActiveSupport
|
|
14
|
+
# * RailsDataExplorer
|
|
15
|
+
#
|
|
4
16
|
class Engine < ::Rails::Engine
|
|
5
17
|
|
|
6
18
|
# It's an engine so that we can add javascript and image assets
|
|
@@ -1,4 +1,15 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
1
3
|
class RailsDataExplorer
|
|
4
|
+
|
|
5
|
+
# Responsibilities:
|
|
6
|
+
# * Represent and initialize a data exploration
|
|
7
|
+
# * Initialize and render self (including charts)
|
|
8
|
+
#
|
|
9
|
+
# Collaborators:
|
|
10
|
+
# * DataSet
|
|
11
|
+
# * Chart
|
|
12
|
+
#
|
|
2
13
|
class Exploration
|
|
3
14
|
|
|
4
15
|
attr_accessor :output_buffer # required for content_tag
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
class RailsDataExplorer
|
|
4
|
+
module Statistics
|
|
5
|
+
|
|
6
|
+
# From http://en.wikipedia.org/wiki/Pearson's_chi-squared_test
|
|
7
|
+
|
|
8
|
+
# Pearson's chi-squared test is used to assess whether paired observations on two
|
|
9
|
+
# variables, expressed in a contingency table, are independent of each other.
|
|
10
|
+
|
|
11
|
+
# An "observation" consists of the values of two outcomes and the null hypothesis
|
|
12
|
+
# is that the occurrence of these outcomes is statistically independent. Each
|
|
13
|
+
# observation is allocated to one cell of a two-dimensional array of cells (called
|
|
14
|
+
# a contingency table) according to the values of the two outcomes.
|
|
15
|
+
|
|
16
|
+
# Assumptions
|
|
17
|
+
# -----------
|
|
18
|
+
|
|
19
|
+
# The chi-squared test, when used with the standard approximation that a chi-
|
|
20
|
+
# squared distribution is applicable, has the following assumptions:
|
|
21
|
+
|
|
22
|
+
# * Simple random sample – The sample data is a random sampling from a fixed
|
|
23
|
+
# distribution or population where every collection of members of the population
|
|
24
|
+
# of the given sample size has an equal probability of selection. Variants of
|
|
25
|
+
# the test have been developed for complex samples, such as where the data is
|
|
26
|
+
# weighted. Other forms can be used such as purposive sampling.
|
|
27
|
+
# * Sample size (whole table) – A sample with a sufficiently large size is assumed.
|
|
28
|
+
# If a chi squared test is conducted on a sample with a smaller size, then the
|
|
29
|
+
# chi squared test will yield an inaccurate inference. The researcher, by using
|
|
30
|
+
# chi squared test on small samples, might end up committing a Type II error.
|
|
31
|
+
# * Expected cell count – Adequate expected cell counts. Some require 5 or more,
|
|
32
|
+
# and others require 10 or more. A common rule is 5 or more in all cells of a
|
|
33
|
+
# 2-by-2 table, and 5 or more in 80% of cells in larger tables, but no cells
|
|
34
|
+
# with zero expected count. When this assumption is not met, Yates's Correction
|
|
35
|
+
# is applied.
|
|
36
|
+
# * Independence – The observations are always assumed to be independent of each
|
|
37
|
+
# other. This means chi-squared cannot be used to test correlated data
|
|
38
|
+
# (like matched pairs or panel data). In those cases you might want to turn to
|
|
39
|
+
# McNemar's test.
|
|
40
|
+
|
|
41
|
+
# Problems
|
|
42
|
+
# --------
|
|
43
|
+
|
|
44
|
+
# The approximation to the chi-squared distribution breaks down if expected
|
|
45
|
+
# frequencies are too low. It will normally be acceptable so long as no more than
|
|
46
|
+
# 20% of the events have expected frequencies below 5. Where there is only 1
|
|
47
|
+
# degree of freedom, the approximation is not reliable if expected frequencies are
|
|
48
|
+
# below 10. In this case, a better approximation can be obtained by reducing the
|
|
49
|
+
# absolute value of each difference between observed and expected frequencies by
|
|
50
|
+
# 0.5 before squaring; this is called Yates's correction for continuity.
|
|
51
|
+
|
|
52
|
+
# In cases where the expected value, E, is found to be small (indicating a small
|
|
53
|
+
# underlying population probability, and/or a small number of observations), the
|
|
54
|
+
# normal approximation of the multinomial distribution can fail, and in such cases
|
|
55
|
+
# it is found to be more appropriate to use the G-test, a likelihood ratio-based
|
|
56
|
+
# test statistic. Where the total sample size is small, it is necessary to use an
|
|
57
|
+
# appropriate exact test, typically either the binomial test or (for contingency
|
|
58
|
+
# tables) Fisher's exact test. This test uses the conditional distribution of the
|
|
59
|
+
# test statistic given the marginal totals; however, it does not assume that the
|
|
60
|
+
# data were generated from an experiment in which the marginal totals are fixed
|
|
61
|
+
# and is valid whether or not that is the case.
|
|
62
|
+
class PearsonsChiSquaredIndependenceTest

  # NOTE: the body is empty; both arguments are accepted but not stored.
  # @param data_matrix [Object] presumably the contingency table of observed
  #   frequencies -- TODO confirm against callers
  # @param min_probability [Float, optional] defaults to 0.05; presumably the
  #   significance threshold -- TODO confirm
  def initialize(data_matrix, min_probability = 0.05)
  end

  # Placeholder: the body is empty, so this currently returns nil.
  def compute
  end

end
|
|
71
|
+
end
|
|
72
|
+
end
|
|
@@ -1,7 +1,17 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
1
3
|
class RailsDataExplorer
|
|
2
4
|
module Statistics
|
|
5
|
+
|
|
6
|
+
# Responsibilities:
|
|
7
|
+
# * Provide random categorical data. Useful for testing and demo data.
|
|
8
|
+
#
|
|
3
9
|
class RngCategory
|
|
4
10
|
|
|
11
|
+
# @param categories [Array<Object>] the pool of available categories.
|
|
12
|
+
# @param category_probabilities [Array, optional] probability of each category.
|
|
13
|
+
# @param rng [Proc, optional] lambda to generate random numbers which will
|
|
14
|
+
# be mapped to categories.
|
|
5
15
|
def initialize(categories, category_probabilities = nil, rng = lambda { Kernel.rand })
|
|
6
16
|
@categories, @category_probabilities, @rng = categories, category_probabilities, rng
|
|
7
17
|
@category_probabilities ||= @categories.map { |e| @rng.call }
|
|
@@ -9,6 +19,7 @@ class RailsDataExplorer
|
|
|
9
19
|
@category_order = compute_category_order
|
|
10
20
|
end
|
|
11
21
|
|
|
22
|
+
# Returns a random category
|
|
12
23
|
def rand
|
|
13
24
|
r_v = @rng.call
|
|
14
25
|
rnd = @category_order.detect { |e|
|
|
@@ -17,6 +28,8 @@ class RailsDataExplorer
|
|
|
17
28
|
rnd[:category]
|
|
18
29
|
end
|
|
19
30
|
|
|
31
|
+
protected
|
|
32
|
+
|
|
20
33
|
def normalize_category_probabilities
|
|
21
34
|
total = @category_probabilities.inject(0) { |m,e| m += e }
|
|
22
35
|
@category_probabilities.map { |e| e / total.to_f }
|
|
@@ -1,12 +1,23 @@
|
|
|
1
|
-
#
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
2
3
|
class RailsDataExplorer
|
|
3
4
|
module Statistics
|
|
5
|
+
|
|
6
|
+
# Responsibilities:
|
|
7
|
+
# * Provide random numeric data, following a gaussian distribution.
|
|
8
|
+
#
|
|
9
|
+
# From http://stackoverflow.com/a/9266488
|
|
4
10
|
class RngGaussian
|
|
11
|
+
|
|
12
|
+
# @param mean [Float] the expected mean
|
|
13
|
+
# @param sd [Float] the expected standard deviation
|
|
14
|
+
# @param rng [Proc, optional] a random number generator
|
|
5
15
|
def initialize(mean = 0.0, sd = 1.0, rng = lambda { Kernel.rand })
  @mean, @sd, @rng = mean, sd, rng
  # Toggled at the start of every #rand call. Starting at false means the
  # first #rand call flips it to true and computes a pair of values.
  @compute_next_pair = false
end
|
|
9
19
|
|
|
20
|
+
# Returns random numbers with a gaussian distribution.
|
|
10
21
|
def rand
|
|
11
22
|
if (@compute_next_pair = !@compute_next_pair)
|
|
12
23
|
# Compute a pair of random values with normal distribution.
|