rails-data-explorer 0.2.3 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (71) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +19 -3
  3. data/README.md +2 -0
  4. data/lib/rails-data-explorer-no-rails.rb +36 -32
  5. data/lib/rails-data-explorer.rb +38 -35
  6. data/lib/rails_data_explorer.rb +29 -10
  7. data/lib/{rails-data-explorer → rails_data_explorer}/action_view_extension.rb +39 -17
  8. data/lib/rails_data_explorer/active_record_extension.rb +19 -0
  9. data/lib/{rails-data-explorer → rails_data_explorer}/chart.rb +10 -0
  10. data/lib/rails_data_explorer/chart/anova.rb +1 -0
  11. data/lib/{rails-data-explorer → rails_data_explorer}/chart/box_plot.rb +12 -3
  12. data/lib/{rails-data-explorer → rails_data_explorer}/chart/box_plot_group.rb +49 -22
  13. data/lib/{rails-data-explorer → rails_data_explorer}/chart/contingency_table.rb +19 -8
  14. data/lib/{rails-data-explorer → rails_data_explorer}/chart/descriptive_statistics_table.rb +9 -0
  15. data/lib/rails_data_explorer/chart/descriptive_statistics_table_group.rb +1 -0
  16. data/lib/{rails-data-explorer → rails_data_explorer}/chart/histogram_categorical.rb +12 -8
  17. data/lib/{rails-data-explorer → rails_data_explorer}/chart/histogram_quantitative.rb +12 -2
  18. data/lib/{rails-data-explorer → rails_data_explorer}/chart/histogram_temporal.rb +11 -2
  19. data/lib/{rails-data-explorer → rails_data_explorer}/chart/multi_dimensional_charts.rb +2 -0
  20. data/lib/{rails-data-explorer → rails_data_explorer}/chart/parallel_coordinates.rb +11 -1
  21. data/lib/{rails-data-explorer → rails_data_explorer}/chart/parallel_set.rb +11 -2
  22. data/lib/{rails-data-explorer → rails_data_explorer}/chart/pie_chart.rb +12 -8
  23. data/lib/{rails-data-explorer → rails_data_explorer}/chart/scatterplot.rb +13 -1
  24. data/lib/{rails-data-explorer → rails_data_explorer}/chart/scatterplot_matrix.rb +2 -0
  25. data/lib/{rails-data-explorer/chart/stacked_bar_chart_categorical_percent.rb → rails_data_explorer/chart/stacked_bar_chart_categorical.rb} +37 -14
  26. data/lib/rails_data_explorer/chart/stacked_bar_chart_categorical_percent.rb +28 -0
  27. data/lib/rails_data_explorer/chart/stacked_histogram_temporal.rb +199 -0
  28. data/lib/rails_data_explorer/data_series.rb +241 -0
  29. data/lib/{rails-data-explorer → rails_data_explorer}/data_set.rb +13 -4
  30. data/lib/{rails-data-explorer → rails_data_explorer}/data_type.rb +13 -0
  31. data/lib/{rails-data-explorer → rails_data_explorer}/data_type/categorical.rb +79 -18
  32. data/lib/{rails-data-explorer → rails_data_explorer}/data_type/geo.rb +2 -0
  33. data/lib/{rails-data-explorer → rails_data_explorer}/data_type/quantitative.rb +14 -4
  34. data/lib/{rails-data-explorer → rails_data_explorer}/data_type/quantitative/decimal.rb +9 -0
  35. data/lib/{rails-data-explorer → rails_data_explorer}/data_type/quantitative/integer.rb +9 -0
  36. data/lib/{rails-data-explorer → rails_data_explorer}/data_type/quantitative/temporal.rb +9 -0
  37. data/lib/{rails-data-explorer → rails_data_explorer}/engine.rb +12 -0
  38. data/lib/{rails-data-explorer → rails_data_explorer}/exploration.rb +11 -0
  39. data/lib/rails_data_explorer/statistics/pearsons_chi_squared_independence_test.rb +72 -0
  40. data/lib/{rails-data-explorer → rails_data_explorer}/statistics/rng_category.rb +13 -0
  41. data/lib/{rails-data-explorer → rails_data_explorer}/statistics/rng_gaussian.rb +12 -1
  42. data/lib/{rails-data-explorer → rails_data_explorer}/statistics/rng_power_law.rb +11 -0
  43. data/lib/{rails-data-explorer → rails_data_explorer}/utils/color_scale.rb +6 -0
  44. data/lib/{rails-data-explorer → rails_data_explorer}/utils/data_binner.rb +13 -8
  45. data/lib/{rails-data-explorer → rails_data_explorer}/utils/data_encoder.rb +2 -0
  46. data/lib/{rails-data-explorer → rails_data_explorer}/utils/data_quantizer.rb +8 -3
  47. data/lib/{rails-data-explorer → rails_data_explorer}/utils/rde_table.rb +14 -11
  48. data/lib/{rails-data-explorer → rails_data_explorer}/utils/value_formatter.rb +9 -4
  49. data/rails-data-explorer.gemspec +5 -6
  50. data/spec/rails_data_explorer/chart_spec.rb +11 -0
  51. data/spec/{rails-data-explorer → rails_data_explorer}/data_series_spec.rb +0 -0
  52. data/spec/rails_data_explorer/data_set_spec.rb +31 -0
  53. data/spec/rails_data_explorer/data_type/categorical_spec.rb +126 -0
  54. data/{lib/rails-data-explorer/chart/descriptive_statistics_table_group.rb → spec/rails_data_explorer/data_type/quantitative/decimal_spec.rb} +0 -0
  55. data/spec/rails_data_explorer/data_type/quantitative/integer_spec.rb +0 -0
  56. data/spec/rails_data_explorer/data_type/quantitative/temporal_spec.rb +34 -0
  57. data/spec/rails_data_explorer/data_type/quantitative_spec.rb +118 -0
  58. data/spec/rails_data_explorer/data_type_spec.rb +7 -0
  59. data/spec/{rails-data-explorer → rails_data_explorer}/exploration_spec.rb +5 -5
  60. data/spec/rails_data_explorer/statistics/pearsons_chi_squared_independence_test_spec.rb +0 -0
  61. data/spec/rails_data_explorer/utils/color_scale_spec.rb +13 -0
  62. data/spec/{rails-data-explorer → rails_data_explorer}/utils/data_binner_spec.rb +0 -0
  63. data/spec/{rails-data-explorer → rails_data_explorer}/utils/data_quantizer_spec.rb +0 -0
  64. data/spec/rails_data_explorer/utils/value_formatter_spec.rb +33 -0
  65. data/vendor/assets/stylesheets/sources/rde-default-style.css +5 -1
  66. metadata +91 -82
  67. data/lib/rails-data-explorer/active_record_extension.rb +0 -14
  68. data/lib/rails-data-explorer/constants.rb +0 -5
  69. data/lib/rails-data-explorer/data_series.rb +0 -156
  70. data/lib/rails-data-explorer/statistics/pearsons_chi_squared_independence_test.rb +0 -75
  71. data/spec/rails-data-explorer/data_type/categorical_spec.rb +0 -34
@@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-

class RailsDataExplorer
  class Chart

    # Responsibilities:
    #  * Render a stacked bar chart for bivariate analysis of two categorical
    #    data series. Bar segments show the percentage distribution of the
    #    y data series instead of absolute values.
    #
    # Collaborators:
    #  * DataSet
    #
    class StackedBarChartCategoricalPercent < StackedBarChartCategorical

      # Computes the y value of one bar segment as a percentage of its row
      # total. Customization hook: presumably StackedBarChartCategorical
      # defines the absolute-value variant of this method (not visible here).
      #
      # @param data_matrix [Hash] nested Hash, keyed by x value first and by
      #   y value second. Each row also carries its total under the :_sum key.
      # @param x_val [Object] key of the row to read
      # @param y_val [Object] key of the cell to read within that row
      # @return [Float] the cell's share of the row total, scaled to 0..100
      def compute_y_value(data_matrix, x_val, y_val)
        cell = data_matrix[x_val][y_val]
        row_total = data_matrix[x_val][:_sum].to_f
        share = cell / row_total
        share * 100
      end

      # Builds the label for the y axis.
      #
      # @param y_ds_name [String] name of the y data series
      # @return [String]
      def compute_y_axis_label(y_ds_name)
        "#{ y_ds_name } distribution [%]"
      end

    end
  end
end
@@ -0,0 +1,199 @@
# -*- coding: utf-8 -*-

class RailsDataExplorer
  class Chart

    # Responsibilities:
    #  * Render a stacked bar chart for bivariate analysis of a temporal and a
    #    categorical data series. Each bar segment shows how often a y value
    #    occurs together with an x value (absolute counts).
    #
    # Collaborators:
    #  * DataSet
    #
    class StackedHistogramTemporal < Chart

      # @param _data_set [DataSet] provides #data_series and #dimensions_count
      # @param options [Hash, optional] stored as a shallow copy
      def initialize(_data_set, options = {})
        @data_set = _data_set
        @options = {}.merge(options)
      end

      # Computes everything #render_vega needs to draw the chart.
      #
      # @return [Hash, false] Hash with keys :values, :x_axis_label,
      #   :x_axis_tick_format, :y_axis_label and :y_axis_tick_format, or false
      #   if no suitable x or y data series is available.
      # @raise [ArgumentError] if suitable data series exist but the data set
      #   does not have exactly two dimensions
      def compute_chart_attrs
        x_ds, y_ds = pick_data_series
        return false if x_ds.nil? || y_ds.nil?
        # Fail before doing any of the counting work below.
        if 2 != @data_set.dimensions_count
          raise(ArgumentError.new("Exactly two data series required for stacked histogram (temporal)."))
        end

        data_matrix = build_data_matrix(x_ds, y_ds)

        # Without a label based ordering from the data type, fall back to
        # sorting by descending frequency.
        x_sorted_keys = x_ds.uniq_vals.sort(
          &x_ds.label_sorter(
            nil,
            lambda { |a,b| data_matrix[b][:_sum] <=> data_matrix[a][:_sum] }
          )
        )
        y_sorted_keys = y_ds.uniq_vals.sort(
          &y_ds.label_sorter(
            nil,
            lambda { |a,b| data_matrix[:_sum][b] <=> data_matrix[:_sum][a] }
          )
        )

        # One record per (y, x) combination. Records are grouped by y value
        # since that is the facet key used for stacking in #render_vega.
        values = y_sorted_keys.flat_map { |y_val|
          x_sorted_keys.map { |x_val|
            { x: x_val, y: data_matrix[x_val][y_val], c: y_val }
          }
        }

        {
          values: values,
          x_axis_label: x_ds.name,
          x_axis_tick_format: 'function(d) { return d }',
          # y values are absolute counts, so neither the label nor the tick
          # format may refer to percentages.
          y_axis_label: "#{ y_ds.name } frequency",
          y_axis_tick_format: "d3.format('d')",
        }
      end

      # @return [String] the chart's HTML/JS, or an empty string if the chart
      #   should not be rendered or can't be computed.
      def render
        return '' unless render?
        ca = compute_chart_attrs
        return '' unless ca
        render_vega(ca)
      end

      # Renders the markup and the vega spec for the chart.
      #
      # @param ca [Hash] chart attributes as returned by #compute_chart_attrs
      # @return [String]
      def render_vega(ca)
        # Axis titles are emitted as JSON string literals so that quotes or
        # backslashes in a data series name can't break the generated script.
        x_axis_title = ca[:x_axis_label].to_s.to_json
        y_axis_title = ca[:y_axis_label].to_s.to_json
        %(
          <div class="rde-chart rde-stacked-histogram-temporal">
            <h3 class="rde-chart-title">Stacked Histogram (temporal)</h3>
            <div id="#{ dom_id }"></div>
            <script type="text/javascript">
              (function() {
                var spec = {
                  "width": 960,
                  "height": 200,
                  "padding": {"top": 10, "left": 50, "bottom": 50, "right": 100},
                  "data": [
                    {
                      "name": "table",
                      "values": #{ ca[:values].to_json }
                    },
                    {
                      "name": "stats",
                      "source": "table",
                      "transform": [
                        {"type": "facet", "keys": ["data.x"]},
                        {"type": "stats", "value": "data.y"}
                      ]
                    }
                  ],
                  "scales": [
                    {
                      "name": "x",
                      "type": "ordinal",
                      "range": "width",
                      "domain": {"data": "table", "field": "data.x"}
                    },
                    {
                      "name": "y",
                      "type": "linear",
                      "range": "height",
                      "nice": true,
                      "domain": {"data": "stats", "field": "sum"}
                    },
                    {
                      "name": "color",
                      "type": "ordinal",
                      "range": "category10"
                    }
                  ],
                  "axes": [
                    {
                      "type": "x",
                      "scale": "x",
                      "title": #{ x_axis_title },
                      "format": #{ ca[:x_axis_tick_format] }
                    },
                    {
                      "type": "y",
                      "scale": "y",
                      "title": #{ y_axis_title },
                      "format": #{ ca[:y_axis_tick_format] }
                    }
                  ],
                  "marks": [
                    {
                      "type": "group",
                      "from": {
                        "data": "table",
                        "transform": [
                          {"type": "facet", "keys": ["data.c"]},
                          {"type": "stack", "point": "data.x", "height": "data.y"}
                        ]
                      },
                      "marks": [
                        {
                          "type": "rect",
                          "properties": {
                            "enter": {
                              "x": {"scale": "x", "field": "data.x"},
                              "width": {"scale": "x", "band": true, "offset": -1},
                              "y": {"scale": "y", "field": "y"},
                              "y2": {"scale": "y", "field": "y2"},
                              "fill": {"scale": "color", "field": "data.c"}
                            }
                          }
                        }
                      ]
                    }
                  ],
                  "legends": [
                    {
                      "fill": "color"
                    }
                  ]
                };

                vg.parse.spec(spec, function(chart) {
                  var view = chart({ el:"##{ dom_id }" }).update();
                });

              })();
            </script>
          </div>
        )
      end

    private

      # Picks the data series for the x and y axis based on their chart roles.
      # Of the x candidates, the one with the most uniq values wins. The y
      # data series is the first y candidate that is not the x data series.
      #
      # @return [Array] two element Array [x_ds, y_ds], either may be nil
      def pick_data_series
        x_candidates = @data_set.data_series.find_all { |ds|
          (ds.chart_roles[Chart::StackedHistogramTemporal] & [:x, :any]).any?
        }.sort { |a,b| b.uniq_vals.length <=> a.uniq_vals.length }
        y_candidates = @data_set.data_series.find_all { |ds|
          (ds.chart_roles[Chart::StackedHistogramTemporal] & [:y, :any]).any?
        }
        x_ds = x_candidates.first
        [x_ds, (y_candidates - [x_ds]).first]
      end

      # Counts how often each combination of x and y values occurs.
      #
      # @return [Hash] data_matrix[x_val][y_val] is the count for the
      #   combination, data_matrix[x_val][:_sum] the total per x value,
      #   data_matrix[:_sum][y_val] the total per y value, and
      #   data_matrix[:_sum][:_sum] the grand total.
      def build_data_matrix(x_ds, y_ds)
        y_vals = y_ds.uniq_vals
        data_matrix = { _sum: { _sum: 0 } }
        y_vals.each { |y_val| data_matrix[:_sum][y_val] = 0 }
        x_ds.uniq_vals.each { |x_val|
          data_matrix[x_val] = { _sum: 0 }
          y_vals.each { |y_val| data_matrix[x_val][y_val] = 0 }
        }
        # Values at the same index of both data series form one observation.
        x_ds.values.zip(y_ds.values) { |x_val, y_val|
          data_matrix[x_val][y_val] += 1
          data_matrix[:_sum][y_val] += 1
          data_matrix[x_val][:_sum] += 1
          data_matrix[:_sum][:_sum] += 1
        }
        data_matrix
      end

    end
  end
end
@@ -0,0 +1,241 @@
# -*- coding: utf-8 -*-

class RailsDataExplorer

  # NOTE: DataSeries values are immutable once instantiated.
  #
  # Responsibilities:
  #  * Represent a data series
  #  * Compute statistics
  #  * Compute chart attributes
  #  * Cache computed properties like values, statistics
  #  * Provide modified versions of values (currently :limit_distinct_values)
  #
  # Collaborators:
  #  * DataType
  #
  class DataSeries

    # TODO: Add concept of significant figures for rounding values when displaying them
    # http://en.wikipedia.org/wiki/Significant_figures

    attr_reader :data_type, :name, :chart_roles
    delegate :available_chart_types, to: :data_type, prefix: false
    delegate :available_chart_roles, to: :data_type, prefix: false

    # Any data series with a dynamic range greater than this is considered
    # having a large dynamic range.
    # We consider dynamic range the ratio between the largest and the smallest value.
    def self.large_dynamic_range_threshold
      10000.0
    end

    # Any data series with more than this uniq vals is considered having many
    # uniq values.
    def self.many_uniq_vals_threshold
      20
    end

    # @param _name [String] name of the data series
    # @param _values [Array] the raw values
    # @param options [Hash, optional]
    #   * :chart_roles [Array<Symbol>] restricts the roles this data series
    #     plays in charts. Defaults to all roles offered by the data type.
    #   * :data_type [DataType] inferred from the first non-nil value if not given.
    #   * :max_num_distinct_values, :val_for_others: defaults for the
    #     :limit_distinct_values modification (see #values).
    def initialize(_name, _values, options={})
      options = { chart_roles: [], data_type: nil }.merge(options)
      @name = _name
      @values = _values
      @data_type = init_data_type(options[:data_type])
      @chart_roles = init_chart_roles(options[:chart_roles]) # after data_type!
      @options = options
    end

    # Returns descriptive_statistics as a flat Array
    # (see #values)
    def descriptive_statistics(modification = {})
      memoized(@cached_descriptive_statistics ||= {}, modification) {
        data_type.descriptive_statistics(values(modification))
      }
    end

    # Returns descriptive_statistics as a renderable table structure
    # (see #values)
    def descriptive_statistics_table(modification = {})
      memoized(@cached_descriptive_statistics_table ||= {}, modification) {
        data_type.descriptive_statistics_table(values(modification))
      }
    end

    # Returns the number of values, including nils.
    # (see #values)
    def number_of_values(modification = {})
      memoized(@cached_number_of_values ||= {}, modification) {
        values(modification).length
      }
    end

    # Returns a short String representation of the values: all of them if
    # they are few or short, otherwise just the first and the last one.
    # (see #values)
    def values_summary(modification = {})
      memoized(@cached_values_summary ||= {}, modification) {
        v = values(modification)
        if v.length < 3 || v.inspect.length < 80
          v.inspect
        else
          "[#{ v.first } ... #{ v.last }]"
        end
      }
    end

    # Returns the values for this data series with an optional modification
    # @param modification [Hash, optional] type of modification.
    #   {} returns the unmodified values.
    #   {
    #     name: :limit_distinct_values,
    #     max_num_distinct_values: 20,
    #     val_for_others: '[Other]',
    #   }
    #   Any other :name (including :compress_quantitative_values) is not
    #   handled and raises.
    # @raise [RuntimeError] if the modification is not handled
    def values(modification = {})
      memoized(@cached_values ||= {}, modification) {
        case modification[:name]
        when nil
          @values
        when :limit_distinct_values
          # Returns variant of self's values with number of distinct values limited
          # to :max_num_distinct_values. Less frequent values are mapped to
          # :val_for_others.
          # Settings given in modification win over the ones given at
          # initialization.
          data_type.limit_distinct_values(
            @values,
            (
              modification[:max_num_distinct_values] ||
              @options[:max_num_distinct_values] ||
              self.class.many_uniq_vals_threshold
            ),
            (
              modification[:val_for_others] ||
              @options[:val_for_others]
            )
          )
        else
          raise "Handle this modification: #{ modification.inspect }"
        end
      }
    end

    # @param indent [Integer] number of spaces the attributes are indented by
    # @param recursive [Integer] unused, a data series has nothing to recurse into
    # @return [String]
    def inspect(indent=1, recursive=1000)
      r = %(#<#{ self.class.to_s }\n)
      r << [
        "@name=#{ name.inspect }",
        "@data_type=#{ data_type.inspect }",
        "@chart_roles=#{ chart_roles.inspect }",
        "@values=<count: #{ values.count }, items: #{ values_summary }>",
      ].map { |e| "#{ ' ' * indent }#{ e }\n"}.join
      if recursive > 0
        # nothing to recurse
      end
      r << %(#{ ' ' * (indent-1) }>\n)
    end

    # (see #values)
    def axis_tick_format(modification = {})
      data_type.axis_tick_format(values(modification))
    end

    # @param d3_or_vega [Symbol] :d3 or :vega
    # (see #values)
    def axis_scale(d3_or_vega, modification = {})
      data_type.axis_scale(self, modification, d3_or_vega)
    end

    # (see #values)
    def uniq_vals(modification = {})
      memoized(@cached_uniq_vals ||= {}, modification) {
        values(modification).uniq
      }
    end

    # (see #values)
    def uniq_vals_count(modification = {})
      memoized(@cached_uniq_vals_count ||= {}, modification) {
        uniq_vals(modification).length
      }
    end

    # Returns the smallest non-nil value, or nil if there is none.
    # (see #values)
    def min_val(modification = {})
      memoized(@cached_min_val ||= {}, modification) {
        values(modification).compact.min
      }
    end

    # Returns the largest non-nil value, or nil if there is none.
    # (see #values)
    def max_val(modification = {})
      memoized(@cached_max_val ||= {}, modification) {
        values(modification).compact.max
      }
    end

    # Returns the ratio between the largest and the smallest value, or 0.0 if
    # the smallest value is zero or there are no non-nil values.
    # (see #values)
    def dynamic_range(modification = {})
      memoized(@cached_dynamic_range ||= {}, modification) {
        divisor = min_val(modification).to_f
        # Both ends of the range have to come from the same modification.
        0 == divisor ? 0.0 : max_val(modification) / divisor
      }
    end

    # (see #values)
    def has_large_dynamic_range?(modification = {})
      memoized(@cached_has_large_dynamic_range ||= {}, modification) {
        dynamic_range(modification) > self.class.large_dynamic_range_threshold
      }
    end

    # Returns whatever the data type provides for sorting this data series'
    # labels. The arguments are passed through to the data type.
    def label_sorter(label_val_key, value_sorter)
      data_type.label_sorter(label_val_key, self, value_sorter)
    end

  private

    # @param chart_role_overrides [Array<Symbol>] :x, :y, :color
    # @return [Hash] keys are chart_classes, and values are arrays with roles.
    #   Frozen. Returns an empty Array for unknown chart classes.
    def init_chart_roles(chart_role_overrides)
      restrict_to_overrides = chart_role_overrides.any?
      # The default Array is never mutated below, only added to via #+.
      r = available_chart_types.each_with_object(Hash.new([])) { |chart_type, m|
        roles = chart_type[:chart_roles]
        if restrict_to_overrides
          roles = roles & chart_role_overrides
          next if roles.empty?
        end
        chart_class = chart_type[:chart_class]
        m[chart_class] = (m[chart_class] + roles).uniq
      }
      r.freeze
    end

    # Returns data_type_override if given, otherwise infers the data type from
    # the first non-nil value.
    #
    # @param data_type_override [DataType, nil]
    # @raise [ArgumentError] if the data type can't be inferred
    def init_data_type(data_type_override)
      return data_type_override unless data_type_override.nil?

      first_value = values.detect { |e| !e.nil? }
      case first_value
      when Integer
        # Also covers Fixnum and Bignum on Rubies that still have them. They
        # must not be referenced by name since Ruby 3.2 removed the constants.
        DataType::Quantitative::Integer.new
      when Float
        DataType::Quantitative::Decimal.new
      when String
        DataType::Categorical.new
      when Time, DateTime, ActiveSupport::TimeWithZone
        DataType::Quantitative::Temporal.new
      else
        raise(ArgumentError.new("Can't infer data type for value: #{ first_value.class.inspect }"))
      end
    end

    # Returns the entry for key in store, computing and storing it via the
    # given block on first access. Unlike `||=` this also caches nil and
    # false results.
    def memoized(store, key)
      store.fetch(key) { store[key] = yield }
    end

  end
end