rails-data-explorer 0.2.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +19 -3
  3. data/README.md +2 -0
  4. data/lib/rails-data-explorer-no-rails.rb +36 -32
  5. data/lib/rails-data-explorer.rb +38 -35
  6. data/lib/rails_data_explorer.rb +29 -10
  7. data/lib/{rails-data-explorer → rails_data_explorer}/action_view_extension.rb +39 -17
  8. data/lib/rails_data_explorer/active_record_extension.rb +19 -0
  9. data/lib/{rails-data-explorer → rails_data_explorer}/chart.rb +10 -0
  10. data/lib/rails_data_explorer/chart/anova.rb +1 -0
  11. data/lib/{rails-data-explorer → rails_data_explorer}/chart/box_plot.rb +12 -3
  12. data/lib/{rails-data-explorer → rails_data_explorer}/chart/box_plot_group.rb +49 -22
  13. data/lib/{rails-data-explorer → rails_data_explorer}/chart/contingency_table.rb +19 -8
  14. data/lib/{rails-data-explorer → rails_data_explorer}/chart/descriptive_statistics_table.rb +9 -0
  15. data/lib/rails_data_explorer/chart/descriptive_statistics_table_group.rb +1 -0
  16. data/lib/{rails-data-explorer → rails_data_explorer}/chart/histogram_categorical.rb +12 -8
  17. data/lib/{rails-data-explorer → rails_data_explorer}/chart/histogram_quantitative.rb +12 -2
  18. data/lib/{rails-data-explorer → rails_data_explorer}/chart/histogram_temporal.rb +11 -2
  19. data/lib/{rails-data-explorer → rails_data_explorer}/chart/multi_dimensional_charts.rb +2 -0
  20. data/lib/{rails-data-explorer → rails_data_explorer}/chart/parallel_coordinates.rb +11 -1
  21. data/lib/{rails-data-explorer → rails_data_explorer}/chart/parallel_set.rb +11 -2
  22. data/lib/{rails-data-explorer → rails_data_explorer}/chart/pie_chart.rb +12 -8
  23. data/lib/{rails-data-explorer → rails_data_explorer}/chart/scatterplot.rb +13 -1
  24. data/lib/{rails-data-explorer → rails_data_explorer}/chart/scatterplot_matrix.rb +2 -0
  25. data/lib/{rails-data-explorer/chart/stacked_bar_chart_categorical_percent.rb → rails_data_explorer/chart/stacked_bar_chart_categorical.rb} +37 -14
  26. data/lib/rails_data_explorer/chart/stacked_bar_chart_categorical_percent.rb +28 -0
  27. data/lib/rails_data_explorer/chart/stacked_histogram_temporal.rb +199 -0
  28. data/lib/rails_data_explorer/data_series.rb +241 -0
  29. data/lib/{rails-data-explorer → rails_data_explorer}/data_set.rb +13 -4
  30. data/lib/{rails-data-explorer → rails_data_explorer}/data_type.rb +13 -0
  31. data/lib/{rails-data-explorer → rails_data_explorer}/data_type/categorical.rb +79 -18
  32. data/lib/{rails-data-explorer → rails_data_explorer}/data_type/geo.rb +2 -0
  33. data/lib/{rails-data-explorer → rails_data_explorer}/data_type/quantitative.rb +14 -4
  34. data/lib/{rails-data-explorer → rails_data_explorer}/data_type/quantitative/decimal.rb +9 -0
  35. data/lib/{rails-data-explorer → rails_data_explorer}/data_type/quantitative/integer.rb +9 -0
  36. data/lib/{rails-data-explorer → rails_data_explorer}/data_type/quantitative/temporal.rb +9 -0
  37. data/lib/{rails-data-explorer → rails_data_explorer}/engine.rb +12 -0
  38. data/lib/{rails-data-explorer → rails_data_explorer}/exploration.rb +11 -0
  39. data/lib/rails_data_explorer/statistics/pearsons_chi_squared_independence_test.rb +72 -0
  40. data/lib/{rails-data-explorer → rails_data_explorer}/statistics/rng_category.rb +13 -0
  41. data/lib/{rails-data-explorer → rails_data_explorer}/statistics/rng_gaussian.rb +12 -1
  42. data/lib/{rails-data-explorer → rails_data_explorer}/statistics/rng_power_law.rb +11 -0
  43. data/lib/{rails-data-explorer → rails_data_explorer}/utils/color_scale.rb +6 -0
  44. data/lib/{rails-data-explorer → rails_data_explorer}/utils/data_binner.rb +13 -8
  45. data/lib/{rails-data-explorer → rails_data_explorer}/utils/data_encoder.rb +2 -0
  46. data/lib/{rails-data-explorer → rails_data_explorer}/utils/data_quantizer.rb +8 -3
  47. data/lib/{rails-data-explorer → rails_data_explorer}/utils/rde_table.rb +14 -11
  48. data/lib/{rails-data-explorer → rails_data_explorer}/utils/value_formatter.rb +9 -4
  49. data/rails-data-explorer.gemspec +5 -6
  50. data/spec/rails_data_explorer/chart_spec.rb +11 -0
  51. data/spec/{rails-data-explorer → rails_data_explorer}/data_series_spec.rb +0 -0
  52. data/spec/rails_data_explorer/data_set_spec.rb +31 -0
  53. data/spec/rails_data_explorer/data_type/categorical_spec.rb +126 -0
  54. data/{lib/rails-data-explorer/chart/descriptive_statistics_table_group.rb → spec/rails_data_explorer/data_type/quantitative/decimal_spec.rb} +0 -0
  55. data/spec/rails_data_explorer/data_type/quantitative/integer_spec.rb +0 -0
  56. data/spec/rails_data_explorer/data_type/quantitative/temporal_spec.rb +34 -0
  57. data/spec/rails_data_explorer/data_type/quantitative_spec.rb +118 -0
  58. data/spec/rails_data_explorer/data_type_spec.rb +7 -0
  59. data/spec/{rails-data-explorer → rails_data_explorer}/exploration_spec.rb +5 -5
  60. data/spec/rails_data_explorer/statistics/pearsons_chi_squared_independence_test_spec.rb +0 -0
  61. data/spec/rails_data_explorer/utils/color_scale_spec.rb +13 -0
  62. data/spec/{rails-data-explorer → rails_data_explorer}/utils/data_binner_spec.rb +0 -0
  63. data/spec/{rails-data-explorer → rails_data_explorer}/utils/data_quantizer_spec.rb +0 -0
  64. data/spec/rails_data_explorer/utils/value_formatter_spec.rb +33 -0
  65. data/vendor/assets/stylesheets/sources/rde-default-style.css +5 -1
  66. metadata +91 -82
  67. data/lib/rails-data-explorer/active_record_extension.rb +0 -14
  68. data/lib/rails-data-explorer/constants.rb +0 -5
  69. data/lib/rails-data-explorer/data_series.rb +0 -156
  70. data/lib/rails-data-explorer/statistics/pearsons_chi_squared_independence_test.rb +0 -75
  71. data/spec/rails-data-explorer/data_type/categorical_spec.rb +0 -34
@@ -0,0 +1,28 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ class RailsDataExplorer
4
+ class Chart
5
+
6
+ # Responsibilities:
7
+ # * Render a stacked bar chart for bivariate analysis of two categorical
8
+ # data series. Renders percentage distribution of y-data series.
9
+ #
10
+ # Collaborators:
11
+ # * DataSet
12
+ #
13
+ class StackedBarChartCategoricalPercent < StackedBarChartCategorical
14
+
15
# Computes the y value of one cell as its percentage share of the row total.
# Override this method to change how the y value is computed. E.g., to
# change from absolute values to percentages.
#
# @param data_matrix [Hash] nested counts, read as data_matrix[x_val][y_val];
#   the row total is read from data_matrix[x_val][:_sum]
# @param x_val [Object] key of the row (x category)
# @param y_val [Object] key of the cell inside the row (y category)
# @return [Float] the cell's share of the row total in percent, or 0.0 when
#   the row total is zero
def compute_y_value(data_matrix, x_val, y_val)
  row = data_matrix[x_val]
  row_total = row[:_sum].to_f
  # 0 / 0.0 evaluates to NaN, which JSON generation refuses to serialize.
  return 0.0 if row_total.zero?
  (row[y_val] / row_total) * 100
end
20
+
21
# Builds the y axis label for the percentage variant of the chart.
#
# @param y_ds_name [String] name of the y data series
# @return [String] the name followed by " distribution [%]"
def compute_y_axis_label(y_ds_name)
  format('%s distribution [%%]', y_ds_name)
end
25
+
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,199 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ class RailsDataExplorer
4
+ class Chart
5
+
6
+ # Responsibilities:
7
+ # * Render a stacked bar chart for bivariate analysis of a temporal and a
8
+ # categorical data series.
9
+ #
10
+ # Collaborators:
11
+ # * DataSet
12
+ #
13
+ class StackedHistogramTemporal < Chart
14
+
15
# @param _data_set [Object] the data set to chart (DataSet is listed as this
#   class' collaborator)
# @param options [Hash] chart options; merged into a fresh Hash so the
#   caller's object is not shared
def initialize(_data_set, options = {})
  own_options = {}.merge(options)
  @data_set = _data_set
  @options = own_options
end
19
+
20
# Computes the data and axis attributes needed to render this chart.
#
# The x data series is the :x/:any candidate with the most distinct values,
# the y data series is the first remaining :y/:any candidate. The result
# contains one entry per (x value, y value) pair with the number of times
# that pair occurs.
#
# @return [Hash, false] a Hash with the keys :values, :x_axis_label,
#   :x_axis_tick_format, :y_axis_label and :y_axis_tick_format, or false
#   if no suitable pair of data series is available
# @raise [ArgumentError] if the data set does not have exactly two dimensions
def compute_chart_attrs
  x_candidates = @data_set.data_series.find_all { |ds|
    (ds.chart_roles[Chart::StackedHistogramTemporal] & [:x, :any]).any?
  }.sort { |a, b| b.uniq_vals.length <=> a.uniq_vals.length }
  y_candidates = @data_set.data_series.find_all { |ds|
    (ds.chart_roles[Chart::StackedHistogramTemporal] & [:y, :any]).any?
  }

  x_ds = x_candidates.first
  y_ds = (y_candidates - [x_ds]).first
  return false if x_ds.nil? || y_ds.nil?

  # Fail fast: reject unsupported data sets before doing any counting.
  unless 2 == @data_set.dimensions_count
    raise(ArgumentError.new("Exactly two data series required for stacked histogram."))
  end

  # Initialize data_matrix. Row and column totals live under the :_sum key.
  x_uniq_vals = x_ds.uniq_vals
  y_uniq_vals = y_ds.uniq_vals
  data_matrix = { _sum: { _sum: 0 } }
  x_uniq_vals.each { |x_val|
    data_matrix[x_val] = { _sum: 0 }
    y_uniq_vals.each { |y_val|
      data_matrix[x_val][y_val] = 0
      data_matrix[:_sum][y_val] = 0
    }
  }

  # Populate data_matrix: one observation per pair of values at the same index.
  x_ds.values.zip(y_ds.values).each { |x_val, y_val|
    data_matrix[x_val][y_val] += 1
    data_matrix[:_sum][y_val] += 1
    data_matrix[x_val][:_sum] += 1
    data_matrix[:_sum][:_sum] += 1
  }

  # The fallback sorters order keys by descending total.
  x_sorted_keys = x_uniq_vals.sort(
    &x_ds.label_sorter(
      nil,
      lambda { |a, b| data_matrix[b][:_sum] <=> data_matrix[a][:_sum] }
    )
  )
  y_sorted_keys = y_uniq_vals.sort(
    &y_ds.label_sorter(
      nil,
      lambda { |a, b| data_matrix[:_sum][b] <=> data_matrix[:_sum][a] }
    )
  )

  values = y_sorted_keys.flat_map { |y_val|
    x_sorted_keys.map { |x_val|
      { x: x_val, y: data_matrix[x_val][y_val], c: y_val }
    }
  }

  {
    values: values,
    x_axis_label: x_ds.name,
    x_axis_tick_format: 'function(d) { return d }',
    # y carries absolute counts, so neither the label nor the tick format
    # may claim percentages ('.1%' would render a count of 3 as "300.0%").
    y_axis_label: "#{ y_ds.name } count",
    y_axis_tick_format: "d3.format('d')",
  }
end
87
+
88
# Renders the chart.
#
# @return [String] the chart markup, or an empty string if the chart is not
#   to be rendered or no chart attributes could be computed
def render
  chart_attrs = render? && compute_chart_attrs
  chart_attrs ? render_vega(chart_attrs) : ''
end
94
+
95
# Renders the chart as an HTML fragment: a container div plus an inline
# script that builds a Vega spec and hands it to vg.parse.spec.
#
# @param ca [Hash] chart attributes; reads :values, :x_axis_label,
#   :x_axis_tick_format, :y_axis_label and :y_axis_tick_format. The two
#   tick formats are emitted verbatim as JavaScript expressions.
# @return [String] the HTML fragment
def render_vega(ca)
  %(
    <div class="rde-chart rde-stacked-histogram-temporal">
      <h3 class="rde-chart-title">Stacked Histogram (temporal)</h3>
      <div id="#{ dom_id }"></div>
      <script type="text/javascript">
        (function() {
          var spec = {
            "width": 960,
            "height": 200,
            "padding": {"top": 10, "left": 50, "bottom": 50, "right": 100},
            "data": [
              {
                "name": "table",
                "values": #{ script_safe_json(ca[:values]) }
              },
              {
                "name": "stats",
                "source": "table",
                "transform": [
                  {"type": "facet", "keys": ["data.x"]},
                  {"type": "stats", "value": "data.y"}
                ]
              }
            ],
            "scales": [
              {
                "name": "x",
                "type": "ordinal",
                "range": "width",
                "domain": {"data": "table", "field": "data.x"}
              },
              {
                "name": "y",
                "type": "linear",
                "range": "height",
                "nice": true,
                "domain": {"data": "stats", "field": "sum"}
              },
              {
                "name": "color",
                "type": "ordinal",
                "range": "category10"
              }
            ],
            "axes": [
              {
                "type": "x",
                "scale": "x",
                "title": #{ script_safe_json(ca[:x_axis_label].to_s) },
                "format": #{ ca[:x_axis_tick_format] }
              },
              {
                "type": "y",
                "scale": "y",
                "title": #{ script_safe_json(ca[:y_axis_label].to_s) },
                "format": #{ ca[:y_axis_tick_format] }
              }
            ],
            "marks": [
              {
                "type": "group",
                "from": {
                  "data": "table",
                  "transform": [
                    {"type": "facet", "keys": ["data.c"]},
                    {"type": "stack", "point": "data.x", "height": "data.y"}
                  ]
                },
                "marks": [
                  {
                    "type": "rect",
                    "properties": {
                      "enter": {
                        "x": {"scale": "x", "field": "data.x"},
                        "width": {"scale": "x", "band": true, "offset": -1},
                        "y": {"scale": "y", "field": "y"},
                        "y2": {"scale": "y", "field": "y2"},
                        "fill": {"scale": "color", "field": "data.c"}
                      }
                    }
                  }
                ]
              }
            ],
            "legends": [
              {
                "fill": "color"
              }
            ]
          };

          vg.parse.spec(spec, function(chart) {
            var view = chart({ el:"##{ dom_id }" }).update();
          });

        })();
      </script>
    </div>
  )
end

# Serializes +value+ to JSON that can be embedded in an inline script element.
#
# @param value [Object] anything that responds to #to_json
# @return [String] JSON in which every "</" is written as "<\/"
def script_safe_json(value)
  # "<\/" is a valid JSON escape for "</"; it keeps a "</script>" inside the
  # data from terminating the surrounding script element. The block form of
  # gsub is used so the backslash is not treated as a replacement escape.
  value.to_json.gsub('</') { '<\/' }
end
196
+
197
+ end
198
+ end
199
+ end
@@ -0,0 +1,241 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ class RailsDataExplorer
4
+
5
+ # NOTE: DataSeries values are immutable once instantiated.
6
+ #
7
+ # Responsibilities:
8
+ # * Represent a data series
9
+ # * Compute statistics
10
+ # * Compute chart attributes
11
+ # * Cache computed properties like values, statistics
12
+ # * Provide modified versions of values
13
+ # (e.g., :limit_distinct_values, :compress_quantitative_values)
14
+ #
15
+ # Collaborators:
16
+ # * DataType
17
+ #
18
+ class DataSeries
19
+
20
+ # TODO: Add concept of significant figures for rounding values when displaying them
21
+ # http://en.wikipedia.org/wiki/Significant_figures
22
+
23
+ attr_reader :data_type, :name, :chart_roles
24
+ delegate :available_chart_types, to: :data_type, prefix: false
25
+ delegate :available_chart_roles, to: :data_type, prefix: false
26
+
27
# Threshold for the dynamic range (the ratio between the largest and the
# smallest value) above which a data series is considered to have a large
# dynamic range.
#
# @return [Float]
def self.large_dynamic_range_threshold
  10_000.0
end
33
+
34
# Number of unique values above which a data series is considered to have
# many unique values.
#
# @return [Integer]
def self.many_uniq_vals_threshold
  20
end
39
+
40
# @param _name [Object] name of the data series
# @param _values [Object] the values of the data series
# @param options [Hash] :chart_roles and :data_type (all optional); defaults
#   are an empty Array and nil respectively
def initialize(_name, _values, options={})
  defaults = { chart_roles: [], data_type: nil }
  options = defaults.merge(options)
  @name, @values = _name, _values
  @data_type = init_data_type(options[:data_type])
  # Chart roles are initialized after the data type.
  @chart_roles = init_chart_roles(options[:chart_roles])
  @options = options
end
49
+
50
# Returns descriptive_statistics as a flat Array, as computed by the data
# type for the (optionally modified) values. Results are memoized per
# modification.
# (see #values)
def descriptive_statistics(modification = {})
  cache = (@cached_descriptive_statistics ||= {})
  cache[modification] ||= data_type.descriptive_statistics(values(modification))
end
58
+
59
# Returns descriptive_statistics as a renderable table structure, as
# computed by the data type for the (optionally modified) values. Results
# are memoized per modification.
# (see #values)
def descriptive_statistics_table(modification = {})
  cache = (@cached_descriptive_statistics_table ||= {})
  cache[modification] ||= data_type.descriptive_statistics_table(values(modification))
end
67
+
68
# Returns the number of (optionally modified) values, memoized per
# modification.
# (see #values)
def number_of_values(modification = {})
  counts = (@cached_number_of_values ||= {})
  counts[modification] ||= values(modification).length
end
75
+
76
# Returns a short, human readable summary of the (optionally modified)
# values: the full inspect output for fewer than three values or when that
# output is shorter than 80 characters, otherwise just the first and last
# value. Memoized per modification.
# (see #values)
def values_summary(modification = {})
  summaries = (@cached_values_summary ||= {})
  summaries[modification] ||= begin
    vals = values(modification)
    full = vals.inspect
    if vals.length >= 3 && full.length >= 80
      "[#{ vals.first } ... #{ vals.last }]"
    else
      full
    end
  end
end
88
+
89
# Returns the values for this data series with an optional modification.
# Results are memoized per modification.
#
# @param modification [Hash, optional] type of modification. Without a
#   :name, the unmodified values are returned. Handled here:
#     {
#       name: :limit_distinct_values,
#       max_num_distinct_values: 20,
#       val_for_others: '[Other]',
#     }
#   The limits fall back to the options given at instantiation, and then to
#   the class' many_uniq_vals_threshold. Any other :name (including
#   :compress_quantitative_values) raises.
def values(modification = {})
  cache = (@cached_values ||= {})
  cache[modification] ||= begin
    kind = modification[:name]
    if kind.nil?
      @values
    elsif :limit_distinct_values == kind
      # Variant of self's values with the number of distinct values limited
      # to max_distinct; mapping of the remaining values is delegated to the
      # data type together with the label for others.
      max_distinct = modification[:max_num_distinct_values] ||
                     @options[:max_num_distinct_values] ||
                     self.class.many_uniq_vals_threshold
      others_label = modification[:val_for_others] || @options[:val_for_others]
      data_type.limit_distinct_values(@values, max_distinct, others_label)
    else
      raise "Handle this modification: #{ modification.inspect }"
    end
  end
end
127
+
128
+ def inspect(indent=1, recursive=1000)
129
+ r = %(#<#{ self.class.to_s }\n)
130
+ r << [
131
+ "@name=#{ name.inspect }",
132
+ "@data_type=#{ data_type.inspect }",
133
+ "@chart_roles=#{ chart_roles.inspect }",
134
+ "@values=<count: #{ values.count }, items: #{ values_summary }>",
135
+ ].map { |e| "#{ ' ' * indent }#{ e }\n"}.join
136
+ if recursive > 0
137
+ # nothing to recurse
138
+ end
139
+ r << %(#{ ' ' * (indent-1) }>\n)
140
+ end
141
+
142
# Returns the axis tick format the data type provides for the (optionally
# modified) values.
# (see #values)
def axis_tick_format(modification = {})
  type = data_type
  type.axis_tick_format(values(modification))
end
146
+
147
# Returns the axis scale the data type provides for this data series.
#
# @param d3_or_vega [Symbol] :d3 or :vega
# @param modification [Hash, optional] passed through to the data type
#   (see #values)
def axis_scale(d3_or_vega, modification = {})
  type = data_type
  type.axis_scale(self, modification, d3_or_vega)
end
151
+
152
# Returns the distinct (optionally modified) values, memoized per
# modification.
# (see #values)
def uniq_vals(modification = {})
  cache = (@cached_uniq_vals ||= {})
  cache[modification] ||= values(modification).uniq
end
157
+
158
# Returns the number of distinct (optionally modified) values, memoized per
# modification.
# (see #values)
def uniq_vals_count(modification = {})
  counts = (@cached_uniq_vals_count ||= {})
  counts[modification] ||= uniq_vals(modification).length
end
163
+
164
# Returns the smallest of the (optionally modified) values, ignoring nils.
# Memoized per modification.
# (see #values)
def min_val(modification = {})
  cache = (@cached_min_val ||= {})
  cache[modification] ||= begin
    present = values(modification).compact
    present.min
  end
end
169
+
170
# Returns the largest of the (optionally modified) values, ignoring nils.
# Memoized per modification.
# (see #values)
def max_val(modification = {})
  cache = (@cached_max_val ||= {})
  cache[modification] ||= begin
    present = values(modification).compact
    present.max
  end
end
175
+
176
# Returns the dynamic range of the (optionally modified) values: the ratio
# between the largest and the smallest value. Memoized per modification.
# (see #values)
#
# @return [Float, Object] max / min, or 0.0 if the smallest value converts
#   to zero (which includes the case that there is no non-nil value)
def dynamic_range(modification = {})
  @cached_dynamic_range ||= {}
  @cached_dynamic_range[modification] ||= (
    # The minimum is never larger than the maximum, so it is the divisor.
    divisor = min_val(modification).to_f
    # Both bounds have to come from the same modification of the values.
    0 == divisor ? 0.0 : max_val(modification) / divisor
  )
end
184
+
185
# Returns true if the dynamic range of the (optionally modified) values
# exceeds the class' large_dynamic_range_threshold. Memoized per
# modification.
# (see #values)
#
# @return [Boolean]
def has_large_dynamic_range?(modification = {})
  @cached_has_large_dynamic_range ||= {}
  # `||=` would recompute every false result, so check for the key instead.
  unless @cached_has_large_dynamic_range.key?(modification)
    @cached_has_large_dynamic_range[modification] = (
      dynamic_range(modification) > self.class.large_dynamic_range_threshold
    )
  end
  @cached_has_large_dynamic_range[modification]
end
192
+
193
# Returns the label sorter the data type provides for this data series.
# Both arguments are handed to the data type unchanged, together with self.
def label_sorter(label_val_key, value_sorter)
  type = data_type
  type.label_sorter(label_val_key, self, value_sorter)
end
196
+
197
+ private
198
+
199
# Computes the roles this data series can play in each available chart.
#
# @param chart_role_overrides [Array<Symbol>] :x, :y, :color. If not empty,
#   only these roles are considered, and chart classes for which none of
#   them is available are left out.
# @return [Hash] frozen; keys are chart_classes, and values are arrays with
#   roles. Unknown chart classes map to a frozen empty Array.
def init_chart_roles(chart_role_overrides)
  restrict_roles = chart_role_overrides.any?
  # Hash.new(obj) returns the very same obj for every missing key. Freezing
  # it makes sure a caller can't modify the default for all lookups.
  roles_by_chart_class = Hash.new([].freeze)
  available_chart_types.each { |chart_type|
    roles = chart_type[:chart_roles]
    if restrict_roles
      roles = roles & chart_role_overrides
      next if roles.empty?
    end
    chart_class = chart_type[:chart_class]
    roles_by_chart_class[chart_class] = (roles_by_chart_class[chart_class] + roles).uniq
  }
  roles_by_chart_class.freeze
end
219
+
220
# Determines the data type of this data series.
#
# @param data_type_override [Object, nil] returned as is unless nil
# @return [Object] the override, or a DataType instance inferred from the
#   first non-nil value
# @raise [ArgumentError] if there is no override and no data type can be
#   inferred from the first non-nil value (or there is no such value)
def init_data_type(data_type_override)
  return data_type_override unless data_type_override.nil?

  first_value = values.detect { |e| !e.nil? }
  case first_value
  when Integer
    # Integer also covers what used to be Fixnum and Bignum. Those constants
    # no longer exist in Ruby 3.2+ and must not be referenced here.
    DataType::Quantitative::Integer.new
  when Float
    DataType::Quantitative::Decimal.new
  when String
    DataType::Categorical.new
  when Time, DateTime, ActiveSupport::TimeWithZone
    DataType::Quantitative::Temporal.new
  else
    # Report the class of the value that was actually inspected.
    raise(ArgumentError.new("Can't infer data type for value: #{ first_value.class.inspect }"))
  end
end
239
+
240
+ end
241
+ end