RubyGems - rails-data-explorer - Versions diffs - 0.2.3 → 1.0.0 - Mend

rails-data-explorer 0.2.3 → 1.0.0

Files changed (71) hide show

data/lib/{rails-data-explorer → rails_data_explorer}/data_set.rb RENAMED

@@ -1,10 +1,19 @@
-# Container for data series
+# -*- coding: utf-8 -*-
 class RailsDataExplorer
+  # Responsibilities:
+  #  * Container for DataSeries
+  #
+  # Collaborators:
+  #  * DataSeries
+  #  * Exploration
+  #
   class DataSet
     attr_reader :data_series
-    # @param[Array<Numeric, String, Symbol, Nil, Hash, DataSeries>] values_or_data_series
+    # @param values_or_data_series [Array<Numeric, String, Symbol, Nil, Hash, DataSeries>]
     #  Array can contain the following:
     #  * Numeric, String, Symbol, Nil - for a single data series
     #  * Hash - for multiple data series with the following keys:
@@ -13,7 +22,7 @@ class RailsDataExplorer
     #    * :chart_roles [Array<Symbol>, optional] - what to use this series for. possible values: :x, :y, :color
     #    * :data_type (optional) - :quantitative, :categorical, :temporal
     #  * DataSeries
-    # @param[String] exploration_title used as fall back for data series name
+    # @param exploration_title [String] used as fallback for data series name
     def initialize(values_or_data_series, exploration_title)
       @data_series = initialize_data_series(values_or_data_series, exploration_title)
       validate_data_series
@@ -39,7 +48,7 @@ class RailsDataExplorer
       else
         raise(
           ArgumentError.new(
-            "Invalid datum. Only Hash, Numeric, String, Symbol, and Nil are allowed. " + \
+            "Invalid datum. Only DataSeries, Hash, ActiveSupport::TimeWithZone, DateTime, Numeric, NilClass, String, or Symbol are allowed. " + \
             "Found #{ values_or_data_series.first.class.to_s }."
           )
         )

data/lib/{rails-data-explorer → rails_data_explorer}/data_type.rb RENAMED

@@ -1,4 +1,17 @@
+# -*- coding: utf-8 -*-
 class RailsDataExplorer
+  # Responsibilities:
+  #  * Represent a type of data
+  #  * Determine available chart types
+  #  * Compute descriptive statistics
+  #  * Compute modified values
+  #
+  # Collaborators:
+  #  * DataSeries
+  #  * Chart
+  #
   class DataType
     # @param[Hash, optional] constraints

data/lib/{rails-data-explorer → rails_data_explorer}/data_type/categorical.rb RENAMED

@@ -1,9 +1,17 @@
+# -*- coding: utf-8 -*-
 class RailsDataExplorer
   class DataType
+    # Responsibilities:
+    #  * Provide available charts and statistics for categorical data type.
+    #  * Provide methods for categorical data type.
+    #
+    # Collaborators:
+    #  * DataSet
+    #
     class Categorical < DataType
-      # TODO: when there are too many categories, only separate the N most
-      # significant ones and group all other values under "Other"
       def all_available_chart_types
         [
           {
@@ -12,12 +20,12 @@ class RailsDataExplorer
             dimensions_count_min: 1,
             dimensions_count_max: 1,
           },
-          # {
-          #   chart_class: Chart::PieChart,
-          #   chart_roles: [:any],
-          #   dimensions_count_min: 1,
-          #   dimensions_count_max: 1,
-          # },
+          {
+            chart_class: Chart::PieChart,
+            chart_roles: [:any],
+            dimensions_count_min: 1,
+            dimensions_count_max: 1,
+          },
           {
             chart_class: Chart::BoxPlotGroup,
             chart_roles: [:y],
@@ -34,6 +42,12 @@ class RailsDataExplorer
             chart_roles: [:dimension],
             dimensions_count_min: 3,
           },
+          {
+            chart_class: Chart::StackedBarChartCategorical,
+            chart_roles: [:x, :y],
+            dimensions_count_min: 2,
+            dimensions_count_max: 2,
+          },
           {
             chart_class: Chart::StackedBarChartCategoricalPercent,
             chart_roles: [:x, :y],
@@ -67,7 +81,7 @@ class RailsDataExplorer
       end
       def descriptive_statistics(values)
-        frequencies = values.inject(Hash.new(0)) { |m,e| m[e] += 1; m }
+        frequencies = compute_histogram(values)
         labels_ds = DataSeries.new('_', values.uniq)
         total_count = values.length
         ruby_formatters = {
@@ -168,25 +182,43 @@ class RailsDataExplorer
         %(function(d) { return d })
       end
-      # @param[Symbol, nil] label_val_key the hash key to use to get the label value during sort (sent to a,b)
-      # @param[DataSeries] data_series the ds that contains the uniq vals
-      # @param[Proc] value_sorter the sorting proc to use if not sorted numerically
-      # @return[Proc] a Proc that will be used by #sort
+      # @param label_val_key [Symbol, nil] the hash key to use to get the label value during sort (sent to a,b)
+      # @param data_series [DataSeries] the ds that contains the uniq vals
+      # @param value_sorter [Proc] the sorting proc to use if not sorted numerically
+      # @return [Proc] a Proc that will be used by #sort
       def label_sorter(label_val_key, data_series, value_sorter)
         if data_series.uniq_vals.any? { |e| e.to_s =~ /^[\+\-]?\d+/ }
           # Sort numerical categories by key ASC
+          # This lambda can be used in conjunction with `#sort`.
+          # It returns -1, 0, or 1
           lambda { |a,b|
             number_and_full_string_extractor = lambda { |val|
               str = label_val_key ? val[label_val_key] : val
               number = str.gsub(/^[^\d\+\-]*/, '') # remove non-digit leading chars
                           .gsub(',', '') # remove delimiter commas, they throw off to_f parsing
-                          .to_f
-              number += 1  if str =~ /^>/ # increase highest threshold by one for proper sorting
+              if '' != number
+                # label contains digits
+                number = number.to_f
+                number += 1  if str =~ /^>/ # increase highest threshold by one for proper sorting
+                number -= 1  if str =~ /^</ # decrease lowest threshold by one for proper sorting
+              else
+                # label doesn't contain digits, set to nil to sort at end
+                number = nil
+              end
               [number, str]
             }
-            a_number_and_full_string = number_and_full_string_extractor.call(a)
-            b_number_and_full_string = number_and_full_string_extractor.call(b)
-            a_number_and_full_string <=> b_number_and_full_string
+            a_num, a_str = number_and_full_string_extractor.call(a)
+            b_num, b_str = number_and_full_string_extractor.call(b)
+            if a_num && b_num
+              # Both numbers are present, compare them
+              [a_num, a_str] <=> [b_num, b_str]
+            elsif a_num
+              # a_num is present, b_num isn't. Sort a before b
+              -1
+            else
+              # a_num is not present (b_num may or may not be). Sort a after b
+              1
+            end
           }
         else
           # Use provided value sorter
@@ -194,6 +226,35 @@ class RailsDataExplorer
         end
       end
+      # Limits values to at most max_num_vals distinct observations: keeps the
+      # (max_num_vals - 1) most frequent ones and replaces all less frequent
+      # observations with val_for_others.
+      # @param values [Array]
+      # @param max_num_vals [Integer] the max number of distinct values to return (including val_for_others)
+      # @param val_for_others [String, optional] defaults to '[Other]'
+      def limit_distinct_values(values, max_num_vals, val_for_others = nil)
+        distinct_values = values.uniq
+        # Return values if they already have lte max_num_vals distinct observations
+        return values  if distinct_values.length <= max_num_vals
+        val_for_others ||= '[Other]'
+        frequencies = compute_histogram(values)
+        top_vals = frequencies.to_a.sort { |a,b|
+          # a = [value, frequency]
+          # Sort by frequency DESC, value ASC
+          [b.last, a.first] <=> [a.last, b.first]
+        }.first(max_num_vals - 1).map { |e| e.first }
+        values.map { |e| top_vals.include?(e) ? e : val_for_others }
+      end
+    protected
+      # Computes a histogram for values
+      # @param values [Array]
+      # @return [Hash] distinct vals as keys and their frequency as value
+      def compute_histogram(values)
+        values.inject(Hash.new(0)) { |m,e| m[e] += 1; m }
+      end
     end
   end
 end

data/lib/{rails-data-explorer → rails_data_explorer}/data_type/geo.rb RENAMED

@@ -1 +1,3 @@
+# -*- coding: utf-8 -*-
 # For displaying data on maps.

data/lib/{rails-data-explorer → rails_data_explorer}/data_type/quantitative.rb RENAMED

@@ -1,8 +1,18 @@
+# -*- coding: utf-8 -*-
 class RailsDataExplorer
   class DataType
-    class Quantitative < DataType
-      # This is an abstract class. Use sub_classes
+    # This is an abstract class. Use sub_classes
+    #
+    # Responsibilities:
+    #  * Provide available charts and statistics for quantitative data type.
+    #  * Provide methods for quantitative data type.
+    #
+    # Collaborators:
+    #  * DataSet
+    #
+    class Quantitative < DataType
       def all_available_chart_types
         [
@@ -122,9 +132,9 @@ class RailsDataExplorer
         raise "Implement me in sub_class"
       end
-      def axis_scale(data_series, d3_or_vega)
+      def axis_scale(data_series, modification, d3_or_vega)
         # Log scales can't handle 0 values
-        if data_series.min_val > 0.0 && data_series.has_large_dynamic_range?
+        if data_series.min_val(modification) > 0.0 && data_series.has_large_dynamic_range?(modification)
           { d3: 'd3.scale.log', vega: 'log' }[d3_or_vega]
         else
           { d3: 'd3.scale.linear', vega: 'linear' }[d3_or_vega]

data/lib/{rails-data-explorer → rails_data_explorer}/data_type/quantitative/decimal.rb RENAMED

@@ -1,6 +1,15 @@
+# -*- coding: utf-8 -*-
 class RailsDataExplorer
   class DataType
     class Quantitative
+      # Responsibilities:
+      #  * Provide methods for decimal quantitative data type.
+      #
+      # Collaborators:
+      #  * DataSet
+      #
       class Decimal < Quantitative
         def axis_tick_format(values)

data/lib/{rails-data-explorer → rails_data_explorer}/data_type/quantitative/integer.rb RENAMED

@@ -1,6 +1,15 @@
+# -*- coding: utf-8 -*-
 class RailsDataExplorer
   class DataType
     class Quantitative
+      # Responsibilities:
+      #  * Provide methods for integer quantitative data type.
+      #
+      # Collaborators:
+      #  * DataSet
+      #
       class Integer < Quantitative
         def axis_tick_format(values)

data/lib/{rails-data-explorer → rails_data_explorer}/data_type/quantitative/temporal.rb RENAMED

@@ -1,6 +1,15 @@
+# -*- coding: utf-8 -*-
 class RailsDataExplorer
   class DataType
     class Quantitative
+      # Responsibilities:
+      #  * Provide methods for temporal quantitative data type.
+      #
+      # Collaborators:
+      #  * DataSet
+      #
       class Temporal < Quantitative
         def all_available_chart_types

data/lib/{rails-data-explorer → rails_data_explorer}/engine.rb RENAMED

@@ -1,6 +1,18 @@
+# -*- coding: utf-8 -*-
 require 'rails'
 class RailsDataExplorer
+  # Responsibilities:
+  #  * Tie RailsDataExplorer into a Rails app
+  #  * Initialize ActionViewExtension
+  #  * Tell rails which assets to precompile
+  #
+  # Collaborators:
+  #  * ActiveSupport
+  #  * RailsDataExplorer
+  #
   class Engine < ::Rails::Engine
     # It's an engine so that we can add javascript and image assets

data/lib/{rails-data-explorer → rails_data_explorer}/exploration.rb RENAMED

@@ -1,4 +1,15 @@
+# -*- coding: utf-8 -*-
 class RailsDataExplorer
+  # Responsibilities:
+  #  * Represent and initialize a data exploration
+  #  * Initialize and render self (including charts)
+  #
+  # Collaborators:
+  #  * DataSet
+  #  * Chart
+  #
   class Exploration
     attr_accessor :output_buffer # required for content_tag

data/lib/rails_data_explorer/statistics/pearsons_chi_squared_independence_test.rb ADDED

@@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+class RailsDataExplorer
+  module Statistics
+    # From http://en.wikipedia.org/wiki/Pearson's_chi-squared_test
+    # Pearson's chi-squared test is used to assess whether paired observations on two
+    # variables, expressed in a contingency table, are independent of each other.
+    # An "observation" consists of the values of two outcomes and the null hypothesis
+    # is that the occurrence of these outcomes is statistically independent. Each
+    # observation is allocated to one cell of a two-dimensional array of cells (called
+    # a contingency table) according to the values of the two outcomes.
+    # Assumptions
+    # -----------
+    # The chi-squared test, when used with the standard approximation that a chi-
+    # squared distribution is applicable, has the following assumptions:
+    # * Simple random sample – The sample data is a random sampling from a fixed
+    #   distribution or population where every collection of members of the population
+    #   of the given sample size has an equal probability of selection. Variants of
+    #   the test have been developed for complex samples, such as where the data is
+    #   weighted. Other forms can be used such as purposive sampling.
+    # * Sample size (whole table) – A sample with a sufficiently large size is assumed.
+    #   If a chi squared test is conducted on a sample with a smaller size, then the
+    #   chi squared test will yield an inaccurate inference. The researcher, by using
+    #   chi squared test on small samples, might end up committing a Type II error.
+    # * Expected cell count – Adequate expected cell counts. Some require 5 or more,
+    #   and others require 10 or more. A common rule is 5 or more in all cells of a
+    #   2-by-2 table, and 5 or more in 80% of cells in larger tables, but no cells
+    #   with zero expected count. When this assumption is not met, Yates's Correction
+    #   is applied.
+    # * Independence – The observations are always assumed to be independent of each
+    #   other. This means chi-squared cannot be used to test correlated data
+    #   (like matched pairs or panel data). In those cases you might want to turn to
+    #   McNemar's test.
+    # Problems
+    # --------
+    # The approximation to the chi-squared distribution breaks down if expected
+    # frequencies are too low. It will normally be acceptable so long as no more than
+    # 20% of the events have expected frequencies below 5. Where there is only 1
+    # degree of freedom, the approximation is not reliable if expected frequencies are
+    # below 10. In this case, a better approximation can be obtained by reducing the
+    # absolute value of each difference between observed and expected frequencies by
+    # 0.5 before squaring; this is called Yates's correction for continuity.
+    # In cases where the expected value, E, is found to be small (indicating a small
+    # underlying population probability, and/or a small number of observations), the
+    # normal approximation of the multinomial distribution can fail, and in such cases
+    # it is found to be more appropriate to use the G-test, a likelihood ratio-based
+    # test statistic. Where the total sample size is small, it is necessary to use an
+    # appropriate exact test, typically either the binomial test or (for contingency
+    # tables) Fisher's exact test. This test uses the conditional distribution of the
+    # test statistic given the marginal totals; however, it does not assume that the
+    # data were generated from an experiment in which the marginal totals are fixed
+    # and is valid whether or not that is the case.
+    class PearsonsChiSquaredIndependenceTest
+      def initialize(data_matrix, min_probability = 0.05)
+      end
+      def compute
+      end
+    end
+  end
+end

data/lib/{rails-data-explorer → rails_data_explorer}/statistics/rng_category.rb RENAMED

@@ -1,7 +1,17 @@
+# -*- coding: utf-8 -*-
 class RailsDataExplorer
   module Statistics
+    # Responsibilities:
+    #  * Provide random categorical data. Useful for testing and demo data.
+    #
     class RngCategory
+      # @param categories [Array<Object>] the pool of available categories.
+      # @param category_probabilities [Array, optional] probability of each category.
+      # @param rng [Proc, optional] lambda to generate random numbers which will
+      #            be mapped to categories.
       def initialize(categories, category_probabilities = nil, rng = lambda { Kernel.rand })
         @categories, @category_probabilities, @rng = categories, category_probabilities, rng
         @category_probabilities ||= @categories.map { |e| @rng.call }
@@ -9,6 +19,7 @@ class RailsDataExplorer
         @category_order = compute_category_order
       end
+      # Returns a random category
       def rand
         r_v = @rng.call
         rnd = @category_order.detect { |e|
@@ -17,6 +28,8 @@ class RailsDataExplorer
         rnd[:category]
       end
+    protected
       def normalize_category_probabilities
         total = @category_probabilities.inject(0) { |m,e| m += e }
         @category_probabilities.map { |e| e / total.to_f }

data/lib/{rails-data-explorer → rails_data_explorer}/statistics/rng_gaussian.rb RENAMED

@@ -1,12 +1,23 @@
-# From http://stackoverflow.com/a/9266488
+# -*- coding: utf-8 -*-
 class RailsDataExplorer
   module Statistics
+    # Responsibilities:
+    #  * Provide random numeric data, following a gaussian distribution.
+    #
+    # From http://stackoverflow.com/a/9266488
     class RngGaussian
+      # @param mean [Float] the expected mean
+      # @param sd [Float] the expected standard deviation
+      # @param rng [Proc, optional] a random number generator
       def initialize(mean = 0.0, sd = 1.0, rng = lambda { Kernel.rand })
         @mean, @sd, @rng = mean, sd, rng
         @compute_next_pair = false
       end
+      # Returns random numbers with a gaussian distribution.
       def rand
         if (@compute_next_pair = !@compute_next_pair)
           # Compute a pair of random values with normal distribution.