RubyGems - sciruby - Versions diffs - 0.1.0 → 0.1.3 - Mend

sciruby 0.1.0 → 0.1.3

Files changed (121) hide show

data/.autotest +23 -0
data/.gemtest +0 -0
data/History.txt +6 -0
data/Manifest.txt +119 -0
data/Rakefile +178 -0
data/bin/sciruby-plotter +12 -0
data/data/r/man/AirPassengers.Rd +51 -0
data/data/r/man/BJsales.Rd +34 -0
data/data/r/man/BOD.Rd +53 -0
data/data/r/man/ChickWeight.Rd +68 -0
data/data/r/man/DNase.Rd +63 -0
data/data/r/man/EuStockMarkets.Rd +28 -0
data/data/r/man/Formaldehyde.Rd +44 -0
data/data/r/man/HairEyeColor.Rd +77 -0
data/data/r/man/Harman23.cor.Rd +25 -0
data/data/r/man/Harman74.cor.Rd +28 -0
data/data/r/man/Indometh.Rd +57 -0
data/data/r/man/InsectSprays.Rd +45 -0
data/data/r/man/JohnsonJohnson.Rd +37 -0
data/data/r/man/LakeHuron.Rd +27 -0
data/data/r/man/LifeCycleSavings.Rd +54 -0
data/data/r/man/Loblolly.Rd +56 -0
data/data/r/man/Nile.Rd +78 -0
data/data/r/man/Orange.Rd +57 -0
data/data/r/man/OrchardSprays.Rd +62 -0
data/data/r/man/PlantGrowth.Rd +39 -0
data/data/r/man/Puromycin.Rd +84 -0
data/data/r/man/Theoph.Rd +84 -0
data/data/r/man/Titanic.Rd +73 -0
data/data/r/man/ToothGrowth.Rd +40 -0
data/data/r/man/UCBAdmissions.Rd +68 -0
data/data/r/man/UKDriverDeaths.Rd +72 -0
data/data/r/man/UKLungDeaths.Rd +40 -0
data/data/r/man/UKgas.Rd +25 -0
data/data/r/man/USAccDeaths.Rd +23 -0
data/data/r/man/USArrests.Rd +45 -0
data/data/r/man/USJudgeRatings.Rd +38 -0
data/data/r/man/USPersonalExpenditure.Rd +33 -0
data/data/r/man/VADeaths.Rd +51 -0
data/data/r/man/WWWusage.Rd +41 -0
data/data/r/man/WorldPhones.Rd +40 -0
data/data/r/man/ability.cov.Rd +50 -0
data/data/r/man/airmiles.Rd +29 -0
data/data/r/man/airquality.Rd +56 -0
data/data/r/man/anscombe.Rd +62 -0
data/data/r/man/attenu.Rd +66 -0
data/data/r/man/attitude.Rd +48 -0
data/data/r/man/austres.Rd +22 -0
data/data/r/man/beavers.Rd +73 -0
data/data/r/man/cars.Rd +59 -0
data/data/r/man/chickwts.Rd +47 -0
data/data/r/man/co2.Rd +43 -0
data/data/r/man/crimtab.Rd +129 -0
data/data/r/man/datasets-package.Rd +24 -0
data/data/r/man/discoveries.Rd +30 -0
data/data/r/man/esoph.Rd +66 -0
data/data/r/man/euro.Rd +56 -0
data/data/r/man/eurodist.Rd +25 -0
data/data/r/man/faithful.Rd +63 -0
data/data/r/man/freeny.Rd +56 -0
data/data/r/man/infert.Rd +56 -0
data/data/r/man/iris.Rd +62 -0
data/data/r/man/islands.Rd +29 -0
data/data/r/man/lh.Rd +22 -0
data/data/r/man/longley.Rd +56 -0
data/data/r/man/lynx.Rd +33 -0
data/data/r/man/morley.Rd +50 -0
data/data/r/man/mtcars.Rd +44 -0
data/data/r/man/nhtemp.Rd +30 -0
data/data/r/man/nottem.Rd +30 -0
data/data/r/man/occupationalStatus.Rd +44 -0
data/data/r/man/precip.Rd +31 -0
data/data/r/man/presidents.Rd +36 -0
data/data/r/man/pressure.Rd +41 -0
data/data/r/man/quakes.Rd +40 -0
data/data/r/man/randu.Rd +46 -0
data/data/r/man/rivers.Rd +21 -0
data/data/r/man/rock.Rd +34 -0
data/data/r/man/sleep.Rd +51 -0
data/data/r/man/stackloss.Rd +77 -0
data/data/r/man/state.Rd +80 -0
data/data/r/man/sunspot.month.Rd +49 -0
data/data/r/man/sunspot.year.Rd +26 -0
data/data/r/man/sunspots.Rd +33 -0
data/data/r/man/swiss.Rd +79 -0
data/data/r/man/treering.Rd +38 -0
data/data/r/man/trees.Rd +48 -0
data/data/r/man/uspop.Rd +27 -0
data/data/r/man/volcano.Rd +31 -0
data/data/r/man/warpbreaks.Rd +56 -0
data/data/r/man/women.Rd +40 -0
data/data/r/man/zCO2.Rd +81 -0
data/lib/ext/csv.rb +22 -0
data/lib/ext/shoes.rb +131 -0
data/lib/ext/string.rb +39 -0
data/lib/sciruby.rb +50 -4
data/lib/sciruby/analysis.rb +98 -0
data/lib/sciruby/analysis/suite.rb +87 -0
data/lib/sciruby/analysis/suite_report_builder.rb +44 -0
data/lib/sciruby/config.rb +93 -0
data/lib/sciruby/data.rb +168 -0
data/lib/sciruby/data/guardian.rb +96 -0
data/lib/sciruby/data/r.rb +155 -0
data/lib/sciruby/data/r/base.rb +110 -0
data/lib/sciruby/data/r/data_frame.rb +24 -0
data/lib/sciruby/data/r/grouped_data.rb +7 -0
data/lib/sciruby/data/r/list.rb +20 -0
data/lib/sciruby/data/r/multi_time_series.rb +24 -0
data/lib/sciruby/data/r/r_matrix.rb +7 -0
data/lib/sciruby/data/r/time_series.rb +19 -0
data/lib/sciruby/data/r/time_series_base.rb +40 -0
data/lib/sciruby/data/r/vector.rb +125 -0
data/lib/sciruby/editor.rb +82 -0
data/lib/sciruby/plotter.rb +128 -0
data/lib/sciruby/recommend.rb +4 -0
data/lib/sciruby/validation.rb +368 -0
data/readme.md +75 -0
data/static/sciruby-icon.png +0 -0
data/test/helpers_tests.rb +58 -0
data/test/test_recommend.rb +16 -0
metadata +396 -20

data/lib/sciruby/data/guardian.rb ADDED Viewed

@@ -0,0 +1,96 @@
+module SciRuby
+  module Data
+    # World Government Data from the Guardian.
+    class Guardian < PublicSearcher
+      QUERY_DOMAIN = %q{www.guardian.co.uk}
+      QUERY_PATH   = %q{/world-government-data/search.json}
+      FOUR_OH_FOUR_MESSAGE = '404 Page not found'
+      ALLOWED_FORMATS = [:csv, :excel]
+      class DatasetInfo < ::OpenStruct
+        def initialize h
+          super h
+          self.download_links.each_index do |i|
+            self.download_links[i] = ::OpenStruct.new(self.download_links[i])
+          end
+        end
+      end
+      # Search the site or database using some set of parameters.
+      #
+      # This function is the one that you should redefine if you want to require certain parameters, or if there are
+      # parameter co-dependencies. Ultimately, you call `search_internal(params)`.
+      #
+      # == Arguments
+      # * q: keywords (default: '', if no other parameters are supplied)
+      # * facet_country: country code abbreviation to search
+      # * facet_source_title: e.g., data from Australian government would be data.nsw.org.au
+      # * facet_format: e.g., csv, excel, xml, shapefile, kml
+      def initialize args={}
+        #args[:facet_format] ||= :csv
+        #@require_format ||= args[:facet_format] # This should be removed when we can interpret other formats.
+        @search_result = search(args)
+      end
+      # Return dataset meta-data found in the search, hashed by source_id. So, do datasets.keys if you want a list of
+      # source_ids.
+      def datasets
+        @datasets ||= begin
+          h = {}
+          search_result["results"].each do |res|
+            h[res['source_id']] = DatasetInfo.new(res)
+          end
+          h
+        end
+      end
+      # Download a specific dataset by +source_id+ and cache it in the searcher. Returns a Statsample::Dataset.
+      #
+      # If this raises an exception, you can try this:
+      #
+      #     links = raw_dataset_links_cached(source_id)
+      #
+      # And then for each of +links+, do `raw_dataset(source_id, link)` to see what the actual downloaded data was.
+      # This is good for debugging -- e.g., did the page move? or is there something wrong with Ruby's CSV interpreter?
+      # Or is it in some other format altogether?
+      #
+      # Right now, this function only handles CSV. TODO: Add more format handlers!
+      def dataset source_id
+        @dataset ||= {}
+        @dataset[source_id] ||= begin # Datasets are stored by source ID
+          pos = 0
+          datasets[source_id].download_links.each do |link_info|
+            unless ALLOWED_FORMATS.include?(link_info.format)
+              pos += 1
+              next # Format is incorrect.
+            end
+            # Format appears to be correct, prior to actually downloading. Proceed.
+            # Attempt to read the cached one first, and if that fails, try downloading.
+            raw = cached_dataset(source_id) || download_dataset(link_info.link)
+            begin
+              ds  = parse_dataset link_info.format, raw, datasets[source_id].title
+              cache_dataset(source_id, raw, link_info.format)
+            rescue TypeError => e
+              if pos == datasets[source_id].download_links.size - 1
+                raise DatasetNotFoundError.new(e)
+              end
+            ensure
+              pos += 1
+            end
+            return ds unless ds.nil?
+          end
+        end
+      end
+    end
+  end
+end

data/lib/sciruby/data/r.rb ADDED Viewed

@@ -0,0 +1,155 @@
+module SciRuby
+  module Data
+    # R data module.
+    class R < Base
+      DIR = Pathname.new(__FILE__).realpath.dirname.to_s
+      require "simpler"
+      # Attempt to parse an R dataset through simpler. Works with most datasets (but not for table, dist, or array).
+      #
+      # Note that not all of these datasets have functions for converting directly to Statsample or SciRuby types. In
+      # other words, parsing works, but it may not be as simple as calling to_dataset or to_h (yet).
+      #
+      # TODO: Add basic conversion functions like to_h, to_a, etc.
+      #
+      # == R datasets that don't work
+      # * crimtab (table)
+      # * eurodist (dist)
+      # * HairEyeColor (table)
+      # * iris3 (array)
+      # * occupationalStatus (table)
+      # * Titanic (table)
+      # * UCBAdmissions (table)
+      # * volcano (matrix): TODO: Handle non-named rows and columns in matrix
+      #
+      # TODO: rownames that are just counters need to be ignored in some cases, e.g., Puromycin
+      #
+      # == R datasets that work partially
+      # * chickwts: doesn't know how to handle levels, but still loads them as strings.
+      def dataset id
+        begin
+          r(id)
+        rescue Simpler::RError => e
+          raise DatasetNotFoundError.new(e)
+        end
+      end
+      # TODO: Fix so that aggregate datasets, like state, are listed properly in search results.
+      def search args={}
+        parse_datasets_index(r.eval! { %q{library(help="datasets")} })
+      end
+      alias_method :datasets, :search
+      # Alias for self.r.
+      def r obj=nil; SciRuby::Data::R.r(obj); end
+      class << self
+        def in_dir &block
+          SciRuby::Data.in_dir {  Dir.chdir('r') { yield } }
+        end
+        def in_man_dir &block
+          in_dir {   Dir.chdir('man') { yield } }
+        end
+        # With an argument, this function attempts to read from R some variable (probably a built-in dataset).
+        # Without an argument, this function gives access to the R console. See also: simpler by jtprince on github.
+        def r obj=nil
+          require "simpler"
+          @@r ||= ::Simpler.new
+          unless obj.nil?
+            r_class = Base.class(obj)
+            if r_class == 'numeric' || r_class == 'integer' || r_class == 'ordered' || r_class == 'factor' || r_class == 'character'
+              return Vector.new(obj)
+            elsif r_class == 'data.frame'
+              return DataFrame.new(obj)
+            elsif r_class == 'nfnGroupedData'
+              return GroupedData.new(obj)
+            elsif r_class == 'matrix'
+              return RMatrix.new(obj)
+            elsif r_class == 'ts'
+              return TimeSeries.new(obj)
+            elsif r_class == 'mts'
+              return MultiTimeSeries.new(obj)
+            elsif r_class == 'list'
+              return List.new(obj).to_h
+            else
+              raise(NotImplementedError, "Don't know how to recognize class #{r_class} yet.")
+            end
+          end
+          return @@r
+        end
+      end
+      # Hacked together tex parser to extract useful information from .Rd R manual files. Unlikely to work on any other
+      # TeX or LaTeX files.
+      class Man < OpenStruct
+        class << self
+          def in_dir &block
+            SciRuby::Data::R.in_man_dir { yield }
+          end
+        end
+        def in_dir &block
+          SciRuby::Data::R::Man.in_dir { yield }
+        end
+        def initialize dataset_id
+          h = {}
+          in_dir do
+            raw = File.read("#{dataset_id}.Rd")
+            entries = raw.split("\n\\") # this is a total hack
+            entries.each do |entry|
+              next if entry =~ /^%/
+              command, content = entry.split('{', 2)
+              h[command.underscore] = content.strip.gsub(/}$/, '').gsub(/\n$/, '')
+            end
+          end
+          super(h)
+        end
+      end
+    protected
+      # Listing of datasets read directly from R.
+      def parse_datasets_index raw
+        h = {}
+        mode = :pre
+        last_key = nil
+        raw.split("\n").each do |line|
+          next if mode == :pre && line !~ /^Index\:/
+          mode = :index
+          next if line =~ /^Index\:/
+          next if line.strip.empty?
+          if line =~ /^ /
+            h[last_key] = [h[last_key], line.strip].join(' ')
+          else
+            k, v        = line.split(' ', 2)
+            last_key    = k.strip
+            h[last_key] = v.strip
+          end
+        end
+        h
+      end
+    end
+  end
+end
+require File.join(SciRuby::Data::R::DIR, 'r', 'base.rb')
+require File.join(SciRuby::Data::R::DIR, 'r', 'data_frame.rb')
+require File.join(SciRuby::Data::R::DIR, 'r', 'time_series_base.rb')
+require File.join(SciRuby::Data::R::DIR, 'r', 'time_series.rb')
+require File.join(SciRuby::Data::R::DIR, 'r', 'multi_time_series.rb')
+require File.join(SciRuby::Data::R::DIR, 'r', 'vector.rb')
+require File.join(SciRuby::Data::R::DIR, 'r', 'r_matrix.rb')
+require File.join(SciRuby::Data::R::DIR, 'r', 'grouped_data.rb')
+require File.join(SciRuby::Data::R::DIR, 'r', 'list.rb')

data/lib/sciruby/data/r/base.rb ADDED Viewed

@@ -0,0 +1,110 @@
+module SciRuby::Data
+  class R
+    # Parses datasets from R directly.
+    class Base
+      FLOAT_RE = /([.eE])/
+      require "simpler"
+      def initialize id
+        @rob  = id # R object name
+        assign_properties # Read as many properties as possible from R
+      end
+      def self.class obj
+        #STDERR.puts "obj=#{obj}"
+        Base.new(obj).send :read_class
+      end
+      attr_reader :rob
+      alias_method :rname, :rob
+    protected
+      def assign_properties; end
+      def r obj=nil
+        SciRuby::Data::R.r(obj)
+      end
+      def float_re
+        SciRuby::Data::R::Base::FLOAT_RE
+      end
+      def call_function fn=nil
+        #STDERR.puts "Call function: #{fn.to_s}\t#{rob}"
+        fn.nil? ? r.eval! { rob } : r.eval! { "#{fn.to_s}(#{rob})" }
+      end
+      def call_property prop
+        r.eval! { "#{rob}$'#{prop.to_s}'"}
+      end
+      def read_class fn=:class
+        read_single_line(fn).first
+      end
+      def read_single_line fn=nil
+        line = call_function fn
+        #STDERR.puts "rsl Got back: #{line}"
+        CSV::parse_line(line.split(' ', 2).tap{ |s| s.shift }.first, :col_sep => ' ')
+      end
+      def read_single_token fn=nil
+        line = call_function fn
+        #STDERR.puts "rst Got back: #{line}"
+        line.split.tap{ |s| s.shift }.first
+      end
+      # Read multiple lines from a function call. You can also pass in a block if you want to ask for a property instead
+      # of a function call, e.g.,
+      #     read_multiple_lines { call_property('height') }
+      def read_multiple_lines fn=nil
+        lines = block_given? ? yield : call_function(fn)
+        #STDERR.puts "rml Got back:\n#{lines}"
+        lines = lines.split("\n")
+        return nil if lines.first =~ /^NULL/
+        if lines.first =~ /^ *\[/
+          return lines.map do |line|
+            remaining_line = CSV::parse_line(line.split(' ', 2).tap { |s| s.shift }.first, :col_sep => ' ')
+            remaining_line = remaining_line.tap { |l| l.pop } if remaining_line.last.nil?
+            remaining_line
+          end.flatten
+        end
+        raise "Unrecognized R output"
+      end
+      def read_row_names fn='rownames'
+        attempt = read_multiple_lines(fn) # may return nil if no rownames found.
+        return [] if attempt.nil?
+        attempt
+      end
+      def read_col_names fn='colnames'
+        read_row_names fn
+      end
+      def read_names fn='names'
+        read_row_names fn
+      end
+      def read_levels fn='levels'
+        read_row_names fn
+      end
+      def read_columns fields
+        columns = {}
+        fields.each do |field|
+          raise(ArgumentError, "nil field") if field.nil?
+          columns_for_field = SciRuby::Data::R.r("#{rob}[,'#{field.to_s}']")
+          columns[field] = (columns_for_field.is_a?(Vector) && columns_for_field.has_levels?) || columns_for_field.is_a?(TimeSeries) ? columns_for_field : columns_for_field.to_a
+        end
+        columns
+      end
+    end
+  end
+end

data/lib/sciruby/data/r/data_frame.rb ADDED Viewed

@@ -0,0 +1,24 @@
+module SciRuby::Data
+  class R
+    class DataFrame < Base
+      attr_reader :row_names, :columns
+      def col_names
+        columns.keys
+      end
+      def levels col_name
+        columns[col_name].levels
+      end
+    protected
+      def assign_properties
+        @row_names = read_row_names
+        col_names  = read_col_names
+        @columns   = read_columns(col_names)
+      end
+    end
+  end
+end

data/lib/sciruby/data/r/grouped_data.rb ADDED Viewed

@@ -0,0 +1,7 @@
+module SciRuby::Data
+  class R
+    # Wrapper used for R objects of class 'nfnGroupedData'. Adds nothing to DataFrame.
+    class GroupedData < DataFrame
+    end
+  end
+end

data/lib/sciruby/data/r/list.rb ADDED Viewed

@@ -0,0 +1,20 @@
+module SciRuby::Data
+  class R
+    # An intermediate object that doesn't really get used -- immediately gets converted to a Ruby Hash of other R objects.
+    class List < Base
+      def to_h
+        @data
+      end
+    protected
+      def assign_properties
+        @names = read_names
+        @names = nil if @names.nil? || (@names.is_a?(Array) && @names.empty?)
+        @data = {}
+        @names.each do |list_item|
+          @data[list_item] = r("#{rob}[['#{list_item}']]")
+        end
+      end
+    end
+  end
+end

data/lib/sciruby/data/r/multi_time_series.rb ADDED Viewed

@@ -0,0 +1,24 @@
+module SciRuby::Data
+  class R
+    # class 'mts' in R
+    class MultiTimeSeries < TimeSeriesBase
+      attr_reader :row_names, :columns
+      def col_names
+        columns.keys
+      end
+      def levels col_name
+        columns[col_name].levels
+      end
+    protected
+      def assign_properties
+        @row_names = read_row_names
+        col_names  = read_col_names
+        @columns   = read_columns(col_names)
+        super
+      end
+    end
+  end
+end

data/lib/sciruby/data/r/r_matrix.rb ADDED Viewed

@@ -0,0 +1,7 @@
+module SciRuby::Data
+  class R
+    # Wrapper used for R objects of class 'matrix'. Adds nothing to DataFrame.
+    class RMatrix < DataFrame
+    end
+  end
+end