sciruby 0.1.0 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (121) hide show
  1. data/.autotest +23 -0
  2. data/.gemtest +0 -0
  3. data/History.txt +6 -0
  4. data/Manifest.txt +119 -0
  5. data/Rakefile +178 -0
  6. data/bin/sciruby-plotter +12 -0
  7. data/data/r/man/AirPassengers.Rd +51 -0
  8. data/data/r/man/BJsales.Rd +34 -0
  9. data/data/r/man/BOD.Rd +53 -0
  10. data/data/r/man/ChickWeight.Rd +68 -0
  11. data/data/r/man/DNase.Rd +63 -0
  12. data/data/r/man/EuStockMarkets.Rd +28 -0
  13. data/data/r/man/Formaldehyde.Rd +44 -0
  14. data/data/r/man/HairEyeColor.Rd +77 -0
  15. data/data/r/man/Harman23.cor.Rd +25 -0
  16. data/data/r/man/Harman74.cor.Rd +28 -0
  17. data/data/r/man/Indometh.Rd +57 -0
  18. data/data/r/man/InsectSprays.Rd +45 -0
  19. data/data/r/man/JohnsonJohnson.Rd +37 -0
  20. data/data/r/man/LakeHuron.Rd +27 -0
  21. data/data/r/man/LifeCycleSavings.Rd +54 -0
  22. data/data/r/man/Loblolly.Rd +56 -0
  23. data/data/r/man/Nile.Rd +78 -0
  24. data/data/r/man/Orange.Rd +57 -0
  25. data/data/r/man/OrchardSprays.Rd +62 -0
  26. data/data/r/man/PlantGrowth.Rd +39 -0
  27. data/data/r/man/Puromycin.Rd +84 -0
  28. data/data/r/man/Theoph.Rd +84 -0
  29. data/data/r/man/Titanic.Rd +73 -0
  30. data/data/r/man/ToothGrowth.Rd +40 -0
  31. data/data/r/man/UCBAdmissions.Rd +68 -0
  32. data/data/r/man/UKDriverDeaths.Rd +72 -0
  33. data/data/r/man/UKLungDeaths.Rd +40 -0
  34. data/data/r/man/UKgas.Rd +25 -0
  35. data/data/r/man/USAccDeaths.Rd +23 -0
  36. data/data/r/man/USArrests.Rd +45 -0
  37. data/data/r/man/USJudgeRatings.Rd +38 -0
  38. data/data/r/man/USPersonalExpenditure.Rd +33 -0
  39. data/data/r/man/VADeaths.Rd +51 -0
  40. data/data/r/man/WWWusage.Rd +41 -0
  41. data/data/r/man/WorldPhones.Rd +40 -0
  42. data/data/r/man/ability.cov.Rd +50 -0
  43. data/data/r/man/airmiles.Rd +29 -0
  44. data/data/r/man/airquality.Rd +56 -0
  45. data/data/r/man/anscombe.Rd +62 -0
  46. data/data/r/man/attenu.Rd +66 -0
  47. data/data/r/man/attitude.Rd +48 -0
  48. data/data/r/man/austres.Rd +22 -0
  49. data/data/r/man/beavers.Rd +73 -0
  50. data/data/r/man/cars.Rd +59 -0
  51. data/data/r/man/chickwts.Rd +47 -0
  52. data/data/r/man/co2.Rd +43 -0
  53. data/data/r/man/crimtab.Rd +129 -0
  54. data/data/r/man/datasets-package.Rd +24 -0
  55. data/data/r/man/discoveries.Rd +30 -0
  56. data/data/r/man/esoph.Rd +66 -0
  57. data/data/r/man/euro.Rd +56 -0
  58. data/data/r/man/eurodist.Rd +25 -0
  59. data/data/r/man/faithful.Rd +63 -0
  60. data/data/r/man/freeny.Rd +56 -0
  61. data/data/r/man/infert.Rd +56 -0
  62. data/data/r/man/iris.Rd +62 -0
  63. data/data/r/man/islands.Rd +29 -0
  64. data/data/r/man/lh.Rd +22 -0
  65. data/data/r/man/longley.Rd +56 -0
  66. data/data/r/man/lynx.Rd +33 -0
  67. data/data/r/man/morley.Rd +50 -0
  68. data/data/r/man/mtcars.Rd +44 -0
  69. data/data/r/man/nhtemp.Rd +30 -0
  70. data/data/r/man/nottem.Rd +30 -0
  71. data/data/r/man/occupationalStatus.Rd +44 -0
  72. data/data/r/man/precip.Rd +31 -0
  73. data/data/r/man/presidents.Rd +36 -0
  74. data/data/r/man/pressure.Rd +41 -0
  75. data/data/r/man/quakes.Rd +40 -0
  76. data/data/r/man/randu.Rd +46 -0
  77. data/data/r/man/rivers.Rd +21 -0
  78. data/data/r/man/rock.Rd +34 -0
  79. data/data/r/man/sleep.Rd +51 -0
  80. data/data/r/man/stackloss.Rd +77 -0
  81. data/data/r/man/state.Rd +80 -0
  82. data/data/r/man/sunspot.month.Rd +49 -0
  83. data/data/r/man/sunspot.year.Rd +26 -0
  84. data/data/r/man/sunspots.Rd +33 -0
  85. data/data/r/man/swiss.Rd +79 -0
  86. data/data/r/man/treering.Rd +38 -0
  87. data/data/r/man/trees.Rd +48 -0
  88. data/data/r/man/uspop.Rd +27 -0
  89. data/data/r/man/volcano.Rd +31 -0
  90. data/data/r/man/warpbreaks.Rd +56 -0
  91. data/data/r/man/women.Rd +40 -0
  92. data/data/r/man/zCO2.Rd +81 -0
  93. data/lib/ext/csv.rb +22 -0
  94. data/lib/ext/shoes.rb +131 -0
  95. data/lib/ext/string.rb +39 -0
  96. data/lib/sciruby.rb +50 -4
  97. data/lib/sciruby/analysis.rb +98 -0
  98. data/lib/sciruby/analysis/suite.rb +87 -0
  99. data/lib/sciruby/analysis/suite_report_builder.rb +44 -0
  100. data/lib/sciruby/config.rb +93 -0
  101. data/lib/sciruby/data.rb +168 -0
  102. data/lib/sciruby/data/guardian.rb +96 -0
  103. data/lib/sciruby/data/r.rb +155 -0
  104. data/lib/sciruby/data/r/base.rb +110 -0
  105. data/lib/sciruby/data/r/data_frame.rb +24 -0
  106. data/lib/sciruby/data/r/grouped_data.rb +7 -0
  107. data/lib/sciruby/data/r/list.rb +20 -0
  108. data/lib/sciruby/data/r/multi_time_series.rb +24 -0
  109. data/lib/sciruby/data/r/r_matrix.rb +7 -0
  110. data/lib/sciruby/data/r/time_series.rb +19 -0
  111. data/lib/sciruby/data/r/time_series_base.rb +40 -0
  112. data/lib/sciruby/data/r/vector.rb +125 -0
  113. data/lib/sciruby/editor.rb +82 -0
  114. data/lib/sciruby/plotter.rb +128 -0
  115. data/lib/sciruby/recommend.rb +4 -0
  116. data/lib/sciruby/validation.rb +368 -0
  117. data/readme.md +75 -0
  118. data/static/sciruby-icon.png +0 -0
  119. data/test/helpers_tests.rb +58 -0
  120. data/test/test_recommend.rb +16 -0
  121. metadata +396 -20
@@ -0,0 +1,96 @@
1
+ module SciRuby
2
+ module Data
3
+
4
+ # World Government Data from the Guardian.
5
+ class Guardian < PublicSearcher
6
+ QUERY_DOMAIN = %q{www.guardian.co.uk}
7
+ QUERY_PATH = %q{/world-government-data/search.json}
8
+ FOUR_OH_FOUR_MESSAGE = '404 Page not found'
9
+ ALLOWED_FORMATS = [:csv, :excel]
10
+
11
# Metadata for one search result. Every key of the result hash becomes an attribute, and each
# entry of +download_links+ is wrapped in its own OpenStruct so callers can write
# +link_info.format+ and +link_info.link+.
class DatasetInfo < ::OpenStruct
  # h:: Hash describing a single search result. Its download_links entry, when present, is an
  #     Array of Hashes.
  def initialize h
    super h
    # Build a fresh array rather than overwriting elements in place: OpenStruct stores a reference
    # to the caller's array, so in-place replacement would rewrite the hash that was passed in.
    # A result without download links ends up with an empty list instead of failing on nil.
    self.download_links = (self.download_links || []).map { |link| ::OpenStruct.new(link) }
  end
end
19
+
20
+
21
# Create a searcher and run a search straight away; the outcome of +search(args)+ is kept in
# @search_result.
#
# The original notes describe +search+ as the method to redefine when certain parameters are
# required or depend on each other, ultimately calling `search_internal(params)`.
#
# == Arguments
# * q: keywords (default: '', if no other parameters are supplied)
# * facet_country: country code abbreviation to search
# * facet_source_title: e.g., data from Australian government would be data.nsw.org.au
# * facet_format: e.g., csv, excel, xml, shapefile, kml
#
# NOTE: defaulting +facet_format+ to :csv (and remembering it in @require_format) is currently
# disabled; it was meant to go away once other formats can be interpreted.
def initialize(args = {})
  outcome = search(args)
  @search_result = outcome
end
37
+
38
# Dataset meta-data from the search, keyed by source_id (so +datasets.keys+ lists the source_ids).
# Each value is a DatasetInfo built from one entry of search_result["results"]. The hash is built
# once and memoized in @datasets.
def datasets
  return @datasets if @datasets

  @datasets = search_result["results"].each_with_object({}) do |res, by_id|
    by_id[res['source_id']] = DatasetInfo.new(res)
  end
end
49
+
50
# Download a specific dataset by +source_id+ and memoize it in the searcher. Returns whatever
# +parse_dataset+ produces (described in the original notes as a Statsample::Dataset).
#
# Download links are tried in order. Links whose format is not in ALLOWED_FORMATS are skipped, and
# a link whose payload raises TypeError while being parsed is passed over in favour of the next.
#
# Raises DatasetNotFoundError when +source_id+ is unknown or when no link yields a dataset.
#
# If this raises an exception, you can try this:
#
#   links = raw_dataset_links_cached(source_id)
#
# And then for each of +links+, do `raw_dataset(source_id, link)` to see what the actual downloaded
# data was. This is good for debugging -- e.g., did the page move? or is there something wrong with
# Ruby's CSV interpreter? Or is it in some other format altogether?
#
# Right now, this function only handles CSV. TODO: Add more format handlers!
def dataset source_id
  @dataset ||= {}
  # The lookup lives in a helper so that its early +return+ cannot skip this assignment.
  @dataset[source_id] ||= first_parsable_dataset(source_id)
end

# Walk the download links of +source_id+ and return the first dataset that parses.
# Raises DatasetNotFoundError if the id is unknown or every link is skipped or fails.
def first_parsable_dataset source_id
  info = datasets[source_id]
  # DatasetNotFoundError is only ever built from an exception in this class, so do the same here.
  raise DatasetNotFoundError.new(KeyError.new("unknown source_id: #{source_id.inspect}")) if info.nil?

  last_error = nil
  info.download_links.each do |link_info|
    # NOTE(review): formats are compared as-is against the symbols in ALLOWED_FORMATS -- confirm the
    # search results really carry symbols rather than strings.
    next unless ALLOWED_FORMATS.include?(link_info.format)

    # Attempt to read the cached one first, and if that fails, try downloading.
    raw = cached_dataset(source_id) || download_dataset(link_info.link)

    ds = nil
    begin
      ds = parse_dataset link_info.format, raw, info.title
      cache_dataset(source_id, raw, link_info.format)
    rescue TypeError => e
      last_error = e # remember why this link failed, then move on to the next one
    end

    return ds unless ds.nil?
  end

  raise DatasetNotFoundError.new(last_error || RuntimeError.new("no usable download link for #{source_id.inspect}"))
end
private :first_parsable_dataset
93
+
94
+ end
95
+ end
96
+ end
@@ -0,0 +1,155 @@
1
+ module SciRuby
2
+ module Data
3
+ # R data module.
4
+ class R < Base
5
+ DIR = Pathname.new(__FILE__).realpath.dirname.to_s
6
+
7
+ require "simpler"
8
+
9
# Attempt to parse an R dataset through simpler. Works with most datasets (but not for table, dist,
# or array).
#
# Not every dataset has functions for converting directly to Statsample or SciRuby types yet:
# parsing works, but it may not be as simple as calling to_dataset or to_h.
#
# TODO: Add basic conversion functions like to_h, to_a, etc.
#
# == R datasets that don't work
# * crimtab (table)
# * eurodist (dist)
# * HairEyeColor (table)
# * iris3 (array)
# * occupationalStatus (table)
# * Titanic (table)
# * UCBAdmissions (table)
# * volcano (matrix): TODO: Handle non-named rows and columns in matrix
#
# TODO: rownames that are just counters need to be ignored in some cases, e.g., Puromycin
#
# == R datasets that work partially
# * chickwts: doesn't know how to handle levels, but still loads them as strings.
#
# Raises DatasetNotFoundError (wrapping the Simpler::RError) when R reports an error.
def dataset(id)
  r(id)
rescue Simpler::RError => e
  raise DatasetNotFoundError.new(e)
end
37
+
38
# Ask R for the help page of the "datasets" library and turn its index into a Hash via
# +parse_datasets_index+. +args+ is accepted but not used.
#
# TODO: Fix so that aggregate datasets, like state, are listed properly in search results.
def search(args = {})
  listing = r.eval! { %q{library(help="datasets")} }
  parse_datasets_index(listing)
end
42
+ alias_method :datasets, :search
43
+
44
# Instance-level shortcut for the class-level SciRuby::Data::R.r.
def r(obj = nil)
  SciRuby::Data::R.r(obj)
end
46
+
47
+
48
class << self
  # Yield with the working directory changed to 'r' inside SciRuby::Data.in_dir.
  def in_dir &block
    SciRuby::Data.in_dir do
      Dir.chdir('r') { yield }
    end
  end

  # Yield with the working directory changed to 'man' inside +in_dir+.
  def in_man_dir &block
    in_dir do
      Dir.chdir('man') { yield }
    end
  end

  # With an argument, this function attempts to read from R some variable (probably a built-in
  # dataset) and wraps it in the class matching its R class. Without an argument, it gives access
  # to the shared R console. See also: simpler by jtprince on github.
  #
  # Raises NotImplementedError for R classes that have no wrapper yet.
  def r obj=nil
    require "simpler"
    @@r ||= ::Simpler.new # one console, created on first use
    return @@r if obj.nil?

    r_class = Base.class(obj)
    case r_class
    when 'numeric', 'integer', 'ordered', 'factor', 'character'
      Vector.new(obj)
    when 'data.frame'
      DataFrame.new(obj)
    when 'nfnGroupedData'
      GroupedData.new(obj)
    when 'matrix'
      RMatrix.new(obj)
    when 'ts'
      TimeSeries.new(obj)
    when 'mts'
      MultiTimeSeries.new(obj)
    when 'list'
      List.new(obj).to_h # lists are handed back as plain Hashes
    else
      raise(NotImplementedError, "Don't know how to recognize class #{r_class} yet.")
    end
  end
end
85
+
86
+
87
+
88
# Hacked together tex parser to extract useful information from .Rd R manual files. Unlikely to work
# on any other TeX or LaTeX files.
#
# Each top-level "\command{content}" entry becomes an attribute named after the underscored command.
class Man < OpenStruct
  class << self
    # Yield inside the directory that holds the .Rd files.
    def in_dir &block
      SciRuby::Data::R.in_man_dir { yield }
    end
  end

  # Instance-level shortcut for Man.in_dir.
  def in_dir &block
    SciRuby::Data::R::Man.in_dir { yield }
  end

  # dataset_id:: base name of the manual file; "#{dataset_id}.Rd" is read from the man directory.
  def initialize dataset_id
    h = {}
    in_dir do
      raw = File.read("#{dataset_id}.Rd")
      entries = raw.split("\n\\") # this is a total hack
      entries.each do |entry|
        # NOTE(review): ^ matches at every line start, so an entry that merely contains a comment
        # line is dropped as a whole -- confirm that is intended.
        next if entry =~ /^%/
        command, content = entry.split('{', 2)
        next if content.nil? # no "{...}" part, nothing to record
        # Only entries after the first lose their backslash to the split; drop it from the first one too.
        key = command.sub(/\A\\/, '').underscore
        h[key] = content.strip.gsub(/}$/, '').gsub(/\n$/, '')
      end
    end
    super(h)
  end
end
116
+
117
+ protected
118
+
119
# Listing of datasets read directly from R.
#
# raw:: text of the library help page. Everything up to and including the "Index:" line is
#       ignored; after it, each non-indented line is "name description" and each indented line
#       continues the description of the entry above it.
#
# Returns a Hash of dataset name => description ('' when a name has no description).
def parse_datasets_index raw
  index = {}
  in_index = false
  last_key = nil

  raw.split("\n").each do |line|
    if line =~ /^Index\:/
      in_index = true
      next
    end
    next unless in_index
    next if line.strip.empty?

    if line =~ /^ /
      next if last_key.nil? # continuation with nothing to continue
      index[last_key] = [index[last_key], line.strip].reject(&:empty?).join(' ')
    else
      key, description = line.split(' ', 2)
      last_key = key.strip
      index[last_key] = description.to_s.strip # description is nil for a bare name
    end
  end

  index
end
143
+ end
144
+ end
145
+ end
146
+
147
+ require File.join(SciRuby::Data::R::DIR, 'r', 'base.rb')
148
+ require File.join(SciRuby::Data::R::DIR, 'r', 'data_frame.rb')
149
+ require File.join(SciRuby::Data::R::DIR, 'r', 'time_series_base.rb')
150
+ require File.join(SciRuby::Data::R::DIR, 'r', 'time_series.rb')
151
+ require File.join(SciRuby::Data::R::DIR, 'r', 'multi_time_series.rb')
152
+ require File.join(SciRuby::Data::R::DIR, 'r', 'vector.rb')
153
+ require File.join(SciRuby::Data::R::DIR, 'r', 'r_matrix.rb')
154
+ require File.join(SciRuby::Data::R::DIR, 'r', 'grouped_data.rb')
155
+ require File.join(SciRuby::Data::R::DIR, 'r', 'list.rb')
@@ -0,0 +1,110 @@
1
+ module SciRuby::Data
2
+ class R
3
+ # Parses datasets from R directly.
4
+ class Base
5
+ FLOAT_RE = /([.eE])/
6
+
7
+ require "simpler"
8
+
9
# id:: name of the R object this wrapper stands for; kept in @rob.
# After storing it, +assign_properties+ is invoked so the object can read what it needs from R.
def initialize(id)
  @rob = id
  assign_properties
end
14
+
15
# With an argument: the R class name of +obj+, read through a throwaway Base wrapper.
# Without an argument: Ruby's usual +class+ result. This method shadows Object#class on Base and
# its subclasses, so the no-argument form has to keep working.
def self.class obj=nil
  return super() if obj.nil?
  Base.new(obj).send :read_class
end
19
+
20
+ attr_reader :rob
21
+ alias_method :rname, :rob
22
+
23
+ protected
24
# Hook called from +initialize+; does nothing here. DataFrame, List and MultiTimeSeries override it.
def assign_properties
end
25
+
26
# Shortcut for SciRuby::Data::R.r.
def r(obj = nil)
  SciRuby::Data::R.r obj
end
29
+
30
# The FLOAT_RE pattern defined on Base.
def float_re
  pattern = SciRuby::Data::R::Base::FLOAT_RE
  pattern
end
33
+
34
# Evaluate the wrapped object in R: bare when +fn+ is nil, otherwise as "fn(object)".
# Returns whatever +r.eval!+ returns.
def call_function(fn = nil)
  if fn.nil?
    r.eval! { rob }
  else
    r.eval! { "#{fn.to_s}(#{rob})" }
  end
end
38
+
39
# Evaluate "object$'prop'" in R and return the result of +r.eval!+.
def call_property(prop)
  r.eval! do
    "#{rob}$'#{prop.to_s}'"
  end
end
42
+
43
# First value printed by R for fn(object); with the default +fn+ that is the object's R class.
def read_class(fn = :class)
  values = read_single_line(fn)
  values.first
end
46
+
47
# Call +fn+ on the object and return the values printed on the first output line as an Array of
# Strings. The leading index token (e.g. "[1]") is dropped.
#
# Returns [] when the output has nothing after its first token.
def read_single_line fn=nil
  line = call_function fn
  payload = line.split(' ', 2)[1]
  return [] if payload.nil?

  # R pads columns with runs of spaces, which the space-separated parse reports as nil fields;
  # they carry no data (empty strings are printed quoted), so drop them.
  (CSV::parse_line(payload, :col_sep => ' ') || []).compact
end
52
+
53
# Call +fn+ on the object and return the second whitespace-separated token of the output,
# i.e. the first value after the leading index token.
def read_single_token(fn = nil)
  tokens = call_function(fn).split
  tokens.shift
  tokens.first
end
58
+
59
# Read multiple lines from a function call. You can also pass in a block if you want to ask for a
# property instead of a function call, e.g.,
#   read_multiple_lines { call_property('height') }
#
# Returns nil when R printed NULL, otherwise one flat Array with the values of every output line
# (each line's leading index token is dropped). Raises RuntimeError for output that does not start
# with an index token.
def read_multiple_lines fn=nil
  output = block_given? ? yield : call_function(fn)
  lines = output.split("\n")

  return nil if lines.first =~ /^NULL/
  raise "Unrecognized R output" unless lines.first =~ /^ *\[/

  lines.flat_map do |line|
    payload = line.split(' ', 2)[1]
    next [] if payload.nil?
    # Runs of padding spaces show up as nil fields; real empty strings are printed quoted.
    (CSV::parse_line(payload, :col_sep => ' ') || []).compact
  end
end
79
+
80
# Values printed by R for fn(object); an empty Array when +read_multiple_lines+ finds none (nil).
def read_row_names(fn = 'rownames')
  read_multiple_lines(fn) || []
end
85
+
86
# Same as +read_row_names+, but asks R for 'colnames' by default.
def read_col_names(fn = 'colnames')
  read_row_names(fn)
end
89
+
90
# Same as +read_row_names+, but asks R for 'names' by default.
def read_names(fn = 'names')
  read_row_names(fn)
end
93
+
94
# Same as +read_row_names+, but asks R for 'levels' by default.
def read_levels(fn = 'levels')
  read_row_names(fn)
end
97
+
98
# Fetch every column named in +fields+ from R and return them as a Hash of field => column.
# A Vector that has levels, or a TimeSeries, is stored as the wrapper object itself; anything else
# is stored as the result of +to_a+.
#
# Raises ArgumentError when a field name is nil.
def read_columns(fields)
  fields.each_with_object({}) do |field, columns|
    raise(ArgumentError, "nil field") if field.nil?

    column = SciRuby::Data::R.r("#{rob}[,'#{field.to_s}']")
    keep_wrapper = (column.is_a?(Vector) && column.has_levels?) || column.is_a?(TimeSeries)
    columns[field] = keep_wrapper ? column : column.to_a
  end
end
107
+
108
+ end
109
+ end
110
+ end
@@ -0,0 +1,24 @@
1
module SciRuby::Data
  class R
    # Wrapper for R objects of class 'data.frame': row names plus a Hash of column name => column.
    class DataFrame < Base
      attr_reader :row_names, :columns

      # Names of the columns, in the order they were read.
      def col_names
        columns.keys
      end

      # Levels of the column called +col_name+ (delegates to the stored column object).
      def levels(col_name)
        columns[col_name].levels
      end

      protected

      # Read row names, column names and then every column from R.
      def assign_properties
        @row_names = read_row_names
        @columns = read_columns(read_col_names)
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
module SciRuby::Data
  class R
    # Wrapper used for R objects of class 'nfnGroupedData'; currently behaves exactly like DataFrame.
    class GroupedData < DataFrame
    end
  end
end
@@ -0,0 +1,20 @@
1
+ module SciRuby::Data
2
+ class R
3
+ # An intermediate object that doesn't really get used -- immediately gets converted to a Ruby Hash of other R objects.
4
+ class List < Base
5
# The Hash of list item name => converted R object built by +assign_properties+.
def to_h
  @data
end
8
+ protected
9
# Read the list's item names from R and fetch every item through +r+, storing the results in @data
# keyed by item name. @names is nil (and @data empty) when the list has no names.
def assign_properties
  @names = read_names
  @names = nil if @names.nil? || (@names.is_a?(Array) && @names.empty?)

  @data = {}
  # @names may just have been reset to nil above, so iterate over an empty list in that case.
  (@names || []).each do |list_item|
    @data[list_item] = r("#{rob}[['#{list_item}']]")
  end
end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,24 @@
1
module SciRuby::Data
  class R
    # class 'mts' in R: several time series stored as named columns.
    class MultiTimeSeries < TimeSeriesBase
      attr_reader :row_names, :columns

      # Names of the columns, in the order they were read.
      def col_names
        columns.keys
      end

      # Levels of the column called +col_name+ (delegates to the stored column object).
      def levels(col_name)
        columns[col_name].levels
      end

      protected

      # Read row names and columns from R, then let TimeSeriesBase read its own properties.
      def assign_properties
        @row_names = read_row_names
        @columns = read_columns(read_col_names)
        super
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
module SciRuby::Data
  class R
    # Wrapper used for R objects of class 'matrix'; currently behaves exactly like DataFrame.
    class RMatrix < DataFrame
    end
  end
end