sciruby 0.1.0 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.autotest +23 -0
- data/.gemtest +0 -0
- data/History.txt +6 -0
- data/Manifest.txt +119 -0
- data/Rakefile +178 -0
- data/bin/sciruby-plotter +12 -0
- data/data/r/man/AirPassengers.Rd +51 -0
- data/data/r/man/BJsales.Rd +34 -0
- data/data/r/man/BOD.Rd +53 -0
- data/data/r/man/ChickWeight.Rd +68 -0
- data/data/r/man/DNase.Rd +63 -0
- data/data/r/man/EuStockMarkets.Rd +28 -0
- data/data/r/man/Formaldehyde.Rd +44 -0
- data/data/r/man/HairEyeColor.Rd +77 -0
- data/data/r/man/Harman23.cor.Rd +25 -0
- data/data/r/man/Harman74.cor.Rd +28 -0
- data/data/r/man/Indometh.Rd +57 -0
- data/data/r/man/InsectSprays.Rd +45 -0
- data/data/r/man/JohnsonJohnson.Rd +37 -0
- data/data/r/man/LakeHuron.Rd +27 -0
- data/data/r/man/LifeCycleSavings.Rd +54 -0
- data/data/r/man/Loblolly.Rd +56 -0
- data/data/r/man/Nile.Rd +78 -0
- data/data/r/man/Orange.Rd +57 -0
- data/data/r/man/OrchardSprays.Rd +62 -0
- data/data/r/man/PlantGrowth.Rd +39 -0
- data/data/r/man/Puromycin.Rd +84 -0
- data/data/r/man/Theoph.Rd +84 -0
- data/data/r/man/Titanic.Rd +73 -0
- data/data/r/man/ToothGrowth.Rd +40 -0
- data/data/r/man/UCBAdmissions.Rd +68 -0
- data/data/r/man/UKDriverDeaths.Rd +72 -0
- data/data/r/man/UKLungDeaths.Rd +40 -0
- data/data/r/man/UKgas.Rd +25 -0
- data/data/r/man/USAccDeaths.Rd +23 -0
- data/data/r/man/USArrests.Rd +45 -0
- data/data/r/man/USJudgeRatings.Rd +38 -0
- data/data/r/man/USPersonalExpenditure.Rd +33 -0
- data/data/r/man/VADeaths.Rd +51 -0
- data/data/r/man/WWWusage.Rd +41 -0
- data/data/r/man/WorldPhones.Rd +40 -0
- data/data/r/man/ability.cov.Rd +50 -0
- data/data/r/man/airmiles.Rd +29 -0
- data/data/r/man/airquality.Rd +56 -0
- data/data/r/man/anscombe.Rd +62 -0
- data/data/r/man/attenu.Rd +66 -0
- data/data/r/man/attitude.Rd +48 -0
- data/data/r/man/austres.Rd +22 -0
- data/data/r/man/beavers.Rd +73 -0
- data/data/r/man/cars.Rd +59 -0
- data/data/r/man/chickwts.Rd +47 -0
- data/data/r/man/co2.Rd +43 -0
- data/data/r/man/crimtab.Rd +129 -0
- data/data/r/man/datasets-package.Rd +24 -0
- data/data/r/man/discoveries.Rd +30 -0
- data/data/r/man/esoph.Rd +66 -0
- data/data/r/man/euro.Rd +56 -0
- data/data/r/man/eurodist.Rd +25 -0
- data/data/r/man/faithful.Rd +63 -0
- data/data/r/man/freeny.Rd +56 -0
- data/data/r/man/infert.Rd +56 -0
- data/data/r/man/iris.Rd +62 -0
- data/data/r/man/islands.Rd +29 -0
- data/data/r/man/lh.Rd +22 -0
- data/data/r/man/longley.Rd +56 -0
- data/data/r/man/lynx.Rd +33 -0
- data/data/r/man/morley.Rd +50 -0
- data/data/r/man/mtcars.Rd +44 -0
- data/data/r/man/nhtemp.Rd +30 -0
- data/data/r/man/nottem.Rd +30 -0
- data/data/r/man/occupationalStatus.Rd +44 -0
- data/data/r/man/precip.Rd +31 -0
- data/data/r/man/presidents.Rd +36 -0
- data/data/r/man/pressure.Rd +41 -0
- data/data/r/man/quakes.Rd +40 -0
- data/data/r/man/randu.Rd +46 -0
- data/data/r/man/rivers.Rd +21 -0
- data/data/r/man/rock.Rd +34 -0
- data/data/r/man/sleep.Rd +51 -0
- data/data/r/man/stackloss.Rd +77 -0
- data/data/r/man/state.Rd +80 -0
- data/data/r/man/sunspot.month.Rd +49 -0
- data/data/r/man/sunspot.year.Rd +26 -0
- data/data/r/man/sunspots.Rd +33 -0
- data/data/r/man/swiss.Rd +79 -0
- data/data/r/man/treering.Rd +38 -0
- data/data/r/man/trees.Rd +48 -0
- data/data/r/man/uspop.Rd +27 -0
- data/data/r/man/volcano.Rd +31 -0
- data/data/r/man/warpbreaks.Rd +56 -0
- data/data/r/man/women.Rd +40 -0
- data/data/r/man/zCO2.Rd +81 -0
- data/lib/ext/csv.rb +22 -0
- data/lib/ext/shoes.rb +131 -0
- data/lib/ext/string.rb +39 -0
- data/lib/sciruby.rb +50 -4
- data/lib/sciruby/analysis.rb +98 -0
- data/lib/sciruby/analysis/suite.rb +87 -0
- data/lib/sciruby/analysis/suite_report_builder.rb +44 -0
- data/lib/sciruby/config.rb +93 -0
- data/lib/sciruby/data.rb +168 -0
- data/lib/sciruby/data/guardian.rb +96 -0
- data/lib/sciruby/data/r.rb +155 -0
- data/lib/sciruby/data/r/base.rb +110 -0
- data/lib/sciruby/data/r/data_frame.rb +24 -0
- data/lib/sciruby/data/r/grouped_data.rb +7 -0
- data/lib/sciruby/data/r/list.rb +20 -0
- data/lib/sciruby/data/r/multi_time_series.rb +24 -0
- data/lib/sciruby/data/r/r_matrix.rb +7 -0
- data/lib/sciruby/data/r/time_series.rb +19 -0
- data/lib/sciruby/data/r/time_series_base.rb +40 -0
- data/lib/sciruby/data/r/vector.rb +125 -0
- data/lib/sciruby/editor.rb +82 -0
- data/lib/sciruby/plotter.rb +128 -0
- data/lib/sciruby/recommend.rb +4 -0
- data/lib/sciruby/validation.rb +368 -0
- data/readme.md +75 -0
- data/static/sciruby-icon.png +0 -0
- data/test/helpers_tests.rb +58 -0
- data/test/test_recommend.rb +16 -0
- metadata +396 -20
data/lib/ext/string.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
class String
  unless method_defined?(:constantize)
    # Based on constantize from ActiveSupport::Inflector.
    #
    # Resolves the receiver (e.g. "SciRuby::Data::Guardian") to the constant it
    # names, walking one namespace at a time starting from Object. A leading
    # "::" is ignored. Each segment is looked up without searching ancestors;
    # a segment that is not defined is handed to +const_missing+ on the
    # enclosing constant.
    def constantize
      names = split('::')
      names.shift if names.empty? || names.first.empty?

      names.inject(Object) do |constant, name|
        constant.const_defined?(name, false) ? constant.const_get(name) : constant.const_missing(name)
      end
    end
  end

  unless method_defined?(:camelize)
    # Adapted from camelize from ActiveSupport::Inflector.
    #
    # Converts "foo_bar" to "FooBar" and "foo/bar" to "Foo::Bar". When
    # +first_letter_in_uppercase+ is false only the very first character is
    # lower-cased ("foo_bar" becomes "fooBar").
    def camelize(first_letter_in_uppercase = true)
      camelized = to_s.gsub(/\/(.?)/) { "::#{Regexp.last_match(1).upcase}" }
                      .gsub(/(?:^|_)(.)/) { Regexp.last_match(1).upcase }
      return camelized if first_letter_in_uppercase || camelized.empty?

      # Lower-case the first character of the already camelized word; the
      # remainder must not be camelized a second time.
      camelized[0, 1].downcase + camelized[1..-1]
    end
  end

  unless method_defined?(:underscore)
    # Adapted from underscore from ActiveSupport::Inflector.
    #
    # Converts "SciRuby::DataFrame" to "sci_ruby/data_frame": "::" becomes "/",
    # word boundaries inside CamelCase become "_", "-" becomes "_", and the
    # result is lower-cased. The receiver is not modified.
    def underscore
      word = dup
      word.gsub!(/::/, '/')
      word.gsub!(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
      word.gsub!(/([a-z\d])([A-Z])/, '\1_\2')
      word.tr!("-", "_")
      word.downcase!
      word
    end
  end
end
|
data/lib/sciruby.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# =
|
1
|
+
# = sciruby.rb -
|
2
2
|
# SciRuby - Ruby scientific visualization and computation.
|
3
3
|
#
|
4
4
|
# Copyright (C) 2011 SciRuby Development Team
|
@@ -9,7 +9,7 @@
|
|
9
9
|
#
|
10
10
|
# This program is free software; you can redistribute it and/or
|
11
11
|
# modify it under the terms of the GNU General Public License
|
12
|
-
# as published by the Free Software Foundation; either version
|
12
|
+
# as published by the Free Software Foundation; either version 3
|
13
13
|
# of the License, or (at your option) any later version.
|
14
14
|
#
|
15
15
|
# This program is distributed in the hope that it will be useful,
|
@@ -24,8 +24,54 @@
|
|
24
24
|
# Specific notices will be placed where they are appropriate.
|
25
25
|
#
|
26
26
|
|
27
|
+
require "rubygems"
|
28
|
+
require "bundler/setup"
|
29
|
+
|
27
30
|
module SciRuby
  # Pathname is needed for DIR below; require it explicitly instead of relying
  # on another library having loaded it first.
  require "pathname"

  VERSION = '0.1.3'

  # Absolute, symlink-resolved path of the directory containing this file.
  DIR = Pathname.new(__FILE__).realpath.dirname.to_s

  # Core extensions used below (String#camelize / String#constantize) and CSV helpers.
  require File.join(::SciRuby::DIR, 'ext', 'string.rb')
  require File.join(::SciRuby::DIR, 'ext', 'csv.rb')

  class << self
    # Create a SciRuby::Plotter for +script+.
    def plot(script)
      SciRuby::Plotter.new script
    end

    # Load the "integration" library on first use and forward all arguments and
    # the block to Integration.integrate.
    def integrate(*args, &block)
      require "integration"
      ::Integration.integrate(*args, &block)
    end

    # Produce a list of datasets that can be loaded using the +dataset+ method
    def dataset_search(database, args = {})
      data_source_class(database).new(args).datasets.keys
    end

    # Load a dataset from a specific database. For a list of datasets, use `dataset_search(:guardian)`, for example.
    #
    # If the database raises DatabaseUnavailableError, a warning is printed and
    # the dataset is looked up through SciRuby::Data::Cacher instead.
    def dataset(database, source_id)
      data_source_class(database).new.dataset(source_id)
    rescue DatabaseUnavailableError
      warn "Database appears to be unavailable. Attempting to use cached version."
      SciRuby::Data::Cacher.new.dataset(source_id, database)
    end

    # Shorthand for SciRuby::Analysis.store(*args, &block)
    def analyze(*args, &block)
      SciRuby::Analysis.store(*args, &block)
    end

    private

    # Map a database identifier such as :guardian to the class
    # SciRuby::Data::Guardian.
    def data_source_class(database)
      "SciRuby::Data::#{database.to_s.camelize}".constantize
    end
  end

  autoload(:Analysis, File.join(DIR, 'sciruby', 'analysis'))
  autoload(:Config, File.join(DIR, 'sciruby', 'config'))
  autoload(:Editor, File.join(DIR, 'sciruby', 'editor'))
  autoload(:Plotter, File.join(DIR, 'sciruby', 'plotter'))
  autoload(:Recommend, File.join(DIR, 'sciruby', 'recommend'))
  autoload(:Validation, File.join(DIR, 'sciruby', 'validation'))
  autoload(:Data, File.join(DIR, 'sciruby', 'data'))
end

autoload(:Shoes, File.join(SciRuby::DIR, 'ext', 'shoes'))
|
@@ -0,0 +1,98 @@
|
|
1
|
+
require 'sciruby/analysis/suite'
|
2
|
+
require 'sciruby/analysis/suite_report_builder'
|
3
|
+
|
4
|
+
module SciRuby
  # DSL to run a statistical analysis without hassle.
  # * Shortcut methods to avoid having to use complete namespaces, many based on R.
  # * Attach/detach vectors to workspace, as with R
  # == Example
  #  an1 = SciRuby::Analysis.store(:first) do
  #    # Load excel file with x,y,z vectors
  #    ds = excel('data.xls')
  #    # See variables on ds dataset
  #    names(ds)
  #    # Attach the vectors to workspace, like R
  #    attach(ds)
  #    # vector 'x' is attached to workspace like a method,
  #    # so you can use like any variable
  #    mean,sd = x.mean, x.sd
  #    # Shameless R robbery
  #    a = c( 1:10)
  #    b = c(21:30)
  #    summary(cor(ds)) # Call summary method on correlation matrix
  #  end
  #  # You can run the analysis by its name
  #  SciRuby::Analysis.run(:first)
  #  # or using the returned variables
  #  an1.run
  #  # You can also generate a report using ReportBuilder.
  #  # .summary() method call 'report_building' on the object,
  #  # instead of calling text summary
  #  an1.generate("report.html")
  module Analysis
    @@stored_analyses = {}
    @@last_analysis = nil

    # Forget every stored analysis.
    def self.clear_analysis
      @@stored_analyses.clear
    end

    # Hash of stored analyses, keyed by the name given to +store+.
    def self.stored_analyses
      @@stored_analyses
    end

    # The analysis most recently registered with +store+.
    def self.last
      @@stored_analyses[@@last_analysis]
    end

    # Register the block as an analysis called +name+ and return the new Suite.
    # +opts+ is merged over <tt>{:name => name}</tt> and passed to Suite.new.
    # Raises if no block is given.
    def self.store(name, opts = Hash.new, &block)
      raise "You should provide a block" if !block
      @@last_analysis = name
      opts = { :name => name }.merge(opts)
      @@stored_analyses[name] = Suite.new(opts, &block)
    end

    # Run analysis +*args+
    # Without arguments, run all stored analyses
    # Only 'echo' will be printed to screen.
    def self.run(*args)
      args = stored_analyses.keys if args.size == 0
      ensure_stored(args)
      args.each do |name|
        stored_analyses[name].run
      end
    end

    # Add analysis +*args+ to a ReportBuilder object.
    # Without arguments, add all stored analyses.
    # Each analysis is wrapped inside a ReportBuilder::Section object.
    # This is the method used by +save+ and +to_text+.
    def self.add_to_reportbuilder(rb, *args)
      args = stored_analyses.keys if args.size == 0
      ensure_stored(args)
      args.each do |name|
        section = ReportBuilder::Section.new(:name => stored_analyses[name].name)
        rb_an = stored_analyses[name].add_to_reportbuilder(section)
        rb.add(section)
        rb_an.run
      end
    end

    # Save the analysis to a file.
    # Without arguments, adds all stored analyses.
    def self.save(filename, *args)
      rb = ReportBuilder.new(:name => filename)
      add_to_reportbuilder(rb, *args)
      rb.save(filename)
    end

    # Run analysis and return as string.
    # Only 'echo' will be printed to screen.
    # Without arguments, add all stored analyses.
    def self.to_text(*args)
      rb = ReportBuilder.new(:name => "Analysis #{Time.now}")
      add_to_reportbuilder(rb, *args)
      rb.to_text
    end

    # Run analysis and print to screen all echo and summary callings
    def self.run_batch(*args)
      puts to_text(*args)
    end

    # Raise unless every name in +names+ has been stored. The message lists
    # only the names that are actually missing.
    def self.ensure_stored(names)
      missing = names - stored_analyses.keys
      raise "Analysis #{missing.inspect} doesn't exists" unless missing.empty?
    end
    private_class_method :ensure_stored
  end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
require "statsample"
|
2
|
+
|
3
|
+
module SciRuby
  module Analysis
    # A single named analysis: a stored block plus the Statsample shorthand
    # methods it may call. Datasets can be attached so that their fields are
    # reachable as bare method names inside the block.
    class Suite
      include ::Statsample::Shorthand
      attr_accessor :output
      attr_accessor :name
      attr_reader :block

      # +opts+ may be a Hash (+:name+, +:output+) or, for convenience, just the
      # name. The name defaults to "Analysis <current time>" and the output to
      # STDOUT.
      def initialize(opts = Hash.new(), &block)
        opts = { :name => opts } if !opts.is_a? Hash

        @block = block
        @name = opts[:name] || "Analysis #{Time.now}"
        @attached = []
        @output = opts[:output] || ::STDOUT
      end

      # Run the analysis block. A block that declares no parameters is
      # instance_eval'd against this suite; otherwise the suite is passed to
      # the block as its argument.
      def run
        @block.arity < 1 ? instance_eval(&@block) : @block.call(self)
      end

      # Write a description of the procedure to the output.
      def desc(d)
        @output.puts("Description:")
        @output.puts("  #{d}")
      end

      # Write +args+ to the output.
      def echo(*args)
        @output.puts(*args)
      end

      # Return +obj.summary+.
      def summary(obj)
        obj.summary
      end

      # Build a SuiteReportBuilder with the same name and block that adds its
      # results to +rb+.
      def add_to_reportbuilder(rb)
        SuiteReportBuilder.new({ :name => name, :rb => rb }, &block)
      end

      # Run the block through a SuiteReportBuilder and save the report to +filename+.
      def generate(filename)
        ar = SuiteReportBuilder.new({ :name => name }, &block)
        ar.generate(filename)
      end

      # Run the block through a SuiteReportBuilder and return the report as text.
      def to_text
        ar = SuiteReportBuilder.new({ :name => name }, &block)
        ar.to_text
      end

      # Attach dataset +ds+; its fields become reachable through method_missing.
      def attach(ds)
        @attached.push(ds)
      end

      # Detach +ds+, or the most recently attached dataset when +ds+ is nil.
      def detach(ds = nil)
        if ds.nil?
          @attached.pop
        else
          @attached.delete(ds)
        end
      end

      # Keep the included plot builders reachable under old_* names; the
      # methods below (and SuiteReportBuilder) wrap them.
      alias :old_boxplot :boxplot
      alias :old_histogram :histogram
      alias :old_scatterplot :scatterplot

      # Write +svg+ to a timestamped file in the temp directory and open it
      # with xdg-open.
      def show_svg(svg)
        require 'tmpdir'
        fn = Dir.tmpdir + "/image_#{Time.now.to_f}.svg"
        File.open(fn, "w") { |fp| fp.write svg }
        `xdg-open '#{fn}'`
      end

      def boxplot(*args)
        show_svg(old_boxplot(*args).to_svg)
      end

      def histogram(*args)
        show_svg(old_histogram(*args).to_svg)
      end

      def scatterplot(*args)
        show_svg(old_scatterplot(*args).to_svg)
      end

      # Resolve an unknown method name as a field of an attached dataset, most
      # recently attached first. Raises a RuntimeError when no attached
      # dataset has such a field.
      def method_missing(name, *args, &block)
        @attached.reverse_each do |ds|
          return ds[name.to_s] if ds.fields.include?(name.to_s)
        end
        raise "Method #{name} doesn't exists"
      end

      # Keep respond_to? consistent with method_missing: true for field names
      # of attached datasets.
      def respond_to_missing?(name, include_private = false)
        (@attached || []).any? { |ds| ds.fields.include?(name.to_s) } || super
      end
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module SciRuby
  module Analysis
    # Suite variant that collects everything the analysis block produces
    # (descriptions, echoes, summaries, plots) into a ReportBuilder instead of
    # writing to the output stream.
    class SuiteReportBuilder < Suite
      attr_accessor :rb

      # Accepts the same +opts+ as Suite plus +:rb+, the ReportBuilder (or
      # section) to add to. A new ReportBuilder named after the suite is
      # created when +:rb+ is not given.
      def initialize(opts = Hash.new, &block)
        opts = { :name => opts } if !opts.is_a? Hash
        super(opts, &block)
        @rb = opts[:rb] || ReportBuilder.new(:name => name)
      end

      # Run the block (if any) and save the report to +filename+.
      def generate(filename)
        run if @block
        @rb.save(filename)
      end

      # Run the block (if any) and return the report as text.
      def to_text
        run if @block
        @rb.to_text
      end

      # Add the object itself to the report.
      def summary(o)
        @rb.add(o)
      end

      # Add the description to the report.
      def desc(d)
        @rb.add(d)
      end

      # Add each argument to the report.
      def echo(*args)
        args.each do |a|
          @rb.add(a)
        end
      end

      # The plot helpers add the plot object to the report rather than
      # displaying it as Suite does.
      def boxplot(*args)
        @rb.add(old_boxplot(*args))
      end

      def histogram(*args)
        @rb.add(old_histogram(*args))
      end

      # Previously a second copy of +boxplot+ stood here, leaving scatterplot
      # to fall through to Suite#scatterplot.
      def scatterplot(*args)
        @rb.add(old_scatterplot(*args))
      end
    end
  end
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
require "fileutils"

module SciRuby
  # Helpers for the per-user ~/.sciruby directory, where downloaded datasets
  # are cached. The directory helpers change the working directory for the
  # duration of the given block and return the block's value.
  module Config
    class << self

      # Create a .sciruby directory in the home directory if it doesn't exist and chdir to it.
      def dir
        Dir.chdir(Dir.home) do
          FileUtils.mkdir('.sciruby') unless Dir.exist?('.sciruby')
          Dir.chdir '.sciruby' do
            yield if block_given?
          end
        end
      end

      # Create a data dir in the .sciruby directory if it doesn't exist (data/) and chdir to it.
      def data_dir
        dir do
          FileUtils.mkdir('data') unless Dir.exist?('data')
          Dir.chdir 'data' do
            yield if block_given?
          end
        end
      end

      # Create a data source directory within the .sciruby dir for a given module, e.g.,
      # ~/.sciruby/data/guardian for Guardian, and chdir to it.
      #
      # +module_name+ is either a Symbol, used as the directory name as-is, or a
      # String holding a qualified module name: its first two namespace
      # components are dropped and the remainder is underscored (so
      # "SciRuby::Data::Guardian" gives "guardian").
      #
      # With +create+ false the directory is not created; it must already exist.
      def data_source_dir module_name, create=true
        dir_name = module_name.to_s if module_name.is_a?(Symbol)
        dir_name ||= module_name.split('::').drop(2).join('::').underscore
        data_dir do
          FileUtils.mkdir(dir_name) if create && !Dir.exist?(dir_name)
          Dir.chdir dir_name do
            yield if block_given?
          end
        end
      end


      # Add an extension to the basename for a dataset based on the format.
      def filename_for_dataset id, format=nil
        basename = basename_for_dataset(id)
        format.nil? ? basename : [basename, format.to_s].join('.')
      end

      # Generate a safe filename for a dataset by replacing every character other than
      # letters, digits and underscores with an underscore. This may need to be improved
      # to incorporate some kind of hash. Hopefully there will be no collisions.
      def basename_for_dataset id
        id.to_s.gsub(/[^a-zA-Z0-9\_]/, '_')
      end

      # Determines whether the basename for a cached dataset exists in some format or another.
      # Looks in the current directory; returns the first matching filename, or nil.
      def basename_exists? id
        Dir.glob("#{basename_for_dataset(id)}.*").first
      end

      # Store a given dataset in the .sciruby/data directory. Nothing is written if the
      # dataset is already cached in this or any other format.
      def cache_dataset module_name, dataset_id, file_contents, format
        for_dataset_filename(module_name, dataset_id, format) do |dataset_filename|
          unless File.exist?(dataset_filename) || basename_exists?(dataset_id)
            File.open(dataset_filename, 'w') do |file|
              file.write file_contents
            end
          end
        end
      end

      # In the data source directory, do something with the dataset cache file. e.g.,
      #     for_dataset_filename(:guardian, '963', :csv) do |dataset_filename|
      #       File.open(dataset_filename, 'w') do |f|
      #         f.write "Hello, world!"
      #       end
      #     end
      #
      # It computes the block arg (here, +dataset_filename+) for you using Config::filename_for_dataset. It also puts
      # you in the correct directory.
      #
      # This function is used by Config::cache_dataset.
      def for_dataset_filename module_name, dataset_id, format, &block
        data_source_dir module_name do
          yield filename_for_dataset(dataset_id, format)
        end
      end

      # Like for_dataset_filename, but yields the basename (no extension).
      def for_dataset_basename module_name, dataset_id, &block
        data_source_dir module_name do
          yield basename_for_dataset(dataset_id)
        end
      end

    end
  end
end
|
data/lib/sciruby/data.rb
ADDED
@@ -0,0 +1,168 @@
|
|
1
|
+
require "json"
|
2
|
+
require "net/http"
|
3
|
+
require "uri"
|
4
|
+
require "cgi"
|
5
|
+
require "ostruct"
|
6
|
+
|
7
|
+
|
8
|
+
module SciRuby
  # Raised when a dataset database cannot be used; PublicSearcher raises it
  # when a query response contains the searcher's FOUR_OH_FOUR_MESSAGE.
  # Carries the queried domain and path and, optionally, the raw response.
  class DatabaseUnavailableError < IOError
    attr_reader :domain, :path, :http_get_result

    def initialize domain, path, http_get_result=nil
      @domain = domain
      @path = path
      @http_get_result = http_get_result
    end

    def to_s
      "Database at domain '#{@domain}', path '#{@path}' appears to be unavailable."
    end
  end

  # Wraps an underlying exception +e+; the message includes that exception's
  # message and backtrace.
  class DatasetNotFoundError < TypeError
    def initialize e
      @exp = e
    end

    def to_s
      # backtrace is nil for an exception that was never raised.
      "Dataset does not exist. It may have moved, is not available in a format SciRuby can interpret." + @exp.message + "\n" + Array(@exp.backtrace).join("\n")
    end
  end


  module Data
    # Directory holding the data source implementations autoloaded at the
    # bottom of this module.
    DIR = File.join(SciRuby::DIR, 'sciruby', 'data')

    # Run the block with the working directory set to the "data" directory
    # that sits next to SciRuby::DIR.
    def self.in_dir &block
      Dir.chdir(File.join(SciRuby::DIR, '..', 'data')) do
        yield
      end
    end

    # Really just a placeholder.
    class Base #:nodoc:
    end

    # Basic dataset type -- handles caching of datasets, that's about it.
    class Cacher < Base

      # Attempt to load a dataset. This is overridden for publicly-searchable datasets.
      # Basically it works as a fallback if a publicly-searchable database is unavailable for some reason, but we
      # may already have the data in the cache.
      #
      # Raises ArgumentError when the dataset is not in the cache.
      def dataset source_id, module_name=nil
        module_name ||= self.class.to_s
        raw = cached_dataset(source_id, module_name)
        raise(ArgumentError, "Dataset is not cached.") if raw.nil?

        # The cached file's extension tells us which parser to use.
        match = SciRuby::Config.data_source_dir(module_name, false) { SciRuby::Config.basename_exists?(source_id) }
        format = match.split('.').last.to_sym
        title = SciRuby::Config.basename_for_dataset(source_id)
        parse_dataset(format, raw, title)
      end

      protected

      # Attempt to get the dataset from the cache. This function is a little bit fragile for the following reason:
      # The +dataset+ function [eventually] allows for different +download_links+ of a dataset, which may be in different
      # formats. +cached_dataset+, however, guesses the format based on the format indicated for the first download link.
      #
      # Returns the raw file contents, or nil when nothing is cached.
      #
      # TODO: Consider gzipping cached datasets.
      def cached_dataset source_id, module_name=nil
        module_name ||= self.class.to_s
        SciRuby::Config.for_dataset_basename(module_name, source_id) do |_basename|
          filename = SciRuby::Config.basename_exists?(source_id)
          return nil unless filename
          File.read(filename)
        end
      end

      # Store a dataset locally. Use cached_dataset to retrieve.
      def cache_dataset source_id, raw_data, format
        SciRuby::Config.cache_dataset self.class.to_s, source_id, raw_data, format
      end

      # Parse a raw dataset using the interpreter for +format+ (:csv or :excel).
      # Any other format yields nil.
      #
      # NameError is re-raised after a hint about statsample; every other
      # StandardError is reported on STDERR and converted to a TypeError.
      def parse_dataset format, raw, name
        begin
          case format
          when :csv
            CSV.parse(raw, :headers => true, :converters => :all).to_dataset.tap { |da| da.name = name }
          when :excel
            Statsample::Excel.parse(raw, :name => name)
          end
        rescue NameError
          # NOTE(review): NoMethodError is a NameError too, so this hint can be
          # printed for failures unrelated to statsample loading.
          STDERR.puts "Unable to load statsample"
          raise
        rescue => e
          STDERR.puts e.inspect
          raise TypeError, "Format was not as expected; dataset may have moved"
        end
      end
    end


    # Base class for searching datasets. R dataset interpreter and PublicSearcherBase (and thus Guardian) are all derived
    # from this type.
    class Searcher < Cacher
      # Runs +search+ (supplied by the subclass) with +args+ and keeps the result.
      def initialize args={}
        @search_result = search(args)
      end
    end


    # Handles searching public datasets. Doesn't actually do it itself, but you can derive searchers from this -- e.g.,
    # Guardian.
    class PublicSearcher < Searcher
      FOUR_OH_FOUR_MESSAGE = '404'
      attr_reader :search_result

      # Search the site or database using some set of parameters.
      #
      # This function is the one that you should redefine if you want to require certain parameters, or if there are
      # parameter co-dependencies. Ultimately, you call `search_internal(params)`.
      #
      # == Example Arguments
      # * q: keywords
      # * facet_country: country code abbreviation to search
      # * facet_source_title: e.g., data from Australian government would be data.nsw.org.au
      def search args={}
        JSON.parse(search_internal(args))
      end

      # Download a dataset from a given link.
      #
      # NOTE(review): only the host and request path are used; the link's
      # scheme and port are ignored.
      def download_dataset link
        url = URI.parse link
        # request_uri keeps the query string, which url.path alone would drop.
        path = url.respond_to?(:request_uri) ? url.request_uri : url.path
        http_get(url.host, path)
      end

      protected
      # Like http_get, but gets the domain and path from the child searcher class.
      # Raises DatabaseUnavailableError when the response contains the class's
      # FOUR_OH_FOUR_MESSAGE.
      def search_internal params={} #:nodoc:
        domain = self.class.const_get(:QUERY_DOMAIN, true)
        path = self.class.const_get(:QUERY_PATH, true)

        result = http_get(domain, path, params)

        if result.include?(self.class.const_get(:FOUR_OH_FOUR_MESSAGE, true))
          raise(DatabaseUnavailableError.new(domain, path, result))
        end

        result
      end

      # Execute an HTTP get request with or without parameters.
      #
      # Adapted from: http://stackoverflow.com/questions/1252210/parametrized-get-request-in-ruby/1252305#1252305
      def http_get domain, path, params = {} #:nodoc:
        return Net::HTTP.get(domain, path) if params.empty?

        query = params.collect { |k,v| "#{k.to_s}=#{CGI::escape(v.to_s)}" }.join('&')
        Net::HTTP.get(domain, "#{path}?#{query}")
      end
    end

    autoload(:R, File.join(DIR, 'r'))
    autoload(:Guardian, File.join(DIR, 'guardian'))
  end
end
|