imw 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +15 -0
- data/CHANGELOG +0 -0
- data/LICENSE +674 -0
- data/README.rdoc +101 -0
- data/Rakefile +20 -0
- data/VERSION +1 -0
- data/etc/imwrc.rb +76 -0
- data/lib/imw.rb +42 -0
- data/lib/imw/boot.rb +58 -0
- data/lib/imw/dataset.rb +233 -0
- data/lib/imw/dataset/datamapper.rb +66 -0
- data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
- data/lib/imw/dataset/loaddump.rb +50 -0
- data/lib/imw/dataset/old/file_collection.rb +88 -0
- data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
- data/lib/imw/dataset/scaffold.rb +132 -0
- data/lib/imw/dataset/scraped_uri.rb +305 -0
- data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
- data/lib/imw/dataset/scrub/scrub.rb +147 -0
- data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
- data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
- data/lib/imw/dataset/scrub/slug.rb +101 -0
- data/lib/imw/dataset/stats.rb +73 -0
- data/lib/imw/dataset/stats/counter.rb +23 -0
- data/lib/imw/dataset/task.rb +38 -0
- data/lib/imw/dataset/workflow.rb +81 -0
- data/lib/imw/files.rb +110 -0
- data/lib/imw/files/archive.rb +113 -0
- data/lib/imw/files/basicfile.rb +122 -0
- data/lib/imw/files/binary.rb +28 -0
- data/lib/imw/files/compressed_file.rb +93 -0
- data/lib/imw/files/compressed_files_and_archives.rb +348 -0
- data/lib/imw/files/compressible.rb +103 -0
- data/lib/imw/files/csv.rb +112 -0
- data/lib/imw/files/json.rb +41 -0
- data/lib/imw/files/sgml.rb +65 -0
- data/lib/imw/files/text.rb +68 -0
- data/lib/imw/files/yaml.rb +46 -0
- data/lib/imw/packagers.rb +8 -0
- data/lib/imw/packagers/archiver.rb +108 -0
- data/lib/imw/packagers/s3_mover.rb +28 -0
- data/lib/imw/parsers.rb +7 -0
- data/lib/imw/parsers/html_parser.rb +382 -0
- data/lib/imw/parsers/html_parser/matchers.rb +306 -0
- data/lib/imw/parsers/line_parser.rb +87 -0
- data/lib/imw/parsers/regexp_parser.rb +72 -0
- data/lib/imw/utils.rb +24 -0
- data/lib/imw/utils/components.rb +61 -0
- data/lib/imw/utils/config.rb +46 -0
- data/lib/imw/utils/error.rb +54 -0
- data/lib/imw/utils/extensions/array.rb +125 -0
- data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
- data/lib/imw/utils/extensions/core.rb +43 -0
- data/lib/imw/utils/extensions/dir.rb +24 -0
- data/lib/imw/utils/extensions/file_core.rb +64 -0
- data/lib/imw/utils/extensions/hash.rb +218 -0
- data/lib/imw/utils/extensions/hpricot.rb +48 -0
- data/lib/imw/utils/extensions/string.rb +49 -0
- data/lib/imw/utils/extensions/struct.rb +42 -0
- data/lib/imw/utils/extensions/symbol.rb +28 -0
- data/lib/imw/utils/extensions/typed_struct.rb +22 -0
- data/lib/imw/utils/extensions/uri.rb +59 -0
- data/lib/imw/utils/log.rb +67 -0
- data/lib/imw/utils/misc.rb +63 -0
- data/lib/imw/utils/paths.rb +115 -0
- data/lib/imw/utils/uri.rb +59 -0
- data/lib/imw/utils/uuid.rb +33 -0
- data/lib/imw/utils/validate.rb +38 -0
- data/lib/imw/utils/version.rb +12 -0
- data/lib/imw/utils/view.rb +113 -0
- data/lib/imw/utils/view/dump_csv.rb +112 -0
- data/lib/imw/utils/view/dump_csv_older.rb +117 -0
- data/spec/data/sample.csv +131 -0
- data/spec/data/sample.tsv +131 -0
- data/spec/data/sample.txt +131 -0
- data/spec/data/sample.xml +653 -0
- data/spec/data/sample.yaml +652 -0
- data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
- data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
- data/spec/imw/files/archive_spec.rb +118 -0
- data/spec/imw/files/basicfile_spec.rb +121 -0
- data/spec/imw/files/bz2_spec.rb +32 -0
- data/spec/imw/files/compressed_file_spec.rb +96 -0
- data/spec/imw/files/compressible_spec.rb +100 -0
- data/spec/imw/files/file_spec.rb +144 -0
- data/spec/imw/files/gz_spec.rb +32 -0
- data/spec/imw/files/rar_spec.rb +33 -0
- data/spec/imw/files/tar_spec.rb +31 -0
- data/spec/imw/files/text_spec.rb +23 -0
- data/spec/imw/files/zip_spec.rb +31 -0
- data/spec/imw/files_spec.rb +38 -0
- data/spec/imw/packagers/archiver_spec.rb +125 -0
- data/spec/imw/packagers/s3_mover_spec.rb +7 -0
- data/spec/imw/parsers/line_parser_spec.rb +96 -0
- data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
- data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
- data/spec/imw/utils/extensions/find_spec.rb +113 -0
- data/spec/imw/utils/paths_spec.rb +38 -0
- data/spec/imw/workflow/rip/local_spec.rb +89 -0
- data/spec/imw/workflow/rip_spec.rb +27 -0
- data/spec/rcov.opts +1 -0
- data/spec/spec.opts +4 -0
- data/spec/spec_helper.rb +32 -0
- data/spec/support/archive_contents_matcher.rb +94 -0
- data/spec/support/custom_matchers.rb +21 -0
- data/spec/support/directory_contents_matcher.rb +61 -0
- data/spec/support/extensions.rb +18 -0
- data/spec/support/file_contents_matcher.rb +50 -0
- data/spec/support/random.rb +210 -0
- data/spec/support/without_regard_to_order_matcher.rb +58 -0
- metadata +196 -0
data/README.rdoc
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
|
2
|
+
= Overview
|
3
|
+
|
4
|
+
The Infinite Monkeywrench (IMW) is a Ruby framework to simplify the
|
5
|
+
tasks of acquiring, extracting, transforming, loading, and packaging
|
6
|
+
data. It has the following goals:
|
7
|
+
|
8
|
+
* Minimize programmer time even at the expense of increasing run
|
9
|
+
time.
|
10
|
+
|
11
|
+
* Take data through a full transformation from raw source to packaged
|
12
|
+
purity in as few lines of code as possible.
|
13
|
+
|
14
|
+
* Treat data records as objects as much as possible.
|
15
|
+
|
16
|
+
* Use instead of repeat better code that already exists in other
|
17
|
+
libraries (FasterCSV, I'm talkin' to you).
|
18
|
+
|
19
|
+
* Make what's common easy without making what's uncommon impossible.
|
20
|
+
|
21
|
+
* Work with messy data as well as clean data.
|
22
|
+
|
23
|
+
* Let you incorporate your own tools wherever you choose to.
|
24
|
+
|
25
|
+
The Infinite Monkeywrench is a powerful tool but it is not always the
|
26
|
+
right one to use. IMW is **not** designed for
|
27
|
+
|
28
|
+
* Scraping vast amounts of data (use Wuclan[http://github.com/infochimps/wuclan], Monkeyshines[http://github.com/infochimps/monkeyshines], and Edamame[http://github.com/infochimps/edamame].)
|
29
|
+
|
30
|
+
* Really, really big datasets (use Wukong[http://github.com/infochimps/wukong] and Hadoop[http://hadoop.apache.org])
|
31
|
+
|
32
|
+
* Data mining
|
33
|
+
|
34
|
+
* Data visualization
|
35
|
+
|
36
|
+
|
37
|
+
= Setup
|
38
|
+
|
39
|
+
IMW is hosted on Gemcutter[http://gemcutter.org] so it's easy to install.
|
40
|
+
|
41
|
+
You'll have to set up Gemcutter
|
42
|
+
|
43
|
+
$ sudo gem install gemcutter
|
44
|
+
$ gem tumble
|
45
|
+
|
46
|
+
and then install IMW
|
47
|
+
|
48
|
+
$ sudo gem install imw
|
49
|
+
|
50
|
+
= Using IMW
|
51
|
+
|
52
|
+
The central goal of IMW is to make the workflow involved in processing a
|
53
|
+
dataset from a raw source to a finished product as simple as possible.
|
54
|
+
|
55
|
+
So consider that there exist two datasets that I want to combine. The
|
56
|
+
first details the historical price of bananas over the past century
|
57
|
+
and the second
|
58
|
+
|
59
|
+
== Working with paths and files
|
60
|
+
|
61
|
+
require 'rubygems'
|
62
|
+
require 'imw'
|
63
|
+
|
64
|
+
IMW holds a registry of paths that you can define on the fly or store
|
65
|
+
in a configuration file.
|
66
|
+
|
67
|
+
IMW.add_path :dropbox, "/var/www/public/dropbox"
|
68
|
+
IMW.add_path :raw, "/mnt/data/raw"
|
69
|
+
IMW.add_path :
|
70
|
+
|
71
|
+
This makes it easier to construct paths:
|
72
|
+
|
73
|
+
IMW.path_to :raw, "one/particular/dataset"
|
74
|
+
#=> "/mnt/data/raw/one/particular/dataset"
|
75
|
+
|
76
|
+
IMW makes it easy to manipulate compressed files and archives.
|
77
|
+
|
78
|
+
|
79
|
+
# Move a collection of files from a public dropbox to a processing directory
|
80
|
+
|
81
|
+
raw
|
82
|
+
|
83
|
+
Dir["/public/*"].each do |path|
|
84
|
+
file = IMW.open(path)
|
85
|
+
case
|
86
|
+
when file.compressed?
|
87
|
+
file.decompress.mv_to_dir "/raw"
|
88
|
+
when file.archive?
|
89
|
+
FileUtils.cd("/raw") do
|
90
|
+
file.extract
|
91
|
+
end
|
92
|
+
else
|
93
|
+
file.mv_to_dir("/raw")
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
|
98
|
+
|
99
|
+
|
100
|
+
|
101
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
begin
|
5
|
+
# http://github.com/technicalpickles/jeweler
|
6
|
+
require 'jeweler'
|
7
|
+
Jeweler::Tasks.new do |gem|
|
8
|
+
gem.name = "imw"
|
9
|
+
gem.summary = "The Infinite Monkeywrench (IMW) makes acquiring, extracting, transforming, loading, and packaging data easy."
|
10
|
+
gem.description = "The Infinite Monkeywrench (IMW) is a Ruby frameworks to simplify the tasks of acquiring, extracting, transforming, loading, and packaging data. It minimizes programmer time by encapsulating common data workflows and patterns and creating interfaces to many other useful Ruby libraries."
|
11
|
+
gem.email = "coders@infochimps.org"
|
12
|
+
gem.homepage = "http://github.com/infochimps/imw"
|
13
|
+
gem.authors = ["Dhruv Bansal", "Philip (flip) Kromer"]
|
14
|
+
|
15
|
+
gem.files.exclude "old/**/*"
|
16
|
+
end
|
17
|
+
Jeweler::GemcutterTasks.new
|
18
|
+
rescue LoadError
|
19
|
+
puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
|
20
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/etc/imwrc.rb
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
#-*- mode: ruby -*-
|
2
|
+
#
|
3
|
+
# h2. etc/imwrc -- default site-wide imw configuration file
|
4
|
+
#
|
5
|
+
# == About
|
6
|
+
#
|
7
|
+
# This file contains the site-wide configuration settings for this
|
8
|
+
# installation of the Infinite Monkeywrench. Settings here override
|
9
|
+
# the defaults in <tt>lib/imw/utils/config.rb</tt> (see the
|
10
|
+
# documentation for that file for more detail on the variables that
|
11
|
+
# can be configured here) but will in turn be overwritten by settings
|
12
|
+
# in the <tt>~/.imwrc</tt> file in each user's directory (though the
|
13
|
+
# location of this file can be customized).
|
14
|
+
#
|
15
|
+
# At the present moment, all settings are stored as plain Ruby files
|
16
|
+
# (though they may lack the <tt>.rb</tt> extension). As the IMW
|
17
|
+
# develops, these will be replaced by YAML files which will be parsed
|
18
|
+
# by <tt>lib/imw/utils/config.rb</tt>.
|
19
|
+
#
|
20
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
21
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
22
|
+
# License:: GPL 3.0
|
23
|
+
# Website:: http://infinitemonkeywrench.org/
|
24
|
+
#
|
25
|
+
|
26
|
+
module IMW
|
27
|
+
PATHS = {
|
28
|
+
:home => ENV['HOME'],
|
29
|
+
:data_root => "/var/lib/imw",
|
30
|
+
:log_root => "/var/log/imw",
|
31
|
+
:scripts_root => "/usr/share/imw",
|
32
|
+
:tmp_root => "/tmp/imw",
|
33
|
+
|
34
|
+
# the imw library
|
35
|
+
:imw_root => File.expand_path(File.dirname(__FILE__) + "/.."),
|
36
|
+
:imw_bin => [:imw_root, 'bin'],
|
37
|
+
:imw_etc => [:imw_root, 'etc'],
|
38
|
+
:imw_lib => [:imw_root, 'lib'],
|
39
|
+
|
40
|
+
# workflow
|
41
|
+
:ripd_root => [:data_root, 'ripd'],
|
42
|
+
:peeld_root => [:data_root, 'peeld'],
|
43
|
+
:mungd_root => [:data_root, 'mungd'],
|
44
|
+
:temp_root => [:data_root, 'temp'],
|
45
|
+
:fixd_root => [:data_root, 'fixd'],
|
46
|
+
:pkgd_root => [:data_root, 'pkgd']
|
47
|
+
}
|
48
|
+
|
49
|
+
# Default time format.
|
50
|
+
STRFTIME_FORMAT = "%Y%m%d-%H%M%S" unless defined? STRFTIME_FORMAT
|
51
|
+
|
52
|
+
# Paths to external programs used by IMW.
|
53
|
+
EXTERNAL_PROGRAMS = {
|
54
|
+
:tar => "tar",
|
55
|
+
:rar => "rar",
|
56
|
+
:zip => "zip",
|
57
|
+
:unzip => "unzip",
|
58
|
+
:gzip => "gzip",
|
59
|
+
:bzip2 => "bzip2",
|
60
|
+
:wget => "wget"
|
61
|
+
} unless defined? ::IMW::EXTERNAL_PROGRAMS
|
62
|
+
|
63
|
+
module Files
|
64
|
+
# Regular expressions which match pathnames to the name of the
|
65
|
+
# appropriate IMW::Files class.
|
66
|
+
#
|
67
|
+
# File class names should be stripped of the leading
|
68
|
+
# <tt>IMW::Files</tt> prefix, i.e. - the file object
|
69
|
+
# <tt>IMW::Files::Bz2</tt> should be referenced by the string
|
70
|
+
# <tt>"Bz2"</tt>.
|
71
|
+
FILE_REGEXPS = [] unless defined? ::IMW::Files::FILE_REGEXPS
|
72
|
+
end
|
73
|
+
|
74
|
+
end
|
75
|
+
|
76
|
+
|
data/lib/imw.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
#
|
2
|
+
# h2. lib/imw.rb -- main imw file
|
3
|
+
#
|
4
|
+
# == About
|
5
|
+
#
|
6
|
+
# This file is the entry-point to the IMW library. It loads a minimal
|
7
|
+
# setup. Optional components can be loaded by calling the function
|
8
|
+
# <tt>IMW.imw_components</tt>.
|
9
|
+
#
|
10
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
11
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
12
|
+
# License:: GPL 3.0
|
13
|
+
# Website:: http://infinitemonkeywrench.org/
|
14
|
+
#
|
15
|
+
# puts "#{File.basename(__FILE__)}: Behold, the weighty, the munificent, the Infinite Monkeywrench! Approach it with care: it has overwhelmed mightier monkeys than ye."
|
16
|
+
|
17
|
+
require 'rubygems'
|
18
|
+
require 'yaml' unless defined?(YAML) # load YAML only once; guards against the double include seen alongside datamapper
|
19
|
+
require 'imw/boot'
|
20
|
+
require 'imw/utils'
|
21
|
+
require 'imw/dataset'
|
22
|
+
require 'imw/files'
|
23
|
+
require 'imw/parsers'
|
24
|
+
require 'imw/packagers'
|
25
|
+
|
26
|
+
# The Infinite Monkeywrench (IMW) is a Ruby library for obtaining,
|
27
|
+
# parsing, transforming, reconciling, and packaging datasets.
|
28
|
+
#
|
29
|
+
# Data is obtained via FIXME
|
30
|
+
#
|
31
|
+
# Data is loaded into IMW using <tt>IMW.open</tt> which provides a
|
32
|
+
# uniform interface across a variety of data formats. The objects
|
33
|
+
# returned will each have a +load+ method which will return data in the
|
34
|
+
# best form for further processing. If the data is a YAML file, then
|
35
|
+
# Ruby's +YAML+ library will be used to return primitive Ruby objects,
|
36
|
+
# if it is a CSV, then the +FasterCSV+ library will be used, &c.
|
37
|
+
#
|
38
|
+
# The main interface to handling data is the <tt>IMW::Dataset</tt>
|
39
|
+
# class. It has methods for summarizing, transforming, and dumping
|
40
|
+
# data to a variety of formats.
|
41
|
+
module IMW
|
42
|
+
end
|
data/lib/imw/boot.rb
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
#
|
2
|
+
# h2. lib/imw/boot.rb -- startup functions
|
3
|
+
#
|
4
|
+
# == About
|
5
|
+
#
|
6
|
+
# This file contains code necessary to boot the Infinite Monkeywrench
|
7
|
+
# at a particular site.
|
8
|
+
#
|
9
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
10
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
11
|
+
# License:: GPL 3.0
|
12
|
+
# Website:: http://infinitemonkeywrench.org/
|
13
|
+
#
|
14
|
+
# puts "#{File.basename(__FILE__)}: You heft up your Infinite Monkeywrench for the first time and marvel at how something so powerful could be made so wondrous light!"
|
15
|
+
|
16
|
+
module IMW
|
17
|
+
module Config
|
18
|
+
|
19
|
+
# Root of the IMW source base.
|
20
|
+
def self.imw_root
|
21
|
+
File.expand_path File.join(File.dirname(__FILE__), '../..')
|
22
|
+
end
|
23
|
+
|
24
|
+
#
|
25
|
+
# User configuration file
|
26
|
+
#
|
27
|
+
# By default, the file ~/.imwrc (.imwrc, in your home directory -- note no .rb extension)
|
28
|
+
# is sourced at top level. If the $IMWRC environment variable is set,
|
29
|
+
# that file will be sourced instead.
|
30
|
+
#
|
31
|
+
# Any code within this file will override settings in IMW_ROOT/etc/imwrc.rb
|
32
|
+
#
|
33
|
+
USER_CONFIG_FILE = File.join(ENV['HOME'] || '', '.imwrc')
|
34
|
+
# Environment variable to override user configuration file location.
|
35
|
+
ENV_CONFIG_FILE = "IMWRC"
|
36
|
+
def self.user_config_file # :nodoc:
|
37
|
+
File.expand_path(ENV[ENV_CONFIG_FILE] || USER_CONFIG_FILE)
|
38
|
+
end
|
39
|
+
|
40
|
+
# System-level config file
|
41
|
+
SITE_CONFIG_FILE = "etc/imwrc.rb"
|
42
|
+
def self.site_config_file # :nodoc:
|
43
|
+
File.join(imw_root, SITE_CONFIG_FILE)
|
44
|
+
end
|
45
|
+
|
46
|
+
# Source the config files
|
47
|
+
def self.load_config
|
48
|
+
require site_config_file
|
49
|
+
load user_config_file if File.exist? user_config_file
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
#
|
55
|
+
# Load the config files
|
56
|
+
#
|
57
|
+
IMW::Config.load_config
|
58
|
+
|
data/lib/imw/dataset.rb
ADDED
@@ -0,0 +1,233 @@
|
|
1
|
+
#
|
2
|
+
# h2. lib/imw/dataset.rb -- imw dataset
|
3
|
+
#
|
4
|
+
# == About
|
5
|
+
#
|
6
|
+
# Defines basic properties of the <tt>IMW::Dataset</tt>
|
7
|
+
#
|
8
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
9
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
10
|
+
# License:: GPL 3.0
|
11
|
+
# Website:: http://infinitemonkeywrench.org/
|
12
|
+
#
|
13
|
+
# puts "#{File.basename(__FILE__)}: You use your Monkeywrench to rake deep and straight furrows in the earth for your orchard." # at bottom
|
14
|
+
|
15
|
+
require 'rake'
|
16
|
+
require 'ostruct'
|
17
|
+
|
18
|
+
require 'imw/utils'
|
19
|
+
require 'imw/dataset/workflow'
|
20
|
+
require 'imw/dataset/loaddump'
|
21
|
+
require 'imw/dataset/stats'
|
22
|
+
|
23
|
+
module IMW
|
24
|
+
|
25
|
+
# The basic unit in IMW is the dataset. Each dataset has a handle
|
26
|
+
# which is meant to be unique (at least in the context of a
|
27
|
+
# particular pool of datasets, see <tt>IMW::Pool</tt>). A dataset
|
28
|
+
# can also have a taxonomic classification or _taxon_
|
29
|
+
#
|
30
|
+
# dataset = IMW::Dataset.new :recent_history_of_banana_prices,
|
31
|
+
# :taxon => [:economics,:alarming_trends]
|
32
|
+
#
|
33
|
+
# but it isn't required like the handle.
|
34
|
+
#
|
35
|
+
# Processing a dataset commonly occurs in four coarse steps. IMW
|
36
|
+
# defines a task[http://rake.rubyforge.org] for each of these steps
|
37
|
+
# and keeps files involved in different steps in different
|
38
|
+
# directories.
|
39
|
+
#
|
40
|
+
# rip::
|
41
|
+
# Managed by the <tt>:rip</tt> task, data is collected from a
|
42
|
+
# source (+http+, +ftp+, database, &c.) and deposited in a
|
43
|
+
# subdirectory of the <tt>:ripd</tt> directory named for the URI
|
44
|
+
# of the source.
|
45
|
+
#
|
46
|
+
# dataset.task :rip do
|
47
|
+
# IMW::Rip.from_web 'http://econ.chimpu.edu/datasets/produce_prices.tar.bz2'
|
48
|
+
# #=> [ripd]/http/econ_chimpu_edu/datasets/produce_prices.tar.bz2
|
49
|
+
#
|
50
|
+
# IMW::Rip.from_database :named => "weather_records",
|
51
|
+
# :at => "public.astro.chimpu.edu",
|
52
|
+
# :select => "* FROM hurricane_frequency"
|
53
|
+
# #=> [ripd]/sql/_edu/chimpu_astro_public/weather_records/select_from_hurricane_frequency-2009-02-16--15:30:26.tsv
|
54
|
+
# end
|
55
|
+
#
|
56
|
+
# Where <tt>[ripd]</tt> would be replaced by the IMW
|
57
|
+
# <tt>:ripd</tt> directory. The default <tt>:rip</tt> task is
|
58
|
+
# empty so if there's no need to rip data (perhaps it's already on
|
59
|
+
# disk?) then nothing needs to be done here.
|
60
|
+
#
|
61
|
+
# raw::
|
62
|
+
# Managed by the <tt>:raw</tt> task, data is uncompressed and
|
63
|
+
# extracted (if necessary) and stored in a subdirectory of the
|
64
|
+
# <tt>:data</tt> directory named by the taxon and handle of this
|
65
|
+
# dataset.
|
66
|
+
#
|
67
|
+
# dataset.task :raw do
|
68
|
+
# IMW::Raw.uncompress_and_extract File.join(dataset.path_to(:ripd),'http/_edu/chimpu_econ/datasets'),
|
69
|
+
# Dir[File.join(dataset.path_to(:ripd),'sql/_edu/chimpu_astro_public/**/*.tsv')].first
|
70
|
+
# #=> [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/001.xml
|
71
|
+
# [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/002.xml
|
72
|
+
# [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/003.xml
|
73
|
+
# ...
|
74
|
+
# [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/select_from_hurricane_frequency-2009-02-16--15:30:26.tsv
|
75
|
+
# end
|
76
|
+
#
|
77
|
+
# Where <tt>[data]</tt> would be replaced by the IMW
|
78
|
+
# <tt>:data</tt> directory.
|
79
|
+
#
|
80
|
+
# If this dataset didn't have a taxon
|
81
|
+
# (economics/alarming_trends) its files would be stored in a
|
82
|
+
# directory +recent_history_of_banana_prices+ just below the
|
83
|
+
# <tt>:data</tt> directory.
|
84
|
+
#
|
85
|
+
# fix::
|
86
|
+
# Managed by the <tt>:fix</tt> task, transformations on the data
|
87
|
+
# are performed. IMW's method is to read data from a source
|
88
|
+
# format (XML, YAML, CSV, &c.) into Ruby objects with hash
|
89
|
+
# semantics. These objects might be based upon structs,
|
90
|
+
# ActiveRecord, DataMapper::Resource, FasterCSV...anything which
|
91
|
+
# can be accessed as <tt>thing.property</tt> (FIXME 'and' or 'or'
|
92
|
+
# ) <tt>thing[:property]</tt>: the Infinite Monkeywrench fits
|
93
|
+
# neatly into your toolbox.
|
94
|
+
#
|
95
|
+
#
|
96
|
+
# # Open an output file in CSV for writing
|
97
|
+
# output = IMW.open! File.join(dataset.path_to(:fixd), 'date_bananas_hurricanes.csv')
|
98
|
+
# #=> FasterCSV at [fixd]/economics/alarming_trends/recent_history_of_banana_prices/fixd/date_bananas_hurricanes.csv
|
99
|
+
#
|
100
|
+
# # A place to store the combined data
|
101
|
+
# correlations = []
|
102
|
+
#
|
103
|
+
# dataset.task :fix do
|
104
|
+
#
|
105
|
+
# # Return the contents of the weather data which has rows like
|
106
|
+
# #
|
107
|
+
# # 1 2008-09-01 4
|
108
|
+
# # 2 2008-09-08 3
|
109
|
+
# # 3 2008-08-15 3
|
110
|
+
# # ...
|
111
|
+
# #
|
112
|
+
# weather_data = IMW.open(Dir[File.join(dataset.path_to(:rawd), '*.tsv')].first,
|
113
|
+
# :headers => ["ID","DATE","NUM_HURRICANES"]).entries
|
114
|
+
# #=> [#<FasterCSV::Row "ID":nil "DATE":Mon Sep 08 04:15:47 -0600 2008,"NUM_HURRICANES":4>, ... ]
|
115
|
+
#
|
116
|
+
#
|
117
|
+
# # Return the matching data from the produce prices XML file which looks like
|
118
|
+
# #
|
119
|
+
# # <prices>
|
120
|
+
# # <price type="apple">
|
121
|
+
# # <date>2008/09/01</date>
|
122
|
+
# # <amount>0.15</amount>
|
123
|
+
# # </price>
|
124
|
+
# # <price type="banana">
|
125
|
+
# # <date>2008/09/01</date>
|
126
|
+
# # <amount>0.20</amount>
|
127
|
+
# # </price>
|
128
|
+
# # ...
|
129
|
+
# # </prices>
|
130
|
+
# parser = IMW::XMLParser.new :records => [ 'prices/price[@type="banana"]',
|
131
|
+
# { :week => 'date',
|
132
|
+
# :price => 'amount' }]
|
133
|
+
#
|
134
|
+
# # Loop through the XML produce prices, mixing in the hurricane data,
|
135
|
+
# # and outputting new rows.
|
136
|
+
# Dir["#{dataset.path_to :rawd}*.xml"].each do |file|
|
137
|
+
# IMW.open file do |xml| #=> Hpricot::Doc
|
138
|
+
# parser.parse(xml).each do |record|
|
139
|
+
# num_hurricanes = weather_data.find(lambda { nil }) {|id,week,num_hurricanes| week == record.week}
|
140
|
+
# output << [record.week,record[:price],num_hurricanes]
|
141
|
+
# end
|
142
|
+
# end
|
143
|
+
# end
|
144
|
+
# end
|
145
|
+
#
|
146
|
+
# package::
|
147
|
+
# Data is packaged and compressed (if necessary) into a delivery
|
148
|
+
# format and deposited into the <tt>:pkgd</tt> directory.
|
149
|
+
#
|
150
|
+
# dataset.task :pkg do
|
151
|
+
# IMW.open(File.join(dataset.path_to(:fixd), 'date_bananas_hurricanes.csv')).compress!
|
152
|
+
# #=> [data]/economics/alarming_trends/recent_history_of_banana_prices/pkgd/date_bananas_hurricanes.csv.bz2
|
153
|
+
# end
|
154
|
+
#
|
155
|
+
# In the above, <tt>dataset.task</tt> behaves like
|
156
|
+
# <tt>Rake.task</tt>, merely defining a task and its dependencies
|
157
|
+
# without executing it. To execute a task, invoke it:
|
158
|
+
#
|
159
|
+
# dataset.task(:pkg).invoke
|
160
|
+
#
|
161
|
+
# Since the <tt>:rip</tt>, <tt>:raw</tt>, <tt>:fix</tt>, and
|
162
|
+
# <tt>:pkg</tt> tasks depend upon each other, invoking <tt>:pkg</tt>
|
163
|
+
# will first cause <tt>:rip</tt> to run.
|
164
|
+
#
|
165
|
+
# By default, the tasks associated with a dataset are blank. All of
|
166
|
+
# IMW's functionality is available without defining tasks. Tasks
|
167
|
+
# simply provide a convenient scaffold for building a data
|
168
|
+
# transformation upon.
|
169
|
+
#
|
170
|
+
# Similarly, there is no requirement to use the directory structure
|
171
|
+
# outlined above. IMW's methods accept plain filenames and do the
|
172
|
+
# Right Thing where possible. The combination of tasks with
|
173
|
+
# matching directory structure is a suggested but not mandatory
|
174
|
+
# framework in which to program.
|
175
|
+
class Dataset
|
176
|
+
|
177
|
+
# The <tt>Rake::TaskManager</tt> module allows the
|
178
|
+
# <tt>IMW::Dataset</tt> class to leverage the functionality of the
|
179
|
+
# Rake[http://rake.rubyforge.org/] library to manage tasks
|
180
|
+
# associated with the processing of this dataset.
|
181
|
+
include Rake::TaskManager
|
182
|
+
|
183
|
+
# The <tt>IMW::Workflow</tt> module contains pre-defined tasks for
|
184
|
+
# dataset processing.
|
185
|
+
include IMW::Workflow
|
186
|
+
|
187
|
+
attr_reader :handle, :taxon, :options
|
188
|
+
attr_accessor :data
|
189
|
+
|
190
|
+
# The default taxon assigned to a dataset.
|
191
|
+
DEFAULT_TAXON = nil
|
192
|
+
|
193
|
+
# Default options passed to <tt>Rake</tt>. Any class including
|
194
|
+
# the <tt>Rake::TaskManager</tt> module must define a constant by
|
195
|
+
# this name.
|
196
|
+
DEFAULT_OPTIONS = {
|
197
|
+
:dry_run => false,
|
198
|
+
:trace => false,
|
199
|
+
:verbose => false
|
200
|
+
}
|
201
|
+
|
202
|
+
# Create a new dataset. Arguments include
|
203
|
+
#
|
204
|
+
# <tt>:taxon</tt> (+DEFAULT_TAXON+):: a string or sequence
|
205
|
+
# giving the taxonomic classification of the dataset. See
|
206
|
+
# <tt>IMW::Dataset.taxon=</tt> for more details on how this
|
207
|
+
# argument is interpreted.
|
208
|
+
def initialize handle, options = {}
|
209
|
+
options = options.reverse_merge :taxon => DEFAULT_TAXON
|
210
|
+
|
211
|
+
# FIXME is this how the attribute writer functions should be
|
212
|
+
# called?
|
213
|
+
@handle = handle
|
214
|
+
@taxon = options[:taxon]
|
215
|
+
|
216
|
+
# for rake
|
217
|
+
@tasks = Hash.new
|
218
|
+
@rules = Array.new
|
219
|
+
@scope = Array.new
|
220
|
+
@last_description = nil
|
221
|
+
@options = OpenStruct.new(DEFAULT_OPTIONS)
|
222
|
+
create_default_tasks
|
223
|
+
|
224
|
+
# sets an empty @paths hash; see utils/paths.rb
|
225
|
+
set_paths
|
226
|
+
end
|
227
|
+
|
228
|
+
def handle= thing
|
229
|
+
@handle = thing.is_a?(String) ? thing.to_handle : thing
|
230
|
+
end
|
231
|
+
|
232
|
+
end
|
233
|
+
end
|