imw 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (111) hide show
  1. data/.gitignore +15 -0
  2. data/CHANGELOG +0 -0
  3. data/LICENSE +674 -0
  4. data/README.rdoc +101 -0
  5. data/Rakefile +20 -0
  6. data/VERSION +1 -0
  7. data/etc/imwrc.rb +76 -0
  8. data/lib/imw.rb +42 -0
  9. data/lib/imw/boot.rb +58 -0
  10. data/lib/imw/dataset.rb +233 -0
  11. data/lib/imw/dataset/datamapper.rb +66 -0
  12. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
  13. data/lib/imw/dataset/loaddump.rb +50 -0
  14. data/lib/imw/dataset/old/file_collection.rb +88 -0
  15. data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
  16. data/lib/imw/dataset/scaffold.rb +132 -0
  17. data/lib/imw/dataset/scraped_uri.rb +305 -0
  18. data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
  19. data/lib/imw/dataset/scrub/scrub.rb +147 -0
  20. data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
  21. data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
  22. data/lib/imw/dataset/scrub/slug.rb +101 -0
  23. data/lib/imw/dataset/stats.rb +73 -0
  24. data/lib/imw/dataset/stats/counter.rb +23 -0
  25. data/lib/imw/dataset/task.rb +38 -0
  26. data/lib/imw/dataset/workflow.rb +81 -0
  27. data/lib/imw/files.rb +110 -0
  28. data/lib/imw/files/archive.rb +113 -0
  29. data/lib/imw/files/basicfile.rb +122 -0
  30. data/lib/imw/files/binary.rb +28 -0
  31. data/lib/imw/files/compressed_file.rb +93 -0
  32. data/lib/imw/files/compressed_files_and_archives.rb +348 -0
  33. data/lib/imw/files/compressible.rb +103 -0
  34. data/lib/imw/files/csv.rb +112 -0
  35. data/lib/imw/files/json.rb +41 -0
  36. data/lib/imw/files/sgml.rb +65 -0
  37. data/lib/imw/files/text.rb +68 -0
  38. data/lib/imw/files/yaml.rb +46 -0
  39. data/lib/imw/packagers.rb +8 -0
  40. data/lib/imw/packagers/archiver.rb +108 -0
  41. data/lib/imw/packagers/s3_mover.rb +28 -0
  42. data/lib/imw/parsers.rb +7 -0
  43. data/lib/imw/parsers/html_parser.rb +382 -0
  44. data/lib/imw/parsers/html_parser/matchers.rb +306 -0
  45. data/lib/imw/parsers/line_parser.rb +87 -0
  46. data/lib/imw/parsers/regexp_parser.rb +72 -0
  47. data/lib/imw/utils.rb +24 -0
  48. data/lib/imw/utils/components.rb +61 -0
  49. data/lib/imw/utils/config.rb +46 -0
  50. data/lib/imw/utils/error.rb +54 -0
  51. data/lib/imw/utils/extensions/array.rb +125 -0
  52. data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
  53. data/lib/imw/utils/extensions/core.rb +43 -0
  54. data/lib/imw/utils/extensions/dir.rb +24 -0
  55. data/lib/imw/utils/extensions/file_core.rb +64 -0
  56. data/lib/imw/utils/extensions/hash.rb +218 -0
  57. data/lib/imw/utils/extensions/hpricot.rb +48 -0
  58. data/lib/imw/utils/extensions/string.rb +49 -0
  59. data/lib/imw/utils/extensions/struct.rb +42 -0
  60. data/lib/imw/utils/extensions/symbol.rb +28 -0
  61. data/lib/imw/utils/extensions/typed_struct.rb +22 -0
  62. data/lib/imw/utils/extensions/uri.rb +59 -0
  63. data/lib/imw/utils/log.rb +67 -0
  64. data/lib/imw/utils/misc.rb +63 -0
  65. data/lib/imw/utils/paths.rb +115 -0
  66. data/lib/imw/utils/uri.rb +59 -0
  67. data/lib/imw/utils/uuid.rb +33 -0
  68. data/lib/imw/utils/validate.rb +38 -0
  69. data/lib/imw/utils/version.rb +12 -0
  70. data/lib/imw/utils/view.rb +113 -0
  71. data/lib/imw/utils/view/dump_csv.rb +112 -0
  72. data/lib/imw/utils/view/dump_csv_older.rb +117 -0
  73. data/spec/data/sample.csv +131 -0
  74. data/spec/data/sample.tsv +131 -0
  75. data/spec/data/sample.txt +131 -0
  76. data/spec/data/sample.xml +653 -0
  77. data/spec/data/sample.yaml +652 -0
  78. data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
  79. data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
  80. data/spec/imw/files/archive_spec.rb +118 -0
  81. data/spec/imw/files/basicfile_spec.rb +121 -0
  82. data/spec/imw/files/bz2_spec.rb +32 -0
  83. data/spec/imw/files/compressed_file_spec.rb +96 -0
  84. data/spec/imw/files/compressible_spec.rb +100 -0
  85. data/spec/imw/files/file_spec.rb +144 -0
  86. data/spec/imw/files/gz_spec.rb +32 -0
  87. data/spec/imw/files/rar_spec.rb +33 -0
  88. data/spec/imw/files/tar_spec.rb +31 -0
  89. data/spec/imw/files/text_spec.rb +23 -0
  90. data/spec/imw/files/zip_spec.rb +31 -0
  91. data/spec/imw/files_spec.rb +38 -0
  92. data/spec/imw/packagers/archiver_spec.rb +125 -0
  93. data/spec/imw/packagers/s3_mover_spec.rb +7 -0
  94. data/spec/imw/parsers/line_parser_spec.rb +96 -0
  95. data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
  96. data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
  97. data/spec/imw/utils/extensions/find_spec.rb +113 -0
  98. data/spec/imw/utils/paths_spec.rb +38 -0
  99. data/spec/imw/workflow/rip/local_spec.rb +89 -0
  100. data/spec/imw/workflow/rip_spec.rb +27 -0
  101. data/spec/rcov.opts +1 -0
  102. data/spec/spec.opts +4 -0
  103. data/spec/spec_helper.rb +32 -0
  104. data/spec/support/archive_contents_matcher.rb +94 -0
  105. data/spec/support/custom_matchers.rb +21 -0
  106. data/spec/support/directory_contents_matcher.rb +61 -0
  107. data/spec/support/extensions.rb +18 -0
  108. data/spec/support/file_contents_matcher.rb +50 -0
  109. data/spec/support/random.rb +210 -0
  110. data/spec/support/without_regard_to_order_matcher.rb +58 -0
  111. metadata +196 -0
@@ -0,0 +1,101 @@
1
+
2
+ = Overview
3
+
4
+ The Infinite Monkeywrench (IMW) is a Ruby framework to simplify the
5
+ tasks of acquiring, extracting, transforming, loading, and packaging
6
+ data. It has the following goals:
7
+
8
+ * Minimize programmer time even at the expense of increasing run
9
+ time.
10
+
11
+ * Take data through a full transformation from raw source to packaged
12
+ purity in as few lines of code as possible.
13
+
14
+ * Treat data records as objects as much as possible.
15
+
16
+ * Use, rather than repeat, better code that already exists in other
17
+ libraries (FasterCSV, I'm talkin' to you).
18
+
19
+ * Make what's common easy without making what's uncommon impossible.
20
+
21
+ * Work with messy data as well as clean data.
22
+
23
+ * Let you incorporate your own tools wherever you choose to.
24
+
25
+ The Infinite Monkeywrench is a powerful tool but it is not always the
26
+ right one to use. IMW is *not* designed for
27
+
28
+ * Scraping vast amounts of data (use Wuclan[http://github.com/infochimps/wuclan], Monkeyshines[http://github.com/infochimps/monkeyshines], and Edamame[http://github.com/infochimps/edamame].)
29
+
30
+ * Really, really big datasets (use Wukong[http://github.com/infochimps/wukong] and Hadoop[http://hadoop.apache.org])
31
+
32
+ * Data mining
33
+
34
+ * Data visualization
35
+
36
+
37
+ = Setup
38
+
39
+ IMW is hosted on Gemcutter[http://gemcutter.org] so it's easy to install.
40
+
41
+ You'll have to set up Gemcutter
42
+
43
+ $ sudo gem install gemcutter
44
+ $ gem tumble
45
+
46
+ and then install IMW
47
+
48
+ $ sudo gem install imw
49
+
50
+ = Using IMW
51
+
52
+ The central goal of IMW is to make the workflow involved in processing a
53
+ dataset from a raw source to a finished product as simple as possible.
54
+
55
+ So consider that there exist two datasets that I want to combine. The
56
+ first details the historical price of bananas over the past century
57
+ and the second
58
+
59
+ == Working with paths and files
60
+
61
+ require 'rubygems'
62
+ require 'imw'
63
+
64
+ IMW holds a registry of paths that you can define on the fly or store
65
+ in a configuration file.
66
+
67
+ IMW.add_path :dropbox, "/var/www/public/dropbox"
68
+ IMW.add_path :raw, "/mnt/data/raw"
69
+ IMW.add_path :
70
+
71
+ This makes it easier to build paths relative to those directories:
72
+
73
+ IMW.path_to :raw, "one/particular/dataset"
74
+ #=> "/mnt/data/raw/one/particular/dataset"
75
+
76
+ IMW makes it easy to manipulate compressed files and archives.
77
+
78
+
79
+ # Move a collection of files from a public dropbox to a processing directory
80
+
81
+ raw
82
+
83
+ Dir["/public/*"].each do |path|
84
+ file = IMW.open(path)
85
+ case
86
+ when file.compressed?
87
+ file.decompress.mv_to_dir "/raw"
88
+ when file.archive?
89
+ FileUtils.cd("/raw") do
90
+ file.extract
91
+ end
92
+ else
93
+ file.mv_to_dir("/raw")
94
+ end
95
+ end
96
+
97
+
98
+
99
+
100
+
101
+
@@ -0,0 +1,20 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ # http://github.com/technicalpickles/jeweler
6
+ require 'jeweler'
7
+ Jeweler::Tasks.new do |gem|
8
+ gem.name = "imw"
9
+ gem.summary = "The Infinite Monkeywrench (IMW) makes acquiring, extracting, transforming, loading, and packaging data easy."
10
+ gem.description = "The Infinite Monkeywrench (IMW) is a Ruby frameworks to simplify the tasks of acquiring, extracting, transforming, loading, and packaging data. It minimizes programmer time by encapsulating common data workflows and patterns and creating interfaces to many other useful Ruby libraries."
11
+ gem.email = "coders@infochimps.org"
12
+ gem.homepage = "http://github.com/infochimps/imw"
13
+ gem.authors = ["Dhruv Bansal", "Philip (flip) Kromer"]
14
+
15
+ gem.files.exclude "old/**/*"
16
+ end
17
+ Jeweler::GemcutterTasks.new
18
+ rescue LoadError
19
+ puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
20
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,76 @@
1
+ #-*- mode: ruby -*-
2
+ #
3
+ # h2. etc/imwrc -- default site-wide imw configuration file
4
+ #
5
+ # == About
6
+ #
7
+ # This file contains the site-wide configuration settings for this
8
+ # installation of the Infinite Monkeywrench. Settings here override
9
+ # the defaults in <tt>lib/imw/utils/config.rb</tt> (see the
10
+ # documentation for that file for more detail on the variables that
11
+ # can be configured here) but will in turn be overwritten by settings
12
+ # in the <tt>~/.imwrc</tt> file in each user's directory (though the
13
+ # location of this file can be customized).
14
+ #
15
+ # At the present moment, all settings are stored as plain Ruby files
16
+ # (though they may lack the <tt>.rb</tt> extension). As the IMW
17
+ # develops, these will be replaced by YAML files which will be parsed
18
+ # by <tt>lib/imw/utils/config.rb</tt>.
19
+ #
20
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
21
+ # Copyright:: Copyright (c) 2008 infochimps.org
22
+ # License:: GPL 3.0
23
+ # Website:: http://infinitemonkeywrench.org/
24
+ #
25
+
26
+ module IMW
27
+ PATHS = {
28
+ :home => ENV['HOME'],
29
+ :data_root => "/var/lib/imw",
30
+ :log_root => "/var/log/imw",
31
+ :scripts_root => "/usr/share/imw",
32
+ :tmp_root => "/tmp/imw",
33
+
34
+ # the imw library
35
+ :imw_root => File.expand_path(File.dirname(__FILE__) + "/.."),
36
+ :imw_bin => [:imw_root, 'bin'],
37
+ :imw_etc => [:imw_root, 'etc'],
38
+ :imw_lib => [:imw_root, 'lib'],
39
+
40
+ # workflow
41
+ :ripd_root => [:data_root, 'ripd'],
42
+ :peeld_root => [:data_root, 'peeld'],
43
+ :mungd_root => [:data_root, 'mungd'],
44
+ :temp_root => [:data_root, 'temp'],
45
+ :fixd_root => [:data_root, 'fixd'],
46
+ :pkgd_root => [:data_root, 'pkgd']
47
+ }
48
+
49
+ # Default time format.
50
+ STRFTIME_FORMAT = "%Y%m%d-%H%M%S" unless defined? STRFTIME_FORMAT
51
+
52
+ # Paths to external programs used by IMW.
53
+ EXTERNAL_PROGRAMS = {
54
+ :tar => "tar",
55
+ :rar => "rar",
56
+ :zip => "zip",
57
+ :unzip => "unzip",
58
+ :gzip => "gzip",
59
+ :bzip2 => "bzip2",
60
+ :wget => "wget"
61
+ } unless defined? ::IMW::EXTERNAL_PROGRAMS
62
+
63
+ module Files
64
+ # Regular expressions which match pathnames to the name of the
65
+ # appropriate IMW::Files class.
66
+ #
67
+ # File class names should be stripped of the leading
68
+ # <tt>IMW::Files</tt> prefix, i.e. - the file object
69
+ # <tt>IMW::Files::Bz2</tt> should be referenced by the string
70
+ # <tt>"Bz2"</tt>.
71
+ FILE_REGEXPS = [] unless defined? ::IMW::Files::FILE_REGEXPS
72
+ end
73
+
74
+ end
75
+
76
+
@@ -0,0 +1,42 @@
1
+ #
2
+ # h2. lib/imw.rb -- main imw file
3
+ #
4
+ # == About
5
+ #
6
+ # This file is the entry-point to the IMW library. It loads a minimal
7
+ # setup. Optional components can be loaded by calling the function
8
+ # <tt>IMW.imw_components</tt>.
9
+ #
10
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
11
+ # Copyright:: Copyright (c) 2008 infochimps.org
12
+ # License:: GPL 3.0
13
+ # Website:: http://infinitemonkeywrench.org/
14
+ #
15
+ # puts "#{File.basename(__FILE__)}: Behold, the weighty, the munificent, the Infinite Monkeywrench! Approach it with care: it has overwhelmed mightier monkeys than ye."
16
+
17
+ require 'rubygems'
18
+ require 'YAML' unless defined?('YAML') # some stupid collision with datamapper makes it double include
19
+ require 'imw/boot'
20
+ require 'imw/utils'
21
+ require 'imw/dataset'
22
+ require 'imw/files'
23
+ require 'imw/parsers'
24
+ require 'imw/packagers'
25
+
26
+ # The Infinite Monkeywrench (IMW) is a Ruby library for obtaining,
27
+ # parsing, transforming, reconciling, and packaging datasets.
28
+ #
29
+ # Data is obtained via FIXME
30
+ #
31
+ # Data is loaded into IMW using <tt>IMW.open</tt> which provides a
32
+ # uniform interface across a variety of data formats. The objects
33
+ # returned will each have a +load+ method which will return data in the
34
+ # best form for further processing. If the data is a YAML file, then
35
+ # Ruby's +YAML+ library will be used to return primitive Ruby objects,
36
+ # if it is a CSV, then the +FasterCSV+ library will be used, &c.
37
+ #
38
+ # The main interface to handling data is the <tt>IMW::Dataset</tt>
39
+ # class. It has methods for summarizing, transforming, and dumping
40
+ # data to a variety of formats.
41
+ module IMW
42
+ end
@@ -0,0 +1,58 @@
1
+ #
2
+ # h2. lib/imw/boot.rb -- startup functions
3
+ #
4
+ # == About
5
+ #
6
+ # This file contains code necessary to boot the Infinite Monkeywrench
7
+ # at a particular site.
8
+ #
9
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
10
+ # Copyright:: Copyright (c) 2008 infochimps.org
11
+ # License:: GPL 3.0
12
+ # Website:: http://infinitemonkeywrench.org/
13
+ #
14
+ # puts "#{File.basename(__FILE__)}: You heft up your Infinite Monkeywrench for the first time and marvel at how something so powerful could be made so wondrous light!"
15
+
16
+ module IMW
17
+ module Config
18
+
19
+ # Root of the IMW source base.
20
+ def self.imw_root
21
+ File.expand_path File.join(File.dirname(__FILE__), '../..')
22
+ end
23
+
24
+ #
25
+ # User configuration file
26
+ #
27
+ # By default, the file ~/.imwrc (.imwrc, in your home directory -- note no .rb extension)
28
+ # is sourced at top level. If the $IMWRC environment variable is set,
29
+ # that file will be sourced instead.
30
+ #
31
+ # Any code within this file will override settings in IMW_ROOT/etc/imwrc.rb
32
+ #
33
+ USER_CONFIG_FILE = File.join(ENV['HOME'] || '', '.imwrc')
34
+ # Environment variable to override user configuration file location.
35
+ ENV_CONFIG_FILE = "IMWRC"
36
+ def self.user_config_file # :nodoc:
37
+ File.expand_path(ENV[ENV_CONFIG_FILE] || USER_CONFIG_FILE)
38
+ end
39
+
40
+ # System-level config file
41
+ SITE_CONFIG_FILE = "etc/imwrc.rb"
42
+ def self.site_config_file # :nodoc:
43
+ File.join(imw_root, SITE_CONFIG_FILE)
44
+ end
45
+
46
+ # Source the config files
47
+ def self.load_config
48
+ require site_config_file
49
+ load user_config_file if File.exist? user_config_file
50
+ end
51
+ end
52
+ end
53
+
54
+ #
55
+ # Load the config files
56
+ #
57
+ IMW::Config.load_config
58
+
@@ -0,0 +1,233 @@
1
+ #
2
+ # h2. lib/imw/dataset.rb -- imw dataset
3
+ #
4
+ # == About
5
+ #
6
+ # Defines basic properties of the <tt>IMW::Dataset</tt>
7
+ #
8
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
9
+ # Copyright:: Copyright (c) 2008 infochimps.org
10
+ # License:: GPL 3.0
11
+ # Website:: http://infinitemonkeywrench.org/
12
+ #
13
+ # puts "#{File.basename(__FILE__)}: You use your Monkeywrench to rake deep and straight furrows in the earth for your orchard." # at bottom
14
+
15
+ require 'rake'
16
+ require 'ostruct'
17
+
18
+ require 'imw/utils'
19
+ require 'imw/dataset/workflow'
20
+ require 'imw/dataset/loaddump'
21
+ require 'imw/dataset/stats'
22
+
23
+ module IMW
24
+
25
+ # The basic unit in IMW is the dataset. Each dataset has a handle
26
+ # which is meant to be unique (at least in the context of a
27
+ # particular pool of datasets, see <tt>IMW::Pool</tt>). A dataset
28
+ # can also have a taxonomic classification or _taxon_
29
+ #
30
+ # dataset = IMW::Dataset.new :recent_history_of_banana_prices,
31
+ # :taxon => [:economics,:alarming_trends]
32
+ #
33
+ # but it isn't required like the handle.
34
+ #
35
+ # Processing a dataset commonly occurs in four course steps. IMW
36
+ # defines a task[http://rake.rubyforge.org] for each of these steps
37
+ # and keeps files involved in different steps in different
38
+ # directories.
39
+ #
40
+ # rip::
41
+ # Managed by the <tt>:rip</tt> task, data is collected from a
42
+ # source (+http+, +ftp+, database, &c.) and deposited in a
43
+ # subdirectory of the <tt>:ripd</tt> directory named for the URI
44
+ # of the source.
45
+ #
46
+ # dataset.task :rip do
47
+ # IMW::Rip.from_web 'http://econ.chimpu.edu/datasets/produce_prices.tar.bz2'
48
+ # #=> [ripd]/http/econ_chimpu_edu/datasets/produce_prices.tar.bz2
49
+ #
50
+ # IMW::Rip.from_database :named => "weather_records",
51
+ # :at => "public.astro.chimpu.edu",
52
+ # :select => "* FROM hurricane_frequency"
53
+ # #=> [ripd]/sql/_edu/chimpu_astro_public/weather_records/select_from_hurricane_frequency-2009-02-16--15:30:26.tsv
54
+ # end
55
+ #
56
+ # Where <tt>[ripd]</tt> would be replaced by the IMW
57
+ # <tt>:ripd</tt> directory. The default <tt>:rip</tt> task is
58
+ # empty, so if there's no need to rip data (perhaps it's already on
59
+ # disk?) then nothing needs to be done here.
60
+ #
61
+ # raw::
62
+ # Managed by the <tt>:raw</tt> task, data is uncompressed and
63
+ # extracted (if necessary) and stored in a subdirectory of the
64
+ # <tt>:data</tt> directory named by the taxon and handle of this
65
+ # dataset.
66
+ #
67
+ # dataset.task :raw do
68
+ # IMW::Raw.uncompress_and_extract File.join(dataset.path_to(:ripd),'http/_edu/chimpu_econ/datasets'),
69
+ # Dir[File.join(dataset.path_to(:ripd),'sql/_edu/chimpu_astro_public/**/*.tsv')].first
70
+ # #=> [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/001.xml
71
+ # [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/002.xml
72
+ # [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/003.xml
73
+ # ...
74
+ # [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/select_from_hurricane_frequency-2009-02-16--15:30:26.tsv
75
+ # end
76
+ #
77
+ # Where <tt>[data]</tt> would be replaced by the IMW
78
+ # <tt>:data</tt> directory.
79
+ #
80
+ # If this dataset didn't have a taxon
81
+ # (economics/alarming_trends) its files would be stored in a
82
+ # directory +recent_history_of_banana_prices+ just below the
83
+ # <tt>:data</tt> directory.
84
+ #
85
+ # fix::
86
+ # Managed by the <tt>:fix</tt> task, transformations on the data
87
+ # are performed. IMW's method is to read data from a source
88
+ # format (XML, YAML, CSV, &c.) into Ruby objects with hash
89
+ # semantics. These objects might be based upon structs,
90
+ # ActiveRecord, DataMapper::Resource, FasterCSV...anything which
91
+ # can be accessed as <tt>thing.property</tt> (FIXME 'and' or 'or'
92
+ # ) <tt>thing[:property]</tt>: the Infinite Monkeywrench fits
93
+ # neatly into your toolbox.
94
+ #
95
+ #
96
+ # # Open an output file in CSV for writing
97
+ # output = IMW.open! File.join(dataset.path_to(:fixd), 'date_bananas_hurricanes.csv')
98
+ # #=> FasterCSV at [fixd]/economics/alarming_trends/recent_history_of_banana_prices/fixd/date_bananas_hurricanes.csv
99
+ #
100
+ # # A place to store the combined data
101
+ # correlations = []
102
+ #
103
+ # dataset.task :fix do
104
+ #
105
+ # # Return the contents of the weather data which has rows like
106
+ # #
107
+ # # 1 2008-09-01 4
108
+ # # 2 2008-09-08 3
109
+ # # 3 2008-08-15 3
110
+ # # ...
111
+ # #
112
+ # weather_data = IMW.open(Dir[File.join(dataset.path_to(:rawd), '*.tsv')].first,
113
+ # :headers => ["ID","DATE","NUM_HURRICANES"]).entries
114
+ # #=> [#<FasterCSV::Row "ID":nil "DATE":Mon Sep 08 04:15:47 -0600 2008,"NUM_HURRICANES":4>, ... ]
115
+ #
116
+ #
117
+ # # Return the matching data from the produce prices XML file which looks like
118
+ # #
119
+ # # <prices>
120
+ # # <price type="apple">
121
+ # # <date>2008/09/01</date>
122
+ # # <amount>0.15</amount>
123
+ # # </price>
124
+ # # <price type="banana">
125
+ # # <date>2008/09/01</date>
126
+ # # <amount>0.20</amount>
127
+ # # </price>
128
+ # # ...
129
+ # # </prices>
130
+ # parser = IMW::XMLParser.new :records => [ 'prices/price[@type="banana"]',
131
+ # { :week => 'date',
132
+ # :price => 'amount' }]
133
+ #
134
+ # # Loop through the XML produce prices, mixing in the hurricane data,
135
+ # # and outputting new rows.
136
+ # Dir["#{dataset.path_to :rawd}*.xml"].each do |file|
137
+ # IMW.open file do |xml| #=> Hpricot::Doc
138
+ # parser.parse(xml).each do |record|
139
+ # num_hurricanes = weather_data.find(lambda { nil }) {|id,week,num_hurricanes| week == record.week}
140
+ # output << [record.week,record[:price],num_hurricanes]
141
+ # end
142
+ # end
143
+ # end
144
+ # end
145
+ #
146
+ # package::
147
+ # Data is packaged and compressed (if necessary) into a delivery
148
+ # format and deposited into the <tt>:pkgd</tt> directory.
149
+ #
150
+ # dataset.task :pkg do
151
+ # IMW.open(File.join(dataset.path_to(:fixd), 'date_bananas_hurricanes.csv')).compress!
152
+ # #=> [data]/economics/alarming_trends/recent_history_of_banana_prices/pkgd/date_bananas_hurricanes.csv.bz2
153
+ # end
154
+ #
155
+ # In the above, <tt>dataset.task</tt> behaves like
156
+ # <tt>Rake.task</tt>, merely defining a task and its dependencies
157
+ # without executing it. A task is executed via
158
+ #
159
+ # dataset.task(:pkg).invoke
160
+ #
161
+ # Since the <tt>:rip</tt>, <tt>:raw</tt>, <tt>:fix</tt>, and
162
+ # <tt>:pkg</tt> tasks depend upon each other, invoking <tt>:pkg</tt>
163
+ # will first cause <tt>:rip</tt> to run.
164
+ #
165
+ # By default, the tasks associated with a dataset are blank. All of
166
+ # IMW's functionality is available without defining tasks. Tasks
167
+ # simply provide a convenient scaffold for building a data
168
+ # transformation upon.
169
+ #
170
+ # Similarly, there is no requirement to use the directory structure
171
+ # outlined above. IMW's methods accept plain filenames and do the
172
+ # Right Thing where possible. The combination of tasks with
173
+ # matching directory structure is a suggested but not mandatory
174
+ # framework in which to program.
175
+ class Dataset
176
+
177
+ # The <tt>Rake::TaskManager</tt> module allows the
178
+ # <tt>IMW::Dataset</tt> class to leverage the functionality of the
179
+ # Rake[http://rake.rubyforge.org/] library to manage tasks
180
+ # associated with the processing of this dataset.
181
+ include Rake::TaskManager
182
+
183
+ # The <tt>IMW::Workflow</tt> module contains pre-defined tasks for
184
+ # dataset processing.
185
+ include IMW::Workflow
186
+
187
+ attr_reader :handle, :taxon, :options
188
+ attr_accessor :data
189
+
190
+ # The default taxon assigned to a dataset.
191
+ DEFAULT_TAXON = nil
192
+
193
+ # Default options passed to <tt>Rake</tt>. Any class including
194
+ # the <tt>Rake::TaskManager</tt> module must define a constant by
195
+ # this name.
196
+ DEFAULT_OPTIONS = {
197
+ :dry_run => false,
198
+ :trace => false,
199
+ :verbose => false
200
+ }
201
+
202
+ # Create a new dataset. Arguments include
203
+ #
204
+ # <tt>:taxon</tt> (+DEFAULT_TAXON+):: a string or sequence
205
+ # giving the taxonomic classification of the dataset. See
206
+ # <tt>IMW::Dataset.taxon=</tt> for more details on how this
207
+ # argument is interpreted.
208
+ def initialize handle, options = {}
209
+ options = options.reverse_merge :taxon => DEFAULT_TAXON
210
+
211
+ # FIXME is this how the attribute writer functions should be
212
+ # called?
213
+ @handle = handle
214
+ @taxon = options[:taxon]
215
+
216
+ # for rake
217
+ @tasks = Hash.new
218
+ @rules = Array.new
219
+ @scope = Array.new
220
+ @last_description = nil
221
+ @options = OpenStruct.new(DEFAULT_OPTIONS)
222
+ create_default_tasks
223
+
224
+ # sets an empty @paths hash; see utils/paths.rb
225
+ set_paths
226
+ end
227
+
228
+ def handle= thing
229
+ @handle = thing.is_a?(String) ? thing.to_handle : thing
230
+ end
231
+
232
+ end
233
+ end