imw 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. data/.gitignore +4 -1
  2. data/Rakefile +10 -0
  3. data/TODO +18 -0
  4. data/VERSION +1 -1
  5. data/bin/imw +1 -1
  6. data/etc/imwrc.rb +0 -50
  7. data/examples/dataset.rb +12 -0
  8. data/lib/imw/boot.rb +55 -9
  9. data/lib/imw/dataset/paths.rb +15 -24
  10. data/lib/imw/dataset/workflow.rb +131 -72
  11. data/lib/imw/dataset.rb +94 -186
  12. data/lib/imw/parsers/html_parser.rb +1 -1
  13. data/lib/imw/parsers.rb +1 -1
  14. data/lib/imw/repository.rb +3 -27
  15. data/lib/imw/resource.rb +190 -0
  16. data/lib/imw/resources/archive.rb +97 -0
  17. data/lib/imw/resources/archives_and_compressed/bz2.rb +18 -0
  18. data/lib/imw/resources/archives_and_compressed/gz.rb +18 -0
  19. data/lib/imw/resources/archives_and_compressed/rar.rb +23 -0
  20. data/lib/imw/resources/archives_and_compressed/tar.rb +23 -0
  21. data/lib/imw/resources/archives_and_compressed/tarbz2.rb +78 -0
  22. data/lib/imw/resources/archives_and_compressed/targz.rb +78 -0
  23. data/lib/imw/resources/archives_and_compressed/zip.rb +57 -0
  24. data/lib/imw/resources/archives_and_compressed.rb +32 -0
  25. data/lib/imw/resources/compressed_file.rb +89 -0
  26. data/lib/imw/resources/compressible.rb +77 -0
  27. data/lib/imw/resources/formats/delimited.rb +92 -0
  28. data/lib/imw/resources/formats/excel.rb +125 -0
  29. data/lib/imw/resources/formats/json.rb +53 -0
  30. data/lib/imw/resources/formats/sgml.rb +72 -0
  31. data/lib/imw/resources/formats/yaml.rb +53 -0
  32. data/lib/imw/resources/formats.rb +32 -0
  33. data/lib/imw/resources/local.rb +198 -0
  34. data/lib/imw/resources/remote.rb +110 -0
  35. data/lib/imw/resources/schemes/hdfs.rb +242 -0
  36. data/lib/imw/resources/schemes/http.rb +161 -0
  37. data/lib/imw/resources/schemes/s3.rb +137 -0
  38. data/lib/imw/resources/schemes.rb +19 -0
  39. data/lib/imw/resources.rb +118 -0
  40. data/lib/imw/runner.rb +5 -4
  41. data/lib/imw/transforms/archiver.rb +215 -0
  42. data/lib/imw/transforms/transferer.rb +103 -0
  43. data/lib/imw/transforms.rb +8 -0
  44. data/lib/imw/utils/error.rb +26 -30
  45. data/lib/imw/utils/extensions/array.rb +5 -15
  46. data/lib/imw/utils/extensions/hash.rb +6 -16
  47. data/lib/imw/utils/extensions/hpricot.rb +0 -14
  48. data/lib/imw/utils/extensions/string.rb +5 -15
  49. data/lib/imw/utils/extensions/symbol.rb +0 -13
  50. data/lib/imw/utils/extensions.rb +65 -0
  51. data/lib/imw/utils/log.rb +14 -13
  52. data/lib/imw/utils/misc.rb +0 -6
  53. data/lib/imw/utils/paths.rb +101 -42
  54. data/lib/imw/utils/version.rb +8 -9
  55. data/lib/imw/utils.rb +2 -18
  56. data/lib/imw.rb +92 -17
  57. data/spec/data/sample.csv +1 -1
  58. data/spec/data/sample.json +1 -0
  59. data/spec/data/sample.tsv +1 -1
  60. data/spec/data/sample.txt +1 -1
  61. data/spec/data/sample.xml +1 -1
  62. data/spec/data/sample.yaml +1 -1
  63. data/spec/imw/dataset/paths_spec.rb +32 -0
  64. data/spec/imw/dataset/workflow_spec.rb +41 -0
  65. data/spec/imw/resource_spec.rb +79 -0
  66. data/spec/imw/resources/archive_spec.rb +69 -0
  67. data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +15 -0
  68. data/spec/imw/resources/archives_and_compressed/gz_spec.rb +15 -0
  69. data/spec/imw/resources/archives_and_compressed/rar_spec.rb +16 -0
  70. data/spec/imw/resources/archives_and_compressed/tar_spec.rb +16 -0
  71. data/spec/imw/resources/archives_and_compressed/tarbz2_spec.rb +24 -0
  72. data/spec/imw/resources/archives_and_compressed/targz_spec.rb +21 -0
  73. data/spec/imw/resources/archives_and_compressed/zip_spec.rb +16 -0
  74. data/spec/imw/resources/compressed_file_spec.rb +48 -0
  75. data/spec/imw/resources/compressible_spec.rb +36 -0
  76. data/spec/imw/resources/formats/delimited_spec.rb +33 -0
  77. data/spec/imw/resources/formats/json_spec.rb +32 -0
  78. data/spec/imw/resources/formats/sgml_spec.rb +24 -0
  79. data/spec/imw/resources/formats/yaml_spec.rb +41 -0
  80. data/spec/imw/resources/local_spec.rb +98 -0
  81. data/spec/imw/resources/remote_spec.rb +35 -0
  82. data/spec/imw/resources/schemes/hdfs_spec.rb +61 -0
  83. data/spec/imw/resources/schemes/http_spec.rb +19 -0
  84. data/spec/imw/resources/schemes/s3_spec.rb +19 -0
  85. data/spec/imw/transforms/archiver_spec.rb +120 -0
  86. data/spec/imw/transforms/transferer_spec.rb +113 -0
  87. data/spec/imw/utils/paths_spec.rb +5 -33
  88. data/spec/imw/utils/shared_paths_spec.rb +29 -0
  89. data/spec/spec_helper.rb +5 -5
  90. data/spec/support/paths_matcher.rb +67 -0
  91. data/spec/support/random.rb +39 -36
  92. metadata +88 -75
  93. data/lib/imw/dataset/task.rb +0 -41
  94. data/lib/imw/files/archive.rb +0 -113
  95. data/lib/imw/files/basicfile.rb +0 -122
  96. data/lib/imw/files/binary.rb +0 -28
  97. data/lib/imw/files/compressed_file.rb +0 -93
  98. data/lib/imw/files/compressed_files_and_archives.rb +0 -334
  99. data/lib/imw/files/compressible.rb +0 -103
  100. data/lib/imw/files/csv.rb +0 -113
  101. data/lib/imw/files/directory.rb +0 -62
  102. data/lib/imw/files/excel.rb +0 -84
  103. data/lib/imw/files/json.rb +0 -41
  104. data/lib/imw/files/sgml.rb +0 -46
  105. data/lib/imw/files/text.rb +0 -68
  106. data/lib/imw/files/yaml.rb +0 -46
  107. data/lib/imw/files.rb +0 -125
  108. data/lib/imw/packagers/archiver.rb +0 -126
  109. data/lib/imw/packagers/s3_mover.rb +0 -36
  110. data/lib/imw/packagers.rb +0 -8
  111. data/lib/imw/utils/components.rb +0 -61
  112. data/lib/imw/utils/config.rb +0 -46
  113. data/lib/imw/utils/extensions/class/attribute_accessors.rb +0 -8
  114. data/lib/imw/utils/extensions/core.rb +0 -27
  115. data/lib/imw/utils/extensions/dir.rb +0 -24
  116. data/lib/imw/utils/extensions/file_core.rb +0 -64
  117. data/lib/imw/utils/extensions/typed_struct.rb +0 -22
  118. data/lib/imw/utils/extensions/uri.rb +0 -59
  119. data/lib/imw/utils/view/dump_csv.rb +0 -112
  120. data/lib/imw/utils/view/dump_csv_older.rb +0 -117
  121. data/lib/imw/utils/view.rb +0 -113
  122. data/spec/imw/dataset/datamapper/uri_spec.rb +0 -43
  123. data/spec/imw/dataset/datamapper_spec_helper.rb +0 -11
  124. data/spec/imw/files/archive_spec.rb +0 -118
  125. data/spec/imw/files/basicfile_spec.rb +0 -121
  126. data/spec/imw/files/bz2_spec.rb +0 -32
  127. data/spec/imw/files/compressed_file_spec.rb +0 -96
  128. data/spec/imw/files/compressible_spec.rb +0 -100
  129. data/spec/imw/files/file_spec.rb +0 -144
  130. data/spec/imw/files/gz_spec.rb +0 -32
  131. data/spec/imw/files/rar_spec.rb +0 -33
  132. data/spec/imw/files/tar_spec.rb +0 -31
  133. data/spec/imw/files/text_spec.rb +0 -23
  134. data/spec/imw/files/zip_spec.rb +0 -31
  135. data/spec/imw/files_spec.rb +0 -38
  136. data/spec/imw/packagers/archiver_spec.rb +0 -125
  137. data/spec/imw/packagers/s3_mover_spec.rb +0 -7
  138. data/spec/imw/utils/extensions/file_core_spec.rb +0 -72
  139. data/spec/imw/utils/extensions/find_spec.rb +0 -113
  140. data/spec/imw/workflow/rip/local_spec.rb +0 -89
  141. data/spec/imw/workflow/rip_spec.rb +0 -27
  142. data/spec/support/archive_contents_matcher.rb +0 -94
  143. data/spec/support/directory_contents_matcher.rb +0 -61
data/lib/imw/utils/log.rb CHANGED
@@ -1,16 +1,16 @@
1
1
  require 'logger'
2
2
 
3
3
  module IMW
4
+
5
+ # Default log file.
4
6
  LOG_FILE_DESTINATION = STDERR unless defined?(LOG_FILE_DESTINATION)
7
+
5
8
  LOG_TIMEFORMAT = "%Y%m%d-%H:%M:%S " unless defined?(LOG_TIMEFORMAT)
6
9
 
7
10
  class << self; attr_accessor :log end
8
- #
9
- # Create a Logger and point it at LOG_FILE_DESTINATION
10
- #
11
- # LOG_FILE_DESTINATION is STDOUT by default; redefine it in your
12
- # ~/.imwrc, or set IMW.log yourself, if that's not cool.
13
- #
11
+
12
+ # Create a Logger and point it at IMW::LOG_FILE_DESTINATION which is
13
+ # set in ~/.imwrc and defaults to STDERR.
14
14
  def self.instantiate_logger!
15
15
  IMW.log ||= Logger.new(LOG_FILE_DESTINATION)
16
16
  IMW.log.datetime_format = "%Y%m%d-%H:%M:%S "
@@ -18,15 +18,19 @@ module IMW
18
18
  end
19
19
 
20
20
  def announce *events
21
- options = events.extract_options!
21
+ options = events.flatten.extract_options!
22
22
  options.reverse_merge! :level => Logger::INFO
23
- # puts [options, events ].inspect, "*"*76
24
23
  IMW.log.add options[:level], events.join("\n")
25
24
  end
26
25
  def banner *events
27
- options = events.extract_options!
26
+ options = events.flatten.extract_options!
28
27
  options.reverse_merge! :level => Logger::INFO
29
- ["*"*75, events, "*"*75].flatten.each{|ev| announce(ev, options) }
28
+ announce(["*"*75, events, "*"*75], options)
29
+ end
30
+ def warn *events
31
+ options = events.flatten.extract_options!
32
+ options.reverse_merge! :level => Logger::WARN
33
+ announce events, options
30
34
  end
31
35
 
32
36
  PROGRESS_TRACKERS = {}
@@ -61,7 +65,4 @@ module IMW
61
65
  end
62
66
  end
63
67
 
64
- #
65
- # Make the default logger
66
- #
67
68
  IMW.instantiate_logger!
@@ -1,10 +1,4 @@
1
1
  module IMW
2
- # Return a string representing the current UTC time in the IMW
3
- # format.
4
- def self.current_utc_time_string
5
- Time.now.utc.strftime(IMW::STRFTIME_FORMAT)
6
- end
7
-
8
2
 
9
3
  # A simple counter. The +value+ and +add+ methods read and
10
4
  # increment the counter's value.
@@ -1,3 +1,5 @@
1
+ require 'pathname'
2
+
1
3
  module IMW
2
4
 
3
5
  # Implements methods designed to work with an object's
@@ -9,19 +11,48 @@ module IMW
9
11
  # <tt>@paths</tt>.
10
12
  module Paths
11
13
 
12
- # Expands a shorthand workflow path specification to an
13
- # actual file path.
14
+ # Expands a shorthand workflow path specification to an actual
15
+ # file path. Strings are interpreted literally but symbols are
16
+ # first resolved to the paths they represent.
17
+ #
18
+ # add_path :foo, '~/whoa'
19
+ # path_to :foo, 'my_thing'
20
+ # => '~/whoa/my_thing'
14
21
  #
15
- # add_path :mlb_08, 'gd2.mlb.com/components/game/mlb/year_2008'
16
- # path_to :ripd, :mlb_08, 'month_06', 'day_08', 'miniscoreboard.xml'
17
- # => (...)/data/ripd/gd2.mlb.com/components/game/mlb/year_2008/month_06/day_08/miniscoreboard.xml
22
+ # @param [String, Symbol] pathsegs the path segments to join
23
+ # @return [String] the resulting expanded path
18
24
  def path_to *pathsegs
19
- begin
20
- path = Pathname.new path_to_helper(*pathsegs)
21
- path.absolute? ? File.expand_path(path) : path.to_s
22
- rescue Exception => e
23
- raise("Can't find path to '#{pathsegs}': #{e}");
24
- end
25
+ path = Pathname.new path_to_helper(*pathsegs)
26
+ path.absolute? ? File.expand_path(path) : path.to_s
27
+ end
28
+
29
+ # Return the presently defined paths for this object.
30
+ #
31
+ # @return [Hash]
32
+ def paths
33
+ @paths ||= {}
34
+ end
35
+
36
+ # Adds a symbolic path for expansion by +path_to+.
37
+ #
38
+ # add_path :foo, '~/whoa'
39
+ # add_path :bar, :foo, 'baz'
40
+ # path_to :bar
41
+ # => '~/whoa/baz'
42
+ #
43
+ # @param [Symbol] sym the name of the path to store
44
+ # @param [Symbol, String] pathsegs the path segments to use to define the path to the name
45
+ # @return [String] the resulting path
46
+ def add_path sym, *pathsegs
47
+ paths[sym] = pathsegs.flatten
48
+ path_to(sym)
49
+ end
50
+
51
+ # Removes a symbolic path for expansion by +path_to+.
52
+ #
53
+ # @param [Symbol] sym the stored path symbol to remove
54
+ def remove_path sym
55
+ paths.delete sym if paths.include? sym
25
56
  end
26
57
 
27
58
  private
@@ -29,7 +60,7 @@ module IMW
29
60
  # +path_to_helper+ handles the recursive calls for +path_to+.
30
61
  expanded = pathsegs.flatten.compact.map do |pathseg|
31
62
  case
32
- when pathseg.is_a?(Symbol) && @paths.include?(pathseg) then path_to(@paths[pathseg])
63
+ when pathseg.is_a?(Symbol) && paths.include?(pathseg) then path_to(paths[pathseg])
33
64
  when pathseg.is_a?(Symbol) && IMW::PATHS.include?(pathseg) then path_to(IMW::PATHS[pathseg])
34
65
  when pathseg.is_a?(Symbol) then raise IMW::PathError.new("No path expansion set for #{pathseg.inspect}")
35
66
  else pathseg
@@ -37,29 +68,70 @@ module IMW
37
68
  end
38
69
  File.join(*expanded)
39
70
  end
40
- public
71
+ end
41
72
 
42
- # Adds a symbolic path for expansion by +path_to+.
43
- def add_path sym, *pathsegs
44
- @paths[sym] = pathsegs.flatten
45
- end
46
73
 
47
- # Removes a symbolic path for expansion by +path_to+.
48
- def remove_path sym
49
- @paths.delete sym if @paths.include? sym
50
- end
51
- end
74
+ # Default paths for the IMW. Chosen to make sense on most *NIX
75
+ # distributions.
76
+ DEFAULT_PATHS = {
77
+ :home => ENV['HOME'],
78
+ :data_root => "/var/lib/imw",
79
+ :log_root => "/var/log/imw",
80
+ :scripts_root => "/usr/share/imw",
81
+ :tmp_root => "/tmp/imw",
82
+
83
+ # the imw library
84
+ :imw_root => File.expand_path(File.dirname(__FILE__) + "/../../.."),
85
+ :imw_bin => [:imw_root, 'bin'],
86
+ :imw_etc => [:imw_root, 'etc'],
87
+ :imw_lib => [:imw_root, 'lib'],
88
+
89
+ # workflow
90
+ :ripd_root => [:data_root, 'ripd'],
91
+ :rawd_root => [:data_root, 'rawd'],
92
+ :fixd_root => [:data_root, 'fixd'],
93
+ :pkgd_root => [:data_root, 'pkgd']
94
+ }
95
+ defined?(PATHS) ? PATHS.reverse_merge!(DEFAULT_PATHS) : PATHS = DEFAULT_PATHS
52
96
 
97
+ # Expands a shorthand workflow path specification to an actual
98
+ # file path. Strings are interpreted literally but symbols are
99
+ # first resolved to the paths they represent.
100
+ #
101
+ # IMW.add_path :foo, '~/whoa'
102
+ # IMW.path_to :foo, 'my_thing'
103
+ # => '~/whoa/my_thing'
104
+ #
105
+ # @param [String, Symbol] pathsegs the path segments to join
106
+ # @return [String] the resulting expanded path
53
107
  def self.path_to *pathsegs
54
- begin
55
- path = Pathname.new IMW.path_to_helper(*pathsegs)
56
- path.absolute? ? File.expand_path(path) : path.to_s
57
- rescue Exception => e
58
- raise("Can't find path to '#{pathsegs}': #{e}");
59
- end
108
+ path = Pathname.new IMW.path_to_helper(*pathsegs)
109
+ path.absolute? ? File.expand_path(path) : path.to_s
110
+ end
111
+
112
+ # Adds a symbolic path for expansion by +path_to+.
113
+ #
114
+ # IMW.add_path :foo, '~/whoa'
115
+ # IMW.add_path :bar, :foo, 'baz'
116
+ # IMW.path_to :bar
117
+ # => '~/whoa/baz'
118
+ #
119
+ # @param [Symbol] sym the name of the path to store
120
+ # @param [Symbol, String] pathsegs the path segments to use to define the path to the name
121
+ # @return [String] the resulting path
122
+ def self.add_path sym, *pathsegs
123
+ IMW::PATHS[sym] = pathsegs.flatten
124
+ path_to(sym) # expand and return the newly stored path, as Paths#add_path does
60
125
  end
61
126
 
62
- private
127
+ # Removes a symbolic path for expansion by +path_to+.
128
+ #
129
+ # @param [Symbol] sym the stored path symbol to remove
130
+ def self.remove_path sym
131
+ IMW::PATHS.delete sym if IMW::PATHS.include? sym
132
+ end
133
+
134
+ protected
63
135
  def self.path_to_helper *pathsegs # :nodoc:
64
136
  # +path_to_helper+ handles the recursive calls for +path_to+.
65
137
  expanded = pathsegs.flatten.compact.map do |pathseg|
@@ -71,17 +143,4 @@ module IMW
71
143
  end
72
144
  File.join(*expanded)
73
145
  end
74
- public
75
-
76
- # Adds a symbolic path for expansion by +path_to+.
77
- def self.add_path sym, *pathsegs
78
- IMW::PATHS[sym] = pathsegs.flatten
79
- end
80
-
81
- # Removes a symbolic path for expansion by +path_to+.
82
- def self.remove_path sym
83
- IMW::PATHS.delete sym if IMW::PATHS.include? sym
84
- end
85
146
  end
86
-
87
- # puts "#{File.basename(__FILE__)}: Your monkeywrench glows alternately dim then bright as you wander, suggesting to you which paths to take."
@@ -1,12 +1,11 @@
1
- # copied from activewarehouse-etl gem
2
- module IMWVersion #:nodoc:
1
+ module IMW
3
2
  unless defined?(VERSION)
4
- module VERSION #:nodoc:
5
- MAJOR = 0
6
- MINOR = 0
7
- TINY = 0
8
-
9
- STRING = [MAJOR, MINOR, TINY].join('.')
10
- end
3
+ module VERSION #:nodoc:
4
+ MAJOR = 0
5
+ MINOR = 0
6
+ TINY = 0
7
+
8
+ STRING = [MAJOR, MINOR, TINY].join('.')
9
+ end
11
10
  end
12
11
  end
data/lib/imw/utils.rb CHANGED
@@ -1,24 +1,8 @@
1
- #
2
- # h2. lib/imw/utils.rb -- utility functions
3
- #
4
- # == About
5
- #
6
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
7
- # Copyright:: Copyright (c) 2008 infochimps.org
8
- # License:: GPL 3.0
9
- # Website:: http://infinitemonkeywrench.org/
10
- #
11
-
12
1
  require 'rubygems'
2
+ require 'fileutils'
13
3
  require 'imw/utils/error'
14
4
  require 'imw/utils/log'
15
- require 'imw/utils/config'
16
5
  require 'imw/utils/paths'
17
6
  require 'imw/utils/misc'
18
- require 'imw/utils/components'
19
- require 'imw/utils/extensions/core'
20
- require 'fileutils'
21
- require 'pathname'
22
-
7
+ require 'imw/utils/extensions'
23
8
 
24
- # puts "#{File.basename(__FILE__)}: Early economists thought they would measure the utility of an action in units of `utils'. Really." # at bottom
data/lib/imw.rb CHANGED
@@ -1,11 +1,6 @@
1
1
  require 'rubygems'
2
2
  require 'imw/boot'
3
3
  require 'imw/utils'
4
- require 'imw/dataset'
5
- require 'imw/repository'
6
- require 'imw/files'
7
- require 'imw/parsers'
8
- require 'imw/packagers'
9
4
 
10
5
  # The Infinite Monkeywrench (IMW) is a Ruby library for ripping,
11
6
  # extracting, parsing, munging, and packaging datasets. It allows you
@@ -13,19 +8,99 @@ require 'imw/packagers'
13
8
  # transformations of data as a network of dependencies (a la Make or
14
9
  # Rake).
15
10
  #
16
- # On first reading of IMW examine the classes within the IMW::Files
17
- # module, all transparently instantiated when using IMW.open (instead
18
- # of File.open). These classes do a lot of work to ensure that all
19
- # objects returned by IMW.open share methods (write, read, load, dump,
20
- # parse, compress, extract, &c.) while continuing to use existing
21
- # implementations of these concepts.
11
+ # IMW has a few central concepts: resources, datasets, workflows, and
12
+ # repositories.
22
13
  #
23
- # Another entrace point is the <tt>IMW::Dataset</tt> class. It
24
- # leverages Rake to craft workflows for transforming datasets. IMW
25
- # encourages you to organize your data transformations in a step-wise
26
- # process, managed with dependencies.
14
+ # Resources represent individual data resources like local files,
15
+ # websites, databases, &c. Resources are typically instantiated via
16
+ # IMW.open, with IMW doing the work of figuring out what to return
17
+ # based on the URI passed in.
27
18
  #
28
- # Utilities to help with one step in particular (ripping, parsing,
29
- # pacaking, &c.) are in their own directories.
19
+ # Datasets represent collections of related data resources. An
20
+ # IMW::Dataset comes with a pre-defined (but customizable) workflow
21
+ # that takes data resources through several steps: rip, parse, munge,
22
+ # and package. The workflow leverages Rake and so the various tasks
23
+ # that are necessary to process the data till it is nice and pretty
24
+ # can all be linked with dependencies.
25
+ #
26
+ # Repositories are collections of datasets and it is on these
27
+ # collections that the +imw+ command line tool operates.
30
28
  module IMW
29
+ autoload :Resource, 'imw/resource'
30
+ autoload :Resources, 'imw/resources'
31
+ autoload :Repository, 'imw/repository'
32
+ autoload :Dataset, 'imw/dataset'
33
+ autoload :Transforms, 'imw/transforms'
34
+ autoload :Parsers, 'imw/parsers'
35
+
36
+ # Open a resource at the given +uri+. The resource will
37
+ # automatically be extended by modules which make sense given the
38
+ # +uri+.
39
+ #
40
+ # See the documentation for IMW::Resource and the various modules
41
+ # within IMW::Resources for more information and options.
42
+ #
43
+ # Passing in an IMW::Resource will simply return it.
44
+ #
45
+ # @param [String, Addressable::URI, IMW::Resource] obj the URI to open
46
+ # @return [IMW::Resource] the resulting resource, properly extended for the given URI
47
+ def self.open obj, options={}
48
+ return obj if obj.is_a?(IMW::Resource)
49
+ IMW::Resource.new(obj, options)
50
+ end
51
+
52
+ # Works the same way as IMW.open except opens the resource for
53
+ # writing.
54
+ #
55
+ # @param [String, Addressable::URI] uri the URI to open
56
+ # @return [IMW::Resource] the resulting resource, properly extended for the given URI and opened for writing.
57
+ def self.open! uri, options={}
58
+ IMW::Resource.new(uri, options.merge(:mode => 'w'))
59
+ end
60
+
61
+ # The default repository in which to place datasets. See the
62
+ # documentation for IMW::Repository for more information on how
63
+ # datasets and repositories fit together.
64
+ #
65
+ # @return [IMW::Repository] the default IMW repository
66
+ def self.repository
67
+ @@repository ||= IMW::Repository.new
68
+ end
69
+
70
+ # Create a dataset and put it in the default IMW repository. Also
71
+ # evaluates the given block in the dataset's context so you can define its workflow.
72
+ #
73
+ # IMW.dataset :my_dataset do
74
+ #
75
+ # # Define some paths we're going to use
76
+ # add_path :raw_data, :ripd, 'raw_data.csv'
77
+ # add_path :fixd_data, :fixd, 'fixed_data.csv'
78
+ #
79
+ # # Copy a file from a website to this dataset's +ripd+ directory.
80
+ # rip do
81
+ # IMW.open('http://mysite.com/data_archives/2010/03/03.csv').cp(path_to(:raw_data))
82
+ # end
83
+ #
84
+ # # Filter the raw data to those values which match some criterion defined by <tt>accept?</tt>
85
+ # munge do
86
+ # IMW.open(path_to(:raw_data)).map do |row|
87
+ # row if accept?(row)
88
+ # end.compact.dump(path_to(:fixd_data))
89
+ # end
90
+ #
91
+ # # Compress this new data
92
+ # package do
93
+ # IMW.open(path_to(:fixd_data)).compress.mv(path_to(:pkgd))
94
+ # end
95
+ # end
96
+ #
97
+ # @param [Symbol, String] handle the handle to identify this dataset with
98
+ # @param [Hash] options a hash of options (see IMW::Dataset)
99
+ # @return [IMW::Dataset] the new dataset
100
+ def self.dataset handle, options={}, &block
101
+ d = IMW::Dataset.new(handle, options)
102
+ d.instance_eval(&block) if block_given?
103
+ d
104
+ end
105
+
31
106
  end
data/spec/data/sample.csv CHANGED
@@ -81,7 +81,7 @@ ID,Name,Genus,Species
81
81
  080,Tonkean Macaque,Macaca,tonkeana
82
82
  081,Heck's Macaque,Macaca,hecki
83
83
  082,Gorontalo Macaque,Macaca,nigrescens
84
- 083,Celebes Crested Macaque or Black "Ape",Macaca,nigra
84
+ 083,Celebes Crested Macaque or Black Ape,Macaca,nigra
85
85
  084,Crab-eating Macaque or Long-tailed Macaque or Kera,Macaca,fascicularis
86
86
  085,Stump-tailed Macaque or Bear Macaque,Macaca,arctoides
87
87
  086,Rhesus Macaque,Macaca,mulatta
@@ -0,0 +1 @@
1
+ {"monkeys":[{"monkey":{"name":"Gray-bellied Night Monkey","id":1,"genus":"Aotus","species":"lemurinus"}},{"monkey":{"name":"Panamanian Night Monkey","id":2,"genus":"Aotus","species":"zonalis"}},{"monkey":{"name":"Hern\u00e1ndez-Camacho's Night Monkey","id":3,"genus":"Aotus","species":"jorgehernandezi"}},{"monkey":{"name":"Gray-handed Night Monkey","id":4,"genus":"Aotus","species":"griseimembra"}},{"monkey":{"name":"Hershkovitz's Night Monkey","id":5,"genus":"Aotus","species":"hershkovitzi"}},{"monkey":{"name":"Brumback's Night Monkey","id":6,"genus":"Aotus","species":"brumbacki"}},{"monkey":{"name":"Three-striped Night Monkey","id":7,"genus":"Aotus","species":"trivirgatus"}},{"monkey":{"name":"Spix's Night Monkey","id":"008","genus":"Aotus","species":"vociferans"}},{"monkey":{"name":"Malaysian Lar Gibbon","id":"009","genus":"Hylobates","species":"lar lar"}},{"monkey":{"name":"Carpenter's Lar Gibbon","id":8,"genus":"Hylobates","species":"lar carpenteri"}},{"monkey":{"name":"Central Lar Gibbon","id":9,"genus":"Hylobates","species":"lar entelloides"}},{"monkey":{"name":"Sumatran Lar Gibbon","id":10,"genus":"Hylobates","species":"lar vestitus"}},{"monkey":{"name":"Yunnan Lar Gibbon","id":11,"genus":"Hylobates","species":"lar yunnanensis"}},{"monkey":{"name":"Mountain Agile Gibbon","id":12,"genus":"Hylobates","species":"agilis agilis"}},{"monkey":{"name":"Bornean White-bearded Gibbon","id":13,"genus":"Hylobates","species":"agilis albibarbis"}},{"monkey":{"name":"Lowland Agile Gibbon","id":14,"genus":"Hylobates","species":"agilis unko"}},{"monkey":{"name":"M\u00fcller's Gray Gibbon","id":15,"genus":"Hylobates","species":"muelleri muelleri"}},{"monkey":{"name":"Abbott's Gray Gibbon","id":"018","genus":"Hylobates","species":"muelleri abbotti"}},{"monkey":{"name":"Northern Gray Gibbon","id":"019","genus":"Hylobates","species":"muelleri funereus"}},{"monkey":{"name":"Black Tamarin","id":16,"genus":"Saguinas","species":"niger"}},{"monkey":{"name":"Black-mantled 
Tamarin","id":17,"genus":"Saguinas","species":"nigricollis"}},{"monkey":{"name":"Brown-mantled Tamarin","id":18,"genus":"Saguinas","species":"fuscicollis"}},{"monkey":{"name":"Cottontop Tamarin or Pinch\u00e9 Tamarin","id":19,"genus":"Saguinas","species":"oedipus"}},{"monkey":{"name":"Emperor Tamarin","id":20,"genus":"Saguinas","species":"imperator"}},{"monkey":{"name":"Geoffroy's Tamarin","id":21,"genus":"Saguinas","species":"geoffroyi"}},{"monkey":{"name":"Golden-mantled Tamarin","id":22,"genus":"Saguinas","species":"tripartitus"}},{"monkey":{"name":"Graells's Tamarin","id":23,"genus":"Saguinas","species":"graellsi"}},{"monkey":{"name":"Martins's Tamarin","id":"028","genus":"Saguinas","species":"martinsi"}},{"monkey":{"name":"Mottle-faced Tamarin","id":"029","genus":"Saguinas","species":"inustus"}},{"monkey":{"name":"Moustached Tamarin","id":24,"genus":"Saguinas","species":"mystax"}},{"monkey":{"name":"Pied Tamarin","id":25,"genus":"Saguinas","species":"bicolor"}},{"monkey":{"name":"Red-capped Tamarin","id":26,"genus":"Saguinas","species":"pileatus"}},{"monkey":{"name":"Red-handed Tamarin","id":27,"genus":"Saguinas","species":"midas"}},{"monkey":{"name":"White-footed Tamarin","id":28,"genus":"Saguinas","species":"leucopus"}},{"monkey":{"name":"White-lipped Tamarin","id":29,"genus":"Saguinas","species":"labiatus"}},{"monkey":{"name":"White-mantled Tamarin","id":30,"genus":"Saguinas","species":"melanoleucus"}},{"monkey":{"name":"Allen's Swamp Monkey","id":31,"genus":"Allenopithecus","species":"nigroviridis"}},{"monkey":{"name":"Angolan Talapoin","id":"038","genus":"Miopithecus","species":"talapoin"}},{"monkey":{"name":"Gabon Talapoin","id":"039","genus":"Miopithecus","species":"ogouensis"}},{"monkey":{"name":"Patas Monkey","id":32,"genus":"Erythrocebus","species":"patas"}},{"monkey":{"name":"Green Monkey","id":33,"genus":"Chlorocebus","species":"sabaeus"}},{"monkey":{"name":"Grivet","id":34,"genus":"Chlorocebus","species":"aethiops"}},{"monkey":{"name":"Bale 
Mountains Vervet","id":35,"genus":"Chlorocebus","species":"djamdjamensis"}},{"monkey":{"name":"Tantalus Monkey","id":36,"genus":"Chlorocebus","species":"tantalus"}},{"monkey":{"name":"Vervet Monkey","id":37,"genus":"Chlorocebus","species":"pygerythrus"}},{"monkey":{"name":"Malbrouck","id":38,"genus":"Chlorocebus","species":"cynosuros"}},{"monkey":{"name":"Dryas Monkey or Salongo Monkey","id":39,"genus":"Cercopithecus","species":"dryas"}},{"monkey":{"name":"Diana Monkey","id":"048","genus":"Cercopithecus","species":"diana"}},{"monkey":{"name":"Roloway Monkey","id":"049","genus":"Cercopithecus","species":"roloway"}},{"monkey":{"name":"Greater Spot-nosed Monkey","id":40,"genus":"Cercopithecus","species":"nictitans"}},{"monkey":{"name":"Blue Monkey","id":41,"genus":"Cercopithecus","species":"mitis"}},{"monkey":{"name":"Silver Monkey","id":42,"genus":"Cercopithecus","species":"doggetti"}},{"monkey":{"name":"Golden Monkey","id":43,"genus":"Cercopithecus","species":"kandti"}},{"monkey":{"name":"Sykes's Monkey","id":44,"genus":"Cercopithecus","species":"albogularis"}},{"monkey":{"name":"Mona Monkey","id":45,"genus":"Cercopithecus","species":"mona"}},{"monkey":{"name":"Campbell's Mona Monkey","id":46,"genus":"Cercopithecus","species":"campbelli"}},{"monkey":{"name":"Lowe's Mona Monkey","id":47,"genus":"Cercopithecus","species":"lowei"}},{"monkey":{"name":"Crested Mona Monkey","id":"058","genus":"Cercopithecus","species":"pogonias"}},{"monkey":{"name":"Wolf's Mona Monkey","id":"059","genus":"Cercopithecus","species":"wolfi"}},{"monkey":{"name":"Dent's Mona Monkey","id":48,"genus":"Cercopithecus","species":"denti"}},{"monkey":{"name":"Lesser Spot-nosed Monkey","id":49,"genus":"Cercopithecus","species":"petaurista"}},{"monkey":{"name":"White-throated Guenon","id":50,"genus":"Cercopithecus","species":"erythrogaster"}},{"monkey":{"name":"Sclater's Guenon","id":51,"genus":"Cercopithecus","species":"sclateri"}},{"monkey":{"name":"Red-eared 
Guenon","id":52,"genus":"Cercopithecus","species":"erythrotis"}},{"monkey":{"name":"Moustached Guenon","id":53,"genus":"Cercopithecus","species":"cephus"}},{"monkey":{"name":"Red-tailed Monkey","id":54,"genus":"Cercopithecus","species":"ascanius"}},{"monkey":{"name":"L'Hoest's Monkey","id":55,"genus":"Cercopithecus","species":"lhoesti"}},{"monkey":{"name":"Preuss's Monkey","id":"068","genus":"Cercopithecus","species":"preussi"}},{"monkey":{"name":"Sun-tailed Monkey","id":"069","genus":"Cercopithecus","species":"solatus"}},{"monkey":{"name":"Hamlyn's Monkey","id":56,"genus":"Cercopithecus","species":"hamlyni"}},{"monkey":{"name":"De Brazza's Monkey","id":57,"genus":"Cercopithecus","species":"neglectus"}},{"monkey":{"name":"Barbary Macaque","id":58,"genus":"Macaca","species":"sylvanus"}},{"monkey":{"name":"Lion-tailed Macaque","id":59,"genus":"Macaca","species":"silenus"}},{"monkey":{"name":"Southern Pig-tailed Macaque or Beruk","id":60,"genus":"Macaca","species":"nemestrina"}},{"monkey":{"name":"Northern Pig-tailed Macaque","id":61,"genus":"Macaca","species":"leonina"}},{"monkey":{"name":"Pagai Island Macaque or Bokkoi","id":62,"genus":"Macaca","species":"pagensis"}},{"monkey":{"name":"Siberut Macaque","id":63,"genus":"Macaca","species":"siberu"}},{"monkey":{"name":"Moor Macaque","id":"078","genus":"Macaca","species":"maura"}},{"monkey":{"name":"Booted Macaque","id":"079","genus":"Macaca","species":"ochreata"}},{"monkey":{"name":"Tonkean Macaque","id":"080","genus":"Macaca","species":"tonkeana"}},{"monkey":{"name":"Heck's Macaque","id":"081","genus":"Macaca","species":"hecki"}},{"monkey":{"name":"Gorontalo Macaque","id":"082","genus":"Macaca","species":"nigrescens"}},{"monkey":{"name":"Celebes Crested Macaque or Black Ape","id":"083","genus":"Macaca","species":"nigra"}},{"monkey":{"name":"Crab-eating Macaque or Long-tailed Macaque or Kera","id":"084","genus":"Macaca","species":"fascicularis"}},{"monkey":{"name":"Stump-tailed Macaque or Bear 
Macaque","id":"085","genus":"Macaca","species":"arctoides"}},{"monkey":{"name":"Rhesus Macaque","id":"086","genus":"Macaca","species":"mulatta"}},{"monkey":{"name":"Formosan Rock Macaque","id":"087","genus":"Macaca","species":"cyclopis"}},{"monkey":{"name":"Japanese Macaque","id":"088","genus":"Macaca","species":"fuscata"}},{"monkey":{"name":"Toque Macaque","id":"089","genus":"Macaca","species":"sinica"}},{"monkey":{"name":"Bonnet Macaque","id":"090","genus":"Macaca","species":"radiata"}},{"monkey":{"name":"Assam Macaque","id":"091","genus":"Macaca","species":"assamensis"}},{"monkey":{"name":"Tibetan Macaque or Milne-Edwards' Macaque","id":"092","genus":"Macaca","species":"thibetana"}},{"monkey":{"name":"Arunachal Macaque or Munzala","id":"093","genus":"Macaca","species":"munzala"}},{"monkey":{"name":"Grey-cheeked Mangabey","id":"094","genus":"Lophocebus","species":"albigena"}},{"monkey":{"name":"Black Crested Mangabey","id":"095","genus":"Lophocebus","species":"aterrimus"}},{"monkey":{"name":"Opdenbosch's Mangabey","id":"096","genus":"Lophocebus","species":"opdenboschi"}},{"monkey":{"name":"Uganda Mangabey","id":"097","genus":"Lophocebus","species":"ugandae"}},{"monkey":{"name":"Johnston's Mangabey","id":"098","genus":"Lophocebus","species":"johnstoni"}},{"monkey":{"name":"Osman Hill's Mangabey","id":"099","genus":"Lophocebus","species":"osmani"}},{"monkey":{"name":"Kipunji","id":100,"genus":"Rungwecebus","species":"kipunji"}},{"monkey":{"name":"Hamadryas Baboon","id":101,"genus":"Papio","species":"hamadryas"}},{"monkey":{"name":"Guinea Baboon","id":102,"genus":"Papio","species":"papio"}},{"monkey":{"name":"Olive Baboon","id":103,"genus":"Papio","species":"anubis"}},{"monkey":{"name":"Yellow Baboon","id":104,"genus":"Papio","species":"cynocephalus"}},{"monkey":{"name":"Chacma Baboon","id":105,"genus":"Papio","species":"ursinus"}},{"monkey":{"name":"Gelada","id":106,"genus":"Theropithecus","species":"gelada"}},{"monkey":{"name":"Sooty 
Mangabey","id":107,"genus":"Cercocebus","species":"atys"}},{"monkey":{"name":"Collared Mangabey","id":108,"genus":"Cercocebus","species":"torquatus"}},{"monkey":{"name":"Agile Mangabey","id":109,"genus":"Cercocebus","species":"agilis"}},{"monkey":{"name":"Golden-bellied Mangabey","id":110,"genus":"Cercocebus","species":"chrysogaster"}},{"monkey":{"name":"Tana River Mangabey","id":111,"genus":"Cercocebus","species":"galeritus"}},{"monkey":{"name":"Sanje Mangabey","id":112,"genus":"Cercocebus","species":"sanjei"}},{"monkey":{"name":"Mandrill","id":113,"genus":"Mandrillus","species":"sphinx"}},{"monkey":{"name":"Drill","id":114,"genus":"Mandrillus","species":"leucophaeus"}},{"monkey":{"name":"Black Colobus","id":115,"genus":"Colobus","species":"satanas"}},{"monkey":{"name":"Angola Colobus","id":116,"genus":"Colobus","species":"angolensis"}},{"monkey":{"name":"King Colobus","id":117,"genus":"Colobus","species":"polykomos"}},{"monkey":{"name":"Ursine Colobus","id":118,"genus":"Colobus","species":"vellerosus"}},{"monkey":{"name":"Mantled Guereza","id":119,"genus":"Colobus","species":"guereza"}},{"monkey":{"name":"Western Red Colobus","id":120,"genus":"Piliocolobus","species":"badius"}},{"monkey":{"name":"Pennant's Colobus","id":121,"genus":"Piliocolobus","species":"pennantii"}},{"monkey":{"name":"Preuss's Red Colobus","id":122,"genus":"Piliocolobus","species":"preussi"}},{"monkey":{"name":"Thollon's Red Colobus","id":123,"genus":"Piliocolobus","species":"tholloni"}},{"monkey":{"name":"Central African Red Colobus","id":124,"genus":"Piliocolobus","species":"foai"}},{"monkey":{"name":"Ugandan Red Colobus","id":125,"genus":"Piliocolobus","species":"tephrosceles"}},{"monkey":{"name":"Uzungwa Red Colobus","id":126,"genus":"Piliocolobus","species":"gordonorum"}},{"monkey":{"name":"Zanzibar Red Colobus","id":127,"genus":"Piliocolobus","species":"kirkii"}},{"monkey":{"name":"Tana River Red 
Colobus","id":128,"genus":"Piliocolobus","species":"rufomitratus"}},{"monkey":{"name":"Olive Colobus","id":129,"genus":"Procolobus","species":"verus"}},{"monkey":{"name":"Maroon Leaf Monkey","id":130,"genus":"Presbytis","species":"rubicunda"}}]}
data/spec/data/sample.tsv CHANGED
@@ -81,7 +81,7 @@ ID Name Genus Species
81
81
  080 Tonkean Macaque Macaca tonkeana
82
82
  081 Heck's Macaque Macaca hecki
83
83
  082 Gorontalo Macaque Macaca nigrescens
84
- 083 Celebes Crested Macaque or Black "Ape" Macaca nigra
84
+ 083 Celebes Crested Macaque or Black Ape Macaca nigra
85
85
  084 Crab-eating Macaque or Long-tailed Macaque or Kera Macaca fascicularis
86
86
  085 Stump-tailed Macaque or Bear Macaque Macaca arctoides
87
87
  086 Rhesus Macaque Macaca mulatta
data/spec/data/sample.txt CHANGED
@@ -81,7 +81,7 @@ ID,Name,Genus,Species
81
81
  080,Tonkean Macaque,Macaca,tonkeana
82
82
  081,Heck's Macaque,Macaca,hecki
83
83
  082,Gorontalo Macaque,Macaca,nigrescens
84
- 083,Celebes Crested Macaque or Black "Ape",Macaca,nigra
84
+ 083,Celebes Crested Macaque or Black Ape,Macaca,nigra
85
85
  084,Crab-eating Macaque or Long-tailed Macaque or Kera,Macaca,fascicularis
86
86
  085,Stump-tailed Macaque or Bear Macaque,Macaca,arctoides
87
87
  086,Rhesus Macaque,Macaca,mulatta
data/spec/data/sample.xml CHANGED
@@ -412,7 +412,7 @@
412
412
  <monkey id="083">
413
413
  <genus>Macaca</genus>
414
414
  <species>nigra</species>
415
- <name>Celebes Crested Macaque or Black "Ape"</name>
415
+ <name>Celebes Crested Macaque or Black Ape</name>
416
416
  </monkey>
417
417
  <monkey id="084">
418
418
  <genus>Macaca</genus>
@@ -412,7 +412,7 @@ monkeys:
412
412
  species: nigrescens
413
413
  - monkey:
414
414
  id: 083
415
- name: Celebes Crested Macaque or Black "Ape"
415
+ name: Celebes Crested Macaque or Black Ape
416
416
  genus: Macaca
417
417
  species: nigra
418
418
  - monkey:
@@ -0,0 +1,32 @@
1
+ require File.dirname(__FILE__) + "/../../spec_helper"
2
+ require File.dirname(__FILE__) + "/../utils/shared_paths_spec"
3
+
4
+ describe IMW::Dataset do
5
+
6
+ describe 'setting default paths' do
7
+
8
+ before do
9
+ @dataset = IMW::Dataset.new(:testing, :root => IMWTest::TMP_DIR)
10
+ end
11
+
12
+ it "should set its root path to the value given" do
13
+ @dataset.path_to(:root).should == IMWTest::TMP_DIR
14
+ end
15
+
16
+ it "should set paths for each workflow dir" do
17
+ @dataset.workflow_dirs.each do |dir|
18
+ @dataset.path_to(dir).should == File.join(IMWTest::TMP_DIR, dir.to_s)
19
+ end
20
+ end
21
+
22
+ before do
23
+ @path_manager = @dataset
24
+ end
25
+ it_should_behave_like "an object that manages paths"
26
+
27
+ end
28
+ end
29
+
30
+
31
+
32
+
@@ -0,0 +1,41 @@
1
+ require File.dirname(__FILE__) + "/../../spec_helper"
2
+ require 'imw/dataset/workflow'
3
+ describe IMW::Workflow do
4
+
5
+ before do
6
+ @dataset = IMW::Dataset.new :testing
7
+ end
8
+
9
+ it "should dynamically define methods for each workflow step" do
10
+ @dataset.workflow_steps.each do |step|
11
+ @dataset.respond_to?(step).should be_true
12
+ end
13
+ end
14
+
15
+ describe "initializing workflow" do
16
+ it "should not make any directories if no tasks are invoked" do
17
+ @dataset.path_to(:root).should_not contain(*@dataset.workflow_dirs.map(&:to_s))
18
+ end
19
+
20
+ it "should only make directories once a task is invoked" do
21
+ @dataset[:initialize].invoke
22
+ @dataset.path_to(:root).should contain(*@dataset.workflow_dirs.map(&:to_s))
23
+ end
24
+ end
25
+
26
+ describe "cleaning workflow directories" do
27
+ it "should clean without error even if there's nothing to clean" do
28
+ @dataset[:clean].invoke
29
+ @dataset.path_to(:root).should_not contain(*@dataset.workflow_dirs.map(&:to_s))
30
+ end
31
+
32
+ it "should remove workflow directories when invoked" do
33
+ @dataset[:initialize].invoke
34
+ IMWTest::Random.file(@dataset.path_to(:ripd, 'foobar.txt')) # put a file in
35
+ @dataset[:clean].invoke
36
+ @dataset.path_to(:root).should_not contain(*@dataset.workflow_dirs.map(&:to_s))
37
+ end
38
+ end
39
+
40
+ end
41
+