imw 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (143) hide show
  1. data/.gitignore +4 -1
  2. data/Rakefile +10 -0
  3. data/TODO +18 -0
  4. data/VERSION +1 -1
  5. data/bin/imw +1 -1
  6. data/etc/imwrc.rb +0 -50
  7. data/examples/dataset.rb +12 -0
  8. data/lib/imw/boot.rb +55 -9
  9. data/lib/imw/dataset/paths.rb +15 -24
  10. data/lib/imw/dataset/workflow.rb +131 -72
  11. data/lib/imw/dataset.rb +94 -186
  12. data/lib/imw/parsers/html_parser.rb +1 -1
  13. data/lib/imw/parsers.rb +1 -1
  14. data/lib/imw/repository.rb +3 -27
  15. data/lib/imw/resource.rb +190 -0
  16. data/lib/imw/resources/archive.rb +97 -0
  17. data/lib/imw/resources/archives_and_compressed/bz2.rb +18 -0
  18. data/lib/imw/resources/archives_and_compressed/gz.rb +18 -0
  19. data/lib/imw/resources/archives_and_compressed/rar.rb +23 -0
  20. data/lib/imw/resources/archives_and_compressed/tar.rb +23 -0
  21. data/lib/imw/resources/archives_and_compressed/tarbz2.rb +78 -0
  22. data/lib/imw/resources/archives_and_compressed/targz.rb +78 -0
  23. data/lib/imw/resources/archives_and_compressed/zip.rb +57 -0
  24. data/lib/imw/resources/archives_and_compressed.rb +32 -0
  25. data/lib/imw/resources/compressed_file.rb +89 -0
  26. data/lib/imw/resources/compressible.rb +77 -0
  27. data/lib/imw/resources/formats/delimited.rb +92 -0
  28. data/lib/imw/resources/formats/excel.rb +125 -0
  29. data/lib/imw/resources/formats/json.rb +53 -0
  30. data/lib/imw/resources/formats/sgml.rb +72 -0
  31. data/lib/imw/resources/formats/yaml.rb +53 -0
  32. data/lib/imw/resources/formats.rb +32 -0
  33. data/lib/imw/resources/local.rb +198 -0
  34. data/lib/imw/resources/remote.rb +110 -0
  35. data/lib/imw/resources/schemes/hdfs.rb +242 -0
  36. data/lib/imw/resources/schemes/http.rb +161 -0
  37. data/lib/imw/resources/schemes/s3.rb +137 -0
  38. data/lib/imw/resources/schemes.rb +19 -0
  39. data/lib/imw/resources.rb +118 -0
  40. data/lib/imw/runner.rb +5 -4
  41. data/lib/imw/transforms/archiver.rb +215 -0
  42. data/lib/imw/transforms/transferer.rb +103 -0
  43. data/lib/imw/transforms.rb +8 -0
  44. data/lib/imw/utils/error.rb +26 -30
  45. data/lib/imw/utils/extensions/array.rb +5 -15
  46. data/lib/imw/utils/extensions/hash.rb +6 -16
  47. data/lib/imw/utils/extensions/hpricot.rb +0 -14
  48. data/lib/imw/utils/extensions/string.rb +5 -15
  49. data/lib/imw/utils/extensions/symbol.rb +0 -13
  50. data/lib/imw/utils/extensions.rb +65 -0
  51. data/lib/imw/utils/log.rb +14 -13
  52. data/lib/imw/utils/misc.rb +0 -6
  53. data/lib/imw/utils/paths.rb +101 -42
  54. data/lib/imw/utils/version.rb +8 -9
  55. data/lib/imw/utils.rb +2 -18
  56. data/lib/imw.rb +92 -17
  57. data/spec/data/sample.csv +1 -1
  58. data/spec/data/sample.json +1 -0
  59. data/spec/data/sample.tsv +1 -1
  60. data/spec/data/sample.txt +1 -1
  61. data/spec/data/sample.xml +1 -1
  62. data/spec/data/sample.yaml +1 -1
  63. data/spec/imw/dataset/paths_spec.rb +32 -0
  64. data/spec/imw/dataset/workflow_spec.rb +41 -0
  65. data/spec/imw/resource_spec.rb +79 -0
  66. data/spec/imw/resources/archive_spec.rb +69 -0
  67. data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +15 -0
  68. data/spec/imw/resources/archives_and_compressed/gz_spec.rb +15 -0
  69. data/spec/imw/resources/archives_and_compressed/rar_spec.rb +16 -0
  70. data/spec/imw/resources/archives_and_compressed/tar_spec.rb +16 -0
  71. data/spec/imw/resources/archives_and_compressed/tarbz2_spec.rb +24 -0
  72. data/spec/imw/resources/archives_and_compressed/targz_spec.rb +21 -0
  73. data/spec/imw/resources/archives_and_compressed/zip_spec.rb +16 -0
  74. data/spec/imw/resources/compressed_file_spec.rb +48 -0
  75. data/spec/imw/resources/compressible_spec.rb +36 -0
  76. data/spec/imw/resources/formats/delimited_spec.rb +33 -0
  77. data/spec/imw/resources/formats/json_spec.rb +32 -0
  78. data/spec/imw/resources/formats/sgml_spec.rb +24 -0
  79. data/spec/imw/resources/formats/yaml_spec.rb +41 -0
  80. data/spec/imw/resources/local_spec.rb +98 -0
  81. data/spec/imw/resources/remote_spec.rb +35 -0
  82. data/spec/imw/resources/schemes/hdfs_spec.rb +61 -0
  83. data/spec/imw/resources/schemes/http_spec.rb +19 -0
  84. data/spec/imw/resources/schemes/s3_spec.rb +19 -0
  85. data/spec/imw/transforms/archiver_spec.rb +120 -0
  86. data/spec/imw/transforms/transferer_spec.rb +113 -0
  87. data/spec/imw/utils/paths_spec.rb +5 -33
  88. data/spec/imw/utils/shared_paths_spec.rb +29 -0
  89. data/spec/spec_helper.rb +5 -5
  90. data/spec/support/paths_matcher.rb +67 -0
  91. data/spec/support/random.rb +39 -36
  92. metadata +88 -75
  93. data/lib/imw/dataset/task.rb +0 -41
  94. data/lib/imw/files/archive.rb +0 -113
  95. data/lib/imw/files/basicfile.rb +0 -122
  96. data/lib/imw/files/binary.rb +0 -28
  97. data/lib/imw/files/compressed_file.rb +0 -93
  98. data/lib/imw/files/compressed_files_and_archives.rb +0 -334
  99. data/lib/imw/files/compressible.rb +0 -103
  100. data/lib/imw/files/csv.rb +0 -113
  101. data/lib/imw/files/directory.rb +0 -62
  102. data/lib/imw/files/excel.rb +0 -84
  103. data/lib/imw/files/json.rb +0 -41
  104. data/lib/imw/files/sgml.rb +0 -46
  105. data/lib/imw/files/text.rb +0 -68
  106. data/lib/imw/files/yaml.rb +0 -46
  107. data/lib/imw/files.rb +0 -125
  108. data/lib/imw/packagers/archiver.rb +0 -126
  109. data/lib/imw/packagers/s3_mover.rb +0 -36
  110. data/lib/imw/packagers.rb +0 -8
  111. data/lib/imw/utils/components.rb +0 -61
  112. data/lib/imw/utils/config.rb +0 -46
  113. data/lib/imw/utils/extensions/class/attribute_accessors.rb +0 -8
  114. data/lib/imw/utils/extensions/core.rb +0 -27
  115. data/lib/imw/utils/extensions/dir.rb +0 -24
  116. data/lib/imw/utils/extensions/file_core.rb +0 -64
  117. data/lib/imw/utils/extensions/typed_struct.rb +0 -22
  118. data/lib/imw/utils/extensions/uri.rb +0 -59
  119. data/lib/imw/utils/view/dump_csv.rb +0 -112
  120. data/lib/imw/utils/view/dump_csv_older.rb +0 -117
  121. data/lib/imw/utils/view.rb +0 -113
  122. data/spec/imw/dataset/datamapper/uri_spec.rb +0 -43
  123. data/spec/imw/dataset/datamapper_spec_helper.rb +0 -11
  124. data/spec/imw/files/archive_spec.rb +0 -118
  125. data/spec/imw/files/basicfile_spec.rb +0 -121
  126. data/spec/imw/files/bz2_spec.rb +0 -32
  127. data/spec/imw/files/compressed_file_spec.rb +0 -96
  128. data/spec/imw/files/compressible_spec.rb +0 -100
  129. data/spec/imw/files/file_spec.rb +0 -144
  130. data/spec/imw/files/gz_spec.rb +0 -32
  131. data/spec/imw/files/rar_spec.rb +0 -33
  132. data/spec/imw/files/tar_spec.rb +0 -31
  133. data/spec/imw/files/text_spec.rb +0 -23
  134. data/spec/imw/files/zip_spec.rb +0 -31
  135. data/spec/imw/files_spec.rb +0 -38
  136. data/spec/imw/packagers/archiver_spec.rb +0 -125
  137. data/spec/imw/packagers/s3_mover_spec.rb +0 -7
  138. data/spec/imw/utils/extensions/file_core_spec.rb +0 -72
  139. data/spec/imw/utils/extensions/find_spec.rb +0 -113
  140. data/spec/imw/workflow/rip/local_spec.rb +0 -89
  141. data/spec/imw/workflow/rip_spec.rb +0 -27
  142. data/spec/support/archive_contents_matcher.rb +0 -94
  143. data/spec/support/directory_contents_matcher.rb +0 -61
data/lib/imw/utils/log.rb CHANGED
@@ -1,16 +1,16 @@
1
1
  require 'logger'
2
2
 
3
3
  module IMW
4
+
5
+ # Default log file.
4
6
  LOG_FILE_DESTINATION = STDERR unless defined?(LOG_FILE_DESTINATION)
7
+
5
8
  LOG_TIMEFORMAT = "%Y%m%d-%H:%M:%S " unless defined?(LOG_TIMEFORMAT)
6
9
 
7
10
  class << self; attr_accessor :log end
8
- #
9
- # Create a Logger and point it at LOG_FILE_DESTINATION
10
- #
11
- # LOG_FILE_DESTINATION is STDOUT by default; redefine it in your
12
- # ~/.imwrc, or set IMW.log yourself, if that's not cool.
13
- #
11
+
12
+ # Create a Logger and point it at IMW::LOG_FILE_DESTINATION which is
13
+ # set in ~/.imwrc and defaults to STDERR.
14
14
  def self.instantiate_logger!
15
15
  IMW.log ||= Logger.new(LOG_FILE_DESTINATION)
16
16
  IMW.log.datetime_format = "%Y%m%d-%H:%M:%S "
@@ -18,15 +18,19 @@ module IMW
18
18
  end
19
19
 
20
20
  def announce *events
21
- options = events.extract_options!
21
+ options = events.flatten.extract_options!
22
22
  options.reverse_merge! :level => Logger::INFO
23
- # puts [options, events ].inspect, "*"*76
24
23
  IMW.log.add options[:level], events.join("\n")
25
24
  end
26
25
  def banner *events
27
- options = events.extract_options!
26
+ options = events.flatten.extract_options!
28
27
  options.reverse_merge! :level => Logger::INFO
29
- ["*"*75, events, "*"*75].flatten.each{|ev| announce(ev, options) }
28
+ announce(["*"*75, events, "*"*75], options)
29
+ end
30
+ def warn *events
31
+ options = events.flatten.extract_options!
32
+ options.reverse_merge! :level => Logger::WARN
33
+ announce events, options
30
34
  end
31
35
 
32
36
  PROGRESS_TRACKERS = {}
@@ -61,7 +65,4 @@ module IMW
61
65
  end
62
66
  end
63
67
 
64
- #
65
- # Make the default logger
66
- #
67
68
  IMW.instantiate_logger!
@@ -1,10 +1,4 @@
1
1
  module IMW
2
- # Return a string representing the current UTC time in the IMW
3
- # format.
4
- def self.current_utc_time_string
5
- Time.now.utc.strftime(IMW::STRFTIME_FORMAT)
6
- end
7
-
8
2
 
9
3
  # A simple counter. The +value+ and +add+ methods read and
10
4
  # increment the counter's value.
@@ -1,3 +1,5 @@
1
+ require 'pathname'
2
+
1
3
  module IMW
2
4
 
3
5
  # Implements methods designed to work with an object's
@@ -9,19 +11,48 @@ module IMW
9
11
  # <tt>@paths</tt>.
10
12
  module Paths
11
13
 
12
- # Expands a shorthand workflow path specification to an
13
- # actual file path.
14
+ # Expands a shorthand workflow path specification to an actual
15
+ # file path. Strings are interpreted literally but symbols are
16
+ # first resolved to the paths they represent.
17
+ #
18
+ # add_path :foo, '~/whoa'
19
+ # path_to :foo, 'my_thing'
20
+ # => '~/whoa/my_thing'
14
21
  #
15
- # add_path :mlb_08, 'gd2.mlb.com/components/game/mlb/year_2008'
16
- # path_to :ripd, :mlb_08, 'month_06', 'day_08', 'miniscoreboard.xml'
17
- # => (...)/data/ripd/gd2.mlb.com/components/game/mlb/year_2008/month_06/day_08/miniscoreboard.xml
22
+ # @param [String, Symbol] pathsegs the path segments to join
23
+ # @return [String] the resulting expanded path
18
24
  def path_to *pathsegs
19
- begin
20
- path = Pathname.new path_to_helper(*pathsegs)
21
- path.absolute? ? File.expand_path(path) : path.to_s
22
- rescue Exception => e
23
- raise("Can't find path to '#{pathsegs}': #{e}");
24
- end
25
+ path = Pathname.new path_to_helper(*pathsegs)
26
+ path.absolute? ? File.expand_path(path) : path.to_s
27
+ end
28
+
29
+ # Return the presently defined paths for this object.
30
+ #
31
+ # @return [Hash]
32
+ def paths
33
+ @paths ||= {}
34
+ end
35
+
36
+ # Adds a symbolic path for expansion by +path_to+.
37
+ #
38
+ # add_path :foo, '~/whoa'
39
+ # add_path :bar, :foo, 'baz'
40
+ # path_to :bar
41
+ # => '~/whoa/baz'
42
+ #
43
+ # @param [Symbol] sym the name of the path to store
44
+ # @param [Symbol, String] pathsegs the path segments to use to define the path to the name
45
+ # @return [String] the resulting path
46
+ def add_path sym, *pathsegs
47
+ paths[sym] = pathsegs.flatten
48
+ path_to(sym)
49
+ end
50
+
51
+ # Removes a symbolic path for expansion by +path_to+.
52
+ #
53
+ # @param [Symbol] sym the stored path symbol to remove
54
+ def remove_path sym
55
+ paths.delete sym if paths.include? sym
25
56
  end
26
57
 
27
58
  private
@@ -29,7 +60,7 @@ module IMW
29
60
  # +path_to_helper+ handles the recursive calls for +path_to+.
30
61
  expanded = pathsegs.flatten.compact.map do |pathseg|
31
62
  case
32
- when pathseg.is_a?(Symbol) && @paths.include?(pathseg) then path_to(@paths[pathseg])
63
+ when pathseg.is_a?(Symbol) && paths.include?(pathseg) then path_to(paths[pathseg])
33
64
  when pathseg.is_a?(Symbol) && IMW::PATHS.include?(pathseg) then path_to(IMW::PATHS[pathseg])
34
65
  when pathseg.is_a?(Symbol) then raise IMW::PathError.new("No path expansion set for #{pathseg.inspect}")
35
66
  else pathseg
@@ -37,29 +68,70 @@ module IMW
37
68
  end
38
69
  File.join(*expanded)
39
70
  end
40
- public
71
+ end
41
72
 
42
- # Adds a symbolic path for expansion by +path_to+.
43
- def add_path sym, *pathsegs
44
- @paths[sym] = pathsegs.flatten
45
- end
46
73
 
47
- # Removes a symbolic path for expansion by +path_to+.
48
- def remove_path sym
49
- @paths.delete sym if @paths.include? sym
50
- end
51
- end
74
+ # Default paths for the IMW. Chosen to make sense on most *NIX
75
+ # distributions.
76
+ DEFAULT_PATHS = {
77
+ :home => ENV['HOME'],
78
+ :data_root => "/var/lib/imw",
79
+ :log_root => "/var/log/imw",
80
+ :scripts_root => "/usr/share/imw",
81
+ :tmp_root => "/tmp/imw",
82
+
83
+ # the imw library
84
+ :imw_root => File.expand_path(File.dirname(__FILE__) + "/../../.."),
85
+ :imw_bin => [:imw_root, 'bin'],
86
+ :imw_etc => [:imw_root, 'etc'],
87
+ :imw_lib => [:imw_root, 'lib'],
88
+
89
+ # workflow
90
+ :ripd_root => [:data_root, 'ripd'],
91
+ :rawd_root => [:data_root, 'rawd'],
92
+ :fixd_root => [:data_root, 'fixd'],
93
+ :pkgd_root => [:data_root, 'pkgd']
94
+ }
95
+ defined?(PATHS) ? PATHS.reverse_merge!(DEFAULT_PATHS) : PATHS = DEFAULT_PATHS
52
96
 
97
+ # Expands a shorthand workflow path specification to an actual
98
+ # file path. Strings are interpreted literally but symbols are
99
+ # first resolved to the paths they represent.
100
+ #
101
+ # IMW.add_path :foo, '~/whoa'
102
+ # IMW.path_to :foo, 'my_thing'
103
+ # => '~/whoa/my_thing'
104
+ #
105
+ # @param [String, Symbol] pathsegs the path segments to join
106
+ # @return [String] the resulting expanded path
53
107
  def self.path_to *pathsegs
54
- begin
55
- path = Pathname.new IMW.path_to_helper(*pathsegs)
56
- path.absolute? ? File.expand_path(path) : path.to_s
57
- rescue Exception => e
58
- raise("Can't find path to '#{pathsegs}': #{e}");
59
- end
108
+ path = Pathname.new IMW.path_to_helper(*pathsegs)
109
+ path.absolute? ? File.expand_path(path) : path.to_s
110
+ end
111
+
112
+ # Adds a symbolic path for expansion by +path_to+.
113
+ #
114
+ # IMW.add_path :foo, '~/whoa'
115
+ # IMW.add_path :bar, :foo, 'baz'
116
+ # IMW.path_to :bar
117
+ # => '~/whoa/baz'
118
+ #
119
+ # @param [Symbol] sym the name of the path to store
120
+ # @param [Symbol, String] pathsegs the path segments to use to define the path to the name
121
+ # @return [String] the resulting path
122
+ def self.add_path sym, *pathsegs
123
+ IMW::PATHS[sym] = pathsegs.flatten
124
+ path_to[sym]
60
125
  end
61
126
 
62
- private
127
+ # Removes a symbolic path for expansion by +path_to+.
128
+ #
129
+ # @param [Symbol] sym the stored path symbol to remove
130
+ def self.remove_path sym
131
+ IMW::PATHS.delete sym if IMW::PATHS.include? sym
132
+ end
133
+
134
+ protected
63
135
  def self.path_to_helper *pathsegs # :nodoc:
64
136
  # +path_to_helper+ handles the recursive calls for +path_to+.
65
137
  expanded = pathsegs.flatten.compact.map do |pathseg|
@@ -71,17 +143,4 @@ module IMW
71
143
  end
72
144
  File.join(*expanded)
73
145
  end
74
- public
75
-
76
- # Adds a symbolic path for expansion by +path_to+.
77
- def self.add_path sym, *pathsegs
78
- IMW::PATHS[sym] = pathsegs.flatten
79
- end
80
-
81
- # Removes a symbolic path for expansion by +path_to+.
82
- def self.remove_path sym
83
- IMW::PATHS.delete sym if IMW::PATHS.include? sym
84
- end
85
146
  end
86
-
87
- # puts "#{File.basename(__FILE__)}: Your monkeywrench glows alternately dim then bright as you wander, suggesting to you which paths to take."
@@ -1,12 +1,11 @@
1
- # copied from activewarehouse-etl gem
2
- module IMWVersion #:nodoc:
1
+ module IMW
3
2
  unless defined?(VERSION)
4
- module VERSION #:nodoc:
5
- MAJOR = 0
6
- MINOR = 0
7
- TINY = 0
8
-
9
- STRING = [MAJOR, MINOR, TINY].join('.')
10
- end
3
+ module VERSION #:nodoc:
4
+ MAJOR = 0
5
+ MINOR = 0
6
+ TINY = 0
7
+
8
+ STRING = [MAJOR, MINOR, TINY].join('.')
9
+ end
11
10
  end
12
11
  end
data/lib/imw/utils.rb CHANGED
@@ -1,24 +1,8 @@
1
- #
2
- # h2. lib/imw/utils.rb -- utility functions
3
- #
4
- # == About
5
- #
6
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
7
- # Copyright:: Copyright (c) 2008 infochimps.org
8
- # License:: GPL 3.0
9
- # Website:: http://infinitemonkeywrench.org/
10
- #
11
-
12
1
  require 'rubygems'
2
+ require 'fileutils'
13
3
  require 'imw/utils/error'
14
4
  require 'imw/utils/log'
15
- require 'imw/utils/config'
16
5
  require 'imw/utils/paths'
17
6
  require 'imw/utils/misc'
18
- require 'imw/utils/components'
19
- require 'imw/utils/extensions/core'
20
- require 'fileutils'
21
- require 'pathname'
22
-
7
+ require 'imw/utils/extensions'
23
8
 
24
- # puts "#{File.basename(__FILE__)}: Early economists thought they would measure the utility of an action in units of `utils'. Really." # at bottom
data/lib/imw.rb CHANGED
@@ -1,11 +1,6 @@
1
1
  require 'rubygems'
2
2
  require 'imw/boot'
3
3
  require 'imw/utils'
4
- require 'imw/dataset'
5
- require 'imw/repository'
6
- require 'imw/files'
7
- require 'imw/parsers'
8
- require 'imw/packagers'
9
4
 
10
5
  # The Infinite Monkeywrench (IMW) is a Ruby library for ripping,
11
6
  # extracting, parsing, munging, and packaging datasets. It allows you
@@ -13,19 +8,99 @@ require 'imw/packagers'
13
8
  # transformations of data as a network of dependencies (a la Make or
14
9
  # Rake).
15
10
  #
16
- # On first reading of IMW examine the classes within the IMW::Files
17
- # module, all transparently instantiated when using IMW.open (instead
18
- # of File.open). These classes do a lot of work to ensure that all
19
- # objects returned by IMW.open share methods (write, read, load, dump,
20
- # parse, compress, extract, &c.) while continuing to use existing
21
- # implementations of these concepts.
11
+ # IMW has a few central concepts: resources, datasets, workflows, and
12
+ # repositories.
22
13
  #
23
- # Another entrance point is the <tt>IMW::Dataset</tt> class. It
24
- # leverages Rake to craft workflows for transforming datasets. IMW
25
- # encourages you to organize your data transformations in a step-wise
26
- # process, managed with dependencies.
14
+ # Resources represent individual data resources like local files,
15
+ # websites, databases, &c. Resources are typically instantiated via
16
+ # IMW.open, with IMW doing the work of figuring out what to return
17
+ # based on the URI passed in.
27
18
  #
28
- # Utilities to help with one step in particular (ripping, parsing,
29
- # packaging, &c.) are in their own directories.
19
+ # Datasets represent collections of related data resources. An
20
+ # IMW::Dataset comes with a pre-defined (but customizable) workflow
21
+ # that takes data resources through several steps: rip, parse, munge,
22
+ # and package. The workflow leverages Rake and so the various tasks
23
+ # that are necessary to process the data till it is nice and pretty
24
+ # can all be linked with dependencies.
25
+ #
26
+ # Repositories are collections of datasets and it is on these
27
+ # collections that the +imw+ command line tool operates.
30
28
  module IMW
29
+ autoload :Resource, 'imw/resource'
30
+ autoload :Resources, 'imw/resources'
31
+ autoload :Repository, 'imw/repository'
32
+ autoload :Dataset, 'imw/dataset'
33
+ autoload :Transforms, 'imw/transforms'
34
+ autoload :Parsers, 'imw/parsers'
35
+
36
+ # Open a resource at the given +uri+. The resource will
37
+ # automatically be extended by modules which make sense given the
38
+ # +uri+.
39
+ #
40
+ # See the documentation for IMW::Resource and the various modules
41
+ # within IMW::Resources for more information and options.
42
+ #
43
+ # Passing in an IMW::Resource will simply return it.
44
+ #
45
+ # @param [String, Addressable::URI, IMW::Resource] obj the URI to open
46
+ # @return [IMW::Resource] the resulting resource, properly extended for the given URI
47
+ def self.open obj, options={}
48
+ return obj if obj.is_a?(IMW::Resource)
49
+ IMW::Resource.new(obj, options)
50
+ end
51
+
52
+ # Works the same way as IMW.open except opens the resource for
53
+ # writing.
54
+ #
55
+ # @param [String, Addressable::URI] uri the URI to open
56
+ # @return [IMW::Resource] the resulting resource, properly extended for the given URI and opened for writing.
57
+ def self.open! uri, options={}
58
+ IMW::Resource.new(uri, options.merge(:mode => 'w'))
59
+ end
60
+
61
+ # The default repository in which to place datasets. See the
62
+ # documentation for IMW::Repository for more information on how
63
+ # datasets and repositories fit together.
64
+ #
65
+ # @return [IMW::Repository] the default IMW repository
66
+ def self.repository
67
+ @@repository ||= IMW::Repository.new
68
+ end
69
+
70
+ # Create a dataset and put it in the default IMW repository. Also
71
+ # yields the dataset so you can define its workflow
72
+ #
73
+ # IMW.dataset :my_dataset do
74
+ #
75
+ # # Define some paths we're going to use
76
+ # add_path :raw_data, :ripd, 'raw_data.csv'
77
+ # add_path :fixd_data, :fixd, 'fixed_data.csv'
78
+ #
79
+ # # Copy a file from a website to this dataset's +ripd+ directory.
80
+ # rip do
81
+ # IMW.open('http://mysite.com/data_archives/2010/03/03.csv').cp(path_to(:raw_data))
82
+ # end
83
+ #
84
+ # # Filter the raw data to those values which match some criterion defined by <tt>accept?</tt>
85
+ # munge do
86
+ # IMW.open(path_to(:raw_data)).map do |row|
87
+ # row if accept?(row)
88
+ # end.compact.dump(path_to(:fixd_data))
89
+ # end
90
+ #
91
+ # # Compress this new data
92
+ # package do
93
+ # IMW.open(path_to(:fixd_data)).compress.mv(path_to(:pkgd))
94
+ # end
95
+ # end
96
+ #
97
+ # @param [Symbol, String] handle the handle to identify this dataset with
98
+ # @param [Hash] options a hash of options (see IMW::Dataset)
99
+ # @return [IMW::Dataset] the new dataset
100
+ def self.dataset handle, options={}, &block
101
+ d = IMW::Dataset.new(handle, options)
102
+ d.instance_eval(&block) if block_given?
103
+ d
104
+ end
105
+
31
106
  end
data/spec/data/sample.csv CHANGED
@@ -81,7 +81,7 @@ ID,Name,Genus,Species
81
81
  080,Tonkean Macaque,Macaca,tonkeana
82
82
  081,Heck's Macaque,Macaca,hecki
83
83
  082,Gorontalo Macaque,Macaca,nigrescens
84
- 083,Celebes Crested Macaque or Black "Ape",Macaca,nigra
84
+ 083,Celebes Crested Macaque or Black Ape,Macaca,nigra
85
85
  084,Crab-eating Macaque or Long-tailed Macaque or Kera,Macaca,fascicularis
86
86
  085,Stump-tailed Macaque or Bear Macaque,Macaca,arctoides
87
87
  086,Rhesus Macaque,Macaca,mulatta
@@ -0,0 +1 @@
1
+ {"monkeys":[{"monkey":{"name":"Gray-bellied Night Monkey","id":1,"genus":"Aotus","species":"lemurinus"}},{"monkey":{"name":"Panamanian Night Monkey","id":2,"genus":"Aotus","species":"zonalis"}},{"monkey":{"name":"Hern\u00e1ndez-Camacho's Night Monkey","id":3,"genus":"Aotus","species":"jorgehernandezi"}},{"monkey":{"name":"Gray-handed Night Monkey","id":4,"genus":"Aotus","species":"griseimembra"}},{"monkey":{"name":"Hershkovitz's Night Monkey","id":5,"genus":"Aotus","species":"hershkovitzi"}},{"monkey":{"name":"Brumback's Night Monkey","id":6,"genus":"Aotus","species":"brumbacki"}},{"monkey":{"name":"Three-striped Night Monkey","id":7,"genus":"Aotus","species":"trivirgatus"}},{"monkey":{"name":"Spix's Night Monkey","id":"008","genus":"Aotus","species":"vociferans"}},{"monkey":{"name":"Malaysian Lar Gibbon","id":"009","genus":"Hylobates","species":"lar lar"}},{"monkey":{"name":"Carpenter's Lar Gibbon","id":8,"genus":"Hylobates","species":"lar carpenteri"}},{"monkey":{"name":"Central Lar Gibbon","id":9,"genus":"Hylobates","species":"lar entelloides"}},{"monkey":{"name":"Sumatran Lar Gibbon","id":10,"genus":"Hylobates","species":"lar vestitus"}},{"monkey":{"name":"Yunnan Lar Gibbon","id":11,"genus":"Hylobates","species":"lar yunnanensis"}},{"monkey":{"name":"Mountain Agile Gibbon","id":12,"genus":"Hylobates","species":"agilis agilis"}},{"monkey":{"name":"Bornean White-bearded Gibbon","id":13,"genus":"Hylobates","species":"agilis albibarbis"}},{"monkey":{"name":"Lowland Agile Gibbon","id":14,"genus":"Hylobates","species":"agilis unko"}},{"monkey":{"name":"M\u00fcller's Gray Gibbon","id":15,"genus":"Hylobates","species":"muelleri muelleri"}},{"monkey":{"name":"Abbott's Gray Gibbon","id":"018","genus":"Hylobates","species":"muelleri abbotti"}},{"monkey":{"name":"Northern Gray Gibbon","id":"019","genus":"Hylobates","species":"muelleri funereus"}},{"monkey":{"name":"Black Tamarin","id":16,"genus":"Saguinas","species":"niger"}},{"monkey":{"name":"Black-mantled 
Tamarin","id":17,"genus":"Saguinas","species":"nigricollis"}},{"monkey":{"name":"Brown-mantled Tamarin","id":18,"genus":"Saguinas","species":"fuscicollis"}},{"monkey":{"name":"Cottontop Tamarin or Pinch\u00e9 Tamarin","id":19,"genus":"Saguinas","species":"oedipus"}},{"monkey":{"name":"Emperor Tamarin","id":20,"genus":"Saguinas","species":"imperator"}},{"monkey":{"name":"Geoffroy's Tamarin","id":21,"genus":"Saguinas","species":"geoffroyi"}},{"monkey":{"name":"Golden-mantled Tamarin","id":22,"genus":"Saguinas","species":"tripartitus"}},{"monkey":{"name":"Graells's Tamarin","id":23,"genus":"Saguinas","species":"graellsi"}},{"monkey":{"name":"Martins's Tamarin","id":"028","genus":"Saguinas","species":"martinsi"}},{"monkey":{"name":"Mottle-faced Tamarin","id":"029","genus":"Saguinas","species":"inustus"}},{"monkey":{"name":"Moustached Tamarin","id":24,"genus":"Saguinas","species":"mystax"}},{"monkey":{"name":"Pied Tamarin","id":25,"genus":"Saguinas","species":"bicolor"}},{"monkey":{"name":"Red-capped Tamarin","id":26,"genus":"Saguinas","species":"pileatus"}},{"monkey":{"name":"Red-handed Tamarin","id":27,"genus":"Saguinas","species":"midas"}},{"monkey":{"name":"White-footed Tamarin","id":28,"genus":"Saguinas","species":"leucopus"}},{"monkey":{"name":"White-lipped Tamarin","id":29,"genus":"Saguinas","species":"labiatus"}},{"monkey":{"name":"White-mantled Tamarin","id":30,"genus":"Saguinas","species":"melanoleucus"}},{"monkey":{"name":"Allen's Swamp Monkey","id":31,"genus":"Allenopithecus","species":"nigroviridis"}},{"monkey":{"name":"Angolan Talapoin","id":"038","genus":"Miopithecus","species":"talapoin"}},{"monkey":{"name":"Gabon Talapoin","id":"039","genus":"Miopithecus","species":"ogouensis"}},{"monkey":{"name":"Patas Monkey","id":32,"genus":"Erythrocebus","species":"patas"}},{"monkey":{"name":"Green Monkey","id":33,"genus":"Chlorocebus","species":"sabaeus"}},{"monkey":{"name":"Grivet","id":34,"genus":"Chlorocebus","species":"aethiops"}},{"monkey":{"name":"Bale 
Mountains Vervet","id":35,"genus":"Chlorocebus","species":"djamdjamensis"}},{"monkey":{"name":"Tantalus Monkey","id":36,"genus":"Chlorocebus","species":"tantalus"}},{"monkey":{"name":"Vervet Monkey","id":37,"genus":"Chlorocebus","species":"pygerythrus"}},{"monkey":{"name":"Malbrouck","id":38,"genus":"Chlorocebus","species":"cynosuros"}},{"monkey":{"name":"Dryas Monkey or Salongo Monkey","id":39,"genus":"Cercopithecus","species":"dryas"}},{"monkey":{"name":"Diana Monkey","id":"048","genus":"Cercopithecus","species":"diana"}},{"monkey":{"name":"Roloway Monkey","id":"049","genus":"Cercopithecus","species":"roloway"}},{"monkey":{"name":"Greater Spot-nosed Monkey","id":40,"genus":"Cercopithecus","species":"nictitans"}},{"monkey":{"name":"Blue Monkey","id":41,"genus":"Cercopithecus","species":"mitis"}},{"monkey":{"name":"Silver Monkey","id":42,"genus":"Cercopithecus","species":"doggetti"}},{"monkey":{"name":"Golden Monkey","id":43,"genus":"Cercopithecus","species":"kandti"}},{"monkey":{"name":"Sykes's Monkey","id":44,"genus":"Cercopithecus","species":"albogularis"}},{"monkey":{"name":"Mona Monkey","id":45,"genus":"Cercopithecus","species":"mona"}},{"monkey":{"name":"Campbell's Mona Monkey","id":46,"genus":"Cercopithecus","species":"campbelli"}},{"monkey":{"name":"Lowe's Mona Monkey","id":47,"genus":"Cercopithecus","species":"lowei"}},{"monkey":{"name":"Crested Mona Monkey","id":"058","genus":"Cercopithecus","species":"pogonias"}},{"monkey":{"name":"Wolf's Mona Monkey","id":"059","genus":"Cercopithecus","species":"wolfi"}},{"monkey":{"name":"Dent's Mona Monkey","id":48,"genus":"Cercopithecus","species":"denti"}},{"monkey":{"name":"Lesser Spot-nosed Monkey","id":49,"genus":"Cercopithecus","species":"petaurista"}},{"monkey":{"name":"White-throated Guenon","id":50,"genus":"Cercopithecus","species":"erythrogaster"}},{"monkey":{"name":"Sclater's Guenon","id":51,"genus":"Cercopithecus","species":"sclateri"}},{"monkey":{"name":"Red-eared 
Guenon","id":52,"genus":"Cercopithecus","species":"erythrotis"}},{"monkey":{"name":"Moustached Guenon","id":53,"genus":"Cercopithecus","species":"cephus"}},{"monkey":{"name":"Red-tailed Monkey","id":54,"genus":"Cercopithecus","species":"ascanius"}},{"monkey":{"name":"L'Hoest's Monkey","id":55,"genus":"Cercopithecus","species":"lhoesti"}},{"monkey":{"name":"Preuss's Monkey","id":"068","genus":"Cercopithecus","species":"preussi"}},{"monkey":{"name":"Sun-tailed Monkey","id":"069","genus":"Cercopithecus","species":"solatus"}},{"monkey":{"name":"Hamlyn's Monkey","id":56,"genus":"Cercopithecus","species":"hamlyni"}},{"monkey":{"name":"De Brazza's Monkey","id":57,"genus":"Cercopithecus","species":"neglectus"}},{"monkey":{"name":"Barbary Macaque","id":58,"genus":"Macaca","species":"sylvanus"}},{"monkey":{"name":"Lion-tailed Macaque","id":59,"genus":"Macaca","species":"silenus"}},{"monkey":{"name":"Southern Pig-tailed Macaque or Beruk","id":60,"genus":"Macaca","species":"nemestrina"}},{"monkey":{"name":"Northern Pig-tailed Macaque","id":61,"genus":"Macaca","species":"leonina"}},{"monkey":{"name":"Pagai Island Macaque or Bokkoi","id":62,"genus":"Macaca","species":"pagensis"}},{"monkey":{"name":"Siberut Macaque","id":63,"genus":"Macaca","species":"siberu"}},{"monkey":{"name":"Moor Macaque","id":"078","genus":"Macaca","species":"maura"}},{"monkey":{"name":"Booted Macaque","id":"079","genus":"Macaca","species":"ochreata"}},{"monkey":{"name":"Tonkean Macaque","id":"080","genus":"Macaca","species":"tonkeana"}},{"monkey":{"name":"Heck's Macaque","id":"081","genus":"Macaca","species":"hecki"}},{"monkey":{"name":"Gorontalo Macaque","id":"082","genus":"Macaca","species":"nigrescens"}},{"monkey":{"name":"Celebes Crested Macaque or Black Ape","id":"083","genus":"Macaca","species":"nigra"}},{"monkey":{"name":"Crab-eating Macaque or Long-tailed Macaque or Kera","id":"084","genus":"Macaca","species":"fascicularis"}},{"monkey":{"name":"Stump-tailed Macaque or Bear 
Macaque","id":"085","genus":"Macaca","species":"arctoides"}},{"monkey":{"name":"Rhesus Macaque","id":"086","genus":"Macaca","species":"mulatta"}},{"monkey":{"name":"Formosan Rock Macaque","id":"087","genus":"Macaca","species":"cyclopis"}},{"monkey":{"name":"Japanese Macaque","id":"088","genus":"Macaca","species":"fuscata"}},{"monkey":{"name":"Toque Macaque","id":"089","genus":"Macaca","species":"sinica"}},{"monkey":{"name":"Bonnet Macaque","id":"090","genus":"Macaca","species":"radiata"}},{"monkey":{"name":"Assam Macaque","id":"091","genus":"Macaca","species":"assamensis"}},{"monkey":{"name":"Tibetan Macaque or Milne-Edwards' Macaque","id":"092","genus":"Macaca","species":"thibetana"}},{"monkey":{"name":"Arunachal Macaque or Munzala","id":"093","genus":"Macaca","species":"munzala"}},{"monkey":{"name":"Grey-cheeked Mangabey","id":"094","genus":"Lophocebus","species":"albigena"}},{"monkey":{"name":"Black Crested Mangabey","id":"095","genus":"Lophocebus","species":"aterrimus"}},{"monkey":{"name":"Opdenbosch's Mangabey","id":"096","genus":"Lophocebus","species":"opdenboschi"}},{"monkey":{"name":"Uganda Mangabey","id":"097","genus":"Lophocebus","species":"ugandae"}},{"monkey":{"name":"Johnston's Mangabey","id":"098","genus":"Lophocebus","species":"johnstoni"}},{"monkey":{"name":"Osman Hill's Mangabey","id":"099","genus":"Lophocebus","species":"osmani"}},{"monkey":{"name":"Kipunji","id":100,"genus":"Rungwecebus","species":"kipunji"}},{"monkey":{"name":"Hamadryas Baboon","id":101,"genus":"Papio","species":"hamadryas"}},{"monkey":{"name":"Guinea Baboon","id":102,"genus":"Papio","species":"papio"}},{"monkey":{"name":"Olive Baboon","id":103,"genus":"Papio","species":"anubis"}},{"monkey":{"name":"Yellow Baboon","id":104,"genus":"Papio","species":"cynocephalus"}},{"monkey":{"name":"Chacma Baboon","id":105,"genus":"Papio","species":"ursinus"}},{"monkey":{"name":"Gelada","id":106,"genus":"Theropithecus","species":"gelada"}},{"monkey":{"name":"Sooty 
Mangabey","id":107,"genus":"Cercocebus","species":"atys"}},{"monkey":{"name":"Collared Mangabey","id":108,"genus":"Cercocebus","species":"torquatus"}},{"monkey":{"name":"Agile Mangabey","id":109,"genus":"Cercocebus","species":"agilis"}},{"monkey":{"name":"Golden-bellied Mangabey","id":110,"genus":"Cercocebus","species":"chrysogaster"}},{"monkey":{"name":"Tana River Mangabey","id":111,"genus":"Cercocebus","species":"galeritus"}},{"monkey":{"name":"Sanje Mangabey","id":112,"genus":"Cercocebus","species":"sanjei"}},{"monkey":{"name":"Mandrill","id":113,"genus":"Mandrillus","species":"sphinx"}},{"monkey":{"name":"Drill","id":114,"genus":"Mandrillus","species":"leucophaeus"}},{"monkey":{"name":"Black Colobus","id":115,"genus":"Colobus","species":"satanas"}},{"monkey":{"name":"Angola Colobus","id":116,"genus":"Colobus","species":"angolensis"}},{"monkey":{"name":"King Colobus","id":117,"genus":"Colobus","species":"polykomos"}},{"monkey":{"name":"Ursine Colobus","id":118,"genus":"Colobus","species":"vellerosus"}},{"monkey":{"name":"Mantled Guereza","id":119,"genus":"Colobus","species":"guereza"}},{"monkey":{"name":"Western Red Colobus","id":120,"genus":"Piliocolobus","species":"badius"}},{"monkey":{"name":"Pennant's Colobus","id":121,"genus":"Piliocolobus","species":"pennantii"}},{"monkey":{"name":"Preuss's Red Colobus","id":122,"genus":"Piliocolobus","species":"preussi"}},{"monkey":{"name":"Thollon's Red Colobus","id":123,"genus":"Piliocolobus","species":"tholloni"}},{"monkey":{"name":"Central African Red Colobus","id":124,"genus":"Piliocolobus","species":"foai"}},{"monkey":{"name":"Ugandan Red Colobus","id":125,"genus":"Piliocolobus","species":"tephrosceles"}},{"monkey":{"name":"Uzungwa Red Colobus","id":126,"genus":"Piliocolobus","species":"gordonorum"}},{"monkey":{"name":"Zanzibar Red Colobus","id":127,"genus":"Piliocolobus","species":"kirkii"}},{"monkey":{"name":"Tana River Red 
Colobus","id":128,"genus":"Piliocolobus","species":"rufomitratus"}},{"monkey":{"name":"Olive Colobus","id":129,"genus":"Procolobus","species":"verus"}},{"monkey":{"name":"Maroon Leaf Monkey","id":130,"genus":"Presbytis","species":"rubicunda"}}]}
data/spec/data/sample.tsv CHANGED
@@ -81,7 +81,7 @@ ID Name Genus Species
81
81
  080 Tonkean Macaque Macaca tonkeana
82
82
  081 Heck's Macaque Macaca hecki
83
83
  082 Gorontalo Macaque Macaca nigrescens
84
- 083 Celebes Crested Macaque or Black "Ape" Macaca nigra
84
+ 083 Celebes Crested Macaque or Black Ape Macaca nigra
85
85
  084 Crab-eating Macaque or Long-tailed Macaque or Kera Macaca fascicularis
86
86
  085 Stump-tailed Macaque or Bear Macaque Macaca arctoides
87
87
  086 Rhesus Macaque Macaca mulatta
data/spec/data/sample.txt CHANGED
@@ -81,7 +81,7 @@ ID,Name,Genus,Species
81
81
  080,Tonkean Macaque,Macaca,tonkeana
82
82
  081,Heck's Macaque,Macaca,hecki
83
83
  082,Gorontalo Macaque,Macaca,nigrescens
84
- 083,Celebes Crested Macaque or Black "Ape",Macaca,nigra
84
+ 083,Celebes Crested Macaque or Black Ape,Macaca,nigra
85
85
  084,Crab-eating Macaque or Long-tailed Macaque or Kera,Macaca,fascicularis
86
86
  085,Stump-tailed Macaque or Bear Macaque,Macaca,arctoides
87
87
  086,Rhesus Macaque,Macaca,mulatta
data/spec/data/sample.xml CHANGED
@@ -412,7 +412,7 @@
412
412
  <monkey id="083">
413
413
  <genus>Macaca</genus>
414
414
  <species>nigra</species>
415
- <name>Celebes Crested Macaque or Black "Ape"</name>
415
+ <name>Celebes Crested Macaque or Black Ape</name>
416
416
  </monkey>
417
417
  <monkey id="084">
418
418
  <genus>Macaca</genus>
@@ -412,7 +412,7 @@ monkeys:
412
412
  species: nigrescens
413
413
  - monkey:
414
414
  id: 083
415
- name: Celebes Crested Macaque or Black "Ape"
415
+ name: Celebes Crested Macaque or Black Ape
416
416
  genus: Macaca
417
417
  species: nigra
418
418
  - monkey:
@@ -0,0 +1,32 @@
1
+ require File.dirname(__FILE__) + "/../../spec_helper"
2
+ require File.dirname(__FILE__) + "/../utils/shared_paths_spec"
3
+
4
+ describe IMW::Dataset do
5
+
6
+ describe 'setting default paths' do
7
+
8
+ before do
9
+ @dataset = IMW::Dataset.new(:testing, :root => IMWTest::TMP_DIR)
10
+ end
11
+
12
+ it "should set its root path to the value given" do
13
+ @dataset.path_to(:root).should == IMWTest::TMP_DIR
14
+ end
15
+
16
+ it "should set paths for each workflow dir" do
17
+ @dataset.workflow_dirs.each do |dir|
18
+ @dataset.path_to(dir).should == File.join(IMWTest::TMP_DIR, dir.to_s)
19
+ end
20
+ end
21
+
22
+ before do
23
+ @path_manager = @dataset
24
+ end
25
+ it_should_behave_like "an object that manages paths"
26
+
27
+ end
28
+ end
29
+
30
+
31
+
32
+
@@ -0,0 +1,41 @@
1
+ require File.dirname(__FILE__) + "/../../spec_helper"
2
+ require 'imw/dataset/workflow'
3
+ describe IMW::Workflow do
4
+
5
+ before do
6
+ @dataset = IMW::Dataset.new :testing
7
+ end
8
+
9
+ it "should dynamically define methods for each workflow step" do
10
+ @dataset.workflow_steps.each do |step|
11
+ @dataset.respond_to?(step).should be_true
12
+ end
13
+ end
14
+
15
+ describe "initializing workflow" do
16
+ it "should not make any directories if no tasks are invoked" do
17
+ @dataset.path_to(:root).should_not contain(*@dataset.workflow_dirs.map(&:to_s))
18
+ end
19
+
20
+ it "should only make directories once a task is invoked" do
21
+ @dataset[:initialize].invoke
22
+ @dataset.path_to(:root).should contain(*@dataset.workflow_dirs.map(&:to_s))
23
+ end
24
+ end
25
+
26
+ describe "cleaning workflow directories" do
27
+ it "should clean without error even if there's nothing to clean" do
28
+ @dataset[:clean].invoke
29
+ @dataset.path_to(:root).should_not contain(*@dataset.workflow_dirs.map(&:to_s))
30
+ end
31
+
32
+ it "should remove workflow directories when invoked" do
33
+ @dataset[:initialize].invoke
34
+ IMWTest::Random.file(@dataset.path_to(:ripd, 'foobar.txt')) # put a file in
35
+ @dataset[:clean].invoke
36
+ @dataset.path_to(:root).should_not contain(*@dataset.workflow_dirs.map(&:to_s))
37
+ end
38
+ end
39
+
40
+ end
41
+