imw 0.2.18 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172) hide show
  1. data/Gemfile +7 -26
  2. data/Gemfile.lock +13 -38
  3. data/{LICENSE → LICENSE.txt} +1 -1
  4. data/README.textile +35 -0
  5. data/Rakefile +45 -22
  6. data/VERSION +1 -1
  7. data/examples/foo.rb +19 -0
  8. data/examples/html_selector.rb +22 -0
  9. data/examples/nes_game_list.csv +625 -0
  10. data/examples/nes_gamespot.csv +1371 -0
  11. data/examples/nes_nintendo.csv +624 -0
  12. data/examples/nes_unlicensed.csv +89 -0
  13. data/examples/nes_wikipedia.csv +710 -0
  14. data/examples/nibbler_test.rb +24 -0
  15. data/examples/script.rb +19 -0
  16. data/lib/imw.rb +28 -140
  17. data/lib/imw/error.rb +9 -0
  18. data/lib/imw/recordizer.rb +8 -0
  19. data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
  20. data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
  21. data/lib/imw/resource.rb +3 -119
  22. data/lib/imw/serializer.rb +7 -0
  23. data/lib/imw/serializer/json_serializer.rb +17 -0
  24. data/lib/imw/uri.rb +41 -0
  25. data/spec/resource_spec.rb +78 -0
  26. data/spec/uri_spec.rb +55 -0
  27. metadata +81 -232
  28. data/README.rdoc +0 -371
  29. data/bin/imw +0 -5
  30. data/bin/tsv_to_json.rb +0 -29
  31. data/etc/imwrc.rb +0 -26
  32. data/examples/dataset.rb +0 -12
  33. data/examples/metadata.yml +0 -10
  34. data/lib/imw/archives.rb +0 -120
  35. data/lib/imw/archives/rar.rb +0 -19
  36. data/lib/imw/archives/tar.rb +0 -19
  37. data/lib/imw/archives/tarbz2.rb +0 -73
  38. data/lib/imw/archives/targz.rb +0 -73
  39. data/lib/imw/archives/zip.rb +0 -51
  40. data/lib/imw/boot.rb +0 -87
  41. data/lib/imw/compressed_files.rb +0 -94
  42. data/lib/imw/compressed_files/bz2.rb +0 -16
  43. data/lib/imw/compressed_files/compressible.rb +0 -75
  44. data/lib/imw/compressed_files/gz.rb +0 -16
  45. data/lib/imw/dataset.rb +0 -125
  46. data/lib/imw/dataset/paths.rb +0 -29
  47. data/lib/imw/dataset/workflow.rb +0 -195
  48. data/lib/imw/formats.rb +0 -33
  49. data/lib/imw/formats/delimited.rb +0 -170
  50. data/lib/imw/formats/excel.rb +0 -100
  51. data/lib/imw/formats/json.rb +0 -41
  52. data/lib/imw/formats/pdf.rb +0 -71
  53. data/lib/imw/formats/sgml.rb +0 -69
  54. data/lib/imw/formats/yaml.rb +0 -41
  55. data/lib/imw/metadata.rb +0 -83
  56. data/lib/imw/metadata/contains_metadata.rb +0 -54
  57. data/lib/imw/metadata/dsl.rb +0 -111
  58. data/lib/imw/metadata/field.rb +0 -37
  59. data/lib/imw/metadata/has_metadata.rb +0 -98
  60. data/lib/imw/metadata/has_summary.rb +0 -57
  61. data/lib/imw/metadata/schema.rb +0 -17
  62. data/lib/imw/parsers.rb +0 -8
  63. data/lib/imw/parsers/flat.rb +0 -44
  64. data/lib/imw/parsers/html_parser.rb +0 -387
  65. data/lib/imw/parsers/html_parser/matchers.rb +0 -289
  66. data/lib/imw/parsers/line_parser.rb +0 -87
  67. data/lib/imw/parsers/regexp_parser.rb +0 -72
  68. data/lib/imw/repository.rb +0 -12
  69. data/lib/imw/runner.rb +0 -118
  70. data/lib/imw/schemes.rb +0 -23
  71. data/lib/imw/schemes/ftp.rb +0 -142
  72. data/lib/imw/schemes/hdfs.rb +0 -251
  73. data/lib/imw/schemes/http.rb +0 -165
  74. data/lib/imw/schemes/local.rb +0 -409
  75. data/lib/imw/schemes/remote.rb +0 -119
  76. data/lib/imw/schemes/s3.rb +0 -143
  77. data/lib/imw/schemes/sql.rb +0 -129
  78. data/lib/imw/tools.rb +0 -12
  79. data/lib/imw/tools/aggregator.rb +0 -148
  80. data/lib/imw/tools/archiver.rb +0 -220
  81. data/lib/imw/tools/downloader.rb +0 -63
  82. data/lib/imw/tools/extension_analyzer.rb +0 -114
  83. data/lib/imw/tools/summarizer.rb +0 -83
  84. data/lib/imw/tools/transferer.rb +0 -167
  85. data/lib/imw/utils.rb +0 -74
  86. data/lib/imw/utils/dynamically_extendable.rb +0 -137
  87. data/lib/imw/utils/error.rb +0 -59
  88. data/lib/imw/utils/extensions/hpricot.rb +0 -34
  89. data/lib/imw/utils/has_uri.rb +0 -131
  90. data/lib/imw/utils/log.rb +0 -92
  91. data/lib/imw/utils/misc.rb +0 -57
  92. data/lib/imw/utils/paths.rb +0 -146
  93. data/lib/imw/utils/uri.rb +0 -59
  94. data/lib/imw/utils/uuid.rb +0 -33
  95. data/lib/imw/utils/validate.rb +0 -38
  96. data/lib/imw/utils/version.rb +0 -11
  97. data/spec/data/formats/delimited/sample.csv +0 -131
  98. data/spec/data/formats/delimited/sample.tsv +0 -131
  99. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
  100. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
  101. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
  102. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
  103. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
  104. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
  105. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
  106. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
  107. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
  108. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
  109. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
  110. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
  111. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
  112. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
  113. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
  114. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
  115. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
  116. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
  117. data/spec/data/formats/excel/sample.xls +0 -0
  118. data/spec/data/formats/json/sample.json +0 -1
  119. data/spec/data/formats/none/sample +0 -650
  120. data/spec/data/formats/sgml/sample.xml +0 -617
  121. data/spec/data/formats/text/sample.txt +0 -650
  122. data/spec/data/formats/yaml/sample.yaml +0 -410
  123. data/spec/data/schema-tabular.yaml +0 -11
  124. data/spec/imw/archives/rar_spec.rb +0 -16
  125. data/spec/imw/archives/tar_spec.rb +0 -16
  126. data/spec/imw/archives/tarbz2_spec.rb +0 -24
  127. data/spec/imw/archives/targz_spec.rb +0 -21
  128. data/spec/imw/archives/zip_spec.rb +0 -16
  129. data/spec/imw/archives_spec.rb +0 -77
  130. data/spec/imw/compressed_files/bz2_spec.rb +0 -15
  131. data/spec/imw/compressed_files/compressible_spec.rb +0 -36
  132. data/spec/imw/compressed_files/gz_spec.rb +0 -15
  133. data/spec/imw/compressed_files_spec.rb +0 -47
  134. data/spec/imw/dataset/paths_spec.rb +0 -32
  135. data/spec/imw/dataset/workflow_spec.rb +0 -41
  136. data/spec/imw/formats/delimited_spec.rb +0 -44
  137. data/spec/imw/formats/excel_spec.rb +0 -55
  138. data/spec/imw/formats/json_spec.rb +0 -18
  139. data/spec/imw/formats/sgml_spec.rb +0 -24
  140. data/spec/imw/formats/yaml_spec.rb +0 -19
  141. data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
  142. data/spec/imw/metadata/field_spec.rb +0 -25
  143. data/spec/imw/metadata/has_metadata_spec.rb +0 -58
  144. data/spec/imw/metadata/has_summary_spec.rb +0 -32
  145. data/spec/imw/metadata/schema_spec.rb +0 -24
  146. data/spec/imw/metadata_spec.rb +0 -86
  147. data/spec/imw/parsers/line_parser_spec.rb +0 -96
  148. data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
  149. data/spec/imw/resource_spec.rb +0 -32
  150. data/spec/imw/schemes/hdfs_spec.rb +0 -67
  151. data/spec/imw/schemes/http_spec.rb +0 -19
  152. data/spec/imw/schemes/local_spec.rb +0 -165
  153. data/spec/imw/schemes/remote_spec.rb +0 -38
  154. data/spec/imw/schemes/s3_spec.rb +0 -31
  155. data/spec/imw/schemes/sql_spec.rb +0 -3
  156. data/spec/imw/tools/aggregator_spec.rb +0 -71
  157. data/spec/imw/tools/archiver_spec.rb +0 -120
  158. data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
  159. data/spec/imw/tools/summarizer_spec.rb +0 -8
  160. data/spec/imw/tools/transferer_spec.rb +0 -195
  161. data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
  162. data/spec/imw/utils/has_uri_spec.rb +0 -61
  163. data/spec/imw/utils/paths_spec.rb +0 -10
  164. data/spec/imw/utils/shared_paths_spec.rb +0 -29
  165. data/spec/imw_spec.rb +0 -14
  166. data/spec/rcov.opts +0 -1
  167. data/spec/spec_helper.rb +0 -31
  168. data/spec/support/custom_matchers.rb +0 -28
  169. data/spec/support/file_contents_matcher.rb +0 -30
  170. data/spec/support/paths_matcher.rb +0 -66
  171. data/spec/support/random.rb +0 -213
  172. data/spec/support/without_regard_to_order_matcher.rb +0 -41
@@ -0,0 +1,24 @@
1
# Example: scrape Wikipedia's list of NES games with the Nibbler DSL and
# print one row of the "Unlicensed games" table.
#
# Needs the third-party `nibbler` gem and network access.
require 'rubygems'
require 'nibbler'
require 'open-uri'

# One table row, reduced to the three cells this example reads.
class Game < Nibbler
  element 'td/i/a' => :name
  element 'td/a' => :publisher
  element 'td[2]' => :year
end

# The two game tables, each located as the first table following its
# section heading.
class Table < Nibbler
  elements "//h2/span[@id='Licensed_games']/following::table[1]/tr" => :licensed_games, :with => Game
  elements "//h2/span[@id='Unlicensed_games']/following::table[1]/tr" => :unlicensed_games, :with => Game
end

url = 'http://en.wikipedia.org/wiki/List_of_Nintendo_Entertainment_System_games'

# Go through URI#open instead of Kernel#open: open-uri stopped patching
# Kernel#open for URL strings in Ruby 3.0, while URI#open works on old and
# new interpreters alike. The block form also closes the handle once the
# page has been parsed.
tables = URI.parse(url).open { |page| Table.parse(page) }

# NOTE(review): index 1 presumably skips a header row -- confirm against the
# live page markup.
puts tables.unlicensed_games[1].inspect
@@ -0,0 +1,19 @@
1
#!/usr/bin/env ruby
# Scrapes the paginated GameSpot NES game listing and writes every table row
# to nes_gamespot.csv under the headers title / category / release.
#
# Needs the third-party `nokogiri` and `fastercsv` gems and network access.

require 'rubygems'
require 'open-uri'
require 'nokogiri'
require 'fastercsv'

url = "http://www.gamespot.com/games.html?platform=19&category=&type=games&mode=all&sort=title&sortdir=desc&page="
pages = (0..27)

FasterCSV.open("nes_gamespot.csv", "w", :write_headers => true, :headers => %w[ title category release ]) do |csv|
  pages.each do |page|
    # URI#open rather than Kernel#open: open-uri no longer patches
    # Kernel#open for URL strings on Ruby 3.0+. The block form closes the
    # response handle as soon as the document is parsed.
    doc = URI.parse(url + page.to_s).open { |io| Nokogiri::HTML(io) }

    doc.xpath("//tr").each do |node|
      # One cell per non-blank line of the row's text content.
      game = node.content.split("\n").map { |cell| cell.strip }.reject { |cell| cell.empty? }

      # Rows without any text would otherwise be written as blank CSV lines.
      next if game.empty?

      # The row containing "Release Date" is the table's own header row.
      csv << game unless game.include?("Release Date")
    end
  end
end
data/lib/imw.rb CHANGED
@@ -1,60 +1,16 @@
1
1
  require 'rubygems'
2
- require 'bundler/setup'
3
- require 'imw/boot'
4
2
  require 'imw/utils'
3
+ require 'imw/error'
4
+ require 'imw/uri'
5
5
 
6
- # The Infinite Monkeywrench (IMW) is a Ruby library for ripping,
7
- # extracting, parsing, munging, and packaging datasets. It allows you
8
- # to handle different data formats transparently as well as organize
9
- # transformations of data as a network of dependencies (a la Make or
10
- # Rake).
11
- #
12
- # IMW has a few central concepts: resources, metadata, datasets,
13
- # workflows, and repositories.
14
- #
15
- # Resources represent individual data resources like local files,
16
- # websites, databases, &c. An IMW::Resource is typically instantiated
17
- # via IMW.open, with IMW doing the work of figuring out what to return
18
- # based on the URI passed in.
19
- #
20
- # A Resource can have a schema which describes the fields in its data.
21
- # IMW::Metadata consists of classes which describe fields.
22
- #
23
- # Datasets represent collections of related data resources .. An
24
- # IMW::Dataset comes with a pre-defined (but customizable) workflow
25
- # that takes data resources through several steps: rip, parse, munge,
26
- # and package. The workflow leverages Rake and so the various tasks
27
- # that are necessary to process the data till it is nice and pretty
28
- # can all be linked with dependencies.
29
- #
30
- # Repositories are collections of datasets and it is on these
31
- # collections that the +imw+ command line tool operates.
32
6
  module IMW
7
+
8
+ autoload :Recordizer, 'imw/recordizer'
33
9
  autoload :Resource, 'imw/resource'
34
10
  autoload :Schemes, 'imw/schemes'
35
- autoload :Archives, 'imw/archives'
36
- autoload :CompressedFiles, 'imw/compressed_files'
37
- autoload :Formats, 'imw/formats'
38
- autoload :Tools, 'imw/tools'
11
+ autoload :Formats, 'imw/formats'
39
12
  autoload :Parsers, 'imw/parsers'
40
- autoload :Dataset, 'imw/dataset'
41
- autoload :Repository, 'imw/repository'
42
- autoload :Metadata, 'imw/metadata'
43
13
 
44
- # Open a resource at the given +uri+. The resource will
45
- # automatically be extended by modules which make sense given the
46
- # +uri+.
47
- #
48
- # See the documentation for IMW::Resource and the various modules
49
- # within IMW::Resources for more information and options.
50
- #
51
- # Passing in an IMW::Resource will simply return it.
52
- #
53
- # @param [String, Addressable::URI, IMW::Resource] obj the URI to open
54
- # @param [Hash] options
55
- # @option options [Array<String,Module>] as same as <tt>:use_modules</tt> in IMW::Resource.extend_instance!
56
- # @option options [Array<String,Module>] without same as <tt>:skip_modules</tt> in IMW::Resource.extend_instance!
57
- # @return [IMW::Resource] the resulting resource, property extended for the given URI
58
14
  def self.open obj, options={}, &block
59
15
  if obj.is_a?(IMW::Resource)
60
16
  resource = obj
@@ -63,103 +19,35 @@ module IMW
63
19
  options[:skip_modules] ||= (options[:without] || [])
64
20
  resource = IMW::Resource.new(obj, options)
65
21
  end
66
- if block_given?
67
- yield resource
68
- resource.close
69
- else
70
- resource
71
- end
72
22
  end
73
23
 
74
- # Open (and create if necessary) a directory at the given URI.
75
- #
76
- # Will automatically create directories recursively. Options will
77
- # be passed to IMW.open and interpreted appropriately. If a block
78
- # is passed, the directory will be created before the block is
79
- # yielded to.
80
- #
81
- # @param [String, IMW::Resource] uri
82
- # @param [Hash] options
83
- # @return [IMW::Resource]
84
- def self.dir! uri, options={}, &block
85
- if block_given?
86
- new_dir = open(uri, options.merge(:as => (options[:as] || []) + [Schemes::Local::LocalDirectory])) do |d|
87
- new_dir.create
88
- yield
24
+ class Resource
25
+
26
+ attr_reader :uri
27
+
28
+ def initialize(uri, mode='r')
29
+ raise FileModeError.new("'#{mode}' is not a valid access mode") unless valid_modes.include? mode
30
+ @uri = Uri.new(uri)
31
+ end
32
+
33
+ def self.open(uri, mode='r', &blk)
34
+ resource = Resource.new(uri, mode)
35
+ if block_given?
36
+ yield resource
37
+ else
38
+ return resource
89
39
  end
90
- else
91
- new_dir = open(uri, options.merge(:as => (options[:as] || []) + [Schemes::Local::LocalDirectory]))
92
- new_dir.create
93
40
  end
94
- new_dir
95
- end
96
-
97
- # Works the same way as IMW.open except opens the resource for
98
- # writing.
99
- #
100
- # @param [String, Addressable::URI] uri the URI to open
101
- # @return [IMW::Resource] the resultng resource, properly extended for the given URI and opened for writing.
102
- def self.open! uri, options={}, &block
103
- open(uri, options.merge(:mode => 'w'), &block)
104
- end
105
41
 
106
- # The default repository in which to place datasets. See the
107
- # documentation for IMW::Repository for more information on how
108
- # datasets and repositories fit together.
109
- #
110
- # @return [IMW::Repository] the default IMW repository
111
- def self.repository
112
- @@repository ||= IMW::Repository.new
113
- end
42
+ def self.exists? resource
43
+ true
44
+ end
114
45
 
115
- # Create a dataset and put it in the default IMW repository.
116
- #
117
- # Evaluates the given block in the context of the new dataset. This
118
- # allows you to define tasks, add paths, and use defined metadata in
119
- # an elegant way.
120
- #
121
- # IMW.dataset :my_dataset do
122
- #
123
- # # Define some paths we're going to use
124
- # add_path :original, :rawd, 'original.csv'
125
- # add_path :filtered, :fixd, 'filtered.csv'
126
- # add_path :package, :pkgd, 'filtered.tar.bz2'
127
- #
128
- # # Copy a CSV filefrom a website to this machine.
129
- # rip do
130
- # open('http://mysite.com/data_archives/2010/03/03.csv').cp(path_to(:original))
131
- # end
132
- #
133
- # # Filter the original CSV data by the
134
- # # <tt>meets_some_condition?</tt> method we define elsewhere...
135
- # munge do
136
- # open!(path_to(:filtered)) do |filtered|
137
- # open(path_to(:original)).each do |row|
138
- # filtered << row if meets_some_condition?(row)
139
- # end
140
- # end
141
- #
142
- # # Compress the filtered data to an archive.
143
- # package do
144
- # open(path_to(:filtered)).compress.mv(path_to(:package))
145
- # end
146
- # end
147
- #
148
- # See the <tt>/examples</tt> directory of the IMW distribution for
149
- # more examples.
150
- #
151
- # @param [Symbol, String] handle the handle to identify this dataset with
152
- # @param [Hash] options a hash of options (see IMW::Dataset)
153
- # @return [IMW::Dataset] the new dataset
154
- def self.dataset handle, options={}, &block
155
- d = IMW::Dataset.new(handle, options.merge(:repository => IMW.repository))
156
- d.instance_eval(&block) if block_given?
157
- d
158
- end
46
+ private
47
+ def valid_modes
48
+ %w[ r w a ]
49
+ end
159
50
 
160
- end
51
+ end
161
52
 
162
- # Works just like IMW.dataset but defined at a top-level scope.
163
- def dataset handle, options={}, &block
164
- IMW.dataset(handle, options, &block)
165
53
  end
@@ -0,0 +1,9 @@
1
module IMW
  # Root of the IMW exception hierarchy; rescue this to catch any error the
  # library defines.
  class Error < StandardError; end

  # Signals a problem with a resource access mode.
  class FileModeError < Error; end

  # NOTE(review): presumably raised when data does not match an expected
  # format -- no raiser is visible here, confirm against callers.
  class InvalidFormatError < Error; end
end
@@ -0,0 +1,8 @@
1
module IMW
  # Namespace for the recordizer classes. Each one is registered for
  # autoloading, so its file is only required the first time the constant
  # is referenced.
  module Recordizer
    {
      :StringSliceRecordizer  => 'imw/recordizer/string_slice_recordizer',
      :HTMLSelectorRecordizer => 'imw/recordizer/html_selector_recordizer'
    }.each { |const_name, path| autoload const_name, path }
  end
end
@@ -0,0 +1,86 @@
1
module IMW
  module Recordizer
    # Declarative extractor: subclasses declare rules that map a selector to
    # a named property, and +recordize+ applies every rule to a document and
    # returns the collected values as a Hash.
    #
    #   class Page < IMW::Recordizer::HTMLSelectorRecordizer
    #     element  'h1' => :title
    #     elements 'li' => :items
    #   end
    #
    #   Page.recordize(doc) #=> { :title => "...", :items => ["...", ...] }
    #
    # The document (and every node handed to a nested recordizer) must
    # respond to +search(selector)+ and +at(selector)+.
    class HTMLSelectorRecordizer

      # Declares a single-valued rule and defines an accessor for it.
      #
      # Accepts either <tt>'selector' => :property</tt> (optionally with
      # <tt>:with => delegate</tt>) or a bare <tt>:name</tt>, which is used
      # both as selector and as property. A block defines an anonymous
      # nested recordizer that is used as the delegate.
      #
      # Returns the property name.
      def self.element(*args, &block)
        selector, name, delegate = parse_rule_declaration(*args, &block)
        rules[name] = [selector, delegate]
        attr_accessor name
        name
      end

      # Same as +element+, but the rule collects every node matching the
      # selector into an Array instead of only the first match.
      def self.elements(*args, &block)
        name = element(*args, &block)
        rules[name] << true # third slot marks the rule as plural
      end

      # Applies the declared rules to +doc+ using a fresh instance and
      # returns the resulting Hash.
      def self.recordize(doc)
        new.recordize(doc)
      end

      # Rules declared on this class: name => [selector, delegate, plural].
      #
      # Deliberately public: instances read it through +self.class.rules+
      # and +inherited+ reads it on the subclass. (A bare +private+ keyword
      # has no effect on <tt>def self.</tt> methods anyway.)
      def self.rules
        @rules ||= {}
      end

      # Seeds a new subclass with the rules declared so far on its parent.
      def self.inherited(subclass)
        super
        subclass.rules.update(rules)
      end

      # Normalises a rule declaration into [selector, property, delegate].
      #
      #   { 'selector' => :property, :with => delegate }
      #     #=> ['selector', :property, delegate]
      #
      #   :title
      #     #=> ['title', :title, nil]
      #
      # Raises ArgumentError when no property can be determined.
      def self.parse_rule_declaration(*args, &block)
        declaration = args.inspect # captured before the options hash is popped
        options, name = Hash === args.last ? args.pop : {}, args.first
        options = options.dup      # leave the caller's hash untouched
        delegate = options.delete(:with)
        selector, property = name ? [name.to_s, name.to_sym] : options.to_a.flatten
        raise ArgumentError, "invalid rule declaration: #{declaration}" unless property
        # Evaluate the block in the context of a new recordizer subclass. The
        # fallback base must be this class: the `Nibbler` constant is not
        # defined by this library and raised NameError here.
        delegate = Class.new(delegate || HTMLSelectorRecordizer, &block) if block_given?
        return selector, property, delegate
      end

      # Initialises every plural property to an empty Array so that
      # +recordize+ can append to it.
      def initialize
        self.class.rules.each { |name, (_selector, _delegate, plural)| send("#{name}=", []) if plural }
      end

      # Applies every rule to +doc+, stores the values on this instance and
      # returns them as a Hash (see +to_hash+).
      #
      # Plural rules append to the existing Array, so calling this more than
      # once on the same instance accumulates matches.
      def recordize(doc)
        self.class.rules.each do |target, (selector, delegate, plural)|
          if plural
            send(target).concat doc.search(selector).map { |i| parse_result(i, delegate) }
          else
            send("#{target}=", parse_result(doc.at(selector), delegate))
          end
        end
        self.to_hash
      end

      # Returns the current property values keyed by property name. Values
      # (and Array members) that respond to +to_hash+ are converted too.
      def to_hash
        converter = lambda { |obj| obj.respond_to?(:to_hash) ? obj.to_hash : obj }
        self.class.rules.keys.inject({}) do |hash, name|
          value = send(name)
          hash[name.to_sym] = Array === value ? value.map(&converter) : converter[value]
          hash
        end
      end

      protected

      # Turns a matched node into a value: callable delegates are called with
      # the node, other delegates are asked to +recordize+ it, and without a
      # delegate the node's +inner_text+ is used when available. A nil node
      # (no match) yields nil.
      def parse_result(node, delegate)
        if delegate
          delegate.respond_to?(:call) ? delegate.call(node) : delegate.recordize(node)
        elsif node.respond_to? :inner_text
          node.inner_text
        else
          node
        end unless node.nil?
      end

    end
  end
end
@@ -0,0 +1,39 @@
1
module IMW
  module Recordizer
    # Cuts a string into fields by character ranges.
    #
    # The schema is either an Array of Ranges (producing an Array of fields)
    # or a Hash mapping keys to Ranges or to nested Hashes of the same shape
    # (producing a Hash with the same structure).
    #
    #   StringSliceRecordizer.new([0..2, 3..5]).recordize("ab def")
    #     #=> ["ab", "def"]
    class StringSliceRecordizer

      # The Array or Hash of ranges given at construction time.
      attr_reader :schema

      def initialize(ranges)
        @schema = ranges
      end

      # Slices +line+ according to the schema.
      #
      # Returns an Array for an Array schema, a Hash for a Hash schema and
      # nil for any other kind of schema.
      def recordize(line)
        case schema
        when Array then slice_by_array(line, schema)
        when Hash  then slice_by_hash(line, schema)
        end
      end

      # Returns the part of +string+ covered by +range+ with surrounding
      # whitespace removed.
      #
      # String#slice returns nil when the range starts beyond the end of the
      # string; that case yields an empty field instead of raising
      # NoMethodError on nil, so lines shorter than the schema can still be
      # recordized.
      def slice_range(string, range)
        string.slice(range).to_s.strip
      end

      # Returns one field per range in +format+, in order.
      def slice_by_array(string, format)
        format.map { |range| slice_range(string, range) }
      end

      # Returns a Hash with one entry per key of +format+ whose value is a
      # Range (sliced field) or a Hash (nested record). Entries with any
      # other kind of value are left out.
      def slice_by_hash(string, format)
        format.inject({}) do |record, (key, val)|
          case val
          when Range then record[key] = slice_range(string, val)
          when Hash  then record[key] = slice_by_hash(string, val)
          end
          record
        end
      end

    end
  end
end
@@ -1,80 +1,10 @@
1
1
  require 'imw/utils/has_uri'
2
2
 
3
3
  module IMW
4
-
5
- # A resource can be anything addressable via a URI. Examples
6
- # include local files, remote files, webpages, &c.
7
- #
8
- # The IMW::Resource class takes a URI as input and then dynamically
9
- # extends itself with appropriate modules from IMW. As an example,
10
- # calling
11
- #
12
- # my_archive = IMW::Resource.new('/path/to/my/archive.tar.bz2')
13
- #
14
- # would return an IMW::Resource extended by
15
- # IMW::Archives::Tarbz2 (among other modules) which
16
- # therefore has methods for extracting, listing, and appending to
17
- # the archive.
18
- #
19
- # Modules are so extended based on handlers defined in the
20
- # <tt>imw/resources</tt> directory and accessible via
21
- # IMW::Resource.handlers. You can define your own handlers by
22
- # defining the constant IMW::Resource::USER_DEFINED_HANDLERS in your
23
- # configuration file.
24
- #
25
- # The modules extending a particular IMW::Resource instance can be
26
- # listed as follows
27
- #
28
- # my_archive.modules #=> [IMW::Local::Base, IMW::Local::File, IMW::Local::Compressible, IMW::Archives::Tarbz2]
29
- #
30
- # By default, resources are opened for reading. Passing in the
31
- # appropriate <tt>:mode</tt> option changes this:
32
- #
33
- # IMW::Resource.new('/path/to/my_new_file', :mode => 'w')
34
- #
35
- # If the <tt>:skip_modules</tt> option is passed in then the
36
- # resource will not extend itself with any modules and will
37
- # essentially only retain the bare functionality of a URI. This can
38
- # be useful when subclassing IMW::Resource or dealing with a very
39
- # strange kind of resource.
40
- #
41
- # Read the documentation for modules in IMW::Resources to learn more
42
- # about the various behaviors an IMW::Resource can acquire.
43
- #
44
- # You can also instantiate an IMW::Resource using IMW.open, which
45
- # accepts all the same arguments as IMW::Resource.new.
46
4
  class Resource
47
5
 
48
- # The mode in which to access this resource.
49
- attr_accessor :mode
50
-
51
- # A copy of the options passed to this resource on initialization.
52
- attr_accessor :resource_options
6
+ attr_accessor :mode, :resource_options
53
7
 
54
- # Create a new resource representing +uri+.
55
- #
56
- # IMW will automatically extend the resulting IMW::Resource
57
- # instance with modules appropriate for the given URI:
58
- #
59
- # r = IMW::Resource.new("http://www.infochimps.com")
60
- # r.modules
61
- # => [IMW::Schemes::Remote::Base, IMW::Schemes::Remote::RemoteFile, IMW::Schemes::HTTP, IMW::Formats::Html]
62
- #
63
- # You can prevent this altogether by passing in
64
- # <tt>:no_modules</tt>:
65
- #
66
- # r = IMW::Resource.new("http://www.infochimps.com", :no_modules => true)
67
- # r.modules
68
- # => []
69
- #
70
- # And you can exert more fine-grained control with the
71
- # <tt>:use_modules</tt> and <tt>:skip_modules</tt> options, see
72
- # IMW::Resource.extend_instance! for details.
73
- #
74
- # @param [String, Addressable::URI] uri
75
- # @param [Hash] options
76
- # @option options [String] mode the mode to open the resource in (will be ignored when inapplicable)
77
- # @return [IMW::Resource]
78
8
  def initialize uri, options={}
79
9
  self.uri = uri
80
10
  self.resource_options = options
@@ -85,20 +15,14 @@ module IMW
85
15
  # Provides resources with a wrapped Addressable::URI object.
86
16
  include IMW::Utils::HasURI
87
17
 
88
- # Provides resources with a summary, metadata, & schema.
89
- include IMW::Metadata::HasSummary
90
-
91
18
  # Gives IMW::Resource instances with the ability to dynamically
92
19
  # extend themselves with modules chosen from a set of handlers
93
20
  # stored by the IMW::Resource class.
94
21
  include IMW::Utils::DynamicallyExtendable
95
- [IMW::Schemes::HANDLERS, IMW::CompressedFiles::HANDLERS, IMW::Archives::HANDLERS, IMW::Formats::HANDLERS].each do |handlers|
22
+ [IMW::Schemes::HANDLERS, IMW::Formats::HANDLERS].each do |handlers|
96
23
  register_handlers *handlers
97
24
  end
98
-
99
- # Raise an error unless this resource exists.
100
- #
101
- # @param [String] message an optional message to include
25
+
102
26
  def should_exist!(message=nil)
103
27
  raise IMW::Error.new([message, "No path defined for #{self.inspect} extended by #{modules.join(' ')}"].compact.join(', ')) unless respond_to?(:path)
104
28
  raise IMW::Error.new([message, "No exist? method defined for #{self.inspect} extended by #{modules.join(' ')}"].compact.join(', ')) unless respond_to?(:exist?)
@@ -106,52 +30,12 @@ module IMW
106
30
  self
107
31
  end
108
32
 
109
- # Close this resource.
110
- #
111
- # Modules should hook into super() as they need to redefine this
112
- # method.
113
33
  def close
114
34
  end
115
35
 
116
- # Open a copy of this resource.
117
- #
118
- # This is useful when wanting to reset file handles. Though -- be
119
- # warned -- it does not close any file handles itself...
120
- #
121
- # @return [IMW::Resource] the new (old) resource
122
36
  def reopen
123
37
  IMW.open(uri.to_s)
124
38
  end
125
39
 
126
- # If +method+ begins with the strings +is+, +on+, or +via+ and
127
- # ends with a question mark then we interpret it as a question
128
- # this resource doesn't know how to answer -- so we have it answer
129
- # +false+.
130
- #
131
- # As an example, consider the following loop:
132
- #
133
- # IMW.open('/tmp').all_contents.each do |obj|
134
- # if obj.is_archive?
135
- # # ... do something
136
- # end
137
- # end
138
- #
139
- # When +obj+ is initialized and it _isn't_ an archive, then it
140
- # doesn't know about the <tt>is_archive?</tt> method -- but it
141
- # should therefore answer false anyway.
142
- #
143
- # This lets a basic text file answer questions about whether it's
144
- # an archive (or on S3, or accessed via some user-defined scheme,
145
- # &c.) without needing to know anything about archives (or S3 or
146
- # the user-defined scheme).
147
- def method_missing method, *args
148
- if args.empty? && method.to_s =~ /(is|on|via)_.*\?$/
149
- # querying for a boolean response so answer false
150
- return false
151
- else
152
- raise IMW::NoMethodError, "undefined method `#{method}' for #{self}, extended by #{modules.join(', ')}"
153
- end
154
- end
155
-
156
40
  end
157
41
  end