imw 0.2.18 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (172) hide show
  1. data/Gemfile +7 -26
  2. data/Gemfile.lock +13 -38
  3. data/{LICENSE → LICENSE.txt} +1 -1
  4. data/README.textile +35 -0
  5. data/Rakefile +45 -22
  6. data/VERSION +1 -1
  7. data/examples/foo.rb +19 -0
  8. data/examples/html_selector.rb +22 -0
  9. data/examples/nes_game_list.csv +625 -0
  10. data/examples/nes_gamespot.csv +1371 -0
  11. data/examples/nes_nintendo.csv +624 -0
  12. data/examples/nes_unlicensed.csv +89 -0
  13. data/examples/nes_wikipedia.csv +710 -0
  14. data/examples/nibbler_test.rb +24 -0
  15. data/examples/script.rb +19 -0
  16. data/lib/imw.rb +28 -140
  17. data/lib/imw/error.rb +9 -0
  18. data/lib/imw/recordizer.rb +8 -0
  19. data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
  20. data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
  21. data/lib/imw/resource.rb +3 -119
  22. data/lib/imw/serializer.rb +7 -0
  23. data/lib/imw/serializer/json_serializer.rb +17 -0
  24. data/lib/imw/uri.rb +41 -0
  25. data/spec/resource_spec.rb +78 -0
  26. data/spec/uri_spec.rb +55 -0
  27. metadata +81 -232
  28. data/README.rdoc +0 -371
  29. data/bin/imw +0 -5
  30. data/bin/tsv_to_json.rb +0 -29
  31. data/etc/imwrc.rb +0 -26
  32. data/examples/dataset.rb +0 -12
  33. data/examples/metadata.yml +0 -10
  34. data/lib/imw/archives.rb +0 -120
  35. data/lib/imw/archives/rar.rb +0 -19
  36. data/lib/imw/archives/tar.rb +0 -19
  37. data/lib/imw/archives/tarbz2.rb +0 -73
  38. data/lib/imw/archives/targz.rb +0 -73
  39. data/lib/imw/archives/zip.rb +0 -51
  40. data/lib/imw/boot.rb +0 -87
  41. data/lib/imw/compressed_files.rb +0 -94
  42. data/lib/imw/compressed_files/bz2.rb +0 -16
  43. data/lib/imw/compressed_files/compressible.rb +0 -75
  44. data/lib/imw/compressed_files/gz.rb +0 -16
  45. data/lib/imw/dataset.rb +0 -125
  46. data/lib/imw/dataset/paths.rb +0 -29
  47. data/lib/imw/dataset/workflow.rb +0 -195
  48. data/lib/imw/formats.rb +0 -33
  49. data/lib/imw/formats/delimited.rb +0 -170
  50. data/lib/imw/formats/excel.rb +0 -100
  51. data/lib/imw/formats/json.rb +0 -41
  52. data/lib/imw/formats/pdf.rb +0 -71
  53. data/lib/imw/formats/sgml.rb +0 -69
  54. data/lib/imw/formats/yaml.rb +0 -41
  55. data/lib/imw/metadata.rb +0 -83
  56. data/lib/imw/metadata/contains_metadata.rb +0 -54
  57. data/lib/imw/metadata/dsl.rb +0 -111
  58. data/lib/imw/metadata/field.rb +0 -37
  59. data/lib/imw/metadata/has_metadata.rb +0 -98
  60. data/lib/imw/metadata/has_summary.rb +0 -57
  61. data/lib/imw/metadata/schema.rb +0 -17
  62. data/lib/imw/parsers.rb +0 -8
  63. data/lib/imw/parsers/flat.rb +0 -44
  64. data/lib/imw/parsers/html_parser.rb +0 -387
  65. data/lib/imw/parsers/html_parser/matchers.rb +0 -289
  66. data/lib/imw/parsers/line_parser.rb +0 -87
  67. data/lib/imw/parsers/regexp_parser.rb +0 -72
  68. data/lib/imw/repository.rb +0 -12
  69. data/lib/imw/runner.rb +0 -118
  70. data/lib/imw/schemes.rb +0 -23
  71. data/lib/imw/schemes/ftp.rb +0 -142
  72. data/lib/imw/schemes/hdfs.rb +0 -251
  73. data/lib/imw/schemes/http.rb +0 -165
  74. data/lib/imw/schemes/local.rb +0 -409
  75. data/lib/imw/schemes/remote.rb +0 -119
  76. data/lib/imw/schemes/s3.rb +0 -143
  77. data/lib/imw/schemes/sql.rb +0 -129
  78. data/lib/imw/tools.rb +0 -12
  79. data/lib/imw/tools/aggregator.rb +0 -148
  80. data/lib/imw/tools/archiver.rb +0 -220
  81. data/lib/imw/tools/downloader.rb +0 -63
  82. data/lib/imw/tools/extension_analyzer.rb +0 -114
  83. data/lib/imw/tools/summarizer.rb +0 -83
  84. data/lib/imw/tools/transferer.rb +0 -167
  85. data/lib/imw/utils.rb +0 -74
  86. data/lib/imw/utils/dynamically_extendable.rb +0 -137
  87. data/lib/imw/utils/error.rb +0 -59
  88. data/lib/imw/utils/extensions/hpricot.rb +0 -34
  89. data/lib/imw/utils/has_uri.rb +0 -131
  90. data/lib/imw/utils/log.rb +0 -92
  91. data/lib/imw/utils/misc.rb +0 -57
  92. data/lib/imw/utils/paths.rb +0 -146
  93. data/lib/imw/utils/uri.rb +0 -59
  94. data/lib/imw/utils/uuid.rb +0 -33
  95. data/lib/imw/utils/validate.rb +0 -38
  96. data/lib/imw/utils/version.rb +0 -11
  97. data/spec/data/formats/delimited/sample.csv +0 -131
  98. data/spec/data/formats/delimited/sample.tsv +0 -131
  99. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
  100. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
  101. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
  102. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
  103. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
  104. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
  105. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
  106. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
  107. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
  108. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
  109. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
  110. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
  111. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
  112. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
  113. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
  114. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
  115. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
  116. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
  117. data/spec/data/formats/excel/sample.xls +0 -0
  118. data/spec/data/formats/json/sample.json +0 -1
  119. data/spec/data/formats/none/sample +0 -650
  120. data/spec/data/formats/sgml/sample.xml +0 -617
  121. data/spec/data/formats/text/sample.txt +0 -650
  122. data/spec/data/formats/yaml/sample.yaml +0 -410
  123. data/spec/data/schema-tabular.yaml +0 -11
  124. data/spec/imw/archives/rar_spec.rb +0 -16
  125. data/spec/imw/archives/tar_spec.rb +0 -16
  126. data/spec/imw/archives/tarbz2_spec.rb +0 -24
  127. data/spec/imw/archives/targz_spec.rb +0 -21
  128. data/spec/imw/archives/zip_spec.rb +0 -16
  129. data/spec/imw/archives_spec.rb +0 -77
  130. data/spec/imw/compressed_files/bz2_spec.rb +0 -15
  131. data/spec/imw/compressed_files/compressible_spec.rb +0 -36
  132. data/spec/imw/compressed_files/gz_spec.rb +0 -15
  133. data/spec/imw/compressed_files_spec.rb +0 -47
  134. data/spec/imw/dataset/paths_spec.rb +0 -32
  135. data/spec/imw/dataset/workflow_spec.rb +0 -41
  136. data/spec/imw/formats/delimited_spec.rb +0 -44
  137. data/spec/imw/formats/excel_spec.rb +0 -55
  138. data/spec/imw/formats/json_spec.rb +0 -18
  139. data/spec/imw/formats/sgml_spec.rb +0 -24
  140. data/spec/imw/formats/yaml_spec.rb +0 -19
  141. data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
  142. data/spec/imw/metadata/field_spec.rb +0 -25
  143. data/spec/imw/metadata/has_metadata_spec.rb +0 -58
  144. data/spec/imw/metadata/has_summary_spec.rb +0 -32
  145. data/spec/imw/metadata/schema_spec.rb +0 -24
  146. data/spec/imw/metadata_spec.rb +0 -86
  147. data/spec/imw/parsers/line_parser_spec.rb +0 -96
  148. data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
  149. data/spec/imw/resource_spec.rb +0 -32
  150. data/spec/imw/schemes/hdfs_spec.rb +0 -67
  151. data/spec/imw/schemes/http_spec.rb +0 -19
  152. data/spec/imw/schemes/local_spec.rb +0 -165
  153. data/spec/imw/schemes/remote_spec.rb +0 -38
  154. data/spec/imw/schemes/s3_spec.rb +0 -31
  155. data/spec/imw/schemes/sql_spec.rb +0 -3
  156. data/spec/imw/tools/aggregator_spec.rb +0 -71
  157. data/spec/imw/tools/archiver_spec.rb +0 -120
  158. data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
  159. data/spec/imw/tools/summarizer_spec.rb +0 -8
  160. data/spec/imw/tools/transferer_spec.rb +0 -195
  161. data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
  162. data/spec/imw/utils/has_uri_spec.rb +0 -61
  163. data/spec/imw/utils/paths_spec.rb +0 -10
  164. data/spec/imw/utils/shared_paths_spec.rb +0 -29
  165. data/spec/imw_spec.rb +0 -14
  166. data/spec/rcov.opts +0 -1
  167. data/spec/spec_helper.rb +0 -31
  168. data/spec/support/custom_matchers.rb +0 -28
  169. data/spec/support/file_contents_matcher.rb +0 -30
  170. data/spec/support/paths_matcher.rb +0 -66
  171. data/spec/support/random.rb +0 -213
  172. data/spec/support/without_regard_to_order_matcher.rb +0 -41
@@ -0,0 +1,24 @@
# Example script: scrapes Wikipedia's list of NES games using the Nibbler gem.
# NOTE(review): Nibbler's `element`/`elements`/`parse` semantics are not visible
# here; the notes below describe only what the declarations literally contain.
1
+ require 'rubygems'
2
+ require 'nibbler'
3
+ require 'open-uri'
4
+
# One table row: three selector => property rules (:name, :publisher, :year).
5
+ class Game < Nibbler
6
+ element 'td/i/a' => :name
7
+ element 'td/a' => :publisher
8
+ element 'td[2]' => :year
9
+ end
10
+
# The page: each rule selects the rows of the first table following the <h2>
# whose <span> has the given id, and hands every row to Game via :with.
11
+ class Table < Nibbler
12
+
13
+ elements "//h2/span[@id='Licensed_games']/following::table[1]/tr" => :licensed_games, :with => Game
14
+ elements "//h2/span[@id='Unlicensed_games']/following::table[1]/tr" => :unlicensed_games, :with => Game
15
+
16
+ end
17
+
# Fetches the live page over HTTP (needs network access) and parses it.
18
+ foo = Table.parse open('http://en.wikipedia.org/wiki/List_of_Nintendo_Entertainment_System_games')
19
+
# Prints the entry at index 1 of the unlicensed list; presumably index 0 is the
# table's header row -- TODO confirm.
20
+ puts foo.unlicensed_games[1].inspect
21
+
22
+
23
+
24
+
@@ -0,0 +1,19 @@
# Example script: walks GameSpot's paginated NES game listing and writes every
# table row to nes_gamespot.csv (columns: title, category, release).
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+ require 'open-uri'
5
+ require 'nokogiri'
6
+ require 'fastercsv'
7
+
# The URL ends in "page=" so the page number can simply be appended to it.
8
+ url = "http://www.gamespot.com/games.html?platform=19&category=&type=games&mode=all&sort=title&sortdir=desc&page="
9
+ pages = (0..27)
10
+
11
+ FasterCSV.open("nes_gamespot.csv","w", :write_headers => true, :headers => %w[ title category release ]) do |csv|
12
+ pages.each do |page|
# One HTTP request per page (needs network access).
13
+ doc = Nokogiri::HTML(open(url + page.to_s))
14
+ doc.xpath("//tr").each do |node|
# A row's text is split on newlines; each piece is stripped and blanks dropped.
15
+ game = node.content.split("\n").map {|i| i.strip }.reject { |i| i.length == 0 }
# Rows containing the literal "Release Date" are skipped (presumably the
# listing's own header row -- TODO confirm against the page).
16
+ csv << game unless game.include?("Release Date")
17
+ end
18
+ end
19
+ end
data/lib/imw.rb CHANGED
@@ -1,60 +1,16 @@
1
1
  require 'rubygems'
2
- require 'bundler/setup'
3
- require 'imw/boot'
4
2
  require 'imw/utils'
3
+ require 'imw/error'
4
+ require 'imw/uri'
5
5
 
6
- # The Infinite Monkeywrench (IMW) is a Ruby library for ripping,
7
- # extracting, parsing, munging, and packaging datasets. It allows you
8
- # to handle different data formats transparently as well as organize
9
- # transformations of data as a network of dependencies (a la Make or
10
- # Rake).
11
- #
12
- # IMW has a few central concepts: resources, metadata, datasets,
13
- # workflows, and repositories.
14
- #
15
- # Resources represent individual data resources like local files,
16
- # websites, databases, &c. An IMW::Resource is typically instantiated
17
- # via IMW.open, with IMW doing the work of figuring out what to return
18
- # based on the URI passed in.
19
- #
20
- # A Resource can have a schema which describes the fields in its data.
21
- # IMW::Metadata consists of classes which describe fields.
22
- #
23
- # Datasets represent collections of related data resources .. An
24
- # IMW::Dataset comes with a pre-defined (but customizable) workflow
25
- # that takes data resources through several steps: rip, parse, munge,
26
- # and package. The workflow leverages Rake and so the various tasks
27
- # that are necessary to process the data till it is nice and pretty
28
- # can all be linked with dependencies.
29
- #
30
- # Repositories are collections of datasets and it is on these
31
- # collections that the +imw+ command line tool operates.
32
6
  module IMW
7
+
8
+ autoload :Recordizer, 'imw/recordizer'
33
9
  autoload :Resource, 'imw/resource'
34
10
  autoload :Schemes, 'imw/schemes'
35
- autoload :Archives, 'imw/archives'
36
- autoload :CompressedFiles, 'imw/compressed_files'
37
- autoload :Formats, 'imw/formats'
38
- autoload :Tools, 'imw/tools'
11
+ autoload :Formats, 'imw/formats'
39
12
  autoload :Parsers, 'imw/parsers'
40
- autoload :Dataset, 'imw/dataset'
41
- autoload :Repository, 'imw/repository'
42
- autoload :Metadata, 'imw/metadata'
43
13
 
44
- # Open a resource at the given +uri+. The resource will
45
- # automatically be extended by modules which make sense given the
46
- # +uri+.
47
- #
48
- # See the documentation for IMW::Resource and the various modules
49
- # within IMW::Resources for more information and options.
50
- #
51
- # Passing in an IMW::Resource will simply return it.
52
- #
53
- # @param [String, Addressable::URI, IMW::Resource] obj the URI to open
54
- # @param [Hash] options
55
- # @option options [Array<String,Module>] as same as <tt>:use_modules</tt> in IMW::Resource.extend_instance!
56
- # @option options [Array<String,Module>] without same as <tt>:skip_modules</tt> in IMW::Resource.extend_instance!
57
- # @return [IMW::Resource] the resulting resource, property extended for the given URI
58
14
  def self.open obj, options={}, &block
59
15
  if obj.is_a?(IMW::Resource)
60
16
  resource = obj
@@ -63,103 +19,35 @@ module IMW
63
19
  options[:skip_modules] ||= (options[:without] || [])
64
20
  resource = IMW::Resource.new(obj, options)
65
21
  end
66
- if block_given?
67
- yield resource
68
- resource.close
69
- else
70
- resource
71
- end
72
22
  end
73
23
 
74
- # Open (and create if necessary) a directory at the given URI.
75
- #
76
- # Will automatically create directories recursively. Options will
77
- # be passed to IMW.open and interpreted appropriately. If a block
78
- # is passed, the directory will be created before the block is
79
- # yielded to.
80
- #
81
- # @param [String, IMW::Resource] uri
82
- # @param [Hash] options
83
- # @return [IMW::Resource]
84
- def self.dir! uri, options={}, &block
85
- if block_given?
86
- new_dir = open(uri, options.merge(:as => (options[:as] || []) + [Schemes::Local::LocalDirectory])) do |d|
87
- new_dir.create
88
- yield
24
+ class Resource
25
+
26
+ attr_reader :uri
27
+
28
+ def initialize(uri, mode='r')
29
+ raise FileModeError.new("'#{mode}' is not a valid access mode") unless valid_modes.include? mode
30
+ @uri = Uri.new(uri)
31
+ end
32
+
33
+ def self.open(uri, mode='r', &blk)
34
+ resource = Resource.new(uri, mode)
35
+ if block_given?
36
+ yield resource
37
+ else
38
+ return resource
89
39
  end
90
- else
91
- new_dir = open(uri, options.merge(:as => (options[:as] || []) + [Schemes::Local::LocalDirectory]))
92
- new_dir.create
93
40
  end
94
- new_dir
95
- end
96
-
97
- # Works the same way as IMW.open except opens the resource for
98
- # writing.
99
- #
100
- # @param [String, Addressable::URI] uri the URI to open
101
- # @return [IMW::Resource] the resultng resource, properly extended for the given URI and opened for writing.
102
- def self.open! uri, options={}, &block
103
- open(uri, options.merge(:mode => 'w'), &block)
104
- end
105
41
 
106
- # The default repository in which to place datasets. See the
107
- # documentation for IMW::Repository for more information on how
108
- # datasets and repositories fit together.
109
- #
110
- # @return [IMW::Repository] the default IMW repository
111
- def self.repository
112
- @@repository ||= IMW::Repository.new
113
- end
42
+ def self.exists? resource
43
+ true
44
+ end
114
45
 
115
- # Create a dataset and put it in the default IMW repository.
116
- #
117
- # Evaluates the given block in the context of the new dataset. This
118
- # allows you to define tasks, add paths, and use defined metadata in
119
- # an elegant way.
120
- #
121
- # IMW.dataset :my_dataset do
122
- #
123
- # # Define some paths we're going to use
124
- # add_path :original, :rawd, 'original.csv'
125
- # add_path :filtered, :fixd, 'filtered.csv'
126
- # add_path :package, :pkgd, 'filtered.tar.bz2'
127
- #
128
- # # Copy a CSV filefrom a website to this machine.
129
- # rip do
130
- # open('http://mysite.com/data_archives/2010/03/03.csv').cp(path_to(:original))
131
- # end
132
- #
133
- # # Filter the original CSV data by the
134
- # # <tt>meets_some_condition?</tt> method we define elsewhere...
135
- # munge do
136
- # open!(path_to(:filtered)) do |filtered|
137
- # open(path_to(:original)).each do |row|
138
- # filtered << row if meets_some_condition?(row)
139
- # end
140
- # end
141
- #
142
- # # Compress the filtered data to an archive.
143
- # package do
144
- # open(path_to(:filtered)).compress.mv(path_to(:package))
145
- # end
146
- # end
147
- #
148
- # See the <tt>/examples</tt> directory of the IMW distribution for
149
- # more examples.
150
- #
151
- # @param [Symbol, String] handle the handle to identify this dataset with
152
- # @param [Hash] options a hash of options (see IMW::Dataset)
153
- # @return [IMW::Dataset] the new dataset
154
- def self.dataset handle, options={}, &block
155
- d = IMW::Dataset.new(handle, options.merge(:repository => IMW.repository))
156
- d.instance_eval(&block) if block_given?
157
- d
158
- end
46
+ private
47
+ def valid_modes
48
+ %w[ r w a ]
49
+ end
159
50
 
160
- end
51
+ end
161
52
 
162
- # Works just like IMW.dataset but defined at a top-level scope.
163
- def dataset handle, options={}, &block
164
- IMW.dataset(handle, options, &block)
165
53
  end
@@ -0,0 +1,9 @@
# Exception classes for IMW. Each is created with Class.new and assigned to a
# constant inside the IMW module.
1
+ module IMW
2
+
# Base class of the IMW errors below; descends from StandardError.
3
+ Error = Class.new(StandardError)
4
+
# Subclass of IMW::Error.
5
+ FileModeError = Class.new(Error)
6
+
# Subclass of IMW::Error.
7
+ InvalidFormatError = Class.new(Error)
8
+
9
+ end
@@ -0,0 +1,8 @@
# Namespace for recordizer classes. Both constants are registered with
# autoload, so their files are required only on first reference.
1
+ module IMW
2
+ module Recordizer
3
+
4
+ autoload :StringSliceRecordizer, 'imw/recordizer/string_slice_recordizer'
5
+ autoload :HTMLSelectorRecordizer, 'imw/recordizer/html_selector_recordizer'
6
+
7
+ end
8
+ end
@@ -0,0 +1,86 @@
module IMW
  module Recordizer

    # Declarative scraper that turns a parsed document into a Hash.
    #
    # Subclasses declare rules with +element+ (single value) and +elements+
    # (list of values). Each rule maps a selector to a named property:
    #
    #   class Game < IMW::Recordizer::HTMLSelectorRecordizer
    #     element  'td/a' => :publisher
    #     elements 'td/i' => :tags, :with => lambda { |node| node.inner_text.upcase }
    #   end
    #
    #   Game.recordize(doc) #=> { :publisher => "...", :tags => [...] }
    #
    # The document (and every node handed to a delegate class) must respond to
    # +at(selector)+ and +search(selector)+.
    class HTMLSelectorRecordizer

      # Rules declared on this class, keyed by property name. Each value is
      # <tt>[selector, delegate]</tt>, with a trailing +true+ appended for
      # plural rules.
      #
      # @return [Hash{Symbol => Array}]
      def self.rules
        @rules ||= {}
      end

      # Gives every subclass its own copy of the parent's rules. The rule
      # arrays are duplicated so that a later change in one class can never
      # leak into the other.
      def self.inherited(subclass)
        super
        rules.each { |name, rule| subclass.rules[name] = rule.dup }
      end

      # Declares a single-valued rule and defines an accessor for it.
      #
      # @return [Symbol] the property name
      def self.element(*args, &block)
        selector, name, delegate = parse_rule_declaration(*args, &block)
        rules[name] = [selector, delegate]
        attr_accessor name
        name
      end

      # Declares a multi-valued rule: like +element+, but every node matching
      # the selector is collected into an Array.
      def self.elements(*args, &block)
        name = element(*args, &block)
        rules[name] << true
      end

      # Builds a fresh instance and recordizes +doc+ with it.
      #
      # @return [Hash]
      def self.recordize(doc)
        new.recordize(doc)
      end

      # Rule declaration forms:
      #
      #   { 'selector' => :property, :with => delegate }
      #     #=> ['selector', :property, delegate]
      #
      #   :title
      #     #=> ['title', :title, nil]
      #
      # When a block is given it is evaluated in the context of a new scraper
      # subclass (of the +:with+ class if one was supplied, otherwise of
      # HTMLSelectorRecordizer), and that subclass becomes the delegate.
      #
      # @raise [ArgumentError] if no property name can be determined
      # @return [Array] selector, property, delegate
      def self.parse_rule_declaration(*args, &block)
        declaration = args.dup
        # Work on a copy so the caller's hash keeps its :with entry.
        options  = Hash === args.last ? args.pop.dup : {}
        name     = args.first
        delegate = options.delete(:with)
        selector, property = name ? [name.to_s, name.to_sym] : options.to_a.flatten
        raise ArgumentError, "invalid rule declaration: #{declaration.inspect}" unless property
        delegate = Class.new(delegate || HTMLSelectorRecordizer, &block) if block_given?
        [selector, property, delegate]
      end

      # Plural properties start out as empty arrays so +recordize+ can append.
      def initialize
        self.class.rules.each do |name, (_selector, _delegate, plural)|
          send("#{name}=", []) if plural
        end
      end

      # Applies every declared rule to +doc+, stores the results in the
      # accessors and returns them as a Hash.
      #
      # @param doc [#at, #search] the document or node to scrape
      # @return [Hash{Symbol => Object}]
      def recordize(doc)
        self.class.rules.each do |target, (selector, delegate, plural)|
          if plural
            send(target).concat(doc.search(selector).map { |node| parse_result(node, delegate) })
          else
            send("#{target}=", parse_result(doc.at(selector), delegate))
          end
        end
        to_hash
      end

      # @return [Hash{Symbol => Object}] current property values; any value
      #   (or array member) that responds to +to_hash+ is converted with it
      def to_hash
        converter = lambda { |obj| obj.respond_to?(:to_hash) ? obj.to_hash : obj }
        self.class.rules.keys.inject({}) do |hash, name|
          value = send(name)
          hash[name.to_sym] = Array === value ? value.map(&converter) : converter[value]
          hash
        end
      end

      protected

      # Converts one matched node into a value: via the delegate when present
      # (callables are called, anything else is asked to +recordize+), via
      # +inner_text+ when the node has it, otherwise the node itself.
      # A missing node (+nil+) yields +nil+.
      def parse_result(node, delegate)
        return if node.nil?

        if delegate
          delegate.respond_to?(:call) ? delegate.call(node) : delegate.recordize(node)
        elsif node.respond_to?(:inner_text)
          node.inner_text
        else
          node
        end
      end

    end
  end
end
@@ -0,0 +1,39 @@
module IMW
  module Recordizer

    # Cuts fixed-width fields out of a line of text.
    #
    # The schema is either an Array of Ranges (producing an Array of fields)
    # or a Hash mapping field names to Ranges or nested Hashes (producing a
    # Hash of the same shape):
    #
    #   StringSliceRecordizer.new([0..2, 3..5]).recordize("ab cde")
    #     #=> ["ab", "cde"]
    #   StringSliceRecordizer.new(:id => 0..2, :name => 3..5).recordize("ab cde")
    #     #=> { :id => "ab", :name => "cde" }
    class StringSliceRecordizer

      # The Array or Hash of ranges given to the constructor.
      attr_reader :schema

      # @param ranges [Array<Range>, Hash] the field layout
      def initialize ranges
        @schema = ranges
      end

      # Slices +line+ according to the schema.
      #
      # @param line [String]
      # @return [Array, Hash, nil] the fields, or +nil+ when the schema is
      #   neither an Array nor a Hash
      def recordize line
        layout = schema
        case layout
        when Array then slice_by_array(line, layout)
        when Hash  then slice_by_hash(line, layout)
        end
      end

      # Extracts one field with surrounding whitespace removed.
      #
      # @return [String, nil] +nil+ when +range+ starts beyond the end of
      #   +string+ (String#slice returns nil there, so it must not be stripped)
      def slice_range string, range
        field = string.slice(range)
        field && field.strip
      end

      # @return [Array<String, nil>] one field per range, in schema order
      def slice_by_array string, format
        format.map { |range| slice_range(string, range) }
      end

      # Builds a Hash with the same keys as +format+. Range values become
      # fields, Hash values recurse, and values of any other type are skipped.
      #
      # @return [Hash]
      def slice_by_hash string, format
        format.inject({}) do |record, (key, val)|
          case val
          when Range then record[key] = slice_range(string, val)
          when Hash  then record[key] = slice_by_hash(string, val)
          end
          record
        end
      end

    end
  end
end
@@ -1,80 +1,10 @@
1
1
  require 'imw/utils/has_uri'
2
2
 
3
3
  module IMW
4
-
5
- # A resource can be anything addressable via a URI. Examples
6
- # include local files, remote files, webpages, &c.
7
- #
8
- # The IMW::Resource class takes a URI as input and then dynamically
9
- # extends itself with appropriate modules from IMW. As an example,
10
- # calling
11
- #
12
- # my_archive = IMW::Resource.new('/path/to/my/archive.tar.bz2')
13
- #
14
- # would return an IMW::Resource extended by
15
- # IMW::Archives::Tarbz2 (among other modules) which
16
- # therefore has methods for extracting, listing, and appending to
17
- # the archive.
18
- #
19
- # Modules are so extended based on handlers defined in the
20
- # <tt>imw/resources</tt> directory and accessible via
21
- # IMW::Resource.handlers. You can define your own handlers by
22
- # defining the constant IMW::Resource::USER_DEFINED_HANDLERS in your
23
- # configuration file.
24
- #
25
- # The modules extending a particular IMW::Resource instance can be
26
- # listed as follows
27
- #
28
- # my_archive.modules #=> [IMW::Local::Base, IMW::Local::File, IMW::Local::Compressible, IMW::Archives::Tarbz2]
29
- #
30
- # By default, resources are opened for reading. Passing in the
31
- # appropriate <tt>:mode</tt> option changes this:
32
- #
33
- # IMW::Resource.new('/path/to/my_new_file', :mode => 'w')
34
- #
35
- # If the <tt>:skip_modules</tt> option is passed in then the
36
- # resource will not extend itself with any modules and will
37
- # essentially only retain the bare functionality of a URI. This can
38
- # be useful when subclassing IMW::Resource or dealing with a very
39
- # strange kind of resource.
40
- #
41
- # Read the documentation for modules in IMW::Resources to learn more
42
- # about the various behaviors an IMW::Resource can acquire.
43
- #
44
- # You can also instantiate an IMW::Resource using IMW.open, which
45
- # accepts all the same arguments as IMW::Resource.new.
46
4
  class Resource
47
5
 
48
- # The mode in which to access this resource.
49
- attr_accessor :mode
50
-
51
- # A copy of the options passed to this resource on initialization.
52
- attr_accessor :resource_options
6
+ attr_accessor :mode, :resource_options
53
7
 
54
- # Create a new resource representing +uri+.
55
- #
56
- # IMW will automatically extend the resulting IMW::Resource
57
- # instance with modules appropriate for the given URI:
58
- #
59
- # r = IMW::Resource.new("http://www.infochimps.com")
60
- # r.modules
61
- # => [IMW::Schemes::Remote::Base, IMW::Schemes::Remote::RemoteFile, IMW::Schemes::HTTP, IMW::Formats::Html]
62
- #
63
- # You can prevent this altogether by passing in
64
- # <tt>:no_modules</tt>:
65
- #
66
- # r = IMW::Resource.new("http://www.infochimps.com", :no_modules => true)
67
- # r.modules
68
- # => []
69
- #
70
- # And you can exert more fine-grained control with the
71
- # <tt>:use_modules</tt> and <tt>:skip_modules</tt> options, see
72
- # IMW::Resource.extend_instance! for details.
73
- #
74
- # @param [String, Addressable::URI] uri
75
- # @param [Hash] options
76
- # @option options [String] mode the mode to open the resource in (will be ignored when inapplicable)
77
- # @return [IMW::Resource]
78
8
  def initialize uri, options={}
79
9
  self.uri = uri
80
10
  self.resource_options = options
@@ -85,20 +15,14 @@ module IMW
85
15
  # Provides resources with a wrapped Addressable::URI object.
86
16
  include IMW::Utils::HasURI
87
17
 
88
- # Provides resources with a summary, metadata, & schema.
89
- include IMW::Metadata::HasSummary
90
-
91
18
  # Gives IMW::Resource instances with the ability to dynamically
92
19
  # extend themselves with modules chosen from a set of handlers
93
20
  # stored by the IMW::Resource class.
94
21
  include IMW::Utils::DynamicallyExtendable
95
- [IMW::Schemes::HANDLERS, IMW::CompressedFiles::HANDLERS, IMW::Archives::HANDLERS, IMW::Formats::HANDLERS].each do |handlers|
22
+ [IMW::Schemes::HANDLERS, IMW::Formats::HANDLERS].each do |handlers|
96
23
  register_handlers *handlers
97
24
  end
98
-
99
- # Raise an error unless this resource exists.
100
- #
101
- # @param [String] message an optional message to include
25
+
102
26
  def should_exist!(message=nil)
103
27
  raise IMW::Error.new([message, "No path defined for #{self.inspect} extended by #{modules.join(' ')}"].compact.join(', ')) unless respond_to?(:path)
104
28
  raise IMW::Error.new([message, "No exist? method defined for #{self.inspect} extended by #{modules.join(' ')}"].compact.join(', ')) unless respond_to?(:exist?)
@@ -106,52 +30,12 @@ module IMW
106
30
  self
107
31
  end
108
32
 
109
- # Close this resource.
110
- #
111
- # Modules should hook into super() as they need to redefine this
112
- # method.
113
33
  def close
114
34
  end
115
35
 
116
- # Open a copy of this resource.
117
- #
118
- # This is useful when wanting to reset file handles. Though -- be
119
- # warned -- it does not close any file handles itself...
120
- #
121
- # @return [IMW::Resource] the new (old) resource
122
36
  def reopen
123
37
  IMW.open(uri.to_s)
124
38
  end
125
39
 
126
- # If +method+ begins with the strings +is+, +on+, or +via+ and
127
- # ends with a question mark then we interpret it as a question
128
- # this resource doesn't know how to answer -- so we have it answer
129
- # +false+.
130
- #
131
- # As an example, consider the following loop:
132
- #
133
- # IMW.open('/tmp').all_contents.each do |obj|
134
- # if obj.is_archive?
135
- # # ... do something
136
- # end
137
- # end
138
- #
139
- # When +obj+ is initialized and it _isn't_ an archive, then it
140
- # doesn't know about the <tt>is_archive?</tt> method -- but it
141
- # should therefore answer false anyway.
142
- #
143
- # This lets a basic text file answer questions about whether it's
144
- # an archive (or on S3, or accessed via some user-defined scheme,
145
- # &c.) without needing to know anything about archives (or S3 or
146
- # the user-defined scheme).
147
- def method_missing method, *args
148
- if args.empty? && method.to_s =~ /(is|on|via)_.*\?$/
149
- # querying for a boolean response so answer false
150
- return false
151
- else
152
- raise IMW::NoMethodError, "undefined method `#{method}' for #{self}, extended by #{modules.join(', ')}"
153
- end
154
- end
155
-
156
40
  end
157
41
  end