imw 0.2.18 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (172) hide show
  1. data/Gemfile +7 -26
  2. data/Gemfile.lock +13 -38
  3. data/{LICENSE → LICENSE.txt} +1 -1
  4. data/README.textile +35 -0
  5. data/Rakefile +45 -22
  6. data/VERSION +1 -1
  7. data/examples/foo.rb +19 -0
  8. data/examples/html_selector.rb +22 -0
  9. data/examples/nes_game_list.csv +625 -0
  10. data/examples/nes_gamespot.csv +1371 -0
  11. data/examples/nes_nintendo.csv +624 -0
  12. data/examples/nes_unlicensed.csv +89 -0
  13. data/examples/nes_wikipedia.csv +710 -0
  14. data/examples/nibbler_test.rb +24 -0
  15. data/examples/script.rb +19 -0
  16. data/lib/imw.rb +28 -140
  17. data/lib/imw/error.rb +9 -0
  18. data/lib/imw/recordizer.rb +8 -0
  19. data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
  20. data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
  21. data/lib/imw/resource.rb +3 -119
  22. data/lib/imw/serializer.rb +7 -0
  23. data/lib/imw/serializer/json_serializer.rb +17 -0
  24. data/lib/imw/uri.rb +41 -0
  25. data/spec/resource_spec.rb +78 -0
  26. data/spec/uri_spec.rb +55 -0
  27. metadata +81 -232
  28. data/README.rdoc +0 -371
  29. data/bin/imw +0 -5
  30. data/bin/tsv_to_json.rb +0 -29
  31. data/etc/imwrc.rb +0 -26
  32. data/examples/dataset.rb +0 -12
  33. data/examples/metadata.yml +0 -10
  34. data/lib/imw/archives.rb +0 -120
  35. data/lib/imw/archives/rar.rb +0 -19
  36. data/lib/imw/archives/tar.rb +0 -19
  37. data/lib/imw/archives/tarbz2.rb +0 -73
  38. data/lib/imw/archives/targz.rb +0 -73
  39. data/lib/imw/archives/zip.rb +0 -51
  40. data/lib/imw/boot.rb +0 -87
  41. data/lib/imw/compressed_files.rb +0 -94
  42. data/lib/imw/compressed_files/bz2.rb +0 -16
  43. data/lib/imw/compressed_files/compressible.rb +0 -75
  44. data/lib/imw/compressed_files/gz.rb +0 -16
  45. data/lib/imw/dataset.rb +0 -125
  46. data/lib/imw/dataset/paths.rb +0 -29
  47. data/lib/imw/dataset/workflow.rb +0 -195
  48. data/lib/imw/formats.rb +0 -33
  49. data/lib/imw/formats/delimited.rb +0 -170
  50. data/lib/imw/formats/excel.rb +0 -100
  51. data/lib/imw/formats/json.rb +0 -41
  52. data/lib/imw/formats/pdf.rb +0 -71
  53. data/lib/imw/formats/sgml.rb +0 -69
  54. data/lib/imw/formats/yaml.rb +0 -41
  55. data/lib/imw/metadata.rb +0 -83
  56. data/lib/imw/metadata/contains_metadata.rb +0 -54
  57. data/lib/imw/metadata/dsl.rb +0 -111
  58. data/lib/imw/metadata/field.rb +0 -37
  59. data/lib/imw/metadata/has_metadata.rb +0 -98
  60. data/lib/imw/metadata/has_summary.rb +0 -57
  61. data/lib/imw/metadata/schema.rb +0 -17
  62. data/lib/imw/parsers.rb +0 -8
  63. data/lib/imw/parsers/flat.rb +0 -44
  64. data/lib/imw/parsers/html_parser.rb +0 -387
  65. data/lib/imw/parsers/html_parser/matchers.rb +0 -289
  66. data/lib/imw/parsers/line_parser.rb +0 -87
  67. data/lib/imw/parsers/regexp_parser.rb +0 -72
  68. data/lib/imw/repository.rb +0 -12
  69. data/lib/imw/runner.rb +0 -118
  70. data/lib/imw/schemes.rb +0 -23
  71. data/lib/imw/schemes/ftp.rb +0 -142
  72. data/lib/imw/schemes/hdfs.rb +0 -251
  73. data/lib/imw/schemes/http.rb +0 -165
  74. data/lib/imw/schemes/local.rb +0 -409
  75. data/lib/imw/schemes/remote.rb +0 -119
  76. data/lib/imw/schemes/s3.rb +0 -143
  77. data/lib/imw/schemes/sql.rb +0 -129
  78. data/lib/imw/tools.rb +0 -12
  79. data/lib/imw/tools/aggregator.rb +0 -148
  80. data/lib/imw/tools/archiver.rb +0 -220
  81. data/lib/imw/tools/downloader.rb +0 -63
  82. data/lib/imw/tools/extension_analyzer.rb +0 -114
  83. data/lib/imw/tools/summarizer.rb +0 -83
  84. data/lib/imw/tools/transferer.rb +0 -167
  85. data/lib/imw/utils.rb +0 -74
  86. data/lib/imw/utils/dynamically_extendable.rb +0 -137
  87. data/lib/imw/utils/error.rb +0 -59
  88. data/lib/imw/utils/extensions/hpricot.rb +0 -34
  89. data/lib/imw/utils/has_uri.rb +0 -131
  90. data/lib/imw/utils/log.rb +0 -92
  91. data/lib/imw/utils/misc.rb +0 -57
  92. data/lib/imw/utils/paths.rb +0 -146
  93. data/lib/imw/utils/uri.rb +0 -59
  94. data/lib/imw/utils/uuid.rb +0 -33
  95. data/lib/imw/utils/validate.rb +0 -38
  96. data/lib/imw/utils/version.rb +0 -11
  97. data/spec/data/formats/delimited/sample.csv +0 -131
  98. data/spec/data/formats/delimited/sample.tsv +0 -131
  99. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
  100. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
  101. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
  102. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
  103. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
  104. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
  105. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
  106. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
  107. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
  108. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
  109. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
  110. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
  111. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
  112. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
  113. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
  114. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
  115. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
  116. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
  117. data/spec/data/formats/excel/sample.xls +0 -0
  118. data/spec/data/formats/json/sample.json +0 -1
  119. data/spec/data/formats/none/sample +0 -650
  120. data/spec/data/formats/sgml/sample.xml +0 -617
  121. data/spec/data/formats/text/sample.txt +0 -650
  122. data/spec/data/formats/yaml/sample.yaml +0 -410
  123. data/spec/data/schema-tabular.yaml +0 -11
  124. data/spec/imw/archives/rar_spec.rb +0 -16
  125. data/spec/imw/archives/tar_spec.rb +0 -16
  126. data/spec/imw/archives/tarbz2_spec.rb +0 -24
  127. data/spec/imw/archives/targz_spec.rb +0 -21
  128. data/spec/imw/archives/zip_spec.rb +0 -16
  129. data/spec/imw/archives_spec.rb +0 -77
  130. data/spec/imw/compressed_files/bz2_spec.rb +0 -15
  131. data/spec/imw/compressed_files/compressible_spec.rb +0 -36
  132. data/spec/imw/compressed_files/gz_spec.rb +0 -15
  133. data/spec/imw/compressed_files_spec.rb +0 -47
  134. data/spec/imw/dataset/paths_spec.rb +0 -32
  135. data/spec/imw/dataset/workflow_spec.rb +0 -41
  136. data/spec/imw/formats/delimited_spec.rb +0 -44
  137. data/spec/imw/formats/excel_spec.rb +0 -55
  138. data/spec/imw/formats/json_spec.rb +0 -18
  139. data/spec/imw/formats/sgml_spec.rb +0 -24
  140. data/spec/imw/formats/yaml_spec.rb +0 -19
  141. data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
  142. data/spec/imw/metadata/field_spec.rb +0 -25
  143. data/spec/imw/metadata/has_metadata_spec.rb +0 -58
  144. data/spec/imw/metadata/has_summary_spec.rb +0 -32
  145. data/spec/imw/metadata/schema_spec.rb +0 -24
  146. data/spec/imw/metadata_spec.rb +0 -86
  147. data/spec/imw/parsers/line_parser_spec.rb +0 -96
  148. data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
  149. data/spec/imw/resource_spec.rb +0 -32
  150. data/spec/imw/schemes/hdfs_spec.rb +0 -67
  151. data/spec/imw/schemes/http_spec.rb +0 -19
  152. data/spec/imw/schemes/local_spec.rb +0 -165
  153. data/spec/imw/schemes/remote_spec.rb +0 -38
  154. data/spec/imw/schemes/s3_spec.rb +0 -31
  155. data/spec/imw/schemes/sql_spec.rb +0 -3
  156. data/spec/imw/tools/aggregator_spec.rb +0 -71
  157. data/spec/imw/tools/archiver_spec.rb +0 -120
  158. data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
  159. data/spec/imw/tools/summarizer_spec.rb +0 -8
  160. data/spec/imw/tools/transferer_spec.rb +0 -195
  161. data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
  162. data/spec/imw/utils/has_uri_spec.rb +0 -61
  163. data/spec/imw/utils/paths_spec.rb +0 -10
  164. data/spec/imw/utils/shared_paths_spec.rb +0 -29
  165. data/spec/imw_spec.rb +0 -14
  166. data/spec/rcov.opts +0 -1
  167. data/spec/spec_helper.rb +0 -31
  168. data/spec/support/custom_matchers.rb +0 -28
  169. data/spec/support/file_contents_matcher.rb +0 -30
  170. data/spec/support/paths_matcher.rb +0 -66
  171. data/spec/support/random.rb +0 -213
  172. data/spec/support/without_regard_to_order_matcher.rb +0 -41
@@ -1,33 +0,0 @@
1
- module IMW
2
- module Formats
3
- autoload :Csv, 'imw/formats/delimited'
4
- autoload :Tsv, 'imw/formats/delimited'
5
- autoload :Excel, 'imw/formats/excel'
6
- autoload :Json, 'imw/formats/json'
7
- autoload :Xml, 'imw/formats/sgml'
8
- autoload :Xsl, 'imw/formats/sgml'
9
- autoload :Html, 'imw/formats/sgml'
10
- autoload :Xhtml, 'imw/formats/sgml'
11
- autoload :Rdf, 'imw/formats/sgml'
12
- autoload :Yaml, 'imw/formats/yaml'
13
- autoload :Pdf, 'imw/formats/pdf'
14
-
15
- # Handlers which augment a resource with data format specific
16
- # methods.
17
- HANDLERS = [
18
- [ "Formats::Csv", /\.csv$/i ],
19
- [ "Formats::Tsv", /\.tsv$/i ],
20
- [ "Formats::Excel", /\.xlsx?$/i ],
21
- [ "Formats::Json", /\.json$/i ],
22
- [ "Formats::Xml", /\.xml$/i ],
23
- [ "Formats::Xsl", /\.xsl$/i ],
24
- [ "Formats::Html", /\.html?$/i ],
25
- [ "Formats::Xhtml", /\.xhtml?$/i ],
26
- [ "Formats::Rdf", /\.rdf?$/i ],
27
- [ "Formats::Yaml", /\.ya?ml$/i ],
28
- [ "Formats::Pdf", /\.pdf$/i ]
29
- ]
30
- end
31
- end
32
-
33
-
@@ -1,170 +0,0 @@
1
- module IMW
2
- module Formats
3
-
4
- # Defines methods used for parsing and writing delimited data
5
- # formats (CSV, TSV, &c.) with the FasterCSV library. This
6
- # module is not used to directly extend a resource. Instead,
7
- # more specific modules (e.g. - IMW::Resources::Formats::Csv)
8
- # include this one and also define +delimited_options+ which is
9
- # actually what's passed to FasterCSV.
10
- #
11
- # @abstract
12
- module Delimited
13
-
14
- # Default options to be passed to
15
- # FasterCSV[http://fastercsv.rubyforge.org/]; see its
16
- # documentation for more information.
17
- #
18
- # @return [Hash]
19
- def delimited_options
20
- @delimited_options ||= {
21
- :headers => fields && fields.map { |field| field['name'] }
22
- }.merge(resource_options_compatible_with_faster_csv)
23
- end
24
-
25
- # Return the data in this delimited resource as an array of
26
- # arrays.
27
- #
28
- # Yield each outer array (row) if passed a block.
29
- #
30
- # @return [Array] the full data matrix
31
- # @yield [Array] each row of the data
32
- def load &block
33
- require 'fastercsv'
34
- FasterCSV.parse(read, delimited_options, &block)
35
- end
36
-
37
- # Gives us goodies! Needs +each+ below.
38
- include Enumerable
39
-
40
- # Call +block+ with each row in this delimited resource.
41
- def each &block
42
- require 'fastercsv'
43
- FasterCSV.new(io, delimited_options).each(&block)
44
- end
45
-
46
- # Emit a single array or an array of arrays into this resource.
47
- #
48
- # @param [Array<Array>, Array] data array or array of arrays to emit
49
- # @param [Hash] options
50
- # @option options [true, false] :persist Keep this resource's IO object open after emitting
51
- def emit data, options={}
52
- require 'fastercsv'
53
- data = [data] unless data.first.is_a?(Array)
54
- data.each do |row|
55
- write(FasterCSV.generate_line(row, delimited_options))
56
- end
57
- self
58
- end
59
- alias_method :<<, :emit
60
-
61
- # Do a heuristic check to determine whether or not the first row
62
- # of this delimited data is a row of headers.
63
- #
64
- # @return [true, false]
65
- def fields_in_first_line?
66
- # grab the header and up to 10 body rows
67
- require 'fastercsv'
68
- copy = FasterCSV.new(io, resource_options_compatible_with_faster_csv.merge(:headers => false))
69
- header = (copy.shift || []) rescue []
70
- body = 10.times.map { (copy.shift || []) rescue []}.flatten
71
-
72
- # guess how many elements in a row
73
- #size_guess = ((header.size + body.map(&:size).inject(0.0) { |e, s| s += e }).to_f / (1 + body.length).to_f).to_i
74
-
75
- # calculate the fraction of bytes that are [-A-Za-z_] (letters +
76
- # underscore + hyphen) for header and body and compute a
77
- # threshold determinant
78
- header_chars = header.map(&:to_s).join
79
- header_schema_bytes = header_chars.bytes.find_all { |byte| (byte >= 65 && byte <= 90) || (byte >= 97 && byte <= 122) || byte == 95 || byte == 45 }
80
- body_chars = body.map(&:to_s).join
81
- body_schema_bytes = body_chars.bytes.find_all { |byte| (byte >= 65 && byte <= 90) || (byte >= 97 && byte <= 122) || byte == 95 || byte == 45 }
82
- header_schema_fraction = header_schema_bytes.size.to_f / header_chars.size.to_f rescue nil
83
- body_schema_fraction = body_schema_bytes.size.to_f / body_chars.size.to_f rescue nil
84
- determinant = (body_schema_fraction - header_schema_fraction).abs / 2.0 rescue nil
85
-
86
- # decide, setting the threshold at 0.05 based on some guesswork...
87
- determinant && determinant >= 0.05
88
- end
89
-
90
- # If it seems like there are fields in the first line of this
91
- # data then go ahead and use them to define this resource's
92
- # fields.
93
- #
94
- # Will overwrite any fields already present for this resource.
95
- def guess_fields!
96
- return unless fields_in_first_line?
97
- copy = FasterCSV.new(io, resource_options_compatible_with_faster_csv.merge(:headers => false))
98
- names = (copy.shift || []) rescue []
99
- self.fields = names.map { |n| { 'name' => n } }
100
- delimited_options[:headers] = names
101
- end
102
-
103
- # Return a sample of rows from this file (the loop stops once more than 100 rows are collected).
104
- #
105
- # @return [Array<Array>]
106
- def snippet
107
- require 'fastercsv'
108
- [].tap do |rows|
109
- rows_sampled = 0
110
- begin
111
- each do |row|
112
- begin
113
- break if rows_sampled > 100
114
- row_size = row.size.to_f
115
- if (row.reject(&:blank?).size.to_f / row_size) >= 0.5
116
- rows << row.size.times.map { |index| row[index] }
117
- rows_sampled += 1
118
- end
119
- rescue => e
120
- next
121
- end
122
- end
123
- rescue => e
124
- end
125
- end
126
- end
127
-
128
- protected
129
- # An array of option names used by FasterCSV.
130
- FASTER_CSV_OPTION_NAMES = %w[col_sep row_sep quote_char encoding field_size_limit converters unconverted_fields headers return_headers write_headers header_converters skip_blanks force_quotes].map(&:to_sym)
131
-
132
- # Return the subset of options this resource was initialized
133
- # with that are compatible with FasterCSV (it complains when you
134
- # give it keywords it doesn't know).
135
- #
136
- # @return [Hash]
137
- def resource_options_compatible_with_faster_csv
138
- @compatible_options ||= {}.tap do |compatible_options|
139
- FASTER_CSV_OPTION_NAMES.each do |option_name|
140
- compatible_options[option_name] = resource_options[option_name] if resource_options.has_key?(option_name.to_sym)
141
- end
142
- end
143
- end
144
- end
145
-
146
- # A module for working with CSV (comma-separated value) formatted
147
- # data.
148
- #
149
- # @see IMW::Formats::Delimited
150
- module Csv
151
- include Delimited
152
- def delimited_options
153
- @delimited_options ||= {:col_sep => ","}.merge(super())
154
- end
155
- end
156
-
157
- # A module for working with TSV (tab-separated value) formatted
158
- # data.
159
- #
160
- # @see IMW::Formats::Delimited
161
- module Tsv
162
- include Delimited
163
- def delimited_options
164
- @delimited_options ||= {
165
- :col_sep => "\t",
166
- }.merge(super())
167
- end
168
- end
169
- end
170
- end
@@ -1,100 +0,0 @@
1
- module IMW
2
- module Formats
3
-
4
- # Defines methods for reading and writing Microsoft Excel data.
5
- module Excel
6
-
7
- # Ensure that this Excel resource is described by an ordered
8
- # collection of flat fields.
9
- def validate_schema!
10
- raise IMW::SchemaError.new("#{self.class} resources must be described by an ordered set of flat fields") if schema.any?(&:nested?)
11
- end
12
-
13
- # Return the data in this Excel document as an array of arrays.
14
- #
15
- # Data from consecutive worksheets will be concatenated into a
16
- # single outer array.
17
- #
18
- # @return [Array<Array>]
19
- def load
20
- require 'spreadsheet'
21
- data = []
22
- Spreadsheet.open(path).worksheets.each do |worksheet|
23
- data += worksheet.map do |row|
24
- row.to_a
25
- end
26
- end
27
- data
28
- end
29
-
30
- # Gives us goodies! Needs +each+ below.
31
- include Enumerable
32
-
33
- # Yield each row of this Excel document.
34
- #
35
- # Will loop from one worksheet to the next.
36
- #
37
- # @yield [Spreadsheet::Excel::Row]
38
- def each &block
39
- require 'spreadsheet'
40
- Spreadsheet.open(path).worksheets.each do |worksheet|
41
- worksheet.each(&block)
42
- end
43
- end
44
-
45
- # Return the number of lines in this Excel document.
46
- #
47
- # Measured across worksheets.
48
- #
49
- # @return [Integer]
50
- def num_lines
51
- require 'spreadsheet'
52
- Spreadsheet.open(path).worksheets.inject(0) do |sum, worksheet|
53
- sum += worksheet.row_count
54
- end
55
- end
56
-
57
- # TODO
58
- #
59
- # def emit
60
- # end
61
-
62
- # TODO
63
- #
64
- # Extract the following methods from delimited into a module and
65
- # let both Excel and Delimited use them.
66
- #
67
- # Or let Excel include Delimited and let it override
68
- # appropriately.
69
- #
70
- # headers_in_first_line?
71
- # guess_schema!
72
- #
73
- #
74
-
75
- #
76
- def snippet
77
- require 'spreadsheet'
78
- [].tap do |snip|
79
- rows_sampled = 0
80
- Spreadsheet.open(path).worksheets.each do |worksheet|
81
- worksheet.each do |row|
82
- begin
83
- break if rows_sampled > 100
84
- row_size = row.size.to_f
85
- if (row.reject(&:blank?).size.to_f / row_size) > 0.5
86
- snip << row.to_a
87
- rows_sampled += 1
88
- end
89
- rescue => e
90
- next
91
- end
92
- end
93
- break if rows_sampled > 10
94
- end
95
- end
96
- end
97
- end
98
- end
99
- end
100
-
@@ -1,41 +0,0 @@
1
- module IMW
2
- module Formats
3
-
4
- # Defines methods for reading and writing JSON data.
5
- module Json
6
-
7
- include Enumerable
8
-
9
- # Return the content of this resource.
10
- #
11
- # Will pass a block to the outermost JSON data structure's each
12
- # method.
13
- #
14
- # @return [Hash, Array, String, Fixnum] whatever the JSON contained
15
- def load &block
16
- require 'json'
17
- json = JSON.parse(read)
18
- if block_given?
19
- json.each(&block)
20
- else
21
- json
22
- end
23
- end
24
-
25
- # Iterate over the elements in the JSON.
26
- def each &block
27
- load(&block)
28
- end
29
-
30
- # Emit the +data+ into this resource. It must be opened for
31
- # writing.
32
- #
33
- # @param [Hash, String, Array, Fixnum] data the Ruby object to emit
34
- def emit data, options={}
35
- require 'json'
36
- write(data.to_json)
37
- self
38
- end
39
- end
40
- end
41
- end
@@ -1,71 +0,0 @@
1
- module IMW
2
- module Formats
3
-
4
- # Defines methods for parsing and generating PDF.
5
- #
6
- # Uses PDF::Reader for parsing and Prawn for generating.
7
- module Pdf
8
-
9
- # Return a snippet of text from this PDF.
10
- #
11
- # @return [String]
12
- def snippet
13
- begin
14
- require 'pdf/reader'
15
- snippetizer = Snippetizer.new
16
- PDF::Reader.file(path, snippetizer)
17
- snippetizer.snippet
18
- rescue Snippetizer::SnippetEndError
19
- snippetizer.snippet
20
- rescue
21
- ''
22
- end
23
- end
24
-
25
- # A receiver class used by PDF::Reader which agglomerates text
26
- # up to 1024 bytes and then bails.
27
- class Snippetizer
28
-
29
- # A custom error class that can be thrown while receiving text
30
- # from PDF::Reader to cut-short walking large PDF documents.
31
- SnippetEndError = Class.new(IMW::Error)
32
-
33
- # The snippet being built by this snippetizer.
34
- attr_accessor :snippet
35
-
36
- def initialize
37
- @snippet = ''
38
- end
39
-
40
- # Agglomerates text from PDF::Reader up to a fixed size of
41
- # 1024 bytes.
42
- #
43
- # Will convert a single-space line from PDF::Reader into a
44
- # newline character.
45
- #
46
- # FIXME How does the receiver ask PDF::Reader to abort walking
47
- # the document now that enough text has been returned? Till a
48
- # more graceful way is found this method simply raises an
49
- # error, creating a GOTO...
50
- def show_text *params
51
- params.each do |string|
52
- if @snippet.size < 1024
53
- if string == ' '
54
- @snippet += "\n"
55
- else
56
- @snippet += string[0..1024]
57
- end
58
- else
59
- raise SnippetEndError.new
60
- end
61
- end
62
- end
63
- alias_method :show_text_with_positioning, :show_text
64
- alias_method :move_to_next_line_and_show_text, :show_text
65
- alias_method :set_spacing_next_line_show_text, :show_text
66
- end
67
-
68
- end
69
- end
70
- end
71
-
@@ -1,69 +0,0 @@
1
- module IMW
2
- module Formats
3
-
4
- # Defines methods to parse SGML-derived data formats (XML, HTML,
5
- # &c.). This module isn't directly used to extend resources.
6
- # Instead, more specific modules (e.g. -
7
- # IMW::Resources::Formats::Xml) are used.
8
- module Sgml
9
-
10
- # Parse this resource using Hpricot and return (or yield if
11
- # given a block) the resulting Hpricot::Doc.
12
- #
13
- # @return [Hpricot::Doc]
14
- # @yield [Hpricot::Doc]
15
- def load &block
16
- require 'hpricot'
17
- sgml = Hpricot(io)
18
- if block_given?
19
- yield sgml
20
- else
21
- sgml
22
- end
23
- end
24
-
25
- # Parse the Hpricot::Doc of this resource with the given
26
- # +parser+.
27
- #
28
- # The parser can either be an IMW::Parsers::HtmlParser or a
29
- # hash which will be used to build such a parser. See the
30
- # documentation for IMW::Parsers::HtmlParser for more
31
- # information.
32
- #
33
- # @param [Hash, IMW::Parsers::HtmlParser] parser
34
- # @return [Hash] the parser's output
35
- def parse parser
36
- if parser.is_a?(IMW::Parsers::HtmlParser)
37
- parser.parse(load)
38
- else
39
- IMW::Parsers::HtmlParser.new(parser).parse(load)
40
- end
41
- end
42
- end
43
-
44
- # Defines methods for XML data.
45
- module Xml
46
- include Sgml
47
- end
48
-
49
- # Defines methods for XSL data.
50
- module Xsl
51
- include Sgml
52
- end
53
-
54
- # Defines methods for XHTML data.
55
- module Xhtml
56
- include Sgml
57
- end
58
-
59
- # Defines methods for HTML data.
60
- module Html
61
- include Sgml
62
- end
63
-
64
- # Defines methods for RDF data.
65
- module Rdf
66
- include Sgml
67
- end
68
- end
69
- end