imw 0.2.18 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172) hide show
  1. data/Gemfile +7 -26
  2. data/Gemfile.lock +13 -38
  3. data/{LICENSE → LICENSE.txt} +1 -1
  4. data/README.textile +35 -0
  5. data/Rakefile +45 -22
  6. data/VERSION +1 -1
  7. data/examples/foo.rb +19 -0
  8. data/examples/html_selector.rb +22 -0
  9. data/examples/nes_game_list.csv +625 -0
  10. data/examples/nes_gamespot.csv +1371 -0
  11. data/examples/nes_nintendo.csv +624 -0
  12. data/examples/nes_unlicensed.csv +89 -0
  13. data/examples/nes_wikipedia.csv +710 -0
  14. data/examples/nibbler_test.rb +24 -0
  15. data/examples/script.rb +19 -0
  16. data/lib/imw.rb +28 -140
  17. data/lib/imw/error.rb +9 -0
  18. data/lib/imw/recordizer.rb +8 -0
  19. data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
  20. data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
  21. data/lib/imw/resource.rb +3 -119
  22. data/lib/imw/serializer.rb +7 -0
  23. data/lib/imw/serializer/json_serializer.rb +17 -0
  24. data/lib/imw/uri.rb +41 -0
  25. data/spec/resource_spec.rb +78 -0
  26. data/spec/uri_spec.rb +55 -0
  27. metadata +81 -232
  28. data/README.rdoc +0 -371
  29. data/bin/imw +0 -5
  30. data/bin/tsv_to_json.rb +0 -29
  31. data/etc/imwrc.rb +0 -26
  32. data/examples/dataset.rb +0 -12
  33. data/examples/metadata.yml +0 -10
  34. data/lib/imw/archives.rb +0 -120
  35. data/lib/imw/archives/rar.rb +0 -19
  36. data/lib/imw/archives/tar.rb +0 -19
  37. data/lib/imw/archives/tarbz2.rb +0 -73
  38. data/lib/imw/archives/targz.rb +0 -73
  39. data/lib/imw/archives/zip.rb +0 -51
  40. data/lib/imw/boot.rb +0 -87
  41. data/lib/imw/compressed_files.rb +0 -94
  42. data/lib/imw/compressed_files/bz2.rb +0 -16
  43. data/lib/imw/compressed_files/compressible.rb +0 -75
  44. data/lib/imw/compressed_files/gz.rb +0 -16
  45. data/lib/imw/dataset.rb +0 -125
  46. data/lib/imw/dataset/paths.rb +0 -29
  47. data/lib/imw/dataset/workflow.rb +0 -195
  48. data/lib/imw/formats.rb +0 -33
  49. data/lib/imw/formats/delimited.rb +0 -170
  50. data/lib/imw/formats/excel.rb +0 -100
  51. data/lib/imw/formats/json.rb +0 -41
  52. data/lib/imw/formats/pdf.rb +0 -71
  53. data/lib/imw/formats/sgml.rb +0 -69
  54. data/lib/imw/formats/yaml.rb +0 -41
  55. data/lib/imw/metadata.rb +0 -83
  56. data/lib/imw/metadata/contains_metadata.rb +0 -54
  57. data/lib/imw/metadata/dsl.rb +0 -111
  58. data/lib/imw/metadata/field.rb +0 -37
  59. data/lib/imw/metadata/has_metadata.rb +0 -98
  60. data/lib/imw/metadata/has_summary.rb +0 -57
  61. data/lib/imw/metadata/schema.rb +0 -17
  62. data/lib/imw/parsers.rb +0 -8
  63. data/lib/imw/parsers/flat.rb +0 -44
  64. data/lib/imw/parsers/html_parser.rb +0 -387
  65. data/lib/imw/parsers/html_parser/matchers.rb +0 -289
  66. data/lib/imw/parsers/line_parser.rb +0 -87
  67. data/lib/imw/parsers/regexp_parser.rb +0 -72
  68. data/lib/imw/repository.rb +0 -12
  69. data/lib/imw/runner.rb +0 -118
  70. data/lib/imw/schemes.rb +0 -23
  71. data/lib/imw/schemes/ftp.rb +0 -142
  72. data/lib/imw/schemes/hdfs.rb +0 -251
  73. data/lib/imw/schemes/http.rb +0 -165
  74. data/lib/imw/schemes/local.rb +0 -409
  75. data/lib/imw/schemes/remote.rb +0 -119
  76. data/lib/imw/schemes/s3.rb +0 -143
  77. data/lib/imw/schemes/sql.rb +0 -129
  78. data/lib/imw/tools.rb +0 -12
  79. data/lib/imw/tools/aggregator.rb +0 -148
  80. data/lib/imw/tools/archiver.rb +0 -220
  81. data/lib/imw/tools/downloader.rb +0 -63
  82. data/lib/imw/tools/extension_analyzer.rb +0 -114
  83. data/lib/imw/tools/summarizer.rb +0 -83
  84. data/lib/imw/tools/transferer.rb +0 -167
  85. data/lib/imw/utils.rb +0 -74
  86. data/lib/imw/utils/dynamically_extendable.rb +0 -137
  87. data/lib/imw/utils/error.rb +0 -59
  88. data/lib/imw/utils/extensions/hpricot.rb +0 -34
  89. data/lib/imw/utils/has_uri.rb +0 -131
  90. data/lib/imw/utils/log.rb +0 -92
  91. data/lib/imw/utils/misc.rb +0 -57
  92. data/lib/imw/utils/paths.rb +0 -146
  93. data/lib/imw/utils/uri.rb +0 -59
  94. data/lib/imw/utils/uuid.rb +0 -33
  95. data/lib/imw/utils/validate.rb +0 -38
  96. data/lib/imw/utils/version.rb +0 -11
  97. data/spec/data/formats/delimited/sample.csv +0 -131
  98. data/spec/data/formats/delimited/sample.tsv +0 -131
  99. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
  100. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
  101. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
  102. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
  103. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
  104. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
  105. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
  106. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
  107. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
  108. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
  109. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
  110. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
  111. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
  112. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
  113. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
  114. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
  115. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
  116. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
  117. data/spec/data/formats/excel/sample.xls +0 -0
  118. data/spec/data/formats/json/sample.json +0 -1
  119. data/spec/data/formats/none/sample +0 -650
  120. data/spec/data/formats/sgml/sample.xml +0 -617
  121. data/spec/data/formats/text/sample.txt +0 -650
  122. data/spec/data/formats/yaml/sample.yaml +0 -410
  123. data/spec/data/schema-tabular.yaml +0 -11
  124. data/spec/imw/archives/rar_spec.rb +0 -16
  125. data/spec/imw/archives/tar_spec.rb +0 -16
  126. data/spec/imw/archives/tarbz2_spec.rb +0 -24
  127. data/spec/imw/archives/targz_spec.rb +0 -21
  128. data/spec/imw/archives/zip_spec.rb +0 -16
  129. data/spec/imw/archives_spec.rb +0 -77
  130. data/spec/imw/compressed_files/bz2_spec.rb +0 -15
  131. data/spec/imw/compressed_files/compressible_spec.rb +0 -36
  132. data/spec/imw/compressed_files/gz_spec.rb +0 -15
  133. data/spec/imw/compressed_files_spec.rb +0 -47
  134. data/spec/imw/dataset/paths_spec.rb +0 -32
  135. data/spec/imw/dataset/workflow_spec.rb +0 -41
  136. data/spec/imw/formats/delimited_spec.rb +0 -44
  137. data/spec/imw/formats/excel_spec.rb +0 -55
  138. data/spec/imw/formats/json_spec.rb +0 -18
  139. data/spec/imw/formats/sgml_spec.rb +0 -24
  140. data/spec/imw/formats/yaml_spec.rb +0 -19
  141. data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
  142. data/spec/imw/metadata/field_spec.rb +0 -25
  143. data/spec/imw/metadata/has_metadata_spec.rb +0 -58
  144. data/spec/imw/metadata/has_summary_spec.rb +0 -32
  145. data/spec/imw/metadata/schema_spec.rb +0 -24
  146. data/spec/imw/metadata_spec.rb +0 -86
  147. data/spec/imw/parsers/line_parser_spec.rb +0 -96
  148. data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
  149. data/spec/imw/resource_spec.rb +0 -32
  150. data/spec/imw/schemes/hdfs_spec.rb +0 -67
  151. data/spec/imw/schemes/http_spec.rb +0 -19
  152. data/spec/imw/schemes/local_spec.rb +0 -165
  153. data/spec/imw/schemes/remote_spec.rb +0 -38
  154. data/spec/imw/schemes/s3_spec.rb +0 -31
  155. data/spec/imw/schemes/sql_spec.rb +0 -3
  156. data/spec/imw/tools/aggregator_spec.rb +0 -71
  157. data/spec/imw/tools/archiver_spec.rb +0 -120
  158. data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
  159. data/spec/imw/tools/summarizer_spec.rb +0 -8
  160. data/spec/imw/tools/transferer_spec.rb +0 -195
  161. data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
  162. data/spec/imw/utils/has_uri_spec.rb +0 -61
  163. data/spec/imw/utils/paths_spec.rb +0 -10
  164. data/spec/imw/utils/shared_paths_spec.rb +0 -29
  165. data/spec/imw_spec.rb +0 -14
  166. data/spec/rcov.opts +0 -1
  167. data/spec/spec_helper.rb +0 -31
  168. data/spec/support/custom_matchers.rb +0 -28
  169. data/spec/support/file_contents_matcher.rb +0 -30
  170. data/spec/support/paths_matcher.rb +0 -66
  171. data/spec/support/random.rb +0 -213
  172. data/spec/support/without_regard_to_order_matcher.rb +0 -41
@@ -1,33 +0,0 @@
1
- module IMW
2
- module Formats
3
- autoload :Csv, 'imw/formats/delimited'
4
- autoload :Tsv, 'imw/formats/delimited'
5
- autoload :Excel, 'imw/formats/excel'
6
- autoload :Json, 'imw/formats/json'
7
- autoload :Xml, 'imw/formats/sgml'
8
- autoload :Xsl, 'imw/formats/sgml'
9
- autoload :Html, 'imw/formats/sgml'
10
- autoload :Xhtml, 'imw/formats/sgml'
11
- autoload :Rdf, 'imw/formats/sgml'
12
- autoload :Yaml, 'imw/formats/yaml'
13
- autoload :Pdf, 'imw/formats/pdf'
14
-
15
- # Handlers which augment a resource with data format specific
16
- # methods.
17
- HANDLERS = [
18
- [ "Formats::Csv", /\.csv$/i ],
19
- [ "Formats::Tsv", /\.tsv$/i ],
20
- [ "Formats::Excel", /\.xlsx?$/i ],
21
- [ "Formats::Json", /\.json$/i ],
22
- [ "Formats::Xml", /\.xml$/i ],
23
- [ "Formats::Xsl", /\.xsl$/i ],
24
- [ "Formats::Html", /\.html?$/i ],
25
- [ "Formats::Xhtml", /\.xhtml?$/i ],
26
- [ "Formats::Rdf", /\.rdf?$/i ],
27
- [ "Formats::Yaml", /\.ya?ml$/i ],
28
- [ "Formats::Pdf", /\.pdf$/i ]
29
- ]
30
- end
31
- end
32
-
33
-
@@ -1,170 +0,0 @@
1
- module IMW
2
- module Formats
3
-
4
- # Defines methods used for parsing and writing delimited data
5
- # formats (CSV, TSV, &c.) with the FasterCSV library. This
6
- # module is not used to directly extend a resource. Instead,
7
- # more specific modules (e.g. - IMW::Resources::Formats::Csv)
8
- # include this one and also define +delimited_options+ which is
9
- # actually what's passed to FasterCSV.
10
- #
11
- # @abstract
12
- module Delimited
13
-
14
- # Default options to be passed to
15
- # FasterCSV[http://fastercsv.rubyforge.org/]; see its
16
- # documentation for more information.
17
- #
18
- # @return [Hash]
19
- def delimited_options
20
- @delimited_options ||= {
21
- :headers => fields && fields.map { |field| field['name'] }
22
- }.merge(resource_options_compatible_with_faster_csv)
23
- end
24
-
25
- # Return the data in this delimited resource as an array of
26
- # arrays.
27
- #
28
- # Yield each outer array (row) if passed a block.
29
- #
30
- # @return [Array] the full data matrix
31
- # @yield [Array] each row of the data
32
- def load &block
33
- require 'fastercsv'
34
- FasterCSV.parse(read, delimited_options, &block)
35
- end
36
-
37
- # Gives us goodies! Needs +each+ below.
38
- include Enumerable
39
-
40
- # Call +block+ with each row in this delimited resource.
41
- def each &block
42
- require 'fastercsv'
43
- FasterCSV.new(io, delimited_options).each(&block)
44
- end
45
-
46
- # Emit a single array or an array of arrays into this resource.
47
- #
48
- # @param [Array<Array>, Array] data array or array of arrays to emit
49
- # @param [Hash] options
50
- # @option options [true, false] :persist Keep this resource's IO object open after emitting
51
- def emit data, options={}
52
- require 'fastercsv'
53
- data = [data] unless data.first.is_a?(Array)
54
- data.each do |row|
55
- write(FasterCSV.generate_line(row, delimited_options))
56
- end
57
- self
58
- end
59
- alias_method :<<, :emit
60
-
61
- # Do a heuristic check to determine whether or not the first row
62
- # of this delimited data is a row of headers.
63
- #
64
- # @return [true, false]
65
- def fields_in_first_line?
66
- # grab the header and up to 10 body rows
67
- require 'fastercsv'
68
- copy = FasterCSV.new(io, resource_options_compatible_with_faster_csv.merge(:headers => false))
69
- header = (copy.shift || []) rescue []
70
- body = 10.times.map { (copy.shift || []) rescue []}.flatten
71
-
72
- # guess how many elements in a row
73
- #size_guess = ((header.size + body.map(&:size).inject(0.0) { |e, s| s += e }).to_f / (1 + body.length).to_f).to_i
74
-
75
- # calculate the fraction of bytes that are [-A-Za-z_] (letters +
76
- # underscore + hyphen) for header and body and compute a
77
- # threshold determinant
78
- header_chars = header.map(&:to_s).join
79
- header_schema_bytes = header_chars.bytes.find_all { |byte| (byte >= 65 && byte <= 90) || (byte >= 97 && byte <= 122) || byte == 95 || byte == 45 }
80
- body_chars = body.map(&:to_s).join
81
- body_schema_bytes = body_chars.bytes.find_all { |byte| (byte >= 65 && byte <= 90) || (byte >= 97 && byte <= 122) || byte == 95 || byte == 45 }
82
- header_schema_fraction = header_schema_bytes.size.to_f / header_chars.size.to_f rescue nil
83
- body_schema_fraction = body_schema_bytes.size.to_f / body_chars.size.to_f rescue nil
84
- determinant = (body_schema_fraction - header_schema_fraction).abs / 2.0 rescue nil
85
-
86
- # decide, setting the threshold at 0.05 based on some guesswork...
87
- determinant && determinant >= 0.05
88
- end
89
-
90
- # If it seems like there are fields in the first line of this
91
- # data then go ahead and use them to define this resource's
92
- # fields.
93
- #
94
- # Will overwrite any fields already present for this resource.
95
- def guess_fields!
96
- return unless fields_in_first_line?
97
- copy = FasterCSV.new(io, resource_options_compatible_with_faster_csv.merge(:headers => false))
98
- names = (copy.shift || []) rescue []
99
- self.fields = names.map { |n| { 'name' => n } }
100
- delimited_options[:headers] = names
101
- end
102
-
103
- # Return a 10-line sample of this file.
104
- #
105
- # @return [Array<Array>]
106
- def snippet
107
- require 'fastercsv'
108
- [].tap do |rows|
109
- rows_sampled = 0
110
- begin
111
- each do |row|
112
- begin
113
- break if rows_sampled > 100
114
- row_size = row.size.to_f
115
- if (row.reject(&:blank?).size.to_f / row_size) >= 0.5
116
- rows << row.size.times.map { |index| row[index] }
117
- rows_sampled += 1
118
- end
119
- rescue => e
120
- next
121
- end
122
- end
123
- rescue => e
124
- end
125
- end
126
- end
127
-
128
- protected
129
- # An array of option names used by FasterCSV.
130
- FASTER_CSV_OPTION_NAMES = %w[col_sep row_sep quote_char encoding field_size_limit converters unconverted_fields headers return_headers write_headers header_converters skip_blanks force_quotes].map(&:to_sym)
131
-
132
- # Return the subset of options this resource was initialized
133
- # with that are compatible with FasterCSV (it complains when you
134
- # give it keywords it doesn't know).
135
- #
136
- # @return [Hash]
137
- def resource_options_compatible_with_faster_csv
138
- @compatible_options ||= {}.tap do |compatible_options|
139
- FASTER_CSV_OPTION_NAMES.each do |option_name|
140
- compatible_options[option_name] = resource_options[option_name] if resource_options.has_key?(option_name.to_sym)
141
- end
142
- end
143
- end
144
- end
145
-
146
- # A module for working with CSV (comma-separated value) formatted
147
- # data.
148
- #
149
- # @see IMW::Formats::Delimited
150
- module Csv
151
- include Delimited
152
- def delimited_options
153
- @delimited_options ||= {:col_sep => ","}.merge(super())
154
- end
155
- end
156
-
157
- # A module for working with TSV (tab-separated value) formatted
158
- # data.
159
- #
160
- # @see IMW::Formats::Delimited
161
- module Tsv
162
- include Delimited
163
- def delimited_options
164
- @delimited_options ||= {
165
- :col_sep => "\t",
166
- }.merge(super())
167
- end
168
- end
169
- end
170
- end
@@ -1,100 +0,0 @@
1
- module IMW
2
- module Formats
3
-
4
- # Defines methods for reading and writing Microsoft Excel data.
5
- module Excel
6
-
7
- # Ensure that this Excel resource is described by an ordered
8
- # collection of flat fields.
9
- def validate_schema!
10
- raise IMW::SchemaError.new("#{self.class} resources must be described by an ordered set of flat fields") if schema.any?(&:nested?)
11
- end
12
-
13
- # Return the data in this Excel document as an array of arrays.
14
- #
15
- # Data from consecutive worksheets will be concatenated into a
16
- # single outer array.
17
- #
18
- # @return [Array<Array>]
19
- def load
20
- require 'spreadsheet'
21
- data = []
22
- Spreadsheet.open(path).worksheets.each do |worksheet|
23
- data += worksheet.map do |row|
24
- row.to_a
25
- end
26
- end
27
- data
28
- end
29
-
30
- # Gives us goodies! Needs +each+ below.
31
- include Enumerable
32
-
33
- # Yield each row of this Excel document.
34
- #
35
- # Will loop from one worksheet to the next.
36
- #
37
- # @yield [Spreadsheet::Excel::Row]
38
- def each &block
39
- require 'spreadsheet'
40
- Spreadsheet.open(path).worksheets.each do |worksheet|
41
- worksheet.each(&block)
42
- end
43
- end
44
-
45
- # Return the number of lines in this Excel document.
46
- #
47
- # Measured across worksheets.
48
- #
49
- # @return [Integer]
50
- def num_lines
51
- require 'spreadsheet'
52
- Spreadsheet.open(path).worksheets.inject(0) do |sum, worksheet|
53
- sum += worksheet.row_count
54
- end
55
- end
56
-
57
- # TODO
58
- #
59
- # def emit
60
- # end
61
-
62
- # TODO
63
- #
64
- # Extract the following methods from delimited into a module and
65
- # let both Excel and Delimited use them.
66
- #
67
- # Or let Excel include Delimited and let it override
68
- # appropriately.
69
- #
70
- # headers_in_first_line?
71
- # guess_schema!
72
- #
73
- #
74
-
75
- #
76
- def snippet
77
- require 'spreadsheet'
78
- [].tap do |snip|
79
- rows_sampled = 0
80
- Spreadsheet.open(path).worksheets.each do |worksheet|
81
- worksheet.each do |row|
82
- begin
83
- break if rows_sampled > 100
84
- row_size = row.size.to_f
85
- if (row.reject(&:blank?).size.to_f / row_size) > 0.5
86
- snip << row.to_a
87
- rows_sampled += 1
88
- end
89
- rescue => e
90
- next
91
- end
92
- end
93
- break if rows_sampled > 10
94
- end
95
- end
96
- end
97
- end
98
- end
99
- end
100
-
@@ -1,41 +0,0 @@
1
- module IMW
2
- module Formats
3
-
4
- # Defines methods for reading and writing JSON data.
5
- module Json
6
-
7
- include Enumerable
8
-
9
- # Return the content of this resource.
10
- #
11
- # Will pass a block to the outermost JSON data structure's each
12
- # method.
13
- #
14
- # @return [Hash, Array, String, Fixnum] whatever the JSON contained
15
- def load &block
16
- require 'json'
17
- json = JSON.parse(read)
18
- if block_given?
19
- json.each(&block)
20
- else
21
- json
22
- end
23
- end
24
-
25
- # Iterate over the elements in the JSON.
26
- def each &block
27
- load(&block)
28
- end
29
-
30
- # Emit the +data+ into this resource. It must be opened for
31
- # writing.
32
- #
33
- # @param [Hash, String, Array, Fixnum] data the Ruby object to emit
34
- def emit data, options={}
35
- require 'json'
36
- write(data.to_json)
37
- self
38
- end
39
- end
40
- end
41
- end
@@ -1,71 +0,0 @@
1
- module IMW
2
- module Formats
3
-
4
- # Defines methods for parsing and generating PDF.
5
- #
6
- # Uses PDF::Reader for parsing and Prawn for generating.
7
- module Pdf
8
-
9
- # Return a snippet of text from this PDF.
10
- #
11
- # @return [String]
12
- def snippet
13
- begin
14
- require 'pdf/reader'
15
- snippetizer = Snippetizer.new
16
- PDF::Reader.file(path, snippetizer)
17
- snippetizer.snippet
18
- rescue Snippetizer::SnippetEndError
19
- snippetizer.snippet
20
- rescue
21
- ''
22
- end
23
- end
24
-
25
- # A receiver class used by PDF::Reader which agglomerates text
26
- # up to 1024 bytes and then bails.
27
- class Snippetizer
28
-
29
- # A custom error class that can be thrown while receiving text
30
- # from PDF::Reader to cut-short walking large PDF documents.
31
- SnippetEndError = Class.new(IMW::Error)
32
-
33
- # The snippet being built by this snippetizer.
34
- attr_accessor :snippet
35
-
36
- def initialize
37
- @snippet = ''
38
- end
39
-
40
- # Agglomerates text from PDF::Reader up to a fixed size of
41
- # 1024 bytes.
42
- #
43
- # Will convert a single-space line from PDF::Reader into a
44
- # newline character.
45
- #
46
- # FIXME How does the receiver ask PDF::Reader to abort walking
47
- # the document now that enough text has been returned? Till a
48
- # more graceful way is found this method simply raises an
49
- # error, creating a GOTO...
50
- def show_text *params
51
- params.each do |string|
52
- if @snippet.size < 1024
53
- if string == ' '
54
- @snippet += "\n"
55
- else
56
- @snippet += string[0..1024]
57
- end
58
- else
59
- raise SnippetEndError.new
60
- end
61
- end
62
- end
63
- alias_method :show_text_with_positioning, :show_text
64
- alias_method :move_to_next_line_and_show_text, :show_text
65
- alias_method :set_spacing_next_line_show_text, :show_text
66
- end
67
-
68
- end
69
- end
70
- end
71
-
@@ -1,69 +0,0 @@
1
- module IMW
2
- module Formats
3
-
4
- # Defines methods to parse SGML-derived data formats (XML, HTML,
5
- # &c.). This module isn't directly used to extend resources.
6
- # Instead, more specific modules (e.g. -
7
- # IMW::Resources::Formats::Xml) are used.
8
- module Sgml
9
-
10
- # Parse this resource using Hpricot and return (or yield if
11
- # given a block) the resulting Hpricot::Doc.
12
- #
13
- # @return [Hpricot::Doc]
14
- # @yield [Hpricot::Doc]
15
- def load &block
16
- require 'hpricot'
17
- sgml = Hpricot(io)
18
- if block_given?
19
- yield sgml
20
- else
21
- sgml
22
- end
23
- end
24
-
25
- # Parse the Hpricot::Doc of this resource with the given
26
- # +parser+.
27
- #
28
- # The parser can either be an IMW::Parsers::HtmlParser or a
29
- # hash which will be used to build such a parser. See the
30
- # documentation for IMW::Parsers::HtmlParser for more
31
- # information.
32
- #
33
- # @param [Hash, IMW::Parsers::HtmlParser] parser
34
- # @return [Hash] the parser's output
35
- def parse parser
36
- if parser.is_a?(IMW::Parsers::HtmlParser)
37
- parser.parse(load)
38
- else
39
- IMW::Parsers::HtmlParser.new(parser).parse(load)
40
- end
41
- end
42
- end
43
-
44
- # Defines methods for XML data.
45
- module Xml
46
- include Sgml
47
- end
48
-
49
- # Defines methods for XSL data.
50
- module Xsl
51
- include Sgml
52
- end
53
-
54
- # Defines methods for XHTML data.
55
- module Xhtml
56
- include Sgml
57
- end
58
-
59
- # Defines methods for HTML data.
60
- module Html
61
- include Sgml
62
- end
63
-
64
- # Defines methods for RDF data.
65
- module Rdf
66
- include Sgml
67
- end
68
- end
69
- end