imw 0.2.18 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +7 -26
- data/Gemfile.lock +13 -38
- data/{LICENSE → LICENSE.txt} +1 -1
- data/README.textile +35 -0
- data/Rakefile +45 -22
- data/VERSION +1 -1
- data/examples/foo.rb +19 -0
- data/examples/html_selector.rb +22 -0
- data/examples/nes_game_list.csv +625 -0
- data/examples/nes_gamespot.csv +1371 -0
- data/examples/nes_nintendo.csv +624 -0
- data/examples/nes_unlicensed.csv +89 -0
- data/examples/nes_wikipedia.csv +710 -0
- data/examples/nibbler_test.rb +24 -0
- data/examples/script.rb +19 -0
- data/lib/imw.rb +28 -140
- data/lib/imw/error.rb +9 -0
- data/lib/imw/recordizer.rb +8 -0
- data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
- data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
- data/lib/imw/resource.rb +3 -119
- data/lib/imw/serializer.rb +7 -0
- data/lib/imw/serializer/json_serializer.rb +17 -0
- data/lib/imw/uri.rb +41 -0
- data/spec/resource_spec.rb +78 -0
- data/spec/uri_spec.rb +55 -0
- metadata +81 -232
- data/README.rdoc +0 -371
- data/bin/imw +0 -5
- data/bin/tsv_to_json.rb +0 -29
- data/etc/imwrc.rb +0 -26
- data/examples/dataset.rb +0 -12
- data/examples/metadata.yml +0 -10
- data/lib/imw/archives.rb +0 -120
- data/lib/imw/archives/rar.rb +0 -19
- data/lib/imw/archives/tar.rb +0 -19
- data/lib/imw/archives/tarbz2.rb +0 -73
- data/lib/imw/archives/targz.rb +0 -73
- data/lib/imw/archives/zip.rb +0 -51
- data/lib/imw/boot.rb +0 -87
- data/lib/imw/compressed_files.rb +0 -94
- data/lib/imw/compressed_files/bz2.rb +0 -16
- data/lib/imw/compressed_files/compressible.rb +0 -75
- data/lib/imw/compressed_files/gz.rb +0 -16
- data/lib/imw/dataset.rb +0 -125
- data/lib/imw/dataset/paths.rb +0 -29
- data/lib/imw/dataset/workflow.rb +0 -195
- data/lib/imw/formats.rb +0 -33
- data/lib/imw/formats/delimited.rb +0 -170
- data/lib/imw/formats/excel.rb +0 -100
- data/lib/imw/formats/json.rb +0 -41
- data/lib/imw/formats/pdf.rb +0 -71
- data/lib/imw/formats/sgml.rb +0 -69
- data/lib/imw/formats/yaml.rb +0 -41
- data/lib/imw/metadata.rb +0 -83
- data/lib/imw/metadata/contains_metadata.rb +0 -54
- data/lib/imw/metadata/dsl.rb +0 -111
- data/lib/imw/metadata/field.rb +0 -37
- data/lib/imw/metadata/has_metadata.rb +0 -98
- data/lib/imw/metadata/has_summary.rb +0 -57
- data/lib/imw/metadata/schema.rb +0 -17
- data/lib/imw/parsers.rb +0 -8
- data/lib/imw/parsers/flat.rb +0 -44
- data/lib/imw/parsers/html_parser.rb +0 -387
- data/lib/imw/parsers/html_parser/matchers.rb +0 -289
- data/lib/imw/parsers/line_parser.rb +0 -87
- data/lib/imw/parsers/regexp_parser.rb +0 -72
- data/lib/imw/repository.rb +0 -12
- data/lib/imw/runner.rb +0 -118
- data/lib/imw/schemes.rb +0 -23
- data/lib/imw/schemes/ftp.rb +0 -142
- data/lib/imw/schemes/hdfs.rb +0 -251
- data/lib/imw/schemes/http.rb +0 -165
- data/lib/imw/schemes/local.rb +0 -409
- data/lib/imw/schemes/remote.rb +0 -119
- data/lib/imw/schemes/s3.rb +0 -143
- data/lib/imw/schemes/sql.rb +0 -129
- data/lib/imw/tools.rb +0 -12
- data/lib/imw/tools/aggregator.rb +0 -148
- data/lib/imw/tools/archiver.rb +0 -220
- data/lib/imw/tools/downloader.rb +0 -63
- data/lib/imw/tools/extension_analyzer.rb +0 -114
- data/lib/imw/tools/summarizer.rb +0 -83
- data/lib/imw/tools/transferer.rb +0 -167
- data/lib/imw/utils.rb +0 -74
- data/lib/imw/utils/dynamically_extendable.rb +0 -137
- data/lib/imw/utils/error.rb +0 -59
- data/lib/imw/utils/extensions/hpricot.rb +0 -34
- data/lib/imw/utils/has_uri.rb +0 -131
- data/lib/imw/utils/log.rb +0 -92
- data/lib/imw/utils/misc.rb +0 -57
- data/lib/imw/utils/paths.rb +0 -146
- data/lib/imw/utils/uri.rb +0 -59
- data/lib/imw/utils/uuid.rb +0 -33
- data/lib/imw/utils/validate.rb +0 -38
- data/lib/imw/utils/version.rb +0 -11
- data/spec/data/formats/delimited/sample.csv +0 -131
- data/spec/data/formats/delimited/sample.tsv +0 -131
- data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
- data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
- data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
- data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
- data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
- data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
- data/spec/data/formats/excel/sample.xls +0 -0
- data/spec/data/formats/json/sample.json +0 -1
- data/spec/data/formats/none/sample +0 -650
- data/spec/data/formats/sgml/sample.xml +0 -617
- data/spec/data/formats/text/sample.txt +0 -650
- data/spec/data/formats/yaml/sample.yaml +0 -410
- data/spec/data/schema-tabular.yaml +0 -11
- data/spec/imw/archives/rar_spec.rb +0 -16
- data/spec/imw/archives/tar_spec.rb +0 -16
- data/spec/imw/archives/tarbz2_spec.rb +0 -24
- data/spec/imw/archives/targz_spec.rb +0 -21
- data/spec/imw/archives/zip_spec.rb +0 -16
- data/spec/imw/archives_spec.rb +0 -77
- data/spec/imw/compressed_files/bz2_spec.rb +0 -15
- data/spec/imw/compressed_files/compressible_spec.rb +0 -36
- data/spec/imw/compressed_files/gz_spec.rb +0 -15
- data/spec/imw/compressed_files_spec.rb +0 -47
- data/spec/imw/dataset/paths_spec.rb +0 -32
- data/spec/imw/dataset/workflow_spec.rb +0 -41
- data/spec/imw/formats/delimited_spec.rb +0 -44
- data/spec/imw/formats/excel_spec.rb +0 -55
- data/spec/imw/formats/json_spec.rb +0 -18
- data/spec/imw/formats/sgml_spec.rb +0 -24
- data/spec/imw/formats/yaml_spec.rb +0 -19
- data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
- data/spec/imw/metadata/field_spec.rb +0 -25
- data/spec/imw/metadata/has_metadata_spec.rb +0 -58
- data/spec/imw/metadata/has_summary_spec.rb +0 -32
- data/spec/imw/metadata/schema_spec.rb +0 -24
- data/spec/imw/metadata_spec.rb +0 -86
- data/spec/imw/parsers/line_parser_spec.rb +0 -96
- data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
- data/spec/imw/resource_spec.rb +0 -32
- data/spec/imw/schemes/hdfs_spec.rb +0 -67
- data/spec/imw/schemes/http_spec.rb +0 -19
- data/spec/imw/schemes/local_spec.rb +0 -165
- data/spec/imw/schemes/remote_spec.rb +0 -38
- data/spec/imw/schemes/s3_spec.rb +0 -31
- data/spec/imw/schemes/sql_spec.rb +0 -3
- data/spec/imw/tools/aggregator_spec.rb +0 -71
- data/spec/imw/tools/archiver_spec.rb +0 -120
- data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
- data/spec/imw/tools/summarizer_spec.rb +0 -8
- data/spec/imw/tools/transferer_spec.rb +0 -195
- data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
- data/spec/imw/utils/has_uri_spec.rb +0 -61
- data/spec/imw/utils/paths_spec.rb +0 -10
- data/spec/imw/utils/shared_paths_spec.rb +0 -29
- data/spec/imw_spec.rb +0 -14
- data/spec/rcov.opts +0 -1
- data/spec/spec_helper.rb +0 -31
- data/spec/support/custom_matchers.rb +0 -28
- data/spec/support/file_contents_matcher.rb +0 -30
- data/spec/support/paths_matcher.rb +0 -66
- data/spec/support/random.rb +0 -213
- data/spec/support/without_regard_to_order_matcher.rb +0 -41
data/lib/imw/formats.rb
DELETED
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
module IMW
|
|
2
|
-
module Formats
|
|
3
|
-
autoload :Csv, 'imw/formats/delimited'
|
|
4
|
-
autoload :Tsv, 'imw/formats/delimited'
|
|
5
|
-
autoload :Excel, 'imw/formats/excel'
|
|
6
|
-
autoload :Json, 'imw/formats/json'
|
|
7
|
-
autoload :Xml, 'imw/formats/sgml'
|
|
8
|
-
autoload :Xsl, 'imw/formats/sgml'
|
|
9
|
-
autoload :Html, 'imw/formats/sgml'
|
|
10
|
-
autoload :Xhtml, 'imw/formats/sgml'
|
|
11
|
-
autoload :Rdf, 'imw/formats/sgml'
|
|
12
|
-
autoload :Yaml, 'imw/formats/yaml'
|
|
13
|
-
autoload :Pdf, 'imw/formats/pdf'
|
|
14
|
-
|
|
15
|
-
# Handlers which augment a resource with data format specific
|
|
16
|
-
# methods.
|
|
17
|
-
HANDLERS = [
|
|
18
|
-
[ "Formats::Csv", /\.csv$/i ],
|
|
19
|
-
[ "Formats::Tsv", /\.tsv$/i ],
|
|
20
|
-
[ "Formats::Excel", /\.xlsx?$/i ],
|
|
21
|
-
[ "Formats::Json", /\.json$/i ],
|
|
22
|
-
[ "Formats::Xml", /\.xml$/i ],
|
|
23
|
-
[ "Formats::Xsl", /\.xsl$/i ],
|
|
24
|
-
[ "Formats::Html", /\.html?$/i ],
|
|
25
|
-
[ "Formats::Xhtml", /\.xhtml?$/i ],
|
|
26
|
-
[ "Formats::Rdf", /\.rdf?$/i ],
|
|
27
|
-
[ "Formats::Yaml", /\.ya?ml$/i ],
|
|
28
|
-
[ "Formats::Pdf", /\.pdf$/i ]
|
|
29
|
-
]
|
|
30
|
-
end
|
|
31
|
-
end
|
|
32
|
-
|
|
33
|
-
|
|
@@ -1,170 +0,0 @@
|
|
|
1
|
-
module IMW
|
|
2
|
-
module Formats
|
|
3
|
-
|
|
4
|
-
# Defines methods used for parsing and writing delimited data
|
|
5
|
-
# formats (CSV, TSV, &c.) with the FasterCSV library. This
|
|
6
|
-
# module is not used to directly extend a resource. Instead,
|
|
7
|
-
# more specific modules (e.g. - IMW::Resources::Formats::Csv)
|
|
8
|
-
# include this one and also define +delimited_options+ which is
|
|
9
|
-
# actually what's passed to FasterCSV.
|
|
10
|
-
#
|
|
11
|
-
# @abstract
|
|
12
|
-
module Delimited
|
|
13
|
-
|
|
14
|
-
# Default options to be passed to
|
|
15
|
-
# FasterCSV[http://fastercsv.rubyforge.org/]; see its
|
|
16
|
-
# documentation for more information.
|
|
17
|
-
#
|
|
18
|
-
# @return [Hash]
|
|
19
|
-
def delimited_options
|
|
20
|
-
@delimited_options ||= {
|
|
21
|
-
:headers => fields && fields.map { |field| field['name'] }
|
|
22
|
-
}.merge(resource_options_compatible_with_faster_csv)
|
|
23
|
-
end
|
|
24
|
-
|
|
25
|
-
# Return the data in this delimited resource as an array of
|
|
26
|
-
# arrays.
|
|
27
|
-
#
|
|
28
|
-
# Yield each outer array (row) if passed a block.
|
|
29
|
-
#
|
|
30
|
-
# @return [Array] the full data matrix
|
|
31
|
-
# @yield [Array] each row of the data
|
|
32
|
-
def load &block
|
|
33
|
-
require 'fastercsv'
|
|
34
|
-
FasterCSV.parse(read, delimited_options, &block)
|
|
35
|
-
end
|
|
36
|
-
|
|
37
|
-
# Gives us goodies! Needs +each+ below.
|
|
38
|
-
include Enumerable
|
|
39
|
-
|
|
40
|
-
# Call +block+ with each row in this delimited resource.
|
|
41
|
-
def each &block
|
|
42
|
-
require 'fastercsv'
|
|
43
|
-
FasterCSV.new(io, delimited_options).each(&block)
|
|
44
|
-
end
|
|
45
|
-
|
|
46
|
-
# Emit a single array or an array of arrays into this resource.
|
|
47
|
-
#
|
|
48
|
-
# @param [Array<Array>, Array] data array or array of arrays to emit
|
|
49
|
-
# @param [Hash] options
|
|
50
|
-
# @option options [true, false] :persist Keep this resource's IO object open after emiting
|
|
51
|
-
def emit data, options={}
|
|
52
|
-
require 'fastercsv'
|
|
53
|
-
data = [data] unless data.first.is_a?(Array)
|
|
54
|
-
data.each do |row|
|
|
55
|
-
write(FasterCSV.generate_line(row, delimited_options))
|
|
56
|
-
end
|
|
57
|
-
self
|
|
58
|
-
end
|
|
59
|
-
alias_method :<<, :emit
|
|
60
|
-
|
|
61
|
-
# Do a heuristic check to determine whether or not the first row
|
|
62
|
-
# of this delimited data is a row of headers.
|
|
63
|
-
#
|
|
64
|
-
# @return [true, false]
|
|
65
|
-
def fields_in_first_line?
|
|
66
|
-
# grab the header and up to 10 body rows
|
|
67
|
-
require 'fastercsv'
|
|
68
|
-
copy = FasterCSV.new(io, resource_options_compatible_with_faster_csv.merge(:headers => false))
|
|
69
|
-
header = (copy.shift || []) rescue []
|
|
70
|
-
body = 10.times.map { (copy.shift || []) rescue []}.flatten
|
|
71
|
-
|
|
72
|
-
# guess how many elements in a row
|
|
73
|
-
#size_guess = ((header.size + body.map(&:size).inject(0.0) { |e, s| s += e }).to_f / (1 + body.length).to_f).to_i
|
|
74
|
-
|
|
75
|
-
# calculate the fraction of bytes that are [-A-z_] (letters +
|
|
76
|
-
# underscore + hypen) for header and body and compute a
|
|
77
|
-
# threshold determinant
|
|
78
|
-
header_chars = header.map(&:to_s).join
|
|
79
|
-
header_schema_bytes = header_chars.bytes.find_all { |byte| (byte >= 65 && byte <= 90) || (byte >= 97 && byte <= 122) || byte == 95 || byte == 45 }
|
|
80
|
-
body_chars = body.map(&:to_s).join
|
|
81
|
-
body_schema_bytes = body_chars.bytes.find_all { |byte| (byte >= 65 && byte <= 90) || (byte >= 97 && byte <= 122) || byte == 95 || byte == 45 }
|
|
82
|
-
header_schema_fraction = header_schema_bytes.size.to_f / header_chars.size.to_f rescue nil
|
|
83
|
-
body_schema_fraction = body_schema_bytes.size.to_f / body_chars.size.to_f rescue nil
|
|
84
|
-
determinant = (body_schema_fraction - header_schema_fraction).abs / 2.0 rescue nil
|
|
85
|
-
|
|
86
|
-
# decide, setting the threshold at 0.05 based on some guesswork...
|
|
87
|
-
determinant && determinant >= 0.05
|
|
88
|
-
end
|
|
89
|
-
|
|
90
|
-
# If it seems like there are fields in the first line of this
|
|
91
|
-
# data then go ahead and use them to define this resource's
|
|
92
|
-
# fields.
|
|
93
|
-
#
|
|
94
|
-
# Will overwrite any fields already present for this resource.
|
|
95
|
-
def guess_fields!
|
|
96
|
-
return unless fields_in_first_line?
|
|
97
|
-
copy = FasterCSV.new(io, resource_options_compatible_with_faster_csv.merge(:headers => false))
|
|
98
|
-
names = (copy.shift || []) rescue []
|
|
99
|
-
self.fields = names.map { |n| { 'name' => n } }
|
|
100
|
-
delimited_options[:headers] = names
|
|
101
|
-
end
|
|
102
|
-
|
|
103
|
-
# Return a 10-line sample of this file.
|
|
104
|
-
#
|
|
105
|
-
# @return [Array<Array>]
|
|
106
|
-
def snippet
|
|
107
|
-
require 'fastercsv'
|
|
108
|
-
[].tap do |rows|
|
|
109
|
-
rows_sampled = 0
|
|
110
|
-
begin
|
|
111
|
-
each do |row|
|
|
112
|
-
begin
|
|
113
|
-
break if rows_sampled > 100
|
|
114
|
-
row_size = row.size.to_f
|
|
115
|
-
if (row.reject(&:blank?).size.to_f / row_size) >= 0.5
|
|
116
|
-
rows << row.size.times.map { |index| row[index] }
|
|
117
|
-
rows_sampled += 1
|
|
118
|
-
end
|
|
119
|
-
rescue => e
|
|
120
|
-
next
|
|
121
|
-
end
|
|
122
|
-
end
|
|
123
|
-
rescue => e
|
|
124
|
-
end
|
|
125
|
-
end
|
|
126
|
-
end
|
|
127
|
-
|
|
128
|
-
protected
|
|
129
|
-
# An array of option names used by FasterCSV.
|
|
130
|
-
FASTER_CSV_OPTION_NAMES = %w[col_sep row_sep quote_char encoding field_size_limit converters unconverted_fields headers return_headers write_headers header_converters skip_blanks force_quotes].map(&:to_sym)
|
|
131
|
-
|
|
132
|
-
# Return the subset of options this resource was initialized
|
|
133
|
-
# with that are compatible with FasterCSV (it complains when you
|
|
134
|
-
# give it keywords it doesn't know).
|
|
135
|
-
#
|
|
136
|
-
# @return [Hash]
|
|
137
|
-
def resource_options_compatible_with_faster_csv
|
|
138
|
-
@compatible_options ||= {}.tap do |compatible_options|
|
|
139
|
-
FASTER_CSV_OPTION_NAMES.each do |option_name|
|
|
140
|
-
compatible_options[option_name] = resource_options[option_name] if resource_options.has_key?(option_name.to_sym)
|
|
141
|
-
end
|
|
142
|
-
end
|
|
143
|
-
end
|
|
144
|
-
end
|
|
145
|
-
|
|
146
|
-
# A module for working with CSV (comma-separated value) formatted
|
|
147
|
-
# data.
|
|
148
|
-
#
|
|
149
|
-
# @see IMW::Formats::Delimited
|
|
150
|
-
module Csv
|
|
151
|
-
include Delimited
|
|
152
|
-
def delimited_options
|
|
153
|
-
@delimited_options ||= {:col_sep => ","}.merge(super())
|
|
154
|
-
end
|
|
155
|
-
end
|
|
156
|
-
|
|
157
|
-
# A module for working with TSV (tab-separated value) formatted
|
|
158
|
-
# data.
|
|
159
|
-
#
|
|
160
|
-
# @see IMW::Formats::Delimited
|
|
161
|
-
module Tsv
|
|
162
|
-
include Delimited
|
|
163
|
-
def delimited_options
|
|
164
|
-
@delimited_options ||= {
|
|
165
|
-
:col_sep => "\t",
|
|
166
|
-
}.merge(super())
|
|
167
|
-
end
|
|
168
|
-
end
|
|
169
|
-
end
|
|
170
|
-
end
|
data/lib/imw/formats/excel.rb
DELETED
|
@@ -1,100 +0,0 @@
|
|
|
1
|
-
module IMW
|
|
2
|
-
module Formats
|
|
3
|
-
|
|
4
|
-
# Defines methods for reading and writing Microsoft Excel data.
|
|
5
|
-
module Excel
|
|
6
|
-
|
|
7
|
-
# Ensure that this Excel resource is described by a an ordered
|
|
8
|
-
# collection of flat fields.
|
|
9
|
-
def validate_schema!
|
|
10
|
-
raise IMW::SchemaError.new("#{self.class} resources must be described by an ordered set of flat fields") if schema.any?(&:nested?)
|
|
11
|
-
end
|
|
12
|
-
|
|
13
|
-
# Return the data in this Excel document as an array of arrays.
|
|
14
|
-
#
|
|
15
|
-
# Data from consecutive worksheets will be concatenated into a
|
|
16
|
-
# single outer array.
|
|
17
|
-
#
|
|
18
|
-
# @return [Array<Array>]
|
|
19
|
-
def load
|
|
20
|
-
require 'spreadsheet'
|
|
21
|
-
data = []
|
|
22
|
-
Spreadsheet.open(path).worksheets.each do |worksheet|
|
|
23
|
-
data += worksheet.map do |row|
|
|
24
|
-
row.to_a
|
|
25
|
-
end
|
|
26
|
-
end
|
|
27
|
-
data
|
|
28
|
-
end
|
|
29
|
-
|
|
30
|
-
# Gives us goodies! Needs +each+ below.
|
|
31
|
-
include Enumerable
|
|
32
|
-
|
|
33
|
-
# Yield each row of this Excel document.
|
|
34
|
-
#
|
|
35
|
-
# Will loop from one worksheet to the next.
|
|
36
|
-
#
|
|
37
|
-
# @yield [Spreadsheet::Excel::Row]
|
|
38
|
-
def each &block
|
|
39
|
-
require 'spreadsheet'
|
|
40
|
-
Spreadsheet.open(path).worksheets.each do |worksheet|
|
|
41
|
-
worksheet.each(&block)
|
|
42
|
-
end
|
|
43
|
-
end
|
|
44
|
-
|
|
45
|
-
# Return the number of lines in this Excel document.
|
|
46
|
-
#
|
|
47
|
-
# Measured across worksheets.
|
|
48
|
-
#
|
|
49
|
-
# @return [Integer]
|
|
50
|
-
def num_lines
|
|
51
|
-
require 'spreadsheet'
|
|
52
|
-
Spreadsheet.open(path).worksheets.inject(0) do |sum, worksheet|
|
|
53
|
-
sum += worksheet.row_count
|
|
54
|
-
end
|
|
55
|
-
end
|
|
56
|
-
|
|
57
|
-
# TODO
|
|
58
|
-
#
|
|
59
|
-
# def emit
|
|
60
|
-
# end
|
|
61
|
-
|
|
62
|
-
# TODO
|
|
63
|
-
#
|
|
64
|
-
# Extract the following methods from delimited into a module and
|
|
65
|
-
# let both Excel and Delimited use them.
|
|
66
|
-
#
|
|
67
|
-
# Or let Excel include Delimited and let it override
|
|
68
|
-
# appropriately.
|
|
69
|
-
#
|
|
70
|
-
# headers_in_first_line?
|
|
71
|
-
# guess_schema!
|
|
72
|
-
#
|
|
73
|
-
#
|
|
74
|
-
|
|
75
|
-
#
|
|
76
|
-
def snippet
|
|
77
|
-
require 'spreadsheet'
|
|
78
|
-
[].tap do |snip|
|
|
79
|
-
rows_sampled = 0
|
|
80
|
-
Spreadsheet.open(path).worksheets.each do |worksheet|
|
|
81
|
-
worksheet.each do |row|
|
|
82
|
-
begin
|
|
83
|
-
break if rows_sampled > 100
|
|
84
|
-
row_size = row.size.to_f
|
|
85
|
-
if (row.reject(&:blank?).size.to_f / row_size) > 0.5
|
|
86
|
-
snip << row.to_a
|
|
87
|
-
rows_sampled += 1
|
|
88
|
-
end
|
|
89
|
-
rescue => e
|
|
90
|
-
next
|
|
91
|
-
end
|
|
92
|
-
end
|
|
93
|
-
break if rows_sampled > 10
|
|
94
|
-
end
|
|
95
|
-
end
|
|
96
|
-
end
|
|
97
|
-
end
|
|
98
|
-
end
|
|
99
|
-
end
|
|
100
|
-
|
data/lib/imw/formats/json.rb
DELETED
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
module IMW
|
|
2
|
-
module Formats
|
|
3
|
-
|
|
4
|
-
# Defines methods for reading and writing JSON data.
|
|
5
|
-
module Json
|
|
6
|
-
|
|
7
|
-
include Enumerable
|
|
8
|
-
|
|
9
|
-
# Return the content of this resource.
|
|
10
|
-
#
|
|
11
|
-
# Will pass a block to the outermost JSON data structure's each
|
|
12
|
-
# method.
|
|
13
|
-
#
|
|
14
|
-
# @return [Hash, Array, String, Fixnum] whatever the JSON contained
|
|
15
|
-
def load &block
|
|
16
|
-
require 'json'
|
|
17
|
-
json = JSON.parse(read)
|
|
18
|
-
if block_given?
|
|
19
|
-
json.each(&block)
|
|
20
|
-
else
|
|
21
|
-
json
|
|
22
|
-
end
|
|
23
|
-
end
|
|
24
|
-
|
|
25
|
-
# Iterate over the elements in the JSON.
|
|
26
|
-
def each &block
|
|
27
|
-
load(&block)
|
|
28
|
-
end
|
|
29
|
-
|
|
30
|
-
# Emit the +data+ into this resource. It must be opened for
|
|
31
|
-
# writing.
|
|
32
|
-
#
|
|
33
|
-
# @param [Hash, String, Array, Fixnum] data the Ruby object to emit
|
|
34
|
-
def emit data, options={}
|
|
35
|
-
require 'json'
|
|
36
|
-
write(data.to_json)
|
|
37
|
-
self
|
|
38
|
-
end
|
|
39
|
-
end
|
|
40
|
-
end
|
|
41
|
-
end
|
data/lib/imw/formats/pdf.rb
DELETED
|
@@ -1,71 +0,0 @@
|
|
|
1
|
-
module IMW
|
|
2
|
-
module Formats
|
|
3
|
-
|
|
4
|
-
# Defines methods for parsing and generating PDF.
|
|
5
|
-
#
|
|
6
|
-
# Uses PDF::Reader for parsing and Prawn for generating.
|
|
7
|
-
module Pdf
|
|
8
|
-
|
|
9
|
-
# Return a snippet of text from this PDF.
|
|
10
|
-
#
|
|
11
|
-
# @return [String]
|
|
12
|
-
def snippet
|
|
13
|
-
begin
|
|
14
|
-
require 'pdf/reader'
|
|
15
|
-
snippetizer = Snippetizer.new
|
|
16
|
-
PDF::Reader.file(path, snippetizer)
|
|
17
|
-
snippetizer.snippet
|
|
18
|
-
rescue Snippetizer::SnippetEndError
|
|
19
|
-
snippetizer.snippet
|
|
20
|
-
rescue
|
|
21
|
-
''
|
|
22
|
-
end
|
|
23
|
-
end
|
|
24
|
-
|
|
25
|
-
# A receiver class used by PDF::Reader which agglomerates text
|
|
26
|
-
# up to 1024 bytes and then bails.
|
|
27
|
-
class Snippetizer
|
|
28
|
-
|
|
29
|
-
# A custom error class that can be thrown while receiving text
|
|
30
|
-
# from PDF::Reader to cut-short walking large PDF documents.
|
|
31
|
-
SnippetEndError = Class.new(IMW::Error)
|
|
32
|
-
|
|
33
|
-
# The snippet being built by this snippetizer.
|
|
34
|
-
attr_accessor :snippet
|
|
35
|
-
|
|
36
|
-
def initialize
|
|
37
|
-
@snippet = ''
|
|
38
|
-
end
|
|
39
|
-
|
|
40
|
-
# Agglomerates text from PDF::Reader up to a fixed size of
|
|
41
|
-
# 1024 bytes.
|
|
42
|
-
#
|
|
43
|
-
# Will convert a single-space line from PDF::Reader as a
|
|
44
|
-
# newline character.
|
|
45
|
-
#
|
|
46
|
-
# FIXME How does the receiver ask PDF::Reader to abort walking
|
|
47
|
-
# the document now that enough text has been returned? Till a
|
|
48
|
-
# more graceful way is found this method simply raises an
|
|
49
|
-
# error, creating a GOTO...
|
|
50
|
-
def show_text *params
|
|
51
|
-
params.each do |string|
|
|
52
|
-
if @snippet.size < 1024
|
|
53
|
-
if string == ' '
|
|
54
|
-
@snippet += "\n"
|
|
55
|
-
else
|
|
56
|
-
@snippet += string[0..1024]
|
|
57
|
-
end
|
|
58
|
-
else
|
|
59
|
-
raise SnippetEndError.new
|
|
60
|
-
end
|
|
61
|
-
end
|
|
62
|
-
end
|
|
63
|
-
alias_method :show_text_with_positioning, :show_text
|
|
64
|
-
alias_method :move_to_next_line_and_show_text, :show_text
|
|
65
|
-
alias_method :set_spacing_next_line_show_text, :show_text
|
|
66
|
-
end
|
|
67
|
-
|
|
68
|
-
end
|
|
69
|
-
end
|
|
70
|
-
end
|
|
71
|
-
|
data/lib/imw/formats/sgml.rb
DELETED
|
@@ -1,69 +0,0 @@
|
|
|
1
|
-
module IMW
|
|
2
|
-
module Formats
|
|
3
|
-
|
|
4
|
-
# Defines methods to parse SGML-derived data formats (XML, HTML,
|
|
5
|
-
# &c.). This module isn't directly used to extend resources.
|
|
6
|
-
# Instead, more specific modules (e.g. -
|
|
7
|
-
# IMW::Resources::Formats::Xml) are used.
|
|
8
|
-
module Sgml
|
|
9
|
-
|
|
10
|
-
# Parse this resource using Hpricot and return (or yield if
|
|
11
|
-
# given a block) the resulting Hpricot::Doc.
|
|
12
|
-
#
|
|
13
|
-
# @return [Hpricot::Doc]
|
|
14
|
-
# @yield [Hpricot::Doc]
|
|
15
|
-
def load &block
|
|
16
|
-
require 'hpricot'
|
|
17
|
-
sgml = Hpricot(io)
|
|
18
|
-
if block_given?
|
|
19
|
-
yield sgml
|
|
20
|
-
else
|
|
21
|
-
sgml
|
|
22
|
-
end
|
|
23
|
-
end
|
|
24
|
-
|
|
25
|
-
# Parse the Hpricot::Doc of this resource with the given
|
|
26
|
-
# +parser+.
|
|
27
|
-
#
|
|
28
|
-
# The parser can either be an IMW::Parsers::HtmlParser or a
|
|
29
|
-
# hash which will be used to build such a parser. See the
|
|
30
|
-
# documentation for IMW::Parsers::HtmlParser for more
|
|
31
|
-
# information.
|
|
32
|
-
#
|
|
33
|
-
# @param [Hash, IMW::Parsers::HtmlParser] parser
|
|
34
|
-
# @return [Hash] the parser's output
|
|
35
|
-
def parse parser
|
|
36
|
-
if parser.is_a?(IMW::Parsers::HtmlParser)
|
|
37
|
-
parser.parse(load)
|
|
38
|
-
else
|
|
39
|
-
IMW::Parsers::HtmlParser.new(parser).parse(load)
|
|
40
|
-
end
|
|
41
|
-
end
|
|
42
|
-
end
|
|
43
|
-
|
|
44
|
-
# Defines methods for XML data.
|
|
45
|
-
module Xml
|
|
46
|
-
include Sgml
|
|
47
|
-
end
|
|
48
|
-
|
|
49
|
-
# Defines methods for XSL data.
|
|
50
|
-
module Xsl
|
|
51
|
-
include Sgml
|
|
52
|
-
end
|
|
53
|
-
|
|
54
|
-
# Defines methods for XHTML data.
|
|
55
|
-
module Xhtml
|
|
56
|
-
include Sgml
|
|
57
|
-
end
|
|
58
|
-
|
|
59
|
-
# Defines methods for HTML data.
|
|
60
|
-
module Html
|
|
61
|
-
include Sgml
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
# Defines methods for RDF data.
|
|
65
|
-
module Rdf
|
|
66
|
-
include Sgml
|
|
67
|
-
end
|
|
68
|
-
end
|
|
69
|
-
end
|