iron-import 0.6.1 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/History.txt +16 -1
- data/README.rdoc +43 -16
- data/Version.txt +1 -1
- data/lib/iron/import/column.rb +27 -14
- data/lib/iron/import/csv_reader.rb +4 -4
- data/lib/iron/import/custom_reader.rb +14 -8
- data/lib/iron/import/data_reader.rb +42 -30
- data/lib/iron/import/error.rb +4 -16
- data/lib/iron/import/excel_reader.rb +69 -0
- data/lib/iron/import/html_reader.rb +78 -0
- data/lib/iron/import/importer.rb +432 -103
- data/lib/iron/import/row.rb +15 -11
- data/lib/iron/import/xls_reader.rb +3 -37
- data/lib/iron/import/xlsx_reader.rb +2 -37
- data/lib/iron/import.rb +2 -1
- data/spec/importer/column_spec.rb +4 -5
- data/spec/importer/csv_reader_spec.rb +1 -1
- data/spec/importer/custom_reader_spec.rb +6 -10
- data/spec/importer/data_reader_spec.rb +6 -5
- data/spec/importer/html_reader_spec.rb +105 -0
- data/spec/importer/importer_spec.rb +107 -0
- data/spec/importer/row_spec.rb +9 -2
- data/spec/importer/xls_reader_spec.rb +77 -0
- data/spec/importer/xlsx_reader_spec.rb +2 -3
- data/spec/samples/3-sheets.xls +0 -0
- data/spec/samples/col-span.html +29 -0
- data/spec/samples/html-th-td.html +11 -0
- data/spec/samples/multi-table.html +29 -0
- data/spec/samples/nanodrop.xlsx +0 -0
- data/spec/samples/scores.html +30 -0
- data/spec/samples/simple.html +14 -0
- data/spec/spec_helper.rb +1 -0
- metadata +30 -8
- data/lib/iron/import/sheet.rb +0 -263
- data/spec/importer/sheet_spec.rb +0 -65
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6931785ff3b1cb03394e6a8be0943337c589ac90
|
4
|
+
data.tar.gz: 82a7b1f123134eea5388672e8aed578590cd2daa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d5d689572ef2680fce8273cc22c05ba76b2dc9a8a56f07c890d64ba6f1b4d4aba78f6bffea8f2b2b2eb9bdf057115bcf6911241f1febe567c7caee3a9eea6011
|
7
|
+
data.tar.gz: 4fafd479220b82ed7fd983b77b1c54d73ba993442d29c4b19a37c087e94feb35c25517549966fff2426be333491f71f59e23c20650c02a3ca5f6b34e4e6f4e91
|
data/History.txt
CHANGED
@@ -1,3 +1,18 @@
|
|
1
|
+
== 0.7.0 / 2017-02-16
|
2
|
+
|
3
|
+
* Breaking Change: Removed multi-sheet support - use multiple importers instead
|
4
|
+
* Breaking Change: Removed warnings as they were not being used
|
5
|
+
* Breaking Change: Removed Column#required! due to bugginess and overlap with Column#validate
|
6
|
+
* Add Importer#scope to allow narrowing the search to one or more sheets/tables when importing
|
7
|
+
* Add new HtmlReader support to handle parsing HTML <table> rows
|
8
|
+
* Modify Importer#import to support block mode combining #import and #process
|
9
|
+
* Add Importer#import_string for handling explicit CSV/HTML/Custom text
|
10
|
+
* Add Importer#on_error(&block) to allow inline conditional error handling
|
11
|
+
* Improve error message when headers can't be detected to list missing headers
|
12
|
+
* Change Importer#error_summary to group identical errors into single summary line
|
13
|
+
* Improve :cents type column rounding to better handle floating point ugliness
|
14
|
+
* Much improved test coverage and documentation
|
15
|
+
|
1
16
|
== 0.6.1 / 2015-08-24
|
2
17
|
|
3
18
|
* Better handling for nil return value in custom format readers
|
@@ -8,7 +23,7 @@
|
|
8
23
|
* Vastly improved internal and user-facing comments
|
9
24
|
* Improved error logging, replaced some exceptions with errors
|
10
25
|
|
11
|
-
== 0.5.0 / 2015-
|
26
|
+
== 0.5.0 / 2015-03-19
|
12
27
|
|
13
28
|
* Initial revision
|
14
29
|
* Support for CSV, XLS and XLSX importing
|
data/README.rdoc
CHANGED
@@ -7,8 +7,8 @@ Written by Rob Morris @ Irongaze Consulting LLC (http://irongaze.com)
|
|
7
7
|
Simple, reliable tabular data import.
|
8
8
|
|
9
9
|
This gem provides a set of classes to support automating import of tabular data from
|
10
|
-
CSV, XLS and XLSX files, or custom formats via a simple block reader. Provides help
|
11
|
-
in defining columns, auto-detecting column order, pre-parsing data, and error
|
10
|
+
CSV, HTML, XLS and XLSX files, or custom formats via a simple block reader. Provides help
|
11
|
+
in defining columns, auto-detecting column order, pre-parsing data, and error tracking.
|
12
12
|
|
13
13
|
The Roo/Spreadsheet gems do a great job of providing general purpose spreadsheet reading.
|
14
14
|
However, using them with unreliable user submitted data requires a lot of error checking,
|
@@ -21,37 +21,64 @@ This is NOT a general-purpose tool for reading spreadsheets. If you want access
|
|
21
21
|
cell styling, reading underlying formulas, etc., you will be better served building
|
22
22
|
a custom importer based on Roo. But if you're looking to take an uploaded CSV file,
|
23
23
|
validate and coerce values, then write each row to a database, all the while tracking
|
24
|
-
any
|
24
|
+
any errors encountered... well, this is the library for you!
|
25
25
|
|
26
26
|
IMPORTANT NOTE: this gem is in flux as we work to define the best possible abstraction
|
27
|
-
for the task. Breaking changes will be noted by increases in the
|
27
|
+
for the task. Breaking changes will be noted by increases in the minor version,
|
28
28
|
e.g. 0.5.0 and 0.5.1 will be compatible, but 0.6.0 will not (i.e. we follow semantic versioning).
|
29
29
|
|
30
30
|
== SAMPLE USAGE
|
31
31
|
|
32
|
-
# Define our importer, with
|
33
|
-
# "name" and "
|
32
|
+
# Define our importer, with three columns. The importer will look for a row containing
|
33
|
+
# "name"/"product", "description" and "price" (case insensitively) and automatically determine column
|
34
34
|
# order and starting row of the data.
|
35
35
|
importer = Importer.build do
|
36
|
-
column :name
|
37
|
-
|
36
|
+
column :name do
|
37
|
+
# Column order and start row are auto-detected
|
38
|
+
header /(name|product)/i
|
39
|
+
end
|
40
|
+
column :description do
|
41
|
+
# Columns can do custom parsing
|
42
|
+
parse do |raw_val|
|
43
|
+
raw_val.to_s.strip
|
44
|
+
end
|
45
|
+
# And per-row validation
|
46
|
+
validate do |parsed_val|
|
47
|
+
raise "Invalid description" unless parsed_val.length > 5
|
48
|
+
end
|
49
|
+
end
|
50
|
+
column :price do
|
51
|
+
# Built in type conversion handles common cases
|
52
|
+
type :cents
|
53
|
+
end
|
54
|
+
|
55
|
+
# Need to skip rows? Use a filter!
|
56
|
+
filter do |row|
|
57
|
+
row[:price] != 0 && row[:name] != 'Sample'
|
58
|
+
end
|
38
59
|
end
|
39
60
|
|
40
|
-
# Import the provided file row-by-row if importing succeeds, automatically
|
61
|
+
# Import the provided file row-by-row (if importing succeeds), automatically
|
41
62
|
# using the proper library to read CSV data. This same code would work
|
42
63
|
# with XLS or XLSX files with no changes to the code.
|
43
|
-
|
44
|
-
|
45
|
-
puts row[:name] + ' = ' + row[:description]
|
46
|
-
end
|
64
|
+
importer.import('/tmp/source.csv') do |row|
|
65
|
+
puts row[:name] + ' = ' + row[:description]
|
47
66
|
end
|
48
67
|
|
68
|
+
# Check for errors and do the right thing:
|
69
|
+
importer.on_error do
|
70
|
+
puts "Error: " + error_summary
|
71
|
+
end
|
72
|
+
|
49
73
|
== REQUIREMENTS
|
50
74
|
|
51
|
-
Depends on the iron-extensions and iron-dsl gems
|
52
|
-
|
75
|
+
Depends on the iron-extensions and iron-dsl gems for CSV and custom import formats.
|
76
|
+
|
77
|
+
Optionally requires the roo gem to support XLS and XLSX import and parsing.
|
78
|
+
|
79
|
+
Optionally requires the nokogiri gem to support HTML import and parsing.
|
53
80
|
|
54
|
-
Requires RSpec and roo to build/test.
|
81
|
+
Requires RSpec, nokogiri and roo to build/test.
|
55
82
|
|
56
83
|
== INSTALLATION
|
57
84
|
|
data/Version.txt
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.7.0
|
data/lib/iron/import/column.rb
CHANGED
@@ -26,7 +26,11 @@ class Importer
|
|
26
26
|
# # seems like the "same" source value, for example an Excel source file
|
27
27
|
# # will give you a float value for all numeric types, even "integers".
|
28
28
|
# parse do |raw_value|
|
29
|
-
# raw_value.to_i + 1000
|
29
|
+
# val = raw_value.to_i + 1000
|
30
|
+
# # NOTE: we're in a block, so don't do this:
|
31
|
+
# return val
|
32
|
+
# # Instead, use implied return:
|
33
|
+
# val
|
30
34
|
# end
|
31
35
|
#
|
32
36
|
# # You can also add a custom validator to check the value and add
|
@@ -54,7 +58,6 @@ class Importer
|
|
54
58
|
attr_reader :data
|
55
59
|
|
56
60
|
# Configuration
|
57
|
-
dsl_flag :required
|
58
61
|
dsl_accessor :header, :position, :type
|
59
62
|
dsl_accessor :parse, :validate
|
60
63
|
|
@@ -85,21 +88,17 @@ class Importer
|
|
85
88
|
str
|
86
89
|
end
|
87
90
|
|
88
|
-
# Create a new column definition
|
91
|
+
# Create a new column definition with the key for the column,
|
89
92
|
# and an optional set of options. The options supported are the same as those supported
|
90
93
|
# in block/builder mode.
|
91
|
-
def initialize(
|
94
|
+
def initialize(importer, key, options_hash = {})
|
92
95
|
# Save off our info
|
93
96
|
@key = key
|
94
|
-
@
|
95
|
-
@importer = @sheet.importer
|
97
|
+
@importer = importer
|
96
98
|
|
97
99
|
# Return it as a string, by default
|
98
100
|
@type = options_hash.delete(:type) { :string }
|
99
101
|
|
100
|
-
# By default, we allow empty values
|
101
|
-
@required = options_hash.delete(:required) { false }
|
102
|
-
|
103
102
|
# Position can be explicitly set
|
104
103
|
@position = options_hash.delete(:position)
|
105
104
|
|
@@ -126,6 +125,19 @@ class Importer
|
|
126
125
|
def reset
|
127
126
|
@data = Data.new
|
128
127
|
end
|
128
|
+
|
129
|
+
# DEPRECATED - duplicates functionality better provided by #validate, e.g.
|
130
|
+
#
|
131
|
+
# validate do |val|
|
132
|
+
# raise 'Missing required value for column foo' if val.nil?
|
133
|
+
# end
|
134
|
+
def required!
|
135
|
+
Kernel.warn "[DEPRECATION] Importer::Column#required! is deprecated. Please use #validate instead."
|
136
|
+
col = self.key
|
137
|
+
validate do |val|
|
138
|
+
raise "Missing required value for column :#{col}"
|
139
|
+
end
|
140
|
+
end
|
129
141
|
|
130
142
|
# When true, our header definition or index match the passed text or column index.
|
131
143
|
def match_header?(text, index)
|
@@ -151,7 +163,7 @@ class Importer
|
|
151
163
|
|
152
164
|
# Applies any validation to a parsed value
|
153
165
|
def validate_value(row, val)
|
154
|
-
return unless @validate
|
166
|
+
return true unless @validate
|
155
167
|
begin
|
156
168
|
@validate.call(val)
|
157
169
|
true
|
@@ -178,20 +190,21 @@ class Importer
|
|
178
190
|
'Column ' + @data.pos
|
179
191
|
end
|
180
192
|
|
181
|
-
# Extracts the
|
193
|
+
# Extracts the imported values for this column and returns them in an array.
|
182
194
|
# Note that the array indices ARE NOT row indices, as the rows may have been
|
183
195
|
# filtered and any header rows have been skipped.
|
184
196
|
def to_a
|
185
|
-
@
|
197
|
+
@importer.data.rows.collect {|r| r[@key] }
|
186
198
|
end
|
187
199
|
|
188
|
-
# Extracts the
|
200
|
+
# Extracts the values for this column and returns them in a hash of
|
189
201
|
# row num => value for all non-filtered, non-header rows.
|
190
202
|
def to_h
|
191
203
|
res = {}
|
192
|
-
@
|
204
|
+
@importer.data.rows.collect {|r| res[r.num] = r[@key] }
|
193
205
|
res
|
194
206
|
end
|
207
|
+
def to_hash ; to_h ; end
|
195
208
|
|
196
209
|
end
|
197
210
|
|
@@ -30,10 +30,10 @@ class Importer
|
|
30
30
|
end
|
31
31
|
end
|
32
32
|
|
33
|
-
# Normally, we'd check the
|
34
|
-
# there's only one
|
35
|
-
def
|
36
|
-
@raw_rows
|
33
|
+
# Normally, we'd check the scopes and return the proper data, but for CSV files,
|
34
|
+
# there's only one scope...
|
35
|
+
def load_raw(scopes, &block)
|
36
|
+
block.call(@raw_rows)
|
37
37
|
end
|
38
38
|
|
39
39
|
end
|
@@ -24,19 +24,25 @@ class Importer
|
|
24
24
|
@source = source
|
25
25
|
end
|
26
26
|
|
27
|
-
def
|
27
|
+
def load_raw(scopes, &block)
|
28
|
+
# Default to just running one scope passing nil
|
29
|
+
if scopes.nil? || scopes.empty?
|
30
|
+
scopes = [nil]
|
31
|
+
end
|
32
|
+
|
33
|
+
# Get the proper reader
|
28
34
|
reader = @readers[@mode]
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
35
|
+
scopes.each do |scope|
|
36
|
+
rows = DslProxy.exec(self, @source, scope, &reader)
|
37
|
+
if rows.is_a?(Array) && !@importer.has_errors?
|
38
|
+
found = block.call(rows)
|
39
|
+
break if found
|
40
|
+
end
|
34
41
|
end
|
35
42
|
|
36
43
|
rescue Exception => e
|
37
44
|
# Catch any exceptions thrown and note them with helpful stacktrace info for debugging custom readers
|
38
|
-
|
39
|
-
false
|
45
|
+
add_error("Error in custom reader: #{e} @ #{e.backtrace.first}")
|
40
46
|
end
|
41
47
|
|
42
48
|
end
|
@@ -11,6 +11,16 @@ class Importer
|
|
11
11
|
def self.verify_roo!
|
12
12
|
if Gem::Specification.find_all_by_name('roo', '~> 1.13.0').empty?
|
13
13
|
raise "You are attempting to use the iron-import gem to import an Excel file. Doing so requires installing the roo gem, version 1.13.0 or later."
|
14
|
+
else
|
15
|
+
require 'roo'
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.verify_nokogiri!
|
20
|
+
if Gem::Specification.find_all_by_name('nokogiri', '~> 1.6.0').empty?
|
21
|
+
raise "You are attempting to use the iron-import gem to import an HTML file. Doing so requires installing the nokogiri gem, version 1.6.0 or later."
|
22
|
+
else
|
23
|
+
require 'nokogiri'
|
14
24
|
end
|
15
25
|
end
|
16
26
|
|
@@ -42,6 +52,9 @@ class Importer
|
|
42
52
|
when :xlsx
|
43
53
|
verify_roo!
|
44
54
|
XlsxReader.new(importer)
|
55
|
+
when :html
|
56
|
+
verify_nokogiri!
|
57
|
+
HtmlReader.new(importer)
|
45
58
|
else
|
46
59
|
nil
|
47
60
|
end
|
@@ -49,9 +62,11 @@ class Importer
|
|
49
62
|
|
50
63
|
# Figure out which format to use for a given path based on file name
|
51
64
|
def self.for_path(importer, path)
|
52
|
-
format = path.to_s.extract(/\.(csv|xlsx?)\z/i)
|
65
|
+
format = path.to_s.extract(/\.(csv|html?|xlsx?)\z/i)
|
53
66
|
if format
|
54
|
-
format = format.downcase
|
67
|
+
format = format.downcase
|
68
|
+
format = 'html' if format == 'htm'
|
69
|
+
format = format.to_sym
|
55
70
|
for_format(importer, format)
|
56
71
|
else
|
57
72
|
nil
|
@@ -90,6 +105,10 @@ class Importer
|
|
90
105
|
@supports = []
|
91
106
|
end
|
92
107
|
|
108
|
+
def supports?(mode)
|
109
|
+
@supports.include?(mode)
|
110
|
+
end
|
111
|
+
|
93
112
|
def supports_stream!
|
94
113
|
@supports << :stream
|
95
114
|
end
|
@@ -98,10 +117,6 @@ class Importer
|
|
98
117
|
@supports << :file
|
99
118
|
end
|
100
119
|
|
101
|
-
def supports?(mode)
|
102
|
-
@supports.include?(mode)
|
103
|
-
end
|
104
|
-
|
105
120
|
def supports_file?
|
106
121
|
supports?(:file)
|
107
122
|
end
|
@@ -114,13 +129,22 @@ class Importer
|
|
114
129
|
# a file path) and attempts to load it. Returns true if successful, false
|
115
130
|
# if not. If false, there will be one or more errors explaining what went
|
116
131
|
# wrong.
|
117
|
-
|
132
|
+
#
|
133
|
+
# Passed scopes are interpreted by each derived class as makes sense, but
|
134
|
+
# generally are used to target searching in multi-block formats such as
|
135
|
+
# Excel spreadsheets (sheet name/index) or HTML documents (css selectors,
|
136
|
+
# xpath selectors). If scopes is nil, all possible blocks will be checked.
|
137
|
+
#
|
138
|
+
# Each block is read in as raw data from the source, and passed to the
|
139
|
+
# given block as an array of arrays. If the block returns true, processing
|
140
|
+
# is stopped and no further blocks will be checked.
|
141
|
+
def load(path_or_stream, scopes = nil, &block)
|
118
142
|
# Figure out what we've been passed, and handle it
|
119
143
|
if self.class.is_stream?(path_or_stream)
|
120
144
|
# We have a stream (open file, upload, whatever)
|
121
145
|
if supports_stream?
|
122
146
|
# Stream loader defined, run it
|
123
|
-
|
147
|
+
load_each(:stream, path_or_stream, scopes, &block)
|
124
148
|
else
|
125
149
|
# Write to temp file, as some of our readers only read physical files, annoyingly
|
126
150
|
file = Tempfile.new(['importer', ".#{format}"])
|
@@ -128,7 +152,7 @@ class Importer
|
|
128
152
|
begin
|
129
153
|
file.write path_or_stream.read
|
130
154
|
file.close
|
131
|
-
|
155
|
+
load_each(:file, file.path, scopes, &block)
|
132
156
|
ensure
|
133
157
|
file.close
|
134
158
|
file.unlink
|
@@ -140,18 +164,18 @@ class Importer
|
|
140
164
|
if File.exist?(path_or_stream)
|
141
165
|
if supports_file?
|
142
166
|
# We're all set, load up the given path
|
143
|
-
|
167
|
+
load_each(:file, path_or_stream, scopes, &block)
|
144
168
|
else
|
145
169
|
# No file handler, so open the file and run the stream processor
|
146
170
|
file = File.open(path_or_stream, 'rb')
|
147
|
-
|
171
|
+
load_each(:stream, file, scopes, &block)
|
148
172
|
end
|
149
173
|
else
|
150
|
-
|
174
|
+
add_error("Unable to locate source file #{path_or_stream}")
|
151
175
|
end
|
152
176
|
|
153
177
|
else
|
154
|
-
|
178
|
+
add_error("Unable to load data source - not a file path or stream: #{path_or_stream.inspect}")
|
155
179
|
end
|
156
180
|
|
157
181
|
# Return our status
|
@@ -159,20 +183,12 @@ class Importer
|
|
159
183
|
end
|
160
184
|
|
161
185
|
# Load up the sheets in the correct mode
|
162
|
-
def
|
186
|
+
def load_each(mode, source, scopes, &block)
|
163
187
|
# Let our derived classes open the file, etc. as they need
|
164
188
|
if init_source(mode, source)
|
165
189
|
# Once the source is set, run through each defined sheet, pass it to
|
166
190
|
# our sheet loader, and have the sheet parse it out.
|
167
|
-
|
168
|
-
res = load_raw_sheet(sheet)
|
169
|
-
if res === false
|
170
|
-
# D'oh.
|
171
|
-
else
|
172
|
-
# Tell the sheet to parse the data
|
173
|
-
sheet.parse_raw_data(res)
|
174
|
-
end
|
175
|
-
end
|
191
|
+
load_raw(scopes, &block)
|
176
192
|
end
|
177
193
|
end
|
178
194
|
|
@@ -185,8 +201,8 @@ class Importer
|
|
185
201
|
# Override this method in derived classes to take the given sheet definition,
|
186
202
|
# find that sheet in the input source, and read out the raw (unparsed) rows
|
187
203
|
# as an array of arrays. Return false if the sheet cannot be loaded.
|
188
|
-
def
|
189
|
-
raise "Unimplemented method #
|
204
|
+
def load_raw(scopes, &block)
|
205
|
+
raise "Unimplemented method #load_raw in data reader #{self.class.name}"
|
190
206
|
end
|
191
207
|
|
192
208
|
# Provides default value parsing/coercion for all derived data readers. Attempts to be clever and
|
@@ -241,7 +257,7 @@ class Importer
|
|
241
257
|
else
|
242
258
|
floatval = parse_value(val, :float)
|
243
259
|
if floatval
|
244
|
-
(floatval * 100).
|
260
|
+
(floatval * 100).round
|
245
261
|
else
|
246
262
|
nil
|
247
263
|
end
|
@@ -261,10 +277,6 @@ class Importer
|
|
261
277
|
@importer.add_error(*args)
|
262
278
|
end
|
263
279
|
|
264
|
-
def add_warning(*args)
|
265
|
-
@importer.add_warning(*args)
|
266
|
-
end
|
267
|
-
|
268
280
|
end
|
269
281
|
|
270
282
|
end
|
data/lib/iron/import/error.rb
CHANGED
@@ -2,14 +2,11 @@ class Importer
|
|
2
2
|
|
3
3
|
class Error
|
4
4
|
|
5
|
-
attr_reader :
|
5
|
+
attr_reader :row, :text
|
6
6
|
|
7
7
|
def initialize(context, text)
|
8
|
-
if context.is_a?(Importer::
|
9
|
-
@sheet = context
|
10
|
-
elsif context.is_a?(Importer::Row)
|
8
|
+
if context.is_a?(Importer::Row)
|
11
9
|
@row = context
|
12
|
-
@sheet = context.sheet
|
13
10
|
end
|
14
11
|
@text = text.to_s
|
15
12
|
end
|
@@ -17,9 +14,7 @@ class Importer
|
|
17
14
|
def summary
|
18
15
|
summary = ''
|
19
16
|
if @row
|
20
|
-
summary += "#{@
|
21
|
-
elsif @sheet
|
22
|
-
summary += "#{@sheet}: "
|
17
|
+
summary += "#{@row}: "
|
23
18
|
end
|
24
19
|
summary + @text
|
25
20
|
end
|
@@ -29,10 +24,9 @@ class Importer
|
|
29
24
|
end
|
30
25
|
|
31
26
|
# Returns the level at which this error occurred, one of
|
32
|
-
# :row, :
|
27
|
+
# :row, :importer
|
33
28
|
def level
|
34
29
|
return :row if @row
|
35
|
-
return :sheet if @sheet
|
36
30
|
return :importer
|
37
31
|
end
|
38
32
|
|
@@ -40,10 +34,6 @@ class Importer
|
|
40
34
|
level == :row
|
41
35
|
end
|
42
36
|
|
43
|
-
def sheet_level?
|
44
|
-
level == :sheet
|
45
|
-
end
|
46
|
-
|
47
37
|
def importer_level?
|
48
38
|
level == :importer
|
49
39
|
end
|
@@ -54,8 +44,6 @@ class Importer
|
|
54
44
|
case context
|
55
45
|
when Row
|
56
46
|
return @row == context
|
57
|
-
when Sheet
|
58
|
-
return @sheet == context
|
59
47
|
else
|
60
48
|
return true
|
61
49
|
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
class Importer
|
2
|
+
|
3
|
+
# Uses the Roo gem to read in .xls and .xlsx files
|
4
|
+
class ExcelReader < DataReader
|
5
|
+
|
6
|
+
def initialize(importer, format)
|
7
|
+
super(importer, format)
|
8
|
+
supports_file!
|
9
|
+
end
|
10
|
+
|
11
|
+
def init_source(mode, source)
|
12
|
+
if mode == :file
|
13
|
+
if @format == :xls
|
14
|
+
@spreadsheet = Roo::Excel.new(source, :file_warning => :ignore)
|
15
|
+
true
|
16
|
+
elsif @format == :xlsx
|
17
|
+
@spreadsheet = Roo::Excelx.new(source, :file_warning => :ignore)
|
18
|
+
true
|
19
|
+
else
|
20
|
+
add_error("Unknown format for Excel file: :#{@format}")
|
21
|
+
false
|
22
|
+
end
|
23
|
+
else
|
24
|
+
add_error("Unsupported #{@format.to_s.upcase} mode: #{mode}")
|
25
|
+
false
|
26
|
+
end
|
27
|
+
rescue Exception => e
|
28
|
+
add_error("Error reading file #{source}: #{e}")
|
29
|
+
false
|
30
|
+
end
|
31
|
+
|
32
|
+
def load_raw(scopes, &block)
|
33
|
+
@spreadsheet.sheets.each_with_index do |name, index|
|
34
|
+
# See if this sheet's name or index matches the requested sheet definition
|
35
|
+
if include_sheet?(scopes, name, index)
|
36
|
+
# Extract our raw data
|
37
|
+
raw_rows = []
|
38
|
+
@spreadsheet.sheet(name).each_with_index do |row, line|
|
39
|
+
raw_rows << row
|
40
|
+
end
|
41
|
+
# Yield our raw rows for this sheet
|
42
|
+
found = block.call(raw_rows)
|
43
|
+
# If we've found a working sheet, stop
|
44
|
+
return if found
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
rescue Exception => e
|
49
|
+
# Not sure why we'd get here, but we strive for error-freedom here, yessir.
|
50
|
+
@importer.add_error("Error loading Excel data: #{e}")
|
51
|
+
end
|
52
|
+
|
53
|
+
# When true, the given sheet name or zero-based index
|
54
|
+
# is a match with our id.
|
55
|
+
def include_sheet?(scopes, name, index)
|
56
|
+
return true if scopes.nil? || scopes.empty?
|
57
|
+
scopes.each do |scope|
|
58
|
+
if scope.is_a?(Fixnum)
|
59
|
+
return true if scope.to_i == index+1
|
60
|
+
else
|
61
|
+
return true if scope.to_s.downcase == name.downcase
|
62
|
+
end
|
63
|
+
end
|
64
|
+
false
|
65
|
+
end
|
66
|
+
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
class Importer
|
2
|
+
|
3
|
+
class HtmlReader < DataReader
|
4
|
+
|
5
|
+
def initialize(importer)
|
6
|
+
super(importer, :html)
|
7
|
+
supports_file!
|
8
|
+
supports_stream!
|
9
|
+
@tables = nil
|
10
|
+
end
|
11
|
+
|
12
|
+
def init_source(mode, source)
|
13
|
+
if mode == :stream
|
14
|
+
@html = Nokogiri::HTML(source)
|
15
|
+
elsif mode == :file
|
16
|
+
if File.exist?(source)
|
17
|
+
@html = File.open(source) {|f| Nokogiri::HTML(f) }
|
18
|
+
else
|
19
|
+
add_error("File not found: #{source}")
|
20
|
+
return false
|
21
|
+
end
|
22
|
+
else
|
23
|
+
add_error("Unsupported HTML mode: #{mode}")
|
24
|
+
return false
|
25
|
+
end
|
26
|
+
|
27
|
+
if @html
|
28
|
+
true
|
29
|
+
else
|
30
|
+
add_error("Failed parsing of HTML")
|
31
|
+
false
|
32
|
+
end
|
33
|
+
|
34
|
+
rescue Exception => e
|
35
|
+
add_error("Error reading HTML source #{source}: #{e}")
|
36
|
+
false
|
37
|
+
end
|
38
|
+
|
39
|
+
def load_raw(scopes, &block)
|
40
|
+
# Default to searching all tables in the document
|
41
|
+
if scopes.nil? || scopes.empty?
|
42
|
+
scopes = ['table']
|
43
|
+
end
|
44
|
+
|
45
|
+
# Catch here lets us break out of the nested loop cleanly
|
46
|
+
catch(:found) do
|
47
|
+
# Run each scope, which should be a valid css selector
|
48
|
+
scopes.each do |scope|
|
49
|
+
@html.css(scope).each do |table_node|
|
50
|
+
rows = []
|
51
|
+
table_node.css('tr').each do |row_node|
|
52
|
+
row = []
|
53
|
+
row_node.children.each do |cell_node|
|
54
|
+
if ['th', 'td'].include?(cell_node.name)
|
55
|
+
row << cell_node.text.strip
|
56
|
+
# Handle col-span values appropriately
|
57
|
+
span_count = cell_node.attr('colspan')
|
58
|
+
(span_count.to_i - 1).times do
|
59
|
+
row << nil
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
rows << row
|
64
|
+
end
|
65
|
+
found = block.call(rows)
|
66
|
+
throw(:found, true) if found
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
rescue Exception => e
|
72
|
+
# Not sure why we'd get here, but we strive for error-freedom here, yessir.
|
73
|
+
add_error("Error loading tables #{scopes.list_join(', ')}: #{e}")
|
74
|
+
end
|
75
|
+
|
76
|
+
end
|
77
|
+
|
78
|
+
end
|