iron-import 0.6.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/History.txt +16 -1
- data/README.rdoc +43 -16
- data/Version.txt +1 -1
- data/lib/iron/import/column.rb +27 -14
- data/lib/iron/import/csv_reader.rb +4 -4
- data/lib/iron/import/custom_reader.rb +14 -8
- data/lib/iron/import/data_reader.rb +42 -30
- data/lib/iron/import/error.rb +4 -16
- data/lib/iron/import/excel_reader.rb +69 -0
- data/lib/iron/import/html_reader.rb +78 -0
- data/lib/iron/import/importer.rb +432 -103
- data/lib/iron/import/row.rb +15 -11
- data/lib/iron/import/xls_reader.rb +3 -37
- data/lib/iron/import/xlsx_reader.rb +2 -37
- data/lib/iron/import.rb +2 -1
- data/spec/importer/column_spec.rb +4 -5
- data/spec/importer/csv_reader_spec.rb +1 -1
- data/spec/importer/custom_reader_spec.rb +6 -10
- data/spec/importer/data_reader_spec.rb +6 -5
- data/spec/importer/html_reader_spec.rb +105 -0
- data/spec/importer/importer_spec.rb +107 -0
- data/spec/importer/row_spec.rb +9 -2
- data/spec/importer/xls_reader_spec.rb +77 -0
- data/spec/importer/xlsx_reader_spec.rb +2 -3
- data/spec/samples/3-sheets.xls +0 -0
- data/spec/samples/col-span.html +29 -0
- data/spec/samples/html-th-td.html +11 -0
- data/spec/samples/multi-table.html +29 -0
- data/spec/samples/nanodrop.xlsx +0 -0
- data/spec/samples/scores.html +30 -0
- data/spec/samples/simple.html +14 -0
- data/spec/spec_helper.rb +1 -0
- metadata +30 -8
- data/lib/iron/import/sheet.rb +0 -263
- data/spec/importer/sheet_spec.rb +0 -65
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6931785ff3b1cb03394e6a8be0943337c589ac90
|
4
|
+
data.tar.gz: 82a7b1f123134eea5388672e8aed578590cd2daa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d5d689572ef2680fce8273cc22c05ba76b2dc9a8a56f07c890d64ba6f1b4d4aba78f6bffea8f2b2b2eb9bdf057115bcf6911241f1febe567c7caee3a9eea6011
|
7
|
+
data.tar.gz: 4fafd479220b82ed7fd983b77b1c54d73ba993442d29c4b19a37c087e94feb35c25517549966fff2426be333491f71f59e23c20650c02a3ca5f6b34e4e6f4e91
|
data/History.txt
CHANGED
@@ -1,3 +1,18 @@
|
|
1
|
+
== 0.7.0 / 2017-02-16
|
2
|
+
|
3
|
+
* Breaking Change: Removed multi-sheet support - use multiple importers instead
|
4
|
+
* Breaking Change: Removed warnings as they were not being used
|
5
|
+
* Breaking Change: Removed Column#required! due to bugginess and overlap with Column#validate
|
6
|
+
* Add Importer#scope to allow narrowing the search to one or more sheets/tables when importing
|
7
|
+
* Add new HtmlReader support to handle parsing HTML <table> rows
|
8
|
+
* Modify Importer#import to support block mode combining #import and #process
|
9
|
+
* Add Importer#import_string for handling explicit CSV/HTML/Custom text
|
10
|
+
* Add Importer#on_error(&block) to allow inline conditional error handling
|
11
|
+
* Improve error message when headers can't be detected to list missing headers
|
12
|
+
* Change Importer#error_summary to group identical errors into single summary line
|
13
|
+
* Improve :cents type column rounding to better handle floating point ugliness
|
14
|
+
* Much improved test coverage and documentation
|
15
|
+
|
1
16
|
== 0.6.1 / 2015-08-24
|
2
17
|
|
3
18
|
* Better handling for nil return value in custom format readers
|
@@ -8,7 +23,7 @@
|
|
8
23
|
* Vastly improved internal and user-facing comments
|
9
24
|
* Improved error logging, replaced some exceptions with errors
|
10
25
|
|
11
|
-
== 0.5.0 / 2015-
|
26
|
+
== 0.5.0 / 2015-03-19
|
12
27
|
|
13
28
|
* Initial revision
|
14
29
|
* Support for CSV, XLS and XLSX importing
|
data/README.rdoc
CHANGED
@@ -7,8 +7,8 @@ Written by Rob Morris @ Irongaze Consulting LLC (http://irongaze.com)
|
|
7
7
|
Simple, reliable tabular data import.
|
8
8
|
|
9
9
|
This gem provides a set of classes to support automating import of tabular data from
|
10
|
-
CSV, XLS and XLSX files, or custom formats via a simple block reader. Provides help
|
11
|
-
in defining columns, auto-detecting column order, pre-parsing data, and error
|
10
|
+
CSV, HTML, XLS and XLSX files, or custom formats via a simple block reader. Provides help
|
11
|
+
in defining columns, auto-detecting column order, pre-parsing data, and error tracking.
|
12
12
|
|
13
13
|
The Roo/Spreadsheet gems do a great job of providing general purpose spreadsheet reading.
|
14
14
|
However, using them with unreliable user submitted data requires a lot of error checking,
|
@@ -21,37 +21,64 @@ This is NOT a general-purpose tool for reading spreadsheets. If you want access
|
|
21
21
|
cell styling, reading underlying formulas, etc., you will be better served building
|
22
22
|
a custom importer based on Roo. But if you're looking to take an uploaded CSV file,
|
23
23
|
validate and coerce values, then write each row to a database, all the while tracking
|
24
|
-
any
|
24
|
+
any errors encountered... well, this is the library for you!
|
25
25
|
|
26
26
|
IMPORTANT NOTE: this gem is in flux as we work to define the best possible abstraction
|
27
|
-
for the task. Breaking changes will be noted by increases in the
|
27
|
+
for the task. Breaking changes will be noted by increases in the minor version,
|
28
28
|
ie 0.5.0 and 0.5.1 will be compatible, but 0.6.0 will not (i.e. we follow semantic versioning).
|
29
29
|
|
30
30
|
== SAMPLE USAGE
|
31
31
|
|
32
|
-
# Define our importer, with
|
33
|
-
# "name" and "
|
32
|
+
# Define our importer, with three columns. The importer will look for a row containing
|
33
|
+
# "name"/"product", "description" and "price" (case insensitively) and automatically determine column
|
34
34
|
# order and starting row of the data.
|
35
35
|
importer = Importer.build do
|
36
|
-
column :name
|
37
|
-
|
36
|
+
column :name do
|
37
|
+
# Column order and start row are auto-detected
|
38
|
+
header /(name|product)/i
|
39
|
+
end
|
40
|
+
column :description do
|
41
|
+
# Columns can do custom parsing
|
42
|
+
parse do |raw_val|
|
43
|
+
raw_val.to_s.strip
|
44
|
+
end
|
45
|
+
# And per-row validation
|
46
|
+
validate do |parsed_val|
|
47
|
+
raise "Invalid description" unless parsed_val.length > 5
|
48
|
+
end
|
49
|
+
end
|
50
|
+
column :price do
|
51
|
+
# Built in type conversion handles common cases
|
52
|
+
type :cents
|
53
|
+
end
|
54
|
+
|
55
|
+
# Need to skip rows? Use a filter!
|
56
|
+
filter do |row|
|
57
|
+
row[:price] != 0 && row[:name] != 'Sample'
|
58
|
+
end
|
38
59
|
end
|
39
60
|
|
40
|
-
# Import the provided file row-by-row if importing succeeds, automatically
|
61
|
+
# Import the provided file row-by-row (if importing succeeds), automatically
|
41
62
|
# using the proper library to read CSV data. This same code would work
|
42
63
|
# with XLS or XLSX files with no changes to the code.
|
43
|
-
|
44
|
-
|
45
|
-
puts row[:name] + ' = ' + row[:description]
|
46
|
-
end
|
64
|
+
importer.import('/tmp/source.csv') do |row|
|
65
|
+
puts row[:name] + ' = ' + row[:description]
|
47
66
|
end
|
48
67
|
|
68
|
+
# Check for errors and do the right thing:
|
69
|
+
importer.on_error do
|
70
|
+
puts "Error: " + error_summary
|
71
|
+
end
|
72
|
+
|
49
73
|
== REQUIREMENTS
|
50
74
|
|
51
|
-
Depends on the iron-extensions and iron-dsl gems
|
52
|
-
|
75
|
+
Depends on the iron-extensions and iron-dsl gems for CSV and custom import formats.
|
76
|
+
|
77
|
+
Optionally requires the roo gem to support XLS and XLSX import and parsing.
|
78
|
+
|
79
|
+
Optionally requires the nokogiri gem to support HTML import and parsing.
|
53
80
|
|
54
|
-
Requires RSpec and roo to build/test.
|
81
|
+
Requires RSpec, nokogiri and roo to build/test.
|
55
82
|
|
56
83
|
== INSTALLATION
|
57
84
|
|
data/Version.txt
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.7.0
|
data/lib/iron/import/column.rb
CHANGED
@@ -26,7 +26,11 @@ class Importer
|
|
26
26
|
# # seems like the "same" source value, for example an Excel source file
|
27
27
|
# # will give you a float value for all numeric types, even "integers".
|
28
28
|
# parse do |raw_value|
|
29
|
-
# raw_value.to_i + 1000
|
29
|
+
# val = raw_value.to_i + 1000
|
30
|
+
# # NOTE: we're in a block, so don't do this:
|
31
|
+
# return val
|
32
|
+
# # Instead, use implied return:
|
33
|
+
# val
|
30
34
|
# end
|
31
35
|
#
|
32
36
|
# # You can also add a custom validator to check the value and add
|
@@ -54,7 +58,6 @@ class Importer
|
|
54
58
|
attr_reader :data
|
55
59
|
|
56
60
|
# Configuration
|
57
|
-
dsl_flag :required
|
58
61
|
dsl_accessor :header, :position, :type
|
59
62
|
dsl_accessor :parse, :validate
|
60
63
|
|
@@ -85,21 +88,17 @@ class Importer
|
|
85
88
|
str
|
86
89
|
end
|
87
90
|
|
88
|
-
# Create a new column definition
|
91
|
+
# Create a new column definition with the key for the column,
|
89
92
|
# and an optional set of options. The options supported are the same as those supported
|
90
93
|
# in block/builder mode.
|
91
|
-
def initialize(
|
94
|
+
def initialize(importer, key, options_hash = {})
|
92
95
|
# Save off our info
|
93
96
|
@key = key
|
94
|
-
@
|
95
|
-
@importer = @sheet.importer
|
97
|
+
@importer = importer
|
96
98
|
|
97
99
|
# Return it as a string, by default
|
98
100
|
@type = options_hash.delete(:type) { :string }
|
99
101
|
|
100
|
-
# By default, we allow empty values
|
101
|
-
@required = options_hash.delete(:required) { false }
|
102
|
-
|
103
102
|
# Position can be explicitly set
|
104
103
|
@position = options_hash.delete(:position)
|
105
104
|
|
@@ -126,6 +125,19 @@ class Importer
|
|
126
125
|
def reset
|
127
126
|
@data = Data.new
|
128
127
|
end
|
128
|
+
|
129
|
+
# DEPRECATED - duplicates functionality better provided by #validate, e.g.
|
130
|
+
#
|
131
|
+
# validate do |val|
|
132
|
+
# raise 'Missing required value for column foo' if val.nil?
|
133
|
+
# end
|
134
|
+
def required!
|
135
|
+
Kernel.warn "[DEPRECATION] Importer::Column#required! is deprecated. Please use #validate instead."
|
136
|
+
col = self.key
|
137
|
+
validate do |val|
|
138
|
+
raise "Missing required value for column :#{col}"
|
139
|
+
end
|
140
|
+
end
|
129
141
|
|
130
142
|
# When true, our header definition or index match the passed text or column index.
|
131
143
|
def match_header?(text, index)
|
@@ -151,7 +163,7 @@ class Importer
|
|
151
163
|
|
152
164
|
# Applies any validation to a parsed value
|
153
165
|
def validate_value(row, val)
|
154
|
-
return unless @validate
|
166
|
+
return true unless @validate
|
155
167
|
begin
|
156
168
|
@validate.call(val)
|
157
169
|
true
|
@@ -178,20 +190,21 @@ class Importer
|
|
178
190
|
'Column ' + @data.pos
|
179
191
|
end
|
180
192
|
|
181
|
-
# Extracts the
|
193
|
+
# Extracts the imported values for this column and returns them in an array.
|
182
194
|
# Note that the array indices ARE NOT row indices, as the rows may have been
|
183
195
|
# filtered and any header rows have been skipped.
|
184
196
|
def to_a
|
185
|
-
@
|
197
|
+
@importer.data.rows.collect {|r| r[@key] }
|
186
198
|
end
|
187
199
|
|
188
|
-
# Extracts the
|
200
|
+
# Extracts the values for this column and returns them in a hash of
|
189
201
|
# row num => value for all non-filtered, non-header rows.
|
190
202
|
def to_h
|
191
203
|
res = {}
|
192
|
-
@
|
204
|
+
@importer.data.rows.collect {|r| res[r.num] = r[@key] }
|
193
205
|
res
|
194
206
|
end
|
207
|
+
def to_hash ; to_h ; end
|
195
208
|
|
196
209
|
end
|
197
210
|
|
@@ -30,10 +30,10 @@ class Importer
|
|
30
30
|
end
|
31
31
|
end
|
32
32
|
|
33
|
-
# Normally, we'd check the
|
34
|
-
# there's only one
|
35
|
-
def
|
36
|
-
@raw_rows
|
33
|
+
# Normally, we'd check the scopes and return the proper data, but for CSV files,
|
34
|
+
# there's only one scope...
|
35
|
+
def load_raw(scopes, &block)
|
36
|
+
block.call(@raw_rows)
|
37
37
|
end
|
38
38
|
|
39
39
|
end
|
@@ -24,19 +24,25 @@ class Importer
|
|
24
24
|
@source = source
|
25
25
|
end
|
26
26
|
|
27
|
-
def
|
27
|
+
def load_raw(scopes, &block)
|
28
|
+
# Default to just running one scope passing nil
|
29
|
+
if scopes.nil? || scopes.empty?
|
30
|
+
scopes = [nil]
|
31
|
+
end
|
32
|
+
|
33
|
+
# Get the proper reader
|
28
34
|
reader = @readers[@mode]
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
35
|
+
scopes.each do |scope|
|
36
|
+
rows = DslProxy.exec(self, @source, scope, &reader)
|
37
|
+
if rows.is_a?(Array) && !@importer.has_errors?
|
38
|
+
found = block.call(rows)
|
39
|
+
break if found
|
40
|
+
end
|
34
41
|
end
|
35
42
|
|
36
43
|
rescue Exception => e
|
37
44
|
# Catch any exceptions thrown and note them with helpful stacktrace info for debugging custom readers
|
38
|
-
|
39
|
-
false
|
45
|
+
add_error("Error in custom reader: #{e} @ #{e.backtrace.first}")
|
40
46
|
end
|
41
47
|
|
42
48
|
end
|
@@ -11,6 +11,16 @@ class Importer
|
|
11
11
|
def self.verify_roo!
|
12
12
|
if Gem::Specification.find_all_by_name('roo', '~> 1.13.0').empty?
|
13
13
|
raise "You are attempting to use the iron-import gem to import an Excel file. Doing so requires installing the roo gem, version 1.13.0 or later."
|
14
|
+
else
|
15
|
+
require 'roo'
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.verify_nokogiri!
|
20
|
+
if Gem::Specification.find_all_by_name('nokogiri', '~> 1.6.0').empty?
|
21
|
+
raise "You are attempting to use the iron-import gem to import an HTML file. Doing so requires installing the nokogiri gem, version 1.6.0 or later."
|
22
|
+
else
|
23
|
+
require 'nokogiri'
|
14
24
|
end
|
15
25
|
end
|
16
26
|
|
@@ -42,6 +52,9 @@ class Importer
|
|
42
52
|
when :xlsx
|
43
53
|
verify_roo!
|
44
54
|
XlsxReader.new(importer)
|
55
|
+
when :html
|
56
|
+
verify_nokogiri!
|
57
|
+
HtmlReader.new(importer)
|
45
58
|
else
|
46
59
|
nil
|
47
60
|
end
|
@@ -49,9 +62,11 @@ class Importer
|
|
49
62
|
|
50
63
|
# Figure out which format to use for a given path based on file name
|
51
64
|
def self.for_path(importer, path)
|
52
|
-
format = path.to_s.extract(/\.(csv|xlsx?)\z/i)
|
65
|
+
format = path.to_s.extract(/\.(csv|html?|xlsx?)\z/i)
|
53
66
|
if format
|
54
|
-
format = format.downcase
|
67
|
+
format = format.downcase
|
68
|
+
format = 'html' if format == 'htm'
|
69
|
+
format = format.to_sym
|
55
70
|
for_format(importer, format)
|
56
71
|
else
|
57
72
|
nil
|
@@ -90,6 +105,10 @@ class Importer
|
|
90
105
|
@supports = []
|
91
106
|
end
|
92
107
|
|
108
|
+
def supports?(mode)
|
109
|
+
@supports.include?(mode)
|
110
|
+
end
|
111
|
+
|
93
112
|
def supports_stream!
|
94
113
|
@supports << :stream
|
95
114
|
end
|
@@ -98,10 +117,6 @@ class Importer
|
|
98
117
|
@supports << :file
|
99
118
|
end
|
100
119
|
|
101
|
-
def supports?(mode)
|
102
|
-
@supports.include?(mode)
|
103
|
-
end
|
104
|
-
|
105
120
|
def supports_file?
|
106
121
|
supports?(:file)
|
107
122
|
end
|
@@ -114,13 +129,22 @@ class Importer
|
|
114
129
|
# a file path) and attempts to load it. Returns true if successful, false
|
115
130
|
# if not. If false, there will be one or more errors explaining what went
|
116
131
|
# wrong.
|
117
|
-
|
132
|
+
#
|
133
|
+
# Passed scopes are interpreted by each derived class as makes sense, but
|
134
|
+
# generally are used to target searching in multi-block formats such as
|
135
|
+
# Excel spreadsheets (sheet name/index) or HTML documents (css selectors,
|
136
|
+
# xpath selectors). If scopes is nil, all possible blocks will be checked.
|
137
|
+
#
|
138
|
+
# Each block is read in as raw data from the source, and passed to the
|
139
|
+
# given block as an array of arrays. If the block returns true, processing
|
140
|
+
# is stopped and no further blocks will be checked.
|
141
|
+
def load(path_or_stream, scopes = nil, &block)
|
118
142
|
# Figure out what we've been passed, and handle it
|
119
143
|
if self.class.is_stream?(path_or_stream)
|
120
144
|
# We have a stream (open file, upload, whatever)
|
121
145
|
if supports_stream?
|
122
146
|
# Stream loader defined, run it
|
123
|
-
|
147
|
+
load_each(:stream, path_or_stream, scopes, &block)
|
124
148
|
else
|
125
149
|
# Write to temp file, as some of our readers only read physical files, annoyingly
|
126
150
|
file = Tempfile.new(['importer', ".#{format}"])
|
@@ -128,7 +152,7 @@ class Importer
|
|
128
152
|
begin
|
129
153
|
file.write path_or_stream.read
|
130
154
|
file.close
|
131
|
-
|
155
|
+
load_each(:file, file.path, scopes, &block)
|
132
156
|
ensure
|
133
157
|
file.close
|
134
158
|
file.unlink
|
@@ -140,18 +164,18 @@ class Importer
|
|
140
164
|
if File.exist?(path_or_stream)
|
141
165
|
if supports_file?
|
142
166
|
# We're all set, load up the given path
|
143
|
-
|
167
|
+
load_each(:file, path_or_stream, scopes, &block)
|
144
168
|
else
|
145
169
|
# No file handler, so open the file and run the stream processor
|
146
170
|
file = File.open(path_or_stream, 'rb')
|
147
|
-
|
171
|
+
load_each(:stream, file, scopes, &block)
|
148
172
|
end
|
149
173
|
else
|
150
|
-
|
174
|
+
add_error("Unable to locate source file #{path_or_stream}")
|
151
175
|
end
|
152
176
|
|
153
177
|
else
|
154
|
-
|
178
|
+
add_error("Unable to load data source - not a file path or stream: #{path_or_stream.inspect}")
|
155
179
|
end
|
156
180
|
|
157
181
|
# Return our status
|
@@ -159,20 +183,12 @@ class Importer
|
|
159
183
|
end
|
160
184
|
|
161
185
|
# Load up the sheets in the correct mode
|
162
|
-
def
|
186
|
+
def load_each(mode, source, scopes, &block)
|
163
187
|
# Let our derived classes open the file, etc. as they need
|
164
188
|
if init_source(mode, source)
|
165
189
|
# Once the source is set, run through each defined sheet, pass it to
|
166
190
|
# our sheet loader, and have the sheet parse it out.
|
167
|
-
|
168
|
-
res = load_raw_sheet(sheet)
|
169
|
-
if res === false
|
170
|
-
# D'oh.
|
171
|
-
else
|
172
|
-
# Tell the sheet to parse the data
|
173
|
-
sheet.parse_raw_data(res)
|
174
|
-
end
|
175
|
-
end
|
191
|
+
load_raw(scopes, &block)
|
176
192
|
end
|
177
193
|
end
|
178
194
|
|
@@ -185,8 +201,8 @@ class Importer
|
|
185
201
|
# Override this method in derived classes to take the given sheet definition,
|
186
202
|
# find that sheet in the input source, and read out the raw (unparsed) rows
|
187
203
|
# as an array of arrays. Return false if the sheet cannot be loaded.
|
188
|
-
def
|
189
|
-
raise "Unimplemented method #
|
204
|
+
def load_raw(scopes, &block)
|
205
|
+
raise "Unimplemented method #load_raw in data reader #{self.class.name}"
|
190
206
|
end
|
191
207
|
|
192
208
|
# Provides default value parsing/coercion for all derived data readers. Attempts to be clever and
|
@@ -241,7 +257,7 @@ class Importer
|
|
241
257
|
else
|
242
258
|
floatval = parse_value(val, :float)
|
243
259
|
if floatval
|
244
|
-
(floatval * 100).
|
260
|
+
(floatval * 100).round
|
245
261
|
else
|
246
262
|
nil
|
247
263
|
end
|
@@ -261,10 +277,6 @@ class Importer
|
|
261
277
|
@importer.add_error(*args)
|
262
278
|
end
|
263
279
|
|
264
|
-
def add_warning(*args)
|
265
|
-
@importer.add_warning(*args)
|
266
|
-
end
|
267
|
-
|
268
280
|
end
|
269
281
|
|
270
282
|
end
|
data/lib/iron/import/error.rb
CHANGED
@@ -2,14 +2,11 @@ class Importer
|
|
2
2
|
|
3
3
|
class Error
|
4
4
|
|
5
|
-
attr_reader :
|
5
|
+
attr_reader :row, :text
|
6
6
|
|
7
7
|
def initialize(context, text)
|
8
|
-
if context.is_a?(Importer::
|
9
|
-
@sheet = context
|
10
|
-
elsif context.is_a?(Importer::Row)
|
8
|
+
if context.is_a?(Importer::Row)
|
11
9
|
@row = context
|
12
|
-
@sheet = context.sheet
|
13
10
|
end
|
14
11
|
@text = text.to_s
|
15
12
|
end
|
@@ -17,9 +14,7 @@ class Importer
|
|
17
14
|
def summary
|
18
15
|
summary = ''
|
19
16
|
if @row
|
20
|
-
summary += "#{@
|
21
|
-
elsif @sheet
|
22
|
-
summary += "#{@sheet}: "
|
17
|
+
summary += "#{@row}: "
|
23
18
|
end
|
24
19
|
summary + @text
|
25
20
|
end
|
@@ -29,10 +24,9 @@ class Importer
|
|
29
24
|
end
|
30
25
|
|
31
26
|
# Returns the level at which this error occurred, one of
|
32
|
-
# :row, :
|
27
|
+
# :row, :importer
|
33
28
|
def level
|
34
29
|
return :row if @row
|
35
|
-
return :sheet if @sheet
|
36
30
|
return :importer
|
37
31
|
end
|
38
32
|
|
@@ -40,10 +34,6 @@ class Importer
|
|
40
34
|
level == :row
|
41
35
|
end
|
42
36
|
|
43
|
-
def sheet_level?
|
44
|
-
level == :sheet
|
45
|
-
end
|
46
|
-
|
47
37
|
def importer_level?
|
48
38
|
level == :importer
|
49
39
|
end
|
@@ -54,8 +44,6 @@ class Importer
|
|
54
44
|
case context
|
55
45
|
when Row
|
56
46
|
return @row == context
|
57
|
-
when Sheet
|
58
|
-
return @sheet == context
|
59
47
|
else
|
60
48
|
return true
|
61
49
|
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
class Importer
|
2
|
+
|
3
|
+
# Uses the Roo gem to read in .xls and .xlsx files
|
4
|
+
class ExcelReader < DataReader
|
5
|
+
|
6
|
+
def initialize(importer, format)
|
7
|
+
super(importer, format)
|
8
|
+
supports_file!
|
9
|
+
end
|
10
|
+
|
11
|
+
def init_source(mode, source)
|
12
|
+
if mode == :file
|
13
|
+
if @format == :xls
|
14
|
+
@spreadsheet = Roo::Excel.new(source, :file_warning => :ignore)
|
15
|
+
true
|
16
|
+
elsif @format == :xlsx
|
17
|
+
@spreadsheet = Roo::Excelx.new(source, :file_warning => :ignore)
|
18
|
+
true
|
19
|
+
else
|
20
|
+
add_error("Unknown format for Excel file: :#{@format}")
|
21
|
+
false
|
22
|
+
end
|
23
|
+
else
|
24
|
+
add_error("Unsupported #{@format.to_s.upcase} mode: #{mode}")
|
25
|
+
false
|
26
|
+
end
|
27
|
+
rescue Exception => e
|
28
|
+
add_error("Error reading file #{source}: #{e}")
|
29
|
+
false
|
30
|
+
end
|
31
|
+
|
32
|
+
def load_raw(scopes, &block)
|
33
|
+
@spreadsheet.sheets.each_with_index do |name, index|
|
34
|
+
# See if this sheet's name or index matches the requested sheet definition
|
35
|
+
if include_sheet?(scopes, name, index)
|
36
|
+
# Extract our raw data
|
37
|
+
raw_rows = []
|
38
|
+
@spreadsheet.sheet(name).each_with_index do |row, line|
|
39
|
+
raw_rows << row
|
40
|
+
end
|
41
|
+
# Yield our raw rows for this sheet
|
42
|
+
found = block.call(raw_rows)
|
43
|
+
# If we've found a working sheet, stop
|
44
|
+
return if found
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
rescue Exception => e
|
49
|
+
# Not sure why we'd get here, but we strive for error-freedom here, yessir.
|
50
|
+
@importer.add_error("Error loading Excel data: #{e}")
|
51
|
+
end
|
52
|
+
|
53
|
+
# When true, the given sheet name (compared case-insensitively) or zero-based index
|
54
|
+
# matches one of the passed scopes (integer scopes are one-based), or no scopes were given.
|
55
|
+
def include_sheet?(scopes, name, index)
|
56
|
+
return true if scopes.nil? || scopes.empty?
|
57
|
+
scopes.each do |scope|
|
58
|
+
if scope.is_a?(Fixnum)
|
59
|
+
return true if scope.to_i == index+1
|
60
|
+
else
|
61
|
+
return true if scope.to_s.downcase == name.downcase
|
62
|
+
end
|
63
|
+
end
|
64
|
+
false
|
65
|
+
end
|
66
|
+
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
class Importer
|
2
|
+
|
3
|
+
class HtmlReader < DataReader
|
4
|
+
|
5
|
+
def initialize(importer)
|
6
|
+
super(importer, :html)
|
7
|
+
supports_file!
|
8
|
+
supports_stream!
|
9
|
+
@tables = nil
|
10
|
+
end
|
11
|
+
|
12
|
+
def init_source(mode, source)
|
13
|
+
if mode == :stream
|
14
|
+
@html = Nokogiri::HTML(source)
|
15
|
+
elsif mode == :file
|
16
|
+
if File.exist?(source)
|
17
|
+
@html = File.open(source) {|f| Nokogiri::HTML(f) }
|
18
|
+
else
|
19
|
+
add_error("File not found: #{source}")
|
20
|
+
return false
|
21
|
+
end
|
22
|
+
else
|
23
|
+
add_error("Unsupported HTML mode: #{mode}")
|
24
|
+
return false
|
25
|
+
end
|
26
|
+
|
27
|
+
if @html
|
28
|
+
true
|
29
|
+
else
|
30
|
+
add_error("Failed parsing of HTML")
|
31
|
+
false
|
32
|
+
end
|
33
|
+
|
34
|
+
rescue Exception => e
|
35
|
+
add_error("Error reading HTML source #{source}: #{e}")
|
36
|
+
false
|
37
|
+
end
|
38
|
+
|
39
|
+
def load_raw(scopes, &block)
|
40
|
+
# Default to searching all tables in the document
|
41
|
+
if scopes.nil? || scopes.empty?
|
42
|
+
scopes = ['table']
|
43
|
+
end
|
44
|
+
|
45
|
+
# Catch here lets us break out of the nested loop cleanly
|
46
|
+
catch(:found) do
|
47
|
+
# Run each scope, which should be a valid css selector
|
48
|
+
scopes.each do |scope|
|
49
|
+
@html.css(scope).each do |table_node|
|
50
|
+
rows = []
|
51
|
+
table_node.css('tr').each do |row_node|
|
52
|
+
row = []
|
53
|
+
row_node.children.each do |cell_node|
|
54
|
+
if ['th', 'td'].include?(cell_node.name)
|
55
|
+
row << cell_node.text.strip
|
56
|
+
# Handle col-span values appropriately
|
57
|
+
span_count = cell_node.attr('colspan')
|
58
|
+
(span_count.to_i - 1).times do
|
59
|
+
row << nil
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
rows << row
|
64
|
+
end
|
65
|
+
found = block.call(rows)
|
66
|
+
throw(:found, true) if found
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
rescue Exception => e
|
72
|
+
# Not sure why we'd get here, but we strive for error-freedom here, yessir.
|
73
|
+
add_error("Error loading tables #{scopes.list_join(', ')}: #{e}")
|
74
|
+
end
|
75
|
+
|
76
|
+
end
|
77
|
+
|
78
|
+
end
|