iron-import 0.6.1 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/History.txt +16 -1
- data/README.rdoc +43 -16
- data/Version.txt +1 -1
- data/lib/iron/import/column.rb +27 -14
- data/lib/iron/import/csv_reader.rb +4 -4
- data/lib/iron/import/custom_reader.rb +14 -8
- data/lib/iron/import/data_reader.rb +42 -30
- data/lib/iron/import/error.rb +4 -16
- data/lib/iron/import/excel_reader.rb +69 -0
- data/lib/iron/import/html_reader.rb +78 -0
- data/lib/iron/import/importer.rb +432 -103
- data/lib/iron/import/row.rb +15 -11
- data/lib/iron/import/xls_reader.rb +3 -37
- data/lib/iron/import/xlsx_reader.rb +2 -37
- data/lib/iron/import.rb +2 -1
- data/spec/importer/column_spec.rb +4 -5
- data/spec/importer/csv_reader_spec.rb +1 -1
- data/spec/importer/custom_reader_spec.rb +6 -10
- data/spec/importer/data_reader_spec.rb +6 -5
- data/spec/importer/html_reader_spec.rb +105 -0
- data/spec/importer/importer_spec.rb +107 -0
- data/spec/importer/row_spec.rb +9 -2
- data/spec/importer/xls_reader_spec.rb +77 -0
- data/spec/importer/xlsx_reader_spec.rb +2 -3
- data/spec/samples/3-sheets.xls +0 -0
- data/spec/samples/col-span.html +29 -0
- data/spec/samples/html-th-td.html +11 -0
- data/spec/samples/multi-table.html +29 -0
- data/spec/samples/nanodrop.xlsx +0 -0
- data/spec/samples/scores.html +30 -0
- data/spec/samples/simple.html +14 -0
- data/spec/spec_helper.rb +1 -0
- metadata +30 -8
- data/lib/iron/import/sheet.rb +0 -263
- data/spec/importer/sheet_spec.rb +0 -65
@@ -0,0 +1,30 @@
|
|
1
|
+
<table id="table_4ae4f429_6b24_496d_8dae_7c31a7644644" class="mx-grid boxscore d-b-s">
|
2
|
+
<thead>
|
3
|
+
<tr class="primary-header-row ">
|
4
|
+
<th class=" team string first" scope="col" data-cl="9"><span>1/31 @ 8p</span></th>
|
5
|
+
<th class=" quarter1 string score dw" scope="col" data-cl="2" title="Quarter 1"><span>Q1</span></th>
|
6
|
+
<th class=" quarter2 string score dw" scope="col" data-cl="2" title="Quarter 2"><span>Q2</span></th>
|
7
|
+
<th class=" quarter3 string score dw" scope="col" data-cl="2" title="Quarter 3"><span>Q3</span></th>
|
8
|
+
<th class=" quarter4 string score dw" scope="col" data-cl="2" title="Quarter 4"><span>Q4</span></th>
|
9
|
+
<th class=" score string last total score" scope="col" data-cl="5" title="Final"><span>Final</span></th>
|
10
|
+
</tr>
|
11
|
+
</thead>
|
12
|
+
<tbody>
|
13
|
+
<tr class="first">
|
14
|
+
<th class="team first" scope="row"><a href="/high-schools/mcdowell-titans-(marion,nc)/basketball/home.htm" >McDowell</a></th>
|
15
|
+
<td class="quarter1 score dw">16</td>
|
16
|
+
<td class="quarter2 score dw">20</td>
|
17
|
+
<td class="quarter3 score dw">27</td>
|
18
|
+
<td class="quarter4 score dw">17</td>
|
19
|
+
<td class="score last total score">80</td>
|
20
|
+
</tr>
|
21
|
+
<tr class="last alternate">
|
22
|
+
<th class="team first" scope="row"><a href="/high-schools/asheville-cougars-(asheville,nc)/basketball/home.htm" >Asheville</a></th>
|
23
|
+
<td class="quarter1 score dw">13</td>
|
24
|
+
<td class="quarter2 score dw">17</td>
|
25
|
+
<td class="quarter3 score dw">17</td>
|
26
|
+
<td class="quarter4 score dw">13</td>
|
27
|
+
<td class="score last total score">60</td>
|
28
|
+
</tr>
|
29
|
+
</tbody>
|
30
|
+
</table>
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: iron-import
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.1
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rob Morris
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-02-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: iron-extensions
|
@@ -72,9 +72,23 @@ dependencies:
|
|
72
72
|
- - "~>"
|
73
73
|
- !ruby/object:Gem::Version
|
74
74
|
version: '1.13'
|
75
|
-
|
76
|
-
|
77
|
-
|
75
|
+
- !ruby/object:Gem::Dependency
|
76
|
+
name: nokogiri
|
77
|
+
requirement: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - "~>"
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: '1.6'
|
82
|
+
type: :development
|
83
|
+
prerelease: false
|
84
|
+
version_requirements: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - "~>"
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '1.6'
|
89
|
+
description: Simple yet powerful library for importing tabular data from CSV, HTML,
|
90
|
+
XLS and XLSX files, including support for auto-detecting column order, parsing/validating
|
91
|
+
cell data, aggregating errors, etc.
|
78
92
|
email:
|
79
93
|
- rob@irongaze.com
|
80
94
|
executables: []
|
@@ -93,22 +107,30 @@ files:
|
|
93
107
|
- lib/iron/import/custom_reader.rb
|
94
108
|
- lib/iron/import/data_reader.rb
|
95
109
|
- lib/iron/import/error.rb
|
110
|
+
- lib/iron/import/excel_reader.rb
|
111
|
+
- lib/iron/import/html_reader.rb
|
96
112
|
- lib/iron/import/importer.rb
|
97
113
|
- lib/iron/import/row.rb
|
98
|
-
- lib/iron/import/sheet.rb
|
99
114
|
- lib/iron/import/xls_reader.rb
|
100
115
|
- lib/iron/import/xlsx_reader.rb
|
101
116
|
- spec/importer/column_spec.rb
|
102
117
|
- spec/importer/csv_reader_spec.rb
|
103
118
|
- spec/importer/custom_reader_spec.rb
|
104
119
|
- spec/importer/data_reader_spec.rb
|
120
|
+
- spec/importer/html_reader_spec.rb
|
105
121
|
- spec/importer/importer_spec.rb
|
106
122
|
- spec/importer/row_spec.rb
|
107
|
-
- spec/importer/sheet_spec.rb
|
123
|
+
- spec/importer/xls_reader_spec.rb
|
108
124
|
- spec/importer/xlsx_reader_spec.rb
|
125
|
+
- spec/samples/3-sheets.xls
|
126
|
+
- spec/samples/col-span.html
|
127
|
+
- spec/samples/html-th-td.html
|
109
128
|
- spec/samples/icd10-custom.txt
|
129
|
+
- spec/samples/multi-table.html
|
110
130
|
- spec/samples/nanodrop.xlsx
|
131
|
+
- spec/samples/scores.html
|
111
132
|
- spec/samples/simple.csv
|
133
|
+
- spec/samples/simple.html
|
112
134
|
- spec/samples/test-products.xls
|
113
135
|
- spec/spec_helper.rb
|
114
136
|
homepage: http://irongaze.com
|
@@ -134,5 +156,5 @@ rubyforge_project:
|
|
134
156
|
rubygems_version: 2.4.3
|
135
157
|
signing_key:
|
136
158
|
specification_version: 4
|
137
|
-
summary: CSV, XLS, and XLSX import automation support
|
159
|
+
summary: CSV, HTML, XLS, and XLSX import automation support
|
138
160
|
test_files: []
|
data/lib/iron/import/sheet.rb
DELETED
@@ -1,263 +0,0 @@
|
|
1
|
-
class Importer
|
2
|
-
|
3
|
-
# The Sheet class handles building the sheet's column configuration and other
|
4
|
-
# setup, then holds all load-time row data. In some file types (Excel mostly)
|
5
|
-
# there may be more than one sheet definition in a given importer. In others,
|
6
|
-
# the default sheet is the only one (possibly implicitly) defined.
|
7
|
-
#
|
8
|
-
# The following builder options are available:
|
9
|
-
#
|
10
|
-
# Importer.build do
|
11
|
-
# sheet('Some Sheet Name') do
|
12
|
-
# # Don't try to look for a header using column definitions, there is no header
|
13
|
-
# headerless!
|
14
|
-
#
|
15
|
-
# # Manually set the start row for data in this sheet, defaults to nil
|
16
|
-
# indicating that the data rows start immediately following the header.
|
17
|
-
# start_row 4
|
18
|
-
#
|
19
|
-
# # Define a filter that will skip unneeded rows. The filter command takes
|
20
|
-
# # a block that receives the parsed (but not validated!) row data as an
|
21
|
-
# # associative hash of :col_key => <parsed value>, and returns
|
22
|
-
# # true to keep the row or false to exclude it.
|
23
|
-
# filter do |row|
|
24
|
-
# row[:id].to_i > 5000
|
25
|
-
# end
|
26
|
-
#
|
27
|
-
# # Of course, the main thing to do in a sheet is define columns. See the
|
28
|
-
# # Column class' notes for options when defining a column. Note that
|
29
|
-
# # you can define columns using either hash-style:
|
30
|
-
# column :id, :type => :integer
|
31
|
-
# # or builder-style:
|
32
|
-
# column :name do
|
33
|
-
# header /company\s*name/
|
34
|
-
# type :string
|
35
|
-
# end
|
36
|
-
# end
|
37
|
-
class Sheet
|
38
|
-
|
39
|
-
# Inner class for holding load-time data that gets reset on each load call
|
40
|
-
class Data
|
41
|
-
attr_accessor :start_row, :rows
|
42
|
-
def initialize
|
43
|
-
@start_row = nil
|
44
|
-
@rows = []
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
# Key data
|
49
|
-
attr_reader :importer
|
50
|
-
attr_reader :columns
|
51
|
-
attr_reader :data
|
52
|
-
|
53
|
-
# Settings
|
54
|
-
dsl_flag :headerless
|
55
|
-
dsl_accessor :id
|
56
|
-
dsl_accessor :start_row
|
57
|
-
dsl_accessor :filter
|
58
|
-
|
59
|
-
def initialize(importer, id)
|
60
|
-
@importer = importer
|
61
|
-
@id = id
|
62
|
-
|
63
|
-
@headerless = false
|
64
|
-
@start_row = nil
|
65
|
-
@filter = nil
|
66
|
-
|
67
|
-
@columns = []
|
68
|
-
|
69
|
-
reset
|
70
|
-
end
|
71
|
-
|
72
|
-
# Define our columns etc. via builder-style method calling
|
73
|
-
def build(&block)
|
74
|
-
DslProxy.exec(self, &block)
|
75
|
-
end
|
76
|
-
|
77
|
-
# Call with a block accepting a single Importer::Row with contents that
|
78
|
-
# look like :column_key => <parsed value>. Any filtered rows
|
79
|
-
# will not be present. If you want to register an error, simply
|
80
|
-
# raise "some text" and it will be added to the importer's error
|
81
|
-
# list for display to the user, logging, or whatever.
|
82
|
-
def process
|
83
|
-
@data.rows.each do |row|
|
84
|
-
begin
|
85
|
-
yield row
|
86
|
-
rescue Exception => e
|
87
|
-
@importer.add_error(row, e.to_s)
|
88
|
-
end
|
89
|
-
end
|
90
|
-
end
|
91
|
-
|
92
|
-
# Add a new column definition to our list, allows customizing the new
|
93
|
-
# column with a builder block. See Importer::Column docs for
|
94
|
-
# options. In lieu of a builder mode, you can pass the same values
|
95
|
-
# as key => value pairs in the options hash to this method, so:
|
96
|
-
#
|
97
|
-
# column(:foo) do
|
98
|
-
# type :string
|
99
|
-
# parse do |val|
|
100
|
-
# val.to_s.upcase
|
101
|
-
# end
|
102
|
-
# end
|
103
|
-
#
|
104
|
-
# Is equivalent to:
|
105
|
-
#
|
106
|
-
# column(:foo, :type => :string, :parse => lambda {|val| val.to_s.upcase})
|
107
|
-
#
|
108
|
-
# Use whichever you prefer!
|
109
|
-
def column(key, options_hash = {}, &block)
|
110
|
-
# Find existing column with key to allow re-opening an existing definition
|
111
|
-
col = @columns.detect {|c| c.key == key }
|
112
|
-
unless col
|
113
|
-
# if none found, add a new one
|
114
|
-
col = Column.new(self, key, options_hash)
|
115
|
-
@columns << col
|
116
|
-
end
|
117
|
-
|
118
|
-
# Customize if needed
|
119
|
-
DslProxy::exec(col, &block) if block
|
120
|
-
|
121
|
-
col
|
122
|
-
end
|
123
|
-
|
124
|
-
# Reset for load attempt
|
125
|
-
def reset
|
126
|
-
@data = Data.new
|
127
|
-
end
|
128
|
-
|
129
|
-
def parse_raw_data(raw_rows)
|
130
|
-
# Find our column layout, start of data, etc
|
131
|
-
if parse_header(raw_rows)
|
132
|
-
# Now, run all the data and add it as a Row instance
|
133
|
-
raw_rows.each_with_index do |raw, index|
|
134
|
-
row_num = index + 1
|
135
|
-
if row_num >= @data.start_row
|
136
|
-
add_row(row_num, raw)
|
137
|
-
end
|
138
|
-
end
|
139
|
-
end
|
140
|
-
end
|
141
|
-
|
142
|
-
# Add a new row to our stash
|
143
|
-
def add_row(line, raw_data)
|
144
|
-
# Gracefully handle custom parsers that return nil for a row's data
|
145
|
-
raw_data ||= []
|
146
|
-
# Add the row
|
147
|
-
row = Row.new(self, line)
|
148
|
-
|
149
|
-
# Parse out the values
|
150
|
-
values = {}
|
151
|
-
@columns.each do |col|
|
152
|
-
index = col.data.index
|
153
|
-
raw_val = raw_data[index]
|
154
|
-
if col.parse
|
155
|
-
# Use custom parser if this column has one
|
156
|
-
val = col.parse_value(row, raw_val)
|
157
|
-
else
|
158
|
-
# Otherwise use our standard parser
|
159
|
-
val = @importer.data.parse_value(raw_val, col.type)
|
160
|
-
end
|
161
|
-
values[col.key] = val
|
162
|
-
end
|
163
|
-
|
164
|
-
# Set the values and filter if needed
|
165
|
-
row.set_values(values)
|
166
|
-
return nil unless !@filter || @filter.call(row)
|
167
|
-
|
168
|
-
# Row is solid, now check for missing required vals
|
169
|
-
@columns.each do |col|
|
170
|
-
val = values[col.key]
|
171
|
-
if col.validate_value(row, val)
|
172
|
-
if col.required?
|
173
|
-
if values[col.key].nil?
|
174
|
-
@importer.add_error(row, "Missing required value for #{col}")
|
175
|
-
end
|
176
|
-
end
|
177
|
-
end
|
178
|
-
end
|
179
|
-
|
180
|
-
# We is good
|
181
|
-
@data.rows << row
|
182
|
-
row
|
183
|
-
end
|
184
|
-
|
185
|
-
# Process the raw values for the first rows in a sheet,
|
186
|
-
# and attempt to build a map of the column layout, and
|
187
|
-
# detect the first row of real data
|
188
|
-
def parse_header(raw_rows)
|
189
|
-
if headerless?
|
190
|
-
# Use implicit or explicit column position when told to not look for a header
|
191
|
-
next_index = 0
|
192
|
-
@columns.each do |col|
|
193
|
-
unless col.position.nil?
|
194
|
-
next_index = col.fixed_index
|
195
|
-
end
|
196
|
-
col.data.index = next_index
|
197
|
-
next_index += 1
|
198
|
-
end
|
199
|
-
@data.start_row = @start_row || 1
|
200
|
-
return true
|
201
|
-
|
202
|
-
else
|
203
|
-
# Match by testing
|
204
|
-
raw_rows.each_with_index do |row, i|
|
205
|
-
# Um, have data?
|
206
|
-
next unless row
|
207
|
-
|
208
|
-
# Set up for this iteration
|
209
|
-
remaining = @columns.dup
|
210
|
-
|
211
|
-
# Step through this row's raw values, and look for a matching column for all columns
|
212
|
-
row.each_with_index do |val, i|
|
213
|
-
col = remaining.detect {|c| c.match_header?(val.to_s, i) }
|
214
|
-
if col
|
215
|
-
remaining -= [col]
|
216
|
-
col.data.index = i
|
217
|
-
end
|
218
|
-
end
|
219
|
-
|
220
|
-
if remaining.empty?
|
221
|
-
# Found the cols, have a map, update our start row to be the next line and return!
|
222
|
-
@data.start_row = @start_row || i+2
|
223
|
-
return true
|
224
|
-
end
|
225
|
-
end
|
226
|
-
|
227
|
-
# If we get here, we're hosed
|
228
|
-
@importer.add_error(self, "Unable to locate required column header(s) in sheet")
|
229
|
-
false
|
230
|
-
end
|
231
|
-
end
|
232
|
-
|
233
|
-
# When true, the given sheet name or zero-based index
|
234
|
-
# is a match with our id.
|
235
|
-
def match_sheet?(name, index)
|
236
|
-
if @id.is_a?(Fixnum)
|
237
|
-
@id.to_i == index+1
|
238
|
-
else
|
239
|
-
@id.to_s.downcase == name.downcase
|
240
|
-
end
|
241
|
-
end
|
242
|
-
|
243
|
-
def add_error(msg)
|
244
|
-
@importer.add_error(self, msg)
|
245
|
-
end
|
246
|
-
|
247
|
-
def add_warning(msg)
|
248
|
-
@importer.add_warning(self, msg)
|
249
|
-
end
|
250
|
-
|
251
|
-
def to_s
|
252
|
-
"Sheet #{@id}"
|
253
|
-
end
|
254
|
-
|
255
|
-
# Return all parsed, filtered data in the sheet as an
|
256
|
-
# array of arrays.
|
257
|
-
def dump
|
258
|
-
@data.rows.collect(&:values)
|
259
|
-
end
|
260
|
-
|
261
|
-
end
|
262
|
-
|
263
|
-
end
|
data/spec/importer/sheet_spec.rb
DELETED
@@ -1,65 +0,0 @@
|
|
1
|
-
describe Importer::Sheet do
|
2
|
-
|
3
|
-
before do
|
4
|
-
@importer = Importer.new
|
5
|
-
@sheet = @importer.default_sheet
|
6
|
-
end
|
7
|
-
|
8
|
-
it 'should respond to build' do
|
9
|
-
@sheet.should respond_to(:build)
|
10
|
-
@sheet.build do
|
11
|
-
column :foo
|
12
|
-
end
|
13
|
-
@sheet.columns.count.should == 1
|
14
|
-
end
|
15
|
-
|
16
|
-
it 'should define columns' do
|
17
|
-
@sheet.column(:foo)
|
18
|
-
@sheet.columns.count.should == 1
|
19
|
-
end
|
20
|
-
|
21
|
-
it 'should find headers automatically' do
|
22
|
-
# Define a few sample columns
|
23
|
-
@sheet.column(:alpha)
|
24
|
-
@sheet.column(:gamma)
|
25
|
-
# Some dummy data
|
26
|
-
rows = [
|
27
|
-
['', '', '', ''],
|
28
|
-
['Alpha', 'Beta', 'Gamma', 'Epsilon']
|
29
|
-
]
|
30
|
-
|
31
|
-
# Parse it!
|
32
|
-
@sheet.parse_header(rows).should be_true
|
33
|
-
|
34
|
-
@sheet.column(:alpha).data.index.should == 0
|
35
|
-
@sheet.column(:gamma).data.index.should == 2
|
36
|
-
@sheet.data.start_row.should == 3
|
37
|
-
end
|
38
|
-
|
39
|
-
it 'should record an error if a column can\'t be found' do
|
40
|
-
# Define a few sample columns
|
41
|
-
@sheet.column(:alpha)
|
42
|
-
@sheet.column(:gamma)
|
43
|
-
# Some dummy data
|
44
|
-
rows = [
|
45
|
-
['', '', '', ''],
|
46
|
-
['Bob', 'Beta', 'Gamma', 'Epsilon']
|
47
|
-
]
|
48
|
-
|
49
|
-
# Parse it!
|
50
|
-
@sheet.parse_header(rows).should be_false
|
51
|
-
@importer.errors.count.should == 1
|
52
|
-
@importer.error_summary.should =~ /unable to locate required column header/i
|
53
|
-
end
|
54
|
-
|
55
|
-
it 'should match by sheet name or number' do
|
56
|
-
@sheet.id = 5
|
57
|
-
@sheet.match_sheet?('foo', 3).should be_false
|
58
|
-
@sheet.match_sheet?('foo', 4).should be_true
|
59
|
-
|
60
|
-
@sheet.id = 'Sheet 5'
|
61
|
-
@sheet.match_sheet?('Sheet', 4).should be_false
|
62
|
-
@sheet.match_sheet?('Sheet 5', 3).should be_true
|
63
|
-
end
|
64
|
-
|
65
|
-
end
|