iron-import 0.6.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/History.txt +16 -1
- data/README.rdoc +43 -16
- data/Version.txt +1 -1
- data/lib/iron/import/column.rb +27 -14
- data/lib/iron/import/csv_reader.rb +4 -4
- data/lib/iron/import/custom_reader.rb +14 -8
- data/lib/iron/import/data_reader.rb +42 -30
- data/lib/iron/import/error.rb +4 -16
- data/lib/iron/import/excel_reader.rb +69 -0
- data/lib/iron/import/html_reader.rb +78 -0
- data/lib/iron/import/importer.rb +432 -103
- data/lib/iron/import/row.rb +15 -11
- data/lib/iron/import/xls_reader.rb +3 -37
- data/lib/iron/import/xlsx_reader.rb +2 -37
- data/lib/iron/import.rb +2 -1
- data/spec/importer/column_spec.rb +4 -5
- data/spec/importer/csv_reader_spec.rb +1 -1
- data/spec/importer/custom_reader_spec.rb +6 -10
- data/spec/importer/data_reader_spec.rb +6 -5
- data/spec/importer/html_reader_spec.rb +105 -0
- data/spec/importer/importer_spec.rb +107 -0
- data/spec/importer/row_spec.rb +9 -2
- data/spec/importer/xls_reader_spec.rb +77 -0
- data/spec/importer/xlsx_reader_spec.rb +2 -3
- data/spec/samples/3-sheets.xls +0 -0
- data/spec/samples/col-span.html +29 -0
- data/spec/samples/html-th-td.html +11 -0
- data/spec/samples/multi-table.html +29 -0
- data/spec/samples/nanodrop.xlsx +0 -0
- data/spec/samples/scores.html +30 -0
- data/spec/samples/simple.html +14 -0
- data/spec/spec_helper.rb +1 -0
- metadata +30 -8
- data/lib/iron/import/sheet.rb +0 -263
- data/spec/importer/sheet_spec.rb +0 -65
@@ -0,0 +1,30 @@
|
|
1
|
+
<table id="table_4ae4f429_6b24_496d_8dae_7c31a7644644" class="mx-grid boxscore d-b-s">
|
2
|
+
<thead>
|
3
|
+
<tr class="primary-header-row ">
|
4
|
+
<th class=" team string first" scope="col" data-cl="9"><span>1/31 @ 8p</span></th>
|
5
|
+
<th class=" quarter1 string score dw" scope="col" data-cl="2" title="Quarter 1"><span>Q1</span></th>
|
6
|
+
<th class=" quarter2 string score dw" scope="col" data-cl="2" title="Quarter 2"><span>Q2</span></th>
|
7
|
+
<th class=" quarter3 string score dw" scope="col" data-cl="2" title="Quarter 3"><span>Q3</span></th>
|
8
|
+
<th class=" quarter4 string score dw" scope="col" data-cl="2" title="Quarter 4"><span>Q4</span></th>
|
9
|
+
<th class=" score string last total score" scope="col" data-cl="5" title="Final"><span>Final</span></th>
|
10
|
+
</tr>
|
11
|
+
</thead>
|
12
|
+
<tbody>
|
13
|
+
<tr class="first">
|
14
|
+
<th class="team first" scope="row"><a href="/high-schools/mcdowell-titans-(marion,nc)/basketball/home.htm" >McDowell</a></th>
|
15
|
+
<td class="quarter1 score dw">16</td>
|
16
|
+
<td class="quarter2 score dw">20</td>
|
17
|
+
<td class="quarter3 score dw">27</td>
|
18
|
+
<td class="quarter4 score dw">17</td>
|
19
|
+
<td class="score last total score">80</td>
|
20
|
+
</tr>
|
21
|
+
<tr class="last alternate">
|
22
|
+
<th class="team first" scope="row"><a href="/high-schools/asheville-cougars-(asheville,nc)/basketball/home.htm" >Asheville</a></th>
|
23
|
+
<td class="quarter1 score dw">13</td>
|
24
|
+
<td class="quarter2 score dw">17</td>
|
25
|
+
<td class="quarter3 score dw">17</td>
|
26
|
+
<td class="quarter4 score dw">13</td>
|
27
|
+
<td class="score last total score">60</td>
|
28
|
+
</tr>
|
29
|
+
</tbody>
|
30
|
+
</table>
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: iron-import
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.1
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rob Morris
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-02-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: iron-extensions
|
@@ -72,9 +72,23 @@ dependencies:
|
|
72
72
|
- - "~>"
|
73
73
|
- !ruby/object:Gem::Version
|
74
74
|
version: '1.13'
|
75
|
-
|
76
|
-
|
77
|
-
|
75
|
+
- !ruby/object:Gem::Dependency
|
76
|
+
name: nokogiri
|
77
|
+
requirement: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - "~>"
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: '1.6'
|
82
|
+
type: :development
|
83
|
+
prerelease: false
|
84
|
+
version_requirements: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - "~>"
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '1.6'
|
89
|
+
description: Simple yet powerful library for importing tabular data from CSV, HTML,
|
90
|
+
XLS and XLSX files, including support for auto-detecting column order, parsing/validating
|
91
|
+
cell data, aggregating errors, etc.
|
78
92
|
email:
|
79
93
|
- rob@irongaze.com
|
80
94
|
executables: []
|
@@ -93,22 +107,30 @@ files:
|
|
93
107
|
- lib/iron/import/custom_reader.rb
|
94
108
|
- lib/iron/import/data_reader.rb
|
95
109
|
- lib/iron/import/error.rb
|
110
|
+
- lib/iron/import/excel_reader.rb
|
111
|
+
- lib/iron/import/html_reader.rb
|
96
112
|
- lib/iron/import/importer.rb
|
97
113
|
- lib/iron/import/row.rb
|
98
|
-
- lib/iron/import/sheet.rb
|
99
114
|
- lib/iron/import/xls_reader.rb
|
100
115
|
- lib/iron/import/xlsx_reader.rb
|
101
116
|
- spec/importer/column_spec.rb
|
102
117
|
- spec/importer/csv_reader_spec.rb
|
103
118
|
- spec/importer/custom_reader_spec.rb
|
104
119
|
- spec/importer/data_reader_spec.rb
|
120
|
+
- spec/importer/html_reader_spec.rb
|
105
121
|
- spec/importer/importer_spec.rb
|
106
122
|
- spec/importer/row_spec.rb
|
107
|
-
- spec/importer/sheet_spec.rb
|
123
|
+
- spec/importer/xls_reader_spec.rb
|
108
124
|
- spec/importer/xlsx_reader_spec.rb
|
125
|
+
- spec/samples/3-sheets.xls
|
126
|
+
- spec/samples/col-span.html
|
127
|
+
- spec/samples/html-th-td.html
|
109
128
|
- spec/samples/icd10-custom.txt
|
129
|
+
- spec/samples/multi-table.html
|
110
130
|
- spec/samples/nanodrop.xlsx
|
131
|
+
- spec/samples/scores.html
|
111
132
|
- spec/samples/simple.csv
|
133
|
+
- spec/samples/simple.html
|
112
134
|
- spec/samples/test-products.xls
|
113
135
|
- spec/spec_helper.rb
|
114
136
|
homepage: http://irongaze.com
|
@@ -134,5 +156,5 @@ rubyforge_project:
|
|
134
156
|
rubygems_version: 2.4.3
|
135
157
|
signing_key:
|
136
158
|
specification_version: 4
|
137
|
-
summary: CSV, XLS, and XLSX import automation support
|
159
|
+
summary: CSV, HTML, XLS, and XLSX import automation support
|
138
160
|
test_files: []
|
data/lib/iron/import/sheet.rb
DELETED
@@ -1,263 +0,0 @@
|
|
1
|
-
class Importer
|
2
|
-
|
3
|
-
# The Sheet class handles building the sheet's column configuration and other
|
4
|
-
# setup, then holds all load-time row data. In some file types (Excel mostly)
|
5
|
-
# there may be more than one sheet definition in a given importer. In others,
|
6
|
-
# the default sheet is the only one (possibly implicitly) defined.
|
7
|
-
#
|
8
|
-
# The following builder options are available:
|
9
|
-
#
|
10
|
-
# Importer.build do
|
11
|
-
# sheet('Some Sheet Name') do
|
12
|
-
# # Don't try to look for a header using column definitions, there is no header
|
13
|
-
# headerless!
|
14
|
-
#
|
15
|
-
# # Manually set the start row for data in this sheet, defaults to nil
|
16
|
-
# # indicating that the data rows start immediatly following the header.
|
17
|
-
# start_row 4
|
18
|
-
#
|
19
|
-
# # Define a filter that will skip unneeded rows. The filter command takes
|
20
|
-
# # a block that receives the parsed (but not validated!) row data as an
|
21
|
-
# # associative hash of :col_key => <parsed value>, and returns
|
22
|
-
# # true to keep the row or false to exclude it.
|
23
|
-
# filter do |row|
|
24
|
-
# row[:id].to_i > 5000
|
25
|
-
# end
|
26
|
-
#
|
27
|
-
# # Of course, the main thing to do in a sheet is define columns. See the
|
28
|
-
# # Column class' notes for options when defining a column. Note that
|
29
|
-
# # you can define columns using either hash-style:
|
30
|
-
# column :id, :type => :integer
|
31
|
-
# # or builder-style:
|
32
|
-
# column :name do
|
33
|
-
# header /company\s*name/
|
34
|
-
# type :string
|
35
|
-
# end
|
36
|
-
# end
|
37
|
-
class Sheet
|
38
|
-
|
39
|
-
# Inner class for holding load-time data that gets reset on each load call
|
40
|
-
class Data
|
41
|
-
attr_accessor :start_row, :rows
|
42
|
-
def initialize
|
43
|
-
@start_row = nil
|
44
|
-
@rows = []
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
# Key data
|
49
|
-
attr_reader :importer
|
50
|
-
attr_reader :columns
|
51
|
-
attr_reader :data
|
52
|
-
|
53
|
-
# Settings
|
54
|
-
dsl_flag :headerless
|
55
|
-
dsl_accessor :id
|
56
|
-
dsl_accessor :start_row
|
57
|
-
dsl_accessor :filter
|
58
|
-
|
59
|
-
def initialize(importer, id)
|
60
|
-
@importer = importer
|
61
|
-
@id = id
|
62
|
-
|
63
|
-
@headerless = false
|
64
|
-
@start_row = nil
|
65
|
-
@filter = nil
|
66
|
-
|
67
|
-
@columns = []
|
68
|
-
|
69
|
-
reset
|
70
|
-
end
|
71
|
-
|
72
|
-
# Define our columns etc. via builder-style method calling
|
73
|
-
def build(&block)
|
74
|
-
DslProxy.exec(self, &block)
|
75
|
-
end
|
76
|
-
|
77
|
-
# Call with a block accepting a single Importer::Row with contents that
|
78
|
-
# look like :column_key => <parsed value>. Any filtered rows
|
79
|
-
# will not be present. If you want to register an error, simply
|
80
|
-
# raise "some text" and it will be added to the importer's error
|
81
|
-
# list for display to the user, logging, or whatever.
|
82
|
-
def process
|
83
|
-
@data.rows.each do |row|
|
84
|
-
begin
|
85
|
-
yield row
|
86
|
-
rescue Exception => e
|
87
|
-
@importer.add_error(row, e.to_s)
|
88
|
-
end
|
89
|
-
end
|
90
|
-
end
|
91
|
-
|
92
|
-
# Add a new column definition to our list, allows customizing the new
|
93
|
-
# column with a builder block. See Importer::Column docs for
|
94
|
-
# options. In lieu of a builder mode, you can pass the same values
|
95
|
-
# as key => value pairs in the options hash to this method, so:
|
96
|
-
#
|
97
|
-
# column(:foo) do
|
98
|
-
# type :string
|
99
|
-
# parse do |val|
|
100
|
-
# val.to_s.upcase
|
101
|
-
# end
|
102
|
-
# end
|
103
|
-
#
|
104
|
-
# Is equivalent to:
|
105
|
-
#
|
106
|
-
# column(:foo, :type => :string, :parse => lambda {|val| val.to_s.upcase})
|
107
|
-
#
|
108
|
-
# Use whichever you prefer!
|
109
|
-
def column(key, options_hash = {}, &block)
|
110
|
-
# Find existing column with key to allow re-opening an existing definition
|
111
|
-
col = @columns.detect {|c| c.key == key }
|
112
|
-
unless col
|
113
|
-
# if none found, add a new one
|
114
|
-
col = Column.new(self, key, options_hash)
|
115
|
-
@columns << col
|
116
|
-
end
|
117
|
-
|
118
|
-
# Customize if needed
|
119
|
-
DslProxy::exec(col, &block) if block
|
120
|
-
|
121
|
-
col
|
122
|
-
end
|
123
|
-
|
124
|
-
# Reset for load attempt
|
125
|
-
def reset
|
126
|
-
@data = Data.new
|
127
|
-
end
|
128
|
-
|
129
|
-
def parse_raw_data(raw_rows)
|
130
|
-
# Find our column layout, start of data, etc
|
131
|
-
if parse_header(raw_rows)
|
132
|
-
# Now, run all the data and add it as a Row instance
|
133
|
-
raw_rows.each_with_index do |raw, index|
|
134
|
-
row_num = index + 1
|
135
|
-
if row_num >= @data.start_row
|
136
|
-
add_row(row_num, raw)
|
137
|
-
end
|
138
|
-
end
|
139
|
-
end
|
140
|
-
end
|
141
|
-
|
142
|
-
# Add a new row to our stash
|
143
|
-
def add_row(line, raw_data)
|
144
|
-
# Gracefully handle custom parsers that return nil for a row's data
|
145
|
-
raw_data ||= []
|
146
|
-
# Add the row
|
147
|
-
row = Row.new(self, line)
|
148
|
-
|
149
|
-
# Parse out the values
|
150
|
-
values = {}
|
151
|
-
@columns.each do |col|
|
152
|
-
index = col.data.index
|
153
|
-
raw_val = raw_data[index]
|
154
|
-
if col.parse
|
155
|
-
# Use custom parser if this row has one
|
156
|
-
val = col.parse_value(row, raw_val)
|
157
|
-
else
|
158
|
-
# Otherwise use our standard parser
|
159
|
-
val = @importer.data.parse_value(raw_val, col.type)
|
160
|
-
end
|
161
|
-
values[col.key] = val
|
162
|
-
end
|
163
|
-
|
164
|
-
# Set the values and filter if needed
|
165
|
-
row.set_values(values)
|
166
|
-
return nil unless !@filter || @filter.call(row)
|
167
|
-
|
168
|
-
# Row is solid, now check for missing required vals
|
169
|
-
@columns.each do |col|
|
170
|
-
val = values[col.key]
|
171
|
-
if col.validate_value(row, val)
|
172
|
-
if col.required?
|
173
|
-
if values[col.key].nil?
|
174
|
-
@importer.add_error(row, "Missing required value for #{col}")
|
175
|
-
end
|
176
|
-
end
|
177
|
-
end
|
178
|
-
end
|
179
|
-
|
180
|
-
# We is good
|
181
|
-
@data.rows << row
|
182
|
-
row
|
183
|
-
end
|
184
|
-
|
185
|
-
# Process the raw values for the first rows in a sheet,
|
186
|
-
# and attempt to build a map of the column layout, and
|
187
|
-
# detect the first row of real data
|
188
|
-
def parse_header(raw_rows)
|
189
|
-
if headerless?
|
190
|
-
# Use implicit or explicit column position when told to not look for a header
|
191
|
-
next_index = 0
|
192
|
-
@columns.each do |col|
|
193
|
-
unless col.position.nil?
|
194
|
-
next_index = col.fixed_index
|
195
|
-
end
|
196
|
-
col.data.index = next_index
|
197
|
-
next_index += 1
|
198
|
-
end
|
199
|
-
@data.start_row = @start_row || 1
|
200
|
-
return true
|
201
|
-
|
202
|
-
else
|
203
|
-
# Match by testing
|
204
|
-
raw_rows.each_with_index do |row, i|
|
205
|
-
# Um, have data?
|
206
|
-
next unless row
|
207
|
-
|
208
|
-
# Set up for this iteration
|
209
|
-
remaining = @columns.dup
|
210
|
-
|
211
|
-
# Step through this row's raw values, and look for a matching column for all columns
|
212
|
-
row.each_with_index do |val, i|
|
213
|
-
col = remaining.detect {|c| c.match_header?(val.to_s, i) }
|
214
|
-
if col
|
215
|
-
remaining -= [col]
|
216
|
-
col.data.index = i
|
217
|
-
end
|
218
|
-
end
|
219
|
-
|
220
|
-
if remaining.empty?
|
221
|
-
# Found the cols, have a map, update our start row to be the next line and return!
|
222
|
-
@data.start_row = @start_row || i+2
|
223
|
-
return true
|
224
|
-
end
|
225
|
-
end
|
226
|
-
|
227
|
-
# If we get here, we're hosed
|
228
|
-
@importer.add_error(self, "Unable to locate required column header(s) in sheet")
|
229
|
-
false
|
230
|
-
end
|
231
|
-
end
|
232
|
-
|
233
|
-
# When true, the given sheet name or zero-based index
|
234
|
-
# is a match with our id.
|
235
|
-
def match_sheet?(name, index)
|
236
|
-
if @id.is_a?(Fixnum)
|
237
|
-
@id.to_i == index+1
|
238
|
-
else
|
239
|
-
@id.to_s.downcase == name.downcase
|
240
|
-
end
|
241
|
-
end
|
242
|
-
|
243
|
-
def add_error(msg)
|
244
|
-
@importer.add_error(self, msg)
|
245
|
-
end
|
246
|
-
|
247
|
-
def add_warning(msg)
|
248
|
-
@importer.add_warning(self, msg)
|
249
|
-
end
|
250
|
-
|
251
|
-
def to_s
|
252
|
-
"Sheet #{@id}"
|
253
|
-
end
|
254
|
-
|
255
|
-
# Return all parsed, filtered data in the sheet as an
|
256
|
-
# array of arrays.
|
257
|
-
def dump
|
258
|
-
@data.rows.collect(&:values)
|
259
|
-
end
|
260
|
-
|
261
|
-
end
|
262
|
-
|
263
|
-
end
|
data/spec/importer/sheet_spec.rb
DELETED
@@ -1,65 +0,0 @@
|
|
1
|
-
describe Importer::Sheet do
|
2
|
-
|
3
|
-
before do
|
4
|
-
@importer = Importer.new
|
5
|
-
@sheet = @importer.default_sheet
|
6
|
-
end
|
7
|
-
|
8
|
-
it 'should respond to build' do
|
9
|
-
@sheet.should respond_to(:build)
|
10
|
-
@sheet.build do
|
11
|
-
column :foo
|
12
|
-
end
|
13
|
-
@sheet.columns.count.should == 1
|
14
|
-
end
|
15
|
-
|
16
|
-
it 'should define columns' do
|
17
|
-
@sheet.column(:foo)
|
18
|
-
@sheet.columns.count.should == 1
|
19
|
-
end
|
20
|
-
|
21
|
-
it 'should find headers automatically' do
|
22
|
-
# Define a few sample columns
|
23
|
-
@sheet.column(:alpha)
|
24
|
-
@sheet.column(:gamma)
|
25
|
-
# Some dummy data
|
26
|
-
rows = [
|
27
|
-
['', '', '', ''],
|
28
|
-
['Alpha', 'Beta', 'Gamma', 'Epsilon']
|
29
|
-
]
|
30
|
-
|
31
|
-
# Parse it!
|
32
|
-
@sheet.parse_header(rows).should be_true
|
33
|
-
|
34
|
-
@sheet.column(:alpha).data.index.should == 0
|
35
|
-
@sheet.column(:gamma).data.index.should == 2
|
36
|
-
@sheet.data.start_row.should == 3
|
37
|
-
end
|
38
|
-
|
39
|
-
it 'should record an error if a column can\'t be found' do
|
40
|
-
# Define a few sample columns
|
41
|
-
@sheet.column(:alpha)
|
42
|
-
@sheet.column(:gamma)
|
43
|
-
# Some dummy data
|
44
|
-
rows = [
|
45
|
-
['', '', '', ''],
|
46
|
-
['Bob', 'Beta', 'Gamma', 'Epsilon']
|
47
|
-
]
|
48
|
-
|
49
|
-
# Parse it!
|
50
|
-
@sheet.parse_header(rows).should be_false
|
51
|
-
@importer.errors.count.should == 1
|
52
|
-
@importer.error_summary.should =~ /unable to locate required column header/i
|
53
|
-
end
|
54
|
-
|
55
|
-
it 'should match by sheet name or number' do
|
56
|
-
@sheet.id = 5
|
57
|
-
@sheet.match_sheet?('foo', 3).should be_false
|
58
|
-
@sheet.match_sheet?('foo', 4).should be_true
|
59
|
-
|
60
|
-
@sheet.id = 'Sheet 5'
|
61
|
-
@sheet.match_sheet?('Sheet', 4).should be_false
|
62
|
-
@sheet.match_sheet?('Sheet 5', 3).should be_true
|
63
|
-
end
|
64
|
-
|
65
|
-
end
|