iron-import 0.6.1 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,30 @@
1
+ <table id="table_4ae4f429_6b24_496d_8dae_7c31a7644644" class="mx-grid boxscore d-b-s">
2
+ <thead>
3
+ <tr class="primary-header-row ">
4
+ <th class=" team string first" scope="col" data-cl="9"><span>1/31 @ 8p</span></th>
5
+ <th class=" quarter1 string score dw" scope="col" data-cl="2" title="Quarter 1"><span>Q1</span></th>
6
+ <th class=" quarter2 string score dw" scope="col" data-cl="2" title="Quarter 2"><span>Q2</span></th>
7
+ <th class=" quarter3 string score dw" scope="col" data-cl="2" title="Quarter 3"><span>Q3</span></th>
8
+ <th class=" quarter4 string score dw" scope="col" data-cl="2" title="Quarter 4"><span>Q4</span></th>
9
+ <th class=" score string last total score" scope="col" data-cl="5" title="Final"><span>Final</span></th>
10
+ </tr>
11
+ </thead>
12
+ <tbody>
13
+ <tr class="first">
14
+ <th class="team first" scope="row"><a href="/high-schools/mcdowell-titans-(marion,nc)/basketball/home.htm" >McDowell</a></th>
15
+ <td class="quarter1 score dw">16</td>
16
+ <td class="quarter2 score dw">20</td>
17
+ <td class="quarter3 score dw">27</td>
18
+ <td class="quarter4 score dw">17</td>
19
+ <td class="score last total score">80</td>
20
+ </tr>
21
+ <tr class="last alternate">
22
+ <th class="team first" scope="row"><a href="/high-schools/asheville-cougars-(asheville,nc)/basketball/home.htm" >Asheville</a></th>
23
+ <td class="quarter1 score dw">13</td>
24
+ <td class="quarter2 score dw">17</td>
25
+ <td class="quarter3 score dw">17</td>
26
+ <td class="quarter4 score dw">13</td>
27
+ <td class="score last total score">60</td>
28
+ </tr>
29
+ </tbody>
30
+ </table>
@@ -0,0 +1,14 @@
1
+ <table>
2
+ <tr>
3
+ <td>Name</td>
4
+ <td>ID</td>
5
+ </tr>
6
+ <tr>
7
+ <td>John</td>
8
+ <td>888</td>
9
+ </tr>
10
+ <tr>
11
+ <td>Anne</td>
12
+ <td>1234</td>
13
+ </tr>
14
+ </table>
data/spec/spec_helper.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  # Set up development requirements
2
2
  require 'roo'
3
+ require 'nokogiri'
3
4
 
4
5
  # Require our library
5
6
  require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'iron', 'import'))
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iron-import
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.1
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rob Morris
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-08-24 00:00:00.000000000 Z
11
+ date: 2017-02-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: iron-extensions
@@ -72,9 +72,23 @@ dependencies:
72
72
  - - "~>"
73
73
  - !ruby/object:Gem::Version
74
74
  version: '1.13'
75
- description: Simple yet powerful library for importing tabular data including support
76
- for auto-detecting column order, parsing/validating cell data, aggregating errors,
77
- etc.
75
+ - !ruby/object:Gem::Dependency
76
+ name: nokogiri
77
+ requirement: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '1.6'
82
+ type: :development
83
+ prerelease: false
84
+ version_requirements: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - "~>"
87
+ - !ruby/object:Gem::Version
88
+ version: '1.6'
89
+ description: Simple yet powerful library for importing tabular data from CSV, HTML,
90
+ XLS and XLSX files, including support for auto-detecting column order, parsing/validating
91
+ cell data, aggregating errors, etc.
78
92
  email:
79
93
  - rob@irongaze.com
80
94
  executables: []
@@ -93,22 +107,30 @@ files:
93
107
  - lib/iron/import/custom_reader.rb
94
108
  - lib/iron/import/data_reader.rb
95
109
  - lib/iron/import/error.rb
110
+ - lib/iron/import/excel_reader.rb
111
+ - lib/iron/import/html_reader.rb
96
112
  - lib/iron/import/importer.rb
97
113
  - lib/iron/import/row.rb
98
- - lib/iron/import/sheet.rb
99
114
  - lib/iron/import/xls_reader.rb
100
115
  - lib/iron/import/xlsx_reader.rb
101
116
  - spec/importer/column_spec.rb
102
117
  - spec/importer/csv_reader_spec.rb
103
118
  - spec/importer/custom_reader_spec.rb
104
119
  - spec/importer/data_reader_spec.rb
120
+ - spec/importer/html_reader_spec.rb
105
121
  - spec/importer/importer_spec.rb
106
122
  - spec/importer/row_spec.rb
107
- - spec/importer/sheet_spec.rb
123
+ - spec/importer/xls_reader_spec.rb
108
124
  - spec/importer/xlsx_reader_spec.rb
125
+ - spec/samples/3-sheets.xls
126
+ - spec/samples/col-span.html
127
+ - spec/samples/html-th-td.html
109
128
  - spec/samples/icd10-custom.txt
129
+ - spec/samples/multi-table.html
110
130
  - spec/samples/nanodrop.xlsx
131
+ - spec/samples/scores.html
111
132
  - spec/samples/simple.csv
133
+ - spec/samples/simple.html
112
134
  - spec/samples/test-products.xls
113
135
  - spec/spec_helper.rb
114
136
  homepage: http://irongaze.com
@@ -134,5 +156,5 @@ rubyforge_project:
134
156
  rubygems_version: 2.4.3
135
157
  signing_key:
136
158
  specification_version: 4
137
- summary: CSV, XLS, and XLSX import automation support
159
+ summary: CSV, HTML, XLS, and XLSX import automation support
138
160
  test_files: []
@@ -1,263 +0,0 @@
1
- class Importer
2
-
3
- # The Sheet class handles building the sheet's column configuration and other
4
- # setup, then holds all load-time row data. In some file types (Excel mostly)
5
- # there may be more than one sheet definition in a given importer. In others,
6
- # the default sheet is the only one (possibly implicitly) defined.
7
- #
8
- # The following builder options are available:
9
- #
10
- # Importer.build do
11
- # sheet('Some Sheet Name') do
12
- # # Don't try to look for a header using column definitions, there is no header
13
- # headerless!
14
- #
15
- # # Manually set the start row for data in this sheet, defaults to nil
16
- # indicating that the data rows start immediately following the header.
17
- # start_row 4
18
- #
19
- # # Define a filter that will skip unneeded rows. The filter command takes
20
- # # a block that receives the parsed (but not validated!) row data as an
21
- # # associative hash of :col_key => <parsed value>, and returns
22
- # # true to keep the row or false to exclude it.
23
- # filter do |row|
24
- # row[:id].to_i > 5000
25
- # end
26
- #
27
- # # Of course, the main thing to do in a sheet is define columns. See the
28
- # # Column class' notes for options when defining a column. Note that
29
- # # you can define columns using either hash-style:
30
- # column :id, :type => :integer
31
- # # or builder-style:
32
- # column :name do
33
- # header /company\s*name/
34
- # type :string
35
- # end
36
- # end
37
- class Sheet
38
-
39
- # Inner class for holding load-time data that gets reset on each load call
40
- class Data
41
- attr_accessor :start_row, :rows
42
- def initialize
43
- @start_row = nil
44
- @rows = []
45
- end
46
- end
47
-
48
- # Key data
49
- attr_reader :importer
50
- attr_reader :columns
51
- attr_reader :data
52
-
53
- # Settings
54
- dsl_flag :headerless
55
- dsl_accessor :id
56
- dsl_accessor :start_row
57
- dsl_accessor :filter
58
-
59
- def initialize(importer, id)
60
- @importer = importer
61
- @id = id
62
-
63
- @headerless = false
64
- @start_row = nil
65
- @filter = nil
66
-
67
- @columns = []
68
-
69
- reset
70
- end
71
-
72
- # Define our columns etc. via builder-style method calling
73
- def build(&block)
74
- DslProxy.exec(self, &block)
75
- end
76
-
77
- # Call with a block accepting a single Importer::Row with contents that
78
- # look like :column_key => <parsed value>. Any filtered rows
79
- # will not be present. If you want to register an error, simply
80
- # raise "some text" and it will be added to the importer's error
81
- # list for display to the user, logging, or whatever.
82
- def process
83
- @data.rows.each do |row|
84
- begin
85
- yield row
86
- rescue Exception => e
87
- @importer.add_error(row, e.to_s)
88
- end
89
- end
90
- end
91
-
92
- # Add a new column definition to our list, allows customizing the new
93
- # column with a builder block. See Importer::Column docs for
94
- # options. In lieu of a builder mode, you can pass the same values
95
- # as key => value pairs in the options hash to this method, so:
96
- #
97
- # column(:foo) do
98
- # type :string
99
- # parse do |val|
100
- # val.to_s.upcase
101
- # end
102
- # end
103
- #
104
- # Is equivalent to:
105
- #
106
- # column(:foo, :type => :string, :parse => lambda {|val| val.to_s.upcase})
107
- #
108
- # Use whichever you prefer!
109
- def column(key, options_hash = {}, &block)
110
- # Find existing column with key to allow re-opening an existing definition
111
- col = @columns.detect {|c| c.key == key }
112
- unless col
113
- # if none found, add a new one
114
- col = Column.new(self, key, options_hash)
115
- @columns << col
116
- end
117
-
118
- # Customize if needed
119
- DslProxy::exec(col, &block) if block
120
-
121
- col
122
- end
123
-
124
- # Reset for load attempt
125
- def reset
126
- @data = Data.new
127
- end
128
-
129
- def parse_raw_data(raw_rows)
130
- # Find our column layout, start of data, etc
131
- if parse_header(raw_rows)
132
- # Now, run all the data and add it as a Row instance
133
- raw_rows.each_with_index do |raw, index|
134
- row_num = index + 1
135
- if row_num >= @data.start_row
136
- add_row(row_num, raw)
137
- end
138
- end
139
- end
140
- end
141
-
142
- # Add a new row to our stash
143
- def add_row(line, raw_data)
144
- # Gracefully handle custom parsers that return nil for a row's data
145
- raw_data ||= []
146
- # Add the row
147
- row = Row.new(self, line)
148
-
149
- # Parse out the values
150
- values = {}
151
- @columns.each do |col|
152
- index = col.data.index
153
- raw_val = raw_data[index]
154
- if col.parse
155
- # Use custom parser if this column has one
156
- val = col.parse_value(row, raw_val)
157
- else
158
- # Otherwise use our standard parser
159
- val = @importer.data.parse_value(raw_val, col.type)
160
- end
161
- values[col.key] = val
162
- end
163
-
164
- # Set the values and filter if needed
165
- row.set_values(values)
166
- return nil unless !@filter || @filter.call(row)
167
-
168
- # Row is solid, now check for missing required vals
169
- @columns.each do |col|
170
- val = values[col.key]
171
- if col.validate_value(row, val)
172
- if col.required?
173
- if values[col.key].nil?
174
- @importer.add_error(row, "Missing required value for #{col}")
175
- end
176
- end
177
- end
178
- end
179
-
180
- # We is good
181
- @data.rows << row
182
- row
183
- end
184
-
185
- # Process the raw values for the first rows in a sheet,
186
- # and attempt to build a map of the column layout, and
187
- # detect the first row of real data
188
- def parse_header(raw_rows)
189
- if headerless?
190
- # Use implicit or explicit column position when told to not look for a header
191
- next_index = 0
192
- @columns.each do |col|
193
- unless col.position.nil?
194
- next_index = col.fixed_index
195
- end
196
- col.data.index = next_index
197
- next_index += 1
198
- end
199
- @data.start_row = @start_row || 1
200
- return true
201
-
202
- else
203
- # Match by testing
204
- raw_rows.each_with_index do |row, i|
205
- # Um, have data?
206
- next unless row
207
-
208
- # Set up for this iteration
209
- remaining = @columns.dup
210
-
211
- # Step through this row's raw values, and look for a matching column for all columns
212
- row.each_with_index do |val, i|
213
- col = remaining.detect {|c| c.match_header?(val.to_s, i) }
214
- if col
215
- remaining -= [col]
216
- col.data.index = i
217
- end
218
- end
219
-
220
- if remaining.empty?
221
- # Found the cols, have a map, update our start row to be the next line and return!
222
- @data.start_row = @start_row || i+2
223
- return true
224
- end
225
- end
226
-
227
- # If we get here, we're hosed
228
- @importer.add_error(self, "Unable to locate required column header(s) in sheet")
229
- false
230
- end
231
- end
232
-
233
- # When true, the given sheet name or zero-based index
234
- # is a match with our id.
235
- def match_sheet?(name, index)
236
- if @id.is_a?(Fixnum)
237
- @id.to_i == index+1
238
- else
239
- @id.to_s.downcase == name.downcase
240
- end
241
- end
242
-
243
- def add_error(msg)
244
- @importer.add_error(self, msg)
245
- end
246
-
247
- def add_warning(msg)
248
- @importer.add_warning(self, msg)
249
- end
250
-
251
- def to_s
252
- "Sheet #{@id}"
253
- end
254
-
255
- # Return all parsed, filtered data in the sheet as an
256
- # array of arrays.
257
- def dump
258
- @data.rows.collect(&:values)
259
- end
260
-
261
- end
262
-
263
- end
@@ -1,65 +0,0 @@
1
- describe Importer::Sheet do
2
-
3
- before do
4
- @importer = Importer.new
5
- @sheet = @importer.default_sheet
6
- end
7
-
8
- it 'should respond to build' do
9
- @sheet.should respond_to(:build)
10
- @sheet.build do
11
- column :foo
12
- end
13
- @sheet.columns.count.should == 1
14
- end
15
-
16
- it 'should define columns' do
17
- @sheet.column(:foo)
18
- @sheet.columns.count.should == 1
19
- end
20
-
21
- it 'should find headers automatically' do
22
- # Define a few sample columns
23
- @sheet.column(:alpha)
24
- @sheet.column(:gamma)
25
- # Some dummy data
26
- rows = [
27
- ['', '', '', ''],
28
- ['Alpha', 'Beta', 'Gamma', 'Epsilon']
29
- ]
30
-
31
- # Parse it!
32
- @sheet.parse_header(rows).should be_true
33
-
34
- @sheet.column(:alpha).data.index.should == 0
35
- @sheet.column(:gamma).data.index.should == 2
36
- @sheet.data.start_row.should == 3
37
- end
38
-
39
- it 'should record an error if a column can\'t be found' do
40
- # Define a few sample columns
41
- @sheet.column(:alpha)
42
- @sheet.column(:gamma)
43
- # Some dummy data
44
- rows = [
45
- ['', '', '', ''],
46
- ['Bob', 'Beta', 'Gamma', 'Epsilon']
47
- ]
48
-
49
- # Parse it!
50
- @sheet.parse_header(rows).should be_false
51
- @importer.errors.count.should == 1
52
- @importer.error_summary.should =~ /unable to locate required column header/i
53
- end
54
-
55
- it 'should match by sheet name or number' do
56
- @sheet.id = 5
57
- @sheet.match_sheet?('foo', 3).should be_false
58
- @sheet.match_sheet?('foo', 4).should be_true
59
-
60
- @sheet.id = 'Sheet 5'
61
- @sheet.match_sheet?('Sheet', 4).should be_false
62
- @sheet.match_sheet?('Sheet 5', 3).should be_true
63
- end
64
-
65
- end