iron-import 0.6.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,30 @@
1
+ <table id="table_4ae4f429_6b24_496d_8dae_7c31a7644644" class="mx-grid boxscore d-b-s">
2
+ <thead>
3
+ <tr class="primary-header-row ">
4
+ <th class=" team string first" scope="col" data-cl="9"><span>1/31 @ 8p</span></th>
5
+ <th class=" quarter1 string score dw" scope="col" data-cl="2" title="Quarter 1"><span>Q1</span></th>
6
+ <th class=" quarter2 string score dw" scope="col" data-cl="2" title="Quarter 2"><span>Q2</span></th>
7
+ <th class=" quarter3 string score dw" scope="col" data-cl="2" title="Quarter 3"><span>Q3</span></th>
8
+ <th class=" quarter4 string score dw" scope="col" data-cl="2" title="Quarter 4"><span>Q4</span></th>
9
+ <th class=" score string last total score" scope="col" data-cl="5" title="Final"><span>Final</span></th>
10
+ </tr>
11
+ </thead>
12
+ <tbody>
13
+ <tr class="first">
14
+ <th class="team first" scope="row"><a href="/high-schools/mcdowell-titans-(marion,nc)/basketball/home.htm" >McDowell</a></th>
15
+ <td class="quarter1 score dw">16</td>
16
+ <td class="quarter2 score dw">20</td>
17
+ <td class="quarter3 score dw">27</td>
18
+ <td class="quarter4 score dw">17</td>
19
+ <td class="score last total score">80</td>
20
+ </tr>
21
+ <tr class="last alternate">
22
+ <th class="team first" scope="row"><a href="/high-schools/asheville-cougars-(asheville,nc)/basketball/home.htm" >Asheville</a></th>
23
+ <td class="quarter1 score dw">13</td>
24
+ <td class="quarter2 score dw">17</td>
25
+ <td class="quarter3 score dw">17</td>
26
+ <td class="quarter4 score dw">13</td>
27
+ <td class="score last total score">60</td>
28
+ </tr>
29
+ </tbody>
30
+ </table>
@@ -0,0 +1,14 @@
1
+ <table>
2
+ <tr>
3
+ <td>Name</td>
4
+ <td>ID</td>
5
+ </tr>
6
+ <tr>
7
+ <td>John</td>
8
+ <td>888</td>
9
+ </tr>
10
+ <tr>
11
+ <td>Anne</td>
12
+ <td>1234</td>
13
+ </tr>
14
+ </table>
data/spec/spec_helper.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  # Set up development requirements
2
2
  require 'roo'
3
+ require 'nokogiri'
3
4
 
4
5
  # Require our library
5
6
  require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'iron', 'import'))
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iron-import
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.1
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rob Morris
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-08-24 00:00:00.000000000 Z
11
+ date: 2017-02-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: iron-extensions
@@ -72,9 +72,23 @@ dependencies:
72
72
  - - "~>"
73
73
  - !ruby/object:Gem::Version
74
74
  version: '1.13'
75
- description: Simple yet powerful library for importing tabular data including support
76
- for auto-detecting column order, parsing/validating cell data, aggregating errors,
77
- etc.
75
+ - !ruby/object:Gem::Dependency
76
+ name: nokogiri
77
+ requirement: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '1.6'
82
+ type: :development
83
+ prerelease: false
84
+ version_requirements: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - "~>"
87
+ - !ruby/object:Gem::Version
88
+ version: '1.6'
89
+ description: Simple yet powerful library for importing tabular data from CSV, HTML,
90
+ XLS and XLSX files, including support for auto-detecting column order, parsing/validating
91
+ cell data, aggregating errors, etc.
78
92
  email:
79
93
  - rob@irongaze.com
80
94
  executables: []
@@ -93,22 +107,30 @@ files:
93
107
  - lib/iron/import/custom_reader.rb
94
108
  - lib/iron/import/data_reader.rb
95
109
  - lib/iron/import/error.rb
110
+ - lib/iron/import/excel_reader.rb
111
+ - lib/iron/import/html_reader.rb
96
112
  - lib/iron/import/importer.rb
97
113
  - lib/iron/import/row.rb
98
- - lib/iron/import/sheet.rb
99
114
  - lib/iron/import/xls_reader.rb
100
115
  - lib/iron/import/xlsx_reader.rb
101
116
  - spec/importer/column_spec.rb
102
117
  - spec/importer/csv_reader_spec.rb
103
118
  - spec/importer/custom_reader_spec.rb
104
119
  - spec/importer/data_reader_spec.rb
120
+ - spec/importer/html_reader_spec.rb
105
121
  - spec/importer/importer_spec.rb
106
122
  - spec/importer/row_spec.rb
107
- - spec/importer/sheet_spec.rb
123
+ - spec/importer/xls_reader_spec.rb
108
124
  - spec/importer/xlsx_reader_spec.rb
125
+ - spec/samples/3-sheets.xls
126
+ - spec/samples/col-span.html
127
+ - spec/samples/html-th-td.html
109
128
  - spec/samples/icd10-custom.txt
129
+ - spec/samples/multi-table.html
110
130
  - spec/samples/nanodrop.xlsx
131
+ - spec/samples/scores.html
111
132
  - spec/samples/simple.csv
133
+ - spec/samples/simple.html
112
134
  - spec/samples/test-products.xls
113
135
  - spec/spec_helper.rb
114
136
  homepage: http://irongaze.com
@@ -134,5 +156,5 @@ rubyforge_project:
134
156
  rubygems_version: 2.4.3
135
157
  signing_key:
136
158
  specification_version: 4
137
- summary: CSV, XLS, and XLSX import automation support
159
+ summary: CSV, HTML, XLS, and XLSX import automation support
138
160
  test_files: []
@@ -1,263 +0,0 @@
1
- class Importer
2
-
3
- # The Sheet class handles building the sheet's column configuration and other
4
- # setup, then holds all load-time row data. In some file types (Excel mostly)
5
- # there may be more than one sheet definition in a given importer. In others,
6
- # the default sheet is the only one (possibly implicitly) defined.
7
- #
8
- # The following builder options are available:
9
- #
10
- # Importer.build do
11
- # sheet('Some Sheet Name') do
12
- # # Don't try to look for a header using column definitions, there is no header
13
- # headerless!
14
- #
15
- # # Manually set the start row for data in this sheet, defaults to nil
16
- # # indicating that the data rows start immediatly following the header.
17
- # start_row 4
18
- #
19
- # # Define a filter that will skip unneeded rows. The filter command takes
20
- # # a block that receives the parsed (but not validated!) row data as an
21
- # # associative hash of :col_key => <parsed value>, and returns
22
- # # true to keep the row or false to exclude it.
23
- # filter do |row|
24
- # row[:id].to_i > 5000
25
- # end
26
- #
27
- # # Of course, the main thing to do in a sheet is define columns. See the
28
- # # Column class' notes for options when defining a column. Note that
29
- # # you can define columns using either hash-style:
30
- # column :id, :type => :integer
31
- # # or builder-style:
32
- # column :name do
33
- # header /company\s*name/
34
- # type :string
35
- # end
36
- # end
37
- class Sheet
38
-
39
- # Inner class for holding load-time data that gets reset on each load call
40
- class Data
41
- attr_accessor :start_row, :rows
42
- def initialize
43
- @start_row = nil
44
- @rows = []
45
- end
46
- end
47
-
48
- # Key data
49
- attr_reader :importer
50
- attr_reader :columns
51
- attr_reader :data
52
-
53
- # Settings
54
- dsl_flag :headerless
55
- dsl_accessor :id
56
- dsl_accessor :start_row
57
- dsl_accessor :filter
58
-
59
- def initialize(importer, id)
60
- @importer = importer
61
- @id = id
62
-
63
- @headerless = false
64
- @start_row = nil
65
- @filter = nil
66
-
67
- @columns = []
68
-
69
- reset
70
- end
71
-
72
- # Define our columns etc. via builder-style method calling
73
- def build(&block)
74
- DslProxy.exec(self, &block)
75
- end
76
-
77
- # Call with a block accepting a single Importer::Row with contents that
78
- # look like :column_key => <parsed value>. Any filtered rows
79
- # will not be present. If you want to register an error, simply
80
- # raise "some text" and it will be added to the importer's error
81
- # list for display to the user, logging, or whatever.
82
- def process
83
- @data.rows.each do |row|
84
- begin
85
- yield row
86
- rescue Exception => e
87
- @importer.add_error(row, e.to_s)
88
- end
89
- end
90
- end
91
-
92
- # Add a new column definition to our list, allows customizing the new
93
- # column with a builder block. See Importer::Column docs for
94
- # options. In lieu of a builder mode, you can pass the same values
95
- # as key => value pairs in the options hash to this method, so:
96
- #
97
- # column(:foo) do
98
- # type :string
99
- # parse do |val|
100
- # val.to_s.upcase
101
- # end
102
- # end
103
- #
104
- # Is equivalent to:
105
- #
106
- # column(:foo, :type => :string, :parse => lambda {|val| val.to_s.upcase})
107
- #
108
- # Use whichever you prefer!
109
- def column(key, options_hash = {}, &block)
110
- # Find existing column with key to allow re-opening an existing definition
111
- col = @columns.detect {|c| c.key == key }
112
- unless col
113
- # if none found, add a new one
114
- col = Column.new(self, key, options_hash)
115
- @columns << col
116
- end
117
-
118
- # Customize if needed
119
- DslProxy::exec(col, &block) if block
120
-
121
- col
122
- end
123
-
124
- # Reset for load attempt
125
- def reset
126
- @data = Data.new
127
- end
128
-
129
- def parse_raw_data(raw_rows)
130
- # Find our column layout, start of data, etc
131
- if parse_header(raw_rows)
132
- # Now, run all the data and add it as a Row instance
133
- raw_rows.each_with_index do |raw, index|
134
- row_num = index + 1
135
- if row_num >= @data.start_row
136
- add_row(row_num, raw)
137
- end
138
- end
139
- end
140
- end
141
-
142
- # Add a new row to our stash
143
- def add_row(line, raw_data)
144
- # Gracefully handle custom parsers that return nil for a row's data
145
- raw_data ||= []
146
- # Add the row
147
- row = Row.new(self, line)
148
-
149
- # Parse out the values
150
- values = {}
151
- @columns.each do |col|
152
- index = col.data.index
153
- raw_val = raw_data[index]
154
- if col.parse
155
- # Use custom parser if this row has one
156
- val = col.parse_value(row, raw_val)
157
- else
158
- # Otherwise use our standard parser
159
- val = @importer.data.parse_value(raw_val, col.type)
160
- end
161
- values[col.key] = val
162
- end
163
-
164
- # Set the values and filter if needed
165
- row.set_values(values)
166
- return nil unless !@filter || @filter.call(row)
167
-
168
- # Row is solid, now check for missing required vals
169
- @columns.each do |col|
170
- val = values[col.key]
171
- if col.validate_value(row, val)
172
- if col.required?
173
- if values[col.key].nil?
174
- @importer.add_error(row, "Missing required value for #{col}")
175
- end
176
- end
177
- end
178
- end
179
-
180
- # We is good
181
- @data.rows << row
182
- row
183
- end
184
-
185
- # Process the raw values for the first rows in a sheet,
186
- # and attempt to build a map of the column layout, and
187
- # detect the first row of real data
188
- def parse_header(raw_rows)
189
- if headerless?
190
- # Use implicit or explicit column position when told to not look for a header
191
- next_index = 0
192
- @columns.each do |col|
193
- unless col.position.nil?
194
- next_index = col.fixed_index
195
- end
196
- col.data.index = next_index
197
- next_index += 1
198
- end
199
- @data.start_row = @start_row || 1
200
- return true
201
-
202
- else
203
- # Match by testing
204
- raw_rows.each_with_index do |row, i|
205
- # Um, have data?
206
- next unless row
207
-
208
- # Set up for this iteration
209
- remaining = @columns.dup
210
-
211
- # Step through this row's raw values, and look for a matching column for all columns
212
- row.each_with_index do |val, i|
213
- col = remaining.detect {|c| c.match_header?(val.to_s, i) }
214
- if col
215
- remaining -= [col]
216
- col.data.index = i
217
- end
218
- end
219
-
220
- if remaining.empty?
221
- # Found the cols, have a map, update our start row to be the next line and return!
222
- @data.start_row = @start_row || i+2
223
- return true
224
- end
225
- end
226
-
227
- # If we get here, we're hosed
228
- @importer.add_error(self, "Unable to locate required column header(s) in sheet")
229
- false
230
- end
231
- end
232
-
233
- # When true, the given sheet name or zero-based index
234
- # is a match with our id.
235
- def match_sheet?(name, index)
236
- if @id.is_a?(Fixnum)
237
- @id.to_i == index+1
238
- else
239
- @id.to_s.downcase == name.downcase
240
- end
241
- end
242
-
243
- def add_error(msg)
244
- @importer.add_error(self, msg)
245
- end
246
-
247
- def add_warning(msg)
248
- @importer.add_warning(self, msg)
249
- end
250
-
251
- def to_s
252
- "Sheet #{@id}"
253
- end
254
-
255
- # Return all parsed, filtered data in the sheet as an
256
- # array of arrays.
257
- def dump
258
- @data.rows.collect(&:values)
259
- end
260
-
261
- end
262
-
263
- end
@@ -1,65 +0,0 @@
1
- describe Importer::Sheet do
2
-
3
- before do
4
- @importer = Importer.new
5
- @sheet = @importer.default_sheet
6
- end
7
-
8
- it 'should respond to build' do
9
- @sheet.should respond_to(:build)
10
- @sheet.build do
11
- column :foo
12
- end
13
- @sheet.columns.count.should == 1
14
- end
15
-
16
- it 'should define columns' do
17
- @sheet.column(:foo)
18
- @sheet.columns.count.should == 1
19
- end
20
-
21
- it 'should find headers automatically' do
22
- # Define a few sample columns
23
- @sheet.column(:alpha)
24
- @sheet.column(:gamma)
25
- # Some dummy data
26
- rows = [
27
- ['', '', '', ''],
28
- ['Alpha', 'Beta', 'Gamma', 'Epsilon']
29
- ]
30
-
31
- # Parse it!
32
- @sheet.parse_header(rows).should be_true
33
-
34
- @sheet.column(:alpha).data.index.should == 0
35
- @sheet.column(:gamma).data.index.should == 2
36
- @sheet.data.start_row.should == 3
37
- end
38
-
39
- it 'should record an error if a column can\'t be found' do
40
- # Define a few sample columns
41
- @sheet.column(:alpha)
42
- @sheet.column(:gamma)
43
- # Some dummy data
44
- rows = [
45
- ['', '', '', ''],
46
- ['Bob', 'Beta', 'Gamma', 'Epsilon']
47
- ]
48
-
49
- # Parse it!
50
- @sheet.parse_header(rows).should be_false
51
- @importer.errors.count.should == 1
52
- @importer.error_summary.should =~ /unable to locate required column header/i
53
- end
54
-
55
- it 'should match by sheet name or number' do
56
- @sheet.id = 5
57
- @sheet.match_sheet?('foo', 3).should be_false
58
- @sheet.match_sheet?('foo', 4).should be_true
59
-
60
- @sheet.id = 'Sheet 5'
61
- @sheet.match_sheet?('Sheet', 4).should be_false
62
- @sheet.match_sheet?('Sheet 5', 3).should be_true
63
- end
64
-
65
- end