iron-import 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,188 @@
1
+ # Implements the entry-point for our importing system. To use, construct
2
+ # an importer using the builder syntax (examples below), then run one or more
3
+ # files or streams through the import system.
4
+ #
5
+ # Constructing a simple importer:
6
+ #
7
+ # importer = Importer.build do
8
+ # column :order_number
9
+ # column :date
10
+ # column :amount
11
+ # end
12
+ #
13
+ # To use this importer simply call:
14
+ #
15
+ # if importer.import('/path/to/file.xls')
16
+ # importer.process do |row|
17
+ # puts "Order #{row[:order_number]}: #{row[:amount]} on #{row[:date]}"
18
+ # end
19
+ # end
20
+ #
21
+ # Inside the process block you can call row.all? to verify that the row contains a value for all defined columns.
22
+ #
23
+ # A more realistic and complex example follows:
24
+ #
25
+ # importer = Importer.build do
26
+ # column :order_number do
27
+ # match /order (num.*|id)/i
28
+ # end
29
+ # column :date
30
+ # column :amount
31
+ # end
32
+ #
33
class Importer

  # errors / warnings: arrays of Error instances recorded since the last
  # #reset (every #import starts by resetting them).
  # data: the DataReader chosen by the most recent #import call; sheets ask it
  # to parse raw values when a column has no custom parser.
  attr_accessor :errors, :warnings, :data
  # Hash of Sheet definitions keyed by sheet id (number 1-N or sheet name)
  attr_accessor :sheets
  # Source file/stream encoding, assumes UTF-8 if none specified
  dsl_accessor :encoding

  # Convenience constructor: creates an importer and runs the given DSL block
  # against it, returning the configured importer.
  def self.build(options = {}, &block)
    importer = Importer.new(options)
    importer.build(&block)
    importer
  end

  # Sets up an importer with UTF-8 encoding and no sheet definitions.
  # NOTE(review): +options+ is accepted but not read anywhere in this class.
  def initialize(options = {})
    @encoding = 'UTF-8'
    @sheets = {}

    reset
  end

  # Runs the given DSL block (if any) against this importer and returns self.
  def build(&block)
    DslProxy.exec(self, &block) if block
    self
  end

  # The sheet used when no explicit sheet is named: sheet number 1.
  def default_sheet
    sheet(1)
  end

  # Access a Sheet definition by id (either number (1-N) or sheet name).
  # When the sheet is not yet defined it is created, unless +create+ is false,
  # in which case nil is returned. An optional DSL block customizes the sheet.
  def sheet(id, create=true, &block)
    # Find the sheet, creating it if needed (and requested!)
    if @sheets[id].nil?
      return nil unless create
      @sheets[id] = Sheet.new(self, id)
    end
    sheet = @sheets[id]

    # Allow customization by DSL block if requested
    sheet.build(&block) if block

    sheet
  end

  # Very, very commonly we only want to deal with the default sheet. In this case,
  # let folks skip the sheet(n) do ... end block wrapper and just define columns
  # against the main importer. Internally, proxy those calls to the first sheet
  def column(*args, &block)
    default_sheet.column(*args, &block)
  end

  # Proxies filter definitions to the default sheet, same as #column.
  def filter(*args, &block)
    default_sheet.filter(*args, &block)
  end

  # First call to a freshly #build'd importer, this will read the file/stream/path supplied,
  # validate the required values, run custom validations... basically pre-parse and
  # massage the supplied data. It returns false when no reader could be found for
  # the source (an error is recorded in that case); otherwise it returns whatever
  # the selected reader's #load returns.
  #
  # You may supply various options for the import using the options hash. Supported
  # options include:
  #
  #  format: one of :auto, :csv, :xls, :xlsx, defaults to :auto, forces treating the supplied
  #          source as the specified format, or auto-detects if set to :auto
  #  encoding: source encoding override, defaults to guessing based on input
  #            NOTE(review): this method itself only reads :format - confirm
  #            where (or whether) :encoding is consumed.
  #
  # Generally, you should be able to throw a source at it and it should work.  The
  # options exist to allow overriding in cases where the automation heuristics
  # have failed and the input type is known by the caller.
  #
  # After #import has completed successfully, you can process the resulting data
  # using #process, or extract the raw row hashes with #sheet(num).dump
  def import(path_or_stream, options = {})
    # Clear all our load-time state, including all rows, header locations... you name it
    reset

    # Pick the reader for this source. The caller's options hash is only read,
    # never modified, so it can safely be reused across imports.
    format = options[:format]
    if format && format != :auto
      @data = DataReader.for_format(self, format)
      failure = "Unable to find format handler for format #{format} - aborting"
    elsif path_or_stream.respond_to?(:read)
      @data = DataReader.for_stream(self, path_or_stream)
      failure = "Unable to find format handler for stream - aborting"
    else
      @data = DataReader.for_path(self, path_or_stream)
      failure = "Unable to find format handler for file #{path_or_stream} - aborting"
    end

    # No reader means we cannot go on; record why and report failure
    unless @data
      add_error(failure)
      return false
    end

    # Read in the data!
    @data.load(path_or_stream)
  end

  # Process a specific sheet, or the default sheet if none is provided (or the
  # requested sheet is not defined).  Your passed block will be handed one Row
  # at a time.
  def process(sheet_id = nil, &block)
    s = sheet(sheet_id, false) || default_sheet
    s.process(&block)
  end

  # Records an error. May be called as add_error(msg) or add_error(context, msg),
  # where context is the object (row, sheet, ...) the error relates to.
  def add_error(context, msg = nil)
    if context.is_a?(String) && msg.nil?
      msg = context
      context = nil
    end
    @errors << Error.new(context, msg)
  end

  # True when at least one error has been recorded since the last reset
  def has_errors?
    @errors.any?
  end

  # Records a warning. Accepts the same two call forms as #add_error:
  # add_warning(msg) or add_warning(context, msg).
  def add_warning(context, msg = nil)
    if context.is_a?(String) && msg.nil?
      msg = context
      context = nil
    end
    @warnings << Error.new(context, msg)
  end

  # True when at least one warning has been recorded since the last reset
  def has_warnings?
    @warnings.any?
  end

  # Returns a human-readable summary of the errors present on the importer,
  # or nil when there are none
  def error_summary
    return nil unless has_errors?
    @errors.collect(&:summary).list_join(', ')
  end

  protected

  # Drops all recorded errors and warnings and resets every defined sheet's
  # load-time data
  def reset
    @errors = []
    @warnings = []
    @sheets.values.each(&:reset)
  end

end
@@ -0,0 +1,59 @@
1
class Importer

  # A single data row belonging to a sheet: remembers the owning sheet, the
  # line number it came from, and a hash of parsed values keyed by column key.
  class Row

    attr_reader :sheet, :line, :values

    # value_hash may be omitted and supplied later through #set_values
    def initialize(sheet, line, value_hash = nil)
      @sheet, @line, @values = sheet, line, value_hash
    end

    # Replaces the row's value hash
    def set_values(value_hash)
      @values = value_hash
    end

    # With no arguments: true when every column holds a non-nil value, which
    # is handy for filtering out junk rows. With keys: true when each of the
    # named columns holds a non-nil value; raises if a key is not a column
    # of this row.
    def all?(*keys)
      return @values.values.none?(&:nil?) if keys.empty?

      # Every requested key must exist, regardless of the values found
      keys.each do |key|
        raise "Unknown column key :#{key} in call to Row#all?" unless @values.has_key?(key)
      end
      keys.none? { |key| @values[key].nil? }
    end

    # True when every value in the row is nil
    def empty?
      @values.values.compact.empty?
    end

    # Returns the value of a column
    def [](column_key)
      @values[column_key]
    end

    def to_s
      "Row #{line}"
    end

    # Records an error against this row on the owning importer
    def add_error(msg)
      sheet.importer.add_error(self, msg)
    end

    # Records a warning against this row on the owning importer
    def add_warning(msg)
      sheet.importer.add_warning(self, msg)
    end

  end

end
@@ -0,0 +1,186 @@
1
class Importer

  # The Sheet class handles building the sheet's column configuration and other
  # setup, then holds all load-time row data.
  class Sheet

    # Inner class for holding load-time data that gets reset on each load call
    class Data
      # start_row: 1-based line number of the first data row, set by parse_header
      # rows: Row instances accepted by add_row
      attr_accessor :start_row, :rows
      def initialize
        @start_row = nil
        @rows = []
      end
    end

    # Key data
    attr_reader :importer
    attr_reader :columns
    attr_reader :data

    # Settings
    dsl_flag :headerless
    dsl_accessor :id
    dsl_accessor :start_row
    dsl_accessor :filter

    # importer is the owning Importer; id is the sheet number (1-N) or name
    def initialize(importer, id)
      @importer = importer
      @id = id

      @headerless = false
      @start_row = nil
      @filter = nil

      @columns = []

      reset
    end

    # Runs the given DSL block against this sheet
    def build(&block)
      DslProxy.exec(self, &block)
    end

    # Yields each loaded Row in turn. An error raised by the block for a row
    # is recorded against that row on the importer and iteration continues.
    # Only StandardError is captured, so interrupts and exit requests still
    # propagate to the caller.
    def process
      @data.rows.each do |row|
        begin
          yield row
        rescue StandardError => e
          @importer.add_error(row, e.to_s)
        end
      end
    end

    # Finds the column with the given key, creating and registering it when
    # missing, then applies the optional DSL block to it. Returns the column.
    def column(key, &block)
      col = @columns.detect {|c| c.key == key }
      unless col
        col = Column.new(self, key)
        @columns << col
      end

      DslProxy.exec(col, &block) if block

      col
    end

    # Reset for load attempt
    def reset
      @data = Data.new
    end

    # Takes the raw rows of a sheet (arrays of raw cell values), locates the
    # column layout, then turns every line from the start row onward into a Row.
    def parse_raw_data(raw_rows)
      # Find our column layout, start of data, etc
      return unless parse_header(raw_rows)

      # Now, run all the data and add it as a Row instance
      raw_rows.each_with_index do |raw, index|
        line = index + 1
        add_row(line, raw) if line >= @data.start_row
      end
    end

    # Add a new row to our stash. Returns the Row, or nil when the sheet's
    # filter rejected it.
    def add_row(line, raw_data)
      row = Row.new(self, line)

      # Parse out the values, using the column's custom parser when it has one
      # and the reader's type-based parsing otherwise
      values = {}
      @columns.each do |col|
        raw_val = raw_data[col.data.index]
        values[col.key] =
          if col.parse
            col.parse_value(row, raw_val)
          else
            @importer.data.parse_value(raw_val, col.type)
          end
      end

      # Set the values and filter if needed
      row.set_values(values)
      return nil if @filter && !@filter.call(row)

      # Row is solid, now validate each value and check for missing required
      # vals (the required check only runs for values that passed validation)
      @columns.each do |col|
        val = values[col.key]
        if col.validate_value(row, val) && col.required? && val.nil?
          @importer.add_error(row, "Missing required value for #{col}")
        end
      end

      # We is good
      @data.rows << row
      row
    end

    # Process the raw values for the first rows in a sheet,
    # and attempt to build a map of the column layout, and
    # detect the first row of real data. Returns true when the layout
    # was determined, false (after recording an error) otherwise.
    def parse_header(raw_rows)
      if headerless?
        # Use implicit or explicit column position when told to not look for a header
        next_index = 0
        @columns.each do |col|
          next_index = col.index if col.index.present?
          col.data.index = next_index
          next_index += 1
        end
        @data.start_row = @start_row || 1
        return true
      end

      # Match by testing each row in turn as a candidate header row
      raw_rows.each_with_index do |row, row_index|
        # Set up for this iteration
        remaining = @columns.dup

        # Step through this row's raw values, and look for a matching column for all columns
        row.each_with_index do |val, col_index|
          col = remaining.detect {|c| c.match_header?(val.to_s, col_index) }
          if col
            remaining -= [col]
            col.data.index = col_index
          end
        end

        if remaining.empty?
          # Found the cols, have a map, update our start row to be the next line and return!
          # (row_index is 0-based, lines are 1-based, hence +2 for "the line after this one")
          @data.start_row = @start_row || row_index + 2
          return true
        end
      end

      # If we get here, we're hosed
      @importer.add_error(self, "Unable to locate required column header(s) in sheet")
      false
    end

    # True when this definition applies to the spreadsheet sheet with the given
    # name and 0-based index: numeric ids match by position (1-based), any
    # other id matches by name.
    def match_sheet?(name, index)
      if @id.is_a?(Integer)
        @id.to_i == index + 1
      else
        @id.to_s == name
      end
    end

    def to_s
      "Sheet #{@id}"
    end

    # Returns the value hashes of all loaded rows
    def dump
      @data.rows.collect(&:values)
    end

  end

end
@@ -0,0 +1,60 @@
1
class Importer

  # Reader for legacy Excel (.xls) workbooks, backed by Roo::Excel.
  class XlsReader < DataReader

    def initialize(importer)
      # Register under this reader's own format rather than the xlsx one
      super(importer, :xls)
    end

    # Opens the workbook at +path+ and feeds the raw rows of every spreadsheet
    # sheet that matches one of the importer's sheet definitions to that
    # definition. Returns true once all sheets have been visited, or false
    # (with an error recorded on the importer) when the file cannot be read.
    def load_file(path)
      spreadsheet = Roo::Excel.new(path, :file_warning => :ignore)
      unless spreadsheet
        @importer.add_error("Unable to read Excel file at path #{path}")
        return false
      end

      # Get our list of sheet definitions, and run all the sheets in the spreadsheet
      remaining_sheets = @importer.sheets.values
      spreadsheet.sheets.each_with_index do |name, index|
        # Look for a sheet definition that matches this sheet's name/index
        sheet = remaining_sheets.detect {|s| s.match_sheet?(name, index) }
        next unless sheet

        # Each definition is used for at most one spreadsheet sheet
        remaining_sheets.delete(sheet)

        # Extract our raw data
        raw_rows = []
        spreadsheet.sheet(name).each {|row| raw_rows << row }

        # Let the sheet sort it out
        sheet.parse_raw_data(raw_rows)
      end
      true

    rescue StandardError => e
      # Only ordinary errors are turned into import errors; interrupts and
      # exit requests are left to propagate.
      @importer.add_error("Error reading file #{path}: #{e}")
      false
    end

    private

    # Converts raw rows to Row instances on the given sheet, from the sheet's
    # start row onward, unless the importer already has errors.
    # NOTE(review): nothing in this class calls this method, and it relies on
    # sheet.find_header - confirm the Sheet class still provides it.
    def load_raw_rows(sheet, raw_rows)
      # Figure out where our columns are and where our data starts
      sheet.find_header(raw_rows[0...5])
      start_row = sheet.data.start_row

      # Run all the raw rows and convert them to Row instances, making notes of errors along the way...
      if !@importer.has_errors?
        raw_rows.each_with_index do |raw, index|
          line = index + 1
          sheet.add_row(line, raw) if line >= start_row
        end
      end
    end

  end

end
@@ -0,0 +1,60 @@
1
class Importer

  # Reader for Excel 2007+ (.xlsx) workbooks, backed by Roo::Excelx.
  class XlsxReader < DataReader

    def initialize(importer)
      super(importer, :xlsx)
    end

    # Opens the workbook at +path+ and feeds the raw rows of every spreadsheet
    # sheet that matches one of the importer's sheet definitions to that
    # definition. Returns true once all sheets have been visited, or false
    # (with an error recorded on the importer) when the file cannot be read.
    def load_file(path)
      spreadsheet = Roo::Excelx.new(path, :file_warning => :ignore)
      unless spreadsheet
        @importer.add_error("Unable to read ExcelX file at path #{path}")
        return false
      end

      # Get our list of sheet definitions, and run all the sheets in the spreadsheet
      remaining_sheets = @importer.sheets.values
      spreadsheet.sheets.each_with_index do |name, index|
        # Look for a sheet definition that matches this sheet's name/index
        sheet = remaining_sheets.detect {|s| s.match_sheet?(name, index) }
        next unless sheet

        # Each definition is used for at most one spreadsheet sheet
        remaining_sheets.delete(sheet)

        # Extract our raw data
        raw_rows = []
        spreadsheet.sheet(name).each {|row| raw_rows << row }

        # Let the sheet sort it out
        sheet.parse_raw_data(raw_rows)
      end
      true

    rescue StandardError => e
      # Only ordinary errors are turned into import errors; interrupts and
      # exit requests are left to propagate.
      @importer.add_error("Error reading file #{path}: #{e} @ #{e.backtrace.first}")
      false
    end

    private

    # Converts raw rows to Row instances on the given sheet, from the sheet's
    # start row onward, unless the importer already has errors.
    # NOTE(review): nothing in this class calls this method, and it relies on
    # sheet.find_header - confirm the Sheet class still provides it.
    def load_raw_rows(sheet, raw_rows)
      # Figure out where our columns are and where our data starts
      sheet.find_header(raw_rows[0...5])
      start_row = sheet.data.start_row

      # Run all the raw rows and convert them to Row instances, making notes of errors along the way...
      if !@importer.has_errors?
        raw_rows.each_with_index do |raw, index|
          line = index + 1
          sheet.add_row(line, raw) if line >= start_row
        end
      end
    end

  end

end
@@ -0,0 +1,14 @@
1
+ # Dependencies
2
+ require 'iron/extensions'
3
+ require 'iron/dsl'
4
+
5
+ # Include required classes
6
+ require_relative 'import/column'
7
+ require_relative 'import/sheet'
8
+ require_relative 'import/row'
9
+ require_relative 'import/error'
10
+ require_relative 'import/data_reader'
11
+ require_relative 'import/csv_reader'
12
+ require_relative 'import/xls_reader'
13
+ require_relative 'import/xlsx_reader'
14
+ require_relative 'import/importer'
@@ -0,0 +1,116 @@
1
# Specs for Importer::Column: DSL building, position/index conversion,
# header matching, custom parsing and custom validation.
describe Importer::Column do

  # Every example gets a fresh importer, its default sheet, a :test column
  # on that sheet and a row to parse/validate against.
  before do
    @importer = Importer.new
    @sheet = @importer.default_sheet
    @col = Importer::Column.new(@sheet, :test)
    @row = Importer::Row.new(@sheet, 1)
  end

  it 'should respond to build' do
    @col.should respond_to(:build)
    @col.build { required! }
    @col.required?.should be_true
  end

  it 'should convert position strings to indexes' do
    expected_indexes = {
      'A' => 0,
      'C' => 2,
      'AA' => 26,
      'BAA' => 2*26*26 + 26
    }
    expected_indexes.each do |pos, index|
      Importer::Column.pos_to_index(pos).should == index
    end
  end

  it 'should convert position ints to position codes' do
    expected_codes = {
      0 => 'A',
      25 => 'Z',
      26 => 'AA',
      2*26*26 + 26 + 3 => 'BAD'
    }
    expected_codes.each do |index, pos|
      Importer::Column.index_to_pos(index).should == pos
    end
  end

  it 'should accept both int and string positions, and convert them to an index' do
    # Integer positions are 1-based, so 5 maps to index 4
    expected_indexes = {
      'A' => 0,
      5 => 4,
      'Z' => 25
    }
    expected_indexes.each do |pos, index|
      @col.position = pos
      @col.fixed_index.should == index
    end
  end

  it 'should put a pretty output on conversion to string' do
    @col.data.index = 3
    @col.to_s.should == 'Column D'
  end

  it 'should match by key by default' do
    matching = ['Test', 'test', ' TEST ']
    matching.each do |header|
      @col.match_header?(header, 888).should be_true
    end

    non_matching = ['', nil, 'Foo', 'Testy']
    non_matching.each do |header|
      @col.match_header?(header, 888).should be_false
    end
  end

  it 'should default to string type' do
    @col.type.should == :string
  end

  it 'should match by position if position is specified' do
    @col.position 'B'
    @col.match_header?('junk', 1).should be_true
    @col.match_header?('junk', 2).should be_false
  end

  it 'should match custom header matchers' do
    # Each matcher maps to sample headers and whether they should match
    cases = {
      /(alpha|beta)/ => { 'alphabet' => true, 'beta X' => true, 'gamma' => false },
      /^test.$/i => { 'Testy' => true, 'test?' => true, 'notest' => false }
    }
    cases.each do |matcher, tests|
      @col.header matcher
      tests.each do |val, res|
        @col.match_header?(val, 1234).should == res
      end
    end
  end

  it 'should properly apply custom parsers' do
    @col.parse_value(@row, 5).should == 5
    @col.parse { |raw| raw.to_i + 2 }
    @col.parse_value(@row, 5).should == 7
  end

  it 'should record exceptions during parsing as errors' do
    @col.parse { |raw| raise 'nope' }
    @importer.has_errors?.should be_false
    @col.parse_value(@row, 5).should be_nil
    @importer.has_errors?.should be_true
  end

  it 'should allow custom validation' do
    @col.validate { |val| raise 'nope' if val != 5 }
    @importer.has_errors?.should be_false
    @col.validate_value(@row, 5).should be_true
    @importer.has_errors?.should be_false
    @col.validate_value(@row, 4).should be_false
    @importer.has_errors?.should be_true
  end

end
@@ -0,0 +1,31 @@
1
# Specs for Importer::CsvReader: end-to-end load of the simple CSV sample.
describe Importer::CsvReader do

  before do
    @importer = Importer.new
    @reader = Importer::CsvReader.new(@importer)
  end

  it 'should load our simple CSV data' do
    # One column per supported type used by the sample file
    importer = Importer.build do
      column(:number) { type :integer }
      column(:string) { type :string }
      column(:date) { type :date }
      column(:cost) { type :cents }
    end

    sample = SpecHelper.sample_path('simple.csv')
    importer.import(sample).should be_true

    expected_rows = [
      {:number => 123, :string => 'Abc', :date => Date.new(1977,5,13), :cost => 899},
      {:number => nil, :string => nil, :date => nil, :cost => nil},
      {:number => 5, :string => 'String with end spaces', :date => Date.new(2004,2,1), :cost => 1000}
    ]
    importer.default_sheet.dump.should == expected_rows
  end

end