iron-import 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,188 @@
1
+ # Implements the entry-point for our importing system. To use, construct
2
+ # an importer using the builder syntax (examples below), then run one or more
3
+ # files or streams through the import system.
4
+ #
5
+ # Constructing a simple importer:
6
+ #
7
+ # importer = Importer.build do
8
+ # column :order_number
9
+ # column :date
10
+ # column :amount
11
+ # end
12
+ #
13
+ # To use this importer simply call:
14
+ #
15
+ # if importer.import('/path/to/file.xls')
16
+ # importer.process do |row|
17
+ # puts "Order #{row[:order_number]}: #{row[:amount]} on #{row[:date]}"
18
+ # end
19
+ # end
20
+ #
21
+ # Inside the process block, row.all? can be called to verify that a row contains a value for all defined columns.
22
+ #
23
+ # A more realistic and complex example follows:
24
+ #
25
+ # importer = Importer.build do
26
+ # column :order_number do
27
+ # match /order (num.*|id)/i
28
+ # end
29
+ # column :date
30
+ # column :amount
31
+ # end
32
+ #
33
+ class Importer
34
+
35
+ # Error and warning records (Error instances) collected by add_error/add_warning, plus the data reader chosen by the last #import
36
+ attr_accessor :errors, :warnings, :data
37
+ attr_accessor :sheets
38
+ # Source file/stream encoding, assumes UTF-8 if none specified
39
+ dsl_accessor :encoding
40
+
41
# Convenience constructor: creates an importer from the given options,
# runs the supplied DSL block (if any) against it via #build, and returns
# the configured importer.
def self.build(options = {}, &block)
  Importer.new(options).tap { |instance| instance.build(&block) }
end
46
+
47
# Prepares a blank importer: no sheet definitions, a default encoding of
# UTF-8, and freshly cleared load-time state (see #reset). The options
# argument is accepted but not read by this method.
def initialize(options = {})
  @sheets = {}
  @encoding = 'UTF-8'

  reset
end
53
+
54
# Evaluates the given DSL block against this importer through DslProxy.
# Does nothing when no block is supplied. Always returns self so calls
# can be chained.
def build(&block)
  return self unless block

  DslProxy.exec(self, &block)
  self
end
58
+
59
# The sheet definition registered under id 1, created on first access
# (see #sheet).
def default_sheet
  sheet(1)
end
62
+
63
# Looks up a Sheet definition by id (a 1-based number or a sheet name).
#
# When no definition exists yet, one is created and stored unless create
# is false, in which case nil is returned. If a block is given it is passed
# to the sheet's #build for DSL-style customization. Returns the sheet.
def sheet(id, create=true, &block)
  found = @sheets[id]
  if found.nil?
    return nil unless create

    found = @sheets[id] = Sheet.new(self, id)
  end

  found.build(&block) if block
  found
end
81
+
82
# Shortcut for the common single-sheet case: lets callers declare columns
# directly on the importer instead of wrapping them in sheet(n) do ... end.
# All arguments and the block are forwarded to the default sheet's #column.
def column(*args, &block)
  target = default_sheet
  target.column(*args, &block)
end
88
+
89
# Same shortcut as #column: forwards all arguments and the block to the
# default sheet's #filter.
def filter(*args, &block)
  target = default_sheet
  target.filter(*args, &block)
end
92
+
93
# First call to make on a freshly #build'd importer. Clears any previous
# load-time state, selects a data reader for the source, and asks that
# reader to load the source.
#
# path_or_stream - a file path, or any object responding to #read
# options        - optional hash; the only key read here is:
#   :format - forces the reader for the given format; when omitted or
#             :auto, the reader is chosen from the stream or path.
#             (The accepted format names are defined by DataReader, which
#             is not part of this file.)
#
# NOTE(review): earlier documentation also listed an :encoding option, but
# this method does not read it.
#
# Returns false (after recording an error via #add_error) when no reader
# can be found; otherwise returns the result of the reader's #load call.
# The caller's options hash is left unmodified.
#
# After a successful import, use #process to walk the resulting rows.
def import(path_or_stream, options = {})
  # Clear all our load-time state, including all rows, header locations...
  reset

  # Pick the reader lookup and remember the message to use if it fails
  format = options[:format]
  if format && format != :auto
    @data = DataReader.for_format(self, format)
    failure = "Unable to find format handler for format #{format} - aborting"
  elsif path_or_stream.respond_to?(:read)
    @data = DataReader.for_stream(self, path_or_stream)
    failure = "Unable to find format handler for stream - aborting"
  else
    @data = DataReader.for_path(self, path_or_stream)
    failure = "Unable to find format handler for file #{path_or_stream} - aborting"
  end

  unless @data
    add_error(failure)
    return false
  end

  # Read in the data!
  @data.load(path_or_stream)
end
142
+
143
# Hands each Row of one sheet to the given block. The sheet is looked up
# by sheet_id without creating it; when that lookup yields nothing (for
# example when no id is passed) the default sheet is used instead.
# Returns whatever the sheet's #process returns.
def process(sheet_id = nil, &block)
  target = sheet(sheet_id, false) || default_sheet
  target.process(&block)
end
149
+
150
# Records an error on the importer.
#
# May be called as add_error(context, msg) or, with a single String
# argument, as add_error(msg), in which case the context is recorded as
# nil. Returns the errors array.
def add_error(context, msg = nil)
  context, msg = nil, context if msg.nil? && context.is_a?(String)
  @errors << Error.new(context, msg)
end
157
+
158
# True when at least one error has been recorded since the last reset.
def has_errors?
  @errors.any?
end
161
+
162
# Records a warning on the importer.
#
# Mirrors #add_error: may be called as add_warning(context, msg) or, with
# a single String argument, as add_warning(msg), in which case the context
# is recorded as nil. The msg parameter defaults to nil so the
# single-argument form handled below is actually callable.
# Returns the warnings array.
def add_warning(context, msg = nil)
  if context.is_a?(String) && msg.nil?
    msg = context
    context = nil
  end
  @warnings << Error.new(context, msg)
end
169
+
170
# True when at least one warning has been recorded since the last reset.
def has_warnings?
  @warnings.any?
end
173
+
174
# Builds a human-readable summary of the recorded errors by joining each
# error's #summary with ', '. Returns nil when there are no errors.
def error_summary
  return nil unless has_errors?

  summaries = @errors.collect(&:summary)
  summaries.list_join(', ')
end
179
+
180
+ protected
181
+
182
# Clears load-time state: empties the error and warning lists and resets
# every registered sheet definition.
def reset
  @errors, @warnings = [], []
  @sheets.values.each { |definition| definition.reset }
end
187
+
188
+ end
@@ -0,0 +1,59 @@
1
class Importer

  # One parsed data row of a sheet: remembers the owning sheet, the line
  # number it came from, and a hash of column key => parsed value.
  class Row

    attr_reader :sheet, :line, :values

    # sheet      - the Sheet this row belongs to
    # line       - line number of the row in its source
    # value_hash - optional hash of column key => value; a row created
    #              without one starts with an empty hash so the query
    #              methods below are safe to call before #set_values
    def initialize(sheet, line, value_hash = nil)
      @sheet = sheet
      @line = line
      set_values(value_hash)
    end

    # Replaces the row's values; nil is stored as an empty hash.
    def set_values(value_hash)
      @values = value_hash || {}
    end

    # True when all columns have a non-nil value, useful in filtering out
    # junk rows. With keys given, only those columns are checked.
    #
    # Raises RuntimeError when any given key is not a column of this row.
    def all?(*keys)
      if keys.any?
        # Reject unknown keys up front, even if an earlier key is nil
        keys.each do |key|
          unless @values.has_key?(key)
            raise "Unknown column key :#{key} in call to Row#all?"
          end
        end
        keys.none? { |key| @values[key].nil? }
      else
        # Check all value keys
        @values.values.none?(&:nil?)
      end
    end

    # True when every column value is nil (or the row has no values).
    def empty?
      @values.values.all?(&:nil?)
    end

    # Returns the value of a column
    def [](column_key)
      @values[column_key]
    end

    def to_s
      "Row #{@line}"
    end

    # Records an error on the owning importer with this row as context.
    def add_error(msg)
      @sheet.importer.add_error(self, msg)
    end

    # Records a warning on the owning importer with this row as context.
    def add_warning(msg)
      @sheet.importer.add_warning(self, msg)
    end

  end

end
@@ -0,0 +1,186 @@
1
+ class Importer
2
+
3
+ # The Sheet class handles building the sheet's column configuration and other
4
+ # setup, then holds all load-time row data.
5
+ class Sheet
6
+
7
# Holder for a sheet's load-time state: the line where data starts and the
# rows collected so far. The owning sheet builds a fresh instance whenever
# it is reset.
class Data
  attr_accessor :start_row, :rows

  def initialize
    @rows = []
    @start_row = nil
  end
end
15
+
16
+ # Key data
17
+ attr_reader :importer
18
+ attr_reader :columns
19
+ attr_reader :data
20
+
21
+ # Settings
22
+ dsl_flag :headerless
23
+ dsl_accessor :id
24
+ dsl_accessor :start_row
25
+ dsl_accessor :filter
26
+
27
# Creates a sheet definition owned by the given importer and identified by
# id. Settings start at their defaults (header expected, no explicit start
# row, no filter), no columns are defined, and load-time data is
# initialised through #reset.
def initialize(importer, id)
  @importer, @id = importer, id

  @columns = []
  @filter = nil
  @start_row = nil
  @headerless = false

  reset
end
39
+
40
# Evaluates the given DSL block against this sheet through DslProxy and
# returns DslProxy.exec's result.
def build(&block)
  DslProxy.exec(self, &block)
end
43
+
44
# Yields each loaded row to the caller's block. A StandardError raised by
# the block for one row is recorded on the importer as an error for that
# row, and iteration continues with the next row.
#
# Only StandardError is rescued, so interrupts, exit requests and other
# non-standard exceptions propagate to the caller instead of being
# silently turned into row errors.
def process
  @data.rows.each do |row|
    begin
      yield row
    rescue StandardError => e
      @importer.add_error(row, e.to_s)
    end
  end
end
53
+
54
# Finds the column definition with the given key, creating and
# registering a new Column when none exists yet. If a block is given it is
# evaluated against the column through DslProxy. Returns the column.
def column(key, &block)
  col = @columns.detect { |existing| existing.key == key }
  col ||= Column.new(self, key).tap { |created| @columns << created }

  DslProxy::exec(col, &block) if block

  col
end
65
+
66
# Discards any previously loaded rows and start-row information by
# replacing the load-time data holder with a fresh one.
def reset
  @data = Data.new
end
70
+
71
# Turns raw row arrays into Row instances.
#
# First locates the column layout and first data line via #parse_header;
# when that fails, nothing is added and nil is returned. Otherwise every
# raw row at or after the detected start row (lines are 1-based) is passed
# to #add_row.
def parse_raw_data(raw_rows)
  return unless parse_header(raw_rows)

  raw_rows.each_with_index do |raw, index|
    line = index + 1
    add_row(line, raw) if line >= @data.start_row
  end
end
83
+
84
# Builds a Row for one raw line and stores it.
#
# For every defined column the raw cell at the column's detected index is
# converted, either by the column's custom parser (when one is set) or by
# the importer's data reader using the column type. The row is then run
# through the sheet's filter, if any; a rejected row is dropped and nil is
# returned. For accepted rows each column's validation runs, and a
# required column whose value is nil (and which passed validation) gets a
# "Missing required value" error. The row is stored and returned even
# when errors were recorded.
def add_row(line, raw_data)
  row = Row.new(self, line)

  # Parse out the values
  values = {}
  @columns.each do |col|
    raw_val = raw_data[col.data.index]
    values[col.key] =
      if col.parse
        col.parse_value(row, raw_val)
      else
        @importer.data.parse_value(raw_val, col.type)
      end
  end

  # Set the values and filter if needed
  row.set_values(values)
  return nil if @filter && !@filter.call(row)

  # Row is solid, now check for missing required vals
  @columns.each do |col|
    next unless col.validate_value(row, values[col.key])
    next unless col.required?

    @importer.add_error(row, "Missing required value for #{col}") if values[col.key].nil?
  end

  @data.rows << row
  row
end
122
+
123
# Works out which raw column index feeds each defined column and which
# line holds the first row of real data.
#
# Headerless sheets: columns are assigned positions in definition order,
# counting up from 0; a column with an explicit index jumps the counter to
# that index. Data starts at the configured start row, or line 1.
#
# Sheets with a header: each raw row is scanned in turn, and each cell is
# offered to the columns not yet matched on that row. The first row on
# which every column finds a matching cell is taken as the header; data
# then starts at the configured start row, or the line after the header.
#
# Returns true when a layout was found. Otherwise records an error on the
# importer and returns false.
def parse_header(raw_rows)
  if headerless?
    # Use implicit or explicit column position when told to not look for a header
    position = 0
    @columns.each do |col|
      position = col.index if col.index.present?
      col.data.index = position
      position += 1
    end
    @data.start_row = @start_row || 1
    return true
  end

  # Match by testing
  raw_rows.each_with_index do |cells, row_index|
    unmatched = @columns.dup

    cells.each_with_index do |cell, cell_index|
      hit = unmatched.detect { |c| c.match_header?(cell.to_s, cell_index) }
      next unless hit

      unmatched -= [hit]
      hit.data.index = cell_index
    end

    next unless unmatched.empty?

    # Found the cols, have a map, data begins on the line after this one
    @data.start_row = @start_row || row_index+2
    return true
  end

  # If we get here, we're hosed
  @importer.add_error(self, "Unable to locate required column header(s) in sheet")
  false
end
167
+
168
# Decides whether this definition applies to a sheet found in the source.
#
# name  - the sheet's name in the source
# index - the sheet's 0-based position in the source
#
# Integer ids are treated as 1-based positions; any other id is compared
# to the sheet name by its string form. Integer is used for the type test
# because Fixnum is deprecated and absent from current Ruby versions.
def match_sheet?(name, index)
  if @id.is_a?(Integer)
    @id == index + 1
  else
    @id.to_s == name
  end
end
175
+
176
# Short label for messages: "Sheet " followed by the sheet's id.
def to_s
  "Sheet #{@id}"
end
179
+
180
# Returns the value hash of every loaded row, in load order.
def dump
  @data.rows.map { |row| row.values }
end
183
+
184
+ end
185
+
186
+ end
@@ -0,0 +1,60 @@
1
+ class Importer
2
+
3
+ class XlsReader < DataReader
4
+
5
# Sets up the reader for the legacy Excel (.xls) format. The second
# argument to DataReader#initialize was :xlsx, the same value XlsxReader
# passes; this reader wraps Roo::Excel, so it now identifies itself as :xls.
# NOTE(review): presumably DataReader stores this as the reader's format
# key - confirm against DataReader, which is not part of this file.
def initialize(importer)
  super(importer, :xls)
end
8
+
9
# Reads the Excel file at path and feeds each worksheet that matches one
# of the importer's sheet definitions to that definition.
#
# Worksheets are visited in file order; each definition is used for at
# most one worksheet, and worksheets without a matching definition are
# skipped. Returns true when the file was read. Any StandardError raised
# along the way is recorded on the importer and false is returned;
# non-standard exceptions (interrupts, exit requests) propagate.
def load_file(path)
  spreadsheet = Roo::Excel.new(path, :file_warning => :ignore)
  unless spreadsheet
    @importer.add_error("Unable to read Excel file at path #{path}")
    return false
  end

  # Work on a copy of the definitions so matched ones can be crossed off
  remaining_sheets = @importer.sheets.values
  spreadsheet.sheets.each_with_index do |name, index|
    # Look for a sheet definition that matches this sheet's name/index
    sheet = remaining_sheets.detect { |s| s.match_sheet?(name, index) }
    next unless sheet

    remaining_sheets.delete(sheet)

    # Extract our raw data, then let the sheet sort it out
    raw_rows = []
    spreadsheet.sheet(name).each { |row| raw_rows << row }
    sheet.parse_raw_data(raw_rows)
  end
  true
rescue StandardError => e
  @importer.add_error("Error reading file #{path}: #{e}")
  false
end
39
+
40
+ private
41
+
42
# Locates the header within the first five raw rows, then adds every raw
# row from the sheet's detected start row onward (lines are 1-based).
#
# The header lookup uses Sheet#parse_header; the previous call to
# Sheet#find_header targeted a method the Sheet class does not define.
# Nothing is added when the header cannot be located or when the importer
# already has errors.
# NOTE(review): #load_file does not call this helper; it goes through
# Sheet#parse_raw_data instead - confirm whether this method is still needed.
def load_raw_rows(sheet, raw_rows)
  # Figure out where our columns are and where our data starts
  return unless sheet.parse_header(raw_rows[0...5])
  return if @importer.has_errors?

  start_row = sheet.data.start_row
  raw_rows.each_with_index do |raw, index|
    line = index + 1
    sheet.add_row(line, raw) if line >= start_row
  end
end
57
+
58
+ end
59
+
60
+ end
@@ -0,0 +1,60 @@
1
+ class Importer
2
+
3
+ class XlsxReader < DataReader
4
+
5
# Sets up the reader for the Excel 2007+ (.xlsx) format by passing :xlsx
# as the second argument to DataReader#initialize.
def initialize(importer)
  super(importer, :xlsx)
end
8
+
9
# Reads the .xlsx file at path and feeds each worksheet that matches one
# of the importer's sheet definitions to that definition.
#
# Worksheets are visited in file order; each definition is used for at
# most one worksheet, and worksheets without a matching definition are
# skipped. Returns true when the file was read. Any StandardError raised
# along the way is recorded on the importer (with the first backtrace
# line) and false is returned; non-standard exceptions (interrupts, exit
# requests) propagate.
def load_file(path)
  spreadsheet = Roo::Excelx.new(path, :file_warning => :ignore)
  unless spreadsheet
    @importer.add_error("Unable to read ExcelX file at path #{path}")
    return false
  end

  # Work on a copy of the definitions so matched ones can be crossed off
  remaining_sheets = @importer.sheets.values
  spreadsheet.sheets.each_with_index do |name, index|
    # Look for a sheet definition that matches this sheet's name/index
    sheet = remaining_sheets.detect { |s| s.match_sheet?(name, index) }
    next unless sheet

    remaining_sheets.delete(sheet)

    # Extract our raw data, then let the sheet sort it out
    raw_rows = []
    spreadsheet.sheet(name).each { |row| raw_rows << row }
    sheet.parse_raw_data(raw_rows)
  end
  true
rescue StandardError => e
  @importer.add_error("Error reading file #{path}: #{e} @ #{e.backtrace.first}")
  false
end
39
+
40
+ private
41
+
42
# Locates the header within the first five raw rows, then adds every raw
# row from the sheet's detected start row onward (lines are 1-based).
#
# The header lookup uses Sheet#parse_header; the previous call to
# Sheet#find_header targeted a method the Sheet class does not define.
# Nothing is added when the header cannot be located or when the importer
# already has errors.
# NOTE(review): #load_file does not call this helper; it goes through
# Sheet#parse_raw_data instead - confirm whether this method is still needed.
def load_raw_rows(sheet, raw_rows)
  # Figure out where our columns are and where our data starts
  return unless sheet.parse_header(raw_rows[0...5])
  return if @importer.has_errors?

  start_row = sheet.data.start_row
  raw_rows.each_with_index do |raw, index|
    line = index + 1
    sheet.add_row(line, raw) if line >= start_row
  end
end
57
+
58
+ end
59
+
60
+ end
@@ -0,0 +1,14 @@
1
+ # Dependencies
2
+ require 'iron/extensions'
3
+ require 'iron/dsl'
4
+
5
+ # Include required classes
6
+ require_relative 'import/column'
7
+ require_relative 'import/sheet'
8
+ require_relative 'import/row'
9
+ require_relative 'import/error'
10
+ require_relative 'import/data_reader'
11
+ require_relative 'import/csv_reader'
12
+ require_relative 'import/xls_reader'
13
+ require_relative 'import/xlsx_reader'
14
+ require_relative 'import/importer'
@@ -0,0 +1,116 @@
1
# Specs for Importer::Column: DSL configuration, position/index conversion,
# header matching, and custom parse/validate hooks.
describe Importer::Column do

  # Each example gets a fresh importer, its default sheet, a :test column
  # on that sheet, and a row for line 1.
  before do
    @importer = Importer.new
    @sheet = @importer.default_sheet
    @col = Importer::Column.new(@sheet, :test)
    @row = Importer::Row.new(@sheet, 1)
  end

  it 'should respond to build' do
    @col.should respond_to(:build)
    @col.build { required! }
    @col.required?.should be_true
  end

  it 'should convert position strings to indexes' do
    expectations = {
      'A' => 0,
      'C' => 2,
      'AA' => 26,
      'BAA' => 2*26*26 + 26
    }
    expectations.each_pair do |letters, expected_index|
      Importer::Column.pos_to_index(letters).should == expected_index
    end
  end

  it 'should convert position ints to position codes' do
    expectations = {
      0 => 'A',
      25 => 'Z',
      26 => 'AA',
      2*26*26 + 26 + 3 => 'BAD'
    }
    expectations.each_pair do |number, expected_letters|
      Importer::Column.index_to_pos(number).should == expected_letters
    end
  end

  it 'should accept both int and string positions, and convert them to an index' do
    expectations = {
      'A' => 0,
      5 => 4,
      'Z' => 25
    }
    expectations.each_pair do |position, expected_index|
      @col.position = position
      @col.fixed_index.should == expected_index
    end
  end

  it 'should put a pretty output on conversion to string' do
    @col.data.index = 3
    @col.to_s.should == 'Column D'
  end

  it 'should match by key by default' do
    matching = ['Test', 'test', ' TEST ']
    matching.each do |header|
      @col.match_header?(header, 888).should be_true
    end

    non_matching = ['', nil, 'Foo', 'Testy']
    non_matching.each do |header|
      @col.match_header?(header, 888).should be_false
    end
  end

  it 'should default to string type' do
    @col.type.should == :string
  end

  it 'should match by position if position is specified' do
    @col.position 'B'
    @col.match_header?('junk', 1).should be_true
    @col.match_header?('junk', 2).should be_false
  end

  it 'should match custom header matchers' do
    cases = {
      /(alpha|beta)/ => { 'alphabet' => true, 'beta X' => true, 'gamma' => false },
      /^test.$/i => { 'Testy' => true, 'test?' => true, 'notest' => false }
    }
    cases.each_pair do |matcher, samples|
      @col.header matcher
      samples.each_pair do |header, expected|
        @col.match_header?(header, 1234).should == expected
      end
    end
  end

  it 'should properly apply custom parsers' do
    # Without a custom parser the raw value comes back unchanged
    @col.parse_value(@row, 5).should == 5
    @col.parse { |raw| raw.to_i + 2 }
    @col.parse_value(@row, 5).should == 7
  end

  it 'should record exceptions during parsing as errors' do
    @col.parse { |raw| raise 'nope' }
    @importer.has_errors?.should be_false
    @col.parse_value(@row, 5).should be_nil
    @importer.has_errors?.should be_true
  end

  it 'should allow custom validation' do
    @col.validate do |val|
      raise 'nope' if val != 5
    end
    @importer.has_errors?.should be_false
    @col.validate_value(@row, 5).should be_true
    @importer.has_errors?.should be_false
    @col.validate_value(@row, 4).should be_false
    @importer.has_errors?.should be_true
  end

end
@@ -0,0 +1,31 @@
1
# Specs for Importer::CsvReader, exercised end-to-end through an importer
# built with typed columns.
describe Importer::CsvReader do

  before do
    @importer = Importer.new
    @reader = Importer::CsvReader.new(@importer)
  end

  it 'should load our simple CSV data' do
    importer = Importer.build do
      column(:number) { type :integer }
      column(:string) { type :string }
      column(:date)   { type :date }
      column(:cost)   { type :cents }
    end

    importer.import(SpecHelper.sample_path('simple.csv')).should be_true

    expected_rows = [
      {:number => 123, :string => 'Abc', :date => Date.new(1977,5,13), :cost => 899},
      {:number => nil, :string => nil, :date => nil, :cost => nil},
      {:number => 5, :string => 'String with end spaces', :date => Date.new(2004,2,1), :cost => 1000}
    ]
    importer.default_sheet.dump.should == expected_rows
  end

end