mindreframer-oxcelix 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,3 @@
1
# The namespace for all classes and modules of the Oxcelix gem.
module Oxcelix
  # Version of the gem. Frozen so that the shared constant string cannot be
  # modified in place by any code that gets hold of it.
  VERSION = '0.5.0'.freeze
end
@@ -0,0 +1,396 @@
1
+ # The namespace for all classes and modules included in Oxcelix.
2
+ module Oxcelix
3
# Helper methods for the Workbook class
module Workbookhelper
  # Returns the first sheet whose +name+ equals +sheetname+.
  #
  # The lookup stops at the first match instead of scanning every sheet.
  #
  # @example Select a sheet
  #   w = Workbook.new('Example.xlsx')
  #   sheet = w["Examplesheet"]
  # @param sheetname [String] the name of the sheet to look up. The default
  #   value (the String class itself) is kept only for backward compatibility;
  #   it is compared with == like any other value, so a call without an
  #   argument returns nil unless a sheet name happens to equal it.
  # @return [Object, nil] the first element of @sheets with a matching name,
  #   or nil if there is none
  def [](sheetname = String)
    @sheets.find { |s| s.name == sheetname }
  end
end
14
+
15
+ # A class that represents an Excel workbook. By default, it will open the excel file, and convert it to a collection of
16
+ # Matrix objects
17
+ # @!attribute [rw] sheets
18
+ # @return [Array] a collection of {Sheet} objects
19
+ class Workbook
20
+ include Cellhelper
21
+ include Workbookhelper
22
+ include Numformats
23
+
24
+ attr_accessor :sheets
25
+
26
##
# Create a new {Workbook} object.
#
# filename is the name of the Excel 2007/2010 file (xlsx) to be opened (optional).
#
# options is a collection of options that can be passed to Workbook.
# Options may include:
# * :copymerge (=> true/false) - Copy and repeat the content of the merged cells into the whole group, e.g.
#   the group of three merged cells <tt>|   a   |</tt>
#   will become: <tt>|a|a|a|</tt>
# * :include (Array) - an array of sheet names to be included
# * :exclude (Array) - an array of sheet names not to be processed
# * :paginate (Array) - an array that defines the number of lines to be included in the pagination and the page to be parsed
# * :cellrange (Range) - the range of cells to be included in parsing
#
# If a filename is passed, the file is unzipped into a temporary directory
# (#unpack), the workbook metadata is read (#open) and the sheets are parsed
# and converted (#parse). The temporary directory is removed afterwards, even
# when one of these steps raises, so a broken file does not leave extracted
# XML behind.
#
# Without a filename only the empty internal collections are set up.
#
# @param filename [String, nil] path of the xlsx file, or nil for an empty workbook
# @param options [Hash] see the list above
def initialize(filename=nil, options={})
  @sheets = []
  @sheetbase = {}
  @sharedstrings = []
  return if filename.nil?

  begin
    unpack filename
    open options
    parse options
  ensure
    # @destination is only set once unpack managed to create the directory.
    FileUtils.remove_dir(@destination, true) if @destination
  end
end
64
+
65
# Temporary directories created by #unpack that may still exist.
# The list is kept at class level because the at_exit block below runs with
# the class (not a workbook instance) as self and therefore cannot read an
# instance's @destination.
UNPACKED_DIRS = []

# Remove every extraction directory that is still around when the
# interpreter exits (force = true: already deleted directories are ignored).
at_exit do
  UNPACKED_DIRS.each { |dir| FileUtils.remove_dir(dir, true) }
end

# Unzips the excel file to a temporary directory. The directory will be removed at the end of the parsing stage when invoked
# by initialize, otherwise at exit.
#
# Archive entries whose name would resolve to a location outside of the
# temporary directory (e.g. names containing "../") are skipped.
# @param [String] filename the name of the Excel file to be unpacked
def unpack(filename)
  @destination = Dir.mktmpdir
  # Drop directories that were removed in the meantime so the list does not
  # grow with every workbook parsed in a long-running process.
  UNPACKED_DIRS.select! { |dir| File.directory?(dir) }
  UNPACKED_DIRS << @destination
  Zip::File.open(filename) do |zip_file|
    zip_file.each do |f|
      f_path = entry_destination(@destination, f.name)
      next if f_path.nil?
      FileUtils.mkdir_p(File.dirname(f_path))
      # File.exists? no longer exists in current Ruby versions; File.exist? is the supported spelling.
      zip_file.extract(f, f_path) unless File.exist?(f_path)
    end
  end
end

# Resolves an archive entry name against the extraction root.
# @param [String] root the extraction directory
# @param [String] entry_name the entry name as stored in the archive
# @return [String, nil] the absolute target path, or nil if the entry would
#   be written outside of (or directly onto) the root directory
def entry_destination(root, entry_name)
  base = File.expand_path(root)
  target = File.expand_path(File.join(base, entry_name))
  target.start_with?(base + File::SEPARATOR) ? target : nil
end
private :entry_destination
82
+
83
# Reads the workbook-level metadata from the unpacked archive.
#
# In order: loads xl/workbook.xml and hands it to sheetdata, runs commentsrel
# and shstrings, then SAX-parses xl/styles.xml into a Styles handler. The
# collected number formats are sorted by their numFmtId, registered through
# add_custom_formats, and every entry of the style array is replaced by the
# :id of the Numformats::Formatarray element it points to.
# @param [Hash] options Options affecting file opening, metadata collection and processing.
def open(options={})
  workbook_xml = IO.read(@destination + '/xl/workbook.xml')
  sheetdata(Ox::load(workbook_xml), options)
  commentsrel
  shstrings

  @styles = Styles.new()
  File.open(@destination + '/xl/styles.xml', 'r') { |io| Ox.sax_parse(@styles, io) }

  @styles.temparray.sort_by! { |fmt| fmt[:numFmtId].to_i }
  add_custom_formats @styles.temparray
  @styles.styleary.map! { |index| Numformats::Formatarray[index.to_i][:id].to_i }
end
100
+
101
# Parses sheet data by feeding the output of the Xlsheet SAX parser into the arrays representing the sheets.
#
# For every sheet hash in @sheets the sheet XML is parsed with the handler
# chosen by the options, the resulting cells are annotated (number format,
# shared string, comment) and stored under :cells together with
# :mergedcells. Finally matrixto converts the sheet hashes.
# @param [Hash] options Options that affect the parser.
def parse(options={})
  @sheets.each do |x|
    sheet = sheet_handler(options)
    File.open(@destination+"/xl/#{x[:filename]}", 'r') { |f| Ox.sax_parse(sheet, f) }
    annotate_cells(sheet.cellarray, mkcomments(x[:comments]))
    x[:cells] = sheet.cellarray
    x[:mergedcells] = sheet.mergedcells
  end
  matrixto options
end

# Picks the SAX handler for a sheet: PagSheet when :paginate is given,
# Cellrange when :cellrange is given, Xlsheet otherwise (:paginate wins if
# both are present).
# @param [Hash] options the parser options
# @return [Object] a fresh handler instance
def sheet_handler(options)
  if !options[:paginate].nil?
    lines = options[:paginate][0]
    page = options[:paginate][1]
    PagSheet.new(lines, page)
  elsif !options[:cellrange].nil?
    Cellrange.new(options[:cellrange])
  else
    Xlsheet.new()
  end
end
private :sheet_handler

# Sets numformat, shared-string value and comment on every cell.
#
# Comments are indexed by their :ref once, so matching is a hash lookup per
# cell instead of a scan over all comments. As before, only the first comment
# of a reference is used and a comment is consumed by the first cell with
# that coordinate.
# @param [Array] cells the cells produced by the sheet handler
# @param [Array<Hash>, nil] comments hashes with :ref and :comment keys, or nil
def annotate_cells(cells, comments)
  pending = {}
  (comments || []).each do |c|
    pending[c[:ref]] = c[:comment] unless pending.key?(c[:ref])
  end

  cells.each do |cell|
    cell.numformat = @styles.styleary[cell.style.to_i]
    # Cells of type "s" hold an index into the shared strings table.
    cell.value = @sharedstrings[cell.value.to_i] if cell.type == "s"
    cell.comment = pending.delete(cell.xlcoords) if pending.key?(cell.xlcoords)
  end
end
private :annotate_cells
138
+
139
+ private
140
# @private
# Given the data found in workbook.xml, create a hash per sheet and push it to
# the @sheets array.
#
# Each hash gets :name, :sheetId and :relationId from the sheet element and,
# if a workbook relationship with the same Id exists, :filename from that
# relationship's Target. The relationship file is read once for all sheets.
# When no relationship file can be found, the sheets are still registered but
# carry no :filename.
#
# After all sheets are collected, sheet_collection filters @sheets according
# to the :include and :exclude options.
# @param [Object] wb_file the parsed workbook.xml; must respond to locate
# @param [Hash] options the workbook options (:include / :exclude are used)
def sheetdata(wb_file, options)
  relationships = workbook_relationships

  wb_file.locate("workbook/sheets/*").each do |x|
    entry = {
      :name => x[:name],
      :sheetId => x[:sheetId],
      :relationId => x[:"r:id"]
    }
    relationships.each do |rship|
      entry[:filename] = rship[:Target] if rship[:Id] == x[:"r:id"]
    end
    @sheets << entry
  end

  sheet_collection(@sheets.map { |i| i[:name] }, options)
end

# Loads the relationship elements of the workbook.
#
# Looks below xl/_rels for entries whose name ends in "rels"; if several are
# found the last one visited by Find.find is used.
# @return [Array] the elements located at "Relationships/*", or an empty array
#   when there is no relationship file
def workbook_relationships
  rels_dir = @destination + '/xl/_rels'
  return [] unless File.exist?(rels_dir)

  rels_path = nil
  Find.find(rels_dir) do |path|
    rels_path = path if File.basename(path).split(".").last == 'rels'
  end
  return [] if rels_path.nil?

  Ox::load(IO.read(rels_path)).locate("Relationships/*")
end
178
+
179
# Build the array of working sheets based on the :include and :exclude parameters.
#
# A sheet is kept when its name is listed in options[:include] (or when that
# list is empty or missing) and is not listed in options[:exclude]. Duplicate
# sheet hashes are removed afterwards. Neither the options hash nor sheetarr
# is modified.
# @param [Array<String>] sheetarr the names of all sheets found in the workbook
# @param [Hash] options the workbook options
# @return [Array] the filtered @sheets array
def sheet_collection(sheetarr, options)
  included = options[:include].to_a
  excluded = options[:exclude].to_a

  wanted = sheetarr.select do |name|
    (included.empty? || included.include?(name)) && !excluded.include?(name)
  end

  @sheets.keep_if { |item| wanted.include?(item[:name]) }
  # uniq! returns nil when nothing was removed, so return @sheets explicitly.
  @sheets.uniq!
  @sheets
end
190
+
191
# Build the relationship between sheets and the XML files storing the comments
# to the actual sheet.
#
# Every sheet hash ends up with a :comments key. If xl/worksheets/_rels does
# not exist, :comments is set to nil for all sheets. Otherwise each
# relationship file found there is read, and for every relationship whose
# Target contains "comments" the Target is stored in the sheet whose
# :filename equals "worksheets/<name of the rels file without .rels>".
# Sheets without such a relationship get nil.
def commentsrel
  rels_dir = @destination + '/xl/worksheets/_rels'

  unless File.exist?(rels_dir)
    @sheets.each { |s| s[:comments] = nil }
    return
  end

  # Make the key present on every sheet so both code paths leave the same shape.
  @sheets.each { |s| s[:comments] = nil unless s.key?(:comments) }

  Find.find(rels_dir) do |path|
    next unless File.basename(path).split(".").last == 'rels'

    sheet_file = "worksheets/" + File.basename(path, ".rels")
    Ox::load(IO.read(path)).locate("Relationships/*").each do |rel|
      next unless rel[:Target].include?("comments")

      @sheets.each do |s|
        s[:comments] = rel[:Target] if s[:filename] == sheet_file
      end
    end
  end
end
216
+
217
# Invokes the Sharedstrings helper class.
#
# SAX-parses xl/sharedStrings.xml and stores the collected strings in
# @sharedstrings. The shared string part is optional in an xlsx package
# (a workbook without text cells may not contain it); in that case
# @sharedstrings is set to an empty array instead of raising Errno::ENOENT.
# @return [Array] the shared strings
def shstrings
  path = @destination + '/xl/sharedStrings.xml'
  unless File.exist?(path)
    @sharedstrings = []
    return @sharedstrings
  end

  strings = Sharedstrings.new()
  File.open(path, 'r') { |f| Ox.sax_parse(strings, f) }
  @sharedstrings = strings.stringarray
end
225
+
226
# Parses the comments related to the actual sheet.
#
# The file is looked up below xl/ in the unpacked archive after every "../"
# has been removed from the given relative path.
# @param [String] commentfile relative path of the comments file, or nil if the sheet has none
# @return [Array] a collection of comments relative to the Excel sheet currently processed
#   (nil when commentfile is nil)
def mkcomments(commentfile)
  return nil if commentfile.nil?

  handler = Comments.new()
  location = @destination + '/xl/' + commentfile.gsub('../', '')
  File.open(location, 'r') { |io| Ox.sax_parse(handler, io) }
  handler.commarray
end
238
+
239
# Replaces every sheet hash in @sheets with a Sheet (matrix) object.
#
# A sheet without cells becomes an empty 0x0 Sheet. Otherwise buildsheet
# creates the matrix and fills it with the cells.
#
# If the *copymerge* option is *true*, every merge group listed in the
# sheet's :mergedcells is processed: the group's address ("A1:C1") is split
# into its two corners, the corresponding submatrix (minor) is taken, the
# first non-nil entry of it is looked up and mergevalues is called for every
# position of the group with that entry.
#
# Name, sheetId and relationId are copied from the hash onto the new object.
# @param [Hash] options
# @return [Array] the @sheets array, as returned by each_with_index
def matrixto(options)
  @sheets.each_with_index do |sheet, index|
    if sheet[:cells].empty?
      matrix = Sheet.build(0,0)
    else
      matrix = buildsheet(sheet, options)
      merge_groups = options[:copymerge] == true ? sheet[:mergedcells] : []
      merge_groups.each do |group|
        corners = group.split(':')
        left = x(corners[0])
        top = y(corners[0])
        right = x(corners[1])
        bottom = y(corners[1])
        filler = matrix.minor(top..bottom, left..right).to_a.flatten.compact.first
        (left..right).each do |col|
          (top..bottom).each do |row|
            matrix, filler = mergevalues(matrix, col, row, filler)
          end
        end
      end
    end

    source = @sheets[index]
    matrix.name = source[:name]
    matrix.sheetId = source[:sheetId]
    matrix.relationId = source[:relationId]
    @sheets[index] = matrix
  end
end
285
+
286
# buildsheet creates a matrix of the needed size and fills it with the cells. Mainly for internal use only.
# When paginating or parsing only a range of cells, the size of the matrix will be adjusted (no nil values
# will be left at the beginning of the sheet), to preserve memory.
#
# The size is derived from the largest row and column index of all cells,
# not just from the last cell, so a sheet whose last row is shorter than an
# earlier one still gets a matrix wide enough for every cell.
# @param [Hash] sheet the sheet hash; its :cells entry must be a non-empty array of cells
# @param [Hash] options :paginate or :cellrange will affect the size of the matrix
# @return [Sheet] a Sheet object that stores the cell values.
def buildsheet(sheet, options)
  ydiff, xdiff = 0, 0
  if !options[:paginate].nil?
    # lines per page * number of preceding pages = rows skipped
    ydiff = options[:paginate][0] * (options[:paginate][1]-1)
  elsif !options[:cellrange].nil?
    xdiff = x(options[:cellrange].begin)
    ydiff = y(options[:cellrange].begin)
  end

  cells = sheet[:cells]
  rows = cells.map { |c| c.y }.max + 1 - ydiff
  cols = cells.map { |c| c.x }.max + 1 - xdiff

  m = Sheet.build(rows, cols) {nil}
  cells.each do |c|
    m[c.y-ydiff, c.x-xdiff] = c
  end
  m
end
307
+
308
# Fills one position of a merge group with a copy of the group's value cell.
#
# Each position receives its own duplicate of valuecell whose xlcoords is set
# to the Excel address of that position. Storing the very same object in
# every position would leave all of them with the coordinate written last.
# If valuecell is nil, a new empty Cell is used as the template instead.
# @param [Matrix] m the Sheet object
# @param [Integer] col Column of the actual cell
# @param [Integer] row Row of the actual cell
# @param [Cell] valuecell A Cell containing the value to be copied over the mergegroup
# @return [Matrix, Cell] the sheet and the template cell to pass to the next call
def mergevalues(m, col, row, valuecell)
  valuecell = Cell.new if valuecell.nil?
  copy = valuecell.dup
  copy.xlcoords = (col.col_name)+(row+1).to_s
  m[row, col] = copy
  return m, valuecell
end
326
+ end
327
+
328
# RawWorkbook is a Workbook that contains the raw values of the original Excel cells instead of Cell objects.
# The values are taken from the Sheet arrays by running the #Cell::value method.
class RawWorkbook < Workbook
  private

  # {include:Workbook}
  #
  # Builds the matrix for one sheet and stores the result of Cell#value at
  # each cell's position. The matrix is sized from the largest row and column
  # index of all cells, so a shorter last row cannot make it too narrow.
  # @param [Hash] sheet the sheet hash; its :cells entry must be a non-empty array of cells
  # @param [Hash] options :paginate or :cellrange will affect the size of the matrix
  # @return [Sheet] a Sheet object that stores the raw cell values
  def buildsheet(sheet, options)
    ydiff, xdiff = 0, 0
    if !options[:paginate].nil?
      ydiff = options[:paginate][0] * (options[:paginate][1]-1)
    elsif !options[:cellrange].nil?
      xdiff = x(options[:cellrange].begin)
      ydiff = y(options[:cellrange].begin)
    end

    cells = sheet[:cells]
    rows = cells.map { |c| c.y }.max + 1 - ydiff
    cols = cells.map { |c| c.x }.max + 1 - xdiff

    m = Sheet.build(rows, cols) {nil}
    cells.each do |c|
      m[c.y-ydiff, c.x-xdiff] = c.value
    end
    m
  end
end
349
+
350
# RuValueWorkbook is a Workbook that contains the "rubyfied" values of the original Excel cells instead of Cell objects
# (e.g. DateTime objects).
# The values are taken from the Sheet arrays by running the #Cell::to_ru method. The result will be exactly the same as if
# you ran the #Sheet::to_ru method, but it will be snappier as the merged cellgroups will not need to be processed.
class RuValueWorkbook < Workbook
  private

  # {include:Workbook}
  #
  # Builds the matrix for one sheet and stores the result of Cell#to_ru at
  # each cell's position. The matrix is sized from the largest row and column
  # index of all cells, so a shorter last row cannot make it too narrow.
  # @param [Hash] sheet the sheet hash; its :cells entry must be a non-empty array of cells
  # @param [Hash] options :paginate or :cellrange will affect the size of the matrix
  # @return [Sheet] a Sheet object that stores the converted cell values
  def buildsheet(sheet, options)
    ydiff, xdiff = 0, 0
    if !options[:paginate].nil?
      ydiff = options[:paginate][0] * (options[:paginate][1]-1)
    elsif !options[:cellrange].nil?
      xdiff = x(options[:cellrange].begin)
      ydiff = y(options[:cellrange].begin)
    end

    cells = sheet[:cells]
    rows = cells.map { |c| c.y }.max + 1 - ydiff
    cols = cells.map { |c| c.x }.max + 1 - xdiff

    m = Sheet.build(rows, cols) {nil}
    cells.each do |c|
      m[c.y-ydiff, c.x-xdiff] = c.to_ru
    end
    m
  end
end
373
+
374
# FormattedWorkbook is a Workbook that contains the formatted values (strings) of the original Excel cells instead of Cell objects.
# The values are taken from the Sheet arrays by running the #Cell::to_fmt method. The result will be exactly the same as if
# you ran the #Sheet::to_fmt method, but it will be snappier as the merged cellgroups will not need to be processed.
class FormattedWorkbook < Workbook
  private

  # {include:Workbook}
  #
  # Builds the matrix for one sheet and stores the result of Cell#to_fmt at
  # each cell's position. The matrix is sized from the largest row and column
  # index of all cells, so a shorter last row cannot make it too narrow.
  # @param [Hash] sheet the sheet hash; its :cells entry must be a non-empty array of cells
  # @param [Hash] options :paginate or :cellrange will affect the size of the matrix
  # @return [Sheet] a Sheet object that stores the formatted cell values
  def buildsheet(sheet, options)
    ydiff, xdiff = 0, 0
    if !options[:paginate].nil?
      ydiff = options[:paginate][0] * (options[:paginate][1]-1)
    elsif !options[:cellrange].nil?
      xdiff = x(options[:cellrange].begin)
      ydiff = y(options[:cellrange].begin)
    end

    cells = sheet[:cells]
    rows = cells.map { |c| c.y }.max + 1 - ydiff
    cols = cells.map { |c| c.x }.max + 1 - xdiff

    m = Sheet.build(rows, cols) {nil}
    cells.each do |c|
      m[c.y-ydiff, c.x-xdiff] = c.to_fmt
    end
    m
  end
end
396
+ end
data/oxcelix.gemspec ADDED
@@ -0,0 +1,28 @@
1
# coding: utf-8
# Gem specification for mindreframer-oxcelix.

# Make lib/ loadable so the version constant can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'oxcelix/version'

Gem::Specification.new do |spec|
  # Identity
  spec.name        = "mindreframer-oxcelix"
  spec.version     = Oxcelix::VERSION
  spec.authors     = ['Giovanni Biczo', "Roman Heinrich"]
  spec.email       = ["roman.heinrich@gmail.com"]
  spec.summary     = 'A fast Excel 2007/2010 file parser'
  spec.description = 'A fast Excel 2007/2010 (.xlsx) file parser that returns a collection of Matrix objects'
  spec.homepage    = "http://github.com/mindreframer/oxcelix"
  spec.license     = "MIT"

  # Packaged files: everything tracked by git (NUL-separated listing).
  tracked = `git ls-files -z`.split("\x0")
  spec.files         = tracked
  spec.executables   = tracked.grep(%r{^bin/}) { |path| File.basename(path) }
  spec.test_files    = tracked.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies
  spec.add_runtime_dependency "ox", [">= 2.1.7"]
  spec.add_runtime_dependency "rubyzip", [">= 1.1.0"]

  # Development dependencies
  %w[pry rake rspec oga].each do |gem_name|
    spec.add_development_dependency gem_name
  end
end
data/spec/cell_spec.rb ADDED
@@ -0,0 +1,13 @@
1
+ require './spec/spec_helper'
2
+
3
# Specs for Oxcelix::Cell.
describe "Cell" do
  describe '#r' do
    # Cell#r takes an Excel address and is expected to set xlcoords as well
    # as the zero-based column (x) and row (y) indices.
    it "sets the value of xlcoords" do
      c = Oxcelix::Cell.new
      c.r('H276')
      # expect syntax: the monkey-patched `should` is deprecated in current RSpec.
      expect(c.xlcoords).to eq('H276')
      expect(c.x).to eq(7)
      expect(c.y).to eq(275)
    end
  end
end
@@ -0,0 +1,11 @@
1
+ require './spec/spec_helper'
2
+
3
# Specs for the col_name extension on integers.
describe "Fixnum object" do
  describe '#col_name' do
    # Indices 0..25 are expected to map to the column letters A..Z.
    it "returns a string representing an excel column name" do
      letters = ('A'..'Z').to_a # built once instead of on every iteration
      letters.each_with_index do |letter, index|
        # expect syntax: the monkey-patched `should` is deprecated in current RSpec.
        expect(index.col_name).to eq(letter)
      end
    end
  end
end