mindreframer-oxcelix 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
# Namespace of the Oxcelix gem.
module Oxcelix
  # Version string of the gem. Frozen so that the shared constant cannot be
  # modified in place by code that receives it.
  VERSION = '0.5.0'.freeze
end
@@ -0,0 +1,396 @@
1
+ # The namespace for all classes and modules included on Oxcelix.
2
+ module Oxcelix
3
# Helper methods for the Workbook class.
module Workbookhelper
  # Returns the first sheet whose name equals +sheetname+.
  #
  # @example Select a sheet
  #   w = Workbook.new('Example.xlsx')
  #   sheet = w["Examplesheet"]
  # @param sheetname [String] the name of the sheet to look up
  # @return [Object, nil] the first element of @sheets with that name, or nil
  #   when no sheet matches
  def [](sheetname = String)
    # find stops at the first match instead of filtering the whole collection.
    @sheets.find { |s| s.name == sheetname }
  end
end
14
+
15
+ # A class that represents an Excel workbook. By default, it will open the excel file, and convert it to a collection of
16
+ # Matrix objects
17
+ # @!attribute [rw] sheets
18
+ # @return [Array] a collection of {Sheet} objects
19
+ class Workbook
20
+ include Cellhelper
21
+ include Workbookhelper
22
+ include Numformats
23
+
24
+ attr_accessor :sheets
25
+
26
##
# Create a new {Workbook} object.
#
# filename is the name of the Excel 2007/2010 file (xlsx) to be opened (optional).
#
# options is a collection of options that can be passed to Workbook.
# Options may include:
# * :copymerge (=> true/false) - Copy and repeat the content of the merged cells into the whole group, e.g.
#   the group of three merged cells <tt>| a |</tt>
#   will become: <tt>|a|a|a|</tt>
# * :include (Array) - an array of sheet names to be included
# * :exclude (Array) - an array of sheet names not to be processed
# * :paginate (Array) - an array that defines the number of lines to be included in the pagination and the page to be parsed
# * :cellrange (Range) - the range of cells to be included in parsing
#
# Without a filename only the empty collections are set up. With a filename
# the steps are, in order:
# * #unpack - the file is unzipped into a temporary directory
# * #open   - workbook metadata is read and the sheet list is filtered
# * #parse  - the sheets are parsed and converted
# * the temporary directory is removed
#
# @param filename [String, nil] path of the xlsx file, or nil for an empty workbook
# @param options [Hash] see the list above
def initialize(filename = nil, options = {})
  @sheets = []
  @sheetbase = {}
  @sharedstrings = []
  return if filename.nil?

  begin
    unpack filename
    open options
    parse options
  ensure
    # Runs on failure as well, so an error while opening or parsing does not
    # leave the extracted files behind. @destination is still nil when
    # creating the temporary directory itself failed.
    FileUtils.remove_dir(@destination, true) if @destination
  end
end
64
+
65
# Exit hook registered once, while the class body is evaluated.
# NOTE(review): inside this block @destination is looked up on the object that
# is self where at_exit is called (the class body), not on workbook instances,
# and nothing visible here assigns it at that level - so directories created by
# #unpack are not reachable from this hook. Confirm the intended cleanup path.
# The guard keeps nil from being handed to FileUtils.remove_dir.
at_exit do
  FileUtils.remove_dir(@destination, true) if @destination
end
68
+
69
# Unzips the Excel file into a newly created temporary directory and stores
# the directory path in @destination. When called from #initialize the
# directory is removed again once parsing has finished.
#
# Archive entries whose name would resolve to a location outside the
# temporary directory (for example "../../target") are skipped.
# @param [String] filename the name of the Excel file to be unpacked
def unpack(filename)
  @destination = Dir.mktmpdir
  root = File.expand_path(@destination)
  Zip::File.open(filename) do |zip_file|
    zip_file.each do |entry|
      target = File.expand_path(File.join(root, entry.name))
      # Entry names come from the archive and are untrusted: never write
      # outside of the extraction directory.
      next unless target.start_with?(root + File::SEPARATOR)

      FileUtils.mkdir_p(File.dirname(target))
      # File.exist? replaces File.exists?, which no longer exists in Ruby 3.2+.
      zip_file.extract(entry, target) unless File.exist?(target)
    end
  end
end
82
+
83
# Reads the workbook metadata from the unpacked files: the sheet list (filtered
# through the given options), the comment relationships, the shared strings and
# the styles. After the styles are parsed, custom number formats are registered
# and every entry of the style array is replaced by the id taken from
# Numformats::Formatarray.
# @param [Hash] options Options affecting file opening, metadata collection and processing.
def open(options = {})
  workbook_xml = IO.read(@destination + '/xl/workbook.xml')
  workbook_doc = Ox::load(workbook_xml)

  sheetdata(workbook_doc, options)
  commentsrel
  shstrings

  @styles = Styles.new
  File.open(@destination + '/xl/styles.xml', 'r') { |io| Ox.sax_parse(@styles, io) }

  @styles.temparray.sort_by! { |fmt| fmt[:numFmtId].to_i }
  add_custom_formats(@styles.temparray)
  @styles.styleary.map! { |style_id| Numformats::Formatarray[style_id.to_i][:id].to_i }
end
100
+
101
# Parses sheet data by feeding each sheet file to a SAX handler and storing the
# resulting cells and merged-cell list in the sheet's hash (:cells and
# :mergedcells). The handler is a PagSheet when :paginate is given, a Cellrange
# when :cellrange is given, and an Xlsheet otherwise.
#
# Every cell receives its number format from the style array; cells of type "s"
# get their value replaced by the shared string at that index; a cell whose
# coordinates match a comment reference receives that comment. Finally
# #matrixto is invoked with the same options.
# @param [Hash] options Options that affect the parser.
def parse(options = {})
  @sheets.each do |x|
    sheet =
      if !options[:paginate].nil?
        PagSheet.new(options[:paginate][0], options[:paginate][1])
      elsif !options[:cellrange].nil?
        Cellrange.new(options[:cellrange])
      else
        Xlsheet.new
      end

    File.open(@destination + "/xl/#{x[:filename]}", 'r') do |f|
      Ox.sax_parse(sheet, f)
    end

    # Index the comments by cell reference once, instead of scanning the
    # whole comment list for every single cell. The first comment found for
    # a reference is the one that is used.
    comment_for = {}
    (mkcomments(x[:comments]) || []).each do |c|
      comment_for[c[:ref]] = c unless comment_for.key?(c[:ref])
    end

    sheet.cellarray.each do |sh|
      sh.numformat = @styles.styleary[sh.style.to_i]
      sh.value = @sharedstrings[sh.value.to_i] if sh.type == "s"
      # delete: a comment is handed out only once per reference.
      found = comment_for.delete(sh.xlcoords)
      sh.comment = found[:comment] if found
    end
    x[:cells] = sheet.cellarray
    x[:mergedcells] = sheet.mergedcells
  end
  matrixto options
end
138
+
139
+ private
140
# @private
# Given the data found in workbook.xml, create one hash per sheet (:name,
# :sheetId, :relationId and, when a relationship with the same id exists,
# :filename) and push it to the sheets array.
#
# The collected sheets are then filtered by #sheet_collection according to the
# :include and :exclude options.
# @param wb_file the parsed workbook.xml document
# @param [Hash] options passed on to #sheet_collection
def sheetdata(wb_file, options)
  # The relationship file is the same for every sheet, so it is read and
  # parsed once here rather than once per sheet.
  relationshipfile = nil
  rels_dir = @destination + '/xl/_rels'
  unless Dir[rels_dir].empty?
    Find.find(rels_dir) do |path|
      # When several .rels files exist, the last one visited is used.
      relationshipfile = Ox.load(IO.read(path)) if File.basename(path).split(".").last == 'rels'
    end
  end
  # With no relationship file the sheets are still registered, only without
  # a :filename, instead of failing on a nil document.
  relationships = relationshipfile ? relationshipfile.locate("Relationships/*") : []

  wb_file.locate("workbook/sheets/*").each do |x|
    entry = {
      name: x[:name],
      sheetId: x[:sheetId],
      relationId: x[:"r:id"]
    }
    relationships.each do |rship|
      entry[:filename] = rship[:Target] if rship[:Id] == x[:"r:id"]
    end
    @sheets << entry
  end

  sheet_collection(@sheets.map { |i| i[:name] }, options)
end
178
+
179
# Build the array of working sheets based on the :include and :exclude options.
#
# A sheet is kept when its name is in +sheetarr+, is listed in :include (only
# when :include is non-empty) and is not listed in :exclude. Duplicate entries
# are removed from @sheets afterwards. Neither +sheetarr+ nor +options+ is
# modified.
# @param [Array] sheetarr the names of all sheets found in the workbook
# @param [Hash] options may contain :include and :exclude
# @return [Array, nil] the result of removing duplicates from @sheets in place
def sheet_collection(sheetarr, options)
  included = Array(options[:include])
  excluded = Array(options[:exclude])

  wanted = included.empty? ? sheetarr : sheetarr.select { |name| included.include?(name) }
  wanted -= excluded

  @sheets.keep_if { |item| wanted.include?(item[:name]) }
  @sheets.uniq!
end
190
+
191
# Build the relationship between the sheets and the XML files that store their
# comments. For every .rels file below xl/worksheets/_rels, each relationship
# whose target contains "comments" is stored under :comments in the sheet whose
# :filename corresponds to that .rels file. When the _rels directory is not
# present, :comments is set to nil for every sheet.
def commentsrel
  rels_dir = @destination + '/xl/worksheets/_rels'
  return @sheets.each { |s| s[:comments] = nil } if Dir[rels_dir].empty?

  Find.find(rels_dir) do |path|
    next unless File.basename(path).split(".").last == 'rels'

    sheet_file = "worksheets/" + File.basename(path, ".rels")
    relations = Ox::load(IO.read(path))
    relations.locate("Relationships/*").each do |rel|
      next unless rel[:Target].include?("comments")

      @sheets.each do |s|
        s[:comments] = rel[:Target] if s[:filename] == sheet_file
      end
    end
  end
end
216
+
217
# Invokes the Sharedstrings helper class on xl/sharedStrings.xml and stores the
# resulting string array in @sharedstrings.
#
# When the unpacked workbook has no sharedStrings.xml, @sharedstrings is set to
# an empty array instead of failing on the missing file.
# @return [Array] the shared strings
def shstrings
  path = @destination + '/xl/sharedStrings.xml'
  return @sharedstrings = [] unless File.exist?(path)

  strings = Sharedstrings.new
  File.open(path, 'r') { |f| Ox.sax_parse(strings, f) }
  @sharedstrings = strings.stringarray
end
225
+
226
# Parses the comments file that belongs to the sheet currently processed.
# Every "../" is stripped from the given target before it is resolved below
# the xl directory of the unpacked workbook.
# @param [String, nil] commentfile relationship target of the comments file
# @return [Array, nil] the comments collected by the Comments handler, or nil
#   when no comments file was given
def mkcomments(commentfile)
  return nil if commentfile.nil?

  handler = Comments.new
  relative = commentfile.gsub('../', '')
  File.open(@destination + '/xl/' + relative, 'r') { |io| Ox.sax_parse(handler, io) }
  handler.commarray
end
238
+
239
# Replaces every sheet hash in @sheets with a Sheet object.
#
# A sheet without cells becomes an empty (0 x 0) Sheet; any other sheet is
# built by #buildsheet.
#
# If the *copymerge* option is *true*, a submatrix (minor) is taken for every
# merge group listed in the sheet's mergedcells array, its first non-nil
# element is looked up, and #mergevalues is called for every position of the
# group with that element.
#
# In both cases the name, sheetId and relationId of the sheet hash are copied
# onto the Sheet, which then takes the place of the hash in @sheets.
# @param [Hash] options
# @return [Array] the sheets array
def matrixto(options)
  @sheets.each_with_index do |sheet, i|
    if sheet[:cells].empty?
      m = Sheet.build(0, 0)
    else
      m = buildsheet(sheet, options)
      if options[:copymerge] == true
        sheet[:mergedcells].each do |mc|
          first, last = mc.split(':')
          x1 = x(first)
          y1 = y(first)
          x2 = x(last)
          y2 = y(last)
          valuecell = m.minor(y1..y2, x1..x2).to_a.flatten.compact[0]
          (x1..x2).each do |col|
            (y1..y2).each do |row|
              m, valuecell = mergevalues(m, col, row, valuecell)
            end
          end
        end
      end
    end

    # Done for empty sheets as well: otherwise the empty Sheet built above
    # would be thrown away and the hash would stay in @sheets.
    m.name = sheet[:name]
    m.sheetId = sheet[:sheetId]
    m.relationId = sheet[:relationId]
    @sheets[i] = m
  end
end
285
+
286
# buildsheet creates a matrix of the needed size and fills it with the cells.
# Mainly for internal use only.
#
# When paginating or parsing only a range of cells, the row/column offset of
# the page or range is subtracted from every cell position, so the matrix does
# not start with unused rows or columns.
#
# The matrix is sized from the largest row and column index found among all
# cells, so a last row that is shorter than an earlier one does not make the
# matrix too narrow.
# @param [Hash] sheet the sheet hash; its :cells array must not be empty
# @param [Hash] options :paginate or :cellrange will affect the size of the matrix
# @return [Sheet] a Sheet object that stores the cells.
def buildsheet(sheet, options)
  ydiff, xdiff = 0, 0
  if !options[:paginate].nil?
    ydiff = options[:paginate][0] * (options[:paginate][1] - 1)
  elsif !options[:cellrange].nil?
    xdiff = x(options[:cellrange].begin)
    ydiff = y(options[:cellrange].begin)
  end

  cells = sheet[:cells]
  rows = cells.map(&:y).max + 1 - ydiff
  cols = cells.map(&:x).max + 1 - xdiff

  m = Sheet.build(rows, cols) { nil }
  cells.each { |c| m[c.y - ydiff, c.x - xdiff] = c }
  m
end
307
+
308
# Fills one position of a merge group.
#
# A copy of +valuecell+ is placed at [row, col] with its xlcoords set to the
# Excel address of that position. Every position receives its own copy, so the
# coordinates written for one position are not overwritten when the next one
# is filled. When +valuecell+ is nil, a new empty Cell is used as the value for
# this and - through the returned value - the following positions.
# NOTE(review): only xlcoords is updated on the copy; any other position data
# the Cell may carry is copied unchanged, as before - confirm.
# @param [Matrix] m the Sheet object
# @param [Integer] col Column of the actual cell
# @param [Integer] row Row of the actual cell
# @param [Cell] valuecell A Cell containing the value to be copied over the mergegroup
# @return [Matrix, Cell] the sheet and the cell to pass in for the next position
def mergevalues(m, col, row, valuecell)
  valuecell = Cell.new if valuecell.nil?

  placed = valuecell.dup
  placed.xlcoords = col.col_name + (row + 1).to_s
  m[row, col] = placed

  return m, valuecell
end
326
+ end
327
+
328
# RawWorkbook is a Workbook that contains the raw values of the original Excel cells instead of Cell objects.
# The values are taken from the cells by calling Cell#value.
class RawWorkbook < Workbook
  private

  # Builds the Sheet for one sheet hash, storing cell.value at each position.
  # The offset of a :paginate page or :cellrange start is subtracted from every
  # position. The matrix is sized from the largest row and column index among
  # all cells, so a short last row does not make it too narrow.
  # @param [Hash] sheet the sheet hash; its :cells array must not be empty
  # @param [Hash] options :paginate or :cellrange will affect the size of the matrix
  # @return [Sheet] a Sheet holding the raw cell values
  def buildsheet(sheet, options)
    ydiff, xdiff = 0, 0
    if !options[:paginate].nil?
      ydiff = options[:paginate][0] * (options[:paginate][1] - 1)
    elsif !options[:cellrange].nil?
      xdiff = x(options[:cellrange].begin)
      ydiff = y(options[:cellrange].begin)
    end

    cells = sheet[:cells]
    rows = cells.map(&:y).max + 1 - ydiff
    cols = cells.map(&:x).max + 1 - xdiff

    m = Sheet.build(rows, cols) { nil }
    cells.each { |c| m[c.y - ydiff, c.x - xdiff] = c.value }
    m
  end
end
349
+
350
# RuValueWorkbook is a Workbook that contains the "rubyfied" values of the original Excel cells instead of Cell objects
# (e.g. DateTime objects).
# The values are taken from the cells by calling Cell#to_ru while the sheet is
# built, rather than by converting a finished Sheet afterwards.
class RuValueWorkbook < Workbook
  private

  # Builds the Sheet for one sheet hash, storing cell.to_ru at each position.
  # The offset of a :paginate page or :cellrange start is subtracted from every
  # position. The matrix is sized from the largest row and column index among
  # all cells, so a short last row does not make it too narrow.
  # @param [Hash] sheet the sheet hash; its :cells array must not be empty
  # @param [Hash] options :paginate or :cellrange will affect the size of the matrix
  # @return [Sheet] a Sheet holding the converted cell values
  def buildsheet(sheet, options)
    ydiff, xdiff = 0, 0
    if !options[:paginate].nil?
      ydiff = options[:paginate][0] * (options[:paginate][1] - 1)
    elsif !options[:cellrange].nil?
      xdiff = x(options[:cellrange].begin)
      ydiff = y(options[:cellrange].begin)
    end

    cells = sheet[:cells]
    rows = cells.map(&:y).max + 1 - ydiff
    cols = cells.map(&:x).max + 1 - xdiff

    m = Sheet.build(rows, cols) { nil }
    cells.each { |c| m[c.y - ydiff, c.x - xdiff] = c.to_ru }
    m
  end
end
373
+
374
# FormattedWorkbook is a Workbook that contains the formatted values (strings) of the original Excel cells instead of Cell objects.
# The values are taken from the cells by calling Cell#to_fmt while the sheet is
# built, rather than by converting a finished Sheet afterwards.
class FormattedWorkbook < Workbook
  private

  # Builds the Sheet for one sheet hash, storing cell.to_fmt at each position.
  # The offset of a :paginate page or :cellrange start is subtracted from every
  # position. The matrix is sized from the largest row and column index among
  # all cells, so a short last row does not make it too narrow.
  # @param [Hash] sheet the sheet hash; its :cells array must not be empty
  # @param [Hash] options :paginate or :cellrange will affect the size of the matrix
  # @return [Sheet] a Sheet holding the formatted cell values
  def buildsheet(sheet, options)
    ydiff, xdiff = 0, 0
    if !options[:paginate].nil?
      ydiff = options[:paginate][0] * (options[:paginate][1] - 1)
    elsif !options[:cellrange].nil?
      xdiff = x(options[:cellrange].begin)
      ydiff = y(options[:cellrange].begin)
    end

    cells = sheet[:cells]
    rows = cells.map(&:y).max + 1 - ydiff
    cols = cells.map(&:x).max + 1 - xdiff

    m = Sheet.build(rows, cols) { nil }
    cells.each { |c| m[c.y - ydiff, c.x - xdiff] = c.to_fmt }
    m
  end
end
396
+ end
data/oxcelix.gemspec ADDED
@@ -0,0 +1,28 @@
1
# coding: utf-8
# Gem specification for mindreframer-oxcelix. The version is read from
# lib/oxcelix/version.rb, which is why lib is put on the load path first.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'oxcelix/version'

Gem::Specification.new do |spec|
  # Identity and metadata
  spec.name        = "mindreframer-oxcelix"
  spec.version     = Oxcelix::VERSION
  spec.authors     = ['Giovanni Biczo', "Roman Heinrich"]
  spec.email       = ["roman.heinrich@gmail.com"]
  spec.summary     = 'A fast Excel 2007/2010 file parser'
  spec.description = 'A fast Excel 2007/2010 (.xlsx) file parser that returns a collection of Matrix objects'
  spec.homepage    = "http://github.com/mindreframer/oxcelix"
  spec.license     = "MIT"

  # Packaged files are whatever git tracks; executables and test files are
  # picked out of that list by path.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies with their minimum versions
  { "ox" => ">= 2.1.7", "rubyzip" => ">= 1.1.0" }.each do |gem_name, requirement|
    spec.add_runtime_dependency gem_name, [requirement]
  end

  # Development-only dependencies
  %w[pry rake rspec oga].each do |gem_name|
    spec.add_development_dependency gem_name
  end
end
data/spec/cell_spec.rb ADDED
@@ -0,0 +1,13 @@
1
require './spec/spec_helper'

# Checks that Cell#r stores the given address and derives the zero-based
# column (x) and row (y) from it.
describe "Cell" do
  describe '#r' do
    let(:cell) { Oxcelix::Cell.new }

    it "sets the value of xlcoords" do
      cell.r('H276')

      cell.xlcoords.should == 'H276'
      cell.x.should == 7
      cell.y.should == 275
    end
  end
end
@@ -0,0 +1,11 @@
1
require './spec/spec_helper'

# Checks that #col_name maps the zero-based indexes 0..25 to the single-letter
# Excel column names 'A'..'Z'.
describe "Fixnum object" do
  describe '#col_name' do
    it "returns a string representing an excel column name" do
      letters = ('A'..'Z').to_a
      0.upto(25) do |index|
        index.col_name.should == letters[index]
      end
    end
  end
end