mindreframer-oxcelix 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/.travis.yml +7 -0
- data/.yardopts +3 -0
- data/CHANGES +33 -0
- data/Gemfile +4 -0
- data/History.md +12 -0
- data/LICENSE +20 -0
- data/README.md +82 -0
- data/README.rdoc +70 -0
- data/Rakefile +8 -0
- data/lib/oxcelix.rb +24 -0
- data/lib/oxcelix/cell.rb +22 -0
- data/lib/oxcelix/cellhelper.rb +49 -0
- data/lib/oxcelix/core_ext/ext.rb +26 -0
- data/lib/oxcelix/nf.rb +172 -0
- data/lib/oxcelix/numformats.rb +115 -0
- data/lib/oxcelix/sax/comments.rb +28 -0
- data/lib/oxcelix/sax/sharedstrings.rb +17 -0
- data/lib/oxcelix/sax/styles.rb +49 -0
- data/lib/oxcelix/sax/xlsheet.rb +136 -0
- data/lib/oxcelix/sheet.rb +97 -0
- data/lib/oxcelix/version.rb +3 -0
- data/lib/oxcelix/workbook.rb +396 -0
- data/oxcelix.gemspec +28 -0
- data/spec/cell_spec.rb +13 -0
- data/spec/fixnum_spec.rb +11 -0
- data/spec/fixtures/shared_strings.xml +12 -0
- data/spec/fixtures/test.xlsx +0 -0
- data/spec/matrix_spec.rb +11 -0
- data/spec/oxcelix_spec.rb +31 -0
- data/spec/sax/shared_strings_spec.rb +20 -0
- data/spec/spec_helper.rb +4 -0
- data/spec/string_spec.rb +21 -0
- metadata +172 -0
@@ -0,0 +1,396 @@
|
|
1
|
+
# The namespace for all classes and modules included on Oxcelix.
|
2
|
+
module Oxcelix
|
3
|
+
# Helper methods for the Workbook class
module Workbookhelper
  # Returns the first element of @sheets whose #name equals +sheetname+.
  #
  # The search stops at the first match instead of collecting every match
  # and then taking the first one.
  #
  # @example Select a sheet
  #   w = Workbook.new('Example.xlsx')
  #   sheet = w["Examplesheet"]
  # @param sheetname the name to look for. The default (the String class
  #   itself) is kept only for backward compatibility of the signature.
  # @return the first matching element of @sheets, or nil when none matches
  def [](sheetname=String)
    @sheets.find { |s| s.name == sheetname }
  end
end
|
14
|
+
|
15
|
+
# A class that represents an Excel workbook. By default, it will open the excel file, and convert it to a collection of
|
16
|
+
# Matrix objects
|
17
|
+
# @!attribute [rw] sheets
|
18
|
+
# @return [Array] a collection of {Sheet} objects
|
19
|
+
class Workbook
|
20
|
+
include Cellhelper
|
21
|
+
include Workbookhelper
|
22
|
+
include Numformats
|
23
|
+
|
24
|
+
attr_accessor :sheets
|
25
|
+
|
26
|
+
##
|
27
|
+
# Create a new {Workbook} object.
|
28
|
+
#
|
29
|
+
# filename is the name of the Excel 2007/2010 file (xlsx) to be opened (Optional)
|
30
|
+
#
|
31
|
+
# options is a collection of options that can be passed to Workbook.
|
32
|
+
# Options may include:
|
33
|
+
# * :copymerge (=> true/false) - Copy and repeat the content of the merged cells into the whole group, e.g.
|
34
|
+
# the group of three merged cells <tt>| a |</tt>
|
35
|
+
# will become: <tt>|a|a|a|</tt>
|
36
|
+
# * :include (Array) - an array of sheet names to be included
|
37
|
+
# * :exclude (Array) - an array of sheet names not to be processed
|
38
|
+
# * :paginate (Array) - an array that defines the number of lines to be included in the pagination and the page to be parsed
|
39
|
+
# * :cellrange (Range) - the range of cells to be included in parsing
|
40
|
+
#
|
41
|
+
# If a filename gets passed, the excel file is first getting unzipped, then
|
42
|
+
# the workbook.xml file gets processed.
|
43
|
+
# This file stores sheet metadata, which will be filtered (by including
|
44
|
+
# and excluding sheets from further processing)
|
45
|
+
#
|
46
|
+
# The next stage is building sheets.
|
47
|
+
# This includes:
|
48
|
+
# * Parsing the XML files representing the sheets
|
49
|
+
# * Interpolation of the shared strings
|
50
|
+
# * adding comments to the cells
|
51
|
+
# * Converting each sheet to a Matrix object
|
52
|
+
# * Deleting the temporary directory that stores the XML files.
|
53
|
+
def initialize(filename=nil, options={})
  @sheets = []
  @sheetbase = {}
  @sharedstrings = []
  # Without a filename only the empty containers are set up.
  return if filename.nil?

  begin
    unpack filename
    open options
    parse options
  ensure
    # Remove the temporary directory even when opening or parsing raises, so
    # a malformed workbook does not leave the unpacked files behind.
    # @destination is still nil if unpack failed before creating it.
    FileUtils.remove_dir(@destination, true) if @destination
  end
end

# NOTE(review): this hook is registered while the class body is evaluated, so
# the @destination it reads is the class-level instance variable, not the one
# an instance sets in #unpack. Nothing visible in this file assigns it at class
# level - confirm whether this hook ever removes anything.
at_exit do
  FileUtils.remove_dir(@destination, true)
end
|
68
|
+
|
69
|
+
# Unzips the excel file to a newly created temporary directory and stores the
# directory's path in @destination. When invoked by initialize, the directory
# is removed at the end of the parsing stage.
# NOTE(review): when called directly, removal relies on the class-level
# at_exit hook - confirm that hook actually sees this instance's directory.
#
# Archive entries whose name would resolve outside of the temporary directory
# (e.g. names containing "../") are skipped with a warning rather than written,
# since the archive content is not trusted.
# @param [String] filename the name of the Excel file to be unpacked
def unpack(filename)
  @destination = Dir.mktmpdir
  root = File.expand_path(@destination)
  Zip::File.open(filename) do |zip_file|
    zip_file.each do |entry|
      f_path = File.expand_path(entry.name, root)
      # Only paths strictly below the temporary directory are acceptable.
      unless f_path.start_with?(root + File::SEPARATOR)
        warn "Oxcelix: skipping unsafe archive entry #{entry.name.inspect}"
        next
      end
      FileUtils.mkdir_p(File.dirname(f_path))
      # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
      zip_file.extract(entry, f_path) unless File.exist?(f_path)
    end
  end
end
|
82
|
+
|
83
|
+
# Reads the workbook-level metadata out of the unpacked archive: the sheet
# list from xl/workbook.xml, the comment relationships, the shared strings and
# the styles from xl/styles.xml.
# @param [Hash] options Options affecting file opening, metadata collection and processing;
#   handed on to sheetdata.
# @return [Array] @styles.styleary after each entry has been mapped through Numformats::Formatarray
def open(options={})
  workbook_xml = Ox::load(IO.read(@destination + '/xl/workbook.xml'))

  sheetdata(workbook_xml, options)
  commentsrel
  shstrings

  @styles = Styles.new()
  File.open(@destination + '/xl/styles.xml', 'r') { |io| Ox.sax_parse(@styles, io) }

  # The collected formats are ordered by their numeric id before being registered.
  @styles.temparray.sort_by! { |fmt| fmt[:numFmtId].to_i }
  add_custom_formats(@styles.temparray)
  @styles.styleary.map! { |idx| Numformats::Formatarray[idx.to_i][:id].to_i }
end
|
100
|
+
|
101
|
+
# Parses sheet data by feeding each sheet's XML file to a SAX handler and
# storing the resulting cells and merged-cell list in the sheet's hash.
#
# The handler is a PagSheet when :paginate is given, a Cellrange when
# :cellrange is given, and an Xlsheet otherwise. For every parsed cell the
# number format is looked up from the styles, shared-string cells (type "s")
# get their string value, and the comment for the cell's coordinates, if any,
# is attached. Finally matrixto converts the sheets.
# @param [Hash] options Options that affect the parser.
def parse(options={})
  @sheets.each do |x|
    if !options[:paginate].nil?
      sheet = PagSheet.new(options[:paginate][0], options[:paginate][1])
    elsif !options[:cellrange].nil?
      sheet = Cellrange.new(options[:cellrange])
    else
      sheet = Xlsheet.new()
    end

    File.open(@destination+"/xl/#{x[:filename]}", 'r') do |f|
      Ox.sax_parse(sheet, f)
    end

    # Index the comments by cell reference once, keeping the first comment
    # per reference, instead of scanning the whole comment list for every cell.
    pending = {}
    (mkcomments(x[:comments]) || []).each do |c|
      pending[c[:ref]] = c unless pending.key?(c[:ref])
    end

    sheet.cellarray.each do |sh|
      sh.numformat = @styles.styleary[sh.style.to_i]
      sh.value = @sharedstrings[sh.value.to_i] if sh.type == "s"
      # A comment is consumed by the first cell that claims its reference.
      entry = pending.delete(sh.xlcoords)
      sh.comment = entry[:comment] if entry
    end
    x[:cells] = sheet.cellarray
    x[:mergedcells] = sheet.mergedcells
  end
  matrixto options
end
|
138
|
+
|
139
|
+
private
|
140
|
+
# @private
# Given the data found in workbook.xml, create a hash for every sheet
# (:name, :sheetId, :relationId and, when the relationship file has an entry
# with the sheet's r:id, :filename) and push it to the sheets array.
#
# The relationship file is located and parsed once, not once per sheet.
#
# Afterwards sheet_collection filters the array according to the :include
# and :exclude options.
# @param wb_file the parsed workbook.xml document
# @param [Hash] options passed on to sheet_collection
# @raise [RuntimeError] when the workbook lists sheets but no relationship
#   file exists under xl/_rels
def sheetdata(wb_file, options)
  relationships = workbook_relationships
  targets = {}
  unless relationships.nil?
    # Later entries overwrite earlier ones with the same Id.
    relationships.locate("Relationships/*").each do |rship|
      targets[rship[:Id]] = rship[:Target]
    end
  end

  wb_file.locate("workbook/sheets/*").each do |x|
    if relationships.nil?
      raise "No relationship file found in xl/_rels; cannot resolve sheet #{x[:name].inspect}"
    end
    entry = {}
    entry[:name] = x[:name]
    entry[:sheetId] = x[:sheetId]
    entry[:relationId] = x[:"r:id"]
    entry[:filename] = targets[x[:"r:id"]] if targets.key?(x[:"r:id"])
    @sheets << entry
  end
  sheetarr = @sheets.map{|i| i[:name]}
  sheet_collection(sheetarr, options)
end

# Loads the relationship document found below xl/_rels.
# @return the parsed document of the last file whose name ends in "rels",
#   or nil when the directory is missing or holds no such file
def workbook_relationships
  rels_dir = @destination + '/xl/_rels'
  return nil if Dir[rels_dir].empty?
  document = nil
  Find.find(rels_dir) do |path|
    if File.basename(path).split(".").last == 'rels'
      document = Ox::load(IO.read(path))
    end
  end
  document
end
|
178
|
+
|
179
|
+
# Build the array of working sheets based on the :include and :exclude parameters.
#
# When :include is non-empty, only the listed names are kept; names listed in
# :exclude are then removed. @sheets is reduced to the hashes whose :name
# survived and duplicate hashes are dropped. A single name may be given
# instead of an array for either option. The options hash is not modified.
# @param [Array] sheetarr the names of all sheets found in the workbook
# @param [Hash] options may contain :include and/or :exclude
# @return [Array, nil] the result of @sheets.uniq! (nil when there were no duplicates)
def sheet_collection(sheetarr, options)
  included = Array(options[:include])
  excluded = Array(options[:exclude])

  wanted = included.empty? ? sheetarr : sheetarr.select { |name| included.include?(name) }
  wanted = wanted - excluded

  @sheets.keep_if { |item| wanted.include?(item[:name]) }
  @sheets.uniq!
end
|
190
|
+
|
191
|
+
# Connects each sheet with the XML file that stores its comments.
#
# Every file ending in "rels" below xl/worksheets/_rels is parsed; for each
# relationship whose Target contains "comments", the sheet whose :filename is
# "worksheets/" plus the rels file's base name gets that Target stored under
# :comments. When the directory does not exist, :comments is set to nil on
# every sheet.
def commentsrel
  rels_dir = @destination + '/xl/worksheets/_rels'
  if Dir[rels_dir].empty?
    @sheets.each { |s| s[:comments] = nil }
  else
    Find.find(rels_dir) do |path|
      next unless File.basename(path).split(".").last == 'rels'

      sheetfile = "worksheets/" + File.basename(path, ".rels")
      Ox::load(IO.read(path)).locate("Relationships/*").each do |rel|
        next unless rel[:Target].include?("comments")

        @sheets.each do |s|
          s[:comments] = rel[:Target] if sheetfile == s[:filename]
        end
      end
    end
  end
end
|
216
|
+
|
217
|
+
# Invokes the Sharedstrings helper class on xl/sharedStrings.xml and stores
# the collected strings in @sharedstrings.
#
# A workbook without any string cell may not contain that file at all; in that
# case the shared string list is simply empty instead of raising Errno::ENOENT.
# @return [Array] the shared strings
def shstrings
  path = @destination + '/xl/sharedStrings.xml'
  return @sharedstrings = [] unless File.exist?(path)

  strings = Sharedstrings.new()
  File.open(path, 'r') do |f|
    Ox.sax_parse(strings, f)
  end
  @sharedstrings = strings.stringarray
end
|
225
|
+
|
226
|
+
# Runs the Comments SAX handler over the comment file of the current sheet.
# The file is looked up below xl/ after every "../" has been removed from
# the given path.
# @param [String] commentfile
# @return [Array] a collection of comments relative to the Excel sheet currently processed;
#   nil when commentfile is nil
def mkcomments(commentfile)
  return if commentfile.nil?

  comms = Comments.new()
  File.open(@destination + '/xl/'+commentfile.gsub('../', ''), 'r') { |io| Ox.sax_parse(comms, io) }
  comms.commarray
end
|
238
|
+
|
239
|
+
# Replaces every entry of @sheets with a Sheet object.
#
# A sheet without cells becomes an empty 0x0 Sheet; any other sheet is built
# by buildsheet. In both cases the name, sheetId and relationId are copied
# onto the Sheet and the Sheet takes the place of the hash in @sheets.
# (Previously a sheet without cells was left behind as a plain hash, which has
# no #name and therefore breaks Workbook#[].)
#
# If the *copymerge* parameter is *true*, a submatrix (minor) of every
# mergegroup is taken (based on the mergedcells array of the sheet), its first
# non-nil cell is looked up, and mergevalues is called for every position of
# the group with that cell.
# @param [Hash] options
# @return [Array] @sheets
def matrixto(options)
  @sheets.each_with_index do |sheet, i|
    if sheet[:cells].empty?
      # The block is passed for consistency with buildsheet.
      # NOTE(review): assumes Sheet.build follows Matrix.build, which only
      # returns a matrix when a block is given - confirm.
      m = Sheet.build(0, 0) { nil }
    else
      m = buildsheet(sheet, options)
      if options[:copymerge] == true
        sheet[:mergedcells].each do |mc|
          a = mc.split(':')
          x1 = x(a[0])
          y1 = y(a[0])
          x2 = x(a[1])
          y2 = y(a[1])
          mrange = m.minor(y1..y2, x1..x2)
          valuecell = mrange.to_a.flatten.compact[0]
          (x1..x2).each do |col|
            (y1..y2).each do |row|
              m, valuecell = mergevalues(m, col, row, valuecell)
            end
          end
        end
      end
    end
    m.name = sheet[:name]
    m.sheetId = sheet[:sheetId]
    m.relationId = sheet[:relationId]
    @sheets[i] = m
  end
end
|
285
|
+
|
286
|
+
# buildsheet creates a matrix of the needed size and fills it with the cells. Mainly for internal use only.
# When paginating or parsing only a range of cells, the row/column offset of the page or range is
# subtracted from every coordinate, so no nil values are left at the beginning of the sheet.
#
# The size is derived from the largest row and column index of all cells. The last cell alone is not
# enough: a short last row after a wider one would yield too few columns.
# @param [Hash] sheet the sheet hash; its :cells entry must be a non-empty array
# @param [Hash] options :paginate or :cellrange will affect the size of the matrix
# @return [Sheet] a Sheet object that stores the cells.
def buildsheet(sheet, options)
  ydiff, xdiff = 0, 0
  if !options[:paginate].nil?
    # lines per page * number of preceding pages
    ydiff = options[:paginate][0] * (options[:paginate][1]-1)
  elsif !options[:cellrange].nil?
    xdiff = x(options[:cellrange].begin)
    ydiff = y(options[:cellrange].begin)
  end

  cells = sheet[:cells]
  rowcount = cells.map(&:y).max + 1 - ydiff
  colcount = cells.map(&:x).max + 1 - xdiff
  m = Sheet.build(rowcount, colcount) {nil}
  cells.each do |c|
    m[c.y-ydiff, c.x-xdiff] = c
  end
  return m
end
|
307
|
+
|
308
|
+
# Fill one position of a mergegroup.
#
# The position receives its own copy of +valuecell+ (or a new, empty Cell when
# +valuecell+ is nil) whose xlcoords are set to the Excel coordinate of the
# position. A copy is used so that every cell of the group keeps its own
# coordinate; assigning the same object everywhere would leave all of them
# with the coordinate written last.
# @param [Matrix] m the Sheet object
# @param [Integer] col Column of the actual cell
# @param [Integer] row Row of the actual cell
# @param [Cell] valuecell A Cell containing the value to be copied over the mergegroup
# @return [Matrix, Cell] the sheet and the cell that was stored at the position
def mergevalues(m, col, row, valuecell)
  cell = valuecell.nil? ? Cell.new : valuecell.dup
  cell.xlcoords = (col.col_name)+(row+1).to_s
  m[row, col] = cell
  return m, cell
end
|
326
|
+
end
|
327
|
+
|
328
|
+
# RawWorkbook is a Workbook that contains the raw values of the original Excel cells instead of Cell objects.
# The values are taken from the Sheet arrays by running the #Cell::value method.
class RawWorkbook < Workbook
  private

  # {include:Workbook}
  #
  # Builds the Sheet for one sheet hash, storing each cell's #value.
  # The size is derived from the largest row and column index of all cells;
  # the last cell alone may belong to a row shorter than an earlier one.
  # @param [Hash] sheet the sheet hash; its :cells entry must be a non-empty array
  # @param [Hash] options :paginate or :cellrange shift the coordinates
  # @return [Sheet] a Sheet holding the raw cell values
  def buildsheet(sheet, options)
    ydiff, xdiff = 0, 0
    if !options[:paginate].nil?
      ydiff = options[:paginate][0] * (options[:paginate][1]-1)
    elsif !options[:cellrange].nil?
      xdiff = x(options[:cellrange].begin)
      ydiff = y(options[:cellrange].begin)
    end
    cells = sheet[:cells]
    rowcount = cells.map(&:y).max + 1 - ydiff
    colcount = cells.map(&:x).max + 1 - xdiff
    m = Sheet.build(rowcount, colcount) {nil}
    cells.each do |c|
      m[c.y-ydiff, c.x-xdiff] = c.value
    end
    return m
  end
end
|
349
|
+
|
350
|
+
# RuValueWorkbook is a Workbook that contains the "rubyfied" values of the original Excel cells instead of Cell objects
# (e.g. DateTime objects).
# The values are taken from the Sheet arrays by running the #Cell::to_ru method. The result will be exactly the same as if
# you ran the #Sheet::to_ru method, but it will be snappier as the merged cellgroups will not need to be processed.
class RuValueWorkbook < Workbook
  private

  # {include:Workbook}
  #
  # Builds the Sheet for one sheet hash, storing the result of each cell's #to_ru.
  # The size is derived from the largest row and column index of all cells;
  # the last cell alone may belong to a row shorter than an earlier one.
  # @param [Hash] sheet the sheet hash; its :cells entry must be a non-empty array
  # @param [Hash] options :paginate or :cellrange shift the coordinates
  # @return [Sheet] a Sheet holding the converted cell values
  def buildsheet(sheet, options)
    ydiff, xdiff = 0, 0
    if !options[:paginate].nil?
      ydiff = options[:paginate][0] * (options[:paginate][1]-1)
    elsif !options[:cellrange].nil?
      xdiff = x(options[:cellrange].begin)
      ydiff = y(options[:cellrange].begin)
    end
    cells = sheet[:cells]
    rowcount = cells.map(&:y).max + 1 - ydiff
    colcount = cells.map(&:x).max + 1 - xdiff
    m = Sheet.build(rowcount, colcount) {nil}
    cells.each do |c|
      m[c.y-ydiff, c.x-xdiff] = c.to_ru
    end
    return m
  end
end
|
373
|
+
|
374
|
+
# FormattedWorkbook is a Workbook that contains the formatted values (strings) of the original Excel cells instead of Cell objects.
# The values are taken from the Sheet arrays by running the #Cell::to_fmt method. The result will be exactly the same as if
# you ran the #Sheet::to_fmt method, but it will be snappier as the merged cellgroups will not need to be processed.
class FormattedWorkbook < Workbook
  private

  # {include:Workbook}
  #
  # Builds the Sheet for one sheet hash, storing the result of each cell's #to_fmt.
  # The size is derived from the largest row and column index of all cells;
  # the last cell alone may belong to a row shorter than an earlier one.
  # @param [Hash] sheet the sheet hash; its :cells entry must be a non-empty array
  # @param [Hash] options :paginate or :cellrange shift the coordinates
  # @return [Sheet] a Sheet holding the formatted cell values
  def buildsheet(sheet, options)
    ydiff, xdiff = 0, 0
    if !options[:paginate].nil?
      ydiff = options[:paginate][0] * (options[:paginate][1]-1)
    elsif !options[:cellrange].nil?
      xdiff = x(options[:cellrange].begin)
      ydiff = y(options[:cellrange].begin)
    end
    cells = sheet[:cells]
    rowcount = cells.map(&:y).max + 1 - ydiff
    colcount = cells.map(&:x).max + 1 - xdiff
    m = Sheet.build(rowcount, colcount) {nil}
    cells.each do |c|
      m[c.y-ydiff, c.x-xdiff] = c.to_fmt
    end
    return m
  end
end
|
396
|
+
end
|
data/oxcelix.gemspec
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'oxcelix/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
  # Identity
  spec.name        = "mindreframer-oxcelix"
  spec.version     = Oxcelix::VERSION
  spec.license     = "MIT"
  spec.homepage    = "http://github.com/mindreframer/oxcelix"
  spec.authors     = ['Giovanni Biczo', "Roman Heinrich"]
  spec.email       = ["roman.heinrich@gmail.com"]
  spec.summary     = 'A fast Excel 2007/2010 file parser'
  spec.description = 'A fast Excel 2007/2010 (.xlsx) file parser that returns a collection of Matrix objects'

  # Packaged files: everything tracked by git; executables and test files
  # are picked out of that list by their directory.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies
  spec.add_runtime_dependency "ox", ">= 2.1.7"
  spec.add_runtime_dependency "rubyzip", ">= 1.1.0"

  # Development dependencies
  ["pry", "rake", "rspec", "oga"].each do |gem_name|
    spec.add_development_dependency gem_name
  end
end
|
data/spec/cell_spec.rb
ADDED
data/spec/fixnum_spec.rb
ADDED