dullard 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +9 -1
- data/lib/dullard/reader.rb +101 -44
- data/lib/dullard/version.rb +1 -1
- data/spec/dullard_spec.rb +45 -2
- data/spec/error_missing_metadata.xlsx +0 -0
- data/spec/error_missing_r.xlsx +0 -0
- data/spec/error_missing_ss.xlsx +0 -0
- metadata +20 -14
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e3a941a2433fc527bc868fd41ef33ab4af46acc8
|
4
|
+
data.tar.gz: 6c0a1f95069ac0c0751e1452b2e6e3464872bd46
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1ac8f461122aaa0fb5d90a3702fac5cdd2524c21e7b78ddfc02355585ba3e0288971e660c4664a78d92e7ca2c0c9afe73c41ba8de71db56ddce6d41d8055129e
|
7
|
+
data.tar.gz: a6a2774cb28d125ab7e5b59026395cc671842e450c36c2cef0cafd98744bde274c6b662efa8764e1e0343eae8367ac8206eae0f4c7e64a315ad54ce83dacc786
|
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# dullard
|
2
|
+
|
1
3
|
Super simple, super fast stream-based XLSX parsing. Suitable for very large
|
2
4
|
files.
|
3
5
|
|
@@ -7,5 +9,11 @@ Requires Ruby 2.0.
|
|
7
9
|
|
8
10
|
workbook = Dullard::Workbook.new "file.xlsx"
|
9
11
|
workbook.sheets[0].rows.each do |row|
|
10
|
-
|
12
|
+
p row # => ["a","b","c", 0.3, #<DateTime: -4712-01-01....>, ...]
|
11
13
|
end
|
14
|
+
|
15
|
+
## Current limitations
|
16
|
+
* Limited validation and error handling.
|
17
|
+
* Formatted cells are read, but formatting is not accessible.
|
18
|
+
* May be buggy. Pull requests welcome!
|
19
|
+
* Rows that end with empty cells may be truncated.
|
data/lib/dullard/reader.rb
CHANGED
@@ -1,7 +1,12 @@
|
|
1
1
|
require 'zip/filesystem'
|
2
2
|
require 'nokogiri'
|
3
3
|
|
4
|
-
module Dullard
|
4
|
+
module Dullard
|
5
|
+
class Error < StandardError; end
|
6
|
+
OOXMLEpoch = DateTime.new(1899,12,30)
|
7
|
+
SharedStringPath = 'xl/sharedStrings.xml'
|
8
|
+
StylesPath = 'xl/styles.xml'
|
9
|
+
end
|
5
10
|
|
6
11
|
class Dullard::Workbook
|
7
12
|
# Code borrowed from Roo (https://github.com/hmcgowan/roo/blob/master/lib/roo/excelx.rb)
|
@@ -12,8 +17,8 @@ class Dullard::Workbook
|
|
12
17
|
'0.00' => :float,
|
13
18
|
'#,##0' => :float,
|
14
19
|
'#,##0.00' => :float,
|
15
|
-
'0%' => :percentage,
|
16
|
-
'0.00%' => :percentage,
|
20
|
+
'0%' => :float,
|
21
|
+
'0.00%' => :float,
|
17
22
|
'0.00E+00' => :float,
|
18
23
|
'# ?/?' => :float, #??? TODO:
|
19
24
|
'# ??/??' => :float, #??? TODO:
|
@@ -78,24 +83,42 @@ class Dullard::Workbook
|
|
78
83
|
|
79
84
|
def initialize(file, user_defined_formats = {})
|
80
85
|
@file = file
|
81
|
-
|
86
|
+
begin
|
87
|
+
@zipfs = Zip::File.open(@file)
|
88
|
+
rescue Zip::Error => e
|
89
|
+
raise Dullard::Error, e.message
|
90
|
+
end
|
82
91
|
@user_defined_formats = user_defined_formats
|
83
92
|
read_styles
|
84
93
|
end
|
85
94
|
|
86
95
|
def sheets
|
87
|
-
|
88
|
-
|
96
|
+
begin
|
97
|
+
workbook = Nokogiri::XML::Document.parse(@zipfs.file.open('xl/workbook.xml'))
|
98
|
+
rescue Zip::Error
|
99
|
+
raise Dullard::Error, 'Invalid file, could not open xl/workbook.xml'
|
100
|
+
end
|
101
|
+
@sheets = workbook.css('sheet').each_with_index.map do |n, i|
|
102
|
+
Dullard::Sheet.new(self, n.attr('name'), n.attr('sheetId'), i+1)
|
103
|
+
end
|
89
104
|
end
|
90
105
|
|
91
106
|
def string_table
|
92
|
-
@
|
107
|
+
@string_table ||= read_string_table
|
93
108
|
end
|
94
109
|
|
95
110
|
def read_string_table
|
96
|
-
|
111
|
+
return [] unless @zipfs.file.exist? Dullard::SharedStringPath
|
112
|
+
|
113
|
+
begin
|
114
|
+
shared_string = @zipfs.file.open(Dullard::SharedStringPath)
|
115
|
+
rescue Zip::Error
|
116
|
+
raise Dullard::Error, 'Invalid file, could not open shared string file.'
|
117
|
+
end
|
118
|
+
|
97
119
|
entry = ''
|
98
|
-
|
120
|
+
@string_table = []
|
121
|
+
Nokogiri::XML::Reader(shared_string).each do |node|
|
99
122
|
if node.name == "si" and node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
|
100
123
|
entry = ''
|
101
124
|
elsif node.name == "si" and node.node_type == Nokogiri::XML::Reader::TYPE_END_ELEMENT
|
@@ -108,20 +131,29 @@ class Dullard::Workbook
|
|
108
131
|
end
|
109
132
|
|
110
133
|
def read_styles
|
111
|
-
doc = Nokogiri::XML(@zipfs.file.open("xl/styles.xml"))
|
112
|
-
|
113
134
|
@num_formats = {}
|
114
135
|
@cell_xfs = []
|
136
|
+
return unless @zipfs.file.exist? Dullard::StylesPath
|
137
|
+
|
138
|
+
begin
|
139
|
+
doc = Nokogiri::XML(@zipfs.file.open(Dullard::StylesPath))
|
140
|
+
rescue Zip::Error
|
141
|
+
raise Dullard::Error, 'Invalid file, could not open styles'
|
142
|
+
end
|
115
143
|
|
116
144
|
doc.css('/styleSheet/numFmts/numFmt').each do |numFmt|
|
117
|
-
numFmtId
|
118
|
-
|
119
|
-
|
145
|
+
if numFmt.attributes['numFmtId'] && numFmt.attributes['formatCode']
|
146
|
+
numFmtId = numFmt.attributes['numFmtId'].value.to_i
|
147
|
+
formatCode = numFmt.attributes['formatCode'].value
|
148
|
+
@num_formats[numFmtId] = formatCode
|
149
|
+
end
|
120
150
|
end
|
121
151
|
|
122
152
|
doc.css('/styleSheet/cellXfs/xf').each do |xf|
|
123
|
-
|
124
|
-
|
153
|
+
if xf.attributes['numFmtId']
|
154
|
+
numFmtId = xf.attributes['numFmtId'].value.to_i
|
155
|
+
@cell_xfs << numFmtId
|
156
|
+
end
|
125
157
|
end
|
126
158
|
end
|
127
159
|
|
@@ -168,11 +200,15 @@ class Dullard::Sheet
|
|
168
200
|
@name = name
|
169
201
|
@id = id
|
170
202
|
@index = index
|
171
|
-
|
203
|
+
begin
|
204
|
+
@file = @workbook.zipfs.file.open(path) if @workbook.zipfs.file.exist?(path)
|
205
|
+
rescue Zip::Error => e
|
206
|
+
raise Dullard::Error, "Couldn't open sheet #{index}: #{e.message}"
|
207
|
+
end
|
172
208
|
end
|
173
209
|
|
174
210
|
def string_lookup(i)
|
175
|
-
@workbook.string_table[i]
|
211
|
+
@workbook.string_table[i] || (raise Dullard::Error, 'File invalid, invalid string table.')
|
176
212
|
end
|
177
213
|
|
178
214
|
def rows
|
@@ -181,6 +217,7 @@ class Dullard::Sheet
|
|
181
217
|
@file.rewind
|
182
218
|
shared = false
|
183
219
|
row = nil
|
220
|
+
cell_map = nil # Map of column letter to cell value for a row
|
184
221
|
column = nil
|
185
222
|
cell_type = nil
|
186
223
|
Nokogiri::XML::Reader(@file).each do |node|
|
@@ -188,55 +225,65 @@ class Dullard::Sheet
|
|
188
225
|
when Nokogiri::XML::Reader::TYPE_ELEMENT
|
189
226
|
case node.name
|
190
227
|
when "row"
|
191
|
-
|
192
|
-
column = 0
|
228
|
+
cell_map = {}
|
193
229
|
next
|
194
|
-
when "c"
|
195
|
-
|
230
|
+
when 'c'
|
231
|
+
node_type = node.attributes['t']
|
232
|
+
cell_index = node.attributes['r']
|
233
|
+
if !cell_index
|
234
|
+
raise Dullard::Error, 'Invalid spreadsheet XML.'
|
235
|
+
end
|
236
|
+
|
237
|
+
if node_type != 's' && node_type != 'b'
|
196
238
|
cell_format_index = node.attributes['s'].to_i
|
197
239
|
cell_type = @workbook.format2type(@workbook.attribute2format(cell_format_index))
|
198
240
|
end
|
199
241
|
|
200
|
-
|
201
|
-
|
202
|
-
rcolumn.delete!("0-9")
|
203
|
-
while column < self.class.column_names.size and rcolumn != self.class.column_names[column]
|
204
|
-
row << nil
|
205
|
-
column += 1
|
206
|
-
end
|
207
|
-
end
|
208
|
-
shared = (node.attribute("t") == "s")
|
209
|
-
column += 1
|
242
|
+
column = cell_index.delete('0-9')
|
243
|
+
shared = (node_type == 's')
|
210
244
|
next
|
211
245
|
end
|
212
246
|
when Nokogiri::XML::Reader::TYPE_END_ELEMENT
|
213
|
-
if node.name == "row"
|
214
|
-
y <<
|
215
|
-
next
|
247
|
+
if node.name == 'row'
|
248
|
+
y << process_row(cell_map)
|
216
249
|
end
|
250
|
+
next
|
217
251
|
end
|
218
|
-
value = node.value
|
219
252
|
|
220
|
-
if value
|
253
|
+
if node.value
|
254
|
+
value = (shared ? string_lookup(value.to_i) : value)
|
221
255
|
case cell_type
|
222
256
|
when :datetime
|
223
257
|
when :time
|
224
258
|
when :date
|
225
|
-
value = (
|
226
|
-
when :percentage # ? TODO
|
259
|
+
value = (Dullard::OOXMLEpoch + node.value.to_f)
|
227
260
|
when :float
|
228
|
-
value = value.to_f
|
261
|
+
value = node.value.to_f
|
229
262
|
else
|
230
263
|
# leave as string
|
231
264
|
end
|
232
265
|
cell_type = nil
|
233
|
-
|
234
|
-
row << (shared ? string_lookup(value.to_i) : value)
|
266
|
+
cell_map[column] = value
|
235
267
|
end
|
236
268
|
end
|
237
269
|
end
|
238
270
|
end
|
239
271
|
|
272
|
+
def process_row(cell_map)
|
273
|
+
max = cell_map.keys.map {|c| self.class.column_name_to_index c }.max
|
274
|
+
row = []
|
275
|
+
self.class.column_names[0..max].each do |col|
|
276
|
+
if self.class.column_name_to_index(col) > max
|
277
|
+
break
|
278
|
+
else
|
279
|
+
row << cell_map[col]
|
280
|
+
end
|
281
|
+
end
|
282
|
+
row
|
283
|
+
end
|
284
|
+
|
285
|
+
|
286
|
+
|
240
287
|
# Returns A to ZZZ.
|
241
288
|
def self.column_names
|
242
289
|
if @column_names
|
@@ -252,6 +299,16 @@ class Dullard::Sheet
|
|
252
299
|
end
|
253
300
|
end
|
254
301
|
|
302
|
+
def self.column_name_to_index(name)
|
303
|
+
if not @column_names_to_indices
|
304
|
+
@column_names_to_indices = {}
|
305
|
+
self.column_names.each_with_index do |name, i|
|
306
|
+
@column_names_to_indices[name] = i
|
307
|
+
end
|
308
|
+
end
|
309
|
+
@column_names_to_indices[name]
|
310
|
+
end
|
311
|
+
|
255
312
|
def row_count
|
256
313
|
if defined? @row_count
|
257
314
|
@row_count
|
@@ -260,11 +317,11 @@ class Dullard::Sheet
|
|
260
317
|
Nokogiri::XML::Reader(@file).each do |node|
|
261
318
|
if node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
|
262
319
|
case node.name
|
263
|
-
when "dimension"
|
320
|
+
when 'dimension'
|
264
321
|
if ref = node.attributes["ref"]
|
265
322
|
break @row_count = ref.scan(/\d+$/).first.to_i
|
266
323
|
end
|
267
|
-
when "sheetData"
|
324
|
+
when 'sheetData'
|
268
325
|
break @row_count = nil
|
269
326
|
end
|
270
327
|
end
|
data/lib/dullard/version.rb
CHANGED
data/spec/dullard_spec.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'dullard'
|
2
2
|
|
3
|
-
describe "dullard," do
|
3
|
+
describe "test.xlsx," do
|
4
4
|
before(:each) do
|
5
5
|
@file = File.open(File.expand_path("../test.xlsx", __FILE__))
|
6
6
|
end
|
@@ -22,7 +22,6 @@ describe "dullard," do
|
|
22
22
|
rows = @xlsx.sheets[0].rows
|
23
23
|
rows.next.count.should == 300
|
24
24
|
rows.next.count.should == 9
|
25
|
-
rows.next.count.should == 1
|
26
25
|
end
|
27
26
|
|
28
27
|
it "reads the right number of rows" do
|
@@ -71,3 +70,47 @@ describe "dullard," do
|
|
71
70
|
end
|
72
71
|
end
|
73
72
|
end
|
73
|
+
|
74
|
+
describe "test2.xlsx" do
|
75
|
+
before(:each) do
|
76
|
+
@file = File.open(File.expand_path("../test2.xlsx", __FILE__))
|
77
|
+
end
|
78
|
+
|
79
|
+
it "should not skip nils" do
|
80
|
+
rows = Dullard::Workbook.new(@file).sheets[0].rows.to_a
|
81
|
+
rows.should == [
|
82
|
+
[1],
|
83
|
+
[nil, 2],
|
84
|
+
[nil, nil, 3]
|
85
|
+
]
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
describe "error handling" do
|
90
|
+
it "should raise an error when a cell is missing r attr" do
|
91
|
+
@file = File.expand_path("../error_missing_r.xlsx", __FILE__)
|
92
|
+
book = Dullard::Workbook.new(@file)
|
93
|
+
sheet = book.sheets[0]
|
94
|
+
expect {
|
95
|
+
sheet.rows.to_a
|
96
|
+
}.to raise_error(Dullard::Error)
|
97
|
+
end
|
98
|
+
|
99
|
+
it "should succeed when styles are missing" do
|
100
|
+
file = File.expand_path("../error_missing_metadata.xlsx", __FILE__)
|
101
|
+
book = Dullard::Workbook.new(file)
|
102
|
+
sheet = book.sheets[0]
|
103
|
+
expect {
|
104
|
+
sheet.rows.to_a
|
105
|
+
}.not_to raise_error
|
106
|
+
end
|
107
|
+
|
108
|
+
it "should raise an error with invalid shared string index" do
|
109
|
+
file = File.expand_path("../error_missing_ss.xlsx", __FILE__)
|
110
|
+
book = Dullard::Workbook.new(file)
|
111
|
+
sheet = book.sheets[0]
|
112
|
+
expect {
|
113
|
+
sheet.rows.to_a
|
114
|
+
}.to raise_error(Dullard::Error)
|
115
|
+
end
|
116
|
+
end
|
Binary file
|
Binary file
|
Binary file
|
metadata
CHANGED
@@ -1,69 +1,69 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dullard
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ted Kaplan
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2015-01-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '2.14'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - ~>
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '2.14'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - ~>
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: '10.1'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - ~>
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '10.1'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: nokogiri
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - ~>
|
45
|
+
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
47
|
version: '1.6'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - ~>
|
52
|
+
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '1.6'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: rubyzip
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - ~>
|
59
|
+
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
61
|
version: '1.1'
|
62
62
|
type: :runtime
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - ~>
|
66
|
+
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '1.1'
|
69
69
|
description:
|
@@ -73,7 +73,7 @@ executables: []
|
|
73
73
|
extensions: []
|
74
74
|
extra_rdoc_files: []
|
75
75
|
files:
|
76
|
-
- .gitignore
|
76
|
+
- ".gitignore"
|
77
77
|
- Gemfile
|
78
78
|
- LICENSE
|
79
79
|
- README.md
|
@@ -83,6 +83,9 @@ files:
|
|
83
83
|
- lib/dullard/reader.rb
|
84
84
|
- lib/dullard/version.rb
|
85
85
|
- spec/dullard_spec.rb
|
86
|
+
- spec/error_missing_metadata.xlsx
|
87
|
+
- spec/error_missing_r.xlsx
|
88
|
+
- spec/error_missing_ss.xlsx
|
86
89
|
- spec/test.xlsx
|
87
90
|
homepage: http://github.com/thirtyseven/dullard
|
88
91
|
licenses:
|
@@ -94,20 +97,23 @@ require_paths:
|
|
94
97
|
- lib
|
95
98
|
required_ruby_version: !ruby/object:Gem::Requirement
|
96
99
|
requirements:
|
97
|
-
- -
|
100
|
+
- - ">="
|
98
101
|
- !ruby/object:Gem::Version
|
99
102
|
version: '0'
|
100
103
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
101
104
|
requirements:
|
102
|
-
- -
|
105
|
+
- - ">="
|
103
106
|
- !ruby/object:Gem::Version
|
104
107
|
version: '0'
|
105
108
|
requirements: []
|
106
109
|
rubyforge_project:
|
107
|
-
rubygems_version: 2.
|
110
|
+
rubygems_version: 2.4.3
|
108
111
|
signing_key:
|
109
112
|
specification_version: 4
|
110
113
|
summary: A fast XLSX parser using Nokogiri
|
111
114
|
test_files:
|
112
115
|
- spec/dullard_spec.rb
|
116
|
+
- spec/error_missing_metadata.xlsx
|
117
|
+
- spec/error_missing_r.xlsx
|
118
|
+
- spec/error_missing_ss.xlsx
|
113
119
|
- spec/test.xlsx
|