dullard 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +9 -1
- data/lib/dullard/reader.rb +101 -44
- data/lib/dullard/version.rb +1 -1
- data/spec/dullard_spec.rb +45 -2
- data/spec/error_missing_metadata.xlsx +0 -0
- data/spec/error_missing_r.xlsx +0 -0
- data/spec/error_missing_ss.xlsx +0 -0
- metadata +20 -14
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e3a941a2433fc527bc868fd41ef33ab4af46acc8
|
4
|
+
data.tar.gz: 6c0a1f95069ac0c0751e1452b2e6e3464872bd46
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1ac8f461122aaa0fb5d90a3702fac5cdd2524c21e7b78ddfc02355585ba3e0288971e660c4664a78d92e7ca2c0c9afe73c41ba8de71db56ddce6d41d8055129e
|
7
|
+
data.tar.gz: a6a2774cb28d125ab7e5b59026395cc671842e450c36c2cef0cafd98744bde274c6b662efa8764e1e0343eae8367ac8206eae0f4c7e64a315ad54ce83dacc786
|
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# dullard
|
2
|
+
|
1
3
|
Super simple, super fast stream-based XLSX parsing. Suitable for very large
|
2
4
|
files.
|
3
5
|
|
@@ -7,5 +9,11 @@ Requires Ruby 2.0.
|
|
7
9
|
|
8
10
|
workbook = Dullard::Workbook.new "file.xlsx"
|
9
11
|
workbook.sheets[0].rows.each do |row|
|
10
|
-
|
12
|
+
p row # => ["a","b","c", 0.3, #<DateTime: -4712-01-01....>, ...]
|
11
13
|
end
|
14
|
+
|
15
|
+
## Current limitations
|
16
|
+
* Limited validation and error handling.
|
17
|
+
* Formatted cells are read, but formatting is not accessible.
|
18
|
+
* May be buggy. Pull requests welcome!
|
19
|
+
* Rows that end with empty cells may be truncated.
|
data/lib/dullard/reader.rb
CHANGED
@@ -1,7 +1,12 @@
|
|
1
1
|
require 'zip/filesystem'
|
2
2
|
require 'nokogiri'
|
3
3
|
|
4
|
-
module Dullard
|
4
|
+
module Dullard
|
5
|
+
class Error < StandardError; end
|
6
|
+
OOXMLEpoch = DateTime.new(1899,12,30)
|
7
|
+
SharedStringPath = 'xl/sharedStrings.xml'
|
8
|
+
StylesPath = 'xl/styles.xml'
|
9
|
+
end
|
5
10
|
|
6
11
|
class Dullard::Workbook
|
7
12
|
# Code borrowed from Roo (https://github.com/hmcgowan/roo/blob/master/lib/roo/excelx.rb)
|
@@ -12,8 +17,8 @@ class Dullard::Workbook
|
|
12
17
|
'0.00' => :float,
|
13
18
|
'#,##0' => :float,
|
14
19
|
'#,##0.00' => :float,
|
15
|
-
'0%' => :
|
16
|
-
'0.00%' => :
|
20
|
+
'0%' => :float,
|
21
|
+
'0.00%' => :float,
|
17
22
|
'0.00E+00' => :float,
|
18
23
|
'# ?/?' => :float, #??? TODO:
|
19
24
|
'# ??/??' => :float, #??? TODO:
|
@@ -78,24 +83,42 @@ class Dullard::Workbook
|
|
78
83
|
|
79
84
|
def initialize(file, user_defined_formats = {})
|
80
85
|
@file = file
|
81
|
-
|
86
|
+
begin
|
87
|
+
@zipfs = Zip::File.open(@file)
|
88
|
+
rescue Zip::Error => e
|
89
|
+
raise Dullard::Error, e.message
|
90
|
+
end
|
82
91
|
@user_defined_formats = user_defined_formats
|
83
92
|
read_styles
|
84
93
|
end
|
85
94
|
|
86
95
|
def sheets
|
87
|
-
|
88
|
-
|
96
|
+
begin
|
97
|
+
workbook = Nokogiri::XML::Document.parse(@zipfs.file.open('xl/workbook.xml'))
|
98
|
+
rescue Zip::Error
|
99
|
+
raise Dullard::Error, 'Invalid file, could not open xl/workbook.xml'
|
100
|
+
end
|
101
|
+
@sheets = workbook.css('sheet').each_with_index.map do |n, i|
|
102
|
+
Dullard::Sheet.new(self, n.attr('name'), n.attr('sheetId'), i+1)
|
103
|
+
end
|
89
104
|
end
|
90
105
|
|
91
106
|
def string_table
|
92
|
-
@
|
107
|
+
@string_table ||= read_string_table
|
93
108
|
end
|
94
109
|
|
95
110
|
def read_string_table
|
96
|
-
|
111
|
+
return [] unless @zipfs.file.exist? Dullard::SharedStringPath
|
112
|
+
|
113
|
+
begin
|
114
|
+
shared_string = @zipfs.file.open(Dullard::SharedStringPath)
|
115
|
+
rescue Zip::Error
|
116
|
+
raise Dullard::Error, 'Invalid file, could not open shared string file.'
|
117
|
+
end
|
118
|
+
|
97
119
|
entry = ''
|
98
|
-
|
120
|
+
@string_table = []
|
121
|
+
Nokogiri::XML::Reader(shared_string).each do |node|
|
99
122
|
if node.name == "si" and node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
|
100
123
|
entry = ''
|
101
124
|
elsif node.name == "si" and node.node_type == Nokogiri::XML::Reader::TYPE_END_ELEMENT
|
@@ -108,20 +131,29 @@ class Dullard::Workbook
|
|
108
131
|
end
|
109
132
|
|
110
133
|
def read_styles
|
111
|
-
doc = Nokogiri::XML(@zipfs.file.open("xl/styles.xml"))
|
112
|
-
|
113
134
|
@num_formats = {}
|
114
135
|
@cell_xfs = []
|
136
|
+
return unless @zipfs.file.exist? Dullard::StylesPath
|
137
|
+
|
138
|
+
begin
|
139
|
+
doc = Nokogiri::XML(@zipfs.file.open(Dullard::StylesPath))
|
140
|
+
rescue Zip::Error
|
141
|
+
raise Dullard::Error, 'Invalid file, could not open styles'
|
142
|
+
end
|
115
143
|
|
116
144
|
doc.css('/styleSheet/numFmts/numFmt').each do |numFmt|
|
117
|
-
numFmtId
|
118
|
-
|
119
|
-
|
145
|
+
if numFmt.attributes['numFmtId'] && numFmt.attributes['formatCode']
|
146
|
+
numFmtId = numFmt.attributes['numFmtId'].value.to_i
|
147
|
+
formatCode = numFmt.attributes['formatCode'].value
|
148
|
+
@num_formats[numFmtId] = formatCode
|
149
|
+
end
|
120
150
|
end
|
121
151
|
|
122
152
|
doc.css('/styleSheet/cellXfs/xf').each do |xf|
|
123
|
-
|
124
|
-
|
153
|
+
if xf.attributes['numFmtId']
|
154
|
+
numFmtId = xf.attributes['numFmtId'].value.to_i
|
155
|
+
@cell_xfs << numFmtId
|
156
|
+
end
|
125
157
|
end
|
126
158
|
end
|
127
159
|
|
@@ -168,11 +200,15 @@ class Dullard::Sheet
|
|
168
200
|
@name = name
|
169
201
|
@id = id
|
170
202
|
@index = index
|
171
|
-
|
203
|
+
begin
|
204
|
+
@file = @workbook.zipfs.file.open(path) if @workbook.zipfs.file.exist?(path)
|
205
|
+
rescue Zip::Error => e
|
206
|
+
raise Dullard::Error, "Couldn't open sheet #{index}: #{e.message}"
|
207
|
+
end
|
172
208
|
end
|
173
209
|
|
174
210
|
def string_lookup(i)
|
175
|
-
@workbook.string_table[i]
|
211
|
+
@workbook.string_table[i] || (raise Dullard::Error, 'File invalid, invalid string table.')
|
176
212
|
end
|
177
213
|
|
178
214
|
def rows
|
@@ -181,6 +217,7 @@ class Dullard::Sheet
|
|
181
217
|
@file.rewind
|
182
218
|
shared = false
|
183
219
|
row = nil
|
220
|
+
cell_map = nil # Map of column letter to cell value for a row
|
184
221
|
column = nil
|
185
222
|
cell_type = nil
|
186
223
|
Nokogiri::XML::Reader(@file).each do |node|
|
@@ -188,55 +225,65 @@ class Dullard::Sheet
|
|
188
225
|
when Nokogiri::XML::Reader::TYPE_ELEMENT
|
189
226
|
case node.name
|
190
227
|
when "row"
|
191
|
-
|
192
|
-
column = 0
|
228
|
+
cell_map = {}
|
193
229
|
next
|
194
|
-
when
|
195
|
-
|
230
|
+
when 'c'
|
231
|
+
node_type = node.attributes['t']
|
232
|
+
cell_index = node.attributes['r']
|
233
|
+
if !cell_index
|
234
|
+
raise Dullard::Error, 'Invalid spreadsheet XML.'
|
235
|
+
end
|
236
|
+
|
237
|
+
if node_type != 's' && node_type != 'b'
|
196
238
|
cell_format_index = node.attributes['s'].to_i
|
197
239
|
cell_type = @workbook.format2type(@workbook.attribute2format(cell_format_index))
|
198
240
|
end
|
199
241
|
|
200
|
-
|
201
|
-
|
202
|
-
rcolumn.delete!("0-9")
|
203
|
-
while column < self.class.column_names.size and rcolumn != self.class.column_names[column]
|
204
|
-
row << nil
|
205
|
-
column += 1
|
206
|
-
end
|
207
|
-
end
|
208
|
-
shared = (node.attribute("t") == "s")
|
209
|
-
column += 1
|
242
|
+
column = cell_index.delete('0-9')
|
243
|
+
shared = (node_type == 's')
|
210
244
|
next
|
211
245
|
end
|
212
246
|
when Nokogiri::XML::Reader::TYPE_END_ELEMENT
|
213
|
-
if node.name ==
|
214
|
-
y <<
|
215
|
-
next
|
247
|
+
if node.name == 'row'
|
248
|
+
y << process_row(cell_map)
|
216
249
|
end
|
250
|
+
next
|
217
251
|
end
|
218
|
-
value = node.value
|
219
252
|
|
220
|
-
if value
|
253
|
+
if node.value
|
254
|
+
value = (shared ? string_lookup(value.to_i) : value)
|
221
255
|
case cell_type
|
222
256
|
when :datetime
|
223
257
|
when :time
|
224
258
|
when :date
|
225
|
-
value = (
|
226
|
-
when :percentage # ? TODO
|
259
|
+
value = (Dullard::OOXMLEpoch + node.value.to_f)
|
227
260
|
when :float
|
228
|
-
value = value.to_f
|
261
|
+
value = node.value.to_f
|
229
262
|
else
|
230
263
|
# leave as string
|
231
264
|
end
|
232
265
|
cell_type = nil
|
233
|
-
|
234
|
-
row << (shared ? string_lookup(value.to_i) : value)
|
266
|
+
cell_map[column] = value
|
235
267
|
end
|
236
268
|
end
|
237
269
|
end
|
238
270
|
end
|
239
271
|
|
272
|
+
def process_row(cell_map)
|
273
|
+
max = cell_map.keys.map {|c| self.class.column_name_to_index c }.max
|
274
|
+
row = []
|
275
|
+
self.class.column_names[0..max].each do |col|
|
276
|
+
if self.class.column_name_to_index(col) > max
|
277
|
+
break
|
278
|
+
else
|
279
|
+
row << cell_map[col]
|
280
|
+
end
|
281
|
+
end
|
282
|
+
row
|
283
|
+
end
|
284
|
+
|
285
|
+
|
286
|
+
|
240
287
|
# Returns A to ZZZ.
|
241
288
|
def self.column_names
|
242
289
|
if @column_names
|
@@ -252,6 +299,16 @@ class Dullard::Sheet
|
|
252
299
|
end
|
253
300
|
end
|
254
301
|
|
302
|
+
def self.column_name_to_index(name)
|
303
|
+
if not @column_names_to_indices
|
304
|
+
@column_names_to_indices = {}
|
305
|
+
self.column_names.each_with_index do |name, i|
|
306
|
+
@column_names_to_indices[name] = i
|
307
|
+
end
|
308
|
+
end
|
309
|
+
@column_names_to_indices[name]
|
310
|
+
end
|
311
|
+
|
255
312
|
def row_count
|
256
313
|
if defined? @row_count
|
257
314
|
@row_count
|
@@ -260,11 +317,11 @@ class Dullard::Sheet
|
|
260
317
|
Nokogiri::XML::Reader(@file).each do |node|
|
261
318
|
if node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
|
262
319
|
case node.name
|
263
|
-
when
|
320
|
+
when 'dimension'
|
264
321
|
if ref = node.attributes["ref"]
|
265
322
|
break @row_count = ref.scan(/\d+$/).first.to_i
|
266
323
|
end
|
267
|
-
when
|
324
|
+
when 'sheetData'
|
268
325
|
break @row_count = nil
|
269
326
|
end
|
270
327
|
end
|
data/lib/dullard/version.rb
CHANGED
data/spec/dullard_spec.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'dullard'
|
2
2
|
|
3
|
-
describe "
|
3
|
+
describe "test.xlsx," do
|
4
4
|
before(:each) do
|
5
5
|
@file = File.open(File.expand_path("../test.xlsx", __FILE__))
|
6
6
|
end
|
@@ -22,7 +22,6 @@ describe "dullard," do
|
|
22
22
|
rows = @xlsx.sheets[0].rows
|
23
23
|
rows.next.count.should == 300
|
24
24
|
rows.next.count.should == 9
|
25
|
-
rows.next.count.should == 1
|
26
25
|
end
|
27
26
|
|
28
27
|
it "reads the right number of rows" do
|
@@ -71,3 +70,47 @@ describe "dullard," do
|
|
71
70
|
end
|
72
71
|
end
|
73
72
|
end
|
73
|
+
|
74
|
+
describe "test2.xlsx" do
|
75
|
+
before(:each) do
|
76
|
+
@file = File.open(File.expand_path("../test2.xlsx", __FILE__))
|
77
|
+
end
|
78
|
+
|
79
|
+
it "should not skip nils" do
|
80
|
+
rows = Dullard::Workbook.new(@file).sheets[0].rows.to_a
|
81
|
+
rows.should == [
|
82
|
+
[1],
|
83
|
+
[nil, 2],
|
84
|
+
[nil, nil, 3]
|
85
|
+
]
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
describe "error handling" do
|
90
|
+
it "should raise an error when a cell is missing r attr" do
|
91
|
+
@file = File.expand_path("../error_missing_r.xlsx", __FILE__)
|
92
|
+
book = Dullard::Workbook.new(@file)
|
93
|
+
sheet = book.sheets[0]
|
94
|
+
expect {
|
95
|
+
sheet.rows.to_a
|
96
|
+
}.to raise_error(Dullard::Error)
|
97
|
+
end
|
98
|
+
|
99
|
+
it "should succeed when styles are missing" do
|
100
|
+
file = File.expand_path("../error_missing_metadata.xlsx", __FILE__)
|
101
|
+
book = Dullard::Workbook.new(file)
|
102
|
+
sheet = book.sheets[0]
|
103
|
+
expect {
|
104
|
+
sheet.rows.to_a
|
105
|
+
}.not_to raise_error
|
106
|
+
end
|
107
|
+
|
108
|
+
it "should raise an error with invalid shared string index" do
|
109
|
+
file = File.expand_path("../error_missing_ss.xlsx", __FILE__)
|
110
|
+
book = Dullard::Workbook.new(file)
|
111
|
+
sheet = book.sheets[0]
|
112
|
+
expect {
|
113
|
+
sheet.rows.to_a
|
114
|
+
}.to raise_error(Dullard::Error)
|
115
|
+
end
|
116
|
+
end
|
Binary file
|
Binary file
|
Binary file
|
metadata
CHANGED
@@ -1,69 +1,69 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dullard
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ted Kaplan
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2015-01-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '2.14'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - ~>
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '2.14'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - ~>
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: '10.1'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - ~>
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '10.1'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: nokogiri
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - ~>
|
45
|
+
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
47
|
version: '1.6'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - ~>
|
52
|
+
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '1.6'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: rubyzip
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - ~>
|
59
|
+
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
61
|
version: '1.1'
|
62
62
|
type: :runtime
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - ~>
|
66
|
+
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '1.1'
|
69
69
|
description:
|
@@ -73,7 +73,7 @@ executables: []
|
|
73
73
|
extensions: []
|
74
74
|
extra_rdoc_files: []
|
75
75
|
files:
|
76
|
-
- .gitignore
|
76
|
+
- ".gitignore"
|
77
77
|
- Gemfile
|
78
78
|
- LICENSE
|
79
79
|
- README.md
|
@@ -83,6 +83,9 @@ files:
|
|
83
83
|
- lib/dullard/reader.rb
|
84
84
|
- lib/dullard/version.rb
|
85
85
|
- spec/dullard_spec.rb
|
86
|
+
- spec/error_missing_metadata.xlsx
|
87
|
+
- spec/error_missing_r.xlsx
|
88
|
+
- spec/error_missing_ss.xlsx
|
86
89
|
- spec/test.xlsx
|
87
90
|
homepage: http://github.com/thirtyseven/dullard
|
88
91
|
licenses:
|
@@ -94,20 +97,23 @@ require_paths:
|
|
94
97
|
- lib
|
95
98
|
required_ruby_version: !ruby/object:Gem::Requirement
|
96
99
|
requirements:
|
97
|
-
- -
|
100
|
+
- - ">="
|
98
101
|
- !ruby/object:Gem::Version
|
99
102
|
version: '0'
|
100
103
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
101
104
|
requirements:
|
102
|
-
- -
|
105
|
+
- - ">="
|
103
106
|
- !ruby/object:Gem::Version
|
104
107
|
version: '0'
|
105
108
|
requirements: []
|
106
109
|
rubyforge_project:
|
107
|
-
rubygems_version: 2.
|
110
|
+
rubygems_version: 2.4.3
|
108
111
|
signing_key:
|
109
112
|
specification_version: 4
|
110
113
|
summary: A fast XLSX parser using Nokogiri
|
111
114
|
test_files:
|
112
115
|
- spec/dullard_spec.rb
|
116
|
+
- spec/error_missing_metadata.xlsx
|
117
|
+
- spec/error_missing_r.xlsx
|
118
|
+
- spec/error_missing_ss.xlsx
|
113
119
|
- spec/test.xlsx
|