dullard 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9d02ed091d258d5d690ba9e3ba350c392488664e
4
- data.tar.gz: 9f61befa4e3003f7e469a26ff7c4412ffa4d6e55
3
+ metadata.gz: e3a941a2433fc527bc868fd41ef33ab4af46acc8
4
+ data.tar.gz: 6c0a1f95069ac0c0751e1452b2e6e3464872bd46
5
5
  SHA512:
6
- metadata.gz: 87d7d94d530d337ba92623146e5d0a115e44b04d15b1ee91fcf4c49bc76e90ace74bbf29f6e707ad3853d3e314e000cb4a3ec96a9b256576755adaeacce8022c
7
- data.tar.gz: 2d10be78032321162cc1e2416771946c02cecea129f2959f12ec9a3bc49e2e9d68563ac18b9896159fef1a097d1d90fa5bf6ac7698985a2ec1fc3afcf8248297
6
+ metadata.gz: 1ac8f461122aaa0fb5d90a3702fac5cdd2524c21e7b78ddfc02355585ba3e0288971e660c4664a78d92e7ca2c0c9afe73c41ba8de71db56ddce6d41d8055129e
7
+ data.tar.gz: a6a2774cb28d125ab7e5b59026395cc671842e450c36c2cef0cafd98744bde274c6b662efa8764e1e0343eae8367ac8206eae0f4c7e64a315ad54ce83dacc786
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2013 Edward Kaplan
1
+ Copyright (c) 2015 Edward Kaplan
2
2
 
3
3
  MIT License
4
4
 
data/README.md CHANGED
@@ -1,3 +1,5 @@
1
+ # dullard
2
+
1
3
  Super simple, super fast stream-based XLSX parsing. Suitable for very large
2
4
  files.
3
5
 
@@ -7,5 +9,11 @@ Requires Ruby 2.0.
7
9
 
8
10
  workbook = Dullard::Workbook.new "file.xlsx"
9
11
  workbook.sheets[0].rows.each do |row|
10
- puts row # => ["a","b","c",...]
12
+ p row # => ["a","b","c", 0.3, #<DateTime: -4712-01-01....>, ...]
11
13
  end
14
+
15
+ ## Current limitations
16
+ * Limited validation and error handling.
17
+ * Formatted cells are read, but formatting is not accessible.
18
+ * May be buggy. Pull requests welcome!
19
+ * Rows that end with empty cells may be truncated.
data/lib/dullard/reader.rb CHANGED
@@ -1,7 +1,12 @@
1
1
  require 'zip/filesystem'
2
2
  require 'nokogiri'
3
3
 
4
- module Dullard; end
4
+ module Dullard
5
+ class Error < StandardError; end
6
+ OOXMLEpoch = DateTime.new(1899,12,30)
7
+ SharedStringPath = 'xl/sharedStrings.xml'
8
+ StylesPath = 'xl/styles.xml'
9
+ end
5
10
 
6
11
  class Dullard::Workbook
7
12
  # Code borrowed from Roo (https://github.com/hmcgowan/roo/blob/master/lib/roo/excelx.rb)
@@ -12,8 +17,8 @@ class Dullard::Workbook
12
17
  '0.00' => :float,
13
18
  '#,##0' => :float,
14
19
  '#,##0.00' => :float,
15
- '0%' => :percentage,
16
- '0.00%' => :percentage,
20
+ '0%' => :float,
21
+ '0.00%' => :float,
17
22
  '0.00E+00' => :float,
18
23
  '# ?/?' => :float, #??? TODO:
19
24
  '# ??/??' => :float, #??? TODO:
@@ -78,24 +83,42 @@ class Dullard::Workbook
78
83
 
79
84
  def initialize(file, user_defined_formats = {})
80
85
  @file = file
81
- @zipfs = Zip::File.open(@file)
86
+ begin
87
+ @zipfs = Zip::File.open(@file)
88
+ rescue Zip::Error => e
89
+ raise Dullard::Error, e.message
90
+ end
82
91
  @user_defined_formats = user_defined_formats
83
92
  read_styles
84
93
  end
85
94
 
86
95
  def sheets
87
- workbook = Nokogiri::XML::Document.parse(@zipfs.file.open("xl/workbook.xml"))
88
- @sheets = workbook.css("sheet").each_with_index.map {|n,i| Dullard::Sheet.new(self, n.attr("name"), n.attr("sheetId"), i+1) }
96
+ begin
97
+ workbook = Nokogiri::XML::Document.parse(@zipfs.file.open('xl/workbook.xml'))
98
+ rescue Zip::Error
99
+ raise Dullard::Error, 'Invalid file, could not open xl/workbook.xml'
100
+ end
101
+ @sheets = workbook.css('sheet').each_with_index.map do |n, i|
102
+ Dullard::Sheet.new(self, n.attr('name'), n.attr('sheetId'), i+1)
103
+ end
89
104
  end
90
105
 
91
106
  def string_table
92
- @string_tabe ||= read_string_table
107
+ @string_table ||= read_string_table
93
108
  end
94
109
 
95
110
  def read_string_table
96
- @string_table = []
111
+ return [] unless @zipfs.file.exist? Dullard::SharedStringPath
112
+
113
+ begin
114
+ shared_string = @zipfs.file.open(Dullard::SharedStringPath)
115
+ rescue Zip::Error
116
+ raise Dullard::Error, 'Invalid file, could not open shared string file.'
117
+ end
118
+
97
119
  entry = ''
98
- Nokogiri::XML::Reader(@zipfs.file.open("xl/sharedStrings.xml")).each do |node|
120
+ @string_table = []
121
+ Nokogiri::XML::Reader(shared_string).each do |node|
99
122
  if node.name == "si" and node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
100
123
  entry = ''
101
124
  elsif node.name == "si" and node.node_type == Nokogiri::XML::Reader::TYPE_END_ELEMENT
@@ -108,20 +131,29 @@ class Dullard::Workbook
108
131
  end
109
132
 
110
133
  def read_styles
111
- doc = Nokogiri::XML(@zipfs.file.open("xl/styles.xml"))
112
-
113
134
  @num_formats = {}
114
135
  @cell_xfs = []
136
+ return unless @zipfs.file.exist? Dullard::StylesPath
137
+
138
+ begin
139
+ doc = Nokogiri::XML(@zipfs.file.open(Dullard::StylesPath))
140
+ rescue Zip::Error
141
+ raise Dullard::Error, 'Invalid file, could not open styles'
142
+ end
115
143
 
116
144
  doc.css('/styleSheet/numFmts/numFmt').each do |numFmt|
117
- numFmtId = numFmt.attributes['numFmtId'].value.to_i
118
- formatCode = numFmt.attributes['formatCode'].value
119
- @num_formats[numFmtId] = formatCode
145
+ if numFmt.attributes['numFmtId'] && numFmt.attributes['formatCode']
146
+ numFmtId = numFmt.attributes['numFmtId'].value.to_i
147
+ formatCode = numFmt.attributes['formatCode'].value
148
+ @num_formats[numFmtId] = formatCode
149
+ end
120
150
  end
121
151
 
122
152
  doc.css('/styleSheet/cellXfs/xf').each do |xf|
123
- numFmtId = xf.attributes['numFmtId'].value.to_i
124
- @cell_xfs << numFmtId
153
+ if xf.attributes['numFmtId']
154
+ numFmtId = xf.attributes['numFmtId'].value.to_i
155
+ @cell_xfs << numFmtId
156
+ end
125
157
  end
126
158
  end
127
159
 
@@ -168,11 +200,15 @@ class Dullard::Sheet
168
200
  @name = name
169
201
  @id = id
170
202
  @index = index
171
- @file = @workbook.zipfs.file.open(path) if @workbook.zipfs.file.exist?(path)
203
+ begin
204
+ @file = @workbook.zipfs.file.open(path) if @workbook.zipfs.file.exist?(path)
205
+ rescue Zip::Error => e
206
+ raise Dullard::Error, "Couldn't open sheet #{index}: #{e.message}"
207
+ end
172
208
  end
173
209
 
174
210
  def string_lookup(i)
175
- @workbook.string_table[i]
211
+ @workbook.string_table[i] || (raise Dullard::Error, 'File invalid, invalid string table.')
176
212
  end
177
213
 
178
214
  def rows
@@ -181,6 +217,7 @@ class Dullard::Sheet
181
217
  @file.rewind
182
218
  shared = false
183
219
  row = nil
220
+ cell_map = nil # Map of column letter to cell value for a row
184
221
  column = nil
185
222
  cell_type = nil
186
223
  Nokogiri::XML::Reader(@file).each do |node|
@@ -188,55 +225,65 @@ class Dullard::Sheet
188
225
  when Nokogiri::XML::Reader::TYPE_ELEMENT
189
226
  case node.name
190
227
  when "row"
191
- row = []
192
- column = 0
228
+ cell_map = {}
193
229
  next
194
- when "c"
195
- if node.attributes['t'] != 's' && node.attributes['t'] != 'b'
230
+ when 'c'
231
+ node_type = node.attributes['t']
232
+ cell_index = node.attributes['r']
233
+ if !cell_index
234
+ raise Dullard::Error, 'Invalid spreadsheet XML.'
235
+ end
236
+
237
+ if node_type != 's' && node_type != 'b'
196
238
  cell_format_index = node.attributes['s'].to_i
197
239
  cell_type = @workbook.format2type(@workbook.attribute2format(cell_format_index))
198
240
  end
199
241
 
200
- rcolumn = node.attributes["r"]
201
- if rcolumn
202
- rcolumn.delete!("0-9")
203
- while column < self.class.column_names.size and rcolumn != self.class.column_names[column]
204
- row << nil
205
- column += 1
206
- end
207
- end
208
- shared = (node.attribute("t") == "s")
209
- column += 1
242
+ column = cell_index.delete('0-9')
243
+ shared = (node_type == 's')
210
244
  next
211
245
  end
212
246
  when Nokogiri::XML::Reader::TYPE_END_ELEMENT
213
- if node.name == "row"
214
- y << row
215
- next
247
+ if node.name == 'row'
248
+ y << process_row(cell_map)
216
249
  end
250
+ next
217
251
  end
218
- value = node.value
219
252
 
220
- if value
253
+ if node.value
254
+ value = (shared ? string_lookup(value.to_i) : value)
221
255
  case cell_type
222
256
  when :datetime
223
257
  when :time
224
258
  when :date
225
- value = (DateTime.new(1899,12,30) + value.to_f)
226
- when :percentage # ? TODO
259
+ value = (Dullard::OOXMLEpoch + node.value.to_f)
227
260
  when :float
228
- value = value.to_f
261
+ value = node.value.to_f
229
262
  else
230
263
  # leave as string
231
264
  end
232
265
  cell_type = nil
233
-
234
- row << (shared ? string_lookup(value.to_i) : value)
266
+ cell_map[column] = value
235
267
  end
236
268
  end
237
269
  end
238
270
  end
239
271
 
272
+ def process_row(cell_map)
273
+ max = cell_map.keys.map {|c| self.class.column_name_to_index c }.max
274
+ row = []
275
+ self.class.column_names[0..max].each do |col|
276
+ if self.class.column_name_to_index(col) > max
277
+ break
278
+ else
279
+ row << cell_map[col]
280
+ end
281
+ end
282
+ row
283
+ end
284
+
285
+
286
+
240
287
  # Returns A to ZZZ.
241
288
  def self.column_names
242
289
  if @column_names
@@ -252,6 +299,16 @@ class Dullard::Sheet
252
299
  end
253
300
  end
254
301
 
302
+ def self.column_name_to_index(name)
303
+ if not @column_names_to_indices
304
+ @column_names_to_indices = {}
305
+ self.column_names.each_with_index do |name, i|
306
+ @column_names_to_indices[name] = i
307
+ end
308
+ end
309
+ @column_names_to_indices[name]
310
+ end
311
+
255
312
  def row_count
256
313
  if defined? @row_count
257
314
  @row_count
@@ -260,11 +317,11 @@ class Dullard::Sheet
260
317
  Nokogiri::XML::Reader(@file).each do |node|
261
318
  if node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
262
319
  case node.name
263
- when "dimension"
320
+ when 'dimension'
264
321
  if ref = node.attributes["ref"]
265
322
  break @row_count = ref.scan(/\d+$/).first.to_i
266
323
  end
267
- when "sheetData"
324
+ when 'sheetData'
268
325
  break @row_count = nil
269
326
  end
270
327
  end
data/lib/dullard/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Dullard
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
data/spec/dullard_spec.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  require 'dullard'
2
2
 
3
- describe "dullard," do
3
+ describe "test.xlsx," do
4
4
  before(:each) do
5
5
  @file = File.open(File.expand_path("../test.xlsx", __FILE__))
6
6
  end
@@ -22,7 +22,6 @@ describe "dullard," do
22
22
  rows = @xlsx.sheets[0].rows
23
23
  rows.next.count.should == 300
24
24
  rows.next.count.should == 9
25
- rows.next.count.should == 1
26
25
  end
27
26
 
28
27
  it "reads the right number of rows" do
@@ -71,3 +70,47 @@ describe "dullard," do
71
70
  end
72
71
  end
73
72
  end
73
+
74
+ describe "test2.xlsx" do
75
+ before(:each) do
76
+ @file = File.open(File.expand_path("../test2.xlsx", __FILE__))
77
+ end
78
+
79
+ it "should not skip nils" do
80
+ rows = Dullard::Workbook.new(@file).sheets[0].rows.to_a
81
+ rows.should == [
82
+ [1],
83
+ [nil, 2],
84
+ [nil, nil, 3]
85
+ ]
86
+ end
87
+ end
88
+
89
+ describe "error handling" do
90
+ it "should raise an error when a cell is missing r attr" do
91
+ @file = File.expand_path("../error_missing_r.xlsx", __FILE__)
92
+ book = Dullard::Workbook.new(@file)
93
+ sheet = book.sheets[0]
94
+ expect {
95
+ sheet.rows.to_a
96
+ }.to raise_error(Dullard::Error)
97
+ end
98
+
99
+ it "should succeed when styles are missing" do
100
+ file = File.expand_path("../error_missing_metadata.xlsx", __FILE__)
101
+ book = Dullard::Workbook.new(file)
102
+ sheet = book.sheets[0]
103
+ expect {
104
+ sheet.rows.to_a
105
+ }.not_to raise_error
106
+ end
107
+
108
+ it "should raise an error with invalid shared string index" do
109
+ file = File.expand_path("../error_missing_ss.xlsx", __FILE__)
110
+ book = Dullard::Workbook.new(file)
111
+ sheet = book.sheets[0]
112
+ expect {
113
+ sheet.rows.to_a
114
+ }.to raise_error(Dullard::Error)
115
+ end
116
+ end
Binary file
metadata CHANGED
@@ -1,69 +1,69 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dullard
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ted Kaplan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-03-30 00:00:00.000000000 Z
11
+ date: 2015-01-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '2.14'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ~>
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '2.14'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - ~>
31
+ - - "~>"
32
32
  - !ruby/object:Gem::Version
33
33
  version: '10.1'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ~>
38
+ - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.1'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: nokogiri
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ~>
45
+ - - "~>"
46
46
  - !ruby/object:Gem::Version
47
47
  version: '1.6'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ~>
52
+ - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '1.6'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: rubyzip
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - ~>
59
+ - - "~>"
60
60
  - !ruby/object:Gem::Version
61
61
  version: '1.1'
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - ~>
66
+ - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '1.1'
69
69
  description:
@@ -73,7 +73,7 @@ executables: []
73
73
  extensions: []
74
74
  extra_rdoc_files: []
75
75
  files:
76
- - .gitignore
76
+ - ".gitignore"
77
77
  - Gemfile
78
78
  - LICENSE
79
79
  - README.md
@@ -83,6 +83,9 @@ files:
83
83
  - lib/dullard/reader.rb
84
84
  - lib/dullard/version.rb
85
85
  - spec/dullard_spec.rb
86
+ - spec/error_missing_metadata.xlsx
87
+ - spec/error_missing_r.xlsx
88
+ - spec/error_missing_ss.xlsx
86
89
  - spec/test.xlsx
87
90
  homepage: http://github.com/thirtyseven/dullard
88
91
  licenses:
@@ -94,20 +97,23 @@ require_paths:
94
97
  - lib
95
98
  required_ruby_version: !ruby/object:Gem::Requirement
96
99
  requirements:
97
- - - '>='
100
+ - - ">="
98
101
  - !ruby/object:Gem::Version
99
102
  version: '0'
100
103
  required_rubygems_version: !ruby/object:Gem::Requirement
101
104
  requirements:
102
- - - '>='
105
+ - - ">="
103
106
  - !ruby/object:Gem::Version
104
107
  version: '0'
105
108
  requirements: []
106
109
  rubyforge_project:
107
- rubygems_version: 2.0.5
110
+ rubygems_version: 2.4.3
108
111
  signing_key:
109
112
  specification_version: 4
110
113
  summary: A fast XLSX parser using Nokogiri
111
114
  test_files:
112
115
  - spec/dullard_spec.rb
116
+ - spec/error_missing_metadata.xlsx
117
+ - spec/error_missing_r.xlsx
118
+ - spec/error_missing_ss.xlsx
113
119
  - spec/test.xlsx