dullard 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9d02ed091d258d5d690ba9e3ba350c392488664e
4
- data.tar.gz: 9f61befa4e3003f7e469a26ff7c4412ffa4d6e55
3
+ metadata.gz: e3a941a2433fc527bc868fd41ef33ab4af46acc8
4
+ data.tar.gz: 6c0a1f95069ac0c0751e1452b2e6e3464872bd46
5
5
  SHA512:
6
- metadata.gz: 87d7d94d530d337ba92623146e5d0a115e44b04d15b1ee91fcf4c49bc76e90ace74bbf29f6e707ad3853d3e314e000cb4a3ec96a9b256576755adaeacce8022c
7
- data.tar.gz: 2d10be78032321162cc1e2416771946c02cecea129f2959f12ec9a3bc49e2e9d68563ac18b9896159fef1a097d1d90fa5bf6ac7698985a2ec1fc3afcf8248297
6
+ metadata.gz: 1ac8f461122aaa0fb5d90a3702fac5cdd2524c21e7b78ddfc02355585ba3e0288971e660c4664a78d92e7ca2c0c9afe73c41ba8de71db56ddce6d41d8055129e
7
+ data.tar.gz: a6a2774cb28d125ab7e5b59026395cc671842e450c36c2cef0cafd98744bde274c6b662efa8764e1e0343eae8367ac8206eae0f4c7e64a315ad54ce83dacc786
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2013 Edward Kaplan
1
+ Copyright (c) 2015 Edward Kaplan
2
2
 
3
3
  MIT License
4
4
 
data/README.md CHANGED
@@ -1,3 +1,5 @@
1
+ # dullard
2
+
1
3
  Super simple, super fast stream-based XLSX parsing. Suitable for very large
2
4
  files.
3
5
 
@@ -7,5 +9,11 @@ Requires Ruby 2.0.
7
9
 
8
10
  workbook = Dullard::Workbook.new "file.xlsx"
9
11
  workbook.sheets[0].rows.each do |row|
10
- puts row # => ["a","b","c",...]
12
+ p row # => ["a","b","c", 0.3, #<DateTime: -4712-01-01....>, ...]
11
13
  end
14
+
15
+ ## Current limitations
16
+ * Limited validation and error handling.
17
+ * Formatted cells are read, but formatting is not accessible.
18
+ * May be buggy. Pull requests welcome!
19
+ * Rows that end with empty cells may be truncated.
@@ -1,7 +1,12 @@
1
1
  require 'zip/filesystem'
2
2
  require 'nokogiri'
3
3
 
4
- module Dullard; end
4
+ module Dullard
5
+ class Error < StandardError; end
6
+ OOXMLEpoch = DateTime.new(1899,12,30)
7
+ SharedStringPath = 'xl/sharedStrings.xml'
8
+ StylesPath = 'xl/styles.xml'
9
+ end
5
10
 
6
11
  class Dullard::Workbook
7
12
  # Code borrowed from Roo (https://github.com/hmcgowan/roo/blob/master/lib/roo/excelx.rb)
@@ -12,8 +17,8 @@ class Dullard::Workbook
12
17
  '0.00' => :float,
13
18
  '#,##0' => :float,
14
19
  '#,##0.00' => :float,
15
- '0%' => :percentage,
16
- '0.00%' => :percentage,
20
+ '0%' => :float,
21
+ '0.00%' => :float,
17
22
  '0.00E+00' => :float,
18
23
  '# ?/?' => :float, #??? TODO:
19
24
  '# ??/??' => :float, #??? TODO:
@@ -78,24 +83,42 @@ class Dullard::Workbook
78
83
 
79
84
  def initialize(file, user_defined_formats = {})
80
85
  @file = file
81
- @zipfs = Zip::File.open(@file)
86
+ begin
87
+ @zipfs = Zip::File.open(@file)
88
+ rescue Zip::Error => e
89
+ raise Dullard::Error, e.message
90
+ end
82
91
  @user_defined_formats = user_defined_formats
83
92
  read_styles
84
93
  end
85
94
 
86
95
  def sheets
87
- workbook = Nokogiri::XML::Document.parse(@zipfs.file.open("xl/workbook.xml"))
88
- @sheets = workbook.css("sheet").each_with_index.map {|n,i| Dullard::Sheet.new(self, n.attr("name"), n.attr("sheetId"), i+1) }
96
+ begin
97
+ workbook = Nokogiri::XML::Document.parse(@zipfs.file.open('xl/workbook.xml'))
98
+ rescue Zip::Error
99
+ raise Dullard::Error, 'Invalid file, could not open xl/workbook.xml'
100
+ end
101
+ @sheets = workbook.css('sheet').each_with_index.map do |n, i|
102
+ Dullard::Sheet.new(self, n.attr('name'), n.attr('sheetId'), i+1)
103
+ end
89
104
  end
90
105
 
91
106
  def string_table
92
- @string_tabe ||= read_string_table
107
+ @string_table ||= read_string_table
93
108
  end
94
109
 
95
110
  def read_string_table
96
- @string_table = []
111
+ return [] unless @zipfs.file.exist? Dullard::SharedStringPath
112
+
113
+ begin
114
+ shared_string = @zipfs.file.open(Dullard::SharedStringPath)
115
+ rescue Zip::Error
116
+ raise Dullard::Error, 'Invalid file, could not open shared string file.'
117
+ end
118
+
97
119
  entry = ''
98
- Nokogiri::XML::Reader(@zipfs.file.open("xl/sharedStrings.xml")).each do |node|
120
+ @string_table = []
121
+ Nokogiri::XML::Reader(shared_string).each do |node|
99
122
  if node.name == "si" and node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
100
123
  entry = ''
101
124
  elsif node.name == "si" and node.node_type == Nokogiri::XML::Reader::TYPE_END_ELEMENT
@@ -108,20 +131,29 @@ class Dullard::Workbook
108
131
  end
109
132
 
110
133
  def read_styles
111
- doc = Nokogiri::XML(@zipfs.file.open("xl/styles.xml"))
112
-
113
134
  @num_formats = {}
114
135
  @cell_xfs = []
136
+ return unless @zipfs.file.exist? Dullard::StylesPath
137
+
138
+ begin
139
+ doc = Nokogiri::XML(@zipfs.file.open(Dullard::StylesPath))
140
+ rescue Zip::Error
141
+ raise Dullard::Error, 'Invalid file, could not open styles'
142
+ end
115
143
 
116
144
  doc.css('/styleSheet/numFmts/numFmt').each do |numFmt|
117
- numFmtId = numFmt.attributes['numFmtId'].value.to_i
118
- formatCode = numFmt.attributes['formatCode'].value
119
- @num_formats[numFmtId] = formatCode
145
+ if numFmt.attributes['numFmtId'] && numFmt.attributes['formatCode']
146
+ numFmtId = numFmt.attributes['numFmtId'].value.to_i
147
+ formatCode = numFmt.attributes['formatCode'].value
148
+ @num_formats[numFmtId] = formatCode
149
+ end
120
150
  end
121
151
 
122
152
  doc.css('/styleSheet/cellXfs/xf').each do |xf|
123
- numFmtId = xf.attributes['numFmtId'].value.to_i
124
- @cell_xfs << numFmtId
153
+ if xf.attributes['numFmtId']
154
+ numFmtId = xf.attributes['numFmtId'].value.to_i
155
+ @cell_xfs << numFmtId
156
+ end
125
157
  end
126
158
  end
127
159
 
@@ -168,11 +200,15 @@ class Dullard::Sheet
168
200
  @name = name
169
201
  @id = id
170
202
  @index = index
171
- @file = @workbook.zipfs.file.open(path) if @workbook.zipfs.file.exist?(path)
203
+ begin
204
+ @file = @workbook.zipfs.file.open(path) if @workbook.zipfs.file.exist?(path)
205
+ rescue Zip::Error => e
206
+ raise Dullard::Error, "Couldn't open sheet #{index}: #{e.message}"
207
+ end
172
208
  end
173
209
 
174
210
  def string_lookup(i)
175
- @workbook.string_table[i]
211
+ @workbook.string_table[i] || (raise Dullard::Error, 'File invalid, invalid string table.')
176
212
  end
177
213
 
178
214
  def rows
@@ -181,6 +217,7 @@ class Dullard::Sheet
181
217
  @file.rewind
182
218
  shared = false
183
219
  row = nil
220
+ cell_map = nil # Map of column letter to cell value for a row
184
221
  column = nil
185
222
  cell_type = nil
186
223
  Nokogiri::XML::Reader(@file).each do |node|
@@ -188,55 +225,65 @@ class Dullard::Sheet
188
225
  when Nokogiri::XML::Reader::TYPE_ELEMENT
189
226
  case node.name
190
227
  when "row"
191
- row = []
192
- column = 0
228
+ cell_map = {}
193
229
  next
194
- when "c"
195
- if node.attributes['t'] != 's' && node.attributes['t'] != 'b'
230
+ when 'c'
231
+ node_type = node.attributes['t']
232
+ cell_index = node.attributes['r']
233
+ if !cell_index
234
+ raise Dullard::Error, 'Invalid spreadsheet XML.'
235
+ end
236
+
237
+ if node_type != 's' && node_type != 'b'
196
238
  cell_format_index = node.attributes['s'].to_i
197
239
  cell_type = @workbook.format2type(@workbook.attribute2format(cell_format_index))
198
240
  end
199
241
 
200
- rcolumn = node.attributes["r"]
201
- if rcolumn
202
- rcolumn.delete!("0-9")
203
- while column < self.class.column_names.size and rcolumn != self.class.column_names[column]
204
- row << nil
205
- column += 1
206
- end
207
- end
208
- shared = (node.attribute("t") == "s")
209
- column += 1
242
+ column = cell_index.delete('0-9')
243
+ shared = (node_type == 's')
210
244
  next
211
245
  end
212
246
  when Nokogiri::XML::Reader::TYPE_END_ELEMENT
213
- if node.name == "row"
214
- y << row
215
- next
247
+ if node.name == 'row'
248
+ y << process_row(cell_map)
216
249
  end
250
+ next
217
251
  end
218
- value = node.value
219
252
 
220
- if value
253
+ if node.value
254
+ value = (shared ? string_lookup(value.to_i) : value)
221
255
  case cell_type
222
256
  when :datetime
223
257
  when :time
224
258
  when :date
225
- value = (DateTime.new(1899,12,30) + value.to_f)
226
- when :percentage # ? TODO
259
+ value = (Dullard::OOXMLEpoch + node.value.to_f)
227
260
  when :float
228
- value = value.to_f
261
+ value = node.value.to_f
229
262
  else
230
263
  # leave as string
231
264
  end
232
265
  cell_type = nil
233
-
234
- row << (shared ? string_lookup(value.to_i) : value)
266
+ cell_map[column] = value
235
267
  end
236
268
  end
237
269
  end
238
270
  end
239
271
 
272
+ def process_row(cell_map)
273
+ max = cell_map.keys.map {|c| self.class.column_name_to_index c }.max
274
+ row = []
275
+ self.class.column_names[0..max].each do |col|
276
+ if self.class.column_name_to_index(col) > max
277
+ break
278
+ else
279
+ row << cell_map[col]
280
+ end
281
+ end
282
+ row
283
+ end
284
+
285
+
286
+
240
287
  # Returns A to ZZZ.
241
288
  def self.column_names
242
289
  if @column_names
@@ -252,6 +299,16 @@ class Dullard::Sheet
252
299
  end
253
300
  end
254
301
 
302
+ def self.column_name_to_index(name)
303
+ if not @column_names_to_indices
304
+ @column_names_to_indices = {}
305
+ self.column_names.each_with_index do |name, i|
306
+ @column_names_to_indices[name] = i
307
+ end
308
+ end
309
+ @column_names_to_indices[name]
310
+ end
311
+
255
312
  def row_count
256
313
  if defined? @row_count
257
314
  @row_count
@@ -260,11 +317,11 @@ class Dullard::Sheet
260
317
  Nokogiri::XML::Reader(@file).each do |node|
261
318
  if node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
262
319
  case node.name
263
- when "dimension"
320
+ when 'dimension'
264
321
  if ref = node.attributes["ref"]
265
322
  break @row_count = ref.scan(/\d+$/).first.to_i
266
323
  end
267
- when "sheetData"
324
+ when 'sheetData'
268
325
  break @row_count = nil
269
326
  end
270
327
  end
@@ -1,3 +1,3 @@
1
1
  module Dullard
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
@@ -1,6 +1,6 @@
1
1
  require 'dullard'
2
2
 
3
- describe "dullard," do
3
+ describe "test.xlsx," do
4
4
  before(:each) do
5
5
  @file = File.open(File.expand_path("../test.xlsx", __FILE__))
6
6
  end
@@ -22,7 +22,6 @@ describe "dullard," do
22
22
  rows = @xlsx.sheets[0].rows
23
23
  rows.next.count.should == 300
24
24
  rows.next.count.should == 9
25
- rows.next.count.should == 1
26
25
  end
27
26
 
28
27
  it "reads the right number of rows" do
@@ -71,3 +70,47 @@ describe "dullard," do
71
70
  end
72
71
  end
73
72
  end
73
+
74
+ describe "test2.xlsx" do
75
+ before(:each) do
76
+ @file = File.open(File.expand_path("../test2.xlsx", __FILE__))
77
+ end
78
+
79
+ it "should not skip nils" do
80
+ rows = Dullard::Workbook.new(@file).sheets[0].rows.to_a
81
+ rows.should == [
82
+ [1],
83
+ [nil, 2],
84
+ [nil, nil, 3]
85
+ ]
86
+ end
87
+ end
88
+
89
+ describe "error handling" do
90
+ it "should raise an error when a cell is missing r attr" do
91
+ @file = File.expand_path("../error_missing_r.xlsx", __FILE__)
92
+ book = Dullard::Workbook.new(@file)
93
+ sheet = book.sheets[0]
94
+ expect {
95
+ sheet.rows.to_a
96
+ }.to raise_error(Dullard::Error)
97
+ end
98
+
99
+ it "should succeed when styles are missing" do
100
+ file = File.expand_path("../error_missing_metadata.xlsx", __FILE__)
101
+ book = Dullard::Workbook.new(file)
102
+ sheet = book.sheets[0]
103
+ expect {
104
+ sheet.rows.to_a
105
+ }.not_to raise_error
106
+ end
107
+
108
+ it "should raise an error with invalid shared string index" do
109
+ file = File.expand_path("../error_missing_ss.xlsx", __FILE__)
110
+ book = Dullard::Workbook.new(file)
111
+ sheet = book.sheets[0]
112
+ expect {
113
+ sheet.rows.to_a
114
+ }.to raise_error(Dullard::Error)
115
+ end
116
+ end
Binary file
metadata CHANGED
@@ -1,69 +1,69 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dullard
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ted Kaplan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-03-30 00:00:00.000000000 Z
11
+ date: 2015-01-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '2.14'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ~>
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '2.14'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - ~>
31
+ - - "~>"
32
32
  - !ruby/object:Gem::Version
33
33
  version: '10.1'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ~>
38
+ - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.1'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: nokogiri
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ~>
45
+ - - "~>"
46
46
  - !ruby/object:Gem::Version
47
47
  version: '1.6'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ~>
52
+ - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '1.6'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: rubyzip
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - ~>
59
+ - - "~>"
60
60
  - !ruby/object:Gem::Version
61
61
  version: '1.1'
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - ~>
66
+ - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '1.1'
69
69
  description:
@@ -73,7 +73,7 @@ executables: []
73
73
  extensions: []
74
74
  extra_rdoc_files: []
75
75
  files:
76
- - .gitignore
76
+ - ".gitignore"
77
77
  - Gemfile
78
78
  - LICENSE
79
79
  - README.md
@@ -83,6 +83,9 @@ files:
83
83
  - lib/dullard/reader.rb
84
84
  - lib/dullard/version.rb
85
85
  - spec/dullard_spec.rb
86
+ - spec/error_missing_metadata.xlsx
87
+ - spec/error_missing_r.xlsx
88
+ - spec/error_missing_ss.xlsx
86
89
  - spec/test.xlsx
87
90
  homepage: http://github.com/thirtyseven/dullard
88
91
  licenses:
@@ -94,20 +97,23 @@ require_paths:
94
97
  - lib
95
98
  required_ruby_version: !ruby/object:Gem::Requirement
96
99
  requirements:
97
- - - '>='
100
+ - - ">="
98
101
  - !ruby/object:Gem::Version
99
102
  version: '0'
100
103
  required_rubygems_version: !ruby/object:Gem::Requirement
101
104
  requirements:
102
- - - '>='
105
+ - - ">="
103
106
  - !ruby/object:Gem::Version
104
107
  version: '0'
105
108
  requirements: []
106
109
  rubyforge_project:
107
- rubygems_version: 2.0.5
110
+ rubygems_version: 2.4.3
108
111
  signing_key:
109
112
  specification_version: 4
110
113
  summary: A fast XLSX parser using Nokogiri
111
114
  test_files:
112
115
  - spec/dullard_spec.rb
116
+ - spec/error_missing_metadata.xlsx
117
+ - spec/error_missing_r.xlsx
118
+ - spec/error_missing_ss.xlsx
113
119
  - spec/test.xlsx