xsv 0.2.3 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9493c51a049db036f7fcfacbdf911d69255008cb15fd4853eafd12a6d9eeaaf1
4
- data.tar.gz: '008dfbfa456e9a07e7e301290388a23bb18e3e61f79ce480e5cff5f8b050253f'
3
+ metadata.gz: 34a568089851462504ab294931b59d454ed2df2788282e8aa15cc166e0c45271
4
+ data.tar.gz: 48e148855403abc349d62093d8351d68497681b43726c5894dc45b87c964a9e7
5
5
  SHA512:
6
- metadata.gz: 154e166f904d5c964d44ad7886d8b5dbc350073d8c01b22f6490c1bf65ec376e06474b249f781314deb3541ee14bbb103e3acd096938bd1b7d8bb118b94b1ab1
7
- data.tar.gz: ac24357f40b3aba550668a4d2ba4be75625751868f180a07b981dc931fcf28c2a714eac72443d123dd0a22ac28e1e6c67e56ca3423eb89a716b4e78b7ce8de59
6
+ metadata.gz: c634494fbff9d65dc9f16af6cdc457fb8caddd56da46bdaf256562a4b4d96ffe271ece897cec1ec153befa30eb617c3d39da70c5b5250039838848464108d14c
7
+ data.tar.gz: f3bde0d89ddd1d8b5badf35b7da02025cddcc41b3f6127f48ec56bfcef31d22e16ff4378f3c3a79502c42089c41dea35bce8b0245920ba23c4ab7e303c6f9340
@@ -1,17 +1,15 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- xsv (0.2.2)
5
- nokogiri (~> 1.10)
4
+ xsv (0.2.3)
5
+ ox (~> 2.13)
6
6
  rubyzip (~> 2.2)
7
7
 
8
8
  GEM
9
9
  remote: https://rubygems.org/
10
10
  specs:
11
- mini_portile2 (2.4.0)
12
11
  minitest (5.14.0)
13
- nokogiri (1.10.8)
14
- mini_portile2 (~> 2.4.0)
12
+ ox (2.13.2)
15
13
  rake (10.5.0)
16
14
  rubyzip (2.2.0)
17
15
 
data/README.md CHANGED
@@ -1,12 +1,12 @@
1
1
  # Xsv .xlsx reader
2
2
 
3
- Xsv is a very basic parser for Excel files in the .xlsx format that strives to
4
- provide feature parity with common CSV readers and nothing more. This means
5
- it only parses values to basic Ruby types and does not deal with most formatting
6
- or more advanced functionality. The goal is to allow for fast parsing of large
7
- worksheets with minimal RAM and CPU consumption.
3
+ Xsv is a very basic parser for Office Open XML spreadsheet files (.xlsx files)
4
+ that aims to provide feature parity with common CSV readers with high
5
+ performance. This means it only parses values to basic Ruby types and does not
6
+ deal with most formatting or more advanced functionality. The goal is to allow
7
+ for fast parsing of large worksheets with minimal RAM and CPU consumption.
8
8
 
9
- Xsv stands for 'Excel Separated Values' because Excel just gets in the way.
9
+ Xsv stands for 'Excel Separated Values', because Excel just gets in the way.
10
10
 
11
11
  ## Installation
12
12
 
@@ -99,7 +99,9 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
99
99
 
100
100
  ## Contributing
101
101
 
102
- Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/xsv.
102
+ Bug reports and pull requests are welcome on GitHub at https://github.com/martijn/xsv.
103
+ Please provide an .xlsx file with a minimal breaking example that is acceptable
104
+ for inclusion in the source code repository.
103
105
 
104
106
  ## License
105
107
 
data/lib/xsv.rb CHANGED
@@ -1,7 +1,12 @@
1
1
  require "date"
2
+ require "ox"
2
3
 
3
4
  require "xsv/helpers"
5
+ require "xsv/shared_strings_parser"
4
6
  require "xsv/sheet"
7
+ require "xsv/sheet_bounds_handler"
8
+ require "xsv/sheet_rows_handler"
9
+ require "xsv/styles_handler"
5
10
  require "xsv/version"
6
11
  require "xsv/workbook"
7
12
 
@@ -0,0 +1,37 @@
module Xsv
  # SharedStringsParser is an Ox SAX handler that collects the text content of
  # every <si> element in a shared strings document. All <t> text found inside
  # one <si> is concatenated into a single string.
  class SharedStringsParser < Ox::Sax
    # Parse the XML available from +io+ and return the collected strings as an
    # Array, in document order.
    def self.parse(io)
      strings = []
      handler = new { |s| strings << s }
      Ox.sax_parse(handler, io)
      strings
    end

    # The given block is called once for every completed <si> element with the
    # string that was assembled for it.
    def initialize(&block)
      @block = block
      @state = nil
      @current_string = nil
    end

    def start_element(name)
      case name
      when :si
        # Start a fresh, mutable buffer for this string item
        @current_string = +""
      when :t
        @state = name
      end
    end

    def text(value)
      # Only text inside a <t> that belongs to an open <si> is collected
      return unless @state == :t && @current_string

      # Append in place rather than building a new string for every fragment
      @current_string << value
    end

    def end_element(name)
      case name
      when :si
        @block.call(@current_string)
        # Drop the reference so later text can never modify a yielded string
        @current_string = nil
      when :t
        @state = nil
      end
    end
  end
end
@@ -3,26 +3,19 @@ module Xsv
3
3
  include Enumerable
4
4
  include Xsv::Helpers
5
5
 
6
- attr_reader :xml, :mode
6
+ attr_reader :mode
7
7
 
8
8
  # Set a number of rows to skip at the top of the sheet (header row offset)
9
9
  attr_accessor :row_skip
10
10
 
11
- def initialize(workbook, xml)
11
+ def initialize(workbook, io)
12
12
  @workbook = workbook
13
- @xml = xml
13
+ @io = io
14
14
  @headers = []
15
15
  @mode = :array
16
16
  @row_skip = 0
17
17
 
18
- @has_cells = !xml.at_css("sheetData c").nil?
19
-
20
- if @has_cells
21
- @column_count, @last_row = get_sheet_dimensions
22
- else
23
- @column_count = 0
24
- @last_row = 0
25
- end
18
+ @last_row, @column_count = SheetBoundsHandler.get_bounds(@io, @workbook)
26
19
  end
27
20
 
28
21
  def inspect
@@ -31,30 +24,14 @@ module Xsv
31
24
 
32
25
  # Iterate over rows
33
26
  def each_row
34
- row_index = 0 - @row_skip
35
-
36
- @xml.css("sheetData row").each do |row_xml|
37
- if row_index < 0
38
- row_index += 1
39
- next
40
- end
41
-
42
- row_index += 1
43
-
44
- next if row_index == 1 && @mode == :hash
45
-
46
- # pad empty rows
47
- while row_index < row_xml["r"].to_i - @row_skip do
48
- yield(empty_row)
49
- row_index += 1
50
- end
27
+ @io.rewind
51
28
 
52
- yield(parse_row(row_xml))
53
-
54
- # Do not return empty trailing rows
55
- break if row_index == @last_row - @row_skip
29
+ handler = SheetRowsHandler.new(@mode, empty_row, @workbook, @row_skip, @last_row) do |row|
30
+ yield(row)
56
31
  end
57
32
 
33
+ Ox.sax_parse(handler, @io)
34
+
58
35
  true
59
36
  end
60
37
 
@@ -62,13 +39,11 @@ module Xsv
62
39
 
63
40
  # Get row by number, starting at 0
64
41
  def [](number)
65
- row_xml = xml.at_css("sheetData row[r=#{number + @row_skip + 1}]")
66
-
67
- if row_xml
68
- parse_row(row_xml)
69
- else
70
- empty_row
42
+ each_with_index do |row, i|
43
+ return row if i == number
71
44
  end
45
+
46
+ return empty_row
72
47
  end
73
48
 
74
49
  # Load headers in the top row of the worksheet. After parsing of headers
@@ -91,7 +66,12 @@ module Xsv
91
66
  private
92
67
 
# Read the header row of the sheet.
#
# In :array mode this is simply the first row. In :hash mode the sheet is
# switched to :array mode while `headers` is evaluated and switched back to
# :hash afterwards. Returns nil for any other mode.
#
# NOTE(review): `headers` is defined outside this view; the temporary mode
# switch presumably lets it read the first row as a plain array - confirm.
def parse_headers
  return first if @mode == :array
  return unless @mode == :hash

  # This has to be an assignment; a bare comparison leaves the mode unchanged
  @mode = :array
  begin
    headers
  ensure
    # Restore hash mode even when reading the headers raises
    @mode = :hash
  end
end
96
76
 
97
77
  def empty_row
@@ -102,71 +82,5 @@ module Xsv
102
82
  @headers.zip([]).to_h
103
83
  end
104
84
  end
105
-
106
- def parse_row(xml, mode = nil)
107
- mode ||= @mode
108
- row = empty_row
109
-
110
- xml.css("c").first(@column_count).each do |c_xml|
111
- value = case c_xml["t"]
112
- when "s"
113
- @workbook.shared_strings[c_xml.css("v").inner_text.to_i]
114
- when "str"
115
- c_xml.css("v").inner_text.to_s
116
- when "e" # N/A
117
- nil
118
- when nil
119
- v = c_xml.at_css("v")
120
-
121
- if v.nil?
122
- nil
123
- elsif c_xml["s"]
124
- style = @workbook.xfs[c_xml["s"].to_i]
125
- numFmt = @workbook.numFmts[style[:numFmtId].to_i]
126
-
127
- parse_number_format(v.inner_text, numFmt)
128
- else
129
- parse_number(v.inner_text)
130
- end
131
- else
132
- raise Xsv::Error, "Encountered unknown column type #{c_xml["t"]}"
133
- end
134
-
135
- # Determine column position and pad row with nil values
136
- col_index = column_index(c_xml["r"])
137
-
138
- case mode
139
- when :array
140
- row[col_index] = value
141
- when :hash
142
- row[@headers[col_index]] = value
143
- end
144
- end
145
-
146
- row
147
- end
148
-
149
- # Read or estimate outer bounds of sheet
150
- def get_sheet_dimensions
151
- dimension = xml.at_css("dimension")
152
-
153
- if dimension
154
- _firstCell, lastCell = dimension["ref"].split(":")
155
- end
156
-
157
- if lastCell
158
- # Assume the dimension reflects the content
159
- column_count = column_index(lastCell) + 1
160
- else
161
- # Find the last cell in every row that has a value
162
- rightmost_cells = @xml.xpath("//xmlns:row/xmlns:c[*[local-name() = 'v']][last()]").map { |c| column_index(c["r"]) }
163
- column_count = rightmost_cells.max + 1
164
- end
165
-
166
- # Find the last row that contains actual values
167
- last_row = @xml.at_xpath("//xmlns:row[*[xmlns:v]][last()]")["r"].to_i
168
-
169
- return [column_count, last_row]
170
- end
171
85
  end
172
86
  end
@@ -0,0 +1,76 @@
module Xsv
  # SheetBoundsHandler is an Ox SAX handler that scans a worksheet to find the
  # outer bounds of its content: the highest row number and the highest column
  # index that are in use.
  class SheetBoundsHandler < Ox::Sax
    include Xsv::Helpers

    # Rewind +sheet+, scan it and return [rows, cols].
    #
    # A reported maximum column of 0 yields 0 columns; any other value is
    # turned into a column count by adding one.
    def self.get_bounds(sheet, workbook)
      last_row = 0
      column_count = 0

      scanner = new(workbook.trim_empty_rows) do |max_row, max_column|
        last_row = max_row
        column_count = max_column == 0 ? 0 : max_column + 1

        # Returning from inside the block leaves get_bounds itself, which
        # abandons the SAX parse as soon as the bounds are known.
        return last_row, column_count
      end

      sheet.rewind
      Ox.sax_parse(scanner, sheet)

      [last_row, column_count]
    end

    # Ox::Sax implementation

    # +trim_empty_rows+ decides whether the <dimension> element is trusted for
    # the last row (false) or whether the rows are scanned for values (true).
    # The block receives the maximum row and maximum column that were found.
    def initialize(trim_empty_rows, &block)
      @block = block
      @trim_empty_rows = trim_empty_rows
      @state = nil
      @cell = nil
      @row = nil
      @max_row = 0
      @max_column = 0
    end

    def start_element(name)
      if name == :c
        @state = :c
        @cell = nil
      elsif name == :v
        # A value was found: widen the bounds to include the current cell.
        # column_index comes from Xsv::Helpers (not visible here).
        column = column_index(@cell)
        @max_column = column if column > @max_column
        @max_row = @row if @row > @max_row
      elsif name == :row
        @state = :row
        @row = nil
      elsif name == :dimension
        @state = :dimension
      end
    end

    def end_element(name)
      # All rows have been seen; report what was found
      @block.call(@max_row, @max_column) if name == :sheetData
    end

    def attr(name, value)
      if name == :r
        case @state
        when :c then @cell = value
        when :row then @row = value.to_i
        end
      elsif name == :ref && @state == :dimension
        read_dimension(value)
      end
    end

    private

    # Use the last cell of a "first:last" dimension reference as the bounds.
    # A reference without a second part is ignored.
    def read_dimension(ref)
      last_cell = ref.split(":")[1]
      return unless last_cell

      @max_column = column_index(last_cell)
      return if @trim_empty_rows

      # The dimension is taken at face value, so scanning can stop right here
      @max_row = last_cell[/\d+$/].to_i
      @block.call(@max_row, @max_column)
    end
  end
end
@@ -0,0 +1,121 @@
module Xsv
  # SheetRowsHandler is an Ox SAX handler that converts the <row> elements of a
  # worksheet into Ruby rows (arrays or hashes, depending on the mode) and
  # passes them to the given block one at a time.
  class SheetRowsHandler < Ox::Sax
    include Xsv::Helpers

    # Convert the text collected for the current cell into a Ruby value, based
    # on the cell's "t" and "s" attributes.
    #
    # Raises Xsv::Error when the "t" attribute holds an unknown type.
    def format_cell
      case @current_cell[:t]
      when "s"
        # The value is an index into the workbook's shared strings
        @workbook.shared_strings[@current_value.to_i]
      when "str"
        @current_value
      when "e" # N/A
        nil
      when nil
        if @current_value == ""
          nil
        elsif @current_cell[:s]
          # Look up the number format that belongs to the cell's style.
          # parse_number_format comes from Xsv::Helpers (not visible here).
          style = @workbook.xfs[@current_cell[:s].to_i]
          num_fmt = @workbook.numFmts[style[:numFmtId].to_i]

          parse_number_format(@current_value, num_fmt)
        else
          parse_number(@current_value)
        end
      else
        raise Xsv::Error, "Encountered unknown column type #{@current_cell[:t]}"
      end
    end

    # Ox::Sax implementation below

    # mode      - :array or :hash
    # empty_row - template row; copied for every row that is produced
    # workbook  - provides shared_strings, xfs and numFmts
    # row_skip  - number of rows to skip at the top of the sheet
    # last_row  - number of the last row that should be returned
    # block     - called with every row that is produced
    def initialize(mode, empty_row, workbook, row_skip, last_row, &block)
      @block = block

      # :sheetData
      #   :row
      #     :c
      #       :v
      @state = nil

      @mode = mode
      @empty_row = empty_row
      @workbook = workbook
      @row_skip = row_skip
      @row_index = 0 - @row_skip
      @current_row = {}
      @current_row_attrs = {}
      @current_cell = {}
      @current_value = nil
      @last_row = last_row

      @headers = @empty_row.keys if @mode == :hash
    end

    def start_element(name)
      case name
      when :c
        @state = name
        @current_cell = {}
        # Fresh mutable buffer; text fragments are appended to it in place
        @current_value = +""
      when :v
        @state = name
      when :row
        @state = name
        @current_row = @empty_row.dup
        @current_row_attrs = {}
      else
        @state = nil
      end
    end

    def text(value)
      @current_value << value if @state == :v
    end

    def attr(name, value)
      case @state
      when :c
        @current_cell[name] = value
      when :row
        @current_row_attrs[name] = value
      end
    end

    def end_element(name)
      case name
      when :c
        col_index = column_index(@current_cell[:r])

        case @mode
        when :array
          @current_row[col_index] = format_cell
        when :hash
          @current_row[@headers[col_index]] = format_cell
        end
      when :row
        # Rows that fall within the row_skip offset are counted but not returned
        if @row_index < 0
          @row_index += 1
          return
        end

        @row_index += 1

        # Skip first row if we're in hash mode
        return if @row_index == 1 && @mode == :hash

        # Pad empty rows. Every padded row is a separate copy, so a caller that
        # modifies one of them does not affect the others or the template.
        while @row_index < @current_row_attrs[:r].to_i - @row_skip
          @block.call(@empty_row.dup)
          @row_index += 1
        end

        # Do not return empty trailing rows
        @block.call(@current_row) unless @row_index > @last_row - @row_skip
      end
    end
  end
end
@@ -0,0 +1,58 @@
module Xsv
  # StylesHandler interprets the relevant parts of styles.xml: the attributes
  # collected for the <xf> elements within <cellXfs> and the custom number
  # formats listed within <numFmts>.
  class StylesHandler < Ox::Sax
    # Parse the styles document available from +io+.
    #
    # +numFmts+ is the table of number formats (indexed by numFmtId) that the
    # formats found in the document are added to.
    #
    # Returns [xfs, numFmts]. Both are nil when no closing styleSheet element
    # is encountered.
    def self.get_styles(io, numFmts)
      # Keep the results in locals; instance variables at this level would be
      # stored on the class itself and shared between all calls.
      xfs = nil
      formats = nil

      handler = new(numFmts) do |parsed_xfs, parsed_formats|
        xfs = parsed_xfs
        formats = parsed_formats
      end

      Ox.sax_parse(handler, io)
      [xfs, formats]
    end

    # Ox::Sax implementation

    # The block is called with the collected xfs and number formats when the
    # styleSheet element is closed.
    def initialize(numFmts, &block)
      @block = block
      @state = nil
      @xfs = []
      @numFmts = numFmts

      @xf = {}
      @numFmt = {}
    end

    def start_element(name)
      case name
      when :cellXfs, :numFmts
        @state = name
      when :xf
        @xf = {}
      when :numFmt
        @numFmt = {}
      end
    end

    def attr(name, value)
      # Attributes are collected into the hash that matches the current section
      case @state
      when :cellXfs
        @xf[name] = value
      when :numFmts
        @numFmt[name] = value
      end
    end

    def end_element(name)
      if @state == :cellXfs && name == :xf
        @xfs << @xf
      elsif @state == :numFmts && name == :numFmt
        @numFmts[@numFmt[:numFmtId].to_i] = @numFmt[:formatCode]
      elsif name == :styleSheet
        @block.call(@xfs, @numFmts)
      end
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Xsv
2
- VERSION = "0.2.3"
2
+ VERSION = "0.3.0"
3
3
  end
@@ -4,22 +4,28 @@ require 'zip'
4
4
  module Xsv
5
5
  class Workbook
6
6
 
7
- attr_reader :sheets, :shared_strings, :xfs, :numFmts
7
+ attr_reader :sheets, :shared_strings, :xfs, :numFmts, :trim_empty_rows
8
8
 
# Open the workbook of the given filename, string or buffer.
#
# An IO object, or a string that starts with the zip signature "PK\x03\x04",
# is read as a buffer; anything else is treated as a filename.
#
# Keyword arguments are passed on to Workbook#initialize as keywords, so
# they are not mistaken for a positional hash.
def self.open(data, **kws)
  zip = if data.is_a?(IO) || data.start_with?("PK\x03\x04")
    Zip::File.open_buffer(data)
  else
    Zip::File.open(data)
  end

  # NOTE(review): the workbook is also stored on the class, as before; no
  # reader of this variable is visible here - confirm whether it is needed.
  @workbook = new(zip, **kws)
end
19
19
 
20
20
  # Open a workbook from an instance of Zip::File
21
- def initialize(zip)
21
+ #
22
+ # Options:
23
+ #
24
+ # trim_empty_rows (false) Scan sheet for end of content and don't return trailing rows
25
+ #
26
+ def initialize(zip, trim_empty_rows: false)
22
27
  @zip = zip
28
+ @trim_empty_rows = trim_empty_rows
23
29
 
24
30
  @sheets = []
25
31
  @xfs = []
@@ -38,35 +44,22 @@ module Xsv
38
44
 
39
45
  def fetch_shared_strings
40
46
  stream = @zip.glob("xl/sharedStrings.xml").first.get_input_stream
41
- xml = Nokogiri::XML(stream)
42
- expected_count = xml.at_css("sst")["uniqueCount"].to_i
43
- @shared_strings = xml.css("sst si").map { |si| si.css("t").map(&:inner_text).join }
44
-
45
- if @shared_strings.count != expected_count
46
- raise Xsv::AssertionFailed, "Mismatch in shared strings count! #{expected_count} <> #{@shared_strings.count}"
47
- end
47
+ @shared_strings = SharedStringsParser.parse(stream)
48
48
 
49
49
  stream.close
50
50
  end
51
51
 
52
52
  def fetch_styles
53
53
  stream = @zip.glob("xl/styles.xml").first.get_input_stream
54
- xml = Nokogiri::XML(stream)
55
54
 
56
- xml.css("cellXfs xf").each do |xf|
57
- @xfs << xf.attributes.map { |k, v| [k.to_sym, v.value] }.to_h
58
- end
59
-
60
- xml.css("numFmts numFmt").each do |numFmt|
61
- @numFmts[numFmt["numFmtId"].to_i] = numFmt["formatCode"]
62
- end
55
+ @xfs, @numFmts = StylesHandler.get_styles(stream, @numFmts)
63
56
  end
64
57
 
65
58
  def fetch_sheets
66
59
  @zip.glob("xl/worksheets/sheet*.xml").sort do |a, b|
67
60
  a.name[/\d+/].to_i <=> b.name[/\d+/].to_i
68
61
  end.each do |entry|
69
- @sheets << Xsv::Sheet.new(self, Nokogiri::XML(entry.get_input_stream))
62
+ @sheets << Xsv::Sheet.new(self, entry.get_input_stream)
70
63
  end
71
64
  end
72
65
  end
@@ -34,7 +34,7 @@ Gem::Specification.new do |spec|
34
34
  spec.required_ruby_version = '~> 2.6'
35
35
 
36
36
  spec.add_dependency "rubyzip", "~> 2.2"
37
- spec.add_dependency "nokogiri", "~> 1.10"
37
+ spec.add_dependency "ox", "~> 2.13"
38
38
 
39
39
  spec.add_development_dependency "bundler", "~> 1.17"
40
40
  spec.add_development_dependency "rake", "~> 10.0"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: xsv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Martijn Storck
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-02-22 00:00:00.000000000 Z
11
+ date: 2020-02-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rubyzip
@@ -25,19 +25,19 @@ dependencies:
25
25
  - !ruby/object:Gem::Version
26
26
  version: '2.2'
27
27
  - !ruby/object:Gem::Dependency
28
- name: nokogiri
28
+ name: ox
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '1.10'
33
+ version: '2.13'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '1.10'
40
+ version: '2.13'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: bundler
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -98,7 +98,11 @@ files:
98
98
  - bin/setup
99
99
  - lib/xsv.rb
100
100
  - lib/xsv/helpers.rb
101
+ - lib/xsv/shared_strings_parser.rb
101
102
  - lib/xsv/sheet.rb
103
+ - lib/xsv/sheet_bounds_handler.rb
104
+ - lib/xsv/sheet_rows_handler.rb
105
+ - lib/xsv/styles_handler.rb
102
106
  - lib/xsv/version.rb
103
107
  - lib/xsv/workbook.rb
104
108
  - test.sh