xsv 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9493c51a049db036f7fcfacbdf911d69255008cb15fd4853eafd12a6d9eeaaf1
4
- data.tar.gz: '008dfbfa456e9a07e7e301290388a23bb18e3e61f79ce480e5cff5f8b050253f'
3
+ metadata.gz: 34a568089851462504ab294931b59d454ed2df2788282e8aa15cc166e0c45271
4
+ data.tar.gz: 48e148855403abc349d62093d8351d68497681b43726c5894dc45b87c964a9e7
5
5
  SHA512:
6
- metadata.gz: 154e166f904d5c964d44ad7886d8b5dbc350073d8c01b22f6490c1bf65ec376e06474b249f781314deb3541ee14bbb103e3acd096938bd1b7d8bb118b94b1ab1
7
- data.tar.gz: ac24357f40b3aba550668a4d2ba4be75625751868f180a07b981dc931fcf28c2a714eac72443d123dd0a22ac28e1e6c67e56ca3423eb89a716b4e78b7ce8de59
6
+ metadata.gz: c634494fbff9d65dc9f16af6cdc457fb8caddd56da46bdaf256562a4b4d96ffe271ece897cec1ec153befa30eb617c3d39da70c5b5250039838848464108d14c
7
+ data.tar.gz: f3bde0d89ddd1d8b5badf35b7da02025cddcc41b3f6127f48ec56bfcef31d22e16ff4378f3c3a79502c42089c41dea35bce8b0245920ba23c4ab7e303c6f9340
data/Gemfile.lock CHANGED
@@ -1,17 +1,15 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- xsv (0.2.2)
5
- nokogiri (~> 1.10)
4
+ xsv (0.2.3)
5
+ ox (~> 2.13)
6
6
  rubyzip (~> 2.2)
7
7
 
8
8
  GEM
9
9
  remote: https://rubygems.org/
10
10
  specs:
11
- mini_portile2 (2.4.0)
12
11
  minitest (5.14.0)
13
- nokogiri (1.10.8)
14
- mini_portile2 (~> 2.4.0)
12
+ ox (2.13.2)
15
13
  rake (10.5.0)
16
14
  rubyzip (2.2.0)
17
15
 
data/README.md CHANGED
@@ -1,12 +1,12 @@
1
1
  # Xsv .xlsx reader
2
2
 
3
- Xsv is a very basic parser for Excel files in the .xlsx format that strives to
4
- provide feature parity with common CSV readers and nothing more. This means
5
- it only parses values to basic Ruby types and does not deal with most formatting
6
- or more advanced functionality. The goal is to allow for fast parsing of large
7
- worksheets with minimal RAM and CPU consumption.
3
+ Xsv is a very basic parser for Office Open XML spreadsheet files (.xlsx files)
4
+ that aims to provide feature parity with common CSV readers with high
5
+ performance. This means it only parses values to basic Ruby types and does not
6
+ deal with most formatting or more advanced functionality. The goal is to allow
7
+ for fast parsing of large worksheets with minimal RAM and CPU consumption.
8
8
 
9
- Xsv stands for 'Excel Separated Values' because Excel just gets in the way.
9
+ Xsv stands for 'Excel Separated Values', because Excel just gets in the way.
10
10
 
11
11
  ## Installation
12
12
 
@@ -99,7 +99,9 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
99
99
 
100
100
  ## Contributing
101
101
 
102
- Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/xsv.
102
+ Bug reports and pull requests are welcome on GitHub at https://github.com/martijn/xsv.
103
+ Please provide an .xlsx file with a minimum breaking example that is acceptable
104
+ for inclusion in the source code repository.
103
105
 
104
106
  ## License
105
107
 
data/lib/xsv.rb CHANGED
@@ -1,7 +1,12 @@
1
1
  require "date"
2
+ require "ox"
2
3
 
3
4
  require "xsv/helpers"
5
+ require "xsv/shared_strings_parser"
4
6
  require "xsv/sheet"
7
+ require "xsv/sheet_bounds_handler"
8
+ require "xsv/sheet_rows_handler"
9
+ require "xsv/styles_handler"
5
10
  require "xsv/version"
6
11
  require "xsv/workbook"
7
12
 
data/lib/xsv/shared_strings_parser.rb ADDED
@@ -0,0 +1,37 @@
1
+ module Xsv
2
+ class SharedStringsParser < Ox::Sax
3
+ def self.parse(io)
4
+ strings = []
5
+ handler = new { |s| strings << s }
6
+ Ox.sax_parse(handler, io)
7
+ return strings
8
+ end
9
+
10
+ def initialize(&block)
11
+ @block = block
12
+ @state = nil
13
+ end
14
+
15
+ def start_element(name)
16
+ case name
17
+ when :si
18
+ @current_string = ""
19
+ when :t
20
+ @state = name
21
+ end
22
+ end
23
+
24
+ def text(value)
25
+ @current_string += value if @state == :t
26
+ end
27
+
28
+ def end_element(name)
29
+ case name
30
+ when :si
31
+ @block.call(@current_string)
32
+ when :t
33
+ @state = nil
34
+ end
35
+ end
36
+ end
37
+ end
data/lib/xsv/sheet.rb CHANGED
@@ -3,26 +3,19 @@ module Xsv
3
3
  include Enumerable
4
4
  include Xsv::Helpers
5
5
 
6
- attr_reader :xml, :mode
6
+ attr_reader :mode
7
7
 
8
8
  # Set a number of rows to skip at the top of the sheet (header row offset)
9
9
  attr_accessor :row_skip
10
10
 
11
- def initialize(workbook, xml)
11
+ def initialize(workbook, io)
12
12
  @workbook = workbook
13
- @xml = xml
13
+ @io = io
14
14
  @headers = []
15
15
  @mode = :array
16
16
  @row_skip = 0
17
17
 
18
- @has_cells = !xml.at_css("sheetData c").nil?
19
-
20
- if @has_cells
21
- @column_count, @last_row = get_sheet_dimensions
22
- else
23
- @column_count = 0
24
- @last_row = 0
25
- end
18
+ @last_row, @column_count = SheetBoundsHandler.get_bounds(@io, @workbook)
26
19
  end
27
20
 
28
21
  def inspect
@@ -31,30 +24,14 @@ module Xsv
31
24
 
32
25
  # Iterate over rows
33
26
  def each_row
34
- row_index = 0 - @row_skip
35
-
36
- @xml.css("sheetData row").each do |row_xml|
37
- if row_index < 0
38
- row_index += 1
39
- next
40
- end
41
-
42
- row_index += 1
43
-
44
- next if row_index == 1 && @mode == :hash
45
-
46
- # pad empty rows
47
- while row_index < row_xml["r"].to_i - @row_skip do
48
- yield(empty_row)
49
- row_index += 1
50
- end
27
+ @io.rewind
51
28
 
52
- yield(parse_row(row_xml))
53
-
54
- # Do not return empty trailing rows
55
- break if row_index == @last_row - @row_skip
29
+ handler = SheetRowsHandler.new(@mode, empty_row, @workbook, @row_skip, @last_row) do |row|
30
+ yield(row)
56
31
  end
57
32
 
33
+ Ox.sax_parse(handler, @io)
34
+
58
35
  true
59
36
  end
60
37
 
@@ -62,13 +39,11 @@ module Xsv
62
39
 
63
40
  # Get row by number, starting at 0
64
41
  def [](number)
65
- row_xml = xml.at_css("sheetData row[r=#{number + @row_skip + 1}]")
66
-
67
- if row_xml
68
- parse_row(row_xml)
69
- else
70
- empty_row
42
+ each_with_index do |row, i|
43
+ return row if i == number
71
44
  end
45
+
46
+ return empty_row
72
47
  end
73
48
 
74
49
  # Load headers in the top row of the worksheet. After parsing of headers
@@ -91,7 +66,12 @@ module Xsv
91
66
  private
92
67
 
93
68
  def parse_headers
94
- parse_row(@xml.css("sheetData row")[@row_skip], :array)
69
+ if @mode == :array
70
+ first
71
+ elsif @mode == :hash
72
+ @mode == :array
73
+ headers.tap { @mode = :hash }
74
+ end
95
75
  end
96
76
 
97
77
  def empty_row
@@ -102,71 +82,5 @@ module Xsv
102
82
  @headers.zip([]).to_h
103
83
  end
104
84
  end
105
-
106
- def parse_row(xml, mode = nil)
107
- mode ||= @mode
108
- row = empty_row
109
-
110
- xml.css("c").first(@column_count).each do |c_xml|
111
- value = case c_xml["t"]
112
- when "s"
113
- @workbook.shared_strings[c_xml.css("v").inner_text.to_i]
114
- when "str"
115
- c_xml.css("v").inner_text.to_s
116
- when "e" # N/A
117
- nil
118
- when nil
119
- v = c_xml.at_css("v")
120
-
121
- if v.nil?
122
- nil
123
- elsif c_xml["s"]
124
- style = @workbook.xfs[c_xml["s"].to_i]
125
- numFmt = @workbook.numFmts[style[:numFmtId].to_i]
126
-
127
- parse_number_format(v.inner_text, numFmt)
128
- else
129
- parse_number(v.inner_text)
130
- end
131
- else
132
- raise Xsv::Error, "Encountered unknown column type #{c_xml["t"]}"
133
- end
134
-
135
- # Determine column position and pad row with nil values
136
- col_index = column_index(c_xml["r"])
137
-
138
- case mode
139
- when :array
140
- row[col_index] = value
141
- when :hash
142
- row[@headers[col_index]] = value
143
- end
144
- end
145
-
146
- row
147
- end
148
-
149
- # Read or estimate outer bounds of sheet
150
- def get_sheet_dimensions
151
- dimension = xml.at_css("dimension")
152
-
153
- if dimension
154
- _firstCell, lastCell = dimension["ref"].split(":")
155
- end
156
-
157
- if lastCell
158
- # Assume the dimension reflects the content
159
- column_count = column_index(lastCell) + 1
160
- else
161
- # Find the last cell in every row that has a value
162
- rightmost_cells = @xml.xpath("//xmlns:row/xmlns:c[*[local-name() = 'v']][last()]").map { |c| column_index(c["r"]) }
163
- column_count = rightmost_cells.max + 1
164
- end
165
-
166
- # Find the last row that contains actual values
167
- last_row = @xml.at_xpath("//xmlns:row[*[xmlns:v]][last()]")["r"].to_i
168
-
169
- return [column_count, last_row]
170
- end
171
85
  end
172
86
  end
data/lib/xsv/sheet_bounds_handler.rb ADDED
@@ -0,0 +1,76 @@
1
+ module Xsv
2
+ # SheetBoundsHandler scans a sheet looking for the outer bounds of the content within
3
+ class SheetBoundsHandler < Ox::Sax
4
+ include Xsv::Helpers
5
+
6
+ def self.get_bounds(sheet, workbook)
7
+ rows = 0
8
+ cols = 0
9
+
10
+ handler = new(workbook.trim_empty_rows) do |row, col|
11
+ rows = row
12
+ cols = col == 0 ? 0 : col + 1
13
+
14
+ return rows, cols
15
+ end
16
+
17
+ sheet.rewind
18
+ Ox.sax_parse(handler, sheet)
19
+
20
+ return rows, cols
21
+ end
22
+
23
+ # Ox::Sax implementation
24
+
25
+ def initialize(trim_empty_rows, &block)
26
+ @block = block
27
+ @state = nil
28
+ @cell = nil
29
+ @row = nil
30
+ @maxRow = 0
31
+ @maxColumn = 0
32
+ @trim_empty_rows = trim_empty_rows
33
+ end
34
+
35
+ def start_element(name)
36
+ case name
37
+ when :c
38
+ @state = name
39
+ @cell = nil
40
+ when :v
41
+ col = column_index(@cell)
42
+ @maxColumn = col if col > @maxColumn
43
+ @maxRow = @row if @row > @maxRow
44
+ when :row
45
+ @state = name
46
+ @row = nil
47
+ when :dimension
48
+ @state = name
49
+ end
50
+ end
51
+
52
+ def end_element(name)
53
+ if name == :sheetData
54
+ @block.call(@maxRow, @maxColumn)
55
+ end
56
+ end
57
+
58
+ def attr(name, value)
59
+ if @state == :c && name == :r
60
+ @cell = value
61
+ elsif @state == :row && name == :r
62
+ @row = value.to_i
63
+ elsif @state == :dimension && name == :ref
64
+ _firstCell, lastCell = value.split(":")
65
+
66
+ if lastCell
67
+ @maxColumn = column_index(lastCell)
68
+ unless @trim_empty_rows
69
+ @maxRow = lastCell[/\d+$/].to_i
70
+ @block.call(@maxRow, @maxColumn)
71
+ end
72
+ end
73
+ end
74
+ end
75
+ end
76
+ end
data/lib/xsv/sheet_rows_handler.rb ADDED
@@ -0,0 +1,121 @@
1
+ module Xsv
2
+ class SheetRowsHandler < Ox::Sax
3
+ include Xsv::Helpers
4
+
5
+ def format_cell
6
+ case @current_cell[:t]
7
+ when "s"
8
+ @workbook.shared_strings[@current_value.to_i]
9
+ when "str"
10
+ @current_value
11
+ when "e" # N/A
12
+ nil
13
+ when nil
14
+ if @current_value == ""
15
+ nil
16
+ elsif @current_cell[:s]
17
+ style = @workbook.xfs[@current_cell[:s].to_i]
18
+ numFmt = @workbook.numFmts[style[:numFmtId].to_i]
19
+
20
+ parse_number_format(@current_value, numFmt)
21
+ else
22
+ parse_number(@current_value)
23
+ end
24
+ else
25
+ raise Xsv::Error, "Encountered unknown column type #{@current_cell[:t]}"
26
+ end
27
+ end
28
+
29
+ # Ox::Sax implementation below
30
+
31
+ def initialize(mode, empty_row, workbook, row_skip, last_row, &block)
32
+ @block = block
33
+
34
+ # :sheetData
35
+ # :row
36
+ # :c
37
+ # :v
38
+ @state = nil
39
+
40
+ @mode = mode
41
+ @empty_row = empty_row
42
+ @workbook = workbook
43
+ @row_skip = row_skip
44
+ @row_index = 0 - @row_skip
45
+ @current_row = {}
46
+ @current_row_attrs = {}
47
+ @current_cell = {}
48
+ @current_value = nil
49
+ @last_row = last_row
50
+
51
+ if @mode == :hash
52
+ @headers = @empty_row.keys
53
+ end
54
+ end
55
+
56
+ def start_element(name)
57
+ case name
58
+ when :c
59
+ @state = name
60
+ @current_cell = {}
61
+ @current_value = ""
62
+ when :v
63
+ @state = name
64
+ when :row
65
+ @state = name
66
+ @current_row = @empty_row.dup
67
+ @current_row_attrs = {}
68
+ else
69
+ @state = nil
70
+ end
71
+ end
72
+
73
+ def text(value)
74
+ if @state == :v
75
+ @current_value += value
76
+ end
77
+ end
78
+
79
+ def attr(name, value)
80
+ case @state
81
+ when :c
82
+ @current_cell[name] = value
83
+ when :row
84
+ @current_row_attrs[name] = value
85
+ end
86
+ end
87
+
88
+ def end_element(name)
89
+ case name
90
+ when :c
91
+ col_index = column_index(@current_cell[:r])
92
+
93
+ case @mode
94
+ when :array
95
+ @current_row[col_index] = format_cell
96
+ when :hash
97
+ @current_row[@headers[col_index]] = format_cell
98
+ end
99
+ when :row
100
+ if @row_index < 0
101
+ @row_index += 1
102
+ return
103
+ end
104
+
105
+ @row_index += 1
106
+
107
+ # Skip first row if we're in hash mode
108
+ return if @row_index == 1 && @mode == :hash
109
+
110
+ # Pad empty rows
111
+ while @row_index < @current_row_attrs[:r].to_i - @row_skip
112
+ @block.call(@empty_row)
113
+ @row_index += 1
114
+ end
115
+
116
+ # Do not return empty trailing rows
117
+ @block.call(@current_row) unless @row_index > @last_row - @row_skip
118
+ end
119
+ end
120
+ end
121
+ end
data/lib/xsv/styles_handler.rb ADDED
@@ -0,0 +1,58 @@
1
+ module Xsv
2
+ # StylesHandler interprets the relevant parts of styles.xml
3
+ class StylesHandler < Ox::Sax
4
+ def self.get_styles(io, numFmts)
5
+ @xfs = nil
6
+ @numFmts = nil
7
+ handler = new(numFmts) do |xfs, numFmts|
8
+ @xfs = xfs
9
+ @numFmts = numFmts
10
+ end
11
+
12
+ Ox.sax_parse(handler, io)
13
+ return @xfs, @numFmts
14
+ end
15
+
16
+ # Ox::Sax implementation
17
+
18
+ def initialize(numFmts, &block)
19
+ @block = block
20
+ @state = nil
21
+ @xfs = []
22
+ @numFmts = numFmts
23
+
24
+ @xf = {}
25
+ @numFmt = {}
26
+ end
27
+
28
+ def start_element(name)
29
+ case name
30
+ when :cellXfs, :numFmts
31
+ @state = name
32
+ when :xf
33
+ @xf = {}
34
+ when :numFmt
35
+ @numFmt = {}
36
+ end
37
+ end
38
+
39
+ def attr(name, value)
40
+ case @state
41
+ when :cellXfs
42
+ @xf[name] = value
43
+ when :numFmts
44
+ @numFmt[name] = value
45
+ end
46
+ end
47
+
48
+ def end_element(name)
49
+ if @state == :cellXfs && name == :xf
50
+ @xfs << @xf
51
+ elsif @state == :numFmts && name == :numFmt
52
+ @numFmts[@numFmt[:numFmtId].to_i] = @numFmt[:formatCode]
53
+ elsif name == :styleSheet
54
+ @block.call(@xfs, @numFmts)
55
+ end
56
+ end
57
+ end
58
+ end
data/lib/xsv/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Xsv
2
- VERSION = "0.2.3"
2
+ VERSION = "0.3.0"
3
3
  end
data/lib/xsv/workbook.rb CHANGED
@@ -4,22 +4,28 @@ require 'zip'
4
4
  module Xsv
5
5
  class Workbook
6
6
 
7
- attr_reader :sheets, :shared_strings, :xfs, :numFmts
7
+ attr_reader :sheets, :shared_strings, :xfs, :numFmts, :trim_empty_rows
8
8
 
9
9
  # Open the workbook of the given filename, string or buffer
10
- def self.open(data)
10
+ def self.open(data, **kws)
11
11
  if data.is_a?(IO)
12
- @workbook = self.new(Zip::File.open_buffer(data))
12
+ @workbook = self.new(Zip::File.open_buffer(data), kws)
13
13
  elsif data.start_with?("PK\x03\x04")
14
- @workbook = self.new(Zip::File.open_buffer(data))
14
+ @workbook = self.new(Zip::File.open_buffer(data), kws)
15
15
  else
16
- @workbook = self.new(Zip::File.open(data))
16
+ @workbook = self.new(Zip::File.open(data), kws)
17
17
  end
18
18
  end
19
19
 
20
20
  # Open a workbook from an instance of Zip::File
21
- def initialize(zip)
21
+ #
22
+ # Options:
23
+ #
24
+ # trim_empty_rows (false) Scan sheet for end of content and don't return trailing rows
25
+ #
26
+ def initialize(zip, trim_empty_rows: false)
22
27
  @zip = zip
28
+ @trim_empty_rows = trim_empty_rows
23
29
 
24
30
  @sheets = []
25
31
  @xfs = []
@@ -38,35 +44,22 @@ module Xsv
38
44
 
39
45
  def fetch_shared_strings
40
46
  stream = @zip.glob("xl/sharedStrings.xml").first.get_input_stream
41
- xml = Nokogiri::XML(stream)
42
- expected_count = xml.at_css("sst")["uniqueCount"].to_i
43
- @shared_strings = xml.css("sst si").map { |si| si.css("t").map(&:inner_text).join }
44
-
45
- if @shared_strings.count != expected_count
46
- raise Xsv::AssertionFailed, "Mismatch in shared strings count! #{expected_count} <> #{@shared_strings.count}"
47
- end
47
+ @shared_strings = SharedStringsParser.parse(stream)
48
48
 
49
49
  stream.close
50
50
  end
51
51
 
52
52
  def fetch_styles
53
53
  stream = @zip.glob("xl/styles.xml").first.get_input_stream
54
- xml = Nokogiri::XML(stream)
55
54
 
56
- xml.css("cellXfs xf").each do |xf|
57
- @xfs << xf.attributes.map { |k, v| [k.to_sym, v.value] }.to_h
58
- end
59
-
60
- xml.css("numFmts numFmt").each do |numFmt|
61
- @numFmts[numFmt["numFmtId"].to_i] = numFmt["formatCode"]
62
- end
55
+ @xfs, @numFmts = StylesHandler.get_styles(stream, @numFmts)
63
56
  end
64
57
 
65
58
  def fetch_sheets
66
59
  @zip.glob("xl/worksheets/sheet*.xml").sort do |a, b|
67
60
  a.name[/\d+/].to_i <=> b.name[/\d+/].to_i
68
61
  end.each do |entry|
69
- @sheets << Xsv::Sheet.new(self, Nokogiri::XML(entry.get_input_stream))
62
+ @sheets << Xsv::Sheet.new(self, entry.get_input_stream)
70
63
  end
71
64
  end
72
65
  end
data/xsv.gemspec CHANGED
@@ -34,7 +34,7 @@ Gem::Specification.new do |spec|
34
34
  spec.required_ruby_version = '~> 2.6'
35
35
 
36
36
  spec.add_dependency "rubyzip", "~> 2.2"
37
- spec.add_dependency "nokogiri", "~> 1.10"
37
+ spec.add_dependency "ox", "~> 2.13"
38
38
 
39
39
  spec.add_development_dependency "bundler", "~> 1.17"
40
40
  spec.add_development_dependency "rake", "~> 10.0"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: xsv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Martijn Storck
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-02-22 00:00:00.000000000 Z
11
+ date: 2020-02-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rubyzip
@@ -25,19 +25,19 @@ dependencies:
25
25
  - !ruby/object:Gem::Version
26
26
  version: '2.2'
27
27
  - !ruby/object:Gem::Dependency
28
- name: nokogiri
28
+ name: ox
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '1.10'
33
+ version: '2.13'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '1.10'
40
+ version: '2.13'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: bundler
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -98,7 +98,11 @@ files:
98
98
  - bin/setup
99
99
  - lib/xsv.rb
100
100
  - lib/xsv/helpers.rb
101
+ - lib/xsv/shared_strings_parser.rb
101
102
  - lib/xsv/sheet.rb
103
+ - lib/xsv/sheet_bounds_handler.rb
104
+ - lib/xsv/sheet_rows_handler.rb
105
+ - lib/xsv/styles_handler.rb
102
106
  - lib/xsv/version.rb
103
107
  - lib/xsv/workbook.rb
104
108
  - test.sh