xsv 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2eed2bd4654d89a75f817365a919f2ca4c8f4f9bee17e63f5fc938b574826696
4
- data.tar.gz: ddfa4e09f2d9378a79943cef340b1af20ae49764b64bcb28478b672d676e4565
3
+ metadata.gz: 47e4ee16a95b100a1c1bbc526912235bbd1386601a33e3c3449320fe4ea8bc52
4
+ data.tar.gz: 5b0f8320ff29a3dd036cf4396052cbcee00556c2f116dce63c81eb6f4bb69e2a
5
5
  SHA512:
6
- metadata.gz: 6b79348311d076d9397b7cd0629ac4502dc8a3410835b68682e4410af64dea2603d6ea2a68157e0a0bd933f9cdc3b3780034e645d7df9036d7696bda927b858d
7
- data.tar.gz: e40fa7237b12c4d83639f797ba38c9928e6d412bb658b0fa6855dc2bbca6004d89f36a306c74b207e453d2a5e3597b7e45fd76b464c1c5b3a45a716d09d876f0
6
+ metadata.gz: 40bde712c1df13d4fd330b24fdd3cb9e40e5983b4271c172b6f77245aa4d57b8aadc470be5faffb5c759e807c6bf5a507f87139279b86395cddf0fc70b5446fb
7
+ data.tar.gz: 943f5436b416f226f8bcfa2a856dc8079f061be20ca8cc534a9e147692906a280f85f6eb300b7c4e4681ce70a6c5bb81029c6abb70deab87950919e21f03104a
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- xsv (0.1.1)
4
+ xsv (0.1.2)
5
5
  nokogiri (~> 1.10)
6
6
  rubyzip (~> 2.2)
7
7
 
data/README.md CHANGED
@@ -1,8 +1,10 @@
1
1
  # Xsv .xlsx reader
2
2
 
3
3
  Xsv is a very basic parser for Excel files in the .xlsx format that strives to
4
- provide feature parity with common CSV readers and nothing more. This should
5
- allow for fast parsing of large worksheets with minimal RAM and CPU consumption.
4
+ provide feature parity with common CSV readers and nothing more. This means
5
+ it only parses values to basic Ruby types and does not deal with formatting
6
+ or more advanced functionality. The goal is to allow for fast parsing of large
7
+ worksheets with minimal RAM and CPU consumption.
6
8
 
7
9
  Xsv stands for 'Excel Separated Values' because Excel just gets in the way.
8
10
 
@@ -24,18 +26,48 @@ Or install it yourself as:
24
26
 
25
27
  ## Usage
26
28
 
29
+ Xsv has two modes of operation. By default it returns an array for
30
+ each row in the sheet:
31
+
27
32
  ```ruby
28
- x = Xsv::File.new("sheet.xlsx")
33
+ x = Xsv::Workbook.open("sheet.xlsx")
34
+
35
+ sheet = x.sheets[0]
29
36
 
30
- x.sheets[0].each_row(read_headers: true) do |row|
31
- row # => { "header1" => "value1", "header2", "value2" }
37
+ # Iterate over rows
38
+ sheet.each_row do |row|
39
+ row # => ["header1", "header2"], etc.
32
40
  end
33
- j
34
- x.sheets[0].each_row do |row|
35
- row # => ["header1", "header2"]
41
+
42
+ # Access row by index (zero-based)
43
+ sheet[1] # => ["value1", "value2"]
44
+ ```
45
+
46
+ Alternatively, it can load the headers from the first row and return a hash
47
+ for every row:
48
+
49
+ ```ruby
50
+ x = Xsv::Workbook.open("sheet.xlsx")
51
+
52
+ sheet = x.sheets[0]
53
+
54
+ sheet.mode # => :array
55
+
56
+ # Parse headers and switch to hash mode
57
+ sheet.parse_headers!
58
+
59
+ sheet.mode # => :hash
60
+
61
+ sheet.each_row do |row|
62
+ row # => {"header1" => "value1", "header2" => "value2"}, etc.
36
63
  end
64
+
65
+ sheet[1] # => {"header1" => "value1", "header2" => "value2"}
37
66
  ```
38
67
 
68
+ Be aware that hash mode will lead to unpredictable results if you have multiple
69
+ columns with the same name!
70
+
39
71
  ## Development
40
72
 
41
73
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
data/lib/xsv/helpers.rb CHANGED
@@ -1,5 +1,39 @@
1
1
  module Xsv
2
2
  module Helpers
# Built-in number format codes keyed by numFmtId. These ids are implied by
# the file format and are not listed in a workbook's own styles, so they
# have to be known up front to tell dates and times apart from plain numbers.
#
# The thousands separator is written without a trailing space ("#,##0"),
# which is how the format codes are defined.
#
# NOTE(review): deliberately not frozen, because Workbook assigns this hash
# directly as its format table. Freeze it once every caller works on a copy.
BUILT_IN_NUMBER_FORMATS = {
  1 => "0",
  2 => "0.00",
  3 => "#,##0",
  4 => "#,##0.00",
  5 => "$#,##0_);($#,##0)",
  6 => "$#,##0_);[Red]($#,##0)",
  7 => "$#,##0.00_);($#,##0.00)",
  8 => "$#,##0.00_);[Red]($#,##0.00)",
  9 => "0%",
  10 => "0.00%",
  11 => "0.00E+00",
  12 => "# ?/?",
  13 => "# ??/??",
  14 => "m/d/yyyy",
  15 => "d-mmm-yy",
  16 => "d-mmm",
  17 => "mmm-yy",
  18 => "h:mm AM/PM",
  19 => "h:mm:ss AM/PM",
  20 => "h:mm",
  21 => "h:mm:ss",
  22 => "m/d/yyyy h:mm",
  37 => "#,##0_);(#,##0)",
  38 => "#,##0_);[Red](#,##0)",
  39 => "#,##0.00_);(#,##0.00)",
  40 => "#,##0.00_);[Red](#,##0.00)",
  45 => "mm:ss",
  46 => "[h]:mm:ss",
  47 => "mm:ss.0",
  48 => "##0.0E+0",
  49 => "@",
}
36
+
3
37
  # Return the index number for the given Excel column name
4
38
  def column_index(col)
5
39
  val = 0
@@ -10,5 +44,42 @@ module Xsv
10
44
  end
11
45
  return val - 1
12
46
  end
47
+
# Return a Date for the given Excel date value
#
# number is a day count relative to 1899-12-30. Values coming from
# date-time cells are Floats whose fractional part is the time of day; that
# part is dropped so the result is a whole calendar day that compares equal
# to a plain Date for the same day.
def parse_date(number)
  Date.new(1899, 12, 30) + number.floor
end
52
+
# Return a time as a string for the given Excel time value
#
# number is a fraction of a day (0.5 is noon). The result is "HH:MM",
# rounded to the nearest minute. Rounding is applied to the total number of
# minutes before splitting into hours and minutes, so a value such as
# 12:59:45 carries over to "13:00" instead of producing "12:60".
def parse_time(number)
  total_minutes = (number * 24 * 60).round
  hours, minutes = total_minutes.divmod(60)

  "%02d:%02d" % [hours, minutes]
end
62
+
# Convert the raw text of a numeric cell into an Integer or a Float
#
# Text containing a decimal point or an exponent marker ("1.5", "1E-3")
# becomes a Float; everything else becomes an Integer. Checking for the
# exponent matters because "1E-3".to_i would silently yield 1.
def parse_number(string)
  if string.match?(/[.eE]/)
    string.to_f
  else
    string.to_i
  end
end
70
+
71
+ # Tests if the given format string is a date
72
+ def is_date_format?(format)
73
+ return false if format.nil?
74
+ # If it contains at least 2 sequences of d's, m's or y's it's a date!
75
+ format.scan(/[dmy]+/).length > 1
76
+ end
77
+
# Tests if the given format string is a time
#
# Returns false for nil. Text inside double quotes is literal output, not a
# format token, so it is removed before looking for time tokens; otherwise
# a numeric format such as `0 "hours"` would be mistaken for a time.
def is_time_format?(format)
  return false if format.nil?

  tokens = format.gsub(/"[^"]*"/, "")

  # If it contains at least 2 sequences of h's, m's or s's it's a time!
  tokens.scan(/[hms]+/).length > 1
end
13
84
  end
14
85
  end
data/lib/xsv/sheet.rb CHANGED
@@ -2,25 +2,37 @@ module Xsv
2
2
  class Sheet
3
3
  include Xsv::Helpers
4
4
 
5
- attr_reader :xml
5
+ attr_reader :xml, :mode
6
6
 
# Wrap the parsed XML of a single worksheet.
#
# workbook is the owning workbook and xml is the parsed worksheet document.
# The sheet starts in :array mode with no headers loaded.
def initialize(workbook, xml)
  @workbook = workbook
  @xml = xml
  @headers = []

  # Determine number of columns from the min/max bounds of the <col>
  # definitions. A sheet without any <col> elements yields no bounds at all;
  # fall back to 0 columns instead of calling max/min arithmetic on nil.
  bounds = @xml.css("cols col").flat_map { |c| [c["min"].to_i, c["max"].to_i] }
  @column_count = bounds.empty? ? 0 : (bounds.max - bounds.min) + 1

  @mode = :array
end
12
18
 
13
19
  def inspect
14
20
  "#<#{self.class.name}:#{self.object_id}>"
15
21
  end
16
22
 
17
- # Iterate over rows. Returns an array if read_headers is false, or a hash
18
- # with first row values as keys if read_headers is true
19
- def each_row(read_headers: false)
20
- @parse_headers if read_headers
23
+ # Iterate over rows
24
+ def each_row
25
+ row_index = 0
26
+ @xml.css("sheetData row").each do |row_xml|
27
+ row_index += 1
21
28
 
22
- @xml.css("sheetData row").each_with_index do |row_xml, i|
23
- next if i == 0 && @headers.any?
29
+ next if row_index == 1 && @mode == :hash
30
+
31
+ # pad empty rows
32
+ while row_index < row_xml["r"].to_i do
33
+ yield(empty_row)
34
+ row_index += 1
35
+ end
24
36
 
25
37
  yield(parse_row(row_xml))
26
38
  end
@@ -30,14 +42,23 @@ module Xsv
30
42
 
31
43
  # Get row by number, starting at 0
32
44
  def [](number)
33
- parse_row(@xml.css("sheetData row:nth-child(#{number + 1})").first)
45
+ row_xml = xml.css("sheetData row[r=#{number + 1}]").first
46
+
47
+ if row_xml
48
+ parse_row(row_xml)
49
+ else
50
+ empty_row
51
+ end
34
52
  end
35
53
 
36
54
  # Load headers in the top row of the worksheet. After parsing of headers
37
55
  # all methods return hashes instead of arrays
38
56
  def parse_headers!
57
+ @mode = :array
39
58
  parse_headers
40
59
 
60
+ @mode = :hash
61
+
41
62
  true
42
63
  end
43
64
 
@@ -47,14 +68,17 @@ module Xsv
47
68
  @headers = parse_row(@xml.css("sheetData row").first)
48
69
  end
49
70
 
50
- def parse_row(xml)
51
- if @headers.any?
52
- row = {}
53
- else
54
- row = []
71
+ def empty_row
72
+ case @mode
73
+ when :array
74
+ [nil] * @column_count
75
+ when :hash
76
+ @headers.zip([]).to_h
55
77
  end
78
+ end
56
79
 
57
- next_index = 0
80
+ def parse_row(xml)
81
+ row = empty_row
58
82
 
59
83
  xml.css("c").each do |c_xml|
60
84
  value = case c_xml["t"]
@@ -65,7 +89,23 @@ module Xsv
65
89
  when "e" # N/A
66
90
  nil
67
91
  when nil
68
- c_xml.css("v").inner_text.to_i
92
+ value = parse_number(c_xml.css("v").inner_text)
93
+
94
+ if c_xml["s"]
95
+ style = @workbook.xfs[c_xml["s"].to_i]
96
+ numFmtId = style[:numFmtId].to_i
97
+ if numFmtId == 0
98
+ value
99
+ elsif is_date_format?(@workbook.numFmts[numFmtId])
100
+ parse_date(value)
101
+ elsif is_time_format?(@workbook.numFmts[numFmtId])
102
+ parse_time(value)
103
+ else
104
+ value
105
+ end
106
+ else
107
+ value
108
+ end
69
109
  else
70
110
  raise Xsv::Error, "Encountered unknown column type #{c_xml["t"]}"
71
111
  end
@@ -73,22 +113,12 @@ module Xsv
73
113
  # Determine column position and pad row with nil values
74
114
  col_index = column_index(c_xml["r"].scan(/^[A-Z]+/).first)
75
115
 
76
- (col_index - next_index).times do
77
- if @headers.any?
78
- row[@headers[next_index]] = nil
79
- else
80
- row << nil
81
- end
82
- next_index += 1
83
- end
84
-
85
- if @headers.any?
86
- row[@headers[next_index]] = value
87
- else
88
- row << value
116
+ case @mode
117
+ when :array
118
+ row[col_index] = value
119
+ when :hash
120
+ row[@headers[col_index]] = value
89
121
  end
90
-
91
- next_index += 1
92
122
  end
93
123
 
94
124
  row
data/lib/xsv/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Xsv
2
- VERSION = "0.1.2"
2
+ VERSION = "0.2.0"
3
3
  end
@@ -2,14 +2,25 @@ require 'nokogiri'
2
2
  require 'zip'
3
3
 
4
4
  module Xsv
5
- class File
5
+ class Workbook
6
6
 
7
- attr_reader :sheets, :shared_strings
7
+ attr_reader :sheets, :shared_strings, :xfs, :numFmts
8
+
# Open the workbook of the given filename
#
# Returns a new workbook built on a Zip::File opened from file. Nothing is
# cached on the class, so each call is independent and the returned
# instance is the only reference to the workbook.
def self.open(file)
  new(Zip::File.open(file))
end
13
+
14
+ # Open a workbook from an instance of Zip::File
15
+ def initialize(zip)
16
+ @zip = zip
8
17
 
9
- def initialize(file)
10
- @zip = Zip::File.open(file)
11
18
  @sheets = []
19
+ @xfs = []
20
+ @numFmts = Xsv::Helpers::BUILT_IN_NUMBER_FORMATS
21
+
12
22
  fetch_shared_strings
23
+ fetch_styles
13
24
  fetch_sheets
14
25
  end
15
26
 
@@ -17,11 +28,6 @@ module Xsv
17
28
  "#<#{self.class.name}:#{self.object_id}>"
18
29
  end
19
30
 
20
- def close
21
- # FIXME @sheets.each { |s| s.xml.close }
22
- @zip.close
23
- end
24
-
25
31
  private
26
32
 
27
33
  def fetch_shared_strings
@@ -37,6 +43,19 @@ module Xsv
37
43
  stream.close
38
44
  end
39
45
 
# Load cell styles and custom number formats from xl/styles.xml.
#
# Appends one hash per <xf> in cellXfs to @xfs (attribute names as symbol
# keys, values as strings) and adds the workbook's own <numFmt> entries to
# @numFmts, keyed by integer numFmtId.
def fetch_styles
  entry = @zip.glob("xl/styles.xml").first
  # NOTE(review): presumably a package can lack a styles part; keep the
  # defaults in that case rather than failing on nil. Confirm with real files.
  return if entry.nil?

  stream = entry.get_input_stream
  begin
    xml = Nokogiri::XML(stream)
  ensure
    # The document is fully parsed at this point, so release the stream.
    stream.close
  end

  xml.css("cellXfs xf").each do |xf|
    @xfs << xf.attributes.map { |k, v| [k.to_sym, v.value] }.to_h
  end

  custom_formats = xml.css("numFmts numFmt")
  return if custom_formats.empty?

  # Work on a copy: @numFmts may be the shared table of built-in formats,
  # and this workbook's custom formats must not leak into other workbooks.
  @numFmts = @numFmts.dup
  custom_formats.each do |num_fmt|
    @numFmts[num_fmt["numFmtId"].to_i] = num_fmt["formatCode"]
  end
end
58
+
40
59
  def fetch_sheets
41
60
  @zip.glob("xl/worksheets/sheet*.xml").sort do |entry|
42
61
  entry.name.scan(/\d+/).first.to_i
data/lib/xsv.rb CHANGED
@@ -1,7 +1,9 @@
1
- require "xsv/file"
1
+ require "date"
2
+
2
3
  require "xsv/helpers"
3
4
  require "xsv/sheet"
4
5
  require "xsv/version"
6
+ require "xsv/workbook"
5
7
 
6
8
  module Xsv
7
9
  class Error < StandardError; end
data/test.sh ADDED
@@ -0,0 +1,3 @@
1
+ #!/bin/sh
2
+
3
+ ruby -Ilib:test test/*_test.rb
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: xsv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Martijn Storck
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-02-18 00:00:00.000000000 Z
11
+ date: 2020-02-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rubyzip
@@ -97,10 +97,11 @@ files:
97
97
  - bin/console
98
98
  - bin/setup
99
99
  - lib/xsv.rb
100
- - lib/xsv/file.rb
101
100
  - lib/xsv/helpers.rb
102
101
  - lib/xsv/sheet.rb
103
102
  - lib/xsv/version.rb
103
+ - lib/xsv/workbook.rb
104
+ - test.sh
104
105
  - xsv.gemspec
105
106
  homepage: https://github.com/martijn/xsv
106
107
  licenses: