xsv 0.1.2 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2eed2bd4654d89a75f817365a919f2ca4c8f4f9bee17e63f5fc938b574826696
4
- data.tar.gz: ddfa4e09f2d9378a79943cef340b1af20ae49764b64bcb28478b672d676e4565
3
+ metadata.gz: 47e4ee16a95b100a1c1bbc526912235bbd1386601a33e3c3449320fe4ea8bc52
4
+ data.tar.gz: 5b0f8320ff29a3dd036cf4396052cbcee00556c2f116dce63c81eb6f4bb69e2a
5
5
  SHA512:
6
- metadata.gz: 6b79348311d076d9397b7cd0629ac4502dc8a3410835b68682e4410af64dea2603d6ea2a68157e0a0bd933f9cdc3b3780034e645d7df9036d7696bda927b858d
7
- data.tar.gz: e40fa7237b12c4d83639f797ba38c9928e6d412bb658b0fa6855dc2bbca6004d89f36a306c74b207e453d2a5e3597b7e45fd76b464c1c5b3a45a716d09d876f0
6
+ metadata.gz: 40bde712c1df13d4fd330b24fdd3cb9e40e5983b4271c172b6f77245aa4d57b8aadc470be5faffb5c759e807c6bf5a507f87139279b86395cddf0fc70b5446fb
7
+ data.tar.gz: 943f5436b416f226f8bcfa2a856dc8079f061be20ca8cc534a9e147692906a280f85f6eb300b7c4e4681ce70a6c5bb81029c6abb70deab87950919e21f03104a
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- xsv (0.1.1)
4
+ xsv (0.1.2)
5
5
  nokogiri (~> 1.10)
6
6
  rubyzip (~> 2.2)
7
7
 
data/README.md CHANGED
@@ -1,8 +1,10 @@
1
1
  # Xsv .xlsx reader
2
2
 
3
3
  Xsv is a very basic parser for Excel files in the .xlsx format that strives to
4
- provide feature parity with common CSV readers and nothing more. This should
5
- allow for fast parsing of large worksheets with minimal RAM and CPU consumption.
4
+ provide feature parity with common CSV readers and nothing more. This means
5
+ it only parses values to basic Ruby types and does not deal with formatting
6
+ or more advanced functionality. The goal is to allow for fast parsing of large
7
+ worksheets with minimal RAM and CPU consumption.
6
8
 
7
9
  Xsv stands for 'Excel Separated Values' because Excel just gets in the way.
8
10
 
@@ -24,18 +26,48 @@ Or install it yourself as:
24
26
 
25
27
  ## Usage
26
28
 
29
+ Xsv has two modes of operation. By default it returns an array for
30
+ each row in the sheet:
31
+
27
32
  ```ruby
28
- x = Xsv::File.new("sheet.xlsx")
33
+ x = Xsv::Workbook.open("sheet.xlsx")
34
+
35
+ sheet = x.sheets[0]
29
36
 
30
- x.sheets[0].each_row(read_headers: true) do |row|
31
- row # => { "header1" => "value1", "header2", "value2" }
37
+ # Iterate over rows
38
+ sheet.each_row do |row|
39
+ row # => ["header1", "header2"], etc.
32
40
  end
33
- j
34
- x.sheets[0].each_row do |row|
35
- row # => ["header1", "header2"]
41
+
42
+ # Access row by index (zero-based)
43
+ sheet[1] # => ["value1", "value2"]
44
+ ```
45
+
46
+ Alternatively, it can load the headers from the first row and return a hash
47
+ for every row:
48
+
49
+ ```ruby
50
+ x = Xsv::Workbook.open("sheet.xlsx")
51
+
52
+ sheet = x.sheets[0]
53
+
54
+ sheet.mode # => :array
55
+
56
+ # Parse headers and switch to hash mode
57
+ sheet.parse_headers!
58
+
59
+ sheet.mode # => :hash
60
+
61
+ sheet.each_row do |row|
62
+ row # => {"header1" => "value1", "header2" => "value2"}, etc.
36
63
  end
64
+
65
+ sheet[1] # => {"header1" => "value1", "header2" => "value2"}
37
66
  ```
38
67
 
68
+ Be aware that hash mode will lead to unpredictable results if you have multiple
69
+ columns with the same name!
70
+
39
71
  ## Development
40
72
 
41
73
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
data/lib/xsv/helpers.rb CHANGED
@@ -1,5 +1,39 @@
1
1
  module Xsv
2
2
  module Helpers
3
+ BUILT_IN_NUMBER_FORMATS = {
4
+ 1 => "0",
5
+ 2 => "0.00",
6
+ 3 => "#, ##0",
7
+ 4 => "#, ##0.00",
8
+ 5 => "$#, ##0_);($#, ##0)",
9
+ 6 => "$#, ##0_);[Red]($#, ##0)",
10
+ 7 => "$#, ##0.00_);($#, ##0.00)",
11
+ 8 => "$#, ##0.00_);[Red]($#, ##0.00)",
12
+ 9 => "0%",
13
+ 10 => "0.00%",
14
+ 11 => "0.00E+00",
15
+ 12 => "# ?/?",
16
+ 13 => "# ??/??",
17
+ 14 => "m/d/yyyy",
18
+ 15 => "d-mmm-yy",
19
+ 16 => "d-mmm",
20
+ 17 => "mmm-yy",
21
+ 18 => "h:mm AM/PM",
22
+ 19 => "h:mm:ss AM/PM",
23
+ 20 => "h:mm",
24
+ 21 => "h:mm:ss",
25
+ 22 => "m/d/yyyy h:mm",
26
+ 37 => "#, ##0_);(#, ##0)",
27
+ 38 => "#, ##0_);[Red](#, ##0)",
28
+ 39 => "#, ##0.00_);(#, ##0.00)",
29
+ 40 => "#, ##0.00_);[Red](#, ##0.00)",
30
+ 45 => "mm:ss",
31
+ 46 => "[h]:mm:ss",
32
+ 47 => "mm:ss.0",
33
+ 48 => "##0.0E+0",
34
+ 49 => "@",
35
+ }
36
+
3
37
  # Return the index number for the given Excel column name
4
38
  def column_index(col)
5
39
  val = 0
@@ -10,5 +44,42 @@ module Xsv
10
44
  end
11
45
  return val - 1
12
46
  end
47
+
48
+ # Return a Date for the given Excel date value
49
+ def parse_date(number)
50
+ Date.new(1899, 12, 30) + number
51
+ end
52
+
53
+ # Return a time as a string for the given Excel time value
54
+ def parse_time(number)
55
+ base = number * 24
56
+
57
+ hours = base.truncate
58
+ minutes = (base - hours) * 60
59
+
60
+ "%02d:%02d" % [base, minutes.round]
61
+ end
62
+
63
+ def parse_number(string)
64
+ if string.include? "."
65
+ string.to_f
66
+ else
67
+ string.to_i
68
+ end
69
+ end
70
+
71
+ # Tests if the given format string is a date
72
+ def is_date_format?(format)
73
+ return false if format.nil?
74
+ # If it contains at least 2 sequences of d's, m's or y's it's a date!
75
+ format.scan(/[dmy]+/).length > 1
76
+ end
77
+
78
+ # Tests if the given format string is a time
79
+ def is_time_format?(format)
80
+ return false if format.nil?
81
+ # If it contains at least 2 sequences of h's, m's or s's it's a time!
82
+ format.scan(/[hms]+/).length > 1
83
+ end
13
84
  end
14
85
  end
data/lib/xsv/sheet.rb CHANGED
@@ -2,25 +2,37 @@ module Xsv
2
2
  class Sheet
3
3
  include Xsv::Helpers
4
4
 
5
- attr_reader :xml
5
+ attr_reader :xml, :mode
6
6
 
7
7
  def initialize(workbook, xml)
8
8
  @workbook = workbook
9
9
  @xml = xml
10
10
  @headers = []
11
+
12
+ # Determine number of columns
13
+ bounds = @xml.css("cols col").map { |c| [c["min"].to_i, c["max"].to_i] }.flatten
14
+ @column_count = (bounds.max - bounds.min) + 1
15
+
16
+ @mode = :array
11
17
  end
12
18
 
13
19
  def inspect
14
20
  "#<#{self.class.name}:#{self.object_id}>"
15
21
  end
16
22
 
17
- # Iterate over rows. Returns an array if read_headers is false, or a hash
18
- # with first row values as keys if read_headers is true
19
- def each_row(read_headers: false)
20
- @parse_headers if read_headers
23
+ # Iterate over rows
24
+ def each_row
25
+ row_index = 0
26
+ @xml.css("sheetData row").each do |row_xml|
27
+ row_index += 1
21
28
 
22
- @xml.css("sheetData row").each_with_index do |row_xml, i|
23
- next if i == 0 && @headers.any?
29
+ next if row_index == 1 && @mode == :hash
30
+
31
+ # pad empty rows
32
+ while row_index < row_xml["r"].to_i do
33
+ yield(empty_row)
34
+ row_index += 1
35
+ end
24
36
 
25
37
  yield(parse_row(row_xml))
26
38
  end
@@ -30,14 +42,23 @@ module Xsv
30
42
 
31
43
  # Get row by number, starting at 0
32
44
  def [](number)
33
- parse_row(@xml.css("sheetData row:nth-child(#{number + 1})").first)
45
+ row_xml = xml.css("sheetData row[r=#{number + 1}]").first
46
+
47
+ if row_xml
48
+ parse_row(row_xml)
49
+ else
50
+ empty_row
51
+ end
34
52
  end
35
53
 
36
54
  # Load headers in the top row of the worksheet. After parsing of headers
37
55
  # all methods return hashes instead of arrays
38
56
  def parse_headers!
57
+ @mode = :array
39
58
  parse_headers
40
59
 
60
+ @mode = :hash
61
+
41
62
  true
42
63
  end
43
64
 
@@ -47,14 +68,17 @@ module Xsv
47
68
  @headers = parse_row(@xml.css("sheetData row").first)
48
69
  end
49
70
 
50
- def parse_row(xml)
51
- if @headers.any?
52
- row = {}
53
- else
54
- row = []
71
+ def empty_row
72
+ case @mode
73
+ when :array
74
+ [nil] * @column_count
75
+ when :hash
76
+ @headers.zip([]).to_h
55
77
  end
78
+ end
56
79
 
57
- next_index = 0
80
+ def parse_row(xml)
81
+ row = empty_row
58
82
 
59
83
  xml.css("c").each do |c_xml|
60
84
  value = case c_xml["t"]
@@ -65,7 +89,23 @@ module Xsv
65
89
  when "e" # N/A
66
90
  nil
67
91
  when nil
68
- c_xml.css("v").inner_text.to_i
92
+ value = parse_number(c_xml.css("v").inner_text)
93
+
94
+ if c_xml["s"]
95
+ style = @workbook.xfs[c_xml["s"].to_i]
96
+ numFmtId = style[:numFmtId].to_i
97
+ if numFmtId == 0
98
+ value
99
+ elsif is_date_format?(@workbook.numFmts[numFmtId])
100
+ parse_date(value)
101
+ elsif is_time_format?(@workbook.numFmts[numFmtId])
102
+ parse_time(value)
103
+ else
104
+ value
105
+ end
106
+ else
107
+ value
108
+ end
69
109
  else
70
110
  raise Xsv::Error, "Encountered unknown column type #{c_xml["t"]}"
71
111
  end
@@ -73,22 +113,12 @@ module Xsv
73
113
  # Determine column position and pad row with nil values
74
114
  col_index = column_index(c_xml["r"].scan(/^[A-Z]+/).first)
75
115
 
76
- (col_index - next_index).times do
77
- if @headers.any?
78
- row[@headers[next_index]] = nil
79
- else
80
- row << nil
81
- end
82
- next_index += 1
83
- end
84
-
85
- if @headers.any?
86
- row[@headers[next_index]] = value
87
- else
88
- row << value
116
+ case @mode
117
+ when :array
118
+ row[col_index] = value
119
+ when :hash
120
+ row[@headers[col_index]] = value
89
121
  end
90
-
91
- next_index += 1
92
122
  end
93
123
 
94
124
  row
data/lib/xsv/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Xsv
2
- VERSION = "0.1.2"
2
+ VERSION = "0.2.0"
3
3
  end
@@ -2,14 +2,25 @@ require 'nokogiri'
2
2
  require 'zip'
3
3
 
4
4
  module Xsv
5
- class File
5
+ class Workbook
6
6
 
7
- attr_reader :sheets, :shared_strings
7
+ attr_reader :sheets, :shared_strings, :xfs, :numFmts
8
+
9
+ # Open the workbook of the given filename
10
+ def self.open(file)
11
+ @workbook = self.new(Zip::File.open(file))
12
+ end
13
+
14
+ # Open a workbook from an instance of Zip::File
15
+ def initialize(zip)
16
+ @zip = zip
8
17
 
9
- def initialize(file)
10
- @zip = Zip::File.open(file)
11
18
  @sheets = []
19
+ @xfs = []
20
+ @numFmts = Xsv::Helpers::BUILT_IN_NUMBER_FORMATS
21
+
12
22
  fetch_shared_strings
23
+ fetch_styles
13
24
  fetch_sheets
14
25
  end
15
26
 
@@ -17,11 +28,6 @@ module Xsv
17
28
  "#<#{self.class.name}:#{self.object_id}>"
18
29
  end
19
30
 
20
- def close
21
- # FIXME @sheets.each { |s| s.xml.close }
22
- @zip.close
23
- end
24
-
25
31
  private
26
32
 
27
33
  def fetch_shared_strings
@@ -37,6 +43,19 @@ module Xsv
37
43
  stream.close
38
44
  end
39
45
 
46
+ def fetch_styles
47
+ stream = @zip.glob("xl/styles.xml").first.get_input_stream
48
+ xml = Nokogiri::XML(stream)
49
+
50
+ xml.css("cellXfs xf").each do |xf|
51
+ @xfs << xf.attributes.map { |k, v| [k.to_sym, v.value] }.to_h
52
+ end
53
+
54
+ xml.css("numFmts numFmt").each do |numFmt|
55
+ @numFmts[numFmt["numFmtId"].to_i] = numFmt["formatCode"]
56
+ end
57
+ end
58
+
40
59
  def fetch_sheets
41
60
  @zip.glob("xl/worksheets/sheet*.xml").sort do |entry|
42
61
  entry.name.scan(/\d+/).first.to_i
data/lib/xsv.rb CHANGED
@@ -1,7 +1,9 @@
1
- require "xsv/file"
1
+ require "date"
2
+
2
3
  require "xsv/helpers"
3
4
  require "xsv/sheet"
4
5
  require "xsv/version"
6
+ require "xsv/workbook"
5
7
 
6
8
  module Xsv
7
9
  class Error < StandardError; end
data/test.sh ADDED
@@ -0,0 +1,3 @@
1
+ #!/bin/sh
2
+
3
+ ruby -Ilib:test test/*_test.rb
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: xsv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Martijn Storck
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-02-18 00:00:00.000000000 Z
11
+ date: 2020-02-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rubyzip
@@ -97,10 +97,11 @@ files:
97
97
  - bin/console
98
98
  - bin/setup
99
99
  - lib/xsv.rb
100
- - lib/xsv/file.rb
101
100
  - lib/xsv/helpers.rb
102
101
  - lib/xsv/sheet.rb
103
102
  - lib/xsv/version.rb
103
+ - lib/xsv/workbook.rb
104
+ - test.sh
104
105
  - xsv.gemspec
105
106
  homepage: https://github.com/martijn/xsv
106
107
  licenses: