xsv 0.1.2 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +40 -8
- data/lib/xsv/helpers.rb +71 -0
- data/lib/xsv/sheet.rb +60 -30
- data/lib/xsv/version.rb +1 -1
- data/lib/xsv/{file.rb → workbook.rb} +28 -9
- data/lib/xsv.rb +3 -1
- data/test.sh +3 -0
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 47e4ee16a95b100a1c1bbc526912235bbd1386601a33e3c3449320fe4ea8bc52
|
4
|
+
data.tar.gz: 5b0f8320ff29a3dd036cf4396052cbcee00556c2f116dce63c81eb6f4bb69e2a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 40bde712c1df13d4fd330b24fdd3cb9e40e5983b4271c172b6f77245aa4d57b8aadc470be5faffb5c759e807c6bf5a507f87139279b86395cddf0fc70b5446fb
|
7
|
+
data.tar.gz: 943f5436b416f226f8bcfa2a856dc8079f061be20ca8cc534a9e147692906a280f85f6eb300b7c4e4681ce70a6c5bb81029c6abb70deab87950919e21f03104a
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
# Xsv .xlsx reader
|
2
2
|
|
3
3
|
Xsv is a very basic parser for Excel files in the .xlsx format that strives to
|
4
|
-
provide feature parity with common CSV readers and nothing more. This
|
5
|
-
|
4
|
+
provide feature parity with common CSV readers and nothing more. This means
|
5
|
+
it only parses values to basic Ruby types and does not deal with formatting
|
6
|
+
or more advanced functionality. The goal is to allow for fast parsing of large
|
7
|
+
worksheets with minimal RAM and CPU consumption.
|
6
8
|
|
7
9
|
Xsv stands for 'Excel Separated Values' because Excel just gets in the way.
|
8
10
|
|
@@ -24,18 +26,48 @@ Or install it yourself as:
|
|
24
26
|
|
25
27
|
## Usage
|
26
28
|
|
29
|
+
Xsv has two modes of operation. By default it returns an array for
|
30
|
+
each row in the sheet:
|
31
|
+
|
27
32
|
```ruby
|
28
|
-
x = Xsv::
|
33
|
+
x = Xsv::Workbook.open("sheet.xlsx")
|
34
|
+
|
35
|
+
sheet = x.sheets[0]
|
29
36
|
|
30
|
-
|
31
|
-
|
37
|
+
# Iterate over rows
|
38
|
+
sheet.each_row do |row|
|
39
|
+
row # => ["header1", "header2"], etc.
|
32
40
|
end
|
33
|
-
|
34
|
-
|
35
|
-
|
41
|
+
|
42
|
+
# Access row by index (zero-based)
|
43
|
+
sheet[1] # => ["value1", "value2"]
|
44
|
+
```
|
45
|
+
|
46
|
+
Alternatively, it can load the headers from the first row and return a hash
|
47
|
+
for every row:
|
48
|
+
|
49
|
+
```ruby
|
50
|
+
x = Xsv::Workbook.open("sheet.xlsx")
|
51
|
+
|
52
|
+
sheet = x.sheets[0]
|
53
|
+
|
54
|
+
sheet.mode # => :array
|
55
|
+
|
56
|
+
# Parse headers and switch to hash mode
|
57
|
+
sheet.parse_headers!
|
58
|
+
|
59
|
+
sheet.mode # => :hash
|
60
|
+
|
61
|
+
sheet.each_row do |row|
|
62
|
+
row # => {"header1" => "value1", "header2" => "value2"}, etc.
|
36
63
|
end
|
64
|
+
|
65
|
+
sheet[1] # => {"header1" => "value1", "header2" => "value2"}
|
37
66
|
```
|
38
67
|
|
68
|
+
Be aware that hash mode will lead to unpredictable results if you have multiple
|
69
|
+
columns with the same name!
|
70
|
+
|
39
71
|
## Development
|
40
72
|
|
41
73
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/lib/xsv/helpers.rb
CHANGED
@@ -1,5 +1,39 @@
|
|
1
1
|
module Xsv
|
2
2
|
module Helpers
|
3
|
+
BUILT_IN_NUMBER_FORMATS = {
|
4
|
+
1 => "0",
|
5
|
+
2 => "0.00",
|
6
|
+
3 => "#, ##0",
|
7
|
+
4 => "#, ##0.00",
|
8
|
+
5 => "$#, ##0_);($#, ##0)",
|
9
|
+
6 => "$#, ##0_);[Red]($#, ##0)",
|
10
|
+
7 => "$#, ##0.00_);($#, ##0.00)",
|
11
|
+
8 => "$#, ##0.00_);[Red]($#, ##0.00)",
|
12
|
+
9 => "0%",
|
13
|
+
10 => "0.00%",
|
14
|
+
11 => "0.00E+00",
|
15
|
+
12 => "# ?/?",
|
16
|
+
13 => "# ??/??",
|
17
|
+
14 => "m/d/yyyy",
|
18
|
+
15 => "d-mmm-yy",
|
19
|
+
16 => "d-mmm",
|
20
|
+
17 => "mmm-yy",
|
21
|
+
18 => "h:mm AM/PM",
|
22
|
+
19 => "h:mm:ss AM/PM",
|
23
|
+
20 => "h:mm",
|
24
|
+
21 => "h:mm:ss",
|
25
|
+
22 => "m/d/yyyy h:mm",
|
26
|
+
37 => "#, ##0_);(#, ##0)",
|
27
|
+
38 => "#, ##0_);[Red](#, ##0)",
|
28
|
+
39 => "#, ##0.00_);(#, ##0.00)",
|
29
|
+
40 => "#, ##0.00_);[Red](#, ##0.00)",
|
30
|
+
45 => "mm:ss",
|
31
|
+
46 => "[h]:mm:ss",
|
32
|
+
47 => "mm:ss.0",
|
33
|
+
48 => "##0.0E+0",
|
34
|
+
49 => "@",
|
35
|
+
}
|
36
|
+
|
3
37
|
# Return the index number for the given Excel column name
|
4
38
|
def column_index(col)
|
5
39
|
val = 0
|
@@ -10,5 +44,42 @@ module Xsv
|
|
10
44
|
end
|
11
45
|
return val - 1
|
12
46
|
end
|
47
|
+
|
48
|
+
# Return a Date for the given Excel date value
|
49
|
+
def parse_date(number)
|
50
|
+
Date.new(1899, 12, 30) + number
|
51
|
+
end
|
52
|
+
|
53
|
+
# Return a time as a string for the given Excel time value
|
54
|
+
def parse_time(number)
|
55
|
+
base = number * 24
|
56
|
+
|
57
|
+
hours = base.truncate
|
58
|
+
minutes = (base - hours) * 60
|
59
|
+
|
60
|
+
"%02d:%02d" % [base, minutes.round]
|
61
|
+
end
|
62
|
+
|
63
|
+
def parse_number(string)
|
64
|
+
if string.include? "."
|
65
|
+
string.to_f
|
66
|
+
else
|
67
|
+
string.to_i
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
# Tests if the given format string is a date
|
72
|
+
def is_date_format?(format)
|
73
|
+
return false if format.nil?
|
74
|
+
# If it contains at least 2 sequences of d's, m's or y's it's a date!
|
75
|
+
format.scan(/[dmy]+/).length > 1
|
76
|
+
end
|
77
|
+
|
78
|
+
# Tests if the given format string is a time
|
79
|
+
def is_time_format?(format)
|
80
|
+
return false if format.nil?
|
81
|
+
# If it contains at least 2 sequences of h's, m's or s's it's a time!
|
82
|
+
format.scan(/[hms]+/).length > 1
|
83
|
+
end
|
13
84
|
end
|
14
85
|
end
|
data/lib/xsv/sheet.rb
CHANGED
@@ -2,25 +2,37 @@ module Xsv
|
|
2
2
|
class Sheet
|
3
3
|
include Xsv::Helpers
|
4
4
|
|
5
|
-
attr_reader :xml
|
5
|
+
attr_reader :xml, :mode
|
6
6
|
|
7
7
|
def initialize(workbook, xml)
|
8
8
|
@workbook = workbook
|
9
9
|
@xml = xml
|
10
10
|
@headers = []
|
11
|
+
|
12
|
+
# Determine number of columns
|
13
|
+
bounds = @xml.css("cols col").map { |c| [c["min"].to_i, c["max"].to_i] }.flatten
|
14
|
+
@column_count = (bounds.max - bounds.min) + 1
|
15
|
+
|
16
|
+
@mode = :array
|
11
17
|
end
|
12
18
|
|
13
19
|
def inspect
|
14
20
|
"#<#{self.class.name}:#{self.object_id}>"
|
15
21
|
end
|
16
22
|
|
17
|
-
# Iterate over rows
|
18
|
-
|
19
|
-
|
20
|
-
@
|
23
|
+
# Iterate over rows
|
24
|
+
def each_row
|
25
|
+
row_index = 0
|
26
|
+
@xml.css("sheetData row").each do |row_xml|
|
27
|
+
row_index += 1
|
21
28
|
|
22
|
-
|
23
|
-
|
29
|
+
next if row_index == 1 && @mode == :hash
|
30
|
+
|
31
|
+
# pad empty rows
|
32
|
+
while row_index < row_xml["r"].to_i do
|
33
|
+
yield(empty_row)
|
34
|
+
row_index += 1
|
35
|
+
end
|
24
36
|
|
25
37
|
yield(parse_row(row_xml))
|
26
38
|
end
|
@@ -30,14 +42,23 @@ module Xsv
|
|
30
42
|
|
31
43
|
# Get row by number, starting at 0
|
32
44
|
def [](number)
|
33
|
-
|
45
|
+
row_xml = xml.css("sheetData row[r=#{number + 1}]").first
|
46
|
+
|
47
|
+
if row_xml
|
48
|
+
parse_row(row_xml)
|
49
|
+
else
|
50
|
+
empty_row
|
51
|
+
end
|
34
52
|
end
|
35
53
|
|
36
54
|
# Load headers in the top row of the worksheet. After parsing of headers
|
37
55
|
# all methods return hashes instead of arrays
|
38
56
|
def parse_headers!
|
57
|
+
@mode = :array
|
39
58
|
parse_headers
|
40
59
|
|
60
|
+
@mode = :hash
|
61
|
+
|
41
62
|
true
|
42
63
|
end
|
43
64
|
|
@@ -47,14 +68,17 @@ module Xsv
|
|
47
68
|
@headers = parse_row(@xml.css("sheetData row").first)
|
48
69
|
end
|
49
70
|
|
50
|
-
def
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
71
|
+
def empty_row
|
72
|
+
case @mode
|
73
|
+
when :array
|
74
|
+
[nil] * @column_count
|
75
|
+
when :hash
|
76
|
+
@headers.zip([]).to_h
|
55
77
|
end
|
78
|
+
end
|
56
79
|
|
57
|
-
|
80
|
+
def parse_row(xml)
|
81
|
+
row = empty_row
|
58
82
|
|
59
83
|
xml.css("c").each do |c_xml|
|
60
84
|
value = case c_xml["t"]
|
@@ -65,7 +89,23 @@ module Xsv
|
|
65
89
|
when "e" # N/A
|
66
90
|
nil
|
67
91
|
when nil
|
68
|
-
c_xml.css("v").inner_text
|
92
|
+
value = parse_number(c_xml.css("v").inner_text)
|
93
|
+
|
94
|
+
if c_xml["s"]
|
95
|
+
style = @workbook.xfs[c_xml["s"].to_i]
|
96
|
+
numFmtId = style[:numFmtId].to_i
|
97
|
+
if numFmtId == 0
|
98
|
+
value
|
99
|
+
elsif is_date_format?(@workbook.numFmts[numFmtId])
|
100
|
+
parse_date(value)
|
101
|
+
elsif is_time_format?(@workbook.numFmts[numFmtId])
|
102
|
+
parse_time(value)
|
103
|
+
else
|
104
|
+
value
|
105
|
+
end
|
106
|
+
else
|
107
|
+
value
|
108
|
+
end
|
69
109
|
else
|
70
110
|
raise Xsv::Error, "Encountered unknown column type #{c_xml["t"]}"
|
71
111
|
end
|
@@ -73,22 +113,12 @@ module Xsv
|
|
73
113
|
# Determine column position and pad row with nil values
|
74
114
|
col_index = column_index(c_xml["r"].scan(/^[A-Z]+/).first)
|
75
115
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
end
|
82
|
-
next_index += 1
|
83
|
-
end
|
84
|
-
|
85
|
-
if @headers.any?
|
86
|
-
row[@headers[next_index]] = value
|
87
|
-
else
|
88
|
-
row << value
|
116
|
+
case @mode
|
117
|
+
when :array
|
118
|
+
row[col_index] = value
|
119
|
+
when :hash
|
120
|
+
row[@headers[col_index]] = value
|
89
121
|
end
|
90
|
-
|
91
|
-
next_index += 1
|
92
122
|
end
|
93
123
|
|
94
124
|
row
|
data/lib/xsv/version.rb
CHANGED
data/lib/xsv/{file.rb → workbook.rb}
CHANGED
@@ -2,14 +2,25 @@ require 'nokogiri'
|
|
2
2
|
require 'zip'
|
3
3
|
|
4
4
|
module Xsv
|
5
|
-
class
|
5
|
+
class Workbook
|
6
6
|
|
7
|
-
attr_reader :sheets, :shared_strings
|
7
|
+
attr_reader :sheets, :shared_strings, :xfs, :numFmts
|
8
|
+
|
9
|
+
# Open the workbook of the given filename
|
10
|
+
def self.open(file)
|
11
|
+
@workbook = self.new(Zip::File.open(file))
|
12
|
+
end
|
13
|
+
|
14
|
+
# Open a workbook from an instance of Zip::File
|
15
|
+
def initialize(zip)
|
16
|
+
@zip = zip
|
8
17
|
|
9
|
-
def initialize(file)
|
10
|
-
@zip = Zip::File.open(file)
|
11
18
|
@sheets = []
|
19
|
+
@xfs = []
|
20
|
+
@numFmts = Xsv::Helpers::BUILT_IN_NUMBER_FORMATS
|
21
|
+
|
12
22
|
fetch_shared_strings
|
23
|
+
fetch_styles
|
13
24
|
fetch_sheets
|
14
25
|
end
|
15
26
|
|
@@ -17,11 +28,6 @@ module Xsv
|
|
17
28
|
"#<#{self.class.name}:#{self.object_id}>"
|
18
29
|
end
|
19
30
|
|
20
|
-
def close
|
21
|
-
# FIXME @sheets.each { |s| s.xml.close }
|
22
|
-
@zip.close
|
23
|
-
end
|
24
|
-
|
25
31
|
private
|
26
32
|
|
27
33
|
def fetch_shared_strings
|
@@ -37,6 +43,19 @@ module Xsv
|
|
37
43
|
stream.close
|
38
44
|
end
|
39
45
|
|
46
|
+
def fetch_styles
|
47
|
+
stream = @zip.glob("xl/styles.xml").first.get_input_stream
|
48
|
+
xml = Nokogiri::XML(stream)
|
49
|
+
|
50
|
+
xml.css("cellXfs xf").each do |xf|
|
51
|
+
@xfs << xf.attributes.map { |k, v| [k.to_sym, v.value] }.to_h
|
52
|
+
end
|
53
|
+
|
54
|
+
xml.css("numFmts numFmt").each do |numFmt|
|
55
|
+
@numFmts[numFmt["numFmtId"].to_i] = numFmt["formatCode"]
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
40
59
|
def fetch_sheets
|
41
60
|
@zip.glob("xl/worksheets/sheet*.xml").sort do |entry|
|
42
61
|
entry.name.scan(/\d+/).first.to_i
|
data/lib/xsv.rb
CHANGED
data/test.sh
ADDED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: xsv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Martijn Storck
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-02-
|
11
|
+
date: 2020-02-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rubyzip
|
@@ -97,10 +97,11 @@ files:
|
|
97
97
|
- bin/console
|
98
98
|
- bin/setup
|
99
99
|
- lib/xsv.rb
|
100
|
-
- lib/xsv/file.rb
|
101
100
|
- lib/xsv/helpers.rb
|
102
101
|
- lib/xsv/sheet.rb
|
103
102
|
- lib/xsv/version.rb
|
103
|
+
- lib/xsv/workbook.rb
|
104
|
+
- test.sh
|
104
105
|
- xsv.gemspec
|
105
106
|
homepage: https://github.com/martijn/xsv
|
106
107
|
licenses:
|