xsv 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +6 -0
- data/Gemfile.lock +1 -1
- data/README.md +19 -1
- data/lib/xsv/helpers.rb +35 -2
- data/lib/xsv/sheet.rb +50 -16
- data/lib/xsv/version.rb +1 -1
- data/lib/xsv/workbook.rb +2 -2
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ace53a58655a50f4de2f10c4a2d68819774e3e74625595353290502299630b30
|
|
4
|
+
data.tar.gz: 8278ea7b26ac261781ae71328762104db9d14de0cbc05be1b0551a3cb876ea35
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 31ccd6261073893e0a0873997d7befa39fc0f8088f5f4d9d1ec3f97b7d90f48d1153714cff4f124a4be853976ce714326a6ec8634e646f15ecf6cf428c245a39
|
|
7
|
+
data.tar.gz: eed43d77052870dc45bc009719b130096a05a6add11576a349ca027ff93b428bfe64813dd83a8ea58a1a7da94aa8b490b4394efa65bbb175ae09dd74c77ee20a
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
Xsv is a very basic parser for Excel files in the .xlsx format that strives to
|
|
4
4
|
provide feature parity with common CSV readers and nothing more. This means
|
|
5
|
-
it only parses values to basic Ruby types and does not deal with formatting
|
|
5
|
+
it only parses values to basic Ruby types and does not deal with most formatting
|
|
6
6
|
or more advanced functionality. The goal is to allow for fast parsing of large
|
|
7
7
|
worksheets with minimal RAM and CPU consumption.
|
|
8
8
|
|
|
@@ -68,6 +68,24 @@ sheet[1] # => {"header1" => "value1", "header2" => "value2"}
|
|
|
68
68
|
Be aware that hash mode will lead to unpredictable results if you have multiple
|
|
69
69
|
columns with the same name!
|
|
70
70
|
|
|
71
|
+
### Assumptions
|
|
72
|
+
|
|
73
|
+
Since Xsv treats worksheets like CSV files, it makes certain assumptions about your
|
|
74
|
+
sheet:
|
|
75
|
+
|
|
76
|
+
- In array mode, your data starts on the first row
|
|
77
|
+
|
|
78
|
+
- In hash mode, the first row of the sheet contains headers, followed by rows of data
|
|
79
|
+
|
|
80
|
+
If your data or headers do not start on the first row of the sheet, you can
|
|
81
|
+
tell Xsv to skip a number of rows:
|
|
82
|
+
|
|
83
|
+
```ruby
|
|
84
|
+
workbook.sheets[0].row_skip = 1
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
All operations will honour this offset, making the skipped rows unreachable.
|
|
88
|
+
|
|
71
89
|
## Development
|
|
72
90
|
|
|
73
91
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/lib/xsv/helpers.rb
CHANGED
|
@@ -34,8 +34,13 @@ module Xsv
|
|
|
34
34
|
49 => "@",
|
|
35
35
|
}
|
|
36
36
|
|
|
37
|
+
MINUTE = 60
|
|
38
|
+
HOUR = 3600
|
|
39
|
+
|
|
37
40
|
# Return the index number for the given Excel column name
|
|
38
41
|
def column_index(col)
|
|
42
|
+
col = col.scan(/^[A-Z]+/).first
|
|
43
|
+
|
|
39
44
|
val = 0
|
|
40
45
|
while col.length > 0
|
|
41
46
|
val *= 26
|
|
@@ -52,12 +57,35 @@ module Xsv
|
|
|
52
57
|
|
|
53
58
|
# Return a time as a string for the given Excel time value
|
|
54
59
|
def parse_time(number)
|
|
60
|
+
# Disregard date part
|
|
61
|
+
if number > 0
|
|
62
|
+
number = number - number.truncate
|
|
63
|
+
end
|
|
64
|
+
|
|
55
65
|
base = number * 24
|
|
56
66
|
|
|
57
67
|
hours = base.truncate
|
|
58
|
-
minutes = (base - hours) * 60
|
|
68
|
+
minutes = ((base - hours) * 60).round
|
|
59
69
|
|
|
60
|
-
|
|
70
|
+
# Compensate for rounding errors
|
|
71
|
+
if minutes >= 60
|
|
72
|
+
hours = hours + (minutes / 60)
|
|
73
|
+
minutes = minutes % 60
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
"%02d:%02d" % [hours, minutes]
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
def parse_datetime(number)
|
|
80
|
+
date_base = number.truncate
|
|
81
|
+
time = parse_date(date_base).to_time
|
|
82
|
+
|
|
83
|
+
time_base = (number - date_base) * 24
|
|
84
|
+
|
|
85
|
+
hours = time_base.truncate
|
|
86
|
+
minutes = (time_base - hours) * 60
|
|
87
|
+
|
|
88
|
+
time + hours * HOUR + minutes.round * MINUTE
|
|
61
89
|
end
|
|
62
90
|
|
|
63
91
|
def parse_number(string)
|
|
@@ -68,6 +96,11 @@ module Xsv
|
|
|
68
96
|
end
|
|
69
97
|
end
|
|
70
98
|
|
|
99
|
+
# Tests if the given format string includes both date and time
|
|
100
|
+
def is_datetime_format?(format)
|
|
101
|
+
is_date_format?(format) && is_time_format?(format)
|
|
102
|
+
end
|
|
103
|
+
|
|
71
104
|
# Tests if the given format string is a date
|
|
72
105
|
def is_date_format?(format)
|
|
73
106
|
return false if format.nil?
|
data/lib/xsv/sheet.rb
CHANGED
|
@@ -4,16 +4,34 @@ module Xsv
|
|
|
4
4
|
|
|
5
5
|
attr_reader :xml, :mode
|
|
6
6
|
|
|
7
|
+
# Set a number of rows to skip at the top of the sheet (header row offset)
|
|
8
|
+
attr_accessor :row_skip
|
|
9
|
+
|
|
7
10
|
def initialize(workbook, xml)
|
|
8
11
|
@workbook = workbook
|
|
9
12
|
@xml = xml
|
|
10
13
|
@headers = []
|
|
14
|
+
@mode = :array
|
|
15
|
+
@row_skip = 0
|
|
11
16
|
|
|
12
|
-
|
|
13
|
-
bounds = @xml.css("cols col").map { |c| [c["min"].to_i, c["max"].to_i] }.flatten
|
|
14
|
-
@column_count = (bounds.max - bounds.min) + 1
|
|
17
|
+
dimension = xml.css("dimension").first
|
|
15
18
|
|
|
16
|
-
|
|
19
|
+
if dimension
|
|
20
|
+
_firstCell, lastCell = dimension["ref"].split(":")
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
if lastCell
|
|
24
|
+
# Assume the dimension reflects the content
|
|
25
|
+
@column_count = column_index(lastCell) + 1
|
|
26
|
+
else
|
|
27
|
+
# Find the last cell in every row that has a value
|
|
28
|
+
rightmost_cells = @xml.xpath("//xmlns:row/xmlns:c[*[local-name() = 'v']][last()]").map { |c| column_index(c["r"]) }
|
|
29
|
+
@column_count = rightmost_cells.max + 1
|
|
30
|
+
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
# Find the last row that contains actual values
|
|
34
|
+
@last_row = @xml.xpath("//xmlns:row[*[xmlns:v]][last()]").first["r"].to_i
|
|
17
35
|
end
|
|
18
36
|
|
|
19
37
|
def inspect
|
|
@@ -22,19 +40,28 @@ module Xsv
|
|
|
22
40
|
|
|
23
41
|
# Iterate over rows
|
|
24
42
|
def each_row
|
|
25
|
-
row_index = 0
|
|
43
|
+
row_index = 0 - @row_skip
|
|
44
|
+
|
|
26
45
|
@xml.css("sheetData row").each do |row_xml|
|
|
46
|
+
if row_index < 0
|
|
47
|
+
row_index += 1
|
|
48
|
+
next
|
|
49
|
+
end
|
|
50
|
+
|
|
27
51
|
row_index += 1
|
|
28
52
|
|
|
29
53
|
next if row_index == 1 && @mode == :hash
|
|
30
54
|
|
|
31
55
|
# pad empty rows
|
|
32
|
-
while row_index < row_xml["r"].to_i do
|
|
56
|
+
while row_index < row_xml["r"].to_i - @row_skip do
|
|
33
57
|
yield(empty_row)
|
|
34
58
|
row_index += 1
|
|
35
59
|
end
|
|
36
60
|
|
|
37
61
|
yield(parse_row(row_xml))
|
|
62
|
+
|
|
63
|
+
# Do not return empty trailing rows
|
|
64
|
+
break if row_index == @last_row - @row_skip
|
|
38
65
|
end
|
|
39
66
|
|
|
40
67
|
true
|
|
@@ -42,7 +69,7 @@ module Xsv
|
|
|
42
69
|
|
|
43
70
|
# Get row by number, starting at 0
|
|
44
71
|
def [](number)
|
|
45
|
-
row_xml = xml.css("sheetData row[r=#{number + 1}]").first
|
|
72
|
+
row_xml = xml.css("sheetData row[r=#{number + @row_skip + 1}]").first
|
|
46
73
|
|
|
47
74
|
if row_xml
|
|
48
75
|
parse_row(row_xml)
|
|
@@ -55,7 +82,7 @@ module Xsv
|
|
|
55
82
|
# all methods return hashes instead of arrays
|
|
56
83
|
def parse_headers!
|
|
57
84
|
@mode = :array
|
|
58
|
-
parse_headers
|
|
85
|
+
@headers = parse_headers
|
|
59
86
|
|
|
60
87
|
@mode = :hash
|
|
61
88
|
|
|
@@ -65,7 +92,7 @@ module Xsv
|
|
|
65
92
|
private
|
|
66
93
|
|
|
67
94
|
def parse_headers
|
|
68
|
-
|
|
95
|
+
parse_row(@xml.css("sheetData row")[@row_skip])
|
|
69
96
|
end
|
|
70
97
|
|
|
71
98
|
def empty_row
|
|
@@ -80,7 +107,7 @@ module Xsv
|
|
|
80
107
|
def parse_row(xml)
|
|
81
108
|
row = empty_row
|
|
82
109
|
|
|
83
|
-
xml.css("c").each do |c_xml|
|
|
110
|
+
xml.css("c").first(@column_count).each do |c_xml|
|
|
84
111
|
value = case c_xml["t"]
|
|
85
112
|
when "s"
|
|
86
113
|
@workbook.shared_strings[c_xml.css("v").inner_text.to_i]
|
|
@@ -89,29 +116,36 @@ module Xsv
|
|
|
89
116
|
when "e" # N/A
|
|
90
117
|
nil
|
|
91
118
|
when nil
|
|
92
|
-
|
|
119
|
+
v = c_xml.css("v").first
|
|
120
|
+
|
|
121
|
+
if v.nil?
|
|
122
|
+
nil
|
|
123
|
+
elsif c_xml["s"]
|
|
124
|
+
value = parse_number(v.inner_text)
|
|
93
125
|
|
|
94
|
-
if c_xml["s"]
|
|
95
126
|
style = @workbook.xfs[c_xml["s"].to_i]
|
|
96
127
|
numFmtId = style[:numFmtId].to_i
|
|
128
|
+
numFmt = @workbook.numFmts[numFmtId]
|
|
97
129
|
if numFmtId == 0
|
|
98
130
|
value
|
|
99
|
-
elsif
|
|
131
|
+
elsif is_datetime_format?(numFmt)
|
|
132
|
+
parse_datetime(value)
|
|
133
|
+
elsif is_date_format?(numFmt)
|
|
100
134
|
parse_date(value)
|
|
101
|
-
elsif is_time_format?(
|
|
135
|
+
elsif is_time_format?(numFmt)
|
|
102
136
|
parse_time(value)
|
|
103
137
|
else
|
|
104
138
|
value
|
|
105
139
|
end
|
|
106
140
|
else
|
|
107
|
-
|
|
141
|
+
parse_number(v.inner_text)
|
|
108
142
|
end
|
|
109
143
|
else
|
|
110
144
|
raise Xsv::Error, "Encountered unknown column type #{c_xml["t"]}"
|
|
111
145
|
end
|
|
112
146
|
|
|
113
147
|
# Determine column position and pad row with nil values
|
|
114
|
-
col_index = column_index(c_xml["r"]
|
|
148
|
+
col_index = column_index(c_xml["r"])
|
|
115
149
|
|
|
116
150
|
case @mode
|
|
117
151
|
when :array
|
data/lib/xsv/version.rb
CHANGED
data/lib/xsv/workbook.rb
CHANGED
|
@@ -57,8 +57,8 @@ module Xsv
|
|
|
57
57
|
end
|
|
58
58
|
|
|
59
59
|
def fetch_sheets
|
|
60
|
-
@zip.glob("xl/worksheets/sheet*.xml").sort do |
|
|
61
|
-
|
|
60
|
+
@zip.glob("xl/worksheets/sheet*.xml").sort do |a, b|
|
|
61
|
+
a.name.scan(/\d+/).first.to_i <=> b.name.scan(/\d+/).first.to_i
|
|
62
62
|
end.each do |entry|
|
|
63
63
|
@sheets << Xsv::Sheet.new(self, Nokogiri::XML(entry.get_input_stream))
|
|
64
64
|
end
|