xsv 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +6 -0
- data/Gemfile.lock +1 -1
- data/README.md +19 -1
- data/lib/xsv/helpers.rb +35 -2
- data/lib/xsv/sheet.rb +50 -16
- data/lib/xsv/version.rb +1 -1
- data/lib/xsv/workbook.rb +2 -2
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ace53a58655a50f4de2f10c4a2d68819774e3e74625595353290502299630b30
|
4
|
+
data.tar.gz: 8278ea7b26ac261781ae71328762104db9d14de0cbc05be1b0551a3cb876ea35
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 31ccd6261073893e0a0873997d7befa39fc0f8088f5f4d9d1ec3f97b7d90f48d1153714cff4f124a4be853976ce714326a6ec8634e646f15ecf6cf428c245a39
|
7
|
+
data.tar.gz: eed43d77052870dc45bc009719b130096a05a6add11576a349ca027ff93b428bfe64813dd83a8ea58a1a7da94aa8b490b4394efa65bbb175ae09dd74c77ee20a
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Xsv is a very basic parser for Excel files in the .xlsx format that strives to
|
4
4
|
provide feature parity with common CSV readers and nothing more. This means
|
5
|
-
it only parses values to basic Ruby types and does not deal with formatting
|
5
|
+
it only parses values to basic Ruby types and does not deal with most formatting
|
6
6
|
or more advanced functionality. The goal is to allow for fast parsing of large
|
7
7
|
worksheets with minimal RAM and CPU consumption.
|
8
8
|
|
@@ -68,6 +68,24 @@ sheet[1] # => {"header1" => "value1", "header2" => "value2"}
|
|
68
68
|
Be aware that hash mode will lead to unpredictable results if you have multiple
|
69
69
|
columns with the same name!
|
70
70
|
|
71
|
+
### Assumptions
|
72
|
+
|
73
|
+
Since Xsv treats worksheets like CSV files, it makes certain assumptions about your
|
74
|
+
sheet:
|
75
|
+
|
76
|
+
- In array mode, your data starts on the first row
|
77
|
+
|
78
|
+
- In hash mode, the first row of the sheet contains headers, followed by rows of data
|
79
|
+
|
80
|
+
If your data or headers do not start on the first row of the sheet, you can
|
81
|
+
tell Xsv to skip a number of rows:
|
82
|
+
|
83
|
+
```ruby
|
84
|
+
workbook.sheets[0].row_skip = 1
|
85
|
+
```
|
86
|
+
|
87
|
+
All operations will honour this offset, making the skipped rows unreachable.
|
88
|
+
|
71
89
|
## Development
|
72
90
|
|
73
91
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/lib/xsv/helpers.rb
CHANGED
@@ -34,8 +34,13 @@ module Xsv
|
|
34
34
|
49 => "@",
|
35
35
|
}
|
36
36
|
|
37
|
+
MINUTE = 60
|
38
|
+
HOUR = 3600
|
39
|
+
|
37
40
|
# Return the index number for the given Excel column name
|
38
41
|
def column_index(col)
|
42
|
+
col = col.scan(/^[A-Z]+/).first
|
43
|
+
|
39
44
|
val = 0
|
40
45
|
while col.length > 0
|
41
46
|
val *= 26
|
@@ -52,12 +57,35 @@ module Xsv
|
|
52
57
|
|
53
58
|
# Return a time as a string for the given Excel time value
|
54
59
|
def parse_time(number)
|
60
|
+
# Disregard date part
|
61
|
+
if number > 0
|
62
|
+
number = number - number.truncate
|
63
|
+
end
|
64
|
+
|
55
65
|
base = number * 24
|
56
66
|
|
57
67
|
hours = base.truncate
|
58
|
-
minutes = (base - hours) * 60
|
68
|
+
minutes = ((base - hours) * 60).round
|
59
69
|
|
60
|
-
|
70
|
+
# Compensate for rounding errors
|
71
|
+
if minutes >= 60
|
72
|
+
hours = hours + (minutes / 60)
|
73
|
+
minutes = minutes % 60
|
74
|
+
end
|
75
|
+
|
76
|
+
"%02d:%02d" % [hours, minutes]
|
77
|
+
end
|
78
|
+
|
79
|
+
def parse_datetime(number)
|
80
|
+
date_base = number.truncate
|
81
|
+
time = parse_date(date_base).to_time
|
82
|
+
|
83
|
+
time_base = (number - date_base) * 24
|
84
|
+
|
85
|
+
hours = time_base.truncate
|
86
|
+
minutes = (time_base - hours) * 60
|
87
|
+
|
88
|
+
time + hours * HOUR + minutes.round * MINUTE
|
61
89
|
end
|
62
90
|
|
63
91
|
def parse_number(string)
|
@@ -68,6 +96,11 @@ module Xsv
|
|
68
96
|
end
|
69
97
|
end
|
70
98
|
|
99
|
+
# Tests if the given format string includes both date and time
|
100
|
+
def is_datetime_format?(format)
|
101
|
+
is_date_format?(format) && is_time_format?(format)
|
102
|
+
end
|
103
|
+
|
71
104
|
# Tests if the given format string is a date
|
72
105
|
def is_date_format?(format)
|
73
106
|
return false if format.nil?
|
data/lib/xsv/sheet.rb
CHANGED
@@ -4,16 +4,34 @@ module Xsv
|
|
4
4
|
|
5
5
|
attr_reader :xml, :mode
|
6
6
|
|
7
|
+
# Set a number of rows to skip at the top of the sheet (header row offset)
|
8
|
+
attr_accessor :row_skip
|
9
|
+
|
7
10
|
def initialize(workbook, xml)
|
8
11
|
@workbook = workbook
|
9
12
|
@xml = xml
|
10
13
|
@headers = []
|
14
|
+
@mode = :array
|
15
|
+
@row_skip = 0
|
11
16
|
|
12
|
-
|
13
|
-
bounds = @xml.css("cols col").map { |c| [c["min"].to_i, c["max"].to_i] }.flatten
|
14
|
-
@column_count = (bounds.max - bounds.min) + 1
|
17
|
+
dimension = xml.css("dimension").first
|
15
18
|
|
16
|
-
|
19
|
+
if dimension
|
20
|
+
_firstCell, lastCell = dimension["ref"].split(":")
|
21
|
+
end
|
22
|
+
|
23
|
+
if lastCell
|
24
|
+
# Assume the dimension reflects the content
|
25
|
+
@column_count = column_index(lastCell) + 1
|
26
|
+
else
|
27
|
+
# Find the last cell in every row that has a value
|
28
|
+
rightmost_cells = @xml.xpath("//xmlns:row/xmlns:c[*[local-name() = 'v']][last()]").map { |c| column_index(c["r"]) }
|
29
|
+
@column_count = rightmost_cells.max + 1
|
30
|
+
|
31
|
+
end
|
32
|
+
|
33
|
+
# Find the last row that contains actual values
|
34
|
+
@last_row = @xml.xpath("//xmlns:row[*[xmlns:v]][last()]").first["r"].to_i
|
17
35
|
end
|
18
36
|
|
19
37
|
def inspect
|
@@ -22,19 +40,28 @@ module Xsv
|
|
22
40
|
|
23
41
|
# Iterate over rows
|
24
42
|
def each_row
|
25
|
-
row_index = 0
|
43
|
+
row_index = 0 - @row_skip
|
44
|
+
|
26
45
|
@xml.css("sheetData row").each do |row_xml|
|
46
|
+
if row_index < 0
|
47
|
+
row_index += 1
|
48
|
+
next
|
49
|
+
end
|
50
|
+
|
27
51
|
row_index += 1
|
28
52
|
|
29
53
|
next if row_index == 1 && @mode == :hash
|
30
54
|
|
31
55
|
# pad empty rows
|
32
|
-
while row_index < row_xml["r"].to_i do
|
56
|
+
while row_index < row_xml["r"].to_i - @row_skip do
|
33
57
|
yield(empty_row)
|
34
58
|
row_index += 1
|
35
59
|
end
|
36
60
|
|
37
61
|
yield(parse_row(row_xml))
|
62
|
+
|
63
|
+
# Do not return empty trailing rows
|
64
|
+
break if row_index == @last_row - @row_skip
|
38
65
|
end
|
39
66
|
|
40
67
|
true
|
@@ -42,7 +69,7 @@ module Xsv
|
|
42
69
|
|
43
70
|
# Get row by number, starting at 0
|
44
71
|
def [](number)
|
45
|
-
row_xml = xml.css("sheetData row[r=#{number + 1}]").first
|
72
|
+
row_xml = xml.css("sheetData row[r=#{number + @row_skip + 1}]").first
|
46
73
|
|
47
74
|
if row_xml
|
48
75
|
parse_row(row_xml)
|
@@ -55,7 +82,7 @@ module Xsv
|
|
55
82
|
# all methods return hashes instead of arrays
|
56
83
|
def parse_headers!
|
57
84
|
@mode = :array
|
58
|
-
parse_headers
|
85
|
+
@headers = parse_headers
|
59
86
|
|
60
87
|
@mode = :hash
|
61
88
|
|
@@ -65,7 +92,7 @@ module Xsv
|
|
65
92
|
private
|
66
93
|
|
67
94
|
def parse_headers
|
68
|
-
|
95
|
+
parse_row(@xml.css("sheetData row")[@row_skip])
|
69
96
|
end
|
70
97
|
|
71
98
|
def empty_row
|
@@ -80,7 +107,7 @@ module Xsv
|
|
80
107
|
def parse_row(xml)
|
81
108
|
row = empty_row
|
82
109
|
|
83
|
-
xml.css("c").each do |c_xml|
|
110
|
+
xml.css("c").first(@column_count).each do |c_xml|
|
84
111
|
value = case c_xml["t"]
|
85
112
|
when "s"
|
86
113
|
@workbook.shared_strings[c_xml.css("v").inner_text.to_i]
|
@@ -89,29 +116,36 @@ module Xsv
|
|
89
116
|
when "e" # N/A
|
90
117
|
nil
|
91
118
|
when nil
|
92
|
-
|
119
|
+
v = c_xml.css("v").first
|
120
|
+
|
121
|
+
if v.nil?
|
122
|
+
nil
|
123
|
+
elsif c_xml["s"]
|
124
|
+
value = parse_number(v.inner_text)
|
93
125
|
|
94
|
-
if c_xml["s"]
|
95
126
|
style = @workbook.xfs[c_xml["s"].to_i]
|
96
127
|
numFmtId = style[:numFmtId].to_i
|
128
|
+
numFmt = @workbook.numFmts[numFmtId]
|
97
129
|
if numFmtId == 0
|
98
130
|
value
|
99
|
-
elsif
|
131
|
+
elsif is_datetime_format?(numFmt)
|
132
|
+
parse_datetime(value)
|
133
|
+
elsif is_date_format?(numFmt)
|
100
134
|
parse_date(value)
|
101
|
-
elsif is_time_format?(
|
135
|
+
elsif is_time_format?(numFmt)
|
102
136
|
parse_time(value)
|
103
137
|
else
|
104
138
|
value
|
105
139
|
end
|
106
140
|
else
|
107
|
-
|
141
|
+
parse_number(v.inner_text)
|
108
142
|
end
|
109
143
|
else
|
110
144
|
raise Xsv::Error, "Encountered unknown column type #{c_xml["t"]}"
|
111
145
|
end
|
112
146
|
|
113
147
|
# Determine column position and pad row with nil values
|
114
|
-
col_index = column_index(c_xml["r"]
|
148
|
+
col_index = column_index(c_xml["r"])
|
115
149
|
|
116
150
|
case @mode
|
117
151
|
when :array
|
data/lib/xsv/version.rb
CHANGED
data/lib/xsv/workbook.rb
CHANGED
@@ -57,8 +57,8 @@ module Xsv
|
|
57
57
|
end
|
58
58
|
|
59
59
|
def fetch_sheets
|
60
|
-
@zip.glob("xl/worksheets/sheet*.xml").sort do |
|
61
|
-
|
60
|
+
@zip.glob("xl/worksheets/sheet*.xml").sort do |a, b|
|
61
|
+
a.name.scan(/\d+/).first.to_i <=> b.name.scan(/\d+/).first.to_i
|
62
62
|
end.each do |entry|
|
63
63
|
@sheets << Xsv::Sheet.new(self, Nokogiri::XML(entry.get_input_stream))
|
64
64
|
end
|