xsv 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 47e4ee16a95b100a1c1bbc526912235bbd1386601a33e3c3449320fe4ea8bc52
4
- data.tar.gz: 5b0f8320ff29a3dd036cf4396052cbcee00556c2f116dce63c81eb6f4bb69e2a
3
+ metadata.gz: ace53a58655a50f4de2f10c4a2d68819774e3e74625595353290502299630b30
4
+ data.tar.gz: 8278ea7b26ac261781ae71328762104db9d14de0cbc05be1b0551a3cb876ea35
5
5
  SHA512:
6
- metadata.gz: 40bde712c1df13d4fd330b24fdd3cb9e40e5983b4271c172b6f77245aa4d57b8aadc470be5faffb5c759e807c6bf5a507f87139279b86395cddf0fc70b5446fb
7
- data.tar.gz: 943f5436b416f226f8bcfa2a856dc8079f061be20ca8cc534a9e147692906a280f85f6eb300b7c4e4681ce70a6c5bb81029c6abb70deab87950919e21f03104a
6
+ metadata.gz: 31ccd6261073893e0a0873997d7befa39fc0f8088f5f4d9d1ec3f97b7d90f48d1153714cff4f124a4be853976ce714326a6ec8634e646f15ecf6cf428c245a39
7
+ data.tar.gz: eed43d77052870dc45bc009719b130096a05a6add11576a349ca027ff93b428bfe64813dd83a8ea58a1a7da94aa8b490b4394efa65bbb175ae09dd74c77ee20a
data/.gitignore CHANGED
@@ -6,3 +6,9 @@
6
6
  /pkg/
7
7
  /spec/reports/
8
8
  /tmp/
9
+
10
+ .DS_Store
11
+
12
+ /inspect*
13
+ /dump*
14
+ /*.gem
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- xsv (0.1.2)
4
+ xsv (0.2.0)
5
5
  nokogiri (~> 1.10)
6
6
  rubyzip (~> 2.2)
7
7
 
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Xsv is a very basic parser for Excel files in the .xlsx format that strives to
4
4
  provide feature parity with common CSV readers and nothing more. This means
5
- it only parses values to basic Ruby types and does not deal with formatting
5
+ it only parses values to basic Ruby types and does not deal with most formatting
6
6
  or more advanced functionality. The goal is to allow for fast parsing of large
7
7
  worksheets with minimal RAM and CPU consumption.
8
8
 
@@ -68,6 +68,24 @@ sheet[1] # => {"header1" => "value1", "header2" => "value2"}
68
68
  Be aware that hash mode will lead to unpredictable results if you have multiple
69
69
  columns with the same name!
70
70
 
71
+ ### Assumptions
72
+
73
+ Since Xsv treats worksheets like CSV files, it makes certain assumptions about your
74
+ sheet:
75
+
76
+ - In array mode, your data starts on the first row
77
+
78
+ - In hash mode, the first row of the sheet contains headers, followed by rows of data
79
+
80
+ If your data or headers do not start on the first row of the sheet, you can
81
+ tell Xsv to skip a number of rows:
82
+
83
+ ```ruby
84
+ workbook.sheets[0].row_skip = 1
85
+ ```
86
+
87
+ All operations will honour this offset, making the skipped rows unreachable.
88
+
71
89
  ## Development
72
90
 
73
91
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
data/lib/xsv/helpers.rb CHANGED
@@ -34,8 +34,13 @@ module Xsv
34
34
  49 => "@",
35
35
  }
36
36
 
37
+ MINUTE = 60
38
+ HOUR = 3600
39
+
37
40
  # Return the index number for the given Excel column name
38
41
  def column_index(col)
42
+ col = col.scan(/^[A-Z]+/).first
43
+
39
44
  val = 0
40
45
  while col.length > 0
41
46
  val *= 26
@@ -52,12 +57,35 @@ module Xsv
52
57
 
53
58
  # Return a time as a string for the given Excel time value
54
59
  def parse_time(number)
60
+ # Disregard date part
61
+ if number > 0
62
+ number = number - number.truncate
63
+ end
64
+
55
65
  base = number * 24
56
66
 
57
67
  hours = base.truncate
58
- minutes = (base - hours) * 60
68
+ minutes = ((base - hours) * 60).round
59
69
 
60
- "%02d:%02d" % [base, minutes.round]
70
+ # Compensate for rounding errors
71
+ if minutes >= 60
72
+ hours = hours + (minutes / 60)
73
+ minutes = minutes % 60
74
+ end
75
+
76
+ "%02d:%02d" % [hours, minutes]
77
+ end
78
+
79
+ def parse_datetime(number)
80
+ date_base = number.truncate
81
+ time = parse_date(date_base).to_time
82
+
83
+ time_base = (number - date_base) * 24
84
+
85
+ hours = time_base.truncate
86
+ minutes = (time_base - hours) * 60
87
+
88
+ time + hours * HOUR + minutes.round * MINUTE
61
89
  end
62
90
 
63
91
  def parse_number(string)
@@ -68,6 +96,11 @@ module Xsv
68
96
  end
69
97
  end
70
98
 
99
+ # Tests if the given format string includes both date and time
100
+ def is_datetime_format?(format)
101
+ is_date_format?(format) && is_time_format?(format)
102
+ end
103
+
71
104
  # Tests if the given format string is a date
72
105
  def is_date_format?(format)
73
106
  return false if format.nil?
data/lib/xsv/sheet.rb CHANGED
@@ -4,16 +4,34 @@ module Xsv
4
4
 
5
5
  attr_reader :xml, :mode
6
6
 
7
+ # Set a number of rows to skip at the top of the sheet (header row offset)
8
+ attr_accessor :row_skip
9
+
7
10
  def initialize(workbook, xml)
8
11
  @workbook = workbook
9
12
  @xml = xml
10
13
  @headers = []
14
+ @mode = :array
15
+ @row_skip = 0
11
16
 
12
- # Determine number of columns
13
- bounds = @xml.css("cols col").map { |c| [c["min"].to_i, c["max"].to_i] }.flatten
14
- @column_count = (bounds.max - bounds.min) + 1
17
+ dimension = xml.css("dimension").first
15
18
 
16
- @mode = :array
19
+ if dimension
20
+ _firstCell, lastCell = dimension["ref"].split(":")
21
+ end
22
+
23
+ if lastCell
24
+ # Assume the dimension reflects the content
25
+ @column_count = column_index(lastCell) + 1
26
+ else
27
+ # Find the last cell in every row that has a value
28
+ rightmost_cells = @xml.xpath("//xmlns:row/xmlns:c[*[local-name() = 'v']][last()]").map { |c| column_index(c["r"]) }
29
+ @column_count = rightmost_cells.max + 1
30
+
31
+ end
32
+
33
+ # Find the last row that contains actual values
34
+ @last_row = @xml.xpath("//xmlns:row[*[xmlns:v]][last()]").first["r"].to_i
17
35
  end
18
36
 
19
37
  def inspect
@@ -22,19 +40,28 @@ module Xsv
22
40
 
23
41
  # Iterate over rows
24
42
  def each_row
25
- row_index = 0
43
+ row_index = 0 - @row_skip
44
+
26
45
  @xml.css("sheetData row").each do |row_xml|
46
+ if row_index < 0
47
+ row_index += 1
48
+ next
49
+ end
50
+
27
51
  row_index += 1
28
52
 
29
53
  next if row_index == 1 && @mode == :hash
30
54
 
31
55
  # pad empty rows
32
- while row_index < row_xml["r"].to_i do
56
+ while row_index < row_xml["r"].to_i - @row_skip do
33
57
  yield(empty_row)
34
58
  row_index += 1
35
59
  end
36
60
 
37
61
  yield(parse_row(row_xml))
62
+
63
+ # Do not return empty trailing rows
64
+ break if row_index == @last_row - @row_skip
38
65
  end
39
66
 
40
67
  true
@@ -42,7 +69,7 @@ module Xsv
42
69
 
43
70
  # Get row by number, starting at 0
44
71
  def [](number)
45
- row_xml = xml.css("sheetData row[r=#{number + 1}]").first
72
+ row_xml = xml.css("sheetData row[r=#{number + @row_skip + 1}]").first
46
73
 
47
74
  if row_xml
48
75
  parse_row(row_xml)
@@ -55,7 +82,7 @@ module Xsv
55
82
  # all methods return hashes instead of arrays
56
83
  def parse_headers!
57
84
  @mode = :array
58
- parse_headers
85
+ @headers = parse_headers
59
86
 
60
87
  @mode = :hash
61
88
 
@@ -65,7 +92,7 @@ module Xsv
65
92
  private
66
93
 
67
94
  def parse_headers
68
- @headers = parse_row(@xml.css("sheetData row").first)
95
+ parse_row(@xml.css("sheetData row")[@row_skip])
69
96
  end
70
97
 
71
98
  def empty_row
@@ -80,7 +107,7 @@ module Xsv
80
107
  def parse_row(xml)
81
108
  row = empty_row
82
109
 
83
- xml.css("c").each do |c_xml|
110
+ xml.css("c").first(@column_count).each do |c_xml|
84
111
  value = case c_xml["t"]
85
112
  when "s"
86
113
  @workbook.shared_strings[c_xml.css("v").inner_text.to_i]
@@ -89,29 +116,36 @@ module Xsv
89
116
  when "e" # N/A
90
117
  nil
91
118
  when nil
92
- value = parse_number(c_xml.css("v").inner_text)
119
+ v = c_xml.css("v").first
120
+
121
+ if v.nil?
122
+ nil
123
+ elsif c_xml["s"]
124
+ value = parse_number(v.inner_text)
93
125
 
94
- if c_xml["s"]
95
126
  style = @workbook.xfs[c_xml["s"].to_i]
96
127
  numFmtId = style[:numFmtId].to_i
128
+ numFmt = @workbook.numFmts[numFmtId]
97
129
  if numFmtId == 0
98
130
  value
99
- elsif is_date_format?(@workbook.numFmts[numFmtId])
131
+ elsif is_datetime_format?(numFmt)
132
+ parse_datetime(value)
133
+ elsif is_date_format?(numFmt)
100
134
  parse_date(value)
101
- elsif is_time_format?(@workbook.numFmts[numFmtId])
135
+ elsif is_time_format?(numFmt)
102
136
  parse_time(value)
103
137
  else
104
138
  value
105
139
  end
106
140
  else
107
- value
141
+ parse_number(v.inner_text)
108
142
  end
109
143
  else
110
144
  raise Xsv::Error, "Encountered unknown column type #{c_xml["t"]}"
111
145
  end
112
146
 
113
147
  # Determine column position and pad row with nil values
114
- col_index = column_index(c_xml["r"].scan(/^[A-Z]+/).first)
148
+ col_index = column_index(c_xml["r"])
115
149
 
116
150
  case @mode
117
151
  when :array
data/lib/xsv/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Xsv
2
- VERSION = "0.2.0"
2
+ VERSION = "0.2.1"
3
3
  end
data/lib/xsv/workbook.rb CHANGED
@@ -57,8 +57,8 @@ module Xsv
57
57
  end
58
58
 
59
59
  def fetch_sheets
60
- @zip.glob("xl/worksheets/sheet*.xml").sort do |entry|
61
- entry.name.scan(/\d+/).first.to_i
60
+ @zip.glob("xl/worksheets/sheet*.xml").sort do |a, b|
61
+ a.name.scan(/\d+/).first.to_i <=> b.name.scan(/\d+/).first.to_i
62
62
  end.each do |entry|
63
63
  @sheets << Xsv::Sheet.new(self, Nokogiri::XML(entry.get_input_stream))
64
64
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: xsv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Martijn Storck