xsv 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 47e4ee16a95b100a1c1bbc526912235bbd1386601a33e3c3449320fe4ea8bc52
4
- data.tar.gz: 5b0f8320ff29a3dd036cf4396052cbcee00556c2f116dce63c81eb6f4bb69e2a
3
+ metadata.gz: ace53a58655a50f4de2f10c4a2d68819774e3e74625595353290502299630b30
4
+ data.tar.gz: 8278ea7b26ac261781ae71328762104db9d14de0cbc05be1b0551a3cb876ea35
5
5
  SHA512:
6
- metadata.gz: 40bde712c1df13d4fd330b24fdd3cb9e40e5983b4271c172b6f77245aa4d57b8aadc470be5faffb5c759e807c6bf5a507f87139279b86395cddf0fc70b5446fb
7
- data.tar.gz: 943f5436b416f226f8bcfa2a856dc8079f061be20ca8cc534a9e147692906a280f85f6eb300b7c4e4681ce70a6c5bb81029c6abb70deab87950919e21f03104a
6
+ metadata.gz: 31ccd6261073893e0a0873997d7befa39fc0f8088f5f4d9d1ec3f97b7d90f48d1153714cff4f124a4be853976ce714326a6ec8634e646f15ecf6cf428c245a39
7
+ data.tar.gz: eed43d77052870dc45bc009719b130096a05a6add11576a349ca027ff93b428bfe64813dd83a8ea58a1a7da94aa8b490b4394efa65bbb175ae09dd74c77ee20a
data/.gitignore CHANGED
@@ -6,3 +6,9 @@
6
6
  /pkg/
7
7
  /spec/reports/
8
8
  /tmp/
9
+
10
+ .DS_Store
11
+
12
+ /inspect*
13
+ /dump*
14
+ /*.gem
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- xsv (0.1.2)
4
+ xsv (0.2.0)
5
5
  nokogiri (~> 1.10)
6
6
  rubyzip (~> 2.2)
7
7
 
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Xsv is a very basic parser for Excel files in the .xlsx format that strives to
4
4
  provide feature parity with common CSV readers and nothing more. This means
5
- it only parses values to basic Ruby types and does not deal with formatting
5
+ it only parses values to basic Ruby types and does not deal with most formatting
6
6
  or more advanced functionality. The goal is to allow for fast parsing of large
7
7
  worksheets with minimal RAM and CPU consumption.
8
8
 
@@ -68,6 +68,24 @@ sheet[1] # => {"header1" => "value1", "header2" => "value2"}
68
68
  Be aware that hash mode will lead to unpredictable results if you have multiple
69
69
  columns with the same name!
70
70
 
71
+ ### Assumptions
72
+
73
+ Since Xsv treats worksheets like CSV files, it makes certain assumptions about your
74
+ sheet:
75
+
76
+ - In array mode, your data starts on the first row
77
+
78
+ - In hash mode, the first row of the sheet contains headers, followed by rows of data
79
+
80
+ If your data or headers do not start on the first row of the sheet, you can
81
+ tell Xsv to skip a number of rows:
82
+
83
+ ```ruby
84
+ workbook.sheets[0].row_skip = 1
85
+ ```
86
+
87
+ All operations will honour this offset, making the skipped rows unreachable.
88
+
71
89
  ## Development
72
90
 
73
91
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
data/lib/xsv/helpers.rb CHANGED
@@ -34,8 +34,13 @@ module Xsv
34
34
  49 => "@",
35
35
  }
36
36
 
37
+ MINUTE = 60
38
+ HOUR = 3600
39
+
37
40
  # Return the index number for the given Excel column name
38
41
  def column_index(col)
42
+ col = col.scan(/^[A-Z]+/).first
43
+
39
44
  val = 0
40
45
  while col.length > 0
41
46
  val *= 26
@@ -52,12 +57,35 @@ module Xsv
52
57
 
53
58
  # Return a time as a string for the given Excel time value
54
59
  def parse_time(number)
60
+ # Disregard date part
61
+ if number > 0
62
+ number = number - number.truncate
63
+ end
64
+
55
65
  base = number * 24
56
66
 
57
67
  hours = base.truncate
58
- minutes = (base - hours) * 60
68
+ minutes = ((base - hours) * 60).round
59
69
 
60
- "%02d:%02d" % [base, minutes.round]
70
+ # Compensate for rounding errors
71
+ if minutes >= 60
72
+ hours = hours + (minutes / 60)
73
+ minutes = minutes % 60
74
+ end
75
+
76
+ "%02d:%02d" % [hours, minutes]
77
+ end
78
+
79
+ def parse_datetime(number)
80
+ date_base = number.truncate
81
+ time = parse_date(date_base).to_time
82
+
83
+ time_base = (number - date_base) * 24
84
+
85
+ hours = time_base.truncate
86
+ minutes = (time_base - hours) * 60
87
+
88
+ time + hours * HOUR + minutes.round * MINUTE
61
89
  end
62
90
 
63
91
  def parse_number(string)
@@ -68,6 +96,11 @@ module Xsv
68
96
  end
69
97
  end
70
98
 
99
+ # Tests if the given format string includes both date and time
100
+ def is_datetime_format?(format)
101
+ is_date_format?(format) && is_time_format?(format)
102
+ end
103
+
71
104
  # Tests if the given format string is a date
72
105
  def is_date_format?(format)
73
106
  return false if format.nil?
data/lib/xsv/sheet.rb CHANGED
@@ -4,16 +4,34 @@ module Xsv
4
4
 
5
5
  attr_reader :xml, :mode
6
6
 
7
+ # Set a number of rows to skip at the top of the sheet (header row offset)
8
+ attr_accessor :row_skip
9
+
7
10
  def initialize(workbook, xml)
8
11
  @workbook = workbook
9
12
  @xml = xml
10
13
  @headers = []
14
+ @mode = :array
15
+ @row_skip = 0
11
16
 
12
- # Determine number of columns
13
- bounds = @xml.css("cols col").map { |c| [c["min"].to_i, c["max"].to_i] }.flatten
14
- @column_count = (bounds.max - bounds.min) + 1
17
+ dimension = xml.css("dimension").first
15
18
 
16
- @mode = :array
19
+ if dimension
20
+ _firstCell, lastCell = dimension["ref"].split(":")
21
+ end
22
+
23
+ if lastCell
24
+ # Assume the dimension reflects the content
25
+ @column_count = column_index(lastCell) + 1
26
+ else
27
+ # Find the last cell in every row that has a value
28
+ rightmost_cells = @xml.xpath("//xmlns:row/xmlns:c[*[local-name() = 'v']][last()]").map { |c| column_index(c["r"]) }
29
+ @column_count = rightmost_cells.max + 1
30
+
31
+ end
32
+
33
+ # Find the last row that contains actual values
34
+ @last_row = @xml.xpath("//xmlns:row[*[xmlns:v]][last()]").first["r"].to_i
17
35
  end
18
36
 
19
37
  def inspect
@@ -22,19 +40,28 @@ module Xsv
22
40
 
23
41
  # Iterate over rows
24
42
  def each_row
25
- row_index = 0
43
+ row_index = 0 - @row_skip
44
+
26
45
  @xml.css("sheetData row").each do |row_xml|
46
+ if row_index < 0
47
+ row_index += 1
48
+ next
49
+ end
50
+
27
51
  row_index += 1
28
52
 
29
53
  next if row_index == 1 && @mode == :hash
30
54
 
31
55
  # pad empty rows
32
- while row_index < row_xml["r"].to_i do
56
+ while row_index < row_xml["r"].to_i - @row_skip do
33
57
  yield(empty_row)
34
58
  row_index += 1
35
59
  end
36
60
 
37
61
  yield(parse_row(row_xml))
62
+
63
+ # Do not return empty trailing rows
64
+ break if row_index == @last_row - @row_skip
38
65
  end
39
66
 
40
67
  true
@@ -42,7 +69,7 @@ module Xsv
42
69
 
43
70
  # Get row by number, starting at 0
44
71
  def [](number)
45
- row_xml = xml.css("sheetData row[r=#{number + 1}]").first
72
+ row_xml = xml.css("sheetData row[r=#{number + @row_skip + 1}]").first
46
73
 
47
74
  if row_xml
48
75
  parse_row(row_xml)
@@ -55,7 +82,7 @@ module Xsv
55
82
  # all methods return hashes instead of arrays
56
83
  def parse_headers!
57
84
  @mode = :array
58
- parse_headers
85
+ @headers = parse_headers
59
86
 
60
87
  @mode = :hash
61
88
 
@@ -65,7 +92,7 @@ module Xsv
65
92
  private
66
93
 
67
94
  def parse_headers
68
- @headers = parse_row(@xml.css("sheetData row").first)
95
+ parse_row(@xml.css("sheetData row")[@row_skip])
69
96
  end
70
97
 
71
98
  def empty_row
@@ -80,7 +107,7 @@ module Xsv
80
107
  def parse_row(xml)
81
108
  row = empty_row
82
109
 
83
- xml.css("c").each do |c_xml|
110
+ xml.css("c").first(@column_count).each do |c_xml|
84
111
  value = case c_xml["t"]
85
112
  when "s"
86
113
  @workbook.shared_strings[c_xml.css("v").inner_text.to_i]
@@ -89,29 +116,36 @@ module Xsv
89
116
  when "e" # N/A
90
117
  nil
91
118
  when nil
92
- value = parse_number(c_xml.css("v").inner_text)
119
+ v = c_xml.css("v").first
120
+
121
+ if v.nil?
122
+ nil
123
+ elsif c_xml["s"]
124
+ value = parse_number(v.inner_text)
93
125
 
94
- if c_xml["s"]
95
126
  style = @workbook.xfs[c_xml["s"].to_i]
96
127
  numFmtId = style[:numFmtId].to_i
128
+ numFmt = @workbook.numFmts[numFmtId]
97
129
  if numFmtId == 0
98
130
  value
99
- elsif is_date_format?(@workbook.numFmts[numFmtId])
131
+ elsif is_datetime_format?(numFmt)
132
+ parse_datetime(value)
133
+ elsif is_date_format?(numFmt)
100
134
  parse_date(value)
101
- elsif is_time_format?(@workbook.numFmts[numFmtId])
135
+ elsif is_time_format?(numFmt)
102
136
  parse_time(value)
103
137
  else
104
138
  value
105
139
  end
106
140
  else
107
- value
141
+ parse_number(v.inner_text)
108
142
  end
109
143
  else
110
144
  raise Xsv::Error, "Encountered unknown column type #{c_xml["t"]}"
111
145
  end
112
146
 
113
147
  # Determine column position and pad row with nil values
114
- col_index = column_index(c_xml["r"].scan(/^[A-Z]+/).first)
148
+ col_index = column_index(c_xml["r"])
115
149
 
116
150
  case @mode
117
151
  when :array
data/lib/xsv/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Xsv
2
- VERSION = "0.2.0"
2
+ VERSION = "0.2.1"
3
3
  end
data/lib/xsv/workbook.rb CHANGED
@@ -57,8 +57,8 @@ module Xsv
57
57
  end
58
58
 
59
59
  def fetch_sheets
60
- @zip.glob("xl/worksheets/sheet*.xml").sort do |entry|
61
- entry.name.scan(/\d+/).first.to_i
60
+ @zip.glob("xl/worksheets/sheet*.xml").sort do |a, b|
61
+ a.name.scan(/\d+/).first.to_i <=> b.name.scan(/\d+/).first.to_i
62
62
  end.each do |entry|
63
63
  @sheets << Xsv::Sheet.new(self, Nokogiri::XML(entry.get_input_stream))
64
64
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: xsv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Martijn Storck