ruh-roo 3.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +677 -0
  3. data/Gemfile +24 -0
  4. data/LICENSE +24 -0
  5. data/README.md +315 -0
  6. data/lib/roo/base.rb +607 -0
  7. data/lib/roo/constants.rb +7 -0
  8. data/lib/roo/csv.rb +141 -0
  9. data/lib/roo/errors.rb +11 -0
  10. data/lib/roo/excelx/cell/base.rb +108 -0
  11. data/lib/roo/excelx/cell/boolean.rb +30 -0
  12. data/lib/roo/excelx/cell/date.rb +28 -0
  13. data/lib/roo/excelx/cell/datetime.rb +107 -0
  14. data/lib/roo/excelx/cell/empty.rb +20 -0
  15. data/lib/roo/excelx/cell/number.rb +89 -0
  16. data/lib/roo/excelx/cell/string.rb +19 -0
  17. data/lib/roo/excelx/cell/time.rb +44 -0
  18. data/lib/roo/excelx/cell.rb +110 -0
  19. data/lib/roo/excelx/comments.rb +55 -0
  20. data/lib/roo/excelx/coordinate.rb +19 -0
  21. data/lib/roo/excelx/extractor.rb +39 -0
  22. data/lib/roo/excelx/format.rb +71 -0
  23. data/lib/roo/excelx/images.rb +26 -0
  24. data/lib/roo/excelx/relationships.rb +33 -0
  25. data/lib/roo/excelx/shared.rb +39 -0
  26. data/lib/roo/excelx/shared_strings.rb +151 -0
  27. data/lib/roo/excelx/sheet.rb +151 -0
  28. data/lib/roo/excelx/sheet_doc.rb +248 -0
  29. data/lib/roo/excelx/styles.rb +64 -0
  30. data/lib/roo/excelx/workbook.rb +63 -0
  31. data/lib/roo/excelx.rb +480 -0
  32. data/lib/roo/font.rb +17 -0
  33. data/lib/roo/formatters/base.rb +15 -0
  34. data/lib/roo/formatters/csv.rb +84 -0
  35. data/lib/roo/formatters/matrix.rb +23 -0
  36. data/lib/roo/formatters/xml.rb +31 -0
  37. data/lib/roo/formatters/yaml.rb +40 -0
  38. data/lib/roo/helpers/default_attr_reader.rb +20 -0
  39. data/lib/roo/helpers/weak_instance_cache.rb +41 -0
  40. data/lib/roo/libre_office.rb +4 -0
  41. data/lib/roo/link.rb +34 -0
  42. data/lib/roo/open_office.rb +628 -0
  43. data/lib/roo/spreadsheet.rb +39 -0
  44. data/lib/roo/tempdir.rb +21 -0
  45. data/lib/roo/utils.rb +128 -0
  46. data/lib/roo/version.rb +3 -0
  47. data/lib/roo.rb +36 -0
  48. data/roo.gemspec +28 -0
  49. metadata +189 -0
@@ -0,0 +1,248 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'forwardable'
4
+ require 'roo/excelx/extractor'
5
+
6
+ module Roo
7
+ class Excelx
8
+ class SheetDoc < Excelx::Extractor
9
+ extend Forwardable
10
+ delegate [:workbook] => :@shared
11
+
12
# Public: Set up a reader for one worksheet XML document.
#
# path          - location of the worksheet XML; forwarded to the superclass.
# relationships - relationships object for this sheet, used by #each_cell
#                 when looking up hyperlinks.
# shared        - object giving access to workbook-wide data (styles,
#                 shared strings, base date and base timestamp).
# options       - Hash of parsing options. Keys read by this class are
#                 :no_hyperlinks, :empty_cell and :expand_merged_ranges.
def initialize(path, relationships, shared, options = {})
  super(path)
  @relationships = relationships
  @options = options
  @shared = shared
end
18
+
19
# Public: Return the cells extracted from this sheet.
#
# The result is memoized, so `relationships` is only used by the first
# call that actually performs the extraction.
def cells(relationships)
  return @cells if @cells

  @cells = extract_cells(relationships)
end
22
+
23
# Public: Return the hyperlinks of this sheet (memoized).
#
# Extraction is skipped, and an empty Hash is returned, when the
# :no_hyperlinks option is set or when `relationships` reports no
# "hyperlink" type. Pass :no_hyperlinks if you know you will never need
# the hyperlinks.
def hyperlinks(relationships)
  return @hyperlinks if @hyperlinks

  wanted = !@options[:no_hyperlinks] && relationships.include_type?("hyperlink")
  @hyperlinks = wanted ? extract_hyperlinks(relationships) : {}
end
31
+
32
# Public: Get the dimensions for the sheet (memoized).
#
# This is an upper bound on the cells that might be parsed: the document
# may be sparse, so the real cell count can be lower.
def dimensions
  return @dimensions if @dimensions

  @dimensions = extract_dimensions
end
38
+
39
# Public: Stream the sheet, handing every "row" XML element to the
# caller's block via Roo::Utils.each_element.
def each_row_streaming(&block)
  element_name = 'row'
  Roo::Utils.each_element(@path, element_name, &block)
end
43
+
44
# Public: Yield each cell of a row as an Excelx::Cell.
#
# row_xml - the row XML element whose children are the cell elements, or
#           nil.
#
# Returns an empty Array when row_xml is nil, an Enumerator when no block
# is given (previously this raised LocalJumpError at the first yield), and
# otherwise the result of iterating the row's children.
def each_cell(row_xml)
  return [] unless row_xml
  return enum_for(:each_cell, row_xml) unless block_given?

  row_xml.children.each do |cell_element|
    coordinate = ::Roo::Utils.extract_coordinate(cell_element["r"])
    # #hyperlinks is memoized, so this lookup is cheap after the first cell.
    link = hyperlinks(@relationships)[coordinate]

    yield cell_from_xml(cell_element, link, coordinate)
  end
end
55
+
56
+ private
57
+
58
# Internal: Map a cell's "t" attribute to a value type Symbol.
#
# type   - the cell's type attribute ('s', 'b', 'str', 'inlineStr') or
#          anything else, including nil.
# format - the number format, consulted only when `type` is not one of
#          the four explicit values.
#
# Returns :shared, :boolean, :string or :inlinestr for the explicit types,
# otherwise whatever Excelx::Format.to_type(format) returns.
def cell_value_type(type, format)
  return :shared if type == 's'
  return :boolean if type == 'b'
  return :string if type == 'str'
  return :inlinestr if type == 'inlineStr'

  Excelx::Format.to_type(format)
end
72
+
73
# Internal: Creates a cell based on an XML cell.
#
# cell_xml   - a Nokogiri::XML::Element. e.g.
#                <c r="A5" s="2">
#                  <v>22606</v>
#                </c>
# hyperlink  - a String for the hyperlink for the cell or nil when no
#              hyperlink is present.
# coordinate - a Roo::Excelx::Coordinate for the coordinate for the cell
#              or nil to extract the coordinate from cell_xml.
# empty_cell - an optional Boolean; when falsy, cells without a usable
#              value yield nil instead of an empty cell.
#
# Examples
#
#   cell_from_xml(<Nokogiri::XML::Element>, nil, nil)
#   # => <Excelx::Cell::String>
#
# Returns a type of <Excelx::Cell>, or nil (see empty_cell).
def cell_from_xml(cell_xml, hyperlink, coordinate, empty_cell = true)
  coordinate ||= ::Roo::Utils.extract_coordinate(cell_xml["r"])
  child_nodes = cell_xml.children
  return create_empty_cell(coordinate, empty_cell) if child_nodes.empty?

  # NOTE: to_i silently turns a missing "s" attribute (nil) into 0. That
  # is relied upon here: style 0 is the General format.
  style = cell_xml["s"].to_i
  formula = nil

  child_nodes.each do |node|
    node_name = node.name
    if node_name == 'is'
      # Inline string: concatenate every <t> fragment. An empty result is
      # ignored and scanning continues with the next child.
      text = node.search('t').map(&:content).join
      next if text.empty?

      return Excelx::Cell.cell_class(:string).new(text, formula, style, hyperlink, coordinate)
    elsif node_name == 'f'
      # Remember the formula; it is attached to a later 'is'/'v' child.
      formula = node.content
    elsif node_name == 'v'
      format = style_format(style)
      value_type = cell_value_type(cell_xml["t"], format)

      return create_cell_from_value(value_type, node, formula, format, style, hyperlink, coordinate)
    end
  end

  create_empty_cell(coordinate, empty_cell)
end
120
+
121
# Internal: Build an Excelx::Cell::Empty for `coordinate`.
#
# Returns nil instead when `empty_cell` is falsy.
def create_empty_cell(coordinate, empty_cell)
  return unless empty_cell

  Excelx::Cell::Empty.new(coordinate)
end
126
+
127
# Internal: Build the typed cell for a <v> element.
#
# value_type - Symbol from #cell_value_type (:shared, :boolean, :string,
#              :time, :datetime, :date or anything else for numbers).
# cell       - the <v> element; only its #content is read.
# formula    - formula text or nil.
# format     - the number format of the cell's style.
# style      - Integer style index.
# hyperlink  - hyperlink String or nil.
# coordinate - coordinate of the cell.
#
# Returns an instance of the class chosen through Excelx::Cell.cell_class.
def create_cell_from_value(value_type, cell, formula, format, style, hyperlink, coordinate)
  # NOTE: format.to_s could replace excelx_type as an argument for
  #       Cell::Time, Cell::DateTime, Cell::Date or Cell::Number, but
  #       that would break some brittle tests.
  excelx_type = [:numeric_or_formula, format.to_s]
  raw = cell.content

  # NOTE: The value differs from the raw content in only a few situations:
  #       1. shared strings: the content is the index of the string.
  #       2. boolean cells: the content is '0' | '1' while the formatted
  #          value is TRUE | FALSE. The raw content is passed on as is;
  #          using a real Boolean value breaks Roo::Base#to_csv.
  #       3. formulas.
  if value_type == :shared
    index = raw.to_i
    text = shared_strings.use_html?(index) ? shared_strings.to_html[index] : shared_strings[index]
    Excelx::Cell.cell_class(:string).new(text, formula, style, hyperlink, coordinate)
  elsif value_type == :boolean || value_type == :string
    Excelx::Cell.cell_class(value_type).new(raw, formula, style, hyperlink, coordinate)
  elsif value_type == :time || value_type == :datetime
    serial = raw.to_f
    # NOTE: Values below 1 are treated as a time, whole numbers as a date,
    #       and anything else as a datetime. With a custom format this
    #       guess can be wrong; checking whether the style is a standard
    #       one would be more reliable.
    temporal_type =
      if serial < 1.0
        :time
      elsif (serial - serial.floor).abs > 0.000001
        :datetime
      else
        :date
      end
    origin = temporal_type == :date ? base_date : base_timestamp
    Excelx::Cell.cell_class(temporal_type).new(serial, formula, excelx_type, style, hyperlink, origin, coordinate)
  elsif value_type == :date
    Excelx::Cell.cell_class(:date).new(raw, formula, excelx_type, style, hyperlink, base_date, coordinate)
  else
    Excelx::Cell.cell_class(:number).new(raw, formula, excelx_type, style, hyperlink, coordinate)
  end
end
175
+
176
# Internal: Collect the sheet's hyperlinks into a Hash keyed by coordinate.
#
# relationships - lookup from a hyperlink element's "id" attribute to its
#                 relationship; the relationship's 'Target' is the link.
#
# A hyperlink whose id has no relationship is skipped. When the element
# has a "location" attribute it is appended to the target after a "#".
# Every coordinate inside the element's "ref" range gets the same target.
def extract_hyperlinks(relationships)
  link_nodes = doc.xpath('/worksheet/hyperlinks/hyperlink')
  return {} unless link_nodes

  link_nodes.each_with_object({}) do |node, by_coordinate|
    relationship = relationships[node['id']]
    next unless relationship

    target = relationship['Target']
    location = node['location']
    target += "##{location}" if location

    Roo::Utils.coordinates_in_range(node["ref"].to_s) do |coord|
      by_coordinate[coord] = target
    end
  end
end
190
+
191
# Internal: Copy the top-left cell of every merged range into all other
# cells of that range.
#
# cells - Hash of coordinate key => cell; modified in place.
#
# Merged ranges whose top-left cell is not present in `cells` are skipped.
# A "ref" that names a single cell (no ":") is treated as a one-cell range
# and a missing "ref" is ignored; both previously raised NoMethodError.
def expand_merged_ranges(cells)
  # Extract merged ranges from xml
  merges = {}
  doc.xpath('/worksheet/mergeCells/mergeCell').each do |mergecell_xml|
    src, dst = mergecell_xml["ref"].to_s.split(':').map { |ref| ::Roo::Utils.ref_to_key(ref) }
    next unless cells[src]

    dst ||= src
    (src[0]..dst[0]).each do |row|
      (src[1]..dst[1]).each do |col|
        # The source cell itself already holds the value.
        merges[[row, col]] = src unless row == src[0] && col == src[1]
      end
    end
  end
  # Duplicate value into all cells in merged range
  merges.each do |dst, src|
    cells[dst] = cells[src]
  end
end
209
+
210
# Internal: Parse every <c> element under sheetData into a Hash of
# coordinate => cell.
#
# relationships - passed on to #hyperlinks to resolve each cell's link.
#
# Cells for which #cell_from_xml returns nil are left out. When the
# :expand_merged_ranges option is set, merged ranges are filled in
# afterwards.
def extract_cells(relationships)
  keep_empty = @options[:empty_cell]
  cell_nodes = doc.xpath('/worksheet/sheetData/row/c')

  extracted = cell_nodes.each_with_object({}) do |node, found|
    coordinate = ::Roo::Utils.extract_coordinate(node["r"])
    link = hyperlinks(relationships)[coordinate]
    cell = cell_from_xml(node, link, coordinate, keep_empty)
    found[coordinate] = cell if cell
  end

  expand_merged_ranges(extracted) if @options[:expand_merged_ranges]

  extracted
end
224
+
225
# Internal: Read the "ref" attribute of the first "dimension" element of
# the sheet.
#
# Returns the ref value, or nil when the sheet has no dimension element.
# Previously the method fell through in that case and returned whatever
# Roo::Utils.each_element itself returns, which #dimensions then memoized.
def extract_dimensions
  Roo::Utils.each_element(@path, 'dimension') do |dimension|
    # Only the first dimension element matters; stop streaming right away.
    return dimension["ref"]
  end
  nil
end
230
+
231
# Internal: Look up the number format for a style index through the
# shared styles object.
def style_format(style)
  styles = @shared.styles
  styles.style_format(style)
end
234
+
235
# Internal: The workbook's base date, as provided by the shared object.
def base_date
  shared = @shared
  shared.base_date
end
238
+
239
# Internal: The workbook's base timestamp, as provided by the shared
# object.
def base_timestamp
  shared = @shared
  shared.base_timestamp
end
242
+
243
# Internal: The workbook's shared strings, as provided by the shared
# object.
def shared_strings
  shared = @shared
  shared.shared_strings
end
246
+ end
247
+ end
248
+ end
@@ -0,0 +1,64 @@
1
+ require 'roo/font'
2
+ require 'roo/excelx/extractor'
3
+
4
+ module Roo
5
+ class Excelx
6
+ class Styles < Excelx::Extractor
7
# Public: Convert an internal excelx style attribute to a format.
#
# style - index of the cell style (anything responding to #to_i).
#
# Returns the custom format registered for the style's number format id
# when there is one, otherwise the entry of
# Excelx::Format::STANDARD_FORMATS for that id.
def style_format(style)
  format_id = num_fmt_ids[style.to_i]
  custom_format = num_fmts[format_id]
  return custom_format if custom_format

  Excelx::Format::STANDARD_FORMATS[format_id.to_i]
end
12
+
13
# Public: The font used by each cell style (memoized); see
# #extract_definitions.
def definitions
  return @definitions if @definitions

  @definitions = extract_definitions
end
16
+
17
+ private
18
+
19
# Internal: Number format ids of the cell styles (memoized).
def num_fmt_ids
  return @num_fmt_ids if @num_fmt_ids

  @num_fmt_ids = extract_num_fmt_ids
end
22
+
23
# Internal: Custom number formats keyed by number format id (memoized).
def num_fmts
  return @num_fmts if @num_fmts

  @num_fmts = extract_num_fmts
end
26
+
27
# Internal: Fonts declared in the styles document (memoized).
def fonts
  return @fonts if @fonts

  @fonts = extract_fonts
end
30
+
31
# Internal: Resolve the font of every child of each cellXfs element.
#
# Returns a flat Array with one font per child, in document order. The
# font is looked up by the child's "fontId" attribute converted with
# #to_i, so a missing attribute selects font 0.
def extract_definitions
  xf_nodes = doc.xpath('//cellXfs').flat_map { |xfs| xfs.children.to_a }
  xf_nodes.map { |xf| fonts[xf['fontId'].to_i] }
end
38
+
39
# Internal: Build a Font for every font element of the styles document.
#
# A font is bold, italic or underlined when the element has a direct
# <b>, <i> or <u> child respectively.
def extract_fonts
  doc.xpath('//fonts/font').map do |font_el|
    font = Font.new
    font.bold = !font_el.xpath('./b').empty?
    font.italic = !font_el.xpath('./i').empty?
    font.underline = !font_el.xpath('./u').empty?
    font
  end
end
48
+
49
# Internal: Collect the "numFmtId" attribute of every child of each
# cellXfs element, in document order.
#
# Children without that attribute (nil) are left out of the result.
def extract_num_fmt_ids
  doc.xpath('//cellXfs').each_with_object([]) do |xfs, ids|
    xfs.children.each do |xf|
      id = xf['numFmtId']
      ids << id unless id.nil?
    end
  end
end
56
+
57
# Internal: Map each numFmt element's "numFmtId" to its "formatCode".
#
# When an id occurs more than once, the last format code wins.
def extract_num_fmts
  pairs = doc.xpath('//numFmt').map do |num_fmt|
    [num_fmt['numFmtId'], num_fmt['formatCode']]
  end
  pairs.to_h
end
62
+ end
63
+ end
64
+ end
@@ -0,0 +1,63 @@
1
+ require 'roo/excelx/extractor'
2
+
3
+ module Roo
4
+ class Excelx
5
+ class Workbook < Excelx::Extractor
6
# A defined name ("label") that points at a single cell of a sheet.
class Label
  attr_reader :sheet, :row, :col, :name

  # name  - the defined name.
  # sheet - name of the sheet the label lives on.
  # row   - row number; converted with #to_i.
  # col   - column letter(s); converted with
  #         Roo::Utils.letter_to_number.
  def initialize(name, sheet, row, col)
    @name, @sheet = name, sheet
    @row = row.to_i
    @col = ::Roo::Utils.letter_to_number(col)
  end

  # Returns the cell position as a [row, col] pair.
  def key
    [row, col]
  end
end
20
+
21
# Public: Open the workbook document at `path`.
#
# Raises ArgumentError when the document does not exist.
def initialize(path)
  super
  return if doc_exists?

  raise ArgumentError, 'missing required workbook file'
end
25
+
26
# Public: All "sheet" elements found anywhere in the workbook document.
def sheets
  workbook_doc = doc
  workbook_doc.xpath('//sheet')
end
29
+
30
# Public: The workbook's defined names (aka labels).
#
# Each definedName element whose text has the form "Sheet1!$C$5" becomes a
# Label keyed by the element's "name" attribute. Defined names whose text
# contains no "!$" part (nothing to read a column and row from) are
# skipped; they previously raised NoMethodError.
#
# Returns a Hash of name => Label.
def defined_names
  doc.xpath('//definedName').each_with_object({}) do |defined_name, hash|
    # "Sheet1!$C$5"
    sheet, coordinates = defined_name.text.split('!$', 2)
    next unless coordinates

    col, row = coordinates.split('$')
    name = defined_name['name']
    hash[name] = Label.new(name, sheet, row, col)
  end
end
40
+
41
# Public: The base date expressed as an Integer number of seconds since
# the Unix epoch (memoized).
def base_timestamp
  return @base_timestamp if @base_timestamp

  midnight = base_date.to_datetime
  @base_timestamp = midnight.to_time.to_i
end
44
+
45
# Public: The date that serial number 0 stands for in this workbook
# (memoized).
#
# Defaults to the 1900 system (minus one day due to an Excel quirk, hence
# 1899-12-30) but uses 1904-01-01 when a workbookPr element has a
# date1904 attribute matching "true" or "1".
# http://msdn.microsoft.com/en-us/library/ff530155(v=office.12).aspx
def base_date
  return @base_date if @base_date

  uses_1904 = doc.css('workbookPr[date1904]').any? do |workbook_pr|
    workbook_pr['date1904'] =~ /true|1/i
  end
  @base_date = uses_1904 ? Date.new(1904, 1, 1) : Date.new(1899, 12, 30)
end
61
+ end
62
+ end
63
+ end