ruh-roo 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +677 -0
  3. data/Gemfile +24 -0
  4. data/LICENSE +24 -0
  5. data/README.md +315 -0
  6. data/lib/roo/base.rb +607 -0
  7. data/lib/roo/constants.rb +7 -0
  8. data/lib/roo/csv.rb +141 -0
  9. data/lib/roo/errors.rb +11 -0
  10. data/lib/roo/excelx/cell/base.rb +108 -0
  11. data/lib/roo/excelx/cell/boolean.rb +30 -0
  12. data/lib/roo/excelx/cell/date.rb +28 -0
  13. data/lib/roo/excelx/cell/datetime.rb +107 -0
  14. data/lib/roo/excelx/cell/empty.rb +20 -0
  15. data/lib/roo/excelx/cell/number.rb +89 -0
  16. data/lib/roo/excelx/cell/string.rb +19 -0
  17. data/lib/roo/excelx/cell/time.rb +44 -0
  18. data/lib/roo/excelx/cell.rb +110 -0
  19. data/lib/roo/excelx/comments.rb +55 -0
  20. data/lib/roo/excelx/coordinate.rb +19 -0
  21. data/lib/roo/excelx/extractor.rb +39 -0
  22. data/lib/roo/excelx/format.rb +71 -0
  23. data/lib/roo/excelx/images.rb +26 -0
  24. data/lib/roo/excelx/relationships.rb +33 -0
  25. data/lib/roo/excelx/shared.rb +39 -0
  26. data/lib/roo/excelx/shared_strings.rb +151 -0
  27. data/lib/roo/excelx/sheet.rb +151 -0
  28. data/lib/roo/excelx/sheet_doc.rb +248 -0
  29. data/lib/roo/excelx/styles.rb +64 -0
  30. data/lib/roo/excelx/workbook.rb +63 -0
  31. data/lib/roo/excelx.rb +480 -0
  32. data/lib/roo/font.rb +17 -0
  33. data/lib/roo/formatters/base.rb +15 -0
  34. data/lib/roo/formatters/csv.rb +84 -0
  35. data/lib/roo/formatters/matrix.rb +23 -0
  36. data/lib/roo/formatters/xml.rb +31 -0
  37. data/lib/roo/formatters/yaml.rb +40 -0
  38. data/lib/roo/helpers/default_attr_reader.rb +20 -0
  39. data/lib/roo/helpers/weak_instance_cache.rb +41 -0
  40. data/lib/roo/libre_office.rb +4 -0
  41. data/lib/roo/link.rb +34 -0
  42. data/lib/roo/open_office.rb +628 -0
  43. data/lib/roo/spreadsheet.rb +39 -0
  44. data/lib/roo/tempdir.rb +21 -0
  45. data/lib/roo/utils.rb +128 -0
  46. data/lib/roo/version.rb +3 -0
  47. data/lib/roo.rb +36 -0
  48. data/roo.gemspec +28 -0
  49. metadata +189 -0
@@ -0,0 +1,248 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'forwardable'
4
+ require 'roo/excelx/extractor'
5
+
6
+ module Roo
7
+ class Excelx
8
+ class SheetDoc < Excelx::Extractor
9
+ extend Forwardable
10
+ delegate [:workbook] => :@shared
11
+
12
+ def initialize(path, relationships, shared, options = {}) # Builds a sheet document; path is handed to Excelx::Extractor#initialize.
13
+ super(path)
14
+ @shared = shared # supplies workbook, styles, shared_strings, base_date and base_timestamp to this class
15
+ @options = options # keys read in this class: :no_hyperlinks, :empty_cell, :expand_merged_ranges
16
+ @relationships = relationships # passed to #hyperlinks by #each_cell
17
+ end
18
+
19
+ def cells(relationships) # Returns the coordinate => cell Hash built by extract_cells.
20
+ @cells ||= extract_cells(relationships) # memoized: the argument is only used on the first call
21
+ end
22
+
23
+ def hyperlinks(relationships) # Returns a memoized Hash of coordinate => link target for this sheet.
24
+ # If you're sure you won't need the hyperlinks, set the :no_hyperlinks option to skip extracting them.
25
+ @hyperlinks ||= if @options[:no_hyperlinks] || !relationships.include_type?("hyperlink")
26
+ {} # nothing to extract: either disabled by option or no hyperlink relationship is present
27
+ else
28
+ extract_hyperlinks(relationships)
29
+ end
30
+ end
31
+
32
+ # Get the dimensions for the sheet.
33
+ # This is an upper bound on the cells that might be parsed:
34
+ # the document may be sparse, so the actual cell count can be lower.
35
+ def dimensions
36
+ @dimensions ||= extract_dimensions # memoizes whatever extract_dimensions returns
37
+ end
38
+
39
+ # Yield each 'row' XML element of the file at @path to the caller's block.
40
+ def each_row_streaming(&block)
41
+ Roo::Utils.each_element(@path, 'row', &block) # element iteration is delegated to Roo::Utils
42
+ end
43
+
44
+ # Yield each child of the given row XML to the caller as an Excelx::Cell
45
+ # (built by cell_from_xml). Returns [] without yielding when row_xml is nil or false.
46
+ def each_cell(row_xml)
47
+ return [] unless row_xml
48
+ row_xml.children.each do |cell_element|
49
+ coordinate = ::Roo::Utils.extract_coordinate(cell_element["r"]) # "r" holds the cell reference, e.g. "A5"
50
+ hyperlinks = hyperlinks(@relationships)[coordinate] # despite the plural name: the link for this one cell, or nil
51
+
52
+ yield cell_from_xml(cell_element, hyperlinks, coordinate) # empty_cell keeps its default (true) here
53
+ end
54
+ end
55
+
56
+ private
57
+
58
+ def cell_value_type(type, format) # Maps a cell's "t" attribute (and its format) to a value-type Symbol.
59
+ case type
60
+ when 's'
61
+ :shared
62
+ when 'b'
63
+ :boolean
64
+ when 'str'
65
+ :string
66
+ when 'inlineStr'
67
+ :inlinestr
68
+ else
69
+ Excelx::Format.to_type(format) # any other (or missing) "t" value: the type is derived from the format
70
+ end
71
+ end
72
+
73
+ # Internal: Creates a cell based on an XML cell.
74
+ #
75
+ # cell_xml - a Nokogiri::XML::Element. e.g.
76
+ # <c r="A5" s="2">
77
+ # <v>22606</v>
78
+ # </c>
79
+ # hyperlink - a String for the hyperlink for the cell or nil when no
80
+ # hyperlink is present.
81
+ # coordinate - a Roo::Excelx::Coordinate for the coordinate for the cell
82
+ # or nil to extract coordinate from cell_xml.
83
+ # empty_cell - an Optional Boolean; when falsy, nil is returned for an empty cell (default: true).
84
+ #
85
+ # Examples
86
+ #
87
+ # cell_from_xml(<Nokogiri::XML::Element>, nil, nil)
88
+ # # => <Excelx::Cell::String>
89
+ #
90
+ # Returns a type of <Excelx::Cell>, or nil for an empty cell when empty_cell is falsy.
91
+ def cell_from_xml(cell_xml, hyperlink, coordinate, empty_cell=true)
92
+ coordinate ||= ::Roo::Utils.extract_coordinate(cell_xml["r"])
93
+ cell_xml_children = cell_xml.children
94
+ return create_empty_cell(coordinate, empty_cell) if cell_xml_children.empty?
95
+
96
+ # NOTE: This is error prone, to_i will silently turn a nil into a 0.
97
+ # This works by coincidence because Format[0] is General.
98
+ style = cell_xml["s"].to_i
99
+ formula = nil
100
+
101
+ cell_xml_children.each do |cell|
102
+ case cell.name
103
+ when 'is'
104
+ content = cell.search('t').map(&:content).join # concatenate the text of every 't' node under 'is'
105
+ unless content.empty? # an 'is' child with no text falls through to the remaining children
106
+ return Excelx::Cell.cell_class(:string).new(content, formula, style, hyperlink, coordinate)
107
+ end
108
+ when 'f'
109
+ formula = cell.content # remembered so that a later 'is' or 'v' child can attach it
110
+ when 'v'
111
+ format = style_format(style)
112
+ value_type = cell_value_type(cell_xml["t"], format)
113
+
114
+ return create_cell_from_value(value_type, cell, formula, format, style, hyperlink, coordinate)
115
+ end
116
+ end
117
+
118
+ create_empty_cell(coordinate, empty_cell) # no child produced a value
119
+ end
120
+
121
+ def create_empty_cell(coordinate, empty_cell) # Returns an Excelx::Cell::Empty, or nil when empty_cell is falsy.
122
+ if empty_cell
123
+ Excelx::Cell::Empty.new(coordinate)
124
+ end
125
+ end
126
+
127
+ def create_cell_from_value(value_type, cell, formula, format, style, hyperlink, coordinate) # Builds the Excelx::Cell subclass instance matching value_type.
128
+ # NOTE: format.to_s can replace excelx_type as an argument for
129
+ # Cell::Time, Cell::DateTime, Cell::Date or Cell::Number, but
130
+ # it will break some brittle tests.
131
+ excelx_type = [:numeric_or_formula, format.to_s]
132
+
133
+ # NOTE: There are only a few situations where value != cell.content
134
+ # 1. when a sharedString is used. value = sharedString;
135
+ # cell.content = id of sharedString
136
+ # 2. boolean cells: value = 'TRUE' | 'FALSE'; cell.content = '0' | '1';
137
+ # But a boolean cell should use TRUE|FALSE as the formatted value
138
+ # and use a Boolean for its value. Using a Boolean value breaks
139
+ # Roo::Base#to_csv.
140
+ # 3. formula
141
+ case value_type
142
+ when :shared
143
+ cell_content = cell.content.to_i # index into the shared strings table
144
+ value = shared_strings.use_html?(cell_content) ? shared_strings.to_html[cell_content] : shared_strings[cell_content]
145
+ Excelx::Cell.cell_class(:string).new(value, formula, style, hyperlink, coordinate)
146
+ when :boolean, :string
147
+ value = cell.content
148
+ Excelx::Cell.cell_class(value_type).new(value, formula, style, hyperlink, coordinate)
149
+ when :time, :datetime
150
+ cell_content = cell.content.to_f
151
+ # NOTE: A date will be a whole number. A time will be < 1. And
152
+ # in general, a datetime will have decimals. But if the cell is
153
+ # using a custom format, it's possible to be interpreted incorrectly.
154
+ # cell_content.to_i == cell_content && standard_style?=> :date
155
+ #
156
+ # Should check to see if the format is standard or not. If it's a
157
+ # standard format, then it's a date, otherwise, it is a datetime.
158
+ # @styles.standard_style?(style_id)
159
+ # STANDARD_STYLES.keys.include?(style_id.to_i)
160
+ cell_type = if cell_content < 1.0
161
+ :time
162
+ elsif (cell_content - cell_content.floor).abs > 0.000001 # has a fractional part beyond float noise
163
+ :datetime
164
+ else
165
+ :date
166
+ end
167
+ base_value = cell_type == :date ? base_date : base_timestamp
168
+ Excelx::Cell.cell_class(cell_type).new(cell_content, formula, excelx_type, style, hyperlink, base_value, coordinate)
169
+ when :date
170
+ Excelx::Cell.cell_class(:date).new(cell.content, formula, excelx_type, style, hyperlink, base_date, coordinate) # note: passes the raw content, not a Float
171
+ else
172
+ Excelx::Cell.cell_class(:number).new(cell.content, formula, excelx_type, style, hyperlink, coordinate) # every remaining value_type is built as a number cell
173
+ end
174
+ end
175
+
176
+ def extract_hyperlinks(relationships) # Builds a Hash of coordinate => target link from the sheet's hyperlink elements.
177
+ return {} unless (hyperlinks = doc.xpath('/worksheet/hyperlinks/hyperlink')) # NOTE(review): presumably xpath returns an empty node set rather than nil when nothing matches, so this guard may never fire - confirm
178
+
179
+ hyperlinks.each_with_object({}) do |hyperlink, hash|
180
+ if relationship = relationships[hyperlink['id']] # hyperlinks without a matching relationship are skipped
181
+ target_link = relationship['Target']
182
+ target_link += "##{hyperlink['location']}" if hyperlink['location'] # append the location as a "#..." suffix
183
+
184
+ Roo::Utils.coordinates_in_range(hyperlink["ref"].to_s) do |coord| # every coordinate yielded for the ref gets the same link
185
+ hash[coord] = target_link
186
+ end
187
+ end
188
+ end
189
+ end
190
+
191
+ def expand_merged_ranges(cells) # Mutates cells in place: each merged range shares the cell of its first ref.
192
+ # Extract merged ranges from xml
193
+ merges = {}
194
+ doc.xpath('/worksheet/mergeCells/mergeCell').each do |mergecell_xml|
195
+ src, dst = mergecell_xml["ref"].split(/:/).map { |ref| ::Roo::Utils.ref_to_key(ref) } # NOTE(review): a ref without ':' would leave dst nil - confirm such refs cannot occur
196
+ next unless cells[src] # nothing to copy when the first cell of the range was not extracted
197
+ for row in src[0]..dst[0] do
198
+ for col in src[1]..dst[1] do
199
+ next if row == src[0] && col == src[1] # the source cell itself is left untouched
200
+ merges[[row, col]] = src
201
+ end
202
+ end
203
+ end
204
+ # Duplicate value into all cells in merged range
205
+ merges.each do |dst, src|
206
+ cells[dst] = cells[src] # the same cell object is shared, not copied
207
+ end
208
+ end
209
+
210
+ def extract_cells(relationships) # Builds the coordinate => cell Hash for every 'c' element under sheetData.
211
+ extracted_cells = {}
212
+ empty_cell = @options[:empty_cell] # when falsy, empty cells come back as nil and are left out below
213
+
214
+ doc.xpath('/worksheet/sheetData/row/c').each do |cell_xml|
215
+ coordinate = ::Roo::Utils.extract_coordinate(cell_xml["r"])
216
+ cell = cell_from_xml(cell_xml, hyperlinks(relationships)[coordinate], coordinate, empty_cell)
217
+ extracted_cells[coordinate] = cell if cell
218
+ end
219
+
220
+ expand_merged_ranges(extracted_cells) if @options[:expand_merged_ranges] # optional: fill merged ranges in place
221
+
222
+ extracted_cells
223
+ end
224
+
225
+ def extract_dimensions # Returns the "ref" attribute of the first 'dimension' element found in the file at @path.
226
+ Roo::Utils.each_element(@path, 'dimension') do |dimension|
227
+ return dimension["ref"] # returns from the method on the first element yielded
228
+ end # NOTE(review): if nothing is yielded the method returns each_element's own result - confirm what that is
229
+ end
230
+
231
+ def style_format(style) # Looks up the format for a style index via the shared styles object.
232
+ @shared.styles.style_format(style)
233
+ end
234
+
235
+ def base_date # Delegates to @shared; passed to date cells as their base value.
236
+ @shared.base_date
237
+ end
238
+
239
+ def base_timestamp # Delegates to @shared; passed to time and datetime cells as their base value.
240
+ @shared.base_timestamp
241
+ end
242
+
243
+ def shared_strings # Delegates to @shared; used to resolve cells of type :shared.
244
+ @shared.shared_strings
245
+ end
246
+ end
247
+ end
248
+ end
@@ -0,0 +1,64 @@
1
+ require 'roo/font'
2
+ require 'roo/excelx/extractor'
3
+
4
+ module Roo
5
+ class Excelx
6
+ class Styles < Excelx::Extractor
7
+ # Convert an internal excelx style index into a number format.
8
+ def style_format(style)
9
+ id = num_fmt_ids[style.to_i] # numFmtId collected from the cellXfs children, looked up by position
10
+ num_fmts[id] || Excelx::Format::STANDARD_FORMATS[id.to_i] # formats declared in the file win; otherwise fall back to the standard table (a nil id becomes 0)
11
+ end
12
+
13
+ def definitions # Memoized list of fonts, one entry per cellXfs child (see extract_definitions).
14
+ @definitions ||= extract_definitions
15
+ end
16
+
17
+ private
18
+
19
+ def num_fmt_ids # Memoized Array of numFmtId attribute values.
20
+ @num_fmt_ids ||= extract_num_fmt_ids
21
+ end
22
+
23
+ def num_fmts # Memoized Hash of numFmtId => formatCode.
24
+ @num_fmts ||= extract_num_fmts
25
+ end
26
+
27
+ def fonts # Memoized Array of Font objects, in document order of the font elements.
28
+ @fonts ||= extract_fonts
29
+ end
30
+
31
+ def extract_definitions # Maps every child of every cellXfs element to the Font selected by its fontId.
32
+ doc.xpath('//cellXfs').flat_map do |xfs|
33
+ xfs.children.map do |xf|
34
+ fonts[xf['fontId'].to_i] # a child without a fontId attribute resolves to fonts[0], since nil.to_i is 0
35
+ end
36
+ end
37
+ end
38
+
39
+ def extract_fonts # Builds one Font per font element; each flag is true when the matching child element exists.
40
+ doc.xpath('//fonts/font').map do |font_el|
41
+ Font.new.tap do |font|
42
+ font.bold = !font_el.xpath('./b').empty?
43
+ font.italic = !font_el.xpath('./i').empty?
44
+ font.underline = !font_el.xpath('./u').empty?
45
+ end
46
+ end
47
+ end
48
+
49
+ def extract_num_fmt_ids # Collects the numFmtId attribute of every cellXfs child, in document order.
50
+ doc.xpath('//cellXfs').flat_map do |xfs|
51
+ xfs.children.map do |xf|
52
+ xf['numFmtId']
53
+ end
54
+ end.compact # children without a numFmtId attribute are dropped, which shifts the positions of later entries
55
+ end
56
+
57
+ def extract_num_fmts # Builds a Hash of numFmtId => formatCode from every numFmt element in the document.
58
+ doc.xpath('//numFmt').each_with_object({}) do |num_fmt, hash|
59
+ hash[num_fmt['numFmtId']] = num_fmt['formatCode'] # keys are the attribute values exactly as read from the XML
60
+ end
61
+ end
62
+ end
63
+ end
64
+ end
@@ -0,0 +1,63 @@
1
+ require 'roo/excelx/extractor'
2
+
3
+ module Roo
4
+ class Excelx
5
+ class Workbook < Excelx::Extractor
6
+ class Label # A defined name resolved to a sheet plus a row/column position.
7
+ attr_reader :sheet, :row, :col, :name
8
+
9
+ def initialize(name, sheet, row, col) # row is converted with to_i; col is converted from a column letter
10
+ @name = name
11
+ @sheet = sheet
12
+ @row = row.to_i
13
+ @col = ::Roo::Utils.letter_to_number(col) # stored as a number rather than the letter
14
+ end
15
+
16
+ def key # Returns the [row, col] pair.
17
+ [@row, @col]
18
+ end
19
+ end
20
+
21
+ def initialize(path) # Raises ArgumentError when doc_exists? is false for the given path.
22
+ super
23
+ fail ArgumentError, 'missing required workbook file' unless doc_exists?
24
+ end
25
+
26
+ def sheets # Returns every 'sheet' element in the workbook document (queried on each call, not memoized).
27
+ doc.xpath('//sheet')
28
+ end
29
+
30
+ # aka labels: returns a Hash of name => Label built from the definedName elements (rebuilt on every call).
31
+ def defined_names
32
+ doc.xpath('//definedName').each_with_object({}) do |defined_name, hash|
33
+ # "Sheet1!$C$5"
34
+ sheet, coordinates = defined_name.text.split('!$', 2) # NOTE(review): text without "!$" leaves coordinates nil, so the next line would raise NoMethodError - confirm whether such names can occur
35
+ col, row = coordinates.split('$') # "C$5" => col "C", row "5"
36
+ name = defined_name['name']
37
+ hash[name] = Label.new(name, sheet, row, col)
38
+ end
39
+ end
40
+
41
+ def base_timestamp # Memoized base_date converted to an Integer number of seconds via to_datetime.to_time.to_i.
42
+ @base_timestamp ||= base_date.to_datetime.to_time.to_i
43
+ end
44
+
45
+ def base_date # Memoized Date that the workbook's date values are measured from.
46
+ @base_date ||=
47
+ begin
48
+ # Default to 1900 (minus one day due to excel quirk) but use 1904 if
49
+ # it's set in the Workbook's workbookPr
50
+ # http://msdn.microsoft.com/en-us/library/ff530155(v=office.12).aspx
51
+ result = Date.new(1899, 12, 30) # default
52
+ doc.css('workbookPr[date1904]').each do |workbookPr|
53
+ if workbookPr['date1904'] =~ /true|1/i # NOTE(review): unanchored, so any value containing "true" or "1" matches - confirm this is intended
54
+ result = Date.new(1904, 01, 01)
55
+ break
56
+ end
57
+ end
58
+ result
59
+ end
60
+ end
61
+ end
62
+ end
63
+ end