rbxl 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,315 @@
1
module Rbxl
  # Read-modify-save workbook for surgical edits to an existing +.xlsx+.
  #
  # The design promise mirrors +rbpptx+: <em>what we don't understand, we
  # don't touch</em>. The package is opened as a ZIP, each part you mutate is
  # re-serialized, and every other entry — styles, drawings, charts, comments,
  # pivot caches, custom XML, untouched worksheets — round-trips byte-for-byte
  # via +Zip::OutputStream#copy_raw_entry+. Inside a worksheet you do edit,
  # only the specific +<c>+ element you target is rewritten; surrounding
  # cells, the row's other attributes, +<mergeCells>+,
  # +<conditionalFormatting>+, +<dataValidations>+, and any unknown OOXML
  # extensions remain in place. The cell's existing +s+ (style index)
  # attribute is preserved, so template number formats, fonts, and fills
  # carry through to the new value.
  #
  # The editable mode is the right tool for template-style fill-ins: open a
  # template with named cells, write a handful of values, save back. It is
  # explicitly <em>not</em> the right tool for rewriting the data area of a
  # large worksheet — the touched sheet is parsed as a Nokogiri DOM, so peak
  # memory scales with that sheet's on-disk size. Use the write-only mode
  # (+Rbxl.new+) for that case instead.
  #
  # == Out of scope (1.4.0)
  #
  # * inserting / deleting / reordering / duplicating sheets
  # * editing styles, formulas, named ranges, drawings, or shared strings
  # * +Date+ / +Time+ / +DateTime+ values (raise {EditableCellTypeError};
  #   convert to a numeric serial yourself if you need a date cell)
  # * recomputing the worksheet +<dimension>+ when a write expands the bounds
  #
  # == Strings on write
  #
  # Cells written through this mode become inline strings
  # (+t="inlineStr"+), so +xl/sharedStrings.xml+ is never mutated. Existing
  # +t="s"+ cells you don't touch keep resolving through the SST as usual;
  # only cells you actually overwrite drop their SST reference.
  class EditableWorkbook
    # Namespace for the main SpreadsheetML schema.
    MAIN_NS = "http://schemas.openxmlformats.org/spreadsheetml/2006/main".freeze

    # Namespace used for document-level relationships.
    REL_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships".freeze

    # Namespace used by the OPC package relationships layer.
    PACKAGE_REL_NS = "http://schemas.openxmlformats.org/package/2006/relationships".freeze

    # Relationship type identifying the workbook part inside +_rels/.rels+.
    OFFICE_DOC_REL_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument".freeze

    # First eight bytes of an OLE compound file (legacy +.xls+ container).
    OLE_CFB_MAGIC = "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1".b.freeze
    private_constant :OLE_CFB_MAGIC

    # Signature of a ZIP local file header, expected at offset 0 of an +.xlsx+.
    ZIP_LOCAL_MAGIC = "PK\x03\x04".b.freeze
    private_constant :ZIP_LOCAL_MAGIC

    # @return [String] filesystem path the workbook was opened from
    attr_reader :path

    # @return [Array<String>] visible sheet names in workbook order
    attr_reader :sheet_names

    # Convenience constructor equivalent to +new(path)+. When a block is
    # given, the workbook is yielded and {#close} is called automatically
    # when the block returns or raises.
    #
    # @param path [String, #to_path]
    # @yieldparam book [Rbxl::EditableWorkbook]
    # @return [Rbxl::EditableWorkbook, Object] the workbook when no block is
    #   given, otherwise the block's return value
    def self.open(path)
      book = new(path)
      return book unless block_given?

      begin
        yield book
      ensure
        book.close
      end
    end

    # Opens the package, validates the format, and indexes worksheet parts
    # by visible sheet name. Worksheet XML is not parsed until the caller
    # touches that sheet via {#sheet}.
    #
    # @param path [String, #to_path] path to the +.xlsx+ file
    # @raise [Rbxl::UnsupportedFormatError] if the file is not a valid
    #   +.xlsx+ container (e.g. a legacy +.xls+, or non-ZIP bytes)
    # @raise [Rbxl::WorkbookFormatError] if +xl/workbook.xml+ or its rels are
    #   missing, malformed, or internally inconsistent
    def initialize(path)
      @path = coerce_path(path)
      ensure_xlsx_format!(@path)
      @zip = Zip::File.open(@path)
      @closed = false
      @workbook_part = locate_workbook_part
      @workbook_dir = File.dirname(@workbook_part)
      @sheet_entries = load_sheet_entries
      @sheet_names = @sheet_entries.keys.freeze
      @shared_strings = nil # loaded lazily by #shared_strings
      @sheets_by_name = {}
    end

    # Returns the editable worksheet for +name_or_index+. Repeated calls for
    # the same sheet return the same in-memory object so edits accumulate
    # across calls before {#save}.
    #
    # @param name_or_index [String, Integer] visible sheet name as listed in
    #   {#sheet_names}, or an integer index (negatives count from the end)
    # @return [Rbxl::EditableWorksheet]
    # @raise [Rbxl::SheetNotFoundError] if +name_or_index+ does not resolve
    # @raise [Rbxl::ClosedWorkbookError] if the workbook has been closed
    def sheet(name_or_index)
      ensure_open!

      name = resolve_sheet_name(name_or_index)
      @sheets_by_name[name] ||= EditableWorksheet.new(
        zip: @zip,
        entry_path: @sheet_entries.fetch(name) { raise SheetNotFoundError, "sheet not found: #{name}" },
        workbook_path: @path,
        shared_strings: shared_strings,
        name: name
      )
    end

    # Iterates worksheets in workbook order. Worksheets are constructed on
    # demand and memoized, so iterating then editing is consistent with
    # {#sheet}.
    #
    # @yieldparam worksheet [Rbxl::EditableWorksheet]
    # @return [Enumerator<Rbxl::EditableWorksheet>] when no block is given
    # @raise [Rbxl::ClosedWorkbookError] if the workbook has been closed
    def sheets
      ensure_open!
      return enum_for(:sheets) unless block_given?

      @sheet_names.each { |name| yield sheet(name) }
    end

    # Writes the workbook out, preserving every part that has not been
    # mutated byte-for-byte. Worksheets whose cells have been edited are
    # re-serialized from their in-memory Nokogiri document; all other
    # entries (styles, sharedStrings, drawings, charts, pivot caches,
    # custom XML, rels) are streamed straight from the source ZIP without
    # re-parsing.
    #
    # +path+ defaults to the original load path; passing +nil+ or omitting
    # it saves in place. The new file is written to a temp file in the same
    # directory and renamed into place, so a failed write never leaves a
    # half-written workbook at the destination. On success, dirty flags on
    # each touched worksheet are cleared.
    #
    # NOTE(review): after saving over the file this workbook was opened
    # from, +@zip+ still holds the entry table read from the previous file
    # contents. Confirm that rubyzip resolves entries correctly in that
    # state before relying on lazy sheet loads or a second {#save} after an
    # in-place save.
    #
    # @param path [String, #to_path, nil] destination path; defaults to the
    #   path the workbook was opened from
    # @return [String] the path that was written
    # @raise [Rbxl::ClosedWorkbookError] if the workbook has been closed
    def save(path = nil)
      ensure_open!
      out_path = path ? coerce_path(path) : @path
      overrides = collect_overrides

      # Same directory as the destination so the rename stays on one filesystem.
      tmp_path = "#{out_path}.rbxl-tmp.#{Process.pid}.#{rand(1 << 32).to_s(16)}"
      begin
        Zip::OutputStream.open(tmp_path) do |out|
          @zip.each do |entry|
            next if entry.directory?

            if (override_xml = overrides[entry.name])
              out.put_next_entry(entry.name)
              out.write(override_xml)
            else
              out.copy_raw_entry(entry)
            end
          end
        end
        File.rename(tmp_path, out_path)
      rescue StandardError
        # Never leave the partial temp file behind; re-raise the real cause.
        File.unlink(tmp_path) if File.exist?(tmp_path)
        raise
      end

      @sheets_by_name.each_value(&:clear_dirty!)
      out_path
    end

    # Releases the underlying ZIP file. Idempotent.
    #
    # @return [Boolean] +true+ on the first call, +false+ on subsequent calls
    def close
      return false if @closed

      @zip&.close
      @zip = nil
      @closed = true
      true
    end

    # @return [Boolean] whether {#close} has been called
    def closed?
      @closed
    end

    private

    # Converts a path-like argument to a String. Objects that implement
    # +#to_path+ (+Pathname+, +File+, +Tempfile+) are asked for their path
    # instead of their +#to_s+, which for IO objects is an inspect-style
    # string rather than a filesystem path.
    def coerce_path(path)
      path.respond_to?(:to_path) ? path.to_path : path.to_s
    end

    def ensure_open!
      raise ClosedWorkbookError, "workbook has been closed" if @closed
    end

    # Maps an Integer index onto a sheet name; any other key is returned
    # unchanged and validated by the caller's lookup.
    def resolve_sheet_name(key)
      return key unless key.is_a?(Integer)

      name = @sheet_names[key]
      return name if name

      raise SheetNotFoundError, "sheet index out of range: #{key} (#{@sheet_names.length} sheet(s))"
    end

    # Sniffs the first eight bytes of +path+ and raises
    # {UnsupportedFormatError} unless they start with a ZIP local-header
    # signature. Legacy OLE/CFB files get a dedicated message.
    def ensure_xlsx_format!(path)
      header = begin
        File.binread(path, 8)
      rescue Errno::ENOENT, Errno::EISDIR, Errno::EACCES => e
        raise UnsupportedFormatError, "#{path}: #{e.message}"
      end

      raise UnsupportedFormatError, "#{path}: file is empty or unreadable" if header.nil? || header.empty?
      return if header.start_with?(ZIP_LOCAL_MAGIC)

      if header.start_with?(OLE_CFB_MAGIC)
        raise UnsupportedFormatError,
              "#{path} looks like a legacy .xls (BIFF/CFB). " \
              "rbxl supports .xlsx (OOXML) only; convert first, e.g. " \
              "`libreoffice --headless --convert-to xlsx #{File.basename(path.to_s)}`."
      end

      raise UnsupportedFormatError,
            "#{path} is not a valid .xlsx (no ZIP signature at offset 0)."
    end

    # Reads +_rels/.rels+ and returns the ZIP entry name of the workbook
    # part (the target of the officeDocument relationship).
    def locate_workbook_part
      doc = parse_xml("_rels/.rels")
      rel = doc.at_xpath(
        "/pkg:Relationships/pkg:Relationship[@Type=$type]",
        { "pkg" => PACKAGE_REL_NS },
        { "type" => OFFICE_DOC_REL_TYPE }
      )
      raise WorkbookFormatError, "#{@path}: officeDocument relationship missing from _rels/.rels" unless rel

      target = rel["Target"]
      raise WorkbookFormatError, "#{@path}: officeDocument relationship has no Target" unless target

      target.sub(%r{\A/}, "")
    end

    # Builds the ordered +{sheet name => ZIP entry path}+ index from the
    # workbook part and its relationships. +<sheet>+ elements lacking a name
    # or relationship id are skipped.
    def load_sheet_entries
      rels = parse_rels(rels_path_for(@workbook_part))
      doc = parse_xml(@workbook_part)
      sheet_nodes = doc.xpath("/main:workbook/main:sheets/main:sheet", "main" => MAIN_NS)

      sheet_nodes.each_with_object({}) do |sheet_node, entries|
        name = sheet_node["name"]
        rid = sheet_node.attribute_with_ns("id", REL_NS)&.value
        next unless name && rid

        target = rels.fetch(rid) do
          raise WorkbookFormatError,
                "workbook #{@path} references missing relationship #{rid.inspect} for sheet #{name.inspect}"
        end
        # A <Relationship> without a Target attribute would otherwise crash
        # with NoMethodError inside resolve_relative.
        unless target
          raise WorkbookFormatError,
                "workbook #{@path}: relationship #{rid.inspect} for sheet #{name.inspect} has no Target"
        end

        entries[name] = resolve_relative(@workbook_dir, target)
      end
    end

    # Shared strings table, loaded on first use.
    def shared_strings
      @shared_strings ||= SharedStringsLoader.load(@zip)
    end

    # @return [Hash{String => String}] serialized XML keyed by ZIP entry
    #   name, for every memoized worksheet that reports itself dirty
    def collect_overrides
      @sheets_by_name.each_with_object({}) do |(_, worksheet), overrides|
        overrides[worksheet.entry_path] = worksheet.to_xml if worksheet.dirty?
      end
    end

    # Parses a required package part.
    #
    # @raise [Rbxl::WorkbookFormatError] if the part is missing or malformed
    def parse_xml(part_name)
      entry = @zip.find_entry(part_name)
      raise WorkbookFormatError, "#{@path}: missing part #{part_name}" unless entry

      parse_entry(entry, part_name)
    end

    # Parses an optional relationships part into +{Id => Target}+. A missing
    # part yields an empty hash; a malformed one raises, so a broken rels
    # file is reported as such instead of as a "missing relationship" later.
    #
    # @raise [Rbxl::WorkbookFormatError] if the part exists but is malformed
    def parse_rels(rels_part)
      entry = @zip.find_entry(rels_part)
      return {} unless entry

      doc = parse_entry(entry, rels_part)
      relationships = doc.xpath("/pkg:Relationships/pkg:Relationship", "pkg" => PACKAGE_REL_NS)
      relationships.each_with_object({}) do |rel, targets|
        targets[rel["Id"]] = rel["Target"]
      end
    end

    # Reads +entry+ and parses it with Nokogiri, raising on the first
    # reported parse error.
    def parse_entry(entry, part_name)
      doc = Nokogiri::XML(entry.get_input_stream.read)
      raise WorkbookFormatError, "#{@path}: #{part_name}: #{doc.errors.first}" unless doc.errors.empty?

      doc
    end

    # Returns the relationships part that belongs to +part_name+, e.g.
    # +xl/workbook.xml+ → +xl/_rels/workbook.xml.rels+.
    def rels_path_for(part_name)
      dir = File.dirname(part_name)
      base = File.basename(part_name)
      dir == "." ? "_rels/#{base}.rels" : "#{dir}/_rels/#{base}.rels"
    end

    # Resolves a relationship +target+ against +base_dir+ and returns a ZIP
    # entry name (no leading slash). Targets starting with "/" are
    # package-absolute and only lose that slash.
    #
    # The path is normalized segment by segment rather than through
    # +File.expand_path+: that method treats a leading "~" as a home
    # directory reference and prepends a drive letter on Windows, neither of
    # which is meaningful for names inside a ZIP package. ".." segments that
    # would climb above the package root are dropped.
    def resolve_relative(base_dir, target)
      return target.sub(%r{\A/}, "") if target.start_with?("/")

      segments = "#{base_dir}/#{target}".split("/").each_with_object([]) do |segment, resolved|
        case segment
        when "", "." then next
        when ".." then resolved.pop
        else resolved << segment
        end
      end
      segments.join("/")
    end
  end
end
@@ -0,0 +1,216 @@
1
module Rbxl
  # A single worksheet inside an {EditableWorkbook}.
  #
  # The worksheet's XML payload is parsed lazily — the first cell lookup
  # triggers a single Nokogiri DOM parse of the sheet entry, and subsequent
  # edits mutate that in-memory tree. Worksheets that are never touched are
  # never parsed; on save they pass through the ZIP unchanged.
  #
  # Cell access is openpyxl-style:
  #
  #   sheet["B5"].value = "company name"
  #   sheet.cell("B5").value # => "company name"
  #
  # See {EditableWorkbook} for the design contract these edits live inside.
  class EditableWorksheet
    # Namespace for the main SpreadsheetML schema.
    MAIN_NS = "http://schemas.openxmlformats.org/spreadsheetml/2006/main".freeze

    # Upper-case +A1+-style reference: column letters, then a row number
    # without leading zeros.
    COORDINATE_RE = /\A([A-Z]+)([1-9]\d*)\z/.freeze
    private_constant :COORDINATE_RE

    # Last column (+XFD+) and last row of the SpreadsheetML grid. Cells
    # created beyond these limits yield a workbook that spreadsheet
    # applications reject.
    MAX_COLUMN_INDEX = 16_384
    MAX_ROW_INDEX = 1_048_576
    private_constant :MAX_COLUMN_INDEX, :MAX_ROW_INDEX

    # @return [String] visible sheet name
    attr_reader :name

    # @return [String] ZIP entry path of the worksheet's XML part
    attr_reader :entry_path

    # @param zip [Zip::File] open archive shared with the workbook
    # @param entry_path [String] ZIP entry path for this sheet's XML
    # @param workbook_path [String] filesystem path the workbook was opened from
    # @param shared_strings [Array<String>] pre-decoded shared strings table
    # @param name [String] visible sheet name
    def initialize(zip:, entry_path:, workbook_path:, shared_strings:, name:)
      @zip = zip
      @entry_path = entry_path
      @workbook_path = workbook_path
      @shared_strings = shared_strings
      @name = name
      # Populated together by ensure_doc_loaded! on first cell access.
      @doc = nil
      @sheet_data = nil
      @row_index = nil
      @dirty = false
    end

    # Returns the {EditableCell} view for +coordinate+. Cells not present in
    # the sheet's XML are addressable too — reading their value yields +nil+,
    # writing creates the +<c>+ (and its enclosing +<row>+ if needed) in
    # column-sorted position. Repeated calls for the same coordinate may
    # return different {EditableCell} objects but the underlying XML is the
    # same, so reads are consistent.
    #
    # @param coordinate [String] Excel-style coordinate (e.g. +"A1"+, +"B5"+);
    #   lower-case letters are accepted and upcased
    # @return [Rbxl::EditableCell]
    # @raise [ArgumentError] if +coordinate+ is not a valid +A1+-style ref
    def cell(coordinate)
      EditableCell.new(worksheet: self, coordinate: normalize_coordinate(coordinate))
    end

    alias [] cell

    # @return [Boolean] whether any cell on this sheet has been mutated since
    #   load (or since the last successful save)
    def dirty?
      @dirty
    end

    # Marks the sheet dirty. Called by {EditableCell#value=}; not part of
    # the public API.
    #
    # @api private
    def mark_dirty!
      @dirty = true
    end

    # Clears the dirty flag; {EditableWorkbook#save} calls this after a
    # successful write.
    #
    # @api private
    def clear_dirty!
      @dirty = false
    end

    # Serializes the worksheet document, parsing it first if it has not
    # been loaded yet.
    #
    # @return [String] the worksheet's XML, reflecting any in-memory edits
    def to_xml
      ensure_doc_loaded!
      @doc.to_xml
    end

    # Resolves a shared-string index against the table loaded from
    # +xl/sharedStrings.xml+. Used by {EditableCell} when decoding +t="s"+
    # cells.
    #
    # Negative indexes return +nil+, like any other out-of-range index,
    # instead of wrapping around to the end of the table the way
    # +Array#[]+ does.
    #
    # @api private
    # @param index [Integer] zero-based shared string index
    # @return [String, nil]
    def shared_string_at(index)
      return nil if index.is_a?(Integer) && index.negative?

      @shared_strings[index]
    end

    # Locates the +<c>+ node for +coordinate+. With +create: true+ the
    # node — and its enclosing +<row>+ — are inserted in sorted position
    # when missing. Returns +nil+ when +create+ is false and the cell does
    # not exist.
    #
    # @api private
    # @param coordinate [String] normalized (upper-case) +A1+-style ref
    # @param create [Boolean] whether to insert a missing cell
    # @return [Nokogiri::XML::Node, nil]
    # @raise [ArgumentError] if +coordinate+ is malformed, or if a new cell
    #   would have to be created outside the +XFD1048576+ grid limit
    def find_or_create_cell_node(coordinate, create:)
      ensure_doc_loaded!
      col, row = parse_coordinate(coordinate)
      raise ArgumentError, "invalid coordinate: #{coordinate.inspect}" unless col && row

      # Look up first without creating anything, so cells already present
      # in the file stay reachable regardless of the bounds check below.
      row_node = find_or_create_row(row, create: false)
      cell_node = row_node&.element_children&.find { |child| child["r"] == coordinate }
      return cell_node if cell_node
      return nil unless create

      ensure_within_grid!(coordinate, col, row)
      row_node ||= find_or_create_row(row, create: true)
      insert_cell_in_order(row_node, coordinate, col)
    end

    # Returns the document for in-place mutation. Loads the XML on first
    # access.
    #
    # @api private
    def document
      ensure_doc_loaded!
      @doc
    end

    private

    # Parses the sheet entry once and caches the document, its
    # +<sheetData>+ node, and a row-number → +<row>+ index. Rows without an
    # +r+ attribute are left out of the index.
    #
    # @raise [Rbxl::WorksheetFormatError] if the entry is missing, is not
    #   well-formed XML, or has no +<sheetData>+
    def ensure_doc_loaded!
      return if @doc

      entry = @zip.find_entry(@entry_path)
      unless entry
        raise WorksheetFormatError,
              "worksheet #{@name.inspect} is missing XML entry #{@entry_path.inspect} in #{@workbook_path}"
      end

      parsed = Nokogiri::XML(entry.get_input_stream.read)
      unless parsed.errors.empty?
        raise WorksheetFormatError,
              "invalid worksheet XML for sheet #{@name.inspect} in #{@workbook_path}: #{parsed.errors.first}"
      end

      sheet_data = parsed.at_xpath("/main:worksheet/main:sheetData", "main" => MAIN_NS)
      unless sheet_data
        raise WorksheetFormatError,
              "worksheet #{@name.inspect} in #{@workbook_path} is missing <sheetData>"
      end

      # Assign state only after every validation has passed, so a failed
      # load leaves the worksheet unloaded.
      @doc = parsed
      @sheet_data = sheet_data
      @row_index = sheet_data.xpath("./main:row", "main" => MAIN_NS).each_with_object({}) do |row, index|
        number = row["r"]&.to_i
        index[number] = row if number
      end
    end

    # Rejects coordinates outside the SpreadsheetML grid before a new node
    # is inserted for them.
    def ensure_within_grid!(coordinate, col, row)
      return if col <= MAX_COLUMN_INDEX && row <= MAX_ROW_INDEX

      raise ArgumentError,
            "coordinate out of range: #{coordinate.inspect} (last cell is XFD#{MAX_ROW_INDEX})"
    end

    # Looks up the +<row>+ for +row_num+ in the index, inserting and
    # indexing a new one when +create+ is true.
    def find_or_create_row(row_num, create:)
      existing = @row_index[row_num]
      return existing if existing
      return nil unless create

      @row_index[row_num] = insert_row_in_order(@sheet_data, row_num)
    end

    # Insertion is done by parsing an XML fragment in the parent's context
    # so the new element inherits the SpreadsheetML default namespace
    # binding from its surroundings rather than landing in +xmlns=""+ jail.
    def insert_row_in_order(parent, row_num)
      # First existing row with a larger number; rows without +r+ count as 0.
      following = parent.element_children.find do |child|
        child.name == "row" && (child["r"]&.to_i || 0) > row_num
      end
      xml = %(<row r="#{row_num}"/>)
      added = following ? following.add_previous_sibling(xml) : parent.add_child(xml)
      first_node(added)
    end

    # Inserts an empty +<c>+ for +coordinate+ before the first existing
    # cell with a higher column, or at the end of the row. Same
    # fragment-in-context technique as {#insert_row_in_order}.
    def insert_cell_in_order(parent, coordinate, col_index)
      following = parent.element_children.find do |child|
        next false unless child.name == "c"

        child_col, = parse_coordinate(child["r"])
        child_col && child_col > col_index
      end
      xml = %(<c r="#{coordinate}"/>)
      added = following ? following.add_previous_sibling(xml) : parent.add_child(xml)
      first_node(added)
    end

    # Fragment insertion can hand back either a node or a node set;
    # normalize to the single inserted node.
    def first_node(result)
      result.is_a?(Nokogiri::XML::NodeSet) ? result.first : result
    end

    # Upcases and validates a caller-supplied coordinate.
    #
    # @raise [ArgumentError] if +coordinate+ is nil or not +A1+-style
    def normalize_coordinate(coordinate)
      raise ArgumentError, "coordinate cannot be nil" if coordinate.nil?

      str = coordinate.to_s.upcase
      raise ArgumentError, "invalid coordinate: #{coordinate.inspect}" unless str.match?(COORDINATE_RE)

      str
    end

    # Splits an upper-case +A1+ reference into +[column_index, row_number]+
    # (both 1-based), or +[nil, nil]+ when it is nil or does not match.
    def parse_coordinate(coordinate)
      return [nil, nil] unless coordinate

      m = coordinate.match(COORDINATE_RE)
      return [nil, nil] unless m

      [column_index(m[1]), m[2].to_i]
    end

    # Converts column letters to a 1-based index (+"A"+ → 1, +"AA"+ → 27)
    # by reading them as a base-26 number; 64 is the byte just before "A".
    def column_index(label)
      label.each_byte.reduce(0) { |col, byte| (col * 26) + (byte - 64) }
    end
  end
end
data/lib/rbxl/errors.rb CHANGED
@@ -34,6 +34,12 @@ module Rbxl
34
34
  # worksheets are stopped mid-inflate rather than after the fact.
35
35
  class WorksheetTooLargeError < Error; end
36
36
 
37
+ # Raised by {Rbxl.open} when the file is not a valid +.xlsx+ container.
38
+ # Most commonly fires on legacy +.xls+ (BIFF/CFB) files — the message
39
+ # names the detected format and suggests a conversion path rather than
40
+ # letting the underlying ZIP parser surface an opaque error.
41
+ class UnsupportedFormatError < Error; end
42
+
37
43
  # Raised when workbook-level XML is malformed or internally inconsistent,
38
44
  # for example when +xl/workbook.xml+ cannot be parsed or references a
39
45
  # missing relationship target.
@@ -46,4 +52,10 @@ module Rbxl
46
52
  # workbook path, sheet name, and cell coordinate to make bad inputs easy
47
53
  # to locate.
48
54
  class CellValueError < WorksheetFormatError; end
55
+
56
+ # Raised by {Rbxl::EditableCell#value=} when the assigned Ruby object is
57
+ # not one of the supported types (+nil+, +String+, +Integer+, +Float+,
58
+ # +true+, +false+). +Date+/+Time+ values raise this error too — see
59
+ # {Rbxl::EditableCell} for the rationale.
60
+ class EditableCellTypeError < Error; end
49
61
  end