RubyGems - culturecode-roo - Versions diffs - 2.0.1 → 2.0.2 - Mend

culturecode-roo 2.0.1 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87) hide show

data/.gitignore +1 -0
data/CHANGELOG.md +513 -0
data/README.md +206 -73
data/lib/roo.rb +3 -3
data/lib/roo/base.rb +49 -33
data/lib/roo/csv.rb +10 -0
data/lib/roo/excelx.rb +187 -60
data/lib/roo/excelx/comments.rb +2 -1
data/lib/roo/excelx/sheet_doc.rb +30 -3
data/lib/roo/open_office.rb +250 -221
data/lib/roo/utils.rb +28 -31
data/lib/roo/version.rb +1 -1
data/roo.gemspec +10 -12
data/spec/lib/roo/csv_spec.rb +14 -0
data/spec/lib/roo/excelx_spec.rb +90 -2
data/spec/lib/roo/libreoffice_spec.rb +16 -0
data/spec/lib/roo/openoffice_spec.rb +11 -0
data/spec/lib/roo/utils_spec.rb +5 -4
data/test/test_roo.rb +113 -2
metadata +29 -180
data/CHANGELOG +0 -438
data/scripts/txt2html +0 -67
data/test/files/1900_base.xlsx +0 -0
data/test/files/1904_base.xlsx +0 -0
data/test/files/Bibelbund.csv +0 -3741
data/test/files/Bibelbund.ods +0 -0
data/test/files/Bibelbund.xlsx +0 -0
data/test/files/Bibelbund1.ods +0 -0
data/test/files/Pfand_from_windows_phone.xlsx +0 -0
data/test/files/advanced_header.ods +0 -0
data/test/files/bbu.ods +0 -0
data/test/files/bbu.xlsx +0 -0
data/test/files/bode-v1.ods.zip +0 -0
data/test/files/bode-v1.xls.zip +0 -0
data/test/files/boolean.csv +0 -2
data/test/files/boolean.ods +0 -0
data/test/files/boolean.xlsx +0 -0
data/test/files/borders.ods +0 -0
data/test/files/borders.xlsx +0 -0
data/test/files/bug-numbered-sheet-names.xlsx +0 -0
data/test/files/comments.ods +0 -0
data/test/files/comments.xlsx +0 -0
data/test/files/csvtypes.csv +0 -1
data/test/files/datetime.ods +0 -0
data/test/files/datetime.xlsx +0 -0
data/test/files/dreimalvier.ods +0 -0
data/test/files/emptysheets.ods +0 -0
data/test/files/emptysheets.xlsx +0 -0
data/test/files/encrypted-letmein.ods +0 -0
data/test/files/file_item_error.xlsx +0 -0
data/test/files/formula.ods +0 -0
data/test/files/formula.xlsx +0 -0
data/test/files/formula_string_error.xlsx +0 -0
data/test/files/html-escape.ods +0 -0
data/test/files/link.csv +0 -1
data/test/files/link.xlsx +0 -0
data/test/files/matrix.ods +0 -0
data/test/files/named_cells.ods +0 -0
data/test/files/named_cells.xlsx +0 -0
data/test/files/no_spreadsheet_file.txt +0 -1
data/test/files/numbers-export.xlsx +0 -0
data/test/files/numbers1.csv +0 -18
data/test/files/numbers1.ods +0 -0
data/test/files/numbers1.xlsx +0 -0
data/test/files/numbers1withnull.xlsx +0 -0
data/test/files/numeric-link.xlsx +0 -0
data/test/files/only_one_sheet.ods +0 -0
data/test/files/only_one_sheet.xlsx +0 -0
data/test/files/paragraph.ods +0 -0
data/test/files/paragraph.xlsx +0 -0
data/test/files/ric.ods +0 -0
data/test/files/sheet1.xml +0 -109
data/test/files/simple_spreadsheet.ods +0 -0
data/test/files/simple_spreadsheet.xlsx +0 -0
data/test/files/simple_spreadsheet_from_italo.ods +0 -0
data/test/files/so_datetime.csv +0 -8
data/test/files/style.ods +0 -0
data/test/files/style.xlsx +0 -0
data/test/files/time-test.csv +0 -2
data/test/files/time-test.ods +0 -0
data/test/files/time-test.xlsx +0 -0
data/test/files/type_excel.ods +0 -0
data/test/files/type_excel.xlsx +0 -0
data/test/files/type_excelx.ods +0 -0
data/test/files/type_openoffice.xlsx +0 -0
data/test/files/whitespace.ods +0 -0
data/test/files/whitespace.xlsx +0 -0

data/lib/roo/csv.rb CHANGED

@@ -107,4 +107,14 @@ class Roo::CSV < Roo::Base
       @last_column[sheet] -= 1
     end
   end
+  # Runs every String value stored in @cell through sanitize_value and then
+  # records the sheet as cleaned.
+  #
+  # sheet - the sheet identifier handed to read_cells and used as the key
+  #         in @cleaned.
+  def clean_sheet(sheet)
+    # NOTE(review): read_cells presumably populates @cell for this sheet
+    # before it is walked below; confirm against its definition.
+    read_cells(sheet)
+    @cell.each_pair do |coord, value|
+      # Non-String values are left exactly as they are.
+      @cell[coord] = sanitize_value(value) if value.is_a?(::String)
+    end
+    @cleaned[sheet] = true
+  end
 end

data/lib/roo/excelx.rb CHANGED

@@ -78,6 +78,7 @@ class Roo::Excelx < Roo::Base
   class Cell
     attr_reader :type, :formula, :value, :excelx_type, :excelx_value, :style, :hyperlink, :coordinate
+    attr_writer :value
     def initialize(value, type, formula, excelx_type, excelx_value, style, hyperlink, base_date, coordinate)
       @type = type
@@ -145,12 +146,12 @@ class Roo::Excelx < Roo::Base
   end
   class Sheet
-    def initialize(name, rels_path, sheet_path, comments_path, styles, shared_strings, workbook)
+    def initialize(name, rels_path, sheet_path, comments_path, styles, shared_strings, workbook, options = {})
       @name = name
       @rels = Relationships.new(rels_path)
       @comments = Comments.new(comments_path)
       @styles = styles
-      @sheet = SheetDoc.new(sheet_path, @rels, @styles, shared_strings, workbook)
+      @sheet = SheetDoc.new(sheet_path, @rels, @styles, shared_strings, workbook, options)
     end
     def cells
@@ -162,13 +163,16 @@ class Roo::Excelx < Roo::Base
     end
     # Yield each row as array of Excelx::Cell objects
-    # accepts options max_rows (int) (offset by 1 for header)
-    # and pad_cells (boolean)
+    # accepts options max_rows (int) (offset by 1 for header),
+    # pad_cells (boolean) and offset (int)
     def each_row(options = {}, &block)
       row_count = 0
+      options[:offset] ||= 0
       @sheet.each_row_streaming do |row|
-        break if options[:max_rows] && row_count == options[:max_rows] + 1
-        block.call(cells_for_row_element(row, options)) if block_given?
+        break if options[:max_rows] && row_count == options[:max_rows] + options[:offset] + 1
+        if block_given? && !(options[:offset] && row_count < options[:offset])
+          block.call(cells_for_row_element(row, options))
+        end
         row_count += 1
       end
     end
@@ -187,25 +191,26 @@ class Roo::Excelx < Roo::Base
     # returns the number of the first non-empty row
     def first_row
-      @first_row ||= present_cells.keys.map {|row, col| row }.min
+      @first_row ||= present_cells.keys.map {|row, _| row }.min
     end
     def last_row
-      @last_row ||= present_cells.keys.map {|row, col| row }.max
+      @last_row ||= present_cells.keys.map {|row, _| row }.max
     end
     # returns the number of the first non-empty column
-    def first_column(sheet=nil)
-      @first_column ||= present_cells.keys.map {|row, col| col }.min
+    def first_column
+      @first_column ||= present_cells.keys.map {|_, col| col }.min
     end
     # returns the number of the last non-empty column
-    def last_column(sheet=nil)
-      @last_column ||= present_cells.keys.map {|row, col| col }.max
+    def last_column
+      @last_column ||= present_cells.keys.map {|_, col| col }.max
     end
     def excelx_format(key)
-      @styles.style_format(cells[key].style).to_s
+      cell = cells[key]
+      @styles.style_format(cell.style).to_s if cell
     end
     def hyperlinks
@@ -250,23 +255,32 @@ class Roo::Excelx < Roo::Base
   # values for packed: :zip
   # optional cell_max (int) parameter for early aborting attempts to parse
   # enormous documents.
-  def initialize(filename, options = {})
+  def initialize(filename_or_stream, options = {})
     packed = options[:packed]
     file_warning = options.fetch(:file_warning, :error)
     cell_max = options.delete(:cell_max)
+    sheet_options = {}
+    sheet_options[:expand_merged_ranges] = (options[:expand_merged_ranges] || false)
-    file_type_check(filename,'.xlsx','an Excel-xlsx', file_warning, packed)
+    unless is_stream?(filename_or_stream)
+      file_type_check(filename_or_stream,'.xlsx','an Excel-xlsx', file_warning, packed)
+      basename = File.basename(filename_or_stream)
+    end
-    @tmpdir = make_tmpdir(filename.split('/').last, options[:tmpdir_root])
-    @filename = local_filename(filename, @tmpdir, packed)
+    @tmpdir = make_tmpdir(basename, options[:tmpdir_root])
+    @filename = local_filename(filename_or_stream, @tmpdir, packed)
     @comments_files = []
     @rels_files = []
-    process_zipfile(@tmpdir, @filename)
+    process_zipfile(@filename || filename_or_stream)
-    @sheet_names = workbook.sheets.map { |sheet| sheet['name'] }
+    @sheet_names = workbook.sheets.map do |sheet|
+      unless options[:only_visible_sheets] && sheet['state'] == 'hidden'
+        sheet['name']
+      end
+    end.compact
     @sheets = []
     @sheets_by_name = Hash[@sheet_names.map.with_index do |sheet_name, n|
-      @sheets[n] = Sheet.new(sheet_name, @rels_files[n], @sheet_files[n], @comments_files[n], styles, shared_strings, workbook)
+      @sheets[n] = Sheet.new(sheet_name, @rels_files[n], @sheet_files[n], @comments_files[n], styles, shared_strings, workbook, sheet_options)
       [sheet_name, @sheets[n]]
     end]
@@ -276,11 +290,14 @@ class Roo::Excelx < Roo::Base
     end
     super
+  rescue => e # clean up any temp files, but only if an error was raised
+    close
+    raise e
   end
   def method_missing(method,*args)
     if label = workbook.defined_names[method.to_s]
-      sheet_for(label.sheet).cells[label.key].value
+      safe_send(sheet_for(label.sheet).cells[label.key], :value)
     else
       # call super for methods like #a1
       super
@@ -303,8 +320,7 @@ class Roo::Excelx < Roo::Base
   # cell at the first line and first row.
   def cell(row, col, sheet=nil)
     key = normalize(row,col)
-    cell = sheet_for(sheet).cells[key]
-    cell.value if cell
+    safe_send(sheet_for(sheet).cells[key], :value)
   end
   def row(rownumber,sheet=nil)
@@ -354,7 +370,7 @@ class Roo::Excelx < Roo::Base
   # The method #formula? checks if there is a formula.
   def formula(row,col,sheet=nil)
     key = normalize(row,col)
-    sheet_for(sheet).cells[key].formula
+    safe_send(sheet_for(sheet).cells[key], :formula)
   end
   # Predicate methods really should return a boolean
@@ -375,7 +391,8 @@ class Roo::Excelx < Roo::Base
   # Given a cell, return the cell's style
   def font(row, col, sheet=nil)
     key = normalize(row,col)
-    styles.definitions[sheet_for(sheet).cells[key].style]
+    definition_index = safe_send(sheet_for(sheet).cells[key], :style)
+    styles.definitions[definition_index] if definition_index
   end
   # returns the type of a cell:
@@ -388,7 +405,7 @@ class Roo::Excelx < Roo::Base
   # * :datetime
   def celltype(row,col,sheet=nil)
     key = normalize(row, col)
-    sheet_for(sheet).cells[key].type
+    safe_send(sheet_for(sheet).cells[key], :type)
   end
   # returns the internal type of an excel cell
@@ -397,14 +414,14 @@ class Roo::Excelx < Roo::Base
   # Note: this is only available within the Excelx class
   def excelx_type(row,col,sheet=nil)
     key = normalize(row,col)
-    sheet_for(sheet).cells[key].excelx_type
+    safe_send(sheet_for(sheet).cells[key], :excelx_type)
   end
   # returns the internal value of an excelx cell
   # Note: this is only available within the Excelx class
   def excelx_value(row,col,sheet=nil)
     key = normalize(row,col)
-    sheet_for(sheet).cells[key].excelx_value
+    safe_send(sheet_for(sheet).cells[key], :excelx_value)
   end
   # returns the internal format of an excel cell
@@ -489,42 +506,148 @@ class Roo::Excelx < Roo::Base
   private
+  # Replaces the value of every cell on the given sheet whose value is a
+  # String with the result of sanitize_value, then records the sheet as
+  # cleaned.
+  #
+  # sheet - the key used to look the sheet up in @sheets_by_name and to mark
+  #         it in @cleaned.
+  def clean_sheet(sheet)
+    # Despite the block parameter name, `value` is the cell object; its
+    # wrapped value is read through #value.
+    @sheets_by_name[sheet].cells.each_pair do |coord, value|
+      next unless value.value.is_a?(::String)
+      # Assigns through the cell's value= writer, so the cell is updated in place.
+      @sheets_by_name[sheet].cells[coord].value = sanitize_value(value.value)
+    end
+    @cleaned[sheet] = true
+  end
+  # Internal: extracts the worksheet_ids from the workbook.xml file. xlsx
+  #           documents require a workbook.xml file, so a if the file is missing
+  #           it is not a valid xlsx file. In these cases, an ArgumentError is
+  #           raised.
+  #
+  # wb - a Zip::Entry for the workbook.xml file.
+  # path - A String for Zip::Entry's destination path.
+  #
+  # Examples
+  #
+  #   extract_worksheet_ids(<Zip::Entry>, 'tmpdir/roo_workbook.xml')
+  #   # => ["rId1", "rId2", "rId3"]
+  #
+  # Returns an Array of Strings.
+  def extract_worksheet_ids(entries, path)
+      wb = entries.find { |e| e.name[/workbook.xml$/] }
+      fail ArgumentError 'missing required workbook file' if wb.nil?
+      wb.extract(path)
+      workbook_doc = Roo::Utils.load_xml(path).remove_namespaces!
+      workbook_doc.xpath('//sheet').map{ |s| s.attributes['id'].value }
+  end
+  # Internal
+  #
+  # wb_rels - A Zip::Entry for the workbook.xml.rels file.
+  # path - A String for the Zip::Entry's destination path.
+  #
+  # Examples
+  #
+  #   extract_worksheets(<Zip::Entry>, 'tmpdir/roo_workbook.xml.rels')
+  #   # => {
+  #         "rId1"=>"worksheets/sheet1.xml",
+  #         "rId2"=>"worksheets/sheet2.xml",
+  #         "rId3"=>"worksheets/sheet3.xml"
+  #        }
+  #
+  # Returns a Hash.
+  def extract_worksheet_rels(entries, path)
+    wb_rels = entries.find { |e| e.name[/workbook.xml.rels$/] }
+    fail ArgumentError 'missing required workbook file' if wb_rels.nil?
+    wb_rels.extract(path)
+    rels_doc = Roo::Utils.load_xml(path).remove_namespaces!
+    worksheet_type ='http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet'
+    relationships = rels_doc.xpath('//Relationship').select do |relationship|
+      relationship.attributes['Type'].value == worksheet_type
+    end
+    relationships.inject({}) do |hash, relationship|
+      attributes = relationship.attributes
+      id = attributes['Id'];
+      hash[id.value] = attributes['Target'].value
+      hash
+    end
+  end
+  # Extracts one worksheet file per id, in the order the ids are given, and
+  # appends each destination path to @sheet_files.
+  #
+  # entries   - A collection of zip entries responding to #name and #extract.
+  # sheet_ids - An ordered list of relationship ids.
+  # sheets    - A Hash-like lookup from relationship id to a worksheet name.
+  # tmpdir    - A String directory the files are extracted into.
+  def extract_sheets_in_order(entries, sheet_ids, sheets, tmpdir)
+    sheet_ids.each_with_index do |id, i|
+      name = sheets[id]
+      # NOTE(review): name is interpolated into the regex unescaped, so "."
+      # matches any character, and a nil name yields /$/, which matches the
+      # first entry. The block parameter also shadows the outer `entry`.
+      entry = entries.find { |entry| entry.name =~ /#{name}$/ }
+      # Files are numbered by position in sheet_ids (1-based), not by the
+      # number in the original worksheet file name.
+      path = "#{tmpdir}/roo_sheet#{i + 1}"
+      @sheet_files << path
+      # If no entry matched, entry is nil and this raises NoMethodError.
+      entry.extract(path)
+    end
+  end
   # Extracts all needed files from the zip file
-  def process_zipfile(tmpdir, zipfilename)
+  def process_zipfile(zipfilename_or_stream)
     @sheet_files = []
-    Zip::File.foreach(zipfilename) do |entry|
-      path =
-        case entry.name.downcase
-        when /workbook.xml$/
-          "#{tmpdir}/roo_workbook.xml"
-        when /sharedstrings.xml$/
-          "#{tmpdir}/roo_sharedStrings.xml"
-        when /styles.xml$/
-          "#{tmpdir}/roo_styles.xml"
-        when /sheet.xml$/
-          path = "#{tmpdir}/roo_sheet"
-          @sheet_files.unshift(path)
-          path
-        when /sheet([0-9]+).xml$/
-          # Numbers 3.1 exports first sheet without sheet number. Such sheets
-          # are always added to the beginning of the array which, naturally,
-          # causes other sheets to be pushed to the next index which could
-          # lead to sheet references getting overwritten, so we need to
-          # handle that case specifically.
-          nr = $1
-          sheet_files_index = nr.to_i - 1
-          sheet_files_index += 1 if @sheet_files[sheet_files_index]
-          @sheet_files[sheet_files_index] = "#{tmpdir}/roo_sheet#{nr.to_i}"
-        when /comments([0-9]+).xml$/
-          nr = $1
-          @comments_files[nr.to_i-1] = "#{tmpdir}/roo_comments#{nr}"
-        when /sheet([0-9]+).xml.rels$/
-          nr = $1
-          @rels_files[nr.to_i-1] = "#{tmpdir}/roo_rels#{nr}"
+    unless is_stream?(zipfilename_or_stream)
+      process_zipfile_entries Zip::File.open(zipfilename_or_stream).to_a.sort_by(&:name)
+    else
+      stream = Zip::InputStream.open zipfilename_or_stream
+      begin
+        entries = []
+        while entry = stream.get_next_entry
+          entries << entry
         end
-      if path
-        entry.extract(path)
+        process_zipfile_entries entries
+      ensure
+        stream.close
+      end
+    end
+  end
+  def process_zipfile_entries entries
+    # NOTE: When Google or Numbers 3.1 exports to xlsx, the worksheet filenames
+    #       are not in order. With Numbers 3.1, the first sheet is always
+    #       sheet.xml, not sheet1.xml. With Google, the order of the worksheets is
+    #       independent of a worksheet's filename (i.e. sheet6.xml can be the
+    #       first worksheet).
+    #
+    #       workbook.xml lists the correct order of worksheets and
+    #       workbook.xml.rels lists the filenames for those worksheets.
+    #
+    #       workbook.xml:
+    #         <sheet state="visible" name="IS" sheetId="1" r:id="rId3"/>
+    #         <sheet state="visible" name="BS" sheetId="2" r:id="rId4"/>
+    #       workbook.xml.rels:
+    #         <Relationship Id="rId4" Target="worksheets/sheet5.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"/>
+    #         <Relationship Id="rId3" Target="worksheets/sheet4.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"/>
+    sheet_ids = extract_worksheet_ids(entries, "#{@tmpdir}/roo_workbook.xml")
+    sheets = extract_worksheet_rels(entries, "#{@tmpdir}/roo_workbook.xml.rels")
+    extract_sheets_in_order(entries, sheet_ids, sheets, @tmpdir)
+    entries.each do |entry|
+      path =
+      case entry.name.downcase
+      when /sharedstrings.xml$/
+        "#{@tmpdir}/roo_sharedStrings.xml"
+      when /styles.xml$/
+        "#{@tmpdir}/roo_styles.xml"
+      when /comments([0-9]+).xml$/
+        # FIXME: Most of the time, the order of the comment files is the same as
+        #       the sheet order, i.e. sheet1.xml's comments are in comments1.xml.
+        #       In some situations, this isn't true. The true location of a
+        #       sheet's comment file is in the sheet1.xml.rels file. SEE
+        #       ECMA-376 12.3.3 in "Ecma Office Open XML Part 1".
+        nr = Regexp.last_match[1].to_i
+        @comments_files[nr - 1] = "#{@tmpdir}/roo_comments#{nr}"
+      when /sheet([0-9]+).xml.rels$/
+        # FIXME: Roo seems to use sheet[\d].xml.rels for hyperlinks only, but
+        #        it also stores the location for sharedStrings, comments,
+        #        drawings, etc.
+        nr = Regexp.last_match[1].to_i
+        @rels_files[nr - 1] = "#{@tmpdir}/roo_rels#{nr}"
       end
+      entry.extract(path) if path
     end
   end
@@ -539,4 +662,8 @@ class Roo::Excelx < Roo::Base
   def workbook
     @workbook ||= Workbook.new(File.join(@tmpdir, "roo_workbook.xml"))
   end
+  # Calls method on object with args, but only when object is truthy and
+  # reports that it responds to method; otherwise returns nil.
+  #
+  # object - the receiver; may be nil (e.g. a lookup that found no cell).
+  # method - the name of the method to call.
+  # args   - arguments forwarded to the method.
+  #
+  # respond_to? is called without include_all, so a receiver that only has
+  # the method as private yields nil rather than being called.
+  def safe_send(object, method, *args)
+    object.send(method, *args) if object && object.respond_to?(method)
+  end
 end

data/lib/roo/excelx/comments.rb CHANGED

@@ -12,7 +12,8 @@ module Roo
     def extract_comments
       if doc_exists?
         Hash[doc.xpath("//comments/commentList/comment").map do |comment|
-          [::Roo::Utils.ref_to_key(comment.attributes['ref'].to_s), comment.at_xpath('./text/r/t').text]
+          value = (comment.at_xpath('./text/r/t') || comment.at_xpath('./text/t')).text
+          [::Roo::Utils.ref_to_key(comment.attributes['ref'].to_s), value]
         end]
       else
         {}

data/lib/roo/excelx/sheet_doc.rb CHANGED

@@ -2,8 +2,9 @@ require 'roo/excelx/extractor'
 module Roo
   class Excelx::SheetDoc < Excelx::Extractor
-    def initialize(path, relationships, styles, shared_strings, workbook)
+    def initialize(path, relationships, styles, shared_strings, workbook, options = {})
       super(path)
+      @options = options
       @relationships = relationships
       @styles = styles
       @shared_strings = shared_strings
@@ -43,6 +44,8 @@ module Roo
     private
     def cell_from_xml(cell_xml, hyperlink)
+      # This is error prone, to_i will silently turn a nil into a 0
+      # and it works by coincidence that Format[0] is general
       style = cell_xml['s'].to_i   # should be here
       # c: <c r="A5" s="2">
       # <v>22606</v>
@@ -120,15 +123,39 @@ module Roo
       end.compact]
     end
+    def expand_merged_ranges(cells)
+      # Extract merged ranges from xml
+      merges = {}
+      doc.xpath("/worksheet/mergeCells/mergeCell").each do |mergecell_xml|
+        tl, br = mergecell_xml['ref'].split(/:/).map {|ref| ::Roo::Utils.ref_to_key(ref)}
+        for row in tl[0]..br[0] do
+          for col in tl[1]..br[1] do
+            next if row == tl[0] && col == tl[1]
+            merges[[row,col]] = tl
+          end
+        end
+      end
+      # Duplicate value into all cells in merged range
+      merges.each do |dst, src|
+        cells[dst] = cells[src]
+      end
+    end
     def extract_cells(relationships)
-      Hash[doc.xpath("/worksheet/sheetData/row/c").map do |cell_xml|
+      extracted_cells = Hash[doc.xpath("/worksheet/sheetData/row/c").map do |cell_xml|
         key = ::Roo::Utils.ref_to_key(cell_xml['r'])
         [key, cell_from_xml(cell_xml, hyperlinks(relationships)[key])]
       end]
+      if @options[:expand_merged_ranges]
+        expand_merged_ranges(extracted_cells)
+      end
+      extracted_cells
     end
     def extract_dimensions
-      doc.xpath("/worksheet/dimension").map { |dim| dim.attributes["ref"].value }.first
+      Roo::Utils.each_element(@path, 'dimension') do |dimension|
+        return dimension.attributes["ref"].value
+      end
     end
 =begin