RubyGems - mindreframer-creek - Versions diffs - 1.0.5 - Mend

mindreframer-creek 1.0.5

Files changed (29) hide show

checksums.yaml +7 -0
data/.gitignore +17 -0
data/Gemfile +4 -0
data/LICENSE.txt +22 -0
data/README.rdoc +76 -0
data/Rakefile +7 -0
data/creek.gemspec +30 -0
data/lib/creek/book.rb +38 -0
data/lib/creek/shared_strings.rb +44 -0
data/lib/creek/sheet.rb +187 -0
data/lib/creek/styles/constants.rb +44 -0
data/lib/creek/styles/converter.rb +116 -0
data/lib/creek/styles/style_types.rb +85 -0
data/lib/creek/styles.rb +27 -0
data/lib/creek/version.rb +3 -0
data/lib/creek.rb +12 -0
data/spec/fixtures/invalid.xls +0 -0
data/spec/fixtures/sample-as-zip.zip +0 -0
data/spec/fixtures/sample.xlsx +0 -0
data/spec/fixtures/sheets/sheet1.xml +459 -0
data/spec/fixtures/sst.xml +78 -0
data/spec/fixtures/styles/first.xml +208 -0
data/spec/fixtures/temp_string_io_file_path_with_no_extension +0 -0
data/spec/shared_string_spec.rb +18 -0
data/spec/spec_helper.rb +3 -0
data/spec/styles/converter_spec.rb +16 -0
data/spec/styles/style_types_spec.rb +15 -0
data/spec/test_spec.rb +99 -0
metadata +168 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: f3f021cdb45360b0892a1e1a023302c484efbbdc
+  data.tar.gz: 3f3ca6081a343a1c7ee20c94cb8bb2ad3f37404a
+SHA512:
+  metadata.gz: 0083c8069933048d18b45795eda213d74fa655aaa717651f812f152fc0dfae500d637e17d1833601ab2a1b531a9e417f9ed8ea19637ba7d54008dec4c0e9498b
+  data.tar.gz: 976cc1d6c079bef67193cb135d5d1c36e440d49ebb93f06f046e26cac0dcba4aa34fb9764a1c8ebb65a83a3a789daf330f0426bdea8a202d6b04f3768c8089be

data/.gitignore ADDED Viewed

@@ -0,0 +1,17 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp

data/Gemfile ADDED Viewed

@@ -0,0 +1,4 @@
+# Resolve gems from the public RubyGems index.
+source 'https://rubygems.org'
+# Specify your gem's dependencies in creek.gemspec;
+# the `gemspec` directive below reads them from that file.
+gemspec

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,22 @@
+Copyright (c) 2013 TODO: Write your name
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.rdoc ADDED Viewed

@@ -0,0 +1,76 @@
+= Creek -- Stream parser for large Excel(xlsx and xlsm) files.
+Creek is a Ruby gem that provides a fast, simple and efficient method of parsing large Excel (xlsx and xlsm) files.
+== Installation
+Creek can be used from the command line or as part of a Ruby web framework. To install the gem using terminal, run the following command:
+    gem install creek
+To use it in Rails, add this line to your Gemfile:
+    gem "creek"
+== Basic Usage
+Creek can simply parse an Excel file by looping through the rows enumerator:
+    require 'creek'
+    creek = Creek::Book.new "spec/fixtures/sample.xlsx"
+    sheet= creek.sheets[0]
+    sheet.rows.each do |row|
+      puts row # => {"A1"=>"Content 1", "B1"=>nil, "C1"=>nil, "D1"=>"Content 3"}
+    end
+    sheet.rows_with_meta_data.each do |row|
+      puts row # => {"collapsed"=>"false", "customFormat"=>"false", "customHeight"=>"true", "hidden"=>"false", "ht"=>"12.1", "outlineLevel"=>"0", "r"=>"1", "cells"=>{"A1"=>"Content 1", "B1"=>nil, "C1"=>nil, "D1"=>"Content 3"}}
+    end
+    sheet.state   # => 'visible'
+    sheet.name    # => 'Sheet1'
+    sheet.rid     # => 'rId2'
+== Filename considerations
+By default, Creek will ensure that the file extension is either *.xlsx or *.xlsm, but this check can be circumvented as needed:
+    path = 'sample-as-zip.zip'
+    Creek::Book.new path, :check_file_extension => false
+By default, the Rails {file_field_tag}[http://api.rubyonrails.org/classes/ActionView/Helpers/FormTagHelper.html#method-i-file_field_tag] uploads to a temporary location and stores the original filename with the StringIO object. (See {this section}[http://guides.rubyonrails.org/form_helpers.html#uploading-files] of the Rails Guides for more information.)
+Creek can parse this directly without the need for file upload gems such as Carrierwave or Paperclip by passing the original filename as an option:
+    # Import endpoint in Rails controller
+    def import
+      file = params[:file]
+      Creek::Book.new file.path, check_file_extension: false
+    end
+== Contributing
+Contributions are welcome. You can fork the repository, add your code changes to the forked branch, ensure all existing unit tests pass, create new unit tests that cover your changes and finally create a pull request.
+After forking and then cloning the repository locally, install Bundler and then use it
+to install the development gem dependencies:
+    gem install bundler
+    bundle install
+Once this is complete, you should be able to run the test suite:
+    rake
+== Bug Reporting
+Please use the {Issues}[https://github.com/pythonicrubyist/creek/issues] page to report bugs or suggest new enhancements.
+== License
+Creek has been published under {MIT License}[https://github.com/pythonicrubyist/creek/blob/master/LICENSE.txt]

data/Rakefile ADDED Viewed

@@ -0,0 +1,7 @@
+require "bundler/gem_tasks"
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new('spec')
+# If you want to make this the default task
+task :default => :spec

data/creek.gemspec ADDED Viewed

@@ -0,0 +1,30 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'creek/version'
+Gem::Specification.new do |spec|
+  spec.name          = "mindreframer-creek"
+  spec.version       = Creek::VERSION
+  spec.authors       = ["pythonicrubyist"]
+  spec.email         = ["pythonicrubyist@gmail.com"]
+  spec.description   = %q{A Ruby gem that streams and parses large Excel(xlsx and xlsm) files fast and efficiently.}
+  spec.summary       = %q{A Ruby gem for parsing large Excel(xlsx and xlsm) files.}
+  spec.homepage      = "https://github.com/mindreframer/creek"
+  spec.license       = "MIT"
+  spec.files         = `git ls-files`.split($/)
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+  spec.required_ruby_version = '>= 1.9.2'
+  spec.add_development_dependency "bundler", "~> 1.3"
+  spec.add_development_dependency "rake"
+  spec.add_development_dependency 'rspec', '~> 2.13.0'
+  spec.add_development_dependency 'pry'
+  spec.add_dependency 'nokogiri', '~> 1.6.0'
+  spec.add_dependency 'rubyzip', '>= 1.0.0'
+end

data/lib/creek/book.rb ADDED Viewed

@@ -0,0 +1,38 @@
+require 'zip/filesystem'
+require 'nokogiri'
+module Creek
+  class Creek::Book
+    attr_reader :files,
+                :sheets,
+                :shared_strings
+    def initialize path, options = {}
+      check_file_extension = options.fetch(:check_file_extension, true)
+      if check_file_extension
+        extension = File.extname(options[:original_filename] || path).downcase
+        raise 'Not a valid file format.' unless (['.xlsx', '.xlsm'].include? extension)
+      end
+      @files          = Zip::File.open path
+      @shared_strings = SharedStrings.new(self)
+    end
+    def sheets
+      doc = @files.file.open "xl/workbook.xml"
+      xml = Nokogiri::XML::Document.parse doc
+      @sheets = xml.css('sheet').each_with_index.map  do |sheet, i|
+        Sheet.new(self, sheet.attr("name"), sheet.attr("sheetid"),  sheet.attr("state"), sheet.attr("visible"), sheet.attr("r:id"), i+1)
+      end
+    end
+    def style_types
+      @style_types ||= Creek::Styles.new(self).style_types
+    end
+    def close
+      @files.close
+    end
+  end
+end

data/lib/creek/shared_strings.rb ADDED Viewed

@@ -0,0 +1,44 @@
+require 'zip/filesystem'
+require 'nokogiri'
+module Creek
+  class Creek::SharedStrings
+    attr_reader :book, :dictionary
+    def initialize book
+      @book = book
+      parse_shared_shared_strings
+    end
+    def parse_shared_shared_strings
+      path = "xl/sharedStrings.xml"
+      if @book.files.file.exist?(path)
+        doc = @book.files.file.open path
+        xml = Nokogiri::XML::Document.parse doc
+        parse_shared_string_from_document(xml)
+      end
+    end
+    def parse_shared_string_from_document(xml)
+      @dictionary = self.class.parse_shared_string_from_document(xml)
+    end
+    def self.parse_shared_string_from_document(xml)
+      dictionary = Hash.new
+      xml.css('si').each_with_index do |si, idx|
+        text_nodes = si.css('t')
+        if text_nodes.count == 1 # plain text node
+          dictionary[idx] = text_nodes.first.content
+        else # rich text nodes with text fragments
+          dictionary[idx] = text_nodes.map(&:content).join('')
+        end
+      end
+      dictionary
+    end
+  end
+end

data/lib/creek/sheet.rb ADDED Viewed

@@ -0,0 +1,187 @@
+require 'zip/filesystem'
+require 'nokogiri'
+module Creek
+  class Creek::Sheet
+    attr_reader :book,
+                :name,
+                :sheetid,
+                :state,
+                :visible,
+                :rid,
+                :index
+    def initialize(book, name, sheetid, state, visible, rid, index)
+      @book    = book
+      @name    = name
+      @sheetid = sheetid
+      @visible = visible
+      @rid     = rid
+      @state   = state
+      @index   = index
+      # An XLS file has only 256 columns, however, an XLSX or XLSM file can contain up to 16384 columns.
+      # This function creates a hash with all valid XLSX column names and associated indices.
+      @@excel_col_names = Hash.new
+      (0...16384).each do |i|
+        @@excel_col_names[col_name(i)] = i
+      end
+    end
+    ##
+    # Provides an Enumerator that returns a hash representing each row.
+    # The key of the hash is the Cell id and the value is the value of the cell.
+    def rows
+      rows_generator
+    end
+    def rows_array
+      rows_array_generator
+    end
+    ##
+    # Provides an Enumerator that returns a hash representing each row.
+    # The hash contains meta data of the row and a 'cells' embended hash which contains the cell contents.
+    def rows_with_meta_data
+      rows_generator true
+    end
+    private
+    ##
+    # Returns valid Excel column name for a given column index.
+    # For example, returns "A" for 0, "B" for 1 and "AQ" for 42.
+    def col_name(i)
+      quot = i/26
+      (quot>0 ? col_name(quot-1) : "") + (i%26+65).chr
+    end
+    ##
+    # Returns a hash per row that includes the cell ids and values.
+    # Empty cells will be also included in the hash with a nil value.
+    def rows_generator(include_meta_data=false)
+      path = "xl/worksheets/sheet#{@index}.xml"
+      if @book.files.file.exist?(path)
+        # SAX parsing, Each element in the stream comes through as two events:
+        # one to open the element and one to close it.
+        opener = Nokogiri::XML::Reader::TYPE_ELEMENT
+        closer = Nokogiri::XML::Reader::TYPE_END_ELEMENT
+        Enumerator.new do |y|
+          row            = nil
+          cells          = {}
+          cell           = nil
+          cell_type      = nil
+          cell_style_idx = nil
+          @book.files.file.open(path) do |xml|
+            Nokogiri::XML::Reader.from_io(xml).each do |node|
+              if (node.name.eql? 'row') and (node.node_type.eql? opener)
+                row          = node.attributes
+                row['cells'] = Hash.new
+                cells        = Hash.new
+                y << (include_meta_data ? row : cells) if node.self_closing?
+              elsif (node.name.eql? 'row') and (node.node_type.eql? closer)
+                processed_cells = fill_in_empty_cells(cells, row['r'], cell)
+                row['cells']    = processed_cells
+                y << (include_meta_data ? row : processed_cells)
+              elsif (node.name.eql? 'c') and (node.node_type.eql? opener)
+                cell_type      = node.attribute('t')
+                cell_style_idx = node.attribute('s')
+                cell           = node.attribute('r')
+              elsif node.value?
+                if !cell.nil?
+                  cells[cell] = convert(node.value, cell_type, cell_style_idx)
+                end
+              end
+            end
+          end
+        end
+      end
+    end
+    def convert(value, type, style_idx)
+      style = @book.style_types[style_idx.to_i]
+      Creek::Styles::Converter.call(value, type, style, converter_options)
+    end
+    def converter_options
+      @converter_options ||= {shared_strings: @book.shared_strings.dictionary}
+    end
+    ##
+    # The unzipped XML file does not contain any node for empty cells.
+    # Empty cells are being padded in using this function
+    def fill_in_empty_cells(cells, row_number, last_col)
+      new_cells = Hash.new
+      unless cells.empty?
+        keys           = cells.keys.sort
+        last_col       = last_col.gsub(row_number, '')
+        last_col_index = @@excel_col_names[last_col]
+        [*(0..last_col_index)].each do |i|
+          col = col_name(i)
+          id  = "#{col}#{row_number}"
+          unless cells.has_key? id
+            new_cells[id] = nil
+          else
+            new_cells[id] = cells[id]
+          end
+        end
+      end
+      new_cells
+    end
+    def col_index_for_cell_address(cell_address)
+      col       = cell_address.delete('^A-Z')
+      col_index = @@excel_col_names[col]
+    end
+    ##
+    # Returns a hash per row that includes the cell ids and values.
+    # Empty cells will be also included in the hash with a nil value.
+    def rows_array_generator
+      path = "xl/worksheets/sheet#{@index}.xml"
+      if @book.files.file.exist?(path)
+        # SAX parsing, Each element in the stream comes through as two events:
+        # one to open the element and one to close it.
+        opener = Nokogiri::XML::Reader::TYPE_ELEMENT
+        closer = Nokogiri::XML::Reader::TYPE_END_ELEMENT
+        Enumerator.new do |y|
+          row            = nil
+          cell_type      = nil
+          cell_style_idx = nil
+          cell_address   = nil
+          @book.files.file.open(path) do |xml|
+            Nokogiri::XML::Reader.from_io(xml).each do |node|
+              if (node.name.eql? 'row') and (node.node_type.eql? opener)
+                row = []
+                y << (row) if node.self_closing?
+              elsif (node.name.eql? 'row') and (node.node_type.eql? closer)
+                y << row
+              elsif (node.name.eql? 'c') and (node.node_type.eql? opener)
+                cell_type      = node.attribute('t')
+                cell_style_idx = node.attribute('s')
+                cell_address   = node.attribute('r')
+              elsif (node.name.eql? 'c') and (node.node_type.eql? closer)
+                cell_type      = nil
+                cell_style_idx = nil
+                cell_address   = nil
+              elsif (node.name.eql? '#text')
+                if !cell_address.nil? and node.value?
+                  idx      = col_index_for_cell_address(cell_address)
+                  value    = convert(node.value, cell_type, cell_style_idx)
+                  row[idx] = value
+                end
+              end
+            end
+          end
+        end
+      end
+    end
+  end
+end