mindreframer-creek 1.0.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: f3f021cdb45360b0892a1e1a023302c484efbbdc
4
+ data.tar.gz: 3f3ca6081a343a1c7ee20c94cb8bb2ad3f37404a
5
+ SHA512:
6
+ metadata.gz: 0083c8069933048d18b45795eda213d74fa655aaa717651f812f152fc0dfae500d637e17d1833601ab2a1b531a9e417f9ed8ea19637ba7d54008dec4c0e9498b
7
+ data.tar.gz: 976cc1d6c079bef67193cb135d5d1c36e440d49ebb93f06f046e26cac0dcba4aa34fb9764a1c8ebb65a83a3a789daf330f0426bdea8a202d6b04f3768c8089be
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in creek.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 TODO: Write your name
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,76 @@
1
+ = Creek -- Stream parser for large Excel(xlsx and xlsm) files.
2
+
3
+ Creek is a Ruby gem that provides a fast, simple and efficient method of parsing large Excel(xlsx and xlsm) files.
4
+
5
+
6
+ == Installation
7
+
8
+ Creek can be used from the command line or as part of a Ruby web framework. To install the gem using terminal, run the following command:
9
+
10
+ gem install creek
11
+
12
+ To use it in Rails, add this line to your Gemfile:
13
+
14
+ gem "creek"
15
+
16
+
17
+ == Basic Usage
18
+ Creek can simply parse an Excel file by looping through the rows enumerator:
19
+
20
+ require 'creek'
21
+ creek = Creek::Book.new "specs/fixtures/sample.xlsx"
22
+ sheet= creek.sheets[0]
23
+
24
+ sheet.rows.each do |row|
25
+ puts row # => {"A1"=>"Content 1", "B1"=>nil, "C1"=>nil, "D1"=>"Content 3"}
26
+ end
27
+
28
+
29
+ sheet.rows_with_meta_data.each do |row|
30
+ puts row # => {"collapsed"=>"false", "customFormat"=>"false", "customHeight"=>"true", "hidden"=>"false", "ht"=>"12.1", "outlineLevel"=>"0", "r"=>"1", "cells"=>{"A1"=>"Content 1", "B1"=>nil, "C1"=>nil, "D1"=>"Content 3"}}
31
+ end
32
+
33
+
34
+ sheet.state # => 'visible'
35
+ sheet.name # => 'Sheet1'
36
+ sheet.rid # => 'rId2'
37
+
38
+ == Filename considerations
39
+ By default, Creek will ensure that the file extension is either *.xlsx or *.xlsm, but this check can be circumvented as needed:
40
+
41
+ path = 'sample-as-zip.zip'
42
+ Creek::Book.new path, :check_file_extension => false
43
+
44
+ By default, the Rails {file_field_tag}[http://api.rubyonrails.org/classes/ActionView/Helpers/FormTagHelper.html#method-i-file_field_tag] uploads to a temporary location and stores the original filename with the StringIO object. (See {this section}[http://guides.rubyonrails.org/form_helpers.html#uploading-files] of the Rails Guides for more information.)
45
+
46
+ Creek can parse this directly without the need for file upload gems such as Carrierwave or Paperclip by passing the original filename as an option:
47
+
48
+ # Import endpoint in Rails controller
49
+ def import
50
+ file = params[:file]
51
+ Creek::Book.new file.path, original_filename: file.original_filename
52
+ end
53
+
54
+ == Contributing
55
+
56
+ Contributions are welcome. You can fork the repository, add your code changes to the forked branch, ensure all existing unit tests pass, create new unit tests to cover your changes and finally create a pull request.
57
+
58
+ After forking and then cloning the repository locally, install Bundler and then use it
59
+ to install the development gem dependencies:
60
+
61
+ gem install bundler
62
+ bundle install
63
+
64
+ Once this is complete, you should be able to run the test suite:
65
+
66
+ rake
67
+
68
+
69
+ == Bug Reporting
70
+
71
+ Please use the {Issues}[https://github.com/pythonicrubyist/creek/issues] page to report bugs or suggest new enhancements.
72
+
73
+
74
+ == License
75
+
76
+ Creek has been published under {MIT License}[https://github.com/pythonicrubyist/creek/blob/master/LICENSE.txt]
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new('spec')
5
+
6
+ # If you want to make this the default task
7
+ task :default => :spec
data/creek.gemspec ADDED
@@ -0,0 +1,30 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'creek/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "mindreframer-creek"
8
+ spec.version = Creek::VERSION
9
+ spec.authors = ["pythonicrubyist"]
10
+ spec.email = ["pythonicrubyist@gmail.com"]
11
+ spec.description = %q{A Ruby gem that streams and parses large Excel(xlsx and xlsm) files fast and efficiently.}
12
+ spec.summary = %q{A Ruby gem for parsing large Excel(xlsx and xlsm) files.}
13
+ spec.homepage = "https://github.com/mindreframer/creek"
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files`.split($/)
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.required_ruby_version = '>= 1.9.2'
22
+
23
+ spec.add_development_dependency "bundler", "~> 1.3"
24
+ spec.add_development_dependency "rake"
25
+ spec.add_development_dependency 'rspec', '~> 2.13.0'
26
+ spec.add_development_dependency 'pry'
27
+
28
+ spec.add_dependency 'nokogiri', '~> 1.6.0'
29
+ spec.add_dependency 'rubyzip', '>= 1.0.0'
30
+ end
data/lib/creek/book.rb ADDED
@@ -0,0 +1,38 @@
1
+ require 'zip/filesystem'
2
+ require 'nokogiri'
3
+
4
+ module Creek
5
+
6
+ class Creek::Book
7
+
8
+ attr_reader :files,
9
+ :sheets,
10
+ :shared_strings
11
+
12
+ def initialize path, options = {}
13
+ check_file_extension = options.fetch(:check_file_extension, true)
14
+ if check_file_extension
15
+ extension = File.extname(options[:original_filename] || path).downcase
16
+ raise 'Not a valid file format.' unless (['.xlsx', '.xlsm'].include? extension)
17
+ end
18
+ @files = Zip::File.open path
19
+ @shared_strings = SharedStrings.new(self)
20
+ end
21
+
22
+ def sheets
23
+ doc = @files.file.open "xl/workbook.xml"
24
+ xml = Nokogiri::XML::Document.parse doc
25
+ @sheets = xml.css('sheet').each_with_index.map do |sheet, i|
26
+ Sheet.new(self, sheet.attr("name"), sheet.attr("sheetid"), sheet.attr("state"), sheet.attr("visible"), sheet.attr("r:id"), i+1)
27
+ end
28
+ end
29
+
30
+ def style_types
31
+ @style_types ||= Creek::Styles.new(self).style_types
32
+ end
33
+
34
+ def close
35
+ @files.close
36
+ end
37
+ end
38
+ end
@@ -0,0 +1,44 @@
1
+ require 'zip/filesystem'
2
+ require 'nokogiri'
3
+
4
module Creek

  ##
  # The shared-strings table of a workbook: maps the zero-based position of
  # each <si> entry in "xl/sharedStrings.xml" to its text.
  class SharedStrings

    attr_reader :book, :dictionary

    ##
    # Builds the table for +book+ by reading its shared-strings part.
    def initialize(book)
      @book = book
      parse_shared_shared_strings
    end

    ##
    # Reads "xl/sharedStrings.xml" from the book's archive and fills
    # +dictionary+. When the archive has no such part nothing is parsed and
    # +dictionary+ stays nil.
    def parse_shared_shared_strings
      path = 'xl/sharedStrings.xml'
      return unless @book.files.file.exist?(path)

      # Block form so the archive stream is closed once the document is parsed.
      xml = nil
      @book.files.file.open(path) { |io| xml = Nokogiri::XML::Document.parse(io) }
      parse_shared_string_from_document(xml)
    end

    ##
    # Stores the dictionary built from +xml+ in +dictionary+ and returns it.
    def parse_shared_string_from_document(xml)
      @dictionary = self.class.parse_shared_string_from_document(xml)
    end

    ##
    # Returns a Hash of index => text for every <si> element of +xml+.
    # The text of an entry is the concatenation of all its <t> descendants,
    # which covers both plain entries (one <t>) and rich-text entries (one
    # <t> per fragment); an entry without any <t> maps to "".
    def self.parse_shared_string_from_document(xml)
      xml.css('si').each_with_index.each_with_object({}) do |(si, idx), dictionary|
        dictionary[idx] = si.css('t').map(&:content).join('')
      end
    end

  end
end
@@ -0,0 +1,187 @@
1
+ require 'zip/filesystem'
2
+ require 'nokogiri'
3
+
4
module Creek
  ##
  # One worksheet of a workbook. Exposes the sheet's attributes as read from
  # the workbook part and streams its rows from
  # "xl/worksheets/sheet<index>.xml" inside the book's archive.
  class Sheet

    # An XLS file has only 256 columns, however, an XLSX or XLSM file can
    # contain up to 16384 columns ("A".."XFD").
    MAX_COLUMNS = 16_384

    attr_reader :book,
                :name,
                :sheetid,
                :state,
                :visible,
                :rid,
                :index

    ##
    # +book+ is the owning Creek::Book; +index+ is the sheet's 1-based
    # position, used to locate its worksheet part. The remaining arguments are
    # stored as given and exposed through the readers.
    def initialize(book, name, sheetid, state, visible, rid, index)
      @book = book
      @name = name
      @sheetid = sheetid
      @visible = visible
      @rid = rid
      @state = state
      @index = index

      # Make sure the shared column-name table exists; it is built only once
      # for the class instead of once per sheet.
      excel_col_names
    end

    ##
    # Provides an Enumerator that returns a hash representing each row.
    # The key of the hash is the Cell id and the value is the value of the cell.
    # Returns nil when the worksheet part is not present in the archive.
    def rows
      rows_generator
    end

    ##
    # Provides an Enumerator that returns an Array representing each row.
    # Each value is stored at the zero-based index of its column, so columns
    # without a value before the last one are nil.
    # Returns nil when the worksheet part is not present in the archive.
    def rows_array
      rows_array_generator
    end

    ##
    # Provides an Enumerator that returns a hash representing each row.
    # The hash contains meta data of the row and a 'cells' embedded hash which contains the cell contents.
    # Returns nil when the worksheet part is not present in the archive.
    def rows_with_meta_data
      rows_generator true
    end

    private

    ##
    # Hash of every valid XLSX column name to its zero-based index
    # ("A" => 0 ... "XFD" => 16383). Built on first use and shared by all sheets.
    def excel_col_names
      @@excel_col_names ||= build_excel_col_names
    end

    ##
    # Builds the column name => index table for all MAX_COLUMNS columns.
    def build_excel_col_names
      (0...MAX_COLUMNS).each_with_object({}) { |i, names| names[col_name(i)] = i }
    end

    ##
    # Returns valid Excel column name for a given column index.
    # For example, returns "A" for 0, "B" for 1 and "AQ" for 42.
    def col_name(i)
      quot = i / 26
      (quot > 0 ? col_name(quot - 1) : "") + (i % 26 + 65).chr
    end

    ##
    # Returns an Enumerator yielding one hash per row that includes the cell
    # ids and values. Empty cells up to the last cell of the row are also
    # included in the hash with a nil value. With +include_meta_data+ the row's
    # attributes are yielded instead, with the cells under the 'cells' key.
    # Returns nil when the worksheet part does not exist.
    def rows_generator(include_meta_data = false)
      # NOTE(review): the part name is derived from the sheet's position in the
      # workbook, not from its relationship id -- confirm this matches the archive.
      path = "xl/worksheets/sheet#{@index}.xml"
      return unless @book.files.file.exist?(path)

      # SAX parsing, Each element in the stream comes through as two events:
      # one to open the element and one to close it.
      opener = Nokogiri::XML::Reader::TYPE_ELEMENT
      closer = Nokogiri::XML::Reader::TYPE_END_ELEMENT
      Enumerator.new do |y|
        row = nil
        cells = {}
        cell = nil
        cell_type = nil
        cell_style_idx = nil
        @book.files.file.open(path) do |xml|
          Nokogiri::XML::Reader.from_io(xml).each do |node|
            if node.name.eql?('row') && node.node_type.eql?(opener)
              row = node.attributes
              row['cells'] = {}
              cells = {}
              # A self-closing row never produces a closing event, so it is
              # emitted right away.
              y << (include_meta_data ? row : cells) if node.self_closing?
            elsif node.name.eql?('row') && node.node_type.eql?(closer)
              # +cell+ is the address of the last <c> seen, i.e. the last column.
              processed_cells = fill_in_empty_cells(cells, row['r'], cell)
              row['cells'] = processed_cells
              y << (include_meta_data ? row : processed_cells)
            elsif node.name.eql?('c') && node.node_type.eql?(opener)
              cell_type = node.attribute('t')
              cell_style_idx = node.attribute('s')
              cell = node.attribute('r')
            elsif node.value?
              cells[cell] = convert(node.value, cell_type, cell_style_idx) unless cell.nil?
            end
          end
        end
      end
    end

    ##
    # Converts a raw cell value using the cell's type attribute and the style
    # type looked up by the cell's style index.
    def convert(value, type, style_idx)
      style = @book.style_types[style_idx.to_i]
      Creek::Styles::Converter.call(value, type, style, converter_options)
    end

    ##
    # Options handed to the converter (cached): the book's shared-strings dictionary.
    def converter_options
      @converter_options ||= {shared_strings: @book.shared_strings.dictionary}
    end

    ##
    # The unzipped XML file does not contain any node for empty cells.
    # Empty cells are being padded in using this function: returns a new hash
    # with one entry for every column from "A" up to the column of +last_col+
    # (a cell address such as "D1"), taking values from +cells+ and nil for
    # columns without a value. Returns an empty hash when +cells+ is empty.
    def fill_in_empty_cells(cells, row_number, last_col)
      return {} if cells.empty?

      last_col_index = col_index_for_cell_address(last_col)
      (0..last_col_index).each_with_object({}) do |i, filled|
        id = "#{col_name(i)}#{row_number}"
        filled[id] = cells[id]
      end
    end

    ##
    # Returns the zero-based column index of a cell address ("AQ12" => 42),
    # or nil when the column letters are not a valid XLSX column.
    def col_index_for_cell_address(cell_address)
      excel_col_names[cell_address.delete('^A-Z')]
    end

    ##
    # Returns an Enumerator yielding one Array per row; each cell value is
    # placed at the zero-based index of its column, leaving nil for columns
    # without a value. Returns nil when the worksheet part does not exist.
    def rows_array_generator
      path = "xl/worksheets/sheet#{@index}.xml"
      return unless @book.files.file.exist?(path)

      # SAX parsing, Each element in the stream comes through as two events:
      # one to open the element and one to close it.
      opener = Nokogiri::XML::Reader::TYPE_ELEMENT
      closer = Nokogiri::XML::Reader::TYPE_END_ELEMENT
      Enumerator.new do |y|
        row = nil
        cell_type = nil
        cell_style_idx = nil
        cell_address = nil
        @book.files.file.open(path) do |xml|
          Nokogiri::XML::Reader.from_io(xml).each do |node|
            if node.name.eql?('row') && node.node_type.eql?(opener)
              row = []
              y << row if node.self_closing?
            elsif node.name.eql?('row') && node.node_type.eql?(closer)
              y << row
            elsif node.name.eql?('c') && node.node_type.eql?(opener)
              cell_type = node.attribute('t')
              cell_style_idx = node.attribute('s')
              cell_address = node.attribute('r')
            elsif node.name.eql?('c') && node.node_type.eql?(closer)
              # Forget the cell once it is closed so later text is not attributed to it.
              cell_type = nil
              cell_style_idx = nil
              cell_address = nil
            elsif node.name.eql?('#text')
              if !cell_address.nil? && node.value?
                idx = col_index_for_cell_address(cell_address)
                row[idx] = convert(node.value, cell_type, cell_style_idx)
              end
            end
          end
        end
      end
    end
  end
end