mindreframer-creek 1.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.rdoc +76 -0
- data/Rakefile +7 -0
- data/creek.gemspec +30 -0
- data/lib/creek/book.rb +38 -0
- data/lib/creek/shared_strings.rb +44 -0
- data/lib/creek/sheet.rb +187 -0
- data/lib/creek/styles/constants.rb +44 -0
- data/lib/creek/styles/converter.rb +116 -0
- data/lib/creek/styles/style_types.rb +85 -0
- data/lib/creek/styles.rb +27 -0
- data/lib/creek/version.rb +3 -0
- data/lib/creek.rb +12 -0
- data/spec/fixtures/invalid.xls +0 -0
- data/spec/fixtures/sample-as-zip.zip +0 -0
- data/spec/fixtures/sample.xlsx +0 -0
- data/spec/fixtures/sheets/sheet1.xml +459 -0
- data/spec/fixtures/sst.xml +78 -0
- data/spec/fixtures/styles/first.xml +208 -0
- data/spec/fixtures/temp_string_io_file_path_with_no_extension +0 -0
- data/spec/shared_string_spec.rb +18 -0
- data/spec/spec_helper.rb +3 -0
- data/spec/styles/converter_spec.rb +16 -0
- data/spec/styles/style_types_spec.rb +15 -0
- data/spec/test_spec.rb +99 -0
- metadata +168 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: f3f021cdb45360b0892a1e1a023302c484efbbdc
|
4
|
+
data.tar.gz: 3f3ca6081a343a1c7ee20c94cb8bb2ad3f37404a
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 0083c8069933048d18b45795eda213d74fa655aaa717651f812f152fc0dfae500d637e17d1833601ab2a1b531a9e417f9ed8ea19637ba7d54008dec4c0e9498b
|
7
|
+
data.tar.gz: 976cc1d6c079bef67193cb135d5d1c36e440d49ebb93f06f046e26cac0dcba4aa34fb9764a1c8ebb65a83a3a789daf330f0426bdea8a202d6b04f3768c8089be
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 TODO: Write your name
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
= Creek -- Stream parser for large Excel(xlsx and xlsm) files.
|
2
|
+
|
3
|
+
Creek is a Ruby gem that provides a fast, simple and efficient method of parsing large Excel (xlsx and xlsm) files.
|
4
|
+
|
5
|
+
|
6
|
+
== Installation
|
7
|
+
|
8
|
+
Creek can be used from the command line or as part of a Ruby web framework. To install the gem using terminal, run the following command:
|
9
|
+
|
10
|
+
gem install creek
|
11
|
+
|
12
|
+
To use it in Rails, add this line to your Gemfile:
|
13
|
+
|
14
|
+
gem "creek"
|
15
|
+
|
16
|
+
|
17
|
+
== Basic Usage
|
18
|
+
Creek can simply parse an Excel file by looping through the rows enumerator:
|
19
|
+
|
20
|
+
require 'creek'
|
21
|
+
creek = Creek::Book.new "spec/fixtures/sample.xlsx"
|
22
|
+
sheet= creek.sheets[0]
|
23
|
+
|
24
|
+
sheet.rows.each do |row|
|
25
|
+
puts row # => {"A1"=>"Content 1", "B1"=>nil, "C1"=>nil, "D1"=>"Content 3"}
|
26
|
+
end
|
27
|
+
|
28
|
+
|
29
|
+
sheet.rows_with_meta_data.each do |row|
|
30
|
+
puts row # => {"collapsed"=>"false", "customFormat"=>"false", "customHeight"=>"true", "hidden"=>"false", "ht"=>"12.1", "outlineLevel"=>"0", "r"=>"1", "cells"=>{"A1"=>"Content 1", "B1"=>nil, "C1"=>nil, "D1"=>"Content 3"}}
|
31
|
+
end
|
32
|
+
|
33
|
+
|
34
|
+
sheet.state # => 'visible'
|
35
|
+
sheet.name # => 'Sheet1'
|
36
|
+
sheet.rid # => 'rId2'
|
37
|
+
|
38
|
+
== Filename considerations
|
39
|
+
By default, Creek will ensure that the file extension is either *.xlsx or *.xlsm, but this check can be circumvented as needed:
|
40
|
+
|
41
|
+
path = 'sample-as-zip.zip'
|
42
|
+
Creek::Book.new path, :check_file_extension => false
|
43
|
+
|
44
|
+
By default, the Rails {file_field_tag}[http://api.rubyonrails.org/classes/ActionView/Helpers/FormTagHelper.html#method-i-file_field_tag] uploads to a temporary location and stores the original filename with the StringIO object. (See {this section}[http://guides.rubyonrails.org/form_helpers.html#uploading-files] of the Rails Guides for more information.)
|
45
|
+
|
46
|
+
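Creek can parse this directly without the need for file upload gems such as Carrierwave or Paperclip, either by passing the original filename via the :original_filename option (so that its extension is the one checked) or, as in the example below, by disabling the extension check: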
|
47
|
+
|
48
|
+
# Import endpoint in Rails controller
|
49
|
+
def import
|
50
|
+
file = params[:file]
|
51
|
+
Creek::Book.new file.path, check_file_extension: false
|
52
|
+
end
|
53
|
+
|
54
|
+
== Contributing
|
55
|
+
|
56
|
+
Contributions are welcome. You can fork the repository, add your code changes to the forked branch, ensure all existing unit tests pass, create new unit tests to cover your changes and finally create a pull request.
|
57
|
+
|
58
|
+
After forking and then cloning the repository locally, install Bundler and then use it
|
59
|
+
to install the development gem dependencies:
|
60
|
+
|
61
|
+
gem install bundler
|
62
|
+
bundle install
|
63
|
+
|
64
|
+
Once this is complete, you should be able to run the test suite:
|
65
|
+
|
66
|
+
rake
|
67
|
+
|
68
|
+
|
69
|
+
== Bug Reporting
|
70
|
+
|
71
|
+
Please use the {Issues}[https://github.com/pythonicrubyist/creek/issues] page to report bugs or suggest new enhancements.
|
72
|
+
|
73
|
+
|
74
|
+
== License
|
75
|
+
|
76
|
+
Creek has been published under {MIT License}[https://github.com/pythonicrubyist/creek/blob/master/LICENSE.txt]
|
data/Rakefile
ADDED
data/creek.gemspec
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'creek/version'
|
5
|
+
|
6
|
+
# Gem specification for this package: sets the gem name, takes the version from
# Creek::VERSION, and declares authorship, packaged files and dependencies.
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "mindreframer-creek"
|
8
|
+
spec.version = Creek::VERSION
|
9
|
+
spec.authors = ["pythonicrubyist"]
|
10
|
+
spec.email = ["pythonicrubyist@gmail.com"]
|
11
|
+
spec.description = %q{A Ruby gem that streams and parses large Excel(xlsx and xlsm) files fast and efficiently.}
|
12
|
+
spec.summary = %q{A Ruby gem for parsing large Excel(xlsx and xlsm) files.}
|
13
|
+
spec.homepage = "https://github.com/mindreframer/creek"
|
14
|
+
spec.license = "MIT"
|
15
|
+
|
16
|
+
# The packaged file list is produced by shelling out to `git ls-files`;
# executables and test files below are filtered out of that same list by path prefix.
spec.files = `git ls-files`.split($/)
|
17
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
+
spec.require_paths = ["lib"]
|
20
|
+
|
21
|
+
spec.required_ruby_version = '>= 1.9.2'
|
22
|
+
|
23
|
+
# Development-only dependencies.
spec.add_development_dependency "bundler", "~> 1.3"
|
24
|
+
spec.add_development_dependency "rake"
|
25
|
+
spec.add_development_dependency 'rspec', '~> 2.13.0'
|
26
|
+
spec.add_development_dependency 'pry'
|
27
|
+
|
28
|
+
# Runtime dependencies.
spec.add_dependency 'nokogiri', '~> 1.6.0'
|
29
|
+
spec.add_dependency 'rubyzip', '>= 1.0.0'
|
30
|
+
end
|
data/lib/creek/book.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'zip/filesystem'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
module Creek

  ##
  # Entry point for reading a workbook: opens the zipped spreadsheet, loads its
  # shared strings and exposes its worksheets.
  class Creek::Book

    # :sheets is intentionally not listed here; the explicit #sheets method
    # below is the reader (listing it too only triggered a redefinition).
    attr_reader :files,
                :shared_strings

    ##
    # Opens the workbook at +path+.
    #
    # Options:
    # [:check_file_extension] when true (the default) the extension must be
    #                         .xlsx or .xlsm, compared case-insensitively.
    # [:original_filename]    name whose extension is checked instead of
    #                         +path+ (useful for uploads stored under a
    #                         temporary name).
    #
    # Raises RuntimeError ('Not a valid file format.') when the extension
    # check fails.
    def initialize path, options = {}
      check_file_extension = options.fetch(:check_file_extension, true)
      if check_file_extension
        extension = File.extname(options[:original_filename] || path).downcase
        raise 'Not a valid file format.' unless ['.xlsx', '.xlsm'].include?(extension)
      end
      @files = Zip::File.open path
      @shared_strings = SharedStrings.new(self)
    end

    ##
    # Returns one Sheet per <sheet> element of xl/workbook.xml, in document
    # order. The workbook is parsed on the first call only and the resulting
    # array is reused afterwards.
    def sheets
      @sheets ||= begin
        xml = nil
        # Block form so the zip entry stream is closed once parsing is done.
        @files.file.open("xl/workbook.xml") do |io|
          xml = Nokogiri::XML::Document.parse(io)
        end
        xml.css('sheet').each_with_index.map do |sheet, i|
          # XML attribute names are case-sensitive and the workbook spells it
          # "sheetId"; the lowercase lookup is kept as a fallback.
          sheet_id = sheet.attr("sheetId") || sheet.attr("sheetid")
          Sheet.new(self, sheet.attr("name"), sheet_id, sheet.attr("state"), sheet.attr("visible"), sheet.attr("r:id"), i + 1)
        end
      end
    end

    ##
    # Style types of the workbook, computed once via Creek::Styles.
    def style_types
      @style_types ||= Creek::Styles.new(self).style_types
    end

    ##
    # Closes the underlying zip file.
    def close
      @files.close
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'zip/filesystem'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
module Creek

  ##
  # The shared string table of a workbook: maps the zero-based position of
  # each <si> entry in xl/sharedStrings.xml to its text.
  class Creek::SharedStrings

    attr_reader :book, :dictionary

    ##
    # Builds the table for +book+. The dictionary starts out as an empty hash
    # so that it is still a Hash when the workbook has no shared strings part.
    def initialize book
      @book = book
      @dictionary = {}
      parse_shared_shared_strings
    end

    ##
    # Reads xl/sharedStrings.xml from the book, if present, and fills the
    # dictionary from it. Returns nil when the part does not exist.
    def parse_shared_shared_strings
      path = "xl/sharedStrings.xml"
      return unless @book.files.file.exist?(path)

      xml = nil
      # Block form so the zip entry stream is closed once parsing is done.
      @book.files.file.open(path) do |io|
        xml = Nokogiri::XML::Document.parse(io)
      end
      parse_shared_string_from_document(xml)
    end

    ##
    # Replaces the dictionary with the entries found in the parsed +xml+.
    def parse_shared_string_from_document(xml)
      @dictionary = self.class.parse_shared_string_from_document(xml)
    end

    ##
    # Returns a hash of index => text for every 'si' element of +xml+.
    # An entry's text is the concatenation of all of its 't' elements, which
    # covers both plain entries (one 't') and rich text (several fragments).
    def self.parse_shared_string_from_document(xml)
      xml.css('si').each_with_index.each_with_object({}) do |(si, idx), dictionary|
        dictionary[idx] = si.css('t').map(&:content).join
      end
    end

  end
end
|
data/lib/creek/sheet.rb
ADDED
@@ -0,0 +1,187 @@
|
|
1
|
+
require 'zip/filesystem'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
module Creek
  ##
  # One worksheet of a Creek::Book. Rows are streamed from
  # xl/worksheets/sheet<index>.xml with Nokogiri's pull reader.
  class Creek::Sheet

    # An XLS file has only 256 columns, however, an XLSX or XLSM file can contain up to 16384 columns.
    MAX_COLUMNS = 16_384

    # Maps every valid XLSX column name ("A".."XFD") to its zero-based index.
    # Built once when the class is loaded instead of on every Sheet.new.
    # String#succ walks "A".."Z", "AA".."AZ", ... which is the spreadsheet
    # column naming sequence.
    EXCEL_COL_NAMES = begin
      name = 'A'
      (0...MAX_COLUMNS).each_with_object({}) do |index, table|
        table[name] = index
        name = name.succ
      end
    end.freeze

    attr_reader :book,
                :name,
                :sheetid,
                :state,
                :visible,
                :rid,
                :index

    ##
    # Stores the sheet metadata read from the workbook. +index+ is the
    # one-based position used to locate the worksheet XML part.
    def initialize(book, name, sheetid, state, visible, rid, index)
      @book = book
      @name = name
      @sheetid = sheetid
      @visible = visible
      @rid = rid
      @state = state
      @index = index
    end

    ##
    # Provides an Enumerator that returns a hash representing each row.
    # The key of the hash is the Cell id and the value is the value of the cell.
    # Returns nil when the worksheet part is missing from the archive.
    def rows
      rows_generator
    end

    ##
    # Provides an Enumerator that returns an array representing each row,
    # with each value stored at the zero-based index of its column.
    # Returns nil when the worksheet part is missing from the archive.
    def rows_array
      rows_array_generator
    end

    ##
    # Provides an Enumerator that returns a hash representing each row.
    # The hash contains meta data of the row and a 'cells' embedded hash which contains the cell contents.
    def rows_with_meta_data
      rows_generator true
    end

    private

    ##
    # Returns valid Excel column name for a given column index.
    # For example, returns "A" for 0, "B" for 1 and "AQ" for 42.
    def col_name(i)
      quot = i / 26
      (quot > 0 ? col_name(quot - 1) : "") + (i % 26 + 65).chr
    end

    ##
    # Returns a hash per row that includes the cell ids and values.
    # Empty cells will be also included in the hash with a nil value.
    # With +include_meta_data+ the row attributes are yielded instead, with
    # the cell hash stored under the 'cells' key.
    def rows_generator(include_meta_data=false)
      path = "xl/worksheets/sheet#{@index}.xml"
      return unless @book.files.file.exist?(path)

      # Pull parsing: each element in the stream comes through as two events,
      # one to open the element and one to close it. Self-closing elements
      # only produce the opening event.
      opener = Nokogiri::XML::Reader::TYPE_ELEMENT
      closer = Nokogiri::XML::Reader::TYPE_END_ELEMENT
      Enumerator.new do |y|
        row = nil
        cells = {}
        cell = nil       # reference of the <c> element currently open, nil outside a cell
        last_cell = nil  # reference of the most recent <c> seen in the current row
        cell_type = nil
        cell_style_idx = nil
        @book.files.file.open(path) do |xml|
          Nokogiri::XML::Reader.from_io(xml).each do |node|
            if node.name == 'row' && node.node_type == opener
              row = node.attributes
              row['cells'] = {}
              cells = {}
              cell = nil
              last_cell = nil
              y << (include_meta_data ? row : cells) if node.self_closing?
            elsif node.name == 'row' && node.node_type == closer
              processed_cells = fill_in_empty_cells(cells, row['r'], last_cell)
              row['cells'] = processed_cells
              y << (include_meta_data ? row : processed_cells)
            elsif node.name == 'c' && node.node_type == opener
              cell_type = node.attribute('t')
              cell_style_idx = node.attribute('s')
              last_cell = node.attribute('r')
              # A self-closing <c/> never gets a closing event, so it must not
              # stay "open" and capture text that follows it.
              cell = node.self_closing? ? nil : last_cell
            elsif node.name == 'c' && node.node_type == closer
              # Text between cells (e.g. indentation) must not overwrite the
              # value of the cell that was just closed.
              cell = nil
            elsif node.value? && cell
              cells[cell] = convert(node.value, cell_type, cell_style_idx)
            end
          end
        end
      end
    end

    ##
    # Converts a raw cell +value+ using the cell type and the style found at
    # +style_idx+ in the book's style types.
    def convert(value, type, style_idx)
      style = @book.style_types[style_idx.to_i]
      Creek::Styles::Converter.call(value, type, style, converter_options)
    end

    ##
    # Options handed to the converter; memoized per sheet.
    def converter_options
      @converter_options ||= {shared_strings: @book.shared_strings.dictionary}
    end

    ##
    # The unzipped XML file does not contain any node for empty cells.
    # Empty cells are being padded in using this function: the result holds
    # one entry for every column from "A" up to the column of +last_col+,
    # with nil for the columns that had no value.
    def fill_in_empty_cells(cells, row_number, last_col)
      new_cells = {}
      return new_cells if cells.empty?

      # Keep only the column letters of the reference; this also works when
      # the row carries no number.
      last_col_index = col_index_for_cell_address(last_col)
      # Column outside the known range: return the parsed cells unpadded
      # rather than building a range with a nil bound.
      return cells.dup if last_col_index.nil?

      (0..last_col_index).each do |i|
        id = "#{col_name(i)}#{row_number}"
        new_cells[id] = cells[id]
      end
      new_cells
    end

    ##
    # Returns the zero-based column index for a cell reference such as
    # "AQ17", or nil when the column letters are not a known column name.
    def col_index_for_cell_address(cell_address)
      EXCEL_COL_NAMES[cell_address.delete('^A-Z')]
    end

    ##
    # Returns an array per row; each cell value is stored at the zero-based
    # index of its column, so columns without a value before it are nil.
    def rows_array_generator
      path = "xl/worksheets/sheet#{@index}.xml"
      return unless @book.files.file.exist?(path)

      # Pull parsing: each element in the stream comes through as two events,
      # one to open the element and one to close it.
      opener = Nokogiri::XML::Reader::TYPE_ELEMENT
      closer = Nokogiri::XML::Reader::TYPE_END_ELEMENT
      Enumerator.new do |y|
        row = nil
        cell_type = nil
        cell_style_idx = nil
        cell_address = nil
        @book.files.file.open(path) do |xml|
          Nokogiri::XML::Reader.from_io(xml).each do |node|
            if node.name == 'row' && node.node_type == opener
              row = []
              y << row if node.self_closing?
            elsif node.name == 'row' && node.node_type == closer
              y << row
            elsif node.name == 'c' && node.node_type == opener
              cell_type = node.attribute('t')
              cell_style_idx = node.attribute('s')
              # A self-closing <c/> never gets a closing event, so it must not
              # stay "open" and capture text that follows it.
              cell_address = node.self_closing? ? nil : node.attribute('r')
            elsif node.name == 'c' && node.node_type == closer
              cell_type = nil
              cell_style_idx = nil
              cell_address = nil
            elsif node.name == '#text'
              if cell_address && node.value?
                idx = col_index_for_cell_address(cell_address)
                row[idx] = convert(node.value, cell_type, cell_style_idx)
              end
            end
          end
        end
      end
    end
  end
end
|