mindreframer-creek 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.rdoc +76 -0
- data/Rakefile +7 -0
- data/creek.gemspec +30 -0
- data/lib/creek/book.rb +38 -0
- data/lib/creek/shared_strings.rb +44 -0
- data/lib/creek/sheet.rb +187 -0
- data/lib/creek/styles/constants.rb +44 -0
- data/lib/creek/styles/converter.rb +116 -0
- data/lib/creek/styles/style_types.rb +85 -0
- data/lib/creek/styles.rb +27 -0
- data/lib/creek/version.rb +3 -0
- data/lib/creek.rb +12 -0
- data/spec/fixtures/invalid.xls +0 -0
- data/spec/fixtures/sample-as-zip.zip +0 -0
- data/spec/fixtures/sample.xlsx +0 -0
- data/spec/fixtures/sheets/sheet1.xml +459 -0
- data/spec/fixtures/sst.xml +78 -0
- data/spec/fixtures/styles/first.xml +208 -0
- data/spec/fixtures/temp_string_io_file_path_with_no_extension +0 -0
- data/spec/shared_string_spec.rb +18 -0
- data/spec/spec_helper.rb +3 -0
- data/spec/styles/converter_spec.rb +16 -0
- data/spec/styles/style_types_spec.rb +15 -0
- data/spec/test_spec.rb +99 -0
- metadata +168 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: f3f021cdb45360b0892a1e1a023302c484efbbdc
|
4
|
+
data.tar.gz: 3f3ca6081a343a1c7ee20c94cb8bb2ad3f37404a
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 0083c8069933048d18b45795eda213d74fa655aaa717651f812f152fc0dfae500d637e17d1833601ab2a1b531a9e417f9ed8ea19637ba7d54008dec4c0e9498b
|
7
|
+
data.tar.gz: 976cc1d6c079bef67193cb135d5d1c36e440d49ebb93f06f046e26cac0dcba4aa34fb9764a1c8ebb65a83a3a789daf330f0426bdea8a202d6b04f3768c8089be
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 TODO: Write your name
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
= Creek -- Stream parser for large Excel(xlsx and xlsm) files.
|
2
|
+
|
3
|
+
Creek is a Ruby gem that provides a fast, simple and efficient method of parsing large Excel (xlsx and xlsm) files.
|
4
|
+
|
5
|
+
|
6
|
+
== Installation
|
7
|
+
|
8
|
+
Creek can be used from the command line or as part of a Ruby web framework. To install the gem from the terminal, run the following command:
|
9
|
+
|
10
|
+
gem install creek
|
11
|
+
|
12
|
+
To use it in Rails, add this line to your Gemfile:
|
13
|
+
|
14
|
+
gem "creek"
|
15
|
+
|
16
|
+
|
17
|
+
== Basic Usage
|
18
|
+
Creek can simply parse an Excel file by looping through the rows enumerator:
|
19
|
+
|
20
|
+
require 'creek'
|
21
|
+
creek = Creek::Book.new "spec/fixtures/sample.xlsx"
|
22
|
+
sheet = creek.sheets[0]
|
23
|
+
|
24
|
+
sheet.rows.each do |row|
|
25
|
+
puts row # => {"A1"=>"Content 1", "B1"=>nil, "C1"=>nil, "D1"=>"Content 3"}
|
26
|
+
end
|
27
|
+
|
28
|
+
|
29
|
+
sheet.rows_with_meta_data.each do |row|
|
30
|
+
puts row # => {"collapsed"=>"false", "customFormat"=>"false", "customHeight"=>"true", "hidden"=>"false", "ht"=>"12.1", "outlineLevel"=>"0", "r"=>"1", "cells"=>{"A1"=>"Content 1", "B1"=>nil, "C1"=>nil, "D1"=>"Content 3"}}
|
31
|
+
end
|
32
|
+
|
33
|
+
|
34
|
+
sheet.state # => 'visible'
|
35
|
+
sheet.name # => 'Sheet1'
|
36
|
+
sheet.rid # => 'rId2'
|
37
|
+
|
38
|
+
== Filename considerations
|
39
|
+
By default, Creek will ensure that the file extension is either *.xlsx or *.xlsm, but this check can be circumvented as needed:
|
40
|
+
|
41
|
+
path = 'sample-as-zip.zip'
|
42
|
+
Creek::Book.new path, :check_file_extension => false
|
43
|
+
|
44
|
+
By default, the Rails {file_field_tag}[http://api.rubyonrails.org/classes/ActionView/Helpers/FormTagHelper.html#method-i-file_field_tag] uploads to a temporary location and stores the original filename with the StringIO object. (See {this section}[http://guides.rubyonrails.org/form_helpers.html#uploading-files] of the Rails Guides for more information.)
|
45
|
+
|
46
|
+
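Creek can parse this directly without the need for file upload gems such as Carrierwave or Paperclip, either by passing the original filename via the :original_filename option (so the extension check runs against it) or, as shown below, by skipping the extension check: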
|
47
|
+
|
48
|
+
# Import endpoint in Rails controller
|
49
|
+
def import
|
50
|
+
file = params[:file]
|
51
|
+
Creek::Book.new file.path, check_file_extension: false
|
52
|
+
end
|
53
|
+
|
54
|
+
== Contributing
|
55
|
+
|
56
|
+
Contributions are welcome. You can fork the repository, add your code changes to the forked branch, ensure all existing unit tests pass, create new unit tests to cover your changes, and finally create a pull request.
|
57
|
+
|
58
|
+
After forking and then cloning the repository locally, install Bundler and then use it
|
59
|
+
to install the development gem dependencies:
|
60
|
+
|
61
|
+
gem install bundler
|
62
|
+
bundle install
|
63
|
+
|
64
|
+
Once this is complete, you should be able to run the test suite:
|
65
|
+
|
66
|
+
rake
|
67
|
+
|
68
|
+
|
69
|
+
== Bug Reporting
|
70
|
+
|
71
|
+
Please use the {Issues}[https://github.com/pythonicrubyist/creek/issues] page to report bugs or suggest new enhancements.
|
72
|
+
|
73
|
+
|
74
|
+
== License
|
75
|
+
|
76
|
+
Creek has been published under {MIT License}[https://github.com/pythonicrubyist/creek/blob/master/LICENSE.txt]
|
data/Rakefile
ADDED
data/creek.gemspec
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'creek/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "mindreframer-creek"
|
8
|
+
spec.version = Creek::VERSION
|
9
|
+
spec.authors = ["pythonicrubyist"]
|
10
|
+
spec.email = ["pythonicrubyist@gmail.com"]
|
11
|
+
spec.description = %q{A Ruby gem that streams and parses large Excel(xlsx and xlsm) files fast and efficiently.}
|
12
|
+
spec.summary = %q{A Ruby gem for parsing large Excel(xlsx and xlsm) files.}
|
13
|
+
spec.homepage = "https://github.com/mindreframer/creek"
|
14
|
+
spec.license = "MIT"
|
15
|
+
|
16
|
+
spec.files = `git ls-files`.split($/)
|
17
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
+
spec.require_paths = ["lib"]
|
20
|
+
|
21
|
+
spec.required_ruby_version = '>= 1.9.2'
|
22
|
+
|
23
|
+
spec.add_development_dependency "bundler", "~> 1.3"
|
24
|
+
spec.add_development_dependency "rake"
|
25
|
+
spec.add_development_dependency 'rspec', '~> 2.13.0'
|
26
|
+
spec.add_development_dependency 'pry'
|
27
|
+
|
28
|
+
spec.add_dependency 'nokogiri', '~> 1.6.0'
|
29
|
+
spec.add_dependency 'rubyzip', '>= 1.0.0'
|
30
|
+
end
|
data/lib/creek/book.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'zip/filesystem'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
module Creek
|
5
|
+
|
6
|
+
class Creek::Book
|
7
|
+
|
8
|
+
attr_reader :files,
|
9
|
+
:sheets,
|
10
|
+
:shared_strings
|
11
|
+
|
12
|
+
def initialize path, options = {}
|
13
|
+
check_file_extension = options.fetch(:check_file_extension, true)
|
14
|
+
if check_file_extension
|
15
|
+
extension = File.extname(options[:original_filename] || path).downcase
|
16
|
+
raise 'Not a valid file format.' unless (['.xlsx', '.xlsm'].include? extension)
|
17
|
+
end
|
18
|
+
@files = Zip::File.open path
|
19
|
+
@shared_strings = SharedStrings.new(self)
|
20
|
+
end
|
21
|
+
|
22
|
+
def sheets
|
23
|
+
doc = @files.file.open "xl/workbook.xml"
|
24
|
+
xml = Nokogiri::XML::Document.parse doc
|
25
|
+
@sheets = xml.css('sheet').each_with_index.map do |sheet, i|
|
26
|
+
Sheet.new(self, sheet.attr("name"), sheet.attr("sheetid"), sheet.attr("state"), sheet.attr("visible"), sheet.attr("r:id"), i+1)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def style_types
|
31
|
+
@style_types ||= Creek::Styles.new(self).style_types
|
32
|
+
end
|
33
|
+
|
34
|
+
def close
|
35
|
+
@files.close
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'zip/filesystem'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
module Creek
|
5
|
+
|
6
|
+
class Creek::SharedStrings
|
7
|
+
|
8
|
+
attr_reader :book, :dictionary
|
9
|
+
|
10
|
+
def initialize book
|
11
|
+
@book = book
|
12
|
+
parse_shared_shared_strings
|
13
|
+
end
|
14
|
+
|
15
|
+
def parse_shared_shared_strings
|
16
|
+
path = "xl/sharedStrings.xml"
|
17
|
+
if @book.files.file.exist?(path)
|
18
|
+
doc = @book.files.file.open path
|
19
|
+
xml = Nokogiri::XML::Document.parse doc
|
20
|
+
parse_shared_string_from_document(xml)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
def parse_shared_string_from_document(xml)
|
25
|
+
@dictionary = self.class.parse_shared_string_from_document(xml)
|
26
|
+
end
|
27
|
+
|
28
|
+
def self.parse_shared_string_from_document(xml)
|
29
|
+
dictionary = Hash.new
|
30
|
+
|
31
|
+
xml.css('si').each_with_index do |si, idx|
|
32
|
+
text_nodes = si.css('t')
|
33
|
+
if text_nodes.count == 1 # plain text node
|
34
|
+
dictionary[idx] = text_nodes.first.content
|
35
|
+
else # rich text nodes with text fragments
|
36
|
+
dictionary[idx] = text_nodes.map(&:content).join('')
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
dictionary
|
41
|
+
end
|
42
|
+
|
43
|
+
end
|
44
|
+
end
|
data/lib/creek/sheet.rb
ADDED
@@ -0,0 +1,187 @@
|
|
1
|
+
require 'zip/filesystem'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
module Creek
|
5
|
+
class Creek::Sheet
|
6
|
+
|
7
|
+
attr_reader :book,
|
8
|
+
:name,
|
9
|
+
:sheetid,
|
10
|
+
:state,
|
11
|
+
:visible,
|
12
|
+
:rid,
|
13
|
+
:index
|
14
|
+
|
15
|
+
|
16
|
+
def initialize(book, name, sheetid, state, visible, rid, index)
|
17
|
+
@book = book
|
18
|
+
@name = name
|
19
|
+
@sheetid = sheetid
|
20
|
+
@visible = visible
|
21
|
+
@rid = rid
|
22
|
+
@state = state
|
23
|
+
@index = index
|
24
|
+
|
25
|
+
# An XLS file has only 256 columns, however, an XLSX or XLSM file can contain up to 16384 columns.
|
26
|
+
# The loop below builds a hash that maps every valid XLSX column name to its zero-based column index.
|
27
|
+
@@excel_col_names = Hash.new
|
28
|
+
(0...16384).each do |i|
|
29
|
+
@@excel_col_names[col_name(i)] = i
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
##
|
34
|
+
# Provides an Enumerator that returns a hash representing each row.
|
35
|
+
# The key of the hash is the Cell id and the value is the value of the cell.
|
36
|
+
def rows
|
37
|
+
rows_generator
|
38
|
+
end
|
39
|
+
|
40
|
+
def rows_array
|
41
|
+
rows_array_generator
|
42
|
+
end
|
43
|
+
|
44
|
+
##
|
45
|
+
# Provides an Enumerator that returns a hash representing each row.
|
46
|
+
# The hash contains meta data of the row and an embedded 'cells' hash which contains the cell contents.
|
47
|
+
def rows_with_meta_data
|
48
|
+
rows_generator true
|
49
|
+
end
|
50
|
+
|
51
|
+
private
|
52
|
+
##
|
53
|
+
# Returns valid Excel column name for a given column index.
|
54
|
+
# For example, returns "A" for 0, "B" for 1 and "AQ" for 42.
|
55
|
+
def col_name(i)
|
56
|
+
quot = i/26
|
57
|
+
(quot>0 ? col_name(quot-1) : "") + (i%26+65).chr
|
58
|
+
end
|
59
|
+
|
60
|
+
##
|
61
|
+
# Returns a hash per row that includes the cell ids and values.
|
62
|
+
# Empty cells will also be included in the hash with a nil value.
|
63
|
+
def rows_generator(include_meta_data=false)
|
64
|
+
path = "xl/worksheets/sheet#{@index}.xml"
|
65
|
+
if @book.files.file.exist?(path)
|
66
|
+
# Streaming parse with Nokogiri::XML::Reader: each element in the stream comes through as two events:
|
67
|
+
# one to open the element and one to close it.
|
68
|
+
opener = Nokogiri::XML::Reader::TYPE_ELEMENT
|
69
|
+
closer = Nokogiri::XML::Reader::TYPE_END_ELEMENT
|
70
|
+
Enumerator.new do |y|
|
71
|
+
row = nil
|
72
|
+
cells = {}
|
73
|
+
cell = nil
|
74
|
+
cell_type = nil
|
75
|
+
cell_style_idx = nil
|
76
|
+
@book.files.file.open(path) do |xml|
|
77
|
+
Nokogiri::XML::Reader.from_io(xml).each do |node|
|
78
|
+
if (node.name.eql? 'row') and (node.node_type.eql? opener)
|
79
|
+
row = node.attributes
|
80
|
+
row['cells'] = Hash.new
|
81
|
+
cells = Hash.new
|
82
|
+
y << (include_meta_data ? row : cells) if node.self_closing?
|
83
|
+
elsif (node.name.eql? 'row') and (node.node_type.eql? closer)
|
84
|
+
processed_cells = fill_in_empty_cells(cells, row['r'], cell)
|
85
|
+
row['cells'] = processed_cells
|
86
|
+
y << (include_meta_data ? row : processed_cells)
|
87
|
+
elsif (node.name.eql? 'c') and (node.node_type.eql? opener)
|
88
|
+
cell_type = node.attribute('t')
|
89
|
+
cell_style_idx = node.attribute('s')
|
90
|
+
cell = node.attribute('r')
|
91
|
+
|
92
|
+
elsif node.value?
|
93
|
+
if !cell.nil?
|
94
|
+
cells[cell] = convert(node.value, cell_type, cell_style_idx)
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
def convert(value, type, style_idx)
|
104
|
+
style = @book.style_types[style_idx.to_i]
|
105
|
+
Creek::Styles::Converter.call(value, type, style, converter_options)
|
106
|
+
end
|
107
|
+
|
108
|
+
def converter_options
|
109
|
+
@converter_options ||= {shared_strings: @book.shared_strings.dictionary}
|
110
|
+
end
|
111
|
+
|
112
|
+
##
|
113
|
+
# The unzipped XML file does not contain any node for empty cells.
|
114
|
+
# Empty cells are being padded in using this function
|
115
|
+
def fill_in_empty_cells(cells, row_number, last_col)
|
116
|
+
new_cells = Hash.new
|
117
|
+
unless cells.empty?
|
118
|
+
keys = cells.keys.sort
|
119
|
+
last_col = last_col.gsub(row_number, '')
|
120
|
+
last_col_index = @@excel_col_names[last_col]
|
121
|
+
[*(0..last_col_index)].each do |i|
|
122
|
+
col = col_name(i)
|
123
|
+
id = "#{col}#{row_number}"
|
124
|
+
unless cells.has_key? id
|
125
|
+
new_cells[id] = nil
|
126
|
+
else
|
127
|
+
new_cells[id] = cells[id]
|
128
|
+
end
|
129
|
+
end
|
130
|
+
end
|
131
|
+
new_cells
|
132
|
+
end
|
133
|
+
|
134
|
+
def col_index_for_cell_address(cell_address)
|
135
|
+
col = cell_address.delete('^A-Z')
|
136
|
+
col_index = @@excel_col_names[col]
|
137
|
+
end
|
138
|
+
|
139
|
+
|
140
|
+
##
|
141
|
+
# Returns an array per row, with each cell value stored at its zero-based column index.
|
142
|
+
# Columns without a value that precede a populated cell appear in the array as nil.
|
143
|
+
def rows_array_generator
|
144
|
+
path = "xl/worksheets/sheet#{@index}.xml"
|
145
|
+
if @book.files.file.exist?(path)
|
146
|
+
# Streaming parse with Nokogiri::XML::Reader: each element in the stream comes through as two events:
|
147
|
+
# one to open the element and one to close it.
|
148
|
+
opener = Nokogiri::XML::Reader::TYPE_ELEMENT
|
149
|
+
closer = Nokogiri::XML::Reader::TYPE_END_ELEMENT
|
150
|
+
Enumerator.new do |y|
|
151
|
+
row = nil
|
152
|
+
cell_type = nil
|
153
|
+
cell_style_idx = nil
|
154
|
+
cell_address = nil
|
155
|
+
@book.files.file.open(path) do |xml|
|
156
|
+
Nokogiri::XML::Reader.from_io(xml).each do |node|
|
157
|
+
if (node.name.eql? 'row') and (node.node_type.eql? opener)
|
158
|
+
row = []
|
159
|
+
y << (row) if node.self_closing?
|
160
|
+
|
161
|
+
elsif (node.name.eql? 'row') and (node.node_type.eql? closer)
|
162
|
+
y << row
|
163
|
+
|
164
|
+
elsif (node.name.eql? 'c') and (node.node_type.eql? opener)
|
165
|
+
cell_type = node.attribute('t')
|
166
|
+
cell_style_idx = node.attribute('s')
|
167
|
+
cell_address = node.attribute('r')
|
168
|
+
|
169
|
+
elsif (node.name.eql? 'c') and (node.node_type.eql? closer)
|
170
|
+
cell_type = nil
|
171
|
+
cell_style_idx = nil
|
172
|
+
cell_address = nil
|
173
|
+
|
174
|
+
elsif (node.name.eql? '#text')
|
175
|
+
if !cell_address.nil? and node.value?
|
176
|
+
idx = col_index_for_cell_address(cell_address)
|
177
|
+
value = convert(node.value, cell_type, cell_style_idx)
|
178
|
+
row[idx] = value
|
179
|
+
end
|
180
|
+
end
|
181
|
+
end
|
182
|
+
end
|
183
|
+
end
|
184
|
+
end
|
185
|
+
end
|
186
|
+
end
|
187
|
+
end
|