read_xls 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/.rspec +2 -0
  4. data/.travis.yml +4 -0
  5. data/CODE_OF_CONDUCT.md +13 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE +21 -0
  8. data/README.md +52 -0
  9. data/Rakefile +6 -0
  10. data/bin/console +14 -0
  11. data/bin/setup +7 -0
  12. data/lib/read_xls.rb +49 -0
  13. data/lib/read_xls/evaluator/blank.rb +9 -0
  14. data/lib/read_xls/evaluator/boolean.rb +15 -0
  15. data/lib/read_xls/evaluator/extended_format.rb +20 -0
  16. data/lib/read_xls/evaluator/format_number.rb +47 -0
  17. data/lib/read_xls/evaluator/formula.rb +33 -0
  18. data/lib/read_xls/evaluator/number.rb +24 -0
  19. data/lib/read_xls/evaluator/rk_number.rb +38 -0
  20. data/lib/read_xls/evaluator/row.rb +26 -0
  21. data/lib/read_xls/evaluator/sst_string.rb +16 -0
  22. data/lib/read_xls/record_handler.rb +262 -0
  23. data/lib/read_xls/record_handler/base.rb +19 -0
  24. data/lib/read_xls/record_handler/blank.rb +11 -0
  25. data/lib/read_xls/record_handler/bof.rb +9 -0
  26. data/lib/read_xls/record_handler/boolerr.rb +11 -0
  27. data/lib/read_xls/record_handler/boundsheet.rb +45 -0
  28. data/lib/read_xls/record_handler/format.rb +20 -0
  29. data/lib/read_xls/record_handler/formula.rb +17 -0
  30. data/lib/read_xls/record_handler/label_sst.rb +15 -0
  31. data/lib/read_xls/record_handler/mul_rk.rb +30 -0
  32. data/lib/read_xls/record_handler/not_implemented.rb +11 -0
  33. data/lib/read_xls/record_handler/number.rb +18 -0
  34. data/lib/read_xls/record_handler/rk.rb +23 -0
  35. data/lib/read_xls/record_handler/row.rb +10 -0
  36. data/lib/read_xls/record_handler/skip.rb +8 -0
  37. data/lib/read_xls/record_handler/sst.rb +36 -0
  38. data/lib/read_xls/record_handler/string.rb +13 -0
  39. data/lib/read_xls/record_handler/xf.rb +19 -0
  40. data/lib/read_xls/spreadsheet.rb +60 -0
  41. data/lib/read_xls/type/extended_format.rb +25 -0
  42. data/lib/read_xls/version.rb +3 -0
  43. data/lib/read_xls/workbook.rb +11 -0
  44. data/lib/read_xls/workbook/shared_string_table.rb +15 -0
  45. data/lib/read_xls/workbook/worksheet.rb +11 -0
  46. data/lib/read_xls/workbook/worksheet_builder.rb +44 -0
  47. data/lib/read_xls/workbook_builder.rb +96 -0
  48. data/read_xls.gemspec +27 -0
  49. metadata +147 -0
@@ -0,0 +1,11 @@
1
module ReadXls
  module RecordHandler
    # Handler that refuses to process a record: calling it always raises.
    class NotImplemented < ::ReadXls::RecordHandler::Base
      # Error raised by #call; its message names the offending record number.
      class RecordHandlerNotImplementedError < StandardError; end

      # Reports that no implementation exists for the current record.
      #
      # @raise [RecordHandlerNotImplementedError] unconditionally
      def call
        message = "there is no implementation for #{record_number}"
        raise RecordHandlerNotImplementedError, message
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
module ReadXls
  module RecordHandler
    # Handles a record whose payload is a cell holding a 64-bit float.
    class Number < ::ReadXls::RecordHandler::Base
      # Decodes the 6-byte cell header (row, column, XF index; three
      # little-endian 16-bit values) and the 8-byte little-endian double that
      # follows it, then attaches a Number evaluator to the addressed cell.
      def call
        header = record_data.byteslice(0, 6)
        payload = record_data.byteslice(6, 8)

        row, column, xf_index = header.unpack("v3")
        number = payload.unpack("E").first

        builder.add_column_to_row(
          row,
          column,
          ::ReadXls::Evaluator::Number.new(builder, number, xf_index)
        )
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
module ReadXls
  module RecordHandler
    # Handles a record whose payload is a cell holding a packed 32-bit RK value.
    class Rk < ::ReadXls::RecordHandler::Base
      # Decodes the 6-byte cell header (row, column, XF index) and the 32-bit
      # little-endian RK word after it. The raw bits are handed to the RkNumber
      # evaluator undecoded, and the evaluator is attached to the addressed cell.
      def call
        header = record_data.byteslice(0, 6)
        row, column, xf_index = header.unpack("v3")

        rk_word = record_data.byteslice(6, 4)
        rk_bits = rk_word.unpack("V").first

        builder.add_column_to_row(
          row,
          column,
          ::ReadXls::Evaluator::RkNumber.new(builder, rk_bits, xf_index)
        )
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module ReadXls
  module RecordHandler
    # Handles a record that declares a row and its column bounds.
    class Row < ::ReadXls::RecordHandler::Base
      # Reads the first three little-endian 16-bit fields of the record (row
      # number, first column, last column) and registers a Row evaluator under
      # that row number.
      def call
        row_number, first_col, last_col = record_data.unpack("v3")
        row = ::ReadXls::Evaluator::Row.new(row_number, first_col, last_col)

        builder.add_row(row_number, row)
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module ReadXls
  module RecordHandler
    # Handler that deliberately ignores a record.
    class Skip < ::ReadXls::RecordHandler::Base
      # Accepts and discards any arguments; does nothing.
      #
      # @return [nil]
      def call(*)
        nil
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
module ReadXls
  module RecordHandler
    # Handles the shared string table record: extracts the raw bytes of every
    # string and stores them, in file order, on the builder.
    class Sst < ::ReadXls::RecordHandler::Base
      # Flag bits of the per-string option byte.
      F_HIGH_BYTE = 0x01 # characters are stored as 2 bytes each
      F_EXT_ST    = 0x04 # a 4-byte size field and a trailing extension block are present
      F_RICH_ST   = 0x08 # a 2-byte run count and trailing formatting runs are present

      STRING_BEGIN_OFFSET = 3 # 2-byte character count + 1-byte option flags
      DATA_OFFSET         = 8 # string data starts after the two 4-byte counters
      STRING_COUNT_OFFSET = 4 # offset of the number of strings stored in the table

      RUN_COUNT_SIZE      = 2 # size of the optional formatting-run count
      EXT_SIZE_SIZE       = 4 # size of the optional extension-block length
      FORMAT_RUN_SIZE     = 4 # size of one formatting run

      # Parses the table and assigns a SharedStringTable to +builder.sst+.
      #
      # Entries are kept exactly in file order and are NOT de-duplicated:
      # removing an entry would shift every later index. Two distinct strings
      # can also share the same raw bytes (one single-byte, one double-byte).
      #
      # Rich-text and extended strings carry extra header fields and trailing
      # blocks; these are skipped so the following strings stay aligned.
      #
      # NOTE(review): strings that continue in a following CONTINUE record are
      # not handled here; only the bytes of this record are inspected.
      def call
        string_count = record_data
          .byteslice(STRING_COUNT_OFFSET, 4)
          .unpack("V")
          .first

        string_data     = record_data.byteslice(DATA_OFFSET..-1)
        string_position = 0

        strings = string_count.times.map do
          char_count, grbit = string_data
            .byteslice(string_position, STRING_BEGIN_OFFSET)
            .unpack("vC")

          string_begin = string_position + STRING_BEGIN_OFFSET
          run_count    = 0
          ext_size     = 0

          # Optional header fields come after the flags, rich-text count first.
          if (grbit & F_RICH_ST) != 0
            run_count = string_data.byteslice(string_begin, RUN_COUNT_SIZE).unpack("v").first
            string_begin += RUN_COUNT_SIZE
          end

          if (grbit & F_EXT_ST) != 0
            ext_size = string_data.byteslice(string_begin, EXT_SIZE_SIZE).unpack("V").first
            string_begin += EXT_SIZE_SIZE
          end

          char_byte_size = (grbit & F_HIGH_BYTE) == 0 ? 1 : 2
          string_length  = char_count * char_byte_size

          # The next string starts after the characters plus any trailing
          # formatting runs and extension data belonging to this one.
          string_position = string_begin +
                            string_length +
                            (run_count * FORMAT_RUN_SIZE) +
                            ext_size

          string_data.byteslice(string_begin, string_length)
        end

        builder.sst = ::ReadXls::Workbook::SharedStringTable.new(strings)
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
module ReadXls
  module RecordHandler
    # Handles a record carrying a string result; the string is queued on the
    # builder via +add_formula_string+.
    class String < ::ReadXls::RecordHandler::Base
      # Bit 0 of the option byte: set when characters are stored as 2 bytes each.
      F_HIGH_BYTE = 0x01

      # Reads the 2-byte character count and 1-byte option flags, then slices
      # out the raw character bytes that follow.
      #
      # Only the high-byte bit decides the character width. The remaining bits
      # of the option byte are masked off (as the Sst handler does), so a
      # stray set bit no longer doubles the slice length.
      def call
        char_length, grbit = record_data.byteslice(0, 3).unpack("vC")
        char_byte_size = (grbit & F_HIGH_BYTE) == 0 ? 1 : 2

        string = record_data.byteslice(3, char_byte_size * char_length)
        builder.add_formula_string(string)
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module ReadXls
  module RecordHandler
    # Handles an extended-format (XF) record.
    class Xf < ::ReadXls::RecordHandler::Base
      # Reads the little-endian 16-bit format index stored at byte offset 2 and
      # registers an ExtendedFormat evaluator for it on the builder.
      def call
        format_index = record_data.byteslice(2, 2).unpack("v").first

        extended_format = ::ReadXls::Evaluator::ExtendedFormat.new(
          :builder => builder,
          :format_index => format_index
        )

        builder.add_extended_format(extended_format)
      end
    end
  end
end
@@ -0,0 +1,60 @@
1
module ReadXls
  # Entry point: reads the "Workbook" stream of an OLE container and walks its
  # records, dispatching each one to a record handler.
  class Spreadsheet
    # Raised when a record header cannot be read from the stream.
    ParsingFailedError = Class.new(StandardError)
    # Size in bytes of a record header field (a 16-bit word).
    BYTE_LENGTH = 2

    attr_accessor :biff, :position, :workbook

    # Opens the file at +xls_file_path+ and parses it.
    #
    # The container is opened read-only: parsing never writes, and a
    # read/write mode would needlessly fail on read-only files.
    #
    # @param xls_file_path [String] path of the .xls file
    # @return [Spreadsheet]
    def self.parse(xls_file_path)
      new(
        Ole::Storage.open(xls_file_path, "rb")
      )
    end

    # Reads the whole "Workbook" stream, parses it, and always closes +ole+,
    # even when parsing raises.
    #
    # @param ole [Object] an opened OLE storage responding to +file+ and +close+
    def initialize(ole)
      self.position = 0
      self.biff     = ole.file.read("Workbook")
      self.workbook = parse_workbook
    ensure
      ole.close
    end

    # @return the worksheets of the parsed workbook
    def sheets
      workbook.worksheets
    end

    # Walks the stream record by record (number, length, payload) until the
    # EOF record number is met, then builds the workbook.
    def parse_workbook
      workbook_builder = WorkbookBuilder.new(biff)

      loop do
        record_number = read_byte
        break if record_number == ::ReadXls::RecordHandler::EOF

        record_length = read_byte
        record_data   = read_data(record_length)

        ::ReadXls::RecordHandler.call(
          record_number,
          workbook_builder,
          biff,
          record_data
        )
      end

      workbook_builder.build
    end

    # Returns the next +bytes+ bytes of the stream and advances the cursor.
    # Slicing is done by byte so the result does not depend on the encoding
    # tag of +biff+.
    #
    # @param bytes [Integer] number of bytes to consume
    # @return [String, nil] the slice (nil when the cursor is past the end)
    def read_data(bytes)
      val = biff.byteslice(position, bytes)
      self.position += bytes
      val
    end


    # Reads the next little-endian 16-bit word (despite the method name) and
    # advances the cursor by two bytes.
    #
    # @return [Integer]
    # @raise [ParsingFailedError] when fewer than two bytes remain, including
    #   when the cursor is already past the end of the stream
    def read_byte
      chunk = biff.byteslice(position, BYTE_LENGTH)
      self.position += BYTE_LENGTH

      # byteslice returns nil past the end; unpack yields [nil] on short input.
      val = chunk ? chunk.unpack("v").first : nil
      val || raise(ParsingFailedError, "expected to get value, got nil")
    end
  end
end
@@ -0,0 +1,25 @@
1
module ReadXls
  module Type
    # Classifies a number-format string (for example "M/D/YY") by type.
    class ExtendedFormat
      attr_accessor :format_string

      # Placeholder patterns mapped to the type they indicate.
      FORMAT_MATCHERS = {
        /[YMDymd]/ => :date
      }

      # Parts of a format string that are literal text rather than
      # placeholders and must not take part in type detection:
      #   "..."    quoted literal text
      #   \x       escaped character
      #   _x / *x  width-skip / fill character
      #   [...]    colour, condition or locale section such as [Red] or [$-409]
      # Elapsed-time tokens ([h], [mm], [ss]) are real placeholders and are
      # kept.
      NON_PLACEHOLDER_SECTIONS =
        /"[^"]*"|\\.|[_*].|\[(?!(?:h+|m+|s+)\])[^\]]*\]/i

      # @param options [Hash] must contain +:format_string+
      # @raise [KeyError] when +:format_string+ is missing
      def initialize(options)
        self.format_string = options.fetch(:format_string)
      end

      # Determines the type of the format.
      #
      # Literal sections are removed first, so a format such as
      # "$#,##0_);[Red]($#,##0)" is no longer mistaken for a date because of
      # the "d" in "[Red]".
      #
      # @return [Symbol, nil] +:date+ when a date/time placeholder is present,
      #   nil otherwise
      # @raise [RuntimeError] when more than one matcher applies
      def format_type
        placeholders  = format_string.to_s.gsub(NON_PLACEHOLDER_SECTIONS, "")
        matched_types = FORMAT_MATCHERS.select { |matcher, _| placeholders =~ matcher }

        if matched_types.length > 1
          raise "got more than one match, expected only one matched format type"
        end

        matched_types.values.first
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
# Gem version, read by the gemspec.
module ReadXls
  VERSION = '0.1.0'
end
@@ -0,0 +1,11 @@
1
module ReadXls
  # Plain container for the parsed parts of a workbook.
  class Workbook
    attr_accessor :worksheets, :formats, :extended_formats

    # Starts with three empty, independent lists.
    def initialize
      @worksheets       = []
      @formats          = []
      @extended_formats = []
    end
  end
end
@@ -0,0 +1,15 @@
1
module ReadXls
  class Workbook
    # Thin wrapper around the list of shared strings.
    class SharedStringTable
      attr_accessor :strings

      # @param strings [Array] the table entries, addressed by position
      def initialize(strings)
        @strings = strings
      end

      # Looks up the entry stored at position +i+.
      #
      # @return the entry, or nil when +i+ is out of range
      def index(i)
        strings[i]
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
module ReadXls
  class Workbook
    # A built worksheet: just the rows it holds.
    class Worksheet
      attr_accessor :rows

      # @param options [Hash] must contain +:rows+
      # @raise [KeyError] when +:rows+ is missing
      def initialize(options)
        @rows = options.fetch(:rows)
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
module ReadXls
  class Workbook
    # Collects rows, cells and formula strings while a worksheet is parsed,
    # then produces the Worksheet.
    class WorksheetBuilder
      attr_accessor :rows, :formula_strings, :sst, :formats, :extended_formats

      def initialize
        @rows            = []
        @formula_strings = []
      end

      # Stores +row+ at position +row_index+.
      def add_row(row_index, row)
        rows[row_index] = row
      end

      # Adds +value+ as column +column_index+ of an already registered row.
      #
      # @raise [RuntimeError] when no row is registered at +row_index+
      def add_column_to_row(row_index, column_index, value)
        target = rows[row_index]
        raise("could not find row") unless target

        target.add_column(column_index, value)
      end

      # Appends a string to the queue consumed by #next_formula_string!.
      def add_formula_string(string)
        formula_strings << string
      end

      # @return [::ReadXls::Workbook::Worksheet] worksheet holding the evaluated rows
      def build
        evaluated_rows = build_rows
        ::ReadXls::Workbook::Worksheet.new(:rows => evaluated_rows)
      end

      # Removes and returns the oldest queued formula string (nil when empty).
      def next_formula_string!
        formula_strings.shift
      end


      private

      # Fills every unset slot of +rows+ with an empty Row evaluator (in
      # place), then evaluates all rows in order.
      def build_rows
        rows.each_index do |row_index|
          rows[row_index] ||= ::ReadXls::Evaluator::Row.new(row_index, 0, 0)
        end

        rows.map { |row| row.evaluate }
      end
    end
  end
end
@@ -0,0 +1,96 @@
1
module ReadXls
  # Accumulates workbook-level data (formats, extended formats, shared strings,
  # worksheet builders) during parsing and assembles the final Workbook.
  class WorkbookBuilder
    # Built-in number formats keyed by their format index.
    #
    # The indices are NOT contiguous: 23-36 are not defined here, and the
    # accounting and time formats start again at 37. Packing the list into
    # consecutive array slots would shift 37-49 down to 23-36 and leave the
    # real 37-49 undefined. Within the accounting group, the variants without
    # a currency symbol come first (41, 43) and the "$" variants second (42, 44).
    BUILTIN_FORMATS = {
      0  => "General",
      1  => "0",
      2  => "0.00",
      3  => "#,##0",
      4  => "#,##0.00",
      5  => "$#,##0_);($#,##0)",
      6  => "$#,##0_);[Red]($#,##0)",
      7  => "$#,##0.00_);($#,##0.00)",
      8  => "$#,##0.00_);[Red]($#,##0.00)",
      9  => "0%",
      10 => "0.00%",
      11 => "0.00E+00",
      12 => "# ?/?",
      13 => "# ??/??",
      14 => "M/D/YY",
      15 => "D-MMM-YY",
      16 => "D-MMM",
      17 => "MMM-YY",
      18 => "h:mm AM/PM",
      19 => "h:mm:ss AM/PM",
      20 => "h:mm",
      21 => "h:mm:ss",
      22 => "M/D/YY h:mm",
      37 => "_(#,##0_);(#,##0)",
      38 => "_(#,##0_);[Red](#,##0)",
      39 => "_(#,##0.00_);(#,##0.00)",
      40 => "_(#,##0.00_);[Red](#,##0.00)",
      41 => '_(* #,##0_);_(* (#,##0);_(* "-"_);_(@_)',
      42 => '_($* #,##0_);_($* (#,##0);_($* "-"_);_(@_)',
      43 => '_(* #,##0.00_);_(* (#,##0.00);_(* "-"??_);_(@_)',
      44 => '_($* #,##0.00_);_($* (#,##0.00);_($* "-"??_);_(@_)',
      45 => "mm:ss",
      46 => "[h]:mm:ss",
      47 => "mm:ss.0",
      48 => "##0.0E+0",
      49 => "@"
    }.freeze

    attr_accessor :biff, :worksheet_builders, :sst, :formats, :extended_formats

    # @param biff [String] the raw workbook stream
    def initialize(biff)
      self.biff               = biff
      self.worksheet_builders = []
      self.formats            = default_formats
      self.extended_formats   = []
    end

    # Registers a builder for one worksheet.
    def add_worksheet_builder(worksheet_builder)
      self.worksheet_builders.push(worksheet_builder)
    end

    # Stores (or overrides) the format string for +format_index+.
    def add_format(format_index, format_string)
      self.formats[format_index] = format_string
    end

    # Appends an extended format; its position in the list is its index.
    def add_extended_format(extended_format)
      self.extended_formats.push(extended_format)
    end

    # Assembles the workbook from everything collected so far.
    #
    # @return [::ReadXls::Workbook]
    # @raise [RuntimeError] when worksheets exist but no shared string table
    #   was set
    def build
      workbook = ::ReadXls::Workbook.new
      workbook.formats          = build_formats
      workbook.extended_formats = build_extended_formats
      workbook.worksheets       = build_worksheets
      workbook
    end


    private

    # Snapshot of the format table, computed once and then reused.
    def build_formats
      @_formats ||= formats.dup
    end

    # Evaluated extended formats, computed once and then reused.
    def build_extended_formats
      @_extended_formats ||= extended_formats.map(&:evaluate)
    end

    # Hands the shared data to every worksheet builder and builds each sheet.
    def build_worksheets
      worksheet_builders.map do |worksheet_builder|
        raise "no sst found!" if sst.nil?

        worksheet_builder.sst              = sst
        worksheet_builder.formats          = build_formats
        worksheet_builder.extended_formats = build_extended_formats

        worksheet_builder.build
      end
    end

    # Fresh format table pre-filled with the built-in formats, each stored at
    # its own format index (slots without a built-in format stay nil).
    # Strings are duplicated so that builders never share mutable state.
    #
    # @return [Array<String, nil>]
    def default_formats
      BUILTIN_FORMATS.each_with_object([]) do |(format_index, format_string), table|
        table[format_index] = format_string.dup
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
# coding: utf-8
# Gem specification for read_xls. The version constant lives under lib/, so
# lib/ is put on the load path before it is required.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require "read_xls/version"

Gem::Specification.new do |spec|
  spec.name          = "read_xls"
  spec.version       = ReadXls::VERSION
  spec.authors       = ["P2Binvestor"]
  spec.email         = ["techadmin@p2bi.com"]

  spec.summary       = "Parse XLS files."
  spec.description   = ""
  spec.homepage      = "http://github.com/p2bi/read_xls"
  spec.license       = "MIT"

  # Package every git-tracked file except tests, specs and features.
  tracked_files      = `git ls-files -z`.split("\x0")
  spec.files         = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }
  spec.bindir        = "exe"
  # Executables are the base names of the packaged files under exe/.
  spec.executables   = spec.files.grep(%r{^exe/}) { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  spec.add_dependency "ruby-ole"

  spec.add_development_dependency "bundler", "~> 1.10"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec"
end