parseexcel 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,87 @@
1
+ Spreadsheet::ParseExcel - Get information from an Excel file.
2
+ ============
3
+ Version: 0.5.1
4
+ Date: 2006-05-18
5
+
6
+ Short Description:
7
+ Spreadsheet::ParseExcel allows you to get information out of a
8
+ simple Excel file.
9
+ This package is an - as of today incomplete - translation of
10
+ Kawai Takanori's Perl module.
11
+
12
+
13
+ Requirements
14
+ ------------
15
+
16
+ * ruby 1.8
17
+
18
+ Install
19
+ -------
20
+
21
+ Decompress the archive and enter its top directory.
22
+ Then type:
23
+
24
+ $ ruby setup.rb config
25
+ $ ruby setup.rb setup
26
+ ($ su)
27
+ # ruby setup.rb install
28
+
29
+ You can also install files into your favorite directory
30
+ by supplying setup.rb some options. Try "ruby setup.rb --help".
31
+
32
+
33
+ Usage
34
+ -----
35
+ #!/usr/bin/env ruby
36
+
37
+ require 'parseexcel/parser'
38
+
39
+ # your first step is always reading in the file.
40
+ # that gives you a workbook-object, which has one or more worksheets,
41
+ # just like in Excel you have the possibility of multiple worksheets.
42
+ workbook = Spreadsheet::ParseExcel.parse(path_to_file)
43
+
44
+ # usually, you want the first worksheet:
45
+ worksheet = workbook.worksheet(0)
46
+
47
+ # now you can either iterate over all rows, optionally skipping a number
48
+ # of leading rows (in case you know they just contain column headers)
49
+ skip = 2
50
+ worksheet.each(skip) { |row|
51
+ # a row is actually just an Array of Cells..
52
+ first_cell = row.at(0)
53
+
54
+ # how you get data out of the cell depends on what datatype you
55
+ # expect:
56
+
57
+ # if you expect a String, you can pass an encoding and (iconv
58
+ # required) the content of the cell will be converted.
59
+ str = row.at(1).to_s('latin1')
60
+
61
+ # if you expect a Float:
62
+ float = row.at(2).to_f
63
+
64
+ # if you expect an Integer:
65
+ int = row.at(3).to_i
66
+
67
+ # if you expect a Date:
68
+ date = row.at(4).date
69
+
70
+ # ParseExcel makes a guess at what Datatype a cell has. At the moment,
71
+ # possible values are: :date, :numeric, :text
72
+ celltype = first_cell.type
73
+ }
74
+
75
+ # if you know exactly which row your data resides in, you may just
76
+ # retrieve that row, which is again simply an Array of Cells
77
+ row = worksheet.row(26)
78
+
79
+
80
+ License
81
+ -------
82
+
83
+ LGPL
84
+
85
+
86
+ URL: http://download.ywesee.com/parseexcel
87
+ Author: Hannes Wyss <hwyss@ywesee.com>
@@ -0,0 +1,109 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Spreadsheet::ParseExcel -- Extract Data from an Excel File
4
+ # Copyright (C) 2003 ywesee -- intellectual capital connected
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2.1 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
+ # hwyss@ywesee.com
22
+ #
23
+ # Format -- Spreadsheet::ParseExcel -- 10.06.2003 -- hwyss@ywesee.com
24
+
25
module Spreadsheet
  module ParseExcel
    # Formatting attributes of a cell.
    #
    # Besides carrying the attributes themselves, a Format resolves its
    # format index (+fmt_idx+) to a format string and uses that string to
    # guess whether a numeric cell actually holds a date.
    class Format
      # Format strings keyed by format index. The table is shared by every
      # Format instance; #add_text_format adds to (or overwrites) it.
      @@fmt_strs = {
        0x00 => '@',
        0x01 => '0',
        0x02 => '0.00',
        0x03 => '#,##0',
        0x04 => '#,##0.00',
        0x05 => '($#,##0_);($#,##0)',
        0x06 => '($#,##0_);[RED]($#,##0)',
        0x07 => '($#,##0.00_);($#,##0.00_)',
        0x08 => '($#,##0.00_);[RED]($#,##0.00_)',
        0x09 => '0%',
        0x0A => '0.00%',
        0x0B => '0.00E+00',
        0x0C => '# ?/?',
        0x0D => '# ??/??',
        0x0E => 'm-d-yy',
        0x0F => 'd-mmm-yy',
        0x10 => 'd-mmm',
        0x11 => 'mmm-yy',
        0x12 => 'h:mm AM/PM',
        0x13 => 'h:mm:ss AM/PM',
        0x14 => 'h:mm',
        0x15 => 'h:mm:ss',
        0x16 => 'm-d-yy h:mm',
        # 0x17-0x24 -- differ between national versions, not listed here
        0x25 => '(#,##0_);(#,##0)',
        0x26 => '(#,##0_);[RED](#,##0)',
        0x27 => '(#,##0.00);(#,##0.00)',
        0x28 => '(#,##0.00);[RED](#,##0.00)',
        0x29 => '_(*#,##0_);_(*(#,##0);_(*"-"_);_(@_)',
        0x2A => '_($*#,##0_);_($*(#,##0);_(*"-"_);_(@_)',
        0x2B => '_(*#,##0.00_);_(*(#,##0.00);_(*"-"??_);_(@_)',
        0x2C => '_($*#,##0.00_);_($*(#,##0.00);_(*"-"??_);_(@_)',
        0x2D => 'mm:ss',
        0x2E => '[h]:mm:ss',
        0x2F => 'mm:ss.0',
        0x30 => '##0.0E+0',
        0x31 => '@',
      }

      # A format string counts as a date format when it contains one of
      # dd, mm, yy, h or ss. Each letter may be preceded by a NUL byte so
      # that strings stored with two bytes per character match as well.
      @@date_pattern = /(\0?d\0?d|\0?m\0?m|\0?y\0?y|\0?h|\0?s\0?s)/i

      attr_accessor :font_no, :fmt_idx, :lock, :hidden, :style, :key_123
      attr_accessor :align_h, :wrap, :align_v, :just_last, :rotate, :indent
      attr_accessor :shrink, :merge, :read_dir, :encoding
      attr_accessor :border_style, :border_color, :border_diag, :fill

      # params:: Hash of attribute name => value. Every key that has a
      #          matching writer method is assigned; other keys are ignored.
      def initialize(params={})
        params.each { |key, val|
          writer = "#{key}="
          send(writer, val) if respond_to?(writer)
        }
      end

      # Registers +fmt_str+ under format index +idx+ in the shared table.
      def add_text_format(idx, fmt_str)
        @@fmt_strs.store(idx, fmt_str)
      end

      # Guesses the datatype of +cell+ (anything responding to #numeric).
      #
      # Returns :text for non-numeric cells. Numeric cells are :date when
      # the format index lies in 0x0E..0x16 or 0x2D..0x2F, or when the
      # registered format string looks like a date format; otherwise
      # :numeric.
      def cell_type(cell)
        return :text unless cell.numeric
        case @fmt_idx.to_i
        when 0x0E..0x16, 0x2D..0x2F
          :date
        else
          fmt = @@fmt_strs[@fmt_idx]
          (fmt && @@date_pattern.match(fmt)) ? :date : :numeric
        end
      end

      # Returns +str+ unchanged when +code+ is :_native_. For any other
      # code the string is read as 16-bit big-endian units and each unit is
      # written back as a single byte (only the low 8 bits survive).
      def text_format(str, code=:_native_)
        (code == :_native_) ? str : str.unpack('n*').pack('C*')
      end

      # Returns a copy of the format string registered for +fmt_idx+ (an
      # empty String when there is none).
      #
      # target_encoding:: when given, the string is converted from
      #                   +encoding+ to this encoding. Iconv is used if it
      #                   has been loaded, so existing setups behave as
      #                   before; otherwise String#encode does the
      #                   conversion, which only understands encoding names
      #                   known to Ruby itself.
      def to_s(target_encoding=nil)
        fmt_str = @@fmt_strs[@fmt_idx].to_s
        return fmt_str.dup unless target_encoding
        if defined?(::Iconv)
          ::Iconv.new(target_encoding, @encoding).iconv(fmt_str)
        elsif @encoding
          fmt_str.encode(target_encoding, @encoding)
        else
          # no source encoding recorded: convert from the string's own
          fmt_str.encode(target_encoding)
        end
      end
    end
  end
end
@@ -0,0 +1,355 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Spreadsheet::ParseExcel -- Extract Data from an Excel File
4
+ # Copyright (C) 2003 ywesee -- intellectual capital connected
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2.1 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
+ # hwyss@ywesee.com
22
+ #
23
+ # OLEReader -- Spreadsheet::ParseExcel -- 05.06.2003 -- hwyss@ywesee.com
24
+
25
+ require 'date'
26
+
27
+ module OLE
28
+ class UnknownFormatError < RuntimeError; end
29
+ class DateTime
30
+ attr_reader :year, :month, :day, :hour, :min, :sec, :msec
31
+ def initialize(year, month=1, day=1, hour=0, min=0, sec=0, msec=0)
32
+ @year = year
33
+ @month = month
34
+ @day = day
35
+ @hour = hour
36
+ @min = min
37
+ @sec = sec
38
+ @msec = msec
39
+ end
40
+ def date
41
+ begin
42
+ Date.new(@year, @month, @day)
43
+ rescue ArgumentError
44
+ end
45
+ end
46
+ class << self
47
+ def month_days(month, year)
48
+ case month % 12
49
+ when 0,1,3,5,7,8,10
50
+ 31
51
+ when 4,6,9,11
52
+ 30
53
+ else
54
+ Date.leap?(year) ? 29 : 28
55
+ end
56
+ end
57
+ def parse(datetime)
58
+ #1.Divide Day and Time
59
+ big_dt = datetime.split(//).reverse.inject(0) { |inj, char|
60
+ inj *= 0x100
61
+ inj += char.to_i
62
+ }
63
+ msec = big_dt % 10000000
64
+ big_dt /= 10000000
65
+ day = (big_dt / (24*60*60)) + 1
66
+ time = big_dt % (24*60*60)
67
+ #2. Year->Day(1601/1/2?)
68
+ year = 1601
69
+ attr_reader :year, :month, :day, :hour, :min, :sec, :msec
70
+ ydays = year_days(year)
71
+ while(day > ydays)
72
+ day -= ydays
73
+ year += 1
74
+ ydays = year_days(year)
75
+ end
76
+ month = 1
77
+ 1.upto(11) { |month|
78
+ mdays = month_days(month, year)
79
+ break if(day <= mdays)
80
+ day -= mdays
81
+ }
82
+ #3. Hour->iSec
83
+ hour = time / 3600
84
+ min = (time % 3600) / 60
85
+ sec = time % 60
86
+ new(year, month, day, hour, min, sec, msec)
87
+ end
88
+ def year_days(year)
89
+ Date.leap?(year) ? 366 : 365
90
+ end
91
+ end
92
+ end
93
+ class Storage < File
94
+ PpsType_Root = 5
95
+ PpsType_Dir = 1
96
+ PpsType_File = 2
97
+ DataSizeSmall = 0x1000
98
+ LongIntSize = 4
99
+ PpsSize = 0x80
100
+ attr_reader :header
101
+ def initialize(filename)
102
+ super(filename, "r")
103
+ binmode
104
+ @header = get_header
105
+ end
106
+ module PPS
107
+ class Node
108
+ attr_reader :no, :type, :prev_pps, :next_pps, :data
109
+ attr_reader :dir_pps, :time_1st, :time_2nd, :start_block, :size
110
+ attr_reader :name
111
+ def initialize(no, datastr)
112
+ @no = no
113
+ #def init(datastr)
114
+ nm_size, @type, @prev_pps,
115
+ @next_pps, @dir_pps = datastr[0x40,16].unpack('vvVVV')
116
+ @time_1st = DateTime.parse(datastr[0x64, 8])
117
+ @time_2nd = DateTime.parse(datastr[0x6C, 8])
118
+ @start_block, @size = datastr[0x74,8].unpack('VV')
119
+ nm_size -= 2 if(nm_size > 2)
120
+ @name = datastr[0,nm_size]
121
+ #end
122
+ end
123
+ def get_data(header)
124
+ end
125
+ private
126
+ end
127
+ class Root < Node
128
+ def get_data(header)
129
+ @data = header.get_big_data(@start_block, @size)
130
+ end
131
+ end
132
+ class Dir < Node
133
+ end
134
+ class File < Node
135
+ def get_data(header)
136
+ @data = if(@size < DataSizeSmall)
137
+ header.get_small_data(@start_block, @size)
138
+ else
139
+ header.get_big_data(@start_block, @size)
140
+ end
141
+ end
142
+ end
143
+ end
144
+ class << self
145
+ def is_normal_block?(block)
146
+ block < 0xFFFFFFFC
147
+ end
148
+ def pps_factory(pos, datastr)
149
+ nm_size, type = datastr[0x40,4].unpack('vC')
150
+ nm_size -= 2 if(nm_size > 2)
151
+ nm = datastr[0,nm_size]
152
+ klass = {
153
+ PpsType_Root => PPS::Root,
154
+ PpsType_Dir => PPS::Dir,
155
+ PpsType_File => PPS::File,
156
+ }[type] or raise("unknown pps_type: #{type} / #{nm}")
157
+ klass.new(pos, datastr)
158
+ end
159
+ end
160
+ class Header
161
+ attr_reader :big_block_size, :small_block_size, :bdb_count, :root_start
162
+ attr_reader :sbd_start, :sbd_count, :extra_bbd_start, :extra_bbd_count
163
+ attr_reader :bbd_info
164
+ def initialize(fh)
165
+ @fh = fh
166
+ @pps_table = {}
167
+ #BIG BLOCK SIZE
168
+ exp = get_info(0x1E, 2, 'v')
169
+ raise UnknownFormatError.new if exp.nil?
170
+ @big_block_size = (2 ** exp)
171
+ #SMALL BLOCK SIZE
172
+ exp = get_info(0x20, 2, 'v')
173
+ raise UnknownFormatError.new if exp.nil?
174
+ @small_block_size = (2 ** exp)
175
+ #BDB Count
176
+ @bdb_count = get_info(0x2C, 4, 'V') or raise UnknownFormatError.new
177
+ #START BLOCK
178
+ @root_start = get_info(0x30, 4, 'V') or raise UnknownFormatError.new
179
+ #SMALL BD START
180
+ @sbd_start = get_info(0x3C, 4, 'V') or raise UnknownFormatError.new
181
+ #SMALL BD COUNT
182
+ @sbd_count = get_info(0x40, 4, 'V') or raise UnknownFormatError.new
183
+ #EXTRA BBD START
184
+ @extra_bbd_start = get_info(0x44, 4, 'V') or raise UnknownFormatError.new
185
+ #EXTRA BBD COUNT
186
+ @extra_bbd_count = get_info(0x48, 4, 'V') or raise UnknownFormatError.new
187
+ #GET BBD INFO
188
+ @bbd_info = get_bbd_info
189
+ #GET ROOT PPS
190
+ @root = get_nth_pps(0)
191
+ end
192
+ def get_bbd_info
193
+ bdb_count = @bdb_count
194
+ first_count = (@big_block_size - 0x4C) / LongIntSize
195
+ bdl_count = (@big_block_size / LongIntSize) - 1
196
+ #1. 1st BDlist
197
+ @fh.seek(0x4C)
198
+ get_count = [first_count, bdb_count].min
199
+ buff = @fh.read(LongIntSize * get_count)
200
+ bdl_list = buff.unpack("V#{get_count}")
201
+ bdb_count -= get_count
202
+ #2. Extra BDList
203
+ block = @extra_bbd_start
204
+ while((bdb_count > 0) && Storage.is_normal_block?(block))
205
+ set_file_pos(block, 0)
206
+ get_count = [bdb_count, bdl_count].min
207
+ buff = @fh.read(LongIntSize * get_count)
208
+ bdl_list += buff.unpack("V#{get_count}")
209
+ bdb_count -= get_count
210
+ buff = @fh.read(LongIntSize)
211
+ block = buff.unpack('V')
212
+ end
213
+ #3.Get BDs
214
+ bd_table = {}
215
+ block_no = 0
216
+ bd_count = @big_block_size / LongIntSize
217
+ bdl_list.each { |bdl|
218
+ set_file_pos(bdl, 0)
219
+ buff = @fh.read(@big_block_size)
220
+ array = buff.unpack("V#{bd_count}")
221
+ bd_count.times { |idx|
222
+ bd_table.store(block_no, array[idx]) unless(array[idx]==block_no.next)
223
+ block_no += 1
224
+ }
225
+ }
226
+ bd_table
227
+ end
228
+ def get_big_data(block, size)
229
+ result = ''
230
+ return result unless Storage.is_normal_block?(block)
231
+ rest = size
232
+ keys = @bbd_info.keys.sort
233
+ while(rest > 0)
234
+ res = keys.select { |key| key >= block }
235
+ nkey = res.first
236
+ idx = nkey - block
237
+ nxt = @bbd_info[nkey]
238
+ set_file_pos(block, 0)
239
+ get_size = [rest, @big_block_size * idx.next].min
240
+ result << @fh.read(get_size)
241
+ rest -= get_size
242
+ block = nxt
243
+ end
244
+ result
245
+ end
246
+ def get_info(pos, len, fmt)
247
+ @fh.seek(pos)
248
+ if(buff = @fh.read(len))
249
+ buff.unpack(fmt).first
250
+ end
251
+ end
252
+ def get_next_block_no(block)
253
+ @bbd_info[block] || block.next
254
+ end
255
+ def get_next_small_block_no(block)
256
+ base = @big_block_size / LongIntSize
257
+ nth = block / base
258
+ pos = block % base
259
+ blk = get_nth_block_no(@sbd_start, nth)
260
+ set_file_pos(blk, pos * LongIntSize)
261
+ @fh.read(LongIntSize).unpack('V').first
262
+ end
263
+ def get_nth_block_no(start_block, nth)
264
+ nxt = start_block
265
+ nth.times { |idx|
266
+ nxt = get_next_block_no(nxt)
267
+ return nil unless Storage.is_normal_block?(nxt)
268
+ }
269
+ nxt
270
+ end
271
+ def get_nth_pps(pos)
272
+ @pps_table.fetch(pos) {
273
+ base_count = @big_block_size / PpsSize
274
+ pps_block = pos / base_count
275
+ pps_pos = pos % base_count
276
+
277
+ block = get_nth_block_no(@root_start, pps_block) or return
278
+ set_file_pos(block, PpsSize*pps_pos)
279
+ buff = @fh.read(PpsSize) or return
280
+ pps = Storage.pps_factory(pos, buff)
281
+ pps.get_data(self)
282
+ @pps_table.store(pos, pps)
283
+ }
284
+ end
285
+ def get_small_data(block, size)
286
+ result = ''
287
+ rest = size
288
+ while(rest > 0)
289
+ set_file_pos_small(block)
290
+ get_size = [rest, @small_block_size].min
291
+ result << @fh.read(get_size)
292
+ rest -= @small_block_size
293
+ block = get_next_small_block_no(block)
294
+ end
295
+ result
296
+ end
297
+ def sb_start
298
+ @root.start_block
299
+ end
300
+ def sb_size
301
+ @root.size
302
+ end
303
+ def set_file_pos(block, pos)
304
+ @fh.seek((block+1) * @big_block_size + pos)
305
+ end
306
+ def set_file_pos_small(block)
307
+ base = @big_block_size / @small_block_size
308
+ nth = block / base
309
+ pos = block % base
310
+ blk = get_nth_block_no(sb_start, nth)
311
+ set_file_pos(blk, pos * @small_block_size)
312
+ end
313
+ end
314
+ def search_pps(names, cse=false, no=0, done=[])
315
+ #1. Check it self
316
+ return [] if(done.include?(no))
317
+ done.push(no)
318
+ pps = @header.get_nth_pps(no) or return []
319
+ cond = if(cse)
320
+ Proc.new { |name|
321
+ /^#{Regexp.escape pps.name}$/i.match(name)
322
+ }
323
+ else
324
+ Proc.new { |name| name == pps.name }
325
+ end
326
+ result = if(names.any? { |name| cond.call(name) })
327
+ [pps]
328
+ else
329
+ []
330
+ end
331
+ #2. Check Child, Previous, Next PPSs
332
+ [ pps.dir_pps, pps.prev_pps, pps.next_pps ].each { |node|
333
+ unless(node == 0xFFFFFFFF)
334
+ result += search_pps(names, cse, node, done)
335
+ end
336
+ }
337
+ result
338
+ end
339
+ private
340
+ def get_header
341
+ #0. Check ID
342
+ rewind
343
+ return unless(read(8) == "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1")
344
+ Header.new(self)
345
+ end
346
+ end
347
+ def asc2ucs(str)
348
+ str.split(//).join("\000") + "\000"
349
+ end
350
+ module_function :asc2ucs
351
+ end
352
+
353
+ =begin
354
+ ToDo: Merge with Daniel J. Bergers OLEWriter
355
+ =end