parseexcel 0.5.1

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,87 @@
1
+ Spreadsheet::ParseExcel - Get information from an Excel file.
2
+ ============
3
+ Version: 0.5.1
4
+ Date: 2006-05-18
5
+
6
+ Short Description:
7
+ Spreadsheet::ParseExcel allows you to get information out of a
8
+ simple Excel file.
9
+ This package is an - as of today incomplete - translation of
10
+ Kawai Takanori's Perl module.
11
+
12
+
13
+ Requirements
14
+ ------------
15
+
16
+ * ruby 1.8
17
+
18
+ Install
19
+ -------
20
+
21
+ Decompress the archive and enter its top directory.
22
+ Then type:
23
+
24
+ $ ruby setup.rb config
25
+ $ ruby setup.rb setup
26
+ ($ su)
27
+ # ruby setup.rb install
28
+
29
+ You can also install files into your favorite directory
30
+ by supplying setup.rb some options. Try "ruby setup.rb --help".
31
+
32
+
33
+ Usage
34
+ -----
35
+ #!/usr/bin/env ruby
36
+
37
+ require 'parseexcel/parser'
38
+
39
+ # your first step is always reading in the file.
40
+ # that gives you a workbook-object, which has one or more worksheets,
41
+ # just like in Excel you have the possibility of multiple worksheets.
42
+ workbook = Spreadsheet::ParseExcel.parse(path_to_file)
43
+
44
+ # usually, you want the first worksheet:
45
+ worksheet = workbook.worksheet(0)
46
+
47
+ # now you can either iterate over all rows, skipping a given number of
48
+ # leading rows (in case you know they just contain column headers)
49
+ skip = 2
50
+ worksheet.each(skip) { |row|
51
+ # a row is actually just an Array of Cells..
52
+ first_cell = row.at(0)
53
+
54
+ # how you get data out of the cell depends on what datatype you
55
+ # expect:
56
+
57
+ # if you expect a String, you can pass an encoding and (iconv
58
+ # required) the content of the cell will be converted.
59
+ str = row.at(1).to_s('latin1')
60
+
61
+ # if you expect a Float:
62
+ float = row.at(2).to_f
63
+
64
+ # if you expect an Integer:
65
+ int = row.at(3).to_i
66
+
67
+ # if you expect a Date:
68
+ date = row.at(4).date
69
+
70
+ # ParseExcel makes a guess at what Datatype a cell has. At the moment,
71
+ # possible values are: :date, :numeric, :text
72
+ celltype = first_cell.type
73
+ }
74
+
75
+ # if you know exactly which row your data resides in, you may just
76
+ # retrieve that row, which is again simply an Array of Cells
77
+ row = worksheet.row(26)
78
+
79
+
80
+ License
81
+ -------
82
+
83
+ LGPL
84
+
85
+
86
+ URL: http://download.ywesee.com/parseexcel
87
+ Author: Hannes Wyss <hwyss@ywesee.com>
@@ -0,0 +1,109 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Spreadsheet::ParseExcel -- Extract Data from an Excel File
4
+ # Copyright (C) 2003 ywesee -- intellectual capital connected
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2.1 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
+ # hwyss@ywesee.com
22
+ #
23
+ # Format -- Spreadsheet::ParseExcel -- 10.06.2003 -- hwyss@ywesee.com
24
+
25
module Spreadsheet
  module ParseExcel
    # Carries the format attributes of a cell (font, alignment, borders,
    # number-format index, ...) and offers helpers to classify a cell as
    # date, numeric or text and to render its number-format string.
    class Format
      # Number-format strings keyed by format index. The table is shared by
      # all Format instances; #add_text_format adds or replaces entries.
      @@fmt_strs = {
        0x00 => '@',
        0x01 => '0',
        0x02 => '0.00',
        0x03 => '#,##0',
        0x04 => '#,##0.00',
        0x05 => '($#,##0_);($#,##0)',
        0x06 => '($#,##0_);[RED]($#,##0)',
        0x07 => '($#,##0.00_);($#,##0.00_)',
        0x08 => '($#,##0.00_);[RED]($#,##0.00_)',
        0x09 => '0%',
        0x0A => '0.00%',
        0x0B => '0.00E+00',
        0x0C => '# ?/?',
        0x0D => '# ??/??',
        0x0E => 'm-d-yy',
        0x0F => 'd-mmm-yy',
        0x10 => 'd-mmm',
        0x11 => 'mmm-yy',
        0x12 => 'h:mm AM/PM',
        0x13 => 'h:mm:ss AM/PM',
        0x14 => 'h:mm',
        0x15 => 'h:mm:ss',
        0x16 => 'm-d-yy h:mm',
        # 0x17-0x24 are not listed here (the original note says they
        # differ between national versions)
        0x25 => '(#,##0_);(#,##0)',
        0x26 => '(#,##0_);[RED](#,##0)',
        0x27 => '(#,##0.00);(#,##0.00)',
        0x28 => '(#,##0.00);[RED](#,##0.00)',
        0x29 => '_(*#,##0_);_(*(#,##0);_(*"-"_);_(@_)',
        0x2A => '_($*#,##0_);_($*(#,##0);_(*"-"_);_(@_)',
        0x2B => '_(*#,##0.00_);_(*(#,##0.00);_(*"-"??_);_(@_)',
        0x2C => '_($*#,##0.00_);_($*(#,##0.00);_(*"-"??_);_(@_)',
        0x2D => 'mm:ss',
        0x2E => '[h]:mm:ss',
        0x2F => 'mm:ss.0',
        0x30 => '##0.0E+0',
        0x31 => '@',
      }

      # A format string containing any of these tokens is treated as a date
      # format. Each letter may be preceded by a NUL byte -- presumably to
      # cope with two-byte-per-character format strings (TODO confirm).
      @@date_pattern = /(\0?d\0?d|\0?m\0?m|\0?y\0?y|\0?h|\0?s\0?s)/i

      attr_accessor :font_no, :fmt_idx, :lock, :hidden, :style, :key_123
      attr_accessor :align_h, :wrap, :align_v, :just_last, :rotate, :indent
      attr_accessor :shrink, :merge, :read_dir, :encoding
      attr_accessor :border_style, :border_color, :border_diag, :fill

      # Builds a Format from a hash of attribute => value pairs. Keys that
      # have no matching public writer are silently ignored.
      def initialize(params={})
        params.each do |attribute, value|
          writer = "#{attribute}="
          send(writer, value) if respond_to?(writer)
        end
      end

      # Registers +fmt_str+ under format index +idx+ in the shared table.
      def add_text_format(idx, fmt_str)
        @@fmt_strs.store(idx, fmt_str)
      end

      # Classifies +cell+ (anything responding to #numeric):
      # * :text    -- the cell is not numeric
      # * :date    -- numeric, and the format index lies in 0x0E..0x16 or
      #               0x2D..0x2F, or the registered format string matches
      #               the date pattern
      # * :numeric -- every other numeric cell
      def cell_type(cell)
        return :text unless cell.numeric
        index = @fmt_idx.to_i
        return :date if (0x0E..0x16).include?(index) || (0x2D..0x2F).include?(index)
        pattern = @@fmt_strs[@fmt_idx]
        (pattern && @@date_pattern.match(pattern)) ? :date : :numeric
      end

      # Returns +str+ untouched for the default :_native_ code; for any
      # other code the string is read as big-endian 16-bit units and each
      # unit is packed back into a single byte.
      def text_format(str, code=:_native_)
        return str if code == :_native_
        str.unpack('n*').pack('C*')
      end

      # Returns the format string registered for this format's index (an
      # empty string when none is registered). When +target_encoding+ is
      # given, the string is converted from @encoding with Iconv; otherwise
      # a copy is returned.
      def to_s(target_encoding=nil)
        fmt_str = @@fmt_strs[@fmt_idx].to_s
        return fmt_str.dup unless target_encoding
        Iconv.new(target_encoding, @encoding).iconv(fmt_str)
      end
    end
  end
end
@@ -0,0 +1,355 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Spreadsheet::ParseExcel -- Extract Data from an Excel File
4
+ # Copyright (C) 2003 ywesee -- intellectual capital connected
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2.1 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
+ # hwyss@ywesee.com
22
+ #
23
+ # OLEReader -- Spreadsheet::ParseExcel -- 05.06.2003 -- hwyss@ywesee.com
24
+
25
+ require 'date'
26
+
27
# Reader for OLE compound documents: decodes the header, the big and small
# block allocation tables and the directory entries ("PPS"), and exposes
# the data stream attached to each entry.
module OLE
  # Raised when a required header field cannot be read from the file.
  class UnknownFormatError < RuntimeError; end

  # Broken-down timestamp as stored in an OLE directory entry.
  class DateTime
    attr_reader :year, :month, :day, :hour, :min, :sec, :msec

    def initialize(year, month=1, day=1, hour=0, min=0, sec=0, msec=0)
      @year = year
      @month = month
      @day = day
      @hour = hour
      @min = min
      @sec = sec
      @msec = msec
    end

    # Returns the calendar part as a Date, or nil when year/month/day do
    # not form a valid date.
    def date
      Date.new(@year, @month, @day)
    rescue ArgumentError
      nil
    end

    class << self
      # Number of days in +month+ of +year+. The month is taken modulo 12,
      # so 12 (December) is handled by the 0 branch.
      def month_days(month, year)
        case month % 12
        when 0,1,3,5,7,8,10
          31
        when 4,6,9,11
          30
        else
          Date.leap?(year) ? 29 : 28
        end
      end

      # Decodes an 8-byte little-endian timestamp counting 10^-7 second
      # ticks since 1601-01-01 00:00:00 and returns a new DateTime.
      #
      # NOTE: +msec+ receives the sub-second remainder in those ticks
      # (0...10_000_000), not milliseconds; kept as-is for compatibility.
      def parse(datetime)
        #1. Divide Day and Time
        # Fold the bytes, most significant first, into one integer. The
        # numeric byte values are needed here, not String#to_i of each
        # character.
        big_dt = datetime.unpack('C*').reverse.inject(0) { |acc, byte|
          (acc << 8) | byte
        }
        msec = big_dt % 10000000
        big_dt /= 10000000
        day = (big_dt / (24*60*60)) + 1
        time = big_dt % (24*60*60)
        #2. Year->Day (day 1 is 1601-01-01)
        year = 1601
        ydays = year_days(year)
        while(day > ydays)
          day -= ydays
          year += 1
          ydays = year_days(year)
        end
        # Walk January..November; whatever remains belongs to December.
        # An explicit counter is used so the result does not depend on
        # block-parameter scoping rules.
        month = 1
        while(month < 12)
          mdays = month_days(month, year)
          break if(day <= mdays)
          day -= mdays
          month += 1
        end
        #3. Hour->Sec
        hour = time / 3600
        min = (time % 3600) / 60
        sec = time % 60
        new(year, month, day, hour, min, sec, msec)
      end

      # Number of days in +year+.
      def year_days(year)
        Date.leap?(year) ? 366 : 365
      end
    end
  end

  # An OLE compound document opened read-only. The parsed header is
  # available through #header (nil when the file lacks the OLE signature).
  class Storage < File
    PpsType_Root = 5
    PpsType_Dir = 1
    PpsType_File = 2
    DataSizeSmall = 0x1000
    LongIntSize = 4
    PpsSize = 0x80

    attr_reader :header

    def initialize(filename)
      super(filename, "r")
      binmode
      @header = get_header
    end

    # Directory entry ("PPS") classes.
    module PPS
      # Common fields decoded from one PpsSize-byte directory entry.
      class Node
        attr_reader :no, :type, :prev_pps, :next_pps, :data
        attr_reader :dir_pps, :time_1st, :time_2nd, :start_block, :size
        attr_reader :name

        # +no+ is the entry's index, +datastr+ its raw bytes.
        def initialize(no, datastr)
          @no = no
          # Layout at 0x40: name size (2 bytes), type (1 byte), one byte
          # that is skipped, then three 32-bit links. Reading the type as
          # a single byte matches Storage.pps_factory.
          nm_size, @type, @prev_pps,
            @next_pps, @dir_pps = datastr[0x40,16].unpack('vCxVVV')
          @time_1st = DateTime.parse(datastr[0x64, 8])
          @time_2nd = DateTime.parse(datastr[0x6C, 8])
          @start_block, @size = datastr[0x74,8].unpack('VV')
          # The stored size includes a 2-byte terminator.
          nm_size -= 2 if(nm_size > 2)
          @name = datastr[0,nm_size]
        end

        # Hook for subclasses that carry a data stream; no-op here.
        def get_data(header)
        end
      end

      class Root < Node
        # The root entry's stream is always read from big blocks.
        def get_data(header)
          @data = header.get_big_data(@start_block, @size)
        end
      end

      class Dir < Node
      end

      class File < Node
        # Streams shorter than DataSizeSmall are read from small blocks,
        # all others from big blocks.
        def get_data(header)
          @data = if(@size < DataSizeSmall)
            header.get_small_data(@start_block, @size)
          else
            header.get_big_data(@start_block, @size)
          end
        end
      end
    end

    class << self
      # True when +block+ is below 0xFFFFFFFC, i.e. an ordinary block
      # number rather than one of the marker values above it.
      def is_normal_block?(block)
        block < 0xFFFFFFFC
      end

      # Instantiates the PPS class matching the type byte of +datastr+.
      # Raises RuntimeError for an unknown type.
      def pps_factory(pos, datastr)
        nm_size, type = datastr[0x40,4].unpack('vC')
        nm_size -= 2 if(nm_size > 2)
        nm = datastr[0,nm_size]
        klass = {
          PpsType_Root => PPS::Root,
          PpsType_Dir  => PPS::Dir,
          PpsType_File => PPS::File,
        }[type] or raise("unknown pps_type: #{type} / #{nm}")
        klass.new(pos, datastr)
      end
    end

    # Parsed file header plus the block-level read primitives.
    class Header
      attr_reader :big_block_size, :small_block_size, :bdb_count, :root_start
      attr_reader :sbd_start, :sbd_count, :extra_bbd_start, :extra_bbd_count
      attr_reader :bbd_info

      # Reads all header fields from +fh+ (an IO positioned anywhere).
      # Raises UnknownFormatError when a field cannot be read.
      def initialize(fh)
        @fh = fh
        @pps_table = {}
        #BIG BLOCK SIZE (stored as a power-of-two exponent)
        exp = get_info(0x1E, 2, 'v')
        raise UnknownFormatError.new if exp.nil?
        @big_block_size = (2 ** exp)
        #SMALL BLOCK SIZE (stored as a power-of-two exponent)
        exp = get_info(0x20, 2, 'v')
        raise UnknownFormatError.new if exp.nil?
        @small_block_size = (2 ** exp)
        #BDB Count
        @bdb_count = get_info(0x2C, 4, 'V') or raise UnknownFormatError.new
        #START BLOCK
        @root_start = get_info(0x30, 4, 'V') or raise UnknownFormatError.new
        #SMALL BD START
        @sbd_start = get_info(0x3C, 4, 'V') or raise UnknownFormatError.new
        #SMALL BD COUNT
        @sbd_count = get_info(0x40, 4, 'V') or raise UnknownFormatError.new
        #EXTRA BBD START
        @extra_bbd_start = get_info(0x44, 4, 'V') or raise UnknownFormatError.new
        #EXTRA BBD COUNT
        @extra_bbd_count = get_info(0x48, 4, 'V') or raise UnknownFormatError.new
        #GET BBD INFO
        @bbd_info = get_bbd_info
        #GET ROOT PPS
        @root = get_nth_pps(0)
      end

      # Builds the big-block table: a Hash mapping a block number to the
      # value stored for it, holding only entries whose value is NOT simply
      # "block + 1". Consecutive runs are therefore implicit.
      def get_bbd_info
        bdb_count = @bdb_count
        first_count = (@big_block_size - 0x4C) / LongIntSize
        bdl_count = (@big_block_size / LongIntSize) - 1
        #1. 1st BDlist (stored in the header from offset 0x4C)
        @fh.seek(0x4C)
        get_count = [first_count, bdb_count].min
        buff = @fh.read(LongIntSize * get_count)
        bdl_list = buff.unpack("V#{get_count}")
        bdb_count -= get_count
        #2. Extra BDList
        block = @extra_bbd_start
        while((bdb_count > 0) && Storage.is_normal_block?(block))
          set_file_pos(block, 0)
          get_count = [bdb_count, bdl_count].min
          buff = @fh.read(LongIntSize * get_count)
          bdl_list += buff.unpack("V#{get_count}")
          bdb_count -= get_count
          buff = @fh.read(LongIntSize)
          # unpack returns an Array; the loop condition needs the Integer.
          block = buff.unpack('V').first
        end
        #3.Get BDs
        bd_table = {}
        block_no = 0
        bd_count = @big_block_size / LongIntSize
        bdl_list.each { |bdl|
          set_file_pos(bdl, 0)
          buff = @fh.read(@big_block_size)
          array = buff.unpack("V#{bd_count}")
          bd_count.times { |idx|
            bd_table.store(block_no, array[idx]) unless(array[idx]==block_no.next)
            block_no += 1
          }
        }
        bd_table
      end

      # Reads +size+ bytes of a big-block stream starting at +block+.
      # Returns an empty string when +block+ is not a normal block.
      def get_big_data(block, size)
        result = String.new
        return result unless Storage.is_normal_block?(block)
        rest = size
        keys = @bbd_info.keys.sort
        while(rest > 0)
          # The first table key at or after +block+ ends the consecutive
          # run that starts at +block+; the whole run is read in one go.
          nkey = keys.find { |key| key >= block }
          idx = nkey - block
          nxt = @bbd_info[nkey]
          set_file_pos(block, 0)
          get_size = [rest, @big_block_size * idx.next].min
          result << @fh.read(get_size)
          rest -= get_size
          block = nxt
        end
        result
      end

      # Reads +len+ bytes at absolute offset +pos+ and returns the first
      # value unpacked with +fmt+, or nil when nothing could be read.
      def get_info(pos, len, fmt)
        @fh.seek(pos)
        if(buff = @fh.read(len))
          buff.unpack(fmt).first
        end
      end

      # Successor of +block+ in a big-block chain.
      def get_next_block_no(block)
        @bbd_info[block] || block.next
      end

      # Successor of small block +block+, read from the small-block table
      # that starts at @sbd_start.
      def get_next_small_block_no(block)
        base = @big_block_size / LongIntSize
        nth = block / base
        pos = block % base
        blk = get_nth_block_no(@sbd_start, nth)
        set_file_pos(blk, pos * LongIntSize)
        @fh.read(LongIntSize).unpack('V').first
      end

      # Follows the big-block chain +nth+ steps from +start_block+.
      # Returns nil when the chain ends before that.
      def get_nth_block_no(start_block, nth)
        nxt = start_block
        nth.times { |idx|
          nxt = get_next_block_no(nxt)
          return nil unless Storage.is_normal_block?(nxt)
        }
        nxt
      end

      # Returns directory entry number +pos+ (cached after the first read),
      # with its data loaded, or nil when it cannot be located or read.
      def get_nth_pps(pos)
        @pps_table.fetch(pos) {
          base_count = @big_block_size / PpsSize
          pps_block = pos / base_count
          pps_pos = pos % base_count

          block = get_nth_block_no(@root_start, pps_block) or return
          set_file_pos(block, PpsSize*pps_pos)
          buff = @fh.read(PpsSize) or return
          pps = Storage.pps_factory(pos, buff)
          pps.get_data(self)
          @pps_table.store(pos, pps)
        }
      end

      # Reads +size+ bytes of a small-block stream starting at +block+.
      def get_small_data(block, size)
        result = String.new
        rest = size
        while(rest > 0)
          set_file_pos_small(block)
          get_size = [rest, @small_block_size].min
          result << @fh.read(get_size)
          rest -= @small_block_size
          block = get_next_small_block_no(block)
        end
        result
      end

      # First big block of the root entry's stream, which holds the small
      # blocks.
      def sb_start
        @root.start_block
      end

      # Size of the root entry's stream.
      def sb_size
        @root.size
      end

      # Seeks to byte +pos+ inside big block +block+. Block 0 starts one
      # big-block size into the file, after the header.
      def set_file_pos(block, pos)
        @fh.seek((block+1) * @big_block_size + pos)
      end

      # Seeks to the start of small block +block+ inside the root stream.
      def set_file_pos_small(block)
        base = @big_block_size / @small_block_size
        nth = block / base
        pos = block % base
        blk = get_nth_block_no(sb_start, nth)
        set_file_pos(blk, pos * @small_block_size)
      end
    end

    # Collects every directory entry whose name equals one of +names+,
    # starting at entry +no+ and following child, previous and next links.
    # Names are compared byte-for-byte unless +cse+ is true, in which case
    # a case-insensitive regexp match is used. +done+ records visited entry
    # numbers so that cyclic links cannot cause endless recursion.
    def search_pps(names, cse=false, no=0, done=[])
      #1. Check it self
      return [] if(done.include?(no))
      done.push(no)
      pps = @header.get_nth_pps(no) or return []
      cond = if(cse)
        Proc.new { |name|
          /^#{Regexp.escape pps.name}$/i.match(name)
        }
      else
        Proc.new { |name| name == pps.name }
      end
      result = if(names.any? { |name| cond.call(name) })
        [pps]
      else
        []
      end
      #2. Check Child, Previous, Next PPSs
      [ pps.dir_pps, pps.prev_pps, pps.next_pps ].each { |node|
        unless(node == 0xFFFFFFFF)
          result += search_pps(names, cse, node, done)
        end
      }
      result
    end

    private

    # Returns a Header when the file starts with the 8-byte OLE signature,
    # nil otherwise (including files shorter than 8 bytes).
    def get_header
      #0. Check ID
      rewind
      magic = read(8)
      # Compare byte values rather than strings, so the check does not
      # depend on the encoding attached to a string literal.
      return unless(magic && magic.unpack('C8') ==
                    [0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1])
      Header.new(self)
    end
  end

  # Interleaves a NUL byte after every character of +str+, the form in
  # which entry names are matched by Storage#search_pps.
  def asc2ucs(str)
    str.split(//).join("\000") + "\000"
  end
  module_function :asc2ucs
end
352
+
353
+ =begin
354
+ ToDo: Merge with Daniel J. Berger's OLEWriter
355
+ =end