msworddoc-extractor 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+ require 'msworddoc-extractor'
5
+
6
# Command-line entry point: prints the selected parts of each Word file.
#
# argv - command-line arguments: option switches followed by file paths.
#
# Every switch selects one reader method of the loaded document; the results
# are printed in the order the switches were given. When no switch is given
# the main document text is printed. Each file is closed once it has been
# printed, even if printing raises.
def app(*argv)
  actions = []

  # Each entry: [ arguments for OptionParser#on, reader method to call ]
  options = [
    [ [ '-d', '--document',   'Main contents (default)' ], :document ],
    [ [ '-w', '--whole',      'Whole text contents'     ], :whole_contents ],
    [ [ '-i', '--header',     'Header parts'            ], :header ],
    [ [ '-f', '--footnote',   'Footnotes'               ], :footnote ],
    [ [ '-e', '--endnote',    'Endnotes'                ], :endnote ],
    [ [ '-a', '--annotation', 'Annotations'             ], :annotation ],
    [ [ '-t', '--textbox',    'Text boxes'              ], :textbox ],
    [ [ '--header_textbox',   'Header text boxes'       ], :header_textbox ],
    [ [ '-m', '--macro',      'Macro part'              ], :macro ],
  ]

  optparse = OptionParser.new do |opt|
    opt.banner = 'Usage: worddoc-extract [options] <files> ...'

    options.each do |switches, action|
      opt.on(*switches) { actions << action }
    end

    opt.separator ''
    opt.on('-h', '--help', 'Show this help') { puts opt; exit }
  end

  # The option callbacks only fill `actions` while parsing, so the default
  # can only be decided after parse has run.
  files = optparse.parse(argv)
  actions << :document if actions.empty?

  files.each do |file|
    # Block form of load closes the document when the block finishes.
    MSWordDoc::Extractor.load(file) do |doc|
      actions.each { |action| puts doc.send(action) }
    end
  end
end

app(*ARGV)
48
+
@@ -0,0 +1,376 @@
1
+ require 'rubygems'
2
+ require 'ole/storage'
3
+
4
+ module MSWordDoc
5
# Public entry point of the library.
module Extractor
  VERSION = '0.1.0'

  # Opens a Word document and wraps it in an Essence.
  #
  # file - handed unchanged to Ole::Storage.open.
  #
  # Without a block: returns the loaded Essence; the caller is responsible
  # for calling #close on it.
  # With a block: yields the Essence, closes it when the block finishes
  # (also when the block raises) and returns nil.
  #
  # Any error raised while reading the document structure propagates to the
  # caller; the OLE storage is closed first so a rejected file does not leave
  # an open handle behind.
  def self.load(file)
    doc = Essence.new
    ole = Ole::Storage.open(file)

    loaded = false
    begin
      doc.load_storage(ole)
      loaded = true
    ensure
      # Nobody else holds the storage yet, so release it here on failure.
      ole.close unless loaded
    end

    return doc unless block_given?

    begin
      yield doc
    ensure
      doc.close
    end

    nil
  end
end
28
+
29
# Reader for the text stored in a Word binary document held in an OLE storage.
#
# Usage: create an instance, hand an opened storage to #load_storage, call the
# reader methods (#document, #footnote, #header, ...) and finally #close.
#
# All reader methods accept an optional `:raw => true` argument; without it
# the text is passed through #format_into_plain before being returned.
class Essence
  # Stream names inside the OLE storage.
  PPS_NAME_WORDDOC    = 'WordDocument'
  PPS_NAME_TABLE_TMPL = '%dTable'

  # Expected value of the 16-bit word at OFFSET_FIB_IDENT.
  MAGIC_MSWORD = 0xa5ec
  # Smallest accepted value of the 16-bit word at OFFSET_FIB_FIB.
  NFIB_MSWORD6 = 101

  # Byte offsets of the header fields read from the WordDocument stream.
  OFFSET_FIB_IDENT = 0x0000
  OFFSET_FIB_FIB   = 0x0002

  OFFSET_FIB_FLAGS  = 0x000a
  OFFSET_FIB_FCCLX  = 0x01a2
  OFFSET_FIB_LCBCLX = 0x01a6

  OFFSET_FIB_FCMIN = 0x0018
  OFFSET_FIB_FCMAC = 0x001c
  OFFSET_FIB_CBMAC = 0x0040

  # Bit masks applied to the flags word at OFFSET_FIB_FLAGS.
  MASK_FIBFLAG_COMPLEX     = 0x0004
  MASK_FIBFLAG_ENCRYPTED   = 0x0100
  MASK_FIBFLAG_WHICHTBLSTM = 0x0200

  # Bit of a piece's :fc value that selects 8-bit (cp1252) storage.
  MASK_FC_COMPRESSED = 0x40000000

  # Sizes in bytes of one character position and one piece descriptor
  # inside the piece table.
  LENGTH_CP  = 4
  LENGTH_PCD = 8

  # Byte offsets of the per-part character counts. The reader methods treat
  # the parts as laid out back to back in exactly this order.
  OFFSET_FIB_CCP_MAP = {
    :Text    => 0x004c,
    :Ftn     => 0x0050,
    :Hdd     => 0x0054,
    :Mcr     => 0x0058,
    :Atn     => 0x005c,
    :Edn     => 0x0060,
    :Txbx    => 0x0064,
    :HdrTxbx => 0x0068,
  }

  def initialize
    @flag = {}
    @ccp  = {}
    @pcds = []
    @ole  = nil
  end

  # Closes the underlying storage. Calling it again is a no-op.
  def close
    @ole.close if @ole
    @ole = nil
  end

  # Reads the header of the WordDocument stream and the piece table from the
  # table stream ("0Table" or "1Table", selected by the fWhichTblStm flag).
  #
  # ole - opened storage; it must respond to `file.open(name) { |io| }` and
  #       `close`. The instance keeps it until #close is called.
  #
  # Raises RuntimeError when the stream is not a supported Word document.
  def load_storage(ole)
    @ole = ole

    @ole.file.open(PPS_NAME_WORDDOC) do |f|
      parse_fib(f)
    end

    name_of_table = PPS_NAME_TABLE_TMPL % (@flag[:fWhichTblStm] ? 1 : 0)
    @ole.file.open(name_of_table) do |f|
      parse_piece_table(f)
    end
  end

  # Text of every part, from the first character to the end of the last piece.
  def whole_contents(*args)
    retrieve_and_filter(0, -1, *args)
  end

  # Main document text.
  def document(*args)
    retrieve_and_filter(0, @ccp[:Text], *args)
  end

  # Footnote text.
  def footnote(*args)
    retrieve_and_filter(@ccp[:Text], @ccp[:Ftn], *args)
  end

  # Header text.
  def header(*args)
    skips = [ :Text, :Ftn ]
    retrieve_token_and_filter(skips, :Hdd, *args)
  end

  # Macro text.
  def macro(*args)
    skips = [ :Text, :Ftn, :Hdd ]
    retrieve_token_and_filter(skips, :Mcr, *args)
  end

  # Annotation text.
  def annotation(*args)
    skips = [ :Text, :Ftn, :Hdd, :Mcr ]
    retrieve_token_and_filter(skips, :Atn, *args)
  end

  # Endnote text.
  def endnote(*args)
    skips = [ :Text, :Ftn, :Hdd, :Mcr, :Atn ]
    retrieve_token_and_filter(skips, :Edn, *args)
  end

  # Text box text.
  def textbox(*args)
    skips = [ :Text, :Ftn, :Hdd, :Mcr, :Atn, :Edn ]
    retrieve_token_and_filter(skips, :Txbx, *args)
  end

  # Header text box text.
  def header_textbox(*args)
    skips = [ :Text, :Ftn, :Hdd, :Mcr, :Atn, :Edn, :Txbx ]
    retrieve_token_and_filter(skips, :HdrTxbx, *args)
  end

  private

  # Validates the header of the WordDocument stream and stores the flags,
  # offsets and character counts that the other methods need.
  #
  # Raises RuntimeError for a wrong magic number, a too old version or an
  # encrypted document.
  def parse_fib(f)
    if get_ushort(f, OFFSET_FIB_IDENT) != MAGIC_MSWORD
      raise 'Not a Word document'
    end

    nFib = get_ushort(f, OFFSET_FIB_FIB)
    if nFib < NFIB_MSWORD6
      raise 'Unsupported version'
    end

    flags = get_ushort(f, OFFSET_FIB_FLAGS)

    @flag[:fComplex] = (flags & MASK_FIBFLAG_COMPLEX != 0)

    @flag[:fEncrypted] = (flags & MASK_FIBFLAG_ENCRYPTED != 0)
    if @flag[:fEncrypted]
      raise 'Encrypted MSWord document file is not supported'
    end

    @flag[:fWhichTblStm] = (flags & MASK_FIBFLAG_WHICHTBLSTM != 0)

    @fcMin = get_ulong(f, OFFSET_FIB_FCMIN)
    @fcMac = get_ulong(f, OFFSET_FIB_FCMAC)
    @cbMac = get_ulong(f, OFFSET_FIB_CBMAC)

    @fcClx  = get_ulong(f, OFFSET_FIB_FCCLX)
    @lcbClx = get_ulong(f, OFFSET_FIB_LCBCLX)

    parse_fib_ccps(f)
  end

  # Reads the character count of every part listed in OFFSET_FIB_CCP_MAP.
  def parse_fib_ccps(f)
    OFFSET_FIB_CCP_MAP.each do |key, offset|
      @ccp[key] = get_ulong(f, offset)
    end
  end

  # Builds @pcds, the list of text pieces. Each piece is a hash with
  #   :fc  - file offset value of the piece (see #read_piece),
  #   :cp  - character position at which the piece starts,
  #   :ccp - number of characters in the piece.
  #
  # f - the table stream.
  #
  # Raises RuntimeError when the CLX data contains an unknown block or no
  # piece table.
  def parse_piece_table(f)
    if @lcbClx <= 0
      # No CLX data: create a pseudo piece table with a single piece that
      # starts at fcMin and spans all parts.
      ccp_all = 0
      OFFSET_FIB_CCP_MAP.each_key { |key| ccp_all += @ccp[key] }

      @pcds = [ { :fc => @fcMin, :cp => 0, :ccp => ccp_all } ]
      return
    end

    f.pos = @fcClx
    clx = f.read(@lcbClx)

    # Skip leading blocks of type 1 (grpprl); stop at the first block of
    # type 2 (plcfpcd), which holds the piece table.
    while clx.length > 0
      clxt = clx.slice!(0, 1).unpack('C')[0]
      break if clxt == 2
      raise 'Unknown CLX block' unless clxt == 1

      skip = clx.slice!(0, 2).unpack('v')[0]
      clx.slice!(0, skip)
    end
    raise 'PCDs not found' unless clx.length > 0

    length = clx.slice!(0, 4).unpack('V')[0]

    # The table holds n + 1 character positions followed by n descriptors.
    n = (length - LENGTH_CP) / (LENGTH_CP + LENGTH_PCD)

    cps = []
    (n + 1).times do
      cps << clx.slice!(0, LENGTH_CP).unpack('V')[0]
    end

    @pcds = []
    1.upto(n) do |i|
      pcd_data = clx.slice!(0, LENGTH_PCD)

      @pcds << {
        :fc  => pcd_data.slice(2, 4).unpack('V')[0],
        :cp  => cps[i - 1],
        :ccp => cps[i] - cps[i - 1]
      }
    end
  end

  # Returns the text of `length` characters starting at character position
  # `offset`, converted to the destination encoding. A negative `length`
  # means "up to the end of the last piece".
  #
  # f - the WordDocument stream.
  #
  # Raises RuntimeError when no piece starts at or before `offset`.
  def retrieve_substring(f, offset, length = -1)
    # Index of the last piece that starts at or before the offset.
    first = nil
    @pcds.each_with_index do |pcd, i|
      break if pcd[:cp] > offset
      first = i
    end
    raise 'could not find suitable heading piece' if first.nil?

    output = ""

    # Characters to skip inside the first piece; the offset is absolute, so
    # it has to be made relative to the start of that piece.
    skip = offset - @pcds[first][:cp]

    @pcds[first..-1].each do |pcd|
      break if length == 0

      available = pcd[:ccp] - skip
      if available > 0
        count = (length < 0 || available < length) ? available : length
        output << read_piece(f, pcd[:fc], skip, count)
        length -= count if length > 0
      end

      # Every following piece is read from its beginning.
      skip = 0
    end

    output
  end

  # Reads `count` characters of one piece, skipping its first `skip`
  # characters, and converts them to the destination encoding.
  #
  # When MASK_FC_COMPRESSED is set in `fc`, the text is read as one byte per
  # character (cp1252) from half of the remaining fc value; otherwise it is
  # read as two bytes per character (UTF-16LE) from fc itself.
  def read_piece(f, fc, skip, count)
    if fc & MASK_FC_COMPRESSED != 0
      f.pos = ((fc ^ MASK_FC_COMPRESSED) >> 1) + skip
      convert_from_cp1252(f.read(count))
    else
      f.pos = fc + skip * 2
      convert_from_utf16le(f.read(count * 2))
    end
  end

  # Reads a little-endian unsigned 16-bit integer at byte position `pos`.
  def get_ushort(f, pos)
    f.pos = pos
    f.read(2).unpack('v')[0]
  end

  # Reads a little-endian unsigned 32-bit integer at byte position `pos`.
  def get_ulong(f, pos)
    f.pos = pos
    f.read(4).unpack('V')[0]
  end

  # Returns the text of part `target`, which starts after all the parts
  # named in `skip_tokens`.
  def retrieve_token_and_filter(skip_tokens, target, *args)
    skip = skip_tokens.inject(0) { |sum, key| sum + @ccp[key] }
    retrieve_and_filter(skip, @ccp[target], *args)
  end

  # Reads `length` characters at `offset` from the WordDocument stream.
  # The result is formatted with #format_into_plain unless the options in
  # `args` contain a true :raw value.
  def retrieve_and_filter(offset, length, *args)
    opts = Hash[*args]

    string = ""
    @ole.file.open(PPS_NAME_WORDDOC) do |f|
      string = retrieve_substring(f, offset, length)
    end

    return string if opts[:raw]

    format_into_plain(string)
  end

  # Replacement for special characters. #format_into_plain only consults the
  # table for characters in the range 0x00-0x1f; characters of that range
  # that are not listed are removed, all other characters are kept as they
  # are.
  CHARMAP = {
    "\x0a" => "\n",       # ASIS: Line Feed
    "\x09" => "\t",       # ASIS: Tab

    "\x0d" => "\n",       # Paragraph ends; \n + U+2029?

    "\x0b" => "\n",       # Hard line breaks

    "\x2d" => "\x2d",     # ASIS: Breaking hyphens; U+2010?
    "\x1f" => "\u{00ad}", # Non-required hyphens (into Soft hyphen)
    "\x1e" => "\u{2011}", # Non-breaking hyphens

    "\xa0" => "\xa0",     # ASIS: Non-breaking-spaces

    "\x0c" => "\x0c",     # ASIS: Page breaks or Section marks

    "\x0e" => "\x0e",     # ASIS: Column breaks

    "\x13" => "",         # Field begin mark
    "\x15" => "",         # Field end mark
    "\x14" => "",         # Field separator

    "\x07" => "\t",       # Cell mark or Row mark
  }

  # Converts the special characters of Word text into plain text.
  #
  # The last two marks of a run of 0x07 marks become a line break; after
  # that every control character is replaced through CHARMAP.
  def format_into_plain(text)
    # Double quotes are required: in a single-quoted replacement "\n" would
    # be inserted as a literal backslash followed by "n".
    text.gsub(/([\x07]*)[\x07]{2}/, "\\1\n") \
        .gsub(/[\x00-\x1f]/) { |char| CHARMAP[char] || "" }
  end

  if defined?(Encoding)
    # for Ruby 1.9+

    # Converts Windows-1252 bytes into a UTF-8 string.
    def convert_from_cp1252(str)
      @enc_utf8   ||= Encoding.find('UTF-8')
      @enc_cp1252 ||= Encoding.find('Windows-1252')
      str.encode(@enc_utf8, @enc_cp1252)
    end

    # Converts UTF-16LE bytes into a UTF-8 string.
    def convert_from_utf16le(str)
      @enc_utf8  ||= Encoding.find('UTF-8')
      @enc_utf16 ||= Encoding.find('UTF-16LE')
      str.encode(@enc_utf8, @enc_utf16)
    end
  else
    # for Ruby 1.8
    require 'nkf'

    def convert_from_cp1252(str)
      return NKF.nkf(dest_encoding() + ' -W', str)
    end

    def convert_from_utf16le(str)
      return NKF.nkf(dest_encoding() + ' -W16L0', str)
    end

    # NKF output switch matching the current $KCODE.
    def dest_encoding
      case $KCODE
      when /^E/i then '-e'
      when /^S/i then '-s'
      when /^U/i then '-w'
      else '-w'
      end
    end
  end

end
376
+ end
@@ -0,0 +1 @@
1
+ require File.expand_path('../msworddoc/extractor', __FILE__)
data/test/fareast.doc ADDED
Binary file
data/test/lorem.doc ADDED
Binary file
@@ -0,0 +1,32 @@
1
+ # coding: utf-8
2
+ require 'rubygems'
3
+ require 'test/unit'
4
+ begin
5
+ require 'redgreen'
6
+ rescue LoadError
7
+ end
8
+
9
+ require 'msworddoc-extractor'
10
+
11
# Extraction tests against a document containing Japanese text.
class TestFareast < Test::Unit::TestCase
  # Opens the fixture before every test.
  def setup
    @doc = MSWordDoc::Extractor.load('test/fareast.doc')
  end

  # Releases the fixture after every test.
  def teardown
    @doc.close
  end

  # The main text contains the first two phrases of the poem.
  def test_document
    assert_match(%r{ 色は匂へど \s+ 散りぬるを }xm, @doc.document, "document")
  end

  # The header contains the title of the poem.
  def test_header
    assert_match(%r{ いろは歌 }xm, @doc.header, "header")
  end

  # The footnote contains the kana reading.
  def test_footnote
    assert_match(%r{ いろはにほへとちりぬるを }xm, @doc.footnote, "footnote")
  end
end
data/test/test_io.rb ADDED
@@ -0,0 +1,34 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ begin
4
+ require 'redgreen'
5
+ rescue LoadError
6
+ end
7
+
8
+ require 'msworddoc-extractor'
9
+ require 'stringio'
10
+
11
# Checks that Extractor.load accepts already opened IO objects, not only
# file paths.
class TestIO < Test::Unit::TestCase
  # A File object opened by the caller.
  def test_fileio
    # .doc files are binary, so open them in binary mode.
    File.open('test/lorem.doc', 'rb') do |file|
      MSWordDoc::Extractor.load(file) do |doc|
        assert_match %r{ \A Lorem \s+ ipsum \s+ }xm, doc.document, "document"
      end
    end
  end

  # An in-memory copy of the file wrapped in a StringIO.
  def test_stringio
    data = File.open('test/lorem.doc', 'rb') { |file| file.read }

    io = StringIO.new(data, 'r')
    begin
      MSWordDoc::Extractor.load(io) do |doc|
        assert_match %r{ \A Lorem \s+ ipsum \s+ }xm, doc.document, "document"
      end
    ensure
      # Close the StringIO even when the assertion fails.
      io.close
    end
  end
end
@@ -0,0 +1,34 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ begin
4
+ require 'redgreen'
5
+ rescue LoadError
6
+ end
7
+
8
+ require 'msworddoc-extractor'
9
+
10
# Extraction tests against the "Lorem ipsum" fixture document.
class TestMsworddoc < Test::Unit::TestCase
  # Opens the fixture before every test.
  def setup
    @doc = MSWordDoc::Extractor.load('test/lorem.doc')
  end

  # Releases the fixture after every test.
  def teardown
    @doc.close
  end

  # Beginning, middle and end of the main text are all present.
  def test_document
    [
      %r{ \A Lorem \s+ ipsum \s+ }xm,
      %r{ \s+ Duis \s+ aute \s+ }xm,
      %r{ \s+ id \s+ est \s+ laborum[.] }xm,
    ].each do |pattern|
      assert_match(pattern, @doc.document, "document")
    end
  end

  # The header starts with the same words as the main text.
  def test_header
    assert_match(%r{ \A Lorem \s+ ipsum \s+ ... }xm, @doc.header, "header")
  end

  # Both halves of the pangram are present in the footnote.
  def test_footnote
    [
      %r{ The \s+ quick \s+ brown \s+ fox \s+ }xm,
      %r{ \s+ jumps \s+ over \s+ the \s+ lazy \s+ dog[.] }xm,
    ].each do |pattern|
      assert_match(pattern, @doc.footnote, "footnote")
    end
  end
end
metadata ADDED
@@ -0,0 +1,81 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: msworddoc-extractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - ITO Nobuaki
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-05-11 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: ruby-ole
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ description: Extract text contents from Microsoft Word Document.
31
+ email:
32
+ - daydream.trippers@gmail.com
33
+ executables:
34
+ - worddoc-extract
35
+ extensions: []
36
+ extra_rdoc_files: []
37
+ files:
38
+ - bin/worddoc-extract
39
+ - lib/msworddoc-extractor.rb
40
+ - lib/msworddoc/extractor.rb
41
+ - test/test_msworddoc.rb
42
+ - test/test_fareast.rb
43
+ - test/test_io.rb
44
+ - test/lorem.doc
45
+ - test/fareast.doc
46
+ homepage: ''
47
+ licenses: []
48
+ post_install_message:
49
+ rdoc_options: []
50
+ require_paths:
51
+ - lib
52
+ required_ruby_version: !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ! '>='
56
+ - !ruby/object:Gem::Version
57
+ version: '0'
58
+ segments:
59
+ - 0
60
+ hash: 1216945514845922976
61
+ required_rubygems_version: !ruby/object:Gem::Requirement
62
+ none: false
63
+ requirements:
64
+ - - ! '>='
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
67
+ segments:
68
+ - 0
69
+ hash: 1216945514845922976
70
+ requirements: []
71
+ rubyforge_project:
72
+ rubygems_version: 1.8.23
73
+ signing_key:
74
+ specification_version: 3
75
+ summary: Extract text contents from Microsoft Word Document
76
+ test_files:
77
+ - test/test_msworddoc.rb
78
+ - test/test_fareast.rb
79
+ - test/test_io.rb
80
+ - test/lorem.doc
81
+ - test/fareast.doc