msworddoc-extractor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+ require 'msworddoc-extractor'
5
+
6
# Command-line entry point of worddoc-extract.
#
# Parses the switches found in +argv+, treats every remaining argument as the
# path of a Word document and prints each requested part of each document to
# standard output, in the order the switches were given.  When no part is
# requested the main document text is printed.
#
# argv:: command-line arguments (switches and file names)
def app(*argv)
  actions = []

  # One row per switch: the OptionParser arguments followed by the name of
  # the document method that extracts the corresponding part.
  options = [
    [ '-d', '--document', 'Main contents (default)', :document ],
    [ '-w', '--whole', 'Whole text contents', :whole_contents ],
    [ '-i', '--header', 'Header parts', :header ],
    [ '-f', '--footnote', 'Footnotes', :footnote ],
    [ '-e', '--endnote', 'Endnotes', :endnote ],
    [ '-a', '--annotation', 'Annotations', :annotation ],
    [ '-t', '--textbox', 'Text boxes', :textbox ],
    [ '--header_textbox', 'Header text boxes', :header_textbox ],
    [ '-m', '--macro', 'Macro part', :macro ],
  ]

  optparse = OptionParser.new do |opt|
    opt.banner = 'Usage: worddoc-extract [options] <files> ...'

    options.each do |row|
      action = row.last
      # Everything but the trailing method name is handed to OptionParser.
      opt.on(*row[0...-1]) { actions << action }
    end

    opt.separator ''
    opt.on('-h', '--help', 'Show this help') { puts opt; exit }
  end

  files = optparse.parse(argv)

  # The default can only be decided once the switches have been parsed;
  # applying it earlier would print the main text in addition to whatever
  # part was explicitly requested.
  actions << :document if actions.empty?

  files.each do |file|
    # The block form closes the document once its parts have been printed.
    MSWordDoc::Extractor.load(file) do |doc|
      actions.each { |action| puts doc.send(action) }
    end
  end
end
46
+
47
+ app(*ARGV)
48
+
@@ -0,0 +1,376 @@
1
+ require 'rubygems'
2
+ require 'ole/storage'
3
+
4
+ module MSWordDoc
5
# Entry points for opening Word binary documents.
module Extractor
  VERSION = '0.1.0'

  # Opens +file+ through Ole::Storage.open and parses it into an Essence.
  #
  # file:: whatever Ole::Storage.open accepts; the bundled tests pass a
  #        path, a File and a StringIO
  #
  # Without a block the loaded document is returned and the caller is
  # responsible for calling +close+ on it.  With a block the document is
  # yielded, closed when the block finishes (even if it raises) and +nil+
  # is returned.
  #
  # Errors raised while parsing (for example 'Not a Word document')
  # propagate to the caller; the storage is closed first so a rejected
  # file does not stay open.
  def self.load(file)
    ole = Ole::Storage.open(file)
    doc = Essence.new

    loaded = false
    begin
      doc.load_storage(ole)
      loaded = true
    ensure
      # Nobody else holds a reference to the storage when loading failed.
      ole.close unless loaded
    end

    return doc unless block_given?

    begin
      yield doc
    ensure
      doc.close
    end

    nil
  end
end
28
+
29
# Text extractor for one Word binary document held in an OLE storage.
#
# load_storage reads the header (FIB) of the 'WordDocument' stream and the
# piece table stored in the '0Table' or '1Table' stream; the public part
# accessors (document, footnote, header, ...) then return the text of one
# part of the document as a string.
#
# Every part accessor takes the same optional arguments: pass
# <tt>:raw => true</tt> to get the text exactly as decoded, otherwise Word's
# control characters are translated by format_into_plain.
class Essence
  PPS_NAME_WORDDOC = 'WordDocument'
  PPS_NAME_TABLE_TMPL = '%dTable'

  MAGIC_MSWORD = 0xa5ec
  NFIB_MSWORD6 = 101

  OFFSET_FIB_IDENT = 0x0000
  OFFSET_FIB_FIB = 0x0002

  OFFSET_FIB_FLAGS = 0x000a
  OFFSET_FIB_FCCLX = 0x01a2
  OFFSET_FIB_LCBCLX = 0x01a6

  OFFSET_FIB_FCMIN = 0x0018
  OFFSET_FIB_FCMAC = 0x001c
  OFFSET_FIB_CBMAC = 0x0040

  MASK_FIBFLAG_COMPLEX = 0x0004
  MASK_FIBFLAG_ENCRYPTED = 0x0100
  MASK_FIBFLAG_WHICHTBLSTM = 0x0200

  # Bit of a piece's fc field that marks the piece as 8-bit (cp1252) text.
  MASK_FC_COMPRESSED = 0x40000000

  LENGTH_CP = 4
  LENGTH_PCD = 8

  # Header offsets of the character counts of each part of the document.
  OFFSET_FIB_CCP_MAP = {
    :Text => 0x004c,
    :Ftn => 0x0050,
    :Hdd => 0x0054,
    :Mcr => 0x0058,
    :Atn => 0x005c,
    :Edn => 0x0060,
    :Txbx => 0x0064,
    :HdrTxbx => 0x0068,
  }.freeze

  # Order in which the parts follow each other in the character stream.
  # Kept as an array because hash ordering is not guaranteed on Ruby 1.8.
  CCP_ORDER = [ :Text, :Ftn, :Hdd, :Mcr, :Atn, :Edn, :Txbx, :HdrTxbx ].freeze

  def initialize
    @flag = {}
    @ccp = {}
    @ole = nil
  end

  # Closes the underlying storage.  Calling it again is a no-op.
  def close
    return unless @ole

    @ole.close
    @ole = nil
  end

  # Attaches +ole+ and parses the header and the piece table from it.
  #
  # Raises RuntimeError when the stream is not a supported, unencrypted
  # Word document (see parse_fib and parse_piece_table for the messages).
  def load_storage(ole)
    @ole = ole

    @ole.file.open(PPS_NAME_WORDDOC) do |f|
      parse_fib(f)
    end

    # The fWhichTblStm flag selects which of the two table streams is live.
    name_of_table = PPS_NAME_TABLE_TMPL % (@flag[:fWhichTblStm] ? 1 : 0)
    @ole.file.open(name_of_table) do |f|
      parse_piece_table(f)
    end
  end

  # Text of every piece, from the first character to the last.
  def whole_contents(*args)
    retrieve_and_filter(0, -1, *args)
  end

  # Main document text.
  def document(*args)
    retrieve_part(:Text, *args)
  end

  # Footnote text.
  def footnote(*args)
    retrieve_part(:Ftn, *args)
  end

  # Header part text.
  def header(*args)
    retrieve_part(:Hdd, *args)
  end

  # Macro part text.
  def macro(*args)
    retrieve_part(:Mcr, *args)
  end

  # Annotation text.
  def annotation(*args)
    retrieve_part(:Atn, *args)
  end

  # Endnote text.
  def endnote(*args)
    retrieve_part(:Edn, *args)
  end

  # Text box text.
  def textbox(*args)
    retrieve_part(:Txbx, *args)
  end

  # Header text box text.
  def header_textbox(*args)
    retrieve_part(:HdrTxbx, *args)
  end

  private

  # Validates the header of the WordDocument stream +f+ and records the
  # flags, offsets and character counts needed later.
  #
  # Raises RuntimeError for a wrong magic number, a version older than
  # NFIB_MSWORD6 or an encrypted document.
  def parse_fib(f)
    raise 'Not a Word document' if get_ushort(f, OFFSET_FIB_IDENT) != MAGIC_MSWORD
    raise 'Unsupported version' if get_ushort(f, OFFSET_FIB_FIB) < NFIB_MSWORD6

    flags = get_ushort(f, OFFSET_FIB_FLAGS)

    @flag[:fComplex] = (flags & MASK_FIBFLAG_COMPLEX != 0)
    @flag[:fEncrypted] = (flags & MASK_FIBFLAG_ENCRYPTED != 0)
    @flag[:fWhichTblStm] = (flags & MASK_FIBFLAG_WHICHTBLSTM != 0)

    if @flag[:fEncrypted]
      raise 'Encrypted MSWord document file is not supported'
    end

    @fcMin = get_ulong(f, OFFSET_FIB_FCMIN)
    @fcMac = get_ulong(f, OFFSET_FIB_FCMAC)
    @cbMac = get_ulong(f, OFFSET_FIB_CBMAC)

    @fcClx = get_ulong(f, OFFSET_FIB_FCCLX)
    @lcbClx = get_ulong(f, OFFSET_FIB_LCBCLX)

    parse_fib_ccps(f)
  end

  # Reads the character count of every part listed in OFFSET_FIB_CCP_MAP.
  def parse_fib_ccps(f)
    OFFSET_FIB_CCP_MAP.each do |key, offset|
      @ccp[key] = get_ulong(f, offset)
    end
  end

  # Builds @pcds, the list of text pieces, from the table stream +f+.
  #
  # Each piece is a hash with :fc (position field of the piece), :cp (index
  # of its first character) and :ccp (number of characters).
  #
  # Raises RuntimeError for an unknown CLX block or a missing piece table.
  def parse_piece_table(f)
    if @lcbClx <= 0
      # No CLX recorded: describe the whole text as one piece at fcMin.
      total = OFFSET_FIB_CCP_MAP.keys.inject(0) { |sum, key| sum + @ccp[key] }
      @pcds = [ { :fc => @fcMin, :cp => 0, :ccp => total } ]
      return
    end

    f.pos = @fcClx
    # to_s: a read at end of stream yields nil; report it as 'PCDs not found'.
    clx = f.read(@lcbClx).to_s

    while clx.length > 0
      clxt = clx.slice!(0, 1).unpack('C')[0]
      break if clxt == 2 # plcfpcd

      raise 'Unknown CLX block' unless clxt == 1

      # grpprl => SKIP (16-bit length followed by that many bytes)
      skip = clx.slice!(0, 2).unpack('v')[0]
      clx.slice!(0, skip)
    end
    raise 'PCDs not found' unless clx.length > 0

    length = clx.slice!(0, 4).unpack('V')[0]

    # The block holds n + 1 character positions followed by n descriptors.
    n = (length - LENGTH_CP) / (LENGTH_CP + LENGTH_PCD)

    cps = Array.new(n + 1) { clx.slice!(0, LENGTH_CP).unpack('V')[0] }

    @pcds = (1..n).map do |i|
      pcd_data = clx.slice!(0, LENGTH_PCD)

      {
        :fc => pcd_data.slice(2, 4).unpack('V')[0],
        :cp => cps[i - 1],
        :ccp => cps[i] - cps[i - 1]
      }
    end
  end

  # Decodes +length+ characters starting at character index +offset+ from
  # the WordDocument stream +f+.  A negative +length+ reads up to the end of
  # the last piece.
  #
  # Raises RuntimeError when no piece starts at or before +offset+.
  def retrieve_substring(f, offset, length = -1)
    # Index of the last piece starting at or before +offset+.
    i = 0
    i += 1 while i < @pcds.length && @pcds[i][:cp] <= offset
    i -= 1
    raise 'could not find suitable heading piece' unless i >= 0

    # +offset+ is counted from the start of the text, the piece position is
    # counted from the start of the piece: skip only the difference.
    skip = offset - @pcds[i][:cp]

    output = ''.dup
    while length != 0 && i < @pcds.length
      pcd = @pcds[i]

      # Never read past the end of the current piece.
      len = pcd[:ccp] - skip
      len = 0 if len < 0
      len = length if length >= 0 && length < len

      if pcd[:fc] & MASK_FC_COMPRESSED != 0
        # cp1252: one byte per character, position stored doubled
        f.pos = ((pcd[:fc] ^ MASK_FC_COMPRESSED) >> 1) + skip
        output << convert_from_cp1252(f.read(len))
      else
        # UTF-16LE: two bytes per character
        f.pos = pcd[:fc] + skip * 2
        output << convert_from_utf16le(f.read(len * 2))
      end

      length -= len if length >= 0

      # Later pieces are read from their beginning.
      skip = 0
      i += 1
    end

    output
  end

  # Reads +size+ bytes at +pos+ and unpacks them with +format+.
  # Raises RuntimeError when the stream ends before +size+ bytes.
  def read_field(f, pos, size, format)
    f.pos = pos
    bytes = f.read(size)
    raise 'Unexpected end of stream' if bytes.nil? || bytes.length < size

    bytes.unpack(format)[0]
  end

  # Little-endian unsigned 16-bit integer at +pos+.
  def get_ushort(f, pos)
    read_field(f, pos, 2, 'v')
  end

  # Little-endian unsigned 32-bit integer at +pos+.
  def get_ulong(f, pos)
    read_field(f, pos, 4, 'V')
  end

  # Returns the text of the part +target+ (a key of CCP_ORDER): it starts
  # after the characters of all the parts that precede it.
  def retrieve_part(target, *args)
    preceding = CCP_ORDER[0, CCP_ORDER.index(target)]
    skip = preceding.inject(0) { |sum, key| sum + @ccp[key] }

    retrieve_and_filter(skip, @ccp[target], *args)
  end

  # Reads +length+ characters from +offset+ and, unless the options in
  # +args+ contain a true :raw, converts them with format_into_plain.
  def retrieve_and_filter(offset, length, *args)
    opts = Hash[*args]

    string = ""
    @ole.file.open(PPS_NAME_WORDDOC) do |f|
      string = retrieve_substring(f, offset, length)
    end

    return string if opts[:raw]

    format_into_plain(string)
  end

  # Replacements for the characters Word uses as marks.  format_into_plain
  # only consults this table for \x00-\x1f (dropping control characters
  # that are not listed); the entries outside that range document
  # characters that pass through unchanged.
  CHARMAP = {
    "\x0a" => "\n", # ASIS: Line Feed
    "\x09" => "\t", # ASIS: Tab

    "\x0d" => "\n", # Paragraph ends; \n + U+2029?

    "\x0b" => "\n", # Hard line breaks

    "\x2d" => "\x2d", # ASIS: Breaking hyphens; U+2010?
    "\x1f" => "\u{00ad}", # Non-required hyphens (into Soft hyphen)
    "\x1e" => "\u{2011}", # Non-breaking hyphens

    "\xa0" => "\xa0", # ASIS: Non-breaking-spaces

    "\x0c" => "\x0c", # ASIS: Page breaks or Section marks

    "\x0e" => "\x0e", # ASIS: Column breaks

    "\x13" => "", # Field begin mark
    "\x15" => "", # Field end mark
    "\x14" => "", # Field separator

    "\x07" => "\t", # Cell mark or Row mark
  }.freeze

  # Converts decoded text into plain text: the last two marks of a run of
  # \x07 become a newline, then every control character is replaced
  # according to CHARMAP.
  def format_into_plain(text)
    # Double quotes: the replacement must end with a real newline, not with
    # a backslash followed by "n".
    text.gsub(/([\x07]*)[\x07]{2}/, "\\1\n") \
        .gsub(/([\x00-\x1f])/) { CHARMAP[$1] || "" }
  end

  if defined?(Encoding)
    # for Ruby 1.9+

    # Transcodes Windows-1252 bytes to UTF-8.
    def convert_from_cp1252(str)
      @enc_utf8 ||= Encoding.find('UTF-8')
      @enc_cp1252 ||= Encoding.find('Windows-1252')
      str.encode(@enc_utf8, @enc_cp1252)
    end

    # Transcodes UTF-16LE bytes to UTF-8.
    def convert_from_utf16le(str)
      @enc_utf8 ||= Encoding.find('UTF-8')
      @enc_utf16 ||= Encoding.find('UTF-16LE')
      str.encode(@enc_utf8, @enc_utf16)
    end
  else
    # for Ruby 1.8
    require 'nkf'

    def convert_from_cp1252(str)
      return NKF.nkf(dest_encoding() + ' -W', str)
    end

    def convert_from_utf16le(str)
      return NKF.nkf(dest_encoding() + ' -W16L0', str)
    end

    # NKF output switch matching $KCODE (UTF-8 when unset).
    def dest_encoding
      case $KCODE
      when /^E/i then '-e'
      when /^S/i then '-s'
      when /^U/i then '-w'
      else '-w'
      end
    end
  end

end
376
+ end
@@ -0,0 +1 @@
1
+ require File.expand_path('../msworddoc/extractor', __FILE__)
data/test/fareast.doc ADDED
Binary file
data/test/lorem.doc ADDED
Binary file
@@ -0,0 +1,32 @@
1
+ # coding: utf-8
2
+ require 'rubygems'
3
+ require 'test/unit'
4
+ begin
5
+ require 'redgreen'
6
+ rescue LoadError
7
+ end
8
+
9
+ require 'msworddoc-extractor'
10
+
11
# Extraction tests against the fixture that contains Japanese text.
class TestFareast < Test::Unit::TestCase
  # Resolved relative to this file so the suite does not depend on the
  # directory it is started from.
  FIXTURE = File.expand_path('../fareast.doc', __FILE__)

  def setup
    @doc = MSWordDoc::Extractor.load(FIXTURE)
  end

  def teardown
    # setup may have failed before @doc was assigned; do not mask that error.
    @doc.close if @doc
  end

  def test_document
    assert_match %r{ 色は匂へど \s+ 散りぬるを }xm, @doc.document, "document"
  end

  def test_header
    assert_match %r{ いろは歌 }xm, @doc.header, "header"
  end

  def test_footnote
    assert_match %r{ いろはにほへとちりぬるを }xm, @doc.footnote, "footnote"
  end

end
data/test/test_io.rb ADDED
@@ -0,0 +1,34 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ begin
4
+ require 'redgreen'
5
+ rescue LoadError
6
+ end
7
+
8
+ require 'msworddoc-extractor'
9
+ require 'stringio'
10
+
11
# Checks that a document can be loaded from an already opened IO object
# (a File or a StringIO) instead of a path.
class TestIO < Test::Unit::TestCase
  # Resolved relative to this file so the suite does not depend on the
  # directory it is started from.
  FIXTURE = File.expand_path('../lorem.doc', __FILE__)

  def test_fileio
    # 'rb': the fixture is binary data and must not go through newline or
    # encoding translation.
    File.open(FIXTURE, 'rb') do |file|
      MSWordDoc::Extractor.load(file) do |doc|
        assert_match %r{ \A Lorem \s+ ipsum \s+ }xm, doc.document, "document"
      end
    end
  end

  def test_stringio
    data = File.open(FIXTURE, 'rb') { |file| file.read }

    io = StringIO.new(data, 'r')

    MSWordDoc::Extractor.load(io) do |doc|
      assert_match %r{ \A Lorem \s+ ipsum \s+ }xm, doc.document, "document"
    end

    io.close()
  end
end
@@ -0,0 +1,34 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ begin
4
+ require 'redgreen'
5
+ rescue LoadError
6
+ end
7
+
8
+ require 'msworddoc-extractor'
9
+
10
# Extraction tests against the Latin-text fixture.
class TestMsworddoc < Test::Unit::TestCase
  # Resolved relative to this file so the suite does not depend on the
  # directory it is started from.
  FIXTURE = File.expand_path('../lorem.doc', __FILE__)

  def setup
    @doc = MSWordDoc::Extractor.load(FIXTURE)
  end

  def teardown
    # setup may have failed before @doc was assigned; do not mask that error.
    @doc.close if @doc
  end

  def test_document
    assert_match %r{ \A Lorem \s+ ipsum \s+ }xm, @doc.document, "document"
    assert_match %r{ \s+ Duis \s+ aute \s+ }xm, @doc.document, "document"
    assert_match %r{ \s+ id \s+ est \s+ laborum[.] }xm, @doc.document, "document"
  end

  def test_header
    assert_match %r{ \A Lorem \s+ ipsum \s+ ... }xm, @doc.header, "header"
  end

  def test_footnote
    assert_match %r{ The \s+ quick \s+ brown \s+ fox \s+ }xm, @doc.footnote, "footnote"
    assert_match %r{ \s+ jumps \s+ over \s+ the \s+ lazy \s+ dog[.] }xm, @doc.footnote, "footnote"
  end

end
metadata ADDED
@@ -0,0 +1,81 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: msworddoc-extractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - ITO Nobuaki
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-05-11 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: ruby-ole
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ description: Extract text contents from Microsoft Word Document.
31
+ email:
32
+ - daydream.trippers@gmail.com
33
+ executables:
34
+ - worddoc-extract
35
+ extensions: []
36
+ extra_rdoc_files: []
37
+ files:
38
+ - bin/worddoc-extract
39
+ - lib/msworddoc-extractor.rb
40
+ - lib/msworddoc/extractor.rb
41
+ - test/test_msworddoc.rb
42
+ - test/test_fareast.rb
43
+ - test/test_io.rb
44
+ - test/lorem.doc
45
+ - test/fareast.doc
46
+ homepage: ''
47
+ licenses: []
48
+ post_install_message:
49
+ rdoc_options: []
50
+ require_paths:
51
+ - lib
52
+ required_ruby_version: !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ! '>='
56
+ - !ruby/object:Gem::Version
57
+ version: '0'
58
+ segments:
59
+ - 0
60
+ hash: 1216945514845922976
61
+ required_rubygems_version: !ruby/object:Gem::Requirement
62
+ none: false
63
+ requirements:
64
+ - - ! '>='
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
67
+ segments:
68
+ - 0
69
+ hash: 1216945514845922976
70
+ requirements: []
71
+ rubyforge_project:
72
+ rubygems_version: 1.8.23
73
+ signing_key:
74
+ specification_version: 3
75
+ summary: Extract text contents from Microsoft Word Document
76
+ test_files:
77
+ - test/test_msworddoc.rb
78
+ - test/test_fareast.rb
79
+ - test/test_io.rb
80
+ - test/lorem.doc
81
+ - test/fareast.doc