msworddoc-extractor 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/worddoc-extract +48 -0
- data/lib/msworddoc/extractor.rb +376 -0
- data/lib/msworddoc-extractor.rb +1 -0
- data/test/fareast.doc +0 -0
- data/test/lorem.doc +0 -0
- data/test/test_fareast.rb +32 -0
- data/test/test_io.rb +34 -0
- data/test/test_msworddoc.rb +34 -0
- metadata +81 -0
data/bin/worddoc-extract
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
require 'msworddoc-extractor'
|
5
|
+
|
6
|
+
# Command-line entry point: prints the requested parts of each given file.
#
# Every part option appends one extraction action. The actions run in the
# order the options were given, once per file. When no part option is given,
# the main document text is printed.
#
# argv:: command-line arguments (option switches and file names)
def app(*argv)
  actions = []

  # Each row: option switches..., description, Essence method to call.
  options = [
    ['-d', '--document',   'Main contents (default)', :document],
    ['-w', '--whole',      'Whole text contents',     :whole_contents],
    ['-i', '--header',     'Header parts',            :header],
    ['-f', '--footnote',   'Footnotes',               :footnote],
    ['-e', '--endnote',    'Endnotes',                :endnote],
    ['-a', '--annotation', 'Annotations',             :annotation],
    ['-t', '--textbox',    'Text boxes',              :textbox],
    ['--header_textbox',   'Header text boxes',       :header_textbox],
    ['-m', '--macro',      'Macro part',              :macro],
  ]

  optparse = OptionParser.new do |opt|
    opt.banner = 'Usage: worddoc-extract [options] <files> ...'

    options.each do |spec|
      action = spec.last
      opt.on(*spec[0...-1]) { actions << action }
    end

    opt.separator ''
    opt.on('-h', '--help', 'Show this help') { puts opt; exit }
  end

  files = optparse.parse(argv)

  # The default can only be decided after parsing has filled in `actions`;
  # deciding it earlier would print the main text in addition to every
  # explicitly requested part.
  actions << :document if actions.empty?

  files.each do |file|
    # Block form so the underlying storage is closed after each file.
    MSWordDoc::Extractor.load(file) do |doc|
      actions.each { |action| puts doc.send(action) }
    end
  end
end
|
46
|
+
|
47
|
+
app(*ARGV)
|
48
|
+
|
@@ -0,0 +1,376 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'ole/storage'
|
3
|
+
|
4
|
+
module MSWordDoc
|
5
|
+
module Extractor
|
6
|
+
VERSION = '0.1.0'
|
7
|
+
|
8
|
+
# Opens +file+ as an OLE storage and parses it into an Essence.
#
# Without a block the loaded document is returned and the caller is
# responsible for calling +close+ on it. With a block the document is
# yielded, closed when the block finishes (even if it raises), and nil is
# returned.
#
# file:: whatever Ole::Storage.open accepts (the tests in this package pass
#        a path, a File and a StringIO)
def self.load(file)
  doc = Essence.new
  ole = Ole::Storage.open(file)

  # If parsing fails the caller never receives +doc+, so nobody could close
  # the storage afterwards; release it here before the error propagates.
  loaded = false
  begin
    doc.load_storage(ole)
    loaded = true
  ensure
    ole.close unless loaded
  end

  return doc unless block_given?

  begin
    yield doc
  ensure
    doc.close
  end

  nil
end
|
27
|
+
end
|
28
|
+
|
29
|
+
# Text extractor for one Word binary document held in an OLE storage.
#
# +load_storage+ reads the header fields of the 'WordDocument' stream and the
# piece table from the '0Table' / '1Table' stream. The public accessors then
# return one part of the text each. Every accessor accepts an optional
# <tt>:raw => true</tt> option; without it the text is passed through
# +format_into_plain+, which translates or drops control characters.
class Essence
  # Stream names inside the OLE storage.
  PPS_NAME_WORDDOC    = 'WordDocument'
  PPS_NAME_TABLE_TMPL = '%dTable'

  MAGIC_MSWORD = 0xa5ec
  NFIB_MSWORD6 = 101

  # Byte offsets of the header fields read from the WordDocument stream.
  OFFSET_FIB_IDENT = 0x0000
  OFFSET_FIB_FIB   = 0x0002

  OFFSET_FIB_FLAGS  = 0x000a
  OFFSET_FIB_FCCLX  = 0x01a2
  OFFSET_FIB_LCBCLX = 0x01a6

  OFFSET_FIB_FCMIN = 0x0018
  OFFSET_FIB_FCMAC = 0x001c
  OFFSET_FIB_CBMAC = 0x0040

  # Bits of the 16-bit flags field at OFFSET_FIB_FLAGS.
  MASK_FIBFLAG_COMPLEX     = 0x0004
  MASK_FIBFLAG_ENCRYPTED   = 0x0100
  MASK_FIBFLAG_WHICHTBLSTM = 0x0200

  # Bit of a piece's file offset marking 8-bit (cp1252) text stored at half
  # the remaining value; pieces without it hold UTF-16LE text.
  MASK_PCD_FC_COMPRESSED = 0x40000000

  # Sizes in bytes of one character position and one piece descriptor.
  LENGTH_CP  = 4
  LENGTH_PCD = 8

  # Offsets of the per-part character counts. The parts follow each other in
  # the text in the order used by the accessors below.
  OFFSET_FIB_CCP_MAP = {
    :Text    => 0x004c,
    :Ftn     => 0x0050,
    :Hdd     => 0x0054,
    :Mcr     => 0x0058,
    :Atn     => 0x005c,
    :Edn     => 0x0060,
    :Txbx    => 0x0064,
    :HdrTxbx => 0x0068,
  }

  def initialize
    @flag = {}
    @ccp  = {}
    @ole  = nil
  end

  # Closes the underlying storage. Calling it again is a no-op.
  def close
    @ole.close if @ole
    @ole = nil
  end

  # Attaches +ole+ and parses the header and the piece table from it.
  #
  # Raises RuntimeError for streams that are not Word documents, for
  # unsupported versions and for encrypted documents.
  def load_storage(ole)
    @ole = ole

    @ole.file.open(PPS_NAME_WORDDOC) do |f|
      parse_fib(f)
    end

    table_name = PPS_NAME_TABLE_TMPL % (@flag[:fWhichTblStm] ? 1 : 0)
    @ole.file.open(table_name) do |f|
      parse_piece_table(f)
    end
  end

  # All text from the first character to the end of the last piece.
  def whole_contents(*args)
    retrieve_and_filter(0, -1, *args)
  end

  # Main document text.
  def document(*args)
    retrieve_and_filter(0, @ccp[:Text], *args)
  end

  # Footnote text, which directly follows the main text.
  def footnote(*args)
    retrieve_and_filter(@ccp[:Text], @ccp[:Ftn], *args)
  end

  # Header text.
  def header(*args)
    retrieve_token_and_filter([:Text, :Ftn], :Hdd, *args)
  end

  # Macro text.
  def macro(*args)
    retrieve_token_and_filter([:Text, :Ftn, :Hdd], :Mcr, *args)
  end

  # Annotation text.
  def annotation(*args)
    retrieve_token_and_filter([:Text, :Ftn, :Hdd, :Mcr], :Atn, *args)
  end

  # Endnote text.
  def endnote(*args)
    retrieve_token_and_filter([:Text, :Ftn, :Hdd, :Mcr, :Atn], :Edn, *args)
  end

  # Text box text.
  def textbox(*args)
    skips = [:Text, :Ftn, :Hdd, :Mcr, :Atn, :Edn]
    retrieve_token_and_filter(skips, :Txbx, *args)
  end

  # Header text box text.
  def header_textbox(*args)
    skips = [:Text, :Ftn, :Hdd, :Mcr, :Atn, :Edn, :Txbx]
    retrieve_token_and_filter(skips, :HdrTxbx, *args)
  end

  private

  # Validates the header of the WordDocument stream +f+ and records the
  # flags, offsets and character counts used later.
  def parse_fib(f)
    if get_ushort(f, OFFSET_FIB_IDENT) != MAGIC_MSWORD
      raise 'Not a Word document'
    end

    if get_ushort(f, OFFSET_FIB_FIB) < NFIB_MSWORD6
      raise 'Unsupported version'
    end

    flags = get_ushort(f, OFFSET_FIB_FLAGS)

    @flag[:fComplex]     = (flags & MASK_FIBFLAG_COMPLEX != 0)
    @flag[:fEncrypted]   = (flags & MASK_FIBFLAG_ENCRYPTED != 0)
    @flag[:fWhichTblStm] = (flags & MASK_FIBFLAG_WHICHTBLSTM != 0)

    # Must test the same key that was just stored; a misspelled key reads
    # nil and lets encrypted files through.
    if @flag[:fEncrypted]
      raise 'Encrypted MSWord document file is not supported'
    end

    @fcMin = get_ulong(f, OFFSET_FIB_FCMIN)
    @fcMac = get_ulong(f, OFFSET_FIB_FCMAC)
    @cbMac = get_ulong(f, OFFSET_FIB_CBMAC)

    @fcClx  = get_ulong(f, OFFSET_FIB_FCCLX)
    @lcbClx = get_ulong(f, OFFSET_FIB_LCBCLX)

    parse_fib_ccps(f)
  end

  # Reads the character count of every part listed in OFFSET_FIB_CCP_MAP.
  def parse_fib_ccps(f)
    OFFSET_FIB_CCP_MAP.each do |key, offset|
      @ccp[key] = get_ulong(f, offset)
    end
  end

  # Builds @pcds, the list of text pieces, from the table stream +f+. Each
  # piece is a hash with :fc (raw file offset field), :cp (first character
  # position) and :ccp (number of characters).
  def parse_piece_table(f)
    if @lcbClx <= 0
      # No piece table stored: describe all parts as one piece at fcMin.
      total = OFFSET_FIB_CCP_MAP.keys.inject(0) { |sum, key| sum + @ccp[key] }
      @pcds = [{ :fc => @fcMin, :cp => 0, :ccp => total }]
      return
    end

    f.pos = @fcClx
    clx = f.read(@lcbClx)

    # Skip type-1 blocks (one type byte, a 16-bit size, then the payload)
    # until the type-2 block holding the piece table is reached.
    while clx.length > 0
      clxt = clx.slice!(0, 1).unpack('C')[0]
      break if clxt == 2 # plcfpcd

      raise 'Unknown CLX block' unless clxt == 1 # grpprl => SKIP

      clx.slice!(0, clx.slice!(0, 2).unpack('v')[0])
    end
    raise 'PCDs not found' unless clx.length > 0

    length = clx.slice!(0, 4).unpack('V')[0]

    # The table holds n + 1 character positions followed by n descriptors.
    n = (length - LENGTH_CP) / (LENGTH_CP + LENGTH_PCD)

    cps = Array.new(n + 1) { clx.slice!(0, LENGTH_CP).unpack('V')[0] }

    @pcds = (0...n).map do |i|
      pcd_data = clx.slice!(0, LENGTH_PCD)

      {
        :fc  => pcd_data.slice(2, 4).unpack('V')[0],
        :cp  => cps[i],
        :ccp => cps[i + 1] - cps[i],
      }
    end
  end

  # Returns +length+ characters starting at character position +offset+,
  # converted to UTF-8 and gathered across as many pieces as needed. A
  # negative +length+ means "up to the end of the last piece".
  def retrieve_substring(f, offset, length = -1)
    # Start in the piece just before the first one beginning after +offset+.
    first_after = @pcds.index { |pcd| pcd[:cp] > offset } || @pcds.length
    index = first_after - 1
    raise 'could not find suitable heading piece' unless index >= 0

    # +offset+ is absolute, but reads are relative to the start of a piece.
    skip = offset - @pcds[index][:cp]

    output = ""
    remaining = length

    while remaining != 0 && index < @pcds.length
      pcd = @pcds[index]

      # Never read past the end of the current piece: the bytes following
      # it in the stream need not belong to the next piece.
      available = pcd[:ccp] - skip
      if available > 0
        count = (remaining < 0 || available < remaining) ? available : remaining
        output << read_piece(f, pcd, skip, count)
        remaining -= count if remaining > 0
      end

      skip = 0
      index += 1
    end

    output
  end

  # Reads +count+ characters of piece +pcd+, skipping its first +skip+
  # characters, and returns them converted by convert_from_cp1252 or
  # convert_from_utf16le.
  def read_piece(f, pcd, skip, count)
    if pcd[:fc] & MASK_PCD_FC_COMPRESSED != 0
      # cp1252, one byte per character
      f.pos = ((pcd[:fc] ^ MASK_PCD_FC_COMPRESSED) >> 1) + skip
      convert_from_cp1252(f.read(count).to_s)
    else
      # UTF-16LE, two bytes per character
      f.pos = pcd[:fc] + skip * 2
      convert_from_utf16le(f.read(count * 2).to_s)
    end
  end

  # Reads a little-endian unsigned 16-bit integer at byte +pos+ of +f+.
  def get_ushort(f, pos)
    f.pos = pos
    f.read(2).unpack('v')[0]
  end

  # Reads a little-endian unsigned 32-bit integer at byte +pos+ of +f+.
  def get_ulong(f, pos)
    f.pos = pos
    f.read(4).unpack('V')[0]
  end

  # Extracts part +target+, which starts after all of the +skip_tokens+ parts.
  def retrieve_token_and_filter(skip_tokens, target, *args)
    skip = skip_tokens.inject(0) { |sum, key| sum + @ccp[key] }
    retrieve_and_filter(skip, @ccp[target], *args)
  end

  # Reads +length+ characters from +offset+ out of the WordDocument stream.
  # +args+ is turned into an option hash; unless :raw is set the text is run
  # through format_into_plain.
  def retrieve_and_filter(offset, length, *args)
    opts = Hash[*args]

    string = ""
    @ole.file.open(PPS_NAME_WORDDOC) do |f|
      string = retrieve_substring(f, offset, length)
    end

    return string if opts[:raw]

    format_into_plain(string)
  end

  # Replacements applied by format_into_plain. Only control characters
  # (\x00-\x1f) are ever looked up, and unlisted ones are dropped; the two
  # entries outside that range are kept purely as documentation.
  CHARMAP = {
    "\x0a" => "\n",       # ASIS: Line Feed
    "\x09" => "\t",       # ASIS: Tab

    "\x0d" => "\n",       # Paragraph ends; \n + U+2029?

    "\x0b" => "\n",       # Hard line breaks

    "\x2d" => "\x2d",     # ASIS: Breaking hyphens; U+2010?
    "\x1f" => "\u{00ad}", # Non-required hyphens (into Soft hyphen)
    "\x1e" => "\u{2011}", # Non-breaking hyphens

    "\u{00a0}" => "\u{00a0}", # ASIS: Non-breaking-spaces

    "\x0c" => "\x0c",     # ASIS: Page breaks or Section marks

    "\x0e" => "\x0e",     # ASIS: Column breaks

    "\x13" => "",         # Field begin mark
    "\x15" => "",         # Field end mark
    "\x14" => "",         # Field separator

    "\x07" => "\t",       # Cell mark or Row mark
  }.freeze

  # Converts extracted text to plain text: the last two marks of a run of
  # \x07 become a newline, remaining control characters are mapped through
  # CHARMAP, and unmapped ones are removed.
  def format_into_plain(text)
    # Block form: a '\n' inside a single-quoted replacement string would be
    # inserted as a literal backslash followed by "n".
    rows_ended = text.gsub(/(\x07*)\x07{2}/) { "#{$1}\n" }

    rows_ended.gsub(/[\x00-\x1f]/) { |ch| CHARMAP[ch] || "" }
  end

  if defined?(Encoding)
    # for Ruby 1.9+

    # Converts Windows-1252 bytes to a UTF-8 string.
    def convert_from_cp1252(str)
      str.encode(Encoding::UTF_8, Encoding::Windows_1252)
    end

    # Converts UTF-16LE bytes to a UTF-8 string.
    def convert_from_utf16le(str)
      str.encode(Encoding::UTF_8, Encoding::UTF_16LE)
    end
  else
    # for Ruby 1.8
    require 'nkf'

    # NOTE(review): nkf's -W flag declares UTF-8 input, so this does not
    # look like a real cp1252 conversion - confirm before relying on it.
    def convert_from_cp1252(str)
      NKF.nkf(dest_encoding + ' -W', str)
    end

    def convert_from_utf16le(str)
      NKF.nkf(dest_encoding + ' -W16L0', str)
    end

    # nkf output flag matching $KCODE; defaults to '-w'.
    def dest_encoding
      case $KCODE
      when /^E/i then '-e'
      when /^S/i then '-s'
      when /^U/i then '-w'
      else '-w'
      end
    end
  end
end
|
376
|
+
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require File.expand_path('../msworddoc/extractor', __FILE__)
|
data/test/fareast.doc
ADDED
Binary file
|
data/test/lorem.doc
ADDED
Binary file
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require 'rubygems'
|
3
|
+
require 'test/unit'
|
4
|
+
begin
|
5
|
+
require 'redgreen'
|
6
|
+
rescue LoadError
|
7
|
+
end
|
8
|
+
|
9
|
+
require 'msworddoc-extractor'
|
10
|
+
|
11
|
+
# Extraction checks against fareast.doc, whose expected text is Japanese.
class TestFareast < Test::Unit::TestCase
  # Resolved next to this file so the suite does not depend on the
  # directory it is started from.
  FIXTURE = File.expand_path('../fareast.doc', __FILE__)

  def setup
    @doc = MSWordDoc::Extractor.load(FIXTURE)
  end

  def teardown
    # @doc is nil when setup failed; do not mask that failure with a
    # NoMethodError raised from here.
    @doc.close if @doc
  end

  def test_document
    assert_match %r{ 色は匂へど \s+ 散りぬるを }xm, @doc.document, "document"
  end

  def test_header
    assert_match %r{ いろは歌 }xm, @doc.header, "header"
  end

  def test_footnote
    assert_match %r{ いろはにほへとちりぬるを }xm, @doc.footnote, "footnote"
  end
end
|
data/test/test_io.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'test/unit'
|
3
|
+
begin
|
4
|
+
require 'redgreen'
|
5
|
+
rescue LoadError
|
6
|
+
end
|
7
|
+
|
8
|
+
require 'msworddoc-extractor'
|
9
|
+
require 'stringio'
|
10
|
+
|
11
|
+
# Checks that a document can be loaded from IO objects as well as paths.
class TestIO < Test::Unit::TestCase
  # Resolved next to this file so the suite does not depend on the
  # directory it is started from.
  FIXTURE = File.expand_path('../lorem.doc', __FILE__)

  # Loading from an open File.
  def test_fileio
    # 'rb': the fixture is binary, so text mode must not translate bytes.
    File.open(FIXTURE, 'rb') do |file|
      MSWordDoc::Extractor.load(file) do |doc|
        assert_match %r{ \A Lorem \s+ ipsum \s+ }xm, doc.document, "document"
      end
    end
  end

  # Loading from an in-memory copy of the file.
  def test_stringio
    data = File.open(FIXTURE, 'rb') { |file| file.read }

    io = StringIO.new(data, 'r')

    MSWordDoc::Extractor.load(io) do |doc|
      assert_match %r{ \A Lorem \s+ ipsum \s+ }xm, doc.document, "document"
    end

    io.close
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'test/unit'
|
3
|
+
begin
|
4
|
+
require 'redgreen'
|
5
|
+
rescue LoadError
|
6
|
+
end
|
7
|
+
|
8
|
+
require 'msworddoc-extractor'
|
9
|
+
|
10
|
+
# Extraction checks against lorem.doc: main text, header and footnote.
class TestMsworddoc < Test::Unit::TestCase
  # Resolved next to this file so the suite does not depend on the
  # directory it is started from.
  FIXTURE = File.expand_path('../lorem.doc', __FILE__)

  def setup
    @doc = MSWordDoc::Extractor.load(FIXTURE)
  end

  def teardown
    # @doc is nil when setup failed; do not mask that failure with a
    # NoMethodError raised from here.
    @doc.close if @doc
  end

  def test_document
    assert_match %r{ \A Lorem \s+ ipsum \s+ }xm, @doc.document, "document"
    assert_match %r{ \s+ Duis \s+ aute \s+ }xm, @doc.document, "document"
    assert_match %r{ \s+ id \s+ est \s+ laborum[.] }xm, @doc.document, "document"
  end

  def test_header
    assert_match %r{ \A Lorem \s+ ipsum \s+ ... }xm, @doc.header, "header"
  end

  def test_footnote
    assert_match %r{ The \s+ quick \s+ brown \s+ fox \s+ }xm, @doc.footnote, "footnote"
    assert_match %r{ \s+ jumps \s+ over \s+ the \s+ lazy \s+ dog[.] }xm, @doc.footnote, "footnote"
  end
end
|
metadata
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: msworddoc-extractor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- ITO Nobuaki
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-05-11 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: ruby-ole
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
description: Extract text contents from Microsoft Word Document.
|
31
|
+
email:
|
32
|
+
- daydream.trippers@gmail.com
|
33
|
+
executables:
|
34
|
+
- worddoc-extract
|
35
|
+
extensions: []
|
36
|
+
extra_rdoc_files: []
|
37
|
+
files:
|
38
|
+
- bin/worddoc-extract
|
39
|
+
- lib/msworddoc-extractor.rb
|
40
|
+
- lib/msworddoc/extractor.rb
|
41
|
+
- test/test_msworddoc.rb
|
42
|
+
- test/test_fareast.rb
|
43
|
+
- test/test_io.rb
|
44
|
+
- test/lorem.doc
|
45
|
+
- test/fareast.doc
|
46
|
+
homepage: ''
|
47
|
+
licenses: []
|
48
|
+
post_install_message:
|
49
|
+
rdoc_options: []
|
50
|
+
require_paths:
|
51
|
+
- lib
|
52
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - ! '>='
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: '0'
|
58
|
+
segments:
|
59
|
+
- 0
|
60
|
+
hash: 1216945514845922976
|
61
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
62
|
+
none: false
|
63
|
+
requirements:
|
64
|
+
- - ! '>='
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: '0'
|
67
|
+
segments:
|
68
|
+
- 0
|
69
|
+
hash: 1216945514845922976
|
70
|
+
requirements: []
|
71
|
+
rubyforge_project:
|
72
|
+
rubygems_version: 1.8.23
|
73
|
+
signing_key:
|
74
|
+
specification_version: 3
|
75
|
+
summary: Extract text contents from Microsoft Word Document
|
76
|
+
test_files:
|
77
|
+
- test/test_msworddoc.rb
|
78
|
+
- test/test_fareast.rb
|
79
|
+
- test/test_io.rb
|
80
|
+
- test/lorem.doc
|
81
|
+
- test/fareast.doc
|