ruby-msg 1.2.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/FIXES +34 -0
- data/README +121 -0
- data/Rakefile +66 -0
- data/bin/msgtool +63 -0
- data/bin/oletool +35 -0
- data/data/mapitags.yaml +4168 -0
- data/data/named_map.yaml +114 -0
- data/data/types.yaml +15 -0
- data/lib/blah.rb +106 -0
- data/lib/mime-new.rb +210 -0
- data/lib/mime.rb +165 -0
- data/lib/msg/properties.rb +515 -0
- data/lib/msg/rtf.rb +236 -0
- data/lib/msg.rb +505 -0
- data/lib/ole/base.rb +5 -0
- data/lib/ole/file_system.rb +181 -0
- data/lib/ole/io_helpers.rb +184 -0
- data/lib/ole/storage.rb +927 -0
- data/lib/ole/types.rb +36 -0
- data/lib/orderedhash.rb +218 -0
- data/lib/rtf.rb +118 -0
- data/lib/support.rb +51 -0
- data/test/test_mime.rb +22 -0
- data/test/test_storage.rb +139 -0
- data/test/test_word_6.doc +0 -0
- data/test/test_word_95.doc +0 -0
- data/test/test_word_97.doc +0 -0
- metadata +73 -0
data/lib/msg/rtf.rb
ADDED
@@ -0,0 +1,236 @@
|
|
1
|
+
require 'stringio'
|
2
|
+
require 'strscan'
|
3
|
+
|
4
|
+
require 'rtf.rb'
|
5
|
+
|
6
|
+
class Msg
|
7
|
+
#
|
8
|
+
# = Introduction
|
9
|
+
#
|
10
|
+
# The +RTF+ module contains a few helper functions for dealing with rtf
|
11
|
+
# in msgs: +rtfdecompr+, and <tt>rtf2html</tt>.
|
12
|
+
#
|
13
|
+
# Both were ported from their original C versions for simplicity's sake.
|
14
|
+
#
|
15
|
+
module RTF
|
16
|
+
# Initial contents of the 4096 byte decompression dictionary. Compressed
# streams may contain back references into this text before any real data
# has been written, so it has to match the compressor's copy byte for byte
# (note the CR LF pair after <tt>\blue0</tt>).
RTF_PREBUF =
  "{\\rtf1\\ansi\\mac\\deff0\\deftab720{\\fonttbl;}" \
  "{\\f0\\fnil \\froman \\fswiss \\fmodern \\fscript " \
  "\\fdecor MS Sans SerifSymbolArialTimes New RomanCourier" \
  "{\\colortbl\\red0\\green0\\blue0\r\n\\par " \
  "\\pard\\plain\\f0\\fs20\\b\\i\\u\\tab\\tx"

# Decompresses compressed rtf +data+, as found in the mapi property
# +PR_RTF_COMPRESSED+. Code converted from my C version, which in turn
# was ported from Java source, in JTNEF I believe.
#
# C version was modified to use circular buffer for back references,
# instead of the optimization of the Java version to index directly into
# output buffer. This was in preparation to support streaming in a
# read/write neutral fashion.
#
# +data+ is the raw property value: a 16 byte header of four little-endian
# 32 bit fields (compressed size, uncompressed size, magic, crc32) followed
# by the payload. The sizes other than the uncompressed size, and the crc,
# are not verified.
#
# Returns the rtf as a binary string. A payload that ends early yields
# whatever could be decoded up to that point.
#
# Raises a RuntimeError if the header is shorter than 16 bytes or if the
# magic number is not one of the two known values.
def rtfdecompr data
  io = StringIO.new data
  dict_size = 4096

  # get header fields (as defined in RTFLIB.H). they are stored
  # little-endian regardless of the host byte order.
  header = io.read 16
  raise "Truncated compressed-RTF header" unless header && header.length == 16
  _compr_size, uncompr_size, magic, _crc32 = header.unpack 'V4'

  # process the data
  case magic
  when 0x414c454d # magic number that identifies the stream as an uncompressed stream
    # IO#read gives nil when nothing at all follows the header
    io.read(uncompr_size) || String.new
  when 0x75465a4c # magic number that identifies the stream as a compressed stream
    # the dictionary and the output are kept as arrays of byte values, so
    # the bit twiddling below doesn't depend on string encodings.
    dict = RTF_PREBUF.unpack 'C*'
    wp = dict.length
    dict.concat [0] * (dict_size - wp)
    out = []
    flag_count = -1
    flags = nil
    while out.length < uncompr_size && !io.eof?
      # each flag byte flags 8 literals/references, 1 per bit
      flags = ((flag_count += 1) % 8 == 0) ? io.getbyte : flags >> 1
      if (flags & 1) == 1 # each flag bit is 1 for reference, 0 for literal
        hi = io.getbyte
        lo = io.getbyte
        break unless hi && lo # stream ended in the middle of a reference
        # offset is a 12 bit number. 2^12 is 4096, so thats fine
        rp = (hi << 4) | (lo >> 4) # the offset relative to block start
        # a reference to the current write position marks the end of the stream
        break if rp == wp
        ((lo & 0xf) + 2).times do # the number of bytes to copy
          out << (dict[wp] = dict[rp])
          wp = (wp + 1) % dict_size
          rp = (rp + 1) % dict_size
        end
      else
        byte = io.getbyte
        break unless byte # stream ended right after a flag byte
        out << (dict[wp] = byte)
        wp = (wp + 1) % dict_size
      end
    end
    out.pack 'C*'
  else # unknown magic number
    raise "Unknown compression type (magic number 0x%08x)" % magic
  end
end
|
72
|
+
|
73
|
+
=begin
|
74
|
+
# = RTF/HTML functions
|
75
|
+
#
|
76
|
+
# Sometimes in MAPI, the PR_BODY_HTML property contains the HTML of a message.
|
77
|
+
# But more usually, the HTML is encoded inside the RTF body (which you get in the
|
78
|
+
# PR_RTF_COMPRESSED property). These routines concern the decoding of the HTML
|
79
|
+
# from this RTF body.
|
80
|
+
#
|
81
|
+
# An encoded htmlrtf file is a valid RTF document, but which contains additional
|
82
|
+
# html markup information in its comments, and sometimes contains the equivalent
|
83
|
+
# rtf markup outside the comments. Therefore, when it is displayed by a plain
|
84
|
+
# simple RTF reader, the html comments are ignored and only the rtf markup has
|
85
|
+
# effect. Typically, this rtf markup is not as rich as the html markup would have been.
|
86
|
+
# But for an html-aware reader (such as the code below), we can ignore all the
|
87
|
+
# rtf markup, and extract the html markup out of the comments, and get a valid
|
88
|
+
# html document.
|
89
|
+
#
|
90
|
+
# There are actually two kinds of html markup in comments. Most of them are
|
91
|
+
# prefixed by "\*\htmltagNNN", for some number NNN. But sometimes there's one
|
92
|
+
# prefixed by "\*\mhtmltagNNN" followed by "\*\htmltagNNN". In this case,
|
93
|
+
# the two are equivalent, but the m-tag is for a MIME Multipart/Mixed Message
|
94
|
+
# and contains tags that refer to content-ids (e.g. img src="cid:072344a7")
|
95
|
+
# while the normal tag just refers to a name (e.g. img src="fred.jpg")
|
96
|
+
# The code below keeps the m-tag and discards the normal tag.
|
97
|
+
# If there are any m-tags like this, then the message also contains an
|
98
|
+
# attachment with a PR_CONTENT_ID property e.g. "072344a7". Actually,
|
99
|
+
# sometimes the m-tag is e.g. img src="http://outlook/welcome.html" and the
|
100
|
+
# attachment has a PR_CONTENT_LOCATION "http://outlook/welcome.html" instead
|
101
|
+
# of a PR_CONTENT_ID.
|
102
|
+
#
|
103
|
+
# This code is experimental. It works on my own message archive, of about
|
104
|
+
# a thousand html-encoded messages, received in Outlook97 and Outlook2000
|
105
|
+
# and OutlookXP. But I can't guarantee that it will work on all rtf-encoded
|
106
|
+
# messages. Indeed, it used to be the case that people would simply stick
|
107
|
+
# {\fromhtml at the start of an html document, and } at the end, and send
|
108
|
+
# this as RTF. If someone did this, then it will almost work in my function
|
109
|
+
# but not quite. (Because I ignore \r and \n, and respect only \par. Thus,
|
110
|
+
# any linefeeds in the erroneous encoded-html will be ignored.)
|
111
|
+
|
112
|
+
# ISRTFHTML -- Given an uncompressed RTF body of the message, this
|
113
|
+
# function tells you whether it encodes some html.
|
114
|
+
# [in] (buf,*len) indicate the start and length of the uncompressed RTF body.
|
115
|
+
# [return-value] true or false, for whether it really does encode some html
|
116
|
+
bool isrtfhtml(const char *buf,unsigned int len)
|
117
|
+
{ // We look for the words "\fromhtml" somewhere in the file.
|
118
|
+
// If the rtf encodes text rather than html, then instead
|
119
|
+
// it will only find "\fromtext".
|
120
|
+
const char *c;
|
121
|
+
for (c=buf; c<buf+len; c++)
|
122
|
+
{ if (strncmp(c,"\\from",5)==0) return strncmp(c,"\\fromhtml",9)==0;
|
123
|
+
}
|
124
|
+
return false;
|
125
|
+
}
|
126
|
+
|
127
|
+
|
128
|
+
# DECODERTFHTML -- Given an uncompressed RTF body of the message,
|
129
|
+
# and assuming that it contains encoded-html, this function
|
130
|
+
# turns it into regular html.
|
131
|
+
# [in] (buf,*len) indicate the start and length of the uncompressed RTF body.
|
132
|
+
# [out] the buffer is overwritten with the HTML version, null-terminated,
|
133
|
+
# and *len indicates the length of this HTML.
|
134
|
+
#
|
135
|
+
# Notes: (1) because of how the encoding works, the HTML version is necessarily
|
136
|
+
# shorter than the encoded version. That's why it's safe for the function to
|
137
|
+
# place the decoded html in the same buffer that formerly held the encoded stuff.
|
138
|
+
# (2) Some messages include characters \'XX, where XX is a hexadecimal number.
|
139
|
+
# This function simply converts this into ASCII. The conversion will only make
|
140
|
+
# sense if the right code-page is being used. I don't know how rtf specifies which
|
141
|
+
# code page it wants.
|
142
|
+
# (3) By experiment, I discovered that \pntext{..} and \liN and \fi-N are RTF
|
143
|
+
# markup that should be removed. There might be other RTF markup that should
|
144
|
+
# also be removed. But I don't know what else.
|
145
|
+
#
|
146
|
+
void decodertfhtml(char *buf,unsigned int *len)
|
147
|
+
{ // c -- pointer to where we're reading from
|
148
|
+
// d -- pointer to where we're writing to. Invariant: d<c
|
149
|
+
// max -- how far we can read from (i.e. to the end of the original rtf)
|
150
|
+
// ignore_tag -- stores 'N': after \mhtmlN, we will ignore the subsequent \htmlN.
|
151
|
+
char *c=buf, *max=buf+*len, *d=buf; int ignore_tag=-1;
|
152
|
+
// First, we skip forwards to the first \htmltag.
|
153
|
+
while (c<max && strncmp(c,"{\\*\\htmltag",11)!=0) c++;
|
154
|
+
//
|
155
|
+
// Now work through the document. Our plan is as follows:
|
156
|
+
// * Ignore { and }. These are part of RTF markup.
|
157
|
+
// * Ignore \htmlrtf...\htmlrtf0. This is how RTF keeps its equivalent markup separate from the html.
|
158
|
+
// * Ignore \r and \n. The real carriage returns are stored in \par tags.
|
159
|
+
// * Ignore \pntext{..} and \liN and \fi-N. These are RTF junk.
|
160
|
+
// * Convert \par and \tab into \r\n and \t
|
161
|
+
// * Convert \'XX into the ascii character indicated by the hex number XX
|
162
|
+
// * Convert \{ and \} into { and }. This is how RTF escapes its curly braces.
|
163
|
+
// * When we get \*\mhtmltagN, keep the tag, but ignore the subsequent \*\htmltagN
|
164
|
+
// * When we get \*\htmltagN, keep the tag as long as it isn't subsequent to a \*\mhtmltagN
|
165
|
+
// * All other text should be kept as it is.
|
166
|
+
=end
|
167
|
+
|
168
|
+
|
169
|
+
# html encoded in rtf comments.
|
170
|
+
# {\*\htmltag84 "}\htmlrtf "\htmlrtf0
|
171
|
+
|
172
|
+
# already generates better output than the c predecessor. eg from this chunk, where
|
173
|
+
# there are tags outside of the htmlrtf ignore block.
|
174
|
+
# "{\\*\\htmltag116 <br />}\\htmlrtf \\line \\htmlrtf0 \\line {\\*\\htmltag84 <a href..."
|
175
|
+
# we take the approach of ignoring
|
176
|
+
# all rtf tags not explicitly handled. a proper parse tree would be nicer to work with.
|
177
|
+
# ruby rtf library?
|
178
|
+
# check http://homepage.ntlworld.com/peterhi/rtf_tools.html
|
179
|
+
# and
|
180
|
+
# http://rubyforge.org/projects/ruby-rtf/
|
181
|
+
|
182
|
+
# Extracts the html document encapsulated in the (uncompressed) rtf body
# +rtf+. Loosely converted from the original C code.
#
# Returns the html as a string, or +nil+ if +rtf+ doesn't look like rtf
# encapsulated html: it has no <tt>\fromhtml</tt> marker, it has no
# <tt>\htmltag</tt> group, or nothing but whitespace was recovered.
#
# Everything before the first <tt>{\*\htmltag</tt> is skipped. After that:
#
# * unescaped <tt>{</tt> and <tt>}</tt>, and raw CR / LF, are dropped.
# * <tt>\par</tt> becomes CR LF and <tt>\tab</tt> becomes a tab.
# * <tt>\'XX</tt> becomes the single byte with hex value XX. No code page
#   conversion is attempted.
# * <tt>\{</tt>, <tt>\}</tt> and <tt>\\</tt> become the bare character.
# * text between <tt>\htmlrtf</tt> and <tt>\htmlrtf0</tt>, and a
#   <tt>\pntext</tt> group, is discarded.
# * after <tt>\*\mhtmltagN</tt>, the content of the next
#   <tt>\*\htmltagN</tt> with the same N is discarded.
# * any other control word is thrown away, and any other text is kept.
#
# Code is a hack, but it works.
def rtf2html rtf
  # require \fromhtml. is this worth keeping?
  return nil unless rtf["\\fromhtml"]
  scan = StringScanner.new rtf
  html = ''.dup
  ignore_tag = nil
  # skip up to the first htmltag. return nil if we don't ever find one
  return nil unless scan.scan_until(/(?=\{\\\*\\htmltag)/)
  # a control word runs up to the first non-letter, hence the (?![a-z])
  # guards below: without them \par would also match the start of \pard.
  until scan.eos?
    if scan.scan(/[{}]/)
      # rtf grouping only, not part of the html
    elsif scan.scan(/\\\*\\htmltag(\d+) ?/)
      if ignore_tag == scan[1]
        # duplicate of the preceding mhtmltag, drop its content
        scan.scan_until(/\}/)
        ignore_tag = nil
      end
    elsif scan.scan(/\\\*\\mhtmltag(\d+) ?/)
      ignore_tag = scan[1]
    elsif scan.scan(/\\par(?![a-z]) ?/)
      html << "\r\n"
    elsif scan.scan(/\\tab(?![a-z]) ?/)
      html << "\t"
    elsif scan.scan(/\\'([0-9A-Fa-f]{2})/)
      html << scan[1].hex.chr
    elsif scan.scan(/\\pntext(?![a-z])/)
      scan.scan_until(/\}/)
    elsif scan.scan(/\\htmlrtf0(?!\d) ?/)
      # terminator without an opener, nothing to skip
    elsif scan.scan(/\\htmlrtf(?![a-z])/)
      scan.scan_until(/\\htmlrtf0 ?/)
    # a generic throw away unknown tags thing.
    # the above 2 however, are handled specially
    elsif scan.scan(/\\[a-z-]+(\d+)? ?/)
    elsif scan.scan(/[\r\n]/)
    elsif scan.scan(/\\([{}\\])/)
      html << scan[1]
    else
      # plain text. always consumes a character, so the loop can't stall
      html << scan.getch
    end
  end
  html.strip.empty? ? nil : html
end
|
232
|
+
|
233
|
+
module_function :rtf2html, :rtfdecompr
|
234
|
+
end
|
235
|
+
end
|
236
|
+
|