ruby-msg 1.2.17

Sign up to get free protection for your applications and to get access to all the features.
data/lib/msg/rtf.rb ADDED
@@ -0,0 +1,236 @@
1
+ require 'stringio'
2
+ require 'strscan'
3
+
4
+ require 'rtf.rb'
5
+
6
+ class Msg
7
+ #
8
+ # = Introduction
9
+ #
10
+ # The +RTF+ module contains a few helper functions for dealing with rtf
11
+ # in msgs: +rtfdecompr+, and <tt>rtf2html</tt>.
12
+ #
13
+ # Both were ported from their original C versions for simplicity's sake.
14
+ #
15
+ module RTF
16
# Initial contents of the decompression dictionary for compressed rtf.
# Back references in a compressed stream may point into this text, so it
# has to match, byte for byte, what the compressor was primed with: 207
# bytes, with a CR LF pair ("\r\n", in that order) right after
# "\blue0". Frozen because it is shared, read-only data.
RTF_PREBUF =
  "{\\rtf1\\ansi\\mac\\deff0\\deftab720{\\fonttbl;}" \
  "{\\f0\\fnil \\froman \\fswiss \\fmodern \\fscript " \
  "\\fdecor MS Sans SerifSymbolArialTimes New RomanCourier" \
  "{\\colortbl\\red0\\green0\\blue0\r\n\\par " \
  "\\pard\\plain\\f0\\fs20\\b\\i\\u\\tab\\tx".freeze
22
+
23
# Decompresses compressed rtf +data+, as found in the mapi property
# +PR_RTF_COMPRESSED+. Code converted from my C version, which in turn
# was ported from Java source, in JTNEF I believe.
#
# C version was modified to use circular buffer for back references,
# instead of the optimization of the Java version to index directly into
# output buffer. This was in preparation to support streaming in a
# read/write neutral fashion.
#
# +data+ is the raw property value: a 16 byte header made of four
# little-endian 32 bit fields (compressed size, uncompressed size, magic
# number, crc32) followed by the payload. The crc32 is not verified.
#
# Returns the decoded rtf as a string of bytes. If the payload ends early,
# whatever was decoded up to that point is returned.
#
# Raises RuntimeError if the header is shorter than 16 bytes, or if the
# magic number is not one of the two known values.
def rtfdecompr data
  io = StringIO.new data

  # get header fields (as defined in RTFLIB.H). they are little-endian in
  # the stream, so unpack with 'V' rather than the native-endian 'L'.
  header = io.read 16
  raise 'Truncated compressed-RTF header' unless header && header.bytesize == 16
  _compr_size, uncompr_size, magic, _crc32 = header.unpack 'V4'

  # process the data
  case magic
  when 0x414c454d # magic number that identifies the stream as an uncompressed stream
    # read returns nil when nothing is left; normalise that to an empty string
    io.read(uncompr_size).to_s
  when 0x75465a4c # magic number that identifies the stream as a compressed stream
    # 4096 entry circular dictionary of byte values, primed with the
    # standard preload text. wp is the next position to be written.
    buf = RTF_PREBUF.unpack 'C*'
    wp = buf.length
    buf.concat [0] * (4096 - wp)
    rtf = String.new.force_encoding Encoding::BINARY

    flag_count = -1
    flags = 0
    while rtf.bytesize < uncompr_size
      # each flag byte flags 8 literals/references, 1 per bit, lowest bit first
      if (flag_count += 1) % 8 == 0
        flags = io.getbyte
        break if flags.nil? # ran out of input
      else
        flags >>= 1
      end

      if flags.odd? # each flag bit is 1 for reference, 0 for literal
        hi = io.getbyte
        lo = io.getbyte
        break if hi.nil? || lo.nil? # reference cut short by end of input
        # offset is a 12 bit number. 2^12 is 4096, so thats fine
        rp = (hi << 4) | (lo >> 4)
        # a reference to the current write position marks the end of the stream
        break if rp == wp
        # the low 4 bits hold the number of bytes to copy, less 2. copy one
        # byte at a time, as a reference may overlap what it is writing.
        ((lo & 0xf) + 2).times do
          byte = buf[rp]
          buf[wp] = byte
          rtf << byte
          wp = (wp + 1) % 4096
          rp = (rp + 1) % 4096
        end
      else
        byte = io.getbyte
        break if byte.nil? # ran out of input
        buf[wp] = byte
        rtf << byte
        wp = (wp + 1) % 4096
      end
    end
    rtf
  else # unknown magic number
    raise "Unknown compression type (magic number 0x%08x)" % magic
  end
end
72
+
73
+ =begin
74
+ # = RTF/HTML functions
75
+ #
76
+ # Sometimes in MAPI, the PR_BODY_HTML property contains the HTML of a message.
77
+ # But more usually, the HTML is encoded inside the RTF body (which you get in the
78
+ # PR_RTF_COMPRESSED property). These routines concern the decoding of the HTML
79
+ # from this RTF body.
80
+ #
81
+ # An encoded htmlrtf file is a valid RTF document, but which contains additional
82
+ # html markup information in its comments, and sometimes contains the equivalent
83
+ # rtf markup outside the comments. Therefore, when it is displayed by a plain
84
+ # simple RTF reader, the html comments are ignored and only the rtf markup has
85
+ # effect. Typically, this rtf markup is not as rich as the html markup would have been.
86
+ # But for an html-aware reader (such as the code below), we can ignore all the
87
+ # rtf markup, and extract the html markup out of the comments, and get a valid
88
+ # html document.
89
+ #
90
+ # There are actually two kinds of html markup in comments. Most of them are
91
+ # prefixed by "\*\htmltagNNN", for some number NNN. But sometimes there's one
92
+ # prefixed by "\*\mhtmltagNNN" followed by "\*\htmltagNNN". In this case,
93
+ # the two are equivalent, but the m-tag is for a MIME Multipart/Mixed Message
94
+ # and contains tags that refer to content-ids (e.g. img src="cid:072344a7")
95
+ # while the normal tag just refers to a name (e.g. img src="fred.jpg")
96
+ # The code below keeps the m-tag and discards the normal tag.
97
+ # If there are any m-tags like this, then the message also contains an
98
+ # attachment with a PR_CONTENT_ID property e.g. "072344a7". Actually,
99
+ # sometimes the m-tag is e.g. img src="http://outlook/welcome.html" and the
100
+ # attachment has a PR_CONTENT_LOCATION "http://outlook/welcome.html" instead
101
+ # of a PR_CONTENT_ID.
102
+ #
103
+ # This code is experimental. It works on my own message archive, of about
104
+ # a thousand html-encoded messages, received in Outlook97 and Outlook2000
105
+ # and OutlookXP. But I can't guarantee that it will work on all rtf-encoded
106
+ # messages. Indeed, it used to be the case that people would simply stick
107
+ # {\fromhtml at the start of an html document, and } at the end, and send
108
+ # this as RTF. If someone did this, then it will almost work in my function
109
+ # but not quite. (Because I ignore \r and \n, and respect only \par. Thus,
110
+ # any linefeeds in the erroneous encoded-html will be ignored.)
111
+
112
+ # ISRTFHTML -- Given an uncompressed RTF body of the message, this
113
+ # function tells you whether it encodes some html.
114
+ # [in] (buf,*len) indicate the start and length of the uncompressed RTF body.
115
+ # [return-value] true or false, for whether it really does encode some html
116
+ bool isrtfhtml(const char *buf,unsigned int len)
117
+ { // We look for the words "\fromhtml" somewhere in the file.
118
+ // If the rtf encodes text rather than html, then instead
119
+ // it will only find "\fromtext".
120
+ const char *c;
121
+ for (c=buf; c<buf+len; c++)
122
+ { if (strncmp(c,"\\from",5)==0) return strncmp(c,"\\fromhtml",9)==0;
123
+ }
124
+ return false;
125
+ }
126
+
127
+
128
+ # DECODERTFHTML -- Given an uncompressed RTF body of the message,
129
+ # and assuming that it contains encoded-html, this function
130
+ # turns it into regular html.
131
+ # [in] (buf,*len) indicate the start and length of the uncompressed RTF body.
132
+ # [out] the buffer is overwritten with the HTML version, null-terminated,
133
+ # and *len indicates the length of this HTML.
134
+ #
135
+ # Notes: (1) because of how the encoding works, the HTML version is necessarily
136
+ # shorter than the encoded version. That's why it's safe for the function to
137
+ # place the decoded html in the same buffer that formerly held the encoded stuff.
138
+ # (2) Some messages include characters \'XX, where XX is a hexadecimal number.
139
+ # This function simply converts this into ASCII. The conversion will only make
140
+ # sense if the right code-page is being used. I don't know how rtf specifies which
141
+ # code page it wants.
142
+ # (3) By experiment, I discovered that \pntext{..} and \liN and \fi-N are RTF
143
+ # markup that should be removed. There might be other RTF markup that should
144
+ # also be removed. But I don't know what else.
145
+ #
146
+ void decodertfhtml(char *buf,unsigned int *len)
147
+ { // c -- pointer to where we're reading from
148
+ // d -- pointer to where we're writing to. Invariant: d<c
149
+ // max -- how far we can read from (i.e. to the end of the original rtf)
150
+ // ignore_tag -- stores 'N': after \mhtmlN, we will ignore the subsequent \htmlN.
151
+ char *c=buf, *max=buf+*len, *d=buf; int ignore_tag=-1;
152
+ // First, we skip forwards to the first \htmltag.
153
+ while (c<max && strncmp(c,"{\\*\\htmltag",11)!=0) c++;
154
+ //
155
+ // Now work through the document. Our plan is as follows:
156
+ // * Ignore { and }. These are part of RTF markup.
157
+ // * Ignore \htmlrtf...\htmlrtf0. This is how RTF keeps its equivalent markup separate from the html.
158
+ // * Ignore \r and \n. The real carriage returns are stored in \par tags.
159
+ // * Ignore \pntext{..} and \liN and \fi-N. These are RTF junk.
160
+ // * Convert \par and \tab into \r\n and \t
161
+ // * Convert \'XX into the ascii character indicated by the hex number XX
162
+ // * Convert \{ and \} into { and }. This is how RTF escapes its curly braces.
163
+ // * When we get \*\mhtmltagN, keep the tag, but ignore the subsequent \*\htmltagN
164
+ // * When we get \*\htmltagN, keep the tag as long as it isn't subsequent to a \*\mhtmltagN
165
+ // * All other text should be kept as it is.
166
+ =end
167
+
168
+
169
+ # html encoded in rtf comments.
170
+ # {\*\htmltag84 &quot;}\htmlrtf "\htmlrtf0
171
+
172
+ # already generates better output than the c predecessor. eg from this chunk, where
173
+ # there are tags outside of the htmlrtf ignore block.
174
+ # "{\\*\\htmltag116 <br />}\\htmlrtf \\line \\htmlrtf0 \\line {\\*\\htmltag84 <a href..."
175
+ # we take the approach of ignoring
176
+ # all rtf tags not explicitly handled. a proper parse tree would be nicer to work with.
177
+ # ruby rtf library?
178
+ # check http://homepage.ntlworld.com/peterhi/rtf_tools.html
179
+ # and
180
+ # http://rubyforge.org/projects/ruby-rtf/
181
+
182
# Substandard conversion of the original C code.
# Test and refactor, and try to correct some inaccuracies.
#
# Extracts the html that is encapsulated in the comments of +rtf+ (an
# uncompressed rtf body, as a string).
#
# Returns the html as a string. Returns +nil+ if +rtf+ doesn't look like
# rtf encapsulated html, ie it has no "\fromhtml" marker or no
# "{\*\htmltag" group, or if the extracted html is blank.
#
# \'XX escapes are turned into the single byte XX. No code page conversion
# is attempted.
#
# Code is a hack, but it works.
def rtf2html rtf
  # require \fromhtml. is this worth keeping?
  return nil unless rtf["\\fromhtml"]
  scan = StringScanner.new rtf
  html = ''.dup
  # number of the last \mhtmltag seen; the \htmltag with the same number
  # that follows it is a duplicate and gets dropped
  ignore_tag = nil
  # skip up to the first htmltag. return nil if we don't ever find one
  return nil unless scan.scan_until(/(?=\{\\\*\\htmltag)/)
  until scan.eos?
    if scan.skip(/[{}]/)
      # unescaped braces are rtf grouping only
    elsif scan.scan(/\\\*\\htmltag(\d+) ?/)
      if ignore_tag == scan[1]
        scan.scan_until(/\}/)
        ignore_tag = nil
      end
    elsif scan.scan(/\\\*\\mhtmltag(\d+) ?/)
      ignore_tag = scan[1]
    # the (?![a-z]) guards stop a short control word from matching the
    # front of a longer one, eg \par inside \pard
    elsif scan.skip(/\\par(?![a-z]) ?/)
      html << "\r\n"
    elsif scan.skip(/\\tab(?![a-z]) ?/)
      html << "\t"
    elsif scan.scan(/\\'([0-9A-Fa-f]{2})/)
      html << scan[1].hex.chr
    elsif scan.skip(/\\pntext(?![a-z])/)
      scan.scan_until(/\}/)
    # \htmlrtf opens a run of plain rtf markup which lasts until \htmlrtf0.
    # a \htmlrtf0 on its own must not open a run, so it is left to the
    # generic rule below.
    elsif scan.skip(/\\htmlrtf(?![a-z0])/)
      scan.scan_until(/\\htmlrtf0 ?/)
    # a generic throw away unknown tags thing.
    # the above 2 however, are handled specially
    elsif scan.skip(/\\[a-z-]+(\d+)? ?/)
    elsif scan.skip(/[\r\n]/)
      # real line breaks are stored as \par
    elsif scan.scan(/\\([{}\\])/)
      html << scan[1]
    else
      # anything else is html text. getch always consumes one character, so
      # the loop is guaranteed to make progress.
      html << scan.getch
    end
  end
  html.strip.empty? ? nil : html
end
232
+
233
+ module_function :rtf2html, :rtfdecompr
234
+ end
235
+ end
236
+