ruby-msg 1.2.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/msg/rtf.rb ADDED
@@ -0,0 +1,236 @@
1
+ require 'stringio'
2
+ require 'strscan'
3
+
4
+ require 'rtf.rb'
5
+
6
+ class Msg
7
+ #
8
+ # = Introduction
9
+ #
10
+ # The +RTF+ module contains a few helper functions for dealing with rtf
11
+ # in msgs: +rtfdecompr+, and <tt>rtf2html</tt>.
12
+ #
13
+ # Both were ported from their original C versions for simplicity's sake.
14
+ #
15
+ module RTF
16
# The 207 byte dictionary that the compressed-RTF (LZFu) format preloads
# into the start of its 4096 byte circular buffer, so that streams can
# back-reference common RTF keywords before having emitted them.
#
# Note the line break after +\blue0+ is CR LF (0x0d 0x0a, at offsets 168
# and 169). Having the two bytes the other way around corrupts every
# back-reference that touches them.
#
# Frozen because it is shared by every call; callers must copy it before
# writing into it.
RTF_PREBUF = (
  "{\\rtf1\\ansi\\mac\\deff0\\deftab720{\\fonttbl;}" \
  "{\\f0\\fnil \\froman \\fswiss \\fmodern \\fscript " \
  "\\fdecor MS Sans SerifSymbolArialTimes New RomanCourier" \
  "{\\colortbl\\red0\\green0\\blue0\r\n\\par " \
  "\\pard\\plain\\f0\\fs20\\b\\i\\u\\tab\\tx"
).freeze
22
+
23
+ # Decompresses compressed rtf +data+, as found in the mapi property
24
+ # +PR_RTF_COMPRESSED+. Code converted from my C version, which in turn
25
+ # was ported from Java source, in JTNEF I believe.
26
+ #
27
+ # C version was modified to use circular buffer for back references,
28
+ # instead of the optimization of the Java version to index directly into
29
+ # output buffer. This was in preparation to support streaming in a
30
+ # read/write neutral fashion.
31
+ def rtfdecompr data
32
+ io = StringIO.new data
33
+ buf = RTF_PREBUF + "\x00" * (4096 - RTF_PREBUF.length)
34
+ wp = RTF_PREBUF.length
35
+ rtf = ''
36
+
37
+ # get header fields (as defined in RTFLIB.H)
38
+ compr_size, uncompr_size, magic, crc32 = io.read(16).unpack 'L*'
39
+ #warn "compressed-RTF data size mismatch" unless io.size == data.compr_size + 4
40
+
41
+ # process the data
42
+ case magic
43
+ when 0x414c454d # magic number that identifies the stream as a uncompressed stream
44
+ rtf = io.read uncompr_size
45
+ when 0x75465a4c # magic number that identifies the stream as a compressed stream
46
+ flag_count = -1
47
+ flags = nil
48
+ while rtf.length < uncompr_size and !io.eof?
49
+ #p [rtf.length, uncompr_size]
50
+ # each flag byte flags 8 literals/references, 1 per bit
51
+ flags = ((flag_count += 1) % 8 == 0) ? io.getc : flags >> 1
52
+ if 1 == (flags & 1) # each flag bit is 1 for reference, 0 for literal
53
+ rp, l = io.getc, io.getc
54
+ # offset is a 12 byte number. 2^12 is 4096, so thats fine
55
+ rp = (rp << 4) | (l >> 4) # the offset relative to block start
56
+ l = (l & 0xf) + 2 # the number of bytes to copy
57
+ l.times do
58
+ rtf << (buf[wp] = buf[rp])
59
+ wp = (wp + 1) % 4096
60
+ rp = (rp + 1) % 4096
61
+ end
62
+ else
63
+ rtf << (buf[wp] = io.getc)
64
+ wp = (wp + 1) % 4096
65
+ end
66
+ end
67
+ else # unknown magic number
68
+ raise "Unknown compression type (magic number 0x%08x)" % magic
69
+ end
70
+ rtf
71
+ end
72
+
73
+ =begin
74
+ # = RTF/HTML functions
75
+ #
76
+ # Sometimes in MAPI, the PR_BODY_HTML property contains the HTML of a message.
77
+ # But more usually, the HTML is encoded inside the RTF body (which you get in the
78
+ # PR_RTF_COMPRESSED property). These routines concern the decoding of the HTML
79
+ # from this RTF body.
80
+ #
81
+ # An encoded htmlrtf file is a valid RTF document, but which contains additional
82
+ # html markup information in its comments, and sometimes contains the equivalent
83
+ # rtf markup outside the comments. Therefore, when it is displayed by a plain
84
+ # simple RTF reader, the html comments are ignored and only the rtf markup has
85
+ # effect. Typically, this rtf markup is not as rich as the html markup would have been.
86
+ # But for an html-aware reader (such as the code below), we can ignore all the
87
+ # rtf markup, and extract the html markup out of the comments, and get a valid
88
+ # html document.
89
+ #
90
+ # There are actually two kinds of html markup in comments. Most of them are
91
+ # prefixed by "\*\htmltagNNN", for some number NNN. But sometimes there's one
92
+ # prefixed by "\*\mhtmltagNNN" followed by "\*\htmltagNNN". In this case,
93
+ # the two are equivalent, but the m-tag is for a MIME Multipart/Mixed Message
94
+ # and contains tags that refer to content-ids (e.g. img src="cid:072344a7")
95
+ # while the normal tag just refers to a name (e.g. img src="fred.jpg")
96
+ # The code below keeps the m-tag and discards the normal tag.
97
+ # If there are any m-tags like this, then the message also contains an
98
+ # attachment with a PR_CONTENT_ID property e.g. "072344a7". Actually,
99
+ # sometimes the m-tag is e.g. img src="http://outlook/welcome.html" and the
100
+ # attachment has a PR_CONTENT_LOCATION "http://outlook/welcome.html" instead
101
+ # of a PR_CONTENT_ID.
102
+ #
103
+ # This code is experimental. It works on my own message archive, of about
104
+ # a thousand html-encoded messages, received in Outlook97 and Outlook2000
105
+ # and OutlookXP. But I can't guarantee that it will work on all rtf-encoded
106
+ # messages. Indeed, it used to be the case that people would simply stick
107
+ # {\fromhtml at the start of an html document, and } at the end, and send
108
+ # this as RTF. If someone did this, then it will almost work in my function
109
+ # but not quite. (Because I ignore \r and \n, and respect only \par. Thus,
110
+ # any linefeeds in the erroneous encoded-html will be ignored.)
111
+
112
+ # ISRTFHTML -- Given an uncompressed RTF body of the message, this
113
+ # function tells you whether it encodes some html.
114
+ # [in] (buf,*len) indicate the start and length of the uncompressed RTF body.
115
+ # [return-value] true or false, for whether it really does encode some html
116
+ bool isrtfhtml(const char *buf,unsigned int len)
117
+ { // We look for the words "\fromhtml" somewhere in the file.
118
+ // If the rtf encodes text rather than html, then instead
119
+ // it will only find "\fromtext".
120
+ const char *c;
121
+ for (c=buf; c<buf+len; c++)
122
+ { if (strncmp(c,"\\from",5)==0) return strncmp(c,"\\fromhtml",9)==0;
123
+ }
124
+ return false;
125
+ }
126
+
127
+
128
+ # DECODERTFHTML -- Given an uncompressed RTF body of the message,
129
+ # and assuming that it contains encoded-html, this function
130
+ # turns it into regular html.
131
+ # [in] (buf,*len) indicate the start and length of the uncompressed RTF body.
132
+ # [out] the buffer is overwritten with the HTML version, null-terminated,
133
+ # and *len indicates the length of this HTML.
134
+ #
135
+ # Notes: (1) because of how the encoding works, the HTML version is necessarily
136
+ # shorter than the encoded version. That's why it's safe for the function to
137
+ # place the decoded html in the same buffer that formerly held the encoded stuff.
138
+ # (2) Some messages include characters \'XX, where XX is a hexadecimal number.
139
+ # This function simply converts this into ASCII. The conversion will only make
140
+ # sense if the right code-page is being used. I don't know how rtf specifies which
141
+ # code page it wants.
142
+ # (3) By experiment, I discovered that \pntext{..} and \liN and \fi-N are RTF
143
+ # markup that should be removed. There might be other RTF markup that should
144
+ # also be removed. But I don't know what else.
145
+ #
146
+ void decodertfhtml(char *buf,unsigned int *len)
147
+ { // c -- pointer to where we're reading from
148
+ // d -- pointer to where we're writing to. Invariant: d<c
149
+ // max -- how far we can read from (i.e. to the end of the original rtf)
150
+ // ignore_tag -- stores 'N': after \mhtmlN, we will ignore the subsequent \htmlN.
151
+ char *c=buf, *max=buf+*len, *d=buf; int ignore_tag=-1;
152
+ // First, we skip forwards to the first \htmltag.
153
+ while (c<max && strncmp(c,"{\\*\\htmltag",11)!=0) c++;
154
+ //
155
+ // Now work through the document. Our plan is as follows:
156
+ // * Ignore { and }. These are part of RTF markup.
157
+ // * Ignore \htmlrtf...\htmlrtf0. This is how RTF keeps its equivalent markup separate from the html.
158
+ // * Ignore \r and \n. The real carriage returns are stored in \par tags.
159
+ // * Ignore \pntext{..} and \liN and \fi-N. These are RTF junk.
160
+ // * Convert \par and \tab into \r\n and \t
161
+ // * Convert \'XX into the ascii character indicated by the hex number XX
162
+ // * Convert \{ and \} into { and }. This is how RTF escapes its curly braces.
163
+ // * When we get \*\mhtmltagN, keep the tag, but ignore the subsequent \*\htmltagN
164
+ // * When we get \*\htmltagN, keep the tag as long as it isn't subsequent to a \*\mhtmltagN
165
+ // * All other text should be kept as it is.
166
+ =end
167
+
168
+
169
+ # html encoded in rtf comments.
170
+ # {\*\htmltag84 &quot;}\htmlrtf "\htmlrtf0
171
+
172
+ # already generates better output than the c predecessor. eg from this chunk, where
173
+ # there are tags outside of the htmlrtf ignore block.
174
+ # "{\\*\\htmltag116 <br />}\\htmlrtf \\line \\htmlrtf0 \\line {\\*\\htmltag84 <a href..."
175
+ # we take the approach of ignoring
176
+ # all rtf tags not explicitly handled. a proper parse tree would be nicer to work with.
177
+ # ruby rtf library?
178
+ # check http://homepage.ntlworld.com/peterhi/rtf_tools.html
179
+ # and
180
+ # http://rubyforge.org/projects/ruby-rtf/
181
+
182
+ # Substandard conversion of the original C code.
183
+ # Test and refactor, and try to correct some inaccuracies.
184
+ # Returns +nil+ if it doesn't look like an rtf encapsulated html.
185
+ #
186
+ # Code is a hack, but it works.
187
# Extracts the html that is encapsulated in the comments of +rtf+.
#
# Returns the html as a string, or +nil+ when +rtf+ has no "\fromhtml"
# marker, contains no "{\*\htmltag" group, or yields only whitespace.
#
# The text is walked once from the first htmltag group onwards:
# * bare { and } are rtf grouping, and are dropped, as are raw \r and \n.
# * \*\htmltagN and \*\mhtmltagN markers are dropped but their content is
#   kept, except that the \*\htmltagN group following a \*\mhtmltagN with
#   the same N is skipped entirely.
# * \par becomes "\r\n", \tab becomes "\t", \'XX becomes the byte 0xXX and
#   \{ \} \\ become the bare character.
# * \pntext groups and \htmlrtf ... \htmlrtf0 runs are skipped.
# * any other control word is thrown away; all other text is kept.
#
# NOTE(review): \'XX is emitted as a raw byte with no code page conversion,
# so mixing it with multi-byte text in +rtf+ may not give a consistently
# encoded result.
def rtf2html rtf
  # require \fromhtml. is this worth keeping?
  return nil unless rtf["\\fromhtml"]

  scan = StringScanner.new rtf
  html = ''.dup
  ignore_tag = nil

  # skip up to the first htmltag. return nil if we don't ever find one
  return nil unless scan.scan_until(/(?=\{\\\*\\htmltag)/)

  until scan.eos?
    if scan.scan(/[{}]/)
      # rtf grouping, nothing to emit
    elsif scan.scan(/\\\*\\htmltag(\d+) ?/)
      # the plain tag duplicates the m-tag that preceded it, so drop its body
      if ignore_tag == scan[1]
        scan.scan_until(/\}/)
        ignore_tag = nil
      end
    elsif scan.scan(/\\\*\\mhtmltag(\d+) ?/)
      ignore_tag = scan[1]
    elsif scan.scan(/\\par(?![a-z]) ?/)
      # the lookahead keeps longer control words such as \pard from being
      # mistaken for \par followed by text
      html << "\r\n"
    elsif scan.scan(/\\tab(?![a-z]) ?/)
      html << "\t"
    elsif scan.scan(/\\'([0-9A-Fa-f]{2})/)
      html << scan[1].hex.chr
    elsif scan.scan(/\\pntext/)
      scan.scan_until(/\}/)
    elsif scan.scan(/\\htmlrtf0 ?/)
      # a terminator with no matching opener. must be tested before the
      # opener below, which would otherwise match it and swallow real html
    elsif scan.scan(/\\htmlrtf/)
      scan.scan_until(/\\htmlrtf0 ?/)
    elsif scan.scan(/\\[a-z-]+(\d+)? ?/)
      # a generic throw away unknown tags thing.
    elsif scan.scan(/[\r\n]/)
      # real line breaks are carried by \par
    elsif scan.scan(/\\([{}\\])/)
      html << scan[1]
    else
      # everything else is content. getch always consumes a character, so
      # the loop is guaranteed to make progress
      html << scan.getch
    end
  end

  html.strip.empty? ? nil : html
end
232
+
233
+ # make both helpers module functions, so they can be called on the module itself
+ module_function :rtf2html, :rtfdecompr
234
+ end
235
+ end
236
+