ruby-msg 1.3.1 → 1.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,236 +0,0 @@
1
- require 'stringio'
2
- require 'strscan'
3
-
4
- require 'rtf.rb'
5
-
6
- class Msg
7
- #
8
- # = Introduction
9
- #
10
- # The +RTF+ module contains a few helper functions for dealing with rtf
11
- # in msgs: +rtfdecompr+, and <tt>rtf2html</tt>.
12
- #
13
- # Both were ported from their original C versions for simplicity's sake.
14
- #
15
- module RTF
16
# Preset contents of the decompression window used by +rtfdecompr+.
# Compressed rtf streams may contain back references into this text before
# any output has been produced, so it has to match, byte for byte, the
# dictionary the compressor started with (207 bytes). Note that the line
# break after "\blue0" is a carriage return followed by a line feed.
RTF_PREBUF = (
  "{\\rtf1\\ansi\\mac\\deff0\\deftab720{\\fonttbl;}" \
  "{\\f0\\fnil \\froman \\fswiss \\fmodern \\fscript " \
  "\\fdecor MS Sans SerifSymbolArialTimes New RomanCourier" \
  "{\\colortbl\\red0\\green0\\blue0\r\n\\par " \
  "\\pard\\plain\\f0\\fs20\\b\\i\\u\\tab\\tx"
).freeze
22
-
23
# Decompresses compressed rtf +data+, as found in the mapi property
# +PR_RTF_COMPRESSED+, and returns the rtf as a binary string.
#
# +data+ starts with a 16 byte header made of four little-endian 32 bit
# fields (compressed size, uncompressed size, magic number, crc32), which
# is followed by the payload. The magic number selects between a stored
# and a compressed payload; any other value raises a RuntimeError, and so
# does a header that is shorter than 16 bytes. The compressed size and the
# crc32 are read but not verified.
#
# Back references are resolved through a 4096 byte circular window that is
# primed with +RTF_PREBUF+, instead of by indexing into the output.
#
# Everything is handled as byte values, so the result depends neither on
# the encoding of +data+ nor on the byte order of the host.
def rtfdecompr data
  io = StringIO.new data

  # get header fields (as defined in RTFLIB.H). 'V' is little-endian on
  # every platform, whereas 'L' would follow the byte order of the host.
  header = io.read 16
  raise "Truncated compressed-RTF header" unless header && header.length == 16
  _compr_size, uncompr_size, magic, _crc32 = header.unpack 'V4'

  case magic
  when 0x414c454d # magic number that identifies the stream as an uncompressed stream
    io.read(uncompr_size) || String.new
  when 0x75465a4c # magic number that identifies the stream as a compressed stream
    window_size = 4096
    window = RTF_PREBUF.unpack 'C*'
    wp = window.length
    window.concat [0] * (window_size - wp)
    out = []
    flags = 0
    item = 0
    while out.length < uncompr_size && !io.eof?
      # each flag byte flags the next 8 literals/references, 1 per bit,
      # starting with the lowest bit
      flags = (item % 8).zero? ? io.getbyte : flags >> 1
      item += 1
      if (flags & 1) == 1 # each flag bit is 1 for reference, 0 for literal
        hi = io.getbyte
        lo = io.getbyte
        # the stream stops in the middle of a reference
        break unless hi && lo
        # offset is a 12 bit number. 2^12 is 4096, so it always falls
        # inside the window
        rp = (hi << 4) | (lo >> 4)
        # a reference to the current write position marks the end of the
        # compressed data
        break if rp == wp
        # the low 4 bits hold the number of bytes to copy, less 2. source
        # and destination may overlap, hence the byte by byte copy.
        ((lo & 0xf) + 2).times do
          out << (window[wp] = window[rp])
          wp = (wp + 1) % window_size
          rp = (rp + 1) % window_size
        end
      else
        byte = io.getbyte
        break unless byte
        out << (window[wp] = byte)
        wp = (wp + 1) % window_size
      end
    end
    out.pack 'C*'
  else # unknown magic number
    raise "Unknown compression type (magic number 0x%08x)" % magic
  end
end
72
-
73
- =begin
74
- # = RTF/HTML functions
75
- #
76
- # Sometimes in MAPI, the PR_BODY_HTML property contains the HTML of a message.
77
- # But more usually, the HTML is encoded inside the RTF body (which you get in the
78
- # PR_RTF_COMPRESSED property). These routines concern the decoding of the HTML
79
- # from this RTF body.
80
- #
81
- # An encoded htmlrtf file is a valid RTF document, but which contains additional
82
- # html markup information in its comments, and sometimes contains the equivalent
83
- # rtf markup outside the comments. Therefore, when it is displayed by a plain
84
- # simple RTF reader, the html comments are ignored and only the rtf markup has
85
- # effect. Typically, this rtf markup is not as rich as the html markup would have been.
86
- # But for an html-aware reader (such as the code below), we can ignore all the
87
- # rtf markup, and extract the html markup out of the comments, and get a valid
88
- # html document.
89
- #
90
- # There are actually two kinds of html markup in comments. Most of them are
91
- # prefixed by "\*\htmltagNNN", for some number NNN. But sometimes there's one
92
- # prefixed by "\*\mhtmltagNNN" followed by "\*\htmltagNNN". In this case,
93
- # the two are equivalent, but the m-tag is for a MIME Multipart/Mixed Message
94
- # and contains tags that refer to content-ids (e.g. img src="cid:072344a7")
95
- # while the normal tag just refers to a name (e.g. img src="fred.jpg")
96
- # The code below keeps the m-tag and discards the normal tag.
97
- # If there are any m-tags like this, then the message also contains an
98
- # attachment with a PR_CONTENT_ID property e.g. "072344a7". Actually,
99
- # sometimes the m-tag is e.g. img src="http://outlook/welcome.html" and the
100
- # attachment has a PR_CONTENT_LOCATION "http://outlook/welcome.html" instead
101
- # of a PR_CONTENT_ID.
102
- #
103
- # This code is experimental. It works on my own message archive, of about
104
- # a thousand html-encoded messages, received in Outlook97 and Outlook2000
105
- # and OutlookXP. But I can't guarantee that it will work on all rtf-encoded
106
- # messages. Indeed, it used to be the case that people would simply stick
107
- # {\fromhtml at the start of an html document, and } at the end, and send
108
- # this as RTF. If someone did this, then it will almost work in my function
109
- # but not quite. (Because I ignore \r and \n, and respect only \par. Thus,
110
- # any linefeeds in the erroneous encoded-html will be ignored.)
111
-
112
- # ISRTFHTML -- Given an uncompressed RTF body of the message, this
113
- # function tells you whether it encodes some html.
114
- # [in] (buf,*len) indicate the start and length of the uncompressed RTF body.
115
- # [return-value] true or false, for whether it really does encode some html
116
- bool isrtfhtml(const char *buf,unsigned int len)
117
- { // We look for the words "\fromhtml" somewhere in the file.
118
- // If the rtf encodes text rather than html, then instead
119
- // it will only find "\fromtext".
120
- const char *c;
121
- for (c=buf; c<buf+len; c++)
122
- { if (strncmp(c,"\\from",5)==0) return strncmp(c,"\\fromhtml",9)==0;
123
- }
124
- return false;
125
- }
126
-
127
-
128
- # DECODERTFHTML -- Given an uncompressed RTF body of the message,
129
- # and assuming that it contains encoded-html, this function
130
- # turns it into regular html.
131
- # [in] (buf,*len) indicate the start and length of the uncompressed RTF body.
132
- # [out] the buffer is overwritten with the HTML version, null-terminated,
133
- # and *len indicates the length of this HTML.
134
- #
135
- # Notes: (1) because of how the encoding works, the HTML version is necessarily
136
- # shorter than the encoded version. That's why it's safe for the function to
137
- # place the decoded html in the same buffer that formerly held the encoded stuff.
138
- # (2) Some messages include characters \'XX, where XX is a hexadecimal number.
139
- # This function simply converts this into ASCII. The conversion will only make
140
- # sense if the right code-page is being used. I don't know how rtf specifies which
141
- # code page it wants.
142
- # (3) By experiment, I discovered that \pntext{..} and \liN and \fi-N are RTF
143
- # markup that should be removed. There might be other RTF markup that should
144
- # also be removed. But I don't know what else.
145
- #
146
- void decodertfhtml(char *buf,unsigned int *len)
147
- { // c -- pointer to where we're reading from
148
- // d -- pointer to where we're writing to. Invariant: d<c
149
- // max -- how far we can read from (i.e. to the end of the original rtf)
150
- // ignore_tag -- stores 'N': after \mhtmlN, we will ignore the subsequent \htmlN.
151
- char *c=buf, *max=buf+*len, *d=buf; int ignore_tag=-1;
152
- // First, we skip forwards to the first \htmltag.
153
- while (c<max && strncmp(c,"{\\*\\htmltag",11)!=0) c++;
154
- //
155
- // Now work through the document. Our plan is as follows:
156
- // * Ignore { and }. These are part of RTF markup.
157
- // * Ignore \htmlrtf...\htmlrtf0. This is how RTF keeps its equivalent markup separate from the html.
158
- // * Ignore \r and \n. The real carriage returns are stored in \par tags.
159
- // * Ignore \pntext{..} and \liN and \fi-N. These are RTF junk.
160
- // * Convert \par and \tab into \r\n and \t
161
- // * Convert \'XX into the ascii character indicated by the hex number XX
162
- // * Convert \{ and \} into { and }. This is how RTF escapes its curly braces.
163
- // * When we get \*\mhtmltagN, keep the tag, but ignore the subsequent \*\htmltagN
164
- // * When we get \*\htmltagN, keep the tag as long as it isn't subsequent to a \*\mhtmltagN
165
- // * All other text should be kept as it is.
166
- =end
167
-
168
-
169
- # html encoded in rtf comments.
170
- # {\*\htmltag84 &quot;}\htmlrtf "\htmlrtf0
171
-
172
- # already generates better output than the c predecessor. eg from this chunk, where
173
- # there are tags outside of the htmlrtf ignore block.
174
- # "{\\*\\htmltag116 <br />}\\htmlrtf \\line \\htmlrtf0 \\line {\\*\\htmltag84 <a href..."
175
- # we take the approach of ignoring
176
- # all rtf tags not explicitly handled. a proper parse tree would be nicer to work with.
177
- # ruby rtf library?
178
- # check http://homepage.ntlworld.com/peterhi/rtf_tools.html
179
- # and
180
- # http://rubyforge.org/projects/ruby-rtf/
181
-
182
# Extracts the html that is encapsulated in the rtf document +rtf+.
#
# Returns the html as a string, or +nil+ if +rtf+ doesn't look like rtf
# encapsulated html: it lacks the \fromhtml control word, it contains no
# {\*\htmltag group, or the extracted text is blank.
#
# The document is walked token by token, from the first htmltag group on:
# * { and } are dropped, \{ \} and \\ yield the escaped character.
# * \par and \tab yield "\r\n" and "\t", \'XX yields the byte with hex
#   value XX.
# * \htmlrtf ... \htmlrtf0 sections and \pntext groups are skipped.
# * after a \*\mhtmltagN group, the following \*\htmltagN group with the
#   same number is skipped.
# * every other control word is thrown away, as are raw \r and \n.
# * all remaining text is copied as is.
#
# Code is a hack, but it works.
def rtf2html rtf
  scan = StringScanner.new rtf
  # require \fromhtml. is this worth keeping?
  return nil unless rtf["\\fromhtml"]
  html = ''.dup
  ignore_tag = nil
  # skip up to the first htmltag. return nil if we don't ever find one
  return nil unless scan.scan_until(/(?=\{\\\*\\htmltag)/)
  until scan.eos?
    if scan.scan(/[{}]/)
      # group delimiters carry no html
    elsif scan.scan(/\\\*\\htmltag(\d+) ?/)
      # drop the plain tag that duplicates a preceding mhtmltag
      if ignore_tag == scan[1]
        scan.scan_until(/\}/)
        ignore_tag = nil
      end
    elsif scan.scan(/\\\*\\mhtmltag(\d+) ?/)
      ignore_tag = scan[1]
    # the lookaheads keep longer control words that merely start with
    # "par" or "tab" (eg \pard) out of these 2 branches
    elsif scan.scan(/\\par(?![a-z]) ?/)
      html << "\r\n"
    elsif scan.scan(/\\tab(?![a-z]) ?/)
      html << "\t"
    elsif scan.scan(/\\'([0-9A-Fa-f]{2})/)
      html << scan[1].hex.chr
    elsif scan.scan(/\\pntext/)
      scan.scan_until(/\}/)
    elsif scan.scan(/\\htmlrtf/)
      scan.scan_until(/\\htmlrtf0 ?/)
    # a generic throw away unknown tags thing.
    # the above 2 however, are handled specially
    elsif scan.scan(/\\[a-z-]+(\d+)? ?/)
    elsif scan.scan(/[\r\n]/)
    elsif scan.scan(/\\([{}\\])/)
      html << scan[1]
    else
      # anything else is text. getch always consumes a character, so the
      # loop is guaranteed to make progress.
      html << scan.getch
    end
  end
  html.strip.empty? ? nil : html
end
232
-
233
- module_function :rtf2html, :rtfdecompr
234
- end
235
- end
236
-