ruby-msg 1.3.1 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +108 -113
- data/Rakefile +42 -28
- data/bin/mapitool +195 -0
- data/lib/mapi.rb +109 -0
- data/lib/mapi/convert.rb +61 -0
- data/lib/mapi/convert/contact.rb +142 -0
- data/lib/mapi/convert/note-mime.rb +274 -0
- data/lib/mapi/convert/note-tmail.rb +287 -0
- data/lib/mapi/msg.rb +440 -0
- data/lib/mapi/property_set.rb +269 -0
- data/lib/mapi/pst.rb +1806 -0
- data/lib/mapi/rtf.rb +169 -0
- data/lib/mapi/types.rb +51 -0
- data/lib/rtf.rb +0 -9
- data/test/test_convert_contact.rb +60 -0
- data/test/test_convert_note.rb +66 -0
- data/test/test_mime.rb +4 -2
- data/test/test_msg.rb +29 -0
- data/test/test_property_set.rb +116 -0
- data/test/test_types.rb +17 -0
- metadata +78 -48
- data/bin/msgtool +0 -65
- data/lib/msg.rb +0 -522
- data/lib/msg/properties.rb +0 -532
- data/lib/msg/rtf.rb +0 -236
data/lib/msg/rtf.rb
DELETED
@@ -1,236 +0,0 @@
|
|
1
|
-
require 'stringio'
|
2
|
-
require 'strscan'
|
3
|
-
|
4
|
-
require 'rtf.rb'
|
5
|
-
|
6
|
-
class Msg
  #
  # = Introduction
  #
  # The +RTF+ module contains a few helper functions for dealing with rtf
  # in msgs: +rtfdecompr+, and +rtf2html+.
  #
  # Both were ported from their original C versions for simplicity's sake.
  #
  module RTF
    # Initial contents of the 4096 byte LZFu dictionary. Back references in
    # a compressed stream may point into this text, so it has to match the
    # compressor's dictionary byte for byte. Note the CR LF pair (in that
    # order) after "\blue0".
    RTF_PREBUF =
      "{\\rtf1\\ansi\\mac\\deff0\\deftab720{\\fonttbl;}" \
      "{\\f0\\fnil \\froman \\fswiss \\fmodern \\fscript " \
      "\\fdecor MS Sans SerifSymbolArialTimes New RomanCourier" \
      "{\\colortbl\\red0\\green0\\blue0\r\n\\par " \
      "\\pard\\plain\\f0\\fs20\\b\\i\\u\\tab\\tx"

    # Decompresses compressed rtf +data+, as found in the mapi property
    # +PR_RTF_COMPRESSED+. Code converted from my C version, which in turn
    # was ported from Java source, in JTNEF I believe.
    #
    # C version was modified to use circular buffer for back references,
    # instead of the optimization of the Java version to index directly into
    # output buffer. This was in preparation to support streaming in a
    # read/write neutral fashion.
    #
    # +data+ is the raw property value: a 16 byte header of four
    # little-endian 32 bit fields (compressed size, uncompressed size, magic,
    # crc32) followed by the payload. Only the uncompressed size and the
    # magic are used; the crc32 is not checked.
    #
    # Returns the rtf as a binary string. If the payload is truncated,
    # whatever could be decoded up to that point is returned.
    #
    # Raises a RuntimeError if the header is shorter than 16 bytes or the
    # magic number is not recognised.
    def rtfdecompr(data)
      io = StringIO.new data

      # get header fields (as defined in RTFLIB.H)
      header = io.read 16
      raise "Truncated compressed-RTF header" unless header && header.length == 16
      # 'V' is always little-endian, unlike 'L' which follows the host cpu.
      _compr_size, uncompr_size, magic, _crc32 = header.unpack 'V4'
      #warn "compressed-RTF data size mismatch" unless io.size == _compr_size + 4

      # process the data
      case magic
      when 0x414c454d # magic number that identifies the stream as a uncompressed stream
        io.read(uncompr_size) || String.new
      when 0x75465a4c # magic number that identifies the stream as a compressed stream
        window_size = 4096
        # the dictionary and the output are kept as arrays of byte values,
        # so that the bit arithmetic below works on every ruby version.
        window = RTF_PREBUF.unpack('C*') + [0] * (window_size - RTF_PREBUF.length)
        wp = RTF_PREBUF.length
        out = []
        flags = 0
        flag_count = 0
        while out.length < uncompr_size && !io.eof?
          # each flag byte flags 8 literals/references, 1 per bit
          flags = (flag_count % 8).zero? ? io.getbyte : flags >> 1
          flag_count += 1
          if (flags & 1) == 1 # each flag bit is 1 for reference, 0 for literal
            hi = io.getbyte
            lo = io.getbyte
            break unless hi && lo # truncated in the middle of a reference
            # offset is a 12 bit number. 2^12 is 4096, so thats fine
            rp = (hi << 4) | (lo >> 4) # the offset relative to block start
            # a reference to the current write position marks the end of the stream
            break if rp == wp
            ((lo & 0xf) + 2).times do # the number of bytes to copy
              out << (window[wp] = window[rp])
              wp = (wp + 1) % window_size
              rp = (rp + 1) % window_size
            end
          else
            byte = io.getbyte
            break unless byte # truncated stream
            out << (window[wp] = byte)
            wp = (wp + 1) % window_size
          end
        end
        out.pack 'C*'
      else # unknown magic number
        raise "Unknown compression type (magic number 0x%08x)" % magic
      end
    end

=begin
# = RTF/HTML functions
#
# Sometimes in MAPI, the PR_BODY_HTML property contains the HTML of a message.
# But more usually, the HTML is encoded inside the RTF body (which you get in the
# PR_RTF_COMPRESSED property). These routines concern the decoding of the HTML
# from this RTF body.
#
# An encoded htmlrtf file is a valid RTF document, but which contains additional
# html markup information in its comments, and sometimes contains the equivalent
# rtf markup outside the comments. Therefore, when it is displayed by a plain
# simple RTF reader, the html comments are ignored and only the rtf markup has
# effect. Typically, this rtf markup is not as rich as the html markup would have been.
# But for an html-aware reader (such as the code below), we can ignore all the
# rtf markup, and extract the html markup out of the comments, and get a valid
# html document.
#
# There are actually two kinds of html markup in comments. Most of them are
# prefixed by "\*\htmltagNNN", for some number NNN. But sometimes there's one
# prefixed by "\*\mhtmltagNNN" followed by "\*\htmltagNNN". In this case,
# the two are equivalent, but the m-tag is for a MIME Multipart/Mixed Message
# and contains tags that refer to content-ids (e.g. img src="cid:072344a7")
# while the normal tag just refers to a name (e.g. img src="fred.jpg")
# The code below keeps the m-tag and discards the normal tag.
# If there are any m-tags like this, then the message also contains an
# attachment with a PR_CONTENT_ID property e.g. "072344a7". Actually,
# sometimes the m-tag is e.g. img src="http://outlook/welcome.html" and the
# attachment has a PR_CONTENT_LOCATION "http://outlook/welcome.html" instead
# of a PR_CONTENT_ID.
#
# This code is experimental. It works on my own message archive, of about
# a thousand html-encoded messages, received in Outlook97 and Outlook2000
# and OutlookXP. But I can't guarantee that it will work on all rtf-encoded
# messages. Indeed, it used to be the case that people would simply stick
# {\fromhtml at the start of an html document, and } at the end, and send
# this as RTF. If someone did this, then it will almost work in my function
# but not quite. (Because I ignore \r and \n, and respect only \par. Thus,
# any linefeeds in the erroneous encoded-html will be ignored.)

# ISRTFHTML -- Given an uncompressed RTF body of the message, this
# function tells you whether it encodes some html.
# [in] (buf,*len) indicate the start and length of the uncompressed RTF body.
# [return-value] true or false, for whether it really does encode some html
bool isrtfhtml(const char *buf,unsigned int len)
{ // We look for the words "\fromhtml" somewhere in the file.
  // If the rtf encodes text rather than html, then instead
  // it will only find "\fromtext".
  const char *c;
  for (c=buf; c<buf+len; c++)
  { if (strncmp(c,"\\from",5)==0) return strncmp(c,"\\fromhtml",9)==0;
  }
  return false;
}


# DECODERTFHTML -- Given an uncompressed RTF body of the message,
# and assuming that it contains encoded-html, this function
# turns it into regular html.
# [in] (buf,*len) indicate the start and length of the uncompressed RTF body.
# [out] the buffer is overwritten with the HTML version, null-terminated,
# and *len indicates the length of this HTML.
#
# Notes: (1) because of how the encoding works, the HTML version is necessarily
# shorter than the encoded version. That's why it's safe for the function to
# place the decoded html in the same buffer that formerly held the encoded stuff.
# (2) Some messages include characters \'XX, where XX is a hexadecimal number.
# This function simply converts this into ASCII. The conversion will only make
# sense if the right code-page is being used. I don't know how rtf specifies which
# code page it wants.
# (3) By experiment, I discovered that \pntext{..} and \liN and \fi-N are RTF
# markup that should be removed. There might be other RTF markup that should
# also be removed. But I don't know what else.
#
void decodertfhtml(char *buf,unsigned int *len)
{ // c -- pointer to where we're reading from
  // d -- pointer to where we're writing to. Invariant: d<c
  // max -- how far we can read from (i.e. to the end of the original rtf)
  // ignore_tag -- stores 'N': after \mhtmlN, we will ignore the subsequent \htmlN.
  char *c=buf, *max=buf+*len, *d=buf; int ignore_tag=-1;
  // First, we skip forwards to the first \htmltag.
  while (c<max && strncmp(c,"{\\*\\htmltag",11)!=0) c++;
  //
  // Now work through the document. Our plan is as follows:
  // * Ignore { and }. These are part of RTF markup.
  // * Ignore \htmlrtf...\htmlrtf0. This is how RTF keeps its equivalent markup separate from the html.
  // * Ignore \r and \n. The real carriage returns are stored in \par tags.
  // * Ignore \pntext{..} and \liN and \fi-N. These are RTF junk.
  // * Convert \par and \tab into \r\n and \t
  // * Convert \'XX into the ascii character indicated by the hex number XX
  // * Convert \{ and \} into { and }. This is how RTF escapes its curly braces.
  // * When we get \*\mhtmltagN, keep the tag, but ignore the subsequent \*\htmltagN
  // * When we get \*\htmltagN, keep the tag as long as it isn't subsequent to a \*\mhtmltagN
  // * All other text should be kept as it is.
=end


    # html encoded in rtf comments.
    # {\*\htmltag84 &quot;}\htmlrtf "\htmlrtf0

    # already generates better output than the c predecessor. eg from this chunk, where
    # there are tags outside of the htmlrtf ignore block.
    # "{\\*\\htmltag116 <br />}\\htmlrtf \\line \\htmlrtf0 \\line {\\*\\htmltag84 <a href..."
    # we take the approach of ignoring
    # all rtf tags not explicitly handled. a proper parse tree would be nicer to work with.
    # ruby rtf library?
    # check http://homepage.ntlworld.com/peterhi/rtf_tools.html
    # and
    # http://rubyforge.org/projects/ruby-rtf/

    # Substandard conversion of the original C code.
    # Test and refactor, and try to correct some inaccuracies.
    #
    # Extracts the html that is encapsulated in the (uncompressed) +rtf+
    # string. Returns +nil+ if it doesn't look like rtf encapsulated html,
    # ie there is no "\fromhtml" marker, no "\*\htmltag" group, or the
    # extracted html is blank.
    #
    # Code is a hack, but it works.
    def rtf2html(rtf)
      # require \fromhtml. is this worth keeping?
      return nil unless rtf["\\fromhtml"]
      scan = StringScanner.new rtf
      html = ''
      ignore_tag = nil
      # skip up to the first htmltag. return nil if we don't ever find one
      return nil unless scan.scan_until(/(?=\{\\\*\\htmltag)/)
      until scan.eos?
        if scan.scan(/[{}]/)
          # bare braces are rtf grouping, not content
        elsif scan.scan(/\\\*\\htmltag(\d+) ?/)
          # an htmltag directly following the mhtmltag of the same number is
          # a duplicate of it, so throw the whole group away
          if ignore_tag == scan[1]
            scan.scan_until(/\}/)
            ignore_tag = nil
          end
        elsif scan.scan(/\\\*\\mhtmltag(\d+) ?/)
          ignore_tag = scan[1]
        # the lookaheads stop \par and \tab from eating the start of longer
        # control words such as \pard
        elsif scan.scan(/\\par(?![a-z]) ?/)
          html << "\r\n"
        elsif scan.scan(/\\tab(?![a-z]) ?/)
          html << "\t"
        elsif scan.scan(/\\'([0-9A-Fa-f]{2})/)
          html << scan[1].hex.chr
        elsif scan.scan(/\\pntext/)
          scan.scan_until(/\}/)
        elsif scan.scan(/\\htmlrtf/)
          scan.scan_until(/\\htmlrtf0 ?/)
        # a generic throw away unknown tags thing.
        # the above 2 however, are handled specially
        elsif scan.scan(/\\[a-z-]+(\d+)? ?/)
        elsif scan.scan(/[\r\n]/)
          # real line breaks are carried by \par
        elsif scan.scan(/\\([{}\\])/)
          html << scan[1]
        else
          # anything else is content. getch always advances, so the loop
          # can't get stuck.
          html << scan.getch
        end
      end
      html.strip.empty? ? nil : html
    end

    module_function :rtf2html, :rtfdecompr
  end
end
|
236
|
-
|