msg_extractor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,59 @@
1
module MsgExtractor
  module Mapi
    # Decoders for raw MAPI property payloads. Text always comes back as
    # UTF-8 (bytes that cannot be converted are replaced instead of raising);
    # payloads of any other kind are returned exactly as received.
    module Decoders
      # Windows code page number => name of the matching Ruby encoding.
      CODE_PAGES = {
        437 => "IBM437",
        850 => "IBM850",
        932 => "Windows-31J",
        936 => "GBK",
        949 => "EUC-KR",
        950 => "Big5",
        1250 => "Windows-1250",
        1251 => "Windows-1251",
        1252 => "Windows-1252",
        1253 => "Windows-1253",
        1254 => "Windows-1254",
        1255 => "Windows-1255",
        1256 => "Windows-1256",
        1257 => "Windows-1257",
        1258 => "Windows-1258",
        20127 => "US-ASCII",
        28591 => "ISO-8859-1",
        28592 => "ISO-8859-2",
        28605 => "ISO8859-15",
        65001 => "UTF-8"
      }.freeze

      # Number of seconds separating the FILETIME epoch (1601-01-01) from the
      # Unix epoch (1970-01-01).
      EPOCH_DELTA = 11_644_473_600

      module_function

      # Decodes +bytes+ according to the MAPI property +type+.
      #
      # Fixed-width types tolerate being handed the whole 8-byte record value
      # field: unpack only consumes the leading bytes it needs. +codepage+ is
      # only consulted for PT_STRING8. Types without a dedicated decoder
      # (PT_BINARY, PT_OBJECT, PT_CLSID, anything unknown) yield +bytes+ as is.
      def decode(type, bytes, codepage: 1252)
        if type == PT_UNICODE
          utf16(bytes)
        elsif type == PT_STRING8
          string8(bytes, codepage)
        elsif type == PT_SYSTIME
          filetime(bytes.unpack1("Q<"))
        elsif type == PT_LONG
          bytes.unpack1("l<")
        elsif type == PT_SHORT
          bytes.unpack1("s<")
        elsif type == PT_I8
          bytes.unpack1("q<")
        elsif type == PT_DOUBLE
          bytes.unpack1("E")
        elsif type == PT_BOOLEAN
          flag = bytes.unpack1("v")
          # Too-short input unpacks to nil, which counts as false.
          !(flag.nil? || flag.zero?)
        else
          bytes
        end
      end

      # UTF-16LE bytes -> UTF-8 string with trailing NULs removed.
      def utf16(bytes)
        text = bytes.dup.force_encoding(Encoding::UTF_16LE)
        text = text.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
        text.sub(/\0+\z/, "")
      end

      # Code-page encoded bytes -> UTF-8 string with trailing NULs removed.
      # Code pages missing from CODE_PAGES are read as Windows-1252.
      def string8(bytes, codepage)
        source = CODE_PAGES.fetch(codepage, "Windows-1252")
        text = bytes.dup.force_encoding(source)
        text = text.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
        text.sub(/\0+\z/, "")
      end

      # PR_HTML bytes -> UTF-8 string using PR_INTERNET_CPID. Same conversion
      # as #string8.
      def bytes_to_utf8(bytes, codepage)
        string8(bytes, codepage)
      end

      # Converts a FILETIME tick count (100 ns units since 1601-01-01) into a
      # UTC Time. A nil or zero tick count yields nil.
      def filetime(ticks)
        return nil if ticks.nil? || ticks.zero?

        unix_seconds = Rational(ticks, 10_000_000) - EPOCH_DELTA
        Time.at(unix_seconds).utc
      end
    end
  end
end
@@ -0,0 +1,74 @@
1
module MsgExtractor
  module Mapi
    # Lookup table built from the __nameid_version1.0 storage. It translates a
    # (property-set GUID, numeric LID or string name) pair into the 0x8000+
    # property id this particular file uses for that named property.
    class NamedPropertyMap
      PS_MAPI = "00020328-0000-0000-c000-000000000046"
      PS_PUBLIC_STRINGS = "00020329-0000-0000-c000-000000000046"

      # GUID reported when an entry has no usable GUID.
      NULL_GUID = "00000000-0000-0000-0000-000000000000"
      private_constant :NULL_GUID

      # GUID, entry and string streams, in the order .parse expects them.
      STREAM_NAMES = %w[
        __SUBSTG1.0_00020102
        __SUBSTG1.0_00030102
        __SUBSTG1.0_00040102
      ].freeze
      private_constant :STREAM_NAMES

      class << self
        # Builds the map for +storage+ by reading its __NAMEID_VERSION1.0
        # child through +cfbf+. Returns an empty map when that child is
        # missing or is not a storage.
        def read(cfbf, storage)
          nameid = storage.children["__NAMEID_VERSION1.0"]
          return new({}) unless nameid&.storage?

          streams = STREAM_NAMES.map { |name| read_child(cfbf, nameid, name) }
          parse(*streams)
        end

        # Builds the map from the three raw streams.
        #
        # Each 8-byte entry holds: a 32-bit name id (or an offset into
        # +string_stream+), a 16-bit word whose lowest bit marks a string name
        # and whose remaining bits are the GUID index, and a 16-bit property
        # index. String names are lower-cased; entries whose string cannot be
        # located are skipped.
        def parse(guid_stream, entry_stream, string_stream)
          table = {}
          entry_count = entry_stream.bytesize / 8
          entry_count.times do |index|
            name_id, info, prop_index = entry_stream.byteslice(index * 8, 8).unpack("Vvv")
            guid = guid_for(info >> 1, guid_stream)
            key = info.odd? ? string_name(string_stream, name_id) : name_id
            next if key.nil?

            table[[guid, key]] = 0x8000 + prop_index
          end
          new(table)
        end

        # Renders 16 GUID bytes in the usual lower-case 8-4-4-4-12 form. Nil or
        # short input produces the all-zero GUID.
        def format_guid(bytes)
          return NULL_GUID if bytes.nil? || bytes.bytesize < 16

          data1, data2, data3, clock, node = bytes.unpack("VvvH4H12")
          format("%08x-%04x-%04x-%s-%s", data1, data2, data3, clock, node)
        end

        private

        # Contents of the child stream +name+, or empty binary data when the
        # child is absent or not a stream.
        def read_child(cfbf, storage, name)
          entry = storage.children[name]
          return "".b unless entry&.stream?

          cfbf.read_stream(entry)
        end

        # GUID index 1 and 2 are the two well-known property sets; 3 and up
        # index into the GUID stream (offset by 3).
        def guid_for(guid_index, guid_stream)
          return PS_MAPI if guid_index == 1
          return PS_PUBLIC_STRINGS if guid_index == 2
          return NULL_GUID if guid_index < 3

          format_guid(guid_stream.byteslice((guid_index - 3) * 16, 16))
        end

        # Lower-cased UTF-8 name stored at +offset+ (32-bit byte length
        # followed by UTF-16LE text), or nil when no length can be read there.
        def string_name(string_stream, offset)
          length = string_stream.byteslice(offset, 4)&.unpack1("V")
          return nil if length.nil?

          text = string_stream.byteslice(offset + 4, length) || "".b
          text.force_encoding(Encoding::UTF_16LE)
              .encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
              .downcase
        end
      end

      # +map+ is a Hash of [guid, key] => property id.
      def initialize(map)
        @map = map
      end

      # Property id registered for the pair, or nil. The GUID and any string
      # name are compared case-insensitively.
      def resolve(guid, name_or_lid)
        key = name_or_lid
        key = key.downcase if key.is_a?(::String)
        @map[[guid.downcase, key]]
      end
    end
  end
end
@@ -0,0 +1,106 @@
1
module MsgExtractor
  module Mapi
    # Reads the MAPI properties of one storage: fixed-width values from the
    # __properties_version1.0 stream, variable-width values from companion
    # __substg1.0_XXXXYYYY streams (XXXX = property id, YYYY = property type).
    # The length of the properties stream header depends on what kind of
    # storage this is (see HEADER_SIZES).
    class PropertyStore
      # Header length, in bytes, of the properties stream per storage kind.
      HEADER_SIZES = { root: 32, embedded: 24, attachment: 8, recipient: 8 }.freeze

      # Upper-cased name of a value stream; captures the id and type words.
      SUBSTG_RE = /\A__SUBSTG1\.0_([0-9A-F]{4})([0-9A-F]{4})\z/

      # Types whose payload lives in a substg stream. When one of these shows
      # up only in the properties stream, its 8-byte value field is not the
      # payload, so #[] answers nil instead of decoding garbage.
      VARIABLE_WIDTH_TYPES = [
        MsgExtractor::Mapi::PT_UNICODE,
        MsgExtractor::Mapi::PT_STRING8,
        MsgExtractor::Mapi::PT_BINARY,
        MsgExtractor::Mapi::PT_OBJECT,
        MsgExtractor::Mapi::PT_CLSID
      ].freeze

      # These counts come directly from the file and are untrusted; the model
      # layer iterates actual storages rather than relying on them. Both stay
      # nil for attachment/recipient storages and for truncated headers.
      attr_reader :recipient_count, :attachment_count

      # cfbf    - container reader; must respond to read_stream(entry).
      # storage - the storage entry whose children hold the properties.
      # kind    - one of the HEADER_SIZES keys. An unknown kind raises KeyError
      #           when a properties stream is present.
      def initialize(cfbf, storage, kind)
        @cfbf = cfbf
        @storage = storage
        @kind = kind
        @records = {} # id => [type, 8-byte value field]
        @streams = {} # id => [type, Cfbf::Entry]
        parse
      end

      # True when the property id is present in either source.
      def key?(id) = @streams.key?(id) || @records.key?(id)

      # Decoded value of property +id+, or nil when the property is absent or
      # when only a placeholder record exists for a stream-backed type.
      def [](id)
        if (type_entry = @streams[id])
          type, entry = type_entry
          Decoders.decode(type, @cfbf.read_stream(entry), codepage: codepage)
        elsif (type_value = @records[id])
          type, value = type_value
          return nil if stream_backed?(type)

          Decoders.decode(type, value, codepage: codepage)
        end
      end

      # Raw bytes without decoding (binary props, or the 8-byte record field).
      def raw(id)
        if (type_entry = @streams[id])
          @cfbf.read_stream(type_entry[1])
        elsif (type_value = @records[id])
          type_value[1]
        end
      end

      # Property type word recorded for +id+, or nil when absent.
      def type_of(id) = (@streams[id] || @records[id])&.first

      # Code page used for PT_STRING8 values: PR_MESSAGE_CODEPAGE when this
      # storage records it, otherwise 1252. Memoized.
      def codepage
        @codepage ||=
          if (record = @records[PR_MESSAGE_CODEPAGE])
            record[1].unpack1("l<")
          else
            1252
          end
      end

      # Code page for PR_HTML: PR_INTERNET_CPID when present, else #codepage.
      def internet_codepage
        if (record = @records[PR_INTERNET_CPID])
          record[1].unpack1("l<")
        else
          codepage
        end
      end

      private

      # True when a value of +type+ can only come from a substg stream.
      # Multi-valued types count as well: #index_value_streams does not index
      # their streams, so only a record can exist for them, and per
      # [MS-OXMSG] that record's value field is a size, not the value.
      def stream_backed?(type)
        !(type & MV_FLAG).zero? || VARIABLE_WIDTH_TYPES.include?(type)
      end

      # Streams are indexed first so that records never shadow them.
      def parse
        index_value_streams
        read_property_records
      end

      # Registers every single-valued __substg1.0_XXXXYYYY child stream.
      def index_value_streams
        @storage.children.each_value do |entry|
          next unless entry.stream?

          match = SUBSTG_RE.match(entry.name.upcase)
          next unless match

          type = match[2].to_i(16)
          next unless (type & MV_FLAG).zero? # multi-valued props unsupported in v1

          @streams[match[1].to_i(16)] = [type, entry]
        end
      end

      # Reads the header counts and the 16-byte property records that follow
      # the header. Each record starts with a 32-bit tag (type in the low
      # word, id in the high word); bytes 8..15 are the value field. A partial
      # trailing record is ignored.
      def read_property_records
        properties_entry = @storage.children["__PROPERTIES_VERSION1.0"]
        return unless properties_entry

        data = @cfbf.read_stream(properties_entry)
        if %i[root embedded].include?(@kind) && data.bytesize >= 24
          @recipient_count, @attachment_count = data.byteslice(16, 8).unpack("V2")
        end

        position = HEADER_SIZES.fetch(@kind)
        while position + 16 <= data.bytesize
          tag = data.byteslice(position, 4).unpack1("V")
          id = tag >> 16
          @records[id] = [tag & 0xFFFF, data.byteslice(position + 8, 8)] unless @streams.key?(id)
          position += 16
        end
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
module MsgExtractor
  module Mapi
    # ---- Property types ([MS-OXCDATA] 2.11.1) ------------------------------
    PT_SHORT   = 0x0002
    PT_LONG    = 0x0003
    PT_DOUBLE  = 0x0005
    PT_BOOLEAN = 0x000B
    PT_OBJECT  = 0x000D
    PT_I8      = 0x0014
    PT_STRING8 = 0x001E
    PT_UNICODE = 0x001F
    PT_SYSTIME = 0x0040
    PT_CLSID   = 0x0048
    PT_BINARY  = 0x0102

    # Bit set in the type word of multi-valued properties.
    MV_FLAG = 0x1000

    # ---- Property IDs -------------------------------------------------------
    # MAPI PR_* convention: the 16-bit id only, without the type word.
    # Listed in ascending numeric order.

    # Envelope and transport
    PR_MESSAGE_CLASS         = 0x001A
    PR_SUBJECT               = 0x0037
    PR_CLIENT_SUBMIT_TIME    = 0x0039
    PR_TRANSPORT_HEADERS     = 0x007D
    PR_RECIPIENT_TYPE        = 0x0C15
    PR_SENDER_NAME           = 0x0C1A
    PR_SENDER_EMAIL          = 0x0C1F
    PR_DISPLAY_BCC           = 0x0E02
    PR_DISPLAY_CC            = 0x0E03
    PR_DISPLAY_TO            = 0x0E04
    PR_MESSAGE_DELIVERY_TIME = 0x0E06

    # Bodies
    PR_BODY           = 0x1000
    PR_RTF_COMPRESSED = 0x1009
    PR_HTML           = 0x1013

    # Addressing
    PR_DISPLAY_NAME  = 0x3001
    PR_ADDRTYPE      = 0x3002
    PR_EMAIL_ADDRESS = 0x3003

    # Attachments
    PR_ATTACH_DATA          = 0x3701
    PR_ATTACH_FILENAME      = 0x3704
    PR_ATTACH_METHOD        = 0x3705
    PR_ATTACH_LONG_FILENAME = 0x3707
    PR_ATTACH_MIME_TAG      = 0x370E
    PR_ATTACH_CONTENT_ID    = 0x3712

    # Address book / contact details
    PR_SMTP_ADDRESS   = 0x39FE
    PR_GIVEN_NAME     = 0x3A06
    PR_BUSINESS_PHONE = 0x3A08
    PR_HOME_PHONE     = 0x3A09
    PR_SURNAME        = 0x3A11
    PR_POSTAL_ADDRESS = 0x3A15
    PR_COMPANY_NAME   = 0x3A16
    PR_JOB_TITLE      = 0x3A17
    PR_MOBILE_PHONE   = 0x3A1C

    # Code pages and miscellany
    PR_INTERNET_CPID     = 0x3FDE
    PR_MESSAGE_CODEPAGE  = 0x3FFD
    PR_SENDER_SMTP       = 0x5D01
    PR_ATTACHMENT_HIDDEN = 0x7FFE
  end
end
@@ -0,0 +1,6 @@
1
module MsgExtractor
  # Item class for ordinary mail (IPM.Note and reports). It adds nothing of
  # its own: every behavior is inherited from MessageObject. The subclass
  # exists so the result of dispatch can be told apart by class.
  class Message < MessageObject; end
end
@@ -0,0 +1,100 @@
1
+ require "fileutils"
2
+
3
module MsgExtractor
  # Common behavior of every MSG item type — property access, bodies,
  # recipients and attachments. Used for the root message as well as for
  # messages embedded in attachments.
  class MessageObject
    # properties: the property store backing this item.
    # named:      internal (:nodoc:), reused by MsgExtractor.from_storage.
    attr_reader :properties, :named

    # cfbf       - container reader.
    # storage    - storage of this item; defaults to the container root.
    # named      - named-property map to reuse; read from +storage+ if omitted.
    # kind       - storage kind handed to the property store (default :root).
    # properties - pre-built property store; built from +storage+ if omitted.
    def initialize(cfbf, storage: nil, named: nil, kind: :root, properties: nil)
      @cfbf = cfbf
      @storage = storage || cfbf.root
      @kind = kind
      @properties = properties || Mapi::PropertyStore.new(cfbf, @storage, kind)
      @named = named || Mapi::NamedPropertyMap.read(cfbf, @storage)
    end

    def message_class
      properties[Mapi::PR_MESSAGE_CLASS]
    end

    def subject
      properties[Mapi::PR_SUBJECT]
    end

    # Client submit time, or the delivery time when no submit time is set.
    def date
      submitted = properties[Mapi::PR_CLIENT_SUBMIT_TIME]
      submitted || properties[Mapi::PR_MESSAGE_DELIVERY_TIME]
    end

    # Plain-text body: PR_BODY when present, otherwise the HTML body passed
    # through Util.html_to_text. Memoized, including a nil result.
    def body
      return @body if defined?(@body)

      plain = properties[Mapi::PR_BODY]
      @body = plain || begin
        html = html_body
        html && Util.html_to_text(html)
      end
    end

    # Decompressed PR_RTF_COMPRESSED, or nil when the property is absent.
    # Memoized.
    def rtf_body
      return @rtf_body if defined?(@rtf_body)

      compressed = properties.raw(Mapi::PR_RTF_COMPRESSED)
      @rtf_body = compressed && Rtf::CompressedRtf.decompress(compressed)
    end

    # HTML body as UTF-8. Memoized.
    def html_body
      return @html_body if defined?(@html_body)

      @html_body = locate_html_body
    end

    # Transport headers parsed by Headers.parse.
    def headers
      @headers ||= Headers.parse(properties[Mapi::PR_TRANSPORT_HEADERS])
    end

    # Sender as a Recipient with a nil type, or nil when neither a name nor
    # an address containing "@" is available. Memoized.
    def sender
      return @sender if defined?(@sender)

      name = properties[Mapi::PR_SENDER_NAME]
      email = properties[Mapi::PR_SENDER_SMTP] || properties[Mapi::PR_SENDER_EMAIL]
      email = nil unless email&.include?("@")
      @sender = (Recipient.new(name: name, email: email, type: nil) if name || email)
    end

    # One Recipient per __recip_version1.0_# storage, ordered by storage name.
    def recipients
      @recipients ||= begin
        storages = child_storages("__RECIP_VERSION1.0_#")
        storages.map { |entry| Recipient.from_storage(@cfbf, entry) }
      end
    end

    def to
      recipients_of(Recipient::TO)
    end

    def cc
      recipients_of(Recipient::CC)
    end

    def bcc
      recipients_of(Recipient::BCC)
    end

    # One Attachment per __attach_version1.0_# storage, ordered by storage
    # name. The named-property map is shared with each attachment.
    def attachments
      @attachments ||= begin
        storages = child_storages("__ATTACH_VERSION1.0_#")
        storages.map { |entry| Attachment.new(@cfbf, entry, named: @named) }
      end
    end

    # Value of the named property (+guid+, +lid_or_name+), or nil when the
    # file does not define that name.
    def named_value(guid, lid_or_name)
      property_id = @named.resolve(guid, lid_or_name)
      property_id && properties[property_id]
    end

    # Writes the message into a new directory below +dir+, named after the
    # sanitized subject ("message" when there is none) and de-duplicated by
    # Util.dedupe_path. message.txt and message.html are written when the
    # respective body exists; every attachment that is not an embedded
    # message is saved alongside. Returns the directory path.
    def save(dir: ".")
      folder_name = Util.sanitize_filename(subject || "message")
      base = Util.dedupe_path(::File.join(dir, folder_name))
      FileUtils.mkdir_p(base)

      text = body
      ::File.write(::File.join(base, "message.txt"), text, encoding: Encoding::UTF_8) if text
      html = html_body
      ::File.binwrite(::File.join(base, "message.html"), html.b) if html

      attachments.each do |attachment|
        next if attachment.embedded_message?

        attachment.save(dir: base)
      end
      base
    end

    private

    # Sources tried in order: PR_HTML stored as PT_UNICODE, PR_HTML raw bytes
    # decoded with the internet code page, HTML taken from the RTF body by
    # Rtf::Decapsulator.
    def locate_html_body
      return properties[Mapi::PR_HTML] if properties.type_of(Mapi::PR_HTML) == Mapi::PT_UNICODE

      encoded = properties.raw(Mapi::PR_HTML)
      return Mapi::Decoders.bytes_to_utf8(encoded, properties.internet_codepage) if encoded

      rtf = rtf_body
      Rtf::Decapsulator.html_from(rtf) if rtf
    end

    def recipients_of(kind)
      recipients.select { |recipient| recipient.type == kind }
    end

    # Child storages whose upper-cased name starts with +prefix+, sorted by
    # name.
    def child_storages(prefix)
      matching = @storage.children.values.select do |entry|
        entry.storage? && entry.name.upcase.start_with?(prefix)
      end
      matching.sort_by(&:name)
    end
  end
end
@@ -0,0 +1,34 @@
1
module MsgExtractor
  # One addressee of a message (or its sender): display name, SMTP address
  # and recipient type.
  class Recipient
    TO = 1
    CC = 2
    BCC = 3

    # Bits of PR_RECIPIENT_TYPE that hold the TO/CC/BCC value. The high bits
    # may carry the MAPI_P1 (0x10000000) and MAPI_SUBMITTED (0x80000000)
    # resend flags, which must not take part in the TO/CC/BCC comparison.
    TYPE_MASK = 0x0F
    private_constant :TYPE_MASK

    attr_reader :name, :email, :type

    # name  - display name, or nil.
    # email - address, or nil.
    # type  - TO, CC, BCC, another integer from the file, or nil (sender).
    def initialize(name:, email:, type:)
      @name = name
      @email = email
      @type = type
    end

    # Builds a Recipient from a recipient storage.
    #
    # The address is PR_SMTP_ADDRESS when set; otherwise PR_EMAIL_ADDRESS,
    # but only if it contains "@". The type defaults to TO when the storage
    # has no PR_RECIPIENT_TYPE, and resend flag bits are stripped from it.
    def self.from_storage(cfbf, storage)
      props = Mapi::PropertyStore.new(cfbf, storage, :recipient)
      email = props[Mapi::PR_SMTP_ADDRESS]
      if email.nil?
        address = props[Mapi::PR_EMAIL_ADDRESS]
        email = address if address&.include?("@")
      end
      new(name: props[Mapi::PR_DISPLAY_NAME], email: email,
          type: normalize_type(props[Mapi::PR_RECIPIENT_TYPE]))
    end

    # Reduces a raw PR_RECIPIENT_TYPE value to its type bits. A missing value
    # becomes TO; a non-integer value is passed through unchanged.
    def self.normalize_type(raw)
      return TO unless raw

      raw.is_a?(Integer) ? raw & TYPE_MASK : raw
    end
    private_class_method :normalize_type

    # "Name <email>" when both are known and differ; otherwise whichever of
    # the two is available (the address first), or "" when neither is.
    def to_s
      if name && email && name != email
        "#{name} <#{email}>"
      else
        (email || name).to_s
      end
    end
  end
end
@@ -0,0 +1,88 @@
1
module MsgExtractor
  module Rtf
    # Decoder for PR_RTF_COMPRESSED (LZFu) as specified in [MS-OXRTFCP].
    module CompressedRtf
      MAGIC_COMPRESSED = 0x75465A4C # "LZFu"
      MAGIC_UNCOMPRESSED = 0x414C454D # "MELA"

      # The fixed 207-byte initial dictionary defined by the spec.
      INITIAL_DICTIONARY = [
        "{\\rtf1\\ansi\\mac\\deff0\\deftab720{\\fonttbl;}",
        "{\\f0\\fnil \\froman \\fswiss \\fmodern \\fscript ",
        "\\fdecor MS Sans SerifSymbolArialTimes New RomanCourier",
        "{\\colortbl\\red0\\green0\\blue0\r\n\\par ",
        "\\pard\\plain\\f0\\fs20\\b\\i\\u\\tab\\tx"
      ].join.b.freeze

      # CRC32 table (polynomial 0xEDB88320), init 0, no final XOR — per spec.
      CRC_TABLE = Array.new(256) { |seed|
        (0...8).reduce(seed) do |value, _|
          value.odd? ? (0xEDB88320 ^ (value >> 1)) : (value >> 1)
        end
      }.freeze

      module_function

      # Checksum of +bytes+ computed with CRC_TABLE, starting from 0.
      def crc32(bytes)
        checksum = 0
        bytes.each_byte do |byte|
          checksum = CRC_TABLE[(checksum ^ byte) & 0xFF] ^ (checksum >> 8)
        end
        checksum
      end

      # Returns the RTF bytes held in a PR_RTF_COMPRESSED value.
      #
      # The 16-byte header is four little-endian 32-bit words: compressed
      # size, raw size, magic, CRC. "MELA" data is stored verbatim; "LZFu"
      # data is CRC-checked and then decompressed.
      #
      # Raises CorruptFileError for nil or short input, an unknown magic, a
      # truncated body or a CRC mismatch.
      def decompress(data)
        raise CorruptFileError, "compressed RTF too short" if data.nil? || data.bytesize < 16

        comp_size, raw_size, magic, crc = data.unpack("V4")

        if magic == MAGIC_UNCOMPRESSED
          raise CorruptFileError, "MELA RTF truncated" if data.bytesize - 16 < raw_size

          return data.byteslice(16, raw_size)
        end

        unless magic == MAGIC_COMPRESSED
          raise CorruptFileError, format("bad compressed RTF magic 0x%08x", magic)
        end

        # comp_size counts everything after its own field, i.e. the other 12
        # header bytes plus the payload.
        payload = comp_size >= 12 ? data.byteslice(16, comp_size - 12) : nil
        raise CorruptFileError, "compressed RTF truncated header" if payload.nil?
        raise CorruptFileError, "compressed RTF CRC mismatch" if crc32(payload) != crc

        lzfu(payload)
      end

      # Expands an LZFu payload (the bytes after the 16-byte header).
      #
      # The payload is a sequence of runs: one control byte followed by up to
      # eight tokens, one per control bit starting at the lowest. A clear bit
      # is a literal byte; a set bit is a big-endian 16-bit reference into
      # the 4096-byte circular dictionary (upper 12 bits offset, lower 4 bits
      # length - 2). A reference pointing at the current write position ends
      # the stream. Running out of input returns what was produced so far.
      def lzfu(payload)
        window = INITIAL_DICTIONARY + ("\0".b * (4096 - INITIAL_DICTIONARY.bytesize))
        cursor = INITIAL_DICTIONARY.bytesize # 207
        output = +"".b
        index = 0

        # Every produced byte goes to the output and into the dictionary.
        emit = lambda do |byte|
          output << byte
          window.setbyte(cursor, byte)
          cursor = (cursor + 1) % 4096
        end

        until index >= payload.bytesize
          flags = payload.getbyte(index)
          index += 1
          8.times do |bit|
            if flags[bit].zero?
              literal = payload.getbyte(index)
              return output if literal.nil?

              index += 1
              emit.call(literal)
            else
              high = payload.getbyte(index)
              low = payload.getbyte(index + 1)
              return output if high.nil? || low.nil?

              index += 2
              reference = (high << 8) | low
              offset = reference >> 4
              run_length = (reference & 0x0F) + 2
              return output if offset == cursor # end-of-stream marker

              run_length.times do
                emit.call(window.getbyte(offset))
                offset = (offset + 1) % 4096
              end
            end
          end
        end
        output
      end
    end
  end
end