msg_extractor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,206 @@
1
module MsgExtractor
  module Rtf
    # Extracts HTML encapsulated in RTF ([MS-OXRTFEX]). Targets well-formed
    # Outlook-generated RTF; returns nil on anything it cannot handle.
    #
    # The input is scanned byte by byte. Literal text and \'xx escapes are
    # buffered as raw codepage bytes and decoded in bulk; \uN escapes and the
    # symbol control words (\lquote, \bullet, ...) are appended directly as
    # UTF-8. \uN carries UTF-16 code units, so a character outside the BMP
    # arrives as two \uN escapes (a surrogate pair); the two halves are
    # recombined into one code point before being written to the output.
    class Decapsulator
      # Control words that open a destination whose content is never HTML.
      SKIP_DESTINATIONS = %w[
        fonttbl colortbl stylesheet info generator pntext listtable
        listoverridetable themedata colorschememapping datastore latentstyles
        xmlnstbl rsidtbl pgptbl background pict object header footer footnote
      ].freeze

      # UTF-16 surrogate ranges used to recombine \uN pairs.
      HIGH_SURROGATES = (0xD800..0xDBFF).freeze
      LOW_SURROGATES = (0xDC00..0xDFFF).freeze

      # Returns the decapsulated HTML as a UTF-8 String, or nil when +rtf+ is
      # nil, does not announce \fromhtml1 within its first 512 bytes, or
      # cannot be parsed (any StandardError raised while parsing is swallowed).
      def self.html_from(rtf)
        return nil unless rtf
        return nil unless rtf.byteslice(0, 512).to_s.include?("\\fromhtml1")
        new(rtf).run
      rescue StandardError
        nil # malformed RTF: html_body falls back to nil rather than raising
      end

      # rtf: the RTF document; it is re-tagged as binary so that every
      # position used below is a byte offset.
      def initialize(rtf)
        @rtf = rtf.b
        @pos = 0
        @out = String.new(encoding: Encoding::UTF_8)        # accumulates decoded UTF-8 text
        @pending = String.new(encoding: Encoding::BINARY)   # raw codepage bytes not yet decoded
        @codepage = 1252
        @high_surrogate = nil # high half of a \uN surrogate pair awaiting its low half
        # Group-scoped state, saved on "{" and restored on "}".
        @state = { suppress: false, destination: :normal, uc: 1 }
        @stack = []
      end

      # Scans the whole document and returns the extracted HTML (UTF-8).
      def run
        while @pos < @rtf.bytesize
          byte = @rtf.getbyte(@pos)
          case byte
          when 0x7B # {
            @pos += 1
            @stack.push(@state.dup)
            handle_group_start
          when 0x7D # }
            @pos += 1
            # An unbalanced "}" keeps the current state instead of failing.
            @state = @stack.pop || @state
          when 0x5C # backslash
            handle_control
          when 0x0D, 0x0A # bare CR/LF are not document text in RTF
            @pos += 1
          else
            @pos += 1
            emit(byte.chr)
          end
        end
        finalize
      end

      private

      # True when output must be dropped: inside a skipped destination, or
      # while \htmlrtf is on and we are not inside an \*\htmltag group.
      def suppressed?
        @state[:destination] == :skip ||
          (@state[:suppress] && @state[:destination] != :htmltag)
      end

      # Append raw codepage bytes to the pending buffer.
      def emit(str)
        return if suppressed?
        replace_orphan_surrogate
        @pending << str.b
      end

      # Flush @pending: decode raw codepage bytes to UTF-8 and append to @out.
      def flush_pending
        return if @pending.empty?
        enc = codepage_encoding
        decoded = @pending.dup
                          .force_encoding(enc)
                          .encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
        @out << decoded
        @pending.clear
      end

      # Called right after "{". A group that starts with \* is an ignorable
      # destination: \*\htmltag carries HTML markup, anything else is skipped.
      def handle_group_start
        return unless @rtf.byteslice(@pos, 2) == "\\*"
        saved = @pos
        @pos += 2
        word, _param = read_control_word
        if word == "htmltag"
          @state[:destination] = :htmltag
        else
          # Rewind so the main loop parses the control word normally (it may
          # still carry group state); its text output is suppressed by :skip.
          @pos = saved
          @state[:destination] = :skip # \*\unknown: ignorable destination
        end
      end

      # Reads a control word starting at the backslash under @pos and returns
      # [word, param], where param is an Integer or nil when absent. Leaves
      # @pos after the word, its parameter and one optional delimiter space.
      def read_control_word
        @pos += 1 # consume the backslash
        word = +""
        while @pos < @rtf.bytesize && letter?(@rtf.getbyte(@pos))
          word << @rtf.getbyte(@pos)
          @pos += 1
        end
        param = nil
        if @pos < @rtf.bytesize && (digit?(@rtf.getbyte(@pos)) || @rtf.getbyte(@pos) == 0x2D)
          digits = +""
          if @rtf.getbyte(@pos) == 0x2D
            digits << "-"
            @pos += 1
          end
          while @pos < @rtf.bytesize && digit?(@rtf.getbyte(@pos))
            digits << @rtf.getbyte(@pos)
            @pos += 1
          end
          param = digits.to_i
        end
        # A single space after a control word is part of the control word.
        @pos += 1 if @pos < @rtf.bytesize && @rtf.getbyte(@pos) == 0x20
        [word, param]
      end

      def letter?(byte) = (byte >= 0x41 && byte <= 0x5A) || (byte >= 0x61 && byte <= 0x7A)
      def digit?(byte) = byte >= 0x30 && byte <= 0x39

      # Handles whatever follows a backslash: a control word, a \'xx hex
      # escape, or a one-character control symbol.
      def handle_control
        next_byte = @rtf.getbyte(@pos + 1)
        if next_byte.nil?
          @pos += 1 # trailing lone backslash at end of input
        elsif letter?(next_byte)
          word, param = read_control_word
          control_word(word, param)
        elsif next_byte == 0x27 # \'xx hex escape
          @pos += 2
          hex = @rtf.byteslice(@pos, 2)
          @pos += 2
          emit(hex.to_i(16).chr) if hex&.match?(/\A\h\h\z/)
        else
          @pos += 2
          case next_byte.chr
          when "{", "}", "\\" then emit(next_byte.chr)
          when "~" then emit_unicode(0x00A0) # non-breaking space
          end # \- \_ \* etc.: no text output
        end
      end

      # Applies one control word. Unknown words are ignored.
      # Raises CorruptFileError for \u without a numeric parameter.
      def control_word(word, param)
        case word
        when "ansicpg"
          # Bytes buffered so far belong to the previous codepage.
          flush_pending
          @codepage = param || 1252
        when "htmlrtf" then @state[:suppress] = param != 0
        when "uc" then @state[:uc] = param || 1
        when "u"
          raise CorruptFileError, "\\u without codepoint" if param.nil?
          unit = param
          unit += 65_536 if unit.negative? # \uN is written as a signed 16-bit value
          emit_utf16_unit(unit)
          skip_unicode_fallback(@state[:uc])
        when "par", "line" then emit("\r\n")
        when "tab" then emit("\t")
        when "lquote" then emit_unicode(0x2018)
        when "rquote" then emit_unicode(0x2019)
        when "ldblquote" then emit_unicode(0x201C)
        when "rdblquote" then emit_unicode(0x201D)
        when "bullet" then emit_unicode(0x2022)
        when "endash" then emit_unicode(0x2013)
        when "emdash" then emit_unicode(0x2014)
        when *SKIP_DESTINATIONS then @state[:destination] = :skip
        end
      end

      # Emits one \uN value, treating it as a UTF-16 code unit. A high
      # surrogate is held back until the next \uN; if that is a low surrogate
      # the pair is combined into a single code point. Unpaired halves become
      # U+FFFD.
      def emit_utf16_unit(unit)
        return if suppressed?
        if HIGH_SURROGATES.cover?(unit)
          replace_orphan_surrogate # two highs in a row: the first is unpaired
          @high_surrogate = unit
        elsif LOW_SURROGATES.cover?(unit)
          high = @high_surrogate
          if high
            @high_surrogate = nil
            emit_unicode(0x10000 + ((high - 0xD800) << 10) + (unit - 0xDC00))
          else
            emit_unicode(0xFFFD)
          end
        else
          emit_unicode(unit)
        end
      end

      # Writes U+FFFD for a held-back high surrogate that never got its low
      # half. Pending codepage bytes are flushed first to keep output order.
      def replace_orphan_surrogate
        return unless @high_surrogate
        @high_surrogate = nil
        flush_pending
        @out << "\u{FFFD}"
      end

      # Appends a single code point to @out as UTF-8 ("?" if it cannot be
      # packed). Pending codepage bytes are flushed first to keep output order.
      def emit_unicode(codepoint)
        return if suppressed?
        replace_orphan_surrogate
        flush_pending
        char = [codepoint].pack("U")
        @out << char
      rescue RangeError, ArgumentError
        @out << "?"
      end

      # After \uN, skip the fallback representation (\ucN chars, default 1).
      # A control word counts as one fallback character and must be consumed
      # in full (per RTF spec; the control word + trailing space is one unit).
      def skip_unicode_fallback(count)
        count.times do
          byte = @rtf.getbyte(@pos)
          # Never skip across a group boundary.
          break if byte.nil? || byte == 0x7B || byte == 0x7D
          if byte == 0x5C
            next_byte = @rtf.getbyte(@pos + 1)
            if next_byte && letter?(next_byte)
              read_control_word # consumes backslash + word + optional digit param + space
            elsif next_byte == 0x27 # \'xx — counts as one fallback char
              @pos += 4
            else
              @pos += 2
            end
          else
            @pos += 1
          end
        end
      end

      # Ruby encoding name for the current \ansicpg codepage; falls back to
      # Windows-1252 when the codepage is not in the decoder table.
      def codepage_encoding
        MsgExtractor::Mapi::Decoders::CODE_PAGES.fetch(@codepage, "Windows-1252")
      end

      # Flushes everything still buffered and returns the final UTF-8 string
      # with any invalid sequences replaced.
      def finalize
        replace_orphan_surrogate
        flush_pending
        @out.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
module MsgExtractor
  # Model for IPM.Task items. Every accessor reads one named property from
  # the PSETID_Task property set via MessageObject#named_value.
  class Task < MessageObject
    # GUID of the PSETID_Task named-property set.
    PSETID_TASK = "00062003-0000-0000-c000-000000000046"

    # Long IDs (LIDs) of the task properties, in ascending order.
    LID_STATUS           = 0x8101
    LID_PERCENT_COMPLETE = 0x8102
    LID_START_DATE       = 0x8104
    LID_DUE_DATE         = 0x8105
    LID_COMPLETE         = 0x811C
    LID_OWNER            = 0x811F

    # Numeric status code => symbolic status.
    STATUSES = {
      0 => :not_started,
      1 => :in_progress,
      2 => :complete,
      3 => :waiting,
      4 => :deferred
    }.freeze

    # Value of the task start-date property.
    def starts_on
      named_value(PSETID_TASK, LID_START_DATE)
    end

    # Value of the task due-date property.
    def due_on
      named_value(PSETID_TASK, LID_DUE_DATE)
    end

    # Symbolic status, or nil when the stored code is missing or unknown.
    def status
      code = named_value(PSETID_TASK, LID_STATUS)
      STATUSES[code]
    end

    # Value of the percent-complete property.
    def percent_complete
      named_value(PSETID_TASK, LID_PERCENT_COMPLETE)
    end

    # True only when the complete flag is exactly +true+.
    def complete?
      flag = named_value(PSETID_TASK, LID_COMPLETE)
      flag == true
    end

    # Value of the task owner property.
    def owner
      named_value(PSETID_TASK, LID_OWNER)
    end
  end
end
@@ -0,0 +1,72 @@
1
module MsgExtractor
  # Stateless helpers for file naming and HTML-to-text conversion.
  module Util
    module_function

    # Named entities understood by decode_entities.
    ENTITIES = {
      "amp" => "&", "lt" => "<", "gt" => ">", "quot" => '"',
      "apos" => "'", "nbsp" => " "
    }.freeze

    # Replaces control characters and \ / : * ? " < > | with "_" and strips
    # surrounding whitespace. Empty or dots-only results become "unnamed".
    def sanitize_filename(name)
      safe = name.to_s.gsub(%r{[\x00-\x1F\\/:*?"<>|]}, "_").strip
      return "unnamed" if safe.empty? || safe.match?(/\A\.+\z/)

      safe
    end

    # Returns +path+ if nothing exists there; otherwise the first free
    # variant: "f.txt" -> "f (1).txt" -> "f (2).txt" ...
    def dedupe_path(path)
      return path unless ::File.exist?(path)

      ext = ::File.extname(path)
      stem = path.delete_suffix(ext)
      attempt = 1
      loop do
        candidate = "#{stem} (#{attempt})#{ext}"
        return candidate unless ::File.exist?(candidate)

        attempt += 1
      end
    end

    # Crude HTML-to-text fallback, used only when a message has an HTML body
    # but no plain-text body: drops script/style blocks, turns <br> and
    # closing block tags into newlines, removes remaining tags, decodes
    # entities and tidies whitespace.
    def html_to_text(html)
      markup = strip_blocks(html)
      with_breaks = markup.gsub(/<br\s*\/?>/i, "\n").gsub(%r{</(p|div|tr|li|h[1-6])>}i, "\n")
      plain = decode_entities(with_breaks.gsub(/<[^>]+>/, ""))
      plain.gsub(/[ \t]+\n/, "\n").gsub(/\n{3,}/, "\n\n").strip
    end

    # Decodes named, decimal and hex entities in one pass, so "&amp;#65;"
    # yields "&#65;" rather than "A". Code points above U+10FFFF or in the
    # surrogate range are replaced with U+FFFD instead of raising.
    def decode_entities(text)
      text.gsub(/&(?:(amp|lt|gt|quot|apos|nbsp)|#(\d+)|#x(\h+));/) do
        named, decimal, hex = Regexp.last_match.captures
        next ENTITIES[named] if named

        codepoint = decimal ? decimal.to_i : hex.to_i(16)
        if codepoint > 0x10FFFF || (0xD800..0xDFFF).cover?(codepoint)
          "\u{FFFD}"
        else
          codepoint.chr(Encoding::UTF_8)
        end
      end
    end

    # Removes <script> and <style> blocks with an index-based scan rather
    # than one back-referencing regex. An opening tag without a matching
    # close drops everything from that tag to the end of the input.
    def strip_blocks(html)
      kept = +""
      cursor = 0
      opener = %r{<(script|style)\b}i
      while (found = html.match(opener, cursor))
        start = found.begin(0)
        tag = found[1]
        kept << html[cursor...start]
        closer = html.index(%r{</#{tag}\s*>}i, start)
        closing_gt = closer && html.index(">", closer)
        cursor = closing_gt ? closing_gt + 1 : html.length
      end
      kept << html[cursor..].to_s
    end
    private_class_method :strip_blocks
  end
end
@@ -0,0 +1,3 @@
1
module MsgExtractor
  # Gem version string. Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,63 @@
1
+ require_relative "msg_extractor/version"
2
+ require_relative "msg_extractor/errors"
3
+ require_relative "msg_extractor/cfbf/header"
4
+ require_relative "msg_extractor/cfbf/fat"
5
+ require_relative "msg_extractor/cfbf/directory"
6
+ require_relative "msg_extractor/cfbf/file"
7
+ require_relative "msg_extractor/mapi/ptag"
8
+ require_relative "msg_extractor/mapi/decoders"
9
+ require_relative "msg_extractor/mapi/property_store"
10
+ require_relative "msg_extractor/mapi/named_property_map"
11
+ require_relative "msg_extractor/rtf/compressed_rtf"
12
+ require_relative "msg_extractor/rtf/decapsulator"
13
+ require_relative "msg_extractor/headers"
14
+ require_relative "msg_extractor/recipient"
15
+ require_relative "msg_extractor/util"
16
+ require_relative "msg_extractor/attachment"
17
+ require_relative "msg_extractor/message_object"
18
+ require_relative "msg_extractor/message"
19
+ require_relative "msg_extractor/contact"
20
+ require_relative "msg_extractor/appointment"
21
+ require_relative "msg_extractor/task"
22
+
23
module MsgExtractor
  # Ordered [pattern, model class name] pairs; the first pattern matching the
  # MAPI message class decides which model is built.
  DISPATCH = [
    [/\A(ipm\.note|report)/i, :Message],
    [/\Aipm\.(contact|distlist)/i, :Contact],
    [/\A(ipm\.appointment|ipm\.schedule\.meeting)/i, :Appointment],
    # Note: intentionally also matches IPM.TaskRequest.* (delegation messages) per the spec's IPM.Task* scope.
    [/\Aipm\.task/i, :Task]
  ].freeze

  # Opens a .msg file (path, binary String, or IO) and returns the model
  # object matching its MAPI message class.
  #
  # strict: when false, a missing or unsupported message class yields a
  # generic MessageObject instead of an exception.
  #
  # Raises InvalidFormatError when the container has no
  # "__properties_version1.0" entry.
  def self.open(source, strict: true)
    cfbf = Cfbf::File.read(source)
    return from_storage(cfbf, cfbf.root, kind: :root, strict: strict) if cfbf.entry("__properties_version1.0")

    raise InvalidFormatError, "OLE file does not contain MSG property streams"
  end

  # Builds the right model class for a message storage (the file root or an
  # embedded message). Used internally by open and Attachment#message.
  #
  # In strict mode raises InvalidFormatError when the message class is
  # missing and UnsupportedTypeError when no DISPATCH pattern matches; in
  # lenient mode both cases return the generic MessageObject.
  def self.from_storage(cfbf, storage, named: nil, kind: :root, strict: true)
    generic = MessageObject.new(cfbf, storage: storage, named: named, kind: kind)
    message_class = generic.message_class

    if message_class.nil?
      return generic unless strict

      raise InvalidFormatError, "MSG file has no message class"
    end

    _pattern, model_name = DISPATCH.find { |pattern, _| pattern.match?(message_class) }
    unless model_name
      return generic unless strict

      raise UnsupportedTypeError, "unsupported message class #{message_class.inspect}"
    end

    # Reuse what the generic object already resolved for the specific model.
    const_get(model_name).new(
      cfbf,
      storage: storage,
      named: generic.named,
      kind: kind,
      properties: generic.properties
    )
  end
end
metadata ADDED
@@ -0,0 +1,74 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: msg_extractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Bart Duchesne
8
+ bindir: exe
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies: []
12
+ description: 'Parses Outlook .msg (OLE2/CFBF) files into structured Ruby objects:
13
+ subject, recipients, bodies (text/HTML/RTF), headers and attachments. No native
14
+ extensions, no runtime dependencies.'
15
+ email:
16
+ - bduc@dyndaco.be
17
+ executables:
18
+ - msg_extractor
19
+ extensions: []
20
+ extra_rdoc_files: []
21
+ files:
22
+ - CHANGELOG.md
23
+ - LICENSE.txt
24
+ - README.md
25
+ - exe/msg_extractor
26
+ - lib/msg_extractor.rb
27
+ - lib/msg_extractor/appointment.rb
28
+ - lib/msg_extractor/attachment.rb
29
+ - lib/msg_extractor/cfbf/directory.rb
30
+ - lib/msg_extractor/cfbf/fat.rb
31
+ - lib/msg_extractor/cfbf/file.rb
32
+ - lib/msg_extractor/cfbf/header.rb
33
+ - lib/msg_extractor/cli.rb
34
+ - lib/msg_extractor/contact.rb
35
+ - lib/msg_extractor/errors.rb
36
+ - lib/msg_extractor/headers.rb
37
+ - lib/msg_extractor/mapi/decoders.rb
38
+ - lib/msg_extractor/mapi/named_property_map.rb
39
+ - lib/msg_extractor/mapi/property_store.rb
40
+ - lib/msg_extractor/mapi/ptag.rb
41
+ - lib/msg_extractor/message.rb
42
+ - lib/msg_extractor/message_object.rb
43
+ - lib/msg_extractor/recipient.rb
44
+ - lib/msg_extractor/rtf/compressed_rtf.rb
45
+ - lib/msg_extractor/rtf/decapsulator.rb
46
+ - lib/msg_extractor/task.rb
47
+ - lib/msg_extractor/util.rb
48
+ - lib/msg_extractor/version.rb
49
+ homepage: https://github.com/bduc/msg-extractor-ruby
50
+ licenses:
51
+ - MIT
52
+ metadata:
53
+ source_code_uri: https://github.com/bduc/msg-extractor-ruby
54
+ changelog_uri: https://github.com/bduc/msg-extractor-ruby/blob/main/CHANGELOG.md
55
+ bug_tracker_uri: https://github.com/bduc/msg-extractor-ruby/issues
56
+ rubygems_mfa_required: 'true'
57
+ rdoc_options: []
58
+ require_paths:
59
+ - lib
60
+ required_ruby_version: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: '3.1'
65
+ required_rubygems_version: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ requirements: []
71
+ rubygems_version: 4.0.3
72
+ specification_version: 4
73
+ summary: Pure Ruby parser for Microsoft Outlook .msg files
74
+ test_files: []