RubyGems - msg_extractor - Versions diffs - 0.1.0 - Mend

msg_extractor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +9 -0
data/LICENSE.txt +21 -0
data/README.md +88 -0
data/exe/msg_extractor +5 -0
data/lib/msg_extractor/appointment.rb +20 -0
data/lib/msg_extractor/attachment.rb +57 -0
data/lib/msg_extractor/cfbf/directory.rb +84 -0
data/lib/msg_extractor/cfbf/fat.rb +75 -0
data/lib/msg_extractor/cfbf/file.rb +114 -0
data/lib/msg_extractor/cfbf/header.rb +40 -0
data/lib/msg_extractor/cli.rb +77 -0
data/lib/msg_extractor/contact.rb +23 -0
data/lib/msg_extractor/errors.rb +12 -0
data/lib/msg_extractor/headers.rb +39 -0
data/lib/msg_extractor/mapi/decoders.rb +59 -0
data/lib/msg_extractor/mapi/named_property_map.rb +74 -0
data/lib/msg_extractor/mapi/property_store.rb +106 -0
data/lib/msg_extractor/mapi/ptag.rb +55 -0
data/lib/msg_extractor/message.rb +6 -0
data/lib/msg_extractor/message_object.rb +100 -0
data/lib/msg_extractor/recipient.rb +34 -0
data/lib/msg_extractor/rtf/compressed_rtf.rb +88 -0
data/lib/msg_extractor/rtf/decapsulator.rb +206 -0
data/lib/msg_extractor/task.rb +25 -0
data/lib/msg_extractor/util.rb +72 -0
data/lib/msg_extractor/version.rb +3 -0
data/lib/msg_extractor.rb +63 -0
metadata +74 -0

data/lib/msg_extractor/rtf/decapsulator.rb ADDED Viewed

@@ -0,0 +1,206 @@
+module MsgExtractor
+  module Rtf
+    # Extracts HTML encapsulated in RTF ([MS-OXRTFEX]). Targets well-formed
+    # Outlook-generated RTF; returns nil on anything it cannot handle.
+    class Decapsulator
+      SKIP_DESTINATIONS = %w[
+        fonttbl colortbl stylesheet info generator pntext listtable
+        listoverridetable themedata colorschememapping datastore latentstyles
+        xmlnstbl rsidtbl pgptbl background pict object header footer footnote
+      ].freeze
+      def self.html_from(rtf)
+        return nil unless rtf
+        return nil unless rtf.byteslice(0, 512).to_s.include?("\\fromhtml1")
+        new(rtf).run
+      rescue StandardError
+        nil # malformed RTF: html_body falls back to nil rather than raising
+      end
+      def initialize(rtf)
+        @rtf = rtf.b
+        @pos = 0
+        @out = +"".encode(Encoding::UTF_8)   # accumulates decoded UTF-8 text
+        @pending = +"".b                      # raw codepage bytes not yet decoded
+        @codepage = 1252
+        # Group-scoped state, saved on "{" and restored on "}".
+        @state = { suppress: false, destination: :normal, uc: 1 }
+        @stack = []
+      end
+      def run
+        while @pos < @rtf.bytesize
+          byte = @rtf.getbyte(@pos)
+          case byte
+          when 0x7B # {
+            @pos += 1
+            @stack.push(@state.dup)
+            handle_group_start
+          when 0x7D # }
+            @pos += 1
+            @state = @stack.pop || @state
+          when 0x5C # backslash
+            handle_control
+          when 0x0D, 0x0A # bare CR/LF are not document text in RTF
+            @pos += 1
+          else
+            @pos += 1
+            emit(byte.chr)
+          end
+        end
+        finalize
+      end
+      private
+      def suppressed?
+        @state[:destination] == :skip ||
+          (@state[:suppress] && @state[:destination] != :htmltag)
+      end
+      # Append raw codepage bytes to the pending buffer.
+      def emit(str)
+        @pending << str.b unless suppressed?
+      end
+      # Flush @pending: decode raw codepage bytes to UTF-8 and append to @out.
+      def flush_pending
+        return if @pending.empty?
+        enc = codepage_encoding
+        decoded = @pending.dup
+                          .force_encoding(enc)
+                          .encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
+        @out << decoded
+        @pending.clear
+      end
+      def handle_group_start
+        return unless @rtf.byteslice(@pos, 2) == "\\*"
+        saved = @pos
+        @pos += 2
+        word, _param = read_control_word
+        if word == "htmltag"
+          @state[:destination] = :htmltag
+        else
+          @pos = saved
+          @state[:destination] = :skip # \*\unknown: ignorable destination
+        end
+      end
+      def read_control_word
+        @pos += 1 # consume the backslash
+        word = +""
+        while @pos < @rtf.bytesize && letter?(@rtf.getbyte(@pos))
+          word << @rtf.getbyte(@pos)
+          @pos += 1
+        end
+        param = nil
+        if @pos < @rtf.bytesize && (digit?(@rtf.getbyte(@pos)) || @rtf.getbyte(@pos) == 0x2D)
+          digits = +""
+          if @rtf.getbyte(@pos) == 0x2D
+            digits << "-"
+            @pos += 1
+          end
+          while @pos < @rtf.bytesize && digit?(@rtf.getbyte(@pos))
+            digits << @rtf.getbyte(@pos)
+            @pos += 1
+          end
+          param = digits.to_i
+        end
+        # A single space after a control word is part of the control word.
+        @pos += 1 if @pos < @rtf.bytesize && @rtf.getbyte(@pos) == 0x20
+        [word, param]
+      end
+      def letter?(byte) = (byte >= 0x41 && byte <= 0x5A) || (byte >= 0x61 && byte <= 0x7A)
+      def digit?(byte) = byte >= 0x30 && byte <= 0x39
+      def handle_control
+        next_byte = @rtf.getbyte(@pos + 1)
+        if next_byte.nil?
+          @pos += 1
+        elsif letter?(next_byte)
+          word, param = read_control_word
+          control_word(word, param)
+        elsif next_byte == 0x27 # \'xx hex escape
+          @pos += 2
+          hex = @rtf.byteslice(@pos, 2)
+          @pos += 2
+          emit(hex.to_i(16).chr) if hex&.match?(/\A\h\h\z/)
+        else
+          @pos += 2
+          case next_byte.chr
+          when "{", "}", "\\" then emit(next_byte.chr)
+          when "~" then emit_unicode(0x00A0) # non-breaking space
+          end # \- \_ \* etc.: no text output
+        end
+      end
+      def control_word(word, param)
+        case word
+        when "ansicpg"
+          flush_pending
+          @codepage = param || 1252
+        when "htmlrtf" then @state[:suppress] = param != 0
+        when "uc" then @state[:uc] = param || 1
+        when "u"
+          raise CorruptFileError, "\\u without codepoint" if param.nil?
+          codepoint = param
+          codepoint += 65_536 if codepoint.negative?
+          emit_unicode(codepoint)
+          skip_unicode_fallback(@state[:uc])
+        when "par", "line" then emit("\r\n")
+        when "tab" then emit("\t")
+        when "lquote" then emit_unicode(0x2018)
+        when "rquote" then emit_unicode(0x2019)
+        when "ldblquote" then emit_unicode(0x201C)
+        when "rdblquote" then emit_unicode(0x201D)
+        when "bullet" then emit_unicode(0x2022)
+        when "endash" then emit_unicode(0x2013)
+        when "emdash" then emit_unicode(0x2014)
+        when *SKIP_DESTINATIONS then @state[:destination] = :skip
+        end
+      end
+      def emit_unicode(codepoint)
+        return if suppressed?
+        flush_pending
+        char = [codepoint].pack("U")
+        @out << char
+      rescue RangeError, ArgumentError
+        @out << "?"
+      end
+      # After \uN, skip the fallback representation (\ucN chars, default 1).
+      # A control word counts as one fallback character and must be consumed
+      # in full (per RTF spec; the control word + trailing space is one unit).
+      def skip_unicode_fallback(count)
+        count.times do
+          byte = @rtf.getbyte(@pos)
+          break if byte.nil? || byte == 0x7B || byte == 0x7D
+          if byte == 0x5C
+            next_byte = @rtf.getbyte(@pos + 1)
+            if next_byte && letter?(next_byte)
+              read_control_word # consumes backslash + word + optional digit param + space
+            elsif next_byte == 0x27 # \'xx — counts as one fallback char
+              @pos += 4
+            else
+              @pos += 2
+            end
+          else
+            @pos += 1
+          end
+        end
+      end
+      def codepage_encoding
+        MsgExtractor::Mapi::Decoders::CODE_PAGES.fetch(@codepage, "Windows-1252")
+      end
+      def finalize
+        flush_pending
+        @out.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
+      end
+    end
+  end
+end

data/lib/msg_extractor/task.rb ADDED Viewed

@@ -0,0 +1,25 @@
+module MsgExtractor
+  # IPM.Task items. Fields come from PSETID_Task named properties.
+  class Task < MessageObject
+    PSETID_TASK = "00062003-0000-0000-c000-000000000046"
+    LID_START_DATE = 0x8104
+    LID_DUE_DATE = 0x8105
+    LID_STATUS = 0x8101
+    LID_PERCENT_COMPLETE = 0x8102
+    LID_COMPLETE = 0x811C
+    LID_OWNER = 0x811F
+    STATUSES = {
+      0 => :not_started, 1 => :in_progress, 2 => :complete,
+      3 => :waiting, 4 => :deferred
+    }.freeze
+    def starts_on = named_value(PSETID_TASK, LID_START_DATE)
+    def due_on = named_value(PSETID_TASK, LID_DUE_DATE)
+    def status = STATUSES[named_value(PSETID_TASK, LID_STATUS)]
+    def percent_complete = named_value(PSETID_TASK, LID_PERCENT_COMPLETE)
+    def complete? = named_value(PSETID_TASK, LID_COMPLETE) == true
+    def owner = named_value(PSETID_TASK, LID_OWNER)
+  end
+end

data/lib/msg_extractor/util.rb ADDED Viewed

@@ -0,0 +1,72 @@
+module MsgExtractor
+  module Util
+    module_function
+    def sanitize_filename(name)
+      cleaned = name.to_s.gsub(%r{[\x00-\x1F\\/:*?"<>|]}, "_").strip
+      cleaned = "unnamed" if cleaned.empty? || cleaned.match?(/\A\.+\z/)
+      cleaned
+    end
+    # "f.txt" -> "f (1).txt" -> "f (2).txt" until the path is free.
+    def dedupe_path(path)
+      return path unless ::File.exist?(path)
+      extension = ::File.extname(path)
+      base = path.delete_suffix(extension)
+      counter = 1
+      counter += 1 while ::File.exist?("#{base} (#{counter})#{extension}")
+      "#{base} (#{counter})#{extension}"
+    end
+    # Crude tag-stripping fallback used only when a message has an HTML body
+    # but no plain-text body.
+    def html_to_text(html)
+      text = strip_blocks(html)
+               .gsub(/<br\s*\/?>/i, "\n")
+               .gsub(%r{</(p|div|tr|li|h[1-6])>}i, "\n")
+               .gsub(/<[^>]+>/, "")
+      decode_entities(text).gsub(/[ \t]+\n/, "\n").gsub(/\n{3,}/, "\n\n").strip
+    end
+    ENTITIES = {
+      "amp" => "&", "lt" => "<", "gt" => ">", "quot" => '"',
+      "apos" => "'", "nbsp" => " "
+    }.freeze
+    # Single-pass entity decoder. Handles named entities, decimal numeric
+    # references, and hex numeric references. Hostile codepoints (out-of-range
+    # or surrogate) are replaced with the Unicode replacement character instead
+    # of raising. Avoids double-decoding: &amp;#65; → "&#65;", not "A".
+    def decode_entities(text)
+      text.gsub(/&(?:(amp|lt|gt|quot|apos|nbsp)|#(\d+)|#x(\h+));/) do
+        if (name = Regexp.last_match(1))
+          ENTITIES[name]
+        else
+          cp = Regexp.last_match(2)&.to_i || Regexp.last_match(3).to_i(16)
+          cp <= 0x10FFFF && !(0xD800..0xDFFF).cover?(cp) ? cp.chr(Encoding::UTF_8) : "\u{FFFD}"
+        end
+      end
+    end
+    # Linear index-based stripper for <script> and <style> blocks. The naive
+    # back-reference regex /<(script|style)\b.*?<\/\1>/mi is O(n²) on unclosed
+    # tags because the engine backtracks across the entire remaining input for
+    # each opening tag it cannot close.
+    def strip_blocks(html)
+      out = +""
+      pos = 0
+      while (open_at = html.index(%r{<(script|style)\b}i, pos))
+        tag = Regexp.last_match(1)
+        out << html[pos...open_at]
+        close = html.index(%r{</#{tag}\s*>}i, open_at)
+        if close && (gt = html.index(">", close))
+          pos = gt + 1
+        else
+          pos = html.length
+        end
+      end
+      out << html[pos..].to_s
+    end
+    private_class_method :strip_blocks
+  end
+end

data/lib/msg_extractor/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module MsgExtractor
+  # Version string of this gem release.
+  VERSION = "0.1.0"
+end

data/lib/msg_extractor.rb ADDED Viewed

@@ -0,0 +1,63 @@
+require_relative "msg_extractor/version"
+require_relative "msg_extractor/errors"
+require_relative "msg_extractor/cfbf/header"
+require_relative "msg_extractor/cfbf/fat"
+require_relative "msg_extractor/cfbf/directory"
+require_relative "msg_extractor/cfbf/file"
+require_relative "msg_extractor/mapi/ptag"
+require_relative "msg_extractor/mapi/decoders"
+require_relative "msg_extractor/mapi/property_store"
+require_relative "msg_extractor/mapi/named_property_map"
+require_relative "msg_extractor/rtf/compressed_rtf"
+require_relative "msg_extractor/rtf/decapsulator"
+require_relative "msg_extractor/headers"
+require_relative "msg_extractor/recipient"
+require_relative "msg_extractor/util"
+require_relative "msg_extractor/attachment"
+require_relative "msg_extractor/message_object"
+require_relative "msg_extractor/message"
+require_relative "msg_extractor/contact"
+require_relative "msg_extractor/appointment"
+require_relative "msg_extractor/task"
+module MsgExtractor
+  DISPATCH = [
+    [/\A(ipm\.note|report)/i, :Message],
+    [/\Aipm\.(contact|distlist)/i, :Contact],
+    [/\A(ipm\.appointment|ipm\.schedule\.meeting)/i, :Appointment],
+    # Note: intentionally also matches IPM.TaskRequest.* (delegation messages) per the spec's IPM.Task* scope.
+    [/\Aipm\.task/i, :Task]
+  ].freeze
+  # Opens a .msg file (path, binary String, or IO) and returns the model
+  # object matching its MAPI message class.
+  #
+  # strict: when false, unknown/unsupported message classes return a generic
+  # MessageObject instead of raising.
+  def self.open(source, strict: true)
+    cfbf = Cfbf::File.read(source)
+    unless cfbf.entry("__properties_version1.0")
+      raise InvalidFormatError, "OLE file does not contain MSG property streams"
+    end
+    from_storage(cfbf, cfbf.root, kind: :root, strict: strict)
+  end
+  # Builds the right model class for a message storage (the file root or an
+  # embedded message). Used internally by open and Attachment#message.
+  def self.from_storage(cfbf, storage, named: nil, kind: :root, strict: true)
+    base = MessageObject.new(cfbf, storage: storage, named: named, kind: kind)
+    message_class = base.message_class
+    if message_class.nil?
+      raise InvalidFormatError, "MSG file has no message class" if strict
+      return base
+    end
+    match = DISPATCH.find { |pattern, _| pattern.match?(message_class) }
+    if match.nil?
+      if strict
+        raise UnsupportedTypeError, "unsupported message class #{message_class.inspect}"
+      end
+      return base
+    end
+    const_get(match[1]).new(cfbf, storage: storage, named: base.named, kind: kind, properties: base.properties)
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,74 @@
+--- !ruby/object:Gem::Specification
+name: msg_extractor
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Bart Duchesne
+bindir: exe
+cert_chain: []
+date: 1980-01-02 00:00:00.000000000 Z
+dependencies: []
+description: 'Parses Outlook .msg (OLE2/CFBF) files into structured Ruby objects:
+  subject, recipients, bodies (text/HTML/RTF), headers and attachments. No native
+  extensions, no runtime dependencies.'
+email:
+- bduc@dyndaco.be
+executables:
+- msg_extractor
+extensions: []
+extra_rdoc_files: []
+files:
+- CHANGELOG.md
+- LICENSE.txt
+- README.md
+- exe/msg_extractor
+- lib/msg_extractor.rb
+- lib/msg_extractor/appointment.rb
+- lib/msg_extractor/attachment.rb
+- lib/msg_extractor/cfbf/directory.rb
+- lib/msg_extractor/cfbf/fat.rb
+- lib/msg_extractor/cfbf/file.rb
+- lib/msg_extractor/cfbf/header.rb
+- lib/msg_extractor/cli.rb
+- lib/msg_extractor/contact.rb
+- lib/msg_extractor/errors.rb
+- lib/msg_extractor/headers.rb
+- lib/msg_extractor/mapi/decoders.rb
+- lib/msg_extractor/mapi/named_property_map.rb
+- lib/msg_extractor/mapi/property_store.rb
+- lib/msg_extractor/mapi/ptag.rb
+- lib/msg_extractor/message.rb
+- lib/msg_extractor/message_object.rb
+- lib/msg_extractor/recipient.rb
+- lib/msg_extractor/rtf/compressed_rtf.rb
+- lib/msg_extractor/rtf/decapsulator.rb
+- lib/msg_extractor/task.rb
+- lib/msg_extractor/util.rb
+- lib/msg_extractor/version.rb
+homepage: https://github.com/bduc/msg-extractor-ruby
+licenses:
+- MIT
+metadata:
+  source_code_uri: https://github.com/bduc/msg-extractor-ruby
+  changelog_uri: https://github.com/bduc/msg-extractor-ruby/blob/main/CHANGELOG.md
+  bug_tracker_uri: https://github.com/bduc/msg-extractor-ruby/issues
+  rubygems_mfa_required: 'true'
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '3.1'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 4.0.3
+specification_version: 4
+summary: Pure Ruby parser for Microsoft Outlook .msg files
+test_files: []