RubyGems - mhtml - Versions diffs - 0.1.0 - Mend

mhtml 0.1.0

Files changed (17) hide show

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 7bd65b32102f1862282c6980695730474a60011f
+  data.tar.gz: 96c49f4a7a9e1132d93c89dc7cb27b03f746b179
+SHA512:
+  metadata.gz: c2c2effc9cac20e3c6284f5c61137b553c8b0115bbc6540bd9c15ce97f2cefd541cba9cd9e1a0ede97010e2f7465074897233ee2d3e736172839f27dd29fc5d1
+  data.tar.gz: e4015b53919c0d6a29caf8c11df08ef6a8188eaea2459ca07460ff127df0d287b40eadc4e39e3c5985ef7c1d938ae35c69d5b8b7a0a5897ecf01d437f9167c68

data/.gitignore ADDED Viewed

@@ -0,0 +1,14 @@
+/.bundle/
+/.yardoc
+/Gemfile.lock
+/_yardoc/
+/coverage/
+/doc/
+/pkg/
+/spec/reports/
+/tmp/
+.byebug_history
+# rspec failure tracking
+.rspec_status

data/.rspec ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ --format documentation
2	+ --color

data/.travis.yml ADDED Viewed

@@ -0,0 +1,5 @@
+sudo: false
+language: ruby
+rvm:
+  - 2.3.1
+before_install: gem install bundler -v 1.14.6

data/Gemfile ADDED Viewed

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in mhtml.gemspec
+gemspec

data/README.md ADDED Viewed

@@ -0,0 +1,97 @@
+# Mhtml
+A ruby gem for parsing MHTML.
+Uses the NodeJS C HTTP Parser under the hood (thanks to @cotag for the gem).
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'mhtml'
+```
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install mhtml
+## Usage
+Two interfaces are provided - all at once, or chunked.
+### All at once
+For when you have all of the data in memory.
+```ruby
+source = File.open('/file/path.mht').read
+doc = Mhtml::RootDocument.new(source)
+doc.headers.each { |h| puts h }
+# body is decoded from quoted-printable, and encoded according to the charset header
+puts doc.body
+doc.sub_docs.each { |s| puts s }
+```
+### Chunked
+For when source data is being streamed, or when concerned about memory usage.
+```ruby
+doc = Mhtml::RootDocument.new
+doc.on_header { |h| handle_header(h) } # yields each header
+# yields body, possibly in parts
+doc.on_body do |b|
+  encoding = doc.encoding
+  handle_body(b)
+end
+doc.on_subdoc_begin { handle_subdoc_begin } # yields nil on each subdoc begin
+doc.on_subdoc_header { |h| handle_subdoc_header(h) } # yields each subdoc header
+doc.on_subdoc_body { |b| handle_subdoc_body(b) } # yields each subdoc's body, possibly in parts
+doc.on_subdoc_complete { handle_subdoc_complete } # yields nil on each subdoc complete
+File.read('/file/path.mht').scan(/.{1,128}/m).each do |chunk|
+  doc << chunk
+end
+```
+### Headers
+The header class looks like this (portrayed as a hash):
+```ruby
+# Content-Type: multipart/related; charset="windows-1252"; boundary="----=_NextPart_01C74319.B7EA56A0"
+{
+  key: 'Content-Type',
+  values: [
+    { key: nil, value: 'multipart/related' },
+    { key: 'charset', value: 'windows-1252' },
+    { key: 'boundary', value: '----=_NextPart_01C74319.B7EA56A0' }
+  ]
+}
+```
+## TODO
+- Revisit spec fixtures - either use existing solution or break out to separate
+  gem
+- Build up body of fixtures using MHTML from various sources
+## Development
+After checking out the repo, run `bin/setup` to install dependencies. Then, run
+`rake spec` to run the tests. You can also run `bin/console` for an interactive
+prompt that will allow you to experiment.
+To install this gem onto your local machine, run `bundle exec rake install`.
+To release a new version, update the version number in `version.rb`, and then
+run `bundle exec rake release`, which will create a git tag for the version,
+push git commits and tags, and push the `.gem` file to
+[rubygems.org](https://rubygems.org).
+## Contributing
+Bug reports and pull requests are welcome on GitHub at
+https://github.com/benjineering/mhtml_rb.

data/Rakefile ADDED Viewed

@@ -0,0 +1,6 @@
+require "bundler/gem_tasks"
+require "rspec/core/rake_task"
+RSpec::Core::RakeTask.new(:spec)
+task :default => :spec

data/bin/console ADDED Viewed

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+require 'bundler/setup'
+require 'mhtml'
+require 'byebug'
+require 'irb'
+Dir.glob('spec/support/**/*.rb') do |path|
+  mod = path.gsub(/\.rb\Z/, '')
+  require_relative "../#{mod}"
+end
+IRB.start(__FILE__)

data/bin/setup ADDED Viewed

@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+set -euo pipefail # abort on errors, unset variables and failed pipeline stages
+IFS=$'\n\t' # split words on newlines and tabs only
+set -vx # echo each line as it is read and each command as it runs
+bundle install # install the gem's dependencies with Bundler
+# Do any other automated setup that you need to do here

data/lib/mhtml/document.rb ADDED Viewed

@@ -0,0 +1,135 @@
+require 'http-parser'
+module Mhtml
+  class Document
+    attr_reader :chunked, :parser
+    attr_accessor :headers, :body, :is_quoted_printable, :encoding
+    def initialize(str = nil)
+      @chunked = !str.is_a?(String)
+      @header_key = nil
+      @header_value_lines = nil
+      @is_quoted_printable = false
+      @encoding = nil
+      @request = HttpParser::Parser.new_instance { |inst| inst.type = :response }
+      @parser = HttpParser::Parser.new do |parser|
+        parser.on_header_field { |inst, data| handle_header_field(inst, data) }
+        parser.on_header_value { |inst, data| handle_header_value(inst, data) }
+        parser.on_body { |inst, data| handle_body(inst, data) }
+        parser.on_message_begin { |inst| handle_message_begin(inst) }
+        parser.on_message_complete { |inst| handle_message_complete(inst) }
+      end
+      @parser.parse(@request, Mhtml::STATUS_LINE)
+      unless @chunked
+        @headers = []
+        @body = ''
+        @parser.parse(@request, str)
+      end
+    end
+    def <<(chunk)
+      @parser.parse(@request, chunk)
+    end
+    def ==(other)
+      @headers == other.headers &&
+        @body.gsub(/\r\n/, "\n").strip == other.body.gsub(/\r\n/, "\n").strip
+    end
+    def on_header
+      @headers_proc = Proc.new
+    end
+    def on_body
+      @body_proc = Proc.new
+    end
+    def header(key)
+      header = nil
+      @headers.each do |h|
+        if h.key == key
+          header = h
+          break
+        end
+      end
+      header
+    end
+    # for testing only = no spec implemented
+    def to_s
+      @headers.join(LINE_BREAK) + Mhtml::DOUBLE_LINE_BREAK + @body
+    end
+    private
+    def handle_header_field(inst, data)
+      maybe_create_header
+      @header_key = data
+      @header_value_lines = []
+    end
+    def handle_header_value(inst, data)
+      @header_value_lines << data
+    end
+    def handle_body(inst, data)
+      maybe_create_header
+      decoded = decode(data)
+      if @chunked
+        @body_proc.call(decoded) unless @body_proc.nil?
+      else
+        @body.force_encoding(@encoding) if @body.empty? && !@encoding.nil?
+        @body += decoded
+      end
+    end
+    def handle_message_begin(inst)
+    end
+    def handle_message_complete(inst)
+    end
+    def maybe_create_header
+      unless @header_key.nil?
+        header = HttpHeader.new(@header_key, @header_value_lines)
+        @headers << header unless @chunked
+        if header.key == 'Content-Type'
+          boundary = header.value('boundary')
+          @boundary = boundary.value unless boundary.nil?
+          charset = header.value('charset')
+          unless charset.nil?
+            @encoding = Encoding.find(charset.value) rescue nil
+          end
+        elsif header.key == 'Content-Transfer-Encoding'
+          value = header.values.first
+          if !value.nil? && value.value == 'quoted-printable'
+            @is_quoted_printable = true
+          end
+        end
+        @headers_proc.call(header) unless @headers_proc.nil?
+        @header_key = nil
+        @header_value_lines = []
+      end
+    end
+    def decode(str)
+      str = str.unpack1('M*') if @is_quoted_printable
+      str = str.force_encoding(@encoding) unless @encoding.nil?
+      str
+    end
+  end
+end

data/lib/mhtml/http_header.rb ADDED Viewed

@@ -0,0 +1,103 @@
+module Mhtml
+  class HttpHeader
+    require 'string'
+    attr_accessor :key, :values
+    KEY_VALUE_SEP = ':'.freeze
+    VALUE_SEP = ';'.freeze
+    def initialize(key_or_hash, value_lines = nil)
+      if key_or_hash.is_a?(Hash)
+        @key = key_or_hash[:key]
+        @values = key_or_hash[:values]
+        return
+      end
+      @key = key_or_hash
+      @values = []
+      values_str = value_lines.join('')
+      values_str.split(VALUE_SEP).each do |val_str|
+        val_str.strip!
+        val = Value.new(val_str)
+        if val.nil?
+          raise "Invalid value:\n#{val_str}\n\nFrom string:\n#{val_str}"
+        end
+        @values << val
+      end
+    end
+    def ==(other)
+      @key == other.key && @values == other.values
+    end
+    def value(key)
+      value = nil
+      @values.each do |v|
+        if v.key == key
+          value = v
+          break
+        end
+      end
+      value
+    end
+    # following methods are for debugging only - no spec implemented
+    def to_s
+      "#{@key}#{KEY_VALUE_SEP} #{@values.join(VALUE_SEP + ' ')}"
+    end
+    def clone
+      vals = @values.collect { |v| v.clone }
+      HttpHeader.new(key: @key.clone, values: vals)
+    end
+    class Value
+      attr_reader :key, :value
+      # str examples:
+      # value
+      # key="value"
+      def initialize(str_or_hash)
+        if str_or_hash.is_a?(Hash)
+          @key = str_or_hash[:key]
+          @value = str_or_hash[:value]
+          return
+        end
+        str = str_or_hash
+        split_i = str.index('=')
+        @key = str[0, split_i].strip unless split_i.nil?
+        @value =
+        if split_i.nil?
+          str.strip
+        else
+          str[split_i + 1, str.length - 1].strip.strip_other('"')
+        end
+      end
+      def ==(other)
+        @key == other.key && @value == other.value
+      end
+      # following methods are for debugging only - no spec implemented
+      def to_s
+        if @key.nil?
+          @value
+        else
+          %Q[#{@key}="#{@value}"]
+        end
+      end
+      def clone
+        Value.new(key: @key.clone, value: @value.clone)
+      end
+    end
+  end
+end

data/lib/mhtml/root_document.rb ADDED Viewed

@@ -0,0 +1,123 @@
+module Mhtml
+  class RootDocument < Document
+    BOUNDARY_PREFIX = '--'.freeze
+    attr_accessor :boundary, :sub_docs
+    def initialize(str = nil)
+      @sub_docs = []
+      super(str)
+    end
+    def ==(other)
+      super(other) && @boundary == other.boundary && @sub_docs == other.sub_docs
+    end
+    def on_subdoc_begin
+      @subdoc_begin_proc = Proc.new
+    end
+    def on_subdoc_header
+      @subdoc_header_proc = Proc.new
+    end
+    def on_subdoc_body
+      @subdoc_body_proc = Proc.new
+    end
+    def on_subdoc_complete
+      @subdoc_complete_proc = Proc.new
+    end
+    def boundary_str
+      "#{Mhtml::LINE_BREAK}#{BOUNDARY_PREFIX}#{@boundary}#{Mhtml::LINE_BREAK}"
+    end
+    def last_boundary_str
+      "#{Mhtml::LINE_BREAK}#{BOUNDARY_PREFIX}#{@boundary}#{BOUNDARY_PREFIX}#{Mhtml::LINE_BREAK}"
+    end
+    # for testing only = no spec implemented
+    def to_s
+      doc_sep = Mhtml::DOUBLE_LINE_BREAK + BOUNDARY_PREFIX + @boundary +
+        Mhtml::LINE_BREAK
+      super + doc_sep + @sub_docs.join(doc_sep)
+    end
+    private
+    def handle_body(inst, data)
+      maybe_create_header
+      boundary = boundary_str
+      unless @split.nil?
+        data = @split + data
+        @split = nil
+      end
+      parts = data.split(boundary)
+      unless @body_read
+        @body_read = parts.length > 1
+        super(inst, parts.shift)
+      end
+      parts.each_with_index do |part, i|
+        end_boundary_pos = part.rindex(last_boundary_str)
+        is_last_subdoc = !end_boundary_pos.nil?
+        part = part[0..(end_boundary_pos - 1)] if is_last_subdoc
+        if @chunked
+          is_last_part = i + 1 == parts.length
+          handle_chunked_body(part, is_last_part, is_last_subdoc)
+        else
+          @sub_docs << Document.new(part)
+        end
+      end
+    end
+    def handle_chunked_body(chunk, is_last_part, is_last_subdoc)
+      if @chunked_sub_doc.nil?
+        create_chunked_subdoc
+        @subdoc_begin_proc.call unless @subdoc_begin_proc.nil?
+      end
+      if is_last_part
+        split_idx = chunk.rindex_of_split(boundary_str)
+        if split_idx.nil?
+          quoted_matches = chunk.match(/=[0-9A-F\r\n]{0,2}\Z/)
+          unless quoted_matches.nil?
+            split_idx = chunk.length - quoted_matches[0].length + 1
+          end
+        end
+        unless split_idx.nil?
+          @split = chunk[split_idx..(chunk.length - 1)]
+          chunk = chunk[0..(split_idx - 1)]
+        end
+      end
+      @chunked_sub_doc << chunk
+      unless is_last_part && !is_last_subdoc
+        @sub_docs << @chunked_sub_doc
+        @chunked_sub_doc = nil
+        @subdoc_complete_proc.call unless @subdoc_complete_proc.nil?
+      end
+    end
+    def create_chunked_subdoc
+      @chunked_sub_doc = Document.new
+      @chunked_sub_doc.on_header do |header|
+        @subdoc_header_proc.call(header) unless @subdoc_header_proc.nil?
+      end
+      @chunked_sub_doc.on_body do |body|
+        @subdoc_body_proc.call(body) unless @subdoc_body_proc.nil?
+      end
+    end
+  end
+end

data/lib/mhtml/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module Mhtml # namespace shared by the gem's files
+  VERSION = '0.1.0' # gem version; mhtml.gemspec uses it for spec.version
+end

data/lib/mhtml.rb ADDED Viewed

@@ -0,0 +1,11 @@
+module Mhtml
+  LINE_BREAK = "\r\n".freeze # CRLF line terminator
+  DOUBLE_LINE_BREAK = "#{LINE_BREAK}#{LINE_BREAK}".freeze # two CRLFs; Document#to_s puts it between headers and body
+  STATUS_LINE = "HTTP/1.1 200 OK#{LINE_BREAK}".freeze # fed to the HTTP parser before a document's own data
+end
+require 'mhtml/document'
+require 'mhtml/http_header'
+require 'mhtml/root_document'
+require 'mhtml/version'

data/lib/string.rb ADDED Viewed

@@ -0,0 +1,64 @@
+class String
+  def each_index(x)
+    raise 'Block required' unless block_given?
+    return if empty? || x.nil?
+    i = 0
+    while true
+      i = index(x, i)
+      return if i.nil?
+      yield i
+      i += 1
+      return if i + 1 == length
+    end
+  end
+  def strip_other(str)
+    start_i = 0
+    new_length = length
+    if start_with?(str)
+      start_i += str.length
+      new_length -= str.length
+    end
+    if end_with?(str)
+      new_length -= str.length
+    end
+    self[start_i, new_length]
+  end
+  def underscore
+    self.gsub(/::/, '/').
+    gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
+    gsub(/([a-z\d])([A-Z])/,'\1_\2').
+    tr("-", "_").
+    downcase
+  end
+  def index_of_split(other)
+    last_idx = (other.length - 1)
+    (0..last_idx).step do |i|
+      part = other[i..last_idx]
+      return part.length - 1 if start_with?(part)
+    end
+    nil
+  end
+  def rindex_of_split(other)
+    last_idx = (other.length - 1)
+    (0..last_idx).step do |i|
+      part = other[0..(last_idx - i)]
+      return length - part.length if end_with?(part)
+    end
+    nil
+  end
+end

data/mhtml.gemspec ADDED Viewed

@@ -0,0 +1,30 @@
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'mhtml/version'
+Gem::Specification.new do |spec|
+  spec.name          = 'mhtml'
+  spec.version       = Mhtml::VERSION
+  spec.authors       = [ 'Ben Williams' ]
+  spec.email         = [ '8enwilliams@gmail.com' ]
+  spec.summary       = 'A Ruby gem for reading and extracting MHTML files'
+  spec.description   = 'A Ruby gem for reading and extracting MHTML files'
+  spec.homepage      = 'https://github.com/benjineering/mhtml_rb'
+  spec.licenses      = [ 'MIT', 'GPL-2' ]
+  spec.metadata[ 'allowed_push_host' ] = 'https://rubygems.org'
+  spec.files         = `git ls-files -z`.split("\x0").reject do |f|
+    f.match(%r{^(test|spec|features)/})
+  end
+  spec.require_paths = [ 'lib' ]
+  spec.add_development_dependency 'bundler', '~> 1.14'
+  spec.add_development_dependency 'rake', '~> 10.0'
+  spec.add_development_dependency 'rspec', '~> 3.0'
+  spec.add_development_dependency 'byebug'
+  spec.add_dependency 'http-parser'
+end

metadata ADDED Viewed

@@ -0,0 +1,131 @@
+--- !ruby/object:Gem::Specification
+name: mhtml
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Ben Williams
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2017-09-15 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.14'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.14'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+- !ruby/object:Gem::Dependency
+  name: byebug
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: http-parser
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+description: A Ruby gem for reading and extracting MHTML files
+email:
+- 8enwilliams@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- ".rspec"
+- ".travis.yml"
+- Gemfile
+- README.md
+- Rakefile
+- bin/console
+- bin/setup
+- lib/mhtml.rb
+- lib/mhtml/document.rb
+- lib/mhtml/http_header.rb
+- lib/mhtml/root_document.rb
+- lib/mhtml/version.rb
+- lib/string.rb
+- mhtml.gemspec
+homepage: https://github.com/benjineering/mhtml_rb
+licenses:
+- MIT
+- GPL-2
+metadata:
+  allowed_push_host: https://rubygems.org
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.6.11
+signing_key:
+specification_version: 4
+summary: A Ruby gem for reading and extracting MHTML files
+test_files: []