RubyGems - markitdown - Versions diffs - 0.0.1 - Mend

markitdown 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

data/.gitignore ADDED

@@ -0,0 +1,17 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp

data/Gemfile ADDED

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in markitdown.gemspec
+gemspec

data/LICENSE ADDED

@@ -0,0 +1,22 @@
+Copyright (c) 2012 Christopher Petersen
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED

@@ -0,0 +1,112 @@
+# Markitdown
+Markitdown is a Ruby library that converts HTML to Markdown. It's powered by Nokogiri. It supports:
+ * Ordered and unordered lists
+ * Nested lists
+ * Blockquotes
+ * Lists (and nested lists) inside of blockquotes
+ * Images
+ * Links
+As well as other tags.
+## Installation
+Add this line to your application's Gemfile:
+    gem 'markitdown'
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install markitdown
+## Usage
+To convert HTML to Markdown:
+```ruby
+Markitdown.from_html(html)
+```
+```Markitdown``` uses Nokogiri internally. If you already have a Nokogiri object, you can use ```from_nokogiri``` instead:
+```ruby
+Markitdown.from_nokogiri(nokogiri_node)
+```
+## Example
+From the specs:
+### HTML
+```html
+<html>
+  <head>
+    <title>Test Document</title>
+  </head>
+  <body>
+    <h1>Main Header</h1>
+    <p>
+      This <em>is</em> a <b>test</b>. It includes a <a href="http://www.google.com">link</a> as well as an image <img src="https://www.google.com/images/srpr/logo3w.png" alt="Google Logo" />
+      <ul>
+        <li>bullet 1</li>
+        <li>bullet 2</li>
+        <li>bullet 3</li>
+      </ul>
+    </p>
+    <hr/>
+    <h2>Subheader</h2>
+    <p>
+      This is paragraph two.
+      <ol>
+        <li>bullet 1</li>
+        <ul>
+          <li>Sub-bullet 1 <a href="http://github.com">Nested link</a>.</li>
+        </ul>
+        <li>bullet 2</li>
+        <li>bullet 3</li>
+      </ol>
+    </p>
+  </body>
+</html>
+```
+Gets converted to the following Markdown:
+```md
+# Main Header
+This *is* a **test**. It includes a [link](http://www.google.com) as well as an image ![Google Logo](https://www.google.com/images/srpr/logo3w.png)
+ * bullet 1
+ * bullet 2
+ * bullet 3
+***
+## Subheader
+This is paragraph two.
+ 1. bullet 1
+    * Sub-bullet 1 [Nested link](http://github.com).
+ 1. bullet 2
+ 1. bullet 3
+```
+## Contributing
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Added some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request

data/Rakefile ADDED

@@ -0,0 +1,8 @@
+#!/usr/bin/env rake
+require "bundler/gem_tasks"
+require 'rspec/core/rake_task'
+# `rake spec` runs the RSpec suite; `rake test` and a bare `rake` both delegate to it.
+RSpec::Core::RakeTask.new(:spec)
+[:test, :default].each { |alias_name| task alias_name => :spec }

data/lib/markitdown.rb ADDED

@@ -0,0 +1,166 @@
+require "markitdown/version"
+require "nokogiri"
+module Markitdown
+  def self.from_html(html)
+    from_nokogiri(Nokogiri::XML(html).root)
+  end
+  def self.from_nokogiri(node)
+    # gsub(/\n\s+\n/,"\n\n") - remove lines with nothing but space characters
+    # gsub(/\n{2,}/,"\n\n") - collapse any series of more an than 2 new lines down to 2
+    # gsub(/\t+/," ") - collapse consecutive tabs down to a single space. I use tabs to pad divs and span, this causes multiple nested spans and divs to ultimately be surrounded by a single space.
+    # gsub(/ ([\.\?])/,'\1') - removes a space before a period or question mark. Things like links get surrounded by spaces. If they appear at the end of a sentence, this makes sure the punctation isn't off.
+    self.parse_node(node).flatten.compact.join.gsub(/\n\s+\n/,"\n\n").gsub(/\n{2,}/,"\n\n").gsub(/\t+/," ").gsub(/ ([\.\?])/,'\1')
+  end
+  private
+  def self.parse_node(node, states=[])
+    results=[]
+    after = nil
+    states.unshift node.name.downcase
+    pre = prefix(states)
+    strip_contents = false
+    case node.name
+    when "head"
+      return []
+    when "title"
+      return []
+    when "style"
+      return []
+    when "div"
+      results << "\t"
+      after = "\t"
+    when "span"
+      results << "\t"
+      after = "\t"
+    when "p"
+      results << self.newline(pre, nil, 2)
+      after = self.newline(pre, nil,  2)
+    when "h1"
+      results << self.newline(pre, nil,  2)
+      results << "# "
+      after = self.newline(pre, nil,  2)
+    when "h2"
+      results << self.newline(pre, nil,  2)
+      results << "## "
+      after = self.newline(pre, nil,  2)
+    when "h3"
+      results << self.newline(pre, nil,  2)
+      results << "### "
+      after = self.newline(pre, nil,  2)
+    when "h4"
+      results << self.newline(pre, nil,  2)
+      results << "#### "
+      after = self.newline(pre, nil,  2)
+    when "h5"
+      results << self.newline(pre, nil,  2)
+      results << "##### "
+      after = self.newline(pre, nil,  2)
+    when "h6"
+      results << self.newline(pre, nil,  2)
+      results << "###### "
+      after = self.newline(pre, nil,  2)
+    when "hr"
+      results << self.newline(pre, nil,  2)
+      results << "***"
+      results << self.newline(pre, nil,  2)
+    when "br"
+      results << self.newline(pre, nil,  2)
+    when "em"
+      results << " *"
+      after = "* "
+    when "i"
+      results << " *"
+      after = "* "
+    when "strong"
+      results << " **"
+      after = "** "
+    when "b"
+      results << " **"
+      after = "** "
+    when "blockquote"
+      results << pre
+      after = "\n"
+    when "ol"
+      unless self.nested_list?(states)
+        results << self.newline(pre, nil)
+        after = "\n"
+      end
+    when "ul"
+      unless self.nested_list?(states)
+        results << self.newline(pre, nil)
+        after = "\n"
+      end
+    when "li"
+      results << "\n"
+      results << pre
+    when "a"
+      results << " ["
+      after = ["](#{node.attributes["href"].value}) "]
+      strip_content = true
+    when "img"
+      results << " !["
+      results << node.attributes["alt"].value if node.attributes["alt"]
+      results << "]("
+      results << node.attributes["src"].value if node.attributes["src"]
+      results << ") "
+    when "text"
+      results << node.text.strip.gsub("\n","").gsub(/ {2,}/," ")
+    end
+    node.children.each do |child|
+      contents = self.parse_node(child, states)
+      contents = contents.flatten.compact.join.strip if strip_content
+      results << contents
+    end
+    results << after
+    states.shift
+    results
+  end
+  def self.nested_list?(states)
+    result = false
+    states.each_with_index do |state, index|
+      next if index==0
+      result = true if ["ul","ol","blockquote"].include?(state)
+    end
+    result
+  end
+  def self.newline(pre, line, count=1)
+    result = []
+    count.times do
+      result << pre
+      result << line
+      result << "\n"
+    end
+    result
+  end
+  def self.prefix(states)
+    result = []
+    states.each_with_index do |state, index|
+      if state == "blockquote"
+        result.unshift(" > ")
+      end
+      next if index==0
+      if index==1
+        if states.first == "li"
+          if state == "ol"
+            result.unshift(" 1. ")
+          elsif state == "ul"
+            result.unshift(" * ")
+          end
+        end
+        next
+      end
+      case state
+      when "ol"
+        result.unshift("   ")
+      when "ul"
+        result.unshift("  ")
+      end
+    end
+    result
+  end
+end

data/lib/markitdown/version.rb ADDED

@@ -0,0 +1,3 @@
+module Markitdown
+  VERSION = "0.0.1" # Release version; markitdown.gemspec reads it as gem.version.
+end

data/markitdown.gemspec ADDED

@@ -0,0 +1,21 @@
+# -*- encoding: utf-8 -*-
+require File.expand_path('../lib/markitdown/version', __FILE__)
+Gem::Specification.new do |gem|
+  gem.authors       = ["Christopher Petersen"]
+  gem.email         = ["christopher.petersen@gmail.com"]
+  gem.description   = %q{A small library that uses Nokogiri to parse an HTML file and produce Markdown}
+  gem.summary       = %q{Converts HTML to Markdown}
+  gem.homepage      = ""
+  gem.add_dependency('nokogiri')
+  gem.add_development_dependency('rake')
+  gem.add_development_dependency('rspec')
+  gem.files         = `git ls-files`.split($\)
+  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
+  gem.name          = "markitdown"
+  gem.require_paths = ["lib"]
+  gem.version       = Markitdown::VERSION
+end

data/spec/doc.html ADDED

@@ -0,0 +1,29 @@
+<html>
+  <head>
+    <title>Test Document</title>
+  </head>
+  <body>
+    <h1>Main Header</h1>
+    <p>
+      This <em>is</em> a <b>test</b>. It includes a <a href="http://www.google.com">link</a> as well as an image <img src="https://www.google.com/images/srpr/logo3w.png" alt="Google Logo" />
+      <ul>
+        <li>bullet 1</li>
+        <li>bullet 2</li>
+        <li>bullet 3</li>
+      </ul>
+    </p>
+    <hr/>
+    <h2>Subheader</h2>
+    <p>
+      This is paragraph two.
+      <ol>
+        <li>bullet 1</li>
+        <ul>
+          <li>Sub-bullet 1 <a href="http://github.com">Nested link</a>.</li>
+        </ul>
+        <li>bullet 2</li>
+        <li>bullet 3</li>
+      </ol>
+    </p>
+  </body>
+</html>

data/spec/doc_spec.rb ADDED

@@ -0,0 +1,32 @@
+require 'markitdown'
+describe Markitdown do # Whole-document check: converts spec/doc.html and compares against the literal below.
+  context "When parsing a document" do
+    let(:html) { File.read("spec/doc.html") } # Path is relative to the working directory the specs run from.
+    it "should produce valid markdown" do
+      Markitdown.from_html(html).should == "
+# Main Header
+This *is* a **test**. It includes a [link](http://www.google.com) as well as an image ![Google Logo](https://www.google.com/images/srpr/logo3w.png)
+ * bullet 1
+ * bullet 2
+ * bullet 3
+***
+## Subheader
+This is paragraph two.
+ 1. bullet 1
+    * Sub-bullet 1 [Nested link](http://github.com).
+ 1. bullet 2
+ 1. bullet 3
+"
+    end
+  end
+end

data/spec/nesting_spec.rb ADDED

@@ -0,0 +1,130 @@
+require 'markitdown'
+describe Markitdown do # Nesting checks: lists inside lists, and lists inside blockquotes.
+  context "when parsing nested ordered lists" do
+    let(:html) { "
+      <ol>
+        <li>line 1.1</li>
+        <ol>
+          <li>line 2.1</li>
+          <li>line 2.2</li>
+          <ol>
+            <li>line 3.1</li>
+            <li>line 3.2</li>
+          </ol>
+        </ol>
+        <li>line 1.2</li>
+      </ol>"
+    }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == "
+ 1. line 1.1
+    1. line 2.1
+    1. line 2.2
+       1. line 3.1
+       1. line 3.2
+ 1. line 1.2
+"
+    end
+  end
+  context "when parsing nested unordered lists" do
+    let(:html) { "
+      <ul>
+        <li>line 1.1</li>
+        <ul>
+          <li>line 2.1</li>
+          <li>line 2.2</li>
+          <ul>
+            <li>line 3.1</li>
+            <li>line 3.2</li>
+          </ul>
+        </ul>
+        <li>line 1.2</li>
+      </ul>"
+    }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == "
+ * line 1.1
+   * line 2.1
+   * line 2.2
+     * line 3.1
+     * line 3.2
+ * line 1.2
+"
+    end
+  end
+  context "when parsing nested ordered and unordered lists" do
+    let(:html) { "
+      <ul>
+        <li>line 1.1</li>
+        <ol>
+          <li>line 2.1</li>
+          <li>line 2.2</li>
+          <ul>
+            <li>line 3.1</li>
+            <li>line 3.2</li>
+          </ul>
+        </ol>
+        <li>line 1.2</li>
+      </ul>"
+    }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == "
+ * line 1.1
+   1. line 2.1
+   1. line 2.2
+      * line 3.1
+      * line 3.2
+ * line 1.2
+"
+    end
+  end
+  context "when parsing an unordered list nested under a blockquote" do
+    let(:html) { "
+      <blockquote>
+        This is a quote with a list
+        <ul>
+          <li>item 1</li>
+          <li>item 2</li>
+        </ul>
+      </blockquote>" }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should ==
+" > This is a quote with a list
+ >  * item 1
+ >  * item 2
+"
+    end
+  end
+  context "when parsing nested lists with links nested under a blockquote" do
+    let(:html) { "
+      <blockquote>
+        This is a quote with a list
+        <ul>
+          <li>item <a href='http://www.google.com'>1.1</a></li>
+          <ol>
+            <li>item <a href='http://www.google.com'>2.1</a></li>
+            <li>item 2.2</li>
+          </ol>
+          <li>item 1.2</li>
+        </ul>
+      </blockquote>" }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should ==
+" > This is a quote with a list
+ >  * item [1.1](http://www.google.com)
+ >    1. item [2.1](http://www.google.com)
+ >    1. item 2.2
+ >  * item 1.2
+"
+    end
+  end
+end

data/spec/tag_spec.rb ADDED

@@ -0,0 +1,209 @@
+require 'markitdown'
+describe Markitdown do
+  context "When parsing a paragraph" do
+    let(:html) { "<p>This is a paragraph</p>" }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == "\n\nThis is a paragraph\n\n"
+    end
+  end
+  context "When parsing an H1" do
+    let(:html) { "<h1>This is a test</h1>" }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == "\n\n# This is a test\n\n"
+    end
+  end
+  context "When parsing an H2" do
+    let(:html) { "<h2>This is a test</h2>" }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == "\n\n## This is a test\n\n"
+    end
+  end
+  context "When parsing an H3" do
+    let(:html) { "<h3>This is a test</h3>" }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == "\n\n### This is a test\n\n"
+    end
+  end
+  context "When parsing an H4" do
+    let(:html) { "<h4>This is a test</h4>" }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == "\n\n#### This is a test\n\n"
+    end
+  end
+  context "When parsing an H5" do
+    let(:html) { "<h5>This is a test</h5>" }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == "\n\n##### This is a test\n\n"
+    end
+  end
+  context "When parsing an H6" do
+    let(:html) { "<h6>This is a test</h6>" }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == "\n\n###### This is a test\n\n"
+    end
+  end
+  context "When parsing an HR" do
+    let(:html) { "<hr/>" }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == "\n\n***\n\n"
+    end
+  end
+  context "When parsing an BR" do
+    let(:html) { "<br/>" }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == "\n\n"
+    end
+  end
+  context "When parsing an EM element" do
+    let(:html) { "<em>emphasis added</em>" }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == " *emphasis added* "
+    end
+  end
+  context "When parsing an italicized element" do
+    let(:html) { "<i>italics added</i>" }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == " *italics added* "
+    end
+  end
+  context "When parsing a strong element" do
+    let(:html) { "<strong>strong added</strong>" }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == " **strong added** "
+    end
+  end
+  context "When parsing a bold element" do
+    let(:html) { "<b>bold added</b>" }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == " **bold added** "
+    end
+  end
+  context "When parsing a bold element that's followed by a punctuation" do
+    let(:html) { "<html><b>bold added</b>.</html>" }
+    it "should return valid markdown without a space" do
+      Markitdown.from_html(html).should == " **bold added**."
+    end
+  end
+  context "When parsing a em element that's followed by a punctuation" do
+    let(:html) { "<html><em>emphasis added</em>?</html>" }
+    it "should return valid markdown without a space" do
+      Markitdown.from_html(html).should == " *emphasis added*?"
+    end
+  end
+  context "When parsing an OL" do
+    let(:html) { "<ol>
+  <li>first bullet</li>
+  <li>second bullet</li>
+  <li>third bullet</li>
+</ol>"
+    }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == "
+ 1. first bullet
+ 1. second bullet
+ 1. third bullet
+"
+    end
+  end
+  context "When parsing an UL" do
+    let(:html) { "<ul>
+  <li>first bullet</li>
+  <li>second bullet</li>
+  <li>third bullet</li>
+</ul>"
+    }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == "
+ * first bullet
+ * second bullet
+ * third bullet
+"
+    end
+  end
+  context "When parsing a link" do
+    let(:html) { "<a href='http://www.google.com'>this is a link</strong>" }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == " [this is a link](http://www.google.com) "
+    end
+  end
+  context "When parsing an image" do
+    let(:html) { "<img src='https://www.google.com/images/srpr/logo3w.png' alt='Google Logo'>" }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == " ![Google Logo](https://www.google.com/images/srpr/logo3w.png) "
+    end
+  end
+  context "When parsing an image without an alt tag" do
+    let(:html) { "<img src='https://www.google.com/images/srpr/logo3w.png'>" }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == " ![](https://www.google.com/images/srpr/logo3w.png) "
+    end
+  end
+  context "When parsing a style block" do
+    let(:html) { "<style>div.whatever { font-weight: bold; }</style>" }
+    it "should ignore it" do
+      Markitdown.from_html(html).should == ""
+    end
+  end
+  context "When parsing a blockquote" do
+    let(:html) { "<blockquote>this is a block quote</blockquote>" }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == " > this is a block quote\n"
+    end
+  end
+  context "When parsing a multi line blockquote" do
+    let(:html) { "<blockquote>
+      line 1
+      line 2
+      line 3
+    </blockquote>" }
+    it "should return valid markdown" do
+      Markitdown.from_html(html).should == " > line 1 line 2 line 3\n"
+    end
+  end
+end

metadata ADDED

@@ -0,0 +1,116 @@
+--- !ruby/object:Gem::Specification
+name: markitdown
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+  prerelease:
+platform: ruby
+authors:
+- Christopher Petersen
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2012-10-16 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: A small library that uses Nokogiri to parse an HTML file and produce
+  Markdown
+email:
+- christopher.petersen@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- LICENSE
+- README.md
+- Rakefile
+- lib/markitdown.rb
+- lib/markitdown/version.rb
+- markitdown.gemspec
+- spec/doc.html
+- spec/doc_spec.rb
+- spec/nesting_spec.rb
+- spec/tag_spec.rb
+homepage: ''
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+      segments:
+      - 0
+      hash: 4314622301527767866
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+      segments:
+      - 0
+      hash: 4314622301527767866
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.24
+signing_key:
+specification_version: 3
+summary: Converts HTML to Markdown
+test_files:
+- spec/doc.html
+- spec/doc_spec.rb
+- spec/nesting_spec.rb
+- spec/tag_spec.rb