unmarkdown 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 89763c4c6a33be8a9d51bda83db646aa71966e06
4
+ data.tar.gz: 0b2588b7f7c9e5d15f6a082cc3a9752169679183
5
+ SHA512:
6
+ metadata.gz: aa2ad64dfa026571241613e404c60fe216e218e85cdff364b35cd29531002cd9f4cc04e5ebc71c262e695f6244b58819b7eaf6e15ccc02886ab6a678fbbdc8e3
7
+ data.tar.gz: ad77913f14ec92cb9813aa3be5d14d36c23656388d5ba2c7fa0d104e99aac94114d6beac1c3d29baaab9bcdabbbb91fd003cc0b0d3cf27b0608cdb79faacd922
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,10 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
4
+
5
+ gem 'rake'
6
+
7
+ group :test do
8
+ gem 'minitest'
9
+ gem 'minitest-rg'
10
+ end
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Sam Soffes, http://soff.es
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rake/testtask'
3
+
4
+ Rake::TestTask.new(:test) do |t|
5
+ t.libs << 'test'
6
+ t.pattern = 'test/**/*_test.rb'
7
+ end
8
+ task default: :test
data/Readme.markdown ADDED
@@ -0,0 +1,62 @@
1
+ # Unmarkdown
2
+
3
+ Convert HTML to Markdown with Ruby.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ``` ruby
10
+ gem 'unmarkdown'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install unmarkdown
20
+
21
+ ## Usage
22
+
23
+ ``` ruby
24
+ markdown = Unmarkdown.parse('Some <strong>HTML</strong>')
25
+ #=> Some **HTML**
26
+
27
+ markdown = Unmarkdown.parse('My website is http://soff.es', autolink: true)
28
+ #=> My website is <http://soff.es>
29
+ ```
30
+
31
+ ## Support
32
+
33
+ ### Supported tags
34
+
35
+ * h1-h6
36
+ * blockquote
37
+ * ul, ol, li
38
+ * pre
39
+ * hr
40
+ * a
41
+ * em, i
42
+ * strong, b
43
+ * u
44
+ * mark
45
+ * code
46
+ * img
47
+
48
+ For tags that aren't supported, their content will be added to the output. Basically it treats everything like a `<p>`.
49
+
50
+ ### Options
51
+
52
+ * `fenced_code_blocks` — Uses three backticks before and after instead of four spaces before each line
53
+ * `allow_scripts` — By default, script tags are removed. If you set this option to `true` their original HTML will be included in the output
54
+ * `underline_headers` — By default, number signs are added before headers. If you turn this option on, h1's are underlined with equal signs and h2's with hyphens; the rest will still use number signs.
55
+
56
+ ## Contributing
57
+
58
+ 1. Fork it ( http://github.com/soffes/unmarkdown/fork )
59
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
60
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
61
+ 4. Push to the branch (`git push origin my-new-feature`)
62
+ 5. Create new Pull Request
data/lib/unmarkdown.rb ADDED
@@ -0,0 +1,12 @@
1
+ require 'unmarkdown/version'
2
+ require 'unmarkdown/parser'
3
+
4
+ module Unmarkdown
5
+
6
+ module_function
7
+
8
+ # Takes an HTML string and returns a Markdown string
9
+ def parse(html, options = {})
10
+ Parser.new(html, options).parse
11
+ end
12
+ end
@@ -0,0 +1,146 @@
1
+ require 'nokogiri'
2
+
3
+ module Unmarkdown
4
+ class Parser
5
+ BLOCK_ELEMENT_NAMES = %w{h1 h2 h3 h4 h5 h6 blockquote pre hr ul ol li p div}.freeze
6
+ AUTOLINK_URL_REGEX = /((?:https?|ftp):[^'"\s]+)/i.freeze
7
+ AUTOLINK_EMAIL_REGEX = %r{([-.\w]+\@[-a-z0-9]+(?:\.[-a-z0-9]+)*\.[a-z]+)}i.freeze
8
+
9
+ def initialize(html, options = {})
10
+ @html = html
11
+ @options = options
12
+ end
13
+
14
+ def parse
15
+ # Setup document
16
+ doc = Nokogiri::HTML(@html)
17
+ doc.encoding = 'UTF-8'
18
+
19
+ # Reset bookkeeping
20
+ @list = []
21
+ @list_position = []
22
+
23
+ # Parse the root node recursively
24
+ root_node = doc.xpath('//body')
25
+ markdown = parse_nodes(root_node.children)
26
+
27
+ # Strip whitespace
28
+ markdown.rstrip.gsub(/\n{2}+/, "\n\n")
29
+
30
+ # TODO: Strip trailing whitespace
31
+ end
32
+
33
+ private
34
+
35
+ # Parse the children of a node
36
+ def parse_nodes(nodes)
37
+ output = ''
38
+
39
+ # Short-circuit if it's empty
40
+ return output if !nodes || nodes.empty?
41
+
42
+ # Loop through nodes
43
+ nodes.each do |node|
44
+ case node.name
45
+ when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
46
+ level = node.name.match(/\Ah(\d)\Z/)[1].to_i
47
+ if @options[:underline_headers] && level < 3
48
+ content = parse_content(node)
49
+ output << content + "\n"
50
+ character = level == 1 ? '=' : '-'
51
+ content.length.times { output << character}
52
+ else
53
+ hashes = ''
54
+ level.times { hashes << '#' }
55
+ output << "#{hashes} #{parse_content(node)}"
56
+ end
57
+ when 'blockquote'
58
+ parse_content(node).split("\n").each do |line|
59
+ output << "> #{line}\n"
60
+ end
61
+ when 'ul', 'ol'
62
+ output << "\n\n" if @list.count > 0
63
+
64
+ if unordered = node.name == 'ul'
65
+ @list << :unordered
66
+ else
67
+ @list << :ordered
68
+ @list_position << 0
69
+ end
70
+
71
+ output << parse_nodes(node.children)
72
+
73
+ @list.pop
74
+ @list_position.pop unless unordered
75
+ when 'li'
76
+ (@list.count - 1).times { output << ' ' }
77
+ if @list.last == :unordered
78
+ output << "* #{parse_content(node)}"
79
+ else
80
+ num = (@list_position[@list_position.count - 1] += 1)
81
+ output << "#{num}. #{parse_content(node)}"
82
+ end
83
+ when 'pre'
84
+ content = parse_content(node)
85
+
86
+ if @options[:fenced_code_blocks]
87
+ output << "```\n#{content}\n```"
88
+ else
89
+ content.split("\n").each do |line|
90
+ output << " #{line}\n"
91
+ end
92
+ end
93
+ when 'hr'
94
+ output << "---\n\n"
95
+ when 'a'
96
+ output << "[#{parse_content(node)}](#{node['href'] + build_title(node)})"
97
+ when 'i', 'em'
98
+ output << "*#{parse_content(node)}*"
99
+ when 'b', 'strong'
100
+ output << "**#{parse_content(node)}**"
101
+ when 'u'
102
+ output << "_#{parse_content(node)}_"
103
+ when 'mark'
104
+ output << "==#{parse_content(node)}=="
105
+ when 'code'
106
+ output << "`#{parse_content(node)}`"
107
+ when 'img'
108
+ output << "![#{node['alt']}](#{node['src'] + build_title(node)})"
109
+ when 'text'
110
+ content = parse_content(node)
111
+
112
+ # Optionally look for links
113
+ content.gsub!(AUTOLINK_URL_REGEX, '<\1>') if @options[:autolink]
114
+ content.gsub!(AUTOLINK_EMAIL_REGEX, '<\1>') if @options[:autolink]
115
+
116
+ output << content
117
+ when 'script'
118
+ next unless @options[:allow_scripts]
119
+ output << node.to_html
120
+ else
121
+ # If it's an unsupported node or a node that just contains text, just get
122
+ # its content
123
+ output << parse_content(node)
124
+ end
125
+
126
+ output << "\n\n" if BLOCK_ELEMENT_NAMES.include?(node.name)
127
+ end
128
+
129
+ output
130
+ end
131
+
132
+ # Get the content from a node
133
+ def parse_content(node)
134
+ content = if node.children.empty?
135
+ node.content
136
+ else
137
+ parse_nodes(node.children)
138
+ end
139
+ end
140
+
141
+ # Build the title for links or images
142
+ def build_title(node)
143
+ node['title'] ? %Q{ "#{node['title']}"} : ''
144
+ end
145
+ end
146
+ end
@@ -0,0 +1,3 @@
1
+ module Unmarkdown
2
+ VERSION = '0.1.0'
3
+ end
@@ -0,0 +1,156 @@
1
+ require 'test_helper'
2
+
3
+ class ParserTest < Unmarkdown::Test
4
+ include Unmarkdown
5
+
6
+ def test_headers
7
+ 6.times do |i|
8
+ i += 1
9
+ html = "<h#{i}>Header</h#{i}>"
10
+
11
+ markdown = ''
12
+ i.times { markdown << '#' }
13
+ markdown << ' Header'
14
+
15
+ assert_equal markdown, parse(html)
16
+ end
17
+
18
+ html = '<h1>Something Huge</h1>'
19
+ markdown = "Something Huge\n=============="
20
+ assert_equal markdown, parse(html, underline_headers: true)
21
+
22
+ html = '<h2>Something Smaller</h1>'
23
+ markdown = "Something Smaller\n-----------------"
24
+ assert_equal markdown, parse(html, underline_headers: true)
25
+ end
26
+
27
+ def test_blockquote
28
+ html = '<blockquote>Awesome.</blockquote>'
29
+ markdown = '> Awesome.'
30
+ assert_equal markdown, parse(html)
31
+ end
32
+
33
+ def test_unorder_list
34
+ html = '<ul><li>Ruby<ul><li>Gem</li><li>Stuff</li></ul></li><li>Objective-C</li></ul>'
35
+ markdown = "* Ruby\n\n * Gem\n\n * Stuff\n\n* Objective-C"
36
+ assert_equal markdown, parse(html)
37
+ end
38
+
39
+ def test_ordered_list
40
+ html = '<ol><li>Ruby<ol><li>Gem</li><li>Stuff</li></ol></li><li>Objective-C</li></ol>'
41
+ markdown = "1. Ruby\n\n 1. Gem\n\n 2. Stuff\n\n2. Objective-C"
42
+ assert_equal markdown, parse(html)
43
+ end
44
+
45
+ def test_code_block
46
+ html = "<pre>puts 'Hello world'</pre>"
47
+ markdown = " puts 'Hello world'"
48
+ assert_equal markdown, parse(html)
49
+
50
+ html = "<pre>puts 'Hello world'</pre>"
51
+ markdown = "```\nputs 'Hello world'\n```"
52
+ assert_equal markdown, parse(html, fenced_code_blocks: true)
53
+ end
54
+
55
+ def test_line_break
56
+ html = '<hr>'
57
+ markdown = '---'
58
+ assert_equal markdown, parse(html)
59
+ end
60
+
61
+ def test_link
62
+ html = '<a href="http://soff.es">Sam Soffes</a>'
63
+ markdown = '[Sam Soffes](http://soff.es)'
64
+ assert_equal markdown, parse(html)
65
+
66
+ html = '<a href="http://soff.es" title="My site">Sam Soffes</a>'
67
+ markdown = '[Sam Soffes](http://soff.es "My site")'
68
+ assert_equal markdown, parse(html)
69
+ end
70
+
71
+ def test_emphasis
72
+ html = '<i>italic</i>'
73
+ markdown = '*italic*'
74
+ assert_equal markdown, parse(html)
75
+
76
+ html = '<em>italic</em>'
77
+ markdown = '*italic*'
78
+ assert_equal markdown, parse(html)
79
+ end
80
+
81
+ def test_double_emphasis
82
+ html = '<b>bold</b>'
83
+ markdown = '**bold**'
84
+ assert_equal markdown, parse(html)
85
+
86
+ html = '<strong>bold</strong>'
87
+ markdown = '**bold**'
88
+ assert_equal markdown, parse(html)
89
+ end
90
+
91
+ def test_triple_emphasis
92
+ html = '<b><i>bold italic</i></b>'
93
+ markdown = '***bold italic***'
94
+ assert_equal markdown, parse(html)
95
+ end
96
+
97
+ def test_underline
98
+ html = '<u>underline</u>'
99
+ markdown = '_underline_'
100
+ assert_equal markdown, parse(html)
101
+ end
102
+
103
+ def test_bold_underline
104
+ html = '<b><u>underline</u></b>'
105
+ markdown = '**_underline_**'
106
+ assert_equal markdown, parse(html)
107
+
108
+ html = '<u><b>underline</b></u>'
109
+ markdown = '_**underline**_'
110
+ assert_equal markdown, parse(html)
111
+ end
112
+
113
+ def test_mark
114
+ html = '<mark>highlighted</mark>'
115
+ markdown = '==highlighted=='
116
+ assert_equal markdown, parse(html)
117
+ end
118
+
119
+ def test_code
120
+ html = '<code>Unmarkdown.parse(html)</code>'
121
+ markdown = '`Unmarkdown.parse(html)`'
122
+ assert_equal markdown, parse(html)
123
+ end
124
+
125
+ def test_image
126
+ html = '<img src="http://soffes-assets.s3.amazonaws.com/images/Sam-Soffes.jpg">'
127
+ markdown = '![](http://soffes-assets.s3.amazonaws.com/images/Sam-Soffes.jpg)'
128
+ assert_equal markdown, parse(html)
129
+
130
+ html = '<img src="http://soffes-assets.s3.amazonaws.com/images/Sam-Soffes.jpg" alt="Sam Soffes">'
131
+ markdown = '![Sam Soffes](http://soffes-assets.s3.amazonaws.com/images/Sam-Soffes.jpg)'
132
+ assert_equal markdown, parse(html)
133
+
134
+ html = '<img src="http://soffes-assets.s3.amazonaws.com/images/Sam-Soffes.jpg" title="That guy">'
135
+ markdown = '![](http://soffes-assets.s3.amazonaws.com/images/Sam-Soffes.jpg "That guy")'
136
+ assert_equal markdown, parse(html)
137
+ end
138
+
139
+ def test_script
140
+ html = %Q{<blockquote class="twitter-tweet"><p><a href="https://twitter.com/soffes">@soffes</a> If people think Apple is going to redo their promo videos and 3D animation intros for iOS 7 they&#39;re crazy. The design is ~final.</p>&mdash; Mike Rundle (@flyosity) <a href="https://twitter.com/flyosity/statuses/348358938296733696">June 22, 2013</a></blockquote>\n<script async src="//platform.twitter.com/widgets.js" charset="utf-8"></script>}
141
+ markdown = %Q{> [@soffes](https://twitter.com/soffes) If people think Apple is going to redo their promo videos and 3D animation intros for iOS 7 they're crazy. The design is ~final.\n> \n> — Mike Rundle (@flyosity) [June 22, 2013](https://twitter.com/flyosity/statuses/348358938296733696)}
142
+ assert_equal markdown, parse(html)
143
+
144
+ html = %Q{<blockquote class="twitter-tweet"><p><a href="https://twitter.com/soffes">@soffes</a> If people think Apple is going to redo their promo videos and 3D animation intros for iOS 7 they&#39;re crazy. The design is ~final.</p>&mdash; Mike Rundle (@flyosity) <a href="https://twitter.com/flyosity/statuses/348358938296733696">June 22, 2013</a></blockquote>\n<script async src="//platform.twitter.com/widgets.js" charset="utf-8"></script>}
145
+ markdown = %Q{> [@soffes](https://twitter.com/soffes) If people think Apple is going to redo their promo videos and 3D animation intros for iOS 7 they're crazy. The design is ~final.\n> \n> — Mike Rundle (@flyosity) [June 22, 2013](https://twitter.com/flyosity/statuses/348358938296733696)\n\n<script async src="//platform.twitter.com/widgets.js" charset="utf-8"></script>}
146
+ assert_equal markdown, parse(html, allow_scripts: true)
147
+ end
148
+
149
+ def test_autolink
150
+ html = 'Head to http://soff.es and email sam@soff.es'
151
+ assert_equal html, parse(html)
152
+
153
+ markdown = 'Head to <http://soff.es> and email <sam@soff.es>'
154
+ assert_equal markdown, parse(html, autolink: true)
155
+ end
156
+ end
@@ -0,0 +1,19 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ Bundler.require :test
4
+
5
+ if ENV['COVERAGE']
6
+ require 'simplecov'
7
+ SimpleCov.start
8
+ end
9
+
10
+ require 'minitest/autorun'
11
+ require 'unmarkdown'
12
+
13
+ # Support files
14
+ Dir["#{File.expand_path(File.dirname(__FILE__))}/support/*.rb"].each do |file|
15
+ require file
16
+ end
17
+
18
+ class Unmarkdown::Test < MiniTest::Test
19
+ end
@@ -0,0 +1,9 @@
1
+ require 'test_helper'
2
+
3
+ module Unmarkdown
4
+ class UnmarkdownTest < Test
5
+ def test_that_it_parses
6
+ refute_nil Unmarkdown.parse('foo')
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,26 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'unmarkdown/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'unmarkdown'
8
+ spec.version = Unmarkdown::VERSION
9
+ spec.authors = ['Sam Soffes']
10
+ spec.email = ['sam@soff.es']
11
+ spec.summary = 'Convert HTML to Markdown'
12
+ spec.homepage = 'https://github.com/soffes/unmarkdown'
13
+ spec.license = 'MIT'
14
+
15
+ spec.files = `git ls-files`.split($/)
16
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
17
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
+ spec.require_paths = ['lib']
19
+
20
+ spec.required_ruby_version = '>= 1.9.2'
21
+
22
+ spec.add_development_dependency 'bundler'
23
+
24
+ # HTML parsing
25
+ spec.add_dependency 'nokogiri'
26
+ end
metadata ADDED
@@ -0,0 +1,87 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: unmarkdown
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Sam Soffes
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-12-15 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description:
42
+ email:
43
+ - sam@soff.es
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - .gitignore
49
+ - Gemfile
50
+ - LICENSE
51
+ - Rakefile
52
+ - Readme.markdown
53
+ - lib/unmarkdown.rb
54
+ - lib/unmarkdown/parser.rb
55
+ - lib/unmarkdown/version.rb
56
+ - test/parser_test.rb
57
+ - test/test_helper.rb
58
+ - test/unmarkdown_test.rb
59
+ - unmarkdown.gemspec
60
+ homepage: https://github.com/soffes/unmarkdown
61
+ licenses:
62
+ - MIT
63
+ metadata: {}
64
+ post_install_message:
65
+ rdoc_options: []
66
+ require_paths:
67
+ - lib
68
+ required_ruby_version: !ruby/object:Gem::Requirement
69
+ requirements:
70
+ - - '>='
71
+ - !ruby/object:Gem::Version
72
+ version: 1.9.2
73
+ required_rubygems_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ requirements: []
79
+ rubyforge_project:
80
+ rubygems_version: 2.0.3
81
+ signing_key:
82
+ specification_version: 4
83
+ summary: Convert HTML to Markdown
84
+ test_files:
85
+ - test/parser_test.rb
86
+ - test/test_helper.rb
87
+ - test/unmarkdown_test.rb