markitdown 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +112 -0
- data/Rakefile +8 -0
- data/lib/markitdown.rb +166 -0
- data/lib/markitdown/version.rb +3 -0
- data/markitdown.gemspec +21 -0
- data/spec/doc.html +29 -0
- data/spec/doc_spec.rb +32 -0
- data/spec/nesting_spec.rb +130 -0
- data/spec/tag_spec.rb +209 -0
- metadata +116 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Christopher Petersen
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,112 @@
|
|
1
|
+
# Markitdown
|
2
|
+
|
3
|
+
Markitdown is a Ruby library that converts HTML to Markdown. It's powered by Nokogiri. It supports:
|
4
|
+
|
5
|
+
* Ordered and unordered lists
|
6
|
+
* Nested lists
|
7
|
+
* Blockquotes
|
8
|
+
* Lists (and nested lists) inside of blockquotes
|
9
|
+
* Images
|
10
|
+
* Links
|
11
|
+
|
12
|
+
As well as other tags.
|
13
|
+
|
14
|
+
## Installation
|
15
|
+
|
16
|
+
Add this line to your application's Gemfile:
|
17
|
+
|
18
|
+
gem 'markitdown'
|
19
|
+
|
20
|
+
And then execute:
|
21
|
+
|
22
|
+
$ bundle
|
23
|
+
|
24
|
+
Or install it yourself as:
|
25
|
+
|
26
|
+
$ gem install markitdown
|
27
|
+
|
28
|
+
## Usage
|
29
|
+
|
30
|
+
To convert HTML to Markdown:
|
31
|
+
|
32
|
+
```ruby
|
33
|
+
Markitdown.from_html(html)
|
34
|
+
```
|
35
|
+
|
36
|
+
```Markitdown``` uses Nokogiri internally. If you already have a Nokogiri object you can use ```from_nokogiri```
|
37
|
+
|
38
|
+
```ruby
|
39
|
+
Markitdown.from_nokogiri(nokogiri_node)
|
40
|
+
```
|
41
|
+
|
42
|
+
## Example
|
43
|
+
|
44
|
+
From the specs:
|
45
|
+
|
46
|
+
### HTML
|
47
|
+
```html
|
48
|
+
<html>
|
49
|
+
<head>
|
50
|
+
<title>Test Document</title>
|
51
|
+
</head>
|
52
|
+
<body>
|
53
|
+
<h1>Main Header</h1>
|
54
|
+
<p>
|
55
|
+
This <em>is</em> a <b>test</b>. It includes a <a href="http://www.google.com">link</a> as well as an image <img src="https://www.google.com/images/srpr/logo3w.png" alt="Google Logo" />
|
56
|
+
<ul>
|
57
|
+
<li>bullet 1</li>
|
58
|
+
<li>bullet 2</li>
|
59
|
+
<li>bullet 3</li>
|
60
|
+
</ul>
|
61
|
+
</p>
|
62
|
+
<hr/>
|
63
|
+
<h2>Subheader</h2>
|
64
|
+
<p>
|
65
|
+
This is paragraph two.
|
66
|
+
<ol>
|
67
|
+
<li>bullet 1</li>
|
68
|
+
<ul>
|
69
|
+
<li>Sub-bullet 1 <a href="http://github.com">Nested link</a>.</li>
|
70
|
+
</ul>
|
71
|
+
<li>bullet 2</li>
|
72
|
+
<li>bullet 3</li>
|
73
|
+
</ol>
|
74
|
+
</p>
|
75
|
+
</body>
|
76
|
+
</html>
|
77
|
+
```
|
78
|
+
|
79
|
+
Gets converted to the following Markdown:
|
80
|
+
|
81
|
+
```md
|
82
|
+
|
83
|
+
|
84
|
+
# Main Header
|
85
|
+
|
86
|
+
This *is* a **test**. It includes a [link](http://www.google.com) as well as an image ![Google Logo](https://www.google.com/images/srpr/logo3w.png)
|
87
|
+
|
88
|
+
* bullet 1
|
89
|
+
* bullet 2
|
90
|
+
* bullet 3
|
91
|
+
|
92
|
+
***
|
93
|
+
|
94
|
+
## Subheader
|
95
|
+
|
96
|
+
This is paragraph two.
|
97
|
+
|
98
|
+
1. bullet 1
|
99
|
+
* Sub-bullet 1 [Nested link](http://github.com).
|
100
|
+
1. bullet 2
|
101
|
+
1. bullet 3
|
102
|
+
|
103
|
+
|
104
|
+
```
|
105
|
+
|
106
|
+
## Contributing
|
107
|
+
|
108
|
+
1. Fork it
|
109
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
110
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
111
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
112
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/lib/markitdown.rb
ADDED
@@ -0,0 +1,166 @@
|
|
1
|
+
require "markitdown/version"
|
2
|
+
require "nokogiri"
|
3
|
+
|
4
|
+
# Converts HTML into Markdown by walking a parsed node tree.
#
# Public entry points are +from_html+ and +from_nokogiri+. The remaining
# singleton methods are internal helpers of the tree walk.
module Markitdown
  # Converts a markup string to Markdown.
  #
  # The string is parsed with Nokogiri's XML parser and the document's root
  # element is handed to +from_nokogiri+.
  #
  # @param html [String] markup to convert
  # @return [String] the Markdown rendering
  def self.from_html(html)
    from_nokogiri(Nokogiri::XML(html).root)
  end

  # Converts an already-parsed node (and its subtree) to Markdown.
  #
  # The node only needs to respond to +name+, +text+, +attributes+ and
  # +children+, which is what the tree walk below calls.
  #
  # @param node [#name, #text, #attributes, #children, nil] root of the tree
  # @return [String] the Markdown rendering; "" when +node+ is nil
  def self.from_nokogiri(node)
    # Nothing to convert, e.g. the parsed document had no root element.
    return "" if node.nil?

    markdown = parse_node(node).flatten.compact.join
    # Remove lines with nothing but whitespace characters.
    markdown = markdown.gsub(/\n\s+\n/, "\n\n")
    # Collapse any series of more than 2 newlines down to 2.
    markdown = markdown.gsub(/\n{2,}/, "\n\n")
    # Collapse consecutive tabs down to a single space. Tabs are used to pad
    # divs and spans, so multiple nested spans and divs end up surrounded by
    # a single space.
    markdown = markdown.gsub(/\t+/, " ")
    # Remove a space before a period or question mark. Things like links get
    # surrounded by spaces; if they appear at the end of a sentence this keeps
    # the punctuation attached.
    markdown.gsub(/ ([\.\?])/, '\1')
  end

  # NOTE: the helpers below are internal. The original file placed a bare
  # `private` here, which has no effect on `def self.` methods, so they have
  # always been callable from outside; they are left that way for backward
  # compatibility.

  # Recursively renders +node+ and its children.
  #
  # @param node [#name, #text, #attributes, #children] node to render
  # @param states [Array<String>] downcased names of the nodes currently being
  #   rendered, innermost first; shared (and mutated) across the recursion
  # @return [Array] nested array of String/nil fragments, to be flattened,
  #   compacted and joined by the caller
  def self.parse_node(node, states=[])
    states.unshift node.name.downcase
    begin
      results = []
      after = nil            # fragment appended once the children are rendered
      strip_content = false  # when true, each child's output is joined and stripped
      pre = prefix(states)

      case node.name
      when "head", "title", "style"
        # Not rendered. The ensure clause below still pops this node from
        # +states+, so later siblings see the correct ancestry.
        return []
      when "div", "span"
        results << "\t"
        after = "\t"
      when "p"
        results << newline(pre, nil, 2)
        after = newline(pre, nil, 2)
      when "h1", "h2", "h3", "h4", "h5", "h6"
        level = node.name[1..-1].to_i
        results << newline(pre, nil, 2)
        results << ("#" * level + " ")
        after = newline(pre, nil, 2)
      when "hr"
        results << newline(pre, nil, 2)
        results << "***"
        results << newline(pre, nil, 2)
      when "br"
        results << newline(pre, nil, 2)
      when "em", "i"
        results << " *"
        after = "* "
      when "strong", "b"
        results << " **"
        after = "** "
      when "blockquote"
        results << pre
        after = "\n"
      when "ol", "ul"
        # Only a top-level list is separated from the surrounding content.
        unless nested_list?(states)
          results << newline(pre, nil)
          after = "\n"
        end
      when "li"
        results << "\n"
        results << pre
      when "a"
        href = node.attributes["href"]
        # An anchor without an href has no link target; its children are
        # rendered as plain inline content instead of raising on nil.
        if href
          results << " ["
          after = "](#{href.value}) "
          strip_content = true
        end
      when "img"
        results << " !["
        results << node.attributes["alt"].value if node.attributes["alt"]
        results << "]("
        results << node.attributes["src"].value if node.attributes["src"]
        results << ") "
      when "text"
        results << node.text.strip.gsub("\n","").gsub(/ {2,}/," ")
      end

      node.children.each do |child|
        contents = parse_node(child, states)
        contents = contents.flatten.compact.join.strip if strip_content
        results << contents
      end
      results << after
      results
    ensure
      # Always undo the unshift above, including on the early return.
      states.shift
    end
  end

  # Tells whether the current node sits inside a list or blockquote.
  #
  # @param states [Array<String>] node names, innermost (current node) first
  # @return [Boolean] true when any ancestor is "ul", "ol" or "blockquote"
  def self.nested_list?(states)
    states.drop(1).any? { |state| ["ul","ol","blockquote"].include?(state) }
  end

  # Builds +count+ line endings, each preceded by the prefix and +line+.
  #
  # @param pre [Array<String>] prefix fragments as returned by +prefix+
  # @param line [String, nil] content placed between prefix and newline
  # @param count [Integer] number of lines to emit
  # @return [Array] fragments in the order pre, line, "\n" repeated +count+ times
  def self.newline(pre, line, count=1)
    result = []
    count.times do
      result << pre << line << "\n"
    end
    result
  end

  # Computes the line prefix (blockquote marker, list bullet, nesting pad)
  # for the node at the front of +states+.
  #
  # @param states [Array<String>] node names, innermost (current node) first
  # @return [Array<String>] prefix fragments, outermost first
  def self.prefix(states)
    result = []
    states.each_with_index do |state, index|
      # Every blockquote in the chain, the current node included, adds a marker.
      result.unshift(" > ") if state == "blockquote"
      next if index == 0

      if index == 1
        # The direct parent decides the bullet, and only for list items.
        if states.first == "li"
          if state == "ol"
            result.unshift(" 1. ")
          elsif state == "ul"
            result.unshift(" * ")
          end
        end
      elsif state == "ol" || state == "ul"
        # Each list further up the chain adds one level of padding.
        result.unshift(" ")
      end
    end
    result
  end
end
|
data/markitdown.gemspec
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/markitdown/version', __FILE__)
|
3
|
+
|
4
|
+
# Gem packaging metadata for markitdown.
Gem::Specification.new do |gem|
  gem.authors       = ["Christopher Petersen"]
  gem.email         = ["christopher.petersen@gmail.com"]
  gem.description   = %q{A small library that uses Nokogiri to parse an HTML file and produce Markdown}
  gem.summary       = %q{Converts HTML to Markdown}
  gem.homepage      = ""
  # The package ships an MIT LICENSE file; declare it in the metadata too.
  gem.license       = "MIT"

  gem.add_dependency('nokogiri')
  gem.add_development_dependency('rake')
  gem.add_development_dependency('rspec')

  # Split on the input record separator (newline). The previous `$\` is the
  # output record separator, nil by default, which makes String#split break
  # on any whitespace and so mangles file names that contain spaces.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.name          = "markitdown"
  gem.require_paths = ["lib"]
  gem.version       = Markitdown::VERSION
end
|
data/spec/doc.html
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
<html>
|
2
|
+
<head>
|
3
|
+
<title>Test Document</title>
|
4
|
+
</head>
|
5
|
+
<body>
|
6
|
+
<h1>Main Header</h1>
|
7
|
+
<p>
|
8
|
+
This <em>is</em> a <b>test</b>. It includes a <a href="http://www.google.com">link</a> as well as an image <img src="https://www.google.com/images/srpr/logo3w.png" alt="Google Logo" />
|
9
|
+
<ul>
|
10
|
+
<li>bullet 1</li>
|
11
|
+
<li>bullet 2</li>
|
12
|
+
<li>bullet 3</li>
|
13
|
+
</ul>
|
14
|
+
</p>
|
15
|
+
<hr/>
|
16
|
+
<h2>Subheader</h2>
|
17
|
+
<p>
|
18
|
+
This is paragraph two.
|
19
|
+
<ol>
|
20
|
+
<li>bullet 1</li>
|
21
|
+
<ul>
|
22
|
+
<li>Sub-bullet 1 <a href="http://github.com">Nested link</a>.</li>
|
23
|
+
</ul>
|
24
|
+
<li>bullet 2</li>
|
25
|
+
<li>bullet 3</li>
|
26
|
+
</ol>
|
27
|
+
</p>
|
28
|
+
</body>
|
29
|
+
</html>
|
data/spec/doc_spec.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'markitdown'
|
2
|
+
|
3
|
+
describe Markitdown do
|
4
|
+
context "When parsing a document" do
|
5
|
+
let(:html) { File.read("spec/doc.html") }
|
6
|
+
|
7
|
+
it "should produce valid markdown" do
|
8
|
+
Markitdown.from_html(html).should == "
|
9
|
+
|
10
|
+
# Main Header
|
11
|
+
|
12
|
+
This *is* a **test**. It includes a [link](http://www.google.com) as well as an image 
|
13
|
+
|
14
|
+
* bullet 1
|
15
|
+
* bullet 2
|
16
|
+
* bullet 3
|
17
|
+
|
18
|
+
***
|
19
|
+
|
20
|
+
## Subheader
|
21
|
+
|
22
|
+
This is paragraph two.
|
23
|
+
|
24
|
+
1. bullet 1
|
25
|
+
* Sub-bullet 1 [Nested link](http://github.com).
|
26
|
+
1. bullet 2
|
27
|
+
1. bullet 3
|
28
|
+
|
29
|
+
"
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,130 @@
|
|
1
|
+
require 'markitdown'
|
2
|
+
|
3
|
+
describe Markitdown do
|
4
|
+
context "when parsing nested ordered lists" do
|
5
|
+
let(:html) { "
|
6
|
+
<ol>
|
7
|
+
<li>line 1.1</li>
|
8
|
+
<ol>
|
9
|
+
<li>line 2.1</li>
|
10
|
+
<li>line 2.2</li>
|
11
|
+
<ol>
|
12
|
+
<li>line 3.1</li>
|
13
|
+
<li>line 3.2</li>
|
14
|
+
</ol>
|
15
|
+
</ol>
|
16
|
+
<li>line 1.2</li>
|
17
|
+
</ol>"
|
18
|
+
}
|
19
|
+
it "should return valid markdown" do
|
20
|
+
Markitdown.from_html(html).should == "
|
21
|
+
|
22
|
+
1. line 1.1
|
23
|
+
1. line 2.1
|
24
|
+
1. line 2.2
|
25
|
+
1. line 3.1
|
26
|
+
1. line 3.2
|
27
|
+
1. line 1.2
|
28
|
+
"
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
context "when parsing nested unordered lists" do
|
33
|
+
let(:html) { "
|
34
|
+
<ul>
|
35
|
+
<li>line 1.1</li>
|
36
|
+
<ul>
|
37
|
+
<li>line 2.1</li>
|
38
|
+
<li>line 2.2</li>
|
39
|
+
<ul>
|
40
|
+
<li>line 3.1</li>
|
41
|
+
<li>line 3.2</li>
|
42
|
+
</ul>
|
43
|
+
</ul>
|
44
|
+
<li>line 1.2</li>
|
45
|
+
</ul>"
|
46
|
+
}
|
47
|
+
it "should return valid markdown" do
|
48
|
+
Markitdown.from_html(html).should == "
|
49
|
+
|
50
|
+
* line 1.1
|
51
|
+
* line 2.1
|
52
|
+
* line 2.2
|
53
|
+
* line 3.1
|
54
|
+
* line 3.2
|
55
|
+
* line 1.2
|
56
|
+
"
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
context "when parsing nested ordered and unordered lists" do
|
61
|
+
let(:html) { "
|
62
|
+
<ul>
|
63
|
+
<li>line 1.1</li>
|
64
|
+
<ol>
|
65
|
+
<li>line 2.1</li>
|
66
|
+
<li>line 2.2</li>
|
67
|
+
<ul>
|
68
|
+
<li>line 3.1</li>
|
69
|
+
<li>line 3.2</li>
|
70
|
+
</ul>
|
71
|
+
</ol>
|
72
|
+
<li>line 1.2</li>
|
73
|
+
</ul>"
|
74
|
+
}
|
75
|
+
it "should return valid markdown" do
|
76
|
+
Markitdown.from_html(html).should == "
|
77
|
+
|
78
|
+
* line 1.1
|
79
|
+
1. line 2.1
|
80
|
+
1. line 2.2
|
81
|
+
* line 3.1
|
82
|
+
* line 3.2
|
83
|
+
* line 1.2
|
84
|
+
"
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
context "when parsing an unordered list nested under a blockquote" do
|
89
|
+
let(:html) { "
|
90
|
+
<blockquote>
|
91
|
+
This is a quote with a list
|
92
|
+
<ul>
|
93
|
+
<li>item 1</li>
|
94
|
+
<li>item 2</li>
|
95
|
+
</ul>
|
96
|
+
</blockquote>" }
|
97
|
+
it "should return valid markdown" do
|
98
|
+
Markitdown.from_html(html).should ==
|
99
|
+
" > This is a quote with a list
|
100
|
+
> * item 1
|
101
|
+
> * item 2
|
102
|
+
"
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
|
107
|
+
context "when parsing nested lists with links nested under a blockquote" do
|
108
|
+
let(:html) { "
|
109
|
+
<blockquote>
|
110
|
+
This is a quote with a list
|
111
|
+
<ul>
|
112
|
+
<li>item <a href='http://www.google.com'>1.1</a></li>
|
113
|
+
<ol>
|
114
|
+
<li>item <a href='http://www.google.com'>2.1</a></li>
|
115
|
+
<li>item 2.2</li>
|
116
|
+
</ol>
|
117
|
+
<li>item 1.2</li>
|
118
|
+
</ul>
|
119
|
+
</blockquote>" }
|
120
|
+
it "should return valid markdown" do
|
121
|
+
Markitdown.from_html(html).should ==
|
122
|
+
" > This is a quote with a list
|
123
|
+
> * item [1.1](http://www.google.com)
|
124
|
+
> 1. item [2.1](http://www.google.com)
|
125
|
+
> 1. item 2.2
|
126
|
+
> * item 1.2
|
127
|
+
"
|
128
|
+
end
|
129
|
+
end
|
130
|
+
end
|
data/spec/tag_spec.rb
ADDED
@@ -0,0 +1,209 @@
|
|
1
|
+
require 'markitdown'
|
2
|
+
|
3
|
+
describe Markitdown do
|
4
|
+
context "When parsing a paragraph" do
|
5
|
+
let(:html) { "<p>This is a paragraph</p>" }
|
6
|
+
|
7
|
+
it "should return valid markdown" do
|
8
|
+
Markitdown.from_html(html).should == "\n\nThis is a paragraph\n\n"
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
context "When parsing an H1" do
|
13
|
+
let(:html) { "<h1>This is a test</h1>" }
|
14
|
+
|
15
|
+
it "should return valid markdown" do
|
16
|
+
Markitdown.from_html(html).should == "\n\n# This is a test\n\n"
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
context "When parsing an H2" do
|
21
|
+
let(:html) { "<h2>This is a test</h2>" }
|
22
|
+
|
23
|
+
it "should return valid markdown" do
|
24
|
+
Markitdown.from_html(html).should == "\n\n## This is a test\n\n"
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
context "When parsing an H3" do
|
29
|
+
let(:html) { "<h3>This is a test</h3>" }
|
30
|
+
|
31
|
+
it "should return valid markdown" do
|
32
|
+
Markitdown.from_html(html).should == "\n\n### This is a test\n\n"
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
context "When parsing an H4" do
|
37
|
+
let(:html) { "<h4>This is a test</h4>" }
|
38
|
+
|
39
|
+
it "should return valid markdown" do
|
40
|
+
Markitdown.from_html(html).should == "\n\n#### This is a test\n\n"
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
context "When parsing an H5" do
|
45
|
+
let(:html) { "<h5>This is a test</h5>" }
|
46
|
+
|
47
|
+
it "should return valid markdown" do
|
48
|
+
Markitdown.from_html(html).should == "\n\n##### This is a test\n\n"
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
context "When parsing an H6" do
|
53
|
+
let(:html) { "<h6>This is a test</h6>" }
|
54
|
+
|
55
|
+
it "should return valid markdown" do
|
56
|
+
Markitdown.from_html(html).should == "\n\n###### This is a test\n\n"
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
context "When parsing an HR" do
|
61
|
+
let(:html) { "<hr/>" }
|
62
|
+
|
63
|
+
it "should return valid markdown" do
|
64
|
+
Markitdown.from_html(html).should == "\n\n***\n\n"
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
context "When parsing an BR" do
|
69
|
+
let(:html) { "<br/>" }
|
70
|
+
|
71
|
+
it "should return valid markdown" do
|
72
|
+
Markitdown.from_html(html).should == "\n\n"
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
context "When parsing an EM element" do
|
77
|
+
let(:html) { "<em>emphasis added</em>" }
|
78
|
+
|
79
|
+
it "should return valid markdown" do
|
80
|
+
Markitdown.from_html(html).should == " *emphasis added* "
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
context "When parsing an italicized element" do
|
85
|
+
let(:html) { "<i>italics added</i>" }
|
86
|
+
|
87
|
+
it "should return valid markdown" do
|
88
|
+
Markitdown.from_html(html).should == " *italics added* "
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
context "When parsing a strong element" do
|
93
|
+
let(:html) { "<strong>strong added</strong>" }
|
94
|
+
|
95
|
+
it "should return valid markdown" do
|
96
|
+
Markitdown.from_html(html).should == " **strong added** "
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
context "When parsing a bold element" do
|
101
|
+
let(:html) { "<b>bold added</b>" }
|
102
|
+
|
103
|
+
it "should return valid markdown" do
|
104
|
+
Markitdown.from_html(html).should == " **bold added** "
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
context "When parsing a bold element that's followed by a punctuation" do
|
109
|
+
let(:html) { "<html><b>bold added</b>.</html>" }
|
110
|
+
|
111
|
+
it "should return valid markdown without a space" do
|
112
|
+
Markitdown.from_html(html).should == " **bold added**."
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
context "When parsing a em element that's followed by a punctuation" do
|
117
|
+
let(:html) { "<html><em>emphasis added</em>?</html>" }
|
118
|
+
|
119
|
+
it "should return valid markdown without a space" do
|
120
|
+
Markitdown.from_html(html).should == " *emphasis added*?"
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
context "When parsing an OL" do
|
125
|
+
let(:html) { "<ol>
|
126
|
+
<li>first bullet</li>
|
127
|
+
<li>second bullet</li>
|
128
|
+
<li>third bullet</li>
|
129
|
+
</ol>"
|
130
|
+
}
|
131
|
+
it "should return valid markdown" do
|
132
|
+
Markitdown.from_html(html).should == "
|
133
|
+
|
134
|
+
1. first bullet
|
135
|
+
1. second bullet
|
136
|
+
1. third bullet
|
137
|
+
"
|
138
|
+
end
|
139
|
+
end
|
140
|
+
|
141
|
+
context "When parsing an UL" do
|
142
|
+
let(:html) { "<ul>
|
143
|
+
<li>first bullet</li>
|
144
|
+
<li>second bullet</li>
|
145
|
+
<li>third bullet</li>
|
146
|
+
</ul>"
|
147
|
+
}
|
148
|
+
it "should return valid markdown" do
|
149
|
+
Markitdown.from_html(html).should == "
|
150
|
+
|
151
|
+
* first bullet
|
152
|
+
* second bullet
|
153
|
+
* third bullet
|
154
|
+
"
|
155
|
+
end
|
156
|
+
end
|
157
|
+
|
158
|
+
# A well-formed anchor becomes an inline Markdown link padded by spaces.
# (The fixture previously closed the <a> with </strong>, so the example
# depended on the parser recovering from malformed markup.)
context "When parsing a link" do
let(:html) { "<a href='http://www.google.com'>this is a link</a>" }

it "should return valid markdown" do
Markitdown.from_html(html).should == " [this is a link](http://www.google.com) "
end
end
|
165
|
+
|
166
|
+
context "When parsing an image" do
|
167
|
+
let(:html) { "<img src='https://www.google.com/images/srpr/logo3w.png' alt='Google Logo'>" }
|
168
|
+
|
169
|
+
it "should return valid markdown" do
|
170
|
+
Markitdown.from_html(html).should == "  "
|
171
|
+
end
|
172
|
+
end
|
173
|
+
|
174
|
+
context "When parsing an image without an alt tag" do
|
175
|
+
let(:html) { "<img src='https://www.google.com/images/srpr/logo3w.png'>" }
|
176
|
+
|
177
|
+
it "should return valid markdown" do
|
178
|
+
Markitdown.from_html(html).should == "  "
|
179
|
+
end
|
180
|
+
end
|
181
|
+
|
182
|
+
context "When parsing a style block" do
|
183
|
+
let(:html) { "<style>div.whatever { font-weight: bold; }</style>" }
|
184
|
+
|
185
|
+
it "should ignore it" do
|
186
|
+
Markitdown.from_html(html).should == ""
|
187
|
+
end
|
188
|
+
end
|
189
|
+
|
190
|
+
context "When parsing a blockquote" do
|
191
|
+
let(:html) { "<blockquote>this is a block quote</blockquote>" }
|
192
|
+
|
193
|
+
it "should return valid markdown" do
|
194
|
+
Markitdown.from_html(html).should == " > this is a block quote\n"
|
195
|
+
end
|
196
|
+
end
|
197
|
+
|
198
|
+
context "When parsing a multi line blockquote" do
|
199
|
+
let(:html) { "<blockquote>
|
200
|
+
line 1
|
201
|
+
line 2
|
202
|
+
line 3
|
203
|
+
</blockquote>" }
|
204
|
+
|
205
|
+
it "should return valid markdown" do
|
206
|
+
Markitdown.from_html(html).should == " > line 1 line 2 line 3\n"
|
207
|
+
end
|
208
|
+
end
|
209
|
+
end
|
metadata
ADDED
@@ -0,0 +1,116 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: markitdown
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Christopher Petersen
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-10-16 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rake
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rspec
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
description: A small library that uses Nokogiri to parse an HTML file and produce
|
63
|
+
Markdown
|
64
|
+
email:
|
65
|
+
- christopher.petersen@gmail.com
|
66
|
+
executables: []
|
67
|
+
extensions: []
|
68
|
+
extra_rdoc_files: []
|
69
|
+
files:
|
70
|
+
- .gitignore
|
71
|
+
- Gemfile
|
72
|
+
- LICENSE
|
73
|
+
- README.md
|
74
|
+
- Rakefile
|
75
|
+
- lib/markitdown.rb
|
76
|
+
- lib/markitdown/version.rb
|
77
|
+
- markitdown.gemspec
|
78
|
+
- spec/doc.html
|
79
|
+
- spec/doc_spec.rb
|
80
|
+
- spec/nesting_spec.rb
|
81
|
+
- spec/tag_spec.rb
|
82
|
+
homepage: ''
|
83
|
+
licenses: []
|
84
|
+
post_install_message:
|
85
|
+
rdoc_options: []
|
86
|
+
require_paths:
|
87
|
+
- lib
|
88
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
segments:
|
95
|
+
- 0
|
96
|
+
hash: 4314622301527767866
|
97
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
98
|
+
none: false
|
99
|
+
requirements:
|
100
|
+
- - ! '>='
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: '0'
|
103
|
+
segments:
|
104
|
+
- 0
|
105
|
+
hash: 4314622301527767866
|
106
|
+
requirements: []
|
107
|
+
rubyforge_project:
|
108
|
+
rubygems_version: 1.8.24
|
109
|
+
signing_key:
|
110
|
+
specification_version: 3
|
111
|
+
summary: Converts HTML to Markdown
|
112
|
+
test_files:
|
113
|
+
- spec/doc.html
|
114
|
+
- spec/doc_spec.rb
|
115
|
+
- spec/nesting_spec.rb
|
116
|
+
- spec/tag_spec.rb
|