mislav-remark 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.markdown +17 -0
- data/Rakefile +25 -0
- data/bin/remark +4 -0
- data/lib/remark.rb +99 -0
- data/spec/remark_spec.rb +96 -0
- data/spec/sample.html +40 -0
- metadata +58 -0
data/README.markdown
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
Remark
|
2
|
+
======
|
3
|
+
|
4
|
+
A Ruby tool that parses HTML and delivers proper Markup.
|
5
|
+
|
6
|
+
Usage
|
7
|
+
-----
|
8
|
+
|
9
|
+
From command-line:
|
10
|
+
|
11
|
+
ruby -Ilib -rubygems bin/remark spec/sample.html
|
12
|
+
|
13
|
+
(You can also pass the input via STDIN instead of giving a file argument.)
|
14
|
+
|
15
|
+
From Ruby code:
|
16
|
+
|
17
|
+
Remark.new('<h1>My document</h1><p>Some content</p>').to_markdown
|
data/Rakefile
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# Rake task that builds the gem specification in memory, serializes it to Ruby
# source with Gem::Specification#to_ruby, evaluates that source once as a
# sanity check, and, if evaluation succeeds, writes it to "<name>.gemspec".
desc "generates .gemspec file"
task :gemspec do
  spec = Gem::Specification.new do |gem|
    gem.name = "remark"
    gem.summary = "HTML to Markdown converter"
    gem.email = "mislav.marohnic@gmail.com"
    gem.homepage = "http://github.com/mislav/remark"
    gem.authors = ["Mislav Marohnić"]
    gem.has_rdoc = false

    gem.version = '0.1.0'
    gem.files = FileList['Rakefile', '{bin,lib,rails,spec}/**/*', 'README*', 'LICENSE*']
    # Every file under bin/ is exposed as an executable, by basename.
    gem.executables = Dir['bin/*'].map { |f| File.basename(f) }
  end

  spec_string = spec.to_ruby

  # The generated source used to be evaluated under `$SAFE = 3`. Safe levels
  # 2-4 raise ArgumentError from Ruby 2.3 on, which made this task abort
  # unconditionally, so the safe level is only applied on interpreters that
  # still support it. On newer Rubies the source is evaluated without a sandbox.
  sandbox = Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.3') ? "$SAFE = 3\n" : ''

  begin
    # Run in a separate thread so a raised safe level stays confined to it;
    # Thread#join re-raises any error from the evaluation here.
    Thread.new { eval("#{sandbox}#{spec_string}", binding) }.join
  rescue
    abort "unsafe gemspec: #{$!}"
  else
    File.open("#{spec.name}.gemspec", 'w') { |file| file.write spec_string }
  end
end
|
data/bin/remark
ADDED
data/lib/remark.rb
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
require 'hpricot'
|
2
|
+
|
3
|
+
# Converts an HTML document into Markdown.
#
# The source is parsed with Hpricot and the node tree is walked recursively.
# Elements that have a Markdown equivalent and carry no attributes Markdown
# cannot express are rewritten; every other element is passed through
# unchanged and rendered with its own #to_s when the output is joined.
#
#   Remark.new('<h1>My document</h1><p>Some content</p>').to_markdown
class Remark
  # source - HTML markup handed to Hpricot() for parsing.
  def initialize(source)
    @doc = Hpricot(source)
  end

  # Returns the Markdown String for the document: the converted top-level
  # nodes separated by blank lines.
  def to_markdown
    remark_children(@doc).join("\n\n")
  end

  # Element names that are dropped from the output together with their content.
  IGNORE = %w(script head style)

  private

  # True when the element's attributes can all be expressed in Markdown:
  # href (plus optional title) for links, alt and src (plus optional title)
  # for images, and no attributes at all for anything else.
  def valid_attributes?(elem)
    case elem.name
    when 'a'
      (elem.attributes.keys - %w(title)) == %w(href)
    when 'img'
      (elem.attributes.keys - %w(title)).sort == %w(alt src)
    else
      elem.attributes.empty?
    end
  end

  # Converts each child of +node+ and returns the non-nil results as an Array.
  def remark_children(node)
    remarked = []
    # Guard against a nil children collection on a childless node
    # (NOTE(review): confirm whether Hpricot ever returns nil here).
    (node.children || []).each do |item|
      result = remark_item(item)
      remarked << result if result
    end
    remarked
  end

  # Converts a single node. Returns a String for converted content, the
  # element itself when it must be kept as HTML, or nil when the node
  # contributes nothing (blank text, ignored elements, other node types).
  def remark_item(item)
    if item.text?
      text = item.to_s
      # \A..\z anchors the whole string. With ^..$ a single empty line inside
      # otherwise meaningful text made the entire text node disappear.
      text.gsub(/\n+/, ' ') unless text =~ /\A\s*\z/
    elsif item.elem?
      if IGNORE.include?(item.name)
        nil
      elsif valid_attributes?(item)
        remark_element(item)
      else
        item
      end
    end
  end

  # Returns the Markdown for an element, or the element itself when there is
  # no Markdown syntax for it.
  def remark_element(elem)
    case elem.name
    when 'p'
      remark_inline(elem)
    when /\Ah([1-6])\z/
      ('#' * Regexp.last_match(1).to_i) + ' ' + remark_inline(elem)
    when 'ul', 'ol'
      remark_list(elem)
    when 'li'
      remark_inline(elem)
    when 'pre'
      # Code blocks are indented by four spaces on every line.
      elem.inner_text.gsub(/^/, ' '*4)
    when 'em'
      "_#{elem.inner_text}_"
    when 'strong'
      "**#{elem.inner_text}**"
    when 'code'
      "`#{elem.inner_text}`"
    when 'a'
      remark_link(elem.inner_html, elem.attributes['href'], elem.attributes['title'])
    when 'img'
      '!' + remark_link(elem.attributes['alt'], elem.attributes['src'], elem.attributes['title'])
    when 'blockquote'
      # Prefix every line, including the blank separator lines, with "> ".
      remark_children(elem).join("\n\n").gsub(/^/, '> ')
    else
      elem
    end
  end

  # Builds inline link syntax: [text](href) or [text](href "title").
  def remark_link(text, href, title = nil)
    title_markup = title ? %( "#{title}") : ''
    "[#{text}](#{href}#{title_markup})"
  end

  # Converts the children of +elem+ and concatenates them without separators.
  def remark_inline(elem)
    remark_children(elem).join('')
  end

  # Renders a ul as "* item" lines and an ol as "1. item", "2. item", ...
  def remark_list(list)
    unordered = list.name == 'ul'
    number = 0
    remark_children(list).map do |item|
      bullet = unordered ? '*' : "#{number += 1}."
      # Interpolation rather than String#+: a child that was passed through
      # as an element (e.g. an li with attributes) is not a String, and
      # concatenating it raised TypeError.
      "#{bullet} #{item}"
    end.join("\n")
  end
end
|
data/spec/remark_spec.rb
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
require 'remark'
|
2
|
+
|
3
|
+
# Behavioural examples for Remark#to_markdown.
#
# NOTE(review): HTML entities in the inputs and the indentation inside the
# <pre> examples were reconstructed; confirm against the original spec. The
# preformatted expectations follow the library's four-space code indent.
describe Remark do
  # Converts an HTML string to Markdown with the class under test.
  def remark(source)
    described_class.new(source).to_markdown
  end

  it "should let through text content" do
    remark("Foo bar").should == 'Foo bar'
    remark("Foo bar\nbaz").should == 'Foo bar baz'
  end

  it "should split paragraphs with an empty line" do
    remark("<p>Foo bar</p>").should == 'Foo bar'
    remark("<p>Foo bar</p><p>baz").should == "Foo bar\n\nbaz"
    remark("<p>Foo bar</p>baz").should == "Foo bar\n\nbaz"
  end

  it "should output title syntax" do
    remark("<h1>Foo bar</h1>").should == '# Foo bar'
    remark("<h2>Foo bar</h2>").should == '## Foo bar'
  end

  it "should preserve elements in remarked blocks" do
    remark("<p>Foo <ins>bar</ins></p>").should == 'Foo <ins>bar</ins>'
    remark("<h2>Foo <ins>bar</ins></h2>").should == '## Foo <ins>bar</ins>'
  end

  it "should unescape HTML entities" do
    remark("Foo&amp;bar").should == 'Foo&bar'
    remark("<p>If you&#8217;re doing all your development on the &#8220;master&#8221; branch, you&#8217;re not using git").should == "If you’re doing all your development on the “master” branch, you’re not using git"
  end

  it "should ignore tags without user-facing content" do
    remark("<script>foo</script>").should == ''
    remark("<head>foo</head>").should == ''
  end

  it "should leave known elements with attributes intact" do
    remark("<p class='notice'>Kittens attack!</p>").should == '<p class="notice">Kittens attack!</p>'
  end

  it "should leave unknown elements intact" do
    remark(<<-HTML).should == "Foo\n\n<table>data</table>\n\nBar"
      <p>Foo</p>
      <table>data</table>
      <p>Bar</p>
    HTML
  end

  it "should support lists" do
    remark(<<-HTML).should == "* foo\n* bar"
      <ul>
        <li>foo</li>
        <li>bar</li>
      </ul>
    HTML

    remark(<<-HTML).should == "1. foo\n2. bar"
      <ol>
        <li>foo</li>
        <li>bar</li>
      </ol>
    HTML
  end

  it "should support preformatted blocks" do
    remark("<pre>def foo\n  bar\nend</pre>").should == "    def foo\n      bar\n    end"
    remark("<pre><code>def foo\n  &lt;bar&gt;\nend</code></pre>").should == "    def foo\n      <bar>\n    end"
  end

  it "should remark inline elements" do
    remark("<p>I'm so <strong>strong</strong></p>").should == "I'm so **strong**"
    remark("<p>I'm so <em>emo</em></p>").should == "I'm so _emo_"
    remark("<p>Write more <code>code</code></p>").should == "Write more `code`"
    remark("<ul><li><em>Inline</em> stuff in <strong>lists</strong></li></ul>").should == "* _Inline_ stuff in **lists**"
    remark("<h1>Headings <em>too</em></h1>").should == '# Headings _too_'
  end

  it "should support hyperlinks" do
    remark("<p>Click <a href='http://mislav.uniqpath.com'>here</a></p>").should ==
      "Click [here](http://mislav.uniqpath.com)"
    remark("<a href='/foo' title='bar'>baz</a>").should == '[baz](/foo "bar")'
  end

  it "should support blockquotes" do
    remark("<blockquote>Cogito, ergo sum</blockquote>").should == '> Cogito, ergo sum'
    remark("<blockquote><p>I think</p><p>therefore I am</p></blockquote>").should == "> I think\n> \n> therefore I am"
  end

  it "should support image tags" do
    remark("<img src='moo.jpg' alt='cow'>").should == '![cow](moo.jpg)'
    remark("<img src='moo.jpg' alt='cow' width='16'>").should == '<img src="moo.jpg" alt="cow" width="16" />'
  end
end
|
96
|
+
|
data/spec/sample.html
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
<h1>Remark — HTML to Markdown converter</h1>
|
2
|
+
|
3
|
+
<p>This is a sample document which will get updated as Remark understands more HTML.
|
4
|
+
It reflects what's currently supported.</p>
|
5
|
+
|
6
|
+
<p class="nice">Known block elements are left intact if they have attributes.
|
7
|
+
Markdown doesn't have a syntax for them.</p>
|
8
|
+
|
9
|
+
<table>
|
10
|
+
<tr>
|
11
|
+
<td>Elements that can't be represented in Markdown are left intact.</td>
|
12
|
+
</tr>
|
13
|
+
</table>
|
14
|
+
|
15
|
+
<p>SCRIPT and HEAD tags are swallowed, as browsers don't render them as content.</p>
|
16
|
+
|
17
|
+
<script type="text/javascript">
|
18
|
+
alert("I will not survive")
|
19
|
+
</script>
|
20
|
+
|
21
|
+
<p>Remark supports Markdown syntax for <em>inline</em> markup.
|
22
|
+
<a href="http://github.com/mislav">Hyperlinks</a> and <code>code spans</code> are a must.</p>
|
23
|
+
|
24
|
+
<ul>
|
25
|
+
<li>List items too;</li>
|
26
|
+
<li>ordered or unordered.</li>
|
27
|
+
</ul>
|
28
|
+
|
29
|
+
<pre><code>And who would forget
|
30
|
+
Preformatted code blocks :)</code></pre>
|
31
|
+
|
32
|
+
<h2>TODO</h2>
|
33
|
+
|
34
|
+
<p>Remark should probably support BR elements in paragraphs,<br>
|
35
|
+
although people tend to abuse them.</p>
|
36
|
+
|
37
|
+
<div class="content">
|
38
|
+
<p>What to do with pieces of content inside wrapper elements,
|
39
|
+
like DIV, is still undecided.</p>
|
40
|
+
</div>
|
metadata
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: mislav-remark
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- "Mislav Marohni\xC4\x87"
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-06-23 00:00:00 -07:00
|
13
|
+
default_executable: remark
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description:
|
17
|
+
email: mislav.marohnic@gmail.com
|
18
|
+
executables:
|
19
|
+
- remark
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files: []
|
23
|
+
|
24
|
+
files:
|
25
|
+
- Rakefile
|
26
|
+
- bin/remark
|
27
|
+
- lib/remark.rb
|
28
|
+
- spec/remark_spec.rb
|
29
|
+
- spec/sample.html
|
30
|
+
- README.markdown
|
31
|
+
has_rdoc: false
|
32
|
+
homepage: http://github.com/mislav/remark
|
33
|
+
post_install_message:
|
34
|
+
rdoc_options: []
|
35
|
+
|
36
|
+
require_paths:
|
37
|
+
- lib
|
38
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
39
|
+
requirements:
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: "0"
|
43
|
+
version:
|
44
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - ">="
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: "0"
|
49
|
+
version:
|
50
|
+
requirements: []
|
51
|
+
|
52
|
+
rubyforge_project:
|
53
|
+
rubygems_version: 1.2.0
|
54
|
+
signing_key:
|
55
|
+
specification_version: 3
|
56
|
+
summary: HTML to Markdown converter
|
57
|
+
test_files: []
|
58
|
+
|