mislav-remark 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.markdown +17 -0
- data/Rakefile +25 -0
- data/bin/remark +4 -0
- data/lib/remark.rb +99 -0
- data/spec/remark_spec.rb +96 -0
- data/spec/sample.html +40 -0
- metadata +58 -0
data/README.markdown
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
Remark
|
2
|
+
======
|
3
|
+
|
4
|
+
A Ruby tool that parses HTML and delivers proper Markup.
|
5
|
+
|
6
|
+
Usage
|
7
|
+
-----
|
8
|
+
|
9
|
+
From command-line:
|
10
|
+
|
11
|
+
ruby -Ilib -rubygems bin/remark spec/sample.html
|
12
|
+
|
13
|
+
(You can also pass the input on STDIN instead of giving a file argument.)
|
14
|
+
|
15
|
+
From Ruby code:
|
16
|
+
|
17
|
+
Remark.new('<h1>My document</h1><p>Some content</p>').to_markdown
|
data/Rakefile
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
desc "generates .gemspec file"
# Builds the gem specification in memory, checks that its generated Ruby
# source can be evaluated under a raised $SAFE level, and only then writes
# it to "<name>.gemspec" in the current directory.
task :gemspec do
  spec = Gem::Specification.new do |gem|
    gem.name = "remark"
    gem.summary = "HTML to Markdown converter"
    gem.email = "mislav.marohnic@gmail.com"
    gem.homepage = "http://github.com/mislav/remark"
    gem.authors = ["Mislav Marohnić"]
    gem.has_rdoc = false

    gem.version = '0.1.0'
    gem.files = FileList['Rakefile', '{bin,lib,rails,spec}/**/*', 'README*', 'LICENSE*']
    gem.executables = Dir['bin/*'].map { |f| File.basename(f) }
  end

  spec_string = spec.to_ruby

  begin
    # The generated source is evaluated in a separate thread so that the
    # raised $SAFE level does not leak into the rest of the rake process.
    # NOTE(review): $SAFE levels were deprecated and later removed from Ruby;
    # confirm this check still behaves as intended on the Ruby in use.
    Thread.new { eval("$SAFE = 3\n#{spec_string}", binding) }.join
  rescue StandardError, SecurityError => error
    # SecurityError is listed explicitly: from Ruby 1.9 on it is no longer a
    # StandardError, so a bare `rescue` would let it escape uncaught.
    abort "unsafe gemspec: #{error}"
  else
    File.open("#{spec.name}.gemspec", 'w') { |file| file.write spec_string }
  end
end
|
data/bin/remark
ADDED
data/lib/remark.rb
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
require 'hpricot'
|
2
|
+
|
3
|
+
# Converts an HTML document into Markdown.
#
# The source is parsed with Hpricot. Elements that have a Markdown
# counterpart and carry no attributes beyond the ones Markdown can express
# are rewritten; every other element is passed through unchanged so it ends
# up in the output as HTML.
class Remark
  # source - the HTML to convert; handed to Hpricot() as given.
  def initialize(source)
    @doc = Hpricot(source)
  end

  # Returns the Markdown rendering of the whole document as a String.
  # Top-level pieces are separated by one blank line.
  def to_markdown
    remark_children(@doc).join("\n\n")
  end

  # Names of elements that are dropped from the output together with
  # their content.
  IGNORE = %w(script head style)

  private

  # Tells whether +elem+ can be written as Markdown without losing
  # attributes: links may only have href (plus optional title), images only
  # alt and src (plus optional title), anything else no attributes at all.
  def valid_attributes?(elem)
    case elem.name
    when 'a'
      (elem.attributes.keys - %w(title)) == %w(href)
    when 'img'
      (elem.attributes.keys - %w(title)).sort == %w(alt src)
    else
      elem.attributes.empty?
    end
  end

  # Converts every child of +node+ and returns the results as an Array,
  # leaving out children that produced nothing (nil).
  def remark_children(node)
    # `|| []` tolerates nodes that report nil instead of an empty collection.
    (node.children || []).map { |item| remark_item(item) }.compact
  end

  # Converts a single node.
  #
  # Returns a String for text and for elements rewritten as Markdown, the
  # node itself for elements that are passed through, or nil when the node
  # contributes nothing (blank text, ignored elements, other node types).
  def remark_item(item)
    if item.text?
      text = item.to_s
      # \A/\z anchor the whole string. Line anchors (^/$) would also match
      # any text that merely contains an empty line and wrongly discard it.
      text.gsub(/\n+/, ' ') unless text =~ /\A\s*\z/
    elsif item.elem?
      if IGNORE.include?(item.name)
        nil
      elsif valid_attributes?(item)
        remark_element(item)
      else
        item
      end
    end
  end

  # Rewrites +elem+ as Markdown according to its tag name. Elements without
  # a Markdown counterpart are returned unchanged.
  def remark_element(elem)
    case elem.name
    when 'p', 'li'
      remark_inline(elem)
    when /\Ah([1-6])\z/
      level = Regexp.last_match(1).to_i
      "#{'#' * level} #{remark_inline(elem)}"
    when 'ul', 'ol'
      remark_list(elem)
    when 'pre'
      # Code blocks: every line is indented by four spaces.
      elem.inner_text.gsub(/^/, ' ' * 4)
    when 'em'
      "_#{elem.inner_text}_"
    when 'strong'
      "**#{elem.inner_text}**"
    when 'code'
      "`#{elem.inner_text}`"
    when 'a'
      remark_link(elem.inner_html, elem.attributes['href'], elem.attributes['title'])
    when 'img'
      '!' + remark_link(elem.attributes['alt'], elem.attributes['src'], elem.attributes['title'])
    when 'blockquote'
      remark_children(elem).join("\n\n").gsub(/^/, '> ')
    else
      elem
    end
  end

  # Builds Markdown link syntax: [text](href) or [text](href "title").
  def remark_link(text, href, title = nil)
    title_markup = title ? %( "#{title}") : ''
    "[#{text}](#{href}#{title_markup})"
  end

  # Converts the children of +elem+ and glues them together without any
  # separator, for content that stays on one line.
  def remark_inline(elem)
    remark_children(elem).join('')
  end

  # Renders the children of a <ul> or <ol>, one per line, prefixed with
  # "*" for unordered lists or a running "1.", "2.", ... for ordered ones.
  def remark_list(list)
    unordered = list.name == 'ul'
    position = 0
    remark_children(list).map do |item|
      prefix = unordered ? '*' : "#{position += 1}."
      # Interpolation (not String#+) because an item may be a passed-through
      # element, not a String.
      "#{prefix} #{item}"
    end.join("\n")
  end
end
|
data/spec/remark_spec.rb
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
require 'remark'
|
2
|
+
|
3
|
+
# Behavioural examples for Remark's HTML-to-Markdown conversion.
describe Remark do
  # Converts +source+ with a fresh Remark instance and returns the Markdown.
  def remark(source)
    described_class.new(source).to_markdown
  end

  it "should let through text content" do
    remark("Foo bar").should == 'Foo bar'
    remark("Foo bar\nbaz").should == 'Foo bar baz'
  end

  it "should split paragraphs with an empty line" do
    remark("<p>Foo bar</p>").should == 'Foo bar'
    remark("<p>Foo bar</p><p>baz").should == "Foo bar\n\nbaz"
    remark("<p>Foo bar</p>baz").should == "Foo bar\n\nbaz"
  end

  it "should output title syntax" do
    remark("<h1>Foo bar</h1>").should == '# Foo bar'
    remark("<h2>Foo bar</h2>").should == '## Foo bar'
  end

  it "should preserve elements in remarked blocks" do
    remark("<p>Foo <ins>bar</ins></p>").should == 'Foo <ins>bar</ins>'
    remark("<h2>Foo <ins>bar</ins></h2>").should == '## Foo <ins>bar</ins>'
  end

  it "should unescape HTML entities" do
    # The input carries entities; the output must contain the plain characters.
    remark("Foo&amp;bar").should == 'Foo&bar'
    remark("<p>If you&#8217;re doing all your development on the &#8220;master&#8221; branch, you&#8217;re not using git").should == "If you’re doing all your development on the “master” branch, you’re not using git"
  end

  it "should ignore tags without user-facing content" do
    remark("<script>foo</script>").should == ''
    remark("<head>foo</head>").should == ''
  end

  it "should leave known elements with attributes intact" do
    remark("<p class='notice'>Kittens attack!</p>").should == '<p class="notice">Kittens attack!</p>'
  end

  it "should leave unknown elements intact" do
    remark(<<-HTML).should == "Foo\n\n<table>data</table>\n\nBar"
      <p>Foo</p>
      <table>data</table>
      <p>Bar</p>
    HTML
  end

  it "should support lists" do
    remark(<<-HTML).should == "* foo\n* bar"
      <ul>
        <li>foo</li>
        <li>bar</li>
      </ul>
    HTML

    remark(<<-HTML).should == "1. foo\n2. bar"
      <ol>
        <li>foo</li>
        <li>bar</li>
      </ol>
    HTML
  end

  it "should support preformatted blocks" do
    # Every line of a <pre> block gains a four-space indent.
    remark("<pre>def foo\n  bar\nend</pre>").should == "    def foo\n      bar\n    end"
    remark("<pre><code>def foo\n  &lt;bar&gt;\nend</code></pre>").should == "    def foo\n      <bar>\n    end"
  end

  it "should remark inline elements" do
    remark("<p>I'm so <strong>strong</strong></p>").should == "I'm so **strong**"
    remark("<p>I'm so <em>emo</em></p>").should == "I'm so _emo_"
    remark("<p>Write more <code>code</code></p>").should == "Write more `code`"
    remark("<ul><li><em>Inline</em> stuff in <strong>lists</strong></li></ul>").should == "* _Inline_ stuff in **lists**"
    remark("<h1>Headings <em>too</em></h1>").should == '# Headings _too_'
  end

  it "should support hyperlinks" do
    remark("<p>Click <a href='http://mislav.uniqpath.com'>here</a></p>").should ==
      "Click [here](http://mislav.uniqpath.com)"
    remark("<a href='/foo' title='bar'>baz</a>").should == '[baz](/foo "bar")'
  end

  it "should support blockquotes" do
    remark("<blockquote>Cogito, ergo sum</blockquote>").should == '> Cogito, ergo sum'
    remark("<blockquote><p>I think</p><p>therefore I am</p></blockquote>").should == "> I think\n> \n> therefore I am"
  end

  it "should support image tags" do
    # Only alt/src (and title) can be expressed in Markdown; any other
    # attribute makes the tag pass through as HTML.
    remark("<img src='moo.jpg' alt='cow'>").should == '![cow](moo.jpg)'
    remark("<img src='moo.jpg' alt='cow' width='16'>").should == '<img src="moo.jpg" alt="cow" width="16" />'
  end
end
|
96
|
+
|
data/spec/sample.html
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
<h1>Remark — HTML to Markdown converter</h1>
|
2
|
+
|
3
|
+
<p>This is a sample document which will get updated as Remark understands more HTML.
|
4
|
+
It reflects what's currently supported.</p>
|
5
|
+
|
6
|
+
<p class="nice">Known block elements are left intact if they have attributes.
|
7
|
+
Markdown doesn't have a syntax for them.</p>
|
8
|
+
|
9
|
+
<table>
|
10
|
+
<tr>
|
11
|
+
<td>Elements that can't be represented in Markdown are left intact.</td>
|
12
|
+
</tr>
|
13
|
+
</table>
|
14
|
+
|
15
|
+
<p>SCRIPT and HEAD tags are swallowed, as browsers don't render them as content.</p>
|
16
|
+
|
17
|
+
<script type="text/javascript">
|
18
|
+
alert("I will not survive")
|
19
|
+
</script>
|
20
|
+
|
21
|
+
<p>Remark supports Markdown syntax for <em>inline</em> markup.
|
22
|
+
<a href="http://github.com/mislav">Hyperlinks</a> and <code>code spans</code> are a must.</p>
|
23
|
+
|
24
|
+
<ul>
|
25
|
+
<li>List items too;</li>
|
26
|
+
<li>ordered or unordered.</li>
|
27
|
+
</ul>
|
28
|
+
|
29
|
+
<pre><code>And who would forget
|
30
|
+
Preformatted code blocks :)</code></pre>
|
31
|
+
|
32
|
+
<h2>TODO</h2>
|
33
|
+
|
34
|
+
<p>Remark should probably support BR elements in paragraphs,<br>
|
35
|
+
although people tend to abuse them.</p>
|
36
|
+
|
37
|
+
<div class="content">
|
38
|
+
<p>What to do with pieces of content inside wrapper elements,
|
39
|
+
like DIV, is still undecided.</p>
|
40
|
+
</div>
|
metadata
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: mislav-remark
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- "Mislav Marohni\xC4\x87"
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-06-23 00:00:00 -07:00
|
13
|
+
default_executable: remark
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description:
|
17
|
+
email: mislav.marohnic@gmail.com
|
18
|
+
executables:
|
19
|
+
- remark
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files: []
|
23
|
+
|
24
|
+
files:
|
25
|
+
- Rakefile
|
26
|
+
- bin/remark
|
27
|
+
- lib/remark.rb
|
28
|
+
- spec/remark_spec.rb
|
29
|
+
- spec/sample.html
|
30
|
+
- README.markdown
|
31
|
+
has_rdoc: false
|
32
|
+
homepage: http://github.com/mislav/remark
|
33
|
+
post_install_message:
|
34
|
+
rdoc_options: []
|
35
|
+
|
36
|
+
require_paths:
|
37
|
+
- lib
|
38
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
39
|
+
requirements:
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: "0"
|
43
|
+
version:
|
44
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - ">="
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: "0"
|
49
|
+
version:
|
50
|
+
requirements: []
|
51
|
+
|
52
|
+
rubyforge_project:
|
53
|
+
rubygems_version: 1.2.0
|
54
|
+
signing_key:
|
55
|
+
specification_version: 3
|
56
|
+
summary: HTML to Markdown converter
|
57
|
+
test_files: []
|
58
|
+
|