mislav-remark 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.markdown ADDED
@@ -0,0 +1,17 @@
1
+ Remark
2
+ ======
3
+
4
+ A Ruby tool that parses HTML and converts it to proper Markdown.
5
+
6
+ Usage
7
+ -----
8
+
9
+ From command-line:
10
+
11
+ ruby -Ilib -rubygems bin/remark spec/sample.html
12
+
13
+ (You can also pass the input via STDIN instead of giving a file argument.)
14
+
15
+ From Ruby code:
16
+
17
+ Remark.new('<h1>My document</h1><p>Some content</p>').to_markdown
data/Rakefile ADDED
@@ -0,0 +1,25 @@
1
desc "generates .gemspec file"
task :gemspec do
  spec = Gem::Specification.new do |gem|
    gem.name = "remark"
    gem.summary = "HTML to Markdown converter"
    gem.email = "mislav.marohnic@gmail.com"
    gem.homepage = "http://github.com/mislav/remark"
    gem.authors = ["Mislav Marohnić"]
    # has_rdoc= is deprecated in newer RubyGems; only set it where the
    # setter still exists so the task keeps working on both old and new versions.
    gem.has_rdoc = false if gem.respond_to?(:has_rdoc=)

    gem.version = '0.1.0'
    gem.files = FileList['Rakefile', '{bin,lib,rails,spec}/**/*', 'README*', 'LICENSE*']
    gem.executables = Dir['bin/*'].map { |f| File.basename(f) }
  end

  spec_string = spec.to_ruby

  begin
    # Round-trip check: make sure the generated Ruby actually loads before it
    # is written to disk. The string is produced by RubyGems from the spec
    # above (not external input), so a plain eval is acceptable here.
    # The former `$SAFE = 3` sandbox was dropped: safe levels 2-4 raise
    # ArgumentError on Ruby 2.3+ and $SAFE has no effect on Ruby 3.0+.
    eval(spec_string, TOPLEVEL_BINDING, "#{spec.name}.gemspec")
  rescue StandardError, ScriptError => e
    # ScriptError is listed explicitly because SyntaxError is not a StandardError.
    abort "invalid gemspec: #{e.message}"
  else
    File.open("#{spec.name}.gemspec", 'w') { |file| file.write spec_string }
  end
end
data/bin/remark ADDED
@@ -0,0 +1,4 @@
1
#!/usr/bin/env ruby
# Command-line front end: reads HTML from the files named on the command line
# (or from STDIN when none are given) and prints the Markdown conversion.
require 'remark'

html = ARGF.read
puts Remark.new(html).to_markdown
data/lib/remark.rb ADDED
@@ -0,0 +1,99 @@
1
+ require 'hpricot'
2
+
3
# Converts an HTML document into Markdown.
#
# Elements that have a Markdown equivalent and carry no extra attributes are
# rewritten as Markdown; any other element is passed through unchanged so that
# its HTML ends up in the output instead of being lost.
class Remark
  # @param source [String] the HTML to convert (parsed with Hpricot)
  def initialize(source)
    @doc = Hpricot(source)
  end

  # @return [String] the Markdown rendering; top-level blocks are separated
  #   by an empty line
  def to_markdown
    remark_children(@doc).join("\n\n")
  end

  # Names of elements that are dropped entirely, content included.
  IGNORE = %w(script head style).freeze

  private

  # Tells whether +elem+ carries only attributes that Markdown can express.
  # Links may have +href+ (plus optional +title+), images need +alt+ and +src+
  # (plus optional +title+), every other element must have no attributes.
  def valid_attributes?(elem)
    case elem.name
    when 'a'
      (elem.attributes.keys - %w(title)) == %w(href)
    when 'img'
      (elem.attributes.keys - %w(title)).sort == %w(alt src)
    else
      elem.attributes.empty?
    end
  end

  # Converts every child of +node+ and returns the non-nil results in
  # document order.
  def remark_children(node)
    # Guard against a nil child list (an element without content may not
    # expose an array, depending on the parser version).
    (node.children || []).map { |item| remark_item(item) }.compact
  end

  # Converts a single node.
  #
  # Returns a String for text and convertible elements, the element itself
  # when it has to be kept as HTML, or nil when the node produces no output
  # (whitespace-only text, ignored elements, other node kinds).
  def remark_item(item)
    if item.text?
      text = item.to_s
      # \A...\z anchors the whole string. The per-line anchors ^...$ would
      # also match text that merely contains an empty line or starts with a
      # newline, silently discarding real content.
      text.gsub(/\n+/, ' ') unless text =~ /\A\s*\z/
    elsif item.elem?
      if IGNORE.include?(item.name)
        nil
      elsif valid_attributes?(item)
        remark_element(item)
      else
        item
      end
    end
  end

  # Renders +elem+ with the Markdown syntax matching its tag name; elements
  # without a Markdown counterpart are returned untouched.
  def remark_element(elem)
    case elem.name
    when 'p'
      remark_inline(elem)
    when /\Ah([1-6])\z/
      level = Regexp.last_match(1).to_i
      "#{'#' * level} #{remark_inline(elem)}"
    when 'ul', 'ol'
      remark_list(elem)
    when 'li'
      remark_inline(elem)
    when 'pre'
      # Code blocks are marked by indenting every line with four spaces.
      elem.inner_text.gsub(/^/, ' ' * 4)
    when 'em'
      "_#{elem.inner_text}_"
    when 'strong'
      "**#{elem.inner_text}**"
    when 'code'
      "`#{elem.inner_text}`"
    when 'a'
      remark_link(elem.inner_html, elem.attributes['href'], elem.attributes['title'])
    when 'img'
      '!' + remark_link(elem.attributes['alt'], elem.attributes['src'], elem.attributes['title'])
    when 'blockquote'
      remark_children(elem).join("\n\n").gsub(/^/, '> ')
    else
      elem
    end
  end

  # Builds inline link syntax: [text](href "title"), the title being optional.
  def remark_link(text, href, title = nil)
    title_markup = title ? %( "#{title}") : ''
    "[#{text}](#{href}#{title_markup})"
  end

  # Concatenates the converted children of +elem+ without any separator.
  def remark_inline(elem)
    remark_children(elem).join('')
  end

  # Renders a +ul+ with "*" bullets or an +ol+ with 1-based numbers, one
  # converted child per line.
  def remark_list(list)
    unordered = list.name == 'ul'
    lines = []
    remark_children(list).each_with_index do |item, index|
      marker = unordered ? '*' : "#{index + 1}."
      # Interpolation instead of String#+ because an item may be an element
      # that was kept as HTML (e.g. an <li> with attributes), not a String.
      lines << "#{marker} #{item}"
    end
    lines.join("\n")
  end
end
@@ -0,0 +1,96 @@
1
require 'remark'

# Behavioural specification for Remark, the HTML to Markdown converter.
describe Remark do
  # Converts +source+ (an HTML string) and returns the resulting Markdown.
  def remark(source)
    described_class.new(source).to_markdown
  end

  it "should let through text content" do
    remark("Foo bar").should == 'Foo bar'
    remark("Foo bar\nbaz").should == 'Foo bar baz'
  end

  it "should split paragraphs with an empty line" do
    remark("<p>Foo bar</p>").should == 'Foo bar'
    remark("<p>Foo bar</p><p>baz").should == "Foo bar\n\nbaz"
    remark("<p>Foo bar</p>baz").should == "Foo bar\n\nbaz"
  end

  it "should output title syntax" do
    remark("<h1>Foo bar</h1>").should == '# Foo bar'
    remark("<h2>Foo bar</h2>").should == '## Foo bar'
  end

  it "should preserve elements in remarked blocks" do
    remark("<p>Foo <ins>bar</ins></p>").should == 'Foo <ins>bar</ins>'
    remark("<h2>Foo <ins>bar</ins></h2>").should == '## Foo <ins>bar</ins>'
  end

  it "should unescape HTML entities" do
    remark("Foo&amp;bar").should == 'Foo&bar'
    remark("<p>If you&#8217;re doing all your development on the &#8220;master&#8221; branch, you&#8217;re not using git").should == "If you’re doing all your development on the “master” branch, you’re not using git"
  end

  it "should ignore tags without user-facing content" do
    remark("<script>foo</script>").should == ''
    remark("<head>foo</head>").should == ''
  end

  it "should leave known elements with attributes intact" do
    remark("<p class='notice'>Kittens attack!</p>").should == '<p class="notice">Kittens attack!</p>'
  end

  it "should leave unknown elements intact" do
    remark(<<-HTML).should == "Foo\n\n<table>data</table>\n\nBar"
      <p>Foo</p>
      <table>data</table>
      <p>Bar</p>
    HTML
  end

  it "should support lists" do
    remark(<<-HTML).should == "* foo\n* bar"
      <ul>
        <li>foo</li>
        <li>bar</li>
      </ul>
    HTML

    remark(<<-HTML).should == "1. foo\n2. bar"
      <ol>
        <li>foo</li>
        <li>bar</li>
      </ol>
    HTML
  end

  it "should support preformatted blocks" do
    # Every line of a <pre> block is prefixed with four spaces, so the
    # expected output is the input text with that indent added to each line.
    remark("<pre>def foo\n bar\nend</pre>").should == "    def foo\n     bar\n    end"
    remark("<pre><code>def foo\n &lt;bar&gt;\nend</code></pre>").should == "    def foo\n     <bar>\n    end"
  end

  it "should remark inline elements" do
    remark("<p>I'm so <strong>strong</strong></p>").should == "I'm so **strong**"
    remark("<p>I'm so <em>emo</em></p>").should == "I'm so _emo_"
    remark("<p>Write more <code>code</code></p>").should == "Write more `code`"
    remark("<ul><li><em>Inline</em> stuff in <strong>lists</strong></li></ul>").should == "* _Inline_ stuff in **lists**"
    remark("<h1>Headings <em>too</em></h1>").should == '# Headings _too_'
  end

  it "should support hyperlinks" do
    remark("<p>Click <a href='http://mislav.uniqpath.com'>here</a></p>").should ==
      "Click [here](http://mislav.uniqpath.com)"
    remark("<a href='/foo' title='bar'>baz</a>").should == '[baz](/foo "bar")'
  end

  it "should support blockquotes" do
    remark("<blockquote>Cogito, ergo sum</blockquote>").should == '> Cogito, ergo sum'
    remark("<blockquote><p>I think</p><p>therefore I am</p></blockquote>").should == "> I think\n> \n> therefore I am"
  end

  it "should support image tags" do
    remark("<img src='moo.jpg' alt='cow'>").should == '![cow](moo.jpg)'
    remark("<img src='moo.jpg' alt='cow' width='16'>").should == '<img src="moo.jpg" alt="cow" width="16" />'
  end
end
96
+
data/spec/sample.html ADDED
@@ -0,0 +1,40 @@
1
+ <h1>Remark &mdash; HTML to Markdown converter</h1>
2
+
3
+ <p>This is a sample document which will get updated as Remark understands more HTML.
4
+ It reflects what's currently supported.</p>
5
+
6
+ <p class="nice">Known block elements are left intact if they have attributes.
7
+ Markdown doesn't have a syntax for them.</p>
8
+
9
+ <table>
10
+ <tr>
11
+ <td>Elements that can't be represented in Markdown are left intact.</td>
12
+ </tr>
13
+ </table>
14
+
15
+ <p>SCRIPT and HEAD tags are swallowed, as browsers don't render them as content.</p>
16
+
17
+ <script type="text/javascript">
18
+ alert("I will not survive")
19
+ </script>
20
+
21
+ <p>Remark supports Markdown syntax for <em>inline</em> markup.
22
+ <a href="http://github.com/mislav">Hyperlinks</a> and <code>code spans</code> are a must.</p>
23
+
24
+ <ul>
25
+ <li>List items too;</li>
26
+ <li>ordered or unordered.</li>
27
+ </ul>
28
+
29
+ <pre><code>And who would forget
30
+ Preformatted code blocks :)</code></pre>
31
+
32
+ <h2>TODO</h2>
33
+
34
+ <p>Remark should probably support BR elements in paragraphs,<br>
35
+ although people tend to abuse them.</p>
36
+
37
+ <div class="content">
38
+ <p>What to do with pieces of content inside wrapper elements,
39
+ like DIV, is still undecided.</p>
40
+ </div>
metadata ADDED
@@ -0,0 +1,58 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mislav-remark
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - "Mislav Marohni\xC4\x87"
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-06-23 00:00:00 -07:00
13
+ default_executable: remark
14
+ dependencies: []
15
+
16
+ description:
17
+ email: mislav.marohnic@gmail.com
18
+ executables:
19
+ - remark
20
+ extensions: []
21
+
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - Rakefile
26
+ - bin/remark
27
+ - lib/remark.rb
28
+ - spec/remark_spec.rb
29
+ - spec/sample.html
30
+ - README.markdown
31
+ has_rdoc: false
32
+ homepage: http://github.com/mislav/remark
33
+ post_install_message:
34
+ rdoc_options: []
35
+
36
+ require_paths:
37
+ - lib
38
+ required_ruby_version: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: "0"
43
+ version:
44
+ required_rubygems_version: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: "0"
49
+ version:
50
+ requirements: []
51
+
52
+ rubyforge_project:
53
+ rubygems_version: 1.2.0
54
+ signing_key:
55
+ specification_version: 3
56
+ summary: HTML to Markdown converter
57
+ test_files: []
58
+