mislav-remark 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.markdown ADDED
@@ -0,0 +1,17 @@
1
+ Remark
2
+ ======
3
+
4
+ A Ruby tool that parses HTML and converts it to proper Markdown.
5
+
6
+ Usage
7
+ -----
8
+
9
+ From command-line:
10
+
11
+ ruby -Ilib -rubygems bin/remark spec/sample.html
12
+
13
+ (You can also pipe input through STDIN instead of passing a file argument.)
14
+
15
+ From Ruby code:
16
+
17
+ Remark.new('<h1>My document</h1><p>Some content</p>').to_markdown
data/Rakefile ADDED
@@ -0,0 +1,25 @@
1
desc "generates .gemspec file"
task :gemspec do
  # Build the specification in memory; it is serialized to Ruby source below.
  spec = Gem::Specification.new do |gem|
    gem.name = "remark"
    gem.summary = "HTML to Markdown converter"
    gem.email = "mislav.marohnic@gmail.com"
    gem.homepage = "http://github.com/mislav/remark"
    gem.authors = ["Mislav Marohnić"]
    # has_rdoc= is deprecated in later RubyGems releases and may be absent;
    # only set it where the writer still exists.
    gem.has_rdoc = false if gem.respond_to?(:has_rdoc=)

    gem.version = '0.1.0'
    gem.files = FileList['Rakefile', '{bin,lib,rails,spec}/**/*', 'README*', 'LICENSE*']
    gem.executables = Dir['bin/*'].map { |f| File.basename(f) }
  end

  spec_string = spec.to_ruby

  begin
    # Evaluate the generated source in a separate thread with a raised safe
    # level, so the file is only written when it loads under that restriction.
    # NOTE(review): newer Rubies reject $SAFE levels above 1 and Ruby 3 no
    # longer treats $SAFE specially -- confirm the target interpreter before
    # relying on this check. The eval only ever sees the locally generated
    # spec_string, never external input.
    Thread.new { eval("$SAFE = 3\n#{spec_string}", binding) }.join
  rescue StandardError, SecurityError => e
    # SecurityError is not a StandardError on Ruby 1.9+, so it has to be
    # named explicitly or a sandbox violation would bypass this handler.
    abort "unsafe gemspec: #{e}"
  else
    File.open("#{spec.name}.gemspec", 'w') { |file| file.write spec_string }
  end
end
data/bin/remark ADDED
@@ -0,0 +1,4 @@
1
#!/usr/bin/env ruby
# Command-line entry point: reads HTML via ARGF (files named on the command
# line, or STDIN when none are given) and prints the Markdown conversion.
require 'remark'

html = ARGF.read
puts Remark.new(html).to_markdown
data/lib/remark.rb ADDED
@@ -0,0 +1,99 @@
1
+ require 'hpricot'
2
+
3
# Converts an HTML document into Markdown.
#
# Elements that have a Markdown equivalent and carry no extra attributes are
# rewritten in Markdown syntax; every other element is passed through as-is
# so that it ends up in the output as HTML.
class Remark
  # Elements that are dropped from the output entirely.
  IGNORE = %w(script head style).freeze

  # @param source [String] the HTML to convert; parsed with Hpricot
  def initialize(source)
    @doc = Hpricot(source)
  end

  # @return [String] the converted document, with top-level blocks separated
  #   by an empty line
  def to_markdown
    remark_children(@doc).join("\n\n")
  end

  private

  # Tells whether an element carries only the attributes that its Markdown
  # form can express: +href+ (plus optional +title+) for links, +alt+ and
  # +src+ (plus optional +title+) for images, and none at all otherwise.
  def valid_attributes?(elem)
    case elem.name
    when 'a'
      (elem.attributes.keys - %w(title)) == %w(href)
    when 'img'
      (elem.attributes.keys - %w(title)).sort == %w(alt src)
    else
      elem.attributes.empty?
    end
  end

  # Converts every child of +node+ and returns the non-nil results in order.
  def remark_children(node)
    # Array() guards against a nil children list -- presumably what the
    # parser reports for an element without content (verify against Hpricot).
    Array(node.children).map { |item| remark_item(item) }.compact
  end

  # Converts a single node.
  #
  # @return [String, Object, nil] converted text, the untouched element when
  #   it cannot be expressed in Markdown, or nil when the node is skipped
  def remark_item(item)
    if item.text?
      text = item.to_s
      # \A and \z anchor the whole string. With ^ and $ a text node that
      # merely contains a blank line would be mistaken for whitespace-only
      # content and dropped.
      text.gsub(/\n+/, ' ') unless text =~ /\A\s*\z/
    elsif item.elem?
      return nil if IGNORE.include?(item.name)

      valid_attributes?(item) ? remark_element(item) : item
    end
  end

  # Renders a known element in Markdown syntax; unknown elements are
  # returned unchanged.
  def remark_element(elem)
    case elem.name
    when 'p', 'li'
      remark_inline(elem)
    when /\Ah([1-6])\z/
      level = Regexp.last_match(1).to_i
      "#{'#' * level} #{remark_inline(elem)}"
    when 'ul', 'ol'
      remark_list(elem)
    when 'pre'
      # Prefix every line with four spaces.
      elem.inner_text.gsub(/^/, ' ' * 4)
    when 'em'
      "_#{elem.inner_text}_"
    when 'strong'
      "**#{elem.inner_text}**"
    when 'code'
      "`#{elem.inner_text}`"
    when 'a'
      remark_link(elem.inner_html, elem.attributes['href'], elem.attributes['title'])
    when 'img'
      '!' + remark_link(elem.attributes['alt'], elem.attributes['src'], elem.attributes['title'])
    when 'blockquote'
      # Prefix every line, including the separating empty ones, with "> ".
      remark_children(elem).join("\n\n").gsub(/^/, '> ')
    else
      elem
    end
  end

  # Builds the +[text](href "title")+ form; the title part is omitted when
  # +title+ is nil.
  def remark_link(text, href, title = nil)
    title_markup = title ? %( "#{title}") : ''
    "[#{text}](#{href}#{title_markup})"
  end

  # Concatenates the converted children of +elem+ without any separator.
  def remark_inline(elem)
    remark_children(elem).join('')
  end

  # Renders a +ul+ with "*" markers or an +ol+ with 1-based numbering, one
  # item per line.
  def remark_list(list)
    items = remark_children(list)
    # Interpolation is used on purpose: an item may be a passed-through
    # element rather than a String, and String#+ would raise TypeError.
    lines =
      if list.name == 'ul'
        items.map { |item| "* #{item}" }
      else
        items.each_with_index.map { |item, index| "#{index + 1}. #{item}" }
      end
    lines.join("\n")
  end
end
data/spec/remark_spec.rb ADDED
@@ -0,0 +1,96 @@
1
+ require 'remark'
2
+
3
# Behavioural specification for Remark's HTML-to-Markdown conversion.
describe Remark do
  # Converts +source+ with a fresh instance of the described class.
  def remark(source)
    described_class.new(source).to_markdown
  end

  it "should let through text content" do
    remark("Foo bar").should == 'Foo bar'
    remark("Foo bar\nbaz").should == 'Foo bar baz'
  end

  it "should split paragraphs with an empty line" do
    remark("<p>Foo bar</p>").should == 'Foo bar'
    remark("<p>Foo bar</p><p>baz").should == "Foo bar\n\nbaz"
    remark("<p>Foo bar</p>baz").should == "Foo bar\n\nbaz"
  end

  it "should output title syntax" do
    remark("<h1>Foo bar</h1>").should == '# Foo bar'
    remark("<h2>Foo bar</h2>").should == '## Foo bar'
  end

  it "should preserve elements in remarked blocks" do
    remark("<p>Foo <ins>bar</ins></p>").should == 'Foo <ins>bar</ins>'
    remark("<h2>Foo <ins>bar</ins></h2>").should == '## Foo <ins>bar</ins>'
  end

  it "should unescape HTML entities" do
    remark("Foo&amp;bar").should == 'Foo&bar'
    remark("<p>If you&#8217;re doing all your development on the &#8220;master&#8221; branch, you&#8217;re not using git").should == "If you’re doing all your development on the “master” branch, you’re not using git"
  end

  it "should ignore tags without user-facing content" do
    remark("<script>foo</script>").should == ''
    remark("<head>foo</head>").should == ''
  end

  it "should leave known elements with attributes intact" do
    remark("<p class='notice'>Kittens attack!</p>").should == '<p class="notice">Kittens attack!</p>'
  end

  it "should leave unknown elements intact" do
    remark(<<-HTML).should == "Foo\n\n<table>data</table>\n\nBar"
      <p>Foo</p>
      <table>data</table>
      <p>Bar</p>
    HTML
  end

  it "should support lists" do
    remark(<<-HTML).should == "* foo\n* bar"
      <ul>
        <li>foo</li>
        <li>bar</li>
      </ul>
    HTML

    remark(<<-HTML).should == "1. foo\n2. bar"
      <ol>
        <li>foo</li>
        <li>bar</li>
      </ol>
    HTML
  end

  it "should support preformatted blocks" do
    # Preformatted content is indented by four spaces per line, so each
    # expected line is the input line with four spaces in front of it.
    remark("<pre>def foo\n bar\nend</pre>").should == "    def foo\n     bar\n    end"
    remark("<pre><code>def foo\n &lt;bar&gt;\nend</code></pre>").should == "    def foo\n     <bar>\n    end"
  end

  it "should remark inline elements" do
    remark("<p>I'm so <strong>strong</strong></p>").should == "I'm so **strong**"
    remark("<p>I'm so <em>emo</em></p>").should == "I'm so _emo_"
    remark("<p>Write more <code>code</code></p>").should == "Write more `code`"
    remark("<ul><li><em>Inline</em> stuff in <strong>lists</strong></li></ul>").should == "* _Inline_ stuff in **lists**"
    remark("<h1>Headings <em>too</em></h1>").should == '# Headings _too_'
  end

  it "should support hyperlinks" do
    remark("<p>Click <a href='http://mislav.uniqpath.com'>here</a></p>").should ==
      "Click [here](http://mislav.uniqpath.com)"
    remark("<a href='/foo' title='bar'>baz</a>").should == '[baz](/foo "bar")'
  end

  it "should support blockquotes" do
    remark("<blockquote>Cogito, ergo sum</blockquote>").should == '> Cogito, ergo sum'
    remark("<blockquote><p>I think</p><p>therefore I am</p></blockquote>").should == "> I think\n> \n> therefore I am"
  end

  it "should support image tags" do
    remark("<img src='moo.jpg' alt='cow'>").should == '![cow](moo.jpg)'
    remark("<img src='moo.jpg' alt='cow' width='16'>").should == '<img src="moo.jpg" alt="cow" width="16" />'
  end
end
96
+
data/spec/sample.html ADDED
@@ -0,0 +1,40 @@
1
+ <h1>Remark &mdash; HTML to Markdown converter</h1>
2
+
3
+ <p>This is a sample document which will get updated as Remark understands more HTML.
4
+ It reflects what's currently supported.</p>
5
+
6
+ <p class="nice">Known block elements are left intact if they have attributes.
7
+ Markdown doesn't have a syntax for them.</p>
8
+
9
+ <table>
10
+ <tr>
11
+ <td>Elements that can't be represented in Markdown are left intact.</td>
12
+ </tr>
13
+ </table>
14
+
15
+ <p>SCRIPT and HEAD tags are swallowed, as browsers don't render them as content.</p>
16
+
17
+ <script type="text/javascript">
18
+ alert("I will not survive")
19
+ </script>
20
+
21
+ <p>Remark supports Markdown syntax for <em>inline</em> markup.
22
+ <a href="http://github.com/mislav">Hyperlinks</a> and <code>code spans</code> are a must.</p>
23
+
24
+ <ul>
25
+ <li>List items too;</li>
26
+ <li>ordered or unordered.</li>
27
+ </ul>
28
+
29
+ <pre><code>And who would forget
30
+ Preformatted code blocks :)</code></pre>
31
+
32
+ <h2>TODO</h2>
33
+
34
+ <p>Remark should probably support BR elements in paragraphs,<br>
35
+ although people tend to abuse them.</p>
36
+
37
+ <div class="content">
38
+ <p>What to do with pieces of content inside wrapper elements,
39
+ like DIV, is still undecided.</p>
40
+ </div>
metadata ADDED
@@ -0,0 +1,58 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mislav-remark
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - "Mislav Marohni\xC4\x87"
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-06-23 00:00:00 -07:00
13
+ default_executable: remark
14
+ dependencies: []
15
+
16
+ description:
17
+ email: mislav.marohnic@gmail.com
18
+ executables:
19
+ - remark
20
+ extensions: []
21
+
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - Rakefile
26
+ - bin/remark
27
+ - lib/remark.rb
28
+ - spec/remark_spec.rb
29
+ - spec/sample.html
30
+ - README.markdown
31
+ has_rdoc: false
32
+ homepage: http://github.com/mislav/remark
33
+ post_install_message:
34
+ rdoc_options: []
35
+
36
+ require_paths:
37
+ - lib
38
+ required_ruby_version: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: "0"
43
+ version:
44
+ required_rubygems_version: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: "0"
49
+ version:
50
+ requirements: []
51
+
52
+ rubyforge_project:
53
+ rubygems_version: 1.2.0
54
+ signing_key:
55
+ specification_version: 3
56
+ summary: HTML to Markdown converter
57
+ test_files: []
58
+