remark 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +18 -0
- data/README.markdown +29 -0
- data/Rakefile +31 -0
- data/bin/remark +22 -0
- data/lib/remark.rb +61 -0
- data/lib/remark/core_ext.rb +25 -0
- data/lib/remark/hpricot_ext.rb +212 -0
- data/spec/hpricot_ext_spec.rb +138 -0
- data/spec/remark_spec.rb +157 -0
- data/spec/sample.html +53 -0
- metadata +83 -0
data/LICENSE
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
Copyright (c) 2009 Mislav Marohnić
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
4
|
+
this software and associated documentation files (the "Software"), to deal in
|
5
|
+
the Software without restriction, including without limitation the rights to
|
6
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
7
|
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
8
|
+
subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
11
|
+
copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
15
|
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
16
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
17
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
18
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.markdown
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
Remark — HTML→Markdown tool
|
2
|
+
===========================
|
3
|
+
|
4
|
+
<i>Remark</i> parses HTML and delivers proper Markdown.
|
5
|
+
|
6
|
+
Usage
|
7
|
+
-----
|
8
|
+
|
9
|
+
From command-line:
|
10
|
+
|
11
|
+
remark path/to/file.html
|
12
|
+
|
13
|
+
or by STDIN:
|
14
|
+
|
15
|
+
echo "..." | remark
|
16
|
+
|
17
|
+
You can try feeding it a document from the web:
|
18
|
+
|
19
|
+
curl -s daringfireball.net/projects/markdown/basics | remark > result.markdown
|
20
|
+
|
21
|
+
See how it does.
|
22
|
+
|
23
|
+
If you've cloned the repository, invoke the binary like this:
|
24
|
+
|
25
|
+
ruby -Ilib -rubygems bin/remark spec/sample.html
|
26
|
+
|
27
|
+
And this is how you use it from Ruby code:
|
28
|
+
|
29
|
+
Remark.new('<h1>My document</h1><p>Some content</p>').to_markdown
|
data/Rakefile
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
# Convenience task: runs the bundled executable (with lib/ on the load path)
# against the sample document; the converted Markdown goes to standard output.
desc "renders the spec/sample.html to Markdown"
task :sample do
  command = "ruby -Ilib -rubygems bin/remark spec/sample.html"
  system command
end
|
5
|
+
|
6
|
+
# Builds the gem specification in memory, checks that its generated Ruby source
# can be evaluated at $SAFE level 3, and then writes it to "<name>.gemspec".
#
# The version, description and dependencies are kept in step with the released
# gem metadata (0.3.1, runtime dependency on hpricot, development dependency on
# rspec); the library requires hpricot, so the dependency must be declared.
desc "generates .gemspec file"
task :gemspec do
  spec = Gem::Specification.new do |gem|
    gem.name = "remark"
    gem.version = '0.3.1'

    gem.summary = "HTML to Markdown converter"
    gem.description = "Remark turns simple HTML documents or content in web pages to Markdown source."
    gem.email = "mislav.marohnic@gmail.com"
    gem.homepage = "http://github.com/mislav/remark"
    gem.authors = ["Mislav Marohnić"]
    gem.has_rdoc = false

    gem.add_dependency 'hpricot', '~> 0.8.2'
    gem.add_development_dependency 'rspec', '~> 1.2.9'

    # only ship files that match the patterns AND are tracked by git
    gem.files = FileList['Rakefile', '{bin,lib,rails,spec}/**/*', 'README*', 'LICENSE*'] & `git ls-files`.split("\n")
    gem.executables = Dir['bin/*'].map { |f| File.basename(f) }
  end

  spec_string = spec.to_ruby

  begin
    # evaluate the generated source in a throwaway thread with $SAFE raised,
    # so an unsafe specification is rejected before anything is written
    Thread.new { eval("$SAFE = 3\n#{spec_string}", binding) }.join
  rescue
    abort "unsafe gemspec: #{$!}"
  else
    File.open("#{spec.name}.gemspec", 'w') { |file| file.write spec_string }
  end
end
|
data/bin/remark
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'remark'
|
4
|
+
|
5
|
+
# Options collected from the command line; handed unchanged to Remark.new.
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: remark [options] [FILE]"

  # the flag asks for inline links, so the stored :reference_links value is its negation
  opts.on("-n", "--inline-links", "Render link URLs inline (instead of reference-style)") do |inline|
    options[:reference_links] = !inline
  end

  opts.on("-s", "--scope EXPR", "Scope to a specific CSS/XPath expression in the HTML document") do |scope|
    options[:scope] = scope
  end

  # may be repeated; every expression is appended to the :ignores list
  opts.on("-i", "--ignore EXPR", "Ignore elements that match CSS/XPath expression") do |expr|
    (options[:ignores] ||= []) << expr
  end
end.parse!

# ARGF reads the files named in the remaining arguments, or standard input if there are none
puts Remark.new(ARGF.read, options).to_markdown
|
data/lib/remark.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'remark/hpricot_ext'
|
2
|
+
|
3
|
+
# Converts an HTML document (or a fragment of one) to Markdown.
#
#   Remark.new('<h1>My document</h1><p>Some content</p>').to_markdown
#
# Recognized options:
# :reference_links:: when true (the default) hyperlinks are numbered in the
#                    text and their URLs are listed after the content;
#                    when false the URLs are rendered inline
# :scope::           CSS/XPath expression selecting the element to convert
# :ignores::         list of CSS/XPath expressions; matching elements are
#                    left out of the output
class Remark
  DEFAULT_OPTIONS = { :reference_links => true }

  # source  - HTML text to parse
  # options - Hash of the options described above, merged over DEFAULT_OPTIONS
  def initialize(source, options = {})
    @doc = Hpricot(source)
    @options = DEFAULT_OPTIONS.merge options
    @links = []
    @ignored_elements = nil
  end

  # Renders the scoped element as a Markdown string. In reference-link mode the
  # collected link definitions follow the content after two blank lines.
  def to_markdown
    parent = scope
    collect_ignored_elements(parent)
    # the elements append [href, title] pairs to this array while rendering
    links = @options[:links] = [] unless inline_links?
    result = parent.to_markdown(@options)
    return result if inline_links? || links.empty?
    result + "\n\n\n" + output_reference_links(links)
  end

  # Picks the element to convert:
  # * the first match of the :scope expression, when one was given;
  # * otherwise, for a document with /html/body, the element that is the
  #   parent of the most paragraphs, or the body itself when the document
  #   has no paragraphs at all;
  # * otherwise the whole parsed document.
  def scope
    if expr = @options[:scope]
      @doc.at(expr)
    elsif body = @doc.at('/html/body')
      # count paragraphs per parent, then flip to { count => parent }
      candidates = (body / 'p').inject(Hash.new(0)) do |memo, para|
        memo[para.parent] += 1
        memo
      end.invert

      # without any <p> the lookup yields nil; fall back to the body so that
      # paragraph-less documents are still converted instead of crashing
      candidates[candidates.keys.max] || body
    else
      @doc
    end
  end

  # True when link URLs are rendered inline rather than as numbered references.
  def inline_links?
    !@options[:reference_links]
  end

  # Formats collected [href, title] pairs as numbered reference definitions,
  # one per line: `[1]: href "title"` (the quoted title only when present).
  def output_reference_links(links)
    references = []
    links.each_with_index do |(href, title), i|
      references << "[#{i + 1}]: #{href}#{title ? ' ' + title.inspect : ''}"
    end
    references.join("\n")
  end

  private

  # True for elements whose name is listed in IGNORE or that are present in
  # @ignored_elements. IGNORE is not defined in this class; it is resolved
  # from the enclosing scope.
  def ignore_element?(elem)
    IGNORE.include?(elem.name) or (@ignored_elements and @ignored_elements.include?(elem))
  end

  # Resolves every :ignores expression within +scope+ and stores the matched
  # elements (flattened, without duplicates) under @options[:ignored_elements].
  def collect_ignored_elements(scope)
    if @options[:ignores]
      @options[:ignored_elements] = @options[:ignores].map do |expr|
        scope.search(expr).to_a
      end.flatten.uniq
    end
  end
end
|
data/lib/remark/core_ext.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# By default no object is blank.
Object.class_eval do
  def blank?() false end
end

# nil is always blank.
NilClass.class_eval do
  def blank?() true end
end

String.class_eval do
  # True when the string is empty or consists of whitespace only.
  def blank?
    self !~ /\S/
  end

  # Turns newlines, tabs, carriage returns and form feeds into spaces and
  # collapses every run of spaces into one. Carriage returns are included so
  # that documents with CRLF line endings do not leak "\r" into the output.
  def squeeze_whitespace
    self.tr("\n\t\r\f", ' ').squeeze(' ')
  end

  # Prefixes every line of the string with +with+ (four spaces by default).
  def indent(with = ' ' * 4)
    self.gsub(/^/, with)
  end
end
|
22
|
+
|
23
|
+
Hpricot::Text.module_eval do
  # A text node is blank exactly when its string form is blank.
  def blank?
    to_s.blank?
  end
end
|
data/lib/remark/hpricot_ext.rb
ADDED
@@ -0,0 +1,212 @@
|
|
1
|
+
require 'hpricot'
|
2
|
+
require 'remark/core_ext'
|
3
|
+
|
4
|
+
# this applies the default behavior to virtually all Hpricot classes
|
5
|
+
Hpricot::Node.module_eval do
  # Fallback rendering: a node produces no Markdown (nil) unless a more
  # specific class supplies its own to_markdown.
  def to_markdown(options = {})
    nil
  end

  # Fallback: a node is not treated as a block-level element.
  def markdown_block?
    false
  end
end
|
9
|
+
|
10
|
+
# nothing special to process on Text or CData
|
11
|
+
Hpricot::Text.module_eval do
  # Text is emitted as-is apart from whitespace normalization.
  def to_markdown(options = {})
    text = to_s
    text.squeeze_whitespace
  end
end
|
14
|
+
|
15
|
+
Hpricot::CData.module_eval do
  # CDATA content is emitted as-is apart from whitespace normalization.
  def to_markdown(options = {})
    text = to_s
    text.squeeze_whitespace
  end
end
|
18
|
+
|
19
|
+
# elements that have children
|
20
|
+
Hpricot::Container.module_eval do
  # Renders all children and concatenates the results, putting an empty line
  # between adjacent blocks and trimming whitespace around block boundaries.
  # Returns '' when the container has no children.
  def to_markdown(options = {})
    return '' unless self.children

    pieces = []
    after_block = false
    inside_block = self.markdown_block?

    self.children.each do |child|
      child_is_block = child.markdown_block?
      text = child.to_markdown(options)

      # a child is left out when its markdown is nil or empty, or when it is
      # whitespace-only and directly follows a block
      skip = text.nil? || text.empty? || (after_block && text.blank?)

      unless skip
        # an empty line separates adjacent blocks
        if (!pieces.empty? && child_is_block) || after_block
          # drop trailing whitespace before opening a new block
          if pieces.last.blank?
            pieces.pop
          else
            pieces.last.rstrip!
          end
          # never start the output with the separator
          pieces << "\n\n" if pieces.any?
        end

        unless 'pre' == child.name
          # trim on the left when following a block, when this is the first
          # child of a block, or when the previous piece already ends with a
          # space or a forced line break
          strip_left = after_block ||
            (inside_block && child == self.children.first) ||
            (!pieces.empty? && pieces.last =~ / ( \n)?$/)
          text.lstrip! if strip_left

          # trim on the right when this is the last child of a block
          text.rstrip! if inside_block && self.children.last == child
        end

        pieces << text
      end

      # remembered even for skipped children
      after_block = child_is_block
    end

    pieces.join('')
  end
end
|
64
|
+
|
65
|
+
# elements without children
|
66
|
+
Hpricot::Leaf.module_eval do
  # Only leaves that are elements produce output (their whitespace-normalized
  # inner text); any other leaf yields nil.
  def to_markdown(options = {})
    return nil unless elem?
    inner_text.squeeze_whitespace
  end
end
|
71
|
+
|
72
|
+
Hpricot::Elem.module_eval do
  # NOTE: these constants are assigned inside a block, so Ruby defines them in
  # the enclosing lexical scope rather than inside Hpricot::Elem; code outside
  # this block can therefore refer to them unqualified.

  # elements that never produce output
  IGNORE = %w(script head style)
  # recognized elements that are meaningful even without content
  ALLOWED_EMPTY = %w(img br hr )
  MARKDOWN_BLOCK = %w(p blockquote h1 h2 h3 h4 h5 h6 pre hr)
  MARKDOWN_INLINE = %w(em strong code a img br)
  MARKDOWN_RECOGNIZED = MARKDOWN_BLOCK + MARKDOWN_INLINE + %w(div)
  HTML_BLOCK = MARKDOWN_BLOCK + %w(ul ol dl div noscript form table address fieldset)

  # Renders this element as Markdown.
  #
  # Returns nil for ignored elements, '' for recognized elements without
  # content, and the element's own HTML (to_s) when it carries attributes
  # Markdown cannot express or when its name has no Markdown equivalent here.
  def to_markdown(options = {})
    return nil if markdown_ignored?(options)
    return '' if markdown_empty?
    return to_s unless markdown_supported_attributes?

    case name
    when 'div', 'noscript'
      super
    when 'p'
      super
    when /^h([1-6])$/
      # heading level becomes the number of leading '#'
      ('#' * $1.to_i) + ' ' + super
    when 'ul', 'ol'
      remark_list(options)
    when 'li'
      content = super
      # items holding block content are indented as a whole
      content = content.indent if children.any? { |e| e.markdown_block? }
      content
    when 'pre'
      inner_text.rstrip.indent
    when 'em'
      "_#{super}_"
    when 'strong'
      "**#{super}**"
    when 'code'
      code = inner_text
      # code containing a backtick needs the double-backtick delimiter
      code.index('`') ? "`` #{code} ``" : "`#{code}`"
    when 'a'
      remark_link(super, self['href'], self['title'], options)
    when 'img'
      # images are always rendered with an inline URL
      '!' + remark_link(self['alt'], self['src'], self['title'], :reference_links => false)
    when 'blockquote'
      super.indent('> ')
    when 'br'
      " \n" + inner_html
    else
      to_s
    end
  end

  # Renders the <li> children of a <ul>/<ol> as a Markdown list: '*' markers
  # for unordered lists, "1.", "2.", ... for ordered ones. Items are joined
  # with a blank line as soon as one of them holds indented (block) content.
  #
  # Items whose markdown is nil (excluded through options[:ignored_elements])
  # are left out and do not consume a number; previously they raised a
  # TypeError when concatenated with the marker.
  def remark_list(options = {})
    unordered = self.name == 'ul'
    marker = unordered ? '*' : 0
    nested = false

    items = []
    self.children_of_type('li').each do |li|
      item = li.to_markdown(options)
      next if item.nil?

      current = unordered ? marker : "#{marker += 1}."
      if item =~ /\A\s/
        # indented item: write the marker over the leading indentation
        nested = true
        item[0, current.length] = current
        items << item
      else
        items << current + ' ' + item
      end
    end

    items.join("\n" * (nested ? 2 : 1))
  end

  # True for elements listed in HTML_BLOCK.
  def markdown_block?
    HTML_BLOCK.include?(name)
  end

  # True for elements listed in MARKDOWN_RECOGNIZED.
  def markdown_recognized?
    MARKDOWN_RECOGNIZED.include?(name)
  end

  protected

  # True when the element is ignored by name or is one of the elements
  # collected in options[:ignored_elements].
  def markdown_ignored?(options)
    IGNORE.include?(name) or
      (options[:ignored_elements] and options[:ignored_elements].include?(self))
  end

  # True for a recognized element without content, unless it is one of the
  # elements that are allowed to be empty.
  def markdown_empty?
    empty? and markdown_recognized? and not ALLOWED_EMPTY.include?(name)
  end

  # Tells whether the element's attributes can be expressed in Markdown;
  # when they cannot, the caller emits the element as raw HTML.
  def markdown_supported_attributes?
    case name
    when 'div'
      true
    when 'a'
      attribute_names_match?('href', 'title')
    when 'img'
      attribute_names_match?(%w(alt src), 'title')
    when 'ol', 'ul'
      # lists qualify only when neither the list nor its items carry
      # attributes and every child element is an <li>
      attributes.empty? and children.all? do |item|
        not item.elem? or (item.name == 'li' and item.attributes.empty?)
      end
    else
      attributes.empty?
    end
  end

  # True when the sorted attribute names, once the +optional+ names are
  # removed, are exactly +only+ (a name or a sorted list of names).
  def attribute_names_match?(only, optional = nil)
    names = attributes.keys.sort
    names -= Array(optional) if optional
    names == Array(only)
  end

  # Builds Markdown link syntax for +text+.
  #
  # With options[:reference_links] the [href, title] pair is stored in
  # options[:links] (an already stored href is reused) and the result is
  # "[text][n]"; otherwise the URL and optional quoted title are inline.
  def remark_link(text, href, title = nil, options = {})
    if options[:reference_links]
      if existing = options[:links].find { |h, t| href == h }
        num = options[:links].index(existing) + 1
      else
        options[:links] << [href, title]
        num = options[:links].length
      end
      "[#{text}][#{num}]"
    else
      title_markup = title ? %( "#{title}") : ''
      "[#{text}](#{href}#{title_markup})"
    end
  end
end
|
197
|
+
|
198
|
+
# Gives Hpricot::Attributes Hash-like empty? and keys, each defined only when
# the class does not already provide a method of that name.
Hpricot::Attributes.class_eval do
  unless method_defined?(:empty?)
    # True when the attributes, converted to a Hash, have no entries.
    def empty?
      to_hash.empty?
    end
  end

  unless method_defined?(:keys)
    # Attribute names, taken from the Hash form of the attributes.
    def keys
      to_hash.keys
    end
  end
end
|
data/spec/hpricot_ext_spec.rb
ADDED
@@ -0,0 +1,138 @@
|
|
1
|
+
require 'remark/hpricot_ext'
|
2
|
+
|
3
|
+
describe Hpricot, "remark extensions" do
|
4
|
+
before(:all) do
|
5
|
+
@doc = Hpricot(<<-HTML.strip)
|
6
|
+
<?xml version="moo" ?>
|
7
|
+
<!DOCTYPE html>
|
8
|
+
<html>
|
9
|
+
<head>
|
10
|
+
<title>Sample document</title>
|
11
|
+
</head>
|
12
|
+
<body>
|
13
|
+
<h1>Sample <strong>Remark</strong> document</h1>
|
14
|
+
<p>
|
15
|
+
A paragraph with <em>nested</em> <strong>content</strong>
|
16
|
+
and <i>Remark</i>-supported elements.
|
17
|
+
</p>
|
18
|
+
|
19
|
+
<a name="content"> </a>
|
20
|
+
<h2>The content</h2>
|
21
|
+
<div id="content">
|
22
|
+
<p>First</p>
|
23
|
+
<p>Second</p>
|
24
|
+
Some content
|
25
|
+
<em>in-between</em>
|
26
|
+
<p>Third</p>
|
27
|
+
</div>
|
28
|
+
<p class="foo">I has classname</p>
|
29
|
+
|
30
|
+
<div id="empty"></div>
|
31
|
+
<blockquote>
|
32
|
+
Some famous quote
|
33
|
+
<blockquote>Nested famous quote</blockquote>
|
34
|
+
</blockquote>
|
35
|
+
<div class="code">
|
36
|
+
<p>Sample code:</p>
|
37
|
+
<pre>def preformatted
|
38
|
+
text
|
39
|
+
end
|
40
|
+
</pre>
|
41
|
+
</div>
|
42
|
+
<img src='moo.jpg' alt='cow'>
|
43
|
+
<img src='moo.jpg' alt='cow' width='16'>
|
44
|
+
|
45
|
+
<code>simple</code> <code>comp ` lex</code> <code><tag></code>
|
46
|
+
|
47
|
+
<div id="br">
|
48
|
+
<p>Foo<br>bar</p>
|
49
|
+
<p>Foo<br>
|
50
|
+
bar <code>baz</code></p>
|
51
|
+
<p>Foo</p><br><br><p>Bar</p><br>
|
52
|
+
</div>
|
53
|
+
|
54
|
+
<ul>
|
55
|
+
<li>First</li>
|
56
|
+
<li>Second</li>
|
57
|
+
</ul>
|
58
|
+
<ol>
|
59
|
+
<li>First</li>
|
60
|
+
<li>Second</li>
|
61
|
+
</ol>
|
62
|
+
</body>
|
63
|
+
</html>
|
64
|
+
HTML
|
65
|
+
end
|
66
|
+
|
67
|
+
def remark(elem, options = {})
|
68
|
+
(String === elem ? @doc.at(elem) : elem).to_markdown(options)
|
69
|
+
end
|
70
|
+
|
71
|
+
it "should return empty string for empty document" do
|
72
|
+
remark(Hpricot('')).should == ''
|
73
|
+
end
|
74
|
+
|
75
|
+
it "should ignore DOCTYPE, HEAD and XML processing instructions" do
|
76
|
+
remark('head').should be_nil
|
77
|
+
remark(@doc.children[0]).should be_nil # doctype
|
78
|
+
remark(@doc.children[2]).should be_nil # xmldecl
|
79
|
+
end
|
80
|
+
|
81
|
+
it "should have whitespace nodes respond to blank" do
  # the original example only called blank? and asserted nothing, so it could
  # never fail; be_blank checks that blank? actually returns true
  @doc.at('a[@name]').children.first.should be_blank
end
|
84
|
+
|
85
|
+
it "should support headings" do
|
86
|
+
remark('h1').should == "# Sample **Remark** document"
|
87
|
+
remark('h2').should == "## The content"
|
88
|
+
end
|
89
|
+
|
90
|
+
it "should support paragraphs" do
|
91
|
+
remark('p').should == "A paragraph with _nested_ **content** and <i>Remark</i>-supported elements."
|
92
|
+
end
|
93
|
+
|
94
|
+
it "should split paragraphs with an empty line" do
|
95
|
+
remark('#content').should == "First\n\nSecond\n\nSome content _in-between_\n\nThird"
|
96
|
+
end
|
97
|
+
|
98
|
+
it "should keep full HTML for paragraphs if they have attributes" do
|
99
|
+
remark('p.foo').should == '<p class="foo">I has classname</p>'
|
100
|
+
end
|
101
|
+
|
102
|
+
it "should not break on empty DIV" do
|
103
|
+
remark('#empty').should == ""
|
104
|
+
end
|
105
|
+
|
106
|
+
it "should support blockquotes" do
|
107
|
+
remark('blockquote > blockquote').should == "> Nested famous quote"
|
108
|
+
remark('blockquote').should == "> Some famous quote\n> \n> > Nested famous quote"
|
109
|
+
end
|
110
|
+
|
111
|
+
it "should support preformatted text" do
|
112
|
+
remark('div.code').should == "Sample code:\n\n def preformatted\n text\n end"
|
113
|
+
end
|
114
|
+
|
115
|
+
it "should support image tags" do
|
116
|
+
remark('img[@alt]').should == ''
|
117
|
+
remark('img[@width]').should == '<img src="moo.jpg" alt="cow" width="16" />'
|
118
|
+
end
|
119
|
+
|
120
|
+
it "should support code spans" do
|
121
|
+
remark('code').should == "`simple`"
|
122
|
+
remark('code ~ code').should == "`` comp ` lex ``"
|
123
|
+
remark('code ~ code ~ code').should == "`<tag>`"
|
124
|
+
end
|
125
|
+
|
126
|
+
it "should support BR" do
|
127
|
+
remark('#br').should == "Foo \nbar\n\nFoo \nbar `baz`\n\nFoo\n\nBar"
|
128
|
+
end
|
129
|
+
|
130
|
+
it "should support unordered list" do
|
131
|
+
remark('ul').should == "* First\n* Second"
|
132
|
+
end
|
133
|
+
|
134
|
+
it "should support ordered list" do
|
135
|
+
remark('ol').should == "1. First\n2. Second"
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
data/spec/remark_spec.rb
ADDED
@@ -0,0 +1,157 @@
|
|
1
|
+
require 'remark'
|
2
|
+
|
3
|
+
describe Remark do
|
4
|
+
def remark(source, options = {})
|
5
|
+
options = {:reference_links => false}.merge(options)
|
6
|
+
described_class.new(source, options).to_markdown
|
7
|
+
end
|
8
|
+
|
9
|
+
it "should let through text content" do
|
10
|
+
remark("Foo bar").should == 'Foo bar'
|
11
|
+
remark("Foo bar\nbaz").should == 'Foo bar baz'
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should preserve elements in remarked blocks" do
|
15
|
+
remark("<p>Foo <ins>bar</ins></p>").should == 'Foo <ins>bar</ins>'
|
16
|
+
remark("<h2>Foo <ins>bar</ins></h2>").should == '## Foo <ins>bar</ins>'
|
17
|
+
end
|
18
|
+
|
19
|
+
it "should unescape HTML entities" do
|
20
|
+
remark("Foo&bar").should == 'Foo&bar'
|
21
|
+
remark("<p>If you’re doing all your development on the “master” branch, you’re not using git").should == "If you’re doing all your development on the “master” branch, you’re not using git"
|
22
|
+
end
|
23
|
+
|
24
|
+
it "should leave unknown elements intact" do
|
25
|
+
remark(<<-HTML).should == "Foo\n\n<table>data</table>\n\nBar"
|
26
|
+
<p>Foo</p>
|
27
|
+
<table>data</table>
|
28
|
+
<p>Bar</p>
|
29
|
+
HTML
|
30
|
+
end
|
31
|
+
|
32
|
+
describe "whitespace" do
|
33
|
+
it "should strip excess whitespace" do
|
34
|
+
remark(<<-HTML).should == "Foo bar"
|
35
|
+
<p>
|
36
|
+
Foo
|
37
|
+
bar
|
38
|
+
</p>
|
39
|
+
HTML
|
40
|
+
end
|
41
|
+
|
42
|
+
it "should strip whitespace in text nodes between processed nodes" do
|
43
|
+
remark(<<-HTML).should == "Foo\n\nbar\n\nBaz"
|
44
|
+
<p>Foo</p>
|
45
|
+
|
46
|
+
bar
|
47
|
+
<p>Baz</p>
|
48
|
+
HTML
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
describe "lists" do
|
53
|
+
it "should support lists" do
|
54
|
+
remark(<<-HTML).should == "* foo\n* bar"
|
55
|
+
<ul>
|
56
|
+
<li>foo</li>
|
57
|
+
<li>bar</li>
|
58
|
+
</ul>
|
59
|
+
HTML
|
60
|
+
|
61
|
+
remark(<<-HTML).should == "1. foo\n2. bar"
|
62
|
+
<ol>
|
63
|
+
<li>foo</li>
|
64
|
+
<li>bar</li>
|
65
|
+
</ol>
|
66
|
+
HTML
|
67
|
+
end
|
68
|
+
|
69
|
+
it "should support lists with nested content" do
|
70
|
+
remark(<<-HTML).should == "* foo\n \n bar\n\n* baz"
|
71
|
+
<ul>
|
72
|
+
<li><p>foo</p><p>bar</p></li>
|
73
|
+
<li><p>baz</p></li>
|
74
|
+
</ul>
|
75
|
+
HTML
|
76
|
+
end
|
77
|
+
|
78
|
+
it "should output malformed lists as HTML" do
|
79
|
+
remark(<<-HTML).should == "<ul>\n <span>bar</span>\n </ul>"
|
80
|
+
<ul>
|
81
|
+
<span>bar</span>
|
82
|
+
</ul>
|
83
|
+
HTML
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
it "should support preformatted blocks" do
|
88
|
+
remark("<pre>def foo\n bar\nend</pre>").should == " def foo\n bar\n end"
|
89
|
+
remark("<pre><code>def foo\n <bar>\nend</code></pre>").should == " def foo\n <bar>\n end"
|
90
|
+
remark("<pre>def foo\n</pre>").should == " def foo"
|
91
|
+
end
|
92
|
+
|
93
|
+
describe "inline" do
|
94
|
+
it "should remark inline elements" do
|
95
|
+
remark("<p>I'm so <strong>strong</strong></p>").should == "I'm so **strong**"
|
96
|
+
remark("<p>I'm so <em>emo</em></p>").should == "I'm so _emo_"
|
97
|
+
remark("<ul><li><em>Inline</em> stuff in <strong>lists</strong></li></ul>").should == "* _Inline_ stuff in **lists**"
|
98
|
+
remark("<h1>Headings <em>too</em></h1>").should == '# Headings _too_'
|
99
|
+
end
|
100
|
+
|
101
|
+
it "should handle nested inline elements" do
|
102
|
+
remark("<p>I <strong>love <code>code</code></strong></p>").should == "I **love `code`**"
|
103
|
+
remark("<p>I <a href='#'>am <em>fine</em></a></p>").should == "I [am _fine_](#)"
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
describe "hyperlinks" do
|
108
|
+
it "should support hyperlinks" do
|
109
|
+
remark("<p>Click <a href='http://mislav.uniqpath.com'>here</a></p>").should ==
|
110
|
+
"Click [here](http://mislav.uniqpath.com)"
|
111
|
+
remark("<a href='/foo' title='bar'>baz</a>").should == '[baz](/foo "bar")'
|
112
|
+
end
|
113
|
+
|
114
|
+
it "should have reference-style hyperlinks" do
|
115
|
+
remark("<p>Click <a href='foo' title='mooslav'>here</a> and <a href='bar'>there</a></p>", :reference_links => true).should ==
|
116
|
+
"Click [here][1] and [there][2]\n\n\n[1]: foo \"mooslav\"\n[2]: bar"
|
117
|
+
remark("<p>Click <a href='foo'>here</a> and <a href='foo'>there</a></p>", :reference_links => true).should ==
|
118
|
+
"Click [here][1] and [there][1]\n\n\n[1]: foo"
|
119
|
+
remark("", :reference_links => true).should == ""
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
it "should support ignores" do
|
124
|
+
remark("<p>Foo <span>bar</span> baz</p>", :ignores => ['span']).should == "Foo baz"
|
125
|
+
end
|
126
|
+
|
127
|
+
describe "scoping" do
|
128
|
+
before do
|
129
|
+
@html = <<-HTML
|
130
|
+
<html>
|
131
|
+
<body>
|
132
|
+
<div id="div1">
|
133
|
+
<p>Only 1 paragraph</p>
|
134
|
+
</div>
|
135
|
+
<div id="div3">
|
136
|
+
<p>Wow, 3 paragraphs</p>
|
137
|
+
<p>This must be where the content is</p>
|
138
|
+
<p>I'm sure</p>
|
139
|
+
</div>
|
140
|
+
<div id="div2">
|
141
|
+
<p>Only 2 paragraphs</p>
|
142
|
+
<p>How disappointing</p>
|
143
|
+
</div>
|
144
|
+
</body>
|
145
|
+
</html>
|
146
|
+
HTML
|
147
|
+
end
|
148
|
+
|
149
|
+
it "should scope to the most likely element that holds content" do
|
150
|
+
remark(@html).should == "Wow, 3 paragraphs\n\nThis must be where the content is\n\nI'm sure"
|
151
|
+
end
|
152
|
+
|
153
|
+
it "should scope to the explicit scope" do
|
154
|
+
remark(@html, :scope => '#div2').should == "Only 2 paragraphs\n\nHow disappointing"
|
155
|
+
end
|
156
|
+
end
|
157
|
+
end
|
data/spec/sample.html
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
<h1>Remark — HTML to Markdown converter</h1>
|
2
|
+
|
3
|
+
<p>This is a sample document which will get updated as Remark understands more HTML.
|
4
|
+
It reflects what's currently supported.</p>
|
5
|
+
|
6
|
+
<p class="nice">Known block elements are left intact if they have attributes.
|
7
|
+
Markdown doesn't have a syntax for them.</p>
|
8
|
+
|
9
|
+
<table>
|
10
|
+
<tr>
|
11
|
+
<td>Elements that can't be represented in Markdown are left intact.</td>
|
12
|
+
</tr>
|
13
|
+
</table>
|
14
|
+
|
15
|
+
<p>SCRIPT and HEAD tags are swallowed, as browsers don't render them as content.</p>
|
16
|
+
|
17
|
+
<script type="text/javascript">
|
18
|
+
alert("I will not survive")
|
19
|
+
</script>
|
20
|
+
|
21
|
+
<p>Remark supports Markdown syntax for <em>inline</em> markup.
|
22
|
+
<a href="http://github.com/mislav">Hyperlinks</a> and <code>code spans</code> are a must.</p>
|
23
|
+
|
24
|
+
<ul>
|
25
|
+
<li>List items too;</li>
|
26
|
+
<li>ordered or unordered.</li>
|
27
|
+
</ul>
|
28
|
+
|
29
|
+
<ol>
|
30
|
+
<li><p>Paragraphs in list items</p></li>
|
31
|
+
<li><p>Make them have one blank line between them in Markdown</p></li>
|
32
|
+
<li>
|
33
|
+
<p>Some list items even have multiple paragraphs</p>
|
34
|
+
<p>That shouldn't be too hard to do … right?</p>
|
35
|
+
<pre>code blocks too</pre>
|
36
|
+
</li>
|
37
|
+
</ol>
|
38
|
+
|
39
|
+
<p>Remark supports BR elements in paragraphs,<br>
|
40
|
+
although people tend to abuse them.</p>
|
41
|
+
|
42
|
+
<pre><code>And who would forget
|
43
|
+
Preformatted code blocks :)</code></pre>
|
44
|
+
|
45
|
+
<p>Notice how it handles <img src="moo.jpg" alt="images" title="Awesum img"> in a nice way.</p>
|
46
|
+
|
47
|
+
<blockquote>
|
48
|
+
<p>I think</p>
|
49
|
+
<p>therefore I am</p>
|
50
|
+
<blockquote>
|
51
|
+
<p>Nested blockquotes</p>
|
52
|
+
</blockquote>
|
53
|
+
</blockquote>
|
metadata
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: remark
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.3.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- "Mislav Marohni\xC4\x87"
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-11-25 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hpricot
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ~>
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.8.2
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: rspec
|
27
|
+
type: :development
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.2.9
|
34
|
+
version:
|
35
|
+
description: Remark turns simple HTML documents or content in web pages to Markdown source.
|
36
|
+
email: mislav.marohnic@gmail.com
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files: []
|
42
|
+
|
43
|
+
files:
|
44
|
+
- Rakefile
|
45
|
+
- bin/remark
|
46
|
+
- lib/remark/core_ext.rb
|
47
|
+
- lib/remark/hpricot_ext.rb
|
48
|
+
- lib/remark.rb
|
49
|
+
- spec/hpricot_ext_spec.rb
|
50
|
+
- spec/remark_spec.rb
|
51
|
+
- spec/sample.html
|
52
|
+
- README.markdown
|
53
|
+
- LICENSE
|
54
|
+
has_rdoc: false
|
55
|
+
homepage: http://github.com/mislav/remark
|
56
|
+
licenses: []
|
57
|
+
|
58
|
+
post_install_message:
|
59
|
+
rdoc_options: []
|
60
|
+
|
61
|
+
require_paths:
|
62
|
+
- lib
|
63
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: "0"
|
68
|
+
version:
|
69
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
version: "0"
|
74
|
+
version:
|
75
|
+
requirements: []
|
76
|
+
|
77
|
+
rubyforge_project:
|
78
|
+
rubygems_version: 1.3.5
|
79
|
+
signing_key:
|
80
|
+
specification_version: 3
|
81
|
+
summary: HTML to Markdown converter
|
82
|
+
test_files: []
|
83
|
+
|