undress 0.1 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +5 -1
- data/lib/undress/grammar.rb +40 -0
- data/lib/undress/textile.rb +48 -28
- data/test/test_grammar.rb +20 -0
- data/test/test_textile.rb +22 -0
- data/undress.gemspec +2 -2
- metadata +3 -3
data/README.rdoc
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
Easily convert back HTML to Textile, Markdown, RDoc or whatever other
|
4
4
|
markup language you like.
|
5
5
|
|
6
|
-
require "undress"
|
6
|
+
require "undress/textile"
|
7
7
|
|
8
8
|
code =<<html
|
9
9
|
<h1>Hello world!</h1>
|
@@ -33,6 +33,10 @@ Will produce
|
|
33
33
|
For now the only language supported is Textile. But I'll be happy to accept
|
34
34
|
patches to add more languages :)
|
35
35
|
|
36
|
+
== Get it
|
37
|
+
|
38
|
+
gem install undress
|
39
|
+
|
36
40
|
== License
|
37
41
|
|
38
42
|
Authors:: Nicolas Sanguinetti (foca[http://github.com/foca])
|
data/lib/undress/grammar.rb
CHANGED
@@ -61,6 +61,21 @@ module Undress
|
|
61
61
|
pre_processing_rules[selector] = handler
|
62
62
|
end
|
63
63
|
|
64
|
+
# Set a list of attributes you wish to whitelist
|
65
|
+
#
|
66
|
+
# Any attribute not in this list at the moment of parsing will be ignored by the
|
67
|
+
# parser. The method Grammar#attributes(node) will return a hash of the filtered
|
68
|
+
# attributes. Read its documentation for more details.
|
69
|
+
#
|
70
|
+
# whitelist_attributes :id, :class, :lang
|
71
|
+
def self.whitelist_attributes(*attrs)
|
72
|
+
@whitelisted_attributes = attrs
|
73
|
+
end
|
74
|
+
|
75
|
+
def self.whitelisted_attributes #:nodoc:
|
76
|
+
@whitelisted_attributes || []
|
77
|
+
end
|
78
|
+
|
64
79
|
def self.post_processing_rules #:nodoc:
|
65
80
|
@post_processing_rules ||= {}
|
66
81
|
end
|
@@ -75,10 +90,12 @@ module Undress
|
|
75
90
|
|
76
91
|
attr_reader :pre_processing_rules #:nodoc:
|
77
92
|
attr_reader :post_processing_rules #:nodoc:
|
93
|
+
attr_reader :whitelisted_attributes #:nodoc:
|
78
94
|
|
79
95
|
def initialize #:nodoc:
|
80
96
|
@pre_processing_rules = self.class.pre_processing_rules.dup
|
81
97
|
@post_processing_rules = self.class.post_processing_rules.dup
|
98
|
+
@whitelisted_attributes = self.class.whitelisted_attributes.dup
|
82
99
|
end
|
83
100
|
|
84
101
|
# Process a DOM node, converting it to your markup language according to
|
@@ -120,6 +137,29 @@ module Undress
|
|
120
137
|
(node.next.text? && node.next.to_s =~ /^\s+/)
|
121
138
|
end
|
122
139
|
|
140
|
+
# Hash of attributes, according to the white list. By default, no attributes
|
141
|
+
# are whitelisted, so you must set which ones to whitelist on each grammar.
|
142
|
+
#
|
143
|
+
# Supposing you set <tt>:id</tt> and <tt>:class</tt> as your
|
144
|
+
# <tt>whitelisted_attributes</tt>, and you have a node representing this
|
145
|
+
# HTML:
|
146
|
+
#
|
147
|
+
# <p lang="en" class="greeting">Hello World</p>
|
148
|
+
#
|
149
|
+
# Then the method would return:
|
150
|
+
#
|
151
|
+
# { :class => "greeting" }
|
152
|
+
#
|
153
|
+
# You can override this method in each grammar and call +super+ if you
|
154
|
+
# will represent your attributes consistently across all nodes (for
|
155
|
+
# example, +Textile+ always shows class and id inside parentheses.)
|
156
|
+
def attributes(node)
|
157
|
+
node.attributes.inject({}) do |attrs,(key,value)|
|
158
|
+
attrs[key.to_sym] = value if whitelisted_attributes.include?(key.to_sym)
|
159
|
+
attrs
|
160
|
+
end
|
161
|
+
end
|
162
|
+
|
123
163
|
def method_missing(tag, node, *args) #:nodoc:
|
124
164
|
process(node.children)
|
125
165
|
end
|
data/lib/undress/textile.rb
CHANGED
@@ -2,6 +2,8 @@ require File.expand_path(File.dirname(__FILE__) + "/../undress")
|
|
2
2
|
|
3
3
|
module Undress
|
4
4
|
class Textile < Grammar
|
5
|
+
whitelist_attributes :class, :id, :lang, :style, :colspan, :rowspan
|
6
|
+
|
5
7
|
# whitespace handling
|
6
8
|
post_processing(/\n\n+/, "\n\n")
|
7
9
|
post_processing(/\A\s+/, "")
|
@@ -27,35 +29,35 @@ module Undress
|
|
27
29
|
alt = e.has_attribute?("alt") ? "(#{e["alt"]})" : ""
|
28
30
|
"!#{e["src"]}#{alt}!"
|
29
31
|
}
|
30
|
-
rule_for(:strong) {|e| "*#{content_of(e)}*" }
|
31
|
-
rule_for(:em) {|e| "_#{content_of(e)}_" }
|
32
|
-
rule_for(:code) {|e| "@#{content_of(e)}@" }
|
33
|
-
rule_for(:cite) {|e| "??#{content_of(e)}??" }
|
34
|
-
rule_for(:sup) {|e| surrounded_by_whitespace?(e) ? "^#{content_of(e)}^" : "[^#{content_of(e)}^]" }
|
35
|
-
rule_for(:sub) {|e| surrounded_by_whitespace?(e) ? "~#{content_of(e)}~" : "[~#{content_of(e)}~]" }
|
36
|
-
rule_for(:ins) {|e| "+#{content_of(e)}+" }
|
37
|
-
rule_for(:del) {|e| "-#{content_of(e)}-" }
|
32
|
+
rule_for(:strong) {|e| "*#{attributes(e)}#{content_of(e)}*" }
|
33
|
+
rule_for(:em) {|e| "_#{attributes(e)}#{content_of(e)}_" }
|
34
|
+
rule_for(:code) {|e| "@#{attributes(e)}#{content_of(e)}@" }
|
35
|
+
rule_for(:cite) {|e| "??#{attributes(e)}#{content_of(e)}??" }
|
36
|
+
rule_for(:sup) {|e| surrounded_by_whitespace?(e) ? "^#{attributes(e)}#{content_of(e)}^" : "[^#{attributes(e)}#{content_of(e)}^]" }
|
37
|
+
rule_for(:sub) {|e| surrounded_by_whitespace?(e) ? "~#{attributes(e)}#{content_of(e)}~" : "[~#{attributes(e)}#{content_of(e)}~]" }
|
38
|
+
rule_for(:ins) {|e| "+#{attributes(e)}#{content_of(e)}+" }
|
39
|
+
rule_for(:del) {|e| "-#{attributes(e)}#{content_of(e)}-" }
|
38
40
|
rule_for(:acronym) {|e| e.has_attribute?("title") ? "#{content_of(e)}(#{e["title"]})" : content_of(e) }
|
39
41
|
|
40
42
|
# text formatting and layout
|
41
|
-
rule_for(:p) {|e| "\n\n#{content_of(e)}\n\n" }
|
43
|
+
rule_for(:p) {|e| "\n\n#{attributes(e) != "" ? "p#{attributes(e)}. " : ""}#{content_of(e)}\n\n" }
|
42
44
|
rule_for(:br) {|e| "\n" }
|
43
|
-
rule_for(:blockquote) {|e| "\n\nbq. #{content_of(e)}\n\n" }
|
45
|
+
rule_for(:blockquote) {|e| "\n\nbq#{attributes(e)}. #{content_of(e)}\n\n" }
|
44
46
|
rule_for(:pre) {|e|
|
45
47
|
if e.children.all? {|n| n.text? && n.content =~ /^\s+$/ || n.elem? && n.name == "code" }
|
46
|
-
"\n\npc. #{content_of(e % "code")}\n\n"
|
48
|
+
"\n\npc#{attributes(e)}. #{content_of(e % "code")}\n\n"
|
47
49
|
else
|
48
50
|
"<pre>#{content_of(e)}</pre>"
|
49
51
|
end
|
50
52
|
}
|
51
53
|
|
52
54
|
# headings
|
53
|
-
rule_for(:h1) {|e| "\n\nh1. #{content_of(e)}\n\n" }
|
54
|
-
rule_for(:h2) {|e| "\n\nh2. #{content_of(e)}\n\n" }
|
55
|
-
rule_for(:h3) {|e| "\n\nh3. #{content_of(e)}\n\n" }
|
56
|
-
rule_for(:h4) {|e| "\n\nh4. #{content_of(e)}\n\n" }
|
57
|
-
rule_for(:h5) {|e| "\n\nh5. #{content_of(e)}\n\n" }
|
58
|
-
rule_for(:h6) {|e| "\n\nh6. #{content_of(e)}\n\n" }
|
55
|
+
rule_for(:h1) {|e| "\n\nh1#{attributes(e)}. #{content_of(e)}\n\n" }
|
56
|
+
rule_for(:h2) {|e| "\n\nh2#{attributes(e)}. #{content_of(e)}\n\n" }
|
57
|
+
rule_for(:h3) {|e| "\n\nh3#{attributes(e)}. #{content_of(e)}\n\n" }
|
58
|
+
rule_for(:h4) {|e| "\n\nh4#{attributes(e)}. #{content_of(e)}\n\n" }
|
59
|
+
rule_for(:h5) {|e| "\n\nh5#{attributes(e)}. #{content_of(e)}\n\n" }
|
60
|
+
rule_for(:h6) {|e| "\n\nh6#{attributes(e)}. #{content_of(e)}\n\n" }
|
59
61
|
|
60
62
|
# lists
|
61
63
|
rule_for(:li) {|e|
|
@@ -77,19 +79,37 @@ module Undress
|
|
77
79
|
rule_for(:dd) {|e| ":= #{content_of(e)} =:\n" }
|
78
80
|
|
79
81
|
# tables
|
80
|
-
rule_for(:table)
|
81
|
-
rule_for(:tr)
|
82
|
-
rule_for(:td, :th) {|e|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
"
|
82
|
+
rule_for(:table) {|e| "\n\n#{content_of(e)}\n" }
|
83
|
+
rule_for(:tr) {|e| "#{content_of(e)}|\n" }
|
84
|
+
rule_for(:td, :th) {|e| "|#{e.name == "th" ? "_. " : attributes(e)}#{content_of(e)}" }
|
85
|
+
|
86
|
+
def attributes(node) #:nodoc:
|
87
|
+
filtered = super(node)
|
88
|
+
|
89
|
+
if filtered.has_key?(:colspan)
|
90
|
+
return "\\#{filtered[:colspan]}. "
|
89
91
|
end
|
90
92
|
|
91
|
-
|
92
|
-
|
93
|
+
if filtered.has_key?(:rowspan)
|
94
|
+
return "/#{filtered[:rowspan]}. "
|
95
|
+
end
|
96
|
+
|
97
|
+
if filtered.has_key?(:lang)
|
98
|
+
return "[#{filtered[:lang]}]"
|
99
|
+
end
|
100
|
+
|
101
|
+
if filtered.has_key?(:class) || filtered.has_key?(:id)
|
102
|
+
klass = filtered.fetch(:class, "")
|
103
|
+
id = filtered.fetch(:id, false) ? "#" + filtered[:id] : ""
|
104
|
+
return "(#{klass}#{id})"
|
105
|
+
end
|
106
|
+
|
107
|
+
if filtered.has_key?(:style)
|
108
|
+
return "{#{filtered[:style]}}"
|
109
|
+
end
|
110
|
+
|
111
|
+
""
|
112
|
+
end
|
93
113
|
end
|
94
114
|
|
95
115
|
add_markup :textile, Textile
|
data/test/test_grammar.rb
CHANGED
@@ -21,6 +21,10 @@ module Undress
|
|
21
21
|
rule_for(:a) {|e| "" }
|
22
22
|
end
|
23
23
|
|
24
|
+
class WithAttributes < Parent
|
25
|
+
whitelist_attributes :id, :class
|
26
|
+
end
|
27
|
+
|
24
28
|
def parse_with(grammar, html)
|
25
29
|
grammar.process!(Hpricot(html))
|
26
30
|
end
|
@@ -51,5 +55,21 @@ module Undress
|
|
51
55
|
assert_equal "<this was a div>Cuack</this was a div><this is a paragraph>O hai</this is a paragraph>", output
|
52
56
|
end
|
53
57
|
end
|
58
|
+
|
59
|
+
context "handles attributes" do
|
60
|
+
def attributes_for_tag(html)
|
61
|
+
WithAttributes.new.attributes(Hpricot(html).children.first)
|
62
|
+
end
|
63
|
+
|
64
|
+
test "whitelisted attributes are picked up in the attributes hash" do
|
65
|
+
attributes = attributes_for_tag("<p class='foo bar' id='baz'>Cuack</p>")
|
66
|
+
assert_equal({ :class => "foo bar", :id => "baz" }, attributes)
|
67
|
+
end
|
68
|
+
|
69
|
+
test "attributes that are not in the whitelist are ignored" do
|
70
|
+
attributes = attributes_for_tag("<p lang='es' id='saludo'>Hola</p>")
|
71
|
+
assert_equal({ :id => "saludo" }, attributes)
|
72
|
+
end
|
73
|
+
end
|
54
74
|
end
|
55
75
|
end
|
data/test/test_textile.rb
CHANGED
@@ -193,6 +193,28 @@ module Undress
|
|
193
193
|
assert_renders_textile "Trademarked(tm)", "Trademarked™"
|
194
194
|
end
|
195
195
|
end
|
196
|
+
|
197
|
+
context "handling nodes with attributes" do
|
198
|
+
test "converts 'lang' to [_]" do
|
199
|
+
assert_renders_textile "*[es]hola*", "<strong lang='es'>hola</strong>"
|
200
|
+
end
|
201
|
+
|
202
|
+
test "converts 'class' to (_)" do
|
203
|
+
assert_renders_textile "*(foo)hola*", "<strong class='foo'>hola</strong>"
|
204
|
+
end
|
205
|
+
|
206
|
+
test "converts 'id' to (#_)" do
|
207
|
+
assert_renders_textile "*(#bar)hola*", "<strong id='bar'>hola</strong>"
|
208
|
+
end
|
209
|
+
|
210
|
+
test "converts both 'class' and 'id' to (_#_)" do
|
211
|
+
assert_renders_textile "*(foo#bar)hola*", "<strong id='bar' class='foo'>hola</strong>"
|
212
|
+
end
|
213
|
+
|
214
|
+
test "converts 'style' into {_}" do
|
215
|
+
assert_renders_textile "*{color:blue;}hola*", "<strong style='color:blue;'>hola</strong>"
|
216
|
+
end
|
217
|
+
end
|
196
218
|
end
|
197
219
|
end
|
198
220
|
end
|
data/undress.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = "undress"
|
3
|
-
s.version = "0.1"
|
4
|
-
s.date = "2009-07-
|
3
|
+
s.version = "0.1.1"
|
4
|
+
s.date = "2009-07-21"
|
5
5
|
|
6
6
|
s.description = "Simply translate HTML to Textile, Markdown, or whatever other markup format you need"
|
7
7
|
s.summary = "Convert HTML into other markup languages"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: undress
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- "Nicol\xC3\xA1s Sanguinetti"
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-07-
|
12
|
+
date: 2009-07-21 00:00:00 -03:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -97,7 +97,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
97
97
|
requirements: []
|
98
98
|
|
99
99
|
rubyforge_project: undress
|
100
|
-
rubygems_version: 1.3.
|
100
|
+
rubygems_version: 1.3.5
|
101
101
|
signing_key:
|
102
102
|
specification_version: 3
|
103
103
|
summary: Convert HTML into other markup languages
|