undress 0.1 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +5 -1
- data/lib/undress/grammar.rb +40 -0
- data/lib/undress/textile.rb +48 -28
- data/test/test_grammar.rb +20 -0
- data/test/test_textile.rb +22 -0
- data/undress.gemspec +2 -2
- metadata +3 -3
data/README.rdoc
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
Easily convert back HTML to Textile, Markdown, RDoc or whatever other
|
4
4
|
markup language you like.
|
5
5
|
|
6
|
-
require "undress"
|
6
|
+
require "undress/textile"
|
7
7
|
|
8
8
|
code =<<html
|
9
9
|
<h1>Hello world!</h1>
|
@@ -33,6 +33,10 @@ Will produce
|
|
33
33
|
For now the only language supported is Textile. But I'll be happy to accept
|
34
34
|
patches to add more languages :)
|
35
35
|
|
36
|
+
== Get it
|
37
|
+
|
38
|
+
gem install undress
|
39
|
+
|
36
40
|
== License
|
37
41
|
|
38
42
|
Authors:: Nicolas Sanguinetti (foca[http://github.com/foca])
|
data/lib/undress/grammar.rb
CHANGED
@@ -61,6 +61,21 @@ module Undress
|
|
61
61
|
pre_processing_rules[selector] = handler
|
62
62
|
end
|
63
63
|
|
64
|
+
# Set a list of attributes you wish to whitelist
|
65
|
+
#
|
66
|
+
# Any attribute not in this list at the moment of parsing will be ignored by the
|
67
|
+
# parser. The method Grammar#attributes(node) will return a hash of the filtered
|
68
|
+
# attributes. Read its documentation for more details.
|
69
|
+
#
|
70
|
+
# whitelist_attributes :id, :class, :lang
|
71
|
+
def self.whitelist_attributes(*attrs)
|
72
|
+
@whitelisted_attributes = attrs
|
73
|
+
end
|
74
|
+
|
75
|
+
def self.whitelisted_attributes #:nodoc:
|
76
|
+
@whitelisted_attributes || []
|
77
|
+
end
|
78
|
+
|
64
79
|
def self.post_processing_rules #:nodoc:
|
65
80
|
@post_processing_rules ||= {}
|
66
81
|
end
|
@@ -75,10 +90,12 @@ module Undress
|
|
75
90
|
|
76
91
|
attr_reader :pre_processing_rules #:nodoc:
|
77
92
|
attr_reader :post_processing_rules #:nodoc:
|
93
|
+
attr_reader :whitelisted_attributes #:nodoc:
|
78
94
|
|
79
95
|
def initialize #:nodoc:
|
80
96
|
@pre_processing_rules = self.class.pre_processing_rules.dup
|
81
97
|
@post_processing_rules = self.class.post_processing_rules.dup
|
98
|
+
@whitelisted_attributes = self.class.whitelisted_attributes.dup
|
82
99
|
end
|
83
100
|
|
84
101
|
# Process a DOM node, converting it to your markup language according to
|
@@ -120,6 +137,29 @@ module Undress
|
|
120
137
|
(node.next.text? && node.next.to_s =~ /^\s+/)
|
121
138
|
end
|
122
139
|
|
140
|
+
# Hash of attributes, according to the white list. By default, no attributes
|
141
|
+
# are whitelisted, so you must set which ones to whitelist on each grammar.
|
142
|
+
#
|
143
|
+
# Supposing you set <tt>:id</tt> and <tt>:class</tt> as your
|
144
|
+
# <tt>whitelisted_attributes</tt>, and you have a node representing this
|
145
|
+
# HTML:
|
146
|
+
#
|
147
|
+
# <p lang="en" class="greeting">Hello World</p>
|
148
|
+
#
|
149
|
+
# Then the method would return:
|
150
|
+
#
|
151
|
+
# { :class => "greeting" }
|
152
|
+
#
|
153
|
+
# You can override this method in each grammar and call +super+ if you
|
154
|
+
# will represent your attributes consistently across all nodes (for
|
155
|
+
# example, +Textile+ always shows class and id inside parentheses.)
|
156
|
+
def attributes(node)
|
157
|
+
node.attributes.inject({}) do |attrs,(key,value)|
|
158
|
+
attrs[key.to_sym] = value if whitelisted_attributes.include?(key.to_sym)
|
159
|
+
attrs
|
160
|
+
end
|
161
|
+
end
|
162
|
+
|
123
163
|
def method_missing(tag, node, *args) #:nodoc:
|
124
164
|
process(node.children)
|
125
165
|
end
|
data/lib/undress/textile.rb
CHANGED
@@ -2,6 +2,8 @@ require File.expand_path(File.dirname(__FILE__) + "/../undress")
|
|
2
2
|
|
3
3
|
module Undress
|
4
4
|
class Textile < Grammar
|
5
|
+
whitelist_attributes :class, :id, :lang, :style, :colspan, :rowspan
|
6
|
+
|
5
7
|
# whitespace handling
|
6
8
|
post_processing(/\n\n+/, "\n\n")
|
7
9
|
post_processing(/\A\s+/, "")
|
@@ -27,35 +29,35 @@ module Undress
|
|
27
29
|
alt = e.has_attribute?("alt") ? "(#{e["alt"]})" : ""
|
28
30
|
"!#{e["src"]}#{alt}!"
|
29
31
|
}
|
30
|
-
rule_for(:strong) {|e| "*#{content_of(e)}*" }
|
31
|
-
rule_for(:em) {|e| "_#{content_of(e)}_" }
|
32
|
-
rule_for(:code) {|e| "@#{content_of(e)}@" }
|
33
|
-
rule_for(:cite) {|e| "??#{content_of(e)}??" }
|
34
|
-
rule_for(:sup) {|e| surrounded_by_whitespace?(e) ? "^#{content_of(e)}^" : "[^#{content_of(e)}^]" }
|
35
|
-
rule_for(:sub) {|e| surrounded_by_whitespace?(e) ? "~#{content_of(e)}~" : "[~#{content_of(e)}~]" }
|
36
|
-
rule_for(:ins) {|e| "+#{content_of(e)}+" }
|
37
|
-
rule_for(:del) {|e| "-#{content_of(e)}-" }
|
32
|
+
rule_for(:strong) {|e| "*#{attributes(e)}#{content_of(e)}*" }
|
33
|
+
rule_for(:em) {|e| "_#{attributes(e)}#{content_of(e)}_" }
|
34
|
+
rule_for(:code) {|e| "@#{attributes(e)}#{content_of(e)}@" }
|
35
|
+
rule_for(:cite) {|e| "??#{attributes(e)}#{content_of(e)}??" }
|
36
|
+
rule_for(:sup) {|e| surrounded_by_whitespace?(e) ? "^#{attributes(e)}#{content_of(e)}^" : "[^#{attributes(e)}#{content_of(e)}^]" }
|
37
|
+
rule_for(:sub) {|e| surrounded_by_whitespace?(e) ? "~#{attributes(e)}#{content_of(e)}~" : "[~#{attributes(e)}#{content_of(e)}~]" }
|
38
|
+
rule_for(:ins) {|e| "+#{attributes(e)}#{content_of(e)}+" }
|
39
|
+
rule_for(:del) {|e| "-#{attributes(e)}#{content_of(e)}-" }
|
38
40
|
rule_for(:acronym) {|e| e.has_attribute?("title") ? "#{content_of(e)}(#{e["title"]})" : content_of(e) }
|
39
41
|
|
40
42
|
# text formatting and layout
|
41
|
-
rule_for(:p) {|e| "\n\n#{content_of(e)}\n\n" }
|
43
|
+
rule_for(:p) {|e| "\n\n#{attributes(e) != "" ? "p#{attributes(e)}. " : ""}#{content_of(e)}\n\n" }
|
42
44
|
rule_for(:br) {|e| "\n" }
|
43
|
-
rule_for(:blockquote) {|e| "\n\nbq. #{content_of(e)}\n\n" }
|
45
|
+
rule_for(:blockquote) {|e| "\n\nbq#{attributes(e)}. #{content_of(e)}\n\n" }
|
44
46
|
rule_for(:pre) {|e|
|
45
47
|
if e.children.all? {|n| n.text? && n.content =~ /^\s+$/ || n.elem? && n.name == "code" }
|
46
|
-
"\n\npc. #{content_of(e % "code")}\n\n"
|
48
|
+
"\n\npc#{attributes(e)}. #{content_of(e % "code")}\n\n"
|
47
49
|
else
|
48
50
|
"<pre>#{content_of(e)}</pre>"
|
49
51
|
end
|
50
52
|
}
|
51
53
|
|
52
54
|
# headings
|
53
|
-
rule_for(:h1) {|e| "\n\nh1. #{content_of(e)}\n\n" }
|
54
|
-
rule_for(:h2) {|e| "\n\nh2. #{content_of(e)}\n\n" }
|
55
|
-
rule_for(:h3) {|e| "\n\nh3. #{content_of(e)}\n\n" }
|
56
|
-
rule_for(:h4) {|e| "\n\nh4. #{content_of(e)}\n\n" }
|
57
|
-
rule_for(:h5) {|e| "\n\nh5. #{content_of(e)}\n\n" }
|
58
|
-
rule_for(:h6) {|e| "\n\nh6. #{content_of(e)}\n\n" }
|
55
|
+
rule_for(:h1) {|e| "\n\nh1#{attributes(e)}. #{content_of(e)}\n\n" }
|
56
|
+
rule_for(:h2) {|e| "\n\nh2#{attributes(e)}. #{content_of(e)}\n\n" }
|
57
|
+
rule_for(:h3) {|e| "\n\nh3#{attributes(e)}. #{content_of(e)}\n\n" }
|
58
|
+
rule_for(:h4) {|e| "\n\nh4#{attributes(e)}. #{content_of(e)}\n\n" }
|
59
|
+
rule_for(:h5) {|e| "\n\nh5#{attributes(e)}. #{content_of(e)}\n\n" }
|
60
|
+
rule_for(:h6) {|e| "\n\nh6#{attributes(e)}. #{content_of(e)}\n\n" }
|
59
61
|
|
60
62
|
# lists
|
61
63
|
rule_for(:li) {|e|
|
@@ -77,19 +79,37 @@ module Undress
|
|
77
79
|
rule_for(:dd) {|e| ":= #{content_of(e)} =:\n" }
|
78
80
|
|
79
81
|
# tables
|
80
|
-
rule_for(:table)
|
81
|
-
rule_for(:tr)
|
82
|
-
rule_for(:td, :th) {|e|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
"
|
82
|
+
rule_for(:table) {|e| "\n\n#{content_of(e)}\n" }
|
83
|
+
rule_for(:tr) {|e| "#{content_of(e)}|\n" }
|
84
|
+
rule_for(:td, :th) {|e| "|#{e.name == "th" ? "_. " : attributes(e)}#{content_of(e)}" }
|
85
|
+
|
86
|
+
def attributes(node) #:nodoc:
|
87
|
+
filtered = super(node)
|
88
|
+
|
89
|
+
if filtered.has_key?(:colspan)
|
90
|
+
return "\\#{filtered[:colspan]}. "
|
89
91
|
end
|
90
92
|
|
91
|
-
|
92
|
-
|
93
|
+
if filtered.has_key?(:rowspan)
|
94
|
+
return "/#{filtered[:rowspan]}. "
|
95
|
+
end
|
96
|
+
|
97
|
+
if filtered.has_key?(:lang)
|
98
|
+
return "[#{filtered[:lang]}]"
|
99
|
+
end
|
100
|
+
|
101
|
+
if filtered.has_key?(:class) || filtered.has_key?(:id)
|
102
|
+
klass = filtered.fetch(:class, "")
|
103
|
+
id = filtered.fetch(:id, false) ? "#" + filtered[:id] : ""
|
104
|
+
return "(#{klass}#{id})"
|
105
|
+
end
|
106
|
+
|
107
|
+
if filtered.has_key?(:style)
|
108
|
+
return "{#{filtered[:style]}}"
|
109
|
+
end
|
110
|
+
|
111
|
+
""
|
112
|
+
end
|
93
113
|
end
|
94
114
|
|
95
115
|
add_markup :textile, Textile
|
data/test/test_grammar.rb
CHANGED
@@ -21,6 +21,10 @@ module Undress
|
|
21
21
|
rule_for(:a) {|e| "" }
|
22
22
|
end
|
23
23
|
|
24
|
+
class WithAttributes < Parent
|
25
|
+
whitelist_attributes :id, :class
|
26
|
+
end
|
27
|
+
|
24
28
|
def parse_with(grammar, html)
|
25
29
|
grammar.process!(Hpricot(html))
|
26
30
|
end
|
@@ -51,5 +55,21 @@ module Undress
|
|
51
55
|
assert_equal "<this was a div>Cuack</this was a div><this is a paragraph>O hai</this is a paragraph>", output
|
52
56
|
end
|
53
57
|
end
|
58
|
+
|
59
|
+
context "handles attributes" do
|
60
|
+
def attributes_for_tag(html)
|
61
|
+
WithAttributes.new.attributes(Hpricot(html).children.first)
|
62
|
+
end
|
63
|
+
|
64
|
+
test "whitelisted attributes are picked up in the attributes hash" do
|
65
|
+
attributes = attributes_for_tag("<p class='foo bar' id='baz'>Cuack</p>")
|
66
|
+
assert_equal({ :class => "foo bar", :id => "baz" }, attributes)
|
67
|
+
end
|
68
|
+
|
69
|
+
test "attributes that are not in the whitelist are ignored" do
|
70
|
+
attributes = attributes_for_tag("<p lang='es' id='saludo'>Hola</p>")
|
71
|
+
assert_equal({ :id => "saludo" }, attributes)
|
72
|
+
end
|
73
|
+
end
|
54
74
|
end
|
55
75
|
end
|
data/test/test_textile.rb
CHANGED
@@ -193,6 +193,28 @@ module Undress
|
|
193
193
|
assert_renders_textile "Trademarked(tm)", "Trademarked™"
|
194
194
|
end
|
195
195
|
end
|
196
|
+
|
197
|
+
context "handling nodes with attributes" do
|
198
|
+
test "converts 'lang' to [_]" do
|
199
|
+
assert_renders_textile "*[es]hola*", "<strong lang='es'>hola</strong>"
|
200
|
+
end
|
201
|
+
|
202
|
+
test "converts 'class' to (_)" do
|
203
|
+
assert_renders_textile "*(foo)hola*", "<strong class='foo'>hola</strong>"
|
204
|
+
end
|
205
|
+
|
206
|
+
test "converts 'id' to (#_)" do
|
207
|
+
assert_renders_textile "*(#bar)hola*", "<strong id='bar'>hola</strong>"
|
208
|
+
end
|
209
|
+
|
210
|
+
test "converts both 'class' and 'id' to (_#_)" do
|
211
|
+
assert_renders_textile "*(foo#bar)hola*", "<strong id='bar' class='foo'>hola</strong>"
|
212
|
+
end
|
213
|
+
|
214
|
+
test "converts 'style' into {_}" do
|
215
|
+
assert_renders_textile "*{color:blue;}hola*", "<strong style='color:blue;'>hola</strong>"
|
216
|
+
end
|
217
|
+
end
|
196
218
|
end
|
197
219
|
end
|
198
220
|
end
|
data/undress.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = "undress"
|
3
|
-
s.version = "0.1"
|
4
|
-
s.date = "2009-07-
|
3
|
+
s.version = "0.1.1"
|
4
|
+
s.date = "2009-07-21"
|
5
5
|
|
6
6
|
s.description = "Simply translate HTML to Textile, Markdown, or whatever other markup format you need"
|
7
7
|
s.summary = "Convert HTML into other markup languages"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: undress
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- "Nicol\xC3\xA1s Sanguinetti"
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-07-
|
12
|
+
date: 2009-07-21 00:00:00 -03:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -97,7 +97,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
97
97
|
requirements: []
|
98
98
|
|
99
99
|
rubyforge_project: undress
|
100
|
-
rubygems_version: 1.3.
|
100
|
+
rubygems_version: 1.3.5
|
101
101
|
signing_key:
|
102
102
|
specification_version: 3
|
103
103
|
summary: Convert HTML into other markup languages
|