undress 0.1 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -3,7 +3,7 @@
3
3
  Easily convert HTML back to Textile, Markdown, RDoc or whatever other
4
4
  markup language you like.
5
5
 
6
- require "undress"
6
+ require "undress/textile"
7
7
 
8
8
  code =<<html
9
9
  <h1>Hello world!</h1>
@@ -33,6 +33,10 @@ Will produce
33
33
  For now the only language supported is Textile. But I'll be happy to accept
34
34
  patches to add more languages :)
35
35
 
36
+ == Get it
37
+
38
+ gem install undress
39
+
36
40
  == License
37
41
 
38
42
  Authors:: Nicolas Sanguinetti (foca[http://github.com/foca])
@@ -61,6 +61,21 @@ module Undress
61
61
  pre_processing_rules[selector] = handler
62
62
  end
63
63
 
64
+ # Set a list of attributes you wish to whitelist
65
+ #
66
+ # Any attribute not in this list at the moment of parsing will be ignored by the
67
+ # parser. The method Grammar#attributes(node) will return a hash of the filtered
68
+ # attributes. Read its documentation for more details.
69
+ #
70
+ # whitelist_attributes :id, :class, :lang
71
+ def self.whitelist_attributes(*attrs)
72
+ @whitelisted_attributes = attrs
73
+ end
74
+
75
+ def self.whitelisted_attributes #:nodoc:
76
+ @whitelisted_attributes || []
77
+ end
78
+
64
79
  def self.post_processing_rules #:nodoc:
65
80
  @post_processing_rules ||= {}
66
81
  end
@@ -75,10 +90,12 @@ module Undress
75
90
 
76
91
  attr_reader :pre_processing_rules #:nodoc:
77
92
  attr_reader :post_processing_rules #:nodoc:
93
+ attr_reader :whitelisted_attributes #:nodoc:
78
94
 
79
95
  def initialize #:nodoc:
80
96
  @pre_processing_rules = self.class.pre_processing_rules.dup
81
97
  @post_processing_rules = self.class.post_processing_rules.dup
98
+ @whitelisted_attributes = self.class.whitelisted_attributes.dup
82
99
  end
83
100
 
84
101
  # Process a DOM node, converting it to your markup language according to
@@ -120,6 +137,29 @@ module Undress
120
137
  (node.next.text? && node.next.to_s =~ /^\s+/)
121
138
  end
122
139
 
140
+ # Hash of attributes, according to the white list. By default, no attributes
141
+ # are whitelisted, so you must set which ones to whitelist on each grammar.
142
+ #
143
+ # Supposing you set <tt>:id</tt> and <tt>:class</tt> as your
144
+ # <tt>whitelisted_attributes</tt>, and you have a node representing this
145
+ # HTML:
146
+ #
147
+ # <p lang="en" class="greeting">Hello World</p>
148
+ #
149
+ # Then the method would return:
150
+ #
151
+ # { :class => "greeting" }
152
+ #
153
+ # You can override this method in each grammar and call +super+ if you
154
+ # will represent your attributes consistently across all nodes (for
155
+ # example, +Textile+ always shows class and id inside parentheses.)
156
+ def attributes(node)
157
+ node.attributes.inject({}) do |attrs,(key,value)|
158
+ attrs[key.to_sym] = value if whitelisted_attributes.include?(key.to_sym)
159
+ attrs
160
+ end
161
+ end
162
+
123
163
  def method_missing(tag, node, *args) #:nodoc:
124
164
  process(node.children)
125
165
  end
@@ -2,6 +2,8 @@ require File.expand_path(File.dirname(__FILE__) + "/../undress")
2
2
 
3
3
  module Undress
4
4
  class Textile < Grammar
5
+ whitelist_attributes :class, :id, :lang, :style, :colspan, :rowspan
6
+
5
7
  # whitespace handling
6
8
  post_processing(/\n\n+/, "\n\n")
7
9
  post_processing(/\A\s+/, "")
@@ -27,35 +29,35 @@ module Undress
27
29
  alt = e.has_attribute?("alt") ? "(#{e["alt"]})" : ""
28
30
  "!#{e["src"]}#{alt}!"
29
31
  }
30
- rule_for(:strong) {|e| "*#{content_of(e)}*" }
31
- rule_for(:em) {|e| "_#{content_of(e)}_" }
32
- rule_for(:code) {|e| "@#{content_of(e)}@" }
33
- rule_for(:cite) {|e| "??#{content_of(e)}??" }
34
- rule_for(:sup) {|e| surrounded_by_whitespace?(e) ? "^#{content_of(e)}^" : "[^#{content_of(e)}^]" }
35
- rule_for(:sub) {|e| surrounded_by_whitespace?(e) ? "~#{content_of(e)}~" : "[~#{content_of(e)}~]" }
36
- rule_for(:ins) {|e| "+#{content_of(e)}+" }
37
- rule_for(:del) {|e| "-#{content_of(e)}-" }
32
+ rule_for(:strong) {|e| "*#{attributes(e)}#{content_of(e)}*" }
33
+ rule_for(:em) {|e| "_#{attributes(e)}#{content_of(e)}_" }
34
+ rule_for(:code) {|e| "@#{attributes(e)}#{content_of(e)}@" }
35
+ rule_for(:cite) {|e| "??#{attributes(e)}#{content_of(e)}??" }
36
+ rule_for(:sup) {|e| surrounded_by_whitespace?(e) ? "^#{attributes(e)}#{content_of(e)}^" : "[^#{attributes(e)}#{content_of(e)}^]" }
37
+ rule_for(:sub) {|e| surrounded_by_whitespace?(e) ? "~#{attributes(e)}#{content_of(e)}~" : "[~#{attributes(e)}#{content_of(e)}~]" }
38
+ rule_for(:ins) {|e| "+#{attributes(e)}#{content_of(e)}+" }
39
+ rule_for(:del) {|e| "-#{attributes(e)}#{content_of(e)}-" }
38
40
  rule_for(:acronym) {|e| e.has_attribute?("title") ? "#{content_of(e)}(#{e["title"]})" : content_of(e) }
39
41
 
40
42
  # text formatting and layout
41
- rule_for(:p) {|e| "\n\n#{content_of(e)}\n\n" }
43
+ rule_for(:p) {|e| "\n\n#{attributes(e) != "" ? "p#{attributes(e)}. " : ""}#{content_of(e)}\n\n" }
42
44
  rule_for(:br) {|e| "\n" }
43
- rule_for(:blockquote) {|e| "\n\nbq. #{content_of(e)}\n\n" }
45
+ rule_for(:blockquote) {|e| "\n\nbq#{attributes(e)}. #{content_of(e)}\n\n" }
44
46
  rule_for(:pre) {|e|
45
47
  if e.children.all? {|n| n.text? && n.content =~ /^\s+$/ || n.elem? && n.name == "code" }
46
- "\n\npc. #{content_of(e % "code")}\n\n"
48
+ "\n\npc#{attributes(e)}. #{content_of(e % "code")}\n\n"
47
49
  else
48
50
  "<pre>#{content_of(e)}</pre>"
49
51
  end
50
52
  }
51
53
 
52
54
  # headings
53
- rule_for(:h1) {|e| "\n\nh1. #{content_of(e)}\n\n" }
54
- rule_for(:h2) {|e| "\n\nh2. #{content_of(e)}\n\n" }
55
- rule_for(:h3) {|e| "\n\nh3. #{content_of(e)}\n\n" }
56
- rule_for(:h4) {|e| "\n\nh4. #{content_of(e)}\n\n" }
57
- rule_for(:h5) {|e| "\n\nh5. #{content_of(e)}\n\n" }
58
- rule_for(:h6) {|e| "\n\nh6. #{content_of(e)}\n\n" }
55
+ rule_for(:h1) {|e| "\n\nh1#{attributes(e)}. #{content_of(e)}\n\n" }
56
+ rule_for(:h2) {|e| "\n\nh2#{attributes(e)}. #{content_of(e)}\n\n" }
57
+ rule_for(:h3) {|e| "\n\nh3#{attributes(e)}. #{content_of(e)}\n\n" }
58
+ rule_for(:h4) {|e| "\n\nh4#{attributes(e)}. #{content_of(e)}\n\n" }
59
+ rule_for(:h5) {|e| "\n\nh5#{attributes(e)}. #{content_of(e)}\n\n" }
60
+ rule_for(:h6) {|e| "\n\nh6#{attributes(e)}. #{content_of(e)}\n\n" }
59
61
 
60
62
  # lists
61
63
  rule_for(:li) {|e|
@@ -77,19 +79,37 @@ module Undress
77
79
  rule_for(:dd) {|e| ":= #{content_of(e)} =:\n" }
78
80
 
79
81
  # tables
80
- rule_for(:table) {|e| "\n\n#{content_of(e)}\n" }
81
- rule_for(:tr) {|e| "#{content_of(e)}|\n" }
82
- rule_for(:td, :th) {|e|
83
- prefix = if e.name == "th"
84
- "_. "
85
- elsif e.has_attribute?("colspan")
86
- "\\#{e["colspan"]}. "
87
- elsif e.has_attribute?("rowspan")
88
- "/#{e["rowspan"]}. "
82
+ rule_for(:table) {|e| "\n\n#{content_of(e)}\n" }
83
+ rule_for(:tr) {|e| "#{content_of(e)}|\n" }
84
+ rule_for(:td, :th) {|e| "|#{e.name == "th" ? "_. " : attributes(e)}#{content_of(e)}" }
85
+
86
+ def attributes(node) #:nodoc:
87
+ filtered = super(node)
88
+
89
+ if filtered.has_key?(:colspan)
90
+ return "\\#{filtered[:colspan]}. "
89
91
  end
90
92
 
91
- "|#{prefix}#{content_of(e)}"
92
- }
93
+ if filtered.has_key?(:rowspan)
94
+ return "/#{filtered[:rowspan]}. "
95
+ end
96
+
97
+ if filtered.has_key?(:lang)
98
+ return "[#{filtered[:lang]}]"
99
+ end
100
+
101
+ if filtered.has_key?(:class) || filtered.has_key?(:id)
102
+ klass = filtered.fetch(:class, "")
103
+ id = filtered.fetch(:id, false) ? "#" + filtered[:id] : ""
104
+ return "(#{klass}#{id})"
105
+ end
106
+
107
+ if filtered.has_key?(:style)
108
+ return "{#{filtered[:style]}}"
109
+ end
110
+
111
+ ""
112
+ end
93
113
  end
94
114
 
95
115
  add_markup :textile, Textile
@@ -21,6 +21,10 @@ module Undress
21
21
  rule_for(:a) {|e| "" }
22
22
  end
23
23
 
24
+ class WithAttributes < Parent
25
+ whitelist_attributes :id, :class
26
+ end
27
+
24
28
  def parse_with(grammar, html)
25
29
  grammar.process!(Hpricot(html))
26
30
  end
@@ -51,5 +55,21 @@ module Undress
51
55
  assert_equal "<this was a div>Cuack</this was a div><this is a paragraph>O hai</this is a paragraph>", output
52
56
  end
53
57
  end
58
+
59
+ context "handles attributes" do
60
+ def attributes_for_tag(html)
61
+ WithAttributes.new.attributes(Hpricot(html).children.first)
62
+ end
63
+
64
+ test "whitelisted attributes are picked up in the attributes hash" do
65
+ attributes = attributes_for_tag("<p class='foo bar' id='baz'>Cuack</p>")
66
+ assert_equal({ :class => "foo bar", :id => "baz" }, attributes)
67
+ end
68
+
69
+ test "attributes that are not in the whitelist are ignored" do
70
+ attributes = attributes_for_tag("<p lang='es' id='saludo'>Hola</p>")
71
+ assert_equal({ :id => "saludo" }, attributes)
72
+ end
73
+ end
54
74
  end
55
75
  end
@@ -193,6 +193,28 @@ module Undress
193
193
  assert_renders_textile "Trademarked(tm)", "Trademarked&#8482;"
194
194
  end
195
195
  end
196
+
197
+ context "handling nodes with attributes" do
198
+ test "converts 'lang' to [_]" do
199
+ assert_renders_textile "*[es]hola*", "<strong lang='es'>hola</strong>"
200
+ end
201
+
202
+ test "converts 'class' to (_)" do
203
+ assert_renders_textile "*(foo)hola*", "<strong class='foo'>hola</strong>"
204
+ end
205
+
206
+ test "converts 'id' to (#_)" do
207
+ assert_renders_textile "*(#bar)hola*", "<strong id='bar'>hola</strong>"
208
+ end
209
+
210
+ test "converts both 'class' and 'id' to (_#_)" do
211
+ assert_renders_textile "*(foo#bar)hola*", "<strong id='bar' class='foo'>hola</strong>"
212
+ end
213
+
214
+ test "converts 'style' into {_}" do
215
+ assert_renders_textile "*{color:blue;}hola*", "<strong style='color:blue;'>hola</strong>"
216
+ end
217
+ end
196
218
  end
197
219
  end
198
220
  end
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = "undress"
3
- s.version = "0.1"
4
- s.date = "2009-07-13"
3
+ s.version = "0.1.1"
4
+ s.date = "2009-07-21"
5
5
 
6
6
  s.description = "Simply translate HTML to Textile, Markdown, or whatever other markup format you need"
7
7
  s.summary = "Convert HTML into other markup languages"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: undress
3
3
  version: !ruby/object:Gem::Version
4
- version: "0.1"
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - "Nicol\xC3\xA1s Sanguinetti"
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-07-13 00:00:00 -03:00
12
+ date: 2009-07-21 00:00:00 -03:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -97,7 +97,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
97
97
  requirements: []
98
98
 
99
99
  rubyforge_project: undress
100
- rubygems_version: 1.3.4
100
+ rubygems_version: 1.3.5
101
101
  signing_key:
102
102
  specification_version: 3
103
103
  summary: Convert HTML into other markup languages