undress 0.1 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@
3
3
  Easily convert HTML back to Textile, Markdown, RDoc or whatever other
4
4
  markup language you like.
5
5
 
6
- require "undress"
6
+ require "undress/textile"
7
7
 
8
8
  code =<<html
9
9
  <h1>Hello world!</h1>
@@ -33,6 +33,10 @@ Will produce
33
33
  For now the only language supported is Textile. But I'll be happy to accept
34
34
  patches to add more languages :)
35
35
 
36
+ == Get it
37
+
38
+ gem install undress
39
+
36
40
  == License
37
41
 
38
42
  Authors:: Nicolas Sanguinetti (foca[http://github.com/foca])
@@ -61,6 +61,21 @@ module Undress
61
61
  pre_processing_rules[selector] = handler
62
62
  end
63
63
 
64
+ # Set a list of attributes you wish to whitelist
65
+ #
66
+ # Any attribute not in this list at the moment of parsing will be ignored by the
67
+ # parser. The method Grammar#attributes(node) will return a hash of the filtered
68
+ # attributes. Read its documentation for more details.
69
+ #
70
+ # whitelist_attributes :id, :class, :lang
71
+ def self.whitelist_attributes(*attrs)
72
+ @whitelisted_attributes = attrs
73
+ end
74
+
75
+ def self.whitelisted_attributes #:nodoc:
76
+ @whitelisted_attributes || []
77
+ end
78
+
64
79
  def self.post_processing_rules #:nodoc:
65
80
  @post_processing_rules ||= {}
66
81
  end
@@ -75,10 +90,12 @@ module Undress
75
90
 
76
91
  attr_reader :pre_processing_rules #:nodoc:
77
92
  attr_reader :post_processing_rules #:nodoc:
93
+ attr_reader :whitelisted_attributes #:nodoc:
78
94
 
79
95
  def initialize #:nodoc:
80
96
  @pre_processing_rules = self.class.pre_processing_rules.dup
81
97
  @post_processing_rules = self.class.post_processing_rules.dup
98
+ @whitelisted_attributes = self.class.whitelisted_attributes.dup
82
99
  end
83
100
 
84
101
  # Process a DOM node, converting it to your markup language according to
@@ -120,6 +137,29 @@ module Undress
120
137
  (node.next.text? && node.next.to_s =~ /^\s+/)
121
138
  end
122
139
 
140
+ # Hash of attributes, according to the white list. By default, no attributes
141
+ # are whitelisted, so you must set which ones to whitelist on each grammar.
142
+ #
143
+ # Supposing you set <tt>:id</tt> and <tt>:class</tt> as your
144
+ # <tt>whitelisted_attributes</tt>, and you have a node representing this
145
+ # HTML:
146
+ #
147
+ # <p lang="en" class="greeting">Hello World</p>
148
+ #
149
+ # Then the method would return:
150
+ #
151
+ # { :class => "greeting" }
152
+ #
153
+ # You can override this method in each grammar and call +super+ if you
154
+ # will represent your attributes consistently across all nodes (for
155
+ # example, +Textile+ always shows class and id inside parentheses.)
156
+ def attributes(node)
157
+ node.attributes.inject({}) do |attrs,(key,value)|
158
+ attrs[key.to_sym] = value if whitelisted_attributes.include?(key.to_sym)
159
+ attrs
160
+ end
161
+ end
162
+
123
163
  def method_missing(tag, node, *args) #:nodoc:
124
164
  process(node.children)
125
165
  end
@@ -2,6 +2,8 @@ require File.expand_path(File.dirname(__FILE__) + "/../undress")
2
2
 
3
3
  module Undress
4
4
  class Textile < Grammar
5
+ whitelist_attributes :class, :id, :lang, :style, :colspan, :rowspan
6
+
5
7
  # whitespace handling
6
8
  post_processing(/\n\n+/, "\n\n")
7
9
  post_processing(/\A\s+/, "")
@@ -27,35 +29,35 @@ module Undress
27
29
  alt = e.has_attribute?("alt") ? "(#{e["alt"]})" : ""
28
30
  "!#{e["src"]}#{alt}!"
29
31
  }
30
- rule_for(:strong) {|e| "*#{content_of(e)}*" }
31
- rule_for(:em) {|e| "_#{content_of(e)}_" }
32
- rule_for(:code) {|e| "@#{content_of(e)}@" }
33
- rule_for(:cite) {|e| "??#{content_of(e)}??" }
34
- rule_for(:sup) {|e| surrounded_by_whitespace?(e) ? "^#{content_of(e)}^" : "[^#{content_of(e)}^]" }
35
- rule_for(:sub) {|e| surrounded_by_whitespace?(e) ? "~#{content_of(e)}~" : "[~#{content_of(e)}~]" }
36
- rule_for(:ins) {|e| "+#{content_of(e)}+" }
37
- rule_for(:del) {|e| "-#{content_of(e)}-" }
32
+ rule_for(:strong) {|e| "*#{attributes(e)}#{content_of(e)}*" }
33
+ rule_for(:em) {|e| "_#{attributes(e)}#{content_of(e)}_" }
34
+ rule_for(:code) {|e| "@#{attributes(e)}#{content_of(e)}@" }
35
+ rule_for(:cite) {|e| "??#{attributes(e)}#{content_of(e)}??" }
36
+ rule_for(:sup) {|e| surrounded_by_whitespace?(e) ? "^#{attributes(e)}#{content_of(e)}^" : "[^#{attributes(e)}#{content_of(e)}^]" }
37
+ rule_for(:sub) {|e| surrounded_by_whitespace?(e) ? "~#{attributes(e)}#{content_of(e)}~" : "[~#{attributes(e)}#{content_of(e)}~]" }
38
+ rule_for(:ins) {|e| "+#{attributes(e)}#{content_of(e)}+" }
39
+ rule_for(:del) {|e| "-#{attributes(e)}#{content_of(e)}-" }
38
40
  rule_for(:acronym) {|e| e.has_attribute?("title") ? "#{content_of(e)}(#{e["title"]})" : content_of(e) }
39
41
 
40
42
  # text formatting and layout
41
- rule_for(:p) {|e| "\n\n#{content_of(e)}\n\n" }
43
+ rule_for(:p) {|e| "\n\n#{attributes(e) != "" ? "p#{attributes(e)}. " : ""}#{content_of(e)}\n\n" }
42
44
  rule_for(:br) {|e| "\n" }
43
- rule_for(:blockquote) {|e| "\n\nbq. #{content_of(e)}\n\n" }
45
+ rule_for(:blockquote) {|e| "\n\nbq#{attributes(e)}. #{content_of(e)}\n\n" }
44
46
  rule_for(:pre) {|e|
45
47
  if e.children.all? {|n| n.text? && n.content =~ /^\s+$/ || n.elem? && n.name == "code" }
46
- "\n\npc. #{content_of(e % "code")}\n\n"
48
+ "\n\npc#{attributes(e)}. #{content_of(e % "code")}\n\n"
47
49
  else
48
50
  "<pre>#{content_of(e)}</pre>"
49
51
  end
50
52
  }
51
53
 
52
54
  # headings
53
- rule_for(:h1) {|e| "\n\nh1. #{content_of(e)}\n\n" }
54
- rule_for(:h2) {|e| "\n\nh2. #{content_of(e)}\n\n" }
55
- rule_for(:h3) {|e| "\n\nh3. #{content_of(e)}\n\n" }
56
- rule_for(:h4) {|e| "\n\nh4. #{content_of(e)}\n\n" }
57
- rule_for(:h5) {|e| "\n\nh5. #{content_of(e)}\n\n" }
58
- rule_for(:h6) {|e| "\n\nh6. #{content_of(e)}\n\n" }
55
+ rule_for(:h1) {|e| "\n\nh1#{attributes(e)}. #{content_of(e)}\n\n" }
56
+ rule_for(:h2) {|e| "\n\nh2#{attributes(e)}. #{content_of(e)}\n\n" }
57
+ rule_for(:h3) {|e| "\n\nh3#{attributes(e)}. #{content_of(e)}\n\n" }
58
+ rule_for(:h4) {|e| "\n\nh4#{attributes(e)}. #{content_of(e)}\n\n" }
59
+ rule_for(:h5) {|e| "\n\nh5#{attributes(e)}. #{content_of(e)}\n\n" }
60
+ rule_for(:h6) {|e| "\n\nh6#{attributes(e)}. #{content_of(e)}\n\n" }
59
61
 
60
62
  # lists
61
63
  rule_for(:li) {|e|
@@ -77,19 +79,37 @@ module Undress
77
79
  rule_for(:dd) {|e| ":= #{content_of(e)} =:\n" }
78
80
 
79
81
  # tables
80
- rule_for(:table) {|e| "\n\n#{content_of(e)}\n" }
81
- rule_for(:tr) {|e| "#{content_of(e)}|\n" }
82
- rule_for(:td, :th) {|e|
83
- prefix = if e.name == "th"
84
- "_. "
85
- elsif e.has_attribute?("colspan")
86
- "\\#{e["colspan"]}. "
87
- elsif e.has_attribute?("rowspan")
88
- "/#{e["rowspan"]}. "
82
+ rule_for(:table) {|e| "\n\n#{content_of(e)}\n" }
83
+ rule_for(:tr) {|e| "#{content_of(e)}|\n" }
84
+ rule_for(:td, :th) {|e| "|#{e.name == "th" ? "_. " : attributes(e)}#{content_of(e)}" }
85
+
86
+ def attributes(node) #:nodoc:
87
+ filtered = super(node)
88
+
89
+ if filtered.has_key?(:colspan)
90
+ return "\\#{filtered[:colspan]}. "
89
91
  end
90
92
 
91
- "|#{prefix}#{content_of(e)}"
92
- }
93
+ if filtered.has_key?(:rowspan)
94
+ return "/#{filtered[:rowspan]}. "
95
+ end
96
+
97
+ if filtered.has_key?(:lang)
98
+ return "[#{filtered[:lang]}]"
99
+ end
100
+
101
+ if filtered.has_key?(:class) || filtered.has_key?(:id)
102
+ klass = filtered.fetch(:class, "")
103
+ id = filtered.fetch(:id, false) ? "#" + filtered[:id] : ""
104
+ return "(#{klass}#{id})"
105
+ end
106
+
107
+ if filtered.has_key?(:style)
108
+ return "{#{filtered[:style]}}"
109
+ end
110
+
111
+ ""
112
+ end
93
113
  end
94
114
 
95
115
  add_markup :textile, Textile
@@ -21,6 +21,10 @@ module Undress
21
21
  rule_for(:a) {|e| "" }
22
22
  end
23
23
 
24
+ class WithAttributes < Parent
25
+ whitelist_attributes :id, :class
26
+ end
27
+
24
28
  def parse_with(grammar, html)
25
29
  grammar.process!(Hpricot(html))
26
30
  end
@@ -51,5 +55,21 @@ module Undress
51
55
  assert_equal "<this was a div>Cuack</this was a div><this is a paragraph>O hai</this is a paragraph>", output
52
56
  end
53
57
  end
58
+
59
+ context "handles attributes" do
60
+ def attributes_for_tag(html)
61
+ WithAttributes.new.attributes(Hpricot(html).children.first)
62
+ end
63
+
64
+ test "whitelisted attributes are picked up in the attributes hash" do
65
+ attributes = attributes_for_tag("<p class='foo bar' id='baz'>Cuack</p>")
66
+ assert_equal({ :class => "foo bar", :id => "baz" }, attributes)
67
+ end
68
+
69
+ test "attributes that are not in the whitelist are ignored" do
70
+ attributes = attributes_for_tag("<p lang='es' id='saludo'>Hola</p>")
71
+ assert_equal({ :id => "saludo" }, attributes)
72
+ end
73
+ end
54
74
  end
55
75
  end
@@ -193,6 +193,28 @@ module Undress
193
193
  assert_renders_textile "Trademarked(tm)", "Trademarked&#8482;"
194
194
  end
195
195
  end
196
+
197
+ context "handling nodes with attributes" do
198
+ test "converts 'lang' to [_]" do
199
+ assert_renders_textile "*[es]hola*", "<strong lang='es'>hola</strong>"
200
+ end
201
+
202
+ test "converts 'class' to (_)" do
203
+ assert_renders_textile "*(foo)hola*", "<strong class='foo'>hola</strong>"
204
+ end
205
+
206
+ test "converts 'id' to (#_)" do
207
+ assert_renders_textile "*(#bar)hola*", "<strong id='bar'>hola</strong>"
208
+ end
209
+
210
+ test "converts both 'class' and 'id' to (_#_)" do
211
+ assert_renders_textile "*(foo#bar)hola*", "<strong id='bar' class='foo'>hola</strong>"
212
+ end
213
+
214
+ test "converts 'style' into {_}" do
215
+ assert_renders_textile "*{color:blue;}hola*", "<strong style='color:blue;'>hola</strong>"
216
+ end
217
+ end
196
218
  end
197
219
  end
198
220
  end
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = "undress"
3
- s.version = "0.1"
4
- s.date = "2009-07-13"
3
+ s.version = "0.1.1"
4
+ s.date = "2009-07-21"
5
5
 
6
6
  s.description = "Simply translate HTML to Textile, Markdown, or whatever other markup format you need"
7
7
  s.summary = "Convert HTML into other markup languages"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: undress
3
3
  version: !ruby/object:Gem::Version
4
- version: "0.1"
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - "Nicol\xC3\xA1s Sanguinetti"
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-07-13 00:00:00 -03:00
12
+ date: 2009-07-21 00:00:00 -03:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -97,7 +97,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
97
97
  requirements: []
98
98
 
99
99
  rubyforge_project: undress
100
- rubygems_version: 1.3.4
100
+ rubygems_version: 1.3.5
101
101
  signing_key:
102
102
  specification_version: 3
103
103
  summary: Convert HTML into other markup languages