suung-undress 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,3 @@
1
+ doc
2
+ dist
3
+ tmp
@@ -0,0 +1,23 @@
1
+ * Fix spaces and <br> inside table cells
2
+
3
+ 0.2.3
4
+
5
+ * Add Hpricot style extension
6
+ * Move hpricot extensions from undress.rb to a single file
7
+ * Convert to Textile single formatted letters inside a word
8
+
9
+ 0.2.2
10
+
11
+ * Adding a \n after [[toc]] in greencloth parser
12
+
13
+ 0.2.1
14
+
15
+ * <span> with more than one styling bold|italic|underline|line-through
16
+ * <p> styling Italics, Underlines, Line-throughs and Bold are converted.
17
+
18
+ 0.2
19
+
20
+ * Fix on removing new lines, tabs and spaces on element and tag nodes.
21
+ * xhtmlize! method added with:
22
+ ** Invalid nested <ul> and <ol> lists are converted.
23
+ ** <span> styling Italics, Underlines, Line-throughs and Bold are converted.
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ (The MIT License)
2
+
3
+ Copyright (c) 2009 Nicolas Sanguinetti, entp.com
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ 'Software'), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,42 @@
1
+ = Undress
2
+
3
+ Easily convert HTML back to Textile or Greencloth.
4
+
5
+ require "undress/textile"
6
+
7
+ code =<<html
8
+ <h1>Hello world!</h1>
9
+ <p><strong>Hey!</strong> How is it going?</p>
10
+ <h2>Supported Markup Languages so far:</h2>
11
+ <ul>
12
+ <li>Textile</li>
13
+ <li>Greencloth</li>
14
+ </ul>
15
+ html
16
+
17
+ Undress(code).to_textile
18
+
19
+ Will produce
20
+
21
+ h1. Hello world!
22
+
23
+ *Hey!* How is it going?
24
+
25
+ h2. Supported Markup Languages so far:
26
+
27
+ * Textile
28
+ * Greencloth
29
+
30
+ == Supported Markup Languages
31
+
32
+ * Textile
33
+ * Greencloth, see [http://we.riseup.net]
34
+
35
+ == Get it
36
+
37
+ gem install zevarito-undress
38
+
39
+ == License
40
+
41
+ Authors:: Nicolas Sanguinetti (foca[http://github.com/foca]), Alvaro Gil (zevarito[http://github.com/zevarito])
42
+ License:: MIT (Check LICENSE for details)
@@ -0,0 +1,32 @@
1
require "rake/testtask"

# Optional dependency: use hanna's rdoc task when it is installed, otherwise
# fall back to the rdoc task that ships with Rake.
# NOTE(review): "rake/rdoctask" was removed from newer Rake releases in favour
# of "rdoc/task" -- confirm which Rake versions this project must support.
begin
  require "hanna/rdoctask"
rescue LoadError
  require "rake/rdoctask"
end

Rake::RDocTask.new do |rd|
  # The main page has to name one of the files listed in rdoc_files below;
  # "README" alone does not match "README.rdoc".
  rd.main = "README.rdoc"
  rd.title = "API Documentation for Undress"
  rd.rdoc_files.include("README.rdoc", "LICENSE", "lib/**/*.rb")
  rd.rdoc_dir = "doc"
end

# metric_fu is optional: its tasks are simply unavailable when the gem is not
# installed, so the LoadError is deliberately ignored.
begin
  require "metric_fu"
rescue LoadError
end

# mg is optional as well (presumably it adds gem packaging tasks driven by the
# gemspec -- verify against the mg documentation).
begin
  require "mg"
  MG.new("undress.gemspec")
rescue LoadError
end

desc "Default: run tests"
task :default => :test

Rake::TestTask.new do |t|
  t.test_files = FileList["test/test_*.rb"]
end
@@ -0,0 +1,6 @@
1
class Object #:nodoc:
  # Backport of Object#tap for interpreters that do not provide it: yields the
  # receiver to the block and then returns the receiver.
  #
  # Only defined when no +tap+ exists yet, so the interpreter's own
  # implementation is never overwritten.
  unless method_defined?(:tap)
    def tap
      yield self
      self
    end
  end
end
@@ -0,0 +1,88 @@
1
+ require "hpricot"
2
+
3
+ module ::Hpricot #:nodoc:
4
module Elem::Trav
  # Write one declaration into the element's inline style. The name is
  # converted to a String and the value goes through +fast_xs+ first
  # (presumably the XML-escaping helper Hpricot relies on -- verify).
  def set_style(name, value)
    escaped = value.fast_xs
    styles[name.to_s] = escaped
  end

  # Drop the declaration called +name+ from the inline style, if present.
  def del_style(name)
    styles.delete(name)
  end

  # Tell whether the inline style contains a declaration called +name+.
  def has_style?(name)
    styles.has_style?(name)
  end

  # Read the value of the declaration called +name+ (nil when it is absent).
  def get_style(name)
    styles[name]
  end
end
21
+
22
# Hash-like view over the declarations held in an element's inline +style+
# attribute. Nothing is cached: every read re-parses the attribute and every
# write serializes the declarations back into it.
#
# Property names are always handled as Strings, so Symbols may be used
# interchangeably with Strings as keys.
class Styles
  # +e+ is the element whose +style+ attribute is read and written; it must
  # respond to +has_attribute?+, +get_attribute+ and +set_attribute+.
  def initialize(e)
    @element = e
  end

  # Remove the declaration named +key+ and rewrite the attribute. The
  # attribute is left untouched (and nil is returned) when +key+ is not set.
  def delete(key)
    remaining = properties
    return unless remaining.delete(key.to_s)
    @element.set_attribute("style", serialize(remaining))
  end

  # Value of the declaration named +key+, or nil when it is not set.
  def [](key)
    properties[key.to_s]
  end

  # Set the declaration +k+ to +v+. An existing declaration with the same name
  # is replaced instead of a duplicate being appended.
  def []=(k, v)
    updated = properties
    updated[k.to_s] = v
    @element.set_attribute("style", serialize(updated))
  end

  # true when the element carries no style declarations, false otherwise.
  def empty?
    properties.empty?
  end

  # Whether a declaration named +key+ is present.
  def has_style?(key)
    properties.key?(key.to_s)
  end

  # String form of the parsed declarations Hash (Hash#to_s, not CSS text).
  def to_s
    properties.to_s
  end

  # The parsed declarations as a Hash of name => value Strings.
  def to_h
    properties
  end

  # Parse the +style+ attribute into a Hash of name => value, both stripped of
  # surrounding whitespace. Returns an empty Hash when there is no attribute.
  def properties
    return {} unless @element.has_attribute?("style")

    @element.get_attribute("style").split(";").inject({}) do |hash, declaration|
      # Split on the first colon only, so values that contain colons
      # themselves (e.g. "url(http://...)") are kept whole.
      name, value = declaration.split(":", 2)
      # Skip blank or malformed fragments such as the ones produced by
      # "a:b;;c:d" or a trailing "; ".
      next hash if value.nil? || name.strip.empty?
      hash.update(name.strip => value.strip)
    end
  end

  private

  # Turn a declarations Hash back into "name:value;name:value" text.
  def serialize(props)
    props.map { |pty, val| "#{pty}:#{val}" }.join(";")
  end
end
67
+
68
class Elem #:nodoc:
  # Enclosing nodes of this element, nearest first. The walk stops at the
  # first node whose own parent is nil, and that topmost node is not included.
  def ancestors
    chain = Elements[]
    node = parent
    while node.respond_to?(:parent) && node.parent
      chain << node
      node = node.parent
    end
    chain
  end

  # Rename this element to +new_tag+. Nothing happens when +etag+ is falsy
  # (presumably an element without a closing tag -- confirm against Hpricot).
  #
  # Unless +preserve_attr+ is true, every attribute is removed as well. The
  # attribute names are snapshotted through +to_hash+ first, so attributes are
  # not removed from a collection that is being iterated and the call works
  # whether +attributes+ is a Hash or a wrapper that only offers +to_hash+.
  # The return value is not meaningful.
  def change_tag!(new_tag, preserve_attr = true)
    return unless etag

    self.name = new_tag
    return if preserve_attr

    attributes.to_hash.keys.each { |key| remove_attribute(key) }
  end

  # Accessor for this element's inline style declarations.
  def styles
    Styles.new self
  end
end
88
+ end
@@ -0,0 +1,118 @@
1
+ require File.expand_path(File.dirname(__FILE__) + "/hpricot_ext")
2
+ require File.expand_path(File.dirname(__FILE__) + "/core_ext/object")
3
+ require File.expand_path(File.dirname(__FILE__) + "/undress/grammar")
4
+
5
# Entry point: wrap an HTML document so it can be undressed. +html+ may be a
# String or an IO object. The optional +options+ hash is forwarded untouched
# to Hpricot; check its
# documentation[http://code.whytheluckystiff.net/doc/hpricot] for details.
#
# Returns an Undress::Document.
def Undress(html, options={})
  document = Undress::Document.new(html, options)
  document
end
12
+
13
module Undress

  # Tag names treated as inline content: whitespace-only text that precedes
  # one of them is kept by Document#cleanup_indentation.
  INLINE_ELEMENTS = ['span', 'b', 'strong', 'i', 'em', 'ins', 'del','strike', 'abbr', 'acronym', 'cite', 'code', 'label', 'sub', 'sup'].freeze

  # Register a markup language. The name will become the method used to convert
  # HTML to this markup language: for example registering the name +:textile+
  # gives you <tt>Undress(code).to_textile</tt>, registering +:markdown+ would
  # give you <tt>Undress(code).to_markdown</tt>, etc.
  def self.add_markup(name, grammar)
    Document.add_markup(name, grammar)
  end

  class Document #:nodoc:
    # Tags that open a list.
    LIST_TAGS = %w[ul ol].freeze

    # Tags whose whitespace must be left exactly as written.
    PREFORMATTED_TAGS = %w[pre code].freeze

    # Inline-style declarations rewritten as real elements:
    # [property, value, replacement tag]. Order matters, because each wrapper
    # is placed around the ones produced before it.
    STYLE_CONVERSIONS = [
      ["font-style",      "italic",       "em"],
      ["text-decoration", "underline",    "ins"],
      ["text-decoration", "line-through", "del"],
      ["font-weight",     "bold",         "strong"]
    ].freeze

    # Parse +html+ with Hpricot (forwarding +options+) and normalize the
    # resulting tree so the grammars can work on it.
    def initialize(html, options)
      @doc = Hpricot(html, options)
      xhtmlize!
      cleanup_indentation
    end

    # Define <tt>to_#{name}</tt>, which runs +grammar+ over the parsed document.
    def self.add_markup(name, grammar)
      define_method "to_#{name}" do
        grammar.process!(@doc)
      end
    end

    private

    # Repair markup that is not valid XHTML and, more importantly, that the
    # grammars could not convert correctly if left as it is.
    def xhtmlize!
      # NOTE(review): `list.parent != "li"` compares a node with a String and
      # is kept verbatim from the original. It looks like it was meant to test
      # the parent's tag name, but doing that would skip lists nested inside
      # <li> and with them the badly nested lists they contain -- confirm the
      # intent before changing it.
      (@doc/"ul|ol").each do |list|
        fixup_list(list) if list.parent != "li" && !LIST_TAGS.include?(list.parent.name)
      end
      (@doc/"p|span").each {|e| fixup_span_with_styles(e)}
      (@doc/"strike").each {|e| e.change_tag! "del"}
      (@doc/"u").each {|e| e.change_tag! "ins"}
      (@doc/"td|th").each {|e| fixup_cells(e)}
    end

    # Delete tabs and newlines and collapse every run of whitespace into a
    # single space, everywhere except inside <pre> or <code> elements.
    def cleanup_indentation
      (@doc/"*").each do |e|
        if e.elem? && e.inner_html != "" && !PREFORMATTED_TAGS.include?(e.name) && e.children.size == 0
          e.inner_html = e.inner_html.gsub(/\n|\t/,"").gsub(/\s+/," ")
        elsif e.text? && !PREFORMATTED_TAGS.include?(e.parent.name)
          e.content = e.content.gsub(/\n|\t/,"").gsub(/\s+/," ")
          # Whitespace-only text is dropped unless an inline element follows it.
          e.content = e.content.gsub(/^\s+$/, "") if e.next_node && ! INLINE_ELEMENTS.include?(e.next_node.name)
        end
      end
    end

    # WYSIWYG editors often express bold, italic, etc. through inline styles
    # on <p> and <span>. Each such declaration becomes a real element wrapped
    # around the content and is then removed from the style. A <span> left
    # without any style is replaced by its content, so it does not show up in
    # the final conversion.
    def fixup_span_with_styles(e)
      return unless e.has_attribute?("style")

      STYLE_CONVERSIONS.each do |property, value, tag|
        next unless e.get_style(property) == value
        e.inner_html = "<#{tag}>#{e.inner_html}</#{tag}>"
        e.del_style(property)
      end

      e.swap e.inner_html if e.styles.empty? && e.name == "span"
    end

    # Fix up a badly nested list, i.e. a <ul>/<ol> placed as a sibling of an
    # <li> instead of inside one: the list is appended to the neighbouring
    # <li> (the previous one is preferred over the next one) and removed from
    # its old position.
    def fixup_list(list)
      list.children.each {|e| fixup_list(e) if e.elem? && LIST_TAGS.include?(e.name)}

      return if list.parent.name == "li"

      li_side = [list.previous_sibling, list.next_sibling].find {|sibling| sibling && sibling.name == "li"}
      return unless li_side

      li_side.inner_html = "#{li_side.inner_html}#{list.to_html}"
      list.parent.replace_child(list, "")
    end

    # Table cells need cleaning up for Textile:
    # * <br> elements are not allowed, so they are removed
    # * text nodes directly before and after the cell are emptied, because
    #   spaces between <td>/<th> elements break the formatting
    # * the cell content is stripped and &nbsp; entities become plain spaces
    def fixup_cells(e)
      e.search("br").remove
      e.next_node.content = "" if e.next_node && e.next_node.text?
      e.previous_node.content = "" if e.previous_node && e.previous_node.text?
      content = e.inner_html.gsub(/\&nbsp\;/,"\s").strip
      e.inner_html = content == "" ? [] : content
    end
  end
end
@@ -0,0 +1,188 @@
1
+ module Undress
2
# Grammars give you a DSL to declare how to convert an HTML document into a
# different markup language.
class Grammar
  # Give every subclass its own copy of the rules declared so far, so that
  # rules added to a subclass never leak into its parent or its siblings.
  def self.inherited(base) # :nodoc:
    super
    base.instance_variable_set(:@post_processing_rules, post_processing_rules.dup)
    base.instance_variable_set(:@pre_processing_rules, pre_processing_rules.dup)
  end

  # Add a parsing rule for a group of html tags.
  #
  #     rule_for :p do |element|
  #       "<this was a paragraph>#{content_of(element)}</this was a paragraph>"
  #     end
  #
  # will replace your <tt><p></tt> tags for <tt><this was a paragraph></tt>
  # tags, without altering the contents.
  #
  # The element yielded to the block is an Hpricot element for the given tag.
  def self.rule_for(*tags, &handler) # :yields: element
    tags.each do |tag|
      define_method tag.to_sym, &handler
    end
  end

  # Set a default rule for unrecognized tags.
  #
  # Unless you define a special case, it will ignore the tags and just output
  # the contents of unrecognized tags.
  #
  # The block runs in the context of the grammar instance, exactly like the
  # blocks given to +rule_for+, so helpers such as +content_of+ are available.
  def self.default(&handler) # :yields: element
    define_method :method_missing do |tag, *args|
      if args.empty?
        # No node was passed, so this is not a tag dispatch: report the
        # missing method the usual way.
        super(tag, *args)
      else
        instance_exec(args.first, &handler)
      end
    end
  end

  # Add a post-processing rule to your parser.
  #
  # This takes a regular expression that will be applied to the output after
  # processing any nodes. It can take a string as a replacement, or a block
  # that will be passed to String#gsub.
  #
  #     post_processing(/\n\n+/, "\n\n") # compress more than two newlines
  #     post_processing(/whatever/) { ... }
  def self.post_processing(regexp, replacement = nil, &handler) #:yields: matched_string
    post_processing_rules[regexp] = replacement || handler
  end

  # Add a pre-processing rule to your parser.
  #
  # This lets you mutate the DOM before applying any rule defined with
  # +rule_for+. You need to pass a CSS/XPath selector, and a block that
  # takes an Hpricot element to parse it.
  #
  #     pre_processing "ul.toc" do |element|
  #       element.swap("<p>[[toc]]</p>")
  #     end
  #
  # Would replace any unordered lists with the class +toc+ for a
  # paragraph containing the code <tt>[[toc]]</tt>.
  def self.pre_processing(selector, &handler) # :yields: element
    pre_processing_rules[selector] = handler
  end

  # Set a list of attributes you wish to whitelist
  #
  # Any attribute not in this list at the moment of parsing will be ignored by the
  # parser. The method Grammar#attributes(node) will return a hash of the filtered
  # attributes. Read its documentation for more details.
  #
  #     whitelist_attributes :id, :class, :lang
  def self.whitelist_attributes(*attrs)
    @whitelisted_attributes = attrs
  end

  def self.whitelisted_attributes #:nodoc:
    @whitelisted_attributes || []
  end

  def self.post_processing_rules #:nodoc:
    @post_processing_rules ||= {}
  end

  def self.pre_processing_rules #:nodoc:
    @pre_processing_rules ||= {}
  end

  def self.process!(node) #:nodoc:
    new.process!(node)
  end

  attr_reader :pre_processing_rules   #:nodoc:
  attr_reader :post_processing_rules  #:nodoc:
  attr_reader :whitelisted_attributes #:nodoc:

  # Every instance works on its own copy of the class-level configuration.
  def initialize #:nodoc:
    @pre_processing_rules   = self.class.pre_processing_rules.dup
    @post_processing_rules  = self.class.post_processing_rules.dup
    @whitelisted_attributes = self.class.whitelisted_attributes.dup
  end

  # Process a DOM node (or a list of them), converting it to your markup
  # language according to your defined rules. Text nodes contribute their
  # HTML representation, element nodes the result of the rule for their tag,
  # and anything else an empty string. Returns the concatenated String.
  #
  # When a constant named ALLOWED_TAGS is visible from here and is not empty,
  # elements whose tag is not listed in it contribute nothing.
  def process(nodes)
    Array(nodes).map do |node|
      if node.text?
        node.to_html
      elsif node.elem?
        apply_rule(node) if ! defined?(ALLOWED_TAGS) || ALLOWED_TAGS.empty? || ALLOWED_TAGS.include?(node.name)
      else
        ""
      end
    end.join("")
  end

  # Run the pre-processing rules on +node+, convert its children and finally
  # apply the post-processing rules to the resulting text, which is returned.
  def process!(node) #:nodoc:
    pre_processing_rules.each do |selector, handler|
      node.search(selector).each(&handler)
    end

    text = process(node.children)
    post_processing_rules.each do |rule, handler|
      handler.is_a?(String) ? text.gsub!(rule, handler) : text.gsub!(rule, &handler)
    end
    text
  end

  # Get the result of parsing the contents of a node.
  def content_of(node)
    process(node.respond_to?(:children) ? node.children : node)
  end

  # Helper method that tells you if the given DOM node is immediately
  # surrounded by whitespace. The result is truthy when the text right before
  # the node ends with whitespace or the text right after it starts with it.
  def surrounded_by_whitespace?(node)
    (node.previous && node.previous.text? && node.previous.to_s =~ /\s+$/) ||
      (node.next && node.next.text? && node.next.to_s =~ /^\s+/)
  end

  # Helper to determine whether a node contains a whole word, i.e. whether it
  # is delimited by whitespace (or by nothing at all) on both sides. Useful to
  # detect, for example, a single italic letter inside a word.
  def complete_word?(node)
    before, after = node.previous_node, node.next_node

    return true if !before && !after

    if before.respond_to?(:content)
      return false if before.content !~ /\s$/
    elsif before.respond_to?(:inner_html)
      return false if before.inner_html !~ /\s$/
    end

    if after.respond_to?(:content)
      return false if after.content !~ /^\s/
    elsif after.respond_to?(:inner_html)
      return false if after.inner_html !~ /^\s/
    end
    true
  end

  # Hash of attributes, according to the white list. By default, no attributes
  # are whitelisted, so you must set which ones to whitelist on each grammar.
  #
  # Supposing you set <tt>:id</tt> and <tt>:class</tt> as your
  # <tt>whitelisted_attributes</tt>, and you have a node representing this
  # HTML:
  #
  #     <p lang="en" class="greeting">Hello World</p>
  #
  # Then the method would return:
  #
  #     { :class => "greeting" }
  #
  # You can override this method in each grammar and call +super+ if you
  # will represent your attributes consistently across all nodes (for
  # example, +Textile+ always shows class and id inside parentheses.)
  def attributes(node)
    node.attributes.to_hash.inject({}) do |attrs,(key,value)|
      attrs[key.to_sym] = value if whitelisted_attributes.include?(key.to_sym)
      attrs
    end
  end

  private

  # Invoke the rule for the tag of +node+. Only public methods count as
  # rules; every other tag goes to the default rule. This keeps tags such as
  # <p> or <select>, when they have no rule of their own, from being
  # dispatched to the private Kernel methods of the same name.
  def apply_rule(node)
    tag = node.name.to_sym
    respond_to?(tag) ? send(tag, node) : method_missing(tag, node)
  end

  # Default rule for tags without a rule of their own: drop the tag and keep
  # its converted contents. Calls that do not carry a node with children are
  # not tag dispatches and are reported as ordinary missing methods.
  def method_missing(tag, node = nil, *args) #:nodoc:
    return super unless node.respond_to?(:children)
    process(node.children)
  end
end
188
+ end