suung-undress 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ doc
2
+ dist
3
+ tmp
@@ -0,0 +1,23 @@
1
+ * Fix spaces and <br> inside table cells
2
+
3
+ 0.2.3
4
+
5
+ * Add Hpricot style extension
6
+ * Move hpricot extensions from undress.rb to a single file
7
+ * Convert to Textile single formatted letters inside a word
8
+
9
+ 0.2.2
10
+
11
+ * Adding a \n after [[toc]] in greencloth parser
12
+
13
+ 0.2.1
14
+
15
+ * <span> with more than one styling bold|italic|underline|line-through
16
+ * <p> styling Italics, Underlines, Line-throughs and Bold are converted.
17
+
18
+ 0.2
19
+
20
+ * Fix on removing new lines, tabs and spaces on element and tag nodes.
21
+ * xhtmlize! method added with:
22
+ ** Invalid nested <ul> and <ol> lists are converted.
23
+ ** <span> styling Italics, Underlines, Line-throughs and Bold are converted.
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ (The MIT License)
2
+
3
+ Copyright (c) 2009 Nicolas Sanguinetti, entp.com
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ 'Software'), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,42 @@
1
+ = Undress
2
+
3
+ Easily convert HTML back to Textile or Greencloth.
4
+
5
+ require "undress/textile"
6
+
7
+ code =<<html
8
+ <h1>Hello world!</h1>
9
+ <p><strong>Hey!</strong> How is it going?</p>
10
+ <h2>Supported Markup Languages so far:</h2>
11
+ <ul>
12
+ <li>Textile</li>
13
+ <li>Greencloth</li>
14
+ </ul>
15
+ html
16
+
17
+ Undress(code).to_textile
18
+
19
+ Will produce
20
+
21
+ h1. Hello world!
22
+
23
+ *Hey!* How is it going?
24
+
25
+ h2. Supported Markup Languages so far:
26
+
27
+ * Textile
28
+ * Greencloth
29
+
30
+ == Supported Markup Languages
31
+
32
+ * Textile
33
+ * Greencloth, see [http://we.riseup.net]
34
+
35
+ == Get it
36
+
37
+ gem install zevarito-undress
38
+
39
+ == License
40
+
41
+ Authors:: Nicolas Sanguinetti (foca[http://github.com/foca]), Alvaro Gil (zevarito[http://github.com/zevarito])
42
+ License:: MIT (Check LICENSE for details)
@@ -0,0 +1,32 @@
1
require "rake/testtask"

# Use hanna's RDoc task when it is installed (presumably a themed drop-in for
# rake's own task -- confirm); otherwise fall back to the one shipped with rake.
begin
  require "hanna/rdoctask"
rescue LoadError
  require "rake/rdoctask"
end

Rake::RDocTask.new do |rd|
  # The main page must name a file that is part of the documented set, and
  # the file included below is "README.rdoc", not "README".
  rd.main = "README.rdoc"
  rd.title = "API Documentation for Undress"
  rd.rdoc_files.include("README.rdoc", "LICENSE", "lib/**/*.rb")
  rd.rdoc_dir = "doc"
end

begin
  require "metric_fu"
rescue LoadError
  # metric_fu is optional: its tasks are simply unavailable without it.
end

begin
  require "mg"
  MG.new("undress.gemspec")
rescue LoadError
  # mg is optional: the tasks it would set up from the gemspec are skipped
  # when it is missing.
end

desc "Default: run tests"
task :default => :test

Rake::TestTask.new do |t|
  t.test_files = FileList["test/test_*.rb"]
end
@@ -0,0 +1,6 @@
1
class Object #:nodoc:
  # Backport of +tap+ for interpreters that do not provide it (it is built in
  # from Ruby 1.8.7 on). Yields the receiver to the block and then returns
  # the receiver, whatever the block evaluates to.
  #
  # The definition is skipped when +tap+ already exists so the interpreter's
  # own implementation is never overridden.
  unless method_defined?(:tap)
    def tap
      yield self
      self
    end
  end
end
@@ -0,0 +1,88 @@
1
+ require "hpricot"
2
+
3
+ module ::Hpricot #:nodoc:
4
# Convenience accessors for the CSS declarations held in an element's inline
# +style+ attribute. Every method delegates to the object returned by
# +styles+ and accepts the property name as a String or a Symbol.
module Elem::Trav
  # Sets the property +name+ to +value+. The value is passed through
  # +fast_xs+ before being stored (presumably XML escaping -- confirm).
  def set_style(name, value)
    styles[name.to_s] = value.fast_xs
  end

  # Removes the property +name+ from the inline style.
  def del_style(name)
    styles.delete(name.to_s)
  end

  # Tells whether the inline style declares the property +name+.
  def has_style?(name)
    styles.has_style?(name.to_s)
  end

  # Returns the value of the property +name+, or +nil+ when it is not set.
  def get_style(name)
    styles[name.to_s]
  end
end
21
+
22
# Read/write view over the CSS declarations stored in an element's inline
# +style+ attribute.
#
# The wrapped element only needs to respond to +has_attribute?+,
# +get_attribute+ and +set_attribute+. Nothing is cached: every read parses
# the attribute again and every write serialises the declarations back into
# it as <tt>name:value</tt> pairs joined by ";".
class Styles
  def initialize(e)
    @element = e
  end

  # Removes the property +key+ and rewrites the attribute. Does nothing (and
  # returns +nil+) when the property is not present.
  def delete(key)
    remaining = properties
    write(remaining) if remaining.delete(key.to_s)
  end

  # Value of the property +key+, or +nil+ when it is not declared.
  def [](key)
    properties[key.to_s]
  end

  # Sets the property +k+ to +v+, replacing any previous value so the
  # attribute never carries the same property twice.
  def []=(k, v)
    updated = properties
    updated[k.to_s] = v
    write(updated)
  end

  # +true+ when no property is declared, +false+ otherwise.
  def empty?
    properties.empty?
  end

  # Tells whether the property +key+ is declared.
  def has_style?(key)
    properties.has_key?(key.to_s)
  end

  def to_s
    properties.to_s
  end

  def to_h
    properties
  end

  # Parses the +style+ attribute into a Hash of property name => value, both
  # stripped of surrounding whitespace. Returns an empty Hash when the
  # element has no +style+ attribute.
  def properties
    return {} unless @element.has_attribute?("style")

    @element.get_attribute("style").split(";").inject({}) do |hash, declaration|
      # Split on the first colon only, so values that contain colons
      # themselves (e.g. "url(http://...)") are kept whole.
      name, value = declaration.split(":", 2)
      # Blank or malformed declarations (";;", trailing whitespace, no colon)
      # are skipped instead of raising or producing bogus entries.
      hash[name.strip] = value.strip unless value.nil? || name.strip.empty?
      hash
    end
  end

  private

  # Serialises +props+ back into the element's +style+ attribute.
  def write(props)
    @element.set_attribute("style", props.map { |pty, val| "#{pty}:#{val}" }.join(";"))
  end
end
67
+
68
class Elem #:nodoc:
  # Collects this element's ancestors, nearest first. The walk stops at the
  # first node that has no parent of its own, so the topmost node is not
  # part of the result.
  def ancestors
    node, ancestors = parent, Elements[]
    while node.respond_to?(:parent) && node.parent
      ancestors << node
      node = node.parent
    end
    ancestors
  end

  # Renames the element to +new_tag+. Elements without an +etag+ are left
  # untouched. When +preserve_attr+ is false every attribute is removed as
  # well.
  def change_tag!(new_tag, preserve_attr = true)
    return unless etag
    self.name = new_tag
    # Take a snapshot of the attribute names first: removing attributes while
    # iterating the live collection is fragile.
    attributes.to_hash.keys.each { |k| remove_attribute(k) } unless preserve_attr
  end

  # Accessor for the declarations in this element's inline +style+ attribute.
  def styles
    Styles.new self
  end
end
88
+ end
@@ -0,0 +1,118 @@
1
+ require File.expand_path(File.dirname(__FILE__) + "/hpricot_ext")
2
+ require File.expand_path(File.dirname(__FILE__) + "/core_ext/object")
3
+ require File.expand_path(File.dirname(__FILE__) + "/undress/grammar")
4
+
5
# Load an HTML document so you can undress it. Pass it either a string or an IO
# object. You can pass an optional hash of options, which will be forwarded
# straight to Hpricot. Check its
# documentation[http://code.whytheluckystiff.net/doc/hpricot] for details.
#
# Returns an Undress::Document wrapping the parsed HTML.
def Undress(html, options = {})
  Undress::Document.new(html, options)
end

module Undress

  INLINE_ELEMENTS = ['span', 'b', 'strong', 'i', 'em', 'ins', 'del', 'strike', 'abbr', 'acronym', 'cite', 'code', 'label', 'sub', 'sup']

  # Register a markup language. The name will become the method used to convert
  # HTML to this markup language: for example registering the name +:textile+
  # gives you <tt>Undress(code).to_textile</tt>, registering +:markdown+ would
  # give you <tt>Undress(code).to_markdown</tt>, etc.
  def self.add_markup(name, grammar)
    Document.add_markup(name, grammar)
  end

  class Document #:nodoc:
    # Inline CSS declarations that stand for a semantic tag, as
    # [property, value, tag]. They are applied in this order, so a later
    # entry wraps the markup produced by an earlier one.
    STYLE_TAGS = [
      ["font-style",      "italic",       "em"],
      ["text-decoration", "underline",    "ins"],
      ["text-decoration", "line-through", "del"],
      ["font-weight",     "bold",         "strong"]
    ]

    # Parses +html+ with Hpricot (forwarding +options+) and normalises the
    # resulting tree so the grammars can process it.
    def initialize(html, options)
      @doc = Hpricot(html, options)
      xhtmlize!
      cleanup_indentation
    end

    # Defines <tt>to_<name></tt>, which hands the parsed document to
    # <tt>grammar.process!</tt> and returns its result.
    def self.add_markup(name, grammar)
      define_method "to_#{name}" do
        grammar.process!(@doc)
      end
    end

    private

    # Fix up markup that is not valid XHTML and that could not be converted
    # correctly without being repaired first.
    def xhtmlize!
      # The original condition also compared the parent node itself with the
      # String "li". A node is never equal to a String (assuming Hpricot does
      # not override equality), so that check had no effect and is dropped.
      # Lists inside an <li> are therefore still visited, as before:
      # fixup_list leaves them in place but descends into lists nested
      # directly inside them.
      (@doc/"ul|ol").each { |list| fixup_list(list) if list.parent.name !~ /ul|ol/ }
      (@doc/"p|span").each { |e| fixup_span_with_styles(e) }
      (@doc/"strike").each { |e| e.change_tag! "del" }
      (@doc/"u").each { |e| e.change_tag! "ins" }
      (@doc/"td|th").each { |e| fixup_cells(e) }
    end

    # Delete tabs and newlines, and collapse runs of whitespace into a single
    # space, everywhere except inside <pre> or <code> elements.
    def cleanup_indentation
      (@doc/"*").each do |e|
        if e.elem? && e.inner_html != "" && e.name !~ /pre|code/ && e.children.size == 0
          e.inner_html = squeeze_whitespace(e.inner_html)
        elsif e.text? && e.parent.name !~ /pre|code/
          e.content = squeeze_whitespace(e.content)
          # Whitespace-only text is dropped when the next node is not an
          # inline element.
          e.content = e.content.gsub(/^\s+$/, "") if e.next_node && !INLINE_ELEMENTS.include?(e.next_node.name)
        end
      end
    end

    # Removes newlines and tabs from +text+ and collapses the remaining
    # whitespace runs into single spaces.
    def squeeze_whitespace(text)
      text.gsub(/\n|\t/, "").gsub(/\s+/, " ")
    end

    # WYSIWYG editors often express bold, italic, underline and line-through
    # through inline styles on <span> or <p>. Each such declaration is
    # replaced by the equivalent semantic tag wrapped around the element's
    # content, and a <span> left without any style is replaced by its content.
    def fixup_span_with_styles(e)
      return unless e.has_attribute?("style")

      STYLE_TAGS.each do |property, value, tag|
        next unless e.get_style(property) == value
        e.inner_html = "<#{tag}>#{e.inner_html}</#{tag}>"
        e.del_style(property)
      end

      e.swap e.inner_html if e.styles.empty? && e.name == "span"
    end

    # Fixup a badly nested list, such as a <ul> that is a sibling of an <li>
    # instead of being inside it: the list is moved to the end of the
    # neighbouring <li>, preferring the previous sibling over the next one.
    def fixup_list(list)
      list.children.each { |e| fixup_list(e) if e.elem? && e.name =~ /ol|ul/ }
      return if list.parent.name == "li"

      li_side = list.next_sibling if list.next_sibling && list.next_sibling.name == "li"
      li_side = list.previous_sibling if list.previous_sibling && list.previous_sibling.name == "li"
      return unless li_side

      li_side.inner_html = "#{li_side.inner_html}#{list.to_html}"
      list.parent.replace_child(list, "")
    end

    # Spaces between <td> and <th> elements break Textile formatting and <br>
    # is not allowed inside cells, so: remove every <br>, blank the text
    # nodes right next to the cell, turn &nbsp; into plain spaces and strip
    # the cell content.
    def fixup_cells(e)
      e.search("br").remove
      e.next_node.content = "" if e.next_node && e.next_node.text?
      e.previous_node.content = "" if e.previous_node && e.previous_node.text?
      content = e.inner_html.gsub(/&nbsp;/, " ").strip
      e.inner_html = content == "" ? [] : content
    end
  end
end
@@ -0,0 +1,188 @@
1
module Undress
  # Grammars give you a DSL to declare how to convert an HTML document into a
  # different markup language.
  class Grammar
    # Gives every subclass its own copy of the rules declared so far, so that
    # rules a subclass adds later do not leak back into its parent or into
    # sibling grammars. Rules added to a parent after a subclass has been
    # defined are not propagated to that subclass.
    def self.inherited(base) # :nodoc:
      super
      base.instance_variable_set(:@post_processing_rules, post_processing_rules.dup)
      base.instance_variable_set(:@pre_processing_rules, pre_processing_rules.dup)
    end

    # Add a parsing rule for a group of html tags.
    #
    #     rule_for :p do |element|
    #       "<this was a paragraph>#{content_of(element)}</this was a paragraph>"
    #     end
    #
    # will replace your <tt><p></tt> tags for <tt><this was a paragraph></tt>
    # tags, without altering the contents.
    #
    # The element yielded to the block is an Hpricot element for the given tag.
    def self.rule_for(*tags, &handler) # :yields: element
      tags.each do |tag|
        define_method tag.to_sym, &handler
      end
    end

    # Set a default rule for unrecognized tags.
    #
    # Unless you define a special case, it will ignore the tags and just output
    # the contents of unrecognized tags.
    #
    # Note that the handler is invoked with +call+, so inside it +self+ is
    # whatever it was where the block was written, not the grammar instance.
    def self.default(&handler) # :yields: element
      define_method :method_missing do |tag, node, *args|
        handler.call(node)
      end
    end

    # Add a post-processing rule to your parser.
    #
    # This takes a regular expression that will be applied to the output after
    # processing any nodes. It can take a string as a replacement, or a block
    # that will be passed to String#gsub.
    #
    #     post_processing(/\n\n+/, "\n\n") # compress more than two newlines
    #     post_processing(/whatever/) { ... }
    def self.post_processing(regexp, replacement = nil, &handler) #:yields: matched_string
      post_processing_rules[regexp] = replacement || handler
    end

    # Add a pre-processing rule to your parser.
    #
    # This lets you mutate the DOM before applying any rule defined with
    # +rule_for+. You need to pass a CSS/XPath selector, and a block that
    # takes an Hpricot element to parse it.
    #
    #     pre_processing "ul.toc" do |element|
    #       element.swap("<p>[[toc]]</p>")
    #     end
    #
    # Would replace any unordered lists with the class +toc+ for a
    # paragraph containing the code <tt>[[toc]]</tt>.
    def self.pre_processing(selector, &handler) # :yields: element
      pre_processing_rules[selector] = handler
    end

    # Set a list of attributes you wish to whitelist
    #
    # Any attribute not in this list at the moment of parsing will be ignored by the
    # parser. The method Grammar#attributes(node) will return a hash of the filtered
    # attributes. Read its documentation for more details.
    #
    #     whitelist_attributes :id, :class, :lang
    def self.whitelist_attributes(*attrs)
      @whitelisted_attributes = attrs
    end

    def self.whitelisted_attributes #:nodoc:
      @whitelisted_attributes || []
    end

    def self.post_processing_rules #:nodoc:
      @post_processing_rules ||= {}
    end

    def self.pre_processing_rules #:nodoc:
      @pre_processing_rules ||= {}
    end

    def self.process!(node) #:nodoc:
      new.process!(node)
    end

    attr_reader :pre_processing_rules #:nodoc:
    attr_reader :post_processing_rules #:nodoc:
    attr_reader :whitelisted_attributes #:nodoc:

    def initialize #:nodoc:
      @pre_processing_rules = self.class.pre_processing_rules.dup
      @post_processing_rules = self.class.post_processing_rules.dup
      @whitelisted_attributes = self.class.whitelisted_attributes.dup
    end

    # Process a DOM node, converting it to your markup language according to
    # your defined rules. If the node is a Text node, it will return its
    # string representation. Otherwise it will call the rule defined for it.
    #
    # Nodes that are neither text nor elements contribute nothing to the
    # output, and neither do elements rejected by ALLOWED_TAGS.
    def process(nodes)
      Array(nodes).map do |node|
        if node.text?
          node.to_html
        elsif node.elem?
          # ALLOWED_TAGS is looked up from this class' lexical scope and is
          # not defined in this file (presumably declared elsewhere under
          # Undress -- confirm). When it is missing or empty every tag is
          # accepted.
          #
          # NOTE(review): the tag name comes straight from the document and
          # is dispatched with +send+, so a tag named after any existing
          # method of this object (not only a declared rule) will invoke it.
          send node.name.to_sym, node if !defined?(ALLOWED_TAGS) || ALLOWED_TAGS.empty? || ALLOWED_TAGS.include?(node.name)
        else
          ""
        end
      end.join("")
    end

    # Runs the pre-processing rules against +node+, converts its children and
    # finally applies the post-processing rules to the resulting text.
    def process!(node) #:nodoc:
      pre_processing_rules.each do |selector, handler|
        node.search(selector).each(&handler)
      end

      process(node.children).tap do |text|
        post_processing_rules.each do |rule, handler|
          handler.is_a?(String) ? text.gsub!(rule, handler) : text.gsub!(rule, &handler)
        end
      end
    end

    # Get the result of parsing the contents of a node.
    def content_of(node)
      process(node.respond_to?(:children) ? node.children : node)
    end

    # Helper method that tells you if the given DOM node is immediately
    # surrounded by whitespace.
    def surrounded_by_whitespace?(node)
      (node.previous && node.previous.text? && node.previous.to_s =~ /\s+$/) ||
        (node.next && node.next.text? && node.next.to_s =~ /^\s+/)
    end

    # Helper to determine whether a node contains a whole word, i.e. it is
    # not glued to the text around it. Useful to convert, for example, a
    # single italic letter inside a word.
    def complete_word?(node)
      p, n = node.previous_node, node.next_node

      return true if !p && !n

      if p.respond_to?(:content)
        return false if p.content !~ /\s$/
      elsif p.respond_to?(:inner_html)
        return false if p.inner_html !~ /\s$/
      end

      if n.respond_to?(:content)
        return false if n.content !~ /^\s/
      elsif n.respond_to?(:inner_html)
        return false if n.inner_html !~ /^\s/
      end
      true
    end

    # Hash of attributes, according to the white list. By default, no attributes
    # are whitelisted, so you must set which ones to whitelist on each grammar.
    #
    # Supposing you set <tt>:id</tt> and <tt>:class</tt> as your
    # <tt>whitelisted_attributes</tt>, and you have a node representing this
    # HTML:
    #
    #     <p lang="en" class="greeting">Hello World</p>
    #
    # Then the method would return:
    #
    #     { :class => "greeting" }
    #
    # You can override this method in each grammar and call +super+ if you
    # will represent your attributes consistently across all nodes (for
    # example, +Textile+ always shows class and id inside parentheses.)
    def attributes(node)
      node.attributes.to_hash.inject({}) do |attrs,(key,value)|
        attrs[key.to_sym] = value if whitelisted_attributes.include?(key.to_sym)
        attrs
      end
    end

    # Fallback rule for tags without a rule of their own: the tag is ignored
    # and only its processed contents are output. Calls that do not carry a
    # node are not tag dispatches and are handed to +super+, so a mistyped
    # method name raises the usual NoMethodError.
    def method_missing(tag, node = nil, *args) #:nodoc:
      return super unless node.respond_to?(:children)
      process(node.children)
    end
  end
end