RubyGems - suung-undress - Versions diffs - 0.2.5 - Mend

suung-undress 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

data/.gitignore ADDED

@@ -0,0 +1,3 @@
+doc
+dist
+tmp

data/CHANGELOG ADDED

@@ -0,0 +1,23 @@
+* Fix spaces and <br> inside table cells
+0.2.3
+* Add Hpricot style extension
+* Move hpricot extensions from undress.rb to a single file
+* Convert single formatted letters inside a word to Textile
+0.2.2
+* Adding a \n after [[toc]] in greencloth parser
+0.2.1
+* <span> with more than one styling bold|italic|underline|line-through
+* <p> styling for italics, underlines, line-throughs and bold is converted.
+0.2
+* Fix on removing new lines, tabs and spaces on element and tag nodes.
+* xhtmlize! method added with:
+** Invalid nested <ul> and <ol> lists are converted.
+** <span> styling for italics, underlines, line-throughs and bold is converted.

data/LICENSE ADDED

@@ -0,0 +1,22 @@
+(The MIT License)
+Copyright (c) 2009 Nicolas Sanguinetti, entp.com
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+'Software'), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.rdoc ADDED

@@ -0,0 +1,42 @@
+= Undress
+Easily convert HTML back to Textile or Greencloth.
+    require "undress/textile"
+    code =<<html
+      <h1>Hello world!</h1>
+      <p><strong>Hey!</strong> How is it going?</p>
+      <h2>Supported Markup Languages so far:</h2>
+      <ul>
+        <li>Textile</li>
+        <li>Greencloth</li>
+      </ul>
+    html
+    Undress(code).to_textile
+Will produce
+    h1. Hello world!
+    *Hey!* How is it going?
+    h2. Supported Markup Languages so far:
+    * Textile
+    * Greencloth
+== Supported Markup Languages
+* Textile
+* Greencloth, see [http://we.riseup.net]
+== Get it
+    gem install zevarito-undress
+== License
+Authors:: Nicolas Sanguinetti (foca[http://github.com/foca]), Alvaro Gil (zevarito[http://github.com/zevarito])
+License:: MIT (Check LICENSE for details)

data/Rakefile ADDED

@@ -0,0 +1,32 @@
+require "rake/testtask"
+begin
+  require "hanna/rdoctask"
+rescue LoadError
+  require "rake/rdoctask"
+end
+Rake::RDocTask.new do |rd|
+  rd.main = "README"
+  rd.title = "API Documentation for Undress"
+  rd.rdoc_files.include("README.rdoc", "LICENSE", "lib/**/*.rb")
+  rd.rdoc_dir = "doc"
+end
+begin
+  require "metric_fu"
+rescue LoadError
+end
+begin
+  require "mg"
+  MG.new("undress.gemspec")
+rescue LoadError
+end
+desc "Default: run tests"
+task :default => :test
+Rake::TestTask.new do |t|
+  t.test_files = FileList["test/test_*.rb"]
+end

data/lib/core_ext/object.rb ADDED

@@ -0,0 +1,6 @@
+class Object #:nodoc:
+  def tap
+    yield self
+    self
+  end
+end

data/lib/hpricot_ext.rb ADDED

@@ -0,0 +1,88 @@
+require "hpricot"
+module ::Hpricot #:nodoc:
+  module Elem::Trav
+    def set_style(name, value)
+      styles[name.to_s] = value.fast_xs
+    end
+    def del_style(name)
+      styles.delete(name)
+    end
+    def has_style?(name)
+      styles.has_style?(name)
+    end
+    def get_style(name)
+      styles[name]
+    end
+  end
+  class Styles
+    def initialize e
+      @element = e
+    end
+    def delete(key)
+      p = properties.dup
+      if p.delete key
+        @element.set_attribute("style", "#{p.map {|pty,val| "#{pty}:#{val}"}.join(";")}")
+      end
+    end
+    def [] key
+      properties[key]
+    end
+    def []= k, v
+      s = properties.map {|pty,val| "#{pty}:#{val}"}.join(";")
+      @element.set_attribute("style", "#{s.chomp(";")};#{k}:#{v}".sub(/^\;/, ""))
+    end
+    def empty?
+      return true if properties.size == 0
+    end
+    def has_style?(key)
+      properties.has_key?(key)
+    end
+    def to_s
+      properties.to_s
+    end
+    def to_h
+      properties
+    end
+    def properties
+      return {} if not @element.has_attribute?("style")
+      @element.get_attribute("style").split(";").inject({}) do |hash,v|
+        v = v.split(":")
+        hash.update v.first.strip => v.last.strip
+      end
+    end
+  end
+  class Elem #:nodoc:
+    def ancestors
+      node, ancestors = parent, Elements[]
+      while node.respond_to?(:parent) && node.parent
+        ancestors << node
+        node = node.parent
+      end
+      ancestors
+    end
+    def change_tag!(new_tag, preserve_attr = true)
+      return if not etag
+      self.name = new_tag
+      attributes.each {|k,v| remove_attribute(k)} if not preserve_attr
+    end
+    def styles
+      Styles.new self
+    end
+  end
+end

data/lib/undress.rb ADDED

@@ -0,0 +1,118 @@
+require File.expand_path(File.dirname(__FILE__) + "/hpricot_ext")
+require File.expand_path(File.dirname(__FILE__) + "/core_ext/object")
+require File.expand_path(File.dirname(__FILE__) + "/undress/grammar")
+# Load an HTML document so you can undress it. Pass it either a string or an IO
+# object. You can pass an optional hash of options, which will be forwarded
+# straight to Hpricot. Check its
+# documentation[http://code.whytheluckystiff.net/doc/hpricot] for details.
+#
+# Returns the Undress::Document built from +html+ and +options+.
+def Undress(html, options={})
+  Undress::Document.new(html, options)
+end
+module Undress
+  # Tag names treated as inline elements when whitespace-only text nodes are
+  # cleaned up (see Document#cleanup_indentation).
+  INLINE_ELEMENTS = ['span', 'b', 'strong', 'i', 'em', 'ins', 'del','strike', 'abbr', 'acronym', 'cite', 'code', 'label', 'sub', 'sup']
+  # Register a markup language. The name will become the method used to convert
+  # HTML to this markup language: for example registering the name +:textile+
+  # gives you <tt>Undress(code).to_textile</tt>, registering +:markdown+ would
+  # give you <tt>Undress(code).to_markdown</tt>, etc.
+  def self.add_markup(name, grammar)
+    Document.add_markup(name, grammar)
+  end
+  # Wraps the parsed HTML. The document is normalized once, on creation, and is
+  # then handed to whichever grammar has been registered through add_markup.
+  class Document #:nodoc:
+    # Parse +html+ with Hpricot (forwarding +options+), then normalize the
+    # markup and the whitespace of the resulting tree.
+    def initialize(html, options)
+      @doc = Hpricot(html, options)
+      xhtmlize!
+      cleanup_indentation
+    end
+    # Define an instance method <tt>to_<name></tt> that runs +grammar+ over the
+    # parsed document.
+    def self.add_markup(name, grammar)
+      define_method "to_#{name}" do
+        grammar.process!(@doc)
+      end
+    end
+    private
+    # Fix the elements that are not written as standard XHTML and, more
+    # importantly, that cannot be parsed correctly without being fixed first.
+    def xhtmlize!
+      # NOTE(review): `list.parent != "li"` compares an element with a String,
+      # so it looks like it is always true and only the ul/ol parent test
+      # filters. Lists nested inside <li> therefore still reach fixup_list;
+      # confirm before "correcting" it to compare the parent's name.
+      (@doc/"ul|ol").each   {|list| fixup_list(list) if list.parent != "li" && list.parent.name !~ /ul|ol/}
+      (@doc/"p|span").each  {|e| fixup_span_with_styles(e)}
+      (@doc/"strike").each  {|e| e.change_tag! "del"}
+      (@doc/"u").each       {|e| e.change_tag! "ins"}
+      (@doc/"td|th").each   {|e| fixup_cells(e)}
+    end
+    # Remove tabs and newlines and collapse runs of whitespace into a single
+    # space, except inside elements whose name matches pre or code.
+    def cleanup_indentation
+      (@doc/"*").each do |e|
+        if e.elem? && e.inner_html != "" && e.name !~ (/pre|code/) && e.children.size == 0
+          e.inner_html = e.inner_html.gsub(/\n|\t/,"").gsub(/\s+/," ")
+        elsif e.text? && e.parent.name !~ /pre|code/
+          e.content = e.content.gsub(/\n|\t/,"").gsub(/\s+/," ")
+          # Whitespace-only text is emptied unless the node that follows is an
+          # inline element.
+          e.content = e.content.gsub(/^\s+$/, "") if e.next_node && ! INLINE_ELEMENTS.include?(e.next_node.name)
+        end
+      end
+    end
+    # Elements such as <span> are used by WYSIWYG editors to represent bold,
+    # italic, underlined or struck-through text through inline styles. Each
+    # such style is turned into the matching tag wrapped around the content and
+    # removed from the style attribute; a <span> left without any style is
+    # replaced by its content.
+    def fixup_span_with_styles(e)
+      return if !e.has_attribute?("style")
+      if e.get_style("font-style") == "italic"
+        e.inner_html = "<em>#{e.inner_html}</em>"
+        e.del_style("font-style")
+      end
+      if e.get_style("text-decoration") == "underline"
+        e.inner_html = "<ins>#{e.inner_html}</ins>"
+        e.del_style("text-decoration")
+      end
+      if e.get_style("text-decoration") == "line-through"
+        e.inner_html = "<del>#{e.inner_html}</del>"
+        e.del_style("text-decoration")
+      end
+      if e.get_style("font-weight") == "bold"
+        e.inner_html = "<strong>#{e.inner_html}</strong>"
+        e.del_style("font-weight")
+      end
+      e.swap e.inner_html if e.styles.empty? && e.name == "span"
+    end
+    # Fix up a badly nested list, such as a <ul> that is a sibling of an <li>
+    # instead of being inside it. Lists that are direct children of +list+ are
+    # fixed first.
+    def fixup_list(list)
+      list.children.each {|e| fixup_list(e) if e.elem? && e.name =~ /ol|ul/}
+      if list.parent.name != "li"
+        # When both neighbours are <li>, the second assignment wins, so the
+        # previous sibling is the one that receives the list.
+        li_side = list.next_sibling     if list.next_sibling     && list.next_sibling.name     == "li"
+        li_side = list.previous_sibling if list.previous_sibling && list.previous_sibling.name == "li"
+        if li_side
+          li_side.inner_html = "#{li_side.inner_html}#{list.to_html}"
+          list.parent.replace_child(list, "")
+        end
+      end
+    end
+    # Spaces between td and th elements break Textile formatting and <br> is
+    # not allowed inside cells: remove every <br>, empty the text nodes right
+    # next to the cell, turn &nbsp; into a plain space and strip the content.
+    def fixup_cells(e)
+      e.search("br").remove
+      e.next_node.content = "" if e.next_node && e.next_node.text?
+      e.previous_node.content = "" if e.previous_node && e.previous_node.text?
+      content = e.inner_html.gsub(/\&nbsp\;/,"\s").strip
+      e.inner_html = content == "" ? [] : content
+    end
+  end
+end

data/lib/undress/grammar.rb ADDED

@@ -0,0 +1,188 @@
+module Undress
+  # Grammars give you a DSL to declare how to convert an HTML document into a
+  # different markup language.
+  class Grammar
+    def self.inherited(base) # :nodoc:
+      base.instance_variable_set(:@post_processing_rules, post_processing_rules)
+      base.instance_variable_set(:@pre_processing_rules, pre_processing_rules)
+    end
+    # Add a parsing rule for a group of html tags.
+    #
+    #     rule_for :p do |element|
+    #       "<this was a paragraph>#{content_of(element)}</this was a paragraph>"
+    #     end
+    #
+    # will replace your <tt><p></tt> tags for <tt><this was a paragraph></tt>
+    # tags, without altering the contents.
+    #
+    # The element yielded to the block is an Hpricot element for the given tag.
+    def self.rule_for(*tags, &handler) # :yields: element
+      tags.each do |tag|
+        define_method tag.to_sym, &handler
+      end
+    end
+    # Set a default rule for unrecognized tags.
+    #
+    # Unless you define a special case, it will ignore the tags and just output
+    # the contents of unrecognized tags.
+    def self.default(&handler) # :yields: element
+      define_method :method_missing do |tag, node, *args|
+        handler.call(node)
+      end
+    end
+    # Add a post-processing rule to your parser.
+    #
+    # This takes a regular expression that will be applied to the output after
+    # processing any nodes. It can take a string as a replacement, or a block
+    # that will be passed to String#gsub.
+    #
+    #     post_processing(/\n\n+/, "\n\n") # compress more than two newlines
+    #     post_processing(/whatever/) { ... }
+    def self.post_processing(regexp, replacement = nil, &handler) #:yields: matched_string
+      post_processing_rules[regexp] = replacement || handler
+    end
+    # Add a pre-processing rule to your parser.
+    #
+    # This lets you mutate the DOM before applying any rule defined with
+    # +rule_for+. You need to pass a CSS/XPath selector, and a block that
+    # takes an Hpricot element to parse it.
+    #
+    #     pre_processing "ul.toc" do |element|
+    #       element.swap("<p>[[toc]]</p>")
+    #     end
+    #
+    # Would replace any unordered lists with the class +toc+ for a
+    # paragraph containing the code <tt>[[toc]]</tt>.
+    def self.pre_processing(selector, &handler) # :yields: element
+      pre_processing_rules[selector] = handler
+    end
+    # Set a list of attributes you wish to whitelist
+    #
+    # Any attribute not in this list at the moment of parsing will be ignored by the
+    # parser. The method Grammar#attributes(node) will return a hash of the filtered
+    # attributes. Read its documentation for more details.
+    #
+    #     whitelist_attributes :id, :class, :lang
+    def self.whitelist_attributes(*attrs)
+      @whitelisted_attributes = attrs
+    end
+    def self.whitelisted_attributes #:nodoc:
+      @whitelisted_attributes || []
+    end
+    def self.post_processing_rules #:nodoc:
+      @post_processing_rules ||= {}
+    end
+    def self.pre_processing_rules #:nodoc:
+      @pre_processing_rules ||= {}
+    end
+    def self.process!(node) #:nodoc:
+      new.process!(node)
+    end
+    attr_reader :pre_processing_rules #:nodoc:
+    attr_reader :post_processing_rules #:nodoc:
+    attr_reader :whitelisted_attributes #:nodoc:
+    def initialize #:nodoc:
+      @pre_processing_rules = self.class.pre_processing_rules.dup
+      @post_processing_rules = self.class.post_processing_rules.dup
+      @whitelisted_attributes = self.class.whitelisted_attributes.dup
+    end
+    # Process a DOM node, converting it to your markup language according to
+    # your defined rules. If the node is a Text node, it will return it's
+    # string representation. Otherwise it will call the rule defined for it.
+    def process(nodes)
+      Array(nodes).map do |node|
+        if node.text?
+          node.to_html
+        elsif node.elem?
+          send node.name.to_sym, node if ! defined?(ALLOWED_TAGS) || ALLOWED_TAGS.empty? || ALLOWED_TAGS.include?(node.name)
+        else
+          ""
+        end
+      end.join("")
+    end
+    def process!(node) #:nodoc:
+      pre_processing_rules.each do |selector, handler|
+        node.search(selector).each(&handler)
+      end
+      process(node.children).tap do |text|
+        post_processing_rules.each do |rule, handler|
+          handler.is_a?(String) ?  text.gsub!(rule, handler) : text.gsub!(rule, &handler)
+        end
+      end
+    end
+    # Get the result of parsing the contents of a node.
+    def content_of(node)
+      process(node.respond_to?(:children) ? node.children : node)
+    end
+    # Helper method that tells you if the given DOM node is immediately
+    # surrounded by whitespace.
+    def surrounded_by_whitespace?(node)
+      (node.previous && node.previous.text? && node.previous.to_s =~ /\s+$/) ||
+        (node.next && node.next.text? && node.next.to_s =~ /^\s+/)
+    end
+    # Helper to determine if a node contents a whole word
+    # useful to convert for example a letter italic inside a word
+    def complete_word?(node)
+      p, n = node.previous_node, node.next_node
+      return true if !p && !n
+      if p.respond_to?(:content)
+        return false if p.content       !~ /\s$/
+      elsif p.respond_to?(:inner_html)
+        return false if p.inner_html    !~ /\s$/
+      end
+      if n.respond_to?(:content)
+        return false if n.content       !~ /^\s/
+      elsif n.respond_to?(:inner_html)
+        return false if n.inner_html    !~ /^\s/
+      end
+      true
+    end
+    # Hash of attributes, according to the white list. By default, no attributes
+    # are whitelisted, so you must set which ones to whitelist on each grammar.
+    #
+    # Supposing you set <tt>:id</tt> and <tt>:class</tt> as your
+    # <tt>whitelisted_attributes</tt>, and you have a node representing this
+    # HTML:
+    #
+    #     <p lang="en" class="greeting">Hello World</p>
+    #
+    # Then the method would return:
+    #
+    #     { :class => "greeting" }
+    #
+    # You can override this method in each grammar and call +super+ if you
+    # will represent your attributes consistently across all nodes (for
+    # example, +Textile+ always shows class an id inside parenthesis.)
+    def attributes(node)
+      node.attributes.to_hash.inject({}) do |attrs,(key,value)|
+        attrs[key.to_sym] = value if whitelisted_attributes.include?(key.to_sym)
+        attrs
+      end
+    end
+    def method_missing(tag, node, *args) #:nodoc:
+      process(node.children)
+    end
+  end
+end