RubyGems - spix_parser - Versions diffs - 1.7.3 → 1.7.5 - Mend

spix_parser 1.7.3 → 1.7.5

This diff compares the contents of two publicly available versions of the package as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.

Files changed (6)

data/lib/spix_parser/parser.rb +48 -39
data/lib/spix_parser/version.rb +1 -1
data/lib/spix_parser/wrappers/entry.rb +2 -2
data/spec/spix_parser/parser_spec.rb +1 -1
data/spec/spix_parser/utils_spec.rb +42 -14
metadata +1 -1

data/lib/spix_parser/parser.rb CHANGED Viewed

@@ -32,10 +32,10 @@ module Spix
     extend self
     def format_links(options)
-      text     = Sanitizer.html_decode options[:text]
-      site_url = Sanitizer.html_decode options[:site_url]
+      text     = options[:text]
+      site_url = options[:site_url]
-      parse_links(text)
+      parse_links(text, site_url)
       parse_images(text, site_url)
       text
@@ -45,66 +45,75 @@ module Spix
     def join_attributes(attrs)
       attrs.map do |attr, value|
-        %Q[#{attr}="#{value.to_s.gsub(/"/, "&quot;")}"] unless value.blank?
-      end.compact.join(" ")
+        %Q[#{attr}="#{value.to_s}"] unless value.blank?
+      end.compact.join(" ").gsub(/"/, "&quot;")
     end
-    def parse_attrs(str)
-      attrs = {}
-      return attrs unless str || str.respond_to?(:scan)
+    def parse_attrs(str, options = {})
+      attrs_to_add = options.delete(:adding)
+      allowed = options.delete(:allowed_attrs)
-      match_by_spaces = str !~ /'|"/
-      if match_by_spaces
-        # Make sure to match the last html attribute.
+      if match_by_spaces = str =~ /'|"|&quot;|&#034;/
+        value_regexp = %r{\s*#{$&}(.*?)#{$&}}
+        encloser = $&.gsub(/'/, '"')
+      else
+        encloser = "\""
         str += " "
         value_regexp = /\s*(.*?)\s/
-      else
-        value_regexp = /\s*["'](.*?)["']/
       end
       attribute_regexp = /\b([a-zA-Z0-9:]+)\s*/
-      str.scan(/#{attribute_regexp}=#{value_regexp}/im) do
-        attrs[$1.to_s.downcase] = $2
-      end
+      finded_attrs = {}
+      result = str.gsub(/#{attribute_regexp}=#{value_regexp}/im) {
+        attribute, value = $1.downcase, $2
+        next if value.blank? or not allowed.include?(attribute.to_sym)
-      attrs
+        parsed_value = yield(attribute, value) || value
+        finded_attrs[attribute.to_sym] = parsed_value
+        %{#{attribute}=#{enclose parsed_value, encloser} }
+      } + " " + (
+        attrs_to_add.to_a - finded_attrs.to_a
+      ).map {|k,v| "#{k}=#{enclose v, encloser}" }.join(" ")
+      return [result.split.join(" "), finded_attrs.keys]
     end
-    def parse_links(text)
-      text.gsub!(/(<a\s+([^>]+)>)/uim) do |match|
-        attrs = parse_attrs($2.to_s)
+    def enclose(str, encloser)
+      "#{encloser}#{str}#{encloser}"
+    end
-        # just parse these attributes
+    def parse_links(text, site_url)
+      allowed = [:href, :title, :target, :rel]
+      text.gsub!(/<a\s+([^>]+)>/uim) do |match|
         attrs = {
-          :href   => attrs["href"],
-          :title  => attrs["title"],
           :target => "_blank",
           :rel    => "external nofollow"
         }
-        "<a #{join_attributes(attrs)}>"
+        parsed_attribute, attrs = parse_attrs($1.to_s, :adding => attrs, :allowed_attrs => allowed ) do |attr, value|
+          parse_relative_source(value, site_url) if attr == "href"
+        end
+        %{<a #{parsed_attribute}>}
       end
     end
     def parse_images(text, site_url)
-      text.gsub!(/(<img(.*?)\/?>)/uim) do |match|
-        attrs = parse_attrs($2.to_s)
-        # just parse these attributes
-        attrs = {
-          :src    => parse_relative_image_source(attrs["src"], site_url),
-          :alt    => attrs["alt"],
-          :title  => attrs["title"],
-          :style  => attrs["style"],
-          :width  => attrs["width"],
-          :height => attrs["height"]
-        }
-        "<img #{join_attributes(attrs)} />" if attrs[:src].present?
+      allowed = [:src, :alt, :title, :style, :width, :height]
+      text.gsub!(/<img(.*?)\/?>/uim) do |match|
+        parsed_attribute, attrs = parse_attrs($1.to_s, :allowed_attrs => allowed) do |attr, value|
+          parse_relative_source(value, site_url) if attr == "src"
+        end
+        %{<img #{parsed_attribute} />} if attrs.include?(:src)
       end
     end
-    def parse_relative_image_source(src, site_url)
+    def parse_relative_source(src, site_url)
+      src = Sanitizer.html_decode(src) if src
+      site_url = Sanitizer.html_decode(site_url) if site_url
       if src.present? && site_url
         begin
           src = URI.parse(src)

data/lib/spix_parser/version.rb CHANGED Viewed

@@ -4,7 +4,7 @@ module Spix
     module Version
       MAJOR = 1
       MINOR = 7
-      TINY  = 3
+      TINY  = 5
       def self.current_version
         "#{MAJOR}.#{MINOR}.#{TINY}"

data/lib/spix_parser/wrappers/entry.rb CHANGED Viewed

@@ -47,8 +47,8 @@ module Spix
       def uid
         uid = self.url || ""
-        uid += self.encoded_raw_content_for :title
-        uid += self.encoded_raw_content[0..25]
+        uid += encoded_raw_content_for :title
+        uid += encoded_raw_content[0..25]
         uid.to_sha1
       end
       memoize(:uid)

data/spec/spix_parser/parser_spec.rb CHANGED Viewed

@@ -30,7 +30,7 @@ describe Spix::Parser do
     it "should parse correctly images with absolute with another domain" do
       feed = Spix::Parser.parse(load_fixture('feed_with_absolute_images_from_another_domain.atom'), :mode => :local)
-      feed.feed_items.first.content[/<img.*src=["'](.*?)["'].*\/>/, 1].should == "http://oglobo.globo.com/fotos/2011/07/06/06_MHB_ballmer.jpg"
+      feed.feed_items.first.content[/<img.*src=&#034;(.*?)&#034;.*\/>/, 1].should == "http://oglobo.globo.com/fotos/2011/07/06/06_MHB_ballmer.jpg"
     end
   end

data/spec/spix_parser/utils_spec.rb CHANGED Viewed

@@ -4,6 +4,21 @@ require 'spec_helper'
 describe Spix::Utils do
   describe ".format_links" do
     context "html containing links" do
+      it "parses link tags with html escaped quote (&quot;) and absolute sources" do
+        input_html = %q[<div><a href=&quot;/foo/bar.html&quot; title=&quot;FooBar!&quot;>FooBar!</a></div>]
+        Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
+          %q[<div><a href=&quot;http://busk.com/foo/bar.html&quot; title=&quot;FooBar!&quot; target=&quot;_blank&quot; rel=&quot;external nofollow&quot;>FooBar!</a></div>]
+      end
+      it "parses link tags with absolute sources" do
+          input_html = %q[<div><a href="/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
+        Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
+           %q[<div><a href="http://busk.com/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
+      end
       it "parsers links in the given html string adding rel and target" do
         input_html = %q[<div><a href="foo/bar.html" title="FooBar!">FooBar!</a></div>]
@@ -21,23 +36,22 @@ describe Spix::Utils do
       it "parses links with simple quotes" do
         input_html = %q[<div><a href='foo/bar.html' title='FooBar!'>FooBar!</a></div>]
-        Spix::Utils.format_links(:text => input_html).should ==
-          %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
+        Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
+          %q[<div><a href="http://busk.com/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
       end
-      # TODO: should we strip these extra &quot; ?
       it "parses links with html escaped quote (&quot;)" do
         input_html = %q[<div><a href=&quot;foo/bar.html&quot; title=&quot;FooBar!&quot;>FooBar!</a></div>]
-        Spix::Utils.format_links(:text => input_html).should ==
-          %q[<div><a href="&quot;foo/bar.html&quot;" title="&quot;FooBar!&quot;" target="_blank" rel="external nofollow">FooBar!</a></div>]
+        Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
+          %q[<div><a href=&quot;http://busk.com/foo/bar.html&quot; title=&quot;FooBar!&quot; target=&quot;_blank&quot; rel=&quot;external nofollow&quot;>FooBar!</a></div>]
       end
       it "parses links with html attributes without quotes, based on spaces" do
         input_html = %q[<div><a href=foo/bar.html title=FooBar!>FooBar!</a></div>]
-        Spix::Utils.format_links(:text => input_html).should ==
-          %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
+        Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
+          %q[<div><a href="http://busk.com/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
       end
       it "parses links with html attributes having spaces before or after the equal sign" do
@@ -67,7 +81,7 @@ describe Spix::Utils do
         input_html = %q[<div><img src="images/bar.jpg" title="FooBar!" alt="FooBar!" width="100" height="200" /></div>]
         Spix::Utils.format_links(:text => input_html).should ==
-          %q[<div><img src="images/bar.jpg" alt="FooBar!" title="FooBar!" width="100" height="200" /></div>]
+          %q[<div><img src="images/bar.jpg" title="FooBar!" alt="FooBar!" width="100" height="200" /></div>]
       end
       it "parses image tags removing other invalid html attributes" do
@@ -138,23 +152,37 @@ describe Spix::Utils do
       it "parses image tags with simple quotes" do
         input_html = %q[<div><img src='images/bar.jpg' title='FooBar!' /></div>]
-        Spix::Utils.format_links(:text => input_html).should ==
-          %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
+        Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
+          %q[<div><img src="http://busk.com/images/bar.jpg" title="FooBar!" /></div>]
       end
       # TODO: should we strip these extra &quot; ?
       it "parses image tags with html escaped quote (&quot;)" do
         input_html = %q[<div><img src=&quot;images/bar.jpg&quot; title=&quot;FooBar!&quot; /></div>]
-        Spix::Utils.format_links(:text => input_html).should ==
-          %q[<div><img src="&quot;images/bar.jpg&quot;" title="&quot;FooBar!&quot;" /></div>]
+        Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
+          %q[<div><img src=&quot;http://busk.com/images/bar.jpg&quot; title=&quot;FooBar!&quot; /></div>]
+      end
+      it "parses image tags with html escaped quote (&quot;) and absolute sources" do
+        input_html = %q[<div><img src=&quot;http://busk.com/images/bar.jpg&quot; title=&quot;FooBar!&quot; /></div>]
+        Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
+          %q[<div><img src=&quot;http://busk.com/images/bar.jpg&quot; title=&quot;FooBar!&quot; /></div>]
+      end
+      it "parses image tags with absolute sources" do
+        input_html = %q[<div><img src="/images/bar.jpg" title="FooBar!" /></div>]
+        Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
+          %q[<div><img src="http://busk.com/images/bar.jpg" title="FooBar!" /></div>]
       end
       it "parses image tags with html attributes without quotes, based on spaces" do
         input_html = %q[<div><img src=images/bar.jpg title=FooBar! /></div>]
-        Spix::Utils.format_links(:text => input_html).should ==
-          %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
+        Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
+          %q[<div><img src="http://busk.com/images/bar.jpg" title="FooBar!" /></div>]
       end
       it "parses image tags with html attributes having spaces before or after the equal sign" do

metadata CHANGED Viewed

@@ -2,7 +2,7 @@
 name: spix_parser
 version: !ruby/object:Gem::Version
   prerelease:
-  version: 1.7.3
+  version: 1.7.5
 platform: ruby
 authors:
 - Marcio Lopes de Faria