spix_parser 1.7.3 → 1.7.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -32,10 +32,10 @@ module Spix
32
32
  extend self
33
33
 
34
34
  def format_links(options)
35
- text = Sanitizer.html_decode options[:text]
36
- site_url = Sanitizer.html_decode options[:site_url]
35
+ text = options[:text]
36
+ site_url = options[:site_url]
37
37
 
38
- parse_links(text)
38
+ parse_links(text, site_url)
39
39
  parse_images(text, site_url)
40
40
 
41
41
  text
@@ -45,66 +45,75 @@ module Spix
45
45
 
46
46
  def join_attributes(attrs)
47
47
  attrs.map do |attr, value|
48
- %Q[#{attr}="#{value.to_s.gsub(/"/, "&quot;")}"] unless value.blank?
49
- end.compact.join(" ")
48
+ %Q[#{attr}="#{value.to_s}"] unless value.blank?
49
+ end.compact.join(" ").gsub(/"/, "&quot;")
50
50
  end
51
51
 
52
- def parse_attrs(str)
53
- attrs = {}
54
- return attrs unless str || str.respond_to?(:scan)
52
+ def parse_attrs(str, options = {})
53
+ attrs_to_add = options.delete(:adding)
54
+ allowed = options.delete(:allowed_attrs)
55
55
 
56
- match_by_spaces = str !~ /'|"/
57
- if match_by_spaces
58
- # Make sure to match the last html attribute.
56
+ if match_by_spaces = str =~ /'|"|&quot;|&#034;/
57
+ value_regexp = %r{\s*#{$&}(.*?)#{$&}}
58
+ encloser = $&.gsub(/'/, '"')
59
+ else
60
+ encloser = "\""
59
61
  str += " "
60
62
  value_regexp = /\s*(.*?)\s/
61
- else
62
- value_regexp = /\s*["'](.*?)["']/
63
63
  end
64
64
  attribute_regexp = /\b([a-zA-Z0-9:]+)\s*/
65
65
 
66
- str.scan(/#{attribute_regexp}=#{value_regexp}/im) do
67
- attrs[$1.to_s.downcase] = $2
68
- end
66
+ finded_attrs = {}
67
+ result = str.gsub(/#{attribute_regexp}=#{value_regexp}/im) {
68
+ attribute, value = $1.downcase, $2
69
+
70
+ next if value.blank? or not allowed.include?(attribute.to_sym)
69
71
 
70
- attrs
72
+ parsed_value = yield(attribute, value) || value
73
+
74
+ finded_attrs[attribute.to_sym] = parsed_value
75
+
76
+ %{#{attribute}=#{enclose parsed_value, encloser} }
77
+
78
+ } + " " + (
79
+ attrs_to_add.to_a - finded_attrs.to_a
80
+ ).map {|k,v| "#{k}=#{enclose v, encloser}" }.join(" ")
81
+ return [result.split.join(" "), finded_attrs.keys]
71
82
  end
72
83
 
73
- def parse_links(text)
74
- text.gsub!(/(<a\s+([^>]+)>)/uim) do |match|
75
- attrs = parse_attrs($2.to_s)
84
+ def enclose(str, encloser)
85
+ "#{encloser}#{str}#{encloser}"
86
+ end
76
87
 
77
- # just parse these attributes
88
+ def parse_links(text, site_url)
89
+ allowed = [:href, :title, :target, :rel]
90
+ text.gsub!(/<a\s+([^>]+)>/uim) do |match|
78
91
  attrs = {
79
- :href => attrs["href"],
80
- :title => attrs["title"],
81
92
  :target => "_blank",
82
93
  :rel => "external nofollow"
83
94
  }
84
95
 
85
- "<a #{join_attributes(attrs)}>"
96
+ parsed_attribute, attrs = parse_attrs($1.to_s, :adding => attrs, :allowed_attrs => allowed ) do |attr, value|
97
+ parse_relative_source(value, site_url) if attr == "href"
98
+ end
99
+
100
+ %{<a #{parsed_attribute}>}
86
101
  end
87
102
  end
88
103
 
89
104
  def parse_images(text, site_url)
90
- text.gsub!(/(<img(.*?)\/?>)/uim) do |match|
91
- attrs = parse_attrs($2.to_s)
92
-
93
- # just parse these attributes
94
- attrs = {
95
- :src => parse_relative_image_source(attrs["src"], site_url),
96
- :alt => attrs["alt"],
97
- :title => attrs["title"],
98
- :style => attrs["style"],
99
- :width => attrs["width"],
100
- :height => attrs["height"]
101
- }
102
-
103
- "<img #{join_attributes(attrs)} />" if attrs[:src].present?
105
+ allowed = [:src, :alt, :title, :style, :width, :height]
106
+ text.gsub!(/<img(.*?)\/?>/uim) do |match|
107
+ parsed_attribute, attrs = parse_attrs($1.to_s, :allowed_attrs => allowed) do |attr, value|
108
+ parse_relative_source(value, site_url) if attr == "src"
109
+ end
110
+ %{<img #{parsed_attribute} />} if attrs.include?(:src)
104
111
  end
105
112
  end
106
113
 
107
- def parse_relative_image_source(src, site_url)
114
+ def parse_relative_source(src, site_url)
115
+ src = Sanitizer.html_decode(src) if src
116
+ site_url = Sanitizer.html_decode(site_url) if site_url
108
117
  if src.present? && site_url
109
118
  begin
110
119
  src = URI.parse(src)
@@ -4,7 +4,7 @@ module Spix
4
4
  module Version
5
5
  MAJOR = 1
6
6
  MINOR = 7
7
- TINY = 3
7
+ TINY = 5
8
8
 
9
9
  def self.current_version
10
10
  "#{MAJOR}.#{MINOR}.#{TINY}"
@@ -47,8 +47,8 @@ module Spix
47
47
 
48
48
  def uid
49
49
  uid = self.url || ""
50
- uid += self.encoded_raw_content_for :title
51
- uid += self.encoded_raw_content[0..25]
50
+ uid += encoded_raw_content_for :title
51
+ uid += encoded_raw_content[0..25]
52
52
  uid.to_sha1
53
53
  end
54
54
  memoize(:uid)
@@ -30,7 +30,7 @@ describe Spix::Parser do
30
30
 
31
31
  it "should parse correctly images with absolute with another domain" do
32
32
  feed = Spix::Parser.parse(load_fixture('feed_with_absolute_images_from_another_domain.atom'), :mode => :local)
33
- feed.feed_items.first.content[/<img.*src=["'](.*?)["'].*\/>/, 1].should == "http://oglobo.globo.com/fotos/2011/07/06/06_MHB_ballmer.jpg"
33
+ feed.feed_items.first.content[/<img.*src=&#034;(.*?)&#034;.*\/>/, 1].should == "http://oglobo.globo.com/fotos/2011/07/06/06_MHB_ballmer.jpg"
34
34
  end
35
35
  end
36
36
 
@@ -4,6 +4,21 @@ require 'spec_helper'
4
4
  describe Spix::Utils do
5
5
  describe ".format_links" do
6
6
  context "html containing links" do
7
+
8
+ it "parses link tags with html escaped quote (&quot;) and absolute sources" do
9
+ input_html = %q[<div><a href=&quot;/foo/bar.html&quot; title=&quot;FooBar!&quot;>FooBar!</a></div>]
10
+
11
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
12
+ %q[<div><a href=&quot;http://busk.com/foo/bar.html&quot; title=&quot;FooBar!&quot; target=&quot;_blank&quot; rel=&quot;external nofollow&quot;>FooBar!</a></div>]
13
+ end
14
+
15
+ it "parses link tags with absolute sources" do
16
+ input_html = %q[<div><a href="/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
17
+
18
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
19
+ %q[<div><a href="http://busk.com/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
20
+ end
21
+
7
22
  it "parsers links in the given html string adding rel and target" do
8
23
  input_html = %q[<div><a href="foo/bar.html" title="FooBar!">FooBar!</a></div>]
9
24
 
@@ -21,23 +36,22 @@ describe Spix::Utils do
21
36
  it "parses links with simple quotes" do
22
37
  input_html = %q[<div><a href='foo/bar.html' title='FooBar!'>FooBar!</a></div>]
23
38
 
24
- Spix::Utils.format_links(:text => input_html).should ==
25
- %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
39
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
40
+ %q[<div><a href="http://busk.com/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
26
41
  end
27
42
 
28
- # TODO: should we strip these extra &quot; ?
29
43
  it "parses links with html escaped quote (&quot;)" do
30
44
  input_html = %q[<div><a href=&quot;foo/bar.html&quot; title=&quot;FooBar!&quot;>FooBar!</a></div>]
31
45
 
32
- Spix::Utils.format_links(:text => input_html).should ==
33
- %q[<div><a href="&quot;foo/bar.html&quot;" title="&quot;FooBar!&quot;" target="_blank" rel="external nofollow">FooBar!</a></div>]
46
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
47
+ %q[<div><a href=&quot;http://busk.com/foo/bar.html&quot; title=&quot;FooBar!&quot; target=&quot;_blank&quot; rel=&quot;external nofollow&quot;>FooBar!</a></div>]
34
48
  end
35
49
 
36
50
  it "parses links with html attributes without quotes, based on spaces" do
37
51
  input_html = %q[<div><a href=foo/bar.html title=FooBar!>FooBar!</a></div>]
38
52
 
39
- Spix::Utils.format_links(:text => input_html).should ==
40
- %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
53
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
54
+ %q[<div><a href="http://busk.com/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
41
55
  end
42
56
 
43
57
  it "parses links with html attributes having spaces before or after the equal sign" do
@@ -67,7 +81,7 @@ describe Spix::Utils do
67
81
  input_html = %q[<div><img src="images/bar.jpg" title="FooBar!" alt="FooBar!" width="100" height="200" /></div>]
68
82
 
69
83
  Spix::Utils.format_links(:text => input_html).should ==
70
- %q[<div><img src="images/bar.jpg" alt="FooBar!" title="FooBar!" width="100" height="200" /></div>]
84
+ %q[<div><img src="images/bar.jpg" title="FooBar!" alt="FooBar!" width="100" height="200" /></div>]
71
85
  end
72
86
 
73
87
  it "parses image tags removing other invalid html attributes" do
@@ -138,23 +152,37 @@ describe Spix::Utils do
138
152
  it "parses image tags with simple quotes" do
139
153
  input_html = %q[<div><img src='images/bar.jpg' title='FooBar!' /></div>]
140
154
 
141
- Spix::Utils.format_links(:text => input_html).should ==
142
- %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
155
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
156
+ %q[<div><img src="http://busk.com/images/bar.jpg" title="FooBar!" /></div>]
143
157
  end
144
158
 
145
159
  # TODO: should we strip these extra &quot; ?
146
160
  it "parses image tags with html escaped quote (&quot;)" do
147
161
  input_html = %q[<div><img src=&quot;images/bar.jpg&quot; title=&quot;FooBar!&quot; /></div>]
148
162
 
149
- Spix::Utils.format_links(:text => input_html).should ==
150
- %q[<div><img src="&quot;images/bar.jpg&quot;" title="&quot;FooBar!&quot;" /></div>]
163
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
164
+ %q[<div><img src=&quot;http://busk.com/images/bar.jpg&quot; title=&quot;FooBar!&quot; /></div>]
165
+ end
166
+
167
+ it "parses image tags with html escaped quote (&quot;) and absolute sources" do
168
+ input_html = %q[<div><img src=&quot;http://busk.com/images/bar.jpg&quot; title=&quot;FooBar!&quot; /></div>]
169
+
170
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
171
+ %q[<div><img src=&quot;http://busk.com/images/bar.jpg&quot; title=&quot;FooBar!&quot; /></div>]
172
+ end
173
+
174
+ it "parses image tags with absolute sources" do
175
+ input_html = %q[<div><img src="/images/bar.jpg" title="FooBar!" /></div>]
176
+
177
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
178
+ %q[<div><img src="http://busk.com/images/bar.jpg" title="FooBar!" /></div>]
151
179
  end
152
180
 
153
181
  it "parses image tags with html attributes without quotes, based on spaces" do
154
182
  input_html = %q[<div><img src=images/bar.jpg title=FooBar! /></div>]
155
183
 
156
- Spix::Utils.format_links(:text => input_html).should ==
157
- %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
184
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
185
+ %q[<div><img src="http://busk.com/images/bar.jpg" title="FooBar!" /></div>]
158
186
  end
159
187
 
160
188
  it "parses image tags with html attributes having spaces before or after the equal sign" do
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: spix_parser
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 1.7.3
5
+ version: 1.7.5
6
6
  platform: ruby
7
7
  authors:
8
8
  - Marcio Lopes de Faria