spix_parser 1.7.3 → 1.7.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -32,10 +32,10 @@ module Spix
32
32
  extend self
33
33
 
34
34
  def format_links(options)
35
- text = Sanitizer.html_decode options[:text]
36
- site_url = Sanitizer.html_decode options[:site_url]
35
+ text = options[:text]
36
+ site_url = options[:site_url]
37
37
 
38
- parse_links(text)
38
+ parse_links(text, site_url)
39
39
  parse_images(text, site_url)
40
40
 
41
41
  text
@@ -45,66 +45,75 @@ module Spix
45
45
 
46
46
  def join_attributes(attrs)
47
47
  attrs.map do |attr, value|
48
- %Q[#{attr}="#{value.to_s.gsub(/"/, "&quot;")}"] unless value.blank?
49
- end.compact.join(" ")
48
+ %Q[#{attr}="#{value.to_s}"] unless value.blank?
49
+ end.compact.join(" ").gsub(/"/, "&quot;")
50
50
  end
51
51
 
52
- def parse_attrs(str)
53
- attrs = {}
54
- return attrs unless str || str.respond_to?(:scan)
52
+ def parse_attrs(str, options = {})
53
+ attrs_to_add = options.delete(:adding)
54
+ allowed = options.delete(:allowed_attrs)
55
55
 
56
- match_by_spaces = str !~ /'|"/
57
- if match_by_spaces
58
- # Make sure to match the last html attribute.
56
+ if match_by_spaces = str =~ /'|"|&quot;|&#034;/
57
+ value_regexp = %r{\s*#{$&}(.*?)#{$&}}
58
+ encloser = $&.gsub(/'/, '"')
59
+ else
60
+ encloser = "\""
59
61
  str += " "
60
62
  value_regexp = /\s*(.*?)\s/
61
- else
62
- value_regexp = /\s*["'](.*?)["']/
63
63
  end
64
64
  attribute_regexp = /\b([a-zA-Z0-9:]+)\s*/
65
65
 
66
- str.scan(/#{attribute_regexp}=#{value_regexp}/im) do
67
- attrs[$1.to_s.downcase] = $2
68
- end
66
+ finded_attrs = {}
67
+ result = str.gsub(/#{attribute_regexp}=#{value_regexp}/im) {
68
+ attribute, value = $1.downcase, $2
69
+
70
+ next if value.blank? or not allowed.include?(attribute.to_sym)
69
71
 
70
- attrs
72
+ parsed_value = yield(attribute, value) || value
73
+
74
+ finded_attrs[attribute.to_sym] = parsed_value
75
+
76
+ %{#{attribute}=#{enclose parsed_value, encloser} }
77
+
78
+ } + " " + (
79
+ attrs_to_add.to_a - finded_attrs.to_a
80
+ ).map {|k,v| "#{k}=#{enclose v, encloser}" }.join(" ")
81
+ return [result.split.join(" "), finded_attrs.keys]
71
82
  end
72
83
 
73
- def parse_links(text)
74
- text.gsub!(/(<a\s+([^>]+)>)/uim) do |match|
75
- attrs = parse_attrs($2.to_s)
84
+ def enclose(str, encloser)
85
+ "#{encloser}#{str}#{encloser}"
86
+ end
76
87
 
77
- # just parse these attributes
88
+ def parse_links(text, site_url)
89
+ allowed = [:href, :title, :target, :rel]
90
+ text.gsub!(/<a\s+([^>]+)>/uim) do |match|
78
91
  attrs = {
79
- :href => attrs["href"],
80
- :title => attrs["title"],
81
92
  :target => "_blank",
82
93
  :rel => "external nofollow"
83
94
  }
84
95
 
85
- "<a #{join_attributes(attrs)}>"
96
+ parsed_attribute, attrs = parse_attrs($1.to_s, :adding => attrs, :allowed_attrs => allowed ) do |attr, value|
97
+ parse_relative_source(value, site_url) if attr == "href"
98
+ end
99
+
100
+ %{<a #{parsed_attribute}>}
86
101
  end
87
102
  end
88
103
 
89
104
  def parse_images(text, site_url)
90
- text.gsub!(/(<img(.*?)\/?>)/uim) do |match|
91
- attrs = parse_attrs($2.to_s)
92
-
93
- # just parse these attributes
94
- attrs = {
95
- :src => parse_relative_image_source(attrs["src"], site_url),
96
- :alt => attrs["alt"],
97
- :title => attrs["title"],
98
- :style => attrs["style"],
99
- :width => attrs["width"],
100
- :height => attrs["height"]
101
- }
102
-
103
- "<img #{join_attributes(attrs)} />" if attrs[:src].present?
105
+ allowed = [:src, :alt, :title, :style, :width, :height]
106
+ text.gsub!(/<img(.*?)\/?>/uim) do |match|
107
+ parsed_attribute, attrs = parse_attrs($1.to_s, :allowed_attrs => allowed) do |attr, value|
108
+ parse_relative_source(value, site_url) if attr == "src"
109
+ end
110
+ %{<img #{parsed_attribute} />} if attrs.include?(:src)
104
111
  end
105
112
  end
106
113
 
107
- def parse_relative_image_source(src, site_url)
114
+ def parse_relative_source(src, site_url)
115
+ src = Sanitizer.html_decode(src) if src
116
+ site_url = Sanitizer.html_decode(site_url) if site_url
108
117
  if src.present? && site_url
109
118
  begin
110
119
  src = URI.parse(src)
@@ -4,7 +4,7 @@ module Spix
4
4
  module Version
5
5
  MAJOR = 1
6
6
  MINOR = 7
7
- TINY = 3
7
+ TINY = 5
8
8
 
9
9
  def self.current_version
10
10
  "#{MAJOR}.#{MINOR}.#{TINY}"
@@ -47,8 +47,8 @@ module Spix
47
47
 
48
48
  def uid
49
49
  uid = self.url || ""
50
- uid += self.encoded_raw_content_for :title
51
- uid += self.encoded_raw_content[0..25]
50
+ uid += encoded_raw_content_for :title
51
+ uid += encoded_raw_content[0..25]
52
52
  uid.to_sha1
53
53
  end
54
54
  memoize(:uid)
@@ -30,7 +30,7 @@ describe Spix::Parser do
30
30
 
31
31
  it "should parse correctly images with absolute with another domain" do
32
32
  feed = Spix::Parser.parse(load_fixture('feed_with_absolute_images_from_another_domain.atom'), :mode => :local)
33
- feed.feed_items.first.content[/<img.*src=["'](.*?)["'].*\/>/, 1].should == "http://oglobo.globo.com/fotos/2011/07/06/06_MHB_ballmer.jpg"
33
+ feed.feed_items.first.content[/<img.*src=&#034;(.*?)&#034;.*\/>/, 1].should == "http://oglobo.globo.com/fotos/2011/07/06/06_MHB_ballmer.jpg"
34
34
  end
35
35
  end
36
36
 
@@ -4,6 +4,21 @@ require 'spec_helper'
4
4
  describe Spix::Utils do
5
5
  describe ".format_links" do
6
6
  context "html containing links" do
7
+
8
+ it "parses link tags with html escaped quote (&quot;) and absolute sources" do
9
+ input_html = %q[<div><a href=&quot;/foo/bar.html&quot; title=&quot;FooBar!&quot;>FooBar!</a></div>]
10
+
11
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
12
+ %q[<div><a href=&quot;http://busk.com/foo/bar.html&quot; title=&quot;FooBar!&quot; target=&quot;_blank&quot; rel=&quot;external nofollow&quot;>FooBar!</a></div>]
13
+ end
14
+
15
+ it "parses link tags with absolute sources" do
16
+ input_html = %q[<div><a href="/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
17
+
18
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
19
+ %q[<div><a href="http://busk.com/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
20
+ end
21
+
7
22
  it "parsers links in the given html string adding rel and target" do
8
23
  input_html = %q[<div><a href="foo/bar.html" title="FooBar!">FooBar!</a></div>]
9
24
 
@@ -21,23 +36,22 @@ describe Spix::Utils do
21
36
  it "parses links with simple quotes" do
22
37
  input_html = %q[<div><a href='foo/bar.html' title='FooBar!'>FooBar!</a></div>]
23
38
 
24
- Spix::Utils.format_links(:text => input_html).should ==
25
- %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
39
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
40
+ %q[<div><a href="http://busk.com/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
26
41
  end
27
42
 
28
- # TODO: should we strip these extra &quot; ?
29
43
  it "parses links with html escaped quote (&quot;)" do
30
44
  input_html = %q[<div><a href=&quot;foo/bar.html&quot; title=&quot;FooBar!&quot;>FooBar!</a></div>]
31
45
 
32
- Spix::Utils.format_links(:text => input_html).should ==
33
- %q[<div><a href="&quot;foo/bar.html&quot;" title="&quot;FooBar!&quot;" target="_blank" rel="external nofollow">FooBar!</a></div>]
46
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
47
+ %q[<div><a href=&quot;http://busk.com/foo/bar.html&quot; title=&quot;FooBar!&quot; target=&quot;_blank&quot; rel=&quot;external nofollow&quot;>FooBar!</a></div>]
34
48
  end
35
49
 
36
50
  it "parses links with html attributes without quotes, based on spaces" do
37
51
  input_html = %q[<div><a href=foo/bar.html title=FooBar!>FooBar!</a></div>]
38
52
 
39
- Spix::Utils.format_links(:text => input_html).should ==
40
- %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
53
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
54
+ %q[<div><a href="http://busk.com/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
41
55
  end
42
56
 
43
57
  it "parses links with html attributes having spaces before or after the equal sign" do
@@ -67,7 +81,7 @@ describe Spix::Utils do
67
81
  input_html = %q[<div><img src="images/bar.jpg" title="FooBar!" alt="FooBar!" width="100" height="200" /></div>]
68
82
 
69
83
  Spix::Utils.format_links(:text => input_html).should ==
70
- %q[<div><img src="images/bar.jpg" alt="FooBar!" title="FooBar!" width="100" height="200" /></div>]
84
+ %q[<div><img src="images/bar.jpg" title="FooBar!" alt="FooBar!" width="100" height="200" /></div>]
71
85
  end
72
86
 
73
87
  it "parses image tags removing other invalid html attributes" do
@@ -138,23 +152,37 @@ describe Spix::Utils do
138
152
  it "parses image tags with simple quotes" do
139
153
  input_html = %q[<div><img src='images/bar.jpg' title='FooBar!' /></div>]
140
154
 
141
- Spix::Utils.format_links(:text => input_html).should ==
142
- %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
155
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
156
+ %q[<div><img src="http://busk.com/images/bar.jpg" title="FooBar!" /></div>]
143
157
  end
144
158
 
145
159
  # TODO: should we strip these extra &quot; ?
146
160
  it "parses image tags with html escaped quote (&quot;)" do
147
161
  input_html = %q[<div><img src=&quot;images/bar.jpg&quot; title=&quot;FooBar!&quot; /></div>]
148
162
 
149
- Spix::Utils.format_links(:text => input_html).should ==
150
- %q[<div><img src="&quot;images/bar.jpg&quot;" title="&quot;FooBar!&quot;" /></div>]
163
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
164
+ %q[<div><img src=&quot;http://busk.com/images/bar.jpg&quot; title=&quot;FooBar!&quot; /></div>]
165
+ end
166
+
167
+ it "parses image tags with html escaped quote (&quot;) and absolute sources" do
168
+ input_html = %q[<div><img src=&quot;http://busk.com/images/bar.jpg&quot; title=&quot;FooBar!&quot; /></div>]
169
+
170
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
171
+ %q[<div><img src=&quot;http://busk.com/images/bar.jpg&quot; title=&quot;FooBar!&quot; /></div>]
172
+ end
173
+
174
+ it "parses image tags with absolute sources" do
175
+ input_html = %q[<div><img src="/images/bar.jpg" title="FooBar!" /></div>]
176
+
177
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
178
+ %q[<div><img src="http://busk.com/images/bar.jpg" title="FooBar!" /></div>]
151
179
  end
152
180
 
153
181
  it "parses image tags with html attributes without quotes, based on spaces" do
154
182
  input_html = %q[<div><img src=images/bar.jpg title=FooBar! /></div>]
155
183
 
156
- Spix::Utils.format_links(:text => input_html).should ==
157
- %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
184
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
185
+ %q[<div><img src="http://busk.com/images/bar.jpg" title="FooBar!" /></div>]
158
186
  end
159
187
 
160
188
  it "parses image tags with html attributes having spaces before or after the equal sign" do
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: spix_parser
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 1.7.3
5
+ version: 1.7.5
6
6
  platform: ruby
7
7
  authors:
8
8
  - Marcio Lopes de Faria