spix_parser 1.7.3 → 1.7.5
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/spix_parser/parser.rb +48 -39
- data/lib/spix_parser/version.rb +1 -1
- data/lib/spix_parser/wrappers/entry.rb +2 -2
- data/spec/spix_parser/parser_spec.rb +1 -1
- data/spec/spix_parser/utils_spec.rb +42 -14
- metadata +1 -1
data/lib/spix_parser/parser.rb
CHANGED
@@ -32,10 +32,10 @@ module Spix
|
|
32
32
|
extend self
|
33
33
|
|
34
34
|
def format_links(options)
|
35
|
-
text =
|
36
|
-
site_url =
|
35
|
+
text = options[:text]
|
36
|
+
site_url = options[:site_url]
|
37
37
|
|
38
|
-
parse_links(text)
|
38
|
+
parse_links(text, site_url)
|
39
39
|
parse_images(text, site_url)
|
40
40
|
|
41
41
|
text
|
@@ -45,66 +45,75 @@ module Spix
|
|
45
45
|
|
46
46
|
def join_attributes(attrs)
|
47
47
|
attrs.map do |attr, value|
|
48
|
-
%Q[#{attr}="#{value.to_s
|
49
|
-
end.compact.join(" ")
|
48
|
+
%Q[#{attr}="#{value.to_s}"] unless value.blank?
|
49
|
+
end.compact.join(" ").gsub(/"/, "&quot;")
|
50
50
|
end
|
51
51
|
|
52
|
-
def parse_attrs(str)
|
53
|
-
|
54
|
-
|
52
|
+
def parse_attrs(str, options = {})
|
53
|
+
attrs_to_add = options.delete(:adding)
|
54
|
+
allowed = options.delete(:allowed_attrs)
|
55
55
|
|
56
|
-
match_by_spaces = str
|
57
|
-
|
58
|
-
|
56
|
+
if match_by_spaces = str =~ /'|"|"|"/
|
57
|
+
value_regexp = %r{\s*#{$&}(.*?)#{$&}}
|
58
|
+
encloser = $&.gsub(/'/, '"')
|
59
|
+
else
|
60
|
+
encloser = "\""
|
59
61
|
str += " "
|
60
62
|
value_regexp = /\s*(.*?)\s/
|
61
|
-
else
|
62
|
-
value_regexp = /\s*["'](.*?)["']/
|
63
63
|
end
|
64
64
|
attribute_regexp = /\b([a-zA-Z0-9:]+)\s*/
|
65
65
|
|
66
|
-
|
67
|
-
|
68
|
-
|
66
|
+
finded_attrs = {}
|
67
|
+
result = str.gsub(/#{attribute_regexp}=#{value_regexp}/im) {
|
68
|
+
attribute, value = $1.downcase, $2
|
69
|
+
|
70
|
+
next if value.blank? or not allowed.include?(attribute.to_sym)
|
69
71
|
|
70
|
-
|
72
|
+
parsed_value = yield(attribute, value) || value
|
73
|
+
|
74
|
+
finded_attrs[attribute.to_sym] = parsed_value
|
75
|
+
|
76
|
+
%{#{attribute}=#{enclose parsed_value, encloser} }
|
77
|
+
|
78
|
+
} + " " + (
|
79
|
+
attrs_to_add.to_a - finded_attrs.to_a
|
80
|
+
).map {|k,v| "#{k}=#{enclose v, encloser}" }.join(" ")
|
81
|
+
return [result.split.join(" "), finded_attrs.keys]
|
71
82
|
end
|
72
83
|
|
73
|
-
def
|
74
|
-
|
75
|
-
|
84
|
+
def enclose(str, encloser)
|
85
|
+
"#{encloser}#{str}#{encloser}"
|
86
|
+
end
|
76
87
|
|
77
|
-
|
88
|
+
def parse_links(text, site_url)
|
89
|
+
allowed = [:href, :title, :target, :rel]
|
90
|
+
text.gsub!(/<a\s+([^>]+)>/uim) do |match|
|
78
91
|
attrs = {
|
79
|
-
:href => attrs["href"],
|
80
|
-
:title => attrs["title"],
|
81
92
|
:target => "_blank",
|
82
93
|
:rel => "external nofollow"
|
83
94
|
}
|
84
95
|
|
85
|
-
|
96
|
+
parsed_attribute, attrs = parse_attrs($1.to_s, :adding => attrs, :allowed_attrs => allowed ) do |attr, value|
|
97
|
+
parse_relative_source(value, site_url) if attr == "href"
|
98
|
+
end
|
99
|
+
|
100
|
+
%{<a #{parsed_attribute}>}
|
86
101
|
end
|
87
102
|
end
|
88
103
|
|
89
104
|
def parse_images(text, site_url)
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
:alt => attrs["alt"],
|
97
|
-
:title => attrs["title"],
|
98
|
-
:style => attrs["style"],
|
99
|
-
:width => attrs["width"],
|
100
|
-
:height => attrs["height"]
|
101
|
-
}
|
102
|
-
|
103
|
-
"<img #{join_attributes(attrs)} />" if attrs[:src].present?
|
105
|
+
allowed = [:src, :alt, :title, :style, :width, :height]
|
106
|
+
text.gsub!(/<img(.*?)\/?>/uim) do |match|
|
107
|
+
parsed_attribute, attrs = parse_attrs($1.to_s, :allowed_attrs => allowed) do |attr, value|
|
108
|
+
parse_relative_source(value, site_url) if attr == "src"
|
109
|
+
end
|
110
|
+
%{<img #{parsed_attribute} />} if attrs.include?(:src)
|
104
111
|
end
|
105
112
|
end
|
106
113
|
|
107
|
-
def
|
114
|
+
def parse_relative_source(src, site_url)
|
115
|
+
src = Sanitizer.html_decode(src) if src
|
116
|
+
site_url = Sanitizer.html_decode(site_url) if site_url
|
108
117
|
if src.present? && site_url
|
109
118
|
begin
|
110
119
|
src = URI.parse(src)
|
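data/lib/spix_parser/version.rb
CHANGED
(diff hunks for data/lib/spix_parser/version.rb and data/lib/spix_parser/wrappers/entry.rb are not shown)
data/spec/spix_parser/parser_spec.rb
CHANGED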
@@ -30,7 +30,7 @@ describe Spix::Parser do
|
|
30
30
|
|
31
31
|
it "should parse correctly images with absolute with another domain" do
|
32
32
|
feed = Spix::Parser.parse(load_fixture('feed_with_absolute_images_from_another_domain.atom'), :mode => :local)
|
33
|
-
feed.feed_items.first.content[/<img.*src
|
33
|
+
feed.feed_items.first.content[/<img.*src="(.*?)".*\/>/, 1].should == "http://oglobo.globo.com/fotos/2011/07/06/06_MHB_ballmer.jpg"
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
data/spec/spix_parser/utils_spec.rb
CHANGED
@@ -4,6 +4,21 @@ require 'spec_helper'
|
|
4
4
|
describe Spix::Utils do
|
5
5
|
describe ".format_links" do
|
6
6
|
context "html containing links" do
|
7
|
+
|
8
|
+
it "parses link tags with html escaped quote (&quot;) and absolute sources" do
|
9
|
+
input_html = %q[<div><a href="/foo/bar.html" title="FooBar!">FooBar!</a></div>]
|
10
|
+
|
11
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
|
12
|
+
%q[<div><a href="http://busk.com/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
13
|
+
end
|
14
|
+
|
15
|
+
it "parses link tags with absolute sources" do
|
16
|
+
input_html = %q[<div><a href="/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
17
|
+
|
18
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
|
19
|
+
%q[<div><a href="http://busk.com/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
20
|
+
end
|
21
|
+
|
7
22
|
it "parsers links in the given html string adding rel and target" do
|
8
23
|
input_html = %q[<div><a href="foo/bar.html" title="FooBar!">FooBar!</a></div>]
|
9
24
|
|
@@ -21,23 +36,22 @@ describe Spix::Utils do
|
|
21
36
|
it "parses links with simple quotes" do
|
22
37
|
input_html = %q[<div><a href='foo/bar.html' title='FooBar!'>FooBar!</a></div>]
|
23
38
|
|
24
|
-
Spix::Utils.format_links(:text => input_html).should ==
|
25
|
-
%q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
39
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
|
40
|
+
%q[<div><a href="http://busk.com/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
26
41
|
end
|
27
42
|
|
28
|
-
# TODO: should we strip these extra &quot; ?
|
29
43
|
it "parses links with html escaped quote (&quot;)" do
|
30
44
|
input_html = %q[<div><a href="foo/bar.html" title="FooBar!">FooBar!</a></div>]
|
31
45
|
|
32
|
-
Spix::Utils.format_links(:text => input_html).should ==
|
33
|
-
%q[<div><a href
|
46
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
|
47
|
+
%q[<div><a href="http://busk.com/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
34
48
|
end
|
35
49
|
|
36
50
|
it "parses links with html attributes without quotes, based on spaces" do
|
37
51
|
input_html = %q[<div><a href=foo/bar.html title=FooBar!>FooBar!</a></div>]
|
38
52
|
|
39
|
-
Spix::Utils.format_links(:text => input_html).should ==
|
40
|
-
%q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
53
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
|
54
|
+
%q[<div><a href="http://busk.com/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
41
55
|
end
|
42
56
|
|
43
57
|
it "parses links with html attributes having spaces before or after the equal sign" do
|
@@ -67,7 +81,7 @@ describe Spix::Utils do
|
|
67
81
|
input_html = %q[<div><img src="images/bar.jpg" title="FooBar!" alt="FooBar!" width="100" height="200" /></div>]
|
68
82
|
|
69
83
|
Spix::Utils.format_links(:text => input_html).should ==
|
70
|
-
%q[<div><img src="images/bar.jpg"
|
84
|
+
%q[<div><img src="images/bar.jpg" title="FooBar!" alt="FooBar!" width="100" height="200" /></div>]
|
71
85
|
end
|
72
86
|
|
73
87
|
it "parses image tags removing other invalid html attributes" do
|
@@ -138,23 +152,37 @@ describe Spix::Utils do
|
|
138
152
|
it "parses image tags with simple quotes" do
|
139
153
|
input_html = %q[<div><img src='images/bar.jpg' title='FooBar!' /></div>]
|
140
154
|
|
141
|
-
Spix::Utils.format_links(:text => input_html).should ==
|
142
|
-
%q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
|
155
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
|
156
|
+
%q[<div><img src="http://busk.com/images/bar.jpg" title="FooBar!" /></div>]
|
143
157
|
end
|
144
158
|
|
145
159
|
# TODO: should we strip these extra &quot; ?
|
146
160
|
it "parses image tags with html escaped quote (&quot;)" do
|
147
161
|
input_html = %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
|
148
162
|
|
149
|
-
Spix::Utils.format_links(:text => input_html).should ==
|
150
|
-
%q[<div><img src
|
163
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
|
164
|
+
%q[<div><img src="http://busk.com/images/bar.jpg" title="FooBar!" /></div>]
|
165
|
+
end
|
166
|
+
|
167
|
+
it "parses image tags with html escaped quote (&quot;) and absolute sources" do
|
168
|
+
input_html = %q[<div><img src="http://busk.com/images/bar.jpg" title="FooBar!" /></div>]
|
169
|
+
|
170
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
|
171
|
+
%q[<div><img src="http://busk.com/images/bar.jpg" title="FooBar!" /></div>]
|
172
|
+
end
|
173
|
+
|
174
|
+
it "parses image tags with absolute sources" do
|
175
|
+
input_html = %q[<div><img src="/images/bar.jpg" title="FooBar!" /></div>]
|
176
|
+
|
177
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
|
178
|
+
%q[<div><img src="http://busk.com/images/bar.jpg" title="FooBar!" /></div>]
|
151
179
|
end
|
152
180
|
|
153
181
|
it "parses image tags with html attributes without quotes, based on spaces" do
|
154
182
|
input_html = %q[<div><img src=images/bar.jpg title=FooBar! /></div>]
|
155
183
|
|
156
|
-
Spix::Utils.format_links(:text => input_html).should ==
|
157
|
-
%q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
|
184
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
|
185
|
+
%q[<div><img src="http://busk.com/images/bar.jpg" title="FooBar!" /></div>]
|
158
186
|
end
|
159
187
|
|
160
188
|
it "parses image tags with html attributes having spaces before or after the equal sign" do
|