spix_parser 1.7.3 → 1.7.5
This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
- data/lib/spix_parser/parser.rb +48 -39
- data/lib/spix_parser/version.rb +1 -1
- data/lib/spix_parser/wrappers/entry.rb +2 -2
- data/spec/spix_parser/parser_spec.rb +1 -1
- data/spec/spix_parser/utils_spec.rb +42 -14
- metadata +1 -1
data/lib/spix_parser/parser.rb
CHANGED
@@ -32,10 +32,10 @@ module Spix
|
|
32
32
|
extend self
|
33
33
|
|
34
34
|
def format_links(options)
|
35
|
-
text =
|
36
|
-
site_url =
|
35
|
+
text = options[:text]
|
36
|
+
site_url = options[:site_url]
|
37
37
|
|
38
|
-
parse_links(text)
|
38
|
+
parse_links(text, site_url)
|
39
39
|
parse_images(text, site_url)
|
40
40
|
|
41
41
|
text
|
@@ -45,66 +45,75 @@ module Spix
|
|
45
45
|
|
46
46
|
def join_attributes(attrs)
|
47
47
|
attrs.map do |attr, value|
|
48
|
-
%Q[#{attr}="#{value.to_s
|
49
|
-
end.compact.join(" ")
|
48
|
+
%Q[#{attr}="#{value.to_s}"] unless value.blank?
|
49
|
+
end.compact.join(" ").gsub(/"/, """)
|
50
50
|
end
|
51
51
|
|
52
|
-
def parse_attrs(str)
|
53
|
-
|
54
|
-
|
52
|
+
def parse_attrs(str, options = {})
|
53
|
+
attrs_to_add = options.delete(:adding)
|
54
|
+
allowed = options.delete(:allowed_attrs)
|
55
55
|
|
56
|
-
match_by_spaces = str
|
57
|
-
|
58
|
-
|
56
|
+
if match_by_spaces = str =~ /'|"|"|"/
|
57
|
+
value_regexp = %r{\s*#{$&}(.*?)#{$&}}
|
58
|
+
encloser = $&.gsub(/'/, '"')
|
59
|
+
else
|
60
|
+
encloser = "\""
|
59
61
|
str += " "
|
60
62
|
value_regexp = /\s*(.*?)\s/
|
61
|
-
else
|
62
|
-
value_regexp = /\s*["'](.*?)["']/
|
63
63
|
end
|
64
64
|
attribute_regexp = /\b([a-zA-Z0-9:]+)\s*/
|
65
65
|
|
66
|
-
|
67
|
-
|
68
|
-
|
66
|
+
finded_attrs = {}
|
67
|
+
result = str.gsub(/#{attribute_regexp}=#{value_regexp}/im) {
|
68
|
+
attribute, value = $1.downcase, $2
|
69
|
+
|
70
|
+
next if value.blank? or not allowed.include?(attribute.to_sym)
|
69
71
|
|
70
|
-
|
72
|
+
parsed_value = yield(attribute, value) || value
|
73
|
+
|
74
|
+
finded_attrs[attribute.to_sym] = parsed_value
|
75
|
+
|
76
|
+
%{#{attribute}=#{enclose parsed_value, encloser} }
|
77
|
+
|
78
|
+
} + " " + (
|
79
|
+
attrs_to_add.to_a - finded_attrs.to_a
|
80
|
+
).map {|k,v| "#{k}=#{enclose v, encloser}" }.join(" ")
|
81
|
+
return [result.split.join(" "), finded_attrs.keys]
|
71
82
|
end
|
72
83
|
|
73
|
-
def
|
74
|
-
|
75
|
-
|
84
|
+
def enclose(str, encloser)
|
85
|
+
"#{encloser}#{str}#{encloser}"
|
86
|
+
end
|
76
87
|
|
77
|
-
|
88
|
+
def parse_links(text, site_url)
|
89
|
+
allowed = [:href, :title, :target, :rel]
|
90
|
+
text.gsub!(/<a\s+([^>]+)>/uim) do |match|
|
78
91
|
attrs = {
|
79
|
-
:href => attrs["href"],
|
80
|
-
:title => attrs["title"],
|
81
92
|
:target => "_blank",
|
82
93
|
:rel => "external nofollow"
|
83
94
|
}
|
84
95
|
|
85
|
-
|
96
|
+
parsed_attribute, attrs = parse_attrs($1.to_s, :adding => attrs, :allowed_attrs => allowed ) do |attr, value|
|
97
|
+
parse_relative_source(value, site_url) if attr == "href"
|
98
|
+
end
|
99
|
+
|
100
|
+
%{<a #{parsed_attribute}>}
|
86
101
|
end
|
87
102
|
end
|
88
103
|
|
89
104
|
def parse_images(text, site_url)
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
:alt => attrs["alt"],
|
97
|
-
:title => attrs["title"],
|
98
|
-
:style => attrs["style"],
|
99
|
-
:width => attrs["width"],
|
100
|
-
:height => attrs["height"]
|
101
|
-
}
|
102
|
-
|
103
|
-
"<img #{join_attributes(attrs)} />" if attrs[:src].present?
|
105
|
+
allowed = [:src, :alt, :title, :style, :width, :height]
|
106
|
+
text.gsub!(/<img(.*?)\/?>/uim) do |match|
|
107
|
+
parsed_attribute, attrs = parse_attrs($1.to_s, :allowed_attrs => allowed) do |attr, value|
|
108
|
+
parse_relative_source(value, site_url) if attr == "src"
|
109
|
+
end
|
110
|
+
%{<img #{parsed_attribute} />} if attrs.include?(:src)
|
104
111
|
end
|
105
112
|
end
|
106
113
|
|
107
|
-
def
|
114
|
+
def parse_relative_source(src, site_url)
|
115
|
+
src = Sanitizer.html_decode(src) if src
|
116
|
+
site_url = Sanitizer.html_decode(site_url) if site_url
|
108
117
|
if src.present? && site_url
|
109
118
|
begin
|
110
119
|
src = URI.parse(src)
|
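data/lib/spix_parser/version.rb
CHANGED
[diff body missing from this extract]
data/lib/spix_parser/wrappers/entry.rb
CHANGED
[diff body missing from this extract]
data/spec/spix_parser/parser_spec.rb
CHANGED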
@@ -30,7 +30,7 @@ describe Spix::Parser do
|
|
30
30
|
|
31
31
|
it "should parse correctly images with absolute with another domain" do
|
32
32
|
feed = Spix::Parser.parse(load_fixture('feed_with_absolute_images_from_another_domain.atom'), :mode => :local)
|
33
|
-
feed.feed_items.first.content[/<img.*src
|
33
|
+
feed.feed_items.first.content[/<img.*src="(.*?)".*\/>/, 1].should == "http://oglobo.globo.com/fotos/2011/07/06/06_MHB_ballmer.jpg"
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
data/spec/spix_parser/utils_spec.rb
CHANGED
@@ -4,6 +4,21 @@ require 'spec_helper'
|
|
4
4
|
describe Spix::Utils do
|
5
5
|
describe ".format_links" do
|
6
6
|
context "html containing links" do
|
7
|
+
|
8
|
+
it "parses link tags with html escaped quote (") and absolute sources" do
|
9
|
+
input_html = %q[<div><a href="/foo/bar.html" title="FooBar!">FooBar!</a></div>]
|
10
|
+
|
11
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
|
12
|
+
%q[<div><a href="http://busk.com/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
13
|
+
end
|
14
|
+
|
15
|
+
it "parses link tags with absolute sources" do
|
16
|
+
input_html = %q[<div><a href="/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
17
|
+
|
18
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
|
19
|
+
%q[<div><a href="http://busk.com/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
20
|
+
end
|
21
|
+
|
7
22
|
it "parsers links in the given html string adding rel and target" do
|
8
23
|
input_html = %q[<div><a href="foo/bar.html" title="FooBar!">FooBar!</a></div>]
|
9
24
|
|
@@ -21,23 +36,22 @@ describe Spix::Utils do
|
|
21
36
|
it "parses links with simple quotes" do
|
22
37
|
input_html = %q[<div><a href='foo/bar.html' title='FooBar!'>FooBar!</a></div>]
|
23
38
|
|
24
|
-
Spix::Utils.format_links(:text => input_html).should ==
|
25
|
-
%q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
39
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
|
40
|
+
%q[<div><a href="http://busk.com/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
26
41
|
end
|
27
42
|
|
28
|
-
# TODO: should we strip these extra " ?
|
29
43
|
it "parses links with html escaped quote (")" do
|
30
44
|
input_html = %q[<div><a href="foo/bar.html" title="FooBar!">FooBar!</a></div>]
|
31
45
|
|
32
|
-
Spix::Utils.format_links(:text => input_html).should ==
|
33
|
-
%q[<div><a href
|
46
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
|
47
|
+
%q[<div><a href="http://busk.com/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
34
48
|
end
|
35
49
|
|
36
50
|
it "parses links with html attributes without quotes, based on spaces" do
|
37
51
|
input_html = %q[<div><a href=foo/bar.html title=FooBar!>FooBar!</a></div>]
|
38
52
|
|
39
|
-
Spix::Utils.format_links(:text => input_html).should ==
|
40
|
-
%q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
53
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
|
54
|
+
%q[<div><a href="http://busk.com/foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
41
55
|
end
|
42
56
|
|
43
57
|
it "parses links with html attributes having spaces before or after the equal sign" do
|
@@ -67,7 +81,7 @@ describe Spix::Utils do
|
|
67
81
|
input_html = %q[<div><img src="images/bar.jpg" title="FooBar!" alt="FooBar!" width="100" height="200" /></div>]
|
68
82
|
|
69
83
|
Spix::Utils.format_links(:text => input_html).should ==
|
70
|
-
%q[<div><img src="images/bar.jpg"
|
84
|
+
%q[<div><img src="images/bar.jpg" title="FooBar!" alt="FooBar!" width="100" height="200" /></div>]
|
71
85
|
end
|
72
86
|
|
73
87
|
it "parses image tags removing other invalid html attributes" do
|
@@ -138,23 +152,37 @@ describe Spix::Utils do
|
|
138
152
|
it "parses image tags with simple quotes" do
|
139
153
|
input_html = %q[<div><img src='images/bar.jpg' title='FooBar!' /></div>]
|
140
154
|
|
141
|
-
Spix::Utils.format_links(:text => input_html).should ==
|
142
|
-
%q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
|
155
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
|
156
|
+
%q[<div><img src="http://busk.com/images/bar.jpg" title="FooBar!" /></div>]
|
143
157
|
end
|
144
158
|
|
145
159
|
# TODO: should we strip these extra " ?
|
146
160
|
it "parses image tags with html escaped quote (")" do
|
147
161
|
input_html = %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
|
148
162
|
|
149
|
-
Spix::Utils.format_links(:text => input_html).should ==
|
150
|
-
%q[<div><img src
|
163
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
|
164
|
+
%q[<div><img src="http://busk.com/images/bar.jpg" title="FooBar!" /></div>]
|
165
|
+
end
|
166
|
+
|
167
|
+
it "parses image tags with html escaped quote (") and absolute sources" do
|
168
|
+
input_html = %q[<div><img src="http://busk.com/images/bar.jpg" title="FooBar!" /></div>]
|
169
|
+
|
170
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
|
171
|
+
%q[<div><img src="http://busk.com/images/bar.jpg" title="FooBar!" /></div>]
|
172
|
+
end
|
173
|
+
|
174
|
+
it "parses image tags with absolute sources" do
|
175
|
+
input_html = %q[<div><img src="/images/bar.jpg" title="FooBar!" /></div>]
|
176
|
+
|
177
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
|
178
|
+
%q[<div><img src="http://busk.com/images/bar.jpg" title="FooBar!" /></div>]
|
151
179
|
end
|
152
180
|
|
153
181
|
it "parses image tags with html attributes without quotes, based on spaces" do
|
154
182
|
input_html = %q[<div><img src=images/bar.jpg title=FooBar! /></div>]
|
155
183
|
|
156
|
-
Spix::Utils.format_links(:text => input_html).should ==
|
157
|
-
%q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
|
184
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://busk.com/").should ==
|
185
|
+
%q[<div><img src="http://busk.com/images/bar.jpg" title="FooBar!" /></div>]
|
158
186
|
end
|
159
187
|
|
160
188
|
it "parses image tags with html attributes having spaces before or after the equal sign" do
|