spix_parser 1.5.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/spix_parser/config.rb +10 -0
- data/lib/spix_parser/core_ext.rb +117 -0
- data/lib/spix_parser/custom_parsers/atom.rb +36 -0
- data/lib/spix_parser/custom_parsers/atom_entry.rb +28 -0
- data/lib/spix_parser/custom_parsers/enclosure.rb +13 -0
- data/lib/spix_parser/custom_parsers/rss.rb +28 -0
- data/lib/spix_parser/custom_parsers/rss_entry.rb +36 -0
- data/lib/spix_parser/datetime.rb +34 -0
- data/lib/spix_parser/parser.rb +124 -0
- data/lib/spix_parser/tools/feed_discovery.rb +94 -0
- data/lib/spix_parser/tools/redirect_follower.rb +31 -0
- data/lib/spix_parser/version.rb +18 -0
- data/lib/spix_parser/wrappers/enclosure_interface.rb +18 -0
- data/lib/spix_parser/wrappers/entry.rb +87 -0
- data/lib/spix_parser/wrappers/feed.rb +81 -0
- data/lib/spix_parser/wrappers/parsing_error.rb +7 -0
- data/lib/spix_parser.rb +46 -0
- data/spec/parser_spec.rb +6 -0
- data/spec/spix_parser/parser_spec.rb +42 -0
- data/spec/spix_parser/tools/feed_discovery_spec.rb +72 -0
- data/spec/spix_parser/utils_spec.rb +182 -0
- metadata +184 -0
@@ -0,0 +1,117 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
class Time
  # Formats the time for database storage.
  #
  # format - :default returns the original Time#to_s output; any other
  #          value returns "YYYY-MM-DD HH:MM:SS".
  def to_db_format(format=:default)
    format == :default ? to_s_default : strftime("%Y-%m-%d %H:%M:%S").strip
  end
  alias_method :to_s_default, :to_s
  # NOTE(review): outside Rails (where to_formatted_s exists) this globally
  # replaces Time#to_s with to_db_format for the whole process.
  alias_method :to_s, :to_db_format unless method_defined?(:to_formatted_s)

  # Returns true when Time.parse can handle the given string without raising.
  def self.can_parse?(date)
    Time.parse(date)
    true
  rescue StandardError
    false
  end

  # We have a lot of timestamps in Portuguese and Ruby can't parse them.
  # This function translates Portuguese date-related terms to English for
  # correct parsing, because we can't afford so many feed entries with
  # wrong timestamps.
  def self.translate_for_parsing(date_as_string)
    # First, add a leading zero to days of month below 10
    # (e.g. "Segunda, 5 ..." becomes "05 ...").
    formatted_date = date_as_string.sub(/\A[a-zA-Z]+\,\s{1}(\d)[^\d]/, '0\1 ')

    day_names = {"Domingo" => "Sunday", "Segunda" => "Monday", "Terça" => "Tuesday", "Quarta" => "Wednesday",
                 "Quinta" => "Thursday", "Sexta" => "Friday", "Sábado" => "Saturday", "Sabado" => "Saturday"}
    abbr_day_names = {"Dom" => "Sun", "Seg" => "Mon", "Ter" => "Tue", "Qua" => "Wed",
                      "Qui" => "Thu", "Sex" => "Fri", "Sáb" => "Sat", "Sab" => "Sat"}
    month_names = {"Janeiro" => "January", "Fevereiro" => "February", "Março" => "March", "Marco" => "March",
                   "Abril" => "April", "Maio" => "May", "Junho" => "June", "Julho" => "July",
                   "Agosto" => "August", "Setembro" => "September", "Outubro" => "October",
                   "Novembro" => "November", "Dezembro" => "December"}
    abbr_month_names = {"Jan" => "Jan", "Fev" => "Feb", "Abr" => "Apr", "Mai" => "May",
                        "Ago" => "Aug", "Set" => "Sep", "Out" => "Oct", "Dez" => "Dec"}

    # Replace the first occurrence of each term. Full names are applied
    # before abbreviations so e.g. "Sexta" is not clobbered by "Sex".
    [day_names, abbr_day_names, month_names, abbr_month_names].each do |dictionary|
      dictionary.each { |portuguese, english| formatted_date.sub!(portuguese, english) }
    end

    formatted_date
  end
end
|
55
|
+
|
56
|
+
class Object
  # An object is blank when it is empty (if it knows about emptiness)
  # or falsy otherwise.
  def blank?
    if respond_to?(:empty?)
      empty?
    else
      !self
    end
  end

  # Runs the given block with Ruby warnings turned off, restoring the
  # previous $VERBOSE value afterwards even if the block raises.
  # Returns the block's value.
  def silence_warnings
    saved_verbosity = $VERBOSE
    $VERBOSE = nil
    yield
  ensure
    $VERBOSE = saved_verbosity
  end
end
|
68
|
+
|
69
|
+
# nil and false are always blank; true and numbers never are.
class NilClass
  def blank?
    true
  end
end

class FalseClass
  def blank?
    true
  end
end

class TrueClass
  def blank?
    false
  end
end

# Collections are blank exactly when they are empty.
class Array
  alias blank? empty?
end

class Hash
  alias blank? empty?
end

class Numeric
  def blank?
    false
  end
end
|
100
|
+
|
101
|
+
class String
  # SHA1 digest of the string mixed with a fixed salt; used to build
  # stable unique identifiers for feeds and entries.
  def to_sha1
    Digest::SHA1.hexdigest("--17f7e62310d5a2bbb9bfc535b95134ece1cb474d--#{self}")
  end

  # A string is blank when it contains no non-whitespace character.
  def blank?
    self !~ /\S/
  end

  # On Ruby 1.9+ tags the bytes with the encoding configured for the
  # parser; on 1.8 (no Encoding support) returns the string untouched.
  def busk_normalize
    return self if RUBY_VERSION < '1.9'
    force_encoding(Spix::Parser::Config::ENCODING)
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Spix
  module Parser
    # SAX parser for Atom feeds, registered with Feedzirra
    # (see Feedzirra::Feed.add_feed_class in spix_parser.rb).
    class Atom
      include SAXMachine
      include Feedzirra::FeedUtilities

      element :title, :as => :feed_title
      element :subtitle, :as => :feed_subtitle
      element :language, :as => :feed_language
      element :updated, :as => :last_modified
      # <link type="text/html"> points at the site; the atom+xml one at the feed.
      element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
      element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
      elements :link, :as => :links, :value => :href
      elements :entry, :as => :feed_entries, :class => Spix::Parser::AtomEntry

      alias_method :entries, :feed_entries

      # Heuristic Feedzirra uses to pick this parser: the document must
      # mention an Atom MIME type or namespace AND contain a <feed> tag.
      def self.able_to_parse?(xml) #:nodoc:
        (xml =~ /application\/atom\+xml|(#{Regexp.escape("http://www.w3.org/2005/Atom")})|(#{Regexp.escape("http://purl.org/atom")})/) && (xml =~ /\<feed\s/)
      end

      # Falls back to the last <link> when no type="text/html" link was found.
      def url
        @url || links.last
      end

      # Falls back to the first <link> when no atom+xml link was found.
      def feed_url
        @feed_url || links.first
      end

      # Prefers the feed's own <updated> value over Feedzirra's default
      # (presumably derived from HTTP headers — confirm in FeedUtilities).
      def last_modified
        @last_modified.present? ? @last_modified : super
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Spix
  module Parser
    # SAX mapping for a single Atom <entry>, used by Spix::Parser::Atom.
    class AtomEntry
      include SAXMachine
      include Feedzirra::FeedEntryUtilities

      element :title, :as => :entry_title
      element :link, :as => :entry_url, :value => :href, :with => {:rel => "alternate"}
      element :name, :as => :entry_author
      element :content, :as => :entry_content
      element :summary, :as => :entry_summary
      element :published
      element :id
      # Older Atom drafts used <created>/<issued>/<modified>.
      element :created, :as => :published
      element :issued, :as => :published
      element :updated
      element :modified, :as => :updated
      elements :category, :as => :entry_categories, :value => :term

      elements :enclosure, :as => :entry_enclosures, :class => Spix::Parser::Enclosure

      # Media RSS extension elements.
      element :"media:content", :as => :media_content, :value => :url
      element :"media:description", :as => :media_description
      element :"media:thumbnail", :as => :media_thumbnail, :value => :url
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Spix
  module Parser
    # SAX mapping for an <enclosure> element (podcast/media attachments),
    # shared by the Atom and RSS entry parsers.
    class Enclosure
      include SAXMachine
      include Spix::Parser::EnclosureInterface

      element :enclosure, :value => :length, :as => :enclosure_length
      element :enclosure, :value => :type, :as => :enclosure_type
      element :enclosure, :value => :url, :as => :enclosure_url
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Spix
  module Parser
    # SAX parser for RSS/RDF feeds, registered with Feedzirra
    # (see Feedzirra::Feed.add_feed_class in spix_parser.rb).
    class RSS
      include SAXMachine
      include Feedzirra::FeedUtilities

      element :title, :as => :feed_title
      element :description, :as => :feed_subtitle
      element :language, :as => :feed_language
      element :link, :as => :url
      element :pubDate, :as => :last_modified
      elements :item, :as => :feed_entries, :class => Spix::Parser::RSSEntry

      alias_method :entries, :feed_entries

      # RSS has no self link, so the fetching layer assigns the feed url.
      attr_accessor :feed_url

      # NOTE(review): the alternation binds as /\<rss/ OR /rdf/, so any
      # document containing the bare substring "rdf" (plus a <channel>)
      # matches; probably meant /\<(rss|rdf)/ — confirm before changing.
      def self.able_to_parse?(xml) #:nodoc:
        (xml =~ /\<rss|rdf/) && (xml =~ /\<channel/)
      end

      # Prefers the feed's own <pubDate> over Feedzirra's default.
      def last_modified
        @last_modified.present? ? @last_modified : super
      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Spix
  module Parser
    # SAX mapping for a single RSS <item>, used by Spix::Parser::RSS.
    class RSSEntry
      include SAXMachine
      include Feedzirra::FeedEntryUtilities

      element :title, :as => :entry_title
      element :link, :as => :entry_url

      element :author, :as => :entry_author
      element :"dc:creator", :as => :entry_author

      element :"content:encoded", :as => :entry_content
      element :description, :as => :entry_summary
      element :summary, :as => :entry_summary

      # Publication dates come in several dialects (RSS 2.0, Dublin Core).
      element :pubDate, :as => :published
      element :"dc:date", :as => :published
      element :"dc:Date", :as => :published
      element :"dcterms:created", :as => :published

      element :"dcterms:modified", :as => :updated
      element :issued, :as => :published
      elements :category, :as => :entry_categories

      element :guid, :as => :id

      elements :enclosure, :as => :entry_enclosures, :class => Spix::Parser::Enclosure

      # Media RSS extension elements.
      element :"media:content", :as => :media_content, :value => :url
      element :"media:description", :as => :media_description
      element :"media:thumbnail", :as => :media_thumbnail, :value => :url
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Spix
  module Parser
    # Shared timestamp-parsing behaviour mixed into the Feed and
    # FeedEntry wrappers.
    module DateTimeUtilities
      # Parses a raw feed timestamp into a UTC Time.
      #
      # Returns nil for unparseable values, "future" dates (usually
      # mis-parsed non-english dates) and the BASE_TIMESTAMP sentinel.
      def build_datetime_object(timestamp)
        timestamp = normalize_timestamp(timestamp)

        if Time.can_parse?(timestamp)
          #if the timestamp is a non-date string, it will be Time.mktime("1970").utc
          timestamp = Time.parse(timestamp, Spix::Parser::Config::BASE_TIMESTAMP).utc

          # non-english dates sometimes are parsed to "future" dates by Ruby
          # we also cover the case where the timestamp is Time.mktime("1970").utc as explained above
          if (timestamp > Time.now.utc) || (timestamp == Spix::Parser::Config::BASE_TIMESTAMP)
            timestamp = nil
          end
        else
          timestamp = nil
        end

        timestamp
      end

      private
      # Converts the raw value to a normalized, english-language string
      # ready for Time.parse.
      def normalize_timestamp(timestamp)
        # In Ruby 1.9 the date is returned as String
        # In Ruby 1.8 it is returned as Time
        timestamp_string = timestamp.to_s
        Time.translate_for_parsing(timestamp_string.busk_normalize)
      end
    end
  end
end
|
34
|
+
|
@@ -0,0 +1,124 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Spix
  module Parser
    # Fetches and/or parses a feed, returning a wrapped result.
    #
    # url     - a URL (default mode), raw XML (:mode => :local) or a
    #           file path (:mode => :file).
    # options - :mode as above; remaining options are forwarded to
    #           Feedzirra::Feed.fetch_and_parse.
    #
    # Returns a Spix::Parser::Feed, an HTTP status code (Integer) on a
    # fetch failure, or nil when nothing could be fetched/parsed.
    def self.parse(url, options = {})
      feed = case options.delete(:mode)
      when :local
        Feedzirra::Feed.parse(url)
      when :file
        Feedzirra::Feed.parse(File.read(url))
      else
        Feedzirra::Feed.fetch_and_parse(url, options)
      end

      # Feedzirra has some issues with failure cases:
      # If the failure occurs on the parsing phase, then the on_failure callback is triggered.
      # If the failure occurs on the fetching phase (i.e. a network error), then a number is returned.
      # That number may represent an http status code or be 0 in case of other errors.
      # Also, we can't raise an exception on the on_failure callback, 'cause it will be raised even on success - that's really odd.
      # So we need this 'safety net' here until we patch it to use an uniform error architecture.
      # (Integer instead of the deprecated Fixnum; on 1.8 Fixnum < Integer.)
      if feed.nil? || (feed.is_a?(Integer) && feed == 0)
        Log.error("The parser couldn't fetch the feed at #{url}")
        return nil
      elsif feed.is_a?(Integer)
        feed
      else
        Spix::Parser::Feed.new(feed)
      end
    end
  end

  # HTML post-processing helpers applied to feed content before display.
  module Utils
    extend self

    # Rewrites <a> and <img> tags inside options[:text], whitelisting
    # attributes, forcing external-link behaviour and absolutizing
    # relative image sources against options[:site_url].
    # Mutates and returns the text.
    def format_links(options)
      text = options[:text]
      site_url = options[:site_url]

      parse_links(text)
      parse_images(text, site_url)

      text
    end

    private

    # Serializes an attribute hash into key="value" pairs, skipping
    # blank values and escaping embedded double quotes.
    def join_attributes(attrs)
      attrs.map do |attr, value|
        %Q[#{attr}="#{value.to_s.gsub(/"/, "&quot;")}"] unless value.blank?
      end.compact.join(" ")
    end

    # Extracts html attributes from a tag body into a hash with
    # downcased keys. Handles quoted and unquoted attribute values.
    def parse_attrs(str)
      attrs = {}
      # Guard against nil and anything else that can't be scanned.
      # (The original `unless str || str.respond_to?(:scan)` guard was
      # inverted and crashed on truthy non-string input.)
      return attrs unless str.respond_to?(:scan)

      match_by_spaces = str !~ /'|"/
      if match_by_spaces
        # Make sure to match the last html attribute.
        str += " "
        value_regexp = /\s*(.*?)\s/
      else
        value_regexp = /\s*["'](.*?)["']/
      end
      attribute_regexp = /\b([a-zA-Z0-9:]+)\s*/

      str.scan(/#{attribute_regexp}=#{value_regexp}/im) do
        attrs[$1.to_s.downcase] = $2
      end

      attrs
    end

    # Rewrites every <a> tag keeping only href/title and forcing
    # target="_blank" rel="external nofollow". Mutates text in place.
    def parse_links(text)
      text.gsub!(/(<a\s+([^>]+)>)/uim) do |match|
        attrs = parse_attrs($2.to_s)

        # just parse these attributes
        attrs = {
          :href => attrs["href"],
          :title => attrs["title"],
          :target => "_blank",
          :rel => "external nofollow"
        }

        "<a #{join_attributes(attrs)}>"
      end
    end

    # Rewrites every <img> tag keeping a whitelist of attributes and
    # resolving a relative src against site_url. Tags without a usable
    # src are dropped entirely. Mutates text in place.
    def parse_images(text, site_url)
      text.gsub!(/(<img(.*?)\/?>)/uim) do |match|
        attrs = parse_attrs($2.to_s)

        # just parse these attributes
        attrs = {
          :src => parse_relative_image_source(attrs["src"], site_url),
          :alt => attrs["alt"],
          :title => attrs["title"],
          :style => attrs["style"],
          :width => attrs["width"],
          :height => attrs["height"]
        }

        "<img #{join_attributes(attrs)} />" if attrs[:src].present?
      end
    end

    # Resolves a possibly-relative image src against the site url,
    # falling back to manual concatenation when URI refuses to parse it.
    def parse_relative_image_source(src, site_url)
      if src.present? && site_url
        begin
          src = URI.parse(src)
          src = URI.parse(site_url).merge(src) if src.relative?
        rescue URI::InvalidURIError
          # Manually concatenating if it is "relative uri", stripping slashes.
          if src !~ /\A(https?|ftp):\/\//
            site_url = site_url[0..-2] if site_url[-1] == ?/
            src = src[1..-1] if src[0] == ?/
            src = "#{site_url}/#{src}"
          end
        end
      end
      src
    end
  end
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
gem "feedzirra", ">=0.0.24"
require "feedzirra"
require "nokogiri"
require "uri"
require "open-uri"

module Spix
  # Discovers feed URLs advertised by an HTML page via <link>
  # autodiscovery tags.
  class FeedDiscovery

    # HTTP "User-Agent" header to send to servers when downloading feeds.
    USER_AGENT = "SpixParser"

    # True when the given URI parses as a feed.
    def self.feed?(uri)
      Spix::Parser.parse(uri, :mode => :fetch) ? true : false
    end

    # Returns an array of {:title, :url} hashes, one per feed linked
    # from the page at `uri`; when no autodiscovery links exist, treats
    # `uri` itself as a possible feed. Returns nil on any error.
    def self.list(uri)

      content = self.read(uri)

      doc = Nokogiri::HTML(content)

      # get page title
      title = doc.search('title')[0].content

      items = doc.search("//link[@type='application/atom+xml']", "//link[@type='application/rss+xml']").collect do |link|
        url_object = URI::parse(uri).normalize

        href = link.get_attribute(:href).to_s

        feed_url_object = URI::parse(href)

        if feed_url_object.relative?

          # there's 2 types of relative URIs
          # the ones based on a path (base: http://sitewithfeed.com/foo/, relative: feed.xml, feed: http://sitewithfeed.com/foo/feed.xml)
          # and the ones based on the top domain (base: http://sitewithfeed.com/foo/, relative: /feed.xml, feed: http://sitewithfeed.com/feed.xml)
          if feed_url_object.path.match(/^\//)
            # when the feed_url_object is relative and starts with a "/" we should ignore the domain path
            path = nil
          else
            # when the feed_url_object is relative and do not starts with a "/" we should use the domain path

            if url_object.path.match(/\/$/)
              # when the url_object ends with a "/" we should use it
              path = url_object.path
            else
              # when the url_object do not ends with a "/" we should add it
              path = url_object.path + "/"
            end
          end

          # NOTE(review): the query string is appended without a "?" and
          # *before* the relative href — looks wrong for base URLs that
          # carry a query; confirm intended behaviour.
          href = "#{url_object.scheme}://" +
            "#{url_object.host}" +
            "#{path}" +
            "#{url_object.query}" +
            href
        end

        # The assignment is the block's last expression, so the hash is
        # what collect accumulates.
        item = {
          :title => link.get_attribute(:title) || title,
          :url => href
        }

      end

      if items.size == 0
        # if there's no item found at the given URI, maybe it's a feed URI
        if self.feed?(uri)
          items = [
            {
              :title => title,
              :url => uri
            }
          ]
        end
      end

      items
    # NOTE(review): bare method-level rescue silently maps *every*
    # StandardError (parse errors, network errors, bugs) to nil.
    rescue
      nil
    end

    # Reads the content at uri; accepts anything responding to #read
    # (e.g. an already-open IO) or a URL string fetched with open-uri.
    def self.read(uri)
      if uri.respond_to?(:read)
        content = uri.read
      else
        req_headers = {}
        req_headers["User-Agent"] = USER_AGENT
        content = open(uri, req_headers).read
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'net/http'
require 'thread'
require 'timeout'
|
3
|
+
|
4
|
+
# Resolves HTTP redirects for a URL with a hard 5 second timeout.
class RedirectFollower
  # Returns the redirect target when the URL answers with a 3xx
  # response; otherwise (timeout, invalid URL, non-redirect response)
  # returns the original url.
  def self.resolve(url)
    # Local instead of the original class-level @response: a class ivar
    # is shared across concurrent callers and is not thread-safe.
    response = ""
    begin
      # Timeout.timeout instead of the long-deprecated Kernel#timeout
      # (removed in modern Rubies).
      Timeout.timeout(5) do
        t = Thread.new { response = Net::HTTP.get_response(URI.parse(url)) }
        t.join

        if response.kind_of?(Net::HTTPRedirection)
          return redirect_url(response)
        end
      end
    rescue Timeout::Error, URI::InvalidURIError
      return url
    end

    url
  end

  protected

  # Extracts the redirect target from the Location header, falling back
  # to the first anchor href in the response body.
  # NOTE(review): `protected` has no effect on singleton (self.) methods,
  # so this is effectively public; kept as-is for compatibility.
  def self.redirect_url(response)
    if response['location'].nil?
      response.body.match(/<a href=\"([^>]+)\">/i)[1]
    else
      response['location']
    end
  end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Spix
  module Parser
    # Wraps a raw parser entry (AtomEntry/RSSEntry), exposing sanitized,
    # encoding-normalized accessors. `feed` is the owning wrapper Feed
    # (used as a fallback source for the entry url).
    class FeedEntry
      include Spix::Parser::DateTimeUtilities
      include Memoizable

      def initialize(entry, feed)
        @feed = feed
        @entry = entry
      end

      # Sanitized entry title; falls back to a placeholder.
      def title
        text = @entry.entry_title || "(title unknow)"
        text = text.busk_normalize
        Sanitizer.sanitize(text)
      end
      memoize(:title)

      # Sanitized summary. NOTE(review): assumes the first two Sanitizer
      # calls mutate `text` in place (their return values are discarded)
      # — confirm against Sanitizer's implementation.
      def summary
        text = @entry.entry_summary || ""
        text = text.busk_normalize
        Sanitizer.strip_comments(text)
        Sanitizer.strip_disallowed_tags(text)
        Sanitizer.entities_to_chars(text)
      end
      memoize(:summary)

      # Entry permalink with redirects resolved; falls back to the
      # feed's site url. Nil when neither is present.
      def url
        entry_url = @entry.entry_url || @feed.site_url
        RedirectFollower.resolve(entry_url).busk_normalize if entry_url.present?
      end
      memoize(:url)

      def author
        text = @entry.entry_author || ""
        Sanitizer.sanitize(text.busk_normalize)
      end

      # UTC publication time, or nil when absent/unparseable
      # (see DateTimeUtilities#build_datetime_object).
      def published_at
        build_datetime_object(@entry.published) if @entry.published
      end

      def updated_at
        build_datetime_object(@entry.updated) if @entry.updated
      end

      # Stable unique id: SHA1 of url + title + a content prefix.
      def uid
        uid = self.url || ""
        uid += self.title.downcase.busk_normalize
        uid += self.striped_content.downcase.busk_normalize[0..25]
        uid.to_sha1
      end
      memoize(:uid)

      # Sanitized full content. Same in-place-mutation assumption as
      # #summary above.
      def content
        text = encoded_raw_content
        Sanitizer.strip_comments(text)
        Sanitizer.strip_disallowed_tags(text)
        Sanitizer.entities_to_chars(text)
      end
      memoize(:content)

      # Content with all html tags stripped (sic: "striped").
      def striped_content
        text = encoded_raw_content
        Sanitizer.strip_tags(text)
      end
      memoize(:striped_content)

      def categories
        @entry.entry_categories.map do |category|
          Sanitizer.sanitize(category.busk_normalize)
        end
      end

      def enclosures
        @entry.entry_enclosures
      end

      private
      # Raw content, preferring full content over the summary.
      def encoded_raw_content
        text = @entry.entry_content || @entry.entry_summary || ""
        text.busk_normalize
      end
    end
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Spix
  module Parser
    # Wraps a Feedzirra-parsed feed with sanitized accessors and a
    # timestamp sanity check applied at construction time.
    class Feed
      include Spix::Parser::DateTimeUtilities

      def initialize(parsed_feed)
        @feed = parsed_feed
        verify_entries_timestamps
      end

      # Sanitized feed title; falls back to a placeholder.
      def title
        text = @feed.feed_title || "(title unknow)"
        text = text.busk_normalize
        Sanitizer.sanitize(text)
      end

      def subtitle
        text = @feed.feed_subtitle || ""
        text = text.busk_normalize
        Sanitizer.sanitize(text)
      end

      # Feed language code, defaulting to english.
      def language
        text = @feed.feed_language || "en"
        text = text.busk_normalize
        Sanitizer.sanitize(text)
      end

      def site_url
        @feed.url || extract_site_from_feed_url
      end

      def feed_url
        @feed.feed_url
      end

      # Unique id derived from the feed url.
      def uid
        @feed.feed_url.to_sha1
      end

      # NOTE(review): @feed.feed_entries are *raw* parser entries, which
      # expose #published (not #published_at), and #first may be nil for
      # an empty feed — verify this fallback chain actually works.
      def updated_at
        timestamp = @feed.last_modified || @feed.feed_entries.first.published_at || Time.now.utc
        build_datetime_object(timestamp)
      end

      def feed_items
        # If this is not a valid feed, the feed_entries accessor does not exist
        if @feed.respond_to?(:feed_entries) && @feed.feed_entries.present?
          @feed.feed_entries.map{|entry| Spix::Parser::FeedEntry.new(entry, self)}
        else
          []
        end
      end

      private
      def verify_entries_timestamps
        # Some feeds return the timestamps of all entries as the timestamp of the request
        # This means that the timestamp will change everytime we parse the feed, thus duplicating entries
        # One way to detect that is to verify if all the entries have the same timestamp
        items_with_same_timestamp = feed_items.map{|i| i.published_at}.uniq.size == 1
        more_than_one_item = feed_items.count > 1

        if items_with_same_timestamp && more_than_one_item
          @feed.feed_entries.each {|item| item.published = Spix::Parser::Config::BASE_TIMESTAMP.to_s}
        end
      end

      def extract_site_from_feed_url
        # Eventually, we run into a feed that for some reason does not include
        # the publisher website. In those cases, we try to guess the website
        # root path looking at the feed_url. It may fail also, so be mindful.
        return unless @feed.feed_url.present?
        feed_host = URI.parse(@feed.feed_url).host

        "http://#{feed_host}"
      end

    end
  end
end
|
data/lib/spix_parser.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
# encoding: utf-8
# Gem entry point: loads dependencies, the library files, configures
# encodings, registers the custom parsers and sets up logging.
require "rubygems"
require "feedzirra"
require "digest/sha1"
require "zlib"
require "logger"
require "cgi"
require "memoizable"

# NOTE(review): pushes ../../lib relative to this file onto the load
# path — one level higher than the usual ../lib; confirm intentional.
$:.unshift(File.dirname(__FILE__) + '/../../lib')

require "spix_parser/version"
require "spix_parser/core_ext"
require "spix_parser/config"
require "spix_parser/parser"
require "spix_parser/datetime"

require "spix_parser/tools/redirect_follower"

require "spix_parser/wrappers/entry"
require "spix_parser/wrappers/enclosure_interface"
require "spix_parser/wrappers/feed"
require "spix_parser/wrappers/parsing_error"

require "spix_parser/custom_parsers/enclosure"
require "spix_parser/custom_parsers/atom_entry"
require "spix_parser/custom_parsers/atom"
require "spix_parser/custom_parsers/rss_entry"
require "spix_parser/custom_parsers/rss"

require "spix_parser/tools/feed_discovery"

# Force UTF-8 handling on both 1.8 ($KCODE) and 1.9+ (Encoding).
if RUBY_VERSION < '1.9'
  $KCODE='u'
else
  Encoding.default_internal = Encoding::UTF_8
  Encoding.default_external = Encoding::UTF_8
end

# Register the custom SAX parsers so Feedzirra prefers them.
Feedzirra::Feed.add_feed_class(Spix::Parser::RSS)
Feedzirra::Feed.add_feed_class(Spix::Parser::Atom)

# Start the log over whenever the log exceeds 100 megabytes in size.
# NOTE(review): hard-coded path — requires /var/log/spix to exist and be
# writable by the process, which makes the gem fail to load elsewhere.
Log = Logger.new('/var/log/spix/spix_parser.log', 0, 100 * 1024 * 1024)
Log.level = Logger::ERROR
Log.datetime_format = "%d-%m-%Y %H:%M:%S"
|
data/spec/parser_spec.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
# encoding: utf-8
# Specs for Spix::Parser covering the :file (path) and :local (raw XML)
# parsing modes against atom/rss fixtures.
require 'spec_helper'

describe Spix::Parser do
  describe "atom parsing" do
    it 'should parse from a file path' do
      feed = Spix::Parser.parse(fixture('feed.atom'), :mode => :file)
      feed.should_not be_nil
      feed.feed_items.should have(1).item
    end

    it 'should parse from a file' do
      feed = Spix::Parser.parse(load_fixture('feed.atom'), :mode => :local)
      feed.should_not be_nil
      feed.feed_items.should have(1).item
    end
  end

  describe "rss parsing" do
    it 'should parse from a file path' do
      feed = Spix::Parser.parse(fixture('feed.rss'), :mode => :file)
      feed.should_not be_nil
      feed.feed_items.should have(9).item
    end

    it 'should parse from a file' do
      feed = Spix::Parser.parse(load_fixture('feed.rss'), :mode => :local)
      feed.should_not be_nil
      feed.feed_items.should have(9).item
    end

    it 'should parse a feed from meioemensagem.com' do
      # NOTE(review): `url` is unused — the example parses the fixture,
      # not the live URL.
      url = 'http://www.meioemensagem.com.br/home/rss/geral.xml'
      feed = Spix::Parser.parse(load_fixture('meioemensagem.xml'), :mode => :local)

      feed.should_not be_nil
      feed.title.should == "RSS: Notícias Gerais"
      feed.feed_items[0].title.should == "Cielo volta à mídia com o cantor Fiuk"
    end
  end

end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'spec_helper'

# Specs for HTML feed autodiscovery. All HTTP access is stubbed via
# FakeWeb-registered fixture bodies.
describe Spix::FeedDiscovery, "#list" do

  before(:all) do
    @domain_url = "http://sitewithfeed.com"
  end

  describe "when the feed have an absolute URI" do
    it "should return the feed url" do
      FakeWeb.register_uri(:get, @domain_url, :body => load_fixture("absolute_uri.html"))
      Spix::FeedDiscovery.list(@domain_url).first[:url].should == "http://diveintomark.org/tests/client/autodiscovery/html4-001.xml"
    end
  end

  describe "when the feed have a relative URI" do
    describe "which is relative to a path" do
      it "should return the feed url when the URI is at the top domain" do
        FakeWeb.register_uri(:get, @domain_url, :body => load_fixture("relative_uri.html"))
        Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/" + "html4-002.xml"
      end

      it "should return the feed url when the URI is inside a path" do
        @path_url = "/foo/bar"
        @feed_url = @domain_url + @path_url

        FakeWeb.register_uri(:get, @feed_url, :body => load_fixture("relative_uri.html"))
        Spix::FeedDiscovery.list(@feed_url).first[:url].should == @feed_url + "/" + "html4-002.xml"
      end
    end

    describe "which is relative to the top domain" do
      it "should return the feed url when the URI is at the top domain" do
        FakeWeb.register_uri(:get, @domain_url, :body => load_fixture("relative_uri_top_domain.html"))
        Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
      end

      it "should return the feed url when the URI is inside a path" do
        @path_url = "/foo/bar"
        @feed_url = @domain_url + @path_url

        FakeWeb.register_uri(:get, @feed_url, :body => load_fixture("relative_uri_top_domain.html"))
        Spix::FeedDiscovery.list(@feed_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
      end
    end
  end

  describe "when the URI is a feed" do
    before(:all) do
      @path_url = "/feed.xml"
      @feed_url = @domain_url + @path_url
    end

    it "should return the extracted url when there's a link at the feed" do
      FakeWeb.register_uri(:get, @feed_url, :body => load_fixture("feed_with_self_link.xml"))
      Spix::FeedDiscovery.list(@feed_url).first[:url].should == "http://diveintomark.org/tests/client/autodiscovery/html4-001.xml"
    end

    it "should return the same url when there's no link at the feed" do
      fixture = load_fixture("feed_without_self_link.xml")

      FakeWeb.register_uri(:get, @feed_url, :body => fixture)

      # feedzirra doesn't work with fakeweb
      feed_xml = fixture
      feed = Feedzirra::Feed.parse(feed_xml)
      Feedzirra::Feed.stub!(:fetch_and_parse).and_return(feed)

      Spix::FeedDiscovery.list(@feed_url).first[:url].should == @feed_url
    end
  end
end
|
@@ -0,0 +1,182 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe Spix::Utils do
|
5
|
+
describe ".format_links" do
|
6
|
+
context "html containing links" do
|
7
|
+
it "parsers links in the given html string adding rel and target" do
|
8
|
+
input_html = %q[<div><a href="foo/bar.html" title="FooBar!">FooBar!</a></div>]
|
9
|
+
|
10
|
+
Spix::Utils.format_links(:text => input_html).should ==
|
11
|
+
%q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
12
|
+
end
|
13
|
+
|
14
|
+
it "parses links removing other html attributes" do
|
15
|
+
input_html = %q[<div><a href="foo/bar.html" title="FooBar!" style="color: red" invalid="test">FooBar!</a></div>]
|
16
|
+
|
17
|
+
Spix::Utils.format_links(:text => input_html).should ==
|
18
|
+
%q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
19
|
+
end
|
20
|
+
|
21
|
+
it "parses links with simple quotes" do
|
22
|
+
input_html = %q[<div><a href='foo/bar.html' title='FooBar!'>FooBar!</a></div>]
|
23
|
+
|
24
|
+
Spix::Utils.format_links(:text => input_html).should ==
|
25
|
+
%q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
26
|
+
end
|
27
|
+
|
28
|
+
# TODO: should we strip these extra " ?
|
29
|
+
it "parses links with html escaped quote (")" do
|
30
|
+
input_html = %q[<div><a href="foo/bar.html" title="FooBar!">FooBar!</a></div>]
|
31
|
+
|
32
|
+
Spix::Utils.format_links(:text => input_html).should ==
|
33
|
+
%q[<div><a href=""foo/bar.html"" title=""FooBar!"" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
34
|
+
end
|
35
|
+
|
36
|
+
it "parses links with html attributes without quotes, based on spaces" do
|
37
|
+
input_html = %q[<div><a href=foo/bar.html title=FooBar!>FooBar!</a></div>]
|
38
|
+
|
39
|
+
Spix::Utils.format_links(:text => input_html).should ==
|
40
|
+
%q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
41
|
+
end
|
42
|
+
|
43
|
+
it "parses links with html attributes having spaces before or after the equal sign" do
|
44
|
+
input_html = %q[<div><a href = foo/bar.html title = FooBar!>FooBar!</a></div>]
|
45
|
+
|
46
|
+
Spix::Utils.format_links(:text => input_html).should ==
|
47
|
+
%q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
48
|
+
end
|
49
|
+
|
50
|
+
it "parses links downcasing attribute names" do
|
51
|
+
input_html = %q[<div><a HREF="foo/bar.html" TITLE="FooBar!">FooBar!</a></div>]
|
52
|
+
|
53
|
+
Spix::Utils.format_links(:text => input_html).should ==
|
54
|
+
%q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
55
|
+
end
|
56
|
+
|
57
|
+
it "parses links ignoring blank attributes" do
|
58
|
+
input_html = %q[<div><a href="foo/bar.html" title="">FooBar!</a></div>]
|
59
|
+
|
60
|
+
Spix::Utils.format_links(:text => input_html).should ==
|
61
|
+
%q[<div><a href="foo/bar.html" target="_blank" rel="external nofollow">FooBar!</a></div>]
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
context "html containing images" do
|
66
|
+
it "parsers images in the given html string matching default attributes (src, style, alt, title, width and height)" do
|
67
|
+
input_html = %q[<div><img src="images/bar.jpg" title="FooBar!" alt="FooBar!" width="100" height="200" /></div>]
|
68
|
+
|
69
|
+
Spix::Utils.format_links(:text => input_html).should ==
|
70
|
+
%q[<div><img src="images/bar.jpg" alt="FooBar!" title="FooBar!" width="100" height="200" /></div>]
|
71
|
+
end
|
72
|
+
|
73
|
+
it "parses image tags removing other invalid html attributes" do
|
74
|
+
input_html = %q[<div><img src="images/bar.jpg" alt="FooBar!" style="color:red" invalid="test" target="_blank" /></div>]
|
75
|
+
|
76
|
+
Spix::Utils.format_links(:text => input_html).should ==
|
77
|
+
%q[<div><img src="images/bar.jpg" alt="FooBar!" style="color:red" /></div>]
|
78
|
+
end
|
79
|
+
|
80
|
+
it "parses image tags appending the given site url to relative images" do
|
81
|
+
input_html = %q[<div><img src="images/bar.jpg" /></div>]
|
82
|
+
|
83
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
|
84
|
+
%q[<div><img src="http://example.com/images/bar.jpg" /></div>]
|
85
|
+
end
|
86
|
+
|
87
|
+
it "parses image tags having relative sources with invalid URI, appending the site url" do
|
88
|
+
input_html = %q[<div><img src="images/radiação.jpg" /></div>]
|
89
|
+
|
90
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
|
91
|
+
%q[<div><img src="http://example.com/images/radiação.jpg" /></div>]
|
92
|
+
end
|
93
|
+
|
94
|
+
it "parses image tags having relative sources starting with / and with invalid URI, appending the site url" do
|
95
|
+
input_html = %q[<div><img src="/images/radiação.jpg" /></div>]
|
96
|
+
|
97
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
|
98
|
+
%q[<div><img src="http://example.com/images/radiação.jpg" /></div>]
|
99
|
+
end
|
100
|
+
|
101
|
+
it "parses image tags having relative sources starting with / and with invalid URI, appending the site url also ending with /" do
|
102
|
+
input_html = %q[<div><img src="/images/radiação.jpg" /></div>]
|
103
|
+
|
104
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com/").should ==
|
105
|
+
%q[<div><img src="http://example.com/images/radiação.jpg" /></div>]
|
106
|
+
end
|
107
|
+
|
108
|
+
%w(http https ftp).each do |scheme|
|
109
|
+
it "parses image tags having absolute sources with #{scheme} and invalid URI" do
|
110
|
+
input_html = %Q[<div><img src="#{scheme}://example.com/images/radiação.jpg" /></div>]
|
111
|
+
|
112
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
|
113
|
+
%Q[<div><img src="#{scheme}://example.com/images/radiação.jpg" /></div>]
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
it "parses image tags having sources with spaces but using quotes" do
|
118
|
+
input_html = %q[<div><img src="images/foo bar.jpg" /></div>]
|
119
|
+
|
120
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
|
121
|
+
%q[<div><img src="http://example.com/images/foo bar.jpg" /></div>]
|
122
|
+
end
|
123
|
+
|
124
|
+
it "parses image tags having style attributes with spaces" do
|
125
|
+
input_html = %q[<div><img src="images/foobar.jpg" style="color: blue;" /></div>]
|
126
|
+
|
127
|
+
Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
|
128
|
+
%q[<div><img src="http://example.com/images/foobar.jpg" style="color: blue;" /></div>]
|
129
|
+
end
|
130
|
+
|
131
|
+
it "parses image tags ignoring images with empty sources" do
|
132
|
+
input_html = %q[<div><img src="" title="FooBar!" /></div>]
|
133
|
+
|
134
|
+
Spix::Utils.format_links(:text => input_html).should ==
|
135
|
+
%q[<div></div>]
|
136
|
+
end
|
137
|
+
|
138
|
+
it "parses image tags with simple quotes" do
|
139
|
+
input_html = %q[<div><img src='images/bar.jpg' title='FooBar!' /></div>]
|
140
|
+
|
141
|
+
Spix::Utils.format_links(:text => input_html).should ==
|
142
|
+
%q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
|
143
|
+
end
|
144
|
+
|
145
|
+
# TODO: should we strip these extra " ?
|
146
|
+
it "parses image tags with html escaped quote (")" do
|
147
|
+
input_html = %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
|
148
|
+
|
149
|
+
Spix::Utils.format_links(:text => input_html).should ==
|
150
|
+
%q[<div><img src=""images/bar.jpg"" title=""FooBar!"" /></div>]
|
151
|
+
end
|
152
|
+
|
153
|
+
it "parses image tags with html attributes without quotes, based on spaces" do
|
154
|
+
input_html = %q[<div><img src=images/bar.jpg title=FooBar! /></div>]
|
155
|
+
|
156
|
+
Spix::Utils.format_links(:text => input_html).should ==
|
157
|
+
%q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
|
158
|
+
end
|
159
|
+
|
160
|
+
it "parses image tags with html attributes having spaces before or after the equal sign" do
|
161
|
+
input_html = %q[<div><img src = images/bar.jpg title = FooBar! /></div>]
|
162
|
+
|
163
|
+
Spix::Utils.format_links(:text => input_html).should ==
|
164
|
+
%q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
|
165
|
+
end
|
166
|
+
|
167
|
+
it "parses image tags downcasing attribute names" do
|
168
|
+
input_html = %q[<div><img SRC="images/bar.jpg" TITLE="FooBar!" /></div>]
|
169
|
+
|
170
|
+
Spix::Utils.format_links(:text => input_html).should ==
|
171
|
+
%q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
|
172
|
+
end
|
173
|
+
|
174
|
+
it "parses image tags ignoring empty attributes" do
|
175
|
+
input_html = %q[<div><img src="images/bar.jpg" title="" /></div>]
|
176
|
+
|
177
|
+
Spix::Utils.format_links(:text => input_html).should ==
|
178
|
+
%q[<div><img src="images/bar.jpg" /></div>]
|
179
|
+
end
|
180
|
+
end
|
181
|
+
end
|
182
|
+
end
|
metadata
ADDED
@@ -0,0 +1,184 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: spix_parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 7
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 1
|
8
|
+
- 5
|
9
|
+
- 2
|
10
|
+
version: 1.5.2
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Marcelo Eden
|
14
|
+
- Fabio Mont'Alegre
|
15
|
+
- "Lucas H\xC3\xBAngaro"
|
16
|
+
- Luiz Rocha
|
17
|
+
autorequire:
|
18
|
+
bindir: bin
|
19
|
+
cert_chain: []
|
20
|
+
|
21
|
+
date: 2011-05-12 00:00:00 -03:00
|
22
|
+
default_executable:
|
23
|
+
dependencies:
|
24
|
+
- !ruby/object:Gem::Dependency
|
25
|
+
name: feedzirra
|
26
|
+
prerelease: false
|
27
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ~>
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
hash: 47
|
33
|
+
segments:
|
34
|
+
- 0
|
35
|
+
- 0
|
36
|
+
- 24
|
37
|
+
version: 0.0.24
|
38
|
+
type: :runtime
|
39
|
+
version_requirements: *id001
|
40
|
+
- !ruby/object:Gem::Dependency
|
41
|
+
name: memoizable
|
42
|
+
prerelease: false
|
43
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
44
|
+
none: false
|
45
|
+
requirements:
|
46
|
+
- - ">="
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
hash: 27
|
49
|
+
segments:
|
50
|
+
- 0
|
51
|
+
- 1
|
52
|
+
- 0
|
53
|
+
version: 0.1.0
|
54
|
+
type: :runtime
|
55
|
+
version_requirements: *id002
|
56
|
+
- !ruby/object:Gem::Dependency
|
57
|
+
name: sanitizer
|
58
|
+
prerelease: false
|
59
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
60
|
+
none: false
|
61
|
+
requirements:
|
62
|
+
- - ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
hash: 21
|
65
|
+
segments:
|
66
|
+
- 0
|
67
|
+
- 1
|
68
|
+
- 7
|
69
|
+
version: 0.1.7
|
70
|
+
type: :runtime
|
71
|
+
version_requirements: *id003
|
72
|
+
- !ruby/object:Gem::Dependency
|
73
|
+
name: i18n
|
74
|
+
prerelease: false
|
75
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
76
|
+
none: false
|
77
|
+
requirements:
|
78
|
+
- - ">="
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
hash: 25
|
81
|
+
segments:
|
82
|
+
- 0
|
83
|
+
- 1
|
84
|
+
- 1
|
85
|
+
version: 0.1.1
|
86
|
+
type: :runtime
|
87
|
+
version_requirements: *id004
|
88
|
+
- !ruby/object:Gem::Dependency
|
89
|
+
name: rspec
|
90
|
+
prerelease: false
|
91
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
92
|
+
none: false
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
hash: 3
|
97
|
+
segments:
|
98
|
+
- 0
|
99
|
+
version: "0"
|
100
|
+
type: :development
|
101
|
+
version_requirements: *id005
|
102
|
+
- !ruby/object:Gem::Dependency
|
103
|
+
name: fakeweb
|
104
|
+
prerelease: false
|
105
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
106
|
+
none: false
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
hash: 3
|
111
|
+
segments:
|
112
|
+
- 0
|
113
|
+
version: "0"
|
114
|
+
type: :development
|
115
|
+
version_requirements: *id006
|
116
|
+
description: A feed parser wrapper for Spix
|
117
|
+
email: busk@busk.com
|
118
|
+
executables: []
|
119
|
+
|
120
|
+
extensions: []
|
121
|
+
|
122
|
+
extra_rdoc_files: []
|
123
|
+
|
124
|
+
files:
|
125
|
+
- lib/spix_parser/config.rb
|
126
|
+
- lib/spix_parser/core_ext.rb
|
127
|
+
- lib/spix_parser/custom_parsers/atom.rb
|
128
|
+
- lib/spix_parser/custom_parsers/atom_entry.rb
|
129
|
+
- lib/spix_parser/custom_parsers/enclosure.rb
|
130
|
+
- lib/spix_parser/custom_parsers/rss.rb
|
131
|
+
- lib/spix_parser/custom_parsers/rss_entry.rb
|
132
|
+
- lib/spix_parser/datetime.rb
|
133
|
+
- lib/spix_parser/parser.rb
|
134
|
+
- lib/spix_parser/tools/feed_discovery.rb
|
135
|
+
- lib/spix_parser/tools/redirect_follower.rb
|
136
|
+
- lib/spix_parser/version.rb
|
137
|
+
- lib/spix_parser/wrappers/enclosure_interface.rb
|
138
|
+
- lib/spix_parser/wrappers/entry.rb
|
139
|
+
- lib/spix_parser/wrappers/feed.rb
|
140
|
+
- lib/spix_parser/wrappers/parsing_error.rb
|
141
|
+
- lib/spix_parser.rb
|
142
|
+
- spec/parser_spec.rb
|
143
|
+
- spec/spix_parser/parser_spec.rb
|
144
|
+
- spec/spix_parser/tools/feed_discovery_spec.rb
|
145
|
+
- spec/spix_parser/utils_spec.rb
|
146
|
+
has_rdoc: true
|
147
|
+
homepage: http://github.com/busk/spix_parser
|
148
|
+
licenses: []
|
149
|
+
|
150
|
+
post_install_message:
|
151
|
+
rdoc_options: []
|
152
|
+
|
153
|
+
require_paths:
|
154
|
+
- lib
|
155
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
156
|
+
none: false
|
157
|
+
requirements:
|
158
|
+
- - ">="
|
159
|
+
- !ruby/object:Gem::Version
|
160
|
+
hash: 3
|
161
|
+
segments:
|
162
|
+
- 0
|
163
|
+
version: "0"
|
164
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
165
|
+
none: false
|
166
|
+
requirements:
|
167
|
+
- - ">="
|
168
|
+
- !ruby/object:Gem::Version
|
169
|
+
hash: 3
|
170
|
+
segments:
|
171
|
+
- 0
|
172
|
+
version: "0"
|
173
|
+
requirements: []
|
174
|
+
|
175
|
+
rubyforge_project:
|
176
|
+
rubygems_version: 1.6.2
|
177
|
+
signing_key:
|
178
|
+
specification_version: 3
|
179
|
+
summary: FeedParser for Spix
|
180
|
+
test_files:
|
181
|
+
- spec/parser_spec.rb
|
182
|
+
- spec/spix_parser/parser_spec.rb
|
183
|
+
- spec/spix_parser/tools/feed_discovery_spec.rb
|
184
|
+
- spec/spix_parser/utils_spec.rb
|