spix_parser 1.5.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,10 @@
# encoding: utf-8

# Shared configuration constants for the Spix parser.
module Spix
  module Parser
    module Config
      # Canonical string encoding for all parsed feed content.
      ENCODING = "UTF-8"

      # Sentinel timestamp (Unix epoch, converted to UTC) used to detect
      # entries whose date could not be parsed into a meaningful value.
      BASE_TIMESTAMP = Time.mktime("1970").utc
    end
  end
end
@@ -0,0 +1,117 @@
# encoding: utf-8

# Core extensions to Time used by the feed parser:
# - a DB-friendly string format,
# - a safe "can this string be parsed?" predicate,
# - Portuguese -> English translation of date terms so Time.parse works.
class Time
  # Returns the time formatted for database storage. :default keeps
  # Ruby's native Time#to_s; any other value uses "YYYY-MM-DD HH:MM:SS".
  def to_db_format(format=:default)
    format == :default ? to_s_default : strftime("%Y-%m-%d %H:%M:%S").strip
  end
  alias_method :to_s_default, :to_s
  # Only hijack #to_s when ActiveSupport (which defines #to_formatted_s)
  # is not already doing so.
  alias_method :to_s, :to_db_format unless method_defined?(:to_formatted_s)

  # True when Time.parse can make sense of +date+, false otherwise.
  # Never raises: any StandardError from parsing means "no".
  # (Rewritten with a method-level rescue; the previous begin/end block and
  # unused exception variable were unnecessary.)
  def self.can_parse?(date)
    Time.parse(date)
    true
  rescue StandardError
    false
  end

  # We have a lot of timestamps in portuguese and Ruby can't parse'em
  # So, this function will translate portuguese date related terms to english for correct parsing,
  # because we can't afford so many feed entries with wrong timestamps
  def self.translate_for_parsing(date_as_string)
    # First, add leading zero to days of month below 10
    formatted_date = date_as_string.sub(/\A[a-zA-Z]+\,\s{1}(\d)[^\d]/, '0\1 ')

    day_names = {"Domingo" => "Sunday", "Segunda" => "Monday", "Terça" => "Tuesday", "Quarta" => "Wednesday",
                 "Quinta" => "Thursday", "Sexta" => "Friday", "Sábado" => "Saturday", "Sabado" => "Saturday"}
    abbr_day_names = {"Dom" => "Sun", "Seg" => "Mon", "Ter" => "Tue", "Qua" => "Wed",
                      "Qui" => "Thu", "Sex" => "Fri", "Sáb" => "Sat", "Sab" => "Sat"}
    month_names = {"Janeiro" => "January", "Fevereiro" => "February", "Março" => "March", "Marco" => "March",
                   "Abril" => "April", "Maio" => "May", "Junho" => "June", "Julho" => "July",
                   "Agosto" => "August", "Setembro" => "September", "Outubro" => "October",
                   "Novembro" => "November", "Dezembro" => "December"}
    abbr_month_names = {"Jan" => "Jan", "Fev" => "Feb", "Abr" => "Apr", "Mai" => "May",
                        "Ago" => "Aug", "Set" => "Sep", "Out" => "Oct", "Dez" => "Dec"}

    # The four tables are applied in the same order as before (full names
    # before abbreviations, e.g. "Domingo" before "Dom"); the previously
    # duplicated iteration loops are collapsed into one.
    [day_names, abbr_day_names, month_names, abbr_month_names].each do |translations|
      translations.each { |pt, en| formatted_date.sub!(pt, en) }
    end

    formatted_date
  end
end

# Minimal ActiveSupport-style predicates used throughout the parser.
class Object
  # True when the object is empty or falsy (nil, false, "", [], {}).
  def blank?
    respond_to?(:empty?) ? empty? : !self
  end

  # Complement of #blank?. The rest of this gem calls #present? in several
  # places (e.g. FeedEntry#url, Utils#parse_images) but previously relied
  # on ActiveSupport being loaded transitively by feedzirra; defining it
  # here makes the core extensions self-contained.
  def present?
    !blank?
  end

  # Runs the block with Ruby warnings disabled, restoring $VERBOSE even
  # when the block raises.
  def silence_warnings
    old_verbose, $VERBOSE = $VERBOSE, nil
    yield
  ensure
    $VERBOSE = old_verbose
  end
end

class NilClass
  def blank?
    true
  end
end

class FalseClass
  def blank?
    true
  end
end

class TrueClass
  def blank?
    false
  end
end

class Array
  alias_method :blank?, :empty?
end

class Hash
  alias_method :blank?, :empty?
end

class Numeric
  def blank?
    false
  end
end

class String
  # Digest of the string salted with a fixed application secret; used as a
  # stable unique id for feeds and entries. Do not change the salt: it
  # would change every previously generated uid.
  def to_sha1
    Digest::SHA1.hexdigest "--17f7e62310d5a2bbb9bfc535b95134ece1cb474d--#{self}"
  end

  # A string is blank when it contains no non-whitespace character.
  def blank?
    self !~ /\S/
  end

  # Forces the string to the parser's canonical encoding on Ruby >= 1.9;
  # on 1.8 (no Encoding API) the string is returned untouched.
  def busk_normalize
    if RUBY_VERSION >= '1.9'
      self.force_encoding(Spix::Parser::Config::ENCODING)
    else
      self
    end
  end
end
@@ -0,0 +1,36 @@
# encoding: utf-8
module Spix
  module Parser
    # SAX-machine parser for Atom feeds.
    class Atom
      include SAXMachine
      include Feedzirra::FeedUtilities

      element :title, :as => :feed_title
      element :subtitle, :as => :feed_subtitle
      element :language, :as => :feed_language
      element :updated, :as => :last_modified
      # <link> elements: the HTML alternate is the site URL, the Atom
      # link is the feed URL; every href is also collected as a fallback.
      element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
      element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
      elements :link, :as => :links, :value => :href
      elements :entry, :as => :feed_entries, :class => Spix::Parser::AtomEntry

      alias_method :entries, :feed_entries

      # Heuristic used by Feedzirra to pick this parser: the document must
      # mention an Atom mime-type/namespace and contain a <feed> tag.
      def self.able_to_parse?(xml) #:nodoc:
        (xml =~ /application\/atom\+xml|(#{Regexp.escape("http://www.w3.org/2005/Atom")})|(#{Regexp.escape("http://purl.org/atom")})/) && (xml =~ /\<feed\s/)
      end

      # Site URL, falling back to the last collected <link> href.
      def url
        @url || links.last
      end

      # Feed URL, falling back to the first collected <link> href.
      def feed_url
        @feed_url || links.first
      end

      # Prefer the feed-declared <updated> value over Feedzirra's default.
      def last_modified
        @last_modified.present? ? @last_modified : super
      end
    end
  end
end
@@ -0,0 +1,28 @@
# encoding: utf-8
module Spix
  module Parser
    # SAX-machine parser for a single Atom <entry>.
    class AtomEntry
      include SAXMachine
      include Feedzirra::FeedEntryUtilities

      element :title, :as => :entry_title
      element :link, :as => :entry_url, :value => :href, :with => {:rel => "alternate"}
      element :name, :as => :entry_author
      element :content, :as => :entry_content
      element :summary, :as => :entry_summary

      # Timestamps: Atom 1.0 uses <published>/<updated>; the older Atom 0.3
      # drafts used <created>/<issued>/<modified>, so map them all.
      element :published
      element :id
      element :created, :as => :published
      element :issued, :as => :published
      element :updated
      element :modified, :as => :updated
      elements :category, :as => :entry_categories, :value => :term

      elements :enclosure, :as => :entry_enclosures, :class => Spix::Parser::Enclosure

      # Media RSS extensions.
      element :"media:content", :as => :media_content, :value => :url
      element :"media:description", :as => :media_description
      element :"media:thumbnail", :as => :media_thumbnail, :value => :url
    end
  end
end
@@ -0,0 +1,13 @@
# encoding: utf-8
module Spix
  module Parser
    # SAX-machine parser for <enclosure> tags (podcast/media attachments),
    # exposed through the shared EnclosureInterface accessors.
    class Enclosure
      include SAXMachine
      include Spix::Parser::EnclosureInterface

      element :enclosure, :value => :length, :as => :enclosure_length
      element :enclosure, :value => :type, :as => :enclosure_type
      element :enclosure, :value => :url, :as => :enclosure_url
    end
  end
end
@@ -0,0 +1,28 @@
# encoding: utf-8
module Spix
  module Parser
    # SAX-machine parser for RSS (and RDF) feeds.
    class RSS
      include SAXMachine
      include Feedzirra::FeedUtilities

      element :title, :as => :feed_title
      element :description, :as => :feed_subtitle
      element :language, :as => :feed_language
      element :link, :as => :url
      element :pubDate, :as => :last_modified
      elements :item, :as => :feed_entries, :class => Spix::Parser::RSSEntry

      alias_method :entries, :feed_entries

      attr_accessor :feed_url

      # Heuristic used by Feedzirra to pick this parser.
      # NOTE(review): /\<rss|rdf/ alternates between "<rss" and a bare
      # "rdf" anywhere in the document (the "<" binds only to "rss");
      # the extra <channel> check keeps false positives unlikely, so the
      # original expression is preserved unchanged.
      def self.able_to_parse?(xml) #:nodoc:
        (xml =~ /\<rss|rdf/) && (xml =~ /\<channel/)
      end

      # Prefer the feed-declared <pubDate> value over Feedzirra's default.
      def last_modified
        @last_modified.present? ? @last_modified : super
      end
    end
  end
end
@@ -0,0 +1,36 @@
# encoding: utf-8
module Spix
  module Parser
    # SAX-machine parser for a single RSS <item>.
    class RSSEntry
      include SAXMachine
      include Feedzirra::FeedEntryUtilities

      element :title, :as => :entry_title
      element :link, :as => :entry_url

      element :author, :as => :entry_author
      element :"dc:creator", :as => :entry_author

      element :"content:encoded", :as => :entry_content
      element :description, :as => :entry_summary
      element :summary, :as => :entry_summary

      # Publication date may come from plain RSS or from any of several
      # Dublin Core variants seen in the wild.
      element :pubDate, :as => :published
      element :"dc:date", :as => :published
      element :"dc:Date", :as => :published
      element :"dcterms:created", :as => :published

      element :"dcterms:modified", :as => :updated
      element :issued, :as => :published
      elements :category, :as => :entry_categories

      element :guid, :as => :id

      elements :enclosure, :as => :entry_enclosures, :class => Spix::Parser::Enclosure

      # Media RSS extensions.
      element :"media:content", :as => :media_content, :value => :url
      element :"media:description", :as => :media_description
      element :"media:thumbnail", :as => :media_thumbnail, :value => :url
    end
  end
end
@@ -0,0 +1,34 @@
# encoding: utf-8
module Spix
  module Parser
    # Shared timestamp handling for feed and entry wrappers.
    module DateTimeUtilities
      # Parses +timestamp+ (String or Time) into a UTC Time, or nil when
      # the value cannot be trusted.
      def build_datetime_object(timestamp)
        timestamp = normalize_timestamp(timestamp)

        return nil unless Time.can_parse?(timestamp)

        # If the timestamp is a non-date string, Time.parse falls back to
        # the supplied "now" argument, i.e. Time.mktime("1970").utc.
        parsed = Time.parse(timestamp, Spix::Parser::Config::BASE_TIMESTAMP).utc

        # Non-english dates are sometimes parsed to "future" dates by
        # Ruby; we also cover the epoch-fallback case explained above.
        # Both are rejected as unreliable.
        if parsed > Time.now.utc || parsed == Spix::Parser::Config::BASE_TIMESTAMP
          nil
        else
          parsed
        end
      end

      private

      # Converts the raw value to a normalized, translated string.
      # In Ruby 1.9 the date is returned as String; in Ruby 1.8 as Time —
      # #to_s makes both uniform before translation.
      def normalize_timestamp(timestamp)
        Time.translate_for_parsing(timestamp.to_s.busk_normalize)
      end
    end
  end
end
@@ -0,0 +1,124 @@
# encoding: utf-8
module Spix
  module Parser
    # Fetches and/or parses a feed.
    #
    # options[:mode] selects the source:
    #   :local - +url+ is the raw XML itself
    #   :file  - +url+ is a path to a local file
    #   other  - +url+ is fetched over HTTP via Feedzirra
    #
    # Returns a Spix::Parser::Feed on success, nil when fetching failed
    # outright, or the numeric HTTP status code Feedzirra handed back.
    def self.parse(url, options = {})
      feed = case options.delete(:mode)
      when :local
        Feedzirra::Feed.parse(url)
      when :file
        Feedzirra::Feed.parse(File.read(url))
      else
        Feedzirra::Feed.fetch_and_parse(url, options)
      end

      # Feedzirra has some issues with failure cases:
      # If the failure occurs on the parsing phase, then the on_failure callback is triggered
      # If the failure occurs on the fetching phase (i. e. a network error), the a number is returned
      # That number may represent an http status code or be 0 in case of other errors.
      # Also, we can't raise an exception on the on_failure callback, 'cause it will be raised even on success - that's really odd
      # So we need this 'safety net' here until we patch it to use an uniform error architecture
      #
      # NOTE: Integer replaces the legacy Fixnum constant (removed in
      # Ruby 3.2); on older Rubies Fixnum < Integer, so the check is
      # equivalent for Feedzirra's numeric return values.
      if feed.nil? || (feed.is_a?(Integer) && feed == 0)
        Log.error("The parser couldn't fetch the feed at #{url}")
        return nil
      elsif feed.is_a?(Integer)
        feed
      else
        Spix::Parser::Feed.new(feed)
      end
    end
  end

  module Utils
    extend self

    # Rewrites anchors and images inside an HTML fragment:
    # - links get target="_blank" and rel="external nofollow",
    # - image sources are resolved against options[:site_url].
    # Returns the (mutated) text.
    def format_links(options)
      text = options[:text]
      site_url = options[:site_url]

      parse_links(text)
      parse_images(text, site_url)

      text
    end

    private

    # Serializes an attribute hash back to 'key="value"' pairs, skipping
    # blank values and escaping embedded double quotes.
    def join_attributes(attrs)
      attrs.map do |attr, value|
        %Q[#{attr}="#{value.to_s.gsub(/"/, "&quot;")}"] unless value.blank?
      end.compact.join(" ")
    end

    # Extracts html attributes from a tag's attribute string into a hash
    # with downcased keys. Handles quoted and bare (space-delimited)
    # attribute values.
    def parse_attrs(str)
      attrs = {}
      # Bail out for anything that can't be scanned (nil in particular).
      # The previous guard, `unless str || str.respond_to?(:scan)`, used
      # `||` and therefore never filtered a truthy non-scannable value;
      # respond_to? is the check that was actually intended.
      return attrs unless str.respond_to?(:scan)

      match_by_spaces = str !~ /'|"/
      if match_by_spaces
        # Make sure to match the last html attribute.
        str += " "
        value_regexp = /\s*(.*?)\s/
      else
        value_regexp = /\s*["'](.*?)["']/
      end
      attribute_regexp = /\b([a-zA-Z0-9:]+)\s*/

      str.scan(/#{attribute_regexp}=#{value_regexp}/im) do
        attrs[$1.to_s.downcase] = $2
      end

      attrs
    end

    # Rewrites every <a> tag keeping only href/title and forcing
    # target/rel, so feed links open externally and aren't followed.
    def parse_links(text)
      text.gsub!(/(<a\s+([^>]+)>)/uim) do |match|
        attrs = parse_attrs($2.to_s)

        # just parse these attributes
        attrs = {
          :href => attrs["href"],
          :title => attrs["title"],
          :target => "_blank",
          :rel => "external nofollow"
        }

        "<a #{join_attributes(attrs)}>"
      end
    end

    # Rewrites every <img> tag keeping a whitelist of attributes and
    # resolving relative sources; images without a usable src are dropped
    # (the block yields nil, which gsub! turns into an empty string).
    def parse_images(text, site_url)
      text.gsub!(/(<img(.*?)\/?>)/uim) do |match|
        attrs = parse_attrs($2.to_s)

        # just parse these attributes
        attrs = {
          :src => parse_relative_image_source(attrs["src"], site_url),
          :alt => attrs["alt"],
          :title => attrs["title"],
          :style => attrs["style"],
          :width => attrs["width"],
          :height => attrs["height"]
        }

        "<img #{join_attributes(attrs)} />" if attrs[:src].present?
      end
    end

    # Makes a relative image src absolute using site_url. Falls back to
    # manual concatenation when URI refuses to parse the value (common
    # with unescaped UTF-8 paths).
    def parse_relative_image_source(src, site_url)
      if src.present? && site_url
        begin
          src = URI.parse(src)
          src = URI.parse(site_url).merge(src) if src.relative?
        rescue URI::InvalidURIError
          # Manually concatenating if it is "relative uri", stripping slashes.
          if src !~ /\A(https?|ftp):\/\//
            site_url = site_url[0..-2] if site_url[-1] == ?/
            src = src[1..-1] if src[0] == ?/
            src = "#{site_url}/#{src}"
          end
        end
      end
      src
    end
  end
end
@@ -0,0 +1,94 @@
gem "feedzirra", ">=0.0.24"
require "feedzirra"
require "nokogiri"
require "uri"
require "open-uri"

module Spix
  # Discovers feeds advertised by an HTML page.
  class FeedDiscovery

    # HTTP "User-Agent" header to send to servers when downloading feeds.
    USER_AGENT = "SpixParser"

    # True when the given URI parses as a feed.
    def self.feed?(uri)
      Spix::Parser.parse(uri, :mode => :fetch) ? true : false
    end

    # Returns an array of {:title, :url} hashes, one per feed advertised
    # via <link type="application/atom+xml|rss+xml"> on the page at +uri+.
    # When the page itself is a feed, returns that single feed instead.
    # Returns nil on any error (network, parsing, ...).
    def self.list(uri)
      content = self.read(uri)

      doc = Nokogiri::HTML(content)

      # get page title
      title = doc.search('title')[0].content

      items = doc.search("//link[@type='application/atom+xml']", "//link[@type='application/rss+xml']").collect do |link|
        url_object = URI::parse(uri).normalize

        href = link.get_attribute(:href).to_s

        feed_url_object = URI::parse(href)

        if feed_url_object.relative?
          # there's 2 types of relative URIs
          # the ones based on a path (base: http://sitewithfeed.com/foo/, relative: feed.xml, feed: http://sitewithfeed.com/foo/feed.xml)
          # and the ones based on the top domain (base: http://sitewithfeed.com/foo/, relative: /feed.xml, feed: http://sitewithfeed.com/feed.xml)
          if feed_url_object.path.match(/^\//)
            # when the feed_url_object is relative and starts with a "/" we should ignore the domain path
            path = nil
          else
            # when the feed_url_object is relative and does not start with a "/" we should use the domain path
            if url_object.path.match(/\/$/)
              # when the url_object ends with a "/" we should use it
              path = url_object.path
            else
              # when the url_object does not end with a "/" we should add it
              path = url_object.path + "/"
            end
          end

          # NOTE(review): the query string is concatenated without a "?"
          # separator; it is nil (hence "") for all fixtures exercised by
          # the specs — confirm before relying on URLs carrying a query.
          href = "#{url_object.scheme}://" +
                 "#{url_object.host}" +
                 "#{path}" +
                 "#{url_object.query}" +
                 href
        end

        item = {
          :title => link.get_attribute(:title) || title,
          :url => href
        }
      end

      if items.size == 0
        # if there's no item found at the given URI, maybe it's a feed URI
        if self.feed?(uri)
          items = [
            {
              :title => title,
              :url => uri
            }
          ]
        end
      end

      items
    rescue
      nil
    end

    # Reads the content at +uri+: IO-like objects are read directly,
    # everything else is fetched via open-uri with our User-Agent.
    # NOTE(review): Kernel#open on URL strings relies on open-uri's hook,
    # removed in Ruby 3.0 (URI.open replaces it) — confirm target Ruby.
    def self.read(uri)
      if uri.respond_to?(:read)
        content = uri.read
      else
        req_headers = {}
        req_headers["User-Agent"] = USER_AGENT
        content = open(uri, req_headers).read
      end
    end
  end
end
@@ -0,0 +1,31 @@
require 'net/http'
require 'thread'
# Kernel#timeout (formerly provided by timeout.rb) is deprecated/removed
# in modern Rubies; require the module and call Timeout.timeout instead.
require 'timeout'

# Resolves one level of HTTP redirection for a URL, with a 5 second budget.
class RedirectFollower
  # Returns the redirect target when the server answers +url+ with a 3xx
  # response; otherwise (including timeouts and unparseable URLs) returns
  # +url+ unchanged.
  def self.resolve(url)
    @response = ""
    begin
      Timeout.timeout(5) do
        t = Thread.new { @response = Net::HTTP.get_response(URI.parse(url)) }
        t.join # re-raises any exception from the thread (e.g. a bad URI)

        if @response.kind_of?(Net::HTTPRedirection)
          return redirect_url(@response)
        end
      end
    rescue Timeout::Error, URI::InvalidURIError
      return url
    end

    url
  end

  protected

  # Extracts the redirect target from a 3xx response: the Location header
  # when present, otherwise the href of the anchor some servers put in
  # the HTML body.
  def self.redirect_url(response)
    if response['location'].nil?
      response.body.match(/<a href=\"([^>]+)\">/i)[1]
    else
      response['location']
    end
  end
end
@@ -0,0 +1,18 @@
# encoding: utf-8
module Spix
  module Parser
    # Gem version information.
    module Version
      MAJOR = 1
      MINOR = 5
      TINY = 2

      # Dotted version string, e.g. "1.5.2".
      def self.current_version
        [MAJOR, MINOR, TINY].join(".")
      end

      # Release date of the current version.
      def self.date
        "2011-04-25"
      end
    end
  end
end
@@ -0,0 +1,18 @@
# encoding: utf-8
module Spix
  module Parser
    # Friendly accessor names over the raw enclosure_* fields produced by
    # the SAX parser (see Spix::Parser::Enclosure).
    module EnclosureInterface
      # Enclosure media URL.
      def url
        enclosure_url
      end

      # Enclosure MIME type.
      def mime_type
        enclosure_type
      end

      # Enclosure length attribute, as parsed from the tag.
      def length
        enclosure_length
      end
    end
  end
end
@@ -0,0 +1,87 @@
# encoding: utf-8
module Spix
  module Parser
    # Wraps a raw parsed entry (AtomEntry or RSSEntry) together with its
    # parent feed, exposing sanitized, encoding-normalized accessors.
    # Expensive accessors are memoized via Memoizable.
    class FeedEntry
      include Spix::Parser::DateTimeUtilities
      include Memoizable

      def initialize(entry, feed)
        @feed = feed
        @entry = entry
      end

      # Sanitized entry title.
      # NOTE(review): the "(title unknow)" fallback contains a typo, but
      # #uid is derived from the title, so the string is kept
      # byte-identical to avoid changing previously generated uids.
      def title
        text = @entry.entry_title || "(title unknow)"
        text = text.busk_normalize
        Sanitizer.sanitize(text)
      end
      memoize(:title)

      # Summary with comments and disallowed tags stripped and entities
      # converted to characters.
      def summary
        text = @entry.entry_summary || ""
        text = text.busk_normalize
        Sanitizer.strip_comments(text)
        Sanitizer.strip_disallowed_tags(text)
        Sanitizer.entities_to_chars(text)
      end
      memoize(:summary)

      # Final entry URL (redirects resolved), or nil when neither the
      # entry nor the feed provides one.
      def url
        entry_url = @entry.entry_url || @feed.site_url
        RedirectFollower.resolve(entry_url).busk_normalize if entry_url.present?
      end
      memoize(:url)

      # Sanitized author name (empty string when absent).
      def author
        text = @entry.entry_author || ""
        Sanitizer.sanitize(text.busk_normalize)
      end

      def published_at
        build_datetime_object(@entry.published) if @entry.published
      end

      def updated_at
        build_datetime_object(@entry.updated) if @entry.updated
      end

      # Stable unique id: sha1 over url + downcased title + a slice of the
      # tag-stripped content.
      def uid
        uid = self.url || ""
        uid += self.title.downcase.busk_normalize
        uid += self.striped_content.downcase.busk_normalize[0..25]
        uid.to_sha1
      end
      memoize(:uid)

      # Sanitized full content.
      def content
        text = encoded_raw_content
        Sanitizer.strip_comments(text)
        Sanitizer.strip_disallowed_tags(text)
        Sanitizer.entities_to_chars(text)
      end
      memoize(:content)

      # Content with every html tag removed.
      def striped_content
        text = encoded_raw_content
        Sanitizer.strip_tags(text)
      end
      memoize(:striped_content)

      # Sanitized category names.
      def categories
        @entry.entry_categories.map do |category|
          Sanitizer.sanitize(category.busk_normalize)
        end
      end

      # Raw enclosure objects from the parser.
      def enclosures
        @entry.entry_enclosures
      end

      private

      # Raw content (falling back to the summary), encoding-normalized.
      def encoded_raw_content
        text = @entry.entry_content || @entry.entry_summary || ""
        text.busk_normalize
      end
    end
  end
end
@@ -0,0 +1,81 @@
# encoding: utf-8
module Spix
  module Parser
    # Wraps a raw Feedzirra feed, exposing sanitized, encoding-normalized
    # accessors and wrapped entries.
    class Feed
      include Spix::Parser::DateTimeUtilities

      def initialize(parsed_feed)
        @feed = parsed_feed
        verify_entries_timestamps
      end

      # Sanitized feed title (placeholder when absent; spelling kept
      # byte-identical for compatibility with FeedEntry#title).
      def title
        text = @feed.feed_title || "(title unknow)"
        text = text.busk_normalize
        Sanitizer.sanitize(text)
      end

      # Sanitized feed subtitle (empty string when absent).
      def subtitle
        text = @feed.feed_subtitle || ""
        text = text.busk_normalize
        Sanitizer.sanitize(text)
      end

      # Feed language, defaulting to English.
      def language
        text = @feed.feed_language || "en"
        text = text.busk_normalize
        Sanitizer.sanitize(text)
      end

      # Publisher site URL, guessed from the feed URL when absent.
      def site_url
        @feed.url || extract_site_from_feed_url
      end

      def feed_url
        @feed.feed_url
      end

      # Stable unique id derived from the feed URL.
      def uid
        @feed.feed_url.to_sha1
      end

      # Last-modified timestamp: prefers the feed-level value, then the
      # first entry, then "now". Previously this called
      # @feed.feed_entries.first.published_at unguarded and raised
      # NoMethodError for invalid feeds or feeds without entries.
      def updated_at
        timestamp = @feed.last_modified || first_entry_published_at || Time.now.utc
        build_datetime_object(timestamp)
      end

      # Wrapped entries; empty for invalid feeds.
      def feed_items
        # For invalid feeds the feed_entries accessor does not exist.
        if @feed.respond_to?(:feed_entries) && @feed.feed_entries.present?
          @feed.feed_entries.map { |entry| Spix::Parser::FeedEntry.new(entry, self) }
        else
          []
        end
      end

      private

      def verify_entries_timestamps
        # Some feeds return the timestamps of all entries as the timestamp of the request
        # This means that the timestamp will change everytime we parse the feed, thus duplicating entries
        # One way to detect that is to verify if all the entries have the same timestamp
        items = feed_items # hoisted: feed_items rebuilds wrappers on every call
        items_with_same_timestamp = items.map { |i| i.published_at }.uniq.size == 1
        more_than_one_item = items.count > 1

        if items_with_same_timestamp && more_than_one_item
          @feed.feed_entries.each { |item| item.published = Spix::Parser::Config::BASE_TIMESTAMP.to_s }
        end
      end

      # Published timestamp of the first raw entry, or nil when the feed
      # has no (accessible) entries.
      # NOTE(review): assumes raw entries respond to #published_at via
      # Feedzirra::FeedEntryUtilities — confirm against the feedzirra
      # version in use.
      def first_entry_published_at
        return nil unless @feed.respond_to?(:feed_entries)
        first = @feed.feed_entries && @feed.feed_entries.first
        first && first.published_at
      end

      def extract_site_from_feed_url
        # Eventually, we run into a feed that for some reason does not include
        # the publisher website. In those cases, we try to guess the website
        # root path looking at the feed_url. It may fail also, so be mindful.
        return unless @feed.feed_url.present?
        feed_host = URI.parse(@feed.feed_url).host

        "http://#{feed_host}"
      end
    end
  end
end
@@ -0,0 +1,7 @@
# encoding: utf-8
module Spix
  module Parser
    # Error type reserved for feed parsing failures within Spix::Parser.
    class ParsingError < StandardError
    end
  end
end
@@ -0,0 +1,46 @@
# encoding: utf-8

# Gem entry point: pulls in dependencies, every spix_parser component,
# configures UTF-8 handling, registers the custom SAX parsers with
# Feedzirra and sets up the shared logger.
require "rubygems"
require "feedzirra"
require "digest/sha1"
require "zlib"
require "logger"
require "cgi"
require "memoizable"

$:.unshift(File.dirname(__FILE__) + '/../../lib')

require "spix_parser/version"
require "spix_parser/core_ext"
require "spix_parser/config"
require "spix_parser/parser"
require "spix_parser/datetime"

require "spix_parser/tools/redirect_follower"

require "spix_parser/wrappers/entry"
require "spix_parser/wrappers/enclosure_interface"
require "spix_parser/wrappers/feed"
require "spix_parser/wrappers/parsing_error"

require "spix_parser/custom_parsers/enclosure"
require "spix_parser/custom_parsers/atom_entry"
require "spix_parser/custom_parsers/atom"
require "spix_parser/custom_parsers/rss_entry"
require "spix_parser/custom_parsers/rss"

require "spix_parser/tools/feed_discovery"

# Force UTF-8 handling on both 1.8 ($KCODE) and 1.9+ (Encoding API).
if RUBY_VERSION < '1.9'
  $KCODE='u'
else
  Encoding.default_internal = Encoding::UTF_8
  Encoding.default_external = Encoding::UTF_8
end

# Register our custom SAX parsers so Feedzirra uses them.
Feedzirra::Feed.add_feed_class(Spix::Parser::RSS)
Feedzirra::Feed.add_feed_class(Spix::Parser::Atom)

# Start the log over whenever the log exceeds 100 megabytes in size.
# NOTE(review): the path is hard-coded; Logger.new raises when
# /var/log/spix does not exist — confirm deployments create it.
Log = Logger.new('/var/log/spix/spix_parser.log', 0, 100 * 1024 * 1024)
Log.level = Logger::ERROR
Log.datetime_format = "%d-%m-%Y %H:%M:%S"
@@ -0,0 +1,6 @@
require 'spec_helper'

# Exercises the shared matrix of wellformed feed fixtures; run_tests is
# provided by spec_helper.
describe Spix::Parser, 'parsing wellformed feeds' do
  run_tests :wellformed
end
@@ -0,0 +1,42 @@
# encoding: utf-8
require 'spec_helper'

describe Spix::Parser do
  describe "atom parsing" do
    it 'should parse from a file path' do
      feed = Spix::Parser.parse(fixture('feed.atom'), :mode => :file)
      feed.should_not be_nil
      feed.feed_items.should have(1).item
    end

    it 'should parse from a file' do
      feed = Spix::Parser.parse(load_fixture('feed.atom'), :mode => :local)
      feed.should_not be_nil
      feed.feed_items.should have(1).item
    end
  end

  describe "rss parsing" do
    it 'should parse from a file path' do
      feed = Spix::Parser.parse(fixture('feed.rss'), :mode => :file)
      feed.should_not be_nil
      feed.feed_items.should have(9).item
    end

    it 'should parse from a file' do
      feed = Spix::Parser.parse(load_fixture('feed.rss'), :mode => :local)
      feed.should_not be_nil
      feed.feed_items.should have(9).item
    end

    it 'should parse a feed from meioemensagem.com' do
      # Fixture captured from http://www.meioemensagem.com.br/home/rss/geral.xml
      # (previously held in an unused local variable `url`).
      feed = Spix::Parser.parse(load_fixture('meioemensagem.xml'), :mode => :local)

      feed.should_not be_nil
      feed.title.should == "RSS: Not&Atilde;&shy;cias Gerais"
      feed.feed_items[0].title.should == "Cielo volta &Atilde;&nbsp; m&Atilde;&shy;dia com o cantor Fiuk"
    end
  end
end
@@ -0,0 +1,72 @@
require 'spec_helper'

# Covers feed autodiscovery for absolute links, both flavors of relative
# links, and URIs that are themselves feeds. HTTP is faked with FakeWeb.
describe Spix::FeedDiscovery, "#list" do

  before(:all) do
    @domain_url = "http://sitewithfeed.com"
  end

  describe "when the feed have an absolute URI" do
    it "should return the feed url" do
      FakeWeb.register_uri(:get, @domain_url, :body => load_fixture("absolute_uri.html"))
      Spix::FeedDiscovery.list(@domain_url).first[:url].should == "http://diveintomark.org/tests/client/autodiscovery/html4-001.xml"
    end
  end

  describe "when the feed have a relative URI" do
    describe "which is relative to a path" do
      it "should return the feed url when the URI is at the top domain" do
        FakeWeb.register_uri(:get, @domain_url, :body => load_fixture("relative_uri.html"))
        Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/" + "html4-002.xml"
      end

      it "should return the feed url when the URI is inside a path" do
        @path_url = "/foo/bar"
        @feed_url = @domain_url + @path_url

        FakeWeb.register_uri(:get, @feed_url, :body => load_fixture("relative_uri.html"))
        Spix::FeedDiscovery.list(@feed_url).first[:url].should == @feed_url + "/" + "html4-002.xml"
      end
    end

    describe "which is relative to the top domain" do
      it "should return the feed url when the URI is at the top domain" do
        FakeWeb.register_uri(:get, @domain_url, :body => load_fixture("relative_uri_top_domain.html"))
        Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
      end

      it "should return the feed url when the URI is inside a path" do
        @path_url = "/foo/bar"
        @feed_url = @domain_url + @path_url

        FakeWeb.register_uri(:get, @feed_url, :body => load_fixture("relative_uri_top_domain.html"))
        Spix::FeedDiscovery.list(@feed_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
      end
    end
  end

  describe "when the URI is a feed" do
    before(:all) do
      @path_url = "/feed.xml"
      @feed_url = @domain_url + @path_url
    end

    it "should return the extracted url when there's a link at the feed" do
      FakeWeb.register_uri(:get, @feed_url, :body => load_fixture("feed_with_self_link.xml"))
      Spix::FeedDiscovery.list(@feed_url).first[:url].should == "http://diveintomark.org/tests/client/autodiscovery/html4-001.xml"
    end

    it "should return the same url when there's no link at the feed" do
      fixture = load_fixture("feed_without_self_link.xml")

      FakeWeb.register_uri(:get, @feed_url, :body => fixture)

      # feedzirra doesn't work with fakeweb
      feed_xml = fixture
      feed = Feedzirra::Feed.parse(feed_xml)
      Feedzirra::Feed.stub!(:fetch_and_parse).and_return(feed)

      Spix::FeedDiscovery.list(@feed_url).first[:url].should == @feed_url
    end
  end
end
@@ -0,0 +1,182 @@
# encoding: utf-8
require 'spec_helper'

# Covers Spix::Utils.format_links: anchor rewriting (forced target/rel,
# attribute whitelisting, quote styles) and image rewriting (attribute
# whitelisting, relative source resolution, invalid-URI fallbacks).
describe Spix::Utils do
  describe ".format_links" do
    context "html containing links" do
      it "parsers links in the given html string adding rel and target" do
        input_html = %q[<div><a href="foo/bar.html" title="FooBar!">FooBar!</a></div>]

        Spix::Utils.format_links(:text => input_html).should ==
          %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
      end

      it "parses links removing other html attributes" do
        input_html = %q[<div><a href="foo/bar.html" title="FooBar!" style="color: red" invalid="test">FooBar!</a></div>]

        Spix::Utils.format_links(:text => input_html).should ==
          %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
      end

      it "parses links with simple quotes" do
        input_html = %q[<div><a href='foo/bar.html' title='FooBar!'>FooBar!</a></div>]

        Spix::Utils.format_links(:text => input_html).should ==
          %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
      end

      # TODO: should we strip these extra &quot; ?
      it "parses links with html escaped quote (&quot;)" do
        input_html = %q[<div><a href=&quot;foo/bar.html&quot; title=&quot;FooBar!&quot;>FooBar!</a></div>]

        Spix::Utils.format_links(:text => input_html).should ==
          %q[<div><a href="&quot;foo/bar.html&quot;" title="&quot;FooBar!&quot;" target="_blank" rel="external nofollow">FooBar!</a></div>]
      end

      it "parses links with html attributes without quotes, based on spaces" do
        input_html = %q[<div><a href=foo/bar.html title=FooBar!>FooBar!</a></div>]

        Spix::Utils.format_links(:text => input_html).should ==
          %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
      end

      it "parses links with html attributes having spaces before or after the equal sign" do
        input_html = %q[<div><a href = foo/bar.html title = FooBar!>FooBar!</a></div>]

        Spix::Utils.format_links(:text => input_html).should ==
          %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
      end

      it "parses links downcasing attribute names" do
        input_html = %q[<div><a HREF="foo/bar.html" TITLE="FooBar!">FooBar!</a></div>]

        Spix::Utils.format_links(:text => input_html).should ==
          %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
      end

      it "parses links ignoring blank attributes" do
        input_html = %q[<div><a href="foo/bar.html" title="">FooBar!</a></div>]

        Spix::Utils.format_links(:text => input_html).should ==
          %q[<div><a href="foo/bar.html" target="_blank" rel="external nofollow">FooBar!</a></div>]
      end
    end

    context "html containing images" do
      it "parsers images in the given html string matching default attributes (src, style, alt, title, width and height)" do
        input_html = %q[<div><img src="images/bar.jpg" title="FooBar!" alt="FooBar!" width="100" height="200" /></div>]

        Spix::Utils.format_links(:text => input_html).should ==
          %q[<div><img src="images/bar.jpg" alt="FooBar!" title="FooBar!" width="100" height="200" /></div>]
      end

      it "parses image tags removing other invalid html attributes" do
        input_html = %q[<div><img src="images/bar.jpg" alt="FooBar!" style="color:red" invalid="test" target="_blank" /></div>]

        Spix::Utils.format_links(:text => input_html).should ==
          %q[<div><img src="images/bar.jpg" alt="FooBar!" style="color:red" /></div>]
      end

      it "parses image tags appending the given site url to relative images" do
        input_html = %q[<div><img src="images/bar.jpg" /></div>]

        Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
          %q[<div><img src="http://example.com/images/bar.jpg" /></div>]
      end

      it "parses image tags having relative sources with invalid URI, appending the site url" do
        input_html = %q[<div><img src="images/radiação.jpg" /></div>]

        Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
          %q[<div><img src="http://example.com/images/radiação.jpg" /></div>]
      end

      it "parses image tags having relative sources starting with / and with invalid URI, appending the site url" do
        input_html = %q[<div><img src="/images/radiação.jpg" /></div>]

        Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
          %q[<div><img src="http://example.com/images/radiação.jpg" /></div>]
      end

      it "parses image tags having relative sources starting with / and with invalid URI, appending the site url also ending with /" do
        input_html = %q[<div><img src="/images/radiação.jpg" /></div>]

        Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com/").should ==
          %q[<div><img src="http://example.com/images/radiação.jpg" /></div>]
      end

      %w(http https ftp).each do |scheme|
        it "parses image tags having absolute sources with #{scheme} and invalid URI" do
          input_html = %Q[<div><img src="#{scheme}://example.com/images/radiação.jpg" /></div>]

          Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
            %Q[<div><img src="#{scheme}://example.com/images/radiação.jpg" /></div>]
        end
      end

      it "parses image tags having sources with spaces but using quotes" do
        input_html = %q[<div><img src="images/foo bar.jpg" /></div>]

        Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
          %q[<div><img src="http://example.com/images/foo bar.jpg" /></div>]
      end

      it "parses image tags having style attributes with spaces" do
        input_html = %q[<div><img src="images/foobar.jpg" style="color: blue;" /></div>]

        Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
          %q[<div><img src="http://example.com/images/foobar.jpg" style="color: blue;" /></div>]
      end

      it "parses image tags ignoring images with empty sources" do
        input_html = %q[<div><img src="" title="FooBar!" /></div>]

        Spix::Utils.format_links(:text => input_html).should ==
          %q[<div></div>]
      end

      it "parses image tags with simple quotes" do
        input_html = %q[<div><img src='images/bar.jpg' title='FooBar!' /></div>]

        Spix::Utils.format_links(:text => input_html).should ==
          %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
      end

      # TODO: should we strip these extra &quot; ?
      it "parses image tags with html escaped quote (&quot;)" do
        input_html = %q[<div><img src=&quot;images/bar.jpg&quot; title=&quot;FooBar!&quot; /></div>]

        Spix::Utils.format_links(:text => input_html).should ==
          %q[<div><img src="&quot;images/bar.jpg&quot;" title="&quot;FooBar!&quot;" /></div>]
      end

      it "parses image tags with html attributes without quotes, based on spaces" do
        input_html = %q[<div><img src=images/bar.jpg title=FooBar! /></div>]

        Spix::Utils.format_links(:text => input_html).should ==
          %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
      end

      it "parses image tags with html attributes having spaces before or after the equal sign" do
        input_html = %q[<div><img src = images/bar.jpg title = FooBar! /></div>]

        Spix::Utils.format_links(:text => input_html).should ==
          %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
      end

      it "parses image tags downcasing attribute names" do
        input_html = %q[<div><img SRC="images/bar.jpg" TITLE="FooBar!" /></div>]

        Spix::Utils.format_links(:text => input_html).should ==
          %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
      end

      it "parses image tags ignoring empty attributes" do
        input_html = %q[<div><img src="images/bar.jpg" title="" /></div>]

        Spix::Utils.format_links(:text => input_html).should ==
          %q[<div><img src="images/bar.jpg" /></div>]
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,184 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: spix_parser
3
+ version: !ruby/object:Gem::Version
4
+ hash: 7
5
+ prerelease:
6
+ segments:
7
+ - 1
8
+ - 5
9
+ - 2
10
+ version: 1.5.2
11
+ platform: ruby
12
+ authors:
13
+ - Marcelo Eden
14
+ - Fabio Mont'Alegre
15
+ - "Lucas H\xC3\xBAngaro"
16
+ - Luiz Rocha
17
+ autorequire:
18
+ bindir: bin
19
+ cert_chain: []
20
+
21
+ date: 2011-05-12 00:00:00 -03:00
22
+ default_executable:
23
+ dependencies:
24
+ - !ruby/object:Gem::Dependency
25
+ name: feedzirra
26
+ prerelease: false
27
+ requirement: &id001 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ~>
31
+ - !ruby/object:Gem::Version
32
+ hash: 47
33
+ segments:
34
+ - 0
35
+ - 0
36
+ - 24
37
+ version: 0.0.24
38
+ type: :runtime
39
+ version_requirements: *id001
40
+ - !ruby/object:Gem::Dependency
41
+ name: memoizable
42
+ prerelease: false
43
+ requirement: &id002 !ruby/object:Gem::Requirement
44
+ none: false
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ hash: 27
49
+ segments:
50
+ - 0
51
+ - 1
52
+ - 0
53
+ version: 0.1.0
54
+ type: :runtime
55
+ version_requirements: *id002
56
+ - !ruby/object:Gem::Dependency
57
+ name: sanitizer
58
+ prerelease: false
59
+ requirement: &id003 !ruby/object:Gem::Requirement
60
+ none: false
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ hash: 21
65
+ segments:
66
+ - 0
67
+ - 1
68
+ - 7
69
+ version: 0.1.7
70
+ type: :runtime
71
+ version_requirements: *id003
72
+ - !ruby/object:Gem::Dependency
73
+ name: i18n
74
+ prerelease: false
75
+ requirement: &id004 !ruby/object:Gem::Requirement
76
+ none: false
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ hash: 25
81
+ segments:
82
+ - 0
83
+ - 1
84
+ - 1
85
+ version: 0.1.1
86
+ type: :runtime
87
+ version_requirements: *id004
88
+ - !ruby/object:Gem::Dependency
89
+ name: rspec
90
+ prerelease: false
91
+ requirement: &id005 !ruby/object:Gem::Requirement
92
+ none: false
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ hash: 3
97
+ segments:
98
+ - 0
99
+ version: "0"
100
+ type: :development
101
+ version_requirements: *id005
102
+ - !ruby/object:Gem::Dependency
103
+ name: fakeweb
104
+ prerelease: false
105
+ requirement: &id006 !ruby/object:Gem::Requirement
106
+ none: false
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ hash: 3
111
+ segments:
112
+ - 0
113
+ version: "0"
114
+ type: :development
115
+ version_requirements: *id006
116
+ description: A feed parser wrapper for Spix
117
+ email: busk@busk.com
118
+ executables: []
119
+
120
+ extensions: []
121
+
122
+ extra_rdoc_files: []
123
+
124
+ files:
125
+ - lib/spix_parser/config.rb
126
+ - lib/spix_parser/core_ext.rb
127
+ - lib/spix_parser/custom_parsers/atom.rb
128
+ - lib/spix_parser/custom_parsers/atom_entry.rb
129
+ - lib/spix_parser/custom_parsers/enclosure.rb
130
+ - lib/spix_parser/custom_parsers/rss.rb
131
+ - lib/spix_parser/custom_parsers/rss_entry.rb
132
+ - lib/spix_parser/datetime.rb
133
+ - lib/spix_parser/parser.rb
134
+ - lib/spix_parser/tools/feed_discovery.rb
135
+ - lib/spix_parser/tools/redirect_follower.rb
136
+ - lib/spix_parser/version.rb
137
+ - lib/spix_parser/wrappers/enclosure_interface.rb
138
+ - lib/spix_parser/wrappers/entry.rb
139
+ - lib/spix_parser/wrappers/feed.rb
140
+ - lib/spix_parser/wrappers/parsing_error.rb
141
+ - lib/spix_parser.rb
142
+ - spec/parser_spec.rb
143
+ - spec/spix_parser/parser_spec.rb
144
+ - spec/spix_parser/tools/feed_discovery_spec.rb
145
+ - spec/spix_parser/utils_spec.rb
146
+ has_rdoc: true
147
+ homepage: http://github.com/busk/spix_parser
148
+ licenses: []
149
+
150
+ post_install_message:
151
+ rdoc_options: []
152
+
153
+ require_paths:
154
+ - lib
155
+ required_ruby_version: !ruby/object:Gem::Requirement
156
+ none: false
157
+ requirements:
158
+ - - ">="
159
+ - !ruby/object:Gem::Version
160
+ hash: 3
161
+ segments:
162
+ - 0
163
+ version: "0"
164
+ required_rubygems_version: !ruby/object:Gem::Requirement
165
+ none: false
166
+ requirements:
167
+ - - ">="
168
+ - !ruby/object:Gem::Version
169
+ hash: 3
170
+ segments:
171
+ - 0
172
+ version: "0"
173
+ requirements: []
174
+
175
+ rubyforge_project:
176
+ rubygems_version: 1.6.2
177
+ signing_key:
178
+ specification_version: 3
179
+ summary: FeedParser for Spix
180
+ test_files:
181
+ - spec/parser_spec.rb
182
+ - spec/spix_parser/parser_spec.rb
183
+ - spec/spix_parser/tools/feed_discovery_spec.rb
184
+ - spec/spix_parser/utils_spec.rb