spix_parser 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ module Config
5
+ ENCODING = "UTF-8"
6
+
7
+ BASE_TIMESTAMP = Time.mktime("1970").utc
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,117 @@
1
+ # encoding: utf-8
2
+ class Time
3
+ def to_db_format(format=:default)
4
+ format == :default ? to_s_default : strftime("%Y-%m-%d %H:%M:%S").strip
5
+ end
6
+ alias_method :to_s_default, :to_s
7
+ alias_method :to_s, :to_db_format unless method_defined?(:to_formatted_s)
8
+
9
+ def self.can_parse?(date)
10
+ begin
11
+ Time.parse(date)
12
+ rescue StandardError => e
13
+ return false
14
+ end
15
+ true
16
+ end
17
+
18
+ # We have a lot of timestamps in Portuguese and Ruby can't parse them
19
+ # So, this function will translate Portuguese date-related terms to English for correct parsing,
20
+ # because we can't afford so many feed entries with wrong timestamps
21
+ def self.translate_for_parsing(date_as_string)
22
+ # First, add leading zero to days of month below 10
23
+ formatted_date = date_as_string.sub(/\A[a-zA-Z]+\,\s{1}(\d)[^\d]/, '0\1 ')
24
+
25
+ day_names = {"Domingo" => "Sunday", "Segunda" => "Monday", "Terça" => "Tuesday", "Quarta" => "Wednesday",
26
+ "Quinta" => "Thursday", "Sexta" => "Friday", "Sábado" => "Saturday", "Sabado" => "Saturday"}
27
+ abbr_day_names = {"Dom" => "Sun", "Seg" => "Mon", "Ter" => "Tue", "Qua" => "Wed",
28
+ "Qui" => "Thu", "Sex" => "Fri", "Sáb" => "Sat", "Sab" => "Sat"}
29
+ month_names = {"Janeiro" => "January", "Fevereiro" => "February", "Março" => "March", "Marco" => "March",
30
+ "Abril" => "April", "Maio" => "May", "Junho" => "June", "Julho" => "July",
31
+ "Agosto" => "August", "Setembro" => "September", "Outubro" => "October",
32
+ "Novembro" => "November", "Dezembro" => "December"}
33
+ abbr_month_names = {"Jan" => "Jan", "Fev" => "Feb", "Abr" => "Apr", "Mai" => "May",
34
+ "Ago" => "Aug", "Set" => "Sep", "Out" => "Oct", "Dez" => "Dec"}
35
+
36
+ day_names.each do |key, value|
37
+ formatted_date.sub!(key, value)
38
+ end
39
+
40
+ abbr_day_names.each do |key, value|
41
+ formatted_date.sub!(key, value)
42
+ end
43
+
44
+ month_names.each do |key, value|
45
+ formatted_date.sub!(key, value)
46
+ end
47
+
48
+ abbr_month_names.each do |key, value|
49
+ formatted_date.sub!(key, value)
50
+ end
51
+
52
+ formatted_date
53
+ end
54
+ end
55
+
56
+ class Object
57
+ def blank?
58
+ respond_to?(:empty?) ? empty? : !self
59
+ end
60
+
61
+ def silence_warnings
62
+ old_verbose, $VERBOSE = $VERBOSE, nil
63
+ yield
64
+ ensure
65
+ $VERBOSE = old_verbose
66
+ end
67
+ end
68
+
69
+ class NilClass
70
+ def blank?
71
+ true
72
+ end
73
+ end
74
+
75
+ class FalseClass
76
+ def blank?
77
+ true
78
+ end
79
+ end
80
+
81
+ class TrueClass
82
+ def blank?
83
+ false
84
+ end
85
+ end
86
+
87
+ class Array
88
+ alias_method :blank?, :empty?
89
+ end
90
+
91
+ class Hash
92
+ alias_method :blank?, :empty?
93
+ end
94
+
95
+ class Numeric
96
+ def blank?
97
+ false
98
+ end
99
+ end
100
+
101
+ class String
102
+ def to_sha1
103
+ Digest::SHA1.hexdigest "--17f7e62310d5a2bbb9bfc535b95134ece1cb474d--#{self}"
104
+ end
105
+
106
+ def blank?
107
+ self !~ /\S/
108
+ end
109
+
110
+ def busk_normalize
111
+ if RUBY_VERSION >= '1.9'
112
+ self.force_encoding(Spix::Parser::Config::ENCODING)
113
+ else
114
+ self
115
+ end
116
+ end
117
+ end
@@ -0,0 +1,36 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ class Atom
5
+ include SAXMachine
6
+ include Feedzirra::FeedUtilities
7
+
8
+ element :title, :as => :feed_title
9
+ element :subtitle, :as => :feed_subtitle
10
+ element :language, :as => :feed_language
11
+ element :updated, :as => :last_modified
12
+ element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
13
+ element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
14
+ elements :link, :as => :links, :value => :href
15
+ elements :entry, :as => :feed_entries, :class => Spix::Parser::AtomEntry
16
+
17
+ alias_method :entries, :feed_entries
18
+
19
+ def self.able_to_parse?(xml) #:nodoc:
20
+ (xml =~ /application\/atom\+xml|(#{Regexp.escape("http://www.w3.org/2005/Atom")})|(#{Regexp.escape("http://purl.org/atom")})/) && (xml =~ /\<feed\s/)
21
+ end
22
+
23
+ def url
24
+ @url || links.last
25
+ end
26
+
27
+ def feed_url
28
+ @feed_url || links.first
29
+ end
30
+
31
+ def last_modified
32
+ @last_modified.present? ? @last_modified : super
33
+ end
34
+ end
35
+ end
36
+ end
@@ -0,0 +1,28 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ class AtomEntry
5
+ include SAXMachine
6
+ include Feedzirra::FeedEntryUtilities
7
+
8
+ element :title, :as => :entry_title
9
+ element :link, :as => :entry_url, :value => :href, :with => {:rel => "alternate"}
10
+ element :name, :as => :entry_author
11
+ element :content, :as => :entry_content
12
+ element :summary, :as => :entry_summary
13
+ element :published
14
+ element :id
15
+ element :created, :as => :published
16
+ element :issued, :as => :published
17
+ element :updated
18
+ element :modified, :as => :updated
19
+ elements :category, :as => :entry_categories, :value => :term
20
+
21
+ elements :enclosure, :as => :entry_enclosures, :class => Spix::Parser::Enclosure
22
+
23
+ element :"media:content", :as => :media_content, :value => :url
24
+ element :"media:description", :as => :media_description
25
+ element :"media:thumbnail", :as => :media_thumbnail, :value => :url
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,13 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ class Enclosure
5
+ include SAXMachine
6
+ include Spix::Parser::EnclosureInterface
7
+
8
+ element :enclosure, :value => :length, :as => :enclosure_length
9
+ element :enclosure, :value => :type, :as => :enclosure_type
10
+ element :enclosure, :value => :url, :as => :enclosure_url
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,28 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ class RSS
5
+ include SAXMachine
6
+ include Feedzirra::FeedUtilities
7
+
8
+ element :title, :as => :feed_title
9
+ element :description, :as => :feed_subtitle
10
+ element :language, :as => :feed_language
11
+ element :link, :as => :url
12
+ element :pubDate, :as => :last_modified
13
+ elements :item, :as => :feed_entries, :class => Spix::Parser::RSSEntry
14
+
15
+ alias_method :entries, :feed_entries
16
+
17
+ attr_accessor :feed_url
18
+
19
+ def self.able_to_parse?(xml) #:nodoc:
20
+ (xml =~ /\<rss|rdf/) && (xml =~ /\<channel/)
21
+ end
22
+
23
+ def last_modified
24
+ @last_modified.present? ? @last_modified : super
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,36 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ class RSSEntry
5
+ include SAXMachine
6
+ include Feedzirra::FeedEntryUtilities
7
+
8
+ element :title, :as => :entry_title
9
+ element :link, :as => :entry_url
10
+
11
+ element :author, :as => :entry_author
12
+ element :"dc:creator", :as => :entry_author
13
+
14
+ element :"content:encoded", :as => :entry_content
15
+ element :description, :as => :entry_summary
16
+ element :summary, :as => :entry_summary
17
+
18
+ element :pubDate, :as => :published
19
+ element :"dc:date", :as => :published
20
+ element :"dc:Date", :as => :published
21
+ element :"dcterms:created", :as => :published
22
+
23
+ element :"dcterms:modified", :as => :updated
24
+ element :issued, :as => :published
25
+ elements :category, :as => :entry_categories
26
+
27
+ element :guid, :as => :id
28
+
29
+ elements :enclosure, :as => :entry_enclosures, :class => Spix::Parser::Enclosure
30
+
31
+ element :"media:content", :as => :media_content, :value => :url
32
+ element :"media:description", :as => :media_description
33
+ element :"media:thumbnail", :as => :media_thumbnail, :value => :url
34
+ end
35
+ end
36
+ end
@@ -0,0 +1,34 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ module DateTimeUtilities
5
+ def build_datetime_object(timestamp)
6
+ timestamp = normalize_timestamp(timestamp)
7
+
8
+ if Time.can_parse?(timestamp)
9
+ #if the timestamp is a non-date string, it will be Time.mktime("1970").utc
10
+ timestamp = Time.parse(timestamp, Spix::Parser::Config::BASE_TIMESTAMP).utc
11
+
12
+ # non-English dates are sometimes parsed to "future" dates by Ruby
13
+ # we also cover the case where the timestamp is Time.mktime("1970").utc as explained above
14
+ if (timestamp > Time.now.utc) || (timestamp == Spix::Parser::Config::BASE_TIMESTAMP)
15
+ timestamp = nil
16
+ end
17
+ else
18
+ timestamp = nil
19
+ end
20
+
21
+ timestamp
22
+ end
23
+
24
+ private
25
+ def normalize_timestamp(timestamp)
26
+ # In Ruby 1.9 the date is returned as String
27
+ # In Ruby 1.8 it is returned as Time
28
+ timestamp_string = timestamp.to_s
29
+ Time.translate_for_parsing(timestamp_string.busk_normalize)
30
+ end
31
+ end
32
+ end
33
+ end
34
+
@@ -0,0 +1,124 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ def self.parse(url, options = {})
5
+ feed = case options.delete(:mode)
6
+ when :local
7
+ Feedzirra::Feed.parse(url)
8
+ when :file
9
+ Feedzirra::Feed.parse(File.read(url))
10
+ else
11
+ Feedzirra::Feed.fetch_and_parse(url, options)
12
+ end
13
+
14
+ # Feedzirra has some issues with failure cases:
15
+ # If the failure occurs on the parsing phase, then the on_failure callback is triggered
16
+ # If the failure occurs on the fetching phase (i.e., a network error), then a number is returned
17
+ # That number may represent an HTTP status code or be 0 in case of other errors.
18
+ # Also, we can't raise an exception on the on_failure callback, because it will be raised even on success - that's really odd
19
+ # So we need this 'safety net' here until we patch it to use a uniform error architecture
20
+ if feed.nil? || (feed.is_a?(Fixnum) && feed == 0)
21
+ Log.error("The parser couldn't fetch the feed at #{url}")
22
+ return nil
23
+ elsif feed.is_a?(Fixnum)
24
+ feed
25
+ else
26
+ Spix::Parser::Feed.new(feed)
27
+ end
28
+ end
29
+ end
30
+
31
+ module Utils
32
+ extend self
33
+
34
+ def format_links(options)
35
+ text = options[:text]
36
+ site_url = options[:site_url]
37
+
38
+ parse_links(text)
39
+ parse_images(text, site_url)
40
+
41
+ text
42
+ end
43
+
44
+ private
45
+
46
+ def join_attributes(attrs)
47
+ attrs.map do |attr, value|
48
+ %Q[#{attr}="#{value.to_s.gsub(/"/, "&quot;")}"] unless value.blank?
49
+ end.compact.join(" ")
50
+ end
51
+
52
+ def parse_attrs(str)
53
+ attrs = {}
54
+ return attrs unless str || str.respond_to?(:scan)
55
+
56
+ match_by_spaces = str !~ /'|"/
57
+ if match_by_spaces
58
+ # Make sure to match the last html attribute.
59
+ str += " "
60
+ value_regexp = /\s*(.*?)\s/
61
+ else
62
+ value_regexp = /\s*["'](.*?)["']/
63
+ end
64
+ attribute_regexp = /\b([a-zA-Z0-9:]+)\s*/
65
+
66
+ str.scan(/#{attribute_regexp}=#{value_regexp}/im) do
67
+ attrs[$1.to_s.downcase] = $2
68
+ end
69
+
70
+ attrs
71
+ end
72
+
73
+ def parse_links(text)
74
+ text.gsub!(/(<a\s+([^>]+)>)/uim) do |match|
75
+ attrs = parse_attrs($2.to_s)
76
+
77
+ # just parse these attributes
78
+ attrs = {
79
+ :href => attrs["href"],
80
+ :title => attrs["title"],
81
+ :target => "_blank",
82
+ :rel => "external nofollow"
83
+ }
84
+
85
+ "<a #{join_attributes(attrs)}>"
86
+ end
87
+ end
88
+
89
+ def parse_images(text, site_url)
90
+ text.gsub!(/(<img(.*?)\/?>)/uim) do |match|
91
+ attrs = parse_attrs($2.to_s)
92
+
93
+ # just parse these attributes
94
+ attrs = {
95
+ :src => parse_relative_image_source(attrs["src"], site_url),
96
+ :alt => attrs["alt"],
97
+ :title => attrs["title"],
98
+ :style => attrs["style"],
99
+ :width => attrs["width"],
100
+ :height => attrs["height"]
101
+ }
102
+
103
+ "<img #{join_attributes(attrs)} />" if attrs[:src].present?
104
+ end
105
+ end
106
+
107
+ def parse_relative_image_source(src, site_url)
108
+ if src.present? && site_url
109
+ begin
110
+ src = URI.parse(src)
111
+ src = URI.parse(site_url).merge(src) if src.relative?
112
+ rescue URI::InvalidURIError
113
+ # Manually concatenating if it is "relative uri", stripping slashes.
114
+ if src !~ /\A(https?|ftp):\/\//
115
+ site_url = site_url[0..-2] if site_url[-1] == ?/
116
+ src = src[1..-1] if src[0] == ?/
117
+ src = "#{site_url}/#{src}"
118
+ end
119
+ end
120
+ end
121
+ src
122
+ end
123
+ end
124
+ end
@@ -0,0 +1,94 @@
1
+ gem "feedzirra", ">=0.0.24"
2
+ require "feedzirra"
3
+ require "nokogiri"
4
+ require "uri"
5
+ require "open-uri"
6
+
7
+ module Spix
8
+ class FeedDiscovery
9
+
10
+ # HTTP "User-Agent" header to send to servers when downloading feeds.
11
+ USER_AGENT = "SpixParser"
12
+
13
+ def self.feed?(uri)
14
+ Spix::Parser.parse(uri, :mode => :fetch) ? true : false
15
+ end
16
+
17
+ def self.list(uri)
18
+
19
+ content = self.read(uri)
20
+
21
+ doc = Nokogiri::HTML(content)
22
+
23
+ # get page title
24
+ title = doc.search('title')[0].content
25
+
26
+ items = doc.search("//link[@type='application/atom+xml']", "//link[@type='application/rss+xml']").collect do |link|
27
+ url_object = URI::parse(uri).normalize
28
+
29
+ href = link.get_attribute(:href).to_s
30
+
31
+ feed_url_object = URI::parse(href)
32
+
33
+ if feed_url_object.relative?
34
+
35
+ # there's 2 types of relative URIs
36
+ # the ones based on a path (base: http://sitewithfeed.com/foo/, relative: feed.xml, feed: http://sitewithfeed.com/foo/feed.xml)
37
+ # and the ones based on the top domain (base: http://sitewithfeed.com/foo/, relative: /feed.xml, feed: http://sitewithfeed.com/feed.xml)
38
+ if feed_url_object.path.match(/^\//)
39
+ # when the feed_url_object is relative and starts with a "/" we should ignore the domain path
40
+ path = nil
41
+ else
42
+ # when the feed_url_object is relative and does not start with a "/" we should use the domain path
43
+
44
+ if url_object.path.match(/\/$/)
45
+ # when the url_object ends with a "/" we should use it
46
+ path = url_object.path
47
+ else
48
+ # when the url_object does not end with a "/" we should add it
49
+ path = url_object.path + "/"
50
+ end
51
+ end
52
+
53
+ href = "#{url_object.scheme}://" +
54
+ "#{url_object.host}" +
55
+ "#{path}" +
56
+ "#{url_object.query}" +
57
+ href
58
+ end
59
+
60
+ item = {
61
+ :title => link.get_attribute(:title) || title,
62
+ :url => href
63
+ }
64
+
65
+ end
66
+
67
+ if items.size == 0
68
+ # if there's no item found at the given URI, maybe it's a feed URI
69
+ if self.feed?(uri)
70
+ items = [
71
+ {
72
+ :title => title,
73
+ :url => uri
74
+ }
75
+ ]
76
+ end
77
+ end
78
+
79
+ items
80
+ rescue
81
+ nil
82
+ end
83
+
84
+ def self.read(uri)
85
+ if uri.respond_to?(:read)
86
+ content = uri.read
87
+ else
88
+ req_headers = {}
89
+ req_headers["User-Agent"] = USER_AGENT
90
+ content = open(uri, req_headers).read
91
+ end
92
+ end
93
+ end
94
+ end
@@ -0,0 +1,31 @@
1
+ require 'net/http'
2
+ require 'thread'
3
+
4
+ class RedirectFollower
5
+ def self.resolve(url)
6
+ @response = ""
7
+ begin
8
+ timeout(5) do
9
+ t = Thread.new {@response = Net::HTTP.get_response(URI.parse(url)) }
10
+ t.join
11
+
12
+ if @response.kind_of?(Net::HTTPRedirection)
13
+ return redirect_url(@response)
14
+ end
15
+ end
16
+ rescue Timeout::Error, URI::InvalidURIError
17
+ return url
18
+ end
19
+
20
+ url
21
+ end
22
+
23
+ protected
24
+ def self.redirect_url(response)
25
+ if response['location'].nil?
26
+ response.body.match(/<a href=\"([^>]+)\">/i)[1]
27
+ else
28
+ response['location']
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,18 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ module Version
5
+ MAJOR = 1
6
+ MINOR = 5
7
+ TINY = 2
8
+
9
+ def self.current_version
10
+ "#{MAJOR}.#{MINOR}.#{TINY}"
11
+ end
12
+
13
+ def self.date
14
+ "2011-04-25"
15
+ end
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,18 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ module EnclosureInterface
5
+ def url
6
+ enclosure_url
7
+ end
8
+
9
+ def mime_type
10
+ enclosure_type
11
+ end
12
+
13
+ def length
14
+ enclosure_length
15
+ end
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,87 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ class FeedEntry
5
+ include Spix::Parser::DateTimeUtilities
6
+ include Memoizable
7
+
8
+ def initialize(entry, feed)
9
+ @feed = feed
10
+ @entry = entry
11
+ end
12
+
13
+ def title
14
+ text = @entry.entry_title || "(title unknow)"
15
+ text = text.busk_normalize
16
+ Sanitizer.sanitize(text)
17
+ end
18
+ memoize(:title)
19
+
20
+ def summary
21
+ text = @entry.entry_summary || ""
22
+ text = text.busk_normalize
23
+ Sanitizer.strip_comments(text)
24
+ Sanitizer.strip_disallowed_tags(text)
25
+ Sanitizer.entities_to_chars(text)
26
+ end
27
+ memoize(:summary)
28
+
29
+ def url
30
+ entry_url = @entry.entry_url || @feed.site_url
31
+ RedirectFollower.resolve(entry_url).busk_normalize if entry_url.present?
32
+ end
33
+ memoize(:url)
34
+
35
+ def author
36
+ text = @entry.entry_author || ""
37
+ Sanitizer.sanitize(text.busk_normalize)
38
+ end
39
+
40
+ def published_at
41
+ build_datetime_object(@entry.published) if @entry.published
42
+ end
43
+
44
+ def updated_at
45
+ build_datetime_object(@entry.updated) if @entry.updated
46
+ end
47
+
48
+ def uid
49
+ uid = self.url || ""
50
+ uid += self.title.downcase.busk_normalize
51
+ uid += self.striped_content.downcase.busk_normalize[0..25]
52
+ uid.to_sha1
53
+ end
54
+ memoize(:uid)
55
+
56
+ def content
57
+ text = encoded_raw_content
58
+ Sanitizer.strip_comments(text)
59
+ Sanitizer.strip_disallowed_tags(text)
60
+ Sanitizer.entities_to_chars(text)
61
+ end
62
+ memoize(:content)
63
+
64
+ def striped_content
65
+ text = encoded_raw_content
66
+ Sanitizer.strip_tags(text)
67
+ end
68
+ memoize(:striped_content)
69
+
70
+ def categories
71
+ @entry.entry_categories.map do |category|
72
+ Sanitizer.sanitize(category.busk_normalize)
73
+ end
74
+ end
75
+
76
+ def enclosures
77
+ @entry.entry_enclosures
78
+ end
79
+
80
+ private
81
+ def encoded_raw_content
82
+ text = @entry.entry_content || @entry.entry_summary || ""
83
+ text.busk_normalize
84
+ end
85
+ end
86
+ end
87
+ end
@@ -0,0 +1,81 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ class Feed
5
+ include Spix::Parser::DateTimeUtilities
6
+
7
+ def initialize(parsed_feed)
8
+ @feed = parsed_feed
9
+ verify_entries_timestamps
10
+ end
11
+
12
+ def title
13
+ text = @feed.feed_title || "(title unknow)"
14
+ text = text.busk_normalize
15
+ Sanitizer.sanitize(text)
16
+ end
17
+
18
+ def subtitle
19
+ text = @feed.feed_subtitle || ""
20
+ text = text.busk_normalize
21
+ Sanitizer.sanitize(text)
22
+ end
23
+
24
+ def language
25
+ text = @feed.feed_language || "en"
26
+ text = text.busk_normalize
27
+ Sanitizer.sanitize(text)
28
+ end
29
+
30
+ def site_url
31
+ @feed.url || extract_site_from_feed_url
32
+ end
33
+
34
+ def feed_url
35
+ @feed.feed_url
36
+ end
37
+
38
+ def uid
39
+ @feed.feed_url.to_sha1
40
+ end
41
+
42
+ def updated_at
43
+ timestamp = @feed.last_modified || @feed.feed_entries.first.published_at || Time.now.utc
44
+ build_datetime_object(timestamp)
45
+ end
46
+
47
+ def feed_items
48
+ # If the feed is not valid, the feed_entries accessor does not exist
49
+ if @feed.respond_to?(:feed_entries) && @feed.feed_entries.present?
50
+ @feed.feed_entries.map{|entry| Spix::Parser::FeedEntry.new(entry, self)}
51
+ else
52
+ []
53
+ end
54
+ end
55
+
56
+ private
57
+ def verify_entries_timestamps
58
+ # Some feeds return the timestamps of all entries as the timestamp of the request
59
+ # This means that the timestamp will change every time we parse the feed, thus duplicating entries
60
+ # One way to detect that is to verify if all the entries have the same timestamp
61
+ items_with_same_timestamp = feed_items.map{|i| i.published_at}.uniq.size == 1
62
+ more_than_one_item = feed_items.count > 1
63
+
64
+ if items_with_same_timestamp && more_than_one_item
65
+ @feed.feed_entries.each {|item| item.published = Spix::Parser::Config::BASE_TIMESTAMP.to_s}
66
+ end
67
+ end
68
+
69
+ def extract_site_from_feed_url
70
+ # Occasionally, we run into a feed that for some reason does not include
71
+ # the publisher website. In those cases, we try to guess the website
72
+ # root path looking at the feed_url. It may fail also, so be mindful.
73
+ return unless @feed.feed_url.present?
74
+ feed_host = URI.parse(@feed.feed_url).host
75
+
76
+ "http://#{feed_host}"
77
+ end
78
+
79
+ end
80
+ end
81
+ end
@@ -0,0 +1,7 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ class ParsingError < StandardError
5
+ end
6
+ end
7
+ end
@@ -0,0 +1,46 @@
1
+ # encoding: utf-8
2
+ require "rubygems"
3
+ require "feedzirra"
4
+ require "digest/sha1"
5
+ require "zlib"
6
+ require "logger"
7
+ require "cgi"
8
+ require "memoizable"
9
+
10
+ $:.unshift(File.dirname(__FILE__) + '/../../lib')
11
+
12
+ require "spix_parser/version"
13
+ require "spix_parser/core_ext"
14
+ require "spix_parser/config"
15
+ require "spix_parser/parser"
16
+ require "spix_parser/datetime"
17
+
18
+ require "spix_parser/tools/redirect_follower"
19
+
20
+ require "spix_parser/wrappers/entry"
21
+ require "spix_parser/wrappers/enclosure_interface"
22
+ require "spix_parser/wrappers/feed"
23
+ require "spix_parser/wrappers/parsing_error"
24
+
25
+ require "spix_parser/custom_parsers/enclosure"
26
+ require "spix_parser/custom_parsers/atom_entry"
27
+ require "spix_parser/custom_parsers/atom"
28
+ require "spix_parser/custom_parsers/rss_entry"
29
+ require "spix_parser/custom_parsers/rss"
30
+
31
+ require "spix_parser/tools/feed_discovery"
32
+
33
+ if RUBY_VERSION < '1.9'
34
+ $KCODE='u'
35
+ else
36
+ Encoding.default_internal = Encoding::UTF_8
37
+ Encoding.default_external = Encoding::UTF_8
38
+ end
39
+
40
+ Feedzirra::Feed.add_feed_class(Spix::Parser::RSS)
41
+ Feedzirra::Feed.add_feed_class(Spix::Parser::Atom)
42
+
43
+ # Start the log over whenever the log exceeds 100 megabytes in size.
44
+ Log = Logger.new('/var/log/spix/spix_parser.log', 0, 100 * 1024 * 1024)
45
+ Log.level = Logger::ERROR
46
+ Log.datetime_format = "%d-%m-%Y %H:%M:%S"
@@ -0,0 +1,6 @@
1
+ require 'spec_helper'
2
+
3
+ describe Spix::Parser, 'parsing wellformed feeds' do
4
+ run_tests :wellformed
5
+ end
6
+
@@ -0,0 +1,42 @@
1
+ # encoding: utf-8
2
+ require 'spec_helper'
3
+
4
+ describe Spix::Parser do
5
+ describe "atom parsing" do
6
+ it 'should parse from a file path' do
7
+ feed = Spix::Parser.parse(fixture('feed.atom'), :mode => :file)
8
+ feed.should_not be_nil
9
+ feed.feed_items.should have(1).item
10
+ end
11
+
12
+ it 'should parse from a file' do
13
+ feed = Spix::Parser.parse(load_fixture('feed.atom'), :mode => :local)
14
+ feed.should_not be_nil
15
+ feed.feed_items.should have(1).item
16
+ end
17
+ end
18
+
19
+ describe "rss parsing" do
20
+ it 'should parse from a file path' do
21
+ feed = Spix::Parser.parse(fixture('feed.rss'), :mode => :file)
22
+ feed.should_not be_nil
23
+ feed.feed_items.should have(9).item
24
+ end
25
+
26
+ it 'should parse from a file' do
27
+ feed = Spix::Parser.parse(load_fixture('feed.rss'), :mode => :local)
28
+ feed.should_not be_nil
29
+ feed.feed_items.should have(9).item
30
+ end
31
+
32
+ it 'should parse a feed from meioemensagem.com' do
33
+ url = 'http://www.meioemensagem.com.br/home/rss/geral.xml'
34
+ feed = Spix::Parser.parse(load_fixture('meioemensagem.xml'), :mode => :local)
35
+
36
+ feed.should_not be_nil
37
+ feed.title.should == "RSS: Not&Atilde;&shy;cias Gerais"
38
+ feed.feed_items[0].title.should == "Cielo volta &Atilde;&nbsp; m&Atilde;&shy;dia com o cantor Fiuk"
39
+ end
40
+ end
41
+
42
+ end
@@ -0,0 +1,72 @@
1
+ require 'spec_helper'
2
+
3
+ describe Spix::FeedDiscovery, "#list" do
4
+
5
+ before(:all) do
6
+ @domain_url = "http://sitewithfeed.com"
7
+ end
8
+
9
+ describe "when the feed have an absolute URI" do
10
+ it "should return the feed url" do
11
+ FakeWeb.register_uri(:get, @domain_url, :body => load_fixture("absolute_uri.html"))
12
+ Spix::FeedDiscovery.list(@domain_url).first[:url].should == "http://diveintomark.org/tests/client/autodiscovery/html4-001.xml"
13
+ end
14
+ end
15
+
16
+ describe "when the feed have a relative URI" do
17
+ describe "which is relative to a path" do
18
+ it "should return the feed url when the URI is at the top domain" do
19
+ FakeWeb.register_uri(:get, @domain_url, :body => load_fixture("relative_uri.html"))
20
+ Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/" + "html4-002.xml"
21
+ end
22
+
23
+ it "should return the feed url when the URI is inside a path" do
24
+ @path_url = "/foo/bar"
25
+ @feed_url = @domain_url + @path_url
26
+
27
+ FakeWeb.register_uri(:get, @feed_url, :body => load_fixture("relative_uri.html"))
28
+ Spix::FeedDiscovery.list(@feed_url).first[:url].should == @feed_url + "/" + "html4-002.xml"
29
+ end
30
+ end
31
+
32
+ describe "which is relative to the top domain" do
33
+ it "should return the feed url when the URI is at the top domain" do
34
+ FakeWeb.register_uri(:get, @domain_url, :body => load_fixture("relative_uri_top_domain.html"))
35
+ Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
36
+ end
37
+
38
+ it "should return the feed url when the URI is inside a path" do
39
+ @path_url = "/foo/bar"
40
+ @feed_url = @domain_url + @path_url
41
+
42
+ FakeWeb.register_uri(:get, @feed_url, :body => load_fixture("relative_uri_top_domain.html"))
43
+ Spix::FeedDiscovery.list(@feed_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
44
+ end
45
+ end
46
+ end
47
+
48
+ describe "when the URI is a feed" do
49
+ before(:all) do
50
+ @path_url = "/feed.xml"
51
+ @feed_url = @domain_url + @path_url
52
+ end
53
+
54
+ it "should return the extracted url when there's a link at the feed" do
55
+ FakeWeb.register_uri(:get, @feed_url, :body => load_fixture("feed_with_self_link.xml"))
56
+ Spix::FeedDiscovery.list(@feed_url).first[:url].should == "http://diveintomark.org/tests/client/autodiscovery/html4-001.xml"
57
+ end
58
+
59
+ it "should return the same url when there's no link at the feed" do
60
+ fixture = load_fixture("feed_without_self_link.xml")
61
+
62
+ FakeWeb.register_uri(:get, @feed_url, :body => fixture)
63
+
64
+ # feedzirra doesn't work with fakeweb
65
+ feed_xml = fixture
66
+ feed = Feedzirra::Feed.parse(feed_xml)
67
+ Feedzirra::Feed.stub!(:fetch_and_parse).and_return(feed)
68
+
69
+ Spix::FeedDiscovery.list(@feed_url).first[:url].should == @feed_url
70
+ end
71
+ end
72
+ end
@@ -0,0 +1,182 @@
1
+ # encoding: utf-8
2
+ require 'spec_helper'
3
+
4
+ describe Spix::Utils do
5
+ describe ".format_links" do
6
+ context "html containing links" do
7
+ it "parsers links in the given html string adding rel and target" do
8
+ input_html = %q[<div><a href="foo/bar.html" title="FooBar!">FooBar!</a></div>]
9
+
10
+ Spix::Utils.format_links(:text => input_html).should ==
11
+ %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
12
+ end
13
+
14
+ it "parses links removing other html attributes" do
15
+ input_html = %q[<div><a href="foo/bar.html" title="FooBar!" style="color: red" invalid="test">FooBar!</a></div>]
16
+
17
+ Spix::Utils.format_links(:text => input_html).should ==
18
+ %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
19
+ end
20
+
21
+ it "parses links with simple quotes" do
22
+ input_html = %q[<div><a href='foo/bar.html' title='FooBar!'>FooBar!</a></div>]
23
+
24
+ Spix::Utils.format_links(:text => input_html).should ==
25
+ %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
26
+ end
27
+
28
+ # TODO: should we strip these extra &quot; ?
29
+ it "parses links with html escaped quote (&quot;)" do
30
+ input_html = %q[<div><a href=&quot;foo/bar.html&quot; title=&quot;FooBar!&quot;>FooBar!</a></div>]
31
+
32
+ Spix::Utils.format_links(:text => input_html).should ==
33
+ %q[<div><a href="&quot;foo/bar.html&quot;" title="&quot;FooBar!&quot;" target="_blank" rel="external nofollow">FooBar!</a></div>]
34
+ end
35
+
36
+ it "parses links with html attributes without quotes, based on spaces" do
37
+ input_html = %q[<div><a href=foo/bar.html title=FooBar!>FooBar!</a></div>]
38
+
39
+ Spix::Utils.format_links(:text => input_html).should ==
40
+ %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
41
+ end
42
+
43
+ it "parses links with html attributes having spaces before or after the equal sign" do
44
+ input_html = %q[<div><a href = foo/bar.html title = FooBar!>FooBar!</a></div>]
45
+
46
+ Spix::Utils.format_links(:text => input_html).should ==
47
+ %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
48
+ end
49
+
50
+ it "parses links downcasing attribute names" do
51
+ input_html = %q[<div><a HREF="foo/bar.html" TITLE="FooBar!">FooBar!</a></div>]
52
+
53
+ Spix::Utils.format_links(:text => input_html).should ==
54
+ %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
55
+ end
56
+
57
+ it "parses links ignoring blank attributes" do
58
+ input_html = %q[<div><a href="foo/bar.html" title="">FooBar!</a></div>]
59
+
60
+ Spix::Utils.format_links(:text => input_html).should ==
61
+ %q[<div><a href="foo/bar.html" target="_blank" rel="external nofollow">FooBar!</a></div>]
62
+ end
63
+ end
64
+
65
+ context "html containing images" do
66
+ it "parsers images in the given html string matching default attributes (src, style, alt, title, width and height)" do
67
+ input_html = %q[<div><img src="images/bar.jpg" title="FooBar!" alt="FooBar!" width="100" height="200" /></div>]
68
+
69
+ Spix::Utils.format_links(:text => input_html).should ==
70
+ %q[<div><img src="images/bar.jpg" alt="FooBar!" title="FooBar!" width="100" height="200" /></div>]
71
+ end
72
+
73
+ it "parses image tags removing other invalid html attributes" do
74
+ input_html = %q[<div><img src="images/bar.jpg" alt="FooBar!" style="color:red" invalid="test" target="_blank" /></div>]
75
+
76
+ Spix::Utils.format_links(:text => input_html).should ==
77
+ %q[<div><img src="images/bar.jpg" alt="FooBar!" style="color:red" /></div>]
78
+ end
79
+
80
+ it "parses image tags appending the given site url to relative images" do
81
+ input_html = %q[<div><img src="images/bar.jpg" /></div>]
82
+
83
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
84
+ %q[<div><img src="http://example.com/images/bar.jpg" /></div>]
85
+ end
86
+
87
+ it "parses image tags having relative sources with invalid URI, appending the site url" do
88
+ input_html = %q[<div><img src="images/radiação.jpg" /></div>]
89
+
90
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
91
+ %q[<div><img src="http://example.com/images/radiação.jpg" /></div>]
92
+ end
93
+
94
+ it "parses image tags having relative sources starting with / and with invalid URI, appending the site url" do
95
+ input_html = %q[<div><img src="/images/radiação.jpg" /></div>]
96
+
97
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
98
+ %q[<div><img src="http://example.com/images/radiação.jpg" /></div>]
99
+ end
100
+
101
+ it "parses image tags having relative sources starting with / and with invalid URI, appending the site url also ending with /" do
102
+ input_html = %q[<div><img src="/images/radiação.jpg" /></div>]
103
+
104
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com/").should ==
105
+ %q[<div><img src="http://example.com/images/radiação.jpg" /></div>]
106
+ end
107
+
108
+ %w(http https ftp).each do |scheme|
109
+ it "parses image tags having absolute sources with #{scheme} and invalid URI" do
110
+ input_html = %Q[<div><img src="#{scheme}://example.com/images/radiação.jpg" /></div>]
111
+
112
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
113
+ %Q[<div><img src="#{scheme}://example.com/images/radiação.jpg" /></div>]
114
+ end
115
+ end
116
+
117
+ it "parses image tags having sources with spaces but using quotes" do
118
+ input_html = %q[<div><img src="images/foo bar.jpg" /></div>]
119
+
120
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
121
+ %q[<div><img src="http://example.com/images/foo bar.jpg" /></div>]
122
+ end
123
+
124
+ it "parses image tags having style attributes with spaces" do
125
+ input_html = %q[<div><img src="images/foobar.jpg" style="color: blue;" /></div>]
126
+
127
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
128
+ %q[<div><img src="http://example.com/images/foobar.jpg" style="color: blue;" /></div>]
129
+ end
130
+
131
+ it "parses image tags ignoring images with empty sources" do
132
+ input_html = %q[<div><img src="" title="FooBar!" /></div>]
133
+
134
+ Spix::Utils.format_links(:text => input_html).should ==
135
+ %q[<div></div>]
136
+ end
137
+
138
+ it "parses image tags with simple quotes" do
139
+ input_html = %q[<div><img src='images/bar.jpg' title='FooBar!' /></div>]
140
+
141
+ Spix::Utils.format_links(:text => input_html).should ==
142
+ %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
143
+ end
144
+
145
+ # TODO: should we strip these extra &quot; ?
146
+ it "parses image tags with html escaped quote (&quot;)" do
147
+ input_html = %q[<div><img src=&quot;images/bar.jpg&quot; title=&quot;FooBar!&quot; /></div>]
148
+
149
+ Spix::Utils.format_links(:text => input_html).should ==
150
+ %q[<div><img src="&quot;images/bar.jpg&quot;" title="&quot;FooBar!&quot;" /></div>]
151
+ end
152
+
153
+ it "parses image tags with html attributes without quotes, based on spaces" do
154
+ input_html = %q[<div><img src=images/bar.jpg title=FooBar! /></div>]
155
+
156
+ Spix::Utils.format_links(:text => input_html).should ==
157
+ %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
158
+ end
159
+
160
+ it "parses image tags with html attributes having spaces before or after the equal sign" do
161
+ input_html = %q[<div><img src = images/bar.jpg title = FooBar! /></div>]
162
+
163
+ Spix::Utils.format_links(:text => input_html).should ==
164
+ %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
165
+ end
166
+
167
+ it "parses image tags downcasing attribute names" do
168
+ input_html = %q[<div><img SRC="images/bar.jpg" TITLE="FooBar!" /></div>]
169
+
170
+ Spix::Utils.format_links(:text => input_html).should ==
171
+ %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
172
+ end
173
+
174
+ it "parses image tags ignoring empty attributes" do
175
+ input_html = %q[<div><img src="images/bar.jpg" title="" /></div>]
176
+
177
+ Spix::Utils.format_links(:text => input_html).should ==
178
+ %q[<div><img src="images/bar.jpg" /></div>]
179
+ end
180
+ end
181
+ end
182
+ end
metadata ADDED
@@ -0,0 +1,184 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: spix_parser
3
+ version: !ruby/object:Gem::Version
4
+ hash: 7
5
+ prerelease:
6
+ segments:
7
+ - 1
8
+ - 5
9
+ - 2
10
+ version: 1.5.2
11
+ platform: ruby
12
+ authors:
13
+ - Marcelo Eden
14
+ - Fabio Mont'Alegre
15
+ - "Lucas H\xC3\xBAngaro"
16
+ - Luiz Rocha
17
+ autorequire:
18
+ bindir: bin
19
+ cert_chain: []
20
+
21
+ date: 2011-05-12 00:00:00 -03:00
22
+ default_executable:
23
+ dependencies:
24
+ - !ruby/object:Gem::Dependency
25
+ name: feedzirra
26
+ prerelease: false
27
+ requirement: &id001 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ~>
31
+ - !ruby/object:Gem::Version
32
+ hash: 47
33
+ segments:
34
+ - 0
35
+ - 0
36
+ - 24
37
+ version: 0.0.24
38
+ type: :runtime
39
+ version_requirements: *id001
40
+ - !ruby/object:Gem::Dependency
41
+ name: memoizable
42
+ prerelease: false
43
+ requirement: &id002 !ruby/object:Gem::Requirement
44
+ none: false
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ hash: 27
49
+ segments:
50
+ - 0
51
+ - 1
52
+ - 0
53
+ version: 0.1.0
54
+ type: :runtime
55
+ version_requirements: *id002
56
+ - !ruby/object:Gem::Dependency
57
+ name: sanitizer
58
+ prerelease: false
59
+ requirement: &id003 !ruby/object:Gem::Requirement
60
+ none: false
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ hash: 21
65
+ segments:
66
+ - 0
67
+ - 1
68
+ - 7
69
+ version: 0.1.7
70
+ type: :runtime
71
+ version_requirements: *id003
72
+ - !ruby/object:Gem::Dependency
73
+ name: i18n
74
+ prerelease: false
75
+ requirement: &id004 !ruby/object:Gem::Requirement
76
+ none: false
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ hash: 25
81
+ segments:
82
+ - 0
83
+ - 1
84
+ - 1
85
+ version: 0.1.1
86
+ type: :runtime
87
+ version_requirements: *id004
88
+ - !ruby/object:Gem::Dependency
89
+ name: rspec
90
+ prerelease: false
91
+ requirement: &id005 !ruby/object:Gem::Requirement
92
+ none: false
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ hash: 3
97
+ segments:
98
+ - 0
99
+ version: "0"
100
+ type: :development
101
+ version_requirements: *id005
102
+ - !ruby/object:Gem::Dependency
103
+ name: fakeweb
104
+ prerelease: false
105
+ requirement: &id006 !ruby/object:Gem::Requirement
106
+ none: false
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ hash: 3
111
+ segments:
112
+ - 0
113
+ version: "0"
114
+ type: :development
115
+ version_requirements: *id006
116
+ description: A feed parser wrapper for Spix
117
+ email: busk@busk.com
118
+ executables: []
119
+
120
+ extensions: []
121
+
122
+ extra_rdoc_files: []
123
+
124
+ files:
125
+ - lib/spix_parser/config.rb
126
+ - lib/spix_parser/core_ext.rb
127
+ - lib/spix_parser/custom_parsers/atom.rb
128
+ - lib/spix_parser/custom_parsers/atom_entry.rb
129
+ - lib/spix_parser/custom_parsers/enclosure.rb
130
+ - lib/spix_parser/custom_parsers/rss.rb
131
+ - lib/spix_parser/custom_parsers/rss_entry.rb
132
+ - lib/spix_parser/datetime.rb
133
+ - lib/spix_parser/parser.rb
134
+ - lib/spix_parser/tools/feed_discovery.rb
135
+ - lib/spix_parser/tools/redirect_follower.rb
136
+ - lib/spix_parser/version.rb
137
+ - lib/spix_parser/wrappers/enclosure_interface.rb
138
+ - lib/spix_parser/wrappers/entry.rb
139
+ - lib/spix_parser/wrappers/feed.rb
140
+ - lib/spix_parser/wrappers/parsing_error.rb
141
+ - lib/spix_parser.rb
142
+ - spec/parser_spec.rb
143
+ - spec/spix_parser/parser_spec.rb
144
+ - spec/spix_parser/tools/feed_discovery_spec.rb
145
+ - spec/spix_parser/utils_spec.rb
146
+ has_rdoc: true
147
+ homepage: http://github.com/busk/spix_parser
148
+ licenses: []
149
+
150
+ post_install_message:
151
+ rdoc_options: []
152
+
153
+ require_paths:
154
+ - lib
155
+ required_ruby_version: !ruby/object:Gem::Requirement
156
+ none: false
157
+ requirements:
158
+ - - ">="
159
+ - !ruby/object:Gem::Version
160
+ hash: 3
161
+ segments:
162
+ - 0
163
+ version: "0"
164
+ required_rubygems_version: !ruby/object:Gem::Requirement
165
+ none: false
166
+ requirements:
167
+ - - ">="
168
+ - !ruby/object:Gem::Version
169
+ hash: 3
170
+ segments:
171
+ - 0
172
+ version: "0"
173
+ requirements: []
174
+
175
+ rubyforge_project:
176
+ rubygems_version: 1.6.2
177
+ signing_key:
178
+ specification_version: 3
179
+ summary: FeedParser for Spix
180
+ test_files:
181
+ - spec/parser_spec.rb
182
+ - spec/spix_parser/parser_spec.rb
183
+ - spec/spix_parser/tools/feed_discovery_spec.rb
184
+ - spec/spix_parser/utils_spec.rb