feedjira 2.0.0 → 2.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +2 -0
- data/.rubocop.yml +15 -0
- data/.travis.yml +31 -12
- data/CHANGELOG.md +34 -1
- data/Dangerfile +1 -0
- data/Gemfile +2 -1
- data/LICENSE +1 -1
- data/README.md +210 -7
- data/Rakefile +11 -1
- data/feedjira.gemspec +17 -14
- data/fixtures/vcr_cassettes/fetch_failure.yml +62 -0
- data/fixtures/vcr_cassettes/parse_error.yml +222 -0
- data/fixtures/vcr_cassettes/success.yml +281 -0
- data/lib/feedjira/configuration.rb +76 -0
- data/lib/feedjira/core_ext/date.rb +3 -1
- data/lib/feedjira/core_ext/string.rb +2 -1
- data/lib/feedjira/core_ext/time.rb +24 -17
- data/lib/feedjira/core_ext.rb +3 -3
- data/lib/feedjira/date_time_utilities/date_time_epoch_parser.rb +13 -0
- data/lib/feedjira/date_time_utilities/date_time_language_parser.rb +24 -0
- data/lib/feedjira/date_time_utilities/date_time_pattern_parser.rb +34 -0
- data/lib/feedjira/date_time_utilities.rb +32 -0
- data/lib/feedjira/feed.rb +89 -62
- data/lib/feedjira/feed_entry_utilities.rb +20 -19
- data/lib/feedjira/feed_utilities.rb +37 -22
- data/lib/feedjira/parser/atom.rb +10 -8
- data/lib/feedjira/parser/atom_entry.rb +11 -13
- data/lib/feedjira/parser/atom_feed_burner.rb +27 -10
- data/lib/feedjira/parser/atom_feed_burner_entry.rb +12 -14
- data/lib/feedjira/parser/atom_youtube.rb +21 -0
- data/lib/feedjira/parser/atom_youtube_entry.rb +30 -0
- data/lib/feedjira/parser/google_docs_atom.rb +8 -7
- data/lib/feedjira/parser/google_docs_atom_entry.rb +13 -11
- data/lib/feedjira/parser/itunes_rss.rb +41 -22
- data/lib/feedjira/parser/itunes_rss_category.rb +39 -0
- data/lib/feedjira/parser/itunes_rss_item.rb +32 -20
- data/lib/feedjira/parser/itunes_rss_owner.rb +4 -4
- data/lib/feedjira/parser/podlove_chapter.rb +22 -0
- data/lib/feedjira/parser/rss.rb +11 -8
- data/lib/feedjira/parser/rss_entry.rb +17 -21
- data/lib/feedjira/parser/rss_feed_burner.rb +5 -6
- data/lib/feedjira/parser/rss_feed_burner_entry.rb +24 -28
- data/lib/feedjira/parser/rss_image.rb +15 -0
- data/lib/feedjira/parser.rb +1 -1
- data/lib/feedjira/preprocessor.rb +4 -2
- data/lib/feedjira/version.rb +1 -1
- data/lib/feedjira.rb +15 -0
- data/spec/feedjira/configuration_spec.rb +25 -0
- data/spec/feedjira/date_time_utilities_spec.rb +47 -0
- data/spec/feedjira/feed_entry_utilities_spec.rb +23 -19
- data/spec/feedjira/feed_spec.rb +140 -75
- data/spec/feedjira/feed_utilities_spec.rb +83 -63
- data/spec/feedjira/parser/atom_entry_spec.rb +54 -34
- data/spec/feedjira/parser/atom_feed_burner_entry_spec.rb +27 -20
- data/spec/feedjira/parser/atom_feed_burner_spec.rb +87 -30
- data/spec/feedjira/parser/atom_spec.rb +50 -48
- data/spec/feedjira/parser/atom_youtube_entry_spec.rb +86 -0
- data/spec/feedjira/parser/atom_youtube_spec.rb +43 -0
- data/spec/feedjira/parser/google_docs_atom_entry_spec.rb +5 -4
- data/spec/feedjira/parser/google_docs_atom_spec.rb +6 -6
- data/spec/feedjira/parser/itunes_rss_item_spec.rb +49 -29
- data/spec/feedjira/parser/itunes_rss_owner_spec.rb +10 -9
- data/spec/feedjira/parser/itunes_rss_spec.rb +87 -30
- data/spec/feedjira/parser/podlove_chapter_spec.rb +37 -0
- data/spec/feedjira/parser/rss_entry_spec.rb +50 -33
- data/spec/feedjira/parser/rss_feed_burner_entry_spec.rb +55 -33
- data/spec/feedjira/parser/rss_feed_burner_spec.rb +31 -26
- data/spec/feedjira/parser/rss_spec.rb +56 -24
- data/spec/feedjira/preprocessor_spec.rb +11 -3
- data/spec/sample_feeds/AmazonWebServicesBlog.xml +797 -797
- data/spec/sample_feeds/AtomEscapedHTMLInPreTag.xml +13 -0
- data/spec/sample_feeds/CRE.xml +5849 -0
- data/spec/sample_feeds/FeedBurnerXHTML.xml +400 -400
- data/spec/sample_feeds/GiantRobotsSmashingIntoOtherGiantRobots.xml +682 -0
- data/spec/sample_feeds/ITunesWithSingleQuotedAttributes.xml +67 -0
- data/spec/sample_feeds/InvalidDateFormat.xml +20 -0
- data/spec/sample_feeds/PaulDixExplainsNothing.xml +175 -175
- data/spec/sample_feeds/PaulDixExplainsNothingAlternate.xml +175 -175
- data/spec/sample_feeds/PaulDixExplainsNothingFirstEntryContent.xml +16 -16
- data/spec/sample_feeds/PaulDixExplainsNothingWFW.xml +174 -174
- data/spec/sample_feeds/TenderLovemaking.xml +12 -2
- data/spec/sample_feeds/TrotterCashionHome.xml +611 -611
- data/spec/sample_feeds/TypePadNews.xml +368 -368
- data/spec/sample_feeds/itunes.xml +31 -2
- data/spec/sample_feeds/pet_atom.xml +229 -229
- data/spec/sample_feeds/youtube_atom.xml +395 -0
- data/spec/sample_feeds.rb +31 -21
- data/spec/spec_helper.rb +6 -0
- metadata +132 -25
@@ -1,6 +1,7 @@
|
|
1
|
-
require
|
2
|
-
require
|
1
|
+
require 'time'
|
2
|
+
require 'date'
|
3
3
|
|
4
|
+
# rubocop:disable Style/DocumentationMethod
|
4
5
|
class Time
|
5
6
|
# Parse a time string and convert it to UTC without raising errors.
|
6
7
|
# Parses a flattened 14-digit time (YYYYmmddHHMMSS) as UTC.
|
@@ -10,22 +11,28 @@ class Time
|
|
10
11
|
#
|
11
12
|
# === Returns
|
12
13
|
# A Time instance in UTC or nil if there were errors while parsing.
|
14
|
+
# rubocop:disable Metrics/MethodLength
|
13
15
|
def self.parse_safely(dt)
|
14
|
-
if dt
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
when dt.respond_to?(:to_datetime)
|
21
|
-
dt.to_datetime.utc
|
22
|
-
when dt.to_s =~ /\A\d{14}\z/
|
23
|
-
parse("#{dt.to_s}Z", true)
|
24
|
-
else
|
25
|
-
parse(dt.to_s, true).utc
|
26
|
-
end
|
16
|
+
if dt.is_a?(Time)
|
17
|
+
dt.utc
|
18
|
+
elsif dt.respond_to?(:to_datetime)
|
19
|
+
dt.to_datetime.utc
|
20
|
+
elsif dt.respond_to? :to_s
|
21
|
+
parse_string_safely dt.to_s
|
27
22
|
end
|
28
|
-
rescue StandardError
|
23
|
+
rescue StandardError => e
|
24
|
+
Feedjira.logger.debug { "Failed to parse time #{dt}" }
|
25
|
+
Feedjira.logger.debug(e)
|
29
26
|
nil
|
30
|
-
end
|
27
|
+
end
|
28
|
+
|
29
|
+
def self.parse_string_safely(string)
|
30
|
+
return nil if string.empty?
|
31
|
+
|
32
|
+
if string =~ /\A\d{14}\z/
|
33
|
+
parse("#{string}Z", true)
|
34
|
+
else
|
35
|
+
parse(string).utc
|
36
|
+
end
|
37
|
+
end
|
31
38
|
end
|
data/lib/feedjira/core_ext.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
1
|
+
require 'feedjira/core_ext/time'
|
2
|
+
require 'feedjira/core_ext/date'
|
3
|
+
require 'feedjira/core_ext/string'
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# rubocop:disable Style/Documentation
|
2
|
+
# rubocop:disable Style/DocumentationMethod
|
3
|
+
module Feedjira
|
4
|
+
module DateTimeUtilities
|
5
|
+
class DateTimeEpochParser
|
6
|
+
def self.parse(string)
|
7
|
+
epoch_time = string.to_i
|
8
|
+
return Time.at(epoch_time).to_datetime if epoch_time.to_s == string
|
9
|
+
raise "#{string} is not a valid epoch time"
|
10
|
+
end
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# rubocop:disable Style/Documentation
|
2
|
+
# rubocop:disable Style/DocumentationMethod
|
3
|
+
module Feedjira
|
4
|
+
module DateTimeUtilities
|
5
|
+
class DateTimeLanguageParser
|
6
|
+
MONTHS_ENGLISH =
|
7
|
+
%w(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec).freeze
|
8
|
+
MONTHS_SPANISH =
|
9
|
+
%w(Ene Feb Mar Abr May Jun Jul Ago Sep Oct Nov Dic).freeze
|
10
|
+
|
11
|
+
def self.parse(string)
|
12
|
+
DateTime.parse(translate(string))
|
13
|
+
end
|
14
|
+
|
15
|
+
def self.translate(string)
|
16
|
+
MONTHS_SPANISH.each_with_index do |m, i|
|
17
|
+
rgx = Regexp.new("\s#{m}\s", Regexp::IGNORECASE)
|
18
|
+
return string.gsub(rgx, MONTHS_ENGLISH[i]) if string =~ rgx
|
19
|
+
end
|
20
|
+
raise "No translation found for #{string}"
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# rubocop:disable Style/Documentation
|
2
|
+
# rubocop:disable Style/DocumentationMethod
|
3
|
+
module Feedjira
|
4
|
+
module DateTimeUtilities
|
5
|
+
class DateTimePatternParser
|
6
|
+
# rubocop:disable Style/AsciiComments
|
7
|
+
# Japanese Symbols are required for strange Date Strings like
|
8
|
+
# '水, 31 8 2016 07:37:00 PDT'
|
9
|
+
JAPANESE_SYMBOLS = %w(日 月 火 水 木 金 土).freeze
|
10
|
+
PATTERNS = ['%m/%d/%Y %T %p', '%d %m %Y %T %Z'].freeze
|
11
|
+
|
12
|
+
# rubocop:disable Metrics/MethodLength
|
13
|
+
def self.parse(string)
|
14
|
+
PATTERNS.each do |p|
|
15
|
+
begin
|
16
|
+
datetime = DateTime.strptime(prepare(string), p)
|
17
|
+
return datetime
|
18
|
+
rescue StandardError => e
|
19
|
+
Feedjira.logger.debug("Failed to parse date #{string}")
|
20
|
+
Feedjira.logger.debug(e)
|
21
|
+
nil
|
22
|
+
end
|
23
|
+
end
|
24
|
+
raise "No pattern matched #{string}"
|
25
|
+
end
|
26
|
+
|
27
|
+
def self.prepare(string)
|
28
|
+
rgx = Regexp.new("^(#{JAPANESE_SYMBOLS.join('|')}),\s")
|
29
|
+
string.gsub(rgx, '')
|
30
|
+
end
|
31
|
+
private_class_method :prepare
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# rubocop:disable Style/Documentation
|
2
|
+
module Feedjira
|
3
|
+
module DateTimeUtilities
|
4
|
+
# This is our date parsing heuristic.
|
5
|
+
# Date Parsers are attempted in order.
|
6
|
+
DATE_PARSERS = [
|
7
|
+
DateTimePatternParser,
|
8
|
+
DateTimeLanguageParser,
|
9
|
+
DateTimeEpochParser,
|
10
|
+
DateTime
|
11
|
+
].freeze
|
12
|
+
|
13
|
+
# Parse the given string starting with the first parser in DATE_PARSERS
|
14
|
+
# and going over all other available parsers
|
15
|
+
# rubocop:disable Metrics/MethodLength
|
16
|
+
def parse_datetime(string)
|
17
|
+
res = DATE_PARSERS.find do |parser|
|
18
|
+
begin
|
19
|
+
return parser.parse(string).feed_utils_to_gm_time
|
20
|
+
rescue StandardError => e
|
21
|
+
Feedjira.logger.debug { "Failed to parse date #{string}" }
|
22
|
+
Feedjira.logger.debug(e)
|
23
|
+
nil
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
Feedjira.logger.warn { "Failed to parse date #{string}" } if res.nil?
|
28
|
+
|
29
|
+
res
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
data/lib/feedjira/feed.rb
CHANGED
@@ -1,85 +1,112 @@
|
|
1
|
+
# rubocop:disable Style/Documentation
|
2
|
+
# rubocop:disable Style/DocumentationMethod
|
1
3
|
module Feedjira
|
2
4
|
class Feed
|
3
|
-
|
4
|
-
parser
|
5
|
-
|
5
|
+
class << self
|
6
|
+
def parse_with(parser, xml, &block)
|
7
|
+
parser.parse xml, &block
|
8
|
+
end
|
6
9
|
|
7
|
-
|
8
|
-
|
10
|
+
def parse(xml, &block)
|
11
|
+
parser = determine_feed_parser_for_xml(xml)
|
12
|
+
raise NoParserAvailable, 'No valid parser for XML.' unless parser
|
9
13
|
parse_with parser, xml, &block
|
10
|
-
else
|
11
|
-
raise NoParserAvailable.new("No valid parser for XML.")
|
12
14
|
end
|
13
|
-
end
|
14
15
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
16
|
+
def determine_feed_parser_for_xml(xml)
|
17
|
+
start_of_doc = xml.slice(0, 2000)
|
18
|
+
feed_classes.detect { |klass| klass.able_to_parse?(start_of_doc) }
|
19
|
+
end
|
19
20
|
|
20
|
-
|
21
|
-
|
22
|
-
|
21
|
+
def add_feed_class(klass)
|
22
|
+
feed_classes.unshift klass
|
23
|
+
end
|
23
24
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
Feedjira::Parser::GoogleDocsAtom,
|
28
|
-
Feedjira::Parser::AtomFeedBurner,
|
29
|
-
Feedjira::Parser::Atom,
|
30
|
-
Feedjira::Parser::ITunesRSS,
|
31
|
-
Feedjira::Parser::RSS
|
32
|
-
]
|
33
|
-
end
|
25
|
+
def feed_classes
|
26
|
+
@feed_classes ||= Feedjira.parsers
|
27
|
+
end
|
34
28
|
|
35
|
-
|
36
|
-
|
37
|
-
k.element element_tag, options
|
29
|
+
def reset_parsers!
|
30
|
+
@feed_classes = nil
|
38
31
|
end
|
39
|
-
end
|
40
32
|
|
41
|
-
|
42
|
-
|
43
|
-
|
33
|
+
def add_common_feed_element(element_tag, options = {})
|
34
|
+
feed_classes.each do |k|
|
35
|
+
k.element element_tag, options
|
36
|
+
end
|
44
37
|
end
|
45
|
-
end
|
46
38
|
|
47
|
-
|
48
|
-
|
49
|
-
|
39
|
+
def add_common_feed_elements(element_tag, options = {})
|
40
|
+
feed_classes.each do |k|
|
41
|
+
k.elements element_tag, options
|
42
|
+
end
|
43
|
+
end
|
50
44
|
|
51
|
-
|
52
|
-
|
53
|
-
|
45
|
+
def add_common_feed_entry_element(element_tag, options = {})
|
46
|
+
call_on_each_feed_entry :element, element_tag, options
|
47
|
+
end
|
54
48
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
49
|
+
def add_common_feed_entry_elements(element_tag, options = {})
|
50
|
+
call_on_each_feed_entry :elements, element_tag, options
|
51
|
+
end
|
52
|
+
|
53
|
+
def call_on_each_feed_entry(method, *parameters)
|
54
|
+
feed_classes.each do |klass|
|
55
|
+
klass.sax_config.collection_elements.each_value do |value|
|
56
|
+
collection_configs = value.select do |v|
|
57
|
+
v.accessor == 'entries' && v.data_class.class == Class
|
58
|
+
end
|
59
|
+
|
60
|
+
collection_configs.each do |config|
|
61
|
+
config.data_class.send(method, *parameters)
|
62
|
+
end
|
60
63
|
end
|
61
64
|
end
|
62
65
|
end
|
63
|
-
end
|
64
66
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
67
|
+
def fetch_and_parse(url)
|
68
|
+
response = connection(url).get
|
69
|
+
unless response.success?
|
70
|
+
raise FetchFailure, "Fetch failed - #{response.status}"
|
71
|
+
end
|
72
|
+
feed = parse response.body
|
73
|
+
feed.feed_url = url
|
74
|
+
feed.etag = response.headers['etag'].to_s.delete '"'
|
75
|
+
|
76
|
+
feed.last_modified = parse_last_modified(response)
|
77
|
+
feed
|
78
|
+
end
|
79
|
+
|
80
|
+
# rubocop:disable LineLength
|
81
|
+
def connection(url)
|
82
|
+
Faraday.new(url: url, headers: headers, request: request_options) do |conn|
|
83
|
+
conn.use FaradayMiddleware::FollowRedirects, limit: Feedjira.follow_redirect_limit
|
84
|
+
conn.adapter(*Faraday.default_adapter)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
# rubocop:enable LineLength
|
88
|
+
|
89
|
+
private
|
90
|
+
|
91
|
+
def headers
|
92
|
+
{
|
93
|
+
user_agent: Feedjira.user_agent
|
94
|
+
}
|
95
|
+
end
|
96
|
+
|
97
|
+
def request_options
|
98
|
+
{
|
99
|
+
timeout: Feedjira.request_timeout
|
100
|
+
}
|
101
|
+
end
|
78
102
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
103
|
+
def parse_last_modified(response)
|
104
|
+
lm = response.headers['last-modified']
|
105
|
+
DateTime.parse(lm).to_time
|
106
|
+
rescue StandardError => e
|
107
|
+
Feedjira.logger.warn { "Failed to parse last modified '#{lm}'" }
|
108
|
+
Feedjira.logger.debug(e)
|
109
|
+
nil
|
83
110
|
end
|
84
111
|
end
|
85
112
|
end
|
@@ -1,23 +1,25 @@
|
|
1
|
+
# rubocop:disable Style/Documentation
|
2
|
+
# rubocop:disable Style/DocumentationMethod
|
1
3
|
module Feedjira
|
2
4
|
module FeedEntryUtilities
|
3
|
-
|
4
5
|
include Enumerable
|
6
|
+
include DateTimeUtilities
|
5
7
|
|
6
8
|
def published
|
7
9
|
@published ||= @updated
|
8
10
|
end
|
9
11
|
|
10
12
|
def parse_datetime(string)
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
end
|
13
|
+
DateTime.parse(string).feed_utils_to_gm_time
|
14
|
+
rescue StandardError => e
|
15
|
+
Feedjira.logger.warn { "Failed to parse date #{string.inspect}" }
|
16
|
+
Feedjira.logger.debug(e)
|
17
|
+
nil
|
17
18
|
end
|
18
19
|
|
19
20
|
##
|
20
|
-
# Returns the id of the entry or its url if not id is present, as some
|
21
|
+
# Returns the id of the entry or its url if no id is present, as some
|
22
|
+
# formats don't support it
|
21
23
|
def id
|
22
24
|
@entry_id ||= @url
|
23
25
|
end
|
@@ -26,41 +28,40 @@ module Feedjira
|
|
26
28
|
# Writer for published. By default, we keep the "oldest" publish time found.
|
27
29
|
def published=(val)
|
28
30
|
parsed = parse_datetime(val)
|
29
|
-
@published = parsed if !@published || parsed < @published
|
31
|
+
@published = parsed if parsed && (!@published || parsed < @published)
|
30
32
|
end
|
31
33
|
|
32
34
|
##
|
33
35
|
# Writer for updated. By default, we keep the most recent update time found.
|
34
36
|
def updated=(val)
|
35
37
|
parsed = parse_datetime(val)
|
36
|
-
@updated = parsed if !@updated || parsed > @updated
|
38
|
+
@updated = parsed if parsed && (!@updated || parsed > @updated)
|
37
39
|
end
|
38
40
|
|
39
41
|
def sanitize!
|
40
|
-
%w
|
41
|
-
if
|
42
|
-
|
42
|
+
%w(title author summary content image).each do |name|
|
43
|
+
if respond_to?(name) && send(name).respond_to?(:sanitize!)
|
44
|
+
send(name).send :sanitize!
|
43
45
|
end
|
44
46
|
end
|
45
47
|
end
|
46
48
|
|
47
|
-
|
49
|
+
alias last_modified published
|
48
50
|
|
49
51
|
def each
|
50
|
-
@rss_fields ||=
|
52
|
+
@rss_fields ||= instance_variables
|
51
53
|
|
52
54
|
@rss_fields.each do |field|
|
53
|
-
yield(field.to_s.sub('@', ''),
|
55
|
+
yield(field.to_s.sub('@', ''), instance_variable_get(field))
|
54
56
|
end
|
55
57
|
end
|
56
58
|
|
57
59
|
def [](field)
|
58
|
-
|
60
|
+
instance_variable_get("@#{field}")
|
59
61
|
end
|
60
62
|
|
61
63
|
def []=(field, value)
|
62
|
-
|
64
|
+
instance_variable_set("@#{field}", value)
|
63
65
|
end
|
64
|
-
|
65
66
|
end
|
66
67
|
end
|
@@ -1,6 +1,8 @@
|
|
1
|
+
# rubocop:disable Style/Documentation
|
2
|
+
# rubocop:disable Style/DocumentationMethod
|
1
3
|
module Feedjira
|
2
4
|
module FeedUtilities
|
3
|
-
UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified etag)
|
5
|
+
UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified etag).freeze
|
4
6
|
|
5
7
|
attr_writer :new_entries, :updated, :last_modified
|
6
8
|
attr_accessor :etag
|
@@ -11,7 +13,7 @@ module Feedjira
|
|
11
13
|
|
12
14
|
module ClassMethods
|
13
15
|
def parse(xml, &block)
|
14
|
-
xml = xml
|
16
|
+
xml = strip_whitespace(xml)
|
15
17
|
xml = preprocess(xml) if preprocess_xml
|
16
18
|
super xml, &block
|
17
19
|
end
|
@@ -28,11 +30,20 @@ module Feedjira
|
|
28
30
|
def preprocess_xml
|
29
31
|
@preprocess_xml
|
30
32
|
end
|
33
|
+
|
34
|
+
def strip_whitespace(xml)
|
35
|
+
if Feedjira.strip_whitespace
|
36
|
+
xml.strip
|
37
|
+
else
|
38
|
+
xml.lstrip
|
39
|
+
end
|
40
|
+
end
|
31
41
|
end
|
32
42
|
|
33
43
|
def last_modified
|
34
44
|
@last_modified ||= begin
|
35
|
-
|
45
|
+
published = entries.reject { |e| e.published.nil? }
|
46
|
+
entry = published.sort_by { |e| e.published if e.published }.last
|
36
47
|
entry ? entry.published : nil
|
37
48
|
end
|
38
49
|
end
|
@@ -45,13 +56,13 @@ module Feedjira
|
|
45
56
|
@new_entries ||= []
|
46
57
|
end
|
47
58
|
|
48
|
-
def
|
49
|
-
new_entries.
|
59
|
+
def new_entries?
|
60
|
+
!new_entries.empty?
|
50
61
|
end
|
51
62
|
|
52
63
|
def update_from_feed(feed)
|
53
64
|
self.new_entries += find_new_entries_for(feed)
|
54
|
-
|
65
|
+
entries.unshift(*self.new_entries)
|
55
66
|
|
56
67
|
@updated = false
|
57
68
|
|
@@ -61,7 +72,8 @@ module Feedjira
|
|
61
72
|
end
|
62
73
|
|
63
74
|
def update_attribute(feed, name)
|
64
|
-
old_value
|
75
|
+
old_value = send(name)
|
76
|
+
new_value = feed.send(name)
|
65
77
|
|
66
78
|
if old_value != new_value
|
67
79
|
send("#{name}=", new_value)
|
@@ -72,33 +84,36 @@ module Feedjira
|
|
72
84
|
end
|
73
85
|
|
74
86
|
def sanitize_entries!
|
75
|
-
entries.each
|
87
|
+
entries.each(&:sanitize!)
|
76
88
|
end
|
77
89
|
|
78
90
|
private
|
79
91
|
|
92
|
+
# This implementation is a hack, which is why it's so ugly. It's to get
|
93
|
+
# around the fact that not all feeds have a published date. However,
|
94
|
+
# they're always ordered with the newest one first. So we go through the
|
95
|
+
# entries just parsed and insert each one as a new entry until we get to
|
96
|
+
# one that has the same id as the newest for the feed.
|
80
97
|
def find_new_entries_for(feed)
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
# So we go through the entries just parsed and insert each one as a new entry
|
85
|
-
# until we get to one that has the same id as the the newest for the feed
|
86
|
-
return feed.entries if self.entries.length == 0
|
87
|
-
latest_entry = self.entries.first
|
98
|
+
return feed.entries if entries.length.zero?
|
99
|
+
|
100
|
+
latest_entry = entries.first
|
88
101
|
found_new_entries = []
|
102
|
+
|
89
103
|
feed.entries.each do |entry|
|
90
|
-
|
91
|
-
break if entry.url == latest_entry.url
|
92
|
-
else
|
93
|
-
break if entry.entry_id == latest_entry.entry_id || entry.url == latest_entry.url
|
94
|
-
end
|
104
|
+
break unless new_entry?(entry, latest_entry)
|
95
105
|
found_new_entries << entry
|
96
106
|
end
|
107
|
+
|
97
108
|
found_new_entries
|
98
109
|
end
|
99
110
|
|
100
|
-
def
|
101
|
-
|
111
|
+
def new_entry?(entry, latest)
|
112
|
+
nil_ids = entry.entry_id.nil? && latest.entry_id.nil?
|
113
|
+
new_id = entry.entry_id != latest.entry_id
|
114
|
+
new_url = entry.url != latest.url
|
115
|
+
|
116
|
+
(nil_ids || new_id) && new_url
|
102
117
|
end
|
103
118
|
end
|
104
119
|
end
|
data/lib/feedjira/parser/atom.rb
CHANGED
@@ -1,19 +1,21 @@
|
|
1
|
+
# rubocop:disable Style/DocumentationMethod
|
1
2
|
module Feedjira
|
2
3
|
module Parser
|
3
4
|
# Parser for dealing with Atom feeds.
|
4
5
|
class Atom
|
5
6
|
include SAXMachine
|
6
7
|
include FeedUtilities
|
8
|
+
|
7
9
|
element :title
|
8
|
-
element :subtitle, :
|
9
|
-
element :link, :
|
10
|
-
element :link, :
|
11
|
-
elements :link, :
|
12
|
-
elements :link, :
|
13
|
-
elements :entry, :
|
10
|
+
element :subtitle, as: :description
|
11
|
+
element :link, as: :url, value: :href, with: { type: 'text/html' }
|
12
|
+
element :link, as: :feed_url, value: :href, with: { rel: 'self' }
|
13
|
+
elements :link, as: :links, value: :href
|
14
|
+
elements :link, as: :hubs, value: :href, with: { rel: 'hub' }
|
15
|
+
elements :entry, as: :entries, class: AtomEntry
|
14
16
|
|
15
|
-
def self.able_to_parse?(xml)
|
16
|
-
|
17
|
+
def self.able_to_parse?(xml)
|
18
|
+
%r{\<feed[^\>]+xmlns\s?=\s?[\"\'](http://www\.w3\.org/2005/Atom|http://purl\.org/atom/ns\#)[\"\'][^\>]*\>} =~ xml # rubocop:disable Metrics/LineLength
|
17
19
|
end
|
18
20
|
|
19
21
|
def url
|
@@ -1,5 +1,5 @@
|
|
1
|
+
# rubocop:disable Style/DocumentationMethod
|
1
2
|
module Feedjira
|
2
|
-
|
3
3
|
module Parser
|
4
4
|
# Parser for dealing with Atom feed entries.
|
5
5
|
class AtomEntry
|
@@ -7,28 +7,26 @@ module Feedjira
|
|
7
7
|
include FeedEntryUtilities
|
8
8
|
|
9
9
|
element :title
|
10
|
-
element :link, :
|
11
|
-
element :name, :
|
10
|
+
element :link, as: :url, value: :href, with: { type: 'text/html', rel: 'alternate' } # rubocop:disable Metrics/LineLength
|
11
|
+
element :name, as: :author
|
12
12
|
element :content
|
13
13
|
element :summary
|
14
14
|
|
15
|
-
element :"media:content", :
|
16
|
-
element :enclosure, :
|
15
|
+
element :"media:content", as: :image, value: :url
|
16
|
+
element :enclosure, as: :image, value: :href
|
17
17
|
|
18
18
|
element :published
|
19
|
-
element :id, :
|
20
|
-
element :created, :
|
21
|
-
element :issued, :
|
19
|
+
element :id, as: :entry_id
|
20
|
+
element :created, as: :published
|
21
|
+
element :issued, as: :published
|
22
22
|
element :updated
|
23
|
-
element :modified, :
|
24
|
-
elements :category, :
|
25
|
-
elements :link, :
|
23
|
+
element :modified, as: :updated
|
24
|
+
elements :category, as: :categories, value: :term
|
25
|
+
elements :link, as: :links, value: :href
|
26
26
|
|
27
27
|
def url
|
28
28
|
@url ||= links.first
|
29
29
|
end
|
30
30
|
end
|
31
|
-
|
32
31
|
end
|
33
|
-
|
34
32
|
end
|