feedjira 2.0.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +2 -0
- data/.rubocop.yml +15 -0
- data/.travis.yml +31 -12
- data/CHANGELOG.md +34 -1
- data/Dangerfile +1 -0
- data/Gemfile +2 -1
- data/LICENSE +1 -1
- data/README.md +210 -7
- data/Rakefile +11 -1
- data/feedjira.gemspec +17 -14
- data/fixtures/vcr_cassettes/fetch_failure.yml +62 -0
- data/fixtures/vcr_cassettes/parse_error.yml +222 -0
- data/fixtures/vcr_cassettes/success.yml +281 -0
- data/lib/feedjira/configuration.rb +76 -0
- data/lib/feedjira/core_ext/date.rb +3 -1
- data/lib/feedjira/core_ext/string.rb +2 -1
- data/lib/feedjira/core_ext/time.rb +24 -17
- data/lib/feedjira/core_ext.rb +3 -3
- data/lib/feedjira/date_time_utilities/date_time_epoch_parser.rb +13 -0
- data/lib/feedjira/date_time_utilities/date_time_language_parser.rb +24 -0
- data/lib/feedjira/date_time_utilities/date_time_pattern_parser.rb +34 -0
- data/lib/feedjira/date_time_utilities.rb +32 -0
- data/lib/feedjira/feed.rb +89 -62
- data/lib/feedjira/feed_entry_utilities.rb +20 -19
- data/lib/feedjira/feed_utilities.rb +37 -22
- data/lib/feedjira/parser/atom.rb +10 -8
- data/lib/feedjira/parser/atom_entry.rb +11 -13
- data/lib/feedjira/parser/atom_feed_burner.rb +27 -10
- data/lib/feedjira/parser/atom_feed_burner_entry.rb +12 -14
- data/lib/feedjira/parser/atom_youtube.rb +21 -0
- data/lib/feedjira/parser/atom_youtube_entry.rb +30 -0
- data/lib/feedjira/parser/google_docs_atom.rb +8 -7
- data/lib/feedjira/parser/google_docs_atom_entry.rb +13 -11
- data/lib/feedjira/parser/itunes_rss.rb +41 -22
- data/lib/feedjira/parser/itunes_rss_category.rb +39 -0
- data/lib/feedjira/parser/itunes_rss_item.rb +32 -20
- data/lib/feedjira/parser/itunes_rss_owner.rb +4 -4
- data/lib/feedjira/parser/podlove_chapter.rb +22 -0
- data/lib/feedjira/parser/rss.rb +11 -8
- data/lib/feedjira/parser/rss_entry.rb +17 -21
- data/lib/feedjira/parser/rss_feed_burner.rb +5 -6
- data/lib/feedjira/parser/rss_feed_burner_entry.rb +24 -28
- data/lib/feedjira/parser/rss_image.rb +15 -0
- data/lib/feedjira/parser.rb +1 -1
- data/lib/feedjira/preprocessor.rb +4 -2
- data/lib/feedjira/version.rb +1 -1
- data/lib/feedjira.rb +15 -0
- data/spec/feedjira/configuration_spec.rb +25 -0
- data/spec/feedjira/date_time_utilities_spec.rb +47 -0
- data/spec/feedjira/feed_entry_utilities_spec.rb +23 -19
- data/spec/feedjira/feed_spec.rb +140 -75
- data/spec/feedjira/feed_utilities_spec.rb +83 -63
- data/spec/feedjira/parser/atom_entry_spec.rb +54 -34
- data/spec/feedjira/parser/atom_feed_burner_entry_spec.rb +27 -20
- data/spec/feedjira/parser/atom_feed_burner_spec.rb +87 -30
- data/spec/feedjira/parser/atom_spec.rb +50 -48
- data/spec/feedjira/parser/atom_youtube_entry_spec.rb +86 -0
- data/spec/feedjira/parser/atom_youtube_spec.rb +43 -0
- data/spec/feedjira/parser/google_docs_atom_entry_spec.rb +5 -4
- data/spec/feedjira/parser/google_docs_atom_spec.rb +6 -6
- data/spec/feedjira/parser/itunes_rss_item_spec.rb +49 -29
- data/spec/feedjira/parser/itunes_rss_owner_spec.rb +10 -9
- data/spec/feedjira/parser/itunes_rss_spec.rb +87 -30
- data/spec/feedjira/parser/podlove_chapter_spec.rb +37 -0
- data/spec/feedjira/parser/rss_entry_spec.rb +50 -33
- data/spec/feedjira/parser/rss_feed_burner_entry_spec.rb +55 -33
- data/spec/feedjira/parser/rss_feed_burner_spec.rb +31 -26
- data/spec/feedjira/parser/rss_spec.rb +56 -24
- data/spec/feedjira/preprocessor_spec.rb +11 -3
- data/spec/sample_feeds/AmazonWebServicesBlog.xml +797 -797
- data/spec/sample_feeds/AtomEscapedHTMLInPreTag.xml +13 -0
- data/spec/sample_feeds/CRE.xml +5849 -0
- data/spec/sample_feeds/FeedBurnerXHTML.xml +400 -400
- data/spec/sample_feeds/GiantRobotsSmashingIntoOtherGiantRobots.xml +682 -0
- data/spec/sample_feeds/ITunesWithSingleQuotedAttributes.xml +67 -0
- data/spec/sample_feeds/InvalidDateFormat.xml +20 -0
- data/spec/sample_feeds/PaulDixExplainsNothing.xml +175 -175
- data/spec/sample_feeds/PaulDixExplainsNothingAlternate.xml +175 -175
- data/spec/sample_feeds/PaulDixExplainsNothingFirstEntryContent.xml +16 -16
- data/spec/sample_feeds/PaulDixExplainsNothingWFW.xml +174 -174
- data/spec/sample_feeds/TenderLovemaking.xml +12 -2
- data/spec/sample_feeds/TrotterCashionHome.xml +611 -611
- data/spec/sample_feeds/TypePadNews.xml +368 -368
- data/spec/sample_feeds/itunes.xml +31 -2
- data/spec/sample_feeds/pet_atom.xml +229 -229
- data/spec/sample_feeds/youtube_atom.xml +395 -0
- data/spec/sample_feeds.rb +31 -21
- data/spec/spec_helper.rb +6 -0
- metadata +132 -25
@@ -1,6 +1,7 @@
|
|
1
|
-
require
|
2
|
-
require
|
1
|
+
require 'time'
|
2
|
+
require 'date'
|
3
3
|
|
4
|
+
# rubocop:disable Style/DocumentationMethod
|
4
5
|
class Time
|
5
6
|
# Parse a time string and convert it to UTC without raising errors.
|
6
7
|
# Parses a flattened 14-digit time (YYYYmmddHHMMSS) as UTC.
|
@@ -10,22 +11,28 @@ class Time
|
|
10
11
|
#
|
11
12
|
# === Returns
|
12
13
|
# A Time instance in UTC or nil if there were errors while parsing.
|
14
|
+
# rubocop:disable Metrics/MethodLength
|
13
15
|
def self.parse_safely(dt)
|
14
|
-
if dt
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
when dt.respond_to?(:to_datetime)
|
21
|
-
dt.to_datetime.utc
|
22
|
-
when dt.to_s =~ /\A\d{14}\z/
|
23
|
-
parse("#{dt.to_s}Z", true)
|
24
|
-
else
|
25
|
-
parse(dt.to_s, true).utc
|
26
|
-
end
|
16
|
+
if dt.is_a?(Time)
|
17
|
+
dt.utc
|
18
|
+
elsif dt.respond_to?(:to_datetime)
|
19
|
+
dt.to_datetime.utc
|
20
|
+
elsif dt.respond_to? :to_s
|
21
|
+
parse_string_safely dt.to_s
|
27
22
|
end
|
28
|
-
rescue StandardError
|
23
|
+
rescue StandardError => e
|
24
|
+
Feedjira.logger.debug { "Failed to parse time #{dt}" }
|
25
|
+
Feedjira.logger.debug(e)
|
29
26
|
nil
|
30
|
-
end
|
27
|
+
end
|
28
|
+
|
29
|
+
def self.parse_string_safely(string)
|
30
|
+
return nil if string.empty?
|
31
|
+
|
32
|
+
if string =~ /\A\d{14}\z/
|
33
|
+
parse("#{string}Z", true)
|
34
|
+
else
|
35
|
+
parse(string).utc
|
36
|
+
end
|
37
|
+
end
|
31
38
|
end
|
data/lib/feedjira/core_ext.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
1
|
+
require 'feedjira/core_ext/time'
|
2
|
+
require 'feedjira/core_ext/date'
|
3
|
+
require 'feedjira/core_ext/string'
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# rubocop:disable Style/Documentation
|
2
|
+
# rubocop:disable Style/DocumentationMethod
|
3
|
+
module Feedjira
|
4
|
+
module DateTimeUtilities
|
5
|
+
class DateTimeEpochParser
|
6
|
+
def self.parse(string)
|
7
|
+
epoch_time = string.to_i
|
8
|
+
return Time.at(epoch_time).to_datetime if epoch_time.to_s == string
|
9
|
+
raise "#{string} is not a valid epoch time"
|
10
|
+
end
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# rubocop:disable Style/Documentation
|
2
|
+
# rubocop:disable Style/DocumentationMethod
|
3
|
+
module Feedjira
|
4
|
+
module DateTimeUtilities
|
5
|
+
class DateTimeLanguageParser
|
6
|
+
MONTHS_ENGLISH =
|
7
|
+
%w(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec).freeze
|
8
|
+
MONTHS_SPANISH =
|
9
|
+
%w(Ene Feb Mar Abr May Jun Jul Ago Sep Oct Nov Dic).freeze
|
10
|
+
|
11
|
+
def self.parse(string)
|
12
|
+
DateTime.parse(translate(string))
|
13
|
+
end
|
14
|
+
|
15
|
+
def self.translate(string)
|
16
|
+
MONTHS_SPANISH.each_with_index do |m, i|
|
17
|
+
rgx = Regexp.new("\s#{m}\s", Regexp::IGNORECASE)
|
18
|
+
return string.gsub(rgx, MONTHS_ENGLISH[i]) if string =~ rgx
|
19
|
+
end
|
20
|
+
raise "No translation found for #{string}"
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# rubocop:disable Style/Documentation
|
2
|
+
# rubocop:disable Style/DocumentationMethod
|
3
|
+
module Feedjira
|
4
|
+
module DateTimeUtilities
|
5
|
+
class DateTimePatternParser
|
6
|
+
# rubocop:disable Style/AsciiComments
|
7
|
+
# Japanese Symbols are required for strange Date Strings like
|
8
|
+
# '水, 31 8 2016 07:37:00 PDT'
|
9
|
+
JAPANESE_SYMBOLS = %w(日 月 火 水 木 金 土).freeze
|
10
|
+
PATTERNS = ['%m/%d/%Y %T %p', '%d %m %Y %T %Z'].freeze
|
11
|
+
|
12
|
+
# rubocop:disable Metrics/MethodLength
|
13
|
+
def self.parse(string)
|
14
|
+
PATTERNS.each do |p|
|
15
|
+
begin
|
16
|
+
datetime = DateTime.strptime(prepare(string), p)
|
17
|
+
return datetime
|
18
|
+
rescue StandardError => e
|
19
|
+
Feedjira.logger.debug("Failed to parse date #{string}")
|
20
|
+
Feedjira.logger.debug(e)
|
21
|
+
nil
|
22
|
+
end
|
23
|
+
end
|
24
|
+
raise "No pattern matched #{string}"
|
25
|
+
end
|
26
|
+
|
27
|
+
def self.prepare(string)
|
28
|
+
rgx = Regexp.new("^(#{JAPANESE_SYMBOLS.join('|')}),\s")
|
29
|
+
string.gsub(rgx, '')
|
30
|
+
end
|
31
|
+
private_class_method :prepare
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# rubocop:disable Style/Documentation
|
2
|
+
module Feedjira
|
3
|
+
module DateTimeUtilities
|
4
|
+
# This is our date parsing heuristic.
|
5
|
+
# Date Parsers are attempted in order.
|
6
|
+
DATE_PARSERS = [
|
7
|
+
DateTimePatternParser,
|
8
|
+
DateTimeLanguageParser,
|
9
|
+
DateTimeEpochParser,
|
10
|
+
DateTime
|
11
|
+
].freeze
|
12
|
+
|
13
|
+
# Parse the given string by trying each parser in DATE_PARSERS in order
|
14
|
+
# (Ruby's DateTime is tried last); the first successful result is returned.
|
15
|
+
# rubocop:disable Metrics/MethodLength
|
16
|
+
def parse_datetime(string)
|
17
|
+
res = DATE_PARSERS.find do |parser|
|
18
|
+
begin
|
19
|
+
return parser.parse(string).feed_utils_to_gm_time
|
20
|
+
rescue StandardError => e
|
21
|
+
Feedjira.logger.debug { "Failed to parse date #{string}" }
|
22
|
+
Feedjira.logger.debug(e)
|
23
|
+
nil
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
Feedjira.logger.warn { "Failed to parse date #{string}" } if res.nil?
|
28
|
+
|
29
|
+
res
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
data/lib/feedjira/feed.rb
CHANGED
@@ -1,85 +1,112 @@
|
|
1
|
+
# rubocop:disable Style/Documentation
|
2
|
+
# rubocop:disable Style/DocumentationMethod
|
1
3
|
module Feedjira
|
2
4
|
class Feed
|
3
|
-
|
4
|
-
parser
|
5
|
-
|
5
|
+
class << self
|
6
|
+
def parse_with(parser, xml, &block)
|
7
|
+
parser.parse xml, &block
|
8
|
+
end
|
6
9
|
|
7
|
-
|
8
|
-
|
10
|
+
def parse(xml, &block)
|
11
|
+
parser = determine_feed_parser_for_xml(xml)
|
12
|
+
raise NoParserAvailable, 'No valid parser for XML.' unless parser
|
9
13
|
parse_with parser, xml, &block
|
10
|
-
else
|
11
|
-
raise NoParserAvailable.new("No valid parser for XML.")
|
12
14
|
end
|
13
|
-
end
|
14
15
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
16
|
+
def determine_feed_parser_for_xml(xml)
|
17
|
+
start_of_doc = xml.slice(0, 2000)
|
18
|
+
feed_classes.detect { |klass| klass.able_to_parse?(start_of_doc) }
|
19
|
+
end
|
19
20
|
|
20
|
-
|
21
|
-
|
22
|
-
|
21
|
+
def add_feed_class(klass)
|
22
|
+
feed_classes.unshift klass
|
23
|
+
end
|
23
24
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
Feedjira::Parser::GoogleDocsAtom,
|
28
|
-
Feedjira::Parser::AtomFeedBurner,
|
29
|
-
Feedjira::Parser::Atom,
|
30
|
-
Feedjira::Parser::ITunesRSS,
|
31
|
-
Feedjira::Parser::RSS
|
32
|
-
]
|
33
|
-
end
|
25
|
+
def feed_classes
|
26
|
+
@feed_classes ||= Feedjira.parsers
|
27
|
+
end
|
34
28
|
|
35
|
-
|
36
|
-
|
37
|
-
k.element element_tag, options
|
29
|
+
def reset_parsers!
|
30
|
+
@feed_classes = nil
|
38
31
|
end
|
39
|
-
end
|
40
32
|
|
41
|
-
|
42
|
-
|
43
|
-
|
33
|
+
def add_common_feed_element(element_tag, options = {})
|
34
|
+
feed_classes.each do |k|
|
35
|
+
k.element element_tag, options
|
36
|
+
end
|
44
37
|
end
|
45
|
-
end
|
46
38
|
|
47
|
-
|
48
|
-
|
49
|
-
|
39
|
+
def add_common_feed_elements(element_tag, options = {})
|
40
|
+
feed_classes.each do |k|
|
41
|
+
k.elements element_tag, options
|
42
|
+
end
|
43
|
+
end
|
50
44
|
|
51
|
-
|
52
|
-
|
53
|
-
|
45
|
+
def add_common_feed_entry_element(element_tag, options = {})
|
46
|
+
call_on_each_feed_entry :element, element_tag, options
|
47
|
+
end
|
54
48
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
49
|
+
def add_common_feed_entry_elements(element_tag, options = {})
|
50
|
+
call_on_each_feed_entry :elements, element_tag, options
|
51
|
+
end
|
52
|
+
|
53
|
+
def call_on_each_feed_entry(method, *parameters)
|
54
|
+
feed_classes.each do |klass|
|
55
|
+
klass.sax_config.collection_elements.each_value do |value|
|
56
|
+
collection_configs = value.select do |v|
|
57
|
+
v.accessor == 'entries' && v.data_class.class == Class
|
58
|
+
end
|
59
|
+
|
60
|
+
collection_configs.each do |config|
|
61
|
+
config.data_class.send(method, *parameters)
|
62
|
+
end
|
60
63
|
end
|
61
64
|
end
|
62
65
|
end
|
63
|
-
end
|
64
66
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
67
|
+
def fetch_and_parse(url)
|
68
|
+
response = connection(url).get
|
69
|
+
unless response.success?
|
70
|
+
raise FetchFailure, "Fetch failed - #{response.status}"
|
71
|
+
end
|
72
|
+
feed = parse response.body
|
73
|
+
feed.feed_url = url
|
74
|
+
feed.etag = response.headers['etag'].to_s.delete '"'
|
75
|
+
|
76
|
+
feed.last_modified = parse_last_modified(response)
|
77
|
+
feed
|
78
|
+
end
|
79
|
+
|
80
|
+
# rubocop:disable LineLength
|
81
|
+
def connection(url)
|
82
|
+
Faraday.new(url: url, headers: headers, request: request_options) do |conn|
|
83
|
+
conn.use FaradayMiddleware::FollowRedirects, limit: Feedjira.follow_redirect_limit
|
84
|
+
conn.adapter(*Faraday.default_adapter)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
# rubocop:enable LineLength
|
88
|
+
|
89
|
+
private
|
90
|
+
|
91
|
+
def headers
|
92
|
+
{
|
93
|
+
user_agent: Feedjira.user_agent
|
94
|
+
}
|
95
|
+
end
|
96
|
+
|
97
|
+
def request_options
|
98
|
+
{
|
99
|
+
timeout: Feedjira.request_timeout
|
100
|
+
}
|
101
|
+
end
|
78
102
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
103
|
+
def parse_last_modified(response)
|
104
|
+
lm = response.headers['last-modified']
|
105
|
+
DateTime.parse(lm).to_time
|
106
|
+
rescue StandardError => e
|
107
|
+
Feedjira.logger.warn { "Failed to parse last modified '#{lm}'" }
|
108
|
+
Feedjira.logger.debug(e)
|
109
|
+
nil
|
83
110
|
end
|
84
111
|
end
|
85
112
|
end
|
@@ -1,23 +1,25 @@
|
|
1
|
+
# rubocop:disable Style/Documentation
|
2
|
+
# rubocop:disable Style/DocumentationMethod
|
1
3
|
module Feedjira
|
2
4
|
module FeedEntryUtilities
|
3
|
-
|
4
5
|
include Enumerable
|
6
|
+
include DateTimeUtilities
|
5
7
|
|
6
8
|
def published
|
7
9
|
@published ||= @updated
|
8
10
|
end
|
9
11
|
|
10
12
|
def parse_datetime(string)
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
end
|
13
|
+
DateTime.parse(string).feed_utils_to_gm_time
|
14
|
+
rescue StandardError => e
|
15
|
+
Feedjira.logger.warn { "Failed to parse date #{string.inspect}" }
|
16
|
+
Feedjira.logger.debug(e)
|
17
|
+
nil
|
17
18
|
end
|
18
19
|
|
19
20
|
##
|
20
|
-
# Returns the id of the entry or its url if not id is present, as some
|
21
|
+
# Returns the id of the entry or its url if no id is present, as some
|
22
|
+
# formats don't support it
|
21
23
|
def id
|
22
24
|
@entry_id ||= @url
|
23
25
|
end
|
@@ -26,41 +28,40 @@ module Feedjira
|
|
26
28
|
# Writer for published. By default, we keep the "oldest" publish time found.
|
27
29
|
def published=(val)
|
28
30
|
parsed = parse_datetime(val)
|
29
|
-
@published = parsed if !@published || parsed < @published
|
31
|
+
@published = parsed if parsed && (!@published || parsed < @published)
|
30
32
|
end
|
31
33
|
|
32
34
|
##
|
33
35
|
# Writer for updated. By default, we keep the most recent update time found.
|
34
36
|
def updated=(val)
|
35
37
|
parsed = parse_datetime(val)
|
36
|
-
@updated = parsed if !@updated || parsed > @updated
|
38
|
+
@updated = parsed if parsed && (!@updated || parsed > @updated)
|
37
39
|
end
|
38
40
|
|
39
41
|
def sanitize!
|
40
|
-
%w
|
41
|
-
if
|
42
|
-
|
42
|
+
%w(title author summary content image).each do |name|
|
43
|
+
if respond_to?(name) && send(name).respond_to?(:sanitize!)
|
44
|
+
send(name).send :sanitize!
|
43
45
|
end
|
44
46
|
end
|
45
47
|
end
|
46
48
|
|
47
|
-
|
49
|
+
alias last_modified published
|
48
50
|
|
49
51
|
def each
|
50
|
-
@rss_fields ||=
|
52
|
+
@rss_fields ||= instance_variables
|
51
53
|
|
52
54
|
@rss_fields.each do |field|
|
53
|
-
yield(field.to_s.sub('@', ''),
|
55
|
+
yield(field.to_s.sub('@', ''), instance_variable_get(field))
|
54
56
|
end
|
55
57
|
end
|
56
58
|
|
57
59
|
def [](field)
|
58
|
-
|
60
|
+
instance_variable_get("@#{field}")
|
59
61
|
end
|
60
62
|
|
61
63
|
def []=(field, value)
|
62
|
-
|
64
|
+
instance_variable_set("@#{field}", value)
|
63
65
|
end
|
64
|
-
|
65
66
|
end
|
66
67
|
end
|
@@ -1,6 +1,8 @@
|
|
1
|
+
# rubocop:disable Style/Documentation
|
2
|
+
# rubocop:disable Style/DocumentationMethod
|
1
3
|
module Feedjira
|
2
4
|
module FeedUtilities
|
3
|
-
UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified etag)
|
5
|
+
UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified etag).freeze
|
4
6
|
|
5
7
|
attr_writer :new_entries, :updated, :last_modified
|
6
8
|
attr_accessor :etag
|
@@ -11,7 +13,7 @@ module Feedjira
|
|
11
13
|
|
12
14
|
module ClassMethods
|
13
15
|
def parse(xml, &block)
|
14
|
-
xml = xml
|
16
|
+
xml = strip_whitespace(xml)
|
15
17
|
xml = preprocess(xml) if preprocess_xml
|
16
18
|
super xml, &block
|
17
19
|
end
|
@@ -28,11 +30,20 @@ module Feedjira
|
|
28
30
|
def preprocess_xml
|
29
31
|
@preprocess_xml
|
30
32
|
end
|
33
|
+
|
34
|
+
def strip_whitespace(xml)
|
35
|
+
if Feedjira.strip_whitespace
|
36
|
+
xml.strip
|
37
|
+
else
|
38
|
+
xml.lstrip
|
39
|
+
end
|
40
|
+
end
|
31
41
|
end
|
32
42
|
|
33
43
|
def last_modified
|
34
44
|
@last_modified ||= begin
|
35
|
-
|
45
|
+
published = entries.reject { |e| e.published.nil? }
|
46
|
+
entry = published.sort_by { |e| e.published if e.published }.last
|
36
47
|
entry ? entry.published : nil
|
37
48
|
end
|
38
49
|
end
|
@@ -45,13 +56,13 @@ module Feedjira
|
|
45
56
|
@new_entries ||= []
|
46
57
|
end
|
47
58
|
|
48
|
-
def
|
49
|
-
new_entries.
|
59
|
+
def new_entries?
|
60
|
+
!new_entries.empty?
|
50
61
|
end
|
51
62
|
|
52
63
|
def update_from_feed(feed)
|
53
64
|
self.new_entries += find_new_entries_for(feed)
|
54
|
-
|
65
|
+
entries.unshift(*self.new_entries)
|
55
66
|
|
56
67
|
@updated = false
|
57
68
|
|
@@ -61,7 +72,8 @@ module Feedjira
|
|
61
72
|
end
|
62
73
|
|
63
74
|
def update_attribute(feed, name)
|
64
|
-
old_value
|
75
|
+
old_value = send(name)
|
76
|
+
new_value = feed.send(name)
|
65
77
|
|
66
78
|
if old_value != new_value
|
67
79
|
send("#{name}=", new_value)
|
@@ -72,33 +84,36 @@ module Feedjira
|
|
72
84
|
end
|
73
85
|
|
74
86
|
def sanitize_entries!
|
75
|
-
entries.each
|
87
|
+
entries.each(&:sanitize!)
|
76
88
|
end
|
77
89
|
|
78
90
|
private
|
79
91
|
|
92
|
+
# This implementation is a hack, which is why it's so ugly. It's to get
|
93
|
+
# around the fact that not all feeds have a published date. However,
|
94
|
+
# they're always ordered with the newest one first. So we go through the
|
95
|
+
# entries just parsed and insert each one as a new entry until we get to
|
96
|
+
# one that has the same id as the newest for the feed.
|
80
97
|
def find_new_entries_for(feed)
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
# So we go through the entries just parsed and insert each one as a new entry
|
85
|
-
# until we get to one that has the same id as the the newest for the feed
|
86
|
-
return feed.entries if self.entries.length == 0
|
87
|
-
latest_entry = self.entries.first
|
98
|
+
return feed.entries if entries.length.zero?
|
99
|
+
|
100
|
+
latest_entry = entries.first
|
88
101
|
found_new_entries = []
|
102
|
+
|
89
103
|
feed.entries.each do |entry|
|
90
|
-
|
91
|
-
break if entry.url == latest_entry.url
|
92
|
-
else
|
93
|
-
break if entry.entry_id == latest_entry.entry_id || entry.url == latest_entry.url
|
94
|
-
end
|
104
|
+
break unless new_entry?(entry, latest_entry)
|
95
105
|
found_new_entries << entry
|
96
106
|
end
|
107
|
+
|
97
108
|
found_new_entries
|
98
109
|
end
|
99
110
|
|
100
|
-
def
|
101
|
-
|
111
|
+
def new_entry?(entry, latest)
|
112
|
+
nil_ids = entry.entry_id.nil? && latest.entry_id.nil?
|
113
|
+
new_id = entry.entry_id != latest.entry_id
|
114
|
+
new_url = entry.url != latest.url
|
115
|
+
|
116
|
+
(nil_ids || new_id) && new_url
|
102
117
|
end
|
103
118
|
end
|
104
119
|
end
|
data/lib/feedjira/parser/atom.rb
CHANGED
@@ -1,19 +1,21 @@
|
|
1
|
+
# rubocop:disable Style/DocumentationMethod
|
1
2
|
module Feedjira
|
2
3
|
module Parser
|
3
4
|
# Parser for dealing with Atom feeds.
|
4
5
|
class Atom
|
5
6
|
include SAXMachine
|
6
7
|
include FeedUtilities
|
8
|
+
|
7
9
|
element :title
|
8
|
-
element :subtitle, :
|
9
|
-
element :link, :
|
10
|
-
element :link, :
|
11
|
-
elements :link, :
|
12
|
-
elements :link, :
|
13
|
-
elements :entry, :
|
10
|
+
element :subtitle, as: :description
|
11
|
+
element :link, as: :url, value: :href, with: { type: 'text/html' }
|
12
|
+
element :link, as: :feed_url, value: :href, with: { rel: 'self' }
|
13
|
+
elements :link, as: :links, value: :href
|
14
|
+
elements :link, as: :hubs, value: :href, with: { rel: 'hub' }
|
15
|
+
elements :entry, as: :entries, class: AtomEntry
|
14
16
|
|
15
|
-
def self.able_to_parse?(xml)
|
16
|
-
|
17
|
+
def self.able_to_parse?(xml)
|
18
|
+
%r{\<feed[^\>]+xmlns\s?=\s?[\"\'](http://www\.w3\.org/2005/Atom|http://purl\.org/atom/ns\#)[\"\'][^\>]*\>} =~ xml # rubocop:disable Metrics/LineLength
|
17
19
|
end
|
18
20
|
|
19
21
|
def url
|
@@ -1,5 +1,5 @@
|
|
1
|
+
# rubocop:disable Style/DocumentationMethod
|
1
2
|
module Feedjira
|
2
|
-
|
3
3
|
module Parser
|
4
4
|
# Parser for dealing with Atom feed entries.
|
5
5
|
class AtomEntry
|
@@ -7,28 +7,26 @@ module Feedjira
|
|
7
7
|
include FeedEntryUtilities
|
8
8
|
|
9
9
|
element :title
|
10
|
-
element :link, :
|
11
|
-
element :name, :
|
10
|
+
element :link, as: :url, value: :href, with: { type: 'text/html', rel: 'alternate' } # rubocop:disable Metrics/LineLength
|
11
|
+
element :name, as: :author
|
12
12
|
element :content
|
13
13
|
element :summary
|
14
14
|
|
15
|
-
element :"media:content", :
|
16
|
-
element :enclosure, :
|
15
|
+
element :"media:content", as: :image, value: :url
|
16
|
+
element :enclosure, as: :image, value: :href
|
17
17
|
|
18
18
|
element :published
|
19
|
-
element :id, :
|
20
|
-
element :created, :
|
21
|
-
element :issued, :
|
19
|
+
element :id, as: :entry_id
|
20
|
+
element :created, as: :published
|
21
|
+
element :issued, as: :published
|
22
22
|
element :updated
|
23
|
-
element :modified, :
|
24
|
-
elements :category, :
|
25
|
-
elements :link, :
|
23
|
+
element :modified, as: :updated
|
24
|
+
elements :category, as: :categories, value: :term
|
25
|
+
elements :link, as: :links, value: :href
|
26
26
|
|
27
27
|
def url
|
28
28
|
@url ||= links.first
|
29
29
|
end
|
30
30
|
end
|
31
|
-
|
32
31
|
end
|
33
|
-
|
34
32
|
end
|