feedjira 2.1.0 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.rubocop.yml +9 -2
- data/CHANGELOG.md +4 -0
- data/LICENSE +1 -1
- data/README.md +210 -7
- data/Rakefile +5 -0
- data/feedjira.gemspec +2 -1
- data/lib/feedjira.rb +7 -1
- data/lib/feedjira/configuration.rb +76 -0
- data/lib/feedjira/core_ext/date.rb +1 -0
- data/lib/feedjira/core_ext/string.rb +1 -0
- data/lib/feedjira/core_ext/time.rb +5 -1
- data/lib/feedjira/date_time_utilities.rb +11 -3
- data/lib/feedjira/date_time_utilities/date_time_epoch_parser.rb +13 -0
- data/lib/feedjira/date_time_utilities/date_time_language_parser.rb +2 -0
- data/lib/feedjira/date_time_utilities/date_time_pattern_parser.rb +6 -1
- data/lib/feedjira/feed.rb +87 -69
- data/lib/feedjira/feed_entry_utilities.rb +5 -2
- data/lib/feedjira/feed_utilities.rb +11 -1
- data/lib/feedjira/parser.rb +1 -1
- data/lib/feedjira/parser/atom.rb +1 -0
- data/lib/feedjira/parser/atom_entry.rb +1 -0
- data/lib/feedjira/parser/atom_feed_burner.rb +19 -2
- data/lib/feedjira/parser/atom_feed_burner_entry.rb +1 -0
- data/lib/feedjira/parser/atom_youtube.rb +1 -0
- data/lib/feedjira/parser/atom_youtube_entry.rb +1 -0
- data/lib/feedjira/parser/google_docs_atom.rb +2 -1
- data/lib/feedjira/parser/google_docs_atom_entry.rb +2 -0
- data/lib/feedjira/parser/itunes_rss.rb +1 -0
- data/lib/feedjira/parser/itunes_rss_category.rb +1 -0
- data/lib/feedjira/parser/itunes_rss_owner.rb +1 -0
- data/lib/feedjira/parser/podlove_chapter.rb +2 -0
- data/lib/feedjira/parser/rss.rb +1 -0
- data/lib/feedjira/parser/rss_feed_burner.rb +1 -0
- data/lib/feedjira/parser/rss_feed_burner_entry.rb +1 -0
- data/lib/feedjira/preprocessor.rb +2 -0
- data/lib/feedjira/version.rb +1 -1
- data/spec/feedjira/configuration_spec.rb +25 -0
- data/spec/feedjira/date_time_utilities_spec.rb +6 -0
- data/spec/feedjira/feed_spec.rb +20 -2
- data/spec/feedjira/feed_utilities_spec.rb +18 -0
- data/spec/feedjira/parser/atom_feed_burner_spec.rb +32 -1
- data/spec/sample_feeds.rb +1 -0
- data/spec/sample_feeds/GiantRobotsSmashingIntoOtherGiantRobots.xml +682 -0
- metadata +49 -29
@@ -1,3 +1,4 @@
|
|
1
|
+
# rubocop:disable Style/Documentation
|
1
2
|
module Feedjira
|
2
3
|
module DateTimeUtilities
|
3
4
|
# This is our date parsing heuristic.
|
@@ -5,20 +6,27 @@ module Feedjira
|
|
5
6
|
DATE_PARSERS = [
|
6
7
|
DateTimePatternParser,
|
7
8
|
DateTimeLanguageParser,
|
9
|
+
DateTimeEpochParser,
|
8
10
|
DateTime
|
9
11
|
].freeze
|
10
12
|
|
11
13
|
# Parse the given string starting with the most common parser (default ruby)
|
12
14
|
# and going over all other available parsers
|
15
|
+
# rubocop:disable Metrics/MethodLength
|
13
16
|
def parse_datetime(string)
|
14
|
-
DATE_PARSERS.find do |parser|
|
17
|
+
res = DATE_PARSERS.find do |parser|
|
15
18
|
begin
|
16
19
|
return parser.parse(string).feed_utils_to_gm_time
|
17
|
-
rescue
|
20
|
+
rescue StandardError => e
|
21
|
+
Feedjira.logger.debug { "Failed to parse date #{string}" }
|
22
|
+
Feedjira.logger.debug(e)
|
18
23
|
nil
|
19
24
|
end
|
20
25
|
end
|
21
|
-
|
26
|
+
|
27
|
+
Feedjira.logger.warn { "Failed to parse date #{string}" } if res.nil?
|
28
|
+
|
29
|
+
res
|
22
30
|
end
|
23
31
|
end
|
24
32
|
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# rubocop:disable Style/Documentation
|
2
|
+
# rubocop:disable Style/DocumentationMethod
|
3
|
+
module Feedjira
|
4
|
+
module DateTimeUtilities
|
5
|
+
class DateTimeEpochParser
|
6
|
+
def self.parse(string)
|
7
|
+
epoch_time = string.to_i
|
8
|
+
return Time.at(epoch_time).to_datetime if epoch_time.to_s == string
|
9
|
+
raise "#{string} is not a valid epoch time"
|
10
|
+
end
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# rubocop:disable Style/Documentation
|
2
|
+
# rubocop:disable Style/DocumentationMethod
|
1
3
|
module Feedjira
|
2
4
|
module DateTimeUtilities
|
3
5
|
class DateTimePatternParser
|
@@ -7,12 +9,15 @@ module Feedjira
|
|
7
9
|
JAPANESE_SYMBOLS = %w(日 月 火 水 木 金 土).freeze
|
8
10
|
PATTERNS = ['%m/%d/%Y %T %p', '%d %m %Y %T %Z'].freeze
|
9
11
|
|
12
|
+
# rubocop:disable Metrics/MethodLength
|
10
13
|
def self.parse(string)
|
11
14
|
PATTERNS.each do |p|
|
12
15
|
begin
|
13
16
|
datetime = DateTime.strptime(prepare(string), p)
|
14
17
|
return datetime
|
15
|
-
rescue
|
18
|
+
rescue StandardError => e
|
19
|
+
Feedjira.logger.debug("Failed to parse date #{string}")
|
20
|
+
Feedjira.logger.debug(e)
|
16
21
|
nil
|
17
22
|
end
|
18
23
|
end
|
data/lib/feedjira/feed.rb
CHANGED
@@ -1,95 +1,113 @@
|
|
1
|
+
# rubocop:disable Style/Documentation
|
2
|
+
# rubocop:disable Style/DocumentationMethod
|
1
3
|
module Feedjira
|
2
4
|
class Feed
|
3
|
-
|
4
|
-
parser
|
5
|
-
|
5
|
+
class << self
|
6
|
+
def parse_with(parser, xml, &block)
|
7
|
+
parser.parse xml, &block
|
8
|
+
end
|
6
9
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
10
|
+
def parse(xml, &block)
|
11
|
+
parser = determine_feed_parser_for_xml(xml)
|
12
|
+
raise NoParserAvailable, 'No valid parser for XML.' unless parser
|
13
|
+
parse_with parser, xml, &block
|
14
|
+
end
|
12
15
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
16
|
+
def determine_feed_parser_for_xml(xml)
|
17
|
+
start_of_doc = xml.slice(0, 2000)
|
18
|
+
feed_classes.detect { |klass| klass.able_to_parse?(start_of_doc) }
|
19
|
+
end
|
17
20
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
+
def add_feed_class(klass)
|
22
|
+
feed_classes.unshift klass
|
23
|
+
end
|
21
24
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
Feedjira::Parser::GoogleDocsAtom,
|
26
|
-
Feedjira::Parser::AtomYoutube,
|
27
|
-
Feedjira::Parser::AtomFeedBurner,
|
28
|
-
Feedjira::Parser::Atom,
|
29
|
-
Feedjira::Parser::ITunesRSS,
|
30
|
-
Feedjira::Parser::RSS
|
31
|
-
]
|
32
|
-
end
|
25
|
+
def feed_classes
|
26
|
+
@feed_classes ||= Feedjira.parsers
|
27
|
+
end
|
33
28
|
|
34
|
-
|
35
|
-
|
36
|
-
k.element element_tag, options
|
29
|
+
def reset_parsers!
|
30
|
+
@feed_classes = nil
|
37
31
|
end
|
38
|
-
end
|
39
32
|
|
40
|
-
|
41
|
-
|
42
|
-
|
33
|
+
def add_common_feed_element(element_tag, options = {})
|
34
|
+
feed_classes.each do |k|
|
35
|
+
k.element element_tag, options
|
36
|
+
end
|
43
37
|
end
|
44
|
-
end
|
45
38
|
|
46
|
-
|
47
|
-
|
48
|
-
|
39
|
+
def add_common_feed_elements(element_tag, options = {})
|
40
|
+
feed_classes.each do |k|
|
41
|
+
k.elements element_tag, options
|
42
|
+
end
|
43
|
+
end
|
49
44
|
|
50
|
-
|
51
|
-
|
52
|
-
|
45
|
+
def add_common_feed_entry_element(element_tag, options = {})
|
46
|
+
call_on_each_feed_entry :element, element_tag, options
|
47
|
+
end
|
53
48
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
collection_configs = value.select do |v|
|
58
|
-
v.accessor == 'entries' && v.data_class.class == Class
|
59
|
-
end
|
49
|
+
def add_common_feed_entry_elements(element_tag, options = {})
|
50
|
+
call_on_each_feed_entry :elements, element_tag, options
|
51
|
+
end
|
60
52
|
|
61
|
-
|
62
|
-
|
53
|
+
def call_on_each_feed_entry(method, *parameters)
|
54
|
+
feed_classes.each do |klass|
|
55
|
+
klass.sax_config.collection_elements.each_value do |value|
|
56
|
+
collection_configs = value.select do |v|
|
57
|
+
v.accessor == 'entries' && v.data_class.class == Class
|
58
|
+
end
|
59
|
+
|
60
|
+
collection_configs.each do |config|
|
61
|
+
config.data_class.send(method, *parameters)
|
62
|
+
end
|
63
63
|
end
|
64
64
|
end
|
65
65
|
end
|
66
|
-
end
|
67
66
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
67
|
+
def fetch_and_parse(url)
|
68
|
+
response = connection(url).get
|
69
|
+
unless response.success?
|
70
|
+
raise FetchFailure, "Fetch failed - #{response.status}"
|
71
|
+
end
|
72
|
+
feed = parse response.body
|
73
|
+
feed.feed_url = url
|
74
|
+
feed.etag = response.headers['etag'].to_s.delete '"'
|
75
|
+
|
76
|
+
feed.last_modified = parse_last_modified(response)
|
77
|
+
feed
|
72
78
|
end
|
73
|
-
feed = parse response.body
|
74
|
-
feed.feed_url = url
|
75
|
-
feed.etag = response.headers['etag'].to_s.delete '"'
|
76
79
|
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
+
# rubocop:disable LineLength
|
81
|
+
def connection(url)
|
82
|
+
Faraday.new(url: url, headers: headers, request: request_options) do |conn|
|
83
|
+
conn.use FaradayMiddleware::FollowRedirects, limit: Feedjira.follow_redirect_limit
|
84
|
+
conn.adapter :net_http
|
85
|
+
end
|
86
|
+
end
|
87
|
+
# rubocop:enable LineLength
|
88
|
+
|
89
|
+
private
|
80
90
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
91
|
+
def headers
|
92
|
+
{
|
93
|
+
user_agent: Feedjira.user_agent
|
94
|
+
}
|
95
|
+
end
|
96
|
+
|
97
|
+
def request_options
|
98
|
+
{
|
99
|
+
timeout: Feedjira.request_timeout
|
100
|
+
}
|
85
101
|
end
|
86
|
-
end
|
87
102
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
103
|
+
def parse_last_modified(response)
|
104
|
+
lm = response.headers['last-modified']
|
105
|
+
DateTime.parse(lm).to_time
|
106
|
+
rescue StandardError => e
|
107
|
+
Feedjira.logger.warn { "Failed to parse last modified '#{lm}'" }
|
108
|
+
Feedjira.logger.debug(e)
|
109
|
+
nil
|
110
|
+
end
|
92
111
|
end
|
93
|
-
private_class_method :parse_last_modified
|
94
112
|
end
|
95
113
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# rubocop:disable Style/Documentation
|
2
|
+
# rubocop:disable Style/DocumentationMethod
|
1
3
|
module Feedjira
|
2
4
|
module FeedEntryUtilities
|
3
5
|
include Enumerable
|
@@ -9,8 +11,9 @@ module Feedjira
|
|
9
11
|
|
10
12
|
def parse_datetime(string)
|
11
13
|
DateTime.parse(string).feed_utils_to_gm_time
|
12
|
-
rescue
|
13
|
-
warn "Failed to parse date #{string.inspect}"
|
14
|
+
rescue StandardError => e
|
15
|
+
Feedjira.logger.warn { "Failed to parse date #{string.inspect}" }
|
16
|
+
Feedjira.logger.warn(e)
|
14
17
|
nil
|
15
18
|
end
|
16
19
|
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# rubocop:disable Style/Documentation
|
2
|
+
# rubocop:disable Style/DocumentationMethod
|
1
3
|
module Feedjira
|
2
4
|
module FeedUtilities
|
3
5
|
UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified etag).freeze
|
@@ -11,7 +13,7 @@ module Feedjira
|
|
11
13
|
|
12
14
|
module ClassMethods
|
13
15
|
def parse(xml, &block)
|
14
|
-
xml = xml
|
16
|
+
xml = strip_whitespace(xml)
|
15
17
|
xml = preprocess(xml) if preprocess_xml
|
16
18
|
super xml, &block
|
17
19
|
end
|
@@ -28,6 +30,14 @@ module Feedjira
|
|
28
30
|
def preprocess_xml
|
29
31
|
@preprocess_xml
|
30
32
|
end
|
33
|
+
|
34
|
+
def strip_whitespace(xml)
|
35
|
+
if Feedjira.strip_whitespace
|
36
|
+
xml.strip
|
37
|
+
else
|
38
|
+
xml.lstrip
|
39
|
+
end
|
40
|
+
end
|
31
41
|
end
|
32
42
|
|
33
43
|
def last_modified
|
data/lib/feedjira/parser.rb
CHANGED
@@ -1 +1 @@
|
|
1
|
-
module Feedjira::Parser; end
|
1
|
+
module Feedjira::Parser; end # rubocop:disable Style/Documentation
|
data/lib/feedjira/parser/atom.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# rubocop:disable Style/DocumentationMethod
|
1
2
|
module Feedjira
|
2
3
|
module Parser
|
3
4
|
# Parser for dealing with Feedburner Atom feeds.
|
@@ -7,8 +8,12 @@ module Feedjira
|
|
7
8
|
|
8
9
|
element :title
|
9
10
|
element :subtitle, as: :description
|
10
|
-
element :link, as: :
|
11
|
-
|
11
|
+
element :link, as: :url_text_html, value: :href,
|
12
|
+
with: { type: 'text/html' }
|
13
|
+
element :link, as: :url_notype, value: :href, with: { type: nil }
|
14
|
+
element :link, as: :feed_url_link, value: :href, with: { type: 'application/atom+xml' } # rubocop:disable Metrics/LineLength
|
15
|
+
element :"atom10:link", as: :feed_url_atom10_link, value: :href,
|
16
|
+
with: { type: 'application/atom+xml' }
|
12
17
|
elements :"atom10:link", as: :hubs, value: :href, with: { rel: 'hub' }
|
13
18
|
elements :entry, as: :entries, class: AtomFeedBurnerEntry
|
14
19
|
|
@@ -16,6 +21,18 @@ module Feedjira
|
|
16
21
|
((/Atom/ =~ xml) && (/feedburner/ =~ xml) && !(/\<rss|\<rdf/ =~ xml)) || false # rubocop:disable Metrics/LineLength
|
17
22
|
end
|
18
23
|
|
24
|
+
# Feed url is <link> with type="text/html" if present,
|
25
|
+
# <link> with no type attribute otherwise
|
26
|
+
def url
|
27
|
+
@url_text_html || url_notype
|
28
|
+
end
|
29
|
+
|
30
|
+
# Feed feed_url is <link> with type="application/atom+xml" if present,
|
31
|
+
# <atom10:link> with type="application/atom+xml" otherwise
|
32
|
+
def feed_url
|
33
|
+
@feed_url_link || feed_url_atom10_link
|
34
|
+
end
|
35
|
+
|
19
36
|
def self.preprocess(xml)
|
20
37
|
Preprocessor.new(xml).to_xml
|
21
38
|
end
|
data/lib/feedjira/parser/rss.rb
CHANGED