feedjira 2.2.0 → 3.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (91) hide show
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE/feed-parsing.md +15 -0
  3. data/.rubocop.yml +32 -8
  4. data/.rubocop_todo.yml +11 -0
  5. data/.travis.yml +3 -7
  6. data/CHANGELOG.md +18 -9
  7. data/CODE_OF_CONDUCT.md +74 -0
  8. data/Gemfile +8 -5
  9. data/README.md +46 -99
  10. data/Rakefile +8 -6
  11. data/feedjira.gemspec +31 -20
  12. data/lib/feedjira.rb +75 -41
  13. data/lib/feedjira/atom_entry_utilities.rb +51 -0
  14. data/lib/feedjira/configuration.rb +8 -10
  15. data/lib/feedjira/core_ext.rb +5 -3
  16. data/lib/feedjira/core_ext/date.rb +2 -1
  17. data/lib/feedjira/core_ext/string.rb +2 -1
  18. data/lib/feedjira/core_ext/time.rb +12 -12
  19. data/lib/feedjira/date_time_utilities.rb +8 -10
  20. data/lib/feedjira/date_time_utilities/date_time_epoch_parser.rb +3 -2
  21. data/lib/feedjira/date_time_utilities/date_time_language_parser.rb +4 -4
  22. data/lib/feedjira/date_time_utilities/date_time_pattern_parser.rb +11 -15
  23. data/lib/feedjira/feed.rb +12 -82
  24. data/lib/feedjira/feed_entry_utilities.rb +14 -7
  25. data/lib/feedjira/feed_utilities.rb +5 -4
  26. data/lib/feedjira/parser.rb +6 -1
  27. data/lib/feedjira/parser/atom.rb +6 -5
  28. data/lib/feedjira/parser/atom_entry.rb +4 -21
  29. data/lib/feedjira/parser/atom_feed_burner.rb +7 -6
  30. data/lib/feedjira/parser/atom_feed_burner_entry.rb +7 -18
  31. data/lib/feedjira/parser/atom_google_alerts.rb +26 -0
  32. data/lib/feedjira/parser/atom_google_alerts_entry.rb +21 -0
  33. data/lib/feedjira/parser/atom_youtube.rb +4 -3
  34. data/lib/feedjira/parser/atom_youtube_entry.rb +9 -8
  35. data/lib/feedjira/parser/globally_unique_identifier.rb +21 -0
  36. data/lib/feedjira/parser/google_docs_atom.rb +6 -6
  37. data/lib/feedjira/parser/google_docs_atom_entry.rb +3 -19
  38. data/lib/feedjira/parser/itunes_rss.rb +4 -3
  39. data/lib/feedjira/parser/itunes_rss_category.rb +6 -5
  40. data/lib/feedjira/parser/itunes_rss_item.rb +5 -8
  41. data/lib/feedjira/parser/itunes_rss_owner.rb +2 -1
  42. data/lib/feedjira/parser/json_feed.rb +41 -0
  43. data/lib/feedjira/parser/json_feed_item.rb +57 -0
  44. data/lib/feedjira/parser/podlove_chapter.rb +4 -3
  45. data/lib/feedjira/parser/rss.rb +5 -3
  46. data/lib/feedjira/parser/rss_entry.rb +3 -24
  47. data/lib/feedjira/parser/rss_feed_burner.rb +4 -3
  48. data/lib/feedjira/parser/rss_feed_burner_entry.rb +6 -26
  49. data/lib/feedjira/parser/rss_image.rb +2 -0
  50. data/lib/feedjira/preprocessor.rb +4 -4
  51. data/lib/feedjira/rss_entry_utilities.rb +53 -0
  52. data/lib/feedjira/version.rb +3 -1
  53. data/spec/feedjira/configuration_spec.rb +11 -16
  54. data/spec/feedjira/date_time_utilities_spec.rb +22 -20
  55. data/spec/feedjira/feed_entry_utilities_spec.rb +20 -18
  56. data/spec/feedjira/feed_spec.rb +17 -229
  57. data/spec/feedjira/feed_utilities_spec.rb +75 -73
  58. data/spec/feedjira/parser/atom_entry_spec.rb +41 -38
  59. data/spec/feedjira/parser/atom_feed_burner_entry_spec.rb +22 -20
  60. data/spec/feedjira/parser/atom_feed_burner_spec.rb +122 -118
  61. data/spec/feedjira/parser/atom_google_alerts_entry_spec.rb +34 -0
  62. data/spec/feedjira/parser/atom_google_alerts_spec.rb +62 -0
  63. data/spec/feedjira/parser/atom_spec.rb +83 -77
  64. data/spec/feedjira/parser/atom_youtube_entry_spec.rb +41 -39
  65. data/spec/feedjira/parser/atom_youtube_spec.rb +21 -19
  66. data/spec/feedjira/parser/google_docs_atom_entry_spec.rb +10 -8
  67. data/spec/feedjira/parser/google_docs_atom_spec.rb +25 -21
  68. data/spec/feedjira/parser/itunes_rss_item_spec.rb +39 -37
  69. data/spec/feedjira/parser/itunes_rss_owner_spec.rb +7 -5
  70. data/spec/feedjira/parser/itunes_rss_spec.rb +120 -116
  71. data/spec/feedjira/parser/json_feed_item_spec.rb +81 -0
  72. data/spec/feedjira/parser/json_feed_spec.rb +55 -0
  73. data/spec/feedjira/parser/podlove_chapter_spec.rb +14 -12
  74. data/spec/feedjira/parser/rss_entry_spec.rb +56 -34
  75. data/spec/feedjira/parser/rss_feed_burner_entry_spec.rb +36 -34
  76. data/spec/feedjira/parser/rss_feed_burner_spec.rb +49 -45
  77. data/spec/feedjira/parser/rss_spec.rb +38 -36
  78. data/spec/feedjira/preprocessor_spec.rb +9 -7
  79. data/spec/feedjira_spec.rb +166 -0
  80. data/spec/sample_feeds.rb +32 -29
  81. data/spec/sample_feeds/HuffPostCanada.xml +279 -0
  82. data/spec/sample_feeds/Permalinks.xml +22 -0
  83. data/spec/sample_feeds/a10.xml +72 -0
  84. data/spec/sample_feeds/google_alerts_atom.xml +1 -0
  85. data/spec/sample_feeds/json_feed.json +156 -0
  86. data/spec/spec_helper.rb +7 -5
  87. metadata +59 -70
  88. data/Dangerfile +0 -1
  89. data/fixtures/vcr_cassettes/fetch_failure.yml +0 -62
  90. data/fixtures/vcr_cassettes/parse_error.yml +0 -222
  91. data/fixtures/vcr_cassettes/success.yml +0 -281
@@ -1,32 +1,28 @@
1
- # rubocop:disable Style/Documentation
2
- # rubocop:disable Style/DocumentationMethod
1
+ # frozen_string_literal: true
2
+
3
3
  module Feedjira
4
4
  module DateTimeUtilities
5
5
  class DateTimePatternParser
6
- # rubocop:disable Style/AsciiComments
7
6
  # Japanese Symbols are required for strange Date Strings like
8
7
  # '水, 31 8 2016 07:37:00 PDT'
9
- JAPANESE_SYMBOLS = %w(日 月 火 水 木 金 土).freeze
10
- PATTERNS = ['%m/%d/%Y %T %p', '%d %m %Y %T %Z'].freeze
8
+ JAPANESE_SYMBOLS = %w[日 月 火 水 木 金 土].freeze
9
+ PATTERNS = ["%m/%d/%Y %T %p", "%d %m %Y %T %Z"].freeze
11
10
 
12
- # rubocop:disable Metrics/MethodLength
13
11
  def self.parse(string)
14
12
  PATTERNS.each do |p|
15
- begin
16
- datetime = DateTime.strptime(prepare(string), p)
17
- return datetime
18
- rescue StandardError => e
19
- Feedjira.logger.debug("Failed to parse date #{string}")
20
- Feedjira.logger.debug(e)
21
- nil
22
- end
13
+ datetime = DateTime.strptime(prepare(string), p)
14
+ return datetime
15
+ rescue StandardError => e
16
+ Feedjira.logger.debug("Failed to parse date #{string}")
17
+ Feedjira.logger.debug(e)
18
+ nil
23
19
  end
24
20
  raise "No pattern matched #{string}"
25
21
  end
26
22
 
27
23
  def self.prepare(string)
28
24
  rgx = Regexp.new("^(#{JAPANESE_SYMBOLS.join('|')}),\s")
29
- string.gsub(rgx, '')
25
+ string.gsub(rgx, "")
30
26
  end
31
27
  private_class_method :prepare
32
28
  end
@@ -1,60 +1,35 @@
1
- # rubocop:disable Style/Documentation
2
- # rubocop:disable Style/DocumentationMethod
1
+ # frozen_string_literal: true
2
+
3
3
  module Feedjira
4
4
  class Feed
5
5
  class << self
6
- def parse_with(parser, xml, &block)
7
- parser.parse xml, &block
8
- end
9
-
10
- def parse(xml, &block)
11
- parser = determine_feed_parser_for_xml(xml)
12
- raise NoParserAvailable, 'No valid parser for XML.' unless parser
13
- parse_with parser, xml, &block
14
- end
15
-
16
- def determine_feed_parser_for_xml(xml)
17
- start_of_doc = xml.slice(0, 2000)
18
- feed_classes.detect { |klass| klass.able_to_parse?(start_of_doc) }
19
- end
20
-
21
- def add_feed_class(klass)
22
- feed_classes.unshift klass
23
- end
24
-
25
- def feed_classes
26
- @feed_classes ||= Feedjira.parsers
27
- end
28
-
29
- def reset_parsers!
30
- @feed_classes = nil
31
- end
32
-
33
6
  def add_common_feed_element(element_tag, options = {})
34
- feed_classes.each do |k|
35
- k.element element_tag, options
7
+ Feedjira.parsers.each do |k|
8
+ k.element(element_tag, options)
36
9
  end
37
10
  end
38
11
 
39
12
  def add_common_feed_elements(element_tag, options = {})
40
- feed_classes.each do |k|
41
- k.elements element_tag, options
13
+ Feedjira.parsers.each do |k|
14
+ k.elements(element_tag, options)
42
15
  end
43
16
  end
44
17
 
45
18
  def add_common_feed_entry_element(element_tag, options = {})
46
- call_on_each_feed_entry :element, element_tag, options
19
+ call_on_each_feed_entry(:element, element_tag, options)
47
20
  end
48
21
 
49
22
  def add_common_feed_entry_elements(element_tag, options = {})
50
- call_on_each_feed_entry :elements, element_tag, options
23
+ call_on_each_feed_entry(:elements, element_tag, options)
51
24
  end
52
25
 
26
+ private
27
+
53
28
  def call_on_each_feed_entry(method, *parameters)
54
- feed_classes.each do |klass|
29
+ Feedjira.parsers.each do |klass|
55
30
  klass.sax_config.collection_elements.each_value do |value|
56
31
  collection_configs = value.select do |v|
57
- v.accessor == 'entries' && v.data_class.class == Class
32
+ v.accessor == "entries" && v.data_class.class == Class
58
33
  end
59
34
 
60
35
  collection_configs.each do |config|
@@ -63,51 +38,6 @@ module Feedjira
63
38
  end
64
39
  end
65
40
  end
66
-
67
- def fetch_and_parse(url)
68
- response = connection(url).get
69
- unless response.success?
70
- raise FetchFailure, "Fetch failed - #{response.status}"
71
- end
72
- feed = parse response.body
73
- feed.feed_url = url
74
- feed.etag = response.headers['etag'].to_s.delete '"'
75
-
76
- feed.last_modified = parse_last_modified(response)
77
- feed
78
- end
79
-
80
- # rubocop:disable LineLength
81
- def connection(url)
82
- Faraday.new(url: url, headers: headers, request: request_options) do |conn|
83
- conn.use FaradayMiddleware::FollowRedirects, limit: Feedjira.follow_redirect_limit
84
- conn.adapter(*Faraday.default_adapter)
85
- end
86
- end
87
- # rubocop:enable LineLength
88
-
89
- private
90
-
91
- def headers
92
- {
93
- user_agent: Feedjira.user_agent
94
- }
95
- end
96
-
97
- def request_options
98
- {
99
- timeout: Feedjira.request_timeout
100
- }
101
- end
102
-
103
- def parse_last_modified(response)
104
- lm = response.headers['last-modified']
105
- DateTime.parse(lm).to_time
106
- rescue StandardError => e
107
- Feedjira.logger.warn { "Failed to parse last modified '#{lm}'" }
108
- Feedjira.logger.debug(e)
109
- nil
110
- end
111
41
  end
112
42
  end
113
43
  end
@@ -1,5 +1,5 @@
1
- # rubocop:disable Style/Documentation
2
- # rubocop:disable Style/DocumentationMethod
1
+ # frozen_string_literal: true
2
+
3
3
  module Feedjira
4
4
  module FeedEntryUtilities
5
5
  include Enumerable
@@ -13,16 +13,18 @@ module Feedjira
13
13
  DateTime.parse(string).feed_utils_to_gm_time
14
14
  rescue StandardError => e
15
15
  Feedjira.logger.warn { "Failed to parse date #{string.inspect}" }
16
- Feedjira.logger.debug(e)
16
+ Feedjira.logger.warn(e)
17
17
  nil
18
18
  end
19
19
 
20
20
  ##
21
21
  # Returns the id of the entry or its url if no id is present, as some
22
22
  # formats don't support it
23
+ # rubocop:disable Naming/MemoizedInstanceVariableName
23
24
  def id
24
25
  @entry_id ||= @url
25
26
  end
27
+ # rubocop:enable Naming/MemoizedInstanceVariableName
26
28
 
27
29
  ##
28
30
  # Writer for published. By default, we keep the "oldest" publish time found.
@@ -39,9 +41,9 @@ module Feedjira
39
41
  end
40
42
 
41
43
  def sanitize!
42
- %w(title author summary content image).each do |name|
44
+ %w[title author summary content image].each do |name|
43
45
  if respond_to?(name) && send(name).respond_to?(:sanitize!)
44
- send(name).send :sanitize!
46
+ send(name).send(:sanitize!)
45
47
  end
46
48
  end
47
49
  end
@@ -49,10 +51,15 @@ module Feedjira
49
51
  alias last_modified published
50
52
 
51
53
  def each
52
- @rss_fields ||= instance_variables
54
+ @rss_fields ||= instance_variables.map do |ivar|
55
+ ivar.to_s.sub("@", "")
56
+ end.select do |field| # rubocop:disable Style/MultilineBlockChain
57
+ # select callable (public) methods only
58
+ respond_to?(field)
59
+ end
53
60
 
54
61
  @rss_fields.each do |field|
55
- yield(field.to_s.sub('@', ''), instance_variable_get(field))
62
+ yield(field, instance_variable_get(:"@#{field}"))
56
63
  end
57
64
  end
58
65
 
@@ -1,8 +1,8 @@
1
- # rubocop:disable Style/Documentation
2
- # rubocop:disable Style/DocumentationMethod
1
+ # frozen_string_literal: true
2
+
3
3
  module Feedjira
4
4
  module FeedUtilities
5
- UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified etag).freeze
5
+ UPDATABLE_ATTRIBUTES = %w[title feed_url url last_modified etag].freeze
6
6
 
7
7
  attr_writer :new_entries, :updated, :last_modified
8
8
  attr_accessor :etag
@@ -43,7 +43,7 @@ module Feedjira
43
43
  def last_modified
44
44
  @last_modified ||= begin
45
45
  published = entries.reject { |e| e.published.nil? }
46
- entry = published.sort_by { |e| e.published if e.published }.last
46
+ entry = published.max_by(&:published)
47
47
  entry ? entry.published : nil
48
48
  end
49
49
  end
@@ -102,6 +102,7 @@ module Feedjira
102
102
 
103
103
  feed.entries.each do |entry|
104
104
  break unless new_entry?(entry, latest_entry)
105
+
105
106
  found_new_entries << entry
106
107
  end
107
108
 
@@ -1 +1,6 @@
1
- module Feedjira::Parser; end # rubocop:disable Style/Documentation
1
+ # frozen_string_literal: true
2
+
3
+ module Feedjira
4
+ module Parser
5
+ end
6
+ end
@@ -1,4 +1,5 @@
1
- # rubocop:disable Style/DocumentationMethod
1
+ # frozen_string_literal: true
2
+
2
3
  module Feedjira
3
4
  module Parser
4
5
  # Parser for dealing with Atom feeds.
@@ -8,14 +9,14 @@ module Feedjira
8
9
 
9
10
  element :title
10
11
  element :subtitle, as: :description
11
- element :link, as: :url, value: :href, with: { type: 'text/html' }
12
- element :link, as: :feed_url, value: :href, with: { rel: 'self' }
12
+ element :link, as: :url, value: :href, with: { type: "text/html" }
13
+ element :link, as: :feed_url, value: :href, with: { rel: "self" }
13
14
  elements :link, as: :links, value: :href
14
- elements :link, as: :hubs, value: :href, with: { rel: 'hub' }
15
+ elements :link, as: :hubs, value: :href, with: { rel: "hub" }
15
16
  elements :entry, as: :entries, class: AtomEntry
16
17
 
17
18
  def self.able_to_parse?(xml)
18
- %r{\<feed[^\>]+xmlns\s?=\s?[\"\'](http://www\.w3\.org/2005/Atom|http://purl\.org/atom/ns\#)[\"\'][^\>]*\>} =~ xml # rubocop:disable Metrics/LineLength
19
+ %r{<feed[^>]+xmlns\s?=\s?["'](http://www\.w3\.org/2005/Atom|http://purl\.org/atom/ns\#)["'][^>]*>} =~ xml
19
20
  end
20
21
 
21
22
  def url
@@ -1,32 +1,15 @@
1
- # rubocop:disable Style/DocumentationMethod
1
+ # frozen_string_literal: true
2
+
2
3
  module Feedjira
3
4
  module Parser
4
5
  # Parser for dealing with Atom feed entries.
5
6
  class AtomEntry
6
7
  include SAXMachine
7
8
  include FeedEntryUtilities
9
+ include AtomEntryUtilities
8
10
 
9
- element :title
10
- element :link, as: :url, value: :href, with: { type: 'text/html', rel: 'alternate' } # rubocop:disable Metrics/LineLength
11
- element :name, as: :author
12
- element :content
13
- element :summary
14
-
11
+ element :"media:thumbnail", as: :image, value: :url
15
12
  element :"media:content", as: :image, value: :url
16
- element :enclosure, as: :image, value: :href
17
-
18
- element :published
19
- element :id, as: :entry_id
20
- element :created, as: :published
21
- element :issued, as: :published
22
- element :updated
23
- element :modified, as: :updated
24
- elements :category, as: :categories, value: :term
25
- elements :link, as: :links, value: :href
26
-
27
- def url
28
- @url ||= links.first
29
- end
30
13
  end
31
14
  end
32
15
  end
@@ -1,4 +1,5 @@
1
- # rubocop:disable Style/DocumentationMethod
1
+ # frozen_string_literal: true
2
+
2
3
  module Feedjira
3
4
  module Parser
4
5
  # Parser for dealing with Feedburner Atom feeds.
@@ -9,18 +10,18 @@ module Feedjira
9
10
  element :title
10
11
  element :subtitle, as: :description
11
12
  element :link, as: :url_text_html, value: :href,
12
- with: { type: 'text/html' }
13
+ with: { type: "text/html" }
13
14
  element :link, as: :url_notype, value: :href, with: { type: nil }
14
- element :link, as: :feed_url_link, value: :href, with: { type: 'application/atom+xml' } # rubocop:disable Metrics/LineLength
15
+ element :link, as: :feed_url_link, value: :href, with: { type: "application/atom+xml" }
15
16
  element :"atom10:link", as: :feed_url_atom10_link, value: :href,
16
- with: { type: 'application/atom+xml' }
17
- elements :"atom10:link", as: :hubs, value: :href, with: { rel: 'hub' }
17
+ with: { type: "application/atom+xml" }
18
+ elements :"atom10:link", as: :hubs, value: :href, with: { rel: "hub" }
18
19
  elements :entry, as: :entries, class: AtomFeedBurnerEntry
19
20
 
20
21
  attr_writer :url, :feed_url
21
22
 
22
23
  def self.able_to_parse?(xml)
23
- ((/Atom/ =~ xml) && (/feedburner/ =~ xml) && !(/\<rss|\<rdf/ =~ xml)) || false # rubocop:disable Metrics/LineLength
24
+ ((/<feed/ =~ xml) && (/Atom/ =~ xml) && (/feedburner/ =~ xml) && !(/<rss|<rdf/ =~ xml)) || false
24
25
  end
25
26
 
26
27
  # Feed url is <link> with type="text/html" if present,
@@ -1,32 +1,21 @@
1
- # rubocop:disable Style/DocumentationMethod
1
+ # frozen_string_literal: true
2
+
2
3
  module Feedjira
3
4
  module Parser
4
5
  # Parser for dealing with Feedburner Atom feed entries.
5
6
  class AtomFeedBurnerEntry
6
7
  include SAXMachine
7
8
  include FeedEntryUtilities
9
+ include AtomEntryUtilities
8
10
 
9
- element :title
10
- element :name, as: :author
11
- element :link, as: :url, value: :href, with: { type: 'text/html', rel: 'alternate' } # rubocop:disable Metrics/LineLength
12
- element :"feedburner:origLink", as: :url
13
- element :summary
14
- element :content
11
+ element :"feedburner:origLink", as: :orig_link
12
+ private :orig_link
15
13
 
14
+ element :"media:thumbnail", as: :image, value: :url
16
15
  element :"media:content", as: :image, value: :url
17
- element :enclosure, as: :image, value: :href
18
-
19
- element :published
20
- element :id, as: :entry_id
21
- element :issued, as: :published
22
- element :created, as: :published
23
- element :updated
24
- element :modified, as: :updated
25
- elements :category, as: :categories, value: :term
26
- elements :link, as: :links, value: :href
27
16
 
28
17
  def url
29
- @url ||= links.first
18
+ orig_link || super
30
19
  end
31
20
  end
32
21
  end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Feedjira
4
+ module Parser
5
+ # Parser for dealing with Google Alerts Atom feeds.
6
+ class AtomGoogleAlerts
7
+ include SAXMachine
8
+ include FeedUtilities
9
+
10
+ element :title
11
+ element :subtitle, as: :description
12
+ element :link, as: :feed_url, value: :href, with: { rel: "self" }
13
+ element :link, as: :url, value: :href, with: { rel: "self" }
14
+ elements :link, as: :links, value: :href
15
+ elements :entry, as: :entries, class: AtomGoogleAlertsEntry
16
+
17
+ def self.able_to_parse?(xml)
18
+ Atom.able_to_parse?(xml) && (%r{<id>tag:google\.com,2005:[^<]+/com\.google/alerts/} === xml) # rubocop:disable Style/CaseEquality
19
+ end
20
+
21
+ def self.preprocess(xml)
22
+ Preprocessor.new(xml).to_xml
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Feedjira
4
+ module Parser
5
+ # Parser for dealing with Google Alerts Atom feed entries.
6
+ class AtomGoogleAlertsEntry
7
+ include SAXMachine
8
+ include FeedEntryUtilities
9
+ include AtomEntryUtilities
10
+
11
+ def url
12
+ url = super
13
+ return unless url&.start_with?("https://www.google.com/url?")
14
+
15
+ uri = URI(url)
16
+ cons = URI.decode_www_form(uri.query).assoc("url")
17
+ cons && cons[1]
18
+ end
19
+ end
20
+ end
21
+ end