feedjira 2.2.0 → 3.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE/feed-parsing.md +15 -0
  3. data/.rubocop.yml +32 -8
  4. data/.rubocop_todo.yml +11 -0
  5. data/.travis.yml +3 -7
  6. data/CHANGELOG.md +18 -9
  7. data/CODE_OF_CONDUCT.md +74 -0
  8. data/Gemfile +8 -5
  9. data/README.md +46 -99
  10. data/Rakefile +8 -6
  11. data/feedjira.gemspec +31 -20
  12. data/lib/feedjira.rb +75 -41
  13. data/lib/feedjira/atom_entry_utilities.rb +51 -0
  14. data/lib/feedjira/configuration.rb +8 -10
  15. data/lib/feedjira/core_ext.rb +5 -3
  16. data/lib/feedjira/core_ext/date.rb +2 -1
  17. data/lib/feedjira/core_ext/string.rb +2 -1
  18. data/lib/feedjira/core_ext/time.rb +12 -12
  19. data/lib/feedjira/date_time_utilities.rb +8 -10
  20. data/lib/feedjira/date_time_utilities/date_time_epoch_parser.rb +3 -2
  21. data/lib/feedjira/date_time_utilities/date_time_language_parser.rb +4 -4
  22. data/lib/feedjira/date_time_utilities/date_time_pattern_parser.rb +11 -15
  23. data/lib/feedjira/feed.rb +12 -82
  24. data/lib/feedjira/feed_entry_utilities.rb +14 -7
  25. data/lib/feedjira/feed_utilities.rb +5 -4
  26. data/lib/feedjira/parser.rb +6 -1
  27. data/lib/feedjira/parser/atom.rb +6 -5
  28. data/lib/feedjira/parser/atom_entry.rb +4 -21
  29. data/lib/feedjira/parser/atom_feed_burner.rb +7 -6
  30. data/lib/feedjira/parser/atom_feed_burner_entry.rb +7 -18
  31. data/lib/feedjira/parser/atom_google_alerts.rb +26 -0
  32. data/lib/feedjira/parser/atom_google_alerts_entry.rb +21 -0
  33. data/lib/feedjira/parser/atom_youtube.rb +4 -3
  34. data/lib/feedjira/parser/atom_youtube_entry.rb +9 -8
  35. data/lib/feedjira/parser/globally_unique_identifier.rb +21 -0
  36. data/lib/feedjira/parser/google_docs_atom.rb +6 -6
  37. data/lib/feedjira/parser/google_docs_atom_entry.rb +3 -19
  38. data/lib/feedjira/parser/itunes_rss.rb +4 -3
  39. data/lib/feedjira/parser/itunes_rss_category.rb +6 -5
  40. data/lib/feedjira/parser/itunes_rss_item.rb +5 -8
  41. data/lib/feedjira/parser/itunes_rss_owner.rb +2 -1
  42. data/lib/feedjira/parser/json_feed.rb +41 -0
  43. data/lib/feedjira/parser/json_feed_item.rb +57 -0
  44. data/lib/feedjira/parser/podlove_chapter.rb +4 -3
  45. data/lib/feedjira/parser/rss.rb +5 -3
  46. data/lib/feedjira/parser/rss_entry.rb +3 -24
  47. data/lib/feedjira/parser/rss_feed_burner.rb +4 -3
  48. data/lib/feedjira/parser/rss_feed_burner_entry.rb +6 -26
  49. data/lib/feedjira/parser/rss_image.rb +2 -0
  50. data/lib/feedjira/preprocessor.rb +4 -4
  51. data/lib/feedjira/rss_entry_utilities.rb +53 -0
  52. data/lib/feedjira/version.rb +3 -1
  53. data/spec/feedjira/configuration_spec.rb +11 -16
  54. data/spec/feedjira/date_time_utilities_spec.rb +22 -20
  55. data/spec/feedjira/feed_entry_utilities_spec.rb +20 -18
  56. data/spec/feedjira/feed_spec.rb +17 -229
  57. data/spec/feedjira/feed_utilities_spec.rb +75 -73
  58. data/spec/feedjira/parser/atom_entry_spec.rb +41 -38
  59. data/spec/feedjira/parser/atom_feed_burner_entry_spec.rb +22 -20
  60. data/spec/feedjira/parser/atom_feed_burner_spec.rb +122 -118
  61. data/spec/feedjira/parser/atom_google_alerts_entry_spec.rb +34 -0
  62. data/spec/feedjira/parser/atom_google_alerts_spec.rb +62 -0
  63. data/spec/feedjira/parser/atom_spec.rb +83 -77
  64. data/spec/feedjira/parser/atom_youtube_entry_spec.rb +41 -39
  65. data/spec/feedjira/parser/atom_youtube_spec.rb +21 -19
  66. data/spec/feedjira/parser/google_docs_atom_entry_spec.rb +10 -8
  67. data/spec/feedjira/parser/google_docs_atom_spec.rb +25 -21
  68. data/spec/feedjira/parser/itunes_rss_item_spec.rb +39 -37
  69. data/spec/feedjira/parser/itunes_rss_owner_spec.rb +7 -5
  70. data/spec/feedjira/parser/itunes_rss_spec.rb +120 -116
  71. data/spec/feedjira/parser/json_feed_item_spec.rb +81 -0
  72. data/spec/feedjira/parser/json_feed_spec.rb +55 -0
  73. data/spec/feedjira/parser/podlove_chapter_spec.rb +14 -12
  74. data/spec/feedjira/parser/rss_entry_spec.rb +56 -34
  75. data/spec/feedjira/parser/rss_feed_burner_entry_spec.rb +36 -34
  76. data/spec/feedjira/parser/rss_feed_burner_spec.rb +49 -45
  77. data/spec/feedjira/parser/rss_spec.rb +38 -36
  78. data/spec/feedjira/preprocessor_spec.rb +9 -7
  79. data/spec/feedjira_spec.rb +166 -0
  80. data/spec/sample_feeds.rb +32 -29
  81. data/spec/sample_feeds/HuffPostCanada.xml +279 -0
  82. data/spec/sample_feeds/Permalinks.xml +22 -0
  83. data/spec/sample_feeds/a10.xml +72 -0
  84. data/spec/sample_feeds/google_alerts_atom.xml +1 -0
  85. data/spec/sample_feeds/json_feed.json +156 -0
  86. data/spec/spec_helper.rb +7 -5
  87. metadata +59 -70
  88. data/Dangerfile +0 -1
  89. data/fixtures/vcr_cassettes/fetch_failure.yml +0 -62
  90. data/fixtures/vcr_cassettes/parse_error.yml +0 -222
  91. data/fixtures/vcr_cassettes/success.yml +0 -281
@@ -1,32 +1,28 @@
1
- # rubocop:disable Style/Documentation
2
- # rubocop:disable Style/DocumentationMethod
1
+ # frozen_string_literal: true
2
+
3
3
  module Feedjira
4
4
  module DateTimeUtilities
5
5
  class DateTimePatternParser
6
- # rubocop:disable Style/AsciiComments
7
6
  # Japanese Symbols are required for strange Date Strings like
8
7
  # '水, 31 8 2016 07:37:00 PDT'
9
- JAPANESE_SYMBOLS = %w(日 月 火 水 木 金 土).freeze
10
- PATTERNS = ['%m/%d/%Y %T %p', '%d %m %Y %T %Z'].freeze
8
+ JAPANESE_SYMBOLS = %w[日 月 火 水 木 金 土].freeze
9
+ PATTERNS = ["%m/%d/%Y %T %p", "%d %m %Y %T %Z"].freeze
11
10
 
12
- # rubocop:disable Metrics/MethodLength
13
11
  def self.parse(string)
14
12
  PATTERNS.each do |p|
15
- begin
16
- datetime = DateTime.strptime(prepare(string), p)
17
- return datetime
18
- rescue StandardError => e
19
- Feedjira.logger.debug("Failed to parse date #{string}")
20
- Feedjira.logger.debug(e)
21
- nil
22
- end
13
+ datetime = DateTime.strptime(prepare(string), p)
14
+ return datetime
15
+ rescue StandardError => e
16
+ Feedjira.logger.debug("Failed to parse date #{string}")
17
+ Feedjira.logger.debug(e)
18
+ nil
23
19
  end
24
20
  raise "No pattern matched #{string}"
25
21
  end
26
22
 
27
23
  def self.prepare(string)
28
24
  rgx = Regexp.new("^(#{JAPANESE_SYMBOLS.join('|')}),\s")
29
- string.gsub(rgx, '')
25
+ string.gsub(rgx, "")
30
26
  end
31
27
  private_class_method :prepare
32
28
  end
@@ -1,60 +1,35 @@
1
- # rubocop:disable Style/Documentation
2
- # rubocop:disable Style/DocumentationMethod
1
+ # frozen_string_literal: true
2
+
3
3
  module Feedjira
4
4
  class Feed
5
5
  class << self
6
- def parse_with(parser, xml, &block)
7
- parser.parse xml, &block
8
- end
9
-
10
- def parse(xml, &block)
11
- parser = determine_feed_parser_for_xml(xml)
12
- raise NoParserAvailable, 'No valid parser for XML.' unless parser
13
- parse_with parser, xml, &block
14
- end
15
-
16
- def determine_feed_parser_for_xml(xml)
17
- start_of_doc = xml.slice(0, 2000)
18
- feed_classes.detect { |klass| klass.able_to_parse?(start_of_doc) }
19
- end
20
-
21
- def add_feed_class(klass)
22
- feed_classes.unshift klass
23
- end
24
-
25
- def feed_classes
26
- @feed_classes ||= Feedjira.parsers
27
- end
28
-
29
- def reset_parsers!
30
- @feed_classes = nil
31
- end
32
-
33
6
  def add_common_feed_element(element_tag, options = {})
34
- feed_classes.each do |k|
35
- k.element element_tag, options
7
+ Feedjira.parsers.each do |k|
8
+ k.element(element_tag, options)
36
9
  end
37
10
  end
38
11
 
39
12
  def add_common_feed_elements(element_tag, options = {})
40
- feed_classes.each do |k|
41
- k.elements element_tag, options
13
+ Feedjira.parsers.each do |k|
14
+ k.elements(element_tag, options)
42
15
  end
43
16
  end
44
17
 
45
18
  def add_common_feed_entry_element(element_tag, options = {})
46
- call_on_each_feed_entry :element, element_tag, options
19
+ call_on_each_feed_entry(:element, element_tag, options)
47
20
  end
48
21
 
49
22
  def add_common_feed_entry_elements(element_tag, options = {})
50
- call_on_each_feed_entry :elements, element_tag, options
23
+ call_on_each_feed_entry(:elements, element_tag, options)
51
24
  end
52
25
 
26
+ private
27
+
53
28
  def call_on_each_feed_entry(method, *parameters)
54
- feed_classes.each do |klass|
29
+ Feedjira.parsers.each do |klass|
55
30
  klass.sax_config.collection_elements.each_value do |value|
56
31
  collection_configs = value.select do |v|
57
- v.accessor == 'entries' && v.data_class.class == Class
32
+ v.accessor == "entries" && v.data_class.class == Class
58
33
  end
59
34
 
60
35
  collection_configs.each do |config|
@@ -63,51 +38,6 @@ module Feedjira
63
38
  end
64
39
  end
65
40
  end
66
-
67
- def fetch_and_parse(url)
68
- response = connection(url).get
69
- unless response.success?
70
- raise FetchFailure, "Fetch failed - #{response.status}"
71
- end
72
- feed = parse response.body
73
- feed.feed_url = url
74
- feed.etag = response.headers['etag'].to_s.delete '"'
75
-
76
- feed.last_modified = parse_last_modified(response)
77
- feed
78
- end
79
-
80
- # rubocop:disable LineLength
81
- def connection(url)
82
- Faraday.new(url: url, headers: headers, request: request_options) do |conn|
83
- conn.use FaradayMiddleware::FollowRedirects, limit: Feedjira.follow_redirect_limit
84
- conn.adapter(*Faraday.default_adapter)
85
- end
86
- end
87
- # rubocop:enable LineLength
88
-
89
- private
90
-
91
- def headers
92
- {
93
- user_agent: Feedjira.user_agent
94
- }
95
- end
96
-
97
- def request_options
98
- {
99
- timeout: Feedjira.request_timeout
100
- }
101
- end
102
-
103
- def parse_last_modified(response)
104
- lm = response.headers['last-modified']
105
- DateTime.parse(lm).to_time
106
- rescue StandardError => e
107
- Feedjira.logger.warn { "Failed to parse last modified '#{lm}'" }
108
- Feedjira.logger.debug(e)
109
- nil
110
- end
111
41
  end
112
42
  end
113
43
  end
@@ -1,5 +1,5 @@
1
- # rubocop:disable Style/Documentation
2
- # rubocop:disable Style/DocumentationMethod
1
+ # frozen_string_literal: true
2
+
3
3
  module Feedjira
4
4
  module FeedEntryUtilities
5
5
  include Enumerable
@@ -13,16 +13,18 @@ module Feedjira
13
13
  DateTime.parse(string).feed_utils_to_gm_time
14
14
  rescue StandardError => e
15
15
  Feedjira.logger.warn { "Failed to parse date #{string.inspect}" }
16
- Feedjira.logger.debug(e)
16
+ Feedjira.logger.warn(e)
17
17
  nil
18
18
  end
19
19
 
20
20
  ##
21
21
  # Returns the id of the entry or its url if not id is present, as some
22
22
  # formats don't support it
23
+ # rubocop:disable Naming/MemoizedInstanceVariableName
23
24
  def id
24
25
  @entry_id ||= @url
25
26
  end
27
+ # rubocop:enable Naming/MemoizedInstanceVariableName
26
28
 
27
29
  ##
28
30
  # Writer for published. By default, we keep the "oldest" publish time found.
@@ -39,9 +41,9 @@ module Feedjira
39
41
  end
40
42
 
41
43
  def sanitize!
42
- %w(title author summary content image).each do |name|
44
+ %w[title author summary content image].each do |name|
43
45
  if respond_to?(name) && send(name).respond_to?(:sanitize!)
44
- send(name).send :sanitize!
46
+ send(name).send(:sanitize!)
45
47
  end
46
48
  end
47
49
  end
@@ -49,10 +51,15 @@ module Feedjira
49
51
  alias last_modified published
50
52
 
51
53
  def each
52
- @rss_fields ||= instance_variables
54
+ @rss_fields ||= instance_variables.map do |ivar|
55
+ ivar.to_s.sub("@", "")
56
+ end.select do |field| # rubocop:disable Style/MultilineBlockChain
57
+ # select callable (public) methods only
58
+ respond_to?(field)
59
+ end
53
60
 
54
61
  @rss_fields.each do |field|
55
- yield(field.to_s.sub('@', ''), instance_variable_get(field))
62
+ yield(field, instance_variable_get(:"@#{field}"))
56
63
  end
57
64
  end
58
65
 
@@ -1,8 +1,8 @@
1
- # rubocop:disable Style/Documentation
2
- # rubocop:disable Style/DocumentationMethod
1
+ # frozen_string_literal: true
2
+
3
3
  module Feedjira
4
4
  module FeedUtilities
5
- UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified etag).freeze
5
+ UPDATABLE_ATTRIBUTES = %w[title feed_url url last_modified etag].freeze
6
6
 
7
7
  attr_writer :new_entries, :updated, :last_modified
8
8
  attr_accessor :etag
@@ -43,7 +43,7 @@ module Feedjira
43
43
  def last_modified
44
44
  @last_modified ||= begin
45
45
  published = entries.reject { |e| e.published.nil? }
46
- entry = published.sort_by { |e| e.published if e.published }.last
46
+ entry = published.max_by(&:published)
47
47
  entry ? entry.published : nil
48
48
  end
49
49
  end
@@ -102,6 +102,7 @@ module Feedjira
102
102
 
103
103
  feed.entries.each do |entry|
104
104
  break unless new_entry?(entry, latest_entry)
105
+
105
106
  found_new_entries << entry
106
107
  end
107
108
 
@@ -1 +1,6 @@
1
- module Feedjira::Parser; end # rubocop:disable Style/Documentation
1
+ # frozen_string_literal: true
2
+
3
+ module Feedjira
4
+ module Parser
5
+ end
6
+ end
@@ -1,4 +1,5 @@
1
- # rubocop:disable Style/DocumentationMethod
1
+ # frozen_string_literal: true
2
+
2
3
  module Feedjira
3
4
  module Parser
4
5
  # Parser for dealing with Atom feeds.
@@ -8,14 +9,14 @@ module Feedjira
8
9
 
9
10
  element :title
10
11
  element :subtitle, as: :description
11
- element :link, as: :url, value: :href, with: { type: 'text/html' }
12
- element :link, as: :feed_url, value: :href, with: { rel: 'self' }
12
+ element :link, as: :url, value: :href, with: { type: "text/html" }
13
+ element :link, as: :feed_url, value: :href, with: { rel: "self" }
13
14
  elements :link, as: :links, value: :href
14
- elements :link, as: :hubs, value: :href, with: { rel: 'hub' }
15
+ elements :link, as: :hubs, value: :href, with: { rel: "hub" }
15
16
  elements :entry, as: :entries, class: AtomEntry
16
17
 
17
18
  def self.able_to_parse?(xml)
18
- %r{\<feed[^\>]+xmlns\s?=\s?[\"\'](http://www\.w3\.org/2005/Atom|http://purl\.org/atom/ns\#)[\"\'][^\>]*\>} =~ xml # rubocop:disable Metrics/LineLength
19
+ %r{<feed[^>]+xmlns\s?=\s?["'](http://www\.w3\.org/2005/Atom|http://purl\.org/atom/ns\#)["'][^>]*>} =~ xml
19
20
  end
20
21
 
21
22
  def url
@@ -1,32 +1,15 @@
1
- # rubocop:disable Style/DocumentationMethod
1
+ # frozen_string_literal: true
2
+
2
3
  module Feedjira
3
4
  module Parser
4
5
  # Parser for dealing with Atom feed entries.
5
6
  class AtomEntry
6
7
  include SAXMachine
7
8
  include FeedEntryUtilities
9
+ include AtomEntryUtilities
8
10
 
9
- element :title
10
- element :link, as: :url, value: :href, with: { type: 'text/html', rel: 'alternate' } # rubocop:disable Metrics/LineLength
11
- element :name, as: :author
12
- element :content
13
- element :summary
14
-
11
+ element :"media:thumbnail", as: :image, value: :url
15
12
  element :"media:content", as: :image, value: :url
16
- element :enclosure, as: :image, value: :href
17
-
18
- element :published
19
- element :id, as: :entry_id
20
- element :created, as: :published
21
- element :issued, as: :published
22
- element :updated
23
- element :modified, as: :updated
24
- elements :category, as: :categories, value: :term
25
- elements :link, as: :links, value: :href
26
-
27
- def url
28
- @url ||= links.first
29
- end
30
13
  end
31
14
  end
32
15
  end
@@ -1,4 +1,5 @@
1
- # rubocop:disable Style/DocumentationMethod
1
+ # frozen_string_literal: true
2
+
2
3
  module Feedjira
3
4
  module Parser
4
5
  # Parser for dealing with Feedburner Atom feeds.
@@ -9,18 +10,18 @@ module Feedjira
9
10
  element :title
10
11
  element :subtitle, as: :description
11
12
  element :link, as: :url_text_html, value: :href,
12
- with: { type: 'text/html' }
13
+ with: { type: "text/html" }
13
14
  element :link, as: :url_notype, value: :href, with: { type: nil }
14
- element :link, as: :feed_url_link, value: :href, with: { type: 'application/atom+xml' } # rubocop:disable Metrics/LineLength
15
+ element :link, as: :feed_url_link, value: :href, with: { type: "application/atom+xml" }
15
16
  element :"atom10:link", as: :feed_url_atom10_link, value: :href,
16
- with: { type: 'application/atom+xml' }
17
- elements :"atom10:link", as: :hubs, value: :href, with: { rel: 'hub' }
17
+ with: { type: "application/atom+xml" }
18
+ elements :"atom10:link", as: :hubs, value: :href, with: { rel: "hub" }
18
19
  elements :entry, as: :entries, class: AtomFeedBurnerEntry
19
20
 
20
21
  attr_writer :url, :feed_url
21
22
 
22
23
  def self.able_to_parse?(xml)
23
- ((/Atom/ =~ xml) && (/feedburner/ =~ xml) && !(/\<rss|\<rdf/ =~ xml)) || false # rubocop:disable Metrics/LineLength
24
+ ((/<feed/ =~ xml) && (/Atom/ =~ xml) && (/feedburner/ =~ xml) && !(/<rss|<rdf/ =~ xml)) || false
24
25
  end
25
26
 
26
27
  # Feed url is <link> with type="text/html" if present,
@@ -1,32 +1,21 @@
1
- # rubocop:disable Style/DocumentationMethod
1
+ # frozen_string_literal: true
2
+
2
3
  module Feedjira
3
4
  module Parser
4
5
  # Parser for dealing with Feedburner Atom feed entries.
5
6
  class AtomFeedBurnerEntry
6
7
  include SAXMachine
7
8
  include FeedEntryUtilities
9
+ include AtomEntryUtilities
8
10
 
9
- element :title
10
- element :name, as: :author
11
- element :link, as: :url, value: :href, with: { type: 'text/html', rel: 'alternate' } # rubocop:disable Metrics/LineLength
12
- element :"feedburner:origLink", as: :url
13
- element :summary
14
- element :content
11
+ element :"feedburner:origLink", as: :orig_link
12
+ private :orig_link
15
13
 
14
+ element :"media:thumbnail", as: :image, value: :url
16
15
  element :"media:content", as: :image, value: :url
17
- element :enclosure, as: :image, value: :href
18
-
19
- element :published
20
- element :id, as: :entry_id
21
- element :issued, as: :published
22
- element :created, as: :published
23
- element :updated
24
- element :modified, as: :updated
25
- elements :category, as: :categories, value: :term
26
- elements :link, as: :links, value: :href
27
16
 
28
17
  def url
29
- @url ||= links.first
18
+ orig_link || super
30
19
  end
31
20
  end
32
21
  end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Feedjira
4
+ module Parser
5
+ # Parser for dealing with Feedburner Atom feeds.
6
+ class AtomGoogleAlerts
7
+ include SAXMachine
8
+ include FeedUtilities
9
+
10
+ element :title
11
+ element :subtitle, as: :description
12
+ element :link, as: :feed_url, value: :href, with: { rel: "self" }
13
+ element :link, as: :url, value: :href, with: { rel: "self" }
14
+ elements :link, as: :links, value: :href
15
+ elements :entry, as: :entries, class: AtomGoogleAlertsEntry
16
+
17
+ def self.able_to_parse?(xml)
18
+ Atom.able_to_parse?(xml) && (%r{<id>tag:google\.com,2005:[^<]+/com\.google/alerts/} === xml) # rubocop:disable Style/CaseEquality
19
+ end
20
+
21
+ def self.preprocess(xml)
22
+ Preprocessor.new(xml).to_xml
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Feedjira
4
+ module Parser
5
+ # Parser for dealing with Feedburner Atom feed entries.
6
+ class AtomGoogleAlertsEntry
7
+ include SAXMachine
8
+ include FeedEntryUtilities
9
+ include AtomEntryUtilities
10
+
11
+ def url
12
+ url = super
13
+ return unless url&.start_with?("https://www.google.com/url?")
14
+
15
+ uri = URI(url)
16
+ cons = URI.decode_www_form(uri.query).assoc("url")
17
+ cons && cons[1]
18
+ end
19
+ end
20
+ end
21
+ end