html2rss 0.13.0 → 0.14.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7a2bf557dd65533533e07b4581e195f2d2b32ff906831526a4d7aed27a558d71
4
- data.tar.gz: f42e5f03649a08219d310a2545413c371f851530c4d323fd68ef783b4b3b5e13
3
+ metadata.gz: a2ce9bbe8640372b5e98672760d76aee5f6f23373dd4b22ca067d2cdaa6f2b15
4
+ data.tar.gz: ff280d9466ee6b15b1149f582dadf9b209f0e99e4fb02e6b82f91b25a7ca0b7a
5
5
  SHA512:
6
- metadata.gz: 724a1fa8ab15ae140278eb9b055f22e7aad12e94627795f7a2f13c78f5421607e39d6ba040821b4c47b69f963cc0180bf8e964ff0b896403cb6305ed1d67dbb5
7
- data.tar.gz: a06c2e16b0b51c6b6d2184430efc2a4e8b2812fee413163aa2991567e7608141f1c18189fdded58c8c3383940c4790478cd631abc6a1470ad648b2030fdefaab
6
+ metadata.gz: d516b897253374425ccd3b26d21362df46c18c26694fe1a8aaddc06b956f93e36111d3310dc635b84d7626a2072014d27dacb075cd98d15a79d39aed40991bcb
7
+ data.tar.gz: 91ae4190d04967c1bc9d3f46b0a0bdbbd23f3dbda6559c3ff10391ab0d63d4b984a44789c8427dbace345af617a33e86c2f441c0a0d58dbcf2b734bd78b73b87
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/html2rss.svg)](http://rubygems.org/gems/html2rss/) [![Yard Docs](http://img.shields.io/badge/yard-docs-blue.svg)](https://www.rubydoc.info/gems/html2rss) ![Retro Badge: valid RSS](https://validator.w3.org/feed/images/valid-rss-rogers.png)
4
4
 
5
- `html2rss` is a Ruby gem that generates RSS 2.0 feeds from a _feed config_.
5
+ `html2rss` is a Ruby gem that generates RSS 2.0 feeds from websites automatically and, as a fallback, via a _feed config_.
6
6
 
7
7
  With the _feed config_, you provide a URL to scrape and CSS selectors for extracting information (like title, URL, etc.). The gem builds the RSS feed accordingly. [Extractors](#using-extractors) and chainable [post processors](#using-post-processors) make information extraction, processing, and sanitizing a breeze. The gem also supports [scraping JSON](#scraping-and-handling-json-responses) responses and [setting HTTP request headers](#set-any-http-header-in-the-request).
8
8
 
@@ -26,9 +26,9 @@ You can also install it as a dependency in your Ruby project:
26
26
 
27
27
  ## Generating a feed on the CLI
28
28
 
29
- ### using automatic scraping
29
+ ### using automatic generation
30
30
 
31
- html2rss offers an automatic scrapting feature. Try it with:
31
+ html2rss offers an automatic RSS generation feature. Try it with:
32
32
 
33
33
  `html2rss auto https://unmatchedstyle.com/`
34
34
 
@@ -10,21 +10,27 @@ module Html2rss
10
10
  ##
11
11
  #
12
12
  # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document.
13
- # @param response [Faraday::Response] The URL of the HTML document.
14
- def initialize(parsed_body, url:, response:, articles: [])
13
+ # @param url [Addressable::URI] The URL of the channel.
14
+ # @param headers [Hash<String, String>] the http headers
15
+ # @param articles [Array<Html2rss::AutoSource::Article>] The articles.
16
+ def initialize(parsed_body, url:, headers:, articles: [], stylesheets: [])
15
17
  @parsed_body = parsed_body
16
18
  @url = url
17
- @response = response
19
+ @headers = headers
18
20
  @articles = articles
21
+ @stylesheets = stylesheets
19
22
  end
20
23
 
24
+ attr_writer :articles
25
+ attr_reader :stylesheets
26
+
21
27
  def url = extract_url
22
28
  def title = extract_title
23
29
  def language = extract_language
24
30
  def description = extract_description
25
31
  def image = extract_image
26
32
  def ttl = extract_ttl
27
- def last_build_date = response.headers['last-modified']
33
+ def last_build_date = headers['last-modified']
28
34
 
29
35
  def generator
30
36
  "html2rss V. #{::Html2rss::VERSION} (using auto_source scrapers: #{scraper_counts})"
@@ -32,7 +38,7 @@ module Html2rss
32
38
 
33
39
  private
34
40
 
35
- attr_reader :parsed_body, :response
41
+ attr_reader :parsed_body, :headers
36
42
 
37
43
  def extract_url
38
44
  @url.normalize.to_s
@@ -58,7 +64,7 @@ module Html2rss
58
64
  end
59
65
 
60
66
  def extract_ttl
61
- ttl = response.headers['cache-control']&.match(/max-age=(\d+)/)&.[](1)
67
+ ttl = headers['cache-control']&.match(/max-age=(\d+)/)&.[](1)
62
68
  return unless ttl
63
69
 
64
70
  ttl.to_i.fdiv(60).ceil
@@ -31,6 +31,8 @@ module Html2rss
31
31
 
32
32
  def call
33
33
  RSS::Maker.make('2.0') do |maker|
34
+ Html2rss::RssBuilder::Stylesheet.add(maker, channel.stylesheets)
35
+
34
36
  make_channel(maker.channel)
35
37
  make_items(maker)
36
38
  end
@@ -1,5 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'json'
4
+ require 'nokogiri'
5
+ require 'set'
6
+
3
7
  module Html2rss
4
8
  class AutoSource
5
9
  module Scraper
@@ -99,6 +103,8 @@ module Html2rss
99
103
  # @yield [Hash] Each scraped article_hash
100
104
  # @return [Array<Hash>] the scraped article_hashes
101
105
  def each(&)
106
+ return enum_for(:each) unless block_given?
107
+
102
108
  schema_objects.filter_map do |schema_object|
103
109
  next unless (klass = self.class.scraper_for_schema_object(schema_object))
104
110
  next unless (article_hash = klass.new(schema_object, url:).call)
@@ -74,7 +74,7 @@ module Html2rss
74
74
  def find_heading
75
75
  heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
76
76
  smallest_heading = heading_tags.keys.min
77
- heading_tags[smallest_heading]&.max_by { |tag| tag.text.size }
77
+ heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size }
78
78
  end
79
79
 
80
80
  def extract_title
@@ -16,16 +16,18 @@ module Html2rss
16
16
 
17
17
  SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
18
18
 
19
- def initialize(url)
20
- unless url.is_a?(String) || url.is_a?(Addressable::URI)
21
- raise ArgumentError,
22
- 'URL must be a String or Addressable::URI'
23
- end
24
-
25
- @url = Addressable::URI.parse(url)
26
-
27
- raise ArgumentError, 'URL must be absolute' unless @url.absolute?
28
- raise UnsupportedUrlScheme, "#{@url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(@url.scheme)
19
+ ##
20
+ # @param url [Addressable::URI] The URL to extract articles from.
21
+ # @param body [String] The body of the response.
22
+ # @param headers [Hash] The headers of the response.
23
+ def initialize(url, body:, headers: {})
24
+ raise ArgumentError, 'URL must be a Addressable::URI' unless url.is_a?(Addressable::URI)
25
+ raise ArgumentError, 'URL must be absolute' unless url.absolute?
26
+ raise UnsupportedUrlScheme, "#{url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(url.scheme)
27
+
28
+ @url = url
29
+ @body = body
30
+ @headers = headers
29
31
  end
30
32
 
31
33
  def build
@@ -34,6 +36,8 @@ module Html2rss
34
36
  Reducer.call(articles, url:)
35
37
  Cleanup.call(articles, url:, keep_different_domain: true)
36
38
 
39
+ channel.articles = articles
40
+
37
41
  Html2rss::AutoSource::RssBuilder.new(
38
42
  channel:,
39
43
  articles:
@@ -57,21 +61,20 @@ module Html2rss
57
61
  end
58
62
 
59
63
  def channel
60
- Channel.new(parsed_body, response:, url:, articles:)
64
+ @channel ||= Channel.new(parsed_body, headers: @headers, url:)
61
65
  end
62
66
 
63
67
  private
64
68
 
65
69
  attr_reader :url
66
70
 
67
- def response
68
- @response ||= Html2rss::Utils.request_url(url)
69
- end
70
-
71
- # Parses the HTML body of the response using Nokogiri.
72
71
  # @return [Nokogiri::HTML::Document]
73
72
  def parsed_body
74
- @parsed_body ||= Nokogiri.HTML(response.body).freeze
73
+ @parsed_body ||= Nokogiri.HTML(@body)
74
+ .tap do |doc|
75
+ # Remove comments from the document
76
+ doc.xpath('//comment()').each(&:remove)
77
+ end.freeze
75
78
  end
76
79
  end
77
80
  end
@@ -18,9 +18,6 @@ module Html2rss
18
18
  # Thrown when the feed config does not contain a value at `:channel`.
19
19
  class ChannelMissing < Html2rss::Error; end
20
20
 
21
- # Struct to store XML Stylesheet attributes
22
- Stylesheet = Struct.new(:href, :type, :media, keyword_init: true)
23
-
24
21
  def_delegator :@channel, :author
25
22
  def_delegator :@channel, :ttl
26
23
  def_delegator :@channel, :title
@@ -75,7 +72,7 @@ module Html2rss
75
72
  #
76
73
  # @return [Array<Html2rss::RssBuilder::Stylesheet>] Array of Stylesheet objects.
77
74
  def stylesheets
78
- @global.fetch(:stylesheets, []).map { |attributes| Stylesheet.new(attributes) }
75
+ @global.fetch(:stylesheets, []).map { |attributes| Html2rss::RssBuilder::Stylesheet.new(**attributes) }
79
76
  end
80
77
 
81
78
  # Provides read-only access to the channel object.
data/lib/html2rss/item.rb CHANGED
@@ -19,7 +19,7 @@ module Html2rss
19
19
  ##
20
20
  # Fetches items from a given URL using configuration settings.
21
21
  #
22
- # @param url [String] URL to fetch items from.
22
+ # @param url [Addressable::URI] URL to fetch items from.
23
23
  # @param config [Html2rss::Config] Configuration object.
24
24
  # @return [Array<Html2rss::Item>] list of items fetched.
25
25
  def self.from_url(url, config)
@@ -3,35 +3,50 @@
3
3
  module Html2rss
4
4
  module RssBuilder
5
5
  ##
6
- # Adds XML stylesheet tags (with the provided maker).
6
+ # Represents a stylesheet.
7
7
  class Stylesheet
8
- ##
9
- # Adds the stylesheet XML tags to the RSS.
10
- #
11
- # @param maker [RSS::Maker::RSS20] RSS maker object.
12
- # @param stylesheets [Array<Html2rss::Config::Stylesheet>] Array of stylesheet configurations.
13
- # @return [nil]
14
- def self.add(maker, stylesheets)
15
- stylesheets.each do |stylesheet|
16
- add_stylesheet(maker, stylesheet)
8
+ class << self
9
+ ##
10
+ # Adds the stylesheet XML tags to the RSS.
11
+ #
12
+ # @param maker [RSS::Maker::RSS20] RSS maker object.
13
+ # @param stylesheets [Array<Html2rss::RssBuilder::Stylesheet>] Array of stylesheet configurations.
14
+ # @return [nil]
15
+ def add(maker, stylesheets)
16
+ stylesheets.each do |stylesheet|
17
+ add_stylesheet(maker, stylesheet)
18
+ end
17
19
  end
18
- end
19
20
 
20
- ##
21
- # Adds a single Stylesheet to the RSS.
22
- #
23
- # @param maker [RSS::Maker::RSS20] RSS maker object.
24
- # @param stylesheet [Html2rss::Config::Stylesheet] Stylesheet configuration.
25
- # @return [nil]
26
- def self.add_stylesheet(maker, stylesheet)
27
- maker.xml_stylesheets.new_xml_stylesheet do |xss|
28
- xss.href = stylesheet.href
29
- xss.type = stylesheet.type
30
- xss.media = stylesheet.media
21
+ private
22
+
23
+ ##
24
+ # Adds a single Stylesheet to the RSS.
25
+ #
26
+ # @param maker [RSS::Maker::RSS20] RSS maker object.
27
+ # @param stylesheet [Html2rss::Config::Stylesheet] Stylesheet configuration.
28
+ # @return [nil]
29
+ def add_stylesheet(maker, stylesheet)
30
+ maker.xml_stylesheets.new_xml_stylesheet do |xss|
31
+ xss.href = stylesheet.href
32
+ xss.type = stylesheet.type
33
+ xss.media = stylesheet.media
34
+ end
31
35
  end
32
36
  end
33
37
 
34
- private_class_method :add_stylesheet
38
+ TYPES = ['text/css', 'text/xsl'].freeze
39
+
40
+ def initialize(href:, type:, media: 'all')
41
+ raise ArgumentError, 'stylesheet.href must be a String' unless href.is_a?(String)
42
+ raise ArgumentError, 'stylesheet.type invalid' unless TYPES.include?(type)
43
+ raise ArgumentError, 'stylesheet.media must be a String' unless media.is_a?(String)
44
+
45
+ @href = href
46
+ @type = type
47
+ @media = media
48
+ end
49
+ attr_reader :href, :type, :media
35
50
  end
36
51
  end
37
52
  end
@@ -44,6 +44,7 @@ module Html2rss
44
44
  #
45
45
  # @param time_zone [String]
46
46
  # @param default_time_zone [String]
47
+ # @yield block to execute with the given time zone
47
48
  # @return [Object] whatever the given block returns
48
49
  def self.use_zone(time_zone, default_time_zone: Time.now.getlocal.zone)
49
50
  raise ArgumentError, 'a block is required' unless block_given?
@@ -74,6 +75,11 @@ module Html2rss
74
75
  # @param headers [Hash] additional HTTP request headers to use for the request
75
76
  # @return [Faraday::Response] the HTTP response
76
77
  def self.request_url(url, headers: {})
78
+ url = Addressable::URI.parse(url.to_s) unless url.is_a?(Addressable::URI)
79
+
80
+ raise ArgumentError, 'URL must be absolute' unless url.absolute?
81
+ raise ArgumentError, 'URL must not contain an @ characater' if url.to_s.include?('@')
82
+
77
83
  Faraday.new(url:, headers:) do |faraday|
78
84
  faraday.use Faraday::FollowRedirects::Middleware
79
85
  faraday.adapter Faraday.default_adapter
@@ -3,6 +3,6 @@
3
3
  ##
4
4
  # The Html2rss namespace.
5
5
  module Html2rss
6
- VERSION = '0.13.0'
6
+ VERSION = '0.14.0'
7
7
  public_constant :VERSION
8
8
  end
data/lib/html2rss.rb CHANGED
@@ -43,7 +43,7 @@ module Html2rss
43
43
  # @param params [Hash] Dynamic parameters for the feed configuration.
44
44
  # @return [RSS::Rss] RSS object generated from the configuration.
45
45
  def self.feed_from_yaml_config(file, name = nil, global_config: {}, params: {})
46
- yaml = load_yaml(file)
46
+ yaml = YAML.safe_load_file(file, symbolize_names: true)
47
47
  feeds = yaml[CONFIG_KEY_FEEDS] || {}
48
48
 
49
49
  feed_config = find_feed_config(yaml, feeds, name, global_config)
@@ -73,15 +73,6 @@ module Html2rss
73
73
  RssBuilder.build(config)
74
74
  end
75
75
 
76
- ##
77
- # Loads and parses the YAML file.
78
- #
79
- # @param file [String] Path to the YAML file.
80
- # @return [Hash] Parsed YAML content.
81
- def self.load_yaml(file)
82
- YAML.safe_load_file(file, symbolize_names: true)
83
- end
84
-
85
76
  ##
86
77
  # Builds the feed configuration based on the provided parameters.
87
78
  #
@@ -109,8 +100,12 @@ module Html2rss
109
100
  # @param url [String] the URL to automatically source the feed from
110
101
  # @return [RSS::Rss]
111
102
  def self.auto_source(url)
112
- Html2rss::AutoSource.new(url).build
103
+ url = Addressable::URI.parse(url)
104
+
105
+ response = Html2rss::Utils.request_url(url)
106
+
107
+ Html2rss::AutoSource.new(url, body: response.body, headers: response.headers).build
113
108
  end
114
109
 
115
- private_class_method :load_yaml, :find_feed_config
110
+ private_class_method :find_feed_config
116
111
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.13.0
4
+ version: 0.14.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-08-16 00:00:00.000000000 Z
11
+ date: 2024-10-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -279,7 +279,7 @@ licenses:
279
279
  - MIT
280
280
  metadata:
281
281
  allowed_push_host: https://rubygems.org
282
- changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.13.0
282
+ changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.14.0
283
283
  rubygems_mfa_required: 'true'
284
284
  post_install_message:
285
285
  rdoc_options: []
@@ -296,7 +296,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
296
296
  - !ruby/object:Gem::Version
297
297
  version: '0'
298
298
  requirements: []
299
- rubygems_version: 3.5.11
299
+ rubygems_version: 3.5.16
300
300
  signing_key:
301
301
  specification_version: 4
302
302
  summary: Generates RSS feeds from websites by scraping a URL and using CSS selectors