html2rss 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7a2bf557dd65533533e07b4581e195f2d2b32ff906831526a4d7aed27a558d71
4
- data.tar.gz: f42e5f03649a08219d310a2545413c371f851530c4d323fd68ef783b4b3b5e13
3
+ metadata.gz: a2ce9bbe8640372b5e98672760d76aee5f6f23373dd4b22ca067d2cdaa6f2b15
4
+ data.tar.gz: ff280d9466ee6b15b1149f582dadf9b209f0e99e4fb02e6b82f91b25a7ca0b7a
5
5
  SHA512:
6
- metadata.gz: 724a1fa8ab15ae140278eb9b055f22e7aad12e94627795f7a2f13c78f5421607e39d6ba040821b4c47b69f963cc0180bf8e964ff0b896403cb6305ed1d67dbb5
7
- data.tar.gz: a06c2e16b0b51c6b6d2184430efc2a4e8b2812fee413163aa2991567e7608141f1c18189fdded58c8c3383940c4790478cd631abc6a1470ad648b2030fdefaab
6
+ metadata.gz: d516b897253374425ccd3b26d21362df46c18c26694fe1a8aaddc06b956f93e36111d3310dc635b84d7626a2072014d27dacb075cd98d15a79d39aed40991bcb
7
+ data.tar.gz: 91ae4190d04967c1bc9d3f46b0a0bdbbd23f3dbda6559c3ff10391ab0d63d4b984a44789c8427dbace345af617a33e86c2f441c0a0d58dbcf2b734bd78b73b87
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/html2rss.svg)](http://rubygems.org/gems/html2rss/) [![Yard Docs](http://img.shields.io/badge/yard-docs-blue.svg)](https://www.rubydoc.info/gems/html2rss) ![Retro Badge: valid RSS](https://validator.w3.org/feed/images/valid-rss-rogers.png)
4
4
 
5
- `html2rss` is a Ruby gem that generates RSS 2.0 feeds from a _feed config_.
5
+ `html2rss` is a Ruby gem that automatically generates RSS 2.0 feeds from websites and, as a fallback, from a _feed config_.
6
6
 
7
7
  With the _feed config_, you provide a URL to scrape and CSS selectors for extracting information (like title, URL, etc.). The gem builds the RSS feed accordingly. [Extractors](#using-extractors) and chainable [post processors](#using-post-processors) make information extraction, processing, and sanitizing a breeze. The gem also supports [scraping JSON](#scraping-and-handling-json-responses) responses and [setting HTTP request headers](#set-any-http-header-in-the-request).
8
8
 
@@ -26,9 +26,9 @@ You can also install it as a dependency in your Ruby project:
26
26
 
27
27
  ## Generating a feed on the CLI
28
28
 
29
- ### using automatic scraping
29
+ ### using automatic generation
30
30
 
31
- html2rss offers an automatic scrapting feature. Try it with:
31
+ html2rss offers an automatic RSS generation feature. Try it with:
32
32
 
33
33
  `html2rss auto https://unmatchedstyle.com/`
34
34
 
@@ -10,21 +10,27 @@ module Html2rss
10
10
  ##
11
11
  #
12
12
  # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document.
13
- # @param response [Faraday::Response] The URL of the HTML document.
14
- def initialize(parsed_body, url:, response:, articles: [])
13
+ # @param url [Addressable::URI] The URL of the channel.
14
+ # @param headers [Hash<String, String>] The HTTP response headers.
15
+ # @param articles [Array<Html2rss::AutoSource::Article>] The articles.
16
+ def initialize(parsed_body, url:, headers:, articles: [], stylesheets: [])
15
17
  @parsed_body = parsed_body
16
18
  @url = url
17
- @response = response
19
+ @headers = headers
18
20
  @articles = articles
21
+ @stylesheets = stylesheets
19
22
  end
20
23
 
24
+ attr_writer :articles
25
+ attr_reader :stylesheets
26
+
21
27
  def url = extract_url
22
28
  def title = extract_title
23
29
  def language = extract_language
24
30
  def description = extract_description
25
31
  def image = extract_image
26
32
  def ttl = extract_ttl
27
- def last_build_date = response.headers['last-modified']
33
+ def last_build_date = headers['last-modified']
28
34
 
29
35
  def generator
30
36
  "html2rss V. #{::Html2rss::VERSION} (using auto_source scrapers: #{scraper_counts})"
@@ -32,7 +38,7 @@ module Html2rss
32
38
 
33
39
  private
34
40
 
35
- attr_reader :parsed_body, :response
41
+ attr_reader :parsed_body, :headers
36
42
 
37
43
  def extract_url
38
44
  @url.normalize.to_s
@@ -58,7 +64,7 @@ module Html2rss
58
64
  end
59
65
 
60
66
  def extract_ttl
61
- ttl = response.headers['cache-control']&.match(/max-age=(\d+)/)&.[](1)
67
+ ttl = headers['cache-control']&.match(/max-age=(\d+)/)&.[](1)
62
68
  return unless ttl
63
69
 
64
70
  ttl.to_i.fdiv(60).ceil
@@ -31,6 +31,8 @@ module Html2rss
31
31
 
32
32
  def call
33
33
  RSS::Maker.make('2.0') do |maker|
34
+ Html2rss::RssBuilder::Stylesheet.add(maker, channel.stylesheets)
35
+
34
36
  make_channel(maker.channel)
35
37
  make_items(maker)
36
38
  end
@@ -1,5 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'json'
4
+ require 'nokogiri'
5
+ require 'set'
6
+
3
7
  module Html2rss
4
8
  class AutoSource
5
9
  module Scraper
@@ -99,6 +103,8 @@ module Html2rss
99
103
  # @yield [Hash] Each scraped article_hash
100
104
  # @return [Array<Hash>] the scraped article_hashes
101
105
  def each(&)
106
+ return enum_for(:each) unless block_given?
107
+
102
108
  schema_objects.filter_map do |schema_object|
103
109
  next unless (klass = self.class.scraper_for_schema_object(schema_object))
104
110
  next unless (article_hash = klass.new(schema_object, url:).call)
@@ -74,7 +74,7 @@ module Html2rss
74
74
  def find_heading
75
75
  heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
76
76
  smallest_heading = heading_tags.keys.min
77
- heading_tags[smallest_heading]&.max_by { |tag| tag.text.size }
77
+ heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size }
78
78
  end
79
79
 
80
80
  def extract_title
@@ -16,16 +16,18 @@ module Html2rss
16
16
 
17
17
  SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
18
18
 
19
- def initialize(url)
20
- unless url.is_a?(String) || url.is_a?(Addressable::URI)
21
- raise ArgumentError,
22
- 'URL must be a String or Addressable::URI'
23
- end
24
-
25
- @url = Addressable::URI.parse(url)
26
-
27
- raise ArgumentError, 'URL must be absolute' unless @url.absolute?
28
- raise UnsupportedUrlScheme, "#{@url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(@url.scheme)
19
+ ##
20
+ # @param url [Addressable::URI] The URL to extract articles from.
21
+ # @param body [String] The body of the response.
22
+ # @param headers [Hash] The headers of the response.
23
+ def initialize(url, body:, headers: {})
24
+ raise ArgumentError, 'URL must be a Addressable::URI' unless url.is_a?(Addressable::URI)
25
+ raise ArgumentError, 'URL must be absolute' unless url.absolute?
26
+ raise UnsupportedUrlScheme, "#{url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(url.scheme)
27
+
28
+ @url = url
29
+ @body = body
30
+ @headers = headers
29
31
  end
30
32
 
31
33
  def build
@@ -34,6 +36,8 @@ module Html2rss
34
36
  Reducer.call(articles, url:)
35
37
  Cleanup.call(articles, url:, keep_different_domain: true)
36
38
 
39
+ channel.articles = articles
40
+
37
41
  Html2rss::AutoSource::RssBuilder.new(
38
42
  channel:,
39
43
  articles:
@@ -57,21 +61,20 @@ module Html2rss
57
61
  end
58
62
 
59
63
  def channel
60
- Channel.new(parsed_body, response:, url:, articles:)
64
+ @channel ||= Channel.new(parsed_body, headers: @headers, url:)
61
65
  end
62
66
 
63
67
  private
64
68
 
65
69
  attr_reader :url
66
70
 
67
- def response
68
- @response ||= Html2rss::Utils.request_url(url)
69
- end
70
-
71
- # Parses the HTML body of the response using Nokogiri.
72
71
  # @return [Nokogiri::HTML::Document]
73
72
  def parsed_body
74
- @parsed_body ||= Nokogiri.HTML(response.body).freeze
73
+ @parsed_body ||= Nokogiri.HTML(@body)
74
+ .tap do |doc|
75
+ # Remove comments from the document
76
+ doc.xpath('//comment()').each(&:remove)
77
+ end.freeze
75
78
  end
76
79
  end
77
80
  end
@@ -18,9 +18,6 @@ module Html2rss
18
18
  # Thrown when the feed config does not contain a value at `:channel`.
19
19
  class ChannelMissing < Html2rss::Error; end
20
20
 
21
- # Struct to store XML Stylesheet attributes
22
- Stylesheet = Struct.new(:href, :type, :media, keyword_init: true)
23
-
24
21
  def_delegator :@channel, :author
25
22
  def_delegator :@channel, :ttl
26
23
  def_delegator :@channel, :title
@@ -75,7 +72,7 @@ module Html2rss
75
72
  #
76
73
  # @return [Array<Html2rss::RssBuilder::Stylesheet>] Array of Stylesheet objects.
77
74
  def stylesheets
78
- @global.fetch(:stylesheets, []).map { |attributes| Stylesheet.new(attributes) }
75
+ @global.fetch(:stylesheets, []).map { |attributes| Html2rss::RssBuilder::Stylesheet.new(**attributes) }
79
76
  end
80
77
 
81
78
  # Provides read-only access to the channel object.
data/lib/html2rss/item.rb CHANGED
@@ -19,7 +19,7 @@ module Html2rss
19
19
  ##
20
20
  # Fetches items from a given URL using configuration settings.
21
21
  #
22
- # @param url [String] URL to fetch items from.
22
+ # @param url [Addressable::URI] URL to fetch items from.
23
23
  # @param config [Html2rss::Config] Configuration object.
24
24
  # @return [Array<Html2rss::Item>] list of items fetched.
25
25
  def self.from_url(url, config)
@@ -3,35 +3,50 @@
3
3
  module Html2rss
4
4
  module RssBuilder
5
5
  ##
6
- # Adds XML stylesheet tags (with the provided maker).
6
+ # Represents a stylesheet.
7
7
  class Stylesheet
8
- ##
9
- # Adds the stylesheet XML tags to the RSS.
10
- #
11
- # @param maker [RSS::Maker::RSS20] RSS maker object.
12
- # @param stylesheets [Array<Html2rss::Config::Stylesheet>] Array of stylesheet configurations.
13
- # @return [nil]
14
- def self.add(maker, stylesheets)
15
- stylesheets.each do |stylesheet|
16
- add_stylesheet(maker, stylesheet)
8
+ class << self
9
+ ##
10
+ # Adds the stylesheet XML tags to the RSS.
11
+ #
12
+ # @param maker [RSS::Maker::RSS20] RSS maker object.
13
+ # @param stylesheets [Array<Html2rss::RssBuilder::Stylesheet>] Array of stylesheets to add.
14
+ # @return [nil]
15
+ def add(maker, stylesheets)
16
+ stylesheets.each do |stylesheet|
17
+ add_stylesheet(maker, stylesheet)
18
+ end
17
19
  end
18
- end
19
20
 
20
- ##
21
- # Adds a single Stylesheet to the RSS.
22
- #
23
- # @param maker [RSS::Maker::RSS20] RSS maker object.
24
- # @param stylesheet [Html2rss::Config::Stylesheet] Stylesheet configuration.
25
- # @return [nil]
26
- def self.add_stylesheet(maker, stylesheet)
27
- maker.xml_stylesheets.new_xml_stylesheet do |xss|
28
- xss.href = stylesheet.href
29
- xss.type = stylesheet.type
30
- xss.media = stylesheet.media
21
+ private
22
+
23
+ ##
24
+ # Adds a single Stylesheet to the RSS.
25
+ #
26
+ # @param maker [RSS::Maker::RSS20] RSS maker object.
27
+ # @param stylesheet [Html2rss::RssBuilder::Stylesheet] The stylesheet to add.
28
+ # @return [nil]
29
+ def add_stylesheet(maker, stylesheet)
30
+ maker.xml_stylesheets.new_xml_stylesheet do |xss|
31
+ xss.href = stylesheet.href
32
+ xss.type = stylesheet.type
33
+ xss.media = stylesheet.media
34
+ end
31
35
  end
32
36
  end
33
37
 
34
- private_class_method :add_stylesheet
38
+ TYPES = ['text/css', 'text/xsl'].freeze
39
+
40
+ def initialize(href:, type:, media: 'all')
41
+ raise ArgumentError, 'stylesheet.href must be a String' unless href.is_a?(String)
42
+ raise ArgumentError, 'stylesheet.type invalid' unless TYPES.include?(type)
43
+ raise ArgumentError, 'stylesheet.media must be a String' unless media.is_a?(String)
44
+
45
+ @href = href
46
+ @type = type
47
+ @media = media
48
+ end
49
+ attr_reader :href, :type, :media
35
50
  end
36
51
  end
37
52
  end
@@ -44,6 +44,7 @@ module Html2rss
44
44
  #
45
45
  # @param time_zone [String]
46
46
  # @param default_time_zone [String]
47
+ # @yield block to execute with the given time zone
47
48
  # @return [Object] whatever the given block returns
48
49
  def self.use_zone(time_zone, default_time_zone: Time.now.getlocal.zone)
49
50
  raise ArgumentError, 'a block is required' unless block_given?
@@ -74,6 +75,11 @@ module Html2rss
74
75
  # @param headers [Hash] additional HTTP request headers to use for the request
75
76
  # @return [Faraday::Response] the HTTP response
76
77
  def self.request_url(url, headers: {})
78
+ url = Addressable::URI.parse(url.to_s) unless url.is_a?(Addressable::URI)
79
+
80
+ raise ArgumentError, 'URL must be absolute' unless url.absolute?
81
+ raise ArgumentError, 'URL must not contain an @ characater' if url.to_s.include?('@')
82
+
77
83
  Faraday.new(url:, headers:) do |faraday|
78
84
  faraday.use Faraday::FollowRedirects::Middleware
79
85
  faraday.adapter Faraday.default_adapter
@@ -3,6 +3,6 @@
3
3
  ##
4
4
  # The Html2rss namespace.
5
5
  module Html2rss
6
- VERSION = '0.13.0'
6
+ VERSION = '0.14.0'
7
7
  public_constant :VERSION
8
8
  end
data/lib/html2rss.rb CHANGED
@@ -43,7 +43,7 @@ module Html2rss
43
43
  # @param params [Hash] Dynamic parameters for the feed configuration.
44
44
  # @return [RSS::Rss] RSS object generated from the configuration.
45
45
  def self.feed_from_yaml_config(file, name = nil, global_config: {}, params: {})
46
- yaml = load_yaml(file)
46
+ yaml = YAML.safe_load_file(file, symbolize_names: true)
47
47
  feeds = yaml[CONFIG_KEY_FEEDS] || {}
48
48
 
49
49
  feed_config = find_feed_config(yaml, feeds, name, global_config)
@@ -73,15 +73,6 @@ module Html2rss
73
73
  RssBuilder.build(config)
74
74
  end
75
75
 
76
- ##
77
- # Loads and parses the YAML file.
78
- #
79
- # @param file [String] Path to the YAML file.
80
- # @return [Hash] Parsed YAML content.
81
- def self.load_yaml(file)
82
- YAML.safe_load_file(file, symbolize_names: true)
83
- end
84
-
85
76
  ##
86
77
  # Builds the feed configuration based on the provided parameters.
87
78
  #
@@ -109,8 +100,12 @@ module Html2rss
109
100
  # @param url [String] the URL to automatically source the feed from
110
101
  # @return [RSS::Rss]
111
102
  def self.auto_source(url)
112
- Html2rss::AutoSource.new(url).build
103
+ url = Addressable::URI.parse(url)
104
+
105
+ response = Html2rss::Utils.request_url(url)
106
+
107
+ Html2rss::AutoSource.new(url, body: response.body, headers: response.headers).build
113
108
  end
114
109
 
115
- private_class_method :load_yaml, :find_feed_config
110
+ private_class_method :find_feed_config
116
111
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.13.0
4
+ version: 0.14.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-08-16 00:00:00.000000000 Z
11
+ date: 2024-10-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -279,7 +279,7 @@ licenses:
279
279
  - MIT
280
280
  metadata:
281
281
  allowed_push_host: https://rubygems.org
282
- changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.13.0
282
+ changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.14.0
283
283
  rubygems_mfa_required: 'true'
284
284
  post_install_message:
285
285
  rdoc_options: []
@@ -296,7 +296,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
296
296
  - !ruby/object:Gem::Version
297
297
  version: '0'
298
298
  requirements: []
299
- rubygems_version: 3.5.11
299
+ rubygems_version: 3.5.16
300
300
  signing_key:
301
301
  specification_version: 4
302
302
  summary: Generates RSS feeds from websites by scraping a URL and using CSS selectors