html2rss 0.11.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. checksums.yaml +4 -4
  2. data/README.md +38 -10
  3. data/html2rss.gemspec +1 -0
  4. data/lib/html2rss/attribute_post_processors/base.rb +74 -0
  5. data/lib/html2rss/attribute_post_processors/gsub.rb +17 -17
  6. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +6 -7
  7. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +5 -9
  8. data/lib/html2rss/attribute_post_processors/parse_time.rb +10 -10
  9. data/lib/html2rss/attribute_post_processors/parse_uri.rb +12 -9
  10. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +17 -8
  11. data/lib/html2rss/attribute_post_processors/substring.rb +30 -10
  12. data/lib/html2rss/attribute_post_processors/template.rb +19 -11
  13. data/lib/html2rss/attribute_post_processors.rb +8 -0
  14. data/lib/html2rss/auto_source/article.rb +95 -0
  15. data/lib/html2rss/auto_source/channel.rb +79 -0
  16. data/lib/html2rss/auto_source/cleanup.rb +76 -0
  17. data/lib/html2rss/auto_source/reducer.rb +48 -0
  18. data/lib/html2rss/auto_source/rss_builder.rb +68 -0
  19. data/lib/html2rss/auto_source/scraper/schema/base.rb +61 -0
  20. data/lib/html2rss/auto_source/scraper/schema.rb +122 -0
  21. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +123 -0
  22. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +54 -0
  23. data/lib/html2rss/auto_source/scraper/semantic_html.rb +118 -0
  24. data/lib/html2rss/auto_source/scraper.rb +33 -0
  25. data/lib/html2rss/auto_source.rb +77 -0
  26. data/lib/html2rss/cli.rb +10 -0
  27. data/lib/html2rss/config/channel.rb +4 -2
  28. data/lib/html2rss/config/selectors.rb +13 -2
  29. data/lib/html2rss/item.rb +8 -2
  30. data/lib/html2rss/utils.rb +5 -10
  31. data/lib/html2rss/version.rb +1 -1
  32. data/lib/html2rss.rb +21 -0
  33. metadata +30 -3
data/lib/html2rss/auto_source/channel.rb
@@ -0,0 +1,79 @@
+ # frozen_string_literal: true
+
+ module Html2rss
+   class AutoSource
+     ##
+     # Extracts channel information from
+     # 1. the HTML document's <head>.
+     # 2. the HTTP response.
+     class Channel
+       ##
+       # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document.
+       # @param url [Addressable::URI] The URL of the HTML document.
+       # @param response [Faraday::Response] The HTTP response.
+       def initialize(parsed_body, url:, response:, articles: [])
+         @parsed_body = parsed_body
+         @url = url
+         @response = response
+         @articles = articles
+       end
+
+       def url = extract_url
+       def title = extract_title
+       def language = extract_language
+       def description = extract_description
+       def image = extract_image
+       def ttl = extract_ttl
+       def last_build_date = response.headers['last-modified']
+
+       def generator
+         "html2rss V. #{::Html2rss::VERSION} (using auto_source scrapers: #{scraper_counts})"
+       end
+
+       private
+
+       attr_reader :parsed_body, :response
+
+       def extract_url
+         @url.normalize.to_s
+       end
+
+       def extract_title
+         parsed_body.at_css('head > title')&.text
+       end
+
+       def extract_language
+         return parsed_body['lang'] if parsed_body.name == 'html' && parsed_body['lang']
+
+         parsed_body.at_css('[lang]')&.[]('lang')
+       end
+
+       def extract_description
+         parsed_body.at_css('meta[name="description"]')&.[]('content') || ''
+       end
+
+       def extract_image
+         url = parsed_body.at_css('meta[property="og:image"]')&.[]('content')
+         Html2rss::Utils.sanitize_url(url) if url
+       end
+
+       def extract_ttl
+         ttl = response.headers['cache-control']&.match(/max-age=(\d+)/)&.[](1)
+         return unless ttl
+
+         ttl.to_i.fdiv(60).ceil
+       end
+
+       def scraper_counts
+         scraper_counts = +''
+
+         @articles.each_with_object(Hash.new(0)) { |article, counts| counts[article.scraper] += 1 }
+                  .each do |klass, count|
+           scraper_counts.concat("[#{klass.to_s.gsub('Html2rss::AutoSource::Scraper::', '')}=#{count}]")
+         end
+
+         scraper_counts
+       end
+     end
+   end
+ end
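
Usage sketch (not part of the diff): driving the new Channel class directly. The Addressable::URI and Faraday objects are assumptions based on the @param tags and the `@url.normalize` call above; the AutoSource class presumably wires this up itself.

require 'html2rss'
require 'faraday'
require 'nokogiri'
require 'addressable/uri'

url      = Addressable::URI.parse('https://example.com/news')
response = Faraday.get(url.to_s) # supplies the 'last-modified' and 'cache-control' headers read above

channel = Html2rss::AutoSource::Channel.new(Nokogiri::HTML(response.body), url: url, response: response)
channel.title     # => content of <head><title>
channel.ttl       # => Cache-Control max-age converted to minutes (rounded up), or nil
channel.generator # => "html2rss V. ..." including per-scraper counts of the passed articles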
data/lib/html2rss/auto_source/cleanup.rb
@@ -0,0 +1,76 @@
+ # frozen_string_literal: true
+
+ module Html2rss
+   class AutoSource
+     ##
+     # Cleanup is responsible for cleaning up the extracted articles.
+     # It applies various strategies to filter and refine the article list.
+     # :reek:MissingSafeMethod { enabled: false }
+     class Cleanup
+       class << self
+         def call(articles, url:, keep_different_domain: false)
+           Log.debug "Cleanup: start with #{articles.size} articles"
+
+           articles.select!(&:valid?)
+
+           remove_short!(articles, :title)
+
+           deduplicate_by!(articles, :url)
+           deduplicate_by!(articles, :title)
+
+           keep_only_http_urls!(articles)
+           reject_different_domain!(articles, url) unless keep_different_domain
+
+           Log.debug "Cleanup: end with #{articles.size} articles"
+           articles
+         end
+
+         private
+
+         ##
+         # Removes articles with short values for a given key.
+         #
+         # @param articles [Array<Article>] The list of articles to process.
+         # @param key [Symbol] The key to check for short values.
+         # @param min_words [Integer] The minimum number of words required.
+         def remove_short!(articles, key = :title, min_words: 2)
+           articles.reject! do |article|
+             value = article.public_send(key)
+             value.nil? || value.to_s.split.size < min_words
+           end
+         end
+
+         ##
+         # Deduplicates articles by a given key.
+         #
+         # @param articles [Array<Article>] The list of articles to process.
+         # @param key [Symbol] The key to deduplicate by.
+         def deduplicate_by!(articles, key)
+           seen = {}
+           articles.reject! do |article|
+             value = article.public_send(key)
+             value.nil? || seen.key?(value).tap { seen[value] = true }
+           end
+         end
+
+         ##
+         # Keeps only articles with HTTP or HTTPS URLs.
+         #
+         # @param articles [Array<Article>] The list of articles to process.
+         def keep_only_http_urls!(articles)
+           articles.select! { |article| %w[http https].include?(article.url&.scheme) }
+         end
+
+         ##
+         # Rejects articles that have a URL not on the same domain as the source.
+         #
+         # @param articles [Array<Article>] The list of articles to process.
+         # @param base_url [Addressable::URI] The source URL to compare against.
+         def reject_different_domain!(articles, base_url)
+           base_host = base_url.host
+           articles.select! { |article| article.url&.host == base_host }
+         end
+       end
+     end
+   end
+ end
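
Illustration (not part of the diff): the filtering rules above, exercised with a hypothetical stand-in for Html2rss::AutoSource::Article; the real class also responds to #valid?, #title and #url.

require 'html2rss'
require 'addressable/uri'

# FakeArticle is a made-up stand-in, only for this sketch.
FakeArticle = Struct.new(:title, :url, keyword_init: true) do
  def valid? = !title.nil?
end

articles = [
  FakeArticle.new(title: 'Two words',  url: Addressable::URI.parse('https://example.com/a')),
  FakeArticle.new(title: 'Short',      url: Addressable::URI.parse('https://example.com/b')),  # dropped: fewer than 2 words
  FakeArticle.new(title: 'Other host', url: Addressable::URI.parse('https://elsewhere.org/c')) # dropped: different domain
]

Html2rss::AutoSource::Cleanup.call(articles, url: Addressable::URI.parse('https://example.com'))
# => only the first article remains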
data/lib/html2rss/auto_source/reducer.rb
@@ -0,0 +1,48 @@
+ # frozen_string_literal: true
+
+ module Html2rss
+   class AutoSource
+     ##
+     # Reducer is responsible for reducing the list of articles.
+     # It keeps only the longest attributes of articles with the same URL.
+     # It also filters out invalid articles.
+     class Reducer
+       class << self
+         def call(articles, **_options)
+           Log.debug "Reducer: inited with #{articles.size} articles"
+
+           reduce_by_keeping_longest_values(articles, keep: [:scraper]) { |article| article.url&.path }
+         end
+
+         private
+
+         # @param articles [Array<Article>]
+         # @return [Array<Article>] reduced articles
+         def reduce_by_keeping_longest_values(articles, keep:, &)
+           grouped_by_block = articles.group_by(&)
+           grouped_by_block.each_with_object([]) do |(_key, grouped_articles), result|
+             memo_object = {}
+             grouped_articles.each do |article_hash|
+               keep_longest_values(memo_object, article_hash, keep:)
+             end
+
+             result << Article.new(**memo_object)
+           end
+         end
+
+         def keep_longest_values(memo_object, article_hash, keep:)
+           article_hash.each do |key, value|
+             next if value.eql?(memo_object[key])
+
+             if keep.include?(key)
+               memo_object[key] ||= []
+               memo_object[key] << value
+             elsif value && value.to_s.size > memo_object[key].to_s.size
+               memo_object[key] = value
+             end
+           end
+         end
+       end
+     end
+   end
+ end
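
Illustration (not part of the diff): a standalone sketch of the merge policy described above, using plain hashes. `merge_keeping_longest` is a hypothetical helper, not html2rss API: per key the longest value wins, while keys listed in keep: accumulate into an array.

def merge_keeping_longest(hashes, keep: [:scraper])
  hashes.each_with_object({}) do |hash, memo|
    hash.each do |key, value|
      if keep.include?(key)
        (memo[key] ||= []) << value
      elsif value.to_s.size > memo[key].to_s.size
        memo[key] = value
      end
    end
  end
end

merge_keeping_longest([
  { title: 'A post',                 description: 'short', scraper: :schema },
  { title: 'A post about something', description: '',      scraper: :semantic_html }
])
# => { title: 'A post about something', description: 'short', scraper: [:schema, :semantic_html] }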
data/lib/html2rss/auto_source/rss_builder.rb
@@ -0,0 +1,68 @@
+ # frozen_string_literal: true
+
+ require 'rss'
+
+ module Html2rss
+   class AutoSource
+     ##
+     # Converts the autosourced channel and articles to an RSS feed.
+     class RssBuilder
+       def self.add_guid(article, maker)
+         maker.guid.tap do |guid|
+           guid.content = article.guid
+           guid.isPermaLink = false
+         end
+       end
+
+       def self.add_image(article, maker)
+         url = article.image || return
+
+         maker.enclosure.tap do |enclosure|
+           enclosure.url = url
+           enclosure.type = Html2rss::Utils.guess_content_type_from_url(url)
+           enclosure.length = 0
+         end
+       end
+
+       def initialize(channel:, articles:)
+         @channel = channel
+         @articles = articles
+       end
+
+       def call
+         RSS::Maker.make('2.0') do |maker|
+           make_channel(maker.channel)
+           make_items(maker)
+         end
+       end
+
+       private
+
+       attr_reader :channel, :articles
+
+       def make_channel(maker)
+         %i[language title description ttl].each do |key|
+           maker.public_send(:"#{key}=", channel.public_send(key))
+         end
+
+         maker.link = channel.url
+         maker.generator = channel.generator
+         maker.updated = channel.last_build_date
+       end
+
+       def make_items(maker)
+         articles.each do |article|
+           maker.items.new_item do |item_maker|
+             RssBuilder.add_guid(article, item_maker)
+             RssBuilder.add_image(article, item_maker)
+
+             item_maker.title = article.title
+             item_maker.description = article.description
+             item_maker.pubDate = article.published_at
+             item_maker.link = article.url
+           end
+         end
+       end
+     end
+   end
+ end
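
Usage sketch (not part of the diff), continuing the Channel example above; AutoSource#call presumably wires the channel and the scraped articles together itself.

rss = Html2rss::AutoSource::RssBuilder.new(channel: channel, articles: []).call
puts rss.to_s # RSS 2.0 XML produced via RSS::Maker.make('2.0')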
data/lib/html2rss/auto_source/scraper/schema/base.rb
@@ -0,0 +1,61 @@
+ # frozen_string_literal: true
+
+ require 'date'
+
+ module Html2rss
+   class AutoSource
+     module Scraper
+       class Schema
+         ##
+         # Base class for Schema.org schema_objects.
+         #
+         # @see https://schema.org/Article
+         class Base
+           DEFAULT_ATTRIBUTES = %i[id title description url image published_at].freeze
+
+           def initialize(schema_object, url:)
+             @schema_object = schema_object
+             @url = url
+           end
+
+           # @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
+           def call
+             DEFAULT_ATTRIBUTES.to_h do |attribute|
+               [attribute, public_send(attribute)]
+             end
+           end
+
+           def id = schema_object[:@id] || url&.path || title.to_s.downcase.gsub(/\s+/, '-')
+           def title = schema_object[:title]
+
+           def description
+             [schema_object[:description], schema_object[:schema_object_body], schema_object[:abstract]]
+               .max_by { |desc| desc.to_s.size }
+           end
+
+           # @return [Addressable::URI, nil] the URL of the schema object
+           def url
+             url = schema_object[:url]
+             if url.to_s.empty?
+               Log.debug("Schema#Base.url: no url in schema_object: #{schema_object.inspect}")
+               return
+             end
+
+             Utils.build_absolute_url_from_relative(url, @url)
+           end
+
+           def image = images.first || nil
+           def published_at = schema_object[:datePublished]
+
+           private
+
+           attr_reader :schema_object
+
+           def images
+             Array(schema_object[:image]).compact
+           end
+         end
+       end
+     end
+   end
+ end
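
Illustration (not part of the diff): feeding a parsed JSON-LD object straight into Base. Keys are symbols, matching parse_script_tag's symbolize_names: true in schema.rb below; resolving the relative URL relies on Utils.build_absolute_url_from_relative as used above.

require 'html2rss'
require 'addressable/uri'

schema_object = { :@type => 'NewsArticle', title: 'Hello', url: '/hello',
                  description: 'Greeting', datePublished: '2024-06-01' }

Html2rss::AutoSource::Scraper::Schema::Base.new(
  schema_object, url: Addressable::URI.parse('https://example.com')
).call
# => { id: '/hello', title: 'Hello', description: 'Greeting',
#      url: <absolute URL for /hello>, image: nil, published_at: '2024-06-01' }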
data/lib/html2rss/auto_source/scraper/schema.rb
@@ -0,0 +1,122 @@
+ # frozen_string_literal: true
+
+ module Html2rss
+   class AutoSource
+     module Scraper
+       ##
+       # Scrapes articles from Schema.org objects, by looking for the objects in:
+       #
+       # 1. <script type="application/ld+json"> "schema" tag.
+       # 2. tbd
+       #
+       # See:
+       # 1. https://schema.org/NewsArticle
+       # 2. https://developers.google.com/search/docs/appearance/structured-data/article#microdata
+       class Schema
+         include Enumerable
+
+         TAG_SELECTOR = 'script[type="application/ld+json"]'
+         SCHEMA_OBJECT_TYPES = %w[
+           AdvertiserContentArticle
+           AnalysisNewsArticle
+           APIReference
+           Article
+           AskPublicNewsArticle
+           BackgroundNewsArticle
+           BlogPosting
+           DiscussionForumPosting
+           LiveBlogPosting
+           NewsArticle
+           OpinionNewsArticle
+           Report
+           ReportageNewsArticle
+           ReviewNewsArticle
+           SatiricalArticle
+           ScholarlyArticle
+           SocialMediaPosting
+           TechArticle
+         ].to_set.freeze
+
+         class << self
+           def articles?(parsed_body)
+             parsed_body.css(TAG_SELECTOR).any? do |script|
+               SCHEMA_OBJECT_TYPES.any? { |type| script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/) }
+             end
+           end
+
+           ##
+           # Returns a flat array
+           # of all supported schema objects
+           # by recursively traversing the `from` object.
+           #
+           # @param object [Hash, Array]
+           # @return [Array<Hash>] the schema_objects, or an empty array
+           # :reek:DuplicateMethodCall
+           def from(object)
+             case object
+             when Nokogiri::XML::Element
+               from(parse_script_tag(object))
+             when Hash
+               supported_schema_object?(object) ? [object] : object.values.flat_map { |item| from(item) }
+             when Array
+               object.flat_map { |item| from(item) }
+             else
+               []
+             end
+           end
+
+           def supported_schema_object?(object)
+             scraper_for_schema_object(object) ? true : false
+           end
+
+           ##
+           # @return [Scraper::Schema::Base, Scraper::Schema::NewsArticle, nil]
+           def scraper_for_schema_object(schema_object)
+             if SCHEMA_OBJECT_TYPES.member?(schema_object[:@type])
+               Base
+             else
+               Log.warn("Schema#scraper_for_schema_object: Unsupported schema object @type: #{schema_object[:@type]}")
+               nil
+             end
+           end
+
+           private
+
+           def parse_script_tag(script_tag)
+             JSON.parse(script_tag.text, symbolize_names: true)
+           rescue JSON::ParserError => error
+             Log.warn('Schema#schema_objects: Failed to parse JSON', error: error.message)
+             []
+           end
+         end
+
+         def initialize(parsed_body, url:)
+           @parsed_body = parsed_body
+           @url = url
+         end
+
+         ##
+         # @yield [Hash] Each scraped article_hash
+         # @return [Array<Hash>] the scraped article_hashes
+         def each(&)
+           schema_objects.filter_map do |schema_object|
+             next unless (klass = self.class.scraper_for_schema_object(schema_object))
+             next unless (article_hash = klass.new(schema_object, url:).call)
+
+             yield article_hash
+           end
+         end
+
+         private
+
+         def schema_objects
+           @parsed_body.css(TAG_SELECTOR).flat_map do |tag|
+             Schema.from(tag)
+           end
+         end
+
+         attr_reader :parsed_body, :url
+       end
+     end
+   end
+ end
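
Usage sketch (not part of the diff): detecting and iterating schema.org articles in a parsed page.

require 'html2rss'
require 'nokogiri'
require 'addressable/uri'

parsed_body = Nokogiri::HTML(<<~HTML)
  <script type="application/ld+json">
    {"@type": "NewsArticle", "title": "Hello", "url": "https://example.com/hello"}
  </script>
HTML

Html2rss::AutoSource::Scraper::Schema.articles?(parsed_body) # => true

scraper = Html2rss::AutoSource::Scraper::Schema.new(parsed_body,
                                                    url: Addressable::URI.parse('https://example.com'))
scraper.each { |article_hash| puts article_hash[:title] } # prints "Hello"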
data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb
@@ -0,0 +1,123 @@
+ # frozen_string_literal: true
+
+ require 'set'
+
+ module Html2rss
+   class AutoSource
+     module Scraper
+       class SemanticHtml
+         ##
+         # ArticleExtractor is responsible for extracting the details of an article.
+         # It focuses on finding a headline first, and from it traverses the DOM
+         # upwards as far as possible to find the other details.
+         class Extractor
+           INVISIBLE_CONTENT_TAG_SELECTORS = %w[svg script noscript style template].to_set.freeze
+           HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze
+           NOT_HEADLINE_SELECTOR = (HEADING_TAGS.map { |selector| ":not(#{selector})" } +
+                                    INVISIBLE_CONTENT_TAG_SELECTORS.to_a).freeze
+
+           def self.visible_text_from_tag(tag, separator: ' ')
+             text = if (children = tag.children).empty?
+                      tag.text.strip
+                    else
+                      children.filter_map do |child|
+                        next if INVISIBLE_CONTENT_TAG_SELECTORS.include?(child.name)
+
+                        visible_text_from_tag(child)
+                      end.join(separator)
+                    end
+
+             return if (sanitized_text = text.gsub(/\s+/, ' ').strip).empty?
+
+             sanitized_text
+           end
+
+           def initialize(article_tag, url:)
+             @article_tag = article_tag
+             @url = url
+             @heading = find_heading
+             @extract_url = find_url
+           end
+
+           # @return [Hash, nil] The scraped article or nil.
+           def call
+             return unless heading
+
+             {
+               title: extract_title,
+               url: extract_url,
+               image: extract_image,
+               description: extract_description,
+               id: generate_id,
+               published_at: extract_published_at
+             }
+           end
+
+           private
+
+           attr_reader :article_tag, :url, :heading, :extract_url
+
+           def visible_text_from_tag(tag, separator: ' ') = self.class.visible_text_from_tag(tag, separator:)
+
+           # @see https://developer.mozilla.org/en-US/docs/Web/API/HTMLTimeElement/dateTime
+           def extract_published_at
+             times = article_tag.css('time[datetime]')
+                                .filter_map do |tag|
+               DateTime.parse(tag['datetime'])
+             rescue ArgumentError, TypeError
+               nil
+             end
+
+             times.min
+           end
+
+           def find_heading
+             heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
+             smallest_heading = heading_tags.keys.min
+             heading_tags[smallest_heading]&.max_by { |tag| tag.text.size }
+           end
+
+           def extract_title
+             @extract_title ||= if heading.children.empty? && heading.text
+                                  visible_text_from_tag(heading)
+                                else
+                                  visible_text_from_tag(
+                                    article_tag.css(HEADING_TAGS.join(','))
+                                               .max_by { |tag| tag.text.size }
+                                  )
+                                end
+           end
+
+           def extract_description
+             text = visible_text_from_tag(article_tag.css(NOT_HEADLINE_SELECTOR), separator: '<br>')
+             return text if text
+
+             description = visible_text_from_tag(article_tag)
+             return nil unless description
+
+             title_text = extract_title
+             description.gsub!(title_text, '') if title_text
+             description.strip!
+             description.empty? ? nil : description
+           end
+
+           def find_url
+             closest_anchor = SemanticHtml.find_closest_selector(heading || article_tag,
+                                                                 selector: 'a[href]:not([href=""])')
+             href = closest_anchor&.[]('href')&.split('#')&.first&.strip
+             Utils.build_absolute_url_from_relative(href, url) unless href.to_s.empty?
+           end
+
+           def extract_image
+             Image.call(article_tag, url:)
+           end
+
+           def generate_id
+             [article_tag['id'], article_tag.at_css('[id]')&.attr('id'),
+              extract_url&.path].compact.reject(&:empty?).first
+           end
+         end
+       end
+     end
+   end
+ end
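
Illustration (not part of the diff): the visible-text helper above skips invisible tags (svg, script, noscript, style, template) and collapses whitespace.

require 'html2rss'
require 'nokogiri'

tag = Nokogiri::HTML.fragment('<p>Hello <script>ignored()</script> <b>world</b></p>').at_css('p')
Html2rss::AutoSource::Scraper::SemanticHtml::Extractor.visible_text_from_tag(tag)
# => "Hello world"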
data/lib/html2rss/auto_source/scraper/semantic_html/image.rb
@@ -0,0 +1,54 @@
+ # frozen_string_literal: true
+
+ module Html2rss
+   class AutoSource
+     module Scraper
+       class SemanticHtml
+         ##
+         # Image is responsible for extracting image URLs from the article_tag.
+         class Image
+           def self.call(article_tag, url:)
+             img_src = from_source(article_tag) ||
+                       from_img(article_tag) ||
+                       from_style(article_tag)
+
+             Utils.build_absolute_url_from_relative(img_src, url) if img_src
+           end
+
+           def self.from_img(article_tag)
+             article_tag.at_css('img[src]:not([src^="data"])')&.[]('src')
+           end
+
+           ##
+           # Extracts the largest image source from the srcset attribute
+           # of an img tag or a source tag inside a picture tag.
+           #
+           # @see <https://developer.mozilla.org/en-US/docs/Learn/HTML/Multimedia_and_embedding/Responsive_images>
+           # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img#srcset>
+           # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/picture>
+           def self.from_source(article_tag) # rubocop:disable Metrics/AbcSize
+             hash = article_tag.css('img[srcset], picture > source[srcset]')
+                               .flat_map { |source| source['srcset'].to_s.split(',') }
+                               .filter_map do |line|
+               width, url = line.split.reverse
+               next if url.nil? || url.start_with?('data:')
+
+               width_value = width.to_i.zero? ? 0 : width.scan(/\d+/).first.to_i
+
+               [width_value, url.strip]
+             end.to_h
+
+             hash[hash.keys.max]
+           end
+
+           def self.from_style(article_tag)
+             article_tag.css('[style*="url"]')
+                        .map { |tag| tag['style'][/url\(['"]?(.*?)['"]?\)/, 1] }
+                        .reject { |src| !src || src.start_with?('data:') }
+                        .max_by(&:size)
+           end
+         end
+       end
+     end
+   end
+ end
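
Illustration (not part of the diff): from_source picks the srcset candidate with the largest width, and Image.call resolves it against the page URL.

require 'html2rss'
require 'nokogiri'
require 'addressable/uri'

article_tag = Nokogiri::HTML.fragment(<<~HTML)
  <article>
    <img src="/img/s.jpg" srcset="/img/s.jpg 480w, /img/l.jpg 1024w">
  </article>
HTML

Html2rss::AutoSource::Scraper::SemanticHtml::Image.call(
  article_tag, url: Addressable::URI.parse('https://example.com')
)
# => absolute URL for /img/l.jpg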