html2rss 0.16.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3a33d49918c1e75268b3b314908305986dd863ccf31dbe1b6ace8202d3a652de
4
- data.tar.gz: 7121a463570c62ffdddb85b9c0d7ba098bdc784ad6649b6bb34b232125a9bf49
3
+ metadata.gz: bb6b3eb69655bdbb4511f74db9e1bcc766a98aa55d7afc2561a176c6973bda4f
4
+ data.tar.gz: 45193122489ba965b489c981696f71508030dc80f156e5b0f077932fc55caec3
5
5
  SHA512:
6
- metadata.gz: e77faa7bd81f63894f0001157a81e858a67eae502f5a5f3ecd41790f48c630b3ab94cd281531cb0a8cf0a7693fa03c7d6176edde04d063804f69f15d4b7469f3
7
- data.tar.gz: d84a132a76997336f840c7b5e9261ab304ecfb302628b10d839350fbe3de92919bdc3321b5f374231c703202560d7b392da51cfbcb66ae8e18debdcd69d9e3d0
6
+ metadata.gz: 294529cd8cb1d289e969f94c32757656d12c864a92c438fceb73ba9dbddd85cca822b146898cdeff928aeefcba75652b91c0a56ded66241bcf23014fea299196
7
+ data.tar.gz: a2b50e52a7f6ad7768a7092fcdf04c7000dd43d190ee12461f6773fe4b324e43be7b411c7aff69b3af1266aeca3fbb4678e5c43e3bac2b0c2a49d86589869b38
data/README.md CHANGED
@@ -681,7 +681,6 @@ To submit changes:
681
681
  ## Development Helpers
682
682
 
683
683
  1. `bin/setup`: installs dependencies and sets up the development environment.
684
- 2. `bin/guard`: automatically runs rspec, rubocop and reek when a file changes.
685
- 3. for a modern Ruby development experience: install [`ruby-lsp`](https://github.com/Shopify/ruby-lsp) and integrate it to your IDE.
684
+ 2. for a modern Ruby development experience: install [`ruby-lsp`](https://github.com/Shopify/ruby-lsp) and integrate it to your IDE.
686
685
 
687
686
  For example: [Ruby in Visual Studio Code](https://code.visualstudio.com/docs/languages/ruby).
data/html2rss.gemspec CHANGED
@@ -14,7 +14,7 @@ Gem::Specification.new do |spec|
14
14
  spec.description = 'Supports JSON content, custom HTTP headers, and post-processing of extracted content.'
15
15
  spec.homepage = 'https://github.com/html2rss/html2rss'
16
16
  spec.license = 'MIT'
17
- spec.required_ruby_version = '>= 3.1'
17
+ spec.required_ruby_version = '>= 3.2'
18
18
 
19
19
  if spec.respond_to?(:metadata)
20
20
  spec.metadata['allowed_push_host'] = 'https://rubygems.org'
@@ -46,5 +46,5 @@ Gem::Specification.new do |spec|
46
46
  spec.add_dependency 'sanitize', '~> 6.0'
47
47
  spec.add_dependency 'thor'
48
48
  spec.add_dependency 'tzinfo'
49
- spec.add_dependency 'zeitwerk', '~> 2.6.0'
49
+ spec.add_dependency 'zeitwerk'
50
50
  end
@@ -77,10 +77,17 @@ module Html2rss
77
77
  )
78
78
  end
79
79
 
80
+ ##
81
+ # @return [Hash]
82
+ # @see https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referrer-Policy
80
83
  def add_attributes
81
84
  {
82
85
  'a' => { 'rel' => 'nofollow noopener noreferrer', 'target' => '_blank' },
83
- 'img' => { 'referrer-policy' => 'no-referrer' }
86
+ 'area' => { 'rel' => 'nofollow noopener noreferrer', 'target' => '_blank' },
87
+ 'img' => { 'referrerpolicy' => 'no-referrer' },
88
+ 'iframe' => { 'referrerpolicy' => 'no-referrer' },
89
+ 'video' => { 'referrerpolicy' => 'no-referrer' },
90
+ 'audio' => { 'referrerpolicy' => 'no-referrer' }
84
91
  }
85
92
  end
86
93
 
@@ -1,7 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'nokogiri'
4
- require 'set'
5
4
 
6
5
  module Html2rss
7
6
  class AutoSource
@@ -50,9 +49,10 @@ module Html2rss
50
49
  frequent_selectors.each do |selector|
51
50
  parsed_body.xpath(selector).each do |selected_tag|
52
51
  article_tag = self.class.parent_until_condition(selected_tag, method(:article_condition))
53
- article_hash = SemanticHtml::Extractor.new(article_tag, url: @url).call
54
52
 
55
- yield article_hash if article_hash
53
+ if article_tag && (article_hash = SemanticHtml::Extractor.new(article_tag, url: @url).call)
54
+ yield article_hash
55
+ end
56
56
  end
57
57
  end
58
58
  end
@@ -2,7 +2,6 @@
2
2
 
3
3
  require 'json'
4
4
  require 'nokogiri'
5
- require 'set'
6
5
 
7
6
  module Html2rss
8
7
  class AutoSource
@@ -1,7 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'set'
4
-
5
3
  module Html2rss
6
4
  class AutoSource
7
5
  module Scraper
@@ -33,6 +31,8 @@ module Html2rss
33
31
  end
34
32
 
35
33
  def initialize(article_tag, url:)
34
+ raise ArgumentError, 'article_tag is required' unless article_tag
35
+
36
36
  @article_tag = article_tag
37
37
  @url = url
38
38
  end
@@ -28,14 +28,14 @@ module Html2rss
28
28
  # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/picture>
29
29
  def self.from_source(article_tag) # rubocop:disable Metrics/AbcSize
30
30
  hash = article_tag.css('img[srcset], picture > source[srcset]')
31
- .flat_map { |source| source['srcset'].to_s.split(',') }
32
- .filter_map do |line|
33
- width, url = line.split.reverse
34
- next if url.nil? || url.start_with?('data:')
31
+ .flat_map do |source|
32
+ source['srcset'].to_s.scan(/(\S+)\s+(\d+w|\d+h)/).map do |url, width|
33
+ next if url.nil? || url.start_with?('data:')
35
34
 
36
- width_value = width.to_i.zero? ? 0 : width.scan(/\d+/).first.to_i
35
+ width_value = width.to_i.zero? ? 0 : width.scan(/\d+/).first.to_i
37
36
 
38
- [width_value, url.strip]
37
+ [width_value, url.strip]
38
+ end
39
39
  end.to_h
40
40
 
41
41
  hash[hash.keys.max]
@@ -106,9 +106,10 @@ module Html2rss
106
106
  SemanticHtml.anchor_tag_selector_pairs.each do |tag_name, selector|
107
107
  parsed_body.css(selector).each do |selected_tag|
108
108
  article_tag = SemanticHtml.find_tag_in_ancestors(selected_tag, tag_name)
109
- article_hash = Extractor.new(article_tag, url: @url).call
110
109
 
111
- yield article_hash if article_hash
110
+ if article_tag && (article_hash = Extractor.new(article_tag, url: @url).call)
111
+ yield article_hash
112
+ end
112
113
  end
113
114
  end
114
115
  end
@@ -3,6 +3,6 @@
3
3
  ##
4
4
  # The Html2rss namespace.
5
5
  module Html2rss
6
- VERSION = '0.16.0'
6
+ VERSION = '0.17.0'
7
7
  public_constant :VERSION
8
8
  end
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.16.0
4
+ version: 0.17.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
8
- autorequire:
9
8
  bindir: exe
10
9
  cert_chain: []
11
- date: 2024-12-24 00:00:00.000000000 Z
10
+ date: 2025-01-18 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: addressable
@@ -222,16 +221,16 @@ dependencies:
222
221
  name: zeitwerk
223
222
  requirement: !ruby/object:Gem::Requirement
224
223
  requirements:
225
- - - "~>"
224
+ - - ">="
226
225
  - !ruby/object:Gem::Version
227
- version: 2.6.0
226
+ version: '0'
228
227
  type: :runtime
229
228
  prerelease: false
230
229
  version_requirements: !ruby/object:Gem::Requirement
231
230
  requirements:
232
- - - "~>"
231
+ - - ">="
233
232
  - !ruby/object:Gem::Version
234
- version: 2.6.0
233
+ version: '0'
235
234
  description: Supports JSON content, custom HTTP headers, and post-processing of extracted
236
235
  content.
237
236
  email:
@@ -303,9 +302,8 @@ licenses:
303
302
  - MIT
304
303
  metadata:
305
304
  allowed_push_host: https://rubygems.org
306
- changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.16.0
305
+ changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.17.0
307
306
  rubygems_mfa_required: 'true'
308
- post_install_message:
309
307
  rdoc_options: []
310
308
  require_paths:
311
309
  - lib
@@ -313,15 +311,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
313
311
  requirements:
314
312
  - - ">="
315
313
  - !ruby/object:Gem::Version
316
- version: '3.1'
314
+ version: '3.2'
317
315
  required_rubygems_version: !ruby/object:Gem::Requirement
318
316
  requirements:
319
317
  - - ">="
320
318
  - !ruby/object:Gem::Version
321
319
  version: '0'
322
320
  requirements: []
323
- rubygems_version: 3.5.22
324
- signing_key:
321
+ rubygems_version: 3.6.2
325
322
  specification_version: 4
326
323
  summary: Generates RSS feeds from websites by scraping a URL and using CSS selectors
327
324
  to extract item.