html2rss 0.16.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -2
- data/html2rss.gemspec +2 -2
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +8 -1
- data/lib/html2rss/auto_source/scraper/html.rb +3 -3
- data/lib/html2rss/auto_source/scraper/schema.rb +0 -1
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +2 -2
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +6 -6
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +3 -2
- data/lib/html2rss/version.rb +1 -1
- metadata +9 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bb6b3eb69655bdbb4511f74db9e1bcc766a98aa55d7afc2561a176c6973bda4f
|
4
|
+
data.tar.gz: 45193122489ba965b489c981696f71508030dc80f156e5b0f077932fc55caec3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 294529cd8cb1d289e969f94c32757656d12c864a92c438fceb73ba9dbddd85cca822b146898cdeff928aeefcba75652b91c0a56ded66241bcf23014fea299196
|
7
|
+
data.tar.gz: a2b50e52a7f6ad7768a7092fcdf04c7000dd43d190ee12461f6773fe4b324e43be7b411c7aff69b3af1266aeca3fbb4678e5c43e3bac2b0c2a49d86589869b38
|
data/README.md
CHANGED
@@ -681,7 +681,6 @@ To submit changes:
|
|
681
681
|
## Development Helpers
|
682
682
|
|
683
683
|
1. `bin/setup`: installs dependencies and sets up the development environment.
|
684
|
-
2.
|
685
|
-
3. for a modern Ruby development experience: install [`ruby-lsp`](https://github.com/Shopify/ruby-lsp) and integrate it to your IDE.
|
684
|
+
2. for a modern Ruby development experience: install [`ruby-lsp`](https://github.com/Shopify/ruby-lsp) and integrate it to your IDE.
|
686
685
|
|
687
686
|
For example: [Ruby in Visual Studio Code](https://code.visualstudio.com/docs/languages/ruby).
|
data/html2rss.gemspec
CHANGED
@@ -14,7 +14,7 @@ Gem::Specification.new do |spec|
|
|
14
14
|
spec.description = 'Supports JSON content, custom HTTP headers, and post-processing of extracted content.'
|
15
15
|
spec.homepage = 'https://github.com/html2rss/html2rss'
|
16
16
|
spec.license = 'MIT'
|
17
|
-
spec.required_ruby_version = '>= 3.
|
17
|
+
spec.required_ruby_version = '>= 3.2'
|
18
18
|
|
19
19
|
if spec.respond_to?(:metadata)
|
20
20
|
spec.metadata['allowed_push_host'] = 'https://rubygems.org'
|
@@ -46,5 +46,5 @@ Gem::Specification.new do |spec|
|
|
46
46
|
spec.add_dependency 'sanitize', '~> 6.0'
|
47
47
|
spec.add_dependency 'thor'
|
48
48
|
spec.add_dependency 'tzinfo'
|
49
|
-
spec.add_dependency 'zeitwerk'
|
49
|
+
spec.add_dependency 'zeitwerk'
|
50
50
|
end
|
@@ -77,10 +77,17 @@ module Html2rss
|
|
77
77
|
)
|
78
78
|
end
|
79
79
|
|
80
|
+
##
|
81
|
+
# @return [Hash]
|
82
|
+
# @see https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referrer-Policy
|
80
83
|
def add_attributes
|
81
84
|
{
|
82
85
|
'a' => { 'rel' => 'nofollow noopener noreferrer', 'target' => '_blank' },
|
83
|
-
'
|
86
|
+
'area' => { 'rel' => 'nofollow noopener noreferrer', 'target' => '_blank' },
|
87
|
+
'img' => { 'referrerpolicy' => 'no-referrer' },
|
88
|
+
'iframe' => { 'referrerpolicy' => 'no-referrer' },
|
89
|
+
'video' => { 'referrerpolicy' => 'no-referrer' },
|
90
|
+
'audio' => { 'referrerpolicy' => 'no-referrer' }
|
84
91
|
}
|
85
92
|
end
|
86
93
|
|
@@ -1,7 +1,6 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'nokogiri'
|
4
|
-
require 'set'
|
5
4
|
|
6
5
|
module Html2rss
|
7
6
|
class AutoSource
|
@@ -50,9 +49,10 @@ module Html2rss
|
|
50
49
|
frequent_selectors.each do |selector|
|
51
50
|
parsed_body.xpath(selector).each do |selected_tag|
|
52
51
|
article_tag = self.class.parent_until_condition(selected_tag, method(:article_condition))
|
53
|
-
article_hash = SemanticHtml::Extractor.new(article_tag, url: @url).call
|
54
52
|
|
55
|
-
|
53
|
+
if article_tag && (article_hash = SemanticHtml::Extractor.new(article_tag, url: @url).call)
|
54
|
+
yield article_hash
|
55
|
+
end
|
56
56
|
end
|
57
57
|
end
|
58
58
|
end
|
@@ -1,7 +1,5 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require 'set'
|
4
|
-
|
5
3
|
module Html2rss
|
6
4
|
class AutoSource
|
7
5
|
module Scraper
|
@@ -33,6 +31,8 @@ module Html2rss
|
|
33
31
|
end
|
34
32
|
|
35
33
|
def initialize(article_tag, url:)
|
34
|
+
raise ArgumentError, 'article_tag is required' unless article_tag
|
35
|
+
|
36
36
|
@article_tag = article_tag
|
37
37
|
@url = url
|
38
38
|
end
|
@@ -28,14 +28,14 @@ module Html2rss
|
|
28
28
|
# @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/picture>
|
29
29
|
def self.from_source(article_tag) # rubocop:disable Metrics/AbcSize
|
30
30
|
hash = article_tag.css('img[srcset], picture > source[srcset]')
|
31
|
-
.flat_map
|
32
|
-
|
33
|
-
|
34
|
-
next if url.nil? || url.start_with?('data:')
|
31
|
+
.flat_map do |source|
|
32
|
+
source['srcset'].to_s.scan(/(\S+)\s+(\d+w|\d+h)/).map do |url, width|
|
33
|
+
next if url.nil? || url.start_with?('data:')
|
35
34
|
|
36
|
-
|
35
|
+
width_value = width.to_i.zero? ? 0 : width.scan(/\d+/).first.to_i
|
37
36
|
|
38
|
-
|
37
|
+
[width_value, url.strip]
|
38
|
+
end
|
39
39
|
end.to_h
|
40
40
|
|
41
41
|
hash[hash.keys.max]
|
@@ -106,9 +106,10 @@ module Html2rss
|
|
106
106
|
SemanticHtml.anchor_tag_selector_pairs.each do |tag_name, selector|
|
107
107
|
parsed_body.css(selector).each do |selected_tag|
|
108
108
|
article_tag = SemanticHtml.find_tag_in_ancestors(selected_tag, tag_name)
|
109
|
-
article_hash = Extractor.new(article_tag, url: @url).call
|
110
109
|
|
111
|
-
|
110
|
+
if article_tag && (article_hash = Extractor.new(article_tag, url: @url).call)
|
111
|
+
yield article_hash
|
112
|
+
end
|
112
113
|
end
|
113
114
|
end
|
114
115
|
end
|
data/lib/html2rss/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.17.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
|
-
autorequire:
|
9
8
|
bindir: exe
|
10
9
|
cert_chain: []
|
11
|
-
date:
|
10
|
+
date: 2025-01-18 00:00:00.000000000 Z
|
12
11
|
dependencies:
|
13
12
|
- !ruby/object:Gem::Dependency
|
14
13
|
name: addressable
|
@@ -222,16 +221,16 @@ dependencies:
|
|
222
221
|
name: zeitwerk
|
223
222
|
requirement: !ruby/object:Gem::Requirement
|
224
223
|
requirements:
|
225
|
-
- - "
|
224
|
+
- - ">="
|
226
225
|
- !ruby/object:Gem::Version
|
227
|
-
version:
|
226
|
+
version: '0'
|
228
227
|
type: :runtime
|
229
228
|
prerelease: false
|
230
229
|
version_requirements: !ruby/object:Gem::Requirement
|
231
230
|
requirements:
|
232
|
-
- - "
|
231
|
+
- - ">="
|
233
232
|
- !ruby/object:Gem::Version
|
234
|
-
version:
|
233
|
+
version: '0'
|
235
234
|
description: Supports JSON content, custom HTTP headers, and post-processing of extracted
|
236
235
|
content.
|
237
236
|
email:
|
@@ -303,9 +302,8 @@ licenses:
|
|
303
302
|
- MIT
|
304
303
|
metadata:
|
305
304
|
allowed_push_host: https://rubygems.org
|
306
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.
|
305
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.17.0
|
307
306
|
rubygems_mfa_required: 'true'
|
308
|
-
post_install_message:
|
309
307
|
rdoc_options: []
|
310
308
|
require_paths:
|
311
309
|
- lib
|
@@ -313,15 +311,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
313
311
|
requirements:
|
314
312
|
- - ">="
|
315
313
|
- !ruby/object:Gem::Version
|
316
|
-
version: '3.
|
314
|
+
version: '3.2'
|
317
315
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
318
316
|
requirements:
|
319
317
|
- - ">="
|
320
318
|
- !ruby/object:Gem::Version
|
321
319
|
version: '0'
|
322
320
|
requirements: []
|
323
|
-
rubygems_version: 3.
|
324
|
-
signing_key:
|
321
|
+
rubygems_version: 3.6.2
|
325
322
|
specification_version: 4
|
326
323
|
summary: Generates RSS feeds from websites by scraping a URL and using CSS selectors
|
327
324
|
to extract item.
|