html2rss 0.14.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +113 -44
- data/html2rss.gemspec +3 -2
- data/lib/html2rss/auto_source/article.rb +37 -5
- data/lib/html2rss/auto_source/channel.rb +21 -28
- data/lib/html2rss/auto_source/cleanup.rb +0 -16
- data/lib/html2rss/auto_source/rss_builder.rb +1 -1
- data/lib/html2rss/auto_source/scraper/html.rb +96 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +34 -0
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +25 -0
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +104 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +22 -33
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +51 -38
- data/lib/html2rss/auto_source/scraper.rb +1 -0
- data/lib/html2rss/auto_source.rb +0 -7
- data/lib/html2rss/cli.rb +11 -4
- data/lib/html2rss/config/channel.rb +7 -1
- data/lib/html2rss/config/selectors.rb +2 -1
- data/lib/html2rss/config.rb +1 -0
- data/lib/html2rss/item.rb +7 -2
- data/lib/html2rss/request_service/browserless_strategy.rb +53 -0
- data/lib/html2rss/request_service/context.rb +46 -0
- data/lib/html2rss/request_service/faraday_strategy.rb +24 -0
- data/lib/html2rss/request_service/puppet_commander.rb +61 -0
- data/lib/html2rss/request_service/response.rb +27 -0
- data/lib/html2rss/request_service/strategy.rb +28 -0
- data/lib/html2rss/request_service.rb +97 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +7 -0
- data/lib/html2rss/utils.rb +23 -26
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +7 -6
- metadata +35 -11
- data/lib/html2rss/auto_source/scraper/schema/base.rb +0 -61
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3a33d49918c1e75268b3b314908305986dd863ccf31dbe1b6ace8202d3a652de
|
4
|
+
data.tar.gz: 7121a463570c62ffdddb85b9c0d7ba098bdc784ad6649b6bb34b232125a9bf49
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e77faa7bd81f63894f0001157a81e858a67eae502f5a5f3ecd41790f48c630b3ab94cd281531cb0a8cf0a7693fa03c7d6176edde04d063804f69f15d4b7469f3
|
7
|
+
data.tar.gz: d84a132a76997336f840c7b5e9261ab304ecfb302628b10d839350fbe3de92919bdc3321b5f374231c703202560d7b392da51cfbcb66ae8e18debdcd69d9e3d0
|
data/README.md
CHANGED
@@ -10,21 +10,11 @@ With the _feed config_, you provide a URL to scrape and CSS selectors for extrac
|
|
10
10
|
|
11
11
|
Support the development by sponsoring this project on GitHub. Thank you! 💓
|
12
12
|
|
13
|
-
##
|
14
|
-
|
15
|
-
| Install | `gem install html2rss` |
|
16
|
-
| ------- | ---------------------- |
|
17
|
-
| Usage | `html2rss help` |
|
18
|
-
|
19
|
-
You can also install it as a dependency in your Ruby project:
|
13
|
+
## Generating a feed on the CLI
|
20
14
|
|
21
|
-
|
22
|
-
| -------------------------------: | -------------------- |
|
23
|
-
| Add this line to your `Gemfile`: | `gem 'html2rss'` |
|
24
|
-
| Then execute: | `bundle` |
|
25
|
-
| In your code: | `require 'html2rss'` |
|
15
|
+
[Install Ruby](https://www.ruby-lang.org/en/documentation/installation/) (latest version is recommended) on your machine and run `gem install html2rss` in your terminal.
|
26
16
|
|
27
|
-
|
17
|
+
After the installation has finished, `html2rss help` will print usage information.
|
28
18
|
|
29
19
|
### using automatic generation
|
30
20
|
|
@@ -59,6 +49,14 @@ Build the feed from this config with: `html2rss feed ./my_config_file.yml`.
|
|
59
49
|
|
60
50
|
## Generating a feed with Ruby
|
61
51
|
|
52
|
+
You can also install it as a dependency in your Ruby project:
|
53
|
+
|
54
|
+
| 🤩 Like it? | Star it! ⭐️ |
|
55
|
+
| -------------------------------: | -------------------- |
|
56
|
+
| Add this line to your `Gemfile`: | `gem 'html2rss'` |
|
57
|
+
| Then execute: | `bundle` |
|
58
|
+
| In your code: | `require 'html2rss'` |
|
59
|
+
|
62
60
|
Here's a minimal working example using Ruby:
|
63
61
|
|
64
62
|
```ruby
|
@@ -117,7 +115,7 @@ channel:
|
|
117
115
|
Command line usage example:
|
118
116
|
|
119
117
|
```sh
|
120
|
-
|
118
|
+
html2rss feed the_feed_config.yml id=42
|
121
119
|
```
|
122
120
|
|
123
121
|
<details><summary>See a Ruby example</summary>
|
@@ -154,9 +152,9 @@ Your `selectors` hash can contain arbitrary named selectors, but only a few will
|
|
154
152
|
| `comments` | `comments` | A URL. |
|
155
153
|
| `source` | ~~source~~ | Not yet supported. |
|
156
154
|
|
157
|
-
###
|
155
|
+
### Build RSS 2.0 item attributes by specifying selectors
|
158
156
|
|
159
|
-
Every named selector in your `selectors` hash can have these attributes:
|
157
|
+
Every named selector (i.e. `title`, `description`, see table above) in your `selectors` hash can have these attributes:
|
160
158
|
|
161
159
|
| name | value |
|
162
160
|
| -------------- | -------------------------------------------------------- |
|
@@ -164,7 +162,7 @@ Every named selector in your `selectors` hash can have these attributes:
|
|
164
162
|
| `extractor` | Name of the extractor. See notes below. |
|
165
163
|
| `post_process` | A hash or array of hashes. See notes below. |
|
166
164
|
|
167
|
-
|
165
|
+
#### Using extractors
|
168
166
|
|
169
167
|
Extractors help with extracting the information from the selected HTML tag.
|
170
168
|
|
@@ -201,7 +199,7 @@ selectors:
|
|
201
199
|
|
202
200
|
</details>
|
203
201
|
|
204
|
-
|
202
|
+
### Using post processors
|
205
203
|
|
206
204
|
Extracted information can be further manipulated with post processors.
|
207
205
|
|
@@ -218,7 +216,7 @@ Extracted information can be further manipulated with post processors.
|
|
218
216
|
|
219
217
|
⚠️ Always make use of the `sanitize_html` post processor for HTML content. _Never trust the internet!_ ⚠️
|
220
218
|
|
221
|
-
|
219
|
+
#### Chaining post processors
|
222
220
|
|
223
221
|
Pass an array to `post_process` to chain the post processors.
|
224
222
|
|
@@ -244,14 +242,14 @@ selectors:
|
|
244
242
|
|
245
243
|
</details>
|
246
244
|
|
247
|
-
|
245
|
+
##### Post processor `gsub`
|
248
246
|
|
249
247
|
The post processor `gsub` makes use of Ruby's [`gsub`](https://apidock.com/ruby/String/gsub) method.
|
250
248
|
|
251
|
-
| key | type | required | note
|
252
|
-
| ------------- | ------ | -------- |
|
253
|
-
| `pattern` | String | yes | Can be Regexp or String.
|
254
|
-
| `replacement` | String | yes | Can be a
|
249
|
+
| key | type | required | note |
|
250
|
+
| ------------- | ------ | -------- | ------------------------ |
|
251
|
+
| `pattern` | String | yes | Can be Regexp or String. |
|
252
|
+
| `replacement` | String | yes | Can be a backreference. |
|
255
253
|
|
256
254
|
<details><summary>See a Ruby example</summary>
|
257
255
|
|
@@ -283,7 +281,7 @@ selectors:
|
|
283
281
|
|
284
282
|
</details>
|
285
283
|
|
286
|
-
|
284
|
+
#### Adding `<category>` tags to an item
|
287
285
|
|
288
286
|
The `categories` selector takes an array of selector names. Each value of those
|
289
287
|
selectors will become a `<category>` on the RSS item.
|
@@ -326,7 +324,7 @@ selectors:
|
|
326
324
|
|
327
325
|
</details>
|
328
326
|
|
329
|
-
|
327
|
+
#### Custom item GUID
|
330
328
|
|
331
329
|
By default, html2rss generates a GUID from the `title` or `description`.
|
332
330
|
|
@@ -371,7 +369,7 @@ selectors:
|
|
371
369
|
|
372
370
|
</details>
|
373
371
|
|
374
|
-
|
372
|
+
#### Adding an `<enclosure>` tag to an item
|
375
373
|
|
376
374
|
An enclosure can be any file, e.g. an image, audio or video - think Podcast.
|
377
375
|
|
@@ -379,7 +377,7 @@ The `enclosure` selector needs to return a URL of the content to enclose. If the
|
|
379
377
|
|
380
378
|
Since `html2rss` does no further inspection of the enclosure, its support comes with trade-offs:
|
381
379
|
|
382
|
-
1. The content-type is guessed from the file extension of the URL
|
380
|
+
1. The content-type is guessed from the file extension of the URL, unless one is specified in `content_type`.
|
383
381
|
2. If the content-type guessing fails, it will default to `application/octet-stream`.
|
384
382
|
3. The content-length will always be undetermined and therefore stated as `0` bytes.
|
385
383
|
|
@@ -392,7 +390,12 @@ Read the [RSS 2.0 spec](http://www.rssboard.org/rss-profile#element-channel-item
|
|
392
390
|
Html2rss.feed(
|
393
391
|
channel: {},
|
394
392
|
selectors: {
|
395
|
-
enclosure: {
|
393
|
+
enclosure: {
|
394
|
+
selector: 'audio',
|
395
|
+
extractor: 'attribute',
|
396
|
+
attribute: 'src',
|
397
|
+
content_type: 'audio/mp3'
|
398
|
+
}
|
396
399
|
}
|
397
400
|
)
|
398
401
|
```
|
@@ -411,17 +414,16 @@ selectors:
|
|
411
414
|
selector: "audio"
|
412
415
|
extractor: "attribute"
|
413
416
|
attribute: "src"
|
417
|
+
content_type: "audio/mp3"
|
414
418
|
```
|
415
419
|
|
416
420
|
</details>
|
421
|
+
|
417
422
|
## Scraping and handling JSON responses
|
418
423
|
|
419
|
-
By default, `html2rss` assumes the URL responds with HTML. However, it can also handle JSON responses. The JSON must
|
424
|
+
By default, `html2rss` assumes the URL responds with HTML. However, it can also handle JSON responses. The JSON response must be an Array or Hash.
|
420
425
|
|
421
|
-
|
422
|
-
| ---------- | -------- | ------- | ---------------------------------------------------- |
|
423
|
-
| `json` | optional | false | If set to `true`, the response is parsed as JSON. |
|
424
|
-
| `jsonpath` | optional | $ | Use [JSONPath syntax]() to select nodes of interest. |
|
426
|
+
The JSON is converted to XML which you can query using CSS selectors.
|
425
427
|
|
426
428
|
<details><summary>See a Ruby example</summary>
|
427
429
|
|
@@ -447,7 +449,73 @@ selectors:
|
|
447
449
|
|
448
450
|
</details>
|
449
451
|
|
450
|
-
##
|
452
|
+
## Customization of how requests to the channel URL are sent
|
453
|
+
|
454
|
+
By default, html2rss issues a naive HTTP request and extracts information from the response. That is performant and works for many websites.
|
455
|
+
|
456
|
+
However, modern websites often do not render much HTML on the server, but evaluate JavaScript on the client to create the HTML. In such cases, the default strategy will not find the "juicy content".
|
457
|
+
|
458
|
+
### Use Browserless.io
|
459
|
+
|
460
|
+
You can use _Browserless.io_ to run a Chrome browser and return the website's source code after the website generated it.
|
461
|
+
For this, you can either run your own Browserless.io instance (Docker image available -- [read their license](https://github.com/browserless/browserless/pkgs/container/chromium#licensing)!) or pay them for a hosted instance.
|
462
|
+
|
463
|
+
To run a local Browserless.io instance, you can use the following Docker command:
|
464
|
+
|
465
|
+
```sh
|
466
|
+
docker run \
|
467
|
+
--rm \
|
468
|
+
-p 3000:3000 \
|
469
|
+
-e "CONCURRENT=10" \
|
470
|
+
-e "TOKEN=6R0W53R135510" \
|
471
|
+
ghcr.io/browserless/chromium
|
472
|
+
```
|
473
|
+
|
474
|
+
To make html2rss use your instance,
|
475
|
+
|
476
|
+
1. specify the environment variables accordingly, and
|
477
|
+
2. use the `browserless` strategy for those websites.
|
478
|
+
|
479
|
+
When running locally with commands from above, you can skip setting the environment variables, as they are aligned with the default values.
|
480
|
+
|
481
|
+
```sh
|
482
|
+
BROWSERLESS_IO_WEBSOCKET_URL="ws://127.0.0.1:3000" BROWSERLESS_IO_API_TOKEN="6R0W53R135510" \
|
483
|
+
html2rss auto --strategy=browserless https://example.com
|
484
|
+
```
|
485
|
+
|
486
|
+
When using traditional feed configs, inside your channel config set `strategy: browserless`.
|
487
|
+
|
488
|
+
<details><summary>See a YAML feed config example</summary>
|
489
|
+
|
490
|
+
```yml
|
491
|
+
channel:
|
492
|
+
url: https://www.imdb.com/user/ur67728460/ratings
|
493
|
+
time_zone: UTC
|
494
|
+
ttl: 1440
|
495
|
+
strategy: browserless
|
496
|
+
headers:
|
497
|
+
User-Agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
498
|
+
selectors:
|
499
|
+
items:
|
500
|
+
selector: "li.ipc-metadata-list-summary-item"
|
501
|
+
title:
|
502
|
+
selector: ".ipc-title__text"
|
503
|
+
post_process:
|
504
|
+
- name: gsub
|
505
|
+
pattern: "/^(\\d+.)\\s/"
|
506
|
+
replacement: ""
|
507
|
+
- name: template
|
508
|
+
string: "%{self} rated with: %{user_rating}"
|
509
|
+
link:
|
510
|
+
selector: "a.ipc-title-link-wrapper"
|
511
|
+
extractor: "href"
|
512
|
+
user_rating:
|
513
|
+
selector: "[data-testid='ratingGroup--other-user-rating'] > .ipc-rating-star--rating"
|
514
|
+
```
|
515
|
+
|
516
|
+
</details>
|
517
|
+
|
518
|
+
### Set any HTTP header in the request
|
451
519
|
|
452
520
|
To set HTTP request headers, you can add them to the channel's `headers` hash. This is useful for APIs that require an Authorization header.
|
453
521
|
|
@@ -595,24 +663,25 @@ Recommended further readings:
|
|
595
663
|
- Fiddling with [`curl`](https://github.com/curl/curl) and [`pup`](https://github.com/ericchiang/pup) to find the selectors seems efficient (`curl URL | pup`).
|
596
664
|
- [CSS selectors are versatile. Here's an overview.](https://www.w3.org/TR/selectors-4/#overview)
|
597
665
|
|
598
|
-
|
666
|
+
## Contributing
|
599
667
|
|
600
668
|
Find ideas what to contribute in:
|
601
669
|
|
602
670
|
1. <https://github.com/orgs/html2rss/discussions>
|
603
671
|
2. the issues tracker: <https://github.com/html2rss/html2rss/issues>
|
604
672
|
|
605
|
-
|
606
|
-
|
607
|
-
1. `bin/setup`: installs dependencies and sets up the development environment.
|
608
|
-
2. `bin/guard`: automatically runs rspec, rubocop and reek when a file changes.
|
609
|
-
3. for a modern Ruby development experience: install [`ruby-lsp`](https://github.com/Shopify/ruby-lsp) and integrate it to your IDE:
|
610
|
-
a. [Ruby in Visual Studio Code](https://code.visualstudio.com/docs/languages/ruby)
|
611
|
-
|
612
|
-
#### How to submit changes
|
673
|
+
To submit changes:
|
613
674
|
|
614
675
|
1. Fork this repo ( <https://github.com/html2rss/html2rss/fork> )
|
615
676
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
616
677
|
3. Implement and commit your changes (`git commit -am 'feat: add XYZ'`)
|
617
678
|
4. Push to the branch (`git push origin my-new-feature`)
|
618
679
|
5. Create a new Pull Request using the GitHub web UI
|
680
|
+
|
681
|
+
## Development Helpers
|
682
|
+
|
683
|
+
1. `bin/setup`: installs dependencies and sets up the development environment.
|
684
|
+
2. `bin/guard`: automatically runs rspec, rubocop and reek when a file changes.
|
685
|
+
3. for a modern Ruby development experience: install [`ruby-lsp`](https://github.com/Shopify/ruby-lsp) and integrate it into your IDE.
|
686
|
+
|
687
|
+
For example: [Ruby in Visual Studio Code](https://code.visualstudio.com/docs/languages/ruby).
|
data/html2rss.gemspec
CHANGED
@@ -39,11 +39,12 @@ Gem::Specification.new do |spec|
|
|
39
39
|
spec.add_dependency 'mime-types', '> 3.0'
|
40
40
|
spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
|
41
41
|
spec.add_dependency 'parallel'
|
42
|
+
spec.add_dependency 'puppeteer-ruby'
|
42
43
|
spec.add_dependency 'regexp_parser'
|
43
|
-
spec.add_dependency 'reverse_markdown', '~>
|
44
|
+
spec.add_dependency 'reverse_markdown', '~> 3.0'
|
44
45
|
spec.add_dependency 'rss'
|
45
46
|
spec.add_dependency 'sanitize', '~> 6.0'
|
46
47
|
spec.add_dependency 'thor'
|
47
48
|
spec.add_dependency 'tzinfo'
|
48
|
-
spec.add_dependency 'zeitwerk'
|
49
|
+
spec.add_dependency 'zeitwerk', '~> 2.6.0'
|
49
50
|
end
|
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
require 'zlib'
|
4
4
|
require 'sanitize'
|
5
|
+
require 'nokogiri'
|
5
6
|
|
6
7
|
module Html2rss
|
7
8
|
class AutoSource
|
@@ -14,6 +15,31 @@ module Html2rss
|
|
14
15
|
|
15
16
|
PROVIDED_KEYS = %i[id title description url image guid published_at scraper].freeze
|
16
17
|
|
18
|
+
##
|
19
|
+
# Removes the specified pattern from the beginning of the text
|
20
|
+
# within a given range if the pattern occurs before the range's end.
|
21
|
+
#
|
22
|
+
# @param text [String]
|
23
|
+
# @param pattern [String]
|
24
|
+
# @param end_of_range [Integer] - Optional, defaults to half the size of the text
|
25
|
+
# @return [String]
|
26
|
+
def self.remove_pattern_from_start(text, pattern, end_of_range: (text.size * 0.5).to_i)
|
27
|
+
return text unless text.is_a?(String) && pattern.is_a?(String)
|
28
|
+
|
29
|
+
index = text.index(pattern)
|
30
|
+
return text if index.nil? || index >= end_of_range
|
31
|
+
|
32
|
+
text.gsub(/^(.{0,#{end_of_range}})#{Regexp.escape(pattern)}/, '\1')
|
33
|
+
end
|
34
|
+
|
35
|
+
##
|
36
|
+
# Checks if the text contains HTML tags.
|
37
|
+
# @param text [String]
|
38
|
+
# @return [Boolean]
|
39
|
+
def self.contains_html?(text)
|
40
|
+
Nokogiri::HTML.fragment(text).children.any?(&:element?)
|
41
|
+
end
|
42
|
+
|
17
43
|
# @param options [Hash<Symbol, String>]
|
18
44
|
def initialize(**options)
|
19
45
|
@to_h = {}
|
@@ -50,9 +76,15 @@ module Html2rss
|
|
50
76
|
def description
|
51
77
|
return @description if defined?(@description)
|
52
78
|
|
53
|
-
return if
|
79
|
+
return if (description = @to_h[:description]).to_s.empty?
|
80
|
+
|
81
|
+
@description = self.class.remove_pattern_from_start(description, title) if title
|
54
82
|
|
55
|
-
|
83
|
+
if self.class.contains_html?(@description) && url
|
84
|
+
@description = Html2rss::AttributePostProcessors::SanitizeHtml.get(description, url)
|
85
|
+
else
|
86
|
+
@description
|
87
|
+
end
|
56
88
|
end
|
57
89
|
|
58
90
|
# @return [Addressable::URI, nil]
|
@@ -72,11 +104,11 @@ module Html2rss
|
|
72
104
|
end
|
73
105
|
|
74
106
|
# Parses and returns the published_at time.
|
75
|
-
# @return [
|
107
|
+
# @return [DateTime, nil]
|
76
108
|
def published_at
|
77
|
-
return if (string = @to_h[:published_at].to_s
|
109
|
+
return if (string = @to_h[:published_at].to_s.strip).empty?
|
78
110
|
|
79
|
-
@published_at ||=
|
111
|
+
@published_at ||= DateTime.parse(string)
|
80
112
|
rescue ArgumentError
|
81
113
|
nil
|
82
114
|
end
|
@@ -24,52 +24,45 @@ module Html2rss
|
|
24
24
|
attr_writer :articles
|
25
25
|
attr_reader :stylesheets
|
26
26
|
|
27
|
-
def url =
|
28
|
-
|
29
|
-
def
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
def generator
|
36
|
-
"html2rss V. #{::Html2rss::VERSION} (using auto_source scrapers: #{scraper_counts})"
|
27
|
+
def url = @url.normalize.to_s
|
28
|
+
|
29
|
+
def title
|
30
|
+
@title ||= if (title = parsed_body.at_css('head > title')&.text.to_s) && !title.empty?
|
31
|
+
title.gsub(/\s+/, ' ').strip
|
32
|
+
else
|
33
|
+
Utils.titleized_channel_url(@url)
|
34
|
+
end
|
37
35
|
end
|
38
36
|
|
39
|
-
|
40
|
-
|
41
|
-
attr_reader :parsed_body, :headers
|
42
|
-
|
43
|
-
def extract_url
|
44
|
-
@url.normalize.to_s
|
45
|
-
end
|
46
|
-
|
47
|
-
def extract_title
|
48
|
-
parsed_body.at_css('head > title')&.text
|
49
|
-
end
|
37
|
+
def description = parsed_body.at_css('meta[name="description"]')&.[]('content')
|
38
|
+
def last_build_date = headers['last-modified']
|
50
39
|
|
51
|
-
def
|
40
|
+
def language
|
52
41
|
return parsed_body['lang'] if parsed_body.name == 'html' && parsed_body['lang']
|
53
42
|
|
54
43
|
parsed_body.at_css('[lang]')&.[]('lang')
|
55
44
|
end
|
56
45
|
|
57
|
-
def
|
58
|
-
parsed_body.at_css('meta[name="description"]')&.[]('content') || ''
|
59
|
-
end
|
60
|
-
|
61
|
-
def extract_image
|
46
|
+
def image
|
62
47
|
url = parsed_body.at_css('meta[property="og:image"]')&.[]('content')
|
63
48
|
Html2rss::Utils.sanitize_url(url) if url
|
64
49
|
end
|
65
50
|
|
66
|
-
def
|
51
|
+
def ttl
|
67
52
|
ttl = headers['cache-control']&.match(/max-age=(\d+)/)&.[](1)
|
68
53
|
return unless ttl
|
69
54
|
|
70
55
|
ttl.to_i.fdiv(60).ceil
|
71
56
|
end
|
72
57
|
|
58
|
+
def generator
|
59
|
+
"html2rss V. #{::Html2rss::VERSION} (using auto_source scrapers: #{scraper_counts})"
|
60
|
+
end
|
61
|
+
|
62
|
+
private
|
63
|
+
|
64
|
+
attr_reader :parsed_body, :headers
|
65
|
+
|
73
66
|
def scraper_counts
|
74
67
|
scraper_counts = +''
|
75
68
|
|
@@ -13,10 +13,7 @@ module Html2rss
|
|
13
13
|
|
14
14
|
articles.select!(&:valid?)
|
15
15
|
|
16
|
-
remove_short!(articles, :title)
|
17
|
-
|
18
16
|
deduplicate_by!(articles, :url)
|
19
|
-
deduplicate_by!(articles, :title)
|
20
17
|
|
21
18
|
keep_only_http_urls!(articles)
|
22
19
|
reject_different_domain!(articles, url) unless keep_different_domain
|
@@ -27,19 +24,6 @@ module Html2rss
|
|
27
24
|
|
28
25
|
private
|
29
26
|
|
30
|
-
##
|
31
|
-
# Removes articles with short values for a given key.
|
32
|
-
#
|
33
|
-
# @param articles [Array<Article>] The list of articles to process.
|
34
|
-
# @param key [Symbol] The key to check for short values.
|
35
|
-
# @param min_words [Integer] The minimum number of words required.
|
36
|
-
def remove_short!(articles, key = :title, min_words: 2)
|
37
|
-
articles.reject! do |article|
|
38
|
-
value = article.public_send(key)
|
39
|
-
value.nil? || value.to_s.split.size < min_words
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
27
|
##
|
44
28
|
# Deduplicates articles by a given key.
|
45
29
|
#
|
@@ -0,0 +1,96 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'set'
|
5
|
+
|
6
|
+
module Html2rss
|
7
|
+
class AutoSource
|
8
|
+
module Scraper
|
9
|
+
##
|
10
|
+
# Scrapes articles from HTML pages by
|
11
|
+
# finding similar structures around anchor tags in the parsed_body.
|
12
|
+
class Html
|
13
|
+
include Enumerable
|
14
|
+
|
15
|
+
TAGS_TO_IGNORE = /(nav|footer|header)/i
|
16
|
+
|
17
|
+
def self.articles?(parsed_body)
|
18
|
+
new(parsed_body, url: '').any?
|
19
|
+
end
|
20
|
+
|
21
|
+
def self.parent_until_condition(node, condition)
|
22
|
+
return nil if !node || node.document? || node.parent.name == 'html'
|
23
|
+
return node if condition.call(node)
|
24
|
+
|
25
|
+
parent_until_condition(node.parent, condition)
|
26
|
+
end
|
27
|
+
|
28
|
+
##
|
29
|
+
# Simplify an XPath selector by removing the index notation.
|
30
|
+
def self.simplify_xpath(xpath)
|
31
|
+
xpath.gsub(/\[\d+\]/, '')
|
32
|
+
end
|
33
|
+
|
34
|
+
def initialize(parsed_body, url:)
|
35
|
+
@parsed_body = parsed_body
|
36
|
+
@url = url
|
37
|
+
@selectors = Hash.new(0)
|
38
|
+
end
|
39
|
+
|
40
|
+
attr_reader :parsed_body
|
41
|
+
|
42
|
+
##
|
43
|
+
# @yieldparam [Hash] The scraped article hash
|
44
|
+
# @return [Enumerator] Enumerator for the scraped articles
|
45
|
+
def each
|
46
|
+
return enum_for(:each) unless block_given?
|
47
|
+
|
48
|
+
return if frequent_selectors.empty?
|
49
|
+
|
50
|
+
frequent_selectors.each do |selector|
|
51
|
+
parsed_body.xpath(selector).each do |selected_tag|
|
52
|
+
article_tag = self.class.parent_until_condition(selected_tag, method(:article_condition))
|
53
|
+
article_hash = SemanticHtml::Extractor.new(article_tag, url: @url).call
|
54
|
+
|
55
|
+
yield article_hash if article_hash
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
##
|
61
|
+
# Find all the anchors in root.
|
62
|
+
# @param root [Nokogiri::XML::Node] The root node to search for anchors
|
63
|
+
# @return [Set<String>] The set of XPath selectors which exist at least min_frequency times
|
64
|
+
def frequent_selectors(root = @parsed_body.at_css('body'), min_frequency: 2)
|
65
|
+
@frequent_selectors ||= begin
|
66
|
+
root.traverse do |node|
|
67
|
+
next if !node.element? || node.name != 'a'
|
68
|
+
|
69
|
+
@selectors[self.class.simplify_xpath(node.path)] += 1
|
70
|
+
end
|
71
|
+
|
72
|
+
@selectors.keys
|
73
|
+
.select { |selector| (@selectors[selector]).to_i >= min_frequency }
|
74
|
+
.to_set
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
def article_condition(node)
|
79
|
+
# Ignore tags that are below a tag which is in TAGS_TO_IGNORE.
|
80
|
+
return false if node.path.match?(TAGS_TO_IGNORE)
|
81
|
+
|
82
|
+
# Ignore tags that are below a tag which has a class which matches TAGS_TO_IGNORE.
|
83
|
+
return false if self.class.parent_until_condition(node, proc do |current_node|
|
84
|
+
current_node.classes.any? { |klass| klass.match?(TAGS_TO_IGNORE) }
|
85
|
+
end)
|
86
|
+
|
87
|
+
return true if %w[body html].include?(node.name)
|
88
|
+
|
89
|
+
return true if node.parent.css('a').size > 1
|
90
|
+
|
91
|
+
false
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
class AutoSource
|
5
|
+
module Scraper
|
6
|
+
class Schema
|
7
|
+
##
|
8
|
+
# Handles schema.org ItemList objects, which contain
|
9
|
+
# 1. itemListElements, and/or
|
10
|
+
# 2. interesting attributes, i.e. description, url, image, itself.
|
11
|
+
#
|
12
|
+
# @see https://schema.org/ItemList
|
13
|
+
class ItemList < Thing
|
14
|
+
SUPPORTED_TYPES = Set['ItemList']
|
15
|
+
|
16
|
+
# @return [Array<Hash>] the scraped article hashes with DEFAULT_ATTRIBUTES
|
17
|
+
def call
|
18
|
+
hashes = [super]
|
19
|
+
|
20
|
+
return hashes if (elements = @schema_object[:itemListElement]).nil?
|
21
|
+
|
22
|
+
elements = [elements] unless elements.is_a?(Array)
|
23
|
+
|
24
|
+
elements.each do |schema_object|
|
25
|
+
hashes << ListItem.new(schema_object, url: @url).call
|
26
|
+
end
|
27
|
+
|
28
|
+
hashes
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
class AutoSource
|
5
|
+
module Scraper
|
6
|
+
class Schema
|
7
|
+
##
|
8
|
+
#
|
9
|
+
# @see https://schema.org/ListItem
|
10
|
+
class ListItem < Thing
|
11
|
+
def id = (id = (schema_object.dig(:item, :@id) || super).to_s).empty? ? nil : id
|
12
|
+
def title = schema_object.dig(:item, :name) || super || (url ? Utils.titleized_url(url) : nil)
|
13
|
+
def description = schema_object.dig(:item, :description) || super
|
14
|
+
|
15
|
+
# @return [Addressable::URI, nil]
|
16
|
+
def url
|
17
|
+
url = schema_object.dig(:item, :url) || super
|
18
|
+
|
19
|
+
Utils.build_absolute_url_from_relative(url, @url) if url
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|