html2rss 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 645fe7ea0ebe4a733c9833e8c22a93aa3f2d2b3b8589160fb001ad6ffd4659d3
4
- data.tar.gz: 0fb454258a1f334243984bdb73a699ffcfc676c529d87bb0886fbba75127b7cf
3
+ metadata.gz: 38b547bdfe0799d71348690ae20d29d74d88c3cbb0828cdd807a382dec14f895
4
+ data.tar.gz: 8e9ae611d6bbf0174bd9038c3aa7a7b214ec8ffa152e990b2224e98d5022bec0
5
5
  SHA512:
6
- metadata.gz: 350f0fd0e11bd35963c89b56aea9eb1dc90ae72956d849fd85b0b67d39e0bb7625176558becc0712f5e573a7605b541dea6497557699f727e4c27ef510d6d58f
7
- data.tar.gz: e83905ddf24fc4c63793392cd47aa2d02e984c44ba489610119cff6a25bb70be57f4a95f5b9fb78229d9dc6366e0181d2322743eeee8ba59ee840831f8790bb1
6
+ metadata.gz: c0c4dc94d9e339d054ec4ee1c586ac21355cdfa1c9664dd05ef01ca474668f7e483baec35bb8abfd1971eff296b7b5c270816c992ec9fc6abd51a89a7ce28000
7
+ data.tar.gz: 3f45fb28e10360055ead9b757b4972bd2c52ad3285b94b506c3a6ce66b7bd5ec6364da63cc83b066fab6bc69396ed7408108536b5b9178d4a4a34c28436b4d2d
@@ -1,4 +1,14 @@
1
- # [](https://github.com/gildesmarais/html2rss/compare/v0.7.0...v) (2019-10-28)
1
+ # [](https://github.com/gildesmarais/html2rss/compare/v0.8.0...v) (2019-11-02)
2
+
3
+
4
+
5
+ # [0.8.0](https://github.com/gildesmarais/html2rss/compare/v0.7.0...v0.8.0) (2019-11-02)
6
+
7
+
8
+ ### Features
9
+
10
+ * **post_processors:** add markdown to html ([#54](https://github.com/gildesmarais/html2rss/issues/54)) ([cdf77b8](https://github.com/gildesmarais/html2rss/commit/cdf77b8))
11
+ * **post_processors:** support annotated tokens ([#62](https://github.com/gildesmarais/html2rss/issues/62)) ([b57bd7b](https://github.com/gildesmarais/html2rss/commit/b57bd7b)), closes [#56](https://github.com/gildesmarais/html2rss/issues/56)
2
12
 
3
13
 
4
14
 
data/Gemfile CHANGED
@@ -2,5 +2,7 @@ source 'https://rubygems.org'
2
2
 
3
3
  git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
4
4
 
5
+ gem 'coveralls', require: false
6
+
5
7
  # Specify your gem's dependencies in html2rss.gemspec
6
8
  gemspec
@@ -1,12 +1,13 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html2rss (0.7.0)
4
+ html2rss (0.8.0)
5
5
  activesupport (~> 5.0)
6
6
  builder
7
7
  faraday (~> 0.15)
8
8
  faraday_middleware (~> 0.13)
9
9
  hashie (~> 3.6)
10
+ kramdown
10
11
  mime-types (> 3.0)
11
12
  nokogiri (>= 1.10, < 2.0)
12
13
  reverse_markdown (~> 1.3)
@@ -26,6 +27,12 @@ GEM
26
27
  builder (3.2.3)
27
28
  byebug (11.0.1)
28
29
  concurrent-ruby (1.1.5)
30
+ coveralls (0.7.2)
31
+ multi_json (~> 1.3)
32
+ rest-client (= 1.6.7)
33
+ simplecov (>= 0.7)
34
+ term-ansicolor (= 1.2.2)
35
+ thor (= 0.18.1)
29
36
  crass (1.0.5)
30
37
  diff-lcs (1.3)
31
38
  docile (1.3.2)
@@ -38,13 +45,15 @@ GEM
38
45
  concurrent-ruby (~> 1.0)
39
46
  jaro_winkler (1.5.3)
40
47
  json (2.2.0)
48
+ kramdown (2.1.0)
41
49
  mime-types (3.3)
42
50
  mime-types-data (~> 3.2015)
43
51
  mime-types-data (3.2019.1009)
44
52
  mini_portile2 (2.4.0)
45
- minitest (5.12.2)
53
+ minitest (5.13.0)
54
+ multi_json (1.14.1)
46
55
  multipart-post (2.1.1)
47
- nokogiri (1.10.4)
56
+ nokogiri (1.10.5)
48
57
  mini_portile2 (~> 2.4.0)
49
58
  nokogumbo (2.0.1)
50
59
  nokogiri (~> 1.8, >= 1.8.4)
@@ -52,6 +61,8 @@ GEM
52
61
  parser (2.6.5.0)
53
62
  ast (~> 2.4.0)
54
63
  rainbow (3.0.0)
64
+ rest-client (1.6.7)
65
+ mime-types (>= 1.16)
55
66
  reverse_markdown (1.3.0)
56
67
  nokogiri
57
68
  rspec (3.9.0)
@@ -88,14 +99,18 @@ GEM
88
99
  json (>= 1.8, < 3)
89
100
  simplecov-html (~> 0.10.0)
90
101
  simplecov-html (0.10.2)
102
+ term-ansicolor (1.2.2)
103
+ tins (~> 0.8)
104
+ thor (0.18.1)
91
105
  thread_safe (0.3.6)
106
+ tins (0.13.2)
92
107
  to_regexp (0.2.1)
93
108
  tzinfo (1.2.5)
94
109
  thread_safe (~> 0.1)
95
110
  unicode-display_width (1.6.0)
96
111
  vcr (5.0.0)
97
112
  yard (0.9.20)
98
- zeitwerk (2.2.0)
113
+ zeitwerk (2.2.1)
99
114
 
100
115
  PLATFORMS
101
116
  ruby
@@ -103,6 +118,7 @@ PLATFORMS
103
118
  DEPENDENCIES
104
119
  bundler (~> 1.16)
105
120
  byebug
121
+ coveralls
106
122
  html2rss!
107
123
  rspec (~> 3.0)
108
124
  rubocop
data/README.md CHANGED
@@ -2,27 +2,44 @@
2
2
 
3
3
  [![Build Status](https://travis-ci.org/gildesmarais/html2rss.svg?branch=master)](https://travis-ci.org/gildesmarais/html2rss)
4
4
  [![Gem Version](https://badge.fury.io/rb/html2rss.svg)](http://rubygems.org/gems/html2rss/)
5
- [API docs on RubyDoc.info](https://www.rubydoc.info/gems/html2rss)
5
+ [![Coverage Status](https://coveralls.io/repos/github/gildesmarais/html2rss/badge.svg?branch=master)](https://coveralls.io/github/gildesmarais/html2rss?branch=master)
6
+ [![Yard Docs](http://img.shields.io/badge/yard-docs-blue.svg)](https://www.rubydoc.info/gems/html2rss)
7
+ ![Retro Badge: valid RSS](https://validator.w3.org/feed/images/valid-rss-rogers.png)
6
8
 
7
- Request HTML from an URL and transform it to a Ruby RSS 2.0 object.
9
+ **Searching for a ready to use app which serves generated feeds via HTTP?**
10
+ [Head over to `html2rss-web`!](https://github.com/gildesmarais/html2rss-web)
8
11
 
9
- **Are you searching for a ready to use "website to RSS" solution?**
10
- [Check out `html2rss-web`!](https://github.com/gildesmarais/html2rss-web)
12
+ This Ruby gem builds RSS 2.0 feeds from a _feed config_.
11
13
 
12
- Each website needs a _feed config_ which contains the URL to scrape and
13
- CSS selectors to extract the required information (like title, URL, ...).
14
- This gem provides [extractors](https://github.com/gildesmarais/html2rss/blob/master/lib/html2rss/item_extractors) (e.g. extract the information from an HTML attribute)
15
- and chainable [post processors](https://github.com/gildesmarais/html2rss/tree/master/lib/html2rss/attribute_post_processors) to make information retrieval even easier.
14
+ With the _feed config_ containing the URL to scrape and
15
+ CSS selectors for information extraction (like title, URL, ...), your RSS feed is built.
16
+ [Extractors](#using-extractors) and chain-able [post processors](#using-post-processors)
17
+ make information extraction, processing and sanitizing a breeze.
18
+ [Scraping JSON](#scraping-json) responses and
19
+ [setting HTTP request headers](#set-any-http-header-in-the-request) is
20
+ supported, too.
16
21
 
17
22
  ## Installation
18
23
 
19
- Add this line to your application's Gemfile: `gem 'html2rss'`
20
- Then execute: `bundle`
24
+ | 🤩 Like it? | Star it! ⭐️ |
25
+ | ---------------------------------------------: | -------------------- |
26
+ | Add this line to your application's `Gemfile`: | `gem 'html2rss'` |
27
+ | Then execute: | `bundle` |
28
+ | In your code: | `require 'html2rss'` |
29
+
30
+ ## Building a feed config
31
+
32
+ Here's a minimal working example:
21
33
 
22
34
  ```ruby
35
+ require 'html2rss'
36
+
23
37
  rss =
24
38
  Html2rss.feed(
25
- channel: { title: 'StackOverflow: Hot Network Questions', url: 'https://stackoverflow.com/questions' },
39
+ channel: {
40
+ title: 'StackOverflow: Hot Network Questions',
41
+ url: 'https://stackoverflow.com/questions'
42
+ },
26
43
  selectors: {
27
44
  items: { selector: '#hot-network-questions > ul > li' },
28
45
  title: { selector: 'a' },
@@ -30,35 +47,218 @@ rss =
30
47
  }
31
48
  )
32
49
 
33
- puts rss.to_s
50
+ puts rss
34
51
  ```
35
52
 
36
- ## Usage with a YAML config file
53
+ A _feed config_ consists of a `channel` and a `selectors` Hash.
54
+ The contents of both hashes are explained below.
55
+
56
+ **Looks too complicated?** See [`html2rss-configs`](https://github.com/gildesmarais/html2rss-configs) for ready-made feed configs!
57
+
58
+ ### The `channel`
59
+
60
+ | attribute | | type | remark |
61
+ | ------------- | -------- | ------- | ----------------------- |
62
+ | `title` | required | String | |
63
+ | `url` | required | String | |
64
+ | `ttl` | optional | Integer | time to live in minutes |
65
+ | `description` | optional | String | |
66
+ | `headers` | optional | Hash | See notes below. |
67
+
68
+ ### The `selectors`
69
+
70
+ You must provide an `items` selector hash which contains the CSS selector.
71
+ `items` needs to return a collection of HTML tags.
72
+ The other selectors are scoped to the tags of the items' collection.
73
+
74
+ To build a
75
+ [valid RSS 2.0 item](http://www.rssboard.org/rss-profile#element-channel-item)
76
+ each item has to have at least a `title` or a `description`.
77
+
78
+ Your `selectors` can contain arbitrary selector names, but only these
79
+ will make it into the RSS feed:
80
+
81
+ | RSS 2.0 tag | name in html2rss | remark |
82
+ | ------------- | ---------------- | --------------------------- |
83
+ | `title` | `title` | |
84
+ | `description` | `description` | Supports HTML. |
85
+ | `link` | `link` | A URL. |
86
+ | `author` | `author` | |
87
+ | `category` | `categories` | See notes below. |
88
+ | `enclosure` | `enclosure` | See notes below. |
89
+ | `pubDate` | `update` | An instance of `Time`. |
90
+ | `guid` | `guid` | Generated from the `title`. |
91
+ | `comments` | `comments` | A URL. |
92
+ | `source` | ~~source~~ | Not yet supported. |
93
+
94
+ ### The `selector` hash
95
+
96
+ Your selector hash can have these attributes:
97
+
98
+ | name | value |
99
+ | -------------- | -------------------------------------------------------- |
100
+ | `selector` | The CSS selector to select the tag with the information. |
101
+ | `extractor` | Name of the extractor. See notes below. |
102
+ | `post_process` | A hash or array of hashes. See notes below. |
103
+
104
+ ## Using extractors
105
+
106
+ Extractors help with extracting the information from the selected HTML tag.
107
+
108
+ - The default extractor is `text`, which returns the tag's inner text.
109
+ - The `html` extractor returns the tag's outer HTML.
110
+ - The `href` extractor returns a URL from the tag's `href` attribute and corrects relative ones to absolute ones.
111
+ - The `attribute` extractor returns the value of that tag's attribute.
112
+ - The `static` extractor returns the configured static value (it doesn't extract anything).
113
+ - [See file list of extractors](https://github.com/gildesmarais/html2rss/tree/master/lib/html2rss/item_extractors).
114
+
115
+ Extractors can require additional attributes on the selector hash.
116
+ 👉 [Read their docs for usage examples](https://www.rubydoc.info/gems/html2rss/Html2rss/ItemExtractors).
117
+
118
+ <details>
119
+ <summary>See a Ruby example</summary>
120
+
121
+ ```ruby
122
+ Html2rss.feed(
123
+ channel: {}, selectors: { link: { selector: 'a', extractor: 'href' } }
124
+ )
125
+ ```
126
+
127
+ </details>
128
+
129
+ <details>
130
+ <summary>See a YAML feed config example</summary>
131
+
132
+ ```yml
133
+ channel:
134
+   # ... omitted
135
+ selectors:
136
+   # ... omitted
137
+ link:
138
+ selector: 'a'
139
+ extractor: 'href'
140
+ ```
141
+
142
+ </details>
143
+
144
+ ## Using post processors
145
+
146
+ Extracted information can be further manipulated with post processors.
147
+
148
+ | name | |
149
+ | ------------------ | ------------------------------------------------------------------------------------- |
150
+ | `gsub` | Allows global substitution operations on Strings (Regexp or simple pattern). |
151
+ | `html_to_markdown` | HTML to Markdown, using [reverse_markdown](https://github.com/xijo/reverse_markdown). |
152
+ | `markdown_to_html` | converts Markdown to HTML, using [kramdown](https://github.com/gettalong/kramdown). |
153
+ | `parse_time` | Parses a String containing a time in a time zone. |
154
+ | `parse_uri` | Parses a String as URL. |
155
+ | `sanitize_html` | Strips unsafe and unneeded HTML and adds security related attributes. |
156
+ | `substring` | Cuts a part off of a String, starting at a position. |
157
+ | `template` | Based on a template, it creates a new String filled with other selectors values. |
158
+
159
+ ⚠️ Always make use of the `sanitize_html` post processor for HTML content. _Never trust the internet!_ ⚠️
37
160
 
38
- Create a YAML config file. Find an example at [`spec/config.test.yml`](https://github.com/gildesmarais/html2rss/blob/master/spec/config.test.yml).
161
+ - [See file list of post processors](https://github.com/gildesmarais/html2rss/tree/master/lib/html2rss/attribute_post_processors).
39
162
 
40
- `Html2rss.feed_from_yaml_config(File.join(['spec', 'config.test.yml']), 'nuxt-releases')`
41
- returns an `RSS:Rss` object.
163
+ 👉 [Read their docs for usage examples.](https://www.rubydoc.info/gems/html2rss/Html2rss/AttributePostProcessors)
164
+
165
+ <details>
166
+ <summary>See a Ruby example</summary>
167
+
168
+ ```ruby
169
+ Html2rss.feed(
170
+ channel: {},
171
+ selectors: {
172
+ description: {
173
+ selector: '.content', post_process: { name: 'sanitize_html' }
174
+ }
175
+ }
176
+ )
177
+ ```
42
178
 
43
- **Too complicated?** See [`html2rss-configs`](https://github.com/gildesmarais/html2rss-configs) for ready-made feed configs!
179
+ </details>
44
180
 
45
- ## Assigning categories to an item
181
+ <details>
182
+ <summary>See a YAML feed config example</summary>
183
+
184
+ ```yml
185
+ channel:
186
+   # ... omitted
187
+ selectors:
188
+   # ... omitted
189
+ description:
190
+ selector: '.content'
191
+ post_process:
192
+ - name: sanitize_html
193
+ ```
194
+
195
+ </details>
196
+
197
+ ### Chaining post processors
198
+
199
+ Pass an array to `post_process` to chain the post processors.
200
+
201
+ <details>
202
+ <summary>YAML example: build the description from a template String (in Markdown) and convert that Markdown to HTML</summary>
203
+
204
+ ```yml
205
+ channel:
206
+   # ... omitted
207
+ selectors:
208
+   # ... omitted
209
+ price:
210
+ selector: '.price'
211
+ description:
212
+ selector: '.section'
213
+ post_process:
214
+ - name: template
215
+ string: |
216
+ # %{self}
217
+
218
+ Price: %{price}
219
+ - name: markdown_to_html
220
+ ```
221
+
222
+ Note the use of `|` for a multi-line String in YAML.
223
+
224
+ </details>
225
+
226
+ ## Adding `<category>` tags to an item
46
227
 
47
228
  The `categories` selector takes an array of selector names. The value of those
48
- selectors will become a category on the item.
229
+ selectors will become a `<category>` on the RSS item.
230
+
231
+ <details>
232
+ <summary>See a Ruby example</summary>
233
+
234
+ ```ruby
235
+ Html2rss.feed(
236
+ channel: {},
237
+ selectors: {
238
+ genre: {
239
+ # ... omitted
240
+ selector: '.genre'
241
+ },
242
+ branch: { selector: '.branch' },
243
+ categories: %i[genre branch]
244
+ }
245
+ )
246
+ ```
247
+
248
+ </details>
49
249
 
50
250
  <details>
51
- <summary>See a YAML config example</summary>
251
+ <summary>See a YAML feed config example</summary>
52
252
 
53
253
  ```yml
54
254
  channel:
55
- # ... omitted
255
+   # ... omitted
56
256
  selectors:
57
- #... omitted
257
+ # ... omitted
58
258
  genre:
59
- selector: '.genre'
259
+ selector: ".genre"
60
260
  branch:
61
- selector: '.branch'
261
+ selector: ".branch"
62
262
  categories:
63
263
  - genre
64
264
  - branch
@@ -66,13 +266,13 @@ selectors:
66
266
 
67
267
  </details>
68
268
 
69
- ## Adding an enclosure to each item
269
+ ## Adding an `<enclosure>` tag to an item
70
270
 
71
271
  An enclosure can be 'anything', e.g. an image, audio or video file.
72
272
 
73
- The config's `enclosure` selector needs to return a URL of the content to enclose. If the extracted URL is relative, it will be converted to an absolute one using the channel's url as a base.
273
+ The `enclosure` selector needs to return a URL of the content to enclose. If the extracted URL is relative, it will be converted to an absolute one using the channel's URL as base.
74
274
 
75
- Since html2rss does no further inspection of the enclosure, the support of this tag comes with trade-offs:
275
+ Since html2rss does no further inspection of the enclosure, its support comes with trade-offs:
76
276
 
77
277
  1. The content-type is guessed from the file extension of the URL.
78
278
  2. If the content-type guessing fails, it will default to `application/octet-stream`.
@@ -81,43 +281,71 @@ Since html2rss does no further inspection of the enclosure, the support of this
81
281
  Read the [RSS 2.0 spec](http://www.rssboard.org/rss-profile#element-channel-item-enclosure) for further information on enclosing content.
82
282
 
83
283
  <details>
84
- <summary>See a YAML config example</summary>
284
+ <summary>See a Ruby example</summary>
285
+
286
+ ```ruby
287
+ Html2rss.feed(
288
+ channel: {},
289
+ selectors: {
290
+ enclosure: { selector: 'img', extractor: 'attribute', attribute: 'src' }
291
+ }
292
+ )
293
+ ```
294
+
295
+ </details>
296
+
297
+ <details>
298
+ <summary>See a YAML feed config example</summary>
85
299
 
86
300
  ```yml
87
301
  channel:
88
- # ... omitted
302
+   # ... omitted
89
303
  selectors:
90
- #... omitted
91
- enclosure:
92
- selector: 'img'
93
- extractor: 'attribute'
94
- attribute: 'src'
304
+   # ... omitted
305
+ enclosure:
306
+ selector: "img"
307
+ extractor: "attribute"
308
+ attribute: "src"
95
309
  ```
96
310
 
97
311
  </details>
98
312
 
99
313
  ## Scraping JSON
100
314
 
101
- Since 0.5.0 it's possible to scrape and process JSON.
315
+ Although this gem is called **html**​*2rss*, it's possible to scrape and process JSON.
102
316
 
103
317
  Adding `json: true` to the channel config will convert the JSON response to XML.
104
318
 
319
+ <details>
320
+ <summary>See a Ruby example</summary>
321
+
322
+ ```ruby
323
+ Html2rss.feed(
324
+ channel: {
325
+ url: 'https://example.com', title: 'Example with JSON', json: true
326
+ },
327
+ selectors: {} # ... omitted
328
+ )
329
+ ```
330
+
331
+ </details>
332
+
105
333
  <details>
106
334
  <summary>See a YAML feed config example</summary>
107
335
 
108
336
  ```yaml
109
337
  channel:
110
338
  url: https://example.com
111
- title: 'Example with JSON'
339
+ title: "Example with JSON"
112
340
  json: true
113
- # ...
341
+ selectors:
342
+   # ... omitted
114
343
  ```
115
344
 
116
345
  </details>
117
346
 
118
- Under the hood it uses ActiveSupport's [`Hash.to_xml`](https://apidock.com/rails/Hash/to_xml) core extension for the JSON to XML conversion.
119
-
120
- ### Conversion of JSON objects
347
+ <details>
348
+ <summary>See example of a converted JSON object</summary>
121
349
 
122
350
  This JSON object:
123
351
 
@@ -127,7 +355,7 @@ This JSON object:
127
355
  }
128
356
  ```
129
357
 
130
- will be converted to:
358
+ converts to:
131
359
 
132
360
  ```xml
133
361
  <hash>
@@ -142,7 +370,12 @@ will be converted to:
142
370
 
143
371
  Your items selector would be `data > datum`, the item's `link` selector would be `url`.
144
372
 
145
- ### Conversion of JSON arrays
373
+ Find further information in [ActiveSupport's `Hash.to_xml` documentation](https://apidock.com/rails/Hash/to_xml).
374
+
375
+ </details>
376
+
377
+ <details>
378
+ <summary>See example of a converted JSON array</summary>
146
379
 
147
380
  This JSON array:
148
381
 
@@ -150,7 +383,7 @@ This JSON array:
150
383
  [{ "title": "Headline", "url": "https://example.com" }]
151
384
  ```
152
385
 
153
- will be converted to:
386
+ converts to:
154
387
 
155
388
  ```xml
156
389
  <objects>
@@ -163,10 +396,38 @@ will be converted to:
163
396
 
164
397
  Your items selector would be `objects > object`, the item's `link` selector would be `url`.
165
398
 
399
+ Find further information in [ActiveSupport's `Array.to_xml` documentation](https://apidock.com/rails/Array/to_xml).
400
+
401
+ </details>
402
+
166
403
  ## Set any HTTP header in the request
167
404
 
168
405
  You can add any HTTP headers to the request to the channel URL.
169
- You can use this to e.g. have Cookie or Authorization information being sent or to overwrite the User-Agent.
406
+ Use this to e.g. have Cookie or Authorization information sent or to spoof the User-Agent.
407
+
408
+ <details>
409
+ <summary>See a Ruby example</summary>
410
+
411
+ ```ruby
412
+ Html2rss.feed(
413
+ channel: {
414
+ url: 'https://example.com',
415
+ title: "Example with http headers",
416
+ headers: {
417
+ "User-Agent" => "html2rss-request",
418
+ "X-Something" => "Foobar",
419
+ "Authorization" => "Token deadbea7",
420
+ "Cookie" => "monster=MeWantCookie"
421
+ }
422
+ },
423
+ selectors: {}
424
+ )
425
+ ```
426
+
427
+ </details>
428
+
429
+ <details>
430
+ <summary>See a YAML feed config example</summary>
170
431
 
171
432
  ```yaml
172
433
  channel:
@@ -177,27 +438,79 @@ channel:
177
438
  "X-Something": "Foobar"
178
439
  "Authorization": "Token deadbea7"
179
440
  "Cookie": "monster=MeWantCookie"
180
- # ...
441
+ selectors:
442
+   # ...
181
443
  ```
182
444
 
183
- The headers provided by the channel will be merged into the global headers.
445
+ </details>
184
446
 
185
- ## Development
447
+ The headers provided by the channel are merged into the global headers.
186
448
 
187
- After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
449
+ ## Usage with a YAML config file
188
450
 
189
- ## Contributing
451
+ This step is not required to work with this gem. If you're using
452
+ [`html2rss-web`](https://github.com/gildesmarais/html2rss-web)
453
+ and want to create your private feed configs, keep on reading!
190
454
 
191
- Bug reports and pull requests are welcome on GitHub at https://github.com/gildesmarais/html2rss.
455
+ First, create your YAML file, e.g. called `config.yml`.
456
+ This file will contain your global config and feed configs.
192
457
 
193
- ## Releasing a new version
458
+ Example:
459
+
460
+ ```yml
461
+ headers:
462
+ 'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1"
463
+ feeds:
464
+ myfeed:
465
+ channel:
466
+ selectors:
467
+ myotherfeed:
468
+ channel:
469
+ selectors:
470
+ ```
471
+
472
+ Your feed configs go below `feeds`. Everything else is part of the global config.
473
+
474
+ Build your feeds like this:
475
+
476
+ ```ruby
477
+ require 'html2rss'
478
+
479
+ myfeed = Html2rss.feed_from_yaml_config('config.yml', 'myfeed')
480
+ myotherfeed = Html2rss.feed_from_yaml_config('config.yml', 'myotherfeed')
481
+ ```
482
+
483
+ Find a full example of a `config.yml` at [`spec/config.test.yml`](https://github.com/gildesmarais/html2rss/blob/master/spec/config.test.yml).
484
+
485
+ ## Gotchas and tips & tricks
486
+
487
+ - Check that the channel URL does not redirect to a mobile page with a different markup structure.
488
+ - Do not rely on your web browser's developer console. html2rss does not execute JavaScript.
489
+ - Fiddling with [`curl`](https://github.com/curl/curl) and [`pup`](https://github.com/ericchiang/pup) to find the selectors seems efficient (`curl URL | pup`).
490
+ - [CSS selectors are quite versatile, here's an overview.](https://www.w3.org/TR/selectors-4/#overview)
491
+
492
+ ## Development
493
+
494
+ After checking out the repository, run `bin/setup` to install dependencies. Then, run `bundle exec rspec` to run the tests.
495
+ You can also run `bin/console` for an interactive prompt that will allow you to experiment.
496
+
497
+ <details>
498
+ <summary>Releasing a new version</summary>
194
499
 
195
500
  1. `git pull`
196
501
  2. increase version in `lib/html2rss/version.rb`
197
502
  3. `bundle`
198
- 4. commit the changes
199
- 5. `git tag v....`
200
- 6. [`standard-changelog -f`](https://github.com/conventional-changelog/conventional-changelog/tree/master/packages/standard-changelog)
201
- 7. `git add CHANGELOG.md && git commit --amend`
202
- 8. `git tag v.... -f`
203
- 9. `git push && git push --tags`
503
+ 4. `git add Gemfile.lock lib/html2rss/version.rb`
504
+ 5. `VERSION=$(ruby -e 'require "./lib/html2rss/version.rb"; puts Html2rss::VERSION')`
505
+ 6. `git commit -m "chore: release $VERSION"`
506
+ 7. `git tag v$VERSION`
507
+ 8. [`standard-changelog -f`](https://github.com/conventional-changelog/conventional-changelog/tree/master/packages/standard-changelog)
508
+ 9. `git add CHANGELOG.md && git commit --amend`
509
+ 10. `git tag v$VERSION -f`
510
+ 11. `git push && git push --tags`
511
+
512
+ </details>
513
+
514
+ ## Contributing
515
+
516
+ Bug reports and pull requests are welcome on GitHub at https://github.com/gildesmarais/html2rss.
@@ -34,6 +34,7 @@ Gem::Specification.new do |spec|
34
34
  spec.add_dependency 'faraday', '~> 0.15'
35
35
  spec.add_dependency 'faraday_middleware', '~> 0.13'
36
36
  spec.add_dependency 'hashie', '~> 3.6'
37
+ spec.add_dependency 'kramdown'
37
38
  spec.add_dependency 'mime-types', '> 3.0'
38
39
  spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
39
40
  spec.add_dependency 'reverse_markdown', '~> 1.3'
@@ -4,9 +4,7 @@ module Html2rss
4
4
  module AttributePostProcessors
5
5
  def self.get_processor(name)
6
6
  @get_processor ||= Hash.new do |processors, key|
7
- camel_cased_name = key.split('_').map(&:capitalize).join
8
- class_name = ['Html2rss', 'AttributePostProcessors', camel_cased_name].join('::')
9
- processors[key] = Object.const_get(class_name)
7
+ processors[key] = Utils.get_class_from_name(key, 'AttributePostProcessors')
10
8
  end
11
9
 
12
10
  @get_processor[name]
@@ -12,15 +12,17 @@ module Html2rss
12
12
  # title:
13
13
  # selector: h1
14
14
  # post_process:
15
- # name: gsub
16
- # pattern: boo
17
- # replacement: baz
15
+ # name: gsub
16
+ # pattern: boo
17
+ # replacement: baz
18
18
  #
19
19
  # Would return:
20
20
  # 'Foo bar and baz'
21
21
  #
22
22
  # `pattern` can be a Regexp or a String.
23
+ #
23
24
  # `replacement` can be a String or a Hash.
25
+ #
24
26
  # See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
25
27
  class Gsub
26
28
  def initialize(value, env)
@@ -0,0 +1,48 @@
1
+ require 'kramdown'
2
+
3
+ module Html2rss
4
+ module AttributePostProcessors
5
+ ##
6
+ # Generates HTML from Markdown.
7
+ #
8
+ # It's particularly useful in conjunction with the Template post processor
9
+ # to generate a description from other selectors.
10
+ #
11
+ # YAML usage example:
12
+ #
13
+ # selectors:
14
+ # description:
15
+ # selector: section
16
+ # post_process:
17
+ # - name: template
18
+ # string: |
19
+ # # %s
20
+ #
21
+ # Price: %s
22
+ # methods:
23
+ # - self
24
+ # - price
25
+ # - name: markdown_to_html
26
+ #
27
+ # Would e.g. return:
28
+ #
29
+ # <h1>Section</h1>
30
+ #
31
+ # <p>Price: 12.34</p>
32
+ class MarkdownToHtml
33
+ def initialize(value, env)
34
+ @value = value
35
+ @env = env
36
+ end
37
+
38
+ ##
39
+ # @return [String] sanitized HTML generated from the Markdown value
40
+ def get
41
+ SanitizeHtml.new(
42
+ Kramdown::Document.new(@value).to_html,
43
+ @env
44
+ ).get
45
+ end
46
+ end
47
+ end
48
+ end
@@ -17,12 +17,12 @@ module Html2rss
17
17
  # selector: span
18
18
  # post_process:
19
19
  # name: 'parse_time'
20
+ # time_zone: 'Europe/Berlin'
20
21
  #
21
22
  # Would return:
22
23
  # "Tue, 02 Jul 2019 00:00:00 +0200"
23
24
  #
24
25
  # It uses {https://ruby-doc.org/stdlib-2.5.3/libdoc/time/rdoc/Time.html#method-c-parse Time.parse}.
25
- # As of now it ignores time zones and always falls back to the UTC time zone.
26
26
  class ParseTime
27
27
  def initialize(value, env)
28
28
  @value = value.to_s
@@ -15,6 +15,7 @@ module Html2rss
15
15
  # extractor: text
16
16
  # post_process:
17
17
  # name: parse_uri
18
+ #
18
19
  # Would return:
19
20
  # 'http://why-not-use-a-link.uh'
20
21
  class ParseUri
@@ -4,10 +4,16 @@ module Html2rss
4
4
  module AttributePostProcessors
5
5
  ##
6
6
  # Returns sanitized HTML code as String.
7
- # Adds
8
7
  #
9
- # - rel="nofollow noopener noreferrer" to a elements
10
- # - referrer-policy='no-referrer' to img elements
8
+ # It adds:
9
+ #
10
+ # - `rel="nofollow noopener noreferrer"` to <a> tags
11
+ # - `referrer-policy='no-referrer'` to <img> tags
12
+ #
13
+ # It also:
14
+ #
15
+ # - wraps all <img> tags, whose direct parent is not an <a>, into an <a>
16
+ # linking to the <img>'s `src`.
11
17
  #
12
18
  # Imagine this HTML structure:
13
19
  #
@@ -17,15 +23,11 @@ module Html2rss
17
23
  # <script>alert();</script>
18
24
  # </section>
19
25
  #
20
- # It also:
21
- #
22
- # - wraps all <img> tags, whose direct parent is not an <a>, into an <a>
23
- #
24
26
  # YAML usage example:
25
27
  #
26
28
  # selectors:
27
29
  # description:
28
- # selector: section
30
+ # selector: '.section'
29
31
  # extractor: html
30
32
  # post_process:
31
33
  # name: sanitize_html
@@ -2,9 +2,15 @@ module Html2rss
2
2
  module AttributePostProcessors
3
3
  ## Returns a defined part of a String.
4
4
  #
5
+ # Both parameters must be an Integer and they can be negative.
5
6
  # The +end+ parameter can be omitted, in that case it will not cut the
6
7
  # String at the end.
7
8
  #
9
+ # A Regexp or a MatchString is not supported.
10
+ #
11
+ # See the [`String#[]`](https://ruby-doc.org/core/String.html#method-i-5B-5D)
12
+ # documentation for more information.
13
+ #
8
14
  # Imagine this HTML:
9
15
  # <h1>Foo bar and baz</h1>
10
16
  #
@@ -13,9 +19,9 @@ module Html2rss
13
19
  # title:
14
20
  # selector: h1
15
21
  # post_process:
16
- # name: substring
17
- # start: 4
18
- # end: 6
22
+ # name: substring
23
+ # start: 4
24
+ # end: 6
19
25
  #
20
26
  # Would return:
21
27
  # 'bar'
@@ -4,7 +4,8 @@ module Html2rss
4
4
  module AttributePostProcessors
5
5
  ## Returns a formatted String according to the string pattern.
6
6
  #
7
- # If +self+ is given as a method, the extracted value will be used.
7
+ # If +self+ is used, the selector's extracted value will be used.
8
+ # It uses [Kernel#format](https://ruby-doc.org/core/Kernel.html#method-i-format).
8
9
  #
9
10
  # Imagine this HTML:
10
11
  # <li>
@@ -22,11 +23,8 @@ module Html2rss
22
23
  # title:
23
24
  # selector: h1
24
25
  # post_process:
25
- # name: template
26
- # string: '%s (%s)'
27
- # methods:
28
- # - self
29
- # - price
26
+ # name: template
27
+ # string: '%{self} (%{price})'
30
28
  #
31
29
  # Would return:
32
30
  # 'Product (23,42€)'
@@ -38,10 +36,16 @@ module Html2rss
38
36
  end
39
37
 
40
38
  ##
41
- # - uses {http://ruby-doc.org/core-2.6.3/String.html#method-i-25 String#%}
42
39
  # @return [String]
43
40
  def get
44
- string % methods
41
+ if @options['methods']
42
+ string % methods
43
+ else
44
+ names = string.scan(/%[<|{](\w*)[>|}]/).flatten
45
+ names.uniq!
46
+
47
+ format(string, names.map { |name| [name.to_sym, item_value(name)] }.to_h)
48
+ end
45
49
  end
46
50
 
47
51
  private
@@ -51,9 +55,11 @@ module Html2rss
51
55
  end
52
56
 
53
57
  def methods
54
- @methods ||= @options['methods'].map do |method|
55
- method == 'self' ? @value.to_s : @item.public_send(method.to_sym).to_s
56
- end
58
+ @methods ||= @options['methods'].map(&method(:item_value))
59
+ end
60
+
61
+ def item_value(method_name)
62
+ method_name.to_s == 'self' ? @value.to_s : @item.public_send(method_name.to_sym).to_s
57
63
  end
58
64
  end
59
65
  end
@@ -2,13 +2,11 @@ module Html2rss
2
2
  ##
3
3
  # Provides a namespace for item extractors.
4
4
  module ItemExtractors
5
- DEFAULT = 'Text'.freeze
5
+ DEFAULT = 'text'.freeze
6
6
 
7
7
  def self.get_extractor(name)
8
8
  @get_extractor ||= Hash.new do |extractors, key|
9
- camel_cased_name = (key || DEFAULT).split('_').map(&:capitalize).join
10
- class_name = ['Html2rss', 'ItemExtractors', camel_cased_name].join('::')
11
- extractors[key] = Object.const_get(class_name)
9
+ extractors[key] = Utils.get_class_from_name(key || DEFAULT, 'ItemExtractors')
12
10
  end
13
11
 
14
12
  @get_extractor[name]
@@ -30,5 +30,11 @@ module Html2rss
30
30
  def self.hash_to_xml(hash)
31
31
  hash.to_xml(skip_instruct: true, skip_types: true)
32
32
  end
33
+
34
+ def self.get_class_from_name(snake_cased_name, module_name)
35
+ camel_cased_name = snake_cased_name.split('_').map(&:capitalize).join
36
+ class_name = ['Html2rss', module_name, camel_cased_name].join('::')
37
+ Object.const_get(class_name)
38
+ end
33
39
  end
34
40
  end
@@ -1,3 +1,3 @@
1
1
  module Html2rss
2
- VERSION = '0.7.0'.freeze
2
+ VERSION = '0.8.0'.freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-10-28 00:00:00.000000000 Z
11
+ date: 2019-11-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
82
  version: '3.6'
83
+ - !ruby/object:Gem::Dependency
84
+ name: kramdown
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: mime-types
85
99
  requirement: !ruby/object:Gem::Requirement
@@ -321,6 +335,7 @@ files:
321
335
  - lib/html2rss/attribute_post_processors.rb
322
336
  - lib/html2rss/attribute_post_processors/gsub.rb
323
337
  - lib/html2rss/attribute_post_processors/html_to_markdown.rb
338
+ - lib/html2rss/attribute_post_processors/markdown_to_html.rb
324
339
  - lib/html2rss/attribute_post_processors/parse_time.rb
325
340
  - lib/html2rss/attribute_post_processors/parse_uri.rb
326
341
  - lib/html2rss/attribute_post_processors/sanitize_html.rb