html2rss 0.7.0 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 645fe7ea0ebe4a733c9833e8c22a93aa3f2d2b3b8589160fb001ad6ffd4659d3
4
- data.tar.gz: 0fb454258a1f334243984bdb73a699ffcfc676c529d87bb0886fbba75127b7cf
3
+ metadata.gz: 38b547bdfe0799d71348690ae20d29d74d88c3cbb0828cdd807a382dec14f895
4
+ data.tar.gz: 8e9ae611d6bbf0174bd9038c3aa7a7b214ec8ffa152e990b2224e98d5022bec0
5
5
  SHA512:
6
- metadata.gz: 350f0fd0e11bd35963c89b56aea9eb1dc90ae72956d849fd85b0b67d39e0bb7625176558becc0712f5e573a7605b541dea6497557699f727e4c27ef510d6d58f
7
- data.tar.gz: e83905ddf24fc4c63793392cd47aa2d02e984c44ba489610119cff6a25bb70be57f4a95f5b9fb78229d9dc6366e0181d2322743eeee8ba59ee840831f8790bb1
6
+ metadata.gz: c0c4dc94d9e339d054ec4ee1c586ac21355cdfa1c9664dd05ef01ca474668f7e483baec35bb8abfd1971eff296b7b5c270816c992ec9fc6abd51a89a7ce28000
7
+ data.tar.gz: 3f45fb28e10360055ead9b757b4972bd2c52ad3285b94b506c3a6ce66b7bd5ec6364da63cc83b066fab6bc69396ed7408108536b5b9178d4a4a34c28436b4d2d
@@ -1,4 +1,14 @@
1
- # [](https://github.com/gildesmarais/html2rss/compare/v0.7.0...v) (2019-10-28)
1
+ # [](https://github.com/gildesmarais/html2rss/compare/v0.8.0...v) (2019-11-02)
2
+
3
+
4
+
5
+ # [0.8.0](https://github.com/gildesmarais/html2rss/compare/v0.7.0...v0.8.0) (2019-11-02)
6
+
7
+
8
+ ### Features
9
+
10
+ * **post_processors:** add markdown to html ([#54](https://github.com/gildesmarais/html2rss/issues/54)) ([cdf77b8](https://github.com/gildesmarais/html2rss/commit/cdf77b8))
11
+ * **post_processors:** support annotated tokens ([#62](https://github.com/gildesmarais/html2rss/issues/62)) ([b57bd7b](https://github.com/gildesmarais/html2rss/commit/b57bd7b)), closes [#56](https://github.com/gildesmarais/html2rss/issues/56)
2
12
 
3
13
 
4
14
 
data/Gemfile CHANGED
@@ -2,5 +2,7 @@ source 'https://rubygems.org'
2
2
 
3
3
  git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
4
4
 
5
+ gem 'coveralls', require: false
6
+
5
7
  # Specify your gem's dependencies in html2rss.gemspec
6
8
  gemspec
@@ -1,12 +1,13 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html2rss (0.7.0)
4
+ html2rss (0.8.0)
5
5
  activesupport (~> 5.0)
6
6
  builder
7
7
  faraday (~> 0.15)
8
8
  faraday_middleware (~> 0.13)
9
9
  hashie (~> 3.6)
10
+ kramdown
10
11
  mime-types (> 3.0)
11
12
  nokogiri (>= 1.10, < 2.0)
12
13
  reverse_markdown (~> 1.3)
@@ -26,6 +27,12 @@ GEM
26
27
  builder (3.2.3)
27
28
  byebug (11.0.1)
28
29
  concurrent-ruby (1.1.5)
30
+ coveralls (0.7.2)
31
+ multi_json (~> 1.3)
32
+ rest-client (= 1.6.7)
33
+ simplecov (>= 0.7)
34
+ term-ansicolor (= 1.2.2)
35
+ thor (= 0.18.1)
29
36
  crass (1.0.5)
30
37
  diff-lcs (1.3)
31
38
  docile (1.3.2)
@@ -38,13 +45,15 @@ GEM
38
45
  concurrent-ruby (~> 1.0)
39
46
  jaro_winkler (1.5.3)
40
47
  json (2.2.0)
48
+ kramdown (2.1.0)
41
49
  mime-types (3.3)
42
50
  mime-types-data (~> 3.2015)
43
51
  mime-types-data (3.2019.1009)
44
52
  mini_portile2 (2.4.0)
45
- minitest (5.12.2)
53
+ minitest (5.13.0)
54
+ multi_json (1.14.1)
46
55
  multipart-post (2.1.1)
47
- nokogiri (1.10.4)
56
+ nokogiri (1.10.5)
48
57
  mini_portile2 (~> 2.4.0)
49
58
  nokogumbo (2.0.1)
50
59
  nokogiri (~> 1.8, >= 1.8.4)
@@ -52,6 +61,8 @@ GEM
52
61
  parser (2.6.5.0)
53
62
  ast (~> 2.4.0)
54
63
  rainbow (3.0.0)
64
+ rest-client (1.6.7)
65
+ mime-types (>= 1.16)
55
66
  reverse_markdown (1.3.0)
56
67
  nokogiri
57
68
  rspec (3.9.0)
@@ -88,14 +99,18 @@ GEM
88
99
  json (>= 1.8, < 3)
89
100
  simplecov-html (~> 0.10.0)
90
101
  simplecov-html (0.10.2)
102
+ term-ansicolor (1.2.2)
103
+ tins (~> 0.8)
104
+ thor (0.18.1)
91
105
  thread_safe (0.3.6)
106
+ tins (0.13.2)
92
107
  to_regexp (0.2.1)
93
108
  tzinfo (1.2.5)
94
109
  thread_safe (~> 0.1)
95
110
  unicode-display_width (1.6.0)
96
111
  vcr (5.0.0)
97
112
  yard (0.9.20)
98
- zeitwerk (2.2.0)
113
+ zeitwerk (2.2.1)
99
114
 
100
115
  PLATFORMS
101
116
  ruby
@@ -103,6 +118,7 @@ PLATFORMS
103
118
  DEPENDENCIES
104
119
  bundler (~> 1.16)
105
120
  byebug
121
+ coveralls
106
122
  html2rss!
107
123
  rspec (~> 3.0)
108
124
  rubocop
data/README.md CHANGED
@@ -2,27 +2,44 @@
2
2
 
3
3
  [![Build Status](https://travis-ci.org/gildesmarais/html2rss.svg?branch=master)](https://travis-ci.org/gildesmarais/html2rss)
4
4
  [![Gem Version](https://badge.fury.io/rb/html2rss.svg)](http://rubygems.org/gems/html2rss/)
5
- [API docs on RubyDoc.info](https://www.rubydoc.info/gems/html2rss)
5
+ [![Coverage Status](https://coveralls.io/repos/github/gildesmarais/html2rss/badge.svg?branch=master)](https://coveralls.io/github/gildesmarais/html2rss?branch=master)
6
+ [![Yard Docs](http://img.shields.io/badge/yard-docs-blue.svg)](https://www.rubydoc.info/gems/html2rss)
7
+ ![Retro Badge: valid RSS](https://validator.w3.org/feed/images/valid-rss-rogers.png)
6
8
 
7
- Request HTML from an URL and transform it to a Ruby RSS 2.0 object.
9
+ **Searching for a ready to use app which serves generated feeds via HTTP?**
10
+ [Head over to `html2rss-web`!](https://github.com/gildesmarais/html2rss-web)
8
11
 
9
- **Are you searching for a ready to use "website to RSS" solution?**
10
- [Check out `html2rss-web`!](https://github.com/gildesmarais/html2rss-web)
12
+ This Ruby gem builds RSS 2.0 feeds from a _feed config_.
11
13
 
12
- Each website needs a _feed config_ which contains the URL to scrape and
13
- CSS selectors to extract the required information (like title, URL, ...).
14
- This gem provides [extractors](https://github.com/gildesmarais/html2rss/blob/master/lib/html2rss/item_extractors) (e.g. extract the information from an HTML attribute)
15
- and chainable [post processors](https://github.com/gildesmarais/html2rss/tree/master/lib/html2rss/attribute_post_processors) to make information retrieval even easier.
14
+ With the _feed config_ containing the URL to scrape and
15
+ CSS selectors for information extraction (like title, URL, ...), your RSS feed gets built.
16
+ [Extractors](#using-extractors) and chain-able [post processors](#using-post-processors)
17
+ make information extraction, processing and sanitizing a breeze.
18
+ [Scraping JSON](#scraping-json) responses and
19
+ [setting HTTP request headers](#set-any-http-header-in-the-request) is
20
+ supported, too.
16
21
 
17
22
  ## Installation
18
23
 
19
- Add this line to your application's Gemfile: `gem 'html2rss'`
20
- Then execute: `bundle`
24
+ | 🤩 Like it? | Star it! ⭐️ |
25
+ | ---------------------------------------------: | -------------------- |
26
+ | Add this line to your application's `Gemfile`: | `gem 'html2rss'` |
27
+ | Then execute: | `bundle` |
28
+ | In your code: | `require 'html2rss'` |
29
+
30
+ ## Building a feed config
31
+
32
+ Here's a minimal working example:
21
33
 
22
34
  ```ruby
35
+ require 'html2rss'
36
+
23
37
  rss =
24
38
  Html2rss.feed(
25
- channel: { title: 'StackOverflow: Hot Network Questions', url: 'https://stackoverflow.com/questions' },
39
+ channel: {
40
+ title: 'StackOverflow: Hot Network Questions',
41
+ url: 'https://stackoverflow.com/questions'
42
+ },
26
43
  selectors: {
27
44
  items: { selector: '#hot-network-questions > ul > li' },
28
45
  title: { selector: 'a' },
@@ -30,35 +47,218 @@ rss =
30
47
  }
31
48
  )
32
49
 
33
- puts rss.to_s
50
+ puts rss
34
51
  ```
35
52
 
36
- ## Usage with a YAML config file
53
+ A _feed config_ consists of a `channel` and a `selectors` Hash.
54
+ The contents of both hashes are explained below.
55
+
56
+ **Looks too complicated?** See [`html2rss-configs`](https://github.com/gildesmarais/html2rss-configs) for ready-made feed configs!
57
+
58
+ ### The `channel`
59
+
60
+ | attribute | | type | remark |
61
+ | ------------- | -------- | ------- | ----------------------- |
62
+ | `title` | required | String | |
63
+ | `url` | required | String | |
64
+ | `ttl` | optional | Integer | time to live in minutes |
65
+ | `description` | optional | String | |
66
+ | `headers` | optional | Hash | See notes below. |
67
+
68
+ ### The `selectors`
69
+
70
+ You must provide an `items` selector hash which contains the CSS selector.
71
+ `items` needs to return a collection of HTML tags.
72
+ The other selectors are scoped to the tags of the items' collection.
73
+
74
+ To build a
75
+ [valid RSS 2.0 item](http://www.rssboard.org/rss-profile#element-channel-item)
76
+ each item has to have at least a `title` or a `description`.
77
+
78
+ Your `selectors` can contain arbitrary selector names, but only these
79
+ will make it into the RSS feed:
80
+
81
+ | RSS 2.0 tag | name in html2rss | remark |
82
+ | ------------- | ---------------- | --------------------------- |
83
+ | `title` | `title` | |
84
+ | `description` | `description` | Supports HTML. |
85
+ | `link` | `link` | A URL. |
86
+ | `author` | `author` | |
87
+ | `category` | `categories` | See notes below. |
88
+ | `enclosure` | `enclosure` | See notes below. |
89
+ | `pubDate` | `update` | An instance of `Time`. |
90
+ | `guid` | `guid` | Generated from the `title`. |
91
+ | `comments` | `comments` | A URL. |
92
+ | `source` | ~~source~~ | Not yet supported. |
93
+
94
+ ### The `selector` hash
95
+
96
+ Your selector hash can have these attributes:
97
+
98
+ | name | value |
99
+ | -------------- | -------------------------------------------------------- |
100
+ | `selector` | The CSS selector to select the tag with the information. |
101
+ | `extractor` | Name of the extractor. See notes below. |
102
+ | `post_process` | A hash or array of hashes. See notes below. |
103
+
104
+ ## Using extractors
105
+
106
+ Extractors help with extracting the information from the selected HTML tag.
107
+
108
+ - The default extractor is `text`, which returns the tag's inner text.
109
+ - The `html` extractor returns the tag's outer HTML.
110
+ - The `href` extractor returns a URL from the tag's `href` attribute and corrects relative ones to absolute ones.
111
+ - The `attribute` extractor returns the value of that tag's attribute.
112
+ - The `static` extractor returns the configured static value (it doesn't extract anything).
113
+ - [See file list of extractors](https://github.com/gildesmarais/html2rss/tree/master/lib/html2rss/item_extractors).
114
+
115
+ Extractors can require additional attributes on the selector hash.
116
+ 👉 [Read their docs for usage examples](https://www.rubydoc.info/gems/html2rss/Html2rss/ItemExtractors).
117
+
118
+ <details>
119
+ <summary>See a Ruby example</summary>
120
+
121
+ ```ruby
122
+ Html2rss.feed(
123
+ channel: {}, selectors: { link: { selector: 'a', extractor: 'href' } }
124
+ )
125
+ ```
126
+
127
+ </details>
128
+
129
+ <details>
130
+ <summary>See a YAML feed config example</summary>
131
+
132
+ ```yml
133
+ channel:
134
+   # ... omitted
135
+ selectors:
136
+   # ... omitted
137
+ link:
138
+ selector: 'a'
139
+ extractor: 'href'
140
+ ```
141
+
142
+ </details>
143
+
144
+ ## Using post processors
145
+
146
+ Extracted information can be further manipulated with post processors.
147
+
148
+ | name | |
149
+ | ------------------ | ------------------------------------------------------------------------------------- |
150
+ | `gsub` | Allows global substitution operations on Strings (Regexp or simple pattern). |
151
+ | `html_to_markdown` | HTML to Markdown, using [reverse_markdown](https://github.com/xijo/reverse_markdown). |
152
+ | `markdown_to_html` | converts Markdown to HTML, using [kramdown](https://github.com/gettalong/kramdown). |
153
+ | `parse_time` | Parses a String containing a time in a time zone. |
154
+ | `parse_uri` | Parses a String as URL. |
155
+ | `sanitize_html` | Strips unsafe and unneeded HTML and adds security-related attributes. |
156
+ | `substring` | Cuts a part off of a String, starting at a position. |
157
+ | `template` | Based on a template, it creates a new String filled with other selectors' values. |
158
+
159
+ ⚠️ Always make use of the `sanitize_html` post processor for HTML content. _Never trust the internet!_ ⚠️
37
160
 
38
- Create a YAML config file. Find an example at [`spec/config.test.yml`](https://github.com/gildesmarais/html2rss/blob/master/spec/config.test.yml).
161
+ - [See file list of post processors](https://github.com/gildesmarais/html2rss/tree/master/lib/html2rss/attribute_post_processors).
39
162
 
40
- `Html2rss.feed_from_yaml_config(File.join(['spec', 'config.test.yml']), 'nuxt-releases')`
41
- returns an `RSS:Rss` object.
163
+ 👉 [Read their docs for usage examples.](https://www.rubydoc.info/gems/html2rss/Html2rss/AttributePostProcessors)
164
+
165
+ <details>
166
+ <summary>See a Ruby example</summary>
167
+
168
+ ```ruby
169
+ Html2rss.feed(
170
+ channel: {},
171
+ selectors: {
172
+ description: {
173
+ selector: '.content', post_process: { name: 'sanitize_html' }
174
+ }
175
+ }
176
+ )
177
+ ```
42
178
 
43
- **Too complicated?** See [`html2rss-configs`](https://github.com/gildesmarais/html2rss-configs) for ready-made feed configs!
179
+ </details>
44
180
 
45
- ## Assigning categories to an item
181
+ <details>
182
+ <summary>See a YAML feed config example</summary>
183
+
184
+ ```yml
185
+ channel:
186
+   # ... omitted
187
+ selectors:
188
+   # ... omitted
189
+ description:
190
+ selector: '.content'
191
+ post_process:
192
+ - name: sanitize_html
193
+ ```
194
+
195
+ </details>
196
+
197
+ ### Chaining post processors
198
+
199
+ Pass an array to `post_process` to chain the post processors.
200
+
201
+ <details>
202
+ <summary>YAML example: build the description from a template String (in Markdown) and convert that Markdown to HTML</summary>
203
+
204
+ ```yml
205
+ channel:
206
+   # ... omitted
207
+ selectors:
208
+   # ... omitted
209
+ price:
210
+ selector: '.price'
211
+ description:
212
+ selector: '.section'
213
+ post_process:
214
+ - name: template
215
+ string: |
216
+ # %{self}
217
+
218
+ Price: %{price}
219
+ - name: markdown_to_html
220
+ ```
221
+
222
+ Note the use of `|` for a multi-line String in YAML.
223
+
224
+ </details>
225
+
226
+ ## Adding `<category>` tags to an item
46
227
 
47
228
  The `categories` selector takes an array of selector names. The value of those
48
- selectors will become a category on the item.
229
+ selectors will become a `<category>` on the RSS item.
230
+
231
+ <details>
232
+ <summary>See a Ruby example</summary>
233
+
234
+ ```ruby
235
+ Html2rss.feed(
236
+ channel: {},
237
+ selectors: {
238
+ genre: {
239
+ # ... omitted
240
+ selector: '.genre'
241
+ },
242
+ branch: { selector: '.branch' },
243
+ categories: %i[genre branch]
244
+ }
245
+ )
246
+ ```
247
+
248
+ </details>
49
249
 
50
250
  <details>
51
- <summary>See a YAML config example</summary>
251
+ <summary>See a YAML feed config example</summary>
52
252
 
53
253
  ```yml
54
254
  channel:
55
- # ... omitted
255
+   # ... omitted
56
256
  selectors:
57
- #... omitted
257
+ # ... omitted
58
258
  genre:
59
- selector: '.genre'
259
+ selector: ".genre"
60
260
  branch:
61
- selector: '.branch'
261
+ selector: ".branch"
62
262
  categories:
63
263
  - genre
64
264
  - branch
@@ -66,13 +266,13 @@ selectors:
66
266
 
67
267
  </details>
68
268
 
69
- ## Adding an enclosure to each item
269
+ ## Adding an `<enclosure>` tag to an item
70
270
 
71
271
  An enclosure can be 'anything', e.g. an image, audio or video file.
72
272
 
73
- The config's `enclosure` selector needs to return a URL of the content to enclose. If the extracted URL is relative, it will be converted to an absolute one using the channel's url as a base.
273
+ The `enclosure` selector needs to return a URL of the content to enclose. If the extracted URL is relative, it will be converted to an absolute one using the channel's URL as base.
74
274
 
75
- Since html2rss does no further inspection of the enclosure, the support of this tag comes with trade-offs:
275
+ Since html2rss does no further inspection of the enclosure, its support comes with trade-offs:
76
276
 
77
277
  1. The content-type is guessed from the file extension of the URL.
78
278
  2. If the content-type guessing fails, it will default to `application/octet-stream`.
@@ -81,43 +281,71 @@ Since html2rss does no further inspection of the enclosure, the support of this
81
281
  Read the [RSS 2.0 spec](http://www.rssboard.org/rss-profile#element-channel-item-enclosure) for further information on enclosing content.
82
282
 
83
283
  <details>
84
- <summary>See a YAML config example</summary>
284
+ <summary>See a Ruby example</summary>
285
+
286
+ ```ruby
287
+ Html2rss.feed(
288
+ channel: {},
289
+ selectors: {
290
+ enclosure: { selector: 'img', extractor: 'attribute', attribute: 'src' }
291
+ }
292
+ )
293
+ ```
294
+
295
+ </details>
296
+
297
+ <details>
298
+ <summary>See a YAML feed config example</summary>
85
299
 
86
300
  ```yml
87
301
  channel:
88
- # ... omitted
302
+   # ... omitted
89
303
  selectors:
90
- #... omitted
91
- enclosure:
92
- selector: 'img'
93
- extractor: 'attribute'
94
- attribute: 'src'
304
+   # ... omitted
305
+ enclosure:
306
+ selector: "img"
307
+ extractor: "attribute"
308
+ attribute: "src"
95
309
  ```
96
310
 
97
311
  </details>
98
312
 
99
313
  ## Scraping JSON
100
314
 
101
- Since 0.5.0 it's possible to scrape and process JSON.
315
+ Although this gem is called **html**​*2rss*, it's possible to scrape and process JSON.
102
316
 
103
317
  Adding `json: true` to the channel config will convert the JSON response to XML.
104
318
 
319
+ <details>
320
+ <summary>See a Ruby example</summary>
321
+
322
+ ```ruby
323
+ Html2rss.feed(
324
+ channel: {
325
+ url: 'https://example.com', title: 'Example with JSON', json: true
326
+ },
327
+ selectors: {} # ... omitted
328
+ )
329
+ ```
330
+
331
+ </details>
332
+
105
333
  <details>
106
334
  <summary>See a YAML feed config example</summary>
107
335
 
108
336
  ```yaml
109
337
  channel:
110
338
  url: https://example.com
111
- title: 'Example with JSON'
339
+ title: "Example with JSON"
112
340
  json: true
113
- # ...
341
+ selectors:
342
+   # ... omitted
114
343
  ```
115
344
 
116
345
  </details>
117
346
 
118
- Under the hood it uses ActiveSupport's [`Hash.to_xml`](https://apidock.com/rails/Hash/to_xml) core extension for the JSON to XML conversion.
119
-
120
- ### Conversion of JSON objects
347
+ <details>
348
+ <summary>See example of a converted JSON object</summary>
121
349
 
122
350
  This JSON object:
123
351
 
@@ -127,7 +355,7 @@ This JSON object:
127
355
  }
128
356
  ```
129
357
 
130
- will be converted to:
358
+ converts to:
131
359
 
132
360
  ```xml
133
361
  <hash>
@@ -142,7 +370,12 @@ will be converted to:
142
370
 
143
371
  Your items selector would be `data > datum`, the item's `link` selector would be `url`.
144
372
 
145
- ### Conversion of JSON arrays
373
+ Find further information in [ActiveSupport's `Hash.to_xml` documentation](https://apidock.com/rails/Hash/to_xml).
374
+
375
+ </details>
376
+
377
+ <details>
378
+ <summary>See example of a converted JSON array</summary>
146
379
 
147
380
  This JSON array:
148
381
 
@@ -150,7 +383,7 @@ This JSON array:
150
383
  [{ "title": "Headline", "url": "https://example.com" }]
151
384
  ```
152
385
 
153
- will be converted to:
386
+ converts to:
154
387
 
155
388
  ```xml
156
389
  <objects>
@@ -163,10 +396,38 @@ will be converted to:
163
396
 
164
397
  Your items selector would be `objects > object`, the item's `link` selector would be `url`.
165
398
 
399
+ Find further information in [ActiveSupport's `Array.to_xml` documentation](https://apidock.com/rails/Array/to_xml).
400
+
401
+ </details>
402
+
166
403
  ## Set any HTTP header in the request
167
404
 
168
405
  You can add any HTTP headers to the request to the channel URL.
169
- You can use this to e.g. have Cookie or Authorization information being sent or to overwrite the User-Agent.
406
+ Use this to e.g. have Cookie or Authorization information sent or to spoof the User-Agent.
407
+
408
+ <details>
409
+ <summary>See a Ruby example</summary>
410
+
411
+ ```ruby
412
+ Html2rss.feed(
413
+ channel: {
414
+ url: 'https://example.com',
415
+ title: "Example with http headers",
416
+ headers: {
417
+ "User-Agent" => "html2rss-request",
418
+ "X-Something" => "Foobar",
419
+ "Authorization" => "Token deadbea7",
420
+ "Cookie" => "monster=MeWantCookie"
421
+ }
422
+ },
423
+ selectors: {}
424
+ )
425
+ ```
426
+
427
+ </details>
428
+
429
+ <details>
430
+ <summary>See a YAML feed config example</summary>
170
431
 
171
432
  ```yaml
172
433
  channel:
@@ -177,27 +438,79 @@ channel:
177
438
  "X-Something": "Foobar"
178
439
  "Authorization": "Token deadbea7"
179
440
  "Cookie": "monster=MeWantCookie"
180
- # ...
441
+ selectors:
442
+   # ...
181
443
  ```
182
444
 
183
- The headers provided by the channel will be merged into the global headers.
445
+ </details>
184
446
 
185
- ## Development
447
+ The headers provided by the channel are merged into the global headers.
186
448
 
187
- After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
449
+ ## Usage with a YAML config file
188
450
 
189
- ## Contributing
451
+ This step is not required to work with this gem. If you're using
452
+ [`html2rss-web`](https://github.com/gildesmarais/html2rss-web)
453
+ and want to create your private feed configs, keep on reading!
190
454
 
191
- Bug reports and pull requests are welcome on GitHub at https://github.com/gildesmarais/html2rss.
455
+ First, create your YAML file, e.g. called `config.yml`.
456
+ This file will contain your global config and feed configs.
192
457
 
193
- ## Releasing a new version
458
+ Example:
459
+
460
+ ```yml
461
+ headers:
462
+ 'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1"
463
+ feeds:
464
+ myfeed:
465
+ channel:
466
+ selectors:
467
+ myotherfeed:
468
+ channel:
469
+ selectors:
470
+ ```
471
+
472
+ Your feed configs go below `feeds`. Everything else is part of the global config.
473
+
474
+ Build your feeds like this:
475
+
476
+ ```ruby
477
+ require 'html2rss'
478
+
479
+ myfeed = Html2rss.feed_from_yaml_config('config.yml', 'myfeed')
480
+ myotherfeed = Html2rss.feed_from_yaml_config('config.yml', 'myotherfeed')
481
+ ```
482
+
483
+ Find a full example of a `config.yml` at [`spec/config.test.yml`](https://github.com/gildesmarais/html2rss/blob/master/spec/config.test.yml).
484
+
485
+ ## Gotchas and tips & tricks
486
+
487
+ - Check that the channel URL does not redirect to a mobile page with a different markup structure.
488
+ - Do not rely on your web browser's developer console. html2rss does not execute JavaScript.
489
+ - Fiddling with [`curl`](https://github.com/curl/curl) and [`pup`](https://github.com/ericchiang/pup) to find the selectors seems efficient (`curl URL | pup`).
490
+ - [CSS selectors are quite versatile, here's an overview.](https://www.w3.org/TR/selectors-4/#overview)
491
+
492
+ ## Development
493
+
494
+ After checking out the repository, run `bin/setup` to install dependencies. Then, run `bundle exec rspec` to run the tests.
495
+ You can also run `bin/console` for an interactive prompt that will allow you to experiment.
496
+
497
+ <details>
498
+ <summary>Releasing a new version</summary>
194
499
 
195
500
  1. `git pull`
196
501
  2. increase version in `lib/html2rss/version.rb`
197
502
  3. `bundle`
198
- 4. commit the changes
199
- 5. `git tag v....`
200
- 6. [`standard-changelog -f`](https://github.com/conventional-changelog/conventional-changelog/tree/master/packages/standard-changelog)
201
- 7. `git add CHANGELOG.md && git commit --amend`
202
- 8. `git tag v.... -f`
203
- 9. `git push && git push --tags`
503
+ 4. `git add Gemfile.lock lib/html2rss/version.rb`
504
+ 5. `VERSION=$(ruby -e 'require "./lib/html2rss/version.rb"; puts Html2rss::VERSION')`
505
+ 6. `git commit -m "chore: release $VERSION"`
506
+ 7. `git tag v$VERSION`
507
+ 8. [`standard-changelog -f`](https://github.com/conventional-changelog/conventional-changelog/tree/master/packages/standard-changelog)
508
+ 9. `git add CHANGELOG.md && git commit --amend`
509
+ 10. `git tag v$VERSION -f`
510
+ 11. `git push && git push --tags`
511
+
512
+ </details>
513
+
514
+ ## Contributing
515
+
516
+ Bug reports and pull requests are welcome on GitHub at https://github.com/gildesmarais/html2rss.
@@ -34,6 +34,7 @@ Gem::Specification.new do |spec|
34
34
  spec.add_dependency 'faraday', '~> 0.15'
35
35
  spec.add_dependency 'faraday_middleware', '~> 0.13'
36
36
  spec.add_dependency 'hashie', '~> 3.6'
37
+ spec.add_dependency 'kramdown'
37
38
  spec.add_dependency 'mime-types', '> 3.0'
38
39
  spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
39
40
  spec.add_dependency 'reverse_markdown', '~> 1.3'
@@ -4,9 +4,7 @@ module Html2rss
4
4
  module AttributePostProcessors
5
5
  def self.get_processor(name)
6
6
  @get_processor ||= Hash.new do |processors, key|
7
- camel_cased_name = key.split('_').map(&:capitalize).join
8
- class_name = ['Html2rss', 'AttributePostProcessors', camel_cased_name].join('::')
9
- processors[key] = Object.const_get(class_name)
7
+ processors[key] = Utils.get_class_from_name(key, 'AttributePostProcessors')
10
8
  end
11
9
 
12
10
  @get_processor[name]
@@ -12,15 +12,17 @@ module Html2rss
12
12
  # title:
13
13
  # selector: h1
14
14
  # post_process:
15
- # name: gsub
16
- # pattern: boo
17
- # replacement: baz
15
+ # name: gsub
16
+ # pattern: boo
17
+ # replacement: baz
18
18
  #
19
19
  # Would return:
20
20
  # 'Foo bar and baz'
21
21
  #
22
22
  # `pattern` can be a Regexp or a String.
23
+ #
23
24
  # `replacement` can be a String or a Hash.
25
+ #
24
26
  # See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
25
27
  class Gsub
26
28
  def initialize(value, env)
@@ -0,0 +1,48 @@
1
+ require 'kramdown'
2
+
3
+ module Html2rss
4
+ module AttributePostProcessors
5
+ ##
6
+ # Generates HTML from Markdown.
7
+ #
8
+ # It's particularly useful in conjunction with the Template post processor
9
+ # to generate a description from other selectors.
10
+ #
11
+ # YAML usage example:
12
+ #
13
+ # selectors:
14
+ # description:
15
+ # selector: section
16
+ # post_process:
17
+ # - name: template
18
+ # string: |
19
+ # # %s
20
+ #
21
+ # Price: %s
22
+ # methods:
23
+ # - self
24
+ # - price
25
+ # - name: markdown_to_html
26
+ #
27
+ # Would e.g. return:
28
+ #
29
+ # <h1>Section</h1>
30
+ #
31
+ # <p>Price: 12.34</p>
32
+ class MarkdownToHtml
33
+ def initialize(value, env)
34
+ @value = value
35
+ @env = env
36
+ end
37
+
38
+ ##
39
+ # @return [String] sanitized HTML generated from the Markdown value
40
+ def get
41
+ SanitizeHtml.new(
42
+ Kramdown::Document.new(@value).to_html,
43
+ @env
44
+ ).get
45
+ end
46
+ end
47
+ end
48
+ end
@@ -17,12 +17,12 @@ module Html2rss
17
17
  # selector: span
18
18
  # post_process:
19
19
  # name: 'parse_time'
20
+ # time_zone: 'Europe/Berlin'
20
21
  #
21
22
  # Would return:
22
23
  # "Tue, 02 Jul 2019 00:00:00 +0200"
23
24
  #
24
25
  # It uses {https://ruby-doc.org/stdlib-2.5.3/libdoc/time/rdoc/Time.html#method-c-parse Time.parse}.
25
- # As of now it ignores time zones and always falls back to the UTC time zone.
26
26
  class ParseTime
27
27
  def initialize(value, env)
28
28
  @value = value.to_s
@@ -15,6 +15,7 @@ module Html2rss
15
15
  # extractor: text
16
16
  # post_process:
17
17
  # name: parse_uri
18
+ #
18
19
  # Would return:
19
20
  # 'http://why-not-use-a-link.uh'
20
21
  class ParseUri
@@ -4,10 +4,16 @@ module Html2rss
4
4
  module AttributePostProcessors
5
5
  ##
6
6
  # Returns sanitized HTML code as String.
7
- # Adds
8
7
  #
9
- # - rel="nofollow noopener noreferrer" to a elements
10
- # - referrer-policy='no-referrer' to img elements
8
+ # It adds:
9
+ #
10
+ # - `rel="nofollow noopener noreferrer"` to <a> tags
11
+ # - `referrer-policy='no-referrer'` to <img> tags
12
+ #
13
+ # It also:
14
+ #
15
+ # - wraps all <img> tags, whose direct parent is not an <a>, into an <a>
16
+ # linking to the <img>'s `src`.
11
17
  #
12
18
  # Imagine this HTML structure:
13
19
  #
@@ -17,15 +23,11 @@ module Html2rss
17
23
  # <script>alert();</script>
18
24
  # </section>
19
25
  #
20
- # It also:
21
- #
22
- # - wraps all <img> tags, whose direct parent is not an <a>, into an <a>
23
- #
24
26
  # YAML usage example:
25
27
  #
26
28
  # selectors:
27
29
  # description:
28
- # selector: section
30
+ # selector: '.section'
29
31
  # extractor: html
30
32
  # post_process:
31
33
  # name: sanitize_html
@@ -2,9 +2,15 @@ module Html2rss
2
2
  module AttributePostProcessors
3
3
  ## Returns a defined part of a String.
4
4
  #
5
+ # Both parameters must be an Integer and they can be negative.
5
6
  # The +end+ parameter can be omitted, in that case it will not cut the
6
7
  # String at the end.
7
8
  #
9
+ # Regexp and match-string arguments are not supported.
10
+ #
11
+ # See the [`String#[]`](https://ruby-doc.org/core/String.html#method-i-5B-5D)
12
+ # documentation for more information.
13
+ #
8
14
  # Imagine this HTML:
9
15
  # <h1>Foo bar and baz</h1>
10
16
  #
@@ -13,9 +19,9 @@ module Html2rss
13
19
  # title:
14
20
  # selector: h1
15
21
  # post_process:
16
- # name: substring
17
- # start: 4
18
- # end: 6
22
+ # name: substring
23
+ # start: 4
24
+ # end: 6
19
25
  #
20
26
  # Would return:
21
27
  # 'bar'
@@ -4,7 +4,8 @@ module Html2rss
4
4
  module AttributePostProcessors
5
5
  ## Returns a formatted String according to the string pattern.
6
6
  #
7
- # If +self+ is given as a method, the extracted value will be used.
7
+ # If +self+ is used, the selector's extracted value will be used.
8
+ # It uses [Kernel#format](https://ruby-doc.org/core/Kernel.html#method-i-format)
8
9
  #
9
10
  # Imagine this HTML:
10
11
  # <li>
@@ -22,11 +23,8 @@ module Html2rss
22
23
  # title:
23
24
  # selector: h1
24
25
  # post_process:
25
- # name: template
26
- # string: '%s (%s)'
27
- # methods:
28
- # - self
29
- # - price
26
+ # name: template
27
+ # string: '%{self} (%{price})'
30
28
  #
31
29
  # Would return:
32
30
  # 'Product (23,42€)'
@@ -38,10 +36,16 @@ module Html2rss
38
36
  end
39
37
 
40
38
  ##
41
- # - uses {http://ruby-doc.org/core-2.6.3/String.html#method-i-25 String#%}
42
39
  # @return [String]
43
40
  def get
44
- string % methods
41
+ if @options['methods']
42
+ string % methods
43
+ else
44
+ names = string.scan(/%[<|{](\w*)[>|}]/).flatten
45
+ names.uniq!
46
+
47
+ format(string, names.map { |name| [name.to_sym, item_value(name)] }.to_h)
48
+ end
45
49
  end
46
50
 
47
51
  private
@@ -51,9 +55,11 @@ module Html2rss
51
55
  end
52
56
 
53
57
  def methods
54
- @methods ||= @options['methods'].map do |method|
55
- method == 'self' ? @value.to_s : @item.public_send(method.to_sym).to_s
56
- end
58
+ @methods ||= @options['methods'].map(&method(:item_value))
59
+ end
60
+
61
+ def item_value(method_name)
62
+ method_name.to_s == 'self' ? @value.to_s : @item.public_send(method_name.to_sym).to_s
57
63
  end
58
64
  end
59
65
  end
@@ -2,13 +2,11 @@ module Html2rss
2
2
  ##
3
3
  # Provides a namespace for item extractors.
4
4
  module ItemExtractors
5
- DEFAULT = 'Text'.freeze
5
+ DEFAULT = 'text'.freeze
6
6
 
7
7
  def self.get_extractor(name)
8
8
  @get_extractor ||= Hash.new do |extractors, key|
9
- camel_cased_name = (key || DEFAULT).split('_').map(&:capitalize).join
10
- class_name = ['Html2rss', 'ItemExtractors', camel_cased_name].join('::')
11
- extractors[key] = Object.const_get(class_name)
9
+ extractors[key] = Utils.get_class_from_name(key || DEFAULT, 'ItemExtractors')
12
10
  end
13
11
 
14
12
  @get_extractor[name]
@@ -30,5 +30,11 @@ module Html2rss
30
30
  def self.hash_to_xml(hash)
31
31
  hash.to_xml(skip_instruct: true, skip_types: true)
32
32
  end
33
+
34
+ def self.get_class_from_name(snake_cased_name, module_name)
35
+ camel_cased_name = snake_cased_name.split('_').map(&:capitalize).join
36
+ class_name = ['Html2rss', module_name, camel_cased_name].join('::')
37
+ Object.const_get(class_name)
38
+ end
33
39
  end
34
40
  end
@@ -1,3 +1,3 @@
1
1
  module Html2rss
2
- VERSION = '0.7.0'.freeze
2
+ VERSION = '0.8.0'.freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-10-28 00:00:00.000000000 Z
11
+ date: 2019-11-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
82
  version: '3.6'
83
+ - !ruby/object:Gem::Dependency
84
+ name: kramdown
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: mime-types
85
99
  requirement: !ruby/object:Gem::Requirement
@@ -321,6 +335,7 @@ files:
321
335
  - lib/html2rss/attribute_post_processors.rb
322
336
  - lib/html2rss/attribute_post_processors/gsub.rb
323
337
  - lib/html2rss/attribute_post_processors/html_to_markdown.rb
338
+ - lib/html2rss/attribute_post_processors/markdown_to_html.rb
324
339
  - lib/html2rss/attribute_post_processors/parse_time.rb
325
340
  - lib/html2rss/attribute_post_processors/parse_uri.rb
326
341
  - lib/html2rss/attribute_post_processors/sanitize_html.rb