html2rss 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +90 -639
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +50 -0
- data/lib/html2rss/auto_source/cleanup.rb +44 -5
- data/lib/html2rss/auto_source/scraper/html.rb +123 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
- data/lib/html2rss/auto_source/scraper.rb +160 -8
- data/lib/html2rss/auto_source.rb +123 -47
- data/lib/html2rss/blocked_surface.rb +65 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +194 -23
- data/lib/html2rss/config/class_methods.rb +178 -0
- data/lib/html2rss/config/dynamic_params.rb +70 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
- data/lib/html2rss/config/request_headers.rb +136 -0
- data/lib/html2rss/config/schema.rb +240 -0
- data/lib/html2rss/config/validator.rb +146 -0
- data/lib/html2rss/config.rb +118 -61
- data/lib/html2rss/error.rb +31 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
- data/lib/html2rss/html_extractor.rb +141 -0
- data/lib/html2rss/html_navigator.rb +54 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +59 -0
- data/lib/html2rss/rendering/audio_renderer.rb +36 -0
- data/lib/html2rss/rendering/description_builder.rb +87 -0
- data/lib/html2rss/rendering/image_renderer.rb +41 -0
- data/lib/html2rss/rendering/media_renderer.rb +37 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
- data/lib/html2rss/rendering/video_renderer.rb +36 -0
- data/lib/html2rss/rendering.rb +23 -0
- data/lib/html2rss/request_controls.rb +123 -0
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +77 -21
- data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
- data/lib/html2rss/request_service/policy.rb +252 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +51 -3
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +50 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +71 -0
- data/lib/html2rss/request_session/runtime_policy.rb +83 -0
- data/lib/html2rss/request_session.rb +122 -0
- data/lib/html2rss/rss_builder/article.rb +187 -0
- data/lib/html2rss/rss_builder/channel.rb +105 -11
- data/lib/html2rss/rss_builder/enclosure.rb +62 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
- data/lib/html2rss/rss_builder.rb +76 -71
- data/lib/html2rss/selectors/config.rb +123 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
- data/lib/html2rss/selectors/extractors/href.rb +55 -0
- data/lib/html2rss/selectors/extractors/html.rb +49 -0
- data/lib/html2rss/selectors/extractors/static.rb +42 -0
- data/lib/html2rss/selectors/extractors/text.rb +47 -0
- data/lib/html2rss/selectors/extractors.rb +53 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
- data/lib/html2rss/selectors/post_processors/base.rb +80 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
- data/lib/html2rss/selectors/post_processors/template.rb +76 -0
- data/lib/html2rss/selectors/post_processors.rb +48 -0
- data/lib/html2rss/selectors.rb +301 -0
- data/lib/html2rss/url.rb +266 -0
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +67 -71
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +551 -0
- metadata +120 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html2rss
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.17.0
|
|
4
|
+
version: 0.19.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Gil Desmarais
|
|
8
8
|
bindir: exe
|
|
9
9
|
cert_chain: []
|
|
10
|
-
date:
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
11
|
dependencies:
|
|
12
12
|
- !ruby/object:Gem::Dependency
|
|
13
13
|
name: addressable
|
|
@@ -23,6 +23,34 @@ dependencies:
|
|
|
23
23
|
- - "~>"
|
|
24
24
|
- !ruby/object:Gem::Version
|
|
25
25
|
version: '2.7'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: brotli
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - ">="
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '0'
|
|
33
|
+
type: :runtime
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - ">="
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '0'
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: dry-validation
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - ">="
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '0'
|
|
47
|
+
type: :runtime
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - ">="
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '0'
|
|
26
54
|
- !ruby/object:Gem::Dependency
|
|
27
55
|
name: faraday
|
|
28
56
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -57,6 +85,20 @@ dependencies:
|
|
|
57
85
|
- - ">="
|
|
58
86
|
- !ruby/object:Gem::Version
|
|
59
87
|
version: '0'
|
|
88
|
+
- !ruby/object:Gem::Dependency
|
|
89
|
+
name: faraday-gzip
|
|
90
|
+
requirement: !ruby/object:Gem::Requirement
|
|
91
|
+
requirements:
|
|
92
|
+
- - "~>"
|
|
93
|
+
- !ruby/object:Gem::Version
|
|
94
|
+
version: '3'
|
|
95
|
+
type: :runtime
|
|
96
|
+
prerelease: false
|
|
97
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
98
|
+
requirements:
|
|
99
|
+
- - "~>"
|
|
100
|
+
- !ruby/object:Gem::Version
|
|
101
|
+
version: '3'
|
|
60
102
|
- !ruby/object:Gem::Dependency
|
|
61
103
|
name: kramdown
|
|
62
104
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -179,16 +221,16 @@ dependencies:
|
|
|
179
221
|
name: sanitize
|
|
180
222
|
requirement: !ruby/object:Gem::Requirement
|
|
181
223
|
requirements:
|
|
182
|
-
- - "
|
|
224
|
+
- - ">="
|
|
183
225
|
- !ruby/object:Gem::Version
|
|
184
|
-
version: '
|
|
226
|
+
version: '0'
|
|
185
227
|
type: :runtime
|
|
186
228
|
prerelease: false
|
|
187
229
|
version_requirements: !ruby/object:Gem::Requirement
|
|
188
230
|
requirements:
|
|
189
|
-
- - "
|
|
231
|
+
- - ">="
|
|
190
232
|
- !ruby/object:Gem::Version
|
|
191
|
-
version: '
|
|
233
|
+
version: '0'
|
|
192
234
|
- !ruby/object:Gem::Dependency
|
|
193
235
|
name: thor
|
|
194
236
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -245,64 +287,104 @@ files:
|
|
|
245
287
|
- exe/html2rss
|
|
246
288
|
- html2rss.gemspec
|
|
247
289
|
- lib/html2rss.rb
|
|
248
|
-
- lib/html2rss/attribute_post_processors.rb
|
|
249
|
-
- lib/html2rss/attribute_post_processors/base.rb
|
|
250
|
-
- lib/html2rss/attribute_post_processors/gsub.rb
|
|
251
|
-
- lib/html2rss/attribute_post_processors/html_to_markdown.rb
|
|
252
|
-
- lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb
|
|
253
|
-
- lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb
|
|
254
|
-
- lib/html2rss/attribute_post_processors/markdown_to_html.rb
|
|
255
|
-
- lib/html2rss/attribute_post_processors/parse_time.rb
|
|
256
|
-
- lib/html2rss/attribute_post_processors/parse_uri.rb
|
|
257
|
-
- lib/html2rss/attribute_post_processors/sanitize_html.rb
|
|
258
|
-
- lib/html2rss/attribute_post_processors/substring.rb
|
|
259
|
-
- lib/html2rss/attribute_post_processors/template.rb
|
|
290
|
+
- lib/html2rss/articles/deduplicator.rb
|
|
260
291
|
- lib/html2rss/auto_source.rb
|
|
261
|
-
- lib/html2rss/auto_source/article.rb
|
|
262
|
-
- lib/html2rss/auto_source/channel.rb
|
|
263
292
|
- lib/html2rss/auto_source/cleanup.rb
|
|
264
|
-
- lib/html2rss/auto_source/reducer.rb
|
|
265
|
-
- lib/html2rss/auto_source/rss_builder.rb
|
|
266
293
|
- lib/html2rss/auto_source/scraper.rb
|
|
267
294
|
- lib/html2rss/auto_source/scraper/html.rb
|
|
295
|
+
- lib/html2rss/auto_source/scraper/json_state.rb
|
|
296
|
+
- lib/html2rss/auto_source/scraper/microdata.rb
|
|
268
297
|
- lib/html2rss/auto_source/scraper/schema.rb
|
|
298
|
+
- lib/html2rss/auto_source/scraper/schema/category_extractor.rb
|
|
269
299
|
- lib/html2rss/auto_source/scraper/schema/item_list.rb
|
|
270
300
|
- lib/html2rss/auto_source/scraper/schema/list_item.rb
|
|
271
301
|
- lib/html2rss/auto_source/scraper/schema/thing.rb
|
|
272
302
|
- lib/html2rss/auto_source/scraper/semantic_html.rb
|
|
273
|
-
- lib/html2rss/auto_source/scraper/semantic_html/extractor.rb
|
|
274
|
-
- lib/html2rss/auto_source/scraper/semantic_html/image.rb
|
|
303
|
+
- lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb
|
|
304
|
+
- lib/html2rss/auto_source/scraper/wordpress_api.rb
|
|
305
|
+
- lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb
|
|
306
|
+
- lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb
|
|
307
|
+
- lib/html2rss/blocked_surface.rb
|
|
308
|
+
- lib/html2rss/category_extractor.rb
|
|
275
309
|
- lib/html2rss/cli.rb
|
|
276
310
|
- lib/html2rss/config.rb
|
|
277
|
-
- lib/html2rss/config/channel.rb
|
|
278
|
-
- lib/html2rss/config/selectors.rb
|
|
279
|
-
- lib/html2rss/item.rb
|
|
280
|
-
- lib/html2rss/item_extractors.rb
|
|
281
|
-
- lib/html2rss/item_extractors/attribute.rb
|
|
282
|
-
- lib/html2rss/item_extractors/href.rb
|
|
283
|
-
- lib/html2rss/item_extractors/html.rb
|
|
284
|
-
- lib/html2rss/item_extractors/static.rb
|
|
285
|
-
- lib/html2rss/item_extractors/text.rb
|
|
286
|
-
- lib/html2rss/object_to_xml_converter.rb
|
|
311
|
+
- lib/html2rss/config/class_methods.rb
|
|
312
|
+
- lib/html2rss/config/dynamic_params.rb
|
|
313
|
+
- lib/html2rss/config/multiple_feeds_config.rb
|
|
314
|
+
- lib/html2rss/config/request_headers.rb
|
|
315
|
+
- lib/html2rss/config/schema.rb
|
|
316
|
+
- lib/html2rss/config/validator.rb
|
|
317
|
+
- lib/html2rss/error.rb
|
|
318
|
+
- lib/html2rss/feed_pipeline.rb
|
|
319
|
+
- lib/html2rss/feed_pipeline/auto_fallback.rb
|
|
320
|
+
- lib/html2rss/hash_util.rb
|
|
321
|
+
- lib/html2rss/html_extractor.rb
|
|
322
|
+
- lib/html2rss/html_extractor/date_extractor.rb
|
|
323
|
+
- lib/html2rss/html_extractor/enclosure_extractor.rb
|
|
324
|
+
- lib/html2rss/html_extractor/image_extractor.rb
|
|
325
|
+
- lib/html2rss/html_navigator.rb
|
|
326
|
+
- lib/html2rss/json_feed_builder.rb
|
|
327
|
+
- lib/html2rss/json_feed_builder/item.rb
|
|
328
|
+
- lib/html2rss/rendering.rb
|
|
329
|
+
- lib/html2rss/rendering/audio_renderer.rb
|
|
330
|
+
- lib/html2rss/rendering/description_builder.rb
|
|
331
|
+
- lib/html2rss/rendering/image_renderer.rb
|
|
332
|
+
- lib/html2rss/rendering/media_renderer.rb
|
|
333
|
+
- lib/html2rss/rendering/pdf_renderer.rb
|
|
334
|
+
- lib/html2rss/rendering/video_renderer.rb
|
|
335
|
+
- lib/html2rss/request_controls.rb
|
|
287
336
|
- lib/html2rss/request_service.rb
|
|
337
|
+
- lib/html2rss/request_service/botasaurus_contract.rb
|
|
338
|
+
- lib/html2rss/request_service/botasaurus_strategy.rb
|
|
288
339
|
- lib/html2rss/request_service/browserless_strategy.rb
|
|
340
|
+
- lib/html2rss/request_service/budget.rb
|
|
289
341
|
- lib/html2rss/request_service/context.rb
|
|
290
342
|
- lib/html2rss/request_service/faraday_strategy.rb
|
|
343
|
+
- lib/html2rss/request_service/policy.rb
|
|
291
344
|
- lib/html2rss/request_service/puppet_commander.rb
|
|
292
345
|
- lib/html2rss/request_service/response.rb
|
|
346
|
+
- lib/html2rss/request_service/response_guard.rb
|
|
293
347
|
- lib/html2rss/request_service/strategy.rb
|
|
348
|
+
- lib/html2rss/request_session.rb
|
|
349
|
+
- lib/html2rss/request_session/rel_next_pager.rb
|
|
350
|
+
- lib/html2rss/request_session/runtime_input.rb
|
|
351
|
+
- lib/html2rss/request_session/runtime_policy.rb
|
|
294
352
|
- lib/html2rss/rss_builder.rb
|
|
353
|
+
- lib/html2rss/rss_builder/article.rb
|
|
295
354
|
- lib/html2rss/rss_builder/channel.rb
|
|
296
|
-
- lib/html2rss/rss_builder/item.rb
|
|
355
|
+
- lib/html2rss/rss_builder/enclosure.rb
|
|
297
356
|
- lib/html2rss/rss_builder/stylesheet.rb
|
|
298
|
-
- lib/html2rss/utils.rb
|
|
357
|
+
- lib/html2rss/selectors.rb
|
|
358
|
+
- lib/html2rss/selectors/config.rb
|
|
359
|
+
- lib/html2rss/selectors/extractors.rb
|
|
360
|
+
- lib/html2rss/selectors/extractors/attribute.rb
|
|
361
|
+
- lib/html2rss/selectors/extractors/href.rb
|
|
362
|
+
- lib/html2rss/selectors/extractors/html.rb
|
|
363
|
+
- lib/html2rss/selectors/extractors/static.rb
|
|
364
|
+
- lib/html2rss/selectors/extractors/text.rb
|
|
365
|
+
- lib/html2rss/selectors/object_to_xml_converter.rb
|
|
366
|
+
- lib/html2rss/selectors/post_processors.rb
|
|
367
|
+
- lib/html2rss/selectors/post_processors/base.rb
|
|
368
|
+
- lib/html2rss/selectors/post_processors/gsub.rb
|
|
369
|
+
- lib/html2rss/selectors/post_processors/html_to_markdown.rb
|
|
370
|
+
- lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb
|
|
371
|
+
- lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb
|
|
372
|
+
- lib/html2rss/selectors/post_processors/markdown_to_html.rb
|
|
373
|
+
- lib/html2rss/selectors/post_processors/parse_time.rb
|
|
374
|
+
- lib/html2rss/selectors/post_processors/parse_uri.rb
|
|
375
|
+
- lib/html2rss/selectors/post_processors/sanitize_html.rb
|
|
376
|
+
- lib/html2rss/selectors/post_processors/substring.rb
|
|
377
|
+
- lib/html2rss/selectors/post_processors/template.rb
|
|
378
|
+
- lib/html2rss/url.rb
|
|
299
379
|
- lib/html2rss/version.rb
|
|
380
|
+
- lib/tasks/config_schema.rake
|
|
381
|
+
- schema/html2rss-config.schema.json
|
|
300
382
|
homepage: https://github.com/html2rss/html2rss
|
|
301
383
|
licenses:
|
|
302
384
|
- MIT
|
|
303
385
|
metadata:
|
|
304
386
|
allowed_push_host: https://rubygems.org
|
|
305
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.17.0
|
|
387
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.19.0
|
|
306
388
|
rubygems_mfa_required: 'true'
|
|
307
389
|
rdoc_options: []
|
|
308
390
|
require_paths:
|
|
@@ -318,7 +400,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
318
400
|
- !ruby/object:Gem::Version
|
|
319
401
|
version: '0'
|
|
320
402
|
requirements: []
|
|
321
|
-
rubygems_version:
|
|
403
|
+
rubygems_version: 4.0.6
|
|
322
404
|
specification_version: 4
|
|
323
405
|
summary: Generates RSS feeds from websites by scraping a URL and using CSS selectors
|
|
324
406
|
to extract item.
|
data/lib/html2rss/attribute_post_processors/base.rb
DELETED
|
@@ -1,74 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Html2rss
|
|
4
|
-
##
|
|
5
|
-
# Provides a namespace for attribute post processors.
|
|
6
|
-
module AttributePostProcessors
|
|
7
|
-
##
|
|
8
|
-
# All post processors must inherit from this base class and implement `self.validate_args!` and `#get`.
|
|
9
|
-
class Base
|
|
10
|
-
# Validates the presence of required options in the context
|
|
11
|
-
#
|
|
12
|
-
# @param keys [Array<Symbol>] the keys to check for presence
|
|
13
|
-
# @param context [Hash] the context containing options
|
|
14
|
-
# @raise [MissingOption] if any key is missing
|
|
15
|
-
def self.expect_options(keys, context)
|
|
16
|
-
keys.each do |key|
|
|
17
|
-
unless (options = context[:options]).key?(key)
|
|
18
|
-
raise MissingOption, "The `#{key}` option is missing in: #{options.inspect}", [],
|
|
19
|
-
cause: nil
|
|
20
|
-
end
|
|
21
|
-
end
|
|
22
|
-
end
|
|
23
|
-
|
|
24
|
-
# Asserts that the value is of the expected type(s)
|
|
25
|
-
#
|
|
26
|
-
# @param value [Object] the value to check
|
|
27
|
-
# @param types [Array<Class>, Class] the expected type(s)
|
|
28
|
-
# @param name [String] the name of the option being checked
|
|
29
|
-
# @param context [Item::Context] the context
|
|
30
|
-
# @raise [InvalidType] if the value is not of the expected type(s)
|
|
31
|
-
def self.assert_type(value, types = [], name, context:)
|
|
32
|
-
types = [types] unless types.is_a?(Array)
|
|
33
|
-
|
|
34
|
-
return if types.any? { |type| value.is_a?(type) }
|
|
35
|
-
|
|
36
|
-
options = context[:options] if context.is_a?(Hash)
|
|
37
|
-
options ||= { file: File.basename(caller_locations(1, 1).first.absolute_path) }
|
|
38
|
-
|
|
39
|
-
raise InvalidType, format('The type of `%<name>s` must be %<types>s, but is: %<type>s in: %<options>s',
|
|
40
|
-
name:, types: types.join(' or '), type: value.class, options: options.inspect),
|
|
41
|
-
[], cause: nil
|
|
42
|
-
end
|
|
43
|
-
|
|
44
|
-
##
|
|
45
|
-
# This method validates the arguments passed to the post processor. Must be implemented by subclasses.
|
|
46
|
-
def self.validate_args!(_value, _context)
|
|
47
|
-
raise NotImplementedError, 'You must implement the `validate_args!` method in the post processor'
|
|
48
|
-
end
|
|
49
|
-
|
|
50
|
-
# Initializes the post processor
|
|
51
|
-
#
|
|
52
|
-
# @param value [Object] the value to be processed
|
|
53
|
-
# @param context [Item::Context] the context
|
|
54
|
-
def initialize(value, context)
|
|
55
|
-
klass = self.class
|
|
56
|
-
# TODO: get rid of Hash
|
|
57
|
-
klass.assert_type(context, [Item::Context, Hash], 'context', context:)
|
|
58
|
-
klass.validate_args!(value, context)
|
|
59
|
-
|
|
60
|
-
@value = value
|
|
61
|
-
@context = context
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
attr_reader :value, :context
|
|
65
|
-
|
|
66
|
-
# Abstract method to be implemented by subclasses
|
|
67
|
-
#
|
|
68
|
-
# @raise [NotImplementedError] if not implemented in subclass
|
|
69
|
-
def get
|
|
70
|
-
raise NotImplementedError, 'You must implement the `get` method in the post processor'
|
|
71
|
-
end
|
|
72
|
-
end
|
|
73
|
-
end
|
|
74
|
-
end
|
data/lib/html2rss/attribute_post_processors/gsub.rb
DELETED
|
@@ -1,64 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Html2rss
|
|
4
|
-
module AttributePostProcessors
|
|
5
|
-
##
|
|
6
|
-
# Imagine this HTML:
|
|
7
|
-
# <h1>Foo bar and boo<h1>
|
|
8
|
-
#
|
|
9
|
-
# YAML usage example:
|
|
10
|
-
# selectors:
|
|
11
|
-
# title:
|
|
12
|
-
# selector: h1
|
|
13
|
-
# post_process:
|
|
14
|
-
# name: gsub
|
|
15
|
-
# pattern: boo
|
|
16
|
-
# replacement: baz
|
|
17
|
-
#
|
|
18
|
-
# Would return:
|
|
19
|
-
# 'Foo bar and baz'
|
|
20
|
-
#
|
|
21
|
-
# `pattern` can be a Regexp or a String. If it is a String, it will remove
|
|
22
|
-
# one pair of surrounding slashes ('/') to keep backwards compatibility
|
|
23
|
-
# and then parse it to build a Regexp.
|
|
24
|
-
#
|
|
25
|
-
# `replacement` can be a String or a Hash.
|
|
26
|
-
#
|
|
27
|
-
# See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
|
|
28
|
-
class Gsub < Base
|
|
29
|
-
def self.validate_args!(value, context)
|
|
30
|
-
assert_type value, String, :value, context:
|
|
31
|
-
expect_options(%i[replacement pattern], context)
|
|
32
|
-
assert_type context.dig(:options, :replacement), [String, Hash], :replacement, context:
|
|
33
|
-
end
|
|
34
|
-
|
|
35
|
-
##
|
|
36
|
-
# @param value [String]
|
|
37
|
-
# @param context [Item::Context]
|
|
38
|
-
def initialize(value, context)
|
|
39
|
-
super
|
|
40
|
-
|
|
41
|
-
options = context[:options]
|
|
42
|
-
|
|
43
|
-
@replacement = options[:replacement]
|
|
44
|
-
@pattern = options[:pattern]
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
##
|
|
48
|
-
# @return [String]
|
|
49
|
-
def get
|
|
50
|
-
value.to_s.gsub(pattern, replacement)
|
|
51
|
-
end
|
|
52
|
-
|
|
53
|
-
private
|
|
54
|
-
|
|
55
|
-
attr_accessor :replacement
|
|
56
|
-
|
|
57
|
-
##
|
|
58
|
-
# @return [Regexp]
|
|
59
|
-
def pattern
|
|
60
|
-
@pattern.is_a?(String) ? Utils.build_regexp_from_string(@pattern) : @pattern
|
|
61
|
-
end
|
|
62
|
-
end
|
|
63
|
-
end
|
|
64
|
-
end
|
data/lib/html2rss/attribute_post_processors/html_to_markdown.rb
DELETED
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'reverse_markdown'
|
|
4
|
-
|
|
5
|
-
module Html2rss
|
|
6
|
-
module AttributePostProcessors
|
|
7
|
-
##
|
|
8
|
-
# Returns HTML code as Markdown formatted String.
|
|
9
|
-
# Before converting to markdown, the HTML is sanitized with SanitizeHtml.
|
|
10
|
-
# Imagine this HTML structure:
|
|
11
|
-
#
|
|
12
|
-
# <section>
|
|
13
|
-
# Lorem <b>ipsum</b> dolor...
|
|
14
|
-
# <iframe src="https://evil.corp/miner"></iframe>
|
|
15
|
-
# <script>alert();</script>
|
|
16
|
-
# </section>
|
|
17
|
-
#
|
|
18
|
-
# YAML usage example:
|
|
19
|
-
#
|
|
20
|
-
# selectors:
|
|
21
|
-
# description:
|
|
22
|
-
# selector: section
|
|
23
|
-
# extractor: html
|
|
24
|
-
# post_process:
|
|
25
|
-
# name: html_to_markdown
|
|
26
|
-
#
|
|
27
|
-
# Would return:
|
|
28
|
-
# 'Lorem **ipsum** dolor'
|
|
29
|
-
class HtmlToMarkdown < Base
|
|
30
|
-
def self.validate_args!(value, context)
|
|
31
|
-
assert_type value, String, :value, context:
|
|
32
|
-
end
|
|
33
|
-
|
|
34
|
-
##
|
|
35
|
-
# @return [String] formatted in Markdown
|
|
36
|
-
def get
|
|
37
|
-
sanitized_value = SanitizeHtml.new(value, context).get
|
|
38
|
-
|
|
39
|
-
ReverseMarkdown.convert(sanitized_value)
|
|
40
|
-
end
|
|
41
|
-
end
|
|
42
|
-
end
|
|
43
|
-
end
|
data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb
DELETED
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Html2rss
|
|
4
|
-
module AttributePostProcessors
|
|
5
|
-
module HtmlTransformers
|
|
6
|
-
##
|
|
7
|
-
# Transformer that converts relative URLs to absolute URLs within specified HTML elements.
|
|
8
|
-
class TransformUrlsToAbsoluteOnes
|
|
9
|
-
URL_ELEMENTS_WITH_URL_ATTRIBUTE = { 'a' => :href, 'img' => :src }.freeze
|
|
10
|
-
|
|
11
|
-
def initialize(channel_url)
|
|
12
|
-
@channel_url = channel_url
|
|
13
|
-
end
|
|
14
|
-
|
|
15
|
-
##
|
|
16
|
-
# Transforms URLs to absolute ones.
|
|
17
|
-
def call(node_name:, node:, **_env)
|
|
18
|
-
return unless URL_ELEMENTS_WITH_URL_ATTRIBUTE.key?(node_name)
|
|
19
|
-
|
|
20
|
-
url_attribute = URL_ELEMENTS_WITH_URL_ATTRIBUTE[node_name]
|
|
21
|
-
url = node[url_attribute]
|
|
22
|
-
node[url_attribute] = Html2rss::Utils.build_absolute_url_from_relative(url, @channel_url)
|
|
23
|
-
end
|
|
24
|
-
end
|
|
25
|
-
end
|
|
26
|
-
end
|
|
27
|
-
end
|
data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb
DELETED
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Html2rss
|
|
4
|
-
module AttributePostProcessors
|
|
5
|
-
module HtmlTransformers
|
|
6
|
-
##
|
|
7
|
-
# Transformer that wraps <img> tags into <a> tags linking to `img.src`.
|
|
8
|
-
class WrapImgInA
|
|
9
|
-
##
|
|
10
|
-
# Wraps <img> tags into <a> tags that link to `img.src`.
|
|
11
|
-
#
|
|
12
|
-
# @param node_name [String]
|
|
13
|
-
# @param node [Nokogiri::XML::Node]
|
|
14
|
-
# @return [nil]
|
|
15
|
-
def call(node_name:, node:, **_env)
|
|
16
|
-
return unless already_wrapped?(node_name, node)
|
|
17
|
-
|
|
18
|
-
wrap_image_in_anchor(node)
|
|
19
|
-
end
|
|
20
|
-
|
|
21
|
-
def already_wrapped?(node_name, node)
|
|
22
|
-
node_name == 'img' && node.parent.name != 'a'
|
|
23
|
-
end
|
|
24
|
-
|
|
25
|
-
private
|
|
26
|
-
|
|
27
|
-
##
|
|
28
|
-
# Wraps the <img> node in an <a> tag.
|
|
29
|
-
#
|
|
30
|
-
# @param node [Nokogiri::XML::Node]
|
|
31
|
-
# @return [nil]
|
|
32
|
-
def wrap_image_in_anchor(node)
|
|
33
|
-
anchor = Nokogiri::XML::Node.new('a', node.document)
|
|
34
|
-
anchor['href'] = node['src']
|
|
35
|
-
node.add_next_sibling(anchor)
|
|
36
|
-
anchor.add_child(node.remove)
|
|
37
|
-
end
|
|
38
|
-
end
|
|
39
|
-
end
|
|
40
|
-
end
|
|
41
|
-
end
|
data/lib/html2rss/attribute_post_processors/markdown_to_html.rb
DELETED
|
@@ -1,50 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'kramdown'
|
|
4
|
-
require_relative 'sanitize_html'
|
|
5
|
-
|
|
6
|
-
module Html2rss
|
|
7
|
-
module AttributePostProcessors
|
|
8
|
-
##
|
|
9
|
-
# Generates HTML from Markdown.
|
|
10
|
-
#
|
|
11
|
-
# It's particularly useful in conjunction with the Template post processor
|
|
12
|
-
# to generate a description from other selectors.
|
|
13
|
-
#
|
|
14
|
-
# YAML usage example:
|
|
15
|
-
#
|
|
16
|
-
# selectors:
|
|
17
|
-
# description:
|
|
18
|
-
# selector: section
|
|
19
|
-
# post_process:
|
|
20
|
-
# - name: template
|
|
21
|
-
# string: |
|
|
22
|
-
# # %s
|
|
23
|
-
#
|
|
24
|
-
# Price: %s
|
|
25
|
-
# methods:
|
|
26
|
-
# - self
|
|
27
|
-
# - price
|
|
28
|
-
# - name: markdown_to_html
|
|
29
|
-
#
|
|
30
|
-
# Would e.g. return:
|
|
31
|
-
#
|
|
32
|
-
# <h1>Section</h1>
|
|
33
|
-
#
|
|
34
|
-
# <p>Price: 12.34</p>
|
|
35
|
-
class MarkdownToHtml < Base
|
|
36
|
-
def self.validate_args!(value, context)
|
|
37
|
-
assert_type value, String, :value, context:
|
|
38
|
-
end
|
|
39
|
-
|
|
40
|
-
##
|
|
41
|
-
# Converts Markdown to sanitized HTML.
|
|
42
|
-
#
|
|
43
|
-
# @return [String] Sanitized HTML content
|
|
44
|
-
def get
|
|
45
|
-
html_content = Kramdown::Document.new(value).to_html
|
|
46
|
-
SanitizeHtml.new(html_content, context).get
|
|
47
|
-
end
|
|
48
|
-
end
|
|
49
|
-
end
|
|
50
|
-
end
|
data/lib/html2rss/attribute_post_processors/parse_time.rb
DELETED
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'time'
|
|
4
|
-
require_relative '../utils'
|
|
5
|
-
|
|
6
|
-
module Html2rss
|
|
7
|
-
module AttributePostProcessors
|
|
8
|
-
##
|
|
9
|
-
# Returns the {https://www.w3.org/Protocols/rfc822/ RFC822} representation of a time.
|
|
10
|
-
#
|
|
11
|
-
# Imagine this HTML structure:
|
|
12
|
-
#
|
|
13
|
-
# <p>Published on <span>2019-07-02</span></p>
|
|
14
|
-
#
|
|
15
|
-
# YAML usage example:
|
|
16
|
-
#
|
|
17
|
-
# selectors:
|
|
18
|
-
# description:
|
|
19
|
-
# selector: span
|
|
20
|
-
# post_process:
|
|
21
|
-
# name: 'parse_time'
|
|
22
|
-
# time_zone: 'Europe/Berlin'
|
|
23
|
-
#
|
|
24
|
-
# Would return:
|
|
25
|
-
# "Tue, 02 Jul 2019 00:00:00 +0200"
|
|
26
|
-
#
|
|
27
|
-
# It uses `Time.parse`.
|
|
28
|
-
class ParseTime < Base
|
|
29
|
-
def self.validate_args!(value, context)
|
|
30
|
-
assert_type(value, String, :value, context:)
|
|
31
|
-
assert_type(context[:config].time_zone, String, :time_zone, context:)
|
|
32
|
-
end
|
|
33
|
-
|
|
34
|
-
##
|
|
35
|
-
# Converts the provided time string to RFC822 format, taking into account the time_zone.
|
|
36
|
-
#
|
|
37
|
-
# @return [String] RFC822 formatted time
|
|
38
|
-
# @raise [TZInfo::InvalidTimezoneIdentifier] if the configured time zone is invalid
|
|
39
|
-
def get
|
|
40
|
-
time_zone = context[:config].time_zone
|
|
41
|
-
|
|
42
|
-
Utils.use_zone(time_zone) { Time.parse(value).rfc822 }
|
|
43
|
-
end
|
|
44
|
-
end
|
|
45
|
-
end
|
|
46
|
-
end
|
data/lib/html2rss/attribute_post_processors/parse_uri.rb
DELETED
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Html2rss
|
|
4
|
-
module AttributePostProcessors
|
|
5
|
-
##
|
|
6
|
-
# Returns the URI as String.
|
|
7
|
-
# If the URL is relative, it builds an absolute one with the channel's URL as base.
|
|
8
|
-
#
|
|
9
|
-
# Imagine this HTML structure:
|
|
10
|
-
#
|
|
11
|
-
# <span>http://why-not-use-a-link.uh </span>
|
|
12
|
-
#
|
|
13
|
-
# YAML usage example:
|
|
14
|
-
#
|
|
15
|
-
# selectors:
|
|
16
|
-
# link:
|
|
17
|
-
# selector: span
|
|
18
|
-
# extractor: text
|
|
19
|
-
# post_process:
|
|
20
|
-
# name: parse_uri
|
|
21
|
-
#
|
|
22
|
-
# Would return:
|
|
23
|
-
# 'http://why-not-use-a-link.uh'
|
|
24
|
-
class ParseUri < Base
|
|
25
|
-
def self.validate_args!(value, context)
|
|
26
|
-
url_types = [String, URI::HTTP, Addressable::URI].freeze
|
|
27
|
-
|
|
28
|
-
assert_type(value, url_types, :value, context:)
|
|
29
|
-
assert_type(context.config.url, url_types, :url, context:)
|
|
30
|
-
|
|
31
|
-
raise ArgumentError, 'The `value` option is missing or empty.' if value.to_s.empty?
|
|
32
|
-
end
|
|
33
|
-
|
|
34
|
-
##
|
|
35
|
-
# @return [String]
|
|
36
|
-
def get
|
|
37
|
-
config_url = context.config.url
|
|
38
|
-
|
|
39
|
-
Html2rss::Utils.build_absolute_url_from_relative(
|
|
40
|
-
Html2rss::Utils.sanitize_url(value),
|
|
41
|
-
config_url
|
|
42
|
-
).to_s
|
|
43
|
-
end
|
|
44
|
-
end
|
|
45
|
-
end
|
|
46
|
-
end
|