html2rss 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +48 -657
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +7 -4
  5. data/lib/html2rss/articles/deduplicator.rb +49 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +33 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +118 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
  20. data/lib/html2rss/auto_source/scraper.rb +142 -8
  21. data/lib/html2rss/auto_source.rb +119 -47
  22. data/lib/html2rss/blocked_surface.rb +64 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +170 -23
  25. data/lib/html2rss/config/class_methods.rb +189 -0
  26. data/lib/html2rss/config/dynamic_params.rb +68 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
  28. data/lib/html2rss/config/request_headers.rb +130 -0
  29. data/lib/html2rss/config/schema.rb +208 -0
  30. data/lib/html2rss/config/validator.rb +108 -0
  31. data/lib/html2rss/config.rb +112 -61
  32. data/lib/html2rss/error.rb +6 -0
  33. data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
  34. data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
  35. data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
  36. data/lib/html2rss/html_extractor.rb +136 -0
  37. data/lib/html2rss/html_navigator.rb +46 -0
  38. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  39. data/lib/html2rss/json_feed_builder.rb +58 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +31 -0
  41. data/lib/html2rss/rendering/description_builder.rb +88 -0
  42. data/lib/html2rss/rendering/image_renderer.rb +31 -0
  43. data/lib/html2rss/rendering/media_renderer.rb +33 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
  45. data/lib/html2rss/rendering/video_renderer.rb +31 -0
  46. data/lib/html2rss/rendering.rb +14 -0
  47. data/lib/html2rss/request_controls.rb +128 -0
  48. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  49. data/lib/html2rss/request_service/budget.rb +39 -0
  50. data/lib/html2rss/request_service/context.rb +64 -20
  51. data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
  52. data/lib/html2rss/request_service/policy.rb +248 -0
  53. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  54. data/lib/html2rss/request_service/response.rb +42 -2
  55. data/lib/html2rss/request_service/response_guard.rb +62 -0
  56. data/lib/html2rss/request_service.rb +31 -15
  57. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  58. data/lib/html2rss/request_session/runtime_input.rb +57 -0
  59. data/lib/html2rss/request_session/runtime_policy.rb +76 -0
  60. data/lib/html2rss/request_session.rb +118 -0
  61. data/lib/html2rss/rss_builder/article.rb +166 -0
  62. data/lib/html2rss/rss_builder/channel.rb +96 -11
  63. data/lib/html2rss/rss_builder/enclosure.rb +48 -0
  64. data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
  65. data/lib/html2rss/rss_builder.rb +72 -71
  66. data/lib/html2rss/selectors/config.rb +122 -0
  67. data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
  68. data/lib/html2rss/selectors/extractors/href.rb +53 -0
  69. data/lib/html2rss/selectors/extractors/html.rb +48 -0
  70. data/lib/html2rss/selectors/extractors/static.rb +41 -0
  71. data/lib/html2rss/selectors/extractors/text.rb +46 -0
  72. data/lib/html2rss/selectors/extractors.rb +52 -0
  73. data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
  74. data/lib/html2rss/selectors/post_processors/base.rb +74 -0
  75. data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
  76. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
  77. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
  78. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
  79. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
  80. data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
  81. data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
  82. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
  83. data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
  84. data/lib/html2rss/selectors/post_processors/template.rb +73 -0
  85. data/lib/html2rss/selectors/post_processors.rb +43 -0
  86. data/lib/html2rss/selectors.rb +294 -0
  87. data/lib/html2rss/url.rb +262 -0
  88. data/lib/html2rss/version.rb +1 -1
  89. data/lib/html2rss.rb +129 -70
  90. data/lib/tasks/config_schema.rake +17 -0
  91. data/schema/html2rss-config.schema.json +469 -0
  92. metadata +120 -46
  93. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  94. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  95. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  96. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  97. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  98. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  99. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  100. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  101. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
  102. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  103. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  104. data/lib/html2rss/attribute_post_processors.rb +0 -44
  105. data/lib/html2rss/auto_source/article.rb +0 -127
  106. data/lib/html2rss/auto_source/channel.rb +0 -78
  107. data/lib/html2rss/auto_source/reducer.rb +0 -48
  108. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  109. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  110. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  111. data/lib/html2rss/config/channel.rb +0 -125
  112. data/lib/html2rss/config/selectors.rb +0 -103
  113. data/lib/html2rss/item.rb +0 -186
  114. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  115. data/lib/html2rss/item_extractors/href.rb +0 -52
  116. data/lib/html2rss/item_extractors/html.rb +0 -46
  117. data/lib/html2rss/item_extractors/static.rb +0 -39
  118. data/lib/html2rss/item_extractors/text.rb +0 -44
  119. data/lib/html2rss/item_extractors.rb +0 -88
  120. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  121. data/lib/html2rss/rss_builder/item.rb +0 -83
  122. data/lib/html2rss/utils.rb +0 -113
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.16.0
4
+ version: 0.18.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
8
- autorequire:
9
8
  bindir: exe
10
9
  cert_chain: []
11
- date: 2024-12-24 00:00:00.000000000 Z
10
+ date: 1980-01-02 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: addressable
@@ -24,6 +23,34 @@ dependencies:
24
23
  - - "~>"
25
24
  - !ruby/object:Gem::Version
26
25
  version: '2.7'
26
+ - !ruby/object:Gem::Dependency
27
+ name: brotli
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ - !ruby/object:Gem::Dependency
41
+ name: dry-validation
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ type: :runtime
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
27
54
  - !ruby/object:Gem::Dependency
28
55
  name: faraday
29
56
  requirement: !ruby/object:Gem::Requirement
@@ -58,6 +85,20 @@ dependencies:
58
85
  - - ">="
59
86
  - !ruby/object:Gem::Version
60
87
  version: '0'
88
+ - !ruby/object:Gem::Dependency
89
+ name: faraday-gzip
90
+ requirement: !ruby/object:Gem::Requirement
91
+ requirements:
92
+ - - "~>"
93
+ - !ruby/object:Gem::Version
94
+ version: '3'
95
+ type: :runtime
96
+ prerelease: false
97
+ version_requirements: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - "~>"
100
+ - !ruby/object:Gem::Version
101
+ version: '3'
61
102
  - !ruby/object:Gem::Dependency
62
103
  name: kramdown
63
104
  requirement: !ruby/object:Gem::Requirement
@@ -180,16 +221,16 @@ dependencies:
180
221
  name: sanitize
181
222
  requirement: !ruby/object:Gem::Requirement
182
223
  requirements:
183
- - - "~>"
224
+ - - ">="
184
225
  - !ruby/object:Gem::Version
185
- version: '6.0'
226
+ version: '0'
186
227
  type: :runtime
187
228
  prerelease: false
188
229
  version_requirements: !ruby/object:Gem::Requirement
189
230
  requirements:
190
- - - "~>"
231
+ - - ">="
191
232
  - !ruby/object:Gem::Version
192
- version: '6.0'
233
+ version: '0'
193
234
  - !ruby/object:Gem::Dependency
194
235
  name: thor
195
236
  requirement: !ruby/object:Gem::Requirement
@@ -222,16 +263,16 @@ dependencies:
222
263
  name: zeitwerk
223
264
  requirement: !ruby/object:Gem::Requirement
224
265
  requirements:
225
- - - "~>"
266
+ - - ">="
226
267
  - !ruby/object:Gem::Version
227
- version: 2.6.0
268
+ version: '0'
228
269
  type: :runtime
229
270
  prerelease: false
230
271
  version_requirements: !ruby/object:Gem::Requirement
231
272
  requirements:
232
- - - "~>"
273
+ - - ">="
233
274
  - !ruby/object:Gem::Version
234
- version: 2.6.0
275
+ version: '0'
235
276
  description: Supports JSON content, custom HTTP headers, and post-processing of extracted
236
277
  content.
237
278
  email:
@@ -246,66 +287,100 @@ files:
246
287
  - exe/html2rss
247
288
  - html2rss.gemspec
248
289
  - lib/html2rss.rb
249
- - lib/html2rss/attribute_post_processors.rb
250
- - lib/html2rss/attribute_post_processors/base.rb
251
- - lib/html2rss/attribute_post_processors/gsub.rb
252
- - lib/html2rss/attribute_post_processors/html_to_markdown.rb
253
- - lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb
254
- - lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb
255
- - lib/html2rss/attribute_post_processors/markdown_to_html.rb
256
- - lib/html2rss/attribute_post_processors/parse_time.rb
257
- - lib/html2rss/attribute_post_processors/parse_uri.rb
258
- - lib/html2rss/attribute_post_processors/sanitize_html.rb
259
- - lib/html2rss/attribute_post_processors/substring.rb
260
- - lib/html2rss/attribute_post_processors/template.rb
290
+ - lib/html2rss/articles/deduplicator.rb
261
291
  - lib/html2rss/auto_source.rb
262
- - lib/html2rss/auto_source/article.rb
263
- - lib/html2rss/auto_source/channel.rb
264
292
  - lib/html2rss/auto_source/cleanup.rb
265
- - lib/html2rss/auto_source/reducer.rb
266
- - lib/html2rss/auto_source/rss_builder.rb
267
293
  - lib/html2rss/auto_source/scraper.rb
268
294
  - lib/html2rss/auto_source/scraper/html.rb
295
+ - lib/html2rss/auto_source/scraper/json_state.rb
296
+ - lib/html2rss/auto_source/scraper/microdata.rb
269
297
  - lib/html2rss/auto_source/scraper/schema.rb
298
+ - lib/html2rss/auto_source/scraper/schema/category_extractor.rb
270
299
  - lib/html2rss/auto_source/scraper/schema/item_list.rb
271
300
  - lib/html2rss/auto_source/scraper/schema/list_item.rb
272
301
  - lib/html2rss/auto_source/scraper/schema/thing.rb
273
302
  - lib/html2rss/auto_source/scraper/semantic_html.rb
274
- - lib/html2rss/auto_source/scraper/semantic_html/extractor.rb
275
- - lib/html2rss/auto_source/scraper/semantic_html/image.rb
303
+ - lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb
304
+ - lib/html2rss/auto_source/scraper/wordpress_api.rb
305
+ - lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb
306
+ - lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb
307
+ - lib/html2rss/blocked_surface.rb
308
+ - lib/html2rss/category_extractor.rb
276
309
  - lib/html2rss/cli.rb
277
310
  - lib/html2rss/config.rb
278
- - lib/html2rss/config/channel.rb
279
- - lib/html2rss/config/selectors.rb
280
- - lib/html2rss/item.rb
281
- - lib/html2rss/item_extractors.rb
282
- - lib/html2rss/item_extractors/attribute.rb
283
- - lib/html2rss/item_extractors/href.rb
284
- - lib/html2rss/item_extractors/html.rb
285
- - lib/html2rss/item_extractors/static.rb
286
- - lib/html2rss/item_extractors/text.rb
287
- - lib/html2rss/object_to_xml_converter.rb
311
+ - lib/html2rss/config/class_methods.rb
312
+ - lib/html2rss/config/dynamic_params.rb
313
+ - lib/html2rss/config/multiple_feeds_config.rb
314
+ - lib/html2rss/config/request_headers.rb
315
+ - lib/html2rss/config/schema.rb
316
+ - lib/html2rss/config/validator.rb
317
+ - lib/html2rss/error.rb
318
+ - lib/html2rss/html_extractor.rb
319
+ - lib/html2rss/html_extractor/date_extractor.rb
320
+ - lib/html2rss/html_extractor/enclosure_extractor.rb
321
+ - lib/html2rss/html_extractor/image_extractor.rb
322
+ - lib/html2rss/html_navigator.rb
323
+ - lib/html2rss/json_feed_builder.rb
324
+ - lib/html2rss/json_feed_builder/item.rb
325
+ - lib/html2rss/rendering.rb
326
+ - lib/html2rss/rendering/audio_renderer.rb
327
+ - lib/html2rss/rendering/description_builder.rb
328
+ - lib/html2rss/rendering/image_renderer.rb
329
+ - lib/html2rss/rendering/media_renderer.rb
330
+ - lib/html2rss/rendering/pdf_renderer.rb
331
+ - lib/html2rss/rendering/video_renderer.rb
332
+ - lib/html2rss/request_controls.rb
288
333
  - lib/html2rss/request_service.rb
289
334
  - lib/html2rss/request_service/browserless_strategy.rb
335
+ - lib/html2rss/request_service/budget.rb
290
336
  - lib/html2rss/request_service/context.rb
291
337
  - lib/html2rss/request_service/faraday_strategy.rb
338
+ - lib/html2rss/request_service/policy.rb
292
339
  - lib/html2rss/request_service/puppet_commander.rb
293
340
  - lib/html2rss/request_service/response.rb
341
+ - lib/html2rss/request_service/response_guard.rb
294
342
  - lib/html2rss/request_service/strategy.rb
343
+ - lib/html2rss/request_session.rb
344
+ - lib/html2rss/request_session/rel_next_pager.rb
345
+ - lib/html2rss/request_session/runtime_input.rb
346
+ - lib/html2rss/request_session/runtime_policy.rb
295
347
  - lib/html2rss/rss_builder.rb
348
+ - lib/html2rss/rss_builder/article.rb
296
349
  - lib/html2rss/rss_builder/channel.rb
297
- - lib/html2rss/rss_builder/item.rb
350
+ - lib/html2rss/rss_builder/enclosure.rb
298
351
  - lib/html2rss/rss_builder/stylesheet.rb
299
- - lib/html2rss/utils.rb
352
+ - lib/html2rss/selectors.rb
353
+ - lib/html2rss/selectors/config.rb
354
+ - lib/html2rss/selectors/extractors.rb
355
+ - lib/html2rss/selectors/extractors/attribute.rb
356
+ - lib/html2rss/selectors/extractors/href.rb
357
+ - lib/html2rss/selectors/extractors/html.rb
358
+ - lib/html2rss/selectors/extractors/static.rb
359
+ - lib/html2rss/selectors/extractors/text.rb
360
+ - lib/html2rss/selectors/object_to_xml_converter.rb
361
+ - lib/html2rss/selectors/post_processors.rb
362
+ - lib/html2rss/selectors/post_processors/base.rb
363
+ - lib/html2rss/selectors/post_processors/gsub.rb
364
+ - lib/html2rss/selectors/post_processors/html_to_markdown.rb
365
+ - lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb
366
+ - lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb
367
+ - lib/html2rss/selectors/post_processors/markdown_to_html.rb
368
+ - lib/html2rss/selectors/post_processors/parse_time.rb
369
+ - lib/html2rss/selectors/post_processors/parse_uri.rb
370
+ - lib/html2rss/selectors/post_processors/sanitize_html.rb
371
+ - lib/html2rss/selectors/post_processors/substring.rb
372
+ - lib/html2rss/selectors/post_processors/template.rb
373
+ - lib/html2rss/url.rb
300
374
  - lib/html2rss/version.rb
375
+ - lib/tasks/config_schema.rake
376
+ - schema/html2rss-config.schema.json
301
377
  homepage: https://github.com/html2rss/html2rss
302
378
  licenses:
303
379
  - MIT
304
380
  metadata:
305
381
  allowed_push_host: https://rubygems.org
306
- changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.16.0
382
+ changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.18.0
307
383
  rubygems_mfa_required: 'true'
308
- post_install_message:
309
384
  rdoc_options: []
310
385
  require_paths:
311
386
  - lib
@@ -313,15 +388,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
313
388
  requirements:
314
389
  - - ">="
315
390
  - !ruby/object:Gem::Version
316
- version: '3.1'
391
+ version: '3.2'
317
392
  required_rubygems_version: !ruby/object:Gem::Requirement
318
393
  requirements:
319
394
  - - ">="
320
395
  - !ruby/object:Gem::Version
321
396
  version: '0'
322
397
  requirements: []
323
- rubygems_version: 3.5.22
324
- signing_key:
398
+ rubygems_version: 4.0.6
325
399
  specification_version: 4
326
400
  summary: Generates RSS feeds from websites by scraping a URL and using CSS selectors
327
401
  to extract item.
@@ -1,74 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Html2rss
4
- ##
5
- # Provides a namespace for attribute post processors.
6
- module AttributePostProcessors
7
- ##
8
- # All post processors must inherit from this base class and implement `self.validate_args!` and `#get`.
9
- class Base
10
- # Validates the presence of required options in the context
11
- #
12
- # @param keys [Array<Symbol>] the keys to check for presence
13
- # @param context [Hash] the context containing options
14
- # @raise [MissingOption] if any key is missing
15
- def self.expect_options(keys, context)
16
- keys.each do |key|
17
- unless (options = context[:options]).key?(key)
18
- raise MissingOption, "The `#{key}` option is missing in: #{options.inspect}", [],
19
- cause: nil
20
- end
21
- end
22
- end
23
-
24
- # Asserts that the value is of the expected type(s)
25
- #
26
- # @param value [Object] the value to check
27
- # @param types [Array<Class>, Class] the expected type(s)
28
- # @param name [String] the name of the option being checked
29
- # @param context [Item::Context] the context
30
- # @raise [InvalidType] if the value is not of the expected type(s)
31
- def self.assert_type(value, types = [], name, context:)
32
- types = [types] unless types.is_a?(Array)
33
-
34
- return if types.any? { |type| value.is_a?(type) }
35
-
36
- options = context[:options] if context.is_a?(Hash)
37
- options ||= { file: File.basename(caller_locations(1, 1).first.absolute_path) }
38
-
39
- raise InvalidType, format('The type of `%<name>s` must be %<types>s, but is: %<type>s in: %<options>s',
40
- name:, types: types.join(' or '), type: value.class, options: options.inspect),
41
- [], cause: nil
42
- end
43
-
44
- ##
45
- # This method validates the arguments passed to the post processor. Must be implemented by subclasses.
46
- def self.validate_args!(_value, _context)
47
- raise NotImplementedError, 'You must implement the `validate_args!` method in the post processor'
48
- end
49
-
50
- # Initializes the post processor
51
- #
52
- # @param value [Object] the value to be processed
53
- # @param context [Item::Context] the context
54
- def initialize(value, context)
55
- klass = self.class
56
- # TODO: get rid of Hash
57
- klass.assert_type(context, [Item::Context, Hash], 'context', context:)
58
- klass.validate_args!(value, context)
59
-
60
- @value = value
61
- @context = context
62
- end
63
-
64
- attr_reader :value, :context
65
-
66
- # Abstract method to be implemented by subclasses
67
- #
68
- # @raise [NotImplementedError] if not implemented in subclass
69
- def get
70
- raise NotImplementedError, 'You must implement the `get` method in the post processor'
71
- end
72
- end
73
- end
74
- end
@@ -1,64 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Html2rss
4
- module AttributePostProcessors
5
- ##
6
- # Imagine this HTML:
7
- # <h1>Foo bar and boo<h1>
8
- #
9
- # YAML usage example:
10
- # selectors:
11
- # title:
12
- # selector: h1
13
- # post_process:
14
- # name: gsub
15
- # pattern: boo
16
- # replacement: baz
17
- #
18
- # Would return:
19
- # 'Foo bar and baz'
20
- #
21
- # `pattern` can be a Regexp or a String. If it is a String, it will remove
22
- # one pair of surrounding slashes ('/') to keep backwards compatibility
23
- # and then parse it to build a Regexp.
24
- #
25
- # `replacement` can be a String or a Hash.
26
- #
27
- # See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
28
- class Gsub < Base
29
- def self.validate_args!(value, context)
30
- assert_type value, String, :value, context:
31
- expect_options(%i[replacement pattern], context)
32
- assert_type context.dig(:options, :replacement), [String, Hash], :replacement, context:
33
- end
34
-
35
- ##
36
- # @param value [String]
37
- # @param context [Item::Context]
38
- def initialize(value, context)
39
- super
40
-
41
- options = context[:options]
42
-
43
- @replacement = options[:replacement]
44
- @pattern = options[:pattern]
45
- end
46
-
47
- ##
48
- # @return [String]
49
- def get
50
- value.to_s.gsub(pattern, replacement)
51
- end
52
-
53
- private
54
-
55
- attr_accessor :replacement
56
-
57
- ##
58
- # @return [Regexp]
59
- def pattern
60
- @pattern.is_a?(String) ? Utils.build_regexp_from_string(@pattern) : @pattern
61
- end
62
- end
63
- end
64
- end
@@ -1,43 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'reverse_markdown'
4
-
5
- module Html2rss
6
- module AttributePostProcessors
7
- ##
8
- # Returns HTML code as Markdown formatted String.
9
- # Before converting to markdown, the HTML is sanitized with SanitizeHtml.
10
- # Imagine this HTML structure:
11
- #
12
- # <section>
13
- # Lorem <b>ipsum</b> dolor...
14
- # <iframe src="https://evil.corp/miner"></iframe>
15
- # <script>alert();</script>
16
- # </section>
17
- #
18
- # YAML usage example:
19
- #
20
- # selectors:
21
- # description:
22
- # selector: section
23
- # extractor: html
24
- # post_process:
25
- # name: html_to_markdown
26
- #
27
- # Would return:
28
- # 'Lorem **ipsum** dolor'
29
- class HtmlToMarkdown < Base
30
- def self.validate_args!(value, context)
31
- assert_type value, String, :value, context:
32
- end
33
-
34
- ##
35
- # @return [String] formatted in Markdown
36
- def get
37
- sanitized_value = SanitizeHtml.new(value, context).get
38
-
39
- ReverseMarkdown.convert(sanitized_value)
40
- end
41
- end
42
- end
43
- end
@@ -1,27 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Html2rss
4
- module AttributePostProcessors
5
- module HtmlTransformers
6
- ##
7
- # Transformer that converts relative URLs to absolute URLs within specified HTML elements.
8
- class TransformUrlsToAbsoluteOnes
9
- URL_ELEMENTS_WITH_URL_ATTRIBUTE = { 'a' => :href, 'img' => :src }.freeze
10
-
11
- def initialize(channel_url)
12
- @channel_url = channel_url
13
- end
14
-
15
- ##
16
- # Transforms URLs to absolute ones.
17
- def call(node_name:, node:, **_env)
18
- return unless URL_ELEMENTS_WITH_URL_ATTRIBUTE.key?(node_name)
19
-
20
- url_attribute = URL_ELEMENTS_WITH_URL_ATTRIBUTE[node_name]
21
- url = node[url_attribute]
22
- node[url_attribute] = Html2rss::Utils.build_absolute_url_from_relative(url, @channel_url)
23
- end
24
- end
25
- end
26
- end
27
- end
@@ -1,41 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Html2rss
4
- module AttributePostProcessors
5
- module HtmlTransformers
6
- ##
7
- # Transformer that wraps <img> tags into <a> tags linking to `img.src`.
8
- class WrapImgInA
9
- ##
10
- # Wraps <img> tags into <a> tags that link to `img.src`.
11
- #
12
- # @param node_name [String]
13
- # @param node [Nokogiri::XML::Node]
14
- # @return [nil]
15
- def call(node_name:, node:, **_env)
16
- return unless already_wrapped?(node_name, node)
17
-
18
- wrap_image_in_anchor(node)
19
- end
20
-
21
- def already_wrapped?(node_name, node)
22
- node_name == 'img' && node.parent.name != 'a'
23
- end
24
-
25
- private
26
-
27
- ##
28
- # Wraps the <img> node in an <a> tag.
29
- #
30
- # @param node [Nokogiri::XML::Node]
31
- # @return [nil]
32
- def wrap_image_in_anchor(node)
33
- anchor = Nokogiri::XML::Node.new('a', node.document)
34
- anchor['href'] = node['src']
35
- node.add_next_sibling(anchor)
36
- anchor.add_child(node.remove)
37
- end
38
- end
39
- end
40
- end
41
- end
@@ -1,50 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'kramdown'
4
- require_relative 'sanitize_html'
5
-
6
- module Html2rss
7
- module AttributePostProcessors
8
- ##
9
- # Generates HTML from Markdown.
10
- #
11
- # It's particularly useful in conjunction with the Template post processor
12
- # to generate a description from other selectors.
13
- #
14
- # YAML usage example:
15
- #
16
- # selectors:
17
- # description:
18
- # selector: section
19
- # post_process:
20
- # - name: template
21
- # string: |
22
- # # %s
23
- #
24
- # Price: %s
25
- # methods:
26
- # - self
27
- # - price
28
- # - name: markdown_to_html
29
- #
30
- # Would e.g. return:
31
- #
32
- # <h1>Section</h1>
33
- #
34
- # <p>Price: 12.34</p>
35
- class MarkdownToHtml < Base
36
- def self.validate_args!(value, context)
37
- assert_type value, String, :value, context:
38
- end
39
-
40
- ##
41
- # Converts Markdown to sanitized HTML.
42
- #
43
- # @return [String] Sanitized HTML content
44
- def get
45
- html_content = Kramdown::Document.new(value).to_html
46
- SanitizeHtml.new(html_content, context).get
47
- end
48
- end
49
- end
50
- end
@@ -1,46 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'time'
4
- require_relative '../utils'
5
-
6
- module Html2rss
7
- module AttributePostProcessors
8
- ##
9
- # Returns the {https://www.w3.org/Protocols/rfc822/ RFC822} representation of a time.
10
- #
11
- # Imagine this HTML structure:
12
- #
13
- # <p>Published on <span>2019-07-02</span></p>
14
- #
15
- # YAML usage example:
16
- #
17
- # selectors:
18
- # description:
19
- # selector: span
20
- # post_process:
21
- # name: 'parse_time'
22
- # time_zone: 'Europe/Berlin'
23
- #
24
- # Would return:
25
- # "Tue, 02 Jul 2019 00:00:00 +0200"
26
- #
27
- # It uses `Time.parse`.
28
- class ParseTime < Base
29
- def self.validate_args!(value, context)
30
- assert_type(value, String, :value, context:)
31
- assert_type(context[:config].time_zone, String, :time_zone, context:)
32
- end
33
-
34
- ##
35
- # Converts the provided time string to RFC822 format, taking into account the time_zone.
36
- #
37
- # @return [String] RFC822 formatted time
38
- # @raise [TZInfo::InvalidTimezoneIdentifier] if the configured time zone is invalid
39
- def get
40
- time_zone = context[:config].time_zone
41
-
42
- Utils.use_zone(time_zone) { Time.parse(value).rfc822 }
43
- end
44
- end
45
- end
46
- end
@@ -1,46 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Html2rss
4
- module AttributePostProcessors
5
- ##
6
- # Returns the URI as String.
7
- # If the URL is relative, it builds an absolute one with the channel's URL as base.
8
- #
9
- # Imagine this HTML structure:
10
- #
11
- # <span>http://why-not-use-a-link.uh </span>
12
- #
13
- # YAML usage example:
14
- #
15
- # selectors:
16
- # link:
17
- # selector: span
18
- # extractor: text
19
- # post_process:
20
- # name: parse_uri
21
- #
22
- # Would return:
23
- # 'http://why-not-use-a-link.uh'
24
- class ParseUri < Base
25
- def self.validate_args!(value, context)
26
- url_types = [String, URI::HTTP, Addressable::URI].freeze
27
-
28
- assert_type(value, url_types, :value, context:)
29
- assert_type(context.config.url, url_types, :url, context:)
30
-
31
- raise ArgumentError, 'The `value` option is missing or empty.' if value.to_s.empty?
32
- end
33
-
34
- ##
35
- # @return [String]
36
- def get
37
- config_url = context.config.url
38
-
39
- Html2rss::Utils.build_absolute_url_from_relative(
40
- Html2rss::Utils.sanitize_url(value),
41
- config_url
42
- ).to_s
43
- end
44
- end
45
- end
46
- end