html2rss 0.16.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +48 -657
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +7 -4
- data/lib/html2rss/articles/deduplicator.rb +49 -0
- data/lib/html2rss/auto_source/cleanup.rb +33 -5
- data/lib/html2rss/auto_source/scraper/html.rb +118 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
- data/lib/html2rss/auto_source/scraper.rb +142 -8
- data/lib/html2rss/auto_source.rb +119 -47
- data/lib/html2rss/blocked_surface.rb +64 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +170 -23
- data/lib/html2rss/config/class_methods.rb +189 -0
- data/lib/html2rss/config/dynamic_params.rb +68 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
- data/lib/html2rss/config/request_headers.rb +130 -0
- data/lib/html2rss/config/schema.rb +208 -0
- data/lib/html2rss/config/validator.rb +108 -0
- data/lib/html2rss/config.rb +112 -61
- data/lib/html2rss/error.rb +6 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
- data/lib/html2rss/html_extractor.rb +136 -0
- data/lib/html2rss/html_navigator.rb +46 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +58 -0
- data/lib/html2rss/rendering/audio_renderer.rb +31 -0
- data/lib/html2rss/rendering/description_builder.rb +88 -0
- data/lib/html2rss/rendering/image_renderer.rb +31 -0
- data/lib/html2rss/rendering/media_renderer.rb +33 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
- data/lib/html2rss/rendering/video_renderer.rb +31 -0
- data/lib/html2rss/rendering.rb +14 -0
- data/lib/html2rss/request_controls.rb +128 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +64 -20
- data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
- data/lib/html2rss/request_service/policy.rb +248 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +42 -2
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +31 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +57 -0
- data/lib/html2rss/request_session/runtime_policy.rb +76 -0
- data/lib/html2rss/request_session.rb +118 -0
- data/lib/html2rss/rss_builder/article.rb +166 -0
- data/lib/html2rss/rss_builder/channel.rb +96 -11
- data/lib/html2rss/rss_builder/enclosure.rb +48 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
- data/lib/html2rss/rss_builder.rb +72 -71
- data/lib/html2rss/selectors/config.rb +122 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
- data/lib/html2rss/selectors/extractors/href.rb +53 -0
- data/lib/html2rss/selectors/extractors/html.rb +48 -0
- data/lib/html2rss/selectors/extractors/static.rb +41 -0
- data/lib/html2rss/selectors/extractors/text.rb +46 -0
- data/lib/html2rss/selectors/extractors.rb +52 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
- data/lib/html2rss/selectors/post_processors/base.rb +74 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
- data/lib/html2rss/selectors/post_processors/template.rb +73 -0
- data/lib/html2rss/selectors/post_processors.rb +43 -0
- data/lib/html2rss/selectors.rb +294 -0
- data/lib/html2rss/url.rb +262 -0
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +129 -70
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +469 -0
- metadata +120 -46
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
metadata
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html2rss
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.16.0
|
|
4
|
+
version: 0.18.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Gil Desmarais
|
|
8
|
-
autorequire:
|
|
9
8
|
bindir: exe
|
|
10
9
|
cert_chain: []
|
|
11
|
-
date:
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
12
11
|
dependencies:
|
|
13
12
|
- !ruby/object:Gem::Dependency
|
|
14
13
|
name: addressable
|
|
@@ -24,6 +23,34 @@ dependencies:
|
|
|
24
23
|
- - "~>"
|
|
25
24
|
- !ruby/object:Gem::Version
|
|
26
25
|
version: '2.7'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: brotli
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - ">="
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '0'
|
|
33
|
+
type: :runtime
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - ">="
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '0'
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: dry-validation
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - ">="
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '0'
|
|
47
|
+
type: :runtime
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - ">="
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '0'
|
|
27
54
|
- !ruby/object:Gem::Dependency
|
|
28
55
|
name: faraday
|
|
29
56
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -58,6 +85,20 @@ dependencies:
|
|
|
58
85
|
- - ">="
|
|
59
86
|
- !ruby/object:Gem::Version
|
|
60
87
|
version: '0'
|
|
88
|
+
- !ruby/object:Gem::Dependency
|
|
89
|
+
name: faraday-gzip
|
|
90
|
+
requirement: !ruby/object:Gem::Requirement
|
|
91
|
+
requirements:
|
|
92
|
+
- - "~>"
|
|
93
|
+
- !ruby/object:Gem::Version
|
|
94
|
+
version: '3'
|
|
95
|
+
type: :runtime
|
|
96
|
+
prerelease: false
|
|
97
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
98
|
+
requirements:
|
|
99
|
+
- - "~>"
|
|
100
|
+
- !ruby/object:Gem::Version
|
|
101
|
+
version: '3'
|
|
61
102
|
- !ruby/object:Gem::Dependency
|
|
62
103
|
name: kramdown
|
|
63
104
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -180,16 +221,16 @@ dependencies:
|
|
|
180
221
|
name: sanitize
|
|
181
222
|
requirement: !ruby/object:Gem::Requirement
|
|
182
223
|
requirements:
|
|
183
|
-
- - "
|
|
224
|
+
- - ">="
|
|
184
225
|
- !ruby/object:Gem::Version
|
|
185
|
-
version: '
|
|
226
|
+
version: '0'
|
|
186
227
|
type: :runtime
|
|
187
228
|
prerelease: false
|
|
188
229
|
version_requirements: !ruby/object:Gem::Requirement
|
|
189
230
|
requirements:
|
|
190
|
-
- - "
|
|
231
|
+
- - ">="
|
|
191
232
|
- !ruby/object:Gem::Version
|
|
192
|
-
version: '
|
|
233
|
+
version: '0'
|
|
193
234
|
- !ruby/object:Gem::Dependency
|
|
194
235
|
name: thor
|
|
195
236
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -222,16 +263,16 @@ dependencies:
|
|
|
222
263
|
name: zeitwerk
|
|
223
264
|
requirement: !ruby/object:Gem::Requirement
|
|
224
265
|
requirements:
|
|
225
|
-
- - "
|
|
266
|
+
- - ">="
|
|
226
267
|
- !ruby/object:Gem::Version
|
|
227
|
-
version:
|
|
268
|
+
version: '0'
|
|
228
269
|
type: :runtime
|
|
229
270
|
prerelease: false
|
|
230
271
|
version_requirements: !ruby/object:Gem::Requirement
|
|
231
272
|
requirements:
|
|
232
|
-
- - "
|
|
273
|
+
- - ">="
|
|
233
274
|
- !ruby/object:Gem::Version
|
|
234
|
-
version:
|
|
275
|
+
version: '0'
|
|
235
276
|
description: Supports JSON content, custom HTTP headers, and post-processing of extracted
|
|
236
277
|
content.
|
|
237
278
|
email:
|
|
@@ -246,66 +287,100 @@ files:
|
|
|
246
287
|
- exe/html2rss
|
|
247
288
|
- html2rss.gemspec
|
|
248
289
|
- lib/html2rss.rb
|
|
249
|
-
- lib/html2rss/attribute_post_processors.rb
|
|
250
|
-
- lib/html2rss/attribute_post_processors/base.rb
|
|
251
|
-
- lib/html2rss/attribute_post_processors/gsub.rb
|
|
252
|
-
- lib/html2rss/attribute_post_processors/html_to_markdown.rb
|
|
253
|
-
- lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb
|
|
254
|
-
- lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb
|
|
255
|
-
- lib/html2rss/attribute_post_processors/markdown_to_html.rb
|
|
256
|
-
- lib/html2rss/attribute_post_processors/parse_time.rb
|
|
257
|
-
- lib/html2rss/attribute_post_processors/parse_uri.rb
|
|
258
|
-
- lib/html2rss/attribute_post_processors/sanitize_html.rb
|
|
259
|
-
- lib/html2rss/attribute_post_processors/substring.rb
|
|
260
|
-
- lib/html2rss/attribute_post_processors/template.rb
|
|
290
|
+
- lib/html2rss/articles/deduplicator.rb
|
|
261
291
|
- lib/html2rss/auto_source.rb
|
|
262
|
-
- lib/html2rss/auto_source/article.rb
|
|
263
|
-
- lib/html2rss/auto_source/channel.rb
|
|
264
292
|
- lib/html2rss/auto_source/cleanup.rb
|
|
265
|
-
- lib/html2rss/auto_source/reducer.rb
|
|
266
|
-
- lib/html2rss/auto_source/rss_builder.rb
|
|
267
293
|
- lib/html2rss/auto_source/scraper.rb
|
|
268
294
|
- lib/html2rss/auto_source/scraper/html.rb
|
|
295
|
+
- lib/html2rss/auto_source/scraper/json_state.rb
|
|
296
|
+
- lib/html2rss/auto_source/scraper/microdata.rb
|
|
269
297
|
- lib/html2rss/auto_source/scraper/schema.rb
|
|
298
|
+
- lib/html2rss/auto_source/scraper/schema/category_extractor.rb
|
|
270
299
|
- lib/html2rss/auto_source/scraper/schema/item_list.rb
|
|
271
300
|
- lib/html2rss/auto_source/scraper/schema/list_item.rb
|
|
272
301
|
- lib/html2rss/auto_source/scraper/schema/thing.rb
|
|
273
302
|
- lib/html2rss/auto_source/scraper/semantic_html.rb
|
|
274
|
-
- lib/html2rss/auto_source/scraper/semantic_html/
|
|
275
|
-
- lib/html2rss/auto_source/scraper/
|
|
303
|
+
- lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb
|
|
304
|
+
- lib/html2rss/auto_source/scraper/wordpress_api.rb
|
|
305
|
+
- lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb
|
|
306
|
+
- lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb
|
|
307
|
+
- lib/html2rss/blocked_surface.rb
|
|
308
|
+
- lib/html2rss/category_extractor.rb
|
|
276
309
|
- lib/html2rss/cli.rb
|
|
277
310
|
- lib/html2rss/config.rb
|
|
278
|
-
- lib/html2rss/config/
|
|
279
|
-
- lib/html2rss/config/
|
|
280
|
-
- lib/html2rss/
|
|
281
|
-
- lib/html2rss/
|
|
282
|
-
- lib/html2rss/
|
|
283
|
-
- lib/html2rss/
|
|
284
|
-
- lib/html2rss/
|
|
285
|
-
- lib/html2rss/
|
|
286
|
-
- lib/html2rss/
|
|
287
|
-
- lib/html2rss/
|
|
311
|
+
- lib/html2rss/config/class_methods.rb
|
|
312
|
+
- lib/html2rss/config/dynamic_params.rb
|
|
313
|
+
- lib/html2rss/config/multiple_feeds_config.rb
|
|
314
|
+
- lib/html2rss/config/request_headers.rb
|
|
315
|
+
- lib/html2rss/config/schema.rb
|
|
316
|
+
- lib/html2rss/config/validator.rb
|
|
317
|
+
- lib/html2rss/error.rb
|
|
318
|
+
- lib/html2rss/html_extractor.rb
|
|
319
|
+
- lib/html2rss/html_extractor/date_extractor.rb
|
|
320
|
+
- lib/html2rss/html_extractor/enclosure_extractor.rb
|
|
321
|
+
- lib/html2rss/html_extractor/image_extractor.rb
|
|
322
|
+
- lib/html2rss/html_navigator.rb
|
|
323
|
+
- lib/html2rss/json_feed_builder.rb
|
|
324
|
+
- lib/html2rss/json_feed_builder/item.rb
|
|
325
|
+
- lib/html2rss/rendering.rb
|
|
326
|
+
- lib/html2rss/rendering/audio_renderer.rb
|
|
327
|
+
- lib/html2rss/rendering/description_builder.rb
|
|
328
|
+
- lib/html2rss/rendering/image_renderer.rb
|
|
329
|
+
- lib/html2rss/rendering/media_renderer.rb
|
|
330
|
+
- lib/html2rss/rendering/pdf_renderer.rb
|
|
331
|
+
- lib/html2rss/rendering/video_renderer.rb
|
|
332
|
+
- lib/html2rss/request_controls.rb
|
|
288
333
|
- lib/html2rss/request_service.rb
|
|
289
334
|
- lib/html2rss/request_service/browserless_strategy.rb
|
|
335
|
+
- lib/html2rss/request_service/budget.rb
|
|
290
336
|
- lib/html2rss/request_service/context.rb
|
|
291
337
|
- lib/html2rss/request_service/faraday_strategy.rb
|
|
338
|
+
- lib/html2rss/request_service/policy.rb
|
|
292
339
|
- lib/html2rss/request_service/puppet_commander.rb
|
|
293
340
|
- lib/html2rss/request_service/response.rb
|
|
341
|
+
- lib/html2rss/request_service/response_guard.rb
|
|
294
342
|
- lib/html2rss/request_service/strategy.rb
|
|
343
|
+
- lib/html2rss/request_session.rb
|
|
344
|
+
- lib/html2rss/request_session/rel_next_pager.rb
|
|
345
|
+
- lib/html2rss/request_session/runtime_input.rb
|
|
346
|
+
- lib/html2rss/request_session/runtime_policy.rb
|
|
295
347
|
- lib/html2rss/rss_builder.rb
|
|
348
|
+
- lib/html2rss/rss_builder/article.rb
|
|
296
349
|
- lib/html2rss/rss_builder/channel.rb
|
|
297
|
-
- lib/html2rss/rss_builder/
|
|
350
|
+
- lib/html2rss/rss_builder/enclosure.rb
|
|
298
351
|
- lib/html2rss/rss_builder/stylesheet.rb
|
|
299
|
-
- lib/html2rss/
|
|
352
|
+
- lib/html2rss/selectors.rb
|
|
353
|
+
- lib/html2rss/selectors/config.rb
|
|
354
|
+
- lib/html2rss/selectors/extractors.rb
|
|
355
|
+
- lib/html2rss/selectors/extractors/attribute.rb
|
|
356
|
+
- lib/html2rss/selectors/extractors/href.rb
|
|
357
|
+
- lib/html2rss/selectors/extractors/html.rb
|
|
358
|
+
- lib/html2rss/selectors/extractors/static.rb
|
|
359
|
+
- lib/html2rss/selectors/extractors/text.rb
|
|
360
|
+
- lib/html2rss/selectors/object_to_xml_converter.rb
|
|
361
|
+
- lib/html2rss/selectors/post_processors.rb
|
|
362
|
+
- lib/html2rss/selectors/post_processors/base.rb
|
|
363
|
+
- lib/html2rss/selectors/post_processors/gsub.rb
|
|
364
|
+
- lib/html2rss/selectors/post_processors/html_to_markdown.rb
|
|
365
|
+
- lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb
|
|
366
|
+
- lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb
|
|
367
|
+
- lib/html2rss/selectors/post_processors/markdown_to_html.rb
|
|
368
|
+
- lib/html2rss/selectors/post_processors/parse_time.rb
|
|
369
|
+
- lib/html2rss/selectors/post_processors/parse_uri.rb
|
|
370
|
+
- lib/html2rss/selectors/post_processors/sanitize_html.rb
|
|
371
|
+
- lib/html2rss/selectors/post_processors/substring.rb
|
|
372
|
+
- lib/html2rss/selectors/post_processors/template.rb
|
|
373
|
+
- lib/html2rss/url.rb
|
|
300
374
|
- lib/html2rss/version.rb
|
|
375
|
+
- lib/tasks/config_schema.rake
|
|
376
|
+
- schema/html2rss-config.schema.json
|
|
301
377
|
homepage: https://github.com/html2rss/html2rss
|
|
302
378
|
licenses:
|
|
303
379
|
- MIT
|
|
304
380
|
metadata:
|
|
305
381
|
allowed_push_host: https://rubygems.org
|
|
306
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.16.0
|
|
382
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.18.0
|
|
307
383
|
rubygems_mfa_required: 'true'
|
|
308
|
-
post_install_message:
|
|
309
384
|
rdoc_options: []
|
|
310
385
|
require_paths:
|
|
311
386
|
- lib
|
|
@@ -313,15 +388,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
313
388
|
requirements:
|
|
314
389
|
- - ">="
|
|
315
390
|
- !ruby/object:Gem::Version
|
|
316
|
-
version: '3.
|
|
391
|
+
version: '3.2'
|
|
317
392
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
318
393
|
requirements:
|
|
319
394
|
- - ">="
|
|
320
395
|
- !ruby/object:Gem::Version
|
|
321
396
|
version: '0'
|
|
322
397
|
requirements: []
|
|
323
|
-
rubygems_version:
|
|
324
|
-
signing_key:
|
|
398
|
+
rubygems_version: 4.0.6
|
|
325
399
|
specification_version: 4
|
|
326
400
|
summary: Generates RSS feeds from websites by scraping a URL and using CSS selectors
|
|
327
401
|
to extract item.
|
|
@@ -1,74 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Html2rss
|
|
4
|
-
##
|
|
5
|
-
# Provides a namespace for attribute post processors.
|
|
6
|
-
module AttributePostProcessors
|
|
7
|
-
##
|
|
8
|
-
# All post processors must inherit from this base class and implement `self.validate_args!` and `#get`.
|
|
9
|
-
class Base
|
|
10
|
-
# Validates the presence of required options in the context
|
|
11
|
-
#
|
|
12
|
-
# @param keys [Array<Symbol>] the keys to check for presence
|
|
13
|
-
# @param context [Hash] the context containing options
|
|
14
|
-
# @raise [MissingOption] if any key is missing
|
|
15
|
-
def self.expect_options(keys, context)
|
|
16
|
-
keys.each do |key|
|
|
17
|
-
unless (options = context[:options]).key?(key)
|
|
18
|
-
raise MissingOption, "The `#{key}` option is missing in: #{options.inspect}", [],
|
|
19
|
-
cause: nil
|
|
20
|
-
end
|
|
21
|
-
end
|
|
22
|
-
end
|
|
23
|
-
|
|
24
|
-
# Asserts that the value is of the expected type(s)
|
|
25
|
-
#
|
|
26
|
-
# @param value [Object] the value to check
|
|
27
|
-
# @param types [Array<Class>, Class] the expected type(s)
|
|
28
|
-
# @param name [String] the name of the option being checked
|
|
29
|
-
# @param context [Item::Context] the context
|
|
30
|
-
# @raise [InvalidType] if the value is not of the expected type(s)
|
|
31
|
-
def self.assert_type(value, types = [], name, context:)
|
|
32
|
-
types = [types] unless types.is_a?(Array)
|
|
33
|
-
|
|
34
|
-
return if types.any? { |type| value.is_a?(type) }
|
|
35
|
-
|
|
36
|
-
options = context[:options] if context.is_a?(Hash)
|
|
37
|
-
options ||= { file: File.basename(caller_locations(1, 1).first.absolute_path) }
|
|
38
|
-
|
|
39
|
-
raise InvalidType, format('The type of `%<name>s` must be %<types>s, but is: %<type>s in: %<options>s',
|
|
40
|
-
name:, types: types.join(' or '), type: value.class, options: options.inspect),
|
|
41
|
-
[], cause: nil
|
|
42
|
-
end
|
|
43
|
-
|
|
44
|
-
##
|
|
45
|
-
# This method validates the arguments passed to the post processor. Must be implemented by subclasses.
|
|
46
|
-
def self.validate_args!(_value, _context)
|
|
47
|
-
raise NotImplementedError, 'You must implement the `validate_args!` method in the post processor'
|
|
48
|
-
end
|
|
49
|
-
|
|
50
|
-
# Initializes the post processor
|
|
51
|
-
#
|
|
52
|
-
# @param value [Object] the value to be processed
|
|
53
|
-
# @param context [Item::Context] the context
|
|
54
|
-
def initialize(value, context)
|
|
55
|
-
klass = self.class
|
|
56
|
-
# TODO: get rid of Hash
|
|
57
|
-
klass.assert_type(context, [Item::Context, Hash], 'context', context:)
|
|
58
|
-
klass.validate_args!(value, context)
|
|
59
|
-
|
|
60
|
-
@value = value
|
|
61
|
-
@context = context
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
attr_reader :value, :context
|
|
65
|
-
|
|
66
|
-
# Abstract method to be implemented by subclasses
|
|
67
|
-
#
|
|
68
|
-
# @raise [NotImplementedError] if not implemented in subclass
|
|
69
|
-
def get
|
|
70
|
-
raise NotImplementedError, 'You must implement the `get` method in the post processor'
|
|
71
|
-
end
|
|
72
|
-
end
|
|
73
|
-
end
|
|
74
|
-
end
|
|
@@ -1,64 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Html2rss
|
|
4
|
-
module AttributePostProcessors
|
|
5
|
-
##
|
|
6
|
-
# Imagine this HTML:
|
|
7
|
-
# <h1>Foo bar and boo<h1>
|
|
8
|
-
#
|
|
9
|
-
# YAML usage example:
|
|
10
|
-
# selectors:
|
|
11
|
-
# title:
|
|
12
|
-
# selector: h1
|
|
13
|
-
# post_process:
|
|
14
|
-
# name: gsub
|
|
15
|
-
# pattern: boo
|
|
16
|
-
# replacement: baz
|
|
17
|
-
#
|
|
18
|
-
# Would return:
|
|
19
|
-
# 'Foo bar and baz'
|
|
20
|
-
#
|
|
21
|
-
# `pattern` can be a Regexp or a String. If it is a String, it will remove
|
|
22
|
-
# one pair of surrounding slashes ('/') to keep backwards compatibility
|
|
23
|
-
# and then parse it to build a Regexp.
|
|
24
|
-
#
|
|
25
|
-
# `replacement` can be a String or a Hash.
|
|
26
|
-
#
|
|
27
|
-
# See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
|
|
28
|
-
class Gsub < Base
|
|
29
|
-
def self.validate_args!(value, context)
|
|
30
|
-
assert_type value, String, :value, context:
|
|
31
|
-
expect_options(%i[replacement pattern], context)
|
|
32
|
-
assert_type context.dig(:options, :replacement), [String, Hash], :replacement, context:
|
|
33
|
-
end
|
|
34
|
-
|
|
35
|
-
##
|
|
36
|
-
# @param value [String]
|
|
37
|
-
# @param context [Item::Context]
|
|
38
|
-
def initialize(value, context)
|
|
39
|
-
super
|
|
40
|
-
|
|
41
|
-
options = context[:options]
|
|
42
|
-
|
|
43
|
-
@replacement = options[:replacement]
|
|
44
|
-
@pattern = options[:pattern]
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
##
|
|
48
|
-
# @return [String]
|
|
49
|
-
def get
|
|
50
|
-
value.to_s.gsub(pattern, replacement)
|
|
51
|
-
end
|
|
52
|
-
|
|
53
|
-
private
|
|
54
|
-
|
|
55
|
-
attr_accessor :replacement
|
|
56
|
-
|
|
57
|
-
##
|
|
58
|
-
# @return [Regexp]
|
|
59
|
-
def pattern
|
|
60
|
-
@pattern.is_a?(String) ? Utils.build_regexp_from_string(@pattern) : @pattern
|
|
61
|
-
end
|
|
62
|
-
end
|
|
63
|
-
end
|
|
64
|
-
end
|
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'reverse_markdown'
|
|
4
|
-
|
|
5
|
-
module Html2rss
|
|
6
|
-
module AttributePostProcessors
|
|
7
|
-
##
|
|
8
|
-
# Returns HTML code as Markdown formatted String.
|
|
9
|
-
# Before converting to markdown, the HTML is sanitized with SanitizeHtml.
|
|
10
|
-
# Imagine this HTML structure:
|
|
11
|
-
#
|
|
12
|
-
# <section>
|
|
13
|
-
# Lorem <b>ipsum</b> dolor...
|
|
14
|
-
# <iframe src="https://evil.corp/miner"></iframe>
|
|
15
|
-
# <script>alert();</script>
|
|
16
|
-
# </section>
|
|
17
|
-
#
|
|
18
|
-
# YAML usage example:
|
|
19
|
-
#
|
|
20
|
-
# selectors:
|
|
21
|
-
# description:
|
|
22
|
-
# selector: section
|
|
23
|
-
# extractor: html
|
|
24
|
-
# post_process:
|
|
25
|
-
# name: html_to_markdown
|
|
26
|
-
#
|
|
27
|
-
# Would return:
|
|
28
|
-
# 'Lorem **ipsum** dolor'
|
|
29
|
-
class HtmlToMarkdown < Base
|
|
30
|
-
def self.validate_args!(value, context)
|
|
31
|
-
assert_type value, String, :value, context:
|
|
32
|
-
end
|
|
33
|
-
|
|
34
|
-
##
|
|
35
|
-
# @return [String] formatted in Markdown
|
|
36
|
-
def get
|
|
37
|
-
sanitized_value = SanitizeHtml.new(value, context).get
|
|
38
|
-
|
|
39
|
-
ReverseMarkdown.convert(sanitized_value)
|
|
40
|
-
end
|
|
41
|
-
end
|
|
42
|
-
end
|
|
43
|
-
end
|
data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb
DELETED
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Html2rss
|
|
4
|
-
module AttributePostProcessors
|
|
5
|
-
module HtmlTransformers
|
|
6
|
-
##
|
|
7
|
-
# Transformer that converts relative URLs to absolute URLs within specified HTML elements.
|
|
8
|
-
class TransformUrlsToAbsoluteOnes
|
|
9
|
-
URL_ELEMENTS_WITH_URL_ATTRIBUTE = { 'a' => :href, 'img' => :src }.freeze
|
|
10
|
-
|
|
11
|
-
def initialize(channel_url)
|
|
12
|
-
@channel_url = channel_url
|
|
13
|
-
end
|
|
14
|
-
|
|
15
|
-
##
|
|
16
|
-
# Transforms URLs to absolute ones.
|
|
17
|
-
def call(node_name:, node:, **_env)
|
|
18
|
-
return unless URL_ELEMENTS_WITH_URL_ATTRIBUTE.key?(node_name)
|
|
19
|
-
|
|
20
|
-
url_attribute = URL_ELEMENTS_WITH_URL_ATTRIBUTE[node_name]
|
|
21
|
-
url = node[url_attribute]
|
|
22
|
-
node[url_attribute] = Html2rss::Utils.build_absolute_url_from_relative(url, @channel_url)
|
|
23
|
-
end
|
|
24
|
-
end
|
|
25
|
-
end
|
|
26
|
-
end
|
|
27
|
-
end
|
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Html2rss
|
|
4
|
-
module AttributePostProcessors
|
|
5
|
-
module HtmlTransformers
|
|
6
|
-
##
|
|
7
|
-
# Transformer that wraps <img> tags into <a> tags linking to `img.src`.
|
|
8
|
-
class WrapImgInA
|
|
9
|
-
##
|
|
10
|
-
# Wraps <img> tags into <a> tags that link to `img.src`.
|
|
11
|
-
#
|
|
12
|
-
# @param node_name [String]
|
|
13
|
-
# @param node [Nokogiri::XML::Node]
|
|
14
|
-
# @return [nil]
|
|
15
|
-
def call(node_name:, node:, **_env)
|
|
16
|
-
return unless already_wrapped?(node_name, node)
|
|
17
|
-
|
|
18
|
-
wrap_image_in_anchor(node)
|
|
19
|
-
end
|
|
20
|
-
|
|
21
|
-
def already_wrapped?(node_name, node)
|
|
22
|
-
node_name == 'img' && node.parent.name != 'a'
|
|
23
|
-
end
|
|
24
|
-
|
|
25
|
-
private
|
|
26
|
-
|
|
27
|
-
##
|
|
28
|
-
# Wraps the <img> node in an <a> tag.
|
|
29
|
-
#
|
|
30
|
-
# @param node [Nokogiri::XML::Node]
|
|
31
|
-
# @return [nil]
|
|
32
|
-
def wrap_image_in_anchor(node)
|
|
33
|
-
anchor = Nokogiri::XML::Node.new('a', node.document)
|
|
34
|
-
anchor['href'] = node['src']
|
|
35
|
-
node.add_next_sibling(anchor)
|
|
36
|
-
anchor.add_child(node.remove)
|
|
37
|
-
end
|
|
38
|
-
end
|
|
39
|
-
end
|
|
40
|
-
end
|
|
41
|
-
end
|
|
@@ -1,50 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'kramdown'
|
|
4
|
-
require_relative 'sanitize_html'
|
|
5
|
-
|
|
6
|
-
module Html2rss
|
|
7
|
-
module AttributePostProcessors
|
|
8
|
-
##
|
|
9
|
-
# Generates HTML from Markdown.
|
|
10
|
-
#
|
|
11
|
-
# It's particularly useful in conjunction with the Template post processor
|
|
12
|
-
# to generate a description from other selectors.
|
|
13
|
-
#
|
|
14
|
-
# YAML usage example:
|
|
15
|
-
#
|
|
16
|
-
# selectors:
|
|
17
|
-
# description:
|
|
18
|
-
# selector: section
|
|
19
|
-
# post_process:
|
|
20
|
-
# - name: template
|
|
21
|
-
# string: |
|
|
22
|
-
# # %s
|
|
23
|
-
#
|
|
24
|
-
# Price: %s
|
|
25
|
-
# methods:
|
|
26
|
-
# - self
|
|
27
|
-
# - price
|
|
28
|
-
# - name: markdown_to_html
|
|
29
|
-
#
|
|
30
|
-
# Would e.g. return:
|
|
31
|
-
#
|
|
32
|
-
# <h1>Section</h1>
|
|
33
|
-
#
|
|
34
|
-
# <p>Price: 12.34</p>
|
|
35
|
-
class MarkdownToHtml < Base
|
|
36
|
-
def self.validate_args!(value, context)
|
|
37
|
-
assert_type value, String, :value, context:
|
|
38
|
-
end
|
|
39
|
-
|
|
40
|
-
##
|
|
41
|
-
# Converts Markdown to sanitized HTML.
|
|
42
|
-
#
|
|
43
|
-
# @return [String] Sanitized HTML content
|
|
44
|
-
def get
|
|
45
|
-
html_content = Kramdown::Document.new(value).to_html
|
|
46
|
-
SanitizeHtml.new(html_content, context).get
|
|
47
|
-
end
|
|
48
|
-
end
|
|
49
|
-
end
|
|
50
|
-
end
|
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'time'
|
|
4
|
-
require_relative '../utils'
|
|
5
|
-
|
|
6
|
-
module Html2rss
|
|
7
|
-
module AttributePostProcessors
|
|
8
|
-
##
|
|
9
|
-
# Returns the {https://www.w3.org/Protocols/rfc822/ RFC822} representation of a time.
|
|
10
|
-
#
|
|
11
|
-
# Imagine this HTML structure:
|
|
12
|
-
#
|
|
13
|
-
# <p>Published on <span>2019-07-02</span></p>
|
|
14
|
-
#
|
|
15
|
-
# YAML usage example:
|
|
16
|
-
#
|
|
17
|
-
# selectors:
|
|
18
|
-
# description:
|
|
19
|
-
# selector: span
|
|
20
|
-
# post_process:
|
|
21
|
-
# name: 'parse_time'
|
|
22
|
-
# time_zone: 'Europe/Berlin'
|
|
23
|
-
#
|
|
24
|
-
# Would return:
|
|
25
|
-
# "Tue, 02 Jul 2019 00:00:00 +0200"
|
|
26
|
-
#
|
|
27
|
-
# It uses `Time.parse`.
|
|
28
|
-
class ParseTime < Base
|
|
29
|
-
def self.validate_args!(value, context)
|
|
30
|
-
assert_type(value, String, :value, context:)
|
|
31
|
-
assert_type(context[:config].time_zone, String, :time_zone, context:)
|
|
32
|
-
end
|
|
33
|
-
|
|
34
|
-
##
|
|
35
|
-
# Converts the provided time string to RFC822 format, taking into account the time_zone.
|
|
36
|
-
#
|
|
37
|
-
# @return [String] RFC822 formatted time
|
|
38
|
-
# @raise [TZInfo::InvalidTimezoneIdentifier] if the configured time zone is invalid
|
|
39
|
-
def get
|
|
40
|
-
time_zone = context[:config].time_zone
|
|
41
|
-
|
|
42
|
-
Utils.use_zone(time_zone) { Time.parse(value).rfc822 }
|
|
43
|
-
end
|
|
44
|
-
end
|
|
45
|
-
end
|
|
46
|
-
end
|
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Html2rss
|
|
4
|
-
module AttributePostProcessors
|
|
5
|
-
##
|
|
6
|
-
# Returns the URI as String.
|
|
7
|
-
# If the URL is relative, it builds an absolute one with the channel's URL as base.
|
|
8
|
-
#
|
|
9
|
-
# Imagine this HTML structure:
|
|
10
|
-
#
|
|
11
|
-
# <span>http://why-not-use-a-link.uh </span>
|
|
12
|
-
#
|
|
13
|
-
# YAML usage example:
|
|
14
|
-
#
|
|
15
|
-
# selectors:
|
|
16
|
-
# link:
|
|
17
|
-
# selector: span
|
|
18
|
-
# extractor: text
|
|
19
|
-
# post_process:
|
|
20
|
-
# name: parse_uri
|
|
21
|
-
#
|
|
22
|
-
# Would return:
|
|
23
|
-
# 'http://why-not-use-a-link.uh'
|
|
24
|
-
class ParseUri < Base
|
|
25
|
-
def self.validate_args!(value, context)
|
|
26
|
-
url_types = [String, URI::HTTP, Addressable::URI].freeze
|
|
27
|
-
|
|
28
|
-
assert_type(value, url_types, :value, context:)
|
|
29
|
-
assert_type(context.config.url, url_types, :url, context:)
|
|
30
|
-
|
|
31
|
-
raise ArgumentError, 'The `value` option is missing or empty.' if value.to_s.empty?
|
|
32
|
-
end
|
|
33
|
-
|
|
34
|
-
##
|
|
35
|
-
# @return [String]
|
|
36
|
-
def get
|
|
37
|
-
config_url = context.config.url
|
|
38
|
-
|
|
39
|
-
Html2rss::Utils.build_absolute_url_from_relative(
|
|
40
|
-
Html2rss::Utils.sanitize_url(value),
|
|
41
|
-
config_url
|
|
42
|
-
).to_s
|
|
43
|
-
end
|
|
44
|
-
end
|
|
45
|
-
end
|
|
46
|
-
end
|