html2rss 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. checksums.yaml +4 -4
  2. data/README.md +90 -639
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +50 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +44 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +123 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
  20. data/lib/html2rss/auto_source/scraper.rb +160 -8
  21. data/lib/html2rss/auto_source.rb +123 -47
  22. data/lib/html2rss/blocked_surface.rb +65 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +194 -23
  25. data/lib/html2rss/config/class_methods.rb +178 -0
  26. data/lib/html2rss/config/dynamic_params.rb +70 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
  28. data/lib/html2rss/config/request_headers.rb +136 -0
  29. data/lib/html2rss/config/schema.rb +240 -0
  30. data/lib/html2rss/config/validator.rb +146 -0
  31. data/lib/html2rss/config.rb +118 -61
  32. data/lib/html2rss/error.rb +31 -0
  33. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  34. data/lib/html2rss/feed_pipeline.rb +127 -0
  35. data/lib/html2rss/hash_util.rb +101 -0
  36. data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
  37. data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
  38. data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
  39. data/lib/html2rss/html_extractor.rb +141 -0
  40. data/lib/html2rss/html_navigator.rb +54 -0
  41. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  42. data/lib/html2rss/json_feed_builder.rb +59 -0
  43. data/lib/html2rss/rendering/audio_renderer.rb +36 -0
  44. data/lib/html2rss/rendering/description_builder.rb +87 -0
  45. data/lib/html2rss/rendering/image_renderer.rb +41 -0
  46. data/lib/html2rss/rendering/media_renderer.rb +37 -0
  47. data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
  48. data/lib/html2rss/rendering/video_renderer.rb +36 -0
  49. data/lib/html2rss/rendering.rb +23 -0
  50. data/lib/html2rss/request_controls.rb +123 -0
  51. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  52. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  53. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  54. data/lib/html2rss/request_service/budget.rb +39 -0
  55. data/lib/html2rss/request_service/context.rb +77 -21
  56. data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
  57. data/lib/html2rss/request_service/policy.rb +252 -0
  58. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  59. data/lib/html2rss/request_service/response.rb +51 -3
  60. data/lib/html2rss/request_service/response_guard.rb +62 -0
  61. data/lib/html2rss/request_service.rb +50 -15
  62. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  63. data/lib/html2rss/request_session/runtime_input.rb +71 -0
  64. data/lib/html2rss/request_session/runtime_policy.rb +83 -0
  65. data/lib/html2rss/request_session.rb +122 -0
  66. data/lib/html2rss/rss_builder/article.rb +187 -0
  67. data/lib/html2rss/rss_builder/channel.rb +105 -11
  68. data/lib/html2rss/rss_builder/enclosure.rb +62 -0
  69. data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
  70. data/lib/html2rss/rss_builder.rb +76 -71
  71. data/lib/html2rss/selectors/config.rb +123 -0
  72. data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
  73. data/lib/html2rss/selectors/extractors/href.rb +55 -0
  74. data/lib/html2rss/selectors/extractors/html.rb +49 -0
  75. data/lib/html2rss/selectors/extractors/static.rb +42 -0
  76. data/lib/html2rss/selectors/extractors/text.rb +47 -0
  77. data/lib/html2rss/selectors/extractors.rb +53 -0
  78. data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
  79. data/lib/html2rss/selectors/post_processors/base.rb +80 -0
  80. data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
  81. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
  82. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
  83. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
  84. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
  85. data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
  86. data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
  87. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
  88. data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
  89. data/lib/html2rss/selectors/post_processors/template.rb +76 -0
  90. data/lib/html2rss/selectors/post_processors.rb +48 -0
  91. data/lib/html2rss/selectors.rb +301 -0
  92. data/lib/html2rss/url.rb +266 -0
  93. data/lib/html2rss/version.rb +2 -1
  94. data/lib/html2rss.rb +67 -71
  95. data/lib/tasks/config_schema.rake +17 -0
  96. data/schema/html2rss-config.schema.json +551 -0
  97. metadata +120 -38
  98. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  99. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  100. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  101. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  102. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  103. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  104. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  105. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  106. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  107. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  108. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  109. data/lib/html2rss/attribute_post_processors.rb +0 -44
  110. data/lib/html2rss/auto_source/article.rb +0 -127
  111. data/lib/html2rss/auto_source/channel.rb +0 -78
  112. data/lib/html2rss/auto_source/reducer.rb +0 -48
  113. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  114. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  115. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  116. data/lib/html2rss/config/channel.rb +0 -125
  117. data/lib/html2rss/config/selectors.rb +0 -103
  118. data/lib/html2rss/item.rb +0 -186
  119. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  120. data/lib/html2rss/item_extractors/href.rb +0 -52
  121. data/lib/html2rss/item_extractors/html.rb +0 -46
  122. data/lib/html2rss/item_extractors/static.rb +0 -39
  123. data/lib/html2rss/item_extractors/text.rb +0 -44
  124. data/lib/html2rss/item_extractors.rb +0 -88
  125. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  126. data/lib/html2rss/rss_builder/item.rb +0 -83
  127. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,551 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "type": "object",
4
+ "properties": {
5
+ "strategy": {
6
+ "type": "string",
7
+ "not": {
8
+ "type": "null"
9
+ }
10
+ },
11
+ "channel": {
12
+ "type": "object",
13
+ "properties": {
14
+ "url": {
15
+ "type": "string",
16
+ "minLength": 1
17
+ },
18
+ "title": {
19
+ "type": [
20
+ "null",
21
+ "string"
22
+ ]
23
+ },
24
+ "description": {
25
+ "type": [
26
+ "null",
27
+ "string"
28
+ ]
29
+ },
30
+ "language": {
31
+ "type": [
32
+ "null",
33
+ "string"
34
+ ]
35
+ },
36
+ "ttl": {
37
+ "type": [
38
+ "null",
39
+ "integer"
40
+ ],
41
+ "exclusiveMinimum": 0
42
+ },
43
+ "time_zone": {
44
+ "type": [
45
+ "null",
46
+ "string"
47
+ ]
48
+ }
49
+ },
50
+ "required": [
51
+ "url"
52
+ ]
53
+ },
54
+ "headers": {
55
+ "type": "object",
56
+ "description": "HTTP headers applied to every request.",
57
+ "additionalProperties": {
58
+ "type": "string"
59
+ }
60
+ },
61
+ "stylesheets": {
62
+ "type": "array",
63
+ "description": "Collection of stylesheets to attach to the RSS feed.",
64
+ "items": {
65
+ "title": "Stylesheet",
66
+ "type": "object",
67
+ "properties": {
68
+ "href": {
69
+ "type": "string",
70
+ "minLength": 1
71
+ },
72
+ "type": {
73
+ "type": "string",
74
+ "minLength": 1,
75
+ "enum": [
76
+ "text/css",
77
+ "text/xsl"
78
+ ]
79
+ },
80
+ "media": {
81
+ "type": [
82
+ "null",
83
+ "string"
84
+ ]
85
+ }
86
+ },
87
+ "required": [
88
+ "href",
89
+ "type"
90
+ ]
91
+ }
92
+ },
93
+ "auto_source": {
94
+ "title": "Auto source",
95
+ "type": "object",
96
+ "properties": {
97
+ "scraper": {
98
+ "type": "object",
99
+ "properties": {
100
+ "wordpress_api": {
101
+ "type": "object",
102
+ "properties": {
103
+ "enabled": {
104
+ "type": "boolean",
105
+ "not": {
106
+ "type": "null"
107
+ }
108
+ }
109
+ },
110
+ "required": []
111
+ },
112
+ "schema": {
113
+ "type": "object",
114
+ "properties": {
115
+ "enabled": {
116
+ "type": "boolean",
117
+ "not": {
118
+ "type": "null"
119
+ }
120
+ }
121
+ },
122
+ "required": []
123
+ },
124
+ "microdata": {
125
+ "type": "object",
126
+ "properties": {
127
+ "enabled": {
128
+ "type": "boolean",
129
+ "not": {
130
+ "type": "null"
131
+ }
132
+ }
133
+ },
134
+ "required": []
135
+ },
136
+ "json_state": {
137
+ "type": "object",
138
+ "properties": {
139
+ "enabled": {
140
+ "type": "boolean",
141
+ "not": {
142
+ "type": "null"
143
+ }
144
+ }
145
+ },
146
+ "required": []
147
+ },
148
+ "semantic_html": {
149
+ "type": "object",
150
+ "properties": {
151
+ "enabled": {
152
+ "type": "boolean",
153
+ "not": {
154
+ "type": "null"
155
+ }
156
+ }
157
+ },
158
+ "required": []
159
+ },
160
+ "html": {
161
+ "type": "object",
162
+ "properties": {
163
+ "enabled": {
164
+ "type": "boolean",
165
+ "not": {
166
+ "type": "null"
167
+ }
168
+ },
169
+ "minimum_selector_frequency": {
170
+ "type": "integer",
171
+ "not": {
172
+ "type": "null"
173
+ },
174
+ "exclusiveMinimum": 0
175
+ },
176
+ "use_top_selectors": {
177
+ "type": "integer",
178
+ "not": {
179
+ "type": "null"
180
+ },
181
+ "exclusiveMinimum": 0
182
+ }
183
+ },
184
+ "required": []
185
+ }
186
+ },
187
+ "required": []
188
+ },
189
+ "cleanup": {
190
+ "type": "object",
191
+ "properties": {
192
+ "keep_different_domain": {
193
+ "type": "boolean",
194
+ "not": {
195
+ "type": "null"
196
+ }
197
+ },
198
+ "min_words_title": {
199
+ "type": "integer",
200
+ "not": {
201
+ "type": "null"
202
+ },
203
+ "exclusiveMinimum": 0
204
+ }
205
+ },
206
+ "required": []
207
+ }
208
+ },
209
+ "required": [],
210
+ "default": {
211
+ "scraper": {
212
+ "wordpress_api": {
213
+ "enabled": true
214
+ },
215
+ "schema": {
216
+ "enabled": true
217
+ },
218
+ "microdata": {
219
+ "enabled": true
220
+ },
221
+ "json_state": {
222
+ "enabled": true
223
+ },
224
+ "semantic_html": {
225
+ "enabled": true
226
+ },
227
+ "html": {
228
+ "enabled": true,
229
+ "minimum_selector_frequency": 2,
230
+ "use_top_selectors": 5
231
+ }
232
+ },
233
+ "cleanup": {
234
+ "keep_different_domain": false,
235
+ "min_words_title": 3
236
+ }
237
+ }
238
+ },
239
+ "selectors": {
240
+ "type": "object",
241
+ "description": "Selectors used to extract article attributes.",
242
+ "properties": {
243
+ "items": {
244
+ "title": "Items selector",
245
+ "type": "object",
246
+ "properties": {
247
+ "selector": {
248
+ "type": "string",
249
+ "minLength": 1
250
+ },
251
+ "order": {
252
+ "enum": [
253
+ "reverse"
254
+ ],
255
+ "not": {
256
+ "type": "null"
257
+ }
258
+ },
259
+ "enhance": {
260
+ "type": "boolean",
261
+ "not": {
262
+ "type": "null"
263
+ }
264
+ },
265
+ "pagination": {
266
+ "type": "object",
267
+ "properties": {
268
+ "max_pages": {
269
+ "type": "integer",
270
+ "not": {
271
+ "type": "null"
272
+ },
273
+ "exclusiveMinimum": 0
274
+ }
275
+ },
276
+ "required": [
277
+ "max_pages"
278
+ ]
279
+ }
280
+ },
281
+ "required": [
282
+ "selector"
283
+ ],
284
+ "description": "Defines the items selector and optional enhancement settings."
285
+ },
286
+ "enclosure": {
287
+ "title": "Enclosure selector",
288
+ "type": "object",
289
+ "properties": {
290
+ "extractor": {
291
+ "type": "string",
292
+ "minLength": 1
293
+ },
294
+ "attribute": {
295
+ "type": "string",
296
+ "minLength": 1
297
+ },
298
+ "static": {
299
+ "type": "string",
300
+ "minLength": 1
301
+ },
302
+ "post_process": {
303
+ "type": "array",
304
+ "items": {
305
+ "type": "object"
306
+ }
307
+ },
308
+ "content_type": {
309
+ "type": "string",
310
+ "minLength": 1
311
+ }
312
+ },
313
+ "required": [],
314
+ "description": "Describes enclosure extraction settings."
315
+ },
316
+ "guid": {
317
+ "type": "array",
318
+ "description": "List of selector keys used to build the GUID. Each entry must reference a sibling selector key; runtime validation enforces those references.",
319
+ "minItems": 1,
320
+ "items": {
321
+ "type": "string",
322
+ "description": "Selector key defined elsewhere in this object."
323
+ }
324
+ },
325
+ "categories": {
326
+ "type": "array",
327
+ "description": "List of selector keys whose values will be used as categories. Each entry must reference a sibling selector key; runtime validation enforces those references.",
328
+ "minItems": 1,
329
+ "items": {
330
+ "type": "string",
331
+ "description": "Selector key defined elsewhere in this object."
332
+ }
333
+ }
334
+ },
335
+ "patternProperties": {
336
+ "^(?!items$|enclosure$|guid$|categories$).+$": {
337
+ "title": "Dynamic selector",
338
+ "type": "object",
339
+ "properties": {
340
+ "extractor": {
341
+ "type": "string",
342
+ "minLength": 1
343
+ },
344
+ "attribute": {
345
+ "type": "string",
346
+ "minLength": 1
347
+ },
348
+ "static": {
349
+ "type": "string",
350
+ "minLength": 1
351
+ },
352
+ "post_process": {
353
+ "type": "array",
354
+ "items": {
355
+ "type": "object"
356
+ }
357
+ }
358
+ },
359
+ "required": [],
360
+ "description": "Dynamic selector definition keyed by attribute name."
361
+ }
362
+ },
363
+ "additionalProperties": true
364
+ },
365
+ "request": {
366
+ "type": "object",
367
+ "properties": {
368
+ "max_redirects": {
369
+ "type": "integer",
370
+ "not": {
371
+ "type": "null"
372
+ },
373
+ "minimum": 0
374
+ },
375
+ "max_requests": {
376
+ "type": "integer",
377
+ "not": {
378
+ "type": "null"
379
+ },
380
+ "exclusiveMinimum": 0
381
+ },
382
+ "browserless": {
383
+ "type": "object",
384
+ "properties": {
385
+ "preload": {
386
+ "type": "object",
387
+ "properties": {
388
+ "wait_after_ms": {
389
+ "type": "integer",
390
+ "not": {
391
+ "type": "null"
392
+ },
393
+ "minimum": 0
394
+ },
395
+ "click_selectors": {
396
+ "type": "array",
397
+ "items": {
398
+ "type": "object",
399
+ "properties": {
400
+ "selector": {
401
+ "type": "string",
402
+ "minLength": 1
403
+ },
404
+ "max_clicks": {
405
+ "type": "integer",
406
+ "not": {
407
+ "type": "null"
408
+ },
409
+ "exclusiveMinimum": 0
410
+ },
411
+ "wait_after_ms": {
412
+ "type": "integer",
413
+ "not": {
414
+ "type": "null"
415
+ },
416
+ "minimum": 0
417
+ }
418
+ },
419
+ "required": [
420
+ "selector"
421
+ ]
422
+ }
423
+ },
424
+ "scroll_down": {
425
+ "type": "object",
426
+ "properties": {
427
+ "iterations": {
428
+ "type": "integer",
429
+ "not": {
430
+ "type": "null"
431
+ },
432
+ "exclusiveMinimum": 0
433
+ },
434
+ "wait_after_ms": {
435
+ "type": "integer",
436
+ "not": {
437
+ "type": "null"
438
+ },
439
+ "minimum": 0
440
+ }
441
+ },
442
+ "required": []
443
+ }
444
+ },
445
+ "required": []
446
+ }
447
+ },
448
+ "required": []
449
+ },
450
+ "botasaurus": {
451
+ "type": "object",
452
+ "properties": {
453
+ "navigation_mode": {
454
+ "type": "string",
455
+ "minLength": 1,
456
+ "enum": [
457
+ "auto",
458
+ "get",
459
+ "google_get",
460
+ "google_get_bypass"
461
+ ]
462
+ },
463
+ "max_retries": {
464
+ "type": "integer",
465
+ "not": {
466
+ "type": "null"
467
+ },
468
+ "minimum": 0,
469
+ "maximum": 3
470
+ },
471
+ "wait_for_selector": {
472
+ "type": [
473
+ "null",
474
+ "string"
475
+ ]
476
+ },
477
+ "wait_timeout_seconds": {
478
+ "type": "integer",
479
+ "not": {
480
+ "type": "null"
481
+ },
482
+ "exclusiveMinimum": 0
483
+ },
484
+ "block_images": {
485
+ "type": "boolean",
486
+ "not": {
487
+ "type": "null"
488
+ }
489
+ },
490
+ "block_images_and_css": {
491
+ "type": "boolean",
492
+ "not": {
493
+ "type": "null"
494
+ }
495
+ },
496
+ "wait_for_complete_page_load": {
497
+ "type": "boolean",
498
+ "not": {
499
+ "type": "null"
500
+ }
501
+ },
502
+ "headless": {
503
+ "type": "boolean",
504
+ "not": {
505
+ "type": "null"
506
+ }
507
+ },
508
+ "proxy": {
509
+ "type": "string",
510
+ "minLength": 1
511
+ },
512
+ "user_agent": {
513
+ "type": "string",
514
+ "minLength": 1
515
+ },
516
+ "window_size": {
517
+ "type": "array",
518
+ "minItems": 2,
519
+ "maxItems": 2,
520
+ "items": {
521
+ "type": "integer",
522
+ "exclusiveMinimum": 0
523
+ }
524
+ },
525
+ "lang": {
526
+ "type": "string",
527
+ "minLength": 1
528
+ }
529
+ },
530
+ "required": []
531
+ }
532
+ },
533
+ "required": []
534
+ }
535
+ },
536
+ "required": [
537
+ "channel"
538
+ ],
539
+ "anyOf": [
540
+ {
541
+ "required": [
542
+ "selectors"
543
+ ]
544
+ },
545
+ {
546
+ "required": [
547
+ "auto_source"
548
+ ]
549
+ }
550
+ ]
551
+ }