html2rss 0.16.0 → 0.18.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. checksums.yaml +4 -4
  2. data/README.md +48 -657
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +7 -4
  5. data/lib/html2rss/articles/deduplicator.rb +49 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +33 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +118 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
  20. data/lib/html2rss/auto_source/scraper.rb +142 -8
  21. data/lib/html2rss/auto_source.rb +119 -47
  22. data/lib/html2rss/blocked_surface.rb +64 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +170 -23
  25. data/lib/html2rss/config/class_methods.rb +189 -0
  26. data/lib/html2rss/config/dynamic_params.rb +68 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
  28. data/lib/html2rss/config/request_headers.rb +130 -0
  29. data/lib/html2rss/config/schema.rb +208 -0
  30. data/lib/html2rss/config/validator.rb +108 -0
  31. data/lib/html2rss/config.rb +112 -61
  32. data/lib/html2rss/error.rb +6 -0
  33. data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
  34. data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
  35. data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
  36. data/lib/html2rss/html_extractor.rb +136 -0
  37. data/lib/html2rss/html_navigator.rb +46 -0
  38. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  39. data/lib/html2rss/json_feed_builder.rb +58 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +31 -0
  41. data/lib/html2rss/rendering/description_builder.rb +88 -0
  42. data/lib/html2rss/rendering/image_renderer.rb +31 -0
  43. data/lib/html2rss/rendering/media_renderer.rb +33 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
  45. data/lib/html2rss/rendering/video_renderer.rb +31 -0
  46. data/lib/html2rss/rendering.rb +14 -0
  47. data/lib/html2rss/request_controls.rb +128 -0
  48. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  49. data/lib/html2rss/request_service/budget.rb +39 -0
  50. data/lib/html2rss/request_service/context.rb +64 -20
  51. data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
  52. data/lib/html2rss/request_service/policy.rb +248 -0
  53. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  54. data/lib/html2rss/request_service/response.rb +42 -2
  55. data/lib/html2rss/request_service/response_guard.rb +62 -0
  56. data/lib/html2rss/request_service.rb +31 -15
  57. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  58. data/lib/html2rss/request_session/runtime_input.rb +57 -0
  59. data/lib/html2rss/request_session/runtime_policy.rb +76 -0
  60. data/lib/html2rss/request_session.rb +118 -0
  61. data/lib/html2rss/rss_builder/article.rb +166 -0
  62. data/lib/html2rss/rss_builder/channel.rb +96 -11
  63. data/lib/html2rss/rss_builder/enclosure.rb +48 -0
  64. data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
  65. data/lib/html2rss/rss_builder.rb +72 -71
  66. data/lib/html2rss/selectors/config.rb +122 -0
  67. data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
  68. data/lib/html2rss/selectors/extractors/href.rb +53 -0
  69. data/lib/html2rss/selectors/extractors/html.rb +48 -0
  70. data/lib/html2rss/selectors/extractors/static.rb +41 -0
  71. data/lib/html2rss/selectors/extractors/text.rb +46 -0
  72. data/lib/html2rss/selectors/extractors.rb +52 -0
  73. data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
  74. data/lib/html2rss/selectors/post_processors/base.rb +74 -0
  75. data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
  76. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
  77. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
  78. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
  79. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
  80. data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
  81. data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
  82. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
  83. data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
  84. data/lib/html2rss/selectors/post_processors/template.rb +73 -0
  85. data/lib/html2rss/selectors/post_processors.rb +43 -0
  86. data/lib/html2rss/selectors.rb +294 -0
  87. data/lib/html2rss/url.rb +262 -0
  88. data/lib/html2rss/version.rb +1 -1
  89. data/lib/html2rss.rb +129 -70
  90. data/lib/tasks/config_schema.rake +17 -0
  91. data/schema/html2rss-config.schema.json +469 -0
  92. metadata +120 -46
  93. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  94. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  95. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  96. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  97. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  98. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  99. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  100. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  101. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
  102. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  103. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  104. data/lib/html2rss/attribute_post_processors.rb +0 -44
  105. data/lib/html2rss/auto_source/article.rb +0 -127
  106. data/lib/html2rss/auto_source/channel.rb +0 -78
  107. data/lib/html2rss/auto_source/reducer.rb +0 -48
  108. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  109. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  110. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  111. data/lib/html2rss/config/channel.rb +0 -125
  112. data/lib/html2rss/config/selectors.rb +0 -103
  113. data/lib/html2rss/item.rb +0 -186
  114. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  115. data/lib/html2rss/item_extractors/href.rb +0 -52
  116. data/lib/html2rss/item_extractors/html.rb +0 -46
  117. data/lib/html2rss/item_extractors/static.rb +0 -39
  118. data/lib/html2rss/item_extractors/text.rb +0 -44
  119. data/lib/html2rss/item_extractors.rb +0 -88
  120. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  121. data/lib/html2rss/rss_builder/item.rb +0 -83
  122. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,469 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "type": "object",
4
+ "properties": {
5
+ "strategy": {
6
+ "not": {
7
+ "type": "null"
8
+ }
9
+ },
10
+ "channel": {
11
+ "type": "object",
12
+ "properties": {
13
+ "url": {
14
+ "type": "string",
15
+ "minLength": 1
16
+ },
17
+ "title": {
18
+ "type": [
19
+ "null",
20
+ "string"
21
+ ]
22
+ },
23
+ "description": {
24
+ "type": [
25
+ "null",
26
+ "string"
27
+ ]
28
+ },
29
+ "language": {
30
+ "type": [
31
+ "null",
32
+ "string"
33
+ ]
34
+ },
35
+ "ttl": {
36
+ "type": [
37
+ "null",
38
+ "integer"
39
+ ],
40
+ "exclusiveMinimum": 0
41
+ },
42
+ "time_zone": {
43
+ "type": [
44
+ "null",
45
+ "string"
46
+ ]
47
+ }
48
+ },
49
+ "required": [
50
+ "url"
51
+ ]
52
+ },
53
+ "headers": {
54
+ "type": "object",
55
+ "description": "HTTP headers applied to every request.",
56
+ "additionalProperties": {
57
+ "type": "string"
58
+ }
59
+ },
60
+ "stylesheets": {
61
+ "type": "array",
62
+ "description": "Collection of stylesheets to attach to the RSS feed.",
63
+ "items": {
64
+ "$schema": "http://json-schema.org/draft-06/schema#",
65
+ "type": "object",
66
+ "properties": {
67
+ "href": {
68
+ "type": "string",
69
+ "minLength": 1
70
+ },
71
+ "type": {
72
+ "type": "string",
73
+ "minLength": 1,
74
+ "enum": [
75
+ "text/css",
76
+ "text/xsl"
77
+ ]
78
+ },
79
+ "media": {
80
+ "type": [
81
+ "null",
82
+ "string"
83
+ ]
84
+ }
85
+ },
86
+ "required": [
87
+ "href",
88
+ "type"
89
+ ]
90
+ }
91
+ },
92
+ "auto_source": {
93
+ "$schema": "http://json-schema.org/draft-06/schema#",
94
+ "type": "object",
95
+ "properties": {
96
+ "scraper": {
97
+ "type": "object",
98
+ "properties": {
99
+ "wordpress_api": {
100
+ "type": "object",
101
+ "properties": {
102
+ "enabled": {
103
+ "type": "boolean",
104
+ "not": {
105
+ "type": "null"
106
+ }
107
+ }
108
+ },
109
+ "required": []
110
+ },
111
+ "schema": {
112
+ "type": "object",
113
+ "properties": {
114
+ "enabled": {
115
+ "type": "boolean",
116
+ "not": {
117
+ "type": "null"
118
+ }
119
+ }
120
+ },
121
+ "required": []
122
+ },
123
+ "microdata": {
124
+ "type": "object",
125
+ "properties": {
126
+ "enabled": {
127
+ "type": "boolean",
128
+ "not": {
129
+ "type": "null"
130
+ }
131
+ }
132
+ },
133
+ "required": []
134
+ },
135
+ "json_state": {
136
+ "type": "object",
137
+ "properties": {
138
+ "enabled": {
139
+ "type": "boolean",
140
+ "not": {
141
+ "type": "null"
142
+ }
143
+ }
144
+ },
145
+ "required": []
146
+ },
147
+ "semantic_html": {
148
+ "type": "object",
149
+ "properties": {
150
+ "enabled": {
151
+ "type": "boolean",
152
+ "not": {
153
+ "type": "null"
154
+ }
155
+ }
156
+ },
157
+ "required": []
158
+ },
159
+ "html": {
160
+ "type": "object",
161
+ "properties": {
162
+ "enabled": {
163
+ "type": "boolean",
164
+ "not": {
165
+ "type": "null"
166
+ }
167
+ },
168
+ "minimum_selector_frequency": {
169
+ "type": "integer",
170
+ "not": {
171
+ "type": "null"
172
+ },
173
+ "exclusiveMinimum": 0
174
+ },
175
+ "use_top_selectors": {
176
+ "type": "integer",
177
+ "not": {
178
+ "type": "null"
179
+ },
180
+ "exclusiveMinimum": 0
181
+ }
182
+ },
183
+ "required": []
184
+ }
185
+ },
186
+ "required": []
187
+ },
188
+ "cleanup": {
189
+ "type": "object",
190
+ "properties": {
191
+ "keep_different_domain": {
192
+ "type": "boolean",
193
+ "not": {
194
+ "type": "null"
195
+ }
196
+ },
197
+ "min_words_title": {
198
+ "type": "integer",
199
+ "not": {
200
+ "type": "null"
201
+ },
202
+ "exclusiveMinimum": 0
203
+ }
204
+ },
205
+ "required": []
206
+ }
207
+ },
208
+ "required": [],
209
+ "default": {
210
+ "scraper": {
211
+ "wordpress_api": {
212
+ "enabled": true
213
+ },
214
+ "schema": {
215
+ "enabled": true
216
+ },
217
+ "microdata": {
218
+ "enabled": true
219
+ },
220
+ "json_state": {
221
+ "enabled": true
222
+ },
223
+ "semantic_html": {
224
+ "enabled": true
225
+ },
226
+ "html": {
227
+ "enabled": true,
228
+ "minimum_selector_frequency": 2,
229
+ "use_top_selectors": 5
230
+ }
231
+ },
232
+ "cleanup": {
233
+ "keep_different_domain": false,
234
+ "min_words_title": 3
235
+ }
236
+ }
237
+ },
238
+ "selectors": {
239
+ "type": "object",
240
+ "description": "Selectors used to extract article attributes.",
241
+ "properties": {
242
+ "items": {
243
+ "$schema": "http://json-schema.org/draft-06/schema#",
244
+ "type": "object",
245
+ "properties": {
246
+ "selector": {
247
+ "type": "string",
248
+ "minLength": 1
249
+ },
250
+ "order": {
251
+ "enum": [
252
+ "reverse"
253
+ ],
254
+ "not": {
255
+ "type": "null"
256
+ }
257
+ },
258
+ "enhance": {
259
+ "type": "boolean",
260
+ "not": {
261
+ "type": "null"
262
+ }
263
+ },
264
+ "pagination": {
265
+ "type": "object",
266
+ "properties": {
267
+ "max_pages": {
268
+ "type": "integer",
269
+ "not": {
270
+ "type": "null"
271
+ },
272
+ "exclusiveMinimum": 0
273
+ }
274
+ },
275
+ "required": [
276
+ "max_pages"
277
+ ]
278
+ }
279
+ },
280
+ "required": [
281
+ "selector"
282
+ ],
283
+ "description": "Defines the items selector and optional enhancement settings."
284
+ },
285
+ "enclosure": {
286
+ "$schema": "http://json-schema.org/draft-06/schema#",
287
+ "type": "object",
288
+ "properties": {
289
+ "extractor": {
290
+ "type": "string",
291
+ "minLength": 1
292
+ },
293
+ "attribute": {
294
+ "type": "string",
295
+ "minLength": 1
296
+ },
297
+ "static": {
298
+ "type": "string",
299
+ "minLength": 1
300
+ },
301
+ "post_process": {
302
+ "type": "array",
303
+ "items": {
304
+ "type": "object"
305
+ }
306
+ },
307
+ "content_type": {
308
+ "type": "string",
309
+ "minLength": 1
310
+ }
311
+ },
312
+ "required": [],
313
+ "description": "Describes enclosure extraction settings."
314
+ },
315
+ "guid": {
316
+ "type": "array",
317
+ "description": "List of selector keys used to build the GUID. Each entry must reference a sibling selector key; runtime validation enforces those references.",
318
+ "minItems": 1,
319
+ "items": {
320
+ "type": "string",
321
+ "description": "Selector key defined elsewhere in this object."
322
+ }
323
+ },
324
+ "categories": {
325
+ "type": "array",
326
+ "description": "List of selector keys whose values will be used as categories. Each entry must reference a sibling selector key; runtime validation enforces those references.",
327
+ "minItems": 1,
328
+ "items": {
329
+ "type": "string",
330
+ "description": "Selector key defined elsewhere in this object."
331
+ }
332
+ }
333
+ },
334
+ "patternProperties": {
335
+ "^(?!items$|enclosure$|guid$|categories$).+$": {
336
+ "$schema": "http://json-schema.org/draft-06/schema#",
337
+ "type": "object",
338
+ "properties": {
339
+ "extractor": {
340
+ "type": "string",
341
+ "minLength": 1
342
+ },
343
+ "attribute": {
344
+ "type": "string",
345
+ "minLength": 1
346
+ },
347
+ "static": {
348
+ "type": "string",
349
+ "minLength": 1
350
+ },
351
+ "post_process": {
352
+ "type": "array",
353
+ "items": {
354
+ "type": "object"
355
+ }
356
+ }
357
+ },
358
+ "required": [],
359
+ "description": "Dynamic selector definition keyed by attribute name."
360
+ }
361
+ },
362
+ "additionalProperties": true
363
+ },
364
+ "request": {
365
+ "type": "object",
366
+ "properties": {
367
+ "max_redirects": {
368
+ "type": "integer",
369
+ "not": {
370
+ "type": "null"
371
+ },
372
+ "minimum": 0
373
+ },
374
+ "max_requests": {
375
+ "type": "integer",
376
+ "not": {
377
+ "type": "null"
378
+ },
379
+ "exclusiveMinimum": 0
380
+ },
381
+ "browserless": {
382
+ "type": "object",
383
+ "properties": {
384
+ "preload": {
385
+ "type": "object",
386
+ "properties": {
387
+ "wait_after_ms": {
388
+ "type": "integer",
389
+ "not": {
390
+ "type": "null"
391
+ },
392
+ "minimum": 0
393
+ },
394
+ "click_selectors": {
395
+ "type": "array",
396
+ "items": {
397
+ "type": "object",
398
+ "properties": {
399
+ "selector": {
400
+ "type": "string",
401
+ "minLength": 1
402
+ },
403
+ "max_clicks": {
404
+ "type": "integer",
405
+ "not": {
406
+ "type": "null"
407
+ },
408
+ "exclusiveMinimum": 0
409
+ },
410
+ "wait_after_ms": {
411
+ "type": "integer",
412
+ "not": {
413
+ "type": "null"
414
+ },
415
+ "minimum": 0
416
+ }
417
+ },
418
+ "required": [
419
+ "selector"
420
+ ]
421
+ }
422
+ },
423
+ "scroll_down": {
424
+ "type": "object",
425
+ "properties": {
426
+ "iterations": {
427
+ "type": "integer",
428
+ "not": {
429
+ "type": "null"
430
+ },
431
+ "exclusiveMinimum": 0
432
+ },
433
+ "wait_after_ms": {
434
+ "type": "integer",
435
+ "not": {
436
+ "type": "null"
437
+ },
438
+ "minimum": 0
439
+ }
440
+ },
441
+ "required": []
442
+ }
443
+ },
444
+ "required": []
445
+ }
446
+ },
447
+ "required": []
448
+ }
449
+ },
450
+ "required": []
451
+ }
452
+ },
453
+ "required": [
454
+ "strategy",
455
+ "channel"
456
+ ],
457
+ "anyOf": [
458
+ {
459
+ "required": [
460
+ "selectors"
461
+ ]
462
+ },
463
+ {
464
+ "required": [
465
+ "auto_source"
466
+ ]
467
+ }
468
+ ]
469
+ }