html2rss 0.17.0 → 0.18.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (122)
  1. checksums.yaml +4 -4
  2. data/README.md +48 -656
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +49 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +33 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +118 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +12 -8
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
  20. data/lib/html2rss/auto_source/scraper.rb +142 -8
  21. data/lib/html2rss/auto_source.rb +119 -47
  22. data/lib/html2rss/blocked_surface.rb +64 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +170 -23
  25. data/lib/html2rss/config/class_methods.rb +189 -0
  26. data/lib/html2rss/config/dynamic_params.rb +68 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
  28. data/lib/html2rss/config/request_headers.rb +130 -0
  29. data/lib/html2rss/config/schema.rb +208 -0
  30. data/lib/html2rss/config/validator.rb +108 -0
  31. data/lib/html2rss/config.rb +112 -61
  32. data/lib/html2rss/error.rb +6 -0
  33. data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
  34. data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
  35. data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
  36. data/lib/html2rss/html_extractor.rb +136 -0
  37. data/lib/html2rss/html_navigator.rb +46 -0
  38. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  39. data/lib/html2rss/json_feed_builder.rb +58 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +31 -0
  41. data/lib/html2rss/rendering/description_builder.rb +88 -0
  42. data/lib/html2rss/rendering/image_renderer.rb +31 -0
  43. data/lib/html2rss/rendering/media_renderer.rb +33 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
  45. data/lib/html2rss/rendering/video_renderer.rb +31 -0
  46. data/lib/html2rss/rendering.rb +14 -0
  47. data/lib/html2rss/request_controls.rb +128 -0
  48. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  49. data/lib/html2rss/request_service/budget.rb +39 -0
  50. data/lib/html2rss/request_service/context.rb +64 -20
  51. data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
  52. data/lib/html2rss/request_service/policy.rb +248 -0
  53. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  54. data/lib/html2rss/request_service/response.rb +42 -2
  55. data/lib/html2rss/request_service/response_guard.rb +62 -0
  56. data/lib/html2rss/request_service.rb +31 -15
  57. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  58. data/lib/html2rss/request_session/runtime_input.rb +57 -0
  59. data/lib/html2rss/request_session/runtime_policy.rb +76 -0
  60. data/lib/html2rss/request_session.rb +118 -0
  61. data/lib/html2rss/rss_builder/article.rb +166 -0
  62. data/lib/html2rss/rss_builder/channel.rb +96 -11
  63. data/lib/html2rss/rss_builder/enclosure.rb +48 -0
  64. data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
  65. data/lib/html2rss/rss_builder.rb +72 -71
  66. data/lib/html2rss/selectors/config.rb +122 -0
  67. data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
  68. data/lib/html2rss/selectors/extractors/href.rb +53 -0
  69. data/lib/html2rss/selectors/extractors/html.rb +48 -0
  70. data/lib/html2rss/selectors/extractors/static.rb +41 -0
  71. data/lib/html2rss/selectors/extractors/text.rb +46 -0
  72. data/lib/html2rss/selectors/extractors.rb +52 -0
  73. data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
  74. data/lib/html2rss/selectors/post_processors/base.rb +74 -0
  75. data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
  76. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
  77. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
  78. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
  79. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
  80. data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
  81. data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
  82. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
  83. data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
  84. data/lib/html2rss/selectors/post_processors/template.rb +73 -0
  85. data/lib/html2rss/selectors/post_processors.rb +43 -0
  86. data/lib/html2rss/selectors.rb +294 -0
  87. data/lib/html2rss/url.rb +262 -0
  88. data/lib/html2rss/version.rb +1 -1
  89. data/lib/html2rss.rb +129 -70
  90. data/lib/tasks/config_schema.rake +17 -0
  91. data/schema/html2rss-config.schema.json +469 -0
  92. metadata +115 -38
  93. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  94. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  95. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  96. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  97. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  98. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  99. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  100. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  101. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  102. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  103. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  104. data/lib/html2rss/attribute_post_processors.rb +0 -44
  105. data/lib/html2rss/auto_source/article.rb +0 -127
  106. data/lib/html2rss/auto_source/channel.rb +0 -78
  107. data/lib/html2rss/auto_source/reducer.rb +0 -48
  108. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  109. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  110. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  111. data/lib/html2rss/config/channel.rb +0 -125
  112. data/lib/html2rss/config/selectors.rb +0 -103
  113. data/lib/html2rss/item.rb +0 -186
  114. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  115. data/lib/html2rss/item_extractors/href.rb +0 -52
  116. data/lib/html2rss/item_extractors/html.rb +0 -46
  117. data/lib/html2rss/item_extractors/static.rb +0 -39
  118. data/lib/html2rss/item_extractors/text.rb +0 -44
  119. data/lib/html2rss/item_extractors.rb +0 -88
  120. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  121. data/lib/html2rss/rss_builder/item.rb +0 -83
  122. data/lib/html2rss/utils.rb +0 -113
@@ -1,83 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'mime/types'
4
-
5
- module Html2rss
6
- module RssBuilder
7
- ##
8
- # Builds an <item> tag (with the provided maker).
9
- class Item
10
- # Tags which should be processed every time and require non-trivial assignments/treatments.
11
- SPECIAL_TAGS = %i[categories enclosure guid].freeze
12
-
13
- ##
14
- # Adds the item to the Item Maker
15
- #
16
- # @param maker [RSS::Maker::RSS20::Items::Item]
17
- # @param item [Html2rss::Item]
18
- # @param tags [Set<Symbol>]
19
- # @return nil
20
- def self.add(maker, item, tags)
21
- tags.each do |tag|
22
- next if SPECIAL_TAGS.include?(tag)
23
-
24
- maker.public_send(:"#{tag}=", item.public_send(tag))
25
- end
26
-
27
- SPECIAL_TAGS.each do |tag|
28
- send(:"add_#{tag}", item, maker)
29
- end
30
- end
31
-
32
- ##
33
- # Adds the <category> tags, if there should be any.
34
- #
35
- # @param item [Html2rss::Item]
36
- # @param maker [RSS::Maker::RSS20::Items::Item]
37
- # @return nil
38
- def self.add_categories(item, maker)
39
- item.categories.each { |category| maker.categories.new_category.content = category }
40
- end
41
- private_class_method :add_categories
42
-
43
- ##
44
- # Adds an enclosure, if there should be one.
45
- #
46
- # @param item [Html2rss::Item]
47
- # @param maker [RSS::Maker::RSS20::Items::Item]
48
- # @return nil
49
- def self.add_enclosure(item, maker)
50
- return unless item.enclosure?
51
-
52
- set_enclosure_attributes(item.enclosure, maker.enclosure)
53
- end
54
- private_class_method :add_enclosure
55
-
56
- ##
57
- # Sets the attributes of an RSS enclosure.
58
- #
59
- # @param item_enclosure [Html2rss::Enclosure]
60
- # @param rss_enclosure [RSS::Maker::RSS20::Items::Enclosure]
61
- # @return nil
62
- def self.set_enclosure_attributes(item_enclosure, rss_enclosure)
63
- rss_enclosure.type = item_enclosure.type
64
- rss_enclosure.length = item_enclosure.bits_length
65
- rss_enclosure.url = item_enclosure.url
66
- end
67
- private_class_method :set_enclosure_attributes
68
-
69
- ##
70
- # Adds a non-permalink GUID to the item.
71
- #
72
- # @param item [Html2rss::Item]
73
- # @param maker [RSS::Maker::RSS20::Items::Item]
74
- # @return nil
75
- def self.add_guid(item, maker)
76
- guid = maker.guid
77
- guid.content = item.guid
78
- guid.isPermaLink = false
79
- end
80
- private_class_method :add_guid
81
- end
82
- end
83
- end
@@ -1,113 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'addressable/uri'
4
- require 'json'
5
- require 'regexp_parser'
6
- require 'tzinfo'
7
- require 'mime/types'
8
- require_relative 'object_to_xml_converter'
9
-
10
- module Html2rss
11
- ##
12
- # The collecting tank for utility methods.
13
- module Utils
14
- ##
15
- # @param url [String, Addressable::URI]
16
- # @param base_url [String, Addressable::URI]
17
- # @return [Addressable::URI]
18
- def self.build_absolute_url_from_relative(url, base_url)
19
- url = Addressable::URI.parse(url)
20
- return url if url.absolute?
21
-
22
- base_uri = Addressable::URI.parse(base_url)
23
- base_uri.path = '/' if base_uri.path.empty?
24
-
25
- base_uri.join(url).normalize
26
- end
27
-
28
- ##
29
- # Removes any space, parses and normalizes the given url.
30
- # @param url [String]
31
- # @return [Addressable::URI, nil] normalized URL, or nil if input is empty
32
- def self.sanitize_url(url)
33
- url = url.to_s.gsub(/\s+/, ' ').strip
34
- return if url.empty?
35
-
36
- Addressable::URI.parse(url).normalize
37
- end
38
-
39
- ##
40
- # Allows override of time zone locally inside supplied block; resets previous time zone when done.
41
- #
42
- # @param time_zone [String]
43
- # @param default_time_zone [String]
44
- # @yield block to execute with the given time zone
45
- # @return [Object] whatever the given block returns
46
- def self.use_zone(time_zone, default_time_zone: Time.now.getlocal.zone)
47
- raise ArgumentError, 'a block is required' unless block_given?
48
-
49
- time_zone = TZInfo::Timezone.get(time_zone)
50
-
51
- prev_tz = ENV.fetch('TZ', default_time_zone)
52
- ENV['TZ'] = time_zone.name
53
- yield
54
- ensure
55
- ENV['TZ'] = prev_tz if prev_tz
56
- end
57
-
58
- ##
59
- # Builds a titleized representation of the URL with prefixed host.
60
- # @param url [Addressable::URI]
61
- # @return [String]
62
- def self.titleized_channel_url(url)
63
- nicer_path = CGI.unescapeURIComponent(url.path).split('/').reject(&:empty?)
64
- host = url.host
65
-
66
- nicer_path.any? ? "#{host}: #{nicer_path.map(&:capitalize).join(' ')}" : host
67
- end
68
-
69
- ##
70
- # Builds a titleized representation of the URL.
71
- # @param url [Addressable::URI]
72
- # @return [String]
73
- def self.titleized_url(url)
74
- return '' if url.path.empty?
75
-
76
- nicer_path = CGI.unescapeURIComponent(url.path)
77
- .split('/')
78
- .flat_map do |part|
79
- part.gsub(/[^a-zA-Z0-9\.]/, ' ').gsub(/\s+/, ' ').split
80
- end
81
-
82
- nicer_path.map!(&:capitalize)
83
- File.basename nicer_path.join(' '), '.*'
84
- end
85
-
86
- ##
87
- # Parses the given String and builds a Regexp out of it.
88
- #
89
- # It will remove one pair of surrounding slashes ('/') from the String
90
- # to maintain backwards compatibility before building the Regexp.
91
- #
92
- # @param string [String]
93
- # @return [Regexp]
94
- def self.build_regexp_from_string(string)
95
- raise ArgumentError, 'must be a string!' unless string.is_a?(String)
96
-
97
- string = string[1..-2] if string.start_with?('/') && string.end_with?('/')
98
- Regexp::Parser.parse(string, options: ::Regexp::EXTENDED | ::Regexp::IGNORECASE).to_re
99
- end
100
-
101
- ##
102
- # Guesses the content type based on the file extension of the URL.
103
- #
104
- # @param url [Addressable::URI]
105
- # @return [String] guessed content type, defaults to 'application/octet-stream'
106
- def self.guess_content_type_from_url(url)
107
- url = url.path.split('?').first
108
-
109
- content_type = MIME::Types.type_for(File.extname(url).delete('.'))
110
- content_type.first&.to_s || 'application/octet-stream'
111
- end
112
- end
113
- end