html2rss 0.19.1 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/html2rss/auto_source/scraper/html.rb +48 -56
- data/lib/html2rss/auto_source/scraper/link_heuristics.rb +447 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +6 -161
- data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb +102 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +172 -30
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +1 -1
- data/lib/html2rss/config/class_methods.rb +2 -2
- data/lib/html2rss/config/request_headers.rb +18 -9
- data/lib/html2rss/configuration.rb +176 -0
- data/lib/html2rss/html_extractor/list_candidates.rb +94 -0
- data/lib/html2rss/html_extractor/semantic_anchor_candidates.rb +257 -0
- data/lib/html2rss/html_extractor/semantic_containers.rb +70 -0
- data/lib/html2rss/html_extractor.rb +11 -0
- data/lib/html2rss/rss_builder/channel.rb +10 -7
- data/lib/html2rss/url.rb +2 -0
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +54 -5
- metadata +9 -3
data/lib/html2rss.rb
CHANGED
|
@@ -7,18 +7,21 @@ loader.inflector.inflect('cli' => 'CLI')
|
|
|
7
7
|
loader.setup
|
|
8
8
|
|
|
9
9
|
require 'logger'
|
|
10
|
+
require 'forwardable'
|
|
11
|
+
require 'html2rss/configuration'
|
|
10
12
|
|
|
11
13
|
##
|
|
12
14
|
# The Html2rss namespace.
|
|
13
15
|
module Html2rss
|
|
14
16
|
##
|
|
15
17
|
# The logger instance.
|
|
16
|
-
Log
|
|
18
|
+
module Log
|
|
19
|
+
class << self
|
|
20
|
+
extend Forwardable
|
|
17
21
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
"#{datetime} [#{severity}] #{msg}\n"
|
|
22
|
+
def_delegator 'Html2rss', :logger
|
|
23
|
+
def_delegators :logger, :debug, :info, :warn, :error, :fatal, :unknown, :level, :level=, :formatter, :formatter=
|
|
24
|
+
end
|
|
22
25
|
end
|
|
23
26
|
|
|
24
27
|
##
|
|
@@ -75,6 +78,50 @@ module Html2rss
|
|
|
75
78
|
json_feed(build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:))
|
|
76
79
|
end
|
|
77
80
|
|
|
81
|
+
# rubocop:disable ThreadSafety/ClassInstanceVariable
|
|
82
|
+
class << self
|
|
83
|
+
##
|
|
84
|
+
# @return [Html2rss::Configuration] the global configuration instance
|
|
85
|
+
def configuration
|
|
86
|
+
@configuration ||= Configuration.new.freeze
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
##
|
|
90
|
+
# Configures global library defaults.
|
|
91
|
+
#
|
|
92
|
+
# @yieldparam config [Html2rss::Configuration]
|
|
93
|
+
# @return [Html2rss::Configuration] the frozen configuration
|
|
94
|
+
def configure
|
|
95
|
+
config = configuration.dup
|
|
96
|
+
yield config
|
|
97
|
+
@configuration = config.freeze
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
##
|
|
101
|
+
# @return [Object] the logger
|
|
102
|
+
def logger
|
|
103
|
+
configuration.logger
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
##
|
|
107
|
+
# @param logger [Object] the new logger
|
|
108
|
+
def logger=(logger)
|
|
109
|
+
configure { |config| config.logger = logger }
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
private
|
|
113
|
+
|
|
114
|
+
##
|
|
115
|
+
# Resets the global configuration to defaults (mainly for testing).
|
|
116
|
+
#
|
|
117
|
+
# @return [void]
|
|
118
|
+
def reset_configuration!
|
|
119
|
+
@configuration = nil
|
|
120
|
+
logger.level = configuration.log_level if logger.respond_to?(:level=)
|
|
121
|
+
end
|
|
122
|
+
end
|
|
123
|
+
# rubocop:enable ThreadSafety/ClassInstanceVariable
|
|
124
|
+
|
|
78
125
|
class << self
|
|
79
126
|
private
|
|
80
127
|
|
|
@@ -103,6 +150,8 @@ module Html2rss
|
|
|
103
150
|
keys
|
|
104
151
|
end
|
|
105
152
|
end
|
|
153
|
+
|
|
154
|
+
logger.level = configuration.log_level if logger.respond_to?(:level=)
|
|
106
155
|
end
|
|
107
156
|
|
|
108
157
|
loader.eager_load
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html2rss
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.19.1
|
|
4
|
+
version: 0.20.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Gil Desmarais
|
|
@@ -293,6 +293,7 @@ files:
|
|
|
293
293
|
- lib/html2rss/auto_source/scraper.rb
|
|
294
294
|
- lib/html2rss/auto_source/scraper/html.rb
|
|
295
295
|
- lib/html2rss/auto_source/scraper/json_state.rb
|
|
296
|
+
- lib/html2rss/auto_source/scraper/link_heuristics.rb
|
|
296
297
|
- lib/html2rss/auto_source/scraper/microdata.rb
|
|
297
298
|
- lib/html2rss/auto_source/scraper/schema.rb
|
|
298
299
|
- lib/html2rss/auto_source/scraper/schema/category_extractor.rb
|
|
@@ -301,6 +302,7 @@ files:
|
|
|
301
302
|
- lib/html2rss/auto_source/scraper/schema/thing.rb
|
|
302
303
|
- lib/html2rss/auto_source/scraper/semantic_html.rb
|
|
303
304
|
- lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb
|
|
305
|
+
- lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb
|
|
304
306
|
- lib/html2rss/auto_source/scraper/wordpress_api.rb
|
|
305
307
|
- lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb
|
|
306
308
|
- lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb
|
|
@@ -314,6 +316,7 @@ files:
|
|
|
314
316
|
- lib/html2rss/config/request_headers.rb
|
|
315
317
|
- lib/html2rss/config/schema.rb
|
|
316
318
|
- lib/html2rss/config/validator.rb
|
|
319
|
+
- lib/html2rss/configuration.rb
|
|
317
320
|
- lib/html2rss/error.rb
|
|
318
321
|
- lib/html2rss/feed_pipeline.rb
|
|
319
322
|
- lib/html2rss/feed_pipeline/auto_fallback.rb
|
|
@@ -322,6 +325,9 @@ files:
|
|
|
322
325
|
- lib/html2rss/html_extractor/date_extractor.rb
|
|
323
326
|
- lib/html2rss/html_extractor/enclosure_extractor.rb
|
|
324
327
|
- lib/html2rss/html_extractor/image_extractor.rb
|
|
328
|
+
- lib/html2rss/html_extractor/list_candidates.rb
|
|
329
|
+
- lib/html2rss/html_extractor/semantic_anchor_candidates.rb
|
|
330
|
+
- lib/html2rss/html_extractor/semantic_containers.rb
|
|
325
331
|
- lib/html2rss/html_navigator.rb
|
|
326
332
|
- lib/html2rss/json_feed_builder.rb
|
|
327
333
|
- lib/html2rss/json_feed_builder/item.rb
|
|
@@ -384,7 +390,7 @@ licenses:
|
|
|
384
390
|
- MIT
|
|
385
391
|
metadata:
|
|
386
392
|
allowed_push_host: https://rubygems.org
|
|
387
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.19.1
|
|
393
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.20.0
|
|
388
394
|
rubygems_mfa_required: 'true'
|
|
389
395
|
rdoc_options: []
|
|
390
396
|
require_paths:
|
|
@@ -400,7 +406,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
400
406
|
- !ruby/object:Gem::Version
|
|
401
407
|
version: '0'
|
|
402
408
|
requirements: []
|
|
403
|
-
rubygems_version: 4.0.
|
|
409
|
+
rubygems_version: 4.0.10
|
|
404
410
|
specification_version: 4
|
|
405
411
|
summary: Generates RSS feeds from websites by scraping a URL and using CSS selectors
|
|
406
412
|
to extract item.
|