html2rss 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +53 -2
- data/Gemfile.lock +10 -7
- data/docs/Html2rss.html +3 -3
- data/docs/Html2rss/AttributePostProcessors.html +6 -6
- data/docs/Html2rss/AttributePostProcessors/HtmlToMarkdown.html +325 -0
- data/docs/Html2rss/AttributePostProcessors/ParseTime.html +12 -18
- data/docs/Html2rss/AttributePostProcessors/ParseUri.html +4 -4
- data/docs/Html2rss/AttributePostProcessors/SanitizeHtml.html +38 -17
- data/docs/Html2rss/AttributePostProcessors/Substring.html +5 -5
- data/docs/Html2rss/AttributePostProcessors/Template.html +6 -6
- data/docs/Html2rss/Config.html +75 -23
- data/docs/Html2rss/FeedBuilder.html +1 -1
- data/docs/Html2rss/Item.html +14 -8
- data/docs/Html2rss/ItemExtractors.html +1 -1
- data/docs/Html2rss/ItemExtractors/Attribute.html +1 -1
- data/docs/Html2rss/ItemExtractors/CurrentTime.html +1 -1
- data/docs/Html2rss/ItemExtractors/Href.html +2 -2
- data/docs/Html2rss/ItemExtractors/Html.html +1 -1
- data/docs/Html2rss/ItemExtractors/Static.html +1 -1
- data/docs/Html2rss/ItemExtractors/Text.html +1 -1
- data/docs/Html2rss/Utils.html +86 -1
- data/docs/Html2rss/Utils/IndifferentAccessHash.html +1 -1
- data/docs/_index.html +8 -1
- data/docs/class_list.html +1 -1
- data/docs/file.README.html +1 -1
- data/docs/index.html +1 -1
- data/docs/method_list.html +69 -37
- data/docs/top-level-namespace.html +1 -1
- data/html2rss.gemspec +1 -0
- data/lib/html2rss/attribute_post_processors.rb +1 -0
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +39 -0
- data/lib/html2rss/utils.rb +3 -4
- data/lib/html2rss/version.rb +1 -1
- metadata +19 -5
- data/.changelogrc +0 -19
@@ -100,7 +100,7 @@
|
|
100
100
|
</div>
|
101
101
|
|
102
102
|
<div id="footer">
|
103
|
-
Generated on
|
103
|
+
Generated on Wed Sep 18 12:52:16 2019 by
|
104
104
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
105
105
|
0.9.20 (ruby-2.6.3).
|
106
106
|
</div>
|
data/html2rss.gemspec
CHANGED
@@ -32,6 +32,7 @@ Gem::Specification.new do |spec|
|
|
32
32
|
spec.add_dependency 'faraday_middleware', '~> 0.13'
|
33
33
|
spec.add_dependency 'hashie', '~> 3.6'
|
34
34
|
spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
|
35
|
+
spec.add_dependency 'reverse_markdown', '~> 1.3'
|
35
36
|
spec.add_dependency 'sanitize', '~> 5.0'
|
36
37
|
spec.add_development_dependency 'bundler', '~> 1.16'
|
37
38
|
spec.add_development_dependency 'byebug'
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'reverse_markdown'
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
module AttributePostProcessors
|
5
|
+
##
|
6
|
+
# Returns HTML code as Markdown formatted String.
|
7
|
+
# Before converting to markdown, the HTML is sanitized with SanitizeHtml.
|
8
|
+
# Imagine this HTML structure:
|
9
|
+
#
|
10
|
+
# <section>
|
11
|
+
# Lorem <b>ipsum</b> dolor...
|
12
|
+
# <iframe src="https://evil.corp/miner"></iframe>
|
13
|
+
# <script>alert();</script>
|
14
|
+
# </section>
|
15
|
+
#
|
16
|
+
# YAML usage example:
|
17
|
+
#
|
18
|
+
# selectors:
|
19
|
+
# description:
|
20
|
+
# selector: section
|
21
|
+
# extractor: html
|
22
|
+
# post_process:
|
23
|
+
# name: html_to_markdown
|
24
|
+
#
|
25
|
+
# Would return:
|
26
|
+
# 'Lorem **ipsum** dolor'
|
27
|
+
class HtmlToMarkdown
|
28
|
+
def initialize(value, env)
|
29
|
+
@value = SanitizeHtml.new(value, env).get
|
30
|
+
end
|
31
|
+
|
32
|
+
##
|
33
|
+
# @return [String] formatted in Markdown
|
34
|
+
def get
|
35
|
+
ReverseMarkdown.convert @value
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
data/lib/html2rss/utils.rb
CHANGED
@@ -14,11 +14,10 @@ module Html2rss
|
|
14
14
|
|
15
15
|
return url if url.absolute?
|
16
16
|
|
17
|
-
path, query = url.to_s.split('?')
|
18
|
-
|
19
17
|
URI(channel_url).tap do |uri|
|
20
|
-
uri.path = path.to_s.start_with?('/') ? path : "/#{path}"
|
21
|
-
uri.query = query
|
18
|
+
uri.path = url.path.to_s.start_with?('/') ? url.path : "/#{url.path}"
|
19
|
+
uri.query = url.query
|
20
|
+
uri.fragment = url.fragment if url.fragment
|
22
21
|
end
|
23
22
|
end
|
24
23
|
end
|
data/lib/html2rss/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-09-
|
11
|
+
date: 2019-09-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -86,6 +86,20 @@ dependencies:
|
|
86
86
|
- - "<"
|
87
87
|
- !ruby/object:Gem::Version
|
88
88
|
version: '2.0'
|
89
|
+
- !ruby/object:Gem::Dependency
|
90
|
+
name: reverse_markdown
|
91
|
+
requirement: !ruby/object:Gem::Requirement
|
92
|
+
requirements:
|
93
|
+
- - "~>"
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: '1.3'
|
96
|
+
type: :runtime
|
97
|
+
prerelease: false
|
98
|
+
version_requirements: !ruby/object:Gem::Requirement
|
99
|
+
requirements:
|
100
|
+
- - "~>"
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: '1.3'
|
89
103
|
- !ruby/object:Gem::Dependency
|
90
104
|
name: sanitize
|
91
105
|
requirement: !ruby/object:Gem::Requirement
|
@@ -234,7 +248,6 @@ executables: []
|
|
234
248
|
extensions: []
|
235
249
|
extra_rdoc_files: []
|
236
250
|
files:
|
237
|
-
- ".changelogrc"
|
238
251
|
- ".gitignore"
|
239
252
|
- ".rspec"
|
240
253
|
- ".rubocop.yml"
|
@@ -249,6 +262,7 @@ files:
|
|
249
262
|
- bin/setup
|
250
263
|
- docs/Html2rss.html
|
251
264
|
- docs/Html2rss/AttributePostProcessors.html
|
265
|
+
- docs/Html2rss/AttributePostProcessors/HtmlToMarkdown.html
|
252
266
|
- docs/Html2rss/AttributePostProcessors/ParseTime.html
|
253
267
|
- docs/Html2rss/AttributePostProcessors/ParseUri.html
|
254
268
|
- docs/Html2rss/AttributePostProcessors/SanitizeHtml.html
|
@@ -283,6 +297,7 @@ files:
|
|
283
297
|
- html2rss.gemspec
|
284
298
|
- lib/html2rss.rb
|
285
299
|
- lib/html2rss/attribute_post_processors.rb
|
300
|
+
- lib/html2rss/attribute_post_processors/html_to_markdown.rb
|
286
301
|
- lib/html2rss/attribute_post_processors/parse_time.rb
|
287
302
|
- lib/html2rss/attribute_post_processors/parse_uri.rb
|
288
303
|
- lib/html2rss/attribute_post_processors/sanitize_html.rb
|
@@ -321,8 +336,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
321
336
|
- !ruby/object:Gem::Version
|
322
337
|
version: '0'
|
323
338
|
requirements: []
|
324
|
-
|
325
|
-
rubygems_version: 2.7.7
|
339
|
+
rubygems_version: 3.0.6
|
326
340
|
signing_key:
|
327
341
|
specification_version: 4
|
328
342
|
summary: Returns an RSS::Rss object by scraping a URL.
|
data/.changelogrc
DELETED
@@ -1,19 +0,0 @@
|
|
1
|
-
{
|
2
|
-
"app_name": "html2rss",
|
3
|
-
"logo": "https://github.com/gildesmarais/html2rss/raw/master/support/logo.png",
|
4
|
-
"intro": "Generate RSS feeds by scraping websites by providing a config.",
|
5
|
-
"debug": "true",
|
6
|
-
"template": "support/changelog.md",
|
7
|
-
"sections": [
|
8
|
-
{ "title": "Bugfixes", "grep": "^fix" },
|
9
|
-
{ "title": "Features", "grep": "^feat" },
|
10
|
-
{ "title": "Documentation", "grep": "^docs" },
|
11
|
-
{ "title": "Breaking changes", "grep": "BREAKING" },
|
12
|
-
{ "title": "Refactorings", "grep": "^refactor" },
|
13
|
-
{ "title": "Code style", "grep": "^style" },
|
14
|
-
{ "title": "Test", "grep": "^spec" },
|
15
|
-
{ "title": "Chore", "grep": "^chore" },
|
16
|
-
{ "title": "Branches merged", "grep": "^Merge branch" },
|
17
|
-
{ "title": "Pull requests merged", "grep": "^Merge pull request" }
|
18
|
-
]
|
19
|
-
}
|