html2rss 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +53 -2
  3. data/Gemfile.lock +10 -7
  4. data/docs/Html2rss.html +3 -3
  5. data/docs/Html2rss/AttributePostProcessors.html +6 -6
  6. data/docs/Html2rss/AttributePostProcessors/HtmlToMarkdown.html +325 -0
  7. data/docs/Html2rss/AttributePostProcessors/ParseTime.html +12 -18
  8. data/docs/Html2rss/AttributePostProcessors/ParseUri.html +4 -4
  9. data/docs/Html2rss/AttributePostProcessors/SanitizeHtml.html +38 -17
  10. data/docs/Html2rss/AttributePostProcessors/Substring.html +5 -5
  11. data/docs/Html2rss/AttributePostProcessors/Template.html +6 -6
  12. data/docs/Html2rss/Config.html +75 -23
  13. data/docs/Html2rss/FeedBuilder.html +1 -1
  14. data/docs/Html2rss/Item.html +14 -8
  15. data/docs/Html2rss/ItemExtractors.html +1 -1
  16. data/docs/Html2rss/ItemExtractors/Attribute.html +1 -1
  17. data/docs/Html2rss/ItemExtractors/CurrentTime.html +1 -1
  18. data/docs/Html2rss/ItemExtractors/Href.html +2 -2
  19. data/docs/Html2rss/ItemExtractors/Html.html +1 -1
  20. data/docs/Html2rss/ItemExtractors/Static.html +1 -1
  21. data/docs/Html2rss/ItemExtractors/Text.html +1 -1
  22. data/docs/Html2rss/Utils.html +86 -1
  23. data/docs/Html2rss/Utils/IndifferentAccessHash.html +1 -1
  24. data/docs/_index.html +8 -1
  25. data/docs/class_list.html +1 -1
  26. data/docs/file.README.html +1 -1
  27. data/docs/index.html +1 -1
  28. data/docs/method_list.html +69 -37
  29. data/docs/top-level-namespace.html +1 -1
  30. data/html2rss.gemspec +1 -0
  31. data/lib/html2rss/attribute_post_processors.rb +1 -0
  32. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +39 -0
  33. data/lib/html2rss/utils.rb +3 -4
  34. data/lib/html2rss/version.rb +1 -1
  35. metadata +19 -5
  36. data/.changelogrc +0 -19
@@ -100,7 +100,7 @@
100
100
  </div>
101
101
 
102
102
  <div id="footer">
103
- Generated on Sun Jul 14 19:35:05 2019 by
103
+ Generated on Wed Sep 18 12:52:16 2019 by
104
104
  <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
105
105
  0.9.20 (ruby-2.6.3).
106
106
  </div>
@@ -32,6 +32,7 @@ Gem::Specification.new do |spec|
32
32
  spec.add_dependency 'faraday_middleware', '~> 0.13'
33
33
  spec.add_dependency 'hashie', '~> 3.6'
34
34
  spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
35
+ spec.add_dependency 'reverse_markdown', '~> 1.3'
35
36
  spec.add_dependency 'sanitize', '~> 5.0'
36
37
  spec.add_development_dependency 'bundler', '~> 1.16'
37
38
  spec.add_development_dependency 'byebug'
@@ -1,3 +1,4 @@
1
+ require_relative 'attribute_post_processors/html_to_markdown'
1
2
  require_relative 'attribute_post_processors/parse_time'
2
3
  require_relative 'attribute_post_processors/parse_uri'
3
4
  require_relative 'attribute_post_processors/sanitize_html'
@@ -0,0 +1,39 @@
1
+ require 'reverse_markdown'
2
+
3
+ module Html2rss
4
+ module AttributePostProcessors
5
+ ##
6
+ # Returns HTML code as a Markdown-formatted String.
7
+ # Before converting to markdown, the HTML is sanitized with SanitizeHtml.
8
+ # Imagine this HTML structure:
9
+ #
10
+ # <section>
11
+ # Lorem <b>ipsum</b> dolor...
12
+ # <iframe src="https://evil.corp/miner"></iframe>
13
+ # <script>alert();</script>
14
+ # </section>
15
+ #
16
+ # YAML usage example:
17
+ #
18
+ # selectors:
19
+ # description:
20
+ # selector: section
21
+ # extractor: html
22
+ # post_process:
23
+ # name: html_to_markdown
24
+ #
25
+ # Would return:
26
+ # 'Lorem **ipsum** dolor'
27
+ class HtmlToMarkdown
28
+ def initialize(value, env)
29
+ @value = SanitizeHtml.new(value, env).get
30
+ end
31
+
32
+ ##
33
+ # @return [String] formatted in Markdown
34
+ def get
35
+ ReverseMarkdown.convert @value
36
+ end
37
+ end
38
+ end
39
+ end
@@ -14,11 +14,10 @@ module Html2rss
14
14
 
15
15
  return url if url.absolute?
16
16
 
17
- path, query = url.to_s.split('?')
18
-
19
17
  URI(channel_url).tap do |uri|
20
- uri.path = path.to_s.start_with?('/') ? path : "/#{path}"
21
- uri.query = query
18
+ uri.path = url.path.to_s.start_with?('/') ? url.path : "/#{url.path}"
19
+ uri.query = url.query
20
+ uri.fragment = url.fragment if url.fragment
22
21
  end
23
22
  end
24
23
  end
@@ -1,3 +1,3 @@
1
1
  module Html2rss
2
- VERSION = '0.4.0'.freeze
2
+ VERSION = '0.4.1'.freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-09-07 00:00:00.000000000 Z
11
+ date: 2019-09-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -86,6 +86,20 @@ dependencies:
86
86
  - - "<"
87
87
  - !ruby/object:Gem::Version
88
88
  version: '2.0'
89
+ - !ruby/object:Gem::Dependency
90
+ name: reverse_markdown
91
+ requirement: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - "~>"
94
+ - !ruby/object:Gem::Version
95
+ version: '1.3'
96
+ type: :runtime
97
+ prerelease: false
98
+ version_requirements: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - "~>"
101
+ - !ruby/object:Gem::Version
102
+ version: '1.3'
89
103
  - !ruby/object:Gem::Dependency
90
104
  name: sanitize
91
105
  requirement: !ruby/object:Gem::Requirement
@@ -234,7 +248,6 @@ executables: []
234
248
  extensions: []
235
249
  extra_rdoc_files: []
236
250
  files:
237
- - ".changelogrc"
238
251
  - ".gitignore"
239
252
  - ".rspec"
240
253
  - ".rubocop.yml"
@@ -249,6 +262,7 @@ files:
249
262
  - bin/setup
250
263
  - docs/Html2rss.html
251
264
  - docs/Html2rss/AttributePostProcessors.html
265
+ - docs/Html2rss/AttributePostProcessors/HtmlToMarkdown.html
252
266
  - docs/Html2rss/AttributePostProcessors/ParseTime.html
253
267
  - docs/Html2rss/AttributePostProcessors/ParseUri.html
254
268
  - docs/Html2rss/AttributePostProcessors/SanitizeHtml.html
@@ -283,6 +297,7 @@ files:
283
297
  - html2rss.gemspec
284
298
  - lib/html2rss.rb
285
299
  - lib/html2rss/attribute_post_processors.rb
300
+ - lib/html2rss/attribute_post_processors/html_to_markdown.rb
286
301
  - lib/html2rss/attribute_post_processors/parse_time.rb
287
302
  - lib/html2rss/attribute_post_processors/parse_uri.rb
288
303
  - lib/html2rss/attribute_post_processors/sanitize_html.rb
@@ -321,8 +336,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
321
336
  - !ruby/object:Gem::Version
322
337
  version: '0'
323
338
  requirements: []
324
- rubyforge_project:
325
- rubygems_version: 2.7.7
339
+ rubygems_version: 3.0.6
326
340
  signing_key:
327
341
  specification_version: 4
328
342
  summary: Returns an RSS::Rss object by scraping a URL.
@@ -1,19 +0,0 @@
1
- {
2
- "app_name": "html2rss",
3
- "logo": "https://github.com/gildesmarais/html2rss/raw/master/support/logo.png",
4
- "intro": "Generate RSS feeds by scraping websites by providing a config.",
5
- "debug": "true",
6
- "template": "support/changelog.md",
7
- "sections": [
8
- { "title": "Bugfixes", "grep": "^fix" },
9
- { "title": "Features", "grep": "^feat" },
10
- { "title": "Documentation", "grep": "^docs" },
11
- { "title": "Breaking changes", "grep": "BREAKING" },
12
- { "title": "Refactorings", "grep": "^refactor" },
13
- { "title": "Code style", "grep": "^style" },
14
- { "title": "Test", "grep": "^spec" },
15
- { "title": "Chore", "grep": "^chore" },
16
- { "title": "Branches merged", "grep": "^Merge branch" },
17
- { "title": "Pull requests merged", "grep": "^Merge pull request" }
18
- ]
19
- }