decontaminator 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1342e0fb22e699e0c0713c095b31231f2d161c46
4
+ data.tar.gz: a00919d401c57f5c0aae265842d6bfc8b7f946a2
5
+ SHA512:
6
+ metadata.gz: 9bd7cd77f541ec1176fce4b3e7ca3f00f6fc24c1bd6e813e06212694a1b2c62abba407dfa030883ecda07f41527054018ccd15bceff99b82dc84394ef2c5f802
7
+ data.tar.gz: dd07913b676ca10a1230158fdc2aeab321a3fbeff9ee436960b346a34135f4671e61ec2f03bf6da1fd84dc157524371a58bd5d44d3c56998e039068019854bae
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2015 Altmetric LLP
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,48 @@
1
+ # Decontaminator [![Build Status](https://travis-ci.org/altmetric/decontaminator.svg?branch=master)](https://travis-ci.org/altmetric/decontaminator)
2
+
3
+ Ruby HTML sanitizer based on the lightweight Oga parser.
4
+
5
+ **Current version:** 1.0.0
6
+
7
+ **Supported Ruby versions:** 1.9.3, 2.0, 2.1, 2.2, JRuby 1.7, and Rubinius 2.5
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ ```ruby
14
+ gem 'decontaminator'
15
+ ```
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install decontaminator
24
+
25
+ ## Usage
26
+
27
+ ```ruby
28
+ require 'decontaminator'
29
+
30
+ input = '<h1>Heading</h1><p>Lorem ipsum...</p><script>alert(1)</script>'
31
+ fragment = Decontaminator::Fragment.new(input)
32
+ puts fragment.decontaminate.inspect
33
+ # Prints: " Heading  Lorem ipsum... "
34
+ ```
35
+
36
+ ## Contributing
37
+
38
+ 1. Fork it ( https://github.com/altmetric/decontaminator/fork )
39
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
40
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
41
+ 4. Push to the branch (`git push origin my-new-feature`)
42
+ 5. Create a new Pull Request
43
+
44
+ ## License
45
+
46
+ Copyright © 2015 Altmetric LLP
47
+
48
+ Distributed under the [MIT license](https://github.com/altmetric/decontaminator/blob/master/LICENSE.txt).
@@ -0,0 +1 @@
1
+ require 'decontaminator/fragment'
@@ -0,0 +1,80 @@
1
+ require 'oga'
2
+
3
module Decontaminator
  # Reduces an HTML fragment to its plain-text content.
  #
  # The fragment is parsed with Oga and walked recursively: text nodes are
  # emitted as-is, elements are replaced by the text of their children, and
  # elements whose tag is blacklisted are dropped together with everything
  # inside them.
  class Fragment
    # Tags that are always dropped, contents included.
    NON_CONTENT_TAGS = %w(
      script
      style
    ).freeze

    # Tags that contribute one space before and one space after their
    # content, so that text from neighbouring elements does not run together.
    WHITESPACE_CONTENT_TAGS = %w(
      address
      article
      aside
      blockquote
      br
      dd
      div
      dl
      dt
      footer
      h1
      h2
      h3
      h4
      h5
      h6
      header
      hgroup
      hr
      li
      nav
      ol
      p
      pre
      section
      ul
    ).freeze

    # @param html_fragment [String] the HTML markup to sanitize
    def initialize(html_fragment)
      @html_fragment = html_fragment
    end

    # Returns the text content of the fragment.
    #
    # @param options [Hash] optional settings
    # @option options [Array<String, Symbol>] :blacklist additional tag names
    #   to drop (with their contents) on top of NON_CONTENT_TAGS
    # @return [String] the extracted text
    def decontaminate(options = {})
      # Array() tolerates a nil blacklist and to_s lets callers pass symbols;
      # node names are compared as strings.
      extra_tags = Array(options.fetch(:blacklist, [])).map(&:to_s)
      blacklisted_tags = NON_CONTENT_TAGS + extra_tags

      sanitize(Oga.parse_html(html_fragment).children, blacklisted_tags)
    end

    private

    attr_reader :html_fragment

    # Joins the text of every node worth keeping in +node_set+.
    #
    # Only text nodes and non-blacklisted elements are kept. Any other node
    # type (e.g. a comment) is skipped: such nodes do not respond to +name+,
    # so asking for it would raise NoMethodError.
    def sanitize(node_set, blacklisted_tags)
      node_set
        .select { |node| text?(node) || (element?(node) && !blacklisted_tags.include?(node.name)) }
        .flat_map { |node| [whitespace(node, :prefix), text(node, blacklisted_tags), whitespace(node, :suffix)] }
        .join
    end

    def text?(node)
      node.is_a?(Oga::XML::Text)
    end

    def element?(node)
      node.is_a?(Oga::XML::Element)
    end

    # Returns the padding for +node+: a single space for whitespace tags,
    # an empty string otherwise. The position (:prefix / :suffix) is accepted
    # for readability at the call site but does not affect the result.
    def whitespace(node, _position)
      if element?(node) && WHITESPACE_CONTENT_TAGS.include?(node.name)
        ' '
      else
        ''
      end
    end

    # Returns the text of a text node, or the sanitized text of an
    # element's children.
    def text(node, blacklisted_tags)
      if text?(node)
        node.text
      else
        sanitize(node.children, blacklisted_tags)
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Decontaminator
  # Version of the gem; frozen so the shared constant cannot be mutated
  # in place.
  VERSION = '1.0.0'.freeze
end
@@ -0,0 +1,49 @@
1
+ require 'decontaminator'
2
+
3
# Specs for Decontaminator::Fragment#decontaminate.
#
# Expected strings follow the implementation's padding rule: every tag in
# WHITESPACE_CONTENT_TAGS contributes one space before and one space after
# its content, so adjacent or nested tags produce runs of several spaces.
RSpec.describe Decontaminator::Fragment do
  describe '#decontaminate' do
    it 'sanitizes an empty string' do
      expect(described_class.new('').decontaminate).to eq('')
    end

    it 'sanitizes an empty paragraph' do
      # Prefix space + suffix space, nothing in between.
      expect(described_class.new('<p></p>').decontaminate).to eq('  ')
    end

    it 'sanitizes a paragraph' do
      expect(described_class.new('<p>Text</p>').decontaminate).to eq(' Text ')
    end

    it 'sanitizes a formatted paragraph' do
      expect(described_class.new('<p><b>Some</b> <i>text</i></p>').decontaminate).to eq(' Some text ')
    end

    it 'sanitizes a formatted paragraph with attributes' do
      expect(described_class.new('<p class="text"><b data-important>Some</b> <i>text</i></p>').decontaminate).to eq(' Some text ')
    end

    it 'sanitizes two formatted paragraphs' do
      # Suffix of the first paragraph + prefix of the second = two spaces.
      expect(described_class.new('<p>Paragraph one.</p><p>Paragraph two.</p>').decontaminate).to eq(' Paragraph one.  Paragraph two. ')
    end

    it 'sanitizes a link' do
      expect(described_class.new('<a href="#">link</a>').decontaminate).to eq('link')
    end

    it 'sanitizes a script' do
      # NOTE(review): the closing tag lacks its final '>' although the payload
      # says "well formatted" - confirm whether this is intentional.
      expect(described_class.new('<script>alert("I am evil but well formatted!");</script').decontaminate).to eq('')
    end

    it 'sanitizes a stylesheet' do
      expect(described_class.new('<style>a{color:red}</style>').decontaminate).to eq('')
    end

    it 'sanitizes multiple tags' do
      # div/section/p open (3 spaces), p/section close + aside/p open (4),
      # p/aside/div close (3).
      expect(described_class.new('<div><section><p>Section.</p></section><aside><p>Aside.</p></aside></div>').decontaminate).to eq('   Section.    Aside.   ')
    end

    it 'sanitizes content with blacklisted tags' do
      expect(described_class.new('<figcaption>Blacklist this</figcaption><p>but not that</p>').decontaminate(blacklist: %w(figcaption))).to eq(' but not that ')
    end
  end
end
metadata ADDED
@@ -0,0 +1,69 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: decontaminator
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Matthew MacLeod
8
+ - Paul Mucur
9
+ - Jakub Pawlowicz
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2015-09-16 00:00:00.000000000 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: oga
17
+ requirement: !ruby/object:Gem::Requirement
18
+ requirements:
19
+ - - "~>"
20
+ - !ruby/object:Gem::Version
21
+ version: '1.3'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ requirements:
26
+ - - "~>"
27
+ - !ruby/object:Gem::Version
28
+ version: '1.3'
29
+ description: Ruby HTML sanitizer based on a lightweight Oga parser.
30
+ email:
31
+ - matt@matt-m.co.uk
32
+ - mudge@mudge.name
33
+ - jakub@altmetric.com
34
+ executables: []
35
+ extensions: []
36
+ extra_rdoc_files: []
37
+ files:
38
+ - LICENSE.txt
39
+ - README.md
40
+ - lib/decontaminator.rb
41
+ - lib/decontaminator/fragment.rb
42
+ - lib/decontaminator/version.rb
43
+ - spec/decontaminator/fragment_spec.rb
44
+ homepage: https://github.com/altmetric/decontaminator
45
+ licenses:
46
+ - MIT
47
+ metadata: {}
48
+ post_install_message:
49
+ rdoc_options: []
50
+ require_paths:
51
+ - lib
52
+ required_ruby_version: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: '0'
57
+ required_rubygems_version: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ requirements: []
63
+ rubyforge_project:
64
+ rubygems_version: 2.4.8
65
+ signing_key:
66
+ specification_version: 4
67
+ summary: HTML sanitizer using lightweight Oga HTML parser.
68
+ test_files:
69
+ - spec/decontaminator/fragment_spec.rb