crazy_harry 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,6 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ .rvmrc
6
+ coverage/*
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in crazy_harry.gemspec
4
+ gemspec
5
+
6
+ gem 'rake'
data/Guardfile ADDED
@@ -0,0 +1,10 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
+ guard 'rspec', :version => 2, :cli => '--color' do
5
+ watch(%r{^spec/.+_spec\.rb$})
6
+ watch(%r{^lib/crazy_harry/(.+)\.rb$}) { |m| ["spec/lib/crazy_harry/#{m[1]}_spec.rb", "spec/integration/crazy_harry/#{m[1]}_spec.rb", "spec/lib/crazy_harry_spec.rb"] }
7
+ watch(%r{^lib/(.+)\.rb$}) { |m| ["spec/lib/#{m[1]}_spec.rb", "spec/integration/#{m[1]}_spec.rb"] }
8
+ watch('spec/spec_helper.rb') { "spec" }
9
+ end
10
+
data/README.md ADDED
@@ -0,0 +1,116 @@
1
+ # CrazyHarry
2
+
3
+ [![Build Status](https://secure.travis-ci.org/lonelyplanet/crazy_harry.png)](http://travis-ci.org/lonelyplanet/crazy_harry)
4
+
5
+ CrazyHarry is a high-level html fragment sanitiser/cleaner in use at
6
+ [Lonely Planet](http://www.lonelyplanet.com). It is based on [Flavour
7
+ Jones's](http://mike.daless.io) [Loofah Gem](https://github.com/flavorjones/loofah).
8
+
9
+ [Loofah](https://github.com/flavorjones/loofah) is a great tool and we've been
10
+ using it in a number of different projects. However, we found that we
11
+ were repeating the same types of cleaning job in multiple places.
12
+
13
+ CrazyHarry wraps up a number of these tasks in a simple DSL, while adding commands for a few edge cases that are not straightforward with Loofah.
14
+
15
+ ## Installation
16
+
17
+ gem 'crazy_harry'
18
+
19
+ bundle install
20
+
21
+ ## Usage
22
+
23
+ object_with_description.each do |obj|
24
+
25
+ if descriptions[obj.external_id]
26
+ sanitised_fragment = CrazyHarry.fragment(descriptions[obj.external_id])
27
+ .redact!( unsafe: true, tags: 'img')
28
+ .change!( from: 'b', to: 'h3' )
29
+ .change!( from: 'strong', to: 'h3' )
30
+
31
+ obj.update_column(:description, sanitised_fragment.to_s)
32
+ end
33
+ end
34
+
35
+ ## Default Actions
36
+
37
+ It automatically removes blank tags, converts `<br />` tags to wrapped
38
+ paragraphs and de-dupes content.
39
+
40
+ ## Chaining
41
+
42
+ As per the previous example, all calls **except** `.strip!` (which removes
43
+ all markup) may be chained. (`.strip!` can be the last element in a
44
+ chain. See below).
45
+
46
+ ## Scoping and Targeting by Content
47
+
48
+ All commands except `.strip!` accept `scope:` and `text:` attributes:
49
+
50
+ CrazyHarry.fragment( '<div><b>Hotels</b></div><p><b>Hotels</b></p><b>Tents</b>' ).change!( from: 'b', to: 'em', scope: 'p' ).to_s
51
+
52
+ will produce:
53
+
54
+ <div><b>Hotels</b></div><p><em>Hotels</em></p><b>Tents</b>
55
+
56
+ while:
57
+
58
+ CrazyHarry.fragment( 'Hot <b>hotels</b> in <b>Saigon</b>' ).change!( from: 'b', to: 'em', text: 'hotels' ).to_s
59
+
60
+ will produce:
61
+
62
+ Hot <em>hotels</em> in <b>Saigon</b>
63
+
64
+
65
+ ## Adding Attributes
66
+
67
+ Use the `.translate!` command to do this:
68
+
69
+ CrazyHarry.fragment( '<b>Header</b><p>Content</p>' ).translate!( add_attributes: { class: 'partner'} ).to_s
70
+
71
+ will return:
72
+
73
+ <b class="partner">Header</b><p class="partner">Content</p>
74
+
75
+ If a tag already has an attribute, the new attribute will be appended to
76
+ the existing one:
77
+
78
+ <b class="bright-red partner">Header</b><p class="partner">Content</p>
79
+
80
+ ## Stripping
81
+
82
+ ### Specific Tags
83
+
84
+ Use the `.redact!` command. When called with `tags:` (or any other option) it
85
+ **does not** strip unsafe tags unless you also pass `unsafe: true`; called with no arguments at all, it prunes unsafe tags.
86
+
87
+ ### All Tags
88
+
89
+ Use the `.strip!` command. It can be used as the last tag in a chain
90
+ (with `.translate!( from_text: <some text>, to_text: <other text> )`, for instance), but should generally be the only call
91
+ you make.
92
+
93
+ ## Text Translation
94
+
95
+ The `.translate!` command can change tag content, preserving case:
96
+
97
+ CrazyHarry.fragment( '<h3>Lodging</h3> lodging' ).translate!( from_text: 'lodging', to_text: 'hotel' ).to_s
98
+
99
+ will return:
100
+
101
+ <h3>Hotel</h3> hotel
102
+
103
+ ## Known Issues/TODO
104
+
105
+ * De-duping does not take account of whitespace. So, `<p>Some Content</p>` and `<p>Some Content </p>` will not be treated as duplicates.
106
+ * Be able to turn off default actions.
107
+ * It should be able to work on documents as well as fragments.
108
+ * Merge `.translate!` with `.change!`
109
+
110
+ ## Contributing
111
+
112
+ 1. Fork it
113
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
114
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
115
+ 4. Push to the branch (`git push origin my-new-feature`)
116
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
+ task :default => :spec
5
+
6
+ RSpec::Core::RakeTask.new
@@ -0,0 +1,28 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "crazy_harry/version"
4
+
5
+ Gem::Specification.new do |s|
6
+ s.name = "crazy_harry"
7
+ s.version = CrazyHarry::VERSION
8
+ s.authors = ["TA Tyree"]
9
+ s.email = ["todd.tyree@lonelyplanet.co.uk"]
10
+ s.homepage = "https://github.com/lonelyplanet/crazy_harry"
11
+ s.summary = %q{A High level HTML fragment sanitizer.}
12
+ s.description = %q{CrazyHarry is a high-level html fragment sanitiser/cleaner based on Loofah.}
13
+
14
+ s.rubyforge_project = "crazy_harry"
15
+
16
+ s.files = `git ls-files`.split("\n")
17
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
18
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
19
+ s.require_paths = ["lib"]
20
+
21
+ # specify any dependencies here; for example:
22
+ s.add_development_dependency 'rspec'
23
+ s.add_development_dependency 'guard'
24
+ s.add_development_dependency 'guard-rspec'
25
+ s.add_development_dependency 'simplecov'
26
+
27
+ s.add_runtime_dependency 'loofah'
28
+ end
@@ -0,0 +1,27 @@
1
+ require 'loofah'
2
+
3
+ %w(default change redact translate base version).each do |load_lib|
4
+ require_relative "crazy_harry/#{load_lib}"
5
+ end
6
+
7
# Top-level entry point of the CrazyHarry DSL.
module CrazyHarry

  # NOTE(review): nothing visible in this file reads or writes +base+; the
  # accessor is kept only so that anything including this module still has it.
  attr_accessor :base

  class << self

    # Wraps +fragment+ in a CrazyHarry::Base and runs the default clean-up
    # actions on it, in this order: blank-element removal, <br> to <p>
    # conversion, de-duplication.
    #
    # fragment - the HTML fragment to clean (handed to Base.new as
    #            +fragment:+).
    #
    # Returns the CrazyHarry::Base instance so further commands can be chained.
    def fragment(fragment)
      Base.new(fragment: fragment).tap do |base|
        base.no_blanks!
        base.convert_br_to_p!
        base.dedupe!
      end
    end

    # There is deliberately no +to_s+ override here. The previous one returned
    # +@base.to_s+, but +@base+ is never assigned on the module (+fragment+
    # works on a local), so it could only ever return "" and hid the module's
    # name from anything that prints it. Call +to_s+ on the object returned by
    # +fragment+ to get the cleaned markup.

  end

end
@@ -0,0 +1,31 @@
1
+ module CrazyHarry
2
+ class Base
3
+ include CrazyHarry::Default
4
+ include CrazyHarry::Change
5
+ include CrazyHarry::Redact
6
+ include CrazyHarry::Translate
7
+
8
+ attr_accessor :fragment, :scope, :steps, :text
9
+
10
+ def initialize(opts = {})
11
+ self.fragment = Loofah.fragment(opts.delete(:fragment)) if opts.has_key?(:fragment)
12
+ self.steps = []
13
+ end
14
+
15
+ def to_s
16
+ self.fragment.to_s.squeeze(' ').strip
17
+ end
18
+
19
+ private
20
+
21
# Applies every queued step to the fragment, in queue order, then empties
# the queue so that a later command does not replay earlier steps.
#
# nil entries are skipped: some step builders return nil when they have
# nothing to do (e.g. Change#generic_from_to for inline-to-block changes).
#
# The previous implementation called +delete_if+ on the array returned by
# +steps.compact+, i.e. on a copy, so +steps+ itself was never drained and
# each chained call re-ran all earlier scrubbers against whatever targeting
# state the latest call had set.
#
# Returns the (now empty) steps array.
def run!
  steps.compact.each { |step| fragment.scrub!(step) }
  steps.clear
end
29
+
30
+ end
31
+ end
@@ -0,0 +1,59 @@
1
module CrazyHarry
  # Tag-renaming command (+change!+), mixed into CrazyHarry::Base.
  #
  # The including class must provide +text+, +scope+ and +steps+ accessors
  # and a +run!+ method.
  module Change

    # Changing one of the +inlines+ into one of the +blocks+ is not handled
    # by a simple rename (see #generic_from_to).
    BLOCK_CONVERSION_ELEMENTS = {
      inlines: %w(br).freeze,
      blocks:  %w(div p).freeze
    }.freeze

    attr_accessor :from, :to

    # Renames matching tags.
    #
    # opts - Hash of options; it is only read, never modified:
    #        :from  - name of the tag to rename.
    #        :to    - new tag name.
    #        :text  - optional; only rename nodes whose text equals this.
    #        :scope - optional; only rename nodes whose parent has this name.
    #
    # Returns self so calls can be chained.
    def change!(opts)
      # Read with [] rather than delete: deleting emptied the caller's hash,
      # so re-using one options hash for a second call silently did nothing.
      self.from  = opts[:from]
      self.to    = opts[:to]
      self.text  = opts[:text]
      self.scope = opts[:scope]

      steps << generic_from_to
      steps << unwrap_unnecessary_paragraphs

      run!

      self
    end

    private

    # True when +node+ is a +from+ tag that also satisfies the optional
    # +text+ and +scope+ filters.
    def change_this_node?(node)
      return false if text && node.text != text
      return false if scope && node.parent.name != scope

      node.name == from
    end

    # True when the requested change is inline element -> block element.
    def convert_inline_element_to_block?
      BLOCK_CONVERSION_ELEMENTS[:inlines].include?(from) &&
        BLOCK_CONVERSION_ELEMENTS[:blocks].include?(to)
    end

    # Builds the scrubber that renames matching nodes. Returns nil (no step)
    # for inline-to-block conversions, which a plain rename cannot express.
    def generic_from_to
      return if convert_inline_element_to_block?

      Loofah::Scrubber.new do |node|
        node.name = to if change_this_node?(node)
      end
    end

    # Builds the scrubber that replaces a <p> with its only child when that
    # child is itself a block-level element.
    def unwrap_unnecessary_paragraphs
      Loofah::Scrubber.new do |node|
        node.replace(node.children.first) if unnecessary_paragraph?(node)
      end
    end

    # True for a <p> whose single child is a block-level element.
    def unnecessary_paragraph?(node)
      node.name == 'p' &&
        node.children.size == 1 &&
        Loofah::Elements::BLOCK_LEVEL.include?(node.children.first.name)
    end

  end
end
@@ -0,0 +1,67 @@
1
+ module CrazyHarry
2
+ module Default
3
+ TARGET_ELEMENTS = ['b'].to_set
4
+ TARGET_ELEMENTS.merge(Loofah::Elements::BLOCK_LEVEL)
5
+
6
+ def no_blanks!
7
+ self.steps << remove_blank_elements
8
+ run!
9
+ self
10
+ end
11
+
12
+ def dedupe!
13
+ self.fragment = Loofah.fragment(html_with_duplicates_removed_from_the_bottom_up)
14
+ self
15
+ end
16
+
17
+ def convert_br_to_p!
18
+ fragment.xpath('br/following-sibling::text()|br/preceding-sibling::text()').each do |node|
19
+ node.replace(Nokogiri.make("<p>#{node.to_html.strip}</p>")) unless node.content =~ /\A\s*\z/
20
+ end
21
+
22
+ fragment.scrub!(scrub_tag('br'))
23
+ end
24
+
25
+ private
26
+
27
+ def scrub_tag(tag_name)
28
+ Loofah::Scrubber.new do |node|
29
+ if node.name == tag_name
30
+ node.remove
31
+ Loofah::Scrubber::STOP # don't bother with the rest of the subtree
32
+ end
33
+ end
34
+ end
35
+
36
+ def this_element?(element)
37
+ name = element.respond_to?(:name) ? element.name : element
38
+ TARGET_ELEMENTS.include?(name)
39
+ end
40
+
41
+ def remove_blank_elements
42
+ Loofah::Scrubber.new do |node|
43
+ node.remove if this_element?(node) && node.content.empty?
44
+ end
45
+ end
46
+
47
# Serialises the fragment's top-level children and joins them into one
# HTML string, keeping only the first occurrence of each identical
# serialisation (later duplicates are dropped).
#
# This yields the same survivors as the previous bottom-up pass, but that
# pass called +delete_if+ on the memoized element list, consuming the cache:
# a second +dedupe!+ on the same object then saw an empty list and wiped
# the fragment. It was also quadratic in the number of children.
def html_with_duplicates_removed_from_the_bottom_up
  all_elements.uniq.join
end

# Returns a new list of the serialised children with the first occurrence
# of +element+ removed (nothing is removed when it is not present).
def all_elements_less_current_element(element)
  remaining = all_elements
  first_match = remaining.index(element)
  remaining.delete_at(first_match) if first_match
  remaining
end

# HTML serialisation of each top-level child of the fragment. Deliberately
# not memoized, so it always reflects the fragment's current content.
def all_elements
  fragment.children.map(&:to_html)
end
65
+
66
+ end
67
+ end
@@ -0,0 +1,73 @@
1
module CrazyHarry
  # Tag-stripping commands (+strip!+ and +redact!+), mixed into
  # CrazyHarry::Base.
  #
  # The including class must provide +fragment+, +text+, +scope+ and +steps+
  # accessors and a +run!+ method.
  module Redact

    # Raised when +unsafe:+ names something that is not in STRIP_METHODS.
    class InvalidStripMethod < StandardError; end

    # Names accepted for the +unsafe:+ option (besides +true+).
    STRIP_METHODS = %w(strip prune escape whitewash).freeze

    attr_accessor :attributes, :tags, :unsafe

    # Returns the fragment's +to_text+ result, i.e. not self, so it cannot
    # be chained any further.
    def strip!
      fragment.to_text
    end

    # Strips or prunes tags.
    #
    # opts - Hash of options; it is only read, never modified:
    #        :unsafe     - true, or one of STRIP_METHODS, to queue an
    #                      unsafe-tag step (true means :prune). Defaults to
    #                      true only when no options are given at all.
    #        :tags       - tag name or Array of tag names to act on.
    #        :attributes - optional Hash; act only on nodes having at least
    #                      one of these attribute/value pairs.
    #        :text       - optional; act only on nodes with exactly this text.
    #        :scope      - optional; act only on nodes whose parent has this
    #                      name.
    #        :prune      - when truthy, remove the tags together with their
    #                      content instead of keeping the content.
    #
    # Returns self so calls can be chained.
    # Raises InvalidStripMethod for an unknown +unsafe:+ value.
    def redact!(opts = {})
      # An explicit :unsafe always wins. The old `delete(:unsafe) || opts == {}`
      # turned `unsafe: false` into true, because the hash was empty once the
      # key had been deleted.
      self.unsafe     = opts.key?(:unsafe) ? opts[:unsafe] : opts.empty?
      self.tags       = [opts[:tags]].compact.flatten
      self.attributes = opts[:attributes]
      self.text       = opts[:text]
      self.scope      = opts[:scope]
      prune           = opts[:prune]

      steps << strip_unsafe if unsafe
      steps << strip_tags unless prune || tags.empty?
      steps << prune_tags if prune

      run!

      self
    end

    private

    # True when +node+ is one of the requested tags and passes the optional
    # +text+, +scope+ and +attributes+ filters.
    def redact_this_node?(node)
      return false if text && node.text != text
      return false if scope && node.parent.name != scope
      return false if attributes && attributes.none? { |name, value| node[name.to_s] == value }

      tags.include?(node.name)
    end

    # Returns the Symbol queued as the unsafe-tag step: the requested strip
    # method, or :prune when +unsafe+ is simply true.
    # NOTE(review): presumably resolved by Loofah to its built-in scrubber of
    # the same name when passed to scrub! -- confirm against Loofah's docs.
    def strip_unsafe
      fail InvalidStripMethod, "valid methods are #{STRIP_METHODS.join(', ')}." unless valid_strip_method?

      STRIP_METHODS.include?(unsafe.to_s) ? unsafe.to_sym : :prune
    end

    def valid_strip_method?
      unsafe == true || STRIP_METHODS.include?(unsafe.to_s)
    end

    # Builds the scrubber that replaces a matching node with its text
    # content, followed by a newline for block-level tags or a space
    # otherwise.
    def strip_tags
      Loofah::Scrubber.new do |node|
        node.replace("#{node.content}#{block_node?(node) ? "\n" : ' '}") if redact_this_node?(node)
      end
    end

    def block_node?(node)
      Loofah::Elements::BLOCK_LEVEL.include?(node.name)
    end

    # Builds the scrubber that removes a matching node and its content.
    # TODO: Refactor and DRY out--nearly identical to
    # ::Default#scrub_tag
    def prune_tags
      Loofah::Scrubber.new do |node|
        if redact_this_node?(node)
          node.remove
          Loofah::Scrubber::STOP # don't bother with the rest of the subtree
        end
      end
    end

  end
end
@@ -0,0 +1,56 @@
1
module CrazyHarry
  # Attribute-adding and text-replacing command (+translate!+), mixed into
  # CrazyHarry::Base.
  #
  # The including class must provide +text+, +scope+ and +steps+ accessors
  # and a +run!+ method.
  module Translate

    attr_accessor :add_attributes, :from_text, :to_text

    # Adds attributes to nodes and/or replaces text.
    #
    # opts - Hash of options; it is only read, never modified:
    #        :add_attributes - Hash of attribute name => value to add. A
    #                          value is appended (space separated) to an
    #                          existing attribute unless it is already one
    #                          of its whitespace-separated tokens.
    #        :from_text      - text to replace (needs :to_text as well).
    #        :to_text        - replacement text.
    #        :text           - optional; only add attributes to nodes with
    #                          exactly this text.
    #        :scope          - optional; only add attributes to nodes whose
    #                          parent has this name.
    #
    # Returns self so calls can be chained.
    def translate!(opts = {})
      self.add_attributes = opts[:add_attributes]
      self.from_text      = opts[:from_text]
      self.to_text        = opts[:to_text]
      self.text           = opts[:text]
      self.scope          = opts[:scope]

      steps << change_attributes if add_attributes
      steps << change_text if change_text?

      run!

      self
    end

    private

    # True when +node+ passes the optional +text+ and +scope+ filters.
    def translate_this_node?(node)
      return false if text && node.text != text
      return false if scope && node.parent.name != scope

      true
    end

    def change_text?
      from_text && to_text
    end

    # TODO move this to CrazyHarry::Change.
    #
    # Builds the scrubber that adds the requested attributes. The "already
    # present" check compares whole whitespace-separated tokens; the earlier
    # unescaped regex treated 'partner' as present in 'partners' and
    # misbehaved on values containing regex metacharacters.
    def change_attributes
      Loofah::Scrubber.new do |node|
        if translate_this_node?(node)
          add_attributes.each do |name, value|
            attribute = name.to_s
            next if node[attribute].to_s.split.include?(value.to_s)

            node[attribute] = [node[attribute], value].compact.join(' ')
          end
        end
      end
    end

    # Builds the scrubber that replaces +from_text+ with +to_text+ in its
    # all-lowercase and capitalized forms.
    #
    # Only text nodes are rewritten: assigning +content+ on an element
    # replaces all of its children with plain text, which used to flatten
    # any nested markup of an element containing the search text. The
    # search text is matched literally, not as a regular expression.
    def change_text
      Loofah::Scrubber.new do |node|
        if node.text?
          original   = node.content
          translated = original
            .gsub(from_text.downcase)   { to_text.downcase }
            .gsub(from_text.capitalize) { to_text.capitalize }

          node.content = translated unless translated == original
        end
      end
    end

  end
end
@@ -0,0 +1,3 @@
1
+ module CrazyHarry
2
+ VERSION = "0.0.3"
3
+ end
data/rvmrc.example ADDED
@@ -0,0 +1 @@
1
+ rvm use 1.9.3@crazy_harry --create
@@ -0,0 +1,50 @@
1
+ <p>
2
+ <b>Location. </b>
3
+ <br />
4
+ Golden Nugget formerly Trump Marina is located in Atlantic City, close to Atlantic City Aquarium, Steel Pier, and Atlantic City Hall. Nearby points of interest also include Atlantic City Public Library and Absecon Lighthouse.
5
+ </p>
6
+ <p>
7
+ <b>Resort Features. </b>
8
+ <br />
9
+ Dining options at Golden Nugget formerly Trump Marina include 8 restaurants. 4 bars/lounges, a beach bar, and a poolside bar are open for drinks. Room service is available 24 hours a day. Recreational amenities include a children's pool and a health club. The property's full service health spa has massage/treatment rooms and beauty services. This 3 star property has a business center and offers small meeting rooms and audio visual equipment. Wireless and wired high speed Internet access is available in public areas (surcharges apply)..
10
+ </p>
11
+ <p>
12
+ <b>Guestrooms. </b>
13
+ <br />
14
+ 728 air conditioned guestrooms at Golden Nugget formerly Trump Marina feature coffee/tea makers and blackout drapes/curtains. Bathrooms feature complimentary toiletries and hair dryers. Wired high speed Internet access is available for a surcharge. Televisions have pay movies. Also included are irons/ironing boards and clock radios. Guests may request a turndown service and wake up calls. Housekeeping is available daily.
15
+ </p>
16
+ <br /><br />
17
+ <p><strong>Notifications and Fees:</strong>
18
+ <br />
19
+ </p>
20
+ <p></p>
21
+ <p></p>
22
+ <p></p>
23
+ <p>The following fees and deposits are charged by the property at time of service, check in, or check out.
24
+ <ul>
25
+ <li>Self parking fee: USD 5 per stay</li>
26
+ <li>Valet parking fee: USD 10 per stay</li>
27
+ <li>Fee for in room high speed Internet (wired): USD 12.99 (for 24 hours, rates may vary)</li>
28
+ <li>Rollaway bed fee: USD 25. per stay</li>
29
+ </ul>
30
+ </p>
31
+ <p>
32
+ The above list may not be comprehensive. Fees and deposits may not include tax and are subject to change.
33
+ </p>
34
+ <br /><br />
35
+ <p>
36
+ <strong>Notifications and Fees:</strong>
37
+ <br />
38
+ </p>
39
+ <p></p>
40
+ <p></p>
41
+ <p></p>
42
+ <p>The following fees and deposits are charged by the property at time of service, check in, or check out.
43
+ <ul>
44
+ <li>Self parking fee: USD 5 per stay</li>
45
+ <li>Valet parking fee: USD 10 per stay</li>
46
+ <li>Fee for in room high speed Internet (wired): USD 12.99 (for 24 hours, rates may vary)</li>
47
+ <li>Rollaway bed fee: USD 25. per stay</li>
48
+ </ul>
49
+ </p>
50
+ <p>The above list may not be comprehensive. Fees and deposits may not include tax and are subject to change. </p>
@@ -0,0 +1,24 @@
1
+ <p>
2
+ <h3>Location. </h3>
3
+
4
+ Golden Nugget formerly Trump Marina is located in Atlantic City, close to Atlantic City Aquarium, Steel Pier, and Atlantic City Hall. Nearby points of interest also include Atlantic City Public Library and Absecon Lighthouse.
5
+ </p>
6
+ <p>
7
+ <h3>Resort Features. </h3>
8
+
9
+ Dining options at Golden Nugget formerly Trump Marina include 8 restaurants. 4 bars/lounges, a beach bar, and a poolside bar are open for drinks. Room service is available 24 hours a day. Recreational amenities include a children's pool and a health club. The property's full service health spa has massage/treatment rooms and beauty services. This 3 star property has a business center and offers small meeting rooms and audio visual equipment. Wireless and wired high speed Internet access is available in public areas (surcharges apply)..
10
+ </p><p>
11
+ <h3>Guestrooms. </h3>
12
+
13
+ 728 air conditioned guestrooms at Golden Nugget formerly Trump Marina feature coffee/tea makers and blackout drapes/curtains. Bathrooms feature complimentary toiletries and hair dryers. Wired high speed Internet access is available for a surcharge. Televisions have pay movies. Also included are irons/ironing boards and clock radios. Guests may request a turndown service and wake up calls. Housekeeping is available daily.
14
+
15
+ </p><p><h3>Notifications and Fees:</h3>
16
+ </p><p>The following fees and deposits are charged by the property at time of service, check in, or check out.
17
+ </p><ul>
18
+ <li>Self parking fee: USD 5 per stay</li>
19
+ <li>Valet parking fee: USD 10 per stay</li>
20
+ <li>Fee for in room high speed Internet (wired): USD 12.99 (for 24 hours, rates may vary)</li>
21
+ <li>Rollaway bed fee: USD 25. per stay</li>
22
+ </ul><p>
23
+ The above list may not be comprehensive. Fees and deposits may not include tax and are subject to change.
24
+ </p>
@@ -0,0 +1,23 @@
1
+ require 'spec_helper'
2
+
3
+ describe CrazyHarry do
4
+
5
+ let(:harry) { CrazyHarry }
6
+
7
+ it "should allow method chaining" do
8
+ harry.fragment('<script>STEAL COOKIE!</script><em>Place</em><p>Lodging</p><b>Location:</b>')
9
+ .redact!( unsafe: true, tags: 'em' )
10
+ .change!( from: 'b', to: 'h3' )
11
+ .translate!( from_text: 'Lodging', to_text: 'Hotel', add_attributes: { class: 'partner' } )
12
+ .to_s.should == 'Place <p class="partner">Hotel</p><h3 class="partner">Location:</h3>'
13
+ end
14
+
15
+ it "should not care about the chain order" do
16
+ harry.fragment('<script>STEAL COOKIE!</script><em>Place</em><p>Lodging</p><b>Location:</b>')
17
+ .translate!( from_text: 'Lodging', to_text: 'Hotel', add_attributes: { class: 'partner' } )
18
+ .redact!( unsafe: true, tags: 'em' )
19
+ .change!( from: 'b', to: 'h3' )
20
+ .to_s.should == 'Place <p class="partner">Hotel</p><h3 class="partner">Location:</h3>'
21
+ end
22
+
23
+ end
@@ -0,0 +1,83 @@
1
+ require 'spec_helper'
2
+
3
+ describe CrazyHarry::Change do
4
+
5
+ # Change br to p
6
+ # Change b to h3 where b has text: "location"
7
+ # Change b to em where b inside p
8
+ #
9
+ # Add attribute: "partner" to all tags
10
+ #
11
+ # Another ADS suggestion. Not sure if it is needed for the first cut.
12
+ # Transform "lodging" to "hotel"
13
+
14
+ context "change" do
15
+
16
+ let(:harry){ CrazyHarry }
17
+
18
+ it "should be able to change one tag to another" do
19
+ harry.fragment( '<b>Location:</b>' ).change!( from: 'b', to: 'h3' ).to_s.should == '<h3>Location:</h3>'
20
+ end
21
+
22
+ it "should unwrap unnecessary paragraphs" do
23
+ harry.fragment('<p><strong>Header</strong><br /></p>').change!(from: 'strong', to: 'h3').to_s.should ==
24
+ '<h3>Header</h3>'
25
+ end
26
+
27
+ it "should not unwrap paragraphs indiscriminately" do
28
+ harry.fragment('<p><b>Header</b><br />Content</p>').change!(from: 'b', to: 'h3').to_s.should ==
29
+ '<p><h3>Header</h3>Content</p>'
30
+ end
31
+
32
+ context "chaining" do
33
+
34
+ it "should be able to chain calls" do
35
+ harry.fragment( '<b>Location:</b><br />Saigon' ).change!( from: 'b', to: 'h3').change!( from: 'br', to: 'p' ).to_s.should ==
36
+ '<h3>Location:</h3><p>Saigon</p>'
37
+ end
38
+
39
+ it "should not care about the order of the chain" do
40
+ harry.fragment( '<b>Location:</b><br />Saigon' ).change!( from: 'br', to: 'p' ).change!( from: 'b', to: 'h3' ).to_s.should ==
41
+ '<h3>Location:</h3><p>Saigon</p>'
42
+ end
43
+
44
+ end
45
+
46
+ it "should be able to change all the tags it finds" do
47
+ harry.fragment( '<b>1</b><b>2</b><b>3</b>' ).change!( from: 'b', to: 'h3' ).to_s.should == '<h3>1</h3><h3>2</h3><h3>3</h3>'
48
+ end
49
+
50
+ it "should be able to change a span tag to a div" do
51
+ harry.fragment( '<span>Hotel Details</span>' ).change!( from: 'span', to: 'div' ).to_s.should == '<div>Hotel Details</div>'
52
+ end
53
+
54
+ it "should not change things that it isn't directed to" do
55
+ harry.fragment( '<h3>Hotel Details</h3>' ).change!( from: 'span', to: 'div' ).to_s.should == '<h3>Hotel Details</h3>'
56
+ end
57
+
58
+ it "shouldn't choke if it gets invalid tags" do
59
+ harry.fragment( '<h3>Hotel Details</h3>' ).change!( from: 'snoogenflozen', to: 'snarglebat' ).to_s.should == '<h3>Hotel Details</h3>'
60
+ end
61
+
62
+ context "targeting and scoping" do
63
+
64
+ it "should change all occurrences" do
65
+ harry.fragment( '<div><b>Hotels</b></div><p><b>Hotels</b></p><b>Tents</b><div><p><b>Campervan</b></p></div>' ).change!( from: 'b', to: 'em').to_s.should ==
66
+ '<div><em>Hotels</em></div><p><em>Hotels</em></p><em>Tents</em><div><p><em>Campervan</em></p></div>'
67
+ end
68
+
69
+ it "should be able to target changes by content" do
70
+ harry.fragment( 'Hot <b>hotels</b> in <b>Saigon</b>' ).change!( from: 'b', to: 'em', text: 'hotels' ).to_s.should ==
71
+ 'Hot <em>hotels</em> in <b>Saigon</b>'
72
+ end
73
+
74
+ it "should be able to scope changes to specific blocks" do
75
+ harry.fragment( '<div><b>Hotels</b></div><p><b>Hotels</b></p><b>Tents</b>' ).change!( from: 'b', to: 'em', scope: 'p' ).to_s.should ==
76
+ '<div><b>Hotels</b></div><p><em>Hotels</em></p><b>Tents</b>'
77
+ end
78
+
79
+ end
80
+
81
+ end
82
+
83
+ end
@@ -0,0 +1,86 @@
1
+ require 'spec_helper'
2
+
3
+ describe CrazyHarry do
4
+ TAGS_TO_REMOVE = {
5
+ paragraph: '<p></p>',
6
+ ul: '<ul></ul>',
7
+ div: '<div></div>'
8
+ }
9
+
10
+ let(:harry) { CrazyHarry }
11
+
12
+ context "default actions" do
13
+
14
+ it "should relentlessly remove br tags" do
15
+ harry.fragment('<p>hello</p><br /><strong>HELLO</strong><br /><br />').to_s.should == '<p>hello</p><strong>HELLO</strong>'
16
+ end
17
+
18
+ context "removing blank tags" do
19
+
20
+ TAGS_TO_REMOVE.each do |name,example|
21
+ it "should automatically remove blank #{name} tags" do
22
+ harry.fragment("#{example}<p>Hello!</p>").to_s.should == '<p>Hello!</p>'
23
+ end
24
+ end
25
+
26
+ end
27
+
28
+ context "inline tags to blocks" do
29
+
30
+ it "should automatically change br to p" do
31
+ harry.fragment( 'Hotel Details<br />' ).to_s.should == '<p>Hotel Details</p>'
32
+ end
33
+
34
+ it "should automatically change multiple br tags to a single p" do
35
+ harry.fragment( 'Hotel Details<br /><br /><br />' ).to_s.should == '<p>Hotel Details</p>'
36
+ end
37
+
38
+ it "should automatically clear whitespace when converting line-style to block-style" do
39
+ harry.fragment( 'hotel<br /> hostel <br />').to_s.should == '<p>hotel</p><p>hostel</p>'
40
+ end
41
+
42
+ it "should automatically change all br tags to wrapped p tags" do
43
+ harry.fragment( 'hotel<br /> hostel<br /> tent<br />').to_s.should == '<p>hotel</p><p>hostel</p><p>tent</p>'
44
+ end
45
+
46
+ it "should automatically change all multiple br tags to wrapped tags" do
47
+ harry.fragment( 'hotel<br /><br /><br/> hostel<br /> tent<br />').to_s.should == '<p>hotel</p><p>hostel</p><p>tent</p>'
48
+ end
49
+
50
+ it "should automatically change leading inline tags to block tags" do
51
+ harry.fragment( '<br /><br />hotel').to_s.should == '<p>hotel</p>'
52
+ end
53
+
54
+ it "should ignore stray inline tags if they don't appear relevant to the context" do
55
+ harry.fragment( '<br /><br />hotel <br />hostel<br />').to_s.should == '<p>hotel</p><p>hostel</p>'
56
+ end
57
+
58
+ end
59
+
60
+ context "de-duping" do
61
+
62
+ it "should automatically de-dupe lists" do
63
+ harry.fragment('<ul><li>Duplicate.</li></ul><ul><li>Duplicate.</li></ul>').to_s.should == '<ul><li>Duplicate.</li></ul>'
64
+ end
65
+
66
+ it "should automatically de-dupe paragraphs" do
67
+ harry.fragment('<p>Lorem Ipsum</p><p>Lorem Ipsum</p>').to_s.should == '<p>Lorem Ipsum</p>'
68
+ end
69
+
70
+ it "should not remove duplicate content that exsists at a different markup level" do
71
+ harry.fragment('<p><strong>Location:</strong></p><strong>Location:</strong>').to_s.should == '<p><strong>Location:</strong></p><strong>Location:</strong>'
72
+ end
73
+
74
+ it "should not alter other content when it de-dupes" do
75
+ harry.fragment('<h3>Here</h3><p>Yep, here.</p><h3>Here</h3><p>Here again.</p>').to_s.should == '<h3>Here</h3><p>Yep, here.</p><p>Here again.</p>'
76
+ end
77
+
78
+ it "should de-dupe if the only content difference is whitespace" do
79
+ pending "Too complicated for the first cut."
80
+ end
81
+
82
+ end
83
+
84
+ end
85
+
86
+ end
@@ -0,0 +1,122 @@
1
+ require 'spec_helper'
2
+
3
+ describe CrazyHarry::Redact do
4
+
5
+ # Strip all tags -- should convert </p> and <br> to linebreaks by default
6
+ # Strip tags from text: "HostelWorld"
7
+ # Strip tags from text: "HostelWorld" inside h3
8
+ #
9
+ # ADS suggested ability to remove tag and content, so 'kill' Probably
10
+ # more appropriate for script and image tags
11
+ #
12
+ # Kill a where a has attribute: "nofollow", text: "visit Disneyland"
13
+ # Kill img
14
+
15
+ let(:harry){ CrazyHarry }
16
+
17
+ context "strip all tags" do
18
+
19
+ it "should be able to strip all tags" do
20
+ harry.fragment( "<b>a</b>b<br /><h3>c</h3><img src='blah.jpg' />" ).strip!.should == "a\nb\n\nc\n"
21
+ end
22
+
23
+ end
24
+
25
+ context "strip specific tags" do
26
+
27
+ it "should be able to strip unsafe tags" do
28
+ harry.fragment( 'Me steal <script>Cookie!</script>').redact!( unsafe: true ).to_s.should == 'Me steal'
29
+ harry.fragment( 'What is that <bloog>thing</bloog>?').redact!( unsafe: true ).to_s.should == 'What is that ?'
30
+ end
31
+
32
+ it "should allow the unsafe strip method to be overridden (it prunes by default)" do
33
+ fragment = 'Me steal <script>Cookie!</script>'
34
+ harry.fragment( fragment).redact!( unsafe: :escape ).to_s.should == 'Me steal &lt;script&gt;Cookie!&lt;/script&gt;'
35
+ harry.fragment( fragment).redact!( unsafe: :strip ).to_s.should == 'Me steal Cookie!'
36
+ end
37
+
38
+ it "should prune unsafe tags if called without any arguments" do
39
+ harry.fragment( 'Me steal <script>Cookie!</script>').redact!.to_s.should == 'Me steal'
40
+ end
41
+
42
+ it "should raise an error if asked for an unknown strip method" do
43
+ -> { harry.fragment( 'Any old thing').redact!( unsafe: 'magical_magic' ).to_s }.should raise_error(CrazyHarry::Redact::InvalidStripMethod)
44
+ end
45
+
46
+ it "should be able to strip a specific tag" do
47
+ harry.fragment( '<b>Location:</b><p>Saigon</p>' ).redact!( tags: 'b' ).to_s.should == "Location: <p>Saigon</p>"
48
+ end
49
+
50
+ it "should strip every occurrence of a tag" do
51
+ harry.fragment( '<b>Location:</b><p>Saigon</p><b>Rates:</b>' ).redact!( tags: 'b' ).to_s.should == "Location: <p>Saigon</p>Rates:"
52
+ end
53
+
54
+ it "should be able to strip multiple tags" do
55
+ harry.fragment( '<b>One</b><p>Saigon <em>Plaza</em></p>').redact!( tags: %w(b em)).to_s.should == "One <p>Saigon Plaza </p>"
56
+ end
57
+
58
+ it "should close a paragraph tag early if the fragment has an illegally nested header tag" do
59
+ harry.fragment( '<b>One</b><p>Saigon <h3>Plaza</h3></p>').redact!( tags: %w(b h3)).to_s.should == "One <p>Saigon </p>Plaza"
60
+ end
61
+
62
+ context "chaining and multiple arguments" do
63
+
64
+ it "should be able to chain calls" do
65
+ harry.fragment( '<b>Location:</b><script>Steal Cookie</script><em>Plaza</em>' ).redact!( tags: 'b' ).redact!( unsafe: true ).to_s.should ==
66
+ 'Location: <em>Plaza</em>'
67
+ end
68
+
69
+ it "should accept multiple, different commands in a single call" do
70
+ harry.fragment( '<b>Location:</b><script>Steal Cookie</script><em>Plaza</em>' ).redact!( tags: 'b', unsafe: true ).to_s.should ==
71
+ 'Location: <em>Plaza</em>'
72
+ end
73
+
74
+ end
75
+
76
+ context "whitespace and breaks" do
77
+
78
+ it "shouldn't add excessive extra whitespace" do
79
+ harry.fragment( '<b>Location: </b> <b>Prices:</b>' ).redact!( tags: 'b' ).to_s.should == 'Location: Prices:'
80
+ end
81
+
82
+ it "should add a newline after a stripped block element" do
83
+ harry.fragment( '<h3>Location:</h3><p>Lorem ipsum</p>' ).redact!( tags: 'h3' ).to_s.should == "Location:\n<p>Lorem ipsum</p>"
84
+ end
85
+
86
+ end
87
+
88
+ context "targeting and scope" do
89
+
90
+ it "should allow strip to be targeted by the tag content" do
91
+ harry.fragment( '<h3>Location:</h3><h3>Ho Chi Minh City</h3>' ).redact!( tags: 'h3', text: 'Location:' ).to_s.should ==
92
+ "Location:\n<h3>Ho Chi Minh City</h3>"
93
+ end
94
+
95
+ it "should allow strip to be targeted by an attribute" do
96
+ harry.fragment( '<h3 class="big">Big</h3><h3 class="red">Red</h3>').redact!( tags: 'h3', attributes: { class: 'red' } ).to_s.should ==
97
+ '<h3 class="big">Big</h3>Red'
98
+ end
99
+
100
+ it "should be able to scope changes to specific blocks" do
101
+ harry.fragment( '<div><b>Hotels</b></div><p><b>Hotels</b></p><b>Tents</b>' ).redact!( tags: 'b', scope: 'p' ).to_s.should ==
102
+ '<div><b>Hotels</b></div><p>Hotels </p><b>Tents</b>'
103
+ end
104
+
105
+ it "should allow strip to be targeted and scoped" do
106
+ harry.fragment( '<b>Location:</b><p><b>Big</b> <b>Bad</b> Hotel</p>' ).redact!( tags: 'b', scope: 'p', text: 'Big' ).to_s.should ==
107
+ '<b>Location:</b><p>Big <b>Bad</b> Hotel</p>'
108
+ end
109
+
110
+ end
111
+
112
+ end
113
+
114
+ context 'pruning' do
115
+
116
+ it 'should be able to prune a tag, removing the tag and its contents' do
117
+ harry.fragment( 'I do not want <h3>Big</h3> images.').redact!( tags: 'h3', prune: true ).to_s.should == 'I do not want images.'
118
+ end
119
+
120
+ end
121
+
122
+ end
@@ -0,0 +1,46 @@
1
+ require 'spec_helper'
2
+
3
+ describe CrazyHarry::Translate do
4
+
5
+ # Add attribute: "partner" to all tags
6
+ # Transform "lodging" to "hotel"
7
+
8
+ let(:harry) { CrazyHarry }
9
+
10
+ it "should be able to add an attribute to all tags" do
11
+ harry.fragment( '<b>a</b><h3>c</h3>' ).translate!( add_attributes: { class: 'from-partner'} ).to_s.should ==
12
+ '<b class="from-partner">a</b><h3 class="from-partner">c</h3>'
13
+ end
14
+
15
+ it "should be able to add multiple attributes to all tags" do
16
+ harry.fragment( '<b>a</b><h3>c</h3>' ).translate!( add_attributes: { class: 'boo', bubba: 'beau' } ).to_s.should ==
17
+ '<b class="boo" bubba="beau">a</b><h3 class="boo" bubba="beau">c</h3>'
18
+ end
19
+
20
+ it "should append information to existing attributes" do
21
+ harry.fragment( '<h3 class="red">Lorem</h3>' ).translate!( add_attributes: { class: 'from-partner'} ).to_s.should ==
22
+ '<h3 class="red from-partner">Lorem</h3>'
23
+ end
24
+
25
+ it "should be able to translate free text in the fragment, preserving case" do
26
+ harry.fragment( '<h3>Lodging</h3> lodging' ).translate!( from_text: 'lodging', to_text: 'hotel' ).to_s.should ==
27
+ '<h3>Hotel</h3> hotel'
28
+ end
29
+
30
+ context "targeting and scope" do
31
+
32
+ it "should only add attributes to targeted content" do
33
+ harry.fragment( '<h3>Location:</h3><h3>Saigon</h3>' ).translate!( add_attributes: { class: 'partner' }, text: 'Location:' ).to_s.should ==
34
+ '<h3 class="partner">Location:</h3><h3>Saigon</h3>'
35
+ end
36
+
37
+ it "should be able to scope changes to specific blocks" do
38
+ harry.fragment( '<div><b>Hotel</b></div><p><b>Hotel</b></p><b>Tent</b>' ).translate!( add_attributes: { class: 'ugly' }, scope: 'p' ).to_s.should ==
39
+ '<div><b>Hotel</b></div><p><b class="ugly">Hotel</b></p><b>Tent</b>'
40
+ end
41
+
42
+ end
43
+
44
+
45
+
46
+ end
@@ -0,0 +1,12 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ require 'simplecov'
4
+
5
+ SimpleCov.start if ENV["COVERAGE"]
6
+
7
+ Bundler.require :default, :development
8
+
9
+ require 'rspec'
10
+
11
+ RSpec.configure do |config|
12
+ end
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: crazy_harry
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.3
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - TA Tyree
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-06-26 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ requirement: &2151932960 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *2151932960
25
+ - !ruby/object:Gem::Dependency
26
+ name: guard
27
+ requirement: &2151930220 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *2151930220
36
+ - !ruby/object:Gem::Dependency
37
+ name: guard-rspec
38
+ requirement: &2151928080 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :development
45
+ prerelease: false
46
+ version_requirements: *2151928080
47
+ - !ruby/object:Gem::Dependency
48
+ name: simplecov
49
+ requirement: &2151940320 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :development
56
+ prerelease: false
57
+ version_requirements: *2151940320
58
+ - !ruby/object:Gem::Dependency
59
+ name: loofah
60
+ requirement: &2151939040 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ type: :runtime
67
+ prerelease: false
68
+ version_requirements: *2151939040
69
+ description: CrazyHarry is a high-level html fragment sanitiser/cleaner based on Loofah.
70
+ email:
71
+ - todd.tyree@lonelyplanet.co.uk
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files: []
75
+ files:
76
+ - .gitignore
77
+ - Gemfile
78
+ - Guardfile
79
+ - README.md
80
+ - Rakefile
81
+ - crazy_harry.gemspec
82
+ - lib/crazy_harry.rb
83
+ - lib/crazy_harry/base.rb
84
+ - lib/crazy_harry/change.rb
85
+ - lib/crazy_harry/default.rb
86
+ - lib/crazy_harry/redact.rb
87
+ - lib/crazy_harry/translate.rb
88
+ - lib/crazy_harry/version.rb
89
+ - rvmrc.example
90
+ - spec/fixtures/golden_nugget.html
91
+ - spec/fixtures/golden_nugget_corrected.html
92
+ - spec/integration/crazy_harry_spec.rb
93
+ - spec/lib/change_spec.rb
94
+ - spec/lib/crazy_harry_spec.rb
95
+ - spec/lib/redact_spec.rb
96
+ - spec/lib/translate_spec.rb
97
+ - spec/spec_helper.rb
98
+ homepage: https://github.com/lonelyplanet/crazy_harry
99
+ licenses: []
100
+ post_install_message:
101
+ rdoc_options: []
102
+ require_paths:
103
+ - lib
104
+ required_ruby_version: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ required_rubygems_version: !ruby/object:Gem::Requirement
111
+ none: false
112
+ requirements:
113
+ - - ! '>='
114
+ - !ruby/object:Gem::Version
115
+ version: '0'
116
+ requirements: []
117
+ rubyforge_project: crazy_harry
118
+ rubygems_version: 1.8.10
119
+ signing_key:
120
+ specification_version: 3
121
+ summary: A High level HTML fragment sanitizer.
122
+ test_files:
123
+ - spec/fixtures/golden_nugget.html
124
+ - spec/fixtures/golden_nugget_corrected.html
125
+ - spec/integration/crazy_harry_spec.rb
126
+ - spec/lib/change_spec.rb
127
+ - spec/lib/crazy_harry_spec.rb
128
+ - spec/lib/redact_spec.rb
129
+ - spec/lib/translate_spec.rb
130
+ - spec/spec_helper.rb