RubyGems - html-hierarchy-extractor - Versions diffs - 1.0.2 → 1.0.9 - Mend

html-hierarchy-extractor 1.0.2 → 1.0.9

Files changed (31)

checksums.yaml +4 -4
metadata +45 -48
data/.coveralls.yml +0 -1
data/.document +0 -5
data/.rspec +0 -2
data/.rubocop.yml +0 -26
data/.travis.yml +0 -12
data/CONTRIBUTING.md +0 -53
data/Gemfile +0 -16
data/Guardfile +0 -7
data/LICENSE.txt +0 -20
data/README.md +0 -141
data/Rakefile +0 -58
data/VERSION +0 -1
data/html-hierarchy-extractor.gemspec +0 -99
data/lib/html-hierarchy-extractor.rb +0 -144
data/lib/version.rb +0 -6
data/scripts/bump_version +0 -47
data/scripts/check_flay +0 -30
data/scripts/check_flog +0 -31
data/scripts/coverage +0 -3
data/scripts/git_hooks/pre-commit +0 -16
data/scripts/git_hooks/pre-push +0 -9
data/scripts/lint +0 -2
data/scripts/release +0 -13
data/scripts/test +0 -4
data/scripts/test_ci +0 -7
data/scripts/watch +0 -4
data/spec/html_hierarchy_extractor_spec.rb +0 -441
data/spec/spec_helper.rb +0 -14
data/spec/spec_helper_simplecov.rb +0 -9

@@ -1,9 +0,0 @@
-#!/usr/bin/env bash
-./scripts/test || exit 1
-# No over-complex methods
-./scripts/check_flog || exit 1
-# No duplication
-./scripts/check_flay

data/scripts/lint DELETED

@@ -1,2 +0,0 @@
-#!/usr/bin/env bash
-rubocop -F './lib/' './spec'

data/scripts/release DELETED

@@ -1,13 +0,0 @@
-#!/usr/bin/env bash
-# Stop if any command fails
-set -e
-git checkout master
-git pull
-git rebase develop
-bundle install
-rake release
-git checkout develop
-git rebase master

data/scripts/test DELETED

@@ -1,4 +0,0 @@
-#!/usr/bin/env bash
-cd "$(dirname "$BASH_SOURCE")"/..
-COVERAGE=1 bundle exec rspec

data/scripts/test_ci DELETED

@@ -1,7 +0,0 @@
-#!/usr/bin/env bash
-# This script will be started by Travis, in the correct context (matrix of Ruby
-# version + Gemfile version), so it only needs to load the tests, without
-# worrying about appraisal
-cd "$(dirname "$BASH_SOURCE")"/..
-COVERAGE=1 bundle exec rspec

data/scripts/watch DELETED

@@ -1,4 +0,0 @@
-#!/usr/bin/env bash
-cd "$(dirname "$BASH_SOURCE")"/..
-guard

data/spec/html_hierarchy_extractor_spec.rb DELETED

@@ -1,441 +0,0 @@
-require 'spec_helper'
-describe(HTMLHierarchyExtractor) do
-  describe 'extract' do
-    it 'should load from an HTML string' do
-      # Given
-      input = '<p>foo</p>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual.size).to eq 1
-    end
-    it 'should allow overriding of the default css selector of nodes' do
-      # Given
-      input = '<div>foo</div>'
-      # When
-      options = {
-        css_selector: 'div'
-      }
-      actual = HTMLHierarchyExtractor.new(input, options: options).extract
-      # Then
-      expect(actual.size).to eq 1
-    end
-    it 'should export the Nokogiri node' do
-      # Given
-      input = '<p>foo</p>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual[0][:node]).to be_an(Nokogiri::XML::Element)
-    end
-    it 'should remove empty elements' do
-      # Given
-      input = '<p></p>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual.size).to eq 0
-    end
-    it 'should add the DOM position to each element' do
-      # Given
-      input = '<p>foo</p>
-               <p>bar</p>
-               <p>baz</p>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual[0][:weight][:position]).to eq 0
-      expect(actual[1][:weight][:position]).to eq 1
-      expect(actual[2][:weight][:position]).to eq 2
-    end
-  end
-  describe 'extract_html' do
-    it 'should extract outer html' do
-      # Given
-      input = '<p>foo</p>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual[0][:html]).to eq '<p>foo</p>'
-    end
-    it 'should trim content' do
-      # Given
-      input = '<p>foo</p>
-               <blink>irrelevant</blink>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual[0][:html]).to eq '<p>foo</p>'
-    end
-  end
-  describe 'extract_text' do
-    it 'should extract inner text' do
-      # Given
-      input = '<p>foo</p>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual[0][:text]).to eq 'foo'
-    end
-    it 'should extract UTF8 correctly' do
-      # Given
-      input = '<p>UTF8‽✗✓</p>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual[0][:text]).to eq 'UTF8‽✗✓'
-    end
-  end
-  describe 'extract_tag_name' do
-    it 'should extract the tag name' do
-      # Given
-      input = '<p>foo</p>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual[0][:tag_name]).to eq 'p'
-    end
-    it 'should always return lowercase' do
-      # Given
-      input = '<P>foo</P>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual[0][:tag_name]).to eq 'p'
-    end
-  end
-  describe 'extract_hierarchy' do
-    it 'should extract a simple hierarchy' do
-      # Given
-      input = '<h1>Foo</h1>
-               <p>First paragraph</p>
-               <h2>Bar</h2>
-               <p>Second paragraph</p>
-               <h3>Baz</h3>
-               <p>Third paragraph</p>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual[0][:hierarchy][:lvl0]).to eq 'Foo'
-      expect(actual[0][:hierarchy][:lvl1]).to eq nil
-      expect(actual[0][:hierarchy][:lvl2]).to eq nil
-      expect(actual[1][:hierarchy][:lvl0]).to eq 'Foo'
-      expect(actual[1][:hierarchy][:lvl1]).to eq 'Bar'
-      expect(actual[1][:hierarchy][:lvl2]).to eq nil
-      expect(actual[2][:hierarchy][:lvl0]).to eq 'Foo'
-      expect(actual[2][:hierarchy][:lvl1]).to eq 'Bar'
-      expect(actual[2][:hierarchy][:lvl2]).to eq 'Baz'
-    end
-    it 'should use inner text of headings' do
-      # Given
-      input = '<h1><a href="#">Foo</a><span></span></h1>
-               <p>First paragraph</p>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual[0][:hierarchy][:lvl0]).to eq 'Foo'
-      expect(actual[0][:hierarchy][:lvl1]).to eq nil
-      expect(actual[0][:hierarchy][:lvl2]).to eq nil
-    end
-    it 'should handle nodes not in any hierarchy' do
-      # Given
-      input = '<p>First paragraph</p>
-               <h1>Foo</h1>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual[0][:hierarchy][:lvl0]).to eq nil
-      expect(actual[0][:hierarchy][:lvl1]).to eq nil
-      expect(actual[0][:hierarchy][:lvl2]).to eq nil
-    end
-    it 'should handle any number of wrappers' do
-      # Given
-      input = '<header>
-                 <h1>Foo</h1>
-                 <p>First paragraph</p>
-               </header>
-               <div>
-                 <div>
-                   <div>
-                     <h2>Bar</h2>
-                     <p>Second paragraph</p>
-                     </div>
-                   </div>
-                 <div>
-                   <h3>Baz</h3>
-                   <p>Third paragraph</p>
-                 </div>
-               </div>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual[0][:hierarchy][:lvl0]).to eq 'Foo'
-      expect(actual[0][:hierarchy][:lvl1]).to eq nil
-      expect(actual[0][:hierarchy][:lvl2]).to eq nil
-      expect(actual[1][:hierarchy][:lvl0]).to eq 'Foo'
-      expect(actual[1][:hierarchy][:lvl1]).to eq 'Bar'
-      expect(actual[1][:hierarchy][:lvl2]).to eq nil
-      expect(actual[2][:hierarchy][:lvl0]).to eq 'Foo'
-      expect(actual[2][:hierarchy][:lvl1]).to eq 'Bar'
-      expect(actual[2][:hierarchy][:lvl2]).to eq 'Baz'
-    end
-  end
-  describe 'extract_anchor' do
-    it 'should get the anchor of parent' do
-      # Given
-      input = '<h1 name="anchor">Foo</h1>
-               <p>First paragraph</p>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual[0][:anchor]).to eq 'anchor'
-    end
-    it 'should get no anchor if none found' do
-      # Given
-      input = '<h1>Foo</h1>
-               <p>First paragraph</p>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual[0][:anchor]).to eq nil
-    end
-    it 'should use the id as anchor if no name set' do
-      # Given
-      input = '<h1 id="anchor">Foo</h1>
-               <p>First paragraph</p>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual[0][:anchor]).to eq 'anchor'
-    end
-    it 'should be set to nil if no name nor id' do
-      # Given
-      input = '<h1>Foo</h1>
-               <p>First paragraph</p>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual[0][:anchor]).to eq nil
-    end
-    it 'should get the anchor of closest parent with an anchor' do
-      # Given
-      input = '<h1 name="anchor">Foo</h1>
-               <p>First paragraph</p>
-               <h2>Bar</h2>
-               <p>Second paragraph</p>
-               <h3 name="subanchor">Baz</h3>
-               <p>Third paragraph</p>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual[0][:anchor]).to eq 'anchor'
-      expect(actual[1][:anchor]).to eq 'anchor'
-      expect(actual[2][:anchor]).to eq 'subanchor'
-    end
-    it 'should get anchor even if heading not a direct parent' do
-      # Given
-      input = '<header>
-                 <h1 name="anchor">Foo</h1>
-                 <p>First paragraph</p>
-               </header>
-               <div>
-                 <div>
-                   <div>
-                     <h2>Bar</h2>
-                     <p>Second paragraph</p>
-                   </div>
-                 </div>
-                 <div>
-                   <h3 name="subanchor">Baz</h3>
-                   <p>Third paragraph</p>
-                 </div>
-               </div>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual[0][:anchor]).to eq 'anchor'
-      expect(actual[1][:anchor]).to eq 'anchor'
-      expect(actual[2][:anchor]).to eq 'subanchor'
-    end
-    it 'should get anchor if not directly on the header but inner element' do
-      # Given
-      input = '<h1><a name="anchor">Foo</a></h1>
-               <p>First paragraph</p>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual[0][:anchor]).to eq 'anchor'
-    end
-  end
-  describe 'uuid' do
-    it 'should give different uuid if different content' do
-      # Given
-      input_a = '<p>foo</p>'
-      input_b = '<p>bar</p>'
-      # When
-      actual_a = HTMLHierarchyExtractor.new(input_a).extract[0]
-      actual_b = HTMLHierarchyExtractor.new(input_b).extract[0]
-      # Then
-      expect(actual_a[:uuid]).not_to eq(actual_b[:uuid])
-    end
-    it 'should give different uuid if different HTML tag' do
-      # Given
-      input_a = '<p>foo</p>'
-      input_b = '<p class="bar">foo</p>'
-      # When
-      actual_a = HTMLHierarchyExtractor.new(input_a).extract[0]
-      actual_b = HTMLHierarchyExtractor.new(input_b).extract[0]
-      # Then
-      expect(actual_a[:uuid]).not_to eq(actual_b[:uuid])
-    end
-    it 'should give different uuid if different position in page' do
-      # Given
-      input_a = '<p>foo</p><p>bar</p>'
-      input_b = '<p>foo</p><p>foo again</p><p>bar</p>'
-      # When
-      actual_a = HTMLHierarchyExtractor.new(input_a).extract[1]
-      actual_b = HTMLHierarchyExtractor.new(input_b).extract[2]
-      # Then
-      expect(actual_a[:uuid]).not_to eq(actual_b[:uuid])
-    end
-    it 'should give different uuid if different parent header' do
-      # Given
-      input_a = '<h1 name="foo">foo</h1><p>bar</p>'
-      input_b = '<h1 name="bar">bar</h1><p>bar</p>'
-      # When
-      actual_a = HTMLHierarchyExtractor.new(input_a).extract[0]
-      actual_b = HTMLHierarchyExtractor.new(input_b).extract[0]
-      # Then
-      expect(actual_a[:uuid]).not_to eq(actual_b[:uuid])
-    end
-    it 'should always give the same uuid for the same content' do
-      # Given
-      input_a = '<h1 name="foo">foo</h1><p>bar</p>'
-      input_b = '<h1 name="foo">foo</h1><p>bar</p>'
-      # When
-      actual_a = HTMLHierarchyExtractor.new(input_a).extract[0]
-      actual_b = HTMLHierarchyExtractor.new(input_b).extract[0]
-      # Then
-      expect(actual_a[:uuid]).to eq(actual_b[:uuid])
-    end
-  end
-  describe 'heading_weight' do
-    it 'should have 100 if no heading' do
-      # Given
-      input = '<p>foo</p>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual[0][:weight][:heading]).to eq 100
-    end
-    it 'should have decreasing value under small headers' do
-      # Given
-      input = '<h1 name="one">bar</h1><p>foo</p>
-               <h2 name="two">bar</h2><p>foo</p>
-               <h3 name="three">bar</h3><p>foo</p>
-               <h4 name="four">bar</h4><p>foo</p>
-               <h5 name="five">bar</h5><p>foo</p>
-               <h6 name="six">bar</h6><p>foo</p>'
-      # When
-      actual = HTMLHierarchyExtractor.new(input).extract
-      # Then
-      expect(actual[0][:weight][:heading]).to eq 90
-      expect(actual[1][:weight][:heading]).to eq 80
-      expect(actual[2][:weight][:heading]).to eq 70
-      expect(actual[3][:weight][:heading]).to eq 60
-      expect(actual[4][:weight][:heading]).to eq 50
-      expect(actual[5][:weight][:heading]).to eq 40
-    end
-  end
-end