RubyGems - html-hierarchy-extractor - Versions diffs - 1.0.0 - Mend

html-hierarchy-extractor 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

checksums.yaml +7 -0
data/.coveralls.yml +1 -0
data/.document +5 -0
data/.rspec +2 -0
data/.rubocop.yml +26 -0
data/.travis.yml +12 -0
data/CONTRIBUTING.md +53 -0
data/Gemfile +16 -0
data/Guardfile +7 -0
data/LICENSE.txt +20 -0
data/README.md +17 -0
data/Rakefile +58 -0
data/VERSION +1 -0
data/html-hierarchy-extractor.gemspec +99 -0
data/lib/html-hierarchy-extractor.rb +144 -0
data/lib/version.rb +6 -0
data/scripts/bump_version +47 -0
data/scripts/check_flay +30 -0
data/scripts/check_flog +31 -0
data/scripts/coverage +3 -0
data/scripts/git_hooks/pre-commit +16 -0
data/scripts/git_hooks/pre-push +9 -0
data/scripts/lint +2 -0
data/scripts/release +16 -0
data/scripts/test +4 -0
data/scripts/test_ci +7 -0
data/scripts/watch +4 -0
data/spec/html_hierarchy_extractor_spec.rb +441 -0
data/spec/spec_helper.rb +14 -0
data/spec/spec_helper_simplecov.rb +9 -0
metadata +230 -0

data/lib/version.rb ADDED Viewed

@@ -0,0 +1,6 @@
+# Expose gem version
+class HTMLHierarchyExtractorVersion
+  def self.to_s
+    '1.0.0'
+  end
+end

data/scripts/bump_version ADDED Viewed

@@ -0,0 +1,47 @@
+#!/usr/bin/env ruby
+require_relative '../lib/version.rb'
+# Simple script used to bump the version number
+class BumpVersion
+  def initialize(*args)
+    @type = args[0]
+    unless valid_type?(@type)
+      puts "Invalid bump type: #{@type}"
+      exit 1
+    end
+  end
+  def valid_type?(type)
+    %w(major minor patch).include?(type)
+  end
+  def bump(current_version, type)
+    major, minor, patch = current_version.split('.').map(&:to_i)
+    if type == 'major'
+      major += 1
+      minor = 0
+      patch = 0
+    end
+    if type == 'minor'
+      minor += 1
+      patch = 0
+    end
+    patch += 1 if type == 'patch'
+    "#{major}.#{minor}.#{patch}"
+  end
+  def run
+    old_version = HTMLHierarchyExtractorVersion.to_s
+    new_version = bump(old_version, @type)
+    script_dir = File.expand_path(File.dirname(__FILE__))
+    file = File.join(script_dir, '../lib/version.rb')
+    old_content = File.read(file)
+    new_content = old_content.gsub(old_version, new_version)
+    File.write(file, new_content)
+    `git add #{file}`
+    `git commit -m "chore(bump): Version bump to #{new_version}"`
+  end
+end
+BumpVersion.new(*ARGV).run

data/scripts/check_flay ADDED Viewed

@@ -0,0 +1,30 @@
+#!/usr/bin/env ruby
+MAX_SCORE = 45
+flay_lines = `flay -s ./lib/`.split("\n")
+errors = []
+flay_lines.each_with_index do |line, index|
+  # Skip header
+  next if index < 2
+  pattern = /^ *(.*): (.*)/
+  matches = line.match(pattern)
+  next if matches.nil?
+  score = matches[1].to_f
+  next if score < MAX_SCORE
+  errors << {
+    score: score,
+    file: matches[2]
+  }
+end
+exit 0 if errors.size == 0
+puts 'Flay test failed:'
+errors.sort_by { |a| a[:score] }.each do |error|
+  puts "#{error[:score]} / #{MAX_SCORE} in #{error[:file]}"
+end
+exit 1

data/scripts/check_flog ADDED Viewed

@@ -0,0 +1,31 @@
+#!/usr/bin/env ruby
+MAX_SCORE = 45
+flog_lines = `flog ./lib/`.split("\n")
+errors = []
+flog_lines.each_with_index do |line, index|
+  # Skip header
+  next if index < 3
+  pattern = /^ *(.*): (.*) (.*):[0-9]*/
+  matches = line.match(pattern)
+  next if matches.nil?
+  score = matches[1].to_f
+  next if score < MAX_SCORE
+  errors << {
+    score: score,
+    method: matches[2],
+    file: matches[3]
+  }
+end
+exit 0 if errors.size == 0
+puts 'Flog test failed:'
+errors.sort_by { |a| a[:score] }.each do |error|
+  puts "#{error[:score]} / #{MAX_SCORE}: #{error[:method]} in #{error[:file]}"
+end
+exit 1

data/scripts/coverage ADDED Viewed

@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+COVERAGE=1 bundle exec rspec

data/scripts/git_hooks/pre-commit ADDED Viewed

@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+# Succeed fast if we did not change any ruby file
+if ! git status --short | grep -q '\.rb$'; then
+  exit 0
+fi
+# Do not commit any focused or excluded tests.
+# Indentation is matched with [[:space:]] because grep -E does not read \t
+# as a tab, so tab-indented lines would otherwise slip through.
+if grep --color -rE '^[[:space:]]*(fit|fdescribe|xit|xdescribe)' spec; then
+  echo '✘ You have focused and/or skipped tests'
+  exit 1
+fi
+# Match style guide
+./scripts/lint || exit 1

data/scripts/git_hooks/pre-push ADDED Viewed

@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+./scripts/test || exit 1
+# No over-complex methods
+./scripts/check_flog || exit 1
+# No duplication
+./scripts/check_flay

data/scripts/lint ADDED Viewed

@@ -0,0 +1,2 @@
+#!/usr/bin/env bash
+rubocop -F './lib/' './spec'

data/scripts/release ADDED Viewed

@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+# Stop if any command fails
+set -e
+git checkout master
+git pull
+bundle install
+git rebase develop
+bundle install
+rake release
+git checkout develop
+bundle install
+git rebase master
+bundle install

data/scripts/test ADDED Viewed

@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+cd "$(dirname "$BASH_SOURCE")"/..
+COVERAGE=1 bundle exec rspec

data/scripts/test_ci ADDED Viewed

@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+# This script will be started by Travis, in the correct context (matrix of Ruby
+# version + Gemfile version), so it only needs to load the tests, without
+# worrying about appraisal
+# Run from the parent of this script's directory, whatever the caller's cwd
+cd "$(dirname "$BASH_SOURCE")"/..
+COVERAGE=1 bundle exec rspec

data/scripts/watch ADDED Viewed

@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+cd "$(dirname "$BASH_SOURCE")"/..
+guard

data/spec/html_hierarchy_extractor_spec.rb ADDED Viewed

@@ -0,0 +1,441 @@
+require 'spec_helper'
+describe(HTMLHierarchyExtractor) do
+  # General behaviour of #extract: building from an HTML string, the
+  # css_selector option, the :node key, dropping empty elements and the
+  # :weight[:position] index of each record
+  describe 'extract' do
+    it 'should load from an HTML string' do
+      # Given
+      input = '<p>foo</p>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual.size).to eq 1
+    end
+    it 'should allow overriding of the default css selector of nodes' do
+      # Given
+      input = '<div>foo</div>'
+      # When
+      options = {
+        css_selector: 'div'
+      }
+      actual = HTMLHierarchyExtractor.new(input, options: options).extract
+      # Then
+      expect(actual.size).to eq 1
+    end
+    it 'should export the Nokogiri node' do
+      # Given
+      input = '<p>foo</p>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual[0][:node]).to be_an(Nokogiri::XML::Element)
+    end
+    it 'should remove empty elements' do
+      # Given
+      input = '<p></p>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual.size).to eq 0
+    end
+    it 'should add the DOM position to each element' do
+      # Given
+      input = '<p>foo</p>
+               <p>bar</p>
+               <p>baz</p>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual[0][:weight][:position]).to eq 0
+      expect(actual[1][:weight][:position]).to eq 1
+      expect(actual[2][:weight][:position]).to eq 2
+    end
+  end
+  # The :html key of a record holds the outer HTML of the matched node only
+  describe 'extract_html' do
+    it 'should extract outer html' do
+      # Given
+      input = '<p>foo</p>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual[0][:html]).to eq '<p>foo</p>'
+    end
+    it 'should trim content' do
+      # Given
+      input = '<p>foo</p>
+               <blink>irrelevant</blink>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual[0][:html]).to eq '<p>foo</p>'
+    end
+  end
+  # The :text key of a record holds the inner text of the node, non-ASCII
+  # characters included
+  describe 'extract_text' do
+    it 'should extract inner text' do
+      # Given
+      input = '<p>foo</p>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual[0][:text]).to eq 'foo'
+    end
+    it 'should extract UTF8 correctly' do
+      # Given
+      input = '<p>UTF8‽✗✓</p>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual[0][:text]).to eq 'UTF8‽✗✓'
+    end
+  end
+  # The :tag_name key of a record holds the tag name in lowercase, whatever
+  # the case used in the input
+  describe 'extract_tag_name' do
+    it 'should extract the tag name' do
+      # Given
+      input = '<p>foo</p>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual[0][:tag_name]).to eq 'p'
+    end
+    it 'should always return lowercase' do
+      # Given
+      input = '<P>foo</P>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual[0][:tag_name]).to eq 'p'
+    end
+  end
+  # The :hierarchy key maps :lvl0, :lvl1 and :lvl2 to the text of the h1, h2
+  # and h3 found above the node (nil when there is none), whether or not the
+  # headings and the node share the same wrapper elements
+  describe 'extract_hierarchy' do
+    it 'should extract a simple hierarchy' do
+      # Given
+      input = '<h1>Foo</h1>
+               <p>First paragraph</p>
+               <h2>Bar</h2>
+               <p>Second paragraph</p>
+               <h3>Baz</h3>
+               <p>Third paragraph</p>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual[0][:hierarchy][:lvl0]).to eq 'Foo'
+      expect(actual[0][:hierarchy][:lvl1]).to eq nil
+      expect(actual[0][:hierarchy][:lvl2]).to eq nil
+      expect(actual[1][:hierarchy][:lvl0]).to eq 'Foo'
+      expect(actual[1][:hierarchy][:lvl1]).to eq 'Bar'
+      expect(actual[1][:hierarchy][:lvl2]).to eq nil
+      expect(actual[2][:hierarchy][:lvl0]).to eq 'Foo'
+      expect(actual[2][:hierarchy][:lvl1]).to eq 'Bar'
+      expect(actual[2][:hierarchy][:lvl2]).to eq 'Baz'
+    end
+    it 'should use inner text of headings' do
+      # Given
+      input = '<h1><a href="#">Foo</a><span></span></h1>
+               <p>First paragraph</p>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual[0][:hierarchy][:lvl0]).to eq 'Foo'
+      expect(actual[0][:hierarchy][:lvl1]).to eq nil
+      expect(actual[0][:hierarchy][:lvl2]).to eq nil
+    end
+    it 'should handle nodes not in any hierarchy' do
+      # Given
+      input = '<p>First paragraph</p>
+               <h1>Foo</h1>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual[0][:hierarchy][:lvl0]).to eq nil
+      expect(actual[0][:hierarchy][:lvl1]).to eq nil
+      expect(actual[0][:hierarchy][:lvl2]).to eq nil
+    end
+    it 'should handle any number of wrappers' do
+      # Given
+      input = '<header>
+                 <h1>Foo</h1>
+                 <p>First paragraph</p>
+               </header>
+               <div>
+                 <div>
+                   <div>
+                     <h2>Bar</h2>
+                     <p>Second paragraph</p>
+                     </div>
+                   </div>
+                 <div>
+                   <h3>Baz</h3>
+                   <p>Third paragraph</p>
+                 </div>
+               </div>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual[0][:hierarchy][:lvl0]).to eq 'Foo'
+      expect(actual[0][:hierarchy][:lvl1]).to eq nil
+      expect(actual[0][:hierarchy][:lvl2]).to eq nil
+      expect(actual[1][:hierarchy][:lvl0]).to eq 'Foo'
+      expect(actual[1][:hierarchy][:lvl1]).to eq 'Bar'
+      expect(actual[1][:hierarchy][:lvl2]).to eq nil
+      expect(actual[2][:hierarchy][:lvl0]).to eq 'Foo'
+      expect(actual[2][:hierarchy][:lvl1]).to eq 'Bar'
+      expect(actual[2][:hierarchy][:lvl2]).to eq 'Baz'
+    end
+  end
+  # The :anchor key holds the name (or, failing that, the id) found on the
+  # closest heading above the node that has one, or on an element inside that
+  # heading; it is nil when no heading provides an anchor
+  describe 'extract_anchor' do
+    it 'should get the anchor of parent' do
+      # Given
+      input = '<h1 name="anchor">Foo</h1>
+               <p>First paragraph</p>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual[0][:anchor]).to eq 'anchor'
+    end
+    it 'should get no anchor if none found' do
+      # Given
+      input = '<h1>Foo</h1>
+               <p>First paragraph</p>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual[0][:anchor]).to eq nil
+    end
+    it 'should use the id as anchor if no name set' do
+      # Given
+      input = '<h1 id="anchor">Foo</h1>
+               <p>First paragraph</p>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual[0][:anchor]).to eq 'anchor'
+    end
+    # NOTE(review): same input and expectation as 'should get no anchor if
+    # none found' above
+    it 'should be set to nil if no name nor id' do
+      # Given
+      input = '<h1>Foo</h1>
+               <p>First paragraph</p>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual[0][:anchor]).to eq nil
+    end
+    it 'should get the anchor of closest parent with an anchor' do
+      # Given
+      input = '<h1 name="anchor">Foo</h1>
+               <p>First paragraph</p>
+               <h2>Bar</h2>
+               <p>Second paragraph</p>
+               <h3 name="subanchor">Baz</h3>
+               <p>Third paragraph</p>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual[0][:anchor]).to eq 'anchor'
+      expect(actual[1][:anchor]).to eq 'anchor'
+      expect(actual[2][:anchor]).to eq 'subanchor'
+    end
+    it 'should get anchor even if heading not a direct parent' do
+      # Given
+      input = '<header>
+                 <h1 name="anchor">Foo</h1>
+                 <p>First paragraph</p>
+               </header>
+               <div>
+                 <div>
+                   <div>
+                     <h2>Bar</h2>
+                     <p>Second paragraph</p>
+                   </div>
+                 </div>
+                 <div>
+                   <h3 name="subanchor">Baz</h3>
+                   <p>Third paragraph</p>
+                 </div>
+               </div>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual[0][:anchor]).to eq 'anchor'
+      expect(actual[1][:anchor]).to eq 'anchor'
+      expect(actual[2][:anchor]).to eq 'subanchor'
+    end
+    it 'should get anchor if not directly on the header but inner element' do
+      # Given
+      input = '<h1><a name="anchor">Foo</a></h1>
+               <p>First paragraph</p>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual[0][:anchor]).to eq 'anchor'
+    end
+  end
+  # The :uuid key differs when the text, the tag attributes, the position in
+  # the page or the parent heading differ, and is identical for identical
+  # input
+  describe 'uuid' do
+    it 'should give different uuid if different content' do
+      # Given
+      input_a = '<p>foo</p>'
+      input_b = '<p>bar</p>'
+      # When
+      actual_a = HTMLHierarchyExtractor.new(input_a).extract[0]
+      actual_b = HTMLHierarchyExtractor.new(input_b).extract[0]
+      # Then
+      expect(actual_a[:uuid]).not_to eq(actual_b[:uuid])
+    end
+    it 'should give different uuid if different HTML tag' do
+      # Given
+      input_a = '<p>foo</p>'
+      input_b = '<p class="bar">foo</p>'
+      # When
+      actual_a = HTMLHierarchyExtractor.new(input_a).extract[0]
+      actual_b = HTMLHierarchyExtractor.new(input_b).extract[0]
+      # Then
+      expect(actual_a[:uuid]).not_to eq(actual_b[:uuid])
+    end
+    it 'should give different uuid if different position in page' do
+      # Given
+      input_a = '<p>foo</p><p>bar</p>'
+      input_b = '<p>foo</p><p>foo again</p><p>bar</p>'
+      # When
+      actual_a = HTMLHierarchyExtractor.new(input_a).extract[1]
+      actual_b = HTMLHierarchyExtractor.new(input_b).extract[2]
+      # Then
+      expect(actual_a[:uuid]).not_to eq(actual_b[:uuid])
+    end
+    it 'should give different uuid if different parent header' do
+      # Given
+      input_a = '<h1 name="foo">foo</h1><p>bar</p>'
+      input_b = '<h1 name="bar">bar</h1><p>bar</p>'
+      # When
+      actual_a = HTMLHierarchyExtractor.new(input_a).extract[0]
+      actual_b = HTMLHierarchyExtractor.new(input_b).extract[0]
+      # Then
+      expect(actual_a[:uuid]).not_to eq(actual_b[:uuid])
+    end
+    it 'should always give the same uuid for the same content' do
+      # Given
+      input_a = '<h1 name="foo">foo</h1><p>bar</p>'
+      input_b = '<h1 name="foo">foo</h1><p>bar</p>'
+      # When
+      actual_a = HTMLHierarchyExtractor.new(input_a).extract[0]
+      actual_b = HTMLHierarchyExtractor.new(input_b).extract[0]
+      # Then
+      expect(actual_a[:uuid]).to eq(actual_b[:uuid])
+    end
+  end
+  # The :weight[:heading] value is 100 for a node with no heading above it
+  # and goes from 90 under an h1 down to 40 under an h6, in steps of 10
+  describe 'heading_weight' do
+    it 'should have 100 if no heading' do
+      # Given
+      input = '<p>foo</p>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual[0][:weight][:heading]).to eq 100
+    end
+    it 'should have decreasing value under small headers' do
+      # Given
+      input = '<h1 name="one">bar</h1><p>foo</p>
+               <h2 name="two">bar</h2><p>foo</p>
+               <h3 name="three">bar</h3><p>foo</p>
+               <h4 name="four">bar</h4><p>foo</p>
+               <h5 name="five">bar</h5><p>foo</p>
+               <h6 name="six">bar</h6><p>foo</p>'
+      # When
+      actual = HTMLHierarchyExtractor.new(input).extract
+      # Then
+      expect(actual[0][:weight][:heading]).to eq 90
+      expect(actual[1][:weight][:heading]).to eq 80
+      expect(actual[2][:weight][:heading]).to eq 70
+      expect(actual[3][:weight][:heading]).to eq 60
+      expect(actual[4][:weight][:heading]).to eq 50
+      expect(actual[5][:weight][:heading]).to eq 40
+    end
+  end
+end