RubyGems - html-hierarchy-extractor - Versions diffs - 1.0.0 - Mend

html-hierarchy-extractor 1.0.0

Files changed (31) hide show

checksums.yaml +7 -0
data/.coveralls.yml +1 -0
data/.document +5 -0
data/.rspec +2 -0
data/.rubocop.yml +26 -0
data/.travis.yml +12 -0
data/CONTRIBUTING.md +53 -0
data/Gemfile +16 -0
data/Guardfile +7 -0
data/LICENSE.txt +20 -0
data/README.md +17 -0
data/Rakefile +58 -0
data/VERSION +1 -0
data/html-hierarchy-extractor.gemspec +99 -0
data/lib/html-hierarchy-extractor.rb +144 -0
data/lib/version.rb +6 -0
data/scripts/bump_version +47 -0
data/scripts/check_flay +30 -0
data/scripts/check_flog +31 -0
data/scripts/coverage +3 -0
data/scripts/git_hooks/pre-commit +16 -0
data/scripts/git_hooks/pre-push +9 -0
data/scripts/lint +2 -0
data/scripts/release +16 -0
data/scripts/test +4 -0
data/scripts/test_ci +7 -0
data/scripts/watch +4 -0
data/spec/html_hierarchy_extractor_spec.rb +441 -0
data/spec/spec_helper.rb +14 -0
data/spec/spec_helper_simplecov.rb +9 -0
metadata +230 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 9c0a516a852828d433ba0495206acc9febbd1670
+  data.tar.gz: 444cedeb76c06fd048526cb02c7fcac294927540
+SHA512:
+  metadata.gz: 7e6505db7a21b42db30d4afffa496358642c1eb6332174f5ada9418f973056c0b0f9762b6458f68c02a1eb035700fe9746d6dbc92a613b4a5797a4b54512f2cc
+  data.tar.gz: bcba7859c0e37030d6a209bef9b3980f35ea9dc08283f6d86445400cb115aa92f11f6ef1331ed0b7d353b077bf9e29542e730bde118e7479f96a3dbe96164833

data/.coveralls.yml ADDED Viewed

	@@ -0,0 +1 @@
1	+ service_name: travis-ci

data/.document ADDED Viewed

@@ -0,0 +1,5 @@
+lib/**/*.rb
+bin/*
+-
+features/**/*.feature
+LICENSE.txt

data/.rspec ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ --color
2	+ --format progress

data/.rubocop.yml ADDED Viewed

@@ -0,0 +1,26 @@
+# Defaults:
+# https://github.com/bbatsov/rubocop/blob/master/config/default.yml
+Metrics/AbcSize:
+  Max: 100
+Metrics/ClassLength:
+  Max: 200
+Metrics/ModuleLength:
+  Max: 200
+Metrics/MethodLength:
+  Max: 50
+Metrics/CyclomaticComplexity:
+  Max: 10
+Metrics/PerceivedComplexity:
+  Max: 10
+Style/FileName:
+  Enabled: false
+Style/MultilineOperationIndentation:
+  Enabled: false

data/.travis.yml ADDED Viewed

@@ -0,0 +1,12 @@
+language: ruby
+cache: bundler
+before_script: bundle update
+script: ./scripts/test_ci
+rvm:
+ - 2.2
+ - 2.1
+ - 2.0
+notifications:
+  email:
+    on_success: never
+    on_failure: never

data/CONTRIBUTING.md ADDED Viewed

@@ -0,0 +1,53 @@
+Hi collaborator!
+If you have a fix or a new feature, please start by checking in the
+[issues](https://github.com/pixelastic/html-hierarchy-extractor/issues) if it is
+already referenced. If not, feel free to open one.
+We use [pull requests](https://github.com/pixelastic/html-hierarchy-extractor/pulls)
+for collaboration. The workflow is as follows:
+- Create a local branch, starting from `develop`
+- Submit the PR on `develop`
+- Wait for review
+- Do the changes requested (if any)
+- We may ask you to rebase the branch to latest `develop` if it gets out of sync
+- Get praise for your awesome contribution
+# Development workflow
+Run `bundle install` to get all dependencies up to date.
+You can then launch:
+- `./scripts/test` to launch tests
+- `./scripts/watch` to start a test watcher (for TDD) using Guard
+If you plan on submitting a PR, I suggest you install the git hooks. This will
+run pre-commit and pre-push checks. Those checks will also be run by TravisCI,
+but running them locally gives faster feedback.
+If you want to use a local version of the gem in your local project, I suggest
+updating your project `Gemfile` to point to the correct local directory
+```ruby
+gem "html-hierarchy-extractor", :path => "/path/to/local/gem/folder"
+```
+You should also run `rake gemspec` from the `html-hierarchy-extractor`
+repository the first time and if you added/deleted any file or dependency.
+# Tagging and releasing
+This part is for main contributors:
+```
+# Bump the version (in develop)
+./scripts/bump_version minor
+# Update master and release
+./scripts/release
+# Install the gem locally (optional)
+rake install
+```

data/Gemfile ADDED Viewed

@@ -0,0 +1,16 @@
+source 'http://rubygems.org'
+gem 'awesome_print', '~> 1.6'
+gem 'json', '~> 1.8'
+gem 'nokogiri', '~> 1.6'
+group :development do
+  gem 'coveralls', '~> 0.8'
+  gem 'flay', '~> 2.6'
+  gem 'flog', '~> 4.3'
+  gem 'guard-rspec', '~> 4.6'
+  gem 'jeweler', '~> 2.0'
+  gem 'rspec', '~> 3.0'
+  gem 'rubocop', '~> 0.31'
+  gem 'simplecov', '~> 0.10'
+end

data/Guardfile ADDED Viewed

@@ -0,0 +1,7 @@
+guard :rspec, cmd: 'bundle exec rspec --color --format documentation' do
+  watch(%r{^spec/.+_spec\.rb$})
+  watch(%r{^lib/(.+)\.rb$})     { |m| "spec/#{m[1]}_spec.rb" }
+  watch('spec/spec_helper.rb')  { 'spec' }
+end
+notification :off

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,20 @@
+Copyright (c) 2016 Pixelastic
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,17 @@
+# html-hierarchy-extractor
+This gem lets you extract the hierarchy of headings and content from any HTML
+page into an array of elements.
+It is intended to be used with Algolia to improve relevance of search results
+inside large HTML pages.
+Note: This repo is still a work in progress, and follows the RDD (Readme Driven
+Development) principle. All you see in the Readme might not be implemented yet.
+## How to use
+```ruby
+page = HTMLHierarchyExtractor.new(html) # Or filepath
+page.extract
+```

data/Rakefile ADDED Viewed

@@ -0,0 +1,58 @@
+# encoding: utf-8
+require 'rubygems'
+require 'bundler'
+begin
+  Bundler.setup(:default, :development)
+rescue Bundler::BundlerError => e
+  $stderr.puts e.message
+  $stderr.puts 'Run `bundle install` to install missing gems'
+  exit e.status_code
+end
+require 'rake'
+require 'jeweler'
+require_relative 'lib/version'
+Jeweler::Tasks.new do |gem|
+  # gem is a Gem::Specification...
+  # see http://guides.rubygems.org/specification-reference/ for more options
+  gem.name = 'html-hierarchy-extractor'
+  gem.version = HTMLHierarchyExtractorVersion.to_s
+  gem.homepage = 'http://github.com/pixelastic/html-hierarchy-extractor'
+  gem.license = 'MIT'
+  gem.summary = 'Extract HTML hierarchy (headings and content) into a' \
+                ' list of items'
+  gem.description = 'Take any arbitrary HTML as input and extract its' \
+                    ' hierarchy as a list of items, including parents and' \
+                    ' contents.' \
+                    'It is primarily intended to be used along with Algolia,' \
+                    ' to improve the relevance of searching into huge chunks' \
+                    ' of text'
+  gem.email = 'tim@pixelastic.com'
+  gem.authors = ['Tim Carry']
+  # dependencies defined in Gemfile
+end
+Jeweler::RubygemsDotOrgTasks.new
+require 'rake/testtask'
+Rake::TestTask.new(:test) do |test|
+  test.libs << 'lib' << 'test'
+  test.pattern = 'test/**/test_*.rb'
+  test.verbose = true
+end
+require 'rspec/core'
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new(:spec) do |spec|
+  spec.rspec_opts = '--color --format documentation'
+  spec.pattern = FileList['spec/**/*_spec.rb']
+end
+task test: :spec
+desc 'Code coverage detail'
+task :coverage do
+  ENV['COVERAGE'] = 'true'
+  Rake::Task['spec'].execute
+end
+task default: :test

data/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0.1.0

data/html-hierarchy-extractor.gemspec ADDED Viewed

@@ -0,0 +1,99 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE DIRECTLY
+# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
+# -*- encoding: utf-8 -*-
+# stub: html-hierarchy-extractor 1.0.0 ruby lib
+Gem::Specification.new do |s|
+  s.name = "html-hierarchy-extractor"
+  s.version = "1.0.0"
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.require_paths = ["lib"]
+  s.authors = ["Tim Carry"]
+  s.date = "2016-07-20"
+  s.description = "Take any arbitrary HTML as input and extract its hierarchy as a list of items, including parents and contents.It is primarily intended to be used along with Algolia, to improve the relevance of searching into huge chunks of text"
+  s.email = "tim@pixelastic.com"
+  s.extra_rdoc_files = [
+    "LICENSE.txt",
+    "README.md"
+  ]
+  s.files = [
+    ".coveralls.yml",
+    ".document",
+    ".rspec",
+    ".rubocop.yml",
+    ".travis.yml",
+    "CONTRIBUTING.md",
+    "Gemfile",
+    "Guardfile",
+    "LICENSE.txt",
+    "README.md",
+    "Rakefile",
+    "VERSION",
+    "html-hierarchy-extractor.gemspec",
+    "lib/html-hierarchy-extractor.rb",
+    "lib/version.rb",
+    "scripts/bump_version",
+    "scripts/check_flay",
+    "scripts/check_flog",
+    "scripts/coverage",
+    "scripts/git_hooks/pre-commit",
+    "scripts/git_hooks/pre-push",
+    "scripts/lint",
+    "scripts/release",
+    "scripts/test",
+    "scripts/test_ci",
+    "scripts/watch",
+    "spec/html_hierarchy_extractor_spec.rb",
+    "spec/spec_helper.rb",
+    "spec/spec_helper_simplecov.rb"
+  ]
+  s.homepage = "http://github.com/pixelastic/html-hierarchy-extractor"
+  s.licenses = ["MIT"]
+  s.rubygems_version = "2.4.8"
+  s.summary = "Extract HTML hierarchy (headings and content) into a list of items"
+  if s.respond_to? :specification_version then
+    s.specification_version = 4
+    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
+      s.add_runtime_dependency(%q<awesome_print>, ["~> 1.6"])
+      s.add_runtime_dependency(%q<json>, ["~> 1.8"])
+      s.add_runtime_dependency(%q<nokogiri>, ["~> 1.6"])
+      s.add_development_dependency(%q<coveralls>, ["~> 0.8"])
+      s.add_development_dependency(%q<flay>, ["~> 2.6"])
+      s.add_development_dependency(%q<flog>, ["~> 4.3"])
+      s.add_development_dependency(%q<guard-rspec>, ["~> 4.6"])
+      s.add_development_dependency(%q<jeweler>, ["~> 2.0"])
+      s.add_development_dependency(%q<rspec>, ["~> 3.0"])
+      s.add_development_dependency(%q<rubocop>, ["~> 0.31"])
+      s.add_development_dependency(%q<simplecov>, ["~> 0.10"])
+    else
+      s.add_dependency(%q<awesome_print>, ["~> 1.6"])
+      s.add_dependency(%q<json>, ["~> 1.8"])
+      s.add_dependency(%q<nokogiri>, ["~> 1.6"])
+      s.add_dependency(%q<coveralls>, ["~> 0.8"])
+      s.add_dependency(%q<flay>, ["~> 2.6"])
+      s.add_dependency(%q<flog>, ["~> 4.3"])
+      s.add_dependency(%q<guard-rspec>, ["~> 4.6"])
+      s.add_dependency(%q<jeweler>, ["~> 2.0"])
+      s.add_dependency(%q<rspec>, ["~> 3.0"])
+      s.add_dependency(%q<rubocop>, ["~> 0.31"])
+      s.add_dependency(%q<simplecov>, ["~> 0.10"])
+    end
+  else
+    s.add_dependency(%q<awesome_print>, ["~> 1.6"])
+    s.add_dependency(%q<json>, ["~> 1.8"])
+    s.add_dependency(%q<nokogiri>, ["~> 1.6"])
+    s.add_dependency(%q<coveralls>, ["~> 0.8"])
+    s.add_dependency(%q<flay>, ["~> 2.6"])
+    s.add_dependency(%q<flog>, ["~> 4.3"])
+    s.add_dependency(%q<guard-rspec>, ["~> 4.6"])
+    s.add_dependency(%q<jeweler>, ["~> 2.0"])
+    s.add_dependency(%q<rspec>, ["~> 3.0"])
+    s.add_dependency(%q<rubocop>, ["~> 0.31"])
+    s.add_dependency(%q<simplecov>, ["~> 0.10"])
+  end
+end

data/lib/html-hierarchy-extractor.rb ADDED Viewed

@@ -0,0 +1,144 @@
+require 'nokogiri'
+require 'digest/md5'
# Extract content from an HTML page in the form of items with associated
# hierarchy data.
#
# Each extracted item is a Hash describing one matching node (by default every
# <p>) together with the headings (h1..h6) that precede it in document order.
class HTMLHierarchyExtractor
  # CSS selector matching every heading that contributes to the hierarchy
  HEADING_SELECTOR = 'h1,h2,h3,h4,h5,h6'.freeze

  # Hierarchy keys, ordered from h1 (:lvl0) down to h6 (:lvl5)
  HIERARCHY_LEVELS = [:lvl0, :lvl1, :lvl2, :lvl3, :lvl4, :lvl5].freeze

  # input   - HTML handed as-is to Nokogiri::HTML
  # options - Hash of settings, merged over the defaults:
  #           :css_selector - selector of the nodes to extract (default: 'p')
  def initialize(input, options: {})
    @dom = Nokogiri::HTML(input)
    @options = { css_selector: 'p' }.merge(options)
  end

  # Returns the outer HTML of a given node, without surrounding whitespace
  #
  # eg.
  # <p>foo</p> => <p>foo</p>
  def extract_html(node)
    node.to_s.strip
  end

  # Returns the text content of a given node (markup removed)
  #
  # eg.
  # <p>foo</p> => foo
  def extract_text(node)
    node.content
  end

  # Returns the lowercased tag name of a given node
  #
  # eg
  # <p>foo</p> => p
  def extract_tag_name(node)
    node.name.downcase
  end

  # Returns the anchor to the node, or nil when there is none. The `name`
  # attribute takes precedence over `id`, and the node itself takes precedence
  # over its descendants.
  #
  # eg.
  # <h1 name="anchor">Foo</h1> => anchor
  # <h1 id="anchor">Foo</h1> => anchor
  # <h1><a name="anchor">Foo</a></h1> => anchor
  def extract_anchor(node)
    anchor = node.attr('name') || node.attr('id')
    return anchor if anchor

    # No anchor found directly on the node, search on its descendants
    candidates = node.css('[name],[id]')
    candidates.empty? ? nil : extract_anchor(candidates[0])
  end

  ##
  # Generate a unique identifier for the item
  #
  # The identifier is the MD5 hex digest of the item's "key=value" pairs,
  # joined with commas and ordered by key so that insertion order is
  # irrelevant. Nested hashes are replaced by their own identifier.
  def uuid(item)
    pairs = item.keys.sort.map do |key|
      value = item[key]
      value = uuid(value) if value.is_a?(Hash)
      "#{key}=#{value}"
    end
    Digest::MD5.hexdigest(pairs.join(','))
  end

  ##
  # Get a relative numeric value of the importance of the heading
  #
  # heading_level is the zero-based level (0 for h1 ... 5 for h6), or nil when
  # no heading has been met yet. Returns 100 for nil, then 90 for h1, 80 for
  # h2 and so on, 10 less per level.
  def heading_weight(heading_level)
    return 100 if heading_level.nil?
    100 - ((heading_level + 1) * 10)
  end

  # Walks the document and returns an Array of items, one per non-empty node
  # matching the :css_selector option. Each item is a Hash with the keys
  # :html, :text, :tag_name, :hierarchy, :anchor, :node, :weight and :uuid.
  def extract
    # We select all nodes that match either the headings or the elements to
    # extract. This will allow us to loop over it in order it appears in the DOM
    all_selector = "#{HEADING_SELECTOR},#{@options[:css_selector]}"

    items = []
    current_hierarchy = Hash[HIERARCHY_LEVELS.map { |key| [key, nil] }]
    current_position = 0 # Position of the DOM node in the tree
    current_lvl = nil # Current closest hierarchy level
    current_anchor = nil # Current closest anchor

    @dom.css(all_selector).each do |node|
      # If it's a heading, we update our current hierarchy
      if node.matches?(HEADING_SELECTOR)
        # Which level heading is it? (h1 => 0 ... h6 => 5)
        current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
        current_hierarchy[HIERARCHY_LEVELS[current_lvl]] = extract_text(node)
        # Reset only the deeper levels that actually exist. Going past lvl5
        # would add a stray :lvl6 key to the hierarchy of every later item.
        HIERARCHY_LEVELS.drop(current_lvl + 1).each do |key|
          current_hierarchy[key] = nil
        end
        # Update the anchor, if the new heading has one
        new_anchor = extract_anchor(node)
        current_anchor = new_anchor if new_anchor
      end

      # Stop if node is not to be extracted
      next unless node.matches?(@options[:css_selector])

      # Stop if node is empty
      text = extract_text(node)
      next if text.empty?

      item = {
        html: extract_html(node),
        text: text,
        tag_name: extract_tag_name(node),
        # Snapshot: the running hierarchy keeps being mutated afterwards
        hierarchy: current_hierarchy.clone,
        anchor: current_anchor,
        node: node,
        weight: {
          position: current_position,
          heading: heading_weight(current_lvl)
        }
      }
      item[:uuid] = uuid(item)
      items << item

      current_position += 1
    end

    items
  end
end