RubyGems - html-hierarchy-extractor - Versions diffs - 1.0.0 - Mend

html-hierarchy-extractor 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

checksums.yaml +7 -0
data/.coveralls.yml +1 -0
data/.document +5 -0
data/.rspec +2 -0
data/.rubocop.yml +26 -0
data/.travis.yml +12 -0
data/CONTRIBUTING.md +53 -0
data/Gemfile +16 -0
data/Guardfile +7 -0
data/LICENSE.txt +20 -0
data/README.md +17 -0
data/Rakefile +58 -0
data/VERSION +1 -0
data/html-hierarchy-extractor.gemspec +99 -0
data/lib/html-hierarchy-extractor.rb +144 -0
data/lib/version.rb +6 -0
data/scripts/bump_version +47 -0
data/scripts/check_flay +30 -0
data/scripts/check_flog +31 -0
data/scripts/coverage +3 -0
data/scripts/git_hooks/pre-commit +16 -0
data/scripts/git_hooks/pre-push +9 -0
data/scripts/lint +2 -0
data/scripts/release +16 -0
data/scripts/test +4 -0
data/scripts/test_ci +7 -0
data/scripts/watch +4 -0
data/spec/html_hierarchy_extractor_spec.rb +441 -0
data/spec/spec_helper.rb +14 -0
data/spec/spec_helper_simplecov.rb +9 -0
metadata +230 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 9c0a516a852828d433ba0495206acc9febbd1670
+  data.tar.gz: 444cedeb76c06fd048526cb02c7fcac294927540
+SHA512:
+  metadata.gz: 7e6505db7a21b42db30d4afffa496358642c1eb6332174f5ada9418f973056c0b0f9762b6458f68c02a1eb035700fe9746d6dbc92a613b4a5797a4b54512f2cc
+  data.tar.gz: bcba7859c0e37030d6a209bef9b3980f35ea9dc08283f6d86445400cb115aa92f11f6ef1331ed0b7d353b077bf9e29542e730bde118e7479f96a3dbe96164833

data/.coveralls.yml ADDED Viewed

	@@ -0,0 +1 @@
1	+ service_name: travis-ci

data/.document ADDED Viewed

@@ -0,0 +1,5 @@
+lib/**/*.rb
+bin/*
+-
+features/**/*.feature
+LICENSE.txt

data/.rspec ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ --color
2	+ --format progress

data/.rubocop.yml ADDED Viewed

@@ -0,0 +1,26 @@
+# Defaults:
+# https://github.com/bbatsov/rubocop/blob/master/config/default.yml
+Metrics/AbcSize:
+  Max: 100
+Metrics/ClassLength:
+  Max: 200
+Metrics/ModuleLength:
+  Max: 200
+Metrics/MethodLength:
+  Max: 50
+Metrics/CyclomaticComplexity:
+  Max: 10
+Metrics/PerceivedComplexity:
+  Max: 10
+Style/FileName:
+  Enabled: false
+Style/MultilineOperationIndentation:
+  Enabled: false

data/.travis.yml ADDED Viewed

@@ -0,0 +1,12 @@
+language: ruby
+cache: bundler
+before_script: bundle update
+script: ./scripts/test_ci
+rvm:
+ - 2.2
+ - 2.1
+ - 2.0
+notifications:
+  email:
+    on_success: never
+    on_failure: never

data/CONTRIBUTING.md ADDED Viewed

@@ -0,0 +1,53 @@
+Hi collaborator!
+If you have a fix or a new feature, please start by checking in the
+[issues](https://github.com/pixelastic/html-hierarchy-extractor/issues) if it is
+already referenced. If not, feel free to open one.
+We use [pull requests](https://github.com/pixelastic/html-hierarchy-extractor/pulls)
+for collaboration. The workflow is as follows:
+- Create a local branch, starting from `develop`
+- Submit the PR on `develop`
+- Wait for review
+- Do the changes requested (if any)
+- We may ask you to rebase the branch to latest `develop` if it gets out of sync
+- Get praise for your awesome contribution
+# Development workflow
+Run `bundle install` to get all dependencies up to date.
+You can then launch:
+- `./scripts/test` to launch tests
+- `./scripts/watch` to start a test watcher (for TDD) using Guard
+If you plan on submitting a PR, I suggest you install the git hooks. This will
+run pre-commit and pre-push checks. Those checks will also be run by TravisCI,
+but running them locally gives faster feedback.
+If you want to use a local version of the gem in your local project, I suggest
+updating your project `Gemfile` to point to the correct local directory
+```ruby
+gem "html-hierarchy-extractor", :path => "/path/to/local/gem/folder"
+```
+You should also run `rake gemspec` from the `html-hierarchy-extractor`
+repository the first time and if you added/deleted any file or dependency.
+# Tagging and releasing
+This part is for main contributors:
+```
+# Bump the version (in develop)
+./scripts/bump_version minor
+# Update master and release
+./scripts/release
+# Install the gem locally (optional)
+rake install
+```

data/Gemfile ADDED Viewed

@@ -0,0 +1,16 @@
+source 'http://rubygems.org'
+gem 'awesome_print', '~> 1.6'
+gem 'json', '~> 1.8'
+gem 'nokogiri', '~> 1.6'
+group :development do
+  gem 'coveralls', '~> 0.8'
+  gem 'flay', '~> 2.6'
+  gem 'flog', '~> 4.3'
+  gem 'guard-rspec', '~> 4.6'
+  gem 'jeweler', '~> 2.0'
+  gem 'rspec', '~> 3.0'
+  gem 'rubocop', '~> 0.31'
+  gem 'simplecov', '~> 0.10'
+end

data/Guardfile ADDED Viewed

@@ -0,0 +1,7 @@
+guard :rspec, cmd: 'bundle exec rspec --color --format documentation' do
+  watch(%r{^spec/.+_spec\.rb$})
+  watch(%r{^lib/(.+)\.rb$})     { |m| "spec/#{m[1]}_spec.rb" }
+  watch('spec/spec_helper.rb')  { 'spec' }
+end
+notification :off

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,20 @@
+Copyright (c) 2016 Pixelastic
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,17 @@
+# html-hierarchy-extractor
+This gem lets you extract the hierarchy of headings and content from any HTML
+page into an array of elements.
+It is intended to be used with Algolia to improve relevance of search results
+inside large HTML pages.
+Note: This repo is still a work in progress, and follows the RDD (Readme Driven
+Development) principle. All you see in the Readme might not be implemented yet.
+## How to use
+```ruby
+page = HTMLHierarchyExtractor.new(html) # Or filepath
+page.extract
+```

data/Rakefile ADDED Viewed

@@ -0,0 +1,58 @@
+# encoding: utf-8
+require 'rubygems'
+require 'bundler'
+begin
+  Bundler.setup(:default, :development)
+rescue Bundler::BundlerError => e
+  $stderr.puts e.message
+  $stderr.puts 'Run `bundle install` to install missing gems'
+  exit e.status_code
+end
+require 'rake'
+require 'jeweler'
+require_relative 'lib/version'
+Jeweler::Tasks.new do |gem|
+  # gem is a Gem::Specification...
+  # see http://guides.rubygems.org/specification-reference/ for more options
+  gem.name = 'html-hierarchy-extractor'
+  gem.version = HTMLHierarchyExtractorVersion.to_s
+  gem.homepage = 'http://github.com/pixelastic/html-hierarchy-extractor'
+  gem.license = 'MIT'
+  gem.summary = 'Extract HTML hierarchy (headings and content) into a' \
+                ' list of items'
+  gem.description = 'Take any arbitrary HTML as input and extract its' \
+                    ' hierarchy as a list of items, including parents and' \
+                    ' contents.' \
+                    'It is primarily intended to be used along with Algolia,' \
+                    ' to improve the relevance of searching into huge chunks' \
+                    ' of text'
+  gem.email = 'tim@pixelastic.com'
+  gem.authors = ['Tim Carry']
+  # dependencies defined in Gemfile
+end
+Jeweler::RubygemsDotOrgTasks.new
+require 'rake/testtask'
+Rake::TestTask.new(:test) do |test|
+  test.libs << 'lib' << 'test'
+  test.pattern = 'test/**/test_*.rb'
+  test.verbose = true
+end
+require 'rspec/core'
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new(:spec) do |spec|
+  spec.rspec_opts = '--color --format documentation'
+  spec.pattern = FileList['spec/**/*_spec.rb']
+end
+task test: :spec
+desc 'Code coverage detail'
+task :coverage do
+  ENV['COVERAGE'] = 'true'
+  Rake::Task['spec'].execute
+end
+task default: :test

data/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0.1.0

data/html-hierarchy-extractor.gemspec ADDED Viewed

@@ -0,0 +1,99 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE DIRECTLY
+# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
+# -*- encoding: utf-8 -*-
+# stub: html-hierarchy-extractor 1.0.0 ruby lib
+# Gem specification generated by jeweler from the Jeweler::Tasks block of the
+# Rakefile; regenerate with `rake gemspec` rather than editing by hand.
+Gem::Specification.new do |s|
+  s.name = "html-hierarchy-extractor"
+  s.version = "1.0.0"
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.require_paths = ["lib"]
+  s.authors = ["Tim Carry"]
+  s.date = "2016-07-20"
+  # NOTE(review): "contents.It is" lacks a space; the text comes from the
+  # concatenated string fragments of gem.description in the Rakefile
+  s.description = "Take any arbitrary HTML as input and extract its hierarchy as a list of items, including parents and contents.It is primarily intended to be used along with Algolia, to improve the relevance of searching into huge chunks of text"
+  s.email = "tim@pixelastic.com"
+  s.extra_rdoc_files = [
+    "LICENSE.txt",
+    "README.md"
+  ]
+  # Files packaged in the gem: library code plus configuration, scripts and
+  # specs
+  s.files = [
+    ".coveralls.yml",
+    ".document",
+    ".rspec",
+    ".rubocop.yml",
+    ".travis.yml",
+    "CONTRIBUTING.md",
+    "Gemfile",
+    "Guardfile",
+    "LICENSE.txt",
+    "README.md",
+    "Rakefile",
+    "VERSION",
+    "html-hierarchy-extractor.gemspec",
+    "lib/html-hierarchy-extractor.rb",
+    "lib/version.rb",
+    "scripts/bump_version",
+    "scripts/check_flay",
+    "scripts/check_flog",
+    "scripts/coverage",
+    "scripts/git_hooks/pre-commit",
+    "scripts/git_hooks/pre-push",
+    "scripts/lint",
+    "scripts/release",
+    "scripts/test",
+    "scripts/test_ci",
+    "scripts/watch",
+    "spec/html_hierarchy_extractor_spec.rb",
+    "spec/spec_helper.rb",
+    "spec/spec_helper_simplecov.rb"
+  ]
+  s.homepage = "http://github.com/pixelastic/html-hierarchy-extractor"
+  s.licenses = ["MIT"]
+  s.rubygems_version = "2.4.8"
+  s.summary = "Extract HTML hierarchy (headings and content) into a list of items"
+  # Dependencies mirror the Gemfile. Which API is used to declare them depends
+  # on what the running RubyGems supports.
+  if s.respond_to? :specification_version then
+    s.specification_version = 4
+    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
+      # RubyGems >= 1.2.0: runtime and development dependencies are declared
+      # separately
+      s.add_runtime_dependency(%q<awesome_print>, ["~> 1.6"])
+      s.add_runtime_dependency(%q<json>, ["~> 1.8"])
+      s.add_runtime_dependency(%q<nokogiri>, ["~> 1.6"])
+      s.add_development_dependency(%q<coveralls>, ["~> 0.8"])
+      s.add_development_dependency(%q<flay>, ["~> 2.6"])
+      s.add_development_dependency(%q<flog>, ["~> 4.3"])
+      s.add_development_dependency(%q<guard-rspec>, ["~> 4.6"])
+      s.add_development_dependency(%q<jeweler>, ["~> 2.0"])
+      s.add_development_dependency(%q<rspec>, ["~> 3.0"])
+      s.add_development_dependency(%q<rubocop>, ["~> 0.31"])
+      s.add_development_dependency(%q<simplecov>, ["~> 0.10"])
+    else
+      # Fallback: every gem is declared through add_dependency, without any
+      # runtime/development distinction
+      s.add_dependency(%q<awesome_print>, ["~> 1.6"])
+      s.add_dependency(%q<json>, ["~> 1.8"])
+      s.add_dependency(%q<nokogiri>, ["~> 1.6"])
+      s.add_dependency(%q<coveralls>, ["~> 0.8"])
+      s.add_dependency(%q<flay>, ["~> 2.6"])
+      s.add_dependency(%q<flog>, ["~> 4.3"])
+      s.add_dependency(%q<guard-rspec>, ["~> 4.6"])
+      s.add_dependency(%q<jeweler>, ["~> 2.0"])
+      s.add_dependency(%q<rspec>, ["~> 3.0"])
+      s.add_dependency(%q<rubocop>, ["~> 0.31"])
+      s.add_dependency(%q<simplecov>, ["~> 0.10"])
+    end
+  else
+    # Same fallback when the specification has no specification_version
+    s.add_dependency(%q<awesome_print>, ["~> 1.6"])
+    s.add_dependency(%q<json>, ["~> 1.8"])
+    s.add_dependency(%q<nokogiri>, ["~> 1.6"])
+    s.add_dependency(%q<coveralls>, ["~> 0.8"])
+    s.add_dependency(%q<flay>, ["~> 2.6"])
+    s.add_dependency(%q<flog>, ["~> 4.3"])
+    s.add_dependency(%q<guard-rspec>, ["~> 4.6"])
+    s.add_dependency(%q<jeweler>, ["~> 2.0"])
+    s.add_dependency(%q<rspec>, ["~> 3.0"])
+    s.add_dependency(%q<rubocop>, ["~> 0.31"])
+    s.add_dependency(%q<simplecov>, ["~> 0.10"])
+  end
+end

data/lib/html-hierarchy-extractor.rb ADDED Viewed

@@ -0,0 +1,144 @@
+require 'nokogiri'
+require 'digest/md5'
+# Extract content from an HTML page in the form of items with associated
+# hierarchy data
+class HTMLHierarchyExtractor
+  def initialize(input, options: {})
+    @dom = Nokogiri::HTML(input)
+    default_options = {
+      css_selector: 'p'
+    }
+    @options = default_options.merge(options)
+  end
+  # Returns the outer HTML of a given node
+  #
+  # eg.
+  # <p>foo</p> => <p>foo</p>
+  def extract_html(node)
+    node.to_s.strip
+  end
+  # Returns the inner HTML of a given node
+  #
+  # eg.
+  # <p>foo</p> => foo
+  def extract_text(node)
+    node.content
+  end
+  # Returns the tag name of a given node
+  #
+  # eg
+  # <p>foo</p> => p
+  def extract_tag_name(node)
+    node.name.downcase
+  end
+  # Returns the anchor to the node
+  #
+  # eg.
+  # <h1 name="anchor">Foo</h1> => anchor
+  # <h1 id="anchor">Foo</h1> => anchor
+  # <h1><a name="anchor">Foo</a></h1> => anchor
+  def extract_anchor(node)
+    anchor = node.attr('name') || node.attr('id') || nil
+    return anchor unless anchor.nil?
+    # No anchor found directly in the header, search on children
+    subelement = node.css('[name],[id]')
+    return extract_anchor(subelement[0]) unless subelement.empty?
+    nil
+  end
+  ##
+  # Generate a unique identifier for the item
+  def uuid(item)
+    # We first get all the keys of the object, sorted alphabetically...
+    ordered_keys = item.keys.sort
+    # ...then we build a huge array of "key=value" pairs...
+    ordered_array = ordered_keys.map do |key|
+      value = item[key]
+      # We apply the method recursively on other hashes
+      value = uuid(value) if value.is_a?(Hash)
+      "#{key}=#{value}"
+    end
+    # ...then we build a unique md5 hash of it
+    Digest::MD5.hexdigest(ordered_array.join(','))
+  end
+  ##
+  # Get a relative numeric value of the importance of the heading
+  # 100 for top level, then -10 per heading
+  def heading_weight(heading_level)
+    weight = 100
+    return weight if heading_level.nil?
+    weight - ((heading_level + 1) * 10)
+  end
+  def extract
+    heading_selector = 'h1,h2,h3,h4,h5,h6'
+    # We select all nodes that match either the headings or the elements to
+    # extract. This will allow us to loop over it in order it appears in the DOM
+    all_selector = "#{heading_selector},#{@options[:css_selector]}"
+    items = []
+    current_hierarchy = {
+      lvl0: nil,
+      lvl1: nil,
+      lvl2: nil,
+      lvl3: nil,
+      lvl4: nil,
+      lvl5: nil
+    }
+    current_position = 0 # Position of the DOM node in the tree
+    current_lvl = nil # Current closest hierarchy level
+    current_anchor = nil # Current closest anchor
+    @dom.css(all_selector).each do |node|
+      # If it's a heading, we update our current hierarchy
+      if node.matches?(heading_selector)
+        # Which level heading is it?
+        current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
+        # Update this level, and set all the following ones to nil
+        current_hierarchy["lvl#{current_lvl}".to_sym] = extract_text(node)
+        (current_lvl + 1..6).each do |lvl|
+          current_hierarchy["lvl#{lvl}".to_sym] = nil
+        end
+        # Update the anchor, if the new heading has one
+        new_anchor = extract_anchor(node)
+        current_anchor = new_anchor if new_anchor
+      end
+      # Stop if node is not to be extracted
+      next unless node.matches?(@options[:css_selector])
+      # Stop if node is empty
+      text = extract_text(node)
+      next if text.empty?
+      item = {
+        html: extract_html(node),
+        text: text,
+        tag_name: extract_tag_name(node),
+        hierarchy: current_hierarchy.clone,
+        anchor: current_anchor,
+        node: node,
+        weight: {
+          position: current_position,
+          heading: heading_weight(current_lvl)
+        }
+      }
+      item[:uuid] = uuid(item)
+      items << item
+      current_position += 1
+    end
+    items
+  end
+end