html-hierarchy-extractor 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 9c0a516a852828d433ba0495206acc9febbd1670
4
+ data.tar.gz: 444cedeb76c06fd048526cb02c7fcac294927540
5
+ SHA512:
6
+ metadata.gz: 7e6505db7a21b42db30d4afffa496358642c1eb6332174f5ada9418f973056c0b0f9762b6458f68c02a1eb035700fe9746d6dbc92a613b4a5797a4b54512f2cc
7
+ data.tar.gz: bcba7859c0e37030d6a209bef9b3980f35ea9dc08283f6d86445400cb115aa92f11f6ef1331ed0b7d353b077bf9e29542e730bde118e7479f96a3dbe96164833
data/.coveralls.yml ADDED
@@ -0,0 +1 @@
1
+ service_name: travis-ci
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/.rubocop.yml ADDED
@@ -0,0 +1,26 @@
1
+ # Defaults:
2
+ # https://github.com/bbatsov/rubocop/blob/master/config/default.yml
3
+ Metrics/AbcSize:
4
+ Max: 100
5
+
6
+ Metrics/ClassLength:
7
+ Max: 200
8
+
9
+ Metrics/ModuleLength:
10
+ Max: 200
11
+
12
+ Metrics/MethodLength:
13
+ Max: 50
14
+
15
+ Metrics/CyclomaticComplexity:
16
+ Max: 10
17
+
18
+ Metrics/PerceivedComplexity:
19
+ Max: 10
20
+
21
+ Style/FileName:
22
+ Enabled: false
23
+
24
+ Style/MultilineOperationIndentation:
25
+ Enabled: false
26
+
data/.travis.yml ADDED
@@ -0,0 +1,12 @@
1
+ language: ruby
2
+ cache: bundler
3
+ before_script: bundle update
4
+ script: ./scripts/test_ci
5
+ rvm:
6
+ - 2.2
7
+ - 2.1
8
+ - 2.0
9
+ notifications:
10
+ email:
11
+ on_success: never
12
+ on_failure: never
data/CONTRIBUTING.md ADDED
@@ -0,0 +1,53 @@
1
+ Hi collaborator!
2
+
3
+ If you have a fix or a new feature, please start by checking in the
4
+ [issues](https://github.com/pixelastic/html-hierarchy-extractor/issues) if it is
5
+ already referenced. If not, feel free to open one.
6
+
7
+ We use [pull requests](https://github.com/pixelastic/html-hierarchy-extractor/pulls)
8
+ for collaboration. The workflow is as follows:
9
+
10
+ - Create a local branch, starting from `develop`
11
+ - Submit the PR on `develop`
12
+ - Wait for review
13
+ - Do the changes requested (if any)
14
+ - We may ask you to rebase the branch to latest `develop` if it gets out of sync
15
+ - Get praise for your awesome contribution
16
+
17
+ # Development workflow
18
+
19
+ Run `bundle install` to get all dependencies up to date.
20
+
21
+ You can then launch:
22
+
23
+ - `./scripts/test` to launch tests
24
+ - `./scripts/watch` to start a test watcher (for TDD) using Guard
25
+
26
+ If you plan on submitting a PR, I suggest you install the git hooks. This will
27
+ run pre-commit and pre-push checks. Those checks will also be run by TravisCI,
28
+ but running them locally gives faster feedback.
29
+
30
+ If you want to use a local version of the gem in your own project, I suggest
31
+ updating your project `Gemfile` to point to the correct local directory:
32
+
33
+ ```ruby
34
+ gem "html-hierarchy-extractor", :path => "/path/to/local/gem/folder"
35
+ ```
36
+
37
+ You should also run `rake gemspec` from the `html-hierarchy-extractor`
38
+ repository the first time and if you added/deleted any file or dependency.
39
+
40
+ # Tagging and releasing
41
+
42
+ This part is for main contributors:
43
+
44
+ ```
45
+ # Bump the version (in develop)
46
+ ./scripts/bump_version minor
47
+
48
+ # Update master and release
49
+ ./scripts/release
50
+
51
+ # Install the gem locally (optional)
52
+ rake install
53
+ ```
data/Gemfile ADDED
@@ -0,0 +1,16 @@
1
# Dependencies of the html-hierarchy-extractor gem.
#
# Gems are fetched over HTTPS so that downloads cannot be read or tampered
# with in transit.
source 'https://rubygems.org'

# Runtime dependencies
gem 'awesome_print', '~> 1.6'
gem 'json', '~> 1.8'
gem 'nokogiri', '~> 1.6'

# Tooling only needed when working on the gem itself (tests, linting,
# coverage, packaging)
group :development do
  gem 'coveralls', '~> 0.8'
  gem 'flay', '~> 2.6'
  gem 'flog', '~> 4.3'
  gem 'guard-rspec', '~> 4.6'
  gem 'jeweler', '~> 2.0'
  gem 'rspec', '~> 3.0'
  gem 'rubocop', '~> 0.31'
  gem 'simplecov', '~> 0.10'
end
data/Guardfile ADDED
@@ -0,0 +1,7 @@
1
# Guard configuration: re-run the relevant specs whenever a file changes.
rspec_command = 'bundle exec rspec --color --format documentation'

guard :rspec, cmd: rspec_command do
  # A spec file changed: run that spec
  watch(%r{^spec/.+_spec\.rb$})
  # A lib file changed: run the spec bearing the same name
  watch(%r{^lib/(.+)\.rb$}) { |match| "spec/#{match[1]}_spec.rb" }
  # The shared spec helper changed: run the whole spec directory
  watch('spec/spec_helper.rb') { 'spec' }
end

# Do not send notifications
notification :off
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2016 Pixelastic
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,17 @@
1
+ # html-hierarchy-extractor
2
+
3
+ This gem lets you extract the hierarchy of headings and content from any HTML
4
+ page into an array of elements.
5
+
6
+ It is intended to be used with Algolia to improve relevance of search results
7
+ inside large HTML pages.
8
+
9
+ Note: This repo is still a work in progress, and follows the RDD (Readme Driven
10
+ Development) principle. All you see in the Readme might not be implemented yet.
11
+
12
+ ## How to use
13
+
14
+ ```ruby
15
+ page = HTMLHierarchyExtractor.new(html) # Or filepath
16
+ page.extract
17
+ ```
data/Rakefile ADDED
@@ -0,0 +1,58 @@
1
# encoding: utf-8

# Packaging, test and coverage tasks for the html-hierarchy-extractor gem.

require 'rubygems'
require 'bundler'
begin
  # Make the gems of the :default and :development groups loadable
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts 'Run `bundle install` to install missing gems'
  exit e.status_code
end
require 'rake'

require 'jeweler'
# NOTE(review): lib/version is expected to define
# HTMLHierarchyExtractorVersion; it is not visible from this file.
require_relative 'lib/version'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification...
  # see http://guides.rubygems.org/specification-reference/ for more options
  gem.name = 'html-hierarchy-extractor'
  gem.version = HTMLHierarchyExtractorVersion.to_s
  gem.homepage = 'http://github.com/pixelastic/html-hierarchy-extractor'
  gem.license = 'MIT'
  gem.summary = 'Extract HTML hierarchy (headings and content) into a' \
                ' list of items'
  # Every continuation chunk starts with a space so that the adjacent string
  # literals are joined into properly separated sentences.
  gem.description = 'Take any arbitrary HTML as input and extract its' \
                    ' hierarchy as a list of items, including parents and' \
                    ' contents.' \
                    ' It is primarily intended to be used along with Algolia,' \
                    ' to improve the relevance of searching into huge chunks' \
                    ' of text'
  gem.email = 'tim@pixelastic.com'
  gem.authors = ['Tim Carry']
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

require 'rspec/core'
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new(:spec) do |spec|
  spec.rspec_opts = '--color --format documentation'
  spec.pattern = FileList['spec/**/*_spec.rb']
end
# Running `rake test` also runs the RSpec suite
task test: :spec

desc 'Code coverage detail'
task :coverage do
  # NOTE(review): COVERAGE is presumably read by the spec helpers to enable
  # coverage reporting; confirm in spec/spec_helper.rb
  ENV['COVERAGE'] = 'true'
  Rake::Task['spec'].execute
end

task default: :test
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,99 @@
1
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
# -*- encoding: utf-8 -*-
# stub: html-hierarchy-extractor 1.0.0 ruby lib

Gem::Specification.new do |s|
  s.name = "html-hierarchy-extractor"
  s.version = "1.0.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.require_paths = ["lib"]
  s.authors = ["Tim Carry"]
  s.date = "2016-07-20"
  s.description = "Take any arbitrary HTML as input and extract its hierarchy as a list of items, including parents and contents. It is primarily intended to be used along with Algolia, to improve the relevance of searching into huge chunks of text"
  s.email = "tim@pixelastic.com"
  s.extra_rdoc_files = [
    "LICENSE.txt",
    "README.md"
  ]
  s.files = [
    ".coveralls.yml",
    ".document",
    ".rspec",
    ".rubocop.yml",
    ".travis.yml",
    "CONTRIBUTING.md",
    "Gemfile",
    "Guardfile",
    "LICENSE.txt",
    "README.md",
    "Rakefile",
    "VERSION",
    "html-hierarchy-extractor.gemspec",
    "lib/html-hierarchy-extractor.rb",
    "lib/version.rb",
    "scripts/bump_version",
    "scripts/check_flay",
    "scripts/check_flog",
    "scripts/coverage",
    "scripts/git_hooks/pre-commit",
    "scripts/git_hooks/pre-push",
    "scripts/lint",
    "scripts/release",
    "scripts/test",
    "scripts/test_ci",
    "scripts/watch",
    "spec/html_hierarchy_extractor_spec.rb",
    "spec/spec_helper.rb",
    "spec/spec_helper_simplecov.rb"
  ]
  s.homepage = "http://github.com/pixelastic/html-hierarchy-extractor"
  s.licenses = ["MIT"]
  s.rubygems_version = "2.4.8"
  s.summary = "Extract HTML hierarchy (headings and content) into a list of items"

  # Declare runtime and development dependencies separately when the
  # installed RubyGems supports it, and as plain dependencies otherwise.
  if s.respond_to? :specification_version then
    s.specification_version = 4

    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      s.add_runtime_dependency(%q<awesome_print>, ["~> 1.6"])
      s.add_runtime_dependency(%q<json>, ["~> 1.8"])
      s.add_runtime_dependency(%q<nokogiri>, ["~> 1.6"])
      s.add_development_dependency(%q<coveralls>, ["~> 0.8"])
      s.add_development_dependency(%q<flay>, ["~> 2.6"])
      s.add_development_dependency(%q<flog>, ["~> 4.3"])
      s.add_development_dependency(%q<guard-rspec>, ["~> 4.6"])
      s.add_development_dependency(%q<jeweler>, ["~> 2.0"])
      s.add_development_dependency(%q<rspec>, ["~> 3.0"])
      s.add_development_dependency(%q<rubocop>, ["~> 0.31"])
      s.add_development_dependency(%q<simplecov>, ["~> 0.10"])
    else
      s.add_dependency(%q<awesome_print>, ["~> 1.6"])
      s.add_dependency(%q<json>, ["~> 1.8"])
      s.add_dependency(%q<nokogiri>, ["~> 1.6"])
      s.add_dependency(%q<coveralls>, ["~> 0.8"])
      s.add_dependency(%q<flay>, ["~> 2.6"])
      s.add_dependency(%q<flog>, ["~> 4.3"])
      s.add_dependency(%q<guard-rspec>, ["~> 4.6"])
      s.add_dependency(%q<jeweler>, ["~> 2.0"])
      s.add_dependency(%q<rspec>, ["~> 3.0"])
      s.add_dependency(%q<rubocop>, ["~> 0.31"])
      s.add_dependency(%q<simplecov>, ["~> 0.10"])
    end
  else
    s.add_dependency(%q<awesome_print>, ["~> 1.6"])
    s.add_dependency(%q<json>, ["~> 1.8"])
    s.add_dependency(%q<nokogiri>, ["~> 1.6"])
    s.add_dependency(%q<coveralls>, ["~> 0.8"])
    s.add_dependency(%q<flay>, ["~> 2.6"])
    s.add_dependency(%q<flog>, ["~> 4.3"])
    s.add_dependency(%q<guard-rspec>, ["~> 4.6"])
    s.add_dependency(%q<jeweler>, ["~> 2.0"])
    s.add_dependency(%q<rspec>, ["~> 3.0"])
    s.add_dependency(%q<rubocop>, ["~> 0.31"])
    s.add_dependency(%q<simplecov>, ["~> 0.10"])
  end
end
99
+
@@ -0,0 +1,144 @@
1
+ require 'nokogiri'
2
+ require 'digest/md5'
3
+
4
# Extract content from an HTML page in the form of items with associated
# hierarchy data
#
# Each item describes one node matching the configured CSS selector, along
# with the headings (h1 to h6) that precede it in the document.
class HTMLHierarchyExtractor
  # CSS selector matching every heading tag used to build the hierarchy
  HEADING_SELECTOR = 'h1,h2,h3,h4,h5,h6'.freeze
  # Deepest hierarchy level: h1 is stored under :lvl0 and h6 under :lvl5
  MAX_HEADING_LEVEL = 5

  # input   - HTML source, handed to Nokogiri::HTML
  # options - Hash merged over the default options:
  #           :css_selector - selector of the nodes to extract (default: 'p')
  def initialize(input, options: {})
    @dom = Nokogiri::HTML(input)
    default_options = {
      css_selector: 'p'
    }
    @options = default_options.merge(options)
  end

  # Returns the outer HTML of a given node, without surrounding whitespace
  #
  # eg.
  # <p>foo</p> => <p>foo</p>
  def extract_html(node)
    node.to_s.strip
  end

  # Returns the text content of a given node
  #
  # eg.
  # <p>foo</p> => foo
  def extract_text(node)
    node.content
  end

  # Returns the lowercase tag name of a given node
  #
  # eg
  # <p>foo</p> => p
  def extract_tag_name(node)
    node.name.downcase
  end

  # Returns the anchor to the node, or nil if there is none
  #
  # The `name` attribute takes precedence over `id`. When the node itself has
  # neither, the first descendant having one of them is used instead.
  #
  # eg.
  # <h1 name="anchor">Foo</h1> => anchor
  # <h1 id="anchor">Foo</h1> => anchor
  # <h1><a name="anchor">Foo</a></h1> => anchor
  def extract_anchor(node)
    anchor = node.attr('name') || node.attr('id')
    return anchor if anchor

    # No anchor found directly in the header, search on children
    subelement = node.css('[name],[id]').first
    return nil if subelement.nil?

    extract_anchor(subelement)
  end

  ##
  # Generate a unique identifier for the item
  #
  # The identifier is the MD5 hex digest of the "key=value" pairs of the item,
  # sorted by key, so it does not depend on the order of the keys. Nested
  # hashes are replaced by their own identifier.
  def uuid(item)
    pairs = item.keys.sort.map do |key|
      value = item[key]
      # We apply the method recursively on other hashes
      value = uuid(value) if value.is_a?(Hash)
      "#{key}=#{value}"
    end

    Digest::MD5.hexdigest(pairs.join(','))
  end

  ##
  # Get a relative numeric value of the importance of the heading
  #
  # heading_level - zero-based heading level (0 for h1), or nil when the
  #                 content is not under any heading
  #
  # Returns 100 when there is no heading, 90 for level 0, then 10 less for
  # each deeper level.
  def heading_weight(heading_level)
    weight = 100
    return weight if heading_level.nil?
    weight - ((heading_level + 1) * 10)
  end

  # Returns an Array of items, one for each non-empty node matching the
  # configured CSS selector, in the order returned by the DOM query.
  #
  # Each item is a Hash with the following keys:
  #   :html      - outer HTML of the node
  #   :text      - text content of the node
  #   :tag_name  - lowercase tag name of the node
  #   :hierarchy - Hash (:lvl0 to :lvl5) of the headings above the node
  #   :anchor    - closest anchor found in the previous headings, if any
  #   :node      - the node itself
  #   :weight    - Hash with the :position of the item in the results and the
  #                :heading weight of its closest heading
  #   :uuid      - identifier computed from all the keys above
  def extract
    css_selector = @options[:css_selector]
    # We select all nodes that match either the headings or the elements to
    # extract. This will allow us to loop over it in order it appears in the DOM
    all_selector = "#{HEADING_SELECTOR},#{css_selector}"

    items = []
    current_hierarchy = {
      lvl0: nil,
      lvl1: nil,
      lvl2: nil,
      lvl3: nil,
      lvl4: nil,
      lvl5: nil
    }
    current_position = 0 # Position of the item in the list of results
    current_lvl = nil # Current closest hierarchy level
    current_anchor = nil # Current closest anchor

    @dom.css(all_selector).each do |node|
      # If it's a heading, we update our current hierarchy
      if node.matches?(HEADING_SELECTOR)
        # Which level heading is it? (h1 => 0, h6 => 5)
        current_lvl = extract_tag_name(node).sub(/\Ah/, '').to_i - 1
        # Update this level, and set all the following ones to nil. We stop at
        # the deepest existing level so no extra key is added to the hierarchy
        current_hierarchy[:"lvl#{current_lvl}"] = extract_text(node)
        (current_lvl + 1..MAX_HEADING_LEVEL).each do |lvl|
          current_hierarchy[:"lvl#{lvl}"] = nil
        end
        # Update the anchor, if the new heading has one
        new_anchor = extract_anchor(node)
        current_anchor = new_anchor if new_anchor
      end

      # Stop if node is not to be extracted
      next unless node.matches?(css_selector)

      # Stop if node is empty
      text = extract_text(node)
      next if text.empty?

      item = {
        html: extract_html(node),
        text: text,
        tag_name: extract_tag_name(node),
        # Copy the hierarchy, as the original is updated on each new heading
        hierarchy: current_hierarchy.dup,
        anchor: current_anchor,
        node: node,
        weight: {
          position: current_position,
          heading: heading_weight(current_lvl)
        }
      }
      item[:uuid] = uuid(item)
      items << item

      current_position += 1
    end

    items
  end
end