html-hierarchy-extractor 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 9c0a516a852828d433ba0495206acc9febbd1670
4
+ data.tar.gz: 444cedeb76c06fd048526cb02c7fcac294927540
5
+ SHA512:
6
+ metadata.gz: 7e6505db7a21b42db30d4afffa496358642c1eb6332174f5ada9418f973056c0b0f9762b6458f68c02a1eb035700fe9746d6dbc92a613b4a5797a4b54512f2cc
7
+ data.tar.gz: bcba7859c0e37030d6a209bef9b3980f35ea9dc08283f6d86445400cb115aa92f11f6ef1331ed0b7d353b077bf9e29542e730bde118e7479f96a3dbe96164833
data/.coveralls.yml ADDED
@@ -0,0 +1 @@
1
+ service_name: travis-ci
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/.rubocop.yml ADDED
@@ -0,0 +1,26 @@
1
+ # Defaults:
2
+ # https://github.com/bbatsov/rubocop/blob/master/config/default.yml
3
+ Metrics/AbcSize:
4
+ Max: 100
5
+
6
+ Metrics/ClassLength:
7
+ Max: 200
8
+
9
+ Metrics/ModuleLength:
10
+ Max: 200
11
+
12
+ Metrics/MethodLength:
13
+ Max: 50
14
+
15
+ Metrics/CyclomaticComplexity:
16
+ Max: 10
17
+
18
+ Metrics/PerceivedComplexity:
19
+ Max: 10
20
+
21
+ Style/FileName:
22
+ Enabled: false
23
+
24
+ Style/MultilineOperationIndentation:
25
+ Enabled: false
26
+
data/.travis.yml ADDED
@@ -0,0 +1,12 @@
1
+ language: ruby
2
+ cache: bundler
3
+ before_script: bundle update
4
+ script: ./scripts/test_ci
5
+ rvm:
6
+ - 2.2
7
+ - 2.1
8
+ - 2.0
9
+ notifications:
10
+ email:
11
+ on_success: never
12
+ on_failure: never
data/CONTRIBUTING.md ADDED
@@ -0,0 +1,53 @@
1
+ Hi collaborator!
2
+
3
+ If you have a fix or a new feature, please start by checking in the
4
+ [issues](https://github.com/pixelastic/html-hierarchy-extractor/issues) if it is
5
+ already referenced. If not, feel free to open one.
6
+
7
+ We use [pull requests](https://github.com/pixelastic/html-hierarchy-extractor/pulls)
8
+ for collaboration. The workflow is as follows:
9
+
10
+ - Create a local branch, starting from `develop`
11
+ - Submit the PR on `develop`
12
+ - Wait for review
13
+ - Do the changes requested (if any)
14
+ - We may ask you to rebase the branch to latest `develop` if it gets out of sync
15
+ - Get praise for your awesome contribution
16
+
17
+ # Development workflow
18
+
19
+ Run `bundle install` to get all dependencies up to date.
20
+
21
+ You can then launch:
22
+
23
+ - `./scripts/test` to launch tests
24
+ - `./scripts/watch` to start a test watcher (for TDD) using Guard
25
+
26
+ If you plan on submitting a PR, I suggest you install the git hooks. This will
27
+ run pre-commit and pre-push checks. Those checks will also be run by TravisCI,
28
+ but running them locally gives faster feedback.
29
+
30
+ If you want to use a local version of the gem in your local project, I suggest
31
+ updating your project `Gemfile` to point to the correct local directory
32
+
33
+ ```ruby
34
+ gem "html-hierarchy-extractor", :path => "/path/to/local/gem/folder"
35
+ ```
36
+
37
+ You should also run `rake gemspec` from the `html-hierarchy-extractor`
38
+ repository the first time and if you added/deleted any file or dependency.
39
+
40
+ # Tagging and releasing
41
+
42
+ This part is for main contributors:
43
+
44
+ ```
45
+ # Bump the version (in develop)
46
+ ./scripts/bump_version minor
47
+
48
+ # Update master and release
49
+ ./scripts/release
50
+
51
+ # Install the gem locally (optional)
52
+ rake install
53
+ ```
data/Gemfile ADDED
@@ -0,0 +1,16 @@
# Fetch gems over HTTPS so that downloads cannot be tampered with in transit
source 'https://rubygems.org'

# Dependencies needed by the gem at runtime
gem 'awesome_print', '~> 1.6'
gem 'json', '~> 1.8'
gem 'nokogiri', '~> 1.6'

# Tooling only needed to develop, test and release the gem
group :development do
  gem 'coveralls', '~> 0.8'
  gem 'flay', '~> 2.6'
  gem 'flog', '~> 4.3'
  gem 'guard-rspec', '~> 4.6'
  gem 'jeweler', '~> 2.0'
  gem 'rspec', '~> 3.0'
  gem 'rubocop', '~> 0.31'
  gem 'simplecov', '~> 0.10'
end
data/Guardfile ADDED
@@ -0,0 +1,7 @@
# Command used by Guard to run the specs
rspec_command = 'bundle exec rspec --color --format documentation'

guard :rspec, cmd: rspec_command do
  # A modified spec file is watched as-is
  watch(%r{^spec/.+_spec\.rb$})

  # A modified library file maps to the spec file of the same name
  watch(%r{^lib/(.+)\.rb$}) do |match|
    "spec/#{match[1]}_spec.rb"
  end

  # A modified spec helper maps to the whole spec directory
  watch('spec/spec_helper.rb') do
    'spec'
  end
end

notification :off
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2016 Pixelastic
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,17 @@
1
+ # html-hierarchy-extractor
2
+
3
+ This gem lets you extract the hierarchy of headings and content from any HTML
4
+ page into an array of elements.
5
+
6
+ It is intended to be used with Algolia to improve relevance of search results
7
+ inside large HTML pages.
8
+
9
+ Note: This repo is still a work in progress, and follows the RDD (Readme Driven
10
+ Development) principle. All you see in the Readme might not be implemented yet.
11
+
12
+ ## How to use
13
+
14
+ ```ruby
15
+ page = HTMLHierarchyExtractor.new(html) # Or filepath
16
+ page.extract
17
+ ```
data/Rakefile ADDED
@@ -0,0 +1,58 @@
# encoding: utf-8

# Rake tasks used to build, test and release the gem.

require 'rubygems'
require 'bundler'
begin
  # Make sure both runtime and development gems are available before going on
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts 'Run `bundle install` to install missing gems'
  exit e.status_code
end
require 'rake'

require 'jeweler'
require_relative 'lib/version'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification...
  # see http://guides.rubygems.org/specification-reference/ for more options
  gem.name = 'html-hierarchy-extractor'
  gem.version = HTMLHierarchyExtractorVersion.to_s
  gem.homepage = 'http://github.com/pixelastic/html-hierarchy-extractor'
  gem.license = 'MIT'
  gem.summary = 'Extract HTML hierarchy (headings and content) into a' \
                ' list of items'
  # Every continuation chunk starts with a space so that the sentences do not
  # run into each other once concatenated
  gem.description = 'Take any arbitrary HTML as input and extract its' \
                    ' hierarchy as a list of items, including parents and' \
                    ' contents. It is primarily intended to be used along' \
                    ' with Algolia, to improve the relevance of searching' \
                    ' into huge chunks of text'
  gem.email = 'tim@pixelastic.com'
  gem.authors = ['Tim Carry']
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

require 'rspec/core'
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new(:spec) do |spec|
  spec.rspec_opts = '--color --format documentation'
  spec.pattern = FileList['spec/**/*_spec.rb']
end
# Running `rake test` also runs the RSpec suite
task test: :spec

desc 'Code coverage detail'
task :coverage do
  # Sets the COVERAGE environment variable before running the specs
  ENV['COVERAGE'] = 'true'
  Rake::Task['spec'].execute
end

task default: :test
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,99 @@
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
# -*- encoding: utf-8 -*-
# stub: html-hierarchy-extractor 1.0.0 ruby lib

Gem::Specification.new do |s|
  s.name = "html-hierarchy-extractor"
  s.version = "1.0.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.require_paths = ["lib"]
  s.authors = ["Tim Carry"]
  s.date = "2016-07-20"
  s.description = "Take any arbitrary HTML as input and extract its hierarchy as a list of items, including parents and contents. It is primarily intended to be used along with Algolia, to improve the relevance of searching into huge chunks of text"
  s.email = "tim@pixelastic.com"
  s.extra_rdoc_files = [
    "LICENSE.txt",
    "README.md"
  ]
  s.files = [
    ".coveralls.yml",
    ".document",
    ".rspec",
    ".rubocop.yml",
    ".travis.yml",
    "CONTRIBUTING.md",
    "Gemfile",
    "Guardfile",
    "LICENSE.txt",
    "README.md",
    "Rakefile",
    "VERSION",
    "html-hierarchy-extractor.gemspec",
    "lib/html-hierarchy-extractor.rb",
    "lib/version.rb",
    "scripts/bump_version",
    "scripts/check_flay",
    "scripts/check_flog",
    "scripts/coverage",
    "scripts/git_hooks/pre-commit",
    "scripts/git_hooks/pre-push",
    "scripts/lint",
    "scripts/release",
    "scripts/test",
    "scripts/test_ci",
    "scripts/watch",
    "spec/html_hierarchy_extractor_spec.rb",
    "spec/spec_helper.rb",
    "spec/spec_helper_simplecov.rb"
  ]
  s.homepage = "http://github.com/pixelastic/html-hierarchy-extractor"
  s.licenses = ["MIT"]
  s.rubygems_version = "2.4.8"
  s.summary = "Extract HTML hierarchy (headings and content) into a list of items"

  if s.respond_to? :specification_version then
    s.specification_version = 4

    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      s.add_runtime_dependency(%q<awesome_print>, ["~> 1.6"])
      s.add_runtime_dependency(%q<json>, ["~> 1.8"])
      s.add_runtime_dependency(%q<nokogiri>, ["~> 1.6"])
      s.add_development_dependency(%q<coveralls>, ["~> 0.8"])
      s.add_development_dependency(%q<flay>, ["~> 2.6"])
      s.add_development_dependency(%q<flog>, ["~> 4.3"])
      s.add_development_dependency(%q<guard-rspec>, ["~> 4.6"])
      s.add_development_dependency(%q<jeweler>, ["~> 2.0"])
      s.add_development_dependency(%q<rspec>, ["~> 3.0"])
      s.add_development_dependency(%q<rubocop>, ["~> 0.31"])
      s.add_development_dependency(%q<simplecov>, ["~> 0.10"])
    else
      s.add_dependency(%q<awesome_print>, ["~> 1.6"])
      s.add_dependency(%q<json>, ["~> 1.8"])
      s.add_dependency(%q<nokogiri>, ["~> 1.6"])
      s.add_dependency(%q<coveralls>, ["~> 0.8"])
      s.add_dependency(%q<flay>, ["~> 2.6"])
      s.add_dependency(%q<flog>, ["~> 4.3"])
      s.add_dependency(%q<guard-rspec>, ["~> 4.6"])
      s.add_dependency(%q<jeweler>, ["~> 2.0"])
      s.add_dependency(%q<rspec>, ["~> 3.0"])
      s.add_dependency(%q<rubocop>, ["~> 0.31"])
      s.add_dependency(%q<simplecov>, ["~> 0.10"])
    end
  else
    s.add_dependency(%q<awesome_print>, ["~> 1.6"])
    s.add_dependency(%q<json>, ["~> 1.8"])
    s.add_dependency(%q<nokogiri>, ["~> 1.6"])
    s.add_dependency(%q<coveralls>, ["~> 0.8"])
    s.add_dependency(%q<flay>, ["~> 2.6"])
    s.add_dependency(%q<flog>, ["~> 4.3"])
    s.add_dependency(%q<guard-rspec>, ["~> 4.6"])
    s.add_dependency(%q<jeweler>, ["~> 2.0"])
    s.add_dependency(%q<rspec>, ["~> 3.0"])
    s.add_dependency(%q<rubocop>, ["~> 0.31"])
    s.add_dependency(%q<simplecov>, ["~> 0.10"])
  end
end
@@ -0,0 +1,144 @@
1
+ require 'nokogiri'
2
+ require 'digest/md5'
3
+
4
+ # Extract content from an HTML page in the form of items with associated
5
+ # hierarchy data
6
+ class HTMLHierarchyExtractor
7
+ def initialize(input, options: {})
8
+ @dom = Nokogiri::HTML(input)
9
+ default_options = {
10
+ css_selector: 'p'
11
+ }
12
+ @options = default_options.merge(options)
13
+ end
14
+
15
+ # Returns the outer HTML of a given node
16
+ #
17
+ # eg.
18
+ # <p>foo</p> => <p>foo</p>
19
+ def extract_html(node)
20
+ node.to_s.strip
21
+ end
22
+
23
+ # Returns the inner HTML of a given node
24
+ #
25
+ # eg.
26
+ # <p>foo</p> => foo
27
+ def extract_text(node)
28
+ node.content
29
+ end
30
+
31
+ # Returns the tag name of a given node
32
+ #
33
+ # eg
34
+ # <p>foo</p> => p
35
+ def extract_tag_name(node)
36
+ node.name.downcase
37
+ end
38
+
39
+ # Returns the anchor to the node
40
+ #
41
+ # eg.
42
+ # <h1 name="anchor">Foo</h1> => anchor
43
+ # <h1 id="anchor">Foo</h1> => anchor
44
+ # <h1><a name="anchor">Foo</a></h1> => anchor
45
+ def extract_anchor(node)
46
+ anchor = node.attr('name') || node.attr('id') || nil
47
+ return anchor unless anchor.nil?
48
+
49
+ # No anchor found directly in the header, search on children
50
+ subelement = node.css('[name],[id]')
51
+ return extract_anchor(subelement[0]) unless subelement.empty?
52
+
53
+ nil
54
+ end
55
+
56
+ ##
57
+ # Generate a unique identifier for the item
58
+ def uuid(item)
59
+ # We first get all the keys of the object, sorted alphabetically...
60
+ ordered_keys = item.keys.sort
61
+
62
+ # ...then we build a huge array of "key=value" pairs...
63
+ ordered_array = ordered_keys.map do |key|
64
+ value = item[key]
65
+ # We apply the method recursively on other hashes
66
+ value = uuid(value) if value.is_a?(Hash)
67
+ "#{key}=#{value}"
68
+ end
69
+
70
+ # ...then we build a unique md5 hash of it
71
+ Digest::MD5.hexdigest(ordered_array.join(','))
72
+ end
73
+
74
+ ##
75
+ # Get a relative numeric value of the importance of the heading
76
+ # 100 for top level, then -10 per heading
77
+ def heading_weight(heading_level)
78
+ weight = 100
79
+ return weight if heading_level.nil?
80
+ weight - ((heading_level + 1) * 10)
81
+ end
82
+
83
+ def extract
84
+ heading_selector = 'h1,h2,h3,h4,h5,h6'
85
+ # We select all nodes that match either the headings or the elements to
86
+ # extract. This will allow us to loop over it in order it appears in the DOM
87
+ all_selector = "#{heading_selector},#{@options[:css_selector]}"
88
+
89
+ items = []
90
+ current_hierarchy = {
91
+ lvl0: nil,
92
+ lvl1: nil,
93
+ lvl2: nil,
94
+ lvl3: nil,
95
+ lvl4: nil,
96
+ lvl5: nil
97
+ }
98
+ current_position = 0 # Position of the DOM node in the tree
99
+ current_lvl = nil # Current closest hierarchy level
100
+ current_anchor = nil # Current closest anchor
101
+
102
+ @dom.css(all_selector).each do |node|
103
+ # If it's a heading, we update our current hierarchy
104
+ if node.matches?(heading_selector)
105
+ # Which level heading is it?
106
+ current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
107
+ # Update this level, and set all the following ones to nil
108
+ current_hierarchy["lvl#{current_lvl}".to_sym] = extract_text(node)
109
+ (current_lvl + 1..6).each do |lvl|
110
+ current_hierarchy["lvl#{lvl}".to_sym] = nil
111
+ end
112
+ # Update the anchor, if the new heading has one
113
+ new_anchor = extract_anchor(node)
114
+ current_anchor = new_anchor if new_anchor
115
+ end
116
+
117
+ # Stop if node is not to be extracted
118
+ next unless node.matches?(@options[:css_selector])
119
+
120
+ # Stop if node is empty
121
+ text = extract_text(node)
122
+ next if text.empty?
123
+
124
+ item = {
125
+ html: extract_html(node),
126
+ text: text,
127
+ tag_name: extract_tag_name(node),
128
+ hierarchy: current_hierarchy.clone,
129
+ anchor: current_anchor,
130
+ node: node,
131
+ weight: {
132
+ position: current_position,
133
+ heading: heading_weight(current_lvl)
134
+ }
135
+ }
136
+ item[:uuid] = uuid(item)
137
+ items << item
138
+
139
+ current_position += 1
140
+ end
141
+
142
+ items
143
+ end
144
+ end