html-hierarchy-extractor 1.0.2 → 1.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- metadata +45 -48
- data/.coveralls.yml +0 -1
- data/.document +0 -5
- data/.rspec +0 -2
- data/.rubocop.yml +0 -26
- data/.travis.yml +0 -12
- data/CONTRIBUTING.md +0 -53
- data/Gemfile +0 -16
- data/Guardfile +0 -7
- data/LICENSE.txt +0 -20
- data/README.md +0 -141
- data/Rakefile +0 -58
- data/VERSION +0 -1
- data/html-hierarchy-extractor.gemspec +0 -99
- data/lib/html-hierarchy-extractor.rb +0 -144
- data/lib/version.rb +0 -6
- data/scripts/bump_version +0 -47
- data/scripts/check_flay +0 -30
- data/scripts/check_flog +0 -31
- data/scripts/coverage +0 -3
- data/scripts/git_hooks/pre-commit +0 -16
- data/scripts/git_hooks/pre-push +0 -9
- data/scripts/lint +0 -2
- data/scripts/release +0 -13
- data/scripts/test +0 -4
- data/scripts/test_ci +0 -7
- data/scripts/watch +0 -4
- data/spec/html_hierarchy_extractor_spec.rb +0 -441
- data/spec/spec_helper.rb +0 -14
- data/spec/spec_helper_simplecov.rb +0 -9
data/Rakefile
DELETED
@@ -1,58 +0,0 @@
|
|
1
|
-
# encoding: utf-8
# Rake tasks for the html-hierarchy-extractor gem: gem packaging through
# Jeweler, the spec suite through RSpec, and a coverage shortcut.

require 'rubygems'
require 'bundler'
begin
  # Load the default and development groups up front so that a missing gem is
  # reported with a hint instead of a bare LoadError further down
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts 'Run `bundle install` to install missing gems'
  exit e.status_code
end
require 'rake'

require 'jeweler'
require_relative 'lib/version'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification...
  # see http://guides.rubygems.org/specification-reference/ for more options
  gem.name = 'html-hierarchy-extractor'
  gem.version = HTMLHierarchyExtractorVersion.to_s
  gem.homepage = 'http://github.com/pixelastic/html-hierarchy-extractor'
  gem.license = 'MIT'
  gem.summary = 'Extract HTML hierarchy (headings and content) into a' \
                ' list of items'
  # Every continued fragment carries its own leading space: adjacent string
  # literals are glued together verbatim, so a fragment without one runs two
  # sentences together ("contents.It is ...").
  gem.description = 'Take any arbitrary HTML as input and extract its' \
                    ' hierarchy as a list of items, including parents and' \
                    ' contents.' \
                    ' It is primarily intended to be used along with Algolia,' \
                    ' to improve the relevance of searching into huge chunks' \
                    ' of text'
  gem.email = 'tim@pixelastic.com'
  gem.authors = ['Tim Carry']
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

require 'rspec/core'
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new(:spec) do |spec|
  spec.rspec_opts = '--color --format documentation'
  spec.pattern = FileList['spec/**/*_spec.rb']
end
# Make the spec suite a prerequisite of `rake test`
task test: :spec

desc 'Code coverage detail'
task :coverage do
  # The COVERAGE flag is set before the specs run; presumably the spec helper
  # reads it to switch coverage reporting on (not visible from this file)
  ENV['COVERAGE'] = 'true'
  Rake::Task['spec'].execute
end

task default: :test
|
data/VERSION
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
0.1.0
|
@@ -1,99 +0,0 @@
|
|
1
|
-
# Generated by jeweler
|
2
|
-
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
-
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
-
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: html-hierarchy-extractor 1.0.2 ruby lib
|
6
|
-
|
7
|
-
Gem::Specification.new do |s|
|
8
|
-
s.name = "html-hierarchy-extractor"
|
9
|
-
s.version = "1.0.2"
|
10
|
-
|
11
|
-
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
12
|
-
s.require_paths = ["lib"]
|
13
|
-
s.authors = ["Tim Carry"]
|
14
|
-
s.date = "2016-07-20"
|
15
|
-
s.description = "Take any arbitrary HTML as input and extract its hierarchy as a list of items, including parents and contents.It is primarily intended to be used along with Algolia, to improve the relevance of searching into huge chunks of text"
|
16
|
-
s.email = "tim@pixelastic.com"
|
17
|
-
s.extra_rdoc_files = [
|
18
|
-
"LICENSE.txt",
|
19
|
-
"README.md"
|
20
|
-
]
|
21
|
-
s.files = [
|
22
|
-
".coveralls.yml",
|
23
|
-
".document",
|
24
|
-
".rspec",
|
25
|
-
".rubocop.yml",
|
26
|
-
".travis.yml",
|
27
|
-
"CONTRIBUTING.md",
|
28
|
-
"Gemfile",
|
29
|
-
"Guardfile",
|
30
|
-
"LICENSE.txt",
|
31
|
-
"README.md",
|
32
|
-
"Rakefile",
|
33
|
-
"VERSION",
|
34
|
-
"html-hierarchy-extractor.gemspec",
|
35
|
-
"lib/html-hierarchy-extractor.rb",
|
36
|
-
"lib/version.rb",
|
37
|
-
"scripts/bump_version",
|
38
|
-
"scripts/check_flay",
|
39
|
-
"scripts/check_flog",
|
40
|
-
"scripts/coverage",
|
41
|
-
"scripts/git_hooks/pre-commit",
|
42
|
-
"scripts/git_hooks/pre-push",
|
43
|
-
"scripts/lint",
|
44
|
-
"scripts/release",
|
45
|
-
"scripts/test",
|
46
|
-
"scripts/test_ci",
|
47
|
-
"scripts/watch",
|
48
|
-
"spec/html_hierarchy_extractor_spec.rb",
|
49
|
-
"spec/spec_helper.rb",
|
50
|
-
"spec/spec_helper_simplecov.rb"
|
51
|
-
]
|
52
|
-
s.homepage = "http://github.com/pixelastic/html-hierarchy-extractor"
|
53
|
-
s.licenses = ["MIT"]
|
54
|
-
s.rubygems_version = "2.4.8"
|
55
|
-
s.summary = "Extract HTML hierarchy (headings and content) into a list of items"
|
56
|
-
|
57
|
-
if s.respond_to? :specification_version then
|
58
|
-
s.specification_version = 4
|
59
|
-
|
60
|
-
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
61
|
-
s.add_runtime_dependency(%q<awesome_print>, ["~> 1.6"])
|
62
|
-
s.add_runtime_dependency(%q<json>, ["~> 1.8"])
|
63
|
-
s.add_runtime_dependency(%q<nokogiri>, ["~> 1.6"])
|
64
|
-
s.add_development_dependency(%q<coveralls>, ["~> 0.8"])
|
65
|
-
s.add_development_dependency(%q<flay>, ["~> 2.6"])
|
66
|
-
s.add_development_dependency(%q<flog>, ["~> 4.3"])
|
67
|
-
s.add_development_dependency(%q<guard-rspec>, ["~> 4.6"])
|
68
|
-
s.add_development_dependency(%q<jeweler>, ["~> 2.0"])
|
69
|
-
s.add_development_dependency(%q<rspec>, ["~> 3.0"])
|
70
|
-
s.add_development_dependency(%q<rubocop>, ["~> 0.31"])
|
71
|
-
s.add_development_dependency(%q<simplecov>, ["~> 0.10"])
|
72
|
-
else
|
73
|
-
s.add_dependency(%q<awesome_print>, ["~> 1.6"])
|
74
|
-
s.add_dependency(%q<json>, ["~> 1.8"])
|
75
|
-
s.add_dependency(%q<nokogiri>, ["~> 1.6"])
|
76
|
-
s.add_dependency(%q<coveralls>, ["~> 0.8"])
|
77
|
-
s.add_dependency(%q<flay>, ["~> 2.6"])
|
78
|
-
s.add_dependency(%q<flog>, ["~> 4.3"])
|
79
|
-
s.add_dependency(%q<guard-rspec>, ["~> 4.6"])
|
80
|
-
s.add_dependency(%q<jeweler>, ["~> 2.0"])
|
81
|
-
s.add_dependency(%q<rspec>, ["~> 3.0"])
|
82
|
-
s.add_dependency(%q<rubocop>, ["~> 0.31"])
|
83
|
-
s.add_dependency(%q<simplecov>, ["~> 0.10"])
|
84
|
-
end
|
85
|
-
else
|
86
|
-
s.add_dependency(%q<awesome_print>, ["~> 1.6"])
|
87
|
-
s.add_dependency(%q<json>, ["~> 1.8"])
|
88
|
-
s.add_dependency(%q<nokogiri>, ["~> 1.6"])
|
89
|
-
s.add_dependency(%q<coveralls>, ["~> 0.8"])
|
90
|
-
s.add_dependency(%q<flay>, ["~> 2.6"])
|
91
|
-
s.add_dependency(%q<flog>, ["~> 4.3"])
|
92
|
-
s.add_dependency(%q<guard-rspec>, ["~> 4.6"])
|
93
|
-
s.add_dependency(%q<jeweler>, ["~> 2.0"])
|
94
|
-
s.add_dependency(%q<rspec>, ["~> 3.0"])
|
95
|
-
s.add_dependency(%q<rubocop>, ["~> 0.31"])
|
96
|
-
s.add_dependency(%q<simplecov>, ["~> 0.10"])
|
97
|
-
end
|
98
|
-
end
|
99
|
-
|
@@ -1,144 +0,0 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
require 'digest/md5'
|
3
|
-
|
4
|
-
# Extract content from an HTML page in the form of items with associated
# hierarchy data
#
# Every node matching the `css_selector` option becomes one item. Each item
# records the headings (h1 to h6) that precede it in document order, the
# closest heading anchor, and weights usable for ranking.
class HTMLHierarchyExtractor
  # Selector matching every heading tag that takes part in the hierarchy
  HEADING_SELECTOR = 'h1,h2,h3,h4,h5,h6'.freeze
  # Hierarchy keys in heading order: h1 is stored in :lvl0, h6 in :lvl5
  HIERARCHY_KEYS = [:lvl0, :lvl1, :lvl2, :lvl3, :lvl4, :lvl5].freeze

  # input   - HTML source, handed to Nokogiri::HTML
  # options - Hash of settings merged over the defaults:
  #           :css_selector - selector of the nodes to extract (default 'p')
  def initialize(input, options: {})
    @dom = Nokogiri::HTML(input)
    @options = { css_selector: 'p' }.merge(options)
  end

  # Returns the outer HTML of a given node
  #
  # eg.
  # <p>foo</p> => <p>foo</p>
  def extract_html(node)
    node.to_s.strip
  end

  # Returns the text content of a given node
  #
  # eg.
  # <p>foo</p> => foo
  def extract_text(node)
    node.content
  end

  # Returns the lowercased tag name of a given node
  #
  # eg
  # <p>foo</p> => p
  def extract_tag_name(node)
    node.name.downcase
  end

  # Returns the anchor to the node, or nil when neither the node nor any of
  # its descendants has a `name` or `id` attribute. `name` wins over `id`.
  #
  # eg.
  # <h1 name="anchor">Foo</h1> => anchor
  # <h1 id="anchor">Foo</h1> => anchor
  # <h1><a name="anchor">Foo</a></h1> => anchor
  def extract_anchor(node)
    anchor = node.attr('name') || node.attr('id')
    return anchor unless anchor.nil?

    # No anchor found directly in the header, search on children
    candidates = node.css('[name],[id]')
    return nil if candidates.empty?

    extract_anchor(candidates[0])
  end

  ##
  # Generate a unique identifier for the item
  #
  # The keys are sorted, each one is rendered as "key=value" (nested hashes
  # are replaced by their own uuid), the pairs are joined with commas and the
  # result is hashed with MD5. Returns the hex digest as a String.
  def uuid(item)
    pairs = item.keys.sort.map do |key|
      value = item[key]
      # We apply the method recursively on other hashes
      value = uuid(value) if value.is_a?(Hash)
      "#{key}=#{value}"
    end

    Digest::MD5.hexdigest(pairs.join(','))
  end

  ##
  # Get a relative numeric value of the importance of the heading
  # 100 when no heading has been seen yet (nil), then 10 less per heading
  # level: 90 for level 0 (h1) down to 40 for level 5 (h6)
  def heading_weight(heading_level)
    weight = 100
    return weight if heading_level.nil?

    weight - ((heading_level + 1) * 10)
  end

  # Walks the document and returns an Array of item Hashes, one per non-empty
  # node matching the configured selector. Each item has the keys :html,
  # :text, :tag_name, :hierarchy, :anchor, :node, :weight and :uuid.
  def extract
    item_selector = @options[:css_selector]
    # We select all nodes that match either the headings or the elements to
    # extract. This will allow us to loop over it in order it appears in the DOM
    all_selector = "#{HEADING_SELECTOR},#{item_selector}"

    items = []
    current_hierarchy = blank_hierarchy
    current_position = 0 # Position of the extracted item, in document order
    current_lvl = nil # Current closest hierarchy level
    current_anchor = nil # Current closest anchor

    @dom.css(all_selector).each do |node|
      # If it's a heading, we update our current hierarchy
      if node.matches?(HEADING_SELECTOR)
        current_lvl = heading_level(node)
        enter_heading(current_hierarchy, current_lvl, extract_text(node))
        # Keep the previous anchor when the new heading has none
        current_anchor = extract_anchor(node) || current_anchor
      end

      # Stop if node is not to be extracted
      next unless node.matches?(item_selector)

      # Stop if node is empty
      text = extract_text(node)
      next if text.empty?

      item = {
        html: extract_html(node),
        text: text,
        tag_name: extract_tag_name(node),
        hierarchy: current_hierarchy.clone,
        anchor: current_anchor,
        node: node,
        weight: {
          position: current_position,
          heading: heading_weight(current_lvl)
        }
      }
      item[:uuid] = uuid(item)
      items << item

      current_position += 1
    end

    items
  end

  private

  # Fresh hierarchy with every level unset
  def blank_hierarchy
    HIERARCHY_KEYS.each_with_object({}) { |key, hierarchy| hierarchy[key] = nil }
  end

  # Zero-based level of a heading node: h1 => 0 ... h6 => 5
  def heading_level(node)
    extract_tag_name(node).sub(/\Ah/, '').to_i - 1
  end

  # Stores `text` at `level` and clears every deeper level.
  #
  # Only the keys listed in HIERARCHY_KEYS are touched: the reset used to run
  # up to level 6 and thereby added a stray :lvl6 key, so items found after a
  # heading had a different hierarchy shape than items found before one.
  def enter_heading(hierarchy, level, text)
    HIERARCHY_KEYS.each_with_index do |key, index|
      next if index < level

      hierarchy[key] = index == level ? text : nil
    end
  end
end
|
data/lib/version.rb
DELETED
data/scripts/bump_version
DELETED
@@ -1,47 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require_relative '../lib/version.rb'
|
3
|
-
|
4
|
-
# Simple script used to bump the version number
#
# Rewrites lib/version.rb with the bumped version and commits the change.
# The bump type (major, minor or patch) is the first argument.
class BumpVersion
  # Aborts the process with status 1 when the first argument is not a known
  # bump type.
  def initialize(*args)
    @type = args[0]
    return if valid_type?(@type)

    puts "Invalid bump type: #{@type}"
    exit 1
  end

  # True when `type` is one of 'major', 'minor' or 'patch'
  def valid_type?(type)
    %w(major minor patch).include?(type)
  end

  # Returns the "major.minor.patch" string obtained by bumping
  # `current_version` according to `type`. Lower components are reset to 0
  # when a higher one is bumped; an unknown type leaves the version unchanged.
  def bump(current_version, type)
    major, minor, patch = current_version.split('.').map(&:to_i)
    case type
    when 'major'
      major += 1
      minor = 0
      patch = 0
    when 'minor'
      minor += 1
      patch = 0
    when 'patch'
      patch += 1
    end
    "#{major}.#{minor}.#{patch}"
  end

  # Replaces the current version with the bumped one in lib/version.rb, then
  # stages and commits that file.
  def run
    old_version = HTMLHierarchyExtractorVersion.to_s
    new_version = bump(old_version, @type)

    file = File.expand_path('../lib/version.rb', File.dirname(__FILE__))
    File.write(file, File.read(file).gsub(old_version, new_version))

    # Arguments are passed as a list so no shell is involved: a checkout path
    # containing spaces or shell metacharacters is handed to git untouched.
    # The commit is skipped when staging fails. Unlike backticks, git's own
    # output is left visible on the terminal.
    system('git', 'add', file) &&
      system('git', 'commit', '-m', "chore(bump): Version bump to #{new_version}")
  end
end
|
47
|
-
BumpVersion.new(*ARGV).run
|
data/scripts/check_flay
DELETED
@@ -1,30 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
# Scores below this value are accepted; anything at or above it is reported
MAX_SCORE = 45

# Output of the flay summary, one entry per line
report = `flay -s ./lib/`.split("\n")
# "<score>: <file>", optionally preceded by spaces
line_format = /^ *(.*): (.*)/

errors = []
# The first two lines are a header and are skipped
report.drop(2).each do |line|
  parsed = line_format.match(line)
  next unless parsed

  score = parsed[1].to_f
  errors << { score: score, file: parsed[2] } unless score < MAX_SCORE
end

exit 0 if errors.empty?

# Report the offenders, lowest score first, and fail
puts 'Flay test failed:'
errors.sort_by { |error| error[:score] }.each do |error|
  puts "#{error[:score]} / #{MAX_SCORE} in #{error[:file]}"
end
exit 1
|
data/scripts/check_flog
DELETED
@@ -1,31 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
# Scores below this value are accepted; anything at or above it is reported
MAX_SCORE = 45

# Output of flog, one entry per line
report = `flog ./lib/`.split("\n")
# "<score>: <method> <file>:<line>", optionally preceded by spaces
line_format = /^ *(.*): (.*) (.*):[0-9]*/

errors = []
# The first three lines are a header and are skipped
report.drop(3).each do |line|
  parsed = line_format.match(line)
  next unless parsed

  score = parsed[1].to_f
  next if score < MAX_SCORE

  errors << { score: score, method: parsed[2], file: parsed[3] }
end

exit 0 if errors.empty?

# Report the offenders, lowest score first, and fail
puts 'Flog test failed:'
errors.sort_by { |error| error[:score] }.each do |error|
  puts "#{error[:score]} / #{MAX_SCORE}: #{error[:method]} in #{error[:file]}"
end
exit 1
|
data/scripts/coverage
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
#!/usr/bin/env bash
|
2
|
-
|
3
|
-
# Nothing to verify when no Ruby file shows up in the short git status
git status --short | grep -q '\.rb$' || exit 0

# Reject focused (fit/fdescribe) or skipped (xit/xdescribe) specs: grep prints
# the offending lines and its success means at least one was found
grep --color -r 'spec' -E -e '^( |\t)*(fit|fdescribe|xit|xdescribe)' && {
  echo '✘ You have focused and/or skipped tests'
  exit 1
}

# The style guide check must pass as well
if ! ./scripts/lint; then
  exit 1
fi
|
16
|
-
|