html-hierarchy-extractor 1.0.2 → 1.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- metadata +45 -48
- data/.coveralls.yml +0 -1
- data/.document +0 -5
- data/.rspec +0 -2
- data/.rubocop.yml +0 -26
- data/.travis.yml +0 -12
- data/CONTRIBUTING.md +0 -53
- data/Gemfile +0 -16
- data/Guardfile +0 -7
- data/LICENSE.txt +0 -20
- data/README.md +0 -141
- data/Rakefile +0 -58
- data/VERSION +0 -1
- data/html-hierarchy-extractor.gemspec +0 -99
- data/lib/html-hierarchy-extractor.rb +0 -144
- data/lib/version.rb +0 -6
- data/scripts/bump_version +0 -47
- data/scripts/check_flay +0 -30
- data/scripts/check_flog +0 -31
- data/scripts/coverage +0 -3
- data/scripts/git_hooks/pre-commit +0 -16
- data/scripts/git_hooks/pre-push +0 -9
- data/scripts/lint +0 -2
- data/scripts/release +0 -13
- data/scripts/test +0 -4
- data/scripts/test_ci +0 -7
- data/scripts/watch +0 -4
- data/spec/html_hierarchy_extractor_spec.rb +0 -441
- data/spec/spec_helper.rb +0 -14
- data/spec/spec_helper_simplecov.rb +0 -9
data/Rakefile
DELETED
@@ -1,58 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
require 'rubygems'
require 'bundler'
# Abort early, with a hint, when the bundle cannot be set up
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts 'Run `bundle install` to install missing gems'
  exit e.status_code
end
require 'rake'

require 'jeweler'
require_relative 'lib/version'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification...
  # see http://guides.rubygems.org/specification-reference/ for more options
  gem.name = 'html-hierarchy-extractor'
  gem.version = HTMLHierarchyExtractorVersion.to_s
  gem.homepage = 'http://github.com/pixelastic/html-hierarchy-extractor'
  gem.license = 'MIT'
  gem.summary = 'Extract HTML hierarchy (headings and content) into a' \
                ' list of items'
  # Adjacent literals are joined as-is, so every continuation line carries its
  # own leading space (the second sentence used to be glued to "contents.").
  gem.description = 'Take any arbitrary HTML as input and extract its' \
                    ' hierarchy as a list of items, including parents and' \
                    ' contents.' \
                    ' It is primarily intended to be used along with Algolia,' \
                    ' to improve the relevance of searching into huge chunks' \
                    ' of text'
  gem.email = 'tim@pixelastic.com'
  gem.authors = ['Tim Carry']
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

require 'rspec/core'
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new(:spec) do |spec|
  spec.rspec_opts = '--color --format documentation'
  spec.pattern = FileList['spec/**/*_spec.rb']
end
# Make `spec` a prerequisite of `test`
task test: :spec

desc 'Code coverage detail'
task :coverage do
  # Set for the spec run started on the next line
  ENV['COVERAGE'] = 'true'
  Rake::Task['spec'].execute
end

task default: :test
|
data/VERSION
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
0.1.0
|
@@ -1,99 +0,0 @@
|
|
1
|
-
# Generated by jeweler
|
2
|
-
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
-
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
-
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: html-hierarchy-extractor 1.0.2 ruby lib
|
6
|
-
|
7
|
-
Gem::Specification.new do |s|
|
8
|
-
s.name = "html-hierarchy-extractor"
|
9
|
-
s.version = "1.0.2"
|
10
|
-
|
11
|
-
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
12
|
-
s.require_paths = ["lib"]
|
13
|
-
s.authors = ["Tim Carry"]
|
14
|
-
s.date = "2016-07-20"
|
15
|
-
s.description = "Take any arbitrary HTML as input and extract its hierarchy as a list of items, including parents and contents.It is primarily intended to be used along with Algolia, to improve the relevance of searching into huge chunks of text"
|
16
|
-
s.email = "tim@pixelastic.com"
|
17
|
-
s.extra_rdoc_files = [
|
18
|
-
"LICENSE.txt",
|
19
|
-
"README.md"
|
20
|
-
]
|
21
|
-
s.files = [
|
22
|
-
".coveralls.yml",
|
23
|
-
".document",
|
24
|
-
".rspec",
|
25
|
-
".rubocop.yml",
|
26
|
-
".travis.yml",
|
27
|
-
"CONTRIBUTING.md",
|
28
|
-
"Gemfile",
|
29
|
-
"Guardfile",
|
30
|
-
"LICENSE.txt",
|
31
|
-
"README.md",
|
32
|
-
"Rakefile",
|
33
|
-
"VERSION",
|
34
|
-
"html-hierarchy-extractor.gemspec",
|
35
|
-
"lib/html-hierarchy-extractor.rb",
|
36
|
-
"lib/version.rb",
|
37
|
-
"scripts/bump_version",
|
38
|
-
"scripts/check_flay",
|
39
|
-
"scripts/check_flog",
|
40
|
-
"scripts/coverage",
|
41
|
-
"scripts/git_hooks/pre-commit",
|
42
|
-
"scripts/git_hooks/pre-push",
|
43
|
-
"scripts/lint",
|
44
|
-
"scripts/release",
|
45
|
-
"scripts/test",
|
46
|
-
"scripts/test_ci",
|
47
|
-
"scripts/watch",
|
48
|
-
"spec/html_hierarchy_extractor_spec.rb",
|
49
|
-
"spec/spec_helper.rb",
|
50
|
-
"spec/spec_helper_simplecov.rb"
|
51
|
-
]
|
52
|
-
s.homepage = "http://github.com/pixelastic/html-hierarchy-extractor"
|
53
|
-
s.licenses = ["MIT"]
|
54
|
-
s.rubygems_version = "2.4.8"
|
55
|
-
s.summary = "Extract HTML hierarchy (headings and content) into a list of items"
|
56
|
-
|
57
|
-
if s.respond_to? :specification_version then
|
58
|
-
s.specification_version = 4
|
59
|
-
|
60
|
-
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
61
|
-
s.add_runtime_dependency(%q<awesome_print>, ["~> 1.6"])
|
62
|
-
s.add_runtime_dependency(%q<json>, ["~> 1.8"])
|
63
|
-
s.add_runtime_dependency(%q<nokogiri>, ["~> 1.6"])
|
64
|
-
s.add_development_dependency(%q<coveralls>, ["~> 0.8"])
|
65
|
-
s.add_development_dependency(%q<flay>, ["~> 2.6"])
|
66
|
-
s.add_development_dependency(%q<flog>, ["~> 4.3"])
|
67
|
-
s.add_development_dependency(%q<guard-rspec>, ["~> 4.6"])
|
68
|
-
s.add_development_dependency(%q<jeweler>, ["~> 2.0"])
|
69
|
-
s.add_development_dependency(%q<rspec>, ["~> 3.0"])
|
70
|
-
s.add_development_dependency(%q<rubocop>, ["~> 0.31"])
|
71
|
-
s.add_development_dependency(%q<simplecov>, ["~> 0.10"])
|
72
|
-
else
|
73
|
-
s.add_dependency(%q<awesome_print>, ["~> 1.6"])
|
74
|
-
s.add_dependency(%q<json>, ["~> 1.8"])
|
75
|
-
s.add_dependency(%q<nokogiri>, ["~> 1.6"])
|
76
|
-
s.add_dependency(%q<coveralls>, ["~> 0.8"])
|
77
|
-
s.add_dependency(%q<flay>, ["~> 2.6"])
|
78
|
-
s.add_dependency(%q<flog>, ["~> 4.3"])
|
79
|
-
s.add_dependency(%q<guard-rspec>, ["~> 4.6"])
|
80
|
-
s.add_dependency(%q<jeweler>, ["~> 2.0"])
|
81
|
-
s.add_dependency(%q<rspec>, ["~> 3.0"])
|
82
|
-
s.add_dependency(%q<rubocop>, ["~> 0.31"])
|
83
|
-
s.add_dependency(%q<simplecov>, ["~> 0.10"])
|
84
|
-
end
|
85
|
-
else
|
86
|
-
s.add_dependency(%q<awesome_print>, ["~> 1.6"])
|
87
|
-
s.add_dependency(%q<json>, ["~> 1.8"])
|
88
|
-
s.add_dependency(%q<nokogiri>, ["~> 1.6"])
|
89
|
-
s.add_dependency(%q<coveralls>, ["~> 0.8"])
|
90
|
-
s.add_dependency(%q<flay>, ["~> 2.6"])
|
91
|
-
s.add_dependency(%q<flog>, ["~> 4.3"])
|
92
|
-
s.add_dependency(%q<guard-rspec>, ["~> 4.6"])
|
93
|
-
s.add_dependency(%q<jeweler>, ["~> 2.0"])
|
94
|
-
s.add_dependency(%q<rspec>, ["~> 3.0"])
|
95
|
-
s.add_dependency(%q<rubocop>, ["~> 0.31"])
|
96
|
-
s.add_dependency(%q<simplecov>, ["~> 0.10"])
|
97
|
-
end
|
98
|
-
end
|
99
|
-
|
@@ -1,144 +0,0 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
require 'digest/md5'
|
3
|
-
|
4
|
-
# Extract content from an HTML page in the form of items with associated
# hierarchy data
class HTMLHierarchyExtractor
  # input   - HTML source, parsed with Nokogiri::HTML
  # options - Hash merged over the defaults. The only default is
  #           css_selector: 'p', the selector of the nodes to extract.
  def initialize(input, options: {})
    @dom = Nokogiri::HTML(input)
    default_options = {
      css_selector: 'p'
    }
    @options = default_options.merge(options)
  end

  # Returns the outer HTML of a given node
  #
  # eg.
  # <p>foo</p> => <p>foo</p>
  def extract_html(node)
    node.to_s.strip
  end

  # Returns the text content of a given node
  #
  # eg.
  # <p>foo</p> => foo
  def extract_text(node)
    node.content
  end

  # Returns the tag name of a given node
  #
  # eg
  # <p>foo</p> => p
  def extract_tag_name(node)
    node.name.downcase
  end

  # Returns the anchor to the node, or nil when neither the node nor any of
  # its descendants has a name or id attribute
  #
  # eg.
  # <h1 name="anchor">Foo</h1> => anchor
  # <h1 id="anchor">Foo</h1> => anchor
  # <h1><a name="anchor">Foo</a></h1> => anchor
  def extract_anchor(node)
    anchor = node.attr('name') || node.attr('id')
    return anchor if anchor

    # No anchor found directly in the header, search on children
    child = node.css('[name],[id]').first
    child && extract_anchor(child)
  end

  ##
  # Generate a unique identifier for the item
  #
  # item - Hash to fingerprint; nested hashes are fingerprinted recursively
  #
  # Returns the MD5 hex digest of the sorted "key=value" pairs.
  def uuid(item)
    # We first get all the keys of the object, sorted alphabetically...
    ordered_keys = item.keys.sort

    # ...then we build a huge array of "key=value" pairs...
    ordered_array = ordered_keys.map do |key|
      value = item[key]
      # We apply the method recursively on other hashes
      value = uuid(value) if value.is_a?(Hash)
      "#{key}=#{value}"
    end

    # ...then we build a unique md5 hash of it
    Digest::MD5.hexdigest(ordered_array.join(','))
  end

  ##
  # Get a relative numeric value of the importance of the heading
  # 100 when there is no heading (nil), then 90 for level 0, 80 for level 1,
  # and so on (-10 per level)
  def heading_weight(heading_level)
    weight = 100
    return weight if heading_level.nil?
    weight - ((heading_level + 1) * 10)
  end

  # Walks the document and builds one item per non-empty node matching
  # @options[:css_selector].
  #
  # Returns an Array of Hashes with the keys :html, :text, :tag_name,
  # :hierarchy (lvl0..lvl5, the headings seen before the node), :anchor,
  # :node, :weight (:position and :heading) and :uuid.
  def extract
    heading_selector = 'h1,h2,h3,h4,h5,h6'
    # We select all nodes that match either the headings or the elements to
    # extract. This will allow us to loop over it in order it appears in the DOM
    all_selector = "#{heading_selector},#{@options[:css_selector]}"

    items = []
    current_hierarchy = {
      lvl0: nil,
      lvl1: nil,
      lvl2: nil,
      lvl3: nil,
      lvl4: nil,
      lvl5: nil
    }
    # Index of the last level (5). Resetting past it would add a stray :lvl6
    # key to every hierarchy built after the first heading.
    deepest_lvl = current_hierarchy.size - 1
    current_position = 0 # Position of the DOM node in the tree
    current_lvl = nil # Current closest hierarchy level
    current_anchor = nil # Current closest anchor

    @dom.css(all_selector).each do |node|
      # If it's a heading, we update our current hierarchy
      if node.matches?(heading_selector)
        # Which level heading is it?
        current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
        # Update this level, and set all the following ones to nil
        current_hierarchy["lvl#{current_lvl}".to_sym] = extract_text(node)
        (current_lvl + 1..deepest_lvl).each do |lvl|
          current_hierarchy["lvl#{lvl}".to_sym] = nil
        end
        # Update the anchor, if the new heading has one
        new_anchor = extract_anchor(node)
        current_anchor = new_anchor if new_anchor
      end

      # Stop if node is not to be extracted
      next unless node.matches?(@options[:css_selector])

      # Stop if node is empty
      text = extract_text(node)
      next if text.empty?

      item = {
        html: extract_html(node),
        text: text,
        tag_name: extract_tag_name(node),
        # Snapshot: the shared hash keeps changing as headings are met
        hierarchy: current_hierarchy.clone,
        anchor: current_anchor,
        node: node,
        weight: {
          position: current_position,
          heading: heading_weight(current_lvl)
        }
      }
      # Computed before the :uuid key exists, so it is not part of the digest
      item[:uuid] = uuid(item)
      items << item

      current_position += 1
    end

    items
  end
end
|
data/lib/version.rb
DELETED
data/scripts/bump_version
DELETED
@@ -1,47 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require_relative '../lib/version.rb'
|
3
|
-
|
4
|
-
# Simple script used to bump the version number
class BumpVersion
  # args - command-line arguments; the first one is the bump type.
  #
  # Prints a message and exits with status 1 when the type is not valid.
  def initialize(*args)
    @type = args[0]
    return if valid_type?(@type)

    puts "Invalid bump type: #{@type}"
    exit 1
  end

  # Returns true when type is one of "major", "minor" or "patch"
  def valid_type?(type)
    %w(major minor patch).include?(type)
  end

  # Computes the next version number.
  #
  # current_version - dotted version String, eg. "1.0.2"
  # type            - "major", "minor" or "patch"
  #
  # Returns the new "major.minor.patch" String. Any other type returns the
  # version unchanged.
  def bump(current_version, type)
    parts = current_version.split('.').map(&:to_i)
    # Pad short versions such as "1.2" with zeros so that every component can
    # be incremented
    major, minor, patch = (parts + [0, 0, 0]).first(3)

    case type
    when 'major' then "#{major + 1}.0.0"
    when 'minor' then "#{major}.#{minor + 1}.0"
    when 'patch' then "#{major}.#{minor}.#{patch + 1}"
    else "#{major}.#{minor}.#{patch}"
    end
  end

  # Rewrites lib/version.rb with the bumped version, then stages and commits
  # that file.
  def run
    old_version = HTMLHierarchyExtractorVersion.to_s
    new_version = bump(old_version, @type)

    file = File.expand_path('../lib/version.rb', File.dirname(__FILE__))
    File.write(file, File.read(file).gsub(old_version, new_version))

    # Arguments are passed as a list, so no shell is involved and a path with
    # spaces or metacharacters is safe. Only commit when staging succeeded.
    # stdout is discarded, as it was with the original backticks.
    message = "chore(bump): Version bump to #{new_version}"
    system('git', 'add', file, out: File::NULL) &&
      system('git', 'commit', '-m', message, out: File::NULL)
  end
end
|
47
|
-
BumpVersion.new(*ARGV).run
|
data/scripts/check_flay
DELETED
@@ -1,30 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
# Scores at or above this value make the check fail
MAX_SCORE = 45

report = `flay -s ./lib/`.split("\n")

# The first two lines of the report are a header, not scores
offenders = report.drop(2).each_with_object([]) do |line, found|
  parsed = /^ *(.*): (.*)/.match(line)
  next unless parsed

  score = parsed[1].to_f
  found << { score: score, file: parsed[2] } unless score < MAX_SCORE
end

exit 0 if offenders.empty?

puts 'Flay test failed:'
offenders.sort_by { |offender| offender[:score] }.each do |offender|
  puts "#{offender[:score]} / #{MAX_SCORE} in #{offender[:file]}"
end
exit 1
|
data/scripts/check_flog
DELETED
@@ -1,31 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
# Scores at or above this value make the check fail
MAX_SCORE = 45

report = `flog ./lib/`.split("\n")

# The first three lines of the report are a header, not scores
offenders = report.drop(3).each_with_object([]) do |line, found|
  parsed = /^ *(.*): (.*) (.*):[0-9]*/.match(line)
  next unless parsed

  score = parsed[1].to_f
  next if score < MAX_SCORE

  found << { score: score, method: parsed[2], file: parsed[3] }
end

exit 0 if offenders.empty?

puts 'Flog test failed:'
offenders.sort_by { |offender| offender[:score] }.each do |offender|
  puts "#{offender[:score]} / #{MAX_SCORE}: #{offender[:method]} in #{offender[:file]}"
end
exit 1
|
data/scripts/coverage
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
#!/usr/bin/env bash
|
2
|
-
|
3
|
-
# Succeed fast if we did not change any ruby file
if ! git status --short | grep -q '\.rb$'; then
  exit 0
fi

# Do not commit any focused or excluded tests.
# [[:blank:]] matches spaces and tabs; in an ERE `\t` is not a tab, so
# tab-indented specs were not detected before.
if grep --color -r 'spec' -E -e '^[[:blank:]]*(fit|fdescribe|xit|xdescribe)'; then
  echo '✘ You have focused and/or skipped tests'
  exit 1
fi

# Match style guide
./scripts/lint || exit 1
|
16
|
-
|