html-hierarchy-extractor 1.0.2 → 1.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile DELETED
@@ -1,58 +0,0 @@
# encoding: utf-8

# Rake tasks for the html-hierarchy-extractor gem: gem packaging through
# Jeweler, plus the test, spec and coverage tasks.

require 'rubygems'
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  # Report the Bundler failure, hint at the usual remedy and exit with the
  # status code Bundler attached to the error.
  $stderr.puts e.message
  $stderr.puts 'Run `bundle install` to install missing gems'
  exit e.status_code
end
require 'rake'

require 'jeweler'
require_relative 'lib/version'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification...
  # see http://guides.rubygems.org/specification-reference/ for more options
  gem.name = 'html-hierarchy-extractor'
  gem.version = HTMLHierarchyExtractorVersion.to_s
  gem.homepage = 'http://github.com/pixelastic/html-hierarchy-extractor'
  gem.license = 'MIT'
  gem.summary = 'Extract HTML hierarchy (headings and content) into a' \
                ' list of items'
  # Every continuation fragment starts with a space so that the
  # concatenated sentences stay separated ("contents. It is ...").
  gem.description = 'Take any arbitrary HTML as input and extract its' \
                    ' hierarchy as a list of items, including parents and' \
                    ' contents.' \
                    ' It is primarily intended to be used along with' \
                    ' Algolia, to improve the relevance of searching into' \
                    ' huge chunks of text'
  gem.email = 'tim@pixelastic.com'
  gem.authors = ['Tim Carry']
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

require 'rspec/core'
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new(:spec) do |spec|
  spec.rspec_opts = '--color --format documentation'
  spec.pattern = FileList['spec/**/*_spec.rb']
end
# Running `rake test` also runs the RSpec suite
task test: :spec

desc 'Code coverage detail'
task :coverage do
  # The COVERAGE variable is set before the spec task body is executed
  ENV['COVERAGE'] = 'true'
  Rake::Task['spec'].execute
end

task default: :test
data/VERSION DELETED
@@ -1 +0,0 @@
1
- 0.1.0
@@ -1,99 +0,0 @@
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
# -*- encoding: utf-8 -*-
# stub: html-hierarchy-extractor 1.0.2 ruby lib
#
# NOTE(review): the description below has the missing space after
# "contents." restored. The header says this file is generated from
# Jeweler::Tasks in the Rakefile, so the same correction has to be made
# there or regenerating will presumably bring the typo back.

Gem::Specification.new do |s|
  s.name = "html-hierarchy-extractor"
  s.version = "1.0.2"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.require_paths = ["lib"]
  s.authors = ["Tim Carry"]
  s.date = "2016-07-20"
  s.description = "Take any arbitrary HTML as input and extract its hierarchy as a list of items, including parents and contents. It is primarily intended to be used along with Algolia, to improve the relevance of searching into huge chunks of text"
  s.email = "tim@pixelastic.com"
  s.extra_rdoc_files = [
    "LICENSE.txt",
    "README.md"
  ]
  s.files = [
    ".coveralls.yml",
    ".document",
    ".rspec",
    ".rubocop.yml",
    ".travis.yml",
    "CONTRIBUTING.md",
    "Gemfile",
    "Guardfile",
    "LICENSE.txt",
    "README.md",
    "Rakefile",
    "VERSION",
    "html-hierarchy-extractor.gemspec",
    "lib/html-hierarchy-extractor.rb",
    "lib/version.rb",
    "scripts/bump_version",
    "scripts/check_flay",
    "scripts/check_flog",
    "scripts/coverage",
    "scripts/git_hooks/pre-commit",
    "scripts/git_hooks/pre-push",
    "scripts/lint",
    "scripts/release",
    "scripts/test",
    "scripts/test_ci",
    "scripts/watch",
    "spec/html_hierarchy_extractor_spec.rb",
    "spec/spec_helper.rb",
    "spec/spec_helper_simplecov.rb"
  ]
  s.homepage = "http://github.com/pixelastic/html-hierarchy-extractor"
  s.licenses = ["MIT"]
  s.rubygems_version = "2.4.8"
  s.summary = "Extract HTML hierarchy (headings and content) into a list of items"

  # The same dependency list is declared three times: split into runtime and
  # development dependencies when RubyGems is at least 1.2.0, and through
  # plain add_dependency in the two fallback branches.
  if s.respond_to? :specification_version then
    s.specification_version = 4

    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      s.add_runtime_dependency(%q<awesome_print>, ["~> 1.6"])
      s.add_runtime_dependency(%q<json>, ["~> 1.8"])
      s.add_runtime_dependency(%q<nokogiri>, ["~> 1.6"])
      s.add_development_dependency(%q<coveralls>, ["~> 0.8"])
      s.add_development_dependency(%q<flay>, ["~> 2.6"])
      s.add_development_dependency(%q<flog>, ["~> 4.3"])
      s.add_development_dependency(%q<guard-rspec>, ["~> 4.6"])
      s.add_development_dependency(%q<jeweler>, ["~> 2.0"])
      s.add_development_dependency(%q<rspec>, ["~> 3.0"])
      s.add_development_dependency(%q<rubocop>, ["~> 0.31"])
      s.add_development_dependency(%q<simplecov>, ["~> 0.10"])
    else
      s.add_dependency(%q<awesome_print>, ["~> 1.6"])
      s.add_dependency(%q<json>, ["~> 1.8"])
      s.add_dependency(%q<nokogiri>, ["~> 1.6"])
      s.add_dependency(%q<coveralls>, ["~> 0.8"])
      s.add_dependency(%q<flay>, ["~> 2.6"])
      s.add_dependency(%q<flog>, ["~> 4.3"])
      s.add_dependency(%q<guard-rspec>, ["~> 4.6"])
      s.add_dependency(%q<jeweler>, ["~> 2.0"])
      s.add_dependency(%q<rspec>, ["~> 3.0"])
      s.add_dependency(%q<rubocop>, ["~> 0.31"])
      s.add_dependency(%q<simplecov>, ["~> 0.10"])
    end
  else
    s.add_dependency(%q<awesome_print>, ["~> 1.6"])
    s.add_dependency(%q<json>, ["~> 1.8"])
    s.add_dependency(%q<nokogiri>, ["~> 1.6"])
    s.add_dependency(%q<coveralls>, ["~> 0.8"])
    s.add_dependency(%q<flay>, ["~> 2.6"])
    s.add_dependency(%q<flog>, ["~> 4.3"])
    s.add_dependency(%q<guard-rspec>, ["~> 4.6"])
    s.add_dependency(%q<jeweler>, ["~> 2.0"])
    s.add_dependency(%q<rspec>, ["~> 3.0"])
    s.add_dependency(%q<rubocop>, ["~> 0.31"])
    s.add_dependency(%q<simplecov>, ["~> 0.10"])
  end
end
@@ -1,144 +0,0 @@
1
- require 'nokogiri'
2
- require 'digest/md5'
3
-
# Extract content from an HTML page in the form of items with associated
# hierarchy data
class HTMLHierarchyExtractor
  # CSS selector matching every heading that takes part in the hierarchy
  HEADING_SELECTOR = 'h1,h2,h3,h4,h5,h6'.freeze
  # Deepest hierarchy level: h1 maps to lvl0, h6 maps to lvl5
  MAX_LEVEL = 5

  # input   - HTML source, handed to Nokogiri::HTML
  # options - Hash of settings merged over the defaults; :css_selector
  #           (default 'p') selects the nodes to extract
  def initialize(input, options: {})
    @dom = Nokogiri::HTML(input)
    @options = { css_selector: 'p' }.merge(options)
  end

  # Returns the outer HTML of a given node, stripped of surrounding
  # whitespace
  #
  # eg.
  # <p>foo</p> => <p>foo</p>
  def extract_html(node)
    node.to_s.strip
  end

  # Returns the content of a given node, as given by node.content
  #
  # eg.
  # <p>foo</p> => foo
  def extract_text(node)
    node.content
  end

  # Returns the lowercased tag name of a given node
  #
  # eg
  # <p>foo</p> => p
  def extract_tag_name(node)
    node.name.downcase
  end

  # Returns the anchor to the node, or nil when there is none. The node's
  # own name attribute wins over its id; failing both, the first descendant
  # carrying a name or id is used.
  #
  # eg.
  # <h1 name="anchor">Foo</h1> => anchor
  # <h1 id="anchor">Foo</h1> => anchor
  # <h1><a name="anchor">Foo</a></h1> => anchor
  def extract_anchor(node)
    anchor = node.attr('name') || node.attr('id')
    return anchor unless anchor.nil?

    # No anchor found directly in the header, search on children
    candidates = node.css('[name],[id]')
    return nil if candidates.empty?

    extract_anchor(candidates.first)
  end

  ##
  # Generate a unique identifier for the item
  #
  # The identifier is the md5 of the "key=value" pairs of the item, ordered
  # by key and joined with commas; nested hashes are replaced by their own
  # identifier.
  def uuid(item)
    pairs = item.keys.sort.map do |key|
      value = item[key]
      # We apply the method recursively on other hashes
      value = uuid(value) if value.is_a?(Hash)
      "#{key}=#{value}"
    end

    Digest::MD5.hexdigest(pairs.join(','))
  end

  ##
  # Get a relative numeric value of the importance of the heading
  #
  # heading_level is the zero-based level (0 for h1 ... 5 for h6), or nil
  # when no heading has been seen yet. Returns 100 for nil, then 90 for
  # level 0, 80 for level 1, and so on, 10 less per level.
  def heading_weight(heading_level)
    weight = 100
    return weight if heading_level.nil?
    weight - ((heading_level + 1) * 10)
  end

  # Walks headings and the nodes matching the :css_selector option in
  # document order and returns an Array of item Hashes with the keys
  # :html, :text, :tag_name, :hierarchy, :anchor, :node, :weight and :uuid.
  # Nodes whose text is empty are skipped.
  def extract
    css_selector = @options[:css_selector]
    # We select all nodes that match either the headings or the elements to
    # extract. This will allow us to loop over it in order it appears in the
    # DOM
    all_selector = "#{HEADING_SELECTOR},#{css_selector}"

    items = []
    current_hierarchy = empty_hierarchy
    current_position = 0 # Position of the extracted node among the items
    current_lvl = nil    # Current closest hierarchy level
    current_anchor = nil # Current closest anchor

    @dom.css(all_selector).each do |node|
      # If it's a heading, we update our current hierarchy
      if node.matches?(HEADING_SELECTOR)
        current_lvl = heading_level(node)
        current_hierarchy[:"lvl#{current_lvl}"] = extract_text(node)
        # Reset every deeper level. The range stops at MAX_LEVEL so that no
        # stray :lvl6 key is added to the hierarchy (and to the uuid input).
        ((current_lvl + 1)..MAX_LEVEL).each do |lvl|
          current_hierarchy[:"lvl#{lvl}"] = nil
        end
        # Update the anchor, if the new heading has one
        current_anchor = extract_anchor(node) || current_anchor
      end

      # Stop if node is not to be extracted
      next unless node.matches?(css_selector)

      # Stop if node is empty
      text = extract_text(node)
      next if text.empty?

      item = {
        html: extract_html(node),
        text: text,
        tag_name: extract_tag_name(node),
        hierarchy: current_hierarchy.clone,
        anchor: current_anchor,
        node: node,
        weight: {
          position: current_position,
          heading: heading_weight(current_lvl)
        }
      }
      item[:uuid] = uuid(item)
      items << item

      current_position += 1
    end

    items
  end

  private

  # Hierarchy with every level from lvl0 to lvl5 set to nil
  def empty_hierarchy
    (0..MAX_LEVEL).each_with_object({}) do |lvl, hierarchy|
      hierarchy[:"lvl#{lvl}"] = nil
    end
  end

  # Zero-based level of a heading node (h1 => 0 ... h6 => 5)
  def heading_level(node)
    extract_tag_name(node).sub(/\Ah/, '').to_i - 1
  end
end
@@ -1,6 +0,0 @@
# Expose gem version
class HTMLHierarchyExtractorVersion
  # Version number of the gem, as a frozen "major.minor.patch" String
  VERSION = '1.0.2'.freeze

  # Returns the version number as a String
  def self.to_s
    VERSION
  end
end
@@ -1,47 +0,0 @@
1
- #!/usr/bin/env ruby
2
- require_relative '../lib/version.rb'
3
-
# Simple script used to bump the version number
class BumpVersion
  # Accepted bump types
  BUMP_TYPES = %w(major minor patch).freeze

  # args - command-line arguments; the first one is the bump type.
  #
  # Prints an error on stderr and exits with status 1 when the type is not
  # one of major, minor or patch.
  def initialize(*args)
    @type = args[0]
    return if valid_type?(@type)

    warn "Invalid bump type: #{@type}"
    exit 1
  end

  # Returns true when type is one of the accepted bump types
  def valid_type?(type)
    BUMP_TYPES.include?(type)
  end

  # Returns current_version ("major.minor.patch") with the part named by
  # type incremented and the less significant parts reset to 0. Any other
  # type returns the version unchanged. Missing components count as 0, so
  # "1.2" is handled like "1.2.0".
  def bump(current_version, type)
    major, minor, patch = current_version.split('.').map(&:to_i)
    major ||= 0
    minor ||= 0
    patch ||= 0

    case type
    when 'major' then "#{major + 1}.0.0"
    when 'minor' then "#{major}.#{minor + 1}.0"
    when 'patch' then "#{major}.#{minor}.#{patch + 1}"
    else "#{major}.#{minor}.#{patch}"
    end
  end

  # Replaces the current version number with the bumped one in
  # lib/version.rb, then stages and commits that file. Aborts when one of
  # the git commands fails.
  def run
    old_version = HTMLHierarchyExtractorVersion.to_s
    new_version = bump(old_version, @type)

    file = File.expand_path('../lib/version.rb', File.dirname(__FILE__))
    File.write(file, File.read(file).gsub(old_version, new_version))

    # Arguments are passed as a list so that no shell gets to re-interpret
    # the path or the commit message.
    system('git', 'add', file) || abort("Unable to stage #{file}")
    message = "chore(bump): Version bump to #{new_version}"
    system('git', 'commit', '-m', message) || abort('Unable to commit')
  end
end
# Entry point: the command-line arguments are handed to BumpVersion, whose
# first argument is the bump type.
bumper = BumpVersion.new(*ARGV)
bumper.run
@@ -1,30 +0,0 @@
#!/usr/bin/env ruby

# Runs flay on ./lib/ and fails (exit status 1) when any reported entry has
# a score of MAX_SCORE or more.

MAX_SCORE = 45

report_lines = `flay -s ./lib/`.split("\n")

# The first two lines are the report header. Each remaining line is matched
# as "<score>: <file>"; lines that do not match are ignored.
candidates = report_lines.drop(2).map do |line|
  found = /^ *(.*): (.*)/.match(line)
  next unless found

  { score: found[1].to_f, file: found[2] }
end
offenders = candidates.compact.reject { |entry| entry[:score] < MAX_SCORE }

exit 0 if offenders.empty?

puts 'Flay test failed:'
offenders.sort_by { |entry| entry[:score] }.each do |entry|
  puts "#{entry[:score]} / #{MAX_SCORE} in #{entry[:file]}"
end
exit 1
@@ -1,31 +0,0 @@
#!/usr/bin/env ruby

# Runs flog on ./lib/ and fails (exit status 1) when any reported method has
# a score of MAX_SCORE or more.

MAX_SCORE = 45

report_lines = `flog ./lib/`.split("\n")

# The first three lines are the report header. Each remaining line is
# matched as "<score>: <method> <file>:<line>"; lines that do not match are
# ignored.
candidates = report_lines.drop(3).map do |line|
  found = /^ *(.*): (.*) (.*):[0-9]*/.match(line)
  next unless found

  { score: found[1].to_f, method: found[2], file: found[3] }
end
offenders = candidates.compact.reject { |entry| entry[:score] < MAX_SCORE }

exit 0 if offenders.empty?

puts 'Flog test failed:'
offenders.sort_by { |entry| entry[:score] }.each do |entry|
  puts "#{entry[:score]} / #{MAX_SCORE}: #{entry[:method]} in #{entry[:file]}"
end
exit 1
@@ -1,3 +0,0 @@
#!/usr/bin/env bash

# Run the RSpec suite with the COVERAGE environment variable set to 1.
export COVERAGE=1
bundle exec rspec
@@ -1,16 +0,0 @@
#!/usr/bin/env bash

# Checks run before a commit: refuse focused or skipped specs, then run the
# linter. Exits with status 1 as soon as one check fails.

# Succeed fast if we did not change any ruby file
if ! git status --short | grep -q '\.rb$'; then
  exit 0
fi

# Do not commit any focused or excluded tests.
# [[:blank:]] matches both spaces and tabs; "\t" is not a tab escape in an
# extended regular expression, so tab-indented lines used to slip through.
# The directory to search is given after the options for portability.
if grep --color -r -E -e '^[[:blank:]]*(fit|fdescribe|xit|xdescribe)' spec; then
  echo '✘ You have focused and/or skipped tests'
  exit 1
fi

# Match style guide
./scripts/lint || exit 1