html-hierarchy-extractor 1.0.2 → 1.0.9

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile DELETED
@@ -1,58 +0,0 @@
1
- # encoding: utf-8
2
-
3
- require 'rubygems'
4
- require 'bundler'
5
- begin
6
- Bundler.setup(:default, :development)
7
- rescue Bundler::BundlerError => e
8
- $stderr.puts e.message
9
- $stderr.puts 'Run `bundle install` to install missing gems'
10
- exit e.status_code
11
- end
12
- require 'rake'
13
-
14
- require 'jeweler'
15
- require_relative 'lib/version'
16
# Packaging metadata handed to Jeweler. The version comes from
# HTMLHierarchyExtractorVersion (lib/version.rb) so it is declared in a
# single place.
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification...
  # see http://guides.rubygems.org/specification-reference/ for more options
  gem.name = 'html-hierarchy-extractor'
  gem.version = HTMLHierarchyExtractorVersion.to_s
  gem.homepage = 'http://github.com/pixelastic/html-hierarchy-extractor'
  gem.license = 'MIT'
  gem.summary = 'Extract HTML hierarchy (headings and content) into a' \
                ' list of items'
  # Every continuation fragment starts with a space so that the pieces are
  # separated once concatenated (including between the two sentences).
  gem.description = 'Take any arbitrary HTML as input and extract its' \
                    ' hierarchy as a list of items, including parents and' \
                    ' contents.' \
                    ' It is primarily intended to be used along with Algolia,' \
                    ' to improve the relevance of searching into huge chunks' \
                    ' of text'
  gem.email = 'tim@pixelastic.com'
  gem.authors = ['Tim Carry']
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new
36
-
37
# Test::Unit style tests living under test/
require 'rake/testtask'
Rake::TestTask.new(:test) do |t|
  t.libs.push('lib', 'test')
  t.pattern = 'test/**/test_*.rb'
  t.verbose = true
end

# RSpec suite living under spec/
require 'rspec/core'
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new(:spec) do |t|
  t.pattern = FileList['spec/**/*_spec.rb']
  t.rspec_opts = '--color --format documentation'
end
# Running the tests also runs the specs
task test: :spec

desc 'Code coverage detail'
task :coverage do
  # The COVERAGE environment variable is set before the specs are executed
  ENV['COVERAGE'] = 'true'
  Rake::Task['spec'].execute
end

task default: :test
data/VERSION DELETED
@@ -1 +0,0 @@
1
- 0.1.0
@@ -1,99 +0,0 @@
1
- # Generated by jeweler
2
- # DO NOT EDIT THIS FILE DIRECTLY
3
- # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
- # -*- encoding: utf-8 -*-
5
- # stub: html-hierarchy-extractor 1.0.2 ruby lib
6
-
7
# Gem specification for html-hierarchy-extractor 1.0.2.
# NOTE: this file is generated from Jeweler::Tasks in the Rakefile; keep both
# in sync, otherwise the next `rake gemspec` overwrites manual changes.
Gem::Specification.new do |s|
  s.name = "html-hierarchy-extractor"
  s.version = "1.0.2"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.require_paths = ["lib"]
  s.authors = ["Tim Carry"]
  s.date = "2016-07-20"
  # The two sentences are separated by a space ("contents. It is ...")
  s.description = "Take any arbitrary HTML as input and extract its hierarchy as a list of items, including parents and contents. It is primarily intended to be used along with Algolia, to improve the relevance of searching into huge chunks of text"
  s.email = "tim@pixelastic.com"
  s.extra_rdoc_files = [
    "LICENSE.txt",
    "README.md"
  ]
  s.files = [
    ".coveralls.yml",
    ".document",
    ".rspec",
    ".rubocop.yml",
    ".travis.yml",
    "CONTRIBUTING.md",
    "Gemfile",
    "Guardfile",
    "LICENSE.txt",
    "README.md",
    "Rakefile",
    "VERSION",
    "html-hierarchy-extractor.gemspec",
    "lib/html-hierarchy-extractor.rb",
    "lib/version.rb",
    "scripts/bump_version",
    "scripts/check_flay",
    "scripts/check_flog",
    "scripts/coverage",
    "scripts/git_hooks/pre-commit",
    "scripts/git_hooks/pre-push",
    "scripts/lint",
    "scripts/release",
    "scripts/test",
    "scripts/test_ci",
    "scripts/watch",
    "spec/html_hierarchy_extractor_spec.rb",
    "spec/spec_helper.rb",
    "spec/spec_helper_simplecov.rb"
  ]
  s.homepage = "http://github.com/pixelastic/html-hierarchy-extractor"
  s.licenses = ["MIT"]
  s.rubygems_version = "2.4.8"
  s.summary = "Extract HTML hierarchy (headings and content) into a list of items"

  # Single source of truth for the dependency lists (name => requirement)
  runtime_dependencies = {
    "awesome_print" => "~> 1.6",
    "json" => "~> 1.8",
    "nokogiri" => "~> 1.6"
  }
  development_dependencies = {
    "coveralls" => "~> 0.8",
    "flay" => "~> 2.6",
    "flog" => "~> 4.3",
    "guard-rspec" => "~> 4.6",
    "jeweler" => "~> 2.0",
    "rspec" => "~> 3.0",
    "rubocop" => "~> 0.31",
    "simplecov" => "~> 0.10"
  }

  # Typed (runtime/development) dependencies are only declared when the
  # specification supports specification_version and RubyGems is >= 1.2.0;
  # otherwise everything is registered through the generic add_dependency.
  versioned_spec = s.respond_to?(:specification_version)
  s.specification_version = 4 if versioned_spec

  if versioned_spec && Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')
    runtime_dependencies.each do |name, requirement|
      s.add_runtime_dependency(name, [requirement])
    end
    development_dependencies.each do |name, requirement|
      s.add_development_dependency(name, [requirement])
    end
  else
    runtime_dependencies.merge(development_dependencies).each do |name, requirement|
      s.add_dependency(name, [requirement])
    end
  end
end
99
-
@@ -1,144 +0,0 @@
1
- require 'nokogiri'
2
- require 'digest/md5'
3
-
4
- # Extract content from an HTML page in the form of items with associated
5
- # hierarchy data
6
class HTMLHierarchyExtractor
  # Selector matching every heading tag taken into account in the hierarchy
  HEADING_SELECTOR = 'h1,h2,h3,h4,h5,h6'.freeze
  # Hierarchy keys, ordered from h1 (lvl0) down to h6 (lvl5)
  HIERARCHY_LEVELS = %i(lvl0 lvl1 lvl2 lvl3 lvl4 lvl5).freeze

  # input   - HTML source, parsed with Nokogiri::HTML
  # options - Hash merged over the defaults. :css_selector (default 'p')
  #           selects the nodes that #extract returns as items
  def initialize(input, options: {})
    @dom = Nokogiri::HTML(input)
    default_options = {
      css_selector: 'p'
    }
    @options = default_options.merge(options)
  end

  # Returns the outer HTML of a given node
  #
  # eg.
  # <p>foo</p> => <p>foo</p>
  def extract_html(node)
    node.to_s.strip
  end

  # Returns the text content of a given node
  #
  # eg.
  # <p>foo</p> => foo
  def extract_text(node)
    node.content
  end

  # Returns the lowercased tag name of a given node
  #
  # eg
  # <p>foo</p> => p
  def extract_tag_name(node)
    node.name.downcase
  end

  # Returns the anchor to the node, or nil when none is found
  #
  # eg.
  # <h1 name="anchor">Foo</h1> => anchor
  # <h1 id="anchor">Foo</h1> => anchor
  # <h1><a name="anchor">Foo</a></h1> => anchor
  def extract_anchor(node)
    anchor = node.attr('name') || node.attr('id')
    return anchor unless anchor.nil?

    # No anchor found directly on the node, use the first descendant that
    # carries a name or an id
    subelement = node.css('[name],[id]')
    return extract_anchor(subelement[0]) unless subelement.empty?

    nil
  end

  ##
  # Generate a unique identifier for the item
  #
  # The identifier is the MD5 hex digest of the "key=value" pairs of the
  # hash, sorted by key and joined with commas. Nested hashes are replaced by
  # their own identifier first.
  def uuid(item)
    ordered_array = item.keys.sort.map do |key|
      value = item[key]
      # We apply the method recursively on other hashes
      value = uuid(value) if value.is_a?(Hash)
      "#{key}=#{value}"
    end

    Digest::MD5.hexdigest(ordered_array.join(','))
  end

  ##
  # Get a relative numeric value of the importance of the heading
  #
  # Returns 100 when heading_level is nil (no heading met yet), otherwise 10
  # less per level: 90 for level 0 (h1), 80 for level 1 (h2), ...
  def heading_weight(heading_level)
    weight = 100
    return weight if heading_level.nil?
    weight - ((heading_level + 1) * 10)
  end

  # Returns the list of items found in the document, in DOM order.
  #
  # Every non-empty node matching the :css_selector option yields a hash with
  # the keys :html, :text, :tag_name, :hierarchy (text of the closest
  # headings, lvl0..lvl5), :anchor (closest anchor found on a previous
  # heading), :node, :weight (:position and :heading) and :uuid.
  def extract
    # We select all nodes that match either the headings or the elements to
    # extract. This will allow us to loop over it in order it appears in the DOM
    all_selector = "#{HEADING_SELECTOR},#{@options[:css_selector]}"

    items = []
    current_hierarchy = Hash[HIERARCHY_LEVELS.map { |key| [key, nil] }]
    current_position = 0 # Position of the extracted node among the items
    current_lvl = nil # Current closest hierarchy level
    current_anchor = nil # Current closest anchor

    @dom.css(all_selector).each do |node|
      # If it's a heading, we update our current hierarchy
      if node.matches?(HEADING_SELECTOR)
        # Which level heading is it? (h1 => 0 ... h6 => 5)
        current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
        current_hierarchy[HIERARCHY_LEVELS[current_lvl]] = extract_text(node)
        # Reset the deeper levels only; the hash must keep exactly the
        # lvl0..lvl5 keys so that every item has the same hierarchy shape
        HIERARCHY_LEVELS.drop(current_lvl + 1).each do |key|
          current_hierarchy[key] = nil
        end
        # Update the anchor, if the new heading has one
        new_anchor = extract_anchor(node)
        current_anchor = new_anchor if new_anchor
      end

      # Stop if node is not to be extracted
      next unless node.matches?(@options[:css_selector])

      # Stop if node is empty
      text = extract_text(node)
      next if text.empty?

      item = {
        html: extract_html(node),
        text: text,
        tag_name: extract_tag_name(node),
        # Snapshot of the hierarchy, later headings must not alter this item
        hierarchy: current_hierarchy.clone,
        anchor: current_anchor,
        node: node,
        weight: {
          position: current_position,
          heading: heading_weight(current_lvl)
        }
      }
      item[:uuid] = uuid(item)
      items << item

      current_position += 1
    end

    items
  end
end
@@ -1,6 +0,0 @@
1
# Expose gem version
class HTMLHierarchyExtractorVersion
  class << self
    # Current version of the gem, as a "major.minor.patch" string
    def to_s
      '1.0.2'
    end
  end
end
@@ -1,47 +0,0 @@
1
- #!/usr/bin/env ruby
2
- require_relative '../lib/version.rb'
3
-
4
- # Simple script used to bump the version number
5
class BumpVersion
  # Accepted values for the bump type given on the command line
  BUMP_TYPES = %w(major minor patch).freeze

  # args - command line arguments; the first one is the bump type.
  # Prints an error on stderr and exits with status 1 when the type is not
  # one of BUMP_TYPES.
  def initialize(*args)
    @type = args[0]
    return if valid_type?(@type)

    warn "Invalid bump type: #{@type}"
    exit 1
  end

  # Returns true when type is 'major', 'minor' or 'patch'
  def valid_type?(type)
    BUMP_TYPES.include?(type)
  end

  # Returns the version following current_version for the given bump type.
  #
  # current_version - "major.minor.patch" string; missing components are
  #                   treated as 0 (eg. "1.0" is read as "1.0.0")
  # type            - 'major', 'minor' or 'patch'; any other value leaves
  #                   the numbers untouched
  def bump(current_version, type)
    parts = current_version.split('.').map(&:to_i)
    major, minor, patch = (parts + [0, 0, 0]).first(3)

    case type
    when 'major'
      major += 1
      minor = 0
      patch = 0
    when 'minor'
      minor += 1
      patch = 0
    when 'patch'
      patch += 1
    end

    "#{major}.#{minor}.#{patch}"
  end

  # Rewrites lib/version.rb with the bumped version, then stages and commits
  # that file.
  def run
    old_version = HTMLHierarchyExtractorVersion.to_s
    new_version = bump(old_version, @type)

    file = File.expand_path('../lib/version.rb', File.dirname(__FILE__))
    File.write(file, File.read(file).gsub(old_version, new_version))

    # Arguments are passed as a list so that no shell is involved: a path
    # containing spaces or special characters is handed to git untouched.
    # The commit is only attempted when the file was staged successfully.
    system('git', 'add', file) &&
      system('git', 'commit', '-m', "chore(bump): Version bump to #{new_version}")
  end
end
47
- BumpVersion.new(*ARGV).run
@@ -1,30 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
MAX_SCORE = 45

report = `flay -s ./lib/`.split("\n")

# The first two lines of the report are a header. Every remaining line shaped
# like "<score>: <file>" is kept when its score is not below MAX_SCORE.
offenders = report.drop(2).each_with_object([]) do |row, found|
  parsed = /^ *(.*): (.*)/.match(row)
  next unless parsed

  value = parsed[1].to_f
  found << { score: value, file: parsed[2] } unless value < MAX_SCORE
end

# Nothing above the threshold: success
exit 0 if offenders.empty?

puts 'Flay test failed:'
offenders.sort_by { |entry| entry[:score] }.each do |entry|
  puts "#{entry[:score]} / #{MAX_SCORE} in #{entry[:file]}"
end
exit 1
@@ -1,31 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
MAX_SCORE = 45

report = `flog ./lib/`.split("\n")

# The first three lines of the report are a header. Every remaining line
# shaped like "<score>: <method> <file>:<line>" is kept when its score is not
# below MAX_SCORE.
offenders = report.drop(3).each_with_object([]) do |row, found|
  parsed = /^ *(.*): (.*) (.*):[0-9]*/.match(row)
  next unless parsed

  value = parsed[1].to_f
  next if value < MAX_SCORE

  found << { score: value, method: parsed[2], file: parsed[3] }
end

# Nothing above the threshold: success
exit 0 if offenders.empty?

puts 'Flog test failed:'
offenders.sort_by { |entry| entry[:score] }.each do |entry|
  puts "#{entry[:score]} / #{MAX_SCORE}: #{entry[:method]} in #{entry[:file]}"
end
exit 1
@@ -1,3 +0,0 @@
1
#!/usr/bin/env bash

# Run the spec suite with the COVERAGE environment variable set
export COVERAGE=1
bundle exec rspec
@@ -1,16 +0,0 @@
1
#!/usr/bin/env bash
# Pre-commit hook: refuses the commit when focused/skipped specs are present
# or when the linter fails. Exits 0 right away when no Ruby file is listed by
# `git status`.

# Succeed fast if we did not change any ruby file
if ! git status --short | grep -q '\.rb$'; then
  exit 0
fi

# Do not commit any focused or excluded tests
# [[:space:]] is used for the indentation because `\t` is not a tab escape in
# an extended regular expression, so tab-indented lines would go unnoticed.
if grep --color -r 'spec' -E -e '^[[:space:]]*(fit|fdescribe|xit|xdescribe)'; then
  echo '✘ You have focused and/or skipped tests'
  exit 1
fi

# Match style guide
./scripts/lint || exit 1
16
-