algolia_html_extractor 2.2.0 → 2.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bbf8df27c69c4d6f2f16de4bd7cf18fcd703fb43
4
- data.tar.gz: a01708af7fe1a3c42d364a099e443ac05f6f8a75
3
+ metadata.gz: 49c3023566226660bc3508fd06b177d27d7ad331
4
+ data.tar.gz: 9c1c0df0b0217dae12946d69bea0b885a3477e38
5
5
  SHA512:
6
- metadata.gz: 9d9d8af70a4310d871a96fd34a789de3ce0df0ba4621cf237727fcc514dbbfb9fd3d26a35ae3df6fd9b6574752e290d4254bdea7f1622cadba99a07a6a870adf
7
- data.tar.gz: e74cc7ca6db7fddc84c903715a44c70df47fb27f303ee1635579b89f47269fab168e9933582fef73269ad0e24fdeae97caa5c1924c57a0553242c33407f7492c
6
+ metadata.gz: ccc601b3a3499fb7dcc25c366409ff007ad3660518d77de085aa2cc44fba89ef8d2fefd727c869e955a9a9762add797fb1b3aaf51f780cc4fb24d1a677c9ec87
7
+ data.tar.gz: 06e3b58f8074d40361294d4115ed35cf592667d2ffbf8d170cdce8384ea4a673827f566361002b42e0c8ab88453242758ba39dd44e740511f493a25770547bf8
@@ -3,20 +3,80 @@ require 'digest/md5'
3
3
 
4
4
  # Extract content from an HTML page in the form of items with associated
5
5
  # hierarchy data
6
- class AlgoliaHTMLExtractor
7
- def initialize(input, options: {})
8
- @dom = Nokogiri::HTML(input)
6
+ module AlgoliaHTMLExtractor
7
+ def self.run(input, options: {})
9
8
  default_options = {
10
9
  css_selector: 'p'
11
10
  }
12
- @options = default_options.merge(options)
11
+ options = default_options.merge(options)
12
+
13
+ heading_selector = 'h1,h2,h3,h4,h5,h6'
14
+ # We select all nodes that match either the headings or the elements to
15
+ # extract. This will allow us to loop over it in order it appears in the DOM
16
+ all_selector = "#{heading_selector},#{options[:css_selector]}"
17
+
18
+ items = []
19
+ current_hierarchy = {
20
+ lvl0: nil,
21
+ lvl1: nil,
22
+ lvl2: nil,
23
+ lvl3: nil,
24
+ lvl4: nil,
25
+ lvl5: nil
26
+ }
27
+ current_position = 0 # Position of the DOM node in the tree
28
+ current_lvl = nil # Current closest hierarchy level
29
+ current_anchor = nil # Current closest anchor
30
+
31
+ dom = Nokogiri::HTML(input)
32
+ dom.css(all_selector).each do |node|
33
+ # If it's a heading, we update our current hierarchy
34
+ if node.matches?(heading_selector)
35
+ # Which level heading is it?
36
+ current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
37
+ # Update this level, and set all the following ones to nil
38
+ current_hierarchy["lvl#{current_lvl}".to_sym] = extract_text(node)
39
+ (current_lvl + 1..6).each do |lvl|
40
+ current_hierarchy["lvl#{lvl}".to_sym] = nil
41
+ end
42
+ # Update the anchor, if the new heading has one
43
+ new_anchor = extract_anchor(node)
44
+ current_anchor = new_anchor if new_anchor
45
+ end
46
+
47
+ # Stop if node is not to be extracted
48
+ next unless node.matches?(options[:css_selector])
49
+
50
+ # Stop if node is empty
51
+ content = extract_text(node)
52
+ next if content.empty?
53
+
54
+ item = {
55
+ html: extract_html(node),
56
+ content: content,
57
+ tag_name: extract_tag_name(node),
58
+ hierarchy: current_hierarchy.clone,
59
+ anchor: current_anchor,
60
+ node: node,
61
+ weight: {
62
+ position: current_position,
63
+ heading: heading_weight(current_lvl)
64
+ }
65
+ }
66
+ item[:objectID] = uuid(item)
67
+ items << item
68
+
69
+ current_position += 1
70
+ end
71
+
72
+ items
13
73
  end
14
74
 
15
75
  # Returns the outer HTML of a given node
16
76
  #
17
77
  # eg.
18
78
  # <p>foo</p> => <p>foo</p>
19
- def extract_html(node)
79
+ def self.extract_html(node)
20
80
  node.to_s.strip
21
81
  end
22
82
 
@@ -24,7 +84,7 @@ class AlgoliaHTMLExtractor
24
84
  #
25
85
  # eg.
26
86
  # <p>foo</p> => foo
27
- def extract_text(node)
87
+ def self.extract_text(node)
28
88
  node.content
29
89
  end
30
90
 
@@ -32,7 +92,7 @@ class AlgoliaHTMLExtractor
32
92
  #
33
93
  # eg
34
94
  # <p>foo</p> => p
35
- def extract_tag_name(node)
95
+ def self.extract_tag_name(node)
36
96
  node.name.downcase
37
97
  end
38
98
 
@@ -42,7 +102,7 @@ class AlgoliaHTMLExtractor
42
102
  # <h1 name="anchor">Foo</h1> => anchor
43
103
  # <h1 id="anchor">Foo</h1> => anchor
44
104
  # <h1><a name="anchor">Foo</a></h1> => anchor
45
- def extract_anchor(node)
105
+ def self.extract_anchor(node)
46
106
  anchor = node.attr('name') || node.attr('id') || nil
47
107
  return anchor unless anchor.nil?
48
108
 
@@ -55,7 +115,10 @@ class AlgoliaHTMLExtractor
55
115
 
56
116
  ##
57
117
  # Generate a unique identifier for the item
58
- def uuid(item)
118
+ def self.uuid(item)
119
+ # We don't use the objectID as part of the hash algorithm
120
+
121
+ item.delete(:objectID)
59
122
  # We first get all the keys of the object, sorted alphabetically...
60
123
  ordered_keys = item.keys.sort
61
124
 
@@ -74,71 +137,9 @@ class AlgoliaHTMLExtractor
74
137
  ##
75
138
  # Get a relative numeric value of the importance of the heading
76
139
  # 100 for top level, then -10 per heading
77
- def heading_weight(heading_level)
140
+ def self.heading_weight(heading_level)
78
141
  weight = 100
79
142
  return weight if heading_level.nil?
80
143
  weight - ((heading_level + 1) * 10)
81
144
  end
82
-
83
- def extract
84
- heading_selector = 'h1,h2,h3,h4,h5,h6'
85
- # We select all nodes that match either the headings or the elements to
86
- # extract. This will allow us to loop over it in order it appears in the DOM
87
- all_selector = "#{heading_selector},#{@options[:css_selector]}"
88
-
89
- items = []
90
- current_hierarchy = {
91
- lvl0: nil,
92
- lvl1: nil,
93
- lvl2: nil,
94
- lvl3: nil,
95
- lvl4: nil,
96
- lvl5: nil
97
- }
98
- current_position = 0 # Position of the DOM node in the tree
99
- current_lvl = nil # Current closest hierarchy level
100
- current_anchor = nil # Current closest anchor
101
-
102
- @dom.css(all_selector).each do |node|
103
- # If it's a heading, we update our current hierarchy
104
- if node.matches?(heading_selector)
105
- # Which level heading is it?
106
- current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
107
- # Update this level, and set all the following ones to nil
108
- current_hierarchy["lvl#{current_lvl}".to_sym] = extract_text(node)
109
- (current_lvl + 1..6).each do |lvl|
110
- current_hierarchy["lvl#{lvl}".to_sym] = nil
111
- end
112
- # Update the anchor, if the new heading has one
113
- new_anchor = extract_anchor(node)
114
- current_anchor = new_anchor if new_anchor
115
- end
116
-
117
- # Stop if node is not to be extracted
118
- next unless node.matches?(@options[:css_selector])
119
-
120
- # Stop if node is empty
121
- content = extract_text(node)
122
- next if content.empty?
123
-
124
- item = {
125
- html: extract_html(node),
126
- content: content,
127
- tag_name: extract_tag_name(node),
128
- hierarchy: current_hierarchy.clone,
129
- anchor: current_anchor,
130
- node: node,
131
- weight: {
132
- position: current_position,
133
- heading: heading_weight(current_lvl)
134
- }
135
- }
136
- item[:objectID] = uuid(item)
137
- items << item
138
-
139
- current_position += 1
140
- end
141
-
142
- items
143
- end
144
145
  end
@@ -1,5 +1,5 @@
1
1
  # Expose gem version
2
2
  # rubocop:disable Style/SingleLineMethods
3
3
  class AlgoliaHTMLExtractorVersion
4
- def self.to_s; '2.2.0' end
4
+ def self.to_s; '2.2.1' end
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: algolia_html_extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.2.0
4
+ version: 2.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tim Carry