algolia_html_extractor 2.2.0 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bbf8df27c69c4d6f2f16de4bd7cf18fcd703fb43
4
- data.tar.gz: a01708af7fe1a3c42d364a099e443ac05f6f8a75
3
+ metadata.gz: 49c3023566226660bc3508fd06b177d27d7ad331
4
+ data.tar.gz: 9c1c0df0b0217dae12946d69bea0b885a3477e38
5
5
  SHA512:
6
- metadata.gz: 9d9d8af70a4310d871a96fd34a789de3ce0df0ba4621cf237727fcc514dbbfb9fd3d26a35ae3df6fd9b6574752e290d4254bdea7f1622cadba99a07a6a870adf
7
- data.tar.gz: e74cc7ca6db7fddc84c903715a44c70df47fb27f303ee1635579b89f47269fab168e9933582fef73269ad0e24fdeae97caa5c1924c57a0553242c33407f7492c
6
+ metadata.gz: ccc601b3a3499fb7dcc25c366409ff007ad3660518d77de085aa2cc44fba89ef8d2fefd727c869e955a9a9762add797fb1b3aaf51f780cc4fb24d1a677c9ec87
7
+ data.tar.gz: 06e3b58f8074d40361294d4115ed35cf592667d2ffbf8d170cdce8384ea4a673827f566361002b42e0c8ab88453242758ba39dd44e740511f493a25770547bf8
@@ -3,20 +3,80 @@ require 'digest/md5'
3
3
 
4
4
  # Extract content from an HTML page in the form of items with associated
5
5
  # hierarchy data
6
- class AlgoliaHTMLExtractor
7
- def initialize(input, options: {})
8
- @dom = Nokogiri::HTML(input)
6
+ module AlgoliaHTMLExtractor
7
+ def self.run(input, options: {})
9
8
  default_options = {
10
9
  css_selector: 'p'
11
10
  }
12
- @options = default_options.merge(options)
11
+ options = default_options.merge(options)
12
+
13
+ heading_selector = 'h1,h2,h3,h4,h5,h6'
14
+ # We select all nodes that match either the headings or the elements to
15
+ # extract. This will allow us to loop over it in order it appears in the DOM
16
+ all_selector = "#{heading_selector},#{options[:css_selector]}"
17
+
18
+ items = []
19
+ current_hierarchy = {
20
+ lvl0: nil,
21
+ lvl1: nil,
22
+ lvl2: nil,
23
+ lvl3: nil,
24
+ lvl4: nil,
25
+ lvl5: nil
26
+ }
27
+ current_position = 0 # Position of the DOM node in the tree
28
+ current_lvl = nil # Current closest hierarchy level
29
+ current_anchor = nil # Current closest anchor
30
+
31
+ dom = Nokogiri::HTML(input)
32
+ dom.css(all_selector).each do |node|
33
+ # If it's a heading, we update our current hierarchy
34
+ if node.matches?(heading_selector)
35
+ # Which level heading is it?
36
+ current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
37
+ # Update this level, and set all the following ones to nil
38
+ current_hierarchy["lvl#{current_lvl}".to_sym] = extract_text(node)
39
+ (current_lvl + 1..6).each do |lvl|
40
+ current_hierarchy["lvl#{lvl}".to_sym] = nil
41
+ end
42
+ # Update the anchor, if the new heading has one
43
+ new_anchor = extract_anchor(node)
44
+ current_anchor = new_anchor if new_anchor
45
+ end
46
+
47
+ # Stop if node is not to be extracted
48
+ next unless node.matches?(options[:css_selector])
49
+
50
+ # Stop if node is empty
51
+ content = extract_text(node)
52
+ next if content.empty?
53
+
54
+ item = {
55
+ html: extract_html(node),
56
+ content: content,
57
+ tag_name: extract_tag_name(node),
58
+ hierarchy: current_hierarchy.clone,
59
+ anchor: current_anchor,
60
+ node: node,
61
+ weight: {
62
+ position: current_position,
63
+ heading: heading_weight(current_lvl)
64
+ }
65
+ }
66
+ item[:objectID] = uuid(item)
67
+ items << item
68
+
69
+ current_position += 1
70
+ end
71
+
72
+ items
13
73
  end
14
74
 
15
75
  # Returns the outer HTML of a given node
16
76
  #
17
77
  # eg.
18
78
  # <p>foo</p> => <p>foo</p>
19
- def extract_html(node)
79
+ def self.extract_html(node)
20
80
  node.to_s.strip
21
81
  end
22
82
 
@@ -24,7 +84,7 @@ class AlgoliaHTMLExtractor
24
84
  #
25
85
  # eg.
26
86
  # <p>foo</p> => foo
27
- def extract_text(node)
87
+ def self.extract_text(node)
28
88
  node.content
29
89
  end
30
90
 
@@ -32,7 +92,7 @@ class AlgoliaHTMLExtractor
32
92
  #
33
93
  # eg
34
94
  # <p>foo</p> => p
35
- def extract_tag_name(node)
95
+ def self.extract_tag_name(node)
36
96
  node.name.downcase
37
97
  end
38
98
 
@@ -42,7 +102,7 @@ class AlgoliaHTMLExtractor
42
102
  # <h1 name="anchor">Foo</h1> => anchor
43
103
  # <h1 id="anchor">Foo</h1> => anchor
44
104
  # <h1><a name="anchor">Foo</a></h1> => anchor
45
- def extract_anchor(node)
105
+ def self.extract_anchor(node)
46
106
  anchor = node.attr('name') || node.attr('id') || nil
47
107
  return anchor unless anchor.nil?
48
108
 
@@ -55,7 +115,10 @@ class AlgoliaHTMLExtractor
55
115
 
56
116
  ##
57
117
  # Generate a unique identifier for the item
58
- def uuid(item)
118
+ def self.uuid(item)
119
+ # We don't use the objectID as part of the hash algorithm
120
+
121
+ item.delete(:objectID)
59
122
  # We first get all the keys of the object, sorted alphabetically...
60
123
  ordered_keys = item.keys.sort
61
124
 
@@ -74,71 +137,9 @@ class AlgoliaHTMLExtractor
74
137
  ##
75
138
  # Get a relative numeric value of the importance of the heading
76
139
  # 100 for top level, then -10 per heading
77
- def heading_weight(heading_level)
140
+ def self.heading_weight(heading_level)
78
141
  weight = 100
79
142
  return weight if heading_level.nil?
80
143
  weight - ((heading_level + 1) * 10)
81
144
  end
82
-
83
- def extract
84
- heading_selector = 'h1,h2,h3,h4,h5,h6'
85
- # We select all nodes that match either the headings or the elements to
86
- # extract. This will allow us to loop over it in order it appears in the DOM
87
- all_selector = "#{heading_selector},#{@options[:css_selector]}"
88
-
89
- items = []
90
- current_hierarchy = {
91
- lvl0: nil,
92
- lvl1: nil,
93
- lvl2: nil,
94
- lvl3: nil,
95
- lvl4: nil,
96
- lvl5: nil
97
- }
98
- current_position = 0 # Position of the DOM node in the tree
99
- current_lvl = nil # Current closest hierarchy level
100
- current_anchor = nil # Current closest anchor
101
-
102
- @dom.css(all_selector).each do |node|
103
- # If it's a heading, we update our current hierarchy
104
- if node.matches?(heading_selector)
105
- # Which level heading is it?
106
- current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
107
- # Update this level, and set all the following ones to nil
108
- current_hierarchy["lvl#{current_lvl}".to_sym] = extract_text(node)
109
- (current_lvl + 1..6).each do |lvl|
110
- current_hierarchy["lvl#{lvl}".to_sym] = nil
111
- end
112
- # Update the anchor, if the new heading has one
113
- new_anchor = extract_anchor(node)
114
- current_anchor = new_anchor if new_anchor
115
- end
116
-
117
- # Stop if node is not to be extracted
118
- next unless node.matches?(@options[:css_selector])
119
-
120
- # Stop if node is empty
121
- content = extract_text(node)
122
- next if content.empty?
123
-
124
- item = {
125
- html: extract_html(node),
126
- content: content,
127
- tag_name: extract_tag_name(node),
128
- hierarchy: current_hierarchy.clone,
129
- anchor: current_anchor,
130
- node: node,
131
- weight: {
132
- position: current_position,
133
- heading: heading_weight(current_lvl)
134
- }
135
- }
136
- item[:objectID] = uuid(item)
137
- items << item
138
-
139
- current_position += 1
140
- end
141
-
142
- items
143
- end
144
145
  end
@@ -1,5 +1,5 @@
1
1
  # Expose gem version
2
2
  # rubocop:disable Style/SingleLineMethods
3
3
  class AlgoliaHTMLExtractorVersion
4
- def self.to_s; '2.2.0' end
4
+ def self.to_s; '2.2.1' end
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: algolia_html_extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.2.0
4
+ version: 2.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tim Carry