RubyGems - algolia_html_extractor - Versions diffs - 2.2.0 → 2.2.1 - Mend

algolia_html_extractor 2.2.0 → 2.2.1

Files changed (4)

checksums.yaml +4 -4
data/lib/algolia_html_extractor.rb +73 -72
data/lib/version.rb +1 -1
metadata +1 -1

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: bbf8df27c69c4d6f2f16de4bd7cf18fcd703fb43
-  data.tar.gz: a01708af7fe1a3c42d364a099e443ac05f6f8a75
+  metadata.gz: 49c3023566226660bc3508fd06b177d27d7ad331
+  data.tar.gz: 9c1c0df0b0217dae12946d69bea0b885a3477e38
 SHA512:
-  metadata.gz: 9d9d8af70a4310d871a96fd34a789de3ce0df0ba4621cf237727fcc514dbbfb9fd3d26a35ae3df6fd9b6574752e290d4254bdea7f1622cadba99a07a6a870adf
-  data.tar.gz: e74cc7ca6db7fddc84c903715a44c70df47fb27f303ee1635579b89f47269fab168e9933582fef73269ad0e24fdeae97caa5c1924c57a0553242c33407f7492c
+  metadata.gz: ccc601b3a3499fb7dcc25c366409ff007ad3660518d77de085aa2cc44fba89ef8d2fefd727c869e955a9a9762add797fb1b3aaf51f780cc4fb24d1a677c9ec87
+  data.tar.gz: 06e3b58f8074d40361294d4115ed35cf592667d2ffbf8d170cdce8384ea4a673827f566361002b42e0c8ab88453242758ba39dd44e740511f493a25770547bf8

data/lib/algolia_html_extractor.rb CHANGED

@@ -3,20 +3,80 @@ require 'digest/md5'
 # Extract content from an HTML page in the form of items with associated
 # hierarchy data
-class AlgoliaHTMLExtractor
-  def initialize(input, options: {})
-    @dom = Nokogiri::HTML(input)
+module AlgoliaHTMLExtractor
+  def self.run(input, options: {})
     default_options = {
       css_selector: 'p'
     }
-    @options = default_options.merge(options)
+    options = default_options.merge(options)
+    heading_selector = 'h1,h2,h3,h4,h5,h6'
+    # We select all nodes that match either the headings or the elements to
+    # extract. This will allow us to loop over it in order it appears in the DOM
+    all_selector = "#{heading_selector},#{options[:css_selector]}"
+    items = []
+    current_hierarchy = {
+      lvl0: nil,
+      lvl1: nil,
+      lvl2: nil,
+      lvl3: nil,
+      lvl4: nil,
+      lvl5: nil
+    }
+    current_position = 0 # Position of the DOM node in the tree
+    current_lvl = nil # Current closest hierarchy level
+    current_anchor = nil # Current closest anchor
+    dom = Nokogiri::HTML(input)
+    dom.css(all_selector).each do |node|
+      # If it's a heading, we update our current hierarchy
+      if node.matches?(heading_selector)
+        # Which level heading is it?
+        current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
+        # Update this level, and set all the following ones to nil
+        current_hierarchy["lvl#{current_lvl}".to_sym] = extract_text(node)
+        (current_lvl + 1..6).each do |lvl|
+          current_hierarchy["lvl#{lvl}".to_sym] = nil
+        end
+        # Update the anchor, if the new heading has one
+        new_anchor = extract_anchor(node)
+        current_anchor = new_anchor if new_anchor
+      end
+      # Stop if node is not to be extracted
+      next unless node.matches?(options[:css_selector])
+      # Stop if node is empty
+      content = extract_text(node)
+      next if content.empty?
+      item = {
+        html: extract_html(node),
+        content: content,
+        tag_name: extract_tag_name(node),
+        hierarchy: current_hierarchy.clone,
+        anchor: current_anchor,
+        node: node,
+        weight: {
+          position: current_position,
+          heading: heading_weight(current_lvl)
+        }
+      }
+      item[:objectID] = uuid(item)
+      items << item
+      current_position += 1
+    end
+    items
   end
   # Returns the outer HTML of a given node
   #
   # eg.
   # <p>foo</p> => <p>foo</p>
-  def extract_html(node)
+  def self.extract_html(node)
     node.to_s.strip
   end
@@ -24,7 +84,7 @@ class AlgoliaHTMLExtractor
   #
   # eg.
   # <p>foo</p> => foo
-  def extract_text(node)
+  def self.extract_text(node)
     node.content
   end
@@ -32,7 +92,7 @@ class AlgoliaHTMLExtractor
   #
   # eg
   # <p>foo</p> => p
-  def extract_tag_name(node)
+  def self.extract_tag_name(node)
     node.name.downcase
   end
@@ -42,7 +102,7 @@ class AlgoliaHTMLExtractor
   # <h1 name="anchor">Foo</h1> => anchor
   # <h1 id="anchor">Foo</h1> => anchor
   # <h1><a name="anchor">Foo</a></h1> => anchor
-  def extract_anchor(node)
+  def self.extract_anchor(node)
     anchor = node.attr('name') || node.attr('id') || nil
     return anchor unless anchor.nil?
@@ -55,7 +115,10 @@ class AlgoliaHTMLExtractor
   ##
   # Generate a unique identifier for the item
-  def uuid(item)
+  def self.uuid(item)
+    # We don't use the objectID as part of the hash algorithm
+    item.delete(:objectID)
     # We first get all the keys of the object, sorted alphabetically...
     ordered_keys = item.keys.sort
@@ -74,71 +137,9 @@ class AlgoliaHTMLExtractor
   ##
   # Get a relative numeric value of the importance of the heading
   # 100 for top level, then -10 per heading
-  def heading_weight(heading_level)
+  def self.heading_weight(heading_level)
     weight = 100
     return weight if heading_level.nil?
     weight - ((heading_level + 1) * 10)
   end
-  def extract
-    heading_selector = 'h1,h2,h3,h4,h5,h6'
-    # We select all nodes that match either the headings or the elements to
-    # extract. This will allow us to loop over it in order it appears in the DOM
-    all_selector = "#{heading_selector},#{@options[:css_selector]}"
-    items = []
-    current_hierarchy = {
-      lvl0: nil,
-      lvl1: nil,
-      lvl2: nil,
-      lvl3: nil,
-      lvl4: nil,
-      lvl5: nil
-    }
-    current_position = 0 # Position of the DOM node in the tree
-    current_lvl = nil # Current closest hierarchy level
-    current_anchor = nil # Current closest anchor
-    @dom.css(all_selector).each do |node|
-      # If it's a heading, we update our current hierarchy
-      if node.matches?(heading_selector)
-        # Which level heading is it?
-        current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
-        # Update this level, and set all the following ones to nil
-        current_hierarchy["lvl#{current_lvl}".to_sym] = extract_text(node)
-        (current_lvl + 1..6).each do |lvl|
-          current_hierarchy["lvl#{lvl}".to_sym] = nil
-        end
-        # Update the anchor, if the new heading has one
-        new_anchor = extract_anchor(node)
-        current_anchor = new_anchor if new_anchor
-      end
-      # Stop if node is not to be extracted
-      next unless node.matches?(@options[:css_selector])
-      # Stop if node is empty
-      content = extract_text(node)
-      next if content.empty?
-      item = {
-        html: extract_html(node),
-        content: content,
-        tag_name: extract_tag_name(node),
-        hierarchy: current_hierarchy.clone,
-        anchor: current_anchor,
-        node: node,
-        weight: {
-          position: current_position,
-          heading: heading_weight(current_lvl)
-        }
-      }
-      item[:objectID] = uuid(item)
-      items << item
-      current_position += 1
-    end
-    items
-  end
 end

data/lib/version.rb CHANGED

@@ -1,5 +1,5 @@
 # Expose gem version
 # rubocop:disable Style/SingleLineMethods
 class AlgoliaHTMLExtractorVersion
-  def self.to_s; '2.2.0' end
+  def self.to_s; '2.2.1' end
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: algolia_html_extractor
 version: !ruby/object:Gem::Version
-  version: 2.2.0
+  version: 2.2.1
 platform: ruby
 authors:
 - Tim Carry