RubyGems - algolia_html_extractor - Versions diffs - 2.2.0 → 2.2.1 - Mend

algolia_html_extractor 2.2.0 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

checksums.yaml +4 -4
data/lib/algolia_html_extractor.rb +73 -72
data/lib/version.rb +1 -1
metadata +1 -1

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: bbf8df27c69c4d6f2f16de4bd7cf18fcd703fb43
-  data.tar.gz: a01708af7fe1a3c42d364a099e443ac05f6f8a75
+  metadata.gz: 49c3023566226660bc3508fd06b177d27d7ad331
+  data.tar.gz: 9c1c0df0b0217dae12946d69bea0b885a3477e38
 SHA512:
-  metadata.gz: 9d9d8af70a4310d871a96fd34a789de3ce0df0ba4621cf237727fcc514dbbfb9fd3d26a35ae3df6fd9b6574752e290d4254bdea7f1622cadba99a07a6a870adf
-  data.tar.gz: e74cc7ca6db7fddc84c903715a44c70df47fb27f303ee1635579b89f47269fab168e9933582fef73269ad0e24fdeae97caa5c1924c57a0553242c33407f7492c
+  metadata.gz: ccc601b3a3499fb7dcc25c366409ff007ad3660518d77de085aa2cc44fba89ef8d2fefd727c869e955a9a9762add797fb1b3aaf51f780cc4fb24d1a677c9ec87
+  data.tar.gz: 06e3b58f8074d40361294d4115ed35cf592667d2ffbf8d170cdce8384ea4a673827f566361002b42e0c8ab88453242758ba39dd44e740511f493a25770547bf8

data/lib/algolia_html_extractor.rb CHANGED

@@ -3,20 +3,80 @@ require 'digest/md5'
 # Extract content from an HTML page in the form of items with associated
 # hierarchy data
-class AlgoliaHTMLExtractor
-  def initialize(input, options: {})
-    @dom = Nokogiri::HTML(input)
+module AlgoliaHTMLExtractor
+  def self.run(input, options: {})
     default_options = {
       css_selector: 'p'
     }
-    @options = default_options.merge(options)
+    options = default_options.merge(options)
+    heading_selector = 'h1,h2,h3,h4,h5,h6'
+    # We select all nodes that match either the headings or the elements to
+    # extract. This will allow us to loop over it in order it appears in the DOM
+    all_selector = "#{heading_selector},#{options[:css_selector]}"
+    items = []
+    current_hierarchy = {
+      lvl0: nil,
+      lvl1: nil,
+      lvl2: nil,
+      lvl3: nil,
+      lvl4: nil,
+      lvl5: nil
+    }
+    current_position = 0 # Position of the DOM node in the tree
+    current_lvl = nil # Current closest hierarchy level
+    current_anchor = nil # Current closest anchor
+    dom = Nokogiri::HTML(input)
+    dom.css(all_selector).each do |node|
+      # If it's a heading, we update our current hierarchy
+      if node.matches?(heading_selector)
+        # Which level heading is it?
+        current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
+        # Update this level, and set all the following ones to nil
+        current_hierarchy["lvl#{current_lvl}".to_sym] = extract_text(node)
+        (current_lvl + 1..6).each do |lvl|
+          current_hierarchy["lvl#{lvl}".to_sym] = nil
+        end
+        # Update the anchor, if the new heading has one
+        new_anchor = extract_anchor(node)
+        current_anchor = new_anchor if new_anchor
+      end
+      # Stop if node is not to be extracted
+      next unless node.matches?(options[:css_selector])
+      # Stop if node is empty
+      content = extract_text(node)
+      next if content.empty?
+      item = {
+        html: extract_html(node),
+        content: content,
+        tag_name: extract_tag_name(node),
+        hierarchy: current_hierarchy.clone,
+        anchor: current_anchor,
+        node: node,
+        weight: {
+          position: current_position,
+          heading: heading_weight(current_lvl)
+        }
+      }
+      item[:objectID] = uuid(item)
+      items << item
+      current_position += 1
+    end
+    items
   end
   # Returns the outer HTML of a given node
   #
   # eg.
   # <p>foo</p> => <p>foo</p>
-  def extract_html(node)
+  def self.extract_html(node)
     node.to_s.strip
   end
@@ -24,7 +84,7 @@ class AlgoliaHTMLExtractor
   #
   # eg.
   # <p>foo</p> => foo
-  def extract_text(node)
+  def self.extract_text(node)
     node.content
   end
@@ -32,7 +92,7 @@ class AlgoliaHTMLExtractor
   #
   # eg
   # <p>foo</p> => p
-  def extract_tag_name(node)
+  def self.extract_tag_name(node)
     node.name.downcase
   end
@@ -42,7 +102,7 @@ class AlgoliaHTMLExtractor
   # <h1 name="anchor">Foo</h1> => anchor
   # <h1 id="anchor">Foo</h1> => anchor
   # <h1><a name="anchor">Foo</a></h1> => anchor
-  def extract_anchor(node)
+  def self.extract_anchor(node)
     anchor = node.attr('name') || node.attr('id') || nil
     return anchor unless anchor.nil?
@@ -55,7 +115,10 @@ class AlgoliaHTMLExtractor
   ##
   # Generate a unique identifier for the item
-  def uuid(item)
+  def self.uuid(item)
+    # We don't use the objectID as part of the hash algorithm
+    item.delete(:objectID)
     # We first get all the keys of the object, sorted alphabetically...
     ordered_keys = item.keys.sort
@@ -74,71 +137,9 @@ class AlgoliaHTMLExtractor
   ##
   # Get a relative numeric value of the importance of the heading
   # 100 for top level, then -10 per heading
-  def heading_weight(heading_level)
+  def self.heading_weight(heading_level)
     weight = 100
     return weight if heading_level.nil?
     weight - ((heading_level + 1) * 10)
   end
-  def extract
-    heading_selector = 'h1,h2,h3,h4,h5,h6'
-    # We select all nodes that match either the headings or the elements to
-    # extract. This will allow us to loop over it in order it appears in the DOM
-    all_selector = "#{heading_selector},#{@options[:css_selector]}"
-    items = []
-    current_hierarchy = {
-      lvl0: nil,
-      lvl1: nil,
-      lvl2: nil,
-      lvl3: nil,
-      lvl4: nil,
-      lvl5: nil
-    }
-    current_position = 0 # Position of the DOM node in the tree
-    current_lvl = nil # Current closest hierarchy level
-    current_anchor = nil # Current closest anchor
-    @dom.css(all_selector).each do |node|
-      # If it's a heading, we update our current hierarchy
-      if node.matches?(heading_selector)
-        # Which level heading is it?
-        current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
-        # Update this level, and set all the following ones to nil
-        current_hierarchy["lvl#{current_lvl}".to_sym] = extract_text(node)
-        (current_lvl + 1..6).each do |lvl|
-          current_hierarchy["lvl#{lvl}".to_sym] = nil
-        end
-        # Update the anchor, if the new heading has one
-        new_anchor = extract_anchor(node)
-        current_anchor = new_anchor if new_anchor
-      end
-      # Stop if node is not to be extracted
-      next unless node.matches?(@options[:css_selector])
-      # Stop if node is empty
-      content = extract_text(node)
-      next if content.empty?
-      item = {
-        html: extract_html(node),
-        content: content,
-        tag_name: extract_tag_name(node),
-        hierarchy: current_hierarchy.clone,
-        anchor: current_anchor,
-        node: node,
-        weight: {
-          position: current_position,
-          heading: heading_weight(current_lvl)
-        }
-      }
-      item[:objectID] = uuid(item)
-      items << item
-      current_position += 1
-    end
-    items
-  end
 end

data/lib/version.rb CHANGED

@@ -1,5 +1,5 @@
 # Expose gem version
 # rubocop:disable Style/SingleLineMethods
 class AlgoliaHTMLExtractorVersion
-  def self.to_s; '2.2.0' end
+  def self.to_s; '2.2.1' end
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: algolia_html_extractor
 version: !ruby/object:Gem::Version
-  version: 2.2.0
+  version: 2.2.1
 platform: ruby
 authors:
 - Tim Carry