algolia_html_extractor 2.5.2 → 2.6.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
 - data/lib/algolia_html_extractor.rb +9 -1
 - data/lib/version.rb +3 -1
 - metadata +8 -9
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
     | 
    
         
            -
             
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 2 
     | 
    
         
            +
            SHA256:
         
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 1496286d1762c4231f9f38e42549b0128c53d1ff5cf86ffc5b0da04c3c9934bb
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: 6af72f32fa6e8e86064677688a2268a02a1ad2a8bdd9fc44146f2e72f527eb3f
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 70bdd12e5e15d62cf9c5a5c992180b2491c090cdcca4e671ff8039c7ae91c91817fd38dcf6a4b54e09f59082dfa2157cd4ca44513cf83c64aad8e1f8c771a258
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: b5111d1f7fbc948a1186daf0c2c3760cb2a6ea0a7b1b64ae677f96f4ff849a8e99a30f31efb75807e1d509f131c236f6c750a4f94384180df8b70c0736b3c2c1
         
     | 
| 
         @@ -1,3 +1,5 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # frozen_string_literal: true
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
       1 
3 
     | 
    
         
             
            require 'nokogiri'
         
     | 
| 
       2 
4 
     | 
    
         
             
            require 'digest/md5'
         
     | 
| 
       3 
5 
     | 
    
         | 
| 
         @@ -8,7 +10,8 @@ module AlgoliaHTMLExtractor 
     | 
|
| 
       8 
10 
     | 
    
         
             
              def self.default_options(options)
         
     | 
| 
       9 
11 
     | 
    
         
             
                default_options = {
         
     | 
| 
       10 
12 
     | 
    
         
             
                  css_selector: 'p',
         
     | 
| 
       11 
     | 
    
         
            -
                  heading_selector: 'h1,h2,h3,h4,h5,h6'
         
     | 
| 
      
 13 
     | 
    
         
            +
                  heading_selector: 'h1,h2,h3,h4,h5,h6',
         
     | 
| 
      
 14 
     | 
    
         
            +
                  tags_to_exclude: ''
         
     | 
| 
       12 
15 
     | 
    
         
             
                }
         
     | 
| 
       13 
16 
     | 
    
         
             
                default_options.merge(options)
         
     | 
| 
       14 
17 
     | 
    
         
             
              end
         
     | 
| 
         @@ -22,6 +25,7 @@ module AlgoliaHTMLExtractor 
     | 
|
| 
       22 
25 
     | 
    
         
             
                options = default_options(options)
         
     | 
| 
       23 
26 
     | 
    
         
             
                heading_selector = options[:heading_selector]
         
     | 
| 
       24 
27 
     | 
    
         
             
                css_selector = options[:css_selector]
         
     | 
| 
      
 28 
     | 
    
         
            +
                tags_to_exclude = options[:tags_to_exclude]
         
     | 
| 
       25 
29 
     | 
    
         | 
| 
       26 
30 
     | 
    
         
             
                items = []
         
     | 
| 
       27 
31 
     | 
    
         
             
                current_hierarchy = {
         
     | 
| 
         @@ -56,6 +60,9 @@ module AlgoliaHTMLExtractor 
     | 
|
| 
       56 
60 
     | 
    
         
             
                  # Stop if node is not to be extracted
         
     | 
| 
       57 
61 
     | 
    
         
             
                  next unless node.matches?(css_selector)
         
     | 
| 
       58 
62 
     | 
    
         | 
| 
      
 63 
     | 
    
         
            +
                  # Removing excluded child from the node
         
     | 
| 
      
 64 
     | 
    
         
            +
                  node.search(tags_to_exclude).each(&:remove) unless tags_to_exclude.empty?
         
     | 
| 
      
 65 
     | 
    
         
            +
             
     | 
| 
       59 
66 
     | 
    
         
             
                  # Stop if node is empty
         
     | 
| 
       60 
67 
     | 
    
         
             
                  content = extract_text(node)
         
     | 
| 
       61 
68 
     | 
    
         
             
                  next if content.empty?
         
     | 
| 
         @@ -148,6 +155,7 @@ module AlgoliaHTMLExtractor 
     | 
|
| 
       148 
155 
     | 
    
         
             
              def self.heading_weight(heading_level)
         
     | 
| 
       149 
156 
     | 
    
         
             
                weight = 100
         
     | 
| 
       150 
157 
     | 
    
         
             
                return weight if heading_level.nil?
         
     | 
| 
      
 158 
     | 
    
         
            +
             
     | 
| 
       151 
159 
     | 
    
         
             
                weight - ((heading_level + 1) * 10)
         
     | 
| 
       152 
160 
     | 
    
         
             
              end
         
     | 
| 
       153 
161 
     | 
    
         
             
            end
         
     | 
    
        data/lib/version.rb
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | 
         @@ -1,14 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: algolia_html_extractor
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 2. 
     | 
| 
      
 4 
     | 
    
         
            +
              version: 2.6.4
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - Tim Carry
         
     | 
| 
       8 
     | 
    
         
            -
            autorequire: 
     | 
| 
      
 8 
     | 
    
         
            +
            autorequire:
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date:  
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2021-01-06 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies:
         
     | 
| 
       13 
13 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       14 
14 
     | 
    
         
             
              name: json
         
     | 
| 
         @@ -30,14 +30,14 @@ dependencies: 
     | 
|
| 
       30 
30 
     | 
    
         
             
                requirements:
         
     | 
| 
       31 
31 
     | 
    
         
             
                - - "~>"
         
     | 
| 
       32 
32 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       33 
     | 
    
         
            -
                    version: 1. 
     | 
| 
      
 33 
     | 
    
         
            +
                    version: '1.10'
         
     | 
| 
       34 
34 
     | 
    
         
             
              type: :runtime
         
     | 
| 
       35 
35 
     | 
    
         
             
              prerelease: false
         
     | 
| 
       36 
36 
     | 
    
         
             
              version_requirements: !ruby/object:Gem::Requirement
         
     | 
| 
       37 
37 
     | 
    
         
             
                requirements:
         
     | 
| 
       38 
38 
     | 
    
         
             
                - - "~>"
         
     | 
| 
       39 
39 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       40 
     | 
    
         
            -
                    version: 1. 
     | 
| 
      
 40 
     | 
    
         
            +
                    version: '1.10'
         
     | 
| 
       41 
41 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       42 
42 
     | 
    
         
             
              name: coveralls
         
     | 
| 
       43 
43 
     | 
    
         
             
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
         @@ -180,7 +180,7 @@ homepage: https://github.com/algolia/html-extractor 
     | 
|
| 
       180 
180 
     | 
    
         
             
            licenses:
         
     | 
| 
       181 
181 
     | 
    
         
             
            - MIT
         
     | 
| 
       182 
182 
     | 
    
         
             
            metadata: {}
         
     | 
| 
       183 
     | 
    
         
            -
            post_install_message: 
     | 
| 
      
 183 
     | 
    
         
            +
            post_install_message:
         
     | 
| 
       184 
184 
     | 
    
         
             
            rdoc_options: []
         
     | 
| 
       185 
185 
     | 
    
         
             
            require_paths:
         
     | 
| 
       186 
186 
     | 
    
         
             
            - lib
         
     | 
| 
         @@ -195,9 +195,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement 
     | 
|
| 
       195 
195 
     | 
    
         
             
                - !ruby/object:Gem::Version
         
     | 
| 
       196 
196 
     | 
    
         
             
                  version: '0'
         
     | 
| 
       197 
197 
     | 
    
         
             
            requirements: []
         
     | 
| 
       198 
     | 
    
         
            -
             
     | 
| 
       199 
     | 
    
         
            -
             
     | 
| 
       200 
     | 
    
         
            -
            signing_key: 
         
     | 
| 
      
 198 
     | 
    
         
            +
            rubygems_version: 3.1.2
         
     | 
| 
      
 199 
     | 
    
         
            +
            signing_key:
         
     | 
| 
       201 
200 
     | 
    
         
             
            specification_version: 4
         
     | 
| 
       202 
201 
     | 
    
         
             
            summary: Convert HTML content into Algolia records
         
     | 
| 
       203 
202 
     | 
    
         
             
            test_files: []
         
     |