curation 2.0 → 2.0.1
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/curation/finders/text.rb +34 -22
- data/lib/curation/finders/title.rb +2 -1
- data/lib/curation/version.rb +1 -1
- metadata +1 -1
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: c2062c7ec7fb444d27f102d26658b386a01a964ea4e03aa0d81a472316012d11
         | 
| 4 | 
            +
              data.tar.gz: 634f5216e61801b3ac42c8b340d5012c101024f4d5b5e0ba155c01a89f38cafb
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 15008b92c6a51fdf9bd79b9f1da01d59323c13589b6e8ccd5d8754b96153173fe8a2cdffa5faa45a4d38c92b097a35012f8dff98fea6244ade8c9af22c13d1cc
         | 
| 7 | 
            +
              data.tar.gz: 96a29d3c8482fce0101f91a3f24eafaec51a9115f2169831b67939edf79a47d7f5bb0c3223466de355cf8a367143a262cc54a1d5e4dd7e6bffe2d310879a9cae
         | 
    
        data/Gemfile.lock
    CHANGED
    
        data/lib/curation/finders/text.rb
    CHANGED
    
| @@ -1,6 +1,12 @@ | |
| 1 1 | 
             
            module Text
         | 
| 2 2 |  | 
| 3 | 
            -
               | 
| 3 | 
            +
              def text
         | 
| 4 | 
            +
                @text ||= find_text_and_clean
         | 
| 5 | 
            +
              end
         | 
| 6 | 
            +
             | 
| 7 | 
            +
              protected
         | 
| 8 | 
            +
             | 
| 9 | 
            +
              BLACKLIST_HARD = [
         | 
| 4 10 | 
             
                'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
         | 
| 5 11 | 
             
                '.navigation', '.top-menu-container', '.navbar', '.navbar-header', '.breadcrumb',
         | 
| 6 12 | 
             
                '#breadcrumbs', '[typeof="v:Breadcrumb"]', '.skip-link', '.search', '.search-form',
         | 
| @@ -11,19 +17,23 @@ module Text | |
| 11 17 | 
             
                '[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
         | 
| 12 18 | 
             
              ]
         | 
| 13 19 |  | 
| 14 | 
            -
               | 
| 15 | 
            -
                 | 
| 16 | 
            -
             | 
| 17 | 
            -
              end
         | 
| 20 | 
            +
              BLACKLIST_SOFT = [
         | 
| 21 | 
            +
                'head', 'script', 'noscript', 'style', 'iframe', 'nav', 'footer', 'aside', '[role="dialog"]'
         | 
| 22 | 
            +
              ]
         | 
| 18 23 |  | 
| 19 | 
            -
               | 
| 24 | 
            +
              def find_text_and_clean
         | 
| 25 | 
            +
                text = find_text.to_s.dup
         | 
| 26 | 
            +
                text = text.gsub('<br><br>', '<br>')
         | 
| 27 | 
            +
                text = text.gsub(/\s+/, ' ')
         | 
| 28 | 
            +
                text = clean_encoding(text)
         | 
| 29 | 
            +
                text
         | 
| 30 | 
            +
             | 
| 31 | 
            +
              end
         | 
| 20 32 |  | 
| 21 33 | 
             
              def find_text
         | 
| 22 | 
            -
                 | 
| 23 | 
            -
                 | 
| 24 | 
            -
                 | 
| 25 | 
            -
                text = clean_encoding text
         | 
| 26 | 
            -
                text
         | 
| 34 | 
            +
                find_text_with_json_ld || 
         | 
| 35 | 
            +
                find_text_with_nokogiri_hard ||
         | 
| 36 | 
            +
                find_text_with_nokogiri_soft
         | 
| 27 37 | 
             
              end
         | 
| 28 38 |  | 
| 29 39 | 
             
              def find_text_with_json_ld
         | 
| @@ -34,24 +44,27 @@ module Text | |
| 34 44 | 
             
                    return ld['articleBody'] if ld.has_key? 'articleBody'
         | 
| 35 45 | 
             
                  end
         | 
| 36 46 | 
             
                end
         | 
| 37 | 
            -
                 | 
| 47 | 
            +
                false
         | 
| 38 48 | 
             
              end
         | 
| 39 49 |  | 
| 40 | 
            -
              def  | 
| 50 | 
            +
              def find_text_with_nokogiri_hard
         | 
| 41 51 | 
             
                h = nokogiri.dup
         | 
| 42 52 | 
             
                h.xpath('//style').remove
         | 
| 43 | 
            -
                 | 
| 53 | 
            +
                BLACKLIST_HARD.each do |tag|
         | 
| 44 54 | 
             
                  h.css(tag).remove
         | 
| 45 55 | 
             
                end
         | 
| 46 56 | 
             
                nodes = h.css('p')
         | 
| 47 | 
            -
                 | 
| 48 | 
            -
             | 
| 49 | 
            -
             | 
| 50 | 
            -
             | 
| 51 | 
            -
             | 
| 52 | 
            -
             | 
| 53 | 
            -
             | 
| 57 | 
            +
                text = nodes.to_html
         | 
| 58 | 
            +
                text.present? ? text : false
         | 
| 59 | 
            +
              end
         | 
| 60 | 
            +
             | 
| 61 | 
            +
              def find_text_with_nokogiri_soft
         | 
| 62 | 
            +
                h = nokogiri.dup
         | 
| 63 | 
            +
                h.xpath('//style').remove
         | 
| 64 | 
            +
                BLACKLIST_SOFT.each do |tag|
         | 
| 65 | 
            +
                  h.css(tag).remove
         | 
| 54 66 | 
             
                end
         | 
| 67 | 
            +
                h.text
         | 
| 55 68 | 
             
              end
         | 
| 56 69 |  | 
| 57 70 | 
             
              # réforme -> réforme
         | 
| @@ -64,7 +77,6 @@ module Text | |
| 64 77 | 
             
                  'î', # î
         | 
| 65 78 | 
             
                  'ê', # ê
         | 
| 66 79 | 
             
                ].each do |string|
         | 
| 67 | 
            -
                  # require 'byebug'; byebug
         | 
| 68 80 | 
             
                  double_encoding = true if clean_text.include? string
         | 
| 69 81 | 
             
                end
         | 
| 70 82 | 
             
                if double_encoding
         | 
    
        data/lib/curation/finders/title.rb
    CHANGED
    
| @@ -21,7 +21,7 @@ module Title | |
| 21 21 | 
             
                    return ld['headline'] if ld.has_key? 'headline'
         | 
| 22 22 | 
             
                  end
         | 
| 23 23 | 
             
                end
         | 
| 24 | 
            -
                 | 
| 24 | 
            +
                false
         | 
| 25 25 | 
             
              end
         | 
| 26 26 |  | 
| 27 27 | 
             
              def find_title_with_metainspector
         | 
| @@ -39,6 +39,7 @@ module Title | |
| 39 39 | 
             
                elsif metainspector_title.present?
         | 
| 40 40 | 
             
                  return metainspector_title
         | 
| 41 41 | 
             
                end
         | 
| 42 | 
            +
                false
         | 
| 42 43 | 
             
              end
         | 
| 43 44 |  | 
| 44 45 | 
             
              def find_title_with_nokogiri
         | 
    
        data/lib/curation/version.rb
    CHANGED